options(scipen=999)
rm(list = ls())
install.packages('tidyverse')
install.packages(c('tidyverse','lme4'))
require(tidyverse)
packageVersion('tidyverse') # for 'tidyverse' package
installed.packages()[names(sessionInfo()$otherPkgs), "Version"] # for all the loaded packages
This reports on objects that exist with the same name in two or more places. Useful for finding an object that may be masking any functions.
conflicts()
readr
)dat = read_csv('my_data.csv')
readr
)dat = read_delim('my_data.txt', delim="\t")
readr
)write_delim(data_frame, 'file_name.txt', delim='\t')
readr
)write_csv(data_frame, 'file_name.csv')
write_excel_csv(data_frame, 'file_name.csv')
All columns are encoded as UTF-8 (useful for exporting Japanese
characters). write_excel_csv()
also includes a UTF-8 Byte
order mark.
save(my.data, file='file_name.rds')
save(list=Filter(function(x)'lmerMod'%in%class(get(x)),ls()),file=paste0('models_',substr(date(),9,10),substr(date(),5,7),substr(date(),21,24),'.rds'))
load('file_name.rds')
I use a built-in dataset ChickWeight
in some examples.
The data frame has 578 rows and 4 columns from an experiment on the
effect of diet on early growth of chicks.
First 6 rows of the data:
weight | Time | Chick | Diet |
---|---|---|---|
42 | 0 | 1 | 1 |
51 | 2 | 1 | 1 |
59 | 4 | 1 | 1 |
64 | 6 | 1 | 1 |
76 | 8 | 1 | 1 |
93 | 10 | 1 | 1 |
Column | Description |
---|---|
weight | a numeric vector giving the body weight of the chick (gm). |
Time | a numeric vector giving the number of days since birth when the measurement was made. |
Chick | an ordered factor with levels 18 < … < 48 giving a unique identifier for the chick. The ordering of the levels groups chicks on the same diet together and orders them according to their final weight (lightest to heaviest) within diet. |
Diet | a factor with levels 1, …, 4 indicating which experimental diet the chick received. |
summary(ChickWeight)
## weight Time Chick Diet
## Min. : 35.0 Min. : 0.00 13 : 12 1:220
## 1st Qu.: 63.0 1st Qu.: 4.00 9 : 12 2:120
## Median :103.0 Median :10.00 20 : 12 3:120
## Mean :121.8 Mean :10.72 10 : 12 4:118
## 3rd Qu.:163.8 3rd Qu.:16.00 17 : 12
## Max. :373.0 Max. :21.00 19 : 12
## (Other):506
Rmisc
)summarySE(ChickWeight, measurevar="weight", groupvars=c("Diet"),na.rm=T)
## Diet N weight sd se ci
## 1 1 220 102.6455 56.65655 3.819784 7.528242
## 2 2 120 122.6167 71.60749 6.536840 12.943596
## 3 3 120 142.9500 86.54176 7.900146 15.643078
## 4 4 118 135.2627 68.82871 6.336197 12.548506
tidyr
)ChickWeight %>% tidyr::expand(crossing(Chick,Diet)) # list all possible combinations
## # A tibble: 200 × 2
## Chick Diet
## <ord> <fct>
## 1 18 1
## 2 18 2
## 3 18 3
## 4 18 4
## 5 16 1
## 6 16 2
## 7 16 3
## 8 16 4
## 9 15 1
## 10 15 2
## # ℹ 190 more rows
ChickWeight %>% tidyr::expand(nesting(Chick,Diet)) # list combinations that are present in the data
## # A tibble: 50 × 2
## Chick Diet
## <ord> <fct>
## 1 18 1
## 2 16 1
## 3 15 1
## 4 13 1
## 5 9 1
## 6 20 1
## 7 10 1
## 8 8 1
## 9 17 1
## 10 19 1
## # ℹ 40 more rows
dplyr
)new.dat = ChickWeight %>% mutate_at(vars(Chick), as.factor) # convert column 'Chick' to factor
new.dat = ChickWeight %>% mutate_all(funs(as.factor)) # convert all columns to factor
new.dat = ChickWeight %>% mutate_at(vars(Time), as.numeric) # convert 'Time' to numeric
dplyr
)new.dat = ChickWeight %>% filter(Diet %in% c(1,3)) # keep only the observations for which the values of the variable 'Diet' is 1 or 3
new.dat = ChickWeight %>% filter(Diet!=1) # keep only the observations for which the values of the variable 'Diet' is NOT 1
new.dat = ChickWeight %>% filter(!(Diet %in% c(1,3))) # keep only the observations for which the values of the variable 'Diet' is NOT 1 or 3
new.dat = ChickWeight %>% filter(Diet %in% c(1,3), weight > 60) # keep only the observations for which the values of the variable 'Diet' is 1 or 3 and the values for the variable 'weight' is greater than 60
dplyr
)new.dat = ChickWeight %>% slice(n()) # select the last row
new.dat = ChickWeight %>% slice(3:n()) # select all rows except for the first two
new.dat = ChickWeight %>% slice_head(n=3) # select the first three rows
dplyr
)new.dat = ChickWeight %>% select(weight, Time) # keep only the columns 'weight' and 'Time'
new.dat = ChickWeight %>% select(-Diet) # drop the column 'Diet'
tidyselect
)new.dat = ChickWeight %>% select(-(last_col(offset=2):last_col())) # drop the last 3 columns
Time
will spread
across column names) (package: dplyr
,
tidyr
)longer.dat = ChickWeight %>% select(-Diet)
head(longer.dat) # data frame before conversion
## weight Time Chick
## 1 42 0 1
## 2 51 2 1
## 3 59 4 1
## 4 64 6 1
## 5 76 8 1
## 6 93 10 1
wider.dat = longer.dat %>% pivot_wider(names_from = Time, values_from = weight)
head(wider.dat) # data frame after conversion
## # A tibble: 6 × 13
## Chick `0` `2` `4` `6` `8` `10` `12` `14` `16` `18` `20` `21`
## <ord> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 42 51 59 64 76 93 106 125 149 171 199 205
## 2 2 40 49 58 72 84 103 122 138 162 187 209 215
## 3 3 43 39 55 67 84 99 115 138 163 187 198 202
## 4 4 42 49 56 67 74 87 102 108 136 154 160 157
## 5 5 41 42 48 60 79 106 141 164 197 199 220 223
## 6 6 41 49 59 74 97 124 141 148 155 160 160 157
dplyr
, tidyr
)head(wider.dat) # data frame before conversion (from above)
## # A tibble: 6 × 13
## Chick `0` `2` `4` `6` `8` `10` `12` `14` `16` `18` `20` `21`
## <ord> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 42 51 59 64 76 93 106 125 149 171 199 205
## 2 2 40 49 58 72 84 103 122 138 162 187 209 215
## 3 3 43 39 55 67 84 99 115 138 163 187 198 202
## 4 4 42 49 56 67 74 87 102 108 136 154 160 157
## 5 5 41 42 48 60 79 106 141 164 197 199 220 223
## 6 6 41 49 59 74 97 124 141 148 155 160 160 157
longer.dat = wider.dat %>% pivot_longer(cols=!Chick, names_to = "Time", values_to = "weight")
head(longer.dat) # data frame after conversion
## # A tibble: 6 × 3
## Chick Time weight
## <ord> <chr> <dbl>
## 1 1 0 42
## 2 1 2 51
## 3 1 4 59
## 4 1 6 64
## 5 1 8 76
## 6 1 10 93
tibble
,tidyr
)tibble(Item=1:4, Condition=c("1_A","1_B","2_A","2_B")) # example data
## # A tibble: 4 × 2
## Item Condition
## <int> <chr>
## 1 1 1_A
## 2 2 1_B
## 3 3 2_A
## 4 4 2_B
tibble(Item=1:4, Condition=c("1_A","1_B","2_A","2_B")) %>%
separate(Condition,c("Number","Alphabet"),sep="_") # separate Condition column into number and alphabet
## # A tibble: 4 × 3
## Item Number Alphabet
## <int> <chr> <chr>
## 1 1 1 A
## 2 2 1 B
## 3 3 2 A
## 4 4 2 B
dplyr
)new.dat = ChickWeight %>% rename(time = Time, chick = Chick, diet = Diet)
colnames(new.dat)
## [1] "weight" "time" "chick" "diet"
colnames(ChickWeight) = c("Weight","Time","Chick","Diet")
dplyr
)summary(ChickWeight$Diet)
## 1 2 3 4
## 220 120 120 118
new.dat = ChickWeight %>% mutate(Diet=recode(Diet,'1'='A','2'='B','3'='C','4'='D'))
summary(new.dat$Diet)
## A B C D
## 220 120 120 118
forcats
)summary(ChickWeight$Diet)
## 1 2 3 4
## 220 120 120 118
new.dat = ChickWeight %>% mutate(Diet=fct_relevel(Diet,c('4','3','2','1')))
summary(new.dat$Diet)
## 4 3 2 1
## 118 120 120 220
dplyr
)new.dat = ChickWeight %>% mutate(weight_code = if_else(weight<60, 'below_60', '60_or_above')) # add a column `weight_code`
head(new.dat)
## Grouped Data: weight ~ Time | Chick
## weight Time Chick Diet weight_code
## 1 42 0 1 1 below_60
## 2 51 2 1 1 below_60
## 3 59 4 1 1 below_60
## 4 64 6 1 1 60_or_above
## 5 76 8 1 1 60_or_above
## 6 93 10 1 1 60_or_above
new.dat = ChickWeight %>% mutate(weight_code = case_when(
weight<50 ~ 'below_50',
between(weight,50,70) ~ 'between_50_and_70',
weight>70 ~ 'above_70'
)) # vectorise multiple `if_else` statements
head(new.dat)
## Grouped Data: weight ~ Time | Chick
## weight Time Chick Diet weight_code
## 1 42 0 1 1 below_50
## 2 51 2 1 1 between_50_and_70
## 3 59 4 1 1 between_50_and_70
## 4 64 6 1 1 between_50_and_70
## 5 76 8 1 1 above_70
## 6 93 10 1 1 above_70
dplyr
)new.dat = ChickWeight %>%
mutate(previous_weight = lag(weight,1), # look one row ahead
future_weight = lead(weight,1)) # look one row behind
head(new.dat)
## Grouped Data: weight ~ Time | Chick
## weight Time Chick Diet previous_weight future_weight
## 1 42 0 1 1 NA 51
## 2 51 2 1 1 42 59
## 3 59 4 1 1 51 64
## 4 64 6 1 1 59 76
## 5 76 8 1 1 64 93
## 6 93 10 1 1 76 106
new.dat = ChickWeight %>% distinct()
Note: dplyr
chains can be connected.
For example:
new.dat = ChickWeight %>% select(-Diet) %>% mutate_at(vars(Chick), as.factor) %>% filter(weight > 60)
NA
for certain values in column A based on the
values in column B (e.g., if Time
equals 2, put
NA
to weight
) (package:
dplyr
)new.dat = ChickWeight %>% mutate(weight = replace(weight, which(Time==2), NA))
head(new.dat)
## Grouped Data: weight ~ Time | Chick
## weight Time Chick Diet
## 1 42 0 1 1
## 2 NA 2 1 1
## 3 59 4 1 1
## 4 64 6 1 1
## 5 76 8 1 1
## 6 93 10 1 1
NA
for values that deviate more than 2.5 SDs from
the group mean (e.g, NA
if weight
deviates
more than 2.5 SDs from the mean of the Diet
group)
(package: dplyr
)new.dat = ChickWeight %>% group_by(Diet) %>% mutate(zWeight = scale(weight), weight = replace(weight, which(!between(zWeight, -2.5, 2.5)), NA))
new.dat %>% filter(is.na(weight)) # check rows with `NA`
## # A tibble: 13 × 5
## # Groups: Diet [4]
## weight Time Chick Diet zWeight[,1]
## <dbl> <dbl> <ord> <fct> <dbl>
## 1 NA 18 7 1 2.60
## 2 NA 20 7 1 3.27
## 3 NA 21 7 1 3.57
## 4 NA 18 14 1 2.57
## 5 NA 20 14 1 2.76
## 6 NA 21 14 1 2.88
## 7 NA 18 21 2 2.57
## 8 NA 20 21 2 2.73
## 9 NA 21 21 2 2.91
## 10 NA 21 29 2 2.60
## 11 NA 20 35 3 2.52
## 12 NA 21 35 3 2.66
## 13 NA 21 48 4 2.71
sum(is.na(new.dat$weight))/length(new.dat$weight)
## [1] 0.02249135
tidyr
)dat.na.dropped = new.dat %>% drop_na(weight) # drop row if the column weight contains NA
dat.na.dropped = new.dat %>% drop_na() # drop row if any of the column contains NA
dat.na.dropped = new.dat %>% na.omit() # drop row if any of the column contains NA (using the R base 'stats' package)
dat.na.replaced = new.dat %>% mutate_at(vars(weight,Time), ~replace_na(., 0)) # replace NA in 'weight' and 'Time' columns with 0
dat.na.replaced = new.dat %>% mutate_all(~replace_na(., 0)) # replace NA in all columns with 0
ggplot2
)ggplot(ChickWeight, aes(x=weight)) + geom_histogram()
ggplot2
)ggplot(ChickWeight, aes(x=Time, y=weight)) +
geom_point(shape=1) +
geom_smooth(method=lm) +
xlab('Time') +
ylab('Weight')
Group by Diet
ggplot(ChickWeight, aes(x=Time, y=weight)) +
geom_point(shape=1) +
geom_smooth(method=lm) +
xlab('Time') +
ylab('Weight') +
facet_wrap(vars(Diet), ncol=2)
ggplot2
)First, create a summarised data frame for plotting (package:
Rmisc
)
plot.dat = summarySE(ChickWeight, measurevar="weight", groupvars=c("Diet"),na.rm=T)
plot.dat # check data
## Diet N weight sd se ci
## 1 1 220 102.6455 56.65655 3.819784 7.528242
## 2 2 120 122.6167 71.60749 6.536840 12.943596
## 3 3 120 142.9500 86.54176 7.900146 15.643078
## 4 4 118 135.2627 68.82871 6.336197 12.548506
Plot
ggplot(plot.dat, aes(x=Diet,y=weight,fill=Diet)) +
theme_bw() + # optional theme
geom_bar(position=position_dodge(),stat="identity",colour="black",size=.4,alpha=.8) +
geom_errorbar(aes(ymin=weight-ci,ymax=weight+ci),size=.4,width=.3) +
scale_fill_manual('Diet',values=c('red','royalblue','deeppink','darkgrey')) +
xlab("Diet") +
ylab("Weight")
knitr
)knitr::kable(summary(sleepstudy)) # 'sleepstudy' data from the 'lme4' package
Reaction | Days | Subject | |
---|---|---|---|
Min. :194.3 | Min. :0.0 | 308 : 10 | |
1st Qu.:255.4 | 1st Qu.:2.0 | 309 : 10 | |
Median :288.7 | Median :4.5 | 310 : 10 | |
Mean :298.5 | Mean :4.5 | 330 : 10 | |
3rd Qu.:336.8 | 3rd Qu.:7.0 | 331 : 10 | |
Max. :466.4 | Max. :9.0 | 332 : 10 | |
NA | NA | (Other):120 |
dplyr
)ChickWeight %>% count(Time, Diet) %>%
mutate(prop = prop.table(n))
## Time Diet n prop
## 1 0 1 20 0.03460208
## 2 0 2 10 0.01730104
## 3 0 3 10 0.01730104
## 4 0 4 10 0.01730104
## 5 2 1 20 0.03460208
## 6 2 2 10 0.01730104
## 7 2 3 10 0.01730104
## 8 2 4 10 0.01730104
## 9 4 1 19 0.03287197
## 10 4 2 10 0.01730104
## 11 4 3 10 0.01730104
## 12 4 4 10 0.01730104
## 13 6 1 19 0.03287197
## 14 6 2 10 0.01730104
## 15 6 3 10 0.01730104
## 16 6 4 10 0.01730104
## 17 8 1 19 0.03287197
## 18 8 2 10 0.01730104
## 19 8 3 10 0.01730104
## 20 8 4 10 0.01730104
## 21 10 1 19 0.03287197
## 22 10 2 10 0.01730104
## 23 10 3 10 0.01730104
## 24 10 4 10 0.01730104
## 25 12 1 19 0.03287197
## 26 12 2 10 0.01730104
## 27 12 3 10 0.01730104
## 28 12 4 10 0.01730104
## 29 14 1 18 0.03114187
## 30 14 2 10 0.01730104
## 31 14 3 10 0.01730104
## 32 14 4 10 0.01730104
## 33 16 1 17 0.02941176
## 34 16 2 10 0.01730104
## 35 16 3 10 0.01730104
## 36 16 4 10 0.01730104
## 37 18 1 17 0.02941176
## 38 18 2 10 0.01730104
## 39 18 3 10 0.01730104
## 40 18 4 10 0.01730104
## 41 20 1 17 0.02941176
## 42 20 2 10 0.01730104
## 43 20 3 10 0.01730104
## 44 20 4 9 0.01557093
## 45 21 1 16 0.02768166
## 46 21 2 10 0.01730104
## 47 21 3 10 0.01730104
## 48 21 4 9 0.01557093
ChickWeight %>% count(Diet) %>%
mutate(prop = prop.table(n))
## Diet n prop
## 1 1 220 0.3806228
## 2 2 120 0.2076125
## 3 3 120 0.2076125
## 4 4 118 0.2041522
knitr
)example.model = lmer(Reaction ~ Days + (Days|Subject), sleepstudy) # run a linear-mixed effects model
summary(example.model) # full summary
## Linear mixed model fit by REML ['lmerMod']
## Formula: Reaction ~ Days + (Days | Subject)
## Data: sleepstudy
##
## REML criterion at convergence: 1743.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.9536 -0.4634 0.0231 0.4634 5.1793
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## Subject (Intercept) 612.10 24.741
## Days 35.07 5.922 0.07
## Residual 654.94 25.592
## Number of obs: 180, groups: Subject, 18
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 251.405 6.825 36.838
## Days 10.467 1.546 6.771
##
## Correlation of Fixed Effects:
## (Intr)
## Days -0.138
coef.table = coef(summary(example.model)) # extract the coefficient table
rownames(coef.table) = c("Intercept","Days") # rename rows (if you want)
knitr::kable(coef.table, digits=2) # round off to 2 decimal places
Estimate | Std. Error | t value | |
---|---|---|---|
Intercept | 251.41 | 6.82 | 36.84 |
Days | 10.47 | 1.55 | 6.77 |
stringi
)Age = c('21','18','19','31','29')
stri_trans_nfkc(Age)
## [1] "21" "18" "19" "31" "29"
stringr
)example = c('high (not sure)','high','Low','High','low?','medium','medium/low')
str_replace_all(example,c('\\/.*'='', # remove all characters after '/'
'\\(.*'='', # remove all characters after '('
'[[:punct:]]'='')) # remove all special characters
## [1] "high " "high" "Low" "High" "low" "medium" "medium"
str_to_lower(example) # convert string to lowercase
## [1] "high (not sure)" "high" "low" "high" "low?" "medium"
## [7] "medium/low"
str_to_upper(example) # convert string to uppercase
## [1] "HIGH (NOT SURE)" "HIGH" "LOW" "HIGH" "LOW?" "MEDIUM"
## [7] "MEDIUM/LOW"
Note: The above codes can be embedded in
dplyr
chain
dat = dat %>% mutate(Age=stri_trans_nfkc(Age), Values=str_replace_all(Values,'[[:punct:]]'=''))