Set-up

Avoid scientific notation
options(scipen=999)
Clear the workspace
rm(list = ls())

Packages

Install a package/ packages
install.packages('tidyverse')
install.packages(c('tidyverse','lme4'))
Load a package
require(tidyverse)
Find out which package version is loaded in R
packageVersion('tidyverse') # for 'tidyverse' package
 
installed.packages()[names(sessionInfo()$otherPkgs), "Version"] # for all the loaded packages
Check conflicts

This reports on objects that exist with the same name in two or more places. Useful for finding an object that may be masking any functions.

conflicts()

Import/ Export data

Read a cvs file (package: readr)
dat = read_csv('my_data.csv') 
Read a tab-delimited text file (package: readr)
dat = read_delim('my_data.txt', delim="\t") 
Write a data frame to a tab-delimited text file (package: readr)
write_delim(data_frame, 'file_name.txt',  delim='\t')
Write a data frame to a csv file (package: readr)
write_csv(data_frame, 'file_name.csv')
write_excel_csv(data_frame, 'file_name.csv')

All columns are encoded as UTF-8 (useful for exporting Japanese characters). write_excel_csv() also includes a UTF-8 Byte order mark.

Save data as an .rds file
save(my.data, file='file_name.rds')
Save all ‘lmerMod’ objects (LME model output) as an .rds file with today’s date (e.g., 30Jun2022)
save(list=Filter(function(x)'lmerMod'%in%class(get(x)),ls()),file=paste0('models_',substr(date(),9,10),substr(date(),5,7),substr(date(),21,24),'.rds'))
Load an .rds file
load('file_name.rds')

Check data

I use a built-in dataset ChickWeight in some examples. The data frame has 578 rows and 4 columns from an experiment on the effect of diet on early growth of chicks.

First 6 rows of the data:

weight Time Chick Diet
42 0 1 1
51 2 1 1
59 4 1 1
64 6 1 1
76 8 1 1
93 10 1 1
Column Description
weight a numeric vector giving the body weight of the chick (gm).
Time a numeric vector giving the number of days since birth when the measurement was made.
Chick an ordered factor with levels 18 < … < 48 giving a unique identifier for the chick. The ordering of the levels groups chicks on the same diet together and orders them according to their final weight (lightest to heaviest) within diet.
Diet a factor with levels 1, …, 4 indicating which experimental diet the chick received.
Summarise a data frame
summary(ChickWeight)
##      weight           Time           Chick     Diet   
##  Min.   : 35.0   Min.   : 0.00   13     : 12   1:220  
##  1st Qu.: 63.0   1st Qu.: 4.00   9      : 12   2:120  
##  Median :103.0   Median :10.00   20     : 12   3:120  
##  Mean   :121.8   Mean   :10.72   10     : 12   4:118  
##  3rd Qu.:163.8   3rd Qu.:16.00   17     : 12          
##  Max.   :373.0   Max.   :21.00   19     : 12          
##                                  (Other):506
Get count, mean, SD, SE, and 95% CI for each condition/group (e.g, summary of weight for Diet) (package: Rmisc)
summarySE(ChickWeight, measurevar="weight", groupvars=c("Diet"),na.rm=T) 
##   Diet   N   weight       sd       se        ci
## 1    1 220 102.6455 56.65655 3.819784  7.528242
## 2    2 120 122.6167 71.60749 6.536840 12.943596
## 3    3 120 142.9500 86.54176 7.900146 15.643078
## 4    4 118 135.2627 68.82871 6.336197 12.548506
List possible combinations of two columns (package: tidyr)
ChickWeight %>% tidyr::expand(crossing(Chick,Diet)) # list all possible combinations
## # A tibble: 200 × 2
##    Chick Diet 
##    <ord> <fct>
##  1 18    1    
##  2 18    2    
##  3 18    3    
##  4 18    4    
##  5 16    1    
##  6 16    2    
##  7 16    3    
##  8 16    4    
##  9 15    1    
## 10 15    2    
## # ℹ 190 more rows
ChickWeight %>% tidyr::expand(nesting(Chick,Diet)) # list combinations that are present in the data
## # A tibble: 50 × 2
##    Chick Diet 
##    <ord> <fct>
##  1 18    1    
##  2 16    1    
##  3 15    1    
##  4 13    1    
##  5 9     1    
##  6 20    1    
##  7 10    1    
##  8 8     1    
##  9 17    1    
## 10 19    1    
## # ℹ 40 more rows

Modify a data frame

Convert a column/ columns into factor/numeric (package: dplyr)
new.dat = ChickWeight %>% mutate_at(vars(Chick), as.factor) # convert column 'Chick' to factor
new.dat = ChickWeight %>% mutate_all(funs(as.factor)) # convert all columns to factor
new.dat = ChickWeight %>% mutate_at(vars(Time), as.numeric) # convert 'Time' to numeric
Filter observations (package: dplyr)
new.dat = ChickWeight %>% filter(Diet %in% c(1,3))  # keep only the observations for which the values of the variable 'Diet' is 1 or 3
new.dat = ChickWeight %>% filter(Diet!=1)  # keep only the observations for which the values of the variable 'Diet' is NOT 1 
new.dat = ChickWeight %>% filter(!(Diet %in% c(1,3)))  # keep only the observations for which the values of the variable 'Diet' is NOT 1 or 3
new.dat = ChickWeight %>% filter(Diet %in% c(1,3), weight > 60)  # keep only the observations for which the values of the variable 'Diet' is 1 or 3 and the values for the variable 'weight' is greater than 60
Select rows (package: dplyr)
new.dat = ChickWeight %>% slice(n())  # select the last row
new.dat = ChickWeight %>% slice(3:n())  # select all rows except for the first two
new.dat = ChickWeight %>% slice_head(n=3)  # select the first three rows
Select columns (package: dplyr)
new.dat = ChickWeight %>% select(weight, Time)  # keep only the columns 'weight' and 'Time'
new.dat = ChickWeight %>% select(-Diet)  # drop the column 'Diet'
Select columns (package: tidyselect)
new.dat = ChickWeight %>% select(-(last_col(offset=2):last_col()))  # drop the last 3 columns
Convert data to a wider format (e.g., Time will spread across column names) (package: dplyr, tidyr)
longer.dat = ChickWeight %>% select(-Diet) 
head(longer.dat) # data frame before conversion
##   weight Time Chick
## 1     42    0     1
## 2     51    2     1
## 3     59    4     1
## 4     64    6     1
## 5     76    8     1
## 6     93   10     1
wider.dat = longer.dat %>% pivot_wider(names_from = Time, values_from = weight)
head(wider.dat) # data frame after conversion
## # A tibble: 6 × 13
##   Chick   `0`   `2`   `4`   `6`   `8`  `10`  `12`  `14`  `16`  `18`  `20`  `21`
##   <ord> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1        42    51    59    64    76    93   106   125   149   171   199   205
## 2 2        40    49    58    72    84   103   122   138   162   187   209   215
## 3 3        43    39    55    67    84    99   115   138   163   187   198   202
## 4 4        42    49    56    67    74    87   102   108   136   154   160   157
## 5 5        41    42    48    60    79   106   141   164   197   199   220   223
## 6 6        41    49    59    74    97   124   141   148   155   160   160   157
Convert data to a longer format (e.g., ‘Time’ becomes a column name) (package: dplyr, tidyr)
head(wider.dat) # data frame before conversion (from above)
## # A tibble: 6 × 13
##   Chick   `0`   `2`   `4`   `6`   `8`  `10`  `12`  `14`  `16`  `18`  `20`  `21`
##   <ord> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1        42    51    59    64    76    93   106   125   149   171   199   205
## 2 2        40    49    58    72    84   103   122   138   162   187   209   215
## 3 3        43    39    55    67    84    99   115   138   163   187   198   202
## 4 4        42    49    56    67    74    87   102   108   136   154   160   157
## 5 5        41    42    48    60    79   106   141   164   197   199   220   223
## 6 6        41    49    59    74    97   124   141   148   155   160   160   157
longer.dat = wider.dat %>% pivot_longer(cols=!Chick, names_to = "Time", values_to = "weight")
head(longer.dat) # data frame after conversion
## # A tibble: 6 × 3
##   Chick Time  weight
##   <ord> <chr>  <dbl>
## 1 1     0         42
## 2 1     2         51
## 3 1     4         59
## 4 1     6         64
## 5 1     8         76
## 6 1     10        93
Separate a text column to multiple columns (Text to Columns in Excel) (package: tibble,tidyr)
tibble(Item=1:4, Condition=c("1_A","1_B","2_A","2_B")) # example data
## # A tibble: 4 × 2
##    Item Condition
##   <int> <chr>    
## 1     1 1_A      
## 2     2 1_B      
## 3     3 2_A      
## 4     4 2_B
tibble(Item=1:4, Condition=c("1_A","1_B","2_A","2_B")) %>% 
  separate(Condition,c("Number","Alphabet"),sep="_") # separate Condition column into number and alphabet
## # A tibble: 4 × 3
##    Item Number Alphabet
##   <int> <chr>  <chr>   
## 1     1 1      A       
## 2     2 1      B       
## 3     3 2      A       
## 4     4 2      B
Change variable names (new_name = old_name) (package: dplyr)
new.dat = ChickWeight %>% rename(time = Time, chick = Chick, diet = Diet)
colnames(new.dat)
## [1] "weight" "time"   "chick"  "diet"
Change all variable names
colnames(ChickWeight) = c("Weight","Time","Chick","Diet")
Replace factor values/names (package: dplyr)
summary(ChickWeight$Diet) 
##   1   2   3   4 
## 220 120 120 118
new.dat = ChickWeight %>% mutate(Diet=recode(Diet,'1'='A','2'='B','3'='C','4'='D'))
summary(new.dat$Diet) 
##   A   B   C   D 
## 220 120 120 118
Reorder factor levels (package: forcats)
summary(ChickWeight$Diet) 
##   1   2   3   4 
## 220 120 120 118
new.dat = ChickWeight %>% mutate(Diet=fct_relevel(Diet,c('4','3','2','1')))
summary(new.dat$Diet) 
##   4   3   2   1 
## 118 120 120 220
Add/Replace values conditionally (package: dplyr)
new.dat = ChickWeight %>% mutate(weight_code = if_else(weight<60, 'below_60', '60_or_above')) # add a column `weight_code`
head(new.dat)
## Grouped Data: weight ~ Time | Chick
##   weight Time Chick Diet weight_code
## 1     42    0     1    1    below_60
## 2     51    2     1    1    below_60
## 3     59    4     1    1    below_60
## 4     64    6     1    1 60_or_above
## 5     76    8     1    1 60_or_above
## 6     93   10     1    1 60_or_above
new.dat = ChickWeight %>% mutate(weight_code = case_when(
  weight<50 ~ 'below_50',
  between(weight,50,70) ~ 'between_50_and_70',
  weight>70 ~ 'above_70'
)) # vectorise multiple `if_else` statements
head(new.dat)
## Grouped Data: weight ~ Time | Chick
##   weight Time Chick Diet       weight_code
## 1     42    0     1    1          below_50
## 2     51    2     1    1 between_50_and_70
## 3     59    4     1    1 between_50_and_70
## 4     64    6     1    1 between_50_and_70
## 5     76    8     1    1          above_70
## 6     93   10     1    1          above_70
Compute lagged or leading values (package: dplyr)
new.dat = ChickWeight %>% 
  mutate(previous_weight = lag(weight,1), # look one row ahead
         future_weight = lead(weight,1)) # look one row behind
head(new.dat) 
## Grouped Data: weight ~ Time | Chick
##   weight Time Chick Diet previous_weight future_weight
## 1     42    0     1    1              NA            51
## 2     51    2     1    1              42            59
## 3     59    4     1    1              51            64
## 4     64    6     1    1              59            76
## 5     76    8     1    1              64            93
## 6     93   10     1    1              76           106
Remove duplicates
new.dat = ChickWeight %>% distinct()

Note: dplyr chains can be connected. For example:

new.dat = ChickWeight %>% select(-Diet) %>% mutate_at(vars(Chick), as.factor) %>% filter(weight > 60)

Deal with NAs

Put NA for certain values in column A based on the values in column B (e.g., if Time equals 2, put NA to weight) (package: dplyr)
new.dat = ChickWeight %>% mutate(weight = replace(weight, which(Time==2), NA))
head(new.dat)
## Grouped Data: weight ~ Time | Chick
##   weight Time Chick Diet
## 1     42    0     1    1
## 2     NA    2     1    1
## 3     59    4     1    1
## 4     64    6     1    1
## 5     76    8     1    1
## 6     93   10     1    1
Put NA for values that deviate more than 2.5 SDs from the group mean (e.g, NA if weight deviates more than 2.5 SDs from the mean of the Diet group) (package: dplyr)
new.dat = ChickWeight %>% group_by(Diet) %>% mutate(zWeight = scale(weight), weight = replace(weight, which(!between(zWeight, -2.5, 2.5)), NA)) 

new.dat %>% filter(is.na(weight)) # check rows with `NA`
## # A tibble: 13 × 5
## # Groups:   Diet [4]
##    weight  Time Chick Diet  zWeight[,1]
##     <dbl> <dbl> <ord> <fct>       <dbl>
##  1     NA    18 7     1            2.60
##  2     NA    20 7     1            3.27
##  3     NA    21 7     1            3.57
##  4     NA    18 14    1            2.57
##  5     NA    20 14    1            2.76
##  6     NA    21 14    1            2.88
##  7     NA    18 21    2            2.57
##  8     NA    20 21    2            2.73
##  9     NA    21 21    2            2.91
## 10     NA    21 29    2            2.60
## 11     NA    20 35    3            2.52
## 12     NA    21 35    3            2.66
## 13     NA    21 48    4            2.71
Compute the proportion of NA
sum(is.na(new.dat$weight))/length(new.dat$weight)
## [1] 0.02249135
Drop rows containing NA (package: tidyr)
dat.na.dropped = new.dat %>% drop_na(weight) # drop row if the column weight contains NA
dat.na.dropped = new.dat %>% drop_na() # drop row if any of the column contains NA
dat.na.dropped = new.dat %>% na.omit() # drop row if any of the column contains NA (using the R base 'stats' package)
Replace NA with 0 across multiple columns
dat.na.replaced = new.dat %>% mutate_at(vars(weight,Time), ~replace_na(., 0)) # replace NA in 'weight' and 'Time' columns with 0

dat.na.replaced = new.dat %>% mutate_all(~replace_na(., 0)) # replace NA in all columns with 0

Plot

Histogram (package: ggplot2)
ggplot(ChickWeight, aes(x=weight)) + geom_histogram()

Correlation plot (package: ggplot2)
ggplot(ChickWeight, aes(x=Time, y=weight)) +
  geom_point(shape=1) + 
  geom_smooth(method=lm) +
  xlab('Time') +
  ylab('Weight')

Group by Diet

ggplot(ChickWeight, aes(x=Time, y=weight)) +
  geom_point(shape=1) + 
  geom_smooth(method=lm) +
  xlab('Time') +
  ylab('Weight') +
  facet_wrap(vars(Diet), ncol=2)

Bar graph with error bars (with 95% CI) (package: ggplot2)

First, create a summarised data frame for plotting (package: Rmisc)

plot.dat = summarySE(ChickWeight, measurevar="weight", groupvars=c("Diet"),na.rm=T) 
plot.dat # check data
##   Diet   N   weight       sd       se        ci
## 1    1 220 102.6455 56.65655 3.819784  7.528242
## 2    2 120 122.6167 71.60749 6.536840 12.943596
## 3    3 120 142.9500 86.54176 7.900146 15.643078
## 4    4 118 135.2627 68.82871 6.336197 12.548506

Plot

ggplot(plot.dat, aes(x=Diet,y=weight,fill=Diet)) + 
  theme_bw() + # optional theme
geom_bar(position=position_dodge(),stat="identity",colour="black",size=.4,alpha=.8) +
  geom_errorbar(aes(ymin=weight-ci,ymax=weight+ci),size=.4,width=.3) +
  scale_fill_manual('Diet',values=c('red','royalblue','deeppink','darkgrey')) +
  xlab("Diet") +
  ylab("Weight") 

Tables

Summary table of a dataset (package: knitr)
knitr::kable(summary(sleepstudy)) # 'sleepstudy' data from the 'lme4' package
Reaction Days Subject
Min. :194.3 Min. :0.0 308 : 10
1st Qu.:255.4 1st Qu.:2.0 309 : 10
Median :288.7 Median :4.5 310 : 10
Mean :298.5 Mean :4.5 330 : 10
3rd Qu.:336.8 3rd Qu.:7.0 331 : 10
Max. :466.4 Max. :9.0 332 : 10
NA NA (Other):120

Frequency table (package: dplyr)

ChickWeight %>% count(Time, Diet) %>% 
  mutate(prop = prop.table(n))
##    Time Diet  n       prop
## 1     0    1 20 0.03460208
## 2     0    2 10 0.01730104
## 3     0    3 10 0.01730104
## 4     0    4 10 0.01730104
## 5     2    1 20 0.03460208
## 6     2    2 10 0.01730104
## 7     2    3 10 0.01730104
## 8     2    4 10 0.01730104
## 9     4    1 19 0.03287197
## 10    4    2 10 0.01730104
## 11    4    3 10 0.01730104
## 12    4    4 10 0.01730104
## 13    6    1 19 0.03287197
## 14    6    2 10 0.01730104
## 15    6    3 10 0.01730104
## 16    6    4 10 0.01730104
## 17    8    1 19 0.03287197
## 18    8    2 10 0.01730104
## 19    8    3 10 0.01730104
## 20    8    4 10 0.01730104
## 21   10    1 19 0.03287197
## 22   10    2 10 0.01730104
## 23   10    3 10 0.01730104
## 24   10    4 10 0.01730104
## 25   12    1 19 0.03287197
## 26   12    2 10 0.01730104
## 27   12    3 10 0.01730104
## 28   12    4 10 0.01730104
## 29   14    1 18 0.03114187
## 30   14    2 10 0.01730104
## 31   14    3 10 0.01730104
## 32   14    4 10 0.01730104
## 33   16    1 17 0.02941176
## 34   16    2 10 0.01730104
## 35   16    3 10 0.01730104
## 36   16    4 10 0.01730104
## 37   18    1 17 0.02941176
## 38   18    2 10 0.01730104
## 39   18    3 10 0.01730104
## 40   18    4 10 0.01730104
## 41   20    1 17 0.02941176
## 42   20    2 10 0.01730104
## 43   20    3 10 0.01730104
## 44   20    4  9 0.01557093
## 45   21    1 16 0.02768166
## 46   21    2 10 0.01730104
## 47   21    3 10 0.01730104
## 48   21    4  9 0.01557093
ChickWeight %>% count(Diet) %>% 
  mutate(prop = prop.table(n))
##   Diet   n      prop
## 1    1 220 0.3806228
## 2    2 120 0.2076125
## 3    3 120 0.2076125
## 4    4 118 0.2041522
Linear-mixed effects model output (parameter estimates, standard errors & t-statistics) (package: knitr)
example.model = lmer(Reaction ~ Days + (Days|Subject), sleepstudy) # run a linear-mixed effects model
summary(example.model) # full summary 
## Linear mixed model fit by REML ['lmerMod']
## Formula: Reaction ~ Days + (Days | Subject)
##    Data: sleepstudy
## 
## REML criterion at convergence: 1743.6
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.9536 -0.4634  0.0231  0.4634  5.1793 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr
##  Subject  (Intercept) 612.10   24.741       
##           Days         35.07    5.922   0.07
##  Residual             654.94   25.592       
## Number of obs: 180, groups:  Subject, 18
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  251.405      6.825  36.838
## Days          10.467      1.546   6.771
## 
## Correlation of Fixed Effects:
##      (Intr)
## Days -0.138
coef.table = coef(summary(example.model)) # extract the coefficient table

rownames(coef.table) = c("Intercept","Days") # rename rows (if you want)

knitr::kable(coef.table, digits=2) # round off to 2 decimal places 
Estimate Std. Error t value
Intercept 251.41 6.82 36.84
Days 10.47 1.55 6.77

Special characters

Convert Japanese full-width characters (Zenkaku) into half-width (Hankaku) (package: stringi)
Age = c('21','18','19','31','29')
stri_trans_nfkc(Age)
## [1] "21" "18" "19" "31" "29"
Remove/Change characters (package: stringr)
example = c('high (not sure)','high','Low','High','low?','medium','medium/low')
str_replace_all(example,c('\\/.*'='', # remove all characters after '/'
                          '\\(.*'='', # remove all characters after '('
                          '[[:punct:]]'='')) # remove all special characters
## [1] "high "  "high"   "Low"    "High"   "low"    "medium" "medium"
str_to_lower(example) # convert string to lowercase
## [1] "high (not sure)" "high"            "low"             "high"            "low?"            "medium"         
## [7] "medium/low"
str_to_upper(example) # convert string to uppercase
## [1] "HIGH (NOT SURE)" "HIGH"            "LOW"             "HIGH"            "LOW?"            "MEDIUM"         
## [7] "MEDIUM/LOW"

Note: The above codes can be embedded in dplyr chain

dat = dat %>% mutate(Age=stri_trans_nfkc(Age), Values=str_replace_all(Values,'[[:punct:]]'=''))