Set-up

The packages used for this script:

require(Rmisc)
require(tidyverse)
require(ggplot2)
require(lme4)
require(stringi)
require(stringdist)
Avoid scientific notation
options(scipen=999)
Clear the workspace
rm(list = ls())

Packages

Install a package/ packages
install.packages('tidyverse')
install.packages(c('tidyverse','lme4'))
Load a package
require(tidyverse)
Check R version
R.version.string
## [1] "R version 4.3.3 (2024-02-29 ucrt)"
Check package version
packageVersion('tidyverse') # for 'tidyverse' package
 
installed.packages()[names(sessionInfo()$otherPkgs), "Version"] # for all the loaded packages
Check conflicts

This reports on objects that exist with the same name in two or more places. Useful for finding an object that may be masking any functions.

conflicts()

Import/ Export data

Read a cvs file
dat = read_csv('my_data.csv') 
Read a tab-delimited text file
dat = read_delim('my_data.txt', delim="\t") 
Write a data frame to a tab-delimited text file
write_delim(data_frame, 'file_name.txt',  delim='\t')
Write a data frame to a csv file
write_csv(data_frame, 'file_name.csv')
write_excel_csv(data_frame, 'file_name.csv')

All columns are encoded as UTF-8 (useful for exporting Japanese characters). write_excel_csv() also includes a UTF-8 Byte order mark.

Save data as an .rds file
save(my.data, file='file_name.rds')
Save all data in the environment as an .rds file
save(list=ls(), file="file_name.rds")
Save all ‘lmerMod’ objects (LME model output) as an .rds file with today’s date (e.g., 30Jun2022)
save(list=Filter(function(x)'lmerMod'%in%class(get(x)),ls()),file=paste0('models_',substr(date(),9,10),substr(date(),5,7),substr(date(),21,24),'.rds'))
Load an .rds file
load('file_name.rds')

Check data

I use a built-in dataset ChickWeight in some examples. The data frame has 578 rows and 4 columns from an experiment on the effect of diet on early growth of chicks.

First 6 rows of the data:

weight Time Chick Diet
42 0 1 1
51 2 1 1
59 4 1 1
64 6 1 1
76 8 1 1
93 10 1 1
Column Description
weight a numeric vector giving the body weight of the chick (gm).
Time a numeric vector giving the number of days since birth when the measurement was made.
Chick an ordered factor with levels 18 < … < 48 giving a unique identifier for the chick. The ordering of the levels groups chicks on the same diet together and orders them according to their final weight (lightest to heaviest) within diet.
Diet a factor with levels 1, …, 4 indicating which experimental diet the chick received.
Summarise a data frame
summary(ChickWeight)
##      weight           Time           Chick     Diet   
##  Min.   : 35.0   Min.   : 0.00   13     : 12   1:220  
##  1st Qu.: 63.0   1st Qu.: 4.00   9      : 12   2:120  
##  Median :103.0   Median :10.00   20     : 12   3:120  
##  Mean   :121.8   Mean   :10.72   10     : 12   4:118  
##  3rd Qu.:163.8   3rd Qu.:16.00   17     : 12          
##  Max.   :373.0   Max.   :21.00   19     : 12          
##                                  (Other):506
Get count, mean, SD, SE, and 95% CI for each condition/group
summarySE(ChickWeight, measurevar="weight", groupvars=c("Diet"),na.rm=T) # summary of weight for Diet
List possible combinations of two columns
ChickWeight %>% tidyr::expand(crossing(Chick,Diet)) # list all possible combinations
ChickWeight %>% tidyr::expand(nesting(Chick,Diet)) # list combinations that are present in the data
Show unique values for specified columns
ChickWeight %>% distinct(Chick,Diet)

Modify a data frame

Convert a column/ columns into factor/numeric
new.dat = ChickWeight %>% mutate_at(vars(Chick), as.factor) # convert column 'Chick' to factor
new.dat = ChickWeight %>% mutate_all(funs(as.factor)) # convert all columns to factor
new.dat = ChickWeight %>% mutate_at(vars(Time), as.numeric) # convert 'Time' to numeric
Filter observations
new.dat = ChickWeight %>% filter(Diet %in% c(1,3))  # keep only the observations for which the values of the variable 'Diet' is 1 or 3
new.dat = ChickWeight %>% filter(Diet!=1)  # keep only the observations for which the values of the variable 'Diet' is NOT 1 
new.dat = ChickWeight %>% filter(!(Diet %in% c(1,3)))  # keep only the observations for which the values of the variable 'Diet' is NOT 1 or 3
new.dat = ChickWeight %>% filter(Diet %in% c(1,3), weight > 60)  # keep only the observations for which the values of the variable 'Diet' is 1 or 3 and the values for the variable 'weight' is greater than 60
Select rows
new.dat = ChickWeight %>% slice(n())  # select the last row
new.dat = ChickWeight %>% slice(3:n())  # select all rows except for the first two
new.dat = ChickWeight %>% slice_head(n=3)  # select the first three rows
Select columns
new.dat = ChickWeight %>% select(weight, Time)  # keep only the columns 'weight' and 'Time'
new.dat = ChickWeight %>% select(-Diet)  # drop the column 'Diet'
Select columns
new.dat = ChickWeight %>% select(-(last_col(offset=2):last_col()))  # drop the last 3 columns
Convert data to a wider format

(e.g., Time will spread across column names)

longer.dat = ChickWeight %>% select(-Diet) 
head(longer.dat) # data frame before conversion
wider.dat = longer.dat %>% pivot_wider(names_from = Time, values_from = weight)
head(wider.dat) # data frame after conversion
Convert data to a longer format

(e.g., ‘Time’ becomes a column name)

head(wider.dat) # data frame before conversion (from above)
longer.dat = wider.dat %>% pivot_longer(cols=!Chick, names_to = "Time", values_to = "weight")
head(longer.dat) # data frame after conversion
Separate a text column to multiple columns

(Text to Columns in Excel)

tibble(Item=1:4, Condition=c("1_A","1_B","2_A","2_B")) # example data
tibble(Item=1:4, Condition=c("1_A","1_B","2_A","2_B")) %>% 
  separate(Condition,c("Number","Alphabet"),sep="_") # separate Condition column into number and alphabet
Change variable names

(new_name = old_name)

new.dat = ChickWeight %>% rename(time = Time, chick = Chick, diet = Diet)
colnames(new.dat)
## [1] "weight" "time"   "chick"  "diet"
Change all variable names
colnames(ChickWeight) = c("Weight","Time","Chick","Diet")
Replace factor values/names
summary(ChickWeight$Diet) 
##   1   2   3   4 
## 220 120 120 118
new.dat = ChickWeight %>% mutate(Diet=recode(Diet,'1'='A','2'='B','3'='C','4'='D'))
summary(new.dat$Diet) 
##   A   B   C   D 
## 220 120 120 118
Reorder factor levels
summary(ChickWeight$Diet) 
##   1   2   3   4 
## 220 120 120 118
new.dat = ChickWeight %>% mutate(Diet=fct_relevel(Diet,c('4','3','2','1')))
summary(new.dat$Diet) 
##   4   3   2   1 
## 118 120 120 220
Add/Replace values conditionally
new.dat = ChickWeight %>% mutate(weight_code = if_else(weight<60, 'below_60', '60_or_above')) # add a column `weight_code`
head(new.dat)
new.dat = ChickWeight %>% mutate(weight_code = case_when(
  weight<50 ~ 'below_50',
  between(weight,50,70) ~ 'between_50_and_70',
  weight>70 ~ 'above_70'
)) # vectorise multiple `if_else` statements
head(new.dat)
Compute lagged or leading values
new.dat = ChickWeight %>% 
  mutate(previous_weight = lag(weight,1), # look one row ahead
         future_weight = lead(weight,1)) # look one row behind
head(new.dat) 
Remove duplicates
new.dat = ChickWeight %>% distinct()

Note: dplyr chains can be connected. For example:

new.dat = ChickWeight %>% select(-Diet) %>% mutate_at(vars(Chick), as.factor) %>% filter(weight > 60)

Deal with NAs

Put NA for certain values in column A based on the values in column B

(e.g., if Time equals 2, put NA to weight)

new.dat = ChickWeight %>% mutate(weight = replace(weight, which(Time==2), NA))
head(new.dat)
Put NA for values that deviate more than 2.5 SDs from the group mean

(e.g, NA if weight deviates more than 2.5 SDs from the mean of the Diet group) (package: dplyr)

new.dat = ChickWeight %>% group_by(Diet) %>% mutate(zWeight = scale(weight), weight = replace(weight, which(!between(zWeight, -2.5, 2.5)), NA)) %>% ungroup()

new.dat %>% filter(is.na(weight)) # check rows with `NA`
Compute the proportion of NA
sum(is.na(new.dat$weight))/length(new.dat$weight)
## [1] 0.02249135
new.dat %>% summarise(na_sum=sum(is.na(weight)), na_prop=sum(is.na(weight))/length(weight)) # count & proportion
Compute the proportion of NA per group
new.dat %>% group_by(Diet) %>% summarise(na_sum=sum(is.na(weight)), na_prop=sum(is.na(weight))/length(weight))
Drop rows containing NA
dat.na.dropped = new.dat %>% drop_na(weight) # drop row if the column weight contains NA
dat.na.dropped = new.dat %>% drop_na() # drop row if any of the column contains NA
dat.na.dropped = new.dat %>% na.omit() # drop row if any of the column contains NA (using the R base 'stats' package)
Replace NA with 0 across multiple columns
dat.na.replaced = new.dat %>% mutate_at(vars(weight,Time), ~replace_na(., 0)) # replace NA in 'weight' and 'Time' columns with 0

dat.na.replaced = new.dat %>% mutate_all(~replace_na(., 0)) # replace NA in all columns with 0

Plot

Histogram
ggplot(ChickWeight, aes(x=weight)) + geom_histogram()

Correlation plot
ggplot(ChickWeight, aes(x=Time, y=weight)) +
  geom_point(shape=1) + 
  geom_smooth(method=lm) +
  xlab('Time') +
  ylab('Weight')

Group by Diet

ggplot(ChickWeight, aes(x=Time, y=weight)) +
  geom_point(shape=1) + 
  geom_smooth(method=lm) +
  xlab('Time') +
  ylab('Weight') +
  facet_wrap(vars(Diet), ncol=2)

Bar graph with error bars (with 95% CI)

First, create a summarised data frame for plotting (package: Rmisc)

plot.dat = summarySE(ChickWeight, measurevar="weight", groupvars=c("Diet"),na.rm=T) 
plot.dat # check data

Plot

ggplot(plot.dat, aes(x=Diet,y=weight,fill=Diet)) + 
  theme_bw() + # optional theme
geom_bar(position=position_dodge(),stat="identity",colour="black",size=.4,alpha=.8) +
  geom_errorbar(aes(ymin=weight-ci,ymax=weight+ci),size=.4,width=.3) +
  scale_fill_manual('Diet',values=c('red','royalblue','deeppink','darkgrey')) +
  xlab("Diet") +
  ylab("Weight") 

Tables

Summary table of a dataset
knitr::kable(summary(sleepstudy)) # 'sleepstudy' data from the 'lme4' package
Reaction Days Subject
Min. :194.3 Min. :0.0 308 : 10
1st Qu.:255.4 1st Qu.:2.0 309 : 10
Median :288.7 Median :4.5 310 : 10
Mean :298.5 Mean :4.5 330 : 10
3rd Qu.:336.8 3rd Qu.:7.0 331 : 10
Max. :466.4 Max. :9.0 332 : 10
NA NA (Other):120

Frequency table

ChickWeight %>% count(Time, Diet) %>% 
  mutate(prop = prop.table(n))
ChickWeight %>% count(Diet) %>% 
  mutate(prop = prop.table(n))
Linear-mixed effects model output

(parameter estimates, standard errors & t-statistics)

example.model = lmer(Reaction ~ Days + (Days|Subject), sleepstudy) # run a linear-mixed effects model
summary(example.model) # full summary 
## Linear mixed model fit by REML ['lmerMod']
## Formula: Reaction ~ Days + (Days | Subject)
##    Data: sleepstudy
## 
## REML criterion at convergence: 1743.6
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.9536 -0.4634  0.0231  0.4634  5.1793 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr
##  Subject  (Intercept) 612.10   24.741       
##           Days         35.07    5.922   0.07
##  Residual             654.94   25.592       
## Number of obs: 180, groups:  Subject, 18
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  251.405      6.825  36.838
## Days          10.467      1.546   6.771
## 
## Correlation of Fixed Effects:
##      (Intr)
## Days -0.138
coef.table = coef(summary(example.model)) # extract the coefficient table

rownames(coef.table) = c("Intercept","Days") # rename rows (if you want)

knitr::kable(coef.table, digits=2) # round off to 2 decimal places 
Estimate Std. Error t value
Intercept 251.41 6.82 36.84
Days 10.47 1.55 6.77

Deal with characters

Remove whitespace
str_trim(" text with space ", side="both") # default is "both"
## [1] "text with space"
str_trim(" text with space ", side="left")
## [1] "text with space "
str_trim(" text with space ", side="right")
## [1] " text with space"
Convert Japanese full-width characters (Zenkaku) into half-width (Hankaku)
Age = c('21','18','19','31','29')
stri_trans_nfkc(Age)
## [1] "21" "18" "19" "31" "29"
Remove/Change characters
example = c('high (not sure)','high','Low','High','low?','medium','medium/low')
str_replace_all(example,c('\\/.*'='', # remove all characters after '/'
                          '\\(.*'='', # remove all characters after '('
                          '[[:punct:]]'='')) # remove all special characters
## [1] "high "  "high"   "Low"    "High"   "low"    "medium" "medium"
str_to_lower(example) # convert string to lowercase
## [1] "high (not sure)" "high"            "low"             "high"            "low?"            "medium"         
## [7] "medium/low"
str_to_upper(example) # convert string to uppercase
## [1] "HIGH (NOT SURE)" "HIGH"            "LOW"             "HIGH"            "LOW?"            "MEDIUM"         
## [7] "MEDIUM/LOW"

Note: The above codes can be embedded in dplyr chain

dat = dat %>% mutate(Age=stri_trans_nfkc(Age), Values=str_replace_all(Values,'[[:punct:]]'=''))
Compute Levenshtein distance
stringdist("cat", "cap", method = "lv")
## [1] 1