The packages used for this script:
require(Rmisc)
require(tidyverse)
require(ggplot2)
require(lme4)
require(stringi)
require(stringdist)
options(scipen=999)
rm(list = ls())
install.packages('tidyverse')
install.packages(c('tidyverse','lme4'))
require(tidyverse)
R.version.string
## [1] "R version 4.3.3 (2024-02-29 ucrt)"
packageVersion('tidyverse') # for 'tidyverse' package
installed.packages()[names(sessionInfo()$otherPkgs), "Version"] # for all the loaded packages
This reports on objects that exist with the same name in two or more places. Useful for finding an object that may be masking any functions.
conflicts()
dat = read_csv('my_data.csv')
dat = read_delim('my_data.txt', delim="\t")
write_delim(data_frame, 'file_name.txt', delim='\t')
write_csv(data_frame, 'file_name.csv')
write_excel_csv(data_frame, 'file_name.csv')
All columns are encoded as UTF-8 (useful for exporting Japanese
characters). write_excel_csv()
also includes a UTF-8 Byte
order mark.
save(my.data, file='file_name.rds')
save(list=ls(), file="file_name.rds")
save(list=Filter(function(x)'lmerMod'%in%class(get(x)),ls()),file=paste0('models_',substr(date(),9,10),substr(date(),5,7),substr(date(),21,24),'.rds'))
load('file_name.rds')
I use a built-in dataset ChickWeight
in some examples.
The data frame has 578 rows and 4 columns from an experiment on the
effect of diet on early growth of chicks.
First 6 rows of the data:
weight | Time | Chick | Diet |
---|---|---|---|
42 | 0 | 1 | 1 |
51 | 2 | 1 | 1 |
59 | 4 | 1 | 1 |
64 | 6 | 1 | 1 |
76 | 8 | 1 | 1 |
93 | 10 | 1 | 1 |
Column | Description |
---|---|
weight | a numeric vector giving the body weight of the chick (gm). |
Time | a numeric vector giving the number of days since birth when the measurement was made. |
Chick | an ordered factor with levels 18 < … < 48 giving a unique identifier for the chick. The ordering of the levels groups chicks on the same diet together and orders them according to their final weight (lightest to heaviest) within diet. |
Diet | a factor with levels 1, …, 4 indicating which experimental diet the chick received. |
summary(ChickWeight)
## weight Time Chick Diet
## Min. : 35.0 Min. : 0.00 13 : 12 1:220
## 1st Qu.: 63.0 1st Qu.: 4.00 9 : 12 2:120
## Median :103.0 Median :10.00 20 : 12 3:120
## Mean :121.8 Mean :10.72 10 : 12 4:118
## 3rd Qu.:163.8 3rd Qu.:16.00 17 : 12
## Max. :373.0 Max. :21.00 19 : 12
## (Other):506
summarySE(ChickWeight, measurevar="weight", groupvars=c("Diet"),na.rm=T) # summary of weight for Diet
ChickWeight %>% tidyr::expand(crossing(Chick,Diet)) # list all possible combinations
ChickWeight %>% tidyr::expand(nesting(Chick,Diet)) # list combinations that are present in the data
ChickWeight %>% distinct(Chick,Diet)
new.dat = ChickWeight %>% mutate_at(vars(Chick), as.factor) # convert column 'Chick' to factor
new.dat = ChickWeight %>% mutate_all(funs(as.factor)) # convert all columns to factor
new.dat = ChickWeight %>% mutate_at(vars(Time), as.numeric) # convert 'Time' to numeric
new.dat = ChickWeight %>% filter(Diet %in% c(1,3)) # keep only the observations for which the values of the variable 'Diet' is 1 or 3
new.dat = ChickWeight %>% filter(Diet!=1) # keep only the observations for which the values of the variable 'Diet' is NOT 1
new.dat = ChickWeight %>% filter(!(Diet %in% c(1,3))) # keep only the observations for which the values of the variable 'Diet' is NOT 1 or 3
new.dat = ChickWeight %>% filter(Diet %in% c(1,3), weight > 60) # keep only the observations for which the values of the variable 'Diet' is 1 or 3 and the values for the variable 'weight' is greater than 60
new.dat = ChickWeight %>% slice(n()) # select the last row
new.dat = ChickWeight %>% slice(3:n()) # select all rows except for the first two
new.dat = ChickWeight %>% slice_head(n=3) # select the first three rows
new.dat = ChickWeight %>% select(weight, Time) # keep only the columns 'weight' and 'Time'
new.dat = ChickWeight %>% select(-Diet) # drop the column 'Diet'
new.dat = ChickWeight %>% select(-(last_col(offset=2):last_col())) # drop the last 3 columns
(e.g., Time
will spread across column names)
longer.dat = ChickWeight %>% select(-Diet)
head(longer.dat) # data frame before conversion
wider.dat = longer.dat %>% pivot_wider(names_from = Time, values_from = weight)
head(wider.dat) # data frame after conversion
(e.g., ‘Time’ becomes a column name)
head(wider.dat) # data frame before conversion (from above)
longer.dat = wider.dat %>% pivot_longer(cols=!Chick, names_to = "Time", values_to = "weight")
head(longer.dat) # data frame after conversion
(Text to Columns in Excel)
tibble(Item=1:4, Condition=c("1_A","1_B","2_A","2_B")) # example data
tibble(Item=1:4, Condition=c("1_A","1_B","2_A","2_B")) %>%
separate(Condition,c("Number","Alphabet"),sep="_") # separate Condition column into number and alphabet
(new_name = old_name)
new.dat = ChickWeight %>% rename(time = Time, chick = Chick, diet = Diet)
colnames(new.dat)
## [1] "weight" "time" "chick" "diet"
colnames(ChickWeight) = c("Weight","Time","Chick","Diet")
summary(ChickWeight$Diet)
## 1 2 3 4
## 220 120 120 118
new.dat = ChickWeight %>% mutate(Diet=recode(Diet,'1'='A','2'='B','3'='C','4'='D'))
summary(new.dat$Diet)
## A B C D
## 220 120 120 118
summary(ChickWeight$Diet)
## 1 2 3 4
## 220 120 120 118
new.dat = ChickWeight %>% mutate(Diet=fct_relevel(Diet,c('4','3','2','1')))
summary(new.dat$Diet)
## 4 3 2 1
## 118 120 120 220
new.dat = ChickWeight %>% mutate(weight_code = if_else(weight<60, 'below_60', '60_or_above')) # add a column `weight_code`
head(new.dat)
new.dat = ChickWeight %>% mutate(weight_code = case_when(
weight<50 ~ 'below_50',
between(weight,50,70) ~ 'between_50_and_70',
weight>70 ~ 'above_70'
)) # vectorise multiple `if_else` statements
head(new.dat)
new.dat = ChickWeight %>%
mutate(previous_weight = lag(weight,1), # look one row ahead
future_weight = lead(weight,1)) # look one row behind
head(new.dat)
new.dat = ChickWeight %>% distinct()
Note: dplyr
chains can be connected.
For example:
new.dat = ChickWeight %>% select(-Diet) %>% mutate_at(vars(Chick), as.factor) %>% filter(weight > 60)
NA
for certain values in column A based on the
values in column B(e.g., if Time
equals 2, put NA
to
weight
)
new.dat = ChickWeight %>% mutate(weight = replace(weight, which(Time==2), NA))
head(new.dat)
NA
for values that deviate more than 2.5 SDs from
the group mean(e.g, NA
if weight
deviates more than 2.5
SDs from the mean of the Diet
group) (package:
dplyr
)
new.dat = ChickWeight %>% group_by(Diet) %>% mutate(zWeight = scale(weight), weight = replace(weight, which(!between(zWeight, -2.5, 2.5)), NA)) %>% ungroup()
new.dat %>% filter(is.na(weight)) # check rows with `NA`
sum(is.na(new.dat$weight))/length(new.dat$weight)
## [1] 0.02249135
new.dat %>% summarise(na_sum=sum(is.na(weight)), na_prop=sum(is.na(weight))/length(weight)) # count & proportion
new.dat %>% group_by(Diet) %>% summarise(na_sum=sum(is.na(weight)), na_prop=sum(is.na(weight))/length(weight))
dat.na.dropped = new.dat %>% drop_na(weight) # drop row if the column weight contains NA
dat.na.dropped = new.dat %>% drop_na() # drop row if any of the column contains NA
dat.na.dropped = new.dat %>% na.omit() # drop row if any of the column contains NA (using the R base 'stats' package)
dat.na.replaced = new.dat %>% mutate_at(vars(weight,Time), ~replace_na(., 0)) # replace NA in 'weight' and 'Time' columns with 0
dat.na.replaced = new.dat %>% mutate_all(~replace_na(., 0)) # replace NA in all columns with 0
ggplot(ChickWeight, aes(x=weight)) + geom_histogram()
ggplot(ChickWeight, aes(x=Time, y=weight)) +
geom_point(shape=1) +
geom_smooth(method=lm) +
xlab('Time') +
ylab('Weight')
Group by Diet
ggplot(ChickWeight, aes(x=Time, y=weight)) +
geom_point(shape=1) +
geom_smooth(method=lm) +
xlab('Time') +
ylab('Weight') +
facet_wrap(vars(Diet), ncol=2)
First, create a summarised data frame for plotting (package:
Rmisc
)
plot.dat = summarySE(ChickWeight, measurevar="weight", groupvars=c("Diet"),na.rm=T)
plot.dat # check data
Plot
ggplot(plot.dat, aes(x=Diet,y=weight,fill=Diet)) +
theme_bw() + # optional theme
geom_bar(position=position_dodge(),stat="identity",colour="black",size=.4,alpha=.8) +
geom_errorbar(aes(ymin=weight-ci,ymax=weight+ci),size=.4,width=.3) +
scale_fill_manual('Diet',values=c('red','royalblue','deeppink','darkgrey')) +
xlab("Diet") +
ylab("Weight")
knitr::kable(summary(sleepstudy)) # 'sleepstudy' data from the 'lme4' package
Reaction | Days | Subject | |
---|---|---|---|
Min. :194.3 | Min. :0.0 | 308 : 10 | |
1st Qu.:255.4 | 1st Qu.:2.0 | 309 : 10 | |
Median :288.7 | Median :4.5 | 310 : 10 | |
Mean :298.5 | Mean :4.5 | 330 : 10 | |
3rd Qu.:336.8 | 3rd Qu.:7.0 | 331 : 10 | |
Max. :466.4 | Max. :9.0 | 332 : 10 | |
NA | NA | (Other):120 |
ChickWeight %>% count(Time, Diet) %>%
mutate(prop = prop.table(n))
ChickWeight %>% count(Diet) %>%
mutate(prop = prop.table(n))
(parameter estimates, standard errors & t-statistics)
example.model = lmer(Reaction ~ Days + (Days|Subject), sleepstudy) # run a linear-mixed effects model
summary(example.model) # full summary
## Linear mixed model fit by REML ['lmerMod']
## Formula: Reaction ~ Days + (Days | Subject)
## Data: sleepstudy
##
## REML criterion at convergence: 1743.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.9536 -0.4634 0.0231 0.4634 5.1793
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## Subject (Intercept) 612.10 24.741
## Days 35.07 5.922 0.07
## Residual 654.94 25.592
## Number of obs: 180, groups: Subject, 18
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 251.405 6.825 36.838
## Days 10.467 1.546 6.771
##
## Correlation of Fixed Effects:
## (Intr)
## Days -0.138
coef.table = coef(summary(example.model)) # extract the coefficient table
rownames(coef.table) = c("Intercept","Days") # rename rows (if you want)
knitr::kable(coef.table, digits=2) # round off to 2 decimal places
Estimate | Std. Error | t value | |
---|---|---|---|
Intercept | 251.41 | 6.82 | 36.84 |
Days | 10.47 | 1.55 | 6.77 |
str_trim(" text with space ", side="both") # default is "both"
## [1] "text with space"
str_trim(" text with space ", side="left")
## [1] "text with space "
str_trim(" text with space ", side="right")
## [1] " text with space"
Age = c('21','18','19','31','29')
stri_trans_nfkc(Age)
## [1] "21" "18" "19" "31" "29"
example = c('high (not sure)','high','Low','High','low?','medium','medium/low')
str_replace_all(example,c('\\/.*'='', # remove all characters after '/'
'\\(.*'='', # remove all characters after '('
'[[:punct:]]'='')) # remove all special characters
## [1] "high " "high" "Low" "High" "low" "medium" "medium"
str_to_lower(example) # convert string to lowercase
## [1] "high (not sure)" "high" "low" "high" "low?" "medium"
## [7] "medium/low"
str_to_upper(example) # convert string to uppercase
## [1] "HIGH (NOT SURE)" "HIGH" "LOW" "HIGH" "LOW?" "MEDIUM"
## [7] "MEDIUM/LOW"
Note: The above codes can be embedded in
dplyr
chain
dat = dat %>% mutate(Age=stri_trans_nfkc(Age), Values=str_replace_all(Values,'[[:punct:]]'=''))
stringdist("cat", "cap", method = "lv")
## [1] 1