####################################################################################
# This is a file with executable R code for chapter 3 of Natalia Levshina's (2015)
# How to Do Linguistics with R. Amsterdam/Philadelphia: John Benjamins.
####################################################################################
#
# The script is meant to be stepped through statement by statement: most
# expressions are evaluated only for their printed value or for the plot they
# draw. Each section reloads its packages and the `ldt` data set so that it can
# be run on its own.

### Section 3.1 ----
## Main text

# Install modeest only when it is missing, so re-running does not reinstall it.
if (!requireNamespace("modeest", quietly = TRUE)) {
  install.packages("modeest")
}
library(Rling)
library(modeest)
data(ldt)
head(ldt)

# Measures of central tendency for word length.
mean(ldt$Length)
sort(ldt$Length)
median(ldt$Length)
quantile(ldt$Length, 0.25)
quantile(ldt$Length, 0.5)
table(ldt$Length)
mlv(ldt$Length)
summary(ldt$Length)

# Measures of dispersion for word length.
max(ldt$Length)
min(ldt$Length)
range(ldt$Length)
var(ldt$Length)
sqrt(var(ldt$Length))
sd(ldt$Length)
IQR(ldt$Length)
mad(ldt$Length, constant = 1)

## Boxes with additional information

# Demonstration of attach()/detach(): while `ldt` is attached its columns can
# be referred to by bare name.
attach(ldt)
mean(Length)
detach(ldt)
# After detach() no object named `Length` is defined by this script, so this
# call fails with "object 'Length' not found". try() reports the error without
# aborting a non-interactive run of the whole file.
try(mean(Length))

### Section 3.2 ----
## Main text

if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(Rling)
library(ggplot2)
data(ldt)

# Visual and formal checks of the distribution of mean reaction times.
hist(
  ldt$Mean_RT,
  main = "Histogram of mean reaction times",
  xlab = "reaction times, ms"
)
plot(
  density(ldt$Mean_RT),
  main = "Density plot of mean reaction times",
  xlab = "reaction times, ms"
)
qqnorm(ldt$Mean_RT)
qqline(ldt$Mean_RT)
summary(ldt$Mean_RT)
shapiro.test(ldt$Mean_RT)

# Outlier detection.
boxplot(
  ldt$Mean_RT,
  main = "Mean reaction times",
  ylab = "reaction time in ms"
)
ldt[ldt$Mean_RT > 1200, ]
boxplot.stats(ldt$Mean_RT)$out
# normalize() is not a base R function; presumably it is provided by Rling --
# TODO confirm.
normalize(ldt$Mean_RT)
ldt[abs(normalize(ldt$Mean_RT)) >= 2.5, ]
ldt[abs(normalize(ldt$Mean_RT, method = "mad")) >= 2.5, ]
outliers <- which(abs(normalize(ldt$Mean_RT, method = "mad")) >= 2.5)
outliers

# Option 1: drop the outlying rows. A logical mask is used instead of
# `ldt[-outliers, ]` because negative indexing with an empty index vector
# would select zero rows rather than all of them.
ldt_remove <- ldt[!(seq_len(nrow(ldt)) %in% outliers), ]
dim(ldt_remove)

# Option 2: replace the outlying values with a fixed cap.
mean(ldt$Mean_RT) + 2 * sd(ldt$Mean_RT)
ldt_new <- ldt
# NOTE(review): 1114.666 looks like the rounded result of the expression
# above, and column 3 is presumably Mean_RT -- confirm against the data set.
ldt_new[outliers, 3] <- 1114.666
ldt_new[outliers, ]

## Boxes with additional information

# Number of histogram bins computed as 1 + 3.32 * log10(n), then converted to
# a bin width for geom_histogram().
nbins <- 1 + 3.32 * log10(length(ldt$Mean_RT))
nbins
binsize <- diff(range(ldt$Mean_RT)) / nbins
ggplot(ldt, aes(x = Mean_RT)) +
  geom_histogram(binwidth = binsize, fill = "white", colour = "black")
ggplot(ldt, aes(x = Mean_RT)) +
  geom_line(stat = "density")
ggplot(ldt, aes(sample = Mean_RT)) +
  stat_qq()
# Penalise scientific notation in printed numbers. This is a session-wide
# option and stays in effect for everything that follows.
options(scipen = 999)
ggplot(ldt, aes(x = 1, y = Mean_RT)) +
  geom_boxplot() +
  theme(axis.title.x = element_blank()) +
  scale_x_continuous(breaks = NULL)

### Section 3.3 ----
## Main text

# Install ggplot2 if you have not done so yet.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(Rling)
library(ggplot2)
data(ldt)
summary(ldt$Freq)
plot(
  sort(ldt$Freq, decreasing = TRUE),
  type = "b",
  main = "Zipf’s law",
  ylab = "Word frequency"
)

# Draw the following base-graphics plots three to a row; the previous
# graphical parameters are kept so they can be restored afterwards.
old_par <- par(mfrow = c(1, 3))
hist(
  ldt$Freq,
  main = "Histogram of word frequencies",
  xlab = "Word frequency in a corpus",
  ylab = "Relative frequency in the sample"
)
plot(
  density(ldt$Freq),
  main = "Density plot of word frequencies",
  xlab = "Word frequency in a corpus"
)
qqnorm(ldt$Freq)
qqline(ldt$Freq)

# log(0) is -Inf, which is why log1p() (i.e. log(1 + x)) is used below.
log(0)
hist(
  log1p(ldt$Freq),
  main = "Histogram of log-frequencies",
  xlab = "Log-transformed word frequency",
  ylab = "Relative frequency in the sample"
)
plot(
  density(log1p(ldt$Freq)),
  main = "Density plot of log-frequencies",
  xlab = "Log-transformed word frequency in a corpus"
)
qqnorm(log1p(ldt$Freq))
qqline(log1p(ldt$Freq))
# Restore the plotting layout that was active before this section.
par(old_par)
shapiro.test(log1p(ldt$Freq))

## Boxes with additional information

# Rank/frequency plot in ggplot2; seq_len() is used for the rank index so the
# sequence is well-defined even for a data frame with zero rows.
ggplot(
  ldt,
  aes(x = seq_len(nrow(ldt)), y = sort(Freq, decreasing = TRUE))
) +
  geom_line() +
  geom_point() +
  xlab("Index") +
  ylab("Frequency")