####################################################################################
#This is a file with executable R code for chapter 3 of Natalia Levshina's (2015) 
#How to Do Linguistics with R. Amsterdam/Philadelphia: John Benjamins. 
####################################################################################

###Section 3.1

##Main text


# Install modeest only when it is not available yet, so that re-running the
# script does not download and reinstall the package every time.
if (!requireNamespace("modeest", quietly = TRUE)) {
  install.packages("modeest")
}

library(Rling)
library(modeest)

# Load the ldt data set (presumably shipped with Rling) and show its first rows.
data(ldt)

head(ldt)

# --- Measures of central tendency for word length ---

mean(ldt$Length)

# Sorted values, printed so the middle of the distribution can be inspected.
sort(ldt$Length)

median(ldt$Length)

# First quartile, then the 0.5 quantile (which equals the median).
quantile(ldt$Length, 0.25)

quantile(ldt$Length, 0.5)

# Frequency of every distinct length value.
table(ldt$Length)

# Mode estimate; mlv() presumably comes from modeest.
mlv(ldt$Length)

# Minimum, quartiles, median, mean and maximum in a single call.
summary(ldt$Length)

# --- Measures of dispersion for word length ---

max(ldt$Length)

min(ldt$Length)

range(ldt$Length)

var(ldt$Length)

# The square root of the variance is the standard deviation, so the next two
# calls print the same number.
sqrt(var(ldt$Length))

sd(ldt$Length)

IQR(ldt$Length)

# constant = 1 gives the raw median absolute deviation, without the default
# scaling factor of 1.4826 that mad() applies.
mad(ldt$Length, constant = 1)


##Boxes with additional information


# Demonstration of attach()/detach(): while ldt is attached, its columns can
# be referred to by bare name.
attach(ldt)

mean(Length)

detach(ldt)

# After detach() the bare column name is no longer on the search path, so this
# call is expected to fail with "object 'Length' not found" (unless a global
# object called Length happens to exist). It is wrapped in try() so that the
# error message is still displayed, but running the whole file with source()
# or Rscript does not stop here.
try(mean(Length))


###Section 3.2

##Main text


# Install ggplot2 only when it is not available yet, so that re-running the
# script does not download and reinstall the package every time.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}

library(Rling)
library(ggplot2)

data(ldt)

# --- Visual and formal checks of normality for mean reaction times ---

hist(ldt$Mean_RT, main = "Histogram of mean reaction times",
     xlab = "reaction times, ms")

plot(density(ldt$Mean_RT), main = "Density plot of mean reaction times",
     xlab = "reaction times, ms")

# Q-Q plot against the normal distribution, with a reference line.
qqnorm(ldt$Mean_RT)

qqline(ldt$Mean_RT)

summary(ldt$Mean_RT)

shapiro.test(ldt$Mean_RT)

# --- Outlier detection ---

boxplot(ldt$Mean_RT, main = "Mean reaction times",
        ylab = "reaction time in ms")

# Rows with a mean reaction time above 1200 ms.
ldt[ldt$Mean_RT > 1200, ]

# Values that boxplot() draws as individual points beyond the whiskers.
boxplot.stats(ldt$Mean_RT)$out

# normalize() is presumably provided by Rling; the scores are compared with a
# cut-off of 2.5 below, first with the default method and then with
# method = "mad".
normalize(ldt$Mean_RT)

ldt[abs(normalize(ldt$Mean_RT)) >= 2.5, ]

ldt[abs(normalize(ldt$Mean_RT, method = "mad")) >= 2.5, ]

# Row indices of the observations flagged with method = "mad".
outliers <- which(abs(normalize(ldt$Mean_RT, method = "mad")) >= 2.5)

outliers

# Option 1: drop the flagged rows. The rows to keep are selected explicitly
# instead of using ldt[-outliers, ], because a negative index built from an
# empty vector selects no rows at all and would silently empty the data frame.
ldt_remove <- ldt[setdiff(seq_len(nrow(ldt)), outliers), ]

dim(ldt_remove)

# Option 2: replace the flagged values with the mean plus two standard
# deviations. The cap is computed once and reused rather than typed in as a
# rounded literal, so it stays in sync with the data.
rt_cap <- mean(ldt$Mean_RT) + 2 * sd(ldt$Mean_RT)

rt_cap

ldt_new <- ldt

# The column is addressed by name rather than by position.
# NOTE(review): assumes Mean_RT is the third column of ldt, which is the
# column the positional index 3 referred to -- confirm.
ldt_new[outliers, "Mean_RT"] <- rt_cap

ldt_new[outliers, ]


##Boxes with additional information


# Number of histogram bins: 1 + 3.32 * log10(sample size).
nbins <- 1 + 3.32 * log10(nrow(ldt))

nbins

# Width of one bin: the full spread of the reaction times divided by the
# number of bins.
binsize <- (max(ldt$Mean_RT) - min(ldt$Mean_RT)) / nbins

# Histogram with the bin width computed above.
ggplot(ldt, aes(x = Mean_RT)) +
  geom_histogram(
    binwidth = binsize,
    fill = "white",
    colour = "black"
  )

# Density curve of the reaction times.
ggplot(ldt, aes(x = Mean_RT)) +
  geom_line(stat = "density")

# Q-Q plot of the reaction times.
ggplot(ldt, aes(sample = Mean_RT)) +
  stat_qq()

# A large scipen value makes R prefer fixed notation over scientific notation
# when printing numbers.
options(scipen = 999)

# Single box plot: a constant x puts all values in one box, and the x-axis
# title and tick marks are removed because that axis carries no information.
ggplot(ldt, aes(x = 1, y = Mean_RT)) +
  geom_boxplot() +
  theme(axis.title.x = element_blank()) +
  scale_x_continuous(breaks = NULL)


###Section 3.3


##Main text


# Install ggplot2 only if you have not done so yet.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}

library(Rling)
library(ggplot2)

data(ldt)

summary(ldt$Freq)

# Word frequencies in decreasing order, drawn as points joined by lines.
plot(sort(ldt$Freq, decreasing = TRUE), type = "b", main = "Zipf’s law",
     ylab = "Word frequency")

# Put three plots side by side. par() returns the previous settings, which are
# kept so that the layout can be restored at the end of this section instead
# of leaking into every later base-graphics plot.
old_par <- par(mfrow = c(1, 3))

# --- Raw frequencies: histogram, density plot and Q-Q plot ---

hist(ldt$Freq, main = "Histogram of word frequencies",
     xlab = "Word frequency in a corpus",
     ylab = "Relative frequency in the sample")

plot(density(ldt$Freq), main = "Density plot of word frequencies",
     xlab = "Word frequency in a corpus")

qqnorm(ldt$Freq)

qqline(ldt$Freq)

# log(0) is -Inf; log1p(x) computes log(1 + x), which is finite at x = 0.
# Presumably this is why log1p() is used for the transformation below.
log(0)

# --- Log-transformed frequencies: the same three plots ---

hist(log1p(ldt$Freq), main = "Histogram of log-frequencies",
     xlab = "Log-transformed word frequency",
     ylab = "Relative frequency in the sample")

plot(density(log1p(ldt$Freq)), main = "Density plot of log-frequencies",
     xlab = "Log-transformed word frequency in a corpus")

qqnorm(log1p(ldt$Freq))

qqline(log1p(ldt$Freq))

shapiro.test(log1p(ldt$Freq))

# Restore the plot layout that was active before this section.
par(old_par)


## Boxes with additional information

# ggplot2 version of the sorted-frequency plot: rank on the x-axis, frequency
# in decreasing order on the y-axis. seq_len() is used instead of 1:nrow(ldt)
# so the rank vector always has exactly nrow(ldt) elements (1:0 would yield
# c(1, 0) for an empty data frame).
ggplot(ldt, aes(x = seq_len(nrow(ldt)), y = sort(Freq, decreasing = TRUE))) +
  geom_line() +
  geom_point() +
  xlab("Index") +
  ylab("Frequency")