####################################################################################
#This is a file with executable R code for chapter 5 of Natalia Levshina's (2015) 
#How to Do Linguistics with R. Amsterdam/Philadelphia: John Benjamins. 
####################################################################################

###Section 5.2

##Main text

# Install the plotting packages only when they are not available yet, so that
# re-running the script does not re-download them (and still works offline
# once they are installed).
for (pkg in c("ggplot2", "gplots")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

# Attach the packages used in Section 5.2.
library(Rling)
library(ggplot2)
library(gplots)

# Load the two data sets used in this section.
data(pym_high)
data(pym_low)

# First rows and structure of the high-frequency data set.
head(pym_high)
str(pym_high)

# Descriptive statistics of the association scores in each group.
summary(pym_high$assoc)
summary(pym_low$assoc)

# Side-by-side box plots of the two groups.
boxplot(
  pym_high$assoc, pym_low$assoc,
  names = c("high", "low"),
  main = "Box plots of average numbers of associations",
  xlab = "Frequency group",
  ylab = "Average number of associations"
)

# Values that boxplot.stats() flags as outliers in the low-frequency group.
boxplot.stats(pym_low$assoc)$out

# Rows behind those outliers. The values are taken from boxplot.stats()
# instead of being hard-coded, so the lookup stays correct if the data change;
# %in% also never yields NA rows when assoc contains missing values.
pym_low[pym_low$assoc %in% boxplot.stats(pym_low$assoc)$out, ]

# One-tailed t-tests of the same directional hypothesis, stated from each
# side (high > low and low < high).
t.test(pym_high$assoc, pym_low$assoc, alternative = "greater")
t.test(pym_low$assoc, pym_high$assoc, alternative = "less")

# Critical values for a two-sided 95% interval: standard normal first, then
# the t distribution with (n - 1) degrees of freedom for the high group.
qnorm((1 - 0.95) / 2, lower.tail = FALSE)
qt((1 - 0.95) / 2, df = length(pym_high$assoc) - 1, lower.tail = FALSE)

# Wrapping an assignment in parentheses stores the value and prints it,
# exactly as typing the variable name on the following line would.

# High-frequency group: standard error and 95% interval bounds (mean -/+ 1.96 SE).
(se.high <- with(pym_high, sd(assoc) / sqrt(length(assoc))))
(ci.lower.high <- with(pym_high, mean(assoc)) - 1.96 * se.high)
(ci.upper.high <- with(pym_high, mean(assoc)) + 1.96 * se.high)

# Low-frequency group: the same three quantities.
(se.low <- with(pym_low, sd(assoc) / sqrt(length(assoc))))
(ci.lower.low <- with(pym_low, mean(assoc)) - 1.96 * se.low)
(ci.upper.low <- with(pym_low, mean(assoc)) + 1.96 * se.low)

# Collect group means and interval bounds in the order high, low.
(means <- c(with(pym_high, mean(assoc)), with(pym_low, mean(assoc))))
(ci.lower <- c(ci.lower.high, ci.lower.low))
(ci.upper <- c(ci.upper.high, ci.upper.low))

# Bar plot of the two means with their confidence intervals drawn as whiskers.
barplot2(
  means,
  plot.ci = TRUE,
  ci.l = ci.lower,
  ci.u = ci.upper,
  main = "Bar plot with 95% confidence intervals",
  xlab = "Frequency groups",
  ylab = "Average number of associations",
  names.arg = c("High", "Low")
)


##Boxes with additional information

# Stack both groups into one long-format data frame with a grouping column.
# freq is built explicitly as a factor: since R 4.0.0 data.frame() no longer
# converts strings to factors, so levels(pym_assoc$freq) below would otherwise
# return NULL. Group sizes are taken from the data instead of being hard-coded.
pym_assoc <- data.frame(
  assoc = c(pym_high$assoc, pym_low$assoc),
  freq = factor(rep(
    c("high", "low"),
    times = c(length(pym_high$assoc), length(pym_low$assoc))
  ))
)

head(pym_assoc)

# Box plots of the two groups with ggplot2.
ggplot(pym_assoc, aes(x = freq, y = assoc)) +
  geom_boxplot() +
  xlab("Frequency group") +
  ylab("Average number of associations")

# Sorting a data frame: ascending, descending, and by two keys.
pym_low[order(pym_low$imag), ]

pym_low[order(-pym_low$imag), ]

pym_low[order(pym_low$syl, pym_low$let), ]

head(pym_assoc)

tail(pym_assoc)

# Formula interface of the t-test; the direction of "greater" follows the
# order of the factor levels printed by levels() below.
t.test(pym_assoc$assoc ~ pym_assoc$freq, alternative = "greater")

levels(pym_assoc$freq)

# Summary table (group mean and standard error) used for the ggplot2 bar plot.
assoc.df <- data.frame(
  group = c("High", "Low"),
  mean = means,
  se = c(se.high, se.low)
)

assoc.df

# Bar plot of the means with error bars at mean -/+ 1.96 standard errors.
ggplot(assoc.df, aes(x = group, y = mean)) +
  geom_bar(stat = "identity", fill = "lightblue", colour = "black") +
  xlab("Frequency group") +
  ylab("Average number of associations") +
  geom_errorbar(
    aes(ymin = mean - 1.96 * se, ymax = mean + 1.96 * se),
    width = 0.2
  )


###Section 5.3

##Main text

# Install ggplot2 only if it is not available yet, so that re-running the
# script does not re-download the package.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}

# Attach the packages used in Section 5.3.
library(Rling)
library(ggplot2)

# Load both data sets in a single call.
data(pym_high, pym_low)

# Descriptive statistics of the concreteness scores per group.
summary(pym_high$conc)
summary(pym_low$conc)

# Visual normality checks for the high-frequency group: Q-Q plot with its
# reference line, followed by a kernel density plot.
qqnorm(pym_high$conc, main = "Q–Q plot of concreteness scores")
qqline(pym_high$conc)

plot(
  density(pym_high$conc),
  main = "Density plot of concreteness scores",
  xlab = "Concreteness"
)

# Jittered strip chart of both groups, with rugs on the bottom (high) and
# top (low) axes.
stripchart(
  list(pym_high$conc, pym_low$conc),
  main = "Distribution of concreteness scores",
  group.names = c("high", "low"),
  method = "jitter",
  xlim = c(1, 7)
)
rug(pym_high$conc, side = 1)
rug(pym_low$conc, side = 3)

# Column 4 for rows with a score above 6: once kept as a data frame
# (drop = FALSE) and once with the default dropping of dimensions.
pym_high[with(pym_high, conc > 6), 4, drop = FALSE]
pym_high[with(pym_high, conc > 6), 4]

# The same column for rows whose score lies strictly between 2 and 4,
# and strictly between 4 and 6.
pym_high[with(pym_high, conc > 2 & conc < 4), 4, drop = FALSE]
pym_high[with(pym_high, conc > 4 & conc < 6), 4, drop = FALSE]

# Shapiro-Wilk normality tests for each group.
shapiro.test(pym_high$conc)
shapiro.test(pym_low$conc)

# Two-sample Wilcoxon test without continuity correction, with a confidence
# interval requested.
wilcox.test(pym_high$conc, pym_low$conc, correct = FALSE, conf.int = TRUE)


##Boxes with additional information

# Stack both groups into one long-format data frame. Group sizes are taken
# from the data instead of being hard-coded.
pym_conc <- data.frame(
  conc = c(pym_high$conc, pym_low$conc),
  freq = rep(
    c("high", "low"),
    times = c(length(pym_high$conc), length(pym_low$conc))
  )
)

# Jittered strip plot with ggplot2. The 1-7 score range is set through
# coord_flip(ylim = ...): labs() only sets labels, so passing ylim to it
# never changed the axis limits. Setting the limits on the coordinate system
# zooms without discarding jittered points.
ggplot(pym_conc, aes(x = freq, y = conc)) +
  geom_point(position = position_jitter(width = 0.05), shape = 0) +
  coord_flip(ylim = c(1, 7)) +
  labs(x = "Frequency group", y = "Average concreteness score")


###Section 5.4


##Main text

library(Rling)

data(pym_high)

# Simulate random per-word differences. One value is drawn per observation in
# pym_high$assoc (instead of a hard-coded 50) so the addition below can never
# silently recycle. The draws are random: call set.seed() first if you need
# reproducible numbers.
# NOTE: the variable name `diff` is kept for compatibility with any code that
# follows; it does not prevent calling the base function diff().
diff <- rnorm(length(pym_high$assoc), mean = -1.35, sd = 1.27)

# Simulated second set of scores for the same words.
nn <- pym_high$assoc + diff

head(nn)

nn <- round(nn, 2)

# Normality check of the simulated differences.
shapiro.test(diff)

# Paired one-tailed t-test: original scores against the simulated ones.
t.test(pym_high$assoc, nn, alternative = "greater", paired = TRUE)