####################################################################################
#This is a file with executable R code for chapter 15 of Natalia Levshina's (2015) 
#How to Do Linguistics with R. Amsterdam/Philadelphia: John Benjamins. 
####################################################################################


###Section 15.2


##Main text


install.packages(c("cluster", "pvclust", "vcd"))

library(Rling); library(cluster); library(pvclust); library(vcd)

data(caus)

str(caus)

levels(caus$Cx)

be_made_toV <- caus[caus$Cx == "be_made_toV", -1]

cause_toV <- caus[caus$Cx == "cause_toV", -1]

get_toV <- caus[caus$Cx == "get_toV", -1]

get_Ved <- caus[caus$Cx == "get_Ved", -1]

get_Ving <- caus[caus$Cx == "get_Ving", -1]

have_V <- caus[caus$Cx == "have_V", -1]

have_Ved <- caus[caus$Cx == "have_Ved", -1]

have_Ving <- caus[caus$Cx == "have_Ving", -1]

make_V <- caus[caus$Cx == "make_V", -1]

prop.table(table(make_V$CrSem))

prop.table(table(make_V$CeSem))

make_V.bp <- bp(make_V)

make_V.bp

be_made_toV.bp <- bp(be_made_toV)

cause_toV.bp <- bp(cause_toV)

get_toV.bp <- bp(get_toV)

get_Ved.bp <- bp(get_Ved)

get_Ving.bp <- bp(get_Ving)

have_V.bp <- bp(have_V)

have_Ved.bp <- bp(have_Ved)

have_Ving.bp <- bp(have_Ving)

caus.bp <- rbind(be_made_toV.bp, cause_toV.bp, get_toV.bp, get_Ved.bp, get_Ving.bp, have_V.bp, have_Ved.bp, have_Ving.bp, make_V.bp)

dim(caus.bp)

rownames(caus.bp) <- levels(caus$Cx)

caus.dist <- dist(caus.bp, method = "canberra")

round(caus.dist, 2)

max(caus.dist)

min(caus.dist)

caus.hc <- hclust(caus.dist, method = "ward.D2")

plot(caus.hc, hang = -1)

test.clust <- cutree(caus.hc, k = 2)

test.clust

summary(silhouette(test.clust, caus.dist))$avg.width

asw <- sapply(2:8, function(x) summary(silhouette(cutree(caus.hc, k = x), caus.dist))$avg.width)

asw

rect.hclust(caus.hc, k = 4)

c1 <- caus.bp[c(4,7),]

c2 <- caus.bp[-c(4,7),]

c1.bp <- colMeans(c1)

c2.bp <- colMeans(c2)

diff <- c1.bp - c2.bp

sort(diff, decreasing = TRUE)

plot(sort(diff)*1.2, 1:length(diff), type = "n", xlab = "cluster 2 <–-> cluster 1", yaxt = "n", ylab = "")

text(sort(diff), 1:length(diff), names(sort(diff)))

cluster <- as.character(caus$Cx)

cluster[cluster == "get_Ved"|cluster == "have_Ved"] = "1"

cluster[cluster != "1"] = "2"

cluster <- as.factor(cluster)

assocstats(table(cluster, caus$CrSem))

caus.pvc <- pvclust(t(caus.bp), method.hclust = "ward.D2", method.dist = "canberra")

plot(caus.pvc, hang = -1)

pvrect(caus.pvc, alpha = 0.95)

test.clust <- kmeans(caus.bp, 4, nstart = 25)

test.clust$cluster

test.clust$tot.withinss

wcss <- sapply(1:8, function(x) kmeans(caus.bp, x, nstart = 25)$tot.withinss)

wcss

plot(1:8, wcss, type = "b", main = "Scree plot of WCSS for n clusters", xlab = "n of clusters", ylab = "WCSS")

caus.pam <- pam(caus.dist, k = 4)

caus.pam$clustering

caus.pam$medoids

caus.pam$silinfo$avg.width

asw <- sapply(2:8, function(x) pam(caus.dist, k = x)$silinfo$avg.width)

asw

##Boxes with additional information

caus.split <- split(caus, caus$Cx)

str(caus.split)

length(caus.split)

caus.split <- lapply(caus.split, function(x) x = x[, -1])

caus.split.bp <- lapply(caus.split, bp)

caus.split.bp[1]

caus.bp1 <- do.call(rbind, caus.split.bp)

identical(caus.bp, caus.bp1)

adist("water", "wine")