This script is part of the Online Appendix to my PhD thesis.
Please cite as: Le Foll, Elen. 2022. Textbook English: A Corpus-Based Analysis of the Language of EFL textbooks used in Secondary Schools in France, Germany and Spain. PhD thesis. Osnabrück University.
For more information, see: https://elenlefoll.github.io/TextbookEnglish/
Please note that the plot dimensions in this notebook have been optimised for the print version of the thesis.
Built with R 4.0.3
knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
library(FactoMineR)
library(here)
library(ggplot2)
library(ggrepel)
library(gridExtra)
library(lattice)
library(lsr)
library(RColorBrewer)
library(tidyr)
library(vcd)
annotation_file <- read.csv(here("BNCspoken_prog_conc_anno_collanalysis.csv"), sep = "\t", header = TRUE, na.strings="", stringsAsFactors = TRUE)
glimpse(annotation_file)
## Rows: 4,641
## Columns: 18
## $ SE_Filename <fct> filename#0, filename#0, filename#0, filename#2, filenam…
## $ BNC_Filename <fct> SAAB, SAAB, SAAB, SAA3, SAA3, SAA3, SAA3, SABT, SAB7, S…
## $ Rand. <fct> done, done, done, done, done, done, done, done, done, d…
## $ Concordance1 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Concor.dance2 <fct> be inviting, were having, 'm not going, 's not going, i…
## $ Concordance3 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Lemma <fct> invite, have, NA, NA, NA, NA, play, come, say, NA, NA, …
## $ X. <fct> NA, NA, NA, NA, NA, NA, c, NA, NA, NA, NA, NA, NA, c, N…
## $ NOT <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Tense <fct> modal, past, GOING TO, GOING TO, GOING TO, GOING TO, pr…
## $ Voice <fct> A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A…
## $ X..1 <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Time.reference <fct> future, future, present, present, present, present, pre…
## $ Continuous <fct> no, yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, y…
## $ Repeated <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no,…
## $ Extra.function <fct> NA, other interesting function, NA, NA, NA, NA, NA, NA,…
## $ Comment <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Annotator <fct> E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E…
# Select only annotated concordance lines
annotated <- annotation_file[1:4641,]
# Extract only progressives
BNCprog <- annotated[annotated$Tense != "NOT progressive" & annotated$Tense != "GOING TO" & annotated$Tense != "catenative", ]
BNCprog$Tense <- droplevels(BNCprog)$Tense # Drop unused levels
# Number of progressives extracted
nrow(BNCprog)
## [1] 3444
# Ratio of genuine progressives in progressive query
nrow(BNCprog)/4641
## [1] 0.7420814
# File to be used for Covarying Collexeme Analysis #
prog_lemmas <- read.table(here("prog_lemmas.csv"), sep = "\t", stringsAsFactors = TRUE)
say <- prog_lemmas[prog_lemmas$Lemma=="say",]
summary(say$Corpus)
## Spoken BNC 2014 sample Textbook Conversation
## 153 29
#source(here("coll.analysis_mpfr.r"))
#coll.analysis()
## Preparing list of long lemmas ##
SpokenProg_lemmas <- read.table(file = here("prog_lemmas.csv"), sep = "\t", stringsAsFactors = TRUE)
FictionProg_lemmas <- read.table(file = here("FictionProg_lemmas.csv"), sep = "\t", stringsAsFactors = TRUE)
prog_lemmas <- rbind(SpokenProg_lemmas, FictionProg_lemmas)
#write.table(prog_lemmas, file = here("CovCA.csv"), sep = "\t")
prog_lemmas <- read.table(file = here("CovCA.csv"), sep = "\t", stringsAsFactors = TRUE)
## Coding for semantic domains ##
prog_semantic_domains <- as.data.frame(sort(unique(prog_lemmas$Lemma)))
names(prog_semantic_domains)[1] <- "lemma"
head(prog_semantic_domains)
## lemma
## 1 abandon
## 2 accept
## 3 accord
## 4 ache
## 5 act
## 6 act out
semantic_Biber <- read.csv(file = here("semantic_domains_Biber2006_p246.csv"), sep = "\t", stringsAsFactors = TRUE) # This is the table I made using the verbs listed in Biber (2006: 246-247)
semantic_Edwards <- read.csv(here("semantic_domains_Ewards.csv"), sep = "\t", stringsAsFactors = TRUE)
names(semantic_Biber)[1] <- "lemma"
names(semantic_Biber)[2] <- "semantic_domain_Biber"
head(semantic_Biber)
## lemma semantic_domain_Biber
## 1 accompany activity
## 2 accuse communication
## 3 acknowledge communication
## 4 acquire activity
## 5 add activity
## 6 address communication
names(semantic_Edwards)[1] <- "lemma"
names(semantic_Edwards)[2] <- "semantic_domain_Edwards"
head(semantic_Edwards)
## lemma semantic_domain_Edwards Edwards
## 1 accept activity Edwards
## 2 accompany activity Edwards
## 3 acquire activity Edwards
## 4 add activity Edwards
## 5 advance activity Edwards
## 6 apply activity Edwards
semantic_domains <- merge(semantic_Biber, semantic_Edwards, by = "lemma", all = TRUE)
head(semantic_domains)
## lemma semantic_domain_Biber semantic_domain_Edwards Edwards
## 1 accompany activity activity Edwards
## 2 accuse communication communication Edwards
## 3 acknowledge communication communication Edwards
## 4 acquire activity activity Edwards
## 5 add activity activity Edwards
## 6 address communication mental Edwards
semantic_domains$agreement <- TRUE
semantic_domains$agreement <- ifelse(
  semantic_domains$semantic_domain_Biber == semantic_domains$semantic_domain_Edwards,
  TRUE, FALSE)
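# Optional quick check: how often do the two semantic domain classifications agree?
table(semantic_domains$agreement, useNA = "ifany")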
#write.table(semantic_domains, file = here("semantic_domains1.csv"), sep = "\t")
# This table has been manually edited so that all disagreements between the two lists have been removed
semantic_domains <- read.csv(here("4_SemanticDomains_VerbLemmas1.csv"), sep = "\t", header = T, stringsAsFactors = TRUE)
names(semantic_domains)[1] <- "Lemma"
names(semantic_domains)[2] <- "Semantic domain"
prog_semantics <- merge(prog_lemmas, semantic_domains, by = "Lemma")
head(prog_semantics, 30); tail(prog_semantics, 30)
## Lemma Corpus Semantic domain
## 1 abandon Youth Fiction Sampled activity
## 2 accept Spoken BNC 2014 sample mental
## 3 accept Spoken BNC 2014 sample mental
## 4 accord Youth Fiction Sampled mental
## 5 accord Youth Fiction Sampled mental
## 6 ache Youth Fiction Sampled activity
## 7 ache Spoken BNC 2014 sample activity
## 8 act Textbook Fiction activity
## 9 act Textbook Conversation activity
## 10 act Youth Fiction Sampled activity
## 11 act Textbook Conversation activity
## 12 act Youth Fiction Sampled activity
## 13 act Textbook Fiction activity
## 14 act out Youth Fiction Sampled activity
## 15 action Spoken BNC 2014 sample occurrence
## 16 add Youth Fiction Sampled activity
## 17 add Spoken BNC 2014 sample activity
## 18 add Textbook Conversation activity
## 19 add Spoken BNC 2014 sample activity
## 20 address Textbook Fiction unclear_other
## 21 address Youth Fiction Sampled unclear_other
## 22 advance Youth Fiction Sampled activity
## 23 advance Youth Fiction Sampled activity
## 24 advise Textbook Conversation communication
## 25 agree Spoken BNC 2014 sample mental
## 26 aim Spoken BNC 2014 sample mental
## 27 aim Spoken BNC 2014 sample mental
## 28 aim for Youth Fiction Sampled mental
## 29 allow Spoken BNC 2014 sample facilitation_causation
## 30 amuse Youth Fiction Sampled activity
## Lemma Corpus Semantic domain
## 7809 write Textbook Fiction communication
## 7810 write Spoken BNC 2014 sample communication
## 7811 write Textbook Conversation communication
## 7812 write Textbook Conversation communication
## 7813 write Textbook Conversation communication
## 7814 write Textbook Fiction communication
## 7815 write Textbook Fiction communication
## 7816 write Textbook Conversation communication
## 7817 write Spoken BNC 2014 sample communication
## 7818 write Youth Fiction Sampled communication
## 7819 write Textbook Conversation communication
## 7820 write Textbook Conversation communication
## 7821 write Textbook Fiction communication
## 7822 write Textbook Fiction communication
## 7823 write Textbook Conversation communication
## 7824 write Textbook Conversation communication
## 7825 write Textbook Conversation communication
## 7826 write Textbook Conversation communication
## 7827 write Textbook Conversation communication
## 7828 write Textbook Fiction communication
## 7829 write Textbook Fiction communication
## 7830 write Textbook Conversation communication
## 7831 write Textbook Conversation communication
## 7832 write Youth Fiction Sampled communication
## 7833 write Spoken BNC 2014 sample communication
## 7834 write Textbook Conversation communication
## 7835 write down Spoken BNC 2014 sample communication
## 7836 write out Youth Fiction Sampled communication
## 7837 yawn Spoken BNC 2014 sample activity
## 7838 yell Youth Fiction Sampled communication
#write.table(prog_semantics, file = here("4_prog_semantics_full.csv"))
prog_semantics <- read.table(file = here("4_prog_semantics_full.csv"), stringsAsFactors = TRUE)
## Remove unclear cases, which include several light verbs (TAKE, HAVE) and other highly polysemous verbs - WARNING: This means I no longer have the same number of progressives from each corpus! ##
prog_semantics <- prog_semantics[prog_semantics$Semantic.domain != "unclear_other", ]
prog_semantics$Semantic.domain <- droplevels(prog_semantics)$Semantic.domain # Drop unused level
summary(prog_semantics$Semantic.domain)
## activity aspectual communication
## 3146 105 986
## existence_relationship facilitation_causation mental
## 237 50 783
## occurrence
## 439
## Plotting the semantic categories ##
semantics_corpus <- as.data.frame(prop.table(table(prog_semantics[,2:3]), 1)*100); semantics_corpus # As percentages for each corpus under study
## Corpus Semantic.domain Freq
## 1 Spoken BNC 2014 sample activity 51.8115942
## 2 Textbook Conversation activity 58.2800697
## 3 Textbook Fiction activity 57.5250836
## 4 Youth Fiction Sampled activity 50.8951407
## 5 Spoken BNC 2014 sample aspectual 1.4492754
## 6 Textbook Conversation aspectual 1.4526438
## 7 Textbook Fiction aspectual 2.0903010
## 8 Youth Fiction Sampled aspectual 2.6427962
## 9 Spoken BNC 2014 sample communication 22.4033816
## 10 Textbook Conversation communication 15.8628704
## 11 Textbook Fiction communication 14.6321070
## 12 Youth Fiction Sampled communication 14.2369991
## 13 Spoken BNC 2014 sample existence_relationship 2.7777778
## 14 Textbook Conversation existence_relationship 4.2998257
## 15 Textbook Fiction existence_relationship 5.4347826
## 16 Youth Fiction Sampled existence_relationship 4.4330776
## 17 Spoken BNC 2014 sample facilitation_causation 0.7850242
## 18 Textbook Conversation facilitation_causation 1.1040093
## 19 Textbook Fiction facilitation_causation 0.5852843
## 20 Youth Fiction Sampled facilitation_causation 0.9377664
## 21 Spoken BNC 2014 sample mental 14.6739130
## 22 Textbook Conversation mental 12.3184195
## 23 Textbook Fiction mental 12.3745819
## 24 Youth Fiction Sampled mental 15.3452685
## 25 Spoken BNC 2014 sample occurrence 6.0990338
## 26 Textbook Conversation occurrence 6.6821615
## 27 Textbook Fiction occurrence 7.3578595
## 28 Youth Fiction Sampled occurrence 11.5089514
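# Sanity check: prop.table(..., 1) normalises within each corpus, so the seven semantic domain percentages sum to 100 for every corpus:
rowSums(prop.table(table(prog_semantics[,2:3]), 1)*100)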
colors <- RColorBrewer::brewer.pal(9, "OrRd")
levels(semantics_corpus$Corpus)[1] <- "Spoken BNC2014"
levels(semantics_corpus$Corpus)[4] <- "Youth Fiction"
#tiff(here("semantic-domains.tiff"), height = 15, width = 23, units="cm", compression = "lzw", res = 300)
ggplot(semantics_corpus, aes(x = Semantic.domain, y = Freq, fill = Corpus)) + geom_bar(stat = 'identity', position = 'dodge') + scale_fill_manual(values = c("#BD241E", "#EA7E1E", "#A18A33", "#267226")) + ylab("% of progressives in each corpus") + xlab("Semantic domains") + labs(fill = "Corpora") + scale_x_discrete(labels= c("Activity", "Aspectual", "Communication", "Existence", "Causation", "Mental/state", "Occurrence")) +
theme(legend.position = c(0.8, 0.8), legend.background = element_rect(fill="gray90", size=.5, linetype="solid"),
text = element_text(size = 16),
panel.background = element_rect(fill = 'white', colour = 'darkgrey'), panel.grid.major = element_line(color = 'grey'), panel.grid.minor = element_line(color = 'white'))
dev.off()
## null device
## 1
## Preparing the tables for coll.analyses ##
Spoken_prog_semantics <- prog_semantics[prog_semantics$Corpus %in% c("Spoken BNC 2014", "Textbook Conversation"),c(3:2)]
summary(Spoken_prog_semantics)
## Semantic.domain Corpus
## activity :1003 Spoken BNC 2014 sample: 0
## aspectual : 25 Textbook Conversation :1721
## communication : 273 Textbook Fiction : 0
## existence_relationship: 74 Youth Fiction Sampled : 0
## facilitation_causation: 19
## mental : 212
## occurrence : 115
#write.table(Spoken_prog_semantics, file = here("Spoken_prog_semantics.csv"), sep = "\t", row.names = F) # File for CoVCA
Fiction_prog_semantics <- prog_semantics[prog_semantics$Corpus %in% c("Youth Fiction Sampled", "Textbook Fiction"),c(2:3)]
summary(Fiction_prog_semantics)
## Corpus Semantic.domain
## Spoken BNC 2014 sample: 0 activity :1285
## Textbook Conversation : 0 aspectual : 56
## Textbook Fiction :1196 communication : 342
## Youth Fiction Sampled :1173 existence_relationship: 117
## facilitation_causation: 18
## mental : 328
## occurrence : 223
#write.table(Fiction_prog_semantics, file = here("Fiction_prog_semantics.csv"), sep = "\t", row.names = F) # File for CoVCA
# Coll.analysis #
#source(here("coll.analysis_mpfr.r"))
#coll.analysis()
## Communication verbs
tibble::as_tibble(BNCprog[BNCprog$Lemma=="say" & BNCprog$Time.reference=="past",c(2,4:6)])
## # A tibble: 124 × 4
## BNC_Filename Concordance1 Concor.dance2 Concordance3
## <fct> <lgl> <fct> <lgl>
## 1 SAR5 NA was saying NA
## 2 SAR5 NA were saying NA
## 3 SAUJ NA was saying NA
## 4 SAVW NA was saying NA
## 5 SBKN NA were saying NA
## 6 SBM6 NA was just saying NA
## 7 SBTC NA was saying NA
## 8 SBYQ NA was saying NA
## 9 SBYQ NA was saying um NA
## 10 SBZ7 NA was saying NA
## # … with 114 more rows
prog <- readRDS(file = here("prog.rds"))
say_tell <- prog[prog$Lemma=="say" | prog$Lemma=="tell",]
#tiff(here("tell_say_tense.tiff"), height = 14, width = 16, units="cm", compression = "lzw", res = 300)
par(cex = 1.7, cex.main = 1)
vcd::mosaic(Tense ~ Corpus, say_tell, rot_labels = c(0, 45, 0, 90), cex=2.5, zero_size = 0, labeling_args = list(rep = TRUE), main = "Tense distribution of SAY and TELL progressives") # Best so far
dev.off()
## null device
## 1
prog_semantics <- read.table(file = here("4_prog_semantics_full.csv"), stringsAsFactors = TRUE)
glimpse(prog_semantics)
## Rows: 7,838
## Columns: 3
## $ Lemma <fct> abandon, accept, accept, accord, accord, ache, ache, a…
## $ Corpus <fct> Youth Fiction Sampled, Spoken BNC 2014 sample, Spoken …
## $ Semantic.domain <fct> activity, mental, mental, mental, mental, activity, ac…
head(prog_semantics)
## Lemma Corpus Semantic.domain
## 1 abandon Youth Fiction Sampled activity
## 2 accept Spoken BNC 2014 sample mental
## 3 accept Spoken BNC 2014 sample mental
## 4 accord Youth Fiction Sampled mental
## 5 accord Youth Fiction Sampled mental
## 6 ache Youth Fiction Sampled activity
## Remove unclear cases, which include several light verbs (TAKE, HAVE) and other highly polysemous verbs - WARNING: This means I no longer have the same number of progressives from each corpus! ##
prog_semantics <- prog_semantics[prog_semantics$Semantic.domain != "unclear_other", ]
prog_semantics$Semantic.domain <- droplevels(prog_semantics)$Semantic.domain # Drop unused level
summary(prog_semantics$Semantic.domain)
## activity aspectual communication
## 3146 105 986
## existence_relationship facilitation_causation mental
## 237 50 783
## occurrence
## 439
## Remove lemmas ##
CA_data <- prog_semantics[,2:3]
levels(CA_data$Semantic.domain)
## [1] "activity" "aspectual" "communication"
## [4] "existence_relationship" "facilitation_causation" "mental"
## [7] "occurrence"
levels(CA_data$Semantic.domain) <- c("Activity", "Aspectual", "Communication", "Existence", "Causation", "Mental/state", "Occurrence")
CA_data <- table(CA_data)
# Following Desagulier's (2017) method
ca.object <- CA(CA_data)
#library("factoextra")
#fviz_ca_biplot(ca.object) # Do not see the immediate advantage of this additional package... The output appears to be exactly the same as above!
V <- sqrt(as.vector(chisq.test(CA_data)$statistic)/(sum(CA_data)*((min(ncol(CA_data), nrow(CA_data)))-1))); V # Cramer's V
## [1] 0.07989554
chisq <- chisq.test(CA_data)
inertia <- as.vector(chisq$statistic)/sum(chisq$observed); inertia
## [1] 0.01914989
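# Cross-check: for a two-way table, total inertia = V^2 * (min(nrow, ncol) - 1), so Cramer's V and the total inertia reported above are directly related:
V^2 * (min(dim(CA_data)) - 1) # equals the total inertia above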
## CA with supplementary variable ##
CA_data_sup <- t(CA_data)
x <- cbind(CA_data_sup, apply(CA_data_sup[,1:2], 1, sum))
colnames(x)[5] <- "Conversation"
x <- cbind(x, apply(CA_data_sup[,3:4], 1, sum))
colnames(x)[6] <- "Fiction"
x <- cbind(x, apply(CA_data_sup[,c(2:3)], 1, sum))
colnames(x)[7] <- "Textbook English"
x <- cbind(x, apply(CA_data_sup[,c(1,4)], 1, sum))
colnames(x)[8] <- "ENL Reference"
CA_data_sup = x ; CA_data_sup
## Spoken BNC 2014 sample Textbook Conversation Textbook Fiction
## Activity 858 1003 688
## Aspectual 24 25 25
## Communication 371 273 175
## Existence 46 74 65
## Causation 13 19 7
## Mental/state 243 212 148
## Occurrence 101 115 88
## Youth Fiction Sampled Conversation Fiction Textbook English
## Activity 597 1861 1285 1691
## Aspectual 31 49 56 50
## Communication 167 644 342 448
## Existence 52 120 117 139
## Causation 11 32 18 26
## Mental/state 180 455 328 360
## Occurrence 135 216 223 203
## ENL Reference
## Activity 1455
## Aspectual 55
## Communication 538
## Existence 98
## Causation 24
## Mental/state 423
## Occurrence 236
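## Sanity check: each supplementary column is simply the sum of its two source columns, e.g. "Conversation" = Spoken BNC 2014 sample + Textbook Conversation ##
all(CA_data_sup[, "Conversation"] == CA_data_sup[, 1] + CA_data_sup[, 2])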
ca.object.sup <- CA(CA_data_sup, col.sup = 5:8); ca.object.sup
## **Results of the Correspondence Analysis (CA)**
## The row variable has 7 categories; the column variable has 4 categories
## The chi square of independence between the two variables is equal to 110.0353 (p-value = 3.100161e-15 ).
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$col" "results for the columns"
## 3 "$col$coord" "coord. for the columns"
## 4 "$col$cos2" "cos2 for the columns"
## 5 "$col$contrib" "contributions of the columns"
## 6 "$row" "results for the rows"
## 7 "$row$coord" "coord. for the rows"
## 8 "$row$cos2" "cos2 for the rows"
## 9 "$row$contrib" "contributions of the rows"
## 10 "$col.sup$coord" "coord. for supplementary columns"
## 11 "$col.sup$cos2" "cos2 for supplementary columns"
## 12 "$call" "summary called parameters"
## 13 "$call$marge.col" "weights of the columns"
## 14 "$call$marge.row" "weights of the rows"
prog <- readRDS(file = here("prog_collanalysis.rds")) # Full, annotated prog concordance lines from Textbook Conversation + BNC Spoken sample
glimpse(prog)
## Rows: 5,867
## Columns: 22
## $ Corpus <fct> Spoken BNC 2014 sample, Spoken BNC 2014 sample, Spoken …
## $ SE_Filename <fct> filename#0, filename#0, filename#2, filename#3, filenam…
## $ BNC_Filename <fct> SAAB, SAAB, SAA3, SABT, SAB7, SAFZ, SAG4, SAG4, SAG4, S…
## $ Rand. <fct> done, done, done, done, done, done, done, done, done, d…
## $ Concordance1 <chr> "going to make a comment on the sleeping arrangements y…
## $ Concor.dance2 <chr> "be inviting", "were having", "'re playing", "are comin…
## $ Concordance3 <chr> "him to come and stay at their house stay at their hous…
## $ Lemma <fct> invite, have, play, come, say, say, earn, earn, laugh, …
## $ Contraction <fct> full, full, contract., full, full, contract., contract.…
## $ Negation <fct> positive, positive, positive, positive, positive, posit…
## $ Tense <fct> modal, past, present, present, present, present, presen…
## $ Voice <fct> A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A…
## $ Question <fct> statement, statement, statement, statement, statement, …
## $ Time.reference <chr> "future", "future", "present", "past/present", "present…
## $ Continuous <fct> no, yes, yes, yes, yes, yes, yes, yes, no, yes, yes, ye…
## $ Repeated <fct> no, no, no, no, no, no, yes, yes, no, no, no, no, no, y…
## $ Extra.function <chr> NA, "other interesting function", NA, NA, NA, "emphasis…
## $ Comment <chr> NA, NA, NA, NA, NA, NA, NA, NA, "like ", NA, NA, NA, NA…
## $ Annotator <chr> "E", "E", "E", "E", "E", "E", "E", "E", "E", "E", "E", …
## $ Register <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Level <fct> Spoken BNC 2014 sample, Spoken BNC 2014 sample, Spoken …
## $ Textbook <chr> "Spoken BNC 2014 sample", "Spoken BNC 2014 sample", "Sp…
summary(prog$Corpus)
## Spoken BNC 2014 sample Textbook Conversation
## 3444 2423
## Prepare prog verb counts ##
prog_lemmas <- prog[,c("Lemma", "Corpus")]
prog_lemmas$Lemma <- stringr::word(prog_lemmas$Lemma,1) # Add new variable for lemma without any particles. For now, I will be ignoring phrasal verbs and only looking at the verb lemmas (it's too hard to extract phrasal verbs in FVPs!)
head(prog_lemmas);tail(prog_lemmas) # Check
## Lemma Corpus
## 1 invite Spoken BNC 2014 sample
## 2 have Spoken BNC 2014 sample
## 3 play Spoken BNC 2014 sample
## 4 come Spoken BNC 2014 sample
## 5 say Spoken BNC 2014 sample
## 6 say Spoken BNC 2014 sample
## Lemma Corpus
## 5862 think Textbook Conversation
## 5863 pull Textbook Conversation
## 5864 lose Textbook Conversation
## 5865 talk Textbook Conversation
## 5866 wait Textbook Conversation
## 5867 look Textbook Conversation
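# For illustration, stringr::word(x, 1) keeps only the first whitespace-delimited token, which reduces phrasal verbs to their base verb lemma, e.g.:
stringr::word(c("write down", "look forward", "come across"), 1) # "write" "look" "come"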
levels(prog_lemmas$Corpus) <- c("SpokenBNC_Prog", "TxBSpoken_Prog") # Rename factor levels
prog_lemmas <- prog_lemmas[,c(2,1)]
names(prog_lemmas)[1] <- "corpus"
names(prog_lemmas)[2] <- "lemma"
head(prog_lemmas);tail(prog_lemmas)
## corpus lemma
## 1 SpokenBNC_Prog invite
## 2 SpokenBNC_Prog have
## 3 SpokenBNC_Prog play
## 4 SpokenBNC_Prog come
## 5 SpokenBNC_Prog say
## 6 SpokenBNC_Prog say
## corpus lemma
## 5862 TxBSpoken_Prog think
## 5863 TxBSpoken_Prog pull
## 5864 TxBSpoken_Prog lose
## 5865 TxBSpoken_Prog talk
## 5866 TxBSpoken_Prog wait
## 5867 TxBSpoken_Prog look
summary(factor(prog_lemmas$lemma))
## do go say talk look try
## 539 530 243 222 218 182
## get think come have work wear
## 170 167 164 153 106 89
## make play take be wait tell
## 87 86 78 72 72 61
## sit watch feel live happen use
## 58 57 51 51 49 49
## run walk listen stay eat start
## 44 44 40 39 38 37
## read pay put move buy ask
## 36 35 35 32 31 29
## plan stand leave enjoy joke write
## 29 29 28 26 26 26
## give drink study learn drive hold
## 24 23 23 22 21 21
## hope kid lie rain call fly
## 21 21 21 20 19 19
## see help speak travel meet record
## 19 17 17 17 16 16
## sleep cook teach die pick wonder
## 16 15 15 14 14 14
## change fall follow keep laugh show
## 12 12 12 12 12 12
## smile push sell build chat cry
## 12 11 11 10 10 10
## hang miss sing struggle swim visit
## 10 10 10 10 10 10
## become grow lose pull send bring
## 9 9 9 9 9 8
## dance earn fight hear let prepare
## 8 8 8 8 8 8
## spend turn concentrate (Other)
## 8 8 7 944
## Creating lemma list used to filter the FVPs for the comparative DCAs ##
lemmas_with_phrasal_verbs <- sort(unique(prog$Lemma)); lemmas_with_phrasal_verbs # For now, I will be ignoring phrasal verbs and only looking at the verb lemmas
## [1] accept ache act action add
## [6] advise agree aim allow announce
## [11] annoy answer apologise apply appoint
## [16] approach argue arrange arrive ask
## [21] asphalt assume attack attract babysit
## [26] bake balloon bang bar bark
## [31] base bask in battle be beat
## [36] become beef up begin behave believe
## [41] bet blame blink block bloom
## [46] blow board boil bomb book
## [51] bore borrow bother bottle bounce up
## [56] bow brag break break down break up
## [61] breathe bring browse build burgeon
## [66] burn burst buy call call for
## [71] camp carry cat-sit catch cause
## [76] celebrate change charge chase chat
## [81] cheat check cheer chew chip in
## [86] choke choose chop clamber clap
## [91] clarify clean clear clear out clear up
## [96] climb close collaborate collect come
## [101] come across come back come down come in come off
## [106] come on come out come over come up comment
## [111] commit communicate compete complain compose
## [116] concentrate confuse consider consume contact
## [121] contemplate continue contradict control cook
## [126] cope copy cost cosy up cough
## [131] count crack up crash create cringe
## [136] criticise cross crowdfund crumble cry
## [141] cut cut down cycle dance date
## [146] deal decide decline decorate delete
## [151] deliver demand deny describe desert
## [156] design develop die die out dig
## [161] dig in digress disappear discover discuss
## [166] disobey dissect disturb diversify do
## [171] doodle download drag along draw dread
## [176] dream dress dress up drink drip
## [181] drive drive along drown dry dust
## [186] earn eat elate email emerge
## [191] encourage end enjoy evolve exaggerate
## [196] exclude expand expect experiment explain
## [201] explode exploit express extend face
## [206] facilitate fail fall fall apart fall asleep
## [211] fall down feed feel fight fight for
## [216] fill fill up film find finger
## [221] finish finish off fire flap flee
## [226] flick flirt flit fly fly through
## [231] focus fold follow fool around forget
## [236] fray freak out freewheel freeze frown
## [241] fry fuck fuel fume function
## [246] fundraise gain garden gasp gear up
## [251] get get along get away get back get into
## [256] get off get on get through get up get used
## [261] give give away give off give up gnaw
## [266] go go along go back go down go for
## [271] go off go on go out go over go round
## [276] go through go up grab greet grill
## [281] grin grope grow grow up guarantee
## [286] guess hack up hammer hamper hand
## [291] hand out handcuff hang hang out hang about
## [296] hang out hang over happen harass harm
## [301] hassle have have on head hear
## [306] heat up help help out hibernate hide
## [311] hike hit hog hold hold up
## [316] hop hope host huff hug
## [321] hunt hurt impoverish include indicate
## [326] inspect inspire insure interact interrogate
## [331] interview invite itch jog join
## [336] joke judge juggle jump keep
## [341] kick kid kill knacker kneel
## [346] knock knock around knock out land laugh
## [351] lay laze around lead lean in learn
## [356] leave let level lie lift
## [361] light up link listen live live up
## [366] look look after look around look for look forwad
## [371] look forward look into look over look up loop up
## [376] lose love make make up manage
## [381] march mark maximise mean meet
## [386] memorize mention mess mess up message
## [391] milk miss mix moan mock
## [396] monitor mouth move move in munch
## [401] name nest nick notice nourish
## [406] observe occur offer operate organise
## [411] overchange overflow overlap overreact overthink
## [416] pack paint panic park participate
## [421] party pass pay pedal perform
## [426] persevere phone pick pick up pilot
## [431] pinch piss pitch place plan
## [436] plan on plant play play up point
## [441] point in pollute pop out pop over pop up
## [446] pose post pour practice practise
## [451] pray prepare present press pretend
## [456] prey print proclaim produce promise
## [461] promote protest pull pull apart pull over
## [466] pump puncture purse push push for
## [471] push out push through put put in put into
## [476] put off put on put out put together put up
## [481] question queue queue up race rain
## [486] rake in rap react read realise
## [491] receive record recover recycle refer
## [496] reflect refuse rehearse relax release
## [501] rely rely on remember renovate rent
## [506] reopen repair repeat reply report
## [511] represent research resign retire return
## [516] rev reverse revise rewrite ride
## [521] ring rip off rip up rise risk
## [526] roast rob roll romanticise ruin
## [531] run run out rush sabotage save
## [536] savour say scam scan scrabble
## [541] search see seek sell send
## [546] serve set set up settle shake
## [551] share shine shoot shop shout
## [556] show shower shrug shut sign
## [561] sing sit sit up skinny-dip slam
## [566] sleep sleet slow smash smell
## [571] smile smirk smoke sniff snooze
## [576] snow soak socialise sort out sound
## [581] speak spell spend spit split
## [586] splutter sponsor spot spray spread
## [591] spring up sprint squander squeeze stalk
## [596] stand stand up stare start start up
## [601] starve stay steal stick stop
## [606] stress stretch strip strip off stroll
## [611] struggle study suffer suggest sunbathe
## [616] support surf sweat swim swish
## [621] take take back take out take over take part
## [626] take up talk taste tax teach
## [631] tear tease tell test text
## [636] think threaten throw throw up tick
## [641] tick along tidy tilt tip top
## [646] touch trail behind train transcribe travel
## [651] tread treat trend try turn
## [656] turn into turn off turn up tweet twist
## [661] twitch type understand unpack update
## [666] upload upset urge use vibrate
## [671] visit visualise voice volunteer vomit
## [676] vote wait wait for wake up walk
## [681] walk up want wash wash up waste
## [686] watch water wear wear through wheel
## [691] wheeze whisper whiz by win wind up
## [696] wipe wish wonder work work on
## [701] work out worry write write down yawn
## 705 Levels: accept ache act action add advise agree aim allow announce ... yawn
lemmas_short <- sort(unique(stringr::word(lemmas_with_phrasal_verbs,1))); lemmas_short # This is the lemma list which will be used to extract the FVPs from the full corpora processed with Spacy.
## [1] "accept" "ache" "act" "action" "add"
## [6] "advise" "agree" "aim" "allow" "announce"
## [11] "annoy" "answer" "apologise" "apply" "appoint"
## [16] "approach" "argue" "arrange" "arrive" "ask"
## [21] "asphalt" "assume" "attack" "attract" "babysit"
## [26] "bake" "balloon" "bang" "bar" "bark"
## [31] "base" "bask" "battle" "be" "beat"
## [36] "become" "beef" "begin" "behave" "believe"
## [41] "bet" "blame" "blink" "block" "bloom"
## [46] "blow" "board" "boil" "bomb" "book"
## [51] "bore" "borrow" "bother" "bottle" "bounce"
## [56] "bow" "brag" "break" "breathe" "bring"
## [61] "browse" "build" "burgeon" "burn" "burst"
## [66] "buy" "call" "camp" "carry" "cat-sit"
## [71] "catch" "cause" "celebrate" "change" "charge"
## [76] "chase" "chat" "cheat" "check" "cheer"
## [81] "chew" "chip" "choke" "choose" "chop"
## [86] "clamber" "clap" "clarify" "clean" "clear"
## [91] "climb" "close" "collaborate" "collect" "come"
## [96] "comment" "commit" "communicate" "compete" "complain"
## [101] "compose" "concentrate" "confuse" "consider" "consume"
## [106] "contact" "contemplate" "continue" "contradict" "control"
## [111] "cook" "cope" "copy" "cost" "cosy"
## [116] "cough" "count" "crack" "crash" "create"
## [121] "cringe" "criticise" "cross" "crowdfund" "crumble"
## [126] "cry" "cut" "cycle" "dance" "date"
## [131] "deal" "decide" "decline" "decorate" "delete"
## [136] "deliver" "demand" "deny" "describe" "desert"
## [141] "design" "develop" "die" "dig" "digress"
## [146] "disappear" "discover" "discuss" "disobey" "dissect"
## [151] "disturb" "diversify" "do" "doodle" "download"
## [156] "drag" "draw" "dread" "dream" "dress"
## [161] "drink" "drip" "drive" "drown" "dry"
## [166] "dust" "earn" "eat" "elate" "email"
## [171] "emerge" "encourage" "end" "enjoy" "evolve"
## [176] "exaggerate" "exclude" "expand" "expect" "experiment"
## [181] "explain" "explode" "exploit" "express" "extend"
## [186] "face" "facilitate" "fail" "fall" "feed"
## [191] "feel" "fight" "fill" "film" "find"
## [196] "finger" "finish" "fire" "flap" "flee"
## [201] "flick" "flirt" "flit" "fly" "focus"
## [206] "fold" "follow" "fool" "forget" "fray"
## [211] "freak" "freewheel" "freeze" "frown" "fry"
## [216] "fuck" "fuel" "fume" "function" "fundraise"
## [221] "gain" "garden" "gasp" "gear" "get"
## [226] "give" "gnaw" "go" "grab" "greet"
## [231] "grill" "grin" "grope" "grow" "guarantee"
## [236] "guess" "hack" "hammer" "hamper" "hand"
## [241] "handcuff" "hang" "happen" "harass" "harm"
## [246] "hassle" "have" "head" "hear" "heat"
## [251] "help" "hibernate" "hide" "hike" "hit"
## [256] "hog" "hold" "hop" "hope" "host"
## [261] "huff" "hug" "hunt" "hurt" "impoverish"
## [266] "include" "indicate" "inspect" "inspire" "insure"
## [271] "interact" "interrogate" "interview" "invite" "itch"
## [276] "jog" "join" "joke" "judge" "juggle"
## [281] "jump" "keep" "kick" "kid" "kill"
## [286] "knacker" "kneel" "knock" "land" "laugh"
## [291] "lay" "laze" "lead" "lean" "learn"
## [296] "leave" "let" "level" "lie" "lift"
## [301] "light" "link" "listen" "live" "look"
## [306] "loop" "lose" "love" "make" "manage"
## [311] "march" "mark" "maximise" "mean" "meet"
## [316] "memorize" "mention" "mess" "message" "milk"
## [321] "miss" "mix" "moan" "mock" "monitor"
## [326] "mouth" "move" "munch" "name" "nest"
## [331] "nick" "notice" "nourish" "observe" "occur"
## [336] "offer" "operate" "organise" "overchange" "overflow"
## [341] "overlap" "overreact" "overthink" "pack" "paint"
## [346] "panic" "park" "participate" "party" "pass"
## [351] "pay" "pedal" "perform" "persevere" "phone"
## [356] "pick" "pilot" "pinch" "piss" "pitch"
## [361] "place" "plan" "plant" "play" "point"
## [366] "pollute" "pop" "pose" "post" "pour"
## [371] "practice" "practise" "pray" "prepare" "present"
## [376] "press" "pretend" "prey" "print" "proclaim"
## [381] "produce" "promise" "promote" "protest" "pull"
## [386] "pump" "puncture" "purse" "push" "put"
## [391] "question" "queue" "race" "rain" "rake"
## [396] "rap" "react" "read" "realise" "receive"
## [401] "record" "recover" "recycle" "refer" "reflect"
## [406] "refuse" "rehearse" "relax" "release" "rely"
## [411] "remember" "renovate" "rent" "reopen" "repair"
## [416] "repeat" "reply" "report" "represent" "research"
## [421] "resign" "retire" "return" "rev" "reverse"
## [426] "revise" "rewrite" "ride" "ring" "rip"
## [431] "rise" "risk" "roast" "rob" "roll"
## [436] "romanticise" "ruin" "run" "rush" "sabotage"
## [441] "save" "savour" "say" "scam" "scan"
## [446] "scrabble" "search" "see" "seek" "sell"
## [451] "send" "serve" "set" "settle" "shake"
## [456] "share" "shine" "shoot" "shop" "shout"
## [461] "show" "shower" "shrug" "shut" "sign"
## [466] "sing" "sit" "skinny-dip" "slam" "sleep"
## [471] "sleet" "slow" "smash" "smell" "smile"
## [476] "smirk" "smoke" "sniff" "snooze" "snow"
## [481] "soak" "socialise" "sort" "sound" "speak"
## [486] "spell" "spend" "spit" "split" "splutter"
## [491] "sponsor" "spot" "spray" "spread" "spring"
## [496] "sprint" "squander" "squeeze" "stalk" "stand"
## [501] "stare" "start" "starve" "stay" "steal"
## [506] "stick" "stop" "stress" "stretch" "strip"
## [511] "stroll" "struggle" "study" "suffer" "suggest"
## [516] "sunbathe" "support" "surf" "sweat" "swim"
## [521] "swish" "take" "talk" "taste" "tax"
## [526] "teach" "tear" "tease" "tell" "test"
## [531] "text" "think" "threaten" "throw" "tick"
## [536] "tidy" "tilt" "tip" "top" "touch"
## [541] "trail" "train" "transcribe" "travel" "tread"
## [546] "treat" "trend" "try" "turn" "tweet"
## [551] "twist" "twitch" "type" "understand" "unpack"
## [556] "update" "upload" "upset" "urge" "use"
## [561] "vibrate" "visit" "visualise" "voice" "volunteer"
## [566] "vomit" "vote" "wait" "wake" "walk"
## [571] "want" "wash" "waste" "watch" "water"
## [576] "wear" "wheel" "wheeze" "whisper" "whiz"
## [581] "win" "wind" "wipe" "wish" "wonder"
## [586] "work" "worry" "write" "yawn"
#saveRDS(lemmas_short, file = here("lemmas_short.rds"))
## Prepare FVP counts in the "Extract_verb_lemmas_spacy.R" script with the list created above. This script leads to the creation of the verb_counts.rds file.
## Prepare non-prog verb counts ##
verb_counts <- readRDS(file = here("FVP_counts_new.rds"))
head(verb_counts)
## lemma TxBSpoken_count BNCSpoken_count BNCSpoken_count_sample
## 1 accept 11 340 12.69688471
## 2 ache 1 44 1.64312626
## 3 act 20 265 9.89610132
## 4 action 0 1 0.03734378
## 5 add 27 800 29.87502285
## 6 advise 7 73 2.72609583
#verb_counts <- verb_counts[,c(1,2,4)] # Get rid of total BNC verb count because I'm now only interested in the sample corresponding to the number of FVPs found in Textbook Conversation.
## Adjusting sample ratio with adjusted FVP count for Textbook Conversation (+4 FVPs) ##
verb_counts$BNCSpoken_count_sample <- verb_counts$BNCSpoken_count*(57007/1526439)
head(verb_counts, 20)
## lemma TxBSpoken_count BNCSpoken_count BNCSpoken_count_sample
## 1 accept 11 340 12.6977757
## 2 ache 1 44 1.6432416
## 3 act 20 265 9.8967957
## 4 action 0 1 0.0373464
## 5 add 27 800 29.8771192
## 6 advise 7 73 2.7262871
## 7 agree 122 976 36.4500855
## 8 aim 5 107 3.9960647
## 9 allow 80 1584 59.1566961
## 10 announce 2 71 2.6515943
## 11 annoy 6 469 17.5154611
## 12 answer 51 366 13.6687820
## 13 apply 10 548 20.4658267
## 14 appoint 1 15 0.5601960
## 15 approach 6 98 3.6599471
## 16 argue 18 249 9.2992534
## 17 arrange 7 164 6.1248094
## 18 arrive 130 474 17.7021931
## 19 ask 342 4338 162.0086790
## 20 asphalt 0 7 0.2614248
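# Quick check of the scaling applied above: each full-BNC count is multiplied by 57007/1526439 (approx. 0.0373), e.g. for ACCEPT:
340 * (57007/1526439) # approx. 12.70, matching the BNCSpoken_count_sample value shown above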
## Subtract progressives from total FVP_counts
prog$Lemma_short <- stringr::word(prog$Lemma,1)
prog_TxB <- as.data.frame(table(prog[prog$Corpus=="Textbook Conversation", "Lemma_short"]))
names(prog_TxB)[1] <- "lemma"
names(prog_TxB)[2] <- "TxB_prog_count"
head(prog_TxB)
## lemma TxB_prog_count
## 1 act 2
## 2 add 1
## 3 advise 1
## 4 answer 3
## 5 appoint 1
## 6 approach 2
prog_BNC <- as.data.frame(table(prog[prog$Corpus=="Spoken BNC 2014 sample", "Lemma_short"]))
names(prog_BNC)[1] <- "lemma"
names(prog_BNC)[2] <- "BNC_prog_count"
head(prog_BNC)
## lemma BNC_prog_count
## 1 accept 2
## 2 ache 1
## 3 action 1
## 4 add 2
## 5 agree 1
## 6 aim 2
counttest <- merge(verb_counts, prog_TxB, by = 'lemma', all = TRUE)
counts <- merge(counttest, prog_BNC, by = 'lemma', all = TRUE)
counts[is.na(counts)] <- 0 # Replace all NAs with 0
counts$TxBSpoken_non_prog = counts$TxBSpoken_count - counts$TxB_prog_count
counts$BNC_non_prog = counts$BNCSpoken_count_sample - counts$BNC_prog_count
head(counts); tail(counts) # Problem: lots of negative non-prog values!!
## lemma TxBSpoken_count BNCSpoken_count BNCSpoken_count_sample TxB_prog_count
## 1 accept 11 340 12.6977757 0
## 2 ache 1 44 1.6432416 0
## 3 act 20 265 9.8967957 2
## 4 action 0 1 0.0373464 0
## 5 add 27 800 29.8771192 1
## 6 advise 7 73 2.7262871 1
## BNC_prog_count TxBSpoken_non_prog BNC_non_prog
## 1 2 11 10.6977757
## 2 1 1 0.6432416
## 3 0 18 9.8967957
## 4 1 0 -0.9626536
## 5 2 26 27.8771192
## 6 0 6 2.7262871
## lemma TxBSpoken_count BNCSpoken_count BNCSpoken_count_sample
## 584 scrabble 0 0 0
## 585 squeeze 0 0 0
## 586 stalk 0 0 0
## 587 tease 0 0 0
## 588 trail 0 0 0
## 589 vote 0 0 0
## TxB_prog_count BNC_prog_count TxBSpoken_non_prog BNC_non_prog
## 584 0 1 0 -1
## 585 0 1 0 -1
## 586 0 1 0 -1
## 587 0 1 0 -1
## 588 0 1 0 -1
## 589 0 1 0 -1
## What shall I do with all these decimal values from the BNC sample??
# a) Round up and down? Probably less than ideal!
#verb_counts$BNCSpoken_count_sample <- round(verb_counts$BNCSpoken_count_sample, 0)
# b) Peter's suggestion was to sample which very low-frequency verbs to remove but this is no longer necessary.
# c) It turns out that decimal values are not a problem for the coll.analysis script. Solution: just keep them in, even if, conceptually, they don't make a lot of sense.
verb_counts[verb_counts$TxBSpoken_count==0 & verb_counts$BNCSpoken_count_sample==0,] # These are a few problematic lemmas for which there are progressive counts (because otherwise these lemmas would not have been in the list) but no FVP counts in either Textbook Conversation or the BNC sample. See more below.
## lemma TxBSpoken_count BNCSpoken_count BNCSpoken_count_sample
## 68 cat-sit 0 0 0
## 324 overchange 0 0 0
## 425 scam 0 0 0
## 448 skinny-dip 0 0 0
## 451 sleet 0 0 0
#verb_counts <- verb_counts[verb_counts$TxBSpoken_count!=0 & verb_counts$BNCSpoken_count_sample!=0,] # Drop lemmas with zero in both corpora?
#verb_counts <- verb_counts[verb_counts$TxBSpoken_count!=0,]
#verb_counts <- verb_counts[verb_counts$BNCSpoken_count_sample!=0,] # Drop lemmas with a zero frequency in either corpus?
counts[counts$lemma=="say",]
## lemma TxBSpoken_count BNCSpoken_count BNCSpoken_count_sample TxB_prog_count
## 424 say 817 50045 1869.001 29
## BNC_prog_count TxBSpoken_non_prog BNC_non_prog
## 424 214 788 1655.001
## TxB Conversation problem ##
summary(counts$TxBSpoken_non_prog) # We have negative values!!
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.00 0.00 4.00 92.67 25.00 19876.00
counts[counts$TxBSpoken_non_prog<0,c("lemma", "TxBSpoken_count", "TxB_prog_count", "TxBSpoken_non_prog")] # Four verbs which were, unsurprisingly, not identified as such by the Spacy model
## lemma TxBSpoken_count TxB_prog_count TxBSpoken_non_prog
## 293 loop 0 1 -1
## 318 nourish 0 1 -1
## 324 overchange 0 1 -1
## 534 upset 0 1 -1
counts$TxBSpoken_non_prog[counts$TxBSpoken_non_prog<0] <- 0 # We will therefore set the non-prog count to 0 for these four lemmas. This adds four FVPs to the total count for the contingency tables.
## Alternative to the above ##
#counts <- subset(counts, !(TxBSpoken_non_prog<0)) # A more logical alternative, given the solution used for the Spoken BNC 2014? But it actually slightly increases the gap in total list-FVPs.
summary(counts$TxBSpoken_non_prog) # Check operation
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 4.00 92.67 25.00 19876.00
## Bigger problem with reference corpus! ##
# Sampling (of the Spoken BNC 2014) and tagging errors sometimes lead to negative values, therefore I will have to do something about this. Maybe change all negative values to zero?
summary(counts$BNC_non_prog)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.656 0.120 3.921 90.939 16.374 21020.104
sum(counts$BNC_prog_count) + sum(counts$BNC_non_prog) # 57007 FVPs in the Spoken BNC sample at this point
## [1] 57007
negatives <- counts[counts$BNC_non_prog<0,c("lemma", "BNCSpoken_count", "BNC_non_prog", "BNC_prog_count", "TxBSpoken_non_prog")] #
head(negatives)
## lemma BNCSpoken_count BNC_non_prog BNC_prog_count TxBSpoken_non_prog
## 4 action 1 -0.9626536 1 0
## 20 asphalt 7 -0.7385752 1 0
## 24 babysit 21 -0.2157256 1 2
## 26 balloon 6 -0.7759216 1 0
## 28 bar 17 -0.3651112 1 0
## 31 bask 1 -0.9626536 1 0
nrow(negatives) # 131 lemmas with negative BNC_non_prog counts
## [1] 131
sum(negatives$BNC_non_prog) # Equivalent to a deficit of about 96.63 FVPs
## [1] -96.63055
## One way to gauge the scale of the problem: how many BNC progressives belong to lemmas with negative non-prog counts? ##
lostBNC <- if_else(counts$BNC_non_prog<0, counts$BNC_prog_count, 0)
sum(lostBNC) # 177 # This is not making things any better, on the contrary!
## [1] 177
## Let's think about removing the low-frequency lemmas which are only sampling artefacts and keeping only the ones which are due to annotation errors:
counts[counts$lemma=="party",] # This is likely to be a tagging error. KEEP.
## lemma TxBSpoken_count BNCSpoken_count BNCSpoken_count_sample TxB_prog_count
## 334 party 1 15 0.560196 1
## BNC_prog_count TxBSpoken_non_prog BNC_non_prog
## 334 1 0 -0.439804
counts[counts$lemma=="action",] # This is most likely a sampling artefact and I am not so interested in it because the CL value will not be insignificant either way.
## lemma TxBSpoken_count BNCSpoken_count BNCSpoken_count_sample TxB_prog_count
## 4 action 0 1 0.0373464 0
## BNC_prog_count TxBSpoken_non_prog BNC_non_prog
## 4 1 0 -0.9626536
counts[counts$lemma=="shine",]
## lemma TxBSpoken_count BNCSpoken_count BNCSpoken_count_sample TxB_prog_count
## 437 shine 10 19 0.7095816 2
## BNC_prog_count TxBSpoken_non_prog BNC_non_prog
## 437 1 8 -0.2904184
counts[counts$lemma=="sweat",]
## lemma TxBSpoken_count BNCSpoken_count BNCSpoken_count_sample TxB_prog_count
## 497 sweat 0 92 3.435869 0
## BNC_prog_count TxBSpoken_non_prog BNC_non_prog
## 497 5 0 -1.564131
keep <- counts[counts$BNC_non_prog<0 & counts$BNC_prog_count>1,]; keep # These are potentially important ones I'd like to keep like JOKE and KID.
## lemma TxBSpoken_count BNCSpoken_count BNCSpoken_count_sample
## 99 concentrate 9 101 3.7719863
## 166 exaggerate 3 36 1.3444704
## 191 flick 0 59 2.2034375
## 266 joke 16 388 14.4904028
## 272 kid 16 151 5.6393063
## 368 protest 4 19 0.7095816
## 376 queue 0 44 1.6432416
## 377 race 0 45 1.6805880
## 398 reopen 1 15 0.5601960
## 421 sabotage 0 15 0.5601960
## 468 spit 0 55 2.0540519
## 488 strip 1 50 1.8673200
## 497 sweat 0 92 3.4358687
## 552 wheel 1 29 1.0830456
## 565 apologise 0 0 0.0000000
## 570 crash 0 0 0.0000000
## TxB_prog_count BNC_prog_count TxBSpoken_non_prog BNC_non_prog
## 99 2 5 7 -1.2280137
## 166 3 4 0 -2.6555296
## 191 0 3 0 -0.7965625
## 266 11 15 5 -0.5095972
## 272 13 8 3 -2.3606937
## 368 1 2 3 -1.2904184
## 376 0 2 0 -0.3567584
## 377 0 2 0 -0.3194120
## 398 0 2 1 -1.4398040
## 421 0 2 0 -1.4398040
## 468 0 4 0 -1.9459481
## 488 0 2 1 -0.1326800
## 497 0 5 0 -1.5641313
## 552 0 2 1 -0.9169544
## 565 0 2 0 -2.0000000
## 570 0 2 0 -2.0000000
sum(keep$BNC_prog_count) # This adds 62 FVPs to the total.
## [1] 62
factor(keep$lemma) # For 16 unique lemmas
## [1] concentrate exaggerate flick joke kid protest
## [7] queue race reopen sabotage spit strip
## [13] sweat wheel apologise crash
## 16 Levels: concentrate exaggerate flick joke kid protest queue race ... crash
dispose <- counts[counts$BNC_non_prog<0 & counts$BNC_prog_count<=1,] # These are the low-frequency ones we can get rid of.
sum(dispose$BNC_prog_count) # This removes 115 FVPs from the total.
## [1] 115
factor(dispose$lemma) # For, as we would expect, 115 unique (low-frequency) lemmas.
## [1] action asphalt babysit balloon bar bask
## [7] battle beef bottle brag browse burgeon
## [13] cat-sit clarify collaborate compose contradict cosy
## [19] cringe crowdfund decline digress dissect doodle
## [25] exploit facilitate finger flap flee flit
## [31] fray freewheel fume function gasp gear
## [37] gnaw grope hamper harm hassle hibernate
## [43] hike hog huff inspect interrogate itch
## [49] laze maximise memorize monitor mouth munch
## [55] nest overflow overlap overthink party persevere
## [61] pilot pollute prey proclaim puncture rake
## [67] rev rewrite romanticise savour scam shine
## [73] skinny-dip sleet smirk snooze splutter sprint
## [79] stare sunbathe swish tilt tread trend
## [85] twitch unpack visualise voice wheeze whisper
## [91] whiz yawn bounce clamber contact contemplate
## [97] demand deny drag elate fuel hug
## [103] knacker name nick promise represent resign
## [109] rush scrabble squeeze stalk tease trail
## [115] vote
## 115 Levels: action asphalt babysit balloon bar bask battle beef bottle ... vote
# Question: Should I be getting rid of these if they happen to be more frequent in Textbook Conversation??
head(dispose[order(dispose$TxBSpoken_count, decreasing = T),], 12) # These lemmas should probably not be deleted because they are potentially relevant for Textbook Conversation!
## lemma TxBSpoken_count BNCSpoken_count BNCSpoken_count_sample
## 437 shine 10 19 0.7095816
## 243 hike 7 22 0.8216208
## 479 stare 4 26 0.9710064
## 127 decline 3 22 0.8216208
## 190 flee 3 12 0.4481568
## 351 pollute 3 10 0.3734640
## 24 babysit 2 21 0.7842744
## 59 browse 2 11 0.4108104
## 98 compose 2 14 0.5228496
## 260 interrogate 2 4 0.1493856
## 540 voice 2 20 0.7469280
## 554 whisper 2 23 0.8589672
## TxB_prog_count BNC_prog_count TxBSpoken_non_prog BNC_non_prog
## 437 2 1 8 -0.29041842
## 243 2 1 5 -0.17837922
## 479 3 1 1 -0.02899363
## 127 0 1 3 -0.17837922
## 190 0 1 3 -0.55184321
## 351 1 1 2 -0.62653601
## 24 0 1 2 -0.21572562
## 59 0 1 2 -0.58918961
## 98 0 1 2 -0.47715041
## 260 0 1 2 -0.85061440
## 540 0 1 2 -0.25307202
## 554 1 1 1 -0.14103282
dispose2 <- counts[counts$BNC_non_prog<0 & counts$BNC_prog_count<=1 & counts$TxBSpoken_count<1,]
factor(dispose2$lemma) # That would mean that I only delete 97 verb lemmas & therefore 97 FVPs from the BNC total count
## [1] action asphalt balloon bar bask battle
## [7] beef bottle brag burgeon cat-sit clarify
## [13] collaborate contradict cosy cringe crowdfund digress
## [19] dissect doodle facilitate finger flap flit
## [25] fray freewheel function gear gnaw grope
## [31] hamper harm hassle hibernate hog huff
## [37] itch laze maximise memorize monitor mouth
## [43] nest overflow overlap overthink persevere pilot
## [49] prey proclaim puncture rake rev rewrite
## [55] romanticise savour scam skinny-dip sleet smirk
## [61] snooze splutter sprint sunbathe swish tilt
## [67] tread trend twitch unpack visualise wheeze
## [73] whiz yawn bounce clamber contact contemplate
## [79] demand deny drag elate fuel hug
## [85] knacker name nick promise represent resign
## [91] rush scrabble squeeze stalk tease trail
## [97] vote
## 97 Levels: action asphalt balloon bar bask battle beef bottle brag ... vote
nrow(counts)
## [1] 589
counts <- subset(counts, !(BNC_non_prog<0 & BNC_prog_count==1 & TxBSpoken_count<1)) # This excludes the 97 hapax legomenon lemmas (97 FVPs) from the counts data frame
nrow(counts) # 492
## [1] 492
sum(counts$BNC_prog_count) + sum(counts$BNC_non_prog) # and 56979.33 FVPs in Spoken BNC
## [1] 56979.33
sum(counts$BNC_non_prog)
## [1] 53632.33
summary(counts$BNC_non_prog)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.656 1.367 6.129 109.009 24.839 21020.104
counts$BNC_non_prog[counts$BNC_non_prog<0] <- 0 # It doesn't make any sense to have negative non-progressive counts, therefore all negative counts will be levelled to 0.
sum(counts$BNC_non_prog)
## [1] 53659.63
sum(counts$BNC_prog_count) + sum(counts$BNC_non_prog) # and 57006.63 FVPs in Spoken BNC
## [1] 57006.63
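# Check that no negative non-progressive counts remain after levelling:
any(counts$BNC_non_prog < 0) # should now be FALSE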
## Summary
#counts <- counts[,c("lemma","TxB_prog_count", "TxBSpoken_non_prog", "BNC_prog_count", "BNC_non_prog")]
head(counts); tail(counts)
## lemma TxBSpoken_count BNCSpoken_count BNCSpoken_count_sample TxB_prog_count
## 1 accept 11 340 12.697776 0
## 2 ache 1 44 1.643242 0
## 3 act 20 265 9.896796 2
## 5 add 27 800 29.877119 1
## 6 advise 7 73 2.726287 1
## 7 agree 122 976 36.450085 0
## BNC_prog_count TxBSpoken_non_prog BNC_non_prog
## 1 2 11 10.6977757
## 2 1 1 0.6432416
## 3 0 18 9.8967957
## 5 2 26 27.8771192
## 6 0 6 2.7262871
## 7 1 122 35.4500855
## lemma TxBSpoken_count BNCSpoken_count BNCSpoken_count_sample
## 560 wonder 38 1733 64.72131
## 561 work 329 8008 299.06996
## 562 worry 142 1043 38.95229
## 563 write 182 2840 106.06377
## 565 apologise 0 0 0.00000
## 570 crash 0 0 0.00000
## TxB_prog_count BNC_prog_count TxBSpoken_non_prog BNC_non_prog
## 560 7 7 31 57.72131
## 561 50 56 279 243.06996
## 562 0 3 142 35.95229
## 563 18 8 164 98.06377
## 565 0 2 0 0.00000
## 570 0 2 0 0.00000
## Final check ##
sum(counts$TxB_prog_count) + sum(counts$TxBSpoken_non_prog) # that's 57007 FVPs in TxB Spoken
## [1] 57007
sum(counts$BNC_prog_count) + sum(counts$BNC_non_prog) # and 57006.63 FVPs in Spoken BNC
## [1] 57006.63
(dif <- sum(counts$TxB_prog_count) + sum(counts$TxBSpoken_non_prog) - sum(counts$BNC_prog_count) - sum(counts$BNC_non_prog)) # Difference of approximately 0.37 FVPs
## [1] 0.3694507
dif / (sum(counts$TxB_prog_count) + sum(counts$TxBSpoken_non_prog)) * 100 # Difference as a percentage of the (larger) Textbook Conversation FVP total
## [1] 0.0006480796
## Contingency table function ##
TxB_CTable <- function(verb) {
x = subset(counts, lemma==verb)
vprog = x$TxB_prog_count
tverb = x$TxBSpoken_count
vnonprog = tverb - vprog
tprog = sum(counts$TxB_prog_count)
tnonprog = sum(counts$TxBSpoken_non_prog)
xvprog = tprog - vprog
xvnonprog = tnonprog - vnonprog
verb = c(vprog, vnonprog, tverb)
other = c(xvprog, xvnonprog, (xvprog+xvnonprog))
totals = c(tprog, tnonprog, (tprog + tnonprog))
df=data.frame(verb=verb, other=other, row_totals=totals)
return(df)
}
TxB_CTable("say")
## verb other row_totals
## 1 29 2394 2423
## 2 788 53796 54584
## 3 817 56190 57007
##
BNC_CTable <- function(verb) {
x = subset(counts, lemma==verb)
vprog = x$BNC_prog_count
tverb = x$BNCSpoken_count_sample
vnonprog = x$BNC_non_prog
tprog = sum(counts$BNC_prog_count)
tnonprog = sum(counts$BNC_non_prog)
xvprog = tprog - vprog
xvnonprog = tnonprog - vnonprog
verb = c(vprog, vnonprog, tverb)
other = c(xvprog, xvnonprog, (xvprog+xvnonprog))
totals = c(tprog, tnonprog, (tprog + tnonprog))
df=data.frame(verb=verb, other=other, row_totals=totals)
return(df)
}
BNC_CTable("party")
## verb other row_totals
## 1 1.000000 3346.00 3347.00
## 2 0.000000 53659.63 53659.63
## 3 0.560196 57005.63 57006.63
BNC_CTable("say")
## verb other row_totals
## 1 214.000 3133.00 3347.00
## 2 1655.001 52004.63 53659.63
## 3 1869.001 55137.63 57006.63
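## A minimal sketch of how the log-likelihood score (G2) could be derived from such a contingency table (using only the two observed rows and columns, i.e. without the marginal totals); collex.dist() below reports this association measure as COLL.STR.LOGL, though its internal implementation may differ in detail:
G2_from_CTable <- function(ct) {
  obs <- as.matrix(ct[1:2, 1:2]) # observed: prog/non-prog x verb/other lemmas
  expd <- outer(rowSums(obs), colSums(obs)) / sum(obs) # expected frequencies under independence
  2 * sum(obs * log(obs / expd), na.rm = TRUE) # G2 = 2 * sum(O * ln(O/E))
}
G2_from_CTable(TxB_CTable("say"))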
# This file can be split into two to be used for two separate DCA analyses à la 2b.csv
#saveRDS(counts, here("counts4DCA_2b.rds")) # Last saved on 26 January 2020
counts <- readRDS(file =here("counts4DCA_2b.rds"))
### Files for DCA analysis à la 2b.csv
# Textbook conversation
TxB_Spoken_DCA <- counts[,c("lemma", "TxB_prog_count", "TxBSpoken_non_prog")]
head(TxB_Spoken_DCA); tail(TxB_Spoken_DCA)
## lemma TxB_prog_count TxBSpoken_non_prog
## 1 accept 0 11
## 2 ache 0 1
## 3 act 2 18
## 5 add 1 26
## 6 advise 1 6
## 7 agree 0 122
## lemma TxB_prog_count TxBSpoken_non_prog
## 560 wonder 7 31
## 561 work 50 279
## 562 worry 0 142
## 563 write 18 164
## 565 apologise 0 0
## 570 crash 0 0
nrow(TxB_Spoken_DCA)
## [1] 492
TxB_Spoken_DCA <- subset(TxB_Spoken_DCA, TxB_prog_count>0 | TxBSpoken_non_prog>0) # Drop lemmas with zero occurrences in Textbook Conversation
nrow(TxB_Spoken_DCA)
## [1] 454
sum(TxB_Spoken_DCA$TxB_prog_count) + sum(TxB_Spoken_DCA$TxBSpoken_non_prog)
## [1] 57007
#write.table(TxB_Spoken_DCA, file = here("TxB_Spoken_DCA.csv"), sep = "\t")
# Spoken BNC2014 sample
BNC_DCA <- counts[,c("lemma", "BNC_prog_count", "BNC_non_prog")]
nrow(BNC_DCA)
## [1] 492
BNC_DCA <- subset(BNC_DCA, BNC_non_prog > 0 | BNC_prog_count > 0) # Drop lemmas with zero occurrences in Spoken BNC 2014
head(BNC_DCA); tail(BNC_DCA)
## lemma BNC_prog_count BNC_non_prog
## 1 accept 2 10.6977757
## 2 ache 1 0.6432416
## 3 act 0 9.8967957
## 5 add 2 27.8771192
## 6 advise 0 2.7262871
## 7 agree 1 35.4500855
## lemma BNC_prog_count BNC_non_prog
## 560 wonder 7 57.72131
## 561 work 56 243.06996
## 562 worry 3 35.95229
## 563 write 8 98.06377
## 565 apologise 2 0.00000
## 570 crash 2 0.00000
(sum(BNC_DCA$BNC_prog_count) + sum(BNC_DCA$BNC_non_prog)) - (sum(TxB_Spoken_DCA$TxB_prog_count) + sum(TxB_Spoken_DCA$TxBSpoken_non_prog)) # Difference in total number of FVPs: approx. -0.37 (the BNC sample total is marginally smaller)
## [1] -0.3694507
#write.table(BNC_DCA, file = here("BNC_DCA_decimals.csv"), sep = "\t")
library(collostructions)
counts <- read.csv(file = here("TxB_Spoken_DCA.csv"), sep = "\t")
head(counts)
## lemma TxB_prog_count TxBSpoken_non_prog
## 1 accept 0 11
## 2 ache 0 1
## 3 act 2 18
## 5 add 1 26
## 6 advise 1 6
## 7 agree 0 122
DCA_TxB <- collex.dist(counts) # default association measure with this package is log-likelihood
head(DCA_TxB); tail(DCA_TxB)
## COLLEX O.CXN1 E.CXN1 O.CXN2 E.CXN2 ASSOC COLL.STR.LOGL SIGNIF SHARED
## 1 wear 71 7.2 99 162.8 TxB_prog_count 227.79570 ***** Y
## 2 talk 88 12.5 205 280.5 TxB_prog_count 218.00717 ***** Y
## 3 do 206 73.7 1527 1659.3 TxB_prog_count 177.80929 ***** Y
## 4 look 117 44.1 921 993.9 TxB_prog_count 90.29046 ***** Y
## 5 wait 50 10.1 188 227.9 TxB_prog_count 88.14350 ***** Y
## 6 go 205 104.9 2263 2363.1 TxB_prog_count 83.38281 ***** Y
## COLLEX O.CXN1 E.CXN1 O.CXN2 E.CXN2 ASSOC COLL.STR.LOGL SIGNIF
## 449 have 59 116.2 2675 2617.8 TxBSpoken_non_prog 37.13818 *****
## 450 let 3 33.1 775 744.9 TxBSpoken_non_prog 47.32697 *****
## 451 love 0 23.1 544 520.9 TxBSpoken_non_prog 47.48734 *****
## 452 see 8 57.7 1350 1300.3 TxBSpoken_non_prog 70.78280 *****
## 453 want 2 50.1 1176 1127.9 TxBSpoken_non_prog 86.28789 *****
## 454 be 16 845.5 19876 19046.5 TxBSpoken_non_prog 1962.21780 *****
## SHARED
## 449 Y
## 450 Y
## 451 N
## 452 Y
## 453 Y
## 454 Y
library(flextable)
DCA_TxB %>%
rename(lemma = COLLEX, G2 = COLL.STR.LOGL) %>%
mutate(G2 = ifelse(ASSOC=="TxBSpoken_non_prog", -G2, G2)) %>%
mutate(G2 = round(G2, 1)) %>%
#mutate(across(.cols = c(E.CXN1, O.CXN2, E.CXN2), round, 0)) %>%
mutate(`O:Eprog` = as.character(paste0(O.CXN1,":",E.CXN1))) %>% # CXN1 = progressive construction
mutate(`O:Enon-prog` = as.character(paste0(O.CXN2,":",E.CXN2))) %>% # CXN2 = non-progressive
select(lemma, `O:Eprog`, `O:Enon-prog`, G2) %>%
filter(abs(G2) > 30) -> resultsTxB
resultsTxB
## lemma O:Eprog O:Enon-prog G2
## 1 wear 71:7.2 99:162.8 227.8
## 2 talk 88:12.5 205:280.5 218.0
## 3 do 206:73.7 1527:1659.3 177.8
## 4 look 117:44.1 921:993.9 90.3
## 5 wait 50:10.1 188:227.9 88.1
## 6 go 205:104.9 2263:2363.1 83.4
## 7 try 55:13.1 253:294.9 81.1
## 8 kid 13:0.7 3:15.3 67.0
## 9 work 50:14 279:315 60.2
## 10 play 56:17.4 353:391.6 58.4
## 11 plan 17:1.8 26:41.2 52.0
## 12 joke 11:0.7 5:15.3 50.1
## 13 rain 12:0.9 9:20.1 47.9
## 14 stand 20:3.7 67:83.3 38.5
## 15 study 18:3.1 56:70.9 36.5
## 16 lie 12:1.4 21:31.6 34.4
## 17 walk 27:7.6 151:170.4 32.3
## 18 smile 11:1.4 21:30.6 30.2
## 19 think 27:67.4 1558:1517.6 -33.1
## 20 have 59:116.2 2675:2617.8 -37.1
## 21 let 3:33.1 775:744.9 -47.3
## 22 love 0:23.1 544:520.9 -47.5
## 23 see 8:57.7 1350:1300.3 -70.8
## 24 want 2:50.1 1176:1127.9 -86.3
## 25 be 16:845.5 19876:19046.5 -1962.2
flextable(resultsTxB) %>%
compose(part = "header", j = "O:Enon-prog", value = as_paragraph("O:E", as_sub("non-prog"))) %>%
compose(part = "header", j = "O:Eprog", value = as_paragraph("O:E", as_sub("prog"))) %>%
compose(part = "header", j = "G2", value = as_paragraph("G", as_sup("2"))) %>%
theme_booktabs() #%>%
#save_as_docx(path = here("DCA_TxBConv.docx"))
# Reference Conversation
BNC_DCA <- read.csv(file = here("BNC_DCA_decimals.csv"), sep = "\t", stringsAsFactors = TRUE)
head(BNC_DCA)
## lemma BNC_prog_count BNC_non_prog
## 1 accept 2 10.6977757
## 2 ache 1 0.6432416
## 3 act 0 9.8967957
## 5 add 2 27.8771192
## 6 advise 0 2.7262871
## 7 agree 1 35.4500855
DCA_BNC <- collex.dist(BNC_DCA) # Remember that the default AM is log-likelihood
head(DCA_BNC); tail(DCA_BNC)
## COLLEX O.CXN1 E.CXN1 O.CXN2 E.CXN2 ASSOC COLL.STR.LOGL SIGNIF
## 1 talk 134 13.5 95.41893 215.9 BNC_prog_count 464.51462 *****
## 2 try 127 17.6 171.95792 281.4 BNC_prog_count 337.13045 *****
## 3 do 333 182.0 2766.41500 2917.4 BNC_prog_count 116.12667 *****
## 4 say 214 109.7 1655.00054 1759.3 BNC_prog_count 87.24665 *****
## 5 go 325 192.9 2960.32538 3092.4 BNC_prog_count 86.57210 *****
## 6 joke 15 0.9 0.00000 14.1 BNC_prog_count 85.11647 *****
## SHARED
## 1 Y
## 2 Y
## 3 Y
## 4 Y
## 5 Y
## 6 N
## COLLEX O.CXN1 E.CXN1 O.CXN2 E.CXN2 ASSOC COLL.STR.LOGL SIGNIF
## 482 get 121 189.9 3113.3475 3044.5 BNC_non_prog 31.86039 *****
## 483 mean 2 26.5 449.3312 424.8 BNC_non_prog 40.23969 *****
## 484 see 11 56.0 943.0511 898.0 BNC_non_prog 57.09053 *****
## 485 have 94 187.7 3102.2916 3008.6 BNC_non_prog 63.15981 *****
## 486 want 3 51.4 872.9598 824.5 BNC_non_prog 83.35300 *****
## 487 be 56 1237.4 21020.1042 19838.7 BNC_non_prog 2691.13330 *****
## SHARED
## 482 Y
## 483 Y
## 484 Y
## 485 Y
## 486 Y
## 487 Y
DCA_BNC %>%
rename(lemma = COLLEX, G2 = COLL.STR.LOGL) %>%
mutate(G2 = ifelse(ASSOC=="BNC_non_prog", -G2, G2)) %>%
mutate(G2 = round(G2, 1)) %>%
mutate(O.CXN2 = round(O.CXN2, 0)) %>%
mutate(`O:Eprog` = as.character(paste0(O.CXN1,":",E.CXN1))) %>% # CXN1 = progressive
mutate(`O:Enon-prog` = as.character(paste0(O.CXN2,":",E.CXN2))) %>% # CXN2 = non-progressive
select(lemma, `O:Eprog`, `O:Enon-prog`, G2) %>%
filter(abs(G2) > 25) -> resultsBNC
resultsBNC
## lemma O:Eprog O:Enon-prog G2
## 1 talk 134:13.5 95:215.9 464.5
## 2 try 127:17.6 172:281.4 337.1
## 3 do 333:182 2766:2917.4 116.1
## 4 say 214:109.7 1655:1759.3 87.2
## 5 go 325:192.9 2960:3092.4 86.6
## 6 joke 15:0.9 0:14.1 85.1
## 7 look 101:38.6 556:618.7 77.3
## 8 work 56:17.6 243:281.5 59.0
## 9 come 102:46.8 695:750.6 53.6
## 10 sit 36:8.7 113:139.9 53.4
## 11 record 15:1.6 11:24.9 50.4
## 12 kid 8:0.5 0:7.5 45.4
## 13 plan 12:1.2 9:19.5 41.0
## 14 rain 8:0.6 2:9.6 35.1
## 15 play 30:8.7 119:140.3 35.0
## 16 struggle 9:0.8 4:12.6 34.6
## 17 drink 18:3.6 43:57.5 33.3
## 18 wait 22:5.5 71:87.5 31.7
## 19 wear 18:3.9 49:63.2 30.0
## 20 concentrate 5:0.3 0:4.7 28.4
## 21 sweat 5:0.3 0:4.7 28.4
## 22 pay 30:10.2 144:163.4 27.8
## 23 run 22:6.1 81:97.1 27.7
## 24 hold 13:2.4 28:38.1 26.2
## 25 remember 1:18.3 311:293.7 -29.9
## 26 get 121:189.9 3113:3044.5 -31.9
## 27 mean 2:26.5 449:424.8 -40.2
## 28 see 11:56 943:898 -57.1
## 29 have 94:187.7 3102:3008.6 -63.2
## 30 want 3:51.4 873:824.5 -83.4
## 31 be 56:1237.4 21020:19838.7 -2691.1
flextable(resultsBNC) %>%
compose(part = "header", j = "O:Enon-prog", value = as_paragraph("O:E", as_sub("non-prog"))) %>%
compose(part = "header", j = "O:Eprog", value = as_paragraph("O:E", as_sub("prog"))) %>%
compose(part = "header", j = "G2", value = as_paragraph("G", as_sup("2"))) %>%
theme_booktabs() #%>%
#save_as_docx(path = here("DCA_SpokenBNC.docx"))
BNC_collstrength <- read.csv(file = here("BNC_DCA_decimals_LL.csv"), header = TRUE, sep=",", stringsAsFactors = TRUE)
glimpse(BNC_collstrength)
## Rows: 492
## Columns: 9
## $ words <fct> accept, ache, act, add, advise, agree, aim, …
## $ obs.freq.1 <int> 2, 1, 0, 2, 0, 1, 2, 1, 2, 4, 1, 2, 2, 0, 0,…
## $ obs.freq.2 <dbl> 10.6977757, 0.6432416, 9.8967957, 27.8771192…
## $ exp.freq.1 <dbl> 0.75, 0.10, 0.58, 1.75, 0.16, 2.14, 0.23, 3.…
## $ exp.freq.2 <dbl> 11.95, 1.55, 9.32, 28.12, 2.57, 34.31, 3.76,…
## $ pref.occur <fct> X.BNC_prog_count., X.BNC_prog_count., X.BNC_…
## $ delta.p.constr1.cues.word <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ delta.p.word.cues.constr1 <dbl> 0.10, 0.55, -0.06, 0.01, -0.06, -0.03, 0.44,…
## $ coll.strength <dbl> 1.58, 3.55, 1.20, 0.04, 0.33, 0.80, 6.04, 2.…
names(BNC_collstrength)[1] <- "lemma"
names(BNC_collstrength)[9] <- "coll.strength.BNC"
BNC_collstrength <- na.omit(BNC_collstrength) # Exclude lemmas with no data
BNC_collstrength$coll.strength.BNC <- ifelse(BNC_collstrength$pref.occur=="X.BNC_prog_count.", BNC_collstrength$coll.strength, -(BNC_collstrength$coll.strength))
BNC_collstrength <- BNC_collstrength[,c(1,9)]
head(BNC_collstrength); tail(BNC_collstrength)
## lemma coll.strength.BNC
## 1 accept 1.58
## 2 ache 3.55
## 3 act -1.20
## 4 add 0.04
## 5 advise -0.33
## 6 agree -0.80
## lemma coll.strength.BNC
## 487 wipe -0.69
## 488 wish -3.19
## 489 wonder 2.33
## 490 work 58.99
## 491 worry 0.22
## 492 write 0.50
TxBSpoken_collstrength <- read.csv(file = here("TxB_Spoken_DCA_collstrength_LL.csv"), header = TRUE, sep = ",", stringsAsFactors = TRUE)
glimpse(TxBSpoken_collstrength)
## Rows: 454
## Columns: 9
## $ words <fct> accept, ache, act, add, advise, agree, aim, …
## $ obs.freq.1 <int> 0, 0, 2, 1, 1, 0, 0, 0, 0, 0, 3, 0, 1, 2, 1,…
## $ obs.freq.2 <int> 11, 1, 18, 26, 6, 122, 5, 80, 2, 6, 48, 10, …
## $ exp.freq.1 <dbl> 0.47, 0.04, 0.85, 1.15, 0.30, 5.19, 0.21, 3.…
## $ exp.freq.2 <dbl> 10.53, 0.96, 19.15, 25.85, 6.70, 116.81, 4.7…
## $ pref.occur <fct> X.TxBSpoken_non_prog., X.TxBSpoken_non_prog.…
## $ delta.p.constr1.cues.word <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ delta.p.word.cues.constr1 <dbl> -0.04, -0.04, 0.06, -0.01, 0.10, -0.04, -0.0…
## $ coll.strength <dbl> 0.96, 0.09, 1.19, 0.02, 1.10, 10.61, 0.43, 6…
names(TxBSpoken_collstrength)[1] <- "lemma"
names(TxBSpoken_collstrength)[9] <- "coll.strength.TxBSpoken"
subset(TxBSpoken_collstrength, lemma=="mean")
## lemma obs.freq.1 obs.freq.2 exp.freq.1 exp.freq.2 pref.occur
## 248 mean 1 253 10.8 243.2 X.TxBSpoken_non_prog.
## delta.p.constr1.cues.word delta.p.word.cues.constr1 coll.strength.TxBSpoken
## 248 0 -0.04 15.26
TxBSpoken_collstrength <- na.omit(TxBSpoken_collstrength) # Exclude lemmas with no data
TxBSpoken_collstrength$coll.strength.TxBSpoken <- ifelse(TxBSpoken_collstrength$pref.occur=="X.TxB_prog_count.", TxBSpoken_collstrength$coll.strength, -(TxBSpoken_collstrength$coll.strength))
TxBSpoken_collstrength <- TxBSpoken_collstrength[,c(1,9)]
head(TxBSpoken_collstrength); tail(TxBSpoken_collstrength)
## lemma coll.strength.TxBSpoken
## 1 accept -0.96
## 2 ache -0.09
## 3 act 1.19
## 4 add -0.02
## 5 advise 1.10
## 6 agree -10.61
## lemma coll.strength.TxBSpoken
## 449 wipe 2.67
## 450 wish -0.68
## 451 wonder 10.61
## 452 work 60.23
## 453 worry -12.35
## 454 write 10.54
coll_strengths <- merge(BNC_collstrength, TxBSpoken_collstrength, by = 'lemma', all = FALSE) # Merge the two tables, keeping only lemmas attested in both corpora, so that the coll.strength values can be compared
#saveRDS(coll_strengths, file = here("coll_strengths_conversation.rds"))
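# Optional sanity check (not part of the original analysis): since the merge
# above uses all = FALSE, only lemmas attested in both corpora are retained.
#nrow(BNC_collstrength); nrow(TxBSpoken_collstrength); nrow(coll_strengths)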
coll_strengths %>%
ggplot(aes(x = coll.strength.TxBSpoken, y = coll.strength.BNC)) +
labs(x = "Textbook Conversation (TCC)", y = "Spoken BNC2014") +
geom_point(shape = "circle filled", fill = "grey") -> fig
fig
# We have one very extreme case: BE.
coll_strengths %>%
filter(coll.strength.TxBSpoken < -1000, coll.strength.BNC < -1000)
## lemma coll.strength.BNC coll.strength.TxBSpoken
## 1 be -2691.13 -1962.22
# Let's exclude BE for visualisation purposes then.
coll_strengths%>%
filter(lemma != "be") ->
d_not_to_be
fig %+% d_not_to_be
# There is a strong linear association between verbs' attraction to the progressive construction in Textbook Conversation and in naturally occurring conversation. Let's quantify it.
d_not_to_be %>%
summarise(corr_TxB_BNC = cor(coll.strength.TxBSpoken, coll.strength.BNC))
## corr_TxB_BNC
## 1 0.6896516
cor.test(coll_strengths$coll.strength.TxBSpoken, coll_strengths$coll.strength.BNC, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: coll_strengths$coll.strength.TxBSpoken and coll_strengths$coll.strength.BNC
## t = 115.29, df = 447, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9802844 0.9863567
## sample estimates:
## cor
## 0.9835969
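# For completeness, the same test could also be run on the data without BE
# (a sketch; output not reproduced here):
#cor.test(d_not_to_be$coll.strength.TxBSpoken, d_not_to_be$coll.strength.BNC, method = "pearson")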
# According to Susanne Flach, association measures are not directly comparable across corpora, even after controlling for the total number of FVPs, so let's standardise the two association variables.
# The absolute difference between the standardised scores then quantifies how much more strongly a verb is associated with the progressive in one corpus than in the other.
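# (Minimal illustration, not part of the original analysis: scale() converts a
# vector to z-scores, e.g. scale(c(2, 4, 6)) yields -1, 0, 1.)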
d_not_to_be %>%
mutate(
TxBSpoken_scaled = as.vector(scale(coll.strength.TxBSpoken)),
BNC_scaled = as.vector(scale(coll.strength.BNC)),
dev = abs(BNC_scaled - TxBSpoken_scaled)
) ->
d_not_to_be
# We can now identify verbs that are extreme by this measure of deviance.
d_not_to_be %>%
filter(dev > 1) %>%
select(lemma, coll.strength.TxBSpoken, coll.strength.BNC, dev) %>%
arrange(-dev)
## lemma coll.strength.TxBSpoken coll.strength.BNC dev
## 1 wear 227.80 30.02 9.467230
## 2 try 81.10 337.13 7.392262
## 3 talk 218.01 464.51 5.290619
## 4 do 177.81 116.13 4.332710
## 5 wait 88.14 31.69 2.989542
## 6 say -1.06 87.25 2.942169
## 7 love -47.49 -5.27 2.030740
## 8 let -47.33 -7.03 1.965426
## 9 record 0.39 50.38 1.661368
## 10 look 90.29 77.31 1.586163
## 11 kid 67.00 45.38 1.566546
## 12 think -33.15 -0.69 1.522089
## 13 play 58.38 34.97 1.512927
## 14 stand 38.45 8.30 1.474620
## 15 see -70.78 -57.09 1.395385
## 16 smile 30.16 1.32 1.323228
## 17 study 36.55 10.67 1.309199
## 18 come 10.95 53.64 1.283085
## 19 prepare 27.83 -0.60 1.279301
## 20 want -86.29 -83.35 1.243927
## 21 feel 27.26 0.09 1.230366
## 22 struggle 0.73 34.61 1.126431
## 23 walk 32.30 11.72 1.079174
## 24 rain 47.95 35.13 1.028003
## 25 plan 52.02 40.98 1.022536
## 26 sleep 22.24 -0.08 1.005104
d_not_to_be %>%
filter(dev > 1.28) %>%
arrange(-dev) ->
d_extremes
d_extremes
## lemma coll.strength.BNC coll.strength.TxBSpoken TxBSpoken_scaled
## 1 wear 30.02 227.80 10.2667320
## 2 try 337.13 81.10 3.5202800
## 3 talk 464.51 218.01 9.8165086
## 4 do 116.13 177.81 7.9677876
## 5 wait 31.69 88.14 3.8440362
## 6 say 87.25 -1.06 -0.2581009
## 7 love -5.27 -47.49 -2.3933277
## 8 let -7.03 -47.33 -2.3859696
## 9 record 50.38 0.39 -0.1914182
## 10 look 77.31 90.29 3.9429105
## 11 kid 45.38 67.00 2.8718481
## 12 think -0.69 -33.15 -1.7338585
## 13 play 34.97 58.38 2.4754308
## 14 stand 8.30 38.45 1.5588883
## 15 see -57.09 -70.78 -3.4643901
## 16 smile 1.32 30.16 1.1776470
## 17 study 10.67 36.55 1.4715109
## 18 come 53.64 10.95 0.2942160
## BNC_scaled dev
## 1 0.79950154 9.467230
## 2 10.91254204 7.392262
## 3 15.10712737 5.290619
## 4 3.63507810 4.332710
## 5 0.85449414 2.989542
## 6 2.68406833 2.942169
## 7 -0.36258762 2.030740
## 8 -0.42054389 1.965426
## 9 1.46995025 1.661368
## 10 2.35674710 1.586163
## 11 1.30530174 1.566546
## 12 -0.21176958 1.522089
## 13 0.96250356 1.512927
## 14 0.08426843 1.474620
## 15 -2.06900473 1.395385
## 16 -0.14558089 1.323228
## 17 0.16231182 1.309199
## 18 1.57730108 1.283085
d_not_to_be %>%
ggplot(aes(x = TxBSpoken_scaled, y = BNC_scaled, label = lemma)) +
coord_fixed() +
labs(x = "standardised CL (Textbook Conversation)", y = "standardised CL (Spoken BNC2014 sample)") +
geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
geom_point(shape = "circle filled", colour = "darkred", fill = "darkred", alpha = 0.5) +
geom_text_repel(data = d_extremes, min.segment.length = 1, segment.alpha = 0.6, max.overlaps = Inf, box.padding = 0.2, direction="both", arrow = arrow(length = unit(0.015, "npc"))) +
#geom_label_repel(data = d_extremes, hjust = "inward") +
theme_light()
#ggsave(here("DCA_scaled_comparison_Conversation.svg"), dpi = 300, width = 22, height = 16, units = "cm")
glimpse(FictionProg)
lemmas_with_phrasal_verbs <- sort(unique(FictionProg$Lemma))
lemmas_with_phrasal_verbs
# For now, I will be ignoring phrasal verbs and only looking at the verb lemmas
lemmas_short <- sort(unique(stringr::word(lemmas_with_phrasal_verbs,1)))
lemmas_short
#saveRDS(lemmas_short, file = here("Fiction_lemmas_short.rds")) # 28 August 2019
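# Quick illustration (not part of the original analysis) of what
# stringr::word() does with multi-word lemmas such as phrasal verbs:
# it keeps only the first token of each lemma.
stringr::word(c("pick up", "carry on", "run"), 1) # "pick" "carry" "run"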
## Prepare prog verb counts ##
glimpse(FictionProg)
prog_lemmas <- FictionProg[,c("Lemma", "Corpus")]
prog_lemmas$Lemma <- stringr::word(prog_lemmas$Lemma,1)
head(prog_lemmas)
tail(prog_lemmas)
levels(prog_lemmas$Corpus) <- c("TxBNar_Prog", "YF_Prog") # Rename factor levels (assumes the level order "Textbook Fiction", "Youth Fiction Sampled")
prog_lemmas <- prog_lemmas[,c(2,1)]
names(prog_lemmas)[1] <- "corpus"
names(prog_lemmas)[2] <- "lemma"
## Prepare non-prog verb counts ##
verb_counts <-readRDS(file = here("FVP_Nar_counts.rds"))
head(verb_counts)
verb_counts <- verb_counts[,c(1,2,4)] # Get rid of total YF verb count
verb_counts$YF_count_sample <- round(verb_counts$YF_count_sample, 0)
verb_counts <- verb_counts[verb_counts$TxBNar_count!=0 & verb_counts$YF_count_sample!=0,] # Drop lemmas with zero in both corpora
head(verb_counts)
## Subtract progressives from total FVP counts ##
FictionProg$Lemma_short <- stringr::word(FictionProg$Lemma,1)
prog_TxB <- as.data.frame(table(FictionProg[FictionProg$Corpus=="Textbook Fiction", "Lemma_short"]))
names(prog_TxB)[1] <- "lemma"
names(prog_TxB)[2] <- "TxB_Nar_prog_count"
head(prog_TxB)
prog_YF <- as.data.frame(table(FictionProg[FictionProg$Corpus=="Youth Fiction Sampled", "Lemma_short"]))
names(prog_YF)[1] <- "lemma"
names(prog_YF)[2] <- "YF_prog_count"
head(prog_YF)
counttest <- merge(verb_counts, prog_TxB, by = 'lemma', all = TRUE)
counts <- merge(counttest, prog_YF, by = 'lemma', all = TRUE)
counts[is.na(counts)] <- 0
head(counts); tail(counts)
counts$YF_non_prog = counts$YF_count_sample - counts$YF_prog_count
counts$TxB_Nar_non_prog = counts$TxBNar_count - counts$TxB_Nar_prog_count
# Sampling (of the Youth Fiction corpus) and tagging errors sometimes lead to negative values; these are therefore set to zero here
summary(counts$YF_non_prog)
counts[counts$YF_non_prog<0,c(1,5,6)]
sum(counts$YF_non_prog<0) # 116 lemma types "lost"
lost <- counts[counts$YF_non_prog<0,]
sum(lost$YF_non_prog) # 126 "lost" FVP tokens
counts$YF_non_prog[counts$YF_non_prog<0] <- 0
summary(counts$TxB_Nar_non_prog)
counts[counts$TxB_Nar_non_prog<0,c(1,4,7)]
sum(counts$TxB_Nar_non_prog<0) # 16 types
lost2 <- counts[counts$TxB_Nar_non_prog<0,]
sum(lost2$TxB_Nar_non_prog) # 20 tokens
counts$TxB_Nar_non_prog[counts$TxB_Nar_non_prog<0] <- 0
counts <- counts[,c("lemma","TxB_Nar_prog_count", "TxB_Nar_non_prog", "YF_prog_count", "YF_non_prog")]
head(counts); tail(counts)
sum(counts$TxB_Nar_prog_count) + sum(counts$TxB_Nar_non_prog) # 31563 total FVPs
sum(counts$YF_prog_count) + sum(counts$YF_non_prog) # 31470 total FVPs
(sum(counts$TxB_Nar_prog_count) + sum(counts$TxB_Nar_non_prog))- (sum(counts$YF_prog_count) + sum(counts$YF_non_prog)) # 93 difference in total FVPs
# This file can be split into two to be used for two separate DCA analyses à la 2b.csv
#saveRDS(counts, here("counts4DCA_2b_Nar.rds")) # Last saved on 28 August 2019
counts <- readRDS(file = here("counts4DCA_2b_Nar.rds"))
### Files for DCA analysis à la 2b.csv
# Textbook Narrative
TxBNarDCA <- counts[,c("lemma", "TxB_Nar_prog_count", "TxB_Nar_non_prog")]
head(TxBNarDCA); tail(TxBNarDCA)
nrow(TxBNarDCA)
TxBNarDCA <- subset(TxBNarDCA, TxB_Nar_prog_count > 0 | TxB_Nar_non_prog > 0) # Drop lemmas with zero occurrences in Textbook Fiction
sum(TxBNarDCA$TxB_Nar_prog_count) + sum(TxBNarDCA$TxB_Nar_non_prog)
write.table(TxBNarDCA, file = here("TxB_Nar_DCA.csv"), sep = "\t") # Last saved on 28 August 2019 III
# Youth Fiction sample
YF_DCA <- counts[,c("lemma", "YF_prog_count", "YF_non_prog")]
nrow(YF_DCA)
YF_DCA <- subset(YF_DCA, YF_non_prog > 0 | YF_prog_count > 0) # Drop lemmas with zero occurrences in the Youth Fiction sample
head(YF_DCA); tail(YF_DCA)
(sum(YF_DCA$YF_prog_count) + sum(YF_DCA$YF_non_prog)) - (sum(TxBNarDCA$TxB_Nar_prog_count) + sum(TxBNarDCA$TxB_Nar_non_prog)) # Difference in total number of FVPs = 93 more in TxB Narrative than in Youth Fiction
write.table(YF_DCA, file = here("YF_DCA.csv"), sep = "\t") # Last saved on 28 August 2019 II
#source("/Users/Elen/Documents/PhD/Statistics_R/Gries_Collexemes/coll.analysis_mpfr.r")
#coll.analysis()
# Textbook Fiction
counts <- read.csv(file = here("TxB_Nar_DCA.csv"), sep = "\t", stringsAsFactors = TRUE)
head(counts)
## lemma TxB_Nar_prog_count TxB_Nar_non_prog
## 1 abandon 0 1
## 2 accord 0 2
## 4 act 2 13
## 5 add 0 30
## 6 address 1 8
## 7 advance 0 1
DCA_TxB <- collex.dist(counts) # default association measure with this package is log-likelihood
head(DCA_TxB); tail(DCA_TxB)
## COLLEX O.CXN1 E.CXN1 O.CXN2 E.CXN2 ASSOC COLL.STR.LOGL SIGNIF
## 1 sit 62 11.6 180 230.4 TxB_Nar_prog_count 120.47601 *****
## 2 wear 37 4.1 49 81.9 TxB_Nar_prog_count 112.64951 *****
## 3 talk 50 8.2 120 161.8 TxB_Nar_prog_count 110.60440 *****
## 4 do 82 24.6 429 486.4 TxB_Nar_prog_count 92.25624 *****
## 5 wait 41 8.2 130 162.8 TxB_Nar_prog_count 74.07346 *****
## 6 stand 35 7.8 128 155.2 TxB_Nar_prog_count 56.02529 *****
## SHARED
## 1 Y
## 2 Y
## 3 Y
## 4 Y
## 5 Y
## 6 Y
## COLLEX O.CXN1 E.CXN1 O.CXN2 E.CXN2 ASSOC COLL.STR.LOGL SIGNIF
## 419 ask 3 24.3 503 481.7 TxB_Nar_non_prog 31.33177 *****
## 420 want 1 28.3 588 560.7 TxB_Nar_non_prog 49.76364 *****
## 421 see 4 38.4 794 759.6 TxB_Nar_non_prog 52.98648 *****
## 422 know 0 28.9 601 572.1 TxB_Nar_non_prog 59.79114 *****
## 423 say 24 92.8 1907 1838.2 TxB_Nar_non_prog 78.68266 *****
## 424 be 3 347.5 7227 6882.5 TxB_Nar_non_prog 775.34497 *****
## SHARED
## 419 Y
## 420 Y
## 421 Y
## 422 N
## 423 Y
## 424 Y
library(flextable)
DCA_TxB %>%
rename(lemma = COLLEX, G2 = COLL.STR.LOGL) %>%
mutate(G2 = ifelse(ASSOC=="TxB_Nar_non_prog", -G2, G2)) %>%
mutate(G2 = round(G2, 1)) %>%
#mutate(across(.cols = c(E.CXN1, O.CXN2, E.CXN2), round, 0)) %>%
mutate(`O:Eprog` = as.character(paste0(O.CXN1,":",E.CXN1))) %>% # CXN1 = progressive
mutate(`O:Enon-prog` = as.character(paste0(O.CXN2,":",E.CXN2))) %>% # CXN2 = non-progressive
select(lemma, `O:Eprog`, `O:Enon-prog`, G2) %>%
filter(abs(G2) > 30) -> resultsTxB
nrow(resultsTxB)
## [1] 15
resultsTxB
## lemma O:Eprog O:Enon-prog G2
## 1 sit 62:11.6 180:230.4 120.5
## 2 wear 37:4.1 49:81.9 112.6
## 3 talk 50:8.2 120:161.8 110.6
## 4 do 82:24.6 429:486.4 92.3
## 5 wait 41:8.2 130:162.8 74.1
## 6 stand 35:7.8 128:155.2 56.0
## 7 walk 43:13.7 243:272.3 43.4
## 8 shine 9:0.7 5:13.3 36.9
## 9 try 34:10.9 193:216.1 34.0
## 10 ask 3:24.3 503:481.7 -31.3
## 11 want 1:28.3 588:560.7 -49.8
## 12 see 4:38.4 794:759.6 -53.0
## 13 know 0:28.9 601:572.1 -59.8
## 14 say 24:92.8 1907:1838.2 -78.7
## 15 be 3:347.5 7227:6882.5 -775.3
flextable(resultsTxB) %>%
compose(part = "header", j = "O:Enon-prog", value = as_paragraph("O:E", as_sub("non-prog"))) %>%
compose(part = "header", j = "O:Eprog", value = as_paragraph("O:E", as_sub("prog"))) %>%
compose(part = "header", j = "G2", value = as_paragraph("G", as_sup("2"))) %>%
theme_booktabs() #%>%
#save_as_docx(path = here("DCA_TxB_Nar.docx"))
# Reference Fiction
YF_DCA <- read.csv(file = here("YF_DCA.csv"), sep = "\t", stringsAsFactors = TRUE)
YF_DCA <- collex.dist(YF_DCA) # Remember that the default AM is log-likelihood
head(YF_DCA); tail(YF_DCA)
## COLLEX O.CXN1 E.CXN1 O.CXN2 E.CXN2 ASSOC COLL.STR.LOGL SIGNIF SHARED
## 1 talk 35 5.8 85 114.2 YF_prog_count 76.38402 ***** Y
## 2 work 23 4.6 73 91.4 YF_prog_count 41.21932 ***** Y
## 3 joke 6 0.3 0 5.7 YF_prog_count 36.41022 ***** N
## 4 try 36 11.6 204 228.4 YF_prog_count 36.00128 ***** Y
## 5 stand 31 10.0 176 197.0 YF_prog_count 30.87469 ***** Y
## 6 plan 8 0.7 6 13.3 YF_prog_count 30.02547 ***** Y
## COLLEX O.CXN1 E.CXN1 O.CXN2 E.CXN2 ASSOC COLL.STR.LOGL SIGNIF SHARED
## 516 want 1 18.0 372 355.0 YF_non_prog 29.18298 ***** Y
## 517 see 7 33.1 679 652.9 YF_non_prog 31.90511 ***** Y
## 518 have 16 51.7 1056 1020.3 YF_non_prog 35.97288 ***** Y
## 519 know 1 32.7 678 646.3 YF_non_prog 58.72625 ***** Y
## 520 say 32 122.3 2506 2415.7 YF_non_prog 104.22061 ***** Y
## 521 be 15 334.3 6919 6599.7 YF_non_prog 644.29336 ***** Y
YF_DCA %>%
rename(lemma = COLLEX, G2 = COLL.STR.LOGL) %>%
mutate(G2 = ifelse(ASSOC=="YF_non_prog", -G2, G2)) %>%
mutate(G2 = round(G2, 1)) %>%
mutate(O.CXN2 = round(O.CXN2, 0)) %>%
mutate(`O:Eprog` = as.character(paste0(O.CXN1,":",E.CXN1))) %>% # CXN1 = progressive
mutate(`O:Enon-prog` = as.character(paste0(O.CXN2,":",E.CXN2))) %>% # CXN2 = non-progressive
select(lemma, `O:Eprog`, `O:Enon-prog`, G2) %>%
filter(abs(G2) > 23.5) -> resultsYF
nrow(resultsYF)
## [1] 15
resultsYF
## lemma O:Eprog O:Enon-prog G2
## 1 talk 35:5.8 85:114.2 76.4
## 2 work 23:4.6 73:91.4 41.2
## 3 joke 6:0.3 0:5.7 36.4
## 4 try 36:11.6 204:228.4 36.0
## 5 stand 31:10 176:197 30.9
## 6 plan 8:0.7 6:13.3 30.0
## 7 lie 19:4.6 77:91.4 27.5
## 8 sit 27:9.4 167:184.6 23.9
## 9 find 0:12.5 259:246.5 -25.7
## 10 want 1:18 372:355 -29.2
## 11 see 7:33.1 679:652.9 -31.9
## 12 have 16:51.7 1056:1020.3 -36.0
## 13 know 1:32.7 678:646.3 -58.7
## 14 say 32:122.3 2506:2415.7 -104.2
## 15 be 15:334.3 6919:6599.7 -644.3
flextable(resultsYF) %>%
compose(part = "header", j = "O:Enon-prog", value = as_paragraph("O:E", as_sub("non-prog"))) %>%
compose(part = "header", j = "O:Eprog", value = as_paragraph("O:E", as_sub("prog"))) %>%
compose(part = "header", j = "G2", value = as_paragraph("G", as_sup("2"))) %>%
theme_booktabs() #%>%
#save_as_docx(path = here("DCA_FY.docx"))
YF_collstrength <- read.csv(file = here("YF_DCA_collstrength.csv"), header = TRUE, sep=",")
glimpse(YF_collstrength)
## Rows: 521
## Columns: 9
## $ words <chr> "be", "say", "know", "have", "see", "want", …
## $ obs.freq.1 <int> 15, 32, 1, 16, 7, 1, 0, 0, 0, 3, 0, 0, 0, 4,…
## $ obs.freq.2 <int> 6919, 2506, 678, 1056, 679, 372, 259, 203, 1…
## $ exp.freq.1 <dbl> 334.25, 122.34, 32.73, 51.68, 33.07, 17.98, …
## $ exp.freq.2 <dbl> 6599.75, 2415.66, 646.27, 1020.32, 652.93, 3…
## $ pref.occur <chr> "X.YF_non_prog.", "X.YF_non_prog.", "X.YF_no…
## $ delta.p.constr1.cues.word <dbl> -0.22, -0.06, -0.02, -0.02, -0.02, -0.01, -0…
## $ delta.p.word.cues.constr1 <dbl> -0.06, -0.04, -0.05, -0.03, -0.04, -0.05, -0…
## $ coll.strength <dbl> -644.29, -104.22, -58.73, -35.97, -31.91, -2…
names(YF_collstrength)[1] <- "lemma"
names(YF_collstrength)[9] <- "coll.strength.YF"
subset(YF_collstrength, lemma=="know")
## lemma obs.freq.1 obs.freq.2 exp.freq.1 exp.freq.2 pref.occur
## 3 know 1 678 32.73 646.27 X.YF_non_prog.
## delta.p.constr1.cues.word delta.p.word.cues.constr1 coll.strength.YF
## 3 -0.02 -0.05 -58.73
YF_collstrength <- na.omit(YF_collstrength) # Exclude lemmas with no data
YF_collstrength <- YF_collstrength[,c(1,9)]
head(YF_collstrength); tail(YF_collstrength)
## lemma coll.strength.YF
## 1 be -644.29
## 2 say -104.22
## 3 know -58.73
## 4 have -35.97
## 5 see -31.91
## 6 want -29.18
## lemma coll.strength.YF
## 516 rub 0.01
## 517 spread 0.01
## 518 breathe 0.00
## 519 consider 0.00
## 520 rest 0.00
## 521 stretch 0.00
TxBNar_collstrength <- read.csv(file = here("TxB_Nar_DCA_collstrength.csv"), header = TRUE, sep = ",", stringsAsFactors = TRUE)
glimpse(TxBNar_collstrength)
## Rows: 424
## Columns: 9
## $ words <fct> be, say, know, see, want, ask, like, find, h…
## $ obs.freq.1 <int> 3, 24, 0, 4, 1, 3, 0, 2, 2, 1, 0, 1, 2, 28, …
## $ obs.freq.2 <int> 7227, 1907, 601, 794, 588, 503, 209, 294, 29…
## $ exp.freq.1 <dbl> 347.49, 92.81, 28.89, 38.35, 28.31, 24.32, 1…
## $ exp.freq.2 <dbl> 6882.51, 1838.19, 572.11, 759.65, 560.69, 48…
## $ pref.occur <fct> X.TxB_Nar_non_prog., X.TxB_Nar_non_prog., X.…
## $ delta.p.constr1.cues.word <dbl> -0.24, -0.05, -0.02, -0.02, -0.02, -0.01, -0…
## $ delta.p.word.cues.constr1 <dbl> -0.06, -0.04, -0.05, -0.04, -0.05, -0.04, -0…
## $ coll.strength <dbl> -775.34, -78.68, -59.79, -52.99, -49.76, -31…
names(TxBNar_collstrength)[1] <- "lemma"
names(TxBNar_collstrength)[9] <- "coll.strength.TxBNar"
subset(TxBNar_collstrength, lemma=="know")
## lemma obs.freq.1 obs.freq.2 exp.freq.1 exp.freq.2 pref.occur
## 3 know 0 601 28.89 572.11 X.TxB_Nar_non_prog.
## delta.p.constr1.cues.word delta.p.word.cues.constr1 coll.strength.TxBNar
## 3 -0.02 -0.05 -59.79
TxBNar_collstrength <- na.omit(TxBNar_collstrength) # Exclude lemmas with no data
TxBNar_collstrength <- TxBNar_collstrength[,c(1,9)]
head(TxBNar_collstrength); tail(TxBNar_collstrength)
## lemma coll.strength.TxBNar
## 1 be -775.34
## 2 say -78.68
## 3 know -59.79
## 4 see -52.99
## 5 want -49.76
## 6 ask -31.33
## lemma coll.strength.TxBNar
## 419 stay 0.05
## 420 send 0.03
## 421 head 0.02
## 422 sell 0.02
## 423 realise 0.00
## 424 tell 0.00
coll_strengths <- merge(YF_collstrength, TxBNar_collstrength, by = 'lemma', all = FALSE) # Merge the two tables, keeping only lemmas attested in both corpora, so that the coll.strength values can be compared
#saveRDS(coll_strengths, file = here("coll_strengths_Fiction.rds"))
coll_strengths %>%
ggplot(aes(x = coll.strength.TxBNar, y = coll.strength.YF)) +
labs(x = "Textbook Fiction", y = "Youth Fiction") +
geom_point(shape = "circle filled", fill = "grey") -> fig
fig
# We have one very extreme case: BE.
coll_strengths %>%
filter(coll.strength.YF < -500)
## lemma coll.strength.YF coll.strength.TxBNar
## 1 be -644.29 -775.34
# Let's exclude BE for visualisation purposes then.
coll_strengths %>%
filter(lemma != "be") ->
d_not_to_be
fig %+% d_not_to_be
d_not_to_be %>%
summarise(corr_TxB_YF = cor(coll.strength.TxBNar, coll.strength.YF))
## corr_TxB_YF
## 1 0.6807511
cor.test(coll_strengths$coll.strength.TxBNar, coll_strengths$coll.strength.YF, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: coll_strengths$coll.strength.TxBNar and coll_strengths$coll.strength.YF
## t = 74.304, df = 410, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9574529 0.9709268
## sample estimates:
## cor
## 0.9648178
d_not_to_be %>%
mutate(
TxBNar_scaled = as.vector(scale(coll.strength.TxBNar)),
YF_scaled = as.vector(scale(coll.strength.YF)),
dev = abs(YF_scaled - TxBNar_scaled)
) ->
d_not_to_be
# We can now identify verbs that are extreme by this measure of deviance.
d_not_to_be %>%
filter(dev > 1) %>%
select(lemma, coll.strength.TxBNar, coll.strength.YF, dev) %>%
arrange(-dev)
## lemma coll.strength.TxBNar coll.strength.YF dev
## 1 wear 112.65 18.31 5.944411
## 2 sit 120.48 23.92 5.920159
## 3 say -78.68 -104.22 5.068992
## 4 do 92.26 13.61 5.008025
## 5 walk 43.44 -0.03 3.006782
## 6 have -11.98 -35.97 2.798440
## 7 wait 74.07 23.18 2.784373
## 8 joke 15.03 36.41 2.634290
## 9 plan 7.48 30.03 2.512997
## 10 lie 5.48 27.46 2.392106
## 11 work 25.82 41.22 2.373084
## 12 hunt -0.20 22.16 2.250397
## 13 shine 36.93 6.67 1.880603
## 14 panic -0.59 18.20 1.877920
## 15 know -59.79 -58.73 1.787022
## 16 hold 1.50 18.23 1.736370
## 17 kid 3.40 18.20 1.601910
## 18 chase 5.71 18.37 1.459263
## 19 find -17.23 -25.70 1.399307
## 20 run 6.00 17.73 1.374644
## 21 look 21.71 1.73 1.326064
## 22 starve 3.40 15.00 1.279118
## 23 try 34.05 36.00 1.277215
## 24 mean -2.96 -14.66 1.272807
## 25 suffer 19.39 0.73 1.266449
## 26 stream 1.87 13.80 1.263909
## 27 rain 25.17 4.89 1.246653
## 28 accord -0.20 12.13 1.238645
## 29 travel 2.32 12.98 1.150065
## 30 visit 14.50 -1.19 1.121857
## 31 let -13.49 -20.13 1.096163
## 32 guard -0.10 10.18 1.035026
## 33 enjoy 15.46 0.61 1.006694
d_not_to_be %>%
filter(dev > 2.5) %>%
arrange(-dev) ->
d_extremes
d_extremes
## lemma coll.strength.YF coll.strength.TxBNar TxBNar_scaled YF_scaled
## 1 wear 18.31 112.65 7.6392496 1.6948387
## 2 sit 23.92 120.48 8.1808933 2.2607340
## 3 say -104.22 -78.68 -5.5960880 -10.6650801
## 4 do 13.61 92.26 6.2287623 1.2207375
## 5 walk -0.03 43.44 2.8516171 -0.1551646
## 6 have -35.97 -11.98 -0.9820860 -3.7805256
## 7 wait 23.18 74.07 4.9704610 2.1860883
## 8 joke 36.41 15.03 0.8863428 3.5206327
## 9 plan 30.03 7.48 0.3640682 2.8770655
## dev
## 1 5.944411
## 2 5.920159
## 3 5.068992
## 4 5.008025
## 5 3.006782
## 6 2.798440
## 7 2.784373
## 8 2.634290
## 9 2.512997
d_not_to_be %>%
ggplot(aes(x = TxBNar_scaled, y = YF_scaled, label = lemma)) +
coord_fixed() +
labs(x = "standardised CL (Textbook Fiction)", y = "standardised CL (Youth Fiction sample)") +
geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
geom_point(shape = "circle filled", colour = "darkred", fill = "darkred", alpha = 0.5) +
geom_text_repel(data = d_extremes, min.segment.length = 1, segment.alpha = 0.6, max.overlaps = Inf, box.padding = 0.2, direction="both", arrow = arrow(length = unit(0.015, "npc"))) +
#geom_label_repel(data = d_extremes, hjust = "inward") +
theme_light()
#ggsave(here("DCA_scaled_comparison_Fiction.svg"), dpi = 300, width = 22, height = 16, units = "cm")
#source(here("coll.analysis_mpfr.r"))
#coll.analysis()
# From https://rcompanion.org/rcompanion/b_04.html #
pchisq(3.84, df=1,lower.tail=FALSE) # Threshold of significance at 0.05 for LLR > 3.84 but this is NOT corrected for multiple testing!
qchisq(0.05, df=1, lower.tail=FALSE) # Work out LLR-based CL value based on p-value https://stats.stackexchange.com/questions/250819/r-calculate-p-value-given-chi-squared-and-degrees-of-freedom
qchisq(p.adjust(0.05, method = "holm", n = 20), df=1, lower.tail=FALSE)
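# Illustration (not part of the original analysis): an individual G2 value can
# be converted back into an (uncorrected) p-value in the same way, e.g. for
# the strongest textbook collexeme reported above:
pchisq(227.8, df = 1, lower.tail = FALSE)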
#packages.bib <- sapply(1:length(loadedNamespaces()), function(i) toBibtex(citation(loadedNamespaces()[i])))
knitr::write_bib(c(.packages(), "knitr"), "packages.bib")
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] flextable_0.6.4 collostructions_0.2.0 vcd_1.4-8
## [4] tidyr_1.1.4 RColorBrewer_1.1-2 lsr_0.5
## [7] lattice_0.20-41 gridExtra_2.3 ggrepel_0.9.1
## [10] ggplot2_3.3.5 here_1.0.1 FactoMineR_2.4
## [13] dplyr_1.0.7
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.7 zoo_1.8-9 assertthat_0.2.1
## [4] rprojroot_2.0.2 digest_0.6.29 lmtest_0.9-38
## [7] utf8_1.2.2 R6_2.5.1 evaluate_0.14
## [10] highr_0.9 pillar_1.6.4 gdtools_0.2.3
## [13] rlang_0.4.12 uuid_0.1-4 rstudioapi_0.13
## [16] data.table_1.14.2 jquerylib_0.1.4 DT_0.20
## [19] rmarkdown_2.11 labeling_0.4.2 stringr_1.4.0
## [22] htmlwidgets_1.5.4 munsell_0.5.0 compiler_4.0.3
## [25] xfun_0.29 systemfonts_1.0.1 pkgconfig_2.0.3
## [28] base64enc_0.1-3 htmltools_0.5.2 flashClust_1.01-2
## [31] tidyselect_1.1.1 tibble_3.1.6 fansi_0.5.0
## [34] crayon_1.4.2 withr_2.4.3 MASS_7.3-53.1
## [37] leaps_3.1 jsonlite_1.7.2 gtable_0.3.0
## [40] lifecycle_1.0.1 DBI_1.1.1 magrittr_2.0.1
## [43] scales_1.1.1 zip_2.1.1 cli_3.1.0
## [46] stringi_1.7.6 farver_2.1.0 scatterplot3d_0.3-41
## [49] xml2_1.3.3 bslib_0.3.1 ellipsis_0.3.2
## [52] generics_0.1.1 vctrs_0.3.8 tools_4.0.3
## [55] glue_1.6.0 officer_0.3.17 purrr_0.3.4
## [58] fastmap_1.1.0 yaml_2.2.1 colorspace_2.0-2
## [61] cluster_2.1.1 knitr_1.37 sass_0.4.0