This script is part of the Online Appendix to my PhD thesis.

Please cite as: Le Foll, Elen. 2022. Textbook English: A Corpus-Based Analysis of the Language of EFL textbooks used in Secondary Schools in France, Germany and Spain. PhD thesis. Osnabrück University.

For more information, see: https://elenlefoll.github.io/TextbookEnglish/

Please note that the plot dimensions in this notebook have been optimised for the print version of the thesis.

Set-up

Built with R 4.0.3

knitr::opts_chunk$set(echo = TRUE)

library(dplyr)
library(FactoMineR)
library(here)
library(ggplot2)
library(ggrepel)
library(gridExtra)
library(lattice)
library(lsr)
library(RColorBrewer)
library(tidyr)
library(vcd)

Data Preparation

annotation_file <- read.csv(here("BNCspoken_prog_conc_anno_collanalysis.csv"), sep = "\t", header = TRUE, na.strings="", stringsAsFactors = TRUE)
glimpse(annotation_file)
## Rows: 4,641
## Columns: 18
## $ SE_Filename    <fct> filename#0, filename#0, filename#0, filename#2, filenam…
## $ BNC_Filename   <fct> SAAB, SAAB, SAAB, SAA3, SAA3, SAA3, SAA3, SABT, SAB7, S…
## $ Rand.          <fct> done, done, done, done, done, done, done, done, done, d…
## $ Concordance1   <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Concor.dance2  <fct> be inviting, were having, 'm not going, 's not going, i…
## $ Concordance3   <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Lemma          <fct> invite, have, NA, NA, NA, NA, play, come, say, NA, NA, …
## $ X.             <fct> NA, NA, NA, NA, NA, NA, c, NA, NA, NA, NA, NA, NA, c, N…
## $ NOT            <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Tense          <fct> modal, past, GOING TO, GOING TO, GOING TO, GOING TO, pr…
## $ Voice          <fct> A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A…
## $ X..1           <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Time.reference <fct> future, future, present, present, present, present, pre…
## $ Continuous     <fct> no, yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, y…
## $ Repeated       <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no,…
## $ Extra.function <fct> NA, other interesting function, NA, NA, NA, NA, NA, NA,…
## $ Comment        <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Annotator      <fct> E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E…
# Select only annotated concordance lines
annotated <- annotation_file[1:4641,]

# Extract only progressives
BNCprog <- annotated[annotated$Tense != "NOT progressive" & annotated$Tense != "GOING TO" & annotated$Tense != "catenative", ]

BNCprog$Tense <- droplevels(BNCprog)$Tense # Drop unused levels

# Number of progressives extracted
nrow(BNCprog)
## [1] 3444
# Ratio of genuine progressives in progressive query
nrow(BNCprog)/4641
## [1] 0.7420814
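
As a minimal sketch, an exact binomial 95% confidence interval can be placed around this precision estimate:

# Sketch: exact binomial confidence interval for the proportion of genuine progressives among the query hits
binom.test(nrow(BNCprog), nrow(annotated))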

Covarying Collexeme Analysis (CovCA): Verb + Variety

# File to be used for Covarying Collexeme Analysis #
prog_lemmas <- read.table(here("prog_lemmas.csv"), sep = "\t", stringsAsFactors = TRUE)

say <- prog_lemmas[prog_lemmas$Lemma=="say",]
summary(say$Corpus)
## Spoken BNC 2014 sample  Textbook Conversation 
##                    153                     29
#source(here("coll.analysis_mpfr.r"))
#coll.analysis()
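
The CovCA itself is carried out with the coll.analysis() script (see the commented-out calls above). Purely as an illustration, here is a minimal sketch of the kind of 2 x 2 table that underlies the analysis for a single verb + variety pair, here SAY, together with a Fisher-Yates exact test; coll.analysis() computes such tables for every lemma and derives association measures from them (ctab_say is an illustrative name only).

# Sketch only: 2 x 2 table for SAY vs. all other lemmas across the two varieties
ctab_say <- table(prog_lemmas$Lemma == "say", prog_lemmas$Corpus)
rownames(ctab_say) <- c("other lemmas", "say")
ctab_say
fisher.test(ctab_say)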

Annotation process

## Preparing list of long lemmas ##

SpokenProg_lemmas <- read.table(file = here("prog_lemmas.csv"), sep = "\t", stringsAsFactors = TRUE)
FictionProg_lemmas <- read.table(file = here("FictionProg_lemmas.csv"), sep = "\t", stringsAsFactors = TRUE)

prog_lemmas <- rbind(SpokenProg_lemmas, FictionProg_lemmas)
#write.table(prog_lemmas, file = here("CovCA.csv"), sep = "\t")
prog_lemmas <- read.table(file = here("CovCA.csv"), sep = "\t", stringsAsFactors = TRUE)

## Coding for semantic domains ##
prog_semantic_domains <- as.data.frame(sort(unique(prog_lemmas$Lemma)))
names(prog_semantic_domains)[1] <- "lemma"
head(prog_semantic_domains)
##     lemma
## 1 abandon
## 2  accept
## 3  accord
## 4    ache
## 5     act
## 6 act out
semantic_Biber <- read.csv(file = here("semantic_domains_Biber2006_p246.csv"), sep = "\t", stringsAsFactors = TRUE) # This is the table I made using the verbs listed in Biber (2006: 246-247)
semantic_Edwards <- read.csv(here("semantic_domains_Ewards.csv"), sep = "\t", stringsAsFactors = TRUE)
names(semantic_Biber)[1] <- "lemma"
names(semantic_Biber)[2] <- "semantic_domain_Biber"
head(semantic_Biber)
##         lemma semantic_domain_Biber
## 1   accompany              activity
## 2      accuse         communication
## 3 acknowledge         communication
## 4     acquire              activity
## 5         add              activity
## 6     address         communication
names(semantic_Edwards)[1] <- "lemma"
names(semantic_Edwards)[2] <- "semantic_domain_Edwards"
head(semantic_Edwards)
##       lemma semantic_domain_Edwards Edwards
## 1    accept                activity Edwards
## 2 accompany                activity Edwards
## 3   acquire                activity Edwards
## 4       add                activity Edwards
## 5   advance                activity Edwards
## 6     apply                activity Edwards
semantic_domains <- merge(semantic_Biber, semantic_Edwards, by = "lemma", all = TRUE)
head(semantic_domains)
##         lemma semantic_domain_Biber semantic_domain_Edwards Edwards
## 1   accompany              activity                activity Edwards
## 2      accuse         communication           communication Edwards
## 3 acknowledge         communication           communication Edwards
## 4     acquire              activity                activity Edwards
## 5         add              activity                activity Edwards
## 6     address         communication                  mental Edwards
# Flag whether the Biber and Edwards lists assign the same semantic domain to a lemma
semantic_domains$agreement <- ifelse(
  semantic_domains$semantic_domain_Biber == semantic_domains$semantic_domain_Edwards,
  TRUE, FALSE)
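
As a quick sanity check (a minimal sketch), the agreement rate between the two lists can be summarised directly; NAs correspond to lemmas that appear in only one of the two lists:

summary(semantic_domains$agreement)
mean(semantic_domains$agreement, na.rm = TRUE) # Proportion of shared lemmas assigned the same domain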

#write.table(semantic_domains, file = here("semantic_domains1.csv"), sep = "\t")

# This table has been manually edited so that all disagreements between the two lists have been removed
semantic_domains <- read.csv(here("4_SemanticDomains_VerbLemmas1.csv"), sep = "\t", header = T, stringsAsFactors = TRUE)
names(semantic_domains)[1] <- "Lemma"
names(semantic_domains)[2] <- "Semantic domain"

prog_semantics <- merge(prog_lemmas, semantic_domains, by = "Lemma")
head(prog_semantics, 30); tail(prog_semantics, 30)
##      Lemma                 Corpus        Semantic domain
## 1  abandon  Youth Fiction Sampled               activity
## 2   accept Spoken BNC 2014 sample                 mental
## 3   accept Spoken BNC 2014 sample                 mental
## 4   accord  Youth Fiction Sampled                 mental
## 5   accord  Youth Fiction Sampled                 mental
## 6     ache  Youth Fiction Sampled               activity
## 7     ache Spoken BNC 2014 sample               activity
## 8      act       Textbook Fiction               activity
## 9      act  Textbook Conversation               activity
## 10     act  Youth Fiction Sampled               activity
## 11     act  Textbook Conversation               activity
## 12     act  Youth Fiction Sampled               activity
## 13     act       Textbook Fiction               activity
## 14 act out  Youth Fiction Sampled               activity
## 15  action Spoken BNC 2014 sample             occurrence
## 16     add  Youth Fiction Sampled               activity
## 17     add Spoken BNC 2014 sample               activity
## 18     add  Textbook Conversation               activity
## 19     add Spoken BNC 2014 sample               activity
## 20 address       Textbook Fiction          unclear_other
## 21 address  Youth Fiction Sampled          unclear_other
## 22 advance  Youth Fiction Sampled               activity
## 23 advance  Youth Fiction Sampled               activity
## 24  advise  Textbook Conversation          communication
## 25   agree Spoken BNC 2014 sample                 mental
## 26     aim Spoken BNC 2014 sample                 mental
## 27     aim Spoken BNC 2014 sample                 mental
## 28 aim for  Youth Fiction Sampled                 mental
## 29   allow Spoken BNC 2014 sample facilitation_causation
## 30   amuse  Youth Fiction Sampled               activity
##           Lemma                 Corpus Semantic domain
## 7809      write       Textbook Fiction   communication
## 7810      write Spoken BNC 2014 sample   communication
## 7811      write  Textbook Conversation   communication
## 7812      write  Textbook Conversation   communication
## 7813      write  Textbook Conversation   communication
## 7814      write       Textbook Fiction   communication
## 7815      write       Textbook Fiction   communication
## 7816      write  Textbook Conversation   communication
## 7817      write Spoken BNC 2014 sample   communication
## 7818      write  Youth Fiction Sampled   communication
## 7819      write  Textbook Conversation   communication
## 7820      write  Textbook Conversation   communication
## 7821      write       Textbook Fiction   communication
## 7822      write       Textbook Fiction   communication
## 7823      write  Textbook Conversation   communication
## 7824      write  Textbook Conversation   communication
## 7825      write  Textbook Conversation   communication
## 7826      write  Textbook Conversation   communication
## 7827      write  Textbook Conversation   communication
## 7828      write       Textbook Fiction   communication
## 7829      write       Textbook Fiction   communication
## 7830      write  Textbook Conversation   communication
## 7831      write  Textbook Conversation   communication
## 7832      write  Youth Fiction Sampled   communication
## 7833      write Spoken BNC 2014 sample   communication
## 7834      write  Textbook Conversation   communication
## 7835 write down Spoken BNC 2014 sample   communication
## 7836  write out  Youth Fiction Sampled   communication
## 7837       yawn Spoken BNC 2014 sample        activity
## 7838       yell  Youth Fiction Sampled   communication
#write.table(prog_semantics, file = here("4_prog_semantics_full.csv")) 

Semantic domain analysis

prog_semantics <- read.table(file = here("4_prog_semantics_full.csv"), stringsAsFactors = TRUE)

## Remove unclear cases, which include several light verbs (TAKE, HAVE) and other highly polysemous verbs - WARNING: This means I no longer have the same number of progressives from each corpus! ##
prog_semantics <- prog_semantics[prog_semantics$Semantic.domain != "unclear_other", ]
prog_semantics$Semantic.domain <- droplevels(prog_semantics)$Semantic.domain # Drop unused level
summary(prog_semantics$Semantic.domain)
##               activity              aspectual          communication 
##                   3146                    105                    986 
## existence_relationship facilitation_causation                 mental 
##                    237                     50                    783 
##             occurrence 
##                    439
## Plotting the semantic categories ##
semantics_corpus <- as.data.frame(prop.table(table(prog_semantics[,2:3]), 1)*100); semantics_corpus # As percentages for each corpus under study
##                    Corpus        Semantic.domain       Freq
## 1  Spoken BNC 2014 sample               activity 51.8115942
## 2   Textbook Conversation               activity 58.2800697
## 3        Textbook Fiction               activity 57.5250836
## 4   Youth Fiction Sampled               activity 50.8951407
## 5  Spoken BNC 2014 sample              aspectual  1.4492754
## 6   Textbook Conversation              aspectual  1.4526438
## 7        Textbook Fiction              aspectual  2.0903010
## 8   Youth Fiction Sampled              aspectual  2.6427962
## 9  Spoken BNC 2014 sample          communication 22.4033816
## 10  Textbook Conversation          communication 15.8628704
## 11       Textbook Fiction          communication 14.6321070
## 12  Youth Fiction Sampled          communication 14.2369991
## 13 Spoken BNC 2014 sample existence_relationship  2.7777778
## 14  Textbook Conversation existence_relationship  4.2998257
## 15       Textbook Fiction existence_relationship  5.4347826
## 16  Youth Fiction Sampled existence_relationship  4.4330776
## 17 Spoken BNC 2014 sample facilitation_causation  0.7850242
## 18  Textbook Conversation facilitation_causation  1.1040093
## 19       Textbook Fiction facilitation_causation  0.5852843
## 20  Youth Fiction Sampled facilitation_causation  0.9377664
## 21 Spoken BNC 2014 sample                 mental 14.6739130
## 22  Textbook Conversation                 mental 12.3184195
## 23       Textbook Fiction                 mental 12.3745819
## 24  Youth Fiction Sampled                 mental 15.3452685
## 25 Spoken BNC 2014 sample             occurrence  6.0990338
## 26  Textbook Conversation             occurrence  6.6821615
## 27       Textbook Fiction             occurrence  7.3578595
## 28  Youth Fiction Sampled             occurrence 11.5089514
colors <- RColorBrewer::brewer.pal(9, "OrRd")

levels(semantics_corpus$Corpus)[1] <- "Spoken BNC2014"
levels(semantics_corpus$Corpus)[4] <- "Youth Fiction"

#tiff(here("semantic-domains.tiff"), height = 15, width = 23, units="cm", compression = "lzw", res = 300)
ggplot(semantics_corpus, aes(x = Semantic.domain, y = Freq, fill = Corpus)) + geom_bar(stat = 'identity', position = 'dodge') + scale_fill_manual(values = c("#BD241E", "#EA7E1E", "#A18A33", "#267226")) + ylab("% of progressives in each corpus") + xlab("Semantic domains") + labs(fill = "Corpora") + scale_x_discrete(labels= c("Activity", "Aspectual", "Communication", "Existence", "Causation", "Mental/state", "Occurrence")) + 
  theme(legend.position = c(0.8, 0.8), legend.background = element_rect(fill="gray90", size=.5, linetype="solid"), 
        text = element_text(size = 16), 
        panel.background = element_rect(fill = 'white', colour = 'darkgrey'), panel.grid.major = element_line(color = 'grey'), panel.grid.minor = element_line(color = 'white'))

dev.off()
## null device 
##           1

Covarying Collexeme Analysis (CovCA): Semantic domain + Variety

## Preparing the tables for coll.analyses ##
Spoken_prog_semantics <- prog_semantics[prog_semantics$Corpus %in% c("Spoken BNC 2014 sample", "Textbook Conversation"),c(3:2)] # Note that the level name has to match "Spoken BNC 2014 sample" exactly, otherwise the BNC rows are silently dropped
summary(Spoken_prog_semantics)
##                Semantic.domain                    Corpus    
##  activity              :1861   Spoken BNC 2014 sample:1656  
##  aspectual             :  49   Textbook Conversation :1721  
##  communication         : 644   Textbook Fiction      :   0  
##  existence_relationship: 120   Youth Fiction Sampled :   0  
##  facilitation_causation:  32                                
##  mental                : 455                                
##  occurrence            : 216
#write.table(Spoken_prog_semantics, file = here("Spoken_prog_semantics.csv"), sep = "\t", row.names = F) # File for CovCA 

Fiction_prog_semantics <- prog_semantics[prog_semantics$Corpus %in% c("Youth Fiction Sampled", "Textbook Fiction"),c(2:3)]
summary(Fiction_prog_semantics)
##                     Corpus                   Semantic.domain
##  Spoken BNC 2014 sample:   0   activity              :1285  
##  Textbook Conversation :   0   aspectual             :  56  
##  Textbook Fiction      :1196   communication         : 342  
##  Youth Fiction Sampled :1173   existence_relationship: 117  
##                                facilitation_causation:  18  
##                                mental                : 328  
##                                occurrence            : 223
#write.table(Fiction_prog_semantics, file = here("Fiction_prog_semantics.csv"), sep = "\t", row.names = F) # File for CovCA 

# Coll.analysis #
#source(here("coll.analysis_mpfr.r"))
#coll.analysis()
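
As with the verb lemmas, here is a minimal sketch of one of the 2 x 2 tables underlying this CovCA, contrasting communication verbs with all other semantic domains across the two conversation corpora (conv and ctab_comm are illustrative names only):

# Sketch only: one semantic domain vs. the rest across the two conversation corpora
conv <- droplevels(prog_semantics[prog_semantics$Corpus %in% c("Spoken BNC 2014 sample", "Textbook Conversation"), ])
ctab_comm <- table(conv$Semantic.domain == "communication", conv$Corpus)
rownames(ctab_comm) <- c("other domains", "communication")
ctab_comm
fisher.test(ctab_comm)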

Communication verbs analysis

## Communication verbs

tibble::as_tibble(BNCprog[BNCprog$Lemma=="say" & BNCprog$Time.reference=="past",c(2,4:6)])
## # A tibble: 124 × 4
##    BNC_Filename Concordance1 Concor.dance2   Concordance3
##    <fct>        <lgl>        <fct>           <lgl>       
##  1 SAR5         NA           was saying      NA          
##  2 SAR5         NA           were saying     NA          
##  3 SAUJ         NA           was saying      NA          
##  4 SAVW         NA           was saying      NA          
##  5 SBKN         NA           were saying     NA          
##  6 SBM6         NA           was just saying NA          
##  7 SBTC         NA           was saying      NA          
##  8 SBYQ         NA           was saying      NA          
##  9 SBYQ         NA           was saying um   NA          
## 10 SBZ7         NA           was saying      NA          
## # … with 114 more rows
prog <- readRDS(file = here("prog.rds"))
say_tell <- prog[prog$Lemma=="say" | prog$Lemma=="tell",]

#tiff(here("tell_say_tense.tiff"), height = 14, width = 16, units="cm", compression = "lzw", res = 300)
par(cex = 1.7, cex.main = 1)
vcd::mosaic(Tense ~ Corpus, say_tell, rot_labels = c(0, 45, 0, 90), cex=2.5, zero_size = 0, labeling_args = list(rep = TRUE), main = "Tense distribution of SAY and TELL progressives") # Best so far

dev.off()
## null device 
##           1
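
As a numeric companion to the mosaic plot (a minimal sketch), the underlying cross-tabulation and its effect size can be inspected with lsr::cramersV(), which is available from the set-up chunk:

say_tell_tab <- table(droplevels(say_tell)$Corpus, droplevels(say_tell)$Tense)
say_tell_tab
lsr::cramersV(say_tell_tab)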

Correspondence Analysis

Preparing data for CA

prog_semantics <- read.table(file = here("4_prog_semantics_full.csv"), stringsAsFactors = TRUE)
glimpse(prog_semantics)
## Rows: 7,838
## Columns: 3
## $ Lemma           <fct> abandon, accept, accept, accord, accord, ache, ache, a…
## $ Corpus          <fct> Youth Fiction Sampled, Spoken BNC 2014 sample, Spoken …
## $ Semantic.domain <fct> activity, mental, mental, mental, mental, activity, ac…
head(prog_semantics)
##     Lemma                 Corpus Semantic.domain
## 1 abandon  Youth Fiction Sampled        activity
## 2  accept Spoken BNC 2014 sample          mental
## 3  accept Spoken BNC 2014 sample          mental
## 4  accord  Youth Fiction Sampled          mental
## 5  accord  Youth Fiction Sampled          mental
## 6    ache  Youth Fiction Sampled        activity
## Remove unclear cases, which include several light verbs (TAKE, HAVE) and other highly polysemous verbs - WARNING: This means I no longer have the same number of progressives from each corpus! ##
prog_semantics <- prog_semantics[prog_semantics$Semantic.domain != "unclear_other", ]
prog_semantics$Semantic.domain <- droplevels(prog_semantics)$Semantic.domain # Drop unused level
summary(prog_semantics$Semantic.domain)
##               activity              aspectual          communication 
##                   3146                    105                    986 
## existence_relationship facilitation_causation                 mental 
##                    237                     50                    783 
##             occurrence 
##                    439
## Remove lemmas ##
CA_data <- prog_semantics[,2:3]
levels(CA_data$Semantic.domain)
## [1] "activity"               "aspectual"              "communication"         
## [4] "existence_relationship" "facilitation_causation" "mental"                
## [7] "occurrence"
levels(CA_data$Semantic.domain) <- c("Activity", "Aspectual", "Communication", "Existence", "Causation", "Mental/state", "Occurrence")
CA_data <- table(CA_data)

Performing CA

# Following Desagulier's (2017) method
ca.object <- CA(CA_data)

#library("factoextra")
#fviz_ca_biplot(ca.object) # Do not see the immediate advantage of this additional package... The output appears to be exactly the same as above!

V <- sqrt(as.vector(chisq.test(CA_data)$statistic)/(sum(CA_data)*((min(ncol(CA_data), nrow(CA_data)))-1))); V # Cramer's V
## [1] 0.07989554
chisq <- chisq.test(CA_data)
inertia <- as.vector(chisq$statistic)/sum(chisq$observed); inertia
## [1] 0.01914989
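As a cross-check (a minimal sketch), lsr::cramersV() from the package loaded in the set-up should reproduce the manually computed value; note that total inertia and Cramer's V are linked by inertia = V^2 * (min(rows, columns) - 1).

lsr::cramersV(CA_data) # Should match V above
V^2 * (min(dim(CA_data)) - 1) # Should match the inertia above
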
## CA with supplementary variable ##

CA_data_sup <- t(CA_data)
x <- cbind(CA_data_sup, apply(CA_data_sup[,1:2], 1, sum))
colnames(x)[5] <- "Conversation"

x <- cbind(x, apply(CA_data_sup[,3:4], 1, sum))
colnames(x)[6] <- "Fiction"

x <- cbind(x, apply(CA_data_sup[,c(2:3)], 1, sum))
colnames(x)[7] <- "Textbook English"

x <- cbind(x, apply(CA_data_sup[,c(1,4)], 1, sum))
colnames(x)[8] <- "ENL Reference"

CA_data_sup = x ; CA_data_sup
##               Spoken BNC 2014 sample Textbook Conversation Textbook Fiction
## Activity                         858                  1003              688
## Aspectual                         24                    25               25
## Communication                    371                   273              175
## Existence                         46                    74               65
## Causation                         13                    19                7
## Mental/state                     243                   212              148
## Occurrence                       101                   115               88
##               Youth Fiction Sampled Conversation Fiction Textbook English
## Activity                        597         1861    1285             1691
## Aspectual                        31           49      56               50
## Communication                   167          644     342              448
## Existence                        52          120     117              139
## Causation                        11           32      18               26
## Mental/state                    180          455     328              360
## Occurrence                      135          216     223              203
##               ENL Reference
## Activity               1455
## Aspectual                55
## Communication           538
## Existence                98
## Causation                24
## Mental/state            423
## Occurrence              236
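
The same supplementary columns could also be built in a single step with rowSums(); a minimal equivalent sketch (CA_data_sup2 is an illustrative name only):

CA_data_sup2 <- cbind(t(CA_data),
                      Conversation       = rowSums(t(CA_data)[, 1:2]),
                      Fiction            = rowSums(t(CA_data)[, 3:4]),
                      `Textbook English` = rowSums(t(CA_data)[, 2:3]),
                      `ENL Reference`    = rowSums(t(CA_data)[, c(1, 4)]))
all(CA_data_sup2 == CA_data_sup) # Should be TRUE
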
ca.object.sup <- CA(CA_data_sup, col.sup = 5:8); ca.object.sup

## **Results of the Correspondence Analysis (CA)**
## The row variable has  7  categories; the column variable has 4 categories
## The chi square of independence between the two variables is equal to 110.0353 (p-value =  3.100161e-15 ).
## *The results are available in the following objects:
## 
##    name              description                       
## 1  "$eig"            "eigenvalues"                     
## 2  "$col"            "results for the columns"         
## 3  "$col$coord"      "coord. for the columns"          
## 4  "$col$cos2"       "cos2 for the columns"            
## 5  "$col$contrib"    "contributions of the columns"    
## 6  "$row"            "results for the rows"            
## 7  "$row$coord"      "coord. for the rows"             
## 8  "$row$cos2"       "cos2 for the rows"               
## 9  "$row$contrib"    "contributions of the rows"       
## 10 "$col.sup$coord"  "coord. for supplementary columns"
## 11 "$col.sup$cos2"   "cos2 for supplementary columns"  
## 12 "$call"           "summary called parameters"       
## 13 "$call$marge.col" "weights of the columns"          
## 14 "$call$marge.row" "weights of the rows"
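
For instance (a minimal sketch), the eigenvalues, the percentage of inertia captured by each dimension and the coordinates of the supplementary columns can be inspected directly:

ca.object.sup$eig # Eigenvalues and % of inertia per dimension
ca.object.sup$col.sup$coord # Coordinates of the supplementary columns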

Distinctive Collexeme Analysis (DCA)

Conversation DCAs

prog <- readRDS(file = here("prog_collanalysis.rds")) # Full, annotated prog concordance lines from Textbook Conversation + BNC Spoken sample

glimpse(prog)
## Rows: 5,867
## Columns: 22
## $ Corpus         <fct> Spoken BNC 2014 sample, Spoken BNC 2014 sample, Spoken …
## $ SE_Filename    <fct> filename#0, filename#0, filename#2, filename#3, filenam…
## $ BNC_Filename   <fct> SAAB, SAAB, SAA3, SABT, SAB7, SAFZ, SAG4, SAG4, SAG4, S…
## $ Rand.          <fct> done, done, done, done, done, done, done, done, done, d…
## $ Concordance1   <chr> "going to make a comment on the sleeping arrangements y…
## $ Concor.dance2  <chr> "be inviting", "were having", "'re playing", "are comin…
## $ Concordance3   <chr> "him to come and stay at their house stay at their hous…
## $ Lemma          <fct> invite, have, play, come, say, say, earn, earn, laugh, …
## $ Contraction    <fct> full, full, contract., full, full, contract., contract.…
## $ Negation       <fct> positive, positive, positive, positive, positive, posit…
## $ Tense          <fct> modal, past, present, present, present, present, presen…
## $ Voice          <fct> A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A…
## $ Question       <fct> statement, statement, statement, statement, statement, …
## $ Time.reference <chr> "future", "future", "present", "past/present", "present…
## $ Continuous     <fct> no, yes, yes, yes, yes, yes, yes, yes, no, yes, yes, ye…
## $ Repeated       <fct> no, no, no, no, no, no, yes, yes, no, no, no, no, no, y…
## $ Extra.function <chr> NA, "other interesting function", NA, NA, NA, "emphasis…
## $ Comment        <chr> NA, NA, NA, NA, NA, NA, NA, NA, "like ", NA, NA, NA, NA…
## $ Annotator      <chr> "E", "E", "E", "E", "E", "E", "E", "E", "E", "E", "E", …
## $ Register       <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Level          <fct> Spoken BNC 2014 sample, Spoken BNC 2014 sample, Spoken …
## $ Textbook       <chr> "Spoken BNC 2014 sample", "Spoken BNC 2014 sample", "Sp…
summary(prog$Corpus)
## Spoken BNC 2014 sample  Textbook Conversation 
##                   3444                   2423
## Prepare prog verb counts ##
prog_lemmas <- prog[,c("Lemma", "Corpus")]
prog_lemmas$Lemma <- stringr::word(prog_lemmas$Lemma,1) # Strip particles from the lemmas. For now, I will be ignoring phrasal verbs and only looking at the verb lemmas (it's too hard to extract phrasal verbs in FVPs!)
head(prog_lemmas);tail(prog_lemmas) # Check
##    Lemma                 Corpus
## 1 invite Spoken BNC 2014 sample
## 2   have Spoken BNC 2014 sample
## 3   play Spoken BNC 2014 sample
## 4   come Spoken BNC 2014 sample
## 5    say Spoken BNC 2014 sample
## 6    say Spoken BNC 2014 sample
##      Lemma                Corpus
## 5862 think Textbook Conversation
## 5863  pull Textbook Conversation
## 5864  lose Textbook Conversation
## 5865  talk Textbook Conversation
## 5866  wait Textbook Conversation
## 5867  look Textbook Conversation
levels(prog_lemmas$Corpus) <- c("SpokenBNC_Prog", "TxBSpoken_Prog") # Rename factor levels 
prog_lemmas <- prog_lemmas[,c(2,1)]
names(prog_lemmas)[1] <- "corpus"
names(prog_lemmas)[2] <- "lemma"
head(prog_lemmas);tail(prog_lemmas)
##           corpus  lemma
## 1 SpokenBNC_Prog invite
## 2 SpokenBNC_Prog   have
## 3 SpokenBNC_Prog   play
## 4 SpokenBNC_Prog   come
## 5 SpokenBNC_Prog    say
## 6 SpokenBNC_Prog    say
##              corpus lemma
## 5862 TxBSpoken_Prog think
## 5863 TxBSpoken_Prog  pull
## 5864 TxBSpoken_Prog  lose
## 5865 TxBSpoken_Prog  talk
## 5866 TxBSpoken_Prog  wait
## 5867 TxBSpoken_Prog  look
summary(factor(prog_lemmas$lemma))
##          do          go         say        talk        look         try 
##         539         530         243         222         218         182 
##         get       think        come        have        work        wear 
##         170         167         164         153         106          89 
##        make        play        take          be        wait        tell 
##          87          86          78          72          72          61 
##         sit       watch        feel        live      happen         use 
##          58          57          51          51          49          49 
##         run        walk      listen        stay         eat       start 
##          44          44          40          39          38          37 
##        read         pay         put        move         buy         ask 
##          36          35          35          32          31          29 
##        plan       stand       leave       enjoy        joke       write 
##          29          29          28          26          26          26 
##        give       drink       study       learn       drive        hold 
##          24          23          23          22          21          21 
##        hope         kid         lie        rain        call         fly 
##          21          21          21          20          19          19 
##         see        help       speak      travel        meet      record 
##          19          17          17          17          16          16 
##       sleep        cook       teach         die        pick      wonder 
##          16          15          15          14          14          14 
##      change        fall      follow        keep       laugh        show 
##          12          12          12          12          12          12 
##       smile        push        sell       build        chat         cry 
##          12          11          11          10          10          10 
##        hang        miss        sing    struggle        swim       visit 
##          10          10          10          10          10          10 
##      become        grow        lose        pull        send       bring 
##           9           9           9           9           9           8 
##       dance        earn       fight        hear         let     prepare 
##           8           8           8           8           8           8 
##       spend        turn concentrate     (Other) 
##           8           8           7         944
## Creating lemma list used to filter the FVPs for the comparative DCAs ##
lemmas_with_phrasal_verbs <- sort(unique(prog$Lemma)); lemmas_with_phrasal_verbs # For now, I will be ignoring phrasal verbs and only looking at the verb lemmas
##   [1] accept       ache         act          action       add         
##   [6] advise       agree        aim          allow        announce    
##  [11] annoy        answer       apologise    apply        appoint     
##  [16] approach     argue        arrange      arrive       ask         
##  [21] asphalt      assume       attack       attract      babysit     
##  [26] bake         balloon      bang         bar          bark        
##  [31] base         bask in      battle       be           beat        
##  [36] become       beef up      begin        behave       believe     
##  [41] bet          blame        blink        block        bloom       
##  [46] blow         board        boil         bomb         book        
##  [51] bore         borrow       bother       bottle       bounce up   
##  [56] bow          brag         break        break down   break up    
##  [61] breathe      bring        browse       build        burgeon     
##  [66] burn         burst        buy          call         call for    
##  [71] camp         carry        cat-sit      catch        cause       
##  [76] celebrate    change       charge       chase        chat        
##  [81] cheat        check        cheer        chew         chip in     
##  [86] choke        choose       chop         clamber      clap        
##  [91] clarify      clean        clear        clear out    clear up    
##  [96] climb        close        collaborate  collect      come        
## [101] come across  come back    come down    come in      come off    
## [106] come on      come out     come over    come up      comment     
## [111] commit       communicate  compete      complain     compose     
## [116] concentrate  confuse      consider     consume      contact     
## [121] contemplate  continue     contradict   control      cook        
## [126] cope         copy         cost         cosy up      cough       
## [131] count        crack up     crash        create       cringe      
## [136] criticise    cross        crowdfund    crumble      cry         
## [141] cut          cut down     cycle        dance        date        
## [146] deal         decide       decline      decorate     delete      
## [151] deliver      demand       deny         describe     desert      
## [156] design       develop      die          die out      dig         
## [161] dig in       digress      disappear    discover     discuss     
## [166] disobey      dissect      disturb      diversify    do          
## [171] doodle       download     drag along   draw         dread       
## [176] dream        dress        dress up     drink        drip        
## [181] drive        drive along  drown        dry          dust        
## [186] earn         eat          elate        email        emerge      
## [191] encourage    end          enjoy        evolve       exaggerate  
## [196] exclude      expand       expect       experiment   explain     
## [201] explode      exploit      express      extend       face        
## [206] facilitate   fail         fall         fall apart   fall asleep 
## [211] fall down    feed         feel         fight        fight for   
## [216] fill         fill up      film         find         finger      
## [221] finish       finish off   fire         flap         flee        
## [226] flick        flirt        flit         fly          fly through 
## [231] focus        fold         follow       fool around  forget      
## [236] fray         freak out    freewheel    freeze       frown       
## [241] fry          fuck         fuel         fume         function    
## [246] fundraise    gain         garden       gasp         gear up     
## [251] get          get along    get away     get back     get into    
## [256] get off      get on       get through  get up       get used    
## [261] give         give away    give off     give up      gnaw        
## [266] go           go along     go back      go down      go for      
## [271] go off       go on        go out       go over      go round    
## [276] go through   go up        grab         greet        grill       
## [281] grin         grope        grow         grow up      guarantee   
## [286] guess        hack up      hammer       hamper       hand        
## [291] hand out     handcuff     hang         hang  out    hang about  
## [296] hang out     hang over    happen       harass       harm        
## [301] hassle       have         have on      head         hear        
## [306] heat up      help         help out     hibernate    hide        
## [311] hike         hit          hog          hold         hold up     
## [316] hop          hope         host         huff         hug         
## [321] hunt         hurt         impoverish   include      indicate    
## [326] inspect      inspire      insure       interact     interrogate 
## [331] interview    invite       itch         jog          join        
## [336] joke         judge        juggle       jump         keep        
## [341] kick         kid          kill         knacker      kneel       
## [346] knock        knock around knock out    land         laugh       
## [351] lay          laze around  lead         lean in      learn       
## [356] leave        let          level        lie          lift        
## [361] light up     link         listen       live         live up     
## [366] look         look after   look around  look for     look forwad 
## [371] look forward look into    look over    look up      loop up     
## [376] lose         love         make         make up      manage      
## [381] march        mark         maximise     mean         meet        
## [386] memorize     mention      mess         mess up      message     
## [391] milk         miss         mix          moan         mock        
## [396] monitor      mouth        move         move in      munch       
## [401] name         nest         nick         notice       nourish     
## [406] observe      occur        offer        operate      organise    
## [411] overchange   overflow     overlap      overreact    overthink   
## [416] pack         paint        panic        park         participate 
## [421] party        pass         pay          pedal        perform     
## [426] persevere    phone        pick         pick up      pilot       
## [431] pinch        piss         pitch        place        plan        
## [436] plan on      plant        play         play up      point       
## [441] point in     pollute      pop out      pop over     pop up      
## [446] pose         post         pour         practice     practise    
## [451] pray         prepare      present      press        pretend     
## [456] prey         print        proclaim     produce      promise     
## [461] promote      protest      pull         pull apart   pull over   
## [466] pump         puncture     purse        push         push for    
## [471] push out     push through put          put in       put into    
## [476] put off      put on       put out      put together put up      
## [481] question     queue        queue up     race         rain        
## [486] rake in      rap          react        read         realise     
## [491] receive      record       recover      recycle      refer       
## [496] reflect      refuse       rehearse     relax        release     
## [501] rely         rely on      remember     renovate     rent        
## [506] reopen       repair       repeat       reply        report      
## [511] represent    research     resign       retire       return      
## [516] rev          reverse      revise       rewrite      ride        
## [521] ring         rip off      rip up       rise         risk        
## [526] roast        rob          roll         romanticise  ruin        
## [531] run          run out      rush         sabotage     save        
## [536] savour       say          scam         scan         scrabble    
## [541] search       see          seek         sell         send        
## [546] serve        set          set up       settle       shake       
## [551] share        shine        shoot        shop         shout       
## [556] show         shower       shrug        shut         sign        
## [561] sing         sit          sit up       skinny-dip   slam        
## [566] sleep        sleet        slow         smash        smell       
## [571] smile        smirk        smoke        sniff        snooze      
## [576] snow         soak         socialise    sort out     sound       
## [581] speak        spell        spend        spit         split       
## [586] splutter     sponsor      spot         spray        spread      
## [591] spring up    sprint       squander     squeeze      stalk       
## [596] stand        stand up     stare        start        start up    
## [601] starve       stay         steal        stick        stop        
## [606] stress       stretch      strip        strip off    stroll      
## [611] struggle     study        suffer       suggest      sunbathe    
## [616] support      surf         sweat        swim         swish       
## [621] take         take back    take out     take over    take part   
## [626] take up      talk         taste        tax          teach       
## [631] tear         tease        tell         test         text        
## [636] think        threaten     throw        throw up     tick        
## [641] tick along   tidy         tilt         tip          top         
## [646] touch        trail behind train        transcribe   travel      
## [651] tread        treat        trend        try          turn        
## [656] turn into    turn off     turn up      tweet        twist       
## [661] twitch       type         understand   unpack       update      
## [666] upload       upset        urge         use          vibrate     
## [671] visit        visualise    voice        volunteer    vomit       
## [676] vote         wait         wait for     wake up      walk        
## [681] walk up      want         wash         wash up      waste       
## [686] watch        water        wear         wear through wheel       
## [691] wheeze       whisper      whiz by      win          wind up     
## [696] wipe         wish         wonder       work         work on     
## [701] work out     worry        write        write down   yawn        
## 705 Levels: accept ache act action add advise agree aim allow announce ... yawn
lemmas_short <- sort(unique(stringr::word(lemmas_with_phrasal_verbs,1))); lemmas_short # This is the lemma list which will be used to extract the FVPs from the full corpora processed with Spacy.
##   [1] "accept"      "ache"        "act"         "action"      "add"        
##   [6] "advise"      "agree"       "aim"         "allow"       "announce"   
##  [11] "annoy"       "answer"      "apologise"   "apply"       "appoint"    
##  [16] "approach"    "argue"       "arrange"     "arrive"      "ask"        
##  [21] "asphalt"     "assume"      "attack"      "attract"     "babysit"    
##  [26] "bake"        "balloon"     "bang"        "bar"         "bark"       
##  [31] "base"        "bask"        "battle"      "be"          "beat"       
##  [36] "become"      "beef"        "begin"       "behave"      "believe"    
##  [41] "bet"         "blame"       "blink"       "block"       "bloom"      
##  [46] "blow"        "board"       "boil"        "bomb"        "book"       
##  [51] "bore"        "borrow"      "bother"      "bottle"      "bounce"     
##  [56] "bow"         "brag"        "break"       "breathe"     "bring"      
##  [61] "browse"      "build"       "burgeon"     "burn"        "burst"      
##  [66] "buy"         "call"        "camp"        "carry"       "cat-sit"    
##  [71] "catch"       "cause"       "celebrate"   "change"      "charge"     
##  [76] "chase"       "chat"        "cheat"       "check"       "cheer"      
##  [81] "chew"        "chip"        "choke"       "choose"      "chop"       
##  [86] "clamber"     "clap"        "clarify"     "clean"       "clear"      
##  [91] "climb"       "close"       "collaborate" "collect"     "come"       
##  [96] "comment"     "commit"      "communicate" "compete"     "complain"   
## [101] "compose"     "concentrate" "confuse"     "consider"    "consume"    
## [106] "contact"     "contemplate" "continue"    "contradict"  "control"    
## [111] "cook"        "cope"        "copy"        "cost"        "cosy"       
## [116] "cough"       "count"       "crack"       "crash"       "create"     
## [121] "cringe"      "criticise"   "cross"       "crowdfund"   "crumble"    
## [126] "cry"         "cut"         "cycle"       "dance"       "date"       
## [131] "deal"        "decide"      "decline"     "decorate"    "delete"     
## [136] "deliver"     "demand"      "deny"        "describe"    "desert"     
## [141] "design"      "develop"     "die"         "dig"         "digress"    
## [146] "disappear"   "discover"    "discuss"     "disobey"     "dissect"    
## [151] "disturb"     "diversify"   "do"          "doodle"      "download"   
## [156] "drag"        "draw"        "dread"       "dream"       "dress"      
## [161] "drink"       "drip"        "drive"       "drown"       "dry"        
## [166] "dust"        "earn"        "eat"         "elate"       "email"      
## [171] "emerge"      "encourage"   "end"         "enjoy"       "evolve"     
## [176] "exaggerate"  "exclude"     "expand"      "expect"      "experiment" 
## [181] "explain"     "explode"     "exploit"     "express"     "extend"     
## [186] "face"        "facilitate"  "fail"        "fall"        "feed"       
## [191] "feel"        "fight"       "fill"        "film"        "find"       
## [196] "finger"      "finish"      "fire"        "flap"        "flee"       
## [201] "flick"       "flirt"       "flit"        "fly"         "focus"      
## [206] "fold"        "follow"      "fool"        "forget"      "fray"       
## [211] "freak"       "freewheel"   "freeze"      "frown"       "fry"        
## [216] "fuck"        "fuel"        "fume"        "function"    "fundraise"  
## [221] "gain"        "garden"      "gasp"        "gear"        "get"        
## [226] "give"        "gnaw"        "go"          "grab"        "greet"      
## [231] "grill"       "grin"        "grope"       "grow"        "guarantee"  
## [236] "guess"       "hack"        "hammer"      "hamper"      "hand"       
## [241] "handcuff"    "hang"        "happen"      "harass"      "harm"       
## [246] "hassle"      "have"        "head"        "hear"        "heat"       
## [251] "help"        "hibernate"   "hide"        "hike"        "hit"        
## [256] "hog"         "hold"        "hop"         "hope"        "host"       
## [261] "huff"        "hug"         "hunt"        "hurt"        "impoverish" 
## [266] "include"     "indicate"    "inspect"     "inspire"     "insure"     
## [271] "interact"    "interrogate" "interview"   "invite"      "itch"       
## [276] "jog"         "join"        "joke"        "judge"       "juggle"     
## [281] "jump"        "keep"        "kick"        "kid"         "kill"       
## [286] "knacker"     "kneel"       "knock"       "land"        "laugh"      
## [291] "lay"         "laze"        "lead"        "lean"        "learn"      
## [296] "leave"       "let"         "level"       "lie"         "lift"       
## [301] "light"       "link"        "listen"      "live"        "look"       
## [306] "loop"        "lose"        "love"        "make"        "manage"     
## [311] "march"       "mark"        "maximise"    "mean"        "meet"       
## [316] "memorize"    "mention"     "mess"        "message"     "milk"       
## [321] "miss"        "mix"         "moan"        "mock"        "monitor"    
## [326] "mouth"       "move"        "munch"       "name"        "nest"       
## [331] "nick"        "notice"      "nourish"     "observe"     "occur"      
## [336] "offer"       "operate"     "organise"    "overchange"  "overflow"   
## [341] "overlap"     "overreact"   "overthink"   "pack"        "paint"      
## [346] "panic"       "park"        "participate" "party"       "pass"       
## [351] "pay"         "pedal"       "perform"     "persevere"   "phone"      
## [356] "pick"        "pilot"       "pinch"       "piss"        "pitch"      
## [361] "place"       "plan"        "plant"       "play"        "point"      
## [366] "pollute"     "pop"         "pose"        "post"        "pour"       
## [371] "practice"    "practise"    "pray"        "prepare"     "present"    
## [376] "press"       "pretend"     "prey"        "print"       "proclaim"   
## [381] "produce"     "promise"     "promote"     "protest"     "pull"       
## [386] "pump"        "puncture"    "purse"       "push"        "put"        
## [391] "question"    "queue"       "race"        "rain"        "rake"       
## [396] "rap"         "react"       "read"        "realise"     "receive"    
## [401] "record"      "recover"     "recycle"     "refer"       "reflect"    
## [406] "refuse"      "rehearse"    "relax"       "release"     "rely"       
## [411] "remember"    "renovate"    "rent"        "reopen"      "repair"     
## [416] "repeat"      "reply"       "report"      "represent"   "research"   
## [421] "resign"      "retire"      "return"      "rev"         "reverse"    
## [426] "revise"      "rewrite"     "ride"        "ring"        "rip"        
## [431] "rise"        "risk"        "roast"       "rob"         "roll"       
## [436] "romanticise" "ruin"        "run"         "rush"        "sabotage"   
## [441] "save"        "savour"      "say"         "scam"        "scan"       
## [446] "scrabble"    "search"      "see"         "seek"        "sell"       
## [451] "send"        "serve"       "set"         "settle"      "shake"      
## [456] "share"       "shine"       "shoot"       "shop"        "shout"      
## [461] "show"        "shower"      "shrug"       "shut"        "sign"       
## [466] "sing"        "sit"         "skinny-dip"  "slam"        "sleep"      
## [471] "sleet"       "slow"        "smash"       "smell"       "smile"      
## [476] "smirk"       "smoke"       "sniff"       "snooze"      "snow"       
## [481] "soak"        "socialise"   "sort"        "sound"       "speak"      
## [486] "spell"       "spend"       "spit"        "split"       "splutter"   
## [491] "sponsor"     "spot"        "spray"       "spread"      "spring"     
## [496] "sprint"      "squander"    "squeeze"     "stalk"       "stand"      
## [501] "stare"       "start"       "starve"      "stay"        "steal"      
## [506] "stick"       "stop"        "stress"      "stretch"     "strip"      
## [511] "stroll"      "struggle"    "study"       "suffer"      "suggest"    
## [516] "sunbathe"    "support"     "surf"        "sweat"       "swim"       
## [521] "swish"       "take"        "talk"        "taste"       "tax"        
## [526] "teach"       "tear"        "tease"       "tell"        "test"       
## [531] "text"        "think"       "threaten"    "throw"       "tick"       
## [536] "tidy"        "tilt"        "tip"         "top"         "touch"      
## [541] "trail"       "train"       "transcribe"  "travel"      "tread"      
## [546] "treat"       "trend"       "try"         "turn"        "tweet"      
## [551] "twist"       "twitch"      "type"        "understand"  "unpack"     
## [556] "update"      "upload"      "upset"       "urge"        "use"        
## [561] "vibrate"     "visit"       "visualise"   "voice"       "volunteer"  
## [566] "vomit"       "vote"        "wait"        "wake"        "walk"       
## [571] "want"        "wash"        "waste"       "watch"       "water"      
## [576] "wear"        "wheel"       "wheeze"      "whisper"     "whiz"       
## [581] "win"         "wind"        "wipe"        "wish"        "wonder"     
## [586] "work"        "worry"       "write"       "yawn"
#saveRDS(lemmas_short, file = here("lemmas_short.rds")) 

## Prepare FVP counts in the "Extract_verb_lemmas_spacy.R" script with the list created above. This script leads to the creation of the verb_counts.rds file

## Prepare non-prog verb counts ##
verb_counts <- readRDS(file = here("FVP_counts_new.rds"))
head(verb_counts)
##    lemma TxBSpoken_count BNCSpoken_count BNCSpoken_count_sample
## 1 accept              11             340            12.69688471
## 2   ache               1              44             1.64312626
## 3    act              20             265             9.89610132
## 4 action               0               1             0.03734378
## 5    add              27             800            29.87502285
## 6 advise               7              73             2.72609583
#verb_counts <- verb_counts[,c(1,2,4)] # Get rid of total BNC verb count because I'm now only interested in the sample corresponding to the number of FVPs found in Textbook Conversation.

## Adjusting sample ratio with adjusted FVP count for Textbook Conversation (+4 FVPs) ##
verb_counts$BNCSpoken_count_sample <- verb_counts$BNCSpoken_count*(57007/1526439)
head(verb_counts, 20)
##       lemma TxBSpoken_count BNCSpoken_count BNCSpoken_count_sample
## 1    accept              11             340             12.6977757
## 2      ache               1              44              1.6432416
## 3       act              20             265              9.8967957
## 4    action               0               1              0.0373464
## 5       add              27             800             29.8771192
## 6    advise               7              73              2.7262871
## 7     agree             122             976             36.4500855
## 8       aim               5             107              3.9960647
## 9     allow              80            1584             59.1566961
## 10 announce               2              71              2.6515943
## 11    annoy               6             469             17.5154611
## 12   answer              51             366             13.6687820
## 13    apply              10             548             20.4658267
## 14  appoint               1              15              0.5601960
## 15 approach               6              98              3.6599471
## 16    argue              18             249              9.2992534
## 17  arrange               7             164              6.1248094
## 18   arrive             130             474             17.7021931
## 19      ask             342            4338            162.0086790
## 20  asphalt               0               7              0.2614248
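
To make the scaling concrete (a minimal sketch using the numbers shown above): ACCEPT occurs 340 times as an FVP in the full Spoken BNC 2014; multiplied by the ratio used above (57,007 Textbook Conversation FVPs to 1,526,439 Spoken BNC 2014 FVPs), this yields the sampled count of roughly 12.7 shown in the first row.

340 * (57007/1526439) # Reproduces the BNCSpoken_count_sample value for ACCEPT
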
## Subtract progressives from total FVP_counts 
prog$Lemma_short <- stringr::word(prog$Lemma,1)
prog_TxB <- as.data.frame(table(prog[prog$Corpus=="Textbook Conversation", "Lemma_short"]))
names(prog_TxB)[1] <- "lemma"
names(prog_TxB)[2] <- "TxB_prog_count"
head(prog_TxB)
##      lemma TxB_prog_count
## 1      act              2
## 2      add              1
## 3   advise              1
## 4   answer              3
## 5  appoint              1
## 6 approach              2
prog_BNC <- as.data.frame(table(prog[prog$Corpus=="Spoken BNC 2014 sample", "Lemma_short"]))
names(prog_BNC)[1] <- "lemma"
names(prog_BNC)[2] <- "BNC_prog_count"
head(prog_BNC)
##    lemma BNC_prog_count
## 1 accept              2
## 2   ache              1
## 3 action              1
## 4    add              2
## 5  agree              1
## 6    aim              2
counttest <- merge(verb_counts, prog_TxB, by = 'lemma', all = TRUE)
counts <- merge(counttest, prog_BNC, by = 'lemma', all = TRUE)
counts[is.na(counts)] <- 0 # Replace all NAs with 0

counts$TxBSpoken_non_prog = counts$TxBSpoken_count - counts$TxB_prog_count
counts$BNC_non_prog = counts$BNCSpoken_count_sample - counts$BNC_prog_count

head(counts); tail(counts) # Problem: lots of negative non-prog values!!
##    lemma TxBSpoken_count BNCSpoken_count BNCSpoken_count_sample TxB_prog_count
## 1 accept              11             340             12.6977757              0
## 2   ache               1              44              1.6432416              0
## 3    act              20             265              9.8967957              2
## 4 action               0               1              0.0373464              0
## 5    add              27             800             29.8771192              1
## 6 advise               7              73              2.7262871              1
##   BNC_prog_count TxBSpoken_non_prog BNC_non_prog
## 1              2                 11   10.6977757
## 2              1                  1    0.6432416
## 3              0                 18    9.8967957
## 4              1                  0   -0.9626536
## 5              2                 26   27.8771192
## 6              0                  6    2.7262871
##        lemma TxBSpoken_count BNCSpoken_count BNCSpoken_count_sample
## 584 scrabble               0               0                      0
## 585  squeeze               0               0                      0
## 586    stalk               0               0                      0
## 587    tease               0               0                      0
## 588    trail               0               0                      0
## 589     vote               0               0                      0
##     TxB_prog_count BNC_prog_count TxBSpoken_non_prog BNC_non_prog
## 584              0              1                  0           -1
## 585              0              1                  0           -1
## 586              0              1                  0           -1
## 587              0              1                  0           -1
## 588              0              1                  0           -1
## 589              0              1                  0           -1
## What shall I do with all these decimal values from the BNC sample??
# a) Round up and down? Probably less than ideal!
#verb_counts$BNCSpoken_count_sample <- round(verb_counts$BNCSpoken_count_sample, 0)
# b) Peter's suggestion was to sample which very low-frequency verbs to remove but this is no longer necessary.
# c) It turns out that decimal points are not a problem for the coll.analysis script. Solution: just keep them in, even if, conceptually, it doesn't make a lot of sense.

verb_counts[verb_counts$TxBSpoken_count==0 & verb_counts$BNCSpoken_count_sample==0,] # These are a few problematic lemmas for which there are progressive counts (because otherwise these lemmas would not have been in the list) but no FVP counts in either Textbook Conversation or the BNC sample. See below for more.
##          lemma TxBSpoken_count BNCSpoken_count BNCSpoken_count_sample
## 68     cat-sit               0               0                      0
## 324 overchange               0               0                      0
## 425       scam               0               0                      0
## 448 skinny-dip               0               0                      0
## 451      sleet               0               0                      0
#verb_counts <- verb_counts[verb_counts$TxBSpoken_count!=0 & verb_counts$BNCSpoken_count_sample!=0,] # Drop lemmas with zero in both corpora?

#verb_counts <- verb_counts[verb_counts$TxBSpoken_count!=0,] 
#verb_counts <- verb_counts[verb_counts$BNCSpoken_count_sample!=0,] # Drop lemmas with a zero frequency in either corpus?

counts[counts$lemma=="say",]
##     lemma TxBSpoken_count BNCSpoken_count BNCSpoken_count_sample TxB_prog_count
## 424   say             817           50045               1869.001             29
##     BNC_prog_count TxBSpoken_non_prog BNC_non_prog
## 424            214                788     1655.001
## TxB Conversation problem ##

summary(counts$TxBSpoken_non_prog) # We have negative values!!
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##    -1.00     0.00     4.00    92.67    25.00 19876.00
counts[counts$TxBSpoken_non_prog<0,c("lemma", "TxBSpoken_count", "TxB_prog_count", "TxBSpoken_non_prog")] # Four verbs which, unsurprisingly, were not identified as verbs by the spaCy model
##          lemma TxBSpoken_count TxB_prog_count TxBSpoken_non_prog
## 293       loop               0              1                 -1
## 318    nourish               0              1                 -1
## 324 overchange               0              1                 -1
## 534      upset               0              1                 -1
counts$TxBSpoken_non_prog[counts$TxBSpoken_non_prog<0] <- 0 # We will therefore set the non-prog count to 0 for these four lemmas. This adds four FVPs to the total count for the contingency tables.

## Alternative to the above ##
#counts <- subset(counts, !(TxBSpoken_non_prog<0)) # A more logical alternative, given the solution used for the Spoken BNC 2014? But it actually slightly increases the gap in total list-FVPs.

summary(counts$TxBSpoken_non_prog) # Check operation
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     0.00     0.00     4.00    92.67    25.00 19876.00
## Bigger problem with reference corpus! ##
# Sampling (of the Spoken BNC 2014) and tagging errors sometimes lead to negative values, so something has to be done about this. Maybe change all negative values to zero?
summary(counts$BNC_non_prog)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##    -2.656     0.120     3.921    90.939    16.374 21020.104
sum(counts$BNC_prog_count) + sum(counts$BNC_non_prog) # 57007 FVPs in the Spoken BNC 2014 sample at this stage
## [1] 57007
negatives <- counts[counts$BNC_non_prog<0,c("lemma", "BNCSpoken_count", "BNC_non_prog", "BNC_prog_count", "TxBSpoken_non_prog")] #
head(negatives)
##      lemma BNCSpoken_count BNC_non_prog BNC_prog_count TxBSpoken_non_prog
## 4   action               1   -0.9626536              1                  0
## 20 asphalt               7   -0.7385752              1                  0
## 24 babysit              21   -0.2157256              1                  2
## 26 balloon               6   -0.7759216              1                  0
## 28     bar              17   -0.3651112              1                  0
## 31    bask               1   -0.9626536              1                  0
nrow(negatives) # 131 lemmas with negative BNC_non_prog counts
## [1] 131
sum(negatives$BNC_non_prog) # Equivalent to a loss of c. 96.63 FVPs
## [1] -96.63055
## What would we lose if we simply dropped all lemmas with negative BNC_non_prog counts? ##
lostBNC <- if_else(counts$BNC_non_prog<0, counts$BNC_prog_count, 0) 
sum(lostBNC) # 177 # This is not making things any better, on the contrary!
## [1] 177
## Let's consider removing the low-frequency lemmas whose negative values are merely sampling artefacts, keeping only those that are due to annotation errors:
counts[counts$lemma=="party",] # This is likely to be a tagging error. KEEP.
##     lemma TxBSpoken_count BNCSpoken_count BNCSpoken_count_sample TxB_prog_count
## 334 party               1              15               0.560196              1
##     BNC_prog_count TxBSpoken_non_prog BNC_non_prog
## 334              1                  0    -0.439804
counts[counts$lemma=="action",] # This is most likely a sampling artefact and I am not so interested in it because the CL value will not be insignificant either way.
##    lemma TxBSpoken_count BNCSpoken_count BNCSpoken_count_sample TxB_prog_count
## 4 action               0               1              0.0373464              0
##   BNC_prog_count TxBSpoken_non_prog BNC_non_prog
## 4              1                  0   -0.9626536
counts[counts$lemma=="shine",]
##     lemma TxBSpoken_count BNCSpoken_count BNCSpoken_count_sample TxB_prog_count
## 437 shine              10              19              0.7095816              2
##     BNC_prog_count TxBSpoken_non_prog BNC_non_prog
## 437              1                  8   -0.2904184
counts[counts$lemma=="sweat",]
##     lemma TxBSpoken_count BNCSpoken_count BNCSpoken_count_sample TxB_prog_count
## 497 sweat               0              92               3.435869              0
##     BNC_prog_count TxBSpoken_non_prog BNC_non_prog
## 497              5                  0    -1.564131
keep <- counts[counts$BNC_non_prog<0 & counts$BNC_prog_count>1,]; keep # These are potentially important lemmas I'd like to keep, such as JOKE and KID.
##           lemma TxBSpoken_count BNCSpoken_count BNCSpoken_count_sample
## 99  concentrate               9             101              3.7719863
## 166  exaggerate               3              36              1.3444704
## 191       flick               0              59              2.2034375
## 266        joke              16             388             14.4904028
## 272         kid              16             151              5.6393063
## 368     protest               4              19              0.7095816
## 376       queue               0              44              1.6432416
## 377        race               0              45              1.6805880
## 398      reopen               1              15              0.5601960
## 421    sabotage               0              15              0.5601960
## 468        spit               0              55              2.0540519
## 488       strip               1              50              1.8673200
## 497       sweat               0              92              3.4358687
## 552       wheel               1              29              1.0830456
## 565   apologise               0               0              0.0000000
## 570       crash               0               0              0.0000000
##     TxB_prog_count BNC_prog_count TxBSpoken_non_prog BNC_non_prog
## 99               2              5                  7   -1.2280137
## 166              3              4                  0   -2.6555296
## 191              0              3                  0   -0.7965625
## 266             11             15                  5   -0.5095972
## 272             13              8                  3   -2.3606937
## 368              1              2                  3   -1.2904184
## 376              0              2                  0   -0.3567584
## 377              0              2                  0   -0.3194120
## 398              0              2                  1   -1.4398040
## 421              0              2                  0   -1.4398040
## 468              0              4                  0   -1.9459481
## 488              0              2                  1   -0.1326800
## 497              0              5                  0   -1.5641313
## 552              0              2                  1   -0.9169544
## 565              0              2                  0   -2.0000000
## 570              0              2                  0   -2.0000000
sum(keep$BNC_prog_count) # This adds 62 FVPs to the total.
## [1] 62
factor(keep$lemma) # For 16 unique lemmas
##  [1] concentrate exaggerate  flick       joke        kid         protest    
##  [7] queue       race        reopen      sabotage    spit        strip      
## [13] sweat       wheel       apologise   crash      
## 16 Levels: concentrate exaggerate flick joke kid protest queue race ... crash
dispose <- counts[counts$BNC_non_prog<0 & counts$BNC_prog_count<=1,] # These are the low-frequency ones we can get rid of.
sum(dispose$BNC_prog_count) # This removes 115 FVPs from the total.
## [1] 115
factor(dispose$lemma) # For, as we would expect, 115 unique (low-frequency) lemmas.
##   [1] action      asphalt     babysit     balloon     bar         bask       
##   [7] battle      beef        bottle      brag        browse      burgeon    
##  [13] cat-sit     clarify     collaborate compose     contradict  cosy       
##  [19] cringe      crowdfund   decline     digress     dissect     doodle     
##  [25] exploit     facilitate  finger      flap        flee        flit       
##  [31] fray        freewheel   fume        function    gasp        gear       
##  [37] gnaw        grope       hamper      harm        hassle      hibernate  
##  [43] hike        hog         huff        inspect     interrogate itch       
##  [49] laze        maximise    memorize    monitor     mouth       munch      
##  [55] nest        overflow    overlap     overthink   party       persevere  
##  [61] pilot       pollute     prey        proclaim    puncture    rake       
##  [67] rev         rewrite     romanticise savour      scam        shine      
##  [73] skinny-dip  sleet       smirk       snooze      splutter    sprint     
##  [79] stare       sunbathe    swish       tilt        tread       trend      
##  [85] twitch      unpack      visualise   voice       wheeze      whisper    
##  [91] whiz        yawn        bounce      clamber     contact     contemplate
##  [97] demand      deny        drag        elate       fuel        hug        
## [103] knacker     name        nick        promise     represent   resign     
## [109] rush        scrabble    squeeze     stalk       tease       trail      
## [115] vote       
## 115 Levels: action asphalt babysit balloon bar bask battle beef bottle ... vote
# Question: Should I be getting rid of these if they happen to be more frequent in Textbook Conversation??
head(dispose[order(dispose$TxBSpoken_count, decreasing = T),], 12) # These lemmas should probably not be deleted because they are potentially relevant for Textbook Conversation!
##           lemma TxBSpoken_count BNCSpoken_count BNCSpoken_count_sample
## 437       shine              10              19              0.7095816
## 243        hike               7              22              0.8216208
## 479       stare               4              26              0.9710064
## 127     decline               3              22              0.8216208
## 190        flee               3              12              0.4481568
## 351     pollute               3              10              0.3734640
## 24      babysit               2              21              0.7842744
## 59       browse               2              11              0.4108104
## 98      compose               2              14              0.5228496
## 260 interrogate               2               4              0.1493856
## 540       voice               2              20              0.7469280
## 554     whisper               2              23              0.8589672
##     TxB_prog_count BNC_prog_count TxBSpoken_non_prog BNC_non_prog
## 437              2              1                  8  -0.29041842
## 243              2              1                  5  -0.17837922
## 479              3              1                  1  -0.02899363
## 127              0              1                  3  -0.17837922
## 190              0              1                  3  -0.55184321
## 351              1              1                  2  -0.62653601
## 24               0              1                  2  -0.21572562
## 59               0              1                  2  -0.58918961
## 98               0              1                  2  -0.47715041
## 260              0              1                  2  -0.85061440
## 540              0              1                  2  -0.25307202
## 554              1              1                  1  -0.14103282
dispose2 <- counts[counts$BNC_non_prog<0 & counts$BNC_prog_count<=1 & counts$TxBSpoken_count<1,]
factor(dispose2$lemma) # That would mean that I only delete 97 verb lemmas & therefore 97 FVPs from the BNC total count
##  [1] action      asphalt     balloon     bar         bask        battle     
##  [7] beef        bottle      brag        burgeon     cat-sit     clarify    
## [13] collaborate contradict  cosy        cringe      crowdfund   digress    
## [19] dissect     doodle      facilitate  finger      flap        flit       
## [25] fray        freewheel   function    gear        gnaw        grope      
## [31] hamper      harm        hassle      hibernate   hog         huff       
## [37] itch        laze        maximise    memorize    monitor     mouth      
## [43] nest        overflow    overlap     overthink   persevere   pilot      
## [49] prey        proclaim    puncture    rake        rev         rewrite    
## [55] romanticise savour      scam        skinny-dip  sleet       smirk      
## [61] snooze      splutter    sprint      sunbathe    swish       tilt       
## [67] tread       trend       twitch      unpack      visualise   wheeze     
## [73] whiz        yawn        bounce      clamber     contact     contemplate
## [79] demand      deny        drag        elate       fuel        hug        
## [85] knacker     name        nick        promise     represent   resign     
## [91] rush        scrabble    squeeze     stalk       tease       trail      
## [97] vote       
## 97 Levels: action asphalt balloon bar bask battle beef bottle brag ... vote
nrow(counts)
## [1] 589
counts <- subset(counts, !(BNC_non_prog<0 & BNC_prog_count==1 & TxBSpoken_count<1)) # This excludes the 97 FVPs/hapax legomena lemmas from the count data frame
nrow(counts) # 492
## [1] 492
sum(counts$BNC_prog_count) + sum(counts$BNC_non_prog) # which leaves 56979.33 FVPs in the Spoken BNC sample
## [1] 56979.33
sum(counts$BNC_non_prog)
## [1] 53632.33
summary(counts$BNC_non_prog)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##    -2.656     1.367     6.129   109.009    24.839 21020.104
counts$BNC_non_prog[counts$BNC_non_prog<0] <- 0 # It doesn't make any sense to have negative non-progressive counts, so all remaining negative counts are set to 0.
sum(counts$BNC_non_prog)
## [1] 53659.63
sum(counts$BNC_prog_count) + sum(counts$BNC_non_prog) # and 57006.63 FVPs in Spoken BNC
## [1] 57006.63
## Summary

#counts <- counts[,c("lemma","TxB_prog_count", "TxBSpoken_non_prog", "BNC_prog_count", "BNC_non_prog")]
head(counts); tail(counts)
##    lemma TxBSpoken_count BNCSpoken_count BNCSpoken_count_sample TxB_prog_count
## 1 accept              11             340              12.697776              0
## 2   ache               1              44               1.643242              0
## 3    act              20             265               9.896796              2
## 5    add              27             800              29.877119              1
## 6 advise               7              73               2.726287              1
## 7  agree             122             976              36.450085              0
##   BNC_prog_count TxBSpoken_non_prog BNC_non_prog
## 1              2                 11   10.6977757
## 2              1                  1    0.6432416
## 3              0                 18    9.8967957
## 5              2                 26   27.8771192
## 6              0                  6    2.7262871
## 7              1                122   35.4500855
##         lemma TxBSpoken_count BNCSpoken_count BNCSpoken_count_sample
## 560    wonder              38            1733               64.72131
## 561      work             329            8008              299.06996
## 562     worry             142            1043               38.95229
## 563     write             182            2840              106.06377
## 565 apologise               0               0                0.00000
## 570     crash               0               0                0.00000
##     TxB_prog_count BNC_prog_count TxBSpoken_non_prog BNC_non_prog
## 560              7              7                 31     57.72131
## 561             50             56                279    243.06996
## 562              0              3                142     35.95229
## 563             18              8                164     98.06377
## 565              0              2                  0      0.00000
## 570              0              2                  0      0.00000
## Final check ##

sum(counts$TxB_prog_count) + sum(counts$TxBSpoken_non_prog) # that's 57007 FVPs in TxB Spoken
## [1] 57007
sum(counts$BNC_prog_count) + sum(counts$BNC_non_prog) # and 57006.63 FVPs in Spoken BNC
## [1] 57006.63
(dif <- sum(counts$TxB_prog_count) + sum(counts$TxBSpoken_non_prog) - sum(counts$BNC_prog_count) - sum(counts$BNC_non_prog)) # Difference of 0.37
## [1] 0.3694507
dif / (sum(counts$TxB_prog_count) + sum(counts$TxBSpoken_non_prog)) * 100 # Difference in percentage (being honest and taking the largest of the two percentages ;-)
## [1] 0.0006480796
## Contingency table function ##

TxB_CTable <- function(verb) {
  x = subset(counts, lemma==verb)
  vprog = x$TxB_prog_count
  tverb = x$TxBSpoken_count
  vnonprog = tverb - vprog
  tprog = sum(counts$TxB_prog_count)
  tnonprog = sum(counts$TxBSpoken_non_prog)
  xvprog = tprog - vprog
  xvnonprog = tnonprog - vnonprog
  
  verb = c(vprog, vnonprog, tverb)
  other = c(xvprog, xvnonprog, (xvprog+xvnonprog))
  totals = c(tprog, tnonprog, (tprog + tnonprog))
  
  df=data.frame(verb=verb, other=other, row_totals=totals)
  return(df)
}

TxB_CTable("say")
##   verb other row_totals
## 1   29  2394       2423
## 2  788 53796      54584
## 3  817 56190      57007
##
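To sanity-check such tables, here is a minimal, ad-hoc sketch of how a log-likelihood (G2) score can be computed from the 2x2 core of a contingency table like the one above (ignoring the totals row and column). The helper G2_from_table() is a hypothetical illustration, not the code used by the coll.analysis or collostructions scripts; it simply applies the standard G2 formula, so its result for SAY should be close in magnitude to the COLL.STR.LOGL value obtained further down with collex.dist().

# Ad-hoc helper (assumption: standard formula G2 = 2 * sum(O * log(O/E)),
# with expected frequencies E derived from the row and column margins)
G2_from_table <- function(vprog, vnonprog, xvprog, xvnonprog) {
  O <- matrix(c(vprog, vnonprog, xvprog, xvnonprog), nrow = 2) # column 1 = verb, column 2 = all other verbs
  E <- outer(rowSums(O), colSums(O)) / sum(O) # expected frequencies based on the margins
  2 * sum(O * log(O / E), na.rm = TRUE) # na.rm = TRUE implements the 0 * log(0) = 0 convention
}
G2_from_table(29, 788, 2394, 53796) # observed counts for SAY in Textbook Conversation (see table above)
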

BNC_CTable <- function(verb) {
  x = subset(counts, lemma==verb)
  vprog = x$BNC_prog_count
  tverb = x$BNCSpoken_count_sample
  vnonprog = x$BNC_non_prog
  tprog = sum(counts$BNC_prog_count)
  tnonprog = sum(counts$BNC_non_prog)
  xvprog = tprog - vprog
  xvnonprog = tnonprog - vnonprog
  
  verb = c(vprog, vnonprog, tverb)
  other = c(xvprog, xvnonprog, (xvprog+xvnonprog))
  totals = c(tprog, tnonprog, (tprog + tnonprog))
  
  df=data.frame(verb=verb, other=other, row_totals=totals)
  return(df)
}

BNC_CTable("party")
##       verb    other row_totals
## 1 1.000000  3346.00    3347.00
## 2 0.000000 53659.63   53659.63
## 3 0.560196 57005.63   57006.63
BNC_CTable("say")
##       verb    other row_totals
## 1  214.000  3133.00    3347.00
## 2 1655.001 52004.63   53659.63
## 3 1869.001 55137.63   57006.63
# This file can be split into two to be used for two separate DCA analyses à la 2b.csv
#saveRDS(counts, here("counts4DCA_2b.rds")) # Last saved on 26 January 2020
counts <- readRDS(file =here("counts4DCA_2b.rds"))

### Files for DCA analysis à la 2b.csv
# Textbook conversation
TxB_Spoken_DCA <- counts[,c("lemma", "TxB_prog_count", "TxBSpoken_non_prog")]
head(TxB_Spoken_DCA); tail(TxB_Spoken_DCA)
##    lemma TxB_prog_count TxBSpoken_non_prog
## 1 accept              0                 11
## 2   ache              0                  1
## 3    act              2                 18
## 5    add              1                 26
## 6 advise              1                  6
## 7  agree              0                122
##         lemma TxB_prog_count TxBSpoken_non_prog
## 560    wonder              7                 31
## 561      work             50                279
## 562     worry              0                142
## 563     write             18                164
## 565 apologise              0                  0
## 570     crash              0                  0
nrow(TxB_Spoken_DCA)
## [1] 492
TxB_Spoken_DCA <- subset(TxB_Spoken_DCA, TxB_prog_count>0 | TxBSpoken_non_prog>0) # Drop lemmas with zero occurrences in Textbook Conversation
nrow(TxB_Spoken_DCA)
## [1] 454
sum(TxB_Spoken_DCA$TxB_prog_count) + sum(TxB_Spoken_DCA$TxBSpoken_non_prog)
## [1] 57007
#write.table(TxB_Spoken_DCA, file = here("TxB_Spoken_DCA.csv"), sep = "\t") 


# Spoken BNC2014 sample
BNC_DCA <- counts[,c("lemma", "BNC_prog_count", "BNC_non_prog")]
nrow(BNC_DCA)
## [1] 492
BNC_DCA <- subset(BNC_DCA, BNC_non_prog > 0 | BNC_prog_count > 0) # Drop lemmas with zero occurrences in Spoken BNC 2014
head(BNC_DCA); tail(BNC_DCA)
##    lemma BNC_prog_count BNC_non_prog
## 1 accept              2   10.6977757
## 2   ache              1    0.6432416
## 3    act              0    9.8967957
## 5    add              2   27.8771192
## 6 advise              0    2.7262871
## 7  agree              1   35.4500855
##         lemma BNC_prog_count BNC_non_prog
## 560    wonder              7     57.72131
## 561      work             56    243.06996
## 562     worry              3     35.95229
## 563     write              8     98.06377
## 565 apologise              2      0.00000
## 570     crash              2      0.00000
(sum(BNC_DCA$BNC_prog_count) + sum(BNC_DCA$BNC_non_prog)) - (sum(TxB_Spoken_DCA$TxB_prog_count) + sum(TxB_Spoken_DCA$TxBSpoken_non_prog)) # Difference in total number of FVPs: c. -0.37
## [1] -0.3694507
#write.table(BNC_DCA, file = here("BNC_DCA_decimals.csv"), sep = "\t") 
library(collostructions)

counts <- read.csv(file = here("TxB_Spoken_DCA.csv"), sep = "\t")
head(counts)
##    lemma TxB_prog_count TxBSpoken_non_prog
## 1 accept              0                 11
## 2   ache              0                  1
## 3    act              2                 18
## 5    add              1                 26
## 6 advise              1                  6
## 7  agree              0                122
DCA_TxB <- collex.dist(counts) # default association measure with this package is log-likelihood
head(DCA_TxB); tail(DCA_TxB)
##   COLLEX O.CXN1 E.CXN1 O.CXN2 E.CXN2          ASSOC COLL.STR.LOGL SIGNIF SHARED
## 1   wear     71    7.2     99  162.8 TxB_prog_count     227.79570  *****      Y
## 2   talk     88   12.5    205  280.5 TxB_prog_count     218.00717  *****      Y
## 3     do    206   73.7   1527 1659.3 TxB_prog_count     177.80929  *****      Y
## 4   look    117   44.1    921  993.9 TxB_prog_count      90.29046  *****      Y
## 5   wait     50   10.1    188  227.9 TxB_prog_count      88.14350  *****      Y
## 6     go    205  104.9   2263 2363.1 TxB_prog_count      83.38281  *****      Y
##     COLLEX O.CXN1 E.CXN1 O.CXN2  E.CXN2              ASSOC COLL.STR.LOGL SIGNIF
## 449   have     59  116.2   2675  2617.8 TxBSpoken_non_prog      37.13818  *****
## 450    let      3   33.1    775   744.9 TxBSpoken_non_prog      47.32697  *****
## 451   love      0   23.1    544   520.9 TxBSpoken_non_prog      47.48734  *****
## 452    see      8   57.7   1350  1300.3 TxBSpoken_non_prog      70.78280  *****
## 453   want      2   50.1   1176  1127.9 TxBSpoken_non_prog      86.28789  *****
## 454     be     16  845.5  19876 19046.5 TxBSpoken_non_prog    1962.21780  *****
##     SHARED
## 449      Y
## 450      Y
## 451      N
## 452      Y
## 453      Y
## 454      Y
library(flextable)

DCA_TxB %>% 
  rename(lemma = COLLEX, G2 = COLL.STR.LOGL) %>% 
  mutate(G2 = ifelse(ASSOC=="TxBSpoken_non_prog", -G2, G2)) %>% 
  mutate(G2 = round(G2, 1)) %>% 
  #mutate(across(.cols = c(E.CXN1, O.CXN2, E.CXN2), round, 0)) %>% 
  # N.B. In the collex.dist() output, CXN1 corresponds to the first frequency column of the input
  # (here the progressive counts) and CXN2 to the second (the non-progressive counts).
  mutate(`O:Eprog` = as.character(paste0(O.CXN1,":",E.CXN1))) %>% 
  mutate(`O:Enon-prog` = as.character(paste0(O.CXN2,":",E.CXN2))) %>% 
  select(lemma, `O:Eprog`, `O:Enon-prog`, G2) %>% 
  filter(abs(G2) > 30) -> resultsTxB

resultsTxB
##    lemma     O:Eprog   O:Enon-prog      G2
## 1   wear      71:7.2      99:162.8   227.8
## 2   talk     88:12.5     205:280.5   218.0
## 3     do    206:73.7   1527:1659.3   177.8
## 4   look    117:44.1     921:993.9    90.3
## 5   wait     50:10.1     188:227.9    88.1
## 6     go   205:104.9   2263:2363.1    83.4
## 7    try     55:13.1     253:294.9    81.1
## 8    kid      13:0.7        3:15.3    67.0
## 9   work       50:14       279:315    60.2
## 10  play     56:17.4     353:391.6    58.4
## 11  plan      17:1.8       26:41.2    52.0
## 12  joke      11:0.7        5:15.3    50.1
## 13  rain      12:0.9        9:20.1    47.9
## 14 stand      20:3.7       67:83.3    38.5
## 15 study      18:3.1       56:70.9    36.5
## 16   lie      12:1.4       21:31.6    34.4
## 17  walk      27:7.6     151:170.4    32.3
## 18 smile      11:1.4       21:30.6    30.2
## 19 think     27:67.4   1558:1517.6   -33.1
## 20  have    59:116.2   2675:2617.8   -37.1
## 21   let      3:33.1     775:744.9   -47.3
## 22  love      0:23.1     544:520.9   -47.5
## 23   see      8:57.7   1350:1300.3   -70.8
## 24  want      2:50.1   1176:1127.9   -86.3
## 25    be    16:845.5 19876:19046.5 -1962.2
flextable(resultsTxB) %>% 
  compose(part = "header", j = "O:Enon-prog", value = as_paragraph("O:E", as_sub("non-prog"))) %>% 
  compose(part = "header", j = "O:Eprog", value = as_paragraph("O:E", as_sub("prog"))) %>% 
  compose(part = "header", j = "G2", value = as_paragraph("G", as_sup("2"))) %>% 
  theme_booktabs() #%>% 
  #save_as_docx(path = here("DCA_TxBConv.docx"))

# Reference Conversation

BNC_DCA <- read.csv(file = here("BNC_DCA_decimals.csv"), sep = "\t", stringsAsFactors = TRUE)
head(BNC_DCA)
##    lemma BNC_prog_count BNC_non_prog
## 1 accept              2   10.6977757
## 2   ache              1    0.6432416
## 3    act              0    9.8967957
## 5    add              2   27.8771192
## 6 advise              0    2.7262871
## 7  agree              1   35.4500855
DCA_BNC <- collex.dist(BNC_DCA) # Remember that the default AM is log-likelihood
head(DCA_BNC); tail(DCA_BNC)
##   COLLEX O.CXN1 E.CXN1     O.CXN2 E.CXN2          ASSOC COLL.STR.LOGL SIGNIF
## 1   talk    134   13.5   95.41893  215.9 BNC_prog_count     464.51462  *****
## 2    try    127   17.6  171.95792  281.4 BNC_prog_count     337.13045  *****
## 3     do    333  182.0 2766.41500 2917.4 BNC_prog_count     116.12667  *****
## 4    say    214  109.7 1655.00054 1759.3 BNC_prog_count      87.24665  *****
## 5     go    325  192.9 2960.32538 3092.4 BNC_prog_count      86.57210  *****
## 6   joke     15    0.9    0.00000   14.1 BNC_prog_count      85.11647  *****
##   SHARED
## 1      Y
## 2      Y
## 3      Y
## 4      Y
## 5      Y
## 6      N
##     COLLEX O.CXN1 E.CXN1     O.CXN2  E.CXN2        ASSOC COLL.STR.LOGL SIGNIF
## 482    get    121  189.9  3113.3475  3044.5 BNC_non_prog      31.86039  *****
## 483   mean      2   26.5   449.3312   424.8 BNC_non_prog      40.23969  *****
## 484    see     11   56.0   943.0511   898.0 BNC_non_prog      57.09053  *****
## 485   have     94  187.7  3102.2916  3008.6 BNC_non_prog      63.15981  *****
## 486   want      3   51.4   872.9598   824.5 BNC_non_prog      83.35300  *****
## 487     be     56 1237.4 21020.1042 19838.7 BNC_non_prog    2691.13330  *****
##     SHARED
## 482      Y
## 483      Y
## 484      Y
## 485      Y
## 486      Y
## 487      Y
DCA_BNC %>% 
  rename(lemma = COLLEX, G2 = COLL.STR.LOGL) %>% 
  mutate(G2 = ifelse(ASSOC=="BNC_non_prog", -G2, G2)) %>% 
  mutate(G2 = round(G2, 1)) %>% 
  mutate(O.CXN2 = round(O.CXN2, 0)) %>% 
  mutate(`O:Eprog` = as.character(paste0(O.CXN1,":",E.CXN1))) %>% 
  mutate(`O:Enon-prog` = as.character(paste0(O.CXN2,":",E.CXN2))) %>% 
  select(lemma, `O:Eprog`, `O:Enon-prog`, G2) %>% 
  filter(abs(G2) > 25) -> resultsBNC

resultsBNC
##          lemma     O:Eprog   O:Enon-prog      G2
## 1         talk    134:13.5      95:215.9   464.5
## 2          try    127:17.6     172:281.4   337.1
## 3           do     333:182   2766:2917.4   116.1
## 4          say   214:109.7   1655:1759.3    87.2
## 5           go   325:192.9   2960:3092.4    86.6
## 6         joke      15:0.9        0:14.1    85.1
## 7         look    101:38.6     556:618.7    77.3
## 8         work     56:17.6     243:281.5    59.0
## 9         come    102:46.8     695:750.6    53.6
## 10         sit      36:8.7     113:139.9    53.4
## 11      record      15:1.6       11:24.9    50.4
## 12         kid       8:0.5         0:7.5    45.4
## 13        plan      12:1.2        9:19.5    41.0
## 14        rain       8:0.6         2:9.6    35.1
## 15        play      30:8.7     119:140.3    35.0
## 16    struggle       9:0.8        4:12.6    34.6
## 17       drink      18:3.6       43:57.5    33.3
## 18        wait      22:5.5       71:87.5    31.7
## 19        wear      18:3.9       49:63.2    30.0
## 20 concentrate       5:0.3         0:4.7    28.4
## 21       sweat       5:0.3         0:4.7    28.4
## 22         pay     30:10.2     144:163.4    27.8
## 23         run      22:6.1       81:97.1    27.7
## 24        hold      13:2.4       28:38.1    26.2
## 25    remember      1:18.3     311:293.7   -29.9
## 26         get   121:189.9   3113:3044.5   -31.9
## 27        mean      2:26.5     449:424.8   -40.2
## 28         see       11:56       943:898   -57.1
## 29        have    94:187.7   3102:3008.6   -63.2
## 30        want      3:51.4     873:824.5   -83.4
## 31          be   56:1237.4 21020:19838.7 -2691.1
flextable(resultsBNC) %>% 
  compose(part = "header", j = "O:Enon-prog", value = as_paragraph("O:E", as_sub("non-prog"))) %>% 
  compose(part = "header", j = "O:Eprog", value = as_paragraph("O:E", as_sub("prog"))) %>% 
  compose(part = "header", j = "G2", value = as_paragraph("G", as_sup("2"))) %>%
  theme_booktabs() #%>% 
  #save_as_docx(path = here("DCA_SpokenBNC.docx"))
BNC_collstrength <- read.csv(file = here("BNC_DCA_decimals_LL.csv"), header = TRUE, sep=",", stringsAsFactors = TRUE)
glimpse(BNC_collstrength)
## Rows: 492
## Columns: 9
## $ words                     <fct> accept, ache, act, add, advise, agree, aim, …
## $ obs.freq.1                <int> 2, 1, 0, 2, 0, 1, 2, 1, 2, 4, 1, 2, 2, 0, 0,…
## $ obs.freq.2                <dbl> 10.6977757, 0.6432416, 9.8967957, 27.8771192…
## $ exp.freq.1                <dbl> 0.75, 0.10, 0.58, 1.75, 0.16, 2.14, 0.23, 3.…
## $ exp.freq.2                <dbl> 11.95, 1.55, 9.32, 28.12, 2.57, 34.31, 3.76,…
## $ pref.occur                <fct> X.BNC_prog_count., X.BNC_prog_count., X.BNC_…
## $ delta.p.constr1.cues.word <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ delta.p.word.cues.constr1 <dbl> 0.10, 0.55, -0.06, 0.01, -0.06, -0.03, 0.44,…
## $ coll.strength             <dbl> 1.58, 3.55, 1.20, 0.04, 0.33, 0.80, 6.04, 2.…
names(BNC_collstrength)[1] <- "lemma"
names(BNC_collstrength)[9] <- "coll.strength.BNC"
BNC_collstrength <- na.omit(BNC_collstrength) # Exclude lemmas with no data
BNC_collstrength$coll.strength.BNC <- ifelse(BNC_collstrength$pref.occur=="X.BNC_prog_count.", BNC_collstrength$coll.strength, -(BNC_collstrength$coll.strength))
BNC_collstrength <- BNC_collstrength[,c(1,9)]
head(BNC_collstrength); tail(BNC_collstrength)
##    lemma coll.strength.BNC
## 1 accept              1.58
## 2   ache              3.55
## 3    act             -1.20
## 4    add              0.04
## 5 advise             -0.33
## 6  agree             -0.80
##      lemma coll.strength.BNC
## 487   wipe             -0.69
## 488   wish             -3.19
## 489 wonder              2.33
## 490   work             58.99
## 491  worry              0.22
## 492  write              0.50
TxBSpoken_collstrength <- read.csv(file = here("TxB_Spoken_DCA_collstrength_LL.csv"), header = TRUE, sep = ",", stringsAsFactors = TRUE)
glimpse(TxBSpoken_collstrength)
## Rows: 454
## Columns: 9
## $ words                     <fct> accept, ache, act, add, advise, agree, aim, …
## $ obs.freq.1                <int> 0, 0, 2, 1, 1, 0, 0, 0, 0, 0, 3, 0, 1, 2, 1,…
## $ obs.freq.2                <int> 11, 1, 18, 26, 6, 122, 5, 80, 2, 6, 48, 10, …
## $ exp.freq.1                <dbl> 0.47, 0.04, 0.85, 1.15, 0.30, 5.19, 0.21, 3.…
## $ exp.freq.2                <dbl> 10.53, 0.96, 19.15, 25.85, 6.70, 116.81, 4.7…
## $ pref.occur                <fct> X.TxBSpoken_non_prog., X.TxBSpoken_non_prog.…
## $ delta.p.constr1.cues.word <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ delta.p.word.cues.constr1 <dbl> -0.04, -0.04, 0.06, -0.01, 0.10, -0.04, -0.0…
## $ coll.strength             <dbl> 0.96, 0.09, 1.19, 0.02, 1.10, 10.61, 0.43, 6…
names(TxBSpoken_collstrength)[1] <- "lemma"
names(TxBSpoken_collstrength)[9] <- "coll.strength.TxBSpoken"
subset(TxBSpoken_collstrength, lemma=="mean")
##     lemma obs.freq.1 obs.freq.2 exp.freq.1 exp.freq.2            pref.occur
## 248  mean          1        253       10.8      243.2 X.TxBSpoken_non_prog.
##     delta.p.constr1.cues.word delta.p.word.cues.constr1 coll.strength.TxBSpoken
## 248                         0                     -0.04                   15.26
TxBSpoken_collstrength <- na.omit(TxBSpoken_collstrength) # Exclude lemmas with no data
TxBSpoken_collstrength$coll.strength.TxBSpoken <- ifelse(TxBSpoken_collstrength$pref.occur=="X.TxB_prog_count.", TxBSpoken_collstrength$coll.strength, -(TxBSpoken_collstrength$coll.strength))
TxBSpoken_collstrength <- TxBSpoken_collstrength[,c(1,9)]
head(TxBSpoken_collstrength); tail(TxBSpoken_collstrength)
##    lemma coll.strength.TxBSpoken
## 1 accept                   -0.96
## 2   ache                   -0.09
## 3    act                    1.19
## 4    add                   -0.02
## 5 advise                    1.10
## 6  agree                  -10.61
##      lemma coll.strength.TxBSpoken
## 449   wipe                    2.67
## 450   wish                   -0.68
## 451 wonder                   10.61
## 452   work                   60.23
## 453  worry                  -12.35
## 454  write                   10.54
coll_strengths <- merge(BNC_collstrength, TxBSpoken_collstrength, by = 'lemma', all = FALSE) # Merge tables, to only look at differences in coll.strength values

#saveRDS(coll_strengths, file = here("coll_strengths_conversation.rds"))

coll_strengths %>%
  ggplot(aes(x = coll.strength.TxBSpoken, y = coll.strength.BNC)) +
  labs(x = "Textbook Conversation (TCC)", y = "Spoken BNC2014") +
  geom_point(shape = "circle filled", fill = "grey") -> fig

fig

# We have one very extreme case: for BE.
coll_strengths %>%
  filter(coll.strength.TxBSpoken < -1000, coll.strength.BNC < -1000)
##   lemma coll.strength.BNC coll.strength.TxBSpoken
## 1    be          -2691.13                -1962.22
# Let's exclude BE for visualisation purposes then.
coll_strengths%>%
  filter(lemma != "be") ->
  d_not_to_be

fig %+% d_not_to_be

# There is a strong linear association between a verb's tendency to occur in the progressive construction in Textbook Conversation and its tendency to do so in naturally occurring conversation. Let's quantify it.

d_not_to_be %>%
  summarise(corr_TxB_BNC = cor(coll.strength.TxBSpoken, coll.strength.BNC))
##   corr_TxB_BNC
## 1    0.6896516
cor.test(coll_strengths$coll.strength.TxBSpoken, coll_strengths$coll.strength.BNC, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  coll_strengths$coll.strength.TxBSpoken and coll_strengths$coll.strength.BNC
## t = 115.29, df = 447, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9802844 0.9863567
## sample estimates:
##       cor 
## 0.9835969
# According to Susanne Flach, the measures of association are not comparable across corpora, even after controlling for the total number of FVPs, so let's standardise the two association variables. The absolute difference between the standardised measures then tells us to what extent a verb is more strongly associated with the progressive construction in one corpus than in the other.

d_not_to_be %>%
  mutate(
    TxBSpoken_scaled = as.vector(scale(coll.strength.TxBSpoken)),
    BNC_scaled = as.vector(scale(coll.strength.BNC)),
    dev = abs(BNC_scaled - TxBSpoken_scaled)
  ) ->
  d_not_to_be

# We can now identify verbs that are extreme by this measure of deviance.

d_not_to_be %>%
  filter(dev > 1) %>% 
  select(lemma, coll.strength.TxBSpoken, coll.strength.BNC, dev) %>% 
  arrange(-dev)
##       lemma coll.strength.TxBSpoken coll.strength.BNC      dev
## 1      wear                  227.80             30.02 9.467230
## 2       try                   81.10            337.13 7.392262
## 3      talk                  218.01            464.51 5.290619
## 4        do                  177.81            116.13 4.332710
## 5      wait                   88.14             31.69 2.989542
## 6       say                   -1.06             87.25 2.942169
## 7      love                  -47.49             -5.27 2.030740
## 8       let                  -47.33             -7.03 1.965426
## 9    record                    0.39             50.38 1.661368
## 10     look                   90.29             77.31 1.586163
## 11      kid                   67.00             45.38 1.566546
## 12    think                  -33.15             -0.69 1.522089
## 13     play                   58.38             34.97 1.512927
## 14    stand                   38.45              8.30 1.474620
## 15      see                  -70.78            -57.09 1.395385
## 16    smile                   30.16              1.32 1.323228
## 17    study                   36.55             10.67 1.309199
## 18     come                   10.95             53.64 1.283085
## 19  prepare                   27.83             -0.60 1.279301
## 20     want                  -86.29            -83.35 1.243927
## 21     feel                   27.26              0.09 1.230366
## 22 struggle                    0.73             34.61 1.126431
## 23     walk                   32.30             11.72 1.079174
## 24     rain                   47.95             35.13 1.028003
## 25     plan                   52.02             40.98 1.022536
## 26    sleep                   22.24             -0.08 1.005104
d_not_to_be %>%
  filter(dev > 1.28) %>% 
  arrange(-dev) ->
  d_extremes

d_extremes
##     lemma coll.strength.BNC coll.strength.TxBSpoken TxBSpoken_scaled
## 1    wear             30.02                  227.80       10.2667320
## 2     try            337.13                   81.10        3.5202800
## 3    talk            464.51                  218.01        9.8165086
## 4      do            116.13                  177.81        7.9677876
## 5    wait             31.69                   88.14        3.8440362
## 6     say             87.25                   -1.06       -0.2581009
## 7    love             -5.27                  -47.49       -2.3933277
## 8     let             -7.03                  -47.33       -2.3859696
## 9  record             50.38                    0.39       -0.1914182
## 10   look             77.31                   90.29        3.9429105
## 11    kid             45.38                   67.00        2.8718481
## 12  think             -0.69                  -33.15       -1.7338585
## 13   play             34.97                   58.38        2.4754308
## 14  stand              8.30                   38.45        1.5588883
## 15    see            -57.09                  -70.78       -3.4643901
## 16  smile              1.32                   30.16        1.1776470
## 17  study             10.67                   36.55        1.4715109
## 18   come             53.64                   10.95        0.2942160
##     BNC_scaled      dev
## 1   0.79950154 9.467230
## 2  10.91254204 7.392262
## 3  15.10712737 5.290619
## 4   3.63507810 4.332710
## 5   0.85449414 2.989542
## 6   2.68406833 2.942169
## 7  -0.36258762 2.030740
## 8  -0.42054389 1.965426
## 9   1.46995025 1.661368
## 10  2.35674710 1.586163
## 11  1.30530174 1.566546
## 12 -0.21176958 1.522089
## 13  0.96250356 1.512927
## 14  0.08426843 1.474620
## 15 -2.06900473 1.395385
## 16 -0.14558089 1.323228
## 17  0.16231182 1.309199
## 18  1.57730108 1.283085
d_not_to_be %>%
  ggplot(aes(x = TxBSpoken_scaled, y = BNC_scaled, label = lemma)) +
  coord_fixed() +
  labs(x = "standardised CL (Textbook Conversation)", y = "standardised CL (Spoken BNC2014 sample)") +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
  geom_point(shape = "circle filled", colour = "darkred", fill = "darkred", alpha = 0.5) +
  geom_text_repel(data = d_extremes, min.segment.length = 1, segment.alpha = 0.6, max.overlaps = Inf, box.padding = 0.2, direction="both", arrow = arrow(length = unit(0.015, "npc"))) +
  #geom_label_repel(data = d_extremes, hjust = "inward") + 
  theme_light()

#ggsave(here("DCA_scaled_comparison_Conversation.svg"), dpi = 300, width = 22, height = 16, units = "cm")

Fiction DCAs

glimpse(FictionProg)
lemmas_with_phrasal_verbs <- sort(unique(FictionProg$Lemma))
lemmas_with_phrasal_verbs
# For now, I will be ignoring phrasal verbs and only looking at the verb lemmas
lemmas_short <- sort(unique(stringr::word(lemmas_with_phrasal_verbs,1)))
lemmas_short
#saveRDS(lemmas_short, file = here("Fiction_lemmas_short.rds")) # 28 August 2019

## Prepare prog verb counts ##
glimpse(FictionProg)
prog_lemmas <- FictionProg[,c("Lemma", "Corpus")]
prog_lemmas$Lemma <- stringr::word(prog_lemmas$Lemma,1)
head(prog_lemmas)
tail(prog_lemmas)

levels(prog_lemmas$Corpus) <- c("SpokenBNC_Prog", "TxBSpoken_Prog") # Rename factor levels 
prog_lemmas <- prog_lemmas[,c(2,1)]
names(prog_lemmas)[1] <- "corpus"
names(prog_lemmas)[2] <- "lemma"

## Prepare non-prog verb counts ##

verb_counts <-readRDS(file = here("FVP_Nar_counts.rds"))
head(verb_counts)
verb_counts <- verb_counts[,c(1,2,4)] # Get rid of total YF verb count

verb_counts$YF_count_sample <- round(verb_counts$YF_count_sample, 0)
verb_counts <- verb_counts[verb_counts$TxBNar_count!=0 & verb_counts$YF_count_sample!=0,] # Drop lemmas with a zero frequency in either corpus
head(verb_counts)

## Subtract progressives from total FVP counts

FictionProg$Lemma_short <- stringr::word(FictionProg$Lemma,1)
prog_TxB <- as.data.frame(table(FictionProg[FictionProg$Corpus=="Textbook Fiction", "Lemma_short"]))
names(prog_TxB)[1] <- "lemma"
names(prog_TxB)[2] <- "TxB_Nar_prog_count"
head(prog_TxB)
prog_YF <- as.data.frame(table(FictionProg[FictionProg$Corpus=="Youth Fiction Sampled", "Lemma_short"]))
names(prog_YF)[1] <- "lemma"
names(prog_YF)[2] <- "YF_prog_count"
head(prog_YF)

counttest <- merge(verb_counts, prog_TxB, by = 'lemma', all = TRUE)
counts <- merge(counttest, prog_YF, by = 'lemma', all = TRUE)
counts[is.na(counts)] <- 0
head(counts); tail(counts)
counts$YF_non_prog = counts$YF_count_sample - counts$YF_prog_count
counts$TxB_Nar_non_prog = counts$TxBNar_count - counts$TxB_Nar_prog_count
# Sampling (of the Youth Fiction corpus) and tagging errors sometimes lead to negative values, so here we change all negative values to zero
summary(counts$YF_non_prog)
counts[counts$YF_non_prog<0,c(1,5,6)]
sum(counts$YF_non_prog<0) # 116 lemma types "lost"

lost <- counts[counts$YF_non_prog<0,]
sum(lost$YF_non_prog) # 126 "lost" FVP tokens

counts$YF_non_prog[counts$YF_non_prog<0] <- 0

summary(counts$TxB_Nar_non_prog)
counts[counts$TxB_Nar_non_prog<0,c(1,4,7)]
sum(counts$TxB_Nar_non_prog<0) # 16 types

lost2 <- counts[counts$TxB_Nar_non_prog<0,]
sum(lost2$TxB_Nar_non_prog) # 20 tokens

counts$TxB_Nar_non_prog[counts$TxB_Nar_non_prog<0] <- 0

counts <- counts[,c("lemma","TxB_Nar_prog_count", "TxB_Nar_non_prog", "YF_prog_count", "YF_non_prog")]
head(counts); tail(counts)

sum(counts$TxB_Nar_prog_count) + sum(counts$TxB_Nar_non_prog) # 31563 total FVPs
sum(counts$YF_prog_count) + sum(counts$YF_non_prog) # 31470 total FVPs
(sum(counts$TxB_Nar_prog_count) + sum(counts$TxB_Nar_non_prog))- (sum(counts$YF_prog_count) + sum(counts$YF_non_prog)) # 93 difference in total FVPs

# This file can be split into two to be used for two separate DCA analyses à la 2b.csv
#saveRDS(counts, here("counts4DCA_2b_Nar.rds") # Last saved on 28 August 2019
counts <- readRDS(file = here("counts4DCA_2b_Nar.rds")


### Files for DCA analysis à la 2b.csv
# Textbook Narrative
TxBNarDCA <- counts[,c("lemma", "TxB_Nar_prog_count", "TxB_Nar_non_prog")]
head(TxBNarDCA); tail(TxBNarDCA)
nrow(TxBNarDCA)
TxBNarDCA <- subset(TxBNarDCA, TxB_Nar_prog_count > 0 | TxB_Nar_non_prog > 0) # Drop lemmas with zero occurrences in Textbook Fiction
sum(TxBNarDCA$TxB_Nar_prog_count) + sum(TxBNarDCA$TxB_Nar_non_prog)
#write.table(TxBNarDCA, file = here("TxB_Nar_DCA.csv"), sep = "\t") # Last saved on 28 August 2019

# Youth Fiction sample
YF_DCA <- counts[,c("lemma", "YF_prog_count", "YF_non_prog")]
nrow(YF_DCA)
YF_DCA <- subset(YF_DCA, YF_non_prog > 0 | YF_prog_count > 0) # Drop lemmas with zero occurrences in the Youth Fiction sample
head(YF_DCA); tail(YF_DCA)
(sum(YF_DCA$YF_prog_count) + sum(YF_DCA$YF_non_prog)) - (sum(TxBNarDCA$TxB_Nar_prog_count) + sum(TxBNarDCA$TxB_Nar_non_prog)) # Difference in total number of FVPs = 93 more in TxB Narrative than in Youth Fiction
#write.table(YF_DCA, file = here("YF_DCA.csv"), sep = "\t") # Last saved on 28 August 2019

#source("/Users/Elen/Documents/PhD/Statistics_R/Gries_Collexemes/coll.analysis_mpfr.r")
#coll.analysis()
# Textbook Fiction
counts <- read.csv(file = here("TxB_Nar_DCA.csv"), sep = "\t", stringsAsFactors = TRUE)
head(counts)
##     lemma TxB_Nar_prog_count TxB_Nar_non_prog
## 1 abandon                  0                1
## 2  accord                  0                2
## 4     act                  2               13
## 5     add                  0               30
## 6 address                  1                8
## 7 advance                  0                1
DCA_TxB <- collex.dist(counts) # default association measure with this package is log-likelihood
head(DCA_TxB); tail(DCA_TxB)
##   COLLEX O.CXN1 E.CXN1 O.CXN2 E.CXN2              ASSOC COLL.STR.LOGL SIGNIF
## 1    sit     62   11.6    180  230.4 TxB_Nar_prog_count     120.47601  *****
## 2   wear     37    4.1     49   81.9 TxB_Nar_prog_count     112.64951  *****
## 3   talk     50    8.2    120  161.8 TxB_Nar_prog_count     110.60440  *****
## 4     do     82   24.6    429  486.4 TxB_Nar_prog_count      92.25624  *****
## 5   wait     41    8.2    130  162.8 TxB_Nar_prog_count      74.07346  *****
## 6  stand     35    7.8    128  155.2 TxB_Nar_prog_count      56.02529  *****
##   SHARED
## 1      Y
## 2      Y
## 3      Y
## 4      Y
## 5      Y
## 6      Y
##     COLLEX O.CXN1 E.CXN1 O.CXN2 E.CXN2            ASSOC COLL.STR.LOGL SIGNIF
## 419    ask      3   24.3    503  481.7 TxB_Nar_non_prog      31.33177  *****
## 420   want      1   28.3    588  560.7 TxB_Nar_non_prog      49.76364  *****
## 421    see      4   38.4    794  759.6 TxB_Nar_non_prog      52.98648  *****
## 422   know      0   28.9    601  572.1 TxB_Nar_non_prog      59.79114  *****
## 423    say     24   92.8   1907 1838.2 TxB_Nar_non_prog      78.68266  *****
## 424     be      3  347.5   7227 6882.5 TxB_Nar_non_prog     775.34497  *****
##     SHARED
## 419      Y
## 420      Y
## 421      Y
## 422      N
## 423      Y
## 424      Y
library(flextable)

DCA_TxB %>% 
  rename(lemma = COLLEX, G2 = COLL.STR.LOGL) %>% 
  mutate(G2 = ifelse(ASSOC=="TxB_Nar_non_prog", -G2, G2)) %>% 
  mutate(G2 = round(G2, 1)) %>% 
  #mutate(across(.cols = c(E.CXN1, O.CXN2, E.CXN2), round, 0)) %>% 
  mutate(`O:Eprog` = as.character(paste0(O.CXN1,":",E.CXN1))) %>% 
  mutate(`O:Enon-prog` = as.character(paste0(O.CXN2,":",E.CXN2))) %>% 
  select(lemma, `O:Eprog`, `O:Enon-prog`, G2) %>% 
  filter(abs(G2) > 30) -> resultsTxB

nrow(resultsTxB)
## [1] 15
resultsTxB
##    lemma     O:Eprog O:Enon-prog     G2
## 1    sit     62:11.6   180:230.4  120.5
## 2   wear      37:4.1     49:81.9  112.6
## 3   talk      50:8.2   120:161.8  110.6
## 4     do     82:24.6   429:486.4   92.3
## 5   wait      41:8.2   130:162.8   74.1
## 6  stand      35:7.8   128:155.2   56.0
## 7   walk     43:13.7   243:272.3   43.4
## 8  shine       9:0.7      5:13.3   36.9
## 9    try     34:10.9   193:216.1   34.0
## 10   ask      3:24.3   503:481.7  -31.3
## 11  want      1:28.3   588:560.7  -49.8
## 12   see      4:38.4   794:759.6  -53.0
## 13  know      0:28.9   601:572.1  -59.8
## 14   say     24:92.8 1907:1838.2  -78.7
## 15    be     3:347.5 7227:6882.5 -775.3
flextable(resultsTxB) %>% 
  compose(part = "header", j = "O:Enon-prog", value = as_paragraph("O:E", as_sub("non-prog"))) %>% 
  compose(part = "header", j = "O:Eprog", value = as_paragraph("O:E", as_sub("prog"))) %>% 
  compose(part = "header", j = "G2", value = as_paragraph("G", as_sup("2"))) %>% 
  theme_booktabs() #%>% 
  #save_as_docx(path = here("DCA_TxB_Nar.docx"))

# Reference Fiction

YF_DCA <- read.csv(file = here("YF_DCA.csv"), sep = "\t", stringsAsFactors = TRUE)
YF_DCA <- collex.dist(YF_DCA) # Remember that the default AM is log-likelihood
head(YF_DCA); tail(YF_DCA)
##   COLLEX O.CXN1 E.CXN1 O.CXN2 E.CXN2         ASSOC COLL.STR.LOGL SIGNIF SHARED
## 1   talk     35    5.8     85  114.2 YF_prog_count      76.38402  *****      Y
## 2   work     23    4.6     73   91.4 YF_prog_count      41.21932  *****      Y
## 3   joke      6    0.3      0    5.7 YF_prog_count      36.41022  *****      N
## 4    try     36   11.6    204  228.4 YF_prog_count      36.00128  *****      Y
## 5  stand     31   10.0    176  197.0 YF_prog_count      30.87469  *****      Y
## 6   plan      8    0.7      6   13.3 YF_prog_count      30.02547  *****      Y
##     COLLEX O.CXN1 E.CXN1 O.CXN2 E.CXN2       ASSOC COLL.STR.LOGL SIGNIF SHARED
## 516   want      1   18.0    372  355.0 YF_non_prog      29.18298  *****      Y
## 517    see      7   33.1    679  652.9 YF_non_prog      31.90511  *****      Y
## 518   have     16   51.7   1056 1020.3 YF_non_prog      35.97288  *****      Y
## 519   know      1   32.7    678  646.3 YF_non_prog      58.72625  *****      Y
## 520    say     32  122.3   2506 2415.7 YF_non_prog     104.22061  *****      Y
## 521     be     15  334.3   6919 6599.7 YF_non_prog     644.29336  *****      Y
YF_DCA %>% 
  rename(lemma = COLLEX, G2 = COLL.STR.LOGL) %>% 
  mutate(G2 = ifelse(ASSOC=="YF_non_prog", -G2, G2)) %>% 
  mutate(G2 = round(G2, 1)) %>% 
  mutate(O.CXN2 = round(O.CXN2, 0)) %>% 
  mutate(`O:Eprog` = as.character(paste0(O.CXN1,":",E.CXN1))) %>% 
  mutate(`O:Enon-prog` = as.character(paste0(O.CXN2,":",E.CXN2))) %>% 
  select(lemma, `O:Eprog`, `O:Enon-prog`, G2) %>% 
  filter(abs(G2) > 23.5) -> resultsYF

nrow(resultsYF)
## [1] 15
resultsYF
##    lemma     O:Eprog O:Enon-prog     G2
## 1   talk      35:5.8    85:114.2   76.4
## 2   work      23:4.6     73:91.4   41.2
## 3   joke       6:0.3       0:5.7   36.4
## 4    try     36:11.6   204:228.4   36.0
## 5  stand       31:10     176:197   30.9
## 6   plan       8:0.7      6:13.3   30.0
## 7    lie      19:4.6     77:91.4   27.5
## 8    sit      27:9.4   167:184.6   23.9
## 9   find      0:12.5   259:246.5  -25.7
## 10  want        1:18     372:355  -29.2
## 11   see      7:33.1   679:652.9  -31.9
## 12  have     16:51.7 1056:1020.3  -36.0
## 13  know      1:32.7   678:646.3  -58.7
## 14   say    32:122.3 2506:2415.7 -104.2
## 15    be    15:334.3 6919:6599.7 -644.3
flextable(resultsYF) %>% 
  compose(part = "header", j = "O:Enon-prog", value = as_paragraph("O:E", as_sub("non-prog"))) %>% 
  compose(part = "header", j = "O:Eprog", value = as_paragraph("O:E", as_sub("prog"))) %>% 
  compose(part = "header", j = "G2", value = as_paragraph("G", as_sup("2"))) %>%
  theme_booktabs() #%>% 
  #save_as_docx(path = here("DCA_FY.docx"))
YF_collstrength <- read.csv(file = here("YF_DCA_collstrength.csv"), header = TRUE, sep=",")
glimpse(YF_collstrength)
## Rows: 521
## Columns: 9
## $ words                     <chr> "be", "say", "know", "have", "see", "want", …
## $ obs.freq.1                <int> 15, 32, 1, 16, 7, 1, 0, 0, 0, 3, 0, 0, 0, 4,…
## $ obs.freq.2                <int> 6919, 2506, 678, 1056, 679, 372, 259, 203, 1…
## $ exp.freq.1                <dbl> 334.25, 122.34, 32.73, 51.68, 33.07, 17.98, …
## $ exp.freq.2                <dbl> 6599.75, 2415.66, 646.27, 1020.32, 652.93, 3…
## $ pref.occur                <chr> "X.YF_non_prog.", "X.YF_non_prog.", "X.YF_no…
## $ delta.p.constr1.cues.word <dbl> -0.22, -0.06, -0.02, -0.02, -0.02, -0.01, -0…
## $ delta.p.word.cues.constr1 <dbl> -0.06, -0.04, -0.05, -0.03, -0.04, -0.05, -0…
## $ coll.strength             <dbl> -644.29, -104.22, -58.73, -35.97, -31.91, -2…
names(YF_collstrength)[1] <- "lemma"
names(YF_collstrength)[9] <- "coll.strength.YF"
subset(YF_collstrength, lemma=="know")
##   lemma obs.freq.1 obs.freq.2 exp.freq.1 exp.freq.2     pref.occur
## 3  know          1        678      32.73     646.27 X.YF_non_prog.
##   delta.p.constr1.cues.word delta.p.word.cues.constr1 coll.strength.YF
## 3                     -0.02                     -0.05           -58.73
YF_collstrength <- na.omit(YF_collstrength) # Exclude lemmas with no data
YF_collstrength <- YF_collstrength[,c(1,9)]
head(YF_collstrength); tail(YF_collstrength)
##   lemma coll.strength.YF
## 1    be          -644.29
## 2   say          -104.22
## 3  know           -58.73
## 4  have           -35.97
## 5   see           -31.91
## 6  want           -29.18
##        lemma coll.strength.YF
## 516      rub             0.01
## 517   spread             0.01
## 518  breathe             0.00
## 519 consider             0.00
## 520     rest             0.00
## 521  stretch             0.00
TxBNar_collstrength <- read.csv(file = here("TxB_Nar_DCA_collstrength.csv"), header = TRUE, sep = ",", stringsAsFactors = TRUE)
glimpse(TxBNar_collstrength)
## Rows: 424
## Columns: 9
## $ words                     <fct> be, say, know, see, want, ask, like, find, h…
## $ obs.freq.1                <int> 3, 24, 0, 4, 1, 3, 0, 2, 2, 1, 0, 1, 2, 28, …
## $ obs.freq.2                <int> 7227, 1907, 601, 794, 588, 503, 209, 294, 29…
## $ exp.freq.1                <dbl> 347.49, 92.81, 28.89, 38.35, 28.31, 24.32, 1…
## $ exp.freq.2                <dbl> 6882.51, 1838.19, 572.11, 759.65, 560.69, 48…
## $ pref.occur                <fct> X.TxB_Nar_non_prog., X.TxB_Nar_non_prog., X.…
## $ delta.p.constr1.cues.word <dbl> -0.24, -0.05, -0.02, -0.02, -0.02, -0.01, -0…
## $ delta.p.word.cues.constr1 <dbl> -0.06, -0.04, -0.05, -0.04, -0.05, -0.04, -0…
## $ coll.strength             <dbl> -775.34, -78.68, -59.79, -52.99, -49.76, -31…
names(TxBNar_collstrength)[1] <- "lemma"
names(TxBNar_collstrength)[9] <- "coll.strength.TxBNar"
subset(TxBNar_collstrength, lemma=="know")
##   lemma obs.freq.1 obs.freq.2 exp.freq.1 exp.freq.2          pref.occur
## 3  know          0        601      28.89     572.11 X.TxB_Nar_non_prog.
##   delta.p.constr1.cues.word delta.p.word.cues.constr1 coll.strength.TxBNar
## 3                     -0.02                     -0.05               -59.79
TxBNar_collstrength <- na.omit(TxBNar_collstrength) # Exclude lemmas with no data
TxBNar_collstrength <- TxBNar_collstrength[,c(1,9)]
head(TxBNar_collstrength); tail(TxBNar_collstrength)
##   lemma coll.strength.TxBNar
## 1    be              -775.34
## 2   say               -78.68
## 3  know               -59.79
## 4   see               -52.99
## 5  want               -49.76
## 6   ask               -31.33
##       lemma coll.strength.TxBNar
## 419    stay                 0.05
## 420    send                 0.03
## 421    head                 0.02
## 422    sell                 0.02
## 423 realise                 0.00
## 424    tell                 0.00
coll_strengths <- merge(YF_collstrength, TxBNar_collstrength, by = 'lemma', all = FALSE) # Merge tables, to only look at differences in coll.strength values

#saveRDS(coll_strengths, file = here("coll_strengths_Fiction.rds")) 

coll_strengths %>%
  ggplot(aes(x = coll.strength.TxBNar, y = coll.strength.YF)) +
  labs(x = "Textbook Fiction", y = "Youth Fiction") +
  geom_point(shape = "circle filled", fill = "grey") -> fig

fig

# We have one very extreme case: for BE.
coll_strengths %>%
  filter(coll.strength.YF < -500)
##   lemma coll.strength.YF coll.strength.TxBNar
## 1    be          -644.29              -775.34
# Let's exclude BE for visualisation purposes then.
coll_strengths %>%
  filter(lemma != "be") ->
  d_not_to_be

fig %+% d_not_to_be

d_not_to_be %>%
  summarise(corr_TxB_YF = cor(coll.strength.TxBNar, coll.strength.YF))
##   corr_TxB_YF
## 1   0.6807511
cor.test(coll_strengths$coll.strength.TxBNar, coll_strengths$coll.strength.YF, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  coll_strengths$coll.strength.TxBNar and coll_strengths$coll.strength.YF
## t = 74.304, df = 410, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9574529 0.9709268
## sample estimates:
##       cor 
## 0.9648178
d_not_to_be %>%
  mutate(
    TxBNar_scaled = as.vector(scale(coll.strength.TxBNar)),
    YF_scaled = as.vector(scale(coll.strength.YF)),
    dev = abs(YF_scaled - TxBNar_scaled)
  ) ->
  d_not_to_be

# We can now identify verbs that are extreme by this measure of deviance.

d_not_to_be %>%
  filter(dev > 1) %>% 
  select(lemma, coll.strength.TxBNar, coll.strength.YF, dev) %>% 
  arrange(-dev)
##     lemma coll.strength.TxBNar coll.strength.YF      dev
## 1    wear               112.65            18.31 5.944411
## 2     sit               120.48            23.92 5.920159
## 3     say               -78.68          -104.22 5.068992
## 4      do                92.26            13.61 5.008025
## 5    walk                43.44            -0.03 3.006782
## 6    have               -11.98           -35.97 2.798440
## 7    wait                74.07            23.18 2.784373
## 8    joke                15.03            36.41 2.634290
## 9    plan                 7.48            30.03 2.512997
## 10    lie                 5.48            27.46 2.392106
## 11   work                25.82            41.22 2.373084
## 12   hunt                -0.20            22.16 2.250397
## 13  shine                36.93             6.67 1.880603
## 14  panic                -0.59            18.20 1.877920
## 15   know               -59.79           -58.73 1.787022
## 16   hold                 1.50            18.23 1.736370
## 17    kid                 3.40            18.20 1.601910
## 18  chase                 5.71            18.37 1.459263
## 19   find               -17.23           -25.70 1.399307
## 20    run                 6.00            17.73 1.374644
## 21   look                21.71             1.73 1.326064
## 22 starve                 3.40            15.00 1.279118
## 23    try                34.05            36.00 1.277215
## 24   mean                -2.96           -14.66 1.272807
## 25 suffer                19.39             0.73 1.266449
## 26 stream                 1.87            13.80 1.263909
## 27   rain                25.17             4.89 1.246653
## 28 accord                -0.20            12.13 1.238645
## 29 travel                 2.32            12.98 1.150065
## 30  visit                14.50            -1.19 1.121857
## 31    let               -13.49           -20.13 1.096163
## 32  guard                -0.10            10.18 1.035026
## 33  enjoy                15.46             0.61 1.006694
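As a sanity check, the deviance measure can be recomputed by hand for a single lemma; a minimal sketch using "wear" (the most extreme case in the table above), where dev is simply the absolute difference between the two standardised collostruction strengths, i.e. abs(1.6948 - 7.6392) = approx. 5.944:

# Sketch: recompute the deviance for one lemma to illustrate the measure
d_not_to_be %>%
  filter(lemma == "wear") %>%
  mutate(dev_check = abs(YF_scaled - TxBNar_scaled)) %>%
  select(lemma, TxBNar_scaled, YF_scaled, dev, dev_check)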
d_not_to_be %>%
  filter(dev > 2.5) %>% 
  arrange(-dev) ->
  d_extremes

d_extremes
##   lemma coll.strength.YF coll.strength.TxBNar TxBNar_scaled   YF_scaled
## 1  wear            18.31               112.65     7.6392496   1.6948387
## 2   sit            23.92               120.48     8.1808933   2.2607340
## 3   say          -104.22               -78.68    -5.5960880 -10.6650801
## 4    do            13.61                92.26     6.2287623   1.2207375
## 5  walk            -0.03                43.44     2.8516171  -0.1551646
## 6  have           -35.97               -11.98    -0.9820860  -3.7805256
## 7  wait            23.18                74.07     4.9704610   2.1860883
## 8  joke            36.41                15.03     0.8863428   3.5206327
## 9  plan            30.03                 7.48     0.3640682   2.8770655
##        dev
## 1 5.944411
## 2 5.920159
## 3 5.068992
## 4 5.008025
## 5 3.006782
## 6 2.798440
## 7 2.784373
## 8 2.634290
## 9 2.512997
d_not_to_be %>%
  ggplot(aes(x = TxBNar_scaled, y = YF_scaled, label = lemma)) +
  coord_fixed() +
  labs(x = "standardised CL (Textbook Fiction)", y = "standardised CL (Youth Fiction sample)") +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
  geom_point(shape = "circle filled", colour = "darkred", fill = "darkred", alpha = 0.5) +
  geom_text_repel(data = d_extremes, min.segment.length = 1, segment.alpha = 0.6, max.overlaps = Inf, box.padding = 0.2, direction="both", arrow = arrow(length = unit(0.015, "npc"))) +
  #geom_label_repel(data = d_extremes, hjust = "inward") + 
  theme_light()

#ggsave(here("DCA_scaled_comparison_Fiction.svg"), dpi = 300, width = 22, height = 16, units = "cm")

Multiple testing correction

#source(here("coll.analysis_mpfr.r"))
#coll.analysis()

# From https://rcompanion.org/rcompanion/b_04.html #
pchisq(3.84, df = 1, lower.tail = FALSE) # p-value corresponding to an LLR of 3.84, i.e. the conventional 0.05 significance threshold, but NOT corrected for multiple testing!

qchisq(0.05, df = 1, lower.tail = FALSE) # Work out the LLR-based CL value corresponding to a given p-value (see https://stats.stackexchange.com/questions/250819/r-calculate-p-value-given-chi-squared-and-degrees-of-freedom)

qchisq(p.adjust(0.05, method = "holm", n = 20), df = 1, lower.tail = FALSE) # Holm adjustment for 20 tests; note that, applied to a single p-value, p.adjust() caps the adjusted value at 1 here, so this call evaluates to 0 (see the sketch below for an alternative)
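An alternative way of arriving at a family-wise corrected critical value is to divide the significance level by the number of tests (a Bonferroni-style correction, equivalent to the most stringent Holm step) and convert it to the corresponding chi-squared quantile. A minimal sketch, assuming a hypothetical family of 20 simultaneous tests:

# Sketch: corrected alpha for a hypothetical family of k = 20 tests
k <- 20
alpha_corrected <- 0.05 / k # 0.0025
qchisq(alpha_corrected, df = 1, lower.tail = FALSE) # LLR-based critical value of approx. 9.14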

Packages used in this script

#packages.bib <- sapply(1:length(loadedNamespaces()), function(i) toBibtex(citation(loadedNamespaces()[i])))

knitr::write_bib(c(.packages(), "knitr"), "packages.bib") # Write BibTeX entries for all currently loaded packages (plus knitr) to packages.bib

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] flextable_0.6.4       collostructions_0.2.0 vcd_1.4-8            
##  [4] tidyr_1.1.4           RColorBrewer_1.1-2    lsr_0.5              
##  [7] lattice_0.20-41       gridExtra_2.3         ggrepel_0.9.1        
## [10] ggplot2_3.3.5         here_1.0.1            FactoMineR_2.4       
## [13] dplyr_1.0.7          
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.7           zoo_1.8-9            assertthat_0.2.1    
##  [4] rprojroot_2.0.2      digest_0.6.29        lmtest_0.9-38       
##  [7] utf8_1.2.2           R6_2.5.1             evaluate_0.14       
## [10] highr_0.9            pillar_1.6.4         gdtools_0.2.3       
## [13] rlang_0.4.12         uuid_0.1-4           rstudioapi_0.13     
## [16] data.table_1.14.2    jquerylib_0.1.4      DT_0.20             
## [19] rmarkdown_2.11       labeling_0.4.2       stringr_1.4.0       
## [22] htmlwidgets_1.5.4    munsell_0.5.0        compiler_4.0.3      
## [25] xfun_0.29            systemfonts_1.0.1    pkgconfig_2.0.3     
## [28] base64enc_0.1-3      htmltools_0.5.2      flashClust_1.01-2   
## [31] tidyselect_1.1.1     tibble_3.1.6         fansi_0.5.0         
## [34] crayon_1.4.2         withr_2.4.3          MASS_7.3-53.1       
## [37] leaps_3.1            jsonlite_1.7.2       gtable_0.3.0        
## [40] lifecycle_1.0.1      DBI_1.1.1            magrittr_2.0.1      
## [43] scales_1.1.1         zip_2.1.1            cli_3.1.0           
## [46] stringi_1.7.6        farver_2.1.0         scatterplot3d_0.3-41
## [49] xml2_1.3.3           bslib_0.3.1          ellipsis_0.3.2      
## [52] generics_0.1.1       vctrs_0.3.8          tools_4.0.3         
## [55] glue_1.6.0           officer_0.3.17       purrr_0.3.4         
## [58] fastmap_1.1.0        yaml_2.2.1           colorspace_2.0-2    
## [61] cluster_2.1.1        knitr_1.37           sass_0.4.0
Auguie, Baptiste. 2017. gridExtra: Miscellaneous Functions for "Grid" Graphics. https://CRAN.R-project.org/package=gridExtra.
Flach, Susanne. 2021. Collostructions: An R Implementation for the Family of Collostructional Methods. www.sfla.ch.
Gohel, David. 2021. Flextable: Functions for Tabular Reporting. https://CRAN.R-project.org/package=flextable.
Husson, Francois, Julie Josse, Sebastien Le, and Jeremy Mazet. 2020. FactoMineR: Multivariate Exploratory Data Analysis and Data Mining. http://factominer.free.fr.
Lê, Sébastien, Julie Josse, and François Husson. 2008. “FactoMineR: A Package for Multivariate Analysis.” Journal of Statistical Software 25 (1): 1–18. https://doi.org/10.18637/jss.v025.i01.
Meyer, David, Achim Zeileis, and Kurt Hornik. 2006. “The Strucplot Framework: Visualizing Multi-Way Contingency Tables with Vcd.” Journal of Statistical Software 17 (3): 1–48. https://www.jstatsoft.org/v17/i03/.
———. 2020. Vcd: Visualizing Categorical Data. https://CRAN.R-project.org/package=vcd.
Müller, Kirill. 2020. Here: A Simpler Way to Find Your Files. https://CRAN.R-project.org/package=here.
Navarro, Daniel. 2015. Lsr: Companion to "Learning Statistics with R". http://health.adelaide.edu.au/psychology/ccs/teaching/lsr/.
Neuwirth, Erich. 2014. RColorBrewer: ColorBrewer Palettes. https://CRAN.R-project.org/package=RColorBrewer.
R Core Team. 2020. R: A Language and Environment for Statistical Computing. Vienna, Austria: R Foundation for Statistical Computing. https://www.R-project.org/.
Sarkar, Deepayan. 2008. Lattice: Multivariate Data Visualization with R. New York: Springer. http://lmdvr.r-forge.r-project.org.
———. 2020. Lattice: Trellis Graphics for R. http://lattice.r-forge.r-project.org/.
Slowikowski, Kamil. 2021. Ggrepel: Automatically Position Non-Overlapping Text Labels with ggplot2. https://github.com/slowkow/ggrepel.
Wickham, Hadley. 2016. Ggplot2: Elegant Graphics for Data Analysis. Springer-Verlag New York. https://ggplot2.tidyverse.org.
———. 2021. Tidyr: Tidy Messy Data. https://CRAN.R-project.org/package=tidyr.
Wickham, Hadley, Winston Chang, Lionel Henry, Thomas Lin Pedersen, Kohske Takahashi, Claus Wilke, Kara Woo, Hiroaki Yutani, and Dewey Dunnington. 2021. Ggplot2: Create Elegant Data Visualisations Using the Grammar of Graphics. https://CRAN.R-project.org/package=ggplot2.
Wickham, Hadley, Romain François, Lionel Henry, and Kirill Müller. 2021. Dplyr: A Grammar of Data Manipulation. https://CRAN.R-project.org/package=dplyr.
Xie, Yihui. 2014. “Knitr: A Comprehensive Tool for Reproducible Research in R.” In Implementing Reproducible Computational Research, edited by Victoria Stodden, Friedrich Leisch, and Roger D. Peng. Chapman & Hall/CRC. http://www.crcpress.com/product/isbn/9781466561595.
———. 2015. Dynamic Documents with R and Knitr. 2nd ed. Boca Raton, Florida: Chapman & Hall/CRC. https://yihui.org/knitr/.
———. 2021. Knitr: A General-Purpose Package for Dynamic Report Generation in R. https://yihui.org/knitr/.
Zeileis, Achim, David Meyer, and Kurt Hornik. 2007. “Residual-Based Shadings for Visualizing (Conditional) Independence.” Journal of Computational and Graphical Statistics 16 (3): 507–25.