This script is part of the Online Appendix to my PhD thesis.
Please cite as: Le Foll, Elen. 2022. Textbook English: A Corpus-Based Analysis of the Language of EFL textbooks used in Secondary Schools in France, Germany and Spain. PhD thesis. Osnabrück University.
For more information, see: https://elenlefoll.github.io/TextbookEnglish/
Please note that the plot dimensions in this notebook have been optimised for the print version of the thesis.
Built with R 4.0.3
knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
library(here)
library(ggplot2)
library(gridExtra)
library(lattice)
library(lsr)
library(RColorBrewer)
library(tidyr)
library(vcd)
annotation_file <- read.csv(here("BNCspoken_prog_conc_anno.csv"), sep = "\t", header = TRUE, na.strings="", stringsAsFactors = TRUE)
glimpse(annotation_file)
## Rows: 4,099
## Columns: 18
## $ SE_Filename <fct> filename#0, filename#0, filename#0, filename#2, filenam…
## $ BNC_Filename <fct> SAAB, SAAB, SAAB, SAA3, SAA3, SAA3, SAA3, SABT, SAB7, S…
## $ Rand. <fct> done, done, done, done, done, done, done, done, done, d…
## $ Concordance1 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Concor.dance2 <fct> be inviting, were having, 'm not going, 's not going, i…
## $ Concordance3 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Lemma <fct> invite, have, NA, NA, NA, NA, play, come, say, NA, NA, …
## $ X. <fct> NA, NA, NA, NA, NA, NA, c, NA, NA, NA, NA, NA, NA, c, N…
## $ NOT <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Tense <fct> modal, past, GOING TO, GOING TO, GOING TO, GOING TO, pr…
## $ Voice <fct> A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A…
## $ X..1 <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Time.reference <fct> future, future, present, present, present, present, pre…
## $ Continuous <fct> no, yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, y…
## $ Repeated <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no,…
## $ Extra.function <fct> NA, other interesting function, NA, NA, NA, NA, NA, NA,…
## $ Comment <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Annotator <fct> E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E…
# Select only annotated concordance lines
annotated <- annotation_file[71:3380,] # So far, I have only annotated up to line 4006. This selection ensures that we have the same number of progressives as in the Textbook Conversation subcorpus (namely 2,423 progressives)
# Extract only progressives
BNCprog <- annotated[annotated$Tense != "NOT progressive" & annotated$Tense != "GOING TO" & annotated$Tense != "catenative", ]
BNCprog$Tense <- droplevels(BNCprog$Tense) # Drop unused levels
BNCprog$Time.reference <- droplevels(BNCprog$Time.reference) # Drop unused levels
BNCprog$Tense <- factor(BNCprog$Tense, levels = c("past", "perfect", "present", "modal")) # Re-order levels
BNCprog$Time.reference <- factor(BNCprog$Time.reference, levels = c("past", "past/present", "present", "future", "general", "hypothetical", "unclear"))
# Number of progressives extracted
nrow(BNCprog)
## [1] 2423
glimpse(BNCprog)
## Rows: 2,423
## Columns: 18
## $ SE_Filename <fct> filename#19, filename#19, filename#19, filename#19, fil…
## $ BNC_Filename <fct> SAUR, SAUR, SAUR, SAUR, SAUR, SAUR, SAUR, SAUR, SAUR, S…
## $ Rand. <fct> done, done, done, done, done, done, done, done, done, d…
## $ Concordance1 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Concor.dance2 <fct> was mostly travelling, be lying, 're getting, is piloti…
## $ Concordance3 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Lemma <fct> travel, lie, attack, pilot, have, lose, be, sabotage, p…
## $ X. <fct> NA, NA, c, c, c, c, NA, NA, c, c, NA, c, c, NA, NA, NA,…
## $ NOT <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, n, n, NA, NA, N…
## $ Tense <fct> past, modal, present, present, present, present, presen…
## $ Voice <fct> A, A, P, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A…
## $ X..1 <fct> NA, NA, NA, NA, NA, NA, q, q, NA, NA, NA, NA, NA, q, NA…
## $ Time.reference <fct> past, present, present, present, present, past/present,…
## $ Continuous <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, …
## $ Repeated <fct> no, unclear, no, no, no, no, no, no, yes, no, no, no, u…
## $ Extra.function <fct> NA, NA, NA, NA, NA, NA, emphasis/shock, NA, NA, NA, NA,…
## $ Comment <fct> NA, NA, get passive, NA, NA, gradual change, NA, NA, NA…
## $ Annotator <fct> E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E…
# Before saving the annotation file as .csv, make sure to have deleted all trailing inverted commas, as they serve as string delimiters. This can easily be done with a regex search and replace in Calc: "$ --> nothing.
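# A minimal in-R alternative to the manual Calc clean-up (sketch only; the raw file name below is hypothetical):
# raw <- readLines(here("TxB_spoken_prog_conc_raw.csv"))
# raw <- gsub("\"+$", "", raw) # strip trailing inverted commas at line ends
# writeLines(raw, here("TxB_spoken_prog_conc_anno.csv"))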
annotation_file_TxB <- read.csv(here("TxB_spoken_prog_conc_anno.csv"), sep = "\t", header = TRUE, na.strings="", stringsAsFactors = TRUE)
glimpse(annotation_file_TxB)
## Rows: 3,380
## Columns: 18
## $ Register <fct> spoken, spoken, spoken, spoken, spoken, spoken, spoken,…
## $ Level <fct> A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A…
## $ Textbook <fct> Hi there, Hi there, Hi there, Hi there, Hi there, Hi th…
## $ Concordance1 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Concor.dance2 <fct> "is swimming", "'m eating", "'m eating", "'m cooking", …
## $ Concordance3 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Lemma <fct> NA, eat, eat, cook, eat, dance, joke, teach, come, leav…
## $ X. <fct> NA, c, c, c, NA, NA, c, c, NA, c, c, c, c, NA, NA, c, c…
## $ NOT <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Tense <fct> NOT progressive, present, present, present, present, pr…
## $ Voice <fct> A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A…
## $ X..1 <fct> NA, NA, NA, NA, NA, NA, NA, NA, q, NA, q, NA, NA, NA, N…
## $ Time.reference <fct> present, present, present, present, present, present, p…
## $ Continuous <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, …
## $ Repeated <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no,…
## $ Extra.function <fct> NA, NA, NA, NA, NA, NA, emphasis/shock, NA, NA, NA, NA,…
## $ Comment <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Annotator <fct> E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E…
# Select only annotated concordance lines
annotated_TxB <- annotation_file_TxB
# Extract only progressives
TxBprog <- annotated_TxB[annotated_TxB$Tense != "NOT progressive" & annotated_TxB$Tense != "GOING TO" & annotated_TxB$Tense != "catenative" , ]
TxBprog$Tense <- droplevels(TxBprog$Tense) # Drop unused levels
TxBprog$Time.reference <- droplevels(TxBprog$Time.reference) # Drop unused levels
TxBprog$Tense <- factor(TxBprog$Tense, levels = c("past", "perfect", "present", "modal")) # Re-order levels
TxBprog$Time.reference <- factor(TxBprog$Time.reference, levels = c("past", "past/present", "present", "future", "general", "hypothetical", "unclear"))
# Number of progressives extracted
nrow(TxBprog)
## [1] 2423
# Ratio of false positives in progressive query
1 - nrow(TxBprog)/nrow(annotation_file_TxB)
## [1] 0.2831361
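# For comparison: ratio of false positives in the progressive query for the annotated BNC sample (same calculation as above)
1 - nrow(BNCprog)/nrow(annotated)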
prog <- bind_rows("Spoken BNC2014 sample" = BNCprog, "Textbook Conversation" = TxBprog, .id = "Corpus")
glimpse(prog)
## Rows: 4,846
## Columns: 22
## $ Corpus <chr> "Spoken BNC2014 sample", "Spoken BNC2014 sample", "Spok…
## $ SE_Filename <fct> filename#19, filename#19, filename#19, filename#19, fil…
## $ BNC_Filename <fct> SAUR, SAUR, SAUR, SAUR, SAUR, SAUR, SAUR, SAUR, SAUR, S…
## $ Rand. <fct> done, done, done, done, done, done, done, done, done, d…
## $ Concordance1 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Concor.dance2 <fct> "was mostly travelling", "be lying", "'re getting", "is…
## $ Concordance3 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Lemma <fct> travel, lie, attack, pilot, have, lose, be, sabotage, p…
## $ X. <fct> NA, NA, c, c, c, c, NA, NA, c, c, NA, c, c, NA, NA, NA,…
## $ NOT <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, n, n, NA, NA, N…
## $ Tense <fct> past, modal, present, present, present, present, presen…
## $ Voice <fct> A, A, P, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A…
## $ X..1 <fct> NA, NA, NA, NA, NA, NA, q, q, NA, NA, NA, NA, NA, q, NA…
## $ Time.reference <fct> past, present, present, present, present, past/present,…
## $ Continuous <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, …
## $ Repeated <fct> no, unclear, no, no, no, no, no, no, yes, no, no, no, u…
## $ Extra.function <fct> NA, NA, NA, NA, NA, NA, emphasis/shock, NA, NA, NA, NA,…
## $ Comment <fct> NA, NA, "get passive", NA, NA, "gradual change", NA, NA…
## $ Annotator <fct> E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E…
## $ Register <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Level <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Textbook <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
prog <- prog %>% dplyr::rename(Contraction = X., Question = X..1, Negation = NOT) # Rename variables
glimpse(prog)
## Rows: 4,846
## Columns: 22
## $ Corpus <chr> "Spoken BNC2014 sample", "Spoken BNC2014 sample", "Spok…
## $ SE_Filename <fct> filename#19, filename#19, filename#19, filename#19, fil…
## $ BNC_Filename <fct> SAUR, SAUR, SAUR, SAUR, SAUR, SAUR, SAUR, SAUR, SAUR, S…
## $ Rand. <fct> done, done, done, done, done, done, done, done, done, d…
## $ Concordance1 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Concor.dance2 <fct> "was mostly travelling", "be lying", "'re getting", "is…
## $ Concordance3 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Lemma <fct> travel, lie, attack, pilot, have, lose, be, sabotage, p…
## $ Contraction <fct> NA, NA, c, c, c, c, NA, NA, c, c, NA, c, c, NA, NA, NA,…
## $ Negation <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, n, n, NA, NA, N…
## $ Tense <fct> past, modal, present, present, present, present, presen…
## $ Voice <fct> A, A, P, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A…
## $ Question <fct> NA, NA, NA, NA, NA, NA, q, q, NA, NA, NA, NA, NA, q, NA…
## $ Time.reference <fct> past, present, present, present, present, past/present,…
## $ Continuous <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, …
## $ Repeated <fct> no, unclear, no, no, no, no, no, no, yes, no, no, no, u…
## $ Extra.function <fct> NA, NA, NA, NA, NA, NA, emphasis/shock, NA, NA, NA, NA,…
## $ Comment <fct> NA, NA, "get passive", NA, NA, "gradual change", NA, NA…
## $ Annotator <fct> E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E…
## $ Register <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Level <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Textbook <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
# Replace NA with meaningful levels
summary(as.factor(prog$Negation))
## n NA's
## 280 4566
prog$Negation <- tidyr::replace_na(as.character(prog$Negation), "positive") # positive
summary(as.factor(prog$Contraction))
## c NA's
## 2410 2436
prog$Contraction <- tidyr::replace_na(as.character(prog$Contraction), "full") # full
summary(as.factor(prog$Question))
## q unclear NA's
## 710 4 4132
prog$Question <- tidyr::replace_na(as.character(prog$Question), "statement") # statement
summary(as.factor(prog$Level))
## A B C D E NA's
## 316 565 622 577 343 2423
prog$Level <- tidyr::replace_na(as.character(prog$Level), "Spoken BNC2014 sample")
prog$Textbook <- tidyr::replace_na(as.character(prog$Textbook), "Spoken BNC2014 sample")
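# Equivalently, each factor's NAs could be recoded in one step with forcats (optional sketch, not run here):
# prog$Negation <- forcats::fct_explicit_na(factor(prog$Negation), na_level = "positive")
# prog$Contraction <- forcats::fct_explicit_na(factor(prog$Contraction), na_level = "full")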
# Make the following variables into factors
factor_cols <- c("Corpus", "Negation", "Contraction", "Question", "Repeated", "Lemma", "Voice", "Level")
prog[factor_cols] <- lapply(prog[factor_cols], as.factor)
# Drop unused levels
prog$Corpus <- droplevels(prog$Corpus)
levels(prog$Negation)[levels(prog$Negation)=="n"] <- "negated"
levels(prog$Contraction)[levels(prog$Contraction)=="c"] <- "contract."
levels(prog$Tense)[levels(prog$Tense)=="modal/infinitive"] <- "modal/inf."
summary(prog$Corpus) # Number of annotated progressive concordances in each subcorpus
## Spoken BNC2014 sample Textbook Conversation
## 2423 2423
#saveRDS(prog, file = here("prog.rds"))
#saveRDS(prog, file = here("prog_collanalysis.rds"))
prog <- readRDS(file = here("prog.rds"))
# Textbook conversation #
TS_words <- 420130 # Much better than tokens due to lack of punctuation in Spoken BNC
TS_verbs <- 80106 # These are all verb forms (e.g. have been drinking = 3 verb forms). Calculated using SE query
TS_verb_phrases <- 64292 # As calculated with Extract_verb_lemmas_spacy
TS_prog <- nrow(TxBprog) #2423 # After manual annotation, thus excluding GOING TO + inf.
TS_ProgVerbPhrase_ratio <- TS_prog/(TS_verb_phrases - TS_prog - (nrow(filter(annotated_TxB, Tense=="GOING TO"))))*10000; TS_ProgVerbPhrase_ratio # 395.4756 per 10,000 finite verb phrases
## [1] 395.4756
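# The F-coefficient formula above can be wrapped in a small helper
# (a sketch; the name prog_per_10k is mine, not from the thesis):
prog_per_10k <- function(n_prog, n_finite_vps, n_going_to) {
  n_prog / (n_finite_vps - n_prog - n_going_to) * 10000
}
# e.g. prog_per_10k(TS_prog, TS_verb_phrases, nrow(filter(annotated_TxB, Tense == "GOING TO")))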
# Spoken BNC
BNC_words <- 10581951
BNC_verbs <- 2487515
BNC_progCQL <- 126395
# Ratio of genuine progressives in progressive query for BNC
BNCratio_prog_CQL <- nrow(BNCprog)/nrow(annotated) # (using full BNC dataset as for coll.analysis)
BNCtotal_prog <- BNCratio_prog_CQL*BNC_progCQL # Approximate total number of progressives across BNC
BNC_verb_phrases <- 1751040
BNC_ProgVerbPhrase_ratio <- BNCtotal_prog/(BNC_verb_phrases - BNCtotal_prog - (nrow(filter(annotation_file, Tense=="GOING TO"))))*10000; BNC_ProgVerbPhrase_ratio # 557.9771 per 10,000 finite verb phrases (using full BNC dataset as for coll.analysis)
## [1] 557.9771
# Comparing relative frequencies #
TS <- as.vector(c(TS_words, TS_verbs, TS_verb_phrases, TS_prog))
BNC <- as.vector(c(BNC_words, BNC_verbs, BNC_verb_phrases, BNCtotal_prog))
BNC_sample <- round(BNC*(TS_words/BNC_words),0) # This has to be based on the total number of words because there are no sentence delimiters in the Spoken BNC2014.
prog_ratio <- rbind(TS, BNC_sample)
rownames(prog_ratio) <- c("Textbook Conversation", "Spoken BNC2014 sample")
colnames(prog_ratio) <- c("Words", "Verb forms", "Finite verb phrases", "Progressives")
prog_ratio <- as.data.frame(prog_ratio); prog_ratio
## Words Verb forms Finite verb phrases Progressives
## Textbook Conversation 420130 80106 64292 2423
## Spoken BNC2014 sample 420130 98761 69521 3673
prog_ratio$Non_prog <- prog_ratio$`Finite verb phrases` - prog_ratio$Progressives
prog_ratio$FProgRatio <- round(c(TS_ProgVerbPhrase_ratio, BNC_ProgVerbPhrase_ratio), 2)
colnames(prog_ratio) <- c("Words", "Verb forms", "Finite verb phrases", "Progressives", "Non-progressives", "F-coefficients of progressives")
prog_ratio
## Words Verb forms Finite verb phrases Progressives
## Textbook Conversation 420130 80106 64292 2423
## Spoken BNC2014 sample 420130 98761 69521 3673
## Non-progressives F-coefficients of progressives
## Textbook Conversation 61869 395.48
## Spoken BNC2014 sample 65848 557.98
#clipr::write_clip(prog_ratio) # Copy into manuscript
#saveRDS(prog_ratio, file = here("prog_ratio_spoken.rds"))
# Significance testing and approximation of effect sizes #
# http://rcompanion.org/handbook/H_03.html
prog_ratio1 <- prog_ratio[,5:4]; prog_ratio1 # This is based on the word-based sample size of the Spoken BNC2014
## Non-progressives Progressives
## Textbook Conversation 61869 2423
## Spoken BNC2014 sample 65848 3673
chisq.test(prog_ratio1) # Highly significant
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: prog_ratio1
## X-squared = 175.87, df = 1, p-value < 2.2e-16
vcd::assocstats(t(prog_ratio1)) # Small effect size
## X^2 df P(> X^2)
## Likelihood Ratio 177.74 1 0
## Pearson 176.22 1 0
##
## Phi-Coefficient : 0.036
## Contingency Coeff.: 0.036
## Cramer's V : 0.036
fisher.test(prog_ratio1) # Is this helpful?
##
## Fisher's Exact Test for Count Data
##
## data: prog_ratio1
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 1.351109 1.501624
## sample estimates:
## odds ratio
## 1.424279
verb_ratio <- prog_ratio[,1:2]; verb_ratio
## Words Verb forms
## Textbook Conversation 420130 80106
## Spoken BNC2014 sample 420130 98761
chisq.test(verb_ratio) # Highly significant
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: verb_ratio
## X-squared = 1604.5, df = 1, p-value < 2.2e-16
vcd::assocstats(t(verb_ratio)) # Very small effect size
## X^2 df P(> X^2)
## Likelihood Ratio 1607.7 1 0
## Pearson 1604.7 1 0
##
## Phi-Coefficient : 0.04
## Contingency Coeff.: 0.04
## Cramer's V : 0.04
# https://www.tutorialgateway.org/mosaic-plot-in-r/
#tiff(here("mosaic_prog_prop2.tiff"), height = 20, width = 30, units="cm", compression = "lzw", res = 300)
mosaicplot(prog_ratio1, shade = TRUE, main = "Progressives in Textbook Conversation and the Spoken BNC 2014", cex.axis = 1.5) # Unlike vcd::mosaic, Base R mosaicplot works with dataframes
#dev.off()
# Table summaries
library(gtsummary)
# Great slides on using this package: http://www.emilyzabor.com/cleveland-r-gtsummary
# Also for adding custom functions: http://www.danieldsjoberg.com/gtsummary/reference/tests.html
prog %>%
select(Corpus, Tense) %>%
tbl_summary(by = Corpus) %>%
#add_p() %>%
modify_header(label = "**Form**") #%>%
Form | Spoken BNC2014 sample, N = 2,423¹ | Textbook Conversation, N = 2,423¹
---|---|---
Tense | |
past | 705 (29%) | 381 (16%)
perfect | 109 (4.5%) | 157 (6.5%)
present | 1,518 (63%) | 1,833 (76%)
modal | 91 (3.8%) | 52 (2.1%)

¹ n (%)
#as_flex_table() %>%
#flextable::save_as_docx(path = here("SpokenTensesTable.docx"))
prog %>%
select(Tense, Level) %>%
tbl_summary(by = Level) %>%
#add_p() %>%
modify_header(label = "**Form**") #%>%
Form | A, N = 316¹ | B, N = 565¹ | C, N = 622¹ | D, N = 577¹ | E, N = 343¹ | Spoken BNC2014 sample, N = 2,423¹
---|---|---|---|---|---
Tense | | | | | |
past | 1 (0.3%) | 84 (15%) | 130 (21%) | 114 (20%) | 52 (15%) | 705 (29%)
perfect | 0 (0%) | 3 (0.5%) | 63 (10%) | 61 (11%) | 30 (8.7%) | 109 (4.5%)
present | 314 (99%) | 476 (84%) | 421 (68%) | 385 (67%) | 237 (69%) | 1,518 (63%)
modal | 1 (0.3%) | 2 (0.4%) | 8 (1.3%) | 17 (2.9%) | 24 (7.0%) | 91 (3.8%)

¹ n (%)
#as_flex_table() %>%
#flextable::save_as_docx(path = here("SpokenTensesTable_byLevel.docx"))
# Graphs with absolute values.
par(mfrow=c(1,2))
plot(BNCprog$Tense, main = "Progressives in the Spoken BNC2014 sample", ylab = "Number of occurrences", ylim = c(0,2000))
plot(TxBprog$Tense, main = "Progressives in Textbook Conversation", ylab = "Number of occurrences", ylim = c(0,2000))
par(mfrow=c(1,1))
#png(filename = here("prog_tenses.png"), width = 1800, height = 1400, res = 300)
# Distributions across corpora
round(prop.table(table(prog$Tense[prog$Corpus=="Textbook Conversation"])), 4)*100
##
## past perfect present modal
## 15.72 6.48 75.65 2.15
round(prop.table(table(prog$Tense[prog$Corpus=="Spoken BNC2014 sample"])), 4)*100
##
## past perfect present modal
## 29.10 4.50 62.65 3.76
# Mosaic plot of tense distribution across corpora
# Select the colours that will be used
library(RColorBrewer)
# All palette available from RColorBrewer
display.brewer.all(colorblindFriendly = T)
# We will select the first 4 colours of the OrRd palette
cols<-brewer.pal(n=4,name="OrRd")
#tiff(here("mosaic_tenseforms.tiff"), height = 30, width = 35, units="cm", compression = "lzw", res = 300)
par(cex = 2, cex.main = 0.85, font = 2)
#svg(here("mosaic_tenseforms.svg"), height = 10, width = 12, pointsize = 12)
par(cex = 2, cex.main = 0.85, font = 2)
plot(prog$Tense ~ prog$Corpus, col = cols, xlab = "", ylab = "Progressive forms", main = "Distribution of progressive forms in Textbook Conversation and the Spoken BNC2014")
#dev.off()
# Differentiated look at textbook levels #
#tiff(here("mosaic_tenseforms_levels.tiff"), height = 32, width = 40, units="cm", compression = "lzw", res = 300)
par(cex = 2)
plot(prog$Tense ~ relevel(prog$Level, "Spoken BNC2014 sample"), col = cols, xlab = " Textbook Levels", ylab = "")
#coord2 <- locator(4)
coord2 <- readRDS(file = here("coord2.rds"))
text(coord2, c("past", "perfect", "present", "modal"), font = 1)
#dev.off()
tenses <- table(prog$Tense, prog$Corpus)
tenses
##
## Spoken BNC2014 sample Textbook Conversation
## past 705 381
## perfect 109 157
## present 1518 1833
## modal 91 52
tensesp <- t((prop.table(tenses, margin = 2)*100)) # Work out percentages and transpose table
tensesp <-round(tensesp, 2)
barplot(tensesp, beside = TRUE, ylab = "% of occurrences", xlab = "Forms of the progressive", ylim = c(0,100), col = c("darkblue", "darkred"))
legend("topright", fill = c("darkblue", "darkred"), c("BNC Spoken 2014", "Textbook Conversation"))
# Legend: "Forms of the progressives in the Spoken BNC 2014 and Textbook Conversation"
vcd::assocstats(tenses) # Measuring the effect size. Cramer's V is what we're interested in (Levshina p. 217)
## X^2 df P(> X^2)
## Likelihood Ratio 147.29 3 0
## Pearson 145.57 3 0
##
## Phi-Coefficient : NA
## Contingency Coeff.: 0.171
## Cramer's V : 0.173
#http://groups.chass.utoronto.ca/pol242/Labs/LM-3A/LM-3A_content.htm
chisq.test(tenses)
##
## Pearson's Chi-squared test
##
## data: tenses
## X-squared = 145.57, df = 3, p-value < 2.2e-16
lsr::associationTest( ~ Tense + Corpus, data = prog) # Easier to interpret results
##
## Chi-square test of categorical association
##
## Variables: Tense, Corpus
##
## Hypotheses:
## null: variables are independent of one another
## alternative: some contingency exists between variables
##
## Observed contingency table:
## Corpus
## Tense Spoken BNC2014 sample Textbook Conversation
## past 705 381
## perfect 109 157
## present 1518 1833
## modal 91 52
##
## Expected contingency table under the null hypothesis:
## Corpus
## Tense Spoken BNC2014 sample Textbook Conversation
## past 543.0 543.0
## perfect 133.0 133.0
## present 1675.5 1675.5
## modal 71.5 71.5
##
## Test results:
## X-squared statistic: 145.572
## degrees of freedom: 3
## p-value: <.001
##
## Other information:
## estimated effect size (Cramer's v): 0.173
# Association plot of tense distribution in corpora
#tiff(here("tenseforms_assoc.tiff"), height = 12, width = 17, units="cm", compression = "lzw", res = 300)
vcd::assoc(tenses, shade = TRUE, varnames = FALSE) # This works
#dev.off()
chisq.test(tenses)$stdres
##
## Spoken BNC2014 sample Textbook Conversation
## past 11.161637 -11.161637
## perfect -3.027327 3.027327
## present -9.797030 9.797030
## modal 3.310555 -3.310555
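# Rule of thumb: cells with |standardized residual| > 2 depart noticeably from
# the counts expected under independence (sketch using the same test object):
which(abs(chisq.test(tenses)$stdres) > 2, arr.ind = TRUE)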
# Association plot of tense distributions across levels
levels(prog$Level) <- c("A", "B", "C", "D", "E", "Spoken BNC2014")
tenselevel <- table(prog$Tense, prog$Level)
#tiff(here("tenseforms_levels_assoc.tiff"), height = 13, width = 20, units="cm", compression = "lzw", res = 300)
vcd::assoc(tenselevel, shade = TRUE, varnames = FALSE) # This works
#dev.off()
lsr::associationTest( ~ Tense + Level, data = prog) # Easier to interpret results
##
## Chi-square test of categorical association
##
## Variables: Tense, Level
##
## Hypotheses:
## null: variables are independent of one another
## alternative: some contingency exists between variables
##
## Observed contingency table:
## Level
## Tense A B C D E Spoken BNC2014
## past 1 84 130 114 52 705
## perfect 0 3 63 61 30 109
## present 314 476 421 385 237 1518
## modal 1 2 8 17 24 91
##
## Expected contingency table under the null hypothesis:
## Level
## Tense A B C D E Spoken BNC2014
## past 70.82 126.6 139.4 129.3 76.9 543.0
## perfect 17.35 31.0 34.1 31.7 18.8 133.0
## present 218.51 390.7 430.1 399.0 237.2 1675.5
## modal 9.32 16.7 18.4 17.0 10.1 71.5
##
## Test results:
## X-squared statistic: 373.529
## degrees of freedom: 15
## p-value: <.001
##
## Other information:
## estimated effect size (Cramer's v): 0.16
# Distributions across corpora
round(prop.table(table(prog$Time.reference[prog$Corpus=="Textbook Conversation"])), 4)*100
##
## past past/present present future general hypothetical
## 16.34 6.97 53.69 12.17 8.34 1.36
## unclear
## 1.11
round(prop.table(table(prog$Time.reference[prog$Corpus=="Spoken BNC2014 sample"])), 4)*100
##
## past past/present present future general hypothetical
## 28.35 4.58 35.29 9.78 11.23 2.97
## unclear
## 7.80
timeref <- table(prog$Corpus, prog$Time.reference); timeref # Raw figures
##
## past past/present present future general hypothetical
## Spoken BNC2014 sample 687 111 855 237 272 72
## Textbook Conversation 396 169 1301 295 202 33
##
## unclear
## Spoken BNC2014 sample 189
## Textbook Conversation 27
timerefprop <- round(t((prop.table(timeref, margin = 2)*100)),2); timerefprop # As %
##
## Spoken BNC2014 sample Textbook Conversation
## past 63.43 36.57
## past/present 39.64 60.36
## present 39.66 60.34
## future 44.55 55.45
## general 57.38 42.62
## hypothetical 68.57 31.43
## unclear 87.50 12.50
chisq.test(timeref)
##
## Pearson's Chi-squared test
##
## data: timeref
## X-squared = 335.11, df = 6, p-value < 2.2e-16
vcd::assocstats(timeref) # Cramer's V = 0.263
## X^2 df P(> X^2)
## Likelihood Ratio 352.42 6 0
## Pearson 335.11 6 0
##
## Phi-Coefficient : NA
## Contingency Coeff.: 0.254
## Cramer's V : 0.263
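# Manual check of Cramer's V (sketch): V = sqrt(X^2 / (N * (min(dim) - 1)));
# for this 2x7 table, min(dim) - 1 = 1
sqrt(chisq.test(timeref)$statistic / (sum(timeref) * (min(dim(timeref)) - 1)))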
# "Fairer" distribution, excluding "unclear's"
prog1 <- prog[prog$Time.reference!="unclear",]
prog1$Time.reference <- droplevels(prog1$Time.reference)
lsr::associationTest( ~ Corpus + Time.reference, data = prog1) # Cramer's V = 0.212
##
## Chi-square test of categorical association
##
## Variables: Corpus, Time.reference
##
## Hypotheses:
## null: variables are independent of one another
## alternative: some contingency exists between variables
##
## Observed contingency table:
## Time.reference
## Corpus past past/present present future general hypothetical
## Spoken BNC2014 sample 687 111 855 237 272 72
## Textbook Conversation 396 169 1301 295 202 33
##
## Expected contingency table under the null hypothesis:
## Time.reference
## Corpus past past/present present future general hypothetical
## Spoken BNC2014 sample 523 135 1040 257 229 50.7
## Textbook Conversation 560 145 1116 275 245 54.3
##
## Test results:
## X-squared statistic: 208.2
## degrees of freedom: 5
## p-value: <.001
##
## Other information:
## estimated effect size (Cramer's v): 0.212
timeref <- table(prog1$Corpus, prog1$Time.reference); timeref # Raw figures
##
## past past/present present future general hypothetical
## Spoken BNC2014 sample 687 111 855 237 272 72
## Textbook Conversation 396 169 1301 295 202 33
# Significance test for only textbook levels C-E against Textbook Conversation
prog2 <- prog[prog$Level!="A" & prog$Level!="B",]
prog2$Level <- droplevels(prog2$Level) # Drop levels A and B
prog2 <- prog2[prog2$Time.reference!="unclear",] # Get rid of "unclear" cases
prog2$Time.reference <- droplevels(prog2$Time.reference) # Drop the unused "unclear" level
timeref2 <- table(prog2$Corpus, prog2$Time.reference); timeref2 # Raw figures
##
## past past/present present future general hypothetical
## Spoken BNC2014 sample 687 111 855 237 272 72
## Textbook Conversation 311 161 672 174 174 32
chisq.test(timeref2) # p < 0.001
##
## Pearson's Chi-squared test
##
## data: timeref2
## X-squared = 88.371, df = 5, p-value < 2.2e-16
vcd::assocstats(timeref2) # Cramer's V = 0.153
## X^2 df P(> X^2)
## Likelihood Ratio 88.688 5 0
## Pearson 88.371 5 0
##
## Phi-Coefficient : NA
## Contingency Coeff.: 0.152
## Cramer's V : 0.153
#tiff(here("time-ref_assoc_levelsC-E.tiff"), height = 18, width = 36, units="cm", compression = "lzw", res = 300)
vcd::assoc(timeref2, shade = TRUE, varnames = FALSE, gp = shading_max, gp_labels = grid::gpar(fontsize = 14)) # Association plot only for textbook levels C-E, excluding "unclear" cases
#dev.off()
# Association plot of time ref distribution in corpora
#tiff(here("time-ref_assoc.tiff"), height = 18, width = 36, units="cm", compression = "lzw", res = 300)
vcd::assoc(timeref, shade = TRUE, varnames = FALSE, gp_labels=(grid::gpar(fontsize=18)))
#dev.off()
# Barplot with number of occurrences per corpus
timeref <- table(prog$Corpus, prog$Time.reference); timeref # Raw figures
##
## past past/present present future general hypothetical
## Spoken BNC2014 sample 687 111 855 237 272 72
## Textbook Conversation 396 169 1301 295 202 33
##
## unclear
## Spoken BNC2014 sample 189
## Textbook Conversation 27
timeref <- as.data.frame(timeref)
names(timeref)[1] <- "Corpus"
names(timeref)[2] <- "Timeref"
names(timeref)[3] <- "Freq"
timeref
## Corpus Timeref Freq
## 1 Spoken BNC2014 sample past 687
## 2 Textbook Conversation past 396
## 3 Spoken BNC2014 sample past/present 111
## 4 Textbook Conversation past/present 169
## 5 Spoken BNC2014 sample present 855
## 6 Textbook Conversation present 1301
## 7 Spoken BNC2014 sample future 237
## 8 Textbook Conversation future 295
## 9 Spoken BNC2014 sample general 272
## 10 Textbook Conversation general 202
## 11 Spoken BNC2014 sample hypothetical 72
## 12 Textbook Conversation hypothetical 33
## 13 Spoken BNC2014 sample unclear 189
## 14 Textbook Conversation unclear 27
# Version 2 in a more publishable style
#tiff(here("timeref_barplot.tiff"), height = 15, width = 25, units="cm", compression = "lzw", res = 300)
cols<-RColorBrewer::brewer.pal(n=7,name="OrRd")
ggplot(timeref, aes(Timeref, Freq)) +
geom_bar(aes(fill = Corpus), stat="identity", position="dodge", width=.5) +
theme_bw(base_size = 18) +
scale_fill_manual(values = cols[c(3,7)]) +
ylab("Number of progressives") +
xlab("Time reference") +
scale_y_continuous(breaks = c(0, 250, 500, 750, 1000, 1250)) +
theme(plot.title = element_text(hjust = 0.5), axis.text.x = element_text(face = "plain", color = "grey20", size = 15), axis.text.y = element_text(face = "plain", color = "grey20", size = 15), legend.title=element_blank(), legend.position=c(.8, .8), legend.background = element_rect(colour="grey", linetype="solid"))
#ggtitle("")
#dev.off()
# Mosaic plot of time reference distribution across corpora
# We will select the first 7 colours of the OrRd palette
cols<-brewer.pal(n=7,name="OrRd")
#tiff(here("mosaic_timeref.tiff"), height = 40, width = 40, units="cm", compression = "lzw", res = 300)
par(cex = 2, cex.main = 0.85, font = 2)
#svg(here("mosaic_timeref.svg"), height = 10, width = 12, pointsize = 12)
par(cex = 2, cex.main = 0.85, font = 2)
plot(prog$Time.reference ~ prog$Corpus, col = cols, xlab = "", ylab = "Time reference of progressives", main = "Time reference of progressives in Textbook Conversation and the Spoken BNC2014 sample") # Good but bar plot is more informative
#dev.off()
# Differentiated look at textbook levels #
#tiff(here("mosaic_timeref_levels.tiff"), height = 30, width = 25, units="cm", compression = "lzw", res = 300)
par(cex = 1.5)
plot(prog$Time.reference ~ prog$Level, col = cols, xlab= "", ylab = "Time reference of the progressives")
mtext(" Textbook levels", side = 1, adj = 0, line = 2.2, cex = 1.5) # To write text in the margin of plots
#dev.off()
# Differentiated look at textbook levels 2 #
prog1 <- prog[prog$Time.reference!="unclear",] # Get rid of "unclear" cases
prog1$Time.reference <- droplevels(prog1$Time.reference) # Drop the unused "unclear" level
cols<-brewer.pal(n=6,name="OrRd") # Pick just 6 colours now
#tiff(here("mosaic_timeref_levels.tiff"), height = 32, width = 30, units="cm", compression = "lzw", res = 300)
par(cex = 1.5)
plot(prog1$Time.reference ~ relevel(prog1$Level, "Spoken BNC2014"), col = cols, xlab = " Textbook Levels", ylab = "", main = "Time reference of the progressive")
#coord <- locator(6)
coord <- readRDS(file = here("coord.rds"))
text(coord, c("past", "past/present", "present", "future", "general", "hypothetical"), font = 1)
#dev.off() # !! Crop graph to the left in Word afterwards
# Plot relationship between tense and time reference
## Mosaic plot for comparison between two corpora ##
BNCprog$Tense <- relevel(BNCprog$Tense, "modal") # Makes the mosaic plot easier to read if this tense form comes first because it covers all time references
TxBprog$Tense <- relevel(TxBprog$Tense, "modal")
par(mfrow=c(1,2))
plot(BNCprog$Time.reference ~ BNCprog$Tense)
plot(TxBprog$Time.reference ~ TxBprog$Tense)
par(mfrow=c(1,1))
prog1 <- prog[prog$Time.reference != "unclear", ]
prog1$Time.reference <- droplevels(prog1$Time.reference) # Drop unused level "unclear"
prog1$Form <- prog1$Tense
cols<-brewer.pal(n=4,name="OrRd")
#tiff(here("mosaic_tenses_timeref.tiff"), height = 15, width = 28, units="cm", compression = "lzw", res = 300)
vcd::mosaic(Form ~ Corpus + Time.reference, prog1, gp = grid::gpar(fill = cols), rot_labels = c(0, 45, 0, 90), cex=2.5, zero_size = 0, labeling_args = list(rep = TRUE), xlab = "Time reference") # Best so far
#dev.off()
vcd::assoc(Time.reference ~ Corpus + Tense, prog1, shade = TRUE) # Super hard to interpret!
prog %>%
select(Corpus, Tense, Contraction, Question, Voice) %>%
tbl_summary(by = Corpus) %>%
add_p() %>%
modify_header(label = "**Form**") #%>%
Form | Spoken BNC2014 sample, N = 2,423¹ | Textbook Conversation, N = 2,423¹ | p-value²
---|---|---|---
Tense | | | <0.001
past | 705 (29%) | 381 (16%) |
perfect | 109 (4.5%) | 157 (6.5%) |
present | 1,518 (63%) | 1,833 (76%) |
modal | 91 (3.8%) | 52 (2.1%) |
Contraction | | | <0.001
contract. | 1,285 (53%) | 1,125 (46%) |
full | 1,138 (47%) | 1,298 (54%) |
Question | | | <0.001
q | 276 (11%) | 434 (18%) |
statement | 2,143 (88%) | 1,989 (82%) |
unclear | 4 (0.2%) | 0 (0%) |
Voice | | | 0.14
A | 2,399 (99%) | 2,409 (99%) |
P | 23 (0.9%) | 14 (0.6%) |
unclear | 1 (<0.1%) | 0 (0%) |

¹ n (%)
² Pearson's Chi-squared test; Fisher's exact test
# as_flex_table() %>%
# flextable::save_as_docx(path = here("SpokenTensesTable.docx"))
prog %>%
select(Textbook, Contraction) %>%
tbl_summary(by = Textbook) %>%
modify_header(label = "**Form**") %>%
modify_footnote(update = everything() ~ NA)
Form | Access, N = 418 | Achievers, N = 101 | English in Mind, N = 242 | Green Line, N = 139 | Hi there, N = 213 | Join the Team, N = 119 | New Green Line, N = 409 | Piece of cake, N = 163 | Solutions, N = 619 | Spoken BNC2014 sample, N = 2,423
---|---|---|---|---|---|---|---|---|---
Contraction | | | | | | | | | |
contract. | 226 (54%) | 43 (43%) | 138 (57%) | 60 (43%) | 94 (44%) | 35 (29%) | 182 (44%) | 64 (39%) | 283 (46%) | 1,285 (53%)
full | 192 (46%) | 58 (57%) | 104 (43%) | 79 (57%) | 119 (56%) | 84 (71%) | 227 (56%) | 99 (61%) | 336 (54%) | 1,138 (47%)
prog %>%
select(Textbook, Negation) %>%
tbl_summary(by = Textbook) %>%
modify_header(label = "**Form**") %>%
modify_footnote(update = everything() ~ NA)
Form | Access, N = 418 | Achievers, N = 101 | English in Mind, N = 242 | Green Line, N = 139 | Hi there, N = 213 | Join the Team, N = 119 | New Green Line, N = 409 | Piece of cake, N = 163 | Solutions, N = 619 | Spoken BNC2014 sample, N = 2,423
---|---|---|---|---|---|---|---|---|---
Negation | | | | | | | | | |
negated | 15 (3.6%) | 4 (4.0%) | 11 (4.5%) | 10 (7.2%) | 8 (3.8%) | 7 (5.9%) | 15 (3.7%) | 9 (5.5%) | 44 (7.1%) | 157 (6.5%)
positive | 403 (96%) | 97 (96%) | 231 (95%) | 129 (93%) | 205 (96%) | 112 (94%) | 394 (96%) | 154 (94%) | 575 (93%) | 2,266 (94%)
prog %>%
select(Corpus, Negation) %>%
tbl_summary(by = Corpus) %>%
modify_header(label = "**Form**")
Form | Spoken BNC2014 sample, N = 2,423¹ | Textbook Conversation, N = 2,423¹
---|---|---
Negation | |
negated | 157 (6.5%) | 123 (5.1%)
positive | 2,266 (94%) | 2,300 (95%)

¹ n (%)
prog %>%
select(Corpus, Question) %>%
tbl_summary(by = Corpus) %>%
modify_header(label = "**Form**")
Form | Spoken BNC2014 sample, N = 2,423¹ | Textbook Conversation, N = 2,423¹
---|---|---
Question | |
q | 276 (11%) | 434 (18%)
statement | 2,143 (88%) | 1,989 (82%)
unclear | 4 (0.2%) | 0 (0%)

¹ n (%)
prog %>%
select(Corpus, Voice) %>%
tbl_summary(by = Corpus) %>%
modify_header(label = "**Form**")
Form | Spoken BNC2014 sample, N = 2,423¹ | Textbook Conversation, N = 2,423¹
---|---|---
Voice | |
A | 2,399 (99%) | 2,409 (99%)
P | 23 (0.9%) | 14 (0.6%)
unclear | 1 (<0.1%) | 0 (0%)

¹ n (%)
prog %>%
select(Corpus, Time.reference) %>%
tbl_summary(by = Corpus) %>%
modify_header(label = "**Time reference**")
Time reference | Spoken BNC2014 sample, N = 2,423¹ | Textbook Conversation, N = 2,423¹
---|---|---
Time.reference | |
past | 687 (28%) | 396 (16%)
past/present | 111 (4.6%) | 169 (7.0%)
present | 855 (35%) | 1,301 (54%)
future | 237 (9.8%) | 295 (12%)
general | 272 (11%) | 202 (8.3%)
hypothetical | 72 (3.0%) | 33 (1.4%)
unclear | 189 (7.8%) | 27 (1.1%)

¹ n (%)
prog %>%
select(Corpus, Repeated) %>%
tbl_summary(by = Corpus) %>%
modify_header(label = "**Function**")
Function | Spoken BNC2014 sample, N = 2,423¹ | Textbook Conversation, N = 2,423¹
---|---|---
Repeated | |
no | 1,679 (69%) | 2,001 (83%)
unclear | 243 (10%) | 28 (1.2%)
yes | 501 (21%) | 394 (16%)

¹ n (%)
prog1 <- prog
prog1$Extra.function <- factor(prog1$Extra.function, exclude = NULL)
levels(prog1$Extra.function)[is.na(levels(prog1$Extra.function))] <- "none"
prog1 %>%
select(Corpus, Extra.function) %>%
tbl_summary(by = Corpus, missing = "ifany") %>%
modify_header(label = "**Function**")
Function | Spoken BNC2014 sample, N = 2,423¹ | Textbook Conversation, N = 2,423¹
---|---|---
Extra.function | |
emphasis/shock | 40 (1.7%) | 37 (1.5%)
framing | 45 (1.9%) | 98 (4.0%)
gradual change | 66 (2.7%) | 47 (1.9%)
other interesting function | 3 (0.1%) | 3 (0.1%)
politeness/softening | 8 (0.3%) | 4 (0.2%)
politeness or softening | 0 (0%) | 1 (<0.1%)
none | 2,261 (93%) | 2,233 (92%)

¹ n (%)
prog %>%
select(Corpus, Continuous) %>%
tbl_summary(by = Corpus) %>%
modify_header(label = "**Function**")
Function | Spoken BNC2014 sample, N = 2,423¹ | Textbook Conversation, N = 2,423¹
---|---|---
Continuous | |
no | 518 (21%) | 275 (11%)
unclear | 74 (3.1%) | 6 (0.2%)
yes | 1,831 (76%) | 2,142 (88%)

¹ n (%)
H0: The frequencies of the levels of the dependent variable QUESTION do not vary as a function of the levels of the independent variable CORPUS; X² = 0.
H1: The frequencies of the levels of the dependent variable QUESTION vary as a function of the levels of the independent variable CORPUS; X² > 0.
Note that a chi-square test assumes independent observations, which we do not really have here: many concordance lines come from the same textbook and, in the BNC, possibly from the same speaker.
round(prop.table(table(prog$Corpus, prog$Question), margin = 1), 4)*100
##
## q statement unclear
## Spoken BNC2014 sample 11.39 88.44 0.17
## Textbook Conversation 17.91 82.09 0.00
questions <- table(prog$Corpus, prog$Question)
questions <- questions[,1:2] # Get rid of the four "unclear" cases (all in the BNC data)
test.questions <- chisq.test(questions); test.questions # Significant correlation at p < 0.001
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: questions
## X-squared = 40.379, df = 1, p-value = 2.092e-10
assocplot(t(questions))
vcd::assoc(questions, shade = TRUE, col_vars = c("question", "statement"))
vcd::assocstats(questions) # Phi-coefficient is only 0.092 = very weak effect!
## X^2 df P(> X^2)
## Likelihood Ratio 41.194 1 1.3782e-10
## Pearson 40.897 1 1.6048e-10
##
## Phi-Coefficient : 0.092
## Contingency Coeff.: 0.092
## Cramer's V : 0.092
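# Manual check (sketch): for a 2x2 table, phi = sqrt(X^2 / N),
# using the uncorrected (non-Yates) chi-squared statistic:
sqrt(chisq.test(questions, correct = FALSE)$statistic / sum(questions))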
prog1 <- prog[prog$Question!="unclear",]
prog1$Question <- droplevels(prog1$Question)
lsr::associationTest( ~ Question + Corpus, data = prog1) # Cramer's V = 0.091
##
## Chi-square test of categorical association
##
## Variables: Question, Corpus
##
## Hypotheses:
## null: variables are independent of one another
## alternative: some contingency exists between variables
##
## Observed contingency table:
## Corpus
## Question Spoken BNC2014 sample Textbook Conversation
## q 276 434
## statement 2143 1989
##
## Expected contingency table under the null hypothesis:
## Corpus
## Question Spoken BNC2014 sample Textbook Conversation
## q 355 355
## statement 2064 2068
##
## Test results:
## X-squared statistic: 40.379
## degrees of freedom: 1
## p-value: <.001
##
## Other information:
## estimated effect size (Cramer's v): 0.091
## Yates' continuity correction has been applied
H0: The frequencies of the levels of the dependent variable NEGATION do not vary as a function of the levels of the independent variable CORPUS; X² = 0.
H1: The frequencies of the levels of the dependent variable NEGATION vary as a function of the levels of the independent variable CORPUS; X² > 0.
Note that a chi-square test assumes independent observations, which we do not really have here: many concordance lines come from the same textbook and, in the BNC, possibly from the same speaker.
round(prop.table(table(prog$Corpus, prog$Negation), margin = 1), 4)*100
##
## negated positive
## Spoken BNC2014 sample 6.48 93.52
## Textbook Conversation 5.08 94.92
prog$Negation <- relevel(prog$Negation, "positive")
#tiff(here("prog_negation.tiff"), height = 17, width = 20, units="cm", compression = "lzw", res = 300)
par(cex = 1.4, cex.main = 0.85)
plot(prog$Negation ~ prog$Corpus, ylab = "", xlab = "", main = "", col = cols[3:4])
#dev.off()
# No interesting correlation to be seen
#tiff(here("prog_negation_question.tiff", height = 13, width = 20, units="cm", compression = "lzw", res = 300)
par(cex = 1.4, cex.main = 0.85)
vcd::mosaic(Negation ~ Corpus + Question, prog, gp = grid::gpar(fill = cols[4:3]), rot_labels = c(0, 90, 0, 90), cex=2.5, zero_size = 0, labeling_args = list(rep = TRUE)) # Best so far
#dev.off()
negation <- table(prog$Corpus, prog$Negation)
test.negation <- chisq.test(negation); test.negation # Significant correlation at p < 0.05
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: negation
## X-squared = 4.1278, df = 1, p-value = 0.04218
assocplot(t(negation))
vcd::assocstats(negation) # Phi-coefficient is only 0.03 = very weak!
## X^2 df P(> X^2)
## Likelihood Ratio 4.3920 1 0.036109
## Pearson 4.3817 1 0.036326
##
## Phi-Coefficient : 0.03
## Contingency Coeff.: 0.03
## Cramer's V : 0.03
lsr::associationTest( ~ Negation + Corpus, data = prog) # Cramer's V = 0.03
##
## Chi-square test of categorical association
##
## Variables: Negation, Corpus
##
## Hypotheses:
## null: variables are independent of one another
## alternative: some contingency exists between variables
##
## Observed contingency table:
## Corpus
## Negation Spoken BNC2014 sample Textbook Conversation
## positive 2266 2300
## negated 157 123
##
## Expected contingency table under the null hypothesis:
## Corpus
## Negation Spoken BNC2014 sample Textbook Conversation
## positive 2283 2283
## negated 140 140
##
## Test results:
## X-squared statistic: 4.128
## degrees of freedom: 1
## p-value: 0.042
##
## Other information:
## estimated effect size (Cramer's v): 0.029
## Yates' continuity correction has been applied
round(prop.table(table(prog$Corpus, prog$Contraction), margin = 1), 4)*100
##
## contract. full
## Spoken BNC2014 sample 53.03 46.97
## Textbook Conversation 46.43 53.57
round(prop.table(table(prog$Textbook, prog$Contraction), 1),4)*100
##
## contract. full
## Access 54.07 45.93
## Achievers 42.57 57.43
## English in Mind 57.02 42.98
## Green Line 43.17 56.83
## Hi there 44.13 55.87
## Join the Team 29.41 70.59
## New Green Line 44.50 55.50
## Piece of cake 39.26 60.74
## Solutions 45.72 54.28
## Spoken BNC2014 sample 53.03 46.97
table(prog$Textbook, prog$Contraction)
##
## contract. full
## Access 226 192
## Achievers 43 58
## English in Mind 138 104
## Green Line 60 79
## Hi there 94 119
## Join the Team 35 84
## New Green Line 182 227
## Piece of cake 64 99
## Solutions 283 336
## Spoken BNC2014 sample 1285 1138
cols<-RColorBrewer::brewer.pal(n=4,name="OrRd")
#tiff(here("prog_contractions.tiff"), height = 25, width = 20, units="cm", compression = "lzw", res = 300)
par(cex = 1.4, cex.main = 0.85)
plot(prog$Contraction ~ prog$Corpus, ylab = "", xlab = "", main = "", col = cols[4:3])
#dev.off()
#tiff(here("prog_contractions_tenses.tiff"), height = 13, width = 20, units="cm", compression = "lzw", res = 300)
par(cex = 1.4, cex.main = 0.85)
vcd::mosaic(Contraction ~ Corpus + Tense, prog, gp = grid::gpar(fill = cols[4:3]), rot_labels = c(0, 90, 0, 90), cex=2.5, zero_size = 0, labeling_args = list(rep = TRUE)) # Best so far
#dev.off()
round(prop.table(table(prog$Contraction, prog$Textbook), 2),4)*100
##
## Access Achievers English in Mind Green Line Hi there Join the Team
## contract. 54.07 42.57 57.02 43.17 44.13 29.41
## full 45.93 57.43 42.98 56.83 55.87 70.59
##
## New Green Line Piece of cake Solutions Spoken BNC2014 sample
## contract. 44.50 39.26 45.72 53.03
## full 55.50 60.74 54.28 46.97
#tiff(here("prog_contractions_series.tiff"), height = 13, width = 20, units="cm", compression = "lzw", res = 300)
par(cex = 1.4, cex.main = 0.85)
vcd::mosaic(Contraction ~ Textbook, prog, gp = grid::gpar(fill = cols[4:3]), rot_labels = c(0, 90, 0, 90), cex=2.5, zero_size = 0, labeling_args = list(rep = TRUE)) # Best so far
#dev.off()
contractions <- table(prog$Corpus, prog$Contraction)
test.contractions <- chisq.test(contractions)
test.contractions # Significant correlation
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: contractions
## X-squared = 20.868, df = 1, p-value = 4.92e-06
assocplot(t(contractions))
vcd::assocstats(contractions) # Phi-coefficient is only 0.066 = very weak!
## X^2 df P(> X^2)
## Likelihood Ratio 21.147 1 4.2541e-06
## Pearson 21.131 1 4.2883e-06
##
## Phi-Coefficient : 0.066
## Contingency Coeff.: 0.066
## Cramer's V : 0.066
round(prop.table(table(prog$Corpus, prog$Voice), margin = 1), 4)*100
##
## A P unclear
## Spoken BNC2014 sample 99.01 0.95 0.04
## Textbook Conversation 99.42 0.58 0.00
voice <- table(prog$Corpus, prog$Voice); voice
##
## A P unclear
## Spoken BNC2014 sample 2399 23 1
## Textbook Conversation 2409 14 0
voice1 <- voice[,c(1:2)]; voice1 # Get rid of unclear as otherwise chi-squared approximation may be incorrect
##
## A P
## Spoken BNC2014 sample 2399 23
## Textbook Conversation 2409 14
chisq.test(voice1) # p-value = 0.1863 = Not a significant correlation
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: voice1
## X-squared = 1.7464, df = 1, p-value = 0.1863
vcd::assocstats(voice) # Cramer's V is only 0.026 = negligible effect!
## X^2 df P(> X^2)
## Likelihood Ratio 3.6184 2 0.16379
## Pearson 3.2100 2 0.20089
##
## Phi-Coefficient : NA
## Contingency Coeff.: 0.026
## Cramer's V : 0.026
prog[prog$Voice=="P",c(1,6,11)] # For qualitative analysis
## Corpus Concor.dance2 Tense
## 3 Spoken BNC2014 sample 're getting present
## 72 Spoken BNC2014 sample am I getting present
## 79 Spoken BNC2014 sample 're being present
## 113 Spoken BNC2014 sample was being past
## 333 Spoken BNC2014 sample was getting past
## 457 Spoken BNC2014 sample 's just being present
## 510 Spoken BNC2014 sample 're having present
## 643 Spoken BNC2014 sample is being present
## 751 Spoken BNC2014 sample 's being present
## 833 Spoken BNC2014 sample was being past
## 1001 Spoken BNC2014 sample are being present
## 1008 Spoken BNC2014 sample was being past
## 1027 Spoken BNC2014 sample were being past
## 1065 Spoken BNC2014 sample are being present
## 1239 Spoken BNC2014 sample is that it's being present
## 1313 Spoken BNC2014 sample 're being present
## 1361 Spoken BNC2014 sample are they getting present
## 1398 Spoken BNC2014 sample be like he's being modal
## 1504 Spoken BNC2014 sample 're getting present
## 1543 Spoken BNC2014 sample is being present
## 1557 Spoken BNC2014 sample were being past
## 1665 Spoken BNC2014 sample were being past
## 1958 Spoken BNC2014 sample were getting past
## 4315 Textbook Conversation was being past
## 4319 Textbook Conversation was just being past
## 4344 Textbook Conversation are being present
## 4408 Textbook Conversation are being present
## 4492 Textbook Conversation are being present
## 4537 Textbook Conversation 's not being present
## 4554 Textbook Conversation is being present
## 4586 Textbook Conversation are being present
## 4691 Textbook Conversation was being past
## 4694 Textbook Conversation 'm being present
## 4695 Textbook Conversation 's being present
## 4706 Textbook Conversation 'm being present
## 4794 Textbook Conversation are being present
## 4824 Textbook Conversation 's being present
table(prog[prog$Voice=="P",c(1,11)])
## Tense
## Corpus past perfect present modal
## Spoken BNC2014 sample 8 0 14 1
## Textbook Conversation 3 0 11 0
prog1 <- prog[prog$Voice!="unclear",]
prog1$Voice <- droplevels(prog1$Voice)
lsr::associationTest( ~ Voice + Corpus, data = prog1) # Cramer's V = 0.02
##
## Chi-square test of categorical association
##
## Variables: Voice, Corpus
##
## Hypotheses:
## null: variables are independent of one another
## alternative: some contingency exists between variables
##
## Observed contingency table:
## Corpus
## Voice Spoken BNC2014 sample Textbook Conversation
## A 2399 2409
## P 23 14
##
## Expected contingency table under the null hypothesis:
## Corpus
## Voice Spoken BNC2014 sample Textbook Conversation
## A 2403.5 2404.5
## P 18.5 18.5
##
## Test results:
## X-squared statistic: 1.746
## degrees of freedom: 1
## p-value: 0.186
##
## Other information:
## estimated effect size (Cramer's v): 0.019
## Yates' continuity correction has been applied
repeatedness <- table(prog$Corpus, prog$Repeated); repeatedness
##
## no unclear yes
## Spoken BNC2014 sample 1679 243 501
## Textbook Conversation 2001 28 394
round(prop.table(repeatedness, 1),4)*100
##
## no unclear yes
## Spoken BNC2014 sample 69.29 10.03 20.68
## Textbook Conversation 82.58 1.16 16.26
chisq.test(repeatedness) # Significant correlation at p<0.001
##
## Pearson's Chi-squared test
##
## data: repeatedness
## X-squared = 211.54, df = 2, p-value < 2.2e-16
vcd::assocstats(repeatedness)
## X^2 df P(> X^2)
## Likelihood Ratio 236.60 2 0
## Pearson 211.54 2 0
##
## Phi-Coefficient : NA
## Contingency Coeff.: 0.205
## Cramer's V : 0.209
#tiff(here("Prog_Repeatedness.tiff"), height = 18, width = 20, units="cm", compression = "lzw", res = 300)
par(cex = 1.5, cex.main = 1)
par(cex = 1, cex.lab = 1.5, cex.axis = 1, cex.main = 1)
plot(repeatedness, shade = TRUE, main = "Progressive function: Repeatedness")
#dev.off()
repeatedness1 <- repeatedness[,c(1,3)] # Get rid of "unclear" cases - if so desired... (I'm not sure this is a good idea because they make up some 10% of the BNC data)
chisq.test(repeatedness1)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: repeatedness1
## X-squared = 30.518, df = 1, p-value = 3.308e-08
vcd::assocstats(repeatedness1) # Phi-coefficient is only 0.082 = very weak!
## X^2 df P(> X^2)
## Likelihood Ratio 30.926 1 2.6801e-08
## Pearson 30.932 1 2.6728e-08
##
## Phi-Coefficient : 0.082
## Contingency Coeff.: 0.082
## Cramer's V : 0.082
## Which lemmas occur most frequently in either corpus with the function repeatedness?
repeated_lemmas <- prog[prog$Repeated=="yes", c(1,8)]
repeated_lemmas_TxB <- prog[prog$Repeated=="yes" & prog$Corpus=="Textbook Conversation", 8]
table_repeated_lemmas_TxB <- sort(table(repeated_lemmas_TxB), decreasing = T)
head(table_repeated_lemmas_TxB, 10)
## repeated_lemmas_TxB
## do work try get make look talk take use be
## 28 18 14 13 10 9 9 8 8 7
repeated_lemmas_BNC <- prog[prog$Repeated=="yes" & prog$Corpus=="Spoken BNC2014 sample", 8]
table_repeated_lemmas_BNC <- sort(table(repeated_lemmas_BNC), decreasing = T)
head(table_repeated_lemmas_BNC, 10)
## repeated_lemmas_BNC
## do go say work pay have try use be get
## 63 25 22 21 16 13 13 12 10 10
## Which lemmas occur proportionally most frequently with function repeatedness?
BNCrepeated <- BNCprog[, c(7, 15)]
BNCrepeated <- table(Case = BNCrepeated$Lemma, BNCrepeated$Repeated)
BNCrepeated <- addmargins(BNCrepeated) # adds Sum row and column totals to the table
BNCrepeated <- BNCrepeated[order(BNCrepeated[,4], decreasing = T),]; head(BNCrepeated, 11)
##
## Case no unclear yes Sum
## Sum 1679 243 501 2423
## do 119 47 63 229
## go 135 13 25 173
## say 118 13 22 153
## think 92 4 5 101
## try 79 6 13 98
## talk 77 10 6 93
## get 55 10 10 75
## have 46 5 13 64
## come 46 5 4 55
## make 29 6 7 42
BNCrepeated <- BNCrepeated[1:30,]
BNCrepeatedp <- round(prop.table(BNCrepeated, 1), 4)*200; BNCrepeatedp # *200 rather than *100 because the Sum column added by addmargins() doubles each row total
##
## Case no unclear yes Sum
## Sum 69.30 10.02 20.68 100.00
## do 51.96 20.52 27.52 100.00
## go 78.04 7.52 14.46 100.00
## say 77.12 8.50 14.38 100.00
## think 91.08 3.96 4.96 100.00
## try 80.62 6.12 13.26 100.00
## talk 82.80 10.76 6.46 100.00
## get 73.34 13.34 13.34 100.00
## have 71.88 7.82 20.32 100.00
## come 83.64 9.10 7.28 100.00
## make 69.04 14.28 16.66 100.00
## work 46.34 2.44 51.22 100.00
## be 58.82 11.76 29.42 100.00
## look 83.34 6.66 10.00 100.00
## take 64.28 14.28 21.42 100.00
## tell 59.26 18.52 22.22 100.00
## play 66.66 4.16 29.16 100.00
## sit 86.96 4.34 8.70 100.00
## look for 81.82 0.00 18.18 100.00
## use 20.00 20.00 60.00 100.00
## watch 50.00 10.00 40.00 100.00
## pay 10.52 5.26 84.22 100.00
## go on 88.88 5.56 5.56 100.00
## run 64.70 5.88 29.42 100.00
## start 100.00 0.00 0.00 100.00
## wait 94.12 5.88 0.00 100.00
## live 100.00 0.00 0.00 100.00
## put 46.66 13.34 40.00 100.00
## record 86.66 6.66 6.66 100.00
## buy 57.14 7.14 35.72 100.00
BNCrepeatedp <- BNCrepeatedp[order(BNCrepeatedp[,3], decreasing = T),]
BNCrepeatedp <- BNCrepeatedp[1:12,1:3]; BNCrepeatedp
##
## Case no unclear yes
## pay 10.52 5.26 84.22
## use 20.00 20.00 60.00
## work 46.34 2.44 51.22
## watch 50.00 10.00 40.00
## put 46.66 13.34 40.00
## buy 57.14 7.14 35.72
## be 58.82 11.76 29.42
## run 64.70 5.88 29.42
## play 66.66 4.16 29.16
## do 51.96 20.52 27.52
## tell 59.26 18.52 22.22
## take 64.28 14.28 21.42
fig_BNC_repeated = ggplot(as.data.frame(BNCrepeatedp), aes(x=Var2, y=Freq)) +
geom_col() +
facet_wrap(~Case) +
labs(x='Repeatedness function', y='Proportion of use') +
theme(axis.text.x=element_text(angle=45, hjust=1))
print(fig_BNC_repeated) # Takes a few seconds
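# A dplyr equivalent of the proportional view above (optional sketch, not run):
# BNCprog %>%
#   count(Lemma, Repeated) %>%
#   group_by(Lemma) %>%
#   mutate(pct = round(100 * n / sum(n), 2)) %>%
#   filter(Repeated == "yes") %>%
#   arrange(desc(pct))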
TCrepeated <- TxBprog[, c(7, 15)]
TCrepeated <- addmargins(table(Case = TCrepeated$Lemma, TCrepeated$Repeated)); TCrepeated
##
## Case no unclear yes Sum
## act 2 0 0 2
## add 1 0 0 1
## advise 1 0 0 1
## affect 0 0 0 0
## allow 0 0 0 0
## amaze 0 0 0 0
## amaziiing 0 0 0 0
## annoy 0 0 0 0
## answer 2 0 1 3
## appeal 0 0 0 0
## appoint 1 0 0 1
## appreciate 0 0 0 0
## approach 2 0 0 2
## argue 1 0 0 1
## arrive 3 0 0 3
## ask 9 0 6 15
## assist 0 0 0 0
## attack 0 0 0 0
## attract 0 0 1 1
## bake 1 0 0 1
## bark 2 0 0 2
## be 9 0 7 16
## become 3 0 1 4
## begin 4 0 0 4
## behave 1 0 2 3
## block 1 0 0 1
## blow 0 0 1 1
## board 2 0 0 2
## bootleg 0 0 0 0
## bore 0 0 0 0
## borrow 0 0 1 1
## bother 1 0 1 2
## bow 1 0 0 1
## break down 0 0 1 1
## break up 1 0 0 1
## bring 4 0 0 4
## build 4 0 0 4
## burn 1 0 0 1
## burst 1 0 0 1
## buy 8 0 4 12
## call 12 0 0 12
## call for 0 0 1 1
## camp 1 0 0 1
## care 0 0 0 0
## carry 3 0 0 3
## catch 1 0 0 1
## cause 1 0 2 3
## celebrate 5 0 0 5
## change 4 0 4 8
## chase 3 0 0 3
## chat 2 0 2 4
## cheat 0 0 0 0
## check 2 0 0 2
## cheer 3 0 1 4
## chew 0 0 1 1
## choose 2 0 0 2
## clap 3 0 0 3
## clean 2 0 0 2
## clear 2 0 0 2
## climb 1 0 0 1
## close 2 0 0 2
## collect 1 0 3 4
## come 49 0 5 54
## come back 3 0 0 3
## come out 3 0 1 4
## come up 0 0 1 1
## commit 0 0 1 1
## communicate 0 0 1 1
## complain 1 0 1 2
## concentrate 1 0 1 2
## connect 0 0 0 0
## consume 0 0 1 1
## continue 0 0 2 2
## convert 0 0 0 0
## cook 5 0 0 5
## copy 1 0 0 1
## create 0 0 1 1
## criticise 0 0 2 2
## cross 4 0 0 4
## cry 2 0 0 2
## cut 0 0 0 0
## cycle 2 0 0 2
## dance 7 0 1 8
## daunt 0 0 0 0
## decorate 1 0 0 1
## delete 1 0 0 1
## demand 0 0 0 0
## depress 0 0 0 0
## desert 1 0 0 1
## devastate 0 0 0 0
## develop 2 0 0 2
## die 5 0 0 5
## die out 1 0 0 1
## disappear 2 0 1 3
## discover 1 0 1 2
## discuss 4 0 0 4
## disobey 1 0 0 1
## diversify 0 0 1 1
## do 177 1 28 206
## download 1 0 0 1
## draw 1 0 1 2
## dread 1 0 0 1
## dream 3 0 0 3
## dress 0 0 1 1
## dress up 1 0 0 1
## drink 4 0 1 5
## drive 4 0 1 5
## drown 2 0 0 2
## drum 0 0 0 0
## dust 1 0 0 1
## dying 0 0 0 0
## eat 15 0 4 19
## emerge 2 0 0 2
## employ 0 0 0 0
## encourage 0 0 1 1
## end 0 0 0 0
## enjoy 15 0 2 17
## everything 0 0 0 0
## evolve 1 0 0 1
## exaggerate 3 0 0 3
## examine 0 0 0 0
## exasperate 0 0 0 0
## exclude 0 0 1 1
## exhaust 0 0 0 0
## expect 3 0 0 3
## experiment 1 0 0 1
## explain 0 0 0 0
## explode 1 0 0 1
## express 0 0 1 1
## face 0 0 2 2
## fall 5 0 0 5
## feed 2 0 0 2
## feel 29 0 5 34
## fight 5 0 1 6
## fill 0 0 1 1
## film 1 1 1 3
## find 2 0 0 2
## flirt 0 0 1 1
## flow 0 0 0 0
## fly 7 0 1 8
## follow 4 0 1 5
## fool around 1 0 0 1
## forget 0 0 2 2
## frown 1 0 0 1
## frustrate 0 0 0 0
## fundraise 0 0 1 1
## gain 1 0 0 1
## garden 1 0 0 1
## Geocaching 0 0 0 0
## get 30 1 13 44
## get off 1 0 0 1
## get on 2 0 2 4
## give 2 1 1 4
## give away 0 0 1 1
## give up 1 0 0 1
## go 158 2 6 166
## go back 1 0 0 1
## go on 20 1 5 26
## go out 11 0 1 12
## greet 1 0 0 1
## grill 2 0 0 2
## grin 1 0 0 1
## grow 0 0 1 1
## grow up 0 0 1 1
## guess 1 0 0 1
## hand 2 0 0 2
## handcuff 1 0 0 1
## hang 2 0 1 3
## hang out 0 0 3 3
## happen 18 0 5 23
## harass 0 0 1 1
## hardworking 0 0 0 0
## have 53 0 5 58
## have on 0 0 1 1
## head 1 0 1 2
## hear 1 1 1 3
## help 6 1 3 10
## hide 1 1 1 3
## hike 1 0 1 2
## hit 1 0 0 1
## hold 8 0 0 8
## hop 0 0 1 1
## hope 13 0 0 13
## hunt 0 0 1 1
## hurt 0 0 5 5
## impoverish 0 0 1 1
## include 0 0 0 0
## inspire 0 0 2 2
## interest 0 0 0 0
## interview 2 0 0 2
## invest 0 0 0 0
## invite 1 0 0 1
## jog 2 0 0 2
## join 3 0 0 3
## joke 11 0 0 11
## judge 0 0 0 0
## juggle 2 0 0 2
## jump 1 0 1 2
## keep 2 0 0 2
## kid 13 0 0 13
## kill 0 0 1 1
## kneel 1 0 0 1
## know 0 0 0 0
## land 1 0 0 1
## laugh 4 0 0 4
## lead 1 0 0 1
## lean 0 0 0 0
## learn 9 0 5 14
## leave 15 0 0 15
## let 1 0 2 3
## letterboxing 0 0 0 0
## level 0 0 1 1
## lie 9 0 3 12
## lift 2 0 1 3
## light up 1 0 0 1
## listen 27 0 1 28
## live 22 0 0 22
## look 83 0 9 92
## look after 0 0 1 1
## look around 1 0 0 1
## look for 7 0 4 11
## look forward 12 0 0 12
## loop up 1 0 0 1
## lose 2 0 0 2
## make 18 1 10 29
## march 1 0 0 1
## mean 1 0 0 1
## meet 13 0 0 13
## ming 0 0 0 0
## miss 6 0 1 7
## miss out 0 0 0 0
## mix 0 0 1 1
## motivate 0 0 0 0
## move 13 0 2 15
## NA 0 0 0 0
## nourish 0 0 1 1
## offer 1 0 2 3
## operate 0 0 1 1
## organise 1 0 0 1
## overchange 1 0 0 1
## overdo 0 0 0 0
## overreact 1 0 0 1
## pack 1 0 1 2
## paint 1 0 1 2
## panic 0 0 0 0
## participate 1 0 1 2
## party 1 0 0 1
## pass 5 0 0 5
## paste 0 0 0 0
## pay 4 0 1 5
## pedal 1 0 0 1
## pend 0 0 0 0
## perform 1 1 1 3
## persuade 0 0 0 0
## phone 2 0 0 2
## pick up 4 0 0 4
## pinch 0 1 1 2
## place 0 0 1 1
## plan 17 0 0 17
## play 49 1 6 56
## pollute 0 0 1 1
## pop up 0 0 2 2
## pose 1 0 0 1
## post 2 0 0 2
## pour 1 0 0 1
## practice 1 1 1 3
## practise 1 0 2 3
## prepare 6 0 2 8
## pretend 1 0 0 1
## print 1 0 0 1
## promote 0 0 0 0
## protect 0 0 0 0
## protest 0 0 1 1
## pull 1 0 2 3
## pump 1 0 0 1
## purse 1 0 0 1
## push 3 0 0 3
## put 5 0 1 6
## put off 0 0 1 1
## put on 1 0 0 1
## raft 0 0 0 0
## rain 12 0 0 12
## raise 0 0 0 0
## react 0 0 1 1
## read 16 0 2 18
## realise 0 0 1 1
## receive 0 0 2 2
## record 1 0 0 1
## recover 2 0 0 2
## recycle 0 0 0 0
## redefine 0 0 0 0
## refer 1 0 0 1
## refuse 1 0 0 1
## rehearse 1 0 0 1
## relax 1 0 0 1
## remember 1 0 0 1
## renovate 0 0 1 1
## repair 1 0 0 1
## repeat 0 0 1 1
## report 4 0 0 4
## research 0 0 0 0
## return 2 0 0 2
## revise 3 0 1 4
## reward 0 0 0 0
## ride 1 0 0 1
## ring 3 0 0 3
## rob 1 0 0 1
## roll 1 0 0 1
## ruin 3 0 0 3
## run 19 0 3 22
## save 1 0 0 1
## say 23 2 4 29
## scan 1 0 0 1
## scowl 0 0 0 0
## scream 0 0 0 0
## search 2 0 0 2
## see 5 0 3 8
## seek 0 0 1 1
## sell 2 0 1 3
## send 0 0 4 4
## set 0 0 1 1
## shake 1 0 0 1
## share 1 0 0 1
## shine 2 0 0 2
## shoot 0 0 0 0
## shop 2 0 0 2
## shout 2 0 0 2
## show 4 0 0 4
## shrug 0 0 1 1
## sing 3 0 1 4
## sit 21 0 1 22
## ski 0 0 0 0
## slam 0 0 2 2
## sleep 13 1 0 14
## slow 1 0 0 1
## smile 10 0 1 11
## smoke 1 0 0 1
## snow 5 0 0 5
## solve 0 0 0 0
## something 0 0 0 0
## sort 0 0 0 0
## sound 1 0 0 1
## speak 5 0 2 7
## spend 3 0 0 3
## split 2 0 0 2
## sponsor 1 0 0 1
## spread 0 0 1 1
## spring up 0 0 1 1
## squander 0 1 0 1
## stand 19 0 0 19
## stand up 1 0 0 1
## stare 2 0 1 3
## start 13 0 1 14
## starve 2 0 1 3
## stay 25 0 0 25
## steal 0 1 1 2
## stop 2 0 1 3
## stress 1 0 0 1
## stretch 0 0 0 0
## stroll 1 0 0 1
## struggle 1 0 0 1
## study 15 0 3 18
## suffer 1 0 3 4
## suggest 1 0 0 1
## support 1 0 0 1
## surf 4 0 0 4
## surprise 0 0 0 0
## survive 0 0 0 0
## swim 9 0 0 9
## take 26 0 8 34
## take back 1 0 0 1
## take part 0 0 1 1
## take up 0 0 1 1
## talk 76 3 9 88
## tame 0 0 0 0
## tanding 0 0 0 0
## taste 0 0 0 0
## tattoo 0 0 0 0
## teach 3 0 3 6
## tear 1 0 0 1
## tell 11 1 7 19
## test 2 0 0 2
## text 1 0 0 1
## thing 0 0 0 0
## think 26 0 1 27
## throw 2 0 0 2
## tidy 1 0 0 1
## tingle 0 0 0 0
## tip 0 0 1 1
## tire 0 0 0 0
## top 1 0 0 1
## train 1 0 4 5
## travel 8 0 3 11
## treat 1 0 0 1
## try 41 0 14 55
## turn 1 0 1 2
## turn off 1 0 0 1
## tutting 0 0 0 0
## understand 0 0 0 0
## update 1 0 1 2
## upset 0 0 1 1
## urge 0 0 1 1
## use 13 0 8 21
## vibrate 1 0 0 1
## visit 10 0 0 10
## volunteer 0 1 2 3
## wait 49 0 1 50
## wake up 0 0 0 0
## walk 27 0 0 27
## want 1 0 1 2
## wash 2 0 0 2
## waste 2 1 0 3
## watch 28 1 2 31
## wear 70 0 1 71
## welcome 0 0 0 0
## whisper 1 0 0 1
## win 1 0 0 1
## wipe 0 0 1 1
## wish 1 0 0 1
## wonder 7 0 0 7
## woo 0 0 0 0
## work 31 1 18 50
## worry 0 0 0 0
## write 16 0 2 18
## Sum 2001 28 394 2423
TCrepeated <- TCrepeated[order(TCrepeated[,4], decreasing = T),]; head(TCrepeated, 11) # The Sum row ranks first, so this displays the ten most frequent lemmas
##
## Case no unclear yes Sum
## Sum 2001 28 394 2423
## do 177 1 28 206
## go 158 2 6 166
## look 83 0 9 92
## talk 76 3 9 88
## wear 70 0 1 71
## have 53 0 5 58
## play 49 1 6 56
## try 41 0 14 55
## come 49 0 5 54
## wait 49 0 1 50
TCrepeated <- TCrepeated[1:30,] # Keep the Sum row plus the 29 most frequent lemmas
TCrepeatedp <- round(prop.table(TCrepeated, 1), 4)*200; TCrepeatedp # x200 rather than x100 because the Sum column doubles each row total
##
## Case no unclear yes Sum
## Sum 82.58 1.16 16.26 100.00
## do 85.92 0.48 13.60 100.00
## go 95.18 1.20 3.62 100.00
## look 90.22 0.00 9.78 100.00
## talk 86.36 3.40 10.22 100.00
## wear 98.60 0.00 1.40 100.00
## have 91.38 0.00 8.62 100.00
## play 87.50 1.78 10.72 100.00
## try 74.54 0.00 25.46 100.00
## come 90.74 0.00 9.26 100.00
## wait 98.00 0.00 2.00 100.00
## work 62.00 2.00 36.00 100.00
## get 68.18 2.28 29.54 100.00
## feel 85.30 0.00 14.70 100.00
## take 76.48 0.00 23.52 100.00
## watch 90.32 3.22 6.46 100.00
## make 62.06 3.44 34.48 100.00
## say 79.32 6.90 13.80 100.00
## listen 96.42 0.00 3.58 100.00
## think 96.30 0.00 3.70 100.00
## walk 100.00 0.00 0.00 100.00
## go on 76.92 3.84 19.24 100.00
## stay 100.00 0.00 0.00 100.00
## happen 78.26 0.00 21.74 100.00
## live 100.00 0.00 0.00 100.00
## run 86.36 0.00 13.64 100.00
## sit 95.46 0.00 4.54 100.00
## use 61.90 0.00 38.10 100.00
## eat 78.94 0.00 21.06 100.00
## stand 100.00 0.00 0.00 100.00
TCrepeatedp <- TCrepeatedp[order(TCrepeatedp[,3], decreasing = T),] # Order by the share of repeated ("yes") uses
TCrepeatedp <- TCrepeatedp[1:9,1:3]; TCrepeatedp # Top nine lemmas; the Sum row ranks lower and is excluded
##
## Case no unclear yes
## use 61.90 0.00 38.10
## work 62.00 2.00 36.00
## make 62.06 3.44 34.48
## get 68.18 2.28 29.54
## try 74.54 0.00 25.46
## take 76.48 0.00 23.52
## happen 78.26 0.00 21.74
## eat 78.94 0.00 21.06
## go on 76.92 3.84 19.24
fig_TC_repeated = ggplot(as.data.frame(TCrepeatedp), aes(x=Var2, y=Freq)) +
geom_col() +
facet_wrap(~Case) +
labs(x='Repeatedness function', y='Proportion of use') +
theme(axis.text.x=element_text(angle=45, hjust=1))
print(fig_TC_repeated) # Takes a few seconds
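# Alternative sketch: the same proportions can be obtained without the x200
# margin workaround by taking proportions before addmargins() (assumes TxBprog
# as loaded above):
# TCrep_raw <- table(Case = TxBprog$Lemma, TxBprog$Repeated)
# TCrep_raw <- TCrep_raw[order(rowSums(TCrep_raw), decreasing = TRUE), ][1:29, ]
# round(prop.table(TCrep_raw, 1) * 100, 2) # each row now sums to 100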
rep <- prog[, c(1, 8, 16)]; head(rep) # Corpus, Lemma and Repeated columns of the combined progressives data
## Corpus Lemma Repeated
## 1 Spoken BNC2014 sample travel no
## 2 Spoken BNC2014 sample lie unclear
## 3 Spoken BNC2014 sample attack no
## 4 Spoken BNC2014 sample pilot no
## 5 Spoken BNC2014 sample have no
## 6 Spoken BNC2014 sample lose no
rep <- table(Case = rep$Lemma, rep$Repeated)
BNCrepeated <- addmargins(BNCrepeated) # Adds Sum margins; NB: BNCrepeated already carries margins, so this duplicates them (hence the two Sum rows/columns below)
BNCrepeated <- BNCrepeated[order(BNCrepeated[,4], decreasing = T),]; head(BNCrepeated, 11)
##
## Case no unclear yes Sum Sum
## Sum 2747 394 782 3923 7846
## Sum 1679 243 501 2423 4846
## do 119 47 63 229 458
## go 135 13 25 173 346
## say 118 13 22 153 306
## think 92 4 5 101 202
## try 79 6 13 98 196
## talk 77 10 6 93 186
## get 55 10 10 75 150
## have 46 5 13 64 128
## come 46 5 4 55 110
BNCrepeated <- BNCrepeated[1:30,] # Keep the Sum rows plus the most frequent lemmas
BNCrepeatedp <- round(prop.table(BNCrepeated, 1), 4)*200; BNCrepeatedp # NB: with the duplicated Sum columns each row totals four times the true count, so these figures are half-scale percentages
##
## Case no unclear yes Sum Sum
## Sum 35.02 5.02 9.96 50.00 100.00
## Sum 34.64 5.02 10.34 50.00 100.00
## do 25.98 10.26 13.76 50.00 100.00
## go 39.02 3.76 7.22 50.00 100.00
## say 38.56 4.24 7.18 50.00 100.00
## think 45.54 1.98 2.48 50.00 100.00
## try 40.30 3.06 6.64 50.00 100.00
## talk 41.40 5.38 3.22 50.00 100.00
## get 36.66 6.66 6.66 50.00 100.00
## have 35.94 3.90 10.16 50.00 100.00
## come 41.82 4.54 3.64 50.00 100.00
## make 34.52 7.14 8.34 50.00 100.00
## work 23.18 1.22 25.60 50.00 100.00
## be 29.42 5.88 14.70 50.00 100.00
## look 41.66 3.34 5.00 50.00 100.00
## take 32.14 7.14 10.72 50.00 100.00
## tell 29.62 9.26 11.12 50.00 100.00
## play 33.34 2.08 14.58 50.00 100.00
## sit 43.48 2.18 4.34 50.00 100.00
## look for 40.90 0.00 9.10 50.00 100.00
## use 10.00 10.00 30.00 50.00 100.00
## watch 25.00 5.00 20.00 50.00 100.00
## pay 5.26 2.64 42.10 50.00 100.00
## go on 44.44 2.78 2.78 50.00 100.00
## run 32.36 2.94 14.70 50.00 100.00
## start 50.00 0.00 0.00 50.00 100.00
## wait 47.06 2.94 0.00 50.00 100.00
## live 50.00 0.00 0.00 50.00 100.00
## put 23.34 6.66 20.00 50.00 100.00
## record 43.34 3.34 3.34 50.00 100.00
BNCrepeatedp <- BNCrepeatedp[order(BNCrepeatedp[,3], decreasing = T),] # Order by the share of repeated ("yes") uses
BNCrepeatedp <- BNCrepeatedp[1:12,1:3]; BNCrepeatedp
##
## Case no unclear yes
## pay 5.26 2.64 42.10
## use 10.00 10.00 30.00
## work 23.18 1.22 25.60
## watch 25.00 5.00 20.00
## put 23.34 6.66 20.00
## be 29.42 5.88 14.70
## run 32.36 2.94 14.70
## play 33.34 2.08 14.58
## do 25.98 10.26 13.76
## tell 29.62 9.26 11.12
## take 32.14 7.14 10.72
## Sum 34.64 5.02 10.34
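# NB (sketch): because the margins were duplicated above, this second table is
# at half scale (cf. the corresponding table earlier in this script, where e.g.
# pay = 10.52 / 5.26 / 84.22). A margin-free recomputation would be:
# BNCrep_raw <- table(Case = BNCprog$Lemma, BNCprog$Repeated)
# top30 <- head(order(rowSums(BNCrep_raw), decreasing = TRUE), 30)
# BNCrep_p <- round(prop.table(BNCrep_raw[top30, ], 1) * 100, 2)
# BNCrep_p[order(BNCrep_p[, "yes"], decreasing = TRUE), ][1:12, ]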
continuous <- table(prog$Corpus, prog$Continuous); continuous
##
## no unclear yes
## Spoken BNC2014 sample 518 74 1831
## Textbook Conversation 275 6 2142
prop.table(continuous, 1)*100
##
## no unclear yes
## Spoken BNC2014 sample 21.3784565 3.0540652 75.5674783
## Textbook Conversation 11.3495667 0.2476269 88.4028064
#tiff(here("Prog_Continuous.tiff"), height = 18, width = 20, units="cm", compression = "lzw", res = 300)
par(cex = 1.6, cex.main = 0.6, cex.lab = 1, cex.axis = 1.5, cex.sub = 0.5)
plot(continuous, shade = TRUE, main = "Progressive function: Continuousness")
dev.off() # Close the graphics device (only strictly needed when the tiff() line above is uncommented)
## null device
## 1
continuous1 <- continuous[,c(1,3)] # Drop the "unclear" category
chisq.test(continuous1) # Significant association at p < 0.001
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: continuous1
## X-squared = 97.089, df = 1, p-value < 2.2e-16
vcd::assocstats(continuous1) # Cramer's V is 0.143
## X^2 df P(> X^2)
## Likelihood Ratio 99.074 1 0
## Pearson 97.857 1 0
##
## Phi-Coefficient : 0.143
## Contingency Coeff.: 0.142
## Cramer's V : 0.143
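# Sanity check (sketch): for a 2x2 table, Cramer's V equals the phi
# coefficient, i.e. sqrt(X^2 / N) with the uncorrected Pearson statistic:
# X2 <- chisq.test(continuous1, correct = FALSE)$statistic
# sqrt(X2 / sum(continuous1)) # sqrt(97.857 / 4766) = 0.143, as reported above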
## Which lemmas occur most frequently in either corpus with a non-continuous function? ##
noncont_lemmas <- prog[prog$Continuous=="no", c(1,8)]
noncont_lemmas_TxB <- prog[prog$Continuous=="no" & prog$Corpus=="Textbook Conversation", 8]
table_noncont_lemmas_TxB <- sort(table(noncont_lemmas_TxB), decreasing = T)
head(table_noncont_lemmas_TxB, 20)
## noncont_lemmas_TxB
## go come ask leave say start buy take get joke
## 52 16 9 9 9 9 8 7 6 6
## meet tell call do have kid pick up send fly let
## 6 6 4 4 4 4 4 4 3 3
noncont_lemmas_BNC <- prog[prog$Continuous=="no" & prog$Corpus=="Spoken BNC2014 sample", 8]
table_noncont_lemmas_BNC <- sort(table(noncont_lemmas_BNC), decreasing = T)
head(table_noncont_lemmas_BNC, 20)
## noncont_lemmas_BNC
## say go get come have take tell buy
## 77 76 36 31 21 17 14 12
## pay put leave be do start ask make
## 12 12 7 6 6 6 5 5
## think call come out give
## 5 4 4 4
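# Comparison sketch: the two frequency lists can be aligned in a single data
# frame (assumes the tables computed above):
# noncont_compare <- merge(as.data.frame(table_noncont_lemmas_TxB),
#                          as.data.frame(table_noncont_lemmas_BNC),
#                          by = 1, all = TRUE)
# names(noncont_compare) <- c("Lemma", "Textbook", "BNC")
# head(noncont_compare[order(-noncont_compare$BNC), ], 10)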
## Percentage of continuous uses for specific lemmas
BNCPC <- BNCprog[BNCprog$Continuous!="unclear",]
prop.table(table(BNCPC$Lemma=="tell", BNCPC$Continuous), 1)*100 # TRUE row = "tell"
##
## no unclear yes
## FALSE 21.70543 0.00000 78.29457
## TRUE 51.85185 0.00000 48.14815
TCPC <- TxBprog[TxBprog$Continuous!="unclear",]
prop.table(table(TCPC$Lemma=="tell", TCPC$Continuous), 1)*100
##
## no unclear yes
## FALSE 11.21768 0.00000 88.78232
## TRUE 31.57895 0.00000 68.42105
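# Convenience sketch (hypothetical helper, not part of the original analysis):
# the same breakdown for any lemma in a progressive data frame
# cont_share <- function(df, lemma) {
#   round(prop.table(table(df$Lemma == lemma, df$Continuous), 1) * 100, 2)
# }
# cont_share(BNCPC, "tell") # reproduces the BNC table above (rounded)
# cont_share(TCPC, "tell")  # and the Textbook Conversation one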
functions1 <- table(prog$Corpus, prog$Extra.function); functions1 # NB: table() drops the many unannotated (NA) cases; the stray level "politeness or softening" (one token) is an annotation inconsistency
##
## emphasis/shock framing gradual change
## Spoken BNC2014 sample 40 45 66
## Textbook Conversation 37 98 47
##
## other interesting function politeness/softening
## Spoken BNC2014 sample 3 8
## Textbook Conversation 3 4
##
## politeness or softening
## Spoken BNC2014 sample 0
## Textbook Conversation 1
round((functions1/2423*100),2) # As a percentage of the 2,423 progressives per (sub)corpus
##
## emphasis/shock framing gradual change
## Spoken BNC2014 sample 1.65 1.86 2.72
## Textbook Conversation 1.53 4.04 1.94
##
## other interesting function politeness/softening
## Spoken BNC2014 sample 0.12 0.33
## Textbook Conversation 0.12 0.17
##
## politeness or softening
## Spoken BNC2014 sample 0.00
## Textbook Conversation 0.04
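# Possible cleanup sketch (not applied here, so the tables below keep the
# original levels): merge the stray "politeness or softening" level into
# "politeness/softening" before tabulating:
# levels(prog$Extra.function)[levels(prog$Extra.function) == "politeness or softening"] <- "politeness/softening"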
framing <- table(prog$Corpus, prog$Extra.function=="framing"); framing # NB: FALSE covers only progressives annotated with some other extra function, since NA cases are dropped
##
## FALSE TRUE
## Spoken BNC2014 sample 117 45
## Textbook Conversation 92 98
chisq.test(framing) # Significant at p < 0.001
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: framing
## X-squared = 19.562, df = 1, p-value = 9.741e-06
vcd::assocstats(framing) # phi-coefficient = 0.242
## X^2 df P(> X^2)
## Likelihood Ratio 20.888 1 4.8700e-06
## Pearson 20.536 1 5.8506e-06
##
## Phi-Coefficient : 0.242
## Contingency Coeff.: 0.235
## Cramer's V : 0.242
emphasis <- table(prog$Corpus, prog$Extra.function=="emphasis/shock"); emphasis
##
## FALSE TRUE
## Spoken BNC2014 sample 122 40
## Textbook Conversation 153 37
chisq.test(emphasis) # Not significant
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: emphasis
## X-squared = 1.1044, df = 1, p-value = 0.2933
vcd::assocstats(emphasis)
## X^2 df P(> X^2)
## Likelihood Ratio 1.3893 1 0.23853
## Pearson 1.3930 1 0.23790
##
## Phi-Coefficient : 0.063
## Contingency Coeff.: 0.063
## Cramer's V : 0.063
change <- table(prog$Corpus,prog$Extra.function=="gradual change"); change
##
## FALSE TRUE
## Spoken BNC2014 sample 96 66
## Textbook Conversation 143 47
chisq.test(change) # Significant at p < 0.005
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: change
## X-squared = 9.554, df = 1, p-value = 0.001995
vcd::assocstats(change) # Phi-Coefficient = 0.171
## X^2 df P(> X^2)
## Likelihood Ratio 10.283 1 0.0013423
## Pearson 10.275 1 0.0013484
##
## Phi-Coefficient : 0.171
## Contingency Coeff.: 0.168
## Cramer's V : 0.171
politeness <- table(prog$Corpus, prog$Extra.function=="politeness/softening"); politeness # NB: the single "politeness or softening" token is counted as FALSE here
##
## FALSE TRUE
## Spoken BNC2014 sample 154 8
## Textbook Conversation 186 4
chisq.test(politeness) # Not significant
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: politeness
## X-squared = 1.3578, df = 1, p-value = 0.2439
vcd::assocstats(politeness) # Phi coefficient = 0.078
## X^2 df P(> X^2)
## Likelihood Ratio 2.1458 1 0.14296
## Pearson 2.1313 1 0.14432
##
## Phi-Coefficient : 0.078
## Contingency Coeff.: 0.078
## Cramer's V : 0.078
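# Refactoring sketch: the four tests above could also be run in a single loop
# (assumes the same binarised tables as above):
# for (f in c("framing", "emphasis/shock", "gradual change", "politeness/softening")) {
#   tab <- table(prog$Corpus, prog$Extra.function == f)
#   cat(f, ": p =", signif(chisq.test(tab)$p.value, 3),
#       "| Cramer's V =", round(vcd::assocstats(tab)$cramer, 3), "\n")
# }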
#packages.bib <- sapply(1:length(loadedNamespaces()), function(i) toBibtex(citation(loadedNamespaces()[i])))
knitr::write_bib(c(.packages(), "knitr"), "packages.bib")
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] gtsummary_1.5.0 vcd_1.4-8 tidyr_1.1.4 RColorBrewer_1.1-2
## [5] lsr_0.5 lattice_0.20-41 gridExtra_2.3 ggplot2_3.3.5
## [9] here_1.0.1 dplyr_1.0.7
##
## loaded via a namespace (and not attached):
## [1] zoo_1.8-9 tidyselect_1.1.1 xfun_0.29
## [4] bslib_0.3.1 purrr_0.3.4 colorspace_2.0-2
## [7] vctrs_0.3.8 generics_0.1.1 htmltools_0.5.2
## [10] yaml_2.2.1 utf8_1.2.2 rlang_0.4.12
## [13] jquerylib_0.1.4 pillar_1.6.4 glue_1.6.0
## [16] withr_2.4.3 DBI_1.1.1 lifecycle_1.0.1
## [19] stringr_1.4.0 commonmark_1.7 munsell_0.5.0
## [22] gtable_0.3.0 evaluate_0.14 labeling_0.4.2
## [25] knitr_1.37 forcats_0.5.1 fastmap_1.1.0
## [28] lmtest_0.9-38 fansi_0.5.0 highr_0.9
## [31] broom_0.7.9 backports_1.4.1 checkmate_2.0.0
## [34] scales_1.1.1 jsonlite_1.7.2 farver_2.1.0
## [37] digest_0.6.29 stringi_1.7.6 rprojroot_2.0.2
## [40] cli_3.1.0 tools_4.0.3 magrittr_2.0.1
## [43] sass_0.4.0 tibble_3.1.6 crayon_1.4.2
## [46] pkgconfig_2.0.3 ellipsis_0.3.2 MASS_7.3-53.1
## [49] broom.helpers_1.4.0 assertthat_0.2.1 gt_0.3.1
## [52] rmarkdown_2.11 rstudioapi_0.13 R6_2.5.1
## [55] compiler_4.0.3