This script is part of the Online Appendix to my PhD thesis.

Please cite as: Le Foll, Elen. 2022. Textbook English: A Corpus-Based Analysis of the Language of EFL textbooks used in Secondary Schools in France, Germany and Spain. PhD thesis. Osnabrück University.

For more information, see: https://elenlefoll.github.io/TextbookEnglish/

Please note that the plot dimensions in this notebook have been optimised for the print version of the thesis.

Set-up

Built with R 4.0.3

knitr::opts_chunk$set(echo = TRUE)

library(dplyr)
library(here)
library(ggplot2)
library(gridExtra)
library(lattice)
library(lsr)
library(RColorBrewer)
library(tidyr)
library(vcd)

Data Preparation

annotation_file <- read.csv(here("YouthFiction_prog_conc_anno.csv"), sep = "\t", header = TRUE, na.strings="", stringsAsFactors = TRUE)
glimpse(annotation_file)
## Rows: 2,364
## Columns: 16
## $ Book           <fct> book185, book183, book109, book277, book165, book73, bo…
## $ Concordance1   <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Concor.dance2  <fct> "was sitting", "was going", "was trying", "be swimming …
## $ Concordance3   <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Lemma          <fct> sit, go, try, swim, cry out, talk about, think, scorch,…
## $ X.             <fct> NA, NA, NA, NA, NA, c, NA, NA, NA, NA, NA, c, NA, NA, N…
## $ NOT            <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Tense          <fct> past, GOING TO, past, NOT progressive, past, present, p…
## $ Voice          <fct> A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A…
## $ X..1           <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, q, NA, …
## $ Time.reference <fct> past, past, past, modal/infinitive, past, present, past…
## $ Continuous     <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, …
## $ Repeated       <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no,…
## $ Extra.function <fct> NA, NA, framing, NA, NA, NA, NA, NA, NA, NA, NA, NA, em…
## $ Comment        <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Annotator      <fct> T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T…
# Select only annotated concordance lines
#annotated <- annotation_file
annotated <- annotation_file[1:2050,] # Restrict the data to the manually annotated concordance lines (as of 11.08.2019, only the first 2,053 lines had been annotated); the filtering below narrows this down to 1,517 genuine progressives.
# Extract only progressives
YF_prog <- annotated[annotated$Tense != "NOT progressive" & annotated$Tense != "GOING TO" & annotated$Tense !="catenative", ]

#YF_prog$Annotator <- as.character(YF_prog$Annotator) # Only necessary in files annotated exclusively by Tatjana: a column containing nothing but "T" is otherwise parsed as logical TRUE, so the vector becomes a logical vector rather than a factor.
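This pitfall is easy to reproduce (a minimal sketch with made-up inline data, not part of the analysis):

demo <- read.csv(text = "Annotator\nT\nT", stringsAsFactors = TRUE)
class(demo$Annotator) # "logical": a column consisting solely of "T" is parsed as TRUE
demo <- read.csv(text = "Annotator\nT\nT", colClasses = "character")
class(demo$Annotator) # "character", as intended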

YF_prog$Tense <- droplevels(YF_prog$Tense) # Drop unused levels
YF_prog$Time.reference <- droplevels(YF_prog$Time.reference) # Drop unused levels
YF_prog$Lemma <- droplevels(YF_prog$Lemma) # Drop unused levels

YF_prog$Tense <- factor(YF_prog$Tense, levels = c("past", "perfect", "present", "modal")) # Re-order levels
YF_prog$Time.reference <- factor(YF_prog$Time.reference, levels = c("past", "past/present", "present", "future", "general", "hypothetical", "unclear"))

nrow(YF_prog) # Number of progressives in Youth Fiction sampled
## [1] 1517
nrow(YF_prog)/nrow(annotated) # Ratio of genuine progressives in progressive CQL query
## [1] 0.74
# Before saving an annotation file as .csv, make sure to delete all trailing inverted commas, as they are interpreted as string delimiters. This can easily be done with a regex search-and-replace in Calc: replace "$ with nothing.
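Alternatively, the trailing inverted commas can be stripped in R itself before parsing (a sketch; the "_clean" output file name is hypothetical):

#raw <- readLines(here("YouthFiction_prog_conc_anno.csv"))
#writeLines(gsub('"$', "", raw), here("YouthFiction_prog_conc_anno_clean.csv"))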
annotation_file_TxB <- read.csv(here("TxB_narrative_prog_conc_anno.csv"), sep = "\t", header = TRUE, na.strings="", stringsAsFactors = TRUE)
glimpse(annotation_file_TxB)
## Rows: 1,855
## Columns: 18
## $ Register       <fct> narrative, narrative, narrative, narrative, narrative, …
## $ Level          <fct> A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A…
## $ Textbook       <fct> Hi there, Hi there, Hi there, Join the Team, Join the T…
## $ Concordance1   <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Concor.dance2  <fct> "'re going", "'re going", "'re going", "are discussing"…
## $ Concordance3   <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Lemma          <fct> go, go, go, discuss, go, sit, read, run, go, wear, go, …
## $ X.             <fct> c, c, c, NA, NA, NA, NA, NA, c, c, NA, NA, c, c, c, c, …
## $ NOT            <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Tense          <fct> GOING TO, GOING TO, GOING TO, present, GOING TO, presen…
## $ Voice          <fct> A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A…
## $ X..1           <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Time.reference <fct> present, present, present, present, present, present, p…
## $ Continuous     <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, …
## $ Repeated       <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no,…
## $ Extra.function <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, gradual…
## $ Comment        <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Annotator      <fct> E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E…
# Select only annotated concordance lines
annotated_TxB <- annotation_file_TxB
# Extract only progressives
TxBNarProg <- filter(annotated_TxB, Tense != "NOT progressive" & Tense != "GOING TO" & Tense !="catenative")

TxBNarProg$Tense <- droplevels(TxBNarProg$Tense) # Drop unused levels
TxBNarProg$Time.reference <- droplevels(TxBNarProg$Time.reference) # Drop unused levels
TxBNarProg$Lemma <- droplevels(TxBNarProg$Lemma) # Drop unused levels

TxBNarProg$Tense <- factor(TxBNarProg$Tense, levels = c("past", "perfect", "present", "modal")) # Re-order levels
TxBNarProg$Time.reference <- factor(TxBNarProg$Time.reference, levels = c("past", "past/present", "present", "future", "general", "hypothetical", "unclear"))

nrow(TxBNarProg) # Number of progressives in Textbook Fiction
## [1] 1517
nrow(TxBNarProg)/nrow(annotation_file_TxB) # Ratio of genuine progressives in progressive query
## [1] 0.8177898
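If a confidence interval around these precision estimates is desired, a simple binomial test provides one (a sketch using base R):

binom.test(nrow(TxBNarProg), nrow(annotation_file_TxB))$conf.int # 95% CI for the query precision in Textbook Fiction
binom.test(nrow(YF_prog), nrow(annotated))$conf.int # 95% CI for the query precision in the Youth Fiction sample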
FictionProg <- bind_rows("Youth Fiction Sampled" = YF_prog, "Textbook Fiction" = TxBNarProg, .id = "Corpus")
glimpse(FictionProg)
## Rows: 3,034
## Columns: 20
## $ Corpus         <chr> "Youth Fiction Sampled", "Youth Fiction Sampled", "Yout…
## $ Book           <fct> book185, book109, book165, book73, book109, book55, boo…
## $ Concordance1   <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Concor.dance2  <fct> "was sitting", "was trying", "were crying", "'re talkin…
## $ Concordance3   <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Lemma          <fct> sit, try, cry out, talk about, think, say, go, wait, be…
## $ X.             <fct> NA, NA, NA, c, NA, NA, NA, NA, c, NA, NA, NA, NA, NA, N…
## $ NOT            <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, n, NA, NA, NA, NA, …
## $ Tense          <fct> past, past, past, present, past, present, present, perf…
## $ Voice          <fct> A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A…
## $ X..1           <fct> NA, NA, NA, NA, NA, q, NA, NA, q, NA, NA, NA, q, NA, NA…
## $ Time.reference <fct> past, past, past, present, past, present, future, hypot…
## $ Continuous     <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, no, yes, y…
## $ Repeated       <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no,…
## $ Extra.function <fct> NA, framing, NA, NA, NA, emphasis/shock, NA, NA, NA, NA…
## $ Comment        <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "progr. seems t…
## $ Annotator      <fct> T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T…
## $ Register       <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Level          <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Textbook       <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
FictionProg <- FictionProg %>% dplyr::rename(Contraction = X., Question = X..1, Negation = NOT) # Rename variables
glimpse(FictionProg)
## Rows: 3,034
## Columns: 20
## $ Corpus         <chr> "Youth Fiction Sampled", "Youth Fiction Sampled", "Yout…
## $ Book           <fct> book185, book109, book165, book73, book109, book55, boo…
## $ Concordance1   <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Concor.dance2  <fct> "was sitting", "was trying", "were crying", "'re talkin…
## $ Concordance3   <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Lemma          <fct> sit, try, cry out, talk about, think, say, go, wait, be…
## $ Contraction    <fct> NA, NA, NA, c, NA, NA, NA, NA, c, NA, NA, NA, NA, NA, N…
## $ Negation       <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, n, NA, NA, NA, NA, …
## $ Tense          <fct> past, past, past, present, past, present, present, perf…
## $ Voice          <fct> A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A…
## $ Question       <fct> NA, NA, NA, NA, NA, q, NA, NA, q, NA, NA, NA, q, NA, NA…
## $ Time.reference <fct> past, past, past, present, past, present, future, hypot…
## $ Continuous     <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, no, yes, y…
## $ Repeated       <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no,…
## $ Extra.function <fct> NA, framing, NA, NA, NA, emphasis/shock, NA, NA, NA, NA…
## $ Comment        <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "progr. seems t…
## $ Annotator      <fct> T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T…
## $ Register       <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Level          <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Textbook       <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
# Replace NA with meaningful levels
summary(as.factor(FictionProg$Negation))
##    n NA's 
##  128 2906
FictionProg$Negation <- tidyr::replace_na(as.character(FictionProg$Negation), "positive") # positive
summary(as.factor(FictionProg$Contraction))
##    c NA's 
##  636 2398
FictionProg$Contraction <- tidyr::replace_na(as.character(FictionProg$Contraction), "full") # full
summary(as.factor(FictionProg$Question))
##       q unclear    NA's 
##     238       1    2795
FictionProg$Question <- tidyr::replace_na(as.character(FictionProg$Question), "statement") # statement
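The three replacements above can also be performed in a single step (an equivalent sketch using tidyr::replace_na() with a named list):

#FictionProg <- FictionProg %>%
#  dplyr::mutate(dplyr::across(c(Negation, Contraction, Question), as.character)) %>%
#  tidyr::replace_na(list(Negation = "positive", Contraction = "full", Question = "statement"))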

summary(as.factor(FictionProg$Level))
##    A    B    C    D    E NA's 
##  167  353  313  349  335 1517
FictionProg$Level <- tidyr::replace_na(as.character(FictionProg$Level), "Youth Fiction") # Youth Fiction

summary(as.factor(FictionProg$Book))
##  book16 book109  book75 book192  book98 book163 book168  book17  book19  book95 
##      20      16      16      14      13      12      12      12      12      12 
## book172 book182 book226   book5   book9  book10 book125 book204 book208 book230 
##      11      11      11      11      11      10      10      10      10      10 
## book251  book41 book126 book132 book137 book175 book179  book18 book183  book24 
##      10      10       9       9       9       9       9       9       9       9 
## book271 book284 book295 book298  book73  book91  book92 book100 book113 book127 
##       9       9       9       9       9       9       9       8       8       8 
## book136 book142 book151 book152 book166 book195 book209 book223 book244 book269 
##       8       8       8       8       8       8       8       8       8       8 
##  book46  book49  book53  book55  book58  book65  book74  book76  book79  book84 
##       8       8       8       8       8       8       8       8       8       8 
##  book96  book12 book122 book123 book169 book185 book188 book207  book22 book228 
##       8       7       7       7       7       7       7       7       7       7 
## book231 book239 book259 book263 book264  book27  book28  book37  book60  book68 
##       7       7       7       7       7       7       7       7       7       7 
##  book81  book90   book1 book107 book114 book119 book120 book129 book133 book134 
##       7       7       6       6       6       6       6       6       6       6 
## book146 book180   book2 book206 book210 book215 book221 book227 (Other)    NA's 
##       6       6       6       6       6       6       6       6     683    1517
FictionProg$Book <- tidyr::replace_na(as.character(FictionProg$Book), "Textbook Narrative")
plot(as.factor(FictionProg$Book[2:620])) # Number of concordances per book from YF corpus

# Make the following variables into factors
factor_cols <- c("Corpus", "Negation", "Contraction", "Question", "Repeated", "Lemma", "Voice", "Level")
FictionProg[factor_cols] <- lapply(FictionProg[factor_cols], as.factor)

levels(FictionProg$Negation)[levels(FictionProg$Negation)=="n"] <- "negated"
levels(FictionProg$Contraction)[levels(FictionProg$Contraction)=="c"] <- "contract."

FictionProg$Corpus <- relevel(FictionProg$Corpus, "Youth Fiction Sampled") # This is so that the plots are in the same order as the Textbook Conversation vs. Spoken BNC ones. E.g. reference corpus on the left.

summary(FictionProg$Corpus) # Number of annotated progressive concordances in each subcorpus
## Youth Fiction Sampled      Textbook Fiction 
##                  1517                  1517
#saveRDS(FictionProg, file = here("FictionProg.rds")) # Last saved on 25.08.2019

#FictionProg <- readRDS(here("FictionProg.rds"))

Proportion of progressives

# Textbook Fiction #
TF_words <- 219040 # Word counts rather than token counts, since the Spoken BNC lacks punctuation (which would distort token-based comparisons)
TF_verbs <- 42302 # All verb forms (e.g. "have been drinking" = 3 verb forms), calculated using a Sketch Engine (SE) query
TF_verb_phrases <- 35148 # As calculated with Extract_verb_lemmas_spacy
TF_prog <- nrow(TxBNarProg) # 1517 # After manual annotation, thus excluding GOING TO + inf.
TF_ProgVerbPhrase_ratio <- TF_prog/(TF_verb_phrases - TF_prog - (nrow(filter(annotated_TxB, Tense=="GOING TO"))))*10000; TF_ProgVerbPhrase_ratio # 453.36 per 10,000 finite verb phrases
## [1] 453.3636
# Youth Fiction sampled
YFs_words <- 8328976
YFs_verbs <- 1792111 # From SE query
YFs_progCQL <- 71140 # From SE query
# Ratio of genuine progressives in progressive query for Youth Fiction sampled
YFsratio_prog_CQL <- nrow(YF_prog)/nrow(annotated)
YFstotal_prog <- YFsratio_prog_CQL*YFs_progCQL # Approximate total number of progressives across Youth Fiction sampled
YFs_verb_phrases <- 1328349 # From spacy script
YFs_ProgVerbPhrase_ratio <- YFstotal_prog/(YFs_verb_phrases - YFstotal_prog - (nrow(filter(annotation_file, Tense=="GOING TO"))))*10000; YFs_ProgVerbPhrase_ratio # 412.74 per 10,000 finite verb phrases
## [1] 412.7364
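Since the same ratio is computed for both corpora, it can be wrapped in a small helper function (a sketch that merely restates the two calculations above):

F_coefficient <- function(n_prog, n_verb_phrases, n_going_to) {
  # Progressives per 10,000 finite verb phrases, excluding the progressives
  # themselves and the GOING TO futures from the non-progressive base
  n_prog / (n_verb_phrases - n_prog - n_going_to) * 10000
}
F_coefficient(TF_prog, TF_verb_phrases, nrow(filter(annotated_TxB, Tense == "GOING TO"))) # 453.36
F_coefficient(YFstotal_prog, YFs_verb_phrases, nrow(filter(annotation_file, Tense == "GOING TO"))) # 412.74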
# Comparing proportions of verbs and progressives #

TF <- as.vector(c(TF_words, TF_verbs, TF_verb_phrases, TF_prog))
YFs <- as.vector(c(YFs_words, YFs_verbs, YFs_verb_phrases, YFstotal_prog))
#YFs_sample <- round(YFs*(TF_words/YFs_words),0) # based on total number of words (from SE)
YFs_sample <- round(YFs*(21530/727487),0) # Scaled by the ratio of total sentence counts (21,530 in Textbook Fiction vs. 727,487 in the Youth Fiction sample, from SE)
prog_ratio_fiction <- rbind(TF, YFs_sample)
rownames(prog_ratio_fiction) <- c("Textbook Fiction", "Youth Fiction sample")
colnames(prog_ratio_fiction) <- c("Words", "Verb forms", "Finite verb phrases", "Progressives")
prog_ratio_fiction <- as.data.frame(prog_ratio_fiction); prog_ratio_fiction
##                       Words Verb forms Finite verb phrases Progressives
## Textbook Fiction     219040      42302               35148         1517
## Youth Fiction sample 246496      53038               39313         1558
prog_ratio_fiction$Non_prog <- prog_ratio_fiction$`Finite verb phrases` - prog_ratio_fiction$Progressives
prog_ratio_fiction$FProgRatio <- round(c(TF_ProgVerbPhrase_ratio, YFs_ProgVerbPhrase_ratio), 2)
colnames(prog_ratio_fiction) <- c("Words", "Verb forms", "Finite verb phrases", "Progressives", "Non-progressives","F-coefficients of progressives")
prog_ratio_fiction
##                       Words Verb forms Finite verb phrases Progressives
## Textbook Fiction     219040      42302               35148         1517
## Youth Fiction sample 246496      53038               39313         1558
##                      Non-progressives F-coefficients of progressives
## Textbook Fiction                33631                         453.36
## Youth Fiction sample            37755                         412.74
#clipr::write_clip(prog_ratio_fiction)
#saveRDS(prog_ratio_fiction, file = here("prog_ratio_fiction.rds"))


# Significance testing and approximation of effect sizes #
# http://rcompanion.org/handbook/H_03.html 

prog_ratio_fiction1 <- prog_ratio_fiction[,5:4]; prog_ratio_fiction1 
##                      Non-progressives Progressives
## Textbook Fiction                33631         1517
## Youth Fiction sample            37755         1558
chisq.test(prog_ratio_fiction1) 
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  prog_ratio_fiction1
## X-squared = 5.7508, df = 1, p-value = 0.01648
vcd::assocstats(t(prog_ratio_fiction1)) # Phi-Coefficient : 0.009
##                     X^2 df P(> X^2)
## Likelihood Ratio 5.8324  1 0.015734
## Pearson          5.8396  1 0.015670
## 
## Phi-Coefficient   : 0.009 
## Contingency Coeff.: 0.009 
## Cramer's V        : 0.009
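As a quick sanity check, the phi coefficient can also be derived by hand from the uncorrected chi-squared statistic via phi = sqrt(X^2/N):

sqrt(chisq.test(prog_ratio_fiction1, correct = FALSE)$statistic / sum(prog_ratio_fiction1)) # ~0.009, matching assocstats()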
fisher.test(prog_ratio_fiction1) # Fisher's exact test as an alternative for this 2x2 table; it also yields an odds ratio
## 
##  Fisher's Exact Test for Count Data
## 
## data:  prog_ratio_fiction1
## p-value = 0.01647
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.8505462 0.9840351
## sample estimates:
## odds ratio 
##  0.9148458
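Note that, given the column order (non-progressives, progressives), the odds ratio of 0.91 means that the odds of a non-progressive as opposed to a progressive verb phrase are slightly lower in Textbook Fiction than in the Youth Fiction sample, i.e. progressives are proportionally somewhat more frequent in Textbook Fiction, in line with the F-coefficients above.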
verb_ratio_fiction <- prog_ratio_fiction[,1:2]; verb_ratio_fiction
##                       Words Verb forms
## Textbook Fiction     219040      42302
## Youth Fiction sample 246496      53038
chisq.test(verb_ratio_fiction) # Highly significant
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  verb_ratio_fiction
## X-squared = 228.56, df = 1, p-value < 2.2e-16
vcd::assocstats(t(verb_ratio_fiction)) 
##                     X^2 df P(> X^2)
## Likelihood Ratio 229.10  1        0
## Pearson          228.67  1        0
## 
## Phi-Coefficient   : 0.02 
## Contingency Coeff.: 0.02 
## Cramer's V        : 0.02
prog_ratio_spoken <- readRDS(file = here("prog_ratio_spoken.rds"))
prog_ratio_fiction <- readRDS(file = here("prog_ratio_fiction.rds"))

prog_ratios <- rbind(prog_ratio_spoken, prog_ratio_fiction); prog_ratios
##                         Words Verb forms Finite verb phrases Progressives
## Textbook Conversation  420130      80106               64292         2423
## Spoken BNC 2014 Sample 420130      98761               69521         3724
## Textbook Fiction       219040      42302               35148         1517
## Youth Fiction sample   246496      53038               39313         1555
##                        Non-progressives F-coefficients of progressives
## Textbook Conversation             61869                         395.48
## Spoken BNC 2014 Sample            65797                         566.09
## Textbook Fiction                  33631                         453.36
## Youth Fiction sample              37758                         411.98
prog_ratios <- prog_ratios[c(1,3), c(4:5)]; prog_ratios
##                       Progressives Non-progressives
## Textbook Conversation         2423            61869
## Textbook Fiction              1517            33631
chisq.test(prog_ratios)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  prog_ratios
## X-squared = 17.744, df = 1, p-value = 2.527e-05
dev.off()
## null device 
##           1
vcd::assoc(prog_ratios)
vcd::assocstats(t(prog_ratios))
##                     X^2 df   P(> X^2)
## Likelihood Ratio 17.673  1 2.6233e-05
## Pearson          17.888  1 2.3431e-05
## 
## Phi-Coefficient   : 0.013 
## Contingency Coeff.: 0.013 
## Cramer's V        : 0.013
prop.test(t(prog_ratios))
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  t(prog_ratios)
## X-squared = 17.744, df = 1, p-value = 2.527e-05
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.04849354 -0.01724308
## sample estimates:
##    prop 1    prop 2 
## 0.6149746 0.6478429
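With the transposed table, prop 1 and prop 2 are the shares of Textbook Conversation among the progressives (2423/3940 = 0.61) and among the non-progressives (61869/95500 = 0.65) respectively; the significantly lower first share confirms that progressives are proportionally less frequent in Textbook Conversation than in Textbook Fiction.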

Tenses

# Mosaic plot of tense distribution across corpora
# Select the colors that will be used
#RColorBrewer::display.brewer.all(colorblindFriendly = T)
# We will use the first 4 colours of the OrRd palette
cols<-RColorBrewer::brewer.pal(n=4,name="OrRd")

#tiff(here("mosaic_tenseforms_nar.tiff"), height = 30, width = 35, units="cm", compression = "lzw", res = 300)
par(cex = 2, cex.main = 0.85, font = 2)
plot(FictionProg$Tense ~ FictionProg$Corpus, col = cols, xlab = "", ylab = "Progressive tense forms", main = "Distribution of progressive tense forms in Textbook Fiction and the Youth Fiction sample")

dev.off()
## null device 
##           1
# Differentiated look at textbook levels #
#tiff(here("mosaic_tenseforms_levels_nar.tiff"), height = 15, width = 25, units="cm", compression = "lzw", res = 300)
plot(FictionProg$Tense ~ FictionProg$Level, col = cols, xlab = "                                                                                             Textbook Levels", ylab = "Progressive tense forms")
dev.off()
## null device 
##           1
# Barplot with % of occurrences

tenses <- table(FictionProg$Tense, FictionProg$Corpus)
class(tenses) # Produces a lovely simple table
## [1] "table"
tenses
##          
##           Youth Fiction Sampled Textbook Fiction
##   past                      851              792
##   perfect                   102               67
##   present                   492              635
##   modal                      72               23
tensesp <- t((prop.table(tenses, margin = 2)*100)) # Work out percentages and transpose table
tensesp <-round(tensesp, 2)

barplot(tensesp, beside = TRUE, ylab = "% of occurrences", xlab = "Forms of the progressive", ylim = c(0,100), col = c("darkblue", "darkred"))
legend("topright", fill = c("darkblue", "darkred"), c("Youth Reference", "Textbook Fiction"))

vcd::assocstats(tenses) # Measuring the effect size. Cramer's V is what we're interested in (Levshina p. 217)
##                     X^2 df   P(> X^2)
## Likelihood Ratio 54.148  3 1.0434e-11
## Pearson          52.786  3 2.0369e-11
## 
## Phi-Coefficient   : NA 
## Contingency Coeff.: 0.131 
## Cramer's V        : 0.132
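The same value can be recovered by hand from the chi-squared statistic via Cramer's V = sqrt(X^2 / (N * (min(rows, columns) - 1))):

sqrt(chisq.test(tenses)$statistic / (sum(tenses) * (min(dim(tenses)) - 1))) # ~0.132, matching assocstats()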
# Association plot of tense distribution in corpora
#tiff(here("tenseforms_assoc_nar.tiff"), height = 12, width = 17, units="cm", compression = "lzw", res = 300)
vcd::assoc(tenses, shade = TRUE, varnames = FALSE) # This works
dev.off()
## null device 
##           1
chisq.test(tenses)$stdres
##          
##           Youth Fiction Sampled Textbook Fiction
##   past                 2.149698        -2.149698
##   perfect              2.770577        -2.770577
##   present             -5.372872         5.372872
##   modal                5.107899        -5.107899
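Standardised residuals beyond roughly ±2 flag cells that deviate significantly from the expected frequencies: present progressives are over-represented in Textbook Fiction, whereas past, perfect and, above all, modal progressives are over-represented in the Youth Fiction sample.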

Relationship between tense and time reference

# Significance testing for time ref only #

timeref <- table(FictionProg$Corpus, FictionProg$Time.reference); timeref # Raw figures
##                        
##                         past past/present present future general hypothetical
##   Youth Fiction Sampled  855           39     431     84      51           38
##   Textbook Fiction       785           34     579     58      30           19
##                        
##                         unclear
##   Youth Fiction Sampled      14
##   Textbook Fiction           12
timerefprop <- round(t((prop.table(timeref, margin = 2)*100)),2); timerefprop # As %
##               
##                Youth Fiction Sampled Textbook Fiction
##   past                         52.13            47.87
##   past/present                 53.42            46.58
##   present                      42.67            57.33
##   future                       59.15            40.85
##   general                      62.96            37.04
##   hypothetical                 66.67            33.33
##   unclear                      53.85            46.15
chisq.test(timeref)
## 
##  Pearson's Chi-squared test
## 
## data:  timeref
## X-squared = 41.701, df = 6, p-value = 2.106e-07
vcd::assocstats(timeref) # Cramer's V = 0.117
##                     X^2 df   P(> X^2)
## Likelihood Ratio 41.993  6 1.8443e-07
## Pearson          41.701  6 2.1060e-07
## 
## Phi-Coefficient   : NA 
## Contingency Coeff.: 0.117 
## Cramer's V        : 0.117
vcd::assoc(timeref, shade = TRUE)

# Significance test for textbook levels C-E only against the Youth Fiction sample

prog2 <- FictionProg[FictionProg$Level!="A" & FictionProg$Level!="B",]
prog2$Level <- droplevels(prog2)$Level # Drop levels A and B
prog2 <- prog2[prog2$Time.reference!="unclear",] # Remove unclear cases
prog2$Time.reference <- droplevels(prog2)$Time.reference # Drop unclear level
timeref2 <- table(prog2$Corpus, prog2$Time.reference); timeref2 # Raw figures
##                        
##                         past past/present present future general hypothetical
##   Youth Fiction Sampled  855           39     431     84      51           38
##   Textbook Fiction       554           33     323     39      23           17
chisq.test(timeref2) # Significant at p < 0.05
## 
##  Pearson's Chi-squared test
## 
## data:  timeref2
## X-squared = 11.662, df = 5, p-value = 0.03973
vcd::assoc(timeref2, shade = TRUE)

# Plot relationship between tense and time reference

FictionProg1 <- FictionProg[FictionProg$Time.reference != "unclear", ]
FictionProg1 <- FictionProg1[!is.na(FictionProg1$Time.reference), ] # Remove NA's
FictionProg1$Time.reference <- droplevels(FictionProg1)$Time.reference # Drop unused level "unclear"
#prog1$Time.reference = forcats::fct_collapse(prog1$Time.reference, future = c("present/future","future")) # function to collapse levels

#RColorBrewer::display.brewer.all(colorblindFriendly = T)
# Reuse the first 4 colours of the OrRd palette
cols<-RColorBrewer::brewer.pal(n=4,name="OrRd")

#tiff(here("mosaic_tenses_timeref_nar.tiff"), height = 15, width = 30, units="cm", compression = "lzw", res = 300)
vcd::mosaic(Tense ~ Corpus + Time.reference, FictionProg1, gp = grid::gpar(fill = cols), rot_labels = c(45, 45, 0, 90), cex=2.5, zero_size = 0, labeling_args = list(rep = TRUE), xlab = "Time reference") # Best so far
grid::grid.text("                                    past                                                     past/present                        present                                          future general hypo.", x=0.1, y=.1, vjust=1.5, hjust=0, gp=grid::gpar(fontface=1)) # Doesn't really work

dev.off()
## null device 
##           1

Questions

round(prop.table(table(FictionProg$Corpus, FictionProg$Question), margin = 1), 4)*100
##                        
##                             q statement unclear
##   Youth Fiction Sampled  8.44     91.56    0.00
##   Textbook Fiction       7.25     92.68    0.07
#tiff(here("prog_questions.tiff"), height = 17, width = 20, units="cm", compression = "lzw", res = 300)
par(cex = 1.4, cex.main = 0.85)
plot(plyr::revalue(FictionProg$Question, c("q"="question")) ~ FictionProg$Corpus, ylab = "", xlab = "", main = "", col = cols[4:2])

dev.off()
## null device 
##           1
questions <- table(FictionProg$Corpus, FictionProg$Question) ; questions
##                        
##                            q statement unclear
##   Youth Fiction Sampled  128      1389       0
##   Textbook Fiction       110      1406       1
questions <- questions[,1:2] ; questions # Drop the single unclear case
##                        
##                            q statement
##   Youth Fiction Sampled  128      1389
##   Textbook Fiction       110      1406
test.questions <- chisq.test(questions)
test.questions # Not significant
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  questions
## X-squared = 1.3055, df = 1, p-value = 0.2532
assocplot(t(questions))
vcd::assoc(questions, shade = TRUE, col_vars = c("question", "statement"))
vcd::assocstats(questions) # No effect!
##                     X^2 df P(> X^2)
## Likelihood Ratio 1.4657  1  0.22602
## Pearson          1.4644  1  0.22623
## 
## Phi-Coefficient   : 0.022 
## Contingency Coeff.: 0.022 
## Cramer's V        : 0.022

Negation

FictionProg$Corpus <- relevel(FictionProg$Corpus, "Textbook Fiction")
levels(FictionProg$Corpus) <- c("Textbook Narrative", "Youth Fiction sample")

negation1 <- round(prop.table(table(FictionProg$Corpus, FictionProg$Negation), margin = 1), 4)*100; negation1
##                       
##                        negated positive
##   Textbook Narrative      3.10    96.90
##   Youth Fiction sample    5.34    94.66
#clipr::write_clip(negation1)

negation <- table(FictionProg$Corpus, FictionProg$Negation)
test.negation <- chisq.test(negation); test.negation # Significant correlation at p < 0.01
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  negation
## X-squared = 8.8826, df = 1, p-value = 0.002879
assocplot(t(negation))

vcd::assocstats(negation) # Phi-coefficient is only 0.056 = very weak!
##                     X^2 df  P(> X^2)
## Likelihood Ratio 9.5384  1 0.0020122
## Pearson          9.4290  1 0.0021358
## 
## Phi-Coefficient   : 0.056 
## Contingency Coeff.: 0.056 
## Cramer's V        : 0.056
#tiff(here("prog_negation.tiff"), height = 17, width = 20, units="cm", compression = "lzw", res = 300)
par(cex = 1.4, cex.main = 0.85)
plot(plyr::revalue(FictionProg$Negation, c("positive"="not negated")) ~ FictionProg$Corpus, ylab = "", xlab = "", main = "", col = cols[3:4])

dev.off()
## null device 
##           1
# No interesting correlation to be seen
#tiff(here("prog_negation_question.tiff"), height = 13, width = 20, units="cm", compression = "lzw", res = 300)
par(cex = 1.4, cex.main = 0.85)
vcd::mosaic(Negation ~ Corpus + Question, FictionProg, gp = grid::gpar(fill = cols[4:3]), rot_labels = c(0, 90, 0, 90), cex=2.5, zero_size = 0, labeling_args = list(rep = TRUE)) # Best so far
dev.off()
## null device 
##           1

Contraction

round(prop.table(table(FictionProg$Corpus, FictionProg$Contraction), margin = 1), 4)*100
##                       
##                        contract.  full
##   Textbook Narrative       20.70 79.30
##   Youth Fiction sample     21.23 78.77
contractions <- table(FictionProg$Corpus, FictionProg$Contraction)
test.contractions <- chisq.test(contractions)
test.contractions # Not significant
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  contractions
## X-squared = 0.097478, df = 1, p-value = 0.7549
#tiff(here("prog_contractions_tenses_nar.tiff"), height = 13, width = 20, units="cm", compression = "lzw", res = 300)
par(cex = 1.4, cex.main = 0.85)
vcd::mosaic(Contraction ~ Corpus + Tense, FictionProg, gp = grid::gpar(fill = cols[4:3]), rot_labels = c(0, 90, 0, 90), cex=2.5, zero_size = 0, labeling_args = list(rep = TRUE)) # Best so far

dev.off()
## null device 
##           1
FictionProg$Textbook <- tidyr::replace_na(as.character(FictionProg$Textbook), "Youth Fiction")

t(round(prop.table(table(FictionProg$Contraction, FictionProg$Textbook), 2),4)*100)
##                  
##                   contract.  full
##   Access              26.76 73.24
##   Achievers           15.26 84.74
##   English in Mind     18.37 81.63
##   Green Line          18.42 81.58
##   Hi there            32.73 67.27
##   Join the Team       13.33 86.67
##   New Green Line      23.54 76.46
##   Piece of cake       40.00 60.00
##   Solutions            3.85 96.15
##   Youth Fiction       21.23 78.77
#tiff(here("prog_contractions_series_nar.tiff"), height = 13, width = 20, units="cm", compression = "lzw", res = 300)
par(cex = 1.4, cex.main = 0.85)
vcd::mosaic(Contraction ~ Textbook, FictionProg, gp = grid::gpar(fill = cols[4:3]), rot_labels = c(0, 90, 0, 90), cex=2.5, zero_size = 0, labeling_args = list(rep = TRUE)) # Best so far
dev.off()
## null device 
##           1

Voice

round(prop.table(table(FictionProg$Corpus, FictionProg$Voice), margin = 1), 4)*100
##                       
##                            A     P
##   Textbook Narrative   99.41  0.59
##   Youth Fiction sample 98.22  1.78
voice <- table(FictionProg$Corpus, FictionProg$Voice)
test.voice <- chisq.test(voice)
test.voice # Significant at p < 0.01, but note the very small number of passives and the negligible effect size below
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  voice
## X-squared = 8.1242, df = 1, p-value = 0.004368
vcd::assocstats(voice)
##                     X^2 df  P(> X^2)
## Likelihood Ratio 9.5265  1 0.0020252
## Pearson          9.1081  1 0.0025448
## 
## Phi-Coefficient   : 0.055 
## Contingency Coeff.: 0.055 
## Cramer's V        : 0.055

Repeatedness

repeatedness <- table(FictionProg$Corpus, FictionProg$Repeated); repeatedness
##                       
##                          no unclear  yes
##   Textbook Narrative   1359      25  133
##   Youth Fiction sample 1312      54  151
repeatedness <- repeatedness[,c(1,3)]; repeatedness
##                       
##                          no  yes
##   Textbook Narrative   1359  133
##   Youth Fiction sample 1312  151
round(prop.table(repeatedness, 1), 2)
##                       
##                          no  yes
##   Textbook Narrative   0.91 0.09
##   Youth Fiction sample 0.90 0.10
test.repeat <- chisq.test(repeatedness)
test.repeat # No significant difference
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  repeatedness
## X-squared = 1.5254, df = 1, p-value = 0.2168
vcd::assocstats(repeatedness)
##                     X^2 df P(> X^2)
## Likelihood Ratio 1.6841  1  0.19438
## Pearson          1.6834  1  0.19447
## 
## Phi-Coefficient   : 0.024 
## Contingency Coeff.: 0.024 
## Cramer's V        : 0.024

Continuousness

continuous <- table(FictionProg$Corpus, FictionProg$Continuous); continuous
##                       
##                          no unclear  yes
##   Textbook Narrative    139      16 1362
##   Youth Fiction sample  234      12 1271
continuous <- continuous[,c(1,3)]; continuous
##                       
##                          no  yes
##   Textbook Narrative    139 1362
##   Youth Fiction sample  234 1271
round(prop.table(continuous, 1), 4)*100
##                       
##                           no   yes
##   Textbook Narrative    9.26 90.74
##   Youth Fiction sample 15.55 84.45
test.cont <- chisq.test(continuous); test.cont # Significant at p < 0.001
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  continuous
## X-squared = 26.76, df = 1, p-value = 2.303e-07
vcd::assocstats(continuous) # Cramer's V is 0.095 = very weak!
##                     X^2 df   P(> X^2)
## Likelihood Ratio 27.605  1 1.4882e-07
## Pearson          27.336  1 1.7104e-07
## 
## Phi-Coefficient   : 0.095 
## Contingency Coeff.: 0.095 
## Cramer's V        : 0.095
#tiff(here("Prog_Continuous_Fiction.tiff"), height = 18, width = 20, units="cm", compression = "lzw", res = 300)
par(cex = 1.5, cex.main = 1)
plot(continuous, shade = TRUE, main = "Progressive function: Continuousness")

dev.off()
## null device 
##           1
## Which lemmas occur most frequently in each corpus with the function "not continuous"? ##
# Note that the Corpus factor levels were renamed above, so the new labels must be used here
noncont_lemmas_TxB <- FictionProg[FictionProg$Continuous=="no" & FictionProg$Corpus=="Textbook Narrative", "Lemma"]
table_noncont_lemmas_TxB <- sort(table(droplevels(noncont_lemmas_TxB)), decreasing = TRUE)
head(table_noncont_lemmas_TxB, 10)
noncont_lemmas_YF <- FictionProg[FictionProg$Continuous=="no" & FictionProg$Corpus=="Youth Fiction sample", "Lemma"]
table_noncont_lemmas_YF <- sort(table(droplevels(noncont_lemmas_YF)), decreasing = TRUE)
head(table_noncont_lemmas_YF, 10)

Additional functions

functions <- table(FictionProg$Corpus, FictionProg$Extra.function); functions
##                       
##                        emphasis/shock framing gradual change
##   Textbook Narrative               12     144             51
##   Youth Fiction sample             20      79             82
##                       
##                        politeness/softening reported speech/thought
##   Textbook Narrative                      2                      25
##   Youth Fiction sample                    2                      31
round((functions/1517*100),2) # As a percentage of the 1,517 progressives in each corpus
##                       
##                        emphasis/shock framing gradual change
##   Textbook Narrative             0.79    9.49           3.36
##   Youth Fiction sample           1.32    5.21           5.41
##                       
##                        politeness/softening reported speech/thought
##   Textbook Narrative                   0.13                    1.65
##   Youth Fiction sample                 0.13                    2.04
framing <- table(FictionProg$Corpus, FictionProg$Extra.function=="framing"); framing
##                       
##                        FALSE TRUE
##   Textbook Narrative      90  144
##   Youth Fiction sample   135   79
chisq.test(framing) # Significant at p < 0.001
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  framing
## X-squared = 26.131, df = 1, p-value = 3.19e-07
vcd::assocstats(framing) # phi-coefficient = 0.246
##                     X^2 df   P(> X^2)
## Likelihood Ratio 27.392  1 1.6613e-07
## Pearson          27.107  1 1.9246e-07
## 
## Phi-Coefficient   : 0.246 
## Contingency Coeff.: 0.239 
## Cramer's V        : 0.246
emphasis <- table(FictionProg$Corpus, FictionProg$Extra.function=="emphasis/shock"); emphasis
##                       
##                        FALSE TRUE
##   Textbook Narrative     222   12
##   Youth Fiction sample   194   20
chisq.test(emphasis) # Not significant, p = 0.12
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  emphasis
## X-squared = 2.3956, df = 1, p-value = 0.1217
change <- table(FictionProg$Corpus,FictionProg$Extra.function=="gradual change"); change
##                       
##                        FALSE TRUE
##   Textbook Narrative     183   51
##   Youth Fiction sample   132   82
chisq.test(change) # Significant at p < 0.001
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  change
## X-squared = 13.838, df = 1, p-value = 0.0001992
vcd::assocstats(change) # Phi-Coefficient = 0.18
##                     X^2 df   P(> X^2)
## Likelihood Ratio 14.693  1 0.00012652
## Pearson          14.619  1 0.00013158
## 
## Phi-Coefficient   : 0.181 
## Contingency Coeff.: 0.178 
## Cramer's V        : 0.181

Packages used in this script

#packages.bib <- sapply(1:length(loadedNamespaces()), function(i) toBibtex(citation(loadedNamespaces()[i])))

knitr::write_bib(c(.packages(), "knitr"), "packages.bib")

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
## [1] vcd_1.4-8          tidyr_1.1.4        RColorBrewer_1.1-2 lsr_0.5           
## [5] lattice_0.20-41    gridExtra_2.3      ggplot2_3.3.5      here_1.0.1        
## [9] dplyr_1.0.7       
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.7       plyr_1.8.6       highr_0.9        pillar_1.6.4    
##  [5] bslib_0.3.1      compiler_4.0.3   jquerylib_0.1.4  tools_4.0.3     
##  [9] digest_0.6.29    jsonlite_1.7.2   evaluate_0.14    lifecycle_1.0.1 
## [13] tibble_3.1.6     gtable_0.3.0     pkgconfig_2.0.3  rlang_0.4.12    
## [17] rstudioapi_0.13  cli_3.1.0        DBI_1.1.1        yaml_2.2.1      
## [21] xfun_0.29        fastmap_1.1.0    withr_2.4.3      stringr_1.4.0   
## [25] knitr_1.37       generics_0.1.1   vctrs_0.3.8      sass_0.4.0      
## [29] lmtest_0.9-38    rprojroot_2.0.2  tidyselect_1.1.1 glue_1.6.0      
## [33] R6_2.5.1         fansi_0.5.0      rmarkdown_2.11   purrr_0.3.4     
## [37] magrittr_2.0.1   MASS_7.3-53.1    scales_1.1.1     ellipsis_0.3.2  
## [41] htmltools_0.5.2  assertthat_0.2.1 colorspace_2.0-2 utf8_1.2.2      
## [45] stringi_1.7.6    munsell_0.5.0    crayon_1.4.2     zoo_1.8-9
Auguie, Baptiste. 2017. gridExtra: Miscellaneous Functions for "Grid" Graphics. https://CRAN.R-project.org/package=gridExtra.
Meyer, David, Achim Zeileis, and Kurt Hornik. 2006. “The Strucplot Framework: Visualizing Multi-Way Contingency Tables with Vcd.” Journal of Statistical Software 17 (3): 1–48. https://www.jstatsoft.org/v17/i03/.
———. 2020. Vcd: Visualizing Categorical Data. https://CRAN.R-project.org/package=vcd.
Müller, Kirill. 2020. Here: A Simpler Way to Find Your Files. https://CRAN.R-project.org/package=here.
Navarro, Daniel. 2015. Lsr: Companion to "Learning Statistics with R". http://health.adelaide.edu.au/psychology/ccs/teaching/lsr/.
Neuwirth, Erich. 2014. RColorBrewer: ColorBrewer Palettes. https://CRAN.R-project.org/package=RColorBrewer.
R Core Team. 2020. R: A Language and Environment for Statistical Computing. Vienna, Austria: R Foundation for Statistical Computing. https://www.R-project.org/.
Sarkar, Deepayan. 2008. Lattice: Multivariate Data Visualization with R. New York: Springer. http://lmdvr.r-forge.r-project.org.
———. 2020. Lattice: Trellis Graphics for R. http://lattice.r-forge.r-project.org/.
Wickham, Hadley. 2016. Ggplot2: Elegant Graphics for Data Analysis. Springer-Verlag New York. https://ggplot2.tidyverse.org.
———. 2021. Tidyr: Tidy Messy Data. https://CRAN.R-project.org/package=tidyr.
Wickham, Hadley, Winston Chang, Lionel Henry, Thomas Lin Pedersen, Kohske Takahashi, Claus Wilke, Kara Woo, Hiroaki Yutani, and Dewey Dunnington. 2021. Ggplot2: Create Elegant Data Visualisations Using the Grammar of Graphics. https://CRAN.R-project.org/package=ggplot2.
Wickham, Hadley, Romain François, Lionel Henry, and Kirill Müller. 2021. Dplyr: A Grammar of Data Manipulation. https://CRAN.R-project.org/package=dplyr.
Xie, Yihui. 2014. “Knitr: A Comprehensive Tool for Reproducible Research in R.” In Implementing Reproducible Computational Research, edited by Victoria Stodden, Friedrich Leisch, and Roger D. Peng. Chapman & Hall/CRC. http://www.crcpress.com/product/isbn/9781466561595.
———. 2015. Dynamic Documents with R and Knitr. 2nd ed. Boca Raton, Florida: Chapman & Hall/CRC. https://yihui.org/knitr/.
———. 2021. Knitr: A General-Purpose Package for Dynamic Report Generation in R. https://yihui.org/knitr/.
Zeileis, Achim, David Meyer, and Kurt Hornik. 2007. “Residual-Based Shadings for Visualizing (Conditional) Independence.” Journal of Computational and Graphical Statistics 16 (3): 507–25.