This script is part of the Online Appendix to my PhD thesis.
Please cite as: Le Foll, Elen. 2022. Textbook English: A Corpus-Based Analysis of the Language of EFL textbooks used in Secondary Schools in France, Germany and Spain. PhD thesis. Osnabrück University.
For more information, see: https://elenlefoll.github.io/TextbookEnglish/
Please note that the plot dimensions in this notebook have been optimised for the print version of the thesis.
Built with R 4.0.3
knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
library(here)
library(ggplot2)
library(gridExtra)
library(lattice)
library(lsr)
library(RColorBrewer)
library(tidyr)
library(vcd)
annotation_file <- read.csv(here("YouthFiction_prog_conc_anno.csv"), sep = "\t", header = TRUE, na.strings="", stringsAsFactors = TRUE)
glimpse(annotation_file)
## Rows: 2,364
## Columns: 16
## $ Book <fct> book185, book183, book109, book277, book165, book73, bo…
## $ Concordance1 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Concor.dance2 <fct> "was sitting", "was going", "was trying", "be swimming …
## $ Concordance3 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Lemma <fct> sit, go, try, swim, cry out, talk about, think, scorch,…
## $ X. <fct> NA, NA, NA, NA, NA, c, NA, NA, NA, NA, NA, c, NA, NA, N…
## $ NOT <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Tense <fct> past, GOING TO, past, NOT progressive, past, present, p…
## $ Voice <fct> A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A…
## $ X..1 <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, q, NA, …
## $ Time.reference <fct> past, past, past, modal/infinitive, past, present, past…
## $ Continuous <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, …
## $ Repeated <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no,…
## $ Extra.function <fct> NA, NA, framing, NA, NA, NA, NA, NA, NA, NA, NA, NA, em…
## $ Comment <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Annotator <fct> T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T…
# Select only annotated concordance lines
#annotated <- annotation_file
annotated <- annotation_file[1:2050,] # Narrows it down to 1517 concordances with prog. Note that as of 11.08.2019 only the first 2053 have been manually annotated anyway.
# Extract only progressives
YF_prog <- annotated[annotated$Tense != "NOT progressive" & annotated$Tense != "GOING TO" & annotated$Tense !="catenative", ]
#YF_prog$Annotator <- as.character(YF_prog$Annotator) # Only necessary if Tatjana did all the annotating in a particular file because otherwise T is recognised as TRUE and the vector becomes a logical vector
YF_prog$Tense <- droplevels(YF_prog)$Tense # Drop unused levels
YF_prog$Time.reference <- droplevels(YF_prog)$Time.reference # Drop unused levels
YF_prog$Lemma <- droplevels(YF_prog)$Lemma # Drop unused levels
YF_prog$Tense <- factor(YF_prog$Tense, levels = c("past", "perfect", "present", "modal")) # Re-order levels
YF_prog$Time.reference <- factor(YF_prog$Time.reference, levels = c("past", "past/present", "present", "future", "general", "hypothetical", "unclear"))
nrow(YF_prog) # Number of progressives in Youth Fiction sampled
## [1] 1517
nrow(YF_prog)/nrow(annotated) # Ratio of genuine progressives in progressive CQL query
## [1] 0.74
# Before saving annotation file as .csv, make sure to have deleted all trailing inverted commas as they serve as string deliminators. This can easily be done with a regex search and replace in calc $" --> nothing.
annotation_file_TxB <- read.csv(here("TxB_narrative_prog_conc_anno.csv"), sep = "\t", header = TRUE, na.strings="", stringsAsFactors = TRUE)
glimpse(annotation_file_TxB)
## Rows: 1,855
## Columns: 18
## $ Register <fct> narrative, narrative, narrative, narrative, narrative, …
## $ Level <fct> A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A…
## $ Textbook <fct> Hi there, Hi there, Hi there, Join the Team, Join the T…
## $ Concordance1 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Concor.dance2 <fct> "'re going", "'re going", "'re going", "are discussing"…
## $ Concordance3 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Lemma <fct> go, go, go, discuss, go, sit, read, run, go, wear, go, …
## $ X. <fct> c, c, c, NA, NA, NA, NA, NA, c, c, NA, NA, c, c, c, c, …
## $ NOT <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Tense <fct> GOING TO, GOING TO, GOING TO, present, GOING TO, presen…
## $ Voice <fct> A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A…
## $ X..1 <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Time.reference <fct> present, present, present, present, present, present, p…
## $ Continuous <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, …
## $ Repeated <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no,…
## $ Extra.function <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, gradual…
## $ Comment <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Annotator <fct> E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E…
# Select only annotated concordance lines
annotated_TxB <- annotation_file_TxB
# Extract only progressives
TxBNarProg <- filter(annotated_TxB, Tense != "NOT progressive" & Tense != "GOING TO" & Tense !="catenative")
TxBNarProg$Tense <- droplevels(TxBNarProg)$Tense # Drop unused levels
TxBNarProg$Time.reference <- droplevels(TxBNarProg)$Time.reference # Drop unused levels
TxBNarProg$Lemma <- droplevels(TxBNarProg)$Lemma
TxBNarProg$Tense <- factor(TxBNarProg$Tense, levels = c("past", "perfect", "present", "modal")) # Re-order levels
TxBNarProg$Time.reference <- factor(TxBNarProg$Time.reference, levels = c("past", "past/present", "present", "future", "general", "hypothetical", "unclear"))
nrow(TxBNarProg) # Number of progressives in Textbook Fiction
## [1] 1517
nrow(TxBNarProg)/nrow(annotation_file_TxB) # Ratio of genuine progressives in progressive query
## [1] 0.8177898
FictionProg <- bind_rows("Youth Fiction Sampled" = YF_prog, "Textbook Fiction" = TxBNarProg, .id = "Corpus")
glimpse(FictionProg)
## Rows: 3,034
## Columns: 20
## $ Corpus <chr> "Youth Fiction Sampled", "Youth Fiction Sampled", "Yout…
## $ Book <fct> book185, book109, book165, book73, book109, book55, boo…
## $ Concordance1 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Concor.dance2 <fct> "was sitting", "was trying", "were crying", "'re talkin…
## $ Concordance3 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Lemma <fct> sit, try, cry out, talk about, think, say, go, wait, be…
## $ X. <fct> NA, NA, NA, c, NA, NA, NA, NA, c, NA, NA, NA, NA, NA, N…
## $ NOT <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, n, NA, NA, NA, NA, …
## $ Tense <fct> past, past, past, present, past, present, present, perf…
## $ Voice <fct> A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A…
## $ X..1 <fct> NA, NA, NA, NA, NA, q, NA, NA, q, NA, NA, NA, q, NA, NA…
## $ Time.reference <fct> past, past, past, present, past, present, future, hypot…
## $ Continuous <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, no, yes, y…
## $ Repeated <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no,…
## $ Extra.function <fct> NA, framing, NA, NA, NA, emphasis/shock, NA, NA, NA, NA…
## $ Comment <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "progr. seems t…
## $ Annotator <fct> T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T…
## $ Register <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Level <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Textbook <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
FictionProg <- FictionProg %>% dplyr::rename(Contraction = X., Question = X..1, Negation = NOT) # Rename variables
glimpse(FictionProg)
## Rows: 3,034
## Columns: 20
## $ Corpus <chr> "Youth Fiction Sampled", "Youth Fiction Sampled", "Yout…
## $ Book <fct> book185, book109, book165, book73, book109, book55, boo…
## $ Concordance1 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Concor.dance2 <fct> "was sitting", "was trying", "were crying", "'re talkin…
## $ Concordance3 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Lemma <fct> sit, try, cry out, talk about, think, say, go, wait, be…
## $ Contraction <fct> NA, NA, NA, c, NA, NA, NA, NA, c, NA, NA, NA, NA, NA, N…
## $ Negation <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, n, NA, NA, NA, NA, …
## $ Tense <fct> past, past, past, present, past, present, present, perf…
## $ Voice <fct> A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A…
## $ Question <fct> NA, NA, NA, NA, NA, q, NA, NA, q, NA, NA, NA, q, NA, NA…
## $ Time.reference <fct> past, past, past, present, past, present, future, hypot…
## $ Continuous <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, no, yes, y…
## $ Repeated <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no,…
## $ Extra.function <fct> NA, framing, NA, NA, NA, emphasis/shock, NA, NA, NA, NA…
## $ Comment <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "progr. seems t…
## $ Annotator <fct> T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T…
## $ Register <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Level <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Textbook <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
# Replace NA with meaningful levels
summary(as.factor(FictionProg$Negation))
## n NA's
## 128 2906
FictionProg$Negation <- tidyr::replace_na(as.character(FictionProg$Negation), "positive") # positive
summary(as.factor(FictionProg$Contraction))
## c NA's
## 636 2398
FictionProg$Contraction <- tidyr::replace_na(as.character(FictionProg$Contraction), "full") # full
summary(as.factor(FictionProg$Question))
## q unclear NA's
## 238 1 2795
FictionProg$Question <- tidyr::replace_na(as.character(FictionProg$Question), "statement") # statement
summary(as.factor(FictionProg$Level))
## A B C D E NA's
## 167 353 313 349 335 1517
FictionProg$Level <- tidyr::replace_na(as.character(FictionProg$Level), "Youth Fiction") # positive
summary(as.factor(FictionProg$Book))
## book16 book109 book75 book192 book98 book163 book168 book17 book19 book95
## 20 16 16 14 13 12 12 12 12 12
## book172 book182 book226 book5 book9 book10 book125 book204 book208 book230
## 11 11 11 11 11 10 10 10 10 10
## book251 book41 book126 book132 book137 book175 book179 book18 book183 book24
## 10 10 9 9 9 9 9 9 9 9
## book271 book284 book295 book298 book73 book91 book92 book100 book113 book127
## 9 9 9 9 9 9 9 8 8 8
## book136 book142 book151 book152 book166 book195 book209 book223 book244 book269
## 8 8 8 8 8 8 8 8 8 8
## book46 book49 book53 book55 book58 book65 book74 book76 book79 book84
## 8 8 8 8 8 8 8 8 8 8
## book96 book12 book122 book123 book169 book185 book188 book207 book22 book228
## 8 7 7 7 7 7 7 7 7 7
## book231 book239 book259 book263 book264 book27 book28 book37 book60 book68
## 7 7 7 7 7 7 7 7 7 7
## book81 book90 book1 book107 book114 book119 book120 book129 book133 book134
## 7 7 6 6 6 6 6 6 6 6
## book146 book180 book2 book206 book210 book215 book221 book227 (Other) NA's
## 6 6 6 6 6 6 6 6 683 1517
FictionProg$Book <- tidyr::replace_na(as.character(FictionProg$Book), "Textbook Narrative")
plot(as.factor(FictionProg$Book[2:620])) # Number of concordances per book from YF corpus
# Make the following variables into factors
factor_cols <- c("Corpus", "Negation", "Contraction", "Question", "Repeated", "Lemma", "Voice", "Level")
FictionProg[factor_cols] <- lapply(FictionProg[factor_cols], as.factor)
levels(FictionProg$Negation)[levels(FictionProg$Negation)=="n"] <- "negated"
levels(FictionProg$Contraction)[levels(FictionProg$Contraction)=="c"] <- "contract."
FictionProg$Corpus <- relevel(FictionProg$Corpus, "Youth Fiction Sampled") # This is so that the plots are in the same order as the Textbook Conversation vs. Spoken BNC ones. E.g. reference corpus on the left.
summary(FictionProg$Corpus) # Number of annotated progressive concordances in each subcorpus
## Youth Fiction Sampled Textbook Fiction
## 1517 1517
#saveRDS(FictionProg, file = here("FictionProg.rds")) # Last saved on 25.08.2019
#FictionProg <- readRDS(here("FictionProg.rds"))
# Textbook Fiction #
TF_words <- 219040 # Much better than tokens due to lack of punctuation in Spoken BNC
TF_verbs <- 42302 # These are all verb forms (e.g. have been drinking = 3 verb forms). Calculated using SE query
TF_verb_phrases <- 35148 # As calculated with Extract_verb_lemmas_spacy
TF_prog <- nrow(TxBNarProg) # 1517 # After manual annotation, thus excluding GOING TO + inf.
TF_ProgVerbPhrase_ratio <- TF_prog/(TF_verb_phrases - TF_prog - (nrow(filter(annotated_TxB, Tense=="GOING TO"))))*10000; TF_ProgVerbPhrase_ratio # 453.36 per 10,000 finite verb phrases
## [1] 453.3636
# Youth Fiction sampled
YFs_words <- 8328976
YFs_verbs <- 1792111 # From SE query
YFs_progCQL <- 71140 # From SE query
# Ratio of genuine progressives in progressive query for Youth Fiction sampled
YFsratio_prog_CQL <- nrow(YF_prog)/nrow(annotated)
YFstotal_prog <- YFsratio_prog_CQL*YFs_progCQL # Approximate total number of progressives across Youth Fiction sampled
YFs_verb_phrases <- 1328349 # From spacy script
YFs_ProgVerbPhrase_ratio <- YFstotal_prog/(YFs_verb_phrases - YFstotal_prog - (nrow(filter(annotation_file, Tense=="GOING TO"))))*10000; YFs_ProgVerbPhrase_ratio # 412.74 per 10,000 finite verb phrases
## [1] 412.7364
# Comparing proportions of verbs and progressives #
TF <- as.vector(c(TF_words, TF_verbs, TF_verb_phrases, TF_prog))
YFs <- as.vector(c(YFs_words, YFs_verbs, YFs_verb_phrases, YFstotal_prog))
#YFs_sample <- round(YFs*(TF_words/YFs_words),0) # based on total number of words (from SE)
YFs_sample <- round(YFs*(21530/727487),0) # based on total number of sentences (from SE)
prog_ratio_fiction <- rbind(TF, YFs_sample)
rownames(prog_ratio_fiction) <- c("Textbook Fiction", "Youth Fiction sample")
colnames(prog_ratio_fiction) <- c("Words", "Verb forms", "Finite verb phrases", "Progressives")
prog_ratio_fiction <- as.data.frame(prog_ratio_fiction); prog_ratio_fiction
## Words Verb forms Finite verb phrases Progressives
## Textbook Fiction 219040 42302 35148 1517
## Youth Fiction sample 246496 53038 39313 1558
prog_ratio_fiction$Non_prog <- prog_ratio_fiction$`Finite verb phrases` - prog_ratio_fiction$Progressives
prog_ratio_fiction$FProgRatio <- round(c(TF_ProgVerbPhrase_ratio, YFs_ProgVerbPhrase_ratio), 2)
colnames(prog_ratio_fiction) <- c("Words", "Verb forms", "Finite verb phrases", "Progressives", "Non-progressives","F-coefficients of progressives")
prog_ratio_fiction
## Words Verb forms Finite verb phrases Progressives
## Textbook Fiction 219040 42302 35148 1517
## Youth Fiction sample 246496 53038 39313 1558
## Non-progressives F-coefficients of progressives
## Textbook Fiction 33631 453.36
## Youth Fiction sample 37755 412.74
#clipr::write_clip(prog_ratio_fiction)
#saveRDS(prog_ratio_fiction, file = here("prog_ratio_fiction.rds"))
# Significance testing and approximation of effect sizes #
# http://rcompanion.org/handbook/H_03.html
prog_ratio_fiction1 <- prog_ratio_fiction[,5:4]; prog_ratio_fiction1
## Non-progressives Progressives
## Textbook Fiction 33631 1517
## Youth Fiction sample 37755 1558
chisq.test(prog_ratio_fiction1)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: prog_ratio_fiction1
## X-squared = 5.7508, df = 1, p-value = 0.01648
vcd::assocstats(t(prog_ratio_fiction1)) # Phi-Coefficient : 0.009
## X^2 df P(> X^2)
## Likelihood Ratio 5.8324 1 0.015734
## Pearson 5.8396 1 0.015670
##
## Phi-Coefficient : 0.009
## Contingency Coeff.: 0.009
## Cramer's V : 0.009
fisher.test(prog_ratio_fiction1) # Is this helpful?
##
## Fisher's Exact Test for Count Data
##
## data: prog_ratio_fiction1
## p-value = 0.01647
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.8505462 0.9840351
## sample estimates:
## odds ratio
## 0.9148458
verb_ratio_fiction <- prog_ratio_fiction[,1:2]; verb_ratio_fiction
## Words Verb forms
## Textbook Fiction 219040 42302
## Youth Fiction sample 246496 53038
chisq.test(verb_ratio_fiction) # Highly significant
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: verb_ratio_fiction
## X-squared = 228.56, df = 1, p-value < 2.2e-16
vcd::assocstats(t(verb_ratio_fiction))
## X^2 df P(> X^2)
## Likelihood Ratio 229.10 1 0
## Pearson 228.67 1 0
##
## Phi-Coefficient : 0.02
## Contingency Coeff.: 0.02
## Cramer's V : 0.02
prog_ratio_spoken <- readRDS(file = here("prog_ratio_spoken.rds"))
prog_ratio_fiction <- readRDS(file = here("prog_ratio_fiction.rds"))
prog_ratios <- rbind(prog_ratio_spoken, prog_ratio_fiction); prog_ratios
## Words Verb forms Finite verb phrases Progressives
## Textbook Conversation 420130 80106 64292 2423
## Spoken BNC 2014 Sample 420130 98761 69521 3724
## Textbook Fiction 219040 42302 35148 1517
## Youth Fiction sample 246496 53038 39313 1555
## Non-progressives F-coefficients of progressives
## Textbook Conversation 61869 395.48
## Spoken BNC 2014 Sample 65797 566.09
## Textbook Fiction 33631 453.36
## Youth Fiction sample 37758 411.98
prog_ratios <- prog_ratios[c(1,3), c(4:5)]; prog_ratios
## Progressives Non-progressives
## Textbook Conversation 2423 61869
## Textbook Fiction 1517 33631
chisq.test(prog_ratios)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: prog_ratios
## X-squared = 17.744, df = 1, p-value = 2.527e-05
dev.off()
## null device
## 1
vcd::assoc(prog_ratios)
vcd::assocstats(t(prog_ratios))
## X^2 df P(> X^2)
## Likelihood Ratio 17.673 1 2.6233e-05
## Pearson 17.888 1 2.3431e-05
##
## Phi-Coefficient : 0.013
## Contingency Coeff.: 0.013
## Cramer's V : 0.013
prop.test(t(prog_ratios))
##
## 2-sample test for equality of proportions with continuity correction
##
## data: t(prog_ratios)
## X-squared = 17.744, df = 1, p-value = 2.527e-05
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.04849354 -0.01724308
## sample estimates:
## prop 1 prop 2
## 0.6149746 0.6478429
# Mosaic plot of tense distribution across corpora
# Select the colors that will be used
#RColorBrewer::display.brewer.all(colorblindFriendly = T)
# We will select the first 4 colors in the Set1 palette
cols<-RColorBrewer::brewer.pal(n=4,name="OrRd")
#tiff(here("mosaic_tenseforms_nar.tiff"), height = 30, width = 35, units="cm", compression = "lzw", res = 300)
par(cex = 2, cex.main = 0.85, font = 2)
plot(FictionProg$Tense ~ FictionProg$Corpus, col = cols, xlab = "", ylab = "Progressive tense forms", main = "Distribution of progressive tense forms in Textbook Conversation and the Spoken BNC 2014")
dev.off()
## null device
## 1
# Differentiated look at textbook levels #
#tiff(here("mosaic_tenseforms_levels_nar.tiff"), height = 15, width = 25, units="cm", compression = "lzw", res = 300)
plot(FictionProg$Tense ~ FictionProg$Level, col = cols, xlab = " Textbook Levels", ylab = "Progressive tense forms")
dev.off()
## null device
## 1
# Barplot with % of occurrences
tenses <- table(FictionProg$Tense, FictionProg$Corpus)
class(tenses) # Produces a lovely simple table
## [1] "table"
tenses
##
## Youth Fiction Sampled Textbook Fiction
## past 851 792
## perfect 102 67
## present 492 635
## modal 72 23
tensesp <- t((prop.table(tenses, margin = 2)*100)) # Work out percentages and transpose table
tensesp <-round(tensesp, 2)
barplot(tensesp, beside = TRUE, ylab = "% of occurrences", xlab = "Forms of the progressive", ylim = c(0,100), col = c("darkblue", "darkred"))
legend("topright", fill = c("darkblue", "darkred"), c("Youth Reference", "Textbook Fiction"))
vcd::assocstats(tenses) # Measuring the effect size. Cramer's V is what we're interested in (Levshina p. 217)
## X^2 df P(> X^2)
## Likelihood Ratio 54.148 3 1.0434e-11
## Pearson 52.786 3 2.0369e-11
##
## Phi-Coefficient : NA
## Contingency Coeff.: 0.131
## Cramer's V : 0.132
# Association plot of tense distribution in corpora
#tiff(here("tenseforms_assoc_nar.tiff"), height = 12, width = 17, units="cm", compression = "lzw", res = 300)
vcd::assoc(tenses, shade = TRUE, varnames = FALSE) # This works
dev.off()
## null device
## 1
chisq.test(tenses)$stdres
##
## Youth Fiction Sampled Textbook Fiction
## past 2.149698 -2.149698
## perfect 2.770577 -2.770577
## present -5.372872 5.372872
## modal 5.107899 -5.107899
# Significance testing for time ref only #
timeref <- table(FictionProg$Corpus, FictionProg$Time.reference); timeref # Raw figures
##
## past past/present present future general hypothetical
## Youth Fiction Sampled 855 39 431 84 51 38
## Textbook Fiction 785 34 579 58 30 19
##
## unclear
## Youth Fiction Sampled 14
## Textbook Fiction 12
timerefprop <- round(t((prop.table(timeref, margin = 2)*100)),2); timerefprop # As %
##
## Youth Fiction Sampled Textbook Fiction
## past 52.13 47.87
## past/present 53.42 46.58
## present 42.67 57.33
## future 59.15 40.85
## general 62.96 37.04
## hypothetical 66.67 33.33
## unclear 53.85 46.15
chisq.test(timeref)
##
## Pearson's Chi-squared test
##
## data: timeref
## X-squared = 41.701, df = 6, p-value = 2.106e-07
vcd::assocstats(timeref) # Cramer's V = 0.117
## X^2 df P(> X^2)
## Likelihood Ratio 41.993 6 1.8443e-07
## Pearson 41.701 6 2.1060e-07
##
## Phi-Coefficient : NA
## Contingency Coeff.: 0.117
## Cramer's V : 0.117
vcd::assoc(timeref, shade = TRUE)
# Significance test for only textbook levels C-E against Textbook Conversation
prog2 <- FictionProg[FictionProg$Level!="A" & FictionProg$Level!="B",]
prog2$Level <- droplevels(prog2)$Level # Drop levels A and B
prog2 <- prog2[prog2$Time.reference!="unclear",] # Get rid of unclear's
prog2$Time.reference <- droplevels(prog2)$Time.reference # Drop unclear level
timeref2 <- table(prog2$Corpus, prog2$Time.reference); timeref2 # Raw figures
##
## past past/present present future general hypothetical
## Youth Fiction Sampled 855 39 431 84 51 38
## Textbook Fiction 554 33 323 39 23 17
chisq.test(timeref2) # p < 0.039
##
## Pearson's Chi-squared test
##
## data: timeref2
## X-squared = 11.662, df = 5, p-value = 0.03973
vcd::assoc(timeref2, shade = TRUE)
# Plot relationship between tense and time reference
FictionProg1 <- FictionProg[FictionProg$Time.reference != "unclear", ]
FictionProg1 <- FictionProg1[!is.na(FictionProg1$Time.reference), ] # Remove NA's
FictionProg1$Time.reference <- droplevels(FictionProg1)$Time.reference # Drop unused level "unclear"
#prog1$Time.reference = forcats::fct_collapse(prog1$Time.reference, future = c("present/future","future")) # function to collapse levels
#RColorBrewer::display.brewer.all(colorblindFriendly = T)
# We will select the first 4 colors in the Set1 palette
cols<-RColorBrewer::brewer.pal(n=4,name="OrRd")
#tiff(here("mosaic_tenses_timeref_nar.tiff"), height = 15, width = 30, units="cm", compression = "lzw", res = 300)
vcd::mosaic(Tense ~ Corpus + Time.reference, FictionProg1, gp = grid::gpar(fill = cols), rot_labels = c(45, 45, 0, 90), cex=2.5, zero_size = 0, labeling_args = list(rep = TRUE), xlab = "Time reference") # Best so far
grid::grid.text(" past past/present present future general hypo.", x=0.1, y=.1, vjust=1.5, hjust=0, gp=grid::gpar(fontface=1)) # Doesn't really work
dev.off()
## null device
## 1
round(prop.table(table(FictionProg$Corpus, FictionProg$Question), margin = 1), 4)*100
##
## q statement unclear
## Youth Fiction Sampled 8.44 91.56 0.00
## Textbook Fiction 7.25 92.68 0.07
#tiff(here("prog_questions.tiff"), height = 17, width = 20, units="cm", compression = "lzw", res = 300)
par(cex = 1.4, cex.main = 0.85)
plot(plyr::revalue(FictionProg$Question, c("q"="question")) ~ FictionProg$Corpus, ylab = "", xlab = "", main = "", col = cols[4:2])
dev.off()
## null device
## 1
questions <- table(FictionProg$Corpus, FictionProg$Question) ; questions
##
## q statement unclear
## Youth Fiction Sampled 128 1389 0
## Textbook Fiction 110 1406 1
questions <- questions[,1:2] ; questions # Get rid of unclear's
##
## q statement
## Youth Fiction Sampled 128 1389
## Textbook Fiction 110 1406
test.questions <- chisq.test(questions)
test.questions # Not significant
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: questions
## X-squared = 1.3055, df = 1, p-value = 0.2532
assocplot(t(questions))
vcd::assoc(questions, shade = TRUE, col_vars = c("question", "statement"))
vcd::assocstats(questions) # No effect!
## X^2 df P(> X^2)
## Likelihood Ratio 1.4657 1 0.22602
## Pearson 1.4644 1 0.22623
##
## Phi-Coefficient : 0.022
## Contingency Coeff.: 0.022
## Cramer's V : 0.022
FictionProg$Corpus <- relevel(FictionProg$Corpus, "Textbook Fiction")
levels(FictionProg$Corpus) <- c("Textbook Narrative", "Youth Fiction sample")
negation1 <- round(prop.table(table(FictionProg$Corpus, FictionProg$Negation), margin = 1), 4)*100; negation1
##
## negated positive
## Textbook Narrative 3.10 96.90
## Youth Fiction sample 5.34 94.66
#clipr::write_clip(negation1)
negation <- table(FictionProg$Corpus, FictionProg$Negation)
test.negation <- chisq.test(negation); test.negation # Significant correlation at p < 0.01
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: negation
## X-squared = 8.8826, df = 1, p-value = 0.002879
assocplot(t(negation))
vcd::assocstats(negation) # Phi-coefficient is only 0.056 = very weak!
## X^2 df P(> X^2)
## Likelihood Ratio 9.5384 1 0.0020122
## Pearson 9.4290 1 0.0021358
##
## Phi-Coefficient : 0.056
## Contingency Coeff.: 0.056
## Cramer's V : 0.056
#tiff(here("prog_negation.tiff"), height = 17, width = 20, units="cm", compression = "lzw", res = 300)
par(cex = 1.4, cex.main = 0.85)
plot(plyr::revalue(FictionProg$Negation, c("positive"="not negated")) ~ FictionProg$Corpus, ylab = "", xlab = "", main = "", col = cols[3:4])
dev.off()
## null device
## 1
# No interesting correlation to be seen
#tiff(here("prog_negation_question.tiff"), height = 13, width = 20, units="cm", compression = "lzw", res = 300)
par(cex = 1.4, cex.main = 0.85)
vcd::mosaic(Negation ~ Corpus + Question, FictionProg, gp = grid::gpar(fill = cols[4:3]), rot_labels = c(0, 90, 0, 90), cex=2.5, zero_size = 0, labeling_args = list(rep = TRUE)) # Best so far
dev.off()
## null device
## 1
round(prop.table(table(FictionProg$Corpus, FictionProg$Contraction), margin = 1), 4)*100
##
## contract. full
## Textbook Narrative 20.70 79.30
## Youth Fiction sample 21.23 78.77
contractions <- table(FictionProg$Corpus, FictionProg$Contraction)
test.contractions <- chisq.test(contractions)
test.contractions # Not significant
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: contractions
## X-squared = 0.097478, df = 1, p-value = 0.7549
#tiff(here("prog_contractions_tenses_nar.tiff"), height = 13, width = 20, units="cm", compression = "lzw", res = 300)
par(cex = 1.4, cex.main = 0.85)
vcd::mosaic(Contraction ~ Corpus + Tense, FictionProg, gp = grid::gpar(fill = cols[4:3]), rot_labels = c(0, 90, 0, 90), cex=2.5, zero_size = 0, labeling_args = list(rep = TRUE)) # Best so far
dev.off()
## null device
## 1
FictionProg$Textbook <- tidyr::replace_na(as.character(FictionProg$Textbook), "Youth Fiction")
t(round(prop.table(table(FictionProg$Contraction, FictionProg$Textbook), 2),4)*100)
##
## contract. full
## Access 26.76 73.24
## Achievers 15.26 84.74
## English in Mind 18.37 81.63
## Green Line 18.42 81.58
## Hi there 32.73 67.27
## Join the Team 13.33 86.67
## New Green Line 23.54 76.46
## Piece of cake 40.00 60.00
## Solutions 3.85 96.15
## Youth Fiction 21.23 78.77
#tiff(here("prog_contractions_series_nar.tiff"), height = 13, width = 20, units="cm", compression = "lzw", res = 300)
par(cex = 1.4, cex.main = 0.85)
vcd::mosaic(Contraction ~ Textbook, FictionProg, gp = grid::gpar(fill = cols[4:3]), rot_labels = c(0, 90, 0, 90), cex=2.5, zero_size = 0, labeling_args = list(rep = TRUE)) # Best so far
dev.off()
## null device
## 1
round(prop.table(table(FictionProg$Corpus, FictionProg$Voice), margin = 1), 4)*100
##
## A P
## Textbook Narrative 99.41 0.59
## Youth Fiction sample 98.22 1.78
voice <- table(FictionProg$Corpus, FictionProg$Voice)
test.voice <- chisq.test(voice)
test.voice # Not a significant correlation
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: voice
## X-squared = 8.1242, df = 1, p-value = 0.004368
vcd::assocstats(voice)
## X^2 df P(> X^2)
## Likelihood Ratio 9.5265 1 0.0020252
## Pearson 9.1081 1 0.0025448
##
## Phi-Coefficient : 0.055
## Contingency Coeff.: 0.055
## Cramer's V : 0.055
repeatedness <- table(FictionProg$Corpus, FictionProg$Repeated); repeatedness
##
## no unclear yes
## Textbook Narrative 1359 25 133
## Youth Fiction sample 1312 54 151
repeatedness <- repeatedness[,c(1,3)]; repeatedness
##
## no yes
## Textbook Narrative 1359 133
## Youth Fiction sample 1312 151
round(prop.table(repeatedness, 1), 2)
##
## no yes
## Textbook Narrative 0.91 0.09
## Youth Fiction sample 0.90 0.10
test.repeat <- chisq.test(repeatedness)
test.repeat # No significant difference
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: repeatedness
## X-squared = 1.5254, df = 1, p-value = 0.2168
vcd::assocstats(repeatedness)
## X^2 df P(> X^2)
## Likelihood Ratio 1.6841 1 0.19438
## Pearson 1.6834 1 0.19447
##
## Phi-Coefficient : 0.024
## Contingency Coeff.: 0.024
## Cramer's V : 0.024
continuous <- table(FictionProg$Corpus, FictionProg$Continuous); continuous
##
## no unclear yes
## Textbook Narrative 139 16 1362
## Youth Fiction sample 234 12 1271
continuous <- continuous[,c(1,3)]; continuous
##
## no yes
## Textbook Narrative 139 1362
## Youth Fiction sample 234 1271
round(prop.table(continuous, 1), 4)*100
##
## no yes
## Textbook Narrative 9.26 90.74
## Youth Fiction sample 15.55 84.45
test.cont <- chisq.test(continuous); test.cont # Significant at p>0.001
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: continuous
## X-squared = 26.76, df = 1, p-value = 2.303e-07
vcd::assocstats(continuous) # Cramer's V is 0.095 = very weak!
## X^2 df P(> X^2)
## Likelihood Ratio 27.605 1 1.4882e-07
## Pearson 27.336 1 1.7104e-07
##
## Phi-Coefficient : 0.095
## Contingency Coeff.: 0.095
## Cramer's V : 0.095
#tiff(here("Prog_Continuous_Fiction.tiff"), height = 18, width = 20, units="cm", compression = "lzw", res = 300)
par(cex = 1.5, cex.main = 1)
plot(continuous, shade = TRUE, main = "Progressive function: Continuousness")
dev.off()
## null device
## 1
## Which lemmas occur most frequently in either corpora with the function non-continuousness? ##
noncont_lemmas_TxB <- FictionProg[FictionProg$Continuous=="no" & FictionProg$Corpus=="Textbook Fiction", 6]
table_noncont_lemmas_TxB <- sort(table(noncont_lemmas_TxB), decreasing = T)
head(table_noncont_lemmas_TxB, 10)
## noncont_lemmas_TxB
## abandon accord ache act act out add address advance aim for amuse
## 0 0 0 0 0 0 0 0 0 0
noncont_lemmas_YF <- FictionProg[FictionProg$Continuous=="no" & FictionProg$Corpus=="Youth Fiction Sampled", 6]
table_noncont_lemmas_YF <- sort(table(noncont_lemmas_YF), decreasing = T)
head(table_noncont_lemmas_YF, 10)
## noncont_lemmas_YF
## abandon accord ache act act out add address advance aim for amuse
## 0 0 0 0 0 0 0 0 0 0
functions <- table(FictionProg$Corpus, FictionProg$Extra.function); functions
##
## emphasis/shock framing gradual change
## Textbook Narrative 12 144 51
## Youth Fiction sample 20 79 82
##
## politeness/softening reported speech/thought
## Textbook Narrative 2 25
## Youth Fiction sample 2 31
round((functions/1517*100),2) # Percentage of concordances in each corpus
##
## emphasis/shock framing gradual change
## Textbook Narrative 0.79 9.49 3.36
## Youth Fiction sample 1.32 5.21 5.41
##
## politeness/softening reported speech/thought
## Textbook Narrative 0.13 1.65
## Youth Fiction sample 0.13 2.04
framing <- table(FictionProg$Corpus, FictionProg$Extra.function=="framing"); framing
##
## FALSE TRUE
## Textbook Narrative 90 144
## Youth Fiction sample 135 79
chisq.test(framing) # Significant at p>0.001
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: framing
## X-squared = 26.131, df = 1, p-value = 3.19e-07
vcd::assocstats(framing) # phi-coefficient = 0.246
## X^2 df P(> X^2)
## Likelihood Ratio 27.392 1 1.6613e-07
## Pearson 27.107 1 1.9246e-07
##
## Phi-Coefficient : 0.246
## Contingency Coeff.: 0.239
## Cramer's V : 0.246
emphasis <- table(FictionProg$Corpus, FictionProg$Extra.function=="emphasis/shock"); emphasis
##
## FALSE TRUE
## Textbook Narrative 222 12
## Youth Fiction sample 194 20
chisq.test(emphasis) # Not significant, p = 0.12
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: emphasis
## X-squared = 2.3956, df = 1, p-value = 0.1217
change <- table(FictionProg$Corpus,FictionProg$Extra.function=="gradual change"); change
##
## FALSE TRUE
## Textbook Narrative 183 51
## Youth Fiction sample 132 82
chisq.test(change) # Significant at p < 0.001
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: change
## X-squared = 13.838, df = 1, p-value = 0.0001992
vcd::assocstats(change) # Phi-Coefficient = 0.18
## X^2 df P(> X^2)
## Likelihood Ratio 14.693 1 0.00012652
## Pearson 14.619 1 0.00013158
##
## Phi-Coefficient : 0.181
## Contingency Coeff.: 0.178
## Cramer's V : 0.181
#packages.bib <- sapply(1:length(loadedNamespaces()), function(i) toBibtex(citation(loadedNamespaces()[i])))
knitr::write_bib(c(.packages(), "knitr"), "packages.bib")
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] vcd_1.4-8 tidyr_1.1.4 RColorBrewer_1.1-2 lsr_0.5
## [5] lattice_0.20-41 gridExtra_2.3 ggplot2_3.3.5 here_1.0.1
## [9] dplyr_1.0.7
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.7 plyr_1.8.6 highr_0.9 pillar_1.6.4
## [5] bslib_0.3.1 compiler_4.0.3 jquerylib_0.1.4 tools_4.0.3
## [9] digest_0.6.29 jsonlite_1.7.2 evaluate_0.14 lifecycle_1.0.1
## [13] tibble_3.1.6 gtable_0.3.0 pkgconfig_2.0.3 rlang_0.4.12
## [17] rstudioapi_0.13 cli_3.1.0 DBI_1.1.1 yaml_2.2.1
## [21] xfun_0.29 fastmap_1.1.0 withr_2.4.3 stringr_1.4.0
## [25] knitr_1.37 generics_0.1.1 vctrs_0.3.8 sass_0.4.0
## [29] lmtest_0.9-38 rprojroot_2.0.2 tidyselect_1.1.1 glue_1.6.0
## [33] R6_2.5.1 fansi_0.5.0 rmarkdown_2.11 purrr_0.3.4
## [37] magrittr_2.0.1 MASS_7.3-53.1 scales_1.1.1 ellipsis_0.3.2
## [41] htmltools_0.5.2 assertthat_0.2.1 colorspace_2.0-2 utf8_1.2.2
## [45] stringi_1.7.6 munsell_0.5.0 crayon_1.4.2 zoo_1.8-9