This script is part of the Online Appendix to my PhD thesis.
Please cite as: Le Foll, Elen. 2022. Textbook English: A Corpus-Based Analysis of the Language of EFL textbooks used in Secondary Schools in France, Germany and Spain. PhD thesis. Osnabrück University.
For more information, see: https://elenlefoll.github.io/TextbookEnglish/
Please note that the plot dimensions in this notebook have been optimised for the print version of the thesis.
Built with R 4.0.3
knitr::opts_chunk$set(echo = TRUE, paged.print=TRUE, fig.width = 10, warning=FALSE)
#renv::restore() # Restore the project's dependencies from the lockfile to ensure that same package versions are used as in the original thesis.
library(car) # For data wrangling
library(caret) # For its confusion matrix function
#library(clipr) # For quick exports to other programme
library(cowplot)
library(DescTools) # For 95% CI
library(dplyr)
library(emmeans)
#library(FactoMineR) # For Shiny app
library(factoextra) # For circular graphs of variables
library(forcats) # For the fct_relevel function
library(here) # For dynamic file paths
library(ggplot2)
library(ggthemes) # For theme of factoextra plots
library(lme4) # For linear regression modelling
library(patchwork) # To create figures with more than one plot
#library(pca3d) # For 3-D plots (not rendered in html knit)
library(PCAtools) # For nice biplots of PCA results
library(purrr) # For data wrangling
library(psych) # For various useful stats function
library(sjPlot) # For model plots and tables
library(suffrager) # For pretty feminist colour palettes :)
library(visreg) # For plots of interaction effects
source(here("R_rainclouds.R")) # For geom_flat_violin rainplots
Import full data (see 7_Ref_data_prep.Rmd for data preparation steps)
data <- readRDS(here("FullMDA", "dataforPCA.rds"))
data %>%
filter(Series=="NGL") %>%
group_by(Series, Level) %>%
summarise(wordcount = sum(Words))
## `summarise()` has grouped output by 'Series'. You can override using the
## `.groups` argument.
## # A tibble: 5 × 3
## # Groups: Series [1]
## Series Level wordcount
## <fct> <fct> <int>
## 1 NGL A 18388
## 2 NGL B 37471
## 3 NGL C 30574
## 4 NGL D 36395
## 5 NGL E 61486
This chunk can be used to perform the MDA on various subsets of the data.
# Full dataset
data <- readRDS(here("FullMDA", "dataforPCA.rds"))
# Subset of the data that excludes levels A and B textbooks
data <- readRDS(here("FullMDA", "dataforPCA.rds")) %>%
filter(Level !="A" & Level != "B") %>%
droplevels()
summary(data$Level)
# Subset of the data to only include one Country subcorpus of the TEC
data <- readRDS(here("FullMDA", "dataforPCA.rds")) %>%
#filter(Country != "France" & Country != "Germany") %>% # Spain only
#filter(Country != "France" & Country != "Spain") %>% # Germany only
filter(Country != "Spain" & Country != "Germany") %>% # France only
droplevels()
summary(data$Country)
# Perform PCA on random subset of the data to test the stability of the solution. Re-running this line will generate a new subset of 2/3 of the texts randomly sampled.
set.seed(13) # Seed used for the biplot printed in Chapter 8
data <- readRDS(here("FullMDA", "dataforPCA.rds")) %>%
slice_sample(n = 4980*0.6, replace = FALSE)
nrow(data)
data$Filename[1:4]
This chunk is used to create the 3-D plots. These cannot be rendered in the html knit.
colnames(data)
pca <- prcomp(data[,9:ncol(data)], scale.=FALSE) # All quantitative variables
register <- factor(data[,"Register"])
corpus <- factor(data[,"Corpus"])
subcorpus <- factor(data[,"Subcorpus"])
summary(register)
summary(corpus)
summary(subcorpus)
summary(pca)
# 3-D plot
colours <- suf_palette(name = "london", n = 6, type = "continuous")
colours2 <- suf_palette(name = "classic", n = 5, type = "continuous")
colours <- c(colours, colours2[c(2:4)]) # Nine colours range
col6 <- colours[c(6,5,4,7,9,2)] # Good order for PCA
scales::show_col(col6)
col6 <- c("#F9B921", "#A18A33", "#722672", "#BD241E", "#267226", "#15274D")
names(col6) <- c("Textbook Conversation", "Textbook Fiction", "Textbook Informative", "Spoken BNC2014 Ref.", "Youth Fiction Ref.", "Info Teens Ref.")
shapes6 <- c(rep("cube", 3),rep("sphere", 3))
names(shapes6) <- c("Textbook Conversation", "Textbook Fiction", "Textbook Informative", "Spoken BNC2014 Ref.", "Youth Fiction Ref.", "Info Teens Ref.")
library(pca3d)
pca3d(pca, group = subcorpus,
components = 1:3,
#components = 4:6,
show.plane=FALSE,
col = col6,
shape = shapes6,
radius = 0.7,
legend = "right")
snapshotPCA3d(here("plots", "PCA_TxB_3Ref_3Dsnapshot.png"))
## Looking at all three Textbook English registers in one colour
col4 <- colours[c(1,3,7,9)]
col4 <- c("#EA7E1E", "#15274D", "#BD241E", "#267226")
names(col4) <- c("Textbook.English", "Informative.Teens", "Spoken.BNC2014", "Youth.Fiction")
shapes4 <- c("cube", rep("sphere", 3))
names(shapes4) <- c("Textbook.English", "Informative.Teens", "Spoken.BNC2014", "Youth.Fiction")
pca3d(pca, group = corpus,
show.plane=FALSE,
components = 1:3,
col = col4,
shape = shapes4,
radius = 0.7,
legend = "right")
data2 <- data %>%
mutate(Source = recode_factor(Corpus, Textbook.English = "Textbook English (TEC)", Informative.Teens = "Reference corpora", Spoken.BNC2014 = "Reference corpora", Youth.Fiction = "Reference corpora")) %>%
mutate(Corpus = fct_relevel(Subcorpus, "Info Teens Ref.", after = 9)) %>%
relocate(Source, .after = "Corpus") %>%
droplevels(.)
colnames(data2)
## [1] "Filename" "Register" "Level" "Series" "Country" "Corpus"
## [7] "Source" "Subcorpus" "Words" "ACT" "AMP" "ASPECT"
## [13] "AWL" "BEMA" "CAUSE" "CC" "COMM" "CONC"
## [19] "COND" "CONT" "CUZ" "DEMO" "DMA" "DOAUX"
## [25] "DT" "DWNT" "ELAB" "EMPH" "EX" "EXIST"
## [31] "FPP1P" "FPP1S" "FPUH" "GTO" "HDG" "HGOT"
## [37] "IN" "JJAT" "JJPR" "LD" "MDCA" "MDCO"
## [43] "MDMM" "MDNE" "MDWO" "MDWS" "MENTAL" "NCOMP"
## [49] "NN" "OCCUR" "PASS" "PEAS" "PIT" "PLACE"
## [55] "POLITE" "POS" "PROG" "QUAN" "QUPR" "QUTAG"
## [61] "RB" "RP" "SPLIT" "SPP2" "STPR" "THATD"
## [67] "THRC" "THSC" "TTR" "VBD" "VBG" "VBN"
## [73] "VIMP" "WHQU" "WHSC" "XX0" "YNQU" "TPP3"
## [79] "FQTI"
data2meta <- data2[,1:9]
rownames(data2meta) <- data2meta$Filename
data2meta <- data2meta %>% select(-Filename)
head(data2meta)
## Register Level Series Country
## Achievers_B1_plus_Informative_0007.txt Informative D Achievers Spain
## POC_5e_Spoken_0003.txt Conversation B POC France
## Access_4_Narrative_0013.txt Fiction D Access Germany
## NGL_1_Spoken_0002.txt Conversation A NGL Germany
## Access_1_Narrative_0005.txt Fiction A Access Germany
## NGL_2_Narrative_0007.txt Fiction B NGL Germany
## Corpus
## Achievers_B1_plus_Informative_0007.txt Textbook Informative
## POC_5e_Spoken_0003.txt Textbook Conversation
## Access_4_Narrative_0013.txt Textbook Fiction
## NGL_1_Spoken_0002.txt Textbook Conversation
## Access_1_Narrative_0005.txt Textbook Fiction
## NGL_2_Narrative_0007.txt Textbook Fiction
## Source
## Achievers_B1_plus_Informative_0007.txt Textbook English (TEC)
## POC_5e_Spoken_0003.txt Textbook English (TEC)
## Access_4_Narrative_0013.txt Textbook English (TEC)
## NGL_1_Spoken_0002.txt Textbook English (TEC)
## Access_1_Narrative_0005.txt Textbook English (TEC)
## NGL_2_Narrative_0007.txt Textbook English (TEC)
## Subcorpus Words
## Achievers_B1_plus_Informative_0007.txt Textbook Informative 690
## POC_5e_Spoken_0003.txt Textbook Conversation 694
## Access_4_Narrative_0013.txt Textbook Fiction 547
## NGL_1_Spoken_0002.txt Textbook Conversation 927
## Access_1_Narrative_0005.txt Textbook Fiction 840
## NGL_2_Narrative_0007.txt Textbook Fiction 1127
rownames(data2) <- data2$Filename
data2num <- as.data.frame(base::t(data2[,10:ncol(data2)]))
data2num[1:5,1:5] # Check data frame format is correct
## Achievers_B1_plus_Informative_0007.txt POC_5e_Spoken_0003.txt
## ACT 1.3305834 -0.8133966
## AMP -0.4504541 0.1376718
## ASPECT 1.0228557 -0.4515438
## AWL 0.7288132 -0.7268089
## BEMA -0.4801820 1.1272237
## Access_4_Narrative_0013.txt NGL_1_Spoken_0002.txt
## ACT -0.4707243 -1.014328
## AMP 1.4534008 -0.561931
## ASPECT 1.1039502 -0.749580
## AWL -0.5477092 -0.716272
## BEMA 0.2456594 1.700641
## Access_1_Narrative_0005.txt
## ACT 0.03320131
## AMP -0.52977306
## ASPECT -0.55344739
## AWL -0.56332789
## BEMA -0.39233698
p <- PCAtools::pca(data2num,
metadata = data2meta,
scale = FALSE)
p$variance[1:6]
## PC1 PC2 PC3 PC4 PC5 PC6
## 30.334148 7.512979 5.886828 3.415906 2.703581 2.415911
sum(p$variance[1:4])
## [1] 47.14986
# For five TEC registers
# colkey = c(`Spoken BNC2014 Ref.`="#BD241E", `Info Teens Ref.`="#15274D", `Youth Fiction Ref.`="#267226", `Textbook Fiction`="#A18A33", `Textbook Conversation`="#F9B921", `Textbook Informative` = "#722672", `Textbook Instructional` = "grey", `Textbook Personal` = "black")
# For three TEC registers
summary(data2$Corpus)
## Textbook Conversation Textbook Fiction Textbook Informative
## 565 285 352
## Spoken BNC2014 Ref. Youth Fiction Ref. Info Teens Ref.
## 1250 1191 1337
colkey = c(`Spoken BNC2014 Ref.`="#BD241E", `Info Teens Ref.`="#15274D", `Youth Fiction Ref.`="#267226", `Textbook Fiction`="#A18A33", `Textbook Conversation`="#F9B921", `Textbook Informative` = "#722672")
#summary(data2$Source)
#shapekey = c(`Textbook English (TEC)`=6, `Reference corpora`=1)
summary(data2$Level)
## A B C D E Ref.
## 152 231 306 304 209 3778
shapekey = c(A=1, B=2, C=6, D=0, E=5, `Ref.`=4)
## Very slow, open in zoomed out window!
# Add legend manually? Yes (take it from the biplot code below), let's not waste too much time, here. Or use Evert's mvar.pairs plot (though that annoyingly requires manual axis annotation so not really better!)
#png(here("plots", "PCA_3Ref_pairsplot.png"), width = 25, height= 40, units = "cm", res = 300)
PCAtools::pairsplot(p,
triangle = FALSE,
components = 1:4,
ncol = 2,
nrow = 3,
pointSize = 1,
shape = "Level",
shapekey = shapekey,
lab = NULL, # Otherwise will try to label each data point!
colby = "Corpus",
legendPosition = "none",
margingaps = unit(c(0.2, 0.2, 0.8, 0.2), "cm"),
colkey = colkey)
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.