This script is part of the Online Appendix to my PhD thesis.
Please cite as: Le Foll, Elen. 2022. Textbook English: A Corpus-Based Analysis of the Language of EFL textbooks used in Secondary Schools in France, Germany and Spain. PhD thesis. Osnabrück University.
For more information, see: https://elenlefoll.github.io/TextbookEnglish/
Please note that the plot dimensions in this notebook have been optimised for the print version of the thesis.
Built with R 4.0.3
TxBcounts <- readRDS(here("FullMDA", "TxBcounts3.rds"))
colnames(TxBcounts)
## [1] "Filename" "Country" "Series" "Level" "Register" "Words"
## [7] "ACT" "AMP" "ASPECT" "AWL" "BEMA" "CAUSE"
## [13] "CC" "COMM" "COND" "CONT" "CUZ" "DEMO"
## [19] "DMA" "DOAUX" "DT" "EMPH" "EX" "EXIST"
## [25] "FPP1P" "FPP1S" "FPUH" "FREQ" "HDG" "IN"
## [31] "JJAT" "JJPR" "LD" "MDCA" "MDCO" "MDNE"
## [37] "MDWO" "MDWS" "MENTAL" "NCOMP" "NN" "OCCUR"
## [43] "PASS" "PEAS" "PIT" "PLACE" "POLITE" "POS"
## [49] "PROG" "QUAN" "QUPR" "RB" "RP" "SPLIT"
## [55] "SPP2" "STPR" "THATD" "THRC" "THSC" "TIME"
## [61] "TPP3P" "TPP3S" "TTR" "VBD" "VBG" "VBN"
## [67] "VIMP" "VPRT" "WHQU" "WHSC" "XX0" "YNQU"
nrow(TxBcounts)
## [1] 1961
TxBzlogcounts <- readRDS(here("FullMDA", "TxBzlogcounts.rds"))
nrow(TxBzlogcounts)
## [1] 1961
colnames(TxBzlogcounts)
## [1] "ACT" "AMP" "ASPECT" "AWL" "BEMA" "CAUSE" "CC" "COMM"
## [9] "COND" "CONT" "CUZ" "DEMO" "DMA" "DOAUX" "DT" "EMPH"
## [17] "EX" "EXIST" "FPP1P" "FPP1S" "FPUH" "FREQ" "HDG" "IN"
## [25] "JJAT" "JJPR" "LD" "MDCA" "MDCO" "MDNE" "MDWO" "MDWS"
## [33] "MENTAL" "NCOMP" "NN" "OCCUR" "PASS" "PEAS" "PIT" "PLACE"
## [41] "POLITE" "POS" "PROG" "QUAN" "QUPR" "RB" "RP" "SPLIT"
## [49] "SPP2" "STPR" "THATD" "THRC" "THSC" "TIME" "TPP3P" "TPP3S"
## [57] "TTR" "VBD" "VBG" "VBN" "VIMP" "VPRT" "WHQU" "WHSC"
## [65] "XX0" "YNQU"
TxBdata <- cbind(TxBcounts[,1:6], as.data.frame(TxBzlogcounts))
str(TxBdata)
## 'data.frame': 1961 obs. of 72 variables:
## $ Filename: chr "Achievers_A1_Instructional_0012.txt" "Solutions_Pre-Intermediate_Instructional_0023.txt" "Achievers_A2_Personal_0003.txt" "Achievers_B1_plus_Informative_0007.txt" ...
## $ Country : Factor w/ 3 levels "France","Germany",..: 3 3 3 3 1 2 3 2 2 2 ...
## $ Series : Factor w/ 9 levels "Access","Achievers",..: 2 9 2 2 8 1 2 7 1 7 ...
## $ Level : Factor w/ 5 levels "A","B","C","D",..: 1 2 2 4 2 4 4 1 1 2 ...
## $ Register: Factor w/ 5 levels "Conversation",..: 4 4 5 3 1 2 4 1 2 2 ...
## $ Words : int 931 889 979 690 694 547 967 927 840 1127 ...
## $ ACT : num -0.231 0.933 0.597 1.262 -0.851 ...
## $ AMP : num -0.658 -0.401 -0.427 -0.313 0.174 ...
## $ ASPECT : num 0.202 1.02 -0.527 0.713 -0.473 ...
## $ AWL : num 0.733 0.575 -0.638 0.933 -0.928 ...
## $ BEMA : num -0.6836 -0.7093 -0.0337 -0.2838 0.9802 ...
## $ CAUSE : num -0.69023 -0.39356 0.2673 0.00833 -0.35629 ...
## $ CC : num -0.498 0.129 -0.125 1.536 -0.624 ...
## $ COMM : num 1.036 0.782 -0.551 0.239 -0.82 ...
## $ COND : num -0.1011 -0.6147 0.0464 -0.6147 0.9133 ...
## $ CONT : num -0.612 -0.753 0.768 -0.595 0.725 ...
## $ CUZ : num -0.5311 -0.5311 -0.0409 -0.5311 -0.5311 ...
## $ DEMO : num -0.371 -0.332 -0.669 -0.835 0.639 ...
## $ DMA : num -0.446 -0.529 -0.365 -0.529 1.106 ...
## $ DOAUX : num 0.534 0.288 1.312 -0.916 0.598 ...
## $ DT : num 0.0449 0.7521 -0.8778 -0.4304 0.6603 ...
## $ EMPH : num -0.811 -0.688 0.34 -0.456 -0.222 ...
## $ EX : num -0.467 -0.655 0.239 -0.655 0.491 ...
## $ EXIST : num -0.545 -0.139 -0.251 1.643 0.361 ...
## $ FPP1P : num -0.579 -0.579 1.197 1.125 0.971 ...
## $ FPP1S : num -0.6252 -0.616 0.9428 -0.6465 -0.0813 ...
## $ FPUH : num -0.47 -0.33 -0.344 -0.47 0.822 ...
## $ FREQ : num 0.1594 -0.0277 -0.6317 -0.3651 -0.5756 ...
## $ HDG : num 0.0544 -0.5244 -0.5244 0.2747 0.2704 ...
## $ IN : num 0.314 1.193 -0.552 0.49 -0.601 ...
## $ JJAT : num -0.786 -0.715 -0.833 1.145 -0.147 ...
## $ JJPR : num -0.651 -0.627 -0.111 -0.24 0.409 ...
## $ LD : num 0.997 -0.167 0.183 0.949 -0.732 ...
## $ MDCA : num -0.56379 -0.24224 -0.14574 0.00326 -0.41384 ...
## $ MDCO : num -0.541 -0.541 -0.541 -0.541 -0.541 ...
## $ MDNE : num -0.632 -0.632 0.991 0.983 1.099 ...
## $ MDWO : num -0.327 0.182 0.367 -0.532 -0.532 ...
## $ MDWS : num -0.1848 -0.5195 -0.0988 1.2966 -0.2656 ...
## $ MENTAL : num 0.304 -0.103 0.558 -0.148 -0.226 ...
## $ NCOMP : num 0.423 -0.647 1.553 1.194 -0.467 ...
## $ NN : num 0.384 0.553 -0.413 0.889 -0.868 ...
## $ OCCUR : num -0.488 0.319 -0.66 -0.66 -0.66 ...
## $ PASS : num -0.475 -0.257 -0.313 0.157 -0.412 ...
## $ PEAS : num -0.621 -0.621 -0.488 0.19 -0.621 ...
## $ PIT : num -0.812 -0.918 0.382 0.469 1.239 ...
## $ PLACE : num -0.588 -0.314 0.317 0.652 0.59 ...
## $ POLITE : num -0.415 -0.415 0.399 -0.415 -0.041 ...
## $ POS : num 0.17589 -0.83652 -0.83652 -0.49089 0.00188 ...
## $ PROG : num 0.442 -0.0613 0.7701 0.6251 1.3626 ...
## $ QUAN : num -0.7083 -0.0067 0.2901 -0.5901 -0.2334 ...
## $ QUPR : num -0.5186 -0.6999 -0.4765 0.6157 0.0386 ...
## $ RB : num -0.541 -0.974 0.32 -0.769 1.297 ...
## $ RP : num -0.792 0.544 -0.631 0.723 -0.792 ...
## $ SPLIT : num -0.637 -0.637 -0.389 0.71 0.179 ...
## $ SPP2 : num 0.144 0.442 0.44 0.586 0.192 ...
## $ STPR : num -0.223 0.975 0.82 -0.604 -0.604 ...
## $ THATD : num 1.018 -0.716 0.639 -0.716 -0.716 ...
## $ THRC : num -0.523 -0.523 -0.523 -0.523 -0.523 ...
## $ THSC : num -0.302 -0.113 -0.451 -0.128 -0.384 ...
## $ TIME : num -0.3472 -0.5245 0.0688 -0.5329 0.1273 ...
## $ TPP3P : num -0.259 -0.445 -0.662 0.537 -0.617 ...
## $ TPP3S : num -0.47 -0.28 -0.594 -0.631 -0.365 ...
## $ TTR : num -0.6587 -0.2769 -0.0125 1.2226 -0.3719 ...
## $ VBD : num -0.669 -0.468 -0.621 -0.176 -0.594 ...
## $ VBG : num -0.673 0.627 0.644 0.957 -0.741 ...
## $ VBN : num -0.608 0.329 -0.608 0.568 -0.608 ...
## $ VIMP : num 1.086 0.978 -0.432 -0.434 -0.409 ...
## $ VPRT : num -0.491 -0.586 0.893 0.295 0.934 ...
## $ WHQU : num 0.9144 0.604 0.4301 -0.7314 0.0563 ...
## $ WHSC : num -0.454 -0.322 -0.613 0.851 -0.381 ...
## $ XX0 : num -0.716 -0.51 0.21 -0.52 0.681 ...
## $ YNQU : num -0.0974 0.7753 0.0723 -0.7135 1.5438 ...
ncol(TxBdata)-6
## [1] 66
#saveRDS(TxBdata, here("FullMDA", "TxBdata.rds")) # Last saved 6 December 2021
TxBdata %>% summarise(total = sum(Words))
## total
## 1 1693650
metadata <- TxBdata %>%
select(Filename, Country, Series, Level, Register, Words) %>%
mutate(Volume = paste(Series, Level)) %>%
mutate(Volume = fct_rev(Volume)) %>%
mutate(Volume = fct_reorder(Volume, as.numeric(Level))) %>%
group_by(Volume) %>%
mutate(wordcount = sum(Words)) %>%
ungroup() %>%
distinct(Volume, .keep_all = TRUE)
metadata
## # A tibble: 42 × 8
## Filename Country Series Level Register Words Volume wordcount
## <chr> <fct> <fct> <fct> <fct> <int> <fct> <int>
## 1 Achievers_A1_Instruc… Spain Achiev… A Instruct… 931 Achiev… 31214
## 2 Solutions_Pre-Interm… Spain Soluti… B Instruct… 889 Soluti… 59343
## 3 Achievers_A2_Persona… Spain Achiev… B Personal 979 Achiev… 36579
## 4 Achievers_B1_plus_In… Spain Achiev… D Informat… 690 Achiev… 48994
## 5 POC_5e_Spoken_0003.t… France POC B Conversa… 694 POC B 16964
## 6 Access_4_Narrative_0… Germany Access D Fiction 547 Access… 60986
## 7 NGL_1_Spoken_0002.txt Germany NGL A Conversa… 927 NGL A 33572
## 8 Access_1_Narrative_0… Germany Access A Fiction 840 Access… 34421
## 9 NGL_2_Narrative_0007… Germany NGL B Fiction 1127 NGL B 53694
## 10 EIM_4_Instructional_… Spain EIM D Instruct… 885 EIM D 35841
## # … with 32 more rows
ggplot(metadata, aes(x = Volume, y = wordcount, fill = Country)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = palette[c(3,8,1)]) +
scale_y_continuous(limits = c(0,90000), expand = c(0,0), labels = scales::comma) +
coord_flip() +
theme_bw() +
labs(x = "", y = "Number of words") +
theme(legend.position = c(0.73,0.1))
#ggsave(here("plots", "TEC-T_wordcounts.svg"), width = 6, height = 10)
metadataInstr <- TxBdata %>%
select(Country, Series, Level, Register, Words) %>%
filter(Register=="Instructional") %>%
mutate(Volume = paste(Series, Register)) %>%
mutate(Volume = fct_rev(Volume)) %>%
mutate(Volume = fct_reorder(Volume, as.numeric(Level))) %>%
group_by(Volume, Register) %>%
mutate(InstrWordcount = sum(Words)) %>%
ungroup() %>%
distinct(Volume, .keep_all = TRUE) %>%
select(Series, InstrWordcount)
metadataInstr
## # A tibble: 9 × 2
## Series InstrWordcount
## <fct> <int>
## 1 Achievers 109886
## 2 Solutions 87829
## 3 EIM 59928
## 4 HT 51550
## 5 Access 60938
## 6 NGL 79312
## 7 JTT 48375
## 8 GreenLine 54263
## 9 POC 30548
metaWordcount <- TxBdata %>%
select(Country, Series, Level, Register, Words) %>%
group_by(Series) %>%
mutate(TECwordcount = sum(Words)) %>%
ungroup() %>%
distinct(Series, .keep_all = TRUE) %>%
select(Series, TECwordcount)
wordcount <- merge(metaWordcount, metadataInstr, by = "Series")
wordcount %>%
mutate(InstrucPercent = InstrWordcount/TECwordcount*100) %>%
arrange(InstrucPercent) %>%
mutate(InstrucPercent = round(InstrucPercent, 2))
## Series TECwordcount InstrWordcount InstrucPercent
## 1 Access 259679 60938 23.47
## 2 NGL 278316 79312 28.50
## 3 GreenLine 172267 54263 31.50
## 4 Solutions 270278 87829 32.50
## 5 JTT 137557 48375 35.17
## 6 HT 142676 51550 36.13
## 7 POC 76714 30548 39.82
## 8 EIM 147185 59928 40.72
## 9 Achievers 208978 109886 52.58
colnames(TxBdata)
## [1] "Filename" "Country" "Series" "Level" "Register" "Words"
## [7] "ACT" "AMP" "ASPECT" "AWL" "BEMA" "CAUSE"
## [13] "CC" "COMM" "COND" "CONT" "CUZ" "DEMO"
## [19] "DMA" "DOAUX" "DT" "EMPH" "EX" "EXIST"
## [25] "FPP1P" "FPP1S" "FPUH" "FREQ" "HDG" "IN"
## [31] "JJAT" "JJPR" "LD" "MDCA" "MDCO" "MDNE"
## [37] "MDWO" "MDWS" "MENTAL" "NCOMP" "NN" "OCCUR"
## [43] "PASS" "PEAS" "PIT" "PLACE" "POLITE" "POS"
## [49] "PROG" "QUAN" "QUPR" "RB" "RP" "SPLIT"
## [55] "SPP2" "STPR" "THATD" "THRC" "THSC" "TIME"
## [61] "TPP3P" "TPP3S" "TTR" "VBD" "VBG" "VBN"
## [67] "VIMP" "VPRT" "WHQU" "WHSC" "XX0" "YNQU"
ncol(TxBdata) - 6
## [1] 66
kmo <- KMO(TxBdata[,7:ncol(TxBdata)])
kmo # 0.86 (if conducted on counts all normalised per 100 words, this value drops to 0.61!)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = TxBdata[, 7:ncol(TxBdata)])
## Overall MSA = 0.86
## MSA for each item =
## ACT AMP ASPECT AWL BEMA CAUSE CC COMM COND CONT CUZ
## 0.65 0.95 0.90 0.92 0.93 0.69 0.90 0.87 0.75 0.92 0.95
## DEMO DMA DOAUX DT EMPH EX EXIST FPP1P FPP1S FPUH FREQ
## 0.89 0.96 0.84 0.80 0.96 0.90 0.86 0.92 0.87 0.95 0.65
## HDG IN JJAT JJPR LD MDCA MDCO MDNE MDWO MDWS MENTAL
## 0.96 0.88 0.89 0.94 0.68 0.53 0.77 0.52 0.34 0.46 0.84
## NCOMP NN OCCUR PASS PEAS PIT PLACE POLITE POS PROG QUAN
## 0.79 0.88 0.90 0.94 0.91 0.96 0.97 0.96 0.64 0.92 0.92
## QUPR RB RP SPLIT SPP2 STPR THATD THRC THSC TIME TPP3P
## 0.95 0.95 0.81 0.93 0.83 0.81 0.86 0.89 0.95 0.93 0.80
## TPP3S TTR VBD VBG VBN VIMP VPRT WHQU WHSC XX0 YNQU
## 0.66 0.91 0.59 0.86 0.95 0.78 0.60 0.89 0.85 0.92 0.91
kmo$MSAi[order(kmo$MSAi)]
## MDWO MDWS MDNE MDCA VBD VPRT POS ACT
## 0.3419233 0.4607994 0.5174560 0.5323388 0.5947193 0.6037051 0.6435553 0.6517842
## FREQ TPP3S LD CAUSE COND MDCO VIMP NCOMP
## 0.6518112 0.6595372 0.6831356 0.6886684 0.7472644 0.7681180 0.7782319 0.7914283
## DT TPP3P STPR RP SPP2 MENTAL DOAUX WHSC
## 0.7989497 0.8040188 0.8106413 0.8137266 0.8288755 0.8388478 0.8400305 0.8545428
## VBG EXIST THATD COMM FPP1S IN NN WHQU
## 0.8566262 0.8621924 0.8625047 0.8685300 0.8726754 0.8819697 0.8845778 0.8866317
## JJAT DEMO THRC ASPECT CC EX OCCUR PEAS
## 0.8888528 0.8899441 0.8927221 0.8950231 0.8980297 0.9028360 0.9030450 0.9069610
## TTR YNQU AWL QUAN FPP1P PROG XX0 CONT
## 0.9095693 0.9102920 0.9179605 0.9183916 0.9210382 0.9218841 0.9228772 0.9237714
## TIME BEMA SPLIT PASS JJPR AMP QUPR THSC
## 0.9280529 0.9298954 0.9340567 0.9364387 0.9376667 0.9495248 0.9502215 0.9505850
## RB FPUH CUZ VBN PIT DMA POLITE EMPH
## 0.9510642 0.9525589 0.9546196 0.9547362 0.9557132 0.9564074 0.9587834 0.9597464
## HDG PLACE
## 0.9627588 0.9655892
# Remove first feature with MSAs of < 0.5
TxBdata <- TxBdata %>%
select(-c(MDWO))
kmo <- KMO(TxBdata[,7:ncol(TxBdata)])
kmo # 0.87
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = TxBdata[, 7:ncol(TxBdata)])
## Overall MSA = 0.87
## MSA for each item =
## ACT AMP ASPECT AWL BEMA CAUSE CC COMM COND CONT CUZ
## 0.66 0.95 0.89 0.92 0.93 0.70 0.91 0.87 0.80 0.93 0.96
## DEMO DMA DOAUX DT EMPH EX EXIST FPP1P FPP1S FPUH FREQ
## 0.89 0.96 0.84 0.80 0.96 0.90 0.86 0.92 0.87 0.95 0.65
## HDG IN JJAT JJPR LD MDCA MDCO MDNE MDWS MENTAL NCOMP
## 0.96 0.88 0.89 0.94 0.69 0.61 0.77 0.58 0.55 0.85 0.81
## NN OCCUR PASS PEAS PIT PLACE POLITE POS PROG QUAN QUPR
## 0.89 0.91 0.94 0.90 0.96 0.97 0.96 0.64 0.93 0.92 0.95
## RB RP SPLIT SPP2 STPR THATD THRC THSC TIME TPP3P TPP3S
## 0.95 0.81 0.93 0.83 0.82 0.87 0.89 0.95 0.92 0.80 0.66
## TTR VBD VBG VBN VIMP VPRT WHQU WHSC XX0 YNQU
## 0.91 0.63 0.85 0.96 0.81 0.65 0.89 0.85 0.92 0.91
kmo$MSAi[order(kmo$MSAi)] # All individual MSA > 0.5
## MDWS MDNE MDCA VBD POS VPRT FREQ ACT
## 0.5465989 0.5809599 0.6146009 0.6323342 0.6414034 0.6451452 0.6501807 0.6573716
## TPP3S LD CAUSE MDCO COND DT TPP3P VIMP
## 0.6624572 0.6852487 0.6976734 0.7674031 0.7956649 0.7987725 0.8040846 0.8076928
## NCOMP RP STPR SPP2 DOAUX MENTAL VBG WHSC
## 0.8091170 0.8116498 0.8191502 0.8267635 0.8362975 0.8468678 0.8537894 0.8544238
## EXIST THATD FPP1S COMM IN NN DEMO WHQU
## 0.8622285 0.8662124 0.8722478 0.8722598 0.8816688 0.8869328 0.8886727 0.8897047
## THRC JJAT ASPECT PEAS EX OCCUR CC TTR
## 0.8915424 0.8933355 0.8935492 0.9002544 0.9018388 0.9059682 0.9105868 0.9139092
## YNQU AWL QUAN FPP1P TIME XX0 CONT PROG
## 0.9143984 0.9173308 0.9176126 0.9199363 0.9221310 0.9241552 0.9277727 0.9293376
## BEMA SPLIT PASS JJPR THSC AMP RB QUPR
## 0.9296225 0.9339730 0.9360995 0.9387083 0.9470548 0.9476720 0.9502436 0.9509730
## FPUH PIT VBN DMA CUZ POLITE EMPH HDG
## 0.9530703 0.9550031 0.9552492 0.9567070 0.9581991 0.9594241 0.9608348 0.9619206
## PLACE
## 0.9657210
https://rdrr.io/cran/FactorAssumptions/man/communalities_optimal_solution.html
#TxBdata <- readRDS(here("FullMDA", "TxBdata.rds")) %>% select(-MDWO)
#colnames(TxBdata)
# Plot screen plot
#png(here("plots", "screeplot-TEC-only.png"), width = 20, height= 12, units = "cm", res = 300)
scree(TxBdata[,7:ncol(TxBdata)], factors = FALSE, pc = TRUE) # Retain six components
dev.off()
## null device
## 1
# Perform PCA
pca1 <- psych::principal(TxBdata[,7:ncol(TxBdata)],
nfactors = 6)
pca1$loadings
##
## Loadings:
## RC1 RC2 RC4 RC6 RC3 RC5
## ACT -0.387 0.260 0.111 0.105 0.147
## AMP 0.405 0.368 0.148
## ASPECT -0.432 -0.222 0.303
## AWL -0.662 0.402 -0.458 0.147
## BEMA 0.832 0.148 0.102
## CAUSE 0.111 0.108 0.386 -0.135
## CC -0.202 0.653 -0.276 0.172 0.106
## COMM -0.665 -0.353 -0.151 -0.165
## COND 0.499 0.129
## CONT 0.849 -0.183 0.236 0.185
## CUZ 0.256 0.422 0.149
## DEMO 0.369 -0.163 0.221 0.192 0.149
## DMA 0.782 -0.324 0.138
## DOAUX 0.121 -0.566 0.175 -0.212 0.261
## DT -0.571 -0.181 0.317 -0.235 0.321
## EMPH 0.581 0.111 0.376 0.129 0.113
## EX 0.355 0.231 -0.263 0.325 0.101 0.247
## EXIST 0.499 -0.157 0.101
## FPP1P 0.598 0.200 0.123
## FPP1S 0.724 -0.142 0.274 0.185 -0.137
## FPUH 0.748 -0.348 -0.138
## FREQ 0.127 0.433
## HDG 0.276 0.177 0.219 0.117
## IN -0.677 0.412 0.187
## JJAT 0.515 0.234 0.134 0.382
## JJPR 0.593 0.276 0.125 0.204
## LD -0.319 0.117 -0.244 -0.695 0.118
## MDCA 0.197 -0.182 0.177 0.603
## MDCO 0.217 0.338 0.178 -0.321 -0.115
## MDNE 0.142 0.345 0.123
## MDWS 0.178 0.462 0.123 -0.117
## MENTAL -0.449 -0.423 0.289 -0.241 0.134 0.149
## NCOMP 0.249 -0.299 0.385 -0.112
## NN -0.530 0.284 -0.517 -0.408 0.225 -0.148
## OCCUR -0.136 0.503 -0.201
## PASS 0.715 0.121 -0.174
## PEAS 0.482 0.445 -0.190
## PIT 0.599 0.201 0.133 0.167
## PLACE 0.544 0.110 0.316 -0.104
## POLITE 0.668 -0.323 0.104 -0.131
## POS -0.468
## PROG 0.299 0.263 0.229 -0.113 -0.174
## QUAN 0.359 0.321 0.412 0.377
## QUPR 0.188 0.137 0.387 0.277
## RB 0.441 0.187 0.315 0.425 -0.269
## RP -0.178 0.207 0.214 0.281 -0.377
## SPLIT 0.243 0.560 0.369
## SPP2 -0.255 -0.590 0.239 -0.170 0.465
## STPR -0.103 -0.268
## THATD 0.131 -0.131 0.501 -0.117 -0.135 0.212
## THRC -0.128 0.348 0.192 0.162 0.123
## THSC 0.493 0.297 0.141 0.161
## TIME 0.359 0.268 0.109 0.193 -0.262
## TPP3P 0.531 -0.103 0.138 0.112
## TPP3S 0.128 0.263 0.147 -0.571 -0.352
## TTR 0.750 0.153 -0.122 -0.189
## VBD 0.118 0.452 0.266 -0.691 -0.183
## VBG -0.225 0.512 0.279 -0.195 0.164
## VBN 0.586 -0.323 -0.142
## VIMP -0.728 -0.511 -0.106 -0.252 0.174
## VPRT 0.627 0.533 0.135
## WHQU -0.221 -0.674 -0.333 0.139 0.106
## WHSC -0.511 0.288 0.219 0.239 -0.232
## XX0 0.674 0.282 0.208
## YNQU 0.246 -0.650 -0.249 0.151
##
## RC1 RC2 RC4 RC6 RC3 RC5
## SS loadings 11.240 8.383 3.370 3.099 2.871 2.108
## Proportion Var 0.173 0.129 0.052 0.048 0.044 0.032
## Cumulative Var 0.173 0.302 0.354 0.401 0.446 0.478
pca1$communality %>% sort(.) # If features with communalities of < 0.2 are removed, we remove STPR, HDG, MDNE and CAUSE
## STPR MDNE HDG CAUSE FREQ THRC POS
## 0.08680591 0.16599754 0.17031066 0.19385220 0.21559975 0.21665202 0.24146645
## PROG ACT DEMO MDWS CUZ COND QUPR
## 0.25772748 0.26338996 0.27063897 0.27686168 0.27687863 0.27739424 0.28622322
## EXIST MDCO NCOMP OCCUR TIME ASPECT TPP3P
## 0.29399921 0.30928840 0.31480148 0.31708189 0.31918395 0.33002812 0.33354566
## AMP RP THATD THSC EX FPP1P PLACE
## 0.33518074 0.34193421 0.36279318 0.38651020 0.42556202 0.42852216 0.42992831
## PIT VBG PEAS MDCA DOAUX VBN JJPR
## 0.45207315 0.45554303 0.47436604 0.47786754 0.47949968 0.48281382 0.48762746
## JJAT WHSC SPLIT EMPH QUAN MENTAL TPP3S
## 0.48782476 0.50334350 0.51457713 0.52622378 0.55407414 0.56217178 0.56414393
## PASS YNQU POLITE RB CC XX0 DT
## 0.56500214 0.57919758 0.58209654 0.58283536 0.58388767 0.58932610 0.61919662
## COMM WHQU TTR FPP1S IN LD FPUH
## 0.62403733 0.64487002 0.64917506 0.67309250 0.67535202 0.67672838 0.70390909
## VPRT SPP2 BEMA DMA VBD AWL CONT
## 0.70823740 0.71930226 0.73708708 0.74191227 0.79994657 0.84672034 0.84847556
## NN VIMP
## 0.86821643 0.90139093
TxBdataforPCA <- TxBdata %>%
select(-c(STPR, MDNE, HDG, CAUSE))
kmo <- KMO(TxBdataforPCA[,7:ncol(TxBdataforPCA)])
kmo # 0.88
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = TxBdataforPCA[, 7:ncol(TxBdataforPCA)])
## Overall MSA = 0.88
## MSA for each item =
## ACT AMP ASPECT AWL BEMA CC COMM COND CONT CUZ DEMO
## 0.67 0.95 0.89 0.92 0.93 0.91 0.88 0.78 0.93 0.96 0.89
## DMA DOAUX DT EMPH EX EXIST FPP1P FPP1S FPUH FREQ IN
## 0.96 0.84 0.79 0.96 0.90 0.86 0.92 0.87 0.95 0.65 0.89
## JJAT JJPR LD MDCA MDCO MDWS MENTAL NCOMP NN OCCUR PASS
## 0.89 0.94 0.69 0.64 0.79 0.54 0.85 0.82 0.89 0.90 0.94
## PEAS PIT PLACE POLITE POS PROG QUAN QUPR RB RP SPLIT
## 0.91 0.95 0.97 0.96 0.64 0.93 0.91 0.95 0.95 0.82 0.93
## SPP2 THATD THRC THSC TIME TPP3P TPP3S TTR VBD VBG VBN
## 0.82 0.86 0.90 0.95 0.92 0.81 0.66 0.92 0.65 0.86 0.95
## VIMP VPRT WHQU WHSC XX0 YNQU
## 0.82 0.66 0.89 0.86 0.92 0.91
kmo$MSAi[order(kmo$MSAi)] # All individual MSA > 0.5
## MDWS MDCA POS FREQ VBD TPP3S VPRT ACT
## 0.5366469 0.6420343 0.6429012 0.6462548 0.6533315 0.6642081 0.6647081 0.6651354
## LD COND DT MDCO TPP3P RP NCOMP VIMP
## 0.6862060 0.7828310 0.7921304 0.7928956 0.8110997 0.8197204 0.8210077 0.8240245
## SPP2 DOAUX MENTAL WHSC VBG THATD EXIST FPP1S
## 0.8241715 0.8366812 0.8483976 0.8550631 0.8566951 0.8635491 0.8638716 0.8709469
## COMM NN IN WHQU DEMO ASPECT JJAT THRC
## 0.8781870 0.8859333 0.8881073 0.8890239 0.8904559 0.8930911 0.8947296 0.8985048
## EX OCCUR PEAS CC YNQU QUAN AWL TIME
## 0.9002604 0.9040632 0.9060428 0.9121949 0.9133540 0.9145440 0.9189537 0.9189799
## XX0 FPP1P TTR CONT PROG BEMA SPLIT PASS
## 0.9199086 0.9209230 0.9215357 0.9277423 0.9284678 0.9296929 0.9347762 0.9369669
## JJPR THSC RB QUPR AMP FPUH PIT VBN
## 0.9383203 0.9472544 0.9492791 0.9508128 0.9518372 0.9521478 0.9542214 0.9547174
## DMA POLITE CUZ EMPH PLACE
## 0.9568827 0.9590518 0.9592047 0.9622991 0.9655714
# Final number of features
ncol(TxBdataforPCA)-6
## [1] 61
#saveRDS(TxBdataforPCA, here("FullMDA", "TxBdataforPCA.rds")) # Last saved on 3 December 2021
This chunk was used when considering whether or not to rotate the components (see methods section)
# Comparing a rotated vs. a non-rotated solution
TxBdata <- readRDS(here("FullMDA", "TxBdataforPCA.rds"))
colnames(TxBdata)
ncol(TxBdata)-6
# No rotation
pca2 <- psych::principal(TxBdata[,7:ncol(TxBdata)],
nfactors = 6,
rotate = "none")
pca2$loadings
biplot.psych(pca2,
vars = TRUE,
choose=c(1,2),
)
# Promax rotation
pca2.rotated <- psych::principal(TxBdata[,7:ncol(TxBdata)],
nfactors = 6,
rotate = "promax")
# This summary shows the component correlations which is particularly interesting
pca2.rotated
pca2.rotated$loadings
biplot.psych(pca2.rotated, vars = TRUE, choose=c(1,2))
# Perform PCA on full data
TxBdata <- readRDS(here("FullMDA", "TxBdataforPCA.rds"))
nrow(TxBdata)
## [1] 1961
ncol(TxBdata)-6
## [1] 61
Perform PCA on random subset of the data to test the stability of the solution. Re-running this line will generate a new subset of the TEC texts containing 2/3 of the texts randomly sampled.
TxBdata <- readRDS(here("FullMDA", "TxBdataforPCA.rds")) %>%
slice_sample(n = 1961*0.6, replace = FALSE)
nrow(TxBdata)
TxBdata$Filename[1:10]
colnames(TxBdata)
nrow(TxBdata) / (ncol(TxBdata)-6) # Check that there is enough data to conduct a PCA. This should be > 5 (see Friginal & Hardy 2014: 303–304).
Perform PCA on country subset of the data to test the stability of the solution:
TxBdata <- readRDS(here("FullMDA", "TxBdataforPCA.rds")) %>%
#filter(Country == "France")
#filter(Country == "Germany")
filter(Country == "Spain")
nrow(TxBdata)
TxBdata$Filename[1:10] # Check data
colnames(TxBdata)
nrow(TxBdata) / (ncol(TxBdata)-6) # Check that there is enough data to conduct a PCA. This should be > 5 (see Friginal & Hardy 2014: 303–304).
pca <- prcomp(TxBdata[,7:ncol(TxBdata)], scale.=FALSE) # All quantitative variables for all TxB files except outliers
register <- factor(TxBdata[,"Register"]) # Register
level <- factor(TxBdata[,"Level"]) # Textbook proficiency level
summary(register)
## Conversation Fiction Informative Instructional Personal
## 583 285 363 644 86
summary(level)
## A B C D E
## 281 395 494 468 323
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.1693 1.7776 1.08902 1.00207 0.84288 0.76792 0.70295
## Proportion of Variance 0.2108 0.1416 0.05313 0.04499 0.03183 0.02642 0.02214
## Cumulative Proportion 0.2108 0.3524 0.40553 0.45051 0.48234 0.50876 0.53090
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.67987 0.66896 0.62006 0.60634 0.59078 0.5882 0.58213
## Proportion of Variance 0.02071 0.02005 0.01722 0.01647 0.01564 0.0155 0.01518
## Cumulative Proportion 0.55160 0.57165 0.58888 0.60535 0.62098 0.6365 0.65167
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 0.56560 0.55730 0.54868 0.54180 0.5325 0.52111 0.5153
## Proportion of Variance 0.01433 0.01391 0.01349 0.01315 0.0127 0.01217 0.0119
## Cumulative Proportion 0.66600 0.67991 0.69340 0.70655 0.7192 0.73142 0.7433
## PC22 PC23 PC24 PC25 PC26 PC27 PC28
## Standard deviation 0.51058 0.50478 0.49919 0.48542 0.48162 0.47183 0.46835
## Proportion of Variance 0.01168 0.01142 0.01116 0.01056 0.01039 0.00997 0.00983
## Cumulative Proportion 0.75499 0.76641 0.77757 0.78813 0.79852 0.80849 0.81832
## PC29 PC30 PC31 PC32 PC33 PC34 PC35
## Standard deviation 0.46383 0.46072 0.45286 0.44463 0.44124 0.43630 0.42986
## Proportion of Variance 0.00964 0.00951 0.00919 0.00886 0.00872 0.00853 0.00828
## Cumulative Proportion 0.82796 0.83747 0.84666 0.85551 0.86423 0.87276 0.88104
## PC36 PC37 PC38 PC39 PC40 PC41 PC42
## Standard deviation 0.42419 0.41741 0.41442 0.40058 0.39737 0.39455 0.38291
## Proportion of Variance 0.00806 0.00781 0.00769 0.00719 0.00707 0.00697 0.00657
## Cumulative Proportion 0.88910 0.89691 0.90460 0.91179 0.91887 0.92584 0.93241
## PC43 PC44 PC45 PC46 PC47 PC48 PC49
## Standard deviation 0.37583 0.36825 0.35414 0.34269 0.33970 0.3308 0.32069
## Proportion of Variance 0.00633 0.00608 0.00562 0.00526 0.00517 0.0049 0.00461
## Cumulative Proportion 0.93874 0.94481 0.95043 0.95569 0.96086 0.9658 0.97037
## PC50 PC51 PC52 PC53 PC54 PC55 PC56
## Standard deviation 0.3098 0.30595 0.28868 0.27485 0.25385 0.24257 0.2167
## Proportion of Variance 0.0043 0.00419 0.00373 0.00338 0.00289 0.00264 0.0021
## Cumulative Proportion 0.9747 0.97887 0.98260 0.98598 0.98887 0.99151 0.9936
## PC57 PC58 PC59 PC60 PC61
## Standard deviation 0.21094 0.18995 0.18519 0.1496 0.07340
## Proportion of Variance 0.00199 0.00162 0.00154 0.0010 0.00024
## Cumulative Proportion 0.99560 0.99722 0.99876 0.9998 1.00000
scree(TxBdata[,7:ncol(TxBdata)], factors = FALSE, pc = TRUE) # The scree plots pretty much always suggest that six components should be retained
## 3-D PCA plots https://cran.r-project.org/web/packages/pca3d/vignettes/pca3d.pdf ##
# 3-D plot
library(pca3d) # For 3-D plots
col <- palette[c(1:3,8,7)] # without poetry
names(col) <- c("Conversation", "Fiction", "Informative", "Instructional", "Personal")
scales::show_col(col) # Check colours
pca3d(pca,
group = register,
components = 1:3,
#components = 4:6,
show.ellipses=FALSE,
ellipse.ci=0.75,
show.plane=FALSE,
col = col,
shape = "sphere",
radius = 1,
legend = "right")
snapshotPCA3d(here("plots", "PCA_TxB_3Dsnapshot.png"))
names(col) <- c("C", "B", "E", "A", "D") # To colour the dots according to the profiency level of the textbooks
pca3d(pca,
components = 4:6,
group = level,
show.ellipses=FALSE,
ellipse.ci=0.75,
show.plane=FALSE,
col = col,
shape = "sphere",
radius = 0.8,
legend = "right")
# PCA for PCAtools
# This package requires the data to be formatted in a rather unconventional way so it needs to wrangled first.
#TxBdata <- readRDS(here("FullMDA", "TxBdataforPCA.rds"))
TxBdata2meta <- TxBdata[,1:6]
rownames(TxBdata2meta) <- TxBdata2meta$Filename
TxBdata2meta <- TxBdata2meta %>% select(-Filename)
head(TxBdata2meta)
## Country Series Level
## Achievers_A1_Instructional_0012.txt Spain Achievers A
## Solutions_Pre-Intermediate_Instructional_0023.txt Spain Solutions B
## Achievers_A2_Personal_0003.txt Spain Achievers B
## Achievers_B1_plus_Informative_0007.txt Spain Achievers D
## POC_5e_Spoken_0003.txt France POC B
## Access_4_Narrative_0013.txt Germany Access D
## Register Words
## Achievers_A1_Instructional_0012.txt Instructional 931
## Solutions_Pre-Intermediate_Instructional_0023.txt Instructional 889
## Achievers_A2_Personal_0003.txt Personal 979
## Achievers_B1_plus_Informative_0007.txt Informative 690
## POC_5e_Spoken_0003.txt Conversation 694
## Access_4_Narrative_0013.txt Fiction 547
TxBdata2 = TxBdata
rownames(TxBdata2) <- TxBdata2$Filename
TxBdata2num <- as.data.frame(base::t(TxBdata2[,7:ncol(TxBdata2)]))
TxBdata2num[1:12,1:3] # Check sanity of data
## Achievers_A1_Instructional_0012.txt
## ACT -0.2310193
## AMP -0.6575617
## ASPECT 0.2019095
## AWL 0.7330794
## BEMA -0.6836245
## CC -0.4984911
## COMM 1.0357805
## COND -0.1011047
## CONT -0.6124532
## CUZ -0.5310538
## DEMO -0.3706952
## DMA -0.4459697
## Solutions_Pre-Intermediate_Instructional_0023.txt
## ACT 0.9328214
## AMP -0.4007729
## ASPECT 1.0196960
## AWL 0.5749224
## BEMA -0.7093252
## CC 0.1285756
## COMM 0.7819071
## COND -0.6146586
## CONT -0.7525215
## CUZ -0.5310538
## DEMO -0.3318521
## DMA -0.5288974
## Achievers_A2_Personal_0003.txt
## ACT 0.59748978
## AMP -0.42747830
## ASPECT -0.52672897
## AWL -0.63781078
## BEMA -0.03367772
## CC -0.12522295
## COMM -0.55058584
## COND 0.04641396
## CONT 0.76769734
## CUZ -0.04087171
## DEMO -0.66887418
## DMA -0.36474869
p <- PCAtools::pca(TxBdata2num,
metadata = TxBdata2meta,
scale = FALSE)
colkey = c(Conversation="#BD241E", Fiction="#A18A33", Informative="#15274D", Instructional="#F9B921", Personal="#722672")
shapekey = c(A=1, B=2, C=6, D=0, E=5)
# Biplots to examine components more carefully
#png(here("plots", "PCA_TxB_Biplot_PC1_PC2.png"), width = 40, height= 25, units = "cm", res = 300)
PCAtools::biplot(p,
x = "PC1",
y = "PC2",
lab = NULL, # Otherwise will try to label each data point!
#xlim = c(min(p$rotated$PC1)-0.5, max(p$rotated$PC1)+0.5),
#ylim = c(min(p$rotated$PC2)-0.5, max(p$rotated$PC2)+0.5),
colby = "Register",
pointSize = 2,
colkey = colkey,
shape = "Level",
shapekey = shapekey,
showLoadings = FALSE,
ellipse = TRUE,
axisLabSize = 22,
legendPosition = 'right',
legendTitleSize = 22,
legendLabSize = 18,
legendIconSize = 7) +
theme(plot.margin = unit(c(0,0,0,0.2), "cm"))
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
#dev.off()
#ggsave(here("plots", "PCA_TxB_BiplotPC1_PC2.svg"), width = 12, height = 10)
# Biplots to examine components more carefully
pRegisters <- PCAtools::biplot(p,
x = "PC3",
y = "PC4",
lab = NULL, # Otherwise will try to label each data point!
colby = "Register",
pointSize = 2,
colkey = colkey,
shape = "Level",
shapekey = shapekey,
showLoadings = FALSE,
ellipse = TRUE,
legendPosition = 'right',
legendTitleSize = 22,
legendLabSize = 18,
legendIconSize = 7) +
theme(plot.margin = unit(c(0,0,0,0.2), "cm"))
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
#ggsave(here("plots", "PCA_TxB_BiplotPC3_PC4.svg"), width = 12, height = 10)
# Biplot with ellipses for Level rather than Register
colkey = c(A="#F9B921", B="#A18A33", C="#BD241E", D="#722672", E="#15274D")
shapekey = c(Conversation=1, Fiction=2, Informative=6, Instructional=0, Personal=5)
pLevels <- PCAtools::biplot(p,
x = "PC3",
y = "PC4",
lab = NULL, # Otherwise will try to label each data point!
#xlim = c(min(p$rotated$PC1)-0.5, max(p$rotated$PC1)+0.5),
#ylim = c(min(p$rotated$PC2)-0.5, max(p$rotated$PC2)+0.5),
colby = "Level",
pointSize = 2,
colkey = colkey,
shape = "Register",
shapekey = shapekey,
showLoadings = FALSE,
ellipse = TRUE,
legendPosition = 'right',
legendTitleSize = 22,
legendLabSize = 18,
legendIconSize = 7) +
theme(plot.margin = unit(c(0,0,0,0.2), "cm"))
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
#ggsave(here("plots", "PCA_TxB_BiplotPC3_PC4_Level.svg"), width = 12, height = 10)
# Save the two different representations of data points on PC2 and PC3 using the {patchwork} package
pRegisters / pLevels
#ggsave(here("plots", "PCA_TxB_BiplotPC3_PC4_Register_vs_Level.svg"), width = 14, height = 20)
pLevels <- PCAtools::biplot(p,
x = "PC5",
y = "PC6",
lab = NULL, # Otherwise will try to label each data point!
xlim = c(min(p$rotated$PC5)-0.5, max(p$rotated$PC5)+0.5),
ylim = c(min(p$rotated$PC6)-0.5, max(p$rotated$PC6)+0.5),
colby = "Level",
pointSize = 2,
colkey = colkey,
shape = "Register",
shapekey = shapekey,
showLoadings = FALSE,
ellipse = TRUE,
#axisLabSize = 22,
legendPosition = 'right',
legendTitleSize = 22,
legendLabSize = 18,
legendIconSize = 7) +
theme(plot.margin = unit(c(0,0,0,0.2), "cm"))
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
#pLevels
#ggsave(here("plots", "PCA_TxB_BiplotPC5_PC6_Level.svg"), width = 12, height = 10)
# Biplots to examine components more carefully
colkey = c(Conversation="#BD241E", Fiction="#A18A33", Informative="#15274D", Instructional="#F9B921", Personal="#722672")
shapekey = c(A=1, B=2, C=6, D=0, E=5)
pRegisters <- PCAtools::biplot(p,
x = "PC5",
y = "PC6",
lab = NULL, # Otherwise will try to label each data point!
#xlim = c(min(p$rotated$PC1)-0.5, max(p$rotated$PC1)+0.5),
#ylim = c(min(p$rotated$PC2)-0.5, max(p$rotated$PC2)+0.5),
colby = "Register",
pointSize = 2,
colkey = colkey,
shape = "Level",
shapekey = shapekey,
showLoadings = FALSE,
ellipse = TRUE,
axisLabSize = 22,
legendPosition = 'right',
legendTitleSize = 22,
legendLabSize = 18,
legendIconSize = 7) +
theme(plot.margin = unit(c(0,0,0,0.2), "cm"))
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
#ggsave("Plots/PCA_TxB_BiplotPC5_PC6.svg", width = 12, height = 10)
# Save the two different representations of data points on PC4 and PC5 using the {patchwork} package
pRegisters / pLevels
#ggsave(here("plots", "PCA_TxB_BiplotPC5_PC6_Register_vs_Level.svg"), width = 14, height = 20)
## Very slow, open in zoomed out window!
# Add legend manually? Yes (take it from the biplot code below), let's not waste too much time, here. Or use Evert's mvar.pairs plot (though that also requires manual axis annotation)
png(here("plots", "PCA_TxB_pairsplot.png"), width = 25, height= 45, units = "cm", res = 300)
PCAtools::pairsplot(p,
triangle = FALSE,
components = 1:6,
ncol = 3,
nrow = 5,
pointSize = 0.8,
lab = NULL, # Otherwise will try to label each data point!
colby = "Register",
colkey = colkey,
shape = "Level",
shapekey = shapekey,
margingaps = unit(c(0.2, 0.2, 0.8, 0.2), "cm"),
legendPosition = "none")
dev.off()
#ggsave(here("plots", "PCA_TxB_pairsplot.svg"), width = 15, height = 15)
TxBdata <- readRDS(here("FullMDA", "TxBdataforPCA.rds"))
pca <- prcomp(TxBdata[,7:ncol(TxBdata)], scale.=FALSE) # All quantitative variables for all TxB files
# The rotated data that represents the observations / samples is stored in rotated, while the variable loadings are stored in loadings
loadings <- as.data.frame(pca$rotation[,1:4])
loadings %>% round(2)
## PC1 PC2 PC3 PC4
## ACT 0.08 -0.11 0.04 -0.10
## AMP -0.12 -0.10 -0.11 0.01
## ASPECT 0.10 -0.05 0.14 -0.01
## AWL 0.22 -0.16 -0.12 -0.13
## BEMA -0.22 0.01 -0.21 0.02
## CC 0.05 -0.21 -0.19 0.00
## COMM 0.20 0.09 0.14 -0.04
## COND -0.01 -0.02 0.11 -0.24
## CONT -0.25 0.11 -0.03 -0.06
## CUZ -0.09 -0.13 -0.06 -0.02
## DEMO -0.12 0.08 0.03 -0.09
## DMA -0.20 0.14 -0.02 0.00
## DOAUX -0.01 0.20 0.05 -0.15
## DT 0.12 0.00 0.31 -0.02
## EMPH -0.19 -0.02 0.06 -0.14
## EX -0.10 -0.05 -0.11 0.05
## EXIST -0.02 -0.15 -0.09 -0.09
## FPP1P -0.17 0.01 -0.07 0.00
## FPP1S -0.23 0.07 0.08 -0.01
## FPUH -0.16 0.15 -0.09 0.07
## FREQ -0.03 -0.05 0.01 -0.10
## IN 0.17 -0.18 0.02 -0.08
## JJAT -0.06 -0.18 0.04 -0.21
## JJPR -0.17 -0.06 -0.11 -0.11
## LD 0.16 -0.03 -0.26 -0.01
## MDCA -0.04 0.10 -0.18 -0.09
## MDCO -0.05 -0.10 0.22 0.01
## MDWS -0.07 -0.01 0.05 -0.16
## MENTAL 0.14 0.13 0.12 -0.25
## NCOMP 0.04 -0.05 -0.24 -0.15
## NN 0.20 -0.09 -0.29 0.11
## OCCUR 0.02 -0.18 0.03 0.02
## PASS -0.01 -0.22 -0.06 -0.05
## PEAS -0.06 -0.17 0.13 -0.13
## PIT -0.19 -0.04 -0.06 -0.06
## PLACE -0.16 -0.01 -0.07 0.09
## POLITE -0.14 0.13 -0.07 0.02
## POS -0.01 0.03 -0.04 0.16
## PROG -0.11 -0.02 0.11 0.00
## QUAN -0.15 -0.03 0.12 -0.19
## QUPR -0.10 -0.05 0.16 -0.11
## RB -0.19 -0.08 0.20 0.00
## RP 0.00 -0.09 0.14 0.02
## SPLIT -0.11 -0.18 0.02 -0.16
## SPP2 0.10 0.22 -0.01 -0.25
## THATD -0.05 0.04 0.16 -0.24
## THRC 0.02 -0.11 -0.02 -0.18
## THSC -0.06 -0.17 0.07 -0.14
## TIME -0.12 -0.08 -0.01 0.06
## TPP3P -0.01 -0.16 -0.09 -0.02
## TPP3S -0.06 -0.11 0.13 0.30
## TTR -0.04 -0.26 -0.05 -0.01
## VBD -0.08 -0.20 0.23 0.30
## VBG 0.04 -0.18 0.00 -0.22
## VBN 0.03 -0.18 -0.07 -0.04
## VIMP 0.25 0.15 0.04 -0.08
## VPRT -0.15 0.05 -0.32 -0.22
## WHQU 0.11 0.23 0.00 -0.09
## WHSC 0.11 -0.11 0.03 -0.15
## XX0 -0.22 0.03 0.06 -0.06
## YNQU -0.03 0.23 0.00 -0.08
smallToZero <- function(x) {if_else(x < 0.05 & x > -0.05, 0, x)}
loadings %>%
filter_all(any_vars(. > abs(0.05))) %>%
mutate_all(smallToZero) %>%
round(3)
## PC1 PC2 PC3 PC4
## ACT 0.080 -0.105 0.000 -0.101
## ASPECT 0.100 0.000 0.139 0.000
## AWL 0.216 -0.159 -0.121 -0.134
## CC 0.052 -0.208 -0.191 0.000
## COMM 0.201 0.088 0.136 0.000
## COND 0.000 0.000 0.107 -0.245
## CONT -0.254 0.110 0.000 -0.061
## DEMO -0.117 0.075 0.000 -0.092
## DMA -0.201 0.144 0.000 0.000
## DOAUX 0.000 0.200 0.055 -0.147
## DT 0.119 0.000 0.306 0.000
## EMPH -0.191 0.000 0.059 -0.136
## EX -0.104 -0.053 -0.115 0.053
## FPP1S -0.228 0.073 0.076 0.000
## FPUH -0.161 0.148 -0.094 0.067
## IN 0.172 -0.178 0.000 -0.079
## LD 0.159 0.000 -0.259 0.000
## MDCA 0.000 0.104 -0.179 -0.092
## MDCO 0.000 -0.101 0.216 0.000
## MENTAL 0.138 0.128 0.117 -0.254
## NN 0.204 -0.090 -0.293 0.112
## PEAS -0.056 -0.174 0.134 -0.129
## PLACE -0.163 0.000 -0.072 0.092
## POLITE -0.141 0.130 -0.067 0.000
## POS 0.000 0.000 0.000 0.164
## PROG -0.115 0.000 0.110 0.000
## QUAN -0.148 0.000 0.116 -0.194
## QUPR -0.101 -0.053 0.157 -0.105
## RB -0.188 -0.075 0.204 0.000
## RP 0.000 -0.089 0.139 0.000
## SPP2 0.097 0.219 0.000 -0.252
## THATD -0.054 0.000 0.160 -0.236
## THSC -0.058 -0.167 0.074 -0.137
## TIME -0.125 -0.077 0.000 0.061
## TPP3S -0.057 -0.113 0.134 0.302
## VBD -0.082 -0.201 0.226 0.297
## VIMP 0.247 0.154 0.000 -0.076
## VPRT -0.155 0.054 -0.322 -0.221
## WHQU 0.110 0.230 0.000 -0.092
## WHSC 0.114 -0.112 0.000 -0.150
## XX0 -0.217 0.000 0.056 -0.055
## YNQU 0.000 0.234 0.000 -0.083
#write_last_clip()
# Compare frequencies of the individual features across different registers and levels
TxBcounts %>%
group_by(Register, Level) %>%
summarise(median(NCOMP), MAD(NCOMP)) %>%
mutate(across(where(is.numeric), round, 2)) %>%
as.data.frame()
## `summarise()` has grouped output by 'Register'. You can override using the `.groups` argument.
## Register Level median(NCOMP) MAD(NCOMP)
## 1 Conversation A 5.69 2.79
## 2 Conversation B 5.48 2.66
## 3 Conversation C 5.32 2.58
## 4 Conversation D 6.18 2.91
## 5 Conversation E 6.21 2.62
## 6 Fiction A 4.14 2.34
## 7 Fiction B 3.96 2.17
## 8 Fiction C 4.05 1.86
## 9 Fiction D 5.05 2.34
## 10 Fiction E 5.05 2.16
## 11 Informative A 8.07 2.48
## 12 Informative B 7.62 2.40
## 13 Informative C 7.49 3.16
## 14 Informative D 7.56 2.46
## 15 Informative E 8.77 2.45
## 16 Instructional A 6.84 2.54
## 17 Instructional B 6.80 2.65
## 18 Instructional C 6.14 2.35
## 19 Instructional D 6.22 2.29
## 20 Instructional E 6.75 2.69
## 21 Personal A 6.72 1.42
## 22 Personal B 4.92 2.33
## 23 Personal C 5.75 1.45
## 24 Personal D 6.46 3.19
## 25 Personal E 8.22 3.09
# Graph of variables
factoextra::fviz_pca_var(pca,
axes = c(1,2),
select.var = list(cos2 = 0.1),
col.var = "contrib", # Colour by contributions to the PC
gradient.cols = c("#F9B921", "#DB241E", "#722672"),
title = "",
repel = TRUE, # Try to avoid too much text overlapping
ggtheme = ggthemes::theme_few())
## Registered S3 methods overwritten by 'car':
## method from
## influence.merMod lme4
## cooks.distance.influence.merMod lme4
## dfbeta.influence.merMod lme4
## dfbetas.influence.merMod lme4
#ggsave(here("plots", "fviz_pca_var_PC1_PC2.svg"), width = 11, height = 9)
factoextra::fviz_pca_var(pca,
axes = c(3,2),
select.var = list(contrib = 30),
col.var = "contrib", # Colour by contributions to the PC
gradient.cols = c("#F9B921", "#DB241E", "#722672"),
title = "",
repel = TRUE, # Try to avoid too much text overlapping
ggtheme = ggthemes::theme_few())
## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
#ggsave(here("plots", "fviz_pca_var_PC3_PC2.svg"), width = 9, height = 8)
factoextra::fviz_pca_var(pca,
axes = c(3,4),
select.var = list(contrib = 30),
col.var = "contrib", # Colour by contributions to the PC
gradient.cols = c("#F9B921", "#DB241E", "#722672"),
title = "",
repel = TRUE, # Try to avoid too much text overlapping
ggtheme = ggthemes::theme_few())
#ggsave(here("plots", "fviz_pca_var_PC3_PC4.svg"), width = 9, height = 8)
factoextra::fviz_pca_var(pca,
axes = c(5,6),
select.var = list(cos2 = 0.05),
col.var = "contrib", # Colour by contributions to the PC
gradient.cols = c("#F9B921", "#DB241E", "#722672"),
title = "",
repel = TRUE, # Try to avoid too much text overlapping
ggtheme = ggthemes::theme_few())
#ggsave(here("plots", "fviz_pca_var_PC5_PC6.svg"), width = 9, height = 8)
colnames(TxBdata)
res.pca <- FactoMineR::PCA(TxBdata, quali.sup = 1:5, quanti.sup = 6, scale.unit = FALSE, graph = FALSE, ncp = 6)
summary(res.pca, ncp = 6, nbelements = 6)
# For some reason I don't understand, the dimdesc() function only works if I eliminate the supplementary variables (although they obviously don't contribute to the dimensions anyway!)
res.pca2 <- FactoMineR::PCA(TxBdata[,7:ncol(TxBdata)], scale.unit = FALSE, graph = FALSE)
#
dims <- dimdesc(res.pca2, proba = 0.0001)
dims
dims$Dim.1
dims$Dim.2
dims$Dim.3
dims$Dim.4
dims$Dim.5
plot(res.pca, choix = "ind", habillage = "Register", col.hab = colours[1:5], cex = 1.1, select = "contrib 10", title = "Graphe des individus")
plot(res.pca, choix="var", select="cos2 0.6") # plot the variables with cos2 greater than 0.6
library(Factoshiny)
TxBdata2 <- TxBdata %>%
select(-Words, -Filename)
res.shiny <- PCAshiny(TxBdata2)
# http://www.sthda.com/english/articles/31-principal-component-methods-in-r-practical-guide/118-principal-component-analysis-in-r-prcomp-vs-princomp/#pca-results-for-variables
#TxBdata <- readRDS(here("FullMDA", "TxBdataforPCA.rds"))
pca <- prcomp(TxBdata[,7:ncol(TxBdata)], scale.=FALSE) # All quantitative variables for all TxB files
register <- factor(TxBdata[,"Register"]) # Register
level <- factor(TxBdata[,"Level"]) # Textbook proficiency level
summary(register)
## Conversation Fiction Informative Instructional Personal
## 583 285 363 644 86
summary(level)
## A B C D E
## 281 395 494 468 323
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.1693 1.7776 1.08902 1.00207 0.84288 0.76792 0.70295
## Proportion of Variance 0.2108 0.1416 0.05313 0.04499 0.03183 0.02642 0.02214
## Cumulative Proportion 0.2108 0.3524 0.40553 0.45051 0.48234 0.50876 0.53090
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.67987 0.66896 0.62006 0.60634 0.59078 0.5882 0.58213
## Proportion of Variance 0.02071 0.02005 0.01722 0.01647 0.01564 0.0155 0.01518
## Cumulative Proportion 0.55160 0.57165 0.58888 0.60535 0.62098 0.6365 0.65167
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 0.56560 0.55730 0.54868 0.54180 0.5325 0.52111 0.5153
## Proportion of Variance 0.01433 0.01391 0.01349 0.01315 0.0127 0.01217 0.0119
## Cumulative Proportion 0.66600 0.67991 0.69340 0.70655 0.7192 0.73142 0.7433
## PC22 PC23 PC24 PC25 PC26 PC27 PC28
## Standard deviation 0.51058 0.50478 0.49919 0.48542 0.48162 0.47183 0.46835
## Proportion of Variance 0.01168 0.01142 0.01116 0.01056 0.01039 0.00997 0.00983
## Cumulative Proportion 0.75499 0.76641 0.77757 0.78813 0.79852 0.80849 0.81832
## PC29 PC30 PC31 PC32 PC33 PC34 PC35
## Standard deviation 0.46383 0.46072 0.45286 0.44463 0.44124 0.43630 0.42986
## Proportion of Variance 0.00964 0.00951 0.00919 0.00886 0.00872 0.00853 0.00828
## Cumulative Proportion 0.82796 0.83747 0.84666 0.85551 0.86423 0.87276 0.88104
## PC36 PC37 PC38 PC39 PC40 PC41 PC42
## Standard deviation 0.42419 0.41741 0.41442 0.40058 0.39737 0.39455 0.38291
## Proportion of Variance 0.00806 0.00781 0.00769 0.00719 0.00707 0.00697 0.00657
## Cumulative Proportion 0.88910 0.89691 0.90460 0.91179 0.91887 0.92584 0.93241
## PC43 PC44 PC45 PC46 PC47 PC48 PC49
## Standard deviation 0.37583 0.36825 0.35414 0.34269 0.33970 0.3308 0.32069
## Proportion of Variance 0.00633 0.00608 0.00562 0.00526 0.00517 0.0049 0.00461
## Cumulative Proportion 0.93874 0.94481 0.95043 0.95569 0.96086 0.9658 0.97037
## PC50 PC51 PC52 PC53 PC54 PC55 PC56
## Standard deviation 0.3098 0.30595 0.28868 0.27485 0.25385 0.24257 0.2167
## Proportion of Variance 0.0043 0.00419 0.00373 0.00338 0.00289 0.00264 0.0021
## Cumulative Proportion 0.9747 0.97887 0.98260 0.98598 0.98887 0.99151 0.9936
## PC57 PC58 PC59 PC60 PC61
## Standard deviation 0.21094 0.18995 0.18519 0.1496 0.07340
## Proportion of Variance 0.00199 0.00162 0.00154 0.0010 0.00024
## Cumulative Proportion 0.99560 0.99722 0.99876 0.9998 1.00000
## Access to the PCA results
pca$rotation[,1]
## ACT AMP ASPECT AWL BEMA CC
## 0.080060346 -0.116127587 0.099543545 0.216440492 -0.217351695 0.051555936
## COMM COND CONT CUZ DEMO DMA
## 0.201198439 -0.011408054 -0.254470196 -0.085519298 -0.116742474 -0.201458765
## DOAUX DT EMPH EX EXIST FPP1P
## -0.007901611 0.119111349 -0.190546478 -0.104230420 -0.023496930 -0.166286363
## FPP1S FPUH FREQ IN JJAT JJPR
## -0.228153745 -0.161136362 -0.026758914 0.171985691 -0.057458796 -0.171374793
## LD MDCA MDCO MDWS MENTAL NCOMP
## 0.159464818 -0.041311476 -0.047633498 -0.066621945 0.137513754 0.038907288
## NN OCCUR PASS PEAS PIT PLACE
## 0.204121990 0.015900251 -0.006229030 -0.056468983 -0.185192182 -0.163070169
## POLITE POS PROG QUAN QUPR RB
## -0.141415809 -0.011521906 -0.114565231 -0.148468753 -0.100834781 -0.187688767
## RP SPLIT SPP2 THATD THRC THSC
## 0.001734942 -0.105047453 0.096602054 -0.054135134 0.015899485 -0.057894629
## TIME TPP3P TPP3S TTR VBD VBG
## -0.124827074 -0.009218624 -0.056785938 -0.044364165 -0.082457830 0.040924578
## VBN VIMP VPRT WHQU WHSC XX0
## 0.027018470 0.246976702 -0.154619968 0.109596509 0.114186152 -0.217382666
## YNQU
## -0.027774744
res.ind <- cbind(TxBdata[,1:5], as.data.frame(pca$x)[,1:6])
head(res.ind)
## Filename Country Series Level
## 1 Achievers_A1_Instructional_0012.txt Spain Achievers A
## 2 Solutions_Pre-Intermediate_Instructional_0023.txt Spain Solutions B
## 3 Achievers_A2_Personal_0003.txt Spain Achievers B
## 4 Achievers_B1_plus_Informative_0007.txt Spain Achievers D
## 5 POC_5e_Spoken_0003.txt France POC B
## 6 Access_4_Narrative_0013.txt Germany Access D
## Register PC1 PC2 PC3 PC4 PC5
## 1 Instructional 3.093368 1.9013518 0.05136536 0.1894348974 0.02132023
## 2 Instructional 3.323671 0.6341624 0.50300157 0.2640014332 -1.01850292
## 3 Personal -1.287205 1.7556911 -0.94319919 -1.0893089092 0.44173584
## 4 Informative 1.416627 -2.7239559 -1.76705268 -1.1605859275 1.22300895
## 5 Conversation -3.043701 2.3029358 -0.08889758 0.0002598051 -0.54563425
## 6 Fiction -1.029236 -1.4477043 1.30971484 2.1975802039 -1.57940462
## PC6
## 1 0.9301099
## 2 -0.1214005
## 3 0.3882705
## 4 -0.9204889
## 5 -0.2713718
## 6 -1.0899312
res.ind %>%
group_by(Register) %>%
summarise_if(is.numeric, mean)
## # A tibble: 5 × 7
## Register PC1 PC2 PC3 PC4 PC5 PC6
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Conversation -2.29 0.929 -0.141 -0.268 -0.0248 0.0639
## 2 Fiction -0.851 -0.805 1.02 1.09 0.111 -0.0998
## 3 Informative 0.0572 -2.45 -0.830 0.0134 -0.0772 0.117
## 4 Instructional 2.68 0.931 0.150 -0.242 0.00795 -0.0670
## 5 Personal -1.92 -0.286 -0.0515 -0.0200 0.0672 -0.0938
res.ind %>%
group_by(Register, Level) %>%
summarise_if(is.numeric, mean) %>%
mutate(across(where(is.numeric), round, 2)) %>%
as.data.frame()
## Register Level PC1 PC2 PC3 PC4 PC5 PC6
## 1 Conversation A -2.39 2.39 -1.23 0.71 -0.45 -0.01
## 2 Conversation B -2.54 1.72 -0.25 0.04 -0.14 0.13
## 3 Conversation C -2.25 0.70 0.18 -0.41 0.09 -0.02
## 4 Conversation D -2.10 -0.08 0.28 -0.73 0.17 0.09
## 5 Conversation E -2.13 -0.14 0.07 -0.98 0.16 0.17
## 6 Fiction A -0.95 0.85 -0.54 1.48 -0.31 -0.46
## 7 Fiction B -0.89 -0.14 0.95 1.78 -0.06 -0.03
## 8 Fiction C -0.98 -0.81 1.62 1.23 0.26 -0.16
## 9 Fiction D -0.71 -1.57 1.27 0.72 0.21 -0.01
## 10 Fiction E -0.80 -1.45 1.16 0.56 0.25 -0.01
## 11 Informative A -0.09 -1.11 -1.94 0.87 -0.88 -0.15
## 12 Informative B 0.15 -1.67 -1.19 0.46 -0.38 0.13
## 13 Informative C -0.02 -2.37 -0.68 -0.03 -0.06 -0.01
## 14 Informative D 0.06 -2.89 -0.45 -0.19 0.06 0.10
## 15 Informative E 0.15 -3.13 -0.79 -0.38 0.30 0.43
## 16 Instructional A 2.89 1.55 -0.20 0.46 -0.34 -0.24
## 17 Instructional B 2.68 1.27 0.09 0.00 -0.12 -0.12
## 18 Instructional C 2.59 0.99 0.28 -0.32 -0.07 0.01
## 19 Instructional D 2.63 0.70 0.28 -0.49 0.12 0.07
## 20 Instructional E 2.64 0.09 0.20 -0.80 0.49 -0.16
## 21 Personal A -1.84 0.53 -1.11 1.21 -0.31 0.12
## 22 Personal B -1.85 0.40 -0.58 0.59 0.21 -0.07
## 23 Personal C -2.05 -0.46 0.52 -0.17 0.06 -0.03
## 24 Personal D -1.89 -1.05 0.45 -0.63 0.39 -0.06
## 25 Personal E -1.96 -0.92 0.21 -1.10 -0.19 -0.45
res.ind %>%
select(Register, Level, PC2) %>%
group_by(Register, Level) %>%
summarise_if(is.numeric, c(Median = median, MAD = mad))%>%
mutate(across(where(is.numeric), round, 2)) %>%
as.data.frame()
## Register Level Median MAD
## 1 Conversation A 2.60 0.76
## 2 Conversation B 1.87 1.02
## 3 Conversation C 0.85 1.26
## 4 Conversation D 0.20 1.68
## 5 Conversation E -0.16 1.33
## 6 Fiction A 1.09 0.59
## 7 Fiction B -0.06 0.74
## 8 Fiction C -0.82 0.76
## 9 Fiction D -1.59 1.05
## 10 Fiction E -1.43 1.07
## 11 Informative A -1.00 0.75
## 12 Informative B -1.74 0.86
## 13 Informative C -2.53 1.06
## 14 Informative D -3.11 0.73
## 15 Informative E -3.29 0.76
## 16 Instructional A 1.53 0.66
## 17 Instructional B 1.32 0.57
## 18 Instructional C 1.03 0.48
## 19 Instructional D 0.74 0.59
## 20 Instructional E 0.09 0.76
## 21 Personal A 0.58 0.45
## 22 Personal B 0.51 1.10
## 23 Personal C -0.76 0.67
## 24 Personal D -0.97 0.69
## 25 Personal E -1.03 0.89
# Search for example texts to illustrate results
res.ind %>%
#filter(PC3 > 2.5 & PC2 < -2) %>%
filter(PC4 < -2.5) %>%
select(Filename, PC3, PC4) %>%
mutate(across(where(is.numeric), round, 2))
## Filename PC3 PC4
## 1 GreenLine_5_Personal_0002.txt -0.90 -2.78
## 2 NGL_4_Spoken_0007.txt 0.73 -2.61
## 3 NM_2_Spoken_0012.txt -0.36 -2.67
## 4 EIM_3_Informative_0002.txt -1.46 -2.51
## 5 Solutions_Intermediate_Plus_Spoken_0040.txt 0.24 -2.53
## 6 Achievers_B1_plus_Personal_0001.txt 0.74 -2.66
# Dimension 1
# Model with only Register as a fixed effect
lm1 <- lm(PC1 ~ Register, data = res.ind)
summary(lm1)
##
## Call:
## lm(formula = PC1 ~ Register, data = res.ind)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.97018 -0.47497 -0.00695 0.44715 2.98546
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.29071 0.03006 -76.212 < 2e-16 ***
## RegisterFiction 1.43957 0.05245 27.444 < 2e-16 ***
## RegisterInformative 2.34787 0.04852 48.387 < 2e-16 ***
## RegisterInstructional 4.96575 0.04149 119.689 < 2e-16 ***
## RegisterPersonal 0.36737 0.08383 4.382 1.24e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7257 on 1956 degrees of freedom
## Multiple R-squared: 0.8883, Adjusted R-squared: 0.8881
## F-statistic: 3889 on 4 and 1956 DF, p-value: < 2.2e-16
# Models with Textbook series as random intercepts
md1 <- lmer(PC1 ~ Register*Level + (1|Series), data = res.ind, REML = FALSE)
md1Register <- lmer(PC1 ~ Register + (1|Series), data = res.ind, REML = FALSE)
md1Level <- lmer(PC1 ~ Level + (1|Series), data = res.ind, REML = FALSE)
anova(md1, md1Register, md1Level)
## Data: res.ind
## Models:
## md1Register: PC1 ~ Register + (1 | Series)
## md1Level: PC1 ~ Level + (1 | Series)
## md1: PC1 ~ Register * Level + (1 | Series)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## md1Register 7 4080.4 4119.4 -2033.2 4066.4
## md1Level 7 8533.0 8572.0 -4259.5 8519.0 0.0 0
## md1 27 4068.3 4219.0 -2007.2 4014.3 4504.6 20 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tab_model(md1, wrap.labels = 300) # Marginal R2 = 0.890
PC 1 | |||
---|---|---|---|
Predictors | Estimates | CI | p |
(Intercept) | -2.37 | -2.59 – -2.15 | <0.001 |
Register [Fiction] | 1.61 | 1.36 – 1.87 | <0.001 |
Register [Informative] | 2.23 | 1.96 – 2.50 | <0.001 |
Register [Instructional] | 5.29 | 5.10 – 5.47 | <0.001 |
Register [Personal] | 0.48 | 0.08 – 0.88 | 0.019 |
Level [B] | -0.12 | -0.30 – 0.05 | 0.167 |
Level [C] | 0.12 | -0.05 – 0.29 | 0.159 |
Level [D] | 0.23 | 0.06 – 0.41 | 0.010 |
Level [E] | 0.27 | 0.07 – 0.48 | 0.010 |
Register [Fiction] * Level [B] | 0.18 | -0.15 – 0.51 | 0.284 |
Register [Informative] * Level [B] | 0.36 | 0.02 – 0.70 | 0.038 |
Register [Instructional] * Level [B] | -0.10 | -0.35 – 0.15 | 0.434 |
Register [Personal] * Level [B] | 0.11 | -0.39 – 0.61 | 0.671 |
Register [Fiction] * Level [C] | -0.25 | -0.58 – 0.07 | 0.130 |
Register [Informative] * Level [C] | -0.00 | -0.32 – 0.31 | 0.993 |
Register [Instructional] * Level [C] | -0.39 | -0.62 – -0.15 | 0.001 |
Register [Personal] * Level [C] | -0.22 | -0.72 – 0.28 | 0.381 |
Register [Fiction] * Level [D] | -0.05 | -0.38 – 0.27 | 0.739 |
Register [Informative] * Level [D] | -0.01 | -0.33 – 0.31 | 0.946 |
Register [Instructional] * Level [D] | -0.47 | -0.72 – -0.23 | <0.001 |
Register [Personal] * Level [D] | -0.07 | -0.60 – 0.46 | 0.800 |
Register [Fiction] * Level [E] | -0.24 | -0.58 – 0.10 | 0.173 |
Register [Informative] * Level [E] | 0.06 | -0.29 – 0.40 | 0.747 |
Register [Instructional] * Level [E] | -0.50 | -0.77 – -0.22 | <0.001 |
Register [Personal] * Level [E] | -0.18 | -0.74 – 0.38 | 0.527 |
Random Effects | |||
σ2 | 0.45 | ||
τ00 Series | 0.07 | ||
ICC | 0.14 | ||
N Series | 9 | ||
Observations | 1961 | ||
Marginal R2 / Conditional R2 | 0.890 / 0.906 |
tab_model(md1Register) # Marginal R2 = 0.877
PC 1 | |||
---|---|---|---|
Predictors | Estimates | CI | p |
(Intercept) | -2.28 | -2.47 – -2.09 | <0.001 |
Register [Fiction] | 1.55 | 1.45 – 1.65 | <0.001 |
Register [Informative] | 2.34 | 2.25 – 2.43 | <0.001 |
Register [Instructional] | 4.99 | 4.92 – 5.07 | <0.001 |
Register [Personal] | 0.41 | 0.25 – 0.56 | <0.001 |
Random Effects | |||
σ2 | 0.46 | ||
τ00 Series | 0.08 | ||
ICC | 0.14 | ||
N Series | 9 | ||
Observations | 1961 | ||
Marginal R2 / Conditional R2 | 0.887 / 0.903 |
tab_model(md1Level) # Marginal R2 = 0.002
PC 1 | |||
---|---|---|---|
Predictors | Estimates | CI | p |
(Intercept) | 0.09 | -0.30 – 0.47 | 0.665 |
Level [B] | -0.11 | -0.44 – 0.21 | 0.487 |
Level [C] | -0.10 | -0.41 – 0.21 | 0.518 |
Level [D] | 0.07 | -0.24 – 0.39 | 0.642 |
Level [E] | 0.09 | -0.26 – 0.44 | 0.607 |
Random Effects | |||
σ2 | 4.46 | ||
τ00 Series | 0.21 | ||
ICC | 0.04 | ||
N Series | 9 | ||
Observations | 1961 | ||
Marginal R2 / Conditional R2 | 0.002 / 0.046 |
# Tweak plot aestetics with: https://cran.r-project.org/web/packages/sjPlot/vignettes/custplot.html
# Colour customisation trick from: https://stackoverflow.com/questions/55598920/different-line-colors-in-forest-plot-output-from-sjplot-r-package
# Plot of fixed effects:
plot_model(md1Register,
type = "est",
show.intercept = TRUE,
show.values=TRUE,
show.p=TRUE,
value.offset = .4,
value.size = 3.5,
colors = palette[c(1:3,8,7)],
group.terms = c(1:5),
title = "",
wrap.labels = 40,
axis.title = "PC1 estimated coefficients") +
theme_sjplot2()
#ggsave(here("plots", "TxB_PCA1_lmer_fixedeffects_Register.svg"), height = 3, width = 8)
Register_results <- emmeans(md1Register, "Register")
summary(Register_results)
## Register emmean SE df lower.CL upper.CL
## Conversation -2.2793 0.102 11.6 -2.502 -2.056
## Fiction -0.7267 0.106 13.9 -0.955 -0.498
## Informative 0.0603 0.104 12.7 -0.165 0.286
## Instructional 2.7141 0.101 11.3 2.492 2.937
## Personal -1.8734 0.122 25.5 -2.125 -1.622
##
## Degrees-of-freedom method: kenward-roger
## Confidence level used: 0.95
comparisons <- pairs(Register_results, adjust = "tukey")
comparisons
## contrast estimate SE df t.ratio p.value
## Conversation - Fiction -1.553 0.0508 1963 -30.535 <.0001
## Conversation - Informative -2.340 0.0465 1961 -50.341 <.0001
## Conversation - Instructional -4.993 0.0399 1961 -125.141 <.0001
## Conversation - Personal -0.406 0.0791 1958 -5.134 <.0001
## Fiction - Informative -0.787 0.0557 1962 -14.135 <.0001
## Fiction - Instructional -3.441 0.0497 1962 -69.168 <.0001
## Fiction - Personal 1.147 0.0840 1958 13.645 <.0001
## Informative - Instructional -2.654 0.0447 1957 -59.399 <.0001
## Informative - Personal 1.934 0.0816 1957 23.692 <.0001
## Instructional - Personal 4.587 0.0780 1957 58.820 <.0001
##
## Degrees-of-freedom method: kenward-roger
## P value adjustment: tukey method for comparing a family of 5 estimates
#write_last_clip()
confint(comparisons)
## contrast estimate SE df lower.CL upper.CL
## Conversation - Fiction -1.553 0.0508 1963 -1.691 -1.414
## Conversation - Informative -2.340 0.0465 1961 -2.466 -2.213
## Conversation - Instructional -4.993 0.0399 1961 -5.102 -4.884
## Conversation - Personal -0.406 0.0791 1958 -0.622 -0.190
## Fiction - Informative -0.787 0.0557 1962 -0.939 -0.635
## Fiction - Instructional -3.441 0.0497 1962 -3.577 -3.305
## Fiction - Personal 1.147 0.0840 1958 0.917 1.376
## Informative - Instructional -2.654 0.0447 1957 -2.776 -2.532
## Informative - Personal 1.934 0.0816 1957 1.711 2.156
## Instructional - Personal 4.587 0.0780 1957 4.374 4.800
##
## Degrees-of-freedom method: kenward-roger
## Confidence level used: 0.95
## Conf-level adjustment: tukey method for comparing a family of 5 estimates
#write_last_clip()
TxBdata %>%
filter(Register == "Instructional") %>%
group_by(Level) %>%
summarise(median(VPRT))
## # A tibble: 5 × 2
## Level `median(VPRT)`
## <fct> <dbl>
## 1 A -0.585
## 2 B -0.483
## 3 C -0.450
## 4 D -0.428
## 5 E -0.274
# Plot of random effects:
plot_model(md1,
type = "re", # Option to visualise random effects
show.values=TRUE,
show.p=TRUE,
value.offset = .4,
value.size = 3.5,
colors = "bw",
wrap.labels = 40,
axis.title = "PC1 estimated coefficients") +
theme_sjplot2()
#ggsave(here("plots", "TxB_PCA1_lmer_randomeffects.svg"), height = 3, width = 8)
# Dimension 2
md2 <- lmer(PC2 ~ Register*Level + (1|Series), data = res.ind, REML = FALSE)
md2Register <- lmer(PC2 ~ Register + (1|Series), data = res.ind, REML = FALSE)
md2Level <- lmer(PC2 ~ Level + (1|Series), data = res.ind, REML = FALSE)
anova(md2, md2Register, md2Level)
## Data: res.ind
## Models:
## md2Register: PC2 ~ Register + (1 | Series)
## md2Level: PC2 ~ Level + (1 | Series)
## md2: PC2 ~ Register * Level + (1 | Series)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## md2Register 7 6155.2 6194.3 -3070.6 6141.2
## md2Level 7 7290.1 7329.2 -3638.1 7276.1 0.0 0
## md2 27 5200.9 5351.6 -2573.4 5146.9 2129.2 20 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tab_model(md2) # Marginal R2 = 0.723
PC 2 | |||
---|---|---|---|
Predictors | Estimates | CI | p |
(Intercept) | 2.46 | 2.20 – 2.73 | <0.001 |
Register [Fiction] | -1.80 | -2.14 – -1.46 | <0.001 |
Register [Informative] | -3.44 | -3.79 – -3.08 | <0.001 |
Register [Instructional] | -0.87 | -1.12 – -0.62 | <0.001 |
Register [Personal] | -1.89 | -2.42 – -1.35 | <0.001 |
Level [B] | -0.72 | -0.96 – -0.49 | <0.001 |
Level [C] | -1.71 | -1.94 – -1.49 | <0.001 |
Level [D] | -2.37 | -2.61 – -2.14 | <0.001 |
Level [E] | -2.65 | -2.92 – -2.37 | <0.001 |
Register [Fiction] * Level [B] |
-0.28 | -0.72 – 0.17 | 0.222 |
Register [Informative] * Level [B] |
0.13 | -0.32 – 0.58 | 0.565 |
Register [Instructional] * Level [B] |
0.44 | 0.11 – 0.77 | 0.009 |
Register [Personal] * Level [B] |
0.53 | -0.14 – 1.20 | 0.120 |
Register [Fiction] * Level [C] |
0.08 | -0.35 – 0.52 | 0.705 |
Register [Informative] * Level [C] |
0.42 | -0.01 – 0.84 | 0.053 |
Register [Instructional] * Level [C] |
1.11 | 0.80 – 1.43 | <0.001 |
Register [Personal] * Level [C] |
0.59 | -0.07 – 1.26 | 0.081 |
Register [Fiction] * Level [D] |
-0.01 | -0.44 – 0.42 | 0.972 |
Register [Informative] * Level [D] |
0.49 | 0.07 – 0.91 | 0.024 |
Register [Instructional] * Level [D] |
1.49 | 1.16 – 1.81 | <0.001 |
Register [Personal] * Level [D] |
0.59 | -0.12 – 1.30 | 0.104 |
Register [Fiction] * Level [E] |
0.42 | -0.04 – 0.87 | 0.072 |
Register [Informative] * Level [E] |
0.39 | -0.07 – 0.86 | 0.098 |
Register [Instructional] * Level [E] |
1.05 | 0.68 – 1.42 | <0.001 |
Register [Personal] * Level [E] |
1.01 | 0.27 – 1.76 | 0.008 |
Random Effects | |||
σ2 | 0.80 | ||
τ00 Series | 0.09 | ||
ICC | 0.10 | ||
N Series | 9 | ||
Observations | 1961 | ||
Marginal R2 / Conditional R2 | 0.723 / 0.752 |
tab_model(md2Register) # Marginal R2 = 0.558
PC 2 | |||
---|---|---|---|
Predictors | Estimates | CI | p |
(Intercept) | 0.98 | 0.76 – 1.19 | <0.001 |
Register [Fiction] | -1.90 | -2.07 – -1.73 | <0.001 |
Register [Informative] | -3.39 | -3.55 – -3.24 | <0.001 |
Register [Instructional] | -0.04 | -0.18 – 0.09 | 0.520 |
Register [Personal] | -1.31 | -1.58 – -1.05 | <0.001 |
Random Effects | |||
σ2 | 1.33 | ||
τ00 Series | 0.09 | ||
ICC | 0.06 | ||
N Series | 9 | ||
Observations | 1961 | ||
Marginal R2 / Conditional R2 | 0.558 / 0.585 |
tab_model(md2Level) # Marginal R2 = 0.228
PC 2 | |||
---|---|---|---|
Predictors | Estimates | CI | p |
(Intercept) | 1.44 | 1.18 – 1.70 | <0.001 |
Level [B] | -0.70 | -0.94 – -0.46 | <0.001 |
Level [C] | -1.45 | -1.68 – -1.23 | <0.001 |
Level [D] | -2.15 | -2.37 – -1.92 | <0.001 |
Level [E] | -2.54 | -2.79 – -2.28 | <0.001 |
Random Effects | |||
σ2 | 2.37 | ||
τ00 Series | 0.08 | ||
ICC | 0.03 | ||
N Series | 9 | ||
Observations | 1961 | ||
Marginal R2 / Conditional R2 | 0.228 / 0.253 |
# Plot of fixed effects:
plot_model(md2,
type = "est",
show.intercept = TRUE,
show.values=TRUE,
show.p=TRUE,
value.offset = .4,
value.size = 3.5,
colors = palette[c(1:3,8,7)],
group.terms = c(1:5,1,1,1,1,2:5,2:5,2:5,2:5),
title = "",
wrap.labels = 40,
axis.title = "PC2 estimated coefficients") +
theme_sjplot2()
#ggsave(here("plots", "TxB_PCA2_lmer_fixedeffects.svg"), height = 8, width = 8)
#svg(here("plots", "TxB_predicted_PC2_scores_interactions.svg"), height = 5, width = 8)
visreg(md2, xvar = "Level", by="Register", type = "conditional",
line=list(col="darkred"),
xlab = "Textbook Level", ylab = "PC2"
#,gg = TRUE
,layout=c(5,1)
)
dev.off()
## null device
## 1
# Plots of random effects
## Random intercepts
plot_model(md2,
type = "re", # Option to visualise random effects
show.values=TRUE,
show.p=TRUE,
value.offset = .4,
value.size = 3.5,
colors = "bw",
wrap.labels = 40,
axis.title = "PC2 estimated coefficients") +
theme_sjplot2()
#ggsave(here("plots", "TxB_PCA2_lmer_randomeffects.svg"), height = 3, width = 8)
# Textbook Series-Register interactions
visreg::visreg(md2, "Register", by="Series", re.form=~(1|Series),
ylab="PC2", line=list(col="darkred"))
visreg(md2, xvar = "Series", by="Level", type = "conditional", re.form=~(1|Series),
line=list(col="darkred"), xlab = "Textbook Series", ylab = "PC2",
layout=c(1,5))
# Dimension 3
md3 <- lmer(PC3 ~ Register*Level + (1|Series), data = res.ind, REML = FALSE)
md3Register <- lmer(PC3 ~ Register + (1|Series), data = res.ind, REML = FALSE)
md3Level <- lmer(PC3 ~ Level + (1|Series), data = res.ind, REML = FALSE)
anova(md3, md3Register, md3Level)
## Data: res.ind
## Models:
## md3Register: PC3 ~ Register + (1 | Series)
## md3Level: PC3 ~ Level + (1 | Series)
## md3: PC3 ~ Register * Level + (1 | Series)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## md3Register 7 5139.9 5179.0 -2563.0 5125.9
## md3Level 7 5528.8 5567.9 -2757.4 5514.8 0.00 0
## md3 27 4582.6 4733.3 -2264.3 4528.6 986.21 20 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tab_model(md3) # Marginal R2 = 0.436
PC 3 | |||
---|---|---|---|
Predictors | Estimates | CI | p |
(Intercept) | -1.33 | -1.62 – -1.05 | <0.001 |
Register [Fiction] | 0.79 | 0.50 – 1.08 | <0.001 |
Register [Informative] | -0.79 | -1.10 – -0.49 | <0.001 |
Register [Instructional] | 1.03 | 0.82 – 1.24 | <0.001 |
Register [Personal] | 0.19 | -0.27 – 0.65 | 0.411 |
Level [B] | 0.97 | 0.77 – 1.17 | <0.001 |
Level [C] | 1.39 | 1.20 – 1.58 | <0.001 |
Level [D] | 1.50 | 1.30 – 1.70 | <0.001 |
Level [E] | 1.60 | 1.36 – 1.83 | <0.001 |
Register [Fiction] * Level [B] |
0.55 | 0.17 – 0.93 | 0.004 |
Register [Informative] * Level [B] |
-0.13 | -0.52 – 0.25 | 0.504 |
Register [Instructional] * Level [B] |
-0.67 | -0.95 – -0.39 | <0.001 |
Register [Personal] * Level [B] |
-0.34 | -0.91 – 0.23 | 0.242 |
Register [Fiction] * Level [C] |
0.78 | 0.41 – 1.15 | <0.001 |
Register [Informative] * Level [C] |
-0.17 | -0.53 – 0.19 | 0.362 |
Register [Instructional] * Level [C] |
-0.94 | -1.21 – -0.68 | <0.001 |
Register [Personal] * Level [C] |
0.17 | -0.40 – 0.73 | 0.568 |
Register [Fiction] * Level [D] |
0.34 | -0.03 – 0.70 | 0.072 |
Register [Informative] * Level [D] |
0.01 | -0.35 – 0.37 | 0.953 |
Register [Instructional] * Level [D] |
-1.01 | -1.29 – -0.73 | <0.001 |
Register [Personal] * Level [D] |
0.02 | -0.58 – 0.63 | 0.939 |
Register [Fiction] * Level [E] |
0.23 | -0.15 – 0.62 | 0.238 |
Register [Informative] * Level [E] |
-0.18 | -0.58 – 0.21 | 0.364 |
Register [Instructional] * Level [E] |
-1.01 | -1.32 – -0.70 | <0.001 |
Register [Personal] * Level [E] |
-0.21 | -0.84 – 0.43 | 0.520 |
Random Effects | |||
σ2 | 0.58 | ||
τ00 Series | 0.14 | ||
ICC | 0.19 | ||
N Series | 9 | ||
Observations | 1961 | ||
Marginal R2 / Conditional R2 | 0.436 / 0.543 |
tab_model(md3Register) # Marginal R2 = 0.272
PC 3 | |||
---|---|---|---|
Predictors | Estimates | CI | p |
(Intercept) | -0.21 | -0.45 – 0.03 | 0.083 |
Register [Fiction] | 1.27 | 1.14 – 1.40 | <0.001 |
Register [Informative] | -0.73 | -0.85 – -0.61 | <0.001 |
Register [Instructional] | 0.29 | 0.19 – 0.39 | <0.001 |
Register [Personal] | 0.14 | -0.07 – 0.34 | 0.188 |
Random Effects | |||
σ2 | 0.79 | ||
τ00 Series | 0.12 | ||
ICC | 0.13 | ||
N Series | 9 | ||
Observations | 1961 | ||
Marginal R2 / Conditional R2 | 0.272 / 0.369 |
tab_model(md3Level) # Marginal R2 = 0.119
PC 3 | |||
---|---|---|---|
Predictors | Estimates | CI | p |
(Intercept) | -0.93 | -1.18 – -0.68 | <0.001 |
Level [B] | 0.74 | 0.59 – 0.89 | <0.001 |
Level [C] | 1.05 | 0.91 – 1.19 | <0.001 |
Level [D] | 1.10 | 0.96 – 1.25 | <0.001 |
Level [E] | 1.13 | 0.97 – 1.30 | <0.001 |
Random Effects | |||
σ2 | 0.96 | ||
τ00 Series | 0.11 | ||
ICC | 0.10 | ||
N Series | 9 | ||
Observations | 1961 | ||
Marginal R2 / Conditional R2 | 0.119 / 0.209 |
# Plot of fixed effects:
plot_model(md3,
type = "est",
show.intercept = TRUE,
show.values=TRUE,
show.p=TRUE,
value.offset = .4,
value.size = 3.5,
colors = palette[c(1:3,8,7)],
group.terms = c(1:5,1,1,1,1,2:5,2:5,2:5,2:5),
title = "",
wrap.labels = 40,
axis.title = "PC3 estimated coefficients") +
theme_sjplot2()
#ggsave(here("plots", "TxB_PCA3_lmer_fixedeffects.svg"), height = 8, width = 8)
#svg(here("plots", "TxB_predicted_PC3_scores_interactions.svg"), height = 5, width = 8)
visreg(md3, xvar = "Level", by="Register", type = "conditional",
line=list(col="darkred"),
xlab = "Textbook Level", ylab = "PC3"
#,gg = TRUE
,layout=c(5,1)
)
dev.off()
## null device
## 1
# Plot of random effects:
plot_model(md3,
type = "re", # Option to visualise random effects
show.values=TRUE,
show.p=TRUE,
value.offset = .4,
value.size = 3.5,
color = "bw",
wrap.labels = 40,
axis.title = "PC3 estimated coefficients") +
theme_sjplot2()
#ggsave(here("plots", "TxB_PCA3_lmer_randomeffects.svg"), height = 3, width = 8)
# Textbook Series-Register interactions
visreg::visreg(md3, "Series", by="Register", re.form=~(1|Series),
ylab="PC2", line=list(col="darkred"))
visreg(md2, xvar = "Level", by="Series", type = "conditional", re.form=~(1|Series),
line=list(col="darkred"), xlab = "Textbook Series", ylab = "PC2")
# Dimension 4
md4 <- lmer(PC4 ~ Register*Level + (1|Series), data = res.ind, REML = FALSE)
md4Register <- lmer(PC4 ~ Register + (1|Series), data = res.ind, REML = FALSE)
md4Level <- lmer(PC4 ~ Level + (1|Series), data = res.ind, REML = FALSE)
anova(md4, md4Register, md4Level)
## Data: res.ind
## Models:
## md4Register: PC4 ~ Register + (1 | Series)
## md4Level: PC4 ~ Level + (1 | Series)
## md4: PC4 ~ Register * Level + (1 | Series)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## md4Register 7 5034.0 5073.0 -2510.0 5020.0
## md4Level 7 5043.6 5082.7 -2514.8 5029.6 0.00 0
## md4 27 4372.1 4522.8 -2159.1 4318.1 711.52 20 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tab_model(md4) # Marginal R2 = 0.426
PC 4 | |||
---|---|---|---|
Predictors | Estimates | CI | p |
(Intercept) | 0.76 | 0.56 – 0.96 | <0.001 |
Register [Fiction] | 0.72 | 0.44 – 1.00 | <0.001 |
Register [Informative] | 0.20 | -0.09 – 0.49 | 0.176 |
Register [Instructional] | -0.23 | -0.43 – -0.03 | 0.023 |
Register [Personal] | 0.49 | 0.06 – 0.93 | 0.026 |
Level [B] | -0.67 | -0.87 – -0.48 | <0.001 |
Level [C] | -1.14 | -1.32 – -0.96 | <0.001 |
Level [D] | -1.42 | -1.61 – -1.23 | <0.001 |
Level [E] | -1.76 | -1.99 – -1.54 | <0.001 |
Register [Fiction] * Level [B] |
0.97 | 0.61 – 1.33 | <0.001 |
Register [Informative] * Level [B] |
0.22 | -0.14 – 0.59 | 0.230 |
Register [Instructional] * Level [B] |
0.21 | -0.06 – 0.47 | 0.130 |
Register [Personal] * Level [B] |
-0.02 | -0.57 – 0.52 | 0.930 |
Register [Fiction] * Level [C] |
0.85 | 0.50 – 1.20 | <0.001 |
Register [Informative] * Level [C] |
0.25 | -0.09 – 0.59 | 0.155 |
Register [Instructional] * Level [C] |
0.37 | 0.11 – 0.62 | 0.005 |
Register [Personal] * Level [C] |
-0.26 | -0.80 – 0.28 | 0.342 |
Register [Fiction] * Level [D] |
0.65 | 0.30 – 1.00 | <0.001 |
Register [Informative] * Level [D] |
0.34 | -0.00 – 0.68 | 0.053 |
Register [Instructional] * Level [D] |
0.47 | 0.20 – 0.73 | 0.001 |
Register [Personal] * Level [D] |
-0.43 | -1.01 – 0.14 | 0.140 |
Register [Fiction] * Level [E] |
0.82 | 0.45 – 1.19 | <0.001 |
Register [Informative] * Level [E] |
0.42 | 0.05 – 0.80 | 0.027 |
Register [Instructional] * Level [E] |
0.44 | 0.14 – 0.74 | 0.004 |
Register [Personal] * Level [E] |
-0.55 | -1.15 – 0.05 | 0.072 |
Random Effects | |||
σ2 | 0.52 | ||
τ00 Series | 0.05 | ||
ICC | 0.08 | ||
N Series | 9 | ||
Observations | 1961 | ||
Marginal R2 / Conditional R2 | 0.426 / 0.472 |
tab_model(md4Register) # Marginal R2 = 0.203
PC 4 | |||
---|---|---|---|
Predictors | Estimates | CI | p |
(Intercept) | -0.24 | -0.39 – -0.08 | 0.002 |
Register [Fiction] | 1.33 | 1.20 – 1.45 | <0.001 |
Register [Informative] | 0.30 | 0.18 – 0.41 | <0.001 |
Register [Instructional] | 0.05 | -0.05 – 0.15 | 0.355 |
Register [Personal] | 0.24 | 0.04 – 0.44 | 0.016 |
Random Effects | |||
σ2 | 0.75 | ||
τ00 Series | 0.04 | ||
ICC | 0.05 | ||
N Series | 9 | ||
Observations | 1961 | ||
Marginal R2 / Conditional R2 | 0.203 / 0.244 |
tab_model(md4Level) # Marginal R2 = 0.187
PC 4 | |||
---|---|---|---|
Predictors | Estimates | CI | p |
(Intercept) | 0.81 | 0.60 – 1.01 | <0.001 |
Level [B] | -0.41 | -0.54 – -0.28 | <0.001 |
Level [C] | -0.86 | -0.99 – -0.73 | <0.001 |
Level [D] | -1.07 | -1.20 – -0.94 | <0.001 |
Level [E] | -1.35 | -1.49 – -1.20 | <0.001 |
Random Effects | |||
σ2 | 0.75 | ||
τ00 Series | 0.07 | ||
ICC | 0.09 | ||
N Series | 9 | ||
Observations | 1961 | ||
Marginal R2 / Conditional R2 | 0.187 / 0.259 |
# Plot of fixed effects:
plot_model(md4,
type = "est",
show.intercept = TRUE,
show.values=TRUE,
show.p=TRUE,
value.offset = .4,
value.size = 3.5,
colors = palette[c(1:3,8,7)],
group.terms = c(1:5,1,1,1,1,2:5,2:5,2:5,2:5),
title = "",
wrap.labels = 40,
axis.title = "PC4 estimated coefficients") +
theme_sjplot2()
#ggsave(here("plots", "TxB_PCA4_lmer_fixedeffects.svg"), height = 8, width = 8)
#svg(here("plots", "TxB_predicted_PC4_scores_interactions.svg"), height = 5, width = 8)
visreg(md4, xvar = "Level", by="Register", type = "conditional",
line=list(col="darkred"),
xlab = "Textbook Level", ylab = "PC4"
#,gg = TRUE
,layout=c(5,1)
)
dev.off()
## null device
## 1
# Plot of random effects:
plot_model(md4,
type = "re", # Option to visualise random effects
show.values=TRUE,
show.p=TRUE,
value.offset = .4,
value.size = 3.5,
color = "bw",
wrap.labels = 40,
axis.title = "PC4 estimated coefficients") +
theme_sjplot2()
#ggsave(here("plots", "TxB_PCA4_lmer_randomeffects.svg"), height = 3, width = 8)
# Dimension 5
md5 <- lmer(PC5 ~ Register*Level + (1|Series), data = res.ind, REML = FALSE)
md5Register <- lmer(PC5 ~ Register + (1|Series), data = res.ind, REML = FALSE)
md5Level <- lmer(PC5 ~ Level + (1|Series), data = res.ind, REML = FALSE)
anova(md5, md5Register, md5Level)
## Data: res.ind
## Models:
## md5Register: PC5 ~ Register + (1 | Series)
## md5Level: PC5 ~ Level + (1 | Series)
## md5: PC5 ~ Register * Level + (1 | Series)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## md5Register 7 4521.9 4561.0 -2253.9 4507.9
## md5Level 7 4386.8 4425.9 -2186.4 4372.8 135.039 0
## md5 27 4382.8 4533.5 -2164.4 4328.8 44.037 20 0.001488 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tab_model(md5) # Marginal R2 = 0.071
PC 5 | |||
---|---|---|---|
Predictors | Estimates | CI | p |
(Intercept) | -0.36 | -0.65 – -0.08 | 0.013 |
Register [Fiction] | 0.05 | -0.23 – 0.33 | 0.733 |
Register [Informative] | -0.31 | -0.60 – -0.02 | 0.034 |
Register [Instructional] | 0.14 | -0.07 – 0.34 | 0.187 |
Register [Personal] | 0.13 | -0.30 – 0.57 | 0.547 |
Level [B] | 0.32 | 0.13 – 0.51 | 0.001 |
Level [C] | 0.54 | 0.36 – 0.72 | <0.001 |
Level [D] | 0.62 | 0.43 – 0.81 | <0.001 |
Level [E] | 0.36 | 0.13 – 0.58 | 0.002 |
Register [Fiction] * Level [B] |
-0.10 | -0.46 – 0.26 | 0.593 |
Register [Informative] * Level [B] |
0.09 | -0.28 – 0.45 | 0.633 |
Register [Instructional] * Level [B] |
-0.11 | -0.37 – 0.16 | 0.436 |
Register [Personal] * Level [B] |
0.07 | -0.48 – 0.61 | 0.810 |
Register [Fiction] * Level [C] |
0.00 | -0.35 – 0.36 | 0.979 |
Register [Informative] * Level [C] |
0.31 | -0.03 – 0.65 | 0.075 |
Register [Instructional] * Level [C] |
-0.24 | -0.50 – 0.02 | 0.066 |
Register [Personal] * Level [C] |
-0.15 | -0.69 – 0.39 | 0.594 |
Register [Fiction] * Level [D] |
-0.13 | -0.48 – 0.22 | 0.475 |
Register [Informative] * Level [D] |
0.29 | -0.06 – 0.63 | 0.101 |
Register [Instructional] * Level [D] |
-0.17 | -0.43 – 0.10 | 0.211 |
Register [Personal] * Level [D] |
0.06 | -0.52 – 0.63 | 0.851 |
Register [Fiction] * Level [E] |
0.10 | -0.27 – 0.47 | 0.602 |
Register [Informative] * Level [E] |
0.57 | 0.20 – 0.95 | 0.003 |
Register [Instructional] * Level [E] |
0.31 | 0.01 – 0.61 | 0.040 |
Register [Personal] * Level [E] |
-0.35 | -0.95 – 0.25 | 0.256 |
Random Effects | |||
σ2 | 0.52 | ||
τ00 Series | 0.14 | ||
ICC | 0.21 | ||
N Series | 9 | ||
Observations | 1961 | ||
Marginal R2 / Conditional R2 | 0.071 / 0.269 |
tab_model(md5Register) # Marginal R2 = 0.001
PC 5 | |||
---|---|---|---|
Predictors | Estimates | CI | p |
(Intercept) | 0.04 | -0.22 – 0.30 | 0.776 |
Register [Fiction] | 0.02 | -0.09 – 0.13 | 0.682 |
Register [Informative] | 0.03 | -0.08 – 0.13 | 0.611 |
Register [Instructional] | 0.06 | -0.02 – 0.15 | 0.157 |
Register [Personal] | 0.06 | -0.12 – 0.23 | 0.514 |
Random Effects | |||
σ2 | 0.57 | ||
τ00 Series | 0.15 | ||
ICC | 0.20 | ||
N Series | 9 | ||
Observations | 1961 | ||
Marginal R2 / Conditional R2 | 0.001 / 0.205 |
tab_model(md5Level) # Marginal R2 = 0.055
PC 5 | |||
---|---|---|---|
Predictors | Estimates | CI | p |
(Intercept) | -0.34 | -0.60 – -0.08 | 0.011 |
Level [B] | 0.27 | 0.16 – 0.39 | <0.001 |
Level [C] | 0.49 | 0.38 – 0.60 | <0.001 |
Level [D] | 0.57 | 0.47 – 0.68 | <0.001 |
Level [E] | 0.55 | 0.43 – 0.67 | <0.001 |
Random Effects | |||
σ2 | 0.53 | ||
τ00 Series | 0.14 | ||
ICC | 0.21 | ||
N Series | 9 | ||
Observations | 1961 | ||
Marginal R2 / Conditional R2 | 0.055 / 0.252 |
# Plot of fixed effects:
plot_model(md5,
type = "est",
show.intercept = TRUE,
show.values=TRUE,
show.p=TRUE,
value.offset = .4,
value.size = 3.5,
colors = palette[c(1:3,8,7)],
group.terms = c(1:5,1,1,1,1,2:5,2:5,2:5,2:5),
title = "",
wrap.labels = 40,
axis.title = "PC5 estimated coefficients") +
theme_sjplot2()
#ggsave(here("plots", "TxB_PCA5_lmer_fixedeffects.svg"), height = 3, width = 8)
# Plot of random effects:
plot_model(md5,
type = "re", # Option to visualise random effects
show.values=TRUE,
show.p=TRUE,
value.offset = .4,
value.size = 3.5,
color = "bw",
wrap.labels = 40,
axis.title = "PC5 estimated coefficients") +
theme_sjplot2()
#ggsave(here("plots", "TxB_PCA5_lmer_randomeffects.svg"), height = 3, width = 8)
# Dimension 6
md6 <- lmer(PC6 ~ Register*Level + (1|Series), data = res.ind, REML = FALSE)
md6Register <- lmer(PC6 ~ Register + (1|Series), data = res.ind, REML = FALSE)
md6Level <- lmer(PC6 ~ Level + (1|Series), data = res.ind, REML = FALSE)
anova(md6, md6Register, md6Level)
## Data: res.ind
## Models:
## md6Register: PC6 ~ Register + (1 | Series)
## md6Level: PC6 ~ Level + (1 | Series)
## md6: PC6 ~ Register * Level + (1 | Series)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## md6Register 7 4141.3 4180.4 -2063.7 4127.3
## md6Level 7 4151.6 4190.6 -2068.8 4137.6 0.000 0
## md6 27 4099.5 4250.2 -2022.8 4045.5 92.039 20 3.255e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tab_model(md6) # Marginal R2 = 0.048
PC 6 | |||
---|---|---|---|
Predictors | Estimates | CI | p |
(Intercept) | 0.13 | -0.15 – 0.40 | 0.374 |
Register [Fiction] | -0.26 | -0.52 – -0.00 | 0.049 |
Register [Informative] | -0.40 | -0.67 – -0.13 | 0.004 |
Register [Instructional] | -0.36 | -0.55 – -0.17 | <0.001 |
Register [Personal] | -0.06 | -0.47 – 0.34 | 0.762 |
Level [B] | 0.14 | -0.04 – 0.32 | 0.123 |
Level [C] | 0.01 | -0.16 – 0.18 | 0.922 |
Level [D] | 0.00 | -0.18 – 0.18 | 0.995 |
Level [E] | 0.23 | 0.02 – 0.44 | 0.029 |
Register [Fiction] * Level [B] |
0.31 | -0.03 – 0.64 | 0.072 |
Register [Informative] * Level [B] |
0.16 | -0.18 – 0.50 | 0.364 |
Register [Instructional] * Level [B] |
-0.04 | -0.29 – 0.21 | 0.767 |
Register [Personal] * Level [B] |
-0.35 | -0.85 – 0.16 | 0.181 |
Register [Fiction] * Level [C] |
0.19 | -0.13 – 0.52 | 0.243 |
Register [Informative] * Level [C] |
0.21 | -0.11 – 0.53 | 0.190 |
Register [Instructional] * Level [C] |
0.25 | 0.01 – 0.49 | 0.039 |
Register [Personal] * Level [C] |
-0.06 | -0.56 – 0.45 | 0.830 |
Register [Fiction] * Level [D] |
0.37 | 0.04 – 0.69 | 0.027 |
Register [Informative] * Level [D] |
0.39 | 0.07 – 0.71 | 0.018 |
Register [Instructional] * Level [D] |
0.33 | 0.09 – 0.58 | 0.008 |
Register [Personal] * Level [D] |
0.06 | -0.48 – 0.59 | 0.829 |
Register [Fiction] * Level [E] |
0.07 | -0.27 – 0.42 | 0.674 |
Register [Informative] * Level [E] |
0.58 | 0.23 – 0.93 | 0.001 |
Register [Instructional] * Level [E] |
-0.05 | -0.33 – 0.23 | 0.722 |
Register [Personal] * Level [E] |
-0.47 | -1.03 – 0.09 | 0.098 |
Random Effects | |||
σ2 | 0.45 | ||
τ00 Series | 0.14 | ||
ICC | 0.23 | ||
N Series | 9 | ||
Observations | 1961 | ||
Marginal R2 / Conditional R2 | 0.048 / 0.268 |
# Plot of fixed effects:
plot_model(md6,
type = "est",
show.intercept = TRUE,
show.values=TRUE,
show.p=TRUE,
value.offset = .4,
value.size = 3.5,
colors = palette[c(1:3,8,7)],
group.terms = c(1:5,1,1,1,1,2:5,2:5,2:5,2:5),
title = "",
wrap.labels = 40,
axis.title = "PC6 estimated coefficients") +
theme_sjplot2()
#ggsave(here("plots", "TxB_PCA6_lmer_fixedeffects.svg"), height = 3, width = 8)
# Plot of random effects:
plot_model(md6,
type = "re", # Option to visualise random effects
show.values=TRUE,
show.p=TRUE,
value.offset = .4,
value.size = 3.5,
color = "bw",
wrap.labels = 40,
axis.title = "PC6 estimated coefficients") +
theme_sjplot2()
#ggsave(here("plots", "TxB_PCA6_lmer_randomeffects.svg"), height = 3, width = 8)
# Boxplot visualisation
res.ind %>%
ggplot(aes(x = Register, y = PC2, fill = Level, facet = Register)) +
geom_boxplot() +
facet_grid()
ggplot(res.ind, aes(x=Register,y=PC2, fill = Level, colour = Register, facet = Register))+ # Or leave out "colour = Register" to keep the dots in black
#geom_point(position = position_jitter(width = .15), size = .25)+
# note that here we need to set the x-variable to a numeric variable and bump it to get the boxplots to line up with the rainclouds.
geom_boxplot(aes(x = as.numeric(Register), y = PC2), colour = "BLACK") +
ylab('PC2')+
theme_minimal()+
guides(fill = FALSE, colour = FALSE) +
scale_colour_manual(values = colours)+
scale_fill_manual(values = colours)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
#ggsave(here("plots", "TxB_PC2_RegisterLevelsBoxplots.svg"), width = 13, height = 8)
#packages.bib <- sapply(1:length(loadedNamespaces()), function(i) toBibtex(citation(loadedNamespaces()[i])))
knitr::write_bib(c(.packages(), "knitr"), "packages.bib")
## Warning in utils::citation(..., lib.loc = lib.loc): no date field in DESCRIPTION
## file of package 'suffrager'
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] visreg_2.7.0 tibble_3.1.6 tidyr_1.1.4 suffrager_0.1.0
## [5] sjPlot_2.8.9 psych_2.0.12 PCAtools_2.2.0 ggrepel_0.9.1
## [9] pca3d_0.10.2 patchwork_1.1.1 lme4_1.1-27.1 Matrix_1.3-2
## [13] ggthemes_4.2.4 here_1.0.1 forcats_0.5.1 factoextra_1.0.7
## [17] emmeans_1.5.4 dplyr_1.0.7 DescTools_0.99.40 cowplot_1.1.1
## [21] caret_6.0-86 ggplot2_3.3.5 lattice_0.20-41
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.2 tidyselect_1.1.1
## [3] htmlwidgets_1.5.4 grid_4.0.3
## [5] BiocParallel_1.24.1 pROC_1.17.0.1
## [7] munsell_0.5.0 codetools_0.2-18
## [9] effectsize_0.4.4-2 miniUI_0.1.1.1
## [11] withr_2.4.3 colorspace_2.0-2
## [13] highr_0.9 knitr_1.37
## [15] rstudioapi_0.13 stats4_4.0.3
## [17] ggsignif_0.6.1 MatrixGenerics_1.2.1
## [19] labeling_0.4.2 mnormt_2.0.2
## [21] farver_2.1.0 datawizard_0.2.1
## [23] glmmTMB_1.0.2.1 rprojroot_2.0.2
## [25] coda_0.19-4 vctrs_0.3.8
## [27] generics_0.1.1 TH.data_1.0-10
## [29] ipred_0.9-11 xfun_0.29
## [31] R6_2.5.1 rsvd_1.0.3
## [33] manipulateWidget_0.10.1 DelayedArray_0.16.2
## [35] assertthat_0.2.1 promises_1.2.0.1
## [37] scales_1.1.1 multcomp_1.4-16
## [39] nnet_7.3-15 rootSolve_1.8.2.1
## [41] gtable_0.3.0 beachmat_2.6.4
## [43] lmom_2.8 sandwich_3.0-0
## [45] timeDate_3043.102 rlang_0.4.12
## [47] splines_4.0.3 TMB_1.7.19
## [49] rstatix_0.7.0 ModelMetrics_1.2.2.2
## [51] broom_0.7.9 rgl_0.105.22
## [53] yaml_2.2.1 reshape2_1.4.4
## [55] abind_1.4-5 modelr_0.1.8
## [57] crosstalk_1.2.0 backports_1.4.1
## [59] httpuv_1.6.4 tools_4.0.3
## [61] lava_1.6.9 ellipsis_0.3.2
## [63] jquerylib_0.1.4 BiocGenerics_0.36.0
## [65] Rcpp_1.0.7 plyr_1.8.6
## [67] sparseMatrixStats_1.2.1 purrr_0.3.4
## [69] ggpubr_0.4.0 rpart_4.1-15
## [71] S4Vectors_0.28.1 zoo_1.8-9
## [73] haven_2.3.1 magrittr_2.0.1
## [75] data.table_1.14.2 openxlsx_4.2.3
## [77] tmvnsim_1.0-2 mvtnorm_1.1-1
## [79] sjmisc_2.8.6 matrixStats_0.58.0
## [81] hms_1.0.0 mime_0.12
## [83] evaluate_0.14 xtable_1.8-4
## [85] pbkrtest_0.5.1 rio_0.5.26
## [87] sjstats_0.18.1 readxl_1.3.1
## [89] IRanges_2.24.1 ggeffects_1.0.1
## [91] compiler_4.0.3 ellipse_0.4.2
## [93] crayon_1.4.2 minqa_1.2.4
## [95] htmltools_0.5.2 later_1.3.0
## [97] expm_0.999-6 Exact_2.1
## [99] lubridate_1.7.10 DBI_1.1.1
## [101] sjlabelled_1.1.7 MASS_7.3-53.1
## [103] boot_1.3-27 car_3.0-10
## [105] cli_3.1.0 parallel_4.0.3
## [107] insight_0.14.5 gower_0.2.2
## [109] pkgconfig_2.0.3 foreign_0.8-81
## [111] recipes_0.1.15 foreach_1.5.1
## [113] bslib_0.3.1 dqrng_0.2.1
## [115] webshot_0.5.2 estimability_1.3
## [117] prodlim_2019.11.13 snakecase_0.11.0
## [119] stringr_1.4.0 digest_0.6.29
## [121] parameters_0.13.0.1 rmarkdown_2.11
## [123] cellranger_1.1.0 gld_2.6.2
## [125] DelayedMatrixStats_1.12.3 curl_4.3.2
## [127] shiny_1.7.1 nloptr_1.2.2.3
## [129] lifecycle_1.0.1 nlme_3.1-152
## [131] jsonlite_1.7.2 carData_3.0-4
## [133] fansi_0.5.0 pillar_1.6.4
## [135] fastmap_1.1.0 survival_3.2-7
## [137] glue_1.6.0 bayestestR_0.9.0
## [139] zip_2.1.1 iterators_1.0.13
## [141] class_7.3-18 stringi_1.7.6
## [143] sass_0.4.0 performance_0.7.2
## [145] BiocSingular_1.6.0 irlba_2.3.3
## [147] e1071_1.7-4