This script is part of the Online Appendix to my PhD thesis.
Please cite as: Le Foll, Elen. 2022. Textbook English: A Corpus-Based Analysis of the Language of EFL textbooks used in Secondary Schools in France, Germany and Spain. PhD thesis. Osnabrück University.
For more information, see: https://elenlefoll.github.io/TextbookEnglish/
Please note that the plot dimensions in this notebook have been optimised for the print version of the thesis.
Built with R 4.0.3
These chunks import the data directly from the Excel files in which the manual tag checks and corrections were performed.
# Set the default knitr chunk options used throughout this notebook
knitr::opts_chunk$set(
  echo = TRUE,
  tidy = TRUE,
  message = FALSE,
  warning = FALSE,
  paged.print = TRUE,
  fig.width = 10
)
library(caret) # For computing confusion matrices
library(harrypotter) # Only for colour scheme
library(here) # For path management
library(paletteer) # For nice colours
library(readxl) # For the direct import of Excel files
library(tidyverse) # For everything else!
# Shared worker for importEval3() and importEval4(): converts one evaluation
# spreadsheet from wide format (one Tag<i>/Tag<i>Gold column pair per tag slot)
# to long format (one row per tag assigned by the tagger).
#
# Arguments:
#   file      Data frame read from an evaluation Excel file. It must contain the
#             columns Output, Tokens and Tag1/Tag1Gold ... Tag<nTags>/Tag<nTags>Gold.
#   fileID, register, corpus
#             Values written to the new FileID, Register and Corpus columns.
#   nTags     Number of Tag<i>/Tag<i>Gold column pairs to stack.
#   dropNA    Name of the column whose NA rows are removed after stacking.
#
# Returns a data frame with the columns FileID, Corpus, Register, Output, Token,
# Tag, TagGold and Evaluation, in which all character columns are factors.
importEvalN <- function(file, fileID, register, corpus, nTags, dropNA) {
  # Extract the i-th Tag/TagGold column pair under the common column names
  stackOne <- function(i) {
    tagCol <- paste0("Tag", i)
    goldCol <- paste0("Tag", i, "Gold")
    file %>%
      add_column(FileID = fileID, Register = register, Corpus = corpus) %>%
      select(FileID, Corpus, Register, Output,
             Token = Tokens, Tag = all_of(tagCol), TagGold = all_of(goldCol)) %>%
      # Evaluation is TRUE where no gold tag was entered ...
      mutate(Evaluation = is.na(TagGold)) %>%
      # ... in which case the tagger's tag is copied to TagGold
      mutate(TagGold = coalesce(as.character(TagGold), as.character(Tag))) %>%
      # Rows without a tag in this slot carry no information
      filter(!is.na(Tag)) %>%
      mutate(across(where(is.character), as.factor))
  }

  do.call(rbind, lapply(seq_len(nTags), stackOne)) %>%
    # Removes all white spaces which are found in the excel files
    # (this turns the factor columns back into character columns)
    mutate(across(where(is.factor), ~ str_remove_all(.x, pattern = fixed(" ")))) %>%
    filter(!is.na(.data[[dropNA]])) %>%
    mutate(across(where(is.character), as.factor))
}

# Function to import and wrangle the evaluation data from the Excel files in which the manual evaluation was conducted
# (files with three tag columns). See importEvalN() for arguments and return value.
# NOTE(review): after stacking, this function drops rows with a missing Output,
# whereas importEval4() drops rows with a missing Tag. The difference is kept
# as in the original script - confirm whether it is intended.
importEval3 <- function(file, fileID, register, corpus) {
  importEvalN(file, fileID, register, corpus, nTags = 3, dropNA = "Output")
}

# Second function to import and wrangle the evaluation data for Excel files with four tag columns as opposed to three
importEval4 <- function(file, fileID, register, corpus) {
  importEvalN(file, fileID, register, corpus, nTags = 4, dropNA = "Tag")
}

# Function to decide which of the two above functions should be used:
# importEval4() if the file has at least one non-missing value in a Tag4
# column, importEval3() otherwise (including when there is no Tag4 column).
importEval <- function(file, fileID, register, corpus) {
  # Check for the column first so that files without Tag4 do not trigger an
  # "unknown column" warning
  hasFourthTag <- "Tag4" %in% names(file) && any(!is.na(file$Tag4))
  if (hasFourthTag) {
    importEval4(file = file, fileID = fileID, register = register, corpus = corpus)
  } else {
    importEval3(file = file, fileID = fileID, register = register, corpus = corpus)
  }
}
# Read each manually checked file and convert it to long format with importEval().
# --- Textbook English Corpus (TEC) files ---
Solutions_Intermediate_Spoken_0032 <- here("MFTE", "Evaluation", "Solutions_Intermediate_Spoken_0032_Evaluation.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "Solutions_Intermediate_Spoken_0032", register = "Conversation", corpus = "TEC-Sp")
HT_5_Poetry_0001 <- here("MFTE", "Evaluation", "HT_5_Poetry_0001_Evaluation.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "HT_5_Poetry_0001", register = "Poetry", corpus = "TEC-Fr")
Achievers_A1_Informative_0006 <- here("MFTE", "Evaluation", "Achievers_A1_Informative_0006_Evaluation.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "Achievers_A1_Informative_0006", register = "Informative", corpus = "TEC-Sp")
New_GreenLine_5_Personal_0003 <- here("MFTE", "Evaluation", "New_GreenLine_5_Personal_0003_Evaluation.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "New_GreenLine_5_Personal_0003", register = "Personal communication", corpus = "TEC-Ger")
Piece_of_cake_3e_Instructional_0006 <- here("MFTE", "Evaluation", "Piece_of_cake_3e_Instructional_0006_Evaluation.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "Piece_of_cake_3e_Instructional_0006", register = "Instructional", corpus = "TEC-Fr")
# Note the capitalised "Fiction" here vs. the lower-case "fiction" used for the
# BNC2014 files below: these are two distinct Register values.
Access_4_Narrative_0006 <- here("MFTE", "Evaluation", "Access_4_Narrative_0006_Evaluation.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "Access_4_Narrative_0006", register = "Fiction", corpus = "TEC-Ger")

# --- BNC2014 files: fiction ---
BNCBFict_b2 <- here("MFTE", "Evaluation", "BNCBFict_b2.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "BNCBFict_b2", register = "fiction", corpus = "BNC2014")
BNCBFict_m54 <- here("MFTE", "Evaluation", "BNCBFict_m54.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "BNCBFict_m54", register = "fiction", corpus = "BNC2014")
BNCBFict_e27 <- here("MFTE", "Evaluation", "BNCBFict_e27.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "BNCBFict_e27", register = "fiction", corpus = "BNC2014")

# --- BNC2014 files: news ---
BNCBMass16 <- here("MFTE", "Evaluation", "BNCBMass16.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "BNCBMass16", register = "news", corpus = "BNC2014")
BNCBMass23 <- here("MFTE", "Evaluation", "BNCBMass23.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "BNCBMass23", register = "news", corpus = "BNC2014")
BNCBReg111 <- here("MFTE", "Evaluation", "BNCBReg111.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "BNCBReg111", register = "news", corpus = "BNC2014")
BNCBReg750 <- here("MFTE", "Evaluation", "BNCBReg750.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "BNCBReg750", register = "news", corpus = "BNC2014")
BNCBSer486 <- here("MFTE", "Evaluation", "BNCBSer486.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "BNCBSer486", register = "news", corpus = "BNC2014")
BNCBSer562 <- here("MFTE", "Evaluation", "BNCBSer562.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "BNCBSer562", register = "news", corpus = "BNC2014")

# --- BNC2014 files: internet ---
BNCBEBl8 <- here("MFTE", "Evaluation", "BNCBEBl8.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "BNCBEBl8", register = "internet", corpus = "BNC2014")
BNCBEFor32 <- here("MFTE", "Evaluation", "BNCBEFor32.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "BNCBEFor32", register = "internet", corpus = "BNC2014")

# --- BNC2014 files: spoken ---
S2DD <- here("MFTE", "Evaluation", "S2DD.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "S2DD", register = "spoken", corpus = "BNC2014")
S3AV <- here("MFTE", "Evaluation", "S3AV.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "S3AV", register = "spoken", corpus = "BNC2014")
SEL5 <- here("MFTE", "Evaluation", "SEL5.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "SEL5", register = "spoken", corpus = "BNC2014")
SVLK <- here("MFTE", "Evaluation", "SVLK.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "SVLK", register = "spoken", corpus = "BNC2014")
SZXQ <- here("MFTE", "Evaluation", "SZXQ.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "SZXQ", register = "spoken", corpus = "BNC2014")

# Stack all evaluated files into a single table
TaggerEval <- rbind(
  Solutions_Intermediate_Spoken_0032,
  HT_5_Poetry_0001,
  Achievers_A1_Informative_0006,
  New_GreenLine_5_Personal_0003,
  Piece_of_cake_3e_Instructional_0006,
  Access_4_Narrative_0006,
  BNCBEBl8,
  BNCBFict_b2,
  BNCBFict_m54,
  BNCBFict_e27,
  BNCBEFor32,
  BNCBMass16,
  BNCBMass23,
  BNCBReg111,
  BNCBReg750,
  BNCBSer486,
  BNCBSer562,
  S2DD,
  S3AV,
  SEL5,
  SVLK,
  SZXQ
)
summary(TaggerEval)
## FileID Corpus Register Output
## BNCBFict_b2 : 2621 TEC-Sp : 1042 fiction :6500 ._. : 1156
## BNCBFict_e27: 2104 TEC-Fr : 2058 news :6312 the_DT : 820
## BNCBFict_m54: 1775 TEC-Ger: 1415 spoken :6047 ,_, : 720
## BNCBMass16 : 1619 BNC2014:20718 internet :1859 a_DT : 466
## SEL5 : 1463 Instructional:1048 of_IN : 328
## BNCBEFor32 : 1305 Poetry :1010 (Other):21742
## (Other) :14346 (Other) :2457 NA's : 1
## Token Tag TagGold Evaluation
## . : 1156 NN : 4415 NN : 4328 Mode :logical
## the : 820 IN : 1810 IN : 1773 FALSE:930
## , : 720 DT : 1454 DT : 1457 TRUE :24303
## to : 495 . : 1367 . : 1367
## 's : 493 VPRT : 1044 VPRT : 1054
## (Other):21547 VBD : 899 VBD : 895
## NA's : 2 (Other):14244 (Other):14359
# Merge three pairs of tags (PHC -> CC, QLIKE -> LIKE, TO -> IN) in both the
# tagger output and the gold standard, then recompute Evaluation so that it
# reflects agreement between the merged tags.
TaggerEval <- TaggerEval %>%
  mutate(across(c(Tag, TagGold), ~ case_when(
    .x == "PHC" ~ "CC",
    .x == "QLIKE" ~ "LIKE",
    .x == "TO" ~ "IN",
    TRUE ~ as.character(.x)
  ))) %>%
  mutate(across(where(is.character), as.factor)) %>%
  mutate(Evaluation = as.character(Tag) == as.character(TagGold))
head(TaggerEval) # Check sanity of data
## # A tibble: 6 × 8
## FileID Corpus Register Output Token Tag TagGold Evaluation
## <fct> <fct> <fct> <fct> <fct> <fct> <fct> <lgl>
## 1 Solutions_Interm… TEC-Sp Conversat… Intervie… Interv… NN NN TRUE
## 2 Solutions_Interm… TEC-Sp Conversat… In_IN In IN IN TRUE
## 3 Solutions_Interm… TEC-Sp Conversat… this_DEMO this DEMO DEMO TRUE
## 4 Solutions_Interm… TEC-Sp Conversat… part_NN part NN NN TRUE
## 5 Solutions_Interm… TEC-Sp Conversat… of_IN of IN IN TRUE
## 6 Solutions_Interm… TEC-Sp Conversat… the_DT the DT DT TRUE
summary(TaggerEval) # Check sanity of data (tag counts after merging PHC/QLIKE/TO)
## FileID Corpus Register Output
## BNCBFict_b2 : 2621 TEC-Sp : 1042 fiction :6500 ._. : 1156
## BNCBFict_e27: 2104 TEC-Fr : 2058 news :6312 the_DT : 820
## BNCBFict_m54: 1775 TEC-Ger: 1415 spoken :6047 ,_, : 720
## BNCBMass16 : 1619 BNC2014:20718 internet :1859 a_DT : 466
## SEL5 : 1463 Instructional:1048 of_IN : 328
## BNCBEFor32 : 1305 Poetry :1010 (Other):21742
## (Other) :14346 (Other) :2457 NA's : 1
## Token Tag TagGold Evaluation
## . : 1156 NN : 4415 NN : 4328 Mode :logical
## the : 820 IN : 2145 IN : 2113 FALSE:832
## , : 720 DT : 1454 DT : 1457 TRUE :24401
## to : 495 . : 1367 . : 1367
## 's : 493 VPRT : 1044 VPRT : 1054
## (Other):21547 VBD : 899 VBD : 895
## NA's : 2 (Other):13909 (Other):14019
# The two commented-out calls below export the combined evaluation table;
# the saved .rds file is then read back in.
# saveRDS(TaggerEval, here('MFTE', 'Evaluation',
# 'MFTE_PhD_Evaluation_Results.rds')) # Last saved 10 Nov 2021
# write.csv(TaggerEval, here('MFTE', 'Evaluation',
# 'MFTE_PhD_Evaluation_Results.csv')) # Last saved 10 Nov 2021
TaggerEval <- here("MFTE", "Evaluation", "MFTE_PhD_Evaluation_Results.rds") %>%
  readRDS()
summary(TaggerEval)
## FileID Corpus Register Output
## BNCBFict_b2 : 2621 TEC-Sp : 1042 fiction :6500 ._. : 1156
## BNCBFict_e27: 2104 TEC-Fr : 2058 news :6312 the_DT : 820
## BNCBFict_m54: 1775 TEC-Ger: 1415 spoken :6047 ,_, : 720
## BNCBMass16 : 1619 BNC2014:20718 internet :1859 a_DT : 466
## SEL5 : 1463 Instructional:1048 of_IN : 328
## BNCBEFor32 : 1305 Poetry :1010 (Other):21742
## (Other) :14346 (Other) :2457 NA's : 1
## Token Tag TagGold Evaluation
## . : 1156 NN : 4415 NN : 4328 Mode :logical
## the : 820 IN : 2145 IN : 2113 FALSE:832
## , : 720 DT : 1454 DT : 1457 TRUE :24401
## to : 495 . : 1367 . : 1367
## 's : 493 VPRT : 1044 VPRT : 1054
## (Other):21547 VBD : 899 VBD : 895
## NA's : 2 (Other):13909 (Other):14019
In this chunk, I calculate the recall and precision rates of each feature, ignoring unclear tokens and all punctuation and symbols.
# Total number of TEC tags manually checked
TaggerEval %>%
  filter(Corpus %in% c("TEC-Fr", "TEC-Ger", "TEC-Sp")) %>%
  nrow()
## [1] 4515
# Number of UNCLEAR evaluation tags
TaggerEval %>%
  filter(Corpus %in% c("TEC-Fr", "TEC-Ger", "TEC-Sp"), TagGold == "UNCLEAR") %>%
  nrow() # 0 in TEC sample
## [1] 0
# Evaluation data for the TEC files
data <- TaggerEval %>%
  filter(Corpus %in% c("TEC-Fr", "TEC-Ger", "TEC-Sp"), TagGold != "UNCLEAR") %>%
  # Keep only tags that consist entirely of capital letters and digits, i.e.,
  # remove punctuation tags which are uninteresting here.
  filter(Tag %in% str_extract(Tag, "[A-Z0-9]+")) %>%
  filter(Tag != "SYM", Tag != "``") %>%
  droplevels() %>%
  # Ensure that the factor levels are the same for the next caret operation
  mutate(
    Tag = factor(Tag, levels = union(levels(Tag), levels(TagGold))),
    TagGold = factor(TagGold, levels = levels(Tag))
  )
# Spot gold tag corrections that are not actually errors (should return zero rows if all is well)
data[with(data, Tag == TagGold & Evaluation == FALSE), ] %>% as.data.frame()
## [1] FileID Corpus Register Output Token Tag TagGold
## [8] Evaluation
## <0 rows> (or 0-length row.names)
# Total number of false tags
summary(data[["Evaluation"]])
## Mode FALSE TRUE
## logical 114 3831
# Create confusion matrix (tagger output vs. gold standard)
cm <- caret::confusionMatrix(data = data$Tag, reference = data$TagGold)
# Note that this is not very representative because it includes tags which are
# not intended for use in the MDA studies, e.g., LS and FW.
cm$overall
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.9711027 0.9688555 0.9653870 0.9761054 0.2035488
## AccuracyPValue McnemarPValue
## 0.0000000 NaN
# Accuracy metrics per feature: recall, precision and f1
cm$byClass[, c("Precision", "Recall", "F1")]
## Precision Recall F1
## Class: ABLE 1.0000000 1.00000000 1.0000000
## Class: ACT 0.9672131 0.98333333 0.9752066
## Class: AMP 1.0000000 1.00000000 1.0000000
## Class: ASPECT 1.0000000 1.00000000 1.0000000
## Class: BEMA 1.0000000 1.00000000 1.0000000
## Class: CAUSE 1.0000000 1.00000000 1.0000000
## Class: CC 1.0000000 0.99115044 0.9955556
## Class: CD 0.9545455 0.95454545 0.9545455
## Class: COMM 1.0000000 0.97727273 0.9885057
## Class: COND 1.0000000 1.00000000 1.0000000
## Class: CONT 1.0000000 1.00000000 1.0000000
## Class: CUZ 1.0000000 1.00000000 1.0000000
## Class: DEMO 0.9655172 0.96551724 0.9655172
## Class: DMA 1.0000000 1.00000000 1.0000000
## Class: DOAUX 0.8571429 1.00000000 0.9230769
## Class: DT 1.0000000 1.00000000 1.0000000
## Class: DWNT 0.6666667 1.00000000 0.8000000
## Class: ELAB 1.0000000 1.00000000 1.0000000
## Class: EMPH 0.8285714 1.00000000 0.9062500
## Class: EX 1.0000000 1.00000000 1.0000000
## Class: EXIST 1.0000000 1.00000000 1.0000000
## Class: FPP1P 1.0000000 1.00000000 1.0000000
## Class: FPP1S 1.0000000 1.00000000 1.0000000
## Class: FPUH 1.0000000 1.00000000 1.0000000
## Class: FREQ 1.0000000 1.00000000 1.0000000
## Class: FW 0.1000000 1.00000000 0.1818182
## Class: GTO 1.0000000 1.00000000 1.0000000
## Class: HDG 1.0000000 1.00000000 1.0000000
## Class: HGOT 1.0000000 1.00000000 1.0000000
## Class: IN 1.0000000 0.99731903 0.9986577
## Class: JJ 0.9555556 0.98473282 0.9699248
## Class: JPRED 0.9736842 0.90243902 0.9367089
## Class: LIKE 0.8333333 1.00000000 0.9090909
## Class: MDCA 1.0000000 1.00000000 1.0000000
## Class: MDCO 1.0000000 1.00000000 1.0000000
## Class: MDMM 1.0000000 0.66666667 0.8000000
## Class: MDNE 1.0000000 0.80000000 0.8888889
## Class: MDWO 1.0000000 1.00000000 1.0000000
## Class: MDWS 1.0000000 1.00000000 1.0000000
## Class: MENTAL 0.9892473 0.98924731 0.9892473
## Class: NCOMP 0.8800000 1.00000000 0.9361702
## Class: NN 0.9508393 0.98754670 0.9688454
## Class: NULL 1.0000000 0.07692308 0.1428571
## Class: OCCUR 0.9444444 1.00000000 0.9714286
## Class: PASS 0.8888889 0.88888889 0.8888889
## Class: PEAS 1.0000000 0.86666667 0.9285714
## Class: PGET 1.0000000 1.00000000 1.0000000
## Class: PIT 1.0000000 1.00000000 1.0000000
## Class: PLACE 1.0000000 0.83333333 0.9090909
## Class: POLITE 1.0000000 1.00000000 1.0000000
## Class: POS 1.0000000 1.00000000 1.0000000
## Class: PROG 1.0000000 0.89473684 0.9444444
## Class: QUAN 0.9622642 0.98076923 0.9714286
## Class: QUPR 1.0000000 1.00000000 1.0000000
## Class: RB 1.0000000 0.98571429 0.9928058
## Class: RP 1.0000000 1.00000000 1.0000000
## Class: SO 1.0000000 0.63636364 0.7777778
## Class: SPLIT 1.0000000 1.00000000 1.0000000
## Class: SPP2 1.0000000 1.00000000 1.0000000
## Class: STPR 0.6000000 1.00000000 0.7500000
## Class: THATD 0.8571429 1.00000000 0.9230769
## Class: THRC 1.0000000 0.71428571 0.8333333
## Class: THSC 0.6923077 1.00000000 0.8181818
## Class: TIME 1.0000000 0.96774194 0.9836066
## Class: TPP3P 1.0000000 1.00000000 1.0000000
## Class: TPP3S 1.0000000 1.00000000 1.0000000
## Class: VB 0.9448819 0.93750000 0.9411765
## Class: VBD 0.9733333 0.98648649 0.9798658
## Class: VBG 0.9642857 1.00000000 0.9818182
## Class: VBN 0.8461538 0.91666667 0.8800000
## Class: VIMP 0.9868421 0.88235294 0.9316770
## Class: VPRT 0.9796954 0.97969543 0.9796954
## Class: WHQU 0.9677419 1.00000000 0.9836066
## Class: WHSC 1.0000000 0.97058824 0.9850746
## Class: XX0 1.0000000 1.00000000 1.0000000
## Class: YNQU 1.0000000 1.00000000 1.0000000
## Class: OCR NA 0.00000000 NA
# Number of UNCLEAR evaluation tags
TaggerEval %>%
  filter(Register == "spoken", TagGold == "UNCLEAR") %>%
  nrow() # 7 in Spoken BNC2014 sample
## [1] 7
# Evaluation data for the files with Register "spoken"
data <- TaggerEval %>%
  filter(Register == "spoken", TagGold != "UNCLEAR") %>%
  # Keep only tags that consist entirely of capital letters and digits, i.e.,
  # remove all punctuation tags which are uninteresting here.
  filter(Tag %in% str_extract(Tag, "[A-Z0-9]+")) %>%
  # Also exclude symbols, as is done for the other subsets evaluated in this
  # script. The per-class results recorded below contain no SYM class, so this
  # does not alter them.
  filter(Tag != "SYM", Tag != "``") %>%
  droplevels() %>%
  # Ensure that the factor levels are the same for the next caret operation
  mutate(
    Tag = factor(Tag, levels = union(levels(Tag), levels(TagGold))),
    TagGold = factor(TagGold, levels = levels(Tag))
  )
# Spot gold tag corrections that are not actually errors (should return zero rows if all is well)
data[with(data, Tag == TagGold & Evaluation == FALSE), ] %>% as.data.frame()
## [1] FileID Corpus Register Output Token Tag TagGold
## [8] Evaluation
## <0 rows> (or 0-length row.names)
# Total number of false tags
summary(data[["Evaluation"]])
## Mode FALSE TRUE
## logical 224 5388
# Create confusion matrix (tagger output vs. gold standard)
cm <- caret::confusionMatrix(data = data$Tag, reference = data$TagGold)
cm$overall
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.9600855 0.9584079 0.9546300 0.9650557 0.1193870
## AccuracyPValue McnemarPValue
## 0.0000000 NaN
# Accuracy metrics per feature: recall, precision and f1
cm$byClass[, c("Precision", "Recall", "F1")]
## Precision Recall F1
## Class: ABLE 1.0000000 1.0000000 1.0000000
## Class: ACT 0.9831933 0.9831933 0.9831933
## Class: AMP 1.0000000 1.0000000 1.0000000
## Class: ASPECT 0.9285714 0.9285714 0.9285714
## Class: BEMA 0.9696970 1.0000000 0.9846154
## Class: CAUSE 0.8000000 1.0000000 0.8888889
## Class: CC 1.0000000 1.0000000 1.0000000
## Class: CD 0.7719298 0.9777778 0.8627451
## Class: COMM 1.0000000 1.0000000 1.0000000
## Class: CONC 1.0000000 1.0000000 1.0000000
## Class: COND 1.0000000 1.0000000 1.0000000
## Class: CONT 1.0000000 1.0000000 1.0000000
## Class: CUZ 1.0000000 0.9375000 0.9677419
## Class: DEMO 1.0000000 0.7702703 0.8702290
## Class: DMA 1.0000000 0.9426752 0.9704918
## Class: DOAUX 0.9629630 0.9811321 0.9719626
## Class: DT 1.0000000 0.9960784 0.9980354
## Class: ELAB 1.0000000 1.0000000 1.0000000
## Class: EMPH 0.9864865 0.9864865 0.9864865
## Class: EX 0.8695652 1.0000000 0.9302326
## Class: EXIST 1.0000000 1.0000000 1.0000000
## Class: FPP1P 1.0000000 1.0000000 1.0000000
## Class: FPP1S 1.0000000 0.9797980 0.9897959
## Class: FPUH 1.0000000 1.0000000 1.0000000
## Class: FREQ 1.0000000 1.0000000 1.0000000
## Class: FW 0.0000000 NA NA
## Class: GTO 1.0000000 0.9166667 0.9565217
## Class: HDG 1.0000000 0.9473684 0.9729730
## Class: HGOT 0.8333333 0.8333333 0.8333333
## Class: IN 0.9565217 0.9924812 0.9741697
## Class: JJAT 0.9292035 0.9459459 0.9375000
## Class: JJPR 0.8795181 0.9240506 0.9012346
## Class: LIKE 0.9838710 0.9104478 0.9457364
## Class: MDCA 1.0000000 1.0000000 1.0000000
## Class: MDCO 1.0000000 1.0000000 1.0000000
## Class: MDMM 1.0000000 1.0000000 1.0000000
## Class: MDNE 1.0000000 0.8500000 0.9189189
## Class: MDWO 1.0000000 0.9705882 0.9850746
## Class: MDWS 1.0000000 1.0000000 1.0000000
## Class: MENTAL 0.9230769 1.0000000 0.9600000
## Class: NCOMP 0.8070175 1.0000000 0.8932039
## Class: NN 0.9572901 0.9701493 0.9636768
## Class: OCCUR 1.0000000 1.0000000 1.0000000
## Class: PASS 0.8000000 1.0000000 0.8888889
## Class: PEAS 0.9487179 0.8809524 0.9135802
## Class: PGET 0.6666667 0.6666667 0.6666667
## Class: PIT 1.0000000 1.0000000 1.0000000
## Class: PLACE 0.9696970 0.9142857 0.9411765
## Class: POLITE 1.0000000 1.0000000 1.0000000
## Class: POS 0.5555556 1.0000000 0.7142857
## Class: PROG 1.0000000 0.8529412 0.9206349
## Class: QUAN 1.0000000 1.0000000 1.0000000
## Class: QUPR 1.0000000 1.0000000 1.0000000
## Class: QUTAG 1.0000000 1.0000000 1.0000000
## Class: RB 0.9698795 0.9200000 0.9442815
## Class: RP 0.9444444 0.7727273 0.8500000
## Class: SO 0.9583333 1.0000000 0.9787234
## Class: SPLIT 1.0000000 1.0000000 1.0000000
## Class: SPP2 1.0000000 1.0000000 1.0000000
## Class: STPR 1.0000000 1.0000000 1.0000000
## Class: THATD 0.6363636 1.0000000 0.7777778
## Class: THRC 0.6923077 0.6923077 0.6923077
## Class: THSC 0.3200000 1.0000000 0.4848485
## Class: TIME 1.0000000 0.9000000 0.9473684
## Class: TPP3P 1.0000000 1.0000000 1.0000000
## Class: TPP3S 1.0000000 1.0000000 1.0000000
## Class: USEDTO 1.0000000 1.0000000 1.0000000
## Class: VB 0.9397590 0.9435484 0.9416499
## Class: VBD 0.9831461 0.9722222 0.9776536
## Class: VBG 0.8604651 0.9736842 0.9135802
## Class: VBN 0.3636364 1.0000000 0.5333333
## Class: VIMP 0.6470588 0.5789474 0.6111111
## Class: VPRT 0.9775281 0.9586777 0.9680111
## Class: WHQU 0.6190476 0.9285714 0.7428571
## Class: WHSC 0.9846154 0.8888889 0.9343066
## Class: XX0 1.0000000 1.0000000 1.0000000
## Class: YNQU 0.8333333 1.0000000 0.9090909
## Class: NULL NA 0.0000000 NA
# Number of UNCLEAR evaluation tags
TaggerEval %>%
  filter(Register == "fiction", TagGold == "UNCLEAR") %>%
  nrow() # 0 in BNC2014 Baby+ fiction subsample
## [1] 0
# Evaluation data for the files with Register "fiction"
data <- TaggerEval %>%
  filter(Register == "fiction", TagGold != "UNCLEAR") %>%
  # Keep only tags that consist entirely of capital letters and digits, i.e.,
  # remove all punctuation tags which are uninteresting here.
  filter(Tag %in% str_extract(Tag, "[A-Z0-9]+")) %>%
  filter(Tag != "SYM", Tag != "``") %>%
  droplevels() %>%
  # Ensure that the factor levels are the same for the next caret operation
  mutate(
    Tag = factor(Tag, levels = union(levels(Tag), levels(TagGold))),
    TagGold = factor(TagGold, levels = levels(Tag))
  )
# Spot gold tag corrections that are not actually errors (should return zero rows if all is well)
data[with(data, Tag == TagGold & Evaluation == FALSE), ] %>% as.data.frame()
## [1] FileID Corpus Register Output Token Tag TagGold
## [8] Evaluation
## <0 rows> (or 0-length row.names)
# Total number of false tags
summary(data[["Evaluation"]])
## Mode FALSE TRUE
## logical 168 5346
# Create confusion matrix (tagger output vs. gold standard)
cm <- caret::confusionMatrix(data = data$Tag, reference = data$TagGold)
cm$overall
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.9695321 0.9674367 0.9646492 0.9739093 0.1904244
## AccuracyPValue McnemarPValue
## 0.0000000 NaN
# Accuracy metrics per feature: recall, precision and f1
cm$byClass[, c("Precision", "Recall", "F1")]
## Precision Recall F1
## Class: ACT 1.0000000 0.9916667 0.9958159
## Class: AMP 0.9473684 1.0000000 0.9729730
## Class: ASPECT 1.0000000 1.0000000 1.0000000
## Class: BEMA 0.9809524 1.0000000 0.9903846
## Class: CAUSE 1.0000000 1.0000000 1.0000000
## Class: CC 0.9948187 0.9795918 0.9871465
## Class: CD 1.0000000 0.9354839 0.9666667
## Class: COMM 0.9868421 1.0000000 0.9933775
## Class: CONC 1.0000000 1.0000000 1.0000000
## Class: COND 1.0000000 1.0000000 1.0000000
## Class: CONT 1.0000000 1.0000000 1.0000000
## Class: CUZ 1.0000000 1.0000000 1.0000000
## Class: DEMO 0.9791667 0.8867925 0.9306931
## Class: DMA 0.9230769 0.9230769 0.9230769
## Class: DOAUX 0.9523810 1.0000000 0.9756098
## Class: DT 1.0000000 1.0000000 1.0000000
## Class: DWNT 1.0000000 1.0000000 1.0000000
## Class: ELAB 1.0000000 0.7500000 0.8571429
## Class: EMPH 1.0000000 1.0000000 1.0000000
## Class: EX 1.0000000 1.0000000 1.0000000
## Class: EXIST 1.0000000 1.0000000 1.0000000
## Class: FPP1P 1.0000000 1.0000000 1.0000000
## Class: FPP1S 1.0000000 1.0000000 1.0000000
## Class: FPUH 1.0000000 1.0000000 1.0000000
## Class: FREQ 1.0000000 1.0000000 1.0000000
## Class: FW 0.0000000 NA NA
## Class: GTO 1.0000000 1.0000000 1.0000000
## Class: HDG 1.0000000 1.0000000 1.0000000
## Class: HGOT 1.0000000 1.0000000 1.0000000
## Class: IN 0.9804688 0.9980119 0.9891626
## Class: JJAT 0.9072165 0.8979592 0.9025641
## Class: JJPR 0.8541667 0.9111111 0.8817204
## Class: LIKE 0.9411765 1.0000000 0.9696970
## Class: MDCA 1.0000000 1.0000000 1.0000000
## Class: MDCO 1.0000000 1.0000000 1.0000000
## Class: MDMM 1.0000000 1.0000000 1.0000000
## Class: MDNE 1.0000000 0.9375000 0.9677419
## Class: MDWO 0.9444444 1.0000000 0.9714286
## Class: MDWS 1.0000000 1.0000000 1.0000000
## Class: MENTAL 1.0000000 1.0000000 1.0000000
## Class: NCOMP 0.8823529 0.9836066 0.9302326
## Class: NN 0.9695238 0.9695238 0.9695238
## Class: OCCUR 0.9285714 1.0000000 0.9629630
## Class: PASS 0.9354839 0.9666667 0.9508197
## Class: PEAS 0.9772727 0.7962963 0.8775510
## Class: PIT 1.0000000 1.0000000 1.0000000
## Class: PLACE 1.0000000 0.9375000 0.9677419
## Class: POLITE 1.0000000 1.0000000 1.0000000
## Class: POS 0.9000000 1.0000000 0.9473684
## Class: PROG 0.9583333 0.8846154 0.9200000
## Class: QUAN 1.0000000 0.9692308 0.9843750
## Class: QUPR 1.0000000 1.0000000 1.0000000
## Class: QUTAG 1.0000000 1.0000000 1.0000000
## Class: RB 0.9865772 0.9423077 0.9639344
## Class: RP 1.0000000 0.7750000 0.8732394
## Class: SO 1.0000000 1.0000000 1.0000000
## Class: SPLIT 1.0000000 1.0000000 1.0000000
## Class: SPP2 1.0000000 1.0000000 1.0000000
## Class: STPR 1.0000000 1.0000000 1.0000000
## Class: THATD 0.7894737 1.0000000 0.8823529
## Class: THRC 0.7142857 1.0000000 0.8333333
## Class: THSC 0.8823529 0.9677419 0.9230769
## Class: TIME 0.9666667 1.0000000 0.9830508
## Class: TPP3P 1.0000000 1.0000000 1.0000000
## Class: TPP3S 1.0000000 1.0000000 1.0000000
## Class: VB 0.9377990 0.9751244 0.9560976
## Class: VBD 0.9801700 0.9829545 0.9815603
## Class: VBG 0.9120879 0.9540230 0.9325843
## Class: VBN 0.5806452 0.8571429 0.6923077
## Class: VIMP 0.7500000 0.7500000 0.7500000
## Class: VPRT 0.9781022 0.9370629 0.9571429
## Class: WHQU 0.8461538 1.0000000 0.9166667
## Class: WHSC 0.9818182 0.9818182 0.9818182
## Class: XX0 1.0000000 1.0000000 1.0000000
## Class: YNQU 0.8181818 1.0000000 0.9000000
## Class: NULL NA 0.0000000 NA
# Number of files and tags included in this part of the evaluation (intended to match the ITTC data):
# all files with Register "news" plus the two internet files listed below
TaggerEval %>%
  filter(
    Register == "news" |
      FileID %in% c("BNCBEFor32", "BNCBEBl8")
  ) %>%
  group_by(FileID) %>%
  count()
## # A tibble: 8 × 2
## # Groups: FileID [8]
## FileID n
## <fct> <int>
## 1 BNCBEBl8 554
## 2 BNCBEFor32 1305
## 3 BNCBMass16 1619
## 4 BNCBMass23 268
## 5 BNCBReg111 1230
## 6 BNCBReg750 1275
## 7 BNCBSer486 1182
## 8 BNCBSer562 738
# Number of UNCLEAR evaluation tags
TaggerEval %>%
  filter(
    Register == "news" | FileID %in% c("BNCBEFor32", "BNCBEBl8"),
    TagGold == "UNCLEAR"
  ) %>%
  nrow() # 8
## [1] 8
# Evaluation data for the news files and the two internet files
data <- TaggerEval %>%
  filter(
    Register == "news" | FileID %in% c("BNCBEFor32", "BNCBEBl8"),
    TagGold != "UNCLEAR"
  ) %>%
  # Keep only tags that consist entirely of capital letters and digits, i.e.,
  # remove all punctuation tags which are uninteresting here.
  filter(Tag %in% str_extract(Tag, "[A-Z0-9]+")) %>%
  filter(Tag != "SYM", Tag != "``") %>%
  droplevels() %>%
  # Ensure that the factor levels are the same for the next caret operation
  mutate(
    Tag = factor(Tag, levels = union(levels(Tag), levels(TagGold))),
    TagGold = factor(TagGold, levels = levels(Tag))
  )
# Spot gold tag corrections that are not actually errors (should return zero rows if all is well)
data[with(data, Tag == TagGold & Evaluation == FALSE), ] %>% as.data.frame()
## [1] FileID Corpus Register Output Token Tag TagGold
## [8] Evaluation
## <0 rows> (or 0-length row.names)
# Total number of false tags
summary(data[["Evaluation"]])
## Mode FALSE TRUE
## logical 309 7113
# Create confusion matrix (tagger output vs. gold standard)
cm <- caret::confusionMatrix(data = data$Tag, reference = data$TagGold)
cm$overall
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.9583670 0.9542631 0.9535718 0.9627979 0.2431959
## AccuracyPValue McnemarPValue
## 0.0000000 NaN
# Accuracy metrics per feature: recall, precision and f1
cm$byClass[, c("Precision", "Recall", "F1")]
## Precision Recall F1
## Class: ACT 0.9210526 0.9887006 0.9536785
## Class: AMP 1.0000000 0.9375000 0.9677419
## Class: ASPECT 1.0000000 1.0000000 1.0000000
## Class: BEMA 0.9909910 0.9909910 0.9909910
## Class: CAUSE 1.0000000 1.0000000 1.0000000
## Class: CC 0.9960474 0.9921260 0.9940828
## Class: CD 0.9924242 0.9776119 0.9849624
## Class: COMM 1.0000000 1.0000000 1.0000000
## Class: CONC 0.9000000 0.8181818 0.8571429
## Class: COND 1.0000000 1.0000000 1.0000000
## Class: CONT 0.9642857 1.0000000 0.9818182
## Class: CUZ 1.0000000 0.9000000 0.9473684
## Class: DEMO 1.0000000 0.9607843 0.9800000
## Class: DMA 0.5000000 0.4000000 0.4444444
## Class: DOAUX 0.9200000 0.9200000 0.9200000
## Class: DT 1.0000000 0.9959184 0.9979550
## Class: DWNT 1.0000000 1.0000000 1.0000000
## Class: ELAB 1.0000000 1.0000000 1.0000000
## Class: EMPH 0.9761905 0.9534884 0.9647059
## Class: EX 1.0000000 1.0000000 1.0000000
## Class: EXIST 0.9642857 1.0000000 0.9818182
## Class: FPP1P 1.0000000 1.0000000 1.0000000
## Class: FPP1S 1.0000000 1.0000000 1.0000000
## Class: FPUH 1.0000000 0.6666667 0.8000000
## Class: FREQ 1.0000000 1.0000000 1.0000000
## Class: FW 0.2857143 0.4000000 0.3333333
## Class: GTO 1.0000000 1.0000000 1.0000000
## Class: HDG 1.0000000 1.0000000 1.0000000
## Class: IN 0.9857988 0.9964115 0.9910767
## Class: JJAT 0.9373134 0.8722222 0.9035971
## Class: JJPR 0.9195402 0.7407407 0.8205128
## Class: LIKE 1.0000000 1.0000000 1.0000000
## Class: MDCA 1.0000000 1.0000000 1.0000000
## Class: MDCO 1.0000000 1.0000000 1.0000000
## Class: MDMM 1.0000000 1.0000000 1.0000000
## Class: MDNE 1.0000000 0.9545455 0.9767442
## Class: MDWO 1.0000000 1.0000000 1.0000000
## Class: MDWS 1.0000000 1.0000000 1.0000000
## Class: MENTAL 0.9814815 1.0000000 0.9906542
## Class: NCOMP 0.9189189 0.9941520 0.9550562
## Class: NN 0.9566396 0.9778393 0.9671233
## Class: OCCUR 1.0000000 1.0000000 1.0000000
## Class: PASS 0.9240506 0.9240506 0.9240506
## Class: PEAS 1.0000000 0.9142857 0.9552239
## Class: PGET 1.0000000 0.6666667 0.8000000
## Class: PIT 1.0000000 0.9615385 0.9803922
## Class: PLACE 0.8636364 1.0000000 0.9268293
## Class: POLITE 1.0000000 1.0000000 1.0000000
## Class: POS 0.9777778 0.9565217 0.9670330
## Class: PROG 0.9210526 0.8750000 0.8974359
## Class: PRP 0.0000000 0.0000000 NaN
## Class: QUAN 0.9638554 1.0000000 0.9815951
## Class: QUPR 1.0000000 1.0000000 1.0000000
## Class: RB 0.9629630 0.9489051 0.9558824
## Class: RP 1.0000000 0.8181818 0.9000000
## Class: SO 1.0000000 0.8888889 0.9411765
## Class: SPLIT 1.0000000 1.0000000 1.0000000
## Class: SPP2 1.0000000 1.0000000 1.0000000
## Class: STPR 0.5000000 1.0000000 0.6666667
## Class: THATD 0.8461538 1.0000000 0.9166667
## Class: THRC 1.0000000 0.5000000 0.6666667
## Class: THSC 0.8500000 1.0000000 0.9189189
## Class: TIME 0.9512195 0.9750000 0.9629630
## Class: TPP3P 1.0000000 1.0000000 1.0000000
## Class: TPP3S 1.0000000 1.0000000 1.0000000
## Class: URL 1.0000000 1.0000000 1.0000000
## Class: USEDTO 0.0000000 NA NA
## Class: VB 0.8988764 0.9302326 0.9142857
## Class: VBD 0.9587156 0.9720930 0.9653580
## Class: VBG 0.9099099 0.9099099 0.9099099
## Class: VBN 0.4150943 1.0000000 0.5866667
## Class: VIMP 0.7142857 0.3448276 0.4651163
## Class: VPRT 0.9488636 0.9515670 0.9502134
## Class: WHQU 1.0000000 0.4444444 0.6153846
## Class: WHSC 0.9500000 1.0000000 0.9743590
## Class: XX0 1.0000000 0.9736842 0.9866667
## Class: YNQU 0.0000000 NA NA
## Class: `` NA 0.0000000 NA
## Class: NULL NA 0.0000000 NA
## Class: SYM NA 0.0000000 NA
# Number of tags evaluated per file, listed from the largest to the smallest file
TaggerEval %>%
  group_by(FileID) %>%
  summarise(n = n(), .groups = "drop") %>%
  arrange(desc(n)) %>%
  as.data.frame()
## FileID n
## 1 BNCBFict_b2 2621
## 2 BNCBFict_e27 2104
## 3 BNCBFict_m54 1775
## 4 BNCBMass16 1619
## 5 SEL5 1463
## 6 BNCBEFor32 1305
## 7 BNCBReg750 1275
## 8 BNCBReg111 1230
## 9 SVLK 1222
## 10 BNCBSer486 1182
## 11 S2DD 1180
## 12 S3AV 1126
## 13 SZXQ 1056
## 14 Piece_of_cake_3e_Instructional_0006 1048
## 15 HT_5_Poetry_0001 1010
## 16 New_GreenLine_5_Personal_0003 796
## 17 BNCBSer562 738
## 18 Solutions_Intermediate_Spoken_0032 636
## 19 Access_4_Narrative_0006 619
## 20 BNCBEBl8 554
## 21 Achievers_A1_Informative_0006 406
## 22 BNCBMass23 268
# Number of tokens whose gold-standard tag was recorded as UNCLEAR
TaggerEval %>%
  filter(TagGold == "UNCLEAR") %>%
  tally() %>%
  pull(n)
## [1] 15
# Tagger evaluation: tally of the logical Evaluation flag across all checked tags
TaggerEval %>%
  pull(Evaluation) %>%
  summary()
## Mode FALSE TRUE
## logical 832 24401
# Prepare the data for the confusion matrix. Rows are discarded when:
#   - the gold tag is UNCLEAR;
#   - the tag is not made up solely of capital letters and digits, i.e. the
#     punctuation tags, which are uninteresting here;
#   - either the tag or the gold tag is SYM or ``.
data <- TaggerEval %>%
  filter(
    TagGold != "UNCLEAR",
    Tag == str_extract(Tag, "[A-Z0-9]+"),
    Tag != "SYM" & Tag != "``",
    TagGold != "SYM" & TagGold != "``"
  ) %>%
  droplevels() %>%
  # caret requires both factors to share exactly the same levels: Tag first
  # receives the union of both level sets, which TagGold then adopts as is.
  mutate(
    Tag = factor(Tag, levels = union(levels(Tag), levels(TagGold))),
    TagGold = factor(TagGold, levels = levels(Tag))
  )
# Confusion matrix with the tagger output as predictions and the manually
# checked tags as reference
cm <- caret::confusionMatrix(data = data$Tag, reference = data$TagGold)
cm$overall
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.9638522 0.9613992 0.9613298 0.9662538 0.1924325
## AccuracyPValue McnemarPValue
## 0.0000000 NaN
# Quick summary of results per tag: precision, recall and F1
# (columns selected by name rather than by position)
cm$byClass[, c("Precision", "Recall", "F1")]
## Precision Recall F1
## Class: ABLE 1.0000000 1.000000000 1.00000000
## Class: ACT 0.9618182 0.986940299 0.97421731
## Class: AMP 0.9795918 0.979591837 0.97959184
## Class: ASPECT 0.9852941 0.985294118 0.98529412
## Class: BEMA 0.9839080 0.997668998 0.99074074
## Class: CAUSE 0.9736842 1.000000000 0.98666667
## Class: CC 0.9973333 0.990728477 0.99401993
## Class: CD 0.9375000 0.969827586 0.95338983
## Class: COMM 0.9960317 0.996031746 0.99603175
## Class: CONC 0.9729730 0.947368421 0.96000000
## Class: COND 1.0000000 1.000000000 1.00000000
## Class: CONT 0.9957447 1.000000000 0.99786780
## Class: CUZ 1.0000000 0.944444444 0.97142857
## Class: DEMO 0.9890710 0.874396135 0.92820513
## Class: DMA 0.9796954 0.932367150 0.95544554
## Class: DOAUX 0.9385965 0.972727273 0.95535714
## Class: DT 1.0000000 0.997940975 0.99896943
## Class: DWNT 0.9375000 1.000000000 0.96774194
## Class: ELAB 1.0000000 0.909090909 0.95238095
## Class: EMPH 0.9550562 0.982658960 0.96866097
## Class: EX 0.9552239 1.000000000 0.97709924
## Class: EXIST 0.9848485 1.000000000 0.99236641
## Class: FPP1P 1.0000000 1.000000000 1.00000000
## Class: FPP1S 1.0000000 0.990719258 0.99533800
## Class: FPUH 1.0000000 0.992700730 0.99633700
## Class: FREQ 1.0000000 1.000000000 1.00000000
## Class: FW 0.1363636 0.500000000 0.21428571
## Class: GTO 1.0000000 0.964285714 0.98181818
## Class: HDG 1.0000000 0.966666667 0.98305085
## Class: HGOT 0.8750000 0.875000000 0.87500000
## Class: IN 0.9813346 0.996210327 0.98871650
## Class: JJ 0.9555556 0.984732824 0.96992481
## Class: JJAT 0.9267913 0.892053973 0.90909091
## Class: JJPR 0.8834586 0.848375451 0.86556169
## Class: JPRED 0.9736842 0.902439024 0.93670886
## Class: LIKE 0.9680851 0.938144330 0.95287958
## Class: MDCA 1.0000000 1.000000000 1.00000000
## Class: MDCO 1.0000000 1.000000000 1.00000000
## Class: MDMM 1.0000000 0.947368421 0.97297297
## Class: MDNE 1.0000000 0.897058824 0.94573643
## Class: MDWO 0.9791667 0.989473684 0.98429319
## Class: MDWS 1.0000000 1.000000000 1.00000000
## Class: MENTAL 0.9709443 0.997512438 0.98404908
## Class: NCOMP 0.8888889 0.993788820 0.93841642
## Class: NN 0.9589290 0.976432532 0.96760160
## Class: NULL 1.0000000 0.009009009 0.01785714
## Class: OCCUR 0.9607843 1.000000000 0.98000000
## Class: PASS 0.9130435 0.933333333 0.92307692
## Class: PEAS 0.9812500 0.867403315 0.92082111
## Class: PGET 0.9000000 0.750000000 0.81818182
## Class: PIT 1.0000000 0.991202346 0.99558174
## Class: PLACE 0.9555556 0.934782609 0.94505495
## Class: POLITE 1.0000000 1.000000000 1.00000000
## Class: POS 0.9247312 0.977272727 0.95027624
## Class: PROG 0.9629630 0.873949580 0.91629956
## Class: PRP 0.0000000 0.000000000 NaN
## Class: QUAN 0.9805447 0.988235294 0.98437500
## Class: QUPR 1.0000000 1.000000000 1.00000000
## Class: QUTAG 1.0000000 1.000000000 1.00000000
## Class: RB 0.9768786 0.942379182 0.95931883
## Class: RP 0.9837398 0.817567568 0.89298893
## Class: SO 0.9729730 0.935064935 0.95364238
## Class: SPLIT 1.0000000 1.000000000 1.00000000
## Class: SPP2 1.0000000 1.000000000 1.00000000
## Class: STPR 0.6842105 1.000000000 0.81250000
## Class: THATD 0.7540984 1.000000000 0.85981308
## Class: THRC 0.8235294 0.700000000 0.75675676
## Class: THSC 0.7232143 0.987804878 0.83505155
## Class: TIME 0.9765625 0.961538462 0.96899225
## Class: TPP3P 1.0000000 1.000000000 1.00000000
## Class: TPP3S 1.0000000 1.000000000 1.00000000
## Class: URL 1.0000000 1.000000000 1.00000000
## Class: USEDTO 0.5000000 1.000000000 0.66666667
## Class: VB 0.9272300 0.946107784 0.93657380
## Class: VBD 0.9744160 0.978770950 0.97658863
## Class: VBG 0.9084249 0.942965779 0.92537313
## Class: VBN 0.5092593 0.932203390 0.65868263
## Class: VIMP 0.8869565 0.723404255 0.79687500
## Class: VPRT 0.9683301 0.957305503 0.96278626
## Class: WHQU 0.8405797 0.906250000 0.87218045
## Class: WHSC 0.9723320 0.960937500 0.96660118
## Class: XX0 1.0000000 0.992307692 0.99613900
## Class: YNQU 0.8333333 1.000000000 0.90909091
## Class: OCR NA 0.000000000 NA
# Generate a better formatted results table for export: precision, recall and
# F1 per tag, computed by hand from the confusion matrix counts
confusion_matrix <- cm$table
number_of_classes <- nrow(confusion_matrix)
total <- sum(confusion_matrix)
# The diagonal holds the tokens for which predicted and gold tag agree
correct <- diag(confusion_matrix)
# Row sums: how often each tag was predicted by the tagger
total_pred_class <- apply(confusion_matrix, MARGIN = 1, FUN = sum)
# Column sums: how often each tag occurs in the gold standard
total_actual_class <- apply(confusion_matrix, MARGIN = 2, FUN = sum)
# Precision = TP / all that were predicted as positive
precision <- correct / total_pred_class
# Recall = TP / all that were actually positive
recall <- correct / total_actual_class
# F1 = harmonic mean of precision and recall (NaN when both are zero)
f1 <- 2 * precision * recall / (precision + recall)
# One row per tag, named after the tag
results <- data.frame(
  precision = precision,
  recall = recall,
  f1 = f1,
  total_actual_class = total_actual_class
)
results
## precision recall f1 total_actual_class
## ABLE 1.0000000 1.000000000 1.00000000 6
## ACT 0.9618182 0.986940299 0.97421731 536
## AMP 0.9795918 0.979591837 0.97959184 49
## ASPECT 0.9852941 0.985294118 0.98529412 68
## BEMA 0.9839080 0.997668998 0.99074074 429
## CAUSE 0.9736842 1.000000000 0.98666667 37
## CC 0.9973333 0.990728477 0.99401993 755
## CD 0.9375000 0.969827586 0.95338983 232
## COMM 0.9960317 0.996031746 0.99603175 252
## CONC 0.9729730 0.947368421 0.96000000 38
## COND 1.0000000 1.000000000 1.00000000 69
## CONT 0.9957447 1.000000000 0.99786780 468
## CUZ 1.0000000 0.944444444 0.97142857 54
## DEMO 0.9890710 0.874396135 0.92820513 207
## DMA 0.9796954 0.932367150 0.95544554 207
## DOAUX 0.9385965 0.972727273 0.95535714 110
## DT 1.0000000 0.997940975 0.99896943 1457
## DWNT 0.9375000 1.000000000 0.96774194 15
## ELAB 1.0000000 0.909090909 0.95238095 11
## EMPH 0.9550562 0.982658960 0.96866097 173
## EX 0.9552239 1.000000000 0.97709924 64
## EXIST 0.9848485 1.000000000 0.99236641 65
## FPP1P 1.0000000 1.000000000 1.00000000 158
## FPP1S 1.0000000 0.990719258 0.99533800 431
## FPUH 1.0000000 0.992700730 0.99633700 137
## FREQ 1.0000000 1.000000000 1.00000000 70
## FW 0.1363636 0.500000000 0.21428571 6
## GTO 1.0000000 0.964285714 0.98181818 28
## HDG 1.0000000 0.966666667 0.98305085 60
## HGOT 0.8750000 0.875000000 0.87500000 8
## IN 0.9813346 0.996210327 0.98871650 2111
## JJ 0.9555556 0.984732824 0.96992481 131
## JJAT 0.9267913 0.892053973 0.90909091 667
## JJPR 0.8834586 0.848375451 0.86556169 277
## JPRED 0.9736842 0.902439024 0.93670886 41
## LIKE 0.9680851 0.938144330 0.95287958 97
## MDCA 1.0000000 1.000000000 1.00000000 56
## MDCO 1.0000000 1.000000000 1.00000000 45
## MDMM 1.0000000 0.947368421 0.97297297 19
## MDNE 1.0000000 0.897058824 0.94573643 68
## MDWO 0.9791667 0.989473684 0.98429319 95
## MDWS 1.0000000 1.000000000 1.00000000 83
## MENTAL 0.9709443 0.997512438 0.98404908 402
## NCOMP 0.8888889 0.993788820 0.93841642 322
## NN 0.9589290 0.976432532 0.96760160 4328
## NULL 1.0000000 0.009009009 0.01785714 111
## OCCUR 0.9607843 1.000000000 0.98000000 49
## PASS 0.9130435 0.933333333 0.92307692 135
## PEAS 0.9812500 0.867403315 0.92082111 181
## PGET 0.9000000 0.750000000 0.81818182 12
## PIT 1.0000000 0.991202346 0.99558174 341
## PLACE 0.9555556 0.934782609 0.94505495 92
## POLITE 1.0000000 1.000000000 1.00000000 15
## POS 0.9247312 0.977272727 0.95027624 88
## PROG 0.9629630 0.873949580 0.91629956 119
## PRP 0.0000000 0.000000000 NaN 1
## QUAN 0.9805447 0.988235294 0.98437500 255
## QUPR 1.0000000 1.000000000 1.00000000 58
## QUTAG 1.0000000 1.000000000 1.00000000 20
## RB 0.9768786 0.942379182 0.95931883 538
## RP 0.9837398 0.817567568 0.89298893 148
## SO 0.9729730 0.935064935 0.95364238 77
## SPLIT 1.0000000 1.000000000 1.00000000 113
## SPP2 1.0000000 1.000000000 1.00000000 299
## STPR 0.6842105 1.000000000 0.81250000 13
## THATD 0.7540984 1.000000000 0.85981308 46
## THRC 0.8235294 0.700000000 0.75675676 40
## THSC 0.7232143 0.987804878 0.83505155 82
## TIME 0.9765625 0.961538462 0.96899225 130
## TPP3P 1.0000000 1.000000000 1.00000000 206
## TPP3S 1.0000000 1.000000000 1.00000000 480
## URL 1.0000000 1.000000000 1.00000000 1
## USEDTO 0.5000000 1.000000000 0.66666667 1
## VB 0.9272300 0.946107784 0.93657380 835
## VBD 0.9744160 0.978770950 0.97658863 895
## VBG 0.9084249 0.942965779 0.92537313 263
## VBN 0.5092593 0.932203390 0.65868263 59
## VIMP 0.8869565 0.723404255 0.79687500 141
## VPRT 0.9683301 0.957305503 0.96278626 1054
## WHQU 0.8405797 0.906250000 0.87218045 64
## WHSC 0.9723320 0.960937500 0.96660118 256
## XX0 1.0000000 0.992307692 0.99613900 260
## YNQU 0.8333333 1.000000000 0.90909091 40
## OCR NaN 0.000000000 NaN 31
# Long-format version of the results table for plotting: one row per tag and
# metric. Rows containing missing values are dropped, and the tags NULL, SYM,
# OCR, FW and USEDTO are excluded.
resultslong <- results %>%
  drop_na() %>%
  mutate(tag = row.names(.)) %>%
  filter(!tag %in% c("NULL", "SYM", "OCR", "FW", "USEDTO")) %>%
  rename(n = total_actual_class) %>%
  pivot_longer(
    cols = c("precision", "recall", "f1"),
    names_to = "metric",
    values_to = "value"
  ) %>%
  # Fix the order of the metrics (used for the facets of the plot)
  mutate(metric = factor(metric, levels = c("precision", "recall", "f1")))
# Distribution of the number of gold-standard tokens per tag
summary(resultslong$n)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 49.0 110.0 282.8 263.0 4328.0
# Dot plot of precision, recall and F1 per tag, with one facet per metric.
# Point colour encodes n (number of gold-standard tokens per tag) on a log scale.
ggplot(
  resultslong,
  aes(x = value, y = reorder(tag, desc(tag)), group = metric, colour = n)
) +
  geom_point(size = 2) +
  facet_wrap(~metric) +
  xlab("") +
  ylab("") +
  scale_color_paletteer_c(
    "harrypotter::harrypotter",
    trans = "log",
    breaks = c(1, 10, 100, 1000),
    labels = c(1, 10, 100, 1000),
    name = "# tokens \nmanually\nevaluated"
  ) +
  theme_bw() +
  theme(
    panel.grid.major.y = element_line(colour = "darkgrey"),
    legend.position = "right"
  )
# ggsave(here('Plots', 'TaggerAccuracyPlot.svg'), width = 7, height = 12)
# Collect the tagging errors, i.e. the tokens flagged Evaluation == FALSE,
# leaving out those whose gold tag is UNCLEAR. Each error is labelled with the
# incorrectly assigned tag, an arrow (" -> ") and then the correct 'gold' label.
errors <- TaggerEval %>%
  # Evaluation is a logical column: negate it directly instead of comparing it
  # with the string "FALSE", which only worked through implicit coercion
  filter(!Evaluation, TagGold != "UNCLEAR") %>%
  mutate(Error = paste(Tag, TagGold, sep = " -> "))
# Total number of errors
nrow(errors) # 817
## [1] 817
# Frequency of each error type, most frequent first
FreqErrors <- errors %>%
  # filter(Corpus %in% c('TEC-Fr', 'TEC-Ger', 'TEC-Sp')) %>%
  count(Error, sort = TRUE)
# Number of error types that only occur once
FreqErrors %>%
  filter(n == 1) %>%
  tally() %>%
  pull(n)
## [1] 94
# Total number of error types (one row of FreqErrors per distinct error label)
FreqErrors %>% nrow()
## [1] 198
# Print the full error frequency table as a plain data frame
# (optional restriction, currently disabled: filter(n > 10))
print.data.frame(FreqErrors)
## Error n
## 1 NCOMP -> NULL 37
## 2 NN -> JJAT 35
## 3 JJAT -> NN 27
## 4 NN -> VB 27
## 5 IN -> RP 25
## 6 NN -> VPRT 24
## 7 VB -> NN 22
## 8 THSC -> DEMO 19
## 9 VB -> VIMP 19
## 10 NN -> OCR 16
## 11 VBN -> JJAT 16
## 12 ACT -> NULL 15
## 13 THATD -> NULL 15
## 14 CD -> NN 12
## 15 MENTAL -> NULL 12
## 16 NN -> VBG 11
## 17 NN -> VIMP 11
## 18 THSC -> THRC 11
## 19 VBG -> PROG 11
## 20 VBN -> JJPR 11
## 21 VBN -> VBD 10
## 22 WHQU -> WHSC 10
## 23 JJPR -> RB 9
## 24 JJPR -> JJAT 8
## 25 JJPR -> NN 8
## 26 NN -> JJPR 8
## 27 PASS -> JJPR 8
## 28 VBD -> PEAS 8
## 29 VBN -> PEAS 8
## 30 VPRT -> NN 8
## 31 VPRT -> VB 8
## 32 VPRT -> VIMP 8
## 33 YNQU -> NULL 8
## 34 BEMA -> NULL 7
## 35 FW -> NN 7
## 36 FW -> OCR 7
## 37 NN -> CD 7
## 38 POS -> VPRT 7
## 39 VB -> VPRT 7
## 40 VBG -> NN 7
## 41 DOAUX -> ACT 6
## 42 IN -> RB 6
## 43 JJAT -> RB 6
## 44 THRC -> DEMO 6
## 45 WHSC -> WHQU 6
## 46 EMPH -> SO 5
## 47 NN -> DMA 5
## 48 VIMP -> VB 5
## 49 NN -> PROG 4
## 50 NN -> RB 4
## 51 RB -> JJPR 4
## 52 STPR -> NULL 4
## 53 VB -> LIKE 4
## 54 VBG -> JJAT 4
## 55 VBN -> PASS 4
## 56 VIMP -> NN 4
## 57 VIMP -> VPRT 4
## 58 ACT -> DOAUX 3
## 59 EX -> PLACE 3
## 60 JJAT -> DMA 3
## 61 JJAT -> JJPR 3
## 62 NN -> CC 3
## 63 PASS -> PEAS 3
## 64 PROG -> JJPR 3
## 65 RB -> JJAT 3
## 66 VB -> JJAT 3
## 67 VB -> PEAS 3
## 68 VBD -> JJPR 3
## 69 VBD -> PASS 3
## 70 VBD -> VBN 3
## 71 VPRT -> MDNE 3
## 72 CD -> FPP1S 2
## 73 DMA -> XX0 2
## 74 EMPH -> TIME 2
## 75 FW -> IN 2
## 76 IN -> CC 2
## 77 IN -> CUZ 2
## 78 IN -> TIME 2
## 79 JJ -> NN 2
## 80 JJ -> OCR 2
## 81 JJAT -> FW 2
## 82 JJAT -> VBG 2
## 83 JJPR -> DMA 2
## 84 LIKE -> VPRT 2
## 85 MDWO -> VBD 2
## 86 NCOMP -> OCR 2
## 87 NN -> DT 2
## 88 NN -> FPP1S 2
## 89 NN -> IN 2
## 90 NN -> JPRED 2
## 91 NN -> MDNE 2
## 92 NN -> NULL 2
## 93 NN -> PIT 2
## 94 NN -> VBD 2
## 95 PEAS -> VBD 2
## 96 QUAN -> NN 2
## 97 RB -> DMA 2
## 98 RP -> RB 2
## 99 STPR -> OCR 2
## 100 SYM -> IN 2
## 101 TIME -> CONC 2
## 102 VBD -> VB 2
## 103 VPRT -> LIKE 2
## 104 VPRT -> VBD 2
## 105 ACT -> COMM 1
## 106 ACT -> MENTAL 1
## 107 ACT -> NCOMP 1
## 108 AMP -> RB 1
## 109 ASPECT -> NULL 1
## 110 CAUSE -> NULL 1
## 111 CC -> DT 1
## 112 CC -> IN 1
## 113 CD -> PRP 1
## 114 COMM -> NCOMP 1
## 115 CONC -> TIME 1
## 116 CONT -> BEMA 1
## 117 CONT -> NULL 1
## 118 DEMO -> THRC 1
## 119 DEMO -> THSC 1
## 120 DMA -> NULL 1
## 121 DMA -> RB 1
## 122 DOAUX -> NULL 1
## 123 DWNT -> QUAN 1
## 124 EMPH -> NULL 1
## 125 EXIST -> NULL 1
## 126 FW -> CC 1
## 127 FW -> NULL 1
## 128 FW -> VBG 1
## 129 HGOT -> PEAS 1
## 130 IN -> DMA 1
## 131 IN -> HDG 1
## 132 IN -> PLACE 1
## 133 JJ -> JPRED 1
## 134 JJ -> RB 1
## 135 JJAT -> ELAB 1
## 136 JJAT -> PGET 1
## 137 JJAT -> QUAN 1
## 138 JJAT -> VPRT 1
## 139 JJPR -> IN 1
## 140 JJPR -> PLACE 1
## 141 JJPR -> VB 1
## 142 JJPR -> VBG 1
## 143 JPRED -> PLACE 1
## 144 LIKE -> VB 1
## 145 NCOMP -> ASPECT 1
## 146 NN -> FPUH 1
## 147 NN -> FW 1
## 148 NN -> HDG 1
## 149 NN -> JJ 1
## 150 NN -> MDMM 1
## 151 NN -> PASS 1
## 152 NN -> PEAS 1
## 153 NN -> PGET 1
## 154 NN -> POS 1
## 155 NN -> QUAN 1
## 156 NN -> SYM 1
## 157 OCCUR -> ACT 1
## 158 OCCUR -> NULL 1
## 159 PASS -> JPRED 1
## 160 PEAS -> VBN 1
## 161 PGET -> JJAT 1
## 162 PLACE -> EMPH 1
## 163 PLACE -> IN 1
## 164 PLACE -> JJAT 1
## 165 PLACE -> RP 1
## 166 POS -> `` 1
## 167 PROG -> NN 1
## 168 PRP -> PIT 1
## 169 QUAN -> AMP 1
## 170 QUAN -> EMPH 1
## 171 QUAN -> OCR 1
## 172 RB -> IN 1
## 173 RB -> NULL 1
## 174 RB -> RP 1
## 175 SO -> CUZ 1
## 176 SO -> EMPH 1
## 177 THSC -> RB 1
## 178 TIME -> CC 1
## 179 USEDTO -> PASS 1
## 180 VB -> DEMO 1
## 181 VB -> MDNE 1
## 182 VB -> OCR 1
## 183 VB -> VBD 1
## 184 VBD -> HGOT 1
## 185 VBD -> JJAT 1
## 186 VBD -> MDNE 1
## 187 VBD -> MDWO 1
## 188 VBG -> GTO 1
## 189 VBG -> JJ 1
## 190 VBG -> JJPR 1
## 191 VBN -> NN 1
## 192 VBN -> PGET 1
## 193 VBN -> VB 1
## 194 VBN -> VIMP 1
## 195 VPRT -> JJPR 1
## 196 VPRT -> POS 1
## 197 WHQU -> NN 1
## 198 WHSC -> DMA 1
# List every THSC tag that was corrected to THRC (up to 30 rows are printed)
errors %>%
  filter(Error == "THSC -> THRC") %>%
  select(c(FileID, Output, Tag, TagGold)) %>%
  print(n = 30)
## # A tibble: 11 × 4
## FileID Output Tag TagGold
## <fct> <fct> <fct> <fct>
## 1 HT_5_Poetry_0001 That_THSC THSC THRC
## 2 New_GreenLine_5_Personal_0003 that_THSC THSC THRC
## 3 New_GreenLine_5_Personal_0003 that_THSC THSC THRC
## 4 BNCBReg750 that_THSC THSC THRC
## 5 BNCBReg750 that_THSC THSC THRC
## 6 BNCBSer486 that_THSC THSC THRC
## 7 BNCBSer562 that_THSC THSC THRC
## 8 S3AV that_THSC THSC THRC
## 9 S3AV that_THSC THSC THRC
## 10 SVLK that_THSC THSC THRC
## 11 SVLK that_THSC THSC THRC
# NN tags corrected to JJAT, restricted to tokens that contain at least one
# capital letter (A-Z) followed by a further character
errors %>%
  filter(
    Error == "NN -> JJAT",
    grepl(pattern = "[A-Z]+.", x = Token)
  ) %>%
  select(-c(Output, Corpus, Tag, TagGold)) %>%
  print.data.frame()
## FileID Register Token Evaluation Error
## 1 BNCBEFor32 internet Intermediate FALSE NN -> JJAT
## 2 BNCBMass16 news FINAL FALSE NN -> JJAT
## 3 BNCBMass16 news Big FALSE NN -> JJAT
## 4 BNCBReg111 news Scottish FALSE NN -> JJAT
## 5 BNCBReg111 news Scottish FALSE NN -> JJAT
## 6 BNCBReg111 news Mental FALSE NN -> JJAT
## 7 BNCBReg111 news Scottish FALSE NN -> JJAT
## 8 BNCBReg111 news Central FALSE NN -> JJAT
## 9 BNCBReg750 news English FALSE NN -> JJAT
## 10 BNCBReg750 news Natural FALSE NN -> JJAT
## 11 BNCBReg750 news European FALSE NN -> JJAT
## 12 BNCBReg750 news Christian FALSE NN -> JJAT
## 13 BNCBReg750 news Social FALSE NN -> JJAT
## 14 BNCBReg750 news Common FALSE NN -> JJAT
## 15 BNCBSer486 news Northern FALSE NN -> JJAT
## 16 BNCBSer486 news Northern FALSE NN -> JJAT
## 17 BNCBSer486 news Northern FALSE NN -> JJAT
## 18 BNCBSer562 news United FALSE NN -> JJAT
## 19 BNCBSer562 news White FALSE NN -> JJAT
## 20 BNCBSer562 news Untold FALSE NN -> JJAT
## 21 BNCBSer562 news New FALSE NN -> JJAT
## 22 SEL5 spoken Black FALSE NN -> JJAT
# Tokens involved in NN <-> VB and NN <-> VPRT confusions (either direction),
# most frequent first
errors %>%
  filter(Error %in% c("NN -> VB", "VB -> NN", "NN -> VPRT", "VPRT -> NN")) %>%
  count(Token, sort = TRUE) %>%
  print.data.frame()
## Token n
## 1 mince 5
## 2 build 4
## 3 win 4
## 4 hunt 3
## 5 wags 3
## 6 throw 2
## 7 look 2
## 8 swamp 2
## 9 stop 2
## 10 defeats 2
## 11 fight 1
## 12 go 1
## 13 prize 1
## 14 Fly 1
## 15 Have 1
## 16 rule 1
## 17 run 1
## 18 chalk 1
## 19 shoot 1
## 20 stir 1
## 21 visit 1
## 22 chat 1
## 23 checks 1
## 24 dispatch 1
## 25 Is 1
## 26 kiss 1
## 27 mean 1
## 28 climb 1
## 29 start 1
## 30 trails 1
## 31 Travel 1
## 32 glue 1
## 33 leak 1
## 34 leaks 1
## 35 shreds 1
## 36 sniff 1
## 37 balances 1
## 38 convict 1
## 39 panic 1
## 40 suits 1
## 41 crumble 1
## 42 SLEEPS 1
## 43 debate 1
## 44 question 1
## 45 thread 1
## 46 upgrade 1
## 47 hurdle 1
## 48 land 1
## 49 scramble 1
## 50 pocket 1
## 51 care 1
## 52 socialise 1
## 53 controls 1
## 54 talks 1
## 55 escapes 1
## 56 mate 1
## 57 tastes 1
## 58 jump 1
## 59 stroke 1
## 60 bang 1
## 61 rhymes 1
## 62 spice 1
# Tokens tagged ACT whose gold tag is NULL, most frequent first
errors %>%
  filter(Error == "ACT -> NULL") %>%
  count(Token, sort = TRUE) %>%
  print.data.frame()
## Token n
## 1 win 3
## 2 throw 2
## 3 lost 2
## 4 left 1
## 5 waiting 1
## 6 working 1
## 7 running 1
## 8 done 1
## 9 fixed 1
## 10 Play 1
## 11 reached 1
# packages.bib <- sapply(1:length(loadedNamespaces()), function(i)
# toBibtex(citation(loadedNamespaces()[i])))
# Write BibTeX entries for the currently attached packages, plus knitr, to
# packages.bib
knitr::write_bib(x = c(.packages(), "knitr"), file = "packages.bib")
# Report the R session details for this run
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] forcats_0.5.1 stringr_1.4.0 dplyr_1.0.7 purrr_0.3.4
## [5] readr_2.0.2 tidyr_1.1.4 tibble_3.1.6 tidyverse_1.3.0
## [9] readxl_1.3.1 paletteer_1.3.0 here_1.0.1 harrypotter_2.1.1
## [13] caret_6.0-86 ggplot2_3.3.5 lattice_0.20-41
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-152 fs_1.5.2 lubridate_1.7.10
## [4] httr_1.4.2 rprojroot_2.0.2 tools_4.0.3
## [7] backports_1.4.1 bslib_0.3.1 utf8_1.2.2
## [10] R6_2.5.1 rpart_4.1-15 DBI_1.1.1
## [13] colorspace_2.0-2 nnet_7.3-15 withr_2.4.3
## [16] tidyselect_1.1.1 gridExtra_2.3 compiler_4.0.3
## [19] cli_3.1.0 rvest_1.0.0 formatR_1.8
## [22] xml2_1.3.3 labeling_0.4.2 prismatic_1.0.0
## [25] sass_0.4.0 scales_1.1.1 digest_0.6.29
## [28] rmarkdown_2.11 pkgconfig_2.0.3 htmltools_0.5.2
## [31] highr_0.9 dbplyr_2.1.0 fastmap_1.1.0
## [34] rlang_0.4.12 rstudioapi_0.13 farver_2.1.0
## [37] jquerylib_0.1.4 generics_0.1.1 jsonlite_1.7.2
## [40] ModelMetrics_1.2.2.2 magrittr_2.0.1 Matrix_1.3-2
## [43] Rcpp_1.0.7 munsell_0.5.0 fansi_0.5.0
## [46] lifecycle_1.0.1 stringi_1.7.6 pROC_1.17.0.1
## [49] yaml_2.2.1 MASS_7.3-53.1 plyr_1.8.6
## [52] recipes_0.1.15 grid_4.0.3 crayon_1.4.2
## [55] haven_2.3.1 splines_4.0.3 hms_1.0.0
## [58] knitr_1.37 pillar_1.6.4 reshape2_1.4.4
## [61] codetools_0.2-18 stats4_4.0.3 reprex_1.0.0
## [64] glue_1.6.0 evaluate_0.14 data.table_1.14.2
## [67] modelr_0.1.8 vctrs_0.3.8 tzdb_0.1.2
## [70] foreach_1.5.1 cellranger_1.1.0 gtable_0.3.0
## [73] rematch2_2.1.2 assertthat_0.2.1 xfun_0.29
## [76] gower_0.2.2 prodlim_2019.11.13 broom_0.7.9
## [79] e1071_1.7-4 class_7.3-18 survival_3.2-7
## [82] timeDate_3043.102 iterators_1.0.13 lava_1.6.9
## [85] ellipsis_0.3.2 ipred_0.9-11