This script is part of the Online Appendix to my PhD thesis.

Please cite as: Le Foll, Elen. 2022. Textbook English: A Corpus-Based Analysis of the Language of EFL textbooks used in Secondary Schools in France, Germany and Spain. PhD thesis. Osnabrück University.

For more information, see: https://elenlefoll.github.io/TextbookEnglish/

Please note that the plot dimensions in this notebook have been optimised for the print version of the thesis.

Set-up

Built with R 4.0.3

Tagger evaluation on TEC data

Data import from evaluation files

These chunks import the data directly from the Excel files in which the manual tag checks and corrections were performed.

# Global knitr chunk options applied to every chunk of this notebook
knitr::opts_chunk$set(
  echo = TRUE,
  tidy = TRUE,
  message = FALSE,
  warning = FALSE,
  paged.print = TRUE,
  fig.width = 10
)

library(caret) # For computing confusion matrices
library(harrypotter) # Only for colour scheme
library(here) # For path management
library(paletteer) # For nice colours
library(readxl) # For the direct import of Excel files
library(tidyverse) # For everything else!
# Helper shared by importEval3() and importEval4(): reshapes one evaluation spreadsheet from its
# wide layout (one Tag<i>/Tag<i>Gold column pair per tag slot) into a long layout with one row
# per token and tag slot.
#
# Arguments:
#   file      Data frame read from an evaluation Excel file. Must contain the columns Output and
#             Tokens plus, for every i in 1..nTags, the columns Tag<i> and Tag<i>Gold.
#   fileID, register, corpus
#             Values stored in the FileID, Register and Corpus columns of every row.
#   nTags     Number of Tag<i>/Tag<i>Gold column pairs to stack.
#
# Returns the stacked data with the columns FileID, Corpus, Register, Output, Token, Tag, TagGold
# and Evaluation. Factor columns are returned as character vectors with all spaces removed; the
# calling function applies its final NA filter and converts the character columns to factors.
.stackEvalTags <- function(file, fileID, register, corpus, nTags) {
  annotated <- file %>%
    add_column(FileID = fileID, Register = register, Corpus = corpus)

  perSlot <- lapply(seq_len(nTags), function(i) {
    tagCol <- paste0("Tag", i)
    goldCol <- paste0("Tag", i, "Gold")

    annotated %>%
      select(FileID, Corpus, Register, Output, Token = Tokens,
             Tag = all_of(tagCol), TagGold = all_of(goldCol)) %>%
      # Evaluation is TRUE where no gold tag was entered for this slot
      mutate(Evaluation = is.na(TagGold)) %>%
      # Where no gold tag was entered, the original tag is used as the gold tag
      mutate(TagGold = ifelse(is.na(TagGold), as.character(Tag), as.character(TagGold))) %>%
      # Drop rows that have no tag in this slot
      filter(!is.na(Tag)) %>%
      mutate(across(where(is.character), as.factor))
  })

  # rbind() (rather than bind_rows()) is kept on purpose so that the pieces are combined exactly
  # as before. The whitespace removal applies to every factor column, i.e. also to FileID,
  # Register and Corpus when these were supplied as character strings.
  do.call(rbind, perSlot) %>%
    mutate(across(where(is.factor), function(x) str_remove_all(x, pattern = fixed(" ")))) # Removes all white spaces which are found in the excel files
}

# Function to import and wrangle the evaluation data from the Excel files in which the manual evaluation was conducted
# (files with three tag columns: Tag1 to Tag3). Rows without an Output value are dropped.
# Returns one row per token and tag slot; all character columns are converted to factors.
importEval3 <- function(file, fileID, register, corpus) {
  .stackEvalTags(file, fileID, register, corpus, nTags = 3) %>%
    filter(!is.na(Output)) %>%
    mutate(across(where(is.character), as.factor))
}

# Second function to import and wrangle the evaluation data for Excel files with four tag columns as opposed to three
# Returns one row per token and tag slot; all character columns are converted to factors.
importEval4 <- function(file, fileID, register, corpus) {
  .stackEvalTags(file, fileID, register, corpus, nTags = 4) %>%
    # Rows with a missing Tag have already been dropped per tag slot, so this filter is not
    # expected to remove anything; unlike importEval3(), rows with a missing Output are kept.
    # NOTE(review): confirm whether Output was intended here. Left unchanged so that the row
    # counts reported in this notebook stay reproducible.
    filter(!is.na(Tag)) %>%
    mutate(across(where(is.character), as.factor))
}

# Function to decide which of the two import functions should be used for a given file:
# importEval4() if the file has a Tag4 column with at least one non-missing value,
# importEval3() otherwise.
#
# Arguments:
#   file      Data frame read from an evaluation Excel file.
#   fileID, register, corpus
#             Passed on unchanged to the selected import function.
#
# Returns whatever the selected import function returns.
importEval <- function(file, fileID, register, corpus) {
  # Checking for the column by name first avoids the warning that `file$Tag4` triggers on a
  # tibble without a Tag4 column
  hasFourthTag <- "Tag4" %in% names(file) && any(!is.na(file[["Tag4"]]))

  if (hasFourthTag) {
    importEval4(file = file, fileID = fileID, register = register, corpus = corpus)
  } else {
    importEval3(file = file, fileID = fileID, register = register, corpus = corpus)
  }
}

# Import of the manually checked files: each Excel file is read and passed to importEval()
# together with its file ID, register and corpus labels.

# TEC samples
Solutions_Intermediate_Spoken_0032 <- here("MFTE", "Evaluation", "Solutions_Intermediate_Spoken_0032_Evaluation.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "Solutions_Intermediate_Spoken_0032", register = "Conversation", corpus = "TEC-Sp")

HT_5_Poetry_0001 <- here("MFTE", "Evaluation", "HT_5_Poetry_0001_Evaluation.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "HT_5_Poetry_0001", register = "Poetry", corpus = "TEC-Fr")

Achievers_A1_Informative_0006 <- here("MFTE", "Evaluation", "Achievers_A1_Informative_0006_Evaluation.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "Achievers_A1_Informative_0006", register = "Informative", corpus = "TEC-Sp")

New_GreenLine_5_Personal_0003 <- here("MFTE", "Evaluation", "New_GreenLine_5_Personal_0003_Evaluation.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "New_GreenLine_5_Personal_0003", register = "Personal communication", corpus = "TEC-Ger")

Piece_of_cake_3e_Instructional_0006 <- here("MFTE", "Evaluation", "Piece_of_cake_3e_Instructional_0006_Evaluation.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "Piece_of_cake_3e_Instructional_0006", register = "Instructional", corpus = "TEC-Fr")

Access_4_Narrative_0006 <- here("MFTE", "Evaluation", "Access_4_Narrative_0006_Evaluation.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "Access_4_Narrative_0006", register = "Fiction", corpus = "TEC-Ger")

# BNC2014 samples: fiction
BNCBFict_b2 <- here("MFTE", "Evaluation", "BNCBFict_b2.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "BNCBFict_b2", register = "fiction", corpus = "BNC2014")

BNCBFict_m54 <- here("MFTE", "Evaluation", "BNCBFict_m54.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "BNCBFict_m54", register = "fiction", corpus = "BNC2014")

BNCBFict_e27 <- here("MFTE", "Evaluation", "BNCBFict_e27.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "BNCBFict_e27", register = "fiction", corpus = "BNC2014")

# BNC2014 samples: news
BNCBMass16 <- here("MFTE", "Evaluation", "BNCBMass16.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "BNCBMass16", register = "news", corpus = "BNC2014")

BNCBMass23 <- here("MFTE", "Evaluation", "BNCBMass23.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "BNCBMass23", register = "news", corpus = "BNC2014")

BNCBReg111 <- here("MFTE", "Evaluation", "BNCBReg111.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "BNCBReg111", register = "news", corpus = "BNC2014")

BNCBReg750 <- here("MFTE", "Evaluation", "BNCBReg750.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "BNCBReg750", register = "news", corpus = "BNC2014")

BNCBSer486 <- here("MFTE", "Evaluation", "BNCBSer486.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "BNCBSer486", register = "news", corpus = "BNC2014")

BNCBSer562 <- here("MFTE", "Evaluation", "BNCBSer562.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "BNCBSer562", register = "news", corpus = "BNC2014")

# BNC2014 samples: internet
BNCBEBl8 <- here("MFTE", "Evaluation", "BNCBEBl8.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "BNCBEBl8", register = "internet", corpus = "BNC2014")

BNCBEFor32 <- here("MFTE", "Evaluation", "BNCBEFor32.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "BNCBEFor32", register = "internet", corpus = "BNC2014")

# BNC2014 samples: spoken
S2DD <- here("MFTE", "Evaluation", "S2DD.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "S2DD", register = "spoken", corpus = "BNC2014")

S3AV <- here("MFTE", "Evaluation", "S3AV.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "S3AV", register = "spoken", corpus = "BNC2014")

SEL5 <- here("MFTE", "Evaluation", "SEL5.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "SEL5", register = "spoken", corpus = "BNC2014")

SVLK <- here("MFTE", "Evaluation", "SVLK.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "SVLK", register = "spoken", corpus = "BNC2014")

SZXQ <- here("MFTE", "Evaluation", "SZXQ.xlsx") %>%
  read_excel() %>%
  importEval(fileID = "SZXQ", register = "spoken", corpus = "BNC2014")

# Combine all evaluated files into one table; the argument order is kept unchanged because it
# determines the order of the rows
TaggerEval <- rbind(
  Solutions_Intermediate_Spoken_0032,
  HT_5_Poetry_0001,
  Achievers_A1_Informative_0006,
  New_GreenLine_5_Personal_0003,
  Piece_of_cake_3e_Instructional_0006,
  Access_4_Narrative_0006,
  BNCBEBl8,
  BNCBFict_b2,
  BNCBFict_m54,
  BNCBFict_e27,
  BNCBEFor32,
  BNCBMass16,
  BNCBMass23,
  BNCBReg111,
  BNCBReg750,
  BNCBSer486,
  BNCBSer562,
  S2DD,
  S3AV,
  SEL5,
  SVLK,
  SZXQ
)

summary(TaggerEval)
##           FileID          Corpus               Register        Output     
##  BNCBFict_b2 : 2621   TEC-Sp : 1042   fiction      :6500   ._.    : 1156  
##  BNCBFict_e27: 2104   TEC-Fr : 2058   news         :6312   the_DT :  820  
##  BNCBFict_m54: 1775   TEC-Ger: 1415   spoken       :6047   ,_,    :  720  
##  BNCBMass16  : 1619   BNC2014:20718   internet     :1859   a_DT   :  466  
##  SEL5        : 1463                   Instructional:1048   of_IN  :  328  
##  BNCBEFor32  : 1305                   Poetry       :1010   (Other):21742  
##  (Other)     :14346                   (Other)      :2457   NA's   :    1  
##      Token            Tag           TagGold      Evaluation     
##  .      : 1156   NN     : 4415   NN     : 4328   Mode :logical  
##  the    :  820   IN     : 1810   IN     : 1773   FALSE:930      
##  ,      :  720   DT     : 1454   DT     : 1457   TRUE :24303    
##  to     :  495   .      : 1367   .      : 1367                  
##  's     :  493   VPRT   : 1044   VPRT   : 1054                  
##  (Other):21547   VBD    :  899   VBD    :  895                  
##  NA's   :    2   (Other):14244   (Other):14359

Implement changes made to MFTE since evaluation

# Apply the tag changes made to the MFTE since the evaluation to both the assigned tags and the
# gold tags: PHC becomes CC, QLIKE becomes LIKE and TO becomes IN. All other values (including
# NA) are left as they are.
TaggerEval <- TaggerEval %>%
  mutate(across(c(Tag, TagGold), function(tags) {
    tags <- as.character(tags)
    tags[tags %in% "PHC"] <- "CC"
    tags[tags %in% "QLIKE"] <- "LIKE"
    tags[tags %in% "TO"] <- "IN"
    tags
  })) %>%
  # Tag and TagGold are character vectors at this point: turn them back into factors
  mutate(across(where(is.character), as.factor)) %>%
  # Recompute the evaluation from the updated tags: TRUE where the tag matches the gold tag
  mutate(Evaluation = as.character(Tag) == as.character(TagGold))

head(TaggerEval)  # Check sanity of data
## # A tibble: 6 × 8
##   FileID            Corpus Register   Output    Token   Tag   TagGold Evaluation
##   <fct>             <fct>  <fct>      <fct>     <fct>   <fct> <fct>   <lgl>     
## 1 Solutions_Interm… TEC-Sp Conversat… Intervie… Interv… NN    NN      TRUE      
## 2 Solutions_Interm… TEC-Sp Conversat… In_IN     In      IN    IN      TRUE      
## 3 Solutions_Interm… TEC-Sp Conversat… this_DEMO this    DEMO  DEMO    TRUE      
## 4 Solutions_Interm… TEC-Sp Conversat… part_NN   part    NN    NN      TRUE      
## 5 Solutions_Interm… TEC-Sp Conversat… of_IN     of      IN    IN      TRUE      
## 6 Solutions_Interm… TEC-Sp Conversat… the_DT    the     DT    DT      TRUE
summary(TaggerEval)  # Check sanity of data
##           FileID          Corpus               Register        Output     
##  BNCBFict_b2 : 2621   TEC-Sp : 1042   fiction      :6500   ._.    : 1156  
##  BNCBFict_e27: 2104   TEC-Fr : 2058   news         :6312   the_DT :  820  
##  BNCBFict_m54: 1775   TEC-Ger: 1415   spoken       :6047   ,_,    :  720  
##  BNCBMass16  : 1619   BNC2014:20718   internet     :1859   a_DT   :  466  
##  SEL5        : 1463                   Instructional:1048   of_IN  :  328  
##  BNCBEFor32  : 1305                   Poetry       :1010   (Other):21742  
##  (Other)     :14346                   (Other)      :2457   NA's   :    1  
##      Token            Tag           TagGold      Evaluation     
##  .      : 1156   NN     : 4415   NN     : 4328   Mode :logical  
##  the    :  820   IN     : 2145   IN     : 2113   FALSE:832      
##  ,      :  720   DT     : 1454   DT     : 1457   TRUE :24401    
##  to     :  495   .      : 1367   .      : 1367                  
##  's     :  493   VPRT   : 1044   VPRT   : 1054                  
##  (Other):21547   VBD    :  899   VBD    :  895                  
##  NA's   :    2   (Other):13909   (Other):14019
# saveRDS(TaggerEval, here('MFTE', 'Evaluation',
# 'MFTE_PhD_Evaluation_Results.rds')) # Last saved 10 Nov 2021

# write.csv(TaggerEval, here('MFTE', 'Evaluation',
# 'MFTE_PhD_Evaluation_Results.csv')) # Last saved 10 Nov 2021

Quick data import

# Load the evaluation results from the .rds file instead of re-importing the Excel files
TaggerEval <- here("MFTE", "Evaluation", "MFTE_PhD_Evaluation_Results.rds") %>%
  readRDS()

summary(TaggerEval)
##           FileID          Corpus               Register        Output     
##  BNCBFict_b2 : 2621   TEC-Sp : 1042   fiction      :6500   ._.    : 1156  
##  BNCBFict_e27: 2104   TEC-Fr : 2058   news         :6312   the_DT :  820  
##  BNCBFict_m54: 1775   TEC-Ger: 1415   spoken       :6047   ,_,    :  720  
##  BNCBMass16  : 1619   BNC2014:20718   internet     :1859   a_DT   :  466  
##  SEL5        : 1463                   Instructional:1048   of_IN  :  328  
##  BNCBEFor32  : 1305                   Poetry       :1010   (Other):21742  
##  (Other)     :14346                   (Other)      :2457   NA's   :    1  
##      Token            Tag           TagGold      Evaluation     
##  .      : 1156   NN     : 4415   NN     : 4328   Mode :logical  
##  the    :  820   IN     : 2145   IN     : 2113   FALSE:832      
##  ,      :  720   DT     : 1454   DT     : 1457   TRUE :24401    
##  to     :  495   .      : 1367   .      : 1367                  
##  's     :  493   VPRT   : 1044   VPRT   : 1054                  
##  (Other):21547   VBD    :  899   VBD    :  895                  
##  NA's   :    2   (Other):13909   (Other):14019

Estimating MFTE accuracy for TEC only

In this chunk, I calculate the recall and precision rates of each feature, ignoring unclear tokens and all punctuation and symbols.

# Total number of TEC tags manually checked
TaggerEval %>%
  filter(Corpus %in% c("TEC-Fr", "TEC-Ger", "TEC-Sp")) %>%
  nrow()
## [1] 4515
# Number of UNCLEAR evaluation tags
TaggerEval %>%
  filter(Corpus %in% c("TEC-Fr", "TEC-Ger", "TEC-Sp")) %>%
  filter(TagGold == "UNCLEAR") %>%
  nrow() # 0 in TEC sample
## [1] 0
data <- TaggerEval %>%
  filter(Corpus %in% c("TEC-Fr", "TEC-Ger", "TEC-Sp")) %>%
  filter(TagGold != "UNCLEAR") %>%
  # Remove punctuation tags which are uninteresting here: each row is kept only if its tag
  # consists entirely of capital letters and digits (this also removes the `` tag).
  filter(str_detect(as.character(Tag), "^[A-Z0-9]+$")) %>%
  filter(Tag != "SYM") %>%
  droplevels() %>%
  # Ensure that the factor levels are the same for the next caret operation: Tag receives the
  # union of both sets of levels, TagGold is then given exactly the same levels
  mutate(Tag = factor(Tag, levels = union(levels(Tag), levels(TagGold)))) %>%
  mutate(TagGold = factor(TagGold, levels = levels(Tag)))

# Spot gold tag corrections that are not actually errors (should return zero rows if all is well)
data %>%
  filter(Tag == TagGold, !Evaluation) %>%
  as.data.frame()
## [1] FileID     Corpus     Register   Output     Token      Tag        TagGold   
## [8] Evaluation
## <0 rows> (or 0-length row.names)
# Total number of false tags
summary(data$Evaluation)
##    Mode   FALSE    TRUE 
## logical     114    3831
cm <- caret::confusionMatrix(data$Tag, data$TagGold) # Create confusion matrix (assigned tags against gold tags)
cm$overall # Note that this is not very representative because it includes tags which are not intended for use in the MDA studies, e.g., LS and FW.
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
##      0.9711027      0.9688555      0.9653870      0.9761054      0.2035488 
## AccuracyPValue  McnemarPValue 
##      0.0000000            NaN
# Accuracy metrics per feature: recall, precision and f1 (columns selected by name rather than by position)
cm$byClass[, c("Precision", "Recall", "F1")]
##               Precision     Recall        F1
## Class: ABLE   1.0000000 1.00000000 1.0000000
## Class: ACT    0.9672131 0.98333333 0.9752066
## Class: AMP    1.0000000 1.00000000 1.0000000
## Class: ASPECT 1.0000000 1.00000000 1.0000000
## Class: BEMA   1.0000000 1.00000000 1.0000000
## Class: CAUSE  1.0000000 1.00000000 1.0000000
## Class: CC     1.0000000 0.99115044 0.9955556
## Class: CD     0.9545455 0.95454545 0.9545455
## Class: COMM   1.0000000 0.97727273 0.9885057
## Class: COND   1.0000000 1.00000000 1.0000000
## Class: CONT   1.0000000 1.00000000 1.0000000
## Class: CUZ    1.0000000 1.00000000 1.0000000
## Class: DEMO   0.9655172 0.96551724 0.9655172
## Class: DMA    1.0000000 1.00000000 1.0000000
## Class: DOAUX  0.8571429 1.00000000 0.9230769
## Class: DT     1.0000000 1.00000000 1.0000000
## Class: DWNT   0.6666667 1.00000000 0.8000000
## Class: ELAB   1.0000000 1.00000000 1.0000000
## Class: EMPH   0.8285714 1.00000000 0.9062500
## Class: EX     1.0000000 1.00000000 1.0000000
## Class: EXIST  1.0000000 1.00000000 1.0000000
## Class: FPP1P  1.0000000 1.00000000 1.0000000
## Class: FPP1S  1.0000000 1.00000000 1.0000000
## Class: FPUH   1.0000000 1.00000000 1.0000000
## Class: FREQ   1.0000000 1.00000000 1.0000000
## Class: FW     0.1000000 1.00000000 0.1818182
## Class: GTO    1.0000000 1.00000000 1.0000000
## Class: HDG    1.0000000 1.00000000 1.0000000
## Class: HGOT   1.0000000 1.00000000 1.0000000
## Class: IN     1.0000000 0.99731903 0.9986577
## Class: JJ     0.9555556 0.98473282 0.9699248
## Class: JPRED  0.9736842 0.90243902 0.9367089
## Class: LIKE   0.8333333 1.00000000 0.9090909
## Class: MDCA   1.0000000 1.00000000 1.0000000
## Class: MDCO   1.0000000 1.00000000 1.0000000
## Class: MDMM   1.0000000 0.66666667 0.8000000
## Class: MDNE   1.0000000 0.80000000 0.8888889
## Class: MDWO   1.0000000 1.00000000 1.0000000
## Class: MDWS   1.0000000 1.00000000 1.0000000
## Class: MENTAL 0.9892473 0.98924731 0.9892473
## Class: NCOMP  0.8800000 1.00000000 0.9361702
## Class: NN     0.9508393 0.98754670 0.9688454
## Class: NULL   1.0000000 0.07692308 0.1428571
## Class: OCCUR  0.9444444 1.00000000 0.9714286
## Class: PASS   0.8888889 0.88888889 0.8888889
## Class: PEAS   1.0000000 0.86666667 0.9285714
## Class: PGET   1.0000000 1.00000000 1.0000000
## Class: PIT    1.0000000 1.00000000 1.0000000
## Class: PLACE  1.0000000 0.83333333 0.9090909
## Class: POLITE 1.0000000 1.00000000 1.0000000
## Class: POS    1.0000000 1.00000000 1.0000000
## Class: PROG   1.0000000 0.89473684 0.9444444
## Class: QUAN   0.9622642 0.98076923 0.9714286
## Class: QUPR   1.0000000 1.00000000 1.0000000
## Class: RB     1.0000000 0.98571429 0.9928058
## Class: RP     1.0000000 1.00000000 1.0000000
## Class: SO     1.0000000 0.63636364 0.7777778
## Class: SPLIT  1.0000000 1.00000000 1.0000000
## Class: SPP2   1.0000000 1.00000000 1.0000000
## Class: STPR   0.6000000 1.00000000 0.7500000
## Class: THATD  0.8571429 1.00000000 0.9230769
## Class: THRC   1.0000000 0.71428571 0.8333333
## Class: THSC   0.6923077 1.00000000 0.8181818
## Class: TIME   1.0000000 0.96774194 0.9836066
## Class: TPP3P  1.0000000 1.00000000 1.0000000
## Class: TPP3S  1.0000000 1.00000000 1.0000000
## Class: VB     0.9448819 0.93750000 0.9411765
## Class: VBD    0.9733333 0.98648649 0.9798658
## Class: VBG    0.9642857 1.00000000 0.9818182
## Class: VBN    0.8461538 0.91666667 0.8800000
## Class: VIMP   0.9868421 0.88235294 0.9316770
## Class: VPRT   0.9796954 0.97969543 0.9796954
## Class: WHQU   0.9677419 1.00000000 0.9836066
## Class: WHSC   1.0000000 0.97058824 0.9850746
## Class: XX0    1.0000000 1.00000000 1.0000000
## Class: YNQU   1.0000000 1.00000000 1.0000000
## Class: OCR           NA 0.00000000        NA

MFTE accuracy for reference corpora or comparable

Conversation

# Number of UNCLEAR evaluation tags
TaggerEval %>%
  filter(Register == "spoken") %>%
  filter(TagGold == "UNCLEAR") %>%
  nrow() # 7 in Spoken BNC2014 sample
## [1] 7
data <- TaggerEval %>%
  filter(Register == "spoken") %>%
  filter(TagGold != "UNCLEAR") %>%
  # Remove all punctuation tags which are uninteresting here: each row is kept only if its tag
  # consists entirely of capital letters and digits.
  # NOTE(review): unlike the TEC, fiction and informative chunks, no SYM filter is applied
  # here; confirm that this is intentional.
  filter(str_detect(as.character(Tag), "^[A-Z0-9]+$")) %>%
  droplevels() %>%
  # Ensure that the factor levels are the same for the next caret operation: Tag receives the
  # union of both sets of levels, TagGold is then given exactly the same levels
  mutate(Tag = factor(Tag, levels = union(levels(Tag), levels(TagGold)))) %>%
  mutate(TagGold = factor(TagGold, levels = levels(Tag)))

# Spot gold tag corrections that are not actually errors (should return zero rows if all is well)
data %>%
  filter(Tag == TagGold, !Evaluation) %>%
  as.data.frame()
## [1] FileID     Corpus     Register   Output     Token      Tag        TagGold   
## [8] Evaluation
## <0 rows> (or 0-length row.names)
# Total number of false tags
summary(data$Evaluation)
##    Mode   FALSE    TRUE 
## logical     224    5388
cm <- caret::confusionMatrix(data$Tag, data$TagGold) # Create confusion matrix (assigned tags against gold tags)
cm$overall
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
##      0.9600855      0.9584079      0.9546300      0.9650557      0.1193870 
## AccuracyPValue  McnemarPValue 
##      0.0000000            NaN
# Accuracy metrics per feature: recall, precision and f1 (columns selected by name rather than by position)
cm$byClass[, c("Precision", "Recall", "F1")]
##               Precision    Recall        F1
## Class: ABLE   1.0000000 1.0000000 1.0000000
## Class: ACT    0.9831933 0.9831933 0.9831933
## Class: AMP    1.0000000 1.0000000 1.0000000
## Class: ASPECT 0.9285714 0.9285714 0.9285714
## Class: BEMA   0.9696970 1.0000000 0.9846154
## Class: CAUSE  0.8000000 1.0000000 0.8888889
## Class: CC     1.0000000 1.0000000 1.0000000
## Class: CD     0.7719298 0.9777778 0.8627451
## Class: COMM   1.0000000 1.0000000 1.0000000
## Class: CONC   1.0000000 1.0000000 1.0000000
## Class: COND   1.0000000 1.0000000 1.0000000
## Class: CONT   1.0000000 1.0000000 1.0000000
## Class: CUZ    1.0000000 0.9375000 0.9677419
## Class: DEMO   1.0000000 0.7702703 0.8702290
## Class: DMA    1.0000000 0.9426752 0.9704918
## Class: DOAUX  0.9629630 0.9811321 0.9719626
## Class: DT     1.0000000 0.9960784 0.9980354
## Class: ELAB   1.0000000 1.0000000 1.0000000
## Class: EMPH   0.9864865 0.9864865 0.9864865
## Class: EX     0.8695652 1.0000000 0.9302326
## Class: EXIST  1.0000000 1.0000000 1.0000000
## Class: FPP1P  1.0000000 1.0000000 1.0000000
## Class: FPP1S  1.0000000 0.9797980 0.9897959
## Class: FPUH   1.0000000 1.0000000 1.0000000
## Class: FREQ   1.0000000 1.0000000 1.0000000
## Class: FW     0.0000000        NA        NA
## Class: GTO    1.0000000 0.9166667 0.9565217
## Class: HDG    1.0000000 0.9473684 0.9729730
## Class: HGOT   0.8333333 0.8333333 0.8333333
## Class: IN     0.9565217 0.9924812 0.9741697
## Class: JJAT   0.9292035 0.9459459 0.9375000
## Class: JJPR   0.8795181 0.9240506 0.9012346
## Class: LIKE   0.9838710 0.9104478 0.9457364
## Class: MDCA   1.0000000 1.0000000 1.0000000
## Class: MDCO   1.0000000 1.0000000 1.0000000
## Class: MDMM   1.0000000 1.0000000 1.0000000
## Class: MDNE   1.0000000 0.8500000 0.9189189
## Class: MDWO   1.0000000 0.9705882 0.9850746
## Class: MDWS   1.0000000 1.0000000 1.0000000
## Class: MENTAL 0.9230769 1.0000000 0.9600000
## Class: NCOMP  0.8070175 1.0000000 0.8932039
## Class: NN     0.9572901 0.9701493 0.9636768
## Class: OCCUR  1.0000000 1.0000000 1.0000000
## Class: PASS   0.8000000 1.0000000 0.8888889
## Class: PEAS   0.9487179 0.8809524 0.9135802
## Class: PGET   0.6666667 0.6666667 0.6666667
## Class: PIT    1.0000000 1.0000000 1.0000000
## Class: PLACE  0.9696970 0.9142857 0.9411765
## Class: POLITE 1.0000000 1.0000000 1.0000000
## Class: POS    0.5555556 1.0000000 0.7142857
## Class: PROG   1.0000000 0.8529412 0.9206349
## Class: QUAN   1.0000000 1.0000000 1.0000000
## Class: QUPR   1.0000000 1.0000000 1.0000000
## Class: QUTAG  1.0000000 1.0000000 1.0000000
## Class: RB     0.9698795 0.9200000 0.9442815
## Class: RP     0.9444444 0.7727273 0.8500000
## Class: SO     0.9583333 1.0000000 0.9787234
## Class: SPLIT  1.0000000 1.0000000 1.0000000
## Class: SPP2   1.0000000 1.0000000 1.0000000
## Class: STPR   1.0000000 1.0000000 1.0000000
## Class: THATD  0.6363636 1.0000000 0.7777778
## Class: THRC   0.6923077 0.6923077 0.6923077
## Class: THSC   0.3200000 1.0000000 0.4848485
## Class: TIME   1.0000000 0.9000000 0.9473684
## Class: TPP3P  1.0000000 1.0000000 1.0000000
## Class: TPP3S  1.0000000 1.0000000 1.0000000
## Class: USEDTO 1.0000000 1.0000000 1.0000000
## Class: VB     0.9397590 0.9435484 0.9416499
## Class: VBD    0.9831461 0.9722222 0.9776536
## Class: VBG    0.8604651 0.9736842 0.9135802
## Class: VBN    0.3636364 1.0000000 0.5333333
## Class: VIMP   0.6470588 0.5789474 0.6111111
## Class: VPRT   0.9775281 0.9586777 0.9680111
## Class: WHQU   0.6190476 0.9285714 0.7428571
## Class: WHSC   0.9846154 0.8888889 0.9343066
## Class: XX0    1.0000000 1.0000000 1.0000000
## Class: YNQU   0.8333333 1.0000000 0.9090909
## Class: NULL          NA 0.0000000        NA

Fiction

# Number of UNCLEAR evaluation tags
TaggerEval %>%
  filter(Register == "fiction") %>%
  filter(TagGold == "UNCLEAR") %>%
  nrow() # 0 in BNC2014 Baby+ fiction subsample
## [1] 0
data <- TaggerEval %>%
  filter(Register == "fiction") %>%
  filter(TagGold != "UNCLEAR") %>%
  # Remove all punctuation tags which are uninteresting here: each row is kept only if its tag
  # consists entirely of capital letters and digits (this also removes the `` tag).
  filter(str_detect(as.character(Tag), "^[A-Z0-9]+$")) %>%
  filter(Tag != "SYM") %>%
  droplevels() %>%
  # Ensure that the factor levels are the same for the next caret operation: Tag receives the
  # union of both sets of levels, TagGold is then given exactly the same levels
  mutate(Tag = factor(Tag, levels = union(levels(Tag), levels(TagGold)))) %>%
  mutate(TagGold = factor(TagGold, levels = levels(Tag)))

# Spot gold tag corrections that are not actually errors (should return zero rows if all is well)
data %>%
  filter(Tag == TagGold, !Evaluation) %>%
  as.data.frame()
## [1] FileID     Corpus     Register   Output     Token      Tag        TagGold   
## [8] Evaluation
## <0 rows> (or 0-length row.names)
# Total number of false tags
summary(data$Evaluation)
##    Mode   FALSE    TRUE 
## logical     168    5346
cm <- caret::confusionMatrix(data$Tag, data$TagGold) # Create confusion matrix (assigned tags against gold tags)
cm$overall
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
##      0.9695321      0.9674367      0.9646492      0.9739093      0.1904244 
## AccuracyPValue  McnemarPValue 
##      0.0000000            NaN
# Accuracy metrics per feature: recall, precision and f1 (columns selected by name rather than by position)
cm$byClass[, c("Precision", "Recall", "F1")]
##               Precision    Recall        F1
## Class: ACT    1.0000000 0.9916667 0.9958159
## Class: AMP    0.9473684 1.0000000 0.9729730
## Class: ASPECT 1.0000000 1.0000000 1.0000000
## Class: BEMA   0.9809524 1.0000000 0.9903846
## Class: CAUSE  1.0000000 1.0000000 1.0000000
## Class: CC     0.9948187 0.9795918 0.9871465
## Class: CD     1.0000000 0.9354839 0.9666667
## Class: COMM   0.9868421 1.0000000 0.9933775
## Class: CONC   1.0000000 1.0000000 1.0000000
## Class: COND   1.0000000 1.0000000 1.0000000
## Class: CONT   1.0000000 1.0000000 1.0000000
## Class: CUZ    1.0000000 1.0000000 1.0000000
## Class: DEMO   0.9791667 0.8867925 0.9306931
## Class: DMA    0.9230769 0.9230769 0.9230769
## Class: DOAUX  0.9523810 1.0000000 0.9756098
## Class: DT     1.0000000 1.0000000 1.0000000
## Class: DWNT   1.0000000 1.0000000 1.0000000
## Class: ELAB   1.0000000 0.7500000 0.8571429
## Class: EMPH   1.0000000 1.0000000 1.0000000
## Class: EX     1.0000000 1.0000000 1.0000000
## Class: EXIST  1.0000000 1.0000000 1.0000000
## Class: FPP1P  1.0000000 1.0000000 1.0000000
## Class: FPP1S  1.0000000 1.0000000 1.0000000
## Class: FPUH   1.0000000 1.0000000 1.0000000
## Class: FREQ   1.0000000 1.0000000 1.0000000
## Class: FW     0.0000000        NA        NA
## Class: GTO    1.0000000 1.0000000 1.0000000
## Class: HDG    1.0000000 1.0000000 1.0000000
## Class: HGOT   1.0000000 1.0000000 1.0000000
## Class: IN     0.9804688 0.9980119 0.9891626
## Class: JJAT   0.9072165 0.8979592 0.9025641
## Class: JJPR   0.8541667 0.9111111 0.8817204
## Class: LIKE   0.9411765 1.0000000 0.9696970
## Class: MDCA   1.0000000 1.0000000 1.0000000
## Class: MDCO   1.0000000 1.0000000 1.0000000
## Class: MDMM   1.0000000 1.0000000 1.0000000
## Class: MDNE   1.0000000 0.9375000 0.9677419
## Class: MDWO   0.9444444 1.0000000 0.9714286
## Class: MDWS   1.0000000 1.0000000 1.0000000
## Class: MENTAL 1.0000000 1.0000000 1.0000000
## Class: NCOMP  0.8823529 0.9836066 0.9302326
## Class: NN     0.9695238 0.9695238 0.9695238
## Class: OCCUR  0.9285714 1.0000000 0.9629630
## Class: PASS   0.9354839 0.9666667 0.9508197
## Class: PEAS   0.9772727 0.7962963 0.8775510
## Class: PIT    1.0000000 1.0000000 1.0000000
## Class: PLACE  1.0000000 0.9375000 0.9677419
## Class: POLITE 1.0000000 1.0000000 1.0000000
## Class: POS    0.9000000 1.0000000 0.9473684
## Class: PROG   0.9583333 0.8846154 0.9200000
## Class: QUAN   1.0000000 0.9692308 0.9843750
## Class: QUPR   1.0000000 1.0000000 1.0000000
## Class: QUTAG  1.0000000 1.0000000 1.0000000
## Class: RB     0.9865772 0.9423077 0.9639344
## Class: RP     1.0000000 0.7750000 0.8732394
## Class: SO     1.0000000 1.0000000 1.0000000
## Class: SPLIT  1.0000000 1.0000000 1.0000000
## Class: SPP2   1.0000000 1.0000000 1.0000000
## Class: STPR   1.0000000 1.0000000 1.0000000
## Class: THATD  0.7894737 1.0000000 0.8823529
## Class: THRC   0.7142857 1.0000000 0.8333333
## Class: THSC   0.8823529 0.9677419 0.9230769
## Class: TIME   0.9666667 1.0000000 0.9830508
## Class: TPP3P  1.0000000 1.0000000 1.0000000
## Class: TPP3S  1.0000000 1.0000000 1.0000000
## Class: VB     0.9377990 0.9751244 0.9560976
## Class: VBD    0.9801700 0.9829545 0.9815603
## Class: VBG    0.9120879 0.9540230 0.9325843
## Class: VBN    0.5806452 0.8571429 0.6923077
## Class: VIMP   0.7500000 0.7500000 0.7500000
## Class: VPRT   0.9781022 0.9370629 0.9571429
## Class: WHQU   0.8461538 1.0000000 0.9166667
## Class: WHSC   0.9818182 0.9818182 0.9818182
## Class: XX0    1.0000000 1.0000000 1.0000000
## Class: YNQU   0.8181818 1.0000000 0.9000000
## Class: NULL          NA 0.0000000        NA

Informative

# Number of files and tags included in this part of the evaluation (intended to match the ITTC data)
TaggerEval %>% 
  filter(Register == "news" | FileID %in% c("BNCBEFor32", "BNCBEBl8")) %>% 
  group_by(FileID) %>% 
  count() 
## # A tibble: 8 × 2
## # Groups:   FileID [8]
##   FileID         n
##   <fct>      <int>
## 1 BNCBEBl8     554
## 2 BNCBEFor32  1305
## 3 BNCBMass16  1619
## 4 BNCBMass23   268
## 5 BNCBReg111  1230
## 6 BNCBReg750  1275
## 7 BNCBSer486  1182
## 8 BNCBSer562   738
# Number of UNCLEAR evaluation tags
TaggerEval %>% 
  filter(Register == "news" | FileID %in% c("BNCBEFor32", "BNCBEBl8")) %>% 
  filter(TagGold == "UNCLEAR") %>% 
  nrow() # 8
## [1] 8
# Build the evaluation data for this subset: keep the subset's files, discard
# tokens whose gold tag is UNCLEAR, then discard punctuation and SYM/`` tags.
# NOTE(review): unlike the overall evaluation further below, gold tags SYM and
# `` are not removed here -- confirm that this difference is intended.
data <- TaggerEval %>%
  filter(
    Register == "news" | FileID %in% c("BNCBEFor32", "BNCBEBl8"),
    TagGold != "UNCLEAR"
  ) %>%
  # Keep only tags that consist solely of capital letters and digits, i.e.
  # remove all punctuation tags, which are uninteresting here.
  filter(Tag %in% c(str_extract(Tag, "[A-Z0-9]+"))) %>%
  filter(Tag != "SYM", Tag != "``") %>%
  droplevels() %>%
  # Give both factors the same set of levels for the next caret operation.
  mutate(
    TagGold = factor(TagGold, levels = union(levels(Tag), levels(TagGold))),
    Tag = factor(Tag, levels = levels(TagGold))
  )

# Sanity check: rows marked as errors although the assigned tag and the gold
# tag are identical (should return zero rows if all is well)
as.data.frame(data[data$Tag == data$TagGold & !data$Evaluation, ])
## [1] FileID     Corpus     Register   Output     Token      Tag        TagGold   
## [8] Evaluation
## <0 rows> (or 0-length row.names)
# Number of incorrect (FALSE) and correct (TRUE) tags in this subset
summary(data[["Evaluation"]])
##    Mode   FALSE    TRUE 
## logical     309    7113
# Confusion matrix: tagger output (data) against the gold tags (reference)
cm <- caret::confusionMatrix(data = data$Tag, reference = data$TagGold)
# Overall accuracy statistics
cm$overall
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
##      0.9583670      0.9542631      0.9535718      0.9627979      0.2431959 
## AccuracyPValue  McnemarPValue 
##      0.0000000            NaN
# Accuracy metrics per feature: precision, recall and f1.
# Columns are selected by name rather than by position so that the selection
# does not silently change if the layout of cm$byClass ever differs.
cm$byClass[, c("Precision", "Recall", "F1")]
##               Precision    Recall        F1
## Class: ACT    0.9210526 0.9887006 0.9536785
## Class: AMP    1.0000000 0.9375000 0.9677419
## Class: ASPECT 1.0000000 1.0000000 1.0000000
## Class: BEMA   0.9909910 0.9909910 0.9909910
## Class: CAUSE  1.0000000 1.0000000 1.0000000
## Class: CC     0.9960474 0.9921260 0.9940828
## Class: CD     0.9924242 0.9776119 0.9849624
## Class: COMM   1.0000000 1.0000000 1.0000000
## Class: CONC   0.9000000 0.8181818 0.8571429
## Class: COND   1.0000000 1.0000000 1.0000000
## Class: CONT   0.9642857 1.0000000 0.9818182
## Class: CUZ    1.0000000 0.9000000 0.9473684
## Class: DEMO   1.0000000 0.9607843 0.9800000
## Class: DMA    0.5000000 0.4000000 0.4444444
## Class: DOAUX  0.9200000 0.9200000 0.9200000
## Class: DT     1.0000000 0.9959184 0.9979550
## Class: DWNT   1.0000000 1.0000000 1.0000000
## Class: ELAB   1.0000000 1.0000000 1.0000000
## Class: EMPH   0.9761905 0.9534884 0.9647059
## Class: EX     1.0000000 1.0000000 1.0000000
## Class: EXIST  0.9642857 1.0000000 0.9818182
## Class: FPP1P  1.0000000 1.0000000 1.0000000
## Class: FPP1S  1.0000000 1.0000000 1.0000000
## Class: FPUH   1.0000000 0.6666667 0.8000000
## Class: FREQ   1.0000000 1.0000000 1.0000000
## Class: FW     0.2857143 0.4000000 0.3333333
## Class: GTO    1.0000000 1.0000000 1.0000000
## Class: HDG    1.0000000 1.0000000 1.0000000
## Class: IN     0.9857988 0.9964115 0.9910767
## Class: JJAT   0.9373134 0.8722222 0.9035971
## Class: JJPR   0.9195402 0.7407407 0.8205128
## Class: LIKE   1.0000000 1.0000000 1.0000000
## Class: MDCA   1.0000000 1.0000000 1.0000000
## Class: MDCO   1.0000000 1.0000000 1.0000000
## Class: MDMM   1.0000000 1.0000000 1.0000000
## Class: MDNE   1.0000000 0.9545455 0.9767442
## Class: MDWO   1.0000000 1.0000000 1.0000000
## Class: MDWS   1.0000000 1.0000000 1.0000000
## Class: MENTAL 0.9814815 1.0000000 0.9906542
## Class: NCOMP  0.9189189 0.9941520 0.9550562
## Class: NN     0.9566396 0.9778393 0.9671233
## Class: OCCUR  1.0000000 1.0000000 1.0000000
## Class: PASS   0.9240506 0.9240506 0.9240506
## Class: PEAS   1.0000000 0.9142857 0.9552239
## Class: PGET   1.0000000 0.6666667 0.8000000
## Class: PIT    1.0000000 0.9615385 0.9803922
## Class: PLACE  0.8636364 1.0000000 0.9268293
## Class: POLITE 1.0000000 1.0000000 1.0000000
## Class: POS    0.9777778 0.9565217 0.9670330
## Class: PROG   0.9210526 0.8750000 0.8974359
## Class: PRP    0.0000000 0.0000000       NaN
## Class: QUAN   0.9638554 1.0000000 0.9815951
## Class: QUPR   1.0000000 1.0000000 1.0000000
## Class: RB     0.9629630 0.9489051 0.9558824
## Class: RP     1.0000000 0.8181818 0.9000000
## Class: SO     1.0000000 0.8888889 0.9411765
## Class: SPLIT  1.0000000 1.0000000 1.0000000
## Class: SPP2   1.0000000 1.0000000 1.0000000
## Class: STPR   0.5000000 1.0000000 0.6666667
## Class: THATD  0.8461538 1.0000000 0.9166667
## Class: THRC   1.0000000 0.5000000 0.6666667
## Class: THSC   0.8500000 1.0000000 0.9189189
## Class: TIME   0.9512195 0.9750000 0.9629630
## Class: TPP3P  1.0000000 1.0000000 1.0000000
## Class: TPP3S  1.0000000 1.0000000 1.0000000
## Class: URL    1.0000000 1.0000000 1.0000000
## Class: USEDTO 0.0000000        NA        NA
## Class: VB     0.8988764 0.9302326 0.9142857
## Class: VBD    0.9587156 0.9720930 0.9653580
## Class: VBG    0.9099099 0.9099099 0.9099099
## Class: VBN    0.4150943 1.0000000 0.5866667
## Class: VIMP   0.7142857 0.3448276 0.4651163
## Class: VPRT   0.9488636 0.9515670 0.9502134
## Class: WHQU   1.0000000 0.4444444 0.6153846
## Class: WHSC   0.9500000 1.0000000 0.9743590
## Class: XX0    1.0000000 0.9736842 0.9866667
## Class: YNQU   0.0000000        NA        NA
## Class: ``            NA 0.0000000        NA
## Class: NULL          NA 0.0000000        NA
## Class: SYM           NA 0.0000000        NA

Estimating overall MFTE accuracy for corpora used in thesis

# Number of tags evaluated per file, largest files first
TaggerEval %>%
  count(FileID, sort = TRUE) %>%
  as.data.frame()
##                                 FileID    n
## 1                          BNCBFict_b2 2621
## 2                         BNCBFict_e27 2104
## 3                         BNCBFict_m54 1775
## 4                           BNCBMass16 1619
## 5                                 SEL5 1463
## 6                           BNCBEFor32 1305
## 7                           BNCBReg750 1275
## 8                           BNCBReg111 1230
## 9                                 SVLK 1222
## 10                          BNCBSer486 1182
## 11                                S2DD 1180
## 12                                S3AV 1126
## 13                                SZXQ 1056
## 14 Piece_of_cake_3e_Instructional_0006 1048
## 15                    HT_5_Poetry_0001 1010
## 16       New_GreenLine_5_Personal_0003  796
## 17                          BNCBSer562  738
## 18  Solutions_Intermediate_Spoken_0032  636
## 19             Access_4_Narrative_0006  619
## 20                            BNCBEBl8  554
## 21       Achievers_A1_Informative_0006  406
## 22                          BNCBMass23  268
# Number of tokens whose gold tag is UNCLEAR (across all evaluated files)
TaggerEval %>%
  filter(TagGold == "UNCLEAR") %>%
  nrow()
## [1] 15
# Tagger evaluation: number of incorrect (FALSE) and correct (TRUE) tags
summary(TaggerEval[["Evaluation"]])
##    Mode   FALSE    TRUE 
## logical     832   24401
# Build the evaluation data for all files: discard tokens whose gold tag is
# UNCLEAR, then discard punctuation tags and SYM/`` (assigned or gold).
data <- TaggerEval %>%
  filter(TagGold != "UNCLEAR") %>%
  # Keep only tags that consist solely of capital letters and digits, i.e.
  # remove all punctuation tags, which are uninteresting here.
  filter(Tag %in% c(str_extract(Tag, "[A-Z0-9]+"))) %>%
  filter(
    Tag != "SYM", Tag != "``",
    TagGold != "SYM", TagGold != "``"
  ) %>%
  droplevels() %>%
  # Give both factors the same set of levels for the next caret operation.
  mutate(
    TagGold = factor(TagGold, levels = union(levels(Tag), levels(TagGold))),
    Tag = factor(Tag, levels = levels(TagGold))
  )

# Confusion matrix: tagger output (data) against the gold tags (reference)
cm <- caret::confusionMatrix(data = data$Tag, reference = data$TagGold)
# Overall accuracy statistics
cm$overall
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
##      0.9638522      0.9613992      0.9613298      0.9662538      0.1924325 
## AccuracyPValue  McnemarPValue 
##      0.0000000            NaN
# Quick summary of results per feature: precision, recall and f1.
# Columns are selected by name rather than by position so that the selection
# does not silently change if the layout of cm$byClass ever differs.
cm$byClass[, c("Precision", "Recall", "F1")]
##               Precision      Recall         F1
## Class: ABLE   1.0000000 1.000000000 1.00000000
## Class: ACT    0.9618182 0.986940299 0.97421731
## Class: AMP    0.9795918 0.979591837 0.97959184
## Class: ASPECT 0.9852941 0.985294118 0.98529412
## Class: BEMA   0.9839080 0.997668998 0.99074074
## Class: CAUSE  0.9736842 1.000000000 0.98666667
## Class: CC     0.9973333 0.990728477 0.99401993
## Class: CD     0.9375000 0.969827586 0.95338983
## Class: COMM   0.9960317 0.996031746 0.99603175
## Class: CONC   0.9729730 0.947368421 0.96000000
## Class: COND   1.0000000 1.000000000 1.00000000
## Class: CONT   0.9957447 1.000000000 0.99786780
## Class: CUZ    1.0000000 0.944444444 0.97142857
## Class: DEMO   0.9890710 0.874396135 0.92820513
## Class: DMA    0.9796954 0.932367150 0.95544554
## Class: DOAUX  0.9385965 0.972727273 0.95535714
## Class: DT     1.0000000 0.997940975 0.99896943
## Class: DWNT   0.9375000 1.000000000 0.96774194
## Class: ELAB   1.0000000 0.909090909 0.95238095
## Class: EMPH   0.9550562 0.982658960 0.96866097
## Class: EX     0.9552239 1.000000000 0.97709924
## Class: EXIST  0.9848485 1.000000000 0.99236641
## Class: FPP1P  1.0000000 1.000000000 1.00000000
## Class: FPP1S  1.0000000 0.990719258 0.99533800
## Class: FPUH   1.0000000 0.992700730 0.99633700
## Class: FREQ   1.0000000 1.000000000 1.00000000
## Class: FW     0.1363636 0.500000000 0.21428571
## Class: GTO    1.0000000 0.964285714 0.98181818
## Class: HDG    1.0000000 0.966666667 0.98305085
## Class: HGOT   0.8750000 0.875000000 0.87500000
## Class: IN     0.9813346 0.996210327 0.98871650
## Class: JJ     0.9555556 0.984732824 0.96992481
## Class: JJAT   0.9267913 0.892053973 0.90909091
## Class: JJPR   0.8834586 0.848375451 0.86556169
## Class: JPRED  0.9736842 0.902439024 0.93670886
## Class: LIKE   0.9680851 0.938144330 0.95287958
## Class: MDCA   1.0000000 1.000000000 1.00000000
## Class: MDCO   1.0000000 1.000000000 1.00000000
## Class: MDMM   1.0000000 0.947368421 0.97297297
## Class: MDNE   1.0000000 0.897058824 0.94573643
## Class: MDWO   0.9791667 0.989473684 0.98429319
## Class: MDWS   1.0000000 1.000000000 1.00000000
## Class: MENTAL 0.9709443 0.997512438 0.98404908
## Class: NCOMP  0.8888889 0.993788820 0.93841642
## Class: NN     0.9589290 0.976432532 0.96760160
## Class: NULL   1.0000000 0.009009009 0.01785714
## Class: OCCUR  0.9607843 1.000000000 0.98000000
## Class: PASS   0.9130435 0.933333333 0.92307692
## Class: PEAS   0.9812500 0.867403315 0.92082111
## Class: PGET   0.9000000 0.750000000 0.81818182
## Class: PIT    1.0000000 0.991202346 0.99558174
## Class: PLACE  0.9555556 0.934782609 0.94505495
## Class: POLITE 1.0000000 1.000000000 1.00000000
## Class: POS    0.9247312 0.977272727 0.95027624
## Class: PROG   0.9629630 0.873949580 0.91629956
## Class: PRP    0.0000000 0.000000000        NaN
## Class: QUAN   0.9805447 0.988235294 0.98437500
## Class: QUPR   1.0000000 1.000000000 1.00000000
## Class: QUTAG  1.0000000 1.000000000 1.00000000
## Class: RB     0.9768786 0.942379182 0.95931883
## Class: RP     0.9837398 0.817567568 0.89298893
## Class: SO     0.9729730 0.935064935 0.95364238
## Class: SPLIT  1.0000000 1.000000000 1.00000000
## Class: SPP2   1.0000000 1.000000000 1.00000000
## Class: STPR   0.6842105 1.000000000 0.81250000
## Class: THATD  0.7540984 1.000000000 0.85981308
## Class: THRC   0.8235294 0.700000000 0.75675676
## Class: THSC   0.7232143 0.987804878 0.83505155
## Class: TIME   0.9765625 0.961538462 0.96899225
## Class: TPP3P  1.0000000 1.000000000 1.00000000
## Class: TPP3S  1.0000000 1.000000000 1.00000000
## Class: URL    1.0000000 1.000000000 1.00000000
## Class: USEDTO 0.5000000 1.000000000 0.66666667
## Class: VB     0.9272300 0.946107784 0.93657380
## Class: VBD    0.9744160 0.978770950 0.97658863
## Class: VBG    0.9084249 0.942965779 0.92537313
## Class: VBN    0.5092593 0.932203390 0.65868263
## Class: VIMP   0.8869565 0.723404255 0.79687500
## Class: VPRT   0.9683301 0.957305503 0.96278626
## Class: WHQU   0.8405797 0.906250000 0.87218045
## Class: WHSC   0.9723320 0.960937500 0.96660118
## Class: XX0    1.0000000 0.992307692 0.99613900
## Class: YNQU   0.8333333 1.000000000 0.90909091
## Class: OCR           NA 0.000000000         NA
# Generate a better formatted results table for export: precision, recall and
# f1 per tag, together with the number of tokens that carry the tag in the
# gold standard.
confusion_matrix <- cm$table
total <- sum(confusion_matrix)
number_of_classes <- nrow(confusion_matrix)
# Correctly tagged tokens per class sit on the diagonal
correct <- diag(confusion_matrix)
# Column totals = tokens that actually belong to each class (gold standard)
total_actual_class <- colSums(confusion_matrix)
# Row totals = tokens that the tagger assigned to each class
total_pred_class <- rowSums(confusion_matrix)
# Precision = TP / all that were predicted as positive
precision <- correct / total_pred_class
# Recall = TP / all that were actually positive
recall <- correct / total_actual_class
# F1 = harmonic mean of precision and recall (NaN when both are zero)
f1 <- (2 * precision * recall) / (precision + recall)
# Create data frame to output results (row names are the tag labels)
results <- data.frame(precision, recall, f1, total_actual_class)
results
##        precision      recall         f1 total_actual_class
## ABLE   1.0000000 1.000000000 1.00000000                  6
## ACT    0.9618182 0.986940299 0.97421731                536
## AMP    0.9795918 0.979591837 0.97959184                 49
## ASPECT 0.9852941 0.985294118 0.98529412                 68
## BEMA   0.9839080 0.997668998 0.99074074                429
## CAUSE  0.9736842 1.000000000 0.98666667                 37
## CC     0.9973333 0.990728477 0.99401993                755
## CD     0.9375000 0.969827586 0.95338983                232
## COMM   0.9960317 0.996031746 0.99603175                252
## CONC   0.9729730 0.947368421 0.96000000                 38
## COND   1.0000000 1.000000000 1.00000000                 69
## CONT   0.9957447 1.000000000 0.99786780                468
## CUZ    1.0000000 0.944444444 0.97142857                 54
## DEMO   0.9890710 0.874396135 0.92820513                207
## DMA    0.9796954 0.932367150 0.95544554                207
## DOAUX  0.9385965 0.972727273 0.95535714                110
## DT     1.0000000 0.997940975 0.99896943               1457
## DWNT   0.9375000 1.000000000 0.96774194                 15
## ELAB   1.0000000 0.909090909 0.95238095                 11
## EMPH   0.9550562 0.982658960 0.96866097                173
## EX     0.9552239 1.000000000 0.97709924                 64
## EXIST  0.9848485 1.000000000 0.99236641                 65
## FPP1P  1.0000000 1.000000000 1.00000000                158
## FPP1S  1.0000000 0.990719258 0.99533800                431
## FPUH   1.0000000 0.992700730 0.99633700                137
## FREQ   1.0000000 1.000000000 1.00000000                 70
## FW     0.1363636 0.500000000 0.21428571                  6
## GTO    1.0000000 0.964285714 0.98181818                 28
## HDG    1.0000000 0.966666667 0.98305085                 60
## HGOT   0.8750000 0.875000000 0.87500000                  8
## IN     0.9813346 0.996210327 0.98871650               2111
## JJ     0.9555556 0.984732824 0.96992481                131
## JJAT   0.9267913 0.892053973 0.90909091                667
## JJPR   0.8834586 0.848375451 0.86556169                277
## JPRED  0.9736842 0.902439024 0.93670886                 41
## LIKE   0.9680851 0.938144330 0.95287958                 97
## MDCA   1.0000000 1.000000000 1.00000000                 56
## MDCO   1.0000000 1.000000000 1.00000000                 45
## MDMM   1.0000000 0.947368421 0.97297297                 19
## MDNE   1.0000000 0.897058824 0.94573643                 68
## MDWO   0.9791667 0.989473684 0.98429319                 95
## MDWS   1.0000000 1.000000000 1.00000000                 83
## MENTAL 0.9709443 0.997512438 0.98404908                402
## NCOMP  0.8888889 0.993788820 0.93841642                322
## NN     0.9589290 0.976432532 0.96760160               4328
## NULL   1.0000000 0.009009009 0.01785714                111
## OCCUR  0.9607843 1.000000000 0.98000000                 49
## PASS   0.9130435 0.933333333 0.92307692                135
## PEAS   0.9812500 0.867403315 0.92082111                181
## PGET   0.9000000 0.750000000 0.81818182                 12
## PIT    1.0000000 0.991202346 0.99558174                341
## PLACE  0.9555556 0.934782609 0.94505495                 92
## POLITE 1.0000000 1.000000000 1.00000000                 15
## POS    0.9247312 0.977272727 0.95027624                 88
## PROG   0.9629630 0.873949580 0.91629956                119
## PRP    0.0000000 0.000000000        NaN                  1
## QUAN   0.9805447 0.988235294 0.98437500                255
## QUPR   1.0000000 1.000000000 1.00000000                 58
## QUTAG  1.0000000 1.000000000 1.00000000                 20
## RB     0.9768786 0.942379182 0.95931883                538
## RP     0.9837398 0.817567568 0.89298893                148
## SO     0.9729730 0.935064935 0.95364238                 77
## SPLIT  1.0000000 1.000000000 1.00000000                113
## SPP2   1.0000000 1.000000000 1.00000000                299
## STPR   0.6842105 1.000000000 0.81250000                 13
## THATD  0.7540984 1.000000000 0.85981308                 46
## THRC   0.8235294 0.700000000 0.75675676                 40
## THSC   0.7232143 0.987804878 0.83505155                 82
## TIME   0.9765625 0.961538462 0.96899225                130
## TPP3P  1.0000000 1.000000000 1.00000000                206
## TPP3S  1.0000000 1.000000000 1.00000000                480
## URL    1.0000000 1.000000000 1.00000000                  1
## USEDTO 0.5000000 1.000000000 0.66666667                  1
## VB     0.9272300 0.946107784 0.93657380                835
## VBD    0.9744160 0.978770950 0.97658863                895
## VBG    0.9084249 0.942965779 0.92537313                263
## VBN    0.5092593 0.932203390 0.65868263                 59
## VIMP   0.8869565 0.723404255 0.79687500                141
## VPRT   0.9683301 0.957305503 0.96278626               1054
## WHQU   0.8405797 0.906250000 0.87218045                 64
## WHSC   0.9723320 0.960937500 0.96660118                256
## XX0    1.0000000 0.992307692 0.99613900                260
## YNQU   0.8333333 1.000000000 0.90909091                 40
## OCR          NaN 0.000000000        NaN                 31
# Reshape the results table to long format for plotting: one row per tag and
# metric. Tags with a missing metric and a handful of named tags are excluded.
resultslong <- results %>%
  drop_na() %>%
  mutate(tag = row.names(.)) %>%
  filter(!tag %in% c("NULL", "SYM", "OCR", "FW", "USEDTO")) %>%
  rename(n = total_actual_class) %>%
  pivot_longer(
    cols = c("precision", "recall", "f1"),
    names_to = "metric",
    values_to = "value"
  ) %>%
  # Fix the facet order: precision, recall, f1
  mutate(metric = factor(metric, levels = c("precision", "recall", "f1")))

# Distribution of the number of manually evaluated tokens per tag
summary(resultslong$n)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0    49.0   110.0   282.8   263.0  4328.0
# Dot plot of precision, recall and f1 per tag (one facet per metric); point
# colour encodes, on a log scale, how many tokens were manually evaluated.
ggplot(
  resultslong,
  aes(x = value, y = reorder(tag, desc(tag)), group = metric, colour = n)
) +
  geom_point(size = 2) +
  labs(x = "", y = "") +
  facet_wrap(~metric) +
  scale_color_paletteer_c(
    "harrypotter::harrypotter",
    trans = "log",
    breaks = c(1, 10, 100, 1000),
    labels = c(1, 10, 100, 1000),
    name = "# tokens \nmanually\nevaluated"
  ) +
  theme_bw() +
  theme(
    panel.grid.major.y = element_line(colour = "darkgrey"),
    legend.position = "right"
  )

# ggsave(here('Plots', 'TaggerAccuracyPlot.svg'), width = 7, height = 12)

Exploring tagger errors

# Collect all tagger errors (excluding tokens whose gold tag is UNCLEAR) and
# add an Error label of the form "<assigned tag> -> <gold tag>".
# Evaluation is a logical column, so it is negated directly instead of being
# compared with the string "FALSE".
errors <- TaggerEval %>%
  filter(!Evaluation, TagGold != "UNCLEAR") %>%
  mutate(Error = paste(Tag, TagGold, sep = " -> "))

# Total number of errors
nrow(errors)  # 817
## [1] 817
# Frequency of each error type, most frequent first.
# (To restrict this to the textbook corpora, filter on
# Corpus %in% c('TEC-Fr', 'TEC-Ger', 'TEC-Sp') before counting.)
FreqErrors <- errors %>%
  count(Error, sort = TRUE)

# Number of error types that occur exactly once
FreqErrors %>%
  filter(n == 1) %>%
  nrow()
## [1] 94
# Total number of distinct error types
FreqErrors %>% nrow()
## [1] 198
# Print the full error frequency table
# (add filter(n > 10) beforehand to show only the most frequent types)
print.data.frame(FreqErrors)
##               Error  n
## 1     NCOMP -> NULL 37
## 2        NN -> JJAT 35
## 3        JJAT -> NN 27
## 4          NN -> VB 27
## 5          IN -> RP 25
## 6        NN -> VPRT 24
## 7          VB -> NN 22
## 8      THSC -> DEMO 19
## 9        VB -> VIMP 19
## 10        NN -> OCR 16
## 11      VBN -> JJAT 16
## 12      ACT -> NULL 15
## 13    THATD -> NULL 15
## 14         CD -> NN 12
## 15   MENTAL -> NULL 12
## 16        NN -> VBG 11
## 17       NN -> VIMP 11
## 18     THSC -> THRC 11
## 19      VBG -> PROG 11
## 20      VBN -> JJPR 11
## 21       VBN -> VBD 10
## 22     WHQU -> WHSC 10
## 23       JJPR -> RB  9
## 24     JJPR -> JJAT  8
## 25       JJPR -> NN  8
## 26       NN -> JJPR  8
## 27     PASS -> JJPR  8
## 28      VBD -> PEAS  8
## 29      VBN -> PEAS  8
## 30       VPRT -> NN  8
## 31       VPRT -> VB  8
## 32     VPRT -> VIMP  8
## 33     YNQU -> NULL  8
## 34     BEMA -> NULL  7
## 35         FW -> NN  7
## 36        FW -> OCR  7
## 37         NN -> CD  7
## 38      POS -> VPRT  7
## 39       VB -> VPRT  7
## 40        VBG -> NN  7
## 41     DOAUX -> ACT  6
## 42         IN -> RB  6
## 43       JJAT -> RB  6
## 44     THRC -> DEMO  6
## 45     WHSC -> WHQU  6
## 46       EMPH -> SO  5
## 47        NN -> DMA  5
## 48       VIMP -> VB  5
## 49       NN -> PROG  4
## 50         NN -> RB  4
## 51       RB -> JJPR  4
## 52     STPR -> NULL  4
## 53       VB -> LIKE  4
## 54      VBG -> JJAT  4
## 55      VBN -> PASS  4
## 56       VIMP -> NN  4
## 57     VIMP -> VPRT  4
## 58     ACT -> DOAUX  3
## 59      EX -> PLACE  3
## 60      JJAT -> DMA  3
## 61     JJAT -> JJPR  3
## 62         NN -> CC  3
## 63     PASS -> PEAS  3
## 64     PROG -> JJPR  3
## 65       RB -> JJAT  3
## 66       VB -> JJAT  3
## 67       VB -> PEAS  3
## 68      VBD -> JJPR  3
## 69      VBD -> PASS  3
## 70       VBD -> VBN  3
## 71     VPRT -> MDNE  3
## 72      CD -> FPP1S  2
## 73       DMA -> XX0  2
## 74     EMPH -> TIME  2
## 75         FW -> IN  2
## 76         IN -> CC  2
## 77        IN -> CUZ  2
## 78       IN -> TIME  2
## 79         JJ -> NN  2
## 80        JJ -> OCR  2
## 81       JJAT -> FW  2
## 82      JJAT -> VBG  2
## 83      JJPR -> DMA  2
## 84     LIKE -> VPRT  2
## 85      MDWO -> VBD  2
## 86     NCOMP -> OCR  2
## 87         NN -> DT  2
## 88      NN -> FPP1S  2
## 89         NN -> IN  2
## 90      NN -> JPRED  2
## 91       NN -> MDNE  2
## 92       NN -> NULL  2
## 93        NN -> PIT  2
## 94        NN -> VBD  2
## 95      PEAS -> VBD  2
## 96       QUAN -> NN  2
## 97        RB -> DMA  2
## 98         RP -> RB  2
## 99      STPR -> OCR  2
## 100       SYM -> IN  2
## 101    TIME -> CONC  2
## 102       VBD -> VB  2
## 103    VPRT -> LIKE  2
## 104     VPRT -> VBD  2
## 105     ACT -> COMM  1
## 106   ACT -> MENTAL  1
## 107    ACT -> NCOMP  1
## 108       AMP -> RB  1
## 109  ASPECT -> NULL  1
## 110   CAUSE -> NULL  1
## 111        CC -> DT  1
## 112        CC -> IN  1
## 113       CD -> PRP  1
## 114   COMM -> NCOMP  1
## 115    CONC -> TIME  1
## 116    CONT -> BEMA  1
## 117    CONT -> NULL  1
## 118    DEMO -> THRC  1
## 119    DEMO -> THSC  1
## 120     DMA -> NULL  1
## 121       DMA -> RB  1
## 122   DOAUX -> NULL  1
## 123    DWNT -> QUAN  1
## 124    EMPH -> NULL  1
## 125   EXIST -> NULL  1
## 126        FW -> CC  1
## 127      FW -> NULL  1
## 128       FW -> VBG  1
## 129    HGOT -> PEAS  1
## 130       IN -> DMA  1
## 131       IN -> HDG  1
## 132     IN -> PLACE  1
## 133     JJ -> JPRED  1
## 134        JJ -> RB  1
## 135    JJAT -> ELAB  1
## 136    JJAT -> PGET  1
## 137    JJAT -> QUAN  1
## 138    JJAT -> VPRT  1
## 139      JJPR -> IN  1
## 140   JJPR -> PLACE  1
## 141      JJPR -> VB  1
## 142     JJPR -> VBG  1
## 143  JPRED -> PLACE  1
## 144      LIKE -> VB  1
## 145 NCOMP -> ASPECT  1
## 146      NN -> FPUH  1
## 147        NN -> FW  1
## 148       NN -> HDG  1
## 149        NN -> JJ  1
## 150      NN -> MDMM  1
## 151      NN -> PASS  1
## 152      NN -> PEAS  1
## 153      NN -> PGET  1
## 154       NN -> POS  1
## 155      NN -> QUAN  1
## 156       NN -> SYM  1
## 157    OCCUR -> ACT  1
## 158   OCCUR -> NULL  1
## 159   PASS -> JPRED  1
## 160     PEAS -> VBN  1
## 161    PGET -> JJAT  1
## 162   PLACE -> EMPH  1
## 163     PLACE -> IN  1
## 164   PLACE -> JJAT  1
## 165     PLACE -> RP  1
## 166       POS -> ``  1
## 167      PROG -> NN  1
## 168      PRP -> PIT  1
## 169     QUAN -> AMP  1
## 170    QUAN -> EMPH  1
## 171     QUAN -> OCR  1
## 172        RB -> IN  1
## 173      RB -> NULL  1
## 174        RB -> RP  1
## 175       SO -> CUZ  1
## 176      SO -> EMPH  1
## 177      THSC -> RB  1
## 178      TIME -> CC  1
## 179  USEDTO -> PASS  1
## 180      VB -> DEMO  1
## 181      VB -> MDNE  1
## 182       VB -> OCR  1
## 183       VB -> VBD  1
## 184     VBD -> HGOT  1
## 185     VBD -> JJAT  1
## 186     VBD -> MDNE  1
## 187     VBD -> MDWO  1
## 188      VBG -> GTO  1
## 189       VBG -> JJ  1
## 190     VBG -> JJPR  1
## 191       VBN -> NN  1
## 192     VBN -> PGET  1
## 193       VBN -> VB  1
## 194     VBN -> VIMP  1
## 195    VPRT -> JJPR  1
## 196     VPRT -> POS  1
## 197      WHQU -> NN  1
## 198     WHSC -> DMA  1
# List every token tagged THSC whose gold tag is THRC
errors %>%
  filter(Error == "THSC -> THRC") %>%
  select(FileID, Output, Tag, TagGold) %>%
  print(n = 30)
## # A tibble: 11 × 4
##    FileID                        Output    Tag   TagGold
##    <fct>                         <fct>     <fct> <fct>  
##  1 HT_5_Poetry_0001              That_THSC THSC  THRC   
##  2 New_GreenLine_5_Personal_0003 that_THSC THSC  THRC   
##  3 New_GreenLine_5_Personal_0003 that_THSC THSC  THRC   
##  4 BNCBReg750                    that_THSC THSC  THRC   
##  5 BNCBReg750                    that_THSC THSC  THRC   
##  6 BNCBSer486                    that_THSC THSC  THRC   
##  7 BNCBSer562                    that_THSC THSC  THRC   
##  8 S3AV                          that_THSC THSC  THRC   
##  9 S3AV                          that_THSC THSC  THRC   
## 10 SVLK                          that_THSC THSC  THRC   
## 11 SVLK                          that_THSC THSC  THRC
# Tokens tagged NN whose gold tag is JJAT, restricted to tokens that contain
# an upper-case letter followed by at least one further character
errors %>%
  filter(Error == "NN -> JJAT") %>%
  select(-c(Output, Corpus, Tag, TagGold)) %>%
  filter(grepl("[A-Z]+.", Token)) %>%
  print.data.frame()
##        FileID Register        Token Evaluation      Error
## 1  BNCBEFor32 internet Intermediate      FALSE NN -> JJAT
## 2  BNCBMass16     news        FINAL      FALSE NN -> JJAT
## 3  BNCBMass16     news          Big      FALSE NN -> JJAT
## 4  BNCBReg111     news     Scottish      FALSE NN -> JJAT
## 5  BNCBReg111     news     Scottish      FALSE NN -> JJAT
## 6  BNCBReg111     news       Mental      FALSE NN -> JJAT
## 7  BNCBReg111     news     Scottish      FALSE NN -> JJAT
## 8  BNCBReg111     news      Central      FALSE NN -> JJAT
## 9  BNCBReg750     news      English      FALSE NN -> JJAT
## 10 BNCBReg750     news      Natural      FALSE NN -> JJAT
## 11 BNCBReg750     news     European      FALSE NN -> JJAT
## 12 BNCBReg750     news    Christian      FALSE NN -> JJAT
## 13 BNCBReg750     news       Social      FALSE NN -> JJAT
## 14 BNCBReg750     news       Common      FALSE NN -> JJAT
## 15 BNCBSer486     news     Northern      FALSE NN -> JJAT
## 16 BNCBSer486     news     Northern      FALSE NN -> JJAT
## 17 BNCBSer486     news     Northern      FALSE NN -> JJAT
## 18 BNCBSer562     news       United      FALSE NN -> JJAT
## 19 BNCBSer562     news        White      FALSE NN -> JJAT
## 20 BNCBSer562     news       Untold      FALSE NN -> JJAT
## 21 BNCBSer562     news          New      FALSE NN -> JJAT
## 22       SEL5   spoken        Black      FALSE NN -> JJAT
# Tokens involved in NN/VB and NN/VPRT confusions (in either direction),
# most frequent first
errors %>%
  filter(Error %in% c("NN -> VB", "VB -> NN", "NN -> VPRT", "VPRT -> NN")) %>%
  count(Token, sort = TRUE) %>%
  print.data.frame()
##        Token n
## 1      mince 5
## 2      build 4
## 3        win 4
## 4       hunt 3
## 5       wags 3
## 6      throw 2
## 7       look 2
## 8      swamp 2
## 9       stop 2
## 10   defeats 2
## 11     fight 1
## 12        go 1
## 13     prize 1
## 14       Fly 1
## 15      Have 1
## 16      rule 1
## 17       run 1
## 18     chalk 1
## 19     shoot 1
## 20      stir 1
## 21     visit 1
## 22      chat 1
## 23    checks 1
## 24  dispatch 1
## 25        Is 1
## 26      kiss 1
## 27      mean 1
## 28     climb 1
## 29     start 1
## 30    trails 1
## 31    Travel 1
## 32      glue 1
## 33      leak 1
## 34     leaks 1
## 35    shreds 1
## 36     sniff 1
## 37  balances 1
## 38   convict 1
## 39     panic 1
## 40     suits 1
## 41   crumble 1
## 42    SLEEPS 1
## 43    debate 1
## 44  question 1
## 45    thread 1
## 46   upgrade 1
## 47    hurdle 1
## 48      land 1
## 49  scramble 1
## 50    pocket 1
## 51      care 1
## 52 socialise 1
## 53  controls 1
## 54     talks 1
## 55   escapes 1
## 56      mate 1
## 57    tastes 1
## 58      jump 1
## 59    stroke 1
## 60      bang 1
## 61    rhymes 1
## 62     spice 1
# Tokens tagged ACT whose gold tag is NULL, most frequent first
errors %>%
  filter(Error == "ACT -> NULL") %>%
  count(Token, sort = TRUE) %>%
  print.data.frame()
##      Token n
## 1      win 3
## 2    throw 2
## 3     lost 2
## 4     left 1
## 5  waiting 1
## 6  working 1
## 7  running 1
## 8     done 1
## 9    fixed 1
## 10    Play 1
## 11 reached 1

Packages used in this script

# Earlier approach, kept for reference:
# packages.bib <- sapply(1:length(loadedNamespaces()), function(i)
# toBibtex(citation(loadedNamespaces()[i])))

# Write BibTeX entries for all attached packages and knitr to packages.bib
knitr::write_bib(x = c(.packages(), "knitr"), file = "packages.bib")

# Record the R version and package versions used to build this document
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] forcats_0.5.1     stringr_1.4.0     dplyr_1.0.7       purrr_0.3.4      
##  [5] readr_2.0.2       tidyr_1.1.4       tibble_3.1.6      tidyverse_1.3.0  
##  [9] readxl_1.3.1      paletteer_1.3.0   here_1.0.1        harrypotter_2.1.1
## [13] caret_6.0-86      ggplot2_3.3.5     lattice_0.20-41  
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-152         fs_1.5.2             lubridate_1.7.10    
##  [4] httr_1.4.2           rprojroot_2.0.2      tools_4.0.3         
##  [7] backports_1.4.1      bslib_0.3.1          utf8_1.2.2          
## [10] R6_2.5.1             rpart_4.1-15         DBI_1.1.1           
## [13] colorspace_2.0-2     nnet_7.3-15          withr_2.4.3         
## [16] tidyselect_1.1.1     gridExtra_2.3        compiler_4.0.3      
## [19] cli_3.1.0            rvest_1.0.0          formatR_1.8         
## [22] xml2_1.3.3           labeling_0.4.2       prismatic_1.0.0     
## [25] sass_0.4.0           scales_1.1.1         digest_0.6.29       
## [28] rmarkdown_2.11       pkgconfig_2.0.3      htmltools_0.5.2     
## [31] highr_0.9            dbplyr_2.1.0         fastmap_1.1.0       
## [34] rlang_0.4.12         rstudioapi_0.13      farver_2.1.0        
## [37] jquerylib_0.1.4      generics_0.1.1       jsonlite_1.7.2      
## [40] ModelMetrics_1.2.2.2 magrittr_2.0.1       Matrix_1.3-2        
## [43] Rcpp_1.0.7           munsell_0.5.0        fansi_0.5.0         
## [46] lifecycle_1.0.1      stringi_1.7.6        pROC_1.17.0.1       
## [49] yaml_2.2.1           MASS_7.3-53.1        plyr_1.8.6          
## [52] recipes_0.1.15       grid_4.0.3           crayon_1.4.2        
## [55] haven_2.3.1          splines_4.0.3        hms_1.0.0           
## [58] knitr_1.37           pillar_1.6.4         reshape2_1.4.4      
## [61] codetools_0.2-18     stats4_4.0.3         reprex_1.0.0        
## [64] glue_1.6.0           evaluate_0.14        data.table_1.14.2   
## [67] modelr_0.1.8         vctrs_0.3.8          tzdb_0.1.2          
## [70] foreach_1.5.1        cellranger_1.1.0     gtable_0.3.0        
## [73] rematch2_2.1.2       assertthat_0.2.1     xfun_0.29           
## [76] gower_0.2.2          prodlim_2019.11.13   broom_0.7.9         
## [79] e1071_1.7-4          class_7.3-18         survival_3.2-7      
## [82] timeDate_3043.102    iterators_1.0.13     lava_1.6.9          
## [85] ellipsis_0.3.2       ipred_0.9-11
Hvitfeldt, Emil. 2021. Paletteer: Comprehensive Collection of Color Palettes. https://github.com/EmilHvitfeldt/paletteer.
Henry, Lionel, and Hadley Wickham. 2020. Purrr: Functional Programming Tools. https://CRAN.R-project.org/package=purrr.
Jimenez Rico, Alejandro. 2020. Harrypotter: Palettes Generated from All "Harry Potter" Movies. https://github.com/aljrico/harrypotter.
Kuhn, Max. 2020. Caret: Classification and Regression Training. https://github.com/topepo/caret/.
Müller, Kirill. 2020. Here: A Simpler Way to Find Your Files. https://CRAN.R-project.org/package=here.
Müller, Kirill, and Hadley Wickham. 2021. Tibble: Simple Data Frames. https://CRAN.R-project.org/package=tibble.
R Core Team. 2020. R: A Language and Environment for Statistical Computing. Vienna, Austria: R Foundation for Statistical Computing. https://www.R-project.org/.
Sarkar, Deepayan. 2008. Lattice: Multivariate Data Visualization with R. New York: Springer. http://lmdvr.r-forge.r-project.org.
———. 2020. Lattice: Trellis Graphics for R. http://lattice.r-forge.r-project.org/.
Wickham, Hadley. 2016. Ggplot2: Elegant Graphics for Data Analysis. Springer-Verlag New York. https://ggplot2.tidyverse.org.
———. 2019a. Stringr: Simple, Consistent Wrappers for Common String Operations. https://CRAN.R-project.org/package=stringr.
———. 2019b. Tidyverse: Easily Install and Load the Tidyverse. https://CRAN.R-project.org/package=tidyverse.
———. 2021a. Forcats: Tools for Working with Categorical Variables (Factors). https://CRAN.R-project.org/package=forcats.
———. 2021b. Tidyr: Tidy Messy Data. https://CRAN.R-project.org/package=tidyr.
Wickham, Hadley, Mara Averick, Jennifer Bryan, Winston Chang, Lucy D’Agostino McGowan, Romain François, Garrett Grolemund, et al. 2019. “Welcome to the tidyverse.” Journal of Open Source Software 4 (43): 1686. https://doi.org/10.21105/joss.01686.
Wickham, Hadley, and Jennifer Bryan. 2019. Readxl: Read Excel Files. https://CRAN.R-project.org/package=readxl.
Wickham, Hadley, Winston Chang, Lionel Henry, Thomas Lin Pedersen, Kohske Takahashi, Claus Wilke, Kara Woo, Hiroaki Yutani, and Dewey Dunnington. 2021. Ggplot2: Create Elegant Data Visualisations Using the Grammar of Graphics. https://CRAN.R-project.org/package=ggplot2.
Wickham, Hadley, Romain François, Lionel Henry, and Kirill Müller. 2021. Dplyr: A Grammar of Data Manipulation. https://CRAN.R-project.org/package=dplyr.
Wickham, Hadley, and Jim Hester. 2021. Readr: Read Rectangular Text Data. https://CRAN.R-project.org/package=readr.
Xie, Yihui. 2014. “Knitr: A Comprehensive Tool for Reproducible Research in R.” In Implementing Reproducible Computational Research, edited by Victoria Stodden, Friedrich Leisch, and Roger D. Peng. Chapman & Hall/CRC. http://www.crcpress.com/product/isbn/9781466561595.
———. 2015. Dynamic Documents with R and Knitr. 2nd ed. Boca Raton, Florida: Chapman & Hall/CRC. https://yihui.org/knitr/.
———. 2021. Knitr: A General-Purpose Package for Dynamic Report Generation in R. https://yihui.org/knitr/.