library(tidyverse)
library(quanteda)
first_debate <- read.csv("../data/us_election_2020_1st_presidential_debate.csv",
  stringsAsFactors = FALSE, encoding = "UTF-8")

# optional / bonus: extract the speaker's surname (the last word) with a
# regular expression; [A-Za-z]+$ avoids the pitfall of [A-z], which also
# matches the characters between "Z" and "a"
first_debate <- first_debate %>%
  mutate(speaker = str_extract(speaker, "[A-Za-z]+$"))

debate_corp <- corpus(first_debate)
# optional: rename documents as "<row number>_<speaker>"
docnames(debate_corp) <- paste0(1:nrow(first_debate), "_",
                                first_debate$speaker)
debate_toks <- tokens(debate_corp)

debate_dfm <- dfm(debate_toks)

Pre-processing

Pre-processing has three different aspects:

  • restricting feature definitions
  • removing uninformative features
  • uniting features

The structure of your dfm

First, some tools to inspect what your dfm / tokens object currently looks like. There are specific commands to get:

  • the number of tokens
ntoken(debate_toks) %>% head()
## 1_Wallace 2_Wallace   3_Biden   4_Trump   5_Biden 6_Wallace 
##       135       116         6         5         3       149
  • the number of features in a dfm
nfeat(debate_dfm)
## [1] 2297
  • the most frequent features
topfeatures(debate_dfm)
##    .    ,  the   to  you  and    a   of   in that 
## 1627 1127  806  562  524  468  391  358  305  299
  • the number of documents
ndoc(debate_dfm)
## [1] 789
  • the names of features as a vector
featnames(debate_dfm) %>% head()
## [1] "good"      "evening"   "from"      "the"       "health"    "education"
  • the frequency of features as a vector
featfreq(debate_dfm) %>% head()
##      good   evening      from       the    health education 
##        31         1        34       806        11         2

You can use many of these commands to check the results of your pre-processing.

Restricting feature definitions

Example: Restrict features by removing punctuation, numbers and symbols

# check feature number
nfeat(debate_dfm)
## [1] 2297
# restrict feature definition
debate_toks <- tokens(debate_corp, remove_punct = TRUE,
  remove_numbers = TRUE, remove_symbols = TRUE)
debate_dfm <- dfm(debate_toks)

# check new feature number
nfeat(debate_dfm)
## [1] 2186
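
tokens() offers further restrictions of the same kind, e.g. for URLs and hyphenated compounds. A minimal sketch (remove_url and split_hyphens are regular tokens() options; whether they change much depends on your corpus):

# additional optional restrictions at the tokens stage
debate_toks_strict <- tokens(debate_corp, remove_punct = TRUE,
  remove_numbers = TRUE, remove_symbols = TRUE,
  remove_url = TRUE,     # drop URLs entirely
  split_hyphens = TRUE)  # treat "fact-check" as "fact" + "check"
nfeat(dfm(debate_toks_strict))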

Removing uninformative features

Now try it yourself!

  • check the stopwords in your language, e.g. with stopwords("en"); if you think it is useful, remove them
stopwords("en")
##   [1] "i"          "me"         "my"         "myself"     "we"        
##   [6] "our"        "ours"       "ourselves"  "you"        "your"      
##  [11] "yours"      "yourself"   "yourselves" "he"         "him"       
##  [16] "his"        "himself"    "she"        "her"        "hers"      
##  [21] "herself"    "it"         "its"        "itself"     "they"      
##  [26] "them"       "their"      "theirs"     "themselves" "what"      
##  [31] "which"      "who"        "whom"       "this"       "that"      
##  [36] "these"      "those"      "am"         "is"         "are"       
##  [41] "was"        "were"       "be"         "been"       "being"     
##  [46] "have"       "has"        "had"        "having"     "do"        
##  [51] "does"       "did"        "doing"      "would"      "should"    
##  [56] "could"      "ought"      "i'm"        "you're"     "he's"      
##  [61] "she's"      "it's"       "we're"      "they're"    "i've"      
##  [66] "you've"     "we've"      "they've"    "i'd"        "you'd"     
##  [71] "he'd"       "she'd"      "we'd"       "they'd"     "i'll"      
##  [76] "you'll"     "he'll"      "she'll"     "we'll"      "they'll"   
##  [81] "isn't"      "aren't"     "wasn't"     "weren't"    "hasn't"    
##  [86] "haven't"    "hadn't"     "doesn't"    "don't"      "didn't"    
##  [91] "won't"      "wouldn't"   "shan't"     "shouldn't"  "can't"     
##  [96] "cannot"     "couldn't"   "mustn't"    "let's"      "that's"    
## [101] "who's"      "what's"     "here's"     "there's"    "when's"    
## [106] "where's"    "why's"      "how's"      "a"          "an"        
## [111] "the"        "and"        "but"        "if"         "or"        
## [116] "because"    "as"         "until"      "while"      "of"        
## [121] "at"         "by"         "for"        "with"       "about"     
## [126] "against"    "between"    "into"       "through"    "during"    
## [131] "before"     "after"      "above"      "below"      "to"        
## [136] "from"       "up"         "down"       "in"         "out"       
## [141] "on"         "off"        "over"       "under"      "again"     
## [146] "further"    "then"       "once"       "here"       "there"     
## [151] "when"       "where"      "why"        "how"        "all"       
## [156] "any"        "both"       "each"       "few"        "more"      
## [161] "most"       "other"      "some"       "such"       "no"        
## [166] "nor"        "not"        "only"       "own"        "same"      
## [171] "so"         "than"       "too"        "very"       "will"
dfm_remove(debate_dfm, stopwords("en")) %>% nfeat()
## [1] 2033
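
If you decide to remove stopwords, you can also do it before building the dfm. A minimal sketch with tokens_remove(); padding = TRUE is optional, but it leaves an empty placeholder in each gap so that words separated by a stopword do not become false neighbours if you later build n-grams:

# stopword removal on the tokens object instead of the dfm
debate_toks_nostop <- tokens_remove(debate_toks, stopwords("en"),
                                    padding = TRUE)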

dfm_trim() is an alternative that selects features based on their frequency.

  • try the different options of dfm_trim():
    • remove features that occur fewer than 10 times in total
    • remove features occurring in fewer than 2 documents
    • remove features occurring more than 50 times

Tip: If you specify verbose = TRUE in your command, you can see how much you remove.

dfm_trim(debate_dfm, min_termfreq = 10,
         min_docfreq = 2,
         max_termfreq = 50,
         verbose = TRUE)
## Removing features occurring:
##   - fewer than 10 times: 1,904
##   - more than 50 times: 62
##   - in fewer than 2 documents: 1,088
##   Total features removed: 1,966 (89.9%).
## Document-feature matrix of: 789 documents, 220 features (97.69% sparse) and 2 docvars.
##            features
## docs        good from health chris first trump vice joe biden minute
##   1_Wallace    1    2      1     1     2     1    1   1     1      2
##   2_Wallace    0    0      2     0     0     1    1   0     1      0
##   3_Biden      0    0      0     0     0     0    0   0     0      0
##   4_Trump      0    0      0     0     0     0    0   0     0      0
##   5_Biden      0    0      0     0     0     0    0   0     0      0
##   6_Wallace    0    0      0     0     4     2    1   0     1      0
## [ reached max_ndoc ... 783 more documents, reached max_nfeat ... 210 more features ]
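
dfm_trim() also accepts relative instead of absolute thresholds. A minimal sketch using the docfreq_type option; the 1% and 50% cut-offs are arbitrary choices for illustration:

# keep only features occurring in at least 1% and at most 50% of documents
dfm_trim(debate_dfm, min_docfreq = 0.01, max_docfreq = 0.50,
         docfreq_type = "prop")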

Uniting features

Stem the features of your dfm. How many features did you lose with that?

nfeat(debate_dfm)
## [1] 2186
debate_dfm <- dfm_wordstem(debate_dfm, language = "en")

nfeat(debate_dfm)
## [1] 1745
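
Lowercasing is another way of uniting features; dfm() applies it by default (tolower = TRUE). Stemming is also available at the tokens stage via tokens_wordstem(), which is useful if you still need the tokens object for later steps. A minimal sketch; the feature count should typically match the dfm-level stemming above:

# stemming on the tokens object instead of the dfm
debate_toks_stem <- tokens_wordstem(debate_toks, language = "en")
nfeat(dfm(debate_toks_stem))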

Transformations

We have already used some commands to transform dfms for pre-processing.

  • dfm_subset(): selection of documents based on docvars
  • dfm_group(): grouping of documents based on docvars
  • dfm_select(): selection of features
    • dfm_trim(): selection of features based on frequency
  • dfm_weight() & dfm_tfidf(): weighting the feature counts
  • dfm_lookup(): looking up dictionaries (see the sketch below)
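
dfm_lookup() is the only one of these we do not demonstrate elsewhere, so here is a minimal sketch. The keys and glob patterns are made up for illustration, not a validated dictionary; note that they target stems because our dfm is stemmed:

# illustrative two-key dictionary; dfm_lookup() counts pattern matches per key
topic_dict <- dictionary(list(
  covid   = c("covid*", "virus*", "vaccin*"),
  economy = c("econom*", "job*", "tax*")
))
dfm_lookup(debate_dfm, dictionary = topic_dict)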

Grouping

First, group your dataset by speaker. How did the number of documents change?

ndoc(debate_dfm)
## [1] 789
new_debate_dfm <- dfm_group(debate_dfm, groups = speaker)
ndoc(new_debate_dfm)
## [1] 3
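
dfm_group() sums the feature counts within each group, so the rows collapse but no counts are lost:

docnames(new_debate_dfm)  # should now be the three speaker names
sum(debate_dfm) == sum(new_debate_dfm)  # TRUE: counts are only re-aggregated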

Subsetting

Subset your documents. If you still use the debate dataset, you are a bit limited, but you could drop the statements by Chris Wallace. Otherwise, try to drop certain years (a sketch follows below). Again, check the number of documents before and after.

ndoc(debate_dfm)
## [1] 789
new_debate_dfm <- dfm_subset(debate_dfm, speaker != "Wallace")
ndoc(new_debate_dfm)
## [1] 563
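
With a dataset that has a year docvar, the same logic would be dfm_subset(your_dfm, year >= 2016) (the docvar name year is hypothetical here). On the debate data, an equivalent runnable variant keeps only the two candidates:

# keep only Biden and Trump (same as dropping Wallace)
candidates_dfm <- dfm_subset(debate_dfm, speaker %in% c("Biden", "Trump"))
ndoc(candidates_dfm)  # again 563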

Weighting

Finally, weight your dfm: dfm_weight() allows you to weight your dfm with different schemes. You can consult the documentation to see the available options.

See how topfeatures() changes when you use binary weighting and relative frequencies.

dfm_weight(debate_dfm,"boolean") %>% topfeatures()
##  the   to  you  and that    a   it   of    i   is 
##  300  256  249  217  209  208  194  187  164  159
dfm_weight(debate_dfm,"prop") %>% topfeatures()
##      the      you       to     that       it        i      and        a       is 
## 26.74026 22.01335 20.03773 18.59985 15.86069 14.49628 13.91903 13.69351 12.41338 
##      not 
## 12.14712
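
The "prop" scheme turns counts into within-document proportions, so each non-empty row of the weighted dfm sums to 1; topfeatures() then reports the sum of these proportions across documents rather than raw counts:

# every non-empty document row should sum to 1 after "prop" weighting
dfm_weight(debate_dfm, "prop") %>% rowSums() %>% head()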

A frequently applied weighting is term frequency-inverse document frequency (tf-idf), implemented in quanteda as dfm_tfidf(). Look at the results of this weighting for your first documents. Is it more or less meaningful to you?

debate_dfm
## Document-feature matrix of: 789 documents, 1,745 features (99.04% sparse) and 2 docvars.
##            features
## docs        good even from the health educ campus of case western
##   1_Wallace    1    1    2  15      1    1      1  5    1       1
##   2_Wallace    0    0    0  10      2    0      0  1    0       0
##   3_Biden      0    0    0   0      0    0      0  0    0       0
##   4_Trump      0    0    0   0      0    0      0  0    0       0
##   5_Biden      0    0    0   0      0    0      0  0    0       0
##   6_Wallace    0    0    0  10      0    0      0  3    0       0
## [ reached max_ndoc ... 783 more documents, reached max_nfeat ... 1,735 more features ]
dfm_tfidf(debate_dfm)
## Document-feature matrix of: 789 documents, 1,745 features (99.04% sparse) and 2 docvars.
##            features
## docs            good     even     from      the   health     educ   campus
##   1_Wallace 1.482104 1.554654 2.899838 6.299336 1.897077 2.596047 2.897077
##   2_Wallace 0        0        0        4.199557 3.794154 0        0       
##   3_Biden   0        0        0        0        0        0        0       
##   4_Trump   0        0        0        0        0        0        0       
##   5_Biden   0        0        0        0        0        0        0       
##   6_Wallace 0        0        0        4.199557 0        0        0       
##            features
## docs               of     case  western
##   1_Wallace 3.1261770 1.897077 2.596047
##   2_Wallace 0.6252354 0        0       
##   3_Biden   0         0        0       
##   4_Trump   0         0        0       
##   5_Biden   0         0        0       
##   6_Wallace 1.8757062 0        0       
## [ reached max_ndoc ... 783 more documents, reached max_nfeat ... 1,735 more features ]
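
By default, dfm_tfidf() multiplies each raw count by log10(N / docfreq), where N is the number of documents and docfreq the number of documents containing the feature. You can reproduce the value for "the" in the first document by hand:

docfreq(debate_dfm)["the"]  # 300, matching the boolean counts above
15 * log10(ndoc(debate_dfm) / 300)  # 6.299336, as in the matrix above

Compared to the raw counts, this downweights ubiquitous words like "the" and preserves the weight of distinctive ones: "campus" occurs in a single document, so its single occurrence scores log10(789 / 1) ≈ 2.897. For comparing documents, this is arguably the more meaningful representation.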