library(tidyverse)
library(quanteda)
library(quanteda.textplots)
library(quanteda.textstats)
# Read the debate CSV into a data frame. Text columns stay character vectors
# (no factor conversion) and strings are marked as UTF-8.
# FALSE is spelled out because the shorthand F is an ordinary variable that
# can be reassigned.
first_debate <- read.csv(
  "../data/us_election_2020_1st_presidential_debate.csv",
  stringsAsFactors = FALSE,
  encoding = "UTF-8"
)
# optional / bonus: reduce the speaker variable to its trailing run of letters
# with a regular expression.
# The class is written as [A-Za-z] on purpose: the single range [A-z] also
# covers the punctuation characters that sit between "Z" and "a" in ASCII
# ([ \ ] ^ _ `), so it would let those through as part of the name.
first_debate <- first_debate %>%
  mutate(speaker = str_extract(speaker, "[A-Za-z]*$"))
# Build a quanteda corpus from the debate data frame.
debate_corp <- corpus(first_debate)
# optional: rename the documents to "<row number>_<speaker>".
# seq_len() avoids the backwards sequence c(1, 0) that 1:nrow() produces
# when the data frame has zero rows.
docnames(debate_corp) <- paste0(
  seq_len(nrow(first_debate)), "_", first_debate$speaker
)
# Split every document into word tokens; punctuation marks are separate tokens.
tokens(debate_corp, what = "word")
## Tokens consisting of 789 documents and 2 docvars.
## 1_Wallace :
## [1] "Good" "evening" "from" "the" "Health" "Education"
## [7] "Campus" "of" "Case" "Western" "Reserve" "University"
## [ ... and 123 more ]
##
## 2_Wallace :
## [1] "This" "debate" "is" "being" "conducted" "under"
## [7] "health" "and" "safety" "protocols" "designed" "by"
## [ ... and 104 more ]
##
## 3_Biden :
## [1] "How" "you" "doing" "," "man" "?"
##
## 4_Trump :
## [1] "How" "are" "you" "doing" "?"
##
## 5_Biden :
## [1] "I'm" "well" "."
##
## 6_Wallace :
## [1] "Gentlemen" "," "a" "lot" "of" "people"
## [7] "been" "waiting" "for" "this" "night" ","
## [ ... and 137 more ]
##
## [ reached max_ndoc ... 783 more documents ]
# Split every document into sentence-level tokens.
tokens(debate_corp, what = "sentence")
## Tokens consisting of 789 documents and 2 docvars.
## 1_Wallace :
## [1] "Good evening from the Health Education Campus of Case Western Reserve University and the Cleveland Clinic."
## [2] "I'm Chris Wallace of Fox News and I welcome you to the first of the 2020 Presidential Debates between President Donald J."
## [3] "Trump and former Vice President Joe Biden."
## [4] "This debate is sponsored by the Commission on Presidential debates."
## [5] "The Commission has designed the format, six roughly 15 minute segments with two minute answers from each candidate to the first question, then open discussion for the rest of each segment."
## [6] "Both campaigns have agreed to these rules."
## [7] "For the record, I decided the topics and the questions in each topic."
## [8] "I can assure you none of the questions has been shared with the Commission or the two candidates."
##
## 2_Wallace :
## [1] "This debate is being conducted under health and safety protocols designed by the Cleveland Clinic, which is serving as the Health Security advisor to the Commission for all four debates."
## [2] "As a precaution, both campaigns have agreed the candidates will not shake hands at the beginning of tonight's debate."
## [3] "The audience here in the hall has promised to remain silent."
## [4] "No cheers, no boos, or other interruptions so we, and more importantly you, can focus on what the candidates have to say."
## [5] "No noise except right now, as we welcome the Republican nominee, President Trump, and the Democratic nominee Vice President Biden."
##
## 3_Biden :
## [1] "How you doing, man?"
##
## 4_Trump :
## [1] "How are you doing?"
##
## 5_Biden :
## [1] "I'm well."
##
## 6_Wallace :
## [1] "Gentlemen, a lot of people been waiting for this night, so let's get going."
## [2] "Our first subject is the Supreme Court."
## [3] "President Trump, you nominated Amy Coney Barrett over the weekend to succeed the late Ruth Bader Ginsburg on the Court."
## [4] "You say the Constitution is clear about your obligation and the Senate's to consider a nominee to the Court."
## [5] "Vice President Biden, you say that this is an effort by the President and Republicans to jam through on an appointment in what you call an abuse of power."
## [6] "My first question to both of you tonight, why are you right in the argument you make and your opponent wrong?"
## [7] "And where do you think a Justice Barrett would take the court?"
## [8] "President Trump, in this first segment, you go first."
## [9] "Two minutes."
##
## [ reached max_ndoc ... 783 more documents ]
# Split every document into single-character tokens.
tokens(debate_corp, what = "character")
## Tokens consisting of 789 documents and 2 docvars.
## 1_Wallace :
## [1] "G" "o" "o" "d" "e" "v" "e" "n" "i" "n" "g" "f"
## [ ... and 604 more ]
##
## 2_Wallace :
## [1] "T" "h" "i" "s" "d" "e" "b" "a" "t" "e" "i" "s"
## [ ... and 505 more ]
##
## 3_Biden :
## [1] "H" "o" "w" "y" "o" "u" "d" "o" "i" "n" "g" ","
## [ ... and 4 more ]
##
## 4_Trump :
## [1] "H" "o" "w" "a" "r" "e" "y" "o" "u" "d" "o" "i"
## [ ... and 3 more ]
##
## 5_Biden :
## [1] "I" "'" "m" "w" "e" "l" "l" "."
##
## 6_Wallace :
## [1] "G" "e" "n" "t" "l" "e" "m" "e" "n" "," "a" "l"
## [ ... and 595 more ]
##
## [ reached max_ndoc ... 783 more documents ]
# Tokenize the corpus with the default settings, then turn the tokens into a
# document-feature matrix.
debate_toks <- tokens(debate_corp)
debate_dfm <- debate_toks %>%
  dfm()
# X-ray plot of where "crosstalk" appears, restricted to the first 15
# occurrences found by kwic().
textplot_xray(
  head(kwic(debate_toks, pattern = "crosstalk"), n = 15)
)
# `window` gives the number of tokens shown before and after each match.
kwic(debate_toks, pattern = "country", window = 4)
## Keyword-in-context with 31 matches.
## [167_Trump, 9] to you, the | country |
## [167_Trump, 150] should have closed our | country |
## [169_Trump, 9] should have closed our | country |
## [215_Trump, 36] the history of our | country |
## [226_Trump, 9] to shut down this | country |
## [228_Trump, 29] to shut down the | country |
## [238_Trump, 8] shut down the whole | country |
## [284_Trump, 152] He will destroy this | country |
## [286_Trump, 85] close down the whole | country |
## [286_Trump, 94] close down the whole | country |
## [286_Trump, 98] country and destroy our | country |
## [286_Trump, 101] our country. Our | country |
## [479_Wallace, 79] I think that the | country |
## [489_Wallace, 27] race issues facing this | country |
## [501_Trump, 43] as anybody in this | country |
## [502_Trump, 213] the people of this | country |
## [503_Wallace, 73] for Blacks in this | country |
## [504_Biden, 11] systemic injustice in this | country |
## [514_Wallace, 48] systemic racism in this | country |
## [517_Trump, 61] people to hate our | country |
## [517_Trump, 90] core values of this | country |
## [517_Trump, 98] teaching people that our | country |
## [517_Trump, 117] people to hate our | country |
## [523_Biden, 163] going to bring this | country |
## [545_Trump, 115] got to run this | country |
## [647_Trump, 111] head of a major | country |
## [676_Trump, 6] more money than our | country |
## [726_Biden, 199] to determine what this | country |
## [728_Trump, 109] a disgrace to our | country |
## [729_Trump, 73] ballots all over the | country |
## [746_Trump, 69] horrible thing for our | country |
##
## would have been left
## . Wait a minute
## because you thought it
## . And by the
## and I want to
## . We just went
## .
## .
## . This guy will
## and destroy our country
## . Our country is
## is coming back incredibly
## would be better served
## over the next four
## . You did the
## want and demand law
## ?
## , in education and
## , sir?
## And I'm not going
## . They were teaching
## is a horrible place
## . And I'm not
## together is bring everybody
## and they ran it
## , it's a forest
## could make in 100
## is going to look
## , and we've caught
## . There's fraud.
## .
# Readability score per document (Flesch, the default measure); show only the
# first rows.
head(textstat_readability(debate_corp, measure = "Flesch"))
## document Flesch
## 1 1_Wallace 62.15573
## 2 2_Wallace 50.10547
## 3 3_Biden 97.02500
## 4 4_Trump 97.02500
## 5 5_Biden 120.20500
## 6 6_Wallace 70.34232
# dfm_group() is a helper function that groups all texts with the same docvar
# (here: speaker); this way we can compare all texts by one speaker to the
# other texts.
keyness <- dfm_group(debate_dfm, groups = speaker) %>%
  textstat_keyness(target = "Biden")
head(keyness)
## feature chi2 p n_target n_reference
## 1 he 74.55951 0.000000e+00 131 74
## 2 fact 33.01375 9.150953e-09 37 13
## 3 he's 31.47253 2.022813e-08 36 13
## 4 is 29.24807 6.367958e-08 120 110
## 5 his 28.69282 8.481774e-08 33 12
## 6 vote 26.09031 3.258156e-07 21 4
# Plot the keyness statistics currently stored in `keyness`.
textplot_keyness(keyness)
You can do the following tasks either on the same corpus or another corpus you find interesting.
Using the commands above, check in which context the concepts of women and men are used.
Tip: You can use the question mark as a replacement for a single letter or the asterisk as a replacement for multiple letters, e.g. to include both singular and plural.
# Glob pattern: "?" stands for exactly one character, so both "woman" and
# "women" are matched.
kwic(debate_toks, pattern = "wom?n", valuetype = "glob")
## Keyword-in-context with 4 matches.
## [13_Biden, 151] . Once again, a | woman |
## [13_Biden, 169] . They're able to charge | women |
## [490_Biden, 108] Klux Klan. A young | woman |
## [504_Biden, 46] decent, honorable men and | women |
##
## could pay more money because
## more for the same exact
## got killed and they asked
## . They risk their lives
# Glob pattern: any single character between "m" and "n", which covers "man"
# and "men".
kwic(debate_toks, pattern = "m?n", valuetype = "glob")
## Keyword-in-context with 14 matches.
## [3_Biden, 5] How you doing, | man |
## [13_Biden, 177] the same exact procedure a | man |
## [134_Biden, 65] And the fact is this | man |
## [146_Biden, 6] Will you shut up, | man |
## [152_Biden, 14] it? Keep yapping, | man |
## [190_Biden, 5] This is the same | man |
## [192_Biden, 53] . This is the same | man |
## [194_Biden, 7] here's the deal. This | man |
## [329_Trump, 120] believe that. Now the | man |
## [495_Biden, 4] This is a | man |
## [495_Biden, 59] 500 African Americans. This | man |
## [495_Biden, 68] savior of African-Americans? This | man |
## [504_Biden, 44] good, decent, honorable | men |
## [567_Trump, 17] the killer of a young | man |
##
## ?
## gets.
## doesn't know what he's talking
## ?
## .
## who told you-
## .
## is talking about a vaccine
## got fired right after that
## who, in fact,
## is as a savior of
## cares at all? This
## and women. They risk
## in the middle of the
Using the commands above, find out which words are most typical for one of the actors in your dataset. If you are still using the debates, look at a different actor than before and try not to copy the code.
# Group the document-feature matrix by speaker and compute keyness with
# "Trump" as the target group; then show the top rows.
keyness <- textstat_keyness(
  dfm_group(debate_dfm, groups = speaker),
  target = "Trump"
)
head(keyness)
## feature chi2 p n_target n_reference
## 1 they 46.93622 7.333467e-12 99 50
## 2 joe 43.28432 4.733614e-11 30 1
## 3 don't 23.82414 1.055503e-06 43 19
## 4 very 23.39493 1.319258e-06 34 12
## 5 our 23.18091 1.474535e-06 27 7
## 6 left 19.69621 9.078145e-06 20 4
# Plot the keyness statistics currently stored in `keyness`.
textplot_keyness(keyness)
Using the commands above, calculate readability scores for the speakers. You can check the documentation of textstat_readability()
to see different metrics.
Merge them to the original data - you might have to create a document variable to do so. Then use your data wrangling skills to calculate who is on average the most readable.
# calculate readability (one row per corpus document, identified by `document`)
readability_stats <- textstat_readability(debate_corp)
# merging: rebuild the "<row number>_<speaker>" key on the original data, using
# the same pattern as the corpus docnames. seq_len() avoids the backwards
# sequence c(1, 0) that 1:nrow() produces for a zero-row data frame.
first_debate$document <- paste0(
  seq_len(nrow(first_debate)), "_", first_debate$speaker
)
# The join key is stated explicitly so the merge does not depend on whichever
# columns happen to share a name in both tables.
readability_stats <- left_join(first_debate, readability_stats, by = "document")
# analyze: mean Flesch score per speaker.
# Summarising the column directly replaces select() + summarize_all(), which
# only worked because dplyr silently re-added the dropped grouping column.
readability_stats %>%
  group_by(speaker) %>%
  summarise(Flesch = mean(Flesch))
## # A tibble: 3 x 2
## speaker Flesch
## <chr> <dbl>
## 1 Biden 85.9
## 2 Trump 88.4
## 3 Wallace 73.0