library(tidyverse)
library(quanteda)
library(quanteda.textstats)

# Read the debate transcript; keep strings as character (FALSE, never the
# reassignable shorthand F) and force UTF-8 so names/apostrophes survive.
first_debate <- read.csv("../data/us_election_2020_1st_presidential_debate.csv",
  stringsAsFactors = FALSE, encoding = "UTF-8")

# optional / bonus: reduce the speaker variable to the last name with a
# regular expression. NOTE: the original pattern "[A-z]*$" is a classic bug -
# the ASCII range [A-z] also matches the characters [ \ ] ^ _ ` that sit
# between "Z" and "a". "[A-Za-z]*$" matches letters only; results are the
# same for these speakers, but the pattern is now correct in general.
first_debate <- first_debate %>%
  mutate(speaker = str_extract(speaker, "[A-Za-z]*$"))

debate_corp <- corpus(first_debate)
# optional: rename documents as "<row>_<speaker>" (e.g. "650_Biden");
# seq_len() is safe even for a zero-row data frame, unlike 1:nrow().
docnames(debate_corp) <- paste0(seq_len(nrow(first_debate)), "_",
                                first_debate$speaker)
debate_toks <- tokens(debate_corp)

debate_dfm <- dfm(debate_toks)

Loading and applying a dictionary

# Read in the newsmap geographic dictionary (YAML format). As printed below,
# it has 3 nested levels: continent > region > country (ISO-2 code), with
# glob patterns (e.g. "kenyan*") as entries.
newsmap_dict <- dictionary(file = "english.yml",
                           format = "YAML")
## Warning in readLines(con): incomplete final line found on 'english.yml'
newsmap_dict
## Dictionary object with 5 primary key entries and 3 nested levels.
## - [AFRICA]:
##   - [EAST]:
##     - [BI]:
##       - burundi, burundian*, bujumbura
##     - [DJ]:
##       - djibouti, djiboutian*
##     - [ER]:
##       - eritrea, eritrean*, asmara
##     - [ET]:
##       - ethiopia, ethiopian*, addis ababa
##     - [KE]:
##       - kenya, kenyan*, nairobi
##     - [KM]:
##       - comoros, comorian*, moroni
##     [ reached max_nkey ... 13 more keys ]
##   - [MIDDLE]:
##     - [AO]:
##       - angola, angolan*, luanda
##     - [CD]:
##       - democratic republic congo, dr congo, drc, democratic republic congolese, dr congolese, kinshasa
##     - [CF]:
##       - central african republic, central african*, bangui
##     - [CG]:
##       - congo, congo republic, congolese, brazzaville
##     - [CM]:
##       - cameroon, cameroonian*, yaounde, yaoundé
##     - [GA]:
##       - gabon, gabonese, libreville
##     [ reached max_nkey ... 3 more keys ]
##   - [NORTH]:
##     - [DZ]:
##       - algeria, algerian*, algiers
##     - [EG]:
##       - egypt, egyptian*, cairo
##     - [EH]:
##       - western sahara, western saharan*, el aaiun
##     - [LY]:
##       - libya, libyan*, tripoli
##     - [MA]:
##       - morocco, moroccan*, rabat
##     - [SD]:
##       - sudan, sudanese, khartoum
##     [ reached max_nkey ... 2 more keys ]
##   - [SOUTH]:
##     - [BW]:
##       - botswana, botswanan*, gaborone
##     - [LS]:
##       - lesotho, lesothonian*, maseru
##     - [NA]:
##       - namibia, namibian*, windhoek
##     - [SZ]:
##       - swaziland, swazi*, lobamba, mbabane
##     - [ZA]:
##       - south africa, s african, sa, south african*, s african*, cape town, johannesburg, pretoria
##   - [WEST]:
##     - [BF]:
##       - burkina faso, burkinabe*, ouagadougou
##     - [BJ]:
##       - benin, beninese, beninois, porto novo
##     - [CI]:
##       - ivory coast, côte d'ivoire, i coast, ivorian*, yamoussoukro, abidjan
##     - [CV]:
##       - cape verde, cape verdean*, praia
##     - [GH]:
##       - ghana, ghanaian*, accra
##     - [GM]:
##       - gambia, gambian*, banjul
##     [ reached max_nkey ... 11 more keys ]
## - [AMERICA]:
##   - [CARIB]:
##     - [AG]:
##       - antigua and barbuda, antiguan*, barbudan*
##     - [AI]:
##       - anguilla, anguillan*, the valley
##     - [AW]:
##       - aruba, aruban*, oranjestad
##     - [BB]:
##       - barbados, barbadian*, bridgetown
##     - [BL]:
##       - saint barthelemy, saint-barthelemy, saint-barthélemy, st barthelemy, barthelemois, gustavia
##     - [BQ]:
##       - bonaire, bonairean*, kralendijk
##     [ reached max_nkey ... 22 more keys ]
##   - [CENTER]:
##     - [BZ]:
##       - belize, belizean*, belmopan
##     - [CR]:
##       - costa rica, costa rican*, ticos, san jose
##     - [GT]:
##       - guatemala, guatemalan*, guatemala city
##     - [HN]:
##       - honduras, honduran*, tegucigalpa
##     - [MX]:
##       - mexico, mexican*, mexico city
##     - [NI]:
##       - nicaragua, nicaraguan*, managua
##     [ reached max_nkey ... 2 more keys ]
##   - [SOUTH]:
##     - [AR]:
##       - argentina, argentine*, argentinian*, buenos aires
##     - [BO]:
##       - bolivia, bolivian*, sucre, la paz
##     - [BR]:
##       - brazil, brazilian*, brasilia, sao paulo, rio
##     - [CL]:
##       - chile, chilean*, santiago
##     - [CO]:
##       - colombia, colombian*, bogota
##     - [EC]:
##       - ecuador, ecuadorian*, quito
##     [ reached max_nkey ... 8 more keys ]
##   - [NORTH]:
##     - [BM]:
##       - bermuda, bermudan*
##     - [CA]:
##       - canada, canadian*, ottawa, toronto, quebec
##     - [GL]:
##       - greenland, greenlander*, nuuk
##     - [PM]:
##       - saint pierre and miquelon, st pierre and miquelon, saint pierrais, miquelonnais, saint pierre
##     - [US]:
##       - united states, us, american*, washington, new york
## - [ASIA]:
##   - [CENTER]:
##     - [KG]:
##       - kyrgyzstan, kyrgyz*, bishkek
##     - [KZ]:
##       - kazakhstan, kazakh*, astana
##     - [TJ]:
##       - tajikistan, tajiks*, dushanbe
##     - [TM]:
##       - turkmenistan, turkmen*, ashhabad
##     - [UZ]:
##       - uzbekistan, uzbek*, tashkent
##   - [EAST]:
##     - [CN]:
##       - china, chinese, beijing, shanghai
##     - [HK]:
##       - hong kong, hongkongese
##     - [JP]:
##       - japan, japanese, tokyo
##     - [KP]:
##       - north korea, n korea, north korean*, n korean*, dprk, pyongyang
##     - [KR]:
##       - south korea, s korea, south korean, s korean*, seoul
##     - [MN]:
##       - mongolia, mongolian*, ulan bator
##     [ reached max_nkey ... 2 more keys ]
##   - [SOUTH]:
##     - [AF]:
##       - afghanistan, afghan*, kabul
##     - [BD]:
##       - bangladesh, bangladeshi*, dhaka, dacca
##     - [BT]:
##       - bhutan, bhutanese, thimphu
##     - [IN]:
##       - india, indian*, mumbai, new delhi
##     - [IR]:
##       - iran, iranian*, tehran
##     - [LK]:
##       - sri lanka, sri lankan*, colombo
##     [ reached max_nkey ... 3 more keys ]
##   - [SOUTH-EAST]:
##     - [BN]:
##       - brunei, bruneian*
##     - [ID]:
##       - indonesia, indonesian*, jakarta
##     - [KH]:
##       - cambodia, cambodian*, phnom penh
##     - [LA]:
##       - laos, laotian*, vientiane
##     - [MM]:
##       - myanmar, burma, myanmarese, burmese, yangon, naypyidaw
##     - [MY]:
##       - malaysia, malaysian*, kuala lumpur, putrajaya
##     [ reached max_nkey ... 5 more keys ]
##   - [WEST]:
##     - [AE]:
##       - united arab emirates, uae, emirati*, emiri*, dubai, abu dhabi
##     - [AM]:
##       - armenia, armenian*, yerevan
##     - [AZ]:
##       - azerbaijan, azerbaijani*, azeri*, baku
##     - [BH]:
##       - bahrain, bahraini*, manama
##     - [CY]:
##       - cyprus, cypriot*, nicosia
##     - [GE]:
##       - georgia, georgian*, tbilisi
##     [ reached max_nkey ... 12 more keys ]
## - [EUROPE]:
##   - [EAST]:
##     - [BG]:
##       - bulgaria, bulgarian*, sofia
##     - [BY]:
##       - belarus, belarusian*, minsk
##     - [CZ]:
##       - czech republic, czech*, prague
##     - [HU]:
##       - hungary, hungarian*, budapest
##     - [MD]:
##       - moldova, moldovan*, chisinau
##     - [PL]:
##       - poland, polish, pole*, warsaw
##     [ reached max_nkey ... 4 more keys ]
##   - [NORTH]:
##     - [AX]:
##       - aland islands, aland island*, alandish, mariehamn
##     - [DK]:
##       - denmark, danish, dane*, copenhagen
##     - [EE]:
##       - estonia, estonian*, tallinn
##     - [FI]:
##       - finland, finnish, finn*, helsinki
##     - [FO]:
##       - faeroe islands, faeroe island*, faroese*, torshavn
##     - [GB]:
##       - uk, united kingdom, britain, british, briton*, brit*, london
##     [ reached max_nkey ... 10 more keys ]
##   - [SOUTH]:
##     - [AD]:
##       - andorra, andorran*
##     - [AL]:
##       - albania, albanian*, tirana
##     - [BA]:
##       - bosnia, bosnian*, bosnia and herzegovina, herzegovina, sarajevo
##     - [ES]:
##       - spain, spanish, spaniard*, madrid, barcelona
##     - [GI]:
##       - gibraltar, gibraltarian*, llanitos
##     - [GR]:
##       - greece, greek*, athens
##     [ reached max_nkey ... 11 more keys ]
##   - [WEST]:
##     - [AT]:
##       - austria, austrian*, vienna
##     - [BE]:
##       - belgium, belgian*, brussels
##     - [CH]:
##       - switzerland, swiss*, zurich, bern
##     - [DE]:
##       - germany, german*, berlin, frankfurt
##     - [FR]:
##       - france, french*, paris
##     - [LI]:
##       - liechtenstein, liechtenstein*, vaduz
##     [ reached max_nkey ... 3 more keys ]
## - [OCEANIA]:
##   - [AU-NZ]:
##     - [AU]:
##       - australia, australian*, aussie*, oz, canberra, sydney
##     - [CK]:
##       - cook islands, cook island*, avarua
##     - [NF]:
##       - norfolk island, norfolk islander*
##     - [NZ]:
##       - new zealand, n zealand, nz, new zealander*, kiwi*, wellington, auckland
##   - [MEL]:
##     - [FJ]:
##       - fiji, fijian*
##     - [NC]:
##       - new caledonia, new caledonian*, noumea
##     - [PG]:
##       - papua new guinea, papua new guinean*, papuan*, port moresby
##     - [SB]:
##       - solomon islands, solomon island*, honiara
##     - [VU]:
##       - vanuatu, vanuatuan*, port vila
##   - [MIC]:
##     - [FM]:
##       - micronesia, micronesian*, palikir
##     - [GU]:
##       - guam, guamanian*, hagatna
##     - [KI]:
##       - kiribati, kiribati*, tarawa
##     - [MH]:
##       - marshall islands, marshall island*, marshallese, majuro
##     - [MP]:
##       - northern mariana islands, northern mariana island*, capital hill
##     - [NR]:
##       - nauru, nauruan*, yaren
##     [ reached max_nkey ... 1 more key ]
##   - [POL]:
##     - [AS]:
##       - american samoa, american samoan*, pago pago
##     - [NU]:
##       - niue, niuean*, alofi
##     - [PF]:
##       - french polynesia, french polynesian*, papeete
##     - [PN]:
##       - pitcairn islands, pitcairn island*, adamstown
##     - [TK]:
##       - tokelau, tokelauan*, nukunonu
##     - [TO]:
##       - tonga, tongan*, nuku'alofa
##     [ reached max_nkey ... 3 more keys ]
# apply dictionary at dfm level: each feature (word) matching a dictionary
# entry is replaced by its key, and counts are summed per key
dict_dfm_results <- dfm_lookup(debate_dfm,newsmap_dict)
# snippet: a few documents x the AMERICA.NORTH.* keys
dict_dfm_results[650:655,111:113]
## Document-feature matrix of: 6 documents, 3 features (94.44% sparse) and 2 docvars.
##              features
## docs          AMERICA.NORTH.GL AMERICA.NORTH.PM AMERICA.NORTH.US
##   650_Biden                  0                0                0
##   651_Trump                  0                0                0
##   652_Wallace                0                0                0
##   653_Trump                  0                0                0
##   654_Wallace                0                0                0
##   655_Biden                  0                0                1
# apply dictionary at tokens level: matching happens on the token stream,
# so multi-word entries such as "united states" can also match (see the
# comparison further below)
dict_toks_results <- tokens_lookup(debate_toks,newsmap_dict)
# snippet: same documents as above, now as sequences of matched keys
dict_toks_results[650:655]
## Tokens consisting of 6 documents and 2 docvars.
## 650_Biden :
## character(0)
## 
## 651_Trump :
## character(0)
## 
## 652_Wallace :
## character(0)
## 
## 653_Trump :
## character(0)
## 
## 654_Wallace :
## character(0)
## 
## 655_Biden :
## [1] "AMERICA.NORTH.US"

Dictionary results with textstat_frequency()

# overall frequency, rank and document frequency of each dictionary key
dict_dfm_results %>% textstat_frequency()
##              feature frequency rank docfreq group
## 1   AMERICA.NORTH.US        44    1      35   all
## 2       ASIA.EAST.CN        10    2       9   all
## 3     EUROPE.EAST.RU         6    3       6   all
## 4     EUROPE.WEST.FR         5    4       4   all
## 5      ASIA.SOUTH.IN         2    5       2   all
## 6  AMERICA.CENTER.MX         1    6       1   all
## 7   AMERICA.SOUTH.BR         1    6       1   all
## 8       ASIA.EAST.JP         1    6       1   all
## 9       ASIA.WEST.IQ         1    6       1   all
## 10    EUROPE.EAST.UA         1    6       1   all
## 11   EUROPE.NORTH.IE         1    6       1   all
## 12    EUROPE.WEST.DE         1    6       1   all

Differences between tokens and dfm

We can apply dictionaries to tokens and dfms. How are the results different?

# re-read the dictionary (identical to the object created above)
newsmap_dict <- dictionary(file = "english.yml",
                           format = "YAML")
## Warning in readLines(con): incomplete final line found on 'english.yml'
# lookup on the dfm: the dfm only contains single tokens, so multi-word
# dictionary entries cannot match here
debate_dfm %>% dfm_lookup(newsmap_dict) %>% 
  textstat_frequency() %>% head(4)
##            feature frequency rank docfreq group
## 1 AMERICA.NORTH.US        44    1      35   all
## 2     ASIA.EAST.CN        10    2       9   all
## 3   EUROPE.EAST.RU         6    3       6   all
## 4   EUROPE.WEST.FR         5    4       4   all
# lookup on tokens: multi-word entries ("united states", "new york") match
# too, which explains the higher US count below (58 vs 44 at dfm level)
debate_toks %>% tokens_lookup(newsmap_dict) %>% 
  dfm() %>% 
  textstat_frequency() %>% head(4)
##            feature frequency rank docfreq group
## 1 america.north.us        58    1      40   all
## 2     asia.east.cn        10    2       9   all
## 3   europe.east.ru         6    3       6   all
## 4   europe.west.fr         5    4       4   all

To understand, have a look at the dictionary key and what it selects in specific texts:

# the US entry mixes single-word ("us", "american*") and multi-word
# ("united states", "new york") patterns
newsmap_dict$AMERICA$NORTH$US
## [1] "united states" "us"            "american*"     "washington"   
## [5] "new york"
# at token level the multi-word pattern "united states" still matches
tokens_select(debate_toks,newsmap_dict)[12]
## Tokens consisting of 1 document and 2 docvars.
## 12_Biden :
## [1] "American" "United"   "States"   "United"   "States"   "American"
# at dfm level "united" and "states" are separate features, so only the
# single-token pattern "american*" selects anything
debate_toks[12] %>% dfm() %>% dfm_select(newsmap_dict)
## Document-feature matrix of: 1 document, 1 feature (0.00% sparse) and 2 docvars.
##           features
## docs       american
##   12_Biden        2

Exercises

You can do the following tasks either on the same corpus or another corpus you find interesting.

You can keep using the newsmap dictionary or use a different dictionary

Exercise: Grouping

You can also apply dictionaries by group, using docvars to form these groups. For that, you can either group the dfm before applying the dictionary with dfm_group() or specify the group inside the textstat_frequency() command.

→ E.g. How do the candidates differ in geographic focus?

Or, a different grouping variable if you chose to use your own data

# levels = 1 keeps only the top dictionary level (continents);
# groups = speaker aggregates frequencies per candidate/moderator
dfm_lookup(debate_dfm, newsmap_dict, levels = 1) %>%
  textstat_frequency(groups=speaker)
##   feature frequency rank docfreq   group
## 1 AMERICA        27    1      20   Biden
## 2  EUROPE         4    2       3   Biden
## 3    ASIA         3    3       3   Biden
## 4    ASIA        11    1       8   Trump
## 5  EUROPE         9    2       8   Trump
## 6 AMERICA         8    3       6   Trump
## 7 AMERICA        11    1      10 Wallace
## 8  EUROPE         1    2       1 Wallace

Exercise: Levels

As you may have seen when looking at the newsmap dictionary, this is a dictionary with multiple levels: continents, regions and countries.

While the dictionary normally gives you results for all levels, you can also apply each level separately by specifying levels=1 (or a different level) inside the dfm_lookup() command.

Apply the first level of the newsmap dictionary - if you are working with a different dictionary, check whether this still works or whether your dictionary only has a single level.

# apply only the first (continent) level of the nested dictionary
dfm_lookup(debate_dfm, newsmap_dict, levels = 1) %>%
  textstat_frequency()
##   feature frequency rank docfreq group
## 1 AMERICA        46    1      36   all
## 2    ASIA        14    2      11   all
## 3  EUROPE        14    2      12   all

Exercise: Weighting

dfm_weight() allows you to weigh your dfm by different characteristics. You can consult the documentation to see the different options.

Try relative frequency and a binary weighting (all non-zeros coded as 1) - How do the results for textstat_frequency differ?

# unweighted raw counts as a baseline for the weighted versions below
debate_dfm %>% dfm_lookup(newsmap_dict,levels=1) %>% textstat_frequency()
##   feature frequency rank docfreq group
## 1 AMERICA        46    1      36   all
## 2    ASIA        14    2      11   all
## 3  EUROPE        14    2      12   all
# "prop": counts become within-document proportions before the lookup,
# so frequencies are sums of shares (note the changed EUROPE/ASIA ranking)
dfm_weight(debate_dfm,"prop") %>% dfm_lookup(newsmap_dict,levels=1) %>% textstat_frequency()
##   feature frequency rank docfreq group
## 1 AMERICA 1.4154382    1      36   all
## 2  EUROPE 0.4120606    2      12   all
## 3    ASIA 0.2823297    3      11   all
# "boolean": every non-zero count is recoded as 1, so each word counts
# at most once per document
dfm_weight(debate_dfm,"boolean") %>% dfm_lookup(newsmap_dict,levels=1) %>% textstat_frequency()
##   feature frequency rank docfreq group
## 1 AMERICA        40    1      36   all
## 2    ASIA        13    2      11   all
## 3  EUROPE        13    2      12   all

Exercise: Dictionary Creation

I’ve asked you to think about a concept you want to measure and what words might be indicators for this concept.

Check the documentation of the dictionary() function for the syntax of specifying your own dictionary and create it. Apply it to the presidential debate or your dataset.

If you do not have an idea for your own concept: Try to create a topic-dictionary (e.g. economy, covid, foreign relations, …) and apply it to the presidential debate.

# A small hand-made topic dictionary: one character vector of glob
# patterns per topic key.
topic_keys <- list(
  economy = c("jobs", "econom*", "labor*", "employ*"),
  corona  = c("corona*", "virus*", "covid")
)
topic_dictionary <- dictionary(topic_keys)
# Count topic mentions per statement, then summarise overall frequency.
topics <- dfm_lookup(debate_dfm, topic_dictionary)
textstat_frequency(topics)
##   feature frequency rank docfreq group
## 1 economy        54    1      34   all
## 2  corona        20    2      17   all

Exercise: Wrangling

We can use our dictionary to classify the statements with a bit of wrangling. For that, it is often easier to transform the results into a data.frame and work with them outside of quanteda.

Use convert() to convert the results of applying the dictionary into a dataframe and bind them to the original data

# convert() turns the dfm of dictionary hits into a plain data.frame (one
# row per statement, one column per key) which we bind back to the
# original data for wrangling outside of quanteda
topic_classification<-convert(topics,"data.frame") %>% bind_cols(first_debate)

Now, we need to find a decision rule:

  • e.g. assign each statement to the class with most hits
  • e.g. assign only statements to a class that only have hits from one dictionary key

There are several ways to implement this and they are a question of data management, not text analysis - one is using case_when() inside a mutate() function.

# Decision rule 1 - "is it mentioned": label a statement by the topics it
# mentions; statements hitting both keys are classed "both". case_when()
# evaluates conditions in order and returns the first match, so the
# economy>0 & corona>0 case must come first.
topic_classification<- topic_classification %>% 
    mutate(class=case_when(economy>0 & corona>0 ~ "both",
                           economy>0~"economy",
                           corona>0~"corona",
                           TRUE~"other"))

# Decision rule 2 - majority: assign the class with more hits; ties
# (including 0-0, i.e. statements with no hits at all) become "unknown".
topic_classification<- topic_classification %>% 
    mutate(class_majority=case_when(economy>corona ~ "economy",
                                    economy<corona~"corona",
                                    economy==corona~"unknown"))

Transfer: EUI Theses

In preparation for this class, I downloaded all thesis abstracts listed in cadmus. You can download the data from the course webpage.

Have a look at how thesis abstracts of the different departments consider different continents. Caution: For some theses, cadmus has no abstract. You may want to filter those out. And maybe you want to plot the results?

load("../data/theses.RData")
# Some theses have no abstract in cadmus - drop them.
# !is.na(x) is the idiomatic filter; never compare against T/F with ==.
theses_clean <- theses %>%
  filter(!is.na(abstract))
theses_toks <- corpus(theses_clean,text_field="abstract") %>%
  tokens()

# Continent-level lookup per department, weighted to token shares.
# NOTE: the argument is spelled `levels` - the original `level=` only
# worked through R's partial argument matching. nomatch = "NN" keeps
# non-geographic tokens as their own category, so the proportions are
# relative to the full abstract length, not just to geographic mentions.
stats <- theses_toks %>%
  tokens_lookup(newsmap_dict, levels = 1, nomatch = "NN") %>%
  dfm() %>% dfm_group(department) %>%
  dfm_weight("prop") %>%
  textstat_frequency(groups=department)

stats
##    feature    frequency rank docfreq      group
## 1       nn 9.961407e-01    1       1 eco_theses
## 2   europe 2.206712e-03    2       1 eco_theses
## 3  america 1.185986e-03    3       1 eco_theses
## 4     asia 2.721935e-04    4       1 eco_theses
## 5   africa 1.749815e-04    5       1 eco_theses
## 6  oceania 1.944239e-05    6       1 eco_theses
## 7       nn 9.857310e-01    1       1 hec_theses
## 8   europe 1.130117e-02    2       1 hec_theses
## 9     asia 1.198830e-03    3       1 hec_theses
## 10 america 8.991228e-04    4       1 hec_theses
## 11  africa 8.406433e-04    5       1 hec_theses
## 12 oceania 2.923977e-05    6       1 hec_theses
## 13      nn 9.964496e-01    1       1 law_theses
## 14  europe 1.870792e-03    2       1 law_theses
## 15 america 8.807745e-04    3       1 law_theses
## 16    asia 5.666999e-04    4       1 law_theses
## 17  africa 2.321421e-04    5       1 law_theses
## 18      nn 9.914544e-01    1       1 sps_theses
## 19  europe 5.783370e-03    2       1 sps_theses
## 20 america 1.137379e-03    3       1 sps_theses
## 21    asia 1.106914e-03    4       1 sps_theses
## 22  africa 4.976034e-04    5       1 sps_theses
## 23 oceania 2.031034e-05    6       1 sps_theses
# drop the "nn" (no match) category and plot continent shares (in percent)
# per department; jitter separates points with near-identical frequencies
stats %>%
  filter(feature!="nn") %>%
  ggplot(aes(x=frequency*100,y=feature,color=group))+geom_jitter(size=3)+
  theme_minimal()+theme(legend.position="bottom")+
  xlab("Frequency in Percent")

Other Options

Another idea you might consider is a targeted dictionary analysis that applies the dictionary only to words surrounding a concept - e.g. to find sentiment about a topic or the way media talk about male and female politicians.