We use Trump speeches that mention democracy, simply to restrict the sample and reduce the time the script takes to run.
# Build the URLs of all search-result pages for the query below.
# stringr is needed to parse the page number out of the pager link.
library(stringr)

# Replace with your own search URL.
startpage <- "https://www.presidency.ucsb.edu/advanced-search?field-keywords=democracy&field-keywords2=&field-keywords3=&from%5Bdate%5D=&to%5Bdate%5D=&person2=200301&items_per_page=100"
searchresults <- read_html(startpage)

# The href of the "last page" pager link ends in the index of the final
# results page; pull out those trailing digits and convert them to a number.
last_page <- searchresults %>%
  html_nodes(".pager-last a") %>%
  html_attr("href") %>%
  str_extract("[0-9]*$") %>%
  as.numeric()

# If no pager link was found (presumably because all hits fit on a single
# page) or its href carries no trailing number, `0:last_page` would stop with
# an error. Fall back to page 0, i.e. only the first results page.
if (length(last_page) == 0 || is.na(last_page[1])) {
  last_page <- 0
}
# Guard against the selector matching more than one pager link.
last_page <- last_page[1]

# Pages are requested as page=0, page=1, ..., page=last_page.
resulturls <- paste0(startpage, "&page=", 0:last_page)
In the long exercise file, we discussed two options: merging the tables and URLs inside the loop or putting them in a list and extracting them afterwards. I do the second here because it is more robust to interruptions.
# Collect the results table and the document links from every results page.
# Each page's output goes into its own list slot and the pieces are combined
# only after the loop, so an interrupted run keeps what was already scraped.
n_pages <- length(resulturls)
empty_list <- vector("list", n_pages)   # one results table per page
empty_list2 <- vector("list", n_pages)  # one character vector of hrefs per page

# seq_along() stays correct for a zero-length vector, unlike 1:length(x).
for (i in seq_along(resulturls)) {
  page <- read_html(resulturls[i])
  # The first table on the page is taken to be the search-results table.
  empty_list[[i]] <- html_table(page)[[1]]
  empty_list2[[i]] <- page %>%
    html_nodes(".views-field-title a") %>%
    html_attr("href")
}

# Stack the per-page tables into one data frame (NULL slots are ignored).
results_df <- do.call("rbind", empty_list)

# Flatten the per-page link vectors. Check that there is exactly one link per
# table row before attaching them, so links cannot be silently recycled or
# misaligned with the rows they belong to.
links <- unlist(empty_list2)
stopifnot(length(links) == nrow(results_df))

# The scraped hrefs are site-relative; prefix the host to get full URLs.
results_df$links <- paste0("https://www.presidency.ucsb.edu", links)
Now, we download all speeches for which we collected the URLs. I’ve also built in a time measure so you can get a feeling for how long the scraping takes.
# Download the text of every collected document and report how long the
# scraping took.
start <- Sys.time()

# One (initially empty) string per document.
speechtext <- character(length(results_df$links))

# seq_along() stays correct for a zero-length vector, unlike 1:length(x).
for (i in seq_along(results_df$links)) {
  content_text <- read_html(results_df$links[i]) %>%
    html_nodes(".field-docs-content") %>%
    html_text()
  # html_nodes() may match zero or several nodes. Assigning that result
  # straight into speechtext[i] would error (zero matches) or keep only the
  # first element with a warning (several matches). Collapsing always yields
  # exactly one string: "" for no match, the unchanged text for one match.
  speechtext[i] <- paste(content_text, collapse = "\n")
}

results_df$text <- speechtext
end <- Sys.time()

# Elapsed scraping time.
end - start
## Time difference of 6.170393 mins
As an example: check speeches 100-103.
# Print rows 100 through 103 of the scraped results as a sample.
rows_to_show <- 100:103
results_df[rows_to_show, ]
## # A tibble: 4 x 5
## Date Related `Document Title` links text
## <chr> <chr> <chr> <chr> <chr>
## 1 Apr 16, 2018 Donald J. Trump "Remarks at the \"Tax ~ https://www~ "\n The ~
## 2 Apr 24, 2018 Donald J. Trump "Remarks at a Welcomin~ https://www~ "\n Pres~
## 3 Apr 27, 2018 Donald J. Trump "President Donald J. T~ https://www~ "\n \"Th~
## 4 Apr 27, 2018 Donald J. Trump "Press Release - What ~ https://www~ "\n Memb~
Can you analyze this data, using any of the methods we’ve used in the class?