Tracking Social Issues and Topics in Presidential Speeches

October 22, 2015
By

(This article was first published on StatOfMind, and kindly contributed to R-bloggers)


Scraping presidential transcripts

To begin, we must scrape the content of all presidential speeches recorded in American history. To do that, I’ll rely on the very handy BeautifulSoup library, and eventually store all data in a pandas dataframe that will be persisted in a pickle file.

 
# import required libraries to scrape presidential transcripts
from bs4 import BeautifulSoup
import pandas as pd
import pickle
import urllib2
import re
 
def get_speech_links():
    '''
    scrape the speeches index page and collect all presidential
    transcript links

    Returns the <a class="transcript"> tags found on the page, or None
    (after printing a notice) when the page cannot be fetched.
    '''
    home_url = 'http://millercenter.org/president/speeches'
    try:
        response = urllib2.urlopen(home_url)
    except urllib2.URLError:
        # URLError is the parent class of HTTPError, so this covers
        # connection/DNS failures as well as HTTP error statuses
        print('Homepage not available!')
        return None
    try:
        page_source = response.read()
    finally:
        # release the connection even if reading fails
        response.close()
    soup = BeautifulSoup(page_source, "html5lib")
    return soup.findAll("a", {'class': 'transcript'})
 
def get_transcript(speech_link):
    '''
    scrape title of speech, date of speech and full transcript
    contained in the input speech_link URL

    speech_link is a site-relative path; its third '/'-separated component
    is used as the speaker name. The date is the text inside the first pair
    of parentheses of the page title.

    Returns a dict with keys 'speaker', 'date', 'title' and 'transcript',
    or None (after printing a notice) when the page cannot be fetched or
    does not have the expected title/transcript structure.
    '''
    speaking = speech_link.split('/')[2]
    new_link = base_url + str(speech_link)
    try:
        response = urllib2.urlopen(new_link)
    except urllib2.URLError:
        # URLError also covers HTTPError (its subclass)
        print('skipped ' + str(speech_link))
        return None
    try:
        page_source = response.read()
    finally:
        # release the connection even if reading fails
        response.close()

    soup = BeautifulSoup(page_source, "html5lib")
    title_tag = soup.find('title')
    transcript_tag = soup.find('div', {'id': 'transcript'})
    # a page without a "(date)" in its title or without a transcript div
    # cannot be keyed or analysed; skip it instead of aborting the scrape
    if (title_tag is None or transcript_tag is None
            or '(' not in title_tag.text):
        print('skipped ' + str(speech_link))
        return None

    title = title_tag.text
    speech_date = title.split('(', 1)[1].split(')')[0]
    # flatten the transcript onto a single line
    transcript = transcript_tag.text
    transcript = transcript.replace('\n', ' ').replace('\r', '').replace('\t', '')
    return {'speaker': speaking,
            'date': speech_date,
            'title': title,
            'transcript': transcript}
# iterate through all links and extract content
base_url = 'http://millercenter.org/'
# get_speech_links() returns None when the index page is unreachable; fall
# back to an empty list so the loop is a no-op instead of a TypeError
transcript_links = get_speech_links() or []
transcript_dict = {}
total_links = len(transcript_links)
for i, link in enumerate(transcript_links):
    if i % 100 == 0:
        print('Scraped {}/{} of links...'.format(i, total_links))
    if not link.has_attr('href'):
        continue
    transcript_data = get_transcript(link['href'])
    if transcript_data is None:
        continue
    # one entry per speaker/date pair
    key = transcript_data['speaker'] + '|' + transcript_data['date']
    transcript_dict[key] = transcript_data

# dump dataframe to pickle object
df = pd.DataFrame.from_dict(transcript_dict, orient='index')
# the with-block guarantees the file is flushed and closed
with open("presidential_speeches.pickle", "wb") as pickle_file:
    pickle.dump(df, pickle_file)

Topic modeling and visualization

Now that the raw text of all presidential speeches in American history has been retrieved, we can proceed to light preprocessing before applying Latent Dirichlet Allocation.

 
%matplotlib inline  
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess
from gensim.models import TfidfModel, LsiModel
from gensim.models.ldamodel import LdaModel
from gensim import corpora
from gensim import matutils
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from collections import defaultdict
import pyLDAvis.gensim as gensimvis
import pyLDAvis
import pandas as pd
import numpy as np
import pickle
import lda

In the following 5 cells, we effectively tokenize and remove stopwords from each document (i.e. presidential speech), compute the frequency of each token, and filter out all those that appear 10 times or fewer in the entire corpus of presidential speeches. Note that I used an ad-hoc threshold of 10, but this should be a parameter that could be played around with. Also, the amount of processing on each document is intentionally simplistic. Finally, we set up gensim-specific objects that include a dictionary mapping words to integer ids, and a corpus that simply counts the number of occurrences of each distinct word, converts the word to its integer word id and returns the result as a sparse vector.

 
def tokenize(text):
    '''
    split text into tokens with simple_preprocess and drop every
    token that appears in STOPWORDS
    '''
    kept = []
    for word in simple_preprocess(text):
        if word in STOPWORDS:
            continue
        kept.append(word)
    return kept
 
# load the scraped speeches; the with-block closes the file handle
# NOTE: only unpickle files you produced yourself - pickle can execute code
with open("presidential_speeches.pickle", "rb") as pickle_file:
    df = pickle.load(pickle_file)
speeches = df.transcript
documents = speeches.tolist()
texts = [tokenize(document) for document in documents]

# count how often each token occurs across the whole corpus
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# keep only tokens that occur more than min_token_count times corpus-wide
min_token_count = 10
texts = [[token for token in text if frequency[token] > min_token_count]
         for text in texts]

# map tokens to integer ids and encode each speech as a bag of words
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

Finding the Optimum Number of Topics

Now that the data is ready, we can run a batch LDA (because of the small size of the dataset that we are working with) to discover the main topics in our documents.

# fit LDA model (20 topics, 10 passes over the corpus)
lda_settings = dict(corpus=corpus,
                    id2word=dictionary,
                    num_topics=20,
                    passes=10)
speeches_topics = LdaModel(**lda_settings)
# print out first 10 topics
shown_topics = speeches_topics.print_topics(10)
for topic_number, topic_terms in enumerate(shown_topics):
    print('{} --- {}'.format(topic_number, topic_terms))
0 --- 0.025*states + 0.017*united + 0.017*shall + 0.014*state + 0.010*constitution + 0.009*president + 0.009*act + 0.009*congress + 0.008*laws + 0.007*law

1 --- 0.026*government + 0.007*states + 0.007*chilean + 0.007*men + 0.006*sailors + 0.006*united + 0.005*mr + 0.005*german + 0.005*police + 0.004*vessels

2 --- 0.012*world + 0.011*peace + 0.009*people + 0.008*america + 0.008*freedom + 0.007*soviet + 0.006*united + 0.006*new + 0.005*states + 0.005*nations

3 --- 0.050*president + 0.030*mr + 0.024*think + 0.008*secretary + 0.008*general + 0.008*people + 0.007*time + 0.007*viet + 0.007*going + 0.007*nam

4 --- 0.011*government + 0.007*people + 0.007*business + 0.006*country + 0.005*economic + 0.005*congress + 0.005*world + 0.005*federal + 0.005*tax + 0.005*public

5 --- 0.006*people + 0.004*government + 0.004*united + 0.004*states + 0.003*country + 0.003*public + 0.003*congress + 0.003*question + 0.003*going + 0.002*time

6 --- 0.013*states + 0.012*government + 0.009*united + 0.008*congress + 0.007*public + 0.005*country + 0.005*great + 0.005*year + 0.004*general + 0.004*people

7 --- 0.011*peace + 0.010*vietnam + 0.010*people + 0.009*war + 0.009*world + 0.008*united + 0.007*south + 0.007*american + 0.007*nations + 0.006*states

8 --- 0.009*world + 0.009*congress + 0.008*new + 0.008*year + 0.007*america + 0.006*people + 0.006*energy + 0.006*american + 0.006*nation + 0.005*government

9 --- 0.014*government + 0.012*people + 0.007*states + 0.007*union + 0.007*constitution + 0.006*great + 0.006*shall + 0.006*men + 0.006*country + 0.005*free

The display of inferred topics shown above does not really lend itself very well to interpretation. Aside from the fact that you have to read through all the topics, most people will interpret the main themes of each topic differently. This hits right to the core of my mixed feelings towards topic modeling. To be given the ability and opportunity to infer topics from a large set of documents is truly amazing, but I have always personally felt (and maybe that is just me) that the ensuing display of information was lacking. Indeed, I have found that the output of typical topic modeling techniques does not lend itself very well to visualization and – in the case of presentations to the uninitiated – interpretation. However, I recently came across the LDAvis R library developed by Kenny Shirley and Carson Sievert, which to paraphrase their words is a D3.js interactive visualization that's designed to help you interpret the topics in a topic model fit to a corpus of text using LDA. Here, we use the great Python port of the LDAvis R library, available on GitHub at the following URL https://github.com/bmabey/pyLDAvis. Two attractive features of pyLDAvis are its ability to help interpret the topics extracted from a fitted LDA model, but also the fact that it can be easily incorporated within an iPython notebook in nothing more than two lines of code!

 
# prepare the visualization data from the fitted model, the bag-of-words corpus and the dictionary
vis_data = gensimvis.prepare(speeches_topics, corpus, dictionary)
# hand the prepared data to pyLDAvis for display in the notebook
pyLDAvis.display(vis_data)


Tracking and visualizing topics propensity over time

Now that we have shown how results gathered from topic modeling methods such as LDA can be visualized in an intuitive way, we can move to additional data analysis. In particular, it would be interesting to uncover the temporal variation of topics across American History. I would personally be very curious to find out whether topic modeling can reverse-engineer the major events in American History. In the next step, we produce a dataframe where each row represents a speech and each of the 20 columns represents a topic. Each cell in the dataframe represents the probability that a given topic was assigned to a speech.

 
# example command to print out main topics inferred in a presidential speech
example_bow = corpus[10]
print(speeches_topics.get_document_topics(example_bow))
[(7, 0.10493554997876908), (10, 0.011621459517891617), (15, 0.86674743636700446)]
 
# extract all document-topic distributions to a dictionary
num_topics = speeches_topics.num_topics
document_key = list(speeches.index)
document_topic = {}
for doc_key, doc_bow in zip(document_key, corpus):
    # Store each probability at its topic id instead of appending in the
    # order returned: if the model leaves a topic out of its answer, an
    # appended list would be short and shift every later topic column.
    topic_probs = [0.0] * num_topics
    doc_topics = speeches_topics.get_document_topics(doc_bow, minimum_probability=0)
    for topic_id, topic_prob in doc_topics:
        topic_probs[topic_id] = topic_prob
    document_topic[doc_key] = topic_probs

# convert dictionary of document-topic distributions to dataframe
df = pd.DataFrame.from_dict(document_topic, orient='index')

# index entries have the form 'speaker|date'
president_speech = [key.split('|')[0] for key in df.index]
speech_dates = [key.split('|')[1] for key in df.index]
# the year is the part after the comma; strip the blank that follows the
# comma so years are clean strings such as '1861'
year_speech = [date.split(',')[1].strip() if ',' in date else None
               for date in speech_dates]

# one named column per topic, followed by the two metadata columns
topic_column_names = ['topic_' + str(i) for i in range(num_topics)]
topics_speech = df
topics_speech.columns = topic_column_names
topics_speech['president'] = pd.Series(president_speech, index=df.index)
topics_speech['year'] = pd.Series(year_speech, index=df.index)
topic_column_names = topic_column_names + ['president', 'year']
print(topics_speech.head(5))
# topics_speech.to_csv('topics_by_speech.csv', sep=',')
                              topic_0   topic_1   topic_2   topic_3   topic_4  \
lincoln|July 4, 1861         0.000011  0.000011  0.000011  0.011112  0.000011   
buchanan|February 24, 1859   0.000033  0.000033  0.000033  0.203170  0.000033   
reagan|November 11, 1988     0.000526  0.197846  0.000526  0.000526  0.000526   
tyler|February 20, 1845      0.000114  0.000114  0.000114  0.000114  0.000114   
eisenhower|January 17, 1961  0.000063  0.000063  0.000063  0.000063  0.000063   

                              topic_5   topic_6   topic_7   topic_8   topic_9  \
lincoln|July 4, 1861         0.008504  0.000011  0.064621  0.051711  0.752269   
buchanan|February 24, 1859   0.000033  0.000033  0.002136  0.249423  0.032370   
reagan|November 11, 1988     0.453219  0.000526  0.000526  0.000526  0.000526   
tyler|February 20, 1845      0.000114  0.000114  0.014633  0.361549  0.128588   
eisenhower|January 17, 1961  0.615735  0.000063  0.000063  0.000063  0.000063   

                             ...    topic_12  topic_13  topic_14  topic_15  \
lincoln|July 4, 1861         ...    0.000011  0.000011  0.000011  0.111633   
buchanan|February 24, 1859   ...    0.000033  0.000033  0.006279  0.490955   
reagan|November 11, 1988     ...    0.101381  0.239133  0.000526  0.000526   
tyler|February 20, 1845      ...    0.000114  0.000114  0.000114  0.493404   
eisenhower|January 17, 1961  ...    0.000063  0.074416  0.000063  0.000063   

                             topic_16  topic_17  topic_18  topic_19  \
lincoln|July 4, 1861         0.000011  0.000011  0.000011  0.000011   
buchanan|February 24, 1859   0.015235  0.000033  0.000033  0.000033   
reagan|November 11, 1988     0.000526  0.000526  0.000526  0.000526   
tyler|February 20, 1845      0.000114  0.000114  0.000114  0.000114   
eisenhower|January 17, 1961  0.010800  0.000063  0.000063  0.000063   

                              president   year  
lincoln|July 4, 1861            lincoln   1861  
buchanan|February 24, 1859     buchanan   1859  
reagan|November 11, 1988         reagan   1988  
tyler|February 20, 1845           tyler   1845  
eisenhower|January 17, 1961  eisenhower   1961  

[5 rows x 22 columns]

Finally, we can compute the normalized frequency of topics by year and plot these as a time-series using the dygraphs library.

 
# topic columns to aggregate
columns = ['topic_%d' % i for i in range(20)]
# total topic weight per year
yearly_totals = topics_speech.groupby('year')[columns].sum()
# express each year's topics as percentages of that year's total
yearly_share = yearly_totals.div(yearly_totals.sum(axis=1), axis=0) * 100
# round to one decimal place and write the table to CSV
df = np.round(yearly_share, 1)
df.to_csv('topics_by_year.csv', sep=',')

At this point, I’m going to do something that I am not very proud of and proceed to some nasty context switching. Although I played around with the charts library, I was not satisfied with the results and temporarily switched to R in order to leverage the dygraphs library. Thankfully, Jupyter notebooks have plenty of magic that makes it easy to call R from the notebook itself!

 
import rpy2
 
%load_ext rpy2.ipython
 
%%R
# R cell: load the per-year topic table and prepare it for plotting
library(htmlwidgets)
library(dygraphs)
library(zoo)
library(xts)

# read the CSV as a zoo series indexed by its first column, parsed with format "%Y"
dt <- read.zoo("topics_by_year.csv", sep = ",", index.column = 1, header = TRUE, format = "%Y")
# keep only the columns whose values sum to at least 500 over all rows
dt_large <- dt[, which(apply(dt, 2, sum)>=500)]
# add an "others" column: 100 minus the row total of the kept columns
dt_large$others <- 100- apply(dt_large, 1, sum)
# convert to xts for dygraph
dt_large <- as.xts(dt_large)
 
%%R
# Interactive stacked time-series of the retained topic columns plus "others".
# NOTE(review): the topic_N series names and their labels are tied to one particular LDA fit;
# a re-fitted model may number its topics differently - confirm before reuse.
dygraph(dt_large) %>%
dyAnnotation("1970-1-1", text = "Vietnam") %>%
  # grey shaded bands over four date ranges (the first contains the "Vietnam" annotation;
  # the others presumably mark earlier wars - confirm)
  dyShading(from = "1961-1-1", to = "1975-1-1", color = "#bdbdbd")  %>%
  dyShading(from = "1938-1-1", to = "1945-1-1", color = "#bdbdbd")  %>%
  dyShading(from = "1914-1-1", to = "1919-1-1", color = "#bdbdbd")  %>%
  dyShading(from = "1861-1-1", to = "1865-1-1", color = "#bdbdbd")  %>%
  # roll period of 8, stacked display, range selector and a legend that follows the cursor
  dyRoller(rollPeriod = 8) %>%
  dyOptions(stackedGraph = TRUE) %>%
  dyRangeSelector(height = 20) %>%
  dyLegend(show = "follow") %>%
  #dyLegend(width = 900) %>%
  # display label for each plotted series
  dySeries("topic_1", label = "(sailors/chili)") %>%
  dySeries("topic_4", label = "(world/countries)") %>%
  dySeries("topic_6", label = "(law/congress)") %>%
  dySeries("topic_7", label = "(bank/silver/bank)") %>%
  dySeries("topic_10", label = "(work/people/children)") %>%
  dySeries("topic_12", label = "(peace/war)") %>%
  dySeries("topic_13", label = "(law/constition/congress)") %>%
  dySeries("topic_15", label = "(public)") %>%
  dySeries("topic_18", label = "(business/economic/tax)") %>%
  dySeries("others", label = "(soviet/german/war)")

Tracking and visualizing topics propensity over time



Clustering individual presidential speeches

We can also wrangle the data a little bit more in order to visualize how individual speeches cluster together. This time, we use document-topic distributions and apply the t-sne dimensionality reduction algorithm to map all speeches into two-dimensional space. Roughly, t-sne is considered to be useful because of its property to conserve the overall topology of the data, so that neighboring (i.e. similar) speeches will hopefully be mapped into neighboring locations in two-dimensional space. Other well-known techniques such as k-means or MDS would likely be just as adequate for this exercise, but I’ve had good fortune when using t-sne, so am unwisely (and perhaps not very smartly) sticking to it here.

 
# perform T-SNE on dataframe of document-topic distributions
# the aim is to show a 2-D representation of presidential speeches based on their inferred topics
from sklearn import manifold
from time import time

# `df` was last rebound to the per-year table written to topics_by_year.csv,
# so rebuild the per-speech matrix from the topic columns of topics_speech
# (this drops the non-numeric 'president' and 'year' columns)
topic_columns = [c for c in topics_speech.columns if str(c).startswith('topic_')]
df = topics_speech[topic_columns]

t0 = time()
# fixed random_state so repeated runs use the same seed
tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
Y = tsne.fit_transform(df)
t1 = time()
print("t-SNE: %.2g sec" % (t1 - t0))
t-SNE: 13 sec
 
# output T-SNE 2-D representation of each speech to file
# rows are labelled with the index of the frame that was embedded
president_clusters = pd.DataFrame(Y, index=df.index)
print(president_clusters.head(5))
president_clusters.to_csv('speech_tsne_clusters.csv', sep=',')
                                     0         1
lincoln|July 4, 1861         12.246211 -4.594903
buchanan|February 24, 1859   13.982249 -1.675186
reagan|November 11, 1988     -7.665759  4.714818
tyler|February 20, 1845      11.953091  1.884652
eisenhower|January 17, 1961 -13.193183 -3.790267

We can now leverage the mpld3 library to display the t-sne clusters inline. The interactive figure below shows the 2-dimensional t-sne coordinates of all 880 presidential speeches in American history. One of the challenges here was to generate distinct colors to map the different presidents, and I don’t think I did a particularly good job at it (the figure could probably benefit from a legend too, but I opted to waste my time on adding tooltip functionality instead!)

import matplotlib.pyplot as plt
import numpy as np
import mpld3
import colorsys
from mpld3 import plugins
mpld3.enable_notebook()

# the 'president|date' keys are the index of president_clusters; its two
# data columns hold the t-SNE coordinates
presidents = [key.split('|')[0] for key in president_clusters.index]
# sorted so every president gets the same colour on every run
unique_presidents = sorted(set(presidents))
N = len(unique_presidents)
# evenly spaced hues with fixed saturation and value
HSV_tuples = [(x * 1.0 / N, 0.7, 0.9) for x in range(N)]
# built as a real list so it can be paired with the presidents below
RGB_tuples = [colorsys.hsv_to_rgb(*hsv) for hsv in HSV_tuples]

president_colors = dict(zip(unique_presidents, RGB_tuples))

# NOTE(review): 'axisbg' is the older matplotlib spelling; newer releases
# name this option 'facecolor' - confirm against the installed version
fig, ax = plt.subplots(subplot_kw=dict(axisbg='#EEEEEE'), figsize=(10, 10))
ax.grid(color='white', linestyle='solid')
# positional column access: first column is x, second column is y
scatter = ax.scatter(president_clusters.iloc[:, 0],
                     president_clusters.iloc[:, 1],
                     s=80,
                     label=presidents,
                     alpha=0.3,
                     c=[president_colors[p] for p in presidents])

ax.set_title("Clustering presidential speeches", size=20)
# NOTE(review): the HTML markup around the tooltip text was lost from the
# published listing; a plain <h1> wrapper is assumed here
labels = ['<h1>{president}</h1>'.format(president=x) for x in presidents]
tooltip = plugins.PointHTMLTooltip(scatter, labels)
plugins.connect(fig, tooltip)