# K-means clustering of inaugural addresses

## Global count across all inaugural addresses

... in order to determine which words we want to include in the clustering

In [1]:
import nltk
import string

stopwords = nltk.corpus.stopwords.words("english")

global_count = nltk.FreqDist()

for fileid in nltk.corpus.inaugural.fileids():
    words = nltk.corpus.inaugural.words(fileids = fileid)
    
    # remove stopwords etc
    cleaned_words = [ ]
    for word in words:
        if word not in stopwords and len(word) >= 4 and word.strip(string.punctuation) != "":
            cleaned_words.append(word.lower())
    
    # storing counts globally in a counter across all speeches
    global_count.update(cleaned_words)
    


In [2]:
global_count.most_common(20)

[('government', 600),
 ('people', 594),
 ('must', 374),
 ('upon', 371),
 ('great', 346),
 ('world', 346),
 ('states', 335),
 ('nation', 330),
 ('country', 322),
 ('shall', 316),
 ('every', 301),
 ('peace', 259),
 ('citizens', 248),
 ('power', 241),
 ('america', 240),
 ('public', 227),
 ('time', 223),
 ('would', 213),
 ('constitution', 209),
 ('united', 204)]

In [3]:
# We decide on lower and upper count limits
# for words to consider in the clustering.
# we're left with 184 words. 

lower_limit = 50
upper_limit = 250

words_to_keep = [ ]
for word, count in global_count.items():
    if count >= lower_limit and count <= upper_limit:
        words_to_keep.append(word)

len(words_to_keep)

184

In [4]:
# Now we count words separately for each inaugural address,
# keeping only the words-to-keep from above

stopwords = nltk.corpus.stopwords.words("english")

# for each inaugural address, store a separate word count
# as an nltk FreqDist object
inaugural_wordcounts = [ ]

for fileid in nltk.corpus.inaugural.fileids():
    words = nltk.corpus.inaugural.words(fileids = fileid)
    
    # remove stopwords etc
    cleaned_words = [ ]
    for word in words:
        if word.lower() in words_to_keep:
            cleaned_words.append(word.lower())
            
    # storing counts for the current speech
    wordcount = nltk.FreqDist(cleaned_words)
    inaugural_wordcounts.append(wordcount)
    

## A data frame of counts

In [5]:
import pandas as pd

inaugural_df = pd.DataFrame(inaugural_wordcounts).fillna(0)
inaugural_df.head()

Unnamed: 0,fellow,citizens,among,life,could,with,greater,that,order,present,...,seek,countrymen,democracy,promise,century,come,business,help,today,americans
0,3.0,5.0,1.0,1.0,3.0,17.0,1.0,18,2.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.0,5.0,4.0,2.0,1.0,16.0,0.0,22,4.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,7.0,7.0,1.0,1.0,0.0,20.0,1.0,24,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,8.0,10.0,7.0,2.0,2.0,28.0,0.0,37,3.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## tf-idf transformation

Our dataframe of counts from above has the tf values. The i-th row has the tf values for the i-th inaugural address.

Now, for each of the words-to-keep, we determine df: the number of inaugural addresses that contain the word. 

In [6]:
df = { }
for word in words_to_keep:
    # in how many inaugural addresses does word appear
    # at least once?
    doccount = 0
    for inaugural_count in inaugural_wordcounts:
        if inaugural_count[ word ] >= 1:
            doccount += 1
            
    df[ word ] = doccount

### tf-idf transform

Each column contains the tf values for one word. We multiply it with the idf value for that word to get that word's tf-idf for each document. 

For each word, we store a column with the tf-idf values in a dictionary called newcolumns. We then make a new dataframe out of those columns. 

In [7]:
import math

num_documents = len(nltk.corpus.inaugural.fileids())

newcolumns = { }

for word in inaugural_df.columns:
    idf = math.log(num_documents / df[ word])
    newcolumns[ word ] = inaugural_df[ word ] * idf
    
inaugural_tfidf_df = pd.DataFrame(newcolumns)
inaugural_tfidf_df.head()

Unnamed: 0,fellow,citizens,among,life,could,with,greater,that,order,present,...,seek,countrymen,democracy,promise,century,come,business,help,today,americans
0,0.496543,0.442767,0.316337,0.185717,1.653531,0.290605,0.710242,0.0,1.420483,2.610947,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.165514,0.088553,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.522189,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.496543,0.442767,1.265349,0.371434,0.551177,0.273511,0.0,0.0,2.840966,1.044379,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.158601,0.619874,0.316337,0.185717,0.0,0.341889,0.710242,0.0,0.710242,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.324116,0.885534,2.214361,0.371434,1.102354,0.478644,0.0,0.0,2.130725,1.566568,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# K-means clustering

we decide to try 10 clusters, we may need to revise this later. 

In [8]:
from sklearn.cluster import KMeans

kmeans_obj = KMeans(n_clusters=10, n_init = 10, random_state = 0)
kmeans_obj.fit(inaugural_tfidf_df)

Integrating the cluster and the metadata (speaker and year) in the data frame:

In [9]:
inaugural_tfidf_df["cluster"] = kmeans_obj.labels_
inaugural_tfidf_df["meta"] = nltk.corpus.inaugural.fileids()

## Inspecting the clustering

We first look at which speaker/years got clustered together by printing the metadata columns for each cluster. We see that we got several singleton clusters, that's not so great. But we do see one or two clusters that seem to group together speeches from similar times, which is nice (cluster 4, cluster 6). We may want to redo the clustering with more or fewer clusters. 

Here the code on grouping pandas dataframes by a particular column comes in handy. In our case, we group by cluster. 

In [10]:
for grouplabel, groupdf in inaugural_tfidf_df.groupby("cluster"):
    print("-------")
    print(grouplabel)
    print(groupdf.meta)

-------
0
14    1845-Polk.txt
Name: meta, dtype: object
-------
1
37     1937-Roosevelt.txt
38     1941-Roosevelt.txt
40        1949-Truman.txt
41    1953-Eisenhower.txt
42    1957-Eisenhower.txt
43       1961-Kennedy.txt
45         1969-Nixon.txt
47        1977-Carter.txt
49        1985-Reagan.txt
56         2013-Obama.txt
Name: meta, dtype: object
-------
2
13    1841-Harrison.txt
Name: meta, dtype: object
-------
3
30    1909-Taft.txt
Name: meta, dtype: object
-------
4
7       1817-Monroe.txt
9        1825-Adams.txt
12    1837-VanBuren.txt
16      1853-Pierce.txt
17    1857-Buchanan.txt
18     1861-Lincoln.txt
23    1881-Garfield.txt
34    1925-Coolidge.txt
Name: meta, dtype: object
-------
5
0     1789-Washington.txt
1     1793-Washington.txt
2          1797-Adams.txt
3      1801-Jefferson.txt
4      1805-Jefferson.txt
5        1809-Madison.txt
6        1813-Madison.txt
10       1829-Jackson.txt
11       1833-Jackson.txt
15        1849-Taylor.txt
19       1865-Lincoln.txt
20      

## More inspection: top words for each cluster

We look at the words with the highest average tf-idf weights for each cluster, that is, the most important words for each cluster. We see a good bit of repetition (revenue, congress, constitution), indicating that maybe we have too many clusters. 

In [11]:
# iterating over each cluster as a group.
# grouplabel will be the cluster number,
# groupdf is the sub-dataframe
for grouplabel, groupdf in inaugural_tfidf_df.groupby("cluster"):
    print("------")
    print("group", grouplabel)
    # restricting to the columns that we actually clustered on:
    # this excludes the "meta" and "cluster" columns
    relevant_cols = groupdf.loc[:, "fellow":"americans"]
    # for each relevant column:
    # - compute the mean tf-idf value in this sub-dataframe,
    # - sort words by mean tf-idf, descending (highest first),
    # - then show the 10 first items (10 highest tf-idfs)
    print(relevant_cols.mean().sort_values(ascending = False)[:10])

------
group 0
union           19.577649
revenue         16.964333
protection       9.833291
powers           9.296478
federal          8.654441
interests        7.310651
institutions     7.035305
foreign          6.729817
constitution     6.599269
within           6.422241
dtype: float64
------
group 1
democracy    3.561497
freedom      3.458130
help         2.892442
america      2.508386
together     2.445491
know         2.402071
earth        2.161466
americans    2.071639
today        2.059701
human        1.723225
dtype: float64
------
group 2
executive       19.542514
constitution    15.838246
however         13.288910
character       12.074107
power            9.697812
principle        9.445278
citizen          9.445278
state            9.370008
powers           9.296478
union            7.953420
dtype: float64
------
group 3
business     26.117645
proper       12.291614
trade        10.851445
federal      10.818052
congress      9.943383
ought         9.297135
policy        8.1