In [1]:
import pandas as pd
import sys
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.4f' % x)
pd.set_option('display.float_format', lambda x: "{:,}".format(x))
In [3]:
district_2=pd.read_csv('../data/dis2_combined.csv').fillna(0.0)
/Users/xgl470/.pyenv/versions/3.5.1/envs/prelect/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2698: DtypeWarning: Columns (5,16,20) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

Keep only active voters

In [4]:
district_2=district_2[(district_2['STATUS']=='ACTIVE')]

Count by Registered Party

In [5]:
ENROLLMENT=pd.DataFrame(district_2.groupby('ENROLLMENT').size()).reset_index()
fig, ax = plt.subplots()
fig.set_size_inches(18, 8.27)
sns.barplot(x="ENROLLMENT", y=0, data=ENROLLMENT)
Out[5]:
<matplotlib.axes._subplots.AxesSubplot at 0x10f2b57f0>

Remove REP/IND from population

In [6]:
district_2=district_2[(district_2['ENROLLMENT']!='REP')]
district_2=district_2[(district_2['ENROLLMENT']!='IND')]

Age Distribution w/ voting likelihood

In [7]:
turnout_predict=pd.DataFrame(district_2.groupby('AGE')['turnout_predict'].mean()).reset_index()
factor=.035/max(turnout_predict.turnout_predict)
turnout_predict['turnout_predict']=turnout_predict['turnout_predict']*factor
In [8]:
ages=district_2[(district_2['AGE']>=18) &(district_2['AGE']<=110)]

fig, ax = plt.subplots()
fig.set_size_inches(30, 12)
sns.distplot(ages['AGE'])
sns.stripplot(x="AGE", y="turnout_predict", data=turnout_predict)
Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0x10f14c0b8>

We created 10 'clusters' for NYC voters using k means clustering. The idea is each voter is statistically placed into 1 of 10 clusters based upon similarity measures. Here are some key measures of how each cluster varies:

Voter count by cluster:

In [9]:
CLUSTERS=pd.DataFrame(district_2.groupby('cluster').size()).reset_index()
fig, ax = plt.subplots()
fig.set_size_inches(18, 8.27)
sns.barplot(x="cluster", y=0, data=CLUSTERS)
Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x10c08e320>

Expected Turnout

Cluster 7 will likely have the highest turnout in next month's primary for district 2

In [10]:
CLUSTERS=pd.DataFrame(district_2.groupby('cluster')['turnout_predict'].sum()).reset_index()
fig, ax = plt.subplots()
fig.set_size_inches(18, 8.27)
sns.barplot(x="cluster", y='turnout_predict', data=CLUSTERS)
Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x10c4fc8d0>

AGE

In [11]:
CLUSTERS=pd.DataFrame(district_2.groupby('cluster')['AGE'].mean()).reset_index()
fig, ax = plt.subplots()
fig.set_size_inches(18, 8.27)
sns.barplot(x="cluster", y='AGE', data=CLUSTERS)
Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x10cf50ac8>

Gender Breakdown

While cluster 7 will contain the most voters in next month's primary, clusters 0/1 are heavily comprised of female likely voters. Cluster 6 is the opposite (heavily male populated)

In [12]:
genders=pd.DataFrame(district_2.groupby(['cluster','GENDER']).size().groupby(level = 0).transform(lambda x: x/x.sum()))
genders=genders.reset_index()
In [13]:
genders=pd.DataFrame(district_2.groupby(['cluster','GENDER']).size().groupby(level = 0).transform(lambda x: x/x.sum()))
genders=genders.reset_index()
genders=genders.append(pd.DataFrame([[0.0,'F',6.0]],columns=[0,'GENDER','cluster']))
genders=genders.append(pd.DataFrame([[0.0,'F',9.0]],columns=[0,'GENDER','cluster']))
genders=genders.sort_values(['cluster'])
genders=genders[(genders['GENDER']=='F')]
genders=genders.pivot_table(0, ['cluster'], 'GENDER').fillna(0.0).reset_index()
fig, ax = plt.subplots()
fig.set_size_inches(18, 8.27)
sns.barplot(x="cluster", y='F', data=genders)
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x104b28320>

Ethnicity breakdown

In [14]:
cols=['AsianGreaterEastAsianEastAsian',
 'AsianGreaterEastAsianJapanese',
 'AsianIndianSubContinent',
 'GreaterAfricanAfricans',
 'GreaterAfricanMuslim',
 'GreaterEuropeanBritish',
 'GreaterEuropeanEastEuropean',
 'GreaterEuropeanJewish',
 'GreaterEuropeanWestEuropeanFrench',
 'GreaterEuropeanWestEuropeanGermanic',
 'GreaterEuropeanWestEuropeanHispanic',
 'GreaterEuropeanWestEuropeanItalian',
 'GreaterEuropeanWestEuropeanNordic',]
ethnicity=pd.DataFrame(district_2.groupby('cluster')[cols].mean()).reset_index()
for column in ethnicity.columns[1:]:
    max_=max(ethnicity[column])
    ethnicity[column]=ethnicity[column]/max_
ethnicity=pd.melt(ethnicity, id_vars=["cluster"], var_name='Label', value_name="Value")
ethnicity_column_scaled=ethnicity.pivot(index='Label', columns='cluster', values='Value')
cluster_sum={}
ethnicity=pd.DataFrame(district_2.groupby('cluster')[cols].mean()).reset_index()
for cluster in list(ethnicity.cluster):
    cluster_sum[cluster]=ethnicity[ethnicity.cluster==cluster][cols].values.max()
for column in ethnicity.columns[1:]:
    ethnicity[column] = ethnicity.apply(lambda row: row[column]/cluster_sum[row['cluster']],axis=1)
ethnicity=pd.melt(ethnicity, id_vars=["cluster"], var_name='Label', value_name="Value")
ethnicity_cluster_scaled=ethnicity.pivot(index='Label', columns='cluster', values='Value')

Scaled by columns on the left (highest value for each ethnicity=1.0)

In [15]:
fig, ax = plt.subplots()
fig.set_size_inches(22, 15)
sns.heatmap(ethnicity_column_scaled, linewidths=.5, cmap="YlGnBu")
Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x10e2769e8>

Scaled by clusters on the bottom (highest value for each cluster=1.0)

In [16]:
fig, ax = plt.subplots()
fig.set_size_inches(22, 15)
sns.heatmap(ethnicity_cluster_scaled, linewidths=.5, cmap="YlGnBu")
Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x10e9da358>

Voting History

In [17]:
cols=['2016-General','2016-Primary','2013-General','2013-Primary','2012-General','2012-Primary']
history=pd.DataFrame(district_2.groupby('cluster')[cols].mean()).reset_index()
history=pd.melt(history, id_vars=["cluster"], var_name='Label', value_name="Value")
history=history.pivot(index='Label', columns='cluster', values='Value')
In [18]:
fig, ax = plt.subplots()
fig.set_size_inches(22, 15)
sns.heatmap(history, linewidths=.5, cmap="YlGnBu")
Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x10e3ddf28>