import pandas as pd
import sys
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Show every column when a DataFrame is displayed (no truncation).
pd.set_option('display.max_columns', None)
# Render floats with thousands separators. pandas keeps a single
# 'display.float_format' callable, so only one formatter is registered here;
# setting the option twice would just let the last call win.
pd.set_option('display.float_format', "{:,}".format)

# Load the combined district 2 voter file and replace every missing value
# with 0.0. low_memory=False makes pandas infer each column's dtype from the
# whole file rather than chunk by chunk, which is the remedy the
# "mixed types" DtypeWarning itself suggests.
district_2 = pd.read_csv('../data/dis2_combined.csv', low_memory=False).fillna(0.0)

/Users/xgl470/.pyenv/versions/3.5.1/envs/prelect/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2698: DtypeWarning: Columns (5,16,20) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

Keep only active voters

# Keep only the rows whose STATUS is exactly 'ACTIVE'.
district_2 = district_2.loc[district_2['STATUS'].eq('ACTIVE')]

Count by Registered Party

# Number of rows per ENROLLMENT value; the unnamed count column is labelled 0.
ENROLLMENT = district_2.groupby('ENROLLMENT').size().reset_index()
# One wide figure, with the bars drawn on its axes explicitly.
fig, ax = plt.subplots(figsize=(18, 8.27))
sns.barplot(x="ENROLLMENT", y=0, data=ENROLLMENT, ax=ax)

<matplotlib.axes._subplots.AxesSubplot at 0x10f2b57f0>

Remove voters enrolled as REP or IND from the population

# Drop every row enrolled as REP or IND in a single pass.
excluded_enrollments = ['REP', 'IND']
district_2 = district_2[~district_2['ENROLLMENT'].isin(excluded_enrollments)]

Age distribution with voting likelihood

# Mean predicted turnout for each AGE value, as a two-column frame
# (AGE, turnout_predict).
turnout_predict = district_2.groupby('AGE')['turnout_predict'].mean().reset_index()
# Rescale so that the largest per-age mean becomes 0.035.
# NOTE(review): 0.035 is presumably chosen to match the height of the age
# density curve this is plotted over — confirm.
factor = .035 / max(turnout_predict['turnout_predict'])
turnout_predict['turnout_predict'] *= factor

# Rows whose AGE lies in the closed interval [18, 110].
ages = district_2[district_2['AGE'].between(18, 110)]

# Age distribution with the rescaled per-age turnout means drawn on the same axes.
fig, ax = plt.subplots(figsize=(30, 12))
sns.distplot(ages['AGE'], ax=ax)
# NOTE(review): stripplot treats x as categorical, so its AGE positions may
# not line up with the numeric age axis used by distplot — confirm visually.
sns.stripplot(x="AGE", y="turnout_predict", data=turnout_predict, ax=ax)

<matplotlib.axes._subplots.AxesSubplot at 0x10f14c0b8>

We created 10 clusters of NYC voters using k-means clustering: each voter is statistically assigned to exactly one of the 10 clusters based on similarity measures. Here are some key measures of how the clusters differ:

Voter count by cluster:

# Number of rows in each cluster; the unnamed count column is labelled 0.
CLUSTERS = district_2.groupby('cluster').size().reset_index()
fig, ax = plt.subplots(figsize=(18, 8.27))
sns.barplot(x="cluster", y=0, data=CLUSTERS, ax=ax)

<matplotlib.axes._subplots.AxesSubplot at 0x10c08e320>

Expected Turnout

Cluster 7 will likely have the highest turnout in next month's primary for District 2

# Sum of turnout_predict over the rows of each cluster.
CLUSTERS = district_2.groupby('cluster')['turnout_predict'].sum().reset_index()
fig, ax = plt.subplots(figsize=(18, 8.27))
sns.barplot(x="cluster", y='turnout_predict', data=CLUSTERS, ax=ax)

<matplotlib.axes._subplots.AxesSubplot at 0x10c4fc8d0>

Mean age by cluster

# Mean AGE of the rows in each cluster.
CLUSTERS = district_2.groupby('cluster')['AGE'].mean().reset_index()
fig, ax = plt.subplots(figsize=(18, 8.27))
sns.barplot(x="cluster", y='AGE', data=CLUSTERS, ax=ax)

<matplotlib.axes._subplots.AxesSubplot at 0x10cf50ac8>

Gender Breakdown

While cluster 7 will contain the most voters in next month's primary, clusters 0 and 1 are made up largely of likely female voters. Cluster 6 is the opposite (heavily male).

# Row count for every (cluster, GENDER) pair.
pair_counts = district_2.groupby(['cluster', 'GENDER']).size()
# Divide each count by its cluster's total to get the within-cluster share;
# the unnamed share column is labelled 0 after reset_index().
genders = (pair_counts / pair_counts.groupby(level=0).transform('sum')).reset_index()

# Female share of each cluster, plotted as one bar per cluster.
#
# Counts are spread into a cluster x GENDER table with missing pairs filled
# with 0, so any cluster without 'F' rows gets a 0.0 bar automatically.
# This avoids hand-appending zero rows for specific cluster ids (which would
# be averaged with a real value if such a cluster did contain 'F' rows) and
# avoids DataFrame.append, which newer pandas releases no longer provide.
gender_counts = district_2.groupby(['cluster', 'GENDER']).size().unstack(fill_value=0)
# Each row divided by its own total gives the within-cluster share per gender.
gender_share = gender_counts.div(gender_counts.sum(axis=1), axis=0)
if 'F' not in gender_share.columns:
    # No 'F' rows anywhere: keep the column so the plot call below still works.
    gender_share['F'] = 0.0
# Two columns (cluster, F), ordered by cluster.
genders = gender_share[['F']].reset_index()
fig, ax = plt.subplots()
fig.set_size_inches(18, 8.27)
sns.barplot(x="cluster", y='F', data=genders, ax=ax)

<matplotlib.axes._subplots.AxesSubplot at 0x104b28320>

Ethnicity breakdown

# Ethnicity columns averaged per cluster below.
cols=['AsianGreaterEastAsianEastAsian',
 'AsianGreaterEastAsianJapanese',
 'AsianIndianSubContinent',
 'GreaterAfricanAfricans',
 'GreaterAfricanMuslim',
 'GreaterEuropeanBritish',
 'GreaterEuropeanEastEuropean',
 'GreaterEuropeanJewish',
 'GreaterEuropeanWestEuropeanFrench',
 'GreaterEuropeanWestEuropeanGermanic',
 'GreaterEuropeanWestEuropeanHispanic',
 'GreaterEuropeanWestEuropeanItalian',
 'GreaterEuropeanWestEuropeanNordic',]

# Mean of every ethnicity column within each cluster (rows: cluster,
# columns: ethnicity). Computed once and reused for both scalings.
cluster_means = district_2.groupby('cluster')[cols].mean()

# Column scaling: divide each ethnicity column by its largest cluster mean,
# so the highest value for every ethnicity becomes 1.0.
ethnicity = cluster_means.div(cluster_means.max(axis=0), axis=1).reset_index()
ethnicity = pd.melt(ethnicity, id_vars=["cluster"], var_name='Label', value_name="Value")
ethnicity_column_scaled = ethnicity.pivot(index='Label', columns='cluster', values='Value')

# Cluster scaling: divide each cluster's row by the largest ethnicity mean in
# that cluster, so the highest value for every cluster becomes 1.0.
# Despite its name, cluster_sum maps each cluster to that per-cluster MAXIMUM;
# the name is kept so any later cell reading it keeps working.
cluster_max = cluster_means.max(axis=1)
cluster_sum = cluster_max.to_dict()
ethnicity = cluster_means.div(cluster_max, axis=0).reset_index()
ethnicity = pd.melt(ethnicity, id_vars=["cluster"], var_name='Label', value_name="Value")
ethnicity_cluster_scaled = ethnicity.pivot(index='Label', columns='cluster', values='Value')

Scaled per ethnicity (the labels on the left axis): the highest value for each ethnicity = 1.0

# Heatmap of the ethnicity-scaled table (rows: ethnicity label, columns: cluster).
fig, ax = plt.subplots(figsize=(22, 15))
sns.heatmap(ethnicity_column_scaled, linewidths=.5, cmap="YlGnBu", ax=ax)

<matplotlib.axes._subplots.AxesSubplot at 0x10e2769e8>

Scaled per cluster (the labels on the bottom axis): the highest value for each cluster = 1.0

# Heatmap of the cluster-scaled table (rows: ethnicity label, columns: cluster).
fig, ax = plt.subplots(figsize=(22, 15))
sns.heatmap(ethnicity_cluster_scaled, linewidths=.5, cmap="YlGnBu", ax=ax)

<matplotlib.axes._subplots.AxesSubplot at 0x10e9da358>

Voting History

# Election columns averaged per cluster.
cols = [
    '2016-General', '2016-Primary',
    '2013-General', '2013-Primary',
    '2012-General', '2012-Primary',
]
# Mean of every election column within each cluster.
history_means = district_2.groupby('cluster')[cols].mean().reset_index()
# Long form: one row per (cluster, election label) pair.
history_long = pd.melt(
    history_means,
    id_vars=["cluster"],
    var_name='Label',
    value_name="Value",
)
# Back to wide form with election labels as rows and clusters as columns.
history = history_long.pivot(index='Label', columns='cluster', values='Value')

# Heatmap of the per-cluster election means (rows: election label, columns: cluster).
fig, ax = plt.subplots(figsize=(22, 15))
sns.heatmap(history, linewidths=.5, cmap="YlGnBu", ax=ax)

<matplotlib.axes._subplots.AxesSubplot at 0x10e3ddf28>