import pandas as pd
import sys
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Show every column when a DataFrame is displayed (no truncation).
pd.set_option('display.max_columns', None)
# Render floats with thousands separators. A '%.4f' formatter used to be set
# just before this one, but it was overwritten immediately and never took
# effect, so only the formatter that actually applies is kept.
pd.set_option('display.float_format', "{:,}".format)
# Read the combined district CSV; missing values in every column become 0.0.
district_2 = pd.read_csv('../data/dis2_combined.csv').fillna(0.0)
# Keep only the rows whose STATUS is exactly 'ACTIVE'.
district_2 = district_2.loc[district_2['STATUS'].eq('ACTIVE')]
# Number of rows for each ENROLLMENT value; the count column is labelled 0.
ENROLLMENT = district_2.groupby('ENROLLMENT').size().reset_index(name=0)
# Bar chart of those counts on a wide figure.
fig, ax = plt.subplots(figsize=(18, 8.27))
sns.barplot(x="ENROLLMENT", y=0, data=ENROLLMENT, ax=ax)
# Drop rows enrolled as 'REP' or 'IND'; every other ENROLLMENT value is kept.
district_2 = district_2.loc[~district_2['ENROLLMENT'].isin(['REP', 'IND'])]
# Mean of the turnout_predict column for each distinct AGE value.
turnout_predict = district_2.groupby('AGE')['turnout_predict'].mean().reset_index()
# Rescale the means so the largest one equals 0.035.
# NOTE(review): 0.035 looks chosen so the points fit the density scale of the
# age plot below -- confirm.
factor = .035 / turnout_predict['turnout_predict'].max()
turnout_predict['turnout_predict'] = turnout_predict['turnout_predict'].mul(factor)

# Rows with AGE between 18 and 110, both bounds included.
ages = district_2.loc[district_2['AGE'].between(18, 110)]

# Draw the age distribution and the rescaled turnout means on the same axes.
fig, ax = plt.subplots(figsize=(30, 12))
# NOTE(review): sns.distplot is deprecated in newer seaborn releases.
sns.distplot(ages['AGE'], ax=ax)
# NOTE(review): stripplot treats x as categorical, so its points may not line
# up with the numeric AGE axis of the distribution above -- verify the overlay.
sns.stripplot(x="AGE", y="turnout_predict", data=turnout_predict, ax=ax)
# Number of rows in each cluster; the count column is labelled 0.
CLUSTERS = district_2.groupby('cluster').size().reset_index(name=0)
# Bar chart of cluster sizes.
fig, ax = plt.subplots(figsize=(18, 8.27))
sns.barplot(x="cluster", y=0, data=CLUSTERS, ax=ax)
# Sum of turnout_predict over the rows of each cluster.
CLUSTERS = district_2.groupby('cluster')['turnout_predict'].sum().reset_index()
# Bar chart of the per-cluster sums.
fig, ax = plt.subplots(figsize=(18, 8.27))
sns.barplot(x="cluster", y='turnout_predict', data=CLUSTERS, ax=ax)
# Mean AGE of the rows in each cluster.
CLUSTERS = district_2.groupby('cluster')['AGE'].mean().reset_index()
# Bar chart of the per-cluster mean age.
fig, ax = plt.subplots(figsize=(18, 8.27))
sns.barplot(x="cluster", y='AGE', data=CLUSTERS, ax=ax)
# Share of each GENDER value within its cluster: the size of every
# (cluster, GENDER) group divided by the total size of that cluster.
# This was previously computed twice in a row with identical code; a single
# pass gives the same result.
genders = (
    district_2.groupby(['cluster', 'GENDER'])
    .size()
    .groupby(level=0)
    .transform(lambda x: x / x.sum())
)
# Columns after the reset: cluster, GENDER and 0 (the share).
genders = genders.reset_index(name=0)

# Add explicit zero-share 'F' rows for clusters 6.0 and 9.0.
# NOTE(review): presumably these clusters contain no 'F' rows and would
# otherwise be missing from the chart -- confirm against the data.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
zero_rows = pd.DataFrame(
    [[6.0, 'F', 0.0], [9.0, 'F', 0.0]],
    columns=['cluster', 'GENDER', 0],
)
genders = pd.concat([genders, zero_rows])
genders = genders.sort_values(['cluster'])

# Keep only the 'F' rows and reshape to one row per cluster with an 'F' column.
genders = genders[genders['GENDER'] == 'F']
# NOTE(review): pivot_table aggregates duplicates with its default mean, so if
# cluster 6.0 or 9.0 ever has real 'F' rows the zero row above halves its share.
genders = genders.pivot_table(0, ['cluster'], 'GENDER').fillna(0.0).reset_index()

# Bar chart of the 'F' share per cluster.
fig, ax = plt.subplots()
fig.set_size_inches(18, 8.27)
sns.barplot(x="cluster", y='F', data=genders)
# Columns averaged per cluster for the two ethnicity heatmaps.
cols = [
    'AsianGreaterEastAsianEastAsian',
    'AsianGreaterEastAsianJapanese',
    'AsianIndianSubContinent',
    'GreaterAfricanAfricans',
    'GreaterAfricanMuslim',
    'GreaterEuropeanBritish',
    'GreaterEuropeanEastEuropean',
    'GreaterEuropeanJewish',
    'GreaterEuropeanWestEuropeanFrench',
    'GreaterEuropeanWestEuropeanGermanic',
    'GreaterEuropeanWestEuropeanHispanic',
    'GreaterEuropeanWestEuropeanItalian',
    'GreaterEuropeanWestEuropeanNordic',
]

# Mean of every listed column within each cluster (one row per cluster).
ethnicity = district_2.groupby('cluster')[cols].mean().reset_index()

# Column scaling: divide each column by its own maximum across clusters, so
# the cluster with the highest mean for a column gets the value 1.
# This replaces a per-column loop whose body had lost its indentation.
column_max = ethnicity[cols].max()
ethnicity[cols] = ethnicity[cols].div(column_max, axis='columns')

# Reshape to long form, then to a Label x cluster table for the heatmap.
ethnicity = pd.melt(ethnicity, id_vars=["cluster"], var_name='Label', value_name="Value")
ethnicity_column_scaled = ethnicity.pivot(index='Label', columns='cluster', values='Value')
# Mean of every column listed in `cols` within each cluster, recomputed from
# district_2 so the scaling below starts from unscaled means.
ethnicity = district_2.groupby('cluster')[cols].mean().reset_index()

# Largest column mean within each cluster (one value per row).
row_max = ethnicity[cols].max(axis=1)
# Mapping cluster -> that maximum. Despite the name, it holds a maximum, not a
# sum; the name is kept because other cells may read it.
cluster_sum = dict(zip(ethnicity['cluster'], row_max))

# Cluster scaling: divide each row by its own maximum, so the strongest
# column within a cluster gets the value 1. This is a vectorised replacement
# for the row-by-row apply, whose loop bodies had also lost their indentation.
ethnicity[cols] = ethnicity[cols].div(row_max, axis='index')

# Reshape to long form, then to a Label x cluster table for the heatmap.
ethnicity = pd.melt(ethnicity, id_vars=["cluster"], var_name='Label', value_name="Value")
ethnicity_cluster_scaled = ethnicity.pivot(index='Label', columns='cluster', values='Value')
# Heatmap of the column-scaled table (rows: Label, columns: cluster).
fig, ax = plt.subplots(figsize=(22, 15))
sns.heatmap(ethnicity_column_scaled, linewidths=.5, cmap="YlGnBu", ax=ax)
# Heatmap of the cluster-scaled table (rows: Label, columns: cluster).
fig, ax = plt.subplots(figsize=(22, 15))
sns.heatmap(ethnicity_cluster_scaled, linewidths=.5, cmap="YlGnBu", ax=ax)
# Election columns to average per cluster. This rebinds `cols`, which
# previously held the ethnicity column names.
cols = [
    '2016-General',
    '2016-Primary',
    '2013-General',
    '2013-Primary',
    '2012-General',
    '2012-Primary',
]
# Mean of each election column within each cluster.
history = district_2.groupby('cluster')[cols].mean().reset_index()
# Long form (cluster, Label, Value), then a Label x cluster table.
history = history.melt(id_vars=["cluster"], var_name='Label', value_name="Value")
history = history.pivot(index='Label', columns='cluster', values='Value')
# Heatmap of the per-cluster election means (rows: Label, columns: cluster).
fig, ax = plt.subplots(figsize=(22, 15))
sns.heatmap(history, linewidths=.5, cmap="YlGnBu", ax=ax)