In [52]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Choose Your data set

In [53]:
df = pd.read_csv("Causes_of_death_united_states.csv")
df.head()
Out[53]:
Year 113 Cause Name Cause Name State Deaths Age-adjusted Death Rate
0 2012 Nephritis, nephrotic syndrome and nephrosis (N... Kidney disease Vermont 21 2.6
1 2017 Nephritis, nephrotic syndrome and nephrosis (N... Kidney disease Vermont 29 3.3
2 2016 Nephritis, nephrotic syndrome and nephrosis (N... Kidney disease Vermont 30 3.7
3 2013 Nephritis, nephrotic syndrome and nephrosis (N... Kidney disease Vermont 30 3.8
4 2000 Intentional self-harm (suicide) (*U03,X60-X84,... Suicide District of Columbia 23 3.8

Pre-Processing

In [54]:
df.dropna()
Out[54]:
Year 113 Cause Name Cause Name State Deaths Age-adjusted Death Rate
0 2012 Nephritis, nephrotic syndrome and nephrosis (N... Kidney disease Vermont 21 2.6
1 2017 Nephritis, nephrotic syndrome and nephrosis (N... Kidney disease Vermont 29 3.3
2 2016 Nephritis, nephrotic syndrome and nephrosis (N... Kidney disease Vermont 30 3.7
3 2013 Nephritis, nephrotic syndrome and nephrosis (N... Kidney disease Vermont 30 3.8
4 2000 Intentional self-harm (suicide) (*U03,X60-X84,... Suicide District of Columbia 23 3.8
5 2014 Nephritis, nephrotic syndrome and nephrosis (N... Kidney disease Arizona 325 4.1
6 2009 Intentional self-harm (suicide) (*U03,X60-X84,... Suicide District of Columbia 29 4.4
7 2015 Nephritis, nephrotic syndrome and nephrosis (N... Kidney disease Vermont 39 4.5
8 2014 Nephritis, nephrotic syndrome and nephrosis (N... Kidney disease Vermont 37 4.5
9 2011 Nephritis, nephrotic syndrome and nephrosis (N... Kidney disease South Dakota 49 4.5
10 2015 Intentional self-harm (suicide) (*U03,X60-X84,... Suicide District of Columbia 34 4.9
11 2013 Nephritis, nephrotic syndrome and nephrosis (N... Kidney disease Arizona 374 4.9
12 2001 Nephritis, nephrotic syndrome and nephrosis (N... Kidney disease Washington 279 5.0
13 1999 Intentional self-harm (suicide) (*U03,X60-X84,... Suicide District of Columbia 30 5.1
14 2016 Intentional self-harm (suicide) (*U03,X60-X84,... Suicide District of Columbia 40 5.2
15 2006 Intentional self-harm (suicide) (*U03,X60-X84,... Suicide District of Columbia 30 5.2
16 2002 Intentional self-harm (suicide) (*U03,X60-X84,... Suicide District of Columbia 31 5.2
17 1999 Nephritis, nephrotic syndrome and nephrosis (N... Kidney disease Washington 278 5.2
18 2017 Nephritis, nephrotic syndrome and nephrosis (N... Kidney disease Washington 439 5.3
19 2003 Nephritis, nephrotic syndrome and nephrosis (N... Kidney disease Washington 304 5.3
20 2011 Nephritis, nephrotic syndrome and nephrosis (N... Kidney disease Vermont 43 5.4
21 2008 Nephritis, nephrotic syndrome and nephrosis (N... Kidney disease Vermont 39 5.4
22 2005 Intentional self-harm (suicide) (*U03,X60-X84,... Suicide District of Columbia 33 5.4
23 2002 Nephritis, nephrotic syndrome and nephrosis (N... Kidney disease Washington 306 5.4
24 2000 Nephritis, nephrotic syndrome and nephrosis (N... Kidney disease Washington 293 5.4
25 2015 Nephritis, nephrotic syndrome and nephrosis (N... Kidney disease Arizona 458 5.5
26 2004 Intentional self-harm (suicide) (*U03,X60-X84,... Suicide District of Columbia 33 5.5
27 2012 Nephritis, nephrotic syndrome and nephrosis (N... Kidney disease South Dakota 58 5.6
28 2011 Intentional self-harm (suicide) (*U03,X60-X84,... Suicide District of Columbia 37 5.6
29 2011 Nephritis, nephrotic syndrome and nephrosis (N... Kidney disease Arizona 395 5.6
... ... ... ... ... ... ...
10838 2002 All Causes All causes West Virginia 21016 999.0
10839 2006 All Causes All causes Mississippi 28564 999.1
10840 2001 All Causes All causes West Virginia 20967 1000.9
10841 2005 All Causes All causes Alabama 47090 1001.3
10842 2001 All Causes All causes Alabama 45316 1002.1
10843 2003 All Causes All causes West Virginia 21306 1003.1
10844 1999 All Causes All causes Kentucky 39321 1004.1
10845 2000 All Causes All causes Alabama 45062 1004.8
10846 2000 All Causes All causes Louisiana 41138 1006.3
10847 2002 All Causes All causes Louisiana 41984 1008.5
10848 1999 All Causes All causes Alabama 44806 1009.3
10849 2002 All Causes All causes Kentucky 40697 1009.7
10850 2000 All Causes All causes West Virginia 21114 1011.1
10851 1999 All Causes All causes West Virginia 21049 1012.3
10852 2001 All Causes All causes Louisiana 41757 1013.1
10853 2002 All Causes All causes Alabama 46069 1013.4
10854 2003 All Causes All causes Louisiana 42719 1013.7
10855 2003 All Causes All causes Alabama 46716 1020.2
10856 1999 All Causes All causes Louisiana 41238 1021.6
10857 2005 All Causes All causes Louisiana 44355 1023.7
10858 2005 All Causes All causes Mississippi 29196 1028.7
10859 2003 All Causes All causes Mississippi 28489 1031.6
10860 2002 All Causes All causes District of Columbia 5851 1034.1
10861 2001 All Causes All causes Mississippi 28259 1034.3
10862 1999 All Causes All causes Mississippi 28185 1043.4
10863 2001 All Causes All causes District of Columbia 5951 1049.9
10864 2002 All Causes All causes Mississippi 28853 1051.6
10865 2000 All Causes All causes Mississippi 28654 1051.9
10866 2000 All Causes All causes District of Columbia 6001 1061.2
10867 1999 All Causes All causes District of Columbia 6076 1087.3

10868 rows × 6 columns

In [55]:
df = df.drop('113 Cause Name', 1)
In [56]:
df.columns = ['Year', "Cause Name", "State", "Deaths", "Death Rate"]
In [57]:
df = df.sort_values('Deaths',ascending = False)

Statistical Analysis

In [9]:
df.describe().T
Out[9]:
count mean std min 25% 50% 75% max
Year 10868.0 2008.000000 5.477478 1999.0 2003.0 2008.0 2013.000 2017.0
Deaths 10868.0 15459.910195 112876.022311 21.0 612.0 1718.5 5756.500 2813503.0
Death Rate 10868.0 127.563894 223.639771 2.6 19.2 35.9 151.725 1087.3
In [37]:
sns.pairplot(df)
Out[37]:
<seaborn.axisgrid.PairGrid at 0x279cfe394a8>
In [58]:
sum_cause_of_death = df.groupby('Cause Name').sum()
sum_cause_of_death.sort_values('Deaths',ascending = False) # descending
Out[58]:
Year Deaths Death Rate
Cause Name
All causes 1983904 95457138 789050.3
Heart disease 1983904 24445280 195626.9
Cancer 1983904 21687288 176443.1
Stroke 1983904 5453046 45337.9
CLRD 1983904 5189854 44061.7
Unintentional injuries 1983904 4695640 42890.9
Alzheimer's disease 1983904 2989632 24710.1
Diabetes 1983904 2799886 23112.8
Influenza and pneumonia 1983904 2189282 17986.3
Kidney disease 1983904 1717226 13900.1
Suicide 1983904 1394032 13244.3
In [59]:
sns.boxplot(x=df["Deaths"], y=df["Cause Name"])
Out[59]:
<matplotlib.axes._subplots.AxesSubplot at 0x279d0cd2d30>

Exploratory Data Analysis

In [60]:
sns.barplot(x=df["Deaths"], y=df["Cause Name"])
Out[60]:
<matplotlib.axes._subplots.AxesSubplot at 0x279d0df29e8>
In [20]:
!pip install chart_studio
import chart_studio.plotly as py
import plotly.graph_objs as go #go.figure
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
Requirement already satisfied: chart_studio in c:\users\sanca\anaconda3\lib\site-packages (1.1.0)
Requirement already satisfied: plotly in c:\users\sanca\anaconda3\lib\site-packages (from chart_studio) (4.2.1)
Requirement already satisfied: six in c:\users\sanca\anaconda3\lib\site-packages (from chart_studio) (1.12.0)
Requirement already satisfied: requests in c:\users\sanca\anaconda3\lib\site-packages (from chart_studio) (2.21.0)
Requirement already satisfied: retrying>=1.3.3 in c:\users\sanca\anaconda3\lib\site-packages (from chart_studio) (1.3.3)
Requirement already satisfied: idna<2.9,>=2.5 in c:\users\sanca\anaconda3\lib\site-packages (from requests->chart_studio) (2.8)
Requirement already satisfied: chardet<3.1.0,>=3.0.2 in c:\users\sanca\anaconda3\lib\site-packages (from requests->chart_studio) (3.0.4)
Requirement already satisfied: urllib3<1.25,>=1.21.1 in c:\users\sanca\anaconda3\lib\site-packages (from requests->chart_studio) (1.24.1)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\sanca\anaconda3\lib\site-packages (from requests->chart_studio) (2020.6.20)
In [61]:
sum_state = df.groupby('State').sum() # Vlaues for Choropleth
sum_state = sum_state.drop('Year', 1)
sum_state 
Out[61]:
Deaths Death Rate
State
Alabama 1603535 32037.5
Alaska 117890 25670.3
Arizona 1572443 24457.2
Arkansas 987455 31034.6
California 8169513 23509.8
Colorado 1039435 23718.5
Connecticut 981027 23094.7
Delaware 255015 26677.0
District of Columbia 168907 28687.6
Florida 5868231 24239.7
Georgia 2326265 28840.4
Hawaii 320652 20722.9
Idaho 375595 25334.1
Illinois 3489953 26299.5
Indiana 1934641 28773.2
Iowa 954486 25169.4
Kansas 831840 26425.7
Kentucky 1423885 31846.3
Louisiana 1409567 31523.5
Maine 433412 26379.5
Maryland 1469677 25758.2
Massachusetts 1812189 23818.6
Michigan 3011369 27668.3
Minnesota 1288025 22583.7
Mississippi 992097 33326.6
Missouri 1905549 28874.5
Montana 294784 26115.4
Nebraska 513032 24870.5
Nevada 652604 27969.3
New Hampshire 353884 24851.6
New Jersey 2397044 24580.3
New Mexico 521279 25720.8
New York 5161056 23979.6
North Carolina 2608123 27972.3
North Dakota 202091 24334.9
Ohio 3716207 28611.4
Oklahoma 1243157 31726.3
Oregon 1064013 25161.8
Pennsylvania 4304492 27035.1
Rhode Island 327640 24988.0
South Carolina 1364864 29333.6
South Dakota 244178 25210.4
Tennessee 2012251 31067.9
Texas 5535028 26766.7
United States 84009152 26234.1
Utah 472567 23776.3
Vermont 179348 24864.5
Virginia 1989346 26090.6
Washington 1635078 24654.6
West Virginia 721988 32208.2
Wisconsin 1607291 25141.9
Wyoming 145154 26627.0
In [63]:
sum_state['Deaths'].unique()
Out[63]:
array([ 1603535,   117890,  1572443,   987455,  8169513,  1039435,
         981027,   255015,   168907,  5868231,  2326265,   320652,
         375595,  3489953,  1934641,   954486,   831840,  1423885,
        1409567,   433412,  1469677,  1812189,  3011369,  1288025,
         992097,  1905549,   294784,   513032,   652604,   353884,
        2397044,   521279,  5161056,  2608123,   202091,  3716207,
        1243157,  1064013,  4304492,   327640,  1364864,   244178,
        2012251,  5535028, 84009152,   472567,   179348,  1989346,
        1635078,   721988,  1607291,   145154], dtype=int64)
In [64]:
vals = sum_state['Deaths'].unique()
z = []
for i in vals:
    i = float(i)
    z.append(i)
print(z)
[1603535.0, 117890.0, 1572443.0, 987455.0, 8169513.0, 1039435.0, 981027.0, 255015.0, 168907.0, 5868231.0, 2326265.0, 320652.0, 375595.0, 3489953.0, 1934641.0, 954486.0, 831840.0, 1423885.0, 1409567.0, 433412.0, 1469677.0, 1812189.0, 3011369.0, 1288025.0, 992097.0, 1905549.0, 294784.0, 513032.0, 652604.0, 353884.0, 2397044.0, 521279.0, 5161056.0, 2608123.0, 202091.0, 3716207.0, 1243157.0, 1064013.0, 4304492.0, 327640.0, 1364864.0, 244178.0, 2012251.0, 5535028.0, 84009152.0, 472567.0, 179348.0, 1989346.0, 1635078.0, 721988.0, 1607291.0, 145154.0]
In [65]:
data = dict(
    type = 'choropleth', #key type
    locations = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA", 
          "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", 
          "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", 
          "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", 
          "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"],
    locationmode = 'USA-states', #lets plotly know its USA
    colorscale = 'Reds',
    text = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA", 
          "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", 
          "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", 
          "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", 
          "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"], #a list of what hovers over each of the locations
    z = z, # equal to the values that are going to be shown to you in an actual color scale
    colorbar = {'title':'Colorbar Title Here'}
)
#text must be in the same index location as locations
In [66]:
data
Out[66]:
{'type': 'choropleth',
 'locations': ['AL',
  'AK',
  'AZ',
  'AR',
  'CA',
  'CO',
  'CT',
  'DE',
  'FL',
  'GA',
  'HI',
  'ID',
  'IL',
  'IN',
  'IA',
  'KS',
  'KY',
  'LA',
  'ME',
  'MD',
  'MA',
  'MI',
  'MN',
  'MS',
  'MO',
  'MT',
  'NE',
  'NV',
  'NH',
  'NJ',
  'NM',
  'NY',
  'NC',
  'ND',
  'OH',
  'OK',
  'OR',
  'PA',
  'RI',
  'SC',
  'SD',
  'TN',
  'TX',
  'UT',
  'VT',
  'VA',
  'WA',
  'WV',
  'WI',
  'WY'],
 'locationmode': 'USA-states',
 'colorscale': 'Reds',
 'text': ['AL',
  'AK',
  'AZ',
  'AR',
  'CA',
  'CO',
  'CT',
  'DE',
  'FL',
  'GA',
  'HI',
  'ID',
  'IL',
  'IN',
  'IA',
  'KS',
  'KY',
  'LA',
  'ME',
  'MD',
  'MA',
  'MI',
  'MN',
  'MS',
  'MO',
  'MT',
  'NE',
  'NV',
  'NH',
  'NJ',
  'NM',
  'NY',
  'NC',
  'ND',
  'OH',
  'OK',
  'OR',
  'PA',
  'RI',
  'SC',
  'SD',
  'TN',
  'TX',
  'UT',
  'VT',
  'VA',
  'WA',
  'WV',
  'WI',
  'WY'],
 'z': [1603535.0,
  117890.0,
  1572443.0,
  987455.0,
  8169513.0,
  1039435.0,
  981027.0,
  255015.0,
  168907.0,
  5868231.0,
  2326265.0,
  320652.0,
  375595.0,
  3489953.0,
  1934641.0,
  954486.0,
  831840.0,
  1423885.0,
  1409567.0,
  433412.0,
  1469677.0,
  1812189.0,
  3011369.0,
  1288025.0,
  992097.0,
  1905549.0,
  294784.0,
  513032.0,
  652604.0,
  353884.0,
  2397044.0,
  521279.0,
  5161056.0,
  2608123.0,
  202091.0,
  3716207.0,
  1243157.0,
  1064013.0,
  4304492.0,
  327640.0,
  1364864.0,
  244178.0,
  2012251.0,
  5535028.0,
  84009152.0,
  472567.0,
  179348.0,
  1989346.0,
  1635078.0,
  721988.0,
  1607291.0,
  145154.0],
 'colorbar': {'title': 'Colorbar Title Here'}}
In [67]:
layout = dict(geo = {'scope':'usa'})
choromap = go.Figure(data = [data],layout=layout)
iplot(choromap)

Correlations

In [68]:
corr = sum_state.corr()
corr
Out[68]:
Deaths Death Rate
Deaths 1.000000 -0.029054
Death Rate -0.029054 1.000000
In [69]:
sns.heatmap(corr, annot=True)
Out[69]:
<matplotlib.axes._subplots.AxesSubplot at 0x279d0b18cf8>
In [70]:
sns.regplot(x=df["Deaths"], y=df['Death Rate'])
Out[70]:
<matplotlib.axes._subplots.AxesSubplot at 0x279d1ef6438>