In [135]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Choose Your data set

In [136]:
df = pd.read_csv("who_suicide_statistics.csv")
df.head()
Out[136]:
country year sex age suicides_no population
0 Albania 1985 female 15-24 years NaN 277900.0
1 Albania 1985 female 25-34 years NaN 246800.0
2 Albania 1985 female 35-54 years NaN 267500.0
3 Albania 1985 female 5-14 years NaN 298300.0
4 Albania 1985 female 55-74 years NaN 138700.0
In [ ]:
 

Pre-Processing

In [137]:
df.dropna()
Out[137]:
country year sex age suicides_no population
24 Albania 1987 female 15-24 years 14.0 289700.0
25 Albania 1987 female 25-34 years 4.0 257200.0
26 Albania 1987 female 35-54 years 6.0 278800.0
27 Albania 1987 female 5-14 years 0.0 311000.0
28 Albania 1987 female 55-74 years 0.0 144600.0
29 Albania 1987 female 75+ years 1.0 35600.0
30 Albania 1987 male 15-24 years 21.0 312900.0
31 Albania 1987 male 25-34 years 9.0 274300.0
32 Albania 1987 male 35-54 years 16.0 308000.0
33 Albania 1987 male 5-14 years 0.0 338200.0
34 Albania 1987 male 55-74 years 1.0 137500.0
35 Albania 1987 male 75+ years 1.0 21800.0
36 Albania 1988 female 15-24 years 8.0 295600.0
37 Albania 1988 female 25-34 years 5.0 262400.0
38 Albania 1988 female 35-54 years 4.0 284500.0
39 Albania 1988 female 5-14 years 0.0 317200.0
40 Albania 1988 female 55-74 years 3.0 147500.0
41 Albania 1988 female 75+ years 2.0 36400.0
42 Albania 1988 male 15-24 years 17.0 319200.0
43 Albania 1988 male 25-34 years 5.0 279900.0
44 Albania 1988 male 35-54 years 14.0 314100.0
45 Albania 1988 male 5-14 years 0.0 345000.0
46 Albania 1988 male 55-74 years 4.0 140200.0
47 Albania 1988 male 75+ years 1.0 22300.0
48 Albania 1989 female 15-24 years 5.0 299900.0
49 Albania 1989 female 25-34 years 2.0 266300.0
50 Albania 1989 female 35-54 years 7.0 288600.0
51 Albania 1989 female 5-14 years 0.0 321900.0
52 Albania 1989 female 55-74 years 1.0 149600.0
53 Albania 1989 female 75+ years 0.0 37000.0
... ... ... ... ... ... ...
43734 Virgin Islands (USA) 2011 male 15-24 years 0.0 6553.0
43735 Virgin Islands (USA) 2011 male 25-34 years 1.0 5271.0
43736 Virgin Islands (USA) 2011 male 35-54 years 2.0 13857.0
43737 Virgin Islands (USA) 2011 male 5-14 years 0.0 7400.0
43738 Virgin Islands (USA) 2011 male 55-74 years 3.0 11730.0
43739 Virgin Islands (USA) 2011 male 75+ years 0.0 2046.0
43740 Virgin Islands (USA) 2012 female 15-24 years 0.0 6833.0
43741 Virgin Islands (USA) 2012 female 25-34 years 1.0 6185.0
43742 Virgin Islands (USA) 2012 female 35-54 years 1.0 15126.0
43743 Virgin Islands (USA) 2012 female 5-14 years 0.0 6929.0
43744 Virgin Islands (USA) 2012 female 55-74 years 2.0 13364.0
43745 Virgin Islands (USA) 2012 female 75+ years 0.0 2940.0
43746 Virgin Islands (USA) 2012 male 15-24 years 2.0 6663.0
43747 Virgin Islands (USA) 2012 male 25-34 years 2.0 5061.0
43748 Virgin Islands (USA) 2012 male 35-54 years 2.0 13542.0
43749 Virgin Islands (USA) 2012 male 5-14 years 0.0 7334.0
43750 Virgin Islands (USA) 2012 male 55-74 years 0.0 11957.0
43751 Virgin Islands (USA) 2012 male 75+ years 0.0 2135.0
43752 Virgin Islands (USA) 2015 female 15-24 years 0.0 6675.0
43753 Virgin Islands (USA) 2015 female 25-34 years 0.0 5662.0
43754 Virgin Islands (USA) 2015 female 35-54 years 0.0 14278.0
43755 Virgin Islands (USA) 2015 female 5-14 years 0.0 7099.0
43756 Virgin Islands (USA) 2015 female 55-74 years 0.0 14269.0
43757 Virgin Islands (USA) 2015 female 75+ years 0.0 3508.0
43758 Virgin Islands (USA) 2015 male 15-24 years 0.0 6933.0
43759 Virgin Islands (USA) 2015 male 25-34 years 2.0 4609.0
43760 Virgin Islands (USA) 2015 male 35-54 years 1.0 12516.0
43761 Virgin Islands (USA) 2015 male 5-14 years 0.0 7291.0
43762 Virgin Islands (USA) 2015 male 55-74 years 0.0 12615.0
43763 Virgin Islands (USA) 2015 male 75+ years 0.0 2496.0

36060 rows × 6 columns

In [138]:
#df_sui=pd.DataFrame(df.groupby(['country','year'])['suicides_no'].sum().reset_index())
#df_sui
#count_max_sui=pd.DataFrame(df_sui.groupby('country')['suicides_no'].sum())
#count_max_sui
In [139]:
df.columns = ["country", "year", "sex", "age", "suicides", "population"]
In [140]:
df = df.sort_values('suicides',ascending = False) # makes the dataframes descending

Statistical Analysis

In [141]:
df.describe().T
Out[141]:
count mean std min 25% 50% 75% max
year 43776.0 1.998502e+03 1.033871e+01 1979.0 1990.00 1999.0 2007.0 2016.0
suicides 41520.0 1.933154e+02 8.005899e+02 0.0 1.00 14.0 91.0 22338.0
population 38316.0 1.664091e+06 3.647231e+06 259.0 85112.75 380655.0 1305698.0 43805214.0
In [142]:
sns.pairplot(df) # Scatter plot and Histograms
C:\Users\sanca\Anaconda3\lib\site-packages\numpy\lib\histograms.py:824: RuntimeWarning:

invalid value encountered in greater_equal

C:\Users\sanca\Anaconda3\lib\site-packages\numpy\lib\histograms.py:825: RuntimeWarning:

invalid value encountered in less_equal

Out[142]:
<seaborn.axisgrid.PairGrid at 0x20bd1cd4e10>
In [ ]:
 
In [143]:
sns.boxplot(x=df["suicides"], y=df["age"]) #no Box and Whisker on age goup
Out[143]:
<matplotlib.axes._subplots.AxesSubplot at 0x20bcfd8b128>

Exploratory Data Analysis

In [144]:
sns.barplot(x=df["suicides"], y=df["age"]) #bar blot
Out[144]:
<matplotlib.axes._subplots.AxesSubplot at 0x20bcfd68d68>
In [145]:
!pip install chart_studio
import chart_studio.plotly as py
import plotly.graph_objs as go #go.figure
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
Requirement already satisfied: chart_studio in c:\users\sanca\anaconda3\lib\site-packages (1.1.0)
Requirement already satisfied: six in c:\users\sanca\anaconda3\lib\site-packages (from chart_studio) (1.12.0)
Requirement already satisfied: plotly in c:\users\sanca\anaconda3\lib\site-packages (from chart_studio) (4.2.1)
Requirement already satisfied: retrying>=1.3.3 in c:\users\sanca\anaconda3\lib\site-packages (from chart_studio) (1.3.3)
Requirement already satisfied: requests in c:\users\sanca\anaconda3\lib\site-packages (from chart_studio) (2.21.0)
Requirement already satisfied: idna<2.9,>=2.5 in c:\users\sanca\anaconda3\lib\site-packages (from requests->chart_studio) (2.8)
Requirement already satisfied: chardet<3.1.0,>=3.0.2 in c:\users\sanca\anaconda3\lib\site-packages (from requests->chart_studio) (3.0.4)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\sanca\anaconda3\lib\site-packages (from requests->chart_studio) (2020.6.20)
Requirement already satisfied: urllib3<1.25,>=1.21.1 in c:\users\sanca\anaconda3\lib\site-packages (from requests->chart_studio) (1.24.1)
In [156]:
suicides_country = df.groupby('country').sum().reset_index()
suicides_country = df.drop('year', 1)
suicides_country
#impute mean
Out[156]:
country sex age suicides population
33128 Russian Federation male 35-54 years 22338.0 19044200.0
33140 Russian Federation male 35-54 years 21706.0 19249600.0
33212 Russian Federation male 35-54 years 21262.0 21476420.0
33200 Russian Federation male 35-54 years 21063.0 21378098.0
33188 Russian Federation male 35-54 years 20705.0 21016400.0
33152 Russian Federation male 35-54 years 20562.0 19507100.0
33116 Russian Federation male 35-54 years 20256.0 18908000.0
33224 Russian Federation male 35-54 years 20119.0 21320535.0
33164 Russian Federation male 35-54 years 18973.0 19913400.0
33236 Russian Federation male 35-54 years 18681.0 21007346.0
33176 Russian Federation male 35-54 years 18058.0 20487900.0
32960 Russian Federation male 35-54 years 17610.0 17229600.0
33248 Russian Federation male 35-54 years 17465.0 20843896.0
32972 Russian Federation male 35-54 years 17355.0 17023600.0
32984 Russian Federation male 35-54 years 17215.0 16915700.0
33260 Russian Federation male 35-54 years 15843.0 20550405.0
33104 Russian Federation male 35-54 years 15824.0 18933500.0
33020 Russian Federation male 35-54 years 14891.0 16814300.0
33272 Russian Federation male 35-54 years 14020.0 20336306.0
33284 Russian Federation male 35-54 years 13065.0 20173873.0
33092 Russian Federation male 35-54 years 12746.0 18782900.0
33080 Russian Federation male 35-54 years 12517.0 18495100.0
33068 Russian Federation male 35-54 years 12030.0 18058500.0
33296 Russian Federation male 35-54 years 11848.0 20041975.0
42296 United States of America male 35-54 years 11767.0 42798501.0
42320 United States of America male 35-54 years 11763.0 42326226.0
33308 Russian Federation male 35-54 years 11721.0 19901557.0
42308 United States of America male 35-54 years 11681.0 42566273.0
42356 United States of America male 35-54 years 11634.0 41658010.0
42284 United States of America male 35-54 years 11613.0 42932194.0
... ... ... ... ... ...
42810 Uzbekistan male 15-24 years NaN 1880800.0
42811 Uzbekistan male 25-34 years NaN 1223400.0
42812 Uzbekistan male 35-54 years NaN 1283000.0
42813 Uzbekistan male 5-14 years NaN 2195500.0
42814 Uzbekistan male 55-74 years NaN 514800.0
42815 Uzbekistan male 75+ years NaN 134500.0
43536 Virgin Islands (USA) female 15-24 years NaN 10100.0
43537 Virgin Islands (USA) female 25-34 years NaN 6900.0
43538 Virgin Islands (USA) female 35-54 years NaN 8600.0
43539 Virgin Islands (USA) female 5-14 years NaN 12000.0
43540 Virgin Islands (USA) female 55-74 years NaN 4600.0
43541 Virgin Islands (USA) female 75+ years NaN 1100.0
43542 Virgin Islands (USA) male 15-24 years NaN 10500.0
43543 Virgin Islands (USA) male 25-34 years NaN 6800.0
43544 Virgin Islands (USA) male 35-54 years NaN 8000.0
43545 Virgin Islands (USA) male 5-14 years NaN 12500.0
43546 Virgin Islands (USA) male 55-74 years NaN 3800.0
43547 Virgin Islands (USA) male 75+ years NaN 600.0
43548 Virgin Islands (USA) female 15-24 years NaN 11800.0
43549 Virgin Islands (USA) female 25-34 years NaN 8000.0
43550 Virgin Islands (USA) female 35-54 years NaN 10100.0
43551 Virgin Islands (USA) female 5-14 years NaN 14000.0
43552 Virgin Islands (USA) female 55-74 years NaN 5500.0
43553 Virgin Islands (USA) female 75+ years NaN 1300.0
43554 Virgin Islands (USA) male 15-24 years NaN 12300.0
43555 Virgin Islands (USA) male 25-34 years NaN 7800.0
43556 Virgin Islands (USA) male 35-54 years NaN 9600.0
43557 Virgin Islands (USA) male 5-14 years NaN 14600.0
43558 Virgin Islands (USA) male 55-74 years NaN 4500.0
43559 Virgin Islands (USA) male 75+ years NaN 700.0

43776 rows × 5 columns

In [ ]:
 
In [157]:
count = [ dict(
        type = 'choropleth',
        locations = suicides_country['country'],
        locationmode='country names',
        z = suicides_country['suicides'],
        text = suicides_country['country'],
        colorscale = 'Viridis',
        autocolorscale = False,
        reversescale = True,
        marker = dict(
            line = dict (
                color = 'rgb(180,180,180)',
                width = 0.5
            ) ),
        colorbar = dict(
            autotick =False,
            title = 'Suicide by Country'),
      ) ]
layout = dict(
    title = 'World Suicide Map',
    geo = dict(
        showframe = True,
        showcoastlines = True,
        projection = dict(
            type = 'Mercator'
        )
    )
)
fig = dict( data=count, layout=layout )
iplot( fig, validate=False, filename='d3-world-map' )
In [102]:
import plotly.express as px
suicides_country.head()
Out[102]:
country continent lifeExp pop gdpPercap iso_alpha iso_num
11 Afghanistan Asia 43.828 31889923 974.580338 AFG 4
23 Albania Europe 76.423 3600523 5937.029526 ALB 8
35 Algeria Africa 72.301 33333216 6223.367465 DZA 12
47 Angola Africa 42.731 12420476 4797.231267 AGO 24
59 Argentina Americas 75.320 40301927 12779.379640 ARG 32
In [ ]:
 
In [80]:
suicides_country["suicides"].unique() #Make unique values for the list Z
Out[80]:
array([2.2338e+04, 2.1706e+04, 2.1262e+04, ..., 1.0000e+00, 0.0000e+00,
              nan])
In [19]:
#vals = sum_state['Deaths'].unique()
#z = []
#for i in vals:
    #i = float(i)
    #z.append(i)
#print(z)
In [20]:
'''data = dict(
    type = 'choropleth', #key type
    locations = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA", 
          "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", 
          "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", 
          "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", 
          "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"],
    locationmode = 'USA-states', #lets plotly know its USA
    colorscale = 'Reds',
    text = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA", 
          "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", 
          "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", 
          "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", 
          "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"], #a list of what hovers over each of the locations
    z = z, # equal to the values that are going to be shown to you in an actual color scale
    colorbar = {'title':'Colorbar Title Here'}
)
#text must be in the same index location as locations'''
Out[20]:
'data = dict(\n    type = \'choropleth\', #key type\n    locations = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA", \n          "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", \n          "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", \n          "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", \n          "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"],\n    locationmode = \'USA-states\', #lets plotly know its USA\n    colorscale = \'Reds\',\n    text = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA", \n          "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", \n          "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", \n          "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", \n          "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"], #a list of what hovers over each of the locations\n    z = z, # equal to the values that are going to be shown to you in an actual color scale\n    colorbar = {\'title\':\'Colorbar Title Here\'}\n)\n#text must be in the same index location as locations'
In [21]:
#data
In [22]:
"""layout = dict(geo = {'scope':'usa'})
choromap = go.Figure(data = [data],layout=layout)
iplot(choromap)"""
Out[22]:
"layout = dict(geo = {'scope':'usa'})\nchoromap = go.Figure(data = [data],layout=layout)\niplot(choromap)"
In [ ]:
 

Correlations

In [21]:
corr = df.corr()
corr
Out[21]:
year suicides population
year 1.000000 -0.011985 0.027016
suicides -0.011985 1.000000 0.611406
population 0.027016 0.611406 1.000000
In [24]:
sns.heatmap(corr, annot=True)
Out[24]:
<matplotlib.axes._subplots.AxesSubplot at 0x20bc5581198>
In [26]:
sns.regplot(x=df["suicides"], y=df['population'])
Out[26]:
<matplotlib.axes._subplots.AxesSubplot at 0x20bc75ef048>
In [ ]: