import os
from glob import glob
import re
import pandas as pd
import numpy as np
!pip install matplotlib
!pip install seaborn
!pip install sklearn
!pip install openpyxl
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import Markdown
import plotly.express as px


# def _extract_year_from_dir(path):
#     '''
#     path: path to query extract with year in name
    
#     had to query CDC wonder GUI individually by year
#     due to time outs so need to add year variable
#     '''
#     return re.search('\d+',path).group()

def read_cdc_wonder_query(path):
    '''
    path: path to query extract with query info and footnotes
    
    reads text file and skips the last 83 lines which 
    is used for footnotes in CDC wonder extracts
    '''
    df = pd.read_table(path)
    #df['year'] = _extract_year_from_dir(path) #used in previous version
    
    #get rid of query details at end of text file
    no_notes = df.Notes.isna()
    is_total = df.Notes=='Total'
    #print(is_total.sum())
    return df.loc[(no_notes | is_total)]


def convert_float_to_str(series,fill_missing_to=0):
    '''
    for grouping/coding variables, want to be able to see unique 
    values for pandas fxns such as describe().
    
    '''
    
    return series.fillna(fill_missing_to).astype(int).astype(str)


def clean_column_names(df,replace_str='_'):
    #clean column names
    df.columns = [x.lower().replace(' ',replace_str) for x in df.columns]
    
    
def show_every_other_axis_label(fig,axis='x'):
    '''
    makes a plt figure only show every other label for given axis
    '''
    if axis=='x':
        tick_labels = fig.get_xticklabels()
    else:
        tick_labels = fig.get_yticklabels()
        
    for ind, label in enumerate(tick_labels):
        if ind % 2 == 0:  # show every odd every odd year
            label.set_visible(True)
        else:
            label.set_visible(False)
def format_wide_hedegaard_data_for_plot(df,variable_name_key):
    '''
    format cleaned data prepared from the CDC Hedegaard data brief
    for plotting trajectories based on different dimensions
    '''
    formatted_data = (
        df
        .assign(year=lambda x: x.year.astype(int).astype(str)) #make year factor for no decimals
        .set_index('year')
        [variable_name_key.keys()]
        .rename(columns=variable_name_key)
    )
    return formatted_data

def draw_hedegaard_lineplot(df,variable_name_key=None,data_structure='wide',
                            yvar=None,groupvar=None,lineplot_args={}):
    '''
    convert x variable (years) into a factor and format 
    for seaborn lineplot input using specificied 
    variables in df as keys and names rendered as values
    in the input dict (ie variable name key)
    '''
    
    if data_structure=='wide':
        fig = sns.lineplot(
           data= format_wide_hedegaard_data_for_plot(df,variable_name_key),
            **lineplot_args
        )
    else:
        fig = sns.lineplot(data=df, x="year", y=yvar, hue=groupvar,**lineplot_args)
    #customize aesthetics more like paper
    ##tick labels every other year
    show_every_other_axis_label(fig)
    return fig


!gen3 drs-pull object dg.H34L/5f8f3d67-15a5-4eff-834d-61ecea345588
!gen3 drs-pull object dg.H34L/ef07a234-5ab9-474b-8239-712fdf494164
!gen3 drs-pull object dg.H34L/f5cbd03a-217a-4786-80dd-2ca4fe3a6203
!gen3 drs-pull object dg.H34L/7dd7f3da-c905-4388-a051-c4802f7bbd1c
!gen3 drs-pull object dg.H34L/d0759aac-ea06-40ae-b3af-360ffe1823a0

deaths_gender.xlsx                           : 100%|████████████████████████████
deaths_age_cat.xlsx                          : 100%|████████████████████████████
deaths_type_opioid.xlsx                      : 100%|████████████████████████████
cdc_wonder_year_cause_hedegaard_et_al_2020.txt: 100%|███████████████████████████
cdc_wonder_year_cause_state_hedegaard_et_al_2020.txt: 100%|█████████████████████


#read in cleaned data from HEAL platform
opioid_gender = pd.read_excel("./deaths_gender.xlsx")
opioid_age = pd.read_excel("./deaths_age_cat.xlsx")
opioid_type = pd.read_excel("./deaths_type_opioid.xlsx")


#format dataframe for lineplot
#get rid of spaces in names
clean_column_names(opioid_gender,replace_str='')

#select and format categories and names
gender_column_labels = {
    'deaths_total_per1k':'Total',
    'deaths_male_per1k':'Male',
    'deaths_female_per1k':'Female'
}


#draw plot like  in publication
fig1_gender = draw_hedegaard_lineplot(opioid_gender,gender_column_labels)
##fig/axis titles
ytitle = "Deaths per 100,000 standard population"
xtitle = ''
fig1_name = ("Figure 1. Age-adjusted drug overdose death rates, by sex:"+
             "\n"+
             "United States, 1999–2019")

plt.ylabel(ytitle)
plt.xlabel('')
plt.title(fig1_name)
plt.show()


#age mapping
age_column_labels = {
    'age_15_24_per1k':'15 - 24',
    'age_25_34_per1k':'25 - 34',
    'age_35_44_per1k':'35 - 44',
    'age_45_54_per1k':'45 - 54',
    'age_55_64_per1k': '55 - 64',
    'age_65_over_per1k':'65 and over'
}

fig2_age = draw_hedegaard_lineplot(opioid_age,age_column_labels)

##fig/axis titles
ytitle = "Deaths per 100,000 standard population"
xtitle = ''
fig2_name = ("Figure 2. Drug overdose death rates among those aged 15 and over,"+
             "\n"+
             "by selected age group: United States, 1999–2019")

plt.ylabel(ytitle)
plt.xlabel('')
plt.title(fig2_name)

Text(0.5, 1.0, 'Figure 2. Drug overdose death rates among those aged 15 and over,\nby selected age group: United States, 1999–2019')


#type
type_column_labels = {
'any_opioids_per1k':'All opioids',
 'heroin_per1k':'Heroin',
 'natural_semisynthetic_opioids_per1k':'Natural and semisynthetic opioids',
 'methadone_per1k': 'Methadone',
 'other_than_methadone_per1k':'Synthetic opioids other than methadone'
}

fig3_type = draw_hedegaard_lineplot(opioid_type,type_column_labels)

##fig/axis titles
ytitle = "Deaths per 100,000 standard population"
xtitle = ''
fig3_name = ("Figure 3. Age-adjusted rates of drug overdose deaths involving opioids,"+
             "\n"+
             "by type of opioid: United States, 1999–2019")

plt.ylabel(ytitle)
plt.xlabel('')
plt.title(fig3_name)

Text(0.5, 1.0, 'Figure 3. Age-adjusted rates of drug overdose deaths involving opioids,\nby type of opioid: United States, 1999–2019')


total_df = read_cdc_wonder_query("./cdc_wonder_year_cause_hedegaard_et_al_2020.txt")
clean_column_names(total_df)
#mapping of ICD Codes to labels used in CDC data brief
type_column_labels = {
 'T40.1':'Heroin',
 'T40.2':'Natural and semisynthetic opioids',
 'T40.3': 'Methadone',
 'T40.4':'Synthetic opioids other than methadone'
}

#make a variable with name mapping
total_df['type'] = total_df['multiple_cause_of_death_code'].map(type_column_labels)
#convert year variable to string object for plotting purposes
total_df['year'] = convert_float_to_str(total_df['year'])

#create boolean vector for focus of subsequent analyses (for filtering)
#ie synthetic opioids
is_opioid = total_df['multiple_cause_of_death_code'].isin(type_column_labels.keys())
is_synthetic_type = total_df['multiple_cause_of_death_code']=='T40.4'


#filter only synthetic opioids other than methadone
state_df = read_cdc_wonder_query("./cdc_wonder_year_cause_state_hedegaard_et_al_2020.txt")
clean_column_names(state_df)
type_column_labels = {
 'T40.1':'Heroin',
 'T40.2':'Natural and semisynthetic opioids',
 'T40.3': 'Methadone',
 'T40.4':'Synthetic opioids other than methadone'
}

state_df['type'] = state_df['multiple_cause_of_death_code'].map(type_column_labels)
state_df['year'] = convert_float_to_str(state_df['year'])

#create boolean vector for focus of subsequent analyses (for filtering)
#ie synthetic opioids
is_opioid = state_df['multiple_cause_of_death_code'].isin(type_column_labels.keys())
is_synthetic_type = state_df['multiple_cause_of_death_code']=='T40.4'
#only want individual states and not totals (CDC queries give both)
is_total = np.where(state_df.notes.isna(),False,True)


#make these objects given they are codes
state_df['state_code'] = convert_float_to_str(state_df['state_code'])
#state_df['county_code'] = convert_float_to_str(state_df['county_code'])


#convert metrics to numeric values and make a key for why missing
data = state_df
for metric_name in ['crude_rate','deaths','population','age_adjusted_rate']:
    #get missing val strings
    reasons_missing = ['Suppressed','Unreliable','Missing',None]
    is_missing = data[metric_name].isin(reasons_missing) 

    #make a key showing to retain info on why missing
    data[metric_name+'_missing_desc'] = data[metric_name]
    data.loc[~is_missing,metric_name+'_missing_desc'] = ''

    #missing for diff reasons but need to make float var from string -- if missing, make a NaN
    data.loc[is_missing,metric_name] = np.nan #change missing val strings to NaNs
    data[metric_name] = data[metric_name].astype(float) #change to var to float
del data


#draw the same plot from hedegaard et al data brief but with state level
#data directly from CDC Wonder API
draw_hedegaard_lineplot(
    state_df,
    yvar='age_adjusted_rate',
    groupvar="type",
    data_structure='long',
    lineplot_args={'linewidth':5,
                   'ci':95,
                   'err_style':'band',
                   'n_boot':5000}
)

##fig/axis titles
ytitle = "Deaths per 100,000 standard population"
xtitle = ''
fig3_name = ("Age-adjusted rates of drug overdose deaths involving opioids,"+
             "\n"+
             "by type of opioid: United States, 1999–2019"+
             "\n"+
            "\nwith 95% confidence intervals")

plt.ylabel(ytitle)
plt.xlabel('')
plt.title(fig3_name)
plt.legend(title="Opioid Type")
#plt.savefig('death_rate_by_opioid_type_cdc_wonder.png')
plt.show()


g_individual = draw_hedegaard_lineplot(
    state_df.loc[is_synthetic_type & ~is_total],
    yvar='age_adjusted_rate',
    groupvar="state",
    data_structure='long'
)
g_individual.legend().remove()
plt.ylabel("Deaths per 100,000 standard population")
plt.title("Death rate by year for individual states")
plt.show()


#2019,synthetic categorical bar plot organized from smallest to largest
is_2019 = state_df.year=='2019'
state_2019_synthetic_sorted_df = (
    state_df
    .loc[(is_2019 & is_synthetic_type & ~is_total)]
    .sort_values(by='age_adjusted_rate',ascending=False)
)


#median state death rate for comparison to individual
median_state_death_rate = state_2019_synthetic_sorted_df.age_adjusted_rate.median()


#plot a histogram
sns.displot(state_2019_synthetic_sorted_df,x='age_adjusted_rate',bins=20,rug=True)
plt.vlines(
    x=median_state_death_rate,
    ymin=0,ymax=6,
    colors='black'
)
plt.xlabel("Deaths per 100,000 standard population")
plt.title("Frequency of synthetic opioid (other than methadone)\nstate death rates for the year 2019")
plt.show()


plt.figure(figsize=(8,10))
g = sns.barplot(
    data=state_2019_synthetic_sorted_df,
    x='age_adjusted_rate',y='state'
)

plt.vlines(x=median_state_death_rate,ymin=-1,ymax=51,colors='black')

plt.xlabel("Deaths per 100,000 standard population")
plt.title(
    "Frequency of synthetic opioid (other than methadone) state death rates\n"+
    "ranked from largest to smallest death rates for the year 2019"
)
plt.show()


#plotly only accepts state abbreviations if not using GEOJSON
state_abbrevs = {'Alabama': 'AL',
        'Alaska': 'AK',
        'Arizona': 'AZ',
        'Arkansas': 'AR',
        'California': 'CA',
        'Colorado': 'CO',
        'Connecticut': 'CT',
        'Delaware': 'DE',
        'District of Columbia': 'DC',
        'Florida': 'FL',
        'Georgia': 'GA',
        'Hawaii': 'HI',
        'Idaho': 'ID',
        'Illinois': 'IL',
        'Indiana': 'IN',
        'Iowa': 'IA',
        'Kansas': 'KS',
        'Kentucky': 'KY',
        'Louisiana': 'LA',
        'Maine': 'ME',
        'Maryland': 'MD',
        'Massachusetts': 'MA',
        'Michigan': 'MI',
        'Minnesota': 'MN',
        'Mississippi': 'MS',
        'Missouri': 'MO',
        'Montana': 'MT',
        'Nebraska': 'NE',
        'Nevada': 'NV',
        'New Hampshire': 'NH',
        'New Jersey': 'NJ',
        'New Mexico': 'NM',
        'New York': 'NY',
        'North Carolina': 'NC',
        'North Dakota': 'ND',
        'Ohio': 'OH',
        'Oklahoma': 'OK',
        'Oregon': 'OR',
        'Pennsylvania': 'PA',
        'Rhode Island': 'RI',
        'South Carolina': 'SC',
        'South Dakota': 'SD',
        'Tennessee': 'TN',
        'Texas': 'TX',
        'Utah': 'UT',
        'Vermont': 'VT',
        'Virginia': 'VA',
        'Washington': 'WA',
        'West Virginia': 'WV',
        'Wisconsin': 'WI',
        'Wyoming': 'WY'}


state_2019_synthetic_sorted_df['State'] = (
    state_2019_synthetic_sorted_df['state']
    .map(state_abbrevs)
)


display(Markdown(("### Frequency of synthetic opioid (other than methadone) state death rates for the year 2019")))
fig = px.choropleth(
    state_2019_synthetic_sorted_df,
    locations='State', 
    locationmode='USA-states',
    color='age_adjusted_rate',
    scope="usa",
    labels={'age_adjusted_rate':'Death Rate'}    
)
fig.show()


from sklearn.cluster import KMeans
kmeans = KMeans(3,random_state=100)


# make a Kmeans cluster model and assign the cluster to each state
kmeans_groups = kmeans.fit_predict(
    (state_2019_synthetic_sorted_df
         [['age_adjusted_rate']]
         .fillna(median_state_death_rate)),
    
    ) + 1
state_2019_synthetic_sorted_df['kmeans_cluster'] = kmeans_groups


cluster_name_map = {
    3:'High',
    1:'Medium',
    2:'Low',
    -1:'Missing data'
}


#needed to impute missing values. Now I am going to add back in the missing values for viz
is_na_rate = state_2019_synthetic_sorted_df.age_adjusted_rate.isna()
state_2019_synthetic_sorted_df.loc[is_na_rate,'kmeans_cluster'] = -1


state_2019_synthetic_sorted_df['Death Rate Cluster'] = (
    state_2019_synthetic_sorted_df
    ['kmeans_cluster']
    .map(cluster_name_map)
)


display(Markdown(("# Frequency of synthetic opioid (other than methadone) state death rates for the year 2019")))
fig = px.choropleth(
    state_2019_synthetic_sorted_df,
    locations='State', 
    locationmode='USA-states',
    color='Death Rate Cluster',
    scope="usa",
    labels={'age_adjusted_rate':'Death Rate'},
    title='Synthetic opioid death rates for the year 2019'
)
fig.show()

Exploring Opioid Overdose Death Rate Trajectories in the United States¶

Summary of findings¶

Load in necessary packages and define functions¶

Pull file objects using the Gen3 SDK¶

Reproduce visualizations directly from Hedegaard et al., 2020 data brief ¶

Figure 1 : Gender¶

Figure 2: Age categories¶

Figure 3: Opioid Types¶

Deeper dive into the variance associated with opioid overdose increases directly from CDC Wonder data¶

Reproduce opioid type figure from publication using direct CDC Wonder data¶

Investigate distribution of 2019 synthetic opioid overdoses across states¶

Mapping 2019 State Death Rates to determine spatial associations/groupings¶

First, lets map the death rates continuously (ie no groupings).¶

Frequency of synthetic opioid (other than methadone) state death rates for the year 2019¶

Using KMeans clustering to group state death rates¶

Frequency of synthetic opioid (other than methadone) state death rates for the year 2019¶

Next steps¶

Exploring Opioid Overdose Death Rate Trajectories in the United States¶

Summary of findings¶

Load in necessary packages and define functions¶

Pull file objects using the Gen3 SDK¶

Reproduce visualizations directly from Hedegaard et al., 2020 data brief¶

Figure 1 : Gender¶

Figure 2: Age categories¶

Figure 3: Opioid Types¶

Deeper dive into the variance associated with opioid overdose increases directly from CDC Wonder data¶

Reproduce opioid type figure from publication using direct CDC Wonder data¶

Investigate distribution of 2019 synthetic opioid overdoses across states¶

Mapping 2019 State Death Rates to determine spatial associations/groupings¶

First, lets map the death rates continuously (ie no groupings).¶

Frequency of synthetic opioid (other than methadone) state death rates for the year 2019¶

Using KMeans clustering to group state death rates¶

Frequency of synthetic opioid (other than methadone) state death rates for the year 2019¶

Next steps¶

Reproduce visualizations directly from Hedegaard et al., 2020 data brief ¶