This Jupyter notebook uses the BACPAC study as an example to demonstrate how to navigate datasets within the HEAL workspace and analyze them with Python libraries.
# Uncomment the lines below (remove the leading #) to install the required Python libraries
#!pip install numpy
#!pip install pandas==1.1.5
#!pip install plotly==4.14.3
import pandas as pd
import plotly
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio
from pathlib import Path
import numpy as np
import json
import requests
import os
# Initialise Plotly's offline notebook mode before any figure is shown.
plotly.offline.init_notebook_mode()
Users can query study metadata in the HEAL data commons through the metadata service (MDS). The cell below shows how to retrieve the metadata of the BACPAC study from the Gen3 MDS endpoint.
# Query the metadata of BACPAC using the project number "1U24AR076730-01"
response = requests.get(
    "https://healdata.org/mds/metadata?data=True&limit=1000&gen3_discovery.project_number=1U24AR076730-01",
    # Without a timeout the cell can hang indefinitely if the service stalls.
    timeout=60,
)
# Fail with a clear HTTP error instead of trying to parse an error page as JSON.
response.raise_for_status()
metadata_text = response.text
metadata_object = json.loads(metadata_text)
# Keep only the "gen3_discovery" section of each returned record and flatten
# it into one row per record.
# NOTE(review): assumes the response is a JSON object mapping record ids to
# metadata dicts — confirm against the MDS API.
meta_df = pd.json_normalize(
    [sub["gen3_discovery"] for sub in metadata_object.values() if "gen3_discovery" in sub]
)
# Show the selected fields as rows (one column per matching study).
meta_df[['research_focus_area', 'study_description_summary', 'institutions']].transpose()
0 | |
---|---|
research_focus_area | Clinical Research in Pain Management |
study_description_summary | The BACPAC Research Programs Data Integration,... |
institutions | UNIV OF NORTH CAROLINA CHAPEL HILL |
# Download three data objects with the gen3 command-line tool, one per object id.
# NOTE(review): presumably these are the participant, substance-use and
# physical-function TSV files read later in this notebook — confirm the
# id-to-file mapping.
!gen3 drs-pull object dg.H34L/80f0a338-18e0-48de-b70f-cdabd63f67d9
!gen3 drs-pull object dg.H34L/530fd95c-48b6-488e-a699-9377180bd82d
!gen3 drs-pull object dg.H34L/654d7f1f-b61c-49a9-8a74-c82400fa4c27
# Load the participant demographics table (tab-separated text)
demo_bacpac = pd.read_csv(
    "./participant_SMART.tsv",
    encoding="utf-8",
    sep="\t",
)
# Pull the ages out as a plain list, which is what age_group() takes
age_list = list(demo_bacpac["age_in_years"])
def age_group(agelist):
    """Label each age as "<youngest>-55 yr" or ">55 yr".

    Parameters
    ----------
    agelist : list
        Ages in years. Missing values (None / NaN) are allowed.

    Returns
    -------
    list
        One label per input age, in the same order. Missing ages map to
        None rather than being counted in the ">55 yr" group. An empty
        input returns an empty list.
    """
    # Missing ages must not take part in min(): a leading NaN would turn
    # the first label into "nan-55 yr".
    known_ages = [age for age in agelist if not pd.isna(age)]
    if not known_ages:
        return [None for _ in agelist]

    grouplabel1 = str(min(known_ages)) + "-55 yr"
    grouplabel2 = ">55 yr"

    grouplist = []
    for age in agelist:
        if pd.isna(age):
            # NaN <= 55 is False, so without this branch a missing age
            # would silently be labelled ">55 yr".
            grouplist.append(None)
        elif age <= 55:
            grouplist.append(grouplabel1)
        else:
            grouplist.append(grouplabel2)
    return grouplist
# Attach each participant's age-group label as a new demographic column
agegrouplist = age_group(age_list)
demo_bacpac["age_group"] = agegrouplist
# Build three frequency tables: race, ethnicity and age group, each against sex
df1 = pd.crosstab(demo_bacpac["race"], demo_bacpac["sex"])
df2 = pd.crosstab(demo_bacpac["ethnicity"], demo_bacpac["sex"])
df3 = pd.crosstab(demo_bacpac["age_group"], demo_bacpac["sex"])
# Display the three tables stacked, with an outer index level naming each factor
pd.concat(
    [df1, df2, df3],
    keys=['race', 'ethnicity', 'age_group'],
)
sex | Female | Intersex | Male | Unknown | |
---|---|---|---|---|---|
race | American Indian or Alaska Native | 5 | 1 | 3 | 1 |
Asian | 2 | 0 | 3 | 0 | |
Black or African American | 9 | 3 | 6 | 2 | |
Multiple | 5 | 1 | 4 | 0 | |
Native Hawaiian or Pacific Islander | 5 | 0 | 1 | 2 | |
Not reported | 4 | 2 | 2 | 1 | |
Unknown | 3 | 2 | 3 | 1 | |
White | 37 | 4 | 26 | 12 | |
ethnicity | Hispanic or Latino | 18 | 5 | 12 | 6 |
Not Hispanic or Latino | 36 | 4 | 27 | 6 | |
Not reported | 7 | 1 | 4 | 4 | |
Unknown | 9 | 3 | 5 | 3 | |
age_group | 20-55 yr | 42 | 9 | 29 | 14 |
>55 yr | 28 | 4 | 19 | 5 |
# Reshape the ethnicity-by-sex table to long form: one row per
# (ethnicity, sex) pair with its count, ready for plotting
new_df2 = (
    df2.stack()
    .to_frame()
    .reset_index()
    .rename(columns={0: "Count", "sex": "Sex", "ethnicity": "Ethnicity"})
)
# Bar chart of participant counts per sex, coloured by ethnicity
fig1 = px.bar(
    new_df2,
    x="Sex",
    y="Count",
    color="Ethnicity",
    title="Ethnicity and Sex Characteristics of Participants in the BACPAC Study",
    width=800,
    height=500,
)
fig1.update_layout(title_font_size=20)
fig1.show()
# Load the substance-use table (tab-separated text)
substance_df = pd.read_csv(
    "./substance_use_SMART.tsv",
    encoding="utf-8",
    sep="\t",
)
# Combine substance use df and demographic df based on participant id
def find_participant(mydf, endstr):
    """Derive participant ids by removing a trailing suffix from submitter ids.

    Parameters
    ----------
    mydf : mapping or DataFrame
        Must provide a "submitter_id" column of strings.
    endstr : str
        Suffix to remove (for example "_sc"). Ids that do not end with it
        are returned unchanged; an empty suffix removes nothing.

    Returns
    -------
    list of str
        One participant id per submitter id, in the same order.
    """
    participant_id = []
    for submitter_id in mydf["submitter_id"]:
        # str.rstrip(endstr) would strip any trailing run of the *characters*
        # in endstr (e.g. "abc_sc" -> "ab"), so remove the exact suffix instead.
        if endstr and submitter_id.endswith(endstr):
            submitter_id = submitter_id[: -len(endstr)]
        participant_id.append(submitter_id)
    return participant_id
# Derive a participant id for every substance-use record from its submitter id
# (find_participant is called with the "_sc" ending)
substance_participant_id = find_participant(substance_df, "_sc")
substance_df["participant_id"] = substance_participant_id
# Outer-join with the demographics so rows present on only one side are kept
demo_combine_substance = pd.merge(
    substance_df,
    demo_bacpac,
    how="outer",
    left_on="participant_id",
    right_on="submitter_id",
)
# Add one property of time point in the df
def find_timepoint(mydf):
    """Classify each record's visit as "Week 0" or "Week 12".

    Parameters
    ----------
    mydf : mapping or DataFrame
        Must provide a "visits.submitter_id" column.

    Returns
    -------
    list
        One entry per row, in order: "Week 0" when the visit id ends with
        "Week 0", "Week 12" for any other string id, and None when the
        visit id is missing (not a string).
    """
    timepoint = []
    for visit_id in mydf["visits.submitter_id"]:
        if not isinstance(visit_id, str):
            # Rows without a visit (e.g. NaN introduced by an outer merge)
            # have no time point; calling .endswith on them would raise.
            timepoint.append(None)
        elif visit_id.endswith("Week 0"):
            timepoint.append("Week 0")
        else:
            timepoint.append("Week 12")
    return timepoint
# Tag every merged row with the time point of its visit
demo_combine_substance["time_point"] = find_timepoint(demo_combine_substance)
# Frequency table: opioid medication answer (OPIOID01) against time point
opioid_crosstab = pd.crosstab(
    demo_combine_substance["OPIOID01"],
    demo_combine_substance["time_point"],
)
# Long form: one row per (answer, time point) pair with its count
new_opioid = (
    opioid_crosstab.stack()
    .to_frame()
    .reset_index()
    .rename(columns={0: "Count", "OPIOID01": "Taking Opioid", "time_point": "Time Point"})
)
# Bar chart of the opioid answers, one facet row per time point
fig2 = px.bar(
    new_opioid,
    x="Taking Opioid",
    y="Count",
    color="Taking Opioid",
    facet_row="Time Point",
    width=800,
    height=400,
)
fig2.update_layout(
    title_text="Self-Report of Opioid Pain Medication Use at Baseline and Twelve Weeks",
    title_font_size=20,
)
# Set the same bar width on every trace
for data in fig2.data:
    data["width"] = 0.6
fig2.show()
# Frequency table: (opioid answer, sex) against time point
opioid_gender = pd.crosstab(
    [demo_combine_substance["OPIOID01"], demo_combine_substance["sex"]],
    demo_combine_substance["time_point"],
)
# Long form: one row per (answer, sex, time point) combination with its count
new_opioid_gender = (
    opioid_gender.stack()
    .to_frame()
    .reset_index()
    .rename(
        columns={
            0: "Count",
            "OPIOID01": "Taking Opioid",
            "time_point": "Time Point",
            "sex": "Sex",
        }
    )
)
# Horizontal bars per sex, coloured by opioid answer, one facet column per time point
fig3 = px.bar(
    new_opioid_gender,
    x="Count",
    y="Sex",
    color="Taking Opioid",
    facet_col="Time Point",
    orientation='h',
    width=800,
    height=400,
    category_orders={"Sex": ["Intersex", "Unknown", "Male", "Female"]},
)
fig3.update_layout(
    title_text="Opioid Pain Medication at Two Time Points in Different Sex Groups",
    title_font_size=20,
)
fig3.show()
The cell below uses the Physical Function 6b T-Score to display physical function outcomes in different ethnicity groups at week 0 and week 12.
# Load the physical-function table (tab-separated text)
# NOTE(review): this file is read as UTF-16 while the other TSV files are read
# as UTF-8 — confirm the encoding is intentional.
function_df = pd.read_csv(
    "./physical_function_SMART.tsv",
    encoding="utf-16",
    sep="\t",
)
# Derive a participant id for every record from its submitter id
# (find_participant is called with the "_pf" ending)
function_participant_id = find_participant(function_df, "_pf")
function_df["participant_id"] = function_participant_id
# Outer-join with the demographics so rows present on only one side are kept
demo_combine_function = pd.merge(
    function_df,
    demo_bacpac,
    how="outer",
    left_on="participant_id",
    right_on="submitter_id",
)
# Tag every merged row with the time point of its visit
demo_combine_function["time_point"] = find_timepoint(demo_combine_function)
# Summary statistics of the PROMIS-Physical Function 6b T-Score (PRPF6BT)
# for every time point / ethnicity combination
ethnicity_PRPF6BT = (
    demo_combine_function.loc[:, ["time_point", "PRPF6BT", "ethnicity"]]
    .groupby(by=['time_point', 'ethnicity'])
    .describe()
)
ethnicity_PRPF6BT
PRPF6BT | |||||||||
---|---|---|---|---|---|---|---|---|---|
count | mean | std | min | 25% | 50% | 75% | max | ||
time_point | ethnicity | ||||||||
Week 0 | Hispanic or Latino | 41.0 | 38.285366 | 3.165009 | 32.5 | 36.0 | 38.50 | 39.300 | 48.7 |
Not Hispanic or Latino | 73.0 | 37.541096 | 2.598228 | 31.5 | 35.1 | 37.60 | 39.300 | 44.2 | |
Not reported | 16.0 | 37.612500 | 2.937544 | 33.4 | 35.1 | 38.05 | 40.200 | 43.1 | |
Unknown | 20.0 | 38.195000 | 2.467361 | 31.5 | 37.4 | 38.50 | 39.300 | 42.1 | |
Week 12 | Hispanic or Latino | 41.0 | 37.536585 | 3.062985 | 29.1 | 35.1 | 37.60 | 39.300 | 43.1 |
Not Hispanic or Latino | 73.0 | 37.589041 | 3.242665 | 31.5 | 35.1 | 37.60 | 40.200 | 44.2 | |
Not reported | 16.0 | 37.743750 | 2.740065 | 33.4 | 36.0 | 36.80 | 39.525 | 43.1 | |
Unknown | 20.0 | 36.925000 | 3.263656 | 29.1 | 35.1 | 36.00 | 38.700 | 44.2 |
# Visualize the distribution of the Physical Function 6b T-Score at two time
# points: all participants on the top row, then the Hispanic and non-Hispanic
# ethnicity groups side by side on the bottom row
def _add_week_histograms(fig, frame, row, col, ethnicity=None):
    """Add the Week 0 and Week 12 PRPF6BT histograms to one subplot.

    Without ``ethnicity`` all rows of ``frame`` are used and each trace is
    named after its time point. With ``ethnicity`` only the rows of that
    group are used and the traces are hidden from the legend.
    """
    for week, color in (("Week 0", "#EB89B5"), ("Week 12", "#2B6CBE")):
        selected = frame["time_point"] == week
        if ethnicity is None:
            extra = {"name": week}
        else:
            selected = selected & (frame["ethnicity"] == ethnicity)
            extra = {"showlegend": False}
        fig.add_trace(
            go.Histogram(
                x=frame[selected]["PRPF6BT"],
                marker_color=color,
                opacity=0.75,
                nbinsx=20,
                **extra,
            ),
            row=row,
            col=col,
        )


def _label_axes(fig, row, col, count_max):
    """Title both axes of one subplot and fix their ranges."""
    fig.update_yaxes(
        title_text="Count",
        title_font_size=15,
        range=[0, count_max],
        row=row,
        col=col,
    )
    fig.update_xaxes(
        title_text="PROMIS-Physical Function 6b T-Score",
        title_font_size=15,
        range=[29, 49],
        row=row,
        col=col,
    )


# Top row is a single subplot spanning both columns; bottom row has two
fig5 = make_subplots(
    rows=2,
    cols=2,
    specs=[[{"colspan": 2}, None], [{}, {}]],
    subplot_titles=(
        "PROMIS-Physical Function 6b T-Score Distribution at Two Time Points",
        "Hispanic or Latino",
        "Not Hispanic or Latino",
    ),
)

_add_week_histograms(fig5, demo_combine_function, row=1, col=1)
_add_week_histograms(fig5, demo_combine_function, row=2, col=1,
                     ethnicity="Hispanic or Latino")
_add_week_histograms(fig5, demo_combine_function, row=2, col=2,
                     ethnicity="Not Hispanic or Latino")

# Overlay the two time points in each subplot instead of stacking them
fig5.update_layout(barmode='overlay', width=800, height=500, legend_title_text='Time Point')
fig5.update_layout(margin=dict(l=20, r=20, t=50, b=20, pad=2))

_label_axes(fig5, row=1, col=1, count_max=40)
_label_axes(fig5, row=2, col=1, count_max=15)
_label_axes(fig5, row=2, col=2, count_max=15)

fig5.show()