import numpy as np
import pandas as pd
from IPython.display import Markdown as md
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objects as go
import plotly.express as px
import cufflinks as cf
# Initialise plotly's notebook mode (connected=True) before any figure is shown.
init_notebook_mode(connected=True)
# Put cufflinks into offline mode as well.
cf.go_offline()
import warnings
# Silence all warnings for the rest of the session.  Both calls below install
# a blanket 'ignore' filter.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
# Load the per-patient records.
patient_df = pd.read_csv('patient.csv')
patient_df

# Histogram of the recorded infection reasons.
fig = px.histogram(
    patient_df,
    x="infection_reason",
    title='Infection Reason Distribution',
)
fig.show()

# Pie chart of patient states, leaving out rows whose state is missing.
patient_state_no_null_df = patient_df.loc[:, 'state'].dropna()
fig = px.pie(
    patient_state_no_null_df,
    names='state',
    title='Patient State Distribution',
)
fig.show()
# Age is taken as 2020 minus the birth year and inserted as the fourth column.
age_list = patient_df['birth_year'].rsub(2020)
patient_df.insert(loc=3, column="Age", value=age_list)
patient_df
# Keep only rows where both the age and the state are known, then one-hot
# encode the state so that each outcome becomes its own indicator column.
age_state_df = patient_df.filter(['Age', 'state']).dropna()
age_state_df = pd.get_dummies(age_state_df)
age_state_df

# Count patients per age for every outcome in one grouped pass instead of
# re-filtering the whole frame three times per age.
#  - reindex() guarantees all three indicator columns exist (filled with 0)
#    even when an outcome never occurs in the data, avoiding a KeyError.
#  - astype(int) makes the sum a count whether get_dummies produced bool or
#    uint8 indicators.
#  - sort=False keeps the ages in order of first appearance, like unique().
state_columns = ['state_deceased', 'state_isolated', 'state_released']
state_counts_by_age = (
    age_state_df
    .reindex(columns=['Age'] + state_columns, fill_value=0)
    .astype({column: int for column in state_columns})
    .groupby('Age', sort=False)[state_columns]
    .sum()
)
age_list = state_counts_by_age.index.to_numpy()
state_deceased_list = state_counts_by_age['state_deceased'].tolist()
state_isolated_list = state_counts_by_age['state_isolated'].tolist()
state_released_list = state_counts_by_age['state_released'].tolist()

fig = go.Figure(data=[
    go.Bar(name='Deceased', x=age_list, y=state_deceased_list, marker_color='red'),
    go.Bar(name='Isolated', x=age_list, y=state_isolated_list, marker_color='orange'),
    go.Bar(name='Released', x=age_list, y=state_released_list, marker_color='green'),
])
# Stack the three outcomes on top of each other for every age.
fig.update_layout(barmode='stack', title='Age-State Distribution', xaxis_title='Patient Age', yaxis_title='Count')
fig.show()
route_df = pd.read_csv('route_updated.csv')
route_df

# Mapbox API token: the first line of token.txt.  The context manager closes
# the file handle, and strip() removes the trailing newline that readline()
# would otherwise leave inside the token.
with open("token.txt", "r") as token_file:
    token = token_file.readline().strip()

# Leave out rows whose visit is 'hospital_isolated', then order by date.
route_df_filtered = route_df[route_df['visit'] != 'hospital_isolated']
route_df_filtered = route_df_filtered.sort_values(by=['date'])
route_df_filtered.head(10)
route_df_filtered['visit'].unique()

# Fold near-duplicate place labels into one label each.  As with the four
# separate replace() calls this replaces, the substitution applies to every
# column of the frame; no replacement value is itself a key, so a single
# mapping gives the same result.
route_df_condensed = route_df_filtered.replace({
    'clinic': 'hospital',
    'cafe': 'restaurant',
    'company': 'office',
    'store': 'market',
})
route_df_condensed['visit'].unique()
# Visits per (condensed) place type.
fig1 = px.histogram(
    route_df_condensed,
    x="visit",
    title='Public places where the virus was contracted',
    nbins=60,
)
fig1.update_layout(bargap=0.1)
fig1.show()

# Route entries per province.
fig2 = px.histogram(
    route_df_filtered,
    x="province",
    title='Count in S. Korean provinces',
    nbins=60,
)
fig2.update_layout(bargap=0.1)
fig2.show()
# Sorted array of the distinct, non-missing dates in the route data.
unique_date_list = np.sort(np.array(route_df_filtered['date'].dropna().unique()))

# Take the per-point columns from the SAME rows.  Dropping missing values
# column by column would shift one array against the others as soon as a
# single value is missing, so rows are dropped jointly instead.
route_points_df = route_df_filtered.dropna(
    subset=['latitude', 'longitude', 'city', 'date', 'visit']
)
lat = np.array(route_points_df['latitude'])
lon = np.array(route_points_df['longitude'])
city = np.array(route_points_df['city'])
date = np.array(route_points_df['date'])
visit = np.array(route_points_df['visit'])

unique_date_list_len = unique_date_list.size
date_len = date.size
def remove_special_chars(text):
    '''
    Replace every underscore in the text with a space
    Inputs:
        - text: string to clean up
    Returns:
        - the string with '_' replaced by ' '
    '''
    return ' '.join(text.split('_'))
# Build one scattermapbox trace per route point.  Trace i carries every point
# from the first up to and including point i.  The column arrays are built
# once and sliced per trace, instead of being regrown with np.append on every
# iteration (which recopied each array every time).
point_count = lat.size

# np.append([], ...) yields a flat copy with the same dtype promotion the
# element-by-element appends produced.
lat_arr = np.append([], lat)
lon_arr = np.append([], lon[:point_count])
city_arr = np.append([], city[:point_count])
date_arr = np.append([], date[:point_count])
visit_arr = np.append([], [remove_special_chars(place) for place in visit[:point_count]])

# Hover label for each point.
hover_data_arr = np.append([], [
    f'Date: {date_arr[i]}<br>Lat: {lat_arr[i]} <br>Lon: {lon_arr[i]}<br>Visit: {visit_arr[i]}'
    for i in range(point_count)
])

data_slider = tuple(
    dict(
        lat=lat_arr[:end],
        lon=lon_arr[:end],
        marker={'size': 6, 'color': 'crimson'},
        mode='markers',
        hovertext=hover_data_arr[:end],
        hoverinfo="text",
        type='scattermapbox',
    )
    for end in range(1, point_count + 1)
)
fig3 = go.Figure(data_slider)
route_df_date_group = route_df_filtered.groupby('date')
route_df_date_group.first()
# Running total of route entries up to and including each date.
date_case_cumsum = np.array(route_df_date_group['date'].count().cumsum())
date_case_cumsum

# Slider steps: the 'Start' step hides every trace; the step for each date
# shows the first date_case_cumsum[i] traces and hides the rest.
steps = []
step_0 = dict(method='restyle',
              args=['visible', [False] * date_len],
              label='Start',
              )
steps.append(step_0)
for date_label, cumulative_count in zip(unique_date_list, date_case_cumsum):
    # Plain Python lists are used for the visibility mask.  Wrapping
    # ['visible', <bool array>] in np.array() builds a ragged array, which
    # recent NumPy releases reject with a ValueError.
    shown = min(int(cumulative_count), date_len)
    step = dict(method='restyle',
                args=['visible', [True] * shown + [False] * (date_len - shown)],
                label='{}'.format(date_label))  # label to be displayed for each date
    steps.append(step)

## Creating the 'sliders' object from the 'steps'
sliders = [dict(active=0, pad={"t": 1}, steps=steps)]
fig3.update_layout(
    autosize=True,
    mapbox_style="dark",
    showlegend=False,
    height=600,
    mapbox=dict(
        accesstoken=token,
        bearing=0,
        center=dict(
            lat=36.735362,
            lon=127.828125
        ),
        pitch=0,
        zoom=5.5
    ),
    sliders=sliders,
)
patient_df_date_group = patient_df.groupby('confirmed_date')
patient_df.head(10)
# Number of confirmed cases per date (the group keys come back sorted).
daily_confirmed_counts = patient_df_date_group['confirmed_date'].count()
daily_confirmed_counts
confirmed_case_cumsum = list(daily_confirmed_counts.cumsum())
# Take the dates from the same grouped series as the running total so the two
# lists always line up.  unique() returns dates in file order, which only
# matches the sorted group order when the CSV happens to be sorted by date.
date_list = list(daily_confirmed_counts.index)
fig = go.Figure(data=go.Scatter(x=date_list, y=confirmed_case_cumsum, mode='markers'))
fig.update_layout(xaxis_title='Date', yaxis_title='COVID-19 Patients', title="Cumulative Sum of Confirmed Cases in South Korea")
fig.show()
from datetime import datetime
from datetime import timedelta
# Kept so that the name stays defined; get_str_from_date no longer appends to
# it, because a shared accumulator made every call return the results of all
# earlier calls as well.
converted_date_list = []


def get_str_from_date(str_date, add_day=False):
    '''
    Normalises 'YYYY-MM-DD' date strings, optionally shifting them forward by one day
    Inputs:
        - str_date: a single date string, or an iterable of date strings
        - add_day: if True add a day to the date(s)
    Returns:
        - a 'YYYY-MM-DD' string when given a single string, otherwise a new
          list of 'YYYY-MM-DD' strings (one per input date)
    Raises:
        - ValueError if a date does not match the '%Y-%m-%d' format
    '''
    shift = timedelta(days=1) if add_day else timedelta(0)

    def _convert(single_date):
        # Parse, shift, and format back to the same textual layout.
        return (datetime.strptime(single_date, '%Y-%m-%d') + shift).strftime('%Y-%m-%d')

    # A string is itself iterable (and longer than one character), so it has
    # to be recognised by type rather than by length.
    if isinstance(str_date, str):
        return _convert(str_date)
    return [_convert(single_date) for single_date in str_date]
from scipy.interpolate import splrep, splev
def spline_interp(x, y, x_new):
    '''
    Fits a spline through the known points and evaluates it at new positions
    Inputs:
        x: positions of the known samples (dates)
        y: values at those positions (cumulative sum of confirmed cases)
        x_new: positions at which the fitted spline is evaluated
    Returns:
        The spline values at x_new
    '''
    spline_representation = splrep(x, y)
    interpolated = splev(x_new, spline_representation)
    return interpolated
# Creating a continuous daily date array from the first to the last known date
df_interp = pd.DataFrame()
day_after_last = get_str_from_date(date_list[-1], add_day=True)
df_interp['dates'] = np.arange(date_list[0], day_after_last, dtype='datetime64[D]')

# Finding the spline-interpolated cumulative sum for every day in that range
import matplotlib.dates as mdates
datetime_date_list = [datetime.strptime(known_date, '%Y-%m-%d') for known_date in date_list]
known_positions = mdates.date2num(datetime_date_list)
all_positions = mdates.date2num(df_interp['dates'])
df_interp['cum_sum'] = spline_interp(known_positions, confirmed_case_cumsum, all_positions)

# Interpolated curve as a line, actual counts as markers.
fig2 = go.Figure()
interpolated_trace = go.Scatter(x=df_interp['dates'], y=df_interp['cum_sum'], mode='lines', name='spline interpolation')
actual_trace = go.Scatter(x=date_list, y=confirmed_case_cumsum, mode='markers', name='actual')
fig2.add_trace(interpolated_trace)
fig2.add_trace(actual_trace)
fig2.update_layout(xaxis_title='Date', yaxis_title='COVID-19 Patients', title="Spline Interpolation for Missing Dates")
fig2.show()
# Number of days to forecast beyond the last known date.
forecast_days = 6
md(f"#### Now let's find the forecast for the next {forecast_days} days and plot it against actual data")
from statsmodels.tsa.holtwinters import ExponentialSmoothing
def get_exp_smoothing_forecast(end_date, forecast_days):
    '''
    Fits an ExponentialSmoothing model (additive, damped trend) on the interpolated
    cumulative counts and returns the history extended by the forecast
    Inputs:
        - end_date: accepted for the caller's convenience; the body does not read it
        - forecast_days: number of days to forecast (int)
    Returns:
        - forecasted_df: the interpolated series followed by the forecasted values
        - model: the fitted ExponentialSmoothing model
    Reads the module-level df_interp and date_list.
    '''
    # Daily index spanning the first to the last known date.
    daily_index = pd.date_range(start=date_list[0], end=date_list[-1], freq='D')
    series_interp = pd.Series(data=df_interp['cum_sum'].values, index=daily_index)

    smoother = ExponentialSmoothing(series_interp, trend='add', damped=True)
    model = smoother.fit(damping_slope=0.91, optimized=True)

    future_values = model.forecast(forecast_days)
    forecasted_df = pd.concat([series_interp, future_values])
    return forecasted_df, model
# Fit the model and obtain the history followed by the forecast.
forecasted_df, fit1 = get_exp_smoothing_forecast(
    end_date=date_list[-1],
    forecast_days=forecast_days,
)

# Actual counts as markers, history plus forecast as a line.
fig3 = go.Figure()
actual_trace = go.Scatter(x=date_list, y=confirmed_case_cumsum, mode='markers', name='actual')
forecast_trace = go.Scatter(
    x=forecasted_df.index.tolist(),
    y=forecasted_df.values.tolist(),
    mode='lines',
    name='forecast',
)
fig3.add_trace(actual_trace)
fig3.add_trace(forecast_trace)
fig3.update_layout(xaxis_title='Date', yaxis_title='COVID-19 Patients', title="Forecasted number of patients vs actual patients in South Korea")
fig3.show()

# The forecasted values on their own.
exp_smoothing_forecast = fit1.forecast(forecast_days)
exp_smoothing_forecast
new_patient_df = pd.read_csv('patient_updated_mar_10_2020.csv')
new_patient_df_date_group = new_patient_df.groupby('confirmed_date')
# Running total of confirmed cases per date (the group keys come back sorted).
cumsum_series = new_patient_df_date_group['confirmed_date'].count().cumsum()
updated_confirmed_case_cumsum = list(cumsum_series)
# Take the dates from the same grouped series as the running total so the two
# lists always line up.  unique() returns dates in file order, which only
# matches the sorted group order when the CSV happens to be sorted by date.
new_date_list = list(cumsum_series.index)

fig4 = go.Figure()
fig4.add_trace(go.Scatter(x=new_date_list, y=updated_confirmed_case_cumsum, mode='markers', name='updated actual'))
fig4.add_trace(go.Scatter(x=forecasted_df.index.tolist(), y=forecasted_df.values.tolist(), mode='lines', name='forecast'))
fig4.update_layout(xaxis_title='Date', yaxis_title='COVID-19 Patients', title="Forecasted number of patients vs updated actual patients in South Korea")
fig4.show()

display(exp_smoothing_forecast)
# Actual updated count for the last forecast_days dates
display(cumsum_series.iloc[-forecast_days:])
from sklearn.metrics import mean_squared_error
# Compare the last forecast_days actual counts with the last forecast_days
# values of the forecasted series.
y_true = np.array(updated_confirmed_case_cumsum[-forecast_days:])
y_pred = np.array(forecasted_df.values[-forecast_days:])
# RMSE: the root is taken explicitly instead of passing squared=False, a
# keyword that newer scikit-learn releases deprecated and then removed.
forecast_rmse = np.sqrt(mean_squared_error(y_true, y_pred))
md("#### My forecasted results have a mean deviation of approx. {} counts".format(round(forecast_rmse)))
def mean_absolute_percentage_error(y_true, y_pred):
    '''
    Mean absolute percentage error between actual and predicted values
    Inputs:
        - y_true: actual values (array-like, e.g. list or numpy array)
        - y_pred: predicted values, same length as y_true
    Returns:
        - mean of |(y_true - y_pred) / y_true| expressed in percent
    Raises:
        - ValueError if the inputs are empty or differ in shape
    '''
    # Convert up front so plain lists work and the arithmetic is element-wise.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # Fail loudly instead of printing and returning None, which only moved the
    # failure to the caller's next arithmetic operation.
    if y_true.size == 0 or y_true.shape != y_pred.shape:
        raise ValueError("y_true and y_pred must be non-empty arrays of the same length")
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
# MAPE of the forecast against the updated actual counts.
forecast_mape = mean_absolute_percentage_error(y_true, y_pred)
md(f"#### My forecasted results have a MAPE of {round(forecast_mape, 2)} %")

# Longer-horizon forecast.
new_forecast_days = 54  # 54 days since last dataset
updated_forecasted_df, model = get_exp_smoothing_forecast(
    end_date='2020-04-27',
    forecast_days=new_forecast_days,
)
updated_forecasted_df

# Relative difference between the two hard-coded counts, in percent.
reference_count = 10738
compared_count = 10166
percentage_diff = (reference_count - compared_count) / reference_count * 100
md(f"### My forecasted result for April 27th is off by {round(percentage_diff, 2)} %")