import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rc

rc('text', usetex=True)
rc('text.latex', preamble=r'\usepackage{cmbright}')
rc('font', **{'family': 'sans-serif', 'sans-serif': ['Helvetica']})

%matplotlib inline

# This enables SVG graphics inline. 
%config InlineBackend.figure_formats = {'png', 'retina'}

# JB's favorite Seaborn settings for notebooks
rc = {'lines.linewidth': 2, 
      'axes.labelsize': 18, 
      'axes.titlesize': 18, 
      'axes.facecolor': 'DFDFE5'}
sns.set_context('notebook', rc=rc)
sns.set_style("dark")

mpl.rcParams['xtick.labelsize'] = 16 
mpl.rcParams['ytick.labelsize'] = 16 
mpl.rcParams['legend.fontsize'] = 14


n = 2


def safety(p_covid, N=n):
    """Probability that nobody has COVID in a room full of N people.

    Each person is treated as independently infected with probability
    ``p_covid``, so the result is ``(1 - p_covid) ** N``, evaluated as
    ``exp(N * log(1 - p_covid))`` and capped at 1.

    Parameters
    ----------
    p_covid : float, numpy.ndarray or pandas.Series
        Per-person probability of carrying COVID. Scalars and array-likes
        are both accepted; array-likes are processed element-wise.
    N : int, optional
        Number of people in the room. Defaults to the value the module-level
        ``n`` had when this function was defined.

    Returns
    -------
    Same kind of object as ``p_covid`` (NumPy scalar for scalar input).
    NaN inputs stay NaN; inputs above 1 produce NaN because the logarithm
    of a negative number is undefined.
    """
    p = np.exp(N * np.log(1 - p_covid))

    # Inputs below 0 yield values above 1, which are not valid probabilities,
    # so cap the result at 1. np.minimum handles scalars, arrays and Series
    # alike (the previous type check raised TypeError on NumPy scalars).
    return np.minimum(p, 1)


# load the state population spreadsheet into a dataframe:
pop = pd.read_excel('../data/nst-est2019-01.xlsx', comment='#', header=1)

# fetch NYT data (CSV columns 0, 1, 3 and 4 only).
# `squeeze=True` was dropped: it only affects single-column results, and the
# keyword no longer exists in pandas >= 2.0.
url = 'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv'
df = pd.read_csv(url, usecols=[0, 1, 3, 4], parse_dates=['date'])

# relabel the first population column as 'state'; building the label list
# directly keeps the remaining labels (e.g. the integer 2019) untouched.
pop.columns = ['state', *pop.columns[1:]]
# remove leading/trailing '.' characters from the state names so they can be
# matched against the NYT `state` column:
pop['state'] = pop['state'].str.strip('.')

# merge dfs (inner join on the state name):
df = df.merge(pop, on='state')
# calculate per population numbers, using the population column labelled 2019:
df['normedPopCases'] = df['cases'] / df[2019]

# drop days where normalized cases aren't high enough:
df = df[df['normedPopCases'] > 10 ** -6]

# generate smoothed prob of encountering a COVID + individual on any one day:
# a 7-row Gaussian-weighted rolling mean (std=2) of the normalized cases per
# state, differenced row to row, then summed over a 10-row window.
# NOTE(review): diff() and rolling(10) run over the concatenated per-state
# result rather than per group; values presumably never mix states only
# because each state's leading rows are NaN from the 7-row warm-up -- confirm.
smoothed = df.groupby('state')['normedPopCases'].rolling(
    window=7, win_type='gaussian', center=False).mean(std=2)
probs = (smoothed.diff()
                 .rolling(10).sum()
                 .reset_index()
                 .rename(columns={'normedPopCases': 'pCOVID'}))

# get our safety function:
# `safety` works element-wise, so it is applied to the whole column at once.
# Going through groupby(...).apply(...) returns a result indexed by group key
# in recent pandas, which does not align with `probs` on assignment.
probs['safety'] = safety(probs['pCOVID'])
# get cumulative safety to date:
probs['cumprob'] = probs.groupby('state')['safety'].cumprod()

# find best and worst states:
# take the lowest cumulative safety reached by each state, then pick the
# states holding the smallest and the largest of those minima. Looking the
# state up by label avoids matching rows by floating-point equality.
lowest_per_state = probs.groupby('state')['cumprob'].min()
worst = lowest_per_state.idxmin()
best = lowest_per_state.idxmax()

# get cases assuming US is homogeneous:
# population figure used to normalize the summed state counts
us_population = pop.loc[pop['state'] == 'United States', 'Census'].values[0]

# sum only the `cases` column per date instead of aggregating every column
# of the merged frame (which also contains the string `state` column).
cases_by_date = df.groupby('date')['cases'].sum()

# same smoothing as for the per-state series: 7-row Gaussian rolling mean,
# row-to-row difference, 10-row rolling sum, then divide by the population.
cases = (cases_by_date.rolling(window=7, win_type='gaussian', center=False)
                      .mean(std=2)
                      .diff()
                      .rolling(10).sum() / us_population)

total = safety(cases.values, n)


# Daily safety: the national curve on top, every state as a faint line, and
# Massachusetts plus the best and worst states drawn as thick labelled lines.
fig, ax = plt.subplots(figsize=(12, 6))
highlighted = ('Massachusetts', worst, best)

ax.plot(total, lw=4, color='black', label='Uniform US probability', zorder=np.inf)
for state_name, state_rows in probs.groupby('state'):
    daily = state_rows.safety.values
    if state_name in highlighted:
        ax.plot(daily, alpha=1, label=state_name, lw=5)
    else:
        ax.plot(daily, alpha=0.1, color='black', zorder=0)

ax.legend()
ax.set_xlabel('Days since start of pandemic (per state)')
# assign the return value so the notebook does not echo it
_ = ax.set_ylabel('Daily Safety')


# Cumulative safety on a log axis: running product of the national daily
# safety (NaN entries removed first) against each state's `cumprob` column.
fig, ax = plt.subplots(figsize=(12, 6))
highlighted = ('Massachusetts', worst, best)

us_cumulative = np.cumprod(total[~np.isnan(total)])
ax.plot(us_cumulative, lw=4, color='black', label='Uniform US probability', zorder=np.inf)
for state_name, state_rows in probs.groupby('state'):
    running = state_rows.cumprob.values
    if state_name in highlighted:
        ax.plot(running, alpha=1, label=state_name, lw=5)
    else:
        ax.plot(running, alpha=0.1, color='black', zorder=0)

# leave a little room below the lowest value and above 1
ax.set_ylim(probs.cumprob.min() * .95, 1.05)
ax.set_yscale('log')
ax.legend()
ax.set_xlabel('Days since start of pandemic (per state)')
# assign the return value so the notebook does not echo it
_ = ax.set_ylabel('Safety to Date')


# Lowest cumulative safety reached by each state, sorted ascending so the
# strip plot lists the states from least to most safe.
todate = (probs.groupby('state')['cumprob'].min()
               .reset_index()
               .rename(columns={'cumprob': 'SafetyToDate'})
               .sort_values('SafetyToDate'))

fig, ax = plt.subplots(figsize=(8, 12.5))
sns.stripplot(x='SafetyToDate', y='state', data=todate, ax=ax)
ax.set_xlim(todate['SafetyToDate'].min() * .8, 1)
ax.set_xscale('log')
# The plotted value is the product of the daily "nobody has COVID" safety
# values, i.e. the probability of having had NO contact so far; the title
# now says so.
_ = ax.set_title('Cumulative Prob of no COVID contact\nmeeting {0} people / day to date'.format(n))

How heterogeneously is the pandemic hitting the US right now?

Set up our safety function as in the previous post:

Load the NYT data and run calculations:

Plot daily safety levels:

Plot cumulative safety levels to date:

Plot cumulative safety-to-date levels per state: