Python code behind the COVID-19 page

14.04.2020 Filed in: Python |Code

I'm not going to spend loads of time explaining this stuff; I'm bored of it by now. The FFmpeg command I've used to generate the video is also here.

#! /bin/sh

# Build vid.mp4 from the numbered frames output000.png, output001.png, ...
# (the %03d pattern) read at 4 frames per second, encoded with libx264 at
# CRF 25 in the yuv420p pixel format.
# NOTE(review): only stdout is redirected to /dev/null here; if the intent was
# to silence ffmpeg completely, stderr is not covered -- confirm.
ffmpeg -framerate 4 -i /home/pi/COVID19/IMAGEFIGS/output%03d.png -vcodec libx264 -crf 25 -pix_fmt yuv420p /home/pi/COVID19/VIDEOS/vid.mp4 > /dev/null

# covid19-1
#
# Plot log10 new cases against log10 total cases
#
# Data from Johns Hopkins University Center for Systems Science and Engineering (JHU CSSE)
# https://github.com/CSSEGISandData/COVID-19
# They have a great live map at: https://coronavirus.jhu.edu/map.html

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.style
import pandas as pd
import pysftp
import datetime
import seaborn as sns
import argparse

import covid_jwp as jwp


VERSION = "0.2"
# Today's date as a string; stamped into the plot title.
td = str(datetime.date.today())

# Command-line flags.
parser = argparse.ArgumentParser()
# action="version" prints the message and exits, so asking for the version
# no longer goes on to download data and draw the plot.
parser.add_argument("-V", "--version", help="show program version",
                    action="version",
                    version="This is myprogram version " + VERSION)
parser.add_argument("-s", "--show", help="show the plot", action="store_true")
args = parser.parse_args()

# SHOW: also display the figure in a window after it has been saved.
SHOW = args.show


#####################
# Start of main code#
#####################

# Locations to plot, as (country, province) pairs. A blank province is what
# the helper module treats as "the country-level row".
countries = (
    ('United Kingdom', ' '),
    ('Spain', ' '),
    ('Italy', ' '),
    ('France', ' '),
    ('United States', ' '),
    ('China', 'Hubei'),
)

# One row per location and date, with new/total infection counts.
df = jwp.get_country_df(countries)

# Styling: seaborn defaults first, then the two matplotlib style sheets
# (applied in this order), then one palette colour per location.
sns.set()
matplotlib.style.use('seaborn-poster')
matplotlib.style.use('ggplot')
sns.set_palette("hls", len(countries))

# New cases against cumulative cases, both axes logarithmic.
ax = sns.scatterplot(data=df,
                     x='Total infections',
                     y='New infections',
                     hue='Location',
                     s=30)
ax.set_xscale("log")
ax.set_yscale("log")

ax.set_title('Daily COVID-19 Cases Vs. Cumulative COVID19 cases \n($Log_{10 }$ scales). Updated: '+td)
# Start both axes at 1; leave the upper limits automatic.
ax.set_ylim(1, None)
ax.set_xlim(1, None)

fig = ax.get_figure()
fig.savefig("./PLOTS/covid.png")

if SHOW:
    print('showing')
    # Display it
    plt.show()

# covid19-gdp
#
# Plot log10 deaths Vs GDP per capita
#
# Data from Johns Hopkins University Center for Systems Science and Engineering (JHU CSSE)
# https://github.com/CSSEGISandData/COVID-19
# They have a great live map at: https://coronavirus.jhu.edu/map.html
#
# Population and GDP info from World Bank

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.style
import pandas as pd
import pysftp
import datetime
import seaborn as sns
import argparse

import covid_jwp as jwp

# Some 'constants'
VERSION = "0.1"
# Country statistics that can be compared against the death index:
# key = column name in the country-data frame, value = axis label.
VALIDTYPES = {'GDP': 'GDP per capita (US$)',
              'Elderly': '% of population over 65',
              'GINI': 'Gini index',
              'Smoke': '% of adults who smoke',
              'Urban': '% of population in an urban environment'}
# Today's date as a string; stamped into the plot title.
td = str(datetime.date.today())

# Read in any 'flags' that have been passed on the command line
parser = argparse.ArgumentParser()
# action="version" prints the message and exits instead of carrying on to
# download data and draw the plot.
parser.add_argument("-V", "--version", help="show program version",
                    action="version",
                    version="This is myprogram version " + VERSION)
parser.add_argument("-s", "--show", help="display it", action="store_true")
parser.add_argument("-t", "--text", help="add text", action="store_true")
parser.add_argument("-c", "--centroid", help="show regional centroid", action="store_true")
parser.add_argument("-l", "--log", help="log10 on both axes", action="store_true")
parser.add_argument("-d", "--data", help="type of country data to compare", action='store', dest='data_type')
args = parser.parse_args()

SHOW = args.show          # display the figure after saving it
TEXT = args.text          # label each point with its country
CENTROID = args.centroid  # overlay the per-region mean points

# CORR: which statistic goes on the x axis. Anything not in VALIDTYPES
# falls back to GDP; say so rather than ignoring the request silently.
if args.data_type in VALIDTYPES:
    CORR = args.data_type
else:
    if args.data_type is not None:
        print("Unknown data type '" + str(args.data_type) + "', using GDP. "
              "Valid types: " + ", ".join(VALIDTYPES))
    CORR = 'GDP'

# GDP is always shown on a log x axis, whatever -l says.
LOGX = args.log or CORR == 'GDP'
#####################
# Start of main code#
#####################

countries = jwp.COUNTRIES

# Time series of infections and deaths, plus the per-country statistics
df = jwp.get_country_df(countries)
df_dat = jwp.get_country_data_df(countries)

# Join them on 'Country'
deaths_df = df_dat.join(df.set_index('Country'), on='Country')

# Smoking and Gini have an incomplete country dataset. So, when one of them
# is the chosen statistic, drop the countries where it is not numeric.
if CORR in ('Smoke', 'GINI'):
    deaths_df[CORR] = pd.to_numeric(deaths_df[CORR], errors='coerce')
    deaths_df = deaths_df.dropna(subset=[CORR])

# Calculate 'Death_index': total deaths per 1000000000 (10^9) of population
deaths_df['Death_index'] = 1000000000*deaths_df['Total deaths']/deaths_df['Population']
deaths_df['Death_index'] = deaths_df['Death_index'].replace(0, 0.01)  # To avoid log10 0 problem

# We only need the current maximum DI. 'Country' stays as the index on
# purpose: the text labels further down are taken from max_DI_df.index.
max_DI_df = deaths_df.groupby(['Country']).max()

# Create information that is the 'centroid' of points for each region
if CENTROID:
    max_DI_df.sort_values('Region', inplace=True)
    # numeric_only: the frame also carries text columns, which cannot be averaged
    region_mean = max_DI_df.groupby(['Region']).mean(numeric_only=True)
    region_mean = region_mean.reset_index()

# Correlation stuff: Pearson's r between the chosen statistic (log10 of it
# when LOGX is set) and log10 of the death index.
dcorr_ = np.array(max_DI_df[CORR].values)
dcorr = np.log10(dcorr_) if LOGX else dcorr_
dgi_ = np.array(max_DI_df['Death_index'].values)
dgi = np.log10(dgi_)

pearsonR = np.corrcoef(dcorr, dgi)[1, 0]
pearsons = f'{pearsonR:0.2f}'  # For formatted printing


### Now set up the display stuff for plot 2 using seaborn
sns.set()
# Style sheets are applied in this order, the later one on top of the earlier.
for style_name in ('seaborn-poster', 'ggplot'):
    matplotlib.style.use(style_name)
sns.set_palette("husl", 7)


# Plot the data: one blob per country, coloured by region and sized by
# population.
ax = sns.scatterplot(data=max_DI_df,
                     x=CORR,
                     y='Death_index',
                     hue='Region',
                     size='Population',
                     sizes=(25, 750),
                     alpha=.75,
                     edgecolor='black')

# Plot the centroids: one star per region at the regional means
if CENTROID:
    ax2 = sns.scatterplot(data=region_mean,
                          x=CORR,
                          y='Death_index',
                          hue='Region',
                          marker='*',
                          s=250,
                          edgecolor='black',
                          legend=False)

# Optionally write the country name (the frame's index) next to each blob
if TEXT:
    x, y = max_DI_df[CORR].values, max_DI_df['Death_index'].values
    lab = max_DI_df.index
    jwp.label_point(x, y, lab, plt.gca())

# Leave 50% headroom above the largest death index; the lower limit is fixed.
ylimcalc = 1.5*(max_DI_df['Death_index'].values.max())
plt.ylim(10, ylimcalc)
# The death index is deaths/population scaled by 1000000000, i.e. 10^9, so
# the label has to say 10^9.
plt.ylabel(r'(Deaths/Population)$\times 10^9$')
plt.xlabel(VALIDTYPES[CORR])

# The y axis is always logarithmic; x is logarithmic only when LOGX is set.
# The Pearson coefficient is written at the lower-left corner of the axes.
if LOGX:
    print('logx')
    ax.set(xscale="log", yscale="log")
    ax.text(100, 10, "Pearson's r = "+pearsons)
    plt.xlim(100, None)
else:
    print('not logx')
    ax.set(yscale="log")
    ax.text(1, 10, "Pearson's r = "+pearsons)
    plt.xlim(1, None)


plt.title('Death Index Vs. '+VALIDTYPES[CORR]+'\nUpdated: '+td+' size of blob relates to population')
plt.legend(fontsize='small', loc='upper left')

# One output file per statistic
fig = ax.get_figure()
fig.savefig("./PLOTS/covid-d-"+CORR+".png")

if SHOW:
    print('showing')
    # Display it
    plt.show()

# Module containing COVID-19 helper functions
# John Palmer
# 2020/04/14

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.style
import pandas as pd
import pysftp
import datetime
import seaborn as sns
import os.path
from pathlib import Path

VERSION="0.2"


# Default set of locations, as (country, province) pairs using the Johns
# Hopkins country names. The province field is interpreted by get_df_data:
#   ' '  -> the row with no province (country-level figures)
#   '*'  -> the sum of all rows that do have a province
#   name -> that one province
# Each location appears once; a repeated entry would be fetched and appended
# twice.
COUNTRIES = (('United Kingdom', ' '),
             ('France', ' '),
             ('Italy', ' '),
             ('Sweden', ' '),
             ('Bangladesh', ' '),
             ('Bolivia', ' '),
             ('Burkina Faso', ' '),
             ('China', '*'),
             ('Cameroon', ' '),
             ('Canada', '*'),
             ('Colombia', ' '),
             ('Ecuador', ' '),
             ('Egypt', ' '),
             ('Germany', ' '),
             ('Ghana', ' '),
             ('US', ' '),
             ('Brazil', ' '),
             ('Australia', '*'),
             ('Denmark', ' '),
             ('Iran', ' '),
             ('Iraq', ' '),
             ('India', ' '),
             ('Indonesia', ' '),
             ('Japan', ' '),
             ('Korea, South', ' '),
             ('Malaysia', ' '),
             ('Mexico', ' '),
             ('Morocco', ' '),
             ('New Zealand', ' '),
             ('Niger', ' '),
             ('Nigeria', ' '),
             ('Pakistan', ' '),
             ('Peru', ' '),
             ('Philippines', ' '),
             ('Panama', ' '),
             ('South Africa', ' '),
             ('Spain', ' '),
             ('Sri Lanka', ' '),
             ('Switzerland', ' '),
             ('Thailand', ' '),
             ('Tunisia', ' '),
             ('Turkey', ' '))


# Writes a text label next to each (x, y) point on the given axes
def label_point(x, y, val, ax):
    """Annotate points on *ax*.

    x, y -- coordinates of the points (equal-length sequences)
    val  -- the label to write at each point
    ax   -- object with a ``text(x, y, s)`` method, e.g. matplotlib axes

    Each label is placed at the point's y value and at its x value plus a
    fixed offset of 0.02 in data units.
    """
    x_offset = 0.02  # Needs to be a proper scaling factor
    points = pd.DataFrame({'x': x, 'y': y, 'val': val})
    for _, row in points.iterrows():
        ax.text(float(row['x']) + x_offset, row['y'], row['val'])


# For a given country/province with optional start date, return a set of lists:
#   list of date column names (Date)
#   list containing sums of values for those columns
#   list of increments between each column sum
def get_df_data(df, country='United Kingdom', province= ' ', start_date='1/22/20'):
    """Extract one location's time series from a JHU-style wide table.

    df         -- DataFrame with 'Country/Region' and 'Province/State'
                  columns followed by one column per date
    country    -- value to match in 'Country/Region'
    province   -- blank: only the row whose province is null;
                  '*': every row that has a province (summed);
                  anything else: the row for exactly that province
    start_date -- label of the first date column to include

    Returns [dates, totals, increments] as numpy arrays. ``totals`` is the
    per-date sum over the selected rows and ``increments`` is the change
    from the previous date (NaN for the first date).
    """
    country = str(country)
    in_country = df['Country/Region'] == country
    provinces = df['Province/State']

    # First find all appropriate rows
    if province.strip() == '':
        rows = df.loc[in_country & provinces.isnull()]
    elif province == '*':
        rows = df.loc[in_country & provinces.notnull()]
    else:
        rows = df.loc[in_country & (provinces == province)]

    # Keep the date columns from start_date onwards and add up the selected
    # rows once; both the totals and the increments come from this sum.
    column_sums = rows.loc[:, start_date:].sum()

    d = column_sums.index.values      # list of dates
    t = column_sums.values            # list of totals from all appropriate rows
    diff = column_sums.diff().values  # list of the increment column to column
    return [d, t, diff]




# Read a CSV from a local cache file if it exists, otherwise download it
# from the URL and save it to the cache file for next time.
def _read_cached_csv(fname, url):
    if os.path.exists(fname):
        return pd.read_csv(fname)
    data = pd.read_csv(url)
    data.to_csv(fname, index=False)
    return data


# Read in and return a DataFrame containing concatenated information from
# Johns Hopkins University data sets. Columns will be:
# Date, New infections, Total infections, Country, Province, Location,
# New deaths, Total deaths
# Parameters are
# countries: list of (country, province) pairs to get data for
#
def get_country_df(countries = COUNTRIES):
    """Build one long DataFrame of infections and deaths per location and date.

    The confirmed-cases and deaths tables are downloaded from the JHU CSSE
    GitHub repository at most once per day: each is cached under ./DATA with
    today's date in the file name and re-read from there on later calls.

    For every (country, province) pair the rows come from get_df_data. The
    name 'United States' is translated to 'US' first, and that translated
    name is what ends up in the 'Country' column. 'Location' is the country
    name, or 'province:country' when a specific province was requested.

    Returns an empty DataFrame with the columns above when *countries* is
    empty.
    """
    td = str(datetime.date.today())

    # Johns Hopkins University data URLs
    link_JHU = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
    link_JHU_Deaths ="https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv"

    # Don't want to keep downloading whilst developing so:
    data_infections = _read_cached_csv('./DATA/COVID_JHU_'+td+'.csv', link_JHU)
    data_deaths = _read_cached_csv('./DATA/COVID_JHU_deaths_'+td+'.csv', link_JHU_Deaths)

    column_names = ['Date',
                    'New infections',
                    'Total infections',
                    'Country',
                    'Province',
                    'Location',
                    'New deaths',
                    'Total deaths']

    # Collect one frame per location and concatenate once at the end
    # (DataFrame.append no longer exists in current pandas).
    frames = []
    for c in countries:
        country = c[0]
        province = c[1]

        # Fudge the names to take account of different databases
        if country == 'United States':
            country = 'US'
        if province.strip() == '' or province == '*':
            loc = country
        else:
            loc = province+':'+country

        dat = get_df_data(data_infections, country, province)
        df = pd.DataFrame({'Date': dat[0], 'New infections': dat[2], 'Total infections': dat[1]})
        df['Country'] = country
        df['Province'] = province
        df['Location'] = loc

        dat = get_df_data(data_deaths, country, province)
        df['New deaths'] = dat[2]
        df['Total deaths'] = dat[1]
        frames.append(df)

    if not frames:
        return pd.DataFrame(columns=column_names)
    return pd.concat(frames, ignore_index=True)





# Right-most non-null value in the first row of a (one-row) DataFrame:
# forward-fill along the row, then read the last column.
def _last_valid_in_row(row):
    return list(row.ffill(axis=1).iloc[:, -1])[0]


## Return a DataFrame of country specific statistics
def get_country_data_df(countries=COUNTRIES):
    """Look up per-country statistics from the CSV files under ./DATA.

    countries -- iterable of (country, province) pairs, using the same
                 country names as the infection data

    Returns a DataFrame with one row per entry in *countries* and the
    columns Country (the name as passed in), Population, GDP, Region,
    Elderly, Smoke, GINI and Urban. Apart from Region, each value is the
    right-most non-null cell of the matching row in the corresponding file.
    The GDP file is searched by 'Country Name'; the 'Country Code' found
    there is used to search all the other files.

    Raises IndexError when a country is not present in a file.
    """
    gdp = pd.read_csv('./DATA/GDP.csv', header=4)
    pop = pd.read_csv('./DATA/POP.csv', header=4)
    meta = pd.read_csv('./DATA/META.csv')
    age = pd.read_csv('./DATA/ELDERLY.csv', header=2)
    smoke = pd.read_csv('./DATA/SMOKING.csv', header=2)
    gini = pd.read_csv('./DATA/GINI.csv', header=2)
    urban = pd.read_csv('./DATA/URBAN.csv', header=2)

    # Some exceptions in naming: infection-data name -> name in the GDP file
    renamed = {'US': 'United States',
               'Egypt': r"Egypt, Arab Rep.",
               'Korea, South': "Korea, Rep."}

    # Set up parameter lists
    gdppc = []
    cpop = []
    creg = []
    cage = []
    csmoke = []
    cgini = []
    curban = []
    ccount = []

    for c in countries:
        province = c[1]
        country = renamed.get(c[0], c[0])

        ccount.append(c[0])

        # Get a GDP per capita value and the Country 'Country Code'
        row = gdp.loc[gdp['Country Name'] == country]
        gdppc.append(_last_valid_in_row(row))
        cc = list(row['Country Code'])[0]

        # population
        # Fix for Hubei: use a fixed figure instead of the national one
        if country == 'China' and province == 'Hubei':
            cpop.append(58500000)
        else:
            cpop.append(_last_valid_in_row(pop.loc[pop['Country Code'] == cc]))

        # Region the Country is in
        row = meta.loc[meta['Country Code'] == cc]
        creg.append(list(row['Region'])[0])

        # % of population over 65
        cage.append(_last_valid_in_row(age.loc[age['Country Code'] == cc]))

        # % of population who smoke -- the mask must come from the smoking
        # table itself, not from the age table
        csmoke.append(_last_valid_in_row(smoke.loc[smoke['Country Code'] == cc]))

        # GINI index
        cgini.append(_last_valid_in_row(gini.loc[gini['Country Code'] == cc]))

        # % Urbanised population
        curban.append(_last_valid_in_row(urban.loc[urban['Country Code'] == cc]))

    c_df = pd.DataFrame({'Country': ccount,
                         'Population': cpop,
                         'GDP': gdppc,
                         'Region': creg,
                         'Elderly': cage,
                         'Smoke': csmoke,
                         'GINI': cgini,
                         'Urban': curban})

    return c_df



# Send files from a list (or singly) via SFTP to greymamba
def sendFiles(f,host,user,passw):
    """Upload one or more local files to the remote 'web/Datasets/' folder.

    f     -- a single path (str) or an iterable of paths
    host  -- SFTP host name
    user  -- login name
    passw -- password

    Each file keeps its own name on the remote side; only the directory part
    of the local path is dropped.
    """
    # Deal with single files or list
    if isinstance(f, str):
        files = [f]
    else:
        files = list(f)

    # Host keys to verify the server against. These options only take
    # effect if they are handed to the connection.
    cnopts = pysftp.CnOpts()
    cnopts.hostkeys.load('/home/pi/.ssh/known_hosts')

    with pysftp.Connection(host=host, username=user, password=passw, cnopts=cnopts) as sftp:
        print ("Connection succesfully established ... ")

        for path in files:
            # Get actual file name without the path
            filename = Path(path).name

            localFilePath = path
            remoteFilePath = 'web/Datasets/'+filename

            sftp.put(localFilePath, remoteFilePath)

# covid19-morecorr
#
# Plot relationships between age, urbanisation and GDP
#
# Data from Johns Hopkins University Center for Systems Science and Engineering (JHU CSSE)
# https://github.com/CSSEGISandData/COVID-19
# They have a great live map at: https://coronavirus.jhu.edu/map.html
#
# Population and GDP info from World Bank

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.style
import pandas as pd
import scipy.optimize as sco
import datetime
import seaborn as sns
import argparse

import covid_jwp as jwp

# Some 'constants'
VERSION = "0.1"

# TEXT: label each point with its country name.
# NOTE(review): this defaults to True and -t can only set it to True, so the
# flag currently changes nothing -- confirm whether the default should be False.
TEXT = True
# Today's date as a string.
td = str(datetime.date.today())

# Read in any 'flags' that have been passed on the command line
parser = argparse.ArgumentParser()
# action="version" prints the message and exits instead of carrying on to
# download data and draw the plot.
parser.add_argument("-V", "--version", help="show program version",
                    action="version",
                    version="This is myprogram version " + VERSION)
parser.add_argument("-t", "--text", help="add text", action="store_true")

# Read arguments from the command line
args = parser.parse_args()

if args.text:
    TEXT = True



def model_fit(x,C0,C1,C2):
    """Model curve used for the fit: C0 - C1*exp(-C2*x).

    x may be a scalar or a numpy array; C0, C1 and C2 are the parameters
    being fitted.
    """
    decay = np.exp(-C2*x)
    return C0 - C1*decay
    
#####################
# Start of main code#
#####################

countries = jwp.COUNTRIES

# get the main dataframe giving infection and deaths
df = jwp.get_country_df(countries)
# Get the database giving country data
df_dat = jwp.get_country_data_df(countries)
print(df_dat.head(15))

# Join them on 'Country'
df = df_dat.join(df.set_index('Country'), on='Country')

# Deal with incomplete country data: anything non-numeric becomes NaN and
# the row is dropped, first for 'Urban' and then for 'Elderly'.
for column in ('Urban', 'Elderly'):
    df[column] = pd.to_numeric(df[column], errors='coerce')
    df = df.dropna(subset=[column])

# Add log10 columns
df['logGDP'] = np.log10(df['GDP'])
df['logUrban'] = np.log10(df['Urban'])
df['logElderly'] = np.log10(df['Elderly'])

# We only need the current maximums. 'Country' stays as the index on
# purpose: the text labels further down are taken from max_df.index.
max_df = df.groupby(['Country']).max()


# Do some fitting to Urban and Elderly: least-squares fit of model_fit,
# starting from the initial guess p0 = [C0, C1, C2].
x = max_df['Elderly'].values
y = max_df['Urban'].values

p0 = [80, 60, 0.2]
p, pcov = sco.curve_fit(model_fit, x, y, p0)
print('Optimised parameters are:', *p)




### Now set up the display stuff for plot 2 using seaborn
sns.set()
matplotlib.style.use('seaborn-poster')
matplotlib.style.use('ggplot')
# One blob per country: colour shows log10 GDP, size shows population.
ax=sns.scatterplot(x='Elderly',
                   y='Urban',
                   data = max_df,
                   hue='logGDP',
                   palette='terrain', #https://matplotlib.org/tutorials/colors/colormaps.html
                   size='Population',
                   sizes=(25,750),
                   alpha=.95,
                   edgecolor='black')

# Overlay the fitted curve, evaluated at 50 points between 1 and 35, as a
# dashed line.
# NOTE(review): size=5 is passed as-is from the original; confirm it is
# accepted as intended by the installed seaborn version.
x1 = np.linspace(1,35,50)
ax2 = sns.lineplot(x=x1, y=model_fit(x1,*p), size=5)
ax2.lines[0].set_linestyle("--")

# Write the country name (the frame's index) next to each blob
if TEXT:
    x=max_df['Elderly'].values
    y=max_df['Urban'].values
    lab=max_df.index
    jwp.label_point(x,y,lab, plt.gca())


plt.title('Relationship between Urbanisation,'
          ' Age of population and GDP (log scale, indicated by colour)')
plt.xlabel('% of population over 65')
plt.ylabel('% of population living in an urban environment')
plt.legend(fontsize='medium', loc='lower right')

fig = ax.get_figure()
fig.savefig("urban-age-gdp.png")

print('showing')
# Display it
plt.show()

Back

Tags: Covid19

GreyMamba

Thinking Allowed … (under construction)

Thinking Allowed … (under construction)

Thinking Allowed … (under construction)

Python code behind the COVID-19 page