Python code behind the COVID-19 page
I'm not going to spend loads of time explaining this stuff, I'm bored of it by now. The Ffmpeg command I've used to generate the video is also her.
#! /bin/sh
ffmpeg -framerate 4 -i /home/pi/COVID19/IMAGEFIGS/output%03d.png -vcodec libx264 -crf 25 -pix_fmt yuv420p /home/pi/COVID19/VIDEOS/vid.mp4 > /dev/null
# covid19-1
#
# Plot log10 new cases against log10 total cases
#
# Data from Johns Hopkins University Center for Systems Science and Engineering (JHU CSSE)
# https://github.com/CSSEGISandData/COVID-19
# They have a great live map at: https://coronavirus.jhu.edu/map.html
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.style
import pandas as pd
import pysftp
import datetime
import seaborn as sns
import argparse
import covid_jwp as jwp
VERSION="0.2"
SHOW = False
td = str(datetime.date.today())
# Initiate the parser
parser = argparse.ArgumentParser()
parser.add_argument("-V", "--version", help="show program version", action="store_true")
parser.add_argument("-s", "--show", help="show the plot", action="store_true")
# Read arguments from the command line
args = parser.parse_args()
# Check for --version or -V
if args.version:
print("This is myprogram version "+VERSION)
if args.show:
SHOW=True
#####################
# Start of main code#
#####################
countries = (('United Kingdom', ' '),
('Spain', ' '),
('Italy', ' '),
('France', ' '),
('United States', ' ' ),
('China', 'Hubei'))
df = jwp.get_country_df(countries)
# Now set up the display stuff for plot 1 using seaborn
sns.set()
matplotlib.style.use('seaborn-poster')
matplotlib.style.use('ggplot')
sns.set_palette("hls",len(countries))
ax=sns.scatterplot(x='Total infections', y='New infections', data = df, hue='Location', s=30)
ax.set(xscale="log", yscale="log")
plt.title('Daily COVID-19 Cases Vs. Cumulative COVID19 cases \n($Log_{10 }$ scales). Updated: '+td)
plt.ylim(1, None)
plt.xlim(1, None)
fig = ax.get_figure()
fig.savefig("./PLOTS/covid.png")
if SHOW:
print('showing')
# Display it
plt.show()
# covid19-gdp
#
# Plot log10 deaths Vs GDP per capita
#
# Data from Johns Hopkins University Center for Systems Science and Engineering (JHU CSSE)
# https://github.com/CSSEGISandData/COVID-19
# They have a great live map at: https://coronavirus.jhu.edu/map.html
#
# Population and GDP info from World Bank
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.style
import pandas as pd
import pysftp
import datetime
import seaborn as sns
import argparse
import covid_jwp as jwp
# Some 'constants'
VERSION="0.1"
SHOW = False
TEXT = False
CENTROID = False
VALIDTYPES = {'GDP':'GDP per capita (US$)',
'Elderly':'% of poulation over 65',
'GINI':'Gini index',
'Smoke': '% of adults who smoke',
'Urban':'% of population in an urban environment'}
CORR = 'GDP'
LOGX=False
td = str(datetime.date.today())
# Read in any 'flags' that have been passed on the command line
# Initiate the parser
parser = argparse.ArgumentParser()
parser.add_argument("-V", "--version", help="show program version", action="store_true")
parser.add_argument("-s", "--show", help="display it", action="store_true")
parser.add_argument("-t", "--text", help="add text", action="store_true")
parser.add_argument("-c", "--centroid", help="show regional centroid", action="store_true")
parser.add_argument("-l", "--log", help="log10 on both axes", action="store_true")
parser.add_argument("-d", "--data", help="type of country data to compare", action='store', dest='data_type')
# Read arguments from the command line
args = parser.parse_args()
# Check for --version or -V
if args.version:
print("This is myprogram version "+VERSION)
if args.show:
SHOW=True
if args.centroid:
CENTROID=True
if (args.data_type) in VALIDTYPES.keys():
CORR = args.data_type
if args.log:
LOGX=True
if args.text:
TEXT=True
if CORR == 'GDP':
LOGX = True
#####################
# Start of main code#
#####################
countries = jwp.COUNTRIES
# get the main dataframe giving infection and deaths
df = jwp.get_country_df(countries)
df_dat = jwp.get_country_data_df(countries)
# print(df_dat.head(15))
# Join them on 'Country'
deaths_df = df_dat.join(df.set_index('Country'), on='Country')
# Smoking and Gini have an incomplete country dataset. So, drop where
# 'Smoke' (or GINI) is not numeric
if CORR=='Smoke':
deaths_df['Smoke'] = pd.to_numeric(deaths_df['Smoke'], errors='coerce')
deaths_df = deaths_df.dropna(subset=['Smoke'])
if CORR=='GINI':
deaths_df['GINI'] = pd.to_numeric(deaths_df['GINI'], errors='coerce')
deaths_df = deaths_df.dropna(subset=['GINI'])
deaths_df['Death_index'] = 1000000000*deaths_df['Total deaths']/deaths_df['Population']
# Calculate 'Death_index'
deaths_df['Death_index'] = deaths_df['Death_index'].replace(0,0.01) # To avoid log10 0 problem
# We only need the current maximum DI
max_DI_df = deaths_df.groupby(['Country']).aggregate(np.max)
max_DI_df.reset_index()
# Create information that is the 'centroid' of points for each region
if CENTROID:
## G = deaths_df.groupby(['Region'])
## region_mean = G.aggregate(np.mean)
## region_mean = region_mean.reset_index()
max_DI_df.sort_values('Region', inplace=True)
region_mean = max_DI_df.groupby(['Region']).aggregate(np.mean)
region_mean = region_mean.reset_index()
# Correlation stuff
dcorr_ = np.array(max_DI_df[CORR].values)
if LOGX:
dcorr = np.log10(dcorr_)
else:
dcorr=dcorr_
dgi_ = np.array(max_DI_df['Death_index'].values)
dgi = np.log10(dgi_)
pearsonR = np.corrcoef(dcorr, dgi)[1,0]
pearsons = f'{pearsonR:0.2f}' # For formatted printing
### Now set up the display stuff for plot 2 using seaborn
sns.set()
matplotlib.style.use('seaborn-poster')
matplotlib.style.use('ggplot')
sns.set_palette("husl",7)
# Plot the data
ax=sns.scatterplot(x=CORR,
y='Death_index',
data = max_DI_df,
hue='Region',
size='Population',
sizes=(25,750),
alpha=.75,
edgecolor='black')
# Plot the centoids
if CENTROID:
ax2=sns.scatterplot(x=CORR,
y='Death_index',
data = region_mean,
hue='Region',
marker='*',
s=250,
edgecolor='black',
legend=False)
if TEXT:
x=max_DI_df[CORR].values
y=max_DI_df['Death_index'].values
lab=max_DI_df.index
jwp.label_point(x,y,lab, plt.gca())
ylimcalc = 1.5*(max_DI_df['Death_index'].values.max())
plt.ylim(10, ylimcalc)
plt.ylabel(r'(Deaths/Poulation)$\times 10^8$')
plt.xlabel(VALIDTYPES[CORR])
if LOGX:
print('logx')
ax.set(xscale="log", yscale="log")
ax.text(100,10, "Pearson's r = "+pearsons)
plt.xlim(100, None)
else:
print('not logx')
ax.set(yscale="log")
ax.text(1,10, "Pearson's r = "+pearsons)
plt.xlim(1, None)
plt.title('Death Index Vs. '+VALIDTYPES[CORR]+'\nUpdated: '+td+' size of blob relates to population')
plt.legend(fontsize='small', loc='upper left')
fig = ax.get_figure()
fig.savefig("./PLOTS/covid-d-"+CORR+".png")
if SHOW:
print('showing')
# Display it
plt.show()
# Module containing COVID-19 helper functions
# John Palmer
# 2020/04/14
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.style
import pandas as pd
import pysftp
import datetime
import seaborn as sns
import os.path
from pathlib import Path
VERSION="0.2"
COUNTRIES = (('United Kingdom', ' '),
('France', ' '),
('Italy', ' '),
('Sweden', ' '),
('Bangladesh', ' ' ),
('Bolivia', ' '),
('Burkina Faso', ' '),
('China', '*'),
('Cameroon', ' ' ),
('Canada', '*'),
('Colombia', ' '),
('Ecuador', ' '),
('Egypt', ' '),
('Germany', ' '),
('Ghana', ' '),
('US', ' '),
('Brazil', ' '),
('Canada', '*'),
('Australia', '*'),
('Denmark', ' '),
('Iran', ' '),
('Iraq', ' '),
('India', ' '),
('Indonesia', ' '),
('Japan', ' '),
('Korea, South', ' '),
('Malaysia', ' '),
('Mexico', ' '),
('Morocco', ' '),
('New Zealand', ' '),
('Niger', ' '),
('Nigeria', ' '),
('Pakistan', ' '),
('Peru', ' '),
('Philippines', ' '),
('Panama', ' '),
('South Africa', ' '),
('Spain', ' '),
('Sri Lanka', ' '),
('Switzerland', ' '),
('Thailand', ' '),
('Tunisia', ' '),
('Turkey', ' '))
# prints a label (val) at each x,y point given list of x,y and label values
def label_point(x, y, val, ax):
a = pd.DataFrame({'x': x, 'y': y, 'val': val})
xscale = 0.02 # Needs to be a proper scaling factor
for i, point in a.iterrows():
ax.text(float(point['x'])+xscale, point['y'], point['val'])
# For a given country/province with optional start date, return a set of lists:
# list of date column names (Date)
# list containing sums of values for those columns
# list of increments between each column sum
def get_df_data(df, country='United Kingdom', province= ' ', start_date='1/22/20'):
country=str(country)
# First find all appropriate rows
if province.strip()=='':
rows = df.loc[(df['Country/Region']==country) & (df['Province/State'].isnull())]
elif province=='*':
rows = df.loc[(df['Country/Region']==country) & (df['Province/State'].notnull())]
else:
rows = df.loc[(df['Country/Region']==country) & (df['Province/State']==province)]
# Choose a sub-set of date columns
sub_set = rows.loc[:,start_date:]
# calculate the things we want
d = sub_set.columns.values # list of dates
t = sub_set.sum().values # list of totals from all appropriate rows
diff = sub_set.sum().diff().values # list of the increment column to column
return [d, t, diff]
# Read in and return a DataFrame containing concatenated information from
# John Hopkins University data sets. Columns will be:
# Country, Province, Date, New infections, Total infections, New deaths, Total deaths
# Parameters are
# countries: list of countries to get data for
#
def get_country_df(countries = COUNTRIES):
td = str(datetime.date.today())
#John Hopkins University data URLs
link_JHU = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
link_JHU_Deaths ="https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv"
link_JHU_Recover = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv"
#link_JHU="https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv"
#link ='https://www.arcgis.com/sharing/rest/content/items/e5fd11150d274bebaaf8fe2a7a2bda11/data'
# Don't want to keep downloading whilst developing so:
fname = './DATA/COVID_JHU_'+td+'.csv'
if os.path.exists(fname):
data_infections = pd.read_csv(fname)
else:
data_infections = pd.read_csv(link_JHU)
data_infections.to_csv(fname, index=False)
fname = './DATA/COVID_JHU_deaths_'+td+'.csv'
if os.path.exists(fname):
data_deaths = pd.read_csv(fname, quotechar='"')
else:
data_deaths = pd.read_csv(link_JHU_Deaths,quotechar='"' )
data_deaths.to_csv(fname, index=False)
FIRSTTIME=True
for c in countries:
country=c[0]
province=c[1]
if FIRSTTIME:
column_names =['Date',
'New infections',
'Total infections',
'Country',
'Province',
'Location',
'New deaths',
'Total deaths']
countries_df = pd.DataFrame(columns=column_names)
FIRSTTIME=False
# Fudge the names to take account of different databases
if country == 'United States' : country = 'US'
if province.strip() =='' or province=='*':
loc=country
else:
loc = province+':'+country
dat = get_df_data(data_infections, country, province)
df = pd.DataFrame({'Date':dat[0], 'New infections':dat[2], 'Total infections':dat[1]})
df['Country'] = country
df['Province'] = province
df['Location']= loc
dat = get_df_data(data_deaths, country, province)
df['New deaths'] = dat[2]
df['Total deaths'] = dat[1]
countries_df = countries_df.append(df, ignore_index=True)
return countries_df
## Return a DataFrame of country specific statistics
def get_country_data_df(countries=COUNTRIES):
gdp = pd.read_csv('./DATA/GDP.csv', header=4)
pop = pd.read_csv('./DATA/POP.csv', header=4)
meta = pd.read_csv('./DATA/META.csv')
age = pd.read_csv('./DATA/ELDERLY.csv', header=2)
smoke = pd.read_csv('./DATA/SMOKING.csv', header=2)
gini = pd.read_csv('./DATA/GINI.csv', header=2)
urban = pd.read_csv('./DATA/URBAN.csv', header=2)
# Set up parameter lists
gdppc=[]
cpop=[]
creg=[]
cage=[]
csmoke=[]
cgini=[]
curban=[]
ccount=[]
for c in countries:
country=c[0]
province=c[1]
# Some exceptions in naming
if country =='US': country= 'United States'
if country == 'Egypt': country=r"Egypt, Arab Rep."
if country == 'Korea, South': country = "Korea, Rep."
ccount.append(c[0])
# Get a GDP per capita value and the Country 'Country Code'
row =(gdp.loc[gdp['Country Name']==country])
# Last valid value in row
gdppc.append(list(row.ffill(axis=1).iloc[:, -1])[0])
cc = list(row['Country Code'])[0]
# population
row =(pop.loc[pop['Country Code']==cc])
# Last valid value in row
# Fix for Hubei
if country=='China' and province=='Hubei':
cpop.append(58500000)
else:
cpop.append(list(row.ffill(axis=1).iloc[:, -1])[0])
# Region the Country is in
row =(meta.loc[meta['Country Code']==cc])
creg.append( list(row['Region'])[0])
# % of population over 65
row = (age.loc[age['Country Code']==cc])
# Last valid value in row
cage.append(list(row.ffill(axis=1).iloc[:, -1])[0])
# % of population who smoke
row = (smoke.loc[age['Country Code']==cc])
# Last valid value in row
csmoke.append(list(row.ffill(axis=1).iloc[:, -1])[0])
# GINI index
row = (gini.loc[gini['Country Code']==cc])
# Last valid value in row
cgini.append(list(row.ffill(axis=1).iloc[:, -1])[0])
# % Urbanised population
row = (urban.loc[urban['Country Code']==cc])
# Last valid value in row
curban.append(list(row.ffill(axis=1).iloc[:, -1])[0])
c_df = pd.DataFrame({'Country':ccount,
'Population':cpop,
'GDP':gdppc,
'Region':creg,
'Elderly':cage,
'Smoke':csmoke,
'GINI':cgini,
'Urban':curban})
return(c_df)
# Send files from a list (or singly) via ftp to greymamba
def sendFiles(f,host,user,passw):
# Deal with single files or list
if type(f)==str:
files=[]
files.append(f)
else:
files = f.copy()
cnopts = pysftp.CnOpts()
cnopts.hostkeys.load('/home/pi/.ssh/known_hosts')
with pysftp.Connection(host=host, username=user, password=passw) as sftp:
print ("Connection succesfully established ... ")
for path in files:
# Get actual file name without the path
filename = Path(path).name
localFilePath = path
remoteFilePath = 'web/Datasets/'+filename
sftp.put(localFilePath, remoteFilePath)
# covid19-morecorr
#
# Plot relationships between age, urbanisation and GDP
#
# Data from Johns Hopkins University Center for Systems Science and Engineering (JHU CSSE)
# https://github.com/CSSEGISandData/COVID-19
# They have a great live map at: https://coronavirus.jhu.edu/map.html
#
# Population and GDP info from World Bank
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.style
import pandas as pd
import scipy.optimize as sco
import datetime
import seaborn as sns
import argparse
import covid_jwp as jwp
# Some 'constants'
VERSION="0.1"
TEXT = True
td = str(datetime.date.today())
# Read in any 'flags' that have been passed on the command line
# Initiate the parser
parser = argparse.ArgumentParser()
parser.add_argument("-V", "--version", help="show program version", action="store_true")
parser.add_argument("-t", "--text", help="add text", action="store_true")
# Read arguments from the command line
args = parser.parse_args()
# Check for --version or -V
if args.version:
print("This is myprogram version "+VERSION)
if args.text:
TEXT=True
def model_fit(x,C0,C1,C2):
return C0-C1*np.exp(-C2*x)
#####################
# Start of main code#
#####################
countries = jwp.COUNTRIES
# get the main dataframe giving infection and deaths
df = jwp.get_country_df(countries)
# Get the database giving country data
df_dat = jwp.get_country_data_df(countries)
print(df_dat.head(15))
# Join them on 'Country'
df = df_dat.join(df.set_index('Country'), on='Country')
# Deal with incomplete country data
df['Urban'] = pd.to_numeric(df['Urban'], errors='coerce')
df = df.dropna(subset=['Urban'])
df['Elderly'] = pd.to_numeric(df['Elderly'], errors='coerce')
df = df.dropna(subset=['Elderly'])
# Add a log10 GDP column
df['logGDP'] = np.log10(df['GDP'])
df['logUrban'] = np.log10(df['Urban'])
df['logElderly'] = np.log10(df['Elderly'])
# We only need the current maximums
max_df = df.groupby(['Country']).aggregate(np.max)
max_df.reset_index()
# Do some fitting to Urban and Elderly
x = max_df['Elderly'].values
y = max_df['Urban'].values
p0=[80,60,0.2]
p,pcov = sco.curve_fit(model_fit,x,y,p0)
print('Optimised parameters are:', *p)
### Now set up the display stuff for plot 2 using seaborn
sns.set()
matplotlib.style.use('seaborn-poster')
matplotlib.style.use('ggplot')
ax=sns.scatterplot(x='Elderly',
y='Urban',
data = max_df,
hue='logGDP',
palette='terrain', #https://matplotlib.org/tutorials/colors/colormaps.html
size='Population',
sizes=(25,750),
alpha=.95,
edgecolor='black')
x1 = np.linspace(1,35,50)
ax2 = sns.lineplot(x=x1, y=model_fit(x1,*p), size=5)
ax2.lines[0].set_linestyle("--")
if TEXT:
x=max_df['Elderly'].values
y=max_df['Urban'].values
lab=max_df.index
jwp.label_point(x,y,lab, plt.gca())
plt.title('Relationship between Urabanisation,\
Age of population and GDP (log scale, indicated by colour)')
plt.xlabel('% of population over 65')
plt.ylabel('% of population living in an urban environment')
plt.legend(fontsize='medium', loc='lower right')
fig = ax.get_figure()
fig.savefig("urban-age-gdp.png")
print('showing')
# Display it
plt.show()