COVID-19 statistics in China (except Hubei)

Data comes from https://tinyurl.com/s6gsq5y

import numpy as np
import pandas as pd
import geopandas as gpd
from sklearn import cluster

from ipywidgets import widgets
from IPython.display import display

from lets_plot import *

# `load_lets_plot_js()` is not provided by the installed lets_plot (calling it
# raises NameError after `from lets_plot import *`); `LetsPlot.setup_html()`
# is the call that loads the JS library into the notebook.
LetsPlot.setup_html()

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
C:\Temp/ipykernel_6776/2399501846.py in <module>
----> 1 load_lets_plot_js()

NameError: name 'load_lets_plot_js' is not defined

%matplotlib notebook

# CSV export URL of the Google Sheets document that holds the case records.
DATA_URL = 'https://docs.google.com/spreadsheets/d/1itaohdPiAeniCXNlntNztZ_oRvjh0HsGuJXUJWET008/export?format=csv&id=1itaohdPiAeniCXNlntNztZ_oRvjh0HsGuJXUJWET008&gid=0'
# Upper bound on the number of clusters requested when grouping case locations.
MAX_CLUSTERS_COUNT = 20
# Recognised outcome labels; the clean-up step maps any other value to 'diseased'.
OUTCOMES = ['diseased', 'died', 'discharged', 'stable']

def player_widget(plots, *, fps=1):
    """Display a play button and a slider that step through ``plots``.

    Both controls are linked on their ``value``, and the frame at that index
    is displayed whenever either of them changes.

    Args:
        plots: Indexable sequence of displayable objects, one per frame.
        fps: Frames per second; converted to the play interval
            (``1000 / fps``, truncated, at least 1).

    Returns:
        Whatever ``display`` returns for the controls and the output area.
    """
    delay = max(1, int(1000 / fps))
    last_frame = len(plots) - 1

    def show_frame(n, m):
        # `n` (slider value) selects the frame; `m` (player value) is unused
        # but makes the output react to the Play widget as well.
        return display(plots[n])

    play_button = widgets.Play(value=0, min=0, max=last_frame, step=1, interval=delay)
    frame_slider = widgets.IntSlider(value=0, min=0, max=last_frame, step=1)
    widgets.jslink((play_button, 'value'), (frame_slider, 'value'))

    controls = widgets.HBox([play_button, frame_slider])
    frame_output = widgets.interactive_output(show_frame, {'n': frame_slider, 'm': play_button})
    return display(controls, frame_output)

def find_best_clustering(shapely_points, *, max_clusters_count=1, clusters_limit_proportion=.02,
                         attempts_count=1, method=cluster.KMeans):
    """Fit ``method`` with decreasing cluster counts; return the first balanced fit.

    Starting from ``min(max_clusters_count, number of distinct points)`` and
    going down to 1, a model is fitted on the ``[x, y]`` coordinates of the
    points. The first model whose smallest cluster holds at least
    ``clusters_limit_proportion`` times as many points as its largest cluster
    is returned.

    Args:
        shapely_points: Series-like of objects exposing ``.x`` and ``.y``.
        max_clusters_count: Largest number of clusters to try.
        clusters_limit_proportion: Minimum allowed ratio between the sizes of
            the smallest and the largest cluster.
        attempts_count: Accepted for backward compatibility and not forwarded.
            It used to be passed to the estimator as ``n_jobs``, a keyword
            that scikit-learn's ``KMeans`` no longer accepts.
        method: Estimator class constructed as ``method(n_clusters=n)`` and
            exposing ``fit`` and ``labels_``.

    Returns:
        The fitted estimator, or ``None`` when the starting cluster count is
        below 1 (for example when there are no points).

    Raises:
        AssertionError: If no cluster count satisfies
            ``clusters_limit_proportion``.
    """
    points = shapely_points.apply(lambda p: [p.x, p.y]).to_list()
    # There is no use asking for more clusters than there are distinct points.
    distinct_points_count = len({str(p) for p in points})
    max_clusters_count = min(max_clusters_count, distinct_points_count)
    if max_clusters_count < 1:
        return None
    for n in range(max_clusters_count, 0, -1):
        clustering = method(n_clusters=n).fit(points)
        _, counts = np.unique(clustering.labels_, return_counts=True)
        if clusters_limit_proportion <= np.min(counts) / np.max(counts):
            return clustering
    # Raised explicitly (instead of `assert False`) so that running with -O
    # cannot turn this into a silent `return None`.
    raise AssertionError(
        'no cluster count satisfies clusters_limit_proportion=%r' % (clusters_limit_proportion,)
    )

def append_cluster_column(gdf, *, distinguished_columns=(), max_clusters_count=1, cluster_column_name='cluster'):
    """Add a column of cluster labels computed from ``gdf.geometry``.

    When ``distinguished_columns`` is given, the frame is split by the values
    of the first column (then recursively by the remaining ones), each part is
    clustered on its own, and the labelled parts are concatenated. Cluster
    labels are therefore only meaningful within one combination of values.
    Rows are grouped via ``value_counts()``, so rows with a missing value in a
    distinguished column do not appear in the result.

    Args:
        gdf: Frame with a ``geometry`` column of points. When no
            distinguished columns are given it is modified in place.
        distinguished_columns: Column labels to split by before clustering.
        max_clusters_count: Upper bound on clusters per part.
        cluster_column_name: Name of the column that receives the labels.

    Returns:
        The frame (or the concatenation of the labelled parts) including the
        cluster column. An empty frame is returned with an empty column.

    Raises:
        ValueError: If a non-empty part could not be clustered
            (``find_best_clustering`` returned ``None``).
    """
    if len(gdf) == 0:
        # Nothing to cluster: neither pd.concat([]) nor a clustering of zero
        # points is possible, so just add an empty label column.
        gdf[cluster_column_name] = np.empty(0, dtype=int)
        return gdf

    distinguished_columns = list(distinguished_columns)
    # Test for "any columns left", not for the truthiness of the labels
    # themselves (a column labelled 0 or '' is still a column).
    if distinguished_columns:
        column, *remaining_columns = distinguished_columns
        return pd.concat([
            append_cluster_column(gdf[gdf[column] == value].copy(),
                                  distinguished_columns=remaining_columns,
                                  max_clusters_count=max_clusters_count,
                                  cluster_column_name=cluster_column_name)
            for value in gdf[column].value_counts().keys()
        ])

    clustering = find_best_clustering(gdf.geometry, max_clusters_count=max_clusters_count)
    if clustering is None:
        raise ValueError(
            'no clustering could be computed (max_clusters_count=%r)' % (max_clusters_count,)
        )
    gdf[cluster_column_name] = clustering.labels_
    return gdf

def get_clusters_gdf(gdf, outcome, current_date):
    """Count, per cluster, the cases of one outcome recorded up to a date.

    Args:
        gdf: Frame with ``outcome``, ``date_confirmation``,
            ``date_death_or_discharge``, ``cluster`` and geometry columns.
        outcome: Outcome label to select.
        current_date: Cases confirmed after this date are not counted; for
            'discharged' and 'died' the closing date must not be later either.

    Returns:
        A GeoDataFrame with one row per cluster that has at least one counted
        case: ``cluster``, ``count``, ``outcome``, ``date`` and the centroid
        of the cluster as ``geometry``.
    """
    of_outcome = gdf[gdf.outcome == outcome].copy()

    counted = of_outcome[of_outcome.date_confirmation <= current_date]
    if outcome in ('discharged', 'died'):
        counted = counted[counted.date_death_or_discharge <= current_date]

    cases_per_cluster = counted.cluster.value_counts()
    labels = cases_per_cluster.keys()

    # The centroid is taken over every point of the cluster for this outcome,
    # not only over the cases counted so far.
    centroids = []
    for label in labels:
        members = of_outcome[of_outcome.cluster == label]
        centroids.append(members.unary_union.centroid)

    rows_count = len(labels)
    return gpd.GeoDataFrame({
        'cluster': labels,
        'count': cases_per_cluster.values,
        'outcome': [outcome] * rows_count,
        'date': [current_date] * rows_count,
        'geometry': centroids,
    })

# Load the country outlines and keep China's
# NOTE(review): `gpd.datasets` is not available in newer geopandas releases -
# confirm the installed version still ships 'naturalearth_lowres'.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
china = world[world['name'] == 'China']

# Load the case records and keep only the columns used below
columns = ['date_confirmation', 'outcome', 'date_death_or_discharge', 'country', 'latitude', 'longitude']
except_hubei_data = pd.read_csv(DATA_URL)
df = pd.DataFrame(except_hubei_data)[columns]

# Drop records without coordinates or confirmation date, keep China only,
# and drop discharged/died cases that have no closing date
df = df.dropna(subset=['latitude', 'longitude', 'date_confirmation'])
df = df[df.country == 'China']
closed_without_date = df.outcome.isin(['discharged', 'died']) & df.date_death_or_discharge.isna()
df = df[~closed_without_date]

# Fix data
# Work on an independent frame so the column assignments below do not target
# a slice of an earlier frame.
df = df.copy()
df.latitude = df.latitude.astype(float)
df.longitude = df.longitude.astype(float)
# Dates are parsed day-first; values that cannot be parsed become NaT.
# Every parsed date is forced into the year 2020.
df.date_confirmation = pd.to_datetime(df.date_confirmation, dayfirst=True, errors='coerce')\
    .apply(lambda dt: dt.replace(year=2020))
df.date_death_or_discharge = pd.to_datetime(df.date_death_or_discharge, dayfirst=True, errors='coerce')\
    .apply(lambda dt: dt.replace(year=2020))
# Normalise the outcome labels: anything outside OUTCOMES counts as 'diseased'.
df.outcome = df.outcome.replace({'discharge': 'discharged'})\
    .apply(lambda outcome: outcome if outcome in OUTCOMES else 'diseased')

# Missing-value checks done before parsing cannot see dates that were coerced
# to NaT, nor outcomes that only now became 'discharged'. Re-apply them here:
# a NaT confirmation date would otherwise produce an empty 'NaT' frame, and a
# closed case without a closing date can never be counted.
df = df[df.date_confirmation.notna()]
df = df[~(df.outcome.isin(['discharged', 'died']) & df.date_death_or_discharge.isna())]

# Turn every record into a point and keep the points inside China's outline
case_points = gpd.points_from_xy(df.longitude, df.latitude)
gdf = gpd.GeoDataFrame(df, geometry=case_points)
china_outline = china.iloc[0].geometry
gdf = gdf[gdf.within(china_outline)]

# Label the points with clusters by position, separately for each outcome
gdf = append_cluster_column(
    gdf,
    distinguished_columns=['outcome'],
    max_clusters_count=MAX_CLUSTERS_COUNT,
)

# Build one per-cluster summary for every (outcome, confirmation date) pair:
# the summaries of one outcome are stacked first, then the outcomes are stacked
confirmation_dates = gdf.date_confirmation.sort_values().unique()
summaries_by_outcome = []
for outcome in OUTCOMES:
    daily_summaries = [
        get_clusters_gdf(gdf, outcome=outcome, current_date=current_date)
        for current_date in confirmation_dates
    ]
    summaries_by_outcome.append(pd.concat(daily_summaries))
clusters_gdf = pd.concat(summaries_by_outcome)

# Prepare list of plots that would be frames of the animation
# Base layer shared by every frame: China's outline without axes.
p = ggplot() + \
    geom_polygon(data=china, fill='#d6d6d6') + \
    theme(legend_position=(.15, .15), axis_title='blank', axis_text='blank', axis_ticks='blank', axis_line='blank') + \
    ggsize(600, 600)

# Common upper bound for the point-size scale so sizes are comparable across frames.
limit = clusters_gdf['count'].max()
plots = []
# Dates that could not be parsed (NaT) have no data and no printable title, so skip them.
for current_date in gdf.date_confirmation.dropna().sort_values().unique():
    current_clusters_gdf = clusters_gdf[clusters_gdf.date == current_date]
    plots.append(
        p + \
        geom_point(aes(size='count', color='outcome'), data=current_clusters_gdf) + \
        scale_size(name='', limits=[0, limit]) + \
        # Pin the colour domain to OUTCOMES: without `limits` the colours are
        # assigned in order of appearance, so they shift between frames
        # whenever an outcome is absent from a frame's data.
        scale_color_manual(values=['#cf3c38', 'black', '#90c73e', '#4cc5db'], limits=OUTCOMES) + \
        ggtitle('COVID-19 on ' + np.datetime_as_string(current_date, unit='D'))
    )

# Show the frames with a play button and a slider (default: one frame per second)
player_widget(plots)