COVID-19 statistics in China (except Hubei)

Data comes from https://tinyurl.com/s6gsq5y

In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
from sklearn import cluster

from ipywidgets import widgets
from IPython.display import display

from lets_plot import *
In [2]:
load_lets_plot_js()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
C:\Temp/ipykernel_6776/2399501846.py in <module>
----> 1 load_lets_plot_js()

NameError: name 'load_lets_plot_js' is not defined
In [ ]:
%matplotlib notebook
In [ ]:
DATA_URL = 'https://docs.google.com/spreadsheets/d/1itaohdPiAeniCXNlntNztZ_oRvjh0HsGuJXUJWET008/export?format=csv&id=1itaohdPiAeniCXNlntNztZ_oRvjh0HsGuJXUJWET008&gid=0'
MAX_CLUSTERS_COUNT = 20
OUTCOMES = ['diseased', 'died', 'discharged', 'stable']
In [ ]:
def player_widget(plots, *, fps=1):
    interval = max(1, int(1000 / fps))
    player = widgets.Play(min=0, max=len(plots) - 1, step=1, value=0, interval=interval)
    slider = widgets.IntSlider(min=0, max=len(plots) - 1, step=1, value=0)
    widgets.jslink((player, 'value'), (slider, 'value'))
    widget = widgets.HBox([player, slider])
    iout = widgets.interactive_output(lambda n, m: display(plots[n]), {'n': slider, 'm': player})
    return display(widget, iout)
In [ ]:
def find_best_clustering(shapely_points, *, max_clusters_count=1, clusters_limit_proportion=.02, \
                         attempts_count=1, method=cluster.KMeans):
    points = shapely_points.apply(lambda p: [p.x, p.y]).to_list()
    max_clusters_count = min(max_clusters_count, len(set([str(p) for p in points])))
    if max_clusters_count < 1:
        return None
    for n in range(max_clusters_count, 0, -1):
        clustering = method(n_clusters=n, n_jobs=attempts_count).fit(points)
        clusters, counts = np.unique(clustering.labels_, return_counts=True)
        if clusters_limit_proportion <= np.min(counts) / np.max(counts):
            return clustering
    assert(False)
In [ ]:
def append_cluster_column(gdf, *, distinguished_columns=[], max_clusters_count=1, cluster_column_name='cluster'):
    if any(distinguished_columns):
        column = distinguished_columns[0]
        return pd.concat([
            append_cluster_column(gdf[gdf[column] == value].copy(), \
                                  distinguished_columns=distinguished_columns[1:], \
                                  max_clusters_count=max_clusters_count, \
                                  cluster_column_name=cluster_column_name)
            for value in gdf[column].value_counts().keys()
        ])
    else:
        gdf[cluster_column_name] = find_best_clustering(gdf.geometry, max_clusters_count=max_clusters_count).labels_
        return gdf
In [ ]:
def get_clusters_gdf(gdf, outcome, current_date):
    gdf = gdf[gdf.outcome == outcome].copy()
    actual_gdf = gdf[gdf.date_confirmation<=current_date]
    if outcome in ['discharged', 'died']:
        actual_gdf = actual_gdf[actual_gdf.date_death_or_discharge<=current_date]
    
    vc = actual_gdf.cluster.value_counts()
    clusters, counts = vc.keys(), vc.values
    geometry = [gdf[gdf.cluster == cluster].unary_union.centroid for cluster in clusters]
    
    return gpd.GeoDataFrame(dict(cluster=clusters, \
                                 count=counts, \
                                 outcome=[outcome]*len(clusters), \
                                 date=[current_date]*len(clusters), \
                                 geometry=geometry))
In [ ]:
# Read the map polygon
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
china = world[world.name == 'China']
In [ ]:
# Prepare dataframe
columns = ['date_confirmation', 'outcome', 'date_death_or_discharge', 'country', 'latitude', 'longitude']
except_hubei_data = pd.read_csv(DATA_URL)
df = pd.DataFrame(except_hubei_data)[columns]
In [ ]:
# Clean data
df = df[~df.latitude.isna()]
df = df[~df.longitude.isna()]
df = df[~df.date_confirmation.isna()]
df = df[df.country == 'China']
df = df[~(((df.outcome=='discharged')|(df.outcome=='died'))&df.date_death_or_discharge.isna())]
In [ ]:
# Fix data
df.latitude = df.latitude.astype(float)
df.longitude = df.longitude.astype(float)
df.date_confirmation = pd.to_datetime(df.date_confirmation, dayfirst=True, errors='coerce')\
    .apply(lambda dt: dt.replace(year=2020))
df.date_death_or_discharge = pd.to_datetime(df.date_death_or_discharge, dayfirst=True, errors='coerce')\
    .apply(lambda dt: dt.replace(year=2020))
df.outcome = df.outcome.replace({'discharge': 'discharged'})\
    .apply(lambda outcome: outcome if outcome in OUTCOMES else 'diseased')
In [ ]:
# Prepare geodataframe
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.longitude, df.latitude))
gdf = gdf[gdf.within(china.iloc[0].geometry)]
In [ ]:
# Add clusters by geoposition
gdf = append_cluster_column(gdf, distinguished_columns=['outcome'], max_clusters_count=MAX_CLUSTERS_COUNT)
In [ ]:
# Prepare clusterized geodataframe
clusters_gdf = pd.concat([
    pd.concat([
        get_clusters_gdf(gdf, outcome=outcome, current_date=current_date)
        for current_date in gdf.date_confirmation.sort_values().unique()
    ])
    for outcome in OUTCOMES
])
In [ ]:
# Prepare list of plots that would be frames of the animation
p = ggplot() + \
    geom_polygon(data=china, fill='#d6d6d6') + \
    theme(legend_position=(.15, .15), axis_title='blank', axis_text='blank', axis_ticks='blank', axis_line='blank') + \
    ggsize(600, 600)

limit = clusters_gdf['count'].max()
plots = []
for current_date in gdf.date_confirmation.sort_values().unique():
    current_clusters_gdf = clusters_gdf[clusters_gdf.date == current_date]
    plots.append(
        p + \
        geom_point(aes(size='count', color='outcome'), data=current_clusters_gdf) + \
        scale_size(name='', limits=[0, limit]) + \
        scale_color_manual(values=['#cf3c38', 'black', '#90c73e', '#4cc5db']) + \
        ggtitle('COVID-19 on ' + np.datetime_as_string(current_date, unit='D'))
    )
In [ ]:
player_widget(plots)
", "text/plain": ""}, "metadata": {}, "output_type": "display_data"}]}}, "b03b0800c74741a4b221b472d210a5a3": {"model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": {}}, "b4e0d03e634a483b9fd9d99ef9c4f55d": {"model_module": "@jupyter-widgets/output", "model_module_version": "1.0.0", "model_name": "OutputModel", "state": {"layout": "IPY_MODEL_615f053348b3433a93c903081a6b725c", "outputs": [{"data": {"text/html": "
\n ", "text/plain": ""}, "metadata": {}, "output_type": "display_data"}]}}, "b807750af1d141d59a04aed88f6a0552": {"model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "LinkModel", "state": {"source": ["IPY_MODEL_0ab2a42cbc8740dba1fa5d7122588ac3", "value"], "target": ["IPY_MODEL_747edf0666ab4b6d9a4363d783755282", "value"]}}, "bcc675b4ef624247ae68f418b451d4a8": {"model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "SliderStyleModel", "state": {"description_width": ""}}, "be804510102e43c3aa42aa685194d1cc": {"model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": {}}, "bf7242a655874eb69b829c25effe0b9c": {"model_module": "@jupyter-widgets/output", "model_module_version": "1.0.0", "model_name": "OutputModel", "state": {"layout": "IPY_MODEL_0d323541d9924b1ebf17d1c2b5ec4d86", "outputs": [{"data": {"text/html": "
\n ", "text/plain": ""}, "metadata": {}, "output_type": "display_data"}]}}, "c5d3c5d247454d808aac4a8e3f5a8ea0": {"model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "LinkModel", "state": {"source": ["IPY_MODEL_fd9ce3a5f6b244a2aab6a0b4c3b5a2cf", "value"], "target": ["IPY_MODEL_54811c02f756439693b4dab8457271b6", "value"]}}, "ce8dbb234f284b2ba744298948e64e23": {"model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": {"description_width": ""}}, "cfd475d8336f4fb5b5b3950c26d41ffb": {"model_module": "@jupyter-widgets/output", "model_module_version": "1.0.0", "model_name": "OutputModel", "state": {"layout": "IPY_MODEL_07daf44749e6445786a59c3e94c9341b", "outputs": [{"data": {"text/html": "
\n ", "text/plain": ""}, "metadata": {}, "output_type": "display_data"}]}}, "d1da66120ad244f1b7933bf3f1372942": {"model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": {}}, "d3316de7499e45e8a19e378c294187cf": {"model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": {"description_width": ""}}, "d8125145d48a46d193d8bf39b5bcfa02": {"model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "PlayModel", "state": {"_playing": true, "interval": 1000, "layout": "IPY_MODEL_be804510102e43c3aa42aa685194d1cc", "max": 38, "style": "IPY_MODEL_79d8bf641c984298bb1880441e7f8a9a", "value": 23}}, "dbbd30b1fa3145b491e7ba22e299f40d": {"model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": {}}, "dce3a046b15e41a782c3564ce908df9e": {"model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "SliderStyleModel", "state": {"description_width": ""}}, "e0a0579e8f474aa59c66cd5f1d084fc1": {"model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "PlayModel", "state": {"interval": 1000, "layout": "IPY_MODEL_e371f9cab5e7484790abd8044dcefb17", "max": 38, "style": "IPY_MODEL_ce8dbb234f284b2ba744298948e64e23", "value": 38}}, "e371f9cab5e7484790abd8044dcefb17": {"model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": {}}, "e76b3bd38ee14206b750b9981b770199": {"model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": {}}, "eab364485426408d8967a1018da1760d": {"model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": {}}, "efdbbda054b546b89155758159a51c2c": {"model_module": "@jupyter-widgets/output", "model_module_version": "1.0.0", "model_name": "OutputModel", "state": {"layout": "IPY_MODEL_3da72335e55049c3b971a2d8f3e94c70", "outputs": [{"data": {"text/html": "
\n ", "text/plain": ""}, "metadata": {}, "output_type": "display_data"}]}}, "f3b9e292accf4e9287986fc17edb3676": {"model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": {"description_width": ""}}, "f5366755650548649a30d5be85bddc51": {"model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": {"children": ["IPY_MODEL_fd9ce3a5f6b244a2aab6a0b4c3b5a2cf", "IPY_MODEL_54811c02f756439693b4dab8457271b6"], "layout": "IPY_MODEL_b03b0800c74741a4b221b472d210a5a3"}}, "fd9ce3a5f6b244a2aab6a0b4c3b5a2cf": {"model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "PlayModel", "state": {"interval": 1000, "layout": "IPY_MODEL_52f3fd119ede4ef69cac3af9365f25f6", "max": 38, "style": "IPY_MODEL_d3316de7499e45e8a19e378c294187cf", "value": 16}}, "ff02945c58294ac6b296ac7dbfde78a0": {"model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": {"children": ["IPY_MODEL_0ab2a42cbc8740dba1fa5d7122588ac3", "IPY_MODEL_747edf0666ab4b6d9a4363d783755282"], "layout": "IPY_MODEL_6c6e876da2e24a618237dd2f5f91923d"}}}, "version_major": 2, "version_minor": 0}