add enums + set numpy data structure + plot vac/unvac percent

This commit is contained in:
rmanach 2021-12-26 20:29:13 +01:00
parent 615094877c
commit 70ac7d1bb7

383
drees.py
View File

@ -3,50 +3,59 @@ import json
import logging
import os
from datetime import datetime as dt
from enum import Enum
from enum import Enum, IntEnum
from functools import partial
from multiprocessing import Pool
from typing import Any, Dict, List, Optional, OrderedDict, Tuple
import numpy as np
import pandas as pd
import requests
from matplotlib import dates as md
from matplotlib import pyplot as plt
from numba import njit
# Log record layout handed to logging.basicConfig below.
FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(format=FORMAT, level=logging.INFO)
# strftime/strptime pattern used for dates shown on plots.
DATE_FORMAT = "%Y-%m-%d"
# NOTE(review): presumably the image format for saved plots; no use is visible in this chunk - confirm.
PLOT_FORMAT = "png"
# DREES open-data search endpoint (covid-19 results by age, faceted by date, vaccine status and age).
DATA_URL = "https://data.drees.solidarites-sante.gouv.fr/api/records/1.0/search/?dataset=covid-19-resultats-par-age-issus-des-appariements-entre-si-vic-si-dep-et-vac-si&q=&rows=-1&facet=date&facet=vac_statut&facet=age"
# Directory created by get_data (os.makedirs) before fetching.
DATA_REPOSITORY = "data"
# Directory the plotting functions build their output paths in.
OUTPUT_REPOSITORY = "output"
class Field(str, Enum):
    """Indicator keys read from each record, as plain strings."""

    # hospitalisations
    HC = "hc"
    # criticals
    SC = "sc"
    # deaths
    DC = "dc"
class DreesEnum(bytes, Enum):
    """Base enum whose members pair an integer value with a text label.

    Members are declared as ``NAME = (value, label)``. ``value`` becomes the
    member's ``.value`` and ``label`` is exposed as ``.label``.
    """

    def __new__(cls, value, label):
        # The bytes payload is the single byte ``value``; bytes() only
        # accepts integers in range(256) here.
        member = bytes.__new__(cls, [value])
        member._value_ = value
        member.label = label
        return member
class VacStatus(str, Enum):
    """Vaccine status labels, as plain strings."""

    # unvaccinated
    NC = "Non-vaccinés"
    # first dose
    PDR = "Primo dose récente"
    PDE = "Primo dose efficace"
    # complete, less than 3 months
    CM3MSR = "Complet de moins de 3 mois - sans rappel"
    CM3MAR = "Complet de moins de 3 mois - avec rappel"
    # complete, between 3 and 6 months
    CM36MSR = "Complet entre 3 mois et 6 mois - sans rappel"
    CM36MAR = "Complet entre 3 mois et 6 mois - avec rappel"
class Field(DreesEnum):
    """Indicators, declared as ``(value, label)`` pairs.

    ``value`` is used as an index on the field axis of the numpy data and
    ``label`` is the key found in the grouped records.
    """

    # hospitalisations
    HC = (0, "hc")
    # criticals
    SC = (1, "sc")
    # deaths
    DC = (2, "dc")
class AgeGroup(str, Enum):
    """Age bracket labels, as plain strings."""

    VERY_YOUNG = "[0,19]"
    YOUNG = "[20,39]"
    MID_OLD = "[40,59]"
    OLD = "[60,79]"
    # this label uses a semicolon, unlike the other brackets
    VERY_OLD = "[80;+]"
class VacStatus(DreesEnum):
    """Vaccine statuses, declared as ``(value, label)`` pairs.

    ``value`` is used as an index on the vaccine-status axis of the numpy
    data and ``label`` is the status text found in the grouped records.
    """

    # unvaccinated
    NC = (0, "Non-vaccinés")
    # first dose
    PDR = (1, "Primo dose récente")
    PDE = (2, "Primo dose efficace")
    # complete, less than 3 months
    CM3MSR = (3, "Complet de moins de 3 mois - sans rappel")
    CM3MAR = (4, "Complet de moins de 3 mois - avec rappel")
    # complete, between 3 and 6 months
    CM36MSR = (5, "Complet entre 3 mois et 6 mois - sans rappel")
    CM36MAR = (6, "Complet entre 3 mois et 6 mois - avec rappel")
    # complete, 6 months and more
    C6MAR = (7, "Complet de 6 mois et plus - avec rappel")
    C6MSR = (8, "Complet de 6 mois et plus - sans rappel")
class AgeGroup(DreesEnum):
    """Age brackets, declared as ``(value, label)`` pairs.

    ``value`` is used as an index on the age axis of the numpy data and
    ``label`` is the bracket text found in the grouped records.
    """

    VERY_YOUNG = (0, "[0,19]")
    YOUNG = (1, "[20,39]")
    MID_OLD = (2, "[40,59]")
    OLD = (3, "[60,79]")
    # this label uses a semicolon, unlike the other brackets
    VERY_OLD = (4, "[80;+]")
def get_data(
@ -57,6 +66,7 @@ def get_data(
"""
collect covid data by age from DREES
"""
logging.info("fetching data...")
os.makedirs(DATA_REPOSITORY, exist_ok=True)
data_url = DATA_URL.format(extension=extension)
if data_url.endswith("/"):
@ -76,6 +86,24 @@ def get_data(
return json.load(open(file_path, "rb"))
def get_enum_vac_status(value):
    """Return the integer value of the ``VacStatus`` member labelled *value*.

    Args:
        value: vaccine status label to look up.

    Returns:
        The ``.value`` of the matching member.

    Raises:
        ValueError: if no ``VacStatus`` member carries that label.
    """
    for vac_status in VacStatus:
        if vac_status.label == value:
            return vac_status.value
    # Falling off the end would return None, and NumPy reads a None index as
    # "insert a new axis", so the caller would write to the wrong cells
    # without any error. Fail loudly instead.
    raise ValueError(f"unknown vaccine status label: {value!r}")
def get_enum_age(value):
    """Return the integer value of the ``AgeGroup`` member labelled *value*.

    Args:
        value: age bracket label to look up.

    Returns:
        The ``.value`` of the matching member.

    Raises:
        ValueError: if no ``AgeGroup`` member carries that label.
    """
    for age_group in AgeGroup:
        if age_group.label == value:
            return age_group.value
    # Falling off the end would return None, and NumPy reads a None index as
    # "insert a new axis", so the caller would write to the wrong cells
    # without any error. Fail loudly instead.
    raise ValueError(f"unknown age group label: {value!r}")
def get_enum_field(value):
    """Return the integer value of the ``Field`` member labelled *value*.

    Args:
        value: field label to look up.

    Returns:
        The ``.value`` of the matching member.

    Raises:
        ValueError: if no ``Field`` member carries that label.
    """
    for field in Field:
        if field.label == value:
            return field.value
    # Falling off the end would return None, and NumPy reads a None index as
    # "insert a new axis", so the caller would write to the wrong cells
    # without any error. Fail loudly instead.
    raise ValueError(f"unknown field label: {value!r}")
def group_by_age_date(data: Dict[str, Any], fields: List[str]) -> Dict[dt, Any]:
"""
group the original dictionary into a more readable one
@ -90,6 +118,7 @@ def group_by_age_date(data: Dict[str, Any], fields: List[str]) -> Dict[dt, Any]:
}
}
"""
logging.info("restructuring the data...")
dic_data_grouped: Dict[dt, Any] = OrderedDict()
for row in data["records"]:
row_fields = row["fields"]
@ -104,204 +133,176 @@ def group_by_age_date(data: Dict[str, Any], fields: List[str]) -> Dict[dt, Any]:
dic_data_grouped[date][age][vac_status] = OrderedDict()
for field in fields:
dic_data_grouped[date][age][vac_status][field] = row_fields[field]
logging.info("data restructured")
return dic_data_grouped
def cumulate_array(array: np.ndarray) -> np.ndarray:
    """Return the running (cumulative) sum of a one-dimensional array.

    Args:
        array: values to accumulate, in order.

    Returns:
        An array of the same length whose i-th item is the sum of the first
        i + 1 items of *array*; empty input gives an empty array.
    """
    # np.cumsum already runs in native code, so no JIT wrapper is needed. It
    # also handles empty input and avoids a local named after the builtin sum.
    return np.cumsum(array)
def get_np_data(dic_data_grouped: Dict[dt, Any]) -> Tuple[np.ndarray, np.ndarray]:
    """Store the grouped data in numpy arrays indexed through the enums.

    Args:
        dic_data_grouped: mapping date -> age label -> vaccine status label
            -> field label -> value.

    Returns:
        ``(np_data, np_date)``. ``np_data`` has shape
        ``(len(dic_data_grouped), len(AgeGroup), len(VacStatus), len(Field))``
        and ``np_date`` holds the dates in iteration order as
        ``datetime64[s]``.
    """
    logging.info("storing data in numpy data structure...")

    nb_dates = len(dic_data_grouped)
    # zeros rather than empty: any (age, status, field) combination missing
    # from the source must count as 0, not as leftover memory content,
    # because these cells are summed later on.
    np_data = np.zeros((nb_dates, len(AgeGroup), len(VacStatus), len(Field)))
    # every slot of np_date is assigned in the loop below, so empty is safe
    np_date = np.empty(nb_dates, dtype="datetime64[s]")

    for idx_date, (date, dic_age) in enumerate(dic_data_grouped.items()):
        np_date[idx_date] = date
        for age, dic_vac in dic_age.items():
            idx_age = get_enum_age(age)
            for vac, dic_field in dic_vac.items():
                idx_vac = get_enum_vac_status(vac)
                for field, value in dic_field.items():
                    idx_field = get_enum_field(field)
                    np_data[idx_date, idx_age, idx_vac, idx_field] = value

    logging.info("date and data generated")
    return np_data, np_date
def get_plot_fig(
    grid: Optional[bool] = True,
    date_format: Optional[str] = DATE_FORMAT,
    figsize: Optional[Tuple[int, int]] = None,
) -> Tuple[plt.Figure, plt.Axes]:
    """Create a figure whose x axis is set up to display dates.

    Args:
        grid: whether to draw the grid.
        date_format: strftime pattern for the x tick labels.
        figsize: ``(width, height)`` passed to ``plt.subplots``; ``None``
            keeps matplotlib's default size.

    Returns:
        The ``(fig, ax)`` pair created by ``plt.subplots``.
    """
    # One subplots() call only: a second call would leave the first figure
    # open and unused.
    fig, ax = plt.subplots(figsize=figsize)
    ax.grid(grid)

    ax.xaxis.set_major_locator(md.AutoDateLocator())
    ax.xaxis.set_major_formatter(md.DateFormatter(date_format))
    fig.autofmt_xdate()

    return fig, ax
def save_and_close_fig(
    fig: plt.Figure,
    output_path: str,
    has_legend: Optional[bool] = True,
    is_tight: Optional[bool] = True,
) -> None:
    """Finalize the current plot, write it to *output_path* and close *fig*.

    Args:
        fig: figure to close once saved.
        output_path: destination passed to ``plt.savefig``.
        has_legend: when true, draw the legend with ``plt.legend()``.
        is_tight: when true, apply ``plt.tight_layout()`` before saving.
    """
    logging.info(f"plotting : {output_path}...")

    # legend, layout and savefig all go through pyplot, i.e. they act on the
    # current figure; only the final close targets *fig* explicitly
    if has_legend:
        plt.legend()
    if is_tight:
        plt.tight_layout()

    plt.savefig(output_path)
    plt.close(fig)

    logging.info(f"{output_path} plotted")
def get_cumulative_field_by_age(
    dic_data_grouped: Dict[dt, Any], age: str, field: Field
) -> Tuple[np.ndarray, List[dt]]:
    """Accumulate one field for one age bracket over the whole data period.

    Every non-None value found under each vaccine status contributes one
    item, so a date appears once per vaccine status that has a value.

    Returns:
        ``(cumulative values, dates)``, both in iteration order.
    """
    values: List[int] = []
    days: List[dt] = []

    for day, ages in dic_data_grouped.items():
        by_status = ages.get(age)
        if by_status is None:
            logging.error(f"{age} not found in grouped ages")
            continue

        for status_fields in by_status.values():
            amount = status_fields[field.value]
            if amount is None:
                continue
            values.append(amount)
            days.append(day)

    return cumulate_array(np.array(values)), days
def split_by_vac_status(np_data: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """Split the data into vaccinated and unvaccinated parts.

    Args:
        np_data: 4-D array whose third axis (axis 2) is indexed by
            ``VacStatus`` values.

    Returns:
        ``(vaccinated, unvaccinated)``: the sum over axis 2 of every status
        except ``VacStatus.NC``, and the ``VacStatus.NC`` slice itself. Both
        results drop axis 2.
    """
    idx_unvac = VacStatus.NC.value
    np_data_unvac = np_data[:, :, idx_unvac, :]
    # Drop the unvaccinated slice by its enum index instead of a hard-coded
    # "1:" slice, so the split stays right if VacStatus.NC stops being 0.
    np_data_vac = np.sum(np.delete(np_data, idx_unvac, axis=2), axis=2)
    return np_data_vac, np_data_unvac
def get_values_by_age_vac_field(
    dic_data_grouped: Dict[dt, Any], age: AgeGroup, vac_status: VacStatus, field: Field
) -> Tuple[List[dt], List[float]]:
    """Collect one field's values for a given age bracket and vaccine status.

    Dates for which the age, the vaccine status or the field is missing
    (or None) are skipped.

    Returns:
        ``(dates, values)``, aligned item by item.
    """
    days: List[dt] = []
    values: List[float] = []

    for day, ages in dic_data_grouped.items():
        by_status = ages.get(age.value)
        if by_status is None:
            continue
        by_field = by_status.get(vac_status.value)
        if by_field is None:
            continue
        amount = by_field.get(field.value)
        if amount is None:
            continue
        values.append(amount)
        days.append(day)

    return days, values
def get_values_by_age_vac(
    dic_data_grouped: Dict[dt, Any], age: AgeGroup, vac_status: VacStatus
) -> Tuple[List[dt], List[Dict[str, Any]]]:
    """Collect the field dictionaries for a given age bracket and vaccine status.

    Dates for which the age or the vaccine status is missing are skipped.

    Returns:
        ``(dates, field dictionaries)``, aligned item by item.
    """
    days: List[dt] = []
    records: List[Dict[str, Any]] = []

    for day, ages in dic_data_grouped.items():
        by_status = ages.get(age.value)
        if by_status is None:
            continue
        by_field = by_status.get(vac_status.value)
        if by_field is None:
            continue
        records.append(by_field)
        days.append(day)

    return days, records
def plot_cumulative_field(
    np_data: np.ndarray, np_date: np.ndarray, field: Field
) -> None:
    """Plot the cumulative sum of one field over time, per age group.

    Two curves are drawn for each age group: one for all vaccinated statuses
    summed together, one for the unvaccinated status.

    Args:
        np_data: 4-D data array (date, age, vaccine status, field).
        np_date: dates matching the first axis of *np_data*.
        field: indicator to plot.
    """
    fig, _ = get_plot_fig()
    np_data_vac, np_data_unvac = split_by_vac_status(np_data)

    for age_group in AgeGroup:
        # the split removed the vaccine-status axis, leaving (date, age, field)
        np_cumulate_vac: np.ndarray = np.cumsum(
            np_data_vac[:, age_group.value, field.value], axis=0
        )
        np_cumulate_unvac: np.ndarray = np.cumsum(
            np_data_unvac[:, age_group.value, field.value], axis=0
        )
        plt.plot(np_date, np_cumulate_vac, label=f"{age_group.label} vaccinate")
        plt.plot(np_date, np_cumulate_unvac, label=f"{age_group.label} unvaccinate")

    plt.title(f"nombre de {field.label} cumulé par age")
    plt.xlabel("date")
    save_and_close_fig(
        fig, os.path.join(OUTPUT_REPOSITORY, f"cumulative_{field.label}")
    )
def extract_field_values(fields: List[Dict[str, Any]], field: Field) -> np.ndarray:
    """Return the values stored under *field* in each dictionary, as an array.

    Dictionaries where the key is missing or maps to None are skipped.
    """
    return np.asarray(
        [value for item in fields if (value := item.get(field)) is not None]
    )
def plot_fields_by_age_vac(
    np_data: np.ndarray, np_date: np.ndarray, age_group: AgeGroup, vac_status: VacStatus
) -> None:
    """Plot every field over time for one age group and one vaccine status.

    Args:
        np_data: 4-D data array (date, age, vaccine status, field).
        np_date: dates matching the first axis of *np_data*.
        age_group: age bracket to plot.
        vac_status: vaccine status to plot.
    """
    fig, _ = get_plot_fig()

    for field in Field:
        plt.plot(
            np_date,
            np_data[:, age_group.value, vac_status.value, field.value],
            label=f"{field.label}",
        )

    plt.xlabel("date")
    plt.ylabel("nombre")
    plt.title(f"{age_group.label}ans - {vac_status.label}")

    # the output name is built from the enum labels; the old
    # ``{age}_{vac_status}.pdf`` path used names this signature does not have
    save_and_close_fig(
        fig,
        os.path.join(OUTPUT_REPOSITORY, f"all_{age_group.label}_{vac_status.label}"),
    )
def group_by_date_age_vac(
    dic_data_grouped: Dict[dt, Any],
    field: Field,
    is_vac: Optional[bool] = True,
    limit_days: Optional[int] = 30,
) -> Dict[str, Any]:
    """Compute, per date and age, the vaccinated or unvaccinated share of a field.

    Only dates less than *limit_days* days away from now are kept. The share
    is a percentage of the total over all vaccine statuses, and is 0 when
    that total is zero.

    Returns:
        mapping formatted date -> age -> percentage (vaccinated share when
        *is_vac* is true, unvaccinated share otherwise).
    """
    percents: Dict[str, Any] = OrderedDict()

    for day, ages in dic_data_grouped.items():
        if abs((day - dt.now())).days >= limit_days:
            continue

        day_key = day.strftime(DATE_FORMAT)
        percents[day_key] = OrderedDict()

        for age, statuses in ages.items():
            vac_count, unvac_count = 0, 0
            for status, values in statuses.items():
                amount = values.get(field.value, 0)
                if status == VacStatus.NC.value:
                    unvac_count += amount
                else:
                    vac_count += amount

            total = vac_count + unvac_count
            selected = vac_count if is_vac else unvac_count
            try:
                percent = (selected / total) * 100
            except ZeroDivisionError:
                percent = 0
            percents[day_key][age] = percent

    return percents
def plot_bar_data_by_field(
    dic_data_grouped: Dict[dt, Any], field: Field, is_vac: Optional[bool] = True
) -> None:
    """Save a bar graph of one field's vaccine-status share, grouped by age.

    Args:
        dic_data_grouped: mapping date -> age -> vaccine status -> fields.
        field: indicator to plot.
        is_vac: plot the vaccinated share when true, the unvaccinated share
            otherwise.
    """
    # NOTE: this changes the global matplotlib font size for later plots too
    plt.rcParams["font.size"] = "24"

    dic_data = group_by_date_age_vac(dic_data_grouped, field, is_vac=is_vac)
    # transpose so that rows are dates and columns are ages
    df = pd.DataFrame(dic_data).T
    ax = df.plot.bar(figsize=(26, 15))

    # the title follows the plotted share; it used to say "vaccinate" in
    # both cases
    share = "vaccinate" if is_vac else "unvaccinate"
    ax.set_title(f"{field.value} {share} percent grouped by age")
    ax.set_xlabel("date")

    fig = ax.get_figure()
    plt.xticks(rotation=45)
    plt.legend(loc="upper right")
    plt.tight_layout()

    filename = "vac" if is_vac else "unvac"
    fig.savefig(
        os.path.join(OUTPUT_REPOSITORY, f"{filename}_age_grouped_{field.value}.pdf")
    )
    # close the figure so repeated calls do not pile up open figures
    plt.close(fig)
def get_vaccine_percent(np_data: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """Return the vaccinated and unvaccinated shares per date, age and field.

    The vaccinated part covers every vaccine status except the unvaccinated
    one. Shares are fractions of the total over all statuses (between 0 and
    1; callers multiply by 100), and are 0 where that total is zero.

    Args:
        np_data: 4-D data array whose axis 2 is the vaccine status.

    Returns:
        ``(vaccinated share, unvaccinated share)``, both without axis 2.
    """
    np_data_vac, np_data_unvac = split_by_vac_status(np_data)

    # one total for both ratios
    np_total = np.sum(np_data, axis=2)
    has_total = np_total != 0

    # divide only where the total is non-zero; the other cells keep the 0 of
    # the output buffer instead of turning into NaN/inf
    np_percent_vac = np.divide(
        np_data_vac,
        np_total,
        out=np.zeros_like(np_total, dtype=float),
        where=has_total,
    )
    np_percent_unvac = np.divide(
        np_data_unvac,
        np_total,
        out=np.zeros_like(np_total, dtype=float),
        where=has_total,
    )
    return np_percent_vac, np_percent_unvac
def plot_bar_data_by_age_field(
    np_data: np.ndarray,
    np_date: np.ndarray,
    age_group: AgeGroup,
    field: Field,
) -> None:
    """Save a stacked bar graph of vaccinated/unvaccinated percent over time.

    One stacked bar is drawn per date for the given age group and field: the
    vaccinated percent at the bottom, the unvaccinated percent on top. Every
    date in *np_date* is plotted; only one x tick label out of four is shown
    to keep the axis readable.

    Args:
        np_data: 4-D data array (date, age, vaccine status, field).
        np_date: dates matching the first axis of *np_data*.
        age_group: age bracket to plot.
        field: indicator to plot.
    """
    np_percent_vac, np_percent_unvac = get_vaccine_percent(np_data)
    vac_percent = np.round(np_percent_vac[:, age_group.value, field.value] * 100)
    unvac_percent = np.round(
        np_percent_unvac[:, age_group.value, field.value] * 100
    )
    positions = np.arange(len(np_date))

    fig, ax = get_plot_fig(figsize=(22, 8))

    # one bar() call per series instead of two per date
    bar_vac = ax.bar(positions, vac_percent, color="b", label="vac")
    bar_unvac = ax.bar(
        positions, unvac_percent, bottom=vac_percent, color="r", label="novac"
    )
    ax.bar_label(bar_vac, label_type="edge", color="black", fontsize="8")

    ax.set_ylim(top=105)
    ax.set_ylabel("%")
    ax.set_title(f"{age_group.label} - {field.label}")
    ax.set(
        xticks=positions,
        xticklabels=[
            d.strftime(DATE_FORMAT) if idx % 4 == 0 else ""
            for idx, d in enumerate(np_date.astype(dt))
        ],
    )

    # explicit handles so each legend entry is tied to its own series
    ax.legend(
        [bar_vac, bar_unvac], ["vaccinate", "unvaccinate"], loc=0, frameon=True
    )
    save_and_close_fig(
        fig,
        os.path.join(OUTPUT_REPOSITORY, f"vac_percent_{age_group.label}_{field.label}"),
        has_legend=False,
    )
def get_age_vac_args() -> List[Tuple[AgeGroup, VacStatus]]:
"""
get tuple arguments to plot fields data by age and vac status on multiprocess
"""
pool_args: List[Tuple[AgeGroup, VacStatus]] = list()
for age_group in AgeGroup:
@ -310,17 +311,28 @@ def build_data_pool_args() -> List[Tuple[AgeGroup, VacStatus]]:
return pool_args
def get_age_field_args() -> List[Tuple[AgeGroup, Field]]:
    """Build every (age group, field) pair, age groups varying slowest.

    The pairs are meant to be unpacked as arguments by the multiprocess
    plotting calls.
    """
    return [(age_group, field) for age_group in AgeGroup for field in Field]
if __name__ == "__main__":
"""
This script aims to plot DREES data
Plots available:
- cumulative deaths by age
- indicators by vaccine status and age
- indicators vaccine/unvaccine percent grouped by age
- hc, sc, dc by vaccine status and age
- hc, sc, dc (vaccine/unvaccine percent) by age
Main indicators are :
- hospitalisations
- criticals
- deaths
- hospitalisations (hc)
- criticals (sc)
- deaths (dc)
"""
parser = argparse.ArgumentParser()
@ -340,17 +352,18 @@ if __name__ == "__main__":
file_path=os.path.join(DATA_REPOSITORY, "dress.json"), refresh=args.refresh
)
dic_data_grouped: Dict[dt, Any] = group_by_age_date(
dic_data, [x.value for x in Field]
dic_data, [x.label for x in Field]
)
plot_data_pool_args = build_data_pool_args()
f = partial(plot_data_by_age_vac, dic_data_grouped)
with Pool() as pool:
pool.starmap(f, plot_data_pool_args)
np_data, np_date = get_np_data(dic_data_grouped)
plot_fields_args = get_age_vac_args()
f_fields = partial(plot_fields_by_age_vac, np_data, np_date)
plot_vac_percent_age_args = get_age_field_args()
f_bars = partial(plot_bar_data_by_age_field, np_data, np_date)
with Pool(2) as pool:
pool.starmap(f_fields, plot_fields_args)
pool.starmap(f_bars, plot_vac_percent_age_args)
for field in Field:
plot_cumulative_field(dic_data_grouped, field)
for field in Field:
plot_bar_data_by_field(dic_data_grouped, field)
plot_bar_data_by_field(dic_data_grouped, field, is_vac=False)
plot_cumulative_field(np_data, np_date, field)