add enums + set numpy data structure + plot vac/unvac percent

This commit is contained in:
rmanach 2021-12-26 20:29:13 +01:00
parent 615094877c
commit 70ac7d1bb7

383
drees.py
View File

@ -3,50 +3,59 @@ import json
import logging import logging
import os import os
from datetime import datetime as dt from datetime import datetime as dt
from enum import Enum from enum import Enum, IntEnum
from functools import partial from functools import partial
from multiprocessing import Pool from multiprocessing import Pool
from typing import Any, Dict, List, Optional, OrderedDict, Tuple from typing import Any, Dict, List, Optional, OrderedDict, Tuple
import numpy as np import numpy as np
import pandas as pd
import requests import requests
from matplotlib import dates as md from matplotlib import dates as md
from matplotlib import pyplot as plt from matplotlib import pyplot as plt
from numba import njit
FORMAT = "%(asctime)s - %(levelname)s - %(message)s" FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(format=FORMAT, level=logging.INFO) logging.basicConfig(format=FORMAT, level=logging.INFO)
DATE_FORMAT = "%Y-%m-%d" DATE_FORMAT = "%Y-%m-%d"
PLOT_FORMAT = "png"
DATA_URL = "https://data.drees.solidarites-sante.gouv.fr/api/records/1.0/search/?dataset=covid-19-resultats-par-age-issus-des-appariements-entre-si-vic-si-dep-et-vac-si&q=&rows=-1&facet=date&facet=vac_statut&facet=age" DATA_URL = "https://data.drees.solidarites-sante.gouv.fr/api/records/1.0/search/?dataset=covid-19-resultats-par-age-issus-des-appariements-entre-si-vic-si-dep-et-vac-si&q=&rows=-1&facet=date&facet=vac_statut&facet=age"
DATA_REPOSITORY = "data" DATA_REPOSITORY = "data"
OUTPUT_REPOSITORY = "output" OUTPUT_REPOSITORY = "output"
class DreesEnum(bytes, Enum):
    """
    base enum pairing an integer index with a DREES label

    each member is declared as a `(value, label)` tuple :
        - `value` is the integer enum value
        - `label` is the string attached to the member
    """

    def __new__(cls, value, label):
        # the bytes payload of the member is the single byte `value`
        obj = bytes.__new__(cls, [value])
        obj._value_ = value
        obj.label = label
        return obj


class Field(DreesEnum):
    """
    indicators : hospitalisations (hc), criticals (sc) and deaths (dc)
    """

    HC = (0, "hc")
    SC = (1, "sc")
    DC = (2, "dc")


class VacStatus(DreesEnum):
    """
    vaccine statuses, index 0 being the unvaccinated one
    """

    NC = (0, "Non-vaccinés")
    PDR = (1, "Primo dose récente")
    PDE = (2, "Primo dose efficace")
    CM3MSR = (3, "Complet de moins de 3 mois - sans rappel")
    CM3MAR = (4, "Complet de moins de 3 mois - avec rappel")
    CM36MSR = (5, "Complet entre 3 mois et 6 mois - sans rappel")
    CM36MAR = (6, "Complet entre 3 mois et 6 mois - avec rappel")
    C6MAR = (7, "Complet de 6 mois et plus - avec rappel")
    C6MSR = (8, "Complet de 6 mois et plus - sans rappel")


class AgeGroup(DreesEnum):
    """
    age groups, ordered from the youngest to the oldest
    """

    VERY_YOUNG = (0, "[0,19]")
    YOUNG = (1, "[20,39]")
    MID_OLD = (2, "[40,59]")
    OLD = (3, "[60,79]")
    VERY_OLD = (4, "[80;+]")
def get_data( def get_data(
@ -57,6 +66,7 @@ def get_data(
""" """
collect covid data by age from DREES collect covid data by age from DREES
""" """
logging.info("fetching data...")
os.makedirs(DATA_REPOSITORY, exist_ok=True) os.makedirs(DATA_REPOSITORY, exist_ok=True)
data_url = DATA_URL.format(extension=extension) data_url = DATA_URL.format(extension=extension)
if data_url.endswith("/"): if data_url.endswith("/"):
@ -76,6 +86,24 @@ def get_data(
return json.load(open(file_path, "rb")) return json.load(open(file_path, "rb"))
def get_enum_vac_status(value):
    """
    return the value of the VacStatus member whose label equals `value`

    raise ValueError if no VacStatus member carries this label
    """
    for vac_status in VacStatus:
        if vac_status.label == value:
            return vac_status.value
    # an implicit None would be read as np.newaxis when used as an array index
    raise ValueError(f"unknown vaccine status label: {value!r}")
def get_enum_age(value):
    """
    return the value of the AgeGroup member whose label equals `value`

    raise ValueError if no AgeGroup member carries this label
    """
    for age_group in AgeGroup:
        if age_group.label == value:
            return age_group.value
    # an implicit None would be read as np.newaxis when used as an array index
    raise ValueError(f"unknown age group label: {value!r}")
def get_enum_field(value):
    """
    return the value of the Field member whose label equals `value`

    raise ValueError if no Field member carries this label
    """
    for field in Field:
        if field.label == value:
            return field.value
    # an implicit None would be read as np.newaxis when used as an array index
    raise ValueError(f"unknown field label: {value!r}")
def group_by_age_date(data: Dict[str, Any], fields: List[str]) -> Dict[dt, Any]: def group_by_age_date(data: Dict[str, Any], fields: List[str]) -> Dict[dt, Any]:
""" """
group the original dictionnary into a more readable one group the original dictionnary into a more readable one
@ -90,6 +118,7 @@ def group_by_age_date(data: Dict[str, Any], fields: List[str]) -> Dict[dt, Any]:
} }
} }
""" """
logging.info("restructuring the data...")
dic_data_grouped: Dict[dt, Any] = OrderedDict() dic_data_grouped: Dict[dt, Any] = OrderedDict()
for row in data["records"]: for row in data["records"]:
row_fields = row["fields"] row_fields = row["fields"]
@ -104,204 +133,176 @@ def group_by_age_date(data: Dict[str, Any], fields: List[str]) -> Dict[dt, Any]:
dic_data_grouped[date][age][vac_status] = OrderedDict() dic_data_grouped[date][age][vac_status] = OrderedDict()
for field in fields: for field in fields:
dic_data_grouped[date][age][vac_status][field] = row_fields[field] dic_data_grouped[date][age][vac_status][field] = row_fields[field]
logging.info("data restructured")
return dic_data_grouped return dic_data_grouped
def get_np_data(dic_data_grouped: Dict[dt, Any]) -> Tuple[np.ndarray, np.ndarray]:
    """
    store the data in numpy data structure helped by Enum

    returns a tuple (np_data, np_date) :
        - np_data is indexed by [date, age group, vaccine status, field],
          the last three indexes being the enum values
        - np_date holds the dates, in the iteration order of the dictionary
    """
    logging.info("storing data in numpy data structure...")
    # zero-filled : a combination missing from the dictionary must not expose
    # uninitialised memory to the sums computed later on
    np_data = np.zeros(
        (len(dic_data_grouped), len(AgeGroup), len(VacStatus), len(Field))
    )
    # every slot is written in the loop below, no need to initialise it
    np_date = np.empty((len(dic_data_grouped)), dtype="datetime64[s]")
    for idx_date, (date, dic_age) in enumerate(dic_data_grouped.items()):
        np_date[idx_date] = date
        for age, dic_vac in dic_age.items():
            idx_age = get_enum_age(age)
            for vac, dic_field in dic_vac.items():
                idx_vac = get_enum_vac_status(vac)
                for field, value in dic_field.items():
                    idx_field = get_enum_field(field)
                    np_data[idx_date, idx_age, idx_vac, idx_field] = value
    logging.info("date and data generated")
    return np_data, np_date
def get_plot_fig(
    grid: Optional[bool] = True,
    date_format: Optional[str] = DATE_FORMAT,
    figsize: Optional[Tuple[int, int]] = None,
) -> Tuple[plt.Figure, plt.Axes]:
    """
    return pyplot fig, ax to plot data over range period with date formatting

    - grid : passed to `ax.grid`
    - date_format : strftime pattern used for the x axis major ticks
    - figsize : passed to `plt.subplots`
    """
    fig, ax = plt.subplots(figsize=figsize)
    ax.grid(grid)
    date_formatter = md.DateFormatter(date_format)
    ax.xaxis.set_major_locator(md.AutoDateLocator())
    ax.xaxis.set_major_formatter(date_formatter)
    fig.autofmt_xdate()
    return fig, ax
def save_and_close_fig(
    fig: plt.Figure,
    output_path: str,
    has_legend: Optional[bool] = True,
    is_tight: Optional[bool] = True,
) -> None:
    """
    save the figure to `output_path` then close it

    - has_legend : draw a legend on the current axes of the figure
    - is_tight : apply a tight layout before saving
    """
    logging.info(f"plotting : {output_path}...")
    # act on the figure received rather than on pyplot's implicit current one
    if has_legend:
        fig.gca().legend()
    if is_tight:
        fig.tight_layout()
    fig.savefig(output_path)
    plt.close(fig)
    logging.info(f"{output_path} plotted")
def split_by_vac_status(np_data: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """
    split the data along the vaccine status axis (axis 2)

    returns a tuple (vaccinated, unvaccinated) :
        - vaccinated : sum over every vaccine status except VacStatus.NC
        - unvaccinated : the VacStatus.NC slice
    """
    idx_unvac = VacStatus.NC.value
    np_data_unvac = np_data[:, :, idx_unvac, :]
    # drop the unvaccinated index by value instead of assuming it is the first
    np_data_vac = np.sum(np.delete(np_data, idx_unvac, axis=2), axis=2)
    return np_data_vac, np_data_unvac
def plot_cumulative_field(
    np_data: np.ndarray, np_date: np.ndarray, field: Field
) -> None:
    """
    plot the cumulative sum of a field over the dates, with one vaccinated
    and one unvaccinated curve per age group
    """
    fig, ax = get_plot_fig()
    np_data_vac, np_data_unvac = split_by_vac_status(np_data)
    # cumulate along the date axis once, for every age group at the same time
    np_cumulate_vac: np.ndarray = np.cumsum(np_data_vac[:, :, field.value], axis=0)
    np_cumulate_unvac: np.ndarray = np.cumsum(
        np_data_unvac[:, :, field.value], axis=0
    )
    for age_group in AgeGroup:
        ax.plot(
            np_date,
            np_cumulate_vac[:, age_group.value],
            label=f"{age_group.label} vaccinate",
        )
        ax.plot(
            np_date,
            np_cumulate_unvac[:, age_group.value],
            label=f"{age_group.label} unvaccinate",
        )

    ax.set_title(f"nombre de {field.label} cumulé par age")
    ax.set_xlabel("date")
    save_and_close_fig(
        fig, os.path.join(OUTPUT_REPOSITORY, f"cumulative_{field.label}")
    )
def plot_fields_by_age_vac(
    np_data: np.ndarray, np_date: np.ndarray, age_group: AgeGroup, vac_status: VacStatus
) -> None:
    """
    plot every field over the dates for one age group and one vaccine status
    """
    fig, ax = get_plot_fig()
    for field in Field:
        ax.plot(
            np_date,
            np_data[:, age_group.value, vac_status.value, field.value],
            label=f"{field.label}",
        )
    ax.set_xlabel("date")
    ax.set_ylabel("nombre")
    ax.set_title(f"{age_group.label}ans - {vac_status.label}")
    save_and_close_fig(
        fig,
        os.path.join(OUTPUT_REPOSITORY, f"all_{age_group.label}_{vac_status.label}"),
    )
def get_vaccine_percent(np_data: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """
    get the vaccine percent per date, age and field
    the vaccine data holds all the vaccine status except unvaccine

    the values returned are ratios (not multiplied by 100),
    a ratio is 0 where the total over the vaccine statuses is 0
    """
    np_data_vac, np_data_unvac = split_by_vac_status(np_data)
    np_total = np.sum(np_data, axis=2)
    has_total = np_total != 0
    # divide only where the total is not null, the other cells keep the 0 of `out`
    np_percent_vac = np.divide(
        np_data_vac,
        np_total,
        out=np.zeros(np_total.shape, dtype=float),
        where=has_total,
    )
    np_percent_unvac = np.divide(
        np_data_unvac,
        np_total,
        out=np.zeros(np_total.shape, dtype=float),
        where=has_total,
    )
    return np_percent_vac, np_percent_unvac
def plot_bar_data_by_age_field(
    np_data: np.ndarray,
    np_date: np.ndarray,
    age_group: AgeGroup,
    field: Field,
) -> None:
    """
    display a stacked bar graph by field and age over the data period
    bars display vaccine status percent, one bar per date
    only one date label out of four is written to keep the x axis readable
    """
    np_percent_vac, np_percent_unvac = get_vaccine_percent(np_data)
    # a non finite ratio (0 / 0) is drawn as an empty bar
    vac_percent = np.round(
        np.nan_to_num(
            np_percent_vac[:, age_group.value, field.value],
            nan=0.0,
            posinf=0.0,
            neginf=0.0,
        )
        * 100
    )
    unvac_percent = np.round(
        np.nan_to_num(
            np_percent_unvac[:, age_group.value, field.value],
            nan=0.0,
            posinf=0.0,
            neginf=0.0,
        )
        * 100
    )

    fig, ax = get_plot_fig(figsize=(22, 8))
    # bars are positioned by index, the dates only serve as tick labels
    positions = np.arange(len(np_date))
    bar_vac = ax.bar(positions, vac_percent, color="b", label="vac")
    bar_unvac = ax.bar(
        positions, unvac_percent, bottom=vac_percent, color="r", label="novac"
    )
    ax.bar_label(bar_vac, label_type="edge", color="black", fontsize="8")

    ax.set_ylim(top=105)
    ax.set_ylabel("%")
    ax.set_title(f"{age_group.label} - {field.label}")
    ax.set(
        xticks=range(len(np_date)),
        xticklabels=[
            d.strftime(DATE_FORMAT) if idx % 4 == 0 else ""
            for idx, d in enumerate(np_date.astype(dt))
        ],
    )
    ax.legend([bar_vac, bar_unvac], ["vaccinate", "unvaccinate"], loc=0, frameon=True)
    save_and_close_fig(
        fig,
        os.path.join(OUTPUT_REPOSITORY, f"vac_percent_{age_group.label}_{field.label}"),
        has_legend=False,
    )
def get_age_vac_args() -> List[Tuple[AgeGroup, VacStatus]]:
"""
get tuple arguments to plot fields data by age and vac status on multiprocess
""" """
pool_args: List[Tuple[AgeGroup, VacStatus]] = list() pool_args: List[Tuple[AgeGroup, VacStatus]] = list()
for age_group in AgeGroup: for age_group in AgeGroup:
@ -310,17 +311,28 @@ def build_data_pool_args() -> List[Tuple[AgeGroup, VacStatus]]:
return pool_args return pool_args
def get_age_field_args() -> List[Tuple[AgeGroup, Field]]:
    """
    get tuple arguments to plot fields data by age and field on multiprocess

    one (age group, field) tuple per combination, age groups varying slowest
    """
    return [(age_group, field) for age_group in AgeGroup for field in Field]
if __name__ == "__main__": if __name__ == "__main__":
""" """
This script aims to plot DRESS data This script aims to plot DRESS data
Plots availables : Plots availables :
- cumulative deaths by age - cumulative deaths by age
- indicators by vaccine status and age - hc, sc, dc by vaccine status and age
- indicators vaccine/unvaccine percent grouped by age - hc, sc, dc (vaccine/unvaccine percent) by age
Main indicators are : Main indicators are :
- hospitalisations - hospitalisations (hc)
- criticals - criticals (sc)
- deaths - deaths (dc)
""" """
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
@ -340,17 +352,18 @@ if __name__ == "__main__":
file_path=os.path.join(DATA_REPOSITORY, "dress.json"), refresh=args.refresh file_path=os.path.join(DATA_REPOSITORY, "dress.json"), refresh=args.refresh
) )
dic_data_grouped: Dict[dt, Any] = group_by_age_date( dic_data_grouped: Dict[dt, Any] = group_by_age_date(
dic_data, [x.value for x in Field] dic_data, [x.label for x in Field]
) )
plot_data_pool_args = build_data_pool_args() np_data, np_date = get_np_data(dic_data_grouped)
f = partial(plot_data_by_age_vac, dic_data_grouped)
with Pool() as pool: plot_fields_args = get_age_vac_args()
pool.starmap(f, plot_data_pool_args) f_fields = partial(plot_fields_by_age_vac, np_data, np_date)
plot_vac_percent_age_args = get_age_field_args()
f_bars = partial(plot_bar_data_by_age_field, np_data, np_date)
with Pool(2) as pool:
pool.starmap(f_fields, plot_fields_args)
pool.starmap(f_bars, plot_vac_percent_age_args)
for field in Field: for field in Field:
plot_cumulative_field(dic_data_grouped, field) plot_cumulative_field(np_data, np_date, field)
for field in Field:
plot_bar_data_by_field(dic_data_grouped, field)
plot_bar_data_by_field(dic_data_grouped, field, is_vac=False)