field by age and vaccine status in case per million + rename functions + fix functions doc

This commit is contained in:
rmanach 2021-12-30 12:03:42 +01:00
parent 88854c8631
commit fc7b84da28
2 changed files with 132 additions and 86 deletions

216
drees.py
View File

@ -61,6 +61,10 @@ class Field(DreesEnum):
DC = (2, "Décés") DC = (2, "Décés")
class Quota(DreesEnum):
    """
    extra per-row quantities stored next to the Field values
    (presumably the headcount of each age / vaccine status group - to confirm)
    """

    # numbered right after the last Field member so Field and Quota values
    # can index the same axis without overlapping
    EFFECTIF = (len(Field), "Effectif")
class VacStatus(DreesEnum): class VacStatus(DreesEnum):
NC = (0, "Non-vaccinés") NC = (0, "Non-vaccinés")
PDR = (1, "Primo dose récente") PDR = (1, "Primo dose récente")
@ -81,6 +85,7 @@ class AgeGroup(DreesEnum):
VERY_OLD = (4, "[80;+]") VERY_OLD = (4, "[80;+]")
# namedtuple used to store stats (could be better...)
VaccineMean = namedtuple("VaccineMean", ["age", "field", "percent"]) VaccineMean = namedtuple("VaccineMean", ["age", "field", "percent"])
AgeMean = namedtuple("AgeMean", ["age", "field", "percent"]) AgeMean = namedtuple("AgeMean", ["age", "field", "percent"])
@ -92,6 +97,7 @@ def get_data(
) -> Dict[str, Any]: ) -> Dict[str, Any]:
""" """
collect covid data by age from DREES collect covid data by age from DREES
src: DATA_URL
""" """
os.makedirs(DATA_REPOSITORY, exist_ok=True) os.makedirs(DATA_REPOSITORY, exist_ok=True)
data_url = DATA_URL.format(extension=extension) data_url = DATA_URL.format(extension=extension)
@ -130,11 +136,14 @@ def get_enum_field(value):
for field in Field: for field in Field:
if field.name.lower() == value: if field.name.lower() == value:
return field.value return field.value
for quota in Quota:
if quota.name.lower() == value:
return quota.value
def group_by_age_date(data: Dict[str, Any]) -> Dict[dt, Any]: def structure_data(data: Dict[str, Any]) -> Dict[dt, Any]:
""" """
group the original dictionnary into a more readable one struture the original dictionnary into a more readable one
'date': { 'date': {
'age' : { 'age' : {
'vac_status' : { 'vac_status' : {
@ -147,35 +156,38 @@ def group_by_age_date(data: Dict[str, Any]) -> Dict[dt, Any]:
} }
""" """
logging.info("restructuring the data...") logging.info("restructuring the data...")
dic_data_grouped: Dict[dt, Any] = OrderedDict() dic_data: Dict[dt, Any] = OrderedDict()
for row in data["records"]: for row in data["records"]:
row_fields = row["fields"] row_fields = row["fields"]
date = dt.strptime(row_fields["date"], DATE_FORMAT) date = dt.strptime(row_fields["date"], DATE_FORMAT)
age = row_fields["age"] age = row_fields["age"]
vac_status = row_fields["vac_statut"] vac_status = row_fields["vac_statut"]
if date not in dic_data_grouped: if date not in dic_data:
dic_data_grouped[date] = OrderedDict() dic_data[date] = OrderedDict()
if age not in dic_data_grouped[date]: if age not in dic_data[date]:
dic_data_grouped[date][age] = OrderedDict() dic_data[date][age] = OrderedDict()
if vac_status not in dic_data_grouped[date][age]: if vac_status not in dic_data[date][age]:
dic_data_grouped[date][age][vac_status] = OrderedDict() dic_data[date][age][vac_status] = OrderedDict()
for field in Field: for field in Field:
field_name = field.name.lower() field_name = field.name.lower()
dic_data_grouped[date][age][vac_status][field_name] = row_fields[field_name] dic_data[date][age][vac_status][field_name] = row_fields[field_name]
for quota in Quota:
quota_name = quota.name.lower()
dic_data[date][age][vac_status][quota_name] = row_fields[quota_name]
logging.info("data restructured") logging.info("data restructured")
return dic_data_grouped return dic_data
def get_np_data(dic_data_grouped: Dict[dt, Any]) -> Tuple[np.ndarray, np.ndarray]: def get_np_data(dic_data: Dict[dt, Any]) -> Tuple[np.ndarray, np.ndarray]:
""" """
store the data in numpy data structure helped by Enum store the data in numpy data structure
""" """
logging.info("storing data in numpy data structure...") logging.info("storing data in numpy data structure...")
np_data = np.empty( np_data = np.empty(
(len(dic_data_grouped), len(AgeGroup), len(VacStatus), len(Field)) (len(dic_data), len(AgeGroup), len(VacStatus), len(Field) + len(Quota))
) )
np_date = np.empty((len(dic_data_grouped)), dtype="datetime64[s]") np_date = np.empty((len(dic_data)), dtype="datetime64[s]")
for idx_date, (date, dic_age) in enumerate(dic_data_grouped.items()): for idx_date, (date, dic_age) in enumerate(dic_data.items()):
np_date[idx_date] = date np_date[idx_date] = date
for age, dic_vac in dic_age.items(): for age, dic_vac in dic_age.items():
idx_age = get_enum_age(age) idx_age = get_enum_age(age)
@ -192,33 +204,38 @@ def get_np_data(dic_data_grouped: Dict[dt, Any]) -> Tuple[np.ndarray, np.ndarray
def split_by_vac_status(np_data: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: def split_by_vac_status(np_data: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
"""
split data into vaccinated data (all vaccine statuses summed) and unvaccinated data (no vaccine)
"""
return np.sum(np_data[:, :, 1:, :], axis=2), np_data[:, :, VacStatus.NC.value, :] return np.sum(np_data[:, :, 1:, :], axis=2), np_data[:, :, VacStatus.NC.value, :]
def get_vaccine_percent(np_data: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: def get_vaccine_status_distribution(
np_data: np.ndarray,
) -> Tuple[np.ndarray, np.ndarray]:
""" """
get the vaccine percent per date, age and field get the vaccine distribution over the whole time period by age and field
the vaccine data holds all the vaccine status except unvaccine the vaccine data holds all the vaccine status except unvaccine
""" """
np_data_vac, np_data_unvac = split_by_vac_status(np_data) np_data_vac, np_data_unvac = split_by_vac_status(np_data)
np_percent_vac = np_data_vac / np.sum(np_data, axis=2) np_vac_distri = np_data_vac / np.sum(np_data, axis=2)
np_percent_unvac = np_data_unvac / np.sum(np_data, axis=2) np_unvac_distri = np_data_unvac / np.sum(np_data, axis=2)
return np_percent_vac, np_percent_unvac return np_vac_distri, np_unvac_distri
def get_percent_age_by_date_field_vac_splited( def get_distribution_age_by_field_and_vac_status(
np_data: np.ndarray, field: Field np_data: np.ndarray, field: Field
) -> Tuple[np.ndarray, np.ndarray]: ) -> Tuple[np.ndarray, np.ndarray]:
""" """
get numpy percent age grouped by date and field splited by vaccine status get distribution age (percent) by field grouped by vaccine status
""" """
np_percent_age_vac = np.empty((len(np_data), len(AgeGroup))) np_age_vac_percent = np.empty((len(np_data), len(AgeGroup)))
np_percent_age_unvac = np.copy(np_percent_age_vac) np_age_unvac_percent = np.copy(np_age_vac_percent)
np_data_vac, np_data_unvac = split_by_vac_status(np_data) np_data_vac, np_data_unvac = split_by_vac_status(np_data)
for idx_date in range(len(np_data_vac)): for idx_date in range(len(np_data_vac)):
sum_effectif = np.nansum(np_data_vac[idx_date, :, field.value]) sum_effectif = np.nansum(np_data_vac[idx_date, :, field.value])
for age_group in AgeGroup: for age_group in AgeGroup:
np_percent_age_vac[idx_date, age_group.value] = np.round( np_age_vac_percent[idx_date, age_group.value] = np.round(
(np_data_vac[idx_date, age_group.value, field.value] / sum_effectif) (np_data_vac[idx_date, age_group.value, field.value] / sum_effectif)
* 100, * 100,
2, 2,
@ -226,17 +243,17 @@ def get_percent_age_by_date_field_vac_splited(
for idx_date in range(len(np_data_unvac)): for idx_date in range(len(np_data_unvac)):
sum_effectif = np.nansum(np_data_unvac[idx_date, :, field.value]) sum_effectif = np.nansum(np_data_unvac[idx_date, :, field.value])
for age_group in AgeGroup: for age_group in AgeGroup:
np_percent_age_unvac[idx_date, age_group.value] = np.round( np_age_unvac_percent[idx_date, age_group.value] = np.round(
(np_data_unvac[idx_date, age_group.value, field.value] / sum_effectif) (np_data_unvac[idx_date, age_group.value, field.value] / sum_effectif)
* 100, * 100,
2, 2,
) )
return np_percent_age_vac, np_percent_age_unvac return np_age_vac_percent, np_age_unvac_percent
def get_percent_age_by_date_field(np_data: np.ndarray, field: Field) -> np.ndarray: def get_distribution_age_by_field(np_data: np.ndarray, field: Field) -> np.ndarray:
""" """
get numpy percent age grouped by date and field get age distribution (percent) over the whole period by field
""" """
np_percent_age = np.empty((len(np_data), len(AgeGroup))) np_percent_age = np.empty((len(np_data), len(AgeGroup)))
for idx_date in range(len(np_data)): for idx_date in range(len(np_data)):
@ -291,33 +308,38 @@ def save_and_close_fig(
def analyse(np_data: np.ndarray) -> List[Union[VaccineMean, AgeMean]]: def analyse(np_data: np.ndarray) -> List[Union[VaccineMean, AgeMean]]:
""" """
analyse data analyse DREES dataset
useful stats can be computed here if no plots are needed
""" """
logging.info("analysing data...") logging.info("analysing data...")
lst_analyse_data: List[Union[VaccineMean, AgeMean]] = list() lst_analyse_data: List[Union[VaccineMean, AgeMean]] = list()
np_percent_vac, _ = get_vaccine_percent(np_data) np_vac_distri, _ = get_vaccine_status_distribution(np_data)
logging.info("--- field by age vaccine mean percent ---") logging.info(
"--- field distribution by age and only vaccine status (averaged over the whole period) ---"
)
for age_group in AgeGroup: for age_group in AgeGroup:
for field in Field: for field in Field:
mean_vac_percent = np.round( vac_percent_mean = np.round(
np.nanmean(np_percent_vac[:, age_group.value, field.value]) * 100, 2 np.nanmean(np_vac_distri[:, age_group.value, field.value]) * 100, 2
) )
print(f"{field.name} - {age_group.label} - vac : {mean_vac_percent}%") print(f"{field.name} - {age_group.label} - vac : {vac_percent_mean}%")
lst_analyse_data.append( lst_analyse_data.append(
VaccineMean(age_group.label, field.label, mean_vac_percent) VaccineMean(age_group.label, field.label, vac_percent_mean)
) )
logging.info("--- age by field and vac status mean percent ---") logging.info(
"--- age distribution by field and vac status (averaged over the whole period) ---"
)
for field in Field: for field in Field:
np_percent_age = get_percent_age_by_date_field(np_data, field) np_age_percent = get_distribution_age_by_field(np_data, field)
( (
np_percent_age_vac, np_percent_age_vac,
np_percent_age_unvac, np_percent_age_unvac,
) = get_percent_age_by_date_field_vac_splited(np_data, field) ) = get_distribution_age_by_field_and_vac_status(np_data, field)
for age_group in AgeGroup: for age_group in AgeGroup:
percent_age_mean = np.round( percent_age_mean = np.round(
np.nanmean(np_percent_age[:, age_group.value]), 2 np.nanmean(np_age_percent[:, age_group.value]), 2
) )
print(f"age: {age_group.label} - field: {field.name} = {percent_age_mean}%") print(f"age: {age_group.label} - field: {field.name} = {percent_age_mean}%")
lst_analyse_data.append( lst_analyse_data.append(
@ -340,18 +362,21 @@ def analyse(np_data: np.ndarray) -> List[Union[VaccineMean, AgeMean]]:
return lst_analyse_data return lst_analyse_data
def plot_bar_age_percent_vac_status_by_field( def plot_bar_age_distribution_by_field_and_vac_status(
np_data_vac_status: np.ndarray, np_data: np.ndarray,
np_date: np.ndarray, np_date: np.ndarray,
field: Field, field: Field,
is_vac: Optional[bool] = True, is_vac: Optional[bool] = True,
) -> None: ) -> None:
"""
plot age distribution (percent) by field and vaccine status
"""
fig, ax = get_plot_fig(figsize=(22, 8), locator=md.WeekdayLocator()) fig, ax = get_plot_fig(figsize=(22, 8), locator=md.WeekdayLocator())
bottom = np_data_vac_status[:, 0] bottom = np_data[:, 0]
suffix = "vac" if is_vac else "unvac" suffix = "vac" if is_vac else "unvac"
title = "Vaccinés" if is_vac else "Non vaccinés" title = "Vaccinés" if is_vac else "Non vaccinés"
for age_group in AgeGroup: for age_group in AgeGroup:
percents_age = np_data_vac_status[:, age_group.value] percents_age = np_data[:, age_group.value]
if age_group.value > 0: if age_group.value > 0:
ax.bar( ax.bar(
np_date, np_date,
@ -381,25 +406,30 @@ def plot_bar_age_percent_vac_status_by_field(
) )
def plot_bar_age_percent_by_field( def plot_bar_age_distribution_by_field(
np_data: np.ndarray, np_date: np.ndarray, field: Field np_data: np.ndarray, np_date: np.ndarray, field: Field
) -> None: ) -> None:
""" """
plot percent vaccinated field group by age bar diagram plot age distribution (percent) by field
""" """
( (
np_percent_age_vac, np_age_vac_percent,
np_percent_age_unvac, np_age_unvac_percent,
) = get_percent_age_by_date_field_vac_splited(np_data, field) ) = get_distribution_age_by_field_and_vac_status(np_data, field)
plot_bar_age_percent_vac_status_by_field(np_percent_age_vac, np_date, field) plot_bar_age_distribution_by_field_and_vac_status(
plot_bar_age_percent_vac_status_by_field( np_age_vac_percent, np_date, field
np_percent_age_unvac, np_date, field, is_vac=False )
plot_bar_age_distribution_by_field_and_vac_status(
np_age_unvac_percent, np_date, field, is_vac=False
) )
def plot_cumulative_field( def plot_cumulative_field(
np_data: np.ndarray, np_date: np.ndarray, field: Field np_data: np.ndarray, np_date: np.ndarray, field: Field
) -> None: ) -> None:
"""
plot cumulative field by age and vaccine status (cases per million)
"""
np_data_vac, np_data_unvac = split_by_vac_status(np_data) np_data_vac, np_data_unvac = split_by_vac_status(np_data)
for age_group in AgeGroup: for age_group in AgeGroup:
fig, _ = get_plot_fig() fig, _ = get_plot_fig()
@ -413,8 +443,8 @@ def plot_cumulative_field(
plt.plot(np_date, np_cumulate_unvac, label=f"Non vaccinés") plt.plot(np_date, np_cumulate_unvac, label=f"Non vaccinés")
plt.title(f"{age_group.label} - {field.label}") plt.title(f"{age_group.label} - {field.label}")
plt.xlabel("date") plt.xlabel("Date")
plt.ylabel("nombre") plt.ylabel("Nombre de cas")
save_and_close_fig( save_and_close_fig(
fig, fig,
os.path.join( os.path.join(
@ -428,18 +458,20 @@ def plot_fields_by_age_vac(
np_data: np.ndarray, np_date: np.ndarray, age_group: AgeGroup, vac_status: VacStatus np_data: np.ndarray, np_date: np.ndarray, age_group: AgeGroup, vac_status: VacStatus
) -> None: ) -> None:
""" """
plot data by vaccine status, age and field plot field data by age and vaccine status (cases per million)
""" """
fig, _ = get_plot_fig() fig, _ = get_plot_fig()
for field in Field: for field in Field:
plt.plot( plt.plot(
np_date, np_date,
np_data[:, age_group.value, vac_status.value, field.value], 10e6
* np_data[:, age_group.value, vac_status.value, field.value]
/ np_data[:, age_group.value, vac_status.value, Quota.EFFECTIF.value],
label=f"{field.label}", label=f"{field.label}",
) )
plt.xlabel("date") plt.xlabel("Date")
plt.ylabel("nombre") plt.ylabel("Cas par million de personnes")
plt.title(f"{age_group.label} - {vac_status.label}") plt.title(f"{age_group.label} - {vac_status.label}")
save_and_close_fig( save_and_close_fig(
@ -450,26 +482,25 @@ def plot_fields_by_age_vac(
) )
def plot_bar_data_by_age_field( def plot_bar_vaccine_status_distribution_by_age_field(
np_data: np.ndarray, np_data: np.ndarray,
np_date: np.ndarray, np_date: np.ndarray,
age_group: AgeGroup, age_group: AgeGroup,
field: Field, field: Field,
) -> None: ) -> None:
""" """
display a bar graph by field and age over the data period display vaccine/unvaccine distribution (percent) over the whole period by age and field
bars display vaccine status percent
""" """
np_percent_vac, np_percent_unvac = get_vaccine_percent(np_data) np_vac_distri, np_unvac_distri = get_vaccine_status_distribution(np_data)
# adjust the fig size to display correctly bars and labels # adjust the fig size to display correctly bars and labels
fig, ax = get_plot_fig(figsize=(22, 8)) fig, ax = get_plot_fig(figsize=(22, 8))
for idx_date in range(len(np_date)): for idx_date in range(len(np_date)):
vac_percent = np.round( vac_percent = np.round(
np_percent_vac[idx_date, age_group.value, field.value] * 100, 2 np_vac_distri[idx_date, age_group.value, field.value] * 100, 2
) )
unvac_percent = np.round( unvac_percent = np.round(
np_percent_unvac[idx_date, age_group.value, field.value] * 100, 2 np_unvac_distri[idx_date, age_group.value, field.value] * 100, 2
) )
bar_vac = ax.bar(idx_date, vac_percent, color="b", label="Vaccinés") bar_vac = ax.bar(idx_date, vac_percent, color="b", label="Vaccinés")
ax.bar( ax.bar(
@ -505,7 +536,7 @@ def plot_bar_data_by_age_field(
def get_age_vac_args() -> List[Tuple[AgeGroup, VacStatus]]: def get_age_vac_args() -> List[Tuple[AgeGroup, VacStatus]]:
""" """
get tuple arguments to plot fields data by age and vac status on multiprocess build pool age and vac status arguments
""" """
pool_args: List[Tuple[AgeGroup, VacStatus]] = list() pool_args: List[Tuple[AgeGroup, VacStatus]] = list()
for age_group in AgeGroup: for age_group in AgeGroup:
@ -516,7 +547,7 @@ def get_age_vac_args() -> List[Tuple[AgeGroup, VacStatus]]:
def get_age_field_args() -> List[Tuple[AgeGroup, Field]]: def get_age_field_args() -> List[Tuple[AgeGroup, Field]]:
""" """
get tuple arguments to plot fields data by age and field on multiprocess build pool age and field arguments
""" """
pool_args: List[Tuple[AgeGroup, Field]] = list() pool_args: List[Tuple[AgeGroup, Field]] = list()
for age_group in AgeGroup: for age_group in AgeGroup:
@ -525,6 +556,16 @@ def get_age_field_args() -> List[Tuple[AgeGroup, Field]]:
return pool_args return pool_args
def get_field_args() -> List[Tuple[Field]]:
    """
    build the pool arguments for per-field plots

    returns one single-element tuple per Field member, in enum order,
    so the list can be passed directly to Pool.starmap
    """
    return [(field,) for field in Field]
def move_tmp_plots() -> None: def move_tmp_plots() -> None:
""" """
move .tmp.png plots into .png after generation move .tmp.png plots into .png after generation
@ -567,12 +608,15 @@ def generate_html_page(
if __name__ == "__main__": if __name__ == "__main__":
""" """
This script aims to plot DRESS data This script aims to analyse and plot DRESS data
Stats available:
- Age distribution (percent) by field (vaccine and unvaccine)
- Vaccine/unvaccine distribution (percent) by field and age
Plots availables : Plots availables :
- cumulative deaths by age - cumulative hc, sc, dc by age and vaccine status
- hc, sc, dc by vaccine status and age - hc, sc, dc by vaccine status and age (cases per million)
- hc, sc, dc (vaccine/unvaccine percent) by age - hc, sc, dc (vaccine/unvaccine percent distribution) by age
- hc, sc, dc (age grouped percent) by field - hc, sc, dc (age percent distribution) by field
Main indicators are : Main indicators are :
- hospitalisations (hc) - hospitalisations (hc)
- criticals (sc) - criticals (sc)
@ -605,27 +649,29 @@ if __name__ == "__main__":
args = parser.parse_args() args = parser.parse_args()
dic_data: Dict[str, Any] = get_data( dic_data_unstructured: Dict[str, Any] = get_data(
file_path=os.path.join(DATA_REPOSITORY, "dress.json"), refresh=args.refresh file_path=os.path.join(DATA_REPOSITORY, "dress.json"), refresh=args.refresh
) )
dic_data_grouped: Dict[dt, Any] = group_by_age_date(dic_data) dic_data: Dict[dt, Any] = structure_data(dic_data_unstructured)
np_data, np_date = get_np_data(dic_data_grouped) np_data, np_date = get_np_data(dic_data)
lst_analyse_data = analyse(np_data) lst_analyse_data = analyse(np_data)
if not args.no_plot: if not args.no_plot:
os.makedirs(OUTPUT_REPOSITORY, exist_ok=True) os.makedirs(OUTPUT_REPOSITORY, exist_ok=True)
plot_fields_args = get_age_vac_args()
f_fields = partial(plot_fields_by_age_vac, np_data, np_date)
plot_vac_percent_age_args = get_age_field_args()
f_bars = partial(plot_bar_data_by_age_field, np_data, np_date)
with Pool(2) as pool:
pool.starmap(f_fields, plot_fields_args)
pool.starmap(f_bars, plot_vac_percent_age_args)
for field in Field: f_fields = partial(plot_fields_by_age_vac, np_data, np_date)
plot_cumulative_field(np_data, np_date, field) f_bars_vaccine = partial(
plot_bar_age_percent_by_field(np_data, np_date, field) plot_bar_vaccine_status_distribution_by_age_field, np_data, np_date
)
f_bars_age = partial(plot_bar_age_distribution_by_field, np_data, np_date)
f_cumulate = partial(plot_cumulative_field, np_data, np_date)
with Pool(2) as pool:
pool.starmap(f_fields, get_age_vac_args())
pool.starmap(f_bars_vaccine, get_age_field_args())
pool.starmap(f_bars_age, get_field_args())
pool.starmap(f_cumulate, get_field_args())
move_tmp_plots() move_tmp_plots()

View File

@ -16,7 +16,7 @@ logging.basicConfig(format=FORMAT, level=logging.INFO)
DATA_URL = "https://covid.ourworldindata.org/data/owid-covid-data.{extension}" DATA_URL = "https://covid.ourworldindata.org/data/owid-covid-data.{extension}"
DATA_REPOSITORY = "data" DATA_REPOSITORY = "data"
OUTPUT_REPOSITORY = "output" OUTPUT_REPOSITORY = "static/plots"
class DataProvider: class DataProvider: