From fc7b84da2818d4d35ae99a99a67e778f288563c2 Mon Sep 17 00:00:00 2001 From: rmanach Date: Thu, 30 Dec 2021 12:03:42 +0100 Subject: [PATCH] field by age and vaccine status in case per million + rename functions + fix functions doc --- drees.py | 216 +++++++++++++++++++++++++++++++++---------------------- owid.py | 2 +- 2 files changed, 132 insertions(+), 86 deletions(-) diff --git a/drees.py b/drees.py index 6abb869..57db7cb 100644 --- a/drees.py +++ b/drees.py @@ -61,6 +61,10 @@ class Field(DreesEnum): DC = (2, "Décés") +class Quota(DreesEnum): + EFFECTIF = (0 + len(Field), "Effectif") + + class VacStatus(DreesEnum): NC = (0, "Non-vaccinés") PDR = (1, "Primo dose récente") @@ -81,6 +85,7 @@ class AgeGroup(DreesEnum): VERY_OLD = (4, "[80;+]") +# namedtuple used to store stats (could be better...) VaccineMean = namedtuple("VaccineMean", ["age", "field", "percent"]) AgeMean = namedtuple("AgeMean", ["age", "field", "percent"]) @@ -92,6 +97,7 @@ def get_data( ) -> Dict[str, Any]: """ collect covid data by age from DREES + src: DATA_URL """ os.makedirs(DATA_REPOSITORY, exist_ok=True) data_url = DATA_URL.format(extension=extension) @@ -130,11 +136,14 @@ def get_enum_field(value): for field in Field: if field.name.lower() == value: return field.value + for quota in Quota: + if quota.name.lower() == value: + return quota.value -def group_by_age_date(data: Dict[str, Any]) -> Dict[dt, Any]: +def structure_data(data: Dict[str, Any]) -> Dict[dt, Any]: """ - group the original dictionnary into a more readable one + struture the original dictionnary into a more readable one 'date': { 'age' : { 'vac_status' : { @@ -147,35 +156,38 @@ def group_by_age_date(data: Dict[str, Any]) -> Dict[dt, Any]: } """ logging.info("restructuring the data...") - dic_data_grouped: Dict[dt, Any] = OrderedDict() + dic_data: Dict[dt, Any] = OrderedDict() for row in data["records"]: row_fields = row["fields"] date = dt.strptime(row_fields["date"], DATE_FORMAT) age = row_fields["age"] vac_status = row_fields["vac_statut"] - if date not in dic_data_grouped: - dic_data_grouped[date] = OrderedDict() - if age not in dic_data_grouped[date]: - dic_data_grouped[date][age] = OrderedDict() - if vac_status not in dic_data_grouped[date][age]: - dic_data_grouped[date][age][vac_status] = OrderedDict() + if date not in dic_data: + dic_data[date] = OrderedDict() + if age not in dic_data[date]: + dic_data[date][age] = OrderedDict() + if vac_status not in dic_data[date][age]: + dic_data[date][age][vac_status] = OrderedDict() for field in Field: field_name = field.name.lower() - dic_data_grouped[date][age][vac_status][field_name] = row_fields[field_name] + dic_data[date][age][vac_status][field_name] = row_fields[field_name] + for quota in Quota: + quota_name = quota.name.lower() + dic_data[date][age][vac_status][quota_name] = row_fields[quota_name] logging.info("data restructured") - return dic_data_grouped + return dic_data -def get_np_data(dic_data_grouped: Dict[dt, Any]) -> Tuple[np.ndarray, np.ndarray]: +def get_np_data(dic_data: Dict[dt, Any]) -> Tuple[np.ndarray, np.ndarray]: """ - store the data in numpy data structure helped by Enum + store the data in numpy data structure """ logging.info("storing data in numpy data structure...") np_data = np.empty( - (len(dic_data_grouped), len(AgeGroup), len(VacStatus), len(Field)) + (len(dic_data), len(AgeGroup), len(VacStatus), len(Field) + len(Quota)) ) - np_date = np.empty((len(dic_data_grouped)), dtype="datetime64[s]") - for idx_date, (date, dic_age) in enumerate(dic_data_grouped.items()): + np_date = np.empty((len(dic_data)), dtype="datetime64[s]") + for idx_date, (date, dic_age) in enumerate(dic_data.items()): np_date[idx_date] = date for age, dic_vac in dic_age.items(): idx_age = get_enum_age(age) @@ -192,33 +204,38 @@ def get_np_data(dic_data_grouped: Dict[dt, Any]) -> Tuple[np.ndarray, np.ndarray def split_by_vac_status(np_data: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: + """ + split data to get vaccine data (all vaccine status) and unvaccine data (no vaccine) + """ return np.sum(np_data[:, :, 1:, :], axis=2), np_data[:, :, VacStatus.NC.value, :] -def get_vaccine_percent(np_data: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: +def get_vaccine_status_distribution( + np_data: np.ndarray, +) -> Tuple[np.ndarray, np.ndarray]: """ - get the vaccine percent per date, age and field + get the vaccine distribution over the whole time period by age and field the vaccine data holds all the vaccine status except unvaccine """ np_data_vac, np_data_unvac = split_by_vac_status(np_data) - np_percent_vac = np_data_vac / np.sum(np_data, axis=2) - np_percent_unvac = np_data_unvac / np.sum(np_data, axis=2) - return np_percent_vac, np_percent_unvac + np_vac_distri = np_data_vac / np.sum(np_data, axis=2) + np_unvac_distri = np_data_unvac / np.sum(np_data, axis=2) + return np_vac_distri, np_unvac_distri -def get_percent_age_by_date_field_vac_splited( +def get_distribution_age_by_field_and_vac_status( np_data: np.ndarray, field: Field ) -> Tuple[np.ndarray, np.ndarray]: """ - get numpy percent age grouped by date and field splited by vaccine status + get distribution age (percent) by field grouped by vaccine status """ - np_percent_age_vac = np.empty((len(np_data), len(AgeGroup))) - np_percent_age_unvac = np.copy(np_percent_age_vac) + np_age_vac_percent = np.empty((len(np_data), len(AgeGroup))) + np_age_unvac_percent = np.copy(np_age_vac_percent) np_data_vac, np_data_unvac = split_by_vac_status(np_data) for idx_date in range(len(np_data_vac)): sum_effectif = np.nansum(np_data_vac[idx_date, :, field.value]) for age_group in AgeGroup: - np_percent_age_vac[idx_date, age_group.value] = np.round( + np_age_vac_percent[idx_date, age_group.value] = np.round( (np_data_vac[idx_date, age_group.value, field.value] / sum_effectif) * 100, 2, @@ -226,17 +243,17 @@ def get_percent_age_by_date_field_vac_splited( for idx_date in range(len(np_data_unvac)): sum_effectif = np.nansum(np_data_unvac[idx_date, :, field.value]) for age_group in AgeGroup: - np_percent_age_unvac[idx_date, age_group.value] = np.round( + np_age_unvac_percent[idx_date, age_group.value] = np.round( (np_data_unvac[idx_date, age_group.value, field.value] / sum_effectif) * 100, 2, ) - return np_percent_age_vac, np_percent_age_unvac + return np_age_vac_percent, np_age_unvac_percent -def get_percent_age_by_date_field(np_data: np.ndarray, field: Field) -> np.ndarray: +def get_distribution_age_by_field(np_data: np.ndarray, field: Field) -> np.ndarray: """ - get numpy percent age grouped by date and field + get age distribution (percent) over the whole period by field """ np_percent_age = np.empty((len(np_data), len(AgeGroup))) for idx_date in range(len(np_data)): @@ -291,33 +308,38 @@ def save_and_close_fig( def analyse(np_data: np.ndarray) -> List[Union[VaccineMean, AgeMean]]: """ - analyse data + analyse DREES dataset + useful stats can be compute here if no plots needed """ logging.info("analysing data...") lst_analyse_data: List[Union[VaccineMean, AgeMean]] = list() - np_percent_vac, _ = get_vaccine_percent(np_data) + np_vac_distri, _ = get_vaccine_status_distribution(np_data) - logging.info("--- field by age vaccine mean percent ---") + logging.info( + "--- field distribution by age and only vaccine status (averaged over the whole period) ---" + ) for age_group in AgeGroup: for field in Field: - mean_vac_percent = np.round( - np.nanmean(np_percent_vac[:, age_group.value, field.value]) * 100, 2 + vac_percent_mean = np.round( + np.nanmean(np_vac_distri[:, age_group.value, field.value]) * 100, 2 ) - print(f"{field.name} - {age_group.label} - vac : {mean_vac_percent}%") + print(f"{field.name} - {age_group.label} - vac : {vac_percent_mean}%") lst_analyse_data.append( - VaccineMean(age_group.label, field.label, mean_vac_percent) + VaccineMean(age_group.label, field.label, vac_percent_mean) ) - logging.info("--- age by field and vac status mean percent ---") + logging.info( + "--- age distribution by field and vac status (averaged over the whole period) ---" + ) for field in Field: - np_percent_age = get_percent_age_by_date_field(np_data, field) + np_age_percent = get_distribution_age_by_field(np_data, field) ( np_percent_age_vac, np_percent_age_unvac, - ) = get_percent_age_by_date_field_vac_splited(np_data, field) + ) = get_distribution_age_by_field_and_vac_status(np_data, field) for age_group in AgeGroup: percent_age_mean = np.round( - np.nanmean(np_percent_age[:, age_group.value]), 2 + np.nanmean(np_age_percent[:, age_group.value]), 2 ) print(f"age: {age_group.label} - field: {field.name} = {percent_age_mean}%") lst_analyse_data.append( @@ -340,18 +362,21 @@ def analyse(np_data: np.ndarray) -> List[Union[VaccineMean, AgeMean]]: return lst_analyse_data -def plot_bar_age_percent_vac_status_by_field( - np_data_vac_status: np.ndarray, +def plot_bar_age_distribution_by_field_and_vac_status( + np_data: np.ndarray, np_date: np.ndarray, field: Field, is_vac: Optional[bool] = True, ) -> None: + """ + plot age distribution distribution (percent) by field and vaccine status + """ fig, ax = get_plot_fig(figsize=(22, 8), locator=md.WeekdayLocator()) - bottom = np_data_vac_status[:, 0] + bottom = np_data[:, 0] suffix = "vac" if is_vac else "unvac" title = "Vaccinés" if is_vac else "Non vaccinés" for age_group in AgeGroup: - percents_age = np_data_vac_status[:, age_group.value] + percents_age = np_data[:, age_group.value] if age_group.value > 0: ax.bar( np_date, @@ -381,25 +406,30 @@ def plot_bar_age_percent_vac_status_by_field( ) -def plot_bar_age_percent_by_field( +def plot_bar_age_distribution_by_field( np_data: np.ndarray, np_date: np.ndarray, field: Field ) -> None: """ - plot percent vaccinated field group by age bar diagram + plot age distribution (percent) by field """ ( - np_percent_age_vac, - np_percent_age_unvac, - ) = get_percent_age_by_date_field_vac_splited(np_data, field) - plot_bar_age_percent_vac_status_by_field(np_percent_age_vac, np_date, field) - plot_bar_age_percent_vac_status_by_field( - np_percent_age_unvac, np_date, field, is_vac=False + np_age_vac_percent, + np_age_unvac_percent, + ) = get_distribution_age_by_field_and_vac_status(np_data, field) + plot_bar_age_distribution_by_field_and_vac_status( + np_age_vac_percent, np_date, field + ) + plot_bar_age_distribution_by_field_and_vac_status( + np_age_unvac_percent, np_date, field, is_vac=False ) def plot_cumulative_field( np_data: np.ndarray, np_date: np.ndarray, field: Field ) -> None: + """ + plot cumulative field by age and vaccine status (cases per million) + """ np_data_vac, np_data_unvac = split_by_vac_status(np_data) for age_group in AgeGroup: fig, _ = get_plot_fig() @@ -413,8 +443,8 @@ def plot_cumulative_field( plt.plot(np_date, np_cumulate_unvac, label=f"Non vaccinés") plt.title(f"{age_group.label} - {field.label}") - plt.xlabel("date") - plt.ylabel("nombre") + plt.xlabel("Date") + plt.ylabel("Nombre de cas") save_and_close_fig( fig, os.path.join( @@ -428,18 +458,20 @@ def plot_fields_by_age_vac( np_data: np.ndarray, np_date: np.ndarray, age_group: AgeGroup, vac_status: VacStatus ) -> None: """ - plot data by vaccine status, age and field + plot field data by age and vaccine status (cases per million) """ fig, _ = get_plot_fig() for field in Field: plt.plot( np_date, - np_data[:, age_group.value, vac_status.value, field.value], + 10e6 + * np_data[:, age_group.value, vac_status.value, field.value] + / np_data[:, age_group.value, vac_status.value, Quota.EFFECTIF.value], label=f"{field.label}", ) - plt.xlabel("date") - plt.ylabel("nombre") + plt.xlabel("Date") + plt.ylabel("Cas par million de personnes") plt.title(f"{age_group.label} - {vac_status.label}") save_and_close_fig( @@ -450,26 +482,25 @@ def plot_fields_by_age_vac( ) -def plot_bar_data_by_age_field( +def plot_bar_vaccine_status_distribution_by_age_field( np_data: np.ndarray, np_date: np.ndarray, age_group: AgeGroup, field: Field, ) -> None: """ - display a bar graph by field and age over the data period - bars display vaccine status percent + display vaccine/unvaccine distribution (percent) over the whole period by age and field """ - np_percent_vac, np_percent_unvac = get_vaccine_percent(np_data) + np_vac_distri, np_unvac_distri = get_vaccine_status_distribution(np_data) # adjust the fig size to display correctly bars and labels fig, ax = get_plot_fig(figsize=(22, 8)) for idx_date in range(len(np_date)): vac_percent = np.round( - np_percent_vac[idx_date, age_group.value, field.value] * 100, 2 + np_vac_distri[idx_date, age_group.value, field.value] * 100, 2 ) unvac_percent = np.round( - np_percent_unvac[idx_date, age_group.value, field.value] * 100, 2 + np_unvac_distri[idx_date, age_group.value, field.value] * 100, 2 ) bar_vac = ax.bar(idx_date, vac_percent, color="b", label="Vaccinés") ax.bar( @@ -505,7 +536,7 @@ def plot_bar_data_by_age_field( def get_age_vac_args() -> List[Tuple[AgeGroup, VacStatus]]: """ - get tuple arguments to plot fields data by age and vac status on multiprocess + build pool age and vac status arguments """ pool_args: List[Tuple[AgeGroup, VacStatus]] = list() for age_group in AgeGroup: @@ -516,7 +547,7 @@ def get_age_vac_args() -> List[Tuple[AgeGroup, VacStatus]]: def get_age_field_args() -> List[Tuple[AgeGroup, Field]]: """ - get tuple arguments to plot fields data by age and field on multiprocess + build pool age and field arguments """ pool_args: List[Tuple[AgeGroup, Field]] = list() for age_group in AgeGroup: @@ -525,6 +556,16 @@ def get_age_field_args() -> List[Tuple[AgeGroup, Field]]: return pool_args +def get_field_args() -> List[Tuple[Field]]: + """ + build pool field arguments + """ + pool_args: List[Tuple[Field]] = list() + for field in Field: + pool_args.append((field,)) + return pool_args + + def move_tmp_plots() -> None: """ move .tmp.png plots into .png after generation @@ -567,12 +608,15 @@ def generate_html_page( if __name__ == "__main__": """ - This script aims to plot DRESS data + This script aims to analyse and plot DRESS data + Stats availables: + - Age distribution (percent) by field (vaccine and unvaccine) + - Vaccine/unvaccine distribution (percent) by field and age Plots availables : - - cumulative deaths by age - - hc, sc, dc by vaccine status and age - - hc, sc, dc (vaccine/unvaccine percent) by age - - hc, sc, dc (age grouped percent) by field + - cumulative hc, sc, dc by age and vaccine status + - hc, sc, dc by vaccine status and age (cases per million) + - hc, sc, dc (vaccine/unvaccine percent distribution) by age + - hc, sc, dc (age percent distribution) by field Main indicators are : - hospitalisations (hc) - criticals (sc) @@ -605,27 +649,29 @@ if __name__ == "__main__": args = parser.parse_args() - dic_data: Dict[str, Any] = get_data( + dic_data_unstructured: Dict[str, Any] = get_data( file_path=os.path.join(DATA_REPOSITORY, "dress.json"), refresh=args.refresh ) - dic_data_grouped: Dict[dt, Any] = group_by_age_date(dic_data) - np_data, np_date = get_np_data(dic_data_grouped) + dic_data: Dict[dt, Any] = structure_data(dic_data_unstructured) + np_data, np_date = get_np_data(dic_data) lst_analyse_data = analyse(np_data) if not args.no_plot: os.makedirs(OUTPUT_REPOSITORY, exist_ok=True) - plot_fields_args = get_age_vac_args() - f_fields = partial(plot_fields_by_age_vac, np_data, np_date) - plot_vac_percent_age_args = get_age_field_args() - f_bars = partial(plot_bar_data_by_age_field, np_data, np_date) - with Pool(2) as pool: - pool.starmap(f_fields, plot_fields_args) - pool.starmap(f_bars, plot_vac_percent_age_args) - for field in Field: - plot_cumulative_field(np_data, np_date, field) - plot_bar_age_percent_by_field(np_data, np_date, field) + f_fields = partial(plot_fields_by_age_vac, np_data, np_date) + f_bars_vaccine = partial( + plot_bar_vaccine_status_distribution_by_age_field, np_data, np_date + ) + f_bars_age = partial(plot_bar_age_distribution_by_field, np_data, np_date) + f_cumulate = partial(plot_cumulative_field, np_data, np_date) + + with Pool(2) as pool: + pool.starmap(f_fields, get_age_vac_args()) + pool.starmap(f_bars_vaccine, get_age_field_args()) + pool.starmap(f_bars_age, get_field_args()) + pool.starmap(f_cumulate, get_field_args()) move_tmp_plots() diff --git a/owid.py b/owid.py index 4aba6f8..d92f78e 100644 --- a/owid.py +++ b/owid.py @@ -16,7 +16,7 @@ logging.basicConfig(format=FORMAT, level=logging.INFO) DATA_URL = "https://covid.ourworldindata.org/data/owid-covid-data.{extension}" DATA_REPOSITORY = "data" -OUTPUT_REPOSITORY = "output" +OUTPUT_REPOSITORY = "static/plots" class DataProvider: