field by age and vaccine status in case per million + rename functions + fix functions doc

This commit is contained in:
rmanach 2021-12-30 12:03:42 +01:00
parent 88854c8631
commit fc7b84da28
2 changed files with 132 additions and 86 deletions

216
drees.py
View File

@ -61,6 +61,10 @@ class Field(DreesEnum):
DC = (2, "Décés")
class Quota(DreesEnum):
EFFECTIF = (0 + len(Field), "Effectif")
class VacStatus(DreesEnum):
NC = (0, "Non-vaccinés")
PDR = (1, "Primo dose récente")
@ -81,6 +85,7 @@ class AgeGroup(DreesEnum):
VERY_OLD = (4, "[80;+]")
# namedtuple used to store stats (could be better...)
VaccineMean = namedtuple("VaccineMean", ["age", "field", "percent"])
AgeMean = namedtuple("AgeMean", ["age", "field", "percent"])
@ -92,6 +97,7 @@ def get_data(
) -> Dict[str, Any]:
"""
collect covid data by age from DREES
src: DATA_URL
"""
os.makedirs(DATA_REPOSITORY, exist_ok=True)
data_url = DATA_URL.format(extension=extension)
@ -130,11 +136,14 @@ def get_enum_field(value):
for field in Field:
if field.name.lower() == value:
return field.value
for quota in Quota:
if quota.name.lower() == value:
return quota.value
def group_by_age_date(data: Dict[str, Any]) -> Dict[dt, Any]:
def structure_data(data: Dict[str, Any]) -> Dict[dt, Any]:
"""
group the original dictionnary into a more readable one
structure the original dictionary into a more readable one
'date': {
'age' : {
'vac_status' : {
@ -147,35 +156,38 @@ def group_by_age_date(data: Dict[str, Any]) -> Dict[dt, Any]:
}
"""
logging.info("restructuring the data...")
dic_data_grouped: Dict[dt, Any] = OrderedDict()
dic_data: Dict[dt, Any] = OrderedDict()
for row in data["records"]:
row_fields = row["fields"]
date = dt.strptime(row_fields["date"], DATE_FORMAT)
age = row_fields["age"]
vac_status = row_fields["vac_statut"]
if date not in dic_data_grouped:
dic_data_grouped[date] = OrderedDict()
if age not in dic_data_grouped[date]:
dic_data_grouped[date][age] = OrderedDict()
if vac_status not in dic_data_grouped[date][age]:
dic_data_grouped[date][age][vac_status] = OrderedDict()
if date not in dic_data:
dic_data[date] = OrderedDict()
if age not in dic_data[date]:
dic_data[date][age] = OrderedDict()
if vac_status not in dic_data[date][age]:
dic_data[date][age][vac_status] = OrderedDict()
for field in Field:
field_name = field.name.lower()
dic_data_grouped[date][age][vac_status][field_name] = row_fields[field_name]
dic_data[date][age][vac_status][field_name] = row_fields[field_name]
for quota in Quota:
quota_name = quota.name.lower()
dic_data[date][age][vac_status][quota_name] = row_fields[quota_name]
logging.info("data restructured")
return dic_data_grouped
return dic_data
def get_np_data(dic_data_grouped: Dict[dt, Any]) -> Tuple[np.ndarray, np.ndarray]:
def get_np_data(dic_data: Dict[dt, Any]) -> Tuple[np.ndarray, np.ndarray]:
"""
store the data in numpy data structure helped by Enum
store the data in numpy data structure
"""
logging.info("storing data in numpy data structure...")
np_data = np.empty(
(len(dic_data_grouped), len(AgeGroup), len(VacStatus), len(Field))
(len(dic_data), len(AgeGroup), len(VacStatus), len(Field) + len(Quota))
)
np_date = np.empty((len(dic_data_grouped)), dtype="datetime64[s]")
for idx_date, (date, dic_age) in enumerate(dic_data_grouped.items()):
np_date = np.empty((len(dic_data)), dtype="datetime64[s]")
for idx_date, (date, dic_age) in enumerate(dic_data.items()):
np_date[idx_date] = date
for age, dic_vac in dic_age.items():
idx_age = get_enum_age(age)
@ -192,33 +204,38 @@ def get_np_data(dic_data_grouped: Dict[dt, Any]) -> Tuple[np.ndarray, np.ndarray
def split_by_vac_status(np_data: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
"""
split data into a vaccinated part and an unvaccinated part

returns a tuple (vaccinated, unvaccinated):
- vaccinated: sum along axis 2 of every vaccine status from index 1 upward
- unvaccinated: the slice of axis 2 taken at VacStatus.NC.value
NOTE(review): the `1:` slice relies on VacStatus.NC being the status stored
at index 0 - confirm if the enum order ever changes
"""
return np.sum(np_data[:, :, 1:, :], axis=2), np_data[:, :, VacStatus.NC.value, :]
def get_vaccine_percent(np_data: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
def get_vaccine_status_distribution(
np_data: np.ndarray,
) -> Tuple[np.ndarray, np.ndarray]:
"""
get the vaccine percent per date, age and field
get the vaccine distribution over the whole time period by age and field
the vaccine data holds all the vaccine status except unvaccine
"""
np_data_vac, np_data_unvac = split_by_vac_status(np_data)
np_percent_vac = np_data_vac / np.sum(np_data, axis=2)
np_percent_unvac = np_data_unvac / np.sum(np_data, axis=2)
return np_percent_vac, np_percent_unvac
np_vac_distri = np_data_vac / np.sum(np_data, axis=2)
np_unvac_distri = np_data_unvac / np.sum(np_data, axis=2)
return np_vac_distri, np_unvac_distri
def get_percent_age_by_date_field_vac_splited(
def get_distribution_age_by_field_and_vac_status(
np_data: np.ndarray, field: Field
) -> Tuple[np.ndarray, np.ndarray]:
"""
get numpy percent age grouped by date and field splited by vaccine status
get age distribution (percent) by field, split by vaccine status
"""
np_percent_age_vac = np.empty((len(np_data), len(AgeGroup)))
np_percent_age_unvac = np.copy(np_percent_age_vac)
np_age_vac_percent = np.empty((len(np_data), len(AgeGroup)))
np_age_unvac_percent = np.copy(np_age_vac_percent)
np_data_vac, np_data_unvac = split_by_vac_status(np_data)
for idx_date in range(len(np_data_vac)):
sum_effectif = np.nansum(np_data_vac[idx_date, :, field.value])
for age_group in AgeGroup:
np_percent_age_vac[idx_date, age_group.value] = np.round(
np_age_vac_percent[idx_date, age_group.value] = np.round(
(np_data_vac[idx_date, age_group.value, field.value] / sum_effectif)
* 100,
2,
@ -226,17 +243,17 @@ def get_percent_age_by_date_field_vac_splited(
for idx_date in range(len(np_data_unvac)):
sum_effectif = np.nansum(np_data_unvac[idx_date, :, field.value])
for age_group in AgeGroup:
np_percent_age_unvac[idx_date, age_group.value] = np.round(
np_age_unvac_percent[idx_date, age_group.value] = np.round(
(np_data_unvac[idx_date, age_group.value, field.value] / sum_effectif)
* 100,
2,
)
return np_percent_age_vac, np_percent_age_unvac
return np_age_vac_percent, np_age_unvac_percent
def get_percent_age_by_date_field(np_data: np.ndarray, field: Field) -> np.ndarray:
def get_distribution_age_by_field(np_data: np.ndarray, field: Field) -> np.ndarray:
"""
get numpy percent age grouped by date and field
get age distribution (percent) over the whole period by field
"""
np_percent_age = np.empty((len(np_data), len(AgeGroup)))
for idx_date in range(len(np_data)):
@ -291,33 +308,38 @@ def save_and_close_fig(
def analyse(np_data: np.ndarray) -> List[Union[VaccineMean, AgeMean]]:
"""
analyse data
analyse DREES dataset
useful stats can be computed here if no plots are needed
"""
logging.info("analysing data...")
lst_analyse_data: List[Union[VaccineMean, AgeMean]] = list()
np_percent_vac, _ = get_vaccine_percent(np_data)
np_vac_distri, _ = get_vaccine_status_distribution(np_data)
logging.info("--- field by age vaccine mean percent ---")
logging.info(
"--- field distribution by age and only vaccine status (averaged over the whole period) ---"
)
for age_group in AgeGroup:
for field in Field:
mean_vac_percent = np.round(
np.nanmean(np_percent_vac[:, age_group.value, field.value]) * 100, 2
vac_percent_mean = np.round(
np.nanmean(np_vac_distri[:, age_group.value, field.value]) * 100, 2
)
print(f"{field.name} - {age_group.label} - vac : {mean_vac_percent}%")
print(f"{field.name} - {age_group.label} - vac : {vac_percent_mean}%")
lst_analyse_data.append(
VaccineMean(age_group.label, field.label, mean_vac_percent)
VaccineMean(age_group.label, field.label, vac_percent_mean)
)
logging.info("--- age by field and vac status mean percent ---")
logging.info(
"--- age distribution by field and vac status (averaged over the whole period) ---"
)
for field in Field:
np_percent_age = get_percent_age_by_date_field(np_data, field)
np_age_percent = get_distribution_age_by_field(np_data, field)
(
np_percent_age_vac,
np_percent_age_unvac,
) = get_percent_age_by_date_field_vac_splited(np_data, field)
) = get_distribution_age_by_field_and_vac_status(np_data, field)
for age_group in AgeGroup:
percent_age_mean = np.round(
np.nanmean(np_percent_age[:, age_group.value]), 2
np.nanmean(np_age_percent[:, age_group.value]), 2
)
print(f"age: {age_group.label} - field: {field.name} = {percent_age_mean}%")
lst_analyse_data.append(
@ -340,18 +362,21 @@ def analyse(np_data: np.ndarray) -> List[Union[VaccineMean, AgeMean]]:
return lst_analyse_data
def plot_bar_age_percent_vac_status_by_field(
np_data_vac_status: np.ndarray,
def plot_bar_age_distribution_by_field_and_vac_status(
np_data: np.ndarray,
np_date: np.ndarray,
field: Field,
is_vac: Optional[bool] = True,
) -> None:
"""
plot age distribution (percent) by field and vaccine status
"""
fig, ax = get_plot_fig(figsize=(22, 8), locator=md.WeekdayLocator())
bottom = np_data_vac_status[:, 0]
bottom = np_data[:, 0]
suffix = "vac" if is_vac else "unvac"
title = "Vaccinés" if is_vac else "Non vaccinés"
for age_group in AgeGroup:
percents_age = np_data_vac_status[:, age_group.value]
percents_age = np_data[:, age_group.value]
if age_group.value > 0:
ax.bar(
np_date,
@ -381,25 +406,30 @@ def plot_bar_age_percent_vac_status_by_field(
)
def plot_bar_age_percent_by_field(
def plot_bar_age_distribution_by_field(
np_data: np.ndarray, np_date: np.ndarray, field: Field
) -> None:
"""
plot percent vaccinated field group by age bar diagram
plot age distribution (percent) by field
"""
(
np_percent_age_vac,
np_percent_age_unvac,
) = get_percent_age_by_date_field_vac_splited(np_data, field)
plot_bar_age_percent_vac_status_by_field(np_percent_age_vac, np_date, field)
plot_bar_age_percent_vac_status_by_field(
np_percent_age_unvac, np_date, field, is_vac=False
np_age_vac_percent,
np_age_unvac_percent,
) = get_distribution_age_by_field_and_vac_status(np_data, field)
plot_bar_age_distribution_by_field_and_vac_status(
np_age_vac_percent, np_date, field
)
plot_bar_age_distribution_by_field_and_vac_status(
np_age_unvac_percent, np_date, field, is_vac=False
)
def plot_cumulative_field(
np_data: np.ndarray, np_date: np.ndarray, field: Field
) -> None:
"""
plot cumulative field by age and vaccine status (cases per million)
"""
np_data_vac, np_data_unvac = split_by_vac_status(np_data)
for age_group in AgeGroup:
fig, _ = get_plot_fig()
@ -413,8 +443,8 @@ def plot_cumulative_field(
plt.plot(np_date, np_cumulate_unvac, label=f"Non vaccinés")
plt.title(f"{age_group.label} - {field.label}")
plt.xlabel("date")
plt.ylabel("nombre")
plt.xlabel("Date")
plt.ylabel("Nombre de cas")
save_and_close_fig(
fig,
os.path.join(
@ -428,18 +458,20 @@ def plot_fields_by_age_vac(
np_data: np.ndarray, np_date: np.ndarray, age_group: AgeGroup, vac_status: VacStatus
) -> None:
"""
plot data by vaccine status, age and field
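plot field data by age and vaccine status, divided by the group's effectif (intended unit: cases per million)
NOTE(review): the scale factor applied below is 10e6, i.e. 10,000,000, not 1,000,000 (1e6) - confirm and correct the factor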
"""
fig, _ = get_plot_fig()
for field in Field:
plt.plot(
np_date,
np_data[:, age_group.value, vac_status.value, field.value],
10e6
* np_data[:, age_group.value, vac_status.value, field.value]
/ np_data[:, age_group.value, vac_status.value, Quota.EFFECTIF.value],
label=f"{field.label}",
)
plt.xlabel("date")
plt.ylabel("nombre")
plt.xlabel("Date")
plt.ylabel("Cas par million de personnes")
plt.title(f"{age_group.label} - {vac_status.label}")
save_and_close_fig(
@ -450,26 +482,25 @@ def plot_fields_by_age_vac(
)
def plot_bar_data_by_age_field(
def plot_bar_vaccine_status_distribution_by_age_field(
np_data: np.ndarray,
np_date: np.ndarray,
age_group: AgeGroup,
field: Field,
) -> None:
"""
display a bar graph by field and age over the data period
bars display vaccine status percent
display vaccine/unvaccine distribution (percent) over the whole period by age and field
"""
np_percent_vac, np_percent_unvac = get_vaccine_percent(np_data)
np_vac_distri, np_unvac_distri = get_vaccine_status_distribution(np_data)
# adjust the fig size to display correctly bars and labels
fig, ax = get_plot_fig(figsize=(22, 8))
for idx_date in range(len(np_date)):
vac_percent = np.round(
np_percent_vac[idx_date, age_group.value, field.value] * 100, 2
np_vac_distri[idx_date, age_group.value, field.value] * 100, 2
)
unvac_percent = np.round(
np_percent_unvac[idx_date, age_group.value, field.value] * 100, 2
np_unvac_distri[idx_date, age_group.value, field.value] * 100, 2
)
bar_vac = ax.bar(idx_date, vac_percent, color="b", label="Vaccinés")
ax.bar(
@ -505,7 +536,7 @@ def plot_bar_data_by_age_field(
def get_age_vac_args() -> List[Tuple[AgeGroup, VacStatus]]:
"""
get tuple arguments to plot fields data by age and vac status on multiprocess
build pool age and vac status arguments
"""
pool_args: List[Tuple[AgeGroup, VacStatus]] = list()
for age_group in AgeGroup:
@ -516,7 +547,7 @@ def get_age_vac_args() -> List[Tuple[AgeGroup, VacStatus]]:
def get_age_field_args() -> List[Tuple[AgeGroup, Field]]:
"""
get tuple arguments to plot fields data by age and field on multiprocess
build pool age and field arguments
"""
pool_args: List[Tuple[AgeGroup, Field]] = list()
for age_group in AgeGroup:
@ -525,6 +556,16 @@ def get_age_field_args() -> List[Tuple[AgeGroup, Field]]:
return pool_args
def get_field_args() -> List[Tuple[Field]]:
"""
build pool field arguments

returns one single-element tuple per Field member, so the list can be
handed to pool.starmap, which unpacks each tuple as positional arguments
"""
pool_args: List[Tuple[Field]] = list()
for field in Field:
# trailing comma keeps each entry a 1-tuple rather than a bare Field
pool_args.append((field,))
return pool_args
def move_tmp_plots() -> None:
"""
move .tmp.png plots into .png after generation
@ -567,12 +608,15 @@ def generate_html_page(
if __name__ == "__main__":
"""
This script aims to plot DRESS data
This script aims to analyse and plot DREES data
Stats available:
- Age distribution (percent) by field (vaccinated and unvaccinated)
- Vaccinated/unvaccinated distribution (percent) by field and age
Plots available:
- cumulative deaths by age
- hc, sc, dc by vaccine status and age
- hc, sc, dc (vaccine/unvaccine percent) by age
- hc, sc, dc (age grouped percent) by field
- cumulative hc, sc, dc by age and vaccine status
- hc, sc, dc by vaccine status and age (cases per million)
- hc, sc, dc (vaccine/unvaccine percent distribution) by age
- hc, sc, dc (age percent distribution) by field
Main indicators are :
- hospitalisations (hc)
- criticals (sc)
@ -605,27 +649,29 @@ if __name__ == "__main__":
args = parser.parse_args()
dic_data: Dict[str, Any] = get_data(
dic_data_unstructured: Dict[str, Any] = get_data(
file_path=os.path.join(DATA_REPOSITORY, "dress.json"), refresh=args.refresh
)
dic_data_grouped: Dict[dt, Any] = group_by_age_date(dic_data)
np_data, np_date = get_np_data(dic_data_grouped)
dic_data: Dict[dt, Any] = structure_data(dic_data_unstructured)
np_data, np_date = get_np_data(dic_data)
lst_analyse_data = analyse(np_data)
if not args.no_plot:
os.makedirs(OUTPUT_REPOSITORY, exist_ok=True)
plot_fields_args = get_age_vac_args()
f_fields = partial(plot_fields_by_age_vac, np_data, np_date)
plot_vac_percent_age_args = get_age_field_args()
f_bars = partial(plot_bar_data_by_age_field, np_data, np_date)
with Pool(2) as pool:
pool.starmap(f_fields, plot_fields_args)
pool.starmap(f_bars, plot_vac_percent_age_args)
for field in Field:
plot_cumulative_field(np_data, np_date, field)
plot_bar_age_percent_by_field(np_data, np_date, field)
f_fields = partial(plot_fields_by_age_vac, np_data, np_date)
f_bars_vaccine = partial(
plot_bar_vaccine_status_distribution_by_age_field, np_data, np_date
)
f_bars_age = partial(plot_bar_age_distribution_by_field, np_data, np_date)
f_cumulate = partial(plot_cumulative_field, np_data, np_date)
with Pool(2) as pool:
pool.starmap(f_fields, get_age_vac_args())
pool.starmap(f_bars_vaccine, get_age_field_args())
pool.starmap(f_bars_age, get_field_args())
pool.starmap(f_cumulate, get_field_args())
move_tmp_plots()

View File

@ -16,7 +16,7 @@ logging.basicConfig(format=FORMAT, level=logging.INFO)
DATA_URL = "https://covid.ourworldindata.org/data/owid-covid-data.{extension}"
DATA_REPOSITORY = "data"
OUTPUT_REPOSITORY = "output"
OUTPUT_REPOSITORY = "static/plots"
class DataProvider: