def generate_description(dataset: pd.DataFrame, name: str | None = None) -> str:
    """Generate a textual description of `dataset`, its variables and values.

    For every supported variable found in `dataset`, the description lists its
    name, its type (numeric or categorical), its values and its descriptive
    statistics. Each pair of variables is then cross-tabulated (or, when the
    second variable is the numeric age, summarised per group), which yields a
    subgroup description as well.

    Only the variables defined by the `Capability` enum are supported; any
    other column of `dataset` is ignored.

    Parameters
    ----------
    dataset: pd.DataFrame
        The metadata table to describe. Columns whose names match a
        `Capability` value are described. The index name is used as the noun
        for the counted elements; "row" is used when the index is unnamed.

    name: str, optional
        The dataset name. Only ever used in the first line of the returned
        description, for pretty printing.

    Returns
    -------
    desc: str
        The textual description as a human-readable string.

    See Also
    --------
    `Capability`
    """
    # Use the plain column names (enum *values*) for both text and pandas
    # lookups. Formatting an Enum member directly is Python-version dependent
    # for str-mixin enums (3.11+ renders "Capability.AGE"), which made the
    # headings inconsistent with the bullet list that already used `.value`.
    age = Capability.AGE.value
    agegroup = Capability.AGEGROUP.value
    columns = [c.value for c in Capability if c.value in dataset]

    subject = "The dataset" if name is None else f"The dataset {name}"
    # An unnamed index would otherwise be rendered as "Nones".
    unit = dataset.index.name if dataset.index.name is not None else "row"

    # Collect the pieces and join once at the end instead of repeated `+=`.
    parts = [
        f"{subject} has a total of {len(dataset)} {unit}s,"
        " with the following supported features/capabilities:"
    ]
    parts.extend(f"\n- {col}" for col in columns)

    parts.append("\n\nEach feature/capability has the following types and values:")
    for col in columns:
        # Age is the only numeric variable; everything else is categorical.
        if col == age:
            parts.append(f"\n{col}: numeric")
        else:
            parts.append(f"\n{col}: categorical")
            parts.append(f"\n - {sorted(dataset[col].unique())}")

    parts.append("\n\nData distribution statistics.")
    for col in columns:
        parts.append(
            f'\nThe feature/capability "{col}" has the following distribution of values:'
        )
        values = dataset[col]
        if col == age:
            low = values.min()
            high = values.max()
            p25 = values.quantile(0.25)
            p50 = values.median()
            p75 = values.quantile(0.75)
            parts.extend(
                [
                    f"\n - min = {low}",
                    f"\n - max = {high}",
                    f"\n - mean = {values.mean():.2f}",
                    f"\n - std = {values.std():.2f}",
                    f"\n - p25 = {p25}",
                    f"\n - p50 = {p50}",
                    f"\n - p75 = {p75}",
                    "\n Interqualtile ranges:",
                    f"\n - p25-min = {p25 - low}",
                    f"\n - p50-p25 = {p50 - p25}",
                    f"\n - p75-p50 = {p75 - p50}",
                    f"\n - max-p75 = {high - p75}",
                ]
            )
        else:
            counts = values.value_counts().sort_index()
            parts.extend(f"\n - {label}: {count}" for label, count in counts.items())

    parts.append("\n\nPer capability/class data distribution statistics.")
    if age in columns:
        # Age is numeric and therefore cannot be the grouping variable. Moving
        # it to the end guarantees it only ever appears as the second element
        # of a combination, so no extra "grouping by age" guard is needed.
        columns.remove(age)
        columns.append(age)

    template = (
        '\nGrouping by "{}", the dataset has the following {} for each value of "{}"'
    )
    for by, target in combinations(columns, 2):
        grouped = dataset.groupby(by)[[target]]

        # Here we test for age, but this should really be a test for any
        # numerical variable.
        if target != age:
            # Two categorical variables: contingency table of counts.
            parts.append(template.format(by, "number of elements", target))
            table = grouped.value_counts().sort_index().unstack(level=by)
            table = table.fillna(0).astype(int)
        elif by != agegroup:
            # Categorical vs. age: per-group summary statistics.
            parts.append(template.format(by, "statistics", target))
            table = pd.concat(
                [
                    grouped.min(),
                    grouped.quantile(0.25),
                    grouped.median(),
                    grouped.quantile(0.75),
                    grouped.max(),
                    grouped.apply(gini),
                ],
                axis=1,
            )
            table.columns = ["min", "p25", "p50", "p75", "max", "gini_impurity"]
        else:
            # Age group vs. age is skipped.
            continue

        parts.append(f"\n{table}\n")

    return "".join(parts)
# Diagnostics of biases in the dataset. To be incorporated into a
# "generate_diagnostics" function later on.
def generate_bias_diagnostics():
    """Placeholder for the dataset bias diagnostics; currently does nothing."""
    return None