Added prototype code for dataset distribution evaluation.

2026-05-06 16:29:13 +01:00
parent 536c29978d
commit 92db500bd0
1 changed files with 141 additions and 0 deletions
--- a/src/facebias/evaluation.py
+++ b/src/facebias/evaluation.py
@@ -0,0 +1,141 @@
+# -*- coding: utf-8 -*-
+
+"""Dataset/model evaluation functions."""
+
+import logging
+from itertools import permutations, combinations
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+from scipy.stats import entropy
+
+from facebias.estimators import Capability
+from facebias.metrics import gini
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger("facebias:evaluation.py")
+
+
+# TODO(gschardong): Move to the same file as `load_dataset`
+def _to_age_bracket(row):
+    """Return the age-bracket label for the ``"age"`` entry of ``row``.
+
+    ``row`` only needs to support ``row["age"]`` with a value accepted by
+    ``int()``. Ages below 3 map to ``"00-02"``, ages 3 to 9 to ``"03-09"``,
+    ages above 69 to ``"70+"``, and every other age to its decade, e.g.
+    ``"20-29"``.
+    """
+    age = int(row["age"])
+
+    # The youngest and oldest brackets do not follow the decade pattern, so
+    # they are handled first with guard clauses.
+    if age < 3:
+        return "00-02"
+    if age < 10:
+        return "03-09"
+    if age > 69:
+        return "70+"
+
+    decade_start = (age // 10) * 10
+    return f"{decade_start}-{decade_start + 9}"
+
+
+if __name__ == "__main__":
+    # Prototype driver: loads the dataset metadata, computes a few distribution
+    # metrics and prints a plain-text description of the dataset to stdout.
+    import os
+
+    # The dataset path below is relative, so log the working directory to make
+    # path problems easy to diagnose.
+    logger.info(os.getcwd())
+
+    DATASET_PATH = Path("../../data/facing2-train/")
+    METADATA_PATH = DATASET_PATH / "meta-w-age.csv"
+
+    # Metadata table indexed by the "image" column of the CSV file.
+    meta = pd.read_csv(METADATA_PATH, sep=",", index_col="image")
+    # Derive the categorical age-bracket column from the numeric "age" column.
+    meta[Capability.AGEGROUP.value] = meta.apply(_to_age_bracket, axis=1)
+    meta = meta.sort_index()
+    # Integer codes for the categorical columns, stored in "<name>_cat" columns.
+    # NOTE(review): the hard-coded column names used further down ("age_group",
+    # "sex", "age_group_cat", "sex_cat") assume that Capability members behave
+    # like those strings -- confirm against `facebias.estimators.Capability`.
+    meta[Capability.AGEGROUP + "_cat"], _ = pd.factorize(
+        meta[Capability.AGEGROUP], sort=True
+    )
+    meta[Capability.SEX + "_cat"], _ = pd.factorize(meta[Capability.SEX], sort=True)
+
+    # GINI IMPURITY
+    # Lower values mean a concentration of values around a single class, i.e. bias.
+    # NOTE: the metrics computed in this section and in the entropy section are
+    # not used by the printed report below yet.
+    age_gini = gini(meta["age"])
+
+    # gt_age_group_ord =  meta["age_group"].apply(lambda x: _agegroup_int_map[x])
+    agegroup_gini = gini(meta[Capability.AGEGROUP + "_cat"])
+
+    # Should be close to 0.5, indicating a 50/50 split of males and females,
+    # representing maximum uncertainty.
+    sex_gini = gini(meta[Capability.SEX + "_cat"])
+
+    # SHANNON'S ENTROPY
+    # Entropy of the empirical class distribution (relative frequencies) of
+    # each categorical column.
+    count_per_agegroup = meta["age_group"].value_counts()
+    prob_per_agegroup = count_per_agegroup / count_per_agegroup.sum()
+    H_agegroup = entropy(prob_per_agegroup)
+
+    count_per_sex = meta["sex"].value_counts()
+    prob_per_sex = count_per_sex / count_per_sex.sum()
+    H_sex = entropy(prob_per_sex)
+
+    # Now, onto the subgroup metrics.
+    # The goal is to be able to answer the following types of questions:
+    # 1) How many women are in each age-bracket?
+    # 2) Given the population in age-bracket 20-49 years, how is their gender distribution?
+    # 3) Do we need to collect more images of new individuals? If so, what population should we focus on?
+    # Age-bracket codes grouped by sex, and sex codes grouped by age bracket.
+    sex_gb = meta.groupby(Capability.SEX)[["age_group_cat"]]
+    agegroup_gb = meta.groupby(Capability.AGEGROUP)[["sex_cat"]]
+
+    # `gini` is applied to each group separately.
+    gini_per_sex = sex_gb.apply(gini)
+    gini_per_agegroup = agegroup_gb.apply(gini)
+
+    # Prototype textual description of the dataset. To be incorporated into a
+    # "generate_report" function.
+    print(
+        f'The dataset "{DATASET_PATH.name}" has a total of {len(meta)} {meta.index.name}s,'
+        " with the following features/capabilities:"
+    )
+    # Only capabilities that exist as columns in the metadata are reported.
+    caps = []
+    for c in Capability:
+        if c.value in meta:
+            caps.append(c)
+            print(f"- {c.value}")
+
+    print("\nEach feature/capability has the following types and values:")
+    for c in caps:
+        if c == Capability.AGE:
+            print(f"{c.value}: numeric")
+
+        else:
+            print(f"{c.value}: categorical")
+            print(f" - {sorted(meta[c].unique())}")
+
+    print("\nData distribution statistics.")
+    for c in caps:
+        # Use `c.value` (as in the listings above) so the printed name does not
+        # depend on how the enum member itself is formatted.
+        print(
+            f'The feature/capability "{c.value}" has the following distribution of values:'
+        )
+        if c == Capability.AGE:
+            # Summary statistics for the numeric capability.
+            m1 = meta[c].min()
+            m2 = meta[c].max()
+            mean = meta[c].mean()
+            std = meta[c].std()
+            p25 = meta[c].quantile(0.25)
+            median = meta[c].median()
+            p75 = meta[c].quantile(0.75)
+
+            print(
+                f"  - min = {m1}, max = {m2}, mean = {mean:.2f}, std = {std:.2f},"
+                f" p25 = {p25}, p50 = {median}, p75 = {p75}"
+            )
+            # Width of each span between consecutive quartile boundaries.
+            print(" Interquartile ranges:")
+            print(f"  - p25-min = {p25 - m1}")
+            print(f"  - p50-p25 = {median - p25}")
+            print(f"  - p75-p50 = {p75 - median}")
+            print(f"  - max-p75 = {m2 - p75}")
+        else:
+            # Number of rows per class, listed in sorted class order.
+            series = meta[c].value_counts().sort_index()
+            for s in series.index:
+                print(f" - {s}: {series[s]}")
+
+    print("\nPer capability/class data distribution statistics.")
+    # Cross-tabulate pairs of capabilities. Any pair involving AGE is skipped,
+    # and each remaining unordered pair is reported once, in the order of `caps`.
+    for c1, c2 in combinations(caps, 2):
+        if c1 == Capability.AGE:
+            continue
+
+        if c2 != Capability.AGE:
+            gb = meta.groupby(c1)[[c2]]
+            print(
+                f'Grouping by "{c1.value}", the dataset has the following data distribution for "{c2.value}"'
+            )
+            # Counts of `c2` values within each `c1` group, pivoted so that the
+            # `c1` values become columns; absent combinations are shown as 0.
+            print(gb.value_counts().sort_index().unstack(level=c1).fillna(0))
+
+    # Diagnostics of biases in the dataset. To be incorporated into a
+    # "generate_diagnostics" function later on.