Moving the description generation to its own function and expanding it.

2026-05-07 17:13:47 +01:00
parent 92db500bd0
commit 7c72462e53
1 changed files with 135 additions and 62 deletions
--- a/src/facebias/evaluation.py
+++ b/src/facebias/evaluation.py
@@ -31,12 +31,138 @@ def _to_age_bracket(row):
    return "{}-{}".format(d, d + 9)


+def generate_description(dataset: pd.DataFrame, name: str | None = None) -> str:
+    """Generate a human-readable description of `dataset` and its variables.
+
+    For all supported variables in `dataset`, this function lists their names,
+    types, values and descriptive statistics. It also groups the data by each
+    categorical variable and describes the variables that follow it within
+    every group, thus creating a subgroup description as well.
+
+    Note that we only support the variables defined by the `Capability` enum,
+    of which `Capability.AGE` is treated as numeric and all the others as
+    categorical.
+
+    Parameters
+    ----------
+    dataset: pd.DataFrame
+        The table to describe, holding one column per supported variable. Its
+        index name is used to name the counted elements, falling back to
+        "row" when the index is unnamed.
+    name: str, optional
+        The dataset name. Only ever used in the first line of the returned
+        description, for pretty printing.
+
+    Returns
+    -------
+    desc: str
+        The textual description as a human-readable string.
+
+    See Also
+    --------
+    `Capability`
+    """
+    # Column labels always come from `Capability.value`, so lookups and printed
+    # names do not depend on how the enum members happen to be formatted.
+    age = Capability.AGE.value
+    agegroup = Capability.AGEGROUP.value
+    caps = [c.value for c in Capability if c.value in dataset]
+
+    # Fragments are collected in a list and joined once at the end, instead of
+    # growing a string with repeated `+=`.
+    unit = dataset.index.name if dataset.index.name is not None else "row"
+    head = "The dataset" if name is None else f"The dataset {name}"
+    parts = [
+        f"{head} has a total of {len(dataset)} {unit}s, with the following"
+        " supported features/capabilities:"
+    ]
+    parts.extend(f"\n- {c}" for c in caps)
+
+    parts.append("\n\nEach feature/capability has the following types and values:")
+    for c in caps:
+        if c == age:
+            parts.append(f"\n{c}: numeric")
+        else:
+            parts.append(f"\n{c}: categorical")
+            parts.append(f"\n - {sorted(dataset[c].unique())}")
+
+    parts.append("\n\nData distribution statistics.")
+    for c in caps:
+        parts.append(
+            f'\nThe feature/capability "{c}" has the following distribution of values:'
+        )
+        if c == age:
+            col = dataset[c]
+            low, high = col.min(), col.max()
+            p25, p50, p75 = col.quantile(0.25), col.median(), col.quantile(0.75)
+            # NOTE(review): "Interqualtile" is a typo for "Interquartile"; the
+            # emitted text is left untouched in case something matches on it.
+            parts += [
+                f"\n  - min = {low}",
+                f"\n  - max = {high}",
+                f"\n  - mean = {col.mean():.2f}",
+                f"\n  - std = {col.std():.2f}",
+                f"\n  - p25 = {p25}",
+                f"\n  - p50 = {p50}",
+                f"\n  - p75 = {p75}",
+                "\n Interqualtile ranges:",
+                f"\n  - p25-min = {p25 - low}",
+                f"\n  - p50-p25 = {p50 - p25}",
+                f"\n  - p75-p50 = {p75 - p50}",
+                f"\n  - max-p75 = {high - p75}",
+            ]
+        else:
+            counts = dataset[c].value_counts().sort_index()
+            parts.extend(f"\n - {value}: {n}" for value, n in counts.items())
+
+    parts.append("\n\nPer capability/class data distribution statistics.")
+    # AGE is moved to the end of the list: being numeric it cannot be the
+    # grouping variable, and `combinations` never yields the last element as
+    # the first member of a pair. This should really hold for any numerical
+    # variable, but AGE is the only one handled here.
+    ordered = [c for c in caps if c != age] + [c for c in caps if c == age]
+    title = '\nGrouping by "{}", the dataset has the following {} for each value of "{}"'
+
+    for by, target in combinations(ordered, 2):
+        if by == agegroup and target == age:
+            # Statistics of AGE per age group are not reported (presumably
+            # because the age groups are themselves derived from AGE).
+            continue
+
+        gb = dataset.groupby(by)[[target]]
+        if target != age:
+            # Categorical vs. categorical: table of counts, one column per
+            # value of the grouping variable, with missing pairs set to 0.
+            parts.append(title.format(by, "number of elements", target))
+            table = gb.value_counts().sort_index().unstack(level=by)
+            table = table.fillna(0).astype(int)
+        else:
+            # Categorical vs. AGE: order statistics plus the `gini` result.
+            parts.append(title.format(by, "statistics", target))
+            table = pd.concat(
+                [
+                    gb.min(),
+                    gb.quantile(0.25),
+                    gb.median(),
+                    gb.quantile(0.75),
+                    gb.max(),
+                    gb.apply(gini),
+                ],
+                axis=1,
+            )
+            table.columns = ["min", "p25", "p50", "p75", "max", "gini_impurity"]
+
+        parts.append(f"\n{table}\n")
+
+    return "".join(parts)
+
+
 if __name__ == "__main__":
    import os

    logger.info(os.getcwd())

-    DATASET_PATH = Path("../../data/facing2-train/")
+    DATASET_PATH = Path("data/facing2-train/")
    METADATA_PATH = DATASET_PATH / "meta-w-age.csv"

    meta = pd.read_csv(METADATA_PATH, sep=",", index_col="image")
@@ -49,9 +175,8 @@ if __name__ == "__main__":

    # GINI IMPURITY
    # Lower values means a concentration of values around a single class, i.e. bias.
-    age_gini = gini(meta["age"])
+    age_gini = gini(meta[Capability.AGE])

-    # gt_age_group_ord =  meta["age_group"].apply(lambda x: _agegroup_int_map[x])
    agegroup_gini = gini(meta[Capability.AGEGROUP + "_cat"])

    # Should be close to 0.5, indicating a 50/50 split of males and females,
@@ -78,64 +203,12 @@ if __name__ == "__main__":
    gini_per_sex = sex_gb.apply(gini)
    gini_per_agegroup = agegroup_gb.apply(gini)

-    # Prototype textual description of the dataset. To be incorporated into a
-    # "generate_report" function.
-    print(
-        f'The dataset "{DATASET_PATH.name}" has a total of {len(meta)} {meta.index.name}s,'
-        " with the following features/capabilities:"
-    )
-    caps = []
-    for c in Capability:
-        if c.value in meta:
-            caps.append(c)
-            print(f"- {c.value}")
+    # Prototype textual description of the dataset.
+    s = generate_description(meta, name=DATASET_PATH.name)
+    print(s)

-    print("\nEach feature/capability has the following types and values:")
-    for c in caps:
-        if c == Capability.AGE:
-            print(f"{c.value}: numeric")

-        else:
-            print(f"{c.value}: categorical")
-            print(f" - {sorted(meta[c].unique())}")
-
-    print("\nData distribution statistics.")
-    for c in caps:
-        print(f'The feature/capability "{c}" has the following distribution of values:')
-        if c == Capability.AGE:
-            m1 = meta[c].min()
-            m2 = meta[c].max()
-            mean = meta[c].mean()
-            std = meta[c].std()
-            p25 = meta[c].quantile(0.25)
-            median = meta[c].median()
-            p75 = meta[c].quantile(0.75)
-
-            print(
-                f"  - min = {m1}, max = {m2}, mean = {mean:.2f}, std = {std:.2f}"
-                f" p25 = {p25}, p50 = {median}, p75 = {p75}"
-            )
-            print(" Interqualtile ranges:")
-            print(f"  - p25-min = {p25 - m1}")
-            print(f"  - p50-p25 = {median - p25}")
-            print(f"  - p75-p50 = {p75 - median}")
-            print(f"  - max-p75 = {m2 - p75}")
-        else:
-            series = meta[c].value_counts().sort_index()
-            for s in series.index:
-                print(f" - {s}: {series[s]}")
-
-    print("\nPer capability/class data distribution statistics.")
-    for c1, c2 in combinations(caps, 2):
-        if c1 == Capability.AGE:
-            continue
-
-        if c2 != Capability.AGE:
-            gb = meta.groupby(c1)[[c2]]
-            print(
-                f'Grouping by "{c1}", the dataset has the following data distribution for "{c2}"'
-            )
-            print(gb.value_counts().sort_index().unstack(level=c1).fillna(0))
-
-    # Diagnostics of biases in the dataset. To be incorporated into a
-    # "generate_diagnostics" function later on.
+# Diagnostics of biases in the dataset. To be implemented in
+# "generate_bias_diagnostics" later on.
+def generate_bias_diagnostics():
+    """Placeholder for the dataset bias diagnostics; not implemented yet."""