From f9798b9cf885b30c1cf10a804cac988e930af984 Mon Sep 17 00:00:00 2001
From: Guilherme Schardong <guilherme.schardong@isr.uc.pt>
Date: Fri, 29 May 2026 15:51:30 +0100
Subject: [PATCH] Adding some missing files from the last commits.

---
 .gitignore                               |   5 +-
 README.md                                |   6 +
 src/facebias/__init__.py                 | 167 ++++++++++++++++++++++-
 src/facebias/estimators/mivolo/README.md |   3 +
 4 files changed, 175 insertions(+), 6 deletions(-)
 create mode 100644 README.md
 create mode 100644 src/facebias/estimators/mivolo/README.md

diff --git a/.gitignore b/.gitignore
index b95055f..5db814b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,4 +2,7 @@
 .vscode
 *.egg-info
 landmark_models
-*__pycache__/
\ No newline at end of file
+*__pycache__/
+data/*
+models/*
+gpt*.py
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..840e35d
--- /dev/null
+++ b/README.md
@@ -0,0 +1,6 @@
+# ISR Face Dataset/Model Bias Check API
+
+We use [FairFace](https://github.com/dchen236/FairFace) and the face-only checkpoint of [MiVOLO](https://github.com/wildchlamydia/mivolo) version 1.
+
+## Dataset
+Download the tar file `VISTEAM-NAS/Public_Data/facing2-skin-tone-train-images.tar.bz2` to the `data` directory and extract it. This dataset is balanced with respect to sex and skin tone, but unbalanced with respect to age.
diff --git a/src/facebias/__init__.py b/src/facebias/__init__.py
index 3b5982f..a2299e7 100644
--- a/src/facebias/__init__.py
+++ b/src/facebias/__init__.py
@@ -23,7 +23,10 @@ class FaceBox:
 
 # TODO(gschardong): Convert all CSV reading functions to pandas
 
-def load_metadata(p: Path, key_id="image", key_proc_fn=None) -> dict[str, dict[str, str]]:
+
+def load_metadata(
+    p: Path, key_id="image", key_proc_fn=None
+) -> dict[str, dict[str, str]]:
     lines = []
     with open(p, newline="") as csvfile:
         dialect = csv.Sniffer().sniff(csvfile.read(1024))
@@ -41,9 +44,7 @@ def load_metadata(p: Path, key_id="image", key_proc_fn=None) -> dict[str, dict[s
 
 
 def load_dataset(
-        root: Path,
-        meta_path: Path | None,
-        imname_proc_fn: Callable |None
+    root: Path, meta_path: Path | None, imname_proc_fn: Callable | None
 ) -> tuple[dict[str, np.ndarray], dict[str, dict[str, Any]] | None]:
     """
     if `meta_path` is `None`, we won't attempt to read it.
@@ -83,7 +84,9 @@ def load_dataset(
         except cv2.error:
             logger.info(f'File "{p}" is not an image. Skipping.')
         else:
-            proc_imname = imname_proc_fn(p.name) if imname_proc_fn is not None else str(p.name)
+            proc_imname = (
+                imname_proc_fn(p.name) if imname_proc_fn is not None else str(p.name)
+            )
             ims[proc_imname] = im
 
     if not metadata:
@@ -92,3 +95,157 @@ def load_dataset(
             logger.error(f'Metadata file not found at "{meta_path}".')
 
     return ims, metadata
+
+
+# def calc_model_performance(
+#     gt: pd.DataFrame,
+#     preds: pd.DataFrame,
+#     keys: list[str] | None = None,
+#     possible_caps: dict[Capability, Any] | None = None,
+# ) -> pd.DataFrame:
+#     """
+#     We assume that both `gt` and `preds` have the same structure. They should
+#     be indexed by individual ID, such as the image name, and each value is a
+#     dictionary with model prediction capabilities as keys (e.g., "age_group",
+#     "sex", "skin-color", etc.), and the values are the predictions, or ground-truth
+#     values for each ID/capability.
+
+#     If `keys` is `None`, the capabilities are inferred from the columns common to `preds` and `gt`.
+
+#     Parameters
+#     ----------
+#     gt: pd.DataFrame
+#     preds: pd.DataFrame
+#     keys: list[str] | None
+
+#     Returns
+#     -------
+#     metrics: pd.DataFrame
+#     """
+#     common_caps = keys
+#     if keys is None:
+#         common_caps = set(gt.columns) & set(preds.columns)
+#         if not common_caps:
+#             logger.error(
+#                 f'No common capabilities found. Predictions has "{preds.columns}",'
+#                 f' ground-truth has "{gt.columns}".'
+#             )
+#             return None
+
+#     # Finding common images between predictions and ground-truth.
+#     common_inds = set(preds.index) & set(gt.index)
+#     if not common_inds:
+#         logger.error("No common images found between predictions and ground-truth.")
+#         return None
+
+#     metric_vals = dict()
+#     for cap in common_caps:
+#         if isinstance(preds[cap].iloc[0], (float, int)):
+#             metric_vals[cap] = {
+#                 "mean_absolute_error": mean_absolute_error(gt[cap], preds[cap]),
+#                 "max_error": max_error(gt[cap], preds[cap]),
+#             }
+#         else:
+#             labels = possible_caps[cap]
+#             if possible_caps is None:
+#                 labels = sorted(list(set(preds[cap].unique()) | set(gt[cap].unique())))
+#             metric_vals[cap] = {
+#                 "accuracy": accuracy_score(gt[cap], preds[cap]),
+#                 "balanced_accuracy": balanced_accuracy_score(gt[cap], preds[cap]),
+#                 "cohen-kappa": cohen_kappa_score(gt[cap], preds[cap], labels=labels),
+#             }
+
+#     return pd.DataFrame.from_dict(metric_vals)
+
+
+# def calc_metrics_per_subgroup(
+#     gt: pd.DataFrame,
+#     preds: pd.DataFrame,
+#     model_cls: BaseEstimator,
+#     metrics: list[str] = [
+#         "accuracy",
+#     ],
+# ) -> pd.DataFrame:
+#     """Calculate performance metrics per sub-group for each capability.
+
+#     Parameters
+#     ----------
+#     gt: pd.DataFrame
+
+#     preds: pd.DataFrame
+
+#     model_cls: BaseEstimator-derived class
+
+#     Returns
+#     -------
+#     metrics: dict[Capability, dict[Any, dict]]
+#     """
+#     common_caps = set(gt.columns) & set(preds.columns)
+
+#     # metrics = {}
+#     index = sorted(
+#         [
+#             model_cls.possible_capability_values(c)
+#             for c in common_caps
+#             if c != Capability.AGE
+#         ],
+#         key=len,
+#     )
+#     index = product(*index)
+
+#     metrics = ["accuracy", ""]
+#     df = pd.DataFrame(index=index, columns=metrics)
+
+#     for cap in common_caps:
+#         # TODO(gschardong): Better to store the "type" of each capability
+#         # somewhere and test all numeric types here.
+#         if cap == Capability.AGE:
+#             continue
+
+#         other_caps = common_caps - set([cap])
+
+#         # TODO(gschardong): Do we only need the values that occur in the data,
+#         # or all possible values? If the first is true, then we need to fetch
+#         # from the model class itself, else, we keep it as is.
+#         unique_values_cap = set(gt[cap].unique()) | set(preds[cap].unique())
+
+#         metrics[cap] = {}
+#         for val in unique_values_cap:
+#             ids = gt.index[gt[cap] == val]
+#             metrics[cap][val] = {"number_of_elements": len(ids)}
+#             for ocap in other_caps:
+#                 metrics[cap][val][ocap] = {}
+#                 fpred_data = preds[ocap][ids]
+#                 fgt_data = gt[ocap][ids]
+
+#                 if isinstance(fpred_data[0], (float, int)):
+#                     # If data is numeric, we calculate regression-based metrics
+#                     metrics[cap][val][ocap] = {
+#                         "mean_absolute_error": mean_absolute_error(
+#                             fgt_data, fpred_data
+#                         ),
+#                         "max_error": max_error(fgt_data, fpred_data),
+#                     }
+#                 else:
+#                     unique_values_ocap = sorted(
+#                         list(set(gt[ocap].unique()) | set(preds[ocap].unique()))
+#                     )
+#                     unique_values_ocap = np.array(unique_values_ocap)
+
+#                     metrics_small = {}
+#                     # if len(fgt_data.unique()) == 2:
+#                     #     cm = confusion_matrix(fgt_data, fpred_data, labels=unique_values_ocap)
+#                     # else:
+#                     cm = multilabel_confusion_matrix(
+#                         fgt_data, fpred_data, labels=unique_values_ocap
+#                     )
+#                     for m, oval in zip(cm, unique_values_ocap):
+#                         # metrics[cap][val][ocap][oval] = m
+#                         metrics_small[oval] = m
+#                     return m
+
+#                     metrics[cap][val][ocap] = {
+#                         "accuracy": accuracy_score(fgt_data, fpred_data),
+#                     }
+
+#     return metrics
diff --git a/src/facebias/estimators/mivolo/README.md b/src/facebias/estimators/mivolo/README.md
new file mode 100644
index 0000000..8d6368b
--- /dev/null
+++ b/src/facebias/estimators/mivolo/README.md
@@ -0,0 +1,3 @@
+# Subset of MiVOLO code
+
+This is the subset of the [MiVOLO](https://github.com/wildchlamydia/mivolo) code needed to instantiate the face-only attribute model.
\ No newline at end of file