Moving the description generation to its own function and expanding it.
This commit is contained in:
@@ -31,12 +31,138 @@ def _to_age_bracket(row):
|
||||
return "{}-{}".format(d, d + 9)
|
||||
|
||||
|
||||
def generate_description(dataset: pd.DataFrame, name: str | None = None) -> str:
    """Generate a textual description of `dataset`, its variables and values.

    For all supported variables in `dataset`, this function lists their names,
    types, value ranges and descriptive statistics. It also groups the data by
    each categorical variable and reports the distribution of every other
    variable within those groups, thus creating a subgroup description as well.

    Note that we only support the variables defined by the `Capability` enum.

    Parameters
    ----------
    dataset: pd.DataFrame
        The data to describe. Only columns whose name matches the value of a
        `Capability` member are reported. The name of its index is used as
        the noun for the rows (e.g. "image" -> "images"); when the index is
        unnamed, "row" is used instead.
    name: str, optional
        The dataset name. Only ever used in the first line of the returned
        description, for pretty printing.

    Returns
    -------
    desc: str
        The textual description as a human-readable string.

    See Also
    --------
    `Capability`
    """
    # Fragments are collected and joined once at the end, instead of growing
    # a string with repeated "+=".
    parts: list[str] = []

    header = f"The dataset {name}" if name is not None else "The dataset"
    # An unnamed index would otherwise be rendered as "Nones".
    unit = dataset.index.name if dataset.index.name is not None else "row"
    parts.append(
        f"{header} has a total of {len(dataset)} {unit}s,"
        " with the following supported features/capabilities:"
    )

    # Supported capabilities actually present as columns, in enum order.
    caps = [c for c in Capability if c.value in dataset]
    parts.extend(f"\n- {c.value}" for c in caps)

    parts.append("\n\nEach feature/capability has the following types and values:")
    for c in caps:
        if c == Capability.AGE:
            parts.append(f"\n{c.value}: numeric")
        else:
            # NOTE(review): the listing of unique values is assumed to apply
            # to categorical variables only -- confirm against the original
            # indentation.
            parts.append(f"\n{c.value}: categorical")
            parts.append(f"\n - {sorted(dataset[c].unique())}")

    parts.append("\n\nData distribution statistics.")
    for c in caps:
        # `c.value` is used for every label so the text does not depend on
        # how an enum member is rendered by f-string formatting, which
        # differs across Python versions for mixed-in enums.
        parts.append(
            f'\nThe feature/capability "{c.value}" has the following distribution of values:'
        )
        if c == Capability.AGE:
            column = dataset[c]
            m1 = column.min()
            m2 = column.max()
            mean = column.mean()
            std = column.std()
            p25 = column.quantile(0.25)
            median = column.median()
            p75 = column.quantile(0.75)

            parts.append(f"\n - min = {m1}")
            parts.append(f"\n - max = {m2}")
            parts.append(f"\n - mean = {mean:.2f}")
            parts.append(f"\n - std = {std:.2f}")
            parts.append(f"\n - p25 = {p25}")
            parts.append(f"\n - p50 = {median}")
            parts.append(f"\n - p75 = {p75}")
            parts.append("\n Interquartile ranges:")
            parts.append(f"\n - p25-min = {p25 - m1}")
            parts.append(f"\n - p50-p25 = {median - p25}")
            parts.append(f"\n - p75-p50 = {p75 - median}")
            parts.append(f"\n - max-p75 = {m2 - p75}")
        else:
            counts = dataset[c].value_counts().sort_index()
            for value in counts.index:
                parts.append(f"\n - {value}: {counts[value]}")

    parts.append("\n\nPer capability/class data distribution statistics.")

    # AGE goes last so that `combinations` only ever yields it as the second
    # (described) variable: being numeric, it cannot be the grouping variable.
    ordered = [c for c in caps if c != Capability.AGE]
    if Capability.AGE in caps:
        ordered.append(Capability.AGE)

    template = '\nGrouping by "{}", the dataset has the following {} for each value of "{}"'

    for c1, c2 in combinations(ordered, 2):
        # Here we test for age, but this should really be a test for any
        # numerical variable.
        if c1 == Capability.AGE:
            continue

        gb = dataset.groupby(c1)[[c2]]

        if c2 != Capability.AGE:
            # Categorical vs. categorical: contingency table of counts, with
            # the grouping variable as columns and absent pairs as 0.
            parts.append(template.format(c1.value, "number of elements", c2.value))
            tmpdf = gb.value_counts().sort_index().unstack(level=c1)
            tmpdf = tmpdf.fillna(0).astype(int)
        elif c1 != Capability.AGEGROUP:
            # Categorical vs. age: per-group summary statistics of the age.
            parts.append(template.format(c1.value, "statistics", c2.value))
            tmpdf = pd.concat(
                [
                    gb.min(),
                    gb.quantile(0.25),
                    gb.median(),
                    gb.quantile(0.75),
                    gb.max(),
                    gb.apply(gini),
                ],
                axis=1,
            )
            tmpdf.columns = ["min", "p25", "p50", "p75", "max", "gini_impurity"]
        else:
            # The age-group vs. age pair is not reported.
            continue

        parts.append(f"\n{tmpdf}\n")

    return "".join(parts)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import os
|
||||
|
||||
logger.info(os.getcwd())
|
||||
|
||||
DATASET_PATH = Path("../../data/facing2-train/")
|
||||
DATASET_PATH = Path("data/facing2-train/")
|
||||
METADATA_PATH = DATASET_PATH / "meta-w-age.csv"
|
||||
|
||||
meta = pd.read_csv(METADATA_PATH, sep=",", index_col="image")
|
||||
@@ -49,9 +175,8 @@ if __name__ == "__main__":
|
||||
|
||||
# GINI IMPURITY
|
||||
# Lower values mean a concentration of values around a single class, i.e. bias.
|
||||
age_gini = gini(meta["age"])
|
||||
age_gini = gini(meta[Capability.AGE])
|
||||
|
||||
# gt_age_group_ord = meta["age_group"].apply(lambda x: _agegroup_int_map[x])
|
||||
agegroup_gini = gini(meta[Capability.AGEGROUP + "_cat"])
|
||||
|
||||
# Should be close to 0.5, indicating a 50/50 split of males and females,
|
||||
@@ -78,64 +203,12 @@ if __name__ == "__main__":
|
||||
gini_per_sex = sex_gb.apply(gini)
|
||||
gini_per_agegroup = agegroup_gb.apply(gini)
|
||||
|
||||
# Prototype textual description of the dataset. To be incorporated into a
|
||||
# "generate_report" function.
|
||||
print(
|
||||
f'The dataset "{DATASET_PATH.name}" has a total of {len(meta)} {meta.index.name}s,'
|
||||
" with the following features/capabilities:"
|
||||
)
|
||||
caps = []
|
||||
for c in Capability:
|
||||
if c.value in meta:
|
||||
caps.append(c)
|
||||
print(f"- {c.value}")
|
||||
# Prototype textual description of the dataset.
|
||||
s = generate_description(meta, name=DATASET_PATH.name)
|
||||
print(s)
|
||||
|
||||
print("\nEach feature/capability has the following types and values:")
|
||||
for c in caps:
|
||||
if c == Capability.AGE:
|
||||
print(f"{c.value}: numeric")
|
||||
|
||||
else:
|
||||
print(f"{c.value}: categorical")
|
||||
print(f" - {sorted(meta[c].unique())}")
|
||||
|
||||
print("\nData distribution statistics.")
|
||||
for c in caps:
|
||||
print(f'The feature/capability "{c}" has the following distribution of values:')
|
||||
if c == Capability.AGE:
|
||||
m1 = meta[c].min()
|
||||
m2 = meta[c].max()
|
||||
mean = meta[c].mean()
|
||||
std = meta[c].std()
|
||||
p25 = meta[c].quantile(0.25)
|
||||
median = meta[c].median()
|
||||
p75 = meta[c].quantile(0.75)
|
||||
|
||||
print(
|
||||
f" - min = {m1}, max = {m2}, mean = {mean:.2f}, std = {std:.2f}"
|
||||
f" p25 = {p25}, p50 = {median}, p75 = {p75}"
|
||||
)
|
||||
print(" Interqualtile ranges:")
|
||||
print(f" - p25-min = {p25 - m1}")
|
||||
print(f" - p50-p25 = {median - p25}")
|
||||
print(f" - p75-p50 = {p75 - median}")
|
||||
print(f" - max-p75 = {m2 - p75}")
|
||||
else:
|
||||
series = meta[c].value_counts().sort_index()
|
||||
for s in series.index:
|
||||
print(f" - {s}: {series[s]}")
|
||||
|
||||
print("\nPer capability/class data distribution statistics.")
|
||||
for c1, c2 in combinations(caps, 2):
|
||||
if c1 == Capability.AGE:
|
||||
continue
|
||||
|
||||
if c2 != Capability.AGE:
|
||||
gb = meta.groupby(c1)[[c2]]
|
||||
print(
|
||||
f'Grouping by "{c1}", the dataset has the following data distribution for "{c2}"'
|
||||
)
|
||||
print(gb.value_counts().sort_index().unstack(level=c1).fillna(0))
|
||||
|
||||
# Diagnostics of biases in the dataset. To be incorporated into a
|
||||
# "generate_diagnostics" function later on.
|
||||
# Diagnostics of biases in the dataset. To be incorporated into a
|
||||
# "generate_diagnostics" function later on.
|
||||
def generate_bias_diagnostics():
|
||||
pass
|
||||
|
||||
Reference in New Issue
Block a user