Coverage for credoai/evaluators/ranking_fairness.py: 88%
112 statements
1"""Ranking Fairness evaluator"""
3import numpy as np
4import pandas as pd
5from connect.evidence import MetricContainer, TableContainer
6from finsfairauditing import fins
8from credoai.artifacts import TabularData
9from credoai.evaluators.evaluator import Evaluator
10from credoai.evaluators.utils.validation import (
11 check_data_for_nulls,
12 check_data_instance,
13 check_existence,
14 check_feature_presence,
15)
16from credoai.modules.metrics_credoai import (
17 normalized_discounted_cumulative_kl_divergence,
18 skew_parity,
19)
20from credoai.utils.common import ValidationError
21from credoai.utils.dataset_utils import empirical_distribution_curve

EPSILON = 1e-12
METRIC_SUBSET = [
    "skew_parity_difference-score",
    "skew_parity_ratio-score",
    "ndkl-score",
    "demographic_parity_ratio-score",
    "balance_ratio-score",
    "qualified_demographic_parity_ratio-score",
    "qualified_balance_ratio-score",
    "calibrated_demographic_parity_ratio-score",
    "calibrated_balance_ratio-score",
    "relevance_parity_ratio-score",
    "score_parity_ratio-score",
    "score_balance_ratio-score",
]
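# Note: the metric keys above follow a "<metric_name>-<subtype>" convention; the
# suffix after "-" is split into the `type`/`subtype` columns when results are
# formatted (see RankingFairness._format_results below).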


class RankingFairness(Evaluator):
    """
    Ranking fairness evaluator for Credo AI (Experimental)

    This module takes in ranking results and provides functionality to perform a
    fairness assessment. The results should include rankings, sensitive features,
    and, optionally, scores.

    The scores that the evaluator can calculate are:

    * **skew_parity_difference**: max_skew - min_skew, where skew is the proportion of the
      selected items from a group over the desired proportion for that group.
      It ranges from 0 to inf and the ideal value is 0. (See the Examples section below.)

    * **skew_parity_ratio**: min_skew / max_skew, where skew is the proportion of the
      selected items from a group over the desired proportion for that group.
      It ranges from 0 to 1 and the ideal value is 1.

    * **ndkl**: a metric that accounts for increasing ranks. It is non-negative, with larger
      values indicating a greater divergence between the desired and actual distributions of
      sensitive attribute labels. It ranges from 0 to inf and the ideal value is 0.

    * **demographic_parity_ratio**: min_selection_rate / max_selection_rate, where selection
      rate is the proportion of the selected items from a group over the number of items for
      that group in the pool. It ranges from 0 to 1 and the ideal value is 1.

    * **balance_ratio**: min_presence / max_presence, where presence is the number of selected
      items from a group. It ranges from 0 to 1 and the ideal value is 1.

    * **qualified_demographic_parity_ratio**: demographic_parity_ratio but with a qualified
      (i.e., score greater than or equal to q) filter applied to the items.
      It ranges from 0 to 1 and the ideal value is 1.

    * **qualified_balance_ratio**: balance_ratio but with a qualified (i.e., score greater
      than or equal to q) filter applied to the items. It ranges from 0 to 1 and the ideal value is 1.

    * **calibrated_demographic_parity_ratio**: demographic_parity_ratio but with the selected set
      drawn from specified score bins. This is to audit whether items with similar scores are treated
      similarly (via proportional presence) regardless of group membership.
      It ranges from 0 to 1 and the ideal value is 1.

    * **calibrated_balance_ratio**: balance_ratio but with the selected set drawn from
      specified score bins. This is to audit whether items with similar scores are treated
      similarly (via equal presence) regardless of group membership.
      It ranges from 0 to 1 and the ideal value is 1.

    * **relevance_parity_ratio**: audits whether groups are represented proportionally to their
      average score (i.e., score-based relevance). It ranges from 0 to 1 and the ideal value is 1.

    * **score_parity_ratio**: min_average_score / max_average_score, where average score
      is the average score of the selected items from a group.
      It ranges from 0 to 1 and the ideal value is 1.

    * **score_balance_ratio**: min_total_score / max_total_score, where total score
      is the total score of the selected items from a group.
      It ranges from 0 to 1 and the ideal value is 1.

    * **score_empirical_distribution**: score empirical distributions for each demographic group,
      reported as tables. The x axis is scores and the y axis is cumulative probabilities
      (ranging from 0 to 1). It is useful for a visual examination of the distribution of scores
      for the different groups.

    Parameters
    ----------
    sensitive_features : pandas.Series
        A series of the sensitive feature labels (e.g., "male", "female") which should
        be used to create subgroups
    rankings : pandas.Series of type int
        The computed ranks
        It should be passed to TabularData's y argument with the column name `rankings`
    scores : pandas.Series of type int or float, Optional
        A series of the scores
        It should be passed to TabularData's y argument with the column name `scores`
    k : int, Optional
        The top k items are considered as the selected subset
        If not provided, the top 50% of the items are considered as selected
    q : float, Optional
        The relevance score for which items in the pool that have score >= q are "relevant".
        These two metrics require this to be provided: `qualified_demographic_parity_ratio`
        and `qualified_balance_ratio`
    lb_bin : numpy array of shape = (n_bins), Optional
        The lower bound scores for each bin (bin is greater than or equal to lower bound).
        These two metrics require this to be provided: `calibrated_demographic_parity_ratio`
        and `calibrated_balance_ratio`
    ub_bin : numpy array of shape = (n_bins), Optional
        The upper bound scores for each bin (bin is less than upper bound).
        These two metrics require this to be provided: `calibrated_demographic_parity_ratio`
        and `calibrated_balance_ratio`
    desired_proportions : dict, Optional
        The desired proportion for each subgroup (e.g., {"male": 0.4, "female": 0.6})
        If not provided, the proportions observed in the pool are used for the
        calculation of the `skew` scores
    down_sampling_step : int, Optional
        Down-sampling step for the scores empirical distribution curve
        If not provided, down-sampling is done such that the curve length is approximately 100
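
    Examples
    --------
    A minimal, self-contained sketch of how the skew-based scores described above
    are defined, using plain numpy on hypothetical values (the evaluator itself is
    normally run through a Lens pipeline and may differ in implementation details):

    >>> import numpy as np
    >>> subset = np.array(["male", "female", "male", "male"])  # top-k selected items
    >>> desired = {"male": 0.5, "female": 0.5}  # desired proportion per group
    >>> skews = {g: (subset == g).mean() / desired[g] for g in desired}
    >>> round(max(skews.values()) - min(skews.values()), 2)  # skew_parity_difference
    1.0
    >>> round(min(skews.values()) / max(skews.values()), 2)  # skew_parity_ratio
    0.33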
128 """

    def __init__(
        self,
        k: int = None,
        q: float = None,
        lb_bin: list = None,
        ub_bin: list = None,
        desired_proportions: dict = None,
        down_sampling_step: int = None,
    ):
        self.desired_proportions = desired_proportions
        self.k = k
        self.q = q
        self.down_sampling_step = down_sampling_step
        if lb_bin is not None and ub_bin is not None:
            self.lb_bin = np.array(lb_bin)
            self.ub_bin = np.array(ub_bin)
        else:
            self.lb_bin = lb_bin
            self.ub_bin = ub_bin

    required_artifacts = ["data", "sensitive_feature"]

    def _validate_arguments(self):
        check_data_instance(self.data, TabularData)
        check_existence(self.data.sensitive_features, "sensitive_features")
        check_feature_presence("rankings", self.data.y, "y")
        check_data_for_nulls(self.data, "Data")

        return self

    def _setup(self):
        self.pool_rankings = np.array(self.data.y.rankings)
        self.pool_sensitive_features = np.array(self.data.sensitive_feature)
        self.sf_name = self.data.sensitive_feature.name
        if self.k is None:
            self.k = int(len(self.pool_rankings) / 2)

        if self.down_sampling_step is None:
            self.down_sampling_step = max(int(len(self.pool_rankings) / 100), 1)

        # Sort rankings ascending and apply the same permutation to the
        # sensitive features, in case they are not already sorted
        p = self.pool_rankings.argsort()
        self.pool_rankings = self.pool_rankings[p]
        self.pool_sensitive_features = self.pool_sensitive_features[p]

        self.pool_groups = list(set(self.pool_sensitive_features))
        self.num_items = len(self.pool_rankings)

        self.subset_sensitive_features = self.pool_sensitive_features[: self.k]
        self.subset_groups = list(set(self.subset_sensitive_features))

        if "scores" in self.data.y:
            self.pool_scores = np.array(self.data.y.scores)
            if not np.issubdtype(self.pool_scores.dtype, np.number):
                raise ValidationError(
                    "`scores` array provided contains non-numeric elements."
                )

            self.subset_scores = self.pool_scores[: self.k]
        else:
            self.pool_scores = None
            self.subset_scores = None

        # If desired proportions are not provided, default to the pool proportions
        if not self.desired_proportions:
            uniques, counts = np.unique(
                self.pool_sensitive_features, return_counts=True
            )
            self.desired_proportions = dict(zip(uniques, counts / self.num_items))

        return self

    def evaluate(self):
        """
        Runs the assessment process.

        Returns
        -------
        self
            The evaluator, with ``self.results`` populated by a list of evidence
            containers holding the computed metrics (and, when scores are
            provided, per-group score distribution tables).
        """
        # Skew parity metrics
        skew_parity_diff = skew_parity(
            self.subset_sensitive_features,
            self.desired_proportions,
            "difference",
        )
        skew_parity_ratio = skew_parity(
            self.subset_sensitive_features,
            self.desired_proportions,
            "ratio",
        )
        skew_results = {
            "skew_parity_difference-score": [{"value": skew_parity_diff}],
            "skew_parity_ratio-score": [{"value": skew_parity_ratio}],
        }

        # NDKL metric
        ndkl = normalized_discounted_cumulative_kl_divergence(
            self.pool_sensitive_features, self.desired_proportions
        )
        ndkl_results = {"ndkl-score": [{"value": ndkl}]}

        # FIN metrics
        fins_results = calculate_fins_metrics(
            self.pool_sensitive_features,
            self.subset_sensitive_features,
            self.pool_scores,
            self.subset_scores,
            self.lb_bin,
            self.ub_bin,
            self.q,
        )

        res = {**skew_results, **ndkl_results, **fins_results}
        self.results = self._format_results(res)

        # Score disaggregated empirical distributions
        if self.pool_scores is not None:
            for group in self.pool_groups:
                ind = np.where(self.pool_sensitive_features == group)
                group_scores = self.pool_scores[ind]
                emp_dist_df = empirical_distribution_curve(
                    group_scores, self.down_sampling_step, variable_name="scores"
                )
                emp_dist_df.name = "score_empirical_distribution"
                labels = {"sensitive_feature": self.sf_name, "group": group}
                e = TableContainer(
                    emp_dist_df,
                    **self.get_info(labels=labels),
                )
                self.results.append(e)

        return self

    def _format_results(self, res):
        """
        Format results from the evaluations.

        Parameters
        ----------
        res : dict
            All results of the evaluations
        """
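        # Illustration (hypothetical value): an entry such as
        #   {"skew_parity_ratio-score": [{"value": 0.8}]}
        # becomes one DataFrame row with value=0.8, type="skew_parity_ratio",
        # subtype="score" after the metric_type split below.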
274 """
275 res = {k: v for k, v in res.items() if k in METRIC_SUBSET}
277 # Reformat results
278 labels = {"sensitive_feature": self.sf_name}
279 res = [pd.DataFrame(v).assign(metric_type=k) for k, v in res.items()]
280 res = pd.concat(res)
281 res[["type", "subtype"]] = res.metric_type.str.split("-", expand=True)
282 res.drop("metric_type", axis=1, inplace=True)
283 return [MetricContainer(res, **self.get_info(labels=labels))]


############################################
## Evaluation helper functions
##
## Helper functions create evidence
## to be passed to .evaluate and wrapped
## by evidence containers
############################################
def calculate_fins_metrics(
    pool_sensitive_features,
    subset_sensitive_features,
    pool_scores=None,
    subset_scores=None,
    lb_bin=None,
    ub_bin=None,
    q=None,
):
303 """
304 Calculates group fairness metrics for subset selections from FINS paper and library
306 Parameters
307 ----------
308 pool_sensitive_features : numpy array
309 An array of items in the pool.
310 If ranking is applicable, the array should be sorted accordignly.
311 subset_sensitive_features : numpy array
312 An array of items in the subset.
313 If ranking is applicable, the array should be sorted accordignly.
314 pool_scores : numpy array, Optional
315 An array of the scores for items in the pools
316 subset_scores : numpy array, Optional
317 An array of the scores for items in the subset
318 lb_bin: numpy array of shape = (n_bins), Optional
319 The lower bound scores for each bin (bin is greater than or equal to lower bound).
320 These two metrics require this to be provided: `calibrated_demographic_parity_ratio`
321 and `calibrated_balance_ratio`
322 ub_bin: numpy array of shape = (n_bins), Optional
323 The upper bound scores for each bin (bin is less than upper bound).
324 These two metrics require this to be provided: `calibrated_demographic_parity_ratio`
325 and `calibrated_balance_ratio`
326 q: float, Optional
327 The relevance score for which items in the pool that have score >= q are "relevant".
328 These two metrics require this to be provided: `qualified_demographic_parity_ratio`
329 and `qualified_balance_ratio`
331 Returns
332 -------
333 fins_metrics : dict
334 All results of the FINS evaluations
336 References
337 ----------
338 Cachel, Kathleen, and Elke Rundensteiner. "FINS Auditing Framework:
339 Group Fairness for Subset Selections." Proceedings of the 2022
340 AAAI/ACM Conference on AI, Ethics, and Society. 2022.
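
    Examples
    --------
    A hypothetical call sketch (array values are for illustration only; the
    returned scores depend on the `finsfairauditing` implementation)::

        pool_sf = np.array(["a", "a", "b", "b", "b", "b"])
        subset_sf = pool_sf[:3]  # top-3 selected items
        metrics = calculate_fins_metrics(pool_sf, subset_sf)
        # e.g., metrics["demographic_parity_ratio-score"] -> [{"value": ...}]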
341 """
    fins_metrics = {}

    pool_items = np.arange(0, len(pool_sensitive_features))
    subset_items = np.arange(0, len(subset_sensitive_features))

    # represent sensitive feature values via consecutive integers
    lookupTable, pool_sf_int = np.unique(pool_sensitive_features, return_inverse=True)
    lookupTable, subset_sf_int = np.unique(
        subset_sensitive_features, return_inverse=True
    )

    selectRt, parity_score = fins.parity(
        pool_items, pool_sf_int, subset_items, subset_sf_int
    )
    fins_metrics["demographic_parity_ratio-score"] = [{"value": parity_score}]

    propOfS, balance_score = fins.balance(pool_sf_int, subset_items, subset_sf_int)
    fins_metrics["balance_ratio-score"] = [{"value": balance_score}]

    # Score-dependent metrics
    if subset_scores is not None:
        AvgScore, score_parity_score = fins.score_parity(
            subset_items, subset_scores, subset_sf_int
        )
        fins_metrics["score_parity_ratio-score"] = [{"value": score_parity_score}]

        TotalScore, score_balance_score = fins.score_balance(
            subset_items, subset_scores, subset_sf_int
        )
        fins_metrics["score_balance_ratio-score"] = [{"value": score_balance_score}]

        if pool_scores is not None:
            RselectRt, relevance_parity_score = fins.relevance_parity(
                pool_items,
                pool_scores,
                pool_sf_int,
                subset_items,
                subset_scores,
                subset_sf_int,
            )
            fins_metrics["relevance_parity_ratio-score"] = [
                {"value": relevance_parity_score}
            ]

            if q:
                QselectRt, qualified_parity_score = fins.qualififed_parity(
                    pool_items,
                    pool_scores,
                    pool_sf_int,
                    subset_items,
                    subset_scores,
                    subset_sf_int,
                    q,
                )
                fins_metrics["qualified_demographic_parity_ratio-score"] = [
                    {"value": qualified_parity_score}
                ]

                QpropOfS, qualified_balance_score = fins.qualified_balance(
                    pool_items,
                    pool_scores,
                    pool_sf_int,
                    subset_items,
                    subset_scores,
                    subset_sf_int,
                    q,
                )
                fins_metrics["qualified_balance_ratio-score"] = [
                    {"value": qualified_balance_score}
                ]

            if lb_bin is not None and ub_bin is not None:
                (
                    bin_group_selection_proportions,
                    calibrated_parity_score,
                ) = fins.calibrated_parity(
                    pool_items,
                    pool_scores,
                    pool_sf_int,
                    subset_items,
                    subset_scores,
                    subset_sf_int,
                    lb_bin,
                    ub_bin,
                )
                fins_metrics["calibrated_demographic_parity_ratio-score"] = [
                    {"value": calibrated_parity_score}
                ]

                (
                    bin_group_proportions,
                    calibrated_balance_score,
                ) = fins.calibrated_balance(
                    pool_items,
                    pool_scores,
                    pool_sf_int,
                    subset_items,
                    subset_scores,
                    subset_sf_int,
                    lb_bin,
                    ub_bin,
                )
                fins_metrics["calibrated_balance_ratio-score"] = [
                    {"value": calibrated_balance_score}
                ]

    return fins_metrics