Coverage for credoai/evaluators/ranking_fairness.py: 90%
1"""Ranking Fairness evaluator"""
2import math
4import numpy as np
5import pandas as pd
6from connect.evidence import MetricContainer, TableContainer
7from finsfairauditing import fins
9from credoai.artifacts import TabularData
10from credoai.evaluators import Evaluator
11from credoai.evaluators.utils.validation import (
12 check_artifact_for_nulls,
13 check_data_instance,
14 check_existence,
15 check_feature_presence,
16)
17from credoai.utils.common import ValidationError
18from credoai.utils.dataset_utils import empirical_distribution_curve
20EPSILON = 1e-12
METRIC_SUBSET = [
    "skew_parity_difference-score",
    "ndkl-score",
    "demographic_parity_ratio-score",
    "balance_ratio-score",
    "qualified_demographic_parity_ratio-score",
    "qualified_balance_ratio-score",
    "calibrated_demographic_parity_ratio-score",
    "calibrated_balance_ratio-score",
    "relevance_parity_ratio-score",
    "score_parity_ratio-score",
    "score_balance_ratio-score",
]
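# Each key encodes "<metric>-<subtype>"; _format_results later splits on the
# hyphen to populate the `type` and `subtype` columns of the results table.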


class RankingFairness(Evaluator):
    """
    Ranking fairness evaluator for Credo AI.

    This module takes in ranking results and provides functionality to perform
    fairness assessment. The results should include rankings, sensitive features,
    and, optionally, scores.

    The scores that the evaluator can calculate are:

    * **skew_parity_difference**: max_skew - min_skew, where skew is the proportion of the selected
      items from a group over the desired proportion for that group.
      It ranges from 0 to inf and the ideal value is 0.

    * **ndkl**: a metric that accounts for increasing ranks. It is non-negative, with larger values
      indicating a greater divergence between the desired and actual distributions of
      sensitive attribute labels. It ranges from 0 to inf and the ideal value is 0.

    * **demographic_parity_ratio**: min_selection_rate / max_selection_rate, where selection rate
      is the proportion of the selected items from a group over the number of items for
      that group in the pool. It ranges from 0 to 1 and the ideal value is 1.

    * **balance_ratio**: min_presence / max_presence, where presence is the number of the selected
      items from a group. It ranges from 0 to 1 and the ideal value is 1.

    * **qualified_demographic_parity_ratio**: demographic_parity_ratio but with a qualified (i.e., score
      greater than or equal to q) filter applied to the items. It ranges from 0 to 1 and the ideal value is 1.

    * **qualified_balance_ratio**: balance_ratio but with a qualified (i.e., score greater than or equal
      to q) filter applied to the items. It ranges from 0 to 1 and the ideal value is 1.

    * **calibrated_demographic_parity_ratio**: demographic_parity_ratio but with the selected set drawn from
      specified score bins. This is to audit whether items with similar scores are treated similarly
      (via proportional presence) regardless of group membership. It ranges from 0 to 1 and the ideal value is 1.

    * **calibrated_balance_ratio**: balance_ratio but with the selected set drawn from
      specified score bins. This is to audit whether items with similar scores are treated similarly
      (via equal presence) regardless of group membership. It ranges from 0 to 1 and the ideal value is 1.

    * **relevance_parity_ratio**: audits whether groups are represented proportionally to their average
      score (i.e., score-based relevance). It ranges from 0 to 1 and the ideal value is 1.

    * **score_parity_ratio**: min_average_score / max_average_score, where average score
      is the average score of the selected items from a group.
      It ranges from 0 to 1 and the ideal value is 1.

    * **score_balance_ratio**: min_total_score / max_total_score, where total score
      is the total score of the selected items from a group. It ranges from 0 to 1 and the ideal value is 1.

    * **score_empirical_distribution**: score empirical distributions for each demographic group, as tables.
      The x axis is scores and the y axis is cumulative probabilities (ranging from 0 to 1).
      It is useful for a visual examination of the distribution of scores for the different groups.

    Parameters
    ----------
    sensitive_features : pandas.Series
        A series of the sensitive feature labels (e.g., "male", "female") which should
        be used to create subgroups.
    rankings : pandas.Series of type int
        The computed ranks.
        It should be passed to TabularData's y argument with the column name `rankings`.
    scores : pandas.Series of type int or float, optional
        A series of the scores.
        It should be passed to TabularData's y argument with the column name `scores`.
    k : int, optional
        The top k items are considered as the selected subset.
        If not provided, the top 50% of the items are considered as selected.
    q : float, optional
        The relevance threshold: items in the pool with score >= q are considered
        "relevant". These two metrics require this to be provided:
        `qualified_demographic_parity_ratio` and `qualified_balance_ratio`.
    lb_bin : numpy array of shape = (n_bins), optional
        The lower-bound scores for each bin (a bin includes scores greater than or
        equal to its lower bound). These two metrics require this to be provided:
        `calibrated_demographic_parity_ratio` and `calibrated_balance_ratio`.
    ub_bin : numpy array of shape = (n_bins), optional
        The upper-bound scores for each bin (a bin includes scores less than its
        upper bound). These two metrics require this to be provided:
        `calibrated_demographic_parity_ratio` and `calibrated_balance_ratio`.
    desired_proportions : dict, optional
        The desired proportion for each subgroup (e.g., {"male": 0.4, "female": 0.6}).
        If not provided, the pool proportions are used for calculation of the `skew` score.
    down_sampling_step : int, optional
        Down-sampling step for the scores empirical distribution curve.
        If not provided, down-sampling is done such that the curve length is approximately 100.
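
    Examples
    --------
    A minimal sketch of wiring up the evaluator (illustrative only: it assumes
    ``TabularData`` accepts ``name``/``y``/``sensitive_features`` keywords, and in
    practice evaluators are normally run through ``credoai.lens.Lens`` rather than
    called directly)::

        import pandas as pd
        from credoai.artifacts import TabularData
        from credoai.evaluators import RankingFairness

        y = pd.DataFrame(
            {"rankings": [1, 2, 3, 4, 5, 6], "scores": [9, 8, 7, 5, 3, 1]}
        )
        sf = pd.Series(["f", "m", "f", "m", "m", "m"], name="gender")
        data = TabularData(name="ranking_data", y=y, sensitive_features=sf)
        evaluator = RankingFairness(k=3)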
120 """

    def __init__(
        self,
        k: int = None,
        q: float = None,
        lb_bin: list = None,
        ub_bin: list = None,
        desired_proportions: dict = None,
        down_sampling_step: int = None,
    ):
        # note: no trailing comma here; a trailing comma would wrap the dict in a
        # one-element tuple and break the later per-group lookups
        self.desired_proportions = desired_proportions
        self.k = k
        self.q = q
        self.down_sampling_step = down_sampling_step
        if lb_bin is not None and ub_bin is not None:
            self.lb_bin = np.array(lb_bin)
            self.ub_bin = np.array(ub_bin)
        else:
            self.lb_bin = lb_bin
            self.ub_bin = ub_bin

    required_artifacts = ["data", "sensitive_feature"]

    def _validate_arguments(self):
        check_data_instance(self.data, TabularData)
        check_existence(self.data.sensitive_features, "sensitive_features")
        check_feature_presence("rankings", self.data.y, "y")
        check_artifact_for_nulls(self.data, "Data")

        return self

    def _setup(self):
        self.pool_rankings = np.array(self.data.y.rankings)
        self.pool_sensitive_features = np.array(self.data.sensitive_feature)
        self.sf_name = self.data.sensitive_feature.name
        if self.k is None:
            self.k = int(len(self.pool_rankings) / 2)

        if self.down_sampling_step is None:
            self.down_sampling_step = max(int(len(self.pool_rankings) / 100), 1)

        # Sort rankings ascending and reorder the sensitive features to match,
        # in case the data are not already sorted
        p = self.pool_rankings.argsort()
        self.pool_rankings = self.pool_rankings[p]
        self.pool_sensitive_features = self.pool_sensitive_features[p]

        self.pool_groups = list(set(self.pool_sensitive_features))
        self.pool_items = np.arange(0, len(self.pool_rankings))
        self.num_items = len(self.pool_rankings)

        self.subset_sensitive_features = self.pool_sensitive_features[: self.k]
        self.subset_items = self.pool_items[: self.k]
        self.subset_groups = list(set(self.subset_sensitive_features))

        if "scores" in self.data.y:
            self.pool_scores = np.array(self.data.y.scores)
            if not np.issubdtype(self.pool_scores.dtype, np.number):
                raise ValidationError(
                    "`scores` array provided contains non-numeric elements."
                )

            self.subset_scores = self.pool_scores[: self.k]
        else:
            self.pool_scores = None
            self.subset_scores = None

        # if desired proportions are not provided, set them to the pool proportions
        if self.desired_proportions is None:
            uniques, counts = np.unique(
                self.pool_sensitive_features, return_counts=True
            )
            self.desired_proportions = dict(zip(uniques, counts / self.num_items))

        return self

    def evaluate(self):
        """
        Runs the assessment process.

        Returns
        -------
        self
            The evaluator itself; the detailed results associated with each
            assessment category are stored in `self.results`
        """
        skew_results = self._skew()
        ndkl_results = self._ndkl()
        fins_results = self._fins()

        res = {**skew_results, **ndkl_results, **fins_results}

        self.results = self._format_results(res)

        if self.pool_scores is not None:
            # one empirical score-distribution table per demographic group
            self.results.extend(self._score_distribution())

        return self

    def _format_results(self, res):
        """
        Format results from the evaluations.

        Parameters
        ----------
        res : dict
            All results of the evaluations
        """
        res = {k: v for k, v in res.items() if k in METRIC_SUBSET}

        # Reformat results
        res = [pd.DataFrame(v).assign(metric_type=k) for k, v in res.items()]
        res = pd.concat(res)
        res[["type", "subtype"]] = res.metric_type.str.split("-", expand=True)
        res.drop("metric_type", axis=1, inplace=True)
        return [MetricContainer(res, **self.get_container_info())]

    def _skew(self):
        """
        Calculates skew parity.

        For every group, skew is the proportion of the selected candidates
        from that group over the desired proportion for that group.

        Returns
        -------
        dict
            skew parity difference
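
        Notes
        -----
        Illustrative worked example (values made up): with desired proportions
        {"f": 0.5, "m": 0.5} and a top-k subset that is 75% "m" and 25% "f",
        skew_m = 0.75 / 0.5 = 1.5 and skew_f = 0.25 / 0.5 = 0.5, so
        skew_parity_difference = 1.5 - 0.5 = 1.0.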
251 """
252 uniques, counts = np.unique(self.subset_sensitive_features, return_counts=True)
253 subset_proportions = dict(zip(uniques, counts / self.k))
255 skew = {}
256 for g in self.pool_groups:
257 sk = (subset_proportions[g] + EPSILON) / (
258 self.desired_proportions[g] + EPSILON
259 )
260 skew[g] = sk
262 skew = {
263 "skew_parity_difference-score": [
264 {"value": max(skew.values()) - min(skew.values())}
265 ]
266 }
268 return skew

    def _kld(self, dist_1, dist_2):
        """
        Calculates KL divergence.

        Parameters
        ----------
        dist_1 : list
            first distribution
        dist_2 : list
            second distribution

        Returns
        -------
        float
            KL divergence
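
        Notes
        -----
        This is the standard definition with natural log, smoothed by EPSILON
        to avoid division by zero and log of zero:
        KL(P || Q) = sum_i p_i * ln((p_i + eps) / (q_i + eps)).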
285 """
286 vals = []
287 for p1, p2 in zip(dist_1, dist_2):
288 vals.append(p1 * math.log((p1 + EPSILON) / (p2 + EPSILON)))
290 return sum(vals)

    def _ndkl(self):
        """
        Calculates the normalized discounted cumulative KL-divergence (NDKL).

        It is based on the following paper:
            Geyik, Sahin Cem, Stuart Ambler, and Krishnaram Kenthapadi. "Fairness-aware
            ranking in search & recommendation systems with application to LinkedIn
            talent search." Proceedings of the 25th ACM SIGKDD International Conference
            on Knowledge Discovery & Data Mining. 2019.

        Returns
        -------
        dict
            normalized discounted cumulative KL-divergence (ndkl)
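
        Notes
        -----
        Following the paper's definition:
        NDKL = (1 / Z) * sum_{k=1..N} KL(D_k || D_desired) / log2(k + 1),
        where D_k is the group distribution of the top-k ranked items, D_desired
        is the desired group distribution, and Z = sum_{k=1..N} 1 / log2(k + 1).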
305 """
306 Z = np.sum(1 / (np.log2(np.arange(1, self.num_items + 1) + 1)))
308 total = 0.0
309 for k in range(1, self.num_items + 1):
310 item_attr_k = list(self.pool_sensitive_features[:k])
311 item_distr = [
312 item_attr_k.count(attr) / len(item_attr_k)
313 for attr in self.desired_proportions.keys()
314 ]
315 total += (1 / math.log2(k + 1)) * self._kld(
316 item_distr, list(self.desired_proportions.values())
317 )
319 ndkl = {"ndkl-score": [{"value": (1 / Z) * total}]}
321 return ndkl

    def _fins(self):
        """
        Calculates group fairness metrics for subset selections from the FINS
        paper and library.

        It is based on the following paper:
            Cachel, Kathleen, and Elke Rundensteiner. "FINS Auditing Framework: Group
            Fairness for Subset Selections." Proceedings of the 2022 AAAI/ACM
            Conference on AI, Ethics, and Society. 2022.

        Returns
        -------
        dict
            fairness metrics
        """
        fins_metrics = {}

        # represent sensitive feature values via consecutive integers
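        # (illustrative: ["f", "m", "f"] -> lookupTable = ["f", "m"] and
        # integer codes = [0, 1, 0]; np.unique(..., return_inverse=True)
        # returns both)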
        lookupTable, pool_sf_int = np.unique(
            self.pool_sensitive_features, return_inverse=True
        )
        lookupTable, subset_sf_int = np.unique(
            self.subset_sensitive_features, return_inverse=True
        )

        selectRt, parity_score = fins.parity(
            self.pool_items, pool_sf_int, self.subset_items, subset_sf_int
        )
        fins_metrics["demographic_parity_ratio-score"] = [{"value": parity_score}]

        propOfS, balance_score = fins.balance(
            pool_sf_int, self.subset_items, subset_sf_int
        )
        fins_metrics["balance_ratio-score"] = [{"value": balance_score}]

        # Score-dependent metrics
        if self.subset_scores is not None:
            AvgScore, score_parity_score = fins.score_parity(
                self.subset_items, self.subset_scores, subset_sf_int
            )
            fins_metrics["score_parity_ratio-score"] = [{"value": score_parity_score}]

            TotalScore, score_balance_score = fins.score_balance(
                self.subset_items, self.subset_scores, subset_sf_int
            )
            fins_metrics["score_balance_ratio-score"] = [{"value": score_balance_score}]

        if self.pool_scores is not None:
            RselectRt, relevance_parity_score = fins.relevance_parity(
                self.pool_items,
                self.pool_scores,
                pool_sf_int,
                self.subset_items,
                self.subset_scores,
                subset_sf_int,
            )
            fins_metrics["relevance_parity_ratio-score"] = [
                {"value": relevance_parity_score}
            ]

            if self.q:
                QselectRt, qualified_parity_score = fins.qualified_parity(
                    self.pool_items,
                    self.pool_scores,
                    pool_sf_int,
                    self.subset_items,
                    self.subset_scores,
                    subset_sf_int,
                    self.q,
                )
                fins_metrics["qualified_demographic_parity_ratio-score"] = [
                    {"value": qualified_parity_score}
                ]

                QpropOfS, qualified_balance_score = fins.qualified_balance(
                    self.pool_items,
                    self.pool_scores,
                    pool_sf_int,
                    self.subset_items,
                    self.subset_scores,
                    subset_sf_int,
                    self.q,
                )
                fins_metrics["qualified_balance_ratio-score"] = [
                    {"value": qualified_balance_score}
                ]

            if self.lb_bin is not None and self.ub_bin is not None:
                (
                    bin_group_selection_proportions,
                    calibrated_parity_score,
                ) = fins.calibrated_parity(
                    self.pool_items,
                    self.pool_scores,
                    pool_sf_int,
                    self.subset_items,
                    self.subset_scores,
                    subset_sf_int,
                    self.lb_bin,
                    self.ub_bin,
                )
                fins_metrics["calibrated_demographic_parity_ratio-score"] = [
                    {"value": calibrated_parity_score}
                ]

                (
                    bin_group_proportions,
                    calibrated_balance_score,
                ) = fins.calibrated_balance(
                    self.pool_items,
                    self.pool_scores,
                    pool_sf_int,
                    self.subset_items,
                    self.subset_scores,
                    subset_sf_int,
                    self.lb_bin,
                    self.ub_bin,
                )
                fins_metrics["calibrated_balance_ratio-score"] = [
                    {"value": calibrated_balance_score}
                ]

        return fins_metrics

    def _score_distribution(self):
        """
        Calculates the scores empirical distribution curve for each demographic group.

        Returns
        -------
        list of TableContainer
            one empirical score-distribution table per demographic group
        """
        dist_tables = []
        groups = np.unique(self.pool_sensitive_features)
        for group in groups:
            ind = np.where(self.pool_sensitive_features == group)
            group_scores = self.pool_scores[ind]
            emp_dist_df = empirical_distribution_curve(
                group_scores, self.down_sampling_step, variable_name="scores"
            )
            emp_dist_df.name = "score_empirical_distribution"

            labels = {"sensitive_feature": self.sf_name, "group": group}

            dist_tables.append(
                TableContainer(
                    emp_dist_df,
                    **self.get_container_info(labels=labels),
                )
            )

        return dist_tables