Coverage for credoai/evaluators/ranking_fairness.py: 88%

1"""Ranking Fairness evaluator""" 

2 

3import numpy as np 

4import pandas as pd 

5from connect.evidence import MetricContainer, TableContainer 

6from finsfairauditing import fins 

7 

8from credoai.artifacts import TabularData 

9from credoai.evaluators.evaluator import Evaluator 

10from credoai.evaluators.utils.validation import ( 

11 check_data_for_nulls, 

12 check_data_instance, 

13 check_existence, 

14 check_feature_presence, 

15) 

16from credoai.modules.metrics_credoai import ( 

17 normalized_discounted_cumulative_kl_divergence, 

18 skew_parity, 

19) 

20from credoai.utils.common import ValidationError 

21from credoai.utils.dataset_utils import empirical_distribution_curve 

22 

23EPSILON = 1e-12 

24METRIC_SUBSET = [ 

25 "skew_parity_difference-score", 

26 "skew_parity_ratio-score", 

27 "ndkl-score", 

28 "demographic_parity_ratio-score", 

29 "balance_ratio-score", 

30 "qualified_demographic_parity_ratio-score", 

31 "qualified_balance_ratio-score", 

32 "calibrated_demographic_parity_ratio-score", 

33 "calibrated_balance_ratio-score", 

34 "relevance_parity_ratio-score", 

35 "score_parity_ratio-score", 

36 "score_balance_ratio-score", 

37] 



class RankingFairness(Evaluator):
    """
    Ranking fairness evaluator for Credo AI (Experimental)

    This evaluator takes in ranking results and performs a fairness assessment.
    The results should include rankings, sensitive features, and optionally, scores.

    The metrics that the evaluator can calculate are:

    * **skew_parity_difference**: max_skew - min_skew, where skew is the proportion of the selected
      items from a group over the desired proportion for that group.
      It ranges from 0 to inf and the ideal value is 0.

    * **skew_parity_ratio**: min_skew / max_skew, where skew is the proportion of the selected
      items from a group over the desired proportion for that group.
      It ranges from 0 to 1 and the ideal value is 1.
      (A small worked example of the skew metrics is sketched in the comment block
      that follows this docstring.)

    * **ndkl**: a metric that accounts for increasing ranks. It is non-negative, with larger values
      indicating a greater divergence between the desired and actual distributions of
      sensitive attribute labels. It ranges from 0 to inf and the ideal value is 0.

    * **demographic_parity_ratio**: min_selection_rate / max_selection_rate, where selection rate
      is the proportion of the selected items from a group over the number of items for
      that group in the pool. It ranges from 0 to 1 and the ideal value is 1.

    * **balance_ratio**: min_presence / max_presence, where presence is the number of the selected
      items from a group. It ranges from 0 to 1 and the ideal value is 1.

    * **qualified_demographic_parity_ratio**: demographic_parity_ratio but with a qualified (i.e., score
      greater than or equal to q) filter applied to the items. It ranges from 0 to 1 and the ideal value is 1.

    * **qualified_balance_ratio**: balance_ratio but with a qualified (i.e., score greater than or equal
      to q) filter applied to the items. It ranges from 0 to 1 and the ideal value is 1.

    * **calibrated_demographic_parity_ratio**: demographic_parity_ratio but with the selected set drawn from
      specified score bins. This is to audit whether items with similar scores are treated similarly
      (via proportional presence) regardless of group membership. It ranges from 0 to 1 and the ideal value is 1.

    * **calibrated_balance_ratio**: balance_ratio but with the selected set drawn from
      specified score bins. This is to audit whether items with similar scores are treated similarly
      (via equal presence) regardless of group membership. It ranges from 0 to 1 and the ideal value is 1.

    * **relevance_parity_ratio**: audits whether groups are represented in proportion to their average score
      (i.e., score-based relevance). It ranges from 0 to 1 and the ideal value is 1.

    * **score_parity_ratio**: min_average_score / max_average_score, where average score
      is the average score of the selected items from a group.
      It ranges from 0 to 1 and the ideal value is 1.

    * **score_balance_ratio**: min_total_score / max_total_score, where total score
      is the total score of the selected items from a group. It ranges from 0 to 1 and the ideal value is 1.

    * **score_empirical_distribution**: score empirical distributions for each demographic group, as tables.
      The x axis is scores and the y axis is cumulative probabilities (ranging from 0 to 1).
      It is useful for a visual examination of the distribution of scores for the different groups.

    Parameters
    ----------
    sensitive_features : pandas.Series
        A series of the sensitive feature labels (e.g., "male", "female") which should
        be used to create subgroups
    rankings : pandas.Series of type int
        The computed ranks.
        It should be passed to TabularData's y argument with the column name `rankings`
    scores : pandas.Series of type int or float, Optional
        A series of the scores.
        It should be passed to TabularData's y argument with the column name `scores`
    k : int, Optional
        The top k items are considered as the selected subset.
        If not provided, the top 50% of the items are considered as selected
    q : float, Optional
        The relevance score for which items in the pool that have score >= q are "relevant".
        These two metrics require this to be provided: `qualified_demographic_parity_ratio`
        and `qualified_balance_ratio`
    lb_bin : numpy array of shape = (n_bins), Optional
        The lower bound scores for each bin (bin is greater than or equal to lower bound).
        These two metrics require this to be provided: `calibrated_demographic_parity_ratio`
        and `calibrated_balance_ratio`
    ub_bin : numpy array of shape = (n_bins), Optional
        The upper bound scores for each bin (bin is less than upper bound).
        These two metrics require this to be provided: `calibrated_demographic_parity_ratio`
        and `calibrated_balance_ratio`
    desired_proportions : dict, Optional
        The desired proportion for each subgroup (e.g., {"male": 0.4, "female": 0.6}).
        If not provided, the group proportions observed in the pool are used when
        calculating the `skew` and `ndkl` scores
    down_sampling_step : int, optional
        Down-sampling step for the scores empirical distribution curve.
        If not provided, the step is chosen so that the curve has roughly 100 points
    """
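
    # Illustrative example of the skew metrics, following the definitions in the
    # docstring above. Suppose the pool holds 10 items (6 "male", 4 "female"),
    # k = 5, and the desired proportions default to the pool proportions
    # {"male": 0.6, "female": 0.4}. If the top-5 subset contains 4 "male" and
    # 1 "female" item, then:
    #   skew_male   = (4 / 5) / 0.6 ≈ 1.333
    #   skew_female = (1 / 5) / 0.4 = 0.500
    #   skew_parity_difference = 1.333 - 0.500 ≈ 0.833
    #   skew_parity_ratio      = 0.500 / 1.333 ≈ 0.375
    # The ndkl score applies an analogous desired-vs-observed comparison at every
    # prefix of the ranking, with deeper ranks discounted logarithmically.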

    def __init__(
        self,
        k: int = None,
        q: float = None,
        lb_bin: list = None,
        ub_bin: list = None,
        desired_proportions: dict = None,
        down_sampling_step: int = None,
    ):
        # Store the provided proportions (or None); if None, `_setup` falls back
        # to the group proportions observed in the pool.
        self.desired_proportions = desired_proportions
        self.k = k
        self.q = q
        self.down_sampling_step = down_sampling_step
        if lb_bin is not None and ub_bin is not None:
            self.lb_bin = np.array(lb_bin)
            self.ub_bin = np.array(ub_bin)
        else:
            self.lb_bin = lb_bin
            self.ub_bin = ub_bin

    required_artifacts = ["data", "sensitive_feature"]

    def _validate_arguments(self):
        check_data_instance(self.data, TabularData)
        check_existence(self.data.sensitive_features, "sensitive_features")
        check_feature_presence("rankings", self.data.y, "y")
        check_data_for_nulls(self.data, "Data")

        return self

    def _setup(self):
        self.pool_rankings = np.array(self.data.y.rankings)
        self.pool_sensitive_features = np.array(self.data.sensitive_feature)
        self.sf_name = self.data.sensitive_feature.name
        if self.k is None:
            self.k = int(len(self.pool_rankings) / 2)

        if self.down_sampling_step is None:
            self.down_sampling_step = max(int(len(self.pool_rankings) / 100), 1)

        # Sort rankings ascending and apply the same ordering to the sensitive
        # features, in case they are not already sorted
        p = self.pool_rankings.argsort()
        self.pool_rankings = self.pool_rankings[p]
        self.pool_sensitive_features = self.pool_sensitive_features[p]

        self.pool_groups = list(set(self.pool_sensitive_features))
        self.num_items = len(self.pool_rankings)

        self.subset_sensitive_features = self.pool_sensitive_features[: self.k]
        self.subset_groups = list(set(self.subset_sensitive_features))

        if "scores" in self.data.y:
            self.pool_scores = np.array(self.data.y.scores)
            if not np.issubdtype(self.pool_scores.dtype, np.number):
                raise ValidationError(
                    "`scores` array provided contains non-numeric elements."
                )

            self.subset_scores = self.pool_scores[: self.k]
        else:
            self.pool_scores = None
            self.subset_scores = None

        # If desired proportions are not provided, default to the pool proportions
        if not self.desired_proportions:
            uniques, counts = np.unique(
                self.pool_sensitive_features, return_counts=True
            )
            self.desired_proportions = dict(zip(uniques, counts / self.num_items))

        return self

    def evaluate(self):
        """
        Runs the assessment process

        Returns
        -------
        self
            The evaluator itself. Detailed results, keyed by assessment
            category, are stored in `self.results`.
        """
        # Skew parity metrics
        skew_parity_diff = skew_parity(
            self.subset_sensitive_features,
            self.desired_proportions,
            "difference",
        )
        skew_parity_ratio = skew_parity(
            self.subset_sensitive_features,
            self.desired_proportions,
            "ratio",
        )
        skew_results = {
            "skew_parity_difference-score": [{"value": skew_parity_diff}],
            "skew_parity_ratio-score": [{"value": skew_parity_ratio}],
        }

        # NDKL metric
        ndkl = normalized_discounted_cumulative_kl_divergence(
            self.pool_sensitive_features, self.desired_proportions
        )
        ndkl_results = {"ndkl-score": [{"value": ndkl}]}

        # FINS metrics
        fins_results = calculate_fins_metrics(
            self.pool_sensitive_features,
            self.subset_sensitive_features,
            self.pool_scores,
            self.subset_scores,
            self.lb_bin,
            self.ub_bin,
            self.q,
        )

        res = {**skew_results, **ndkl_results, **fins_results}
        self.results = self._format_results(res)

        # Score disaggregated empirical distributions
        if self.pool_scores is not None:
            for group in self.pool_groups:
                ind = np.where(self.pool_sensitive_features == group)
                group_scores = self.pool_scores[ind]
                emp_dist_df = empirical_distribution_curve(
                    group_scores, self.down_sampling_step, variable_name="scores"
                )
                emp_dist_df.name = "score_empirical_distribution"
                labels = {"sensitive_feature": self.sf_name, "group": group}
                e = TableContainer(
                    emp_dist_df,
                    **self.get_info(labels=labels),
                )
                self.results.append(e)

        return self

    def _format_results(self, res):
        """
        Format results from the evaluations.

        Parameters
        ----------
        res : dict
            All results of the evaluations
        """
        res = {k: v for k, v in res.items() if k in METRIC_SUBSET}

        # Reformat results
        labels = {"sensitive_feature": self.sf_name}
        res = [pd.DataFrame(v).assign(metric_type=k) for k, v in res.items()]
        res = pd.concat(res)
        res[["type", "subtype"]] = res.metric_type.str.split("-", expand=True)
        res.drop("metric_type", axis=1, inplace=True)
        return [MetricContainer(res, **self.get_info(labels=labels))]


############################################
## Evaluation helper functions
##
## Helper functions create evidence to be
## passed to .evaluate, where it is wrapped
## by evidence containers
############################################
def calculate_fins_metrics(
    pool_sensitive_features,
    subset_sensitive_features,
    pool_scores=None,
    subset_scores=None,
    lb_bin=None,
    ub_bin=None,
    q=None,
):
    """
    Calculates the group fairness metrics for subset selections defined in the FINS paper,
    using the FINS library.

    See the usage sketch at the end of this module for an illustrative call.

    Parameters
    ----------
    pool_sensitive_features : numpy array
        An array of the sensitive feature values of the items in the pool.
        If ranking is applicable, the array should be sorted accordingly.
    subset_sensitive_features : numpy array
        An array of the sensitive feature values of the items in the subset.
        If ranking is applicable, the array should be sorted accordingly.
    pool_scores : numpy array, Optional
        An array of the scores for the items in the pool
    subset_scores : numpy array, Optional
        An array of the scores for the items in the subset
    lb_bin : numpy array of shape = (n_bins), Optional
        The lower bound scores for each bin (bin is greater than or equal to lower bound).
        These two metrics require this to be provided: `calibrated_demographic_parity_ratio`
        and `calibrated_balance_ratio`
    ub_bin : numpy array of shape = (n_bins), Optional
        The upper bound scores for each bin (bin is less than upper bound).
        These two metrics require this to be provided: `calibrated_demographic_parity_ratio`
        and `calibrated_balance_ratio`
    q : float, Optional
        The relevance score for which items in the pool that have score >= q are "relevant".
        These two metrics require this to be provided: `qualified_demographic_parity_ratio`
        and `qualified_balance_ratio`

    Returns
    -------
    fins_metrics : dict
        All results of the FINS evaluations

    References
    ----------
    Cachel, Kathleen, and Elke Rundensteiner. "FINS Auditing Framework:
    Group Fairness for Subset Selections." Proceedings of the 2022
    AAAI/ACM Conference on AI, Ethics, and Society. 2022.
    """
    fins_metrics = {}

    pool_items = np.arange(0, len(pool_sensitive_features))
    subset_items = np.arange(0, len(subset_sensitive_features))

    # represent sensitive feature values via consecutive integers
    lookupTable, pool_sf_int = np.unique(pool_sensitive_features, return_inverse=True)
    lookupTable, subset_sf_int = np.unique(
        subset_sensitive_features, return_inverse=True
    )

    selectRt, parity_score = fins.parity(
        pool_items, pool_sf_int, subset_items, subset_sf_int
    )
    fins_metrics["demographic_parity_ratio-score"] = [{"value": parity_score}]

    propOfS, balance_score = fins.balance(pool_sf_int, subset_items, subset_sf_int)
    fins_metrics["balance_ratio-score"] = [{"value": balance_score}]

    # Score-dependent metrics
    if subset_scores is not None:
        AvgScore, score_parity_score = fins.score_parity(
            subset_items, subset_scores, subset_sf_int
        )
        fins_metrics["score_parity_ratio-score"] = [{"value": score_parity_score}]

        TotalScore, score_balance_score = fins.score_balance(
            subset_items, subset_scores, subset_sf_int
        )
        fins_metrics["score_balance_ratio-score"] = [{"value": score_balance_score}]

        if pool_scores is not None:
            RselectRt, relevance_parity_score = fins.relevance_parity(
                pool_items,
                pool_scores,
                pool_sf_int,
                subset_items,
                subset_scores,
                subset_sf_int,
            )
            fins_metrics["relevance_parity_ratio-score"] = [
                {"value": relevance_parity_score}
            ]

            if q:
                QselectRt, qualified_parity_score = fins.qualififed_parity(
                    pool_items,
                    pool_scores,
                    pool_sf_int,
                    subset_items,
                    subset_scores,
                    subset_sf_int,
                    q,
                )
                fins_metrics["qualified_demographic_parity_ratio-score"] = [
                    {"value": qualified_parity_score}
                ]

                QpropOfS, qualified_balance_score = fins.qualified_balance(
                    pool_items,
                    pool_scores,
                    pool_sf_int,
                    subset_items,
                    subset_scores,
                    subset_sf_int,
                    q,
                )
                fins_metrics["qualified_balance_ratio-score"] = [
                    {"value": qualified_balance_score}
                ]

            if lb_bin is not None and ub_bin is not None:
                (
                    bin_group_selection_proportions,
                    calibrated_parity_score,
                ) = fins.calibrated_parity(
                    pool_items,
                    pool_scores,
                    pool_sf_int,
                    subset_items,
                    subset_scores,
                    subset_sf_int,
                    lb_bin,
                    ub_bin,
                )
                fins_metrics["calibrated_demographic_parity_ratio-score"] = [
                    {"value": calibrated_parity_score}
                ]

                (
                    bin_group_proportions,
                    calibrated_balance_score,
                ) = fins.calibrated_balance(
                    pool_items,
                    pool_scores,
                    pool_sf_int,
                    subset_items,
                    subset_scores,
                    subset_sf_int,
                    lb_bin,
                    ub_bin,
                )
                fins_metrics["calibrated_balance_ratio-score"] = [
                    {"value": calibrated_balance_score}
                ]

    return fins_metrics
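

# ---------------------------------------------------------------------------
# Minimal illustrative sketch (toy, made-up data): it exercises the metric
# helpers above directly, mirroring the calls made in
# `RankingFairness.evaluate`. Running it assumes the optional
# `finsfairauditing` dependency is installed.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # A toy pool of 10 ranked items (best first) with group labels and scores
    pool_sf = np.array(["m", "m", "f", "m", "f", "m", "f", "m", "m", "f"])
    pool_scores = np.array([0.95, 0.9, 0.88, 0.8, 0.7, 0.66, 0.6, 0.5, 0.4, 0.3])

    k = 5  # the top-k items form the selected subset
    subset_sf = pool_sf[:k]
    subset_scores = pool_scores[:k]

    # Desired proportions default to the pool proportions, as in `_setup`
    uniques, counts = np.unique(pool_sf, return_counts=True)
    desired = dict(zip(uniques, counts / len(pool_sf)))

    print("skew difference:", skew_parity(subset_sf, desired, "difference"))
    print("skew ratio:", skew_parity(subset_sf, desired, "ratio"))
    print("ndkl:", normalized_discounted_cumulative_kl_divergence(pool_sf, desired))
    print(calculate_fins_metrics(pool_sf, subset_sf, pool_scores, subset_scores))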