Coverage for credoai/evaluators/ranking_fairness.py: 90%

134 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2022-12-08 07:32 +0000

1"""Ranking Fairness evaluator""" 

2import math 

3 

4import numpy as np 

5import pandas as pd 

6from connect.evidence import MetricContainer, TableContainer 

7from finsfairauditing import fins 

8 

9from credoai.artifacts import TabularData 

10from credoai.evaluators import Evaluator 

11from credoai.evaluators.utils.validation import ( 

12 check_artifact_for_nulls, 

13 check_data_instance, 

14 check_existence, 

15 check_feature_presence, 

16) 

17from credoai.utils.common import ValidationError 

18from credoai.utils.dataset_utils import empirical_distribution_curve 

19 

20EPSILON = 1e-12 

21METRIC_SUBSET = [ 

22 "skew_parity_difference-score", 

23 "ndkl-score", 

24 "demographic_parity_ratio-score", 

25 "balance_ratio-score", 

26 "qualified_demographic_parity_ratio-score", 

27 "qualified_balance_ratio-score", 

28 "calibrated_demographic_parity_ratio-score", 

29 "calibrated_balance_ratio-score", 

30 "relevance_parity_ratio-score", 

31 "score_parity_ratio-score", 

32 "score_balance_ratio-score", 

33] 

34 

35 

36class RankingFairness(Evaluator): 

37 """ 

38 Ranking fairness evaluator for Credo AI 

39 

40 This module takes in ranking results and provides functionality to perform fairness assessment 

41 The results should include rankings, sensitive features, and optionally, scores. 

42 

43 The scores that the evaluator can calculate are: 

44 

45 * **skew_parity_difference**: max_skew - min_skew, where skew is the proportion of the selected 

46 items from a group over the desired proportion for that group. 

47 It ranges from 0 to inf and the ideal value is 0. 

48 

49 * **ndkl**: a metric that accounts for increasing ranks. It is non-negative, with larger values 

50 indicating a greater divergence between the desired and actual distributions of 

51 sensitive attribute labels. It ranges from 0 to inf and the ideal value is 0. 

52 

53 * **demographic_parity_ratio**: min_selection_rate / max_selection_rate, where selection rate 

54 is the proportion of the selected items from a group over the number of items for 

55 that group in the pool. It ranges from 0 to 1 and ideal value is 1. 

56 

57 * **balance_ratio**: min_presence / max_presence, where presence is the number of the selected items 

58 from a group. It ranges from 0 to 1 and ideal value is 1. 

59 

60 * **qualified_demographic_parity_ratio**: demographic_parity_ratio but with a qualified (i.e., score 

61 greater than or equal to q) filter applied to the items. It ranges from 0 to 1 and ideal value is 1. 

62 

63 * **qualified_balance_ratio**: balance_ratio but with a qualified (i.e., score greater than or equal 

64 to q) filter applied to the items. It ranges from 0 to 1 and ideal value is 1. 

65 

66 * **calibrated_demographic_parity_ratio**: demographic_parity_ratio but with the selected set from 

67 specified score bins. This is to audit if items with similar scores are are treated similarly 

68 (via proportional presence) regardless of group membership. It ranges from 0 to 1 and ideal value is 1. 

69 

70 * **calibrated_balance_ratio**: balance_ratio but with the selected set from 

71 specified score bins. This is to audit if items with similar scores are are treated similarly 

72 (via equal presence) regardless of group membership. It ranges from 0 to 1 and ideal value is 1. 

73 

74 * **relevance_parity_ratio**: to audit if groups are represented proportional to their average score 

75 (i.e., score-based relevance). It ranges from 0 to 1 and ideal value is 1. 

76 

77 * **score_parity_ratio**: min_average_Score / max_average_Score, where average score 

78 is the average score of the selected items from a group. 

79 It ranges from 0 to 1 and ideal value is 1. 

80 

81 * **score_balance_ratio**: min_total_Score / max_total_Score, where total score 

82 is the total score of the selected items from a group. It ranges from 0 to 1 and ideal value is 1. 

83 

84 * **score_empirical_distribution**: score empirical distributions for each demographic group as tables. 

85 The x axis is scores and the y axis is cumulative probabilities (ranges from 0 to 1) 

86 It is useful for a visual examination of the distribution of scores for the different groups. 

87 

88 Parameters 

89 ---------- 

90 sensitive_features : pandas.Series 

91 A series of the sensitive feature labels (e.g., "male", "female") which should 

92 be used to create subgroups 

93 rankings : pandas.Series of type int 

94 The computed ranks 

95 It should be passed to TabularData's y argument with the column name `rankings` 

96 scores : pandas.Series of type int or float, Optional 

97 A series of the scores 

98 It should be passed to TabularData's y argument with the column name `scores` 

99 k: int, Optional 

100 The top k items are considered as the selected subset 

101 If not provided, the top 50% of the items are considered as selected 

102 q: float, Optional 

103 The relevance score for which items in the pool that have score >= q are "relevant". 

104 These two metrics require this to be provided: `qualified_demographic_parity_ratio` 

105 and `qualified_balance_ratio` 

106 lb_bin: numpy array of shape = (n_bins), Optional 

107 The lower bound scores for each bin (bin is greater than or equal to lower bound). 

108 These two metrics require this to be provided: `calibrated_demographic_parity_ratio` 

109 and `calibrated_balance_ratio` 

110 ub_bin: numpy array of shape = (n_bins), Optional 

111 The upper bound scores for each bin (bin is less than upper bound). 

112 These two metrics require this to be provided: `calibrated_demographic_parity_ratio` 

113 and `calibrated_balance_ratio` 

114 desired_proportions: dict, Optional 

115 The desired proportion for each subgroups (e.g., {"male":0.4, "female":0.6}) 

116 If not provided, equal proportions are used for calculation of `skew` score 

117 down_sampling_step : int, optional 

118 down-sampling step for scores empirical distribution curve 

119 If not provided, down-sampling is done such that the curve length be nearly 100 

120 """ 

121 

122 def __init__( 

123 self, 

124 k: int = None, 

125 q: float = None, 

126 lb_bin: list = None, 

127 ub_bin: list = None, 

128 desired_proportions: dict = None, 

129 down_sampling_step: int = None, 

130 ): 

131 self.desired_proportions = (desired_proportions,) 

132 self.k = k 

133 self.q = q 

134 self.down_sampling_step = down_sampling_step 

135 if lb_bin is not None and ub_bin is not None: 

136 self.lb_bin = np.array(lb_bin) 

137 self.ub_bin = np.array(ub_bin) 

138 else: 

139 self.lb_bin = lb_bin 

140 self.ub_bin = ub_bin 

141 

142 required_artifacts = ["data", "sensitive_feature"] 

143 

144 def _validate_arguments(self): 

145 check_data_instance(self.data, TabularData) 

146 check_existence(self.data.sensitive_features, "sensitive_features") 

147 check_feature_presence("rankings", self.data.y, "y") 

148 check_artifact_for_nulls(self.data, "Data") 

149 

150 return self 

151 

152 def _setup(self): 

153 self.pool_rankings = np.array(self.data.y.rankings) 

154 self.pool_sensitive_features = np.array(self.data.sensitive_feature) 

155 self.sf_name = self.data.sensitive_feature.name 

156 if self.k is None: 

157 self.k = int(len(self.pool_rankings) / 2) 

158 

159 if self.down_sampling_step is None: 

160 self.down_sampling_step = max(int(len(self.pool_rankings) / 100), 1) 

161 

162 # Sort ascending in parallel in case not already sorted 

163 p = self.pool_rankings.argsort() 

164 self.pool_rankings = self.pool_rankings[p] 

165 self.pool_sensitive_features = self.pool_sensitive_features[p] 

166 

167 self.pool_groups = list(set(self.pool_sensitive_features)) 

168 self.pool_items = np.arange(0, len(self.pool_rankings)) 

169 self.num_items = len(self.pool_rankings) 

170 

171 self.subset_sensitive_features = self.pool_sensitive_features[: self.k] 

172 self.subset_items = self.pool_items[: self.k] 

173 self.subset_groups = list(set(self.subset_sensitive_features)) 

174 

175 if "scores" in self.data.y: 

176 self.pool_scores = np.array(self.data.y.scores) 

177 if not np.issubdtype(self.pool_scores.dtype, np.number): 

178 raise ValidationError( 

179 "`scores` array provided contains non-numeric elements." 

180 ) 

181 

182 self.subset_scores = self.pool_scores[: self.k] 

183 else: 

184 self.pool_scores = None 

185 self.subset_scores = None 

186 

187 # if desired proportions are not provided, set it to the pool proportions 

188 if not all(self.desired_proportions): 

189 uniques, counts = np.unique( 

190 self.pool_sensitive_features, return_counts=True 

191 ) 

192 self.desired_proportions = dict(zip(uniques, counts / self.num_items)) 

193 

194 return self 

195 

196 def evaluate(self): 

197 """ 

198 Runs the assessment process 

199 

200 Returns 

201 ------- 

202 dict, nested 

203 Key: assessment category 

204 Values: detailed results associated with each category 

205 """ 

206 

207 skew_results = self._skew() 

208 ndkl_results = self._ndkl() 

209 fins_results = self._fins() 

210 

211 res = {**skew_results, **ndkl_results, **fins_results} 

212 

213 self.results = self._format_results(res) 

214 

215 if self.pool_scores is not None: 

216 self.results.append(self._score_distribution()) 

217 

218 return self 

219 

220 def _format_results(self, res): 

221 """ 

222 Format results from the evaluations. 

223 

224 Parameters 

225 ---------- 

226 res : dict 

227 All results of the evaluations 

228 

229 """ 

230 

231 res = {k: v for k, v in res.items() if k in METRIC_SUBSET} 

232 

233 # Reformat results 

234 res = [pd.DataFrame(v).assign(metric_type=k) for k, v in res.items()] 

235 res = pd.concat(res) 

236 res[["type", "subtype"]] = res.metric_type.str.split("-", expand=True) 

237 res.drop("metric_type", axis=1, inplace=True) 

238 return [MetricContainer(res, **self.get_container_info())] 

239 

240 def _skew(self): 

241 """ 

242 Calculates skew parity 

243 

244 For every group, skew is the proportion of the selected candidates 

245 from that group over the desired proportion for that group. 

246 

247 Returns 

248 ------- 

249 dict 

250 skew parity difference 

251 """ 

252 uniques, counts = np.unique(self.subset_sensitive_features, return_counts=True) 

253 subset_proportions = dict(zip(uniques, counts / self.k)) 

254 

255 skew = {} 

256 for g in self.pool_groups: 

257 sk = (subset_proportions[g] + EPSILON) / ( 

258 self.desired_proportions[g] + EPSILON 

259 ) 

260 skew[g] = sk 

261 

262 skew = { 

263 "skew_parity_difference-score": [ 

264 {"value": max(skew.values()) - min(skew.values())} 

265 ] 

266 } 

267 

268 return skew 

269 

270 def _kld(self, dist_1, dist_2): 

271 """ 

272 Calculates KL divergence 

273 

274 Parameters 

275 ---------- 

276 dist_1 : list 

277 first distribution 

278 dist_2 : list 

279 second distribution 

280 

281 Returns 

282 ------- 

283 float 

284 KL divergence 

285 """ 

286 vals = [] 

287 for p1, p2 in zip(dist_1, dist_2): 

288 vals.append(p1 * math.log((p1 + EPSILON) / (p2 + EPSILON))) 

289 

290 return sum(vals) 

291 

292 def _ndkl(self): 

293 """ 

294 Calculates normalized discounted cumulative KL-divergence (ndkl) 

295 

296 It is based on the following paper: 

297 Geyik, Sahin Cem, Stuart Ambler, and Krishnaram Kenthapadi. "Fairness-aware ranking in search & 

298 recommendation systems with application to linkedin talent search." 

299 Proceedings of the 25th acm sigkdd international conference on knowledge discovery & data mining. 2019. 

300 

301 Returns 

302 ------- 

303 dict 

304 normalized discounted cumulative KL-divergence (ndkl) 

305 """ 

306 Z = np.sum(1 / (np.log2(np.arange(1, self.num_items + 1) + 1))) 

307 

308 total = 0.0 

309 for k in range(1, self.num_items + 1): 

310 item_attr_k = list(self.pool_sensitive_features[:k]) 

311 item_distr = [ 

312 item_attr_k.count(attr) / len(item_attr_k) 

313 for attr in self.desired_proportions.keys() 

314 ] 

315 total += (1 / math.log2(k + 1)) * self._kld( 

316 item_distr, list(self.desired_proportions.values()) 

317 ) 

318 

319 ndkl = {"ndkl-score": [{"value": (1 / Z) * total}]} 

320 

321 return ndkl 

322 

323 def _fins(self): 

324 """ 

325 Calculates group fairness metrics for subset selections from FINS paper and library 

326 

327 It is based on the following paper: 

328 Cachel, Kathleen, and Elke Rundensteiner. "FINS Auditing Framework: Group Fairness for Subset Selections." 

329 Proceedings of the 2022 AAAI/ACM Conference on AI, Ethics, and Society. 2022. 

330 

331 Returns 

332 ------- 

333 dict 

334 fairness metrics 

335 """ 

336 fins_metrics = {} 

337 

338 # represent sensitive feature values via consecutive integers 

339 lookupTable, pool_sf_int = np.unique( 

340 self.pool_sensitive_features, return_inverse=True 

341 ) 

342 lookupTable, subset_sf_int = np.unique( 

343 self.subset_sensitive_features, return_inverse=True 

344 ) 

345 

346 selectRt, parity_score = fins.parity( 

347 self.pool_items, pool_sf_int, self.subset_items, subset_sf_int 

348 ) 

349 fins_metrics["demographic_parity_ratio-score"] = [{"value": parity_score}] 

350 

351 propOfS, balance_score = fins.balance( 

352 pool_sf_int, self.subset_items, subset_sf_int 

353 ) 

354 fins_metrics["balance_ratio-score"] = [{"value": balance_score}] 

355 

356 # Score-dependant metrics 

357 if self.subset_scores is not None: 

358 AvgScore, score_parity_score = fins.score_parity( 

359 self.subset_items, self.subset_scores, subset_sf_int 

360 ) 

361 fins_metrics["score_parity_ratio-score"] = [{"value": score_parity_score}] 

362 

363 TotalScore, score_balance_score = fins.score_balance( 

364 self.subset_items, self.subset_scores, subset_sf_int 

365 ) 

366 fins_metrics["score_balance_ratio-score"] = [{"value": score_balance_score}] 

367 

368 if self.pool_scores is not None: 

369 RselectRt, relevance_parity_score = fins.relevance_parity( 

370 self.pool_items, 

371 self.pool_scores, 

372 pool_sf_int, 

373 self.subset_items, 

374 self.subset_scores, 

375 subset_sf_int, 

376 ) 

377 fins_metrics["relevance_parity_ratio-score"] = [ 

378 {"value": relevance_parity_score} 

379 ] 

380 

381 if self.q: 

382 QselectRt, qualified_parity_score = fins.qualified_parity( 

383 self.pool_items, 

384 self.pool_scores, 

385 pool_sf_int, 

386 self.subset_items, 

387 self.subset_scores, 

388 subset_sf_int, 

389 self.q, 

390 ) 

391 fins_metrics["qualified_demographic_parity_ratio-score"] = [ 

392 {"value": qualified_parity_score} 

393 ] 

394 

395 QpropOfS, qualified_balance_score = fins.qualified_balance( 

396 self.pool_items, 

397 self.pool_scores, 

398 pool_sf_int, 

399 self.subset_items, 

400 self.subset_scores, 

401 subset_sf_int, 

402 self.q, 

403 ) 

404 fins_metrics["qualified_balance_ratio-score"] = [ 

405 {"value": qualified_balance_score} 

406 ] 

407 

408 if self.lb_bin is not None and self.ub_bin is not None: 

409 ( 

410 bin_group_selection_proportions, 

411 calibrated_parity_score, 

412 ) = fins.calibrated_parity( 

413 self.pool_items, 

414 self.pool_scores, 

415 pool_sf_int, 

416 self.subset_items, 

417 self.subset_scores, 

418 subset_sf_int, 

419 self.lb_bin, 

420 self.ub_bin, 

421 ) 

422 fins_metrics["calibrated_demographic_parity_ratio-score"] = [ 

423 {"value": calibrated_parity_score} 

424 ] 

425 

426 ( 

427 bin_group_proportions, 

428 calibrated_balance_score, 

429 ) = fins.calibrated_balance( 

430 self.pool_items, 

431 self.pool_scores, 

432 pool_sf_int, 

433 self.subset_items, 

434 self.subset_scores, 

435 subset_sf_int, 

436 self.lb_bin, 

437 self.ub_bin, 

438 ) 

439 fins_metrics["calibrated_balance_ratio-score"] = [ 

440 {"value": calibrated_balance_score} 

441 ] 

442 

443 return fins_metrics 

444 

445 def _score_distribution(self): 

446 """ 

447 Calculates scores empirical distribution curve for each demographic group 

448 """ 

449 

450 groups = np.unique(self.pool_sensitive_features) 

451 for group in groups: 

452 ind = np.where(self.pool_sensitive_features == group) 

453 group_scores = self.pool_scores[ind] 

454 emp_dist_df = empirical_distribution_curve( 

455 group_scores, self.down_sampling_step, variable_name="scores" 

456 ) 

457 emp_dist_df.name = "score_empirical_distribution" 

458 

459 labels = {"sensitive_feature": self.sf_name, "group": group} 

460 

461 e = TableContainer( 

462 emp_dist_df, 

463 **self.get_container_info(labels=labels), 

464 ) 

465 return e