Coverage for credoai/evaluators/ranking_fairness.py: 88%

1"""Ranking Fairness evaluator""" 

2 

3import numpy as np 

4import pandas as pd 

5from connect.evidence import MetricContainer, TableContainer 

6from finsfairauditing import fins 

7 

8from credoai.artifacts import TabularData 

9from credoai.evaluators.evaluator import Evaluator 

10from credoai.evaluators.utils.validation import ( 

11 check_data_for_nulls, 

12 check_data_instance, 

13 check_existence, 

14 check_feature_presence, 

15) 

16from credoai.modules.metrics_credoai import ( 

17 normalized_discounted_cumulative_kl_divergence, 

18 skew_parity, 

19) 

20from credoai.utils.common import ValidationError 

21from credoai.utils.dataset_utils import empirical_distribution_curve 

22 

23EPSILON = 1e-12 

24METRIC_SUBSET = [ 

25 "skew_parity_difference-score", 

26 "skew_parity_ratio-score", 

27 "ndkl-score", 

28 "demographic_parity_ratio-score", 

29 "balance_ratio-score", 

30 "qualified_demographic_parity_ratio-score", 

31 "qualified_balance_ratio-score", 

32 "calibrated_demographic_parity_ratio-score", 

33 "calibrated_balance_ratio-score", 

34 "relevance_parity_ratio-score", 

35 "score_parity_ratio-score", 

36 "score_balance_ratio-score", 

37] 



class RankingFairness(Evaluator):
    """
    Ranking fairness evaluator for Credo AI (Experimental)

    This evaluator takes in ranking results and performs a fairness assessment.
    The results should include rankings, sensitive features, and optionally, scores.

    The metrics that the evaluator can calculate are:

    * **skew_parity_difference**: max_skew - min_skew, where skew is the proportion of the selected
      items from a group over the desired proportion for that group.
      It ranges from 0 to inf and the ideal value is 0.

    * **skew_parity_ratio**: min_skew / max_skew, where skew is the proportion of the selected
      items from a group over the desired proportion for that group.
      It ranges from 0 to 1 and the ideal value is 1.
      (A small worked example of the skew metrics is sketched in the comment block
      that follows this docstring.)

    * **ndkl**: a metric that accounts for increasing ranks. It is non-negative, with larger values
      indicating a greater divergence between the desired and actual distributions of
      sensitive attribute labels. It ranges from 0 to inf and the ideal value is 0.

    * **demographic_parity_ratio**: min_selection_rate / max_selection_rate, where selection rate
      is the proportion of the selected items from a group over the number of items for
      that group in the pool. It ranges from 0 to 1 and the ideal value is 1.

    * **balance_ratio**: min_presence / max_presence, where presence is the number of the selected
      items from a group. It ranges from 0 to 1 and the ideal value is 1.

    * **qualified_demographic_parity_ratio**: demographic_parity_ratio but with a qualified (i.e., score
      greater than or equal to q) filter applied to the items. It ranges from 0 to 1 and the ideal value is 1.

    * **qualified_balance_ratio**: balance_ratio but with a qualified (i.e., score greater than or equal
      to q) filter applied to the items. It ranges from 0 to 1 and the ideal value is 1.

    * **calibrated_demographic_parity_ratio**: demographic_parity_ratio but with the selected set drawn from
      specified score bins. This is to audit whether items with similar scores are treated similarly
      (via proportional presence) regardless of group membership. It ranges from 0 to 1 and the ideal value is 1.

    * **calibrated_balance_ratio**: balance_ratio but with the selected set drawn from
      specified score bins. This is to audit whether items with similar scores are treated similarly
      (via equal presence) regardless of group membership. It ranges from 0 to 1 and the ideal value is 1.

    * **relevance_parity_ratio**: audits whether groups are represented in proportion to their average score
      (i.e., score-based relevance). It ranges from 0 to 1 and the ideal value is 1.

    * **score_parity_ratio**: min_average_score / max_average_score, where average score
      is the average score of the selected items from a group.
      It ranges from 0 to 1 and the ideal value is 1.

    * **score_balance_ratio**: min_total_score / max_total_score, where total score
      is the total score of the selected items from a group. It ranges from 0 to 1 and the ideal value is 1.

    * **score_empirical_distribution**: score empirical distributions for each demographic group, as tables.
      The x axis is scores and the y axis is cumulative probabilities (ranging from 0 to 1).
      It is useful for a visual examination of the distribution of scores for the different groups.

    Parameters
    ----------
    sensitive_features : pandas.Series
        A series of the sensitive feature labels (e.g., "male", "female") which should
        be used to create subgroups
    rankings : pandas.Series of type int
        The computed ranks.
        It should be passed to TabularData's y argument with the column name `rankings`
    scores : pandas.Series of type int or float, Optional
        A series of the scores.
        It should be passed to TabularData's y argument with the column name `scores`
    k : int, Optional
        The top k items are considered as the selected subset.
        If not provided, the top 50% of the items are considered as selected
    q : float, Optional
        The relevance score for which items in the pool that have score >= q are "relevant".
        These two metrics require this to be provided: `qualified_demographic_parity_ratio`
        and `qualified_balance_ratio`
    lb_bin : numpy array of shape = (n_bins), Optional
        The lower bound scores for each bin (bin is greater than or equal to lower bound).
        These two metrics require this to be provided: `calibrated_demographic_parity_ratio`
        and `calibrated_balance_ratio`
    ub_bin : numpy array of shape = (n_bins), Optional
        The upper bound scores for each bin (bin is less than upper bound).
        These two metrics require this to be provided: `calibrated_demographic_parity_ratio`
        and `calibrated_balance_ratio`
    desired_proportions : dict, Optional
        The desired proportion for each subgroup (e.g., {"male": 0.4, "female": 0.6}).
        If not provided, the group proportions observed in the pool are used when
        calculating the `skew` and `ndkl` scores
    down_sampling_step : int, optional
        Down-sampling step for the scores empirical distribution curve.
        If not provided, the step is chosen so that the curve has roughly 100 points
    """
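
    # Illustrative example of the skew metrics, following the definitions in the
    # docstring above. Suppose the pool holds 10 items (6 "male", 4 "female"),
    # k = 5, and the desired proportions default to the pool proportions
    # {"male": 0.6, "female": 0.4}. If the top-5 subset contains 4 "male" and
    # 1 "female" item, then:
    #   skew_male   = (4 / 5) / 0.6 ≈ 1.333
    #   skew_female = (1 / 5) / 0.4 = 0.500
    #   skew_parity_difference = 1.333 - 0.500 ≈ 0.833
    #   skew_parity_ratio      = 0.500 / 1.333 ≈ 0.375
    # The ndkl score applies an analogous desired-vs-observed comparison at every
    # prefix of the ranking, with deeper ranks discounted logarithmically.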

    def __init__(
        self,
        k: int = None,
        q: float = None,
        lb_bin: list = None,
        ub_bin: list = None,
        desired_proportions: dict = None,
        down_sampling_step: int = None,
    ):
        # Store the provided proportions (or None); if None, `_setup` falls back
        # to the group proportions observed in the pool.
        self.desired_proportions = desired_proportions
        self.k = k
        self.q = q
        self.down_sampling_step = down_sampling_step
        if lb_bin is not None and ub_bin is not None:
            self.lb_bin = np.array(lb_bin)
            self.ub_bin = np.array(ub_bin)
        else:
            self.lb_bin = lb_bin
            self.ub_bin = ub_bin

    required_artifacts = ["data", "sensitive_feature"]

    def _validate_arguments(self):
        check_data_instance(self.data, TabularData)
        check_existence(self.data.sensitive_features, "sensitive_features")
        check_feature_presence("rankings", self.data.y, "y")
        check_data_for_nulls(self.data, "Data")

        return self

    def _setup(self):
        self.pool_rankings = np.array(self.data.y.rankings)
        self.pool_sensitive_features = np.array(self.data.sensitive_feature)
        self.sf_name = self.data.sensitive_feature.name
        if self.k is None:
            self.k = int(len(self.pool_rankings) / 2)

        if self.down_sampling_step is None:
            self.down_sampling_step = max(int(len(self.pool_rankings) / 100), 1)

        # Sort rankings ascending and apply the same ordering to the sensitive
        # features, in case they are not already sorted
        p = self.pool_rankings.argsort()
        self.pool_rankings = self.pool_rankings[p]
        self.pool_sensitive_features = self.pool_sensitive_features[p]

        self.pool_groups = list(set(self.pool_sensitive_features))
        self.num_items = len(self.pool_rankings)

        self.subset_sensitive_features = self.pool_sensitive_features[: self.k]
        self.subset_groups = list(set(self.subset_sensitive_features))

        if "scores" in self.data.y:
            self.pool_scores = np.array(self.data.y.scores)
            if not np.issubdtype(self.pool_scores.dtype, np.number):
                raise ValidationError(
                    "`scores` array provided contains non-numeric elements."
                )

            self.subset_scores = self.pool_scores[: self.k]
        else:
            self.pool_scores = None
            self.subset_scores = None

        # If desired proportions are not provided, default to the pool proportions
        if not self.desired_proportions:
            uniques, counts = np.unique(
                self.pool_sensitive_features, return_counts=True
            )
            self.desired_proportions = dict(zip(uniques, counts / self.num_items))

        return self

    def evaluate(self):
        """
        Runs the assessment process

        Returns
        -------
        self
            The evaluator itself. Detailed results, keyed by assessment
            category, are stored in `self.results`.
        """
        # Skew parity metrics
        skew_parity_diff = skew_parity(
            self.subset_sensitive_features,
            self.desired_proportions,
            "difference",
        )
        skew_parity_ratio = skew_parity(
            self.subset_sensitive_features,
            self.desired_proportions,
            "ratio",
        )
        skew_results = {
            "skew_parity_difference-score": [{"value": skew_parity_diff}],
            "skew_parity_ratio-score": [{"value": skew_parity_ratio}],
        }

        # NDKL metric
        ndkl = normalized_discounted_cumulative_kl_divergence(
            self.pool_sensitive_features, self.desired_proportions
        )
        ndkl_results = {"ndkl-score": [{"value": ndkl}]}

        # FINS metrics
        fins_results = calculate_fins_metrics(
            self.pool_sensitive_features,
            self.subset_sensitive_features,
            self.pool_scores,
            self.subset_scores,
            self.lb_bin,
            self.ub_bin,
            self.q,
        )

        res = {**skew_results, **ndkl_results, **fins_results}
        self.results = self._format_results(res)

        # Score disaggregated empirical distributions
        if self.pool_scores is not None:
            for group in self.pool_groups:
                ind = np.where(self.pool_sensitive_features == group)
                group_scores = self.pool_scores[ind]
                emp_dist_df = empirical_distribution_curve(
                    group_scores, self.down_sampling_step, variable_name="scores"
                )
                emp_dist_df.name = "score_empirical_distribution"
                labels = {"sensitive_feature": self.sf_name, "group": group}
                e = TableContainer(
                    emp_dist_df,
                    **self.get_info(labels=labels),
                )
                self.results.append(e)

        return self

    def _format_results(self, res):
        """
        Format results from the evaluations.

        Parameters
        ----------
        res : dict
            All results of the evaluations
        """
        res = {k: v for k, v in res.items() if k in METRIC_SUBSET}

        # Reformat results
        labels = {"sensitive_feature": self.sf_name}
        res = [pd.DataFrame(v).assign(metric_type=k) for k, v in res.items()]
        res = pd.concat(res)
        res[["type", "subtype"]] = res.metric_type.str.split("-", expand=True)
        res.drop("metric_type", axis=1, inplace=True)
        return [MetricContainer(res, **self.get_info(labels=labels))]


############################################
## Evaluation helper functions
##
## Helper functions create evidence to be
## passed to .evaluate, where it is wrapped
## by evidence containers
############################################
def calculate_fins_metrics(
    pool_sensitive_features,
    subset_sensitive_features,
    pool_scores=None,
    subset_scores=None,
    lb_bin=None,
    ub_bin=None,
    q=None,
):
    """
    Calculates the group fairness metrics for subset selections defined in the FINS paper,
    using the FINS library.

    See the usage sketch at the end of this module for an illustrative call.

    Parameters
    ----------
    pool_sensitive_features : numpy array
        An array of the sensitive feature values of the items in the pool.
        If ranking is applicable, the array should be sorted accordingly.
    subset_sensitive_features : numpy array
        An array of the sensitive feature values of the items in the subset.
        If ranking is applicable, the array should be sorted accordingly.
    pool_scores : numpy array, Optional
        An array of the scores for the items in the pool
    subset_scores : numpy array, Optional
        An array of the scores for the items in the subset
    lb_bin : numpy array of shape = (n_bins), Optional
        The lower bound scores for each bin (bin is greater than or equal to lower bound).
        These two metrics require this to be provided: `calibrated_demographic_parity_ratio`
        and `calibrated_balance_ratio`
    ub_bin : numpy array of shape = (n_bins), Optional
        The upper bound scores for each bin (bin is less than upper bound).
        These two metrics require this to be provided: `calibrated_demographic_parity_ratio`
        and `calibrated_balance_ratio`
    q : float, Optional
        The relevance score for which items in the pool that have score >= q are "relevant".
        These two metrics require this to be provided: `qualified_demographic_parity_ratio`
        and `qualified_balance_ratio`

    Returns
    -------
    fins_metrics : dict
        All results of the FINS evaluations

    References
    ----------
    Cachel, Kathleen, and Elke Rundensteiner. "FINS Auditing Framework:
    Group Fairness for Subset Selections." Proceedings of the 2022
    AAAI/ACM Conference on AI, Ethics, and Society. 2022.
    """
    fins_metrics = {}

    pool_items = np.arange(0, len(pool_sensitive_features))
    subset_items = np.arange(0, len(subset_sensitive_features))

    # represent sensitive feature values via consecutive integers
    lookupTable, pool_sf_int = np.unique(pool_sensitive_features, return_inverse=True)
    lookupTable, subset_sf_int = np.unique(
        subset_sensitive_features, return_inverse=True
    )

    selectRt, parity_score = fins.parity(
        pool_items, pool_sf_int, subset_items, subset_sf_int
    )
    fins_metrics["demographic_parity_ratio-score"] = [{"value": parity_score}]

    propOfS, balance_score = fins.balance(pool_sf_int, subset_items, subset_sf_int)
    fins_metrics["balance_ratio-score"] = [{"value": balance_score}]

    # Score-dependent metrics
    if subset_scores is not None:
        AvgScore, score_parity_score = fins.score_parity(
            subset_items, subset_scores, subset_sf_int
        )
        fins_metrics["score_parity_ratio-score"] = [{"value": score_parity_score}]

        TotalScore, score_balance_score = fins.score_balance(
            subset_items, subset_scores, subset_sf_int
        )
        fins_metrics["score_balance_ratio-score"] = [{"value": score_balance_score}]

        if pool_scores is not None:
            RselectRt, relevance_parity_score = fins.relevance_parity(
                pool_items,
                pool_scores,
                pool_sf_int,
                subset_items,
                subset_scores,
                subset_sf_int,
            )
            fins_metrics["relevance_parity_ratio-score"] = [
                {"value": relevance_parity_score}
            ]

            if q:
                QselectRt, qualified_parity_score = fins.qualififed_parity(
                    pool_items,
                    pool_scores,
                    pool_sf_int,
                    subset_items,
                    subset_scores,
                    subset_sf_int,
                    q,
                )
                fins_metrics["qualified_demographic_parity_ratio-score"] = [
                    {"value": qualified_parity_score}
                ]

                QpropOfS, qualified_balance_score = fins.qualified_balance(
                    pool_items,
                    pool_scores,
                    pool_sf_int,
                    subset_items,
                    subset_scores,
                    subset_sf_int,
                    q,
                )
                fins_metrics["qualified_balance_ratio-score"] = [
                    {"value": qualified_balance_score}
                ]

            if lb_bin is not None and ub_bin is not None:
                (
                    bin_group_selection_proportions,
                    calibrated_parity_score,
                ) = fins.calibrated_parity(
                    pool_items,
                    pool_scores,
                    pool_sf_int,
                    subset_items,
                    subset_scores,
                    subset_sf_int,
                    lb_bin,
                    ub_bin,
                )
                fins_metrics["calibrated_demographic_parity_ratio-score"] = [
                    {"value": calibrated_parity_score}
                ]

                (
                    bin_group_proportions,
                    calibrated_balance_score,
                ) = fins.calibrated_balance(
                    pool_items,
                    pool_scores,
                    pool_sf_int,
                    subset_items,
                    subset_scores,
                    subset_sf_int,
                    lb_bin,
                    ub_bin,
                )
                fins_metrics["calibrated_balance_ratio-score"] = [
                    {"value": calibrated_balance_score}
                ]

    return fins_metrics
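

# ---------------------------------------------------------------------------
# Minimal illustrative sketch (toy, made-up data): it exercises the metric
# helpers above directly, mirroring the calls made in
# `RankingFairness.evaluate`. Running it assumes the optional
# `finsfairauditing` dependency is installed.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # A toy pool of 10 ranked items (best first) with group labels and scores
    pool_sf = np.array(["m", "m", "f", "m", "f", "m", "f", "m", "m", "f"])
    pool_scores = np.array([0.95, 0.9, 0.88, 0.8, 0.7, 0.66, 0.6, 0.5, 0.4, 0.3])

    k = 5  # the top-k items form the selected subset
    subset_sf = pool_sf[:k]
    subset_scores = pool_scores[:k]

    # Desired proportions default to the pool proportions, as in `_setup`
    uniques, counts = np.unique(pool_sf, return_counts=True)
    desired = dict(zip(uniques, counts / len(pool_sf)))

    print("skew difference:", skew_parity(subset_sf, desired, "difference"))
    print("skew ratio:", skew_parity(subset_sf, desired, "ratio"))
    print("ndkl:", normalized_discounted_cumulative_kl_divergence(pool_sf, desired))
    print(calculate_fins_metrics(pool_sf, subset_sf, pool_scores, subset_scores))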