Coverage for credoai/evaluators/ranking_fairness.py: 90%
1"""Ranking Fairness evaluator"""
2import math
4import numpy as np
5import pandas as pd
6from connect.evidence import MetricContainer, TableContainer
7from finsfairauditing import fins
9from credoai.artifacts import TabularData
10from credoai.evaluators import Evaluator
11from credoai.evaluators.utils.validation import (
12 check_artifact_for_nulls,
13 check_data_instance,
14 check_existence,
15 check_feature_presence,
16)
17from credoai.utils.common import ValidationError
18from credoai.utils.dataset_utils import empirical_distribution_curve
20EPSILON = 1e-12
METRIC_SUBSET = [
    "skew_parity_difference-score",
    "ndkl-score",
    "demographic_parity_ratio-score",
    "balance_ratio-score",
    "qualified_demographic_parity_ratio-score",
    "qualified_balance_ratio-score",
    "calibrated_demographic_parity_ratio-score",
    "calibrated_balance_ratio-score",
    "relevance_parity_ratio-score",
    "score_parity_ratio-score",
    "score_balance_ratio-score",
]
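# Each key encodes "<metric>-<subtype>"; _format_results later splits on the
# hyphen to populate the `type` and `subtype` columns of the results table.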


class RankingFairness(Evaluator):
    """
    Ranking fairness evaluator for Credo AI.

    This module takes in ranking results and provides functionality to perform
    fairness assessment. The results should include rankings, sensitive features,
    and, optionally, scores.

    The scores that the evaluator can calculate are:

    * **skew_parity_difference**: max_skew - min_skew, where skew is the proportion of the selected
      items from a group over the desired proportion for that group.
      It ranges from 0 to inf and the ideal value is 0.

    * **ndkl**: a metric that accounts for increasing ranks. It is non-negative, with larger values
      indicating a greater divergence between the desired and actual distributions of
      sensitive attribute labels. It ranges from 0 to inf and the ideal value is 0.

    * **demographic_parity_ratio**: min_selection_rate / max_selection_rate, where selection rate
      is the proportion of the selected items from a group over the number of items for
      that group in the pool. It ranges from 0 to 1 and the ideal value is 1.

    * **balance_ratio**: min_presence / max_presence, where presence is the number of the selected
      items from a group. It ranges from 0 to 1 and the ideal value is 1.

    * **qualified_demographic_parity_ratio**: demographic_parity_ratio but with a qualified (i.e., score
      greater than or equal to q) filter applied to the items. It ranges from 0 to 1 and the ideal value is 1.

    * **qualified_balance_ratio**: balance_ratio but with a qualified (i.e., score greater than or equal
      to q) filter applied to the items. It ranges from 0 to 1 and the ideal value is 1.

    * **calibrated_demographic_parity_ratio**: demographic_parity_ratio but with the selected set drawn from
      specified score bins. This is to audit whether items with similar scores are treated similarly
      (via proportional presence) regardless of group membership. It ranges from 0 to 1 and the ideal value is 1.

    * **calibrated_balance_ratio**: balance_ratio but with the selected set drawn from
      specified score bins. This is to audit whether items with similar scores are treated similarly
      (via equal presence) regardless of group membership. It ranges from 0 to 1 and the ideal value is 1.

    * **relevance_parity_ratio**: audits whether groups are represented proportionally to their average
      score (i.e., score-based relevance). It ranges from 0 to 1 and the ideal value is 1.

    * **score_parity_ratio**: min_average_score / max_average_score, where average score
      is the average score of the selected items from a group.
      It ranges from 0 to 1 and the ideal value is 1.

    * **score_balance_ratio**: min_total_score / max_total_score, where total score
      is the total score of the selected items from a group. It ranges from 0 to 1 and the ideal value is 1.

    * **score_empirical_distribution**: score empirical distributions for each demographic group, as tables.
      The x axis is scores and the y axis is cumulative probabilities (ranging from 0 to 1).
      It is useful for a visual examination of the distribution of scores for the different groups.

    Parameters
    ----------
    sensitive_features : pandas.Series
        A series of the sensitive feature labels (e.g., "male", "female") which should
        be used to create subgroups.
    rankings : pandas.Series of type int
        The computed ranks.
        It should be passed to TabularData's y argument with the column name `rankings`.
    scores : pandas.Series of type int or float, optional
        A series of the scores.
        It should be passed to TabularData's y argument with the column name `scores`.
    k : int, optional
        The top k items are considered as the selected subset.
        If not provided, the top 50% of the items are considered as selected.
    q : float, optional
        The relevance threshold: items in the pool with score >= q are considered
        "relevant". These two metrics require this to be provided:
        `qualified_demographic_parity_ratio` and `qualified_balance_ratio`.
    lb_bin : numpy array of shape = (n_bins), optional
        The lower-bound scores for each bin (a bin includes scores greater than or
        equal to its lower bound). These two metrics require this to be provided:
        `calibrated_demographic_parity_ratio` and `calibrated_balance_ratio`.
    ub_bin : numpy array of shape = (n_bins), optional
        The upper-bound scores for each bin (a bin includes scores less than its
        upper bound). These two metrics require this to be provided:
        `calibrated_demographic_parity_ratio` and `calibrated_balance_ratio`.
    desired_proportions : dict, optional
        The desired proportion for each subgroup (e.g., {"male": 0.4, "female": 0.6}).
        If not provided, the pool proportions are used for calculation of the `skew` score.
    down_sampling_step : int, optional
        Down-sampling step for the scores empirical distribution curve.
        If not provided, down-sampling is done such that the curve length is approximately 100.
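
    Examples
    --------
    A minimal sketch of wiring up the evaluator (illustrative only: it assumes
    ``TabularData`` accepts ``name``/``y``/``sensitive_features`` keywords, and in
    practice evaluators are normally run through ``credoai.lens.Lens`` rather than
    called directly)::

        import pandas as pd
        from credoai.artifacts import TabularData
        from credoai.evaluators import RankingFairness

        y = pd.DataFrame(
            {"rankings": [1, 2, 3, 4, 5, 6], "scores": [9, 8, 7, 5, 3, 1]}
        )
        sf = pd.Series(["f", "m", "f", "m", "m", "m"], name="gender")
        data = TabularData(name="ranking_data", y=y, sensitive_features=sf)
        evaluator = RankingFairness(k=3)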
120 """

    def __init__(
        self,
        k: int = None,
        q: float = None,
        lb_bin: list = None,
        ub_bin: list = None,
        desired_proportions: dict = None,
        down_sampling_step: int = None,
    ):
        # note: no trailing comma here; a trailing comma would wrap the dict in a
        # one-element tuple and break the later per-group lookups
        self.desired_proportions = desired_proportions
        self.k = k
        self.q = q
        self.down_sampling_step = down_sampling_step
        if lb_bin is not None and ub_bin is not None:
            self.lb_bin = np.array(lb_bin)
            self.ub_bin = np.array(ub_bin)
        else:
            self.lb_bin = lb_bin
            self.ub_bin = ub_bin

    required_artifacts = ["data", "sensitive_feature"]

    def _validate_arguments(self):
        check_data_instance(self.data, TabularData)
        check_existence(self.data.sensitive_features, "sensitive_features")
        check_feature_presence("rankings", self.data.y, "y")
        check_artifact_for_nulls(self.data, "Data")

        return self

    def _setup(self):
        self.pool_rankings = np.array(self.data.y.rankings)
        self.pool_sensitive_features = np.array(self.data.sensitive_feature)
        self.sf_name = self.data.sensitive_feature.name
        if self.k is None:
            self.k = int(len(self.pool_rankings) / 2)

        if self.down_sampling_step is None:
            self.down_sampling_step = max(int(len(self.pool_rankings) / 100), 1)

        # Sort rankings ascending and reorder the sensitive features to match,
        # in case the data are not already sorted
        p = self.pool_rankings.argsort()
        self.pool_rankings = self.pool_rankings[p]
        self.pool_sensitive_features = self.pool_sensitive_features[p]

        self.pool_groups = list(set(self.pool_sensitive_features))
        self.pool_items = np.arange(0, len(self.pool_rankings))
        self.num_items = len(self.pool_rankings)

        self.subset_sensitive_features = self.pool_sensitive_features[: self.k]
        self.subset_items = self.pool_items[: self.k]
        self.subset_groups = list(set(self.subset_sensitive_features))

        if "scores" in self.data.y:
            self.pool_scores = np.array(self.data.y.scores)
            if not np.issubdtype(self.pool_scores.dtype, np.number):
                raise ValidationError(
                    "`scores` array provided contains non-numeric elements."
                )

            self.subset_scores = self.pool_scores[: self.k]
        else:
            self.pool_scores = None
            self.subset_scores = None

        # if desired proportions are not provided, set them to the pool proportions
        if self.desired_proportions is None:
            uniques, counts = np.unique(
                self.pool_sensitive_features, return_counts=True
            )
            self.desired_proportions = dict(zip(uniques, counts / self.num_items))

        return self

    def evaluate(self):
        """
        Runs the assessment process.

        Returns
        -------
        self
            The evaluator itself; the detailed results associated with each
            assessment category are stored in `self.results`
        """
        skew_results = self._skew()
        ndkl_results = self._ndkl()
        fins_results = self._fins()

        res = {**skew_results, **ndkl_results, **fins_results}

        self.results = self._format_results(res)

        if self.pool_scores is not None:
            # one empirical score-distribution table per demographic group
            self.results.extend(self._score_distribution())

        return self

    def _format_results(self, res):
        """
        Format results from the evaluations.

        Parameters
        ----------
        res : dict
            All results of the evaluations
        """
        res = {k: v for k, v in res.items() if k in METRIC_SUBSET}

        # Reformat results
        res = [pd.DataFrame(v).assign(metric_type=k) for k, v in res.items()]
        res = pd.concat(res)
        res[["type", "subtype"]] = res.metric_type.str.split("-", expand=True)
        res.drop("metric_type", axis=1, inplace=True)
        return [MetricContainer(res, **self.get_container_info())]

    def _skew(self):
        """
        Calculates skew parity.

        For every group, skew is the proportion of the selected candidates
        from that group over the desired proportion for that group.

        Returns
        -------
        dict
            skew parity difference
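
        Notes
        -----
        Illustrative worked example (values made up): with desired proportions
        {"f": 0.5, "m": 0.5} and a top-k subset that is 75% "m" and 25% "f",
        skew_m = 0.75 / 0.5 = 1.5 and skew_f = 0.25 / 0.5 = 0.5, so
        skew_parity_difference = 1.5 - 0.5 = 1.0.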
251 """
252 uniques, counts = np.unique(self.subset_sensitive_features, return_counts=True)
253 subset_proportions = dict(zip(uniques, counts / self.k))
255 skew = {}
256 for g in self.pool_groups:
257 sk = (subset_proportions[g] + EPSILON) / (
258 self.desired_proportions[g] + EPSILON
259 )
260 skew[g] = sk
262 skew = {
263 "skew_parity_difference-score": [
264 {"value": max(skew.values()) - min(skew.values())}
265 ]
266 }
268 return skew

    def _kld(self, dist_1, dist_2):
        """
        Calculates KL divergence.

        Parameters
        ----------
        dist_1 : list
            first distribution
        dist_2 : list
            second distribution

        Returns
        -------
        float
            KL divergence
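
        Notes
        -----
        This is the standard definition with natural log, smoothed by EPSILON
        to avoid division by zero and log of zero:
        KL(P || Q) = sum_i p_i * ln((p_i + eps) / (q_i + eps)).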
285 """
286 vals = []
287 for p1, p2 in zip(dist_1, dist_2):
288 vals.append(p1 * math.log((p1 + EPSILON) / (p2 + EPSILON)))
290 return sum(vals)

    def _ndkl(self):
        """
        Calculates the normalized discounted cumulative KL-divergence (NDKL).

        It is based on the following paper:
            Geyik, Sahin Cem, Stuart Ambler, and Krishnaram Kenthapadi. "Fairness-aware
            ranking in search & recommendation systems with application to LinkedIn
            talent search." Proceedings of the 25th ACM SIGKDD International Conference
            on Knowledge Discovery & Data Mining. 2019.

        Returns
        -------
        dict
            normalized discounted cumulative KL-divergence (ndkl)
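
        Notes
        -----
        Following the paper's definition:
        NDKL = (1 / Z) * sum_{k=1..N} KL(D_k || D_desired) / log2(k + 1),
        where D_k is the group distribution of the top-k ranked items, D_desired
        is the desired group distribution, and Z = sum_{k=1..N} 1 / log2(k + 1).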
305 """
306 Z = np.sum(1 / (np.log2(np.arange(1, self.num_items + 1) + 1)))
308 total = 0.0
309 for k in range(1, self.num_items + 1):
310 item_attr_k = list(self.pool_sensitive_features[:k])
311 item_distr = [
312 item_attr_k.count(attr) / len(item_attr_k)
313 for attr in self.desired_proportions.keys()
314 ]
315 total += (1 / math.log2(k + 1)) * self._kld(
316 item_distr, list(self.desired_proportions.values())
317 )
319 ndkl = {"ndkl-score": [{"value": (1 / Z) * total}]}
321 return ndkl

    def _fins(self):
        """
        Calculates group fairness metrics for subset selections from the FINS
        paper and library.

        It is based on the following paper:
            Cachel, Kathleen, and Elke Rundensteiner. "FINS Auditing Framework: Group
            Fairness for Subset Selections." Proceedings of the 2022 AAAI/ACM
            Conference on AI, Ethics, and Society. 2022.

        Returns
        -------
        dict
            fairness metrics
        """
        fins_metrics = {}

        # represent sensitive feature values via consecutive integers
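        # (illustrative: ["f", "m", "f"] -> lookupTable = ["f", "m"] and
        # integer codes = [0, 1, 0]; np.unique(..., return_inverse=True)
        # returns both)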
        lookupTable, pool_sf_int = np.unique(
            self.pool_sensitive_features, return_inverse=True
        )
        lookupTable, subset_sf_int = np.unique(
            self.subset_sensitive_features, return_inverse=True
        )

        selectRt, parity_score = fins.parity(
            self.pool_items, pool_sf_int, self.subset_items, subset_sf_int
        )
        fins_metrics["demographic_parity_ratio-score"] = [{"value": parity_score}]

        propOfS, balance_score = fins.balance(
            pool_sf_int, self.subset_items, subset_sf_int
        )
        fins_metrics["balance_ratio-score"] = [{"value": balance_score}]

        # Score-dependent metrics
        if self.subset_scores is not None:
            AvgScore, score_parity_score = fins.score_parity(
                self.subset_items, self.subset_scores, subset_sf_int
            )
            fins_metrics["score_parity_ratio-score"] = [{"value": score_parity_score}]

            TotalScore, score_balance_score = fins.score_balance(
                self.subset_items, self.subset_scores, subset_sf_int
            )
            fins_metrics["score_balance_ratio-score"] = [{"value": score_balance_score}]

        if self.pool_scores is not None:
            RselectRt, relevance_parity_score = fins.relevance_parity(
                self.pool_items,
                self.pool_scores,
                pool_sf_int,
                self.subset_items,
                self.subset_scores,
                subset_sf_int,
            )
            fins_metrics["relevance_parity_ratio-score"] = [
                {"value": relevance_parity_score}
            ]

            if self.q:
                QselectRt, qualified_parity_score = fins.qualified_parity(
                    self.pool_items,
                    self.pool_scores,
                    pool_sf_int,
                    self.subset_items,
                    self.subset_scores,
                    subset_sf_int,
                    self.q,
                )
                fins_metrics["qualified_demographic_parity_ratio-score"] = [
                    {"value": qualified_parity_score}
                ]

                QpropOfS, qualified_balance_score = fins.qualified_balance(
                    self.pool_items,
                    self.pool_scores,
                    pool_sf_int,
                    self.subset_items,
                    self.subset_scores,
                    subset_sf_int,
                    self.q,
                )
                fins_metrics["qualified_balance_ratio-score"] = [
                    {"value": qualified_balance_score}
                ]

            if self.lb_bin is not None and self.ub_bin is not None:
                (
                    bin_group_selection_proportions,
                    calibrated_parity_score,
                ) = fins.calibrated_parity(
                    self.pool_items,
                    self.pool_scores,
                    pool_sf_int,
                    self.subset_items,
                    self.subset_scores,
                    subset_sf_int,
                    self.lb_bin,
                    self.ub_bin,
                )
                fins_metrics["calibrated_demographic_parity_ratio-score"] = [
                    {"value": calibrated_parity_score}
                ]

                (
                    bin_group_proportions,
                    calibrated_balance_score,
                ) = fins.calibrated_balance(
                    self.pool_items,
                    self.pool_scores,
                    pool_sf_int,
                    self.subset_items,
                    self.subset_scores,
                    subset_sf_int,
                    self.lb_bin,
                    self.ub_bin,
                )
                fins_metrics["calibrated_balance_ratio-score"] = [
                    {"value": calibrated_balance_score}
                ]

        return fins_metrics

    def _score_distribution(self):
        """
        Calculates the scores empirical distribution curve for each demographic group.

        Returns
        -------
        list of TableContainer
            one empirical score-distribution table per demographic group
        """
        dist_tables = []
        groups = np.unique(self.pool_sensitive_features)
        for group in groups:
            ind = np.where(self.pool_sensitive_features == group)
            group_scores = self.pool_scores[ind]
            emp_dist_df = empirical_distribution_curve(
                group_scores, self.down_sampling_step, variable_name="scores"
            )
            emp_dist_df.name = "score_empirical_distribution"

            labels = {"sensitive_feature": self.sf_name, "group": group}

            dist_tables.append(
                TableContainer(
                    emp_dist_df,
                    **self.get_container_info(labels=labels),
                )
            )

        return dist_tables