Coverage for credoai/modules/metrics_credoai.py: 73%

1"""Custom metrics defined by Credo AI""" 

2 

3from functools import partial 

4from typing import Literal 

5 

6import numpy as np 

7import pandas as pd 

8import scipy.stats as st 

9from fairlearn.metrics import make_derived_metric, true_positive_rate 

10from sklearn import metrics as sk_metrics 

11from sklearn.metrics import accuracy_score, confusion_matrix 

12from sklearn.utils import check_consistent_length 

13 

14 

def multiclass_confusion_metrics(y_true, y_pred, metric=None, average="weighted"):
    """Calculate metrics derived from the multiclass confusion matrix.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground truth (correct) target values.

    y_pred : array-like of shape (n_samples,)
        Estimated targets as returned by a classifier.
    metric : str, optional
        If provided, returns a specific metric. All metrics are returned if None is provided.
        Options are:
            "TPR": Sensitivity, hit rate, recall, or true positive rate
            "TNR": Specificity or true negative rate
            "PPV": Precision or positive predictive value
            "NPV": Negative predictive value
            "FPR": Fall out or false positive rate
            "FNR": False negative rate
            "FDR": False discovery rate
            "ACC": Overall accuracy
    average : str
        Options are "weighted", "macro" or None (which will return the values for each label)

    Returns
    -------
    dict or float
        dict of all metrics if `metric` is not provided, otherwise the requested value
    """
    cnf_matrix = confusion_matrix(y_true, y_pred)
    FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)
    FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
    TP = np.diag(cnf_matrix)
    TN = cnf_matrix.sum() - (FP + FN + TP)

    FP = FP.astype(float)
    FN = FN.astype(float)
    TP = TP.astype(float)
    TN = TN.astype(float)

    metrics = {
        "TPR": TP / (TP + FN),
        "TNR": TN / (TN + FP),
        "PPV": TP / (TP + FP),
        "NPV": TN / (TN + FN),
        "FPR": FP / (FP + TN),
        "FNR": FN / (TP + FN),
        "FDR": FP / (TP + FP),
        "ACC": (TP + TN) / (TP + FP + FN + TN),
    }
    if average == "weighted":
        weights = np.unique(y_true, return_counts=True)[1] / len(y_true)
        metrics = {k: np.average(v, weights=weights) for k, v in metrics.items()}
    elif average == "macro":
        metrics = {k: v.mean() for k, v in metrics.items()}
    if metric:
        return metrics[metric]
    else:
        return metrics

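# Usage sketch (not part of the original module; labels below are hypothetical):
# requesting all weighted per-class rates versus a single one.
#
#     y_true = [0, 1, 2, 2, 1, 0, 2, 1]
#     y_pred = [0, 2, 2, 2, 1, 0, 1, 1]
#     multiclass_confusion_metrics(y_true, y_pred)                # dict of all metrics
#     multiclass_confusion_metrics(y_true, y_pred, metric="TPR")  # single weighted float
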

def general_wilson(p, n, z=1.96):
    """Return lower and upper bound using Wilson Interval.

    Parameters
    ----------
    p : float
        Proportion of successes.
    n : int
        Total number of trials.
    z : float
        Z-score, which indicates the number of standard deviations of confidence

    Returns
    -------
    np.ndarray
        Array of length 2 of form: [lower_bound, upper_bound]
    """
    denominator = 1 + z**2 / n
    centre_adjusted_probability = p + z * z / (2 * n)
    adjusted_standard_deviation = np.sqrt((p * (1 - p) + z * z / (4 * n))) / np.sqrt(n)
    lower_bound = (
        centre_adjusted_probability - z * adjusted_standard_deviation
    ) / denominator
    upper_bound = (
        centre_adjusted_probability + z * adjusted_standard_deviation
    ) / denominator
    return np.array([lower_bound, upper_bound])


def wilson_ci(num_hits, num_total, confidence=0.95):
    """Convenience wrapper for general_wilson"""
    z = st.norm.ppf((1 + confidence) / 2)
    p = num_hits / num_total
    return general_wilson(p, num_total, z=z)

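# Usage sketch (values are hypothetical): a 95% Wilson interval for 20 hits in
# 100 trials, which falls roughly around [0.13, 0.29] for the observed proportion 0.2.
#
#     lower, upper = wilson_ci(num_hits=20, num_total=100, confidence=0.95)
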

def confusion_wilson(y_true, y_pred, metric="true_positive_rate", confidence=0.95):
    """Return Wilson Interval bounds for performance metrics

    Metrics derived from confusion matrix

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground truth labels
    y_pred : array-like of shape (n_samples,)
        Predicted labels
    metric : string
        Indicates kind of performance metric. Must be one of
        "true_positive_rate", "true_negative_rate", "false_positive_rate",
        or "false_negative_rate".

    Returns
    -------
    np.ndarray
        Array of length 2 of form: [lower_bound, upper_bound]
    """
    check_consistent_length(y_true, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    negatives = tn + fp
    positives = tp + fn
    if metric == "true_positive_rate":
        numer = tp
        denom = positives
    elif metric == "true_negative_rate":
        numer = tn
        denom = negatives
    elif metric == "false_positive_rate":
        numer = fp
        denom = negatives
    elif metric == "false_negative_rate":
        numer = fn
        denom = positives
    else:
        raise ValueError(
            """
            Metric must be one of the following:
                -true_positive_rate
                -true_negative_rate
                -false_positive_rate
                -false_negative_rate
            """
        )

    bounds = wilson_ci(numer, denom, confidence)
    return bounds

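# Usage sketch (labels are made up): bounds on the true positive rate of a
# binary classifier's predictions.
#
#     y_true = [1, 0, 1, 1, 0, 1, 0, 0]
#     y_pred = [1, 0, 0, 1, 0, 1, 1, 0]
#     confusion_wilson(y_true, y_pred, metric="true_positive_rate", confidence=0.95)
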

def accuracy_wilson(y_true, y_pred, confidence=0.95):
    """Return Wilson Interval bounds for accuracy metric.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground truth labels
    y_pred : array-like of shape (n_samples,)
        Predicted labels

    Returns
    -------
    np.ndarray
        Array of length 2 of form: [lower_bound, upper_bound]
    """
    check_consistent_length(y_true, y_pred)
    score = accuracy_score(y_true, y_pred)
    # Convert the confidence level to a z-score: general_wilson's third argument
    # is a z-score, not a confidence level.
    z = st.norm.ppf((1 + confidence) / 2)
    bounds = general_wilson(score, len(y_true), z=z)
    return bounds

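# Usage sketch (labels are made up): a 95% interval around observed accuracy.
#
#     accuracy_wilson([1, 0, 1, 1, 0, 1], [1, 0, 0, 1, 0, 1], confidence=0.95)
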

# metric definitions


def false_discovery_rate(y_true, y_pred, **kwargs):
    """Compute the false discovery rate.

    False discovery rate is 1-precision, or ``fp / (tp + fp)`` where ``tp`` is the number of
    true positives and ``fp`` the number of false positives. The false discovery rate is
    intuitively the rate at which the classifier will be wrong when
    labeling an example as positive.

    The best value is 0 and the worst value is 1.

    Parameters
    ----------
    y_true : 1d array-like, or label indicator array / sparse matrix
        Ground truth (correct) target values.

    y_pred : 1d array-like, or label indicator array / sparse matrix
        Estimated targets as returned by a classifier.
    kwargs : key, value mappings
        Other keyword arguments are passed through
        to sklearn.metrics.precision_score
    """
    return 1.0 - sk_metrics.precision_score(y_true, y_pred, **kwargs)

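# Usage sketch (labels are made up): the complement of precision.
#
#     y_true = [1, 0, 1, 1, 0]
#     y_pred = [1, 1, 1, 0, 0]
#     false_discovery_rate(y_true, y_pred)  # 1 - precision, here 1 - 2/3
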

def false_omission_rate(y_true, y_pred, **kwargs):
    """Compute the false omission rate.

    False omission rate is ``fn / (tn + fn)`` where ``fn`` is the number of
    false negatives and ``tn`` the number of true negatives. The false omission rate is
    intuitively the rate at which the classifier will be wrong when
    labeling an example as negative.

    The best value is 0 and the worst value is 1.

    Parameters
    ----------
    y_true : 1d array-like, or label indicator array / sparse matrix
        Ground truth (correct) target values.

    y_pred : 1d array-like, or label indicator array / sparse matrix
        Estimated targets as returned by a classifier.
    kwargs : key, value mappings
        Accepted for signature compatibility with the other metrics; not used
        by this function.
    """
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return fn / (fn + tn)


def equal_opportunity_difference(
    y_true, y_pred, *, sensitive_features, method="between_groups", sample_weight=None
) -> float:
    """Calculate the equal opportunity difference.

    Equivalent to the `true_positive_rate_difference` defined as the difference between the
    largest and smallest of :math:`P[h(X)=1 | A=a, Y=1]`, across all values :math:`a`
    of the sensitive feature(s).

    Parameters
    ----------
    y_true : array-like
        Ground truth (correct) labels.
    y_pred : array-like
        Predicted labels :math:`h(X)` returned by the classifier.
    sensitive_features :
        The sensitive features over which equal opportunity should be assessed
    method : str
        How to compute the differences. See :func:`fairlearn.metrics.MetricFrame.difference`
        for details.
    sample_weight : array-like
        The sample weights

    Returns
    -------
    float
        The equal opportunity (true positive rate) difference
    """
    fun = make_derived_metric(metric=true_positive_rate, transform="difference")
    return fun(
        y_true,
        y_pred,
        sensitive_features=sensitive_features,
        method=method,
        sample_weight=sample_weight,
    )

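# Usage sketch (labels and groups are hypothetical): the gap in true positive
# rate between the two groups of a binary sensitive feature.
#
#     y_true = [1, 1, 0, 1, 1, 0, 1, 0]
#     y_pred = [1, 0, 0, 1, 1, 0, 0, 0]
#     groups = ["a", "a", "a", "a", "b", "b", "b", "b"]
#     equal_opportunity_difference(y_true, y_pred, sensitive_features=groups)
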

def ks_statistic(y_true, y_pred) -> float:
    """Performs the two-sample Kolmogorov-Smirnov test (two-sided)

    The test compares the underlying continuous distributions F(x) and G(x) of two independent samples.
    The null hypothesis is that the two distributions are identical, F(x)=G(x).
    If the KS statistic is small or the p-value is high,
    then we cannot reject the null hypothesis in favor of the alternative.

    For practical purposes, if the statistic value is higher than the critical value, the two distributions are different.

    Parameters
    ----------
    y_true : array-like
        Ground truth (correct) labels.
    y_pred : array-like
        Predicted labels :math:`h(X)` returned by the classifier.

    Returns
    -------
    float
        KS statistic value
    """

    ks_stat = st.ks_2samp(y_true, y_pred).statistic

    return ks_stat

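# Usage sketch (samples are made up): comparing two score samples.
#
#     ks_statistic([0.1, 0.4, 0.35, 0.8], [0.2, 0.5, 0.3, 0.9])
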

def interpolate_increasing_thresholds(lib_thresholds, *series):
    """Resample ``series`` onto an evenly spaced, increasing threshold grid
    using step (piecewise-constant) interpolation."""
    out = [list() for i in series]
    quantization = 1 / (
        len(lib_thresholds) * (max(lib_thresholds) - min(lib_thresholds))
    )
    interpolated_thresholds = np.arange(
        min(lib_thresholds), max(lib_thresholds), quantization
    )

    for t in interpolated_thresholds:
        if t >= lib_thresholds[0]:
            lib_thresholds.pop(0)
            for s in series:
                s.pop(0)
        for i, s in enumerate(out):
            s.append(series[i][0])

    return out + [interpolated_thresholds]


def interpolate_decreasing_thresholds(lib_thresholds, *series):
    """Resample ``series`` onto an evenly spaced, decreasing threshold grid
    using step (piecewise-constant) interpolation."""
    out = [list() for i in series]
    quantization = -1 / (
        len(lib_thresholds) * (max(lib_thresholds) - min(lib_thresholds))
    )
    interpolated_thresholds = np.arange(
        max(lib_thresholds), min(lib_thresholds), quantization
    )

    for t in interpolated_thresholds:
        for i, s in enumerate(out):
            s.append(series[i][0])
        if t <= lib_thresholds[0]:
            lib_thresholds.pop(0)
            for s in series:
                s.pop(0)

    return out + [interpolated_thresholds]


def credo_pr_curve(y_true, y_prob):
    """Precision-recall curve tabulated on an interpolated threshold grid."""
    p, r, t = sk_metrics.precision_recall_curve(y_true, y_prob)
    (
        precision,
        recall,
        thresholds,
    ) = interpolate_increasing_thresholds(t.tolist(), p.tolist(), r.tolist())
    return pd.DataFrame(
        {
            "precision": precision,
            "recall": recall,
            "threshold": thresholds,
        }
    )

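# Usage sketch (probabilities are made up): a tidy table of precision/recall
# values over thresholds, suitable for plotting.
#
#     y_true = np.array([0, 0, 1, 1, 0, 1, 1, 0])
#     y_prob = np.array([0.1, 0.3, 0.35, 0.8, 0.2, 0.7, 0.9, 0.5])
#     credo_pr_curve(y_true, y_prob)  # columns: precision, recall, threshold
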

def credo_roc_curve(y_true, y_prob):
    """ROC curve tabulated on an interpolated threshold grid."""
    fpr, tpr, thresh = sk_metrics.roc_curve(y_true, y_prob)
    (
        false_positive_rate,
        true_positive_rate,
        thresholds,
    ) = interpolate_decreasing_thresholds(thresh.tolist(), fpr.tolist(), tpr.tolist())
    return pd.DataFrame(
        {
            "false_positive_rate": false_positive_rate,
            "true_positive_rate": true_positive_rate,
            "threshold": thresholds,
        }
    )


def credo_det_curve(y_true, y_prob):
    """Detection error tradeoff (DET) curve tabulated on an interpolated threshold grid."""
    fpr, fnr, t = sk_metrics.det_curve(y_true, y_prob)
    (
        false_positive_rate,
        false_negative_rate,
        thresholds,
    ) = interpolate_increasing_thresholds(t.tolist(), fpr.tolist(), fnr.tolist())
    return pd.DataFrame(
        {
            "false_positive_rate": false_positive_rate,
            "false_negative_rate": false_negative_rate,
            "thresholds": thresholds,
        }
    )


def gini_coefficient_discriminatory(y_true, y_prob, **kwargs):
    """Returns the Gini Coefficient of a discriminatory model

    NOTE: There are two popular, yet distinct metrics known as the 'gini coefficient'.

    The value calculated by this function provides a summary statistic for the Cumulative Accuracy Profile (CAP) curve.
    This notion of Gini coefficient (or Gini index) is a _discriminatory_ metric. It helps characterize the ordinal
    relationship between predictions made by a model and the ground truth values for each sample.

    This metric has a linear relationship with the area under the receiver operating characteristic curve:
    :math:`G = 2*AUC - 1`

    See https://towardsdatascience.com/using-the-gini-coefficient-to-evaluate-the-performance-of-credit-score-models-59fe13ef420
    for more details.

    Parameters
    ----------
    y_true : array-like
        Ground truth (correct) labels.
    y_prob : array-like
        Predicted probabilities returned by a call to the model's `predict_proba()` function.

    Returns
    -------
    float
        Discriminatory Gini Coefficient
    """
    G = (2 * sk_metrics.roc_auc_score(y_true, y_prob, **kwargs)) - 1
    return G

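# Usage sketch (labels and scores are made up): the discriminatory Gini is a
# rescaling of ROC AUC, so a perfect ranking gives 1 and a random one gives 0
# in expectation.
#
#     y_true = np.array([0, 1, 1, 0, 1, 0])
#     y_prob = np.array([0.2, 0.8, 0.6, 0.4, 0.9, 0.1])
#     gini_coefficient_discriminatory(y_true, y_prob)  # 2 * AUC - 1
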

def population_stability_index(
    expected_array,
    actual_array,
    percentage=False,
    buckets: int = 10,
    buckettype: Literal["bins", "quantiles"] = "bins",
):
    r"""Calculate the PSI for a single variable.

    PSI is a measure of how much a distribution has changed over time or between
    two different samples of a population.
    It does this by bucketing the two distributions and comparing the percents of
    items in each of the buckets. The final result is a single number:

    :math:`PSI = \sum \left( Actual_{\%} - Expected_{\%} \right) \cdot \ln\left( \frac{Actual_{\%}}{Expected_{\%}} \right)`

    The common interpretations of the PSI result are:

        PSI < 0.1: no significant population change
        PSI < 0.25: moderate population change
        PSI >= 0.25: significant population change

    The number of buckets chosen and the bucketing logic affect the final result.

    References
    ----------
    Based on the code in: github.com/mwburke by Matthew Burke.
    For an implementation walk-through: https://mwburke.github.io/data%20science/2018/04/29/population-stability-index.html
    For a more theoretical reference: https://scholarworks.wmich.edu/cgi/viewcontent.cgi?article=4249&context=dissertations

    Parameters
    ----------
    expected_array: array-like
        Array of expected/initial values
    actual_array: array-like
        Array of new values
    percentage: bool
        When True the arrays are interpreted as already binned/aggregated percentages.
        This is so that the user can perform their own aggregation and pass it
        directly to the metric. Default = False
    buckets: int
        number of percentile ranges to bucket the values into
    buckettype: Literal["bins", "quantiles"]
        type of strategy for creating buckets; "bins" splits into evenly spaced bins,
        "quantiles" splits into quantile buckets

    Returns
    -------
    float
        psi_value: calculated PSI value
    """
    epsilon: float = 0.001

    def scale_range(values, lower, upper):
        # Linearly rescale `values` so that they span [lower, upper].
        values += -(np.min(values))
        values /= np.max(values) / (upper - lower)
        values += lower
        return values

    if not percentage:
        # Define histogram breakpoints
        breakpoints = np.arange(0, buckets + 1) / (buckets) * 100

        if buckettype == "bins":
            breakpoints = scale_range(
                breakpoints, np.min(expected_array), np.max(expected_array)
            )
        elif buckettype == "quantiles":
            breakpoints = np.stack(
                [np.percentile(expected_array, b) for b in breakpoints]
            )

        # Populate bins and calculate percentages
        expected_percents = np.histogram(expected_array, breakpoints)[0] / len(
            expected_array
        )
        actual_percents = np.histogram(actual_array, breakpoints)[0] / len(actual_array)
    else:
        # Copy to float arrays so array-like inputs support the masking below
        # and the caller's data is not mutated.
        expected_percents = np.array(expected_array, dtype=float)
        actual_percents = np.array(actual_array, dtype=float)

    # Substitute 0 with an arbitrary epsilon << 1
    # This is to avoid inf in the following calculations
    expected_percents[expected_percents == 0] = epsilon
    actual_percents[actual_percents == 0] = epsilon

    psi_values = (expected_percents - actual_percents) * np.log(
        expected_percents / actual_percents
    )

    return np.sum(psi_values)

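# Usage sketch (synthetic data): PSI between a reference sample and a shifted
# sample, using 10 equal-width bins.
#
#     rng = np.random.default_rng(0)
#     expected = rng.normal(0.0, 1.0, 1000)
#     actual = rng.normal(0.3, 1.0, 1000)
#     population_stability_index(expected, actual, buckets=10, buckettype="bins")
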

def kl_divergence(p, q):
    """Calculates KL divergence of two discrete distributions

    Parameters
    ----------
    p : list
        first distribution
    q : list
        second distribution

    Returns
    -------
    float
        KL divergence

    Examples
    --------
    >>> p = [.05, .1, .2, .05, .15, .25, .08, .12]
    >>> q = [.30, .1, .2, .10, .10, .02, .08, .10]
    >>> round(kl_divergence(p, q), 2)
    0.59
    """
    if len(p) != len(q):
        raise ValueError("`p` and `q` must have the same length.")

    EPSILON = 1e-12
    vals = []
    for pi, qi in zip(p, q):
        vals.append(pi * np.log((pi + EPSILON) / (qi + EPSILON)))

    return sum(vals)


def normalized_discounted_cumulative_kl_divergence(ranked_list, desired_proportions):
    """Calculates normalized discounted cumulative KL divergence (NDKL)

    It is non-negative, with larger values indicating a greater divergence between the
    desired and actual distributions of group labels. It ranges from 0 to inf
    and the ideal value is 0 for fairness.

    Parameters
    ----------
    ranked_list : list
        Ranked list of items
    desired_proportions : dict
        The desired proportion for each group

    Returns
    -------
    float
        normalized discounted cumulative KL divergence

    References
    ----------
    Geyik, Sahin Cem, Stuart Ambler, and Krishnaram Kenthapadi. "Fairness-aware ranking in search &
    recommendation systems with application to LinkedIn talent search."
    Proceedings of the 25th ACM SIGKDD International Conference on Knowledge Discovery &
    Data Mining. 2019.

    Examples
    --------
    >>> ranked_list = ['female', 'male', 'male', 'female', 'male', 'male']
    >>> desired_proportions = {'female': 0.6, 'male': 0.4}
    >>> normalized_discounted_cumulative_kl_divergence(ranked_list, desired_proportions)
    0.208096993149323
    """
    num_items = len(ranked_list)
    Z = np.sum(1 / (np.log2(np.arange(1, num_items + 1) + 1)))

    total = 0.0
    for k in range(1, num_items + 1):
        item_attr_k = list(ranked_list[:k])
        item_distr = [
            item_attr_k.count(attr) / len(item_attr_k)
            for attr in desired_proportions.keys()
        ]
        total += (1 / np.log2(k + 1)) * kl_divergence(
            item_distr, list(desired_proportions.values())
        )

    ndkl = (1 / Z) * total

    return ndkl


def skew_parity(items_list, desired_proportions, parity_type="difference"):
    """Calculates skew parity

    Compares max_skew with min_skew, where the skew of a group is the proportion
    of selected items from that group divided by the desired proportion for that group.

    Parameters
    ----------
    items_list : list
        List of items (e.g., a ranked or selected set), one group label per item
    desired_proportions : dict
        The desired proportion for each group
    parity_type : str
        Type of parity to use. Two options:
            'difference' : `max_skew - min_skew`. It ranges from 0 to inf and the ideal value is 0.
            'ratio' : `min_skew / max_skew`. It ranges from 0 to 1 and the ideal value is 1.

    Returns
    -------
    float
        skew parity value (difference or ratio, depending on `parity_type`)

    Examples
    --------
    >>> ranked_list = ['female', 'male', 'male', 'female', 'male', 'male']
    >>> desired_proportions = {'female': 0.6, 'male': 0.4}
    >>> skew_parity(ranked_list, desired_proportions)
    1.1111111111087035
    """
    EPSILON = 1e-12
    groups, counts = np.unique(items_list, return_counts=True)
    subset_proportions = dict(zip(groups, counts / len(items_list)))

    skew = {}
    for g in groups:
        sk = (subset_proportions[g] + EPSILON) / (desired_proportions[g] + EPSILON)
        skew[g] = sk

    if parity_type == "difference":
        parity = max(skew.values()) - min(skew.values())
    elif parity_type == "ratio":
        parity = min(skew.values()) / max(skew.values())
    else:
        raise ValueError(
            f"Possible values of `parity_type` are 'difference' and 'ratio'. You provided {parity_type}"
        )

    return parity


def credo_gain_chart(y_true, y_prob, bins=10):
    """Cumulative gain and lift table, bucketed into `bins` score-ordered groups."""
    # Accept array-like inputs; the fancy indexing below requires ndarrays.
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)

    sort_indices = np.argsort(y_prob)[::-1]  # descending order
    total_events = np.sum(y_true)
    overall_rate = np.mean(y_true)

    sorted_y_true = y_true[sort_indices]

    chunked_y_true = np.array_split(sorted_y_true, bins)

    cumulative_samples = np.zeros(bins)
    cumulative_events = np.zeros(bins)
    cumulative_ratio = np.zeros(bins)
    lift = np.zeros(bins)

    for i in range(bins):
        cumulative_samples[i] = len(np.hstack(chunked_y_true[: i + 1]))
        cumulative_events[i] = np.sum(np.hstack(chunked_y_true[: i + 1]))
        cumulative_ratio[i] = cumulative_events[i] / total_events
        lift[i] = (cumulative_events[i] / cumulative_samples[i]) / overall_rate
    return pd.DataFrame(
        {
            "Decile": np.arange(bins) + 1,
            "Cumulative Samples": cumulative_samples,
            "Cumulative Events": cumulative_events,
            "Cumulative Gain": cumulative_ratio,
            "Lift": lift,
        }
    )
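
# Usage sketch (synthetic data): a decile-level gain/lift table from scores.
#
#     y_true = np.array([1, 0, 0, 1, 0, 1, 0, 0, 1, 0] * 10)
#     y_prob = np.linspace(1.0, 0.0, num=len(y_true))
#     credo_gain_chart(y_true, y_prob, bins=10)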