Coverage for credoai/modules/metrics_credoai.py: 73%

1"""Custom metrics defined by Credo AI""" 

2 

3from functools import partial 

4from typing import Literal 

5 

6import numpy as np 

7import pandas as pd 

8import scipy.stats as st 

9from fairlearn.metrics import make_derived_metric, true_positive_rate 

10from sklearn import metrics as sk_metrics 

11from sklearn.metrics import accuracy_score, confusion_matrix 

12from sklearn.utils import check_consistent_length 

13 

14 

def multiclass_confusion_metrics(y_true, y_pred, metric=None, average="weighted"):
    """Calculate metrics derived from the multiclass confusion matrix.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground truth (correct) target values.

    y_pred : array-like of shape (n_samples,)
        Estimated targets as returned by a classifier.
    metric : str, optional
        If provided, returns a specific metric. All metrics are returned if None is provided.
        Options are:
            "TPR": Sensitivity, hit rate, recall, or true positive rate
            "TNR": Specificity or true negative rate
            "PPV": Precision or positive predictive value
            "NPV": Negative predictive value
            "FPR": Fall out or false positive rate
            "FNR": False negative rate
            "FDR": False discovery rate
            "ACC": Overall accuracy
    average : str
        Options are "weighted", "macro" or None (which will return the values for each label)

    Returns
    -------
    dict or float
        dict of all metrics if `metric` is not provided, otherwise the requested value
    """
    cnf_matrix = confusion_matrix(y_true, y_pred)
    FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)
    FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
    TP = np.diag(cnf_matrix)
    TN = cnf_matrix.sum() - (FP + FN + TP)

    FP = FP.astype(float)
    FN = FN.astype(float)
    TP = TP.astype(float)
    TN = TN.astype(float)

    metrics = {
        "TPR": TP / (TP + FN),
        "TNR": TN / (TN + FP),
        "PPV": TP / (TP + FP),
        "NPV": TN / (TN + FN),
        "FPR": FP / (FP + TN),
        "FNR": FN / (TP + FN),
        "FDR": FP / (TP + FP),
        "ACC": (TP + TN) / (TP + FP + FN + TN),
    }
    if average == "weighted":
        weights = np.unique(y_true, return_counts=True)[1] / len(y_true)
        metrics = {k: np.average(v, weights=weights) for k, v in metrics.items()}
    elif average == "macro":
        metrics = {k: v.mean() for k, v in metrics.items()}
    if metric:
        return metrics[metric]
    else:
        return metrics

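# Usage sketch (not part of the original module; labels below are hypothetical):
# requesting all weighted per-class rates versus a single one.
#
#     y_true = [0, 1, 2, 2, 1, 0, 2, 1]
#     y_pred = [0, 2, 2, 2, 1, 0, 1, 1]
#     multiclass_confusion_metrics(y_true, y_pred)                # dict of all metrics
#     multiclass_confusion_metrics(y_true, y_pred, metric="TPR")  # single weighted float
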

def general_wilson(p, n, z=1.96):
    """Return lower and upper bound using Wilson Interval.

    Parameters
    ----------
    p : float
        Proportion of successes.
    n : int
        Total number of trials.
    z : float
        Z-score, which indicates the number of standard deviations of confidence

    Returns
    -------
    np.ndarray
        Array of length 2 of form: [lower_bound, upper_bound]
    """
    denominator = 1 + z**2 / n
    centre_adjusted_probability = p + z * z / (2 * n)
    adjusted_standard_deviation = np.sqrt((p * (1 - p) + z * z / (4 * n))) / np.sqrt(n)
    lower_bound = (
        centre_adjusted_probability - z * adjusted_standard_deviation
    ) / denominator
    upper_bound = (
        centre_adjusted_probability + z * adjusted_standard_deviation
    ) / denominator
    return np.array([lower_bound, upper_bound])


def wilson_ci(num_hits, num_total, confidence=0.95):
    """Convenience wrapper for general_wilson"""
    z = st.norm.ppf((1 + confidence) / 2)
    p = num_hits / num_total
    return general_wilson(p, num_total, z=z)

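# Usage sketch (values are hypothetical): a 95% Wilson interval for 20 hits in
# 100 trials, which falls roughly around [0.13, 0.29] for the observed proportion 0.2.
#
#     lower, upper = wilson_ci(num_hits=20, num_total=100, confidence=0.95)
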

def confusion_wilson(y_true, y_pred, metric="true_positive_rate", confidence=0.95):
    """Return Wilson Interval bounds for performance metrics

    Metrics derived from confusion matrix

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground truth labels
    y_pred : array-like of shape (n_samples,)
        Predicted labels
    metric : string
        Indicates kind of performance metric. Must be one of
        "true_positive_rate", "true_negative_rate", "false_positive_rate",
        or "false_negative_rate".

    Returns
    -------
    np.ndarray
        Array of length 2 of form: [lower_bound, upper_bound]
    """
    check_consistent_length(y_true, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    negatives = tn + fp
    positives = tp + fn
    if metric == "true_positive_rate":
        numer = tp
        denom = positives
    elif metric == "true_negative_rate":
        numer = tn
        denom = negatives
    elif metric == "false_positive_rate":
        numer = fp
        denom = negatives
    elif metric == "false_negative_rate":
        numer = fn
        denom = positives
    else:
        raise ValueError(
            """
            Metric must be one of the following:
                -true_positive_rate
                -true_negative_rate
                -false_positive_rate
                -false_negative_rate
            """
        )

    bounds = wilson_ci(numer, denom, confidence)
    return bounds

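# Usage sketch (labels are made up): bounds on the true positive rate of a
# binary classifier's predictions.
#
#     y_true = [1, 0, 1, 1, 0, 1, 0, 0]
#     y_pred = [1, 0, 0, 1, 0, 1, 1, 0]
#     confusion_wilson(y_true, y_pred, metric="true_positive_rate", confidence=0.95)
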

def accuracy_wilson(y_true, y_pred, confidence=0.95):
    """Return Wilson Interval bounds for accuracy metric.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground truth labels
    y_pred : array-like of shape (n_samples,)
        Predicted labels

    Returns
    -------
    np.ndarray
        Array of length 2 of form: [lower_bound, upper_bound]
    """
    check_consistent_length(y_true, y_pred)
    score = accuracy_score(y_true, y_pred)
    # Convert the confidence level to a z-score: general_wilson's third argument
    # is a z-score, not a confidence level.
    z = st.norm.ppf((1 + confidence) / 2)
    bounds = general_wilson(score, len(y_true), z=z)
    return bounds

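# Usage sketch (labels are made up): a 95% interval around observed accuracy.
#
#     accuracy_wilson([1, 0, 1, 1, 0, 1], [1, 0, 0, 1, 0, 1], confidence=0.95)
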

# metric definitions


def false_discovery_rate(y_true, y_pred, **kwargs):
    """Compute the false discovery rate.

    False discovery rate is 1-precision, or ``fp / (tp + fp)`` where ``tp`` is the number of
    true positives and ``fp`` the number of false positives. The false discovery rate is
    intuitively the rate at which the classifier will be wrong when
    labeling an example as positive.

    The best value is 0 and the worst value is 1.

    Parameters
    ----------
    y_true : 1d array-like, or label indicator array / sparse matrix
        Ground truth (correct) target values.

    y_pred : 1d array-like, or label indicator array / sparse matrix
        Estimated targets as returned by a classifier.
    kwargs : key, value mappings
        Other keyword arguments are passed through
        to sklearn.metrics.precision_score
    """
    return 1.0 - sk_metrics.precision_score(y_true, y_pred, **kwargs)

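# Usage sketch (labels are made up): the complement of precision.
#
#     y_true = [1, 0, 1, 1, 0]
#     y_pred = [1, 1, 1, 0, 0]
#     false_discovery_rate(y_true, y_pred)  # 1 - precision, here 1 - 2/3
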

def false_omission_rate(y_true, y_pred, **kwargs):
    """Compute the false omission rate.

    False omission rate is ``fn / (tn + fn)`` where ``fn`` is the number of
    false negatives and ``tn`` the number of true negatives. The false omission rate is
    intuitively the rate at which the classifier will be wrong when
    labeling an example as negative.

    The best value is 0 and the worst value is 1.

    Parameters
    ----------
    y_true : 1d array-like, or label indicator array / sparse matrix
        Ground truth (correct) target values.

    y_pred : 1d array-like, or label indicator array / sparse matrix
        Estimated targets as returned by a classifier.
    kwargs : key, value mappings
        Accepted for signature compatibility with the other metrics; not used
        by this function.
    """
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return fn / (fn + tn)


def equal_opportunity_difference(
    y_true, y_pred, *, sensitive_features, method="between_groups", sample_weight=None
) -> float:
    """Calculate the equal opportunity difference.

    Equivalent to the `true_positive_rate_difference` defined as the difference between the
    largest and smallest of :math:`P[h(X)=1 | A=a, Y=1]`, across all values :math:`a`
    of the sensitive feature(s).

    Parameters
    ----------
    y_true : array-like
        Ground truth (correct) labels.
    y_pred : array-like
        Predicted labels :math:`h(X)` returned by the classifier.
    sensitive_features :
        The sensitive features over which equal opportunity should be assessed
    method : str
        How to compute the differences. See :func:`fairlearn.metrics.MetricFrame.difference`
        for details.
    sample_weight : array-like
        The sample weights

    Returns
    -------
    float
        The equal opportunity (true positive rate) difference
    """
    fun = make_derived_metric(metric=true_positive_rate, transform="difference")
    return fun(
        y_true,
        y_pred,
        sensitive_features=sensitive_features,
        method=method,
        sample_weight=sample_weight,
    )

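# Usage sketch (labels and groups are hypothetical): the gap in true positive
# rate between the two groups of a binary sensitive feature.
#
#     y_true = [1, 1, 0, 1, 1, 0, 1, 0]
#     y_pred = [1, 0, 0, 1, 1, 0, 0, 0]
#     groups = ["a", "a", "a", "a", "b", "b", "b", "b"]
#     equal_opportunity_difference(y_true, y_pred, sensitive_features=groups)
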

def ks_statistic(y_true, y_pred) -> float:
    """Performs the two-sample Kolmogorov-Smirnov test (two-sided)

    The test compares the underlying continuous distributions F(x) and G(x) of two independent samples.
    The null hypothesis is that the two distributions are identical, F(x)=G(x).
    If the KS statistic is small or the p-value is high,
    then we cannot reject the null hypothesis in favor of the alternative.

    For practical purposes, if the statistic value is higher than the critical value, the two distributions are different.

    Parameters
    ----------
    y_true : array-like
        Ground truth (correct) labels.
    y_pred : array-like
        Predicted labels :math:`h(X)` returned by the classifier.

    Returns
    -------
    float
        KS statistic value
    """

    ks_stat = st.ks_2samp(y_true, y_pred).statistic

    return ks_stat

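# Usage sketch (samples are made up): comparing two score samples.
#
#     ks_statistic([0.1, 0.4, 0.35, 0.8], [0.2, 0.5, 0.3, 0.9])
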

def interpolate_increasing_thresholds(lib_thresholds, *series):
    """Resample ``series`` onto an evenly spaced, increasing threshold grid
    using step (piecewise-constant) interpolation."""
    out = [list() for i in series]
    quantization = 1 / (
        len(lib_thresholds) * (max(lib_thresholds) - min(lib_thresholds))
    )
    interpolated_thresholds = np.arange(
        min(lib_thresholds), max(lib_thresholds), quantization
    )

    for t in interpolated_thresholds:
        if t >= lib_thresholds[0]:
            lib_thresholds.pop(0)
            for s in series:
                s.pop(0)
        for i, s in enumerate(out):
            s.append(series[i][0])

    return out + [interpolated_thresholds]


def interpolate_decreasing_thresholds(lib_thresholds, *series):
    """Resample ``series`` onto an evenly spaced, decreasing threshold grid
    using step (piecewise-constant) interpolation."""
    out = [list() for i in series]
    quantization = -1 / (
        len(lib_thresholds) * (max(lib_thresholds) - min(lib_thresholds))
    )
    interpolated_thresholds = np.arange(
        max(lib_thresholds), min(lib_thresholds), quantization
    )

    for t in interpolated_thresholds:
        for i, s in enumerate(out):
            s.append(series[i][0])
        if t <= lib_thresholds[0]:
            lib_thresholds.pop(0)
            for s in series:
                s.pop(0)

    return out + [interpolated_thresholds]


def credo_pr_curve(y_true, y_prob):
    """Precision-recall curve tabulated on an interpolated threshold grid."""
    p, r, t = sk_metrics.precision_recall_curve(y_true, y_prob)
    (
        precision,
        recall,
        thresholds,
    ) = interpolate_increasing_thresholds(t.tolist(), p.tolist(), r.tolist())
    return pd.DataFrame(
        {
            "precision": precision,
            "recall": recall,
            "threshold": thresholds,
        }
    )

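# Usage sketch (probabilities are made up): a tidy table of precision/recall
# values over thresholds, suitable for plotting.
#
#     y_true = np.array([0, 0, 1, 1, 0, 1, 1, 0])
#     y_prob = np.array([0.1, 0.3, 0.35, 0.8, 0.2, 0.7, 0.9, 0.5])
#     credo_pr_curve(y_true, y_prob)  # columns: precision, recall, threshold
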

def credo_roc_curve(y_true, y_prob):
    """ROC curve tabulated on an interpolated threshold grid."""
    fpr, tpr, thresh = sk_metrics.roc_curve(y_true, y_prob)
    (
        false_positive_rate,
        true_positive_rate,
        thresholds,
    ) = interpolate_decreasing_thresholds(thresh.tolist(), fpr.tolist(), tpr.tolist())
    return pd.DataFrame(
        {
            "false_positive_rate": false_positive_rate,
            "true_positive_rate": true_positive_rate,
            "threshold": thresholds,
        }
    )


def credo_det_curve(y_true, y_prob):
    """Detection error tradeoff (DET) curve tabulated on an interpolated threshold grid."""
    fpr, fnr, t = sk_metrics.det_curve(y_true, y_prob)
    (
        false_positive_rate,
        false_negative_rate,
        thresholds,
    ) = interpolate_increasing_thresholds(t.tolist(), fpr.tolist(), fnr.tolist())
    return pd.DataFrame(
        {
            "false_positive_rate": false_positive_rate,
            "false_negative_rate": false_negative_rate,
            "thresholds": thresholds,
        }
    )


def gini_coefficient_discriminatory(y_true, y_prob, **kwargs):
    """Returns the Gini Coefficient of a discriminatory model

    NOTE: There are two popular, yet distinct metrics known as the 'gini coefficient'.

    The value calculated by this function provides a summary statistic for the Cumulative Accuracy Profile (CAP) curve.
    This notion of Gini coefficient (or Gini index) is a _discriminatory_ metric. It helps characterize the ordinal
    relationship between predictions made by a model and the ground truth values for each sample.

    This metric has a linear relationship with the area under the receiver operating characteristic curve:
    :math:`G = 2*AUC - 1`

    See https://towardsdatascience.com/using-the-gini-coefficient-to-evaluate-the-performance-of-credit-score-models-59fe13ef420
    for more details.

    Parameters
    ----------
    y_true : array-like
        Ground truth (correct) labels.
    y_prob : array-like
        Predicted probabilities returned by a call to the model's `predict_proba()` function.

    Returns
    -------
    float
        Discriminatory Gini Coefficient
    """
    G = (2 * sk_metrics.roc_auc_score(y_true, y_prob, **kwargs)) - 1
    return G

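# Usage sketch (labels and scores are made up): the discriminatory Gini is a
# rescaling of ROC AUC, so a perfect ranking gives 1 and a random one gives 0
# in expectation.
#
#     y_true = np.array([0, 1, 1, 0, 1, 0])
#     y_prob = np.array([0.2, 0.8, 0.6, 0.4, 0.9, 0.1])
#     gini_coefficient_discriminatory(y_true, y_prob)  # 2 * AUC - 1
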

def population_stability_index(
    expected_array,
    actual_array,
    percentage=False,
    buckets: int = 10,
    buckettype: Literal["bins", "quantiles"] = "bins",
):
    r"""Calculate the PSI for a single variable.

    PSI is a measure of how much a distribution has changed over time or between
    two different samples of a population.
    It does this by bucketing the two distributions and comparing the percents of
    items in each of the buckets. The final result is a single number:

    :math:`PSI = \sum \left( Actual_{\%} - Expected_{\%} \right) \cdot \ln\left( \frac{Actual_{\%}}{Expected_{\%}} \right)`

    The common interpretations of the PSI result are:

        PSI < 0.1: no significant population change
        PSI < 0.25: moderate population change
        PSI >= 0.25: significant population change

    The number of buckets chosen and the bucketing logic affect the final result.

    References
    ----------
    Based on the code in: github.com/mwburke by Matthew Burke.
    For an implementation walk-through: https://mwburke.github.io/data%20science/2018/04/29/population-stability-index.html
    For a more theoretical reference: https://scholarworks.wmich.edu/cgi/viewcontent.cgi?article=4249&context=dissertations

    Parameters
    ----------
    expected_array: array-like
        Array of expected/initial values
    actual_array: array-like
        Array of new values
    percentage: bool
        When True the arrays are interpreted as already binned/aggregated percentages.
        This is so that the user can perform their own aggregation and pass it
        directly to the metric. Default = False
    buckets: int
        number of percentile ranges to bucket the values into
    buckettype: Literal["bins", "quantiles"]
        type of strategy for creating buckets; "bins" splits into evenly spaced bins,
        "quantiles" splits into quantile buckets

    Returns
    -------
    float
        psi_value: calculated PSI value
    """
    epsilon: float = 0.001

    def scale_range(values, lower, upper):
        # Linearly rescale `values` so that they span [lower, upper].
        values += -(np.min(values))
        values /= np.max(values) / (upper - lower)
        values += lower
        return values

    if not percentage:
        # Define histogram breakpoints
        breakpoints = np.arange(0, buckets + 1) / (buckets) * 100

        if buckettype == "bins":
            breakpoints = scale_range(
                breakpoints, np.min(expected_array), np.max(expected_array)
            )
        elif buckettype == "quantiles":
            breakpoints = np.stack(
                [np.percentile(expected_array, b) for b in breakpoints]
            )

        # Populate bins and calculate percentages
        expected_percents = np.histogram(expected_array, breakpoints)[0] / len(
            expected_array
        )
        actual_percents = np.histogram(actual_array, breakpoints)[0] / len(actual_array)
    else:
        # Copy to float arrays so array-like inputs support the masking below
        # and the caller's data is not mutated.
        expected_percents = np.array(expected_array, dtype=float)
        actual_percents = np.array(actual_array, dtype=float)

    # Substitute 0 with an arbitrary epsilon << 1
    # This is to avoid inf in the following calculations
    expected_percents[expected_percents == 0] = epsilon
    actual_percents[actual_percents == 0] = epsilon

    psi_values = (expected_percents - actual_percents) * np.log(
        expected_percents / actual_percents
    )

    return np.sum(psi_values)

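# Usage sketch (synthetic data): PSI between a reference sample and a shifted
# sample, using 10 equal-width bins.
#
#     rng = np.random.default_rng(0)
#     expected = rng.normal(0.0, 1.0, 1000)
#     actual = rng.normal(0.3, 1.0, 1000)
#     population_stability_index(expected, actual, buckets=10, buckettype="bins")
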

def kl_divergence(p, q):
    """Calculates KL divergence of two discrete distributions

    Parameters
    ----------
    p : list
        first distribution
    q : list
        second distribution

    Returns
    -------
    float
        KL divergence

    Examples
    --------
    >>> p = [.05, .1, .2, .05, .15, .25, .08, .12]
    >>> q = [.30, .1, .2, .10, .10, .02, .08, .10]
    >>> round(kl_divergence(p, q), 2)
    0.59
    """
    if len(p) != len(q):
        raise ValueError("`p` and `q` must have the same length.")

    EPSILON = 1e-12
    vals = []
    for pi, qi in zip(p, q):
        vals.append(pi * np.log((pi + EPSILON) / (qi + EPSILON)))

    return sum(vals)


def normalized_discounted_cumulative_kl_divergence(ranked_list, desired_proportions):
    """Calculates normalized discounted cumulative KL divergence (NDKL)

    It is non-negative, with larger values indicating a greater divergence between the
    desired and actual distributions of group labels. It ranges from 0 to inf
    and the ideal value is 0 for fairness.

    Parameters
    ----------
    ranked_list : list
        Ranked list of items
    desired_proportions : dict
        The desired proportion for each group

    Returns
    -------
    float
        normalized discounted cumulative KL divergence

    References
    ----------
    Geyik, Sahin Cem, Stuart Ambler, and Krishnaram Kenthapadi. "Fairness-aware ranking in search &
    recommendation systems with application to LinkedIn talent search."
    Proceedings of the 25th ACM SIGKDD International Conference on Knowledge Discovery &
    Data Mining. 2019.

    Examples
    --------
    >>> ranked_list = ['female', 'male', 'male', 'female', 'male', 'male']
    >>> desired_proportions = {'female': 0.6, 'male': 0.4}
    >>> normalized_discounted_cumulative_kl_divergence(ranked_list, desired_proportions)
    0.208096993149323
    """
    num_items = len(ranked_list)
    Z = np.sum(1 / (np.log2(np.arange(1, num_items + 1) + 1)))

    total = 0.0
    for k in range(1, num_items + 1):
        item_attr_k = list(ranked_list[:k])
        item_distr = [
            item_attr_k.count(attr) / len(item_attr_k)
            for attr in desired_proportions.keys()
        ]
        total += (1 / np.log2(k + 1)) * kl_divergence(
            item_distr, list(desired_proportions.values())
        )

    ndkl = (1 / Z) * total

    return ndkl


def skew_parity(items_list, desired_proportions, parity_type="difference"):
    """Calculates skew parity

    Compares max_skew with min_skew, where the skew of a group is the proportion
    of selected items from that group divided by the desired proportion for that group.

    Parameters
    ----------
    items_list : list
        List of items (e.g., a ranked or selected set), one group label per item
    desired_proportions : dict
        The desired proportion for each group
    parity_type : str
        Type of parity to use. Two options:
            'difference' : `max_skew - min_skew`. It ranges from 0 to inf and the ideal value is 0.
            'ratio' : `min_skew / max_skew`. It ranges from 0 to 1 and the ideal value is 1.

    Returns
    -------
    float
        skew parity value (difference or ratio, depending on `parity_type`)

    Examples
    --------
    >>> ranked_list = ['female', 'male', 'male', 'female', 'male', 'male']
    >>> desired_proportions = {'female': 0.6, 'male': 0.4}
    >>> skew_parity(ranked_list, desired_proportions)
    1.1111111111087035
    """
    EPSILON = 1e-12
    groups, counts = np.unique(items_list, return_counts=True)
    subset_proportions = dict(zip(groups, counts / len(items_list)))

    skew = {}
    for g in groups:
        sk = (subset_proportions[g] + EPSILON) / (desired_proportions[g] + EPSILON)
        skew[g] = sk

    if parity_type == "difference":
        parity = max(skew.values()) - min(skew.values())
    elif parity_type == "ratio":
        parity = min(skew.values()) / max(skew.values())
    else:
        raise ValueError(
            f"Possible values of `parity_type` are 'difference' and 'ratio'. You provided {parity_type}"
        )

    return parity


def credo_gain_chart(y_true, y_prob, bins=10):
    """Cumulative gain and lift table, bucketed into `bins` score-ordered groups."""
    # Accept array-like inputs; the fancy indexing below requires ndarrays.
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)

    sort_indices = np.argsort(y_prob)[::-1]  # descending order
    total_events = np.sum(y_true)
    overall_rate = np.mean(y_true)

    sorted_y_true = y_true[sort_indices]

    chunked_y_true = np.array_split(sorted_y_true, bins)

    cumulative_samples = np.zeros(bins)
    cumulative_events = np.zeros(bins)
    cumulative_ratio = np.zeros(bins)
    lift = np.zeros(bins)

    for i in range(bins):
        cumulative_samples[i] = len(np.hstack(chunked_y_true[: i + 1]))
        cumulative_events[i] = np.sum(np.hstack(chunked_y_true[: i + 1]))
        cumulative_ratio[i] = cumulative_events[i] / total_events
        lift[i] = (cumulative_events[i] / cumulative_samples[i]) / overall_rate
    return pd.DataFrame(
        {
            "Decile": np.arange(bins) + 1,
            "Cumulative Samples": cumulative_samples,
            "Cumulative Events": cumulative_events,
            "Cumulative Gain": cumulative_ratio,
            "Lift": lift,
        }
    )
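
# Usage sketch (synthetic data): a decile-level gain/lift table from scores.
#
#     y_true = np.array([1, 0, 0, 1, 0, 1, 0, 0, 1, 0] * 10)
#     y_prob = np.linspace(1.0, 0.0, num=len(y_true))
#     credo_gain_chart(y_true, y_prob, bins=10)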