Coverage for credoai/modules/metrics_credoai.py: 73%
183 statements
coverage.py v7.1.0, created at 2023-02-13 21:56 +0000
1"""Custom metrics defined by Credo AI"""
3from functools import partial
4from typing import Literal
6import numpy as np
7import pandas as pd
8import scipy.stats as st
9from fairlearn.metrics import make_derived_metric, true_positive_rate
10from sklearn import metrics as sk_metrics
11from sklearn.metrics import accuracy_score, confusion_matrix
12from sklearn.utils import check_consistent_length


def multiclass_confusion_metrics(y_true, y_pred, metric=None, average="weighted"):
    """Calculate metrics derived from a multiclass confusion matrix.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground truth (correct) target values.
    y_pred : array-like of shape (n_samples,)
        Estimated targets as returned by a classifier.
    metric : str, optional
        If provided, returns a specific metric. All metrics are returned if None is provided.
        Options are:
            "TPR": Sensitivity, hit rate, recall, or true positive rate
            "TNR": Specificity or true negative rate
            "PPV": Precision or positive predictive value
            "NPV": Negative predictive value
            "FPR": Fall out or false positive rate
            "FNR": False negative rate
            "FDR": False discovery rate
            "ACC": Overall accuracy
    average : str
        Options are "weighted", "macro" or None (which will return the values for each label)

    Returns
    -------
    dict or float
        dict if metric is not provided
    """
    cnf_matrix = confusion_matrix(y_true, y_pred)
    FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)
    FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
    TP = np.diag(cnf_matrix)
    TN = cnf_matrix.sum() - (FP + FN + TP)

    FP = FP.astype(float)
    FN = FN.astype(float)
    TP = TP.astype(float)
    TN = TN.astype(float)

    metrics = {
        "TPR": TP / (TP + FN),
        "TNR": TN / (TN + FP),
        "PPV": TP / (TP + FP),
        "NPV": TN / (TN + FN),
        "FPR": FP / (FP + TN),
        "FNR": FN / (TP + FN),
        "FDR": FP / (TP + FP),
        "ACC": (TP + TN) / (TP + FP + FN + TN),
    }
    if average == "weighted":
        weights = np.unique(y_true, return_counts=True)[1] / len(y_true)
        metrics = {k: np.average(v, weights=weights) for k, v in metrics.items()}
    elif average == "macro":
        metrics = {k: v.mean() for k, v in metrics.items()}
    if metric:
        return metrics[metric]
    else:
        return metrics
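
# Illustrative usage sketch with hypothetical data (not part of the original
# module): with perfect predictions every per-class rate is ideal, so the
# weighted averages are exact.
# >>> multiclass_confusion_metrics([0, 1, 2, 0], [0, 1, 2, 0], metric="ACC")
# 1.0
# >>> multiclass_confusion_metrics([0, 1, 2, 0], [0, 1, 2, 0], metric="FDR")
# 0.0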


def general_wilson(p, n, z=1.96):
    """Return lower and upper bound using Wilson Interval.

    Parameters
    ----------
    p : float
        Proportion of successes.
    n : int
        Total number of trials.
    z : float
        Z-score, which indicates the number of standard deviations of confidence

    Returns
    -------
    np.ndarray
        Array of length 2 of form: [lower_bound, upper_bound]
    """
    denominator = 1 + z**2 / n
    centre_adjusted_probability = p + z * z / (2 * n)
    adjusted_standard_deviation = np.sqrt((p * (1 - p) + z * z / (4 * n))) / np.sqrt(n)
    lower_bound = (
        centre_adjusted_probability - z * adjusted_standard_deviation
    ) / denominator
    upper_bound = (
        centre_adjusted_probability + z * adjusted_standard_deviation
    ) / denominator
    return np.array([lower_bound, upper_bound])


def wilson_ci(num_hits, num_total, confidence=0.95):
    """Convenience wrapper for general_wilson"""
    z = st.norm.ppf((1 + confidence) / 2)
    p = num_hits / num_total
    return general_wilson(p, num_total, z=z)
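
# Illustrative usage sketch with hypothetical counts (not part of the original
# module): a 95% Wilson interval for 50 successes out of 100 trials is roughly
# [0.40, 0.60].
# >>> wilson_ci(50, 100, confidence=0.95)
# array([0.404..., 0.596...])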


def confusion_wilson(y_true, y_pred, metric="true_positive_rate", confidence=0.95):
    """Return Wilson Interval bounds for performance metrics

    Metrics derived from confusion matrix

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground truth labels
    y_pred : array-like of shape (n_samples,)
        Predicted labels
    metric : string
        Indicates the kind of performance metric. Must be one of
        "true_positive_rate", "true_negative_rate", "false_positive_rate",
        or "false_negative_rate".

    Returns
    -------
    np.ndarray
        Array of length 2 of form: [lower_bound, upper_bound]
    """
    check_consistent_length(y_true, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    negatives = tn + fp
    positives = tp + fn
    if metric == "true_positive_rate":
        numer = tp
        denom = positives
    elif metric == "true_negative_rate":
        numer = tn
        denom = negatives
    elif metric == "false_positive_rate":
        numer = fp
        denom = negatives
    elif metric == "false_negative_rate":
        numer = fn
        denom = positives
    else:
        raise ValueError(
            """
            Metric must be one of the following:
                -true_positive_rate
                -true_negative_rate
                -false_positive_rate
                -false_negative_rate
            """
        )

    bounds = wilson_ci(numer, denom, confidence)
    return bounds
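
# Illustrative usage sketch with hypothetical labels (not part of the original
# module): returns [lower, upper] Wilson bounds on the chosen rate.
# >>> confusion_wilson([0, 1, 1, 0, 1], [0, 1, 0, 0, 1], metric="true_positive_rate")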


def accuracy_wilson(y_true, y_pred, confidence=0.95):
    """Return Wilson Interval bounds for accuracy metric.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground truth labels
    y_pred : array-like of shape (n_samples,)
        Predicted labels

    Returns
    -------
    np.ndarray
        Array of length 2 of form: [lower_bound, upper_bound]
    """
    check_consistent_length(y_true, y_pred)
    score = accuracy_score(y_true, y_pred)
    bounds = general_wilson(score, len(y_true), confidence)
    return bounds
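
# Illustrative usage sketch with hypothetical labels (not part of the original
# module): Wilson bounds around the observed accuracy.
# >>> accuracy_wilson([0, 1, 1, 0, 1], [0, 1, 0, 0, 1], confidence=0.95)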


# metric definitions


def false_discovery_rate(y_true, y_pred, **kwargs):
    """Compute the false discovery rate.

    False discovery rate is 1-precision, or ``fp / (tp + fp)`` where ``tp`` is the number of
    true positives and ``fp`` the number of false positives. The false discovery rate is
    intuitively the rate at which the classifier will be wrong when
    labeling an example as positive.

    The best value is 0 and the worst value is 1.

    Parameters
    ----------
    y_true : 1d array-like, or label indicator array / sparse matrix
        Ground truth (correct) target values.
    y_pred : 1d array-like, or label indicator array / sparse matrix
        Estimated targets as returned by a classifier.
    kwargs : key, value mappings
        Other keyword arguments are passed through
        to sklearn.metrics.precision_score
    """
    return 1.0 - sk_metrics.precision_score(y_true, y_pred, **kwargs)
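
# Illustrative worked example with hypothetical labels (not part of the original
# module): two true positives and one false positive give precision 2/3, so the
# false discovery rate is 1/3.
# >>> round(false_discovery_rate([0, 1, 1, 0], [1, 1, 1, 0]), 3)
# 0.333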


def false_omission_rate(y_true, y_pred, **kwargs):
    """Compute the false omission rate.

    False omission rate is ``fn / (tn + fn)`` where ``fn`` is the number of
    false negatives and ``tn`` the number of true negatives. The false omission rate is
    intuitively the rate at which the classifier will be wrong when
    labeling an example as negative.

    The best value is 0 and the worst value is 1.

    Parameters
    ----------
    y_true : 1d array-like, or label indicator array / sparse matrix
        Ground truth (correct) target values.
    y_pred : 1d array-like, or label indicator array / sparse matrix
        Estimated targets as returned by a classifier.
    kwargs : key, value mappings
        Accepted for signature compatibility; not used by this metric.
    """
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return fn / (fn + tn)
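
# Illustrative worked example with hypothetical labels (not part of the original
# module): one false negative and one true negative among the predicted
# negatives give a false omission rate of 0.5.
# >>> false_omission_rate([0, 1, 1, 0], [0, 0, 1, 1])
# 0.5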


def equal_opportunity_difference(
    y_true, y_pred, *, sensitive_features, method="between_groups", sample_weight=None
) -> float:
    """Calculate the equal opportunity difference.

    Equivalent to the `true_positive_rate_difference` defined as the difference between the
    largest and smallest of :math:`P[h(X)=1 | A=a, Y=1]`, across all values :math:`a`
    of the sensitive feature(s).

    Parameters
    ----------
    y_true : array-like
        Ground truth (correct) labels.
    y_pred : array-like
        Predicted labels :math:`h(X)` returned by the classifier.
    sensitive_features :
        The sensitive features over which equal opportunity should be assessed
    method : str
        How to compute the differences. See :func:`fairlearn.metrics.MetricFrame.difference`
        for details.
    sample_weight : array-like
        The sample weights

    Returns
    -------
    float
        The equal opportunity difference
    """
    fun = make_derived_metric(metric=true_positive_rate, transform="difference")
    return fun(
        y_true,
        y_pred,
        sensitive_features=sensitive_features,
        method=method,
        sample_weight=sample_weight,
    )
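
# Illustrative usage sketch with a hypothetical binary sensitive feature (not
# part of the original module): group "a" has TPR 1.0 and group "b" has TPR 0.0,
# so the equal opportunity difference is 1.0.
# >>> y_true = [1, 1, 1, 1, 0, 0]
# >>> y_pred = [1, 1, 0, 0, 0, 0]
# >>> groups = ["a", "a", "b", "b", "a", "b"]
# >>> equal_opportunity_difference(y_true, y_pred, sensitive_features=groups)
# 1.0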


def ks_statistic(y_true, y_pred) -> float:
    """Performs the two-sample Kolmogorov-Smirnov test (two-sided)

    The test compares the underlying continuous distributions F(x) and G(x) of two independent samples.
    The null hypothesis is that the two distributions are identical, F(x)=G(x).
    If the KS statistic is small or the p-value is high,
    then we cannot reject the null hypothesis in favor of the alternative.

    For practical purposes, if the statistic value is higher than the critical value, the two distributions are different.

    Parameters
    ----------
    y_true : array-like
        Ground truth (correct) labels.
    y_pred : array-like
        Predicted labels :math:`h(X)` returned by the classifier.

    Returns
    -------
    float
        KS statistic value
    """
    ks_stat = st.ks_2samp(y_true, y_pred).statistic

    return ks_stat
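
# Illustrative usage sketch with hypothetical samples (not part of the original
# module): identical samples give a KS statistic of 0, disjoint samples give 1.
# >>> ks_statistic([1, 2, 3, 4], [1, 2, 3, 4])
# 0.0
# >>> ks_statistic([1, 2, 3], [10, 11, 12])
# 1.0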


def interpolate_increasing_thresholds(lib_thresholds, *series):
    """Step-interpolate sklearn curve outputs onto an evenly spaced, increasing threshold grid.

    Consumes (pops from) the supplied lists, so callers should pass fresh copies.
    """
    out = [list() for i in series]
    quantization = 1 / (
        len(lib_thresholds) * (max(lib_thresholds) - min(lib_thresholds))
    )
    interpolated_thresholds = np.arange(
        min(lib_thresholds), max(lib_thresholds), quantization
    )

    for t in interpolated_thresholds:
        if t >= lib_thresholds[0]:
            lib_thresholds.pop(0)
            for s in series:
                s.pop(0)
        for i, s in enumerate(out):
            s.append(series[i][0])

    return out + [interpolated_thresholds]


def interpolate_decreasing_thresholds(lib_thresholds, *series):
    """Step-interpolate sklearn curve outputs onto an evenly spaced, decreasing threshold grid.

    Consumes (pops from) the supplied lists, so callers should pass fresh copies.
    """
    out = [list() for i in series]
    quantization = -1 / (
        len(lib_thresholds) * (max(lib_thresholds) - min(lib_thresholds))
    )
    interpolated_thresholds = np.arange(
        max(lib_thresholds), min(lib_thresholds), quantization
    )

    for t in interpolated_thresholds:
        for i, s in enumerate(out):
            s.append(series[i][0])
        if t <= lib_thresholds[0]:
            lib_thresholds.pop(0)
            for s in series:
                s.pop(0)

    return out + [interpolated_thresholds]


def credo_pr_curve(y_true, y_prob):
    p, r, t = sk_metrics.precision_recall_curve(y_true, y_prob)
    (
        precision,
        recall,
        thresholds,
    ) = interpolate_increasing_thresholds(t.tolist(), p.tolist(), r.tolist())
    return pd.DataFrame(
        {
            "precision": precision,
            "recall": recall,
            "threshold": thresholds,
        }
    )


def credo_roc_curve(y_true, y_prob):
    fpr, tpr, thresh = sk_metrics.roc_curve(y_true, y_prob)
    (
        false_positive_rate,
        true_positive_rate,
        thresholds,
    ) = interpolate_decreasing_thresholds(thresh.tolist(), fpr.tolist(), tpr.tolist())
    return pd.DataFrame(
        {
            "false_positive_rate": false_positive_rate,
            "true_positive_rate": true_positive_rate,
            "threshold": thresholds,
        }
    )


def credo_det_curve(y_true, y_prob):
    fpr, fnr, t = sk_metrics.det_curve(y_true, y_prob)
    (
        false_positive_rate,
        false_negative_rate,
        thresholds,
    ) = interpolate_increasing_thresholds(t.tolist(), fpr.tolist(), fnr.tolist())
    return pd.DataFrame(
        {
            "false_positive_rate": false_positive_rate,
            "false_negative_rate": false_negative_rate,
            "thresholds": thresholds,
        }
    )
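
# Illustrative usage sketch with hypothetical scores (not part of the original
# module): each helper wraps the corresponding sklearn curve and returns the
# resampled values as a DataFrame.
# >>> y_true = np.array([0, 0, 1, 1])
# >>> y_prob = np.array([0.1, 0.4, 0.35, 0.8])
# >>> credo_pr_curve(y_true, y_prob).columns.tolist()
# ['precision', 'recall', 'threshold']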


def gini_coefficient_discriminatory(y_true, y_prob, **kwargs):
    """Returns the Gini Coefficient of a discriminatory model

    NOTE: There are two popular, yet distinct metrics known as the 'gini coefficient'.

    The value calculated by this function provides a summary statistic for the Cumulative Accuracy Profile (CAP) curve.
    This notion of Gini coefficient (or Gini index) is a _discriminatory_ metric. It helps characterize the ordinal
    relationship between predictions made by a model and the ground truth values for each sample.

    This metric has a linear relationship with the area under the receiver operating characteristic curve:
    :math:`G = 2*AUC - 1`

    See https://towardsdatascience.com/using-the-gini-coefficient-to-evaluate-the-performance-of-credit-score-models-59fe13ef420
    for more details.

    Parameters
    ----------
    y_true : array-like
        Ground truth (correct) labels.
    y_prob : array-like
        Predicted probabilities returned by a call to the model's `predict_proba()` function.

    Returns
    -------
    float
        Discriminatory Gini Coefficient
    """
    G = (2 * sk_metrics.roc_auc_score(y_true, y_prob, **kwargs)) - 1
    return G
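
# Illustrative worked example with hypothetical scores (not part of the original
# module): a model that ranks every positive above every negative has AUC = 1,
# so the discriminatory Gini coefficient is 2 * 1 - 1 = 1.
# >>> gini_coefficient_discriminatory([0, 0, 1, 1], [0.1, 0.2, 0.8, 0.9])
# 1.0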


def population_stability_index(
    expected_array,
    actual_array,
    percentage=False,
    buckets: int = 10,
    buckettype: Literal["bins", "quantiles"] = "bins",
):
    r"""Calculate the PSI for a single variable.

    PSI is a measure of how much a distribution has changed over time or between
    two different samples of a population.
    It does this by bucketing the two distributions and comparing the percents of
    items in each of the buckets. The final result is a single number:

    :math:`PSI = \sum \left( Actual_{\%} - Expected_{\%} \right) \cdot \ln\left( \frac{Actual_{\%}}{Expected_{\%}} \right)`

    The common interpretations of the PSI result are:

    PSI < 0.1: no significant population change
    PSI < 0.25: moderate population change
    PSI >= 0.25: significant population change

    The number of buckets chosen and the bucketing logic affect the final result.

    References
    ----------
    Based on the code in: github.com/mwburke by Matthew Burke.
    For an implementation walkthrough: https://mwburke.github.io/data%20science/2018/04/29/population-stability-index.html
    For a more theoretical reference: https://scholarworks.wmich.edu/cgi/viewcontent.cgi?article=4249&context=dissertations

    Parameters
    ----------
    expected_array : array-like
        Array of expected/initial values
    actual_array : array-like
        Array of new values
    percentage : bool
        When True the arrays are interpreted as already binned/aggregated. This is
        so that the user can perform their own aggregation and pass it directly to
        the metric. Default = False
    buckets : int
        Number of percentile ranges to bucket the values into
    buckettype : Literal["bins", "quantiles"]
        Type of strategy for creating buckets: "bins" splits into even splits,
        "quantiles" splits into quantile buckets

    Returns
    -------
    psi_value : float
        Calculated PSI value
    """
    epsilon: float = 0.001

    def scale_range(input, min, max):
        input += -(np.min(input))
        input /= np.max(input) / (max - min)
        input += min
        return input

    if not percentage:
        # Define histogram breakpoints
        breakpoints = np.arange(0, buckets + 1) / (buckets) * 100

        if buckettype == "bins":
            breakpoints = scale_range(
                breakpoints, np.min(expected_array), np.max(expected_array)
            )
        elif buckettype == "quantiles":
            breakpoints = np.stack(
                [np.percentile(expected_array, b) for b in breakpoints]
            )

        # Populate bins and calculate percentages
        expected_percents = np.histogram(expected_array, breakpoints)[0] / len(
            expected_array
        )
        actual_percents = np.histogram(actual_array, breakpoints)[0] / len(actual_array)
    else:
        expected_percents = expected_array
        actual_percents = actual_array

    # Substitute 0 with an arbitrary epsilon << 1
    # This is to avoid inf in the following calculations
    expected_percents[expected_percents == 0] = epsilon
    actual_percents[actual_percents == 0] = epsilon

    psi_values = (expected_percents - actual_percents) * np.log(
        expected_percents / actual_percents
    )

    return np.sum(psi_values)
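
# Illustrative usage sketch with hypothetical proportions (not part of the
# original module): with percentage=True the inputs are treated as
# already-binned proportions, and identical distributions give a PSI of 0.
# >>> expected = np.array([0.25, 0.25, 0.25, 0.25])
# >>> actual = np.array([0.25, 0.25, 0.25, 0.25])
# >>> population_stability_index(expected, actual, percentage=True)
# 0.0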


def kl_divergence(p, q):
    """Calculates KL divergence of two discrete distributions

    Parameters
    ----------
    p : list
        first distribution
    q : list
        second distribution

    Returns
    -------
    float
        KL divergence

    Examples
    --------
    >>> p = [.05, .1, .2, .05, .15, .25, .08, .12]
    >>> q = [.30, .1, .2, .10, .10, .02, .08, .10]
    >>> round(kl_divergence(p, q), 2)
    0.59
    """
    if len(p) != len(q):
        raise ValueError("`p` and `q` must have the same length.")

    EPSILON = 1e-12
    vals = []
    for pi, qi in zip(p, q):
        vals.append(pi * np.log((pi + EPSILON) / (qi + EPSILON)))

    return sum(vals)


def normalized_discounted_cumulative_kl_divergence(ranked_list, desired_proportions):
    """Calculates normalized discounted cumulative KL divergence (NDKL)

    It is non-negative, with larger values indicating a greater divergence between the
    desired and actual distributions of group labels. It ranges from 0 to inf
    and the ideal value is 0 for fairness.

    Parameters
    ----------
    ranked_list : list
        Ranked list of items
    desired_proportions : dict
        The desired proportion for each group

    Returns
    -------
    float
        normalized discounted cumulative KL divergence

    References
    ----------
    Geyik, Sahin Cem, Stuart Ambler, and Krishnaram Kenthapadi. "Fairness-aware ranking in search &
    recommendation systems with application to linkedin talent search."
    Proceedings of the 25th acm sigkdd international conference on knowledge discovery &
    data mining. 2019.

    Examples
    --------
    >>> ranked_list = ['female', 'male', 'male', 'female', 'male', 'male']
    >>> desired_proportions = {'female': 0.6, 'male': 0.4}
    >>> normalized_discounted_cumulative_kl_divergence(ranked_list, desired_proportions)
    0.208096993149323
    """
    num_items = len(ranked_list)
    Z = np.sum(1 / (np.log2(np.arange(1, num_items + 1) + 1)))

    total = 0.0
    for k in range(1, num_items + 1):
        item_attr_k = list(ranked_list[:k])
        item_distr = [
            item_attr_k.count(attr) / len(item_attr_k)
            for attr in desired_proportions.keys()
        ]
        total += (1 / np.log2(k + 1)) * kl_divergence(
            item_distr, list(desired_proportions.values())
        )

    ndkl = (1 / Z) * total

    return ndkl


def skew_parity(items_list, desired_proportions, parity_type="difference"):
    """Calculates skew parity

    Compares max_skew to min_skew, where skew is the proportion of the selected
    items from a group over the desired proportion for that group.

    Parameters
    ----------
    items_list : list
        List of items (group labels)
    desired_proportions : dict
        The desired proportion for each group
    parity_type : str
        Type of parity to use. Two options:
            'difference' : `max_skew - min_skew`. It ranges from 0 to inf and the ideal value is 0.
            'ratio' : `min_skew / max_skew`. It ranges from 0 to 1 and the ideal value is 1.

    Returns
    -------
    float
        skew parity difference

    Examples
    --------
    >>> items_list = ['female', 'male', 'male', 'female', 'male', 'male']
    >>> desired_proportions = {'female': 0.6, 'male': 0.4}
    >>> skew_parity(items_list, desired_proportions)
    1.1111111111087035
    """
    EPSILON = 1e-12
    groups, counts = np.unique(items_list, return_counts=True)
    subset_proportions = dict(zip(groups, counts / len(items_list)))

    skew = {}
    for g in groups:
        sk = (subset_proportions[g] + EPSILON) / (desired_proportions[g] + EPSILON)
        skew[g] = sk

    if parity_type == "difference":
        parity = max(skew.values()) - min(skew.values())
    elif parity_type == "ratio":
        parity = min(skew.values()) / max(skew.values())
    else:
        raise ValueError(
            f"Possible values of `parity_type` are 'difference' and 'ratio'. You provided {parity_type}"
        )

    return parity


def credo_gain_chart(y_true, y_prob, bins=10):
    """Compute cumulative gain and lift per bin (deciles by default).

    ``y_true`` must support numpy fancy indexing (e.g. an ndarray).
    """
    sort_indices = np.argsort(y_prob)[::-1]  # descending order
    total_events = np.sum(y_true)
    overall_rate = np.mean(y_true)

    sorted_y_true = y_true[sort_indices]

    chunked_y_true = np.array_split(sorted_y_true, bins)

    cumulative_samples = np.zeros(bins)
    cumulative_events = np.zeros(bins)
    cumulative_ratio = np.zeros(bins)
    lift = np.zeros(bins)

    for i in range(bins):
        cumulative_samples[i] = len(np.hstack(chunked_y_true[: i + 1]))
        cumulative_events[i] = np.sum(np.hstack(chunked_y_true[: i + 1]))
        cumulative_ratio[i] = cumulative_events[i] / total_events
        lift[i] = (cumulative_events[i] / cumulative_samples[i]) / overall_rate
    return pd.DataFrame(
        {
            "Decile": np.arange(bins) + 1,
            "Cumulative Samples": cumulative_samples,
            "Cumulative Events": cumulative_events,
            "Cumulative Gain": cumulative_ratio,
            "Lift": lift,
        }
    )
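
# Illustrative usage sketch with hypothetical data (not part of the original
# module): the result has one row per bin and the five columns defined above.
# >>> y_true = np.array([0, 1, 0, 1, 1, 0, 0, 1, 0, 1])
# >>> y_prob = np.array([0.1, 0.9, 0.2, 0.8, 0.7, 0.6, 0.4, 0.3, 0.05, 0.95])
# >>> credo_gain_chart(y_true, y_prob, bins=5).shape
# (5, 5)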