# credoai/modules/metrics_credoai.py
1"""Custom metrics defined by Credo AI"""
3from typing import Literal
5import numpy as np
6import pandas as pd
7import scipy.stats as st
8from fairlearn.metrics import make_derived_metric, true_positive_rate
9from sklearn import metrics as sk_metrics
10from sklearn.metrics import accuracy_score, confusion_matrix
11from sklearn.utils import check_consistent_length


def general_wilson(p, n, z=1.96):
    """Return the lower and upper bounds of a Wilson score interval.

    Parameters
    ----------
    p : float
        Proportion of successes.
    n : int
        Total number of trials.
    z : float
        Z-score, which indicates the number of standard deviations of confidence.

    Returns
    -------
    np.ndarray
        Array of length 2 of form: [lower_bound, upper_bound]
    """
    denominator = 1 + z**2 / n
    centre_adjusted_probability = p + z * z / (2 * n)
    adjusted_standard_deviation = np.sqrt((p * (1 - p) + z * z / (4 * n))) / np.sqrt(n)
    lower_bound = (
        centre_adjusted_probability - z * adjusted_standard_deviation
    ) / denominator
    upper_bound = (
        centre_adjusted_probability + z * adjusted_standard_deviation
    ) / denominator
    return np.array([lower_bound, upper_bound])


def wilson_ci(num_hits, num_total, confidence=0.95):
    """Convenience wrapper for general_wilson that converts a confidence level to a z-score."""
    z = st.norm.ppf((1 + confidence) / 2)
    p = num_hits / num_total
    return general_wilson(p, num_total, z=z)
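

# Illustrative usage sketch (added for this write-up, not part of the original
# module): a 95% Wilson interval around 45 successes in 100 trials.
def _example_wilson_ci():
    lower, upper = wilson_ci(num_hits=45, num_total=100, confidence=0.95)
    # The interval brackets the observed proportion of 0.45, roughly [0.36, 0.55].
    print(f"95% Wilson CI: [{lower:.3f}, {upper:.3f}]")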


def confusion_wilson(y_true, y_pred, metric="true_positive_rate", confidence=0.95):
    """Return Wilson interval bounds for performance metrics derived from the confusion matrix.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground truth labels
    y_pred : array-like of shape (n_samples,)
        Predicted labels
    metric : string
        Indicates the kind of performance metric. Must be one of
        "true_positive_rate", "true_negative_rate", "false_positive_rate",
        or "false_negative_rate".
    confidence : float
        Confidence level for the interval. Default 0.95

    Returns
    -------
    np.ndarray
        Array of length 2 of form: [lower_bound, upper_bound]
    """
    check_consistent_length(y_true, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    negatives = tn + fp
    positives = tp + fn
    if metric == "true_positive_rate":
        numer = tp
        denom = positives
    elif metric == "true_negative_rate":
        numer = tn
        denom = negatives
    elif metric == "false_positive_rate":
        numer = fp
        denom = negatives
    elif metric == "false_negative_rate":
        numer = fn
        denom = positives
    else:
        raise ValueError(
            """
            Metric must be one of the following:
            -true_positive_rate
            -true_negative_rate
            -false_positive_rate
            -false_negative_rate
            """
        )

    bounds = wilson_ci(numer, denom, confidence)
    return bounds
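

# Illustrative usage sketch (added here, not part of the original module): a
# Wilson interval for the false positive rate of a toy binary prediction set.
def _example_confusion_wilson():
    y_true = [0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
    y_pred = [0, 0, 0, 1, 0, 1, 1, 1, 1, 1]
    lower, upper = confusion_wilson(y_true, y_pred, metric="false_positive_rate")
    print(f"FPR 95% Wilson CI: [{lower:.3f}, {upper:.3f}]")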


def accuracy_wilson(y_true, y_pred, confidence=0.95):
    """Return Wilson interval bounds for the accuracy metric.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground truth labels
    y_pred : array-like of shape (n_samples,)
        Predicted labels
    confidence : float
        Confidence level for the interval. Default 0.95

    Returns
    -------
    np.ndarray
        Array of length 2 of form: [lower_bound, upper_bound]
    """
    check_consistent_length(y_true, y_pred)
    score = accuracy_score(y_true, y_pred)
    # Convert the confidence level to a z-score; general_wilson expects z,
    # not a confidence level.
    z = st.norm.ppf((1 + confidence) / 2)
    bounds = general_wilson(score, len(y_true), z=z)
    return bounds
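

# Illustrative usage sketch (added here, not part of the original module):
# accuracy on a toy prediction set together with its 95% Wilson interval.
def _example_accuracy_wilson():
    y_true = [0, 1, 1, 0, 1, 0, 1, 1]
    y_pred = [0, 1, 0, 0, 1, 1, 1, 1]
    lower, upper = accuracy_wilson(y_true, y_pred)
    print(f"accuracy 95% Wilson CI: [{lower:.3f}, {upper:.3f}]")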


# metric definitions


def false_discovery_rate(y_true, y_pred, **kwargs):
    """Compute the false discovery rate.

    False discovery rate is 1-precision, or ``fp / (tp + fp)`` where ``tp`` is the number of
    true positives and ``fp`` the number of false positives. The false discovery rate is
    intuitively the rate at which the classifier will be wrong when
    labeling an example as positive.

    The best value is 0 and the worst value is 1.

    Parameters
    ----------
    y_true : 1d array-like, or label indicator array / sparse matrix
        Ground truth (correct) target values.
    y_pred : 1d array-like, or label indicator array / sparse matrix
        Estimated targets as returned by a classifier.
    kwargs : key, value mappings
        Other keyword arguments are passed through
        to sklearn.metrics.precision_score
    """
    return 1.0 - sk_metrics.precision_score(y_true, y_pred, **kwargs)
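

# Illustrative check (added here, not part of the original module): the false
# discovery rate equals fp / (tp + fp) computed directly from the confusion matrix.
def _example_false_discovery_rate():
    y_true = [0, 0, 1, 1, 1, 0, 1, 0]
    y_pred = [0, 1, 1, 1, 0, 0, 1, 1]
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    assert np.isclose(false_discovery_rate(y_true, y_pred), fp / (tp + fp))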


def false_omission_rate(y_true, y_pred, **kwargs):
    """Compute the false omission rate.

    False omission rate is ``fn / (tn + fn)`` where ``fn`` is the number of
    false negatives and ``tn`` the number of true negatives. The false omission rate is
    intuitively the rate at which the classifier will be wrong when
    labeling an example as negative.

    The best value is 0 and the worst value is 1.

    Parameters
    ----------
    y_true : 1d array-like, or label indicator array / sparse matrix
        Ground truth (correct) target values.
    y_pred : 1d array-like, or label indicator array / sparse matrix
        Estimated targets as returned by a classifier.
    kwargs : key, value mappings
        Other keyword arguments are accepted but not used; the rate is
        computed directly from the confusion matrix.
    """
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return fn / (fn + tn)
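

# Illustrative check (added here, not part of the original module): the false
# omission rate equals fn / (tn + fn) computed directly from the confusion matrix.
def _example_false_omission_rate():
    y_true = [0, 1, 1, 0, 1, 1, 0, 0]
    y_pred = [0, 1, 0, 0, 0, 1, 1, 0]
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    assert np.isclose(false_omission_rate(y_true, y_pred), fn / (tn + fn))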


def equal_opportunity_difference(
    y_true, y_pred, *, sensitive_features, method="between_groups", sample_weight=None
) -> float:
    """Calculate the equal opportunity difference.

    Equivalent to the `true_positive_rate_difference` defined as the difference between the
    largest and smallest of :math:`P[h(X)=1 | A=a, Y=1]`, across all values :math:`a`
    of the sensitive feature(s).

    Parameters
    ----------
    y_true : array-like
        Ground truth (correct) labels.
    y_pred : array-like
        Predicted labels :math:`h(X)` returned by the classifier.
    sensitive_features :
        The sensitive features over which equal opportunity should be assessed
    method : str
        How to compute the differences. See :func:`fairlearn.metrics.MetricFrame.difference`
        for details.
    sample_weight : array-like
        The sample weights

    Returns
    -------
    float
        The equal opportunity difference
    """
    fun = make_derived_metric(metric=true_positive_rate, transform="difference")
    return fun(
        y_true,
        y_pred,
        sensitive_features=sensitive_features,
        method=method,
        sample_weight=sample_weight,
    )
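

# Illustrative usage sketch (added here, not part of the original module): the
# TPR gap between two sensitive groups in a toy prediction set.
def _example_equal_opportunity_difference():
    y_true = [1, 1, 0, 1, 1, 0, 1, 1]
    y_pred = [1, 0, 0, 1, 1, 0, 0, 0]
    groups = ["a", "a", "a", "a", "b", "b", "b", "b"]
    diff = equal_opportunity_difference(y_true, y_pred, sensitive_features=groups)
    # Group "a" has TPR 2/3 and group "b" has TPR 1/3, so the difference is ~0.333.
    print(f"equal opportunity difference: {diff:.3f}")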


def ks_statistic(y_true, y_pred) -> float:
    """Perform the two-sample Kolmogorov-Smirnov test (two-sided).

    The test compares the underlying continuous distributions F(x) and G(x) of two independent samples.
    The null hypothesis is that the two distributions are identical, F(x)=G(x).
    If the KS statistic is small or the p-value is high,
    then we cannot reject the null hypothesis in favor of the alternative.

    For practical purposes, if the statistic value is higher than the critical value, the two distributions are different.

    Parameters
    ----------
    y_true : array-like
        Ground truth (correct) labels.
    y_pred : array-like
        Predicted labels :math:`h(X)` returned by the classifier.

    Returns
    -------
    float
        KS statistic value
    """
    ks_stat = st.ks_2samp(y_true, y_pred).statistic

    return ks_stat
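

# Illustrative usage sketch (added here, not part of the original module): the
# KS statistic between ground truth labels and predicted scores on toy data.
def _example_ks_statistic():
    y_true = [0, 0, 0, 1, 1, 1, 1, 0]
    y_prob = [0.1, 0.2, 0.4, 0.7, 0.8, 0.9, 0.6, 0.3]
    print(f"KS statistic: {ks_statistic(y_true, y_prob):.3f}")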


def interpolate_increasing_thresholds(lib_thresholds, *series):
    """Step-interpolate the given series onto an evenly spaced, increasing threshold grid.

    Helper for the curve metrics below; `lib_thresholds` is the (increasing)
    threshold array returned by the underlying scikit-learn curve function.
    """
    out = [list() for i in series]
    quantization = 1 / (
        len(lib_thresholds) * (max(lib_thresholds) - min(lib_thresholds))
    )
    interpolated_thresholds = np.arange(
        min(lib_thresholds), max(lib_thresholds), quantization
    )

    for t in interpolated_thresholds:
        # Advance to the next library threshold once the interpolated threshold passes it
        if t >= lib_thresholds[0]:
            lib_thresholds.pop(0)
            for s in series:
                s.pop(0)
        for i, s in enumerate(out):
            s.append(series[i][0])

    return out + [interpolated_thresholds]


def interpolate_decreasing_thresholds(lib_thresholds, *series):
    """Step-interpolate the given series onto an evenly spaced, decreasing threshold grid.

    Counterpart of interpolate_increasing_thresholds for curve functions (e.g.
    roc_curve) whose thresholds are returned in decreasing order.
    """
    out = [list() for i in series]
    quantization = -1 / (
        len(lib_thresholds) * (max(lib_thresholds) - min(lib_thresholds))
    )
    interpolated_thresholds = np.arange(
        max(lib_thresholds), min(lib_thresholds), quantization
    )

    for t in interpolated_thresholds:
        for i, s in enumerate(out):
            s.append(series[i][0])
        # Advance to the next library threshold once the interpolated threshold passes it
        if t <= lib_thresholds[0]:
            lib_thresholds.pop(0)
            for s in series:
                s.pop(0)

    return out + [interpolated_thresholds]


def credo_pr_curve(y_true, y_prob):
    p, r, t = sk_metrics.precision_recall_curve(y_true, y_prob)
    (
        precision,
        recall,
        thresholds,
    ) = interpolate_increasing_thresholds(t.tolist(), p.tolist(), r.tolist())
    return pd.DataFrame(
        {
            "precision": precision,
            "recall": recall,
            "threshold": thresholds,
        }
    )


def credo_roc_curve(y_true, y_prob):
    fpr, tpr, thresh = sk_metrics.roc_curve(y_true, y_prob)
    (
        false_positive_rate,
        true_positive_rate,
        thresholds,
    ) = interpolate_decreasing_thresholds(thresh.tolist(), fpr.tolist(), tpr.tolist())
    return pd.DataFrame(
        {
            "false_positive_rate": false_positive_rate,
            "true_positive_rate": true_positive_rate,
            "threshold": thresholds,
        }
    )


def credo_det_curve(y_true, y_prob):
    fpr, fnr, t = sk_metrics.det_curve(y_true, y_prob)
    (
        false_positive_rate,
        false_negative_rate,
        thresholds,
    ) = interpolate_increasing_thresholds(t.tolist(), fpr.tolist(), fnr.tolist())
    return pd.DataFrame(
        {
            "false_positive_rate": false_positive_rate,
            "false_negative_rate": false_negative_rate,
            "thresholds": thresholds,
        }
    )
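

# Illustrative usage sketch (added here, not part of the original module):
# the curve helpers return tidy DataFrames with one row per interpolated
# threshold. credo_roc_curve follows the same pattern for the ROC curve.
def _example_curve_helpers():
    y_true = [0, 0, 1, 1, 0, 1, 1, 0, 1, 0]
    y_prob = [0.1, 0.4, 0.35, 0.8, 0.2, 0.9, 0.65, 0.3, 0.7, 0.55]
    print(credo_pr_curve(y_true, y_prob).head())
    print(credo_det_curve(y_true, y_prob).head())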


def gini_coefficient_discriminatory(y_true, y_prob):
    """Return the Gini coefficient of a discriminatory model.

    NOTE: There are two popular, yet distinct, metrics known as the 'Gini coefficient'.

    The value calculated by this function provides a summary statistic for the Cumulative Accuracy Profile (CAP) curve.
    This notion of Gini coefficient (or Gini index) is a _discriminatory_ metric. It helps characterize the ordinal
    relationship between predictions made by a model and the ground truth values for each sample.

    This metric has a linear relationship with the area under the receiver operating characteristic curve:
    :math:`G = 2*AUC - 1`

    See https://towardsdatascience.com/using-the-gini-coefficient-to-evaluate-the-performance-of-credit-score-models-59fe13ef420
    for more details.

    Parameters
    ----------
    y_true : array-like
        Ground truth (correct) labels.
    y_prob : array-like
        Predicted probabilities returned by a call to the model's `predict_proba()` function.

    Returns
    -------
    float
        Discriminatory Gini coefficient
    """
    G = (2 * sk_metrics.roc_auc_score(y_true, y_prob)) - 1
    return G
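

# Illustrative check (added here, not part of the original module): the
# discriminatory Gini coefficient equals 2 * AUC - 1 for the same predictions.
def _example_gini_coefficient():
    y_true = [0, 0, 1, 1, 0, 1, 1, 0]
    y_prob = [0.2, 0.4, 0.6, 0.8, 0.3, 0.9, 0.5, 0.1]
    auc = sk_metrics.roc_auc_score(y_true, y_prob)
    assert np.isclose(gini_coefficient_discriminatory(y_true, y_prob), 2 * auc - 1)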


def population_stability_index(
    expected_array,
    actual_array,
    percentage=False,
    buckets: int = 10,
    buckettype: Literal["bins", "quantiles"] = "bins",
):
    """Calculate the PSI for a single variable.

    PSI is a measure of how much a distribution has changed over time or between
    two different samples of a population.
    It does this by bucketing the two distributions and comparing the percents of
    items in each of the buckets. The final result is a single number:

    :math:`PSI = \sum \left( Actual_{\%} - Expected_{\%} \right) \cdot \ln\left( \frac{Actual_{\%}}{Expected_{\%}} \right)`

    The common interpretations of the PSI result are:

    PSI < 0.1: no significant population change
    PSI < 0.25: moderate population change
    PSI >= 0.25: significant population change

    The number of buckets chosen and the bucketing logic affect the final result.

    References
    ----------
    Based on the code in: github.com/mwburke by Matthew Burke.
    For an implementation walkthrough: https://mwburke.github.io/data%20science/2018/04/29/population-stability-index.html
    For a more theoretical reference: https://scholarworks.wmich.edu/cgi/viewcontent.cgi?article=4249&context=dissertations

    Parameters
    ----------
    expected_array: array-like
        Array of expected/initial values
    actual_array: array-like
        Array of new values
    percentage: bool
        When True the arrays are interpreted as already binned/aggregated. This is
        so that the user can perform their own aggregation and pass it directly to
        the metric. Default = False
    buckets: int
        number of percentile ranges to bucket the values into
    buckettype: Literal["bins", "quantiles"]
        type of strategy for creating buckets; "bins" splits into evenly spaced bins,
        "quantiles" splits into quantile buckets

    Returns
    -------
    float
        psi_value: calculated PSI value
    """
    epsilon: float = 0.001

    def scale_range(input, min, max):
        input += -(np.min(input))
        input /= np.max(input) / (max - min)
        input += min
        return input

    if not percentage:
        # Define histogram breakpoints
        breakpoints = np.arange(0, buckets + 1) / (buckets) * 100

        if buckettype == "bins":
            breakpoints = scale_range(
                breakpoints, np.min(expected_array), np.max(expected_array)
            )
        elif buckettype == "quantiles":
            breakpoints = np.stack(
                [np.percentile(expected_array, b) for b in breakpoints]
            )

        # Populate bins and calculate percentages
        expected_percents = np.histogram(expected_array, breakpoints)[0] / len(
            expected_array
        )
        actual_percents = np.histogram(actual_array, breakpoints)[0] / len(actual_array)
    else:
        # Copy into float arrays so the epsilon substitution below does not
        # mutate the caller's data and works for plain Python lists as well
        expected_percents = np.array(expected_array, dtype=float)
        actual_percents = np.array(actual_array, dtype=float)

    # Substitute 0 with an arbitrary epsilon << 1
    # This is to avoid inf in the following calculations
    expected_percents[expected_percents == 0] = epsilon
    actual_percents[actual_percents == 0] = epsilon

    psi_values = (expected_percents - actual_percents) * np.log(
        expected_percents / actual_percents
    )

    return np.sum(psi_values)
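

# Illustrative usage sketch (added here, not part of the original module): PSI
# between a baseline sample and a shifted sample, bucketed into ten even bins.
def _example_population_stability_index():
    rng = np.random.default_rng(0)
    expected = rng.normal(loc=0.0, scale=1.0, size=1000)
    actual = rng.normal(loc=1.0, scale=1.0, size=1000)
    psi = population_stability_index(expected, actual, buckets=10, buckettype="bins")
    # A shift of a full standard deviation is typically flagged as a
    # significant population change (PSI >= 0.25).
    print(f"PSI: {psi:.3f}")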