Coverage for credoai/modules/metrics_credoai.py: 60%

115 statements  


1"""Custom metrics defined by Credo AI""" 

2 

3from typing import Literal 

4 

5import numpy as np 

6import pandas as pd 

7import scipy.stats as st 

8from fairlearn.metrics import make_derived_metric, true_positive_rate 

9from sklearn import metrics as sk_metrics 

10from sklearn.metrics import accuracy_score, confusion_matrix 

11from sklearn.utils import check_consistent_length 

12 

13 

def general_wilson(p, n, z=1.96):
    """Return lower and upper bound using Wilson Interval.

    Parameters
    ----------
    p : float
        Proportion of successes.
    n : int
        Total number of trials.
    z : float
        Z-score, which indicates the number of standard deviations of confidence.

    Returns
    -------
    np.ndarray
        Array of length 2 of form: [lower_bound, upper_bound]
    """
    denominator = 1 + z**2 / n
    centre_adjusted_probability = p + z * z / (2 * n)
    adjusted_standard_deviation = np.sqrt((p * (1 - p) + z * z / (4 * n))) / np.sqrt(n)
    lower_bound = (
        centre_adjusted_probability - z * adjusted_standard_deviation
    ) / denominator
    upper_bound = (
        centre_adjusted_probability + z * adjusted_standard_deviation
    ) / denominator
    return np.array([lower_bound, upper_bound])

def wilson_ci(num_hits, num_total, confidence=0.95):
    """Convenience wrapper for general_wilson"""
    z = st.norm.ppf((1 + confidence) / 2)
    p = num_hits / num_total
    return general_wilson(p, num_total, z=z)
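# Illustrative usage (added for this listing; not part of the original module).
# A sketch of how the Wilson interval helpers above might be called, with
# made-up numbers: 45 successes out of 100 trials at 95% confidence.
#
#   lower, upper = wilson_ci(num_hits=45, num_total=100, confidence=0.95)
#   # `lower` and `upper` bracket the observed proportion 0.45; the interval
#   # narrows as num_total grows and widens as confidence increases.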

def confusion_wilson(y_true, y_pred, metric="true_positive_rate", confidence=0.95):
    """Return Wilson Interval bounds for performance metrics

    Metrics derived from confusion matrix

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground truth labels
    y_pred : array-like of shape (n_samples,)
        Predicted labels
    metric : string
        Indicates the kind of performance metric. Must be one of
        "true_positive_rate", "true_negative_rate", "false_positive_rate",
        or "false_negative_rate".
    confidence : float
        Confidence level for the interval. Default 0.95

    Returns
    -------
    np.ndarray
        Array of length 2 of form: [lower_bound, upper_bound]
    """
    check_consistent_length(y_true, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    negatives = tn + fp
    positives = tp + fn
    if metric == "true_positive_rate":
        numer = tp
        denom = positives
    elif metric == "true_negative_rate":
        numer = tn
        denom = negatives
    elif metric == "false_positive_rate":
        numer = fp
        denom = negatives
    elif metric == "false_negative_rate":
        numer = fn
        denom = positives
    else:
        raise ValueError(
            """
            Metric must be one of the following:
            -true_positive_rate
            -true_negative_rate
            -false_positive_rate
            -false_negative_rate
            """
        )

    bounds = wilson_ci(numer, denom, confidence)
    return bounds
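# Illustrative usage (added for this listing; not part of the original module).
# A sketch with made-up binary labels, bounding the true positive rate:
#
#   y_true = [0, 1, 1, 0, 1, 1, 0, 1]
#   y_pred = [0, 1, 0, 0, 1, 1, 1, 1]
#   lower, upper = confusion_wilson(
#       y_true, y_pred, metric="true_positive_rate", confidence=0.95
#   )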

def accuracy_wilson(y_true, y_pred, confidence=0.95):
    """Return Wilson Interval bounds for accuracy metric.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground truth labels
    y_pred : array-like of shape (n_samples,)
        Predicted labels
    confidence : float
        Confidence level for the interval. Default 0.95

    Returns
    -------
    np.ndarray
        Array of length 2 of form: [lower_bound, upper_bound]
    """
    check_consistent_length(y_true, y_pred)
    score = accuracy_score(y_true, y_pred)
    # Convert the confidence level to a z-score before delegating, since
    # general_wilson's third argument is a z-score rather than a confidence level.
    z = st.norm.ppf((1 + confidence) / 2)
    bounds = general_wilson(score, len(y_true), z=z)
    return bounds


# metric definitions

def false_discovery_rate(y_true, y_pred, **kwargs):
    """Compute the false discovery rate.

    False discovery rate is 1-precision, or ``fp / (tp + fp)`` where ``tp`` is the number of
    true positives and ``fp`` the number of false positives. The false discovery rate is
    intuitively the rate at which the classifier will be wrong when
    labeling an example as positive.

    The best value is 0 and the worst value is 1.

    Parameters
    ----------
    y_true : 1d array-like, or label indicator array / sparse matrix
        Ground truth (correct) target values.

    y_pred : 1d array-like, or label indicator array / sparse matrix
        Estimated targets as returned by a classifier.

    kwargs : key, value mappings
        Other keyword arguments are passed through
        to sklearn.metrics.precision_score
    """
    return 1.0 - sk_metrics.precision_score(y_true, y_pred, **kwargs)

def false_omission_rate(y_true, y_pred, **kwargs):
    """Compute the false omission rate.

    False omission rate is ``fn / (tn + fn)`` where ``fn`` is the number of
    false negatives and ``tn`` the number of true negatives. The false omission rate is
    intuitively the rate at which the classifier will be wrong when
    labeling an example as negative.

    The best value is 0 and the worst value is 1.

    Parameters
    ----------
    y_true : 1d array-like, or label indicator array / sparse matrix
        Ground truth (correct) target values.

    y_pred : 1d array-like, or label indicator array / sparse matrix
        Estimated targets as returned by a classifier.

    kwargs : key, value mappings
        Accepted for interface compatibility; not used by this function.
    """
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return fn / (fn + tn)
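# Illustrative usage (added for this listing; not part of the original module).
# With made-up labels, the two rates summarize errors on the positive and
# negative predictions respectively:
#
#   y_true = [0, 1, 1, 0, 1, 0]
#   y_pred = [1, 1, 0, 0, 1, 0]
#   fdr = false_discovery_rate(y_true, y_pred)  # share of predicted positives that are wrong
#   fom = false_omission_rate(y_true, y_pred)   # share of predicted negatives that are wrong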

def equal_opportunity_difference(
    y_true, y_pred, *, sensitive_features, method="between_groups", sample_weight=None
) -> float:
    """Calculate the equal opportunity difference.

    Equivalent to the `true_positive_rate_difference` defined as the difference between the
    largest and smallest of :math:`P[h(X)=1 | A=a, Y=1]`, across all values :math:`a`
    of the sensitive feature(s).

    Parameters
    ----------
    y_true : array-like
        Ground truth (correct) labels.
    y_pred : array-like
        Predicted labels :math:`h(X)` returned by the classifier.
    sensitive_features :
        The sensitive features over which equal opportunity should be assessed
    method : str
        How to compute the differences. See :func:`fairlearn.metrics.MetricFrame.difference`
        for details.
    sample_weight : array-like
        The sample weights

    Returns
    -------
    float
        The equal opportunity (true positive rate) difference
    """
    fun = make_derived_metric(metric=true_positive_rate, transform="difference")
    return fun(
        y_true,
        y_pred,
        sensitive_features=sensitive_features,
        method=method,
        sample_weight=sample_weight,
    )
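# Illustrative usage (added for this listing; not part of the original module).
# A sketch with made-up labels and a made-up binary sensitive feature:
#
#   y_true    = [1, 0, 1, 1, 0, 1, 1, 0]
#   y_pred    = [1, 0, 0, 1, 1, 1, 0, 0]
#   sensitive = ["a", "a", "a", "a", "b", "b", "b", "b"]
#   gap = equal_opportunity_difference(
#       y_true, y_pred, sensitive_features=sensitive
#   )
#   # `gap` is the spread in true positive rate between the groups in `sensitive`;
#   # 0 means both groups have the same TPR.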

def ks_statistic(y_true, y_pred) -> float:
    """Performs the two-sample Kolmogorov-Smirnov test (two-sided)

    The test compares the underlying continuous distributions F(x) and G(x) of two independent samples.
    The null hypothesis is that the two distributions are identical, F(x)=G(x).
    If the KS statistic is small or the p-value is high,
    then we cannot reject the null hypothesis in favor of the alternative.

    For practical purposes, if the statistic value is higher than the critical value, the two distributions are different.

    Parameters
    ----------
    y_true : array-like
        Ground truth (correct) labels.
    y_pred : array-like
        Predicted labels :math:`h(X)` returned by the classifier.

    Returns
    -------
    float
        KS statistic value
    """
    ks_stat = st.ks_2samp(y_true, y_pred).statistic

    return ks_stat
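# Illustrative usage (added for this listing; not part of the original module).
# A sketch comparing two made-up score samples; under the hood this is
# scipy.stats.ks_2samp:
#
#   sample_a = [0.1, 0.4, 0.35, 0.8, 0.6]
#   sample_b = [0.2, 0.5, 0.45, 0.9, 0.7]
#   stat = ks_statistic(sample_a, sample_b)  # 0 means identical empirical distributions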

def interpolate_increasing_thresholds(lib_thresholds, *series):
    """Resample curve values onto an evenly spaced, increasing threshold grid.

    Used to put the unevenly spaced thresholds returned by scikit-learn curve
    functions onto a regular grid, carrying the associated series
    (e.g. precision and recall) along with them.
    """
    out = [list() for i in series]
    quantization = 1 / (
        len(lib_thresholds) * (max(lib_thresholds) - min(lib_thresholds))
    )
    interpolated_thresholds = np.arange(
        min(lib_thresholds), max(lib_thresholds), quantization
    )

    for t in interpolated_thresholds:
        if t >= lib_thresholds[0]:
            lib_thresholds.pop(0)
            for s in series:
                s.pop(0)
        for i, s in enumerate(out):
            s.append(series[i][0])

    return out + [interpolated_thresholds]


def interpolate_decreasing_thresholds(lib_thresholds, *series):
    """Resample curve values onto an evenly spaced, decreasing threshold grid.

    Mirror image of interpolate_increasing_thresholds, for curves whose
    thresholds decrease (e.g. the ROC curve).
    """
    out = [list() for i in series]
    quantization = -1 / (
        len(lib_thresholds) * (max(lib_thresholds) - min(lib_thresholds))
    )
    interpolated_thresholds = np.arange(
        max(lib_thresholds), min(lib_thresholds), quantization
    )

    for t in interpolated_thresholds:
        for i, s in enumerate(out):
            s.append(series[i][0])
        if t <= lib_thresholds[0]:
            lib_thresholds.pop(0)
            for s in series:
                s.pop(0)

    return out + [interpolated_thresholds]

def credo_pr_curve(y_true, y_prob):
    """Precision-recall curve on an evenly spaced threshold grid, as a DataFrame."""
    p, r, t = sk_metrics.precision_recall_curve(y_true, y_prob)
    (
        precision,
        recall,
        thresholds,
    ) = interpolate_increasing_thresholds(t.tolist(), p.tolist(), r.tolist())
    return pd.DataFrame(
        {
            "precision": precision,
            "recall": recall,
            "threshold": thresholds,
        }
    )


def credo_roc_curve(y_true, y_prob):
    """ROC curve on an evenly spaced threshold grid, as a DataFrame."""
    fpr, tpr, thresh = sk_metrics.roc_curve(y_true, y_prob)
    (
        false_positive_rate,
        true_positive_rate,
        thresholds,
    ) = interpolate_decreasing_thresholds(thresh.tolist(), fpr.tolist(), tpr.tolist())
    return pd.DataFrame(
        {
            "false_positive_rate": false_positive_rate,
            "true_positive_rate": true_positive_rate,
            "threshold": thresholds,
        }
    )


def credo_det_curve(y_true, y_prob):
    """Detection error tradeoff (DET) curve on an evenly spaced threshold grid, as a DataFrame."""
    fpr, fnr, t = sk_metrics.det_curve(y_true, y_prob)
    (
        false_positive_rate,
        false_negative_rate,
        thresholds,
    ) = interpolate_increasing_thresholds(t.tolist(), fpr.tolist(), fnr.tolist())
    return pd.DataFrame(
        {
            "false_positive_rate": false_positive_rate,
            "false_negative_rate": false_negative_rate,
            "thresholds": thresholds,
        }
    )
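# Illustrative usage (added for this listing; not part of the original module).
# Each helper returns a DataFrame with one row per interpolated threshold,
# e.g. for the precision-recall curve (the scores below are made up):
#
#   y_true = [0, 0, 1, 1, 0, 1]
#   y_prob = [0.1, 0.4, 0.35, 0.8, 0.2, 0.7]
#   pr_df = credo_pr_curve(y_true, y_prob)
#   # pr_df has columns "precision", "recall" and "threshold"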

def gini_coefficient_discriminatory(y_true, y_prob):
    """Returns the Gini Coefficient of a discriminatory model

    NOTE: There are two popular, yet distinct metrics known as the 'gini coefficient'.

    The value calculated by this function provides a summary statistic for the Cumulative Accuracy Profile (CAP) curve.
    This notion of Gini coefficient (or Gini index) is a _discriminatory_ metric. It helps characterize the ordinal
    relationship between predictions made by a model and the ground truth values for each sample.

    This metric has a linear relationship with the area under the receiver operating characteristic curve:
    :math:`G = 2*AUC - 1`

    See https://towardsdatascience.com/using-the-gini-coefficient-to-evaluate-the-performance-of-credit-score-models-59fe13ef420
    for more details.

    Parameters
    ----------
    y_true : array-like
        Ground truth (correct) labels.
    y_prob : array-like
        Predicted probabilities returned by a call to the model's `predict_proba()` function.

    Returns
    -------
    float
        Discriminatory Gini Coefficient
    """
    G = (2 * sk_metrics.roc_auc_score(y_true, y_prob)) - 1
    return G
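# Illustrative usage (added for this listing; not part of the original module).
# The coefficient is a rescaling of ROC AUC (G = 2*AUC - 1), so a random ranker
# scores about 0 and a perfect ranker scores 1. With made-up scores:
#
#   y_true = [0, 0, 1, 1]
#   y_prob = [0.1, 0.4, 0.35, 0.8]
#   gini = gini_coefficient_discriminatory(y_true, y_prob)
#   # equivalent to 2 * sk_metrics.roc_auc_score(y_true, y_prob) - 1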

def population_stability_index(
    expected_array,
    actual_array,
    percentage=False,
    buckets: int = 10,
    buckettype: Literal["bins", "quantiles"] = "bins",
):
    r"""Calculate the PSI for a single variable.

    PSI is a measure of how much a distribution has changed over time or between
    two different samples of a population.
    It does this by bucketing the two distributions and comparing the percents of
    items in each of the buckets. The final result is a single number:

    :math:`PSI = \sum \left( Actual_{\%} - Expected_{\%} \right) \cdot \ln\left( \frac{Actual_{\%}}{Expected_{\%}} \right)`

    The common interpretations of the PSI result are:

    PSI < 0.1: no significant population change
    PSI < 0.25: moderate population change
    PSI >= 0.25: significant population change

    The number of buckets chosen and the bucketing logic affect the final result.

    References
    ----------
    Based on the code in: github.com/mwburke by Matthew Burke.
    For an implementation walkthrough: https://mwburke.github.io/data%20science/2018/04/29/population-stability-index.html
    For a more theoretical reference: https://scholarworks.wmich.edu/cgi/viewcontent.cgi?article=4249&context=dissertations

    Parameters
    ----------
    expected_array : array-like
        Array of expected/initial values
    actual_array : array-like
        Array of new values
    percentage : bool
        When True the arrays are interpreted as already binned/aggregated. This is
        so that the user can perform their own aggregation and pass it directly to
        the metric. Default = False
    buckets : int
        Number of percentile ranges to bucket the values into
    buckettype : Literal["bins", "quantiles"]
        Type of strategy for creating buckets: "bins" splits the value range into
        even splits, "quantiles" splits into quantile buckets

    Returns
    -------
    float
        Calculated PSI value
    """
    epsilon: float = 0.001

    def scale_range(input, min, max):
        input += -(np.min(input))
        input /= np.max(input) / (max - min)
        input += min
        return input

    if not percentage:
        # Define histogram breakpoints
        breakpoints = np.arange(0, buckets + 1) / (buckets) * 100

        if buckettype == "bins":
            breakpoints = scale_range(
                breakpoints, np.min(expected_array), np.max(expected_array)
            )
        elif buckettype == "quantiles":
            breakpoints = np.stack(
                [np.percentile(expected_array, b) for b in breakpoints]
            )

        # Populate bins and calculate percentages
        expected_percents = np.histogram(expected_array, breakpoints)[0] / len(
            expected_array
        )
        actual_percents = np.histogram(actual_array, breakpoints)[0] / len(actual_array)
    else:
        expected_percents = expected_array
        actual_percents = actual_array

    # Substitute 0 with an arbitrary epsilon << 1
    # This is to avoid inf in the following calculations
    expected_percents[expected_percents == 0] = epsilon
    actual_percents[actual_percents == 0] = epsilon

    psi_values = (expected_percents - actual_percents) * np.log(
        expected_percents / actual_percents
    )

    return np.sum(psi_values)
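# Illustrative usage (added for this listing; not part of the original module).
# A sketch comparing a reference sample against a shifted sample; the data and
# the shift below are made up:
#
#   rng = np.random.default_rng(0)
#   expected = rng.normal(loc=0.0, scale=1.0, size=1_000)
#   actual = rng.normal(loc=0.3, scale=1.0, size=1_000)
#   psi = population_stability_index(expected, actual, buckets=10, buckettype="quantiles")
#   # psi near 0 means the two samples are distributed similarly; larger values
#   # indicate a shift (common rule of thumb: >= 0.25 is a significant change)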