Coverage for credoai/modules/metrics_credoai.py: 60%

115 statements  


1"""Custom metrics defined by Credo AI""" 

2 

3from typing import Literal 

4 

5import numpy as np 

6import pandas as pd 

7import scipy.stats as st 

8from fairlearn.metrics import make_derived_metric, true_positive_rate 

9from sklearn import metrics as sk_metrics 

10from sklearn.metrics import accuracy_score, confusion_matrix 

11from sklearn.utils import check_consistent_length 

12 

13 

def general_wilson(p, n, z=1.96):
    """Return lower and upper bound using Wilson Interval.

    Parameters
    ----------
    p : float
        Proportion of successes.
    n : int
        Total number of trials.
    z : float
        Z-score, which indicates the number of standard deviations of confidence.

    Returns
    -------
    np.ndarray
        Array of length 2 of form: [lower_bound, upper_bound]
    """
    denominator = 1 + z**2 / n
    centre_adjusted_probability = p + z * z / (2 * n)
    adjusted_standard_deviation = np.sqrt((p * (1 - p) + z * z / (4 * n))) / np.sqrt(n)
    lower_bound = (
        centre_adjusted_probability - z * adjusted_standard_deviation
    ) / denominator
    upper_bound = (
        centre_adjusted_probability + z * adjusted_standard_deviation
    ) / denominator
    return np.array([lower_bound, upper_bound])

def wilson_ci(num_hits, num_total, confidence=0.95):
    """Convenience wrapper for general_wilson"""
    z = st.norm.ppf((1 + confidence) / 2)
    p = num_hits / num_total
    return general_wilson(p, num_total, z=z)
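# Illustrative usage (added for this listing; not part of the original module).
# A sketch of how the Wilson interval helpers above might be called, with
# made-up numbers: 45 successes out of 100 trials at 95% confidence.
#
#   lower, upper = wilson_ci(num_hits=45, num_total=100, confidence=0.95)
#   # `lower` and `upper` bracket the observed proportion 0.45; the interval
#   # narrows as num_total grows and widens as confidence increases.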

def confusion_wilson(y_true, y_pred, metric="true_positive_rate", confidence=0.95):
    """Return Wilson Interval bounds for performance metrics

    Metrics derived from confusion matrix

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground truth labels
    y_pred : array-like of shape (n_samples,)
        Predicted labels
    metric : string
        Indicates the kind of performance metric. Must be one of
        "true_positive_rate", "true_negative_rate", "false_positive_rate",
        or "false_negative_rate".
    confidence : float
        Confidence level for the interval. Default 0.95

    Returns
    -------
    np.ndarray
        Array of length 2 of form: [lower_bound, upper_bound]
    """
    check_consistent_length(y_true, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    negatives = tn + fp
    positives = tp + fn
    if metric == "true_positive_rate":
        numer = tp
        denom = positives
    elif metric == "true_negative_rate":
        numer = tn
        denom = negatives
    elif metric == "false_positive_rate":
        numer = fp
        denom = negatives
    elif metric == "false_negative_rate":
        numer = fn
        denom = positives
    else:
        raise ValueError(
            """
            Metric must be one of the following:
            -true_positive_rate
            -true_negative_rate
            -false_positive_rate
            -false_negative_rate
            """
        )

    bounds = wilson_ci(numer, denom, confidence)
    return bounds
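# Illustrative usage (added for this listing; not part of the original module).
# A sketch with made-up binary labels, bounding the true positive rate:
#
#   y_true = [0, 1, 1, 0, 1, 1, 0, 1]
#   y_pred = [0, 1, 0, 0, 1, 1, 1, 1]
#   lower, upper = confusion_wilson(
#       y_true, y_pred, metric="true_positive_rate", confidence=0.95
#   )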

def accuracy_wilson(y_true, y_pred, confidence=0.95):
    """Return Wilson Interval bounds for accuracy metric.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground truth labels
    y_pred : array-like of shape (n_samples,)
        Predicted labels
    confidence : float
        Confidence level for the interval. Default 0.95

    Returns
    -------
    np.ndarray
        Array of length 2 of form: [lower_bound, upper_bound]
    """
    check_consistent_length(y_true, y_pred)
    score = accuracy_score(y_true, y_pred)
    # Convert the confidence level to a z-score before delegating, since
    # general_wilson's third argument is a z-score rather than a confidence level.
    z = st.norm.ppf((1 + confidence) / 2)
    bounds = general_wilson(score, len(y_true), z=z)
    return bounds


# metric definitions

def false_discovery_rate(y_true, y_pred, **kwargs):
    """Compute the false discovery rate.

    False discovery rate is 1-precision, or ``fp / (tp + fp)`` where ``tp`` is the number of
    true positives and ``fp`` the number of false positives. The false discovery rate is
    intuitively the rate at which the classifier will be wrong when
    labeling an example as positive.

    The best value is 0 and the worst value is 1.

    Parameters
    ----------
    y_true : 1d array-like, or label indicator array / sparse matrix
        Ground truth (correct) target values.

    y_pred : 1d array-like, or label indicator array / sparse matrix
        Estimated targets as returned by a classifier.

    kwargs : key, value mappings
        Other keyword arguments are passed through
        to sklearn.metrics.precision_score
    """
    return 1.0 - sk_metrics.precision_score(y_true, y_pred, **kwargs)

def false_omission_rate(y_true, y_pred, **kwargs):
    """Compute the false omission rate.

    False omission rate is ``fn / (tn + fn)`` where ``fn`` is the number of
    false negatives and ``tn`` the number of true negatives. The false omission rate is
    intuitively the rate at which the classifier will be wrong when
    labeling an example as negative.

    The best value is 0 and the worst value is 1.

    Parameters
    ----------
    y_true : 1d array-like, or label indicator array / sparse matrix
        Ground truth (correct) target values.

    y_pred : 1d array-like, or label indicator array / sparse matrix
        Estimated targets as returned by a classifier.

    kwargs : key, value mappings
        Accepted for interface compatibility; not used by this function.
    """
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return fn / (fn + tn)
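# Illustrative usage (added for this listing; not part of the original module).
# With made-up labels, the two rates summarize errors on the positive and
# negative predictions respectively:
#
#   y_true = [0, 1, 1, 0, 1, 0]
#   y_pred = [1, 1, 0, 0, 1, 0]
#   fdr = false_discovery_rate(y_true, y_pred)  # share of predicted positives that are wrong
#   fom = false_omission_rate(y_true, y_pred)   # share of predicted negatives that are wrong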

def equal_opportunity_difference(
    y_true, y_pred, *, sensitive_features, method="between_groups", sample_weight=None
) -> float:
    """Calculate the equal opportunity difference.

    Equivalent to the `true_positive_rate_difference` defined as the difference between the
    largest and smallest of :math:`P[h(X)=1 | A=a, Y=1]`, across all values :math:`a`
    of the sensitive feature(s).

    Parameters
    ----------
    y_true : array-like
        Ground truth (correct) labels.
    y_pred : array-like
        Predicted labels :math:`h(X)` returned by the classifier.
    sensitive_features :
        The sensitive features over which equal opportunity should be assessed
    method : str
        How to compute the differences. See :func:`fairlearn.metrics.MetricFrame.difference`
        for details.
    sample_weight : array-like
        The sample weights

    Returns
    -------
    float
        The equal opportunity (true positive rate) difference
    """
    fun = make_derived_metric(metric=true_positive_rate, transform="difference")
    return fun(
        y_true,
        y_pred,
        sensitive_features=sensitive_features,
        method=method,
        sample_weight=sample_weight,
    )
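# Illustrative usage (added for this listing; not part of the original module).
# A sketch with made-up labels and a made-up binary sensitive feature:
#
#   y_true    = [1, 0, 1, 1, 0, 1, 1, 0]
#   y_pred    = [1, 0, 0, 1, 1, 1, 0, 0]
#   sensitive = ["a", "a", "a", "a", "b", "b", "b", "b"]
#   gap = equal_opportunity_difference(
#       y_true, y_pred, sensitive_features=sensitive
#   )
#   # `gap` is the spread in true positive rate between the groups in `sensitive`;
#   # 0 means both groups have the same TPR.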

def ks_statistic(y_true, y_pred) -> float:
    """Performs the two-sample Kolmogorov-Smirnov test (two-sided)

    The test compares the underlying continuous distributions F(x) and G(x) of two independent samples.
    The null hypothesis is that the two distributions are identical, F(x)=G(x).
    If the KS statistic is small or the p-value is high,
    then we cannot reject the null hypothesis in favor of the alternative.

    For practical purposes, if the statistic value is higher than the critical value, the two distributions are different.

    Parameters
    ----------
    y_true : array-like
        Ground truth (correct) labels.
    y_pred : array-like
        Predicted labels :math:`h(X)` returned by the classifier.

    Returns
    -------
    float
        KS statistic value
    """
    ks_stat = st.ks_2samp(y_true, y_pred).statistic

    return ks_stat
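# Illustrative usage (added for this listing; not part of the original module).
# A sketch comparing two made-up score samples; under the hood this is
# scipy.stats.ks_2samp:
#
#   sample_a = [0.1, 0.4, 0.35, 0.8, 0.6]
#   sample_b = [0.2, 0.5, 0.45, 0.9, 0.7]
#   stat = ks_statistic(sample_a, sample_b)  # 0 means identical empirical distributions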

def interpolate_increasing_thresholds(lib_thresholds, *series):
    """Resample curve values onto an evenly spaced, increasing threshold grid.

    Used to put the unevenly spaced thresholds returned by scikit-learn curve
    functions onto a regular grid, carrying the associated series
    (e.g. precision and recall) along with them.
    """
    out = [list() for i in series]
    quantization = 1 / (
        len(lib_thresholds) * (max(lib_thresholds) - min(lib_thresholds))
    )
    interpolated_thresholds = np.arange(
        min(lib_thresholds), max(lib_thresholds), quantization
    )

    for t in interpolated_thresholds:
        if t >= lib_thresholds[0]:
            lib_thresholds.pop(0)
            for s in series:
                s.pop(0)
        for i, s in enumerate(out):
            s.append(series[i][0])

    return out + [interpolated_thresholds]


def interpolate_decreasing_thresholds(lib_thresholds, *series):
    """Resample curve values onto an evenly spaced, decreasing threshold grid.

    Mirror image of interpolate_increasing_thresholds, for curves whose
    thresholds decrease (e.g. the ROC curve).
    """
    out = [list() for i in series]
    quantization = -1 / (
        len(lib_thresholds) * (max(lib_thresholds) - min(lib_thresholds))
    )
    interpolated_thresholds = np.arange(
        max(lib_thresholds), min(lib_thresholds), quantization
    )

    for t in interpolated_thresholds:
        for i, s in enumerate(out):
            s.append(series[i][0])
        if t <= lib_thresholds[0]:
            lib_thresholds.pop(0)
            for s in series:
                s.pop(0)

    return out + [interpolated_thresholds]

def credo_pr_curve(y_true, y_prob):
    """Precision-recall curve on an evenly spaced threshold grid, as a DataFrame."""
    p, r, t = sk_metrics.precision_recall_curve(y_true, y_prob)
    (
        precision,
        recall,
        thresholds,
    ) = interpolate_increasing_thresholds(t.tolist(), p.tolist(), r.tolist())
    return pd.DataFrame(
        {
            "precision": precision,
            "recall": recall,
            "threshold": thresholds,
        }
    )


def credo_roc_curve(y_true, y_prob):
    """ROC curve on an evenly spaced threshold grid, as a DataFrame."""
    fpr, tpr, thresh = sk_metrics.roc_curve(y_true, y_prob)
    (
        false_positive_rate,
        true_positive_rate,
        thresholds,
    ) = interpolate_decreasing_thresholds(thresh.tolist(), fpr.tolist(), tpr.tolist())
    return pd.DataFrame(
        {
            "false_positive_rate": false_positive_rate,
            "true_positive_rate": true_positive_rate,
            "threshold": thresholds,
        }
    )


def credo_det_curve(y_true, y_prob):
    """Detection error tradeoff (DET) curve on an evenly spaced threshold grid, as a DataFrame."""
    fpr, fnr, t = sk_metrics.det_curve(y_true, y_prob)
    (
        false_positive_rate,
        false_negative_rate,
        thresholds,
    ) = interpolate_increasing_thresholds(t.tolist(), fpr.tolist(), fnr.tolist())
    return pd.DataFrame(
        {
            "false_positive_rate": false_positive_rate,
            "false_negative_rate": false_negative_rate,
            "thresholds": thresholds,
        }
    )
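# Illustrative usage (added for this listing; not part of the original module).
# Each helper returns a DataFrame with one row per interpolated threshold,
# e.g. for the precision-recall curve (the scores below are made up):
#
#   y_true = [0, 0, 1, 1, 0, 1]
#   y_prob = [0.1, 0.4, 0.35, 0.8, 0.2, 0.7]
#   pr_df = credo_pr_curve(y_true, y_prob)
#   # pr_df has columns "precision", "recall" and "threshold"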

def gini_coefficient_discriminatory(y_true, y_prob):
    """Returns the Gini Coefficient of a discriminatory model

    NOTE: There are two popular, yet distinct metrics known as the 'gini coefficient'.

    The value calculated by this function provides a summary statistic for the Cumulative Accuracy Profile (CAP) curve.
    This notion of Gini coefficient (or Gini index) is a _discriminatory_ metric. It helps characterize the ordinal
    relationship between predictions made by a model and the ground truth values for each sample.

    This metric has a linear relationship with the area under the receiver operating characteristic curve:
    :math:`G = 2*AUC - 1`

    See https://towardsdatascience.com/using-the-gini-coefficient-to-evaluate-the-performance-of-credit-score-models-59fe13ef420
    for more details.

    Parameters
    ----------
    y_true : array-like
        Ground truth (correct) labels.
    y_prob : array-like
        Predicted probabilities returned by a call to the model's `predict_proba()` function.

    Returns
    -------
    float
        Discriminatory Gini Coefficient
    """
    G = (2 * sk_metrics.roc_auc_score(y_true, y_prob)) - 1
    return G
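# Illustrative usage (added for this listing; not part of the original module).
# The coefficient is a rescaling of ROC AUC (G = 2*AUC - 1), so a random ranker
# scores about 0 and a perfect ranker scores 1. With made-up scores:
#
#   y_true = [0, 0, 1, 1]
#   y_prob = [0.1, 0.4, 0.35, 0.8]
#   gini = gini_coefficient_discriminatory(y_true, y_prob)
#   # equivalent to 2 * sk_metrics.roc_auc_score(y_true, y_prob) - 1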

def population_stability_index(
    expected_array,
    actual_array,
    percentage=False,
    buckets: int = 10,
    buckettype: Literal["bins", "quantiles"] = "bins",
):
    r"""Calculate the PSI for a single variable.

    PSI is a measure of how much a distribution has changed over time or between
    two different samples of a population.
    It does this by bucketing the two distributions and comparing the percents of
    items in each of the buckets. The final result is a single number:

    :math:`PSI = \sum \left( Actual_{\%} - Expected_{\%} \right) \cdot \ln\left( \frac{Actual_{\%}}{Expected_{\%}} \right)`

    The common interpretations of the PSI result are:

    PSI < 0.1: no significant population change
    PSI < 0.25: moderate population change
    PSI >= 0.25: significant population change

    The number of buckets chosen and the bucketing logic affect the final result.

    References
    ----------
    Based on the code in: github.com/mwburke by Matthew Burke.
    For an implementation walkthrough: https://mwburke.github.io/data%20science/2018/04/29/population-stability-index.html
    For a more theoretical reference: https://scholarworks.wmich.edu/cgi/viewcontent.cgi?article=4249&context=dissertations

    Parameters
    ----------
    expected_array : array-like
        Array of expected/initial values
    actual_array : array-like
        Array of new values
    percentage : bool
        When True the arrays are interpreted as already binned/aggregated. This is
        so that the user can perform their own aggregation and pass it directly to
        the metric. Default = False
    buckets : int
        Number of percentile ranges to bucket the values into
    buckettype : Literal["bins", "quantiles"]
        Type of strategy for creating buckets: "bins" splits the value range into
        even splits, "quantiles" splits into quantile buckets

    Returns
    -------
    float
        Calculated PSI value
    """
    epsilon: float = 0.001

    def scale_range(input, min, max):
        input += -(np.min(input))
        input /= np.max(input) / (max - min)
        input += min
        return input

    if not percentage:
        # Define histogram breakpoints
        breakpoints = np.arange(0, buckets + 1) / (buckets) * 100

        if buckettype == "bins":
            breakpoints = scale_range(
                breakpoints, np.min(expected_array), np.max(expected_array)
            )
        elif buckettype == "quantiles":
            breakpoints = np.stack(
                [np.percentile(expected_array, b) for b in breakpoints]
            )

        # Populate bins and calculate percentages
        expected_percents = np.histogram(expected_array, breakpoints)[0] / len(
            expected_array
        )
        actual_percents = np.histogram(actual_array, breakpoints)[0] / len(actual_array)
    else:
        expected_percents = expected_array
        actual_percents = actual_array

    # Substitute 0 with an arbitrary epsilon << 1
    # This is to avoid inf in the following calculations
    expected_percents[expected_percents == 0] = epsilon
    actual_percents[actual_percents == 0] = epsilon

    psi_values = (expected_percents - actual_percents) * np.log(
        expected_percents / actual_percents
    )

    return np.sum(psi_values)
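# Illustrative usage (added for this listing; not part of the original module).
# A sketch comparing a reference sample against a shifted sample; the data and
# the shift below are made up:
#
#   rng = np.random.default_rng(0)
#   expected = rng.normal(loc=0.0, scale=1.0, size=1_000)
#   actual = rng.normal(loc=0.3, scale=1.0, size=1_000)
#   psi = population_stability_index(expected, actual, buckets=10, buckettype="quantiles")
#   # psi near 0 means the two samples are distributed similarly; larger values
#   # indicate a shift (common rule of thumb: >= 0.25 is a significant change)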