Coverage for credoai/evaluators/performance.py: 88%

117 statements  

coverage.py v6.5.0, created at 2022-12-08 07:32 +0000

import pandas as pd
from connect.evidence import MetricContainer, TableContainer
from sklearn.metrics import confusion_matrix

from credoai.artifacts import ClassificationModel, TabularData
from credoai.evaluators import Evaluator
from credoai.evaluators.utils.fairlearn import setup_metric_frames
from credoai.evaluators.utils.validation import (
    check_artifact_for_nulls,
    check_data_instance,
    check_existence,
)
from credoai.modules.constants_metrics import (
    MODEL_METRIC_CATEGORIES,
    THRESHOLD_METRIC_CATEGORIES,
)
from credoai.modules.metrics import Metric, find_metrics
from credoai.utils.common import ValidationError

class Performance(Evaluator):
    """
    Performance evaluator for Credo AI.

    This evaluator calculates overall performance metrics.
    It handles any metric that can be calculated on a set of ground-truth labels and
    predictions, e.g., binary classification, multi-class classification, regression.

    This module takes in a set of metrics and provides functionality to:

    - calculate the metrics
    - create disaggregated metrics

    A brief usage sketch follows the class definition below.

    Parameters
    ----------
    metrics : List-like
        List of metric names as strings or list of Metric objects (credoai.modules.metrics.Metric).
        Metric strings should be in the list returned by credoai.modules.metric_utils.list_metrics().
        Note: for performance parity metrics such as "false negative rate parity", list only
        "false negative rate". Parity metrics are calculated automatically when the
        corresponding performance metric is supplied.

    Attributes
    ----------
    y_true : (List, pandas.Series, numpy.ndarray)
        The ground-truth labels (for classification) or target values (for regression),
        taken from the assessment data during setup.
    y_pred : (List, pandas.Series, numpy.ndarray)
        The predicted labels for classification, produced by the model during setup.
    y_prob : (List, pandas.Series, numpy.ndarray), optional
        The unthresholded predictions, confidence values, or probabilities, when the
        model supports them.
    """

    required_artifacts = {"model", "assessment_data"}

    def __init__(self, metrics=None):
        super().__init__()
        # assign variables
        self.metrics = metrics
        self.metric_frames = {}
        self.performance_metrics = None
        self.prob_metrics = None
        self.failed_metrics = None

    def _validate_arguments(self):
        check_existence(self.metrics, "metrics")
        check_data_instance(self.assessment_data, TabularData)
        check_artifact_for_nulls(self.assessment_data, "Data")

    def _setup(self):
        # data variables
        self.y_true = self.assessment_data.y
        self.y_pred = self.model.predict(self.assessment_data.X)
        try:
            self.y_prob = self.model.predict_proba(self.assessment_data.X)
        except Exception:
            # not every wrapped model exposes predict_proba; fall back to None
            self.y_prob = None
        self.update_metrics(self.metrics)
        return self

    def evaluate(self):
        """
        Run the performance evaluation and collect the resulting evidence.
        """
        results = []
        overall_metrics = self.get_overall_metrics()
        threshold_metrics = self.get_overall_threshold_metrics()

        if overall_metrics is not None:
            results.append(overall_metrics)
        if threshold_metrics is not None:
            results += threshold_metrics

        if isinstance(self.model, ClassificationModel):
            results.append(self._create_confusion_container())
        self.results = results
        return self

    def update_metrics(self, metrics, replace=True):
        """Replace or extend the metrics and rebuild the metric frames.

        Parameters
        ----------
        metrics : List-like
            List of metric names as strings or list of Metric objects (credoai.modules.metrics.Metric).
            Metric strings should be in the list returned by credoai.modules.metric_utils.list_metrics().
            Note: for performance parity metrics such as "false negative rate parity", list only
            "false negative rate". Parity metrics are calculated automatically when the
            corresponding performance metric is supplied.
        replace : bool, optional
            If True (default), replace the current metrics; otherwise extend them.
        """
        if replace:
            self.metrics = metrics
        else:
            self.metrics += metrics
        (
            self.performance_metrics,
            self.prob_metrics,
            self.threshold_metrics,
            self.failed_metrics,
        ) = self._process_metrics(self.metrics)

        # a constant dummy sensitive feature lets the fairlearn metric frames
        # compute plain overall metrics without any real grouping
        dummy_sensitive = pd.Series(["NA"] * len(self.y_true), name="NA")
        self.metric_frames = setup_metric_frames(
            self.performance_metrics,
            self.prob_metrics,
            self.threshold_metrics,
            self.y_pred,
            self.y_prob,
            self.y_true,
            dummy_sensitive,
        )

    def get_df(self):
        """Return a dataframe of the input arrays.

        Returns
        -------
        pandas.DataFrame
            Dataframe containing the true labels, predictions, and, when available,
            one column per class of predicted probabilities.
        """
        df = pd.DataFrame({"true": self.y_true, "pred": self.y_pred})
        if self.y_prob is not None:
            y_prob_df = pd.DataFrame(self.y_prob)
            y_prob_df.columns = [f"y_prob_{i}" for i in range(y_prob_df.shape[1])]
            df = pd.concat([df, y_prob_df], axis=1)

        return df

    def get_overall_metrics(self):
        """Return overall scalar performance metrics.

        Returns
        -------
        MetricContainer or None
            The overall performance metrics, or None if no non-threshold metrics
            were computed.
        """
        # the dummy sensitive feature has a single group, so .overall is simply
        # each metric computed over the full dataset
        overall_metrics = [
            metric_frame.overall
            for name, metric_frame in self.metric_frames.items()
            if name != "thresh"
        ]
        if not overall_metrics:
            return

        output_series = (
            pd.concat(overall_metrics, axis=0).rename(index="value").to_frame()
        )
        output_series = output_series.reset_index().rename({"index": "type"}, axis=1)

        return MetricContainer(output_series, **self.get_container_info())

    def get_overall_threshold_metrics(self):
        """Return overall threshold-dependent performance metrics.

        Returns
        -------
        list of TableContainer or None
            One table per threshold-dependent metric, or None if no threshold
            metrics were requested.
        """
        if not (self.threshold_metrics and "thresh" in self.metric_frames):
            return

        threshold_results = (
            pd.concat([self.metric_frames["thresh"].overall], axis=0)
            .rename(index="value")
            .to_frame()
        )
        threshold_results = threshold_results.reset_index().rename(
            {"index": "threshold_metric"}, axis=1
        )
        threshold_results.name = "threshold_metric_performance"

        results = []
        for _, threshold_metric in threshold_results.iterrows():
            metric = threshold_metric.threshold_metric
            threshold_metric.value.name = "threshold_dependent_performance"
            results.append(
                TableContainer(
                    threshold_metric.value,
                    **self.get_container_info({"metric_type": metric}),
                )
            )

        return results

    def _process_metrics(self, metrics):
        """Sort the requested metrics into categories.

        Parameters
        ----------
        metrics : Union[List[Metric], List[str]]
            List of metrics to use. These can be Metric objects
            (see credoai.modules.metrics.py) or strings.
            If strings, they will be converted to Metric objects
            as appropriate, using find_metrics().

        Returns
        -------
        tuple
            Dictionaries of performance, probability, and threshold metrics,
            plus a list of metric names that could not be used.
        """
        # separate metrics
        failed_metrics = []
        performance_metrics = {}
        prob_metrics = {}
        threshold_metrics = {}
        for metric in metrics:
            if isinstance(metric, str):
                metric_name = metric
                metric = find_metrics(metric, MODEL_METRIC_CATEGORIES)
                if len(metric) == 1:
                    metric = metric[0]
                elif len(metric) == 0:
                    raise Exception(
                        f"Returned no metrics when searching using the provided metric name <{metric_name}>. Expected to find one matching metric."
                    )
                else:
                    raise Exception(
                        f"Returned multiple metrics when searching using the provided metric name <{metric_name}>. Expected to find only one matching metric."
                    )
            else:
                metric_name = metric.name
            if not isinstance(metric, Metric):
                raise ValidationError(
                    "Specified metric is not of type credoai.metric.Metric"
                )
            if metric.metric_category == "FAIRNESS":
                self.logger.info(
                    f"fairness metric, {metric_name}, unused by PerformanceModule"
                )
            elif metric.metric_category in MODEL_METRIC_CATEGORIES:
                if metric.takes_prob:
                    if metric.metric_category in THRESHOLD_METRIC_CATEGORIES:
                        threshold_metrics[metric_name] = metric
                    else:
                        prob_metrics[metric_name] = metric
                else:
                    performance_metrics[metric_name] = metric
            else:
                self.logger.warning(
                    f"{metric_name} failed to be used by the Performance evaluator"
                )
                failed_metrics.append(metric_name)

        return (performance_metrics, prob_metrics, threshold_metrics, failed_metrics)

    def _create_confusion_container(self):
        confusion_container = TableContainer(
            create_confusion_matrix(self.y_true, self.y_pred),
            **self.get_container_info(),
        )
        return confusion_container
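# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module). In normal use
# the Lens pipeline injects the required artifacts and evidence metadata; the
# helper below mimics that wiring directly so the evaluator can be exercised
# on its own. It assumes the ClassificationModel and TabularData constructors
# accept the keyword arguments shown and that the supplied metric names appear
# in list_metrics(); additional framework bookkeeping may be required for the
# evidence containers.
def _example_performance_evaluation(fitted_classifier, X_test, y_test, metric_names):
    """Sketch: run the Performance evaluator outside of a Lens pipeline.

    `fitted_classifier`, `X_test`, `y_test`, and `metric_names` are
    placeholders supplied by the caller; they are not defined in this module.
    """
    model = ClassificationModel(name="demo_model", model_like=fitted_classifier)
    data = TabularData(name="demo_data", X=X_test, y=y_test)

    evaluator = Performance(metrics=metric_names)
    # mimic the artifact injection normally handled by the framework
    evaluator.model = model
    evaluator.assessment_data = data
    evaluator._validate_arguments()
    evaluator._setup()
    evaluator.evaluate()

    # evaluator.results holds a MetricContainer of overall metrics, optional
    # TableContainers of threshold-dependent metrics, and a confusion matrix
    # table for classification models
    return evaluator.results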

############################################
## Evaluation helper functions
##
## Helper functions create evidence that
## .evaluate wraps in evidence containers
############################################

def create_confusion_matrix(y_true, y_pred):
    """Create a row-normalized confusion matrix as a long-format dataframe.

    Parameters
    ----------
    y_true : pd.Series of shape (n_samples,)
        Ground truth (correct) target values.

    y_pred : array-like of shape (n_samples,)
        Estimated targets as returned by a classifier.

    Returns
    -------
    pandas.DataFrame
        Long-format dataframe with columns "true_label", "predicted_label",
        and "value" (row-normalized counts). A small illustrative example
        follows this function.
    """
    labels = y_true.astype("category").cat.categories
    confusion = confusion_matrix(y_true, y_pred, normalize="true", labels=labels)
    confusion_df = pd.DataFrame(confusion, index=labels.copy(), columns=labels)
    confusion_df.index.name = "true_label"
    confusion_df = confusion_df.reset_index().melt(
        id_vars=["true_label"], var_name="predicted_label"
    )
    confusion_df.name = "Confusion Matrix"
    return confusion_df
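# ---------------------------------------------------------------------------
# Illustrative example (not part of the original module) of the long-format
# dataframe produced by create_confusion_matrix, which is what
# _create_confusion_container wraps in a TableContainer above.
def _example_confusion_matrix():
    """Sketch: build a row-normalized confusion matrix for a toy problem."""
    y_true = pd.Series(["cat", "cat", "dog", "dog"])
    y_pred = ["cat", "dog", "dog", "dog"]
    table = create_confusion_matrix(y_true, y_pred)
    # `table` has one row per (true_label, predicted_label) pair and a
    # row-normalized `value` column, e.g. value == 0.5 for
    # (true_label="cat", predicted_label="dog") because half of the true
    # "cat" samples were predicted as "dog"
    return table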