Coverage for credoai/evaluators/performance.py: 88%
117 statements
coverage.py v6.5.0, created at 2022-12-08 07:32 +0000
import pandas as pd
from connect.evidence import MetricContainer, TableContainer
from sklearn.metrics import confusion_matrix

from credoai.artifacts import ClassificationModel, TabularData
from credoai.evaluators import Evaluator
from credoai.evaluators.utils.fairlearn import setup_metric_frames
from credoai.evaluators.utils.validation import (
    check_artifact_for_nulls,
    check_data_instance,
    check_existence,
)
from credoai.modules.constants_metrics import (
    MODEL_METRIC_CATEGORIES,
    THRESHOLD_METRIC_CATEGORIES,
)
from credoai.modules.metrics import Metric, find_metrics
from credoai.utils.common import ValidationError


class Performance(Evaluator):
    """
    Performance evaluator for Credo AI.

    This evaluator calculates overall performance metrics. It handles any metric
    that can be calculated on a set of ground-truth labels and predictions,
    e.g., binary classification, multi-class classification, regression.

    This module takes in a set of metrics and provides functionality to:

    - calculate the metrics
    - create disaggregated metrics

    Parameters
    ----------
    metrics : List-like
        List of metric names as strings or list of Metric objects
        (credoai.modules.metrics.Metric). Metric strings should be in the list
        returned by credoai.modules.metric_utils.list_metrics(). Note: for
        performance parity metrics like "false negative rate parity", just list
        "false negative rate"; parity metrics are calculated automatically if the
        performance metric is supplied.
    y_true : (List, pandas.Series, numpy.ndarray)
        The ground-truth labels (for classification) or target values (for regression).
    y_pred : (List, pandas.Series, numpy.ndarray)
        The predicted labels for classification.
    y_prob : (List, pandas.Series, numpy.ndarray), optional
        The unthresholded predictions, confidence values, or probabilities.
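
    Example
    -------
    A minimal usage sketch. In practice this evaluator is typically run inside a
    Lens pipeline, which supplies the required artifacts; here it is driven
    directly for illustration. ``credo_model`` (a ClassificationModel) and
    ``credo_data`` (a TabularData assessment set) are assumed to exist and are
    not defined in this module::

        evaluator = Performance(metrics=["accuracy_score"])
        evaluator.model = credo_model
        evaluator.assessment_data = credo_data
        evaluator._validate_arguments()
        evaluator._setup()
        results = evaluator.evaluate().results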
48 """
50 required_artifacts = {"model", "assessment_data"}

    def __init__(self, metrics=None):
        super().__init__()
        # assign variables
        self.metrics = metrics
        self.metric_frames = {}
        self.performance_metrics = None
        self.prob_metrics = None
        self.failed_metrics = None

    def _validate_arguments(self):
        check_existence(self.metrics, "metrics")
        check_data_instance(self.assessment_data, TabularData)
        check_artifact_for_nulls(self.assessment_data, "Data")

    def _setup(self):
        # data variables
        self.y_true = self.assessment_data.y
        self.y_pred = self.model.predict(self.assessment_data.X)
        try:
            self.y_prob = self.model.predict_proba(self.assessment_data.X)
        except Exception:
            # model does not expose probability predictions
            self.y_prob = None
        self.update_metrics(self.metrics)
        return self

    def evaluate(self):
        """
        Run the performance evaluation and store the results.
        """
        results = []
        overall_metrics = self.get_overall_metrics()
        threshold_metrics = self.get_overall_threshold_metrics()

        if overall_metrics is not None:
            results.append(overall_metrics)
        if threshold_metrics is not None:
            results += threshold_metrics

        if isinstance(self.model, ClassificationModel):
            results.append(self._create_confusion_container())
        self.results = results
        return self

    def update_metrics(self, metrics, replace=True):
        """Replace or extend the evaluator's metrics and rebuild the metric frames.

        Parameters
        ----------
        metrics : List-like
            List of metric names as strings or list of Metric objects
            (credoai.modules.metrics.Metric). Metric strings should be in the list
            returned by credoai.modules.list_metrics(). Note: for performance parity
            metrics like "false negative rate parity", just list "false negative
            rate"; parity metrics are calculated automatically if the performance
            metric is supplied.
        replace : bool, optional
            If True, replace the current metrics; if False, append to them.
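
        Example
        -------
        Hypothetical call on an already set-up evaluator, appending an extra
        metric rather than replacing the current list::

            evaluator.update_metrics(["precision_score"], replace=False)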
106 """
107 if replace:
108 self.metrics = metrics
109 else:
110 self.metrics += metrics
111 (
112 self.performance_metrics,
113 self.prob_metrics,
114 self.threshold_metrics,
115 self.failed_metrics,
116 ) = self._process_metrics(self.metrics)
118 dummy_sensitive = pd.Series(["NA"] * len(self.y_true), name="NA")
119 self.metric_frames = setup_metric_frames(
120 self.performance_metrics,
121 self.prob_metrics,
122 self.threshold_metrics,
123 self.y_pred,
124 self.y_prob,
125 self.y_true,
126 dummy_sensitive,
127 )

    def get_df(self):
        """Return dataframe of input arrays.

        Returns
        -------
        pandas.DataFrame
            Dataframe containing the input arrays
        """
        df = pd.DataFrame({"true": self.y_true, "pred": self.y_pred})
        if self.y_prob is not None:
            y_prob_df = pd.DataFrame(self.y_prob)
            y_prob_df.columns = [f"y_prob_{i}" for i in range(y_prob_df.shape[1])]
            df = pd.concat([df, y_prob_df], axis=1)

        return df

    def get_overall_metrics(self):
        """Return overall scalar performance metrics.

        Returns
        -------
        MetricContainer
            The overall performance metrics, or None if no applicable metric
            frames were created.
        """
        # collect overall values from the metric frames; threshold-varying metrics
        # are handled separately by get_overall_threshold_metrics
        overall_metrics = [
            metric_frame.overall
            for name, metric_frame in self.metric_frames.items()
            if name != "thresh"
        ]
        if not overall_metrics:
            return

        output_series = (
            pd.concat(overall_metrics, axis=0).rename(index="value").to_frame()
        )
        output_series = output_series.reset_index().rename({"index": "type"}, axis=1)

        return MetricContainer(output_series, **self.get_container_info())

    def get_overall_threshold_metrics(self):
        """Return overall threshold-varying performance metrics.

        Returns
        -------
        list of TableContainer
            One container per threshold-varying metric, or None if no threshold
            metrics were requested.
        """
        if not (self.threshold_metrics and "thresh" in self.metric_frames):
            return

        threshold_results = (
            pd.concat([self.metric_frames["thresh"].overall], axis=0)
            .rename(index="value")
            .to_frame()
        )
        threshold_results = threshold_results.reset_index().rename(
            {"index": "threshold_metric"}, axis=1
        )
        threshold_results.name = "threshold_metric_performance"

        results = []
        for _, threshold_metric in threshold_results.iterrows():
            metric = threshold_metric.threshold_metric
            threshold_metric.value.name = "threshold_dependent_performance"
            results.append(
                TableContainer(
                    threshold_metric.value,
                    **self.get_container_info({"metric_type": metric}),
                )
            )

        return results

    def _process_metrics(self, metrics):
        """Separate metrics into performance, probability, threshold, and failed groups.

        Parameters
        ----------
        metrics : Union[List[Metric, str]]
            List of metrics to use. These can be Metric objects
            (see credoai.modules.metrics.py) or strings. If strings, they will be
            converted to Metric objects as appropriate, using find_metrics().

        Returns
        -------
        tuple
            (performance_metrics, prob_metrics, threshold_metrics, failed_metrics):
            three dictionaries mapping metric names to Metric objects, plus a list
            of metric names that could not be used.
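
        Example
        -------
        Illustrative sketch of the expected split; the metric names are examples
        and depend on the metrics actually registered in credoai::

            perf, prob, thresh, failed = self._process_metrics(
                ["accuracy_score", "roc_auc_score", "roc_curve"]
            )
            # perf   -> label-based metrics, e.g. {"accuracy_score": Metric(...)}
            # prob   -> probability-based metrics, e.g. {"roc_auc_score": Metric(...)}
            # thresh -> threshold-varying metrics, e.g. {"roc_curve": Metric(...)}
            # failed -> names that could not be categorized, here []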
218 """
219 # separate metrics
220 failed_metrics = []
221 performance_metrics = {}
222 prob_metrics = {}
223 threshold_metrics = {}
224 for metric in metrics:
225 if isinstance(metric, str):
226 metric_name = metric
227 metric = find_metrics(metric, MODEL_METRIC_CATEGORIES)
228 if len(metric) == 1:
229 metric = metric[0]
230 elif len(metric) == 0:
231 raise Exception(
232 f"Returned no metrics when searching using the provided metric name <{metric_name}>. Expected to find one matching metric."
233 )
234 else:
235 raise Exception(
236 f"Returned multiple metrics when searching using the provided metric name <{metric_name}>. Expected to find only one matching metric."
237 )
238 else:
239 metric_name = metric.name
240 if not isinstance(metric, Metric):
241 raise ValidationError(
242 "Specified metric is not of type credoai.metric.Metric"
243 )
244 if metric.metric_category == "FAIRNESS":
245 self.logger.info(
246 f"fairness metric, {metric_name}, unused by PerformanceModule"
247 )
248 pass
249 elif metric.metric_category in MODEL_METRIC_CATEGORIES:
250 if metric.takes_prob:
251 if metric.metric_category in THRESHOLD_METRIC_CATEGORIES:
252 threshold_metrics[metric_name] = metric
253 else:
254 prob_metrics[metric_name] = metric
255 else:
256 performance_metrics[metric_name] = metric
257 else:
258 self.logger.warning(
259 f"{metric_name} failed to be used by FairnessModule"
260 )
261 failed_metrics.append(metric_name)
263 return (performance_metrics, prob_metrics, threshold_metrics, failed_metrics)

    def _create_confusion_container(self):
        confusion_container = TableContainer(
            create_confusion_matrix(self.y_true, self.y_pred),
            **self.get_container_info(),
        )
        return confusion_container


############################################
##  Evaluation helper functions
##
##  Helper functions that create evidence,
##  which .evaluate wraps in evidence
##  containers
############################################
def create_confusion_matrix(y_true, y_pred):
    """Create a confusion matrix as a dataframe.

    Parameters
    ----------
    y_true : pd.Series of shape (n_samples,)
        Ground truth (correct) target values.
    y_pred : array-like of shape (n_samples,)
        Estimated targets as returned by a classifier.

    Returns
    -------
    pandas.DataFrame
        Long-format dataframe with columns "true_label", "predicted_label", and
        "value", where value is the row-normalized rate for each label pair.
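
    Example
    -------
    Illustrative only; labels and values are made up::

        y_true = pd.Series(["a", "a", "b", "b"])
        y_pred = ["a", "b", "b", "b"]
        create_confusion_matrix(y_true, y_pred)
        # -> one row per (true_label, predicted_label) pair with the
        #    row-normalized rate, e.g. 0.5 for true "a" / predicted "a"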
291 """
292 labels = y_true.astype("category").cat.categories
293 confusion = confusion_matrix(y_true, y_pred, normalize="true", labels=labels)
294 confusion_df = pd.DataFrame(confusion, index=labels.copy(), columns=labels)
295 confusion_df.index.name = "true_label"
296 confusion_df = confusion_df.reset_index().melt(
297 id_vars=["true_label"], var_name="predicted_label"
298 )
299 confusion_df.name = "Confusion Matrix"
300 return confusion_df