Coverage for credoai/evaluators/performance.py: 88%
117 statements
coverage.py v6.5.0, created at 2022-12-08 07:32 +0000
import pandas as pd
from connect.evidence import MetricContainer, TableContainer
from sklearn.metrics import confusion_matrix

from credoai.artifacts import ClassificationModel, TabularData
from credoai.evaluators import Evaluator
from credoai.evaluators.utils.fairlearn import setup_metric_frames
from credoai.evaluators.utils.validation import (
    check_artifact_for_nulls,
    check_data_instance,
    check_existence,
)
from credoai.modules.constants_metrics import (
    MODEL_METRIC_CATEGORIES,
    THRESHOLD_METRIC_CATEGORIES,
)
from credoai.modules.metrics import Metric, find_metrics
from credoai.utils.common import ValidationError


class Performance(Evaluator):
    """
    Performance evaluator for Credo AI.

    This evaluator calculates overall performance metrics. It handles any metric
    that can be calculated on a set of ground-truth labels and predictions,
    e.g., binary classification, multi-class classification, regression.

    This module takes in a set of metrics and provides functionality to:

    - calculate the metrics
    - create disaggregated metrics

    Parameters
    ----------
    metrics : List-like
        List of metric names as strings or list of Metric objects
        (credoai.modules.metrics.Metric). Metric strings should be in the list
        returned by credoai.modules.metric_utils.list_metrics(). Note: for
        performance parity metrics like "false negative rate parity", just list
        "false negative rate"; parity metrics are calculated automatically if the
        performance metric is supplied.
    y_true : (List, pandas.Series, numpy.ndarray)
        The ground-truth labels (for classification) or target values (for regression).
    y_pred : (List, pandas.Series, numpy.ndarray)
        The predicted labels for classification.
    y_prob : (List, pandas.Series, numpy.ndarray), optional
        The unthresholded predictions, confidence values, or probabilities.
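
    Example
    -------
    A minimal usage sketch. In practice this evaluator is typically run inside a
    Lens pipeline, which supplies the required artifacts; here it is driven
    directly for illustration. ``credo_model`` (a ClassificationModel) and
    ``credo_data`` (a TabularData assessment set) are assumed to exist and are
    not defined in this module::

        evaluator = Performance(metrics=["accuracy_score"])
        evaluator.model = credo_model
        evaluator.assessment_data = credo_data
        evaluator._validate_arguments()
        evaluator._setup()
        results = evaluator.evaluate().results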
48 """
50 required_artifacts = {"model", "assessment_data"}

    def __init__(self, metrics=None):
        super().__init__()
        # assign variables
        self.metrics = metrics
        self.metric_frames = {}
        self.performance_metrics = None
        self.prob_metrics = None
        self.failed_metrics = None

    def _validate_arguments(self):
        check_existence(self.metrics, "metrics")
        check_data_instance(self.assessment_data, TabularData)
        check_artifact_for_nulls(self.assessment_data, "Data")

    def _setup(self):
        # data variables
        self.y_true = self.assessment_data.y
        self.y_pred = self.model.predict(self.assessment_data.X)
        try:
            self.y_prob = self.model.predict_proba(self.assessment_data.X)
        except Exception:
            # model does not expose probability predictions
            self.y_prob = None
        self.update_metrics(self.metrics)
        return self

    def evaluate(self):
        """
        Run the performance evaluation and store the results.
        """
        results = []
        overall_metrics = self.get_overall_metrics()
        threshold_metrics = self.get_overall_threshold_metrics()

        if overall_metrics is not None:
            results.append(overall_metrics)
        if threshold_metrics is not None:
            results += threshold_metrics

        if isinstance(self.model, ClassificationModel):
            results.append(self._create_confusion_container())
        self.results = results
        return self

    def update_metrics(self, metrics, replace=True):
        """Replace or extend the evaluator's metrics and rebuild the metric frames.

        Parameters
        ----------
        metrics : List-like
            List of metric names as strings or list of Metric objects
            (credoai.modules.metrics.Metric). Metric strings should be in the list
            returned by credoai.modules.list_metrics(). Note: for performance parity
            metrics like "false negative rate parity", just list "false negative
            rate"; parity metrics are calculated automatically if the performance
            metric is supplied.
        replace : bool, optional
            If True, replace the current metrics; if False, append to them.
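
        Example
        -------
        Hypothetical call on an already set-up evaluator, appending an extra
        metric rather than replacing the current list::

            evaluator.update_metrics(["precision_score"], replace=False)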
106 """
107 if replace:
108 self.metrics = metrics
109 else:
110 self.metrics += metrics
111 (
112 self.performance_metrics,
113 self.prob_metrics,
114 self.threshold_metrics,
115 self.failed_metrics,
116 ) = self._process_metrics(self.metrics)
118 dummy_sensitive = pd.Series(["NA"] * len(self.y_true), name="NA")
119 self.metric_frames = setup_metric_frames(
120 self.performance_metrics,
121 self.prob_metrics,
122 self.threshold_metrics,
123 self.y_pred,
124 self.y_prob,
125 self.y_true,
126 dummy_sensitive,
127 )

    def get_df(self):
        """Return dataframe of input arrays.

        Returns
        -------
        pandas.DataFrame
            Dataframe containing the input arrays
        """
        df = pd.DataFrame({"true": self.y_true, "pred": self.y_pred})
        if self.y_prob is not None:
            y_prob_df = pd.DataFrame(self.y_prob)
            y_prob_df.columns = [f"y_prob_{i}" for i in range(y_prob_df.shape[1])]
            df = pd.concat([df, y_prob_df], axis=1)

        return df

    def get_overall_metrics(self):
        """Return overall scalar performance metrics.

        Returns
        -------
        MetricContainer
            The overall performance metrics, or None if no applicable metric
            frames were created.
        """
        # collect overall values from the metric frames; threshold-varying metrics
        # are handled separately by get_overall_threshold_metrics
        overall_metrics = [
            metric_frame.overall
            for name, metric_frame in self.metric_frames.items()
            if name != "thresh"
        ]
        if not overall_metrics:
            return

        output_series = (
            pd.concat(overall_metrics, axis=0).rename(index="value").to_frame()
        )
        output_series = output_series.reset_index().rename({"index": "type"}, axis=1)

        return MetricContainer(output_series, **self.get_container_info())

    def get_overall_threshold_metrics(self):
        """Return overall threshold-varying performance metrics.

        Returns
        -------
        list of TableContainer
            One container per threshold-varying metric, or None if no threshold
            metrics were requested.
        """
        if not (self.threshold_metrics and "thresh" in self.metric_frames):
            return

        threshold_results = (
            pd.concat([self.metric_frames["thresh"].overall], axis=0)
            .rename(index="value")
            .to_frame()
        )
        threshold_results = threshold_results.reset_index().rename(
            {"index": "threshold_metric"}, axis=1
        )
        threshold_results.name = "threshold_metric_performance"

        results = []
        for _, threshold_metric in threshold_results.iterrows():
            metric = threshold_metric.threshold_metric
            threshold_metric.value.name = "threshold_dependent_performance"
            results.append(
                TableContainer(
                    threshold_metric.value,
                    **self.get_container_info({"metric_type": metric}),
                )
            )

        return results

    def _process_metrics(self, metrics):
        """Separate metrics into performance, probability, threshold, and failed groups.

        Parameters
        ----------
        metrics : Union[List[Metric, str]]
            List of metrics to use. These can be Metric objects
            (see credoai.modules.metrics.py) or strings. If strings, they will be
            converted to Metric objects as appropriate, using find_metrics().

        Returns
        -------
        tuple
            (performance_metrics, prob_metrics, threshold_metrics, failed_metrics):
            three dictionaries mapping metric names to Metric objects, plus a list
            of metric names that could not be used.
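
        Example
        -------
        Illustrative sketch of the expected split; the metric names are examples
        and depend on the metrics actually registered in credoai::

            perf, prob, thresh, failed = self._process_metrics(
                ["accuracy_score", "roc_auc_score", "roc_curve"]
            )
            # perf   -> label-based metrics, e.g. {"accuracy_score": Metric(...)}
            # prob   -> probability-based metrics, e.g. {"roc_auc_score": Metric(...)}
            # thresh -> threshold-varying metrics, e.g. {"roc_curve": Metric(...)}
            # failed -> names that could not be categorized, here []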
218 """
219 # separate metrics
220 failed_metrics = []
221 performance_metrics = {}
222 prob_metrics = {}
223 threshold_metrics = {}
224 for metric in metrics:
225 if isinstance(metric, str):
226 metric_name = metric
227 metric = find_metrics(metric, MODEL_METRIC_CATEGORIES)
228 if len(metric) == 1:
229 metric = metric[0]
230 elif len(metric) == 0:
231 raise Exception(
232 f"Returned no metrics when searching using the provided metric name <{metric_name}>. Expected to find one matching metric."
233 )
234 else:
235 raise Exception(
236 f"Returned multiple metrics when searching using the provided metric name <{metric_name}>. Expected to find only one matching metric."
237 )
238 else:
239 metric_name = metric.name
240 if not isinstance(metric, Metric):
241 raise ValidationError(
242 "Specified metric is not of type credoai.metric.Metric"
243 )
244 if metric.metric_category == "FAIRNESS":
245 self.logger.info(
246 f"fairness metric, {metric_name}, unused by PerformanceModule"
247 )
248 pass
249 elif metric.metric_category in MODEL_METRIC_CATEGORIES:
250 if metric.takes_prob:
251 if metric.metric_category in THRESHOLD_METRIC_CATEGORIES:
252 threshold_metrics[metric_name] = metric
253 else:
254 prob_metrics[metric_name] = metric
255 else:
256 performance_metrics[metric_name] = metric
257 else:
258 self.logger.warning(
259 f"{metric_name} failed to be used by FairnessModule"
260 )
261 failed_metrics.append(metric_name)
263 return (performance_metrics, prob_metrics, threshold_metrics, failed_metrics)

    def _create_confusion_container(self):
        confusion_container = TableContainer(
            create_confusion_matrix(self.y_true, self.y_pred),
            **self.get_container_info(),
        )
        return confusion_container


############################################
##  Evaluation helper functions
##
##  Helper functions that create evidence,
##  which .evaluate wraps in evidence
##  containers
############################################
def create_confusion_matrix(y_true, y_pred):
    """Create a confusion matrix as a dataframe.

    Parameters
    ----------
    y_true : pd.Series of shape (n_samples,)
        Ground truth (correct) target values.
    y_pred : array-like of shape (n_samples,)
        Estimated targets as returned by a classifier.

    Returns
    -------
    pandas.DataFrame
        Long-format dataframe with columns "true_label", "predicted_label", and
        "value", where value is the row-normalized rate for each label pair.
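
    Example
    -------
    Illustrative only; labels and values are made up::

        y_true = pd.Series(["a", "a", "b", "b"])
        y_pred = ["a", "b", "b", "b"]
        create_confusion_matrix(y_true, y_pred)
        # -> one row per (true_label, predicted_label) pair with the
        #    row-normalized rate, e.g. 0.5 for true "a" / predicted "a"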
291 """
292 labels = y_true.astype("category").cat.categories
293 confusion = confusion_matrix(y_true, y_pred, normalize="true", labels=labels)
294 confusion_df = pd.DataFrame(confusion, index=labels.copy(), columns=labels)
295 confusion_df.index.name = "true_label"
296 confusion_df = confusion_df.reset_index().melt(
297 id_vars=["true_label"], var_name="predicted_label"
298 )
299 confusion_df.name = "Confusion Matrix"
300 return confusion_df