Coverage for credoai/evaluators/performance.py: 92% (86 statements)

coverage.py v7.1.0, created at 2023-02-13 21:56 +0000

import numpy as np
import pandas as pd
from connect.evidence import MetricContainer, TableContainer
from sklearn.metrics import confusion_matrix

from credoai.artifacts import ClassificationModel
from credoai.evaluators.evaluator import Evaluator
from credoai.evaluators.utils.fairlearn import setup_metric_frames
from credoai.evaluators.utils.validation import check_data_for_nulls, check_existence
from credoai.modules.metrics import process_metrics
from credoai.utils.common import ValidationError



class Performance(Evaluator):
    """
    Performance evaluator for Credo AI.

    This evaluator calculates overall performance metrics. It handles any metric
    that can be calculated on a set of ground-truth labels and predictions,
    e.g., binary classification, multi-class classification, regression.

    This module takes in a set of metrics and provides functionality to:

    - calculate the metrics
    - create disaggregated metrics

    Parameters
    ----------
    metrics : List-like
        List of metric names as strings or list of Metric objects
        (credoai.modules.metrics.Metric). Metric strings should be in the list
        returned by credoai.modules.metric_utils.list_metrics(). Note that for
        performance parity metrics like "false negative rate parity", just list
        "false negative rate"; parity metrics are calculated automatically if the
        corresponding performance metric is supplied.
    y_true : (List, pandas.Series, numpy.ndarray)
        The ground-truth labels (for classification) or target values (for regression).
    y_pred : (List, pandas.Series, numpy.ndarray)
        The predicted labels for classification.
    y_prob : (List, pandas.Series, numpy.ndarray), optional
        The unthresholded predictions, confidence values, or probabilities.
    """


    required_artifacts = {"model", "assessment_data"}

    def __init__(self, metrics=None):
        super().__init__()
        # assign variables
        self.metrics = metrics
        self.metric_frames = {}
        self.processed_metrics = None


    def _validate_arguments(self):
        check_existence(self.metrics, "metrics")
        check_existence(self.assessment_data.y, "y")
        check_data_for_nulls(
            self.assessment_data, "Data", check_X=True, check_y=True, check_sens=False
        )


    def _setup(self):
        # data variables
        self.y_true = self.assessment_data.y
        self.y_pred = self.model.predict(self.assessment_data.X)
        try:
            self.y_prob = self.model.predict_proba(self.assessment_data.X)
        except Exception:
            # models without probability outputs (e.g., regressors) fall back to None
            self.y_prob = None
        self.update_metrics(self.metrics)
        return self


    def evaluate(self):
        """
        Run the performance evaluator and store the results.
        """
        results = []
        overall_metrics = self.get_overall_metrics()
        threshold_metrics = self.get_overall_threshold_metrics()

        if overall_metrics is not None:
            results.append(overall_metrics)
        if threshold_metrics is not None:
            results += threshold_metrics

        if isinstance(self.model, ClassificationModel):
            results.append(self._create_confusion_container())
        self.results = results
        return self


    def update_metrics(self, metrics, replace=True):
        """Replace or extend the list of metrics

        Parameters
        ----------
        metrics : List-like
            List of metric names as strings or list of Metrics (credoai.metrics.Metric).
            Metric strings should be in the list returned by credoai.modules.list_metrics.
            Note that for performance parity metrics like "false negative rate parity",
            just list "false negative rate"; parity metrics are calculated automatically
            if the corresponding performance metric is supplied.
        replace : bool, optional
            If True (default), replace the current metrics; otherwise append to them.
        """
        if replace:
            self.metrics = metrics
        else:
            self.metrics += metrics

        self.processed_metrics, _ = process_metrics(self.metrics, self.model.type)

        dummy_sensitive = pd.Series(["NA"] * len(self.y_true), name="NA")
        self.metric_frames = setup_metric_frames(
            self.processed_metrics,
            self.y_pred,
            self.y_prob,
            self.y_true,
            dummy_sensitive,
        )


    def get_df(self):
        """Return dataframe of input arrays

        Returns
        -------
        pandas.DataFrame
            Dataframe containing the input arrays
        """
        df = pd.DataFrame({"true": self.y_true, "pred": self.y_pred})
        if self.y_prob is not None:
            y_prob_df = pd.DataFrame(self.y_prob)
            y_prob_df.columns = [f"y_prob_{i}" for i in range(y_prob_df.shape[1])]
            df = pd.concat([df, y_prob_df], axis=1)

        return df


    def get_overall_metrics(self):
        """Return overall scalar performance metrics

        Returns
        -------
        MetricContainer or None
            The overall performance metrics, or None if no scalar metrics were computed
        """
        # overall metrics are the same for every sensitive feature, so retrieve them once
        overall_metrics = [
            metric_frame.overall
            for name, metric_frame in self.metric_frames.items()
            if name != "thresh"
        ]
        if not overall_metrics:
            return

        output_series = (
            pd.concat(overall_metrics, axis=0).rename(index="value").to_frame()
        )
        output_series = output_series.reset_index().rename({"index": "type"}, axis=1)

        return MetricContainer(output_series, **self.get_info())


    def get_overall_threshold_metrics(self):
        """Return overall threshold-dependent performance metrics

        Returns
        -------
        list of TableContainer or None
            One table per threshold-dependent metric, or None if none were computed
        """
        # threshold metrics are the same for every sensitive feature, so retrieve them once
        if "thresh" not in self.metric_frames:
            return

        threshold_results = (
            pd.concat([self.metric_frames["thresh"].overall], axis=0)
            .rename(index="value")
            .to_frame()
        )
        threshold_results = threshold_results.reset_index().rename(
            {"index": "threshold_metric"}, axis=1
        )
        threshold_results.name = "threshold_metric_performance"

        results = []
        for _, threshold_metric in threshold_results.iterrows():
            metric = threshold_metric.threshold_metric
            threshold_metric.value.name = "threshold_dependent_performance"
            results.append(
                TableContainer(
                    threshold_metric.value,
                    **self.get_info({"metric_type": metric}),
                )
            )

        return results


    def _create_confusion_container(self):
        confusion_container = TableContainer(
            create_confusion_matrix(self.y_true, self.y_pred),
            **self.get_info(),
        )
        return confusion_container



############################################
## Evaluation helper functions
##
## These helpers create evidence that is
## passed to .evaluate and wrapped by
## evidence containers.
############################################

def create_confusion_matrix(y_true, y_pred):
    """Create a confusion matrix as a dataframe

    Parameters
    ----------
    y_true : pd.Series of shape (n_samples,)
        Ground truth (correct) target values.

    y_pred : array-like of shape (n_samples,)
        Estimated targets as returned by a classifier.

    Returns
    -------
    pandas.DataFrame
        Long-format confusion matrix with columns true_label, predicted_label, and
        value, where rows are normalized over the true labels.
    """
    labels = np.unique(y_true)
    # normalize over the true (row) labels so each row sums to 1
    confusion = confusion_matrix(y_true, y_pred, normalize="true", labels=labels)
    confusion_df = pd.DataFrame(confusion, index=labels.copy(), columns=labels)
    confusion_df.index.name = "true_label"
    # melt to long format: one row per (true_label, predicted_label) pair
    confusion_df = confusion_df.reset_index().melt(
        id_vars=["true_label"], var_name="predicted_label"
    )
    confusion_df.name = "Confusion Matrix"
    return confusion_df
226 return confusion_df