Coverage for credoai/evaluators/equity.py: 95%

113 statements  

coverage.py v7.1.0, created at 2023-02-13 21:56 +0000

import numpy as np
import pandas as pd
from connect.evidence import MetricContainer, StatisticTestContainer, TableContainer

from credoai.artifacts import TabularData
from credoai.evaluators.evaluator import Evaluator
from credoai.evaluators.utils.validation import (
    check_data_for_nulls,
    check_data_instance,
    check_existence,
)
from credoai.modules.stats import ChiSquare, OneWayAnova
from credoai.utils.model_utils import type_of_target


class DataEquity(Evaluator):
    """
    Data Equity evaluator for Credo AI (Experimental)

    This evaluator assesses whether outcomes are distributed equally across a sensitive
    feature. Depending on the kind of outcome, different tests are performed:

    - Discrete: chi-squared contingency test, followed by Bonferroni-corrected
      posthoc chi-squared tests
    - Continuous: one-way ANOVA, followed by Tukey HSD posthoc tests
    - Proportion (continuous outcome bounded to [0, 1]): the outcome is transformed to
      logits, then treated as continuous

    Parameters
    ----------
    p_value : float
        The significance threshold used to evaluate statistical tests
    """

    required_artifacts = {"data", "sensitive_feature"}


    def __init__(self, p_value=0.01):
        self.pvalue = p_value
        super().__init__()

    def _validate_arguments(self):
        check_data_instance(self.data, TabularData)
        check_existence(self.data.sensitive_features, "sensitive_features")
        check_data_for_nulls(self.data, "Data")

    def _setup(self):
        self.sensitive_features = self.data.sensitive_feature
        self.y = self.data.y
        self.type_of_target = self.data.y_type

        self.df = pd.concat([self.sensitive_features, self.y], axis=1)
        self.labels = {
            "sensitive_feature": self.sensitive_features.name,
            "outcome": self.y.name,
        }
        return self


    def evaluate(self):
        summary, parity_results = self._describe()
        outcome_distribution = self._outcome_distributions()
        overall_equity, posthoc_tests = self._get_formatted_stats()

        # Combine
        equity_containers = [
            summary,
            outcome_distribution,
            parity_results,
            overall_equity,
        ]

        # Add posthoc if available
        if posthoc_tests is not None:
            equity_containers.append(posthoc_tests)

        self.results = equity_containers
        return self
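    # Note on the result order assembled above: self.results holds a TableContainer of
    # per-group average outcomes, a TableContainer with the outcome distribution, a
    # MetricContainer with the demographic parity metrics, a StatisticTestContainer with
    # the overall equity test, and, when posthoc tests were run, a trailing TableContainer.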


    def _describe(self):
        """Create descriptive output"""
        means = self.df.groupby(self.sensitive_features.name).mean()
        results = {"summary": means}

        summary = results["summary"]
        results["sensitive_feature"] = self.sensitive_features.name
        results["highest_group"] = summary[self.y.name].idxmax()
        results["lowest_group"] = summary[self.y.name].idxmin()
        results["demographic_parity_difference"] = (
            summary[self.y.name].max() - summary[self.y.name].min()
        )
        results["demographic_parity_ratio"] = (
            summary[self.y.name].min() / summary[self.y.name].max()
        )
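        # Worked example (illustrative numbers): if the per-group mean outcomes are
        # {"A": 0.25, "B": 0.50}, then demographic_parity_difference = 0.50 - 0.25 = 0.25
        # and demographic_parity_ratio = 0.25 / 0.50 = 0.5; equal average outcomes give a
        # difference of 0 and a ratio of 1.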


        summary.name = "Average Outcome Per Group"

        # Format summary results
        summary = TableContainer(
            results["summary"],
            **self.get_info(labels=self.labels),
        )

        # Format parity results
        parity_results = pd.DataFrame(
            [
                {"type": k, "value": v}
                for k, v in results.items()
                if "demographic_parity" in k
            ]
        )
        parity_results = MetricContainer(
            parity_results,
            **self.get_info(labels=self.labels),
        )

        return summary, parity_results


    def _outcome_distributions(self):
        out = TableContainer(
            outcome_distribution(
                self.df, self.sensitive_features.name, self.y.name, self.type_of_target
            ),
            **self.get_info(labels=self.labels),
        )
        return out


    def _get_formatted_stats(self) -> tuple:
        """
        Select statistics based on classification type, add formatting.

        Returns
        -------
        tuple
            Overall equity, posthoc tests
        """
        if self.type_of_target in ("binary", "multiclass"):
            statistics = self.discrete_stats()
        else:
            statistics = self.continuous_stats()

        overall_equity = {
            "statistic_type": statistics["test_type"],
            "test_statistic": statistics["statistic"],
            "p_value": statistics["pvalue"],
            "significance_threshold": self.pvalue,
            "significant": statistics["pvalue"] <= self.pvalue,
        }

        overall_equity = StatisticTestContainer(
            pd.DataFrame(overall_equity, index=[0]), **self.get_info()
        )

        posthoc_tests = None
        if "significant_posthoc_tests" in statistics:
            posthoc_tests = pd.DataFrame(statistics["significant_posthoc_tests"])
            posthoc_tests.name = f"{statistics['test_type']}_posthoc"
            posthoc_tests = TableContainer(posthoc_tests, **self.get_info())

        return overall_equity, posthoc_tests


    def discrete_stats(self):
        """Run statistics on discrete outcomes"""
        test = ChiSquare(self.pvalue)
        return test.run(self.df, self.sensitive_features.name, self.y.name)


    def continuous_stats(self):
        """Run statistics on continuous outcomes"""
        # check for proportional bounding and transform
        if self._check_range(self.y, 0, 1):
            self._proportion_transformation()
        return OneWayAnova(self.pvalue).run(
            self.df, self.sensitive_features.name, self.y.name
        )


    # helper functions
    def _check_range(self, lst, lower_bound, upper_bound):
        return min(lst) >= lower_bound and max(lst) <= upper_bound


    def _proportion_transformation(self):
        """Transforms bounded values between 0-1 into a continuous space"""

        def logit(x):
            eps = 1e-6
            return np.log(x / (1 - x + eps) + eps)

        self.df[self.y.name] = self.df[self.y.name].apply(logit)
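        # Illustrative values (the eps = 1e-6 guard is negligible away from the bounds):
        # logit(0.5) ≈ 0.0, logit(0.9) ≈ 2.197, logit(0.1) ≈ -2.197, so bounded proportions
        # are mapped onto an unbounded scale before the one-way ANOVA is run.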



class ModelEquity(DataEquity):
    """
    Evaluates the equity of a model's predictions.

    This evaluator assesses whether model predictions are distributed equally across a
    sensitive feature. Depending on the kind of outcome, different tests are performed:

    * Discrete: chi-squared contingency test, followed by Bonferroni-corrected
      posthoc chi-squared tests
    * Continuous: one-way ANOVA, followed by Tukey HSD posthoc tests
    * Proportion (continuous outcome bounded to [0, 1]): the outcome is transformed to
      logits, then treated as continuous

    Parameters
    ----------
    use_predict_proba : bool, optional
        Defines which predict method is used; if True, predict_proba is used. This
        method outputs probabilities rather than class predictions. The availability
        of predict_proba depends on the model under assessment. By default False.
    p_value : float, optional
        The significance threshold used to evaluate statistical tests, by default 0.01
    """


    required_artifacts = {"model", "assessment_data", "sensitive_feature"}

    def __init__(self, use_predict_proba=False, p_value=0.01):
        self.use_predict_proba = use_predict_proba
        super().__init__(p_value)

    def _setup(self):
        self.sensitive_features = self.assessment_data.sensitive_feature
        fun = self.model.predict_proba if self.use_predict_proba else self.model.predict
        self.y = pd.Series(
            fun(self.assessment_data.X),
            index=self.sensitive_features.index,
        )
        prefix = "predicted probability" if self.use_predict_proba else "predicted"
        try:
            self.y.name = f"{prefix} {self.assessment_data.y.name}"
        except:
            self.y.name = f"{prefix} outcome"

        self.type_of_target = type_of_target(self.y)
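        # Illustrative note (assumption about type_of_target's behavior on float
        # predictions): with use_predict_proba=True the outcome is a probability in
        # [0, 1], so type_of_target typically resolves to "continuous" and the
        # proportion/ANOVA path of DataEquity is taken; with class predictions the
        # chi-squared path is used.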


        self.df = pd.concat([self.sensitive_features, self.y], axis=1)
        self.labels = {
            "sensitive_feature": self.sensitive_features.name,
            "outcome": self.y.name,
        }
        return self


    def _validate_arguments(self):
        check_data_instance(self.assessment_data, TabularData)
        check_existence(self.assessment_data.sensitive_features, "sensitive_features")
        check_data_for_nulls(
            self.assessment_data, "Data", check_X=True, check_y=True, check_sens=True
        )



############################################
## Evaluation helper functions
##
## Helper functions create evidence to be
## passed to .evaluate and wrapped by
## evidence containers
############################################


def outcome_distribution(df, grouping_col, outcome_col, type_of_target, bins=10):
    """Returns the outcome distribution over a grouping factor

    For binary/multiclass outcomes, returns the counts for each combination of outcome
    and group. For a continuous outcome, bins the outcome and reports the number of
    records in each bin for each group.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe with at least two columns for grouping and outcome
    grouping_col : str
        Name of the grouping column; must refer to a categorical column
    outcome_col : str
        Name of the outcome column
    type_of_target : str
        The type of outcome column. Anything besides "binary" and "multiclass" is
        treated as continuous.
    bins : int
        Number of bins to use in the case of a continuous outcome

    Returns
    -------
    pd.DataFrame
        Table of outcome counts per group (with proportions for discrete outcomes),
        named "Outcome Distributions"
    """


    df = df.loc[:, [grouping_col, outcome_col]]
    if type_of_target in ("binary", "multiclass"):
        distribution = df.value_counts().sort_index().reset_index(name="count")
        distribution["proportion"] = distribution["count"] / distribution["count"].sum()
    # histogram binning for continuous
    else:
        distribution = []
        for i, group in df.groupby(grouping_col):
            counts, edges = np.histogram(group[outcome_col], bins=bins)
            bins = edges  # ensure all groups have the same bins
            bin_centers = 0.5 * (edges[:-1] + edges[1:])
            tmp = pd.DataFrame(
                {
                    grouping_col: i,
                    outcome_col: bin_centers,
                    "count": counts,
                }
            )
            distribution.append(tmp)
        distribution = pd.concat(distribution, axis=0)
    distribution.name = "Outcome Distributions"

    return distribution
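
# Illustrative call (not part of the module), showing the discrete branch; the variable
# name `toy` is hypothetical:
#
#     toy = pd.DataFrame({"group": ["A", "A", "B", "B"], "outcome": [0, 1, 1, 1]})
#     outcome_distribution(toy, "group", "outcome", "binary")
#
# yields one row per (group, outcome) pair with "count" and "proportion" columns, e.g.
# the ("B", 1) row has count=2 and proportion=0.5 (2 of the 4 records).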