Coverage for credoai/evaluators/equity.py: 95%

113 statements  

coverage.py v7.1.0, created at 2023-02-13 21:56 +0000

import numpy as np
import pandas as pd
from connect.evidence import MetricContainer, StatisticTestContainer, TableContainer

from credoai.artifacts import TabularData
from credoai.evaluators.evaluator import Evaluator
from credoai.evaluators.utils.validation import (
    check_data_for_nulls,
    check_data_instance,
    check_existence,
)
from credoai.modules.stats import ChiSquare, OneWayAnova
from credoai.utils.model_utils import type_of_target


class DataEquity(Evaluator):
    """
    Data Equity evaluator for Credo AI (Experimental)

    This evaluator assesses whether outcomes are distributed equally across a sensitive
    feature. Depending on the kind of outcome, different tests are performed:

    - Discrete: chi-squared contingency test, followed by Bonferroni-corrected
      posthoc chi-squared tests
    - Continuous: one-way ANOVA, followed by Tukey HSD posthoc tests
    - Proportion (continuous outcome bounded to [0, 1]): the outcome is transformed to
      logits, then treated as continuous

    Parameters
    ----------
    p_value : float
        The significance threshold used to evaluate statistical tests
    """

    required_artifacts = {"data", "sensitive_feature"}


    def __init__(self, p_value=0.01):
        self.pvalue = p_value
        super().__init__()

    def _validate_arguments(self):
        check_data_instance(self.data, TabularData)
        check_existence(self.data.sensitive_features, "sensitive_features")
        check_data_for_nulls(self.data, "Data")

    def _setup(self):
        self.sensitive_features = self.data.sensitive_feature
        self.y = self.data.y
        self.type_of_target = self.data.y_type

        self.df = pd.concat([self.sensitive_features, self.y], axis=1)
        self.labels = {
            "sensitive_feature": self.sensitive_features.name,
            "outcome": self.y.name,
        }
        return self


    def evaluate(self):
        summary, parity_results = self._describe()
        outcome_distribution = self._outcome_distributions()
        overall_equity, posthoc_tests = self._get_formatted_stats()

        # Combine
        equity_containers = [
            summary,
            outcome_distribution,
            parity_results,
            overall_equity,
        ]

        # Add posthoc if available
        if posthoc_tests is not None:
            equity_containers.append(posthoc_tests)

        self.results = equity_containers
        return self
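    # Note on the result order assembled above: self.results holds a TableContainer of
    # per-group average outcomes, a TableContainer with the outcome distribution, a
    # MetricContainer with the demographic parity metrics, a StatisticTestContainer with
    # the overall equity test, and, when posthoc tests were run, a trailing TableContainer.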


    def _describe(self):
        """Create descriptive output"""
        means = self.df.groupby(self.sensitive_features.name).mean()
        results = {"summary": means}

        summary = results["summary"]
        results["sensitive_feature"] = self.sensitive_features.name
        results["highest_group"] = summary[self.y.name].idxmax()
        results["lowest_group"] = summary[self.y.name].idxmin()
        results["demographic_parity_difference"] = (
            summary[self.y.name].max() - summary[self.y.name].min()
        )
        results["demographic_parity_ratio"] = (
            summary[self.y.name].min() / summary[self.y.name].max()
        )
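        # Worked example (illustrative numbers): if the per-group mean outcomes are
        # {"A": 0.25, "B": 0.50}, then demographic_parity_difference = 0.50 - 0.25 = 0.25
        # and demographic_parity_ratio = 0.25 / 0.50 = 0.5; equal average outcomes give a
        # difference of 0 and a ratio of 1.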


        summary.name = "Average Outcome Per Group"

        # Format summary results
        summary = TableContainer(
            results["summary"],
            **self.get_info(labels=self.labels),
        )

        # Format parity results
        parity_results = pd.DataFrame(
            [
                {"type": k, "value": v}
                for k, v in results.items()
                if "demographic_parity" in k
            ]
        )
        parity_results = MetricContainer(
            parity_results,
            **self.get_info(labels=self.labels),
        )

        return summary, parity_results


    def _outcome_distributions(self):
        out = TableContainer(
            outcome_distribution(
                self.df, self.sensitive_features.name, self.y.name, self.type_of_target
            ),
            **self.get_info(labels=self.labels),
        )
        return out


    def _get_formatted_stats(self) -> tuple:
        """
        Select statistics based on classification type, add formatting.

        Returns
        -------
        tuple
            Overall equity, posthoc tests
        """
        if self.type_of_target in ("binary", "multiclass"):
            statistics = self.discrete_stats()
        else:
            statistics = self.continuous_stats()

        overall_equity = {
            "statistic_type": statistics["test_type"],
            "test_statistic": statistics["statistic"],
            "p_value": statistics["pvalue"],
            "significance_threshold": self.pvalue,
            "significant": statistics["pvalue"] <= self.pvalue,
        }

        overall_equity = StatisticTestContainer(
            pd.DataFrame(overall_equity, index=[0]), **self.get_info()
        )

        posthoc_tests = None
        if "significant_posthoc_tests" in statistics:
            posthoc_tests = pd.DataFrame(statistics["significant_posthoc_tests"])
            posthoc_tests.name = f"{statistics['test_type']}_posthoc"
            posthoc_tests = TableContainer(posthoc_tests, **self.get_info())

        return overall_equity, posthoc_tests


    def discrete_stats(self):
        """Run statistics on discrete outcomes"""
        test = ChiSquare(self.pvalue)
        return test.run(self.df, self.sensitive_features.name, self.y.name)


    def continuous_stats(self):
        """Run statistics on continuous outcomes"""
        # check for proportional bounding and transform
        if self._check_range(self.y, 0, 1):
            self._proportion_transformation()
        return OneWayAnova(self.pvalue).run(
            self.df, self.sensitive_features.name, self.y.name
        )


    # helper functions
    def _check_range(self, lst, lower_bound, upper_bound):
        return min(lst) >= lower_bound and max(lst) <= upper_bound


    def _proportion_transformation(self):
        """Transforms bounded values between 0-1 into a continuous space"""

        def logit(x):
            eps = 1e-6
            return np.log(x / (1 - x + eps) + eps)

        self.df[self.y.name] = self.df[self.y.name].apply(logit)
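        # Illustrative values (the eps = 1e-6 guard is negligible away from the bounds):
        # logit(0.5) ≈ 0.0, logit(0.9) ≈ 2.197, logit(0.1) ≈ -2.197, so bounded proportions
        # are mapped onto an unbounded scale before the one-way ANOVA is run.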



class ModelEquity(DataEquity):
    """
    Evaluates the equity of a model's predictions.

    This evaluator assesses whether model predictions are distributed equally across a
    sensitive feature. Depending on the kind of outcome, different tests are performed:

    * Discrete: chi-squared contingency test, followed by Bonferroni-corrected
      posthoc chi-squared tests
    * Continuous: one-way ANOVA, followed by Tukey HSD posthoc tests
    * Proportion (continuous outcome bounded to [0, 1]): the outcome is transformed to
      logits, then treated as continuous

    Parameters
    ----------
    use_predict_proba : bool, optional
        Defines which predict method is used; if True, predict_proba is used. This
        method outputs probabilities rather than class predictions. The availability
        of predict_proba depends on the model under assessment. By default False.
    p_value : float, optional
        The significance threshold used to evaluate statistical tests, by default 0.01
    """


    required_artifacts = {"model", "assessment_data", "sensitive_feature"}

    def __init__(self, use_predict_proba=False, p_value=0.01):
        self.use_predict_proba = use_predict_proba
        super().__init__(p_value)

    def _setup(self):
        self.sensitive_features = self.assessment_data.sensitive_feature
        fun = self.model.predict_proba if self.use_predict_proba else self.model.predict
        self.y = pd.Series(
            fun(self.assessment_data.X),
            index=self.sensitive_features.index,
        )
        prefix = "predicted probability" if self.use_predict_proba else "predicted"
        try:
            self.y.name = f"{prefix} {self.assessment_data.y.name}"
        except:
            self.y.name = f"{prefix} outcome"

        self.type_of_target = type_of_target(self.y)
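        # Illustrative note (assumption about type_of_target's behavior on float
        # predictions): with use_predict_proba=True the outcome is a probability in
        # [0, 1], so type_of_target typically resolves to "continuous" and the
        # proportion/ANOVA path of DataEquity is taken; with class predictions the
        # chi-squared path is used.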


        self.df = pd.concat([self.sensitive_features, self.y], axis=1)
        self.labels = {
            "sensitive_feature": self.sensitive_features.name,
            "outcome": self.y.name,
        }
        return self


    def _validate_arguments(self):
        check_data_instance(self.assessment_data, TabularData)
        check_existence(self.assessment_data.sensitive_features, "sensitive_features")
        check_data_for_nulls(
            self.assessment_data, "Data", check_X=True, check_y=True, check_sens=True
        )



############################################
## Evaluation helper functions
##
## Helper functions create evidence to be
## passed to .evaluate and wrapped by
## evidence containers
############################################


def outcome_distribution(df, grouping_col, outcome_col, type_of_target, bins=10):
    """Returns the outcome distribution over a grouping factor

    For binary/multiclass outcomes, returns the counts for each combination of outcome
    and group. For a continuous outcome, bins the outcome and reports the number of
    records in each bin for each group.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe with at least two columns for grouping and outcome
    grouping_col : str
        Name of the grouping column; must refer to a categorical column
    outcome_col : str
        Name of the outcome column
    type_of_target : str
        The type of outcome column. Anything besides "binary" and "multiclass" is
        treated as continuous.
    bins : int
        Number of bins to use in the case of a continuous outcome

    Returns
    -------
    pd.DataFrame
        Table of outcome counts per group (with proportions for discrete outcomes),
        named "Outcome Distributions"
    """


    df = df.loc[:, [grouping_col, outcome_col]]
    if type_of_target in ("binary", "multiclass"):
        distribution = df.value_counts().sort_index().reset_index(name="count")
        distribution["proportion"] = distribution["count"] / distribution["count"].sum()
    # histogram binning for continuous
    else:
        distribution = []
        for i, group in df.groupby(grouping_col):
            counts, edges = np.histogram(group[outcome_col], bins=bins)
            bins = edges  # ensure all groups have the same bins
            bin_centers = 0.5 * (edges[:-1] + edges[1:])
            tmp = pd.DataFrame(
                {
                    grouping_col: i,
                    outcome_col: bin_centers,
                    "count": counts,
                }
            )
            distribution.append(tmp)
        distribution = pd.concat(distribution, axis=0)
    distribution.name = "Outcome Distributions"

    return distribution
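
# Illustrative call (not part of the module), showing the discrete branch; the variable
# name `toy` is hypothetical:
#
#     toy = pd.DataFrame({"group": ["A", "A", "B", "B"], "outcome": [0, 1, 1, 1]})
#     outcome_distribution(toy, "group", "outcome", "binary")
#
# yields one row per (group, outcome) pair with "count" and "proportion" columns, e.g.
# the ("B", 1) row has count=2 and proportion=0.5 (2 of the 4 records).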