Coverage for credoai/evaluators/identity_verification.py: 98% (112 statements)
coverage.py v6.5.0, created at 2022-12-08 07:32 +0000

"""Identity Verification evaluator"""
import pandas as pd

from connect.evidence import MetricContainer, TableContainer

from credoai.artifacts import ComparisonData, ComparisonModel
from credoai.artifacts.model.comparison_model import DummyComparisonModel
from credoai.evaluators import Evaluator
from credoai.evaluators.utils.fairlearn import setup_metric_frames
from credoai.evaluators.utils.validation import (
    check_data_instance,
    check_existence,
    check_model_instance,
)
from credoai.modules.constants_metrics import BINARY_CLASSIFICATION_FUNCTIONS as bcf
from credoai.modules.metrics import Metric

METRIC_SUBSET = [
    "false_match_rate-score",
    "false_non_match_rate-score",
    "false_match_rate_parity_difference-score",
    "false_non_match_rate_parity_difference-score",
    "false_match_rate_parity_ratio-score",
    "false_non_match_rate_parity_ratio-score",
]
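# Each entry follows a "<metric name>-<subtype>" naming convention; downstream, the
# evaluator splits on "-" to populate the "type" and "subtype" columns of its result
# tables, and only metrics listed here are emitted.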


class IdentityVerification(Evaluator):
    """
    Pair-wise-comparison-based identity verification evaluator for Credo AI.

    This evaluator takes in identity verification data and provides
    functionality to perform performance and fairness assessments.

    Parameters
    ----------
    pairs : pd.DataFrame of shape (n_pairs, 4)
        Dataframe where each row represents a data sample pair and the associated subjects.
        The type of data sample is determined by the ComparisonModel's `compare` function,
        which takes data sample pairs and returns their similarity scores. Examples are
        selfies, fingerprint scans, or voice recordings of a person.

        Required columns:

        * source-subject-id: unique identifier of the source subject
        * source-subject-data-sample: data sample from the source subject
        * target-subject-id: unique identifier of the target subject
        * target-subject-data-sample: data sample from the target subject

    subjects_sensitive_features : pd.DataFrame of shape (n_subjects, n_sensitive_feature_names), optional
        Sensitive features of all subjects present in the pairs dataframe.
        If provided, disaggregated performance assessment is also performed.
        These can be the columns you want to perform segmentation analysis on,
        or features related to fairness such as 'race' or 'gender'.

        Required columns:

        * subject-id: ids of the subjects; must cover all subjects included in the `pairs` dataframe
        * other columns with arbitrary names for the sensitive features

    similarity_thresholds : list
        List of similarity score thresholds.
        A similarity score equal to or greater than a threshold counts as a match.
    comparison_levels : list
        List of comparison levels. Options:

        * sample: a match is assessed for every sample pair. Sample-level comparison represents
          a use case where only two samples (such as a real-time selfie and a stored ID image)
          are used to confirm an identity.
        * subject: if any pair of samples for the same subject pair is a match, the subject pair
          is marked as a match. Some identity verification use cases improve overall accuracy by
          storing multiple samples per identity. Subject-level comparison mirrors this behavior.

    Example
    --------

    >>> import pandas as pd
    >>> from credoai.lens import Lens
    >>> from credoai.artifacts import ComparisonData, ComparisonModel
    >>> from credoai.evaluators import IdentityVerification
    >>> evaluator = IdentityVerification(similarity_thresholds=[60, 99])
    >>> import doctest
    >>> doctest.ELLIPSIS_MARKER = '-etc-'
    >>> pairs = pd.DataFrame({
    ...     'source-subject-id': ['s0', 's0', 's0', 's0', 's1', 's1', 's1', 's1', 's1', 's2'],
    ...     'source-subject-data-sample': ['s00', 's00', 's00', 's00', 's10', 's10', 's10', 's11', 's11', 's20'],
    ...     'target-subject-id': ['s1', 's1', 's2', 's3', 's1', 's2', 's3', 's2', 's3', 's3'],
    ...     'target-subject-data-sample': ['s10', 's11', 's20', 's30', 's11', 's20', 's30', 's20', 's30', 's30']
    ... })
    >>> subjects_sensitive_features = pd.DataFrame({
    ...     'subject-id': ['s0', 's1', 's2', 's3'],
    ...     'gender': ['female', 'male', 'female', 'female']
    ... })
    >>> class FaceCompare:
    ...     # a dummy selfie comparison model
    ...     def compare(self, pairs):
    ...         similarity_scores = [31.5, 16.7, 20.8, 84.4, 12.0, 15.2, 45.8, 23.5, 28.5, 44.5]
    ...         return similarity_scores
    >>> face_compare = FaceCompare()
    >>> credo_data = ComparisonData(
    ...     name="face-data",
    ...     pairs=pairs,
    ...     subjects_sensitive_features=subjects_sensitive_features
    ... )
    >>> credo_model = ComparisonModel(
    ...     name="face-compare",
    ...     model_like=face_compare
    ... )
    >>> pipeline = Lens(model=credo_model, assessment_data=credo_data)
    >>> pipeline.add(evaluator) # doctest: +ELLIPSIS
    -etc-
    >>> pipeline.run() # doctest: +ELLIPSIS
    -etc-
    >>> pipeline.get_results() # doctest: +ELLIPSIS
    -etc-

    """

    required_artifacts = {"model", "assessment_data"}

    def __init__(
        self,
        similarity_thresholds: list = [90, 95, 99],
        comparison_levels: list = ["sample", "subject"],
    ):
        self.similarity_thresholds = similarity_thresholds
        self.comparison_levels = comparison_levels
        super().__init__()

    def _validate_arguments(self):
        check_data_instance(self.assessment_data, ComparisonData)
        check_model_instance(self.model, (ComparisonModel, DummyComparisonModel))
        check_existence(self.assessment_data.pairs, "pairs")
        return self

    def _setup(self):
        self.pairs = self.assessment_data.pairs
        # Sensitive features are optional; fall back to None when they are not provided
        try:
            self.subjects_sensitive_features = (
                self.assessment_data.subjects_sensitive_features
            )
            sensitive_features_names = list(self.subjects_sensitive_features.columns)
            sensitive_features_names.remove("subject-id")
            self.sensitive_features_names = sensitive_features_names
        except Exception:
            self.subjects_sensitive_features = None

        # Score every sample pair with the wrapped comparison model
        self.pairs["similarity_score"] = self.model.compare(
            [
                list(pair)
                for pair in zip(
                    self.pairs["source-subject-data-sample"].tolist(),
                    self.pairs["target-subject-data-sample"].tolist(),
                )
            ]
        )

        # Ground-truth label: a pair is a match when source and target are the same subject
        self.pairs["match"] = self.pairs.apply(
            lambda x: 1 if x["source-subject-id"] == x["target-subject-id"] else 0,
            axis=1,
        )

        return self

    def evaluate(self):
        """
        Runs the assessment process

        Returns
        -------
        self
            The evaluator instance, with the assessment results stored in `self.results`
        """

        self.results = self._assess_overall_performance()

        if self.subjects_sensitive_features is not None:
            self._assess_disaggregated_performance()

        return self

    def _process_data(
        self, pairs_processed, threshold=90, comparison_level="sample", sf=None
    ):
        """
        Process the pairs and sensitive features dataframes

        Parameters
        ----------
        pairs_processed : pd.DataFrame
            pairs dataframe to be processed
        threshold : float, optional
            similarity score threshold; scores equal to or greater than it count as a match, by default 90
        comparison_level : str, optional
            comparison level, by default "sample"
            Options:

            * sample: a match is assessed for every sample pair. Sample-level comparison represents
              a use case where only two samples (such as a real-time selfie and a stored ID image)
              are used to confirm an identity.
            * subject: if any pair of samples for the same subject pair is a match, the subject pair
              is marked as a match. Some identity verification use cases improve overall accuracy by
              storing multiple samples per identity. Subject-level comparison mirrors this behavior.
        sf : pd.DataFrame, optional
            sensitive feature dataframe with 'subject-id' and a sensitive feature name column, by default None

        Returns
        -------
        pd.DataFrame, pd.Series
            Processed pairs dataframe and sensitive feature vector
        """
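        # Illustrative sketch (not executed): with threshold=90, two sample pairs for the
        # same (source, target) subject pair scoring 85 and 93 get match_prediction 0 and 1;
        # at comparison_level="subject" those rows collapse into a single row for that subject
        # pair, which is predicted as a match because at least one of its sample pairs matched.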

        # A sample pair is predicted to match when its similarity score reaches the threshold
        pairs_processed["match_prediction"] = pairs_processed.apply(
            lambda x: 1 if x["similarity_score"] >= threshold else 0, axis=1
        )
        if comparison_level == "subject":
            # Collapse sample pairs into subject pairs: a subject pair is predicted as a
            # match if any of its sample pairs is predicted as a match
            pairs_processed = pairs_processed.sort_values(
                "match_prediction"
            ).drop_duplicates(
                subset=["source-subject-id", "target-subject-id"], keep="last"
            )

        sf_processed = None
        if sf is not None:
            # Process the data for disaggregated assessment
            # Filter out the pairs with non-matching sensitive feature groups
            # and create the sensitive feature vector
            sf_name = list(sf.columns)
            sf_name.remove("subject-id")
            sf_name = sf_name[0]
            pairs_processed = pairs_processed.merge(
                sf, left_on="source-subject-id", right_on="subject-id", how="left"
            )
            pairs_processed.drop("subject-id", inplace=True, axis=1)
            pairs_processed.rename(
                {sf_name: sf_name + "-source-subject"}, inplace=True, axis=1
            )
            pairs_processed = pairs_processed.merge(
                sf, left_on="target-subject-id", right_on="subject-id", how="left"
            )
            pairs_processed.drop("subject-id", inplace=True, axis=1)
            pairs_processed = pairs_processed.loc[
                pairs_processed[sf_name + "-source-subject"] == pairs_processed[sf_name]
            ]
            sf_processed = pairs_processed[sf_name]
            pairs_processed.drop(
                [sf_name, sf_name + "-source-subject"], inplace=True, axis=1
            )

        return pairs_processed, sf_processed

    def _assess_overall_performance(self):
        """
        Perform overall performance assessment
        """
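        # False match rate is the false positive rate and false non match rate is the false
        # negative rate, both computed from the ground-truth `match` column and the thresholded
        # `match_prediction` column. For example, one non-matching pair predicted as a match
        # out of ten non-matching pairs gives a false match rate of 0.1.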

        overall_performance_res = []
        for threshold in self.similarity_thresholds:
            for level in self.comparison_levels:
                # Overall performance is assessed on all pairs, without sensitive features
                pairs_processed, _ = self._process_data(
                    self.pairs.copy(),
                    threshold=threshold,
                    comparison_level=level,
                )

                fmr = bcf["false_positive_rate"](
                    pairs_processed["match"], pairs_processed["match_prediction"]
                )
                fmr_results = {"false_match_rate-score": [{"value": fmr}]}

                fnmr = bcf["false_negative_rate"](
                    pairs_processed["match"], pairs_processed["match_prediction"]
                )
                fnmr_results = {"false_non_match_rate-score": [{"value": fnmr}]}

                res = {**fmr_results, **fnmr_results}
                res = {k: v for k, v in res.items() if k in METRIC_SUBSET}

                res = [pd.DataFrame(v).assign(metric_type=k) for k, v in res.items()]
                res = pd.concat(res)

                res[["type", "subtype"]] = res.metric_type.str.split("-", expand=True)
                res.drop("metric_type", axis=1, inplace=True)
                parameters_label = {
                    "similarity_threshold": threshold,
                    "comparison_level": level,
                }
                overall_performance_res.append(
                    MetricContainer(
                        res, **self.get_container_info(labels={**parameters_label})
                    )
                )

        return overall_performance_res

    def _assess_disaggregated_performance(self):
        """
        Perform disaggregated performance assessment
        """
        performance_metrics = {
            "false_match_rate": Metric(
                "false_match_rate", "BINARY_CLASSIFICATION", bcf["false_positive_rate"]
            ),
            "false_non_match_rate": Metric(
                "false_non_match_rate",
                "BINARY_CLASSIFICATION",
                bcf["false_negative_rate"],
            ),
        }
        for sf_name in self.sensitive_features_names:
            for threshold in self.similarity_thresholds:
                for level in self.comparison_levels:
                    self._assess_disaggregated_performance_one(
                        sf_name, threshold, level, performance_metrics
                    )

    def _assess_disaggregated_performance_one(
        self, sf_name, threshold, level, performance_metrics
    ):
        """
        Perform disaggregated performance assessment for one combination

        One combination of similarity threshold, comparison level, and sensitive feature

        Parameters
        ----------
        sf_name : str
            sensitive feature name
        threshold : float
            similarity score threshold
        level : str
            comparison level
        performance_metrics : dict
            performance metrics to compute for each sensitive feature group
        """
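        # Flow of this method: compute per-group false match / false non match rates with
        # fairlearn metric frames, reshape them into a long-format table, then summarize the
        # spread across groups. Illustrative sketch (not executed): if the false match rate
        # is 0.10 for one group and 0.05 for another, the parity difference is 0.05 and the
        # parity ratio is 0.5.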

        cols = ["subject-id", sf_name]
        sf = self.subjects_sensitive_features[cols]
        pairs_processed, sf_processed = self._process_data(
            self.pairs.copy(),
            threshold=threshold,
            comparison_level=level,
            sf=sf,
        )

        self.metric_frames = setup_metric_frames(
            performance_metrics,
            prob_metrics=None,
            thresh_metrics=None,
            y_pred=pairs_processed["match_prediction"],
            y_prob=None,
            y_true=pairs_processed["match"],
            sensitive_features=sf_processed,
        )

        disaggregated_df = pd.DataFrame()
        for name, metric_frame in self.metric_frames.items():
            df = metric_frame.by_group.copy().convert_dtypes()
            disaggregated_df = pd.concat([disaggregated_df, df], axis=1)
        disaggregated_results = disaggregated_df.reset_index().melt(
            id_vars=[disaggregated_df.index.name],
            var_name="type",
        )
        disaggregated_results.name = "disaggregated_performance"

        sens_feat_label = {"sensitive_feature": sf_name}
        metric_type_label = {
            "metric_types": disaggregated_results.type.unique().tolist()
        }
        parameters_label = {
            "similarity_threshold": threshold,
            "comparison_level": level,
        }
        if disaggregated_results is not None:
            e = TableContainer(
                disaggregated_results,
                **self.get_container_info(
                    labels={
                        **sens_feat_label,
                        **metric_type_label,
                        **parameters_label,
                    }
                ),
            )
            self._results.append(e)

        parity_diff = (
            disaggregated_results.groupby("type")
            .apply(lambda x: max(x.value) - min(x.value))
            .to_dict()
        )

        fmr_parity_difference = {
            "false_match_rate_parity_difference-score": [
                {"value": parity_diff["false_match_rate"]}
            ]
        }
        fnmr_parity_difference = {
            "false_non_match_rate_parity_difference-score": [
                {"value": parity_diff["false_non_match_rate"]}
            ]
        }

        # Parity ratio is the ratio of the smallest to the largest group-level value
        parity_ratio = (
            disaggregated_results.groupby("type")
            .apply(lambda x: min(x.value) / max(x.value) if max(x.value) > 0 else 0)
            .to_dict()
        )

        fmr_parity_ratio = {
            "false_match_rate_parity_ratio-score": [
                {"value": parity_ratio["false_match_rate"]}
            ]
        }
        fnmr_parity_ratio = {
            "false_non_match_rate_parity_ratio-score": [
                {"value": parity_ratio["false_non_match_rate"]}
            ]
        }

        res = {
            **fmr_parity_difference,
            **fnmr_parity_difference,
            **fmr_parity_ratio,
            **fnmr_parity_ratio,
        }
        res = {k: v for k, v in res.items() if k in METRIC_SUBSET}

        res = [pd.DataFrame(v).assign(metric_type=k) for k, v in res.items()]
        res = pd.concat(res)

        res[["type", "subtype"]] = res.metric_type.str.split("-", expand=True)
        res.drop("metric_type", axis=1, inplace=True)
        self._results.append(
            MetricContainer(
                res,
                **self.get_container_info(
                    labels={**parameters_label, **sens_feat_label}
                ),
            )
        )