Coverage for credoai/evaluators/identity_verification.py: 98%

112 statements  

coverage.py v7.1.0, created at 2023-02-13 21:56 +0000

1"""Identity Verification evaluator""" 

2import pandas as pd 

3from connect.evidence import MetricContainer, TableContainer 

4 

5from credoai.artifacts import ComparisonData, ComparisonModel 

6from credoai.artifacts.model.comparison_model import DummyComparisonModel 

7from credoai.evaluators.evaluator import Evaluator 

8from credoai.evaluators.utils.fairlearn import setup_metric_frames 

9from credoai.evaluators.utils.validation import ( 

10 check_data_instance, 

11 check_existence, 

12 check_model_instance, 

13) 

14from credoai.modules.constants_metrics import BINARY_CLASSIFICATION_FUNCTIONS as bcf 

15from credoai.modules.metrics import Metric 

16 
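# Metrics reported by this evaluator; the "-score" suffix is split off into a
# subtype column when results are packaged into MetricContainers.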

METRIC_SUBSET = [
    "false_match_rate-score",
    "false_non_match_rate-score",
    "false_match_rate_parity_difference-score",
    "false_non_match_rate_parity_difference-score",
    "false_match_rate_parity_ratio-score",
    "false_non_match_rate_parity_ratio-score",
]



class IdentityVerification(Evaluator):
    """
    Pair-wise-comparison-based identity verification evaluator for Credo AI (Experimental)

    This evaluator takes in identity verification data and
    provides functionality to perform performance and fairness assessment


    Parameters
    ----------
    pairs : pd.DataFrame of shape (n_pairs, 4)
        Dataframe where each row represents a data sample pair and the associated subjects.
        The type of data sample is determined by the ComparisonModel's `compare` function, which takes
        data sample pairs and returns their similarity scores. Examples are selfies, fingerprint scans,
        or voice recordings of a person.

        Required columns:

        * source-subject-id: unique identifier of the source subject
        * source-subject-data-sample: data sample from the source subject
        * target-subject-id: unique identifier of the target subject
        * target-subject-data-sample: data sample from the target subject


    subjects_sensitive_features : pd.DataFrame of shape (n_subjects, n_sensitive_feature_names), optional
        Sensitive features of all subjects present in the pairs dataframe.
        If provided, disaggregated performance assessment is also performed.
        These can be columns you want to perform segmentation analysis on, or
        features related to fairness like 'race' or 'gender'.

        Required columns:

        * subject-id: ids of the subjects; must cover all subjects included in the `pairs` dataframe.
          All other columns are treated as sensitive features and can have arbitrary names.


    similarity_thresholds : list
        List of similarity score thresholds.
        A similarity score greater than or equal to a threshold counts as a match.
    comparison_levels : list
        List of comparison levels. Options:

        * sample: a match decision is made for every sample pair. Sample-level comparison represents
          a use case where only two samples (such as a real-time selfie and a stored ID image) are
          used to confirm an identity.
        * subject: if any pair of samples for the same subject is a match, the subject pair
          is marked as a match. Some identity verification use cases improve overall accuracy by storing
          multiple samples per identity. Subject-level comparison mirrors this behavior.


    Example
    --------

    >>> import pandas as pd
    >>> from credoai.lens import Lens
    >>> from credoai.artifacts import ComparisonData, ComparisonModel
    >>> from credoai.evaluators import IdentityVerification
    >>> evaluator = IdentityVerification(similarity_thresholds=[60, 99])
    >>> import doctest
    >>> doctest.ELLIPSIS_MARKER = '-etc-'
    >>> pairs = pd.DataFrame({
    ...     'source-subject-id': ['s0', 's0', 's0', 's0', 's1', 's1', 's1', 's1', 's1', 's2'],
    ...     'source-subject-data-sample': ['s00', 's00', 's00', 's00', 's10', 's10', 's10', 's11', 's11', 's20'],
    ...     'target-subject-id': ['s1', 's1', 's2', 's3', 's1', 's2', 's3', 's2', 's3', 's3'],
    ...     'target-subject-data-sample': ['s10', 's11', 's20', 's30', 's11', 's20', 's30', 's20', 's30', 's30']
    ... })
    >>> subjects_sensitive_features = pd.DataFrame({
    ...     'subject-id': ['s0', 's1', 's2', 's3'],
    ...     'gender': ['female', 'male', 'female', 'female']
    ... })
    >>> class FaceCompare:
    ...     # a dummy selfie comparison model
    ...     def compare(self, pairs):
    ...         similarity_scores = [31.5, 16.7, 20.8, 84.4, 12.0, 15.2, 45.8, 23.5, 28.5, 44.5]
    ...         return similarity_scores
    >>> face_compare = FaceCompare()
    >>> credo_data = ComparisonData(
    ...     name="face-data",
    ...     pairs=pairs,
    ...     subjects_sensitive_features=subjects_sensitive_features
    ... )
    >>> credo_model = ComparisonModel(
    ...     name="face-compare",
    ...     model_like=face_compare
    ... )
    >>> pipeline = Lens(model=credo_model, assessment_data=credo_data)
    >>> pipeline.add(evaluator) # doctest: +ELLIPSIS
    -etc-
    >>> pipeline.run() # doctest: +ELLIPSIS
    -etc-
    >>> pipeline.get_results() # doctest: +ELLIPSIS
    -etc-

    """


    required_artifacts = {"model", "assessment_data"}

    def __init__(
        self,
        similarity_thresholds: list = [90, 95, 99],
        comparison_levels: list = ["sample", "subject"],
    ):
        self.similarity_thresholds = similarity_thresholds
        self.comparison_levels = comparison_levels
        super().__init__()

    def _validate_arguments(self):
        check_data_instance(self.assessment_data, ComparisonData)
        check_model_instance(self.model, (ComparisonModel, DummyComparisonModel))
        check_existence(self.assessment_data.pairs, "pairs")
        return self

    def _setup(self):
        self.pairs = self.assessment_data.pairs
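        # Sensitive features are optional: fall back to None when the assessment
        # data does not provide them; disaggregated assessment is then skipped.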

        try:
            self.subjects_sensitive_features = (
                self.assessment_data.subjects_sensitive_features
            )
            sensitive_features_names = list(self.subjects_sensitive_features.columns)
            sensitive_features_names.remove("subject-id")
            self.sensitive_features_names = sensitive_features_names
        except:
            self.subjects_sensitive_features = None

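        # Score every (source, target) data sample pair with the wrapped comparison
        # model and store the similarity scores as a new column of `pairs`.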

        self.pairs["similarity_score"] = self.model.compare(
            [
                list(pair)
                for pair in zip(
                    self.pairs["source-subject-data-sample"].tolist(),
                    self.pairs["target-subject-data-sample"].tolist(),
                )
            ]
        )

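        # Ground truth: a pair is a true match when the source and target subject ids are identical.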

        self.pairs["match"] = self.pairs.apply(
            lambda x: 1 if x["source-subject-id"] == x["target-subject-id"] else 0,
            axis=1,
        )

        return self


    def evaluate(self):
        """
        Runs the assessment process

        Returns
        -------
        self
            The evaluator itself, with the assessment results stored in its
            results attribute
        """

        self.results = self._assess_overall_performance()

        if self.subjects_sensitive_features is not None:
            self._assess_disaggregated_performance()

        return self


    def _process_data(
        self, pairs_processed, threshold=90, comparison_level="sample", sf=None
    ):
        """
        Process the pairs and sensitive features dataframes

        Parameters
        ----------
        pairs_processed : pd.DataFrame
            pairs dataframe to be processed
        threshold : float, optional
            similarity threshold; scores greater than or equal to it count as a match, by default 90
        comparison_level : str, optional
            comparison level, by default "sample"
            Options:
                sample: a match decision is made for every sample pair. Sample-level comparison represents
                    a use case where only two samples (such as a real-time selfie and a stored ID image) are
                    used to confirm an identity.
                subject: if any pair of samples for the same subject is a match, the subject pair
                    is marked as a match. Some identity verification use cases improve overall accuracy by storing
                    multiple samples per identity. Subject-level comparison mirrors this behavior.
        sf : pd.DataFrame, optional
            sensitive feature dataframe with 'subject-id' and sensitive feature name columns, by default None

        Returns
        -------
        pd.DataFrame, pd.DataFrame
            Processed pairs and sensitive features dataframes
        """

        pairs_processed["match_prediction"] = pairs_processed.apply(
            lambda x: 1 if x["similarity_score"] >= threshold else 0, axis=1
        )
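        # Subject-level comparison: collapse sample pairs to a single row per
        # (source subject, target subject) pair.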

        if comparison_level == "subject":
            # keep the row with the highest match_prediction, so a subject pair is
            # predicted as a match if any of its sample pairs is predicted as a match
            pairs_processed = pairs_processed.sort_values(
                "match_prediction"
            ).drop_duplicates(
                subset=["source-subject-id", "target-subject-id"], keep="last"
            )


        sf_processed = None
        if sf is not None:
            # Process the data for disaggregated assessment
            # Filter out the pairs with non-matching sensitive feature groups
            # and create the sensitive feature vector
            sf_name = list(sf.columns)
            sf_name.remove("subject-id")
            sf_name = sf_name[0]
            pairs_processed = pairs_processed.merge(
                sf, left_on="source-subject-id", right_on="subject-id", how="left"
            )
            pairs_processed.drop("subject-id", inplace=True, axis=1)
            pairs_processed.rename(
                {sf_name: sf_name + "-source-subject"}, inplace=True, axis=1
            )
            pairs_processed = pairs_processed.merge(
                sf, left_on="target-subject-id", right_on="subject-id", how="left"
            )
            pairs_processed.drop("subject-id", inplace=True, axis=1)
            pairs_processed = pairs_processed.loc[
                pairs_processed[sf_name + "-source-subject"] == pairs_processed[sf_name]
            ]
            sf_processed = pairs_processed[sf_name]
            pairs_processed.drop(
                [sf_name, sf_name + "-source-subject"], inplace=True, axis=1
            )

        return pairs_processed, sf_processed


    def _assess_overall_performance(self):
        """
        Perform overall performance assessment
        """
        overall_performance_res = []
        for threshold in self.similarity_thresholds:
            for level in self.comparison_levels:

                # Overall performance does not require sensitive features;
                # false match / non-match rates are computed over all pairs
                pairs_processed, _ = self._process_data(
                    self.pairs.copy(),
                    threshold=threshold,
                    comparison_level=level,
                )

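                # Under the match/no-match framing, the false match rate is the
                # false positive rate and the false non-match rate is the false
                # negative rate of the binary classifier.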

                fmr = bcf["false_positive_rate"](
                    pairs_processed["match"], pairs_processed["match_prediction"]
                )
                fmr_results = {"false_match_rate-score": [{"value": fmr}]}

                fnmr = bcf["false_negative_rate"](
                    pairs_processed["match"], pairs_processed["match_prediction"]
                )
                fnmr_results = {"false_non_match_rate-score": [{"value": fnmr}]}

                res = {**fmr_results, **fnmr_results}
                res = {k: v for k, v in res.items() if k in METRIC_SUBSET}

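                # Reshape into the MetricContainer format: one row per metric, with the
                # metric name split into type ("false_match_rate") and subtype ("score").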

                res = [pd.DataFrame(v).assign(metric_type=k) for k, v in res.items()]
                res = pd.concat(res)

                res[["type", "subtype"]] = res.metric_type.str.split("-", expand=True)
                res.drop("metric_type", axis=1, inplace=True)
                parameters_label = {
                    "similarity_threshold": threshold,
                    "comparison_level": level,
                }
                overall_performance_res.append(
                    MetricContainer(res, **self.get_info(labels={**parameters_label}))
                )

        return overall_performance_res


    def _assess_disaggregated_performance(self):
        """
        Perform disaggregated performance assessment
        """

        performance_metrics = {
            "false_match_rate": Metric(
                "false_match_rate", "BINARY_CLASSIFICATION", bcf["false_positive_rate"]
            ),
            "false_non_match_rate": Metric(
                "false_non_match_rate",
                "BINARY_CLASSIFICATION",
                bcf["false_negative_rate"],
            ),
        }
        for sf_name in self.sensitive_features_names:
            for threshold in self.similarity_thresholds:
                for level in self.comparison_levels:
                    self._assess_disaggregated_performance_one(
                        sf_name, threshold, level, performance_metrics
                    )


    def _assess_disaggregated_performance_one(
        self, sf_name, threshold, level, performance_metrics
    ):
        """
        Perform disaggregated performance assessment for one combination

        One combination of similarity threshold, comparison level, and sensitive feature

        Parameters
        ----------
        sf_name : str
            sensitive feature name
        threshold : float
            similarity threshold
        level : str
            comparison level
        performance_metrics : dict
            performance metrics
        """

        cols = ["subject-id", sf_name]
        sf = self.subjects_sensitive_features[cols]
        pairs_processed, sf_processed = self._process_data(
            self.pairs.copy(),
            threshold=threshold,
            comparison_level=level,
            sf=sf,
        )

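        # Compute the metrics per group of the sensitive feature, restricted to
        # pairs whose source and target subjects share the same group (see _process_data).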

        self.metric_frames = setup_metric_frames(
            performance_metrics,
            y_pred=pairs_processed["match_prediction"],
            y_prob=None,
            y_true=pairs_processed["match"],
            sensitive_features=sf_processed,
        )

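        # Collect the per-group values of each metric and melt them into long
        # format: one row per (sensitive feature group, metric type).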

        disaggregated_df = pd.DataFrame()
        for name, metric_frame in self.metric_frames.items():
            df = metric_frame.by_group.copy().convert_dtypes()
            disaggregated_df = pd.concat([disaggregated_df, df], axis=1)
        disaggregated_results = disaggregated_df.reset_index().melt(
            id_vars=[disaggregated_df.index.name],
            var_name="type",
        )
        disaggregated_results.name = "disaggregated_performance"


        sens_feat_label = {"sensitive_feature": sf_name}
        metric_type_label = {
            "metric_types": disaggregated_results.type.unique().tolist()
        }
        parameters_label = {
            "similarity_threshold": threshold,
            "comparison_level": level,
        }
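        # Package the disaggregated table as evidence, labeled with the sensitive
        # feature, the metric types it contains, and the assessment parameters.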

        if disaggregated_results is not None:
            e = TableContainer(
                disaggregated_results,
                **self.get_info(
                    labels={
                        **sens_feat_label,
                        **metric_type_label,
                        **parameters_label,
                    }
                ),
            )
            self._results.append(e)

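        # Parity difference: spread (max - min) of each metric across the groups
        # of the sensitive feature.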

        parity_diff = (
            disaggregated_results.groupby("type")
            .apply(lambda x: max(x.value) - min(x.value))
            .to_dict()
        )


        fmr_parity_difference = {
            "false_match_rate_parity_difference-score": [
                {"value": parity_diff["false_match_rate"]}
            ]
        }
        fnmr_parity_difference = {
            "false_non_match_rate_parity_difference-score": [
                {"value": parity_diff["false_non_match_rate"]}
            ]
        }


        # Parity ratio: ratio of the smallest to the largest group value of each
        # metric (0 when the metric is 0 for every group)
        parity_ratio = (
            disaggregated_results.groupby("type")
            .apply(lambda x: min(x.value) / max(x.value) if max(x.value) > 0 else 0)
            .to_dict()
        )


        fmr_parity_ratio = {
            "false_match_rate_parity_ratio-score": [
                {"value": parity_ratio["false_match_rate"]}
            ]
        }
        fnmr_parity_ratio = {
            "false_non_match_rate_parity_ratio-score": [
                {"value": parity_ratio["false_non_match_rate"]}
            ]
        }


        res = {
            **fmr_parity_difference,
            **fnmr_parity_difference,
            **fmr_parity_ratio,
            **fnmr_parity_ratio,
        }
        res = {k: v for k, v in res.items() if k in METRIC_SUBSET}

        res = [pd.DataFrame(v).assign(metric_type=k) for k, v in res.items()]
        res = pd.concat(res)

        res[["type", "subtype"]] = res.metric_type.str.split("-", expand=True)
        res.drop("metric_type", axis=1, inplace=True)
        self._results.append(
            MetricContainer(
                res,
                **self.get_info(labels={**parameters_label, **sens_feat_label}),
            )
        )