Coverage for credoai/evaluators/data_profiler.py: 93%

61 statements  

« prev     ^ index     » next       coverage.py v7.1.0, created at 2023-02-13 21:56 +0000

1# reset style after pandas profiler 

2 

3import matplotlib 

4import pandas as pd 

5from connect.evidence import TableContainer 

6from connect.evidence.lens_evidence import DataProfilerContainer 

7 

8from credoai.artifacts.data.base_data import Data 

9from credoai.evaluators.evaluator import Evaluator 

10from credoai.evaluators.utils import check_data_instance 

11from credoai.utils.common import ValidationError, check_pandas 

12 

# Remember the matplotlib backend that is active before pandas_profiling is imported.
backend = matplotlib.get_backend()
# Load pandas profiler, which sets the backend to Agg.
from pandas_profiling import ProfileReport

# Restore the backend that was active before the import above.
matplotlib.use(backend)

18 

19 

class DataProfiler(Evaluator):
    """
    Data profiling evaluator for Credo AI

    This evaluator runs the pandas profiler on a dataset. Pandas profiler calculates a number
    of descriptive statistics about the data. The DataProfiler can only be run on parts of the
    data that are pandas objects (dataframes or series). E.g., if X is a multi-dimensional
    array, X will NOT be profiled.

    Parameters
    ----------
    profile_kwargs
        Potential arguments to be passed to pandas_profiling.ProfileReport
    """

    required_artifacts = {"data"}

    def __init__(self, **profile_kwargs):
        # Stored before the base initializer runs; forwarded to create_report in evaluate().
        self.profile_kwargs = profile_kwargs
        super().__init__()

    def _validate_arguments(self):
        """Validate the assigned data artifact against ``Data`` via check_data_instance."""
        check_data_instance(self.data, Data)
        return self

    def _setup(self):
        """Assemble the frame that will be profiled.

        Keeps whichever of X, y and sensitive_features pass ``check_pandas`` and
        concatenates them column-wise into ``self.data_to_profile``.

        Raises
        ------
        ValidationError
            If none of X, y or sensitive_features passes ``check_pandas``.
        """
        data_subsets = [self.data.X, self.data.y, self.data.sensitive_features]
        pandas_subsets = [subset for subset in data_subsets if check_pandas(subset)]
        if not pandas_subsets:
            raise ValidationError(
                "At least one of X, y or sensitive features must exist and be a pandas object"
            )
        self.data_to_profile = pd.concat(pandas_subsets, axis=1)
        return self

    def evaluate(self):
        """Generates data profile reports"""
        profile = create_report(self.data_to_profile, **self.profile_kwargs)
        metadata = self._get_column_meta()
        results = DataProfilerContainer(profile, **self.get_info(metadata=metadata))
        # _wrap_sensitive_counts always returns a list, so this concatenation is
        # safe even when the data has no sensitive features.
        self.results = [results] + self._wrap_sensitive_counts()
        return self

    def _get_column_meta(self):
        """Describe which columns are model features, sensitive features and targets.

        Returns
        -------
        dict
            May contain "model_features", "sensitive_features" and either
            "target" (y is a Series) or "targets" (y is a DataFrame).
        """
        metadata = {}
        # NOTE(review): `.columns` assumes X and sensitive_features are DataFrames
        # whenever check_pandas accepts them -- confirm a Series cannot reach here.
        if check_pandas(self.data.X):
            metadata["model_features"] = self.data.X.columns.tolist()
        if check_pandas(self.data.sensitive_features):
            metadata[
                "sensitive_features"
            ] = self.data.sensitive_features.columns.tolist()
        if isinstance(self.data.y, pd.Series):
            metadata["target"] = self.data.y.name
        elif isinstance(self.data.y, pd.DataFrame):
            metadata["targets"] = self.data.y.columns.tolist()
        return metadata

    def _wrap_sensitive_counts(self):
        """Wrap each sensitive-feature distribution table in a TableContainer.

        Returns
        -------
        list
            One TableContainer per distribution table; an empty list when
            ``sensitive_feature_counts`` yields nothing (None or no tables).
        """
        counts = sensitive_feature_counts(self.data)
        if not counts:
            # Previously fell through and returned None, which made
            # `[results] + None` in evaluate() raise a TypeError.
            return []
        return [TableContainer(count) for count in counts]

81 

82 

def create_report(df, **profile_kwargs):
    """Build a pandas profiler report for ``df``.

    The report defaults to ``title="Dataset"`` and ``minimal=True``; any
    keyword argument supplied by the caller overrides the matching default
    and is otherwise passed straight through to ``ProfileReport``.
    """
    report_options = {"title": "Dataset", "minimal": True, **profile_kwargs}
    return ProfileReport(df, **report_options)

88 

89 

def sensitive_feature_counts(data):
    """Tabulate the value distribution of every sensitive feature of ``data``.

    Returns None when ``data.sensitive_features`` is None. Otherwise returns a
    list with one DataFrame per sensitive-feature column, holding the absolute
    ("Count") and relative ("Proportion") frequency of each value; each frame
    carries a ``name`` attribute of the form "<column> Distribution".
    """
    features = data.sensitive_features
    if features is None:
        return None

    def _distribution_table(feature_name, values):
        # Absolute and normalized frequencies side by side, one row per value.
        absolute = values.value_counts()
        relative = values.value_counts(normalize=True)
        table = pd.concat([absolute, relative], axis=1)
        table.columns = ["Count", "Proportion"]
        table.name = f"{feature_name} Distribution"
        return table

    return [
        _distribution_table(feature_name, values)
        for feature_name, values in features.items()
    ]