Coverage for credoai/evaluators/data_profiler.py: 93%
61 statements
« prev ^ index » next coverage.py v7.1.0, created at 2023-02-13 21:56 +0000
« prev ^ index » next coverage.py v7.1.0, created at 2023-02-13 21:56 +0000
1# reset style after pandas profiler
3import matplotlib
4import pandas as pd
5from connect.evidence import TableContainer
6from connect.evidence.lens_evidence import DataProfilerContainer
8from credoai.artifacts.data.base_data import Data
9from credoai.evaluators.evaluator import Evaluator
10from credoai.evaluators.utils import check_data_instance
11from credoai.utils.common import ValidationError, check_pandas
13backend = matplotlib.get_backend()
14# load pandas profiler, which sets backend to Agg
15from pandas_profiling import ProfileReport
17matplotlib.use(backend)
class DataProfiler(Evaluator):
    """
    Data profiling evaluator for Credo AI

    This evaluator runs the pandas profiler on a data. Pandas profiler calculates a number
    of descriptive statistics about the data. The DataProfiler can only be run on parts of the
    data that are pandas objects (dataframes or series). E.g., if X is a multi-dimensional
    array, X will NOT be profiled.

    Parameters
    ----------
    profile_kwargs
        Potential arguments to be passed to pandas_profiling.ProfileReport
    """

    required_artifacts = {"data"}

    def __init__(self, **profile_kwargs):
        # Stored before calling the base initializer; the kwargs are forwarded
        # untouched to create_report() in evaluate().
        self.profile_kwargs = profile_kwargs
        super().__init__()

    def _validate_arguments(self):
        """Check that ``self.data`` is a ``Data`` artifact and return ``self``."""
        # NOTE(review): self.data is presumably attached by the Evaluator base
        # class before this hook runs -- confirm against Evaluator.
        check_data_instance(self.data, Data)
        return self

    def _setup(self):
        """
        Build the single pandas object that will be profiled.

        X, y and the sensitive features are filtered with ``check_pandas`` and
        the surviving pieces are concatenated column-wise into
        ``self.data_to_profile``.

        Raises
        ------
        ValidationError
            If none of X, y or sensitive features passes ``check_pandas``.
        """
        data_subsets = [self.data.X, self.data.y, self.data.sensitive_features]
        pandas_subsets = [subset for subset in data_subsets if check_pandas(subset)]
        if not pandas_subsets:
            raise ValidationError(
                "At least one of X, y or sensitive features must exist and be a pandas object"
            )
        self.data_to_profile = pd.concat(pandas_subsets, axis=1)
        return self

    def evaluate(self):
        """Generates data profile reports"""
        profile = create_report(self.data_to_profile, **self.profile_kwargs)
        metadata = self._get_column_meta()
        results = DataProfilerContainer(profile, **self.get_info(metadata=metadata))
        # _wrap_sensitive_counts() always returns a list (possibly empty), so
        # this concatenation is safe when there are no sensitive features.
        self.results = [results] + self._wrap_sensitive_counts()
        return self

    def _get_column_meta(self):
        """
        Describe which columns of the profiled data play which role.

        Returns
        -------
        dict
            May contain ``"model_features"`` (columns of X),
            ``"sensitive_features"`` (columns of the sensitive features) and
            either ``"target"`` (name of a Series y) or ``"targets"`` (columns
            of a DataFrame y). A key is only present when the corresponding
            part of the data passes the check below.
        """
        metadata = {}
        if check_pandas(self.data.X):
            metadata["model_features"] = self.data.X.columns.tolist()
        if check_pandas(self.data.sensitive_features):
            metadata["sensitive_features"] = self.data.sensitive_features.columns.tolist()
        if isinstance(self.data.y, pd.Series):
            metadata["target"] = self.data.y.name
        elif isinstance(self.data.y, pd.DataFrame):
            metadata["targets"] = self.data.y.columns.tolist()
        return metadata

    def _wrap_sensitive_counts(self):
        """
        Wrap each sensitive-feature distribution table in a ``TableContainer``.

        Returns
        -------
        list
            One ``TableContainer`` per table returned by
            ``sensitive_feature_counts``; an empty list when there are none.
        """
        counts = sensitive_feature_counts(self.data)
        if not counts:
            # sensitive_feature_counts() returns None when the data has no
            # sensitive features. Return an empty list rather than falling
            # through to an implicit None, which would make the list
            # concatenation in evaluate() raise TypeError.
            return []
        return [TableContainer(count) for count in counts]
def create_report(df, **profile_kwargs):
    """
    Build a pandas-profiling report for ``df``.

    The report is titled "Dataset" and uses minimal mode unless the caller
    overrides those settings; every keyword argument is forwarded to
    ``ProfileReport`` and takes precedence over the defaults.
    """
    # Caller-supplied options are unpacked last so they win over the defaults.
    report_options = {"title": "Dataset", "minimal": True, **profile_kwargs}
    return ProfileReport(df, **report_options)
def sensitive_feature_counts(data):
    """
    Tabulate the value distribution of every sensitive feature in ``data``.

    Returns ``None`` when ``data.sensitive_features`` is ``None``. Otherwise
    returns a list with one DataFrame per sensitive-feature column, holding
    "Count" and "Proportion" columns, with its ``name`` attribute set to
    "<column name> Distribution".
    """
    features = data.sensitive_features
    if features is None:
        return None

    distributions = []
    for feature_name, values in features.items():
        absolute = values.value_counts()
        relative = values.value_counts(normalize=True)
        table = pd.concat([absolute, relative], axis=1)
        table.columns = ["Count", "Proportion"]
        # Plain attribute on the frame, used to label the resulting table.
        table.name = f"{feature_name} Distribution"
        distributions.append(table)
    return distributions