Coverage for credoai/evaluators/data_profiler.py: 94%
34 statements
« prev ^ index » next coverage.py v6.5.0, created at 2022-12-08 07:32 +0000
« prev ^ index » next coverage.py v6.5.0, created at 2022-12-08 07:32 +0000
1# reset style after pandas profiler
3import matplotlib
4import pandas as pd
5from connect.evidence.lens_evidence import DataProfilerContainer
7from credoai.artifacts.data.tabular_data import TabularData
8from credoai.evaluators import Evaluator
9from credoai.evaluators.utils.validation import check_data_instance
11backend = matplotlib.get_backend()
12# load pandas profiler, which sets backend to Agg
13from pandas_profiling import ProfileReport
15matplotlib.use(backend)
class DataProfiler(Evaluator):
    """
    Data profiling evaluator for Credo AI.

    Builds a ``pandas_profiling.ProfileReport`` over the data artifact. The
    pandas profiler calculates a number of descriptive statistics about the
    data.

    Parameters
    ----------
    dataset_name : str, optional
        Name of the dataset. It is stored on the instance; nothing else in
        this class reads it.
    profile_kwargs
        Keyword arguments forwarded to ``pandas_profiling.ProfileReport``.
        They take precedence over the defaults ``title="Dataset"`` and
        ``minimal=True``.
    """

    required_artifacts = {"data"}

    def __init__(self, dataset_name=None, **profile_kwargs):
        # TODO: check utility of this
        self.dataset_name = dataset_name
        self.profile_kwargs = profile_kwargs
        super().__init__()

    def _validate_arguments(self):
        """Run ``check_data_instance`` on the data artifact against ``TabularData``."""
        check_data_instance(self.data, TabularData)
        return self

    def _setup(self):
        """Concatenate ``data.X`` and ``data.y`` column-wise into ``data_to_profile``."""
        frames = [self.data.X, self.data.y]
        self.data_to_profile = pd.concat(frames, axis=1)
        return self

    def evaluate(self):
        """Generates data profile reports.

        A new profile report is wrapped in a ``DataProfilerContainer`` and
        stored as the single element of ``self.results``.

        Returns
        -------
        DataProfiler
            This evaluator instance.
        """
        report = self._create_reporter()
        container_info = self.get_container_info()
        container = DataProfilerContainer(report, **container_info)
        self.results = [container]
        return self

    def get_html_report(self):
        """Return the output of ``to_html()`` from a newly created profile report."""
        reporter = self._create_reporter()
        return reporter.to_html()

    def profile_data(self):
        """Return the output of ``to_notebook_iframe()`` from a newly created profile report."""
        reporter = self._create_reporter()
        return reporter.to_notebook_iframe()

    def _create_reporter(self):
        """Instantiate a ``ProfileReport`` for ``self.data_to_profile``.

        User-supplied ``profile_kwargs`` override the built-in defaults.
        """
        report_kwargs = {"title": "Dataset", "minimal": True, **self.profile_kwargs}
        return ProfileReport(self.data_to_profile, **report_kwargs)