Coverage for credoai/evaluators/data_profiler.py: 94%

34 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2022-12-08 07:32 +0000

1# reset style after pandas profiler 

2 

3import matplotlib 

4import pandas as pd 

5from connect.evidence.lens_evidence import DataProfilerContainer 

6 

7from credoai.artifacts.data.tabular_data import TabularData 

8from credoai.evaluators import Evaluator 

9from credoai.evaluators.utils.validation import check_data_instance 

10 

11backend = matplotlib.get_backend() 

12# load pandas profiler, which sets the matplotlib backend to Agg 

13from pandas_profiling import ProfileReport 

14 

15matplotlib.use(backend) 

16 

17 

class DataProfiler(Evaluator):
    """
    Data profiling evaluator for Credo AI.

    Builds a pandas-profiling ``ProfileReport`` over the dataset's features
    (``data.X``) and target (``data.y``) joined column-wise. The profiler
    computes a collection of descriptive statistics about the data.

    Parameters
    ----------
    dataset_name: str
        Name of the dataset
    profile_kwargs
        Extra keyword arguments forwarded to pandas_profiling.ProfileReport.
        They override the defaults used here (``title="Dataset"``,
        ``minimal=True``).
    """

    required_artifacts = {"data"}

    def __init__(self, dataset_name=None, **profile_kwargs):
        # TODO: check utility of dataset_name; it is stored here but not read
        # by any other method of this class.
        self.dataset_name = dataset_name
        self.profile_kwargs = profile_kwargs
        super().__init__()

    def _validate_arguments(self):
        """Run ``check_data_instance`` on the data artifact against TabularData."""
        check_data_instance(self.data, TabularData)
        return self

    def _setup(self):
        """Join features and target side by side into ``data_to_profile``."""
        features = self.data.X
        target = self.data.y
        self.data_to_profile = pd.concat([features, target], axis=1)
        return self

    def evaluate(self):
        """Generates data profile reports"""
        report = self._create_reporter()
        container_info = self.get_container_info()
        container = DataProfilerContainer(report, **container_info)
        self.results = [container]
        return self

    def get_html_report(self):
        """Build a fresh report and return the result of its ``to_html()``."""
        report = self._create_reporter()
        return report.to_html()

    def profile_data(self):
        """Build a fresh report and return its ``to_notebook_iframe()`` result."""
        report = self._create_reporter()
        return report.to_notebook_iframe()

    def _create_reporter(self):
        """Create a ProfileReport for ``data_to_profile``.

        User-supplied ``profile_kwargs`` are layered on top of the defaults,
        so a caller-provided ``title`` or ``minimal`` wins.
        """
        report_options = {
            "title": "Dataset",
            "minimal": True,
            **self.profile_kwargs,
        }
        return ProfileReport(self.data_to_profile, **report_options)

63 default_kwargs = {"title": "Dataset", "minimal": True} 

64 default_kwargs.update(self.profile_kwargs) 

65 return ProfileReport(self.data_to_profile, **default_kwargs)