Coverage for credoai/evaluators/utils/validation.py: 70%

91 statements  

« prev     ^ index     » next       coverage.py v7.1.0, created at 2023-02-13 21:56 +0000

1############# Validation related functionality ################## 

2 

3import inspect 

4 

5import numpy as np 

6import pandas as pd 

7 

8try: 

9 tf_exists = True 

10 import tensorflow as tf 

11except ImportError: 

12 tf_exists = False 

13 

14from credoai.artifacts.data.tabular_data import TabularData 

15from credoai.artifacts.model.base_model import Model 

16from credoai.utils import global_logger 

17from credoai.utils.common import ValidationError 

18 

19############################## 

20# Checking individual artifacts 

21############################## 

22 

23 

def check_instance(obj, inst_type, message=None):
    """Raise ``ValidationError`` unless ``obj`` is an instance of ``inst_type``.

    Parameters
    ----------
    obj : object
        Object to validate.
    inst_type : type or tuple of types
        Accepted type(s), in any form ``isinstance`` understands.
    message : str, optional
        Error text to raise with. When omitted (or empty) a default message
        naming the expected type(s) is built.

    Raises
    ------
    ValidationError
        If ``obj`` is not an instance of ``inst_type``.
    """
    if isinstance(obj, inst_type):
        return

    # Build the default message only on failure: formatting ``obj`` can be
    # costly, and a tuple of types has no ``__name__`` of its own.
    if not message:
        accepted = inst_type if isinstance(inst_type, tuple) else (inst_type,)
        type_label = " or ".join(getattr(t, "__name__", str(t)) for t in accepted)
        message = f"Object {obj} should be an instance of {type_label}"
    raise ValidationError(message)

29 

30 

def check_data_instance(obj, inst_type, name="Data"):
    """Validate that a data artifact is an instance of ``inst_type``.

    Parameters
    ----------
    obj : object
        Data artifact to validate.
    inst_type : type or tuple of types
        Accepted type(s).
    name : str, optional
        Label used for the artifact in the error message, by default "Data".

    Raises
    ------
    ValidationError
        Raised by ``check_instance`` when ``obj`` is not of an accepted type.
    """
    # A tuple of types has no ``__name__``; spell out each alternative, the
    # same way check_model_instance does.
    if isinstance(inst_type, tuple):
        comp_label = " or ".join(x.__name__ for x in inst_type)
    else:
        comp_label = inst_type.__name__
    message = f"{name} should be an instance of {comp_label}"
    check_instance(obj, inst_type, message)

34 

35 

def check_model_instance(obj, inst_type, name="Model"):
    """Validate that a model artifact is an instance of ``inst_type``.

    Parameters
    ----------
    obj : object
        Model artifact to validate.
    inst_type : type or tuple of types
        Accepted type(s); the names of a tuple's members are joined with
        " or " in the error message.
    name : str, optional
        Label used for the artifact in the error message, by default "Model".

    Raises
    ------
    ValidationError
        Raised by ``check_instance`` when ``obj`` is not of an accepted type.
    """
    # Normalise to a tuple so one code path names single and multiple types.
    accepted = inst_type if isinstance(inst_type, tuple) else (inst_type,)
    expected = " or ".join(candidate.__name__ for candidate in accepted)
    check_instance(obj, inst_type, f"{name} should be an instance of {expected}")

43 

44 

def check_feature_presence(feature_name, df, name):
    """Ensure ``feature_name`` is present in a pandas container.

    For a DataFrame the feature must be one of its columns; for a Series the
    feature must equal the Series' ``name``. Any other input type is accepted
    without checks.

    Parameters
    ----------
    feature_name : hashable
        Column / series name to look for.
    df : pd.DataFrame or pd.Series
        Container to inspect.
    name : str
        Label for the container, used in the error message.

    Raises
    ------
    ValidationError
        If the feature is not found.
    """
    if isinstance(df, pd.DataFrame):
        if feature_name in df.columns:
            return
        raise ValidationError(f"Feature {feature_name} not found in dataframe {name}")

    if isinstance(df, pd.Series) and df.name != feature_name:
        raise ValidationError(f"Feature {feature_name} not found in series {name}")

54 

55 

def check_existence(obj, name=None):
    """Raise ``ValidationError`` when ``obj`` is missing or empty.

    Parameters
    ----------
    obj : object
        Object to check. pandas containers are missing when ``.empty``;
        numpy arrays are missing when they hold zero elements; anything else
        is missing when it is falsy (``None``, empty containers, ...).
    name : str, optional
        Label for the object, used in the error message.

    Raises
    ------
    ValidationError
        If the object is considered missing.
    """
    if isinstance(obj, (pd.DataFrame, pd.Series)):
        missing = obj.empty
    elif isinstance(obj, np.ndarray):
        # Truth-testing a multi-element array raises ValueError, and a
        # one-element array would be judged by its value; use the size.
        missing = obj.size == 0
    else:
        # ``not None`` is True, so None needs no separate test.
        missing = not obj

    if missing:
        raise ValidationError(f"Missing object {name}")

63 

64 

def check_nulls_by_data_type(data):
    """Report whether ``data`` contains null values, dispatching on its type.

    Parameters
    ----------
    data : pd.DataFrame, pd.Series, np.ndarray, tf.Tensor or other
        Data to inspect. Generator functions and, when tensorflow is
        installed, ``tf.data.Dataset`` / ``tf.keras.utils.Sequence`` objects
        cannot be checked: a warning is logged and they are reported as
        null-free. Any other unsupported type is reported as null-free
        without a warning.

    Returns
    -------
    bool-like
        Truthy when nulls were found. For tensors this is the result of
        ``tf.reduce_any``; for pandas / numpy input a numpy boolean.
    """
    if isinstance(data, (pd.DataFrame, pd.Series)):
        return data.isnull().to_numpy().any()

    if isinstance(data, np.ndarray):
        # pd.isnull handles object / string arrays (None, NaN, NaT), on which
        # np.isnan raises TypeError.
        return np.any(pd.isnull(data))

    if tf_exists and isinstance(data, tf.Tensor):
        # tf.math.is_nan only accepts floating dtypes; integer, bool or
        # string tensors cannot hold NaN.
        if not data.dtype.is_floating:
            return False
        return tf.reduce_any(tf.math.is_nan(data))

    if (
        tf_exists and isinstance(data, (tf.data.Dataset, tf.keras.utils.Sequence))
    ) or inspect.isgeneratorfunction(data):
        message = """
        Evaluator Validation: Checking for nulls in generator-based or mapped data is not currently
        supported. Please be sure to sanitize your data. Downstream errors may arise due to nulls in 
        image or other tensor data.
        """
        global_logger.warning(message)

    return False

83 

84 

85################################# 

86# Checking evaluator requirements 

87################################# 

88 

89 

def check_data_for_nulls(obj, name, check_X=True, check_y=True, check_sens=True):
    """Raise ``ValidationError`` if selected attributes of ``obj`` hold nulls.

    Parameters
    ----------
    obj : object
        Artifact exposing ``X``, ``y`` and ``sensitive_features`` attributes.
        An attribute is only read when its check is enabled.
    name : str
        Label for the artifact, used in the error message.
    check_X, check_y, check_sens : bool, optional
        Toggle the check on ``X``, ``y`` and ``sensitive_features``
        respectively. Attributes that are ``None`` are skipped.

    Raises
    ------
    ValidationError
        Lists every checked attribute in which nulls were detected.
    """
    requested = (
        (check_X, "X"),
        (check_y, "y"),
        (check_sens, "sensitive_features"),
    )

    with_nulls = []
    for enabled, attribute in requested:
        if not enabled:
            continue
        values = getattr(obj, attribute)
        if values is not None and check_nulls_by_data_type(values):
            with_nulls.append(attribute)

    if with_nulls:
        raise ValidationError(
            f"Detected null values in {name}, in attributes: {','.join(with_nulls)}"
        )

105 

106 

def check_requirements_existence(self):
    """Check that every required artifact of an evaluator is present.

    Each name in ``self.required_artifacts`` is looked up among the
    instance's attributes and passed to ``check_existence``, which raises
    ``ValidationError`` for a missing or empty artifact.
    """
    attributes = vars(self)
    for artifact_name in self.required_artifacts:
        check_existence(attributes[artifact_name], artifact_name)

110 

111 

def check_requirements_deepchecks(self):
    """Validate artifacts for an evaluator where each artifact is optional.

    Every name in ``self.required_artifacts`` is looked up among the
    instance's attributes. Names containing "data" must be ``TabularData``;
    all others must be ``Model``. An artifact that was not supplied (``None``
    or otherwise falsy) is skipped, but a supplied artifact of the wrong type
    is an error, and at least one artifact must be valid.

    Raises
    ------
    ValidationError
        If a supplied artifact has the wrong type, or no valid artifact was
        supplied at all.
    """
    # For case when we require at least one dataset
    # All supplied datasets must be of correct form
    at_least_one_artifact = False
    attributes = vars(self)
    for required_name in self.required_artifacts:
        artifact = attributes[required_name]
        try:
            if "data" in required_name:
                check_data_instance(artifact, TabularData)
            else:
                check_model_instance(artifact, Model)
        except ValidationError:
            if _artifact_was_supplied(artifact):
                # Something was passed but it has the wrong type: re-raise
                # the original error as is.
                raise
            # Nothing was passed; this optional artifact is simply absent.
            continue
        at_least_one_artifact = True

    if not at_least_one_artifact:
        raise ValidationError(
            "Expected at least one valid artifact. None provided or all objects passed are otherwise invalid"
        )


def _artifact_was_supplied(artifact):
    """Return True when ``artifact`` holds something rather than being absent."""
    if artifact is None:
        return False
    try:
        return bool(artifact)
    except ValueError:
        # pandas / numpy containers refuse truth-testing; such an object was
        # clearly supplied, so the type error should be reported.
        return True