Coverage for credoai/utils/common.py: 60%

102 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2022-12-08 07:32 +0000

1import collections 

2import hashlib 

3import json 

4from pathlib import Path 

5from typing import Any, Dict 

6 

7import numpy as np 

8import pandas as pd 

9from sklearn.utils import check_array 

10 

11 

class NotRunError(Exception):
    """Exception type named for a step that has not been run.

    NOTE(review): nothing in this module raises it; the exact meaning is
    inferred from the name and should be confirmed against the callers.
    """

14 

15 

class ValidationError(Exception):
    """Raised when an input fails validation (e.g. by ``check_array_like``)."""

18 

19 

class InstallationError(Exception):
    """Exception type named for installation problems.

    NOTE(review): nothing in this module raises it; the exact meaning is
    inferred from the name and should be confirmed against the callers.
    """

22 

23 

class IntegrationError(Exception):
    """Exception type named for integration problems.

    NOTE(review): nothing in this module raises it; the exact meaning is
    inferred from the name and should be confirmed against the callers.
    """

26 

27 

class SupressSettingWithCopyWarning:
    """Context manager that disables pandas' chained-assignment check.

    On entry ``pd.options.mode.chained_assignment`` is set to ``None``.
    On exit the value that was active before entry is put back, so a caller
    who had configured ``"raise"`` (or ``None``) keeps that configuration
    instead of being silently reset to ``"warn"``.
    """

    def __init__(self):
        # Stack of saved settings so the same instance can be nested safely.
        self._saved = []

    def __enter__(self):
        self._saved.append(pd.options.mode.chained_assignment)
        pd.options.mode.chained_assignment = None
        return self

    def __exit__(self, *args):
        # Fall back to pandas' default when __exit__ is called without a
        # matching __enter__.
        previous = self._saved.pop() if self._saved else "warn"
        pd.options.mode.chained_assignment = previous

34 

35 

def check_subset(subset, superset):
    """Check whether one dictionary, list or set is a subset of another.

    Handles nested dictionaries: a dictionary is a subset when every one of
    its keys exists in ``superset`` and the associated value is either equal
    to, or (for dict/list/set values) itself a subset of, the superset value.

    Parameters
    ----------
    subset : dict, list or set
        Candidate subset.
    superset : dict, list or set
        Container that should include everything in ``subset``.

    Returns
    -------
    bool
        False when the two arguments are of different types or when
        ``subset`` contains something ``superset`` lacks; True otherwise.
        Same-typed arguments that are not dict/list/set yield True.
    """
    if type(subset) is not type(superset):
        return False

    if isinstance(subset, dict):
        for key, value in subset.items():
            # A key that is absent can never match, even when the subset's
            # value is None (``superset.get(key)`` would hide that case).
            if key not in superset:
                return False
            other = superset[key]
            if other == value:
                continue
            if type(value) is not type(other):
                return False
            if not isinstance(value, (dict, list, set)):
                return False
            if not check_subset(value, other):
                return False
        return True

    if isinstance(subset, (list, set)):
        try:
            return set(subset) <= set(superset)
        except TypeError:
            # Unhashable members (e.g. a list of dicts): compare by equality.
            return all(item in superset for item in subset)

    return True

60 

61 

def check_array_like(array):
    """Validate that ``array`` is array-like.

    pandas DataFrames and Series are accepted as-is; anything else is passed
    to scikit-learn's ``check_array`` with ``ensure_2d=False``.

    Parameters
    ----------
    array : object
        Object to validate.

    Raises
    ------
    ValidationError
        If ``check_array`` rejects ``array`` with a ``ValueError``. The
        original error is attached as the cause.
    """
    if isinstance(array, (pd.DataFrame, pd.Series)):
        return
    try:
        check_array(array, ensure_2d=False)
    except ValueError as err:
        raise ValidationError(
            "Expected array-like (e.g., list, numpy array, pandas series/dataframe)"
        ) from err

72 

73 

def get_project_root() -> Path:
    """Return the directory two levels above this file.

    Returns
    -------
    pathlib.Path
        The parent of the directory that contains this module.
    """
    module_dir = Path(__file__).parent
    return module_dir.parent

76 

77 

def flatten_list(lst):
    """Flatten one level of nesting.

    Parameters
    ----------
    lst : iterable of iterables
        The nested collection to flatten.

    Returns
    -------
    list
        The items of every inner iterable, in order. Only one level is
        flattened; deeper nesting is left untouched.
    """
    return [item for sublist in lst for item in sublist]

80 

81 

def update_dictionary(d, u):
    """Recursively update dictionary ``d`` in place with the content of ``u``.

    Parameters
    ----------
    d : dict
        Dictionary to update (mutated and returned).
    u : Mapping
        Updates to apply.

    Returns
    -------
    dict
        ``d`` after the update.

    Notes
    -----
    * Mapping values are merged recursively.
    * List values are prepended to the list already stored under the key.
    * Any other value overwrites the stored one.
    * When the stored value has a different kind than the update (e.g. an
      int where a mapping or list arrives), the update replaces it.
    """
    # Imported explicitly: ``import collections`` alone does not guarantee
    # that the ``collections.abc`` submodule is loaded.
    from collections.abc import Mapping

    for key, new_value in u.items():
        if isinstance(new_value, Mapping):
            current = d.get(key)
            if not isinstance(current, Mapping):
                current = {}
            d[key] = update_dictionary(current, new_value)
        elif isinstance(new_value, list):
            current = d.get(key)
            existing = current if isinstance(current, list) else []
            d[key] = new_value + existing
        else:
            d[key] = new_value
    return d

92 

93 

def wrap_list(obj):
    """Ensure ``obj`` is an iterable container.

    Parameters
    ----------
    obj : object
        Value to wrap.

    Returns
    -------
    object
        ``None`` when ``obj`` is ``None``; ``[obj]`` when ``obj`` is a string
        or is not iterable; otherwise ``obj`` itself, unchanged.
    """
    if obj is None:
        return None
    # Strings are iterable, but are treated as a single item here.
    if isinstance(obj, str):
        return [obj]
    try:
        iter(obj)
    except TypeError:
        return [obj]
    return obj

105 

106 

def remove_suffix(text, suffix):
    """Return ``text`` without a trailing ``suffix``.

    Parameters
    ----------
    text : str
        String to trim.
    suffix : str
        Ending to strip.

    Returns
    -------
    str
        ``text`` minus ``suffix`` when ``text`` ends with a non-empty
        ``suffix``; otherwise ``text`` unchanged.
    """
    # The emptiness check matters: ``text[:-0]`` would return "".
    if text.endswith(suffix) and len(suffix) != 0:
        return text[: -len(suffix)]
    return text

109 

110 

def humanize_label(s):
    """Turn a snake_case label into a title-cased, space-separated one.

    Parameters
    ----------
    s : str
        Label to convert.

    Returns
    -------
    str
        ``s`` with underscores replaced by spaces, passed through
        ``str.title``.
    """
    return s.replace("_", " ").title()

113 

114 

class CredoEncoder(json.JSONEncoder):
    """JSON encoder that also serializes common numpy types."""

    def default(self, obj):
        """Convert numpy objects to plain Python equivalents.

        Handles numpy booleans, integers, floats and arrays; anything else
        is deferred to ``json.JSONEncoder.default``, which raises
        ``TypeError``.
        """
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

127 

128 

def json_dumps(obj):
    """Serialize ``obj`` to a JSON string using ``CredoEncoder``.

    Parameters
    ----------
    obj : object
        Object to serialize.

    Returns
    -------
    str
        JSON text indented by two spaces.
    """
    return json.dumps(obj, cls=CredoEncoder, indent=2)

132 

133 

def dict_hash(dictionary: Dict[str, Any]) -> str:
    """Return the MD5 hex digest of a dictionary.

    Parameters
    ----------
    dictionary : Dict[str, Any]
        JSON-serializable dictionary to hash.

    Returns
    -------
    str
        Hexadecimal MD5 digest of the dictionary's JSON representation.
    """
    # Keys are sorted so {'a': 1, 'b': 2} hashes the same as {'b': 2, 'a': 1}.
    canonical = json.dumps(dictionary, sort_keys=True)
    return hashlib.md5(canonical.encode()).hexdigest()

142 

143 

def to_array(lst):
    """Convert a list-like object to a numpy array.

    Parameters
    ----------
    lst : (List, pandas.Series, numpy.ndarray)
        The list-like to be converted. Subclasses of these types are
        accepted as well.

    Returns
    -------
    numpy.ndarray
        ``lst.values`` for a Series, ``np.array(lst)`` for a list, and the
        input itself when it is already an array.

    Raises
    ------
    TypeError
        If ``lst`` is none of the supported types.
    """
    if isinstance(lst, pd.Series):
        return lst.values
    if isinstance(lst, np.ndarray):
        return lst
    if isinstance(lst, list):
        return np.array(lst)
    raise TypeError(
        "Expected a list, pandas.Series or numpy.ndarray, "
        f"got {type(lst).__name__}"
    )

160 

161 

def is_categorical(series, threshold=0.05):
    """Identify whether a series is categorical or not.

    Logic: if there are relatively few unique values for a feature, the
    feature is likely categorical. The results are estimates and are not
    guaranteed to be correct.

    Parameters
    ----------
    series : pd.Series
        Series to evaluate
    threshold : float
        The threshold (number of unique values over the total number of
        values) below which the series is considered categorical

    Returns
    -------
    bool
        Whether the series is categorical or not. Series with dtype
        ``category`` or ``object`` are always categorical; an empty series
        of any other dtype is reported as not categorical.
    """
    if series.dtype.name in ("category", "object"):
        return True

    total = len(series)
    if total == 0:
        # No values to estimate a ratio from (and avoids dividing by zero).
        return False

    # Every other dtype (floats included) is judged by its unique-value ratio.
    return len(series.unique()) / total < threshold