Coverage for credoai/utils/common.py: 68%

104 statements  

« prev     ^ index     » next       coverage.py v7.1.0, created at 2023-02-13 21:56 +0000

1import collections 

2import hashlib 

3import json 

4from pathlib import Path 

5from typing import Any, Dict 

6 

7import numpy as np 

8import pandas as pd 

9from sklearn.utils import check_array 

10 

11 

class NotRunError(Exception):
    """Custom exception type.

    NOTE(review): judging by the name, presumably raised when a result is
    requested before the producing step has been run -- confirm with callers.
    """

14 

15 

class ValidationError(Exception):
    """Custom exception type signalling that an input failed validation."""

18 

19 

class InstallationError(Exception):
    """Custom exception type.

    NOTE(review): judging by the name, presumably raised for problems with
    installed (or missing) dependencies -- confirm with callers.
    """

22 

23 

class IntegrationError(Exception):
    """Custom exception type.

    NOTE(review): judging by the name, presumably raised when an integration
    with an external system fails -- confirm with callers.
    """

26 

27 

class SupressSettingWithCopyWarning:
    """Context manager that silences pandas' chained-assignment warning.

    On entry the ``mode.chained_assignment`` option is set to ``None``; on
    exit the value that was active *before* entry is restored (rather than
    unconditionally resetting it to ``"warn"``), so a user-configured
    ``"raise"`` or ``None`` survives the ``with`` block.
    """

    def __init__(self):
        # Stack of saved option values so the same instance can be nested.
        self._saved = []

    def __enter__(self):
        self._saved.append(pd.options.mode.chained_assignment)
        pd.options.mode.chained_assignment = None
        return self

    def __exit__(self, *args):
        # Restore whatever was configured when the block was entered.
        pd.options.mode.chained_assignment = self._saved.pop()

34 

35 

def check_subset(subset, superset):
    """Check whether one dictionary, list or set is a subset of another.

    Handles nested dictionaries.

    Parameters
    ----------
    subset : dict, list or set
        Candidate subset.
    superset : dict, list or set
        Container that is expected to include everything in ``subset``.

    Returns
    -------
    bool
        ``False`` if the two arguments are of different types. For
        dictionaries, ``True`` when every key of ``subset`` exists in
        ``superset`` with an equal value, or with a dict/list/set value that
        is itself a superset (checked recursively). For lists and sets,
        ``True`` when every element of ``subset`` occurs in ``superset``.
        Any other pair of same-typed arguments yields ``True``.
    """
    if type(subset) is not type(superset):
        return False

    if isinstance(subset, dict):
        for key, value in subset.items():
            # A missing key must not be confused with a key mapped to None.
            if key not in superset:
                return False
            superset_value = superset[key]
            if superset_value == value:
                continue
            if type(value) is not type(superset_value):
                return False
            if not isinstance(value, (dict, list, set)):
                return False
            if not check_subset(value, superset_value):
                return False
        return True

    if isinstance(subset, (list, set)):
        try:
            return set(subset) <= set(superset)
        except TypeError:
            # Unhashable elements (e.g. dicts inside a list): fall back to
            # equality-based membership.
            return all(item in superset for item in subset)

    return True

60 

61 

def check_pandas(array):
    """Return True when ``array`` is a pandas DataFrame or Series."""
    pandas_containers = (pd.DataFrame, pd.Series)
    return isinstance(array, pandas_containers)

64 

65 

def check_array_like(array):
    """Validate that ``array`` is array-like.

    pandas DataFrames and Series are accepted without further checks; any
    other input is handed to ``check_array`` with ``ensure_2d=False``.

    Raises
    ------
    ValidationError
        If ``check_array`` rejects the input with a ``ValueError``.
    """
    # pandas containers are accepted as-is.
    if check_pandas(array):
        return
    try:
        check_array(array, ensure_2d=False)
    except ValueError:
        raise ValidationError(
            "Expected array-like (e.g., list, numpy array, pandas series/dataframe"
        )

76 

77 

def get_project_root() -> Path:
    """Return the parent of the directory that contains this module file."""
    module_dir = Path(__file__).parent
    return module_dir.parent

80 

81 

def flatten_list(lst):
    """Flatten one level of nesting: concatenate the items of each sub-iterable."""
    flattened = []
    for inner in lst:
        flattened.extend(inner)
    return flattened

84 

85 

def update_dictionary(d, u):
    """Recursively update dictionary ``d`` in place with the contents of ``u``.

    For each key of ``u``:

    * mapping values are merged recursively into the existing entry of ``d``
      (or into a fresh dict when the key is absent);
    * list values are concatenated with the existing list of ``d``, the
      incoming items first;
    * any other value overwrites the entry in ``d``.

    Returns
    -------
    The same object ``d`` that was passed in.
    """
    for key, incoming in u.items():
        if isinstance(incoming, collections.abc.Mapping):
            merged = update_dictionary(d.get(key, {}), incoming)
        elif isinstance(incoming, list):
            merged = incoming + d.get(key, [])
        else:
            merged = incoming
        d[key] = merged
    return d

96 

97 

def wrap_list(obj):
    """Ensure the object is an iterable (without splitting strings).

    Parameters
    ----------
    obj : Any
        Object to wrap.

    Returns
    -------
    ``None`` if ``obj`` is ``None``; ``[obj]`` if ``obj`` is a string
    (including ``str`` subclasses such as ``numpy.str_``) or is not iterable;
    otherwise ``obj`` itself, unchanged.
    """
    if obj is None:
        return None
    # isinstance (not an exact type comparison) so that str subclasses are
    # wrapped as a whole instead of being treated as iterables of characters.
    if isinstance(obj, str):
        return [obj]
    try:
        iter(obj)
    except TypeError:
        return [obj]
    return obj

109 

110 

def remove_suffix(text, suffix):
    """Return ``text`` without a trailing ``suffix``.

    ``text`` is returned unchanged when it does not end with ``suffix`` or
    when ``suffix`` is empty.
    """
    if not text.endswith(suffix) or len(suffix) == 0:
        return text
    return text[: -len(suffix)]

113 

114 

def humanize_label(s):
    """Turn a snake_case label into a title-cased, space-separated string."""
    spaced = s.replace("_", " ")
    return spaced.title()

117 

118 

class CredoEncoder(json.JSONEncoder):
    """Special json encoder for numpy types.

    Converts numpy booleans, integers and floating-point scalars to the
    matching Python builtin and numpy arrays to (nested) lists. Anything else
    is delegated to ``json.JSONEncoder.default``, which raises ``TypeError``.
    """

    def default(self, obj):
        # numpy encoders
        if isinstance(obj, np.bool_):
            # np.bool_ is neither a Python bool nor an np.integer, so without
            # this branch it would be rejected as not serializable.
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

131 

132 

def json_dumps(obj):
    """Serialize ``obj`` to a JSON string using CredoEncoder and a 2-space indent."""
    serialized = json.dumps(obj, indent=2, cls=CredoEncoder)
    return serialized

136 

137 

def dict_hash(dictionary: Dict[str, Any]) -> str:
    """Return the hexadecimal MD5 digest of a dictionary's JSON form."""
    # Keys are sorted so that {'a': 1, 'b': 2} and {'b': 2, 'a': 1}
    # serialize -- and therefore hash -- identically.
    canonical = json.dumps(dictionary, sort_keys=True)
    return hashlib.md5(canonical.encode()).hexdigest()

146 

147 

def to_array(lst):
    """
    Converts list-like object to array

    Parameters
    ----------
    lst : (List, pandas.Series, numpy.ndarray)
        The list-like to be converted. Subclasses of these types are
        accepted as well.

    Returns
    -------
    numpy.ndarray
        ``lst.values`` for a Series, ``np.array(lst)`` for a list, and the
        input itself for an ndarray.

    Raises
    ------
    TypeError
        If ``lst`` is none of the supported types.
    """
    # isinstance instead of exact type comparison so subclasses are handled.
    if isinstance(lst, pd.Series):
        return lst.values
    if isinstance(lst, list):
        return np.array(lst)
    if isinstance(lst, np.ndarray):
        return lst
    raise TypeError(
        "Expected a list, pandas.Series or numpy.ndarray, "
        f"got {type(lst).__name__}"
    )

164 

165 

def is_categorical(series, threshold=0.05):
    """Identifies whether a series is categorical or not

    Logic: If there are relatively few unique values for a feature, the feature is likely categorical.
    The results are estimates and are not guaranteed to be correct.

    Parameters
    ----------
    series : pd.Series
        Series to evaluate
    threshold : float
        The threshold (number of the unique values over the total number of values)


    Returns
    -------
    bool
        Whether the series is categorical or not. An empty series whose
        dtype is neither "category" nor "object" is reported as not
        categorical.
    """
    if series.dtype.name in ["category", "object"]:
        return True

    n_values = len(series)
    if n_values == 0:
        # No data to estimate from; also avoids dividing by zero below.
        return False

    # Every other dtype (floats included) is judged by the ratio of unique
    # values to total values.
    return len(series.unique()) / n_values < threshold