Coverage for credoai/utils/common.py: 68%

1import collections

2import hashlib

3import json

4from pathlib import Path

5from typing import Any, Dict

7import numpy as np

8import pandas as pd

9from sklearn.utils import check_array

class NotRunError(Exception):
    """Exception type adding no behavior on top of ``Exception``.

    NOTE(review): presumably signals that a required step has not been run
    yet -- no raise site is visible in this module; confirm against callers.
    """

class ValidationError(Exception):
    """Exception raised when an input fails validation.

    Adds no behavior on top of ``Exception``; it only provides a distinct
    type for callers to catch.
    """

class InstallationError(Exception):
    """Exception type adding no behavior on top of ``Exception``.

    NOTE(review): presumably signals a missing or broken installation -- no
    raise site is visible in this module; confirm against callers.
    """

class IntegrationError(Exception):
    """Exception type adding no behavior on top of ``Exception``.

    NOTE(review): presumably signals a failure while integrating with an
    external system -- no raise site is visible in this module; confirm
    against callers.
    """

class SupressSettingWithCopyWarning:
    """Context manager that disables pandas' chained-assignment check.

    On entry ``pd.options.mode.chained_assignment`` is set to ``None``.
    On exit the value that was active at entry is restored, so a caller who
    had configured ``"raise"`` (or ``None``) does not silently end up with
    ``"warn"``.
    """

    def __enter__(self):
        # A stack (rather than a single slot) keeps nested use of the same
        # instance correct: every exit restores what its own enter saw.
        saved_modes = self.__dict__.setdefault("_saved_modes", [])
        saved_modes.append(pd.options.mode.chained_assignment)
        pd.options.mode.chained_assignment = None

    def __exit__(self, *args):
        saved_modes = getattr(self, "_saved_modes", None)
        # Fall back to the historical behavior ("warn") when __exit__ is
        # invoked without a matching __enter__.
        pd.options.mode.chained_assignment = (
            saved_modes.pop() if saved_modes else "warn"
        )

def check_subset(subset, superset):
    """Check whether one dictionary, list or set is a subset of another.

    Handles nested dictionaries: a dictionary is a subset when each of its
    keys is present in ``superset`` and the corresponding value is either
    equal to, or (for dict/list/set values) itself a subset of, the
    superset's value.

    Parameters
    ----------
    subset : dict, list or set
        Candidate subset.
    superset : dict, list or set
        Container to compare against. Must have the same type as ``subset``.

    Returns
    -------
    bool
        True if ``subset`` is contained in ``superset``, False otherwise
        (including when the two arguments have different types).
    """
    if type(subset) is not type(superset):
        return False
    if isinstance(subset, dict):
        for key, value in subset.items():
            # A missing key must never match, even when the expected value
            # is None (dict.get would make the two indistinguishable).
            if key not in superset:
                return False
            superset_value = superset[key]
            if superset_value == value:
                continue
            if type(value) is not type(superset_value):
                return False
            if not isinstance(value, (dict, list, set)):
                return False
            if not check_subset(value, superset_value):
                return False
        return True
    if isinstance(subset, (list, set)):
        try:
            return set(subset) <= set(superset)
        except TypeError:
            # Unhashable items (e.g. a list of dicts) cannot be put into a
            # set; fall back to equality-based membership.
            return all(item in superset for item in subset)
    return True

def check_pandas(array):
    """Return True when ``array`` is a pandas DataFrame or Series."""
    pandas_containers = (pd.DataFrame, pd.Series)
    return isinstance(array, pandas_containers)

def check_array_like(array):
    """Validate that ``array`` is array-like.

    pandas DataFrames and Series are accepted without further checks. Any
    other object is passed to ``sklearn.utils.check_array`` with
    ``ensure_2d=False``.

    Parameters
    ----------
    array : object
        Object to validate.

    Raises
    ------
    ValidationError
        If ``check_array`` rejects the object with a ``ValueError``. The
        original error is attached as the cause.
    """
    if check_pandas(array):
        return
    try:
        check_array(array, ensure_2d=False)
    except ValueError as err:
        raise ValidationError(
            "Expected array-like (e.g., list, numpy array, pandas series/dataframe)"
        ) from err

def get_project_root() -> Path:
    """Return the directory two levels above this source file."""
    this_file = Path(__file__)
    containing_dir = this_file.parent
    return containing_dir.parent

def flatten_list(lst):
    """Flatten one level of nesting: concatenate the items of each element."""
    flattened = []
    for inner in lst:
        flattened.extend(inner)
    return flattened

def update_dictionary(d, u):
    """Recursively merge ``u`` into ``d`` in place and return ``d``.

    For each key of ``u``:

    * mapping values are merged recursively into the value already stored
      in ``d`` (or into a fresh dict when the key is absent);
    * list values are placed in front of the list already stored in ``d``;
    * any other value overwrites the entry in ``d``.
    """
    for key, incoming in u.items():
        if isinstance(incoming, collections.abc.Mapping):
            existing_mapping = d.get(key, {})
            d[key] = update_dictionary(existing_mapping, incoming)
            continue
        if isinstance(incoming, list):
            existing_items = d.get(key, [])
            d[key] = incoming + existing_items
            continue
        d[key] = incoming
    return d

def wrap_list(obj):
    """Ensure ``obj`` is an iterable, wrapping it in a list when needed.

    Parameters
    ----------
    obj : object
        Value to wrap.

    Returns
    -------
    object
        ``None`` if ``obj`` is None; ``[obj]`` if ``obj`` is a string or is
        not iterable; otherwise ``obj`` itself, unchanged.
    """
    if obj is None:
        return None
    # isinstance (not an exact type comparison) so that str subclasses such
    # as numpy.str_ are wrapped too instead of being iterated per character.
    if isinstance(obj, str):
        return [obj]
    try:
        iter(obj)
    except TypeError:
        return [obj]
    return obj

109

110

def remove_suffix(text, suffix):
    """Return ``text`` without a trailing ``suffix``; unchanged if absent or empty."""
    if not text.endswith(suffix):
        return text
    suffix_length = len(suffix)
    if suffix_length == 0:
        return text
    return text[:-suffix_length]

113

114

def humanize_label(s):
    """Turn a snake_case label into title-cased words separated by spaces."""
    spaced = s.replace("_", " ")
    return spaced.title()

117

118

class CredoEncoder(json.JSONEncoder):
    """JSON encoder that also understands numpy scalars and arrays."""

    def default(self, obj):
        """Convert numpy objects to plain Python equivalents.

        Parameters
        ----------
        obj : object
            Object the standard encoder could not serialize.

        Returns
        -------
        bool, int, float or list
            Native Python representation of ``obj``.

        Raises
        ------
        TypeError
            Raised by the base encoder for any unsupported type.
        """
        # numpy booleans are not JSON serializable by default and are not
        # covered by the integer/floating checks below.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

131

132

def json_dumps(obj):
    """Serialize ``obj`` to a JSON string using ``CredoEncoder`` and a 2-space indent."""
    dump_options = {"cls": CredoEncoder, "indent": 2}
    return json.dumps(obj, **dump_options)

136

137

def dict_hash(dictionary: Dict[str, Any]) -> str:
    """Return the MD5 hex digest of a dictionary's JSON representation.

    Keys are sorted during serialization, so two dictionaries that differ
    only in key order (e.g. {'a': 1, 'b': 2} and {'b': 2, 'a': 1}) produce
    the same digest.
    """
    canonical_json = json.dumps(dictionary, sort_keys=True)
    return hashlib.md5(canonical_json.encode()).hexdigest()

146

147

def to_array(lst):
    """
    Converts list-like object to array

    Parameters
    ----------
    lst : (List, pandas.Series, numpy.ndarray)
        The list-like to be converted. Subclasses of these types are
        accepted as well.

    Returns
    -------
    numpy.ndarray
        ``lst.values`` for a Series, ``np.array(lst)`` for a list, or
        ``lst`` itself when it already is an ndarray.

    Raises
    ------
    TypeError
        If ``lst`` is none of the supported types.
    """
    if isinstance(lst, pd.Series):
        return lst.values
    if isinstance(lst, list):
        return np.array(lst)
    if isinstance(lst, np.ndarray):
        return lst
    raise TypeError(
        "Expected a list, pandas.Series or numpy.ndarray, got "
        f"{type(lst).__name__}"
    )

164

165

def is_categorical(series, threshold=0.05):
    """Identifies whether a series is categorical or not

    Logic: If there are relatively few unique values for a feature, the
    feature is likely categorical. The results are estimates and are not
    guaranteed to be correct.

    Parameters
    ----------
    series : pd.Series
        Series to evaluate
    threshold : float
        The threshold (number of the unique values over the total number of
        values) below which the series is considered categorical

    Returns
    -------
    bool
        Whether the series is categorical or not. An empty series whose
        dtype is neither "category" nor "object" is reported as not
        categorical.
    """
    if series.dtype.name in ["category", "object"]:
        return True
    n_values = len(series)
    # An empty series has no unique-value ratio; returning False here avoids
    # a ZeroDivisionError in the ratio below.
    if n_values == 0:
        return False
    # NOTE: every remaining dtype (floats included) is judged solely by the
    # ratio of unique values to total values.
    return bool(len(series.unique()) / n_values < threshold)