Coverage for credoai/utils/common.py: 68%
104 statements
« prev ^ index » next coverage.py v7.1.0, created at 2023-02-13 21:56 +0000
« prev ^ index » next coverage.py v7.1.0, created at 2023-02-13 21:56 +0000
1import collections
2import hashlib
3import json
4from pathlib import Path
5from typing import Any, Dict
7import numpy as np
8import pandas as pd
9from sklearn.utils import check_array
class NotRunError(Exception):
    """Exception type that adds no behavior beyond ``Exception``.

    NOTE(review): the name suggests it signals use of something that has
    not been run yet -- confirm against the call sites that raise it.
    """
class ValidationError(Exception):
    """Exception type that adds no behavior beyond ``Exception``.

    ``check_array_like`` in this module raises it when its input is not
    array-like.
    """
class InstallationError(Exception):
    """Exception type that adds no behavior beyond ``Exception``.

    NOTE(review): presumably signals a missing or broken installation of a
    dependency -- confirm against the call sites that raise it.
    """
class IntegrationError(Exception):
    """Exception type that adds no behavior beyond ``Exception``.

    NOTE(review): presumably signals a failure while integrating with an
    external system -- confirm against the call sites that raise it.
    """
class SupressSettingWithCopyWarning:
    """Context manager that silences pandas' chained-assignment warning.

    On entry the ``mode.chained_assignment`` option is set to ``None``.
    On exit the value that was active before entry is restored, so a
    caller who configured ``"raise"`` (or already disabled the warning)
    does not have the setting silently reset to ``"warn"``.
    """

    def __enter__(self):
        # Remember the caller's setting so __exit__ can put it back.
        self._previous_mode = pd.options.mode.chained_assignment
        pd.options.mode.chained_assignment = None

    def __exit__(self, *args):
        # Fall back to pandas' default ("warn") if __exit__ is invoked
        # without a preceding __enter__.
        pd.options.mode.chained_assignment = getattr(
            self, "_previous_mode", "warn"
        )
def check_subset(subset, superset):
    """Check whether one dictionary, list or set is a subset of another.

    Nested dictionaries are handled recursively: every key of ``subset``
    must exist in ``superset`` and its value must either be equal to the
    superset's value or be a dict/list/set that is itself a subset of it.

    Parameters
    ----------
    subset : dict, list or set
        Candidate subset.
    superset : dict, list or set
        Container that should include everything in ``subset``.

    Returns
    -------
    bool
        False if the two arguments have different types or the subset
        relation does not hold. Arguments of the same type that are not a
        dict, list or set are not compared and yield True.
    """
    if type(subset) is not type(superset):
        return False
    if isinstance(subset, dict):
        for key, value in subset.items():
            # An explicit membership test is required: ``.get`` would make
            # a missing key indistinguishable from a key mapped to None.
            if key not in superset:
                return False
            superset_value = superset[key]
            if superset_value == value:
                continue
            # Unequal values only pass when they are containers and the
            # subset relation holds recursively (the recursive call also
            # rejects mismatched types).
            if isinstance(value, (dict, list, set)) and check_subset(
                value, superset_value
            ):
                continue
            return False
        return True
    if isinstance(subset, (list, set)):
        try:
            return set(subset) <= set(superset)
        except TypeError:
            # Unhashable items (e.g. a list of dicts) cannot go into a set;
            # fall back to equality-based membership.
            return all(item in superset for item in subset)
    return True
def check_pandas(array):
    """Return True when ``array`` is a pandas DataFrame or Series."""
    pandas_types = (pd.DataFrame, pd.Series)
    return isinstance(array, pandas_types)
def check_array_like(array):
    """Validate that ``array`` is array-like.

    pandas DataFrames and Series are accepted as-is; anything else is
    passed to scikit-learn's ``check_array`` with ``ensure_2d=False``.

    Raises
    ------
    ValidationError
        If ``check_array`` rejects the input with a ``ValueError``. The
        original error is chained as the cause.
    """
    if check_pandas(array):
        return
    try:
        check_array(array, ensure_2d=False)
    except ValueError as err:
        raise ValidationError(
            "Expected array-like (e.g., list, numpy array, pandas series/dataframe)"
        ) from err
def get_project_root() -> Path:
    """Return the directory two levels above this module's file."""
    module_path = Path(__file__)
    containing_dir = module_path.parent
    return containing_dir.parent
def flatten_list(lst):
    """Flatten one level of nesting: concatenate the items of each sub-iterable."""
    flattened = []
    for sublist in lst:
        flattened.extend(sublist)
    return flattened
def update_dictionary(d, u):
    """Recursively update dictionary ``d`` in place with the contents of ``u``.

    For each key in ``u``:

    * mapping values are merged recursively into the existing mapping; if
      ``d`` has no mutable mapping at that key, a fresh dict is used
    * list values are placed in front of the existing list; if ``d`` has no
      list at that key, a copy of the new list is stored
    * any other value overwrites the existing one

    Parameters
    ----------
    d : dict
        Dictionary to update (mutated in place).
    u : Mapping
        Updates to apply.

    Returns
    -------
    dict
        ``d`` itself, after the update.
    """
    # Imported locally: a bare ``import collections`` does not guarantee
    # that the ``collections.abc`` submodule is loaded.
    from collections.abc import Mapping, MutableMapping

    for key, value in u.items():
        current = d.get(key)
        if isinstance(value, Mapping):
            # A scalar or other non-mapping cannot be merged into, so it is
            # replaced by a fresh dict instead of crashing.
            base = current if isinstance(current, MutableMapping) else {}
            d[key] = update_dictionary(base, value)
        elif isinstance(value, list):
            if isinstance(current, list):
                d[key] = value + current
            else:
                d[key] = list(value)
        else:
            d[key] = value
    return d
def wrap_list(obj):
    """Ensure ``obj`` is iterable, wrapping it in a list when needed.

    * ``None`` is returned unchanged
    * strings (including ``str`` subclasses) are wrapped in a one-element
      list rather than being treated as an iterable of characters
    * other iterables are returned unchanged
    * non-iterable objects are wrapped in a one-element list
    """
    if obj is None:
        return None
    # isinstance (not an exact type comparison) so str subclasses are
    # wrapped as well.
    if isinstance(obj, str):
        return [obj]
    try:
        iter(obj)
    except TypeError:
        return [obj]
    return obj
def remove_suffix(text, suffix):
    """Return ``text`` without a trailing ``suffix``; unchanged if absent or empty."""
    if not text.endswith(suffix):
        return text
    suffix_length = len(suffix)
    if suffix_length == 0:
        return text
    return text[:-suffix_length]
def humanize_label(s):
    """Turn an underscore-separated label into space-separated title case."""
    spaced = s.replace("_", " ")
    return spaced.title()
class CredoEncoder(json.JSONEncoder):
    """JSON encoder that also serializes numpy scalars and arrays."""

    def default(self, obj):
        """Convert numpy objects to their built-in Python equivalents.

        Called by ``json`` only for objects it cannot serialize natively.
        Anything not recognized here is passed to the base implementation,
        which raises ``TypeError``.
        """
        # np.bool_ is not an np.integer subclass, so it needs its own branch.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)
def json_dumps(obj):
    """Serialize ``obj`` to a JSON string using ``CredoEncoder`` and 2-space indent."""
    dump_options = {"cls": CredoEncoder, "indent": 2}
    return json.dumps(obj, **dump_options)
def dict_hash(dictionary: Dict[str, Any]) -> str:
    """Return the MD5 hex digest of a dictionary's JSON representation."""
    # Keys are sorted during serialization so that insertion order does not
    # affect the hash: {'a': 1, 'b': 2} and {'b': 2, 'a': 1} match.
    canonical_json = json.dumps(dictionary, sort_keys=True)
    return hashlib.md5(canonical_json.encode()).hexdigest()
def to_array(lst):
    """Convert a list-like object to a numpy array.

    Parameters
    ----------
    lst : (List, pandas.Series, numpy.ndarray)
        The list-like to be converted. Subclasses of these types are
        accepted as well.

    Returns
    -------
    numpy.ndarray
        ``lst.values`` for a Series, ``np.array(lst)`` for a list, or the
        input itself when it is already an ndarray.

    Raises
    ------
    TypeError
        If ``lst`` is none of the supported types.
    """
    if isinstance(lst, pd.Series):
        return lst.values
    if isinstance(lst, list):
        return np.array(lst)
    if isinstance(lst, np.ndarray):
        return lst
    raise TypeError(
        "Expected a list, pandas.Series or numpy.ndarray, "
        f"got {type(lst).__name__}"
    )
def is_categorical(series, threshold=0.05):
    """Identify whether a series is categorical or not.

    Logic: if there are relatively few unique values for a feature, the
    feature is likely categorical. The results are estimates and are not
    guaranteed to be correct.

    Parameters
    ----------
    series : pd.Series
        Series to evaluate
    threshold : float
        Upper bound on the ratio of the number of unique values to the
        total number of values; a ratio strictly below it marks the series
        as categorical.

    Returns
    -------
    bool
        Whether the series is categorical or not. An empty series whose
        dtype is neither "category" nor "object" is reported as not
        categorical.
    """
    # Category and object dtypes are always treated as categorical.
    if series.dtype.name in ("category", "object"):
        return True
    n_values = len(series)
    # With no values the ratio is undefined; avoid dividing by zero.
    if n_values == 0:
        return False
    unique_ratio = len(series.unique()) / n_values
    return bool(unique_ratio < threshold)