Coverage for credoai/utils/common.py: 60%
102 statements
« prev ^ index » next coverage.py v6.5.0, created at 2022-12-08 07:32 +0000
« prev ^ index » next coverage.py v6.5.0, created at 2022-12-08 07:32 +0000
1import collections
2import hashlib
3import json
4from pathlib import Path
5from typing import Any, Dict
7import numpy as np
8import pandas as pd
9from sklearn.utils import check_array
class NotRunError(Exception):
    """Exception type signalling a "not run" condition.

    Carries no extra state; the raise sites live outside this block.
    """
class ValidationError(Exception):
    """Exception type signalling that a validation check failed.

    Carries no extra state beyond the standard exception arguments.
    """
class InstallationError(Exception):
    """Exception type signalling an installation-related problem.

    Carries no extra state; the raise sites live outside this block.
    """
class IntegrationError(Exception):
    """Exception type signalling an integration-related problem.

    Carries no extra state; the raise sites live outside this block.
    """
class SupressSettingWithCopyWarning:
    """Context manager that temporarily silences pandas' chained-assignment check.

    On entry the ``mode.chained_assignment`` option is set to ``None``.
    On exit the value that was active *before* entry is restored, so a
    caller that had configured ``"raise"`` (or had already disabled the
    check) does not get silently switched back to ``"warn"``.
    """

    def __init__(self):
        # Stack of saved option values so one instance can be re-entered/nested.
        self._saved_modes = []

    def __enter__(self):
        self._saved_modes.append(pd.options.mode.chained_assignment)
        pd.options.mode.chained_assignment = None

    def __exit__(self, *args):
        # Restore whatever the caller had configured rather than assuming "warn".
        pd.options.mode.chained_assignment = self._saved_modes.pop()
def check_subset(subset, superset):
    """Check whether one dictionary, list or set is a subset of another.

    Handles nested dictionaries: a dict is a subset when every key is
    present in ``superset`` and each value is either equal to the
    corresponding superset value or is itself a dict/list/set that is a
    subset of it.

    Parameters
    ----------
    subset : dict, list or set
        Candidate subset.
    superset : dict, list or set
        Container that ``subset`` is checked against. Must be of exactly the
        same type as ``subset``; otherwise the result is False.

    Returns
    -------
    bool
        True if ``subset`` is contained in ``superset``. Same-typed inputs
        that are not dict, list or set fall through and yield True.
    """
    if type(subset) is not type(superset):
        return False
    if isinstance(subset, dict):
        for key, value in subset.items():
            # A missing key can never match, even when the subset value is
            # None (``dict.get`` would otherwise make the two look equal).
            if key not in superset:
                return False
            superset_value = superset[key]
            if superset_value == value:
                continue
            if type(value) is not type(superset_value):
                return False
            if not isinstance(value, (dict, list, set)):
                return False
            if not check_subset(value, superset_value):
                return False
        return True
    if isinstance(subset, (list, set)):
        try:
            return set(subset) <= set(superset)
        except TypeError:
            # Unhashable elements (e.g. a list of dicts) cannot be put in a
            # set; fall back to element-wise membership by equality.
            return all(item in superset for item in subset)
    return True
def check_array_like(array):
    """Validate that ``array`` is array-like.

    pandas DataFrames and Series are accepted as-is. Any other object is
    passed to scikit-learn's ``check_array`` with ``ensure_2d=False``.

    Parameters
    ----------
    array : object
        Object to validate.

    Raises
    ------
    ValidationError
        If ``check_array`` rejects the input with a ``ValueError``.
    """
    if isinstance(array, (pd.DataFrame, pd.Series)):
        return
    try:
        check_array(array, ensure_2d=False)
    except ValueError as err:
        # Chain the sklearn error so the underlying reason stays visible.
        raise ValidationError(
            "Expected array-like (e.g., list, numpy array, pandas series/dataframe)"
        ) from err
def get_project_root() -> Path:
    """Return the directory two levels above this file."""
    containing_dir = Path(__file__).parent
    return containing_dir.parent
def flatten_list(lst):
    """Flatten one level of nesting: concatenate the items of each sub-iterable."""
    flattened = []
    for inner in lst:
        flattened.extend(inner)
    return flattened
def update_dictionary(d, u):
    """Recursively update dictionary ``d`` in place with the contents of ``u``.

    Merge rules, applied per key of ``u``:

    * mapping value: merged recursively into the existing mapping at that
      key; if the existing value is missing or not a mapping, the merge
      starts from an empty dict.
    * list value: the new list is placed *in front of* the existing list;
      if the existing value is missing or not a list, the new list (copied)
      replaces it.
    * anything else: overwrites the existing value.

    Parameters
    ----------
    d : dict
        Dictionary to update (mutated in place).
    u : Mapping
        Updates to apply.

    Returns
    -------
    dict
        The updated ``d``.
    """
    for key, value in u.items():
        if isinstance(value, collections.abc.Mapping):
            current = d.get(key)
            # A non-mapping value cannot be merged into; start fresh instead
            # of failing on item assignment.
            if not isinstance(current, collections.abc.Mapping):
                current = {}
            d[key] = update_dictionary(current, value)
        elif isinstance(value, list):
            current = d.get(key)
            if isinstance(current, list):
                d[key] = value + current
            else:
                # Nothing list-like to extend; store a copy of the new list.
                d[key] = list(value)
        else:
            d[key] = value
    return d
def wrap_list(obj):
    """Ensure the object is an iterable.

    Strings (including ``str`` subclasses) are treated as single values and
    wrapped in a list rather than being iterated character by character.
    ``None`` is passed through unchanged. Any other object is returned as-is
    when it supports iteration, and wrapped in a one-element list otherwise.

    Parameters
    ----------
    obj : object
        Value to wrap.

    Returns
    -------
    iterable or None
        ``obj`` itself, ``[obj]``, or ``None``.
    """
    if obj is None:
        return None
    # isinstance (not an exact type comparison) so str subclasses are wrapped
    # too instead of slipping through as iterables of characters.
    if isinstance(obj, str):
        return [obj]
    try:
        iter(obj)
    except TypeError:
        return [obj]
    return obj
def remove_suffix(text, suffix):
    """Return ``text`` without a trailing ``suffix``; unchanged if absent or empty."""
    if not text.endswith(suffix):
        return text
    suffix_length = len(suffix)
    if suffix_length == 0:
        return text
    return text[:-suffix_length]
def humanize_label(s):
    """Turn an underscore-separated label into a space-separated, title-cased one."""
    words = s.split("_")
    spaced = " ".join(words)
    return spaced.title()
class CredoEncoder(json.JSONEncoder):
    """Special json encoder for numpy types.

    Converts numpy booleans, integers, floats and arrays to their plain
    Python equivalents; everything else is deferred to the base encoder,
    which raises ``TypeError`` for unsupported objects.
    """

    def default(self, obj):
        # numpy encoders
        if isinstance(obj, np.bool_):
            # np.bool_ is neither a Python bool nor an np.integer, so it
            # needs its own branch to be serializable.
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)
def json_dumps(obj):
    """Serialize ``obj`` to a JSON string using CredoEncoder and a 2-space indent."""
    serialized = json.dumps(obj, indent=2, cls=CredoEncoder)
    return serialized
def dict_hash(dictionary: Dict[str, Any]) -> str:
    """Return the MD5 hex digest of a dictionary's JSON representation."""
    # Keys are sorted during serialization so that {'a': 1, 'b': 2} and
    # {'b': 2, 'a': 1} produce the same digest.
    canonical = json.dumps(dictionary, sort_keys=True)
    return hashlib.md5(canonical.encode()).hexdigest()
def to_array(lst):
    """
    Converts list-like object to array

    Subclasses of the supported types are accepted as well.

    Parameters
    ----------
    lst : (List, pandas.Series, numpy.ndarray)
        The list-like to be converted

    Returns
    -------
    numpy.ndarray
        ``lst.values`` for a Series, ``np.array(lst)`` for a list, and the
        input itself for an ndarray.

    Raises
    ------
    TypeError
        If ``lst`` is not a list, pandas Series or numpy array.
    """
    if isinstance(lst, pd.Series):
        return lst.values
    if isinstance(lst, list):
        return np.array(lst)
    if isinstance(lst, np.ndarray):
        return lst
    raise TypeError(
        f"Cannot convert object of type {type(lst).__name__!r} to an array; "
        "expected list, pandas.Series or numpy.ndarray"
    )
def is_categorical(series, threshold=0.05):
    """Identifies whether a series is categorical or not

    Logic: If there are relatively few unique values for a feature, the feature is likely categorical.
    The results are estimates and are not guaranteed to be correct.

    Parameters
    ----------
    series : pd.Series
        Series to evaluate
    threshold : float
        The threshold (number of the unique values over the total number of values)
        below which the series is considered categorical

    Returns
    -------
    bool
        Whether the series is categorical or not. An empty series whose
        dtype is neither "category" nor "object" is reported as not
        categorical.
    """
    if series.dtype.name in ["category", "object"]:
        return True
    n_values = len(series)
    if n_values == 0:
        # The unique-value ratio is undefined without observations; avoid
        # dividing by zero.
        return False
    # Any other dtype (ints and floats alike) is judged purely by the ratio
    # of unique values to total values.
    return len(series.unique()) / n_values < threshold