Coverage for credoai/utils/dataset_utils.py: 43%
61 statements
« prev ^ index » next coverage.py v7.1.0, created at 2023-02-13 21:56 +0000
« prev ^ index » next coverage.py v7.1.0, created at 2023-02-13 21:56 +0000
1import numpy as np
2import pandas as pd
3from sklearn import feature_extraction, feature_selection, impute, pipeline
class ColumnTransformerUtil:
    """Utility functions for ColumnTransformer

    ColumnTransformer is a helpful preprocessing utility from sklearn.
    However, it makes getting the original feature names difficult, which
    makes interpreting feature importance hard. This utility class
    defines a `get_ct_feature_names` function which takes in a fitted
    ColumnTransformer instance and outputs a list of feature names.

    Ref: https://stackoverflow.com/a/57534118
    """

    @staticmethod
    def get_feature_out(estimator, feature_in):
        """Return the output feature names of a single transformation step.

        Parameters
        ----------
        estimator : object
            A fitted transformer (or any other object, e.g. the string
            "passthrough").
        feature_in : list-like
            Names of the features fed into ``estimator``.

        Returns
        -------
        list-like
            -- Vectorizers: their vocabulary names prefixed with ``vec_``.
            -- Estimators exposing ``get_feature_names_out`` or the older
               ``get_feature_names``: whatever that method returns for
               ``feature_in``.
            -- Feature selectors: ``feature_in`` masked by ``get_support()``.
            -- Anything else: ``feature_in`` unchanged.
        """
        if hasattr(estimator, "get_feature_names_out"):
            if isinstance(estimator, feature_extraction.text._VectorizerMixin):
                # handling all vectorizers: their output names do not depend
                # on the input column names
                return [f"vec_{f}" for f in estimator.get_feature_names_out()]
            return estimator.get_feature_names_out(feature_in)
        if hasattr(estimator, "get_feature_names"):
            return estimator.get_feature_names(feature_in)
        if isinstance(estimator, feature_selection._base.SelectorMixin):
            return np.array(feature_in)[estimator.get_support()]
        return feature_in

    @staticmethod
    def get_ct_feature_names(ct):
        """Return the output feature names of a fitted ColumnTransformer.

        Handles plain estimators and pipelines inside the ColumnTransformer.
        Transformers configured as "drop" contribute no names. A
        "passthrough" remainder requires the input column names, which are
        read from the transformer itself.

        Parameters
        ----------
        ct : ColumnTransformer
            Fitted instance; its ``transformers_`` attribute is iterated.

        Returns
        -------
        list
            Feature names in the order the transformers are listed.

        Raises
        ------
        AttributeError
            If the remainder is "passthrough" but ``ct`` exposes neither
            ``_feature_names_in`` nor ``feature_names_in_``.
        """
        output_features = []

        for name, estimator, features in ct.transformers_:
            if name == "remainder":
                if isinstance(estimator, str) and estimator == "passthrough":
                    # Older sklearn stored the input names on the private
                    # `_feature_names_in`; newer releases expose the public
                    # `feature_names_in_`. Accept either.
                    names_in = getattr(ct, "_feature_names_in", None)
                    if names_in is None:
                        names_in = getattr(ct, "feature_names_in_", None)
                    if names_in is None:
                        raise AttributeError(
                            "ColumnTransformer has no input feature names; "
                            "cannot resolve 'passthrough' remainder columns"
                        )
                    output_features.extend(np.asarray(names_in)[features])
                continue

            if isinstance(estimator, str) and estimator == "drop":
                # dropped columns never reach the output
                continue

            if isinstance(estimator, pipeline.Pipeline):
                # thread the names through every step of the pipeline
                features_out = features
                for step in estimator:
                    features_out = ColumnTransformerUtil.get_feature_out(
                        step, features_out
                    )
            else:
                features_out = ColumnTransformerUtil.get_feature_out(
                    estimator, features
                )

            if isinstance(features_out, str):
                # a scalar column selector is one name, not a sequence of
                # characters
                features_out = [features_out]
            output_features.extend(features_out)

        return output_features
def scrub_data(credo_data, nan_strategy="ignore"):
    """Return scrubbed data

    Implements NaN strategy indicated by nan_strategy before returning
    X, y and sensitive_features dataframes/series.

    Parameters
    ----------
    credo_data : CredoData
        Data object. Must expose ``X_type`` and ``get_data()``; the latter
        returns a mapping whose values are X, y and sensitive_features
        (in that order).
    nan_strategy : str or callable, optional
        The strategy for dealing with NaNs.

        -- If "ignore" do nothing,
        -- If "drop" drop any rows with any NaNs. X must be a pd.DataFrame
        -- If any other string, pass to the "strategy" argument of `Simple Imputer <https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html>`_.

        You can also supply your own imputer with
        the same API as `SimpleImputer <https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html>`_.

    Returns
    -------
    tuple or CredoData
        ``(X, y, sensitive_features)`` after applying the strategy. The
        input X is never modified in place. If ``credo_data.X_type`` is
        neither pd.DataFrame nor np.ndarray, ``credo_data`` itself is
        returned untouched.

    Raises
    ------
    TypeError
        If nan_strategy is "drop" and X is not a pd.DataFrame.
    ValueError
        ValueError raised for nan_strategy cannot be used by SimpleImputer
    """
    if credo_data.X_type not in (pd.DataFrame, np.ndarray):
        return credo_data

    X, y, sensitive_features = credo_data.get_data().values()
    imputed = None

    if nan_strategy == "drop":
        if credo_data.X_type != pd.DataFrame:
            raise TypeError("X must be a pd.DataFrame when using the drop option")
        # determine index of rows without NaNs across all provided objects
        # (pd.concat silently skips the ones that are None)
        kept_index = pd.concat([X, y, sensitive_features], axis=1).dropna().index
        # apply dropped index
        X = X.loc[kept_index]
        if y is not None:
            y = y.loc[kept_index]
        if sensitive_features is not None:
            sensitive_features = sensitive_features.loc[kept_index]
    elif nan_strategy == "ignore":
        pass
    elif isinstance(nan_strategy, str):
        try:
            imputer = impute.SimpleImputer(strategy=nan_strategy)
            imputed = imputer.fit_transform(X)
        except ValueError as err:
            raise ValueError(
                "Nan_strategy could not be successfully passed to SimpleImputer as a 'strategy' argument"
            ) from err
    else:
        imputed = nan_strategy.fit_transform(X)

    # `imputed` is an array: test against None explicitly, since the truth
    # value of a multi-element array is ambiguous and raises.
    if imputed is not None:
        if isinstance(X, pd.DataFrame):
            # keep the original index/columns, replace only the values
            X = X.copy()
            X.iloc[:, :] = imputed
        else:
            # ndarray input has no labels to preserve
            X = imputed
    return X, y, sensitive_features
def empirical_distribution_curve(values, down_sampling_step=1, variable_name="x"):
    """Build the empirical distribution curve of a collection of values.

    The values are sorted in ascending order, optionally thinned by keeping
    every ``down_sampling_step``-th element, and each retained value is
    paired with ``i / n``, where ``i`` is its zero-based rank among the
    retained values and ``n`` is the number of retained values.

    Parameters
    ----------
    values : array
        list of values
    down_sampling_step : int, optional
        down-sampling step, by default 1 (i.e., no down-sampling)
    variable_name : str, optional
        name of the column holding the sorted values, by default 'x'

    Returns
    -------
    pd.DataFrame
        Two columns: ``variable_name`` with the sorted (down-sampled)
        values and "cumulative_probability" with the matching fractions.
    """
    ordered = np.sort(values)
    sampled = ordered[::down_sampling_step]
    n_points = len(sampled)
    # fraction of retained points strictly below each rank
    fractions = np.arange(n_points) / float(n_points)
    columns = {variable_name: sampled, "cumulative_probability": fractions}
    return pd.DataFrame(columns)