Coverage for credoai/utils/dataset_utils.py

1import numpy as np

2import pandas as pd

3from sklearn import feature_extraction, feature_selection, impute, pipeline

class ColumnTransformerUtil:
    """Utility functions for ColumnTransformer

    ColumnTransformer is a helpful preprocessing utility from sklearn.
    However, it makes getting the original feature names difficult, which
    makes interpreting feature importance hard. This utility class
    defines a `get_ct_feature_names` function which takes in a
    ColumnTransformer instance and outputs a list of feature names.

    Ref: https://stackoverflow.com/a/57534118
    """

    @staticmethod
    def get_feature_out(estimator, feature_in):
        """Return the feature names produced by a single estimator

        Parameters
        ----------
        estimator : object
            A fitted transformer (or any other object placed in a
            ColumnTransformer slot).
        feature_in : list-like
            Names of the features fed into `estimator`.

        Returns
        -------
        list-like
            Output feature names. If the estimator exposes none of the
            recognised naming hooks, `feature_in` is returned unchanged.
        """
        if hasattr(estimator, "get_feature_names_out"):
            if isinstance(estimator, feature_extraction.text._VectorizerMixin):
                # handling all vectorizers: their names come from the learned
                # vocabulary, so the input column name is not passed in
                return [f"vec_{f}" for f in estimator.get_feature_names_out()]
            return estimator.get_feature_names_out(feature_in)
        if hasattr(estimator, "get_feature_names"):
            # older naming API
            return estimator.get_feature_names(feature_in)
        if isinstance(estimator, feature_selection._base.SelectorMixin):
            # selectors keep a subset of the inputs; mask the input names
            return np.array(feature_in)[estimator.get_support()]
        return feature_in

    @staticmethod
    def get_ct_feature_names(ct):
        """Return the output feature names of a fitted ColumnTransformer

        Parameters
        ----------
        ct : ColumnTransformer
            Fitted instance; its `transformers_` attribute is iterated.

        Returns
        -------
        list
            Feature names, in the order the transformers are listed.
        """
        # handles all estimators, pipelines inside ColumnTransformer
        # a passthrough remainder requires the input column names, which are
        # only available when the transformer was fitted on named columns.
        output_features = []

        for name, estimator, features in ct.transformers_:
            if isinstance(estimator, str) and estimator == "drop":
                # dropped columns never reach the output, so they must not
                # contribute any feature names
                continue
            if name != "remainder":
                if isinstance(estimator, pipeline.Pipeline):
                    # thread the names through every step of the pipeline
                    current_features = features
                    for step in estimator:
                        current_features = ColumnTransformerUtil.get_feature_out(
                            step, current_features
                        )
                    features_out = current_features
                else:
                    features_out = ColumnTransformerUtil.get_feature_out(
                        estimator, features
                    )
                output_features.extend(features_out)
            elif estimator == "passthrough":
                # prefer the public attribute; fall back to the private one
                # this code originally relied on
                input_names = getattr(ct, "feature_names_in_", None)
                if input_names is None:
                    input_names = ct._feature_names_in
                output_features.extend(input_names[features])
        return output_features

def scrub_data(credo_data, nan_strategy="ignore"):
    """Return scrubbed data

    Implements NaN strategy indicated by nan_strategy before returning
    X, y and sensitive_features dataframes/series.

    Parameters
    ----------
    credo_data : CredoData
        Data object
    nan_strategy : str or callable, optional
        The strategy for dealing with NaNs.

        -- If "ignore" do nothing,
        -- If "drop" drop any rows with any NaNs. X must be a pd.DataFrame
        -- If any other string, pass to the "strategy" argument of `Simple Imputer <https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html>`_.

        You can also supply your own imputer with
        the same API as `SimpleImputer <https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html>`_.

    Returns
    -------
    tuple
        ``(X, y, sensitive_features)`` after the NaN strategy was applied.
        When ``credo_data.X_type`` is neither ``pd.DataFrame`` nor
        ``np.ndarray`` nothing is scrubbed and ``credo_data`` itself is
        returned unchanged.

    Raises
    ------
    TypeError
        If nan_strategy is "drop" and X is not a pd.DataFrame
    ValueError
        ValueError raised for nan_strategy cannot be used by SimpleImputer
    """
    if credo_data.X_type not in (pd.DataFrame, np.ndarray):
        # unsupported container: hand the data object back untouched
        return credo_data
    X, y, sensitive_features = credo_data.get_data().values()
    imputed = None
    if nan_strategy == "drop":
        if credo_data.X_type == pd.DataFrame:
            # determine index of no nan rows
            tmp = pd.concat([X, y, sensitive_features], axis=1).dropna()
            # apply dropped index
            X = X.loc[tmp.index]
            if y is not None:
                y = y.loc[tmp.index]
            if sensitive_features is not None:
                sensitive_features = sensitive_features.loc[tmp.index]
        else:
            raise TypeError("X must be a pd.DataFrame when using the drop option")
    elif nan_strategy == "ignore":
        pass
    elif isinstance(nan_strategy, str):
        try:
            imputer = impute.SimpleImputer(strategy=nan_strategy)
            imputed = imputer.fit_transform(X)
        except ValueError as err:
            # keep the original failure attached as the cause
            raise ValueError(
                "Nan_strategy could not be successfully passed to SimpleImputer as a 'strategy' argument"
            ) from err
    else:
        imputed = nan_strategy.fit_transform(X)
    # `imputed` is an array: it must be compared against None, because the
    # truth value of a multi-element array is ambiguous and raises.
    if imputed is not None:
        if credo_data.X_type == pd.DataFrame:
            # write into a copy so the caller's frame is left untouched and
            # the index/columns are preserved
            X = X.copy()
            X.iloc[:, :] = imputed
        else:
            # ndarray input has no .iloc; the imputed array is the result
            X = np.asarray(imputed)
    return X, y, sensitive_features

120

121

def empirical_distribution_curve(values, down_sampling_step=1, variable_name="x"):
    """Build the empirical cumulative distribution of a set of values

    Parameters
    ----------
    values : array
        list of values
    down_sampling_step : int, optional
        keep every `down_sampling_step`-th sorted value,
        by default 1 (i.e., no down-sampling)
    variable_name : str, optional
        name of the column holding the values, by default 'x'

    Returns
    -------
    pd.DataFrame
        Two columns: `variable_name` with the sorted (and down-sampled)
        values, and "cumulative_probability" with ``i / n`` for the i-th
        retained value (zero-based), where n is the number of retained values.
    """
    ordered = np.sort(values)
    retained = ordered[::down_sampling_step]
    n_points = len(retained)
    ranks = np.arange(n_points)
    curve = {
        variable_name: retained,
        "cumulative_probability": ranks / float(n_points),
    }
    return pd.DataFrame(curve)

Coverage for credoai/utils/dataset_utils.py: 43%

61 statements