Coverage for credoai/utils/dataset_utils.py: 43%

61 statements  

« prev     ^ index     » next       coverage.py v7.1.0, created at 2023-02-13 21:56 +0000

1import numpy as np 

2import pandas as pd 

3from sklearn import feature_extraction, feature_selection, impute, pipeline 

4 

5 

class ColumnTransformerUtil:
    """Utility functions for ColumnTransformer

    ColumnTransformer is a helpful preprocessing utility from sklearn.
    However, it makes getting the original feature names difficult, which
    makes interpreting feature importance hard. This utility class
    defines a `get_ct_feature_names` function which takes in a fitted
    ColumnTransformer instance and outputs a list of feature names.

    Ref: https://stackoverflow.com/a/57534118
    """

    @staticmethod
    def get_feature_out(estimator, feature_in):
        """Return the feature names produced by a single fitted step.

        Parameters
        ----------
        estimator : object
            A fitted transformer/selector, or any other object (for example
            the string "passthrough").
        feature_in : sequence
            Names of the features fed into ``estimator``.

        Returns
        -------
        sequence
            Output feature names. Depending on the branch taken this is a
            list, whatever the estimator's own name getter returns, a numpy
            array, or ``feature_in`` itself when the estimator offers no way
            to derive names.
        """
        if hasattr(estimator, "get_feature_names_out"):
            if isinstance(estimator, feature_extraction.text._VectorizerMixin):
                # Vectorizers build their own vocabulary, so the input names
                # are not forwarded; the outputs are tagged with "vec_".
                return [f"vec_{f}" for f in estimator.get_feature_names_out()]
            return estimator.get_feature_names_out(feature_in)
        if hasattr(estimator, "get_feature_names"):
            # Older estimator API.
            return estimator.get_feature_names(feature_in)
        if isinstance(estimator, feature_selection._base.SelectorMixin):
            # Selectors keep a subset of the inputs: apply their support mask.
            return np.array(feature_in)[estimator.get_support()]
        return feature_in

    @staticmethod
    def get_ct_feature_names(ct):
        """Return the output feature names of a fitted ColumnTransformer.

        Handles plain estimators and pipelines inside the ColumnTransformer.
        Columns routed to a ``"drop"`` transformer produce no output and are
        skipped. A ``remainder="passthrough"`` needs the input column names,
        which are read from the transformer itself.

        Parameters
        ----------
        ct : ColumnTransformer
            Fitted instance; ``ct.transformers_`` is iterated as
            ``(name, estimator, features)`` triples.

        Returns
        -------
        list
            Output feature names, in the order of ``ct.transformers_``.
        """
        output_features = []

        for name, estimator, features in ct.transformers_:
            if name != "remainder":
                if isinstance(estimator, str) and estimator == "drop":
                    # Dropped columns never reach the output, so they must
                    # not be reported as features.
                    continue
                if isinstance(estimator, pipeline.Pipeline):
                    # Thread the names through every step of the pipeline.
                    current_features = features
                    for step in estimator:
                        current_features = ColumnTransformerUtil.get_feature_out(
                            step, current_features
                        )
                    features_out = current_features
                else:
                    features_out = ColumnTransformerUtil.get_feature_out(
                        estimator, features
                    )
                output_features.extend(features_out)
            elif isinstance(estimator, str) and estimator == "passthrough":
                # Prefer the public attribute; fall back to the legacy
                # private one for objects that only expose that.
                input_names = getattr(ct, "feature_names_in_", None)
                if input_names is None:
                    input_names = ct._feature_names_in
                output_features.extend(np.asarray(input_names)[features])
        return output_features

57 

58 

def scrub_data(credo_data, nan_strategy="ignore"):
    """Return scrubbed data

    Implements the NaN strategy indicated by nan_strategy before returning
    X, y and sensitive_features.

    Parameters
    ----------
    credo_data : CredoData
        Data object. Its ``X_type`` attribute is inspected and the values of
        the mapping returned by ``get_data()`` are unpacked, in order, as
        X, y and sensitive_features.
    nan_strategy : str or callable, optional
        The strategy for dealing with NaNs.

        -- If "ignore" do nothing,
        -- If "drop" drop any rows with any NaNs. X must be a pd.DataFrame
        -- If any other string, pass to the "strategy" argument of `Simple Imputer <https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html>`_.

        You can also supply your own imputer with
        the same API as `SimpleImputer <https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html>`_.
        Only its ``fit_transform`` method is called.

    Returns
    -------
    tuple or CredoData
        ``(X, y, sensitive_features)`` after applying the strategy. When
        imputation is performed, X is a new object and the input X is left
        unmodified. If ``credo_data.X_type`` is neither ``pd.DataFrame`` nor
        ``np.ndarray``, ``credo_data`` itself is returned untouched.

    Raises
    ------
    TypeError
        If nan_strategy is "drop" and X is not a pd.DataFrame.
    ValueError
        If nan_strategy is a string that SimpleImputer cannot use.
    """
    if credo_data.X_type not in (pd.DataFrame, np.ndarray):
        return credo_data
    X, y, sensitive_features = credo_data.get_data().values()
    imputed = None
    if nan_strategy == "drop":
        if credo_data.X_type != pd.DataFrame:
            raise TypeError("X must be a pd.DataFrame when using the drop option")
        # Rows surviving dropna() across X, y and sensitive_features together
        # define the index to keep in every returned object.
        kept_index = pd.concat([X, y, sensitive_features], axis=1).dropna().index
        X = X.loc[kept_index]
        if y is not None:
            y = y.loc[kept_index]
        if sensitive_features is not None:
            sensitive_features = sensitive_features.loc[kept_index]
    elif nan_strategy == "ignore":
        pass
    elif isinstance(nan_strategy, str):
        try:
            imputer = impute.SimpleImputer(strategy=nan_strategy)
            imputed = imputer.fit_transform(X)
        except ValueError as err:
            raise ValueError(
                "Nan_strategy could not be successfully passed to SimpleImputer as a 'strategy' argument"
            ) from err
    else:
        imputed = nan_strategy.fit_transform(X)
    # `imputed` is array-like: testing its truthiness directly raises for
    # multi-element arrays, so compare against None explicitly.
    if imputed is not None:
        if isinstance(X, pd.DataFrame):
            # Keep the original index/columns; work on a copy so the caller's
            # frame is not mutated.
            X = X.copy()
            X.iloc[:, :] = imputed
        else:
            # ndarray input has no .iloc; the imputed array is the result.
            X = np.asarray(imputed)
    return X, y, sensitive_features

120 

121 

def empirical_distribution_curve(values, down_sampling_step=1, variable_name="x"):
    """Build an empirical cumulative distribution table for a set of values.

    Parameters
    ----------
    values : array
        Values to summarise.
    down_sampling_step : int, optional
        Keep every ``down_sampling_step``-th sorted value, by default 1
        (i.e., no down-sampling).
    variable_name : str, optional
        Name of the column holding the sorted values, by default 'x'.

    Returns
    -------
    pd.DataFrame
        Two columns: ``variable_name`` with the sorted (and down-sampled)
        values, and "cumulative_probability" with ``i / n`` for the i-th
        retained value, where n is the number of retained values.
    """
    ordered = np.sort(values)
    retained = ordered[::down_sampling_step]
    n_points = len(retained)
    # Probabilities are computed after down-sampling, so they run from 0 up
    # to (n_points - 1) / n_points.
    cumulative = np.arange(n_points) / float(n_points)
    return pd.DataFrame(
        {variable_name: retained, "cumulative_probability": cumulative}
    )