diff --git a/anai/__init__.py b/anai/__init__.py
index a9fe722..02ecfd3 100644
--- a/anai/__init__.py
+++ b/anai/__init__.py
@@ -30,6 +30,7 @@ def run(
     df=None,
     target: str = None,
     filepath: str = None,
+    df_kwargs: dict = {},
     config: bool = False,
     except_columns: list = [],
     predictor: list = [],
@@ -64,6 +65,10 @@ def run(
         DataFrame to be used for modelling.
     target : str
         Target Column Name
+    filepath : str
+        Filepath of the dataframe to be loaded.
+    df_kwargs : dict
+        Keyword arguments for the dataframe loading function. Only used if filepath is not None.
     except_columns : list, optional
         List of Columns to be excluded from the dataset
     predictor : list
@@ -137,6 +142,7 @@ def run(
         ai = anai.run(
             filepath='examples/Folds5x2_pp.xlsx',
+            df_kwargs={'sheet_name': 'Sheet1'},
             target='PE',
             predictor=['lin'],
         )
@@ -166,7 +172,7 @@ def run(
             raise FileNotFoundError("ANAI Config File Not Found")
         if df is None:
             if filepath is not None:
-                df = df_loader(filepath, suppress=True)
+                df = df_loader(filepath, suppress=True, df_kwargs=df_kwargs)
             else:
                 raise ValueError("Please provide a dataframe or a filepath")
         if __task(df, target) and not suppress_task_detection or task == "regression":
@@ -182,6 +188,7 @@ def run(
                 df=df,
                 target=target,
                 filepath=filepath,
+                df_kwargs=df_kwargs,
                 config=config,
                 except_columns=except_columns,
                 predictor=predictor,
@@ -206,6 +213,7 @@ def run(
                 metric=metric,
                 ensemble=ensemble,
             )
+            regressor.fit()
             return regressor
         elif (
             not __task(df, target)
@@ -224,6 +232,7 @@ def run(
                 df=df,
                 target=target,
                 filepath=filepath,
+                df_kwargs=df_kwargs,
                 config=config,
                 except_columns=except_columns,
                 predictor=predictor,
@@ -243,6 +252,7 @@ def run(
                 ensemble=ensemble,
                 exclude_models=exclude_models,
             )
+            classifier.fit()
             return classifier
     except KeyboardInterrupt:
         if os.path.exists(os.getcwd() + "/dask-worker-space"):
@@ -283,11 +293,12 @@ def __task(df, target):
     return False
 
 
-def load(df_filepath):
+def load(df_filepath, df_kwargs={}):
     """Loads a dataframe from a filepath.
 
     Args:
         df_filepath (str): Filepath of the dataframe to be loaded.
+        df_kwargs (dict): Keyword arguments forwarded to the underlying pandas reader.
 
     Returns:
         pd.DataFrame : Loaded dataframe.
@@ -296,12 +307,12 @@ def load(df_filepath):
     suppress = False
     if type(df_filepath) is str:
-        df = __df_loader_single(df_filepath, suppress=False)
+        df = __df_loader_single(df_filepath, suppress=False, df_kwargs=df_kwargs)
     elif type(df_filepath) is list:
         print(Fore.YELLOW + "Loading Data [*]\n")
         df = pd.concat(
             [
-                __df_loader_single(df_filepath[i], suppress=True)
+                __df_loader_single(df_filepath[i], suppress=True, df_kwargs=df_kwargs)
                 for i in range(len(df_filepath))
             ]
         )
diff --git a/anai/supervised/__init__.py b/anai/supervised/__init__.py
index 669e151..f56e44f 100644
--- a/anai/supervised/__init__.py
+++ b/anai/supervised/__init__.py
@@ -39,6 +39,7 @@ def __init__(
         df=None,
         target: str = None,
         filepath: str = None,
+        df_kwargs: dict = {},
         config: bool = False,
         except_columns: list = [],
         predictor: list =["lr"],
@@ -68,11 +69,16 @@ def __init__(
         Parameters:
-            features : array
-                features array
-            lables : array
-                labels array
-            except_columns (list): [List of Columns to be excluded from the dataset]
+            df : Pandas DataFrame
+                DataFrame to be used for modelling.
+            target : str
+                Target Column Name
+            filepath : str
+                Filepath of the dataframe to be loaded.
+            df_kwargs : dict
+                Keyword arguments for the dataframe loading function. Only used if filepath is not None.
+            except_columns : list, optional
+                List of Columns to be excluded from the dataset
             predictor : list
                 Predicting model to be used
                 Default ['lr']  - Logistic Regression\n
@@ -116,7 +122,7 @@ def __init__(
             params : dict
                 contains parameters for model
             tune : boolean
-                when True Applies GridSearch CrossValidation
+                when True Applies Optuna to find the best parameters for the model
                 Default is False
             test_size: float or int, default=.2
                 If float, should be between 0.0 and 1.0 and represent
@@ -125,21 +131,18 @@ def __init__(
                 If int, represents the absolute number of test samples.
             cv_folds : int
                 No. of cross validation folds. Default = 10
-            pca : str
-                if 'y' will apply PCA on Train and Validation set. Default = 'n'
+            pca : bool
+                if True will apply PCA on Train and Validation set. Default = False
-            lda : str
-                if 'y' will apply LDA on Train and Validation set. Default = 'n'
+            lda : bool
+                if True will apply LDA on Train and Validation set. Default = False
             pca_kernel : str
                 Kernel to be used in PCA. Default = 'linear'
             n_components_lda : int
                 No. of components for LDA. Default = 1
             n_components_pca : int
                 No. of components for PCA. Default = 2
-            loss : str
-                loss method for ann. Default = 'binary_crossentropy'
-                rate for dropout layer. Default = 0
-            smote : str,
-                Whether to apply SMOTE. Default = 'y'
+            smote : bool
+                Whether to apply SMOTE. Default = True
             k_neighbors : int
                 No. of neighbors for SMOTE. Default = 1
             verbose : boolean
@@ -160,12 +163,8 @@ def __init__(
                 minimize : Minimize
             optuna_n_trials : int
                 No. of trials for optuna. Default = 100
-            optuna_metric: str
-                Metric to be used in optuna. Default = 'R^2'
-            lgbm_objective : str
-                Objective for lgbm classifier. Default = 'binary'
             ensemble : boolean
-                Whether to use ensemble. Default = True
+                Whether to use ensemble methods. Default = True
 
         Returns:
 
@@ -178,6 +177,7 @@ def __init__(
             import anai
             ai = anai.run(
                 filepath='examples/test_data.csv',
+                df_kwargs={'index_col': 'id'},
                 target='PE',
                 predictor=['lr'],
             )
@@ -196,7 +196,7 @@ def __init__(
             raise FileNotFoundError("ANAI Config File Not Found")
         if df is None:
             if filepath is not None:
-                df = df_loader(filepath)
+                df = df_loader(filepath, df_kwargs=df_kwargs)
             else:
                 raise ValueError("Please provide a dataframe or a filepath")
         if type(predictor) == list:
@@ -295,9 +295,8 @@ def __init__(
         self.dimension_handler = DimensionHandler()
         self.encoder = None
         self.le_encoder = None
-        self.__fit()
 
-    def __fit(self):
+    def fit(self):
         """[Takes Features and Labels and Encodes Categorical Data then Applies SMOTE ,
         Splits the features and labels in training and validation sets with test_size = .2
         scales X_train, self.X_val using StandardScaler.
         Fits every model on training set and predicts results,
@@ -760,9 +759,21 @@ def __load(self, path=None):
         else:
             raise ValueError("No path specified. Please provide actual path\n")
 
-    def explain(self, method):
+    def explain(self, method, show_graph=True):
         """
-        Returns the importance features of the dataset
+        Explains the model using the specified method.
+
+        Args:
+            method (str): [Method to use for explaining the model]
+                Available methods:
+                    - shap
+                    - perm
+
+            show_graph (bool): [Whether to show the graph or not]
+
+        Returns:
+            [DataFrame or np.ndarray] : [Permutation importance DataFrame ('perm') or SHAP values ('shap')]
+
         """
         columns = self.features.columns
         self.explainer.set_params(
@@ -775,6 +786,7 @@ def explain(self, method):
             self.fit_params,
             False,
             columns,
+            show_graph,
         )
         if self.pred_mode == "all":
             classifier = copy.deepcopy(self.best_classifier.model)
@@ -804,6 +816,7 @@ def __init__(
         df=None,
         target: str = None,
         filepath: str = None,
+        df_kwargs: dict = {},
         config: bool = False,
         except_columns: list = [],
         predictor: list = ["lin"],
@@ -833,6 +846,10 @@ def __init__(
         Parameters:
             df (dataframe): [Dataset containing features and target]
             target (str): [Target Column Name]
+            filepath : str
+                Filepath of the dataframe to be loaded.
+            df_kwargs : dict
+                Keyword arguments for the dataframe loading function. Only used if filepath is not None.
             except_columns (list): [List of Columns to be excluded from the dataset]
             predictor : list
                 Predicting models to be used
@@ -931,6 +948,7 @@ def __init__(
             ai = anai.run(
                 filepath='examples/Folds5x2_pp.xlsx',
+                df_kwargs={'sheet_name': 'Sheet1'},
                 target='PE',
                 predictor=['lin'],
             )
@@ -949,7 +967,7 @@ def __init__(
             raise FileNotFoundError("ANAI Config File Not Found")
         if df is None:
             if filepath is not None:
-                df = df_loader(filepath)
+                df = df_loader(filepath, df_kwargs=df_kwargs)
             # elif config_filepath is not None:
             #     df, target = load_data_from_config(config_filepath)
             else:
                 raise ValueError("Please provide a dataframe or a filepath")
@@ -1048,9 +1066,8 @@ def __init__(
         self.encoder = None
         self.features = None
         self.labels = None
-        self.__fit()
 
-    def __fit(self):
+    def fit(self):
         """[Takes Features and Labels and Encodes Categorical Data then Applies SMOTE ,
         Splits the features and labels in training and validation sets with test_size = .2
         scales X_train, X_val using StandardScaler. Fits model on training set and predicts results, Finds R^2 Score and mean square error
@@ -1573,9 +1590,21 @@ def __load(self, path=None):
         else:
             raise ValueError("No path specified. Please provide actual path\n")
 
-    def explain(self, method):
+    def explain(self, method, show_graph=True):
         """
-        Returns the importance features of the dataset
+        Explains the model using the specified method.
+
+        Args:
+            method (str): [Method to use for explaining the model]
+                Available methods:
+                    - shap
+                    - perm
+
+            show_graph (bool): [Whether to show the graph or not]
+
+        Returns:
+            [DataFrame or np.ndarray] : [Permutation importance DataFrame ('perm') or SHAP values ('shap')]
+
         """
         self.explainer.set_params(
             self.features,
             self.X_train,
             self.y_train,
             self.X_val,
             self.y_val,
             self.cv_folds,
             self.fit_params,
+            show_graph=show_graph,
         )
         if self.pred_mode == "all":
             regressor = copy.deepcopy(self.best_regressor.model)
@@ -1595,14 +1625,12 @@ def explain(self, method):
             )
 
         print(Fore.YELLOW + "Explaining ANAI [*]\n")
-        if self.original_predictor == "all":
-            raise TypeError(
-                "[Error] This method is only applicable on single predictor"
-            )
-        elif method == "perm":
-            self.explainer.permutation(model=regressor)
+        if method == "perm":
+            res = self.explainer.permutation(model=regressor)
+            return res
         elif method == "shap":
-            self.explainer.shap(model=regressor)
+            res = self.explainer.shap(model=regressor)
+            return res
         else:
             raise NotImplementedError(
                "Technique not implemented. Please choose from perm, shap"
             )
diff --git a/anai/utils/connectors/data_handler.py b/anai/utils/connectors/data_handler.py
index ae366b8..1578dbe 100644
--- a/anai/utils/connectors/data_handler.py
+++ b/anai/utils/connectors/data_handler.py
@@ -8,7 +8,8 @@ def __df_loader_single(
     obj=None,
     objfilepath=None,
     suppress=False,
+    df_kwargs={},
 ):
     df = None
     flag = 0
     if obj is None:
@@ -20,32 +22,32 @@ def __df_loader_single(
         # print(path)
         flag = 1
     if path.endswith(".csv") or path.startswith("http") or path.startswith("https"):
-        df = pd.read_csv(df_filepath)
+        df = pd.read_csv(df_filepath, **df_kwargs)
     elif path.endswith(".xlsx"):
         if flag == 1:
-            df = pd.read_excel(io.BytesIO(df_filepath.read()))
+            df = pd.read_excel(io.BytesIO(df_filepath.read()), **df_kwargs)
         else:
-            df = pd.read_excel(df_filepath)
+            df = pd.read_excel(df_filepath, **df_kwargs)
     elif path.endswith(".pkl"):
-        df = pd.read_pickle(df_filepath)
+        df = pd.read_pickle(df_filepath, **df_kwargs)
     elif path.endswith(".h5"):
-        df = pd.read_hdf(df_filepath)
+        df = pd.read_hdf(df_filepath, **df_kwargs)
     elif path.endswith(".feather"):
-        df = pd.read_feather(df_filepath)
+        df = pd.read_feather(df_filepath, **df_kwargs)
     elif path.endswith(".parquet"):
-        df = pd.read_parquet(df_filepath)
+        df = pd.read_parquet(df_filepath, **df_kwargs)
     elif path.endswith(".json"):
-        df = pd.read_json(df_filepath)
+        df = pd.read_json(df_filepath, **df_kwargs)
     elif path.endswith(".html"):
-        df = pd.read_html(df_filepath)
+        df = pd.read_html(df_filepath, **df_kwargs)
     elif path.endswith(".stata"):
-        df = pd.read_stata(df_filepath)
+        df = pd.read_stata(df_filepath, **df_kwargs)
     elif path.endswith(".sas7bdat"):
-        df = pd.read_sas(df_filepath)
+        df = pd.read_sas(df_filepath, **df_kwargs)
     elif path.endswith(".msgpack"):
-        df = pd.read_msgpack(df_filepath)
+        df = pd.read_msgpack(df_filepath, **df_kwargs)
     elif path.endswith(".jsonl"):
-        df = pd.read_json(df_filepath, lines=True)
+        df = pd.read_json(df_filepath, lines=True, **df_kwargs)
     else:
         raise Exception(
            "File extension not supported. Use .csv, .xlsx, .pkl, .h5, .feather, .parquet, .json, .html, .stata, .sas7bdat, .msgpack, .jsonl OR use df argument"
@@ -63,14 +65,14 @@ def __df_loader_single(
     return df
 
 
-def df_loader(df_filepath, obj=None, objfilepath=None, suppress=False):
+def df_loader(df_filepath, obj=None, objfilepath=None, suppress=False, df_kwargs={}):
     if type(df_filepath) is str:
-        df = __df_loader_single(df_filepath, obj, objfilepath, suppress)
+        df = __df_loader_single(df_filepath, obj, objfilepath, suppress, df_kwargs=df_kwargs)
     elif type(df_filepath) is list:
         print(Fore.YELLOW + "Loading Data [*]\n")
         df = pd.concat(
             [
-                __df_loader_single(df_filepath[i], obj, objfilepath, True)
+                __df_loader_single(df_filepath[i], obj, objfilepath, True, df_kwargs=df_kwargs)
                 for i in range(len(df_filepath))
             ]
         )
diff --git a/anai/utils/explainable_anai/explain_core.py b/anai/utils/explainable_anai/explain_core.py
index 542076c..8a601ae 100644
--- a/anai/utils/explainable_anai/explain_core.py
+++ b/anai/utils/explainable_anai/explain_core.py
@@ -18,6 +18,7 @@ def __init__(self):
         self.isReg = None
         self.columns = None
         self.y_test = None
+        self.show_graph = None
 
     def set_params(
         self,
@@ -30,6 +31,7 @@ def set_params(
         fit_params={},
         isReg=True,
         columns=None,
+        show_graph=True,
     ):
         self.features = features
         self.X_train = X_train
@@ -40,16 +42,18 @@ def set_params(
         self.fit_params = fit_params
         self.isReg = isReg
         self.columns = columns
-
+        self.show_graph = show_graph
+
     def permutation(self, model):
         try:
-            permutational_feature_importance(
-                self.features.columns, self.X_train, self.y_train, model, self.isReg
+            res = permutational_feature_importance(
+                self.features.columns, self.X_train, self.y_train, model, self.isReg, self.show_graph
             )
+            return res
         except Exception as e:
             print(Fore.YELLOW + "Automatically switching to Surrogate mode\n")
             try:
-                permutational_feature_importance(
+                res = permutational_feature_importance(
                     self.features.columns,
                     self.X_train,
                     self.y_train,
                     surrogate_decision_tree(
                         model, self.X_train, self.y_train, isReg=self.isReg
                     ),
                     self.isReg,
+                    self.show_graph,
                 )
+                return res
             except Exception as e:
                 print(e)
                 print(traceback.format_exc())
@@ -66,15 +72,18 @@ def permutation(self, model):
 
     def shap(self, model):
         try:
-            shap_feature_importance(self.features.columns, self.X_train, model)
+            res = shap_feature_importance(self.features.columns, self.X_train, model, show_graph=self.show_graph)
+            return res
         except Exception as e:
             print(Fore.YELLOW + "Automatically switching to Surrogate mode\n")
             try:
-                shap_feature_importance(
+                res = shap_feature_importance(
                     self.features.columns,
                     self.X_train,
                     surrogate_decision_tree(model, self.X_train, isReg=self.isReg),
+                    show_graph=self.show_graph,
                 )
+                return res
             except Exception as e:
                 print(e)
                 print(traceback.format_exc())
diff --git a/anai/utils/explainable_anai/permutation.py b/anai/utils/explainable_anai/permutation.py
index ee470d8..08bba83 100644
--- a/anai/utils/explainable_anai/permutation.py
+++ b/anai/utils/explainable_anai/permutation.py
@@ -4,7 +4,7 @@
 import modin.pandas as pd
 
 
-def permutational_feature_importance(columns, X_test, y_test, model, isReg=False):
+def permutational_feature_importance(columns, X_test, y_test, model, isReg=False, show_graph=True):
     perm_importance = permutation_importance(
         model,
         X_test,
@@ -20,20 +20,22 @@ def permutational_feature_importance(columns, X_test, y_test, model, isReg=False
     a = dict(sorted(perm_dict.items(), key=lambda item: item[1], reverse=False))
     df1 = pd.DataFrame(a.items(), columns=["Column Name", "Permutation Value"])
     df1["Color"] = np.where(df1["Permutation Value"] < 0, "red", "green")
-    fig = go.Figure()
-    fig.add_trace(
-        go.Bar(
-            name="Net",
-            x=df1["Permutation Value"],
-            y=df1["Column Name"],
-            marker_color=df1["Color"],
-            orientation="h",
+    if show_graph:
+        fig = go.Figure()
+        fig.add_trace(
+            go.Bar(
+                name="Net",
+                x=df1["Permutation Value"],
+                y=df1["Column Name"],
+                marker_color=df1["Color"],
+                orientation="h",
+            )
         )
-    )
-    fig.update_layout(
-        template="plotly_dark",
-        title_text="Permutation Feature Importance",
-        xaxis_title="Permutation Value",
-        yaxis_title="Feature Name",
-    )
-    fig.show()
+        fig.update_layout(
+            template="plotly_dark",
+            title_text="Permutation Feature Importance",
+            xaxis_title="Permutation Value",
+            yaxis_title="Feature Name",
+        )
+        fig.show()
+    return df1
diff --git a/anai/utils/explainable_anai/shap.py b/anai/utils/explainable_anai/shap.py
index 5562e39..350314c 100644
--- a/anai/utils/explainable_anai/shap.py
+++ b/anai/utils/explainable_anai/shap.py
@@ -2,26 +2,27 @@
 import numpy as np
 
 
-def shap_feature_importance(columns, X_train, model, *args, **kwargs):
+def shap_feature_importance(columns, X_train, model, show_graph=True, *args, **kwargs):
     explainer = shap.TreeExplainer(model)
     shap_values = np.array(explainer.shap_values(X_train))
     if shap_values.ndim == 3:
         shap_values = np.array(shap_values[1] + shap_values[1][1], )
         columns = columns
-    shap.summary_plot(
-        shap_values,
-        X_train,
-        feature_names=columns,
-        plot_type="bar",
-        *args,
-        **kwargs
-    )
-    shap.summary_plot(
-        shap_values, X_train, feature_names=columns, *args, **kwargs
-    )
-    for i in range(0, len(columns)):
-        shap.dependence_plot(
-            i, shap_values, X_train, feature_names=columns,
+    if show_graph:
+        shap.summary_plot(
+            shap_values,
+            X_train,
+            feature_names=columns,
+            plot_type="bar",
+            *args,
+            **kwargs
+        )
+        shap.summary_plot(
+            shap_values, X_train, feature_names=columns, *args, **kwargs
+        )
+        for i in range(0, len(columns)):
+            shap.dependence_plot(
+                i, shap_values, X_train, feature_names=columns,
             )
     return shap_values
diff --git a/anai/utils/tuner/optuna/objectives/regression_objectives.py b/anai/utils/tuner/optuna/objectives/regression_objectives.py
index dfe9eda..a04c9f2 100644
--- a/anai/utils/tuner/optuna/objectives/regression_objectives.py
+++ b/anai/utils/tuner/optuna/objectives/regression_objectives.py
@@ -36,7 +36,6 @@
     SGDRegressor,
     TweedieRegressor,
 )
-from sklearn.linear_model._glm import GeneralizedLinearRegressor
 from sklearn.model_selection import cross_val_score
 from sklearn.neighbors import KNeighborsRegressor
 from sklearn.neural_network import MLPRegressor
diff --git a/docs/Features.md b/docs/Features.md
index 77d82e8..22aedf3 100644
--- a/docs/Features.md
+++ b/docs/Features.md
@@ -10,7 +10,7 @@
 ### Initialization
 
     import anai
     from anai.preprocessing import Preprocessor
-    df = anai.load("data/bodyPerformance.csv")
+    df = anai.load("data/bodyPerformance.csv", df_kwargs={"header": None})
     prep = Preprocessor(dataset=df, target="class", except_columns=['weight_kg'])
 
 ### Available Preprocessing Methods
@@ -89,6 +89,10 @@
 ### Arguments
 
     df : Pandas DataFrame
        DataFrame to be used for modelling.
+    filepath : str
+        Filepath of the dataframe to be loaded.
+    df_kwargs : dict
+        Keyword arguments for the dataframe loading function. Only used if filepath is not None.
     target : str
         Target Column Name
     except_columns : list, optional
diff --git a/docs/index.md b/docs/index.md
index 16a3c9c..8a1a352 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -22,6 +22,10 @@ ANAI is an Automated Machine Learning Python Library that works with tabular dat
         DataFrame to be used for modelling.
     target : str
         Target Column Name
+    filepath : str
+        Filepath of the dataframe to be loaded.
+    df_kwargs : dict
+        Keyword arguments for the dataframe loading function. Only used if filepath is not None.
     except_columns : list, optional
         List of Columns to be excluded from the dataset
     predictor : list
diff --git a/docs/old_doc_files/AutoML.md b/docs/old_doc_files/AutoML.md
index ee41c7d..07ee6a9 100644
--- a/docs/old_doc_files/AutoML.md
+++ b/docs/old_doc_files/AutoML.md
@@ -5,12 +5,16 @@
 ### Initialization
 
     import anai
-    ai = anai.run(filepath="data/iris.csv", target="class", predictor="lr")
+    ai = anai.run(filepath="data/iris.csv", df_kwargs={"header": None}, target="class", predictor="lr")
 
 ### Arguments
 
     df : Pandas DataFrame
         DataFrame to be used for modelling.
+    filepath : str
+        Filepath of the dataframe to be loaded.
+    df_kwargs : dict
+        Keyword arguments for the dataframe loading function. Only used if filepath is not None.
     target : str
         Target Column Name
     except_columns : list, optional
diff --git a/requirements.txt b/requirements.txt
index 91f9299..e93378b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,7 +10,7 @@ google-cloud-bigquery
 google-cloud-bigquery-storage
 h11
 imbalanced-learn
-lightgbm
+lightgbm; platform_system=="Linux" or platform_system=="Windows"
 matplotlib
 matplotlib-inline
 networkx
@@ -39,4 +39,4 @@ snowflake-connector-python
 snowflake-sqlalchemy
 python-Levenshtein-wheels; platform_system=="Windows"
 python-Levenshtein; platform_system=="Linux" or platform_system=="Darwin"
-mlxtend==0.20.0
+mlxtend==0.20.0
\ No newline at end of file
diff --git a/setup.py b/setup.py
index a9e012c..a8db8ea 100644
--- a/setup.py
+++ b/setup.py
@@ -21,7 +21,7 @@
         ],
         include=["anai.*", "anai"],
     ),
-    version="0.1.5",
+    version="0.1.6-alpha-1",
     license="Apache License 2.0",
     description="Automated ML",
     url="https://github.com/Revca-ANAI/ANAI",
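
Taken together, the changes above thread `df_kwargs` from the public entry points down to the underlying pandas readers, and move model training out of `__init__` into an explicit `fit()` call (which `anai.run` now invokes itself before returning). A minimal end-to-end sketch of the new loading path — the file paths, sheet name, and target column are the placeholder examples already used in the docstrings, not fixtures shipped with this patch:

    import anai

    # The .xlsx extension routes df_kwargs to pd.read_excel inside __df_loader_single
    ai = anai.run(
        filepath='examples/Folds5x2_pp.xlsx',
        df_kwargs={'sheet_name': 'Sheet1'},
        target='PE',
        predictor=['lin'],
    )

    # anai.load accepts the same dict and forwards it to the reader matching the extension
    df = anai.load('data/bodyPerformance.csv', df_kwargs={'header': None})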
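
Because `explain` now returns its result and gates all plotting behind `show_graph`, importances can be consumed headlessly. A sketch against the regression path shown in the diff, assuming `ai` is the fitted object returned by `anai.run` above: `'perm'` yields a DataFrame with "Column Name" and "Permutation Value" columns, while `'shap'` yields the raw SHAP value array.

    # Permutation importances without rendering the Plotly figure
    imp = ai.explain('perm', show_graph=False)
    print(imp.sort_values('Permutation Value', ascending=False).head())

    # SHAP values; summary and dependence plots render only when show_graph=True
    shap_values = ai.explain('shap', show_graph=False)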