Update 0.1.6-alpha-1

- ANAI Open Source Alpha Build 6 - Updated documentation - df_loader can now take pandas-related keyword arguments; pass them through the df_kwargs argument when creating ANAI objects - Added option to show graphs while explaining ANAI models - The fit method runs automatically when ANAI is run through anai.run(); if Regression or Classification is used separately, fit() must be called explicitly - The explain method now returns its result in the form of a DataFrame - Removed unnecessary import from Predictor Signed-off-by: Arsh <lucifer78908@gmail.com>
Revca-ANAI · Aug 27, 2022 · 5de8127 · 5de8127
1 parent 125824d
commit 5de8127
Show file tree

Hide file tree

Showing 12 changed files with 164 additions and 101 deletions.
diff --git a/anai/__init__.py b/anai/__init__.py
@@ -30,6 +30,7 @@ def run(
     df=None,
     target: str = None,
     filepath: str = None,
+    df_kwargs: dict = {},
     config: bool = False,
     except_columns: list = [],
     predictor: list = [],
@@ -64,6 +65,10 @@ def run(
                 DataFrame to be used for modelling.
             target : str
                 Target Column Name 
+            filepath : str
+                Filepath of the dataframe to be loaded.
+            df_kwargs : dict
+                Keyword arguments for the dataframe loading function. Only used if filepath is not None.
             except_columns : list, optional
                 List of Columns to be excluded from the dataset
             predictor : list
@@ -137,6 +142,7 @@ def run(
             
             ai = anai.run(
                         filepath='examples/Folds5x2_pp.xlsx', 
+                        df_kwargs={'sheet_name': 'Sheet1'},
                         target='PE',
                         predictor=['lin'],
             )
@@ -166,7 +172,7 @@ def run(
                 raise FileNotFoundError("ANAI Config File Not Found")
         if df is None:
             if filepath is not None:
-                df = df_loader(filepath, suppress=True)
+                df = df_loader(filepath, suppress=True, **df_kwargs)
             else:
                 raise ValueError("Please provide a dataframe or a filepath")
         if __task(df, target) and not suppress_task_detection or task == "regression":
@@ -182,6 +188,7 @@ def run(
                 df=df,
                 target=target,
                 filepath=filepath,
+                df_kwargs=df_kwargs,
                 config=config,
                 except_columns=except_columns,
                 predictor=predictor,
@@ -206,6 +213,7 @@ def run(
                 metric=metric,
                 ensemble=ensemble,
             )
+            regressor.fit()
             return regressor
         elif (
             not __task(df, target)
@@ -224,6 +232,7 @@ def run(
                 df=df,
                 target=target,
                 filepath=filepath,
+                df_kwargs=df_kwargs,
                 config=config,
                 except_columns=except_columns,
                 predictor=predictor,
@@ -243,6 +252,7 @@ def run(
                 ensemble=ensemble,
                 exclude_models=exclude_models,
             )
+            classifier.fit()
             return classifier
     except KeyboardInterrupt:
         if os.path.exists(os.getcwd() + "/dask-worker-space"):
@@ -283,11 +293,12 @@ def __task(df, target):
         return False
 
 
-def load(df_filepath):
+def load(df_filepath, **df_kwargs):
     """Loads a dataframe from a filepath.
 
     Args:
         df_filepath (str): Filepath of the dataframe to be loaded.
+        df_kwargs (dict): Keyword arguments to be passed to df_loader function.
 
     Returns:
         pd.DataFrame : Loaded dataframe.
@@ -296,12 +307,12 @@ def load(df_filepath):
 
     suppress = False
     if type(df_filepath) is str:
-        df = __df_loader_single(df_filepath, suppress=False)
+        df = __df_loader_single(df_filepath, suppress=False, **df_kwargs)
     elif type(df_filepath) is list:
         print(Fore.YELLOW + "Loading Data [*]\n")
         df = pd.concat(
             [
-                __df_loader_single(df_filepath[i], suppress=True)
+                __df_loader_single(df_filepath[i], suppress=True, **df_kwargs)
                 for i in range(len(df_filepath))
             ]
         )

diff --git a/anai/supervised/__init__.py b/anai/supervised/__init__.py
@@ -39,6 +39,7 @@ def __init__(
         df=None,
         target: str = None,
         filepath: str = None,
+        df_kwargs: dict = {},
         config: bool = False,
         except_columns: list = [],
         predictor: list =["lr"],
@@ -68,11 +69,16 @@ def __init__(
 
         Parameters:
 
-            features : array
-                        features array
-            lables : array
-                        labels array
-            except_columns (list): [List of Columns to be excluded from the dataset]
+            df : Pandas DataFrame
+                DataFrame to be used for modelling.
+            target : str
+                Target Column Name 
+            filepath : str
+                Filepath of the dataframe to be loaded.
+            df_kwargs : dict
+                Keyword arguments for the dataframe loading function. Only used if filepath is not None.
+            except_columns : list, optional
+                List of Columns to be excluded from the dataset
             predictor : list
                         Predicting model to be used
                         Default ['lr']  - Logistic Regression\n
@@ -116,7 +122,7 @@ def __init__(
             params : dict
                         contains parameters for model
             tune : boolean
-                    when True Applies GridSearch CrossValidation
+                    when True Applies Optuna to find best parameters for model
                     Default is False
             test_size: float or int, default=.2
                         If float, should be between 0.0 and 1.0 and represent
@@ -125,21 +131,18 @@ def __init__(
                         If int, represents the absolute number of test samples.
             cv_folds : int
                     No. of cross validation folds. Default = 10
-            pca : str
-                if 'y' will apply PCA on Train and Validation set. Default = 'n'
+            pca : bool
+                if True will apply PCA on Train and Validation set. Default = False
             lda : str
-                if 'y' will apply LDA on Train and Validation set. Default = 'n'
+                if True will apply LDA on Train and Validation set. Default = False
             pca_kernel : str
                     Kernel to be use in PCA. Default = 'linear'
             n_components_lda : int
                     No. of components for LDA. Default = 1
             n_components_pca : int
                     No. of components for PCA. Default = 2
-            loss : str
-                    loss method for ann. Default = 'binary_crossentropy'
-                    rate for dropout layer. Default = 0
-            smote : str,
-                Whether to apply SMOTE. Default = 'y'
+            smote : Bool,
+                Whether to apply SMOTE. Default = True
             k_neighbors : int
                 No. of neighbors for SMOTE. Default = 1
             verbose : boolean
@@ -160,12 +163,8 @@ def __init__(
                     minimize : Minimize
             optuna_n_trials : int
                 No. of trials for optuna. Default = 100
-            optuna_metric: str
-                Metric to be used in optuna. Default = 'R^2'
-            lgbm_objective : str
-                Objective for lgbm classifier. Default = 'binary'
             ensemble : boolean
-                Whether to use ensemble. Default = True
+                Whether to use ensemble methods. Default = True
 
         Returns:
 
@@ -178,6 +177,7 @@ def __init__(
             import anai
             ai = anai.run(
                         filepath='examples/test_data.csv', 
+                        df_kwargs={'index_col':'id'},
                         target='PE',
                         predictor=['lr'],
             )
@@ -196,7 +196,7 @@ def __init__(
                 raise FileNotFoundError("ANAI Config File Not Found")
         if df is None:
             if filepath is not None:
-                df = df_loader(filepath)
+                df = df_loader(filepath, **df)
             else:
                 raise ValueError("Please provide a dataframe or a filepath")
         if type(predictor) == list:
@@ -295,9 +295,8 @@ def __init__(
         self.dimension_handler = DimensionHandler()
         self.encoder = None
         self.le_encoder = None
-        self.__fit()
 
-    def __fit(self):
+    def fit(self):
         """[Takes Features and Labels and Encodes Categorical Data then Applies SMOTE , Splits the features and labels in training and validation sets with test_size = .2
         scales X_train, self.X_val using StandardScaler.
         Fits every model on training set and predicts results,
@@ -760,9 +759,21 @@ def __load(self, path=None):
         else:
             raise ValueError("No path specified.Please provide actual path\n")
 
-    def explain(self, method):
+    def explain(self, method, show_graph=True):
         """
-        Returns the importance features of the dataset
+        Explains the model using the specified method.
+        
+        args:
+            method (str): [Method to use for explaining the model]
+                Available methods:
+                    - shap
+                    - perm
+
+            show_graph (bool): [Whether to show the graph or not]
+            
+        returns:
+            [DataFrame] : [Explained DataFrame]
+        
         """
         columns = self.features.columns
         self.explainer.set_params(
@@ -775,6 +786,7 @@ def explain(self, method):
             self.fit_params,
             False,
             columns,
+            show_graph,
         )
         if self.pred_mode == "all":
             classifier = copy.deepcopy(self.best_classifier.model)
@@ -804,6 +816,7 @@ def __init__(
         df=None,
         target: str = None,
         filepath: str = None,
+        df_kwargs: dict = {},
         config: bool = False,
         except_columns: list = [],
         predictor: list = ["lin"],
@@ -833,6 +846,10 @@ def __init__(
         Parameters:
             df (dataframe): [Dataset containing features and target]
             target (str): [Target Column Name]
+            filepath : str
+                Filepath of the dataframe to be loaded.
+            df_kwargs : dict
+                Keyword arguments for the dataframe loading function. Only used if filepath is not None.
             except_columns (list): [List of Columns to be excluded from the dataset]
             predictor : list
                         Predicting models to be used
@@ -931,6 +948,7 @@ def __init__(
             
             ai = anai.run(
                         filepath='examples/Folds5x2_pp.xlsx', 
+                        df_kwargs={'sheet_name': 'Sheet1'},
                         target='PE',
                         predictor=['lin'],
             )
@@ -949,7 +967,7 @@ def __init__(
                 raise FileNotFoundError("ANAI Config File Not Found")
         if df is None:
             if filepath is not None:
-                df = df_loader(filepath)
+                df = df_loader(filepath, **df_kwargs)
             # elif config_filepath is not None:
             #     df, target = load_data_from_config(config_filepath)
             else:
@@ -1048,9 +1066,8 @@ def __init__(
         self.encoder = None
         self.features = None
         self.labels = None
-        self.__fit()
 
-    def __fit(self):
+    def fit(self):
         """[Takes Features and Labels and Encodes Categorical Data then Applies SMOTE , Splits the features and labels in training and validation sets with test_size = .2
         scales X_train, X_val using StandardScaler.
         Fits model on training set and predicts results, Finds R^2 Scoreand mean square error
@@ -1573,9 +1590,21 @@ def __load(self, path=None):
         else:
             raise ValueError("No path specified.Please provide actual path\n")
 
-    def explain(self, method):
+    def explain(self, method, show_graph=True):
         """
-        Returns the importance features of the dataset
+        Explains the model using the specified method.
+        
+        args:
+            method (str): [Method to use for explaining the model]
+                Available methods:
+                    - shap
+                    - perm
+
+            show_graph (bool): [Whether to show the graph or not]
+            
+        returns:
+            [DataFrame] : [Explained DataFrame]
+        
         """
         self.explainer.set_params(
             self.features,
@@ -1585,6 +1614,7 @@ def explain(self, method):
             self.y_val,
             self.cv_folds,
             self.fit_params,
+            self.show_graph,
         )
         if self.pred_mode == "all":
             regressor = copy.deepcopy(self.best_regressor.model)
@@ -1595,14 +1625,12 @@ def explain(self, method):
             )
             print(Fore.YELLOW + "Explaining ANAI [*]\n")
 
-        if self.original_predictor == "all":
-            raise TypeError(
-                "[Error] This method is only applicable on single predictor"
-            )
-        elif method == "perm":
-            self.explainer.permutation(model=regressor)
+        if method == "perm":
+            res = self.explainer.permutation(model=regressor)
+            return res
         elif method == "shap":
-            self.explainer.shap(model=regressor)
+            res = self.explainer.shap(model=regressor)
+            return res
         else:
             raise NotImplementedError(
                 "Technique not implemented. Please choose from perm, shap"