From 468a8e1f38b080f798e62d9f725f4ca463221233 Mon Sep 17 00:00:00 2001
From: "Carl F. Corneil" <cfc@ssb.no>
Date: Tue, 12 Sep 2023 11:12:45 +0200
Subject: [PATCH 1/3] restructure check_env

---
 statbank/auth.py | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/statbank/auth.py b/statbank/auth.py
index 418d4d7..2e0d11c 100644
--- a/statbank/auth.py
+++ b/statbank/auth.py
@@ -62,6 +62,19 @@ def check_env() -> str:
         else:
             raise OSError("Ikke i prodsonen, eller på Dapla? Må funksjonen skrives om?")
 
+    @staticmethod
+    def check_database() -> str:
+        """Return "TEST" or "PROD" for STATBANK_BASE_URL; raise SystemError if neither."""
+        # Lower-case the URL so the match is case-insensitive, as the inline check
+        # in _encrypt_request was before it delegated to this method.
+        base_url = os.environ["STATBANK_BASE_URL"].lower()
+        if "test" in base_url:
+            print("Warning: Descriptions and data in the TEST-database may be outdated!")
+            return "TEST"
+        if "i.ssb" in base_url:
+            return "PROD"
+        raise SystemError("Can't determine if Im sending to the test-database or the prod-database")
+
     def _build_user_agent(self):
         if self.check_env() == "DAPLA":
             user_agent = "Dapla"
@@ -70,9 +83,9 @@ def _build_user_agent(self):
         else:
             raise SystemError("Can't determine if Im in dapla or in prodsone")
 
-        if "test" in os.environ["STATBANK_BASE_URL"]:
+        if self.check_database() == "TEST":
             user_agent += "Test-"
-        elif "i.ssb" in os.environ["STATBANK_BASE_URL"]:
+        elif self.check_database() == "PROD":
             user_agent += "Prod-"
         else:
             raise SystemError(
@@ -93,12 +106,8 @@ def _build_auth(self):
             del response
         return "Basic " + base64.b64encode(username_encryptedpassword).decode("utf8")
 
-    @staticmethod
-    def _encrypt_request():
-        if "test" in os.environ["STATBANK_BASE_URL"].lower():
-            db = "TEST"
-        else:
-            db = "PROD"
+    def _encrypt_request(self):
+        db = self.check_database()
         if AuthClient.is_ready():
             headers = {
                 "Authorization": f"Bearer {AuthClient.fetch_personal_token()}",

From 10de3cab685b79ca47bfaa63a0fd41e20c712279 Mon Sep 17 00:00:00 2001
From: "Carl F. Corneil" <cfc@ssb.no>
Date: Tue, 12 Sep 2023 11:13:20 +0200
Subject: [PATCH 2/3] template may not receive a parameter, that should be ok

---
 statbank/uttrekk.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/statbank/uttrekk.py b/statbank/uttrekk.py
index 6c72b12..9f97216 100644
--- a/statbank/uttrekk.py
+++ b/statbank/uttrekk.py
@@ -152,9 +152,9 @@ def transferdata_template(self, *dfs: list[pd.DataFrame] | pd.DataFrame) -> dict
         """
 
         # If sending in a list, unwrap one layer
-        if not isinstance(dfs[0], pd.DataFrame) and len(dfs) == 1:
-            dfs = dfs[0]
         if dfs:
+            if not isinstance(dfs[0], pd.DataFrame) and len(dfs) == 1:
+                dfs = dfs[0]
             if not all([isinstance(df, pd.DataFrame) for df in dfs]):
                 raise TypeError(
                     "All elements sent in to transferdata_template must be pandas dataframes."
@@ -163,8 +163,6 @@ def transferdata_template(self, *dfs: list[pd.DataFrame] | pd.DataFrame) -> dict
                 raise KeyError(
                     "Number of dataframes in must match the number of subtables."
                 )
-
-        if dfs:
             template = {k: dfs[i] for i, k in enumerate(self.subtables.keys())}
         else:
             template = {k: f"df{i}" for i, k in enumerate(self.subtables.keys())}

From 8ec174e368da16b03d16685be6866ad66b1aedef Mon Sep 17 00:00:00 2001
From: "Carl F. Corneil" <cfc@ssb.no>
Date: Tue, 12 Sep 2023 11:38:24 +0200
Subject: [PATCH 3/3] extra check that nans aren't literal strings in data

---
 statbank/uttrekk.py             |  1 +
 statbank/uttrekk_validations.py | 35 +++++++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+)

diff --git a/statbank/uttrekk.py b/statbank/uttrekk.py
index 9f97216..8687470 100644
--- a/statbank/uttrekk.py
+++ b/statbank/uttrekk.py
@@ -232,6 +232,7 @@ def validate(self, data, raise_errors: bool = False, printing: bool = True) -> d
             validation_errors,
         ) = self._category_code_usage(data, validation_errors, printing)
         validation_errors = self._check_for_floats(data, validation_errors, printing)
+        validation_errors = self._check_for_literal_nans_in_strings(data, validation_errors, printing)
         validation_errors = self._check_rounding(data, validation_errors, printing)
         validation_errors = self._check_time_formats(data, validation_errors, printing)
         validation_errors = self._check_suppression(data, validation_errors, printing)
diff --git a/statbank/uttrekk_validations.py b/statbank/uttrekk_validations.py
index c43ce28..d16bfa3 100644
--- a/statbank/uttrekk_validations.py
+++ b/statbank/uttrekk_validations.py
@@ -42,7 +42,42 @@ def _validate_number_columns(self, data, validation_errors: dict, printing) -> d
             if printing:
                 print("Correct number of columns...")
         return validation_errors
+
+    def _check_for_literal_nans_in_strings(self, data: dict, validation_errors: dict, printing) -> dict:
+        """Flag string and categorical columns that contain NA-like literal strings.
+
+        Adds one entry per offending column to validation_errors and returns it.
+        """
+        nans = ["nan", "na", "none", "."]
+
+        def looks_like_na(value) -> bool:
+            # Only real strings can match; numbers and missing values are skipped.
+            return isinstance(value, str) and value.lower() in nans
+
+        for name, df in data.items():
+            for col in df.select_dtypes(include=["object", "string", "string[pyarrow]"]).columns:
+                if df[col].map(looks_like_na).any():
+                    error_text = f"""{col} in {name} has strings, that look like NAs / empty cells,
+                    (In this list: {nans})
+                    Which have been converted to literal strings. 
+                    Consider handeling your NAs before converting them to strings.
+                    Maybe with a .fillna("") before an .astype(str) """
+                    validation_errors[f"contains_string_nans_{name}_{col}"] = error_text
+                    if printing:
+                        print(error_text)
+            for col in df.select_dtypes(include=["category"]).columns:
+                if any(map(looks_like_na, df[col].cat.categories)):
+                    error_text = f"""{col} in {name} is a categorical but has strings,
+                    that look like NAs / empty cells,
+                    (In this list: {nans})
+                    Which have been converted to literal strings?
+                    Consider handeling your NAs before converting them to strings.
+                    Maybe with a .fillna("") before an .astype(str) """
+                    validation_errors[f"contains_string_nans_in_category_{name}_{col}"] = error_text
+                    if printing:
+                        print(error_text)
+        return validation_errors
 
     def _check_for_floats(self, data: dict, validation_errors: dict, printing) -> dict:
         for name, df in data.items():
             for col in df.columns: