From 468a8e1f38b080f798e62d9f725f4ca463221233 Mon Sep 17 00:00:00 2001 From: "Carl F. Corneil" Date: Tue, 12 Sep 2023 11:12:45 +0200 Subject: [PATCH 1/3] restructure check_env --- statbank/auth.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/statbank/auth.py b/statbank/auth.py index 418d4d7..2e0d11c 100644 --- a/statbank/auth.py +++ b/statbank/auth.py @@ -62,6 +62,19 @@ def check_env() -> str: else: raise OSError("Ikke i prodsonen, eller på Dapla? Må funksjonen skrives om?") + @staticmethod + def check_database() -> str: + if "test" in os.environ["STATBANK_BASE_URL"]: + print("Warning: Descriptions and data in the TEST-database may be outdated!") + return "TEST" + elif "i.ssb" in os.environ["STATBANK_BASE_URL"]: + return "PROD" + else: + raise SystemError( + "Can't determine if Im sending to the test-database or the prod-database" + ) + + def _build_user_agent(self): if self.check_env() == "DAPLA": user_agent = "Dapla" @@ -70,9 +83,9 @@ def _build_user_agent(self): else: raise SystemError("Can't determine if Im in dapla or in prodsone") - if "test" in os.environ["STATBANK_BASE_URL"]: + if self.check_database() == "TEST": user_agent += "Test-" - elif "i.ssb" in os.environ["STATBANK_BASE_URL"]: + elif self.check_database() == "PROD": user_agent += "Prod-" else: raise SystemError( @@ -93,12 +106,8 @@ def _build_auth(self): del response return "Basic " + base64.b64encode(username_encryptedpassword).decode("utf8") - @staticmethod - def _encrypt_request(): - if "test" in os.environ["STATBANK_BASE_URL"].lower(): - db = "TEST" - else: - db = "PROD" + def _encrypt_request(self): + db = self.check_database() if AuthClient.is_ready(): headers = { "Authorization": f"Bearer {AuthClient.fetch_personal_token()}", From 10de3cab685b79ca47bfaa63a0fd41e20c712279 Mon Sep 17 00:00:00 2001 From: "Carl F. 
Corneil" Date: Tue, 12 Sep 2023 11:13:20 +0200 Subject: [PATCH 2/3] template may not receive a parameter, that should be ok --- statbank/uttrekk.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/statbank/uttrekk.py b/statbank/uttrekk.py index 6c72b12..9f97216 100644 --- a/statbank/uttrekk.py +++ b/statbank/uttrekk.py @@ -152,9 +152,9 @@ def transferdata_template(self, *dfs: list[pd.DataFrame] | pd.DataFrame) -> dict """ # If sending in a list, unwrap one layer - if not isinstance(dfs[0], pd.DataFrame) and len(dfs) == 1: - dfs = dfs[0] if dfs: + if not isinstance(dfs[0], pd.DataFrame) and len(dfs) == 1: + dfs = dfs[0] if not all([isinstance(df, pd.DataFrame) for df in dfs]): raise TypeError( "All elements sent in to transferdata_template must be pandas dataframes." @@ -163,8 +163,6 @@ def transferdata_template(self, *dfs: list[pd.DataFrame] | pd.DataFrame) -> dict raise KeyError( "Number of dataframes in must match the number of subtables." ) - - if dfs: template = {k: dfs[i] for i, k in enumerate(self.subtables.keys())} else: template = {k: f"df{i}" for i, k in enumerate(self.subtables.keys())} From 8ec174e368da16b03d16685be6866ad66b1aedef Mon Sep 17 00:00:00 2001 From: "Carl F. 
Corneil" Date: Tue, 12 Sep 2023 11:38:24 +0200 Subject: [PATCH 3/3] extra check that nans aren't literal strings in data --- statbank/uttrekk.py | 1 + statbank/uttrekk_validations.py | 35 +++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/statbank/uttrekk.py b/statbank/uttrekk.py index 9f97216..8687470 100644 --- a/statbank/uttrekk.py +++ b/statbank/uttrekk.py @@ -232,6 +232,7 @@ def validate(self, data, raise_errors: bool = False, printing: bool = True) -> d validation_errors, ) = self._category_code_usage(data, validation_errors, printing) validation_errors = self._check_for_floats(data, validation_errors, printing) + validation_errors = self._check_for_literal_nans_in_strings(data, validation_errors, printing) validation_errors = self._check_rounding(data, validation_errors, printing) validation_errors = self._check_time_formats(data, validation_errors, printing) validation_errors = self._check_suppression(data, validation_errors, printing) diff --git a/statbank/uttrekk_validations.py b/statbank/uttrekk_validations.py index c43ce28..d16bfa3 100644 --- a/statbank/uttrekk_validations.py +++ b/statbank/uttrekk_validations.py @@ -42,7 +42,42 @@ def _validate_number_columns(self, data, validation_errors: dict, printing) -> d if printing: print("Correct number of columns...") return validation_errors + + def _check_for_literal_nans_in_strings(self, data: dict, validation_errors: dict, printing) -> dict: + for name, df in data.items(): + string_df = df.select_dtypes(include=["object", "string", "string[pyarrow]"]) + cat_df = df.select_dtypes(include=["category"]) + + nans = ["nan", "na", "none", "."] + if len(string_df.columns): + for col in string_df.columns: + error_text = f"""{col} in {name} has strings, that look like NAs / empty cells, + (In this list: {nans}) + Which have been converted to literal strings. + Consider handeling your NAs before converting them to strings. 
+ Maybe with a .fillna("") before an .astype(str) """ + nan_len = len(string_df[string_df[col].str.lower().isin(nans)]) + if nan_len: + validation_errors[f"contains_string_nans_{name}_{col}"] = error_text + if printing: + print(error_text) + if len(cat_df.columns): + for col in cat_df.columns: + error_text = f"""{col} in {name} is a categorical but has strings, + that look like NAs / empty cells, + (In this list: {nans}) + Which have been converted to literal strings? + Consider handeling your NAs before converting them to strings. + Maybe with a .fillna("") before an .astype(str) """ + nan_cats = [cat for cat in cat_df[col].cat.categories if cat.lower() in nans] + if nan_cats: + validation_errors[f"contains_string_nans_in_category_{name}_{col}"] = error_text + if printing: + print(error_text) + return validation_errors + + def _check_for_floats(self, data: dict, validation_errors: dict, printing) -> dict: for name, df in data.items(): for col in df.columns: