Skip to content

Commit

Permalink
Merge pull request #77 from statisticsnorway/76-validate-should-look-…
Browse files Browse the repository at this point in the history
…for-literal-strings-encoded-as-nan-na-and-none

76 validate should look for literal strings encoded as nan na and none
  • Loading branch information
aecorn authored Sep 12, 2023
2 parents f1f6162 + 8ec174e commit c127ceb
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 12 deletions.
25 changes: 17 additions & 8 deletions statbank/auth.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,19 @@ def check_env() -> str:
else:
raise OSError("Ikke i prodsonen, eller på Dapla? Må funksjonen skrives om?")

@staticmethod
def check_database() -> str:
if "test" in os.environ["STATBANK_BASE_URL"]:
print("Warning: Descriptions and data in the TEST-database may be outdated!")
return "TEST"
elif "i.ssb" in os.environ["STATBANK_BASE_URL"]:
return "PROD"
else:
raise SystemError(
"Can't determine if Im sending to the test-database or the prod-database"
)


def _build_user_agent(self):
if self.check_env() == "DAPLA":
user_agent = "Dapla"
Expand All @@ -70,9 +83,9 @@ def _build_user_agent(self):
else:
raise SystemError("Can't determine if Im in dapla or in prodsone")

if "test" in os.environ["STATBANK_BASE_URL"]:
if self.check_database() == "TEST":
user_agent += "Test-"
elif "i.ssb" in os.environ["STATBANK_BASE_URL"]:
elif self.check_database() == "PROD":
user_agent += "Prod-"
else:
raise SystemError(
Expand All @@ -93,12 +106,8 @@ def _build_auth(self):
del response
return "Basic " + base64.b64encode(username_encryptedpassword).decode("utf8")

@staticmethod
def _encrypt_request():
if "test" in os.environ["STATBANK_BASE_URL"].lower():
db = "TEST"
else:
db = "PROD"
def _encrypt_request(self):
db = self.check_database()
if AuthClient.is_ready():
headers = {
"Authorization": f"Bearer {AuthClient.fetch_personal_token()}",
Expand Down
7 changes: 3 additions & 4 deletions statbank/uttrekk.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,9 +152,9 @@ def transferdata_template(self, *dfs: list[pd.DataFrame] | pd.DataFrame) -> dict
"""

# If sending in a list, unwrap one layer
if not isinstance(dfs[0], pd.DataFrame) and len(dfs) == 1:
dfs = dfs[0]
if dfs:
if not isinstance(dfs[0], pd.DataFrame) and len(dfs) == 1:
dfs = dfs[0]
if not all([isinstance(df, pd.DataFrame) for df in dfs]):
raise TypeError(
"All elements sent in to transferdata_template must be pandas dataframes."
Expand All @@ -163,8 +163,6 @@ def transferdata_template(self, *dfs: list[pd.DataFrame] | pd.DataFrame) -> dict
raise KeyError(
"Number of dataframes in must match the number of subtables."
)

if dfs:
template = {k: dfs[i] for i, k in enumerate(self.subtables.keys())}
else:
template = {k: f"df{i}" for i, k in enumerate(self.subtables.keys())}
Expand Down Expand Up @@ -234,6 +232,7 @@ def validate(self, data, raise_errors: bool = False, printing: bool = True) -> d
validation_errors,
) = self._category_code_usage(data, validation_errors, printing)
validation_errors = self._check_for_floats(data, validation_errors, printing)
validation_errors = self._check_for_literal_nans_in_strings(data, validation_errors, printing)
validation_errors = self._check_rounding(data, validation_errors, printing)
validation_errors = self._check_time_formats(data, validation_errors, printing)
validation_errors = self._check_suppression(data, validation_errors, printing)
Expand Down
35 changes: 35 additions & 0 deletions statbank/uttrekk_validations.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,42 @@ def _validate_number_columns(self, data, validation_errors: dict, printing) -> d
if printing:
print("Correct number of columns...")
return validation_errors

def _check_for_literal_nans_in_strings(self, data: dict, validation_errors: dict, printing) -> dict:
for name, df in data.items():
string_df = df.select_dtypes(include=["object", "string", "string[pyarrow]"])
cat_df = df.select_dtypes(include=["category"])

nans = ["nan", "na", "none", "."]

if len(string_df.columns):
for col in string_df.columns:
error_text = f"""{col} in {name} has strings, that look like NAs / empty cells,
(In this list: {nans})
Which have been converted to literal strings.
Consider handeling your NAs before converting them to strings.
Maybe with a .fillna("") before an .astype(str) """
nan_len = len(string_df[string_df[col].str.lower().isin(nans)])
if nan_len:
validation_errors[f"contains_string_nans_{name}_{col}"] = error_text
if printing:
print(error_text)
if len(cat_df.columns):
for col in cat_df.columns:
error_text = f"""{col} in {name} is a categorical but has strings,
that look like NAs / empty cells,
(In this list: {nans})
Which have been converted to literal strings?
Consider handeling your NAs before converting them to strings.
Maybe with a .fillna("") before an .astype(str) """
nan_cats = [cat for cat in cat_df[col].cat.categories if cat.lower() in nans]
if nan_cats:
validation_errors[f"contains_string_nans_in_category_{name}_{col}"] = error_text
if printing:
print(error_text)
return validation_errors


def _check_for_floats(self, data: dict, validation_errors: dict, printing) -> dict:
for name, df in data.items():
for col in df.columns:
Expand Down

0 comments on commit c127ceb

Please sign in to comment.