Skip to content

Commit

Permalink
NiFi: fixed issue with de-id script not working on texts of certain l…
Browse files Browse the repository at this point in the history
…ength.
  • Loading branch information
vladd-bit committed Dec 6, 2023
1 parent cbba074 commit 99d281a
Showing 1 changed file with 17 additions and 1 deletion.
18 changes: 17 additions & 1 deletion nifi/user-scripts/anonymise_doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,14 @@ def special_deid(cat, text, record):
input_text = sys.stdin.read()

# Location of the de-identification model pack: taken from the MODEL_PACK_PATH
# environment variable, otherwise the default path below. A "model_pack_path=..."
# command-line argument is also recognised by the argument loop further down.
model_pack_path = os.environ.get("MODEL_PACK_PATH", "/opt/models/de_id_base.zip")

# Key of the record field whose text gets de-identified.
text_field_name = "document"
# Overridable with "nproc=..." on the command line.
# NOTE(review): presumably a worker/process count; its use is not visible here - confirm.
nproc = 100

# Texts longer than this many characters are processed in slices of this size
# rather than in a single deid_text call. If the DE-ID model fails on certain
# long documents, try adjusting this value; a suitable value depends on the
# tokenizer used by the model. Note that this is a character count, not a
# token count.
char_limit = 512

for arg in sys.argv:
_arg = arg.split("=", 1)
if _arg[0] == "model_pack_path":
Expand All @@ -22,6 +27,8 @@ def special_deid(cat, text, record):
text_field_name = _arg[1]
if _arg[0] == "nproc":
nproc = _arg[1]
if _arg[0] == "char_limit":
    # Command-line values are strings; convert here so that later length
    # comparisons and slice arithmetic work on an integer (comparing a str
    # with an int raises TypeError in Python 3).
    char_limit = int(_arg[1])


records = json.loads(str(input_text))
Expand All @@ -31,7 +38,16 @@ def special_deid(cat, text, record):

for record in records:
if text_field_name in record.keys():
_anon_text = deid_text(cat, record[text_field_name])
text_field = record[text_field_name]
# char_limit is a string when it was supplied on the command line, so
# normalise it before comparing it with a length or slicing with it.
_chunk_size = int(char_limit)
if 0 < _chunk_size < len(text_field):
    # De-identify the text in consecutive slices of _chunk_size characters.
    # Stepping range() up to len(text_field) makes sure the final, shorter
    # slice is processed too, so no part of the document is dropped.
    # NOTE(review): slicing at fixed character offsets can split a word or
    # entity across two slices - confirm this is acceptable for the model.
    _anon_text = "".join(
        deid_text(cat, text_field[_start:_start + _chunk_size])
        for _start in range(0, len(text_field), _chunk_size)
    )
else:
    # Short text (or a non-positive limit, which disables slicing):
    # de-identify it in a single call.
    _anon_text = deid_text(cat, text_field)
record[text_field_name] = _anon_text
final_records.append(record)
else:
Expand Down

0 comments on commit 99d281a

Please sign in to comment.