leondz · jmartin-tech · Sep 24, 2024 · Sep 16, 2024 · Sep 17, 2024 · Sep 16, 2024
diff --git a/garak/analyze/calibration.py b/garak/analyze/calibration.py
@@ -10,7 +10,7 @@
 from typing import Union
 
 
-from garak import _config
+from garak.data import path as data_path
 
 MINIMUM_STD_DEV = (
     0.01732  # stddev=0 gives unusable z-scores; give it an arbitrary floor of 3^.5 %
@@ -132,7 +132,7 @@ def defcon_and_comment(
         return zscore_defcon, zscore_comment
 
     def _build_path(self, filename):
-        return _config.transient.package_dir / "resources" / "calibration" / filename
+        return data_path / "calibration" / filename
 
     def __init__(self, calibration_path: Union[None, str, pathlib.Path] = None) -> None:
 

diff --git a/garak/analyze/misp.py b/garak/analyze/misp.py
@@ -9,12 +9,9 @@
 import os
 
 from garak import _plugins
-import garak._config
+from garak.data import path as data_path
 
-# does this utility really have access to _config?
-misp_resource_file = (
-    garak._config.transient.package_dir / "resources" / "misp_descriptions.tsv"
-)
+misp_resource_file = data_path / "misp_descriptions.tsv"
 misp_descriptions = {}
 if os.path.isfile(misp_resource_file):
     with open(misp_resource_file, "r", encoding="utf-8") as f:

diff --git a/garak/analyze/report_digest.py b/garak/analyze/report_digest.py
@@ -14,8 +14,10 @@
 import sqlite3
 
 from garak import _config
+from garak.data import path as data_path
 import garak.analyze.calibration
 
+
 if not _config.loaded:
     _config.load_config()
 
@@ -33,9 +35,7 @@
 about_z_template = templateEnv.get_template("digest_about_z.jinja")
 
 
-misp_resource_file = (
-    _config.transient.package_dir / "resources" / "misp_descriptions.tsv"
-)
+misp_resource_file = data_path / "misp_descriptions.tsv"
 misp_descriptions = {}
 if os.path.isfile(misp_resource_file):
     with open(misp_resource_file, "r", encoding="utf-8") as f:

diff --git a/garak/data/__init__.py b/garak/data/__init__.py
@@ -0,0 +1,100 @@
+# SPDX-FileCopyrightText: Portions Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Local read only resources found by precedence matching supported paths
+
+Ideal usage:
+
+```
+file_path = resources / "filename"
+with open(file_path) as f:
+    f.read()
+```
+
+Resources that do not have a `shipped` version should wrap path access in a try block:
+```
+try:
+    file_path = resources / "filename"
+except GarakException as e:
+    logging.warn("No resource file found.", exc_info=e)
+```
+"""
+
+import pathlib
+
+from garak import _config
+from garak.exception import GarakException
+
+
+class LocalDataPath(type(pathlib.Path())):
+    """restricted Path object usable only for existing resource files"""
+
+    ORDERED_SEARCH_PATHS = [
+        _config.transient.data_dir / "data",
+        _config.transient.package_dir / "data",
+    ]
+
+    def _determine_suffix(self):
+        for path in self.ORDERED_SEARCH_PATHS:
+            if path == self or path in self.parents:
+                return self.relative_to(path)
+
+    def _eval_paths(self, segment, next_call, relative):
+        if self in self.ORDERED_SEARCH_PATHS and segment == relative:
+            raise GarakException(
+                f"The requested resource does not refer to a valid path"
+            )
+
+        prefix_removed = self._determine_suffix()
+        if prefix_removed is None:
+            # if LocalDataPath is instantiated using a path not in ORDERED_SEARCH_PATHS
+            raise GarakException(
+                f"The requested resource does not refer to a valid path: {self}"
+            )
+        for path in self.ORDERED_SEARCH_PATHS:
+            if segment == relative:
+                projected = (path / prefix_removed).parent
+            else:
+                current_path = path / prefix_removed
+                projected = getattr(current_path, next_call)(segment)
+            if projected.exists():
+                return LocalDataPath(projected)
+
+        raise GarakException(f"The resource requested does not exist {segment}")
+
+    def _glob(self, pattern, recursive=False):
+        glob_method = "rglob" if recursive else "glob"
+
+        prefix_removed = self._determine_suffix()
+        candidate_files = []
+        for path in self.ORDERED_SEARCH_PATHS:
+            candidate_path = path / prefix_removed
+            dir_files = getattr(candidate_path, glob_method)(pattern)
+            candidate_files.append(dir_files)
+        relative_paths = []
+        selected_files = []
+        for files in candidate_files:
+            for file in files:
+                suffix = LocalDataPath(file)._determine_suffix()
+                if suffix not in relative_paths:
+                    selected_files.append(file)
+                    relative_paths.append(suffix)
+
+        return selected_files
+
+    def glob(self, pattern):
+        return self._glob(pattern, recursive=False)
+
+    def rglob(self, pattern):
+        return self._glob(pattern, recursive=True)
+
+    def _make_child(self, segment):
+        return self._eval_paths(segment, "_make_child", ("..",))
+
+    def joinpath(self, *pathsegments):
+        for segment in pathsegments:
+            projected = self._eval_paths(segment, "joinpath", "..")
+        return projected
+
+
+path = LocalDataPath(_config.transient.data_dir / "data")
diff --git a/...k/resources/autodan/data/autodan_init.txt → garak/data/autodan/autodan_init.txt b/...k/resources/autodan/data/autodan_init.txt → garak/data/autodan/autodan_init.txt
diff --git a/...esources/autodan/data/autodan_prompts.txt → garak/data/autodan/autodan_prompts.txt b/...esources/autodan/data/autodan_prompts.txt → garak/data/autodan/autodan_prompts.txt
diff --git a/...k/resources/autodan/data/prompt_group.pth → garak/data/autodan/prompt_group.pth b/...k/resources/autodan/data/prompt_group.pth → garak/data/autodan/prompt_group.pth
diff --git a/garak/resources/banners → garak/data/banners b/garak/resources/banners → garak/data/banners
diff --git a/garak/resources/beast/data/suffixes.txt → garak/data/beast/suffixes.txt b/garak/resources/beast/data/suffixes.txt → garak/data/beast/suffixes.txt
diff --git a/garak/resources/book_cloze.tsv → garak/data/book_cloze.tsv b/garak/resources/book_cloze.tsv → garak/data/book_cloze.tsv
diff --git a/garak/resources/book_cloze_80.tsv → garak/data/book_cloze_80.tsv b/garak/resources/book_cloze_80.tsv → garak/data/book_cloze_80.tsv
diff --git a/garak/resources/calibration/bag.md → garak/data/calibration/bag.md b/garak/resources/calibration/bag.md → garak/data/calibration/bag.md
diff --git a/.../calibration/calibration-2024-summer.json → .../calibration/calibration-2024-summer.json b/.../calibration/calibration-2024-summer.json → .../calibration/calibration-2024-summer.json
diff --git a/...calibration/calibration-alpha-202407.json → ...calibration/calibration-alpha-202407.json b/...calibration/calibration-alpha-202407.json → ...calibration/calibration-alpha-202407.json
diff --git a/garak/resources/calibration/calibration.json → garak/data/calibration/calibration.json b/garak/resources/calibration/calibration.json → garak/data/calibration/calibration.json
diff --git a/..._exclusion_toxicity_hateful_offensive.txt → ..._exclusion_toxicity_hateful_offensive.txt b/..._exclusion_toxicity_hateful_offensive.txt → ..._exclusion_toxicity_hateful_offensive.txt
diff --git a/...k/resources/donotanswer/human_chatbox.txt → garak/data/donotanswer/human_chatbox.txt b/...k/resources/donotanswer/human_chatbox.txt → garak/data/donotanswer/human_chatbox.txt
diff --git a/...ources/donotanswer/information_hazard.txt → ...k/data/donotanswer/information_hazard.txt b/...ources/donotanswer/information_hazard.txt → ...k/data/donotanswer/information_hazard.txt
diff --git a/.../resources/donotanswer/malicious_uses.txt → garak/data/donotanswer/malicious_uses.txt b/.../resources/donotanswer/malicious_uses.txt → garak/data/donotanswer/malicious_uses.txt
diff --git a/...rces/donotanswer/misinformation_harms.txt → ...data/donotanswer/misinformation_harms.txt b/...rces/donotanswer/misinformation_harms.txt → ...data/donotanswer/misinformation_harms.txt
diff --git a/garak/resources/gcg/data/gcg.txt → garak/data/gcg/gcg.txt b/garak/resources/gcg/data/gcg.txt → garak/data/gcg/gcg.txt
diff --git a/garak/resources/graph_connectivity.json → garak/data/graph_connectivity.json b/garak/resources/graph_connectivity.json → garak/data/graph_connectivity.json
diff --git a/garak/resources/guardian_cloze.tsv → garak/data/guardian_cloze.tsv b/garak/resources/guardian_cloze.tsv → garak/data/guardian_cloze.tsv
diff --git a/garak/resources/inthewild_jailbreak_llms.txt → garak/data/inthewild_jailbreak_llms.txt b/garak/resources/inthewild_jailbreak_llms.txt → garak/data/inthewild_jailbreak_llms.txt
diff --git a/garak/resources/ldnoobw-en.txt → garak/data/ldnoobw-en.txt b/garak/resources/ldnoobw-en.txt → garak/data/ldnoobw-en.txt
diff --git a/garak/resources/misp_descriptions.tsv → garak/data/misp_descriptions.tsv b/garak/resources/misp_descriptions.tsv → garak/data/misp_descriptions.tsv
diff --git a/garak/resources/nyt_cloze.tsv → garak/data/nyt_cloze.tsv b/garak/resources/nyt_cloze.tsv → garak/data/nyt_cloze.tsv
diff --git a/...resources/ofcom-potentially-offensive.txt → garak/data/ofcom-potentially-offensive.txt b/...resources/ofcom-potentially-offensive.txt → garak/data/ofcom-potentially-offensive.txt
diff --git a/...k/resources/payloads/keyedprod_win10.json → garak/data/payloads/keyedprod_win10.json b/...k/resources/payloads/keyedprod_win10.json → garak/data/payloads/keyedprod_win10.json
diff --git a/garak/resources/potter_cloze.tsv → garak/data/potter_cloze.tsv b/garak/resources/potter_cloze.tsv → garak/data/potter_cloze.tsv
diff --git a/garak/resources/primality_testing.json → garak/data/primality_testing.json b/garak/resources/primality_testing.json → garak/data/primality_testing.json
diff --git a/garak/resources/profanity_en.csv → garak/data/profanity_en.csv b/garak/resources/profanity_en.csv → garak/data/profanity_en.csv
diff --git a/...es/realtoxicityprompts/rtp_flirtation.txt → ...ta/realtoxicityprompts/rtp_flirtation.txt b/...es/realtoxicityprompts/rtp_flirtation.txt → ...ta/realtoxicityprompts/rtp_flirtation.txt
diff --git a/...altoxicityprompts/rtp_identity_attack.txt → ...altoxicityprompts/rtp_identity_attack.txt b/...altoxicityprompts/rtp_identity_attack.txt → ...altoxicityprompts/rtp_identity_attack.txt
diff --git a/...ources/realtoxicityprompts/rtp_insult.txt → ...k/data/realtoxicityprompts/rtp_insult.txt b/...ources/realtoxicityprompts/rtp_insult.txt → ...k/data/realtoxicityprompts/rtp_insult.txt
diff --git a/...ces/realtoxicityprompts/rtp_profanity.txt → ...ata/realtoxicityprompts/rtp_profanity.txt b/...ces/realtoxicityprompts/rtp_profanity.txt → ...ata/realtoxicityprompts/rtp_profanity.txt
diff --git a/...altoxicityprompts/rtp_severe_toxicity.txt → ...altoxicityprompts/rtp_severe_toxicity.txt b/...altoxicityprompts/rtp_severe_toxicity.txt → ...altoxicityprompts/rtp_severe_toxicity.txt
diff --git a/...toxicityprompts/rtp_sexually_explicit.txt → ...toxicityprompts/rtp_sexually_explicit.txt b/...toxicityprompts/rtp_sexually_explicit.txt → ...toxicityprompts/rtp_sexually_explicit.txt
diff --git a/...ources/realtoxicityprompts/rtp_threat.txt → ...k/data/realtoxicityprompts/rtp_threat.txt b/...ources/realtoxicityprompts/rtp_threat.txt → ...k/data/realtoxicityprompts/rtp_threat.txt
diff --git a/garak/resources/safebench_filenames.txt → garak/data/safebench_filenames.txt b/garak/resources/safebench_filenames.txt → garak/data/safebench_filenames.txt
diff --git a/garak/resources/safebenchtiny_filenames.txt → garak/data/safebenchtiny_filenames.txt b/garak/resources/safebenchtiny_filenames.txt → garak/data/safebenchtiny_filenames.txt
diff --git a/garak/resources/senator_search.json → garak/data/senator_search.json b/garak/resources/senator_search.json → garak/data/senator_search.json
diff --git a/garak/resources/slurprompts.jsonl → garak/data/slurprompts.jsonl b/garak/resources/slurprompts.jsonl → garak/data/slurprompts.jsonl
diff --git a/garak/resources/slurprompts_mini.jsonl → garak/data/slurprompts_mini.jsonl b/garak/resources/slurprompts_mini.jsonl → garak/data/slurprompts_mini.jsonl
diff --git a/garak/resources/slursreclaimedslurs.txt → garak/data/slursreclaimedslurs.txt b/garak/resources/slursreclaimedslurs.txt → garak/data/slursreclaimedslurs.txt
diff --git a/garak/resources/tap/data/tap_jailbreaks.txt → garak/data/tap/tap_jailbreaks.txt b/garak/resources/tap/data/tap_jailbreaks.txt → garak/data/tap/tap_jailbreaks.txt
diff --git a/garak/resources/truefalse_falseclaims.txt → garak/data/truefalse_falseclaims.txt b/garak/resources/truefalse_falseclaims.txt → garak/data/truefalse_falseclaims.txt
diff --git a/garak/resources/truefalse_falseclaims_50.txt → garak/data/truefalse_falseclaims_50.txt b/garak/resources/truefalse_falseclaims_50.txt → garak/data/truefalse_falseclaims_50.txt
diff --git a/garak/resources/typology_payloads.tsv → garak/data/typology_payloads.tsv b/garak/resources/typology_payloads.tsv → garak/data/typology_payloads.tsv
diff --git a/garak/detectors/riskywords.py b/garak/detectors/riskywords.py
@@ -7,11 +7,12 @@
 import csv
 
 from garak import _config
+from garak.data import path as data_path
 from garak.detectors.base import StringDetector
 
 surge_list = defaultdict(list)
 with open(
-    _config.transient.package_dir / "resources" / "profanity_en.csv",
+    data_path / "profanity_en.csv",
     "r",
     encoding="utf-8",
 ) as csvfile:
@@ -26,7 +27,7 @@
 MIN_OFCOM_STRENGTH = 2
 ofcom_list = defaultdict(list)
 with open(
-    _config.transient.package_dir / "resources" / "ofcom-potentially-offensive.txt",
+    data_path / "ofcom-potentially-offensive.txt",
     "r",
     encoding="utf-8",
 ) as csvfile:
@@ -285,7 +286,7 @@ class LDNOOBW(StringDetector):
     def __init__(self, config_root=_config):
         substrings = []
         with open(
-            _config.transient.package_dir / "resources" / "ldnoobw-en.txt",
+            data_path / "ldnoobw-en.txt",
             "r",
             encoding="utf-8",
         ) as wordlist:

diff --git a/garak/detectors/specialwords.py b/garak/detectors/specialwords.py
@@ -8,6 +8,7 @@
 
 from garak import _config
 from garak.detectors.base import StringDetector
+from garak.data import path as data_path
 
 
 class SlursReclaimedSlurs(StringDetector):
@@ -25,7 +26,7 @@ class SlursReclaimedSlurs(StringDetector):
     def __init__(self, config_root=_config):
         self.substrings = set([])
         with open(
-            _config.transient.package_dir / "resources" / "slurprompts.jsonl",
+            data_path / "slurprompts.jsonl",
             "r",
             encoding="utf-8",
         ) as slurfile:

diff --git a/garak/interactive.py b/garak/interactive.py
@@ -14,6 +14,7 @@
 import garak.harnesses.probewise
 from garak import _config
 from garak.evaluators import ThresholdEvaluator
+from garak.data import path as data_path
 from garak._plugins import enumerate_plugins
 
 logger = getLogger(__name__)
@@ -232,9 +233,7 @@ def settings_ns_provider(self) -> argparse.Namespace:
 
 def banner():
     """Display a random banner"""
-    with open(
-        _config.transient.package_dir / "resources" / "banners", "r", encoding="utf-8"
-    ) as bannerfile:
+    with open(data_path / "banners", "r", encoding="utf-8") as bannerfile:
         banners = bannerfile.read().lstrip().split("\n\n")
         print(Fore.LIGHTRED_EX + f"\n{random.choice(banners)}\n" + Fore.RESET)
 

diff --git a/garak/payloads.py b/garak/payloads.py
@@ -16,6 +16,7 @@
 
 import garak._config
 import garak.exception
+from garak.data import path as data_path
 
 
 PAYLOAD_SCHEMA = {
@@ -35,10 +36,7 @@
     ],
 }
 
-PAYLOAD_SEARCH_DIRS = [
-    garak._config.transient.data_dir / "resources" / "payloads",
-    garak._config.transient.package_dir / "resources" / "payloads",
-]
+PAYLOAD_DIR = data_path / "payloads"
 
 
 def _validate_payload(payload_json):
@@ -52,17 +50,9 @@ def _validate_payload(payload_json):
 def load_payload(
     name: str, path: Union[str, pathlib.Path, None] = None
 ) -> PayloadGroup:
-    if path is not None:
-        return PayloadGroup(name, path)
-    else:
-        # iterate through search dirs
-        for dir in PAYLOAD_SEARCH_DIRS:
-            path = pathlib.Path(dir) / f"{name}.json"
-            if path.is_file():
-                return PayloadGroup(name, path)
-    raise FileNotFoundError(
-        "File '%s.json' not found in payload search directories" % name
-    )
+    if path is None:
+        path = PAYLOAD_DIR / f"{name}.json"
+    return PayloadGroup(name, path)
 
 
 class PayloadGroup:
@@ -155,7 +145,7 @@ def _scan_payload_dir(self, dir) -> dict:
         payloads, return name:path dict. optionally filter by type prefixes"""
 
         payloads_found = {}
-        dir = pathlib.Path(dir)
+        dir = dir
         if not dir.is_dir():
             return {}
 
@@ -182,11 +172,7 @@ def _scan_payload_dir(self, dir) -> dict:
     def _refresh_payloads(self) -> None:
         """Scan resources/payloads and the XDG_DATA_DIR/payloads for
         payload objects, and refresh self.payload_list"""
-        self.payload_list = {}
-        for payload_dir in PAYLOAD_SEARCH_DIRS[
-            ::-1
-        ]:  # reverse order because | clobbers at top-level key
-            self.payload_list = self.payload_list | self._scan_payload_dir(payload_dir)
+        self.payload_list = self._scan_payload_dir(PAYLOAD_DIR)
 
     def search(
         self, types: Union[List[str], None] = None, include_children=True
@@ -222,10 +208,10 @@ def load(self, name) -> PayloadGroup:
             logging.error(msg, exc_info=ke)
             raise garak.exception.PayloadFailure(msg) from ke
 
-        except FileNotFoundError as fnfe:
+        except garak.exception.GarakException as ge:
             msg = f"Requested payload {name} not found at expected path {path}"
-            logging.error(msg, exc_info=fnfe)
-            raise garak.exception.PayloadFailure(msg) from fnfe
+            logging.error(msg, exc_info=ge)
+            raise garak.exception.PayloadFailure(msg) from ge
 
         return p
 

diff --git a/garak/probes/continuation.py b/garak/probes/continuation.py
@@ -9,6 +9,7 @@
 
 from garak import _config
 from garak.attempt import Attempt
+from garak.data import path as data_path
 from garak.probes.base import Probe
 
 
@@ -43,7 +44,7 @@ def __init__(self, config_root=_config):
         # append these to each of the prefixes for that term
         self._slur_prefixes = []
         with open(
-            _config.transient.package_dir / "resources" / self.source_resource_filename,
+            data_path / self.source_resource_filename,
             "r",
             encoding="utf-8",
         ) as slurfile:

diff --git a/garak/probes/dan.py b/garak/probes/dan.py
@@ -11,6 +11,7 @@
 
 import garak.attempt
 from garak import _config
+from garak.data import path as data_path
 from garak.probes.base import Probe
 
 
@@ -537,11 +538,7 @@ class AutoDANCached(Probe):
 
     def __init__(
         self,
-        autodan_prompt_location: str = _config.transient.package_dir
-        / "resources"
-        / "autodan"
-        / "data"
-        / "autodan_prompts.txt",
+        autodan_prompt_location: str = data_path / "autodan" / "autodan_prompts.txt",
         config_root=_config,
     ):
         self.autodan_prompt_location = autodan_prompt_location
@@ -645,9 +642,7 @@ class DanInTheWild(Probe):
     def __init__(self, config_root=_config):
         super().__init__(config_root=config_root)
 
-        inthewild_path = str(
-            _config.transient.package_dir / "resources" / "inthewild_jailbreak_llms.txt"
-        )
+        inthewild_path = str(data_path / "inthewild_jailbreak_llms.txt")
 
         with open(inthewild_path, "r", encoding="utf-8") as f:
             self.prompts = f.readlines()

diff --git a/garak/probes/donotanswer.py b/garak/probes/donotanswer.py
@@ -11,6 +11,7 @@
 import sys
 
 from garak import _config
+from garak.data import path as data_path
 from garak.probes.base import Probe
 
 this = sys.modules[__name__]
@@ -19,10 +20,7 @@
 def load_local_data(self):
     # prompts
     with open(
-        _config.transient.package_dir
-        / "resources"
-        / "donotanswer"
-        / f"{self.dna_category}.txt",
+        data_path / "donotanswer" / f"{self.dna_category}.txt",
         encoding="utf=8",
     ) as f:
         self.prompts = f.read().strip().split("\n")

diff --git a/garak/probes/encoding.py b/garak/probes/encoding.py
@@ -12,6 +12,7 @@
 import json
 
 from garak import _config
+from garak.data import path as data_path
 from garak.probes.base import Probe
 
 payload_library = {}
@@ -58,7 +59,7 @@ def _load_payloads():
 
         payload_library["slur_terms"] = []
         with open(
-            _config.transient.package_dir / "resources" / "slurprompts_mini.jsonl",
+            data_path / "slurprompts_mini.jsonl",
             "r",
             encoding="utf-8",
         ) as slurfile: