Skip to content

Commit

Permalink
use XDG paths for configuration data and caching
Browse files Browse the repository at this point in the history
Support using [XDG ver 0.8](https://specifications.freedesktop.org/basedir-spec/basedir-spec-latest.html) for project data.

Specifically support:
| ENV VAR          | DEFAULT            |
|------------------|--------------------|
| $XDG_DATA_HOME   | $HOME/.local/share |
| $XDG_CONFIG_HOME | $HOME/.config      |
| $XDG_CACHE_HOME  | $HOME/.cache       |

Project name `garak` is appended to each location.

This represents the following breaking changes to project expectations:
* report_prefix passed either at the command line or as config file option
  * set filename values only
  * no longer overrides report_dir
* report_dir passed as a config file option
  * when provided as a relative path, it will be prepended with `<xdg_data_home>/garak`
  * when provided as an absolute path, it will be used as the output directory
* default `user/site` configuration file `garak.site.yaml` has moved
  * previously `<basedir>/garak.site.yaml`
  * updated location `<xdg_config_home>/garak/garak.site.yaml`

Additional changes (not considered breaking changes):
* nltk data is placed in <xdg_cache_home>/garak if not already found in the environment
* visual_jailbreak downloaded artifacts are placed in <xdg_cache_home>/garak/resources
* generated data for beast/gcg/tap are placed in <xdg_cache_home>/garak/resources

Signed-off-by: Jeffrey Martin <jemartin@nvidia.com>
  • Loading branch information
jmartin-tech committed Jul 23, 2024
1 parent 0f461f5 commit cc75afc
Show file tree
Hide file tree
Showing 19 changed files with 288 additions and 72 deletions.
16 changes: 15 additions & 1 deletion garak/_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@
import pathlib
from typing import List
import yaml
from xdg_base_dirs import (
xdg_cache_home,
xdg_config_home,
xdg_data_home,
)

DICT_CONFIG_AFTER_LOAD = False

Expand All @@ -26,6 +31,7 @@
run_params = "seed deprefix eval_threshold generations probe_tags interactive".split()
plugins_params = "model_type model_name extended_detectors".split()
reporting_params = "taxonomy report_prefix".split()
project_dir = "garak"


loaded = False
Expand Down Expand Up @@ -53,9 +59,17 @@ class TransientConfig(GarakSubConfig):
args = None # only access this when determining what was passed on CLI
run_id = None
basedir = pathlib.Path(__file__).parents[0]
config_dir = xdg_config_home() / project_dir
data_dir = xdg_data_home() / project_dir
cache_dir = xdg_cache_home() / project_dir
starttime = None
starttime_iso = None

# initialize the user home and cache paths if they do not exist
config_dir.mkdir(mode=0o740, parents=True, exist_ok=True)
data_dir.mkdir(mode=0o740, parents=True, exist_ok=True)
cache_dir.mkdir(mode=0o740, parents=True, exist_ok=True)


transient = TransientConfig()

Expand Down Expand Up @@ -151,7 +165,7 @@ def load_config(

settings_files = [str(transient.basedir / "resources" / "garak.core.yaml")]

fq_site_config_filename = str(transient.basedir / site_config_filename)
fq_site_config_filename = str(transient.config_dir / site_config_filename)
if os.path.isfile(fq_site_config_filename):
settings_files.append(fq_site_config_filename)
else:
Expand Down
7 changes: 6 additions & 1 deletion garak/_plugins.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,9 @@ def default(self, obj):

class PluginCache:
_plugin_cache_file = _config.transient.basedir / "resources" / "plugin_cache.json"
_user_plugin_cache_file = _plugin_cache_file
_user_plugin_cache_file = (
_config.transient.cache_dir / "resources" / "plugin_cache.json"
)
_plugin_cache_dict = None

def __init__(self) -> None:
Expand All @@ -55,6 +57,9 @@ def _load_plugin_cache(self):
if not os.path.exists(self._plugin_cache_file):
self._build_plugin_cache()
if not os.path.exists(self._user_plugin_cache_file):
self._user_plugin_cache_file.parent.mkdir(
mode=0o740, parents=True, exist_ok=True
)
shutil.copy2(self._plugin_cache_file, self._user_plugin_cache_file)
with open(self._user_plugin_cache_file, "r", encoding="utf-8") as cache_file:
local_cache = json.load(cache_file)
Expand Down
33 changes: 21 additions & 12 deletions garak/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,10 @@


def start_logging():
from garak import _config

logging.basicConfig(
filename="garak.log",
filename=_config.transient.data_dir / "garak.log",
level=logging.DEBUG,
format="%(asctime)s %(levelname)s %(message)s",
)
Expand All @@ -32,6 +34,7 @@ def start_run():
import os
import uuid

from pathlib import Path
from garak import _config

logging.info("started at %s", _config.transient.starttime_iso)
Expand All @@ -41,19 +44,25 @@ def start_run():
"⚠️ The current/default config is optimised for speed rather than thoroughness. Try e.g. --config full for a stronger test, or specify some probes."
)
_config.transient.run_id = str(uuid.uuid4()) # uuid1 is safe but leaks host info
# why is report prefix a condition of placing file in the report_dir?
report_path = Path(_config.reporting.report_dir)
if not report_path.is_absolute():
logging.debug("relative report dir provided")
report_path = _config.transient.data_dir / _config.reporting.report_dir
if not os.path.isdir(report_path):
try:
report_path.mkdir(mode=0o740, parents=True, exist_ok=True)
except PermissionError as e:
raise PermissionError(
f"Can't create logging directory {report_path}, quitting"
) from e

filename = f"garak.{_config.transient.run_id}.report.jsonl"
if not _config.reporting.report_prefix:
if not os.path.isdir(_config.reporting.report_dir):
try:
os.mkdir(_config.reporting.report_dir)
except PermissionError as e:
raise PermissionError(
f"Can't create logging directory {_config.reporting.report_dir}, quitting"
) from e
_config.transient.report_filename = f"{_config.reporting.report_dir}/garak.{_config.transient.run_id}.report.jsonl"
filename = f"garak.{_config.transient.run_id}.report.jsonl"
else:
_config.transient.report_filename = (
_config.reporting.report_prefix + ".report.jsonl"
)
filename = _config.reporting.report_prefix + ".report.jsonl"
_config.transient.report_filename = str(report_path / filename)
_config.transient.reportfile = open(
_config.transient.report_filename, "w", buffering=1, encoding="utf-8"
)
Expand Down
12 changes: 10 additions & 2 deletions garak/evaluators/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import json
import logging
from pathlib import Path
from typing import Iterable

from colorama import Fore, Style
Expand Down Expand Up @@ -72,12 +73,19 @@ def evaluate(self, attempts: Iterable[garak.attempt.Attempt]) -> None:
hitlog_mode = (
"w" if _config.transient.hitlogfile is None else "a"
)
# why is report prefix a condition of placing file in the report_dir?
hitlog_path = Path(_config.transient.report_filename).parent
if not _config.reporting.report_prefix:
hitlog_filename = f"{_config.reporting.report_dir}/garak.{_config.transient.run_id}.hitlog.jsonl"
hitlog_filename = (
hitlog_path
/ f"garak.{_config.transient.run_id}.hitlog.jsonl"
) #
else:
hitlog_filename = (
_config.reporting.report_prefix + ".hitlog.jsonl"
hitlog_path
/ f"{_config.reporting.report_prefix}.hitlog.jsonl"
)

logging.info("hit log in %s", hitlog_filename)
_config.transient.hitlogfile = open(
hitlog_filename,
Expand Down
2 changes: 1 addition & 1 deletion garak/interactive.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ def settings_ns_provider(self) -> argparse.Namespace:
def banner():
"""Display a random banner"""
with open(
_config.transient.basedir / "resources/banners", "r", encoding="utf-8"
_config.transient.basedir / "resources" / "banners", "r", encoding="utf-8"
) as bannerfile:
banners = bannerfile.read().lstrip().split("\n\n")
print(Fore.LIGHTRED_EX + f"\n{random.choice(banners)}\n" + Fore.RESET)
Expand Down
3 changes: 2 additions & 1 deletion garak/probes/tap.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
"""

import logging
from pathlib import Path
from typing import List

import tqdm
Expand Down Expand Up @@ -68,7 +69,7 @@ class TAPCached(Probe):

def __init__(
self,
prompts_location: str = _config.transient.basedir
prompts_location: Path = _config.transient.basedir
/ "resources"
/ "tap"
/ "data"
Expand Down
6 changes: 3 additions & 3 deletions garak/probes/visual_jailbreak.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ class FigStep(Probe):

def _load_SafeBench(self):
safebench_data_dir = (
_config.transient.basedir / "resources" / "visual_jailbreak" / "SafeBench"
_config.transient.cache_dir / "resources" / "visual_jailbreak" / "SafeBench"
)
if not os.path.exists(safebench_data_dir):
# make the dir
Expand Down Expand Up @@ -94,7 +94,7 @@ def probe(self, generator):
self.prompts = [
{
"text": prompt["text"],
"image": str(_config.transient.basedir / prompt["image"]),
"image": str(_config.transient.cache_dir / prompt["image"]),
}
for prompt in self.prompts
]
Expand Down Expand Up @@ -125,7 +125,7 @@ def probe(self, generator):
self.prompts = [
{
"text": prompt["text"],
"image": str(_config.transient.basedir / prompt["image"]),
"image": str(_config.transient.cache_dir / prompt["image"]),
}
for prompt in self.prompts
]
Expand Down
15 changes: 9 additions & 6 deletions garak/resources/autodan/autodan.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@
autodan_resource_data = (
garak._config.transient.basedir / "resources" / "autodan" / "data"
)
cached_autodan_resource_data = (
garak._config.transient.cache_dir / "resources" / "autodan" / "data"
)
autodan_parser = argparse.ArgumentParser(description="AutoDAN config")
autodan_parser.add_argument(
"--num_steps", type=int, default=100, help="Number of steps to run generation"
Expand Down Expand Up @@ -88,9 +91,9 @@ def autodan_generate(
mutation_generator_name: str = "gpt-3.5-turbo",
mutation_generator_type: str = "openai",
hierarchical: bool = False,
out_path: str = str(autodan_resource_data / "autodan_prompts.txt"),
init_prompt_path: str = str(autodan_resource_data / "autodan_init.txt"),
reference_path: str = str(autodan_resource_data / "prompt_group.pth"),
out_path: Path = cached_autodan_resource_data / "autodan_prompts.txt",
init_prompt_path: Path = autodan_resource_data / "autodan_init.txt",
reference_path: Path = autodan_resource_data / "prompt_group.pth",
low_memory: bool = False,
random_seed: int = None,
):
Expand All @@ -109,9 +112,9 @@ def autodan_generate(
mutation_generator_name (str): Name of model to use as the mutation generator
mutation_generator_type (str): Type of model to use as the mutation generator
hierarchical (bool): Whether ot use hierarchical GA
out_path (str): Path to write generated AutoDAN string
init_prompt_path (str): Path to initial prompts
reference_path (str): Path to reference prompt tensors
out_path (Path): Path to write generated AutoDAN string
init_prompt_path (Path): Path to initial prompts
reference_path (Path): Path to reference prompt tensors
low_memory (bool): Whether to use low memory
random_seed (int): Random seed, if used.
Expand Down
29 changes: 25 additions & 4 deletions garak/resources/autodan/genetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,33 +2,54 @@
# SPDX-License-Identifier: Apache-2.0

import gc
import nltk.downloader
import numpy as np
import torch
import random
import openai
import os
import re
import nltk
from nltk.corpus import stopwords, wordnet
from collections import defaultdict, OrderedDict
from pathlib import Path
import sys
import time
from logging import getLogger
from typing import Tuple

from garak import _config
from garak.resources.autodan.model_utils import AutoDanPrefixManager, forward

logger = getLogger(__name__)


def _nltk_data():
    """Pick the directory that nltk corpora should be downloaded into.

    Asks nltk's ``Downloader`` for its default download directory and
    returns it when that path already exists on disk. Otherwise the
    project cache location (``_nltk_data_path``) is created and returned.

    Returns:
        Path: the directory to pass as ``download_dir`` to ``nltk.download``.
    """
    from nltk.downloader import Downloader

    nltk_default = Path(Downloader().default_download_dir())
    if nltk_default.exists():
        # an nltk data directory is already present; reuse it
        return nltk_default

    # no existing nltk location was found, so fall back to the project cache
    logger.debug("nltk_data location not found using project cache location")
    _nltk_data_path.mkdir(mode=0o740, parents=True, exist_ok=True)
    return _nltk_data_path


_nltk_data_path = _config.transient.cache_dir / "nltk_data"
nltk.data.path.append(str(_nltk_data_path))

# TODO: Refactor into setup.py
try:
_ = stopwords.words("english")
_ = nltk.word_tokenize("This is a normal English sentence")
_ = wordnet.synsets("word")
except LookupError as e:
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")
download_path = _nltk_data()
nltk.download("stopwords", download_dir=download_path)
nltk.download("punkt", download_dir=download_path)
nltk.download("wordnet", download_dir=download_path)


# TODO: Could probably clean up the inputs here by using imports.
Expand Down
12 changes: 9 additions & 3 deletions garak/resources/beast/beast_attack.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# SPDX-License-Identifier: Apache-2.0

import logging
from pathlib import Path
from typing import Union, Optional, Tuple

import torch
Expand All @@ -17,7 +18,7 @@
"A chat between a curious user and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the user's questions."
)
beast_resource_data = garak._config.transient.basedir / "resources" / "beast" / "data"
beast_resource_data = garak._config.transient.cache_dir / "resources" / "beast" / "data"


def _format_chat(generator: Generator, prompt: str):
Expand Down Expand Up @@ -296,7 +297,11 @@ def _attack(
if responses is None:
responses = ["" for _ in range(len(prompts))]
for prompt, response in tqdm(
zip(prompts, responses), total=len(prompts), leave=False, position=0, desc="BEAST attack"
zip(prompts, responses),
total=len(prompts),
leave=False,
position=0,
desc="BEAST attack",
):
best_candidate = []
if trials > 1:
Expand Down Expand Up @@ -342,7 +347,7 @@ def run_beast(
suffix_len: int = 40,
data_size: int = 20,
target: Optional[str] = "",
outfile: str = beast_resource_data / "suffixes.txt",
outfile: Path = beast_resource_data / "suffixes.txt",
stop_early: bool = False,
) -> Union[list[str], None]:
"""
Expand Down Expand Up @@ -395,6 +400,7 @@ def run_beast(
)

if suffixes and outfile:
outfile.parent.mkdir(mode=0o740, parents=True, exist_ok=True)
with open(outfile, "a") as f:
for suffix in suffixes:
f.write(f"{suffix}\n")
Expand Down
12 changes: 11 additions & 1 deletion garak/resources/common.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
import shutil
import urllib.error
from pathlib import Path
import pandas as pd
Expand Down Expand Up @@ -42,12 +43,21 @@


def load_advbench(size: int = 0) -> pd.DataFrame:
advbench_path = (
advbench_base_path = (
garak._config.transient.basedir
/ "resources"
/ "advbench"
/ "harmful_behaviors.csv"
)
advbench_path = (
garak._config.transient.cache_dir
/ "resources"
/ "advbench"
/ "harmful_behaviors.csv"
)
if advbench_base_path.is_file() and not advbench_path.is_file():
shutil.copy2(advbench_base_path, advbench_path)

if not advbench_path.is_file():
try:
hb = "https://raw.githubusercontent.com/llm-attacks/llm-attacks/main/data/advbench/harmful_behaviors.csv"
Expand Down
Loading

0 comments on commit cc75afc

Please sign in to comment.