Data Provenance data #61

Merged · 17 commits · May 23, 2024
10 changes: 5 additions & 5 deletions CONTRIBUTING.md
@@ -23,7 +23,7 @@ The current list of the permissive licenses allowed by this project is below and
- [MIT License](https://opensource.org/license/mit/)
- [BSD License](https://opensource.org/license/bsd-2-clause/)

This list contains some of the common permissive licenses that cover many large data sources, but we intend to expand this list as we continue to collect data. If you come across a source with a license that you believe should be on this list, feel free to comment in our [Allowable License Meta-Issue](https://github.com/r-three/licensed-pile/issues/34).

### Finding License Information

@@ -47,14 +47,14 @@ License information can sometimes be difficult to find for certain text sources

5. An "about" page can include licensing information for the website as a whole.

## Contributing Data Collection Code

Once you have selected a source from the list of [Issues](https://github.com/r-three/licensed-pile/issues), add a comment that you plan to work on it and an admin will assign the issue to you. Then, follow these guidelines to get started with contributing to the repo:

1. Clone the repo

2. Run `pip install -r requirements.txt`

3. Create a subdirectory for your data source (e.g., the `licensed-pile/gutenberg` directory for the Project Gutenberg data source).

4. Identify the best way to collect the raw data
@@ -67,11 +67,11 @@ Once you have selected a source from the list of [Issues](https://github.com/r-t

5. If necessary, write code to filter the downloaded items down to only those with appropriate licenses.

6. Write code that outputs the resulting data to `licensed-pile/data/{SOURCE}/v0`

> The data format used in this project is [Dolma](https://github.com/allenai/dolma). To write out the resulting data as a Dolma dataset, convert each record in the dataset to a Python dictionary and use the utilities in `licensed-pile/licensed_pile/write.py` to convert the list of dictionaries to a Dolma dataset. In cases where the dataset is very large, it is better to define a record generator rather than a list and pass the generator to the Dolma utility functions (a rough sketch follows the key list below).

> Each record should minimally have the following keys:
```json
{
"id": <unique record identifier>,
    ...
}
```
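
For concreteness, here is a minimal sketch of that workflow. It assumes `licensed_pile/write.py` exposes a `to_dolma(records, path, filename)`-style helper and assumes the keys other than `id`; check that module and the full key list for the actual interface.

```python
# Minimal sketch, not the project's actual implementation: the `to_dolma` name,
# its signature, and every key other than "id" are assumptions used to illustrate
# the workflow; consult licensed_pile/write.py and the key list above for the real API.
import datetime

from licensed_pile.write import to_dolma  # assumed helper name

# Toy stand-in for whatever your collection step produced.
raw_items = [
    {"id": "doc-0001", "text": "Example document text.", "license": "CC-BY-4.0"},
]


def generate_records(items):
    """Yield one Dolma-style dict per raw item; a generator keeps memory flat for large sources."""
    for item in items:
        yield {
            "id": item["id"],  # unique record identifier
            "text": item["text"],  # assumed key
            "source": "my-source",  # assumed key
            "added": datetime.datetime.now(datetime.timezone.utc).isoformat(),  # assumed key
            "metadata": {"license": item["license"]},  # assumed key
        }


# Pass the generator (not a materialized list) so very large sources stream to disk.
to_dolma(generate_records(raw_items), "data/my-source/v0", "my-source.jsonl.gz")
```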
2 changes: 1 addition & 1 deletion courtlistener/process_csv_file.sh
@@ -1,3 +1,3 @@
#!/usr/bin/env sh
set -e
python courtlistener/process_csv.py
1 change: 1 addition & 0 deletions data_provenance/.gitignore
@@ -0,0 +1 @@
data/*
20 changes: 20 additions & 0 deletions data_provenance/README.md
@@ -0,0 +1,20 @@
# Processing scripts for Data Provenance data

The [Data Provenance Initiative](https://www.dataprovenance.org) is a digital library of supervised datasets that have been manually annotated with their source and license information. It wraps HuggingFace datasets with extra metadata and provides code to download, standardize, and filter them by various criteria.

In this case, we have filtered for the following criteria (a rough sketch of the corresponding checks follows this list):
* English-language or code data
* No model-generated text
* Datasets have a commercially viable license, found through the Data Provenance Initiative or the hosting GitHub repository
* We only include datasets where all associated licenses (from the Data Provenance Initiative and GitHub) are open source compliant or appear in the Gold, Silver, or Bronze lists of the [Blue Oak Council](https://blueoakcouncil.org/list).
* The original source(s) of the text come only from the list of sources in `source_allow_list.txt`
* We only include datasets where the relevant license sources are thoroughly documented and linked.
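
The actual filtering is implemented in the Data Provenance Collection code via the config linked below; the snippet here is only a hypothetical sketch of the kind of checks these criteria imply. The field names, the allow-list contents, and the `keep_dataset` helper are assumptions for illustration.

```python
# Hypothetical sketch of the checks implied by the criteria above; the real
# filtering lives in the Data Provenance Collection repository, not here.
ALLOWED_LICENSES = {"MIT", "BSD-2-Clause", "Apache-2.0", "CC-BY-4.0"}  # stand-in for the OSI / Blue Oak lists

# source_allow_list.txt is the allow-list referenced above; one source name per line.
with open("source_allow_list.txt") as f:
    ALLOWED_SOURCES = {line.strip() for line in f if line.strip()}


def keep_dataset(entry: dict) -> bool:
    """Return True only if every license and text source attached to the dataset is allowed."""
    licenses_ok = all(lic in ALLOWED_LICENSES for lic in entry["licenses"])    # assumed field
    sources_ok = all(src in ALLOWED_SOURCES for src in entry["text_sources"])  # assumed field
    return licenses_ok and sources_ok and not entry["model_generated"]         # assumed field
```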

The specific filter settings are here: https://github.com/Data-Provenance-Initiative/Data-Provenance-Collection/blob/main/src/configs/pile_v2_test.yaml


To download the data, run the following from inside the `data_provenance` directory:

1. Run `python download.py --include include.csv`

2. Run `python to-dolma.py --include include.csv`
23 changes: 23 additions & 0 deletions data_provenance/constants.py
@@ -0,0 +1,23 @@
HF_MAPPING = {
    "CommitPackFT": "commitpack_ft",
    "Dolly 15k": "dolly_15k",
    "Open Assistant v2": "open_assistant_v2",
    "Open Assistant OctoPack": "octopack_oa",
    "Open Assistant": "open_assistant",
    "OIG": "oig",
    "Anthropic HH-RLHF": "rlhf_anthropic_hh",
    "Flan Collection (Super-NaturalInstructions)": "flan_sni",
    "Flan Collection (P3)": "flan_p3",
    "Flan Collection (Flan 2021)": "flan_2021",
    "Tasksource Symbol-Tuning": "tasksource_symboltuning",
    "Tasksource Instruct": "tasksource_instruct",
    "Flan Collection (Chain-of-Thought)": "flan_cot",
    "HelpSteer": "helpsteer",
    "Aya Dataset": "aya_dataset",
    "AgentInstruct": "agentinstruct",
    "xP3x": "xp3x",
    "Flan Collection (Dialog)": "flan_dialog",
    "Joke Explanation": "joke_explanation",
    "StarCoder Self-Instruct": "starcoder_selfinstruct",
    "DialogStudio": "dialogstudio",
}
80 changes: 80 additions & 0 deletions data_provenance/download.py
@@ -0,0 +1,80 @@
"""Download Data Provenance Initative data"""

import argparse
import gzip
import json
import logging
import multiprocessing
import os
import tarfile
import typing
from collections import defaultdict

import jsonlines
import pandas as pd
from constants import HF_MAPPING
from datasets import load_dataset
from tqdm.auto import tqdm

from licensed_pile.logs import configure_logging, get_logger


def parse_args():
    parser = argparse.ArgumentParser(description="Data Provenance Data Downloader")
    parser.add_argument(
        "--hf",
        default="DataProvenanceInitiative/Ultra_Permissive_Test",
        help="The label for the HuggingFace dataset that can be used in HuggingFace's load_dataset()",
    )
    parser.add_argument(
        "--include",
        default="include.csv",
        help="Path to csv file with `Collection Name, Dataset ID` we will include",
    )
    parser.add_argument(
        "--outdir", default="data/raw-data-provenance", help="Path to output directory"
    )
    return parser.parse_args()


def write_jsonl_gz(
    data,
    outpath,
):
    dirname = os.path.dirname(outpath)
    if dirname:
        os.makedirs(dirname, exist_ok=True)
    with gzip.open(outpath, "wb") as fp:  # Open file in binary write mode
        data_bytes = (
            b"\n".join(json.dumps(d).encode() for d in data) + b"\n"
        )  # Encode strings to bytes
        fp.write(data_bytes)


def main(args):
    logger = get_logger()
    logger.info(f"Filtering to just the datasets in {args.include}")

    include_df = pd.read_csv(args.include)
    include_collections = list(set(include_df["Collection"]))
    include_dset_ids = set(include_df["Dataset ID"])

    for collection in include_collections:
        # Map the human-readable collection name to its folder in the HuggingFace dataset repo.
        folder_name = HF_MAPPING[collection]
        subset = load_dataset(
            args.hf,
            split="train",
            num_proc=os.cpu_count(),
            revision="main",
            data_files=f"data/{folder_name}/*.jsonl",
        ).to_list()
        # Keep only the examples whose dataset ID appears in the include list.
        exs = [ex for ex in subset if ex["dataset"] in include_dset_ids]
        savepath = os.path.join(args.outdir, f"{folder_name}.jsonl.gz")
        write_jsonl_gz(exs, savepath)
        logger.info(f"Saving {len(exs)} examples to {savepath}")


if __name__ == "__main__":
    args = parse_args()
    configure_logging()
    main(args)