USPTO (#71)

* Add data processing and conversion scripts

* Add process_uspto.sh script for processing USPTO data

* Add TypeScript compilation step

* replace bs4 with lxml. Also using polars for the main computation.

* Add row limit for testing.

* add multiprocessing to speed things up

* split runtime script into setup and run

* remove streaming. slower but will need a lot more memory

* fix typehint: Sequence defines `__len__`

* add args to bash script

* add readme

* use pandoc
baberabb committed Jun 11, 2024
1 parent 1ef9a1c commit 6899846
Showing 9 changed files with 428 additions and 3 deletions.
2 changes: 1 addition & 1 deletion courtlistener/get_data.sh
@@ -11,7 +11,7 @@ download_dir="./data/courtlistener/raw"
mkdir -p "$download_dir"

# Only download the data from most recent CL dump
# The newest dump contains the previous dumps data
# The newest dump contains the previous dumps data
# Differences from the previous data should not be included
dates=(
"2024-05-06"
4 changes: 2 additions & 2 deletions licensed_pile/write.py
@@ -7,7 +7,7 @@
import os
from contextlib import ExitStack
from queue import Queue
from typing import Dict, Sequence
from typing import Dict, Iterator

import smart_open
import tqdm
@@ -23,7 +23,7 @@ def shard_name(filename: str, shard: str, padding: int = 5):

# TODO: Add overwrite protection
def to_dolma(
examples: Sequence[Dict],
examples: Iterator[Dict],
path: str,
filename: str,
shard_size: int = 1,
47 changes: 47 additions & 0 deletions uspto/README.md
@@ -0,0 +1,47 @@
# USPTO

USPTO dataset extracted from the [Google Patents Public Datasets](https://cloud.google.com/blog/topics/public-datasets/google-patents-public-datasets-connecting-public-paid-and-private-patent-data) and uploaded to Hugging Face.

## Data Download and Processing

To clone the unprocessed dataset from Hugging Face, run `bash setup.sh`. The default download location is `uspto/data`.

`pandoc` is required to run the script. The commands to install it are provided in `setup.sh` (commented out). Alternatively, you can install it with `sudo apt-get install pandoc`, but that installs an older version.
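
As a quick sanity check that pandoc is visible to the Python tooling, here is a minimal sketch using `pypandoc` (already listed in `requirements.txt`):

```python
# Minimal sketch: check that pypandoc can find a pandoc binary.
import pypandoc

try:
    print("pandoc version:", pypandoc.get_pandoc_version())
except OSError:
    print("pandoc not found; install it (see the commented commands in setup.sh)")
```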


The main script can be run with `bash process_uspto.sh --output-path <output_dir> --max-concurrency <int> --limit <max_rows>`.

Note: the script takes a long time to run. The `--max-concurrency` flag can be used to speed up processing, and the `--limit` flag caps the number of rows processed (useful for testing).
Processing a single file takes ~30 minutes with 256 threads; the bulk of the processing is done by pandoc.

To save the processed data to Parquet instead of the Dolma-style `uspto.jsonl.gz` output, add the `--to-parquet` flag.
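
For a quick smoke test without the shell wrapper, the same pipeline can be driven from Python. This is a minimal sketch of what `process_uspto.sh` ultimately runs; it assumes the script has been copied/renamed to an importable module name (not part of this commit), and the paths are illustrative:

```python
# Minimal sketch mirroring the __main__ block of uspto-to-dolma.py.
# Assumes the script has been copied/renamed to uspto_to_dolma.py so it can be imported
# (the hyphenated filename in this repo is not directly importable).
from uspto_to_dolma import process_datasets

from licensed_pile.write import to_dolma

examples = process_datasets(
    data_dir="uspto/data",   # where setup.sh clones the HF dataset
    limit=10,                # small row cap for a quick test
    max_concurrency=2,       # multiprocessing pool size for the pandoc conversions
)
to_dolma(examples, "uspto/outputs", "uspto.jsonl.gz")
```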

<details>
<summary>Under the hood of process_uspto.sh</summary>

### `process_uspto.sh` has three main steps:

#### Usage
1. Ensure you are in the correct directory structure:
   - The script expects to be run from the parent directory of the `uspto` directory.

#### Running the script
- Make sure the script has execute permissions. If not, run:
```sh
chmod +x process_uspto.sh
```

#### Processing steps
1. The bulk of the processing in the Python script is the pandoc conversions. A progress bar is displayed for each column/file; a minimal sketch of the conversion follows below.
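
For illustration, here is a minimal sketch of the per-cell conversion performed in `utils.parse_html` (pandoc via `pypandoc`, then regex clean-up); the sample HTML string is made up:

```python
# Minimal sketch of the conversion in utils.parse_html (the sample HTML is made up).
import re

import pypandoc

claims_html = (
    "<p>1. A method comprising: performing step A;\nand performing step B.</p>"
    "<p>2. The method of claim 1, wherein step A precedes step B.</p>"
)

# pandoc: HTML -> plain text
text = pypandoc.convert_text(claims_html, "plain", "html", extra_args=["--quiet"])
# drop single newlines (line-length wrapping), keep paragraph breaks
text = re.sub(r"(?<!\n)\n(?!\n)", " ", text)
# for claims, start each numbered item (e.g. " 2. ") on a new line
text = re.sub(r"(\s\d+\.\s)", r"\n\1", text)
print(text)
```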

</details>


## Data Stats


## Example
Some output examples are in the `examples` directory.
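
A minimal sketch for peeking at a few records, assuming the examples follow the schema produced by `uspto-to-dolma.py` (`id`, `text`, `added`, `created`, `source`, `metadata`):

```python
# Minimal sketch: print the first few records from the examples file.
import json

with open("uspto/examples/uspto_examples.jsonl") as f:
    for _ in range(3):
        record = json.loads(f.readline())
        print(record["id"], record["text"][:200].replace("\n", " "))
```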

## License
Creative Commons - Attribution - https://creativecommons.org/licenses/by/4.0/
110 changes: 110 additions & 0 deletions uspto/examples/uspto_examples.jsonl

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions uspto/process_uspto.sh
@@ -0,0 +1,8 @@
#!/bin/bash

# Run the Python script
echo "Running the uspto-to-dolma.py script..."
python uspto-to-dolma.py "$@"
echo "uspto-to-dolma.py script completed."
3 changes: 3 additions & 0 deletions uspto/requirements.txt
@@ -0,0 +1,3 @@
polars
pypandoc
rich
9 changes: 9 additions & 0 deletions uspto/setup.sh
@@ -0,0 +1,9 @@
#!/bin/bash

# Download HF Dataset repo
git clone git@hf.co:datasets/baber/USPTO uspto/data
echo "HF Dataset repo cloned to uspto/data"

# install pandoc
#wget https://github.com/jgm/pandoc/releases/download/3.2/pandoc-3.2-1-amd64.deb
#sudo dpkg -i pandoc-3.2-1-amd64.deb
206 changes: 206 additions & 0 deletions uspto/uspto-to-dolma.py
@@ -0,0 +1,206 @@
import argparse
import os
import sys
from functools import partial
from pathlib import Path
from typing import Iterator

import polars as pl
from polars import col
from tqdm import tqdm
from utils import parallel_apply

from licensed_pile.licenses import PermissiveLicenses
from licensed_pile.logs import configure_logging
from licensed_pile.write import to_dolma

logger = configure_logging("uspto")

if sys.version_info < (3, 9):
raise RuntimeError("Python version >= 3.9 required")


def process_datasets(
data_dir: str = r"data/uspto/",
limit: int = 0,
max_concurrency: int = 4,
) -> Iterator[dict]:
"""
This function `run_dataset` scans a dataset located in a directory, converts each file in the dataset to a desired
format using pandoc,and returns an iterable of dictionaries containing the converted data.
Parameters:
- `data_dir` (str): The directory where the dataset is located. Default value is "./data/uspto/".
- `limit` (int): The maximum number of rows to convert. Default value is 0, which means convert all rows from all files in the dataset.
- `max_concurrency` (int): The maximum number of concurrent conversions to perform. Default value is 2.
Returns:
- `Iterable[dict]`: An iterable of dictionaries containing the converted data from each file in the dataset.
Note:
- The `data_dir` parameter should be a valid directory path ending with a forward slash '/'.
- The `limit` parameter determines how many row to read. Set it to 0 to convert all files.
- The `max_concurrency` parameter determines how many parquet files to process concurrently.
Example usage:
```python
for data in run_dataset(data_dir=r"./data/uspto/", limit=10, max_concurrency=2):
# Process each converted data entry
print(data)
```
"""
data_path = Path(data_dir)
logger.info(f"Processing files in {data_path}")
file_names = list(data_path.glob("*.parquet"))
for file_name in file_names:
yield from scan_dataset(file_name, limit, max_concurrency).iter_rows(named=True)


def to_parquet(
output_dir: str, data_dir: str, limit: int, max_concurrency: int
) -> None:
output_dir = Path(output_dir)
datapath = Path(data_dir)
logger.info(
f'Processing {len(list(datapath.glob("*.parquet")))} files in {datapath}'
)
for i, files in enumerate(tqdm(datapath.glob("*.parquet"))):
file_path = output_dir.joinpath(f"uspto{i}.parquet")
scan_dataset(files, limit, max_concurrency).write_parquet(file_path)


def scan_dataset(file_name: Path, limit: int, max_concurrency: int) -> pl.DataFrame:
"""
Scans an individual parquet file and returns a processed DataFrame.
Returns:
DataFrame: A processed DataFrame containing the selected columns from the dataset.
Example Usage:
file_name = "dataset.parquet"
limit = 100
max_concurrency = 4
result = scan_dataset(file_name, limit, max_concurrency)
"""
parallel_apply_desc = partial(parallel_apply, False, max_concurrency)
parallel_apply_claims = partial(parallel_apply, True, max_concurrency)
columns = (
"title_text",
"title_language",
"abstract_text",
"description_html",
"claims_html",
"publication_date",
"application_number",
"filing_date",
)

df: pl.LazyFrame = (
pl.scan_parquet(file_name)
.select(columns)
.filter(
~pl.all_horizontal(
pl.col(["abstract_text", "description_html", "claims_html"]).is_null()
)
)
# we use app no. for the id and filing date for the date created
.rename({"application_number": "id", "filing_date": "created"})
.with_columns(
# the data was scraped at approximately this date
pl.lit("2024-03-22", dtype=pl.String).alias("added"),
col("created").cast(pl.String, strict=False),
col("publication_date").cast(pl.String, strict=False),
pl.concat_str(
pl.lit(r"ABSTRACT", dtype=pl.String),
pl.lit("\n\n", dtype=pl.String),
col("abstract_text"),
ignore_nulls=False,
).alias("abstract_text"),
)
.with_columns_seq(
col("description_html").map_batches(
parallel_apply_desc,
return_dtype=pl.String,
),
col("claims_html").map_batches(
parallel_apply_claims,
return_dtype=pl.String,
),
)
.with_columns(
pl.concat_str(
col("title_text"),
pl.lit("\n\n", dtype=pl.String),
col("abstract_text"),
pl.lit("\n\n", dtype=pl.String),
col("description_html"),
pl.lit("\n\n", dtype=pl.String),
col("claims_html"),
ignore_nulls=True,
).alias("text"),
pl.struct(
pl.lit(str(PermissiveLicenses.CC_BY), dtype=pl.String).alias("license"),
col("title_language").alias("language"),
col("publication_date").alias("publication_date"),
).alias("metadata"),
pl.lit("Google Patents Public Data").alias("source"),
)
).select(["id", "text", "added", "created", "source", "metadata"])
if limit > 0:
df = df.fetch(limit).lazy()
return df.collect()


def create_args_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser()
parser.add_argument(
"--output-path", type=str, help="Output directory", default=r"uspto/outputs"
)
parser.add_argument(
"--data-path",
type=str,
default=r"uspto/data",
help="Dataset directory where all parquet files to process are located ",
)

parser.add_argument(
"--limit",
type=int,
default=0,
help="Limit the number of rows to read for testing",
)
parser.add_argument(
"--max-concurrency",
type=int,
default=int(os.cpu_count()) - 1,
help="Maximum number of multiprocessing for pandoc conversions",
)
parser.add_argument(
"--to-parquet",
action="store_true",
help="Output to parquet file",
)

return parser


if __name__ == "__main__":
args = create_args_parser().parse_args()
logger.info(
f"""Processing USPTO with the following parameters: Output Dir: {args.output_path}, Data Dir: {args.data_path},
Limit: {args.limit}, Max Concurrency: {args.max_concurrency}"""
)
if args.to_parquet:
to_parquet(args.output_path, args.data_path, args.limit, args.max_concurrency)
else:
to_dolma(
process_datasets(
data_dir=args.data_path,
limit=args.limit,
max_concurrency=args.max_concurrency,
),
args.output_path,
"uspto.jsonl.gz",
)
42 changes: 42 additions & 0 deletions uspto/utils.py
@@ -0,0 +1,42 @@
import multiprocessing
import re
from functools import partial
from itertools import islice

import polars as pl
import pypandoc
from rich.progress import track


def batched(iterable, n):
it = iter(iterable)
while batch := tuple(islice(it, n)):
yield batch


def parse_html(claims: bool, html_string: str) -> str:
if not html_string:
return ""
text = pypandoc.convert_text(html_string, "plain", "html", extra_args=["--quiet"])
# remove single newlines that are not surrounded by other newlines as those are likely line length formatting.
new_line_pattern = r"(?<!\n)\n(?!\n)"
# also insert a line break before each <number><period> for claims (as they are all numbered).
list_pattern = r"(\s\d+\.\s)"
text = re.sub(new_line_pattern, " ", text)
if claims:
text = re.sub(list_pattern, r"\n\1", text)
return text


# from: https://stackoverflow.com/a/74749075/19355181
def parallel_apply(claims: bool, max_concurrency: int, column: pl.Series) -> pl.Series:
fn = partial(parse_html, claims)
if max_concurrency == 0:
max_concurrency = None
# polars mainly handles the concurrency, but the blocking pandoc calls become the bottleneck.
# This is a workaround to increase the concurrency of the pandoc calls.
with multiprocessing.get_context("spawn").Pool(max_concurrency) as pool:
return pl.Series(pool.imap(fn, track(column, description="Processing column")))
