Commit: ups

blester125 committed Jun 24, 2024
1 parent f364ea8 commit 450e6b1

Showing 9 changed files with 488 additions and 22 deletions.
14 changes: 14 additions & 0 deletions licensed_pile/utils.py
@@ -1,5 +1,7 @@
"""Shared utilities like string processing."""

import os


# We don't use snake case as the string methods added in PEP 616 are named like this.
def removeprefix(s: str, prefix: str) -> str:
@@ -16,3 +18,15 @@ def removesuffix(s: str, suffix: str) -> str:
    if suffix and s.endswith(suffix):
        return s[: -len(suffix)]
    return s[:]


def dolma_input(input_path: str, filepattern: str) -> str:
    if os.path.exists(input_path) and os.path.isfile(input_path):
        return input_path
    return os.path.join(input_path, "documents", filepattern)


def dolma_output(output_path: str):
    if os.path.basename(output_path) != "documents":
        return os.path.join(output_path, "documents")
    return output_path
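
A quick illustration of how these two helpers resolve paths (the paths below are hypothetical):

# Illustrative usage of dolma_input/dolma_output; the paths are made up.
from licensed_pile.utils import dolma_input, dolma_output

# A directory (or non-existent path) gets the glob rooted under its "documents" dir:
dolma_input("data/wiki/dump/raw", "*.jsonl.gz")
# -> "data/wiki/dump/raw/documents/*.jsonl.gz"
# An existing single file is passed through unchanged.

dolma_output("data/wiki/dump/v0")
# -> "data/wiki/dump/v0/documents"
dolma_output("data/wiki/dump/v0/documents")
# -> "data/wiki/dump/v0/documents" (already ends in "documents")
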
1 change: 1 addition & 0 deletions wiki/dolma_utils.py
@@ -0,0 +1 @@
"""Utilities for converting wiki's into the dolma format."""
48 changes: 48 additions & 0 deletions wiki/dump/download.py
@@ -0,0 +1,48 @@
"""Download and extract official wiki dumps."""

import argparse
import os
import re
import urllib.parse

import pyunpack

from licensed_pile import logs, scrape

parser = argparse.ArgumentParser(
    description="Download and Extract official Wiki dumps."
)
parser.add_argument("--url", help="The url to download a dump from.")
parser.add_argument("--wikimedia", help="")
parser.add_argument(
"--output_dir", default="data/dumps", help="Where to save the downloaded dumps."
)


def wikimedia_url(wikimedia):
    wikimedia = re.sub(r"^en", "", wikimedia)
    return f"https://dumps.wikimedia.org/en{wikimedia}/latest/en{wikimedia}-latest-pages-articles-multistream.xml.bz2"
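

# For example, wikimedia_url("wikibooks") returns
# "https://dumps.wikimedia.org/enwikibooks/latest/enwikibooks-latest-pages-articles-multistream.xml.bz2"
# (a leading "en" on the input is stripped first, so "enwikibooks" works too).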


def download_and_extract(url, ident, output_dir):
    filename = os.path.basename(urllib.parse.urlparse(url).path)


def main(args):
    if args.url and args.wikimedia:
        raise ValueError(
            f"--url={args.url} and --wikimedia={args.wikimedia} cannot be set at the same time."
        )
    if not (args.url or args.wikimedia):
        raise ValueError("--url or --wikimedia must be set.")
    if not args.url:
        args.url = wikimedia_url(args.wikimedia)

    ident = ...
    download_and_extract(args.url, ident, args.output_dir)


if __name__ == "__main__":
    args = parser.parse_args()
    logs.configure_logging("wiki/dump")
    main(args)
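
download_and_extract is left as a stub here (only the archive filename is computed, and `ident` is still a placeholder in main). A minimal sketch of what it might eventually do, assuming a plain urllib download plus pyunpack for extraction, is:

# Hypothetical sketch only; not the implementation from this commit.
import os
import urllib.parse
import urllib.request

import pyunpack


def download_and_extract(url, ident, output_dir):
    """Download the archive at `url` and unpack it under `output_dir`/`ident`."""
    filename = os.path.basename(urllib.parse.urlparse(url).path)
    os.makedirs(output_dir, exist_ok=True)
    archive_path = os.path.join(output_dir, filename)
    # Skip the download if the archive is already on disk.
    if not os.path.exists(archive_path):
        urllib.request.urlretrieve(url, archive_path)
    extract_dir = os.path.join(output_dir, ident)
    os.makedirs(extract_dir, exist_ok=True)
    # pyunpack/patool pick an extraction backend based on the file extension.
    pyunpack.Archive(archive_path).extractall(extract_dir)
    return extract_dir
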
4 changes: 2 additions & 2 deletions wiki/dump/download.sh
@@ -21,8 +21,8 @@ declare -a wikis=(
)

for wiki in ${wikis[@]}; do
filename="en${wiki}-${DATE}-pages-articles-multistream.xml.bz2"
url="https://dumps.wikimedia.org/en${wiki}/latest/${filename}"
filename="en${wiki}-${DATE}-pages-meta-current.xml.bz2"
url="https://dumps.wikimedia.org/en${wiki}/${DATE}/${filename}"
# Use wget to avoid re-downloading and continue downloads.
wget -nc -c ${url} -O "${data_dir}/dumps/${filename}"
# bzip2 doesn't decompress if the output is already there, so we don't check
34 changes: 34 additions & 0 deletions wiki/dump/to_dolma.sh
@@ -0,0 +1,34 @@
#!/usr/bin/env bash

DATE=${1}
export_dir=${2:-"data/dumps"}
export_dir=${export_dir%/}
output_dir=${3:-"data/wiki/dump/raw"}
output_dir=${output_dir%/}

if [ -z "${DATE}" ]; then
    echo "usage: to_dolma.sh [date] dump/ data/wiki/raw/documents" >&2
    exit 1
fi

declare -a wikis=(
    # wiki
    wikibooks
    wikinews
    wikiquote
    wikisource
    wikiversity
    wikivoyage
    wiktionary
)

for wiki in ${wikis[@]}; do
filename="en${wiki}-${DATE}-pages-meta-current.xml"
# Check for output
if [[ ${wiki} == "wiki" ]]; then
url="https://wikipedia.com"
else
url="https://${wiki}.com"
fi
python ../scrape/to_dolma.py --license CC-BY-SA/4.0 --wiki "${url}" --export "${export_dir}/${filename}" --output_dir "${output_dir}" --last_author --source "wiki/dump"
done
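
For reference, to_dolma.py writes gzipped JSONL shards under ${output_dir}/documents. Judging from the fields preprocess.py reads later in this diff, each record looks roughly like the dict below; the values and the exact metadata keys are illustrative, not taken from the real script:

# Illustrative record shape only; wiki/scrape/to_dolma.py is not shown in this diff.
example = {
    "id": "12345",
    "text": "== Section ==\nSome wikitext ...",
    "source": "wiki/dump",
    "metadata": {"title": "Example Page", "license": "CC-BY-SA/4.0"},
}
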
4 changes: 3 additions & 1 deletion wiki/parser/package.json
@@ -1,6 +1,8 @@
{
"dependencies": {
"commander": "^12.1.0",
"wtf_wikipedia": "^10.3.1"
"wtf_wikipedia": "^10.3.1",
"wtf-plugin-api": "^2.0.0",
"wtf-plugin-latex": "^1.0.0"
}
}
34 changes: 28 additions & 6 deletions wiki/parser/parser.js
@@ -1,25 +1,45 @@
// Simple wikitext parsing server, node parser.js --port [port]
//
// Can create multiple versions that listen on multiple ports behind a load
// balancer for multiprocessing.

// Simple webserver
const http = require("http");
// Wikitext parsing
const wtf = require("wtf_wikipedia");
// Explored the wikitext -> latex conversion; it doesn't help much, as things
// like unicode pi are still emitted as unicode.
// wtf.extend(require("wtf-plugin-latex"))
// cli parsing
const { program } = require("commander");


const requestListener = (req, res) => {
  // console.log("Incoming Request!");
  // Server endpoint:
  // Input: {"wikitext": str, "id": str, "source": str}
  // id and source are just used for debugging.
  // Output: {"document": List[{"title": str, "text": str}]}

  // Read in request json
  var data = "";
  req.on("data", function (chunk) {
    data += chunk;
  });
  req.on("end", function () {
    data = JSON.parse(data);
    console.log(`Parsing wikitext from document ${data['id']} of ${data['source']}`);

    // Set response headers
    console.log(`Parsing wikitext from document ${data['id']} of ${data['source']}`);
    res.setHeader("Content-Type", "application/json");

    // parse wikitext with wtf_wikipedia
    var doc = wtf(data["wikitext"]);
    // convert the format into json, a list of sections (which have title + text)
    const response = {
      text: wtf(data["wikitext"]).text(),
    }
      document: doc.sections().map(s => ({title: s.title(), text: s.text()})),
    };

    // Return response
    const json = JSON.stringify(response);
    res.writeHead(200);
    res.end(json);
@@ -28,14 +48,16 @@ const requestListener = (req, res) => {
};


const server = http.createServer(requestListener);

// Parse CLI arguments
program
.option("--port <int>", "port", 3000)
.option("--host", "host", "localhost")
.parse();
const args = program.opts(process.argv);

// Setup Server
const server = http.createServer(requestListener);
// Start Server
server.listen(args.port, args.host, function(error) {
  if (!error)
    console.log(`Server is Listening at http://${args.host}:${args.port}`);
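
The endpoint described at the top of requestListener accepts {"wikitext", "id", "source"} and answers with {"document": [{"title", "text"}, ...]}, so a minimal Python client looks roughly like the sketch below; the port and sample values are illustrative, and the removed parse_wikitext helper in preprocess.py did essentially this but read the old "text" field:

# Minimal client sketch for the wikitext parsing server; values are illustrative.
import requests


def parse_wikitext(wikitext, doc_id, source, port=3000):
    resp = requests.post(
        f"http://localhost:{port}",
        json={"wikitext": wikitext, "id": doc_id, "source": source},
    )
    # The server responds with {"document": [{"title": ..., "text": ...}, ...]}.
    return resp.json()["document"]


for section in parse_wikitext("== Pets ==\nCats and dogs.", "example-1", "wiki/dump"):
    print(section["title"], "->", section["text"])
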
43 changes: 30 additions & 13 deletions wiki/preprocess.py
@@ -10,8 +10,9 @@
import requests
import tqdm

from licensed_pile import logs
from licensed_pile import logs, utils
from licensed_pile.write import ShardParallelProcessor
from wiki import adjust_indentation, format_document, parse_wikitext, replace_math_tags

parser = argparse.ArgumentParser(description="Preprocess raw wiki pages in dolma format.")
parser.add_argument(
@@ -24,6 +25,11 @@
default="dump/data/wiki/dump/v0",
help="The output version, this directory should be where the `documents` dir will live.",
)
parser.add_argument(
"--filename",
default="*.jsonl.gz",
help="The filename to match with globs, probably needs to be escaped.",
)
# TODO: Respect this flag
parser.add_argument(
"--overwrite",
@@ -45,28 +51,39 @@
logs.configure_logging("dolma.WTFWikipediaParallel")


def parse_wikitext(text, doc_id, source):
    return requests.post(
        "http://localhost:3000", json={"wikitext": text, "id": doc_id, "source": source}
    ).json()["text"]


class WTFWikipediaParallel(ShardParallelProcessor):
    @classmethod
    def process_example(cls, example, **kwargs):
        example["text"] = parse_wikitext(
            example["text"], example["id"], example["source"]
        logger = cls.get_logger()
        logger.warning(f"Processing example: {example['id']}")
        wikitext = example["text"]
        if wikitext is None:
            wikitext = ""
        # Convert <math>
        wikitext = replace_math_tags(wikitext)
        # Adjust indentation to avoid reorderings.
        wikitext = adjust_indentation(wikitext)
        # Extract Templates
        ...
        # Parse Wiki Text
        document = parse_wikitext(wikitext, example["id"], example["source"])
        # Format plaintext into document
        document = format_document(
            document, example.get("metadata", {}).get("title", "")
        )
        # Process Templates
        ...
        # Reinsert Templates
        ...
        example["text"] = document
        return example


def main(args):
    with TemporaryDirectory() as tempdir:
        processor = WTFWikipediaParallel(
            source_prefix=os.path.join(
                args.input, "documents", "*_wiktionary.com.jsonl.gz"
            ),
            destination_prefix=os.path.join(args.output, "documents"),
            source_prefix=utils.dolma_input(args.input, args.filename),
            destination_prefix=utils.dolma_output(args.output),
            metadata_prefix=tempdir,
            num_processes=args.processes,
        )
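
format_document, parse_wikitext, replace_math_tags, and adjust_indentation are imported from the wiki package and are not part of this diff. Going by the parser server's output shape, format_document plausibly flattens the section list back into plaintext, along these lines (purely a sketch, not the actual implementation):

# Hypothetical sketch of format_document; the real one lives in the wiki package.
def format_document(sections, title=""):
    # `sections` is assumed to be the parser output: [{"title": str, "text": str}, ...].
    parts = [title] if title else []
    for section in sections:
        if section.get("title"):
            parts.append(section["title"])
        if section.get("text"):
            parts.append(section["text"])
    # Separate sections with blank lines to form the final document text.
    return "\n\n".join(parts)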