Commit: ups

blester125 committed Jun 24, 2024
1 parent f364ea8 commit 450e6b1

Showing 9 changed files with 488 additions and 22 deletions.
14 changes: 14 additions & 0 deletions licensed_pile/utils.py
@@ -1,5 +1,7 @@
"""Shared utilities like string processing."""

import os


# We don't use snake case as the string methods added in PEP 616 are named like this.
def removeprefix(s: str, prefix: str) -> str:
@@ -16,3 +18,15 @@ def removesuffix(s: str, suffix: str) -> str:
    if suffix and s.endswith(suffix):
        return s[: -len(suffix)]
    return s[:]


def dolma_input(input_path: str, filepattern: str) -> str:
    if os.path.exists(input_path) and os.path.isfile(input_path):
        return input_path
    return os.path.join(input_path, "documents", filepattern)


def dolma_output(output_path: str):
    if os.path.basename(output_path) != "documents":
        return os.path.join(output_path, "documents")
    return output_path
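
A quick illustration of how these two helpers resolve paths (the paths below are hypothetical):

# Illustrative usage of dolma_input/dolma_output; the paths are made up.
from licensed_pile.utils import dolma_input, dolma_output

# A directory (or non-existent path) gets the glob rooted under its "documents" dir:
dolma_input("data/wiki/dump/raw", "*.jsonl.gz")
# -> "data/wiki/dump/raw/documents/*.jsonl.gz"
# An existing single file is passed through unchanged.

dolma_output("data/wiki/dump/v0")
# -> "data/wiki/dump/v0/documents"
dolma_output("data/wiki/dump/v0/documents")
# -> "data/wiki/dump/v0/documents" (already ends in "documents")
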
1 change: 1 addition & 0 deletions wiki/dolma_utils.py
@@ -0,0 +1 @@
"""Utilities for converting wiki's into the dolma format."""
48 changes: 48 additions & 0 deletions wiki/dump/download.py
@@ -0,0 +1,48 @@
"""Download and extract official wiki dumps."""

import argparse
import os
import re
import urllib.parse

import pyunpack

from licensed_pile import logs, scrape

parser = argparse.ArgumentParser(
    description="Download and Extract official Wiki dumps."
)
parser.add_argument("--url", help="The url to download a dump from.")
parser.add_argument("--wikimedia", help="")
parser.add_argument(
"--output_dir", default="data/dumps", help="Where to save the downloaded dumps."
)


def wikimedia_url(wikimedia):
    wikimedia = re.sub(r"^en", "", wikimedia)
    return f"https://dumps.wikimedia.org/en{wikimedia}/latest/en{wikimedia}-latest-pages-articles-multistream.xml.bz2"
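

# For example, wikimedia_url("wikibooks") returns
# "https://dumps.wikimedia.org/enwikibooks/latest/enwikibooks-latest-pages-articles-multistream.xml.bz2"
# (a leading "en" on the input is stripped first, so "enwikibooks" works too).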


def download_and_extract(url, ident, output_dir):
    filename = os.path.basename(urllib.parse.urlparse(url).path)


def main(args):
    if args.url and args.wikimedia:
        raise ValueError(
            f"--url={args.url} and --wikimedia={args.wikimedia} cannot be set at the same time."
        )
    if not (args.url or args.wikimedia):
        raise ValueError("--url or --wikimedia must be set.")
    if not args.url:
        args.url = wikimedia_url(args.wikimedia)

    ident = ...
    download_and_extract(args.url, ident, args.output_dir)


if __name__ == "__main__":
    args = parser.parse_args()
    logs.configure_logging("wiki/dump")
    main(args)
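
download_and_extract is left as a stub here (only the archive filename is computed, and `ident` is still a placeholder in main). A minimal sketch of what it might eventually do, assuming a plain urllib download plus pyunpack for extraction, is:

# Hypothetical sketch only; not the implementation from this commit.
import os
import urllib.parse
import urllib.request

import pyunpack


def download_and_extract(url, ident, output_dir):
    """Download the archive at `url` and unpack it under `output_dir`/`ident`."""
    filename = os.path.basename(urllib.parse.urlparse(url).path)
    os.makedirs(output_dir, exist_ok=True)
    archive_path = os.path.join(output_dir, filename)
    # Skip the download if the archive is already on disk.
    if not os.path.exists(archive_path):
        urllib.request.urlretrieve(url, archive_path)
    extract_dir = os.path.join(output_dir, ident)
    os.makedirs(extract_dir, exist_ok=True)
    # pyunpack/patool pick an extraction backend based on the file extension.
    pyunpack.Archive(archive_path).extractall(extract_dir)
    return extract_dir
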
4 changes: 2 additions & 2 deletions wiki/dump/download.sh
@@ -21,8 +21,8 @@ declare -a wikis=(
)

for wiki in ${wikis[@]}; do
filename="en${wiki}-${DATE}-pages-articles-multistream.xml.bz2"
url="https://dumps.wikimedia.org/en${wiki}/latest/${filename}"
filename="en${wiki}-${DATE}-pages-meta-current.xml.bz2"
url="https://dumps.wikimedia.org/en${wiki}/${DATE}/${filename}"
# Use wget to avoid re-downloading and continue downloads.
wget -nc -c ${url} -O "${data_dir}/dumps/${filename}"
# bzip2 doesn't decompress if the output is already there, so we don't check
34 changes: 34 additions & 0 deletions wiki/dump/to_dolma.sh
@@ -0,0 +1,34 @@
#!/usr/bin/env bash

DATE=${1}
export_dir=${2:-"data/dumps"}
export_dir=${export_dir%/}
output_dir=${3:-"data/wiki/dump/raw"}
output_dir=${output_dir%/}

if [ -z "${DATE}" ]; then
    echo "usage: to_dolma.sh [date] dump/ data/wiki/raw/documents" >&2
    exit 1
fi

declare -a wikis=(
    # wiki
    wikibooks
    wikinews
    wikiquote
    wikisource
    wikiversity
    wikivoyage
    wiktionary
)

for wiki in ${wikis[@]}; do
filename="en${wiki}-${DATE}-pages-meta-current.xml"
# Check for output
if [[ ${wiki} == "wiki" ]]; then
url="https://wikipedia.com"
else
url="https://${wiki}.com"
fi
python ../scrape/to_dolma.py --license CC-BY-SA/4.0 --wiki "${url}" --export "${export_dir}/${filename}" --output_dir "${output_dir}" --last_author --source "wiki/dump"
done
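
For reference, to_dolma.py writes gzipped JSONL shards under ${output_dir}/documents. Judging from the fields preprocess.py reads later in this diff, each record looks roughly like the dict below; the values and the exact metadata keys are illustrative, not taken from the real script:

# Illustrative record shape only; wiki/scrape/to_dolma.py is not shown in this diff.
example = {
    "id": "12345",
    "text": "== Section ==\nSome wikitext ...",
    "source": "wiki/dump",
    "metadata": {"title": "Example Page", "license": "CC-BY-SA/4.0"},
}
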
4 changes: 3 additions & 1 deletion wiki/parser/package.json
@@ -1,6 +1,8 @@
{
"dependencies": {
"commander": "^12.1.0",
"wtf_wikipedia": "^10.3.1"
"wtf_wikipedia": "^10.3.1",
"wtf-plugin-api": "^2.0.0",
"wtf-plugin-latex": "^1.0.0"
}
}
34 changes: 28 additions & 6 deletions wiki/parser/parser.js
@@ -1,25 +1,45 @@
// Simple wikitext parsing server, node parser.js --port [port]
//
// Can create multiple versions that listen on multiple ports behind a load
// balancer for multiprocessing.

// Simple webserver
const http = require("http");
// Wikitext parsing
const wtf = require("wtf_wikipedia");
// Explored the wikitext -> latex conversion; it doesn't help much, as things
// like unicode pi are still emitted as unicode.
// wtf.extend(require("wtf-plugin-latex"))
// cli parsing
const { program } = require("commander");


const requestListener = (req, res) => {
  // console.log("Incoming Request!");
  // Server endpoint:
  // Input: {"wikitext": str, "id": str, "source": str}
  // id and source are just used for debugging.
  // Output: {"document": List[{"title": str, "text": str}]}

  // Read in request json
  var data = "";
  req.on("data", function (chunk) {
    data += chunk;
  });
  req.on("end", function () {
    data = JSON.parse(data);
    console.log(`Parsing wikitext from document ${data['id']} of ${data['source']}`);

    // Set response headers
    console.log(`Parsing wikitext from document ${data['id']} of ${data['source']}`);
    res.setHeader("Content-Type", "application/json");

    // parse wikitext with wtf_wikipedia
    var doc = wtf(data["wikitext"]);
    // convert the format into json, a list of sections (which have title + text)
    const response = {
      text: wtf(data["wikitext"]).text(),
    }
      document: doc.sections().map(s => ({title: s.title(), text: s.text()})),
    };

    // Return response
    const json = JSON.stringify(response);
    res.writeHead(200);
    res.end(json);
@@ -28,14 +48,16 @@ const requestListener = (req, res) => {
};


const server = http.createServer(requestListener);

// Parse CLI arguments
program
.option("--port <int>", "port", 3000)
.option("--host", "host", "localhost")
.parse();
const args = program.opts(process.argv);

// Setup Server
const server = http.createServer(requestListener);
// Start Server
server.listen(args.port, args.host, function(error) {
  if (!error)
    console.log(`Server is Listening at http://${args.host}:${args.port}`);
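
The endpoint described at the top of requestListener accepts {"wikitext", "id", "source"} and answers with {"document": [{"title", "text"}, ...]}, so a minimal Python client looks roughly like the sketch below; the port and sample values are illustrative, and the removed parse_wikitext helper in preprocess.py did essentially this but read the old "text" field:

# Minimal client sketch for the wikitext parsing server; values are illustrative.
import requests


def parse_wikitext(wikitext, doc_id, source, port=3000):
    resp = requests.post(
        f"http://localhost:{port}",
        json={"wikitext": wikitext, "id": doc_id, "source": source},
    )
    # The server responds with {"document": [{"title": ..., "text": ...}, ...]}.
    return resp.json()["document"]


for section in parse_wikitext("== Pets ==\nCats and dogs.", "example-1", "wiki/dump"):
    print(section["title"], "->", section["text"])
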
43 changes: 30 additions & 13 deletions wiki/preprocess.py
@@ -10,8 +10,9 @@
import requests
import tqdm

from licensed_pile import logs
from licensed_pile import logs, utils
from licensed_pile.write import ShardParallelProcessor
from wiki import adjust_indentation, format_document, parse_wikitext, replace_math_tags

parser = argparse.ArgumentParser(description="Preprocess raw wiki pages in dolma format.")
parser.add_argument(
@@ -24,6 +25,11 @@
default="dump/data/wiki/dump/v0",
help="The output version, this directory should be where the `documents` dir will live.",
)
parser.add_argument(
"--filename",
default="*.jsonl.gz",
help="The filename to match with globs, probably needs to be escaped.",
)
# TODO: Respect this flag
parser.add_argument(
"--overwrite",
@@ -45,28 +51,39 @@
logs.configure_logging("dolma.WTFWikipediaParallel")


def parse_wikitext(text, doc_id, source):
    return requests.post(
        "http://localhost:3000", json={"wikitext": text, "id": doc_id, "source": source}
    ).json()["text"]


class WTFWikipediaParallel(ShardParallelProcessor):
    @classmethod
    def process_example(cls, example, **kwargs):
        example["text"] = parse_wikitext(
            example["text"], example["id"], example["source"]
        logger = cls.get_logger()
        logger.warning(f"Processing example: {example['id']}")
        wikitext = example["text"]
        if wikitext is None:
            wikitext = ""
        # Convert <math>
        wikitext = replace_math_tags(wikitext)
        # Adjust indentation to avoid reorderings.
        wikitext = adjust_indentation(wikitext)
        # Extract Templates
        ...
        # Parse Wiki Text
        document = parse_wikitext(wikitext, example["id"], example["source"])
        # Format plaintext into document
        document = format_document(
            document, example.get("metadata", {}).get("title", "")
        )
        # Process Templates
        ...
        # Reinsert Templates
        ...
        example["text"] = document
        return example


def main(args):
    with TemporaryDirectory() as tempdir:
        processor = WTFWikipediaParallel(
            source_prefix=os.path.join(
                args.input, "documents", "*_wiktionary.com.jsonl.gz"
            ),
            destination_prefix=os.path.join(args.output, "documents"),
            source_prefix=utils.dolma_input(args.input, args.filename),
            destination_prefix=utils.dolma_output(args.output),
            metadata_prefix=tempdir,
            num_processes=args.processes,
        )
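
format_document, parse_wikitext, replace_math_tags, and adjust_indentation are imported from the wiki package and are not part of this diff. Going by the parser server's output shape, format_document plausibly flattens the section list back into plaintext, along these lines (purely a sketch, not the actual implementation):

# Hypothetical sketch of format_document; the real one lives in the wiki package.
def format_document(sections, title=""):
    # `sections` is assumed to be the parser output: [{"title": str, "text": str}, ...].
    parts = [title] if title else []
    for section in sections:
        if section.get("title"):
            parts.append(section["title"])
        if section.get("text"):
            parts.append(section["text"])
    # Separate sections with blank lines to form the final document text.
    return "\n\n".join(parts)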