From a76f2fdfbde6fe76b3722e23b8fdcbce2dee5429 Mon Sep 17 00:00:00 2001 From: Brian Lester Date: Tue, 30 Jul 2024 02:06:55 -0400 Subject: [PATCH] update --- licensed_pile/write.py | 37 +++++++-- wiki/.gitignore | 1 + wiki/dump/README.md | 7 ++ wiki/dump/download.sh | 4 +- wiki/dump/to_dolma.sh | 4 +- wiki/parser/.gitignore | 14 ++++ wiki/parser/README.md | 34 ++++++++ wiki/parser/haproxy.cfg | 40 +++++++++ wiki/parser/parser-old.js | 71 ---------------- wiki/parser/parser.js | 59 ++++++++++++-- wiki/parser/start.sh | 34 ++++++++ wiki/parser/test.js | 3 - wiki/parser/worker.js | 14 +++- wiki/preprocess.py | 118 ++++++++++++++++++++++----- wiki/wiki.py | 165 +++++++++++++++++++++++--------------- 15 files changed, 431 insertions(+), 174 deletions(-) create mode 100644 wiki/.gitignore create mode 100644 wiki/dump/README.md create mode 100644 wiki/parser/.gitignore create mode 100644 wiki/parser/README.md create mode 100644 wiki/parser/haproxy.cfg delete mode 100644 wiki/parser/parser-old.js create mode 100755 wiki/parser/start.sh delete mode 100644 wiki/parser/test.js diff --git a/licensed_pile/write.py b/licensed_pile/write.py index 048c9e2..70dab85 100644 --- a/licensed_pile/write.py +++ b/licensed_pile/write.py @@ -56,6 +56,20 @@ def to_dolma( wf.write(data + "\n") +def smart_open_exists(path): + try: + with smart_open.open(path): + return True + except: + return False + + +def create_shadow(path): + h, t = os.path.split(path) + # Add shadow at the start to not break any filename inference from smart_open + return os.path.join(h, f"shadow.{t}") + + class ShardParallelProcessor(BaseParallelProcessor): """Handle read/writes to jsonl.gz so our processor code only needs to processing a single example.""" @@ -93,10 +107,14 @@ def process_single( **kwargs, ): logger = cls.get_logger() + overwrite = kwargs.pop("overwrite", False) logger.debug("Processing %s into %s", source_path, destination_path) - with smart_open.open(source_path) as f, smart_open.open( - destination_path, "w" - ) as wf: + if not overwrite and smart_open_exists(destination_path): + logger.info("%s already exists, skipping", destination_path) + cls.increment_progressbar(queue, shards=1) + return + shadow_path = create_shadow(destination_path) + with smart_open.open(source_path) as f, smart_open.open(shadow_path, "w") as wf: document_count = 0 update_interval = kwargs.pop("update_interval", 1) debug = kwargs.pop("debug", False) @@ -126,6 +144,7 @@ def process_single( source_path, i, ) + document_count += 1 continue if debug and og == processed["text"]: @@ -142,8 +161,10 @@ def process_single( update_interval *= 2 document_count = 0 except Exception as e: - logger.warning( - "Failed to process %s:%s %s", source_path, i, e, exc_info=True - ) - return - cls.increment_progressbar(queue, shards=1, documents=document_count) + e.add_note(f"Exception occured while processing {source_path}:{i}") + logger.warning("Failed to process %s:%s", source_path, i, exc_info=True) + raise + # return + # Move, only works on local atm + os.rename(shadow_path, destination_path) + cls.increment_progressbar(queue, shards=1, documents=document_count) diff --git a/wiki/.gitignore b/wiki/.gitignore new file mode 100644 index 0000000..60baa9c --- /dev/null +++ b/wiki/.gitignore @@ -0,0 +1 @@ +data/* diff --git a/wiki/dump/README.md b/wiki/dump/README.md new file mode 100644 index 0000000..bd0a83a --- /dev/null +++ b/wiki/dump/README.md @@ -0,0 +1,7 @@ +# MediaWiki + +## Steps: + +1. Run `download.sh YYYYMMDD` to download xml dumps +2. 
Run `to_dolma.sh YYYYMMDD` (date must match) to convert to the dolma format
+3. Run `python preprocess.py --input ... --output ...`
diff --git a/wiki/dump/download.sh b/wiki/dump/download.sh
index 38868f7..d09bfb9 100755
--- a/wiki/dump/download.sh
+++ b/wiki/dump/download.sh
@@ -5,7 +5,7 @@ data_dir=${2:-"data"}
 data_dir=${data_dir%/}
 
 if [ -z ${DATE} ]; then
-  echo "usage: download.sh [date] data/" 2> /dev/null
+  echo "usage: download.sh [date YYYYMMDD] data/" 2> /dev/null
   exit 1
 fi
 
@@ -20,6 +20,8 @@ declare -a wikis=(
   wiktionary
 )
 
+mkdir -p "${data_dir}/dumps"
+
 for wiki in ${wikis[@]}; do
   filename="en${wiki}-${DATE}-pages-meta-current.xml.bz2"
   url="https://dumps.wikimedia.org/en${wiki}/${DATE}/${filename}"
diff --git a/wiki/dump/to_dolma.sh b/wiki/dump/to_dolma.sh
index 7acb4c3..aa19982 100755
--- a/wiki/dump/to_dolma.sh
+++ b/wiki/dump/to_dolma.sh
@@ -7,12 +7,12 @@ output_dir=${3:-"data/wiki/dump/raw"}
 output_dir=${output_dir%/}
 
 if [ -z ${DATE} ]; then
-  echo "usage: to_dolma.sh [date] dump/ data/wiki/raw/documents" 2> /dev/null
+  echo "usage: to_dolma.sh [date YYYYMMDD] dump/ data/wiki/raw/documents" 2> /dev/null
   exit 1
 fi
 
 declare -a wikis=(
-  # wiki
+  wiki
   wikibooks
   wikinews
   wikiquote
diff --git a/wiki/parser/.gitignore b/wiki/parser/.gitignore
new file mode 100644
index 0000000..55145b7
--- /dev/null
+++ b/wiki/parser/.gitignore
@@ -0,0 +1,14 @@
+.DS_Store
+.idea
+*.log
+tmp/
+
+*.tern-port
+node_modules/
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+*.tsbuildinfo
+.npm
+.eslintcache
+logs/
diff --git a/wiki/parser/README.md b/wiki/parser/README.md
new file mode 100644
index 0000000..5abda9f
--- /dev/null
+++ b/wiki/parser/README.md
@@ -0,0 +1,34 @@
+# WTF WIKIPEDIA parsing server
+
+We use the dolma format and a server running `wtf_wikipedia` for wikitext parsing instead of the `dumpster-dip` tool because we want to be able to parse wikitext even when it is not in the standard XML dump format.
+
+## Starting the Server
+
+1. Install HAProxy: `sudo apt install haproxy`
+2. Install nvm and node
+3. Install dependencies: `npm install`
+4. Edit `haproxy.cfg` to include one `server ${name} 127.0.0.1:${port} check` line for each server you plan to run.
+5. Move/link `haproxy.cfg` to `/etc/haproxy/haproxy.cfg`
+6. Restart haproxy (`systemctl restart haproxy` on systemd-based systems)
+7. Run `./start.sh ${numservers}`. The number should match the number of `server` lines in `haproxy.cfg`.
+8. Go to `localhost:8404/stats` to check that each server is seen by haproxy (a scripted health check is sketched below).
+
+## Why?
+
+Each server uses a worker pool with `1` worker. This is because `wtf_wikipedia` is synchronous code, so we need to run it in a worker thread to be able to use timeouts to cancel execution for long-running documents. This also helps when parsing causes an OoM error: the error happens in the worker thread instead of the main server.
+
+We then run multiple copies of the server behind the load balancer, which allows for recovery in cases where a main server itself crashes.
+
+### v8 garbage collection
+
+v8, and therefore node, seems to have a pretty complex garbage collector, with separate heaps for persistent objects and short-lived "young" objects. Despite various efforts to set the sizes of these heaps (our code defaults to 64 and 32 GB for each worker), I have seen a lot of JavaScript OoM errors, even though the error messages report a heap much smaller than those limits. These limits are set in the options passed to the worker pool constructor.
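+A quick way to confirm that the load balancer and every parser instance are up is to hit the `/health` route that `parser.js` exposes and `haproxy.cfg` uses for its checks. The snippet below is a minimal sketch (not part of this patch): it assumes the default layout from `haproxy.cfg` and `start.sh`, i.e. the frontend on `localhost:5000` and 16 backends on ports 5001-5016, and mirrors the curl check in `start.sh`.
+
+```python
+# check_health.py -- hypothetical helper, not included in this patch.
+import requests
+
+FRONTEND = "http://localhost:5000"  # haproxy `frontend wtf`
+BACKENDS = [f"http://localhost:{5000 + i}" for i in range(1, 17)]  # parser.js instances
+
+
+def healthy(url: str) -> bool:
+    """Return True if the server answers /health with a 200."""
+    try:
+        return requests.get(f"{url}/health", timeout=5).status_code == 200
+    except requests.RequestException:
+        return False
+
+
+if __name__ == "__main__":
+    for url in (FRONTEND, *BACKENDS):
+        print(url, "ok" if healthy(url) else "DOWN")
+```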
+
+There were also cases where, with a large worker pool and a single server, the main server itself could hit OoM errors. This crashes the whole server and grinds the dolma conversion to a halt. Even with command-line arguments to set the size of the heap, this still happened, again despite the heap appearing to be well under the limit. When this happens, our load balancer stops routing traffic to that server and our start script brings a new copy online. Once it is live, it is added back to the pool.
+
+These errors tend to happen on pages that have over 2 million characters.
+
+## Settings
+
+It seems fastest to make sure that each server is always working on 1 document and has already received a second document to be processed next. As the Python code is synchronous, this means we need ~twice as many dolma processes as we have servers. Having extra Python processes means the servers never have to wait on Python string manipulation.
+
+On a Ryzen 9 7950X, using 30 dolma processes and 16 servers, the whole system processes ~5.5k documents/second and takes ~4 hours and 15 mins to process Wikipedia + talk pages and the other MediaWiki pages.
diff --git a/wiki/parser/haproxy.cfg b/wiki/parser/haproxy.cfg
new file mode 100644
index 0000000..e31822f
--- /dev/null
+++ b/wiki/parser/haproxy.cfg
@@ -0,0 +1,40 @@
+defaults
+    mode http
+    timeout client 10m
+    timeout connect 10m
+    timeout server 10m
+    timeout http-request 10m
+    balance leastconn
+
+frontend stats
+    mode http
+    bind 127.0.0.1:8404
+    stats enable
+    stats uri /stats
+    stats refresh 5s
+    stats admin if LOCALHOST
+
+frontend wtf
+    bind 127.0.0.1:5000
+    default_backend wtf_workers
+
+backend wtf_workers
+    option httpchk
+    http-check send meth GET uri /health
+    http-check expect status 200
+    server wtf1 127.0.0.1:5001 check
+    server wtf2 127.0.0.1:5002 check
+    server wtf3 127.0.0.1:5003 check
+    server wtf4 127.0.0.1:5004 check
+    server wtf5 127.0.0.1:5005 check
+    server wtf6 127.0.0.1:5006 check
+    server wtf7 127.0.0.1:5007 check
+    server wtf8 127.0.0.1:5008 check
+    server wtf9 127.0.0.1:5009 check
+    server wtf10 127.0.0.1:5010 check
+    server wtf11 127.0.0.1:5011 check
+    server wtf12 127.0.0.1:5012 check
+    server wtf13 127.0.0.1:5013 check
+    server wtf14 127.0.0.1:5014 check
+    server wtf15 127.0.0.1:5015 check
+    server wtf16 127.0.0.1:5016 check
diff --git a/wiki/parser/parser-old.js b/wiki/parser/parser-old.js
deleted file mode 100644
index d683473..0000000
--- a/wiki/parser/parser-old.js
+++ /dev/null
@@ -1,71 +0,0 @@
-// Simple wikitext parsing server, node parser.js --port [port]
-//
-// Can create multiple versions that listen on multiple ports behind a load
-// balancer for multiprocessing.
-
-// Simple webserver
-const http = require("http");
-// Wikitext parsing
-const wtf = require("wtf_wikipedia");
-// Explored the wikitext -> latex conversion, doesn't help much, things like
-// unicode pi are still unicode.
-// wtf.extend(require("wtf-plugin-latex"))
-// cli parsing
-const { program } = require("commander");
-
-const requestListener = (req, res) => {
-  // Server endpoint:
-  // Input: {"wikitext": str, "id": str, "source": str}
-  //   id and source are just used for debugging.
- // Output: {"document": List[{"title": str, "text": str}]} - - // Read in request json - var data = ""; - req.on("data", function (chunk) { - data += chunk; - }); - req.on("end", function () { - data = JSON.parse(data); - - // Set response headers - console.log(`Parsing wikitext from document ${data['id']} of ${data['source']}`); - res.setHeader("Content-Type", "application/json"); - - // parse wikitext with wtf_wikipedia - var doc = wtf(data["wikitext"]); - // convert the format into json, a list of sections (which have title + text) - const response = { - document: doc.sections().map(s => ({title: s.title(), text: s.text()})), - }; - - // Return response - const json = JSON.stringify(response); - res.writeHead(200); - res.end(json); - }); - -}; - - -// Parse CLI arguments -program - .option("--port ", "port", 3000) - .option("--host", "host", "localhost") - .parse(); -const args = program.opts(process.argv); - -// Setup Server -const server = http.createServer(requestListener); - -server.on("timeout", (socket) => { - console.log("timeout"); - socket.destroy(); -}) - -// Start Server -server.listen(args.port, args.host, function(error) { - if (!error) - console.log(`Server is Listening at http://${args.host}:${args.port}`); - else - console.log("Error binding server to 3000"); -}); diff --git a/wiki/parser/parser.js b/wiki/parser/parser.js index 0d20e25..be200ea 100644 --- a/wiki/parser/parser.js +++ b/wiki/parser/parser.js @@ -8,42 +8,87 @@ const express = require("express"); // cli parsing const { program } = require("commander"); const workerpool = require("workerpool"); -const pool = workerpool.pool("./worker.js"); +// Convert the cli argument into an actual int. +function parseIntArg(value, prev) { + const parsedValue = parseInt(value, 10); + if (isNaN(parsedValue)) { + throw new commander.InvalidArgumentError("Not an Int.") + } + return parsedValue; +} // Parse CLI arguments program - .option("--port ", "port", 3000) + .option("--port ", "port", parseIntArg, 3000) .option("--host", "host", "localhost") - .option("--timeout ", "timeout (seconds)", 120) + .option("--timeout ", "timeout (seconds)", parseIntArg, 120) + .option("--maxworkers ", "max #workers in pool", parseIntArg, 1) .parse(); const args = program.opts(process.argv); +// TODO: make pool settings configurable +console.log(`Starting worker pool with at most ${args.maxworkers} workers.`) +const pool = workerpool.pool("./worker.js", { + maxWorkers: args.maxworkers, + emitStdStreams: false, + workerThreadOpts: { + resourceLimits: { + maxOldGenerationSizeMb: 65536, + maxYoungGenerationSizeMb: 32768, + }}}); + const app = express(); +// TODO: How to set no size limit? app.use(express.json({limit: "1000mb"})); +// This is an endpoint the load balancer and the runner script will hit to make +// sure the server is running. Sometime the main server and crash when multiple +// large document requests come in. +app.get("/health", async (req, res) => { + res.status(200).send(""); +}) +// Endpoint to parse wikitext. app.post("/", async (req, res) => { + // Document comes as json {"wikitext": str, "id": str, "source": str} const data = req.body; - console.log(`Parsing wikitext from document ${data['id']} of ${data['source']}`); - // var response = await pool.exec('wtf_parse', [data["wikitext"]]); + // Pass this document to the worker pool. Using a worker pool allows us to + // put a timeout on syncronous code (wtf_wikipedia) as the main server will + // run async and kill the worker if it is taking too long. 
pool + // Run the parsing function `wtf_parse` in the worker file `worker.js` .exec('wtf_parse', [data["wikitext"]]) + // If the worker doesn't return a result in this time, an error is thrown .timeout(args.timeout * 1000) + // When the worker returns, this is run .then((response) => { + // Log finish and return parsed text. + console.log(`Finished parsing wikitext from document ${data['id']} of ${data['source']}`); res.json(response); }) + // If there was an error in the worker, .catch((err) => { console.log(err.message); + // If this is a timeout error, set the status code. if (err.message.indexOf("timed out") != -1) { console.error(`Parsing wikitext from document ${data['id']} of ${data['source']} timed out.`) + // This is technaially for the server to send the client when the client has + // timed out, but there isn't a server side timeout code. 504 is for when the + // server is a proxy, not just long running. res.status(408).json({ timeout: err.message }); + // Log other errors, these are generally from the worker running out of + // memory } else { + console.log(`~~~~~~~~~~ Error processing ${data['id']} of ${data['source']} ~~~~~~~~~~`); console.error(err); - res.status(500).json({ error: "Internal server error"}); + res.status(500).json({ error: err.message}); } }); }) -app.listen(args.port) +// Start the server. +app.listen(args.port, () => { + console.log(`Server started on port=${args.port} with timeout=${args.timeout} seconds.`) +}) diff --git a/wiki/parser/start.sh b/wiki/parser/start.sh new file mode 100755 index 0000000..6f19270 --- /dev/null +++ b/wiki/parser/start.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +NUMSERVERS=${1:-16} + +function port { + local id=${1} + if [[ ${id} -ge 10 ]]; then + echo "50${id}" + else + echo "500${id}" + fi +} + +function launch { + local id=${1} + node --max-old-space-size=65536 --max-semi-space-size=16384 parser.js --port $(port ${id}) --timeout 180 --maxworkers 1 >> ./logs/worker${id}.log 2>&1 & +} + +function ping { + local id=${1} + echo $(curl -I -X GET localhost:$(port ${id})/health 2> /dev/null | head -n 1 | cut -d$" " -f2) +} + +mkdir -p logs + +while true; do + for i in $(seq 1 $NUMSERVERS); do + if [[ $(ping ${i}) -ne "200" ]]; then + echo "Worker ${i} not running, starting." + launch ${i} + fi + done + sleep 5 +done diff --git a/wiki/parser/test.js b/wiki/parser/test.js deleted file mode 100644 index 2cd7ee4..0000000 --- a/wiki/parser/test.js +++ /dev/null @@ -1,3 +0,0 @@ -const wtf = require('wtf_wikipedia') - -console.log(wtf("This is an\n:interlude\nexample").text()) diff --git a/wiki/parser/worker.js b/wiki/parser/worker.js index 5019f28..c4de3c4 100644 --- a/wiki/parser/worker.js +++ b/wiki/parser/worker.js @@ -1,9 +1,21 @@ +// Actually run wtf_wikipedia parsing. This is done in a worker thread to allow +// for timeouts as it is sync code. + const workerpool = require("workerpool"); const wtf = require("wtf_wikipedia"); -function wtf_parse(text) { +function wtf_parse(text){ + // If the input is empty, at least return one empty section. This might have + // been better to have the client code deal with an empty list. + if (!text) { + return {document: [{title: "", text: ""}]} + } + + // Parse with wtf_wikipedia var doc = wtf(text); + // Convert to simple [{"title": str, "text": str}, ...] 
representation of + // sections for the response const response = { document: doc.sections().map(s => ({title: s.title(), text: s.text()})), }; diff --git a/wiki/preprocess.py b/wiki/preprocess.py index 2c6e447..bd779f5 100644 --- a/wiki/preprocess.py +++ b/wiki/preprocess.py @@ -14,7 +14,7 @@ from licensed_pile import logs, utils from licensed_pile.write import ShardParallelProcessor -parser = argparse.ArgumentParser(description="Preprocess raw books in dolma format.") +parser = argparse.ArgumentParser(description="Preprocess raw wikitext in dolma format.") parser.add_argument( "--input", default="dump/data/wiki/dump/raw", @@ -48,19 +48,67 @@ help="Number of processors for multicore.", ) -logs.configure_logging("dolma.WTFWikipediaParallel", level="DEBUG") +logs.configure_logging("dolma.WTFWikipediaParallel", level="INFO") + + +# These are pages that often crashed the servers. +DENYLIST = { + "Template:Attached KML/U.S. Route 62 in Kentucky", + "Template:Attached KML/U.S. Route 277", + "User:BeywheelzLetItRip/fonts.css", + "User:BeywheelzLetItRip/fonts2.cs", + "Template:Graph:Map/Inner/USA-json", +} class WTFWikipediaParallel(ShardParallelProcessor): + @classmethod + def parse_wikitext(cls, wikitext, ex_id, ex_src): + logger = cls.get_logger() + try: + return wiki.parse_wikitext(wikitext, ex_id, ex_src) + except requests.Timeout: + logger.error("Wikitext parsing for example: %s/%s timed out", ex_src, ex_id) + # Returning None for the whole example will filter it from the output. + return None + except (ValueError, requests.JSONDecodeError): + logger.error( + "Failed wikitext parsing for example: %s/%s", + ex_src, + ex_id, + exc_info=True, + ) + # Returning None for the whole example will filter it from the output. + return None + except Exception as e: + e.add_note(f"Failed to parse wikitext for example: {ex_src}/{ex_id}") + logger.error("Failed to parse wikitext for example: %s/%s", ex_src, ex_id) + raise + @classmethod def process_example(cls, example, **kwargs): logger = cls.get_logger() - logger.debug(f"Processing example: {example['id']}") + logger.debug("Processing example: %s/%s", example["source"], example["id"]) + if (title := example["metadata"]["title"]) in DENYLIST: + logger.warning( + "Skipping example: %s/%s (%s) from the deny list as the text is %d characters long.", + example["source"], + example["id"], + title, + len(example["text"]), + ) + # Returning None for the whole example will filter it from the output. + return None wikitext = example["text"] # Should be fixed in the dolma generation script. if not wikitext: - logger.warning(f"Example {example['id']} is empty") - return example + logger.warning("Example %s/%s is empty", example["source"], example["id"]) + # Returning None for the whole example will filter it from the output. + return None + # ... + # if len(wikitext) > 1_000_000: + # logger.warning("Skipping example: %s/%s as the text is %d characters long.", example['source'], example['id'], len(wikitext)) + # return None # Convert wikitext = wiki.replace_math_tags(wikitext) # Adjust indentation to avoid reorderings. @@ -71,52 +119,86 @@ def process_example(cls, example, **kwargs): ) if math_templates: logger.debug( - f"Found {len(math_templates)} {{{{math|...}}}} templates in document {example['id']}." 
+ "Found %d {{math|...}} templates in example: %s/%s.", + len(math_templates), + example["source"], + example["id"], ) wikitext, raw_templates = wiki.extract_templates( wikitext, wiki.MATH_TEMPLATES, wiki.SECOND_MARKER ) if raw_templates: logger.debug( - f"Found {len(raw_templates)} more templates that appear to contain math in document {example['id']}." + "Found %d more templates that appear to contain math in example: %s/%s.", + len(raw_templates), + example["source"], + example["id"], ) # We replace these symbols after extracting any thare are part of other # templates. Trying to extract these as their own templates (optional \) # creates weird issues like {{Infobox ...}} getting extracted as {{In..}} wikitext = wiki.replace_symbols(wikitext, include_money=True) + # Parse Wiki Text - try: - document = wiki.parse_wikitext(wikitext, example["id"], example["source"]) - except: - logger.error(f"Failed wikitext parsing for {example['id']}", exc_info=True) - example["text"] = "" - return example + document = cls.parse_wikitext(wikitext, example["id"], example["source"]) + if document is None: + logger.warning( + "After parsing wikitext, %s/%s was empty", + example["source"], + example["id"], + ) + # Returning None for the whole example will filter it from the output. + return None + # Format plaintext into document document = wiki.format_document( document, example.get("metadata", {}).get("title", "") ) + if not document: + logger.warning( + "After parsing wikitext, %s/%s was empty", + example["source"], + example["id"], + ) + # Returning None for the whole example will filter it from the output. + return None + # Process Templates math_templates = map(wiki.fix_math, math_templates) parsed_templates = [ - wiki.parse_wikitext(t, example["id"], example["source"])[0]["text"] + cls.parse_wikitext(t, example["id"], example["source"]) for t in math_templates ] + parsed_templates = [ + p[0]["text"] if p is not None else "" for p in parsed_templates + ] for mt, pt in zip(math_templates, parsed_templates): if not pt: - logger.warning(f"Math template: {mt} was parsed to nothing.") + logger.warning( + "Math template: %s in example: %s/%s was parsed to nothing.", + mt, + example["source"], + example["id"], + ) parsed_templates = [t.replace(wiki.ABS_MARKER, "|") for t in parsed_templates] parsed_templates = [f"${t}$" for t in parsed_templates] raw_templates = map(wiki.fix_math, raw_templates) parsed_raw = [ - wiki.parse_wikitext(t, example["id"], example["source"])[0]["text"] + cls.parse_wikitext(t, example["id"], example["source"]) for t in raw_templates ] + parsed_raw = [p[0]["text"] if p is not None else "" for p in parsed_raw] for rt, pr in zip(raw_templates, parsed_templates): if not pr: - logger.warning(f"Template: {rt} was parsed to nothing.") + logger.warning( + "Template: %s in example: %s/%s was parsed to nothing.", + rt, + example["source"], + example["id"], + ) parsed_raw = [t.replace(wiki.ABS_MARKER, "|") for t in parsed_raw] parsed_raw = [f"${t}$" for t in parsed_raw] # Reinsert Templates @@ -134,7 +216,7 @@ def main(args): metadata_prefix=tempdir, num_processes=args.processes, ) - processor(debug=args.debug) + processor(debug=args.debug, overwrite=args.overwrite) if __name__ == "__main__": diff --git a/wiki/wiki.py b/wiki/wiki.py index d9fa602..c489a5e 100644 --- a/wiki/wiki.py +++ b/wiki/wiki.py @@ -13,11 +13,20 @@ # ¦¦¦ "Broken Bar", `|` has a lot of meaning in wikitext so we to replace actual instances of it. 
ABS_MARKER = "\u00A6\u00A6\u00A6" -# WTF Wikipedia strips out most templates, which is where almost all the math is :( - -# Unclosed scopes, e.g. "An example with {{math|''x''" are removed -> "An example with ''x''" +# WTF Wikipedia strips out most templates, which is where almost all the math is +# :( What we do is find the math templates (regex to find the start then iterate +# forward to find the closing of the scope, allows for nesting) and replace them +# with a symbol that doesn't appear anywhere else. We then clean each template +# ourselves and insert them back, after wtf_wikipedia has been run on the main +# article. +# +# Sometimes wtf_wikipdia converts a template to `1/undefined` and the `/` can +# be an ascii slash or sometime various unicode verions. These are currently +# left in. +# Characters that appear in wikimath templates and how to translate them into +# how they would appear in latex. CHAR_SYMBOLS = { "[Pp]hi": r"\phi", r"\)": ")", @@ -37,9 +46,9 @@ def insert_templates(text: str, templates: List[str], marker) -> str: - """ + """Replace each instance of marker in text with a template. - Again, re.sub was being annoying about \'s in the replacements.' + re.sub was being annoying about \'s in the replacements.' """ offset = 0 new_text = [] @@ -49,7 +58,8 @@ def insert_templates(text: str, templates: List[str], marker) -> str: new_text.append(t) offset = offset + mark.span()[1] else: - # This should be an error + # This should be an error, but the logger isn't plumbed into this + # function atm, just let it go for v0 pass if trailing := text[offset:]: new_text.append(trailing) @@ -132,7 +142,6 @@ def remove_template_brackets(text: str, templates: List[str]) -> str: def fix_equals(text: str) -> str: """wtf_wikipedia can handle the {{math|1=...}} templates but not {{math| ... {{=}} ...}}""" - # TODO: Does this replacement cause any issues? if re.search(r"{{ ?= ?}}|=", text, re.IGNORECASE): text = re.sub(r"{{math ?\|", "{{math|1=", text) return re.sub(r"{{ ?= ?}}|=", "=", text) @@ -152,6 +161,13 @@ def replace_template( nest_close=None, recursive: bool = False, ) -> str: + """Replace templates found in text with a marker. See replace_math_templates + for an explaination of the main parsing code. + + Note: This function *always* allows for the nesting of *different* templates + i.e., {{math|{{overline|...}}}}, but recursive=True must be set to + allow for the nesting of the *same* template, i.e. Xij + """ nest_open = nest_open if nest_open else opening nest_close = nest_close if nest_close else closing offset = 0 @@ -188,6 +204,10 @@ def replace_template( return "".join(new_text) +## +# These are for ease of use, giving names to the common templates we replace in +# the conversion from wikitext to latex. +# def replace_sub(text: str) -> str: return replace_template(text, r"", r"", "_{", "}", recursive=True) @@ -209,7 +229,7 @@ def replace_prime(text: str) -> str: def replace_fraction(text: str) -> str: - """{{Function|...}} isn't handled by wtf_wikipedia but {{sfrac|...}} is.""" + """{{Fraction|}} isn't handled by wtf_wikipedia but {{sfrac|...}} is.""" text = re.sub(r"{{[Ff]ract(?:ion)?(?:/sandbox)? 
?\|", "{{sfrac|", text) return re.sub(r"{{sfrac/sandbox ?\|", "{{sfrac|", text) @@ -306,6 +326,7 @@ def replace_angle_bracket(text: str) -> str: def replace_symbols( text: str, symbols: Dict[str, str] = CHAR_SYMBOLS, include_money: bool = False ) -> str: + """Replace templates that evaulate to a symbol {{pi}} -> 𝛑 with the latex version.""" for template, latex in symbols.items(): # re.sub was being difficult about including something like \p in the # replacement string. So do it manually. @@ -318,6 +339,12 @@ def replace_symbols( def replace_abs(text: str) -> str: + """Convert absolute value from wikitext to latex. + + The | symbol is used in the wikitext template syntax, so they uses various + different ways to escape them. This tries to standadize them all to the latex + format. + """ text = text.replace("{{!}}", ABS_MARKER) text = text.replace("|", ABS_MARKER) text = text.replace("||", f"{ABS_MARKER}{ABS_MARKER}") @@ -329,6 +356,11 @@ def replace_abs(text: str) -> str: def replace_mset(text: str) -> str: + """Convert set notation from wikitext to latex. + + Where are some cases where wtf_wikipedia deletes msets that have | bars in + them despite that being legal in wikitext, those are not handled well atm. + """ opening = r"{{[Mm]set\|?" closing = r"}}" return replace_template( @@ -336,12 +368,14 @@ def replace_mset(text: str) -> str: ) -# TODO: Make sure to stripoff |undefined - - ## # This joins together all the text processing we do. def fix_math(text): + """Convert wikitext math to latex. + + Note: The order of these fixes can be important, some latex output can get + caught by regex's for other tempaltes. + """ text = remove_template_brackets( text, ("var", "nobreak", "nowrap", "mvar", "linktext", "em", "italics correction"), @@ -375,13 +409,16 @@ def fix_math(text): return text -## -# This ... def extract_math_templates(text: str) -> Tuple[str, List[str]]: + """Pull all math out of the page to handle later.""" return extract_templates(text, ("math",), MATH_MARKER) def replace_math_tags(text: str) -> str: + """Replace with $$ for latex. + + We try to pick $...$ or $$...$$ based on the wikitext. + """ math_opening = r'' math_closing = r"" offset = 0 @@ -471,6 +508,8 @@ def replace_math_tags(text: str) -> str: "Fe/H", "floor", "Function", + "Fraction", + "Frac", "gamma", "hub", "intmath", @@ -526,6 +565,7 @@ def replace_math_tags(text: str) -> str: "subsup", "sup", "sup sub", + "sfrac", "tau", "theta", "tmath", @@ -595,16 +635,6 @@ def replace_math_tags(text: str) -> str: ) -def test_request(text, latex: bool = False): - import requests - - r = requests.post( - "http://localhost:5000", - json={"wikitext": text, "source": "test", "id": "test", "latex": latex}, - ) - return r.json() - - ## # These function look ahead in the text to find the end of a scope. def finish_template(text, start="{{", end="}}"): @@ -636,52 +666,31 @@ def finish_template(text, start="{{", end="}}"): return -1, -1 -# def finish_mustache_template(text): -# """This is a special case of template finding where `{` and `}` are considered -# scopes that we must close before finding }}.""" -# i = 0 -# scopes = [] -# while i < len(text) - 1: -# if text[i] == "{": -# if text[i + 1] == "{": -# scopes.append("{{") -# i += 1 -# else: -# scopes.append("{") -# elif text[i] == "}": -# if text[i + 1] == "}" and scopes[-1] == "{{": -# scopes.pop() -# i += 1 -# if not scopes: -# return i - 1, i + 1 -# else: -# # This would result in a syntax error. 
-# if scopes[-1] != "{": -# pass -# scopes.pop() -# i += 1 -# return -1, -1 - - def finish_mustache_template(text): """This is a special case of template finding where `{` and `}` are considered - scopes that we must close before finding }}.""" + scopes that we must close before finding }}. + + If there are } without preceding {, they are ignored. + + In ambiguous cases like {{{, it parses to {{, { for opening the scopes. + """ i = 2 scopes = ["{{"] while i < len(text) - 1: if text[i] == "{": scopes.append("{") elif text[i] == "}": - if text[i + 1] == "}" and scopes[-1] == "{{": - scopes.pop() - i += 1 + if text[i + 1] == "}": + if scopes and scopes[-1] == "{{": + scopes.pop() + i += 1 + elif scopes and scopes[-1] == "{": + scopes.pop() if not scopes: return i - 1, i + 1 else: - # This would result in a syntax error. - if scopes[-1] != "{": - pass - scopes.pop() + if scopes and scopes[-1] == "{": + scopes.pop() i += 1 return -1, -1 @@ -701,17 +710,46 @@ def wiki_to_dir(wiki_id, chars: int = 2, levels: int = 2): return os.path.join(*parts) -def parse_wikitext(text, doc_id, source): +def parse_wikitext( + text, doc_id, source, host: str = "http://localhost", port: int = 5000 +): + """Parse wikitext by hitting a server endpoint.""" r = requests.post( - "http://localhost:5000", + f"{host}:{port}", json={"wikitext": text, "id": doc_id, "source": source}, ) + # This is technaially for the server to send the client when the client has + # timed out, but there isn't a server side timeout code. 504 is for when the + # server is a proxy, not just long running. if r.status_code == 408: raise requests.Timeout() - return r.json()["document"] + # This happens when HAProxy times out + if r.status_code == 504: + message = r.text + raise ValueError(f"{r}, {r.text}, probably from an HAProxy timeout.") + if r.status_code == 200: + try: + return r.json()["document"] + except requests.JSONDecodeError as e: + e.add_note(f"JSON Decoding failed for request {r}:{r.text}") + raise + try: + # Our server returns errors with json information, but if there is a non + # 200 code because of the load balancer, it might not be as JSON. + message = r.json()["error"] + except requests.JSONDecodeError: + message = r.text + raise ValueError(message) def format_section(sec) -> str: + """Convert a section dict into a string like: + + title + text... + more text... + ... + """ match sec: case {"title": "", "text": ""}: return "" @@ -728,6 +766,7 @@ def filter_section(sec, blocklist: Set[str] = SKIP_SECTIONS) -> bool: def format_document(doc, title: str = "") -> str: + """Convert the list of sections into a string, filtering out boilerplate sections.""" sections = filter(filter_section, doc) sections = (sec for s in sections if (sec := format_section(s))) return "\n\n".join(itertools.chain((title,), sections)).strip() @@ -749,8 +788,8 @@ def adjust_indentation(text: str) -> str: """ result = [] while indent := re.search("^:+.+$", text, re.MULTILINE | re.IGNORECASE): - # The :ident is on the last line - if indent.span()[1] == len(text): + # The :ident is on the last line, "\n" isn't matched so subtract 1 + if indent.span()[1] >= (len(text) - 1): result.append(text) break
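For reference, here is a minimal usage sketch (not part of the patch) of the `parse_wikitext` client above, mirroring the error handling that `wiki/preprocess.py` wraps around it. It assumes it is run from the `wiki/` directory so `import wiki` resolves, that the parser service is reachable on the default `localhost:5000` frontend, and the wikitext, id, and source values are made-up examples; `parse_or_drop` is a hypothetical helper name.

```python
# Usage sketch for wiki.parse_wikitext -- hypothetical, for illustration only.
import requests

import wiki


def parse_or_drop(wikitext, doc_id, source):
    """Return parsed sections, or None so the caller can drop the example."""
    try:
        return wiki.parse_wikitext(wikitext, doc_id, source)
    except requests.Timeout:
        # The parser returned 408: the worker hit its wtf_wikipedia timeout.
        return None
    except (ValueError, requests.JSONDecodeError):
        # 504s from HAProxy or a malformed/non-JSON response body.
        return None


if __name__ == "__main__":
    sections = parse_or_drop("'''Example''' page with {{math|1=x = 2}}.", "example-0", "wikipedia")
    if sections is not None:
        print(wiki.format_document(sections, title="Example"))
```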