Tooling to download and process Wikis
Add tools to scrape MediaWiki wikis that don't publish dumps.

Add a tool that exports the XML based on the list of pages.

Add the ability to convert wikis to the dolma format.
blester125 committed Jun 3, 2024
1 parent 1ef9a1c commit 05d64f2
Showing 28 changed files with 1,818 additions and 10 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -160,3 +160,6 @@ cython_debug/
#.idea/
.python-version
**/licensed_pile_log.txt

node_modules
package-lock.json
39 changes: 34 additions & 5 deletions licensed_pile/licenses.py
@@ -9,6 +9,8 @@ def __str__(self):
return self.value


# TODO: With all the different versions that are out in the wild, this flat enum
# is getting hard to use. We should re-think how to do this.
class PermissiveLicenses(StringEnum):
"""By 'Permissive' we mean licenses that are in the Gold, Silver, or Bronze
lists of the Blue Oak Council (https://blueoakcouncil.org/list), even if
@@ -17,15 +19,24 @@ class PermissiveLicenses(StringEnum):

PD = "Public Domain"
CC0 = "Creative Commons Zero - Public Domain - https://creativecommons.org/publicdomain/zero/1.0/"
CC_PDM = "Creative Commons Public Domain Mark - https://creativecommons.org/publicdomain/mark/1.0/"
CC_BY = (
"Creative Commons - Attribution - https://creativecommons.org/licenses/by/4.0/"
)
CC_BY_3 = (
"Creative Commons - Attribution - https://creativecommons.org/licenses/by/3.0/"
)
CC_BY_2_5 = (
"Creative Commons - Attribution - https://creativecommons.org/licenses/by/2.5/"
)
CC_BY_2 = (
"Creative Commons - Attribution - https://creativecommons.org/licenses/by/2.0/"
)
CC_BY_SA = "Creative Commons - Attribution Share-Alike - https://creativecommons.org/licenses/by-sa/4.0/"
CC_BY_SA_3 = "Creative Commons - Attribution Share-Alike - https://creativecommons.org/licenses/by-sa/3.0/"
CC_BY_SA_2_5 = "Creative Commons - Attribution Share-Alike - https://creativecommons.org/licenses/by-sa/2.5/"
CC_BY_SA_2_1 = "Creative Commons - Attribution Share-Alike - https://creativecommons.org/licenses/by-sa/2.1/"
CC_BY_SA_1 = "Creative Commons - Attribution Share-Alike - https://creativecommons.org/licenses/by-sa/1.0/"
GFDL = "GNU Free Documentation License"
APACHE_2 = "Apache 2 License - https://www.apache.org/licenses/LICENSE-2.0"
MIT = "MIT License"
@@ -49,17 +60,35 @@ def from_string(cls, s: str) -> "PermissiveLicenses":
s = s.lower().strip()
if re.match(r".*/publicdomain/zero/1.0/?$", s):
return cls.CC0
if m := re.match(r".*/licenses/by(?P<share>-sa)?/(?P<version>\d).0/?$", s):
if m.group("version") == "4":
if m.group("share") is None:
if re.match(r".*/publicdomain/mark/1.0/?$", s):
return cls.CC_PDM
if re.match(r".*/publicdomain/.*", s):
return cls.PD
if m := re.search(r"(?:/licenses/)?by(?P<share>-sa)?/(?P<version>\d.\d)/?", s):
if m.group("version") == "4.0":
if m.group("share") is not None:
return cls.CC_BY_SA
return cls.CC_BY
elif m.group(1) == "3":
if m.group("share") is None:
elif m.group("version") == "3.0":
if m.group("share") is not None:
return cls.CC_BY_SA_3
return cls.CC_BY_3
elif m.group("version") == "2.5":
if m.group("share") is not None:
return cls.CC_BY_SA_2_5
return cls.CC_BY_2_5
elif m.group("version") == "2.1":
if m.group("share") is not None:
return cls.CC_BY_SA_2_1
elif m.group("version") == "2.0":
return cls.CC_BY_2
elif m.group("version") == "1.0":
if m.group("share") is not None:
return cls.CC_BY_SA_1
else:
raise ValueError(f"Unable to understand license {s}")
if s == "gfdl" or "gnu_free_documentation_license" in s:
return cls.GFDL
raise ValueError(f"Unable to understand license {s}")
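
For context, a quick usage sketch of the updated `from_string` parser (not part of the commit; the URLs are just illustrative examples of the path shapes it matches):

```python
from licensed_pile.licenses import PermissiveLicenses

# Share-alike licenses map to the CC_BY_SA_* member for their version.
assert (
    PermissiveLicenses.from_string("https://creativecommons.org/licenses/by-sa/3.0/")
    is PermissiveLicenses.CC_BY_SA_3
)
# The public domain mark now has its own member instead of falling through.
assert (
    PermissiveLicenses.from_string("https://creativecommons.org/publicdomain/mark/1.0/")
    is PermissiveLicenses.CC_PDM
)
```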


1 change: 1 addition & 0 deletions licensed_pile/scrape.py
@@ -29,6 +29,7 @@ def get_page(
resp = requests.get(url, params=params, headers=headers)
logging.debug(f"Sending GET to {resp.url}")
if resp.status_code != 200:
# TODO: Update logger
logging.warning(
f"Failed request to {resp.url}: {resp.status_code}, {resp.reason}"
)
14 changes: 12 additions & 2 deletions licensed_pile/xml.py
@@ -1,6 +1,8 @@
"""Tools to help with xml parsing."""

from xml.etree import ElementTree as ET
from typing import List

import lxml.etree as ET


def iterate_xml(path: str, tag: str):
@@ -17,6 +19,14 @@ def iterate_xml(path: str, tag: str):
context = iter(context)
event, root = next(context)
for event, elem in context:
if event == "end" and elem.tag == tag:
# This `.localname` only exists for lxml. The plain `elem.tag` comparison is
# kept so you can still do a full-namespace match if you need to.
if event == "end" and (ET.QName(elem.tag).localname == tag or elem.tag == tag):
yield elem
root.clear()


def iterate_xmls(paths: List[str], tag: str):
"""Iterable version of parsing multiple xml files with the same structure as a single iterator."""
for path in paths:
yield from iterate_xml(path, tag)
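
Not part of the commit, but a rough usage sketch of these helpers (assuming a MediaWiki export saved locally as `dump.xml`; the namespaced `<page>`/`<title>` layout is the usual export format, not something this module enforces):

```python
import lxml.etree as ET

from licensed_pile.xml import iterate_xml

# Stream <page> elements without loading the whole dump into memory.
for page in iterate_xml("dump.xml", "page"):
    title = None
    for child in page:
        # Child tags carry the export namespace too, so compare local names.
        if isinstance(child.tag, str) and ET.QName(child.tag).localname == "title":
            title = child.text
            break
    print(title)
```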
8 changes: 5 additions & 3 deletions requirements.txt
@@ -1,16 +1,18 @@
beautifulsoup4
charset_normalizer
datasets
dolma
google-cloud-storage
internetarchive
logging_json
markdown-it-py
pandas
patool
pre-commit
pyunpack
rdflib
requests>=2.13
smart_open
tenacity
pandas
jsonlines
datasets
tqdm
ultimate-sitemap-parser
1 change: 1 addition & 0 deletions setup.py
@@ -62,5 +62,6 @@ def get_version(file_name: str, version_variable: str = "__version__") -> str:
"logging_json",
"requests>=2.13",
"tenacity",
"lxml",
],
)
23 changes: 23 additions & 0 deletions wiki/README.md
@@ -0,0 +1,23 @@
# Wiki

## Notes

The following scanners output a `.history.xml` file to parse:
* "Internet Archive HTML5 Uploader ...": seems to ship `.7z` archives.
* "wikiteam3 (v...)": these are released as zstandard-compressed files.
* Official Wikipedia Dumps
* "Internet Archive Python library ..." >= 1.0.4


The following use the old format:
* "Internet Archive Python library 0.X.X": distributed as a zip file; you need to create a new directory with `unzip -d` when extracting.


The archive URL can be created with `f"archive.org/details/{item_id}"`


Some of the items have multiple uploads. For example, `wiki-kris159shoutwikicom_w` has multiple history files, so we need to parse out the date and pick the most recent one, i.e., `kris159shoutwikicom_w-20180506-history-xml.7z` over `kris159shoutwikicom_w-20140129-history.xml.7z`.
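
A minimal sketch of that selection logic (not from the repo; it assumes file names follow the `<wiki>-<YYYYMMDD>-history*.7z` pattern shown above):

```python
import re

def most_recent_history(file_names):
    """Pick the newest history dump based on the YYYYMMDD stamp in the name."""
    dated = []
    for name in file_names:
        if m := re.search(r"-(\d{8})-history", name):
            dated.append((m.group(1), name))
    # YYYYMMDD strings sort chronologically, so max() picks the newest dump.
    return max(dated)[1] if dated else None

print(most_recent_history([
    "kris159shoutwikicom_w-20140129-history.xml.7z",
    "kris159shoutwikicom_w-20180506-history-xml.7z",
]))  # -> kris159shoutwikicom_w-20180506-history-xml.7z
```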

## Special Cases

Shout Wiki, WikiTravelAllLanguages
Empty file added wiki/__init__.py
Empty file.
1 change: 1 addition & 0 deletions wiki/archive/.gitignore
@@ -0,0 +1 @@
data*/*
23 changes: 23 additions & 0 deletions wiki/archive/README.md
@@ -0,0 +1,23 @@
# Wiki Dumps from the Internet Archive

We need to download 4.4 TB from the Internet Archive.

With a single gigabit connection it would take roughly 9 hours 40 minutes to download (see the table below).

Anecdotal reports suggest the Internet Archive generally serves between 1 Mb/s and 10 Mb/s per connection, and the longer a download runs, the less bandwidth they give you.

| Bandwidth | Hosts | Time to DL |
|----------|------:|-----------:|
| 1 Gb/s | 1 | 9h 40m |
| | 4 | 2.3h |
| | 10 | 0.9h |
| 10 Mb/s | 1 | 40d 17h |
| | 4 | 10d + |
| | 10 | 4d + |
| 1 Mb/s | 1 | 407d 9h |
| | 4 | 101d + |
| | 10 | 40d+ |
| | 100 | 4d+ |
| | 500 | 0.8d |

We really need hardware-based parallelism, i.e., multiple hosts downloading in parallel.
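
The table above is just total size divided by aggregate bandwidth; a quick back-of-the-envelope check (assuming the full 4.4 TB and that bandwidth scales linearly with the number of hosts):

```python
TOTAL_BITS = 4.4e12 * 8  # 4.4 TB expressed in bits

def days_to_download(bits_per_second: float, hosts: int = 1) -> float:
    """Time to move TOTAL_BITS at the given per-host bandwidth."""
    return TOTAL_BITS / (bits_per_second * hosts) / 86400

print(f"{days_to_download(1e9) * 24:.1f} hours at 1 Gb/s with 1 host")      # ~9.8 hours
print(f"{days_to_download(10e6):.1f} days at 10 Mb/s with 1 host")          # ~40.7 days
print(f"{days_to_download(1e6, hosts=100):.1f} days at 1 Mb/s, 100 hosts")  # ~4.1 days
```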
Empty file added wiki/archive/__init__.py
Empty file.
135 changes: 135 additions & 0 deletions wiki/archive/download_archive.py
@@ -0,0 +1,135 @@
"""Download wiki dumps from the internet archive."""

import argparse
import functools
import json
import multiprocessing.dummy as mp
import os
import random

import internetarchive
import pyunpack
import utils

from licensed_pile import logs

parser = argparse.ArgumentParser(
description="Download wiki dumps from the internet archive."
)
parser.add_argument("--wiki_metadata", default="data/ia-wikis.jsonl")
parser.add_argument("--test_run", type=int, help="")
parser.add_argument("--num_threads", type=int, default=32, help="")
parser.add_argument("--worker_id", type=int, required=True, help="")
parser.add_argument("--num_workers", type=int, required=True, help="")


# TODO: Default downloading to .../dumps
def download_and_extract(
ident: str,
dl_file,
output_dir: str = "/fruitbasket/users/bdlester/projects/licensed_pile/wiki/archive/data/dumps",
verbose: bool = False,
):
logger = logs.get_logger("wiki/archive")
dest = os.path.join(output_dir, ident)
if os.path.exists(dest):
logger.info(
f"Skipping download of {dl_file['name']} for {ident} as {dest} already exists on disk."
)
return dest
logger.info(f"Downloading {dl_file['name']} for {ident}.")
internetarchive.download(
ident, checksum=True, verbose=verbose, files=dl_file["name"], destdir=output_dir
)
logger.info(f"Extracting download for {ident} to {dest}.")
pyunpack.Archive(os.path.join(dest, dl_file["name"])).extractall(dest)
return dest


def download_ia(wiki):
logger = logs.get_logger("wiki/archive")
if (ident := wiki["metadata"]["identifier"]) in utils.KNOWN_BAD:
logger.warning(f"Skipping {ident} as it is listed under utils.KNOWN_BAD")
return None
dl_file = utils.find_download(wiki)
return download_and_extract(ident, dl_file)


def download_fandom(wiki):
logger = logs.get_logger("wiki/archive")
logger.warning(f"Fandom downloads not implemented yet, downloading from IA.")
return download_ia(wiki)


def download_wikimedia(wiki):
logger = logs.get_logger("wiki/archive")
logger.warning(f"Wikimedia downloads not implemented yet, downloading from IA.")
return download_ia(wiki)


def scrape_wiki(wiki):
logger = logs.get_logger("wiki/archive")
logger.warning(f"Wiki Re-scrapes not implemented yet, downloading from IA.")
return download_ia(wiki)


def process_wiki(i, wiki, offset):
logger = logs.get_logger("wiki/archive")
if "metadata" not in wiki:
logger.error(f"Metadata missing from line {i}, malformed record")
return None
ident = wiki["metadata"]["identifier"]
if not utils.filter_language(wiki["metadata"].get("language")):
lang = wiki["metadata"].get("language")
logger.warning(f"{ident} appears to not be english, found: {lang}")
return None
if not utils.check_alive(wiki):
logger.info(f"{ident} is offline, getting dump from IA.")
return download_ia(wiki)
if not utils.verify_license(wiki):
logger.error(f"The IA license for {ident} doesn't match the source.")
return None
if utils.check_fandom(wiki):
logger.info(f"{ident} is a fandom wiki, downloading dump from there.")
return download_fandom(wiki)
if utils.check_wikimedia(wiki):
logger.info(f"{ident} is a WikiMedia wiki, downloading dump from there.")
return download_wikimedia(wiki)
if utils.check_out_of_date(wiki, offset):
logger.warning(f"IA dump for {ident} is very out of date, re-scraping.")
return scrape_wiki(wiki)


# TODO: configure dest_dir
def main(args):
logger = logs.get_logger("wiki/archive")
logger.info(f"Reading wiki metadata from {args.wiki_metadata}")
with open(args.wiki_metadata) as f:
wiki_metadata = [json.loads(l) for l in f if l]
logger.info(f"{len(wiki_metadata)} wikis to download.")

if args.test_run:
logger.info(f"Test Run: Only downloading {args.test_run} wikis")
random.shuffle(wiki_metadata)
wiki_metadata = wiki_metadata[: args.test_run]

wiki_metadata = [
w for i, w in enumerate(wiki_metadata) if i % args.num_workers == args.worker_id
]
logger.info(
f"{len(wiki_metadata)} wikis to download as {args.worker_id}/{args.num_workers}."
)

# f = functools.partial(process_wiki, offset=None)
# [f(*w) for w in enumerate(wiki_metadata)]

with mp.Pool(args.num_threads) as pool:
pool.starmap(
functools.partial(process_wiki, offset=None), enumerate(wiki_metadata)
)


if __name__ == "__main__":
args = parser.parse_args()
logs.configure_logging("wiki/archive")
main(args)
65 changes: 65 additions & 0 deletions wiki/archive/get_metadata.py
@@ -0,0 +1,65 @@
"""Download wiki dump metadata from the internet archive.
The licenseurl regexes we are using to search are mutually exclusive, so we can
split the query into multiple chunks instead of `OR`ing them together to get some
parallelism out of the metadata scrape.
"""

import argparse
import functools
import json
import multiprocessing.dummy as mp
import os

import internetarchive

from licensed_pile import logs
from licensed_pile.licenses import PermissiveLicenses

parser = argparse.ArgumentParser(
description="Download metadata for wiki dumps from the IA."
)
parser.add_argument("--output_dir", default="data/metadata/", help="")
parser.add_argument("--file_name", default="ia-wiki-metadata.jsonl")
# TODO: Respect these
parser.add_argument("--include_wikicollections", action="store_true", help="")
parser.add_argument("--licenses", choices=[], action="append", help="")


def get_metadata(idx: int, query: str, file_name: str, output_dir: str):
"""Fetch item metadata from IA using query and save it to disk."""
with open(os.path.join(output_dir, f"{idx:>05}_{file_name}"), "w") as wf:
# iter_as_items() yields full Item objects so we can access .item_metadata.
for item in internetarchive.search_items(query).iter_as_items():
wf.write(json.dumps(item.item_metadata) + "\n")


def make_queries(licenses, include_wikicollections):
if include_wikicollections:
raise NotImplementedError("...")
license_regexs = licenses
for license_regex in license_regexs:
yield f"collection:(wikiteam) AND licenseurl:({license_regex})"


def main(args):
# TODO have something that translates from the PermissiveLicense Enum to regex's
if args.licenses is None:
args.licenses = [
"*\/by\/*",
"*\/by-sa\/*",
"*publicdomain*",
"*GNU_Free_Documentation_License*",
]
queries = list(make_queries(args.licenses, args.include_wikicollections))
with mp.Pool(len(queries)) as pool:
pool.starmap(
functools.partial(
get_metadata, file_name=args.file_name, output_dir=args.output_dir
),
enumerate(queries),
)


if __name__ == "__main__":
args = parser.parse_args()
logs.configure_logging("wiki/archive")
main(args)
