Commit

convert xml to dolma
blester125 committed Mar 11, 2024
1 parent 179f890 commit 2af7c13
Showing 8 changed files with 158 additions and 22 deletions.
4 changes: 2 additions & 2 deletions licensed_pile/licenses.py
@@ -37,12 +37,12 @@ def from_string(cls, s: str) -> "PermissiveLicenses":
        s = s.lower().strip()
        if re.match(r".*/publicdomain/zero/1.0/?$", s):
            return cls.CC0
        if m := re.match(r".*/licenses/by(?P<share>-sa)?/(?P<version>\d).0/?$", s):
        if m := re.match(r".*(?:/licenses/)?by(?P<share>-sa)?/(?P<version>\d).0/?$", s):
            if m.group("version") == "4":
                if m.group("share") is None:
                    return cls.CC_BY_SA
                return cls.CC_BY
            elif m.group(1) == "3":
            elif m.group("version") == "3":
                if m.group("share") is None:
                    return cls.CC_BY_SA_3
                return cls.CC_BY_3
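A small sketch of what the relaxed pattern accepts (the license strings below are hypothetical inputs, not taken from the repo): dropping the mandatory `/licenses/` segment lets bare `by-sa/3.0` style strings match as well.

import re

PATTERN = r".*(?:/licenses/)?by(?P<share>-sa)?/(?P<version>\d).0/?$"

for s in (
    "https://creativecommons.org/licenses/by-sa/3.0/",  # matched by the old and new pattern
    "cc by-sa/3.0",                                     # only matches once /licenses/ is optional
):
    m = re.match(PATTERN, s.lower().strip())
    print(s, "->", None if m is None else (m.group("version"), m.group("share")))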
28 changes: 28 additions & 0 deletions licensed_pile/scrape.py
@@ -1,8 +1,36 @@
"""Shared Utilities related to scraping."""

import logging
from typing import Dict, Optional

import requests
from tenacity import retry, stop_after_attempt, wait_random_exponential

# A user agent that says we are compatible with most websites (most browsers
# start with Mozilla/5.0), identifies us as a bot, and includes a link for
# context on why we are scraping. We hope this fosters good will with site
# owners.
USER_AGENT = "Mozilla/5.0 (compatible; Licensed-Pile-bot/0.1; +http://www.github.com/r-three/licensed-pile)"

DEFAULT_HEADERS = {"User-Agent": USER_AGENT}


@retry(stop=stop_after_attempt(5), wait=wait_random_exponential(multiplier=1, max=30))
def get_page(
    url: str,
    params: Optional[Dict[str, str]] = None,
    headers: Optional[Dict[str, str]] = None,
):
    """Get a page, with retries on failure."""
    params = params if params is not None else {}
    headers = headers if headers else {}
    # Unpack the defaults first so the user provided ones can override them.
    headers = {**DEFAULT_HEADERS, **headers}
    resp = requests.get(url, params=params, headers=headers)
    logging.debug(f"Sending GET to {resp.url}")
    if resp.status_code != 200:
        logging.warning(
            f"Failed request to {resp.url}: {resp.status_code}, {resp.reason}"
        )
        raise RuntimeError(f"Failed request to {resp.url}")
    return resp
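A minimal usage sketch (the endpoint and extra header are hypothetical): caller-supplied headers are layered over DEFAULT_HEADERS, so the bot User-Agent is sent unless deliberately overridden, and tenacity retries the request up to five times with exponential backoff.

from licensed_pile.scrape import get_page

# Hypothetical MediaWiki API endpoint, used purely for illustration.
resp = get_page(
    "https://wiki.example.org/api.php",
    params={"action": "query", "format": "json"},
    headers={"Accept": "application/json"},  # merged on top of DEFAULT_HEADERS
)
print(resp.status_code, resp.url)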
7 changes: 5 additions & 2 deletions licensed_pile/xml.py
@@ -1,7 +1,8 @@
"""Tools to help with xml parsing."""

from typing import List
from xml.etree import ElementTree as ET

import lxml.etree as ET


def iterate_xml(path: str, tag: str):
@@ -18,7 +19,9 @@ def iterate_xml(path: str, tag: str):
    context = iter(context)
    event, root = next(context)
    for event, elem in context:
        if event == "end" and elem.tag == tag:
        # `.localname` only exists in lxml. Keep the `or` clause so you can
        # still do a full namespace-qualified tag match if you need to.
        if event == "end" and (ET.QName(elem.tag).localname == tag or elem.tag == tag):
            yield elem
            root.clear()

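A short usage sketch (the shard path and namespace URI are illustrative): lxml reports namespaced tags as `{uri}localname`, so the `ET.QName(...).localname` check lets callers pass the bare tag name.

from licensed_pile.xml import iterate_xml

# Made-up shard path; MediaWiki exports declare a default namespace, so lxml
# reports tags like "{http://www.mediawiki.org/xml/export-0.10/}page".
for page in iterate_xml("data/demowiki/export/00000-pages.xml", tag="page"):
    # Child tags are namespaced too, so match on the suffix / local name.
    title = [t for t in page if t.tag.endswith("title")][0].text
    print(title)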
7 changes: 6 additions & 1 deletion setup.py
@@ -58,5 +58,10 @@ def get_version(file_name: str, version_variable: str = "__version__") -> str:
    packages=find_packages(),
    python_requires=">=3.8",
    license="MIT",
    install_requires=["logging_json"],
    install_requires=[
        "logging_json",
        "requests",
        "tenacity",
        "lxml",
    ],
)
2 changes: 1 addition & 1 deletion wikiscrape/README.md
@@ -14,6 +14,6 @@ These steps are to be completed for each wiki that we are scraping.
* `UserTalk`: 3
Either the integer or the name can be used as input. This generates lists of page titles at `data/${wiki_name}/pages/${ns}.txt`.
3. Get the XML export of these pages with `python export_pages.py --wiki ${wiki_url}`. This gets XML exports of all the pages listed in the previous step. It currently fetches all revisions so that we can build a complete author list. This will create a sharded XML export at `data/${wiki_name}/export/${shard_idx}-pages.xml`. The `<text>` tag contains the wikimedia markup.

4. Convert the XML export into the dolma format with `python to-dolma.py --wiki ${wiki_url} --license ${license_str}`

**TODO:** Is this exported format exactly the same as the published MediaWiki dumps, to the point that we can reuse code?
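For reference, each exported page becomes one record in the dolma output; a sketch of the shape produced by `to-dolma.py` (all values below are invented for illustration):

# Illustrative record; values are made up, comments note where each field comes from.
{
    "id": "0-42",  # f"{namespace}-{page_id}"
    "text": "... wikitext of the latest revision ...",
    "source": "wikiscrape-https://wiki.example.org",
    "added": "2024-03-11T00:00:00",    # time the page was converted (UTC)
    "created": "2023-07-04T12:30:00",  # timestamp of the latest revision
    "metadata": {
        "license": "...",              # str(PermissiveLicenses.CC_BY_SA), for example
        "authors": [("Alice", "7"), ("Bob", "12")],  # (username, user id) pairs
        "url": "https://wiki.example.org/wiki/Some_Page",
        "wiki": "wiki.example.org",    # get_wiki_name(args.wiki)
        "namespace": "0",
        "title": "Some Page",
    },
}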
9 changes: 7 additions & 2 deletions wikiscrape/export_pages.py
@@ -31,6 +31,9 @@
parser.add_argument(
    "--page_limit", default=35, help="The max number of pages to export at once."
)
parser.add_argument(
    "--test_pages", default=None, type=int, help="The number of test pages to retrieve."
)
parser.add_argument(
    "--output_dir",
    help="Where to save the xml export. defaults to data/${wiki_name}/export/.",
@@ -93,9 +96,11 @@ def main(args):
    # with literal "{"'s.
    for i, j in enumerate(range(0, len(pages), args.page_limit)):
        xml = export_pages(args.wiki, pages[j : j + args.page_limit])
        dirname, filename = os.path.split(args.output)
        with open(os.path.join(dirname, f"{i:>05}-{filename}"), "w") as wf:
        with open(os.path.join(args.output_dir, f"{i:>05}-pages.xml"), "w") as wf:
            wf.write(xml)
        if args.test_pages and j > args.test_pages:
            print(f"Scraped {j + args.page_limit} pages, stopping for testing.")
            break


if __name__ == "__main__":
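The loop above slices the page list into `--page_limit`-sized chunks and writes one shard per chunk; a standalone sketch of the naming scheme (wiki name and page count are made up), matching the `${shard_idx}-pages.xml` pattern mentioned in the README:

import os

page_limit = 35
pages = [f"Page {n}" for n in range(100)]  # stand-ins for real page titles
output_dir = "data/demowiki/export"        # hypothetical output directory

for i, j in enumerate(range(0, len(pages), page_limit)):
    chunk = pages[j : j + page_limit]
    # Shards come out as 00000-pages.xml, 00001-pages.xml, ...
    print(os.path.join(output_dir, f"{i:>05}-pages.xml"), len(chunk))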
99 changes: 99 additions & 0 deletions wikiscrape/to-dolma.py
@@ -0,0 +1,99 @@
"""Convert a wikiscrape of media-wiki dump into the dolma format."""

import argparse
import datetime
import functools
import glob
import itertools
import os
import urllib.parse

from utils import get_wiki_name, wiki_url

from licensed_pile.licenses import PermissiveLicenses
from licensed_pile.write import to_dolma
from licensed_pile.xml import iterate_xmls

SOURCE_NAME = "wikiscrape"


parser = argparse.ArgumentParser(description="Convert the xml export to dolma.")
parser.add_argument("--wiki", required=True, help="The wiki url we are exporting.")
parser.add_argument("--license", required=True, help="The licenses this is under.")
parser.add_argument("--export", help="The location of the exported pages.")
parser.add_argument(
    "--output_dir",
    default=f"data/{SOURCE_NAME}/raw/documents/",
    help="Where the dolma formatted data goes.",
)
parser.add_argument(
    "--filename", default=None, help="The base filename for our wiki data."
)
parser.add_argument(
    "--shard_size", type=int, default=1, help="Size, in GB, for each shard."
)


def main(args):
    # Calculate defaults
    license = PermissiveLicenses.from_string(args.license)
    args.filename = (
        args.filename if args.filename else f"{get_wiki_name(args.wiki)}.jsonl.gz"
    )
    args.export = (
        args.export
        if args.export
        else os.path.join("data", get_wiki_name(args.wiki), "export", "*.xml")
    )

    # Our parser can ignore namespaces so just use `page`.
    pages = iterate_xmls(glob.iglob(args.export), tag="page")
    pages = map(
        functools.partial(
            format_dolma, source_name=SOURCE_NAME, wiki=args.wiki, license=license
        ),
        pages,
    )
    to_dolma(pages, args.output_dir, args.filename, args.shard_size)


def format_dolma(xml, source_name: str, wiki: str, license: PermissiveLicenses):
    revisions = [r for r in xml if r.tag.endswith("revision")]
    # TODO: Handle the case where this fails.
    text = [t for t in revisions[-1] if t.tag.endswith("text")][0].text
    page_namespace = [ns for ns in xml if ns.tag.endswith("ns")][0].text
    page_id = [pid for pid in xml if pid.tag.endswith("id")][0].text
    created = datetime.datetime.fromisoformat(
        [ts for ts in revisions[-1] if ts.tag.endswith("timestamp")][0].text
    ).replace(tzinfo=None)
    page_title = [t for t in xml if t.tag.endswith("title")][0].text

    contributors = set()
    for revision in revisions:
        contribs = [c for c in revision if c.tag.endswith("contributor")]
        # When there are multiple contributors, there are multiple <contributor>
        # xml items, each with a single username and id item.
        names = [u.text for c in contribs for u in c if u.tag.endswith("username")]
        # Save their id too in case they change their username.
        uid = [u.text for c in contribs for u in c if u.tag.endswith("id")]
        contributors.update(zip(names, uid))

    return {
        "id": f"{page_namespace}-{page_id}",
        "text": text,
        "source": f"{source_name}-{wiki}",
        "added": datetime.datetime.utcnow().isoformat(),
        "created": created.isoformat(),
        "metadata": {
            "license": str(license),
            "authors": sorted(contributors),
            "url": wiki_url(wiki, page_title),
            "wiki": get_wiki_name(wiki),
            "namespace": page_namespace,
            "title": page_title,
        },
    }


if __name__ == "__main__":
    args = parser.parse_args()
    main(args)
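A quick sanity-check sketch for format_dolma (assumes it is run from the wikiscrape/ directory with the licensed_pile package installed; the page XML, wiki URL, and expected output are invented for illustration):

import importlib.util

import lxml.etree as ET

from licensed_pile.licenses import PermissiveLicenses

# to-dolma.py has a hyphen in its name, so load it with importlib for this check.
spec = importlib.util.spec_from_file_location("to_dolma", "to-dolma.py")
to_dolma = importlib.util.module_from_spec(spec)
spec.loader.exec_module(to_dolma)

# A tiny hand-written <page> standing in for one entry of a real export.
page = ET.fromstring(
    "<page>"
    "<title>Main Page</title><ns>0</ns><id>42</id>"
    "<revision>"
    "<id>1</id><timestamp>2024-03-01T00:00:00+00:00</timestamp>"
    "<contributor><username>Alice</username><id>7</id></contributor>"
    "<text>Hello wiki</text>"
    "</revision>"
    "</page>"
)
record = to_dolma.format_dolma(
    page,
    source_name="wikiscrape",
    wiki="https://wiki.example.org",
    license=PermissiveLicenses.CC_BY,
)
print(record["id"], record["metadata"]["authors"])  # 0-42 [('Alice', '7')]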
24 changes: 10 additions & 14 deletions wikiscrape/utils.py
@@ -6,28 +6,18 @@

import requests
from bs4 import BeautifulSoup
from tenacity import retry, stop_after_attempt, wait_random_exponential

from licensed_pile.scrape import USER_AGENT
from licensed_pile import scrape

logging.basicConfig(
    level=logging.INFO,
    format="wikiscrape: [%(asctime)s] %(levelname)s - %(message)s",
)


@retry(stop=stop_after_attempt(5), wait=wait_random_exponential(multiplier=1, max=30))
def get_page(url: str, params: Optional[Dict[str, str]] = None):
"""Get page and parse into soup."""
params = params if params is not None else {}
resp = requests.get(url, params=params, headers={"User-Agent": USER_AGENT})
logging.debug(f"Sending GET to {resp.url}")
if resp.status_code != 200:
logging.warning(
f"Failed request to {resp.url}: {resp.status_code}, {resp.reason}"
)
raise RuntimeError(f"Failed request to {resp.url}")
return resp.text
def get_page(*args, **kwargs):
r = scrape.get_page(*args, **kwargs)
return r.text


def get_soup(text, parser="html.parser"):
@@ -48,3 +38,9 @@ def removeprefix(s: str, prefix: str) -> str:
    if s.startswith(prefix):
        return s[len(prefix) :]
    return s[:]


def wiki_url(base_url: str, title: str) -> str:
"""Create a wiki url from the wiki url and the page name."""
url = urllib.parse.urljoin(base_url, f"wiki/{title.replace(' ', '_')}")
return urllib.parse.quote(url, safe=":/")
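For example (wiki URL and titles are hypothetical), spaces become underscores and non-ASCII characters are percent-encoded, while ':' and '/' survive:

from utils import wiki_url

print(wiki_url("https://wiki.example.org/", "Main Page"))
# -> https://wiki.example.org/wiki/Main_Page
print(wiki_url("https://wiki.example.org/", "Café"))
# -> https://wiki.example.org/wiki/Caf%C3%A9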
