Commit

convert xml to dolma
blester125 committed Mar 11, 2024
1 parent 179f890 commit 2af7c13
Showing 8 changed files with 158 additions and 22 deletions.
4 changes: 2 additions & 2 deletions licensed_pile/licenses.py
@@ -37,12 +37,12 @@ def from_string(cls, s: str) -> "PermissiveLicenses":
        s = s.lower().strip()
        if re.match(r".*/publicdomain/zero/1.0/?$", s):
            return cls.CC0
        if m := re.match(r".*/licenses/by(?P<share>-sa)?/(?P<version>\d).0/?$", s):
        if m := re.match(r".*(?:/licenses/)?by(?P<share>-sa)?/(?P<version>\d).0/?$", s):
            if m.group("version") == "4":
                if m.group("share") is None:
                    return cls.CC_BY_SA
                return cls.CC_BY
            elif m.group(1) == "3":
            elif m.group("version") == "3":
                if m.group("share") is None:
                    return cls.CC_BY_SA_3
                return cls.CC_BY_3
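A small sketch of what the relaxed pattern accepts (the license strings below are hypothetical inputs, not taken from the repo): dropping the mandatory `/licenses/` segment lets bare `by-sa/3.0` style strings match as well.

import re

PATTERN = r".*(?:/licenses/)?by(?P<share>-sa)?/(?P<version>\d).0/?$"

for s in (
    "https://creativecommons.org/licenses/by-sa/3.0/",  # matched by the old and new pattern
    "cc by-sa/3.0",                                     # only matches once /licenses/ is optional
):
    m = re.match(PATTERN, s.lower().strip())
    print(s, "->", None if m is None else (m.group("version"), m.group("share")))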
28 changes: 28 additions & 0 deletions licensed_pile/scrape.py
@@ -1,8 +1,36 @@
"""Shared Utilities related to scraping."""

import logging
from typing import Dict, Optional

import requests
from tenacity import retry, stop_after_attempt, wait_random_exponential

# A user agent that says we are compatible with most websites (most browsers
# start with Mozilla/5.0), identifies us as a bot, and includes a link for
# context on why we are scraping. We hope this fosters good will with site
# owners.
USER_AGENT = "Mozilla/5.0 (compatible; Licensed-Pile-bot/0.1; +http://www.github.com/r-three/licensed-pile)"

DEFAULT_HEADERS = {"User-Agent": USER_AGENT}


@retry(stop=stop_after_attempt(5), wait=wait_random_exponential(multiplier=1, max=30))
def get_page(
    url: str,
    params: Optional[Dict[str, str]] = None,
    headers: Optional[Dict[str, str]] = None,
):
    """Get a page, with retries on failure."""
    params = params if params is not None else {}
    headers = headers if headers else {}
    # Unpack the defaults first so the user provided ones can override them.
    headers = {**DEFAULT_HEADERS, **headers}
    resp = requests.get(url, params=params, headers=headers)
    logging.debug(f"Sending GET to {resp.url}")
    if resp.status_code != 200:
        logging.warning(
            f"Failed request to {resp.url}: {resp.status_code}, {resp.reason}"
        )
        raise RuntimeError(f"Failed request to {resp.url}")
    return resp
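A minimal usage sketch (the endpoint and extra header are hypothetical): caller-supplied headers are layered over DEFAULT_HEADERS, so the bot User-Agent is sent unless deliberately overridden, and tenacity retries the request up to five times with exponential backoff.

from licensed_pile.scrape import get_page

# Hypothetical MediaWiki API endpoint, used purely for illustration.
resp = get_page(
    "https://wiki.example.org/api.php",
    params={"action": "query", "format": "json"},
    headers={"Accept": "application/json"},  # merged on top of DEFAULT_HEADERS
)
print(resp.status_code, resp.url)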
7 changes: 5 additions & 2 deletions licensed_pile/xml.py
@@ -1,7 +1,8 @@
"""Tools to help with xml parsing."""

from typing import List
from xml.etree import ElementTree as ET

import lxml.etree as ET


def iterate_xml(path: str, tag: str):
@@ -18,7 +19,9 @@ def iterate_xml(path: str, tag: str):
    context = iter(context)
    event, root = next(context)
    for event, elem in context:
        if event == "end" and elem.tag == tag:
        # `.localname` only exists in lxml. Keep the `or` clause so you can
        # still do a full namespace-qualified tag match if you need to.
        if event == "end" and (ET.QName(elem.tag).localname == tag or elem.tag == tag):
            yield elem
            root.clear()

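A short usage sketch (the shard path and namespace URI are illustrative): lxml reports namespaced tags as `{uri}localname`, so the `ET.QName(...).localname` check lets callers pass the bare tag name.

from licensed_pile.xml import iterate_xml

# Made-up shard path; MediaWiki exports declare a default namespace, so lxml
# reports tags like "{http://www.mediawiki.org/xml/export-0.10/}page".
for page in iterate_xml("data/demowiki/export/00000-pages.xml", tag="page"):
    # Child tags are namespaced too, so match on the suffix / local name.
    title = [t for t in page if t.tag.endswith("title")][0].text
    print(title)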
7 changes: 6 additions & 1 deletion setup.py
@@ -58,5 +58,10 @@ def get_version(file_name: str, version_variable: str = "__version__") -> str:
    packages=find_packages(),
    python_requires=">=3.8",
    license="MIT",
    install_requires=["logging_json"],
    install_requires=[
        "logging_json",
        "requests",
        "tenacity",
        "lxml",
    ],
)
2 changes: 1 addition & 1 deletion wikiscrape/README.md
@@ -14,6 +14,6 @@ These steps are to be completed for each wiki that we are scraping.
* `UserTalk`: 3
Either the integer or the name can be used as input. This generates lists of page titles at `data/${wiki_name}/pages/${ns}.txt`.
3. Get the XML export of these pages with `python export_pages.py --wiki ${wiki_url}`. This gets XML exports of all the pages listed in the previous step. It currently fetches all revisions so that we can build a complete author list. This will create a sharded XML export at `data/${wiki_name}/export/${shard_idx}-pages.xml`. The `<text>` tag contains the wikimedia markup.

4. Convert the XML export into the dolma format with `python to-dolma.py --wiki ${wiki_url} --license ${license_str}`

**TODO:** Is this exported format exactly the same as the published MediaWiki dumps, to the point that we can reuse code?
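For reference, each exported page becomes one record in the dolma output; a sketch of the shape produced by `to-dolma.py` (all values below are invented for illustration):

# Illustrative record; values are made up, comments note where each field comes from.
{
    "id": "0-42",  # f"{namespace}-{page_id}"
    "text": "... wikitext of the latest revision ...",
    "source": "wikiscrape-https://wiki.example.org",
    "added": "2024-03-11T00:00:00",    # time the page was converted (UTC)
    "created": "2023-07-04T12:30:00",  # timestamp of the latest revision
    "metadata": {
        "license": "...",              # str(PermissiveLicenses.CC_BY_SA), for example
        "authors": [("Alice", "7"), ("Bob", "12")],  # (username, user id) pairs
        "url": "https://wiki.example.org/wiki/Some_Page",
        "wiki": "wiki.example.org",    # get_wiki_name(args.wiki)
        "namespace": "0",
        "title": "Some Page",
    },
}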
9 changes: 7 additions & 2 deletions wikiscrape/export_pages.py
@@ -31,6 +31,9 @@
parser.add_argument(
    "--page_limit", default=35, help="The max number of pages to export at once."
)
parser.add_argument(
    "--test_pages", default=None, type=int, help="The number of test pages to retrieve."
)
parser.add_argument(
    "--output_dir",
    help="Where to save the xml export. defaults to data/${wiki_name}/export/.",
@@ -93,9 +96,11 @@ def main(args):
    # with literal "{"'s.
    for i, j in enumerate(range(0, len(pages), args.page_limit)):
        xml = export_pages(args.wiki, pages[j : j + args.page_limit])
        dirname, filename = os.path.split(args.output)
        with open(os.path.join(dirname, f"{i:>05}-{filename}"), "w") as wf:
        with open(os.path.join(args.output_dir, f"{i:>05}-pages.xml"), "w") as wf:
            wf.write(xml)
        if args.test_pages and j > args.test_pages:
            print(f"Scraped {j + args.page_limit} pages, stopping for testing.")
            break


if __name__ == "__main__":
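The loop above slices the page list into `--page_limit`-sized chunks and writes one shard per chunk; a standalone sketch of the naming scheme (wiki name and page count are made up), matching the `${shard_idx}-pages.xml` pattern mentioned in the README:

import os

page_limit = 35
pages = [f"Page {n}" for n in range(100)]  # stand-ins for real page titles
output_dir = "data/demowiki/export"        # hypothetical output directory

for i, j in enumerate(range(0, len(pages), page_limit)):
    chunk = pages[j : j + page_limit]
    # Shards come out as 00000-pages.xml, 00001-pages.xml, ...
    print(os.path.join(output_dir, f"{i:>05}-pages.xml"), len(chunk))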
99 changes: 99 additions & 0 deletions wikiscrape/to-dolma.py
@@ -0,0 +1,99 @@
"""Convert a wikiscrape of media-wiki dump into the dolma format."""

import argparse
import datetime
import functools
import glob
import itertools
import os
import urllib.parse

from utils import get_wiki_name, wiki_url

from licensed_pile.licenses import PermissiveLicenses
from licensed_pile.write import to_dolma
from licensed_pile.xml import iterate_xmls

SOURCE_NAME = "wikiscrape"


parser = argparse.ArgumentParser(description="Convert the xml export to dolma.")
parser.add_argument("--wiki", required=True, help="The wiki url we are exporting.")
parser.add_argument("--license", required=True, help="The licenses this is under.")
parser.add_argument("--export", help="The location of the exported pages.")
parser.add_argument(
    "--output_dir",
    default=f"data/{SOURCE_NAME}/raw/documents/",
    help="Where the dolma formatted data goes.",
)
parser.add_argument(
    "--filename", default=None, help="The base filename for our wiki data."
)
parser.add_argument(
    "--shard_size", type=int, default=1, help="Size, in GB, for each shard."
)


def main(args):
    # Calculate defaults
    license = PermissiveLicenses.from_string(args.license)
    args.filename = (
        args.filename if args.filename else f"{get_wiki_name(args.wiki)}.jsonl.gz"
    )
    args.export = (
        args.export
        if args.export
        else os.path.join("data", get_wiki_name(args.wiki), "export", "*.xml")
    )

    # Our parser can ignore namespaces so just use `page`.
    pages = iterate_xmls(glob.iglob(args.export), tag="page")
    pages = map(
        functools.partial(
            format_dolma, source_name=SOURCE_NAME, wiki=args.wiki, license=license
        ),
        pages,
    )
    to_dolma(pages, args.output_dir, args.filename, args.shard_size)


def format_dolma(xml, source_name: str, wiki: str, license: PermissiveLicenses):
    revisions = [r for r in xml if r.tag.endswith("revision")]
    # TODO: Handle the case where this fails.
    text = [t for t in revisions[-1] if t.tag.endswith("text")][0].text
    page_namespace = [ns for ns in xml if ns.tag.endswith("ns")][0].text
    page_id = [pid for pid in xml if pid.tag.endswith("id")][0].text
    created = datetime.datetime.fromisoformat(
        [ts for ts in revisions[-1] if ts.tag.endswith("timestamp")][0].text
    ).replace(tzinfo=None)
    page_title = [t for t in xml if t.tag.endswith("title")][0].text

    contributors = set()
    for revision in revisions:
        contribs = [c for c in revision if c.tag.endswith("contributor")]
        # When there are multiple contributors, there are multiple <contributor>
        # xml items, each with a single username and id item.
        names = [u.text for c in contribs for u in c if u.tag.endswith("username")]
        # Save their id too in case they change their username.
        uid = [u.text for c in contribs for u in c if u.tag.endswith("id")]
        contributors.update(zip(names, uid))

    return {
        "id": f"{page_namespace}-{page_id}",
        "text": text,
        "source": f"{source_name}-{wiki}",
        "added": datetime.datetime.utcnow().isoformat(),
        "created": created.isoformat(),
        "metadata": {
            "license": str(license),
            "authors": sorted(contributors),
            "url": wiki_url(wiki, page_title),
            "wiki": get_wiki_name(wiki),
            "namespace": page_namespace,
            "title": page_title,
        },
    }


if __name__ == "__main__":
    args = parser.parse_args()
    main(args)
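A quick sanity-check sketch for format_dolma (assumes it is run from the wikiscrape/ directory with the licensed_pile package installed; the page XML, wiki URL, and expected output are invented for illustration):

import importlib.util

import lxml.etree as ET

from licensed_pile.licenses import PermissiveLicenses

# to-dolma.py has a hyphen in its name, so load it with importlib for this check.
spec = importlib.util.spec_from_file_location("to_dolma", "to-dolma.py")
to_dolma = importlib.util.module_from_spec(spec)
spec.loader.exec_module(to_dolma)

# A tiny hand-written <page> standing in for one entry of a real export.
page = ET.fromstring(
    "<page>"
    "<title>Main Page</title><ns>0</ns><id>42</id>"
    "<revision>"
    "<id>1</id><timestamp>2024-03-01T00:00:00+00:00</timestamp>"
    "<contributor><username>Alice</username><id>7</id></contributor>"
    "<text>Hello wiki</text>"
    "</revision>"
    "</page>"
)
record = to_dolma.format_dolma(
    page,
    source_name="wikiscrape",
    wiki="https://wiki.example.org",
    license=PermissiveLicenses.CC_BY,
)
print(record["id"], record["metadata"]["authors"])  # 0-42 [('Alice', '7')]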
24 changes: 10 additions & 14 deletions wikiscrape/utils.py
@@ -6,28 +6,18 @@

import requests
from bs4 import BeautifulSoup
from tenacity import retry, stop_after_attempt, wait_random_exponential

from licensed_pile.scrape import USER_AGENT
from licensed_pile import scrape

logging.basicConfig(
    level=logging.INFO,
    format="wikiscrape: [%(asctime)s] %(levelname)s - %(message)s",
)


@retry(stop=stop_after_attempt(5), wait=wait_random_exponential(multiplier=1, max=30))
def get_page(url: str, params: Optional[Dict[str, str]] = None):
"""Get page and parse into soup."""
params = params if params is not None else {}
resp = requests.get(url, params=params, headers={"User-Agent": USER_AGENT})
logging.debug(f"Sending GET to {resp.url}")
if resp.status_code != 200:
logging.warning(
f"Failed request to {resp.url}: {resp.status_code}, {resp.reason}"
)
raise RuntimeError(f"Failed request to {resp.url}")
return resp.text
def get_page(*args, **kwargs):
r = scrape.get_page(*args, **kwargs)
return r.text


def get_soup(text, parser="html.parser"):
@@ -48,3 +38,9 @@ def removeprefix(s: str, prefix: str) -> str:
    if s.startswith(prefix):
        return s[len(prefix) :]
    return s[:]


def wiki_url(base_url: str, title: str) -> str:
"""Create a wiki url from the wiki url and the page name."""
url = urllib.parse.urljoin(base_url, f"wiki/{title.replace(' ', '_')}")
return urllib.parse.quote(url, safe=":/")
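For example (wiki URL and titles are hypothetical), spaces become underscores and non-ASCII characters are percent-encoded, while ':' and '/' survive:

from utils import wiki_url

print(wiki_url("https://wiki.example.org/", "Main Page"))
# -> https://wiki.example.org/wiki/Main_Page
print(wiki_url("https://wiki.example.org/", "Café"))
# -> https://wiki.example.org/wiki/Caf%C3%A9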
