diff --git a/licensed_pile/licenses.py b/licensed_pile/licenses.py
index f1e9baf..1c2ca01 100644
--- a/licensed_pile/licenses.py
+++ b/licensed_pile/licenses.py
@@ -37,12 +37,12 @@ def from_string(cls, s: str) -> "PermissiveLicenses":
         s = s.lower().strip()
         if re.match(r".*/publicdomain/zero/1.0/?$", s):
             return cls.CC0
-        if m := re.match(r".*/licenses/by(?P<share>-sa)?/(?P<version>\d).0/?$", s):
+        if m := re.match(r".*(?:/licenses/)?by(?P<share>-sa)?/(?P<version>\d).0/?$", s):
             if m.group("version") == "4":
                 if m.group("share") is None:
                     return cls.CC_BY_SA
                 return cls.CC_BY
-            elif m.group(1) == "3":
+            elif m.group("version") == "3":
                 if m.group("share") is None:
                     return cls.CC_BY_SA_3
                 return cls.CC_BY_3
diff --git a/licensed_pile/scrape.py b/licensed_pile/scrape.py
index b4a4e7c..887388e 100644
--- a/licensed_pile/scrape.py
+++ b/licensed_pile/scrape.py
@@ -1,8 +1,36 @@
 """Shared Utilities related to scraping."""
 
+import logging
+from typing import Dict, Optional
+
+import requests
+from tenacity import retry, stop_after_attempt, wait_random_exponential
 
 # A user agent that says we are compatible with most websites (most browsers
 # start with Mozilla/5.0) and also tells that we are a bot and includes a link
 # for context on why we are scraping. We hope this fosters good will with site
 # owners.
 USER_AGENT = "Mozilla/5.0 (compatible; Licensed-Pile-bot/0.1; +http://www.github.com/r-three/licensed-pile)"
+
+DEFAULT_HEADERS = {"User-Agent": USER_AGENT}
+
+
+@retry(stop=stop_after_attempt(5), wait=wait_random_exponential(multiplier=1, max=30))
+def get_page(
+    url: str,
+    params: Optional[Dict[str, str]] = None,
+    headers: Optional[Dict[str, str]] = None,
+):
+    """GET a page, retrying on failure, and return the response."""
+    params = params if params is not None else {}
+    headers = headers if headers else {}
+    # Unpack the defaults first so the user-provided headers can override them.
+    headers = {**DEFAULT_HEADERS, **headers}
+    resp = requests.get(url, params=params, headers=headers)
+    logging.debug(f"Sending GET to {resp.url}")
+    if resp.status_code != 200:
+        logging.warning(
+            f"Failed request to {resp.url}: {resp.status_code}, {resp.reason}"
+        )
+        raise RuntimeError(f"Failed request to {resp.url}")
+    return resp
diff --git a/licensed_pile/xml.py b/licensed_pile/xml.py
index 8a08a82..798f1f6 100644
--- a/licensed_pile/xml.py
+++ b/licensed_pile/xml.py
@@ -1,7 +1,8 @@
 """Tools to help with xml parsing."""
 
 from typing import List
-from xml.etree import ElementTree as ET
+
+import lxml.etree as ET
 
 
 def iterate_xml(path: str, tag: str):
@@ -18,7 +19,9 @@ def iterate_xml(path: str, tag: str):
     context = iter(context)
     event, root = next(context)
     for event, elem in context:
-        if event == "end" and elem.tag == tag:
+        # `.localname` only exists for lxml. The plain `elem.tag == tag` check
+        # is kept so you can still do a full namespace match if you need to.
+        if event == "end" and (ET.QName(elem.tag).localname == tag or elem.tag == tag):
             yield elem
             root.clear()
 
diff --git a/setup.py b/setup.py
index 752a273..98079f2 100644
--- a/setup.py
+++ b/setup.py
@@ -58,5 +58,10 @@ def get_version(file_name: str, version_variable: str = "__version__") -> str:
     packages=find_packages(),
     python_requires=">=3.8",
     license="MIT",
-    install_requires=["logging_json"],
+    install_requires=[
+        "logging_json",
+        "requests",
+        "tenacity",
+        "lxml",
+    ],
 )
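For orientation, here is a minimal sketch of how the two new shared helpers fit together. The URL, query parameters, and file path below are hypothetical; only `get_page` and `iterate_xml` come from the diffs above.

```python
from licensed_pile.scrape import get_page
from licensed_pile.xml import iterate_xml

# get_page retries up to 5 times with exponential backoff and sends the
# Licensed-Pile user agent by default; caller-supplied headers override it.
resp = get_page("https://example-wiki.org/api.php", params={"action": "query"})
print(resp.status_code)

# iterate_xml streams elements from a (possibly namespaced) export without
# loading the whole file into memory; tag matching is on the local name.
for page in iterate_xml("data/example_wiki/export/00000-pages.xml", tag="page"):
    title = [t for t in page if t.tag.endswith("title")][0].text
    print(title)
```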
diff --git a/wikiscrape/README.md b/wikiscrape/README.md
index 99212d8..99e2a22 100644
--- a/wikiscrape/README.md
+++ b/wikiscrape/README.md
@@ -14,6 +14,6 @@ These steps are to be completed for each wiki that we are scraping.
    * `UserTalk`: 3
   Either the integer or the name can be used as input. This generates lists of page titles at `data/${wiki_name}/pages/${ns}.txt`.
 3. Get the XML export of these pages with `python export_pages.py --wiki ${wiki_url}`. This gets XML exports of all of the pages. It currently fetches all revisions so that we can build a complete author list. This will create a sharded xml export at `data/${wiki_name}/export/${shard_idx}-pages.xml`. The `<text>` tag contains the wikimedia markup.
-
+4. Convert the XML export into the dolma format with `python to-dolma.py --wiki ${wiki_url} --license ${license_str}`.
 
 **TODO:** Is this exported format the exact same as the published mediawiki dumps to the point we can reuse code?
diff --git a/wikiscrape/export_pages.py b/wikiscrape/export_pages.py
index 363a7d9..915195a 100644
--- a/wikiscrape/export_pages.py
+++ b/wikiscrape/export_pages.py
@@ -31,6 +31,9 @@
 parser.add_argument(
     "--page_limit", default=35, help="The max number of pages to export at once."
 )
+parser.add_argument(
+    "--test_pages", default=None, type=int, help="The number of test pages to retrieve."
+)
 parser.add_argument(
     "--output_dir",
     help="Where to save the xml export. defaults to data/${wiki_name}/export/.",
@@ -93,9 +96,11 @@ def main(args):
     # with literal "{"'s.
     for i, j in enumerate(range(0, len(pages), args.page_limit)):
         xml = export_pages(args.wiki, pages[j : j + args.page_limit])
-        dirname, filename = os.path.split(args.output)
-        with open(os.path.join(dirname, f"{i:>05}-{filename}"), "w") as wf:
+        with open(os.path.join(args.output_dir, f"{i:>05}-pages.xml"), "w") as wf:
             wf.write(xml)
+        if args.test_pages and j > args.test_pages:
+            print(f"Scraped {j + args.page_limit} pages, stopping for testing.")
+            break
 
 
 if __name__ == "__main__":
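For reference, a single record produced by the new `to-dolma.py` (next file) is shaped roughly as sketched here. Every value is made up for illustration, and the exact `license` and `wiki` strings depend on `str(PermissiveLicenses...)` and `get_wiki_name`, which are not shown in this patch.

```python
# Illustrative only: the shape of one JSON line written by to-dolma.py.
record = {
    "id": "0-12345",                    # "{namespace}-{page_id}"
    "text": "'''Example''' wikitext of the latest revision ...",
    "source": "wikiscrape-https://example-wiki.org",
    "added": "2024-01-01T00:00:00",     # when we converted the page
    "created": "2023-06-01T12:00:00",   # timestamp of the latest revision
    "metadata": {
        "license": "...",               # str(PermissiveLicenses.CC_BY_SA), exact form not shown here
        "authors": [["ExampleUser", "42"]],  # (username, id) pairs
        "url": "https://example-wiki.org/wiki/Example_Page",
        "wiki": "example-wiki.org",     # get_wiki_name(wiki), assumed
        "namespace": "0",
        "title": "Example Page",
    },
}
```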
diff --git a/wikiscrape/to-dolma.py b/wikiscrape/to-dolma.py
new file mode 100644
index 0000000..6d223db
--- /dev/null
+++ b/wikiscrape/to-dolma.py
@@ -0,0 +1,100 @@
+"""Convert a wikiscrape of a media-wiki dump into the dolma format."""
+
+import argparse
+import datetime
+import functools
+import glob
+import itertools
+import os
+import urllib.parse
+
+from utils import get_wiki_name, wiki_url
+
+from licensed_pile.licenses import PermissiveLicenses
+from licensed_pile.write import to_dolma
+from licensed_pile.xml import iterate_xmls
+
+SOURCE_NAME = "wikiscrape"
+
+
+parser = argparse.ArgumentParser(description="Convert the xml export to dolma.")
+parser.add_argument("--wiki", required=True, help="The wiki url we are exporting.")
+parser.add_argument("--license", required=True, help="The license this is under.")
+parser.add_argument("--export", help="The location of the exported pages.")
+parser.add_argument(
+    "--output_dir",
+    default=f"data/{SOURCE_NAME}/raw/documents/",
+    help="Where the dolma formatted data goes.",
+)
+parser.add_argument(
+    "--filename", default=None, help="The base filename for our wiki data."
+)
+parser.add_argument(
+    "--shard_size", type=int, default=1, help="Size, in GB, for each shard."
+)
+
+
+def main(args):
+    # Calculate defaults
+    license = PermissiveLicenses.from_string(args.license)
+    args.filename = (
+        args.filename if args.filename else f"{get_wiki_name(args.wiki)}.jsonl.gz"
+    )
+    args.export = (
+        args.export
+        if args.export
+        else os.path.join("data", get_wiki_name(args.wiki), "export", "*.xml")
+    )
+
+    # Our parser can ignore namespaces so we can just use `page`.
+    pages = iterate_xmls(glob.iglob(args.export), tag="page")
+    pages = map(
+        functools.partial(
+            format_dolma, source_name=SOURCE_NAME, wiki=args.wiki, license=license
+        ),
+        pages,
+    )
+    to_dolma(pages, args.output_dir, args.filename, args.shard_size)
+
+
+def format_dolma(xml, source_name: str, wiki: str, license: PermissiveLicenses):
+    revisions = [r for r in xml if r.tag.endswith("revision")]
+    # TODO: Handle the case where this lookup fails.
+    text = [t for t in revisions[-1] if t.tag.endswith("text")][0].text
+    page_namespace = [ns for ns in xml if ns.tag.endswith("ns")][0].text
+    page_id = [pid for pid in xml if pid.tag.endswith("id")][0].text
+    created = datetime.datetime.fromisoformat(
+        [ts for ts in revisions[-1] if ts.tag.endswith("timestamp")][0].text
+    ).replace(tzinfo=None)
+    page_title = [t for t in xml if t.tag.endswith("title")][0].text
+
+    contributors = set()
+    for revision in revisions:
+        contribs = [c for c in revision if c.tag.endswith("contributor")]
+        # When there are multiple contributors, there are multiple contributor
+        # xml items where each one has a single username and id item.
+        names = [u.text for c in contribs for u in c if u.tag.endswith("username")]
+        # Save their id too in case they change their username.
+        uid = [u.text for c in contribs for u in c if u.tag.endswith("id")]
+        contributors.update(zip(names, uid))
+
+    return {
+        "id": f"{page_namespace}-{page_id}",
+        "text": text,
+        "source": f"{source_name}-{wiki}",
+        "added": datetime.datetime.utcnow().isoformat(),
+        "created": created.isoformat(),
+        "metadata": {
+            "license": str(license),
+            "authors": sorted(contributors),
+            "url": wiki_url(wiki, page_title),
+            "wiki": get_wiki_name(wiki),
+            "namespace": page_namespace,
+            "title": page_title,
+        },
+    }
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    main(args)
diff --git a/wikiscrape/utils.py b/wikiscrape/utils.py
index c10968c..eb6f958 100644
--- a/wikiscrape/utils.py
+++ b/wikiscrape/utils.py
@@ -6,9 +6,8 @@
 
 import requests
 from bs4 import BeautifulSoup
-from tenacity import retry, stop_after_attempt, wait_random_exponential
 
-from licensed_pile.scrape import USER_AGENT
+from licensed_pile import scrape
 
 logging.basicConfig(
     level=logging.INFO,
@@ -16,18 +15,9 @@
 )
 
 
-@retry(stop=stop_after_attempt(5), wait=wait_random_exponential(multiplier=1, max=30))
-def get_page(url: str, params: Optional[Dict[str, str]] = None):
-    """Get page and parse into soup."""
-    params = params if params is not None else {}
-    resp = requests.get(url, params=params, headers={"User-Agent": USER_AGENT})
-    logging.debug(f"Sending GET to {resp.url}")
-    if resp.status_code != 200:
-        logging.warning(
-            f"Failed request to {resp.url}: {resp.status_code}, {resp.reason}"
-        )
-        raise RuntimeError(f"Failed request to {resp.url}")
-    return resp.text
+def get_page(*args, **kwargs):
+    r = scrape.get_page(*args, **kwargs)
+    return r.text
 
 
 def get_soup(text, parser="html.parser"):
@@ -48,3 +38,9 @@ def removeprefix(s: str, prefix: str) -> str:
     if s.startswith(prefix):
         return s[len(prefix) :]
     return s[:]
+
+
+def wiki_url(base_url: str, title: str) -> str:
+    """Create a page url from the base wiki url and the page title."""
+    url = urllib.parse.urljoin(base_url, f"wiki/{title.replace(' ', '_')}")
+    return urllib.parse.quote(url, safe=":/")
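Finally, a quick sketch of how the new `wiki_url` helper behaves; the base URL and page title are hypothetical.

```python
from utils import wiki_url  # wikiscrape/utils.py, as imported by to-dolma.py

# Spaces in the title become underscores, and quote() percent-encodes the rest
# while keeping ":" and "/" so the URL structure survives.
print(wiki_url("https://example-wiki.org/", "Café page"))
# -> https://example-wiki.org/wiki/Caf%C3%A9_page
```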