Tooling to download and process Wikis
Add tools to scrape MediaWiki wikis that don't publish dumps.

Add a tool that exports the XML based on the list of pages.

Add the ability to convert wikis to the dolma format.
blester125 committed Jun 3, 2024
1 parent 1ef9a1c commit 05d64f2
Showing 28 changed files with 1,818 additions and 10 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -160,3 +160,6 @@ cython_debug/
#.idea/
.python-version
**/licensed_pile_log.txt

node_modules
package-lock.json
39 changes: 34 additions & 5 deletions licensed_pile/licenses.py
@@ -9,6 +9,8 @@ def __str__(self):
return self.value


# TODO: With all the different versions that are out in the wild, this flat enum
# is getting hard to use. We should re-think how to do this.
class PermissiveLicenses(StringEnum):
"""By 'Permissive' we mean licenses that are in the Gold, Silver, or Bronze
lists of the Blue Oak Council (https://blueoakcouncil.org/list), even if
@@ -17,15 +19,24 @@ class PermissiveLicenses(StringEnum):

PD = "Public Domain"
CC0 = "Creative Commons Zero - Public Domain - https://creativecommons.org/publicdomain/zero/1.0/"
CC_PDM = "Creative Commons Public Domain Mark - https://creativecommons.org/publicdomain/mark/1.0/"
CC_BY = (
"Creative Commons - Attribution - https://creativecommons.org/licenses/by/4.0/"
)
CC_BY_3 = (
"Creative Commons - Attribution - https://creativecommons.org/licenses/by/3.0/"
)
CC_BY_2_5 = (
"Creative Commons - Attribution - https://creativecommons.org/licenses/by/2.5/"
)
CC_BY_2 = (
"Creative Commons - Attribution - https://creativecommons.org/licenses/by/2.0/"
)
CC_BY_SA = "Creative Commons - Attribution Share-Alike - https://creativecommons.org/licenses/by-sa/4.0/"
CC_BY_SA_3 = "Creative Commons - Attribution Share-Alike - https://creativecommons.org/licenses/by-sa/3.0/"
CC_BY_SA_2_5 = "Creative Commons - Attribution Share-Alike - https://creativecommons.org/licenses/by-sa/2.5/"
CC_BY_SA_2_1 = "Creative Commons - Attribution Share-Alike - https://creativecommons.org/licenses/by-sa/2.1/"
CC_BY_SA_1 = "Creative Commons - Attribution Share-Alike - https://creativecommons.org/licenses/by-sa/1.0/"
GFDL = "GNU Free Documentation License"
APACHE_2 = "Apache 2 License - https://www.apache.org/licenses/LICENSE-2.0"
MIT = "MIT License"
@@ -49,17 +60,35 @@ def from_string(cls, s: str) -> "PermissiveLicenses":
s = s.lower().strip()
if re.match(r".*/publicdomain/zero/1.0/?$", s):
return cls.CC0
if m := re.match(r".*/licenses/by(?P<share>-sa)?/(?P<version>\d).0/?$", s):
if m.group("version") == "4":
if m.group("share") is None:
if re.match(r".*/publicdomain/mark/1.0/?$", s):
return cls.CC_PDM
if re.match(r".*/publicdomain/.*", s):
return cls.PD
if m := re.search(r"(?:/licenses/)?by(?P<share>-sa)?/(?P<version>\d.\d)/?", s):
if m.group("version") == "4.0":
if m.group("share") is not None:
return cls.CC_BY_SA
return cls.CC_BY
elif m.group(1) == "3":
if m.group("share") is None:
elif m.group("version") == "3.0":
if m.group("share") is not None:
return cls.CC_BY_SA_3
return cls.CC_BY_3
elif m.group("version") == "2.5":
if m.group("share") is not None:
return cls.CC_BY_SA_2_5
return cls.CC_BY_2_5
elif m.group("version") == "2.1":
if m.group("share") is not None:
return cls.CC_BY_SA_2_1
elif m.group("version") == "2.0":
return cls.CC_BY_2
elif m.group("version") == "1.0":
if m.group("share") is not None:
return cls.CC_BY_SA_1
else:
raise ValueError(f"Unable to understand license {s}")
if s == "gfdl" or "gnu_free_documentation_license" in s:
return cls.GFDL
raise ValueError(f"Unable to understand license {s}")
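
For context, a quick usage sketch of the updated `from_string` parser (not part of the commit; the URLs are just illustrative examples of the path shapes it matches):

```python
from licensed_pile.licenses import PermissiveLicenses

# Share-alike licenses map to the CC_BY_SA_* member for their version.
assert (
    PermissiveLicenses.from_string("https://creativecommons.org/licenses/by-sa/3.0/")
    is PermissiveLicenses.CC_BY_SA_3
)
# The public domain mark now has its own member instead of falling through.
assert (
    PermissiveLicenses.from_string("https://creativecommons.org/publicdomain/mark/1.0/")
    is PermissiveLicenses.CC_PDM
)
```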


1 change: 1 addition & 0 deletions licensed_pile/scrape.py
@@ -29,6 +29,7 @@ def get_page(
resp = requests.get(url, params=params, headers=headers)
logging.debug(f"Sending GET to {resp.url}")
if resp.status_code != 200:
# TODO: Update logger
logging.warning(
f"Failed request to {resp.url}: {resp.status_code}, {resp.reason}"
)
14 changes: 12 additions & 2 deletions licensed_pile/xml.py
@@ -1,6 +1,8 @@
"""Tools to help with xml parsing."""

from xml.etree import ElementTree as ET
from typing import List

import lxml.etree as ET


def iterate_xml(path: str, tag: str):
@@ -17,6 +19,14 @@ def iterate_xml(path: str, tag: str):
context = iter(context)
event, root = next(context)
for event, elem in context:
if event == "end" and elem.tag == tag:
# This `.localname` only exists for lxml. The plain `elem.tag` comparison is
# kept so you can still do a full-namespace match if you need to.
if event == "end" and (ET.QName(elem.tag).localname == tag or elem.tag == tag):
yield elem
root.clear()


def iterate_xmls(paths: List[str], tag: str):
"""Iterable version of parsing multiple xml files with the same structure as a single iterator."""
for path in paths:
yield from iterate_xml(path, tag)
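
Not part of the commit, but a rough usage sketch of these helpers (assuming a MediaWiki export saved locally as `dump.xml`; the namespaced `<page>`/`<title>` layout is the usual export format, not something this module enforces):

```python
import lxml.etree as ET

from licensed_pile.xml import iterate_xml

# Stream <page> elements without loading the whole dump into memory.
for page in iterate_xml("dump.xml", "page"):
    title = None
    for child in page:
        # Child tags carry the export namespace too, so compare local names.
        if isinstance(child.tag, str) and ET.QName(child.tag).localname == "title":
            title = child.text
            break
    print(title)
```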
8 changes: 5 additions & 3 deletions requirements.txt
@@ -1,16 +1,18 @@
beautifulsoup4
charset_normalizer
datasets
dolma
google-cloud-storage
internetarchive
logging_json
markdown-it-py
pandas
patool
pre-commit
pyunpack
rdflib
requests>=2.13
smart_open
tenacity
pandas
jsonlines
datasets
tqdm
ultimate-sitemap-parser
1 change: 1 addition & 0 deletions setup.py
@@ -62,5 +62,6 @@ def get_version(file_name: str, version_variable: str = "__version__") -> str:
"logging_json",
"requests>=2.13",
"tenacity",
"lxml",
],
)
23 changes: 23 additions & 0 deletions wiki/README.md
@@ -0,0 +1,23 @@
# Wiki

## Notes

The following scanners output a `.history.xml` file to parse:
* "Internet Archive HTML5 Uploader ...": seems to ship `.7z` archives.
* "wikiteam3 (v...)": these are released as zstandard-compressed files.
* Official Wikipedia Dumps
* "Internet Archive Python library ..." >= 1.0.4


The following use the old format:
* "Internet Archive Python library 0.X.X": distributed as a zip file; you need to create a new directory with `unzip -d` when extracting.


The archive URL can be created with `f"archive.org/details/{item_id}"`


Some of the items have multiple uploads. For example, `wiki-kris159shoutwikicom_w` has multiple history files, so we need to parse out the date and pick the most recent one, i.e., `kris159shoutwikicom_w-20180506-history-xml.7z` over `kris159shoutwikicom_w-20140129-history.xml.7z`.
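
A minimal sketch of that selection logic (not from the repo; it assumes file names follow the `<wiki>-<YYYYMMDD>-history*.7z` pattern shown above):

```python
import re

def most_recent_history(file_names):
    """Pick the newest history dump based on the YYYYMMDD stamp in the name."""
    dated = []
    for name in file_names:
        if m := re.search(r"-(\d{8})-history", name):
            dated.append((m.group(1), name))
    # YYYYMMDD strings sort chronologically, so max() picks the newest dump.
    return max(dated)[1] if dated else None

print(most_recent_history([
    "kris159shoutwikicom_w-20140129-history.xml.7z",
    "kris159shoutwikicom_w-20180506-history-xml.7z",
]))  # -> kris159shoutwikicom_w-20180506-history-xml.7z
```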

## Special Cases

Shout Wiki, WikiTravelAllLanguages
Empty file added wiki/__init__.py
Empty file.
1 change: 1 addition & 0 deletions wiki/archive/.gitignore
@@ -0,0 +1 @@
data*/*
23 changes: 23 additions & 0 deletions wiki/archive/README.md
@@ -0,0 +1,23 @@
# Wiki Dumps from the Internet Archive

We need to download 4.4 TB from the Internet Archive.

With a single gigabit connection it would take roughly 9 hours 40 minutes to download (see the table below).

Anecdotal reports suggest the Internet Archive generally serves between 1 Mb/s and 10 Mb/s per connection, and the longer a download runs, the less bandwidth they give you.

| Bandwidth | Hosts | Time to DL |
|----------|------:|-----------:|
| 1 Gb/s | 1 | 9h 40m |
| | 4 | 2.3h |
| | 10 | 0.9h |
| 10 Mb/s | 1 | 40d 17h |
| | 4 | 10d + |
| | 10 | 4d + |
| 1 Mb/s | 1 | 407d 9h |
| | 4 | 101d + |
| | 10 | 40d+ |
| | 100 | 4d+ |
| | 500 | 0.8d |

We really need hardware-based parallelism, i.e., multiple hosts downloading in parallel.
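
The table above is just total size divided by aggregate bandwidth; a quick back-of-the-envelope check (assuming the full 4.4 TB and that bandwidth scales linearly with the number of hosts):

```python
TOTAL_BITS = 4.4e12 * 8  # 4.4 TB expressed in bits

def days_to_download(bits_per_second: float, hosts: int = 1) -> float:
    """Time to move TOTAL_BITS at the given per-host bandwidth."""
    return TOTAL_BITS / (bits_per_second * hosts) / 86400

print(f"{days_to_download(1e9) * 24:.1f} hours at 1 Gb/s with 1 host")      # ~9.8 hours
print(f"{days_to_download(10e6):.1f} days at 10 Mb/s with 1 host")          # ~40.7 days
print(f"{days_to_download(1e6, hosts=100):.1f} days at 1 Mb/s, 100 hosts")  # ~4.1 days
```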
Empty file added wiki/archive/__init__.py
Empty file.
135 changes: 135 additions & 0 deletions wiki/archive/download_archive.py
@@ -0,0 +1,135 @@
"""Download wiki dumps from the internet archive."""

import argparse
import functools
import json
import multiprocessing.dummy as mp
import os
import random

import internetarchive
import pyunpack
import utils

from licensed_pile import logs

parser = argparse.ArgumentParser(
description="Download wiki dumps from the internet archive."
)
parser.add_argument("--wiki_metadata", default="data/ia-wikis.jsonl")
parser.add_argument("--test_run", type=int, help="")
parser.add_argument("--num_threads", type=int, default=32, help="")
parser.add_argument("--worker_id", type=int, required=True, help="")
parser.add_argument("--num_workers", type=int, required=True, help="")


# TODO: Default downloading to .../dumps
def download_and_extract(
ident: str,
dl_file,
output_dir: str = "/fruitbasket/users/bdlester/projects/licensed_pile/wiki/archive/data/dumps",
verbose: bool = False,
):
logger = logs.get_logger("wiki/archive")
dest = os.path.join(output_dir, ident)
if os.path.exists(dest):
logger.info(
f"Skipping download of {dl_file['name']} for {ident} as {dest} already exists on disk."
)
return dest
logger.info(f"Downloading {dl_file['name']} for {ident}.")
internetarchive.download(
ident, checksum=True, verbose=verbose, files=dl_file["name"], destdir=output_dir
)
logger.info(f"Extracting download for {ident} to {dest}.")
pyunpack.Archive(os.path.join(dest, dl_file["name"])).extractall(dest)
return dest


def download_ia(wiki):
logger = logs.get_logger("wiki/archive")
if (ident := wiki["metadata"]["identifier"]) in utils.KNOWN_BAD:
logger.warning(f"Skipping {ident} as it is listed under utils.KNOWN_BAD")
return None
dl_file = utils.find_download(wiki)
return download_and_extract(ident, dl_file)


def download_fandom(wiki):
logger = logs.get_logger("wiki/archive")
logger.warning(f"Fandom downloads not implemented yet, downloading from IA.")
return download_ia(wiki)


def download_wikimedia(wiki):
logger = logs.get_logger("wiki/archive")
logger.warning(f"Wikimedia downloads not implemented yet, downloading from IA.")
return download_ia(wiki)


def scrape_wiki(wiki):
logger = logs.get_logger("wiki/archive")
logger.warning(f"Wiki Re-scrapes not implemented yet, downloading from IA.")
return download_ia(wiki)


def process_wiki(i, wiki, offset):
logger = logs.get_logger("wiki/archive")
if "metadata" not in wiki:
logger.error(f"Metadata missing from line {i}, malformed record")
return None
ident = wiki["metadata"]["identifier"]
if not utils.filter_language(wiki["metadata"].get("language")):
lang = wiki["metadata"].get("language")
logger.warning(f"{ident} appears to not be english, found: {lang}")
return None
if not utils.check_alive(wiki):
logger.info(f"{ident} is offline, getting dump from IA.")
return download_ia(wiki)
if not utils.verify_license(wiki):
logger.error(f"The IA license for {ident} doesn't match the source.")
return None
if utils.check_fandom(wiki):
logger.info(f"{ident} is a fandom wiki, downloading dump from there.")
return download_fandom(wiki)
if utils.check_wikimedia(wiki):
logger.info(f"{ident} is a WikiMedia wiki, downloading dump from there.")
return download_wikimedia(wiki)
if utils.check_out_of_date(wiki, offset):
logger.warning(f"IA dump for {ident} is very out of date, re-scraping.")
return scrape_wiki(wiki)


# TODO: configure dest_dir
def main(args):
logger = logs.get_logger("wiki/archive")
logger.info(f"Reading wiki metadata from {args.wiki_metadata}")
with open(args.wiki_metadata) as f:
wiki_metadata = [json.loads(l) for l in f if l]
logger.info(f"{len(wiki_metadata)} wikis to download.")

if args.test_run:
logger.info(f"Test Run: Only downloading {args.test_run} wikis")
random.shuffle(wiki_metadata)
wiki_metadata = wiki_metadata[: args.test_run]

wiki_metadata = [
w for i, w in enumerate(wiki_metadata) if i % args.num_workers == args.worker_id
]
logger.info(
f"{len(wiki_metadata)} wikis to download as {args.worker_id}/{args.num_workers}."
)

# f = functools.partial(process_wiki, offset=None)
# [f(*w) for w in enumerate(wiki_metadata)]

with mp.Pool(args.num_threads) as pool:
pool.starmap(
functools.partial(process_wiki, offset=None), enumerate(wiki_metadata)
)


if __name__ == "__main__":
args = parser.parse_args()
logs.configure_logging("wiki/archive")
main(args)
65 changes: 65 additions & 0 deletions wiki/archive/get_metadata.py
@@ -0,0 +1,65 @@
"""Download wiki dump metadata from the internet archive.
The licenseurl regexes we are using to search are mutually exclusive, so we can
split the query into multiple chunks instead of `OR`ing them together to get some
parallelism out of the metadata scrape.
"""

import argparse
import functools
import json
import multiprocessing.dummy as mp
import os

import internetarchive

from licensed_pile import logs
from licensed_pile.licenses import PermissiveLicenses

parser = argparse.ArgumentParser(
description="Download metadata for wiki dumps from the IA."
)
parser.add_argument("--output_dir", default="data/metadata/", help="")
parser.add_argument("--file_name", default="ia-wiki-metadata.jsonl")
# TODO: Respect these
parser.add_argument("--include_wikicollections", action="store_true", help="")
parser.add_argument("--licenses", choices=[], action="append", help="")


def get_metadata(idx: int, query: str, file_name: str, output_dir: str):
"""Fetch item metadata from IA using query and save it to disk."""
with open(os.path.join(output_dir, f"{idx:>05}_{file_name}"), "w") as wf:
# iter_as_items() yields full Item objects so we can access .item_metadata.
for item in internetarchive.search_items(query).iter_as_items():
wf.write(json.dumps(item.item_metadata) + "\n")


def make_queries(licenses, include_wikicollections):
if include_wikicollections:
raise NotImplementedError("...")
license_regexs = licenses
for license_regex in license_regexs:
yield f"collection:(wikiteam) AND licenseurl:({license_regex})"


def main(args):
# TODO have something that translates from the PermissiveLicense Enum to regex's
if args.licenses is None:
args.licenses = [
"*\/by\/*",
"*\/by-sa\/*",
"*publicdomain*",
"*GNU_Free_Documentation_License*",
]
queries = list(make_queries(args.licenses, args.include_wikicollections))
with mp.Pool(len(queries)) as pool:
pool.starmap(
functools.partial(
get_metadata, file_name=args.file_name, output_dir=args.output_dir
),
enumerate(queries),
)


if __name__ == "__main__":
args = parser.parse_args()
logs.configure_logging("wiki/archive")
main(args)
