diff --git a/licensed_pile/licenses.py b/licensed_pile/licenses.py
index 1c2ca01..64c499a 100644
--- a/licensed_pile/licenses.py
+++ b/licensed_pile/licenses.py
@@ -48,6 +48,8 @@ def from_string(cls, s: str) -> "PermissiveLicenses":
                 return cls.CC_BY_3
         else:
             raise ValueError(f"Unable to understand license {s}")
+        if s == "gfdl":
+            return cls.GFDL
         raise ValueError(f"Unable to understand license {s}")


diff --git a/wikiscrape/export_pages.py b/wikiscrape/export_pages.py
index 915195a..7315c97 100644
--- a/wikiscrape/export_pages.py
+++ b/wikiscrape/export_pages.py
@@ -19,6 +19,8 @@

 from utils import get_page, get_wiki_name

+from licensed_pile import logs
+
 parser = argparse.ArgumentParser(description="Export mediawikis as XML")
 parser.add_argument("--wiki", required=True, help="The wiki url we are exporting.")
 parser.add_argument(
@@ -67,11 +69,13 @@ def read_page_titles(filename: str) -> List[str]:
 def main(args):
     if args.listauthors is not None:
         raise NotImplementedError("--listauthors is current not implemented.")
+    logger = logs.get_logger("wikiscrape")
     args.pages = (
         args.pages
         if args.pages is not None
         else [os.path.join("data", get_wiki_name(args.wiki), "pages")]
     )
+    logger.info("Enumerating pages from %s", args.pages)
     pages = []
     for page in args.pages:
         if os.path.exists(page) and os.path.isdir(page):
@@ -79,6 +83,7 @@ def main(args):
                 pages.extend(read_page_titles(f))
         else:
             pages.extend(read_page_titles(page))
+    logger.info("There are %d pages to export.", len(pages))

     args.output_dir = (
         args.output_dir
@@ -86,6 +91,7 @@ def main(args):
         else os.path.join("data", get_wiki_name(args.wiki), "export")
     )
     os.makedirs(args.output_dir, exist_ok=True)
+    logger.info("Saving export to %s", args.output_dir)

     # Save shards of exported pages to
     # data/${wiki_name}/export/${shard_idx}-pages.xml
@@ -99,10 +105,11 @@ def main(args):
         with open(os.path.join(args.output_dir, f"{i:>05}-pages.xml"), "w") as wf:
             wf.write(xml)
         if args.test_pages and j > args.test_pages:
-            print(f"Scraped {j + args.page_limit} pages, stopping for testing.")
+            logger.info(f"Scraped {j + args.page_limit} pages, stopping for testing.")
             break


 if __name__ == "__main__":
     args = parser.parse_args()
+    logs.configure_logging("wikiscrape")
     main(args)
diff --git a/wikiscrape/get_namespaces.py b/wikiscrape/get_namespaces.py
index 7bd6405..c9c2676 100644
--- a/wikiscrape/get_namespaces.py
+++ b/wikiscrape/get_namespaces.py
@@ -2,13 +2,14 @@

 import argparse
 import json
-import logging
 import os
 import urllib.parse
 from typing import Dict

 from utils import get_page, get_soup, get_wiki_name

+from licensed_pile import logs
+
 parser = argparse.ArgumentParser(description="Find all namespaces in a mediawiki wiki.")
 parser.add_argument("--wiki", required=True, help="The Url for the wiki in question.")
 parser.add_argument(
@@ -19,7 +20,8 @@

 def find_namespaces(wiki_url: str) -> Dict[int, str]:
     options = {}
-    logging.info(f"Finding all namespaces from {args.wiki}")
+    logger = logs.get_logger("wikiscrape")
+    logger.info(f"Finding all namespaces from {args.wiki}")
     # Even though they recomment using the index.php?title=PAGETITLE url for a lot
     # of things (with the /wiki/ being for readers), we use it here to start looking
     # for pages because it is more consistent (some wiki's want /w/index.php and
@@ -47,4 +49,5 @@ def main(args):

 if __name__ == "__main__":
     args = parser.parse_args()
+    logs.configure_logging("wikiscrape")
     main(args)
diff --git a/wikiscrape/list_pages.py b/wikiscrape/list_pages.py
index f0bd99f..cd1da9d 100644
--- a/wikiscrape/list_pages.py
+++ b/wikiscrape/list_pages.py
@@ -2,7 +2,6 @@

 import argparse
 import json
-import logging
 import os
 import urllib.parse
 from typing import List
@@ -10,6 +9,8 @@
 from requests.models import PreparedRequest
 from utils import get_page, get_soup, get_wiki_name, removeprefix

+from licensed_pile import logs
+
 parser = argparse.ArgumentParser(
     description="Find all pages under a namespace for a mediawiki."
 )
@@ -29,7 +30,8 @@

 def enumerate_namespace(wiki_url: str, namespace: int) -> List[str]:
     """Collect all pages of a wiki from within a namespace."""
-    logging.info(f"Finding all pages under the {namespace} namespace from {wiki_url}")
+    logger = logs.get_logger("wikiscrape")
+    logger.info(f"Finding all pages under the {namespace} namespace from {wiki_url}")
     # Even though they recomment using the index.php?title=PAGETITLE url for a lot
     # of things (with the /wiki/ being for readers), we use it here to start looking
     # for pages because it is more consistent (some wiki's want /w/index.php and
@@ -54,7 +56,8 @@ def _enumerate_namespace(url: str, wiki_url: str, pages: List[str]) -> List[str]
            this wouldn't need to be a parameter.
        pages: The current list of pages we are building.
     """
-    logging.info(f"Finding page links in {url}")
+    logger = logs.get_logger("wikiscrape")
+    logger.info(f"Finding page links in {url}")
     soup = get_soup(get_page(url))
     # Find all the links in the page
     page_count = len(pages)
@@ -63,7 +66,7 @@ def _enumerate_namespace(url: str, wiki_url: str, pages: List[str]) -> List[str]
         pages.append(
             urllib.parse.unquote(removeprefix(link.attrs["href"], "/wiki/"))
         )
-    logging.info(f"Found {len(pages) - page_count} pages")
+    logger.info(f"Found {len(pages) - page_count} pages")

     # Find a pagination link
     if nav := soup.find("div", {"class": "mw-allpages-nav"}):
@@ -74,7 +77,7 @@ def _enumerate_namespace(url: str, wiki_url: str, pages: List[str]) -> List[str]
             if link.text.lower().startswith("next page"):
                 # Recurse using the pagination link as the new url.
                 try:
-                    logging.info(f"Found pagination page at {link.attrs['href']}")
+                    logger.info(f"Found pagination page at {link.attrs['href']}")
                     # The current page links have already been added to pages so we can
                     # just return whatever the recusion gives us.
                     return _enumerate_namespace(
@@ -85,12 +88,12 @@ def _enumerate_namespace(url: str, wiki_url: str, pages: List[str]) -> List[str]
                 except Exception as e:
                     # If something goes wrong in pagination, just return the pages we
                     # have.
-                    loggging.info(
+                    logger.info(
                         f"Something went wrong processing pagination at {link.attrs['href']}, returning partial results."
                     )
                     return pages

     # If no pagination link was found, just return what we have.
- logging.info(f"No pagination link found, finished.") + logger.info(f"No pagination link found, finished.") return pages @@ -119,4 +122,5 @@ def main(args): if __name__ == "__main__": args = parser.parse_args() + logs.configure_logging("wikiscrape") main(args) diff --git a/wikiscrape/to-dolma.py b/wikiscrape/to-dolma.py index 6d223db..7983512 100644 --- a/wikiscrape/to-dolma.py +++ b/wikiscrape/to-dolma.py @@ -4,12 +4,13 @@ import datetime import functools import glob -import itertools +import os import urllib.parse from utils import get_wiki_name, wiki_url from licensed_pile.licenses import PermissiveLicenses +from licensed_pile.logs import configure_logging, get_logger from licensed_pile.write import to_dolma from licensed_pile.xml import iterate_xmls @@ -36,14 +37,18 @@ def main(args): # Calculate defaults license = PermissiveLicenses.from_string(args.license) + logger = get_logger("wikiscrape") + logger.info("Saving all exported pages as licensed with %s", license) args.filename = ( args.filename if args.filename else f"{get_wiki_name(args.wiki)}.jsonl.gz" ) + logger.info("Saving to dolma format at %s", args.filename) args.export = ( args.export if args.export else os.path.join("data", get_wiki_name(args.wiki), "export", "*.xml") ) + logger.info("Loading export from %s", args.export) # Our parser can ignore namespaces so just use `page`. pages = iterate_xmls(glob.iglob(args.export), tag="page") @@ -58,7 +63,7 @@ def main(args): def format_dolma(xml, source_name: str, wiki: str, license: PermissiveLicenses): revisions = [r for r in xml if r.tag.endswith("revision")] - # TODO Handle if this fails. + # TODO Handle if this fails and add logging. text = [t for t in revisions[-1] if t.tag.endswith("text")][0].text page_namespace = [ns for ns in xml if ns.tag.endswith("ns")][0].text page_id = [pid for pid in xml if pid.tag.endswith("id")][0].text @@ -96,4 +101,5 @@ def format_dolma(xml, source_name: str, wiki: str, license: PermissiveLicenses): if __name__ == "__main__": args = parser.parse_args() + configure_logging("wikiscrape") main(args) diff --git a/wikiscrape/utils.py b/wikiscrape/utils.py index eb6f958..61bc414 100644 --- a/wikiscrape/utils.py +++ b/wikiscrape/utils.py @@ -1,6 +1,5 @@ """Utilities for scraping wikis.""" -import logging import urllib.parse from typing import Dict, Optional @@ -9,11 +8,6 @@ from licensed_pile import scrape -logging.basicConfig( - level=logging.INFO, - format="wikiscrape: [%(asctime)s] %(levelname)s - %(message)s", -) - def get_page(*args, **kwargs): r = scrape.get_page(*args, **kwargs)