use shared logging
blester125 committed Mar 11, 2024
1 parent 2af7c13 commit a2c7008
Showing 6 changed files with 34 additions and 18 deletions.
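
For context, the shared helpers this commit switches to live in licensed_pile/logs.py, which is not part of this diff; only its configure_logging and get_logger entry points appear below. A minimal sketch of what such a module might look like, reusing the format string from the per-module setup that wikiscrape/utils.py removes in this commit (the implementation details here are assumptions, not code from the repository):

import logging


def configure_logging(name: str, level: int = logging.INFO) -> logging.Logger:
    # Called once from each script's __main__ block to set up the root handler.
    logging.basicConfig(
        level=level,
        format=f"{name}: [%(asctime)s] %(levelname)s - %(message)s",
    )
    return logging.getLogger(name)


def get_logger(name: str) -> logging.Logger:
    # Called from library code; returns the named logger configured above.
    return logging.getLogger(name)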
2 changes: 2 additions & 0 deletions licensed_pile/licenses.py
@@ -48,6 +48,8 @@ def from_string(cls, s: str) -> "PermissiveLicenses":
                return cls.CC_BY_3
            else:
                raise ValueError(f"Unable to understand license {s}")
        if s == "gfdl":
            return cls.GFDL
        raise ValueError(f"Unable to understand license {s}")


9 changes: 8 additions & 1 deletion wikiscrape/export_pages.py
@@ -19,6 +19,8 @@

from utils import get_page, get_wiki_name

from licensed_pile import logs

parser = argparse.ArgumentParser(description="Export mediawikis as XML")
parser.add_argument("--wiki", required=True, help="The wiki url we are exporting.")
parser.add_argument(
@@ -67,25 +69,29 @@ def read_page_titles(filename: str) -> List[str]:
def main(args):
    if args.listauthors is not None:
        raise NotImplementedError("--listauthors is currently not implemented.")
    logger = logs.get_logger("wikiscrape")
    args.pages = (
        args.pages
        if args.pages is not None
        else [os.path.join("data", get_wiki_name(args.wiki), "pages")]
    )
    logger.info("Enumerating pages from %s", args.pages)
    pages = []
    for page in args.pages:
        if os.path.exists(page) and os.path.isdir(page):
            for f in glob.glob(os.path.join(page, "*.txt")):
                pages.extend(read_page_titles(f))
        else:
            pages.extend(read_page_titles(page))
    logger.info("There are %d pages to export.", len(pages))

    args.output_dir = (
        args.output_dir
        if args.output_dir is not None
        else os.path.join("data", get_wiki_name(args.wiki), "export")
    )
    os.makedirs(args.output_dir, exist_ok=True)
    logger.info("Saving export to %s", args.output_dir)

    # Save shards of exported pages to
    # data/${wiki_name}/export/${shard_idx}-pages.xml
@@ -99,10 +105,11 @@ def main(args):
        with open(os.path.join(args.output_dir, f"{i:>05}-pages.xml"), "w") as wf:
            wf.write(xml)
        if args.test_pages and j > args.test_pages:
            print(f"Scraped {j + args.page_limit} pages, stopping for testing.")
            logger.info(f"Scraped {j + args.page_limit} pages, stopping for testing.")
            break


if __name__ == "__main__":
    args = parser.parse_args()
    logs.configure_logging("wikiscrape")
    main(args)
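
One note on the two call styles visible in this file: logger.info("Saving export to %s", args.output_dir) defers string interpolation to the logging machinery, while the f-string call in the test_pages branch formats eagerly even when the record would be filtered out. A small sketch of the deferred style for comparison (standard logging behavior, not code from the repository):

import logging

logger = logging.getLogger("wikiscrape")
# The %d is only interpolated if the record is actually emitted, so calls
# below the configured level cost almost nothing.
logger.info("Scraped %d pages, stopping for testing.", 42)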
7 changes: 5 additions & 2 deletions wikiscrape/get_namespaces.py
@@ -2,13 +2,14 @@

import argparse
import json
import logging
import os
import urllib.parse
from typing import Dict

from utils import get_page, get_soup, get_wiki_name

from licensed_pile import logs

parser = argparse.ArgumentParser(description="Find all namespaces in a mediawiki wiki.")
parser.add_argument("--wiki", required=True, help="The Url for the wiki in question.")
parser.add_argument(
@@ -19,7 +20,8 @@

def find_namespaces(wiki_url: str) -> Dict[int, str]:
    options = {}
    logging.info(f"Finding all namespaces from {args.wiki}")
    logger = logs.get_logger("wikiscrape")
    logger.info(f"Finding all namespaces from {args.wiki}")
    # Even though they recommend using the index.php?title=PAGETITLE url for a lot
    # of things (with the /wiki/ being for readers), we use it here to start looking
    # for pages because it is more consistent (some wikis want /w/index.php and
@@ -47,4 +49,5 @@ def main(args):

if __name__ == "__main__":
    args = parser.parse_args()
    logs.configure_logging("wikiscrape")
    main(args)
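
find_namespaces returns a Dict[int, str] mapping namespace IDs to names. For reference, the standard MediaWiki numbering looks roughly like the sketch below (these are general MediaWiki defaults, not output captured from this script, and a given wiki can define additional namespaces):

# Standard MediaWiki namespace IDs; find_namespaces returns a dict of this shape.
example_namespaces = {
    0: "(Main)",      # ordinary articles, usually the namespace worth exporting
    1: "Talk",
    2: "User",
    4: "Project",
    6: "File",
    10: "Template",
    14: "Category",
}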
18 changes: 11 additions & 7 deletions wikiscrape/list_pages.py
@@ -2,14 +2,15 @@

import argparse
import json
import logging
import os
import urllib.parse
from typing import List

from requests.models import PreparedRequest
from utils import get_page, get_soup, get_wiki_name, removeprefix

from licensed_pile import logs

parser = argparse.ArgumentParser(
    description="Find all pages under a namespace for a mediawiki."
)
@@ -29,7 +30,8 @@

def enumerate_namespace(wiki_url: str, namespace: int) -> List[str]:
    """Collect all pages of a wiki from within a namespace."""
    logging.info(f"Finding all pages under the {namespace} namespace from {wiki_url}")
    logger = logs.get_logger("wikiscrape")
    logger.info(f"Finding all pages under the {namespace} namespace from {wiki_url}")
    # Even though they recommend using the index.php?title=PAGETITLE url for a lot
    # of things (with the /wiki/ being for readers), we use it here to start looking
    # for pages because it is more consistent (some wikis want /w/index.php and
@@ -54,7 +56,8 @@ def _enumerate_namespace(url: str, wiki_url: str, pages: List[str]) -> List[str]
            this wouldn't need to be a parameter.
        pages: The current list of pages we are building.
    """
    logging.info(f"Finding page links in {url}")
    logger = logs.get_logger("wikiscrape")
    logger.info(f"Finding page links in {url}")
    soup = get_soup(get_page(url))
    # Find all the links in the page
    page_count = len(pages)
@@ -63,7 +66,7 @@ def _enumerate_namespace(url: str, wiki_url: str, pages: List[str]) -> List[str]
            pages.append(
                urllib.parse.unquote(removeprefix(link.attrs["href"], "/wiki/"))
            )
    logging.info(f"Found {len(pages) - page_count} pages")
    logger.info(f"Found {len(pages) - page_count} pages")

    # Find a pagination link
    if nav := soup.find("div", {"class": "mw-allpages-nav"}):
@@ -74,7 +77,7 @@ def _enumerate_namespace(url: str, wiki_url: str, pages: List[str]) -> List[str]
            if link.text.lower().startswith("next page"):
                # Recurse using the pagination link as the new url.
                try:
                    logging.info(f"Found pagination page at {link.attrs['href']}")
                    logger.info(f"Found pagination page at {link.attrs['href']}")
                    # The current page links have already been added to pages so we can
                    # just return whatever the recursion gives us.
                    return _enumerate_namespace(
@@ -85,12 +88,12 @@ def _enumerate_namespace(url: str, wiki_url: str, pages: List[str]) -> List[str]
                except Exception as e:
                    # If something goes wrong in pagination, just return the pages we
                    # have.
                    loggging.info(
                    logger.info(
                        f"Something went wrong processing pagination at {link.attrs['href']}, returning partial results."
                    )
                    return pages
    # If no pagination link was found, just return what we have.
    logging.info(f"No pagination link found, finished.")
    logger.info(f"No pagination link found, finished.")
    return pages


@@ -119,4 +122,5 @@ def main(args):

if __name__ == "__main__":
    args = parser.parse_args()
    logs.configure_logging("wikiscrape")
    main(args)
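
list_pages.py imports requests' PreparedRequest, presumably to assemble the Special:AllPages query URLs that _enumerate_namespace then crawls and paginates through. A sketch of that pattern; the wiki URL and query parameters below are placeholders for illustration, not values from the diff:

from requests.models import PreparedRequest

req = PreparedRequest()
# Builds something like:
#   https://wiki.example.org/index.php?title=Special%3AAllPages&namespace=0
req.prepare_url(
    "https://wiki.example.org/index.php",
    {"title": "Special:AllPages", "namespace": 0},
)
print(req.url)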
10 changes: 8 additions & 2 deletions wikiscrape/to-dolma.py
@@ -4,12 +4,13 @@
import datetime
import functools
import glob
import itertools
import os
import urllib.parse

from utils import get_wiki_name, wiki_url

from licensed_pile.licenses import PermissiveLicenses
from licensed_pile.logs import configure_logging, get_logger
from licensed_pile.write import to_dolma
from licensed_pile.xml import iterate_xmls

@@ -36,14 +37,18 @@
def main(args):
    # Calculate defaults
    license = PermissiveLicenses.from_string(args.license)
    logger = get_logger("wikiscrape")
    logger.info("Saving all exported pages as licensed with %s", license)
    args.filename = (
        args.filename if args.filename else f"{get_wiki_name(args.wiki)}.jsonl.gz"
    )
    logger.info("Saving to dolma format at %s", args.filename)
    args.export = (
        args.export
        if args.export
        else os.path.join("data", get_wiki_name(args.wiki), "export", "*.xml")
    )
    logger.info("Loading export from %s", args.export)

    # Our parser can ignore namespaces so just use `page`.
    pages = iterate_xmls(glob.iglob(args.export), tag="page")
@@ -58,7 +63,7 @@ def main(args):

def format_dolma(xml, source_name: str, wiki: str, license: PermissiveLicenses):
    revisions = [r for r in xml if r.tag.endswith("revision")]
    # TODO Handle if this fails.
    # TODO Handle if this fails and add logging.
    text = [t for t in revisions[-1] if t.tag.endswith("text")][0].text
    page_namespace = [ns for ns in xml if ns.tag.endswith("ns")][0].text
    page_id = [pid for pid in xml if pid.tag.endswith("id")][0].text
@@ -96,4 +101,5 @@ def format_dolma(xml, source_name: str, wiki: str, license: PermissiveLicenses):

if __name__ == "__main__":
    args = parser.parse_args()
    configure_logging("wikiscrape")
    main(args)
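
The body of format_dolma is collapsed in this view; based on the fields it extracts above (page id, namespace, latest revision text) and on the usual dolma record layout written by to_dolma, the returned dict plausibly looks like the sketch below. Every key and value here is an assumption for illustration, not code from the repository:

# Hypothetical shape of one record produced by format_dolma.
example_record = {
    "id": "1234",                         # page_id from the <id> element
    "text": "Wikitext of the latest revision...",
    "source": "wikiscrape/some-wiki",     # source_name passed in by main()
    "added": "2024-03-11T00:00:00",
    "metadata": {
        "license": "GFDL",                # derived from the license argument
        "namespace": "0",                 # page_namespace from the <ns> element
        "wiki": "https://wiki.example.org",
    },
}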
6 changes: 0 additions & 6 deletions wikiscrape/utils.py
@@ -1,6 +1,5 @@
"""Utilities for scraping wikis."""

import logging
import urllib.parse
from typing import Dict, Optional

@@ -9,11 +8,6 @@

from licensed_pile import scrape

logging.basicConfig(
    level=logging.INFO,
    format="wikiscrape: [%(asctime)s] %(levelname)s - %(message)s",
)


def get_page(*args, **kwargs):
    r = scrape.get_page(*args, **kwargs)
