use shared logging
blester125 committed Mar 11, 2024
1 parent 2af7c13 commit a2c7008
Showing 6 changed files with 34 additions and 18 deletions.
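
For context, the shared helpers this commit switches to live in licensed_pile/logs.py, which is not part of this diff; only its configure_logging and get_logger entry points appear below. A minimal sketch of what such a module might look like, reusing the format string from the per-module setup that wikiscrape/utils.py removes in this commit (the implementation details here are assumptions, not code from the repository):

import logging


def configure_logging(name: str, level: int = logging.INFO) -> logging.Logger:
    # Called once from each script's __main__ block to set up the root handler.
    logging.basicConfig(
        level=level,
        format=f"{name}: [%(asctime)s] %(levelname)s - %(message)s",
    )
    return logging.getLogger(name)


def get_logger(name: str) -> logging.Logger:
    # Called from library code; returns the named logger configured above.
    return logging.getLogger(name)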
2 changes: 2 additions & 0 deletions licensed_pile/licenses.py
@@ -48,6 +48,8 @@ def from_string(cls, s: str) -> "PermissiveLicenses":
                return cls.CC_BY_3
            else:
                raise ValueError(f"Unable to understand license {s}")
        if s == "gfdl":
            return cls.GFDL
        raise ValueError(f"Unable to understand license {s}")


9 changes: 8 additions & 1 deletion wikiscrape/export_pages.py
@@ -19,6 +19,8 @@

from utils import get_page, get_wiki_name

from licensed_pile import logs

parser = argparse.ArgumentParser(description="Export mediawikis as XML")
parser.add_argument("--wiki", required=True, help="The wiki url we are exporting.")
parser.add_argument(
@@ -67,25 +69,29 @@ def read_page_titles(filename: str) -> List[str]:
def main(args):
    if args.listauthors is not None:
        raise NotImplementedError("--listauthors is currently not implemented.")
    logger = logs.get_logger("wikiscrape")
    args.pages = (
        args.pages
        if args.pages is not None
        else [os.path.join("data", get_wiki_name(args.wiki), "pages")]
    )
    logger.info("Enumerating pages from %s", args.pages)
    pages = []
    for page in args.pages:
        if os.path.exists(page) and os.path.isdir(page):
            for f in glob.glob(os.path.join(page, "*.txt")):
                pages.extend(read_page_titles(f))
        else:
            pages.extend(read_page_titles(page))
    logger.info("There are %d pages to export.", len(pages))

    args.output_dir = (
        args.output_dir
        if args.output_dir is not None
        else os.path.join("data", get_wiki_name(args.wiki), "export")
    )
    os.makedirs(args.output_dir, exist_ok=True)
    logger.info("Saving export to %s", args.output_dir)

    # Save shards of exported pages to
    # data/${wiki_name}/export/${shard_idx}-pages.xml
@@ -99,10 +105,11 @@ def main(args):
        with open(os.path.join(args.output_dir, f"{i:>05}-pages.xml"), "w") as wf:
            wf.write(xml)
        if args.test_pages and j > args.test_pages:
            print(f"Scraped {j + args.page_limit} pages, stopping for testing.")
            logger.info(f"Scraped {j + args.page_limit} pages, stopping for testing.")
            break


if __name__ == "__main__":
    args = parser.parse_args()
    logs.configure_logging("wikiscrape")
    main(args)
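
One note on the two call styles visible in this file: logger.info("Saving export to %s", args.output_dir) defers string interpolation to the logging machinery, while the f-string call in the test_pages branch formats eagerly even when the record would be filtered out. A small sketch of the deferred style for comparison (standard logging behavior, not code from the repository):

import logging

logger = logging.getLogger("wikiscrape")
# The %d is only interpolated if the record is actually emitted, so calls
# below the configured level cost almost nothing.
logger.info("Scraped %d pages, stopping for testing.", 42)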
7 changes: 5 additions & 2 deletions wikiscrape/get_namespaces.py
@@ -2,13 +2,14 @@

import argparse
import json
import logging
import os
import urllib.parse
from typing import Dict

from utils import get_page, get_soup, get_wiki_name

from licensed_pile import logs

parser = argparse.ArgumentParser(description="Find all namespaces in a mediawiki wiki.")
parser.add_argument("--wiki", required=True, help="The Url for the wiki in question.")
parser.add_argument(
@@ -19,7 +20,8 @@

def find_namespaces(wiki_url: str) -> Dict[int, str]:
    options = {}
    logging.info(f"Finding all namespaces from {args.wiki}")
    logger = logs.get_logger("wikiscrape")
    logger.info(f"Finding all namespaces from {args.wiki}")
    # Even though they recommend using the index.php?title=PAGETITLE url for a lot
    # of things (with the /wiki/ being for readers), we use it here to start looking
    # for pages because it is more consistent (some wikis want /w/index.php and
@@ -47,4 +49,5 @@ def main(args):

if __name__ == "__main__":
    args = parser.parse_args()
    logs.configure_logging("wikiscrape")
    main(args)
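
find_namespaces returns a Dict[int, str] mapping namespace IDs to names. For reference, the standard MediaWiki numbering looks roughly like the sketch below (these are general MediaWiki defaults, not output captured from this script, and a given wiki can define additional namespaces):

# Standard MediaWiki namespace IDs; find_namespaces returns a dict of this shape.
example_namespaces = {
    0: "(Main)",      # ordinary articles, usually the namespace worth exporting
    1: "Talk",
    2: "User",
    4: "Project",
    6: "File",
    10: "Template",
    14: "Category",
}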
18 changes: 11 additions & 7 deletions wikiscrape/list_pages.py
@@ -2,14 +2,15 @@

import argparse
import json
import logging
import os
import urllib.parse
from typing import List

from requests.models import PreparedRequest
from utils import get_page, get_soup, get_wiki_name, removeprefix

from licensed_pile import logs

parser = argparse.ArgumentParser(
    description="Find all pages under a namespace for a mediawiki."
)
@@ -29,7 +30,8 @@

def enumerate_namespace(wiki_url: str, namespace: int) -> List[str]:
    """Collect all pages of a wiki from within a namespace."""
    logging.info(f"Finding all pages under the {namespace} namespace from {wiki_url}")
    logger = logs.get_logger("wikiscrape")
    logger.info(f"Finding all pages under the {namespace} namespace from {wiki_url}")
    # Even though they recommend using the index.php?title=PAGETITLE url for a lot
    # of things (with the /wiki/ being for readers), we use it here to start looking
    # for pages because it is more consistent (some wikis want /w/index.php and
@@ -54,7 +56,8 @@ def _enumerate_namespace(url: str, wiki_url: str, pages: List[str]) -> List[str]
            this wouldn't need to be a parameter.
        pages: The current list of pages we are building.
    """
    logging.info(f"Finding page links in {url}")
    logger = logs.get_logger("wikiscrape")
    logger.info(f"Finding page links in {url}")
    soup = get_soup(get_page(url))
    # Find all the links in the page
    page_count = len(pages)
@@ -63,7 +66,7 @@ def _enumerate_namespace(url: str, wiki_url: str, pages: List[str]) -> List[str]
            pages.append(
                urllib.parse.unquote(removeprefix(link.attrs["href"], "/wiki/"))
            )
    logging.info(f"Found {len(pages) - page_count} pages")
    logger.info(f"Found {len(pages) - page_count} pages")

    # Find a pagination link
    if nav := soup.find("div", {"class": "mw-allpages-nav"}):
@@ -74,7 +77,7 @@ def _enumerate_namespace(url: str, wiki_url: str, pages: List[str]) -> List[str]
            if link.text.lower().startswith("next page"):
                # Recurse using the pagination link as the new url.
                try:
                    logging.info(f"Found pagination page at {link.attrs['href']}")
                    logger.info(f"Found pagination page at {link.attrs['href']}")
                    # The current page links have already been added to pages so we can
                    # just return whatever the recursion gives us.
                    return _enumerate_namespace(
@@ -85,12 +88,12 @@ def _enumerate_namespace(url: str, wiki_url: str, pages: List[str]) -> List[str]
                except Exception as e:
                    # If something goes wrong in pagination, just return the pages we
                    # have.
                    loggging.info(
                    logger.info(
                        f"Something went wrong processing pagination at {link.attrs['href']}, returning partial results."
                    )
                    return pages
    # If no pagination link was found, just return what we have.
    logging.info(f"No pagination link found, finished.")
    logger.info(f"No pagination link found, finished.")
    return pages


@@ -119,4 +122,5 @@ def main(args):

if __name__ == "__main__":
    args = parser.parse_args()
    logs.configure_logging("wikiscrape")
    main(args)
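
list_pages.py imports requests' PreparedRequest, presumably to assemble the Special:AllPages query URLs that _enumerate_namespace then crawls and paginates through. A sketch of that pattern; the wiki URL and query parameters below are placeholders for illustration, not values from the diff:

from requests.models import PreparedRequest

req = PreparedRequest()
# Builds something like:
#   https://wiki.example.org/index.php?title=Special%3AAllPages&namespace=0
req.prepare_url(
    "https://wiki.example.org/index.php",
    {"title": "Special:AllPages", "namespace": 0},
)
print(req.url)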
10 changes: 8 additions & 2 deletions wikiscrape/to-dolma.py
@@ -4,12 +4,13 @@
import datetime
import functools
import glob
import itertools
import os
import urllib.parse

from utils import get_wiki_name, wiki_url

from licensed_pile.licenses import PermissiveLicenses
from licensed_pile.logs import configure_logging, get_logger
from licensed_pile.write import to_dolma
from licensed_pile.xml import iterate_xmls

@@ -36,14 +37,18 @@
def main(args):
    # Calculate defaults
    license = PermissiveLicenses.from_string(args.license)
    logger = get_logger("wikiscrape")
    logger.info("Saving all exported pages as licensed with %s", license)
    args.filename = (
        args.filename if args.filename else f"{get_wiki_name(args.wiki)}.jsonl.gz"
    )
    logger.info("Saving to dolma format at %s", args.filename)
    args.export = (
        args.export
        if args.export
        else os.path.join("data", get_wiki_name(args.wiki), "export", "*.xml")
    )
    logger.info("Loading export from %s", args.export)

    # Our parser can ignore namespaces so just use `page`.
    pages = iterate_xmls(glob.iglob(args.export), tag="page")
@@ -58,7 +63,7 @@ def main(args):

def format_dolma(xml, source_name: str, wiki: str, license: PermissiveLicenses):
    revisions = [r for r in xml if r.tag.endswith("revision")]
    # TODO Handle if this fails.
    # TODO Handle if this fails and add logging.
    text = [t for t in revisions[-1] if t.tag.endswith("text")][0].text
    page_namespace = [ns for ns in xml if ns.tag.endswith("ns")][0].text
    page_id = [pid for pid in xml if pid.tag.endswith("id")][0].text
@@ -96,4 +101,5 @@ def format_dolma(xml, source_name: str, wiki: str, license: PermissiveLicenses):

if __name__ == "__main__":
    args = parser.parse_args()
    configure_logging("wikiscrape")
    main(args)
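
The body of format_dolma is collapsed in this view; based on the fields it extracts above (page id, namespace, latest revision text) and on the usual dolma record layout written by to_dolma, the returned dict plausibly looks like the sketch below. Every key and value here is an assumption for illustration, not code from the repository:

# Hypothetical shape of one record produced by format_dolma.
example_record = {
    "id": "1234",                         # page_id from the <id> element
    "text": "Wikitext of the latest revision...",
    "source": "wikiscrape/some-wiki",     # source_name passed in by main()
    "added": "2024-03-11T00:00:00",
    "metadata": {
        "license": "GFDL",                # derived from the license argument
        "namespace": "0",                 # page_namespace from the <ns> element
        "wiki": "https://wiki.example.org",
    },
}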
6 changes: 0 additions & 6 deletions wikiscrape/utils.py
@@ -1,6 +1,5 @@
"""Utilities for scraping wikis."""

import logging
import urllib.parse
from typing import Dict, Optional

@@ -9,11 +8,6 @@

from licensed_pile import scrape

logging.basicConfig(
    level=logging.INFO,
    format="wikiscrape: [%(asctime)s] %(levelname)s - %(message)s",
)


def get_page(*args, **kwargs):
    r = scrape.get_page(*args, **kwargs)
