Skip to content

Commit

Permalink
ups
Browse files Browse the repository at this point in the history
  • Loading branch information
blester125 committed May 6, 2024
1 parent 63aa588 commit 227138a
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 2 deletions.
5 changes: 5 additions & 0 deletions wiki/scrape/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,8 @@ Either the integer or the name can be used as input. This generates lists of pag
4. Convert the XML export into the dolma format with `python to-dolma.py --wiki ${wiki_url} --license ${license_str}`

**TODO:** Is this exported format the exact same as the published mediawiki dumps to the point we can reuse code?

The export format is the same as the wiki dump format.

Wikiarchive scrapes come in ~3 versions: two use the same format as the dump and one has a unique format. Most of the wikis
that aren't online anymore use this old, unique format.
25 changes: 23 additions & 2 deletions wiki/scrape/to-dolma.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,28 @@
default=f"data/{SOURCE_NAME}/raw/documents/",
help="Where the dolma formatted data goes.",
)
# Which kind of wiki data is being converted; restricted to the three listed
# choices. main() forwards this value to format_dolma as `source_name`.
parser.add_argument(
"--source",
choices=["wikiscrape", "wikiarchive", "wikidump"],
default="wikiscrape",
help="Where does the data come from?",
)
# Base name for the output files; main() passes it straight to to_dolma.
# NOTE(review): the help text says "chat data", which looks copied from
# another script -- confirm and reword if so.
parser.add_argument(
"--filename", default=None, help="The base filename for our chat data."
)
# Per-shard size limit in GB (per the help text); main() passes it to to_dolma.
parser.add_argument(
"--shard_size", type=int, default=1, help="Size, in GB, for each shard."
)
parser.add_argument("--last_author", action="store_true", help="")
parser.add_argument(
"--last_author",
action="store_true",
help="Should we only include the most recent author? (Faster)",
)
# Opt-in flag: redirect pages are skipped unless this is passed, because
# main() computes `skip_redirect=not args.include_redirects`. The help text
# describes what passing the flag does (include), not the default (skip).
parser.add_argument(
    "--include_redirects",
    action="store_true",
    help=(
        "Should we include pages that are redirects to others? "
        "(Skipped by default)"
    ),
)


def main(args):
Expand All @@ -57,13 +72,16 @@ def main(args):
pages = map(
functools.partial(
format_dolma,
source_name=SOURCE_NAME,
source_name=args.source,
wiki=args.wiki,
license=license,
all_authors=not args.last_author,
skip_redirect=not args.include_redirects,
),
pages,
)
# When we filter out pages based on things like redirects, they may be None
pages = filter(lambda p: p is not None, pages)
to_dolma(pages, args.output_dir, args.filename, args.shard_size)


Expand All @@ -73,7 +91,10 @@ def format_dolma(
wiki: str,
license: PermissiveLicenses,
all_authors: bool = True,
skip_redirect: bool = True,
):
if skip_redirect and [x for x in xml if x.tag.endswith("redirect")]:
return None
revisions = [r for r in xml if r.tag.endswith("revision")]
# TODO Handle if this fails and add logging.
text = [t for t in revisions[-1] if t.tag.endswith("text")][0].text
Expand Down

0 comments on commit 227138a

Please sign in to comment.