From d6a33f1d21a8cbb34b584895554147ad97e97a72 Mon Sep 17 00:00:00 2001 From: boxydog Date: Fri, 1 Dec 2023 11:27:16 -0600 Subject: [PATCH] Medium post importer (from medium export) --- docs/content.rst | 4 +- docs/importer.rst | 13 +- pelican/tests/content/medium_post_content.txt | 4 + ...2017-04-21_-medium-post--d1bf01d62ba3.html | 72 ++++++++ pelican/tests/test_generators.py | 37 +++- pelican/tests/test_importer.py | 83 +++++++++ pelican/tools/pelican_import.py | 165 +++++++++++++++++- 7 files changed, 357 insertions(+), 21 deletions(-) create mode 100644 pelican/tests/content/medium_post_content.txt create mode 100644 pelican/tests/content/medium_posts/2017-04-21_-medium-post--d1bf01d62ba3.html diff --git a/docs/content.rst b/docs/content.rst index cacacea9a..46db11405 100644 --- a/docs/content.rst +++ b/docs/content.rst @@ -439,8 +439,8 @@ For **Markdown**, one must rely on an extension. For example, using the `mdx_inc Importing an existing site ========================== -It is possible to import your site from WordPress, Tumblr, Dotclear, and RSS -feeds using a simple script. See :ref:`import`. +It is possible to import your site from several other blogging sites +(like WordPress, Tumblr, ..) using a simple script. See :ref:`import`. Translations ============ diff --git a/docs/importer.rst b/docs/importer.rst index 997a46323..093ef465e 100644 --- a/docs/importer.rst +++ b/docs/importer.rst @@ -11,6 +11,7 @@ software to reStructuredText or Markdown. 
The supported import formats are: - Blogger XML export - Dotclear export +- Medium export - Tumblr API - WordPress XML export - RSS/Atom feed @@ -65,6 +66,7 @@ Optional arguments -h, --help Show this help message and exit --blogger Blogger XML export (default: False) --dotclear Dotclear export (default: False) + --medium Medium export (default: False) --tumblr Tumblr API (default: False) --wpfile WordPress XML export (default: False) --feed Feed to parse (default: False) @@ -80,8 +82,7 @@ Optional arguments (default: False) --filter-author Import only post from the specified author --strip-raw Strip raw HTML code that can't be converted to markup - such as flash embeds or iframes (wordpress import - only) (default: False) + such as flash embeds or iframes (default: False) --wp-custpost Put wordpress custom post types in directories. If used with --dir-cat option directories will be created as "/post_type/category/" (wordpress import only) @@ -113,6 +114,14 @@ For Dotclear:: $ pelican-import --dotclear -o ~/output ~/backup.txt +For Medium:: + + $ pelican-import --medium -o ~/output ~/medium-export/posts/ + +The Medium export is a zip file. Unzip it, and point this tool to the +"posts" subdirectory. For more information on how to export, see +https://help.medium.com/hc/en-us/articles/115004745787-Export-your-account-data. + For Tumblr:: $ pelican-import --tumblr -o ~/output --blogname= diff --git a/pelican/tests/content/medium_post_content.txt b/pelican/tests/content/medium_post_content.txt new file mode 100644 index 000000000..5e21881cb --- /dev/null +++ b/pelican/tests/content/medium_post_content.txt @@ -0,0 +1,4 @@ + +

Title header

A paragraph of content.

Paragraph number two.

A list:

  1. One.
  2. Two.
  3. Three.

A link: link text.

Header 2

A block quote:

quote words strong words

after blockquote

A figure caption.

A final note: Cross-Validated has sometimes been helpful.


Next: Next post +

+

By User Name on .

Canonical link

Exported from Medium on December 1, 2023.

diff --git a/pelican/tests/content/medium_posts/2017-04-21_-medium-post--d1bf01d62ba3.html b/pelican/tests/content/medium_posts/2017-04-21_-medium-post--d1bf01d62ba3.html new file mode 100644 index 000000000..02d272dc0 --- /dev/null +++ b/pelican/tests/content/medium_posts/2017-04-21_-medium-post--d1bf01d62ba3.html @@ -0,0 +1,72 @@ +A title diff --git a/pelican/tests/test_generators.py b/pelican/tests/test_generators.py index af6f5b1ab..8c257b550 100644 --- a/pelican/tests/test_generators.py +++ b/pelican/tests/test_generators.py @@ -264,6 +264,7 @@ def test_generate_feeds_override_url(self): def test_generate_context(self): articles_expected = [ + ["A title", "published", "medium_posts", "article"], ["Article title", "published", "Default", "article"], [ "Article with markdown and summary metadata multi", @@ -391,13 +392,24 @@ def test_generate_categories(self): # terms of process order will define the name for that category categories = [cat.name for cat, _ in self.generator.categories] categories_alternatives = ( - sorted(["Default", "TestCategory", "Yeah", "test", "指導書"]), - sorted(["Default", "TestCategory", "yeah", "test", "指導書"]), + sorted( + ["Default", "TestCategory", "medium_posts", "Yeah", "test", "指導書"] + ), + sorted( + ["Default", "TestCategory", "medium_posts", "yeah", "test", "指導書"] + ), ) self.assertIn(sorted(categories), categories_alternatives) # test for slug categories = [cat.slug for cat, _ in self.generator.categories] - categories_expected = ["default", "testcategory", "yeah", "test", "zhi-dao-shu"] + categories_expected = [ + "default", + "testcategory", + "medium_posts", + "yeah", + "test", + "zhi-dao-shu", + ] self.assertEqual(sorted(categories), sorted(categories_expected)) def test_do_not_use_folder_as_category(self): @@ -549,7 +561,8 @@ def test_period_archives_context(self): granularity: {period["period"] for period in periods} for granularity, periods in period_archives.items() } - expected = {"year": {(1970,), (2010,), (2012,), 
(2014,)}} + self.maxDiff = None + expected = {"year": {(1970,), (2010,), (2012,), (2014,), (2017,)}} self.assertEqual(expected, abbreviated_archives) # Month archives enabled: @@ -570,7 +583,7 @@ def test_period_archives_context(self): for granularity, periods in period_archives.items() } expected = { - "year": {(1970,), (2010,), (2012,), (2014,)}, + "year": {(1970,), (2010,), (2012,), (2014,), (2017,)}, "month": { (1970, "January"), (2010, "December"), @@ -578,6 +591,7 @@ def test_period_archives_context(self): (2012, "November"), (2012, "October"), (2014, "February"), + (2017, "April"), }, } self.assertEqual(expected, abbreviated_archives) @@ -602,7 +616,7 @@ def test_period_archives_context(self): for granularity, periods in period_archives.items() } expected = { - "year": {(1970,), (2010,), (2012,), (2014,)}, + "year": {(1970,), (2010,), (2012,), (2014,), (2017,)}, "month": { (1970, "January"), (2010, "December"), @@ -610,6 +624,7 @@ def test_period_archives_context(self): (2012, "November"), (2012, "October"), (2014, "February"), + (2017, "April"), }, "day": { (1970, "January", 1), @@ -619,6 +634,7 @@ def test_period_archives_context(self): (2012, "October", 30), (2012, "October", 31), (2014, "February", 9), + (2017, "April", 21), }, } self.assertEqual(expected, abbreviated_archives) @@ -836,8 +852,12 @@ def test_standard_metadata_in_default_metadata(self): categories = sorted([category.name for category, _ in generator.categories]) categories_expected = [ - sorted(["Default", "TestCategory", "yeah", "test", "指導書"]), - sorted(["Default", "TestCategory", "Yeah", "test", "指導書"]), + sorted( + ["Default", "TestCategory", "medium_posts", "yeah", "test", "指導書"] + ), + sorted( + ["Default", "TestCategory", "medium_posts", "Yeah", "test", "指導書"] + ), ] self.assertIn(categories, categories_expected) @@ -864,6 +884,7 @@ def test_article_order_by(self): generator.generate_context() expected = [ + "A title", "An Article With Code Block To Test Typogrify Ignore", "Article 
title", "Article with Nonconformant HTML meta tags", diff --git a/pelican/tests/test_importer.py b/pelican/tests/test_importer.py index 05ef5bbdb..916c1183d 100644 --- a/pelican/tests/test_importer.py +++ b/pelican/tests/test_importer.py @@ -21,6 +21,10 @@ get_attachments, tumblr2fields, wp2fields, + mediumpost2fields, + mediumposts2fields, + strip_medium_post_content, + medium_slug, ) from pelican.utils import path_to_file_url, slugify @@ -708,3 +712,82 @@ def get_posts(api_key, blogname, offset=0): posts, posts, ) + + +class TestMediumImporter(TestCaseWithCLocale): + def setUp(self): + super().setUp() + self.test_content_root = "pelican/tests/content" + # The content coming out of parsing is similar, but not the same. + # Beautiful soup rearranges the order of attributes, for example. + # So, we keep a copy of the content for the test. + content_filename = f"{self.test_content_root}/medium_post_content.txt" + with open(content_filename, encoding="utf-8") as the_content_file: + # Many editors and scripts add a final newline, so live with that + # in our test + the_content = the_content_file.read() + assert the_content[-1] == "\n" + the_content = the_content[:-1] + self.post_tuple = ( + "A title", + the_content, + # slug: + "2017-04-21-medium-post", + "2017-04-21 17:11", + "User Name", + None, + (), + "published", + "article", + "html", + ) + + def test_mediumpost2field(self): + """Parse one post""" + post_filename = f"{self.test_content_root}/medium_posts/2017-04-21_-medium-post--d1bf01d62ba3.html" + val = mediumpost2fields(post_filename) + self.assertEqual(self.post_tuple, val, val) + + def test_mediumposts2field(self): + """Parse all posts in an export directory""" + posts = [ + fields + for fields in mediumposts2fields(f"{self.test_content_root}/medium_posts") + ] + self.assertEqual(1, len(posts)) + self.assertEqual(self.post_tuple, posts[0]) + + def test_strip_content(self): + """Strip out unhelpful tags""" + html_doc = ( + "
This keeps lots of tags, but not " + "the
section
tags
" + ) + soup = BeautifulSoup(html_doc, "html.parser") + self.assertEqual( + "This keeps lots of tags, but not the section tags", + strip_medium_post_content(soup), + ) + + def test_medium_slug(self): + # Remove hex stuff at the end + self.assertEqual( + "2017-04-27_A-long-title", + medium_slug( + "medium-export/posts/2017-04-27_A-long-title--2971442227dd.html" + ), + ) + # Remove "--DRAFT" at the end + self.assertEqual( + "2017-04-27_A-long-title", + medium_slug("medium-export/posts/2017-04-27_A-long-title--DRAFT.html"), + ) + # Remove both (which happens) + self.assertEqual( + "draft_How-to-do", medium_slug("draft_How-to-do--DRAFT--87225c81dddd.html") + ) + # If no hex stuff, leave it alone + self.assertEqual( + "2017-04-27_A-long-title", + medium_slug("medium-export/posts/2017-04-27_A-long-title.html"), + ) diff --git a/pelican/tools/pelican_import.py b/pelican/tools/pelican_import.py index 681a5c453..eb343860d 100755 --- a/pelican/tools/pelican_import.py +++ b/pelican/tools/pelican_import.py @@ -15,6 +15,8 @@ from urllib.parse import quote, urlparse, urlsplit, urlunsplit from urllib.request import urlretrieve +import dateutil.parser + # because logging.setLoggerClass has to be called before logging.getLogger from pelican.log import init from pelican.settings import DEFAULT_CONFIG @@ -114,19 +116,25 @@ def _multi_replace(dic, string): return content -def xml_to_soup(xml): - """Opens an xml file""" +def _import_bs4(): + """Import and return bs4, otherwise sys.exit.""" try: - from bs4 import BeautifulSoup + import bs4 except ImportError: error = ( 'Missing dependency "BeautifulSoup4" and "lxml" required to ' "import XML files." 
) sys.exit(error) + return bs4 + + +def file_to_soup(xml, features="xml"): + """Reads a file, returns soup.""" + bs4 = _import_bs4() with open(xml, encoding="utf-8") as infile: xmlfile = infile.read() - soup = BeautifulSoup(xmlfile, "xml") + soup = bs4.BeautifulSoup(xmlfile, features) return soup @@ -140,7 +148,7 @@ def get_filename(post_name, post_id): def wp2fields(xml, wp_custpost=False): """Opens a wordpress XML file, and yield Pelican fields""" - soup = xml_to_soup(xml) + soup = file_to_soup(xml) items = soup.rss.channel.findAll("item") for item in items: if item.find("status").string in ["publish", "draft"]: @@ -210,7 +218,7 @@ def wp2fields(xml, wp_custpost=False): def blogger2fields(xml): """Opens a blogger XML file, and yield Pelican fields""" - soup = xml_to_soup(xml) + soup = file_to_soup(xml) entries = soup.feed.findAll("entry") for entry in entries: raw_kind = entry.find( @@ -536,6 +544,133 @@ def tumblr2fields(api_key, blogname): posts = _get_tumblr_posts(api_key, blogname, offset) +def strip_medium_post_content(soup) -> str: + """Strip some tags and attributes from medium post content. + + For example, the 'section' and 'div' tags cause trouble while rendering. + + The problem with these tags is you can get a section divider (--------------) + that is not between two pieces of content. For example: + + Some text. + + .. container:: section-divider + + -------------- + + .. container:: section-content + + More content. + + In this case, pandoc complains: "Unexpected section title or transition." + + Also, the "id" and "name" attributes in tags cause similar problems. They show + up in .rst as extra junk that separates transitions. 
+ """ + # Remove tags + # section and div cause problems + # footer also can cause problems, and has nothing we want to keep + # See https://stackoverflow.com/a/8439761 + invalid_tags = ["section", "div", "footer"] + for tag in invalid_tags: + for match in soup.findAll(tag): + match.replaceWithChildren() + + # Remove attributes + # See https://stackoverflow.com/a/9045719 + invalid_attributes = ["name", "id", "class"] + bs4 = _import_bs4() + for tag in soup.descendants: + if isinstance(tag, bs4.element.Tag): + tag.attrs = { + key: value + for key, value in tag.attrs.items() + if key not in invalid_attributes + } + + # Get the string of all content, keeping other tags + all_content = "".join(str(element) for element in soup.contents) + return all_content + + +def mediumpost2fields(filepath: str) -> tuple: + """Take an HTML post from a medium export, return Pelican fields.""" + + soup = file_to_soup(filepath, "html.parser") + if not soup: + raise ValueError(f"{filepath} could not be parsed by beautifulsoup") + kind = "article" + + content = soup.find("section", class_="e-content") + if not content: + raise ValueError(f"{filepath}: Post has no content") + + title = soup.find("title").string or "" + + raw_date = soup.find("time", class_="dt-published") + date = None + if raw_date: + # This datetime can include timezone, e.g., "2017-04-21T17:11:55.799Z" + # python before 3.11 can't parse the timezone using datetime.fromisoformat + # See also https://docs.python.org/3.10/library/datetime.html#datetime.datetime.fromisoformat + # "This does not support parsing arbitrary ISO 8601 strings" + # So, we use dateutil.parser, which can handle it. 
+ date_object = dateutil.parser.parse(raw_date.attrs["datetime"]) + date = date_object.strftime("%Y-%m-%d %H:%M") + status = "published" + else: + status = "draft" + author = soup.find("a", class_="p-author h-card") + if author: + author = author.string + + # Now that we're done with classes, we can strip the content + content = strip_medium_post_content(content) + + # medium HTML export doesn't have tag or category + # RSS feed has tags, but it doesn't have all the posts. + tags = () + + slug = medium_slug(filepath) + + # TODO: make the fields a python dataclass + return ( + title, + content, + slug, + date, + author, + None, + tags, + status, + kind, + "html", + ) + + +def medium_slug(filepath: str) -> str: + """Make the filepath of a medium exported file into a slug.""" + # slug: filename without extension + slug = os.path.basename(filepath) + slug = os.path.splitext(slug)[0] + # A medium export filename looks like date_-title-...html + # But, RST doesn't like "_-" (see https://github.com/sphinx-doc/sphinx/issues/4350) + # so get rid of it + slug = slug.replace("_-", "-") + # drop the hex string medium puts on the end of the filename, why keep it. 
# e.g., "-a8a8a8a8" or "---a9a9a9a9" + # also: drafts don't need "--DRAFT" + slug = re.sub(r"((-)+([0-9a-f]+|DRAFT))+$", "", slug) + return slug + + +def mediumposts2fields(medium_export_dir: str): + """Take HTML posts in a medium export directory, and yield Pelican fields.""" + for file in os.listdir(medium_export_dir): + filename = os.fsdecode(file) + yield mediumpost2fields(os.path.join(medium_export_dir, filename)) + + def feed2fields(file): """Read a feed and yield pelican fields""" import feedparser @@ -711,7 +846,7 @@ def get_attachments(xml): """returns a dictionary of posts that have attachments with a list of the attachment_urls """ - soup = xml_to_soup(xml) + soup = file_to_soup(xml) items = soup.rss.channel.findAll("item") names = {} attachments = [] @@ -837,6 +972,9 @@ def fields2pelican( posts_require_pandoc.append(filename) slug = not disable_slugs and filename or None + assert slug is None or filename == os.path.basename( + filename + ), f"filename is not a basename: {filename}" if wp_attach and attachments: try: @@ -984,6 +1122,9 @@ def main(): parser.add_argument( "--dotclear", action="store_true", dest="dotclear", help="Dotclear export" ) + parser.add_argument( + "--medium", action="store_true", dest="medium", help="Medium export" + ) parser.add_argument( "--tumblr", action="store_true", dest="tumblr", help="Tumblr export" ) @@ -1069,6 +1210,8 @@ def main(): input_type = "blogger" elif args.dotclear: input_type = "dotclear" + elif args.medium: + input_type = "medium" elif args.tumblr: input_type = "tumblr" elif args.wpfile: @@ -1077,8 +1220,8 @@ def main(): input_type = "feed" else: error = ( - "You must provide either --blogger, --dotclear, " - "--tumblr, --wpfile or --feed options" + "You must provide one of --blogger, --dotclear, " + "--medium, --tumblr, --wpfile or --feed options" ) exit(error) @@ -1097,12 +1240,16 @@ def main(): fields = blogger2fields(args.input) elif input_type == "dotclear": fields = dc2fields(args.input) + elif 
input_type == "medium": + fields = mediumposts2fields(args.input) elif input_type == "tumblr": fields = tumblr2fields(args.input, args.blogname) elif input_type == "wordpress": fields = wp2fields(args.input, args.wp_custpost or False) elif input_type == "feed": fields = feed2fields(args.input) + else: + raise ValueError(f"Unhandled input_type {input_type}") if args.wp_attach: attachments = get_attachments(args.input)