Skip to content

Commit

Permalink
Strip out the new stolen-content warnings on royalroad
Browse files Browse the repository at this point in the history
They might make these harder to work out in the future, but for now...
  • Loading branch information
kemayo committed Jan 20, 2024
1 parent 9171672 commit d30e56a
Showing 1 changed file with 14 additions and 1 deletion.
15 changes: 14 additions & 1 deletion sites/royalroad.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def _chapter(self, url, chapterid):
soup = self._soup(url)
content = soup.find('div', class_='chapter-content')

self._clean(content)
self._clean(content, soup)
self._clean_spoilers(content, chapterid)

content = str(content)
Expand All @@ -108,6 +108,19 @@ def _chapter(self, url, chapterid):

return content, updated

def _clean(self, contents, full_page):
    """Clean chapter contents and remove CSS-hidden "stolen content" notices.

    Runs the parent class's ``_clean`` first, then scans every ``<style>``
    element of the full page for a rule of the form
    ``.<name> { display: none; }`` and deletes every element inside
    ``contents`` that carries that class.

    Args:
        contents: The parsed chapter-content element to clean.
        full_page: The parsed page the chapter came from; searched for
            ``<style>`` elements.

    Returns:
        Whatever the parent ``_clean`` returned, with hidden warning
        elements decomposed in place.
    """
    contents = super()._clean(contents)

    # Royalroad has started inserting "this was stolen" notices into its
    # HTML, and hiding them with CSS. Currently the CSS is very easy to
    # find, so do so and filter them out.
    for style in full_page.find_all('style'):
        css = style.string
        # ``.string`` is None for an empty <style> tag or one that has
        # more than one child node; re.match would raise TypeError on it.
        if css is None:
            continue
        if m := re.match(r'\s*\.(\w+)\s*{\s*display:\s*none;\s*}', css):
            for warning in contents.find_all(class_=m.group(1)):
                warning.decompose()

    return contents

def _clean_spoilers(self, content, chapterid):
# Spoilers to footnotes
for spoiler in content.find_all(class_=('spoiler-new')):
Expand Down

0 comments on commit d30e56a

Please sign in to comment.