diff --git a/sources/zh/uukanshu.py b/sources/zh/uukanshu.py
index e4caae364..9b00acc5d 100644
--- a/sources/zh/uukanshu.py
+++ b/sources/zh/uukanshu.py
@@ -1,102 +1,74 @@
# -*- coding: utf-8 -*-
import logging
-import re
+from bs4 import Tag
from lncrawl.core.crawler import Crawler
+from lncrawl.models import Chapter, Volume
+from sources.zh.uukanshu_sj import UukanshuOnlineSJ
logger = logging.getLogger(__name__)
novel_search_url = "%ssearch.aspx?k=%s"
-chapter_list_url = "%s&page=%d"
class UukanshuOnline(Crawler):
- base_url = ["https://sj.uukanshu.com/"]
-
- def search_novel(self, query):
- query = query.lower().replace(" ", "+")
- soup = self.get_soup(novel_search_url % (self.home_url, query))
- results = []
-
- for data in soup.select("#bookList li"):
- title = data.select_one(".book-title a.name")["title"]
- author = data.select_one(".book-title .aut").get_text()
- url = self.home_url + data.select_one(".book-title a.name")["href"]
-
- results.append(
- {
- "title": title,
- "url": url,
- "info": f"Author: {author}",
- }
- )
- return results
-
- def read_novel_info(self):
- soup = self.get_soup(self.novel_url)
-
- self.novel_title = soup.select_one(".bookname").text.strip()
- logger.info("Novel title: %s", self.novel_title)
-
- possible_image = soup.select_one(".book-info img")
- if possible_image:
- self.novel_cover = self.absolute_url(possible_image["src"])
- logger.info("Novel cover: %s", self.novel_cover)
-
- self.novel_author = (
- soup.select_one(".book-info dd").text.replace("作者:", "").strip()
- )
- logger.info("Novel author: %s", self.novel_author)
-
- logger.info("Getting chapters...")
- soup = self.get_soup(chapter_list_url % (self.novel_url, 1))
- try:
- last_page = soup.select_one(".pages a:last-child")
- page_count = int(re.findall(r"&page=(\d+)", str(last_page["href"]))[0])
- except Exception as err:
- logger.debug("Failed to parse page count. Error: %s", err)
- page_count = 0
- logger.info("Total pages: %d", page_count)
-
- futures = [
- self.executor.submit(self.get_soup, chapter_list_url % (self.novel_url, p))
- for p in range(2, page_count + 1)
- ]
- page_soups = [soup] + [f.result() for f in futures]
-
- for soup in page_soups:
- for a in soup.select("ul#chapterList li a"):
- chap_id = len(self.chapters) + 1
- vol_id = 1 + len(self.chapters) // 100
- if chap_id % 100 == 1:
- self.volumes.append({"id": vol_id})
- self.chapters.append(
- {
- "id": chap_id,
- "volume": vol_id,
- "title": a.text,
- "url": self.home_url + a["href"],
- }
+    # www. serves simplified Chinese and tw. serves traditional Chinese; both share the same site structure
+ base_url = ["https://www.uukanshu.net/", "https://tw.uukanshu.net/"]
+
+ encoding = "gbk"
+
+ def initialize(self):
+        # the default lxml parser cannot handle the huge GBK-encoded pages (it fails after ~4.3k chapters)
+ self.init_parser("html.parser")
+
+ def read_novel_info(self) -> None:
+        # tw. serves utf-8 while www. serves gbk; decoding with the wrong one garbles the output
+ if "tw." in self.novel_url:
+ self.encoding = "utf-8"
+
+ soup = self.get_soup(self.novel_url, encoding=self.encoding)
+ info = soup.select_one("dl.jieshao")
+        assert info  # if this fails, the HTML structure has fundamentally changed and the crawler needs an update
+        meta = info.select_one("dd.jieshao_content")
+        assert meta  # same deal: the metadata block must exist
+
+ img = info.select_one("dt.jieshao-img img")
+ if img:
+ self.novel_cover = self.absolute_url(img["src"])
+
+ self.novel_title = meta.select_one("h1 > a").text
+ self.novel_author = meta.select_one("h2 > a").text
+ self.novel_synopsis = meta.select_one("h3 > p").text
+
+        chapters = soup.select_one("ul#chapterList")
+        assert chapters  # likewise, the chapter list must exist
+        for chapter in list(chapters.children)[::-1]:  # reverse: the site lists newest to oldest
+            # narrow the type to Tag and skip any stray text nodes inside the <ul>
+ if not isinstance(chapter, Tag):
+ continue
+            # volume headers are <li class="volume"> elements
+ if chapter.has_attr("class") and "volume" in chapter["class"]:
+ self.volumes.append(
+ Volume(
+ id=len(self.volumes) + 1,
+ title=chapter.text.strip(),
+ )
)
+ continue
+            anchor = chapter.select_one("a")
+            if not anchor:
+                logger.warning("Found <li> in chapter list that is neither a volume nor a link: %s", chapter)
+                continue
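+            # attach the chapter to the most recently seen volume header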
+ self.chapters.append(
+ Chapter(
+ id=len(self.chapters) + 1,
+ url=self.absolute_url(anchor["href"]),
+ title=anchor.text,
+ volume=len(self.volumes),
+ )
+ )
- def download_chapter_body(self, chapter):
- soup = self.get_soup(chapter["url"])
- body = soup.select_one("#bookContent")
-
- content = self.cleaner.extract_contents(body)
-
- return self.format_text(content)
-
- def format_text(self, text):
- text = re.sub(
- r"[UU][UU]\s*看书\s*[ww][ww][ww][\..][uu][uu][kk][aa][nn][ss][hh][uu][\..][cc][oo][mm]",
- "",
- text,
- )
- text = text.replace("章节缺失、错误举报", "")
- text = text.replace("注:如你看到本章节内容是防盗错误内容、本书断更等问题请登录后→→", "")
- text = text.replace("最新网址:", "")
- text = text.replace("请记住本书首发域名:。手机版更新最快网址:", "")
- text = text.replace("www.uukanshu.com", "")
- return text
+ def download_chapter_body(self, chapter: Chapter) -> str:
+ soup = self.get_soup(chapter.url, encoding=self.encoding)
+ content = soup.select_one("div#contentbox")
+        # reuse the filters already implemented for the essentially identical sj. site
+ return UukanshuOnlineSJ.format_text(self.cleaner.extract_contents(content))
diff --git a/sources/zh/uukanshu_sj.py b/sources/zh/uukanshu_sj.py
new file mode 100644
index 000000000..159d7326c
--- /dev/null
+++ b/sources/zh/uukanshu_sj.py
@@ -0,0 +1,103 @@
+# -*- coding: utf-8 -*-
+import logging
+import re
+
+from lncrawl.core.crawler import Crawler
+
+logger = logging.getLogger(__name__)
+
+novel_search_url = "%ssearch.aspx?k=%s"
+chapter_list_url = "%s&page=%d"
+
+
+class UukanshuOnlineSJ(Crawler):
+ base_url = ["https://sj.uukanshu.net/"] # previously .com, redirects .com to .net though
+
+ def search_novel(self, query):
+ query = query.lower().replace(" ", "+")
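+        # the search endpoint takes a lower-cased, plus-separated keyword in its k= parameter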
+ soup = self.get_soup(novel_search_url % (self.home_url, query))
+ results = []
+
+ for data in soup.select("#bookList li"):
+ title = data.select_one(".book-title a.name")["title"]
+ author = data.select_one(".book-title .aut").get_text()
+ url = self.home_url + data.select_one(".book-title a.name")["href"]
+
+ results.append(
+ {
+ "title": title,
+ "url": url,
+ "info": f"Author: {author}",
+ }
+ )
+ return results
+
+ def read_novel_info(self):
+ soup = self.get_soup(self.novel_url)
+
+ self.novel_title = soup.select_one(".bookname").text.strip()
+ logger.info("Novel title: %s", self.novel_title)
+
+ possible_image = soup.select_one(".book-info img")
+ if possible_image:
+ self.novel_cover = self.absolute_url(possible_image["src"])
+ logger.info("Novel cover: %s", self.novel_cover)
+
+ self.novel_author = (
+ soup.select_one(".book-info dd").text.replace("作者:", "").strip()
+ )
+ logger.info("Novel author: %s", self.novel_author)
+
+ logger.info("Getting chapters...")
+ soup = self.get_soup(chapter_list_url % (self.novel_url, 1))
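+        # the last pagination link encodes the total page count in its &page= query parameter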
+ try:
+ last_page = soup.select_one(".pages a:last-child")
+ page_count = int(re.findall(r"&page=(\d+)", str(last_page["href"]))[0])
+ except Exception as err:
+ logger.debug("Failed to parse page count. Error: %s", err)
+ page_count = 0
+ logger.info("Total pages: %d", page_count)
+
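+        # page 1 is already loaded; fetch the remaining catalogue pages concurrently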
+ futures = [
+ self.executor.submit(self.get_soup, chapter_list_url % (self.novel_url, p))
+ for p in range(2, page_count + 1)
+ ]
+ page_soups = [soup] + [f.result() for f in futures]
+
+ for soup in page_soups:
+ for a in soup.select("ul#chapterList li a"):
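+                # the catalogue has no volume markers; group every 100 chapters into a synthetic volume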
+ chap_id = len(self.chapters) + 1
+ vol_id = 1 + len(self.chapters) // 100
+ if chap_id % 100 == 1:
+ self.volumes.append({"id": vol_id})
+ self.chapters.append(
+ {
+ "id": chap_id,
+ "volume": vol_id,
+ "title": a.text,
+ "url": self.home_url + a["href"],
+ }
+ )
+
+ def download_chapter_body(self, chapter):
+ soup = self.get_soup(chapter["url"])
+ body = soup.select_one("#bookContent")
+
+ content = self.cleaner.extract_contents(body)
+
+ return self.format_text(content)
+
+ @staticmethod
+ def format_text(text):
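+        # strip the "UU看书 www.uukanshu.com" watermark, tolerating case variations and stray whitespace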
+        text = re.sub(
+            r"uu\s*看书\s*www\.uukanshu\.com",
+            "",
+            text,
+            flags=re.IGNORECASE,
+        )
+ text = text.replace("章节缺失、错误举报", "")
+ text = text.replace("注:如你看到本章节内容是防盗错误内容、本书断更等问题请登录后→→", "")
+ text = text.replace("最新网址:", "")
+ text = text.replace("请记住本书首发域名:。手机版更新最快网址:", "")
+ text = text.replace("www.uukanshu.com", "")
+ return text