From 5d34cd1cd472f1355dbdc7385c95dd53f785c1c7 Mon Sep 17 00:00:00 2001
From: Nilan Ekanayake <90630231+NilanEkanayake@users.noreply.github.com>
Date: Thu, 25 Jan 2024 23:51:27 -0400
Subject: [PATCH 1/7] fix syosetu's new pages

---
 sources/jp/s/syosetu.py | 67 ++++++++++++++++++++++++++++++-----------
 1 file changed, 49 insertions(+), 18 deletions(-)

diff --git a/sources/jp/s/syosetu.py b/sources/jp/s/syosetu.py
index a38cbf424..47a55c9b4 100644
--- a/sources/jp/s/syosetu.py
+++ b/sources/jp/s/syosetu.py
@@ -10,6 +10,8 @@ class SyosetuCrawler(Crawler):
 
     has_mtl = True
     base_url = "https://ncode.syosetu.com/"
+    def initialize(self) -> None:
+        self.init_executor(2)
 
     def search_novel(self, query):
         soup = self.get_soup(search_url % quote_plus(query))
@@ -47,26 +49,55 @@ def read_novel_info(self):
         # Syosetu calls parts "chapters"
         chapter_id = 0
         volume = {"id": 0}
-        self.volumes.append(volume)
-        for tag in soup.select(".index_box .chapter_title, .index_box .subtitle a"):
-            if 'chapter_title' in tag.attrs.get('class', ''):
-                # Part/volume (there might be none)
-                volume = {
-                    "id": volume['id'] + 1,
-                    "title": tag.text.strip(),
-                }
+        if soup.find_all("a", {"class": "novelview_pager-last"}):
+            page_num = int(soup.select_one("a[class='novelview_pager-last']")["href"].split("=", 1)[1])
+            for x in range(1, page_num+1):
+                if self.novel_url.endswith('/'):
+                    soup = self.get_soup(self.novel_url + f'?p={x}')
+                else:
+                    soup = self.get_soup(self.novel_url + f'/?p={x}')
                 self.volumes.append(volume)
-            elif tag.name == "a":
-                # Chapter
-                chapter_id += 1
-                self.chapters.append(
-                    {
-                        "id": chapter_id,
-                        "volume": volume['id'],
-                        "title": tag.text.strip() or ("Chapter %d" % chapter_id),
-                        "url": self.absolute_url(tag["href"]),
+                for tag in soup.select(".index_box .chapter_title, .index_box .subtitle a"):
+                    if 'chapter_title' in tag.attrs.get('class', ''):
+                        # Part/volume (there might be none)
+                        volume = {
+                            "id": volume['id'] + 1,
+                            "title": tag.text.strip(),
+                        }
+                        self.volumes.append(volume)
+                    elif tag.name == "a":
+                        # Chapter
+                        chapter_id += 1
+                        self.chapters.append(
+                            {
+                                "id": chapter_id,
+                                "volume": volume['id'],
+                                "title": tag.text.strip() or ("Chapter %d" % chapter_id),
+                                "url": self.absolute_url(tag["href"]),
+                            }
+                        )
+            #volume['id'] = volume['id'] + 1
+        else:
+            self.volumes.append(volume)
+            for tag in soup.select(".index_box .chapter_title, .index_box .subtitle a"):
+                if 'chapter_title' in tag.attrs.get('class', ''):
+                    # Part/volume (there might be none)
+                    volume = {
+                        "id": volume['id'] + 1,
+                        "title": tag.text.strip(),
                     }
-                )
+                    self.volumes.append(volume)
+                elif tag.name == "a":
+                    # Chapter
+                    chapter_id += 1
+                    self.chapters.append(
+                        {
+                            "id": chapter_id,
+                            "volume": volume['id'],
+                            "title": tag.text.strip() or ("Chapter %d" % chapter_id),
+                            "url": self.absolute_url(tag["href"]),
+                        }
+                    )
 
     def download_chapter_body(self, chapter):
         soup = self.get_soup(chapter["url"])
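
Note on PATCH 1/7: when a Syosetu table of contents spans several pages, the link with class "novelview_pager-last" carries the number of the last page in its "?p=" query parameter, and the crawler then walks every index page in turn. Below is a minimal sketch of that discovery step; the HTML snippet and the ncode are invented for illustration, and only the selector and the split-on-"=" parsing come from the patch itself:

    from bs4 import BeautifulSoup

    html = """
    <div class="novelview_pager">
      <a class="novelview_pager-last" href="/n1234ab/?p=12">&gt;&gt;</a>
    </div>
    """
    soup = BeautifulSoup(html, "html.parser")
    # Same selector as the patch; it returns None on a single-page TOC,
    # which is the case the patch's else branch handles.
    last = soup.select_one("a[class='novelview_pager-last']")
    if last:
        page_num = int(last["href"].split("=", 1)[1])  # '/n1234ab/?p=12' -> 12
        toc_pages = [
            f"https://ncode.syosetu.com/n1234ab/?p={x}"
            for x in range(1, page_num + 1)
        ]
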
From 6c877a01bf9646de199e60bdc3af84f484589a26 Mon Sep 17 00:00:00 2001
From: Nilan Ekanayake <90630231+NilanEkanayake@users.noreply.github.com>
Date: Thu, 25 Jan 2024 23:52:10 -0400
Subject: [PATCH 2/7] fix fanstrans

---
 sources/en/f/fanstrans.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/sources/en/f/fanstrans.py b/sources/en/f/fanstrans.py
index d9caabb18..8f57dfc40 100644
--- a/sources/en/f/fanstrans.py
+++ b/sources/en/f/fanstrans.py
@@ -26,6 +26,10 @@ def initialize(self) -> None:
                 r"^Get on Patreon",
                 r"^Check out other novels on Fan’s Translation~",
                 r"^to get Notification for latest Chapter Releases",
+                r"^Can’t wait to read more? Want to show your support? Click",
+                r"^to be a sponsor and get additional chapters ahead of time!",
+                r"^Also check out my other novels:",
+                r"^Villainess Wants To Turn Over A New Leaf , The Villainess Is Changing Her Role To A BroCon , I Help the Richest Man Spend Money to Prevent Disasters\nand\nThe Legitimate Daughter Doesn’t Care!"
             ]
         )
         self.cleaner.bad_tags.update(["a"])
@@ -36,6 +40,7 @@ class FansTranslations(Crawler):
 
     def initialize(self) -> None:
         self.cleaner.bad_tags.update(["h3"])
+        self.init_executor(4)
 
     def search_novel(self, query):
         query = query.lower().replace(" ", "+")

From 0020fad4894e397da5ea94dc0746f37aa0862752 Mon Sep 17 00:00:00 2001
From: Nilan Ekanayake <90630231+NilanEkanayake@users.noreply.github.com>
Date: Thu, 25 Jan 2024 23:56:42 -0400
Subject: [PATCH 3/7] removed niche use-case

---
 sources/en/f/fanstrans.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/sources/en/f/fanstrans.py b/sources/en/f/fanstrans.py
index 8f57dfc40..e664f1afb 100644
--- a/sources/en/f/fanstrans.py
+++ b/sources/en/f/fanstrans.py
@@ -28,8 +28,6 @@ def initialize(self) -> None:
                 r"^to get Notification for latest Chapter Releases",
                 r"^Can’t wait to read more? Want to show your support? Click",
                 r"^to be a sponsor and get additional chapters ahead of time!",
-                r"^Also check out my other novels:",
-                r"^Villainess Wants To Turn Over A New Leaf , The Villainess Is Changing Her Role To A BroCon , I Help the Richest Man Spend Money to Prevent Disasters\nand\nThe Legitimate Daughter Doesn’t Care!"
             ]
         )
         self.cleaner.bad_tags.update(["a"])
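
Note on the executor changes: patches 1 and 2 above add self.init_executor(2) and self.init_executor(4), which, as far as this crawler framework goes, appears to cap the number of download workers running against one site at a time. The sketch below illustrates the same throttling idea with a plain thread pool; the fetch helper and the URL list are hypothetical stand-ins, not lightnovel-crawler internals:

    from concurrent.futures import ThreadPoolExecutor
    from urllib.request import urlopen

    def fetch(url: str) -> bytes:
        # Hypothetical stand-in for the crawler's page download.
        with urlopen(url) as resp:
            return resp.read()

    urls = [f"https://ncode.syosetu.com/n1234ab/?p={x}" for x in range(1, 5)]
    # max_workers=2 mirrors init_executor(2): at most two requests
    # are in flight at any moment, which keeps the crawl polite.
    with ThreadPoolExecutor(max_workers=2) as pool:
        pages = list(pool.map(fetch, urls))
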
From bbddf830eb53d2b33b95e98238977ca46c21fd65 Mon Sep 17 00:00:00 2001
From: Sudipto Chandra
Date: Fri, 9 Feb 2024 18:12:27 +0400
Subject: [PATCH 4/7] Update syosetu.py

---
 sources/jp/s/syosetu.py | 69 +++++++++++++++---------------------------
 1 file changed, 25 insertions(+), 44 deletions(-)

diff --git a/sources/jp/s/syosetu.py b/sources/jp/s/syosetu.py
index 47a55c9b4..0472602d0 100644
--- a/sources/jp/s/syosetu.py
+++ b/sources/jp/s/syosetu.py
@@ -10,6 +10,7 @@ class SyosetuCrawler(Crawler):
 
     has_mtl = True
     base_url = "https://ncode.syosetu.com/"
+    
     def initialize(self) -> None:
         self.init_executor(2)
 
@@ -47,57 +48,37 @@ def read_novel_info(self):
         self.novel_author = author_tag.text.strip()
 
         # Syosetu calls parts "chapters"
-        chapter_id = 0
-        volume = {"id": 0}
-        if soup.find_all("a", {"class": "novelview_pager-last"}):
-            page_num = int(soup.select_one("a[class='novelview_pager-last']")["href"].split("=", 1)[1])
-            for x in range(1, page_num+1):
-                if self.novel_url.endswith('/'):
-                    soup = self.get_soup(self.novel_url + f'?p={x}')
-                else:
-                    soup = self.get_soup(self.novel_url + f'/?p={x}')
-                self.volumes.append(volume)
-                for tag in soup.select(".index_box .chapter_title, .index_box .subtitle a"):
-                    if 'chapter_title' in tag.attrs.get('class', ''):
-                        # Part/volume (there might be none)
-                        volume = {
-                            "id": volume['id'] + 1,
-                            "title": tag.text.strip(),
-                        }
-                        self.volumes.append(volume)
-                    elif tag.name == "a":
-                        # Chapter
-                        chapter_id += 1
-                        self.chapters.append(
-                            {
-                                "id": chapter_id,
-                                "volume": volume['id'],
-                                "title": tag.text.strip() or ("Chapter %d" % chapter_id),
-                                "url": self.absolute_url(tag["href"]),
-                            }
-                        )
-            #volume['id'] = volume['id'] + 1
+        soups = []
+        pager_last = soup.select_one("a[class='novelview_pager-last']")
+        if pager_last and 'href' in pager_last.attrs:
+            page_num = int(pager_last["href"].split("=")[-1])
+            for x in range(1, page_num + 1):
+                soup = self.get_soup(self.novel_url + f'{self.novel_url}?p={x}')
+                soups.append(soup)
         else:
-            self.volumes.append(volume)
+            soups.append(soup)
+
+        volume_id = 0
+        chapter_id = 0
+        self.volumes.append({ 'id': 0 })
+        for soup in soups:
             for tag in soup.select(".index_box .chapter_title, .index_box .subtitle a"):
                 if 'chapter_title' in tag.attrs.get('class', ''):
                     # Part/volume (there might be none)
-                    volume = {
-                        "id": volume['id'] + 1,
-                        "title": tag.text.strip(),
-                    }
-                    self.volumes.append(volume)
+                    volume_id += 1
+                    self.volumes.append({
+                        'id': volume_id,
+                        'title': tag.text.strip(),
+                    })
                 elif tag.name == "a":
                     # Chapter
                     chapter_id += 1
-                    self.chapters.append(
-                        {
-                            "id": chapter_id,
-                            "volume": volume['id'],
-                            "title": tag.text.strip() or ("Chapter %d" % chapter_id),
-                            "url": self.absolute_url(tag["href"]),
-                        }
-                    )
+                    self.chapters.append({
+                        "id": chapter_id,
+                        "volume": volume_id,
+                        "title": tag.text.strip(),
+                        "url": self.absolute_url(tag["href"]),
+                    })
 
     def download_chapter_body(self, chapter):
         soup = self.get_soup(chapter["url"])

From 6f513ab399f9cb20fe3d3d2d4eaeab5d1f794748 Mon Sep 17 00:00:00 2001
From: Sudipto Chandra
Date: Fri, 9 Feb 2024 18:15:16 +0400
Subject: [PATCH 5/7] Fix lint errors

---
 sources/jp/s/syosetu.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sources/jp/s/syosetu.py b/sources/jp/s/syosetu.py
index 0472602d0..d36a914ec 100644
--- a/sources/jp/s/syosetu.py
+++ b/sources/jp/s/syosetu.py
@@ -10,7 +10,7 @@ class SyosetuCrawler(Crawler):
 
     has_mtl = True
     base_url = "https://ncode.syosetu.com/"
-    
+
     def initialize(self) -> None:
         self.init_executor(2)
 
@@ -60,7 +60,7 @@ def read_novel_info(self):
 
         volume_id = 0
         chapter_id = 0
-        self.volumes.append({ 'id': 0 })
+        self.volumes.append({'id': 0})
         for soup in soups:
             for tag in soup.select(".index_box .chapter_title, .index_box .subtitle a"):
                 if 'chapter_title' in tag.attrs.get('class', ''):

From 98eb79048e9bd5d930eca5eac9a76638b8d2d7f7 Mon Sep 17 00:00:00 2001
From: Sudipto Chandra
Date: Fri, 9 Feb 2024 18:16:18 +0400
Subject: [PATCH 6/7] Fix lint errors

---
 sources/en/f/fanstrans.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sources/en/f/fanstrans.py b/sources/en/f/fanstrans.py
index e664f1afb..64c4a8d7d 100644
--- a/sources/en/f/fanstrans.py
+++ b/sources/en/f/fanstrans.py
@@ -38,7 +38,7 @@ class FansTranslations(Crawler):
 
     def initialize(self) -> None:
         self.cleaner.bad_tags.update(["h3"])
-        self.init_executor(4)
+        self.init_executor(4)
 
     def search_novel(self, query):
         query = query.lower().replace(" ", "+")

From 79c47f448432e5f3f265ac8fa307d725c4a3a07c Mon Sep 17 00:00:00 2001
From: Nilan Ekanayake <90630231+NilanEkanayake@users.noreply.github.com>
Date: Fri, 9 Feb 2024 11:04:03 -0400
Subject: [PATCH 7/7] Fix duplicate URL

---
 sources/jp/s/syosetu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sources/jp/s/syosetu.py b/sources/jp/s/syosetu.py
index d36a914ec..cec560b49 100644
--- a/sources/jp/s/syosetu.py
+++ b/sources/jp/s/syosetu.py
@@ -53,7 +53,7 @@ def read_novel_info(self):
         if pager_last and 'href' in pager_last.attrs:
             page_num = int(pager_last["href"].split("=")[-1])
             for x in range(1, page_num + 1):
-                soup = self.get_soup(self.novel_url + f'{self.novel_url}?p={x}')
+                soup = self.get_soup(f'{self.novel_url}?p={x}')
                 soups.append(soup)
         else:
             soups.append(soup)
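
Note on PATCH 7/7: the one-line fix is easiest to see with a concrete value. The PATCH 4/7 version prepended self.novel_url to an f-string that already contained it, so every paginated TOC request after the rewrite targeted a doubled URL. The ncode below is a made-up example:

    novel_url = "https://ncode.syosetu.com/n1234ab/"
    x = 2

    buggy = novel_url + f"{novel_url}?p={x}"  # PATCH 4/7 version
    fixed = f"{novel_url}?p={x}"              # PATCH 7/7 version

    # buggy == "https://ncode.syosetu.com/n1234ab/https://ncode.syosetu.com/n1234ab/?p=2"
    assert fixed == "https://ncode.syosetu.com/n1234ab/?p=2"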