Merge pull request #1989 from dipu-bd/dev

Update sources and fix an issue in crawler template
dipu-bd · Jul 1, 2023 · 74fc656
2 parents 1d02e81 + 60b9750
commit 74fc656
Show file tree

Hide file tree

Showing 14 changed files with 478 additions and 477 deletions.
diff --git a/.github/contribs.json b/.github/contribs.json
@@ -66,5 +66,7 @@
   "Yoga Setiawan": null,
   "yogainformatika@gmail.com": null,
   "dev ops": null,
-  "ismaelcomsci@gmail.com": null
+  "ismaelcomsci@gmail.com": null,
+  "Anuj2976": null,
+  "akakanuj@gmail.com": null
 }
diff --git a/README.md b/README.md
diff --git a/lncrawl/VERSION b/lncrawl/VERSION
@@ -1 +1 @@
-3.2.6
+3.2.7
diff --git a/lncrawl/templates/browser/optional_volume.py b/lncrawl/templates/browser/optional_volume.py
@@ -57,8 +57,8 @@ def parse_volume_item_in_browser(self, tag: Tag, id: int) -> Volume:
 
     def select_chapter_tags_in_browser(self, tag: Tag) -> Generator[Tag, None, None]:
         """Select chapter list item tags from volume tag from the browser"""
-        raise self.select_chapter_tags(tag)
+        return self.select_chapter_tags(tag)
 
     def parse_chapter_item_in_browser(self, tag: Tag, id: int, vol: Volume) -> Chapter:
         """Parse a single chapter from chapter list item tag from the browser"""
-        raise self.parse_chapter_item(tag, id, vol)
+        return self.parse_chapter_item(tag, id, vol)
diff --git a/lncrawl/templates/browser/with_volume.py b/lncrawl/templates/browser/with_volume.py
@@ -43,8 +43,8 @@ def select_chapter_tags_in_browser(
         self, tag: Tag, vol: Volume
     ) -> Generator[Tag, None, None]:
         """Select chapter list item tags from volume tag from the browser"""
-        raise self.select_chapter_tags(tag, vol)
+        return self.select_chapter_tags(tag, vol)
 
     def parse_chapter_item_in_browser(self, tag: Tag, id: int, vol: Volume) -> Chapter:
         """Parse a single chapter from chapter list item tag from the browser"""
-        raise self.parse_chapter_item(tag, id, vol)
+        return self.parse_chapter_item(tag, id, vol)
diff --git a/lncrawl/templates/mangastream.py b/lncrawl/templates/mangastream.py
@@ -43,7 +43,9 @@ def parse_title_in_browser(self) -> str:
         return self.parse_title(self.browser.soup)
 
     def parse_cover(self, soup: BeautifulSoup) -> str:
-        tag = soup.select_one(".thumbook img, meta[property='og:image']")
+        tag = soup.select_one(
+            ".thumbook img, meta[property='og:image'],.sertothumb img"
+        )
         if tag.has_attr("data-src"):
             return self.absolute_url(tag["data-src"])
 
@@ -84,3 +86,7 @@ def parse_chapter_item(self, tag: Tag, id: int, vol: Volume) -> Chapter:
 
     def select_chapter_body(self, soup: BeautifulSoup) -> Tag:
         return soup.select_one("#readernovel, #readerarea, .entry-content")
+
+    def visit_chapter_page_in_browser(self, chapter: Chapter) -> None:
+        self.visit(chapter.url)
+        self.browser.wait("#readernovel, #readerarea, .entry-content,.mainholder")
diff --git a/lncrawl/templates/novelpub.py b/lncrawl/templates/novelpub.py
@@ -12,7 +12,7 @@
 
 logger = logging.getLogger(__name__)
 
-digit_regex = re.compile(r"page-(\d+)$")
+digit_regex = re.compile(r"page[-,=](\d+)")
 
 
 class NovelPubTemplate(SearchableBrowserTemplate, ChapterOnlyBrowserTemplate):

diff --git a/lncrawl/templates/novelupdates.py b/lncrawl/templates/novelupdates.py
@@ -18,7 +18,7 @@
 
 automation_warning = """
 <div style="opacity: 0.5; padding: 14px; text-align: center; border: 1px solid #000; font-style: italic; font-size: 0.825rem">
-    Parsed with an automated reader. The content accuracy is not guranteed.
+    Parsed with an automated reader. The content accuracy is not guaranteed.
 </div>
 """.strip()
 

diff --git a/sources/_index.json b/sources/_index.json
diff --git a/sources/en/1/1stkissnovel.py b/sources/en/1/1stkissnovel.py
@@ -6,21 +6,23 @@
 
 logger = logging.getLogger(__name__)
 search_url = (
-    "https://1stkissnovel.love/?s=%s&post_type=wp-manga&author=&artist=&release="
+    "%s?s=%s&post_type=wp-manga&author=&artist=&release="
 )
-wp_admin_ajax_url = "https://1stkissnovel.love/wp-admin/admin-ajax.php"
 
 
 class OneKissNovelCrawler(Crawler):
     has_mtl = True
-    base_url = "https://1stkissnovel.love/"
+    base_url = [
+        "https://1stkissnovel.org/",
+        "https://1stkissnovel.love/",
+    ]
 
     def initialize(self) -> None:
         self.cleaner.bad_tags.update(["h3"])
 
     def search_novel(self, query):
         query = query.lower().replace(" ", "+")
-        soup = self.get_soup(search_url % query)
+        soup = self.get_soup(search_url % (self.home_url, query))
 
         results = []
         for tab in soup.select(".c-tabs-item__content"):
@@ -34,7 +36,6 @@ def search_novel(self, query):
                     "info": "%s | Rating: %s" % (latest, votes),
                 }
             )
-
         return results
 
     def read_novel_info(self):
@@ -48,10 +49,8 @@ def read_novel_info(self):
         logger.info("Novel title: %s", self.novel_title)
 
         img_src = soup.select_one(".summary_image a img")
-
         if img_src:
             self.novel_cover = self.absolute_url(img_src["data-src"])
-
         logger.info("Novel cover: %s", self.novel_cover)
 
         self.novel_author = " ".join(
@@ -65,18 +64,6 @@ def read_novel_info(self):
         self.novel_id = soup.select_one("#manga-chapters-holder")["data-id"]
         logger.info("Novel id: %s", self.novel_id)
 
-        # For getting cookies
-        # self.submit_form(wp_admin_ajax_url, data={
-        #    'action': 'manga_views',
-        #    'manga': self.novel_id,
-        # })
-
-        # Deprecated way to fetch chapters
-        # response = self.submit_form(wp_admin_ajax_url, data={
-        #     'action': 'manga_get_chapters',
-        #     'manga': self.novel_id,
-        # })
-
         clean_novel_url = self.novel_url.split("?")[0].strip("/")
         response = self.submit_form(f"{clean_novel_url}/ajax/chapters/")
 
@@ -96,7 +83,6 @@ def read_novel_info(self):
             )
 
     def download_chapter_body(self, chapter):
-        logger.info("Visiting %s", chapter["url"])
         soup = self.get_soup(chapter["url"])
         contents = soup.select_one("div.text-left")
         return self.cleaner.extract_contents(contents)
diff --git a/sources/en/e/exiledrebels.py b/sources/en/e/exiledrebels.py
@@ -47,5 +47,5 @@ def read_novel_info(self):
 
     def download_chapter_body(self, chapter):
         soup = self.get_soup(chapter["url"])
-        contents = soup.select("div#wtr-content")
+        contents = soup.select_one("div#wtr-content")
         return self.cleaner.extract_contents(contents)
diff --git a/sources/en/l/lightnovetrans.py b/sources/en/l/lightnovetrans.py
@@ -1,52 +1,52 @@
 # -*- coding: utf-8 -*-
+
 import logging
-from lncrawl.core.crawler import Crawler
+from typing import Generator, Union
+
+from bs4 import BeautifulSoup, Tag
+
+from lncrawl.models import Chapter, Volume
+from lncrawl.templates.soup.general import GeneralSoupTemplate
 
 logger = logging.getLogger(__name__)
 
 
-class LNTCrawler(Crawler):
-    base_url = 'https://lightnovelstranslations.com/'
-
-    def read_novel_info(self):
-        soup = self.get_soup(self.novel_url)
-
-        possible_title = soup.select_one('h1.entry-title')
-        assert possible_title, 'No novel title'
-        self.novel_title = possible_title.text
-
-        possible_cover = soup.select_one('meta[property="og:image"]')
-        if possible_cover:
-            self.novel_cover = self.absolute_url(possible_cover['content'])
-
-        for p in soup.select('.entry-content > p'):
-            if 'Author' in p.text:
-                self.novel_author = p.text.replace('Author:', '').strip()
-                break
-
-        for div in soup.select('.entry-content .su-spoiler'):
-            vol = div.select_one('.su-spoiler-title').text.strip()
-            vol_id = int(vol) if vol.isdigit() else len(self.volumes) + 1
-            self.volumes.append({
-                'id': vol_id,
-                'title': vol,
-            })
-            for a in div.select('.su-spoiler-content p a'):
-                if not a.has_attr('href'):
-                    continue
-                self.chapters.append({
-                    'id': len(self.chapters) + 1,
-                    'volume': vol_id,
-                    'title': a.text.strip(),
-                    'url': self.absolute_url(a['href']),
-                })
-
-    def download_chapter_body(self, chapter):
-        logger.info('Visiting: %s', chapter['url'])
-        soup = self.get_soup(chapter['url'])
-
-        content = soup.select_one('.entry-content')
-        for bad in content.select('.alignleft, .alignright, hr, p[style*="text-align: center"]'):
-            bad.extract()
-
-        return '\n'.join([str(p) for p in content.find_all('p')])
+class LNTCrawler(GeneralSoupTemplate):
+    base_url = ["https://lightnovelstranslations.com/"]
+
+    has_manga = False
+    has_mtl = False
+
+    def get_novel_soup(self) -> BeautifulSoup:
+        return self.get_soup(f"{self.novel_url}/?tab=table_contents")
+
+    def parse_title(self, soup: BeautifulSoup) -> str:
+        tag = soup.select_one(".novel_title")
+        assert tag
+        return tag.text.strip()
+
+    def parse_cover(self, soup: BeautifulSoup) -> str:
+        tag = soup.select_one(".novel-image img")
+        assert tag
+        if tag.has_attr("data-src"):
+            return self.absolute_url(tag["data-src"])
+        if tag.has_attr("src"):
+            return self.absolute_url(tag["src"])
+
+    def parse_authors(self, soup: BeautifulSoup) -> Generator[str, None, None]:
+        for p in soup.select(".entry-content > p"):
+            if "Author" in p.text:
+                yield p.text.replace("Author:", "").strip()
+
+    def parse_chapter_list(
+        self, soup: BeautifulSoup
+    ) -> Generator[Union[Chapter, Volume], None, None]:
+        _id = 0
+        for a in soup.select(".novel_list_chapter_content li.unlock a"):
+            _id += 1
+            yield Chapter(
+                id=_id, url=self.absolute_url(a["href"]), title=a.text.strip()
+            )
+
+    def select_chapter_body(self, soup: BeautifulSoup) -> Tag:
+        return soup.select_one(".text_story")
diff --git a/sources/en/n/novelsonline.py b/sources/en/n/novelsonline.py
@@ -1,76 +1,78 @@
 # -*- coding: utf-8 -*-
+
 import logging
-import re
-from lncrawl.core.crawler import Crawler
+from typing import Generator, Union
 
-logger = logging.getLogger(__name__)
-search_url = "https://novelsonline.net/search/autocomplete"
+from bs4 import BeautifulSoup, Tag
 
+from lncrawl.models import Chapter, Volume
+from lncrawl.templates.browser.general import GeneralBrowserTemplate
 
-class NovelsOnline(Crawler):
-    base_url = "https://novelsonline.net/"
+logger = logging.getLogger(__name__)
 
-    def read_novel_info(self):
-        logger.debug("Visiting %s", self.novel_url)
-        soup = self.get_soup(self.novel_url)
 
-        possible_title = soup.select_one(".block-title h1")
-        assert possible_title, "No novel title"
-        self.novel_title = possible_title.text
-        logger.info("Novel title: %s", self.novel_title)
+class NovelsOnline(GeneralBrowserTemplate):
+    base_url = ["https://novelsonline.net/"]
+    has_manga = False
+    has_mtl = False
 
-        self.novel_cover = self.absolute_url(
-            soup.find("img", {"alt": self.novel_title})["src"]
+    # TODO: [OPTIONAL] This is called before all other methods.
+    def initialize(self) -> None:
+        self.cleaner.bad_tags.update(["div"])
+        self.cleaner.bad_css.update(
+            [
+                ".trinity-player-iframe-wrapper",
+                ".hidden",
+                ".ads-title",
+                "script",
+                "center",
+                "interaction",
+                "a[href*=remove-ads]",
+                "a[target=_blank]",
+                "hr",
+                "br",
+                "#growfoodsmart",
+                ".col-md-6",
+                ".trv_player_container",
+                ".ad1",
+            ]
         )
-        logger.info("Novel cover: %s", self.novel_cover)
-
-        author_link = soup.select_one("a[href*=author]")
-        if author_link:
-            self.novel_author = author_link.text.strip().title()
-        logger.info("Novel author: %s", self.novel_author)
 
-        volume_ids = set()
-        for a in soup.select(".chapters .chapter-chs li a"):
-            chap_id = len(self.chapters) + 1
-            vol_id = (chap_id - 1) // 100 + 1
-            volume_ids.add(vol_id)
-            self.chapters.append(
-                {
-                    "id": chap_id,
-                    "volume": vol_id,
-                    "url": self.absolute_url(a["href"]),
-                    "title": a.text.strip() or ("Chapter %d" % chap_id),
-                }
-            )
+    # TODO: [OPTIONAL] Open the Novel URL in the browser
+    def visit_novel_page_in_browser(self) -> BeautifulSoup:
+        self.visit(self.novel_url)
+        self.browser.wait(".container--content")
 
-        self.volumes = [{"id": i} for i in volume_ids]
+    def parse_title(self, soup: BeautifulSoup) -> str:
+        tag = soup.select_one(".block-title h1")
+        assert tag
+        return tag.text.strip()
 
-    def download_chapter_body(self, chapter):
-        soup = self.get_soup(chapter["url"])
+    def parse_cover(self, soup: BeautifulSoup) -> str:
+        tag = soup.find("img", {"alt": self.novel_title})
+        assert tag
+        if tag.has_attr("data-src"):
+            return self.absolute_url(tag["data-src"])
+        elif tag.has_attr("src"):
+            return self.absolute_url(tag["src"])
 
-        div = soup.select_one(".chapter-content3")
+    def parse_authors(self, soup: BeautifulSoup) -> Generator[str, None, None]:
+        for a in soup.select("a[href*=author]"):
+            yield a.text.strip()
 
-        bad_selectors = [
-            ".trinity-player-iframe-wrapper" ".hidden",
-            ".ads-title",
-            "script",
-            "center",
-            "interaction",
-            "a[href*=remove-ads]",
-            "a[target=_blank]",
-            "hr",
-            "br",
-            "#growfoodsmart",
-            ".col-md-6",
-        ]
-        for hidden in div.select(", ".join(bad_selectors)):
-            hidden.extract()
+    def parse_chapter_list(
+        self, soup: BeautifulSoup
+    ) -> Generator[Union[Chapter, Volume], None, None]:
+        _id = 0
+        for a in soup.select(".chapters .chapter-chs li a"):
+            _id += 1
+            yield Chapter(
+                id=_id, url=self.absolute_url(a["href"]), title=a.text.strip()
+            )
 
-        body = self.cleaner.extract_contents(div)
-        if re.search(r"c?hapter .?\d+", body[0], re.IGNORECASE):
-            title = body[0].replace("<strong>", "").replace("</strong>", "").strip()
-            title = ("C" if title.startswith("hapter") else "") + title
-            chapter["title"] = title.strip()
-            body = body[1:]
+    def visit_chapter_page_in_browser(self, chapter: Chapter) -> None:
+        self.visit(chapter.url)
+        self.browser.wait(".container--content")
 
-        return "<p>" + "</p><p>".join(body) + "</p>"
+    def select_chapter_body(self, soup: BeautifulSoup) -> Tag:
+        return soup.select_one("#contentall")