Skip to content

Commit

Permalink
Merge pull request #1989 from dipu-bd/dev
Browse files Browse the repository at this point in the history
Update sources and fix an issue in crawler template
  • Loading branch information
dipu-bd committed Jul 1, 2023
2 parents 1d02e81 + 60b9750 commit 74fc656
Show file tree
Hide file tree
Showing 14 changed files with 478 additions and 477 deletions.
4 changes: 3 additions & 1 deletion .github/contribs.json
Original file line number Diff line number Diff line change
Expand Up @@ -66,5 +66,7 @@
"Yoga Setiawan": null,
"yogainformatika@gmail.com": null,
"dev ops": null,
"ismaelcomsci@gmail.com": null
"ismaelcomsci@gmail.com": null,
"Anuj2976": null,
"akakanuj@gmail.com": null
}
683 changes: 344 additions & 339 deletions README.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion lncrawl/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
3.2.6
3.2.7
4 changes: 2 additions & 2 deletions lncrawl/templates/browser/optional_volume.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@ def parse_volume_item_in_browser(self, tag: Tag, id: int) -> Volume:

def select_chapter_tags_in_browser(self, tag: Tag) -> Generator[Tag, None, None]:
"""Select chapter list item tags from volume tag from the browser"""
raise self.select_chapter_tags(tag)
return self.select_chapter_tags(tag)

def parse_chapter_item_in_browser(self, tag: Tag, id: int, vol: Volume) -> Chapter:
"""Parse a single chapter from chapter list item tag from the browser"""
raise self.parse_chapter_item(tag, id, vol)
return self.parse_chapter_item(tag, id, vol)
4 changes: 2 additions & 2 deletions lncrawl/templates/browser/with_volume.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ def select_chapter_tags_in_browser(
self, tag: Tag, vol: Volume
) -> Generator[Tag, None, None]:
"""Select chapter list item tags from volume tag from the browser"""
raise self.select_chapter_tags(tag, vol)
return self.select_chapter_tags(tag, vol)

def parse_chapter_item_in_browser(self, tag: Tag, id: int, vol: Volume) -> Chapter:
"""Parse a single chapter from chapter list item tag from the browser"""
raise self.parse_chapter_item(tag, id, vol)
return self.parse_chapter_item(tag, id, vol)
8 changes: 7 additions & 1 deletion lncrawl/templates/mangastream.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,9 @@ def parse_title_in_browser(self) -> str:
return self.parse_title(self.browser.soup)

def parse_cover(self, soup: BeautifulSoup) -> str:
tag = soup.select_one(".thumbook img, meta[property='og:image']")
tag = soup.select_one(
".thumbook img, meta[property='og:image'],.sertothumb img"
)
if tag.has_attr("data-src"):
return self.absolute_url(tag["data-src"])

Expand Down Expand Up @@ -84,3 +86,7 @@ def parse_chapter_item(self, tag: Tag, id: int, vol: Volume) -> Chapter:

def select_chapter_body(self, soup: BeautifulSoup) -> Tag:
return soup.select_one("#readernovel, #readerarea, .entry-content")

def visit_chapter_page_in_browser(self, chapter: Chapter) -> None:
self.visit(chapter.url)
self.browser.wait("#readernovel, #readerarea, .entry-content,.mainholder")
2 changes: 1 addition & 1 deletion lncrawl/templates/novelpub.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

logger = logging.getLogger(__name__)

digit_regex = re.compile(r"page-(\d+)$")
digit_regex = re.compile(r"page[-,=](\d+)")


class NovelPubTemplate(SearchableBrowserTemplate, ChapterOnlyBrowserTemplate):
Expand Down
2 changes: 1 addition & 1 deletion lncrawl/templates/novelupdates.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

automation_warning = """
<div style="opacity: 0.5; padding: 14px; text-align: center; border: 1px solid #000; font-style: italic; font-size: 0.825rem">
Parsed with an automated reader. The content accuracy is not guranteed.
Parsed with an automated reader. The content accuracy is not guaranteed.
</div>
""".strip()

Expand Down
2 changes: 1 addition & 1 deletion sources/_index.json

Large diffs are not rendered by default.

26 changes: 6 additions & 20 deletions sources/en/1/1stkissnovel.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,23 @@

logger = logging.getLogger(__name__)
search_url = (
"https://1stkissnovel.love/?s=%s&post_type=wp-manga&author=&artist=&release="
"%s?s=%s&post_type=wp-manga&author=&artist=&release="
)
wp_admin_ajax_url = "https://1stkissnovel.love/wp-admin/admin-ajax.php"


class OneKissNovelCrawler(Crawler):
has_mtl = True
base_url = "https://1stkissnovel.love/"
base_url = [
"https://1stkissnovel.org/",
"https://1stkissnovel.love/",
]

def initialize(self) -> None:
self.cleaner.bad_tags.update(["h3"])

def search_novel(self, query):
query = query.lower().replace(" ", "+")
soup = self.get_soup(search_url % query)
soup = self.get_soup(search_url % (self.home_url, query))

results = []
for tab in soup.select(".c-tabs-item__content"):
Expand All @@ -34,7 +36,6 @@ def search_novel(self, query):
"info": "%s | Rating: %s" % (latest, votes),
}
)

return results

def read_novel_info(self):
Expand All @@ -48,10 +49,8 @@ def read_novel_info(self):
logger.info("Novel title: %s", self.novel_title)

img_src = soup.select_one(".summary_image a img")

if img_src:
self.novel_cover = self.absolute_url(img_src["data-src"])

logger.info("Novel cover: %s", self.novel_cover)

self.novel_author = " ".join(
Expand All @@ -65,18 +64,6 @@ def read_novel_info(self):
self.novel_id = soup.select_one("#manga-chapters-holder")["data-id"]
logger.info("Novel id: %s", self.novel_id)

# For getting cookies
# self.submit_form(wp_admin_ajax_url, data={
# 'action': 'manga_views',
# 'manga': self.novel_id,
# })

# Deprecated way to fetch chapters
# response = self.submit_form(wp_admin_ajax_url, data={
# 'action': 'manga_get_chapters',
# 'manga': self.novel_id,
# })

clean_novel_url = self.novel_url.split("?")[0].strip("/")
response = self.submit_form(f"{clean_novel_url}/ajax/chapters/")

Expand All @@ -96,7 +83,6 @@ def read_novel_info(self):
)

def download_chapter_body(self, chapter):
logger.info("Visiting %s", chapter["url"])
soup = self.get_soup(chapter["url"])
contents = soup.select_one("div.text-left")
return self.cleaner.extract_contents(contents)
2 changes: 1 addition & 1 deletion sources/en/e/exiledrebels.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,5 +47,5 @@ def read_novel_info(self):

def download_chapter_body(self, chapter):
soup = self.get_soup(chapter["url"])
contents = soup.select("div#wtr-content")
contents = soup.select_one("div#wtr-content")
return self.cleaner.extract_contents(contents)
92 changes: 46 additions & 46 deletions sources/en/l/lightnovetrans.py
Original file line number Diff line number Diff line change
@@ -1,52 +1,52 @@
# -*- coding: utf-8 -*-

import logging
from lncrawl.core.crawler import Crawler
from typing import Generator, Union

from bs4 import BeautifulSoup, Tag

from lncrawl.models import Chapter, Volume
from lncrawl.templates.soup.general import GeneralSoupTemplate

logger = logging.getLogger(__name__)


class LNTCrawler(Crawler):
base_url = 'https://lightnovelstranslations.com/'

def read_novel_info(self):
soup = self.get_soup(self.novel_url)

possible_title = soup.select_one('h1.entry-title')
assert possible_title, 'No novel title'
self.novel_title = possible_title.text

possible_cover = soup.select_one('meta[property="og:image"]')
if possible_cover:
self.novel_cover = self.absolute_url(possible_cover['content'])

for p in soup.select('.entry-content > p'):
if 'Author' in p.text:
self.novel_author = p.text.replace('Author:', '').strip()
break

for div in soup.select('.entry-content .su-spoiler'):
vol = div.select_one('.su-spoiler-title').text.strip()
vol_id = int(vol) if vol.isdigit() else len(self.volumes) + 1
self.volumes.append({
'id': vol_id,
'title': vol,
})
for a in div.select('.su-spoiler-content p a'):
if not a.has_attr('href'):
continue
self.chapters.append({
'id': len(self.chapters) + 1,
'volume': vol_id,
'title': a.text.strip(),
'url': self.absolute_url(a['href']),
})

def download_chapter_body(self, chapter):
logger.info('Visiting: %s', chapter['url'])
soup = self.get_soup(chapter['url'])

content = soup.select_one('.entry-content')
for bad in content.select('.alignleft, .alignright, hr, p[style*="text-align: center"]'):
bad.extract()

return '\n'.join([str(p) for p in content.find_all('p')])
class LNTCrawler(GeneralSoupTemplate):
    """Crawler for lightnovelstranslations.com built on GeneralSoupTemplate.

    Each hook below receives (or produces) a BeautifulSoup document and
    extracts one piece of novel metadata using CSS selectors.
    """

    base_url = ["https://lightnovelstranslations.com/"]

    has_manga = False
    has_mtl = False

    def get_novel_soup(self) -> BeautifulSoup:
        """Fetch the novel page with the ``tab=table_contents`` query applied."""
        # Drop any existing query string and trailing slash first, so the
        # request never becomes ".../novel//?tab=..." or carries two "?" parts.
        clean_novel_url = self.novel_url.split("?")[0].rstrip("/")
        return self.get_soup(f"{clean_novel_url}/?tab=table_contents")

    def parse_title(self, soup: BeautifulSoup) -> str:
        """Return the stripped text of the ``.novel_title`` element.

        Raises:
            AssertionError: if the element is not present in ``soup``.
        """
        tag = soup.select_one(".novel_title")
        assert tag, "No novel title"
        return tag.text.strip()

    def parse_cover(self, soup: BeautifulSoup) -> str:
        """Return the absolute cover URL taken from ``.novel-image img``.

        ``data-src`` is preferred over ``src`` when both are present.

        Raises:
            AssertionError: if the image element is not present in ``soup``.
        """
        tag = soup.select_one(".novel-image img")
        assert tag, "No novel cover"
        for attr in ("data-src", "src"):
            if tag.has_attr(attr):
                return self.absolute_url(tag[attr])
        # NOTE(review): when the <img> has neither attribute this falls through
        # and returns None despite the ``-> str`` annotation; kept as-is because
        # the caller's handling of a missing cover is not visible here.

    def parse_authors(self, soup: BeautifulSoup) -> Generator[str, None, None]:
        """Yield author names from ``.entry-content > p`` paragraphs.

        A paragraph is used when its text contains "Author"; the literal
        "Author:" prefix is removed and the remainder is stripped.
        """
        for p in soup.select(".entry-content > p"):
            if "Author" in p.text:
                yield p.text.replace("Author:", "").strip()

    def parse_chapter_list(
        self, soup: BeautifulSoup
    ) -> Generator[Union[Chapter, Volume], None, None]:
        """Yield one Chapter per ``li.unlock`` link in the chapter list.

        Chapter ids are sequential and start at 1.
        """
        links = (
            a
            for a in soup.select(".novel_list_chapter_content li.unlock a")
            # An anchor without href would raise KeyError below and abort the
            # whole generator, so skip it instead.
            if a.has_attr("href")
        )
        for _id, a in enumerate(links, start=1):
            yield Chapter(
                id=_id, url=self.absolute_url(a["href"]), title=a.text.strip()
            )

    def select_chapter_body(self, soup: BeautifulSoup) -> Tag:
        """Return the ``.text_story`` element that holds the chapter text."""
        return soup.select_one(".text_story")
122 changes: 62 additions & 60 deletions sources/en/n/novelsonline.py
Original file line number Diff line number Diff line change
@@ -1,76 +1,78 @@
# -*- coding: utf-8 -*-

import logging
import re
from lncrawl.core.crawler import Crawler
from typing import Generator, Union

logger = logging.getLogger(__name__)
search_url = "https://novelsonline.net/search/autocomplete"
from bs4 import BeautifulSoup, Tag

from lncrawl.models import Chapter, Volume
from lncrawl.templates.browser.general import GeneralBrowserTemplate

class NovelsOnline(Crawler):
base_url = "https://novelsonline.net/"
logger = logging.getLogger(__name__)

def read_novel_info(self):
logger.debug("Visiting %s", self.novel_url)
soup = self.get_soup(self.novel_url)

possible_title = soup.select_one(".block-title h1")
assert possible_title, "No novel title"
self.novel_title = possible_title.text
logger.info("Novel title: %s", self.novel_title)
class NovelsOnline(GeneralBrowserTemplate):
base_url = ["https://novelsonline.net/"]
has_manga = False
has_mtl = False

self.novel_cover = self.absolute_url(
soup.find("img", {"alt": self.novel_title})["src"]
# TODO: [OPTIONAL] This is called before all other methods.
def initialize(self) -> None:
self.cleaner.bad_tags.update(["div"])
self.cleaner.bad_css.update(
[
".trinity-player-iframe-wrapper",
".hidden",
".ads-title",
"script",
"center",
"interaction",
"a[href*=remove-ads]",
"a[target=_blank]",
"hr",
"br",
"#growfoodsmart",
".col-md-6",
".trv_player_container",
".ad1",
]
)
logger.info("Novel cover: %s", self.novel_cover)

author_link = soup.select_one("a[href*=author]")
if author_link:
self.novel_author = author_link.text.strip().title()
logger.info("Novel author: %s", self.novel_author)

volume_ids = set()
for a in soup.select(".chapters .chapter-chs li a"):
chap_id = len(self.chapters) + 1
vol_id = (chap_id - 1) // 100 + 1
volume_ids.add(vol_id)
self.chapters.append(
{
"id": chap_id,
"volume": vol_id,
"url": self.absolute_url(a["href"]),
"title": a.text.strip() or ("Chapter %d" % chap_id),
}
)
# TODO: [OPTIONAL] Open the Novel URL in the browser
def visit_novel_page_in_browser(self) -> BeautifulSoup:
self.visit(self.novel_url)
self.browser.wait(".container--content")

self.volumes = [{"id": i} for i in volume_ids]
def parse_title(self, soup: BeautifulSoup) -> str:
tag = soup.select_one(".block-title h1")
assert tag
return tag.text.strip()

def download_chapter_body(self, chapter):
soup = self.get_soup(chapter["url"])
def parse_cover(self, soup: BeautifulSoup) -> str:
tag = soup.find("img", {"alt": self.novel_title})
assert tag
if tag.has_attr("data-src"):
return self.absolute_url(tag["data-src"])
elif tag.has_attr("src"):
return self.absolute_url(tag["src"])

div = soup.select_one(".chapter-content3")
def parse_authors(self, soup: BeautifulSoup) -> Generator[str, None, None]:
for a in soup.select("a[href*=author]"):
yield a.text.strip()

bad_selectors = [
".trinity-player-iframe-wrapper" ".hidden",
".ads-title",
"script",
"center",
"interaction",
"a[href*=remove-ads]",
"a[target=_blank]",
"hr",
"br",
"#growfoodsmart",
".col-md-6",
]
for hidden in div.select(", ".join(bad_selectors)):
hidden.extract()
def parse_chapter_list(
self, soup: BeautifulSoup
) -> Generator[Union[Chapter, Volume], None, None]:
_id = 0
for a in soup.select(".chapters .chapter-chs li a"):
_id += 1
yield Chapter(
id=_id, url=self.absolute_url(a["href"]), title=a.text.strip()
)

body = self.cleaner.extract_contents(div)
if re.search(r"c?hapter .?\d+", body[0], re.IGNORECASE):
title = body[0].replace("<strong>", "").replace("</strong>", "").strip()
title = ("C" if title.startswith("hapter") else "") + title
chapter["title"] = title.strip()
body = body[1:]
def visit_chapter_page_in_browser(self, chapter: Chapter) -> None:
self.visit(chapter.url)
self.browser.wait(".container--content")

return "<p>" + "</p><p>".join(body) + "</p>"
def select_chapter_body(self, soup: BeautifulSoup) -> Tag:
return soup.select_one("#contentall")
Loading

0 comments on commit 74fc656

Please sign in to comment.