Skip to content

Commit

Permalink
Merge pull request #39 from UNICT-DMI/update-scraper
Browse files (browse the repository at this point in the history)
fix: new scraper for new html structure
  • Loading branch information
DefEnge authored Feb 19, 2024
2 parents 82adde6 + e570112 commit 59e65c3
Showing 1 changed file with 7 additions and 6 deletions.
13 changes: 7 additions & 6 deletions module/scraper/scraper.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -21,21 +21,22 @@ def get_html(url: str) -> str:
def find_info(article: any) -> tuple:

# finds the info of an article
href_article = article.find_all('a', href=True, title=True)
href_article = article.find_all("a", href=True)

if len(href_article) > 1:
title_article = href_article[1].get_text()
else:
title_article = None

div_article = article.find('div', {'class': 'slide-meta'})
link_article = article.find('a')
link_article = link_article['href']

time_article = None
div_article = article.find("div", {"class": "sow-entry-meta"})
if div_article is not None:
time_article = div_article.find('time').get_text()
else:
time_article = None
time_el = div_article.find("time")
if time_el is not None:
time_article = div_article.find("time").get_text().strip()

# find the tag of the article and content
html_article = get_html(link_article)
Expand All @@ -48,7 +49,7 @@ def find_info(article: any) -> tuple:
else:
tag_article = soup_article.find_all('a', {'rel': 'tag'})[-1].get_text()

content_article = soup_article.find('div', {'class': 'entry-content'})
content_article = soup_article.find("section", {"class": "entry-content"})
if content_article is not None:
paragraph = content_article.find('p')
if paragraph:
Expand Down

0 comments on commit 59e65c3

Please sign in to comment.