-
Notifications
You must be signed in to change notification settings - Fork 0
/
test2wr.py
88 lines (81 loc) · 4.23 KB
/
test2wr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import requests
from bs4 import BeautifulSoup
# Function to convert HTML to plain text
def html_to_text(url):
response = requests.get(url)
if response.status_code == 200:
soup = BeautifulSoup(response.content, 'html.parser')
# Extract text from HTML content
text = soup.get_text(separator='\n')
return text
else:
print("Failed to retrieve content from the URL.")
return None
# Function to filter out header and filler text and extract individual posts
def filter_and_extract_posts(text, keywords):
# Split the text using the separator "Be the First to comment"
posts = text.split("Be the First to Comment")
# Filter out empty strings and whitespace
posts = [post.strip() for post in posts if post.strip()]
# Filter out header and filler text, and potential bot posts
filtered_posts = []
for post in posts:
# Check if the post contains any keywords and doesn't contain "http" or "https"
if any(keyword in post.lower() for keyword in keywords) and "http" not in post.lower():
filtered_posts.append(post)
return filtered_posts
# Function to scrape posts for a given keyword and page number
def scrape_posts_for_keyword(keyword, page):
url = f"https://www.somewheretowrite.com/page/{page}?s={keyword}"
text = html_to_text(url)
return text
# Define the happy keywords to search for
happy_keywords = ["joyous", "celebration", "wonderful", "amazing", "delightful", "ecstatic", "blissful", "cheerful", "exuberant", "jubilant", "euphoric", "thrilled", "content", "elated", "gleeful", "grateful", "happy", "merry", "radiant", "sunny", "upbeat", "victorious", "vivacious", "zestful", "blessed", "fortunate", "lucky", "jolly", "smiling", "joy", "happiness", "excited", "positive"]
# Define the sad keywords to search for
sad_keywords = ["death", "sad", "depressed", "worst", "miserable", "hate", "unhappy", "tragic", "grief", "heartbroken", "sorrow", "melancholy", "gloomy", "grief-stricken", "despair", "disheartened", "tearful", "unfortunate", "bleak", "desolate", "forlorn", "dejected", "woeful", "anguish", "dismal", "unbearable", "painful", "distraught", "regretful", "bereaved", "pain", "suffering", "negative", "downcast"]
# Iterate over each happy keyword and scrape posts
for keyword in happy_keywords:
print(f"Scraping posts for happy keyword '{keyword}':")
page = 1
while True:
text = scrape_posts_for_keyword(keyword, page)
if text and "Be the First to Comment" in text:
# Filter and extract individual posts
posts = filter_and_extract_posts(text, happy_keywords)
if posts:
# Print the extracted posts
print(f"Extracted posts for happy keyword '{keyword}', page {page}:")
for i, post in enumerate(posts, start=1):
print(f"Post {i}:")
print(post)
print("-" * 50)
page += 1
else:
print(f"No relevant posts found for happy keyword '{keyword}' on page {page}.")
break
else:
print(f"No content to analyze for happy keyword '{keyword}' on page {page}.")
break
# Iterate over each sad keyword and scrape posts
for keyword in sad_keywords:
print(f"Scraping posts for sad keyword '{keyword}':")
page = 1
while True:
text = scrape_posts_for_keyword(keyword, page)
if text and "Be the First to Comment" in text:
# Filter and extract individual posts
posts = filter_and_extract_posts(text, sad_keywords)
if posts:
# Print the extracted posts
print(f"Extracted posts for sad keyword '{keyword}', page {page}:")
for i, post in enumerate(posts, start=1):
print(f"Post {i}:")
print(post)
print("-" * 50)
page += 1
else:
print(f"No relevant posts found for sad keyword '{keyword}' on page {page}.")
break
else:
print(f"No content to analyze for sad keyword '{keyword}' on page {page}.")
break