-
Notifications
You must be signed in to change notification settings - Fork 0
/
solvers.py
149 lines (122 loc) · 6.08 KB
/
solvers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
""" Solvers for the HQ Trivia bot project """
import re
import sys
from urllib.parse import quote_plus
from bs4 import BeautifulSoup
from utils import Colours, get_raw_words, get_significant_words
class BaseSolver(object):
    """ Common workflow shared by the question solvers.

    Subclasses provide ``build_queries`` and ``get_answer_matches``. This
    base class turns the queries into URLs, fetches them, converts match
    counts into weighted confidence scores and picks an answer key.
    """

    # Multiplier applied to each answer's percentage share of the matches.
    weight = 0
    # URL template containing one ``{}`` placeholder for the encoded query.
    service_url = None

    @staticmethod
    def build_queries(question_text, answers):
        """ Return a dict of query strings; subclasses must override this. """
        raise NotImplementedError()

    def build_urls(self, question_text, answers):
        """ Build one search URL per query returned by ``build_queries``.

        :param question_text: string of the question
        :param answers: dict of possible answer values
        :returns: dict mapping each query key to a formatted service URL
        """
        # The negation markers are removed from the searched text only;
        # choose_answer looks for them in the question it is given.
        searchable_text = question_text.replace(' NOT ', ' ').replace(' NEVER ', ' ')
        queries = self.build_queries(searchable_text, answers)
        return {
            key: self.service_url.format(quote_plus(query))
            for key, query in queries.items()
        }

    @staticmethod
    def fetch_responses(urls, session):
        """ Call ``session.get`` on every URL and key the results like ``urls``. """
        return {key: session.get(target) for key, target in urls.items()}

    def get_answer_matches(self, response, answer_key, answers, matches):
        """ Return updated match counts for a response; subclasses must override this. """
        raise NotImplementedError()

    def compute_confidence(self, matches, confidence):
        """ Add each answer's weighted share of the matches to ``confidence``.

        ``confidence`` is updated in place and also returned. It is left
        untouched when the match counts sum to zero.
        """
        overall = sum(matches.values())
        if not overall:
            return confidence
        for key, hits in matches.items():
            percentage = (hits / overall) * 100
            confidence[key] += int(percentage * self.weight)
        return confidence

    @staticmethod
    def choose_answer(question_text, confidence):
        """ Pick an answer key from the confidence scores.

        Returns 'A' when the scores sum to zero. Otherwise the highest
        scoring key wins, or the lowest scoring key when the question
        contains ' NOT ' or ' NEVER '.
        """
        if sum(confidence.values()) == 0:
            return 'A'
        negated = any(marker in question_text for marker in (' NOT ', ' NEVER '))
        selector = min if negated else max
        return selector(confidence, key=confidence.get)

    def run(self, question_text, answers, responses, confidence):
        """ Score every response and return ``(prediction, confidence)``.

        :param question_text: string of the question
        :param answers: dict of possible answer values
        :param responses: dict of responses, or of objects exposing
            ``result()`` that yield a response
        :param confidence: dict of running confidence scores per answer key
        """
        # Split the CamelCase class name into words, then trim the last
        # seven characters (" Solver" for classes named ...Solver).
        heading = re.sub(r'(\w)([A-Z])', r'\1 \2', self.__class__.__name__)[:-7]
        print('\n%s: ' % (heading))

        tally = {'A': 0, 'B': 0, 'C': 0}
        for answer_key, pending in responses.items():
            if hasattr(pending, 'result'):
                response = pending.result()
            else:
                response = pending
            # This path in the final URL is treated as Google's rate limit page.
            if '/sorry/index?continue=' in response.url:
                sys.exit('ERROR: Google rate limiting detected.')
            tally = self.get_answer_matches(response, answer_key, answers, tally)

        confidence = self.compute_confidence(tally, confidence)
        return self.choose_answer(question_text, confidence), confidence
class GoogleAnswerWordsSolver(BaseSolver):
    """ Solver that searches the question on Google and counts how often
    each answer appears in the text of the results page """

    weight = 200
    # Score added per occurrence of the complete answer text.
    full_answer_weight = 1000
    # Score added per occurrence of a single significant answer word.
    partial_answer_weight = 100
    service_url = 'https://www.google.co.uk/search?pws=0&q={}'

    @staticmethod
    def build_queries(question_text, answers):
        """ Build a single query shared by all answers.

        The ``_`` key marks the query as universal rather than tied to one
        answer key.

        :param question_text: string of the question
        :param answers: dict of possible answer values keyed A, B or C
        :returns: dict with one universal query for all answers
        """
        return {'_': question_text}

    def get_answer_matches(self, response, _answer_key, answers, matches):
        """ Count exact and partial answer occurrences in a results page.

        :param response: response whose ``text`` is the results page HTML
        :param _answer_key: unused; the single response covers every answer
        :param answers: dict of possible answer values
        :param matches: dict of running match scores, updated and returned
        """
        document = BeautifulSoup(response.text, "html5lib")

        # CSS classes scraped, in order: result descriptions, result titles,
        # quick answer card, related searches.
        fragments = []
        for css_class in ('st', 'r', 'mod', 'brs_col'):
            for element in document.find_all(class_=css_class):
                fragments.append(" " + element.text)
        # NOTE(review): get_raw_words comes from utils; the space-padded
        # count() calls below assume it returns a space-separated string.
        results_words = get_raw_words(''.join(fragments))

        print('Exact matches: ')
        for answer_key, answer in answers.items():
            needle = ' {} '.format(get_raw_words(answer))
            count = results_words.count(needle)
            matches[answer_key] += self.full_answer_weight * count
            highlighted = Colours.BOLD.value + str(count) + Colours.ENDC.value
            print('{}: {}'.format(answer_key, highlighted))

        print('\nPartial matches: ')
        for answer_key, answer in answers.items():
            count = sum(
                results_words.count(' {} '.format(word))
                for word in get_significant_words(get_raw_words(answer))
            )
            matches[answer_key] += self.partial_answer_weight * count
            highlighted = Colours.BOLD.value + str(count) + Colours.ENDC.value
            print('{}: {}'.format(answer_key, highlighted))

        return matches
class GoogleResultsCountSolver(BaseSolver):
    """ Solver that searches the question together with each quoted answer
    on Google and compares the reported number of results """

    weight = 100
    service_url = 'https://www.google.co.uk/search?pws=0&q={}'

    @staticmethod
    def build_queries(question_text, answers):
        """ Build one query per answer: the question followed by the quoted answer. """
        return {
            answer_key: '%s "%s"' % (question_text, answer_value)
            for answer_key, answer_value in answers.items()
        }

    def get_answer_matches(self, response, answer_key, answers, matches):
        """ Add the result count shown on the page to ``matches[answer_key]``.

        Nothing is added when the ``topstuff`` element starts with
        'No results found', when there is no ``resultStats`` element, or
        when that element contains no digits.

        :param response: response whose ``text`` is the results page HTML
        :param answer_key: key of the answer this response was fetched for
        :param answers: dict of possible answer values (unused here)
        :param matches: dict of running match counts, updated and returned
        """
        document = BeautifulSoup(response.text, "html5lib")

        banner_text = getattr(document.find(id='topstuff'), 'text', '')
        if banner_text[:16] != 'No results found':
            stats = document.find(id='resultStats')
            if stats:
                # Drop thousands separators so the first run of digits is
                # the whole count.
                digit_runs = re.findall(r'\d+', stats.text.replace(',', ''))
                if digit_runs:
                    matches[answer_key] += int(digit_runs[0])

        print('{}: {}{:,}{}'.format(
            answer_key, Colours.BOLD.value, matches[answer_key], Colours.ENDC.value
        ))
        return matches