-
Notifications
You must be signed in to change notification settings - Fork 1
/
document.py
105 lines (82 loc) · 2.51 KB
/
document.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
from Algorithms.stemmer_algo import PorterStemmer
import re
class Document:
    """One corpus document: its markup fields plus stemmed word statistics.

    Fields are filled in tag by tag through ``setData``. ``listWords`` and
    ``countWords`` derive the stemmed vocabulary from the accumulated
    ``text`` field.
    """

    # Markup tags whose content overwrites the attribute of the given name.
    # "TEXT" is not listed because its content is accumulated, not replaced.
    _ATTR_BY_MARKUP = {
        "DOCNO": "docno",
        "FILEID": "fileId",
        "FIRST": "first",
        "SECOND": "second",
        "HEAD": "head",
        "DATELINE": "dateline",
        "NOTE": "note",
        "BYLINE": "byline",
        "UNK": "unk",
    }

    # Every run of characters other than ASCII letters and spaces is removed
    # before the text is split into tokens.
    _NON_LETTERS = re.compile('[^ A-Za-z]+')

    def __init__(self):
        """Create an empty document; every field starts as an empty string."""
        self.docno = ""
        self.fileId = ""
        self.first = ""
        self.second = ""
        self.head = ""
        self.dateline = ""
        self.note = ""
        self.byline = ""
        self.unk = ""
        self.text = ""
        # Stem -> number of occurrences in ``text``; filled by countWords().
        self.words = {}

    def setData(self, markup, content):
        """Store ``content`` in the field that corresponds to ``markup``.

        ``"TEXT"`` content is appended to ``self.text`` (preceded by a single
        space) so that several TEXT sections accumulate. Every other known
        tag overwrites its field. An unknown tag only prints a message and
        leaves the document unchanged.
        """
        if markup == "TEXT":
            self.text += ' ' + content
            return

        attribute = self._ATTR_BY_MARKUP.get(markup)
        if attribute is None:
            print("%s does not exist" % markup)
            return

        # "SECOND" used to be written to a stray ``search`` attribute, which
        # left ``self.second`` permanently empty; the table maps it correctly.
        setattr(self, attribute, content)

    def _stemmedTokens(self):
        """Yield the stem of every space-separated token of ``self.text``.

        The text is lower-cased and stripped of everything but ASCII letters
        and spaces first. Leading or consecutive spaces produce empty tokens;
        those are passed to the stemmer like any other token and whatever it
        returns for them is yielded too.
        """
        stemmer = PorterStemmer()
        cleaned = self._NON_LETTERS.sub('', self.text.lower())
        for token in cleaned.split(' '):
            # stem() takes the inclusive start and end indices of the word.
            yield stemmer.stem(token, 0, len(token) - 1)

    def listWords(self):
        """Return the set of distinct stems found in ``self.text``."""
        return set(self._stemmedTokens())

    def countWords(self):
        """Count the stems of ``self.text``; store and return the result.

        The counts are rebuilt from scratch on every call, so calling this
        method repeatedly (or again after more TEXT was added) never counts
        the same occurrence twice.

        Returns:
            dict mapping each stem to its number of occurrences. The same
            dict is stored in ``self.words``.
        """
        counts = {}
        for stem in self._stemmedTokens():
            counts[stem] = counts.get(stem, 0) + 1
        self.words = counts
        return counts

    def __str__(self):
        """Return the document number as a string."""
        return str(self.docno)
def buildInvertedIndex(corpus, stopwords_path="stopwords.txt"):
    """Build an inverted index over ``corpus`` and drop the stop words.

    Args:
        corpus: iterable of documents; each must provide ``countWords()``
            returning a mapping of word -> occurrence count.
        stopwords_path: path of a text file with one stop word per line.
            Defaults to ``"stopwords.txt"`` in the current working directory.

    Returns:
        dict of the form ``{word: [n_docs, (doc_index, count), ...]}`` where
        ``n_docs`` is the number of documents containing the word,
        ``doc_index`` is the position of a document in ``corpus`` and
        ``count`` is the word's count in that document. Stop words and the
        empty string are never present as keys.

    Raises:
        OSError: if the stop-word file cannot be opened.
    """
    index = {}
    for doc_index, document in enumerate(corpus):
        for word, count in document.countWords().items():
            # First slot of each entry is the document frequency; postings
            # are appended after it.
            entry = index.setdefault(word, [0])
            entry[0] += 1
            entry.append((doc_index, count))

    # Remove stop words. Iterating over the file (rather than reading until
    # the first empty line) keeps a blank line in the middle of the file
    # from hiding all the stop words that follow it.
    with open(stopwords_path, "r") as stopwords:
        for line in stopwords:
            index.pop(line.strip(), None)

    # Tokenizing on single spaces can yield an empty "word"; never index it.
    index.pop('', None)
    return index