-
Notifications
You must be signed in to change notification settings - Fork 1
/
fileLoader.py
79 lines (63 loc) · 2.62 KB
/
fileLoader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import os
from document import Document
# Markups whose content is stored on the document under the markup's own name.
# The <TEXT> section is handled separately because it is closed by a line that
# consists of "</TEXT>" alone rather than by a tag at the end of a line.
_MARKUPS = (
    "DOCNO",
    "FILEID",
    "FIRST",
    "BYLINE",
    "SECOND",
    "HEAD",
    "DATELINE",
    "NOTE",
    "UNK",
)


def _collectText(lines, separator):
    """Consume *lines* up to a line equal to ``</TEXT>`` and join what was read.

    Reaching the end of *lines* before the closing tag simply ends the section.
    """
    body = []
    for line in lines:
        if line == "</TEXT>":
            break
        body.append(line)
    return separator.join(body)


def _collectMarkup(first, lines, markup, separator):
    """Return the content of *markup*, which opens at the start of *first*.

    The field may close at the end of *first* itself or at the end of a later
    line pulled from *lines*. Reaching the end of *lines* before the closing
    tag ends the field with whatever was read so far.
    """
    opening = "<" + markup + ">"
    closing = "</" + markup + ">"

    # Whole field on a single line: <MARKUP>content</MARKUP>
    if first.endswith(closing):
        return first[len(opening):-len(closing)]

    # The field spans several lines. The first line contributes only what
    # follows the opening tag, and the last line only what precedes the
    # closing tag.
    parts = [first[len(opening):]]
    for line in lines:
        if line.endswith(closing):
            parts.append(line[:-len(closing)])
            break
        parts.append(line)
    return separator.join(parts)


def _readDocument(lines, separator):
    """Build one Document from *lines*, stopping after the ``</DOC>`` line.

    If *lines* runs out before ``</DOC>`` is seen, the partially filled
    document is still returned.
    """
    document = Document()
    for line in lines:
        if line == "</DOC>":
            break
        if line.startswith("<TEXT>"):
            # Anything following the tag on the same line is ignored; the
            # section body starts on the next line.
            document.setData("TEXT", _collectText(lines, separator))
            continue
        for markup in _MARKUPS:
            if line.startswith("<" + markup + ">"):
                document.setData(
                    markup, _collectMarkup(line, lines, markup, separator))
                break
        # A line that matches no known markup (blank line, unknown tag, ...)
        # is skipped; the enclosing loop always moves on to the next line.
    return document


def loadFile(path, separator=""):
    """Parse the ``<DOC>`` ... ``</DOC>`` records stored in the file at *path*.

    Trailing whitespace is stripped from every line. Lines outside a
    ``<DOC>`` block, including blank ones, are ignored. Inside a block, each
    recognised markup and each ``<TEXT>`` section is handed to
    ``Document.setData(name, content)``.

    Args:
        path: Path of the file to read.
        separator: String inserted between physical lines when a field spans
            several lines. The default ``""`` concatenates the lines
            directly; pass ``" "`` to keep the last word of one line from
            running into the first word of the next.

    Returns:
        A list with one Document per ``<DOC>`` line found, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    documents = []
    with open(path) as file:
        # Iterating the file (instead of testing readline() for "") keeps a
        # blank line distinct from end-of-file. The single generator is
        # shared with the helpers so that each of them resumes exactly where
        # the previous reader stopped.
        lines = (raw.rstrip() for raw in file)
        for line in lines:
            if line == "<DOC>":
                documents.append(_readDocument(lines, separator))
    return documents
def indexCorpus(folder):
    """Load every regular file directly inside *folder* with ``loadFile``.

    Args:
        folder: Path of the directory that holds the corpus files.

    Returns:
        A single list containing the documents of all files, with the files
        visited in sorted name order.

    Raises:
        OSError: If *folder* cannot be listed or a file cannot be opened.
    """
    corpus = []
    # os.listdir() yields entries in arbitrary order; sorting makes the
    # resulting corpus order reproducible from one run to the next.
    for name in sorted(os.listdir(folder)):
        path = os.path.join(folder, name)
        # Sub-directories cannot be opened as corpus files, so skip them.
        if not os.path.isfile(path):
            continue
        corpus.extend(loadFile(path))
    return corpus