-
Notifications
You must be signed in to change notification settings - Fork 0
/
logmap_to_starspace.py
executable file
·69 lines (50 loc) · 1.86 KB
/
logmap_to_starspace.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# coding:utf8
"""
copyright Asan AGIBETOV <asan.agibetov@gmail.com>
Simple script to convert LogMap stem -> concept_index mappings of the form
```
stem_1;stem_2|concept_index1,concept_index2
```
into the form which is required by the starspace toolkit
```
word1 word2 __label__concept_index1 __label__concept_index2
```
So we assume that we have mappings (stem -> word) and (concept_index -> __label__concept_index)
"""
# Standard-library imports
import logging
# Configure the root logger at INFO level as soon as the module is loaded.
logging.basicConfig(level=logging.INFO)
# Module-level logger; main() reports its progress through it.
logger = logging.getLogger(__name__)
# Third-party imports
import click
def _split_record(line, lineno):
    """Split one input line on ``|`` into ``[words, concepts]``.

    Raises:
        ValueError: if the line does not contain exactly one ``|``; the
            message names the offending line so the input can be fixed.
    """
    fields = line.split("|")
    if len(fields) != 2:
        raise ValueError(
            "Line {}: expected exactly one '|' between words and concepts, "
            "got {!r}".format(lineno, line)
        )
    return fields


def _tokenize(words, concepts):
    """Split both halves of a record on ``;`` and prefix every concept."""
    # NOTE(review): the module docstring shows concepts separated by ","
    # while this code has always split on ";" -- behaviour kept as-is;
    # confirm against a real LogMap export.
    return (
        words.split(";"),
        ["__label__" + c for c in concepts.split(";")],
    )


def _format_line(words, labels):
    """Render one output line: space-joined words, a tab, tab-joined labels."""
    return " ".join(words) + "\t" + "\t".join(labels) + "\n"


# click uses the docstring of main() as the description shown by --help.
@click.command()
@click.argument("logmap_f", type=click.Path(exists=True))
@click.option("--output", default="output.starspace")
def main(logmap_f, output):
    """Convert the LogMap mapping file LOGMAP_F into StarSpace format.

    Every non-blank input line must look like
    'stem_1;stem_2|concept_1;concept_2'. Each one is written to --output
    as the words joined by spaces, then a tab, then the concepts prefixed
    with '__label__' and joined by tabs. Blank lines are skipped; a line
    without exactly one '|' aborts the conversion with a ValueError that
    names the line number.
    """
    logger.info("Reading in file {}".format(logmap_f))
    with open(logmap_f, "r") as f:
        stripped = [raw.strip() for raw in f]

    logger.info("Splitting on |")
    # Keep the 1-based line number of the original file for error messages,
    # and drop blank lines, which carry no mapping.
    records = [
        _split_record(line, lineno)
        for lineno, line in enumerate(stripped, start=1)
        if line
    ]
    logger.info("Here is how I split them")
    logger.info(records[:5])

    logger.info("Splitting words and concepts on ;")
    tokenized = [_tokenize(words, concepts) for words, concepts in records]
    logger.info("Here is how I split them")
    logger.info(tokenized[:5])

    logger.info("Writing starspace file to {}".format(output))
    out_lines = [_format_line(words, labels) for words, labels in tokenized]
    logger.info("Here is the final representation")
    logger.info(out_lines[:5])
    with open(output, "w") as f:
        f.writelines(out_lines)
# Invoke the click command only when this file is executed as a script,
# so importing the module does not trigger a conversion.
if __name__ == "__main__":
    main()