-
Notifications
You must be signed in to change notification settings - Fork 15
/
TreeTagger.py
146 lines (119 loc) · 5.55 KB
/
TreeTagger.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to the TreeTagger POS-tagger
#
# Copyright (C) Mirko Otto
# Author: Mirko Otto <dropsy@gmail.com>
"""
A Python module for interfacing with the Treetagger by Helmut Schmid.
"""
import os
from subprocess import Popen, PIPE
from nltk.internals import find_binary, find_file
from nltk.tag.api import TaggerI
from sys import platform as _platform
# Passed to find_binary() as the ``url`` argument when locating the tagger.
_treetagger_url = 'http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/'

# Languages accepted by TreeTagger.__init__; any other value is rejected.
_treetagger_languages = [
    'bulgarian',
    'dutch',
    'english',
    'estonian',
    'finnish',
    'french',
    'galician',
    'german',
    'italian',
    'polish',
    'russian',
    'slovak',
    'slovak2',
    'spanish',
    'portuguese',
]

# Acceptable part-of-speech tag fragments, by language. Each entry is matched
# as a substring of the lower-cased tag returned by the tagger.
# NOTE(review): originally described as "only nouns, abbreviations and unknown
# tags", but the lists also carry adjective-looking entries ('adj', 'jj') --
# confirm the intended set against the TreeTagger tagsets.
polish = ["subst", "xxx", "ign", "brev", "burk", "adj"]
dutch = ["noun", "adj"]
french = ["abr", "nom", "nam", "adj"]
italian = ["abr", "fw", "nom", "npr", "adj"]
english = ["nn", "fw", "np", "jj"]
class TreeTagger(TaggerI):
    """
    A class for pos tagging with TreeTagger.

    An instance is bound to a single language. The per-language TreeTagger
    launcher is looked up once at construction time and started as a child
    process on every call to tag(); text is sent to it as UTF-8 over a pipe
    and its UTF-8 output is parsed into rows.
    """

    def __init__(self, path_to_home=None, language='german',
                 verbose=False, abbreviation_list=None):
        """
        Initialize the TreeTagger.

        :param path_to_home: Optional explicit location of the launcher,
            passed to NLTK's find_binary() as its second argument.
        :param language: Language to tag; must be one of
            ``_treetagger_languages``. Default language is german.
        :param verbose: Passed through to find_binary().
        :param abbreviation_list: Optional value handed to the launcher after
            its ``-a`` option (presumably the path of an abbreviation file --
            confirm against the TreeTagger launcher scripts). Omitted from
            the command line when None.
        :raises LookupError: if ``language`` is not a supported language.

        If the launcher cannot be located, a message is printed and the
        instance is still created; tag() then raises OSError.
        """
        # Validate first so no lookup work is done for an unsupported language.
        if language not in _treetagger_languages:
            raise LookupError('Language not in language list!')

        self._abbr_list = abbreviation_list
        self.language = language

        # The per-language launcher carries a different prefix on Windows.
        prefix = 'tag-' if _platform == "win32" else 'tree-tagger-'
        treetagger_bin_name = prefix + language

        treetagger_paths = [
            os.path.expanduser(path)
            for path in (
                '.', '/usr/bin', '/usr/local/bin', '/opt/local/bin',
                '/Applications/bin', '~/bin', '~/Applications/bin',
                '~/work/tmp/treetagger/cmd', '~/TreeTagger/cmd',
            )
        ]

        try:
            self._treetagger_bin = find_binary(
                treetagger_bin_name, path_to_home,
                env_vars=('TREETAGGER', 'TREETAGGER_HOME'),
                searchpath=treetagger_paths,
                url=_treetagger_url,
                verbose=verbose)
        except LookupError:
            # Construction is deliberately best-effort. Record the failure so
            # tag() can report it clearly instead of failing on a missing
            # attribute.
            self._treetagger_bin = None
            print('NLTK was unable to find the TreeTagger bin!')

    def tag(self, sentences):
        """Tags a single sentence: a list of words.

        The tokens should not contain any newline characters.

        :param sentences: A list (or tuple) of string tokens, sent one per
            line, or any other object, which is sent as ``str(sentences)``.
        :return: A list with one entry per line of tagger output; each entry
            is the list of tab-separated fields of that line.
        :raises OSError: if the TreeTagger binary was not found at
            construction time, or if the process exits with a non-zero code.
        """
        if self._treetagger_bin is None:
            raise OSError('TreeTagger binary was not found; cannot tag.')

        # One token per line is the input layout sent to the launcher.
        if isinstance(sentences, (list, tuple)):
            _input = '\n'.join(sentences)
        else:
            _input = str(sentences)

        # Build the command once; "-a" is only added when a value was given.
        command = [self._treetagger_bin]
        if self._abbr_list is not None:
            command += ["-a", self._abbr_list]

        p = Popen(command, shell=False, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        stdout, stderr = p.communicate(_input.encode('utf-8'))

        # Check the return code.
        if p.returncode != 0:
            print(stderr)
            raise OSError('TreeTagger command failed!')

        treetagger_output = stdout.decode('UTF-8')

        # One row per output line, split into its tab-separated columns.
        return [
            tagged_word.split('\t')
            for tagged_word in treetagger_output.strip().split('\n')
        ]

    def is_acceptable_portuguese_tag(self, tag):
        """Return True if tag starts with 'n' without ending in 'g0', or starts with 'a'."""
        return (tag.startswith('n') and not tag.endswith('g0')) or tag.startswith('a')

    def is_acceptable_russian_tag(self, tag):
        """Return True if tag starts with 'n' or 'a'."""
        return tag.startswith(('n', 'a'))

    def is_acceptable_german_tag(self, tag):
        """Return True if tag starts with 'n' or 'adj'."""
        return tag.startswith(('n', 'adj'))

    def is_acceptable_tag(self, tag, acceptable_tags):
        """Return True if any entry of acceptable_tags occurs as a substring of tag."""
        return any(acceptable_tag in tag for acceptable_tag in acceptable_tags)

    def is_acceptable_word(self, word):
        """
        Decide whether a word's part-of-speech tag is acceptable.

        The word is tagged, the second field of the first output row is
        lower-cased, and that tag is checked with the rule for
        ``self.language``.

        :param word: Input handed unchanged to tag().
        :return: The result of the language's rule. True is returned when
            tagging fails or yields no usable tag, and when the language has
            no rule (best-effort: such words are not rejected).
        """
        try:
            word_tag = self.tag(word)[0][1].lower()
        except Exception:
            # Best-effort by design: a tagger failure must not reject a word.
            # KeyboardInterrupt/SystemExit are intentionally not swallowed.
            return True

        language = self.language

        # Languages whose rule is a plain substring match against a tag list.
        tag_lists = {
            'english': english,
            'french': french,
            'italian': italian,
            'polish': polish,
            'dutch': dutch,
        }
        if language in tag_lists:
            return self.is_acceptable_tag(word_tag, tag_lists[language])

        # Languages with a dedicated prefix/suffix rule.
        predicates = {
            'german': self.is_acceptable_german_tag,
            'portuguese': self.is_acceptable_portuguese_tag,
            'russian': self.is_acceptable_russian_tag,
        }
        if language in predicates:
            return predicates[language](word_tag)

        # No rule for this language: accept the word.
        return True