-
Notifications
You must be signed in to change notification settings - Fork 1
/
Pipeline.java
143 lines (115 loc) · 4.59 KB
/
Pipeline.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
package org.icij.datashare.text.nlp;
import org.icij.datashare.reflect.EnumTypeToken;
import org.icij.datashare.text.Document;
import org.icij.datashare.text.Language;
import org.icij.datashare.text.NamedEntity;
import java.nio.charset.Charset;
import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;
import static java.nio.charset.StandardCharsets.UTF_8;
import static java.util.Arrays.asList;
import static java.util.Arrays.stream;
import static java.util.Collections.singletonList;
import static org.icij.datashare.function.ThrowingFunctions.joinComma;
import static org.icij.datashare.text.NamedEntity.Category.*;
import static org.icij.datashare.text.nlp.NlpStage.NER;
/**
 * Contract of an NLP pipeline that processes a {@link Document} into a list of
 * {@link NamedEntity}, together with the enumerations and default settings
 * shared by pipeline implementations.
 */
public interface Pipeline {
    /**
     * Gathers the given pipeline types into a set.
     *
     * @param types the types to gather; duplicates collapse into a single entry
     * @return a new mutable {@link HashSet} holding the given types
     */
    static Set<Type> set(Type... types) {
        return new HashSet<>(Arrays.asList(types));
    }

    /**
     * Identifiers of the pipeline kinds, each carrying a numeric code, a bit
     * mask derived from that code and an associated class name.
     */
    enum Type implements EnumTypeToken {
        TEST((short) -1),
        CORENLP((short) 0),
        GATENLP((short) 1),
        IXAPIPE((short) 2),
        MITIE((short) 3),
        OPENNLP((short) 4),
        EMAIL((short) 5);

        private final String className;
        public final short code;
        public final int mask;

        Type(final short code) {
            this.code = code;
            // An int shift only uses the low 5 bits of its distance, so TEST
            // (code -1) ends up on bit 31 and stays distinct from the others.
            mask = 1 << code;
            className = buildClassName(Pipeline.class, this);
        }

        /**
         * Looks up the type carrying the given numeric code.
         *
         * @param code the code to search for
         * @return the type whose {@link #code} equals {@code code}
         * @throws IllegalArgumentException if no type carries that code
         */
        public static Type fromCode(final int code) {
            for (Type t : Type.values()) {
                if (t.code == code) {
                    return t;
                }
            }
            throw new IllegalArgumentException("cannot find code " + code);
        }

        @Override
        public String getClassName() {
            return className;
        }

        /**
         * Resolves a type from its name through {@code EnumTypeToken.parse}.
         *
         * @param valueName the name to resolve
         * @return the matching type
         * @throws IllegalArgumentException if the lookup yields no type
         */
        public static Type parse(final String valueName) {
            return EnumTypeToken.parse(Type.class, valueName)
                    .orElseThrow(() -> new IllegalArgumentException("unknown pipeline type: " + valueName));
        }

        /**
         * Resolves a type from a class name through {@code EnumTypeToken.parseClassName}.
         *
         * @param className the class name to resolve
         * @return the matching type, or an empty optional when there is none
         */
        public static Optional<Type> fromClassName(final String className) {
            return EnumTypeToken.parseClassName(Pipeline.class, Type.class, className);
        }

        /**
         * Parses a comma-separated list of constant names (as accepted by
         * {@code Type.valueOf}) into a set of types.
         * <p>
         * Whitespace around each name is ignored and empty entries (leading,
         * trailing or doubled commas) are skipped, so {@code "CORENLP, OPENNLP"}
         * and {@code ",CORENLP"} are both accepted.
         *
         * @param comaSeparatedTypes the names to parse; may be {@code null} or empty
         * @return a new mutable set of the parsed types, empty when the input
         *         is {@code null}, empty or holds no name
         * @throws IllegalArgumentException if a non-empty entry is not the
         *         exact name of a constant of this enum
         */
        public static Set<Pipeline.Type> parseAll(final String comaSeparatedTypes) {
            if (comaSeparatedTypes == null || comaSeparatedTypes.isEmpty()) {
                return new HashSet<>();
            }
            return stream(comaSeparatedTypes.split(","))
                    .map(String::trim)
                    .filter(token -> !token.isEmpty())
                    .map(Type::valueOf)
                    // same concrete, mutable set type as the empty-input branch
                    .collect(Collectors.toCollection(HashSet::new));
        }
    }

    /** Keys of the configuration properties of a pipeline. */
    enum Property {
        STAGES,
        ENTITIES,
        CACHING,
        LANGUAGE,
        ENCODING;

        /**
         * @return the property key: the constant name in lower case with
         *         underscores replaced by dashes
         */
        public String getName() {
            // Locale.ROOT keeps the key stable whatever the default locale is
            // (e.g. a Turkish locale would otherwise map 'I' to a dotless 'ı').
            return name().toLowerCase(Locale.ROOT).replace('_', '-');
        }

        /**
         * Curried builder of pipeline {@link Properties}: takes the stages,
         * then the entity categories, then the caching flag, and returns
         * properties holding them under the {@link #STAGES}, {@link #ENTITIES}
         * and {@link #CACHING} keys. The two lists are rendered with
         * {@code joinComma} (defined elsewhere; presumably a comma-separated
         * join — confirm there).
         */
        public static Function<List<NlpStage>, Function<List<NamedEntity.Category>, Function<Boolean, Properties>>>
                build =
                nlpStages -> entityCategories -> enableCaching -> {
                    Properties properties = new Properties();
                    properties.setProperty(STAGES.getName(), joinComma.apply(nlpStages));
                    properties.setProperty(ENTITIES.getName(), joinComma.apply(entityCategories));
                    properties.setProperty(CACHING.getName(), String.valueOf(enableCaching));
                    return properties;
                };
    }

    /** Default character encoding: UTF-8. */
    Charset DEFAULT_ENCODING = UTF_8;
    /** Default target stages: {@code NER} only. */
    List<NlpStage> DEFAULT_TARGET_STAGES = singletonList(NER);
    /** Default targeted entity categories: person, organization and location. */
    List<NamedEntity.Category> DEFAULT_ENTITIES = asList(PERSON, ORGANIZATION, LOCATION);
    /** Default caching flag: enabled. */
    boolean DEFAULT_CACHING = true;

    /**
     * @return the type of this pipeline
     */
    Type getType();

    /**
     * Prepares the pipeline for the given language.
     *
     * @param language the language to initialize for
     * @return a status flag — presumably true on success; confirm with implementations
     * @throws InterruptedException if the calling thread is interrupted
     */
    boolean initialize(Language language) throws InterruptedException;

    /**
     * Processes a document.
     *
     * @param doc the document to process
     * @return the named entities produced for the document
     * @throws InterruptedException if the calling thread is interrupted
     */
    List<NamedEntity> process(Document doc) throws InterruptedException;

    /**
     * Processes a document with an explicit content length and offset.
     *
     * @param doc           the document to process
     * @param contentLength presumably the length of the content slice to process — confirm with implementations
     * @param contentOffset presumably the start offset of that slice in the document content — confirm with implementations
     * @return the named entities produced
     * @throws InterruptedException if the calling thread is interrupted
     */
    List<NamedEntity> process(Document doc, int contentLength, int contentOffset) throws InterruptedException;

    /**
     * Terminates the pipeline for the given language.
     *
     * @param language the language to terminate for
     * @throws InterruptedException if the calling thread is interrupted
     */
    void terminate(Language language) throws InterruptedException;

    /**
     * Is stage supported for language?
     *
     * @param stage    the stage to test for support
     * @param language the language on which stage is tested
     * @return true if stage supports language; false otherwise
     */
    boolean supports(NlpStage stage, Language language);

    /**
     * @return the list of all targeted named entity categories
     */
    List<NamedEntity.Category> getTargetEntities();

    /**
     * @return the list of all involved stages
     */
    List<NlpStage> getStages();

    /**
     * @return true if pipeline is caching annotators; false otherwise
     */
    boolean isCaching();

    /**
     * @return the character encoding of this pipeline
     */
    Charset getEncoding();

    /**
     * @param language the language for which the tagset is requested
     * @return the tagset used by the part-of-speech tagger
     */
    Optional<String> getPosTagSet(Language language);
}