-
Notifications
You must be signed in to change notification settings - Fork 125
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Enable multi-source input in marian-server #505
Changes from 1 commit
0baf0ae
8e8aae3
61d7ba4
84ce4cd
750d204
d342e44
b8e2f06
a10f5fb
908dcad
e4182e5
1f53dea
c5c1a4c
33f6002
cdf9b52
999b8fa
f4056af
d1ed2c2
8f591f6
46b8eba
9a89c25
7159f45
7c7fc54
79d8307
16d4ce6
95fc443
9963bea
6915f9c
b26ce69
41d247e
3a5d836
ff50a04
0c29e7f
577dbe7
8972fd9
12c07c0
15c1fdf
704f4e0
8cfa33d
05695ff
216608d
723b7fc
7676c5c
cf283d0
47000c5
d81ec71
9d31fb5
5bafca4
a73a5e4
ae84923
583a927
6745abd
bd0118c
49e8792
e94fd17
bbbc9a3
6dcfdaa
1469e45
ebec822
deb9b6a
fe72735
d60cae0
3dfd32c
369148c
82e82f6
ed5f586
5406641
4f4989b
0f7e40e
b22d316
1174cec
31bac7d
e20de49
27ce118
abf95d0
e375905
68f9d90
463b29c
1ab2484
55e3bcf
8fc0772
8653370
aff86fd
ad6f7ff
9f669f9
b478917
398ed0c
05c24fc
321fab9
2dfb302
0a89e8f
a0e4722
47eb656
bf10d36
42250be
5476f62
6b6444d
3a16eb6
880cc5d
9d8777b
939625f
af7df9d
0bb5dd1
81c14cd
0116646
6d330e3
9e8c772
b930762
b32e677
e53a46a
7b36b32
7efde49
8c3cb06
5ed441f
5050aad
52c7618
54fba78
e3399ef
1abd125
7886867
baf4d29
f042eae
78f671c
7ba804b
03bb51c
233281c
0cb8125
a8826c5
1171e3d
59011c8
ca61033
13e6182
66b95a5
fd3404d
c96d709
5dfd8a7
9353f06
189d89e
9a4f784
5fb31b2
4c0698f
d394641
61c0195
76e2293
a27fda7
b19820c
a1763e2
93b7ed8
9e090e3
26859d2
f07042b
e12a5db
120ab8f
49b54a6
0197b89
f007772
9fd5ba9
9c9a240
183d0b8
67e9bc4
82da7d5
b20d9d7
4b4e6b5
a6d0af0
6b3c8f5
34e99da
ac20f77
4993417
c12fd5b
6224cb1
734a879
c343ced
eb5f972
2b14d49
5be8558
e0500b2
eba7aed
bab02e3
f882f27
0dc1ef1
2bd986d
24f062c
88d9980
164d26c
0fab6ea
af02867
1f7a63d
703fcf4
b822cd4
b3a2310
cfdde15
7228698
7f4c730
a43ccd6
ad28f99
22ad592
5336040
990aeb5
1044f7f
bb44c2a
64a67d5
a2a567c
0873311
e09f713
63272d1
24df8f1
03fbf31
67b055f
00d2e99
45b83b2
015a218
9f29403
bec7e02
f4ea823
cf7f032
aad22c9
8640031
4b23fe7
3c7a88f
69d6f02
adba021
f1be95f
32186be
a5a5c62
9ccb075
95c65bb
d2b4f38
4a1d918
696bb44
2248a65
e78a068
a1d2f94
f561e12
e6f82f5
3126e2b
485a077
d593608
fe0572b
39cea6d
c70d93d
cbb2990
18e6a9a
81631e8
c0b6686
5af9899
5e21a28
3c0c1e1
58e316d
f2347a8
d3c8fbd
455724d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,6 +13,7 @@ | |
#endif | ||
#include <codecvt> | ||
#include <cwctype> | ||
#include <vector> | ||
|
||
namespace marian { | ||
namespace utils { | ||
|
@@ -93,6 +94,26 @@ std::string join(const std::vector<std::string>& words, const std::string& del / | |
return ss.str(); | ||
} | ||
|
||
std::vector<std::string> tsv2lists(const std::string& inputText, int inputNum) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please add a comment documenting the function, ideally with an example input and output. |
||
std::string line_; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please remove the trailing underscore from this local variable. |
||
std::vector<std::vector<std::string>> inputLists(inputNum); | ||
std::istringstream inputStream(inputText); | ||
while (std::getline(inputStream, line_)) { | ||
auto items = marian::utils::splitAny(line_, "\t"); | ||
std::cerr << "Split into " << items.size() << std::endl; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please remove the debug output. |
||
for (size_t i = 0; i < items.size(); ++i) { | ||
inputLists[i].push_back(items[i]); | ||
} | ||
} | ||
|
||
std::vector<std::string> inputs; | ||
for (auto &inputList : inputLists) { | ||
inputs.push_back(marian::utils::join(inputList, "\n")); | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Add a comment explaining why this part is needed. |
||
return inputs; | ||
} | ||
|
||
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Double whitespace. |
||
// escapes a string for passing to popen, which uses /bin/sh to parse its argument string | ||
static std::string escapeForPOpen(const std::string& arg) { | ||
// e.g. abc -> 'abc'; my file.txt -> 'my file.txt'; $10 -> '$10'; it's -> 'it'\''s' | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,8 +23,11 @@ const SentenceTuple& TextIterator::dereference() const { | |
TextInput::TextInput(std::vector<std::string> inputs, | ||
std::vector<Ptr<Vocab>> vocabs, | ||
Ptr<Options> options) | ||
: DatasetBase(inputs, options), vocabs_(vocabs) { | ||
// note: inputs are automatically stored in the inherited variable named paths_, but these are | ||
: DatasetBase(inputs, options), | ||
vocabs_(vocabs), | ||
maxLength_(options_->get<size_t>("max-length")), | ||
maxLengthCrop_(options_->get<bool>("max-length-crop")) { | ||
// note: inputs are automatically stored in the inherited variable named paths_, but these are
// texts not paths! | ||
for(const auto& text : paths_) | ||
files_.emplace_back(new std::istringstream(text)); | ||
|
@@ -43,6 +46,9 @@ SentenceTuple TextInput::next() { | |
std::string line; | ||
if(io::getline(dummyStream, line)) { | ||
Words words = vocabs_[i]->encode(line, /*addEOS =*/ true, /*inference =*/ inference_); | ||
if(this->maxLengthCrop_ && words.size() > this->maxLength_) { | ||
words.resize(maxLength_); | ||
} | ||
if(words.empty()) | ||
words.push_back(Word::ZERO); // @TODO: What is this for? @BUGBUG: addEOS=true, so this can never happen, right? | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think the EOS token needs to be added after cropping the sentence, similar to https://github.com/marian-nmt/marian-dev/blob/master/src/data/corpus_base.cpp#L211. |
||
tup.push_back(words); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,9 @@ | ||
#pragma once | ||
|
||
#include <string> | ||
#include <vector> | ||
|
||
#include "common/logging.h" | ||
|
||
namespace marian { | ||
|
||
|
@@ -10,5 +13,9 @@ struct ModelTask { | |
|
||
struct ModelServiceTask { | ||
virtual std::string run(const std::string&) = 0; | ||
virtual std::string run(const std::vector<std::string>&) { | ||
ABORT("Not implemented"); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why not a pure virtual function? |
||
return ""; | ||
} | ||
}; | ||
} // namespace marian |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
With this logic
std::string run(std::string)
is used nowhere, am I right? So either this version of the function can be removed, or the logic here can be something similar to this: I think the latter is cleaner, but feel free to object.