Merged PR 14864: Merge with public master

Merge public master with internal master. The changes have already been reviewed on GitHub. There are a few tiny changes here to make the three builds on Windows, Linux, MacOS happy and one change in file_stream.cpp to satisfy a regression test. Also moves pointer to regression tests to most recent version. Redoes the merge after revert. I incorrectly selected a squash commit which we do not want in this case (many PRs).
marian-nmt · Aug 20, 2020 · 3f15573 · 3f15573
2 parents e45b93d + 5cc25c8
commit 3f15573
Show file tree

Hide file tree

Showing 25 changed files with 851 additions and 661 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 ## [Unreleased]
 
 ### Added
+- Printing word-level scores in marian-scorer
+- Optimize LayerNormalization on CPU by 6x through vectorization (ffast-math) and fixing performance regression introduced with strides in 77a420
 - Decoding multi-source models in marian-server with --tsv
 - GitHub workflows on Ubuntu, Windows, and MacOS
 - LSH indexing to replace short list
@@ -19,6 +21,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Training and scoring from STDIN
 - Support for reading from TSV files from STDIN and other sources during training
   and translation with options --tsv and --tsv-fields n.
+- Internal optional parameter in n-best list generation that skips empty hypotheses.
 
 ### Fixed
 - Fix compilation without BLAS installed
@@ -34,6 +37,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
   absolute and not relative. We assumed incorrectly that epsilon is absolute tolerance.
 - Fixed bug in finding .git/logs/HEAD when Marian is a submodule in another project.
 - Properly record cmake variables in the cmake build directory instead of the source tree.
+- Added default "none" for option shuffle in BatchGenerator, so that it works in executables where shuffle is not an option.
+- Added a few missing header files in shortlist.h and beam_search.h.
 
 ### Changed
 - Move Simple-WebSocket-Server to submodule

diff --git a/VERSION b/VERSION
@@ -1,2 +1 @@
-v1.9.34
-
+v1.9.35
diff --git a/regression-tests b/regression-tests
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -130,7 +130,9 @@ set(MARIAN_GIT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../.git)
 if(NOT IS_DIRECTORY ${MARIAN_GIT_DIR}) # i.e., it's a submodule
   file(READ ${MARIAN_GIT_DIR} MARIAN_GIT_DIR)
   string(REGEX REPLACE "gitdir: (.*)\n" "\\1" MARIAN_GIT_DIR ${MARIAN_GIT_DIR})
-  get_filename_component(MARIAN_GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${MARIAN_GIT_DIR}" ABSOLUTE)
+  if(NOT IS_ABSOLUTE ${MARIAN_GIT_DIR})
+    get_filename_component(MARIAN_GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${MARIAN_GIT_DIR}" ABSOLUTE)
+  endif()
 endif(NOT IS_DIRECTORY ${MARIAN_GIT_DIR})
 
 add_custom_command(OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/common/git_revision.h

diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp
@@ -561,7 +561,7 @@ void ConfigParser::addOptionsValidation(cli::CLIWrapper& cli) {
   cli.add<bool>("--n-best",
       "Generate n-best list");
   cli.add<bool>("--word-scores",
-      "Print word-level scores");
+      "Print word-level scores. One score per subword unit, not normalized even if --normalize");
 
   // efficiency options
   cli.add<int>("--valid-mini-batch",
@@ -626,7 +626,7 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) {
      "Return word alignment. Possible values: 0.0-1.0, hard, soft")
     ->implicit_val("1");
   cli.add<bool>("--word-scores",
-      "Print word-level scores");
+      "Print word-level scores. One score per subword unit, not normalized even if --normalize");
 #ifdef USE_SENTENCEPIECE
   cli.add<bool>("--no-spm-decode",
       "Keep the output segmented into SentencePiece subwords");
@@ -695,6 +695,8 @@ void ConfigParser::addOptionsScoring(cli::CLIWrapper& cli) {
   cli.add<std::string>("--alignment",
      "Return word alignments. Possible values: 0.0-1.0, hard, soft")
      ->implicit_val("1"),
+  cli.add<bool>("--word-scores",
+      "Print word-level scores. One score per subword unit, not normalized even if --normalize");
 
   addSuboptionsInputLength(cli);
   addSuboptionsTSV(cli);

diff --git a/src/common/definitions.h b/src/common/definitions.h
@@ -22,6 +22,61 @@
 #define DONT_OPTIMIZE // silently ignore on Visual Studio, where this is less of a problem
 #endif
 
+// Use these macros to enable faster floating-point math. Put them around one
+// or more functions.
+//
+// Usage:
+// MARIAN_FFAST_MATH_BEGIN
+// void LayerNormalization(float *arg) { *arg += 1.0; }
+// void SomethingElse() {}
+// MARIAN_FFAST_MATH_END
+//
+// ffast-math allows the compiler to assume associative arithmetic and finite
+// values.
+//
+// Associative arithmetic is particularly important to vectorize i.e. a sum:
+//   for (const float f : range) sum += f;
+// Without ffast-math, the sum will be done one value at a time.  On x86 it
+// still uses vector math, but only uses the first slot and wastes the rest.
+//
+// With ffast-math, the compiler can sum in batches of 4, 8, or 16 floats.
+// Also, it can run multiple adds in parallel e.g. vaddps has latency 4 and
+// throughput 0.5 on Skylake so multiple vector adds can run at once.
+//
+// On average, a vectorized sum is more numerically stable because it sums in
+// batches. Vectorized floats can still produce NaNs and infs (remember even
+// scalar operations are implemented with vector instructions).
+//
+// Allowing the compiler to assume finite values means functions like isnan or
+// isinf do not work as expected. Do not enable this for a function that
+// depends upon fully standard float behavior.
+//
+// It can also change the sign of zeros.
+//
+// Fast math also makes results more architecture dependent because different
+// register widths mean different results. They also depend on the compiler
+// and compiler version more. For example, clang <= 10 does not support the
+// float_control pragma below so it will still be conservative.
+//
+// There is a more conservative option for just associativity:
+// llvm introduced "#pragma clang fp reassociate" that goes inside a function.
+// However, llvm <11 considers that pragma an error so we'd need some ugly
+// version test (which they don't recommend) or a compilation test.  Moreover,
+// it has to be in the function to keep scope.
+// gcc supports "-fassociative-math" that has to be outside a function.
+// I didn't find a MSVC equivalent.
+#if defined(_MSC_VER)
+#define MARIAN_FFAST_MATH_BEGIN __pragma(float_control(precise, off, push))
+#define MARIAN_FFAST_MATH_END __pragma(float_control(pop))
+#elif defined(__clang__)
+#define MARIAN_FFAST_MATH_BEGIN _Pragma("float_control(precise, off, push)")
+#define MARIAN_FFAST_MATH_END _Pragma("float_control(pop)")
+#elif defined(__GNUC__)
+// Also available as __attribute__((optimize("-ffast-math"))) but done as pragmas for consistency
+#define MARIAN_FFAST_MATH_BEGIN _Pragma("GCC push_options") _Pragma("GCC optimize(\"-ffast-math\")")
+#define MARIAN_FFAST_MATH_END _Pragma("GCC pop_options")
+#endif
+
 namespace marian {
 
 // Type to be used for all index types, e.g. for integer tensors for rows operator.

diff --git a/src/common/file_stream.cpp b/src/common/file_stream.cpp
@@ -34,9 +34,10 @@ InputFileStream::InputFileStream(const std::string &file)
 #else
     ABORT("Pipe syntax not supported in this build of Marian: {}", file);
 #endif
-  }
-  else
+  } else {
+    ABORT_IF(!marian::filesystem::exists(file), "File '{}' does not exist", file);
     file_ = file;
+  }
   streamBuf1_.reset(new std::filebuf());
   auto ret = static_cast<std::filebuf*>(streamBuf1_.get())->open(file_.string().c_str(), std::ios::in | std::ios::binary);
   ABORT_IF(!ret, "Error opening file ({}): {}", errno, file_.string());

diff --git a/src/common/utils.cpp b/src/common/utils.cpp
@@ -112,15 +112,13 @@ std::vector<std::string> splitAny(const std::string& line,
 }
 
 std::string join(const std::vector<std::string>& words, const std::string& del /*= " "*/) {
-  std::stringstream ss;
-  if(words.empty()) {
+  if(words.empty())
     return "";
-  }
 
+  std::stringstream ss;
   ss << words[0];
-  for(size_t i = 1; i < words.size(); ++i) {
+  for(size_t i = 1; i < words.size(); ++i)
     ss << del << words[i];
-  }
 
   return ss.str();
 }
@@ -131,6 +129,18 @@ std::string join(const std::vector<size_t>& nums, const std::string& del /*= " "
   return join(words, del);
 }
 
+std::string join(const std::vector<float>& nums, const std::string& del /*= " "*/, size_t prec /*= 5*/) {
+  if(nums.empty())
+    return "";
+
+  std::stringstream ss;
+  ss << std::fixed << std::setprecision(prec) << nums[0];
+  for(size_t i = 1; i < nums.size(); ++i)
+    ss << del << nums[i];
+
+  return ss.str();
+}
+
 // escapes a string for passing to popen, which uses /bin/sh to parse its argument string
 static std::string escapeForPOpen(const std::string& arg) {
   // e.g. abc -> 'abc'; my file.txt -> 'my file.txt'; $10 -> '$10'; it's -> 'it'\''s'

diff --git a/src/common/utils.h b/src/common/utils.h
@@ -31,8 +31,12 @@ std::vector<std::string> splitAny(const std::string& line,
                                   const std::string& del = " ",
                                   bool keepEmpty = false);
 
+// Return a string which is the concatenation of the strings in the given vector
 std::string join(const std::vector<std::string>& words, const std::string& del = " ");
+// Return a string which is the concatenation of values from the given vector of integers
 std::string join(const std::vector<size_t>& words, const std::string& del = " ");
+// Return a string which is the concatenation of values from the given vector of floats
+std::string join(const std::vector<float>& words, const std::string& del = " ", size_t prec = 5);
 
 std::string exec(const std::string& cmd, const std::vector<std::string>& args = {}, const std::string& arg = "");
 

diff --git a/src/data/batch_generator.h b/src/data/batch_generator.h
@@ -276,7 +276,7 @@ class BatchGenerator : public RNGEngine {
                  bool runAsync = true)
       : data_(data), options_(options), stats_(stats), 
         runAsync_(runAsync), threadPool_(runAsync ? new ThreadPool(1) : nullptr) {
-    auto shuffle = options_->get<std::string>("shuffle");
+    auto shuffle = options_->get<std::string>("shuffle", "none");
     shuffleData_ = shuffle == "data";
     shuffleBatches_ = shuffleData_ || shuffle == "batches";
   }

diff --git a/src/data/corpus_base.cpp b/src/data/corpus_base.cpp
@@ -365,7 +365,7 @@ CorpusBase::CorpusBase(Ptr<Options> options, bool translate)
       ABORT_IF(!filesystem::exists(path), "Alignment file does not exist");
       LOG(info, "[data] Using word alignments from file {}", path);
 
-      alignFileIdx_ = paths_.size();
+      alignFileIdx_ = (int)paths_.size();
       paths_.emplace_back(path);
       io::InputFileStream* strm = new io::InputFileStream(path);
       ABORT_IF(strm->empty(), "File with alignments '{}' is empty", path);
@@ -378,7 +378,7 @@ CorpusBase::CorpusBase(Ptr<Options> options, bool translate)
       ABORT_IF(!filesystem::exists(path), "Weight file does not exist");
       LOG(info, "[data] Using weights from file {}", path);
 
-      weightFileIdx_ = paths_.size();
+      weightFileIdx_ = (int)paths_.size();
       paths_.emplace_back(path);
       io::InputFileStream* strm = new io::InputFileStream(path);
       ABORT_IF(strm->empty(), "File with weights '{}' is empty", path);