Commit 0a0b83b

Merge branch 'master' into ug-graceful-shutdown

ugermann committed Aug 18, 2020
2 parents 75459e3 + 3aed914

Showing 12 changed files with 124 additions and 35 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md

@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 ## [Unreleased]
 
 ### Added
+- Printing word-level scores in marian-scorer
 - Optimize LayerNormalization on CPU by 6x through vectorization (ffast-math) and fixing performance regression introduced with strides in 77a420
 - Decoding multi-source models in marian-server with --tsv
 - GitHub workflows on Ubuntu, Windows, and MacOS
2 changes: 1 addition & 1 deletion VERSION

@@ -1,2 +1,2 @@
-v1.9.31
+v1.9.33
2 changes: 2 additions & 0 deletions src/common/config_parser.cpp

@@ -687,6 +687,8 @@ void ConfigParser::addOptionsScoring(cli::CLIWrapper& cli) {
   cli.add<std::string>("--alignment",
       "Return word alignments. Possible values: 0.0-1.0, hard, soft")
       ->implicit_val("1"),
+  cli.add<bool>("--word-scores",
+      "Print word-level scores");
 
   addSuboptionsInputLength(cli);
   addSuboptionsTSV(cli);
20 changes: 15 additions & 5 deletions src/common/utils.cpp

@@ -112,15 +112,13 @@ std::vector<std::string> splitAny(const std::string& line,
 }
 
 std::string join(const std::vector<std::string>& words, const std::string& del /*= " "*/) {
-  std::stringstream ss;
-  if(words.empty()) {
+  if(words.empty())
     return "";
-  }
 
+  std::stringstream ss;
   ss << words[0];
-  for(size_t i = 1; i < words.size(); ++i) {
+  for(size_t i = 1; i < words.size(); ++i)
     ss << del << words[i];
-  }
 
   return ss.str();
 }
@@ -131,6 +129,18 @@ std::string join(const std::vector<size_t>& nums, const std::string& del /*= " "
   return join(words, del);
 }
 
+std::string join(const std::vector<float>& nums, const std::string& del /*= " "*/, size_t prec /*= 5*/) {
+  if(nums.empty())
+    return "";
+
+  std::stringstream ss;
+  ss << std::fixed << std::setprecision(prec) << nums[0];
+  for(size_t i = 1; i < nums.size(); ++i)
+    ss << del << nums[i];
+
+  return ss.str();
+}
+
 // escapes a string for passing to popen, which uses /bin/sh to parse its argument string
 static std::string escapeForPOpen(const std::string& arg) {
   // e.g. abc -> 'abc'; my file.txt -> 'my file.txt'; $10 -> '$10'; it's -> 'it'\''s'
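For reference, a minimal standalone sketch of the behavior the new float overload implements; joinFloats is a hypothetical name used here only so the sketch compiles outside the marian::utils namespace:

#include <iomanip>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Mirrors the join(const std::vector<float>&, ...) overload added above.
std::string joinFloats(const std::vector<float>& nums, const std::string& del = " ", size_t prec = 5) {
  if(nums.empty())
    return "";
  std::stringstream ss;
  // std::fixed and std::setprecision persist on the stream, so every
  // subsequent float is formatted with the same precision.
  ss << std::fixed << std::setprecision(prec) << nums[0];
  for(size_t i = 1; i < nums.size(); ++i)
    ss << del << nums[i];
  return ss.str();
}

int main() {
  std::cout << joinFloats({-0.25f, -1.5f}) << std::endl;  // prints "-0.25000 -1.50000"
}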
1 change: 1 addition & 0 deletions src/common/utils.h

@@ -33,6 +33,7 @@ std::vector<std::string> splitAny(const std::string& line,
 
 std::string join(const std::vector<std::string>& words, const std::string& del = " ");
 std::string join(const std::vector<size_t>& words, const std::string& del = " ");
+std::string join(const std::vector<float>& words, const std::string& del = " ", size_t prec = 5);
 
 std::string exec(const std::string& cmd, const std::vector<std::string>& args = {}, const std::string& arg = "");
4 changes: 2 additions & 2 deletions src/data/corpus_base.h

@@ -233,8 +233,8 @@ class SubBatch {
 };
 
 /**
- * @brief Batch of source and target sentences with additional information,
- * such as guided alignments and sentence or word-leve weighting.
+ * @brief Batch of source(s) and target sentences with additional information,
+ * such as guided alignments and sentence or word-level weighting.
  */
 class CorpusBatch : public Batch {
 protected:
11 changes: 6 additions & 5 deletions src/layers/loss.cpp

@@ -8,11 +8,12 @@ Ptr<LabelwiseLoss> newLoss(Ptr<Options> options, bool inference) {
   float factorWeight = options->get<float>("factor-weight", 1.0f);
   std::string costType = options->get<std::string>("cost-type", "ce-mean");
   bool unlikelihood = options->get<bool>("unlikelihood-loss", false);
-
-  if(costType == "ce-rescore") { // returns per-batch-item scores (while ce-mean reduces over batch)
-    return New<RescorerLoss>();
-  } else if(unlikelihood) {
-    ABORT_IF(!options->hasAndNotEmpty("data-weighting")
+
+  if(costType == "ce-rescore") { // per-batch-item scores (while ce-mean reduces over batch)
+    bool wordScores = options->get<bool>("word-scores", false);
+    return New<RescorerLoss>(wordScores);
+  } else if(unlikelihood) {
+    ABORT_IF(!options->hasAndNotEmpty("data-weighting")
                  && options->get<std::string>("data-weighting-type") != "word",
              "Unlikelihood loss training requires error annotation in form of per-target-label scores");
     return New<SequenceUnlikelihoodLoss>(smoothing, factorWeight); // this is a mix of CE-loss and unlikelihood less depending on values given for data-weighting
39 changes: 34 additions & 5 deletions src/layers/loss.h

@@ -434,17 +434,46 @@ class SequenceUnlikelihoodLoss : public CrossEntropyLoss {
   }
 };
-
+
 /**
- * @brief Cross entropy in rescorer used for computing sentences-level log probabilities
+ * @brief Cross entropy in rescorer used for computing sentences-level or word-level log
+ * probabilities
+ *
+ * This class differs from CrossEntropy in the different 'axes' setting, and that label smoothing
+ * is disabled.
  */
 class RescorerLoss : public CrossEntropyLoss {
+private:
+  bool wordScores_{false}; // compute word-level log probabilities
+
 public:
-  // sentence-wise CE, hence reduce only over time axis
-  // This class differs from CrossEntropy in the different 'axes' setting, and that label smoothing is disabled.
-  RescorerLoss() : CrossEntropyLoss(/*axes=*/{-3} /*time axis*/, /*smoothing=*/0.f, /*factorWeight=*/1.0f) {}
+  // For sentence-wise CE reduce only over time axis.
+  // For word-level CE do not reduce over any axis.
+  RescorerLoss(bool wordScores)
+      : CrossEntropyLoss(/*axes=*/wordScores ? std::vector<int>({}) : std::vector<int>({-3}),
+                         /*smoothing=*/0.f,
+                         /*factorWeight=*/1.0f),
+        wordScores_(wordScores) {}
+
+  virtual RationalLoss apply(Logits logits,
+                             const Words& labels,
+                             Expr mask = nullptr,
+                             Expr labelWeights = nullptr) override {
+    ABORT_IF(!mask, "Word-level CE from rescorer must have mask");
+    auto loss = CrossEntropyLoss::compute(logits, labels, mask, labelWeights);
+
+    if(!wordScores_) { // for sentence-level CE, reduce loss and labels as in cross-entropy
+      return reduce(loss, mask);
+    } else { // for word-level CE, reduce labels only to get sentence lengths
+      ABORT_IF(!loss, "Loss has not been computed");
+
+      Expr labelsSum = cast(mask, Type::float32); // accumulate in float32
+      labelsSum = sum(labelsSum, -3); // reduce over time axis to get sentence lengths
+      return RationalLoss(loss, labelsSum);
+    }
+  }
 };
 
+
 /**
  * @brief Factory for label-wise loss functions
  */
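As a toy illustration of the word-level branch above (a sketch only, assuming a time-major [time x batch] layout flattened as index = t * batch + b; marian tensor shapes are more general): summing the mask over the time axis yields the per-sentence label counts that RationalLoss pairs with the unreduced loss.

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  const size_t time = 3, batch = 2;
  // mask[t * batch + b] is 1 where sentence b has a word at position t
  std::vector<float> mask = {1, 1,   // t=0
                             1, 1,   // t=1
                             1, 0};  // t=2: sentence 1 is one word shorter
  // Reduce the mask over the time axis only, as sum(labelsSum, -3) does above.
  std::vector<float> sentLengths(batch, 0.f);
  for(size_t t = 0; t < time; ++t)
    for(size_t b = 0; b < batch; ++b)
      sentLengths[b] += mask[t * batch + b];
  std::cout << sentLengths[0] << " " << sentLengths[1] << std::endl;  // prints "3 2"
}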
53 changes: 44 additions & 9 deletions src/rescorer/rescorer.h

@@ -112,6 +112,10 @@ class Rescore : public ModelTask {
     bool summarize = !summary.empty();
     // @TODO: make normalize here a float and pass into loss to compute the same way as in decoding
     bool normalize = options_->get<bool>("normalize");
+    bool wordLevel = options_->get<bool>("word-scores", false);
+
+    if(wordLevel && summarize)
+      LOG(warn, "[warn] Word-level scores will not be printed if --summarize is enabled");
 
     float sumLoss = 0;
     size_t sumWords = 0;
@@ -142,12 +146,18 @@
       // get loss
       std::vector<float> scoresForSummary;
       dynamicLoss->loss(scoresForSummary);
-      std::vector<float> sentScores(scoresForSummary); // if '--normalize' then report scoresForSummary length-normalized
-      if (normalize) {
-        std::vector<float> sentLengths;
+
+      // get sentence lengths
+      std::vector<float> sentLengths(scoresForSummary);
+      if(normalize || wordLevel) {
         dynamicLoss->count(sentLengths);
-        for (size_t i = 0; i < scoresForSummary.size(); i++) {
-          if (sentScores[i] != 0) // (avoid 0/0)
+      }
+
+      std::vector<float> sentScores(scoresForSummary);
+      // if '--normalize' then report scoresForSummary length-normalized
+      if(normalize && !wordLevel) { // sentence-level scores printed with word-level scores is calculated later
+        for(size_t i = 0; i < scoresForSummary.size(); i++) {
+          if(sentScores[i] != 0) // (avoid 0/0)
             sentScores[i] /= (sentLengths.size() == 1 ? sentLengths[0] : sentLengths[i]); // emulate broadcasting semantics
         }
       }
@@ -158,17 +168,42 @@
         getAlignmentsForBatch(builder->getAlignment(), batch, aligns);
       }
 
+      // update statistics for the summarized score
       std::unique_lock<std::mutex> lock(smutex);
       for(auto s : scoresForSummary)
         sumLoss += s;
       sumWords += batch->back()->batchWords();
       sumSamples += batch->size();
 
       if(!summarize) {
-        for(size_t i = 0; i < batch->size(); ++i) {
-          output->Write((long)batch->getSentenceIds()[i],
-                        -1.f * sentScores[i], // report logProb while score is CE, hence negate
-                        aligns[i]);
+        if(!wordLevel) {
+          for(size_t i = 0; i < batch->size(); ++i)
+            output->Write((long)batch->getSentenceIds()[i],
+                          -1.f * sentScores[i], // report logProb while score is CE, hence negate
+                          aligns[i]);
+        } else {
+          std::vector<float> wordScores;
+          float sentScore{0.f};
+
+          for(size_t i = 0; i < batch->size(); ++i) {
+            // Sum word-level scores to get the sentence score
+            for(size_t j = 0; j < sentLengths[i]; ++j) {
+              size_t idx = j * batch->size() + i; // the j-th word in i-th sentence
+              wordScores.push_back(-1.f * sentScores[idx]); // report logProb, hence negate
+              sentScore += sentScores[idx];
+            }
+
+            sentScore *= -1.f; // report logProb while score is CE, hence negate
+            if(normalize)
+              // Note: word-level scores are not normalized; this is consistent with decoding
+              // TODO: return length-normalized scores in both marian-scorer and marian-decoder
+              sentScore /= sentLengths[i];
+
+            output->Write((long)batch->getSentenceIds()[i], sentScore, aligns[i], wordScores);
+
+            wordScores.clear();
+            sentScore = 0.f;
+          }
         }
       }
 
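A compact sketch of what the new else branch computes, with invented scores and the same flattening assumption as above (the j-th word of sentence i sits at index j * batchSize + i):

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  const size_t batchSize = 2;
  std::vector<float> sentLengths = {3, 2};
  // Flat per-word CE scores, interleaved by batch position.
  std::vector<float> sentScores = {0.5f, 0.1f, 0.7f, 0.2f, 0.3f, 0.f};
  const bool normalize = true;

  for(size_t i = 0; i < batchSize; ++i) {
    std::vector<float> wordScores;
    float sentScore = 0.f;
    const size_t len = (size_t)sentLengths[i];
    for(size_t j = 0; j < len; ++j) {
      size_t idx = j * batchSize + i;                // the j-th word in the i-th sentence
      wordScores.push_back(-1.f * sentScores[idx]);  // report logProb = -CE
      sentScore += sentScores[idx];
    }
    sentScore *= -1.f;
    if(normalize)
      sentScore /= sentLengths[i];  // only the sentence score is length-normalized
    std::cout << "sentence " << i << ": " << sentScore << "\n";  // -0.5 and -0.15
  }
}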
15 changes: 11 additions & 4 deletions src/rescorer/score_collector.cpp

@@ -56,10 +56,13 @@ void ScoreCollector::Write(long id, const std::string& message) {
 
 void ScoreCollector::Write(long id,
                            float score,
-                           const data::SoftAlignment& align) {
+                           const data::SoftAlignment& align /*= {}*/,
+                           const std::vector<float>& wordScores /*= {}*/) {
   auto msg = std::to_string(score);
   if(!alignment_.empty() && !align.empty())
     msg += " ||| " + getAlignment(align);
+  if(!wordScores.empty())
+    msg += " ||| WordScores= " + utils::join(wordScores, " ");
   Write(id, msg);
 }
 
@@ -85,7 +88,8 @@ ScoreCollectorNBest::ScoreCollectorNBest(const Ptr<Options>& options)
 
 void ScoreCollectorNBest::Write(long id,
                                 float score,
-                                const data::SoftAlignment& align) {
+                                const data::SoftAlignment& align /*= {}*/,
+                                const std::vector<float>& wordScores /*= {}*/) {
   std::string line;
   {
     std::lock_guard<std::mutex> lock(mutex_);
@@ -105,17 +109,20 @@ void ScoreCollectorNBest::Write(long id,
     buffer_.erase(iter);
   }
 
-  ScoreCollector::Write(id, addToNBest(line, fname_, score, align));
+  ScoreCollector::Write(id, addToNBest(line, fname_, score, align, wordScores));
 }
 
 std::string ScoreCollectorNBest::addToNBest(const std::string nbest,
                                             const std::string feature,
                                             float score,
-                                            const data::SoftAlignment& align) {
+                                            const data::SoftAlignment& align /*= {}*/,
+                                            const std::vector<float>& wordScores /*= {}*/) {
   auto fields = utils::split(nbest, "|||");
   std::stringstream ss;
   if(!alignment_.empty() && !align.empty())
     ss << " " << getAlignment(align) << " |||";
+  if(!wordScores.empty())
+    ss << " WordScores= " + utils::join(wordScores, " ") << " |||";
   ss << fields[2] << feature << "= " << score << " ";
   fields[2] = ss.str();
   return utils::join(fields, "|||");
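For illustration only (values invented), a line emitted by the plain ScoreCollector with word scores enabled would look like the following: the negated sentence-level score first (std::to_string, six decimals), then the per-word log-probabilities after the WordScores= marker, joined by utils::join at its default precision of 5:

-1.234567 ||| WordScores= -0.10000 -0.52000 -0.61457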
9 changes: 6 additions & 3 deletions src/rescorer/score_collector.h

@@ -18,7 +18,8 @@ class ScoreCollector {
   virtual void Write(long id, const std::string& message);
   virtual void Write(long id,
                      float score,
-                     const data::SoftAlignment& align = {});
+                     const data::SoftAlignment& align = {},
+                     const std::vector<float>& wordScores = {});
 
 protected:
   long nextId_{0};
@@ -51,7 +52,8 @@ class ScoreCollectorNBest : public ScoreCollector {
 
   virtual void Write(long id,
                      float score,
-                     const data::SoftAlignment& align = {}) override;
+                     const data::SoftAlignment& align = {},
+                     const std::vector<float>& wordScores = {}) override;
 
 private:
   std::string nBestList_;
@@ -63,6 +65,7 @@ class ScoreCollectorNBest : public ScoreCollector {
   std::string addToNBest(const std::string nbest,
                          const std::string feature,
                          float score,
-                         const data::SoftAlignment& align = {});
+                         const data::SoftAlignment& align = {},
+                         const std::vector<float>& wordScores = {});
 };
 } // namespace marian
