Merged PR 12243: For int8 quantized model, use int8 quantization for …

…encoders as well. For int8 quantized models, use int8 quantization for the encoders as well. The quality difference between an fp16 encoder and an int8 encoder is small, but the speed difference between them is considerable.
marian-nmt · May 20, 2020 · 128e1fc · 128e1fc
1 parent 63006db
commit 128e1fc
Showing 1 changed file with 5 additions and 8 deletions.
diff --git a/src/tensors/cpu/fbgemm/expression_graph_packable.h b/src/tensors/cpu/fbgemm/expression_graph_packable.h
@@ -36,11 +36,10 @@ class ExpressionGraphPackable : public ExpressionGraph {
 
       // save as packed format
       // @TODO Hardcoded to find packable weights
-      // int8 - quantize decoder only for better quality, all the weights used for affine op and dot op (int8)
-      // fp16 - all the weights used for affine op (fp16)
+      // int8 - all the weights used for affine op and dot op
+      // fp16 - all the weights used for affine op
       if ((gemmElementType == Type::packed8avx2 || gemmElementType == Type::packed8avx512)
-        && (pName.find("_W") == pName.length() - 3 || pName.find("_W") == pName.length() - 2)
-        && pName.find("encoder") == std::string::npos) {
+        && (pName.find("_W") == pName.length() - 3 || pName.find("_W") == pName.length() - 2)) {
 #if USE_FBGEMM
         using namespace marian::cpu::variant;
         // packing information - size
@@ -85,10 +84,8 @@ class ExpressionGraphPackable : public ExpressionGraph {
 #else
         ABORT("Packed type {} only supported when compiled with -DUSE_FBGEMM=on", gemmElementType);
 #endif
-      // fp16 quantization option + encoders for int8 quantized models
-      } else if ((gemmElementType == Type::packed16 && pName.find("_W") == pName.length() - 3)
-        || ((gemmElementType == Type::packed8avx2 || gemmElementType == Type::packed8avx512)
-        && (pName.find("_W") == pName.length() - 3 || pName.find("_W") == pName.length() - 2))) {
+      // fp16 quantization option
+      } else if (gemmElementType == Type::packed16 && pName.find("_W") == pName.length() - 3) {
 #if USE_FBGEMM
         using namespace marian::cpu::variant;