From 24be24082552aa41f6342fb3803fa5c6593eae12 Mon Sep 17 00:00:00 2001 From: Ziyue Zhang Date: Tue, 2 Jul 2024 18:02:52 +0800 Subject: [PATCH] rv64v: fix buggy or missing exception checks * add exception check for vector load/store, remove alignment check when accessing vector registers * move vector load/store instruction info generation from decode stage to execute stage * fix illegal vtype check * when vstart is not zero and it is an arithmetic vector instruction, raise illegal instruction exception * support sew check for zext and sext instructions --- src/isa/riscv64/instr/rvf/decode.h | 16 -- src/isa/riscv64/instr/rvv/vcfg.h | 14 +- src/isa/riscv64/instr/rvv/vcommon.c | 27 +++- src/isa/riscv64/instr/rvv/vcommon.h | 1 + src/isa/riscv64/instr/rvv/vcompute.h | 12 +- src/isa/riscv64/instr/rvv/vcompute_impl.c | 28 ++-- src/isa/riscv64/instr/rvv/vldst.h | 33 ++++ src/isa/riscv64/instr/rvv/vldst_impl.c | 187 +++++++++++++++++----- src/isa/riscv64/instr/rvv/vreg_impl.c | 4 +- src/isa/riscv64/system/priv.c | 1 + 10 files changed, 226 insertions(+), 97 deletions(-) diff --git a/src/isa/riscv64/instr/rvf/decode.h b/src/isa/riscv64/instr/rvf/decode.h index 1765880a1..e935f7bfd 100644 --- a/src/isa/riscv64/instr/rvf/decode.h +++ b/src/isa/riscv64/instr/rvf/decode.h @@ -115,14 +115,6 @@ def_THelper(vstore_mmu) { def_THelper(fload) { print_Dop(id_src1->str, OP_STR_SIZE, "%ld(%s)", id_src2->imm, reg_name(s->isa.instr.i.rs1, 4)); - #ifdef CONFIG_RVV - const int table [8] = {1, 0, 0, 0, 0, 2, 4, 8}; - s->vm = s->isa.instr.v_opv.v_vm; //1 for without mask; 0 for with mask - s->v_width = table[s->isa.instr.vldfp.v_width]; - s->v_nf = s->isa.instr.vldfp.v_nf; - s->v_lsumop = s->isa.instr.vldfp.v_lsumop; - #endif // CONFIG_RVV - int mmu_mode = isa_mmu_state(); if (mmu_mode == MMU_DIRECT) { if (fp_enable()) { @@ -163,14 +155,6 @@ def_THelper(fload) { def_THelper(fstore) { print_Dop(id_src1->str, OP_STR_SIZE, "%ld(%s)", id_src2->imm, reg_name(s->isa.instr.i.rs1, 4)); - #ifdef 
CONFIG_RVV - const int table [8] = {1, 0, 0, 0, 0, 2, 4, 8}; - s->vm = s->isa.instr.v_opv.v_vm; //1 for without mask; 0 for with mask - s->v_width = table[s->isa.instr.vldfp.v_width]; - s->v_nf = s->isa.instr.vldfp.v_nf; - s->v_lsumop = s->isa.instr.vldfp.v_lsumop; - #endif // CONFIG_RVV - int mmu_mode = isa_mmu_state(); if (mmu_mode == MMU_DIRECT) { #ifndef CONFIG_FPU_NONE diff --git a/src/isa/riscv64/instr/rvv/vcfg.h b/src/isa/riscv64/instr/rvv/vcfg.h index 5e480a011..4f6e03a79 100644 --- a/src/isa/riscv64/instr/rvv/vcfg.h +++ b/src/isa/riscv64/instr/rvv/vcfg.h @@ -49,20 +49,18 @@ void set_vtype_vl(Decode *s, int mode) { if(vl_num == (uint64_t)-1 || check_vlmul_sew_illegal(id_src2->val)) { vtype->val = error; + + // if vtype illegal, set vl = 0, vd = 0 + vl->val = 0; + rtl_sr(s, id_dest->reg, &vl->val, 8); + return; } else { vtype->val = id_src2->val; } - // if vtype illegal,set vl = 0 ,vd = 0 - if(check_vlmul_sew_illegal(id_src2->val)){ - vl->val = 0; - - rtl_sr(s, id_dest->reg, &vl->val, 8/*4*/); - return; - } vl->val = vl_num; - rtl_sr(s, id_dest->reg, &vl_num, 8/*4*/); + rtl_sr(s, id_dest->reg, &vl_num, 8); vstart->val = 0; } diff --git a/src/isa/riscv64/instr/rvv/vcommon.c b/src/isa/riscv64/instr/rvv/vcommon.c index f3fcc7726..c4f6a1d1f 100644 --- a/src/isa/riscv64/instr/rvv/vcommon.c +++ b/src/isa/riscv64/instr/rvv/vcommon.c @@ -3,6 +3,8 @@ #include #include "vcommon.h" +#include + uint8_t check_vstart_ignore(Decode *s) { if(vstart->val >= vl->val) { if(vstart->val > 0) { @@ -15,13 +17,28 @@ uint8_t check_vstart_ignore(Decode *s) { return 0; } -bool check_vlmul_sew_illegal(rtlreg_t vtype_req){ - vtype_t vt = (vtype_t )vtype_req; +uint8_t check_vstart_exception(Decode *s) { + if(vstart->val > 0) { + longjmp_exception(EX_II); + } + if (vl->val == 0) { + return 1; + } + return 0; +} + +bool check_vlmul_sew_illegal(rtlreg_t vtype_req) { + vtype_t vt = (vtype_t) vtype_req; int vlmul = vt.vlmul; - int vsew = vt.vsew; if (vlmul > 4) vlmul -= 8; - if((vlmul < vsew 
+ 3 - log2(MAXELEN)) || vlmul == 4) return true; // vmul < sew/ELEN || vlmul == 100 - return false; + int vsew = 8 << vt.vsew; + float vflmul = vlmul >= 0 ? 1 << vlmul : 1.0 / (1 << -vlmul); + float min_vflmul = vflmul < 1.0f ? vflmul : 1.0f; + int vill = !(vflmul >= 0.125 && vflmul <= 8) + || vsew > min_vflmul * 64 + || (vtype_req >> 8) != 0 + || vsew > 64; + return vill == 1; } void set_NAN(rtlreg_t* fpreg, uint64_t vsew){ diff --git a/src/isa/riscv64/instr/rvv/vcommon.h b/src/isa/riscv64/instr/rvv/vcommon.h index c8f601bbb..5440cb77b 100644 --- a/src/isa/riscv64/instr/rvv/vcommon.h +++ b/src/isa/riscv64/instr/rvv/vcommon.h @@ -9,6 +9,7 @@ #include "../local-include/rtl.h" uint8_t check_vstart_ignore(Decode *s); +uint8_t check_vstart_exception(Decode *s); bool check_vlmul_sew_illegal(rtlreg_t vtype_req); void set_NAN(rtlreg_t* fpreg, uint64_t vsew); bool check_isFpCanonicalNAN(rtlreg_t* fpreg, uint64_t vsew); diff --git a/src/isa/riscv64/instr/rvv/vcompute.h b/src/isa/riscv64/instr/rvv/vcompute.h index 8fdcf27da..dad60cca0 100644 --- a/src/isa/riscv64/instr/rvv/vcompute.h +++ b/src/isa/riscv64/instr/rvv/vcompute.h @@ -26,13 +26,11 @@ def_EHelper(vadd) { def_EHelper(vsub) { Assert(s->src_vmode != SRC_VI, "vsub.vi not supported\n"); ARTHI(SUB, SIGNED) - // print_asm_template3(vsub); } def_EHelper(vrsub) { Assert(s->src_vmode != SRC_VV, "vrsub.vv not supported\n"); ARTHI(RSUB, SIGNED) - // print_asm_template3(vrsub); } def_EHelper(vminu) { @@ -378,7 +376,7 @@ def_EHelper(vmvnr) { def_EHelper(vpopc) { require_vector(true); if(vstart->val != 0) - check_vstart_ignore(s); + check_vstart_exception(s); rtl_li(s, s1, 0); for(int idx = vstart->val; idx < vl->val; idx ++) { @@ -400,7 +398,7 @@ def_EHelper(vpopc) { def_EHelper(vfirst) { require_vector(true); if(vstart->val != 0) - check_vstart_ignore(s); + check_vstart_exception(s); int pos = -1; for(int idx = vstart->val; idx < vl->val; idx ++) { @@ -571,7 +569,7 @@ def_EHelper(viota) { require_aligned(id_dest->reg, vflmul); 
require_noover(id_dest->reg, vflmul, id_src2->reg, 1); - if(!check_vstart_ignore(s)) { + if(!check_vstart_exception(s)) { rtl_li(s, s1, 0); for(int idx = vstart->val; idx < vl->val; idx ++) { rtlreg_t mask = get_mask(0, idx, vtype->vsew, vtype->vlmul); @@ -617,7 +615,7 @@ def_EHelper(vid) { double vflmul = compute_vflmul(); require_aligned(id_dest->reg, vflmul); - if(!check_vstart_ignore(s)) { + if(!check_vstart_exception(s)) { for(int idx = 0; idx < vl->val; idx ++) { // mask rtlreg_t mask = get_mask(0, idx, vtype->vsew, vtype->vlmul); @@ -787,7 +785,7 @@ def_EHelper(vcompress) { longjmp_exception(EX_II); } require_noover(id_dest->reg, vflmul, id_src->reg, 1); - if(!check_vstart_ignore(s)) { + if(!check_vstart_exception(s)) { rtl_li(s, s1, 0); for(int idx = vstart->val; idx < vl->val; idx ++) { diff --git a/src/isa/riscv64/instr/rvv/vcompute_impl.c b/src/isa/riscv64/instr/rvv/vcompute_impl.c index cc122fafa..d5607718f 100644 --- a/src/isa/riscv64/instr/rvv/vcompute_impl.c +++ b/src/isa/riscv64/instr/rvv/vcompute_impl.c @@ -318,7 +318,6 @@ void vector_slide_check(Decode *s, bool is_over) { } void arthimetic_instr(int opcode, int is_signed, int widening, int narrow, int dest_mask, Decode *s) { - if(check_vstart_ignore(s)) return; require_vector(true); int vlmax = get_vlmax(vtype->vsew, vtype->vlmul); int idx; @@ -369,7 +368,12 @@ void arthimetic_instr(int opcode, int is_signed, int widening, int narrow, int d } else { vector_vwv_check(s, false); } + } else if (narrow < 0) { + if (vtype->vsew + narrow < 0) { + longjmp_exception(EX_II); + } } + if(check_vstart_exception(s)) return; for(idx = vstart->val; idx < vl->val; idx ++) { // mask rtlreg_t mask = get_mask(0, idx, vtype->vsew, vtype->vlmul); @@ -417,7 +421,7 @@ void arthimetic_instr(int opcode, int is_signed, int widening, int narrow, int d switch (opcode) { case VEXT: eew = vtype->vsew + narrow; - emul = vtype->vlmul - ((vtype->vsew) - (vtype->vsew + narrow)); + emul = vtype->vlmul + narrow; break; default: eew 
= vtype->vsew + narrow; @@ -833,8 +837,7 @@ void arthimetic_instr(int opcode, int is_signed, int widening, int narrow, int d * because the illegal instruction exception is handled in vcompute.h for vrgather and vslide instruction */ void permutaion_instr(int opcode, Decode *s) { - if(check_vstart_ignore(s)) return; - require_vector(true); + if(check_vstart_exception(s)) return; int vlmax = get_vlmax(vtype->vsew, vtype->vlmul); int idx; for(idx = vstart->val; idx < vl->val; idx ++) { @@ -978,7 +981,6 @@ void permutaion_instr(int opcode, Decode *s) { } void floating_arthimetic_instr(int opcode, int is_signed, int widening, int dest_mask, Decode *s) { - if(check_vstart_ignore(s)) return; require_vector(true); if (dest_mask) { if (s->src_vmode == SRC_VV) { @@ -1013,17 +1015,13 @@ void floating_arthimetic_instr(int opcode, int is_signed, int widening, int dest vector_wwv_check(s, false); } } + if(check_vstart_exception(s)) return; int idx; word_t FPCALL_TYPE = FPCALL_W64; // fpcall type switch (vtype->vsew) { case 0 : - switch (widening) { - case vdNarrow : FPCALL_TYPE = FPCALL_W16; break; - case vdWidening : FPCALL_TYPE = FPCALL_W8; break; - default : Loge("f8 not supported"); longjmp_exception(EX_II); break; - } - break; + Loge("f8 not supported"); longjmp_exception(EX_II); break; case 1 : switch (widening) { case vsdWidening : FPCALL_TYPE = FPCALL_W16_to_32; break; @@ -1215,7 +1213,7 @@ void mask_instr(int opcode, Decode *s) { if (s->vm == 0) { longjmp_exception(EX_II); } - if(check_vstart_ignore(s)) return; + if(check_vstart_exception(s)) return; int idx; for(idx = vstart->val; idx < vl->val; idx++) { // operand - vs2 @@ -1268,8 +1266,8 @@ vector register, not a vector register group, so any vector register can be the scalar source or destination of a vector reduction regardless of LMUL setting. 
*/ void reduction_instr(int opcode, int is_signed, int wide, Decode *s) { - if(check_vstart_ignore(s)) return; vector_reduction_check(s, wide); + if(check_vstart_exception(s)) return; // operand - vs1 get_vreg(id_src->reg, 0, s1, vtype->vsew+wide, vtype->vlmul, is_signed, 0); if(is_signed) rtl_sext(s, s1, s1, 1 << (vtype->vsew+wide)); @@ -1311,8 +1309,8 @@ void reduction_instr(int opcode, int is_signed, int wide, Decode *s) { } void float_reduction_instr(int opcode, int widening, Decode *s) { - if(check_vstart_ignore(s)) return; vector_reduction_check(s, widening); + if(check_vstart_exception(s)) return; if (widening) get_vreg(id_src->reg, 0, s1, vtype->vsew+1, vtype->vlmul, 0, 1); else @@ -1448,8 +1446,8 @@ void float_reduction_step1(uint64_t src1, uint64_t src2, Decode *s) { } void float_reduction_computing(Decode *s) { - if(check_vstart_ignore(s)) return; vector_reduction_check(s, false); + if(check_vstart_exception(s)) return; word_t FPCALL_TYPE = FPCALL_W64; int idx; diff --git a/src/isa/riscv64/instr/rvv/vldst.h b/src/isa/riscv64/instr/rvv/vldst.h index 11a69d3ce..bb7d862ff 100644 --- a/src/isa/riscv64/instr/rvv/vldst.h +++ b/src/isa/riscv64/instr/rvv/vldst.h @@ -19,23 +19,40 @@ #include "vldst_impl.h" #include "vcompute_impl.h" +// if we decode some information in decode stage +// when running in opt mode, these information will not be generated because +// it only runs the exec functions +void predecode_vls(Decode *s) { +#ifdef CONFIG_RVV + const int table [8] = {1, 0, 0, 0, 0, 2, 4, 8}; + s->vm = s->isa.instr.v_opv.v_vm; //1 for without mask; 0 for with mask + s->v_width = table[s->isa.instr.vldfp.v_width]; + s->v_nf = s->isa.instr.vldfp.v_nf; + s->v_lsumop = s->isa.instr.vldfp.v_lsumop; +#endif +} + def_EHelper(vle) { //unit-strided + predecode_vls(s); require_vector(true); VLD(MODE_UNIT, UNSIGNED, s, MMU_DIRECT) } def_EHelper(vlm) { //mask + predecode_vls(s); require_vector(true); VLD(MODE_MASK, UNSIGNED, s, MMU_DIRECT) } def_EHelper(vlr) { // whole 
register + predecode_vls(s); require_vector(false); VLR(MODE_UNIT, UNSIGNED, s, MMU_DIRECT) } def_EHelper(vlse) { //strided unsigned + predecode_vls(s); require_vector(true); s->src2.reg = s->isa.instr.fp.rs2; rtl_lr(s, &(s->src2.val), s->src2.reg, 4); @@ -43,6 +60,7 @@ def_EHelper(vlse) { //strided unsigned } def_EHelper(vlxe) { + predecode_vls(s); require_vector(true); s->src2.reg = s->isa.instr.fp.rs2; rtl_lr(s, &(s->src2.val), s->src2.reg, 4); @@ -50,21 +68,25 @@ def_EHelper(vlxe) { } def_EHelper(vse) { + predecode_vls(s); require_vector(true); VST(MODE_UNIT, MMU_DIRECT) } def_EHelper(vsm) { + predecode_vls(s); require_vector(true); VST(MODE_MASK, MMU_DIRECT) } def_EHelper(vsr) { + predecode_vls(s); require_vector(false); VSR(MODE_UNIT, MMU_DIRECT) } def_EHelper(vsse) { + predecode_vls(s); require_vector(true); s->src2.reg = s->isa.instr.fp.rs2; rtl_lr(s, &(s->src2.val), s->src2.reg, 4); @@ -72,6 +94,7 @@ def_EHelper(vsse) { } def_EHelper(vsxe) { + predecode_vls(s); require_vector(true); s->src2.reg = s->isa.instr.fp.rs2; rtl_lr(s, &(s->src2.val), s->src2.reg, 4); @@ -79,21 +102,25 @@ def_EHelper(vsxe) { } def_EHelper(vle_mmu) { //unit-strided + predecode_vls(s); require_vector(true); VLD(MODE_UNIT, UNSIGNED, s, MMU_TRANSLATE) } def_EHelper(vlm_mmu) { //mask + predecode_vls(s); require_vector(true); VLD(MODE_MASK, UNSIGNED, s, MMU_TRANSLATE) } def_EHelper(vlr_mmu) { //whple register + predecode_vls(s); require_vector(false); VLR(MODE_UNIT, UNSIGNED, s, MMU_TRANSLATE) } def_EHelper(vlse_mmu) { //strided unsigned + predecode_vls(s); require_vector(true); s->src2.reg = s->isa.instr.fp.rs2; rtl_lr(s, &(s->src2.val), s->src2.reg, 4); @@ -101,6 +128,7 @@ def_EHelper(vlse_mmu) { //strided unsigned } def_EHelper(vlxe_mmu) { + predecode_vls(s); require_vector(true); s->src2.reg = s->isa.instr.fp.rs2; rtl_lr(s, &(s->src2.val), s->src2.reg, 4); @@ -108,26 +136,31 @@ def_EHelper(vlxe_mmu) { } def_EHelper(vse_mmu) { + predecode_vls(s); require_vector(true); VST(MODE_UNIT, 
MMU_TRANSLATE) } def_EHelper(vsm_mmu) { + predecode_vls(s); require_vector(true); VST(MODE_MASK, MMU_TRANSLATE) } def_EHelper(vsr_mmu) { + predecode_vls(s); require_vector(false); VSR(MODE_UNIT, MMU_TRANSLATE) } def_EHelper(vsse_mmu) { + predecode_vls(s); require_vector(true); VST(MODE_STRIDED, MMU_TRANSLATE) } def_EHelper(vsxe_mmu) { + predecode_vls(s); require_vector(true); s->src2.reg = s->isa.instr.fp.rs2; rtl_lr(s, &(s->src2.val), s->src2.reg, 4); diff --git a/src/isa/riscv64/instr/rvv/vldst_impl.c b/src/isa/riscv64/instr/rvv/vldst_impl.c index 9fc4879ed..71d0ecb49 100644 --- a/src/isa/riscv64/instr/rvv/vldst_impl.c +++ b/src/isa/riscv64/instr/rvv/vldst_impl.c @@ -18,11 +18,12 @@ #include #include "vldst_impl.h" +#include "vcompute_impl.h" #include "../local-include/intr.h" // reference: v_ext_macros.h in riscv-isa-sim -void isa_emul_check(int emul, int nfields) { +static void isa_emul_check(int emul, int nfields) { if (emul > 3) { Log("vector EMUL > 8 happen: EMUL:%d\n", (1 << emul)); longjmp_exception(EX_II); @@ -42,11 +43,108 @@ void isa_emul_check(int emul, int nfields) { } } +static void vstore_check(int mode, Decode *s) { + int eew = 0; + switch(s->v_width) { + case 1: eew = 0; break; + case 2: eew = 1; break; + case 4: eew = 2; break; + case 8: eew = 3; break; + default: Loge("illegal v_width: %d\n", s->v_width); + longjmp_exception(EX_II); break; + } + uint64_t veew = mode == MODE_MASK ? 1 : 8 << eew; + uint64_t vsew = 8 << vtype->vsew; + double vflmul = compute_vflmul(); + float vemul = mode == MODE_MASK ? 1 : ((float)veew / vsew * vflmul); + uint64_t emul = vemul < 1 ? 
1 : vemul; + if (!(vemul >= 0.125 && vemul <= 8)) { + Loge("illegal EMUL: %f\n", vemul); + longjmp_exception(EX_II); + } + require_aligned(id_dest->reg, vemul); + uint64_t nf = s->v_nf + 1; + if (!((nf * emul <= 8) && (id_dest->reg + nf * emul <= 32))) { + Loge("illegal NFIELDS: %lu EMUL: %lu\n", nf, emul); + longjmp_exception(EX_II); + } +} + +static void vload_check(int mode, Decode *s) { + vstore_check(mode, s); + require_vm(s); +} + +static void index_vstore_check(int mode, Decode *s) { + int eew = vtype->vsew; + int elt_width = 0; + switch(s->v_width) { + case 1: elt_width = 0; break; + case 2: elt_width = 1; break; + case 4: elt_width = 2; break; + case 8: elt_width = 3; break; + default: break; + } + double vflmul = compute_vflmul(); + float vemul = (float)(8 << elt_width) / (8 << eew) * vflmul; + if (!(vemul >= 0.125 && vemul <= 8)) { + Loge("illegal EMUL: %f\n", vemul); + longjmp_exception(EX_II); + } + + uint64_t flmul = vflmul < 1 ? 1 : vflmul; + + require_aligned(id_dest->reg, vflmul); + require_aligned(id_src2->reg, vemul); + + uint64_t nf = s->v_nf + 1; + if (!((nf * flmul <= 8) && (id_dest->reg + nf * flmul <= 32))) { + Loge("illegal NFIELDS: %lu LMUL: %lu\n", nf, flmul); + longjmp_exception(EX_II); + } +} + +static void index_vload_check(int mode, Decode *s) { + index_vstore_check(mode, s); + int eew = vtype->vsew; + int elt_width = 0; + switch(s->v_width) { + case 1: elt_width = 0; break; + case 2: elt_width = 1; break; + case 4: elt_width = 2; break; + case 8: elt_width = 3; break; + default: break; + } + uint64_t nf = s->v_nf + 1; + double vflmul = compute_vflmul(); + float vemul = (float)(8 << elt_width) / (8 << eew) * vflmul; + uint64_t flmul = vflmul < 1 ? 
1 : vflmul; + for (uint64_t idx = 0; idx < nf; idx++) { + uint64_t seg_vd = id_dest->reg + idx * flmul; + if (elt_width > eew) { + if (seg_vd != id_src2->reg) { + require_noover(seg_vd, vflmul, id_src2->reg, vemul); + } + } else if (elt_width < eew) { + if (vemul < 1) { + require_noover(seg_vd, vflmul, id_src2->reg, vemul); + } else { + require_noover_widen(seg_vd, vflmul, id_src2->reg, vemul); + } + } + if (nf >= 2) { + require_noover(seg_vd, vflmul, id_src2->reg, vemul); + } + } + require_vm(s); +} + void vld(int mode, int is_signed, Decode *s, int mmu_mode) { + vload_check(mode, s); if(check_vstart_ignore(s)) return; word_t idx; uint64_t nf, fn, vl_val, base_addr, vd, addr; - int eew, emul, emul_coding, stride, is_stride; + int eew, emul, stride, is_unit_stride; // s->v_width is the bytes of a unit // eew is the coding like vsew @@ -58,17 +156,17 @@ void vld(int mode, int is_signed, Decode *s, int mmu_mode) { case 8: eew = 3; break; default: break; } - emul_coding = vtype->vlmul > 4 ? vtype->vlmul - 8 + eew - vtype->vsew : vtype->vlmul + eew - vtype->vsew; - isa_emul_check(mode == MODE_MASK ? 1 : emul_coding, 1); - emul_coding = emul_coding < 0 ? 0 : emul_coding; - emul = 1 << emul_coding; + emul = vtype->vlmul > 4 ? vtype->vlmul - 8 + eew - vtype->vsew : vtype->vlmul + eew - vtype->vsew; + isa_emul_check(mode == MODE_MASK ? 1 : emul, 1); + emul = emul < 0 ? 0 : emul; + emul = 1 << emul; if (mode == MODE_STRIDED) { stride = id_src2->val; - is_stride = 0; + is_unit_stride = 0; } else { stride = 0; - is_stride = 1; + is_unit_stride = 1; } // previous decode does not load vals for us rtl_lr(s, &(s->src1.val), s->src1.reg, 4); @@ -84,24 +182,24 @@ void vld(int mode, int is_signed, Decode *s, int mmu_mode) { if (RVV_AGNOSTIC && vtype->vma) { tmp_reg[1] = (uint64_t) -1; for (fn = 0; fn < nf; fn++) { - set_vreg(vd + fn * emul, idx, tmp_reg[1], eew, emul_coding, mode == MODE_MASK ? 
0 : 1); + set_vreg(vd + fn * emul, idx, tmp_reg[1], eew, 0, 0); } } continue; } for (fn = 0; fn < nf; fn++) { - addr = base_addr + idx * stride + (idx * nf * is_stride + fn) * s->v_width; + addr = base_addr + idx * stride + (idx * nf * is_unit_stride + fn) * s->v_width; rtl_lm(s, &tmp_reg[1], &addr, 0, s->v_width, mmu_mode); - set_vreg(vd + fn * emul, idx, tmp_reg[1], eew, emul_coding, mode == MODE_MASK ? 0 : 1); + set_vreg(vd + fn * emul, idx, tmp_reg[1], eew, 0, 0); } } if (RVV_AGNOSTIC && (mode == MODE_MASK || vtype->vta)) { // set tail of vector register to 1 - int vlmax = mode == MODE_MASK ? VLEN / 8 : get_vlen_max(eew, emul_coding, 0); + int vlmax = mode == MODE_MASK ? VLEN / 8 : get_vlen_max(vtype->vsew, vtype->vlmul, 0); for(idx = vl_val; idx < vlmax; idx++) { tmp_reg[1] = (uint64_t) -1; for (fn = 0; fn < nf; fn++) { - set_vreg(vd + fn * emul, idx, tmp_reg[1], eew, emul_coding, mode == MODE_MASK ? 0 : 1); + set_vreg(vd + fn * emul, idx, tmp_reg[1], eew, 0, 0); } } } @@ -115,22 +213,22 @@ void vldx(int mode, int is_signed, Decode *s, int mmu_mode) { // 5 -> 16 1 -> 16 // 6 -> 32 2 -> 32 // 7 -> 64 3 -> 64 + index_vload_check(mode, s); if(check_vstart_ignore(s)) return; word_t idx; uint64_t nf = s->v_nf + 1, fn, vl_val, base_addr, vd, index, addr; - int eew, lmul, index_width, data_length; + int eew, lmul, index_width, data_width; index_width = 0; eew = vtype->vsew; - s->v_width = s->isa.instr.vldfp.v_width; switch(s->v_width) { - case 0: index_width = 0; break; - case 5: index_width = 1; break; - case 6: index_width = 2; break; - case 7: index_width = 3; break; + case 1: index_width = 0; break; + case 2: index_width = 1; break; + case 4: index_width = 2; break; + case 8: index_width = 3; break; default: break; } - data_length = 1 << eew; + data_width = 1 << eew; lmul = vtype->vlmul > 4 ? vtype->vlmul - 8 : vtype->vlmul; isa_emul_check(lmul, nf); lmul = lmul < 0 ? 
0 : lmul; @@ -149,31 +247,31 @@ void vldx(int mode, int is_signed, Decode *s, int mmu_mode) { if (RVV_AGNOSTIC && vtype->vma) { tmp_reg[1] = (uint64_t) -1; for (fn = 0; fn < nf; fn++) { - set_vreg(vd + fn * lmul, idx, tmp_reg[1], eew, vtype->vlmul, 1); + set_vreg(vd + fn * lmul, idx, tmp_reg[1], eew, 0, 0); } } continue; } for (fn = 0; fn < nf; fn++) { // read index - get_vreg(id_src2->reg, idx, &tmp_reg[2], index_width, vtype->vlmul, 0, 1); + get_vreg(id_src2->reg, idx, &tmp_reg[2], index_width, 0, 0, 0); index = tmp_reg[2]; // read data in memory - addr = base_addr + index + fn * data_length; + addr = base_addr + index + fn * data_width; s->v_is_vx = 1; - rtl_lm(s, &tmp_reg[1], &addr, 0, data_length, mmu_mode); + rtl_lm(s, &tmp_reg[1], &addr, 0, data_width, mmu_mode); s->v_is_vx = 0; - set_vreg(vd + fn * lmul, idx, tmp_reg[1], eew, vtype->vlmul, 1); + set_vreg(vd + fn * lmul, idx, tmp_reg[1], eew, 0, 0); } } if (RVV_AGNOSTIC && vtype->vta) { // set tail of vector register to 1 - int vlmax = get_vlen_max(eew, vtype->vlmul, 0); + int vlmax = get_vlen_max(vtype->vsew, vtype->vlmul, 0); for(idx = vl->val; idx < vlmax; idx++) { tmp_reg[1] = (uint64_t) -1; for (fn = 0; fn < nf; fn++) { - set_vreg(vd + fn * lmul, idx, tmp_reg[1], eew, vtype->vlmul, 1); + set_vreg(vd + fn * lmul, idx, tmp_reg[1], eew, 0, 0); } } } @@ -184,10 +282,11 @@ void vldx(int mode, int is_signed, Decode *s, int mmu_mode) { } void vst(int mode, Decode *s, int mmu_mode) { + vstore_check(mode, s); if(check_vstart_ignore(s)) return; word_t idx; uint64_t nf, fn, vl_val, base_addr, vd, addr; - int eew, emul, stride, is_stride; + int eew, emul, stride, is_unit_stride; eew = 0; switch(s->v_width) { @@ -204,10 +303,10 @@ void vst(int mode, Decode *s, int mmu_mode) { if (mode == MODE_STRIDED) { stride = id_src2->val; - is_stride = 0; + is_unit_stride = 0; } else { stride = 0; - is_stride = 1; + is_unit_stride = 1; } // previous decode does not load vals for us rtl_lr(s, &(s->src1.val), s->src1.reg, 4); @@ 
-223,8 +322,8 @@ void vst(int mode, Decode *s, int mmu_mode) { continue; } for (fn = 0; fn < nf; fn++) { - get_vreg(vd + fn * emul, idx, &tmp_reg[1], eew, vtype->vlmul, 0, mode == MODE_MASK ? 0 : 1); - addr = base_addr + idx * stride + (idx * nf * is_stride + fn) * s->v_width; + get_vreg(vd + fn * emul, idx, &tmp_reg[1], eew, 0, 0, 0); + addr = base_addr + idx * stride + (idx * nf * is_unit_stride + fn) * s->v_width; rtl_sm(s, &tmp_reg[1], &addr, 0, s->v_width, mmu_mode); } } @@ -234,22 +333,22 @@ void vst(int mode, Decode *s, int mmu_mode) { } void vstx(int mode, Decode *s, int mmu_mode) { + index_vload_check(mode, s); if(check_vstart_ignore(s)) return; word_t idx; uint64_t nf = s->v_nf + 1, fn, vl_val, base_addr, vd, index, addr; - int eew, lmul, index_width, data_length; + int eew, lmul, index_width, data_width; index_width = 0; eew = vtype->vsew; - s->v_width = s->isa.instr.vldfp.v_width; switch(s->v_width) { - case 0: index_width = 0; break; - case 5: index_width = 1; break; - case 6: index_width = 2; break; - case 7: index_width = 3; break; + case 1: index_width = 0; break; + case 2: index_width = 1; break; + case 4: index_width = 2; break; + case 8: index_width = 3; break; default: break; } - data_length = 1 << eew; + data_width = 1 << eew; lmul = vtype->vlmul > 4 ? vtype->vlmul - 8 : vtype->vlmul; isa_emul_check(lmul, nf); lmul = lmul < 0 ? 
0 : lmul; @@ -269,14 +368,14 @@ void vstx(int mode, Decode *s, int mmu_mode) { } for (fn = 0; fn < nf; fn++) { // read index - get_vreg(id_src2->reg, idx, &tmp_reg[2], index_width, vtype->vlmul, 0, 1); + get_vreg(id_src2->reg, idx, &tmp_reg[2], index_width, 0, 0, 0); index = tmp_reg[2]; // read data in vector register - get_vreg(vd + fn * lmul, idx, &tmp_reg[1], eew, vtype->vlmul, 0, 1); - addr = base_addr + index + fn * data_length; + get_vreg(vd + fn * lmul, idx, &tmp_reg[1], eew, 0, 0, 0); + addr = base_addr + index + fn * data_width; s->v_is_vx = 1; - rtl_sm(s, &tmp_reg[1], &addr, 0, data_length, mmu_mode); + rtl_sm(s, &tmp_reg[1], &addr, 0, data_width, mmu_mode); s->v_is_vx = 0; } } @@ -286,7 +385,7 @@ void vstx(int mode, Decode *s, int mmu_mode) { vp_set_dirty(); } -void isa_whole_reg_check(uint64_t vd, uint64_t nfields) { +static void isa_whole_reg_check(uint64_t vd, uint64_t nfields) { if (nfields != 1 && nfields != 2 && nfields != 4 && nfields != 8) { Log("illegal NFIELDS for whole register instrs: NFIELDS:%lu", nfields); longjmp_exception(EX_II); diff --git a/src/isa/riscv64/instr/rvv/vreg_impl.c b/src/isa/riscv64/instr/rvv/vreg_impl.c index 512a3082f..87f3025b1 100644 --- a/src/isa/riscv64/instr/rvv/vreg_impl.c +++ b/src/isa/riscv64/instr/rvv/vreg_impl.c @@ -40,7 +40,7 @@ rtlreg_t check_vsetvl(rtlreg_t vtype_req, rtlreg_t vl_req, int mode) { if (mode == 1) { return VLMAX; } else if (mode == 2) { - return old_vl; + return old_vl < VLMAX ? old_vl : VLMAX; } else { if (vt.vsew > 3) { //check if max-len supported return (uint64_t)-1; //return 0 means error, including vl_req is 0, for vl_req should not be 0. 
@@ -102,7 +102,7 @@ int get_idx(uint64_t reg, int idx, uint64_t vsew) { void isa_misalign_vreg_check(uint64_t reg, uint64_t vlmul, int needAlign) { if (needAlign && vlmul < 4) { if (reg % (1 << vlmul) != 0) { - Log("vector register group misaligned happen: reg:x%lu vlmul:0x%lx needAlign:%d", reg, vlmul, needAlign); + Loge("vector register group misaligned happen: reg:x%lu vlmul:0x%lx needAlign:%d", reg, vlmul, needAlign); longjmp_exception(EX_II); } } diff --git a/src/isa/riscv64/system/priv.c b/src/isa/riscv64/system/priv.c index bc40c512d..b753eabcd 100644 --- a/src/isa/riscv64/system/priv.c +++ b/src/isa/riscv64/system/priv.c @@ -674,6 +674,7 @@ static inline void csr_write(word_t *dest, word_t src) { else if (is_write(vcsr)) { *dest = src & 0b111; vxrm->val = (src >> 1) & 0b11; vxsat->val = src & 0b1; } else if (is_write(vxrm)) { *dest = src & 0b11; vcsr->val = (vxrm->val) << 1 | vxsat->val; } else if (is_write(vxsat)) { *dest = src & 0b1; vcsr->val = (vxrm->val) << 1 | vxsat->val; } + else if (is_write(vstart)) { *dest = src & (VLEN - 1); } #endif #ifdef CONFIG_MISA_UNCHANGEABLE else if (is_write(misa)) { /* do nothing */ }