diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 59104383..10f08087 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -17,7 +17,7 @@ permissions:
 env:
   GCC_VERSION: "11"
   LLVM_VERSION: "17"
-  COMMON_CMAKE_FLAGS: |
+  COMMON_CMAKE_FLAGS: >
     -DSLEEF_SHOW_CONFIG=1
     -DSLEEF_BUILD_GNUABI_LIBS=ON
     -DSLEEF_BUILD_INLINE_HEADERS=ON
@@ -227,10 +227,6 @@ jobs:
           EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DSLEEF_ENFORCE_RVVM1=ON -DSLEEF_ENFORCE_RVVM2=ON"
           # Disable inline headers, they just don't compile on riscv64
           EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DSLEEF_BUILD_INLINE_HEADERS=OFF"
-          # Disable dft, it fails with linker error to `cexp`
-          EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DSLEEF_BUILD_DFT=OFF"
-          # Disable quad, it's missing the `Sleef_quad` function
-          EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DSLEEF_BUILD_QUAD=OFF"
         fi
 
         cmake -S . -B _build-${{ matrix.arch }} -GNinja \
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ecd272d8..742912b3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -85,7 +85,7 @@ set(SLEEF_SUPPORTED_GNUABI_EXTENSIONS
 )
 
 set(SLEEF_SUPPORTED_QUAD_EXTENSIONS
-  PUREC_SCALAR PURECFMA_SCALAR SSE2 AVX2128 AVX2 AVX512F ADVSIMD SVE VSX VSX3 VXE VXE2)
+  PUREC_SCALAR PURECFMA_SCALAR SSE2 AVX2128 AVX2 AVX512F ADVSIMD SVE VSX VSX3 VXE VXE2 RVVM1 RVVM2)
 
 # MKMASKED_PARAMS
diff --git a/src/arch/helperrvv.h b/src/arch/helperrvv.h
index 1cd707af..6dccf485 100644
--- a/src/arch/helperrvv.h
+++ b/src/arch/helperrvv.h
@@ -17,31 +17,43 @@
 
 #if CONFIG == 1 || CONFIG == 2
 #define ISANAME "RISC-V Vector Extension with Min. VLEN"
-#define SLEEF_RVV_VLEN __riscv_vlenb()
+#define SLEEF_RVV_VLEN 0
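+// 0 is a sentinel meaning the vector length is unknown at compile time; in
+// that case VECTLENSP/VECTLENDP below fall back to __riscv_vsetvlmax_*()
+// queries at run time.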
#elif CONFIG == 7
 // 128-bit vector length
 #define ISANAME "RISC-V Vector Extension 128-bit"
+#define LOG2VECTLENDP 1
 #define SLEEF_RVV_VLEN ((1 << 7) / 8)
+#define DFTPRIORITY 19
 #elif CONFIG == 8
 // 256-bit vector length
 #define ISANAME "RISC-V Vector Extension 256-bit"
+#define LOG2VECTLENDP 2
 #define SLEEF_RVV_VLEN ((1 << 8) / 8)
+#define DFTPRIORITY 20
 #elif CONFIG == 9
 // 512-bit vector length
 #define ISANAME "RISC-V Vector Extension 512-bit"
+#define LOG2VECTLENDP 3
 #define SLEEF_RVV_VLEN ((1 << 9) / 8)
+#define DFTPRIORITY 21
 #elif CONFIG == 10
 // 1024-bit vector length
 #define ISANAME "RISC-V Vector Extension 1024-bit"
+#define LOG2VECTLENDP 4
 #define SLEEF_RVV_VLEN ((1 << 10) / 8)
+#define DFTPRIORITY 22
 #elif CONFIG == 11
 // 2048-bit vector length
 #define ISANAME "RISC-V Vector Extension 2048-bit"
+#define LOG2VECTLENDP 5
 #define SLEEF_RVV_VLEN ((1 << 11) / 8)
+#define DFTPRIORITY 23
 #else
 #error CONFIG macro invalid or not defined
 #endif
 
+#define LOG2VECTLENSP (LOG2VECTLENDP+1)
+
 #define ENABLE_SP
 #define ENABLE_DP
@@ -89,8 +101,6 @@ static INLINE vfloat64m2x4_t __riscv_vcreate_v_f64m2x4(vfloat64m2_t x, vfloat64m
 #define SLEEF_RVV_VEXT(size, from_to, v) __riscv_vmv_v(__riscv_vlmul_ext_v_##from_to(v), __riscv_vsetvlmax_##size())
 #endif
 
-static INLINE int vavailability_i(int name) { return -1; }
-
 ////////////////////////////////////////////////////////////////////////////////
 // RISC-V Vector Types
 ////////////////////////////////////////////////////////////////////////////////
@@ -111,6 +121,8 @@ typedef vbool32_t rvv_sp_vopmask;
 typedef vbool64_t rvv_dp_vopmask;
 
 typedef vint32mf2_t vint;
+typedef vint64m1_t vint64;
+typedef vuint64m1_t vuint64;
 typedef vfloat64m1_t vdouble;
 typedef vfloat64m2_t vdouble2;
 typedef vfloat64m4_t vdouble3;
@@ -124,10 +136,21 @@ typedef vfloat32m4_t df2;
 typedef vint32m1_t vint2;
 typedef vint32m2_t fi_t;
 typedef vint32m4_t dfi_t;
+typedef vuint64m1_t rvv_dp_vuint2;
+
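+// tdx/tdi_t carry a quad-precision value as a tuple of four f64 vectors:
+// field 0 holds the exponent e (accessed as a vmask), fields 1-3 the x/y/z
+// triple-double, matching the tdxget*/tdxset* accessors further down.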
+typedef vfloat64m1x4_t tdx;
+typedef vfloat64m1x4_t tdi_t;
+
 #define SLEEF_RVV_SP_LMUL 1
 #define SLEEF_RVV_DP_LMUL 1
+#define SLEEF_RVV_DP_RUNTIME_VL() __riscv_vsetvlmax_e64m1()
+#if SLEEF_RVV_VLEN == 0
+#define VECTLENSP (__riscv_vsetvlmax_e32m1())
+#define VECTLENDP SLEEF_RVV_DP_RUNTIME_VL()
+#else
 #define VECTLENSP (SLEEF_RVV_SP_LMUL * SLEEF_RVV_VLEN / sizeof(float))
 #define VECTLENDP (SLEEF_RVV_DP_LMUL * SLEEF_RVV_VLEN / sizeof(double))
+#endif
 #define SLEEF_RVV_SP_VCAST_VF_F __riscv_vfmv_v_f_f32m1
 #define SLEEF_RVV_SP_VCAST_VI2_I __riscv_vmv_v_x_i32m1
 #define SLEEF_RVV_SP_VREINTERPRET_VF __riscv_vreinterpret_f32m1
@@ -137,6 +160,8 @@ typedef vint32m4_t dfi_t;
 #define SLEEF_RVV_SP_VREINTERPRET_2VI __riscv_vreinterpret_i32m2
 #define SLEEF_RVV_SP_VREINTERPRET_VU __riscv_vreinterpret_u32m1
 #define SLEEF_RVV_SP_VREINTERPRET_VU2 __riscv_vreinterpret_u32m1
+#define SLEEF_RVV_SP_VREINTERPRET_VOM __riscv_vreinterpret_b32
+#define SLEEF_RVV_SP_VID() __riscv_vid_v_u32m1(VECTLENSP)
 #define SLEEF_RVV_SP_VGET_VI2 __riscv_vget_i32m1
 #define SLEEF_RVV_SP_VGET_2VI __riscv_vget_i32m2
 #define SLEEF_RVV_SP_VGET_VF __riscv_vget_f32m1
@@ -147,6 +172,7 @@ typedef vint32m4_t dfi_t;
 #define SLEEF_RVV_SP_VCREATE_VF2 __riscv_vcreate_v_f32m1_f32m2
 #define SLEEF_RVV_SP_VLMUL_EXT_VI2_TO_FI(v) SLEEF_RVV_VEXT(e32m1, i32m1_i32m2, v)
 #define SLEEF_RVV_SP_LOAD_VF __riscv_vle32_v_f32m1
+#define SLEEF_RVV_SP_LOAD_2VI __riscv_vle32_v_i32m2
 #define SLEEF_RVV_SP_VFNCVT_X_F_VI __riscv_vfcvt_x_f_v_i32m1_rm
 #define SLEEF_RVV_SP_VFCVT_F_X_VF __riscv_vfcvt_f_x_v_f32m1
 #define SLEEF_RVV_SP_VFCVT_X_F_VF_RM __riscv_vfcvt_x_f_v_i32m1_rm
@@ -196,6 +222,8 @@ typedef vbool16_t rvv_sp_vopmask;
 typedef vbool32_t rvv_dp_vopmask;
 
 typedef vint32m1_t vint;
+typedef vint64m2_t vint64;
+typedef vuint64m2_t vuint64;
 typedef vfloat64m2_t vdouble;
 typedef vfloat64m4_t vdouble2;
 typedef vfloat64m8_t vdouble3;
@@ -209,10 +237,21 @@ typedef vfloat32m8_t df2;
 typedef vint32m2_t vint2;
 typedef vint32m4_t fi_t;
 typedef vint32m8_t dfi_t;
+typedef vuint64m2_t rvv_dp_vuint2;
+
+typedef vfloat64m2x4_t tdx;
+typedef vfloat64m2x4_t tdi_t;
+
 #define SLEEF_RVV_SP_LMUL 2
 #define SLEEF_RVV_DP_LMUL 2
+#define SLEEF_RVV_DP_RUNTIME_VL() __riscv_vsetvlmax_e64m2()
+#if SLEEF_RVV_VLEN == 0
+#define VECTLENSP (__riscv_vsetvlmax_e32m2())
+#define VECTLENDP SLEEF_RVV_DP_RUNTIME_VL()
+#else
 #define VECTLENSP (SLEEF_RVV_SP_LMUL * SLEEF_RVV_VLEN / sizeof(float))
 #define VECTLENDP (SLEEF_RVV_DP_LMUL * SLEEF_RVV_VLEN / sizeof(double))
+#endif
 #define SLEEF_RVV_SP_VCAST_VF_F __riscv_vfmv_v_f_f32m2
 #define SLEEF_RVV_SP_VCAST_VI2_I __riscv_vmv_v_x_i32m2
 #define SLEEF_RVV_SP_VREINTERPRET_VF __riscv_vreinterpret_f32m2
@@ -222,6 +261,8 @@ typedef vint32m8_t dfi_t;
 #define SLEEF_RVV_SP_VREINTERPRET_2VI __riscv_vreinterpret_i32m4
 #define SLEEF_RVV_SP_VREINTERPRET_VU __riscv_vreinterpret_u32m2
 #define SLEEF_RVV_SP_VREINTERPRET_VU2 __riscv_vreinterpret_u32m2
+#define SLEEF_RVV_SP_VREINTERPRET_VOM __riscv_vreinterpret_b16
+#define SLEEF_RVV_SP_VID() __riscv_vid_v_u32m2(VECTLENSP)
 #define SLEEF_RVV_SP_VGET_VI2 __riscv_vget_i32m2
 #define SLEEF_RVV_SP_VGET_2VI __riscv_vget_i32m4
 #define SLEEF_RVV_SP_VGET_VF __riscv_vget_f32m2
@@ -232,6 +273,7 @@ typedef vint32m8_t dfi_t;
 #define SLEEF_RVV_SP_VCREATE_VF2 __riscv_vcreate_v_f32m2_f32m4
 #define SLEEF_RVV_SP_VLMUL_EXT_VI2_TO_FI(v) SLEEF_RVV_VEXT(e32m2, i32m2_i32m4, v)
 #define SLEEF_RVV_SP_LOAD_VF __riscv_vle32_v_f32m2
+#define SLEEF_RVV_SP_LOAD_2VI __riscv_vle32_v_i32m4
 #define SLEEF_RVV_SP_VFNCVT_X_F_VI __riscv_vfcvt_x_f_v_i32m2_rm
 #define SLEEF_RVV_SP_VFCVT_F_X_VF __riscv_vfcvt_f_x_v_f32m2
 #define SLEEF_RVV_SP_VFCVT_X_F_VF_RM __riscv_vfcvt_x_f_v_i32m2_rm
@@ -250,6 +292,9 @@ typedef vint32m8_t dfi_t;
 #define SLEEF_RVV_DP_VREINTERPRET_4VI(v) __riscv_vreinterpret_i32m4(__riscv_vreinterpret_i64m4(v))
 #define SLEEF_RVV_DP_VREINTERPRET_VU __riscv_vreinterpret_u32m1
 #define SLEEF_RVV_DP_VREINTERPRET_4VU __riscv_vreinterpret_u32m4
+#define SLEEF_RVV_DP_VREINTERPRET_VQ __riscv_vreinterpret_u64m4
+#define SLEEF_RVV_DP_VREINTERPRET_VOM __riscv_vreinterpret_b32
+#define SLEEF_RVV_DP_VID() __riscv_vid_v_u64m2(VECTLENDP)
 #define SLEEF_RVV_DP_VGET_VM __riscv_vget_u64m2
 #define SLEEF_RVV_DP_VGET_VD __riscv_vget_f64m2
 #define SLEEF_RVV_DP_VGET_VD2 __riscv_vget_f64m4
@@ -274,6 +319,15 @@ typedef vint32m8_t dfi_t;
 #error "unknown rvv lmul"
 #endif // ENABLE_RVVM1
 
+typedef vquad vargquad;
+
+static INLINE int vavailability_i(int name) {
+  // Note that VECTLENDP may be defined to SLEEF_RVV_DP_RUNTIME_VL(). That
+  // case isn't entirely redundant because it's still an opportunity to raise
+  // SIGILL to be captured by the caller if the vector extension isn't supported.
+  return (SLEEF_RVV_DP_RUNTIME_VL() >= VECTLENDP) ? 3 : 0;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // Single-Precision Functions
 ////////////////////////////////////////////////////////////////////////////////
@@ -757,13 +811,18 @@ static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) {
 static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
   return __riscv_vfmadd(x, y, z, VECTLENDP);
 }
-// Multiply subtract: z = z - x * y
+// Multiply subtract: z = x * y - z
 static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
   return __riscv_vfmsub(x, y, z, VECTLENDP);
 }
+// z = z - x * y
+static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
+  return __riscv_vfnmsac(z, x, y, VECTLENDP);
+}
 #else
 static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
 static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
+static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(z, vmul_vd_vd_vd(x, y)); }
 #endif
 // fused multiply add / sub
 static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
@@ -998,6 +1057,238 @@ static INLINE vint vand_vi_vo_vi(rvv_dp_vopmask x, vint y) {
   return __riscv_vmerge(y, 0, __riscv_vmnot(x, VECTLENDP), VECTLENDP);
 }
 
+/****************************************/
+/* DFT Operations */
+/****************************************/
+
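+// vposneg negates the odd-indexed lanes (+a0, -a1, +a2, ...) and vnegpos the
+// even-indexed ones. Reinterpreting a splatted byte as a mask makes each bit
+// one lane's mask bit, so 0x55 (0b01010101) selects lanes 0,2,4,... and 0xaa
+// lanes 1,3,5,...; vmerge then keeps the original value in the selected lanes.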
+static INLINE vdouble vposneg_vd_vd(vdouble d) {
+  rvv_dp_vopmask mask = SLEEF_RVV_DP_VREINTERPRET_VOM(__riscv_vmv_v_x_u8m1(0x55, __riscv_vsetvlmax_e8m1()));
+  vdouble nd = __riscv_vfneg(d, VECTLENDP);
+  return __riscv_vmerge(nd, d, mask, VECTLENDP);
+}
+
+static INLINE vdouble vnegpos_vd_vd(vdouble d) {
+  rvv_dp_vopmask mask = SLEEF_RVV_DP_VREINTERPRET_VOM(__riscv_vmv_v_x_u8m1(0xaa, __riscv_vsetvlmax_e8m1()));
+  vdouble nd = __riscv_vfneg(d, VECTLENDP);
+  return __riscv_vmerge(nd, d, mask, VECTLENDP);
+}
+
+static INLINE vfloat vposneg_vf_vf(vfloat d) {
+  rvv_sp_vopmask mask = SLEEF_RVV_SP_VREINTERPRET_VOM(__riscv_vmv_v_x_u8m1(0x55, __riscv_vsetvlmax_e8m1()));
+  vfloat nd = __riscv_vfneg(d, VECTLENSP);
+  return __riscv_vmerge(nd, d, mask, VECTLENSP);
+}
+
+static INLINE vfloat vnegpos_vf_vf(vfloat d) {
+  rvv_sp_vopmask mask = SLEEF_RVV_SP_VREINTERPRET_VOM(__riscv_vmv_v_x_u8m1(0xaa, __riscv_vsetvlmax_e8m1()));
+  vfloat nd = __riscv_vfneg(d, VECTLENSP);
+  return __riscv_vmerge(nd, d, mask, VECTLENSP);
+}
+
+static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vadd_vd_vd_vd(x, vnegpos_vd_vd(y)); }
+static INLINE vfloat vsubadd_vf_vf_vf(vfloat d0, vfloat d1) { return vadd_vf_vf_vf(d0, vnegpos_vf_vf(d1)); }
+static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vfma_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z)); }
+static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vfma_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z)); }
+
+//
+
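+// vrev21 swaps adjacent lane pairs (a1,a0,a3,a2,...): XOR-ing the lane index
+// with 1 flips its low bit, and vrgather pulls from the permuted indices.
+// vreva2 keeps each pair intact but reverses the order of the pairs; since
+// the vector length is a power of two, XOR with (length - 2) flips every
+// index bit above bit 0.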
+static INLINE vdouble vrev21_vd_vd(vdouble vd) {
+  rvv_dp_vuint2 id = SLEEF_RVV_DP_VID();
+  id = __riscv_vxor(id, 1, VECTLENDP);
+  return __riscv_vrgather(vd, id, VECTLENDP);
+}
+
+static INLINE vfloat vrev21_vf_vf(vfloat vf) {
+  vint2 id = SLEEF_RVV_SP_VREINTERPRET_VI2(SLEEF_RVV_SP_VID());
+  id = __riscv_vxor(id, 1, VECTLENSP);
+  return __riscv_vrgather(vf, SLEEF_RVV_SP_VREINTERPRET_VU2(id), VECTLENSP);
+}
+
+static INLINE vdouble vreva2_vd_vd(vdouble vd) {
+  rvv_dp_vuint2 id = SLEEF_RVV_DP_VID();
+  id = __riscv_vxor(id, VECTLENDP - 2, VECTLENDP);
+  return __riscv_vrgather(vd, id, VECTLENDP);
+}
+
+static INLINE vfloat vreva2_vf_vf(vfloat vf) {
+  vint2 id = SLEEF_RVV_SP_VREINTERPRET_VI2(SLEEF_RVV_SP_VID());
+  id = __riscv_vxor(id, VECTLENSP - 2, VECTLENSP);
+  return __riscv_vrgather(vf, SLEEF_RVV_SP_VREINTERPRET_VU2(id), VECTLENSP);
+}
+
+//
+
+static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
+  ptr += offset * 2;
+  for (int i = 0; i < VECTLENDP; i += 2) {
+    vdouble vv = __riscv_vslidedown(v, i, 2);
+    __riscv_vse64(ptr, vv, 2);
+    ptr += step * 2;
+  }
+}
+
+static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
+  ptr += offset * 2;
+  for (int i = 0; i < VECTLENSP; i += 2) {
+    vfloat vv = __riscv_vslidedown(v, i, 2);
+    __riscv_vse32(ptr, vv, 2);
+    ptr += step * 2;
+  }
+}
+
+
+/****************************************/
+/* Quad Operations */
+/****************************************/
+
+
+static INLINE vmask tdxgete_vm_tdx(tdx t) {
+  return SLEEF_RVV_DP_VREINTERPRET_VM(SLEEF_RVV_DP_VGET_VD(t, 0));
+}
+static INLINE vdouble tdxgetd3x_vd_tdx(tdx t) {
+  return SLEEF_RVV_DP_VGET_VD(t, 1);
+}
+static INLINE vdouble tdxgetd3y_vd_tdx(tdx t) {
+  return SLEEF_RVV_DP_VGET_VD(t, 2);
+}
+static INLINE vdouble tdxgetd3z_vd_tdx(tdx t) {
+  return SLEEF_RVV_DP_VGET_VD(t, 3);
+}
+static INLINE tdx tdxsete_tdx_tdx_vm(tdx t, vmask e) {
+  return __riscv_vset(t, 0, SLEEF_RVV_DP_VREINTERPRET_VD(e));
+}
+static INLINE tdx tdxsetx_tdx_tdx_vd(tdx t, vdouble x) {
+  return __riscv_vset(t, 1, x);
+}
+static INLINE tdx tdxsety_tdx_tdx_vd(tdx t, vdouble y) {
+  return __riscv_vset(t, 2, y);
+}
+static INLINE tdx tdxsetz_tdx_tdx_vd(tdx t, vdouble z) {
+  return __riscv_vset(t, 3, z);
+}
+
+static INLINE vdouble3 tdxgetd3_vd3_tdx(tdx t) {
+  return vd3setxyz_vd3_vd_vd_vd(tdxgetd3x_vd_tdx(t), tdxgetd3y_vd_tdx(t), tdxgetd3z_vd_tdx(t));
+}
+
+static INLINE tdx tdxsetxyz_tdx_tdx_vd_vd_vd(tdx t, vdouble x, vdouble y, vdouble z) {
+  t = tdxsetx_tdx_tdx_vd(t, x);
+  t = tdxsety_tdx_tdx_vd(t, y);
+  t = tdxsetz_tdx_tdx_vd(t, z);
+  return t;
+}
+static INLINE tdx tdxsetd3_tdx_tdx_vd3(tdx t, vdouble3 d3) {
+  return tdxsetxyz_tdx_tdx_vd_vd_vd(t, vd3getx_vd_vd3(d3), vd3gety_vd_vd3(d3), vd3getz_vd_vd3(d3));
+}
+
+static INLINE tdx tdxseted3_tdx_vm_vd3(vmask e, vdouble3 d3) {
+  return SLEEF_RVV_DP_VCREATE_TD(SLEEF_RVV_DP_VREINTERPRET_VD(e),
+                                 vd3getx_vd_vd3(d3), vd3gety_vd_vd3(d3), vd3getz_vd_vd3(d3));
+}
+static INLINE tdx tdxsetexyz_tdx_vm_vd_vd_vd(vmask e, vdouble x, vdouble y, vdouble z) {
+  return SLEEF_RVV_DP_VCREATE_TD(SLEEF_RVV_DP_VREINTERPRET_VD(e), x, y, z);
+}
+
+static INLINE vdouble tdigetx_vd_tdi(tdi_t d) {
+  return SLEEF_RVV_DP_VGET_VD(d, 0);
+}
+static INLINE vdouble tdigety_vd_tdi(tdi_t d) {
+  return SLEEF_RVV_DP_VGET_VD(d, 1);
+}
+static INLINE vdouble tdigetz_vd_tdi(tdi_t d) {
+  return SLEEF_RVV_DP_VGET_VD(d, 2);
+}
+
+static INLINE vint tdigeti_vi_tdi(tdi_t d) {
+  vdouble vd = SLEEF_RVV_DP_VGET_VD(d, 3);
+  vint2 vi2 = SLEEF_RVV_DP_VREINTERPRET_VI2(vd);
+  vint vi = SLEEF_RVV_DP_VLMUL_TRUNC_VI2_TO_VI(vi2);
+  return vi;
+}
+static INLINE tdi_t tdisetx_tdi_tdi_vd(tdi_t t, vdouble x) {
+  return __riscv_vset(t, 0, x);
+}
+static INLINE tdi_t tdisety_tdi_tdi_vd(tdi_t t, vdouble y) {
+  return __riscv_vset(t, 1, y);
+}
+static INLINE tdi_t tdisetz_tdi_tdi_vd(tdi_t t, vdouble z) {
+  return __riscv_vset(t, 2, z);
+}
+static INLINE tdi_t tdiseti_tdi_tdi_vi(tdi_t t, vint i) {
+  vint2 vi2 = SLEEF_RVV_DP_VLMUL_EXT_VI_TO_VI2(i);
+  vdouble vd = SLEEF_RVV_DP_VREINTERPRET_VD(vi2);
+  return __riscv_vset(t, 3, vd);
+}
+
+static INLINE vdouble3 tdigettd_vd3_tdi(tdi_t d) {
+  return vd3setxyz_vd3_vd_vd_vd(tdigetx_vd_tdi(d), tdigety_vd_tdi(d), tdigetz_vd_tdi(d));
+}
+static INLINE tdi_t tdisettd_tdi_tdi_vd3(tdi_t tdi, vdouble3 v) {
+  tdi = tdisetx_tdi_tdi_vd(tdi, vd3getx_vd_vd3(v));
+  tdi = tdisety_tdi_tdi_vd(tdi, vd3gety_vd_vd3(v));
+  tdi = tdisetz_tdi_tdi_vd(tdi, vd3getz_vd_vd3(v));
+  return tdi;
+}
+static INLINE tdi_t tdisettdi_tdi_vd3_vi(vdouble3 v, vint i) {
+  tdi_t ret = SLEEF_RVV_DP_VCREATE_TD(vd3getx_vd_vd3(v), vd3gety_vd_vd3(v), vd3getz_vd_vd3(v), vd3getz_vd_vd3(v));
+  return tdiseti_tdi_tdi_vi(ret, i);
+}
+
+
+static INLINE rvv_dp_vopmask vcast_vo_i(int i) {
+  return SLEEF_RVV_DP_VREINTERPRET_VOM(__riscv_vmv_v_x_u32m1(i, VECTLENSP));
+}
+static INLINE vmask vreinterpret_vm_vi64(vint64 v) {
+  return SLEEF_RVV_DP_VREINTERPRET_VM(v);
+}
+static INLINE vint64 vreinterpret_vi64_vm(vmask m) {
+  return SLEEF_RVV_DP_VREINTERPRET_VI64(m);
+}
+static INLINE vmask vreinterpret_vm_vu64(vuint64 v) {
+  return v;
+}
+static INLINE vuint64 vreinterpret_vu64_vm(vmask m) {
+  return m;
+}
+static INLINE int vtestallzeros_i_vo64(rvv_dp_vopmask g) {
+  return __riscv_vcpop(g, VECTLENDP) == 0;
+}
+
+
+static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { vstore_v_p_vd(ptr, v); }
+static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { vstore_v_p_vf(ptr, v); }
+static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vscatter2_v_p_i_i_vd(ptr, offset, step, v); }
+static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }
+
+// These functions are for debugging
+static double vcast_d_vd(vdouble v) {
+  return __riscv_vfmv_f(v);
+}
+
+static float vcast_f_vf(vfloat v) {
+  return __riscv_vfmv_f(v);
+}
+
+static int vcast_i_vi(vint v) {
+  return __riscv_vmv_x(v);
+}
+
+static int vcast_i_vi2(vint2 v) {
+  return __riscv_vmv_x(v);
+}
+
+//
+
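+// A vquad packs each 128-bit quad value into two consecutive u64 lanes, so a
+// full vector is 2 * VECTLENSP 32-bit words; load them in one shot and
+// reinterpret up to the u64-based vquad representation.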
+static vquad loadu_vq_p(const int32_t *ptr) {
+  return SLEEF_RVV_DP_VREINTERPRET_VQ(SLEEF_RVV_DP_VREINTERPRET_4VU(SLEEF_RVV_SP_LOAD_2VI(ptr, VECTLENSP * 2)));
+}
+
+static INLINE vquad cast_vq_aq(vargquad aq) { return aq; }
+static INLINE vargquad cast_aq_vq(vquad vq) { return vq; }
+
+static INLINE void vprefetch_v_p(const void *ptr) {}
+
+
 /****************************************/
 /* RVV_SP and RVV_DP reconciliation */
 /****************************************/
diff --git a/src/dft/CMakeLists.txt b/src/dft/CMakeLists.txt
index 86f57c0a..f3097447 100644
--- a/src/dft/CMakeLists.txt
+++ b/src/dft/CMakeLists.txt
@@ -75,6 +75,46 @@ set(MACRODEF_sve2048dp BASETYPEID=1 ENABLE_SVE CONFIG=11)
 set(CFLAGS_sve2048dp ${FLAGS_ENABLE_SVE})
 set(MACRODEF_sve2048sp BASETYPEID=2 ENABLE_SVE CONFIG=11)
 set(CFLAGS_sve2048sp ${FLAGS_ENABLE_SVE})
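+# One DFT kernel is generated per fixed vector width: rvvm{1,2}<vlen>{dp,sp},
+# where CONFIG=7..11 selects the 128..2048-bit VLEN defined in helperrvv.h.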
+set(MACRODEF_rvvm1128dp BASETYPEID=1 ENABLE_RVVM1 CONFIG=7)
+set(CFLAGS_rvvm1128dp ${FLAGS_ENABLE_RVVM1})
+set(MACRODEF_rvvm1128sp BASETYPEID=2 ENABLE_RVVM1 CONFIG=7)
+set(CFLAGS_rvvm1128sp ${FLAGS_ENABLE_RVVM1})
+set(MACRODEF_rvvm1256dp BASETYPEID=1 ENABLE_RVVM1 CONFIG=8)
+set(CFLAGS_rvvm1256dp ${FLAGS_ENABLE_RVVM1})
+set(MACRODEF_rvvm1256sp BASETYPEID=2 ENABLE_RVVM1 CONFIG=8)
+set(CFLAGS_rvvm1256sp ${FLAGS_ENABLE_RVVM1})
+set(MACRODEF_rvvm1512dp BASETYPEID=1 ENABLE_RVVM1 CONFIG=9)
+set(CFLAGS_rvvm1512dp ${FLAGS_ENABLE_RVVM1})
+set(MACRODEF_rvvm1512sp BASETYPEID=2 ENABLE_RVVM1 CONFIG=9)
+set(CFLAGS_rvvm1512sp ${FLAGS_ENABLE_RVVM1})
+set(MACRODEF_rvvm11024dp BASETYPEID=1 ENABLE_RVVM1 CONFIG=10)
+set(CFLAGS_rvvm11024dp ${FLAGS_ENABLE_RVVM1})
+set(MACRODEF_rvvm11024sp BASETYPEID=2 ENABLE_RVVM1 CONFIG=10)
+set(CFLAGS_rvvm11024sp ${FLAGS_ENABLE_RVVM1})
+set(MACRODEF_rvvm12048dp BASETYPEID=1 ENABLE_RVVM1 CONFIG=11)
+set(CFLAGS_rvvm12048dp ${FLAGS_ENABLE_RVVM1})
+set(MACRODEF_rvvm12048sp BASETYPEID=2 ENABLE_RVVM1 CONFIG=11)
+set(CFLAGS_rvvm12048sp ${FLAGS_ENABLE_RVVM1})
+set(MACRODEF_rvvm2128dp BASETYPEID=1 ENABLE_RVVM2 CONFIG=7)
+set(CFLAGS_rvvm2128dp ${FLAGS_ENABLE_RVVM2})
+set(MACRODEF_rvvm2128sp BASETYPEID=2 ENABLE_RVVM2 CONFIG=7)
+set(CFLAGS_rvvm2128sp ${FLAGS_ENABLE_RVVM2})
+set(MACRODEF_rvvm2256dp BASETYPEID=1 ENABLE_RVVM2 CONFIG=8)
+set(CFLAGS_rvvm2256dp ${FLAGS_ENABLE_RVVM2})
+set(MACRODEF_rvvm2256sp BASETYPEID=2 ENABLE_RVVM2 CONFIG=8)
+set(CFLAGS_rvvm2256sp ${FLAGS_ENABLE_RVVM2})
+set(MACRODEF_rvvm2512dp BASETYPEID=1 ENABLE_RVVM2 CONFIG=9)
+set(CFLAGS_rvvm2512dp ${FLAGS_ENABLE_RVVM2})
+set(MACRODEF_rvvm2512sp BASETYPEID=2 ENABLE_RVVM2 CONFIG=9)
+set(CFLAGS_rvvm2512sp ${FLAGS_ENABLE_RVVM2})
+set(MACRODEF_rvvm21024dp BASETYPEID=1 ENABLE_RVVM2 CONFIG=10)
+set(CFLAGS_rvvm21024dp ${FLAGS_ENABLE_RVVM2})
+set(MACRODEF_rvvm21024sp BASETYPEID=2 ENABLE_RVVM2 CONFIG=10)
+set(CFLAGS_rvvm21024sp ${FLAGS_ENABLE_RVVM2})
+set(MACRODEF_rvvm22048dp BASETYPEID=1 ENABLE_RVVM2 CONFIG=11)
+set(CFLAGS_rvvm22048dp ${FLAGS_ENABLE_RVVM2})
+set(MACRODEF_rvvm22048sp BASETYPEID=2 ENABLE_RVVM2 CONFIG=11)
+set(CFLAGS_rvvm22048sp ${FLAGS_ENABLE_RVVM2})
 set(MACRODEF_vsxdp BASETYPEID=1 ENABLE_VSX CONFIG=1)
 set(CFLAGS_vsxdp ${FLAGS_ENABLE_VSX})
 set(MACRODEF_vsxsp BASETYPEID=2 ENABLE_VSX CONFIG=1)
@@ -139,6 +179,16 @@ if (COMPILER_SUPPORTS_NEON32)
   set(ISALIST_SP ${ISALIST_SP} neon32sp)
 endif(COMPILER_SUPPORTS_NEON32)
 
+if (COMPILER_SUPPORTS_RVVM1)
+  set(ISALIST_SP ${ISALIST_SP} rvvm1128sp rvvm1256sp rvvm1512sp rvvm11024sp rvvm12048sp)
+  set(ISALIST_DP ${ISALIST_DP} rvvm1128dp rvvm1256dp rvvm1512dp rvvm11024dp rvvm12048dp)
+endif(COMPILER_SUPPORTS_RVVM1)
+
+if (COMPILER_SUPPORTS_RVVM2)
+  set(ISALIST_SP ${ISALIST_SP} rvvm2128sp rvvm2256sp rvvm2512sp rvvm21024sp rvvm22048sp)
+  set(ISALIST_DP ${ISALIST_DP} rvvm2128dp rvvm2256dp rvvm2512dp rvvm21024dp rvvm22048dp)
+endif(COMPILER_SUPPORTS_RVVM2)
+
 if (COMPILER_SUPPORTS_VSX)
   set(ISALIST_SP ${ISALIST_SP} vsxsp)
   set(ISALIST_DP ${ISALIST_DP} vsxdp)
diff --git a/src/dft/vectortype.h b/src/dft/vectortype.h
index 98b9b8c5..7e9ac68a 100644
--- a/src/dft/vectortype.h
+++ b/src/dft/vectortype.h
@@ -37,6 +37,10 @@
 #include "helpersve.h"
 #endif
 
+#if defined(ENABLE_RVVM1) || defined(ENABLE_RVVM2)
+#include "helperrvv.h"
+#endif
+
 #ifdef ENABLE_VSX
 #include "helperpower_128.h"
 #endif
diff --git a/src/quad-tester/qiutsimd.c b/src/quad-tester/qiutsimd.c
index eaf2d8d4..74f61e1b 100644
--- a/src/quad-tester/qiutsimd.c
+++ b/src/quad-tester/qiutsimd.c
@@ -228,6 +228,29 @@
 #define VARGQUAD Sleef_quadx2
 #endif
 
+#ifdef ENABLE_RVVM1
+#include "qrenamervvm1.h"
+#if !defined(USE_INLINE_HEADER)
+#define CONFIG 1
+#define ENABLE_RVV_DP
+#include "helperrvv.h"
+#define VARGQUAD Sleef_rvvm1quad
+#endif
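+// RVV vector types are sizeless, so sizeof(VARGQUAD) is not available here;
+// derive the size from the runtime maximum vector length instead.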
"SLEEF_VECTOR_INT" "SLEEF_VECTOR_LONGLONG" "SLEEF_VECTOR_ULONGLONG" __VSX__) @@ -27,6 +29,8 @@ set(QUAD_RENAME_PARAMS_AVX2 4 avx2) set(QUAD_RENAME_PARAMS_AVX512F 8 avx512f) set(QUAD_RENAME_PARAMS_ADVSIMD 2 advsimd) set(QUAD_RENAME_PARAMS_SVE x sve) +set(QUAD_RENAME_PARAMS_RVVM1 x rvvm1) +set(QUAD_RENAME_PARAMS_RVVM2 x rvvm2) set(QUAD_RENAME_PARAMS_VSX 2 vsx) set(QUAD_RENAME_PARAMS_VSX3 2 vsx3) set(QUAD_RENAME_PARAMS_VXE 2 vxe) @@ -81,6 +85,14 @@ elseif(SLEEF_ARCH_S390X) DSPX2_S390X ) set(QMKDISP_PARAMS_X2 2 Sleef_quadx2 "SLEEF_VECTOR_DOUBLE" "SLEEF_VECTOR_INT" "SLEEF_VECTOR_LONGLONG" "SLEEF_VECTOR_ULONGLONG" vxe vxe2) +elseif(SLEEF_ARCH_RISCV64) + set(SLEEF_HEADER_LIST + RVVM1 + RVVM2 + PUREC_SCALAR + PURECFMA_SCALAR + DSPSCALAR + ) endif() # diff --git a/src/quad/sleefquad_header.h.org.in b/src/quad/sleefquad_header.h.org.in index 44109745..70ff9184 100644 --- a/src/quad/sleefquad_header.h.org.in +++ b/src/quad/sleefquad_header.h.org.in @@ -106,6 +106,20 @@ typedef struct { #endif // #if !defined(Sleef_quadx2_DEFINED) +#if !defined(Sleef_rvvm1quad_DEFINED) +#if defined(__riscv) && defined(__riscv_v) +#define Sleef_rvvm1quadDEFINED +typedef vuint64m1x2_t Sleef_rvvm1quad; +#endif +#endif + +#if !defined(Sleef_rvvm2quad_DEFINED) +#if defined(__riscv) && defined(__riscv_v) +#define Sleef_rvvm2quadDEFINED +typedef vuint64m2x2_t Sleef_rvvm2quad; +#endif +#endif + #if !defined(Sleef_quadx4_DEFINED) #if defined(__AVX__) #define Sleef_quadx4_DEFINED diff --git a/src/quad/sleefsimdqp.c b/src/quad/sleefsimdqp.c index 5a7f267d..6244ce4f 100644 --- a/src/quad/sleefsimdqp.c +++ b/src/quad/sleefsimdqp.c @@ -145,6 +145,27 @@ typedef vquad Sleef_quadx2; typedef vquad Sleef_quadx2; #endif +// RISC-V +#ifdef ENABLE_RVVM1 +#define CONFIG 1 +#define ENABLE_RVV_DP +#include "helperrvv.h" +#ifdef DORENAME +#include "qrenamervvm1.h" +#endif +typedef vquad Sleef_rvvm1quad; +#endif + +#ifdef ENABLE_RVVM2 +#define CONFIG 1 +#define ENABLE_RVV_DP +#include "helperrvv.h" +#ifdef DORENAME +#include "qrenamervvm2.h" +#endif +typedef vquad Sleef_rvvm2quad; +#endif + #include "dd.h" #include "commonfuncs.h"