Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[mono][jit] Adding more arm64 SIMD operations, SIMD codegen with instruction table. #83094

Merged
merged 7 commits into from
Mar 13, 2023
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions src/mono/mono/arch/arm64/arm64-codegen.h
Original file line number Diff line number Diff line change
Expand Up @@ -1576,8 +1576,6 @@ arm_encode_arith_imm (int imm, guint32 *shift)
/* NEON :: across lanes */
#define arm_neon_xln_opcode(p, q, u, size, opcode, rd, rn) arm_neon_opcode_2reg ((p), (q), 0b00001110001100000000100000000000 | (u) << 29 | (size) << 22 | (opcode) << 12, (rd), (rn))



// contrary to most other opcodes, the suffix is the type of source
#define arm_neon_saddlv_8b(p, rd, rn) arm_neon_xln_opcode ((p), VREG_LOW, 0b0, SIZE_1, 0b00011, (rd), (rn))
#define arm_neon_saddlv_16b(p, rd, rn) arm_neon_xln_opcode ((p), VREG_FULL, 0b0, SIZE_1, 0b00011, (rd), (rn))
Expand Down Expand Up @@ -1821,6 +1819,8 @@ arm_encode_arith_imm (int imm, guint32 *shift)
#define arm_neon_cmgt(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b0, (type), 0b00110, (rd), (rn), (rm))
#define arm_neon_cmge(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b0, (type), 0b00111, (rd), (rn), (rm))
#define arm_neon_cmeq(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b1, (type), 0b10001, (rd), (rn), (rm))
// cmhi/cmhs take the same fields as cmgt/cmge except that the third argument is 0b1 instead of 0b0.
// simd-arm64.h uses them for the unsigned comparisons CMP_GT_UN / CMP_GE_UN (and, with the two
// source registers swapped, for CMP_LT_UN / CMP_LE_UN).
#define arm_neon_cmhi(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b1, (type), 0b00110, (rd), (rn), (rm))
#define arm_neon_cmhs(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b1, (type), 0b00111, (rd), (rn), (rm))

// Generalized macros for float ops:
// width - determines if full register or its lower half is used one of {VREG_LOW, VREG_FULL}
Expand Down Expand Up @@ -2303,7 +2303,10 @@ arm_encode_arith_imm (int imm, guint32 *shift)


/* NEON :: modified immediate */
// TODO
// Builds the instruction word for the "modified immediate" group and hands it to arm_neon_opcode_1reg:
//   op    -> shifted to bit 29
//   cmode -> shifted to bit 12
//   o2    -> shifted to bit 11
//   imm   -> 8-bit immediate, split in two: bits 7:5 end up in bits 18:16, bits 4:0 in bits 9:5
// imm is parenthesized before masking so that an expression argument (e.g. `a | b`) is masked as a whole.
#define arm_neon_mimm_opcode(p, q, op, cmode, o2, imm, rd) arm_neon_opcode_1reg ((p), (q), 0b00001111000000000000010000000000 | (op) << 29 | (cmode) << 12 | (o2) << 11 | ((imm) & 0b11100000) << 11 | ((imm) & 0b11111) << 5, (rd))

// 8-bit immediate intended to be passed as `imm` to arm_neon_fmov_imm.
// NOTE(review): presumably the encoded form of floating point 1.0 — confirm against the Arm ARM.
#define ARM_IMM_FONE (0b01110000)
// Thin wrapper over arm_neon_mimm_opcode: `width` is forwarded as q and `type` as op,
// while cmode is fixed to 0b1111 and o2 to 0b0.
#define arm_neon_fmov_imm(p, width, type, rd, imm) arm_neon_mimm_opcode ((p), (width), (type), 0b1111, 0b0, (imm), (rd))

/* NEON :: shift by immediate */
#define arm_neon_shimm_opcode(p, q, u, immh, immb, opcode, rd, rn) arm_neon_opcode_2reg ((p), (q), 0b00001111000000000000010000000000 | (u) << 29 | (immh) << 19 | (immb) << 16 | (opcode) << 11, (rd), (rn))
Expand Down
2 changes: 2 additions & 0 deletions src/mono/mono/arch/arm64/codegen-test.c
Original file line number Diff line number Diff line change
Expand Up @@ -440,6 +440,8 @@ main (int argc, char *argv [])
arm_neon_cmgt (code, VREG_FULL, TYPE_I8, ARMREG_R0, ARMREG_R1, ARMREG_R2);
arm_neon_cmge (code, VREG_FULL, TYPE_I8, ARMREG_R0, ARMREG_R1, ARMREG_R2);
arm_neon_cmeq (code, VREG_FULL, TYPE_I8, ARMREG_R0, ARMREG_R1, ARMREG_R2);
arm_neon_cmhi (code, VREG_FULL, TYPE_I8, ARMREG_R0, ARMREG_R1, ARMREG_R2);
arm_neon_cmhs (code, VREG_FULL, TYPE_I8, ARMREG_R0, ARMREG_R1, ARMREG_R2);

// neon float 3-reg same type
arm_neon_fadd (code, VREG_FULL, TYPE_F32, ARMREG_R0, ARMREG_R1, ARMREG_R2);
Expand Down
4 changes: 4 additions & 0 deletions src/mono/mono/mini/cpu-arm64.mdesc
Original file line number Diff line number Diff line change
Expand Up @@ -499,6 +499,10 @@ xbinop: dest:x src1:x src2:x len:4
xzero: dest:x len:4
xmove: dest:x src1:x len:4
xconst: dest:x len:10
xcompare: dest:x src1:x src2:x len:4
xcompare_fp: dest:x src1:x src2:x len:4
negate: dest:x src1:x len:4
ones_complement: dest:x src1:x len:4

generic_class_init: src1:a len:44 clob:c
gc_safe_point: src1:i len:12 clob:c
Expand Down
103 changes: 89 additions & 14 deletions src/mono/mono/mini/mini-arm64.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,39 @@

#include "interp/interp.h"

// The following defines are here to support the inclusion of simd-arm64.h
#define EXPAND(x) x
vargaz marked this conversation as resolved.
Show resolved Hide resolved
// Wraps its arguments in parentheses so they can follow a macro name as an argument list.
#define PARENTHESIZE(...) (__VA_ARGS__)
// Invokes the macro/function `m` with the remaining arguments. The argument list is built first
// (so that OPFMT_* lists are expanded into separate arguments) and EXPAND rescans `m (args)` as a call.
#define EXPAND_FUN(m, ...) EXPAND(m PARENTHESIZE(__VA_ARGS__))
// Operand lists selected by the "Operand config" column of simd-arm64.h. They name the locals
// _w, _t, dreg, sreg1 and sreg2, which must be in scope where the table is expanded.
// The _REV variant passes the two source registers in swapped order.
#define OPFMT_WTDSS _w, _t, dreg, sreg1, sreg2
#define OPFMT_WTDSS_REV _w, _t, dreg, sreg2, sreg1
// Placeholder for table cells of unsupported element types: ignores its arguments and asserts.
#define _UNDEF(...) g_assert_not_reached ()
// Packs (register width, opcode, sub-operation) into a single integer that serves both as the
// switch value and as the case labels generated from simd-arm64.h:
//   reg_w -> shifted to bit 31, op -> shifted to bit 16, c -> low bits.
// reg_w is parenthesized so that an expression argument is shifted as a whole.
// NOTE(review): if reg_w is a signed int equal to 1 the shift reaches the sign bit — confirm the
// type/values of VREG_LOW / VREG_FULL.
#define SIMD_OP_CODE(reg_w, op, c) (((reg_w) << 31) | (op) << 16 | (c))
// Aliases that let the width column of simd-arm64.h (64 / 128) be token-pasted as VREG_##reg_w.
#define VREG_64 VREG_LOW
#define VREG_128 VREG_FULL
// Values stored in opcode_simd_status: OPCODE_SIMD marks opcodes that appear in simd-arm64.h,
// OPCODE_BASIC is the default for every other opcode.
#define OPCODE_BASIC 0
#define OPCODE_SIMD 1

/*
 * SIMD_OP_INTERNAL:
 *
 *   Emits one table-driven SIMD operation by dispatching on the element type to the
 * matching emitter of a simd-arm64.h row.
 *
 * Locals expected at the expansion site:
 *   _f  - non-zero when the element type is floating point
 *   _t  - element type constant (TYPE_I8, TYPE_I16, TYPE_I32, TYPE_F32, ...)
 *   plus every name the selected OPFMT_<fmt> list expands to.
 *
 * Parameters:
 *   code           - code cursor passed as first argument to the emitter
 *   reg_w          - width column of the row; not used by the expansion itself
 *   fmt            - operand config, pasted onto OPFMT_ to obtain the argument list
 *   i8fun..f64fun  - emitter per element type; _UNDEF where the type is unsupported
 *
 * A float type other than TYPE_F32 selects f64fun, and an integer type other than
 * TYPE_I8/TYPE_I16/TYPE_I32 selects i64fun.
 *
 * The body is wrapped in do { } while (0) so that the expansion followed by ';' is exactly
 * one statement and can be used safely inside an unbraced if/else.
 */
#define SIMD_OP_INTERNAL(code, reg_w, fmt, i8fun, i16fun, i32fun, i64fun, f32fun, f64fun) \
	do { \
		if (_f) { \
			if (_t == TYPE_F32) { \
				EXPAND_FUN (f32fun, code, OPFMT_##fmt); \
			} else { \
				EXPAND_FUN (f64fun, code, OPFMT_##fmt); \
			} \
		} else { \
			if (_t == TYPE_I8) { \
				EXPAND_FUN (i8fun, code, OPFMT_##fmt); \
			} else if (_t == TYPE_I16) { \
				EXPAND_FUN (i16fun, code, OPFMT_##fmt); \
			} else if (_t == TYPE_I32) { \
				EXPAND_FUN (i32fun, code, OPFMT_##fmt); \
			} else { \
				EXPAND_FUN (i64fun, code, OPFMT_##fmt); \
			} \
		} \
	} while (0)


// Several of the 32-bit bit shifts must remain 32-bit, since they assume possible wrap around.
// result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)
MONO_DISABLE_WARNING(4334)
Expand Down Expand Up @@ -62,6 +95,8 @@ static gpointer bp_trampoline;
static gboolean ios_abi;
static gboolean enable_ptrauth;

/*
 * One entry per opcode, indexed by (opcode - OP_START). mono_arch_init sets every entry to
 * OPCODE_BASIC and then marks each opcode listed in simd-arm64.h as OPCODE_SIMD (only when
 * DISABLE_JIT is not defined).
 */
static char opcode_simd_status[OP_LAST - OP_START];

#if defined(HOST_WIN32)
#define WARN_UNUSED_RESULT _Check_return_
#else
Expand Down Expand Up @@ -271,6 +306,13 @@ mono_arch_init (void)
bp_trampoline = mini_get_breakpoint_trampoline ();

mono_arm_gsharedvt_init ();

#ifndef DISABLE_JIT
memset(opcode_simd_status, OPCODE_BASIC, OP_LAST - OP_START);
jandupej marked this conversation as resolved.
Show resolved Hide resolved
#undef SIMD_OP
#define SIMD_OP(reg_w, op, c, fmt, i8fun, i16fun, i32fun, i64fun, f32fun, f64fun) opcode_simd_status[(op) - OP_START] = OPCODE_SIMD;
#include "simd-arm64.h"
#endif
}

void
Expand Down Expand Up @@ -3271,9 +3313,16 @@ emit_branch_island (MonoCompile *cfg, guint8 *code, int start_offset)
return code;
}

/*
 * is_type_float_macro:
 *
 *   Returns non-zero when TYPE is one of the two floating point element types
 * (MONO_TYPE_R4 or MONO_TYPE_R8) and zero for every other type.
 */
static gboolean
is_type_float_macro (MonoTypeEnum type)
{
	const gboolean is_single = (type == MONO_TYPE_R4);
	const gboolean is_double = (type == MONO_TYPE_R8);

	return is_single || is_double;
}

static int
get_vector_size_macro (MonoInst *ins)
{
g_assert (ins->klass);
int size = mono_class_value_size (ins->klass, NULL);
switch (size) {
case 16:
Expand Down Expand Up @@ -3352,6 +3401,28 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
sreg2 = ins->sreg2;
imm = ins->inst_imm;

if (opcode_simd_status [ins->opcode - OP_START] == OPCODE_SIMD)
{
jandupej marked this conversation as resolved.
Show resolved Hide resolved
const int _t = get_type_size_macro (ins->inst_c1);
const gboolean _f = is_type_float_macro (ins->inst_c1);
const int _w = get_vector_size_macro (ins);
jandupej marked this conversation as resolved.
Show resolved Hide resolved

#undef SIMD_OP
#define SIMD_OP(reg_w, op, c, fmt, i8fun, i16fun, i32fun, i64fun, f32fun, f64fun) \
case SIMD_OP_CODE (VREG_##reg_w, (op), (c)): { \
SIMD_OP_INTERNAL (code, reg_w, fmt, i8fun, i16fun, i32fun, i64fun, f32fun, f64fun); \
} break;

switch (SIMD_OP_CODE (_w, ins->opcode, ins->inst_c0)) {
#include "simd-arm64.h"
default:
g_assert_not_reached();
break;
}

goto after_instruction_emit;
}

switch (ins->opcode) {
case OP_ICONST:
code = emit_imm (code, dreg, ins->inst_c0);
Expand Down Expand Up @@ -3500,9 +3571,6 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
case OP_LOADX_MEMBASE:
code = emit_ldrfpq (code, dreg, sreg1, ins->inst_offset);
break;
case OP_XZERO:
arm_neon_eor_16b (code, dreg, dreg, dreg);
break;
case OP_XMOVE:
arm_neon_mov (code, dreg, sreg1);
break;
Expand Down Expand Up @@ -3589,18 +3657,22 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
mono_add_patch_info_rel (cfg, offset, MONO_PATCH_INFO_BB, ins->inst_true_bb, MONO_R_ARM64_CBZ);
arm_cbnzx (code, sreg1, 0);
break;
case OP_XBINOP:
switch (ins->inst_c0) {
case OP_IADD:
arm_neon_add (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
break;
case OP_FADD:
arm_neon_fadd (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
break;
default:
g_assert_not_reached ();

/* SIMD that is not table-generated */
case OP_ONES_COMPLEMENT:
arm_neon_not (code, get_vector_size_macro (ins), dreg, sreg1);
break;
case OP_NEGATION:
if (is_type_float_macro (ins->inst_c1)) {
arm_neon_fneg (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1);
} else {
arm_neon_neg (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1);
}
break;
case OP_XZERO:
arm_neon_eor_16b (code, dreg, dreg, dreg);
break;

/* ALU */
case OP_IADD:
arm_addw (code, dreg, sreg1, sreg2);
Expand Down Expand Up @@ -4831,15 +4903,18 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
if ((MONO_ARCH_CALLEE_SAVED_REGS & (1 << i)) || i == ARMREG_SP || i == ARMREG_FP)
arm_strx (code, i, ins->sreg1, MONO_STRUCT_OFFSET (MonoContext, regs) + i * sizeof (target_mgreg_t));
break;

default:
g_warning ("unknown opcode %s in %s()\n", mono_inst_name (ins->opcode), __FUNCTION__);
g_assert_not_reached ();
}


after_instruction_emit:
if ((cfg->opt & MONO_OPT_BRANCH) && ((code - cfg->native_code - offset) > max_len)) {
g_warning ("wrong maximal instruction length of instruction %s (expected %d, got %d)",
mono_inst_name (ins->opcode), max_len, code - cfg->native_code - offset);
g_assert_not_reached ();

}
}
set_code_cursor (cfg, code);
Expand Down
53 changes: 53 additions & 0 deletions src/mono/mono/mini/simd-arm64.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/* Remarks:
* - This table is used to drive code generation on operations that are defined by the tuple (ins->opcode, ins->inst_c0).
* - Operand config specifies the order of operands that are to be supplied to the function or macro. These are
* variations of: W (width of the vector register), T (element type), D (dest reg number), S (source reg number),
* I (immediate value). If _REV is specified, the order of source registers is reversed. Note that not all
* options are supported. To specify more options, add the respective macros to the files that include this
* (e.g. mini-arm64.c).
* - To specify that a particular operation is not supported for some data type, use _UNDEF.
*/

/* 64-bit vectors */
/* Width Opcode Function Operand config I8 I16 I32 I64 F32 F64 */
jandupej marked this conversation as resolved.
Show resolved Hide resolved
/*--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------*/
SIMD_OP (64, OP_XCOMPARE, CMP_EQ, WTDSS, arm_neon_cmeq, arm_neon_cmeq, arm_neon_cmeq, _UNDEF, _UNDEF, _UNDEF)
SIMD_OP (64, OP_XCOMPARE, CMP_GT, WTDSS, arm_neon_cmgt, arm_neon_cmgt, arm_neon_cmgt, _UNDEF, _UNDEF, _UNDEF)
SIMD_OP (64, OP_XCOMPARE, CMP_GT_UN, WTDSS, arm_neon_cmhi, arm_neon_cmhi, arm_neon_cmhi, _UNDEF, _UNDEF, _UNDEF)
SIMD_OP (64, OP_XCOMPARE, CMP_GE, WTDSS, arm_neon_cmge, arm_neon_cmge, arm_neon_cmge, _UNDEF, _UNDEF, _UNDEF)
SIMD_OP (64, OP_XCOMPARE, CMP_GE_UN, WTDSS, arm_neon_cmhs, arm_neon_cmhs, arm_neon_cmhs, _UNDEF, _UNDEF, _UNDEF)
SIMD_OP (64, OP_XCOMPARE, CMP_LT, WTDSS_REV, arm_neon_cmgt, arm_neon_cmgt, arm_neon_cmgt, _UNDEF, _UNDEF, _UNDEF)
SIMD_OP (64, OP_XCOMPARE, CMP_LT_UN, WTDSS_REV, arm_neon_cmhi, arm_neon_cmhi, arm_neon_cmhi, _UNDEF, _UNDEF, _UNDEF)
SIMD_OP (64, OP_XCOMPARE, CMP_LE, WTDSS_REV, arm_neon_cmge, arm_neon_cmge, arm_neon_cmge, _UNDEF, _UNDEF, _UNDEF)
SIMD_OP (64, OP_XCOMPARE, CMP_LE_UN, WTDSS_REV, arm_neon_cmhs, arm_neon_cmhs, arm_neon_cmhs, _UNDEF, _UNDEF, _UNDEF)

SIMD_OP (64, OP_XCOMPARE_FP, CMP_EQ, WTDSS, _UNDEF, _UNDEF, _UNDEF, _UNDEF, arm_neon_fcmeq, _UNDEF)
SIMD_OP (64, OP_XCOMPARE_FP, CMP_GT, WTDSS, _UNDEF, _UNDEF, _UNDEF, _UNDEF, arm_neon_fcmgt, _UNDEF)
SIMD_OP (64, OP_XCOMPARE_FP, CMP_GE, WTDSS, _UNDEF, _UNDEF, _UNDEF, _UNDEF, arm_neon_fcmge, _UNDEF)
SIMD_OP (64, OP_XCOMPARE_FP, CMP_LT, WTDSS_REV, _UNDEF, _UNDEF, _UNDEF, _UNDEF, arm_neon_fcmgt, _UNDEF)
SIMD_OP (64, OP_XCOMPARE_FP, CMP_LE, WTDSS_REV, _UNDEF, _UNDEF, _UNDEF, _UNDEF, arm_neon_fcmge, _UNDEF)

SIMD_OP (64, OP_XBINOP, OP_IADD, WTDSS, arm_neon_add, arm_neon_add, arm_neon_add, _UNDEF, _UNDEF, _UNDEF)
SIMD_OP (64, OP_XBINOP, OP_FADD, WTDSS, _UNDEF, _UNDEF, _UNDEF, _UNDEF, arm_neon_fadd, _UNDEF)

/* 128-bit vectors */
/* Width Opcode Function Operand config I8 I16 I32 I64 F32 F64 */
/*--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------*/
SIMD_OP (128, OP_XCOMPARE, CMP_EQ, WTDSS, arm_neon_cmeq, arm_neon_cmeq, arm_neon_cmeq, arm_neon_cmeq, _UNDEF, _UNDEF)
SIMD_OP (128, OP_XCOMPARE, CMP_GT, WTDSS, arm_neon_cmgt, arm_neon_cmgt, arm_neon_cmgt, arm_neon_cmgt, _UNDEF, _UNDEF)
SIMD_OP (128, OP_XCOMPARE, CMP_GT_UN, WTDSS, arm_neon_cmhi, arm_neon_cmhi, arm_neon_cmhi, arm_neon_cmhi, _UNDEF, _UNDEF)
SIMD_OP (128, OP_XCOMPARE, CMP_GE, WTDSS, arm_neon_cmge, arm_neon_cmge, arm_neon_cmge, arm_neon_cmge, _UNDEF, _UNDEF)
SIMD_OP (128, OP_XCOMPARE, CMP_GE_UN, WTDSS, arm_neon_cmhs, arm_neon_cmhs, arm_neon_cmhs, arm_neon_cmhs, _UNDEF, _UNDEF)
SIMD_OP (128, OP_XCOMPARE, CMP_LT, WTDSS_REV, arm_neon_cmgt, arm_neon_cmgt, arm_neon_cmgt, arm_neon_cmgt, _UNDEF, _UNDEF)
SIMD_OP (128, OP_XCOMPARE, CMP_LT_UN, WTDSS_REV, arm_neon_cmhi, arm_neon_cmhi, arm_neon_cmhi, arm_neon_cmhi, _UNDEF, _UNDEF)
SIMD_OP (128, OP_XCOMPARE, CMP_LE, WTDSS_REV, arm_neon_cmge, arm_neon_cmge, arm_neon_cmge, arm_neon_cmge, _UNDEF, _UNDEF)
SIMD_OP (128, OP_XCOMPARE, CMP_LE_UN, WTDSS_REV, arm_neon_cmhs, arm_neon_cmhs, arm_neon_cmhs, arm_neon_cmhs, _UNDEF, _UNDEF)

SIMD_OP (128, OP_XCOMPARE_FP, CMP_EQ, WTDSS, _UNDEF, _UNDEF, _UNDEF, _UNDEF, arm_neon_fcmeq, arm_neon_fcmeq)
SIMD_OP (128, OP_XCOMPARE_FP, CMP_GT, WTDSS, _UNDEF, _UNDEF, _UNDEF, _UNDEF, arm_neon_fcmgt, arm_neon_fcmgt)
SIMD_OP (128, OP_XCOMPARE_FP, CMP_GE, WTDSS, _UNDEF, _UNDEF, _UNDEF, _UNDEF, arm_neon_fcmge, arm_neon_fcmge)
SIMD_OP (128, OP_XCOMPARE_FP, CMP_LT, WTDSS_REV, _UNDEF, _UNDEF, _UNDEF, _UNDEF, arm_neon_fcmgt, arm_neon_fcmgt)
SIMD_OP (128, OP_XCOMPARE_FP, CMP_LE, WTDSS_REV, _UNDEF, _UNDEF, _UNDEF, _UNDEF, arm_neon_fcmge, arm_neon_fcmge)

SIMD_OP (128, OP_XBINOP, OP_IADD, WTDSS, arm_neon_add, arm_neon_add, arm_neon_add, arm_neon_add, _UNDEF, _UNDEF)
SIMD_OP (128, OP_XBINOP, OP_FADD, WTDSS, _UNDEF, _UNDEF, _UNDEF, _UNDEF, arm_neon_fadd, arm_neon_fadd)
21 changes: 16 additions & 5 deletions src/mono/mono/mini/simd-intrinsics.c
Original file line number Diff line number Diff line change
Expand Up @@ -1190,10 +1190,10 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi
return NULL;
#endif
// FIXME: This limitation could be removed once everything here is supported by the mini JIT on arm64
#ifdef TARGET_ARM64
if (!(cfg->compile_aot && cfg->full_aot && !cfg->interp))
return NULL;
#endif
//#ifdef TARGET_ARM64
// if (!(cfg->compile_aot && cfg->full_aot && !cfg->interp))
// return NULL;
//#endif

int id = lookup_intrins (sri_vector_methods, sizeof (sri_vector_methods), cmethod);
if (id == -1) {
Expand All @@ -1207,8 +1207,19 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi
// FIXME: This limitation could be removed once everything here is supported by the mini JIT on arm64
#ifdef TARGET_ARM64
if (!COMPILE_LLVM (cfg)) {
if (id != SN_Add)
switch (id) {
case SN_Add:
case SN_Equals:
case SN_GreaterThan:
case SN_GreaterThanOrEqual:
case SN_LessThan:
case SN_LessThanOrEqual:
case SN_Negate:
case SN_OnesComplement:
break;
default:
return NULL;
}
MonoClass *arg0_class = mono_class_from_mono_type_internal (fsig->params [0]);
int class_size = mono_class_value_size (arg0_class, NULL);
if (class_size != 16)
Expand Down