dotnet · jandupej · Mar 13, 2023 · Feb 22, 2023 · Mar 7, 2023 · Mar 7, 2023
@@ -1576,8 +1576,6 @@ arm_encode_arith_imm (int imm, guint32 *shift)
 /* NEON :: across lanes */
 #define arm_neon_xln_opcode(p, q, u, size, opcode, rd, rn) arm_neon_opcode_2reg ((p), (q), 0b00001110001100000000100000000000 | (u) << 29 | (size) << 22 | (opcode) << 12, (rd), (rn))
 
-
-
 // contrary to most other opcodes, the suffix is the type of source
 #define arm_neon_saddlv_8b(p, rd, rn) arm_neon_xln_opcode ((p), VREG_LOW, 0b0, SIZE_1, 0b00011, (rd), (rn))
 #define arm_neon_saddlv_16b(p, rd, rn) arm_neon_xln_opcode ((p), VREG_FULL, 0b0, SIZE_1, 0b00011, (rd), (rn))
@@ -1821,6 +1819,8 @@ arm_encode_arith_imm (int imm, guint32 *shift)
 #define arm_neon_cmgt(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b0, (type), 0b00110, (rd), (rn), (rm))
 #define arm_neon_cmge(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b0, (type), 0b00111, (rd), (rn), (rm))
 #define arm_neon_cmeq(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b1, (type), 0b10001, (rd), (rn), (rm))
+#define arm_neon_cmhi(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b1, (type), 0b00110, (rd), (rn), (rm)) // same opcode field as arm_neon_cmgt, third argument 0b1 instead of 0b0
+#define arm_neon_cmhs(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b1, (type), 0b00111, (rd), (rn), (rm)) // same opcode field as arm_neon_cmge, third argument 0b1 instead of 0b0
 
 // Generalized macros for float ops:
 //   width - determines if full register or its lower half is used one of {VREG_LOW, VREG_FULL}
@@ -2303,7 +2303,10 @@ arm_encode_arith_imm (int imm, guint32 *shift)
 
 
 /* NEON :: modified immediate */
-// TODO
+#define arm_neon_mimm_opcode(p, q, op, cmode, o2, imm, rd) arm_neon_opcode_1reg ((p), (q), 0b00001111000000000000010000000000 | (op) << 29 | (cmode) << 12 | (o2) << 11 | ((imm) & 0b11100000) << 11 | ((imm) & 0b11111) << 5, (rd))
+// arm_neon_mimm_opcode splits imm: bits 7-5 go to instruction bits 18-16, bits 4-0 go to bits 9-5.
+#define ARM_IMM_FONE (0b01110000)
+#define arm_neon_fmov_imm(p, width, type, rd, imm) arm_neon_mimm_opcode ((p), (width), (type), 0b1111, 0b0, (imm), (rd))
 
 /* NEON :: shift by immediate */
 #define arm_neon_shimm_opcode(p, q, u, immh, immb, opcode, rd, rn) arm_neon_opcode_2reg ((p), (q), 0b00001111000000000000010000000000 | (u) << 29 | (immh) << 19 | (immb) << 16 | (opcode) << 11, (rd), (rn))

@@ -440,6 +440,8 @@ main (int argc, char *argv [])
 	arm_neon_cmgt (code, VREG_FULL, TYPE_I8, ARMREG_R0, ARMREG_R1, ARMREG_R2);
 	arm_neon_cmge (code, VREG_FULL, TYPE_I8, ARMREG_R0, ARMREG_R1, ARMREG_R2);
 	arm_neon_cmeq (code, VREG_FULL, TYPE_I8, ARMREG_R0, ARMREG_R1, ARMREG_R2);
+	arm_neon_cmhi (code, VREG_FULL, TYPE_I8, ARMREG_R0, ARMREG_R1, ARMREG_R2);
+	arm_neon_cmhs (code, VREG_FULL, TYPE_I8, ARMREG_R0, ARMREG_R1, ARMREG_R2);
 
 	// neon float 3-reg same type
 	arm_neon_fadd (code, VREG_FULL, TYPE_F32, ARMREG_R0, ARMREG_R1, ARMREG_R2);

@@ -499,6 +499,10 @@ xbinop: dest:x src1:x src2:x len:4
 xzero: dest:x len:4
 xmove: dest:x src1:x len:4
 xconst: dest:x len:10
+xcompare: dest:x src1:x src2:x len:4
+xcompare_fp: dest:x src1:x src2:x len:4
+negate: dest:x src1:x len:4
+ones_complement: dest:x src1:x len:4
 
 generic_class_init: src1:a len:44 clob:c
 gc_safe_point: src1:i len:12 clob:c

@@ -30,6 +30,39 @@
 
 #include "interp/interp.h"
 
+// Helper macros consumed by the SIMD_OP table that is pulled in by including simd-arm64.h
+#define EXPAND(x) x
+#define PARENTHESIZE(...) (__VA_ARGS__)
+#define EXPAND_FUN(m, ...) EXPAND(m PARENTHESIZE(__VA_ARGS__)) // expands to m (args...)
+#define OPFMT_WTDSS _w, _t, dreg, sreg1, sreg2 // operand list: width, type, dest, source 1, source 2
+#define OPFMT_WTDSS_REV _w, _t, dreg, sreg2, sreg1 // as OPFMT_WTDSS with the two sources swapped
+#define _UNDEF(...) g_assert_not_reached ()
+#define SIMD_OP_CODE(reg_w, op, c) (((guint32)(reg_w) << 31) | (op) << 16 | (c)) // unsigned shift: a width of 1 must not be shifted into the sign bit of an int
+#define VREG_64 VREG_LOW
+#define VREG_128 VREG_FULL
+#define OPCODE_BASIC 0
+#define OPCODE_SIMD 1
+
+// Expands to a single statement that invokes the emitter matching the element type.
+// Reads _f (float flag) and _t (element type), which must be in scope where it is used.
+#define SIMD_OP_INTERNAL(code, reg_w, fmt, i8fun, i16fun, i32fun, i64fun, f32fun, f64fun) \
+	do { \
+		if (_f && _t == TYPE_F32) { \
+			EXPAND_FUN (f32fun, code, OPFMT_##fmt); \
+		} else if (_f) { \
+			EXPAND_FUN (f64fun, code, OPFMT_##fmt); \
+		} else if (_t == TYPE_I8) { \
+			EXPAND_FUN (i8fun, code, OPFMT_##fmt); \
+		} else if (_t == TYPE_I16) { \
+			EXPAND_FUN (i16fun, code, OPFMT_##fmt); \
+		} else if (_t == TYPE_I32) { \
+			EXPAND_FUN (i32fun, code, OPFMT_##fmt); \
+		} else { \
+			EXPAND_FUN (i64fun, code, OPFMT_##fmt); \
+		} \
+	} while (0)
+
+
 // Several of the 32-bit bit shifts must remain 32-bit, since they assume possible wrap around.
 // result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)
 MONO_DISABLE_WARNING(4334)
@@ -62,6 +95,8 @@ static gpointer bp_trampoline;
 static gboolean ios_abi;
 static gboolean enable_ptrauth;
 
+static char opcode_simd_status[OP_LAST - OP_START]; // indexed by opcode - OP_START; each entry is OPCODE_BASIC or OPCODE_SIMD
+
 #if defined(HOST_WIN32)
 #define WARN_UNUSED_RESULT _Check_return_
 #else
@@ -271,6 +306,13 @@ mono_arch_init (void)
 		bp_trampoline = mini_get_breakpoint_trampoline ();
 
 	mono_arm_gsharedvt_init ();
+
+#ifndef DISABLE_JIT
+	memset(opcode_simd_status, OPCODE_BASIC, OP_LAST - OP_START);
+	#undef SIMD_OP 
+	#define SIMD_OP(reg_w, op, c, fmt, i8fun, i16fun, i32fun, i64fun, f32fun, f64fun) opcode_simd_status[(op) - OP_START] = OPCODE_SIMD;
+	#include "simd-arm64.h"
+#endif
 }
 
 void
@@ -3271,9 +3313,16 @@ emit_branch_island (MonoCompile *cfg, guint8 *code, int start_offset)
 	return code;
 }
 
+static gboolean
+is_type_float_macro (MonoTypeEnum type)
+{
+	return !(type != MONO_TYPE_R4 && type != MONO_TYPE_R8); /* TRUE only for MONO_TYPE_R4 and MONO_TYPE_R8 */
+}
+
 static int
 get_vector_size_macro (MonoInst *ins)
 {
+	g_assert (ins->klass);
 	int size = mono_class_value_size (ins->klass, NULL);
 	switch (size) {
 	case 16:
@@ -3352,6 +3401,28 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
 		sreg2 = ins->sreg2;
 		imm = ins->inst_imm;
 
+		if (opcode_simd_status [ins->opcode - OP_START] == OPCODE_SIMD)
+		{
+			const int _t = get_type_size_macro (ins->inst_c1);
+    	const gboolean _f = is_type_float_macro (ins->inst_c1);
+    	const int _w = get_vector_size_macro (ins);
+
+			#undef SIMD_OP
+			#define SIMD_OP(reg_w, op, c, fmt, i8fun, i16fun, i32fun, i64fun, f32fun, f64fun) \
+				case SIMD_OP_CODE (VREG_##reg_w, (op), (c)): { \
+					SIMD_OP_INTERNAL (code, reg_w, fmt, i8fun, i16fun, i32fun, i64fun, f32fun, f64fun); \
+					} break;
+
+			switch (SIMD_OP_CODE (_w, ins->opcode, ins->inst_c0)) {
+				#include "simd-arm64.h"
+			default:
+				g_assert_not_reached();
+				break;
+			}
+
+			goto after_instruction_emit;
+		}
+
 		switch (ins->opcode) {
 		case OP_ICONST:
 			code = emit_imm (code, dreg, ins->inst_c0);
@@ -3500,9 +3571,6 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
 		case OP_LOADX_MEMBASE:
 			code = emit_ldrfpq (code, dreg, sreg1, ins->inst_offset);
 			break;
-		case OP_XZERO:
-			arm_neon_eor_16b (code, dreg, dreg, dreg);
-			break;
 		case OP_XMOVE:
 			arm_neon_mov (code, dreg, sreg1);
 			break;
@@ -3589,18 +3657,22 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
 			mono_add_patch_info_rel (cfg, offset, MONO_PATCH_INFO_BB, ins->inst_true_bb, MONO_R_ARM64_CBZ);
 			arm_cbnzx (code, sreg1, 0);
 			break;
-		case OP_XBINOP:
-			switch (ins->inst_c0) {
-			case OP_IADD:
-				arm_neon_add (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
-				break;
-			case OP_FADD:
-				arm_neon_fadd (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
-				break;
-			default:
-				g_assert_not_reached ();
+
+			/* SIMD that is not table-generated */
+		case OP_ONES_COMPLEMENT:
+			arm_neon_not (code, get_vector_size_macro (ins), dreg, sreg1);
+			break;
+		case OP_NEGATION:
+			if (is_type_float_macro (ins->inst_c1)) {
+				arm_neon_fneg (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1);
+			} else {
+				arm_neon_neg (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1);
 			}
 			break;
+		case OP_XZERO:
+			arm_neon_eor_16b (code, dreg, dreg, dreg);
+			break;
+
 			/* ALU */
 		case OP_IADD:
 			arm_addw (code, dreg, sreg1, sreg2);
@@ -4831,15 +4903,18 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
 				if ((MONO_ARCH_CALLEE_SAVED_REGS & (1 << i)) || i == ARMREG_SP || i == ARMREG_FP)
 					arm_strx (code, i, ins->sreg1, MONO_STRUCT_OFFSET (MonoContext, regs) + i * sizeof (target_mgreg_t));
 			break;
+
 		default:
 			g_warning ("unknown opcode %s in %s()\n", mono_inst_name (ins->opcode), __FUNCTION__);
 			g_assert_not_reached ();
 		}
-
+
+	after_instruction_emit:
 		if ((cfg->opt & MONO_OPT_BRANCH) && ((code - cfg->native_code - offset) > max_len)) {
 			g_warning ("wrong maximal instruction length of instruction %s (expected %d, got %d)",
 				   mono_inst_name (ins->opcode), max_len, code - cfg->native_code - offset);
 			g_assert_not_reached ();
+
 		}
 	}
 	set_code_cursor (cfg, code);

@@ -0,0 +1,53 @@
+/* Remarks:
+ * - This table is used to drive code generation on operations that are defined by the tuple (ins->opcode, ins->inst_c0).
+ * - Operand config specifies the order of operands that are to be supplied to the function or macro. These are
+ *   variations of: W (width of the vector register), T (element type), D (dest reg number), S (source reg number),
+ *   I (immediate value). If _REV is specified, the order of source registers is reversed. Note that not all
+ *   options are supported. To specify more options, add the respective macros to the files that include this
+ *   (e.g. mini-arm64.c).
+ * - To specify that a particular operation is not supported for some data type, use  _UNDEF.
+ */
+
+/* 64-bit vectors */
+/*        Width   Opcode          Function              Operand config      I8                I16               I32               I64               F32               F64         */
+/*--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------*/
+SIMD_OP  (64,  OP_XCOMPARE,    CMP_EQ,               WTDSS,              arm_neon_cmeq,    arm_neon_cmeq,    arm_neon_cmeq,    _UNDEF,           _UNDEF,           _UNDEF)
+SIMD_OP  (64,  OP_XCOMPARE,    CMP_GT,               WTDSS,              arm_neon_cmgt,    arm_neon_cmgt,    arm_neon_cmgt,    _UNDEF,           _UNDEF,           _UNDEF)
+SIMD_OP  (64,  OP_XCOMPARE,    CMP_GT_UN,            WTDSS,              arm_neon_cmhi,    arm_neon_cmhi,    arm_neon_cmhi,    _UNDEF,           _UNDEF,           _UNDEF)
+SIMD_OP  (64,  OP_XCOMPARE,    CMP_GE,               WTDSS,              arm_neon_cmge,    arm_neon_cmge,    arm_neon_cmge,    _UNDEF,           _UNDEF,           _UNDEF)
+SIMD_OP  (64,  OP_XCOMPARE,    CMP_GE_UN,            WTDSS,              arm_neon_cmhs,    arm_neon_cmhs,    arm_neon_cmhs,    _UNDEF,           _UNDEF,           _UNDEF)
+SIMD_OP  (64,  OP_XCOMPARE,    CMP_LT,               WTDSS_REV,          arm_neon_cmgt,    arm_neon_cmgt,    arm_neon_cmgt,    _UNDEF,           _UNDEF,           _UNDEF)
+SIMD_OP  (64,  OP_XCOMPARE,    CMP_LT_UN,            WTDSS_REV,          arm_neon_cmhi,    arm_neon_cmhi,    arm_neon_cmhi,    _UNDEF,           _UNDEF,           _UNDEF)
+SIMD_OP  (64,  OP_XCOMPARE,    CMP_LE,               WTDSS_REV,          arm_neon_cmge,    arm_neon_cmge,    arm_neon_cmge,    _UNDEF,           _UNDEF,           _UNDEF)
+SIMD_OP  (64,  OP_XCOMPARE,    CMP_LE_UN,            WTDSS_REV,          arm_neon_cmhs,    arm_neon_cmhs,    arm_neon_cmhs,    _UNDEF,           _UNDEF,           _UNDEF)
+
+SIMD_OP  (64,  OP_XCOMPARE_FP, CMP_EQ,               WTDSS,               _UNDEF,          _UNDEF,           _UNDEF,           _UNDEF,           arm_neon_fcmeq,   _UNDEF)
+SIMD_OP  (64,  OP_XCOMPARE_FP, CMP_GT,               WTDSS,               _UNDEF,          _UNDEF,           _UNDEF,           _UNDEF,           arm_neon_fcmgt,   _UNDEF)
+SIMD_OP  (64,  OP_XCOMPARE_FP, CMP_GE,               WTDSS,               _UNDEF,          _UNDEF,           _UNDEF,           _UNDEF,           arm_neon_fcmge,   _UNDEF)
+SIMD_OP  (64,  OP_XCOMPARE_FP, CMP_LT,               WTDSS_REV,           _UNDEF,          _UNDEF,           _UNDEF,           _UNDEF,           arm_neon_fcmgt,   _UNDEF)
+SIMD_OP  (64,  OP_XCOMPARE_FP, CMP_LE,               WTDSS_REV,           _UNDEF,          _UNDEF,           _UNDEF,           _UNDEF,           arm_neon_fcmge,   _UNDEF)
+
+SIMD_OP  (64,  OP_XBINOP,      OP_IADD,              WTDSS,              arm_neon_add,     arm_neon_add,     arm_neon_add,     _UNDEF,           _UNDEF,           _UNDEF)  
+SIMD_OP  (64,  OP_XBINOP,      OP_FADD,              WTDSS,              _UNDEF,           _UNDEF,           _UNDEF,           _UNDEF,           arm_neon_fadd,    _UNDEF)
+
+/* 128-bit vectors */
+/*         Width  Opcode          Function              Operand config      I8                I16               I32               I64               F32               F64         */
+/*--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------*/
+SIMD_OP  (128, OP_XCOMPARE,    CMP_EQ,               WTDSS,              arm_neon_cmeq,    arm_neon_cmeq,    arm_neon_cmeq,    arm_neon_cmeq,    _UNDEF,           _UNDEF)
+SIMD_OP  (128, OP_XCOMPARE,    CMP_GT,               WTDSS,              arm_neon_cmgt,    arm_neon_cmgt,    arm_neon_cmgt,    arm_neon_cmgt,    _UNDEF,           _UNDEF)
+SIMD_OP  (128, OP_XCOMPARE,    CMP_GT_UN,            WTDSS,              arm_neon_cmhi,    arm_neon_cmhi,    arm_neon_cmhi,    arm_neon_cmhi,    _UNDEF,           _UNDEF)
+SIMD_OP  (128, OP_XCOMPARE,    CMP_GE,               WTDSS,              arm_neon_cmge,    arm_neon_cmge,    arm_neon_cmge,    arm_neon_cmge,    _UNDEF,           _UNDEF)
+SIMD_OP  (128, OP_XCOMPARE,    CMP_GE_UN,            WTDSS,              arm_neon_cmhs,    arm_neon_cmhs,    arm_neon_cmhs,    arm_neon_cmhs,    _UNDEF,           _UNDEF)
+SIMD_OP  (128, OP_XCOMPARE,    CMP_LT,               WTDSS_REV,          arm_neon_cmgt,    arm_neon_cmgt,    arm_neon_cmgt,    arm_neon_cmgt,    _UNDEF,           _UNDEF)
+SIMD_OP  (128, OP_XCOMPARE,    CMP_LT_UN,            WTDSS_REV,          arm_neon_cmhi,    arm_neon_cmhi,    arm_neon_cmhi,    arm_neon_cmhi,    _UNDEF,           _UNDEF)
+SIMD_OP  (128, OP_XCOMPARE,    CMP_LE,               WTDSS_REV,          arm_neon_cmge,    arm_neon_cmge,    arm_neon_cmge,    arm_neon_cmge,    _UNDEF,           _UNDEF)
+SIMD_OP  (128, OP_XCOMPARE,    CMP_LE_UN,            WTDSS_REV,          arm_neon_cmhs,    arm_neon_cmhs,    arm_neon_cmhs,    arm_neon_cmhs,    _UNDEF,           _UNDEF)
+
+SIMD_OP  (128, OP_XCOMPARE_FP, CMP_EQ,               WTDSS,              _UNDEF,           _UNDEF,           _UNDEF,           _UNDEF,           arm_neon_fcmeq,   arm_neon_fcmeq)
+SIMD_OP  (128, OP_XCOMPARE_FP, CMP_GT,               WTDSS,              _UNDEF,           _UNDEF,           _UNDEF,           _UNDEF,           arm_neon_fcmgt,   arm_neon_fcmgt)
+SIMD_OP  (128, OP_XCOMPARE_FP, CMP_GE,               WTDSS,              _UNDEF,           _UNDEF,           _UNDEF,           _UNDEF,           arm_neon_fcmge,   arm_neon_fcmge)
+SIMD_OP  (128, OP_XCOMPARE_FP, CMP_LT,               WTDSS_REV,          _UNDEF,           _UNDEF,           _UNDEF,           _UNDEF,           arm_neon_fcmgt,   arm_neon_fcmgt)
+SIMD_OP  (128, OP_XCOMPARE_FP, CMP_LE,               WTDSS_REV,          _UNDEF,           _UNDEF,           _UNDEF,           _UNDEF,           arm_neon_fcmge,   arm_neon_fcmge)
+
+SIMD_OP  (128, OP_XBINOP,      OP_IADD,              WTDSS,              arm_neon_add,     arm_neon_add,     arm_neon_add,    arm_neon_add,      _UNDEF,           _UNDEF)  
+SIMD_OP  (128, OP_XBINOP,      OP_FADD,              WTDSS,              _UNDEF,           _UNDEF,           _UNDEF,          _UNDEF,            arm_neon_fadd,    arm_neon_fadd)
@@ -1190,10 +1190,10 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi
 		return NULL;
 #endif
 // FIXME: This limitation could be removed once everything here are supported by mini JIT on arm64
-#ifdef TARGET_ARM64
-	if (!(cfg->compile_aot && cfg->full_aot && !cfg->interp))
-		return NULL;
-#endif
+//#ifdef TARGET_ARM64
+//	if (!(cfg->compile_aot && cfg->full_aot && !cfg->interp))
+//		return NULL;
+//#endif
 
 	int id = lookup_intrins (sri_vector_methods, sizeof (sri_vector_methods), cmethod);
 	if (id == -1) {
@@ -1207,8 +1207,19 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi
 // FIXME: This limitation could be removed once everything here are supported by mini JIT on arm64
 #ifdef TARGET_ARM64
 	if (!COMPILE_LLVM (cfg)) {
-		if (id != SN_Add)
+		switch (id) {
+		case SN_Add:
+		case SN_Equals:
+		case SN_GreaterThan:
+		case SN_GreaterThanOrEqual:
+		case SN_LessThan:
+		case SN_LessThanOrEqual:
+		case SN_Negate:
+		case SN_OnesComplement:
+			break;
+		default: 
 			return NULL;
+		}
 		MonoClass *arg0_class = mono_class_from_mono_type_internal (fsig->params [0]);
 		int class_size = mono_class_value_size (arg0_class, NULL);
 		if (class_size != 16)