Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pass HFA/HVA in registers #62623

Merged
merged 2 commits into from
Dec 15, 2021
Merged

Conversation

echesakov
Copy link
Contributor

@echesakov echesakov commented Dec 10, 2021

Consider the following code snippet

PassHaStructsInRegs.cs
using System;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;

// Repro program: for each homogeneous aggregate struct below, a Produce_* method
// returns a default-initialized value and Main immediately hands that value to the
// matching Consume overload. The structs cover 2, 3 and 4 fields of float, double,
// Vector64<int> and Vector128<int>.
namespace PassHaStructsInRegs
{
    class Program
    {
        // Aggregate of two float fields.
        struct float32x2
        {
            public float _0;
            public float _1;
        }

        // NoInlining keeps the producer as a real call so its struct return is observable.
        [MethodImpl(MethodImplOptions.NoInlining)]
        static float32x2 Produce_float32x2()
        {
            return default(float32x2);
        }

        // Empty consumer; NoInlining keeps the call (and its struct argument) from being inlined away.
        [MethodImpl(MethodImplOptions.NoInlining)]
        static void Consume(float32x2 val)
        {
        }

        // Aggregate of three float fields.
        struct float32x3
        {
            public float _0;
            public float _1;
            public float _2;
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        static float32x3 Produce_float32x3()
        {
            return default(float32x3);
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        static void Consume(float32x3 val)
        {
        }

        // Aggregate of four float fields.
        struct float32x4
        {
            public float _0;
            public float _1;
            public float _2;
            public float _3;
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        static float32x4 Produce_float32x4()
        {
            return default(float32x4);
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        static void Consume(float32x4 val)
        {
        }

        // Aggregate of two double fields.
        struct float64x2
        {
            public double _0;
            public double _1;
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        static float64x2 Produce_float64x2()
        {
            return default(float64x2);
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        static void Consume(float64x2 val)
        {
        }

        // Aggregate of three double fields.
        struct float64x3
        {
            public double _0;
            public double _1;
            public double _2;
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        static float64x3 Produce_float64x3()
        {
            return default(float64x3);
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        static void Consume(float64x3 val)
        {
        }

        // Aggregate of four double fields.
        struct float64x4
        {
            public double _0;
            public double _1;
            public double _2;
            public double _3;
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        static float64x4 Produce_float64x4()
        {
            return default(float64x4);
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        static void Consume(float64x4 val)
        {
        }

        // Aggregate of two Vector64<int> fields.
        struct int32x2x2
        {
            public Vector64<int> _0;
            public Vector64<int> _1;
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        static int32x2x2 Produce_int32x2x2()
        {
            return default(int32x2x2);
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        static void Consume(int32x2x2 val)
        {
        }

        // Aggregate of three Vector64<int> fields.
        struct int32x2x3
        {
            public Vector64<int> _0;
            public Vector64<int> _1;
            public Vector64<int> _2;
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        static int32x2x3 Produce_int32x2x3()
        {
            return default(int32x2x3);
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        static void Consume(int32x2x3 val)
        {
        }

        // Aggregate of four Vector64<int> fields.
        struct int32x2x4
        {
            public Vector64<int> _0;
            public Vector64<int> _1;
            public Vector64<int> _2;
            public Vector64<int> _3;
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        static int32x2x4 Produce_int32x2x4()
        {
            return default(int32x2x4);
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        static void Consume(int32x2x4 val)
        {
        }

        // Aggregate of two Vector128<int> fields.
        struct int32x4x2
        {
            public Vector128<int> _0;
            public Vector128<int> _1;
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        static int32x4x2 Produce_int32x4x2()
        {
            return default(int32x4x2);
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        static void Consume(int32x4x2 val)
        {
        }

        // Aggregate of three Vector128<int> fields.
        struct int32x4x3
        {
            public Vector128<int> _0;
            public Vector128<int> _1;
            public Vector128<int> _2;
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        static int32x4x3 Produce_int32x4x3()
        {
            return default(int32x4x3);
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        static void Consume(int32x4x3 val)
        {
        }

        // Aggregate of four Vector128<int> fields.
        struct int32x4x4
        {
            public Vector128<int> _0;
            public Vector128<int> _1;
            public Vector128<int> _2;
            public Vector128<int> _3;
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        static int32x4x4 Produce_int32x4x4()
        {
            return default(int32x4x4);
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        static void Consume(int32x4x4 val)
        {
        }

        // Passes each producer's return value straight into the matching Consume
        // overload, one struct shape per statement, in declaration order.
        static int Main(string[] args)
        {
            Consume(Produce_float32x2());
            Consume(Produce_float32x3());
            Consume(Produce_float32x4());
            Consume(Produce_float64x2());
            Consume(Produce_float64x3());
            Consume(Produce_float64x4());
            Consume(Produce_int32x2x2());
            Consume(Produce_int32x2x3());
            Consume(Produce_int32x2x4());
            Consume(Produce_int32x4x2());
            Consume(Produce_int32x4x3());
            Consume(Produce_int32x4x4());

            // NOTE(review): 100 is presumably the test harness's "pass" exit code - confirm.
            return 100;
        }
    }
}

Note that all HFA/HVA values are stored to memory and then reloaded into the same registers when they are passed as arguments to a function, as can be seen in the following output:

Currently
; Assembly listing for method PassHaStructsInRegs.Program:Main(System.String[]):int
; Emitting BLENDED_CODE for generic ARM64 CPU - Windows
; optimized code
; fp based frame
; partially interruptible
; No PGO data
; Final local variable assignments
;
;* V00 arg0         [V00    ] (  0,  0   )     ref  ->  zero-ref    class-hnd single-def
;# V01 OutArgs      [V01    ] (  1,  1   )  lclBlk ( 0) [sp+00H]   "OutgoingArgSpace"
;  V02 tmp1         [V02    ] (  3,  6   )  struct ( 8) [fp+158H]   HFA(float)  do-not-enreg[SFAR] multireg-arg multireg-ret "Return value temp for multireg return"
;  V03 tmp2         [V03    ] (  4,  8   )  struct (16) [fp+148H]   HFA(float)  do-not-enreg[SFAR] multireg-arg multireg-ret "Return value temp for multireg return"
;  V04 tmp3         [V04,T00] (  5, 10   )  struct (16) [fp+138H]   HFA(float)  do-not-enreg[SFAR] multireg-arg multireg-ret "Return value temp for multireg return"
;  V05 tmp4         [V05    ] (  3,  6   )  struct (16) [fp+128H]   HFA(double)  do-not-enreg[SFAR] multireg-arg multireg-ret "Return value temp for multireg return"
;  V06 tmp5         [V06    ] (  4,  8   )  struct (24) [fp+110H]   HFA(double)  do-not-enreg[SFAR] multireg-arg multireg-ret "Return value temp for multireg return"
;  V07 tmp6         [V07,T01] (  5, 10   )  struct (32) [fp+F0H]   HFA(double)  do-not-enreg[SFAR] multireg-arg multireg-ret "Return value temp for multireg return"
;  V08 tmp7         [V08    ] (  3,  6   )  struct (16) [fp+E0H]   HFA(simd8)  do-not-enreg[SFAR] multireg-arg multireg-ret "Return value temp for multireg return"
;  V09 tmp8         [V09    ] (  4,  8   )  struct (24) [fp+C8H]   HFA(simd8)  do-not-enreg[SFAR] multireg-arg multireg-ret "Return value temp for multireg return"
;  V10 tmp9         [V10,T02] (  5, 10   )  struct (32) [fp+A8H]   HFA(simd8)  do-not-enreg[SFAR] multireg-arg multireg-ret "Return value temp for multireg return"
;  V11 tmp10        [V11,T08] (  3,  6   )  struct (32) [fp+88H]   HFA(simd16)  do-not-enreg[SFAR] multireg-arg multireg-ret "Return value temp for multireg return"
;  V12 tmp11        [V12,T04] (  4,  8   )  struct (48) [fp+58H]   HFA(simd16)  do-not-enreg[SFAR] multireg-arg multireg-ret "Return value temp for multireg return"
;  V13 tmp12        [V13,T03] (  5, 10   )  struct (64) [fp+18H]   HFA(simd16)  do-not-enreg[SFAR] multireg-arg multireg-ret "Return value temp for multireg return"
;  V14 tmp13        [V14,T17] (  3,  6   )   float  ->  [fp+158H]   do-not-enreg[] V02._0(offs=0x00) P-DEP "field V02._0 (fldOffset=0x0)"
;  V15 tmp14        [V15,T18] (  3,  6   )   float  ->  [fp+15CH]   do-not-enreg[] V02._1(offs=0x04) P-DEP "field V02._1 (fldOffset=0x4)"
;  V16 tmp15        [V16,T11] (  4,  8   )   float  ->  [fp+148H]   do-not-enreg[] V03._0(offs=0x00) P-DEP "field V03._0 (fldOffset=0x0)"
;  V17 tmp16        [V17,T12] (  4,  8   )   float  ->  [fp+14CH]   do-not-enreg[] V03._1(offs=0x04) P-DEP "field V03._1 (fldOffset=0x4)"
;  V18 tmp17        [V18,T13] (  4,  8   )   float  ->  [fp+150H]   do-not-enreg[] V03._2(offs=0x08) P-DEP "field V03._2 (fldOffset=0x8)"
;  V19 tmp18        [V19,T19] (  3,  6   )  double  ->  [fp+128H]   do-not-enreg[] V05._0(offs=0x00) P-DEP "field V05._0 (fldOffset=0x0)"
;  V20 tmp19        [V20,T20] (  3,  6   )  double  ->  [fp+130H]   do-not-enreg[] V05._1(offs=0x08) P-DEP "field V05._1 (fldOffset=0x8)"
;  V21 tmp20        [V21,T14] (  4,  8   )  double  ->  [fp+110H]   do-not-enreg[] V06._0(offs=0x00) P-DEP "field V06._0 (fldOffset=0x0)"
;  V22 tmp21        [V22,T15] (  4,  8   )  double  ->  [fp+118H]   do-not-enreg[] V06._1(offs=0x08) P-DEP "field V06._1 (fldOffset=0x8)"
;  V23 tmp22        [V23,T16] (  4,  8   )  double  ->  [fp+120H]   do-not-enreg[] V06._2(offs=0x10) P-DEP "field V06._2 (fldOffset=0x10)"
;  V24 tmp23        [V24,T09] (  3,  6   )    long  ->  [fp+E0H]   do-not-enreg[] V08._0(offs=0x00) P-DEP "field V08._0 (fldOffset=0x0)"
;  V25 tmp24        [V25,T10] (  3,  6   )    long  ->  [fp+E8H]   do-not-enreg[] V08._1(offs=0x08) P-DEP "field V08._1 (fldOffset=0x8)"
;  V26 tmp25        [V26,T05] (  4,  8   )    long  ->  [fp+C8H]   do-not-enreg[] V09._0(offs=0x00) P-DEP "field V09._0 (fldOffset=0x0)"
;  V27 tmp26        [V27,T06] (  4,  8   )    long  ->  [fp+D0H]   do-not-enreg[] V09._1(offs=0x08) P-DEP "field V09._1 (fldOffset=0x8)"
;  V28 tmp27        [V28,T07] (  4,  8   )    long  ->  [fp+D8H]   do-not-enreg[] V09._2(offs=0x10) P-DEP "field V09._2 (fldOffset=0x10)"
;
; Lcl frame size = 336

G_M40249_IG01:              ;; offset=0000H
        A9AA7BFD          stp     fp, lr, [sp,#-352]!
        910003FD          mov     fp, sp
                                                ;; bbWeight=1    PerfScore 1.50
G_M40249_IG02:              ;; offset=0008H
        97FFCE6E          bl      PassHaStructsInRegs.Program:Produce_float32x2():float32x2
        BD015BA0          str     s0, [fp,#344]
        BD015FA1          str     s1, [fp,#348]
        BD415BA0          ldr     s0, [fp,#344]
        BD415FA1          ldr     s1, [fp,#348]
        97FFCE6F          bl      PassHaStructsInRegs.Program:Consume(float32x2)
        97FFCE74          bl      PassHaStructsInRegs.Program:Produce_float32x3():float32x3
        BD014BA0          str     s0, [fp,#328]
        BD014FA1          str     s1, [fp,#332]
        BD0153A2          str     s2, [fp,#336]
        BD414BA0          ldr     s0, [fp,#328]
        BD414FA1          ldr     s1, [fp,#332]
        BD4153A2          ldr     s2, [fp,#336]
        97FFCE73          bl      PassHaStructsInRegs.Program:Consume(float32x3)
        97FFCE78          bl      PassHaStructsInRegs.Program:Produce_float32x4():float32x4
        BD013BA0          str     s0, [fp,#312]
        BD013FA1          str     s1, [fp,#316]
        BD0143A2          str     s2, [fp,#320]
        BD0147A3          str     s3, [fp,#324]
        BD413BA0          ldr     s0, [fp,#312]
        BD413FA1          ldr     s1, [fp,#316]
        BD4143A2          ldr     s2, [fp,#320]
        BD4147A3          ldr     s3, [fp,#324]
        97FFCE75          bl      PassHaStructsInRegs.Program:Consume(float32x4)
        97FFCE7A          bl      PassHaStructsInRegs.Program:Produce_float64x2():float64x2
        FD0097A0          str     d0, [fp,#296]
        FD009BA1          str     d1, [fp,#304]
        FD4097A0          ldr     d0, [fp,#296]
        FD409BA1          ldr     d1, [fp,#304]
        97FFCE7B          bl      PassHaStructsInRegs.Program:Consume(float64x2)
        97FFCE80          bl      PassHaStructsInRegs.Program:Produce_float64x3():float64x3
        FD008BA0          str     d0, [fp,#272]
        FD008FA1          str     d1, [fp,#280]
        FD0093A2          str     d2, [fp,#288]
        FD408BA0          ldr     d0, [fp,#272]
        FD408FA1          ldr     d1, [fp,#280]
        FD4093A2          ldr     d2, [fp,#288]
        97FFCE7F          bl      PassHaStructsInRegs.Program:Consume(float64x3)
        97FFCE84          bl      PassHaStructsInRegs.Program:Produce_float64x4():float64x4
        FD007BA0          str     d0, [fp,#240]
        FD007FA1          str     d1, [fp,#248]
        FD0083A2          str     d2, [fp,#256]
        FD0087A3          str     d3, [fp,#264]
        FD407BA0          ldr     d0, [fp,#240]
        FD407FA1          ldr     d1, [fp,#248]
        FD4083A2          ldr     d2, [fp,#256]
        FD4087A3          ldr     d3, [fp,#264]
        97FFCE81          bl      PassHaStructsInRegs.Program:Consume(float64x4)
        97FFCE86          bl      PassHaStructsInRegs.Program:Produce_int32x2x2():int32x2x2
        FD0073A0          str     d0, [fp,#224]
        FD0077A1          str     d1, [fp,#232]
        FD4073A0          ldr     d0, [fp,#224]
        FD4077A1          ldr     d1, [fp,#232]
        97FFCE87          bl      PassHaStructsInRegs.Program:Consume(int32x2x2)
        97FFCE8C          bl      PassHaStructsInRegs.Program:Produce_int32x2x3():int32x2x3
        FD0067A0          str     d0, [fp,#200]
        FD006BA1          str     d1, [fp,#208]
        FD006FA2          str     d2, [fp,#216]
        FD4067A0          ldr     d0, [fp,#200]
        FD406BA1          ldr     d1, [fp,#208]
        FD406FA2          ldr     d2, [fp,#216]
        97FFCE8B          bl      PassHaStructsInRegs.Program:Consume(int32x2x3)
        97FFCE90          bl      PassHaStructsInRegs.Program:Produce_int32x2x4():int32x2x4
        FD0057A0          str     d0, [fp,#168]
        FD005BA1          str     d1, [fp,#176]
        FD005FA2          str     d2, [fp,#184]
        FD0063A3          str     d3, [fp,#192]
        FD4057A0          ldr     d0, [fp,#168]
        FD405BA1          ldr     d1, [fp,#176]
        FD405FA2          ldr     d2, [fp,#184]
        FD4063A3          ldr     d3, [fp,#192]
        97FFCE8D          bl      PassHaStructsInRegs.Program:Consume(int32x2x4)
        97FFCE92          bl      PassHaStructsInRegs.Program:Produce_int32x4x2():int32x4x2
        3C8883A0          str     q0, [fp,#136]
        3C8983A1          str     q1, [fp,#152]
        3CC883A0          ldr     q0, [fp,#136]
        3CC983A1          ldr     q1, [fp,#152]
        97FFCE93          bl      PassHaStructsInRegs.Program:Consume(int32x4x2)
        97FFCE98          bl      PassHaStructsInRegs.Program:Produce_int32x4x3():int32x4x3
        3C8583A0          str     q0, [fp,#88]
        3C8683A1          str     q1, [fp,#104]
        3C8783A2          str     q2, [fp,#120]
        3CC583A0          ldr     q0, [fp,#88]
        3CC683A1          ldr     q1, [fp,#104]
        3CC783A2          ldr     q2, [fp,#120]
        97FFCE97          bl      PassHaStructsInRegs.Program:Consume(int32x4x3)
        97FFCE9C          bl      PassHaStructsInRegs.Program:Produce_int32x4x4():int32x4x4
        3C8183A0          str     q0, [fp,#24]
        3C8283A1          str     q1, [fp,#40]
        3C8383A2          str     q2, [fp,#56]
        3C8483A3          str     q3, [fp,#72]
        3CC183A0          ldr     q0, [fp,#24]
        3CC283A1          ldr     q1, [fp,#40]
        3CC383A2          ldr     q2, [fp,#56]
        3CC483A3          ldr     q3, [fp,#72]
        97FFCE99          bl      PassHaStructsInRegs.Program:Consume(int32x4x4)
        52800C80          mov     w0, #100
                                                ;; bbWeight=1    PerfScore 132.50
G_M40249_IG03:              ;; offset=018CH
        A8D67BFD          ldp     fp, lr, [sp],#352
        D65F03C0          ret     lr
                                                ;; bbWeight=1    PerfScore 2.00

; Total bytes of code 404, prolog size 8, PerfScore 176.40, instruction count 101, allocated bytes for code 404 (MethodHash=a22662c6) for method PassHaStructsInRegs.Program:Main(System.String[]):int

The change adds support in fgMorphMultiregStructArg() for the case where such an HFA/HVA value is promoted, allowing it to be represented as a GT_FIELD_LIST and, effectively, be passed in registers.

After #62623
; Assembly listing for method PassHaStructsInRegs.Program:Main(System.String[]):int
; Emitting BLENDED_CODE for generic ARM64 CPU - Windows
; optimized code
; fp based frame
; partially interruptible
; No PGO data
; Final local variable assignments
;
;* V00 arg0         [V00    ] (  0,  0   )     ref  ->  zero-ref    class-hnd single-def
;# V01 OutArgs      [V01    ] (  1,  1   )  lclBlk ( 0) [sp+00H]   "OutgoingArgSpace"
;* V02 tmp1         [V02    ] (  0,  0   )  struct ( 8) zero-ref    HFA(float)  multireg-arg multireg-ret "Return value temp for multireg return"
;* V03 tmp2         [V03    ] (  0,  0   )  struct (16) zero-ref    HFA(float)  multireg-arg multireg-ret "Return value temp for multireg return"
;  V04 tmp3         [V04,T00] (  5, 10   )  struct (16) [fp+110H]   HFA(float)  do-not-enreg[SFAR] multireg-arg multireg-ret "Return value temp for multireg return"
;* V05 tmp4         [V05    ] (  0,  0   )  struct (16) zero-ref    HFA(double)  multireg-arg multireg-ret "Return value temp for multireg return"
;* V06 tmp5         [V06    ] (  0,  0   )  struct (24) zero-ref    HFA(double)  multireg-arg multireg-ret "Return value temp for multireg return"
;  V07 tmp6         [V07,T01] (  5, 10   )  struct (32) [fp+F0H]   HFA(double)  do-not-enreg[SFAR] multireg-arg multireg-ret "Return value temp for multireg return"
;  V08 tmp7         [V08    ] (  3,  6   )  struct (16) [fp+E0H]   HFA(simd8)  do-not-enreg[SFAR] multireg-arg multireg-ret "Return value temp for multireg return"
;  V09 tmp8         [V09    ] (  4,  8   )  struct (24) [fp+C8H]   HFA(simd8)  do-not-enreg[SFAR] multireg-arg multireg-ret "Return value temp for multireg return"
;  V10 tmp9         [V10,T02] (  5, 10   )  struct (32) [fp+A8H]   HFA(simd8)  do-not-enreg[SFAR] multireg-arg multireg-ret "Return value temp for multireg return"
;  V11 tmp10        [V11,T08] (  3,  6   )  struct (32) [fp+88H]   HFA(simd16)  do-not-enreg[SFAR] multireg-arg multireg-ret "Return value temp for multireg return"
;  V12 tmp11        [V12,T04] (  4,  8   )  struct (48) [fp+58H]   HFA(simd16)  do-not-enreg[SFAR] multireg-arg multireg-ret "Return value temp for multireg return"
;  V13 tmp12        [V13,T03] (  5, 10   )  struct (64) [fp+18H]   HFA(simd16)  do-not-enreg[SFAR] multireg-arg multireg-ret "Return value temp for multireg return"
;  V14 tmp13        [V14,T11] (  2,  2   )   float  ->   d0         V02._0(offs=0x00) P-INDEP "field V02._0 (fldOffset=0x0)"
;  V15 tmp14        [V15,T12] (  2,  2   )   float  ->   d1         V02._1(offs=0x04) P-INDEP "field V02._1 (fldOffset=0x4)"
;  V16 tmp15        [V16,T13] (  2,  2   )   float  ->   d0         V03._0(offs=0x00) P-INDEP "field V03._0 (fldOffset=0x0)"
;  V17 tmp16        [V17,T14] (  2,  2   )   float  ->   d1         V03._1(offs=0x04) P-INDEP "field V03._1 (fldOffset=0x4)"
;  V18 tmp17        [V18,T15] (  2,  2   )   float  ->   d2         V03._2(offs=0x08) P-INDEP "field V03._2 (fldOffset=0x8)"
;  V19 tmp18        [V19,T16] (  2,  2   )  double  ->   d0         V05._0(offs=0x00) P-INDEP "field V05._0 (fldOffset=0x0)"
;  V20 tmp19        [V20,T17] (  2,  2   )  double  ->   d1         V05._1(offs=0x08) P-INDEP "field V05._1 (fldOffset=0x8)"
;  V21 tmp20        [V21,T18] (  2,  2   )  double  ->   d0         V06._0(offs=0x00) P-INDEP "field V06._0 (fldOffset=0x0)"
;  V22 tmp21        [V22,T19] (  2,  2   )  double  ->   d1         V06._1(offs=0x08) P-INDEP "field V06._1 (fldOffset=0x8)"
;  V23 tmp22        [V23,T20] (  2,  2   )  double  ->   d2         V06._2(offs=0x10) P-INDEP "field V06._2 (fldOffset=0x10)"
;  V24 tmp23        [V24,T09] (  3,  6   )    long  ->  [fp+E0H]   do-not-enreg[] V08._0(offs=0x00) P-DEP "field V08._0 (fldOffset=0x0)"
;  V25 tmp24        [V25,T10] (  3,  6   )    long  ->  [fp+E8H]   do-not-enreg[] V08._1(offs=0x08) P-DEP "field V08._1 (fldOffset=0x8)"
;  V26 tmp25        [V26,T05] (  4,  8   )    long  ->  [fp+C8H]   do-not-enreg[] V09._0(offs=0x00) P-DEP "field V09._0 (fldOffset=0x0)"
;  V27 tmp26        [V27,T06] (  4,  8   )    long  ->  [fp+D0H]   do-not-enreg[] V09._1(offs=0x08) P-DEP "field V09._1 (fldOffset=0x8)"
;  V28 tmp27        [V28,T07] (  4,  8   )    long  ->  [fp+D8H]   do-not-enreg[] V09._2(offs=0x10) P-DEP "field V09._2 (fldOffset=0x10)"
;
; Lcl frame size = 272

G_M40249_IG01:              ;; offset=0000H
        A9AE7BFD          stp     fp, lr, [sp,#-288]!
        910003FD          mov     fp, sp
                                                ;; bbWeight=1    PerfScore 1.50
G_M40249_IG02:              ;; offset=0008H
        97FFCE6E          bl      PassHaStructsInRegs.Program:Produce_float32x2():float32x2
        97FFCE73          bl      PassHaStructsInRegs.Program:Consume(float32x2)
        97FFCE78          bl      PassHaStructsInRegs.Program:Produce_float32x3():float32x3
        97FFCE7D          bl      PassHaStructsInRegs.Program:Consume(float32x3)
        97FFCE82          bl      PassHaStructsInRegs.Program:Produce_float32x4():float32x4
        BD0113A0          str     s0, [fp,#272]
        BD0117A1          str     s1, [fp,#276]
        BD011BA2          str     s2, [fp,#280]
        BD011FA3          str     s3, [fp,#284]
        BD4113A0          ldr     s0, [fp,#272]
        BD4117A1          ldr     s1, [fp,#276]
        BD411BA2          ldr     s2, [fp,#280]
        BD411FA3          ldr     s3, [fp,#284]
        97FFCE7F          bl      PassHaStructsInRegs.Program:Consume(float32x4)
        97FFCE84          bl      PassHaStructsInRegs.Program:Produce_float64x2():float64x2
        97FFCE89          bl      PassHaStructsInRegs.Program:Consume(float64x2)
        97FFCE8E          bl      PassHaStructsInRegs.Program:Produce_float64x3():float64x3
        97FFCE93          bl      PassHaStructsInRegs.Program:Consume(float64x3)
        97FFCE98          bl      PassHaStructsInRegs.Program:Produce_float64x4():float64x4
        FD007BA0          str     d0, [fp,#240]
        FD007FA1          str     d1, [fp,#248]
        FD0083A2          str     d2, [fp,#256]
        FD0087A3          str     d3, [fp,#264]
        FD407BA0          ldr     d0, [fp,#240]
        FD407FA1          ldr     d1, [fp,#248]
        FD4083A2          ldr     d2, [fp,#256]
        FD4087A3          ldr     d3, [fp,#264]
        97FFCE95          bl      PassHaStructsInRegs.Program:Consume(float64x4)
        97FFCE9A          bl      PassHaStructsInRegs.Program:Produce_int32x2x2():int32x2x2
        FD0073A0          str     d0, [fp,#224]
        FD0077A1          str     d1, [fp,#232]
        FD4073A0          ldr     d0, [fp,#224]
        FD4077A1          ldr     d1, [fp,#232]
        97FFCE9B          bl      PassHaStructsInRegs.Program:Consume(int32x2x2)
        97FFCEA0          bl      PassHaStructsInRegs.Program:Produce_int32x2x3():int32x2x3
        FD0067A0          str     d0, [fp,#200]
        FD006BA1          str     d1, [fp,#208]
        FD006FA2          str     d2, [fp,#216]
        FD4067A0          ldr     d0, [fp,#200]
        FD406BA1          ldr     d1, [fp,#208]
        FD406FA2          ldr     d2, [fp,#216]
        97FFCE9F          bl      PassHaStructsInRegs.Program:Consume(int32x2x3)
        97FFCEA4          bl      PassHaStructsInRegs.Program:Produce_int32x2x4():int32x2x4
        FD0057A0          str     d0, [fp,#168]
        FD005BA1          str     d1, [fp,#176]
        FD005FA2          str     d2, [fp,#184]
        FD0063A3          str     d3, [fp,#192]
        FD4057A0          ldr     d0, [fp,#168]
        FD405BA1          ldr     d1, [fp,#176]
        FD405FA2          ldr     d2, [fp,#184]
        FD4063A3          ldr     d3, [fp,#192]
        97FFCEA1          bl      PassHaStructsInRegs.Program:Consume(int32x2x4)
        97FFCEA6          bl      PassHaStructsInRegs.Program:Produce_int32x4x2():int32x4x2
        3C8883A0          str     q0, [fp,#136]
        3C8983A1          str     q1, [fp,#152]
        3CC883A0          ldr     q0, [fp,#136]
        3CC983A1          ldr     q1, [fp,#152]
        97FFCEA7          bl      PassHaStructsInRegs.Program:Consume(int32x4x2)
        97FFCEAC          bl      PassHaStructsInRegs.Program:Produce_int32x4x3():int32x4x3
        3C8583A0          str     q0, [fp,#88]
        3C8683A1          str     q1, [fp,#104]
        3C8783A2          str     q2, [fp,#120]
        3CC583A0          ldr     q0, [fp,#88]
        3CC683A1          ldr     q1, [fp,#104]
        3CC783A2          ldr     q2, [fp,#120]
        97FFCEAB          bl      PassHaStructsInRegs.Program:Consume(int32x4x3)
        97FFCEB0          bl      PassHaStructsInRegs.Program:Produce_int32x4x4():int32x4x4
        3C8183A0          str     q0, [fp,#24]
        3C8283A1          str     q1, [fp,#40]
        3C8383A2          str     q2, [fp,#56]
        3C8483A3          str     q3, [fp,#72]
        3CC183A0          ldr     q0, [fp,#24]
        3CC283A1          ldr     q1, [fp,#40]
        3CC383A2          ldr     q2, [fp,#56]
        3CC483A3          ldr     q3, [fp,#72]
        97FFCEAD          bl      PassHaStructsInRegs.Program:Consume(int32x4x4)
        52800C80          mov     w0, #100
                                                ;; bbWeight=1    PerfScore 102.50
G_M40249_IG03:              ;; offset=013CH
        A8D27BFD          ldp     fp, lr, [sp],#288
        D65F03C0          ret     lr
                                                ;; bbWeight=1    PerfScore 2.00

; Total bytes of code 324, prolog size 8, PerfScore 138.40, instruction count 81, allocated bytes for code 324 (MethodHash=a22662c6) for method PassHaStructsInRegs.Program:Main(System.String[]):int

Note, however, that HFA/HVA values with 4 fields are still passed through memory: currently we don't promote a struct with more than 3 fields if none of its fields is accessed. I plan to work on this in a follow-up PR.

@echesakov echesakov added arch-arm64 area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI labels Dec 10, 2021
@ghost
Copy link

ghost commented Dec 10, 2021

Tagging subscribers to this area: @JulieLeeMSFT
See info in area-owners.md if you want to be subscribed.

Issue Details

null

Author: echesakovMSFT
Assignees: -
Labels:

arch-arm64, area-CodeGen-coreclr

Milestone: -

@echesakov
Copy link
Contributor Author

/azp run runtime-coreclr jitstress

@azure-pipelines
Copy link

Azure Pipelines successfully started running 1 pipeline(s).

@echesakov echesakov marked this pull request as ready for review December 10, 2021 22:27
@echesakov
Copy link
Contributor Author

@kunalspathak PTAL
cc @dotnet/jit-contrib

@@ -4284,11 +4284,7 @@ GenTree* Compiler::fgMorphMultiregStructArg(GenTree* arg, fgArgTabEntry* fgEntry
var_types type[MAX_ARG_REG_COUNT] = {}; // TYP_UNDEF = 0

hfaType = fgEntryPtr->GetHfaType();
if (varTypeIsValidHfaType(hfaType)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I changed the condition:

  1. I believe it was incorrectly mentioning !HOST_UNIX instead of specifying that the target is Windows.
  2. My understanding is that what we wanted to check here is whether the argument is passed in a SIMD register (win-arm64 varargs prohibits the use of SIMD registers).
  3. However, I believe that checking for !fgEntryPtr->IsVararg() was not enough - we should have checked for the stronger condition that the function has any vararg arguments at all - https://docs.microsoft.com/en-us/cpp/build/arm64-windows-abi-conventions?view=msvc-160#addendum-variadic-functions ?

@echesakov
Copy link
Contributor Author

An example of the diff in the libraries code:

diff --git a/base/178576.dasm b/diff/178576.dasm
index 7a4d5361b1c..12dc7570806 100644
--- a/base/178576.dasm
+++ b/diff/178576.dasm
@@ -6,45 +6,37 @@
 ; No matching PGO data
 ; Final local variable assignments
 ;
-;  V00 arg0         [V00    ] (  6,  6   )  struct ( 8) [fp+28H]   HFA(float)  do-not-enreg[SFA] multireg-arg single-def
-;  V01 arg1         [V01    ] (  6,  6   )  struct ( 8) [fp+20H]   HFA(float)  do-not-enreg[SFA] multireg-arg single-def
+;* V00 arg0         [V00    ] (  0,  0   )  struct ( 8) zero-ref    HFA(float)  multireg-arg single-def
+;* V01 arg1         [V01    ] (  0,  0   )  struct ( 8) zero-ref    HFA(float)  multireg-arg single-def
 ;# V02 OutArgs      [V02    ] (  1,  1   )  lclBlk ( 0) [sp+00H]   "OutgoingArgSpace"
 ;* V03 tmp1         [V03    ] (  0,  0   )  struct ( 8) zero-ref    HFA(float)  multireg-ret "Return value temp for multireg return"
-;  V04 tmp2         [V04,T01] (  5,  5   )   float  ->  [fp+28H]   do-not-enreg[] single-def V00.x(offs=0x00) P-DEP "field V00.x (fldOffset=0x0)"
-;  V05 tmp3         [V05,T02] (  5,  5   )   float  ->  [fp+2CH]   do-not-enreg[] single-def V00.y(offs=0x04) P-DEP "field V00.y (fldOffset=0x4)"
-;  V06 tmp4         [V06,T03] (  5,  5   )   float  ->  [fp+20H]   do-not-enreg[] single-def V01.width(offs=0x00) P-DEP "field V01.width (fldOffset=0x0)"
-;  V07 tmp5         [V07,T04] (  5,  5   )   float  ->  [fp+24H]   do-not-enreg[] single-def V01.height(offs=0x04) P-DEP "field V01.height (fldOffset=0x4)"
+;  V04 tmp2         [V04,T01] (  2,  2   )   float  ->   d0         single-def V00.x(offs=0x00) P-INDEP "field V00.x (fldOffset=0x0)"
+;  V05 tmp3         [V05,T02] (  2,  2   )   float  ->   d1         single-def V00.y(offs=0x04) P-INDEP "field V00.y (fldOffset=0x4)"
+;  V06 tmp4         [V06,T03] (  2,  2   )   float  ->   d2         single-def V01.width(offs=0x00) P-INDEP "field V01.width (fldOffset=0x0)"
+;  V07 tmp5         [V07,T04] (  2,  2   )   float  ->   d3         single-def V01.height(offs=0x04) P-INDEP "field V01.height (fldOffset=0x4)"
 ;  V08 tmp6         [V08,T05] (  2,  2   )   float  ->   d0         V03.x(offs=0x00) P-INDEP "field V03.x (fldOffset=0x0)"
 ;  V09 tmp7         [V09,T06] (  2,  2   )   float  ->   d1         V03.y(offs=0x04) P-INDEP "field V03.y (fldOffset=0x4)"
 ;  V10 tmp8         [V10,T00] (  3,  3   )  struct ( 8) [fp+18H]   HFA(float)  do-not-enreg[SFR] multireg-ret "Return value temp for multi-reg return (rejected tail call)."
 ;
-; Lcl frame size = 32
+; Lcl frame size = 16

 G_M36202_IG01:        ; gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, nogc <-- Prolog IG
-            stp     fp, lr, [sp,#-48]!
+            stp     fp, lr, [sp,#-32]!
             mov     fp, sp
-            str     s0, [fp,#40]
-            str     s1, [fp,#44]
-            str     s2, [fp,#32]
-            str     s3, [fp,#36]
-                                               ;; bbWeight=1    PerfScore 5.50
+                                               ;; bbWeight=1    PerfScore 1.50
 G_M36202_IG02:        ; gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
-            ldr     s0, [fp,#40]
-            ldr     s1, [fp,#44]
-            ldr     s2, [fp,#32]
-            ldr     s3, [fp,#36]
             bl      System.Drawing.PointF:Add(System.Drawing.PointF,System.Drawing.SizeF):System.Drawing.PointF
             str     s0, [fp,#24]
             str     s1, [fp,#28]
             ldr     s0, [fp,#24]
             ldr     s1, [fp,#28]
-                                               ;; bbWeight=1    PerfScore 15.00
+                                               ;; bbWeight=1    PerfScore 7.00
 G_M36202_IG03:        ; , epilog, nogc, extend
-            ldp     fp, lr, [sp],#48
+            ldp     fp, lr, [sp],#32
             ret     lr
                                                ;; bbWeight=1    PerfScore 2.00

-; Total bytes of code 68, prolog size 8, PerfScore 29.30, instruction count 17, allocated bytes for code 68 (MethodHash=96e07295) for method System.Drawing.PointF:op_Addition(System.Drawing.PointF,System.Drawing.SizeF):System.Drawing.PointF
+; Total bytes of code 36, prolog size 8, PerfScore 14.10, instruction count 9, allocated bytes for code 36 (MethodHash=96e07295) for method System.Drawing.PointF:op_Addition(System.Drawing.PointF,System.Drawing.SizeF):System.Drawing.PointF
 ; ============================================================

Copy link
Member

@BruceForstall BruceForstall left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was confused by the title, because "Pass HFA/HVA in registers" implies to me that you are changing the calling convention / ABI, but actually, you're just changing how HFA/HVA arguments are handled in setting up for calls, not where the values live in argument registers.

Copy link
Member

@kunalspathak kunalspathak left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM. Nice wins!

@echesakov
Copy link
Contributor Author

I was confused by the title, because "Pass HFA/HVA in registers" implies to me that you are changing the calling convention / ABI, but actually, you're just changing how HFA/HVA arguments are handled in setting up for calls, not where the values live in argument registers.

Sorry for the confusion. I should've named the change "Allow promoted fields of HFA/HVAs to stay in registers when possible".

@echesakov echesakov merged commit c5fa469 into dotnet:main Dec 15, 2021
@echesakov echesakov deleted the Pass-HFA-HVA-In-Regs branch December 15, 2021 19:07
@ghost ghost locked as resolved and limited conversation to collaborators Jan 15, 2022
Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Labels
arch-arm64 area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants