Skip to content

Arm64: Generate conditional comparison and selection instructions #55364

@echesakov

Description

@echesakov

Arm64 provides branchless conditional selection and comparison instructions that should be utilized by RyuJIT in the code it generates.

image

Reference: https://eclecticlight.co/2021/07/20/code-in-arm-assembly-conditions-without-branches/

RyuJIT already has support for them as seen below:

INST1(csel, "csel", 0, IF_DR_3D, 0x1A800000)
// csel Rd,Rn,Rm,cond DR_3D X0011010100mmmmm cccc00nnnnnddddd 1A80 0000 cond
INST1(csinc, "csinc", 0, IF_DR_3D, 0x1A800400)
// csinc Rd,Rn,Rm,cond DR_3D X0011010100mmmmm cccc01nnnnnddddd 1A80 0400 cond
INST1(csinv, "csinv", 0, IF_DR_3D, 0x5A800000)
// csinv Rd,Rn,Rm,cond DR_3D X1011010100mmmmm cccc00nnnnnddddd 5A80 0000 cond
INST1(csneg, "csneg", 0, IF_DR_3D, 0x5A800400)
// csneg Rd,Rn,Rm,cond DR_3D X1011010100mmmmm cccc01nnnnnddddd 5A80 0400 cond
INST1(cinc, "cinc", 0, IF_DR_2D, 0x1A800400)
// cinc Rd,Rn,cond DR_2D X0011010100nnnnn cccc01nnnnnddddd 1A80 0400 cond
INST1(cinv, "cinv", 0, IF_DR_2D, 0x5A800000)
// cinv Rd,Rn,cond DR_2D X1011010100nnnnn cccc00nnnnnddddd 5A80 0000 cond
INST1(cneg, "cneg", 0, IF_DR_2D, 0x5A800400)
// cneg Rd,Rn,cond DR_2D X1011010100nnnnn cccc01nnnnnddddd 5A80 0400 cond
INST1(cset, "cset", 0, IF_DR_1D, 0x1A9F07E0)
// cset Rd,cond DR_1D X001101010011111 cccc0111111ddddd 1A9F 07E0 Rd cond

INST2(ccmp, "ccmp", CMP, IF_EN2F, 0x7A400000, 0x7A400800)
// ccmp Rn,Rm, nzcv,cond DR_2I X1111010010mmmmm cccc00nnnnn0nzcv 7A40 0000 nzcv, cond
// ccmp Rn,imm5,nzcv,cond DI_1F X1111010010iiiii cccc10nnnnn0nzcv 7A40 0800 imm5, nzcv, cond
INST2(ccmn, "ccmn", CMP, IF_EN2F, 0x3A400000, 0x3A400800)
// ccmn Rn,Rm, nzcv,cond DR_2I X0111010010mmmmm cccc00nnnnn0nzcv 3A40 0000 nzcv, cond
// ccmn Rn,imm5,nzcv,cond DI_1F X0111010010iiiii cccc10nnnnn0nzcv 3A40 0800 imm5, nzcv, cond

Currently, the methods emitIns_R_R_R_COND and emitIns_R_I_FLAGS_COND that produce these instructions are almost entirely unused; emitIns_R_R_R_COND was only recently used in #66407 to generate the csneg instruction. Once these instructions are used more widely, we could produce much better code. Below are some examples:

Example# 1:

    static void Test(uint op1, uint op2) {
        if (op1 > 0 && op2 > 0) {
            op1 = 5;
        }  else {
            op1 = 10;
        }
        Consume(op1, op2);
    }

Ideal code: https://godbolt.org/z/5ov9TKx6P
Current code:

G_M2878_IG01:
            stp     fp, lr, [sp,#-16]!
            mov     fp, sp
                                                ;; bbWeight=1    PerfScore 1.50
G_M2878_IG02:
            cbz     w0, G_M2878_IG04
                                                ;; bbWeight=1    PerfScore 1.00
G_M2878_IG03:
            cbz     w1, G_M2878_IG04
            mov     w0, #5
            b       G_M2878_IG05
                                                ;; bbWeight=0.50 PerfScore 1.25
G_M2878_IG04:
            mov     w0, #10
                                                ;; bbWeight=0.50 PerfScore 0.25
G_M2878_IG05:
            bl      _12219:Consume(int,int)
                                                ;; bbWeight=1    PerfScore 1.00
G_M2878_IG06:
            ldp     fp, lr, [sp],#16
            ret     lr
                                                ;; bbWeight=1    PerfScore 2.00

Example# 2:

    static void Test(uint op1, uint op2) {
        op1 = op1 > 0 ? 5 : 6;
        Consume(op1, op2);
    }

Ideal code: https://godbolt.org/z/GTnc4jjfG
Current code:

G_M9565_IG01:
            stp     fp, lr, [sp,#-16]!
            mov     fp, sp
                                                ;; bbWeight=1    PerfScore 1.50
G_M9565_IG02:
            cmp     w0, #0
            bgt     G_M9565_IG04
                                                ;; bbWeight=1    PerfScore 1.50
G_M9565_IG03:
            mov     w0, #6
            b       G_M9565_IG05
                                                ;; bbWeight=0.50 PerfScore 0.75
G_M9565_IG04:
            mov     w0, #5
                                                ;; bbWeight=0.50 PerfScore 0.25
G_M9565_IG05:
            bl      _12219:Consume(int,int)
                                                ;; bbWeight=1    PerfScore 1.00
G_M9565_IG06:
            ldp     fp, lr, [sp],#16
            ret     lr
                                                ;; bbWeight=1    PerfScore 2.00

Example# 3:

    static void Test(uint op1, uint op2) {
        op1 = (op1 > 0) ? 0 : 1;
        Consume(op1, op2);
    }

Ideal code: https://godbolt.org/z/GoqcsM1Tf
Current code:

G_M9565_IG01:
            stp     fp, lr, [sp,#-16]!
            mov     fp, sp
                                                ;; bbWeight=1    PerfScore 1.50
G_M9565_IG02:
            cmp     w0, #0
            bgt     G_M9565_IG04
                                                ;; bbWeight=1    PerfScore 1.50
G_M9565_IG03:
            mov     w0, #1
            b       G_M9565_IG05
                                                ;; bbWeight=0.50 PerfScore 0.75
G_M9565_IG04:
            mov     w0, wzr
                                                ;; bbWeight=0.50 PerfScore 0.25
G_M9565_IG05:
            bl      _12219:Consume(int,int)
                                                ;; bbWeight=1    PerfScore 1.00
G_M9565_IG06:
            ldp     fp, lr, [sp],#16
            ret     lr
                                                ;; bbWeight=1    PerfScore 2.00

Example# 4:

    static void Test(uint op1, uint op2, uint xyz, uint def) {
        op1 = op1 > 0 ? xyz : def;
        Consume(op1, op2);
    }

Ideal code: https://godbolt.org/z/1EfxPn48q
Current code:

G_M9565_IG01:
            stp     fp, lr, [sp,#-16]!
            mov     fp, sp
                                                ;; bbWeight=1    PerfScore 1.50
G_M9565_IG02:
            cbnz    w0, G_M9565_IG04
                                                ;; bbWeight=1    PerfScore 1.00
G_M9565_IG03:
            b       G_M9565_IG05
                                                ;; bbWeight=0.50 PerfScore 0.50
G_M9565_IG04:
            mov     w3, w2
                                                ;; bbWeight=0.50 PerfScore 0.25
G_M9565_IG05:
            mov     w0, w3
            bl      _12219:Consume(int,int)
                                                ;; bbWeight=1    PerfScore 1.50
G_M9565_IG06:
            ldp     fp, lr, [sp],#16
            ret     lr
                                                ;; bbWeight=1    PerfScore 2.00

Example# 5:

    static void Test(int op1, int op2, int xyz, int def) {
        op1 = ((op1 & op2) == 0) ? 5 : def;
        Consume(op1, op2);
    }

Ideal code: https://godbolt.org/z/fc3eddPx3
Current code:

G_M9565_IG01:
            stp     fp, lr, [sp,#-16]!
            mov     fp, sp
                                                ;; bbWeight=1    PerfScore 1.50
G_M9565_IG02:
            tst     w0, w1
            beq     G_M9565_IG04
                                                ;; bbWeight=1    PerfScore 1.50
G_M9565_IG03:
            b       G_M9565_IG05
                                                ;; bbWeight=0.50 PerfScore 0.50
G_M9565_IG04:
            mov     w3, #5
                                                ;; bbWeight=0.50 PerfScore 0.25
G_M9565_IG05:
            mov     w0, w3
            bl      _12219:Consume(int,int)
                                                ;; bbWeight=1    PerfScore 1.50
G_M9565_IG06:
            ldp     fp, lr, [sp],#16
            ret     lr
                                                ;; bbWeight=1    PerfScore 2.00

@TamarChristinaArm

Some related issues:

Presumably, some parts of the analysis can be implemented in a platform-agnostic way and benefit both Arm64 and X86 platforms.

category:cq
theme:intrinsics
skill-level:expert
cost:large
impact:medium

Metadata

Metadata

Labels

Bottom Up Work (Not part of a theme, epic, or user story); area-CodeGen-coreclr (CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI)

Type

No type

Projects

Status

Done

Milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions