-
Notifications
You must be signed in to change notification settings - Fork 5.1k
Description
Arm64 provides branchless conditional selection and comparison instructions that should be utilized by RyuJIT in the code it generates.
RyuJIT already has support for them as seen below:
runtime/src/coreclr/jit/instrsarm64.h
Lines 1353 to 1375 in f0b7773
INST1(csel, "csel", 0, IF_DR_3D, 0x1A800000)
// csel Rd,Rn,Rm,cond DR_3D X0011010100mmmmm cccc00nnnnnddddd 1A80 0000 cond
INST1(csinc, "csinc", 0, IF_DR_3D, 0x1A800400)
// csinc Rd,Rn,Rm,cond DR_3D X0011010100mmmmm cccc01nnnnnddddd 1A80 0400 cond
INST1(csinv, "csinv", 0, IF_DR_3D, 0x5A800000)
// csinv Rd,Rn,Rm,cond DR_3D X1011010100mmmmm cccc00nnnnnddddd 5A80 0000 cond
INST1(csneg, "csneg", 0, IF_DR_3D, 0x5A800400)
// csneg Rd,Rn,Rm,cond DR_3D X1011010100mmmmm cccc01nnnnnddddd 5A80 0400 cond
INST1(cinc, "cinc", 0, IF_DR_2D, 0x1A800400)
// cinc Rd,Rn,cond DR_2D X0011010100nnnnn cccc01nnnnnddddd 1A80 0400 cond
INST1(cinv, "cinv", 0, IF_DR_2D, 0x5A800000)
// cinv Rd,Rn,cond DR_2D X1011010100nnnnn cccc00nnnnnddddd 5A80 0000 cond
INST1(cneg, "cneg", 0, IF_DR_2D, 0x5A800400)
// cneg Rd,Rn,cond DR_2D X1011010100nnnnn cccc01nnnnnddddd 5A80 0400 cond
INST1(cset, "cset", 0, IF_DR_1D, 0x1A9F07E0)
// cset Rd,cond DR_1D X001101010011111 cccc0111111ddddd 1A9F 07E0 Rd cond
runtime/src/coreclr/jit/instrsarm64.h
Lines 633 to 639 in f0b7773
INST2(ccmp, "ccmp", CMP, IF_EN2F, 0x7A400000, 0x7A400800)
// ccmp Rn,Rm, nzcv,cond DR_2I X1111010010mmmmm cccc00nnnnn0nzcv 7A40 0000 nzcv, cond
// ccmp Rn,imm5,nzcv,cond DI_1F X1111010010iiiii cccc10nnnnn0nzcv 7A40 0800 imm5, nzcv, cond
INST2(ccmn, "ccmn", CMP, IF_EN2F, 0x3A400000, 0x3A400800)
// ccmn Rn,Rm, nzcv,cond DR_2I X0111010010mmmmm cccc00nnnnn0nzcv 3A40 0000 nzcv, cond
// ccmn Rn,imm5,nzcv,cond DI_1F X0111010010iiiii cccc10nnnnn0nzcv 3A40 0800 imm5, nzcv, cond
Currently, the methods emitIns_R_R_R_COND and emitIns_R_I_FLAGS_COND that produce these instructions are barely utilized: emitIns_R_R_R_COND
was only recently used in #66407 to generate the csneg instruction. Once these instructions are used more widely, we could produce much better code. Below are some examples:
Example# 1:
static void Test(uint op1, uint op2) {
if (op1 > 0 && op2 > 0) {
op1 = 5;
} else {
op1 = 10;
}
Consume(op1, op2);
}
Ideal code: https://godbolt.org/z/5ov9TKx6P
Current code:
G_M2878_IG01:
stp fp, lr, [sp,#-16]!
mov fp, sp
;; bbWeight=1 PerfScore 1.50
G_M2878_IG02:
cbz w0, G_M2878_IG04
;; bbWeight=1 PerfScore 1.00
G_M2878_IG03:
cbz w1, G_M2878_IG04
mov w0, #5
b G_M2878_IG05
;; bbWeight=0.50 PerfScore 1.25
G_M2878_IG04:
mov w0, #10
;; bbWeight=0.50 PerfScore 0.25
G_M2878_IG05:
bl _12219:Consume(int,int)
;; bbWeight=1 PerfScore 1.00
G_M2878_IG06:
ldp fp, lr, [sp],#16
ret lr
;; bbWeight=1 PerfScore 2.00
Example# 2:
static void Test(uint op1, uint op2) {
op1 = op1 > 0 ? 5 : 6;
Consume(op1, op2);
}
Ideal code: https://godbolt.org/z/GTnc4jjfG
Current code:
G_M9565_IG01:
stp fp, lr, [sp,#-16]!
mov fp, sp
;; bbWeight=1 PerfScore 1.50
G_M9565_IG02:
cmp w0, #0
bgt G_M9565_IG04
;; bbWeight=1 PerfScore 1.50
G_M9565_IG03:
mov w0, #6
b G_M9565_IG05
;; bbWeight=0.50 PerfScore 0.75
G_M9565_IG04:
mov w0, #5
;; bbWeight=0.50 PerfScore 0.25
G_M9565_IG05:
bl _12219:Consume(int,int)
;; bbWeight=1 PerfScore 1.00
G_M9565_IG06:
ldp fp, lr, [sp],#16
ret lr
;; bbWeight=1 PerfScore 2.00
Example# 3:
static void Test(uint op1, uint op2) {
op1 = (op1 > 0) ? 0 : 1;
Consume(op1, op2);
}
Ideal code: https://godbolt.org/z/GoqcsM1Tf
Current code:
G_M9565_IG01:
stp fp, lr, [sp,#-16]!
mov fp, sp
;; bbWeight=1 PerfScore 1.50
G_M9565_IG02:
cmp w0, #0
bgt G_M9565_IG04
;; bbWeight=1 PerfScore 1.50
G_M9565_IG03:
mov w0, #1
b G_M9565_IG05
;; bbWeight=0.50 PerfScore 0.75
G_M9565_IG04:
mov w0, wzr
;; bbWeight=0.50 PerfScore 0.25
G_M9565_IG05:
bl _12219:Consume(int,int)
;; bbWeight=1 PerfScore 1.00
G_M9565_IG06:
ldp fp, lr, [sp],#16
ret lr
;; bbWeight=1 PerfScore 2.00
Example# 4:
static void Test(uint op1, uint op2, uint xyz, uint def) {
op1 = op1 > 0 ? xyz : def;
Consume(op1, op2);
}
Ideal code: https://godbolt.org/z/1EfxPn48q
Current code:
G_M9565_IG01:
stp fp, lr, [sp,#-16]!
mov fp, sp
;; bbWeight=1 PerfScore 1.50
G_M9565_IG02:
cbnz w0, G_M9565_IG04
;; bbWeight=1 PerfScore 1.00
G_M9565_IG03:
b G_M9565_IG05
;; bbWeight=0.50 PerfScore 0.50
G_M9565_IG04:
mov w3, w2
;; bbWeight=0.50 PerfScore 0.25
G_M9565_IG05:
mov w0, w3
bl _12219:Consume(int,int)
;; bbWeight=1 PerfScore 1.50
G_M9565_IG06:
ldp fp, lr, [sp],#16
ret lr
;; bbWeight=1 PerfScore 2.00
Example# 5:
static void Test(int op1, int op2, int xyz, int def) {
op1 = ((op1 & op2) == 0) ? 5 : def;
Consume(op1, op2);
}
Ideal code: https://godbolt.org/z/fc3eddPx3
Current code:
G_M9565_IG01:
stp fp, lr, [sp,#-16]!
mov fp, sp
;; bbWeight=1 PerfScore 1.50
G_M9565_IG02:
tst w0, w1
beq G_M9565_IG04
;; bbWeight=1 PerfScore 1.50
G_M9565_IG03:
b G_M9565_IG05
;; bbWeight=0.50 PerfScore 0.50
G_M9565_IG04:
mov w3, #5
;; bbWeight=0.50 PerfScore 0.25
G_M9565_IG05:
mov w0, w3
bl _12219:Consume(int,int)
;; bbWeight=1 PerfScore 1.50
G_M9565_IG06:
ldp fp, lr, [sp],#16
ret lr
;; bbWeight=1 PerfScore 2.00
Some related issues:
- #6749 RyuJit: avoid conditional jumps using cmov and similar instructions
- #41549 RyuJIT: Optimize "X / POW2_CNS" via cmovns
- #43440 [RyuJIT][arm64] Optimize "x<0" and "x>=0"
Presumably, some parts of the analysis can be implemented in a platform-agnostic way and benefit both the Arm64 and x86 platforms.
category:cq
theme:intrinsics
skill-level:expert
cost:large
impact:medium
Metadata
Metadata
Assignees
Labels
Type
Projects
Status