-
Notifications
You must be signed in to change notification settings - Fork 5.1k
Arm64: re-enable use of predicate variants #117313
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
Fixes dotnet#101970. Predicate variants were implemented in dotnet#114438 and then turned off in dotnet#115566. The code was then removed in dotnet#117101 when the AMD64 version was moved from morph to folding. This is a simple rework of that code. Replaces dotnet#116854.
Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch |
Looking at the diffs, there's code movement, but zero gains. Generally this is because we don't have any test that uses the result of the predicate as a mask. The changes get gains on the input then lose them again on the result conversion. There is one case (+12) where we no longer optimise away a constant conditional select. I can take a look at that. Diffs are based on 2,565,859 contexts (1,098,055 MinOpts, 1,467,804 FullOpts). MISSED contexts: 944 (0.04%) Overall (+12 bytes)
FullOpts (+12 bytes)
Example diffs: coreclr_tests.run.linux.arm64.checked.mch +0 (0.00%) : 186049.dasm - JIT.HardwareIntrinsics.Arm._Sve.SimpleTernaryOpTest__Sve_CreateBreakAfterPropagateMask_byte:ConditionalSelect_ZeroOp():this (FullOpts) @@ -143,9 +143,9 @@ G_M40199_IG02: ; bbWeight=1, gcrefRegs=80000 {x19}, byrefRegs=0000 {}, by
ptrue p3.b
cmpne p3.b, p3/z, z11.b, #0
brkpa p1.b, p1/z, p2.b, p3.b
- mov z16.b, p1/z, #1
- movi v17.4s, #0
- sel z12.b, p0, z16.b, z17.b
+ pfalse p2.b
+ sel p0.b, p0, p1.b, p2.b
+ mov z12.b, p0/z, #1
str q12, [fp, #0x20] // [V07 tmp6]
add x21, x19, #96
; byrRegs +[x21]
@@ -321,7 +321,7 @@ G_M40199_IG02: ; bbWeight=1, gcrefRegs=80000 {x19}, byrefRegs=0000 {}, by
blr x1
; byrRegs -[x0]
ldr x1, [x24, #0x20]
- ;; size=716 bbWeight=1 PerfScore 201.50
+ ;; size=716 bbWeight=1 PerfScore 203.00
G_M40199_IG03: ; bbWeight=1, extend
add x0, x0, x1
sub x0, x0, #1
@@ -382,16 +382,16 @@ G_M40199_IG03: ; bbWeight=1, extend
ldp q10, q11, [x19, #0x30]
ptrue p0.b
cmpne p0.b, p0/z, z8.b, #0
- movi v16.4s, #0
- ptrue p1.b
- cmpne p1.b, p1/z, z9.b, #0
+ pfalse p1.b
ptrue p2.b
- cmpne p2.b, p2/z, z10.b, #0
+ cmpne p2.b, p2/z, z9.b, #0
ptrue p3.b
- cmpne p3.b, p3/z, z11.b, #0
- brkpa p1.b, p1/z, p2.b, p3.b
- mov z17.b, p1/z, #1
- sel z12.b, p0, z16.b, z17.b
+ cmpne p3.b, p3/z, z10.b, #0
+ ptrue p4.b
+ cmpne p4.b, p4/z, z11.b, #0
+ brkpa p2.b, p2/z, p3.b, p4.b
+ sel p0.b, p0, p1.b, p2.b
+ mov z12.b, p0/z, #1
str q12, [fp, #0x10] // [V47 tmp46]
mov x22, x21
; byrRegs +[x22]
@@ -534,7 +534,7 @@ G_M40199_IG03: ; bbWeight=1, extend
mov x0, x20
; gcrRegs +[x0]
movz x1, #0xD1FFAB1E
- ;; size=728 bbWeight=1 PerfScore 196.50
+ ;; size=728 bbWeight=1 PerfScore 198.00
G_M40199_IG04: ; bbWeight=1, extend
movk x1, #0xD1FFAB1E LSL #16
movk x1, #0xD1FFAB1E LSL #32
@@ -614,7 +614,7 @@ G_M40199_IG05: ; bbWeight=1, epilog, nogc, extend
br x3
;; size=36 bbWeight=1 PerfScore 9.00
-; Total bytes of code 1744, prolog size 36, PerfScore 472.00, instruction count 436, allocated bytes for code 1744 (MethodHash=469562f8) for method JIT.HardwareIntrinsics.Arm._Sve.SimpleTernaryOpTest__Sve_CreateBreakAfterPropagateMask_byte:ConditionalSelect_ZeroOp():this (FullOpts)
+; Total bytes of code 1744, prolog size 36, PerfScore 475.00, instruction count 436, allocated bytes for code 1744 (MethodHash=469562f8) for method JIT.HardwareIntrinsics.Arm._Sve.SimpleTernaryOpTest__Sve_CreateBreakAfterPropagateMask_byte:ConditionalSelect_ZeroOp():this (FullOpts)
; ============================================================
Unwind Info: +0 (0.00%) : 186370.dasm - JIT.HardwareIntrinsics.Arm._Sve.SimpleBinaryOpTest__Sve_CreateBreakBeforeMask_short:ConditionalSelect_ZeroOp():this (FullOpts)@@ -135,9 +135,9 @@ G_M36609_IG02: ; bbWeight=1, gcrefRegs=80000 {x19}, byrefRegs=0000 {}, by
ptrue p2.h
cmpne p2.h, p2/z, z10.h, #0
brkb p1.b, p1/z, p2.b
- mov z16.h, p1/z, #1
- movi v17.4s, #0
- sel z11.h, p0, z16.h, z17.h
+ pfalse p2.b
+ sel p0.b, p0, p1.b, p2.b
+ mov z11.h, p0/z, #1
add x21, x19, #80
; byrRegs +[x21]
mov x22, x21
@@ -315,7 +315,7 @@ G_M36609_IG02: ; bbWeight=1, gcrefRegs=80000 {x19}, byrefRegs=0000 {}, by
movk x1, #0xD1FFAB1E LSL #16
movk x1, #0xD1FFAB1E LSL #32
ldr x1, [x1]
- ;; size=712 bbWeight=1 PerfScore 190.00
+ ;; size=712 bbWeight=1 PerfScore 191.50
G_M36609_IG03: ; bbWeight=1, extend
blr x1
; byrRegs -[x0]
@@ -361,14 +361,14 @@ G_M36609_IG03: ; bbWeight=1, extend
ldr q10, [x19, #0x30]
ptrue p0.h
cmpne p0.h, p0/z, z8.h, #0
- movi v16.4s, #0
- ptrue p1.h
- cmpne p1.h, p1/z, z9.h, #0
+ pfalse p1.b
ptrue p2.h
- cmpne p2.h, p2/z, z10.h, #0
- brkb p1.b, p1/z, p2.b
- mov z17.h, p1/z, #1
- sel z11.h, p0, z16.h, z17.h
+ cmpne p2.h, p2/z, z9.h, #0
+ ptrue p3.h
+ cmpne p3.h, p3/z, z10.h, #0
+ brkb p2.b, p2/z, p3.b
+ sel p0.b, p0, p1.b, p2.b
+ mov z11.h, p0/z, #1
mov x22, x21
; byrRegs +[x22]
mov x0, x23
@@ -523,7 +523,7 @@ G_M36609_IG03: ; bbWeight=1, extend
; byrRegs +[x0]
movz x1, #0xD1FFAB1E // code for <unknown method>
movk x1, #0xD1FFAB1E LSL #16
- ;; size=696 bbWeight=1 PerfScore 185.50
+ ;; size=696 bbWeight=1 PerfScore 187.00
G_M36609_IG04: ; bbWeight=1, extend
movk x1, #0xD1FFAB1E LSL #32
ldr x1, [x1]
@@ -578,7 +578,7 @@ G_M36609_IG05: ; bbWeight=1, epilog, nogc, extend
br x3
;; size=36 bbWeight=1 PerfScore 9.00
-; Total bytes of code 1624, prolog size 36, PerfScore 427.00, instruction count 406, allocated bytes for code 1624 (MethodHash=e17d70fe) for method JIT.HardwareIntrinsics.Arm._Sve.SimpleBinaryOpTest__Sve_CreateBreakBeforeMask_short:ConditionalSelect_ZeroOp():this (FullOpts)
+; Total bytes of code 1624, prolog size 36, PerfScore 430.00, instruction count 406, allocated bytes for code 1624 (MethodHash=e17d70fe) for method JIT.HardwareIntrinsics.Arm._Sve.SimpleBinaryOpTest__Sve_CreateBreakBeforeMask_short:ConditionalSelect_ZeroOp():this (FullOpts)
; ============================================================
Unwind Info: +0 (0.00%) : 186179.dasm - JIT.HardwareIntrinsics.Arm._Sve.SimpleTernaryOpTest__Sve_CreateBreakAfterPropagateMask_short:ConditionalSelect_ZeroOp():this (FullOpts)@@ -143,9 +143,9 @@ G_M6879_IG02: ; bbWeight=1, gcrefRegs=80000 {x19}, byrefRegs=0000 {}, byr
ptrue p3.h
cmpne p3.h, p3/z, z11.h, #0
brkpa p1.b, p1/z, p2.b, p3.b
- mov z16.h, p1/z, #1
- movi v17.4s, #0
- sel z12.h, p0, z16.h, z17.h
+ pfalse p2.b
+ sel p0.b, p0, p1.b, p2.b
+ mov z12.h, p0/z, #1
str q12, [fp, #0x20] // [V07 tmp6]
add x21, x19, #96
; byrRegs +[x21]
@@ -321,7 +321,7 @@ G_M6879_IG02: ; bbWeight=1, gcrefRegs=80000 {x19}, byrefRegs=0000 {}, byr
blr x1
; byrRegs -[x0]
ldr x1, [x24, #0x20]
- ;; size=716 bbWeight=1 PerfScore 201.50
+ ;; size=716 bbWeight=1 PerfScore 203.00
G_M6879_IG03: ; bbWeight=1, extend
add x0, x0, x1
sub x0, x0, #1
@@ -382,16 +382,16 @@ G_M6879_IG03: ; bbWeight=1, extend
ldp q10, q11, [x19, #0x30]
ptrue p0.h
cmpne p0.h, p0/z, z8.h, #0
- movi v16.4s, #0
- ptrue p1.h
- cmpne p1.h, p1/z, z9.h, #0
+ pfalse p1.b
ptrue p2.h
- cmpne p2.h, p2/z, z10.h, #0
+ cmpne p2.h, p2/z, z9.h, #0
ptrue p3.h
- cmpne p3.h, p3/z, z11.h, #0
- brkpa p1.b, p1/z, p2.b, p3.b
- mov z17.h, p1/z, #1
- sel z12.h, p0, z16.h, z17.h
+ cmpne p3.h, p3/z, z10.h, #0
+ ptrue p4.h
+ cmpne p4.h, p4/z, z11.h, #0
+ brkpa p2.b, p2/z, p3.b, p4.b
+ sel p0.b, p0, p1.b, p2.b
+ mov z12.h, p0/z, #1
str q12, [fp, #0x10] // [V47 tmp46]
mov x22, x21
; byrRegs +[x22]
@@ -534,7 +534,7 @@ G_M6879_IG03: ; bbWeight=1, extend
mov x0, x20
; gcrRegs +[x0]
movz x1, #0xD1FFAB1E
- ;; size=728 bbWeight=1 PerfScore 196.50
+ ;; size=728 bbWeight=1 PerfScore 198.00
G_M6879_IG04: ; bbWeight=1, extend
movk x1, #0xD1FFAB1E LSL #16
movk x1, #0xD1FFAB1E LSL #32
@@ -614,7 +614,7 @@ G_M6879_IG05: ; bbWeight=1, epilog, nogc, extend
br x3
;; size=36 bbWeight=1 PerfScore 9.00
-; Total bytes of code 1744, prolog size 36, PerfScore 472.00, instruction count 436, allocated bytes for code 1744 (MethodHash=74b4e520) for method JIT.HardwareIntrinsics.Arm._Sve.SimpleTernaryOpTest__Sve_CreateBreakAfterPropagateMask_short:ConditionalSelect_ZeroOp():this (FullOpts)
+; Total bytes of code 1744, prolog size 36, PerfScore 475.00, instruction count 436, allocated bytes for code 1744 (MethodHash=74b4e520) for method JIT.HardwareIntrinsics.Arm._Sve.SimpleTernaryOpTest__Sve_CreateBreakAfterPropagateMask_short:ConditionalSelect_ZeroOp():this (FullOpts)
; ============================================================
Unwind Info: +12 (+12.50%) : 344442.dasm - Runtime_113338:Test() (FullOpts)@@ -20,17 +20,20 @@ G_M1759_IG01: ; bbWeight=1, gcVars=0000000000000000 {}, gcrefRegs=0000 {}
mov fp, sp
;; size=8 bbWeight=1 PerfScore 1.50
G_M1759_IG02: ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, isz
+ ptrue p0.s
+ pfalse p1.b
+ pfalse p2.b
+ sel p0.b, p0, p1.b, p2.b
mov w0, wzr
- ptrue p0.b, vl1
- sqincp x0, p0.d, w0
+ ptrue p1.b, vl1
+ sqincp x0, p1.d, w0
movi v16.4s, #0
ins v16.s[0], w0
- ptrue p0.s
- cmpne p0.s, p0/z, z16.s, #0
- pfalse p1.b
- ptest p1, p0.b
+ ptrue p1.s
+ cmpne p1.s, p1/z, z16.s, #0
+ ptest p0, p1.b
blo G_M1759_IG04
- ;; size=40 bbWeight=1 PerfScore 20.00
+ ;; size=52 bbWeight=1 PerfScore 26.00
G_M1759_IG03: ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, epilog, nogc
ldp fp, lr, [sp], #0x10
ret lr
@@ -50,7 +53,7 @@ G_M1759_IG05: ; bbWeight=0.50, epilog, nogc, extend
br x1
;; size=8 bbWeight=0.50 PerfScore 1.00
-; Total bytes of code 96, prolog size 8, PerfScore 28.00, instruction count 24, allocated bytes for code 96 (MethodHash=5e0bf920) for method Runtime_113338:Test() (FullOpts)
+; Total bytes of code 108, prolog size 8, PerfScore 34.00, instruction count 27, allocated bytes for code 108 (MethodHash=5e0bf920) for method Runtime_113338:Test() (FullOpts)
; ============================================================
Unwind Info:
@@ -61,7 +64,7 @@ Unwind Info:
E bit : 0
X bit : 0
Vers : 0
- Function Length : 24 (0x00018) Actual length = 96 (0x000060)
+ Function Length : 27 (0x0001b) Actual length = 108 (0x00006c)
---- Epilog scopes ----
---- Scope 0
Epilog Start Offset : 3523193630 (0xd1ffab1e) Actual offset = 3523193630 (0xd1ffab1e) Offset from main function begin = 3523193630 (0xd1ffab1e) +0 (0.00%) : 188800.dasm - JIT.HardwareIntrinsics.Arm._Sve.SimpleBinaryOpTest__Sve_CreateMaskForNextActiveElement_byte:ConditionalSelect_ZeroOp():this (FullOpts)@@ -135,9 +135,9 @@ G_M16753_IG02: ; bbWeight=1, gcrefRegs=80000 {x19}, byrefRegs=0000 {}, by
ptrue p2.b
cmpne p2.b, p2/z, z10.b, #0
pnext p2.b, p1, p2.b
- mov z16.b, p2/z, #1
- movi v17.4s, #0
- sel z11.b, p0, z16.b, z17.b
+ pfalse p1.b
+ sel p0.b, p0, p2.b, p1.b
+ mov z11.b, p0/z, #1
add x21, x19, #80
; byrRegs +[x21]
mov x22, x21
@@ -315,7 +315,7 @@ G_M16753_IG02: ; bbWeight=1, gcrefRegs=80000 {x19}, byrefRegs=0000 {}, by
movk x1, #0xD1FFAB1E LSL #16
movk x1, #0xD1FFAB1E LSL #32
ldr x1, [x1]
- ;; size=712 bbWeight=1 PerfScore 190.00
+ ;; size=712 bbWeight=1 PerfScore 191.50
G_M16753_IG03: ; bbWeight=1, extend
blr x1
; byrRegs -[x0]
@@ -361,14 +361,14 @@ G_M16753_IG03: ; bbWeight=1, extend
ldr q10, [x19, #0x30]
ptrue p0.b
cmpne p0.b, p0/z, z8.b, #0
- movi v16.4s, #0
- ptrue p1.b
- cmpne p1.b, p1/z, z9.b, #0
+ pfalse p1.b
ptrue p2.b
- cmpne p2.b, p2/z, z10.b, #0
- pnext p2.b, p1, p2.b
- mov z17.b, p2/z, #1
- sel z11.b, p0, z16.b, z17.b
+ cmpne p2.b, p2/z, z9.b, #0
+ ptrue p3.b
+ cmpne p3.b, p3/z, z10.b, #0
+ pnext p3.b, p2, p3.b
+ sel p0.b, p0, p1.b, p3.b
+ mov z11.b, p0/z, #1
mov x22, x21
; byrRegs +[x22]
mov x0, x23
@@ -523,7 +523,7 @@ G_M16753_IG03: ; bbWeight=1, extend
; byrRegs +[x0]
movz x1, #0xD1FFAB1E // code for <unknown method>
movk x1, #0xD1FFAB1E LSL #16
- ;; size=696 bbWeight=1 PerfScore 185.50
+ ;; size=696 bbWeight=1 PerfScore 187.00
G_M16753_IG04: ; bbWeight=1, extend
movk x1, #0xD1FFAB1E LSL #32
ldr x1, [x1]
@@ -578,7 +578,7 @@ G_M16753_IG05: ; bbWeight=1, epilog, nogc, extend
br x3
;; size=36 bbWeight=1 PerfScore 9.00
-; Total bytes of code 1624, prolog size 36, PerfScore 427.00, instruction count 406, allocated bytes for code 1624 (MethodHash=1d46be8e) for method JIT.HardwareIntrinsics.Arm._Sve.SimpleBinaryOpTest__Sve_CreateMaskForNextActiveElement_byte:ConditionalSelect_ZeroOp():this (FullOpts)
+; Total bytes of code 1624, prolog size 36, PerfScore 430.00, instruction count 406, allocated bytes for code 1624 (MethodHash=1d46be8e) for method JIT.HardwareIntrinsics.Arm._Sve.SimpleBinaryOpTest__Sve_CreateMaskForNextActiveElement_byte:ConditionalSelect_ZeroOp():this (FullOpts)
; ============================================================
Unwind Info: +0 (0.00%) : 188607.dasm - JIT.HardwareIntrinsics.Arm._Sve.SimpleBinaryOpTest__Sve_CreateMaskForFirstActiveElement_sbyte:ConditionalSelect_ZeroOp():this (FullOpts)@@ -135,9 +135,9 @@ G_M40735_IG02: ; bbWeight=1, gcrefRegs=80000 {x19}, byrefRegs=0000 {}, by
ptrue p2.b
cmpne p2.b, p2/z, z10.b, #0
pfirst p2.b, p1, p2.b
- mov z16.b, p2/z, #1
- movi v17.4s, #0
- sel z11.b, p0, z16.b, z17.b
+ pfalse p1.b
+ sel p0.b, p0, p2.b, p1.b
+ mov z11.b, p0/z, #1
add x21, x19, #80
; byrRegs +[x21]
mov x22, x21
@@ -315,7 +315,7 @@ G_M40735_IG02: ; bbWeight=1, gcrefRegs=80000 {x19}, byrefRegs=0000 {}, by
movk x1, #0xD1FFAB1E LSL #16
movk x1, #0xD1FFAB1E LSL #32
ldr x1, [x1]
- ;; size=712 bbWeight=1 PerfScore 190.00
+ ;; size=712 bbWeight=1 PerfScore 191.50
G_M40735_IG03: ; bbWeight=1, extend
blr x1
; byrRegs -[x0]
@@ -361,14 +361,14 @@ G_M40735_IG03: ; bbWeight=1, extend
ldr q10, [x19, #0x30]
ptrue p0.b
cmpne p0.b, p0/z, z8.b, #0
- movi v16.4s, #0
- ptrue p1.b
- cmpne p1.b, p1/z, z9.b, #0
+ pfalse p1.b
ptrue p2.b
- cmpne p2.b, p2/z, z10.b, #0
- pfirst p2.b, p1, p2.b
- mov z17.b, p2/z, #1
- sel z11.b, p0, z16.b, z17.b
+ cmpne p2.b, p2/z, z9.b, #0
+ ptrue p3.b
+ cmpne p3.b, p3/z, z10.b, #0
+ pfirst p3.b, p2, p3.b
+ sel p0.b, p0, p1.b, p3.b
+ mov z11.b, p0/z, #1
mov x22, x21
; byrRegs +[x22]
mov x0, x23
@@ -523,7 +523,7 @@ G_M40735_IG03: ; bbWeight=1, extend
; byrRegs +[x0]
movz x1, #0xD1FFAB1E // code for <unknown method>
movk x1, #0xD1FFAB1E LSL #16
- ;; size=696 bbWeight=1 PerfScore 185.50
+ ;; size=696 bbWeight=1 PerfScore 187.00
G_M40735_IG04: ; bbWeight=1, extend
movk x1, #0xD1FFAB1E LSL #32
ldr x1, [x1]
@@ -578,7 +578,7 @@ G_M40735_IG05: ; bbWeight=1, epilog, nogc, extend
br x3
;; size=36 bbWeight=1 PerfScore 9.00
-; Total bytes of code 1624, prolog size 36, PerfScore 427.00, instruction count 406, allocated bytes for code 1624 (MethodHash=626d60e0) for method JIT.HardwareIntrinsics.Arm._Sve.SimpleBinaryOpTest__Sve_CreateMaskForFirstActiveElement_sbyte:ConditionalSelect_ZeroOp():this (FullOpts)
+; Total bytes of code 1624, prolog size 36, PerfScore 430.00, instruction count 406, allocated bytes for code 1624 (MethodHash=626d60e0) for method JIT.HardwareIntrinsics.Arm._Sve.SimpleBinaryOpTest__Sve_CreateMaskForFirstActiveElement_sbyte:ConditionalSelect_ZeroOp():this (FullOpts)
; ============================================================
Unwind Info: DetailsSize improvements/regressions per collection
PerfScore improvements/regressions per collection
Context information
jit-analyze output |
Difference in ZipLowMask from the test case: HEAD:
PR:
|
Difference in TransposeEvenAndMask from the test case: HEAD:
PR:
|
With the latest version there are 0 diffs in size, but plenty of code movement. The new tests do show improvements (see previous comments). Diffs are based on 2,565,859 contexts (1,098,055 MinOpts, 1,467,804 FullOpts). MISSED contexts: 944 (0.04%) Overall (+0 bytes)
FullOpts (+0 bytes)
Example diffs: coreclr_tests.run.linux.arm64.checked.mch +0 (0.00%) : 186049.dasm - JIT.HardwareIntrinsics.Arm._Sve.SimpleTernaryOpTest__Sve_CreateBreakAfterPropagateMask_byte:ConditionalSelect_ZeroOp():this (FullOpts) @@ -143,9 +143,9 @@ G_M40199_IG02: ; bbWeight=1, gcrefRegs=80000 {x19}, byrefRegs=0000 {}, by
ptrue p3.b
cmpne p3.b, p3/z, z11.b, #0
brkpa p1.b, p1/z, p2.b, p3.b
- mov z16.b, p1/z, #1
- movi v17.4s, #0
- sel z12.b, p0, z16.b, z17.b
+ pfalse p2.b
+ sel p0.b, p0, p1.b, p2.b
+ mov z12.b, p0/z, #1
str q12, [fp, #0x20] // [V07 tmp6]
add x21, x19, #96
; byrRegs +[x21]
@@ -321,7 +321,7 @@ G_M40199_IG02: ; bbWeight=1, gcrefRegs=80000 {x19}, byrefRegs=0000 {}, by
blr x1
; byrRegs -[x0]
ldr x1, [x24, #0x20]
- ;; size=716 bbWeight=1 PerfScore 201.50
+ ;; size=716 bbWeight=1 PerfScore 203.00
G_M40199_IG03: ; bbWeight=1, extend
add x0, x0, x1
sub x0, x0, #1
@@ -382,16 +382,16 @@ G_M40199_IG03: ; bbWeight=1, extend
ldp q10, q11, [x19, #0x30]
ptrue p0.b
cmpne p0.b, p0/z, z8.b, #0
- movi v16.4s, #0
- ptrue p1.b
- cmpne p1.b, p1/z, z9.b, #0
+ pfalse p1.b
ptrue p2.b
- cmpne p2.b, p2/z, z10.b, #0
+ cmpne p2.b, p2/z, z9.b, #0
ptrue p3.b
- cmpne p3.b, p3/z, z11.b, #0
- brkpa p1.b, p1/z, p2.b, p3.b
- mov z17.b, p1/z, #1
- sel z12.b, p0, z16.b, z17.b
+ cmpne p3.b, p3/z, z10.b, #0
+ ptrue p4.b
+ cmpne p4.b, p4/z, z11.b, #0
+ brkpa p2.b, p2/z, p3.b, p4.b
+ sel p0.b, p0, p1.b, p2.b
+ mov z12.b, p0/z, #1
str q12, [fp, #0x10] // [V47 tmp46]
mov x22, x21
; byrRegs +[x22]
@@ -534,7 +534,7 @@ G_M40199_IG03: ; bbWeight=1, extend
mov x0, x20
; gcrRegs +[x0]
movz x1, #0xD1FFAB1E
- ;; size=728 bbWeight=1 PerfScore 196.50
+ ;; size=728 bbWeight=1 PerfScore 198.00
G_M40199_IG04: ; bbWeight=1, extend
movk x1, #0xD1FFAB1E LSL #16
movk x1, #0xD1FFAB1E LSL #32
@@ -614,7 +614,7 @@ G_M40199_IG05: ; bbWeight=1, epilog, nogc, extend
br x3
;; size=36 bbWeight=1 PerfScore 9.00
-; Total bytes of code 1744, prolog size 36, PerfScore 472.00, instruction count 436, allocated bytes for code 1744 (MethodHash=469562f8) for method JIT.HardwareIntrinsics.Arm._Sve.SimpleTernaryOpTest__Sve_CreateBreakAfterPropagateMask_byte:ConditionalSelect_ZeroOp():this (FullOpts)
+; Total bytes of code 1744, prolog size 36, PerfScore 475.00, instruction count 436, allocated bytes for code 1744 (MethodHash=469562f8) for method JIT.HardwareIntrinsics.Arm._Sve.SimpleTernaryOpTest__Sve_CreateBreakAfterPropagateMask_byte:ConditionalSelect_ZeroOp():this (FullOpts)
; ============================================================
Unwind Info: +0 (0.00%) : 186370.dasm - JIT.HardwareIntrinsics.Arm._Sve.SimpleBinaryOpTest__Sve_CreateBreakBeforeMask_short:ConditionalSelect_ZeroOp():this (FullOpts)@@ -135,9 +135,9 @@ G_M36609_IG02: ; bbWeight=1, gcrefRegs=80000 {x19}, byrefRegs=0000 {}, by
ptrue p2.h
cmpne p2.h, p2/z, z10.h, #0
brkb p1.b, p1/z, p2.b
- mov z16.h, p1/z, #1
- movi v17.4s, #0
- sel z11.h, p0, z16.h, z17.h
+ pfalse p2.b
+ sel p0.b, p0, p1.b, p2.b
+ mov z11.h, p0/z, #1
add x21, x19, #80
; byrRegs +[x21]
mov x22, x21
@@ -315,7 +315,7 @@ G_M36609_IG02: ; bbWeight=1, gcrefRegs=80000 {x19}, byrefRegs=0000 {}, by
movk x1, #0xD1FFAB1E LSL #16
movk x1, #0xD1FFAB1E LSL #32
ldr x1, [x1]
- ;; size=712 bbWeight=1 PerfScore 190.00
+ ;; size=712 bbWeight=1 PerfScore 191.50
G_M36609_IG03: ; bbWeight=1, extend
blr x1
; byrRegs -[x0]
@@ -361,14 +361,14 @@ G_M36609_IG03: ; bbWeight=1, extend
ldr q10, [x19, #0x30]
ptrue p0.h
cmpne p0.h, p0/z, z8.h, #0
- movi v16.4s, #0
- ptrue p1.h
- cmpne p1.h, p1/z, z9.h, #0
+ pfalse p1.b
ptrue p2.h
- cmpne p2.h, p2/z, z10.h, #0
- brkb p1.b, p1/z, p2.b
- mov z17.h, p1/z, #1
- sel z11.h, p0, z16.h, z17.h
+ cmpne p2.h, p2/z, z9.h, #0
+ ptrue p3.h
+ cmpne p3.h, p3/z, z10.h, #0
+ brkb p2.b, p2/z, p3.b
+ sel p0.b, p0, p1.b, p2.b
+ mov z11.h, p0/z, #1
mov x22, x21
; byrRegs +[x22]
mov x0, x23
@@ -523,7 +523,7 @@ G_M36609_IG03: ; bbWeight=1, extend
; byrRegs +[x0]
movz x1, #0xD1FFAB1E // code for <unknown method>
movk x1, #0xD1FFAB1E LSL #16
- ;; size=696 bbWeight=1 PerfScore 185.50
+ ;; size=696 bbWeight=1 PerfScore 187.00
G_M36609_IG04: ; bbWeight=1, extend
movk x1, #0xD1FFAB1E LSL #32
ldr x1, [x1]
@@ -578,7 +578,7 @@ G_M36609_IG05: ; bbWeight=1, epilog, nogc, extend
br x3
;; size=36 bbWeight=1 PerfScore 9.00
-; Total bytes of code 1624, prolog size 36, PerfScore 427.00, instruction count 406, allocated bytes for code 1624 (MethodHash=e17d70fe) for method JIT.HardwareIntrinsics.Arm._Sve.SimpleBinaryOpTest__Sve_CreateBreakBeforeMask_short:ConditionalSelect_ZeroOp():this (FullOpts)
+; Total bytes of code 1624, prolog size 36, PerfScore 430.00, instruction count 406, allocated bytes for code 1624 (MethodHash=e17d70fe) for method JIT.HardwareIntrinsics.Arm._Sve.SimpleBinaryOpTest__Sve_CreateBreakBeforeMask_short:ConditionalSelect_ZeroOp():this (FullOpts)
; ============================================================
Unwind Info: +0 (0.00%) : 186179.dasm - JIT.HardwareIntrinsics.Arm._Sve.SimpleTernaryOpTest__Sve_CreateBreakAfterPropagateMask_short:ConditionalSelect_ZeroOp():this (FullOpts)@@ -143,9 +143,9 @@ G_M6879_IG02: ; bbWeight=1, gcrefRegs=80000 {x19}, byrefRegs=0000 {}, byr
ptrue p3.h
cmpne p3.h, p3/z, z11.h, #0
brkpa p1.b, p1/z, p2.b, p3.b
- mov z16.h, p1/z, #1
- movi v17.4s, #0
- sel z12.h, p0, z16.h, z17.h
+ pfalse p2.b
+ sel p0.b, p0, p1.b, p2.b
+ mov z12.h, p0/z, #1
str q12, [fp, #0x20] // [V07 tmp6]
add x21, x19, #96
; byrRegs +[x21]
@@ -321,7 +321,7 @@ G_M6879_IG02: ; bbWeight=1, gcrefRegs=80000 {x19}, byrefRegs=0000 {}, byr
blr x1
; byrRegs -[x0]
ldr x1, [x24, #0x20]
- ;; size=716 bbWeight=1 PerfScore 201.50
+ ;; size=716 bbWeight=1 PerfScore 203.00
G_M6879_IG03: ; bbWeight=1, extend
add x0, x0, x1
sub x0, x0, #1
@@ -382,16 +382,16 @@ G_M6879_IG03: ; bbWeight=1, extend
ldp q10, q11, [x19, #0x30]
ptrue p0.h
cmpne p0.h, p0/z, z8.h, #0
- movi v16.4s, #0
- ptrue p1.h
- cmpne p1.h, p1/z, z9.h, #0
+ pfalse p1.b
ptrue p2.h
- cmpne p2.h, p2/z, z10.h, #0
+ cmpne p2.h, p2/z, z9.h, #0
ptrue p3.h
- cmpne p3.h, p3/z, z11.h, #0
- brkpa p1.b, p1/z, p2.b, p3.b
- mov z17.h, p1/z, #1
- sel z12.h, p0, z16.h, z17.h
+ cmpne p3.h, p3/z, z10.h, #0
+ ptrue p4.h
+ cmpne p4.h, p4/z, z11.h, #0
+ brkpa p2.b, p2/z, p3.b, p4.b
+ sel p0.b, p0, p1.b, p2.b
+ mov z12.h, p0/z, #1
str q12, [fp, #0x10] // [V47 tmp46]
mov x22, x21
; byrRegs +[x22]
@@ -534,7 +534,7 @@ G_M6879_IG03: ; bbWeight=1, extend
mov x0, x20
; gcrRegs +[x0]
movz x1, #0xD1FFAB1E
- ;; size=728 bbWeight=1 PerfScore 196.50
+ ;; size=728 bbWeight=1 PerfScore 198.00
G_M6879_IG04: ; bbWeight=1, extend
movk x1, #0xD1FFAB1E LSL #16
movk x1, #0xD1FFAB1E LSL #32
@@ -614,7 +614,7 @@ G_M6879_IG05: ; bbWeight=1, epilog, nogc, extend
br x3
;; size=36 bbWeight=1 PerfScore 9.00
-; Total bytes of code 1744, prolog size 36, PerfScore 472.00, instruction count 436, allocated bytes for code 1744 (MethodHash=74b4e520) for method JIT.HardwareIntrinsics.Arm._Sve.SimpleTernaryOpTest__Sve_CreateBreakAfterPropagateMask_short:ConditionalSelect_ZeroOp():this (FullOpts)
+; Total bytes of code 1744, prolog size 36, PerfScore 475.00, instruction count 436, allocated bytes for code 1744 (MethodHash=74b4e520) for method JIT.HardwareIntrinsics.Arm._Sve.SimpleTernaryOpTest__Sve_CreateBreakAfterPropagateMask_short:ConditionalSelect_ZeroOp():this (FullOpts)
; ============================================================
Unwind Info: +0 (0.00%) : 188800.dasm - JIT.HardwareIntrinsics.Arm._Sve.SimpleBinaryOpTest__Sve_CreateMaskForNextActiveElement_byte:ConditionalSelect_ZeroOp():this (FullOpts)@@ -135,9 +135,9 @@ G_M16753_IG02: ; bbWeight=1, gcrefRegs=80000 {x19}, byrefRegs=0000 {}, by
ptrue p2.b
cmpne p2.b, p2/z, z10.b, #0
pnext p2.b, p1, p2.b
- mov z16.b, p2/z, #1
- movi v17.4s, #0
- sel z11.b, p0, z16.b, z17.b
+ pfalse p1.b
+ sel p0.b, p0, p2.b, p1.b
+ mov z11.b, p0/z, #1
add x21, x19, #80
; byrRegs +[x21]
mov x22, x21
@@ -315,7 +315,7 @@ G_M16753_IG02: ; bbWeight=1, gcrefRegs=80000 {x19}, byrefRegs=0000 {}, by
movk x1, #0xD1FFAB1E LSL #16
movk x1, #0xD1FFAB1E LSL #32
ldr x1, [x1]
- ;; size=712 bbWeight=1 PerfScore 190.00
+ ;; size=712 bbWeight=1 PerfScore 191.50
G_M16753_IG03: ; bbWeight=1, extend
blr x1
; byrRegs -[x0]
@@ -361,14 +361,14 @@ G_M16753_IG03: ; bbWeight=1, extend
ldr q10, [x19, #0x30]
ptrue p0.b
cmpne p0.b, p0/z, z8.b, #0
- movi v16.4s, #0
- ptrue p1.b
- cmpne p1.b, p1/z, z9.b, #0
+ pfalse p1.b
ptrue p2.b
- cmpne p2.b, p2/z, z10.b, #0
- pnext p2.b, p1, p2.b
- mov z17.b, p2/z, #1
- sel z11.b, p0, z16.b, z17.b
+ cmpne p2.b, p2/z, z9.b, #0
+ ptrue p3.b
+ cmpne p3.b, p3/z, z10.b, #0
+ pnext p3.b, p2, p3.b
+ sel p0.b, p0, p1.b, p3.b
+ mov z11.b, p0/z, #1
mov x22, x21
; byrRegs +[x22]
mov x0, x23
@@ -523,7 +523,7 @@ G_M16753_IG03: ; bbWeight=1, extend
; byrRegs +[x0]
movz x1, #0xD1FFAB1E // code for <unknown method>
movk x1, #0xD1FFAB1E LSL #16
- ;; size=696 bbWeight=1 PerfScore 185.50
+ ;; size=696 bbWeight=1 PerfScore 187.00
G_M16753_IG04: ; bbWeight=1, extend
movk x1, #0xD1FFAB1E LSL #32
ldr x1, [x1]
@@ -578,7 +578,7 @@ G_M16753_IG05: ; bbWeight=1, epilog, nogc, extend
br x3
;; size=36 bbWeight=1 PerfScore 9.00
-; Total bytes of code 1624, prolog size 36, PerfScore 427.00, instruction count 406, allocated bytes for code 1624 (MethodHash=1d46be8e) for method JIT.HardwareIntrinsics.Arm._Sve.SimpleBinaryOpTest__Sve_CreateMaskForNextActiveElement_byte:ConditionalSelect_ZeroOp():this (FullOpts)
+; Total bytes of code 1624, prolog size 36, PerfScore 430.00, instruction count 406, allocated bytes for code 1624 (MethodHash=1d46be8e) for method JIT.HardwareIntrinsics.Arm._Sve.SimpleBinaryOpTest__Sve_CreateMaskForNextActiveElement_byte:ConditionalSelect_ZeroOp():this (FullOpts)
; ============================================================
Unwind Info: +0 (0.00%) : 188607.dasm - JIT.HardwareIntrinsics.Arm._Sve.SimpleBinaryOpTest__Sve_CreateMaskForFirstActiveElement_sbyte:ConditionalSelect_ZeroOp():this (FullOpts)@@ -135,9 +135,9 @@ G_M40735_IG02: ; bbWeight=1, gcrefRegs=80000 {x19}, byrefRegs=0000 {}, by
ptrue p2.b
cmpne p2.b, p2/z, z10.b, #0
pfirst p2.b, p1, p2.b
- mov z16.b, p2/z, #1
- movi v17.4s, #0
- sel z11.b, p0, z16.b, z17.b
+ pfalse p1.b
+ sel p0.b, p0, p2.b, p1.b
+ mov z11.b, p0/z, #1
add x21, x19, #80
; byrRegs +[x21]
mov x22, x21
@@ -315,7 +315,7 @@ G_M40735_IG02: ; bbWeight=1, gcrefRegs=80000 {x19}, byrefRegs=0000 {}, by
movk x1, #0xD1FFAB1E LSL #16
movk x1, #0xD1FFAB1E LSL #32
ldr x1, [x1]
- ;; size=712 bbWeight=1 PerfScore 190.00
+ ;; size=712 bbWeight=1 PerfScore 191.50
G_M40735_IG03: ; bbWeight=1, extend
blr x1
; byrRegs -[x0]
@@ -361,14 +361,14 @@ G_M40735_IG03: ; bbWeight=1, extend
ldr q10, [x19, #0x30]
ptrue p0.b
cmpne p0.b, p0/z, z8.b, #0
- movi v16.4s, #0
- ptrue p1.b
- cmpne p1.b, p1/z, z9.b, #0
+ pfalse p1.b
ptrue p2.b
- cmpne p2.b, p2/z, z10.b, #0
- pfirst p2.b, p1, p2.b
- mov z17.b, p2/z, #1
- sel z11.b, p0, z16.b, z17.b
+ cmpne p2.b, p2/z, z9.b, #0
+ ptrue p3.b
+ cmpne p3.b, p3/z, z10.b, #0
+ pfirst p3.b, p2, p3.b
+ sel p0.b, p0, p1.b, p3.b
+ mov z11.b, p0/z, #1
mov x22, x21
; byrRegs +[x22]
mov x0, x23
@@ -523,7 +523,7 @@ G_M40735_IG03: ; bbWeight=1, extend
; byrRegs +[x0]
movz x1, #0xD1FFAB1E // code for <unknown method>
movk x1, #0xD1FFAB1E LSL #16
- ;; size=696 bbWeight=1 PerfScore 185.50
+ ;; size=696 bbWeight=1 PerfScore 187.00
G_M40735_IG04: ; bbWeight=1, extend
movk x1, #0xD1FFAB1E LSL #32
ldr x1, [x1]
@@ -578,7 +578,7 @@ G_M40735_IG05: ; bbWeight=1, epilog, nogc, extend
br x3
;; size=36 bbWeight=1 PerfScore 9.00
-; Total bytes of code 1624, prolog size 36, PerfScore 427.00, instruction count 406, allocated bytes for code 1624 (MethodHash=626d60e0) for method JIT.HardwareIntrinsics.Arm._Sve.SimpleBinaryOpTest__Sve_CreateMaskForFirstActiveElement_sbyte:ConditionalSelect_ZeroOp():this (FullOpts)
+; Total bytes of code 1624, prolog size 36, PerfScore 430.00, instruction count 406, allocated bytes for code 1624 (MethodHash=626d60e0) for method JIT.HardwareIntrinsics.Arm._Sve.SimpleBinaryOpTest__Sve_CreateMaskForFirstActiveElement_sbyte:ConditionalSelect_ZeroOp():this (FullOpts)
; ============================================================
Unwind Info: +0 (0.00%) : 186687.dasm - JIT.HardwareIntrinsics.Arm._Sve.SimpleBinaryOpTest__Sve_CreateBreakPropagateMask_uint:ConditionalSelect_ZeroOp():this (FullOpts)@@ -136,9 +136,9 @@ G_M47239_IG02: ; bbWeight=1, gcrefRegs=80000 {x19}, byrefRegs=0000 {}, by
cmpne p2.s, p2/z, z10.s, #0
ptrue p3.s
brkn p2.b, p3/z, p1.b, p2.b
- mov z16.s, p2/z, #1
- movi v17.4s, #0
- sel z11.s, p0, z16.s, z17.s
+ pfalse p1.b
+ sel p0.b, p0, p2.b, p1.b
+ mov z11.s, p0/z, #1
add x21, x19, #80
; byrRegs +[x21]
mov x22, x21
@@ -317,7 +317,7 @@ G_M47239_IG02: ; bbWeight=1, gcrefRegs=80000 {x19}, byrefRegs=0000 {}, by
movk x1, #0xD1FFAB1E LSL #16
movk x1, #0xD1FFAB1E LSL #32
ldr x1, [x1]
- ;; size=720 bbWeight=1 PerfScore 192.00
+ ;; size=720 bbWeight=1 PerfScore 193.50
G_M47239_IG03: ; bbWeight=1, extend
blr x1
; byrRegs -[x0]
@@ -363,15 +363,15 @@ G_M47239_IG03: ; bbWeight=1, extend
ldr q10, [x19, #0x30]
ptrue p0.s
cmpne p0.s, p0/z, z8.s, #0
- movi v16.4s, #0
- ptrue p1.s
- cmpne p1.s, p1/z, z9.s, #0
+ pfalse p1.b
ptrue p2.s
- cmpne p2.s, p2/z, z10.s, #0
+ cmpne p2.s, p2/z, z9.s, #0
ptrue p3.s
- brkn p2.b, p3/z, p1.b, p2.b
- mov z17.s, p2/z, #1
- sel z11.s, p0, z16.s, z17.s
+ cmpne p3.s, p3/z, z10.s, #0
+ ptrue p4.s
+ brkn p3.b, p4/z, p2.b, p3.b
+ sel p0.b, p0, p1.b, p3.b
+ mov z11.s, p0/z, #1
mov x22, x21
; byrRegs +[x22]
mov x0, x23
@@ -525,7 +525,7 @@ G_M47239_IG03: ; bbWeight=1, extend
; byrRegs +[x20]
mov x0, x23
; byrRegs +[x0]
- ;; size=696 bbWeight=1 PerfScore 186.50
+ ;; size=696 bbWeight=1 PerfScore 188.00
G_M47239_IG04: ; bbWeight=1, extend
movz x1, #0xD1FFAB1E // code for <unknown method>
movk x1, #0xD1FFAB1E LSL #16
@@ -582,7 +582,7 @@ G_M47239_IG05: ; bbWeight=1, epilog, nogc, extend
br x3
;; size=36 bbWeight=1 PerfScore 9.00
-; Total bytes of code 1640, prolog size 36, PerfScore 431.00, instruction count 410, allocated bytes for code 1640 (MethodHash=35534778) for method JIT.HardwareIntrinsics.Arm._Sve.SimpleBinaryOpTest__Sve_CreateBreakPropagateMask_uint:ConditionalSelect_ZeroOp():this (FullOpts)
+; Total bytes of code 1640, prolog size 36, PerfScore 434.00, instruction count 410, allocated bytes for code 1640 (MethodHash=35534778) for method JIT.HardwareIntrinsics.Arm._Sve.SimpleBinaryOpTest__Sve_CreateBreakPropagateMask_uint:ConditionalSelect_ZeroOp():this (FullOpts)
; ============================================================
Unwind Info: DetailsSize improvements/regressions per collection
PerfScore improvements/regressions per collection
Context information
jit-analyze output |
@dotnet/arm64-contrib @kunalspathak @tannergooding |
and @amanasifkhalid |
Fixes #101970
Predicate variants were implemented in #114438 and then turned off in #115566. The code was then removed in #117101 when the AMD64 version was moved from morph to folding.
This is a simple rework of that code.
Replaces #116854