-
Notifications
You must be signed in to change notification settings - Fork 12.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Isel x86/Aarch64] i320+ generates useless 33% extra instructions when chaining sub.with.overflow and select / conditional moves. #103717
Comments
…imal code (and fail for i320 and i384 llvm/llvm-project#103717)
* feat(LLVM): add codegenerator for saturated field add/sub * LLVM: WIP refactor - boilerplate, linkage, assembly sections, ... * feat(llvm): try (and fail) to workaround bad modular addition codegen with inline function. * llvm: partial workaround failure around https://github.com/llvm/llvm-project/issues/102868\#issuecomment-2284935755 module inlining breaks machine instruction fusion * llvm: define our own addcarry/subborrow which properly optimize on x86 (but not ARM see llvm/llvm-project#102062) * llvm: use builtin llvm.uadd.with.overflow.iXXX to try to generate optimal code (and fail for i320 and i384 llvm/llvm-project#103717)
@llvm/issue-subscribers-backend-x86 Author: Mamy Ratsimbazafy (mratsim)
Yet another experiment to properly generate efficient modular addition in pure LLVM IR in at least x86 and arm without inline assembly (follow-up of #102062, #102868). This time using the intrinsics llvm.uadd.with.overflow.iXXX.
The code is optimal on both x86 and ARM for i256 but has extra sbb, sbb, or sequence for i320 or i384. The LLVM IR has a similar structure so I assume it's at the MIR level that some threshold is reached. Ignoring the moves, just from an algorithm point of view, on 320-bit modular addition that needs 5 add/adc + 5 sub/sbb + 5 cmov, 5 extra instructions so 33% extra are generated. Test case: https://alive2.llvm.org/ce/z/fV77tb ; ModuleID = 'x86_poc'
source_filename = "x86_poc"
; target triple = "arm64"
target triple = "x86_64"
@bn254_snarks_fp_mod = constant i256 21888242871839275222246405745257275088696311157297823662689037894645226208583, section "ctt.bn254_snarks_fp.constants", align 64
@bls12_381_fp_mod = constant i384 4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559787, section "ctt.bls12_381_fp.constants", align 64
@bls24_317_fp_mod = constant i320 136393071104295911515099765908274057061945112121419593977210139303905973197232025618026156731051, section "ctt.bls24_317_fp.constants", align 64
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare { i256, i1 } @llvm.uadd.with.overflow.i256(i256, i256) #0
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare { i256, i1 } @llvm.usub.with.overflow.i256(i256, i256) #0
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare { i384, i1 } @llvm.uadd.with.overflow.i384(i384, i384) #0
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare { i384, i1 } @llvm.usub.with.overflow.i384(i384, i384) #0
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare { i320, i1 } @llvm.uadd.with.overflow.i320(i320, i320) #0
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare { i320, i1 } @llvm.usub.with.overflow.i320(i320, i320) #0
; Function Attrs: hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write)
define internal fastcc void @_modadd_noo.u64x4(ptr noalias nocapture writeonly %0, i256 %.0.val, i256 %.0.val1, i256 %.0.val3) #2 section "ctt.fields" {
%2 = call { i256, i1 } @llvm.uadd.with.overflow.i256(i256 %.0.val, i256 %.0.val1)
%.lo = extractvalue { i256, i1 } %2, 0
%3 = call { i256, i1 } @llvm.usub.with.overflow.i256(i256 %.lo, i256 %.0.val3)
%.lo1 = extractvalue { i256, i1 } %3, 0
%.borrow = extractvalue { i256, i1 } %3, 1
%4 = select i1 %.borrow, i256 %.lo, i256 %.lo1
store i256 %4, ptr %0, align 4
ret void
}
; Function Attrs: hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write)
define internal fastcc void @_modadd_noo.u64x5(ptr noalias nocapture writeonly %0, i320 %.0.val, i320 %.0.val1, i320 %.0.val3) #2 section "ctt.fields" {
%2 = call { i320, i1 } @llvm.uadd.with.overflow.i320(i320 %.0.val, i320 %.0.val1)
%.lo = extractvalue { i320, i1 } %2, 0
%3 = call { i320, i1 } @llvm.usub.with.overflow.i320(i320 %.lo, i320 %.0.val3)
%.lo1 = extractvalue { i320, i1 } %3, 0
%.borrow = extractvalue { i320, i1 } %3, 1
%4 = select i1 %.borrow, i320 %.lo, i320 %.lo1
store i320 %4, ptr %0, align 4
ret void
}
; Function Attrs: hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write)
define internal fastcc void @_modadd_noo.u64x6(ptr noalias nocapture writeonly %0, i384 %.0.val, i384 %.0.val1, i384 %.0.val3) #2 section "ctt.fields" {
%2 = call { i384, i1 } @llvm.uadd.with.overflow.i384(i384 %.0.val, i384 %.0.val1)
%.lo = extractvalue { i384, i1 } %2, 0
%3 = call { i384, i1 } @llvm.usub.with.overflow.i384(i384 %.lo, i384 %.0.val3)
%.lo1 = extractvalue { i384, i1 } %3, 0
%.borrow = extractvalue { i384, i1 } %3, 1
%4 = select i1 %.borrow, i384 %.lo, i384 %.lo1
store i384 %4, ptr %0, align 4
ret void
}
; Function Attrs: hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define void @bn254_snarks_fp_add(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) #1 section "ctt.bn254_snarks_fp" {
%.val = load i256, ptr %1, align 4
%.val1 = load i256, ptr %2, align 4
call fastcc void @_modadd_noo.u64x4(ptr noalias %0, i256 %.val, i256 %.val1, i256 21888242871839275222246405745257275088696311157297823662689037894645226208583)
ret void
}
; Function Attrs: hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define void @bls24_317_fp_add(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) #1 section "ctt.bls24_317_fp" {
%.val = load i320, ptr %1, align 4
%.val1 = load i320, ptr %2, align 4
call fastcc void @_modadd_noo.u64x5(ptr noalias %0, i320 %.val, i320 %.val1, i320 136393071104295911515099765908274057061945112121419593977210139303905973197232025618026156731051)
ret void
}
; Function Attrs: hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define void @bls12_381_fp_add(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) #1 section "ctt.bls12_381_fp" {
%.val = load i384, ptr %1, align 4
%.val1 = load i384, ptr %2, align 4
call fastcc void @_modadd_noo.u64x6(ptr noalias %0, i384 %.val, i384 %.val1, i384 4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559787)
ret void
}
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #1 = { hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) }
attributes #2 = { hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) } Details i256; Function Attrs: hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write)
define internal fastcc void @_modadd_noo.u64x4(ptr noalias nocapture writeonly %0, i256 %.0.val, i256 %.0.val1, i256 %.0.val3) #2 section "ctt.fields" {
%2 = call { i256, i1 } @llvm.uadd.with.overflow.i256(i256 %.0.val, i256 %.0.val1)
%.lo = extractvalue { i256, i1 } %2, 0
%3 = call { i256, i1 } @llvm.usub.with.overflow.i256(i256 %.lo, i256 %.0.val3)
%.lo1 = extractvalue { i256, i1 } %3, 0
%.borrow = extractvalue { i256, i1 } %3, 1
%4 = select i1 %.borrow, i256 %.lo, i256 %.lo1
store i256 %4, ptr %0, align 4
ret void
} Generated on x86: _modadd_noo.u64x4: # @_modadd_noo.u64x4
add rsi, qword ptr [rsp + 8]
adc rdx, qword ptr [rsp + 16]
adc rcx, qword ptr [rsp + 24]
adc r8, qword ptr [rsp + 32]
mov rax, rsi
sub rax, qword ptr [rsp + 40]
mov r9, rdx
sbb r9, qword ptr [rsp + 48]
mov r10, rcx
sbb r10, qword ptr [rsp + 56]
mov r11, r8
sbb r11, qword ptr [rsp + 64]
cmovb r10, rcx
cmovb r11, r8
cmovb rax, rsi
cmovb r9, rdx
mov qword ptr [rdi + 24], r11
mov qword ptr [rdi + 16], r10
mov qword ptr [rdi + 8], r9
mov qword ptr [rdi], rax
ret Notice how sub+3xsbb flows into cmovb. On ARM64
subs and 3xsbs flow into conditional select Details i320; Function Attrs: hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write)
define internal fastcc void @_modadd_noo.u64x5(ptr noalias nocapture writeonly %0, i320 %.0.val, i320 %.0.val1, i320 %.0.val3) #2 section "ctt.fields" {
%2 = call { i320, i1 } @llvm.uadd.with.overflow.i320(i320 %.0.val, i320 %.0.val1)
%.lo = extractvalue { i320, i1 } %2, 0
%3 = call { i320, i1 } @llvm.usub.with.overflow.i320(i320 %.lo, i320 %.0.val3)
%.lo1 = extractvalue { i320, i1 } %3, 0
%.borrow = extractvalue { i320, i1 } %3, 1
%4 = select i1 %.borrow, i320 %.lo, i320 %.lo1
store i320 %4, ptr %0, align 4
ret void
} The structure is the same as i256 so you would expect the same optimized assembly, however on x86-64 _modadd_noo.u64x5: # @_modadd_noo.u64x5
push r15
push r14
push r13
push r12
push rbx
add rsi, qword ptr [rsp + 48]
adc rdx, qword ptr [rsp + 56]
adc rcx, qword ptr [rsp + 64]
adc r8, qword ptr [rsp + 72]
adc r9, qword ptr [rsp + 80]
xor r10d, r10d
mov rax, rsi
sub rax, qword ptr [rsp + 96]
mov r11, rdx
sbb r11, qword ptr [rsp + 104]
mov rbx, rcx
sbb rbx, qword ptr [rsp + 112]
mov r14, r8
sbb r14, qword ptr [rsp + 120]
mov r15, r9
sbb r15, qword ptr [rsp + 128]
mov r12d, 0
sbb r12, r12
mov r13d, 0
sbb r13, r13
sbb r10, r10
or r10, r12
or r10, r13
cmovne r15, r9
cmovne rax, rsi
cmovne r11, rdx
cmovne rbx, rcx
cmovne r14, r8
mov qword ptr [rdi + 24], r14
mov qword ptr [rdi + 16], rbx
mov qword ptr [rdi + 8], r11
mov qword ptr [rdi], rax
mov qword ptr [rdi + 32], r15
pop rbx
pop r12
pop r13
pop r14
pop r15
ret You have sub+4 useful sbb but then this whole sequence before cmovne is extremely wasteful mov r12d, 0
sbb r12, r12
mov r13d, 0
sbb r13, r13
sbb r10, r10
or r10, r12
or r10, r13 And on ARM64 it's the same _modadd_noo.u64x5: // @_modadd_noo.u64x5
ldp x8, x9, [sp]
ldr x13, [sp, #32]
ldp x10, x11, [sp, #16]
ldr x17, [sp, #80]
ldp x12, x14, [sp, #48]
adds x8, x2, x8
ldp x15, x16, [sp, #64]
adcs x9, x3, x9
adcs x10, x4, x10
adcs x11, x5, x11
adc x13, x6, x13
subs x12, x8, x12
sbcs x14, x9, x14
sbcs x15, x10, x15
sbcs x16, x11, x16
sbcs x17, x13, x17
ngcs x18, xzr
ngcs x1, xzr
ngc x2, xzr
orr x18, x1, x18
orr x18, x18, x2
cmp x18, #0
csel x9, x9, x14, ne
csel x8, x8, x12, ne
csel x11, x11, x16, ne
csel x10, x10, x15, ne
stp x8, x9, [x0]
csel x9, x13, x17, ne
stp x10, x11, [x0, #16]
str x9, [x0, #32]
ret This is totally unnecessary
|
@llvm/issue-subscribers-backend-aarch64 Author: Mamy Ratsimbazafy (mratsim)
Yet another experiment to properly generate efficient modular addition in pure LLVM IR in at least x86 and arm without inline assembly (follow-up of #102062, #102868). This time using the intrinsics llvm.uadd.with.overflow.iXXX.
The code is optimal on both x86 and ARM for i256 but has extra sbb, sbb, or sequence for i320 or i384. The LLVM IR has a similar structure so I assume it's at the MIR level that some threshold is reached. Ignoring the moves, just from an algorithm point of view, on 320-bit modular addition that needs 5 add/adc + 5 sub/sbb + 5 cmov, 5 extra instructions so 33% extra are generated. Test case: https://alive2.llvm.org/ce/z/fV77tb ; ModuleID = 'x86_poc'
source_filename = "x86_poc"
; target triple = "arm64"
target triple = "x86_64"
@<!-- -->bn254_snarks_fp_mod = constant i256 21888242871839275222246405745257275088696311157297823662689037894645226208583, section "ctt.bn254_snarks_fp.constants", align 64
@<!-- -->bls12_381_fp_mod = constant i384 4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559787, section "ctt.bls12_381_fp.constants", align 64
@<!-- -->bls24_317_fp_mod = constant i320 136393071104295911515099765908274057061945112121419593977210139303905973197232025618026156731051, section "ctt.bls24_317_fp.constants", align 64
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare { i256, i1 } @<!-- -->llvm.uadd.with.overflow.i256(i256, i256) #<!-- -->0
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare { i256, i1 } @<!-- -->llvm.usub.with.overflow.i256(i256, i256) #<!-- -->0
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare { i384, i1 } @<!-- -->llvm.uadd.with.overflow.i384(i384, i384) #<!-- -->0
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare { i384, i1 } @<!-- -->llvm.usub.with.overflow.i384(i384, i384) #<!-- -->0
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare { i320, i1 } @<!-- -->llvm.uadd.with.overflow.i320(i320, i320) #<!-- -->0
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare { i320, i1 } @<!-- -->llvm.usub.with.overflow.i320(i320, i320) #<!-- -->0
; Function Attrs: hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write)
define internal fastcc void @<!-- -->_modadd_noo.u64x4(ptr noalias nocapture writeonly %0, i256 %.0.val, i256 %.0.val1, i256 %.0.val3) #<!-- -->2 section "ctt.fields" {
%2 = call { i256, i1 } @<!-- -->llvm.uadd.with.overflow.i256(i256 %.0.val, i256 %.0.val1)
%.lo = extractvalue { i256, i1 } %2, 0
%3 = call { i256, i1 } @<!-- -->llvm.usub.with.overflow.i256(i256 %.lo, i256 %.0.val3)
%.lo1 = extractvalue { i256, i1 } %3, 0
%.borrow = extractvalue { i256, i1 } %3, 1
%4 = select i1 %.borrow, i256 %.lo, i256 %.lo1
store i256 %4, ptr %0, align 4
ret void
}
; Function Attrs: hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write)
define internal fastcc void @<!-- -->_modadd_noo.u64x5(ptr noalias nocapture writeonly %0, i320 %.0.val, i320 %.0.val1, i320 %.0.val3) #<!-- -->2 section "ctt.fields" {
%2 = call { i320, i1 } @<!-- -->llvm.uadd.with.overflow.i320(i320 %.0.val, i320 %.0.val1)
%.lo = extractvalue { i320, i1 } %2, 0
%3 = call { i320, i1 } @<!-- -->llvm.usub.with.overflow.i320(i320 %.lo, i320 %.0.val3)
%.lo1 = extractvalue { i320, i1 } %3, 0
%.borrow = extractvalue { i320, i1 } %3, 1
%4 = select i1 %.borrow, i320 %.lo, i320 %.lo1
store i320 %4, ptr %0, align 4
ret void
}
; Function Attrs: hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write)
define internal fastcc void @<!-- -->_modadd_noo.u64x6(ptr noalias nocapture writeonly %0, i384 %.0.val, i384 %.0.val1, i384 %.0.val3) #<!-- -->2 section "ctt.fields" {
%2 = call { i384, i1 } @<!-- -->llvm.uadd.with.overflow.i384(i384 %.0.val, i384 %.0.val1)
%.lo = extractvalue { i384, i1 } %2, 0
%3 = call { i384, i1 } @<!-- -->llvm.usub.with.overflow.i384(i384 %.lo, i384 %.0.val3)
%.lo1 = extractvalue { i384, i1 } %3, 0
%.borrow = extractvalue { i384, i1 } %3, 1
%4 = select i1 %.borrow, i384 %.lo, i384 %.lo1
store i384 %4, ptr %0, align 4
ret void
}
; Function Attrs: hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define void @<!-- -->bn254_snarks_fp_add(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) #<!-- -->1 section "ctt.bn254_snarks_fp" {
%.val = load i256, ptr %1, align 4
%.val1 = load i256, ptr %2, align 4
call fastcc void @<!-- -->_modadd_noo.u64x4(ptr noalias %0, i256 %.val, i256 %.val1, i256 21888242871839275222246405745257275088696311157297823662689037894645226208583)
ret void
}
; Function Attrs: hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define void @<!-- -->bls24_317_fp_add(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) #<!-- -->1 section "ctt.bls24_317_fp" {
%.val = load i320, ptr %1, align 4
%.val1 = load i320, ptr %2, align 4
call fastcc void @<!-- -->_modadd_noo.u64x5(ptr noalias %0, i320 %.val, i320 %.val1, i320 136393071104295911515099765908274057061945112121419593977210139303905973197232025618026156731051)
ret void
}
; Function Attrs: hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define void @<!-- -->bls12_381_fp_add(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) #<!-- -->1 section "ctt.bls12_381_fp" {
%.val = load i384, ptr %1, align 4
%.val1 = load i384, ptr %2, align 4
call fastcc void @<!-- -->_modadd_noo.u64x6(ptr noalias %0, i384 %.val, i384 %.val1, i384 4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559787)
ret void
}
attributes #<!-- -->0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #<!-- -->1 = { hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) }
attributes #<!-- -->2 = { hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) } Details i256; Function Attrs: hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write)
define internal fastcc void @<!-- -->_modadd_noo.u64x4(ptr noalias nocapture writeonly %0, i256 %.0.val, i256 %.0.val1, i256 %.0.val3) #<!-- -->2 section "ctt.fields" {
%2 = call { i256, i1 } @<!-- -->llvm.uadd.with.overflow.i256(i256 %.0.val, i256 %.0.val1)
%.lo = extractvalue { i256, i1 } %2, 0
%3 = call { i256, i1 } @<!-- -->llvm.usub.with.overflow.i256(i256 %.lo, i256 %.0.val3)
%.lo1 = extractvalue { i256, i1 } %3, 0
%.borrow = extractvalue { i256, i1 } %3, 1
%4 = select i1 %.borrow, i256 %.lo, i256 %.lo1
store i256 %4, ptr %0, align 4
ret void
} Generated on x86: _modadd_noo.u64x4: # @<!-- -->_modadd_noo.u64x4
add rsi, qword ptr [rsp + 8]
adc rdx, qword ptr [rsp + 16]
adc rcx, qword ptr [rsp + 24]
adc r8, qword ptr [rsp + 32]
mov rax, rsi
sub rax, qword ptr [rsp + 40]
mov r9, rdx
sbb r9, qword ptr [rsp + 48]
mov r10, rcx
sbb r10, qword ptr [rsp + 56]
mov r11, r8
sbb r11, qword ptr [rsp + 64]
cmovb r10, rcx
cmovb r11, r8
cmovb rax, rsi
cmovb r9, rdx
mov qword ptr [rdi + 24], r11
mov qword ptr [rdi + 16], r10
mov qword ptr [rdi + 8], r9
mov qword ptr [rdi], rax
ret Notice how sub+3xsbb flows into cmovb. On ARM64
subs and 3xsbs flow into conditional select Details i320; Function Attrs: hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write)
define internal fastcc void @<!-- -->_modadd_noo.u64x5(ptr noalias nocapture writeonly %0, i320 %.0.val, i320 %.0.val1, i320 %.0.val3) #<!-- -->2 section "ctt.fields" {
%2 = call { i320, i1 } @<!-- -->llvm.uadd.with.overflow.i320(i320 %.0.val, i320 %.0.val1)
%.lo = extractvalue { i320, i1 } %2, 0
%3 = call { i320, i1 } @<!-- -->llvm.usub.with.overflow.i320(i320 %.lo, i320 %.0.val3)
%.lo1 = extractvalue { i320, i1 } %3, 0
%.borrow = extractvalue { i320, i1 } %3, 1
%4 = select i1 %.borrow, i320 %.lo, i320 %.lo1
store i320 %4, ptr %0, align 4
ret void
} The structure is the same as i256 so you would expect the same optimized assembly, however on x86-64 _modadd_noo.u64x5: # @<!-- -->_modadd_noo.u64x5
push r15
push r14
push r13
push r12
push rbx
add rsi, qword ptr [rsp + 48]
adc rdx, qword ptr [rsp + 56]
adc rcx, qword ptr [rsp + 64]
adc r8, qword ptr [rsp + 72]
adc r9, qword ptr [rsp + 80]
xor r10d, r10d
mov rax, rsi
sub rax, qword ptr [rsp + 96]
mov r11, rdx
sbb r11, qword ptr [rsp + 104]
mov rbx, rcx
sbb rbx, qword ptr [rsp + 112]
mov r14, r8
sbb r14, qword ptr [rsp + 120]
mov r15, r9
sbb r15, qword ptr [rsp + 128]
mov r12d, 0
sbb r12, r12
mov r13d, 0
sbb r13, r13
sbb r10, r10
or r10, r12
or r10, r13
cmovne r15, r9
cmovne rax, rsi
cmovne r11, rdx
cmovne rbx, rcx
cmovne r14, r8
mov qword ptr [rdi + 24], r14
mov qword ptr [rdi + 16], rbx
mov qword ptr [rdi + 8], r11
mov qword ptr [rdi], rax
mov qword ptr [rdi + 32], r15
pop rbx
pop r12
pop r13
pop r14
pop r15
ret You have sub+4 useful sbb but then this whole sequence before cmovne is extremely wasteful mov r12d, 0
sbb r12, r12
mov r13d, 0
sbb r13, r13
sbb r10, r10
or r10, r12
or r10, r13 And on ARM64 it's the same _modadd_noo.u64x5: // @<!-- -->_modadd_noo.u64x5
ldp x8, x9, [sp]
ldr x13, [sp, #<!-- -->32]
ldp x10, x11, [sp, #<!-- -->16]
ldr x17, [sp, #<!-- -->80]
ldp x12, x14, [sp, #<!-- -->48]
adds x8, x2, x8
ldp x15, x16, [sp, #<!-- -->64]
adcs x9, x3, x9
adcs x10, x4, x10
adcs x11, x5, x11
adc x13, x6, x13
subs x12, x8, x12
sbcs x14, x9, x14
sbcs x15, x10, x15
sbcs x16, x11, x16
sbcs x17, x13, x17
ngcs x18, xzr
ngcs x1, xzr
ngc x2, xzr
orr x18, x1, x18
orr x18, x18, x2
cmp x18, #<!-- -->0
csel x9, x9, x14, ne
csel x8, x8, x12, ne
csel x11, x11, x16, ne
csel x10, x10, x15, ne
stp x8, x9, [x0]
csel x9, x13, x17, ne
stp x10, x11, [x0, #<!-- -->16]
str x9, [x0, #<!-- -->32]
ret This is totally unnecessary
|
Yet another experiment to properly generate efficient modular addition in pure LLVM IR in at least x86 and arm without inline assembly (follow-up of #102062, #102868). This time using the intrinsics llvm.uadd.with.overflow.iXXX.
The code is optimal on both x86 and ARM for i256 but has extra sbb, sbb, or sequence for i320 or i384. The LLVM IR has a similar structure so I assume it's at the MIR level that some threshold is reached.
Ignoring the moves, just from an algorithm point of view, on 320-bit modular addition that needs 5 add/adc + 5 sub/sbb + 5 cmov, 5 extra instructions so 33% extra are generated.
Test case: https://alive2.llvm.org/ce/z/fV77tb
Details i256
Generated on x86:
Notice how sub+3xsbb flows into cmovb.
On ARM64
subs and 3xsbs flow into conditional select
Details i320
The structure is the same as i256 so you would expect the same optimized assembly, however
on x86-64
You have sub+4 useful sbb but then this whole sequence before cmovne is extremely wasteful
And on ARM64 it's the same
This is totally unnecessary
The text was updated successfully, but these errors were encountered: