mirror of https://github.com/lukechilds/node.git
8 changed files with 3585 additions and 0 deletions
@ -0,0 +1,295 @@ |
|||
.text |
|||
|
|||
|
|||
# _mul_1x1: local helper producing a 128-bit carry-less (XOR-accumulated)
# product of two 64-bit values.
# In:    %rax = first operand, %rbp = second operand,
#        %r8  = nibble mask (the caller in this file passes 15)
# Out:   %rax = low 64 bits, %rdx = high 64 bits
# Uses:  %rbx, %rcx, %rsi, %rdi, %rbp, %r9-%r14, %xmm0, %xmm1, flags and a
#        136-byte stack frame; none of them are saved here.
.type _mul_1x1,@function |
|||
.align 16 |
|||
_mul_1x1: |
|||
# Reserve the frame; 0..120(%rsp) becomes a 16-entry table of quadwords.
subq $128+8,%rsp |
|||
# %r9 = first operand with its top three bits cleared; %r10/%r11 = %r9<<1/<<2,
# %r12 = operand<<3.  Interleaved with that, each of the top three operand
# bits is turned into an all-ones/zero mask (sarq $63), ANDed with the second
# operand and folded into %rdx:%rax at bit offsets 63, 62 and 61.
movq $-1,%r9 |
|||
leaq (%rax,%rax,1),%rsi |
|||
shrq $3,%r9 |
|||
leaq (,%rax,4),%rdi |
|||
andq %rax,%r9 |
|||
leaq (,%rax,8),%r12 |
|||
sarq $63,%rax |
|||
leaq (%r9,%r9,1),%r10 |
|||
sarq $63,%rsi |
|||
leaq (,%r9,4),%r11 |
|||
andq %rbp,%rax |
|||
sarq $63,%rdi |
|||
movq %rax,%rdx |
|||
shlq $63,%rax |
|||
andq %rbp,%rsi |
|||
shrq $1,%rdx |
|||
movq %rsi,%rcx |
|||
shlq $62,%rsi |
|||
andq %rbp,%rdi |
|||
shrq $2,%rcx |
|||
xorq %rsi,%rax |
|||
movq %rdi,%rbx |
|||
shlq $61,%rdi |
|||
xorq %rcx,%rdx |
|||
shrq $3,%rbx |
|||
xorq %rdi,%rax |
|||
xorq %rbx,%rdx |
|||
|
|||
# Fill the table: slot i (at 8*i(%rsp)) receives the XOR of the shifted
# copies %r9, %r10, %r11, %r12 selected by the bits of i (slot 0 is zero).
movq %r9,%r13 |
|||
movq $0,0(%rsp) |
|||
xorq %r10,%r13 |
|||
movq %r9,8(%rsp) |
|||
movq %r11,%r14 |
|||
movq %r10,16(%rsp) |
|||
xorq %r12,%r14 |
|||
movq %r13,24(%rsp) |
|||
|
|||
xorq %r11,%r9 |
|||
movq %r11,32(%rsp) |
|||
xorq %r11,%r10 |
|||
movq %r9,40(%rsp) |
|||
xorq %r11,%r13 |
|||
movq %r10,48(%rsp) |
|||
xorq %r14,%r9 |
|||
movq %r13,56(%rsp) |
|||
xorq %r14,%r10 |
|||
|
|||
movq %r12,64(%rsp) |
|||
xorq %r14,%r13 |
|||
movq %r9,72(%rsp) |
|||
xorq %r11,%r9 |
|||
movq %r10,80(%rsp) |
|||
xorq %r11,%r10 |
|||
movq %r13,88(%rsp) |
|||
|
|||
xorq %r11,%r13 |
|||
movq %r14,96(%rsp) |
|||
# From here the second operand (%rbp) is consumed four bits at a time:
# (%rbp & %r8) indexes the table, then %rbp is shifted right by 4.
# Even-numbered nibbles are accumulated in %xmm0 using whole-byte shifts
# (pslldq $1..$7); odd-numbered nibbles are accumulated in %rdx:%rax using
# shlq/shrq pairs whose counts sum to 64 (4/60, 12/52, ... 60/4).
movq %r8,%rsi |
|||
movq %r9,104(%rsp) |
|||
andq %rbp,%rsi |
|||
movq %r10,112(%rsp) |
|||
shrq $4,%rbp |
|||
movq %r13,120(%rsp) |
|||
movq %r8,%rdi |
|||
andq %rbp,%rdi |
|||
shrq $4,%rbp |
|||
|
|||
movq (%rsp,%rsi,8),%xmm0 |
|||
movq %r8,%rsi |
|||
andq %rbp,%rsi |
|||
shrq $4,%rbp |
|||
movq (%rsp,%rdi,8),%rcx |
|||
movq %r8,%rdi |
|||
movq %rcx,%rbx |
|||
shlq $4,%rcx |
|||
andq %rbp,%rdi |
|||
movq (%rsp,%rsi,8),%xmm1 |
|||
shrq $60,%rbx |
|||
xorq %rcx,%rax |
|||
pslldq $1,%xmm1 |
|||
movq %r8,%rsi |
|||
shrq $4,%rbp |
|||
xorq %rbx,%rdx |
|||
andq %rbp,%rsi |
|||
shrq $4,%rbp |
|||
pxor %xmm1,%xmm0 |
|||
movq (%rsp,%rdi,8),%rcx |
|||
movq %r8,%rdi |
|||
movq %rcx,%rbx |
|||
shlq $12,%rcx |
|||
andq %rbp,%rdi |
|||
movq (%rsp,%rsi,8),%xmm1 |
|||
shrq $52,%rbx |
|||
xorq %rcx,%rax |
|||
pslldq $2,%xmm1 |
|||
movq %r8,%rsi |
|||
shrq $4,%rbp |
|||
xorq %rbx,%rdx |
|||
andq %rbp,%rsi |
|||
shrq $4,%rbp |
|||
pxor %xmm1,%xmm0 |
|||
movq (%rsp,%rdi,8),%rcx |
|||
movq %r8,%rdi |
|||
movq %rcx,%rbx |
|||
shlq $20,%rcx |
|||
andq %rbp,%rdi |
|||
movq (%rsp,%rsi,8),%xmm1 |
|||
shrq $44,%rbx |
|||
xorq %rcx,%rax |
|||
pslldq $3,%xmm1 |
|||
movq %r8,%rsi |
|||
shrq $4,%rbp |
|||
xorq %rbx,%rdx |
|||
andq %rbp,%rsi |
|||
shrq $4,%rbp |
|||
pxor %xmm1,%xmm0 |
|||
movq (%rsp,%rdi,8),%rcx |
|||
movq %r8,%rdi |
|||
movq %rcx,%rbx |
|||
shlq $28,%rcx |
|||
andq %rbp,%rdi |
|||
movq (%rsp,%rsi,8),%xmm1 |
|||
shrq $36,%rbx |
|||
xorq %rcx,%rax |
|||
pslldq $4,%xmm1 |
|||
movq %r8,%rsi |
|||
shrq $4,%rbp |
|||
xorq %rbx,%rdx |
|||
andq %rbp,%rsi |
|||
shrq $4,%rbp |
|||
pxor %xmm1,%xmm0 |
|||
movq (%rsp,%rdi,8),%rcx |
|||
movq %r8,%rdi |
|||
movq %rcx,%rbx |
|||
shlq $36,%rcx |
|||
andq %rbp,%rdi |
|||
movq (%rsp,%rsi,8),%xmm1 |
|||
shrq $28,%rbx |
|||
xorq %rcx,%rax |
|||
pslldq $5,%xmm1 |
|||
movq %r8,%rsi |
|||
shrq $4,%rbp |
|||
xorq %rbx,%rdx |
|||
andq %rbp,%rsi |
|||
shrq $4,%rbp |
|||
pxor %xmm1,%xmm0 |
|||
movq (%rsp,%rdi,8),%rcx |
|||
movq %r8,%rdi |
|||
movq %rcx,%rbx |
|||
shlq $44,%rcx |
|||
andq %rbp,%rdi |
|||
movq (%rsp,%rsi,8),%xmm1 |
|||
shrq $20,%rbx |
|||
xorq %rcx,%rax |
|||
pslldq $6,%xmm1 |
|||
movq %r8,%rsi |
|||
shrq $4,%rbp |
|||
xorq %rbx,%rdx |
|||
andq %rbp,%rsi |
|||
shrq $4,%rbp |
|||
pxor %xmm1,%xmm0 |
|||
movq (%rsp,%rdi,8),%rcx |
|||
movq %r8,%rdi |
|||
movq %rcx,%rbx |
|||
shlq $52,%rcx |
|||
andq %rbp,%rdi |
|||
movq (%rsp,%rsi,8),%xmm1 |
|||
shrq $12,%rbx |
|||
xorq %rcx,%rax |
|||
pslldq $7,%xmm1 |
|||
movq %r8,%rsi |
|||
shrq $4,%rbp |
|||
xorq %rbx,%rdx |
|||
andq %rbp,%rsi |
|||
shrq $4,%rbp |
|||
pxor %xmm1,%xmm0 |
|||
movq (%rsp,%rdi,8),%rcx |
|||
movq %rcx,%rbx |
|||
shlq $60,%rcx |
|||
# Encoded 66 48 0F 7E C6 = movq %xmm0,%rsi (low half of the SSE accumulator).
.byte 102,72,15,126,198 |
|||
shrq $4,%rbx |
|||
xorq %rcx,%rax |
|||
psrldq $8,%xmm0 |
|||
xorq %rbx,%rdx |
|||
# Encoded 66 48 0F 7E C7 = movq %xmm0,%rdi (high half after the psrldq above).
.byte 102,72,15,126,199 |
|||
# Merge the SSE accumulator into the integer result.
xorq %rsi,%rax |
|||
xorq %rdi,%rdx |
|||
|
|||
addq $128+8,%rsp |
|||
# 0xf3,0xc3 = rep ret
.byte 0xf3,0xc3 |
|||
.Lend_mul_1x1: |
|||
.size _mul_1x1,.-_mul_1x1 |
|||
|
|||
# bn_GF2m_mul_2x2: carry-less product of two 2-word values, written as four
# 64-bit words to the buffer in %rdi.
# In:  %rdi = output (4 quadwords); (%rsi,%rdx) = words of one operand;
#      (%rcx,%r8) = words of the other.  The %rsi*%rcx product lands in
#      output words 2..3 and the %rdx*%r8 product in words 0..1, each then
#      combined with the cross term.
# NOTE(review): presumably (r, a1, a0, b1, b0) of the C prototype - confirm
# against the declaration used by the callers.
# Three 64x64 products are formed: %rsi*%rcx, %rdx*%r8 and
# (%rsi^%rdx)*(%rcx^%r8); the last one supplies the middle words.
.globl bn_GF2m_mul_2x2 |
|||
.type bn_GF2m_mul_2x2,@function |
|||
.align 16 |
|||
bn_GF2m_mul_2x2: |
|||
# Bit 33 of the quadword at OPENSSL_ia32cap_P selects the PCLMULQDQ path;
# when it is clear, fall back to the table-driven path.
movq OPENSSL_ia32cap_P(%rip),%rax |
|||
btq $33,%rax |
|||
jnc .Lvanilla_mul_2x2 |
|||
|
|||
# Encoded movq: %rsi->%xmm0, %rcx->%xmm1, %rdx->%xmm2, %r8->%xmm3.
.byte 102,72,15,110,198 |
|||
.byte 102,72,15,110,201 |
|||
.byte 102,72,15,110,210 |
|||
.byte 102,73,15,110,216 |
|||
movdqa %xmm0,%xmm4 |
|||
movdqa %xmm1,%xmm5 |
|||
# Encoded pclmulqdq $0,%xmm1,%xmm0
.byte 102,15,58,68,193,0 |
|||
pxor %xmm2,%xmm4 |
|||
pxor %xmm3,%xmm5 |
|||
# Encoded pclmulqdq $0,%xmm3,%xmm2
.byte 102,15,58,68,211,0 |
|||
# Encoded pclmulqdq $0,%xmm5,%xmm4 (product of the XORed operands)
.byte 102,15,58,68,229,0 |
|||
# %xmm4 ^= both outer products, leaving the middle term, which is then split
# across the upper half of %xmm2 and the lower half of %xmm0.
xorps %xmm0,%xmm4 |
|||
xorps %xmm2,%xmm4 |
|||
movdqa %xmm4,%xmm5 |
|||
pslldq $8,%xmm4 |
|||
psrldq $8,%xmm5 |
|||
pxor %xmm4,%xmm2 |
|||
pxor %xmm5,%xmm0 |
|||
movdqu %xmm2,0(%rdi) |
|||
movdqu %xmm0,16(%rdi) |
|||
# 0xf3,0xc3 = rep ret
.byte 0xf3,0xc3 |
|||
|
|||
.align 16 |
|||
.Lvanilla_mul_2x2: |
|||
# Frame of 136 bytes: 0..31 hold partial products, 32..71 the five incoming
# arguments, 80..119 the saved %r14, %r13, %r12, %rbp, %rbx.
leaq -136(%rsp),%rsp |
|||
movq %r14,80(%rsp) |
|||
movq %r13,88(%rsp) |
|||
movq %r12,96(%rsp) |
|||
movq %rbp,104(%rsp) |
|||
movq %rbx,112(%rsp) |
|||
.Lbody_mul_2x2: |
|||
movq %rdi,32(%rsp) |
|||
movq %rsi,40(%rsp) |
|||
movq %rdx,48(%rsp) |
|||
movq %rcx,56(%rsp) |
|||
movq %r8,64(%rsp) |
|||
|
|||
# First product: %rsi * %rcx, with the nibble mask 15 in %r8 for _mul_1x1.
movq $15,%r8 |
|||
movq %rsi,%rax |
|||
movq %rcx,%rbp |
|||
call _mul_1x1 |
|||
|
|||
movq %rax,16(%rsp) |
|||
movq %rdx,24(%rsp) |
|||
|
|||
# Second product: saved %rdx * saved %r8.
movq 48(%rsp),%rax |
|||
movq 64(%rsp),%rbp |
|||
call _mul_1x1 |
|||
|
|||
movq %rax,0(%rsp) |
|||
movq %rdx,8(%rsp) |
|||
|
|||
# Third product: (arg %rsi ^ arg %rdx) * (arg %rcx ^ arg %r8).
movq 40(%rsp),%rax |
|||
movq 56(%rsp),%rbp |
|||
xorq 48(%rsp),%rax |
|||
xorq 64(%rsp),%rbp |
|||
call _mul_1x1 |
|||
|
|||
movq 0(%rsp),%rbx |
|||
movq 8(%rsp),%rcx |
|||
movq 16(%rsp),%rdi |
|||
movq 24(%rsp),%rsi |
|||
movq 32(%rsp),%rbp |
|||
|
|||
# Fold the three partial products together with XORs and store the four
# result words at 0, 8, 16 and 24(%rbp), where %rbp is the saved output pointer.
xorq %rdx,%rax |
|||
xorq %rcx,%rdx |
|||
xorq %rbx,%rax |
|||
movq %rbx,0(%rbp) |
|||
xorq %rdi,%rdx |
|||
movq %rsi,24(%rbp) |
|||
xorq %rsi,%rax |
|||
xorq %rsi,%rdx |
|||
xorq %rdx,%rax |
|||
movq %rdx,16(%rbp) |
|||
movq %rax,8(%rbp) |
|||
|
|||
# Restore the saved registers and release the frame.
movq 80(%rsp),%r14 |
|||
movq 88(%rsp),%r13 |
|||
movq 96(%rsp),%r12 |
|||
movq 104(%rsp),%rbp |
|||
movq 112(%rsp),%rbx |
|||
leaq 136(%rsp),%rsp |
|||
.byte 0xf3,0xc3 |
|||
.Lend_mul_2x2: |
|||
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2 |
|||
# NUL-terminated ASCII identification string; it begins
# "GF(2^m) Multiplication for x86_64, CRYPTOGAMS by" followed by an e-mail address.
.byte 71,70,40,50,94,109,41,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 |
|||
.align 16 |
@ -0,0 +1,785 @@ |
|||
.text |
|||
|
|||
|
|||
# bn_mul_mont_gather5: word-at-a-time Montgomery multiplication (per the
# identification string in this file) in which the multiplier words are
# gathered from a table rather than read from a flat array.
# Registers as used below:
#   %rdi = result words, %rsi = multiplicand words, %rdx = table base,
#   %rcx = modulus words, %r8 = pointer to one word (loaded into %r8),
#   %r9d = word count, 8(%rsp) at entry = 32-bit table index.
# NOTE(review): presumably rp, ap, table, np, n0, num, power of the C
# prototype - confirm against the caller.
# Returns 1 in %rax.  Callee-saved registers are pushed and restored.
.globl bn_mul_mont_gather5 |
|||
.type bn_mul_mont_gather5,@function |
|||
.align 64 |
|||
bn_mul_mont_gather5: |
|||
# Word counts that are a multiple of 4 and at least 8 go to the 4-way
# unrolled routine; everything else uses the loop below.
testl $3,%r9d |
|||
jnz .Lmul_enter |
|||
cmpl $8,%r9d |
|||
jb .Lmul_enter |
|||
jmp .Lmul4x_enter |
|||
|
|||
.align 16 |
|||
.Lmul_enter: |
|||
# Zero-extend the count, fetch the stack argument, save callee-saved
# registers, then carve (count+2) quadwords of scratch off the stack and
# round %rsp down to a 1024-byte boundary.  The pre-allocation %rsp is kept
# in the scratch slot at index count+1.
movl %r9d,%r9d |
|||
movl 8(%rsp),%r10d |
|||
pushq %rbx |
|||
pushq %rbp |
|||
pushq %r12 |
|||
pushq %r13 |
|||
pushq %r14 |
|||
pushq %r15 |
|||
movq %rsp,%rax |
|||
leaq 2(%r9),%r11 |
|||
negq %r11 |
|||
leaq (%rsp,%r11,8),%rsp |
|||
andq $-1024,%rsp |
|||
|
|||
movq %rax,8(%rsp,%r9,8) |
|||
.Lmul_body: |
|||
# Gather set-up: %r11 = index & 7 picks the quadword column, and
# %r10 = ~(index >> 3) & 3 picks the window of .Lmagic_masks loaded into
# %xmm4..%xmm7, so that exactly one of the four candidates read at
# -96/-32/32/96(%r12) survives the pand/por sequence.
movq %rdx,%r12 |
|||
movq %r10,%r11 |
|||
shrq $3,%r10 |
|||
andq $7,%r11 |
|||
notq %r10 |
|||
leaq .Lmagic_masks(%rip),%rax |
|||
andq $3,%r10 |
|||
leaq 96(%r12,%r11,8),%r12 |
|||
movq 0(%rax,%r10,8),%xmm4 |
|||
movq 8(%rax,%r10,8),%xmm5 |
|||
movq 16(%rax,%r10,8),%xmm6 |
|||
movq 24(%rax,%r10,8),%xmm7 |
|||
|
|||
movq -96(%r12),%xmm0 |
|||
movq -32(%r12),%xmm1 |
|||
pand %xmm4,%xmm0 |
|||
movq 32(%r12),%xmm2 |
|||
pand %xmm5,%xmm1 |
|||
movq 96(%r12),%xmm3 |
|||
pand %xmm6,%xmm2 |
|||
por %xmm1,%xmm0 |
|||
pand %xmm7,%xmm3 |
|||
por %xmm2,%xmm0 |
|||
leaq 256(%r12),%r12 |
|||
por %xmm3,%xmm0 |
|||
|
|||
# Encoded 66 48 0F 7E C3 = movq %xmm0,%rbx (the gathered multiplier word).
.byte 102,72,15,126,195 |
|||
|
|||
movq (%r8),%r8 |
|||
movq (%rsi),%rax |
|||
|
|||
# %r14 = outer index, %r15 = inner index.
xorq %r14,%r14 |
|||
xorq %r15,%r15 |
|||
|
|||
# Start gathering the next multiplier word while the first products run.
movq -96(%r12),%xmm0 |
|||
movq -32(%r12),%xmm1 |
|||
pand %xmm4,%xmm0 |
|||
movq 32(%r12),%xmm2 |
|||
pand %xmm5,%xmm1 |
|||
|
|||
movq %r8,%rbp |
|||
mulq %rbx |
|||
movq %rax,%r10 |
|||
movq (%rcx),%rax |
|||
|
|||
movq 96(%r12),%xmm3 |
|||
pand %xmm6,%xmm2 |
|||
por %xmm1,%xmm0 |
|||
pand %xmm7,%xmm3 |
|||
|
|||
# %rbp = (loaded %r8 word) * (low product word); it multiplies the modulus
# words in the rest of this pass.
imulq %r10,%rbp |
|||
movq %rdx,%r11 |
|||
|
|||
por %xmm2,%xmm0 |
|||
leaq 256(%r12),%r12 |
|||
por %xmm3,%xmm0 |
|||
|
|||
mulq %rbp |
|||
addq %rax,%r10 |
|||
movq 8(%rsi),%rax |
|||
adcq $0,%rdx |
|||
movq %rdx,%r13 |
|||
|
|||
leaq 1(%r15),%r15 |
|||
jmp .L1st_enter |
|||
|
|||
.align 16 |
|||
# First pass: accumulate multiplicand*%rbx and modulus*%rbp word by word,
# storing the running sum one slot lower in the stack scratch area.
.L1st: |
|||
addq %rax,%r13 |
|||
movq (%rsi,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
addq %r11,%r13 |
|||
movq %r10,%r11 |
|||
adcq $0,%rdx |
|||
movq %r13,-16(%rsp,%r15,8) |
|||
movq %rdx,%r13 |
|||
|
|||
.L1st_enter: |
|||
mulq %rbx |
|||
addq %rax,%r11 |
|||
movq (%rcx,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
leaq 1(%r15),%r15 |
|||
movq %rdx,%r10 |
|||
|
|||
mulq %rbp |
|||
cmpq %r9,%r15 |
|||
jne .L1st |
|||
|
|||
# movq %xmm0,%rbx: switch to the multiplier word gathered during this pass.
.byte 102,72,15,126,195 |
|||
|
|||
addq %rax,%r13 |
|||
movq (%rsi),%rax |
|||
adcq $0,%rdx |
|||
addq %r11,%r13 |
|||
adcq $0,%rdx |
|||
movq %r13,-16(%rsp,%r15,8) |
|||
movq %rdx,%r13 |
|||
movq %r10,%r11 |
|||
|
|||
# Top two scratch words: final sum and its carry.
xorq %rdx,%rdx |
|||
addq %r11,%r13 |
|||
adcq $0,%rdx |
|||
movq %r13,-8(%rsp,%r9,8) |
|||
movq %rdx,(%rsp,%r9,8) |
|||
|
|||
leaq 1(%r14),%r14 |
|||
jmp .Louter |
|||
.align 16 |
|||
# Remaining passes: same as the first pass, but the previous scratch contents
# are added in, and the next multiplier word is gathered in parallel.
.Louter: |
|||
xorq %r15,%r15 |
|||
movq %r8,%rbp |
|||
movq (%rsp),%r10 |
|||
|
|||
movq -96(%r12),%xmm0 |
|||
movq -32(%r12),%xmm1 |
|||
pand %xmm4,%xmm0 |
|||
movq 32(%r12),%xmm2 |
|||
pand %xmm5,%xmm1 |
|||
|
|||
mulq %rbx |
|||
addq %rax,%r10 |
|||
movq (%rcx),%rax |
|||
adcq $0,%rdx |
|||
|
|||
movq 96(%r12),%xmm3 |
|||
pand %xmm6,%xmm2 |
|||
por %xmm1,%xmm0 |
|||
pand %xmm7,%xmm3 |
|||
|
|||
imulq %r10,%rbp |
|||
movq %rdx,%r11 |
|||
|
|||
por %xmm2,%xmm0 |
|||
leaq 256(%r12),%r12 |
|||
por %xmm3,%xmm0 |
|||
|
|||
mulq %rbp |
|||
addq %rax,%r10 |
|||
movq 8(%rsi),%rax |
|||
adcq $0,%rdx |
|||
movq 8(%rsp),%r10 |
|||
movq %rdx,%r13 |
|||
|
|||
leaq 1(%r15),%r15 |
|||
jmp .Linner_enter |
|||
|
|||
.align 16 |
|||
.Linner: |
|||
addq %rax,%r13 |
|||
movq (%rsi,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
addq %r10,%r13 |
|||
movq (%rsp,%r15,8),%r10 |
|||
adcq $0,%rdx |
|||
movq %r13,-16(%rsp,%r15,8) |
|||
movq %rdx,%r13 |
|||
|
|||
.Linner_enter: |
|||
mulq %rbx |
|||
addq %rax,%r11 |
|||
movq (%rcx,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
addq %r11,%r10 |
|||
movq %rdx,%r11 |
|||
adcq $0,%r11 |
|||
leaq 1(%r15),%r15 |
|||
|
|||
mulq %rbp |
|||
cmpq %r9,%r15 |
|||
jne .Linner |
|||
|
|||
# movq %xmm0,%rbx: next gathered multiplier word.
.byte 102,72,15,126,195 |
|||
|
|||
addq %rax,%r13 |
|||
movq (%rsi),%rax |
|||
adcq $0,%rdx |
|||
addq %r10,%r13 |
|||
movq (%rsp,%r15,8),%r10 |
|||
adcq $0,%rdx |
|||
movq %r13,-16(%rsp,%r15,8) |
|||
movq %rdx,%r13 |
|||
|
|||
xorq %rdx,%rdx |
|||
addq %r11,%r13 |
|||
adcq $0,%rdx |
|||
addq %r10,%r13 |
|||
adcq $0,%rdx |
|||
movq %r13,-8(%rsp,%r9,8) |
|||
movq %rdx,(%rsp,%r9,8) |
|||
|
|||
leaq 1(%r14),%r14 |
|||
cmpq %r9,%r14 |
|||
jl .Louter |
|||
|
|||
# Final reduction step 1: result = scratch - modulus, word by word.  The xorq
# clears CF for the first sbbq; leaq/decq/movq inside the loop leave CF alone.
xorq %r14,%r14 |
|||
movq (%rsp),%rax |
|||
leaq (%rsp),%rsi |
|||
movq %r9,%r15 |
|||
jmp .Lsub |
|||
.align 16 |
|||
.Lsub: sbbq (%rcx,%r14,8),%rax |
|||
movq %rax,(%rdi,%r14,8) |
|||
movq 8(%rsi,%r14,8),%rax |
|||
leaq 1(%r14),%r14 |
|||
decq %r15 |
|||
jnz .Lsub |
|||
|
|||
# Step 2: fold the last borrow into the top scratch word and use the result
# as a mask to pick the copy source without branching:
# %rsi = (scratch & mask) | (result & ~mask).
sbbq $0,%rax |
|||
xorq %r14,%r14 |
|||
andq %rax,%rsi |
|||
notq %rax |
|||
movq %rdi,%rcx |
|||
andq %rax,%rcx |
|||
movq %r9,%r15 |
|||
orq %rcx,%rsi |
|||
.align 16 |
|||
# Copy the selected words to the result and overwrite each scratch word with
# its own index.
.Lcopy: |
|||
movq (%rsi,%r14,8),%rax |
|||
movq %r14,(%rsp,%r14,8) |
|||
movq %rax,(%rdi,%r14,8) |
|||
leaq 1(%r14),%r14 |
|||
subq $1,%r15 |
|||
jnz .Lcopy |
|||
|
|||
# Reload the saved pre-allocation %rsp, return 1 and restore the pushed
# registers from the save area.
movq 8(%rsp,%r9,8),%rsi |
|||
movq $1,%rax |
|||
movq (%rsi),%r15 |
|||
movq 8(%rsi),%r14 |
|||
movq 16(%rsi),%r13 |
|||
movq 24(%rsi),%r12 |
|||
movq 32(%rsi),%rbp |
|||
movq 40(%rsi),%rbx |
|||
leaq 48(%rsi),%rsp |
|||
.Lmul_epilogue: |
|||
# 0xf3,0xc3 = rep ret
.byte 0xf3,0xc3 |
|||
.size bn_mul_mont_gather5,.-bn_mul_mont_gather5 |
|||
# bn_mul4x_mont_gather5: four-words-per-iteration variant of the gather-based
# multiply, entered through .Lmul4x_enter from bn_mul_mont_gather5 when the
# word count is a multiple of 4 and at least 8.
# Same register inputs as at that entry: %rdi = result, %rsi = multiplicand,
# %rdx = table base, %rcx = modulus, %r8 = pointer to one word, %r9d = word
# count, 8(%rsp) = 32-bit table index.  Returns 1 in %rax.
# %rdi is reused as an accumulator, so the result pointer is parked on the
# stack for the duration of the multiply.
.type bn_mul4x_mont_gather5,@function |
|||
.align 16 |
|||
bn_mul4x_mont_gather5: |
|||
.Lmul4x_enter: |
|||
# Save callee-saved registers, allocate (count+4) quadwords of scratch and
# round %rsp down to a 1024-byte boundary; the old %rsp goes to scratch slot
# count+1 and the result pointer to slot count+2.
movl %r9d,%r9d |
|||
movl 8(%rsp),%r10d |
|||
pushq %rbx |
|||
pushq %rbp |
|||
pushq %r12 |
|||
pushq %r13 |
|||
pushq %r14 |
|||
pushq %r15 |
|||
movq %rsp,%rax |
|||
leaq 4(%r9),%r11 |
|||
negq %r11 |
|||
leaq (%rsp,%r11,8),%rsp |
|||
andq $-1024,%rsp |
|||
|
|||
movq %rax,8(%rsp,%r9,8) |
|||
.Lmul4x_body: |
|||
movq %rdi,16(%rsp,%r9,8) |
|||
# Gather set-up: column = index & 7, mask window = ~(index >> 3) & 3.
movq %rdx,%r12 |
|||
movq %r10,%r11 |
|||
shrq $3,%r10 |
|||
andq $7,%r11 |
|||
notq %r10 |
|||
leaq .Lmagic_masks(%rip),%rax |
|||
andq $3,%r10 |
|||
leaq 96(%r12,%r11,8),%r12 |
|||
movq 0(%rax,%r10,8),%xmm4 |
|||
movq 8(%rax,%r10,8),%xmm5 |
|||
movq 16(%rax,%r10,8),%xmm6 |
|||
movq 24(%rax,%r10,8),%xmm7 |
|||
|
|||
movq -96(%r12),%xmm0 |
|||
movq -32(%r12),%xmm1 |
|||
pand %xmm4,%xmm0 |
|||
movq 32(%r12),%xmm2 |
|||
pand %xmm5,%xmm1 |
|||
movq 96(%r12),%xmm3 |
|||
pand %xmm6,%xmm2 |
|||
por %xmm1,%xmm0 |
|||
pand %xmm7,%xmm3 |
|||
por %xmm2,%xmm0 |
|||
leaq 256(%r12),%r12 |
|||
por %xmm3,%xmm0 |
|||
|
|||
# Encoded movq %xmm0,%rbx (first gathered multiplier word).
.byte 102,72,15,126,195 |
|||
movq (%r8),%r8 |
|||
movq (%rsi),%rax |
|||
|
|||
xorq %r14,%r14 |
|||
xorq %r15,%r15 |
|||
|
|||
# Gather of the next multiplier word, interleaved with the first products.
movq -96(%r12),%xmm0 |
|||
movq -32(%r12),%xmm1 |
|||
pand %xmm4,%xmm0 |
|||
movq 32(%r12),%xmm2 |
|||
pand %xmm5,%xmm1 |
|||
|
|||
movq %r8,%rbp |
|||
mulq %rbx |
|||
movq %rax,%r10 |
|||
movq (%rcx),%rax |
|||
|
|||
movq 96(%r12),%xmm3 |
|||
pand %xmm6,%xmm2 |
|||
por %xmm1,%xmm0 |
|||
pand %xmm7,%xmm3 |
|||
|
|||
# %rbp = (loaded %r8 word) * (low product word); multiplies the modulus words.
imulq %r10,%rbp |
|||
movq %rdx,%r11 |
|||
|
|||
por %xmm2,%xmm0 |
|||
leaq 256(%r12),%r12 |
|||
por %xmm3,%xmm0 |
|||
|
|||
mulq %rbp |
|||
addq %rax,%r10 |
|||
movq 8(%rsi),%rax |
|||
adcq $0,%rdx |
|||
movq %rdx,%rdi |
|||
|
|||
mulq %rbx |
|||
addq %rax,%r11 |
|||
movq 8(%rcx),%rax |
|||
adcq $0,%rdx |
|||
movq %rdx,%r10 |
|||
|
|||
mulq %rbp |
|||
addq %rax,%rdi |
|||
movq 16(%rsi),%rax |
|||
adcq $0,%rdx |
|||
addq %r11,%rdi |
|||
leaq 4(%r15),%r15 |
|||
adcq $0,%rdx |
|||
movq %rdi,(%rsp) |
|||
movq %rdx,%r13 |
|||
jmp .L1st4x |
|||
.align 16 |
|||
# First pass, four words per iteration; %r15 advances by 4 and sums are
# written to the scratch area relative to %r15.
.L1st4x: |
|||
mulq %rbx |
|||
addq %rax,%r10 |
|||
movq -16(%rcx,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
movq %rdx,%r11 |
|||
|
|||
mulq %rbp |
|||
addq %rax,%r13 |
|||
movq -8(%rsi,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
addq %r10,%r13 |
|||
adcq $0,%rdx |
|||
movq %r13,-24(%rsp,%r15,8) |
|||
movq %rdx,%rdi |
|||
|
|||
mulq %rbx |
|||
addq %rax,%r11 |
|||
movq -8(%rcx,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
movq %rdx,%r10 |
|||
|
|||
mulq %rbp |
|||
addq %rax,%rdi |
|||
movq (%rsi,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
addq %r11,%rdi |
|||
adcq $0,%rdx |
|||
movq %rdi,-16(%rsp,%r15,8) |
|||
movq %rdx,%r13 |
|||
|
|||
mulq %rbx |
|||
addq %rax,%r10 |
|||
movq (%rcx,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
movq %rdx,%r11 |
|||
|
|||
mulq %rbp |
|||
addq %rax,%r13 |
|||
movq 8(%rsi,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
addq %r10,%r13 |
|||
adcq $0,%rdx |
|||
movq %r13,-8(%rsp,%r15,8) |
|||
movq %rdx,%rdi |
|||
|
|||
mulq %rbx |
|||
addq %rax,%r11 |
|||
movq 8(%rcx,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
leaq 4(%r15),%r15 |
|||
movq %rdx,%r10 |
|||
|
|||
mulq %rbp |
|||
addq %rax,%rdi |
|||
movq -16(%rsi,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
addq %r11,%rdi |
|||
adcq $0,%rdx |
|||
movq %rdi,-32(%rsp,%r15,8) |
|||
movq %rdx,%r13 |
|||
cmpq %r9,%r15 |
|||
jl .L1st4x |
|||
|
|||
# Tail of the first pass: the last two word positions, then the carry word.
mulq %rbx |
|||
addq %rax,%r10 |
|||
movq -16(%rcx,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
movq %rdx,%r11 |
|||
|
|||
mulq %rbp |
|||
addq %rax,%r13 |
|||
movq -8(%rsi,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
addq %r10,%r13 |
|||
adcq $0,%rdx |
|||
movq %r13,-24(%rsp,%r15,8) |
|||
movq %rdx,%rdi |
|||
|
|||
mulq %rbx |
|||
addq %rax,%r11 |
|||
movq -8(%rcx,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
movq %rdx,%r10 |
|||
|
|||
mulq %rbp |
|||
addq %rax,%rdi |
|||
movq (%rsi),%rax |
|||
adcq $0,%rdx |
|||
addq %r11,%rdi |
|||
adcq $0,%rdx |
|||
movq %rdi,-16(%rsp,%r15,8) |
|||
movq %rdx,%r13 |
|||
|
|||
# Encoded movq %xmm0,%rbx (multiplier word for the next pass).
.byte 102,72,15,126,195 |
|||
|
|||
xorq %rdi,%rdi |
|||
addq %r10,%r13 |
|||
adcq $0,%rdi |
|||
movq %r13,-8(%rsp,%r15,8) |
|||
movq %rdi,(%rsp,%r15,8) |
|||
|
|||
leaq 1(%r14),%r14 |
|||
.align 4 |
|||
# Remaining passes: as above, but each partial sum also adds the scratch word
# left by the previous pass, while the following multiplier word is gathered.
.Louter4x: |
|||
xorq %r15,%r15 |
|||
movq -96(%r12),%xmm0 |
|||
movq -32(%r12),%xmm1 |
|||
pand %xmm4,%xmm0 |
|||
movq 32(%r12),%xmm2 |
|||
pand %xmm5,%xmm1 |
|||
|
|||
movq (%rsp),%r10 |
|||
movq %r8,%rbp |
|||
mulq %rbx |
|||
addq %rax,%r10 |
|||
movq (%rcx),%rax |
|||
adcq $0,%rdx |
|||
|
|||
movq 96(%r12),%xmm3 |
|||
pand %xmm6,%xmm2 |
|||
por %xmm1,%xmm0 |
|||
pand %xmm7,%xmm3 |
|||
|
|||
imulq %r10,%rbp |
|||
movq %rdx,%r11 |
|||
|
|||
por %xmm2,%xmm0 |
|||
leaq 256(%r12),%r12 |
|||
por %xmm3,%xmm0 |
|||
|
|||
mulq %rbp |
|||
addq %rax,%r10 |
|||
movq 8(%rsi),%rax |
|||
adcq $0,%rdx |
|||
movq %rdx,%rdi |
|||
|
|||
mulq %rbx |
|||
addq %rax,%r11 |
|||
movq 8(%rcx),%rax |
|||
adcq $0,%rdx |
|||
addq 8(%rsp),%r11 |
|||
adcq $0,%rdx |
|||
movq %rdx,%r10 |
|||
|
|||
mulq %rbp |
|||
addq %rax,%rdi |
|||
movq 16(%rsi),%rax |
|||
adcq $0,%rdx |
|||
addq %r11,%rdi |
|||
leaq 4(%r15),%r15 |
|||
adcq $0,%rdx |
|||
movq %rdx,%r13 |
|||
jmp .Linner4x |
|||
.align 16 |
|||
.Linner4x: |
|||
mulq %rbx |
|||
addq %rax,%r10 |
|||
movq -16(%rcx,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
addq -16(%rsp,%r15,8),%r10 |
|||
adcq $0,%rdx |
|||
movq %rdx,%r11 |
|||
|
|||
mulq %rbp |
|||
addq %rax,%r13 |
|||
movq -8(%rsi,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
addq %r10,%r13 |
|||
adcq $0,%rdx |
|||
movq %rdi,-32(%rsp,%r15,8) |
|||
movq %rdx,%rdi |
|||
|
|||
mulq %rbx |
|||
addq %rax,%r11 |
|||
movq -8(%rcx,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
addq -8(%rsp,%r15,8),%r11 |
|||
adcq $0,%rdx |
|||
movq %rdx,%r10 |
|||
|
|||
mulq %rbp |
|||
addq %rax,%rdi |
|||
movq (%rsi,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
addq %r11,%rdi |
|||
adcq $0,%rdx |
|||
movq %r13,-24(%rsp,%r15,8) |
|||
movq %rdx,%r13 |
|||
|
|||
mulq %rbx |
|||
addq %rax,%r10 |
|||
movq (%rcx,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
addq (%rsp,%r15,8),%r10 |
|||
adcq $0,%rdx |
|||
movq %rdx,%r11 |
|||
|
|||
mulq %rbp |
|||
addq %rax,%r13 |
|||
movq 8(%rsi,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
addq %r10,%r13 |
|||
adcq $0,%rdx |
|||
movq %rdi,-16(%rsp,%r15,8) |
|||
movq %rdx,%rdi |
|||
|
|||
mulq %rbx |
|||
addq %rax,%r11 |
|||
movq 8(%rcx,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
addq 8(%rsp,%r15,8),%r11 |
|||
adcq $0,%rdx |
|||
leaq 4(%r15),%r15 |
|||
movq %rdx,%r10 |
|||
|
|||
mulq %rbp |
|||
addq %rax,%rdi |
|||
movq -16(%rsi,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
addq %r11,%rdi |
|||
adcq $0,%rdx |
|||
movq %r13,-40(%rsp,%r15,8) |
|||
movq %rdx,%r13 |
|||
cmpq %r9,%r15 |
|||
jl .Linner4x |
|||
|
|||
# Tail of an outer pass: last two word positions, bump the outer counter,
# pick up the next multiplier word and add the previous top carry word.
mulq %rbx |
|||
addq %rax,%r10 |
|||
movq -16(%rcx,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
addq -16(%rsp,%r15,8),%r10 |
|||
adcq $0,%rdx |
|||
movq %rdx,%r11 |
|||
|
|||
mulq %rbp |
|||
addq %rax,%r13 |
|||
movq -8(%rsi,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
addq %r10,%r13 |
|||
adcq $0,%rdx |
|||
movq %rdi,-32(%rsp,%r15,8) |
|||
movq %rdx,%rdi |
|||
|
|||
mulq %rbx |
|||
addq %rax,%r11 |
|||
movq -8(%rcx,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
addq -8(%rsp,%r15,8),%r11 |
|||
adcq $0,%rdx |
|||
leaq 1(%r14),%r14 |
|||
movq %rdx,%r10 |
|||
|
|||
mulq %rbp |
|||
addq %rax,%rdi |
|||
movq (%rsi),%rax |
|||
adcq $0,%rdx |
|||
addq %r11,%rdi |
|||
adcq $0,%rdx |
|||
movq %r13,-24(%rsp,%r15,8) |
|||
movq %rdx,%r13 |
|||
|
|||
# Encoded movq %xmm0,%rbx
.byte 102,72,15,126,195 |
|||
movq %rdi,-16(%rsp,%r15,8) |
|||
|
|||
xorq %rdi,%rdi |
|||
addq %r10,%r13 |
|||
adcq $0,%rdi |
|||
addq (%rsp,%r9,8),%r13 |
|||
adcq $0,%rdi |
|||
movq %r13,-8(%rsp,%r15,8) |
|||
movq %rdi,(%rsp,%r15,8) |
|||
|
|||
cmpq %r9,%r14 |
|||
jl .Louter4x |
|||
# Final reduction: reload the parked result pointer, then compute
# result = scratch - modulus four words per iteration.  %r9 temporarily
# becomes count/4; %xmm0 is zeroed for wiping the scratch area later.
movq 16(%rsp,%r9,8),%rdi |
|||
movq 0(%rsp),%rax |
|||
pxor %xmm0,%xmm0 |
|||
movq 8(%rsp),%rdx |
|||
shrq $2,%r9 |
|||
leaq (%rsp),%rsi |
|||
xorq %r14,%r14 |
|||
|
|||
subq 0(%rcx),%rax |
|||
movq 16(%rsi),%rbx |
|||
movq 24(%rsi),%rbp |
|||
sbbq 8(%rcx),%rdx |
|||
leaq -1(%r9),%r15 |
|||
jmp .Lsub4x |
|||
.align 16 |
|||
# The borrow is carried across iterations in CF; only movq/leaq/decq, which
# leave CF untouched, sit between the sbbq instructions.
.Lsub4x: |
|||
movq %rax,0(%rdi,%r14,8) |
|||
movq %rdx,8(%rdi,%r14,8) |
|||
sbbq 16(%rcx,%r14,8),%rbx |
|||
movq 32(%rsi,%r14,8),%rax |
|||
movq 40(%rsi,%r14,8),%rdx |
|||
sbbq 24(%rcx,%r14,8),%rbp |
|||
movq %rbx,16(%rdi,%r14,8) |
|||
movq %rbp,24(%rdi,%r14,8) |
|||
sbbq 32(%rcx,%r14,8),%rax |
|||
movq 48(%rsi,%r14,8),%rbx |
|||
movq 56(%rsi,%r14,8),%rbp |
|||
sbbq 40(%rcx,%r14,8),%rdx |
|||
leaq 4(%r14),%r14 |
|||
decq %r15 |
|||
jnz .Lsub4x |
|||
|
|||
movq %rax,0(%rdi,%r14,8) |
|||
movq 32(%rsi,%r14,8),%rax |
|||
sbbq 16(%rcx,%r14,8),%rbx |
|||
movq %rdx,8(%rdi,%r14,8) |
|||
sbbq 24(%rcx,%r14,8),%rbp |
|||
movq %rbx,16(%rdi,%r14,8) |
|||
|
|||
# Fold the final borrow into the top scratch word and use it as a mask to
# choose the copy source: %rsi = (scratch & mask) | (result & ~mask).
sbbq $0,%rax |
|||
movq %rbp,24(%rdi,%r14,8) |
|||
xorq %r14,%r14 |
|||
andq %rax,%rsi |
|||
notq %rax |
|||
movq %rdi,%rcx |
|||
andq %rax,%rcx |
|||
leaq -1(%r9),%r15 |
|||
orq %rcx,%rsi |
|||
|
|||
# Copy the selected value to the result 16 bytes at a time while storing
# zeros (%xmm0) over the scratch area.  movdqa on (%rsp) relies on the
# 1024-byte alignment established in the prologue.
movdqu (%rsi),%xmm1 |
|||
movdqa %xmm0,(%rsp) |
|||
movdqu %xmm1,(%rdi) |
|||
jmp .Lcopy4x |
|||
.align 16 |
|||
.Lcopy4x: |
|||
movdqu 16(%rsi,%r14,1),%xmm2 |
|||
movdqu 32(%rsi,%r14,1),%xmm1 |
|||
movdqa %xmm0,16(%rsp,%r14,1) |
|||
movdqu %xmm2,16(%rdi,%r14,1) |
|||
movdqa %xmm0,32(%rsp,%r14,1) |
|||
movdqu %xmm1,32(%rdi,%r14,1) |
|||
leaq 32(%r14),%r14 |
|||
decq %r15 |
|||
jnz .Lcopy4x |
|||
|
|||
# Restore %r9 to the word count so the saved %rsp slot can be addressed,
# finish the last 16 bytes, return 1 and restore the pushed registers.
shlq $2,%r9 |
|||
movdqu 16(%rsi,%r14,1),%xmm2 |
|||
movdqa %xmm0,16(%rsp,%r14,1) |
|||
movdqu %xmm2,16(%rdi,%r14,1) |
|||
movq 8(%rsp,%r9,8),%rsi |
|||
movq $1,%rax |
|||
movq (%rsi),%r15 |
|||
movq 8(%rsi),%r14 |
|||
movq 16(%rsi),%r13 |
|||
movq 24(%rsi),%r12 |
|||
movq 32(%rsi),%rbp |
|||
movq 40(%rsi),%rbx |
|||
leaq 48(%rsi),%rsp |
|||
.Lmul4x_epilogue: |
|||
# 0xf3,0xc3 = rep ret
.byte 0xf3,0xc3 |
|||
.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 |
|||
#-----------------------------------------------------------------------
# bn_scatter5: spread consecutive 64-bit words into one column of a table
# whose rows are 256 bytes apart.
# ABI:   System V AMD64 (arguments arrive in %rdi, %rsi, %rdx, %rcx)
# In:    %rdi = source words
#        %rsi = number of words to store (zero is allowed: nothing is written)
#        %rdx = table base
#        %rcx = column index, scaled by 8 bytes
# Out:   nothing
# Clobb: %rax, %rdx, %rsi, %rdi, flags
# Word i of the source is stored at table + 8*index + 256*i.
#-----------------------------------------------------------------------
.globl	bn_scatter5
.type	bn_scatter5,@function
.align	16
bn_scatter5:
	testq	%rsi,%rsi		# zero words requested: leave the table untouched
	jz	.Lscatter_epilogue
	leaq	(%rdx,%rcx,8),%rdx	# %rdx = first destination slot in the chosen column
.Lscatter:
	movq	(%rdi),%rax		# fetch the next source word
	addq	$8,%rdi			# advance the source by one word
	movq	%rax,(%rdx)		# drop it into the current row
	addq	$256,%rdx		# step to the same column of the next row
	subq	$1,%rsi			# one fewer word to go; sets ZF for the branch
	jnz	.Lscatter
.Lscatter_epilogue:
	.byte	0xf3,0xc3		# rep ret
.size	bn_scatter5,.-bn_scatter5
|||
|
|||
#-----------------------------------------------------------------------
# bn_gather5: collect one column of 64-bit words out of a table whose rows
# are 256 bytes apart - the inverse of bn_scatter5 in this file.
# ABI:   System V AMD64 (arguments arrive in %rdi, %rsi, %rdx, %rcx)
# In:    %rdi = destination words
#        %rsi = number of words to produce
#        %rdx = table base
#        %rcx = column index; bits 0-2 select the quadword within a 64-byte
#               group, bits 3-4 select which of the four groups is kept
# Out:   nothing
# Clobb: %rax, %rcx, %rdx, %rsi, %rdi, %r11, %xmm0-%xmm7, flags
#
# For every output word all four candidate groups are read (offsets 0, 64,
# 128 and 192 from table + 8*(index & 7)) and the wanted one is selected
# with the masks taken from .Lmagic_masks.
#
# A zero word count now returns immediately, matching bn_scatter5.
# Previously the count was decremented before being tested, so a zero count
# wrapped around and the loop kept storing far beyond the destination.
#-----------------------------------------------------------------------
.globl	bn_gather5
.type	bn_gather5,@function
.align	16
bn_gather5:
	testq	%rsi,%rsi		# nothing requested: do not enter the loop
	jz	.Lgather_epilogue
	movq	%rcx,%r11
	shrq	$3,%rcx
	andq	$7,%r11			# %r11 = quadword position inside a 64-byte group
	notq	%rcx
	leaq	.Lmagic_masks(%rip),%rax
	andq	$3,%rcx			# %rcx = ~(index >> 3) & 3 = mask window start
	leaq	96(%rdx,%r11,8),%rdx	# biased so the four groups sit at -96/-32/32/96
	movq	0(%rax,%rcx,8),%xmm4	# exactly one of %xmm4..%xmm7 is all-ones
	movq	8(%rax,%rcx,8),%xmm5
	movq	16(%rax,%rcx,8),%xmm6
	movq	24(%rax,%rcx,8),%xmm7
	jmp	.Lgather
.align	16
.Lgather:
	movq	-96(%rdx),%xmm0
	movq	-32(%rdx),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%rdx),%xmm2
	pand	%xmm5,%xmm1
	movq	96(%rdx),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	por	%xmm2,%xmm0
	leaq	256(%rdx),%rdx		# next table row
	por	%xmm3,%xmm0		# %xmm0 = the single selected candidate

	movq	%xmm0,(%rdi)
	leaq	8(%rdi),%rdi
	subq	$1,%rsi
	jnz	.Lgather
.Lgather_epilogue:
	.byte	0xf3,0xc3		# rep ret
.LSEH_end_bn_gather5:
.size	bn_gather5,.-bn_gather5
|||
# .Lmagic_masks: selection masks for the gather code in this file.
# The table is read as four consecutive quadwords starting at quadword index
# 0..3; every such window contains exactly one all-ones entry (quadword 3),
# so the start index decides which of four candidate values survives the
# pand/por selection.  Kept on a 64-byte boundary.
.align	64
.Lmagic_masks:
	.quad	0, 0, 0, -1
	.quad	0, 0, 0, 0
# NUL-terminated ASCII identification string; it begins "Montgomery
# Multiplication with scatter/gather for x86_64, CRYPTOGAMS by" followed by
# an e-mail address.
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
@ -0,0 +1,295 @@ |
|||
.text |
|||
|
|||
|
|||
|
|||
# _mul_1x1 (Mach-O flavour): local helper producing a 128-bit carry-less
# (XOR-accumulated) product of two 64-bit values.
# In:    %rax = first operand, %rbp = second operand,
#        %r8  = nibble mask (the caller in this file passes 15)
# Out:   %rax = low 64 bits, %rdx = high 64 bits
# Uses:  %rbx, %rcx, %rsi, %rdi, %rbp, %r9-%r14, %xmm0, %xmm1, flags and a
#        136-byte stack frame; none of them are saved here.
.p2align 4 |
|||
_mul_1x1: |
|||
# Reserve the frame; 0..120(%rsp) becomes a 16-entry table of quadwords.
subq $128+8,%rsp |
|||
# %r9 = first operand with its top three bits cleared; %r10/%r11 = %r9<<1/<<2,
# %r12 = operand<<3.  Interleaved with that, each of the top three operand
# bits becomes an all-ones/zero mask (sarq $63), is ANDed with the second
# operand and folded into %rdx:%rax at bit offsets 63, 62 and 61.
movq $-1,%r9 |
|||
leaq (%rax,%rax,1),%rsi |
|||
shrq $3,%r9 |
|||
leaq (,%rax,4),%rdi |
|||
andq %rax,%r9 |
|||
leaq (,%rax,8),%r12 |
|||
sarq $63,%rax |
|||
leaq (%r9,%r9,1),%r10 |
|||
sarq $63,%rsi |
|||
leaq (,%r9,4),%r11 |
|||
andq %rbp,%rax |
|||
sarq $63,%rdi |
|||
movq %rax,%rdx |
|||
shlq $63,%rax |
|||
andq %rbp,%rsi |
|||
shrq $1,%rdx |
|||
movq %rsi,%rcx |
|||
shlq $62,%rsi |
|||
andq %rbp,%rdi |
|||
shrq $2,%rcx |
|||
xorq %rsi,%rax |
|||
movq %rdi,%rbx |
|||
shlq $61,%rdi |
|||
xorq %rcx,%rdx |
|||
shrq $3,%rbx |
|||
xorq %rdi,%rax |
|||
xorq %rbx,%rdx |
|||
|
|||
# Fill the table: slot i (at 8*i(%rsp)) receives the XOR of the shifted
# copies %r9, %r10, %r11, %r12 selected by the bits of i (slot 0 is zero).
movq %r9,%r13 |
|||
movq $0,0(%rsp) |
|||
xorq %r10,%r13 |
|||
movq %r9,8(%rsp) |
|||
movq %r11,%r14 |
|||
movq %r10,16(%rsp) |
|||
xorq %r12,%r14 |
|||
movq %r13,24(%rsp) |
|||
|
|||
xorq %r11,%r9 |
|||
movq %r11,32(%rsp) |
|||
xorq %r11,%r10 |
|||
movq %r9,40(%rsp) |
|||
xorq %r11,%r13 |
|||
movq %r10,48(%rsp) |
|||
xorq %r14,%r9 |
|||
movq %r13,56(%rsp) |
|||
xorq %r14,%r10 |
|||
|
|||
movq %r12,64(%rsp) |
|||
xorq %r14,%r13 |
|||
movq %r9,72(%rsp) |
|||
xorq %r11,%r9 |
|||
movq %r10,80(%rsp) |
|||
xorq %r11,%r10 |
|||
movq %r13,88(%rsp) |
|||
|
|||
xorq %r11,%r13 |
|||
movq %r14,96(%rsp) |
|||
# From here the second operand (%rbp) is consumed four bits at a time:
# (%rbp & %r8) indexes the table, then %rbp is shifted right by 4.
# Even-numbered nibbles are accumulated in %xmm0 using whole-byte shifts
# (pslldq $1..$7); odd-numbered nibbles are accumulated in %rdx:%rax using
# shlq/shrq pairs whose counts sum to 64 (4/60, 12/52, ... 60/4).
movq %r8,%rsi |
|||
movq %r9,104(%rsp) |
|||
andq %rbp,%rsi |
|||
movq %r10,112(%rsp) |
|||
shrq $4,%rbp |
|||
movq %r13,120(%rsp) |
|||
movq %r8,%rdi |
|||
andq %rbp,%rdi |
|||
shrq $4,%rbp |
|||
|
|||
movq (%rsp,%rsi,8),%xmm0 |
|||
movq %r8,%rsi |
|||
andq %rbp,%rsi |
|||
shrq $4,%rbp |
|||
movq (%rsp,%rdi,8),%rcx |
|||
movq %r8,%rdi |
|||
movq %rcx,%rbx |
|||
shlq $4,%rcx |
|||
andq %rbp,%rdi |
|||
movq (%rsp,%rsi,8),%xmm1 |
|||
shrq $60,%rbx |
|||
xorq %rcx,%rax |
|||
pslldq $1,%xmm1 |
|||
movq %r8,%rsi |
|||
shrq $4,%rbp |
|||
xorq %rbx,%rdx |
|||
andq %rbp,%rsi |
|||
shrq $4,%rbp |
|||
pxor %xmm1,%xmm0 |
|||
movq (%rsp,%rdi,8),%rcx |
|||
movq %r8,%rdi |
|||
movq %rcx,%rbx |
|||
shlq $12,%rcx |
|||
andq %rbp,%rdi |
|||
movq (%rsp,%rsi,8),%xmm1 |
|||
shrq $52,%rbx |
|||
xorq %rcx,%rax |
|||
pslldq $2,%xmm1 |
|||
movq %r8,%rsi |
|||
shrq $4,%rbp |
|||
xorq %rbx,%rdx |
|||
andq %rbp,%rsi |
|||
shrq $4,%rbp |
|||
pxor %xmm1,%xmm0 |
|||
movq (%rsp,%rdi,8),%rcx |
|||
movq %r8,%rdi |
|||
movq %rcx,%rbx |
|||
shlq $20,%rcx |
|||
andq %rbp,%rdi |
|||
movq (%rsp,%rsi,8),%xmm1 |
|||
shrq $44,%rbx |
|||
xorq %rcx,%rax |
|||
pslldq $3,%xmm1 |
|||
movq %r8,%rsi |
|||
shrq $4,%rbp |
|||
xorq %rbx,%rdx |
|||
andq %rbp,%rsi |
|||
shrq $4,%rbp |
|||
pxor %xmm1,%xmm0 |
|||
movq (%rsp,%rdi,8),%rcx |
|||
movq %r8,%rdi |
|||
movq %rcx,%rbx |
|||
shlq $28,%rcx |
|||
andq %rbp,%rdi |
|||
movq (%rsp,%rsi,8),%xmm1 |
|||
shrq $36,%rbx |
|||
xorq %rcx,%rax |
|||
pslldq $4,%xmm1 |
|||
movq %r8,%rsi |
|||
shrq $4,%rbp |
|||
xorq %rbx,%rdx |
|||
andq %rbp,%rsi |
|||
shrq $4,%rbp |
|||
pxor %xmm1,%xmm0 |
|||
movq (%rsp,%rdi,8),%rcx |
|||
movq %r8,%rdi |
|||
movq %rcx,%rbx |
|||
shlq $36,%rcx |
|||
andq %rbp,%rdi |
|||
movq (%rsp,%rsi,8),%xmm1 |
|||
shrq $28,%rbx |
|||
xorq %rcx,%rax |
|||
pslldq $5,%xmm1 |
|||
movq %r8,%rsi |
|||
shrq $4,%rbp |
|||
xorq %rbx,%rdx |
|||
andq %rbp,%rsi |
|||
shrq $4,%rbp |
|||
pxor %xmm1,%xmm0 |
|||
movq (%rsp,%rdi,8),%rcx |
|||
movq %r8,%rdi |
|||
movq %rcx,%rbx |
|||
shlq $44,%rcx |
|||
andq %rbp,%rdi |
|||
movq (%rsp,%rsi,8),%xmm1 |
|||
shrq $20,%rbx |
|||
xorq %rcx,%rax |
|||
pslldq $6,%xmm1 |
|||
movq %r8,%rsi |
|||
shrq $4,%rbp |
|||
xorq %rbx,%rdx |
|||
andq %rbp,%rsi |
|||
shrq $4,%rbp |
|||
pxor %xmm1,%xmm0 |
|||
movq (%rsp,%rdi,8),%rcx |
|||
movq %r8,%rdi |
|||
movq %rcx,%rbx |
|||
shlq $52,%rcx |
|||
andq %rbp,%rdi |
|||
movq (%rsp,%rsi,8),%xmm1 |
|||
shrq $12,%rbx |
|||
xorq %rcx,%rax |
|||
pslldq $7,%xmm1 |
|||
movq %r8,%rsi |
|||
shrq $4,%rbp |
|||
xorq %rbx,%rdx |
|||
andq %rbp,%rsi |
|||
shrq $4,%rbp |
|||
pxor %xmm1,%xmm0 |
|||
movq (%rsp,%rdi,8),%rcx |
|||
movq %rcx,%rbx |
|||
shlq $60,%rcx |
|||
# Encoded 66 48 0F 7E C6 = movq %xmm0,%rsi (low half of the SSE accumulator).
.byte 102,72,15,126,198 |
|||
shrq $4,%rbx |
|||
xorq %rcx,%rax |
|||
psrldq $8,%xmm0 |
|||
xorq %rbx,%rdx |
|||
# Encoded 66 48 0F 7E C7 = movq %xmm0,%rdi (high half after the psrldq above).
.byte 102,72,15,126,199 |
|||
# Merge the SSE accumulator into the integer result.
xorq %rsi,%rax |
|||
xorq %rdi,%rdx |
|||
|
|||
addq $128+8,%rsp |
|||
# 0xf3,0xc3 = rep ret
.byte 0xf3,0xc3 |
|||
L$end_mul_1x1: |
|||
|
|||
|
|||
# _bn_GF2m_mul_2x2 (Mach-O flavour): carry-less product of two 2-word values,
# written as four 64-bit words to the buffer in %rdi.
# In:  %rdi = output (4 quadwords); (%rsi,%rdx) = words of one operand;
#      (%rcx,%r8) = words of the other.  The %rsi*%rcx product lands in
#      output words 2..3 and the %rdx*%r8 product in words 0..1, each then
#      combined with the cross term.
# NOTE(review): presumably (r, a1, a0, b1, b0) of the C prototype - confirm
# against the declaration used by the callers.
# Three 64x64 products are formed: %rsi*%rcx, %rdx*%r8 and
# (%rsi^%rdx)*(%rcx^%r8); the last one supplies the middle words.
.globl _bn_GF2m_mul_2x2 |
|||
|
|||
.p2align 4 |
|||
_bn_GF2m_mul_2x2: |
|||
# Bit 33 of the quadword at _OPENSSL_ia32cap_P selects the PCLMULQDQ path;
# when it is clear, fall back to the table-driven path.
movq _OPENSSL_ia32cap_P(%rip),%rax |
|||
btq $33,%rax |
|||
jnc L$vanilla_mul_2x2 |
|||
|
|||
# Encoded movq: %rsi->%xmm0, %rcx->%xmm1, %rdx->%xmm2, %r8->%xmm3.
.byte 102,72,15,110,198 |
|||
.byte 102,72,15,110,201 |
|||
.byte 102,72,15,110,210 |
|||
.byte 102,73,15,110,216 |
|||
movdqa %xmm0,%xmm4 |
|||
movdqa %xmm1,%xmm5 |
|||
# Encoded pclmulqdq $0,%xmm1,%xmm0
.byte 102,15,58,68,193,0 |
|||
pxor %xmm2,%xmm4 |
|||
pxor %xmm3,%xmm5 |
|||
# Encoded pclmulqdq $0,%xmm3,%xmm2
.byte 102,15,58,68,211,0 |
|||
# Encoded pclmulqdq $0,%xmm5,%xmm4 (product of the XORed operands)
.byte 102,15,58,68,229,0 |
|||
# %xmm4 ^= both outer products, leaving the middle term, which is then split
# across the upper half of %xmm2 and the lower half of %xmm0.
xorps %xmm0,%xmm4 |
|||
xorps %xmm2,%xmm4 |
|||
movdqa %xmm4,%xmm5 |
|||
pslldq $8,%xmm4 |
|||
psrldq $8,%xmm5 |
|||
pxor %xmm4,%xmm2 |
|||
pxor %xmm5,%xmm0 |
|||
movdqu %xmm2,0(%rdi) |
|||
movdqu %xmm0,16(%rdi) |
|||
# 0xf3,0xc3 = rep ret
.byte 0xf3,0xc3 |
|||
|
|||
.p2align 4 |
|||
L$vanilla_mul_2x2: |
|||
# Frame of 136 bytes: 0..31 hold partial products, 32..71 the five incoming
# arguments, 80..119 the saved %r14, %r13, %r12, %rbp, %rbx.
leaq -136(%rsp),%rsp |
|||
movq %r14,80(%rsp) |
|||
movq %r13,88(%rsp) |
|||
movq %r12,96(%rsp) |
|||
movq %rbp,104(%rsp) |
|||
movq %rbx,112(%rsp) |
|||
L$body_mul_2x2: |
|||
movq %rdi,32(%rsp) |
|||
movq %rsi,40(%rsp) |
|||
movq %rdx,48(%rsp) |
|||
movq %rcx,56(%rsp) |
|||
movq %r8,64(%rsp) |
|||
|
|||
# First product: %rsi * %rcx, with the nibble mask 15 in %r8 for _mul_1x1.
movq $15,%r8 |
|||
movq %rsi,%rax |
|||
movq %rcx,%rbp |
|||
call _mul_1x1 |
|||
|
|||
movq %rax,16(%rsp) |
|||
movq %rdx,24(%rsp) |
|||
|
|||
# Second product: saved %rdx * saved %r8.
movq 48(%rsp),%rax |
|||
movq 64(%rsp),%rbp |
|||
call _mul_1x1 |
|||
|
|||
movq %rax,0(%rsp) |
|||
movq %rdx,8(%rsp) |
|||
|
|||
# Third product: (arg %rsi ^ arg %rdx) * (arg %rcx ^ arg %r8).
movq 40(%rsp),%rax |
|||
movq 56(%rsp),%rbp |
|||
xorq 48(%rsp),%rax |
|||
xorq 64(%rsp),%rbp |
|||
call _mul_1x1 |
|||
|
|||
movq 0(%rsp),%rbx |
|||
movq 8(%rsp),%rcx |
|||
movq 16(%rsp),%rdi |
|||
movq 24(%rsp),%rsi |
|||
movq 32(%rsp),%rbp |
|||
|
|||
# Fold the three partial products together with XORs and store the four
# result words at 0, 8, 16 and 24(%rbp), where %rbp is the saved output pointer.
xorq %rdx,%rax |
|||
xorq %rcx,%rdx |
|||
xorq %rbx,%rax |
|||
movq %rbx,0(%rbp) |
|||
xorq %rdi,%rdx |
|||
movq %rsi,24(%rbp) |
|||
xorq %rsi,%rax |
|||
xorq %rsi,%rdx |
|||
xorq %rdx,%rax |
|||
movq %rdx,16(%rbp) |
|||
movq %rax,8(%rbp) |
|||
|
|||
# Restore the saved registers and release the frame.
movq 80(%rsp),%r14 |
|||
movq 88(%rsp),%r13 |
|||
movq 96(%rsp),%r12 |
|||
movq 104(%rsp),%rbp |
|||
movq 112(%rsp),%rbx |
|||
leaq 136(%rsp),%rsp |
|||
.byte 0xf3,0xc3 |
|||
L$end_mul_2x2: |
|||
|
|||
# NUL-terminated ASCII identification string; it begins
# "GF(2^m) Multiplication for x86_64, CRYPTOGAMS by" followed by an e-mail address.
.byte 71,70,40,50,94,109,41,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 |
|||
.p2align 4 |
@ -0,0 +1,785 @@ |
|||
.text |
|||
|
|||
|
|||
# _bn_mul_mont_gather5 -- Montgomery multiplication whose multiplier words
# are gathered from a scattered table (see the ident string at the end of
# this file).
#
# Register use as established by the code of this routine:
#   rdi      result vector (written by the subtract/copy tail)
#   rsi      multiplicand words
#   rdx      base of the scattered table (rows are 256 bytes apart)
#   rcx      modulus words (subtracted in the tail)
#   r8       pointer to a 64-bit constant used to derive the per-pass
#            reduction multiplier
#   r9d      word count (zero-extended before use)
#   8(%rsp)  32-bit table index ("power"), read at function entry
# Returns rax = 1.
#
# Dispatch: word counts that are a multiple of 4 and at least 8 take the
# 4-way unrolled path at L$mul4x_enter; everything else uses L$mul_enter.
.globl _bn_mul_mont_gather5 |
|||
|
|||
.p2align 6 |
|||
_bn_mul_mont_gather5: |
|||
testl $3,%r9d |
|||
jnz L$mul_enter |
|||
cmpl $8,%r9d |
|||
jb L$mul_enter |
|||
jmp L$mul4x_enter |
|||
|
|||
# Prologue of the one-word-at-a-time path.
# Zero-extends the word count, fetches the 32-bit stack argument into r10d
# (must happen before the pushes move rsp), saves the six callee-saved
# registers, then carves a temporary vector of (r9 + 2) qwords below them.
# rsp is rounded down to a 1024-byte boundary; the pre-allocation rsp is
# parked in the qword at index r9 + 1 of the temporary so the epilogue can
# find the saved registers again.
.p2align 4 |
|||
L$mul_enter: |
|||
movl %r9d,%r9d |
|||
movl 8(%rsp),%r10d |
|||
pushq %rbx |
|||
pushq %rbp |
|||
pushq %r12 |
|||
pushq %r13 |
|||
pushq %r14 |
|||
pushq %r15 |
|||
movq %rsp,%rax |
|||
leaq 2(%r9),%r11 |
|||
negq %r11 |
|||
leaq (%rsp,%r11,8),%rsp |
|||
andq $-1024,%rsp |
|||
|
|||
movq %rax,8(%rsp,%r9,8) |
|||
# Set up the table gather and start the first pass.
#   r11 = index & 7         -> 8-byte column inside a 64-byte line
#   r10 = ~(index >> 3) & 3 -> starting qword inside L$magic_masks
#   r12 = table + 96 + column*8, so offsets -96/-32/32/96 address the same
#         column in four consecutive 64-byte lines of a 256-byte row
#   xmm4..xmm7 = four consecutive mask qwords from L$magic_masks
L$mul_body: |
|||
movq %rdx,%r12 |
|||
movq %r10,%r11 |
|||
shrq $3,%r10 |
|||
andq $7,%r11 |
|||
notq %r10 |
|||
leaq L$magic_masks(%rip),%rax |
|||
andq $3,%r10 |
|||
leaq 96(%r12,%r11,8),%r12 |
|||
movq 0(%rax,%r10,8),%xmm4 |
|||
movq 8(%rax,%r10,8),%xmm5 |
|||
movq 16(%rax,%r10,8),%xmm6 |
|||
movq 24(%rax,%r10,8),%xmm7 |
|||
|
|||
# Gather one word: load all four candidates, AND each with its mask, OR
# the results together, then step r12 to the next 256-byte row.
# NOTE(review): touching all four lines presumably hides which one is
# wanted from the cache -- confirm against the generator's documentation.
movq -96(%r12),%xmm0 |
|||
movq -32(%r12),%xmm1 |
|||
pand %xmm4,%xmm0 |
|||
movq 32(%r12),%xmm2 |
|||
pand %xmm5,%xmm1 |
|||
movq 96(%r12),%xmm3 |
|||
pand %xmm6,%xmm2 |
|||
por %xmm1,%xmm0 |
|||
pand %xmm7,%xmm3 |
|||
por %xmm2,%xmm0 |
|||
leaq 256(%r12),%r12 |
|||
por %xmm3,%xmm0 |
|||
|
|||
# Hand-encoded "movq %xmm0,%rbx": rbx = first gathered multiplier word.
.byte 102,72,15,126,195 |
|||
|
|||
# r8 now holds the constant itself instead of a pointer to it;
# rax = first multiplicand word.
movq (%r8),%r8 |
|||
movq (%rsi),%rax |
|||
|
|||
# r14 = outer (multiplier word) counter, r15 = inner (word) counter.
xorq %r14,%r14 |
|||
xorq %r15,%r15 |
|||
|
|||
# From here the gather of the NEXT multiplier word is interleaved with the
# integer multiplications of the current pass.
movq -96(%r12),%xmm0 |
|||
movq -32(%r12),%xmm1 |
|||
pand %xmm4,%xmm0 |
|||
movq 32(%r12),%xmm2 |
|||
pand %xmm5,%xmm1 |
|||
|
|||
movq %r8,%rbp |
|||
mulq %rbx |
|||
movq %rax,%r10 |
|||
movq (%rcx),%rax |
|||
|
|||
movq 96(%r12),%xmm3 |
|||
pand %xmm6,%xmm2 |
|||
por %xmm1,%xmm0 |
|||
pand %xmm7,%xmm3 |
|||
|
|||
# rbp = (low word of the first product) * constant: the reduction
# multiplier for this pass.  r11 keeps the high word of the first product.
imulq %r10,%rbp |
|||
movq %rdx,%r11 |
|||
|
|||
por %xmm2,%xmm0 |
|||
leaq 256(%r12),%r12 |
|||
por %xmm3,%xmm0 |
|||
|
|||
# modulus[0] * rbp is added to r10 only for its carry (kept in r13);
# the low word itself is not stored.
mulq %rbp |
|||
addq %rax,%r10 |
|||
movq 8(%rsi),%rax |
|||
adcq $0,%rdx |
|||
movq %rdx,%r13 |
|||
|
|||
leaq 1(%r15),%r15 |
|||
jmp L$1st_enter |
|||
|
|||
# First pass over the words: nothing is accumulated from the temporary
# vector yet.  Two carry chains run in parallel:
#   r11/r10 : multiplicand[j] * rbx      (rbx = current multiplier word)
#   r13     : modulus[j]      * rbp      (rbp = reduction multiplier)
# Each iteration stores one finished word through -16(%rsp,%r15,8); r15 has
# already been advanced when the store happens.
.p2align 4 |
|||
L$1st: |
|||
addq %rax,%r13 |
|||
movq (%rsi,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
addq %r11,%r13 |
|||
movq %r10,%r11 |
|||
adcq $0,%rdx |
|||
movq %r13,-16(%rsp,%r15,8) |
|||
movq %rdx,%r13 |
|||
|
|||
L$1st_enter: |
|||
mulq %rbx |
|||
addq %rax,%r11 |
|||
movq (%rcx,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
leaq 1(%r15),%r15 |
|||
movq %rdx,%r10 |
|||
|
|||
mulq %rbp |
|||
cmpq %r9,%r15 |
|||
jne L$1st |
|||
|
|||
# Hand-encoded "movq %xmm0,%rbx": rbx = multiplier word for the next pass.
.byte 102,72,15,126,195 |
|||
|
|||
# Loop tail: finish the last word, reload multiplicand[0] for the next
# pass, then store the two top words (index r9-1 and the carry at index r9).
addq %rax,%r13 |
|||
movq (%rsi),%rax |
|||
adcq $0,%rdx |
|||
addq %r11,%r13 |
|||
adcq $0,%rdx |
|||
movq %r13,-16(%rsp,%r15,8) |
|||
movq %rdx,%r13 |
|||
movq %r10,%r11 |
|||
|
|||
xorq %rdx,%rdx |
|||
addq %r11,%r13 |
|||
adcq $0,%rdx |
|||
movq %r13,-8(%rsp,%r9,8) |
|||
movq %rdx,(%rsp,%r9,8) |
|||
|
|||
# One multiplier word has been consumed.
leaq 1(%r14),%r14 |
|||
jmp L$outer |
|||
# Head of every pass after the first.  On entry rbx is the current
# multiplier word and rax = multiplicand[0].  The low word of the
# temporary vector is folded in before the reduction multiplier rbp is
# derived, and the gather of the following multiplier word (r12 row
# pointer, xmm4..xmm7 masks) is interleaved with the multiplications.
.p2align 4 |
|||
L$outer: |
|||
xorq %r15,%r15 |
|||
movq %r8,%rbp |
|||
movq (%rsp),%r10 |
|||
|
|||
movq -96(%r12),%xmm0 |
|||
movq -32(%r12),%xmm1 |
|||
pand %xmm4,%xmm0 |
|||
movq 32(%r12),%xmm2 |
|||
pand %xmm5,%xmm1 |
|||
|
|||
mulq %rbx |
|||
addq %rax,%r10 |
|||
movq (%rcx),%rax |
|||
adcq $0,%rdx |
|||
|
|||
movq 96(%r12),%xmm3 |
|||
pand %xmm6,%xmm2 |
|||
por %xmm1,%xmm0 |
|||
pand %xmm7,%xmm3 |
|||
|
|||
# rbp = (low accumulated word) * constant held in r8.
imulq %r10,%rbp |
|||
movq %rdx,%r11 |
|||
|
|||
por %xmm2,%xmm0 |
|||
leaq 256(%r12),%r12 |
|||
por %xmm3,%xmm0 |
|||
|
|||
# Only the carry of this addition survives (in r13); r10 is immediately
# reloaded with the next word of the temporary vector.
mulq %rbp |
|||
addq %rax,%r10 |
|||
movq 8(%rsi),%rax |
|||
adcq $0,%rdx |
|||
movq 8(%rsp),%r10 |
|||
movq %rdx,%r13 |
|||
|
|||
leaq 1(%r15),%r15 |
|||
jmp L$inner_enter |
|||
|
|||
# Inner loop of the later passes: same two carry chains as the first pass
# (multiplicand[j]*rbx in r11, modulus[j]*rbp in r13) but the previous
# contents of the temporary vector (r10) are added in as well.  Results are
# written back one slot lower than they were read.
.p2align 4 |
|||
L$inner: |
|||
addq %rax,%r13 |
|||
movq (%rsi,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
addq %r10,%r13 |
|||
movq (%rsp,%r15,8),%r10 |
|||
adcq $0,%rdx |
|||
movq %r13,-16(%rsp,%r15,8) |
|||
movq %rdx,%r13 |
|||
|
|||
L$inner_enter: |
|||
mulq %rbx |
|||
addq %rax,%r11 |
|||
movq (%rcx,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
addq %r11,%r10 |
|||
movq %rdx,%r11 |
|||
adcq $0,%r11 |
|||
leaq 1(%r15),%r15 |
|||
|
|||
mulq %rbp |
|||
cmpq %r9,%r15 |
|||
jne L$inner |
|||
|
|||
# Hand-encoded "movq %xmm0,%rbx": rbx = multiplier word for the next pass.
.byte 102,72,15,126,195 |
|||
|
|||
# Tail: last regular word, then the two top words including the carry word
# read from index r9 of the temporary vector.
addq %rax,%r13 |
|||
movq (%rsi),%rax |
|||
adcq $0,%rdx |
|||
addq %r10,%r13 |
|||
movq (%rsp,%r15,8),%r10 |
|||
adcq $0,%rdx |
|||
movq %r13,-16(%rsp,%r15,8) |
|||
movq %rdx,%r13 |
|||
|
|||
xorq %rdx,%rdx |
|||
addq %r11,%r13 |
|||
adcq $0,%rdx |
|||
addq %r10,%r13 |
|||
adcq $0,%rdx |
|||
movq %r13,-8(%rsp,%r9,8) |
|||
movq %rdx,(%rsp,%r9,8) |
|||
|
|||
# Next multiplier word; the comparison is signed (jl), r9 was zero-extended
# from 32 bits in the prologue.
leaq 1(%r14),%r14 |
|||
cmpq %r9,%r14 |
|||
jl L$outer |
|||
|
|||
# Final conditional subtraction, part 1: result = temporary - modulus.
# The xorq clears CF for the first sbbq; leaq/decq/movq inside the loop do
# not disturb CF, so the borrow propagates across iterations.
xorq %r14,%r14 |
|||
movq (%rsp),%rax |
|||
leaq (%rsp),%rsi |
|||
movq %r9,%r15 |
|||
jmp L$sub |
|||
.p2align 4 |
|||
L$sub: sbbq (%rcx,%r14,8),%rax |
|||
movq %rax,(%rdi,%r14,8) |
|||
movq 8(%rsi,%r14,8),%rax |
|||
leaq 1(%r14),%r14 |
|||
decq %r15 |
|||
jnz L$sub |
|||
|
|||
# rax = top carry word minus the final borrow, then used as a bit mask to
# pick the copy source without branching:
#   rsi = (temporary & mask) | (result & ~mask)
# NOTE(review): this relies on rax being exactly 0 or all-ones here --
# confirm against the algorithm's bounds.
sbbq $0,%rax |
|||
xorq %r14,%r14 |
|||
andq %rax,%rsi |
|||
notq %rax |
|||
movq %rdi,%rcx |
|||
andq %rax,%rcx |
|||
movq %r9,%r15 |
|||
orq %rcx,%rsi |
|||
# Final conditional subtraction, part 2: copy r9 words from the selected
# source (rsi) to the result (rdi).  Each word of the on-stack temporary is
# overwritten with its own index (r14) as it is passed, so the intermediate
# value does not stay on the stack.
.p2align 4 |
|||
L$copy: |
|||
movq (%rsi,%r14,8),%rax |
|||
movq %r14,(%rsp,%r14,8) |
|||
movq %rax,(%rdi,%r14,8) |
|||
leaq 1(%r14),%r14 |
|||
subq $1,%r15 |
|||
jnz L$copy |
|||
|
|||
# Epilogue: fetch the stack pointer parked at index r9 + 1 of the
# temporary, return 1, restore the six callee-saved registers in reverse
# push order and release the frame.
movq 8(%rsp,%r9,8),%rsi |
|||
movq $1,%rax |
|||
movq (%rsi),%r15 |
|||
movq 8(%rsi),%r14 |
|||
movq 16(%rsi),%r13 |
|||
movq 24(%rsi),%r12 |
|||
movq 32(%rsi),%rbp |
|||
movq 40(%rsi),%rbx |
|||
leaq 48(%rsi),%rsp |
|||
L$mul_epilogue: |
|||
# Hand-encoded "rep ret".
.byte 0xf3,0xc3 |
|||
|
|||
|
|||
# bn_mul4x_mont_gather5 -- 4-way unrolled variant, reached from
# _bn_mul_mont_gather5 through L$mul4x_enter when the word count is a
# multiple of 4 and at least 8.  Same register arguments as that routine.
#
# Prologue: zero-extend the word count, fetch the 32-bit stack argument
# into r10d, save the callee-saved registers and carve (r9 + 4) qwords of
# 1024-byte-aligned scratch.  The pre-allocation rsp is parked at index
# r9 + 1 of the scratch area.
.p2align 4 |
|||
bn_mul4x_mont_gather5: |
|||
L$mul4x_enter: |
|||
movl %r9d,%r9d |
|||
movl 8(%rsp),%r10d |
|||
pushq %rbx |
|||
pushq %rbp |
|||
pushq %r12 |
|||
pushq %r13 |
|||
pushq %r14 |
|||
pushq %r15 |
|||
movq %rsp,%rax |
|||
leaq 4(%r9),%r11 |
|||
negq %r11 |
|||
leaq (%rsp,%r11,8),%rsp |
|||
andq $-1024,%rsp |
|||
|
|||
movq %rax,8(%rsp,%r9,8) |
|||
# Set-up and first two words of the first pass (4x path).
# rdi (result pointer) is parked at index r9 + 2 of the scratch area
# because rdi is reused as an accumulator below.
L$mul4x_body: |
|||
movq %rdi,16(%rsp,%r9,8) |
|||
# r11 = index & 7 (column), r10 = ~(index >> 3) & 3 (mask offset),
# r12 = table + 96 + column*8, xmm4..xmm7 = four mask qwords.
movq %rdx,%r12 |
|||
movq %r10,%r11 |
|||
shrq $3,%r10 |
|||
andq $7,%r11 |
|||
notq %r10 |
|||
leaq L$magic_masks(%rip),%rax |
|||
andq $3,%r10 |
|||
leaq 96(%r12,%r11,8),%r12 |
|||
movq 0(%rax,%r10,8),%xmm4 |
|||
movq 8(%rax,%r10,8),%xmm5 |
|||
movq 16(%rax,%r10,8),%xmm6 |
|||
movq 24(%rax,%r10,8),%xmm7 |
|||
|
|||
# Gather the first multiplier word: four masked loads ORed together, then
# advance r12 by one 256-byte row.
movq -96(%r12),%xmm0 |
|||
movq -32(%r12),%xmm1 |
|||
pand %xmm4,%xmm0 |
|||
movq 32(%r12),%xmm2 |
|||
pand %xmm5,%xmm1 |
|||
movq 96(%r12),%xmm3 |
|||
pand %xmm6,%xmm2 |
|||
por %xmm1,%xmm0 |
|||
pand %xmm7,%xmm3 |
|||
por %xmm2,%xmm0 |
|||
leaq 256(%r12),%r12 |
|||
por %xmm3,%xmm0 |
|||
|
|||
# Hand-encoded "movq %xmm0,%rbx": rbx = first multiplier word.
.byte 102,72,15,126,195 |
|||
movq (%r8),%r8 |
|||
movq (%rsi),%rax |
|||
|
|||
# r14 = outer counter, r15 = inner word index.
xorq %r14,%r14 |
|||
xorq %r15,%r15 |
|||
|
|||
# Gather of the next multiplier word, interleaved with the multiplies.
movq -96(%r12),%xmm0 |
|||
movq -32(%r12),%xmm1 |
|||
pand %xmm4,%xmm0 |
|||
movq 32(%r12),%xmm2 |
|||
pand %xmm5,%xmm1 |
|||
|
|||
movq %r8,%rbp |
|||
mulq %rbx |
|||
movq %rax,%r10 |
|||
movq (%rcx),%rax |
|||
|
|||
movq 96(%r12),%xmm3 |
|||
pand %xmm6,%xmm2 |
|||
por %xmm1,%xmm0 |
|||
pand %xmm7,%xmm3 |
|||
|
|||
# rbp = reduction multiplier for this pass.
imulq %r10,%rbp |
|||
movq %rdx,%r11 |
|||
|
|||
por %xmm2,%xmm0 |
|||
leaq 256(%r12),%r12 |
|||
por %xmm3,%xmm0 |
|||
|
|||
mulq %rbp |
|||
addq %rax,%r10 |
|||
movq 8(%rsi),%rax |
|||
adcq $0,%rdx |
|||
movq %rdx,%rdi |
|||
|
|||
mulq %rbx |
|||
addq %rax,%r11 |
|||
movq 8(%rcx),%rax |
|||
adcq $0,%rdx |
|||
movq %rdx,%r10 |
|||
|
|||
mulq %rbp |
|||
addq %rax,%rdi |
|||
movq 16(%rsi),%rax |
|||
adcq $0,%rdx |
|||
addq %r11,%rdi |
|||
leaq 4(%r15),%r15 |
|||
adcq $0,%rdx |
|||
# First finished word goes to slot 0 of the scratch vector.
movq %rdi,(%rsp) |
|||
movq %rdx,%r13 |
|||
jmp L$1st4x |
|||
# First pass, four words per iteration.  The multiplicand*rbx chain
# alternates between r10 and r11, the modulus*rbp chain between r13 and
# rdi; each finished word is stored to the scratch vector at (%rsp).
# r15 advances by 4 inside the body, which is why the last store of an
# iteration uses a -32 displacement.
.p2align 4 |
|||
L$1st4x: |
|||
mulq %rbx |
|||
addq %rax,%r10 |
|||
movq -16(%rcx,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
movq %rdx,%r11 |
|||
|
|||
mulq %rbp |
|||
addq %rax,%r13 |
|||
movq -8(%rsi,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
addq %r10,%r13 |
|||
adcq $0,%rdx |
|||
movq %r13,-24(%rsp,%r15,8) |
|||
movq %rdx,%rdi |
|||
|
|||
mulq %rbx |
|||
addq %rax,%r11 |
|||
movq -8(%rcx,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
movq %rdx,%r10 |
|||
|
|||
mulq %rbp |
|||
addq %rax,%rdi |
|||
movq (%rsi,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
addq %r11,%rdi |
|||
adcq $0,%rdx |
|||
movq %rdi,-16(%rsp,%r15,8) |
|||
movq %rdx,%r13 |
|||
|
|||
mulq %rbx |
|||
addq %rax,%r10 |
|||
movq (%rcx,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
movq %rdx,%r11 |
|||
|
|||
mulq %rbp |
|||
addq %rax,%r13 |
|||
movq 8(%rsi,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
addq %r10,%r13 |
|||
adcq $0,%rdx |
|||
movq %r13,-8(%rsp,%r15,8) |
|||
movq %rdx,%rdi |
|||
|
|||
mulq %rbx |
|||
addq %rax,%r11 |
|||
movq 8(%rcx,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
leaq 4(%r15),%r15 |
|||
movq %rdx,%r10 |
|||
|
|||
mulq %rbp |
|||
addq %rax,%rdi |
|||
movq -16(%rsi,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
addq %r11,%rdi |
|||
adcq $0,%rdx |
|||
movq %rdi,-32(%rsp,%r15,8) |
|||
movq %rdx,%r13 |
|||
cmpq %r9,%r15 |
|||
jl L$1st4x |
|||
|
|||
# Tail of the first pass: the last two words are handled outside the loop.
mulq %rbx |
|||
addq %rax,%r10 |
|||
movq -16(%rcx,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
movq %rdx,%r11 |
|||
|
|||
mulq %rbp |
|||
addq %rax,%r13 |
|||
movq -8(%rsi,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
addq %r10,%r13 |
|||
adcq $0,%rdx |
|||
movq %r13,-24(%rsp,%r15,8) |
|||
movq %rdx,%rdi |
|||
|
|||
mulq %rbx |
|||
addq %rax,%r11 |
|||
movq -8(%rcx,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
movq %rdx,%r10 |
|||
|
|||
# rax is reloaded with multiplicand[0] for the next pass.
mulq %rbp |
|||
addq %rax,%rdi |
|||
movq (%rsi),%rax |
|||
adcq $0,%rdx |
|||
addq %r11,%rdi |
|||
adcq $0,%rdx |
|||
movq %rdi,-16(%rsp,%r15,8) |
|||
movq %rdx,%r13 |
|||
|
|||
# Hand-encoded "movq %xmm0,%rbx": rbx = multiplier word for the next pass.
.byte 102,72,15,126,195 |
|||
|
|||
# Top word and carry word of the scratch vector.
xorq %rdi,%rdi |
|||
addq %r10,%r13 |
|||
adcq $0,%rdi |
|||
movq %r13,-8(%rsp,%r15,8) |
|||
movq %rdi,(%rsp,%r15,8) |
|||
|
|||
leaq 1(%r14),%r14 |
|||
# Head of every later pass (4x path).  rbx = current multiplier word,
# rax = multiplicand[0].  Folds scratch[0] and scratch[1] into the first
# two products, derives the reduction multiplier rbp, and interleaves the
# gather of the following multiplier word.
.p2align 2 |
|||
L$outer4x: |
|||
xorq %r15,%r15 |
|||
movq -96(%r12),%xmm0 |
|||
movq -32(%r12),%xmm1 |
|||
pand %xmm4,%xmm0 |
|||
movq 32(%r12),%xmm2 |
|||
pand %xmm5,%xmm1 |
|||
|
|||
movq (%rsp),%r10 |
|||
movq %r8,%rbp |
|||
mulq %rbx |
|||
addq %rax,%r10 |
|||
movq (%rcx),%rax |
|||
adcq $0,%rdx |
|||
|
|||
movq 96(%r12),%xmm3 |
|||
pand %xmm6,%xmm2 |
|||
por %xmm1,%xmm0 |
|||
pand %xmm7,%xmm3 |
|||
|
|||
# rbp = (low accumulated word) * constant held in r8.
imulq %r10,%rbp |
|||
movq %rdx,%r11 |
|||
|
|||
por %xmm2,%xmm0 |
|||
leaq 256(%r12),%r12 |
|||
por %xmm3,%xmm0 |
|||
|
|||
mulq %rbp |
|||
addq %rax,%r10 |
|||
movq 8(%rsi),%rax |
|||
adcq $0,%rdx |
|||
movq %rdx,%rdi |
|||
|
|||
mulq %rbx |
|||
addq %rax,%r11 |
|||
movq 8(%rcx),%rax |
|||
adcq $0,%rdx |
|||
addq 8(%rsp),%r11 |
|||
adcq $0,%rdx |
|||
movq %rdx,%r10 |
|||
|
|||
mulq %rbp |
|||
addq %rax,%rdi |
|||
movq 16(%rsi),%rax |
|||
adcq $0,%rdx |
|||
addq %r11,%rdi |
|||
leaq 4(%r15),%r15 |
|||
adcq $0,%rdx |
|||
movq %rdx,%r13 |
|||
jmp L$inner4x |
|||
# Inner loop of the later passes, four words per iteration.  Identical in
# shape to the first-pass loop, except that every multiplicand*rbx partial
# sum also adds the word previously stored in the scratch vector, and the
# finished words are written back one slot lower (stores lag the loads).
.p2align 4 |
|||
L$inner4x: |
|||
mulq %rbx |
|||
addq %rax,%r10 |
|||
movq -16(%rcx,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
addq -16(%rsp,%r15,8),%r10 |
|||
adcq $0,%rdx |
|||
movq %rdx,%r11 |
|||
|
|||
mulq %rbp |
|||
addq %rax,%r13 |
|||
movq -8(%rsi,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
addq %r10,%r13 |
|||
adcq $0,%rdx |
|||
movq %rdi,-32(%rsp,%r15,8) |
|||
movq %rdx,%rdi |
|||
|
|||
mulq %rbx |
|||
addq %rax,%r11 |
|||
movq -8(%rcx,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
addq -8(%rsp,%r15,8),%r11 |
|||
adcq $0,%rdx |
|||
movq %rdx,%r10 |
|||
|
|||
mulq %rbp |
|||
addq %rax,%rdi |
|||
movq (%rsi,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
addq %r11,%rdi |
|||
adcq $0,%rdx |
|||
movq %r13,-24(%rsp,%r15,8) |
|||
movq %rdx,%r13 |
|||
|
|||
mulq %rbx |
|||
addq %rax,%r10 |
|||
movq (%rcx,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
addq (%rsp,%r15,8),%r10 |
|||
adcq $0,%rdx |
|||
movq %rdx,%r11 |
|||
|
|||
mulq %rbp |
|||
addq %rax,%r13 |
|||
movq 8(%rsi,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
addq %r10,%r13 |
|||
adcq $0,%rdx |
|||
movq %rdi,-16(%rsp,%r15,8) |
|||
movq %rdx,%rdi |
|||
|
|||
mulq %rbx |
|||
addq %rax,%r11 |
|||
movq 8(%rcx,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
addq 8(%rsp,%r15,8),%r11 |
|||
adcq $0,%rdx |
|||
leaq 4(%r15),%r15 |
|||
movq %rdx,%r10 |
|||
|
|||
mulq %rbp |
|||
addq %rax,%rdi |
|||
movq -16(%rsi,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
addq %r11,%rdi |
|||
adcq $0,%rdx |
|||
movq %r13,-40(%rsp,%r15,8) |
|||
movq %rdx,%r13 |
|||
cmpq %r9,%r15 |
|||
jl L$inner4x |
|||
|
|||
# Tail: last two words of this pass.
mulq %rbx |
|||
addq %rax,%r10 |
|||
movq -16(%rcx,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
addq -16(%rsp,%r15,8),%r10 |
|||
adcq $0,%rdx |
|||
movq %rdx,%r11 |
|||
|
|||
mulq %rbp |
|||
addq %rax,%r13 |
|||
movq -8(%rsi,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
addq %r10,%r13 |
|||
adcq $0,%rdx |
|||
movq %rdi,-32(%rsp,%r15,8) |
|||
movq %rdx,%rdi |
|||
|
|||
mulq %rbx |
|||
addq %rax,%r11 |
|||
movq -8(%rcx,%r15,8),%rax |
|||
adcq $0,%rdx |
|||
addq -8(%rsp,%r15,8),%r11 |
|||
adcq $0,%rdx |
|||
# The outer counter is bumped here; leaq leaves the flags alone.
leaq 1(%r14),%r14 |
|||
movq %rdx,%r10 |
|||
|
|||
# rax is reloaded with multiplicand[0] for the next pass.
mulq %rbp |
|||
addq %rax,%rdi |
|||
movq (%rsi),%rax |
|||
adcq $0,%rdx |
|||
addq %r11,%rdi |
|||
adcq $0,%rdx |
|||
movq %r13,-24(%rsp,%r15,8) |
|||
movq %rdx,%r13 |
|||
|
|||
# Hand-encoded "movq %xmm0,%rbx": rbx = multiplier word for the next pass.
.byte 102,72,15,126,195 |
|||
movq %rdi,-16(%rsp,%r15,8) |
|||
|
|||
# Top word: add the previous carry word (scratch index r9) and store the
# new top word and carry word.
xorq %rdi,%rdi |
|||
addq %r10,%r13 |
|||
adcq $0,%rdi |
|||
addq (%rsp,%r9,8),%r13 |
|||
adcq $0,%rdi |
|||
movq %r13,-8(%rsp,%r15,8) |
|||
movq %rdi,(%rsp,%r15,8) |
|||
|
|||
cmpq %r9,%r14 |
|||
jl L$outer4x |
|||
# Final conditional subtraction (4x path), part 1.
# Reload the result pointer parked at scratch index r9 + 2, zero xmm0 for
# the later wipe, and turn r9 into a count of 4-word groups.
movq 16(%rsp,%r9,8),%rdi |
|||
movq 0(%rsp),%rax |
|||
pxor %xmm0,%xmm0 |
|||
movq 8(%rsp),%rdx |
|||
shrq $2,%r9 |
|||
leaq (%rsp),%rsi |
|||
xorq %r14,%r14 |
|||
|
|||
# subq starts the borrow chain; the movq/leaq/jmp that follow do not touch
# CF, so every later sbbq continues it.  r15 = groups - 1 because the last
# group is finished after the loop.
subq 0(%rcx),%rax |
|||
movq 16(%rsi),%rbx |
|||
movq 24(%rsi),%rbp |
|||
sbbq 8(%rcx),%rdx |
|||
leaq -1(%r9),%r15 |
|||
jmp L$sub4x |
|||
.p2align 4 |
|||
L$sub4x: |
|||
movq %rax,0(%rdi,%r14,8) |
|||
movq %rdx,8(%rdi,%r14,8) |
|||
sbbq 16(%rcx,%r14,8),%rbx |
|||
movq 32(%rsi,%r14,8),%rax |
|||
movq 40(%rsi,%r14,8),%rdx |
|||
sbbq 24(%rcx,%r14,8),%rbp |
|||
movq %rbx,16(%rdi,%r14,8) |
|||
movq %rbp,24(%rdi,%r14,8) |
|||
sbbq 32(%rcx,%r14,8),%rax |
|||
movq 48(%rsi,%r14,8),%rbx |
|||
movq 56(%rsi,%r14,8),%rbp |
|||
sbbq 40(%rcx,%r14,8),%rdx |
|||
leaq 4(%r14),%r14 |
|||
decq %r15 |
|||
jnz L$sub4x |
|||
|
|||
# Last group; the load from 32(%rsi,%r14,8) fetches the carry word that
# sits just above the r9*4 data words.
movq %rax,0(%rdi,%r14,8) |
|||
movq 32(%rsi,%r14,8),%rax |
|||
sbbq 16(%rcx,%r14,8),%rbx |
|||
movq %rdx,8(%rdi,%r14,8) |
|||
sbbq 24(%rcx,%r14,8),%rbp |
|||
movq %rbx,16(%rdi,%r14,8) |
|||
|
|||
# rax = carry word minus final borrow, used as a mask to select the copy
# source without branching: rsi = (scratch & mask) | (result & ~mask).
# NOTE(review): relies on rax being exactly 0 or all-ones -- confirm.
sbbq $0,%rax |
|||
movq %rbp,24(%rdi,%r14,8) |
|||
xorq %r14,%r14 |
|||
andq %rax,%rsi |
|||
notq %rax |
|||
movq %rdi,%rcx |
|||
andq %rax,%rcx |
|||
leaq -1(%r9),%r15 |
|||
orq %rcx,%rsi |
|||
|
|||
# Final conditional subtraction (4x path), part 2: copy the selected source
# to the result 16 bytes at a time while overwriting the scratch vector
# with zeros (xmm0).  The scratch stores use movdqa, which is valid because
# rsp was rounded to a 1024-byte boundary in the prologue; the source and
# destination use unaligned moves.
# Layout: 16 bytes before the loop, 32 bytes per iteration for
# (groups - 1) iterations, 16 bytes after the loop.
movdqu (%rsi),%xmm1 |
|||
movdqa %xmm0,(%rsp) |
|||
movdqu %xmm1,(%rdi) |
|||
jmp L$copy4x |
|||
.p2align 4 |
|||
L$copy4x: |
|||
movdqu 16(%rsi,%r14,1),%xmm2 |
|||
movdqu 32(%rsi,%r14,1),%xmm1 |
|||
movdqa %xmm0,16(%rsp,%r14,1) |
|||
movdqu %xmm2,16(%rdi,%r14,1) |
|||
movdqa %xmm0,32(%rsp,%r14,1) |
|||
movdqu %xmm1,32(%rdi,%r14,1) |
|||
leaq 32(%r14),%r14 |
|||
decq %r15 |
|||
jnz L$copy4x |
|||
|
|||
# Restore r9 to a word count so the parked stack pointer can be located.
shlq $2,%r9 |
|||
movdqu 16(%rsi,%r14,1),%xmm2 |
|||
movdqa %xmm0,16(%rsp,%r14,1) |
|||
movdqu %xmm2,16(%rdi,%r14,1) |
|||
# Epilogue: return 1, restore callee-saved registers, release the frame.
movq 8(%rsp,%r9,8),%rsi |
|||
movq $1,%rax |
|||
movq (%rsi),%r15 |
|||
movq 8(%rsi),%r14 |
|||
movq 16(%rsi),%r13 |
|||
movq 24(%rsi),%r12 |
|||
movq 32(%rsi),%rbp |
|||
movq 40(%rsi),%rbx |
|||
leaq 48(%rsi),%rsp |
|||
L$mul4x_epilogue: |
|||
# Hand-encoded "rep ret".
.byte 0xf3,0xc3 |
|||
|
|||
# _bn_scatter5 -- spread a vector of 64-bit words through a table.
#   rdi  source words
#   rsi  number of words (0 returns immediately)
#   rdx  table base
#   rcx  slot index; word j is stored at table + rcx*8 + j*256
# Clobbers rax, rdx, rdi, rsi and the flags.
.globl _bn_scatter5 |
|||
|
|||
.p2align 4 |
|||
_bn_scatter5: |
|||
cmpq $0,%rsi |
|||
jz L$scatter_epilogue |
|||
leaq (%rdx,%rcx,8),%rdx |
|||
L$scatter: |
|||
movq (%rdi),%rax |
|||
leaq 8(%rdi),%rdi |
|||
movq %rax,(%rdx) |
|||
leaq 256(%rdx),%rdx |
|||
subq $1,%rsi |
|||
jnz L$scatter |
|||
L$scatter_epilogue: |
|||
# Hand-encoded "rep ret".
.byte 0xf3,0xc3 |
|||
|
|||
|
|||
# _bn_gather5 -- collect one scattered vector back out of a table.
#   rdi  destination words
#   rsi  number of words (0 returns immediately without touching memory)
#   rdx  table base (rows are 256 bytes apart)
#   rcx  slot index: bits 0-2 pick the 8-byte column, bits 3-4 pick which
#        of the four 64-byte lines of a row holds the wanted word
# Clobbers rax, rcx, rdx, rdi, rsi, r11, xmm0-xmm7 and the flags.
#
# A zero word count is rejected up front, matching _bn_scatter5.  Without
# the guard the "subq $1 / jnz" loop below would wrap the counter and keep
# writing far past the destination.
.globl _bn_gather5 |
|||
|
|||
.p2align 4 |
|||
_bn_gather5: |
|||
testq %rsi,%rsi |
|||
jz L$gather_epilogue |
|||
# r11 = index & 7, rcx = ~(index >> 3) & 3 (offset into L$magic_masks),
# rdx = table + 96 + column*8, xmm4..xmm7 = four consecutive mask qwords.
movq %rcx,%r11 |
|||
shrq $3,%rcx |
|||
andq $7,%r11 |
|||
notq %rcx |
|||
leaq L$magic_masks(%rip),%rax |
|||
andq $3,%rcx |
|||
leaq 96(%rdx,%r11,8),%rdx |
|||
movq 0(%rax,%rcx,8),%xmm4 |
|||
movq 8(%rax,%rcx,8),%xmm5 |
|||
movq 16(%rax,%rcx,8),%xmm6 |
|||
movq 24(%rax,%rcx,8),%xmm7 |
|||
jmp L$gather |
|||
.p2align 4 |
|||
# Per word: load the candidate from each of the four lines, AND it with
# its mask, OR the four together, store, and step to the next row.
L$gather: |
|||
movq -96(%rdx),%xmm0 |
|||
movq -32(%rdx),%xmm1 |
|||
pand %xmm4,%xmm0 |
|||
movq 32(%rdx),%xmm2 |
|||
pand %xmm5,%xmm1 |
|||
movq 96(%rdx),%xmm3 |
|||
pand %xmm6,%xmm2 |
|||
por %xmm1,%xmm0 |
|||
pand %xmm7,%xmm3 |
|||
por %xmm2,%xmm0 |
|||
leaq 256(%rdx),%rdx |
|||
por %xmm3,%xmm0 |
|||
|
|||
movq %xmm0,(%rdi) |
|||
leaq 8(%rdi),%rdi |
|||
subq $1,%rsi |
|||
jnz L$gather |
|||
L$gather_epilogue: |
|||
# Hand-encoded "rep ret".
.byte 0xf3,0xc3 |
|||
L$SEH_end_bn_gather5: |
|||
|
|||
# Mask table for the gather code.  Read as eight qwords it is
#   0, 0, 0, all-ones, 0, 0, 0, 0
# The gather code loads four consecutive qwords starting at offset
# (~(index >> 3) & 3) * 8, so exactly one of the four loaded masks is
# all-ones and the other three are zero.
.p2align 6 |
|||
L$magic_masks: |
|||
.long 0,0, 0,0, 0,0, -1,-1 |
|||
.long 0,0, 0,0, 0,0, 0,0 |
|||
# NUL-terminated ASCII identification string: "Montgomery Multiplication
# with scatter/gather for x86_64, CRYPTOGAMS by <appro@openssl.org>"
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 |
@ -0,0 +1,404 @@ |
|||
OPTION DOTNAME |
|||
.text$ SEGMENT ALIGN(64) 'CODE' |
|||
|
|||
|
|||
ALIGN 16 |
|||
; _mul_1x1 -- 64x64 -> 128-bit carry-less (XOR) multiplication.
; Private helper with a register-based calling convention:
;   in : rax = a, rbp = b, r8 = nibble mask (the caller in this file
;        passes 0fh)
;   out: rdx:rax = product (rdx high, rax low)
; Overwrites rbx, rcx, rsi, rdi, rbp, r9-r14, xmm0, xmm1 and the flags;
; r8 is only read.  Uses 128 bytes of stack for a 16-entry table.
_mul_1x1 PROC PRIVATE |
|||
sub rsp,128+8 |
|||
; r9 = a with its top three bits cleared, so that 2*r9, 4*r9 and 8*r9
; cannot overflow.  The three dropped bits are handled separately: each is
; broadcast into a full mask with sar 63, ANDed with b, and the shifted
; copies of b are XORed directly into rdx:rax.
mov r9,-1 |
|||
lea rsi,QWORD PTR[rax*1+rax] |
|||
shr r9,3 |
|||
lea rdi,QWORD PTR[rax*4] |
|||
and r9,rax |
|||
lea r12,QWORD PTR[rax*8] |
|||
sar rax,63 |
|||
lea r10,QWORD PTR[r9*1+r9] |
|||
sar rsi,63 |
|||
lea r11,QWORD PTR[r9*4] |
|||
and rax,rbp |
|||
sar rdi,63 |
|||
mov rdx,rax |
|||
shl rax,63 |
|||
and rsi,rbp |
|||
shr rdx,1 |
|||
mov rcx,rsi |
|||
shl rsi,62 |
|||
and rdi,rbp |
|||
shr rcx,2 |
|||
xor rax,rsi |
|||
mov rbx,rdi |
|||
shl rdi,61 |
|||
xor rdx,rcx |
|||
shr rbx,3 |
|||
xor rax,rdi |
|||
xor rdx,rbx |
|||
|
|||
; Build the table at [rsp]: entry i is the XOR of r9, 2*r9, 4*r9 and 8*r9
; selected by the bits of i (entry 0 = 0, entry 1 = r9, entry 2 = 2*r9,
; entry 3 = r9 xor 2*r9, and so on up to entry 15).
mov r13,r9 |
|||
mov QWORD PTR[rsp],0 |
|||
xor r13,r10 |
|||
mov QWORD PTR[8+rsp],r9 |
|||
mov r14,r11 |
|||
mov QWORD PTR[16+rsp],r10 |
|||
xor r14,r12 |
|||
mov QWORD PTR[24+rsp],r13 |
|||
|
|||
xor r9,r11 |
|||
mov QWORD PTR[32+rsp],r11 |
|||
xor r10,r11 |
|||
mov QWORD PTR[40+rsp],r9 |
|||
xor r13,r11 |
|||
mov QWORD PTR[48+rsp],r10 |
|||
xor r9,r14 |
|||
mov QWORD PTR[56+rsp],r13 |
|||
xor r10,r14 |
|||
|
|||
mov QWORD PTR[64+rsp],r12 |
|||
xor r13,r14 |
|||
mov QWORD PTR[72+rsp],r9 |
|||
xor r9,r11 |
|||
mov QWORD PTR[80+rsp],r10 |
|||
xor r10,r11 |
|||
mov QWORD PTR[88+rsp],r13 |
|||
|
|||
; The last table stores are interleaved with extracting the first two
; nibbles of b (rsi, rdi = table indices).
xor r13,r11 |
|||
mov QWORD PTR[96+rsp],r14 |
|||
mov rsi,r8 |
|||
mov QWORD PTR[104+rsp],r9 |
|||
and rsi,rbp |
|||
mov QWORD PTR[112+rsp],r10 |
|||
shr rbp,4 |
|||
mov QWORD PTR[120+rsp],r13 |
|||
mov rdi,r8 |
|||
and rdi,rbp |
|||
shr rbp,4 |
|||
|
|||
; Main accumulation.  b is consumed four bits at a time.  Table entries for
; nibbles at bit offsets 0, 8, 16, ... are accumulated in xmm0 using byte
; shifts (pslldq); entries for nibbles at offsets 4, 12, 20, ... are
; shifted in general registers and XORed into rdx:rax.
movq xmm0,QWORD PTR[rsi*8+rsp] |
|||
mov rsi,r8 |
|||
and rsi,rbp |
|||
shr rbp,4 |
|||
mov rcx,QWORD PTR[rdi*8+rsp] |
|||
mov rdi,r8 |
|||
mov rbx,rcx |
|||
shl rcx,4 |
|||
and rdi,rbp |
|||
movq xmm1,QWORD PTR[rsi*8+rsp] |
|||
shr rbx,60 |
|||
xor rax,rcx |
|||
pslldq xmm1,1 |
|||
mov rsi,r8 |
|||
shr rbp,4 |
|||
xor rdx,rbx |
|||
and rsi,rbp |
|||
shr rbp,4 |
|||
pxor xmm0,xmm1 |
|||
mov rcx,QWORD PTR[rdi*8+rsp] |
|||
mov rdi,r8 |
|||
mov rbx,rcx |
|||
shl rcx,12 |
|||
and rdi,rbp |
|||
movq xmm1,QWORD PTR[rsi*8+rsp] |
|||
shr rbx,52 |
|||
xor rax,rcx |
|||
pslldq xmm1,2 |
|||
mov rsi,r8 |
|||
shr rbp,4 |
|||
xor rdx,rbx |
|||
and rsi,rbp |
|||
shr rbp,4 |
|||
pxor xmm0,xmm1 |
|||
mov rcx,QWORD PTR[rdi*8+rsp] |
|||
mov rdi,r8 |
|||
mov rbx,rcx |
|||
shl rcx,20 |
|||
and rdi,rbp |
|||
movq xmm1,QWORD PTR[rsi*8+rsp] |
|||
shr rbx,44 |
|||
xor rax,rcx |
|||
pslldq xmm1,3 |
|||
mov rsi,r8 |
|||
shr rbp,4 |
|||
xor rdx,rbx |
|||
and rsi,rbp |
|||
shr rbp,4 |
|||
pxor xmm0,xmm1 |
|||
mov rcx,QWORD PTR[rdi*8+rsp] |
|||
mov rdi,r8 |
|||
mov rbx,rcx |
|||
shl rcx,28 |
|||
and rdi,rbp |
|||
movq xmm1,QWORD PTR[rsi*8+rsp] |
|||
shr rbx,36 |
|||
xor rax,rcx |
|||
pslldq xmm1,4 |
|||
mov rsi,r8 |
|||
shr rbp,4 |
|||
xor rdx,rbx |
|||
and rsi,rbp |
|||
shr rbp,4 |
|||
pxor xmm0,xmm1 |
|||
mov rcx,QWORD PTR[rdi*8+rsp] |
|||
mov rdi,r8 |
|||
mov rbx,rcx |
|||
shl rcx,36 |
|||
and rdi,rbp |
|||
movq xmm1,QWORD PTR[rsi*8+rsp] |
|||
shr rbx,28 |
|||
xor rax,rcx |
|||
pslldq xmm1,5 |
|||
mov rsi,r8 |
|||
shr rbp,4 |
|||
xor rdx,rbx |
|||
and rsi,rbp |
|||
shr rbp,4 |
|||
pxor xmm0,xmm1 |
|||
mov rcx,QWORD PTR[rdi*8+rsp] |
|||
mov rdi,r8 |
|||
mov rbx,rcx |
|||
shl rcx,44 |
|||
and rdi,rbp |
|||
movq xmm1,QWORD PTR[rsi*8+rsp] |
|||
shr rbx,20 |
|||
xor rax,rcx |
|||
pslldq xmm1,6 |
|||
mov rsi,r8 |
|||
shr rbp,4 |
|||
xor rdx,rbx |
|||
and rsi,rbp |
|||
shr rbp,4 |
|||
pxor xmm0,xmm1 |
|||
mov rcx,QWORD PTR[rdi*8+rsp] |
|||
mov rdi,r8 |
|||
mov rbx,rcx |
|||
shl rcx,52 |
|||
and rdi,rbp |
|||
movq xmm1,QWORD PTR[rsi*8+rsp] |
|||
shr rbx,12 |
|||
xor rax,rcx |
|||
pslldq xmm1,7 |
|||
mov rsi,r8 |
|||
shr rbp,4 |
|||
xor rdx,rbx |
|||
and rsi,rbp |
|||
shr rbp,4 |
|||
pxor xmm0,xmm1 |
|||
mov rcx,QWORD PTR[rdi*8+rsp] |
|||
mov rbx,rcx |
|||
shl rcx,60 |
|||
; Hand-encoded "movq rsi,xmm0": low half of the SSE accumulator.
DB 102,72,15,126,198 |
|||
shr rbx,4 |
|||
xor rax,rcx |
|||
psrldq xmm0,8 |
|||
xor rdx,rbx |
|||
; Hand-encoded "movq rdi,xmm0": high half of the SSE accumulator.
DB 102,72,15,126,199 |
|||
; Merge the two accumulators into the result.
xor rax,rsi |
|||
xor rdx,rdi |
|||
|
|||
add rsp,128+8 |
|||
DB 0F3h,0C3h ;repret |
|||
$L$end_mul_1x1:: |
|||
_mul_1x1 ENDP |
|||
EXTERN OPENSSL_ia32cap_P:NEAR |
|||
PUBLIC bn_GF2m_mul_2x2 |
|||
|
|||
ALIGN 16 |
|||
; bn_GF2m_mul_2x2 -- 128x128 -> 256-bit carry-less multiplication built
; from three 64x64 products (Karatsuba).  Win64 calling convention:
;   rcx        = pointer to the 4-qword result
;   rdx, r8    = high and low word of the first operand
;   r9, [40+rsp] = high and low word of the second operand
; Bit 33 of OPENSSL_ia32cap_P selects the PCLMULQDQ path; otherwise the
; table-driven $L$vanilla_mul_2x2 path below is used.
bn_GF2m_mul_2x2 PROC PUBLIC |
|||
mov rax,QWORD PTR[OPENSSL_ia32cap_P] |
|||
bt rax,33 |
|||
jnc $L$vanilla_mul_2x2 |
|||
|
|||
; Hand-encoded "movq xmm0,rdx", "movq xmm1,r9", "movq xmm2,r8".
DB 102,72,15,110,194 |
|||
DB 102,73,15,110,201 |
|||
DB 102,73,15,110,208 |
|||
movq xmm3,QWORD PTR[40+rsp] |
|||
movdqa xmm4,xmm0 |
|||
movdqa xmm5,xmm1 |
|||
; Hand-encoded "pclmulqdq xmm0,xmm1,0": high*high product.
DB 102,15,58,68,193,0 |
|||
pxor xmm4,xmm2 |
|||
pxor xmm5,xmm3 |
|||
; Hand-encoded "pclmulqdq xmm2,xmm3,0" (low*low) and
; "pclmulqdq xmm4,xmm5,0" ((high xor low) * (high xor low)).
DB 102,15,58,68,211,0 |
|||
DB 102,15,58,68,229,0 |
|||
; Middle term = xmm4 xor high product xor low product; its low half is
; XORed into the upper half of xmm2, its high half into the lower half of
; xmm0.
xorps xmm4,xmm0 |
|||
xorps xmm4,xmm2 |
|||
movdqa xmm5,xmm4 |
|||
pslldq xmm4,8 |
|||
psrldq xmm5,8 |
|||
pxor xmm2,xmm4 |
|||
pxor xmm0,xmm5 |
|||
movdqu XMMWORD PTR[rcx],xmm2 |
|||
movdqu XMMWORD PTR[16+rcx],xmm0 |
|||
DB 0F3h,0C3h ;repret |
|||
|
|||
ALIGN 16 |
|||
; Fallback without PCLMULQDQ.  Frame of 136 bytes:
;   [0..31]    the low*low and high*high products
;   [32..71]   copies of the five arguments
;   [80..135]  saved r14, r13, r12, rbp, rbx, rdi, rsi
; [176+rsp] is the fifth argument ([40+rsp] before the frame was opened).
$L$vanilla_mul_2x2:: |
|||
lea rsp,QWORD PTR[((-136))+rsp] |
|||
mov r10,QWORD PTR[176+rsp] |
|||
mov QWORD PTR[120+rsp],rdi |
|||
mov QWORD PTR[128+rsp],rsi |
|||
mov QWORD PTR[80+rsp],r14 |
|||
mov QWORD PTR[88+rsp],r13 |
|||
mov QWORD PTR[96+rsp],r12 |
|||
mov QWORD PTR[104+rsp],rbp |
|||
mov QWORD PTR[112+rsp],rbx |
|||
$L$body_mul_2x2:: |
|||
mov QWORD PTR[32+rsp],rcx |
|||
mov QWORD PTR[40+rsp],rdx |
|||
mov QWORD PTR[48+rsp],r8 |
|||
mov QWORD PTR[56+rsp],r9 |
|||
mov QWORD PTR[64+rsp],r10 |
|||
|
|||
; r8 = nibble mask expected by _mul_1x1 (it is not modified by the
; callee, so it is set only once).  First product: high * high.
mov r8,0fh |
|||
mov rax,rdx |
|||
mov rbp,r9 |
|||
call _mul_1x1 |
|||
|
|||
mov QWORD PTR[16+rsp],rax |
|||
mov QWORD PTR[24+rsp],rdx |
|||
|
|||
; Second product: low * low.
mov rax,QWORD PTR[48+rsp] |
|||
mov rbp,QWORD PTR[64+rsp] |
|||
call _mul_1x1 |
|||
|
|||
mov QWORD PTR[rsp],rax |
|||
mov QWORD PTR[8+rsp],rdx |
|||
|
|||
; Third product: (high xor low) * (high xor low).
mov rax,QWORD PTR[40+rsp] |
|||
mov rbp,QWORD PTR[56+rsp] |
|||
xor rax,QWORD PTR[48+rsp] |
|||
xor rbp,QWORD PTR[64+rsp] |
|||
call _mul_1x1 |
|||
|
|||
; Combine the three products and store the four result words through the
; saved result pointer (rbp).
mov rbx,QWORD PTR[rsp] |
|||
mov rcx,QWORD PTR[8+rsp] |
|||
mov rdi,QWORD PTR[16+rsp] |
|||
mov rsi,QWORD PTR[24+rsp] |
|||
mov rbp,QWORD PTR[32+rsp] |
|||
|
|||
xor rax,rdx |
|||
xor rdx,rcx |
|||
xor rax,rbx |
|||
mov QWORD PTR[rbp],rbx |
|||
xor rdx,rdi |
|||
mov QWORD PTR[24+rbp],rsi |
|||
xor rax,rsi |
|||
xor rdx,rsi |
|||
xor rax,rdx |
|||
mov QWORD PTR[16+rbp],rdx |
|||
mov QWORD PTR[8+rbp],rax |
|||
|
|||
; Restore the saved registers and release the frame.
mov r14,QWORD PTR[80+rsp] |
|||
mov r13,QWORD PTR[88+rsp] |
|||
mov r12,QWORD PTR[96+rsp] |
|||
mov rbp,QWORD PTR[104+rsp] |
|||
mov rbx,QWORD PTR[112+rsp] |
|||
mov rdi,QWORD PTR[120+rsp] |
|||
mov rsi,QWORD PTR[128+rsp] |
|||
lea rsp,QWORD PTR[136+rsp] |
|||
DB 0F3h,0C3h ;repret |
|||
$L$end_mul_2x2:: |
|||
bn_GF2m_mul_2x2 ENDP |
|||
; NUL-terminated ASCII identification string:
; "GF(2^m) Multiplication for x86_64, CRYPTOGAMS by <appro@openssl.org>"
DB 71,70,40,50,94,109,41,32,77,117,108,116,105,112,108,105 |
|||
DB 99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54 |
|||
DB 52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 |
|||
DB 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 |
|||
DB 111,114,103,62,0 |
|||
ALIGN 16 |
|||
EXTERN __imp_RtlVirtualUnwind:NEAR |
|||
|
|||
|
|||
ALIGN 16 |
|||
; se_handler -- language-specific exception handler registered (through
; $L$SEH_info_2x2) for the $L$vanilla_mul_2x2 .. $L$end_mul_2x2 range.
; Arguments follow the Win64 handler convention; this code uses
;   r8 = CONTEXT record of the faulting frame
;   r9 = DISPATCHER_CONTEXT
; It rewrites the CONTEXT so that it describes the caller of
; bn_GF2m_mul_2x2, then lets RtlVirtualUnwind continue.  Returns eax = 1.
; Offsets into CONTEXT used below: 144 Rbx, 152 Rsp, 160 Rbp, 168 Rsi,
; 176 Rdi, 216 R12, 224 R13, 232 R14, 248 Rip (standard Win64 layout).
se_handler PROC PRIVATE |
|||
push rsi |
|||
push rdi |
|||
push rbx |
|||
push rbp |
|||
push r12 |
|||
push r13 |
|||
push r14 |
|||
push r15 |
|||
pushfq |
|||
sub rsp,64 |
|||
|
|||
; rax = faulting Rsp, rbx = faulting Rip.
mov rax,QWORD PTR[152+r8] |
|||
mov rbx,QWORD PTR[248+r8] |
|||
|
|||
; Before $L$body_mul_2x2 the non-volatile registers still hold the caller's
; values (they have only been copied to the frame), so nothing has to be
; recovered from the stack.
lea r10,QWORD PTR[$L$body_mul_2x2] |
|||
cmp rbx,r10 |
|||
jb $L$in_prologue |
|||
|
|||
; Past the prologue: reload the saved registers from the same frame slots
; that $L$vanilla_mul_2x2 stored them in ...
mov r14,QWORD PTR[80+rax] |
|||
mov r13,QWORD PTR[88+rax] |
|||
mov r12,QWORD PTR[96+rax] |
|||
mov rbp,QWORD PTR[104+rax] |
|||
mov rbx,QWORD PTR[112+rax] |
|||
mov rdi,QWORD PTR[120+rax] |
|||
mov rsi,QWORD PTR[128+rax] |
|||
|
|||
; ... and write them into the CONTEXT record.
mov QWORD PTR[144+r8],rbx |
|||
mov QWORD PTR[160+r8],rbp |
|||
mov QWORD PTR[168+r8],rsi |
|||
mov QWORD PTR[176+r8],rdi |
|||
mov QWORD PTR[216+r8],r12 |
|||
mov QWORD PTR[224+r8],r13 |
|||
mov QWORD PTR[232+r8],r14 |
|||
|
|||
$L$in_prologue:: |
|||
; Undo the 136-byte frame allocation in the recorded Rsp.
lea rax,QWORD PTR[136+rax] |
|||
mov QWORD PTR[152+r8],rax |
|||
|
|||
; Copy the adjusted CONTEXT (154 qwords) into the record pointed to by
; offset 40 of the dispatcher context.  The DD below is the hand-encoded
; byte sequence "cld; rep movsq".
mov rdi,QWORD PTR[40+r9] |
|||
mov rsi,r8 |
|||
mov ecx,154 |
|||
DD 0a548f3fch |
|||
|
|||
|
|||
; Call RtlVirtualUnwind.  Register arguments: rcx = 0, rdx/r8/r9 loaded
; from dispatcher-context offsets 8, 0 and 16.  Stack arguments at
; [32..56+rsp]: the value at offset 40, the addresses of offsets 56 and 24,
; and a NULL.
mov rsi,r9 |
|||
xor rcx,rcx |
|||
mov rdx,QWORD PTR[8+rsi] |
|||
mov r8,QWORD PTR[rsi] |
|||
mov r9,QWORD PTR[16+rsi] |
|||
mov r10,QWORD PTR[40+rsi] |
|||
lea r11,QWORD PTR[56+rsi] |
|||
lea r12,QWORD PTR[24+rsi] |
|||
mov QWORD PTR[32+rsp],r10 |
|||
mov QWORD PTR[40+rsp],r11 |
|||
mov QWORD PTR[48+rsp],r12 |
|||
mov QWORD PTR[56+rsp],rcx |
|||
call QWORD PTR[__imp_RtlVirtualUnwind] |
|||
|
|||
; Return 1 and restore everything pushed on entry.
; NOTE(review): 1 is presumably ExceptionContinueSearch -- confirm against
; the Windows headers.
mov eax,1 |
|||
add rsp,64 |
|||
popfq |
|||
pop r15 |
|||
pop r14 |
|||
pop r13 |
|||
pop r12 |
|||
pop rbp |
|||
pop rbx |
|||
pop rdi |
|||
pop rsi |
|||
DB 0F3h,0C3h ;repret |
|||
se_handler ENDP |
|||
|
|||
.text$ ENDS |
|||
; Win64 function table.  Each .pdata entry is three image-relative
; addresses: start of range, end of range, unwind information.
.pdata SEGMENT READONLY ALIGN(4) |
|||
ALIGN 4 |
|||
DD imagerel _mul_1x1 |
|||
DD imagerel $L$end_mul_1x1 |
|||
DD imagerel $L$SEH_info_1x1 |
|||
|
|||
DD imagerel $L$vanilla_mul_2x2 |
|||
DD imagerel $L$end_mul_2x2 |
|||
DD imagerel $L$SEH_info_2x2 |
|||
.pdata ENDS |
|||
.xdata SEGMENT READONLY ALIGN(8) |
|||
ALIGN 8 |
|||
; _mul_1x1: version 1, no handler, 7-byte prologue, two unwind-code slots
; describing one large stack allocation of 011h * 8 = 136 bytes (matches
; "sub rsp,128+8").
$L$SEH_info_1x1:: |
|||
DB 001h,007h,002h,000h |
|||
DB 007h,001h,011h,000h |
|||
|
|||
; $L$vanilla_mul_2x2 range: no unwind codes; the first byte (9) is
; version 1 with the exception-handler flag set, so the handler named
; next (se_handler) does the unwinding.
$L$SEH_info_2x2:: |
|||
DB 9,0,0,0 |
|||
DD imagerel se_handler |
|||
|
|||
.xdata ENDS |
|||
END |
@ -0,0 +1,990 @@ |
|||
OPTION DOTNAME |
|||
.text$ SEGMENT ALIGN(64) 'CODE' |
|||
|
|||
PUBLIC bn_mul_mont_gather5 |
|||
|
|||
ALIGN 64 |
|||
; bn_mul_mont_gather5 (Win64) -- Montgomery multiplication with the
; multiplier words gathered from a scattered table.
; Win64 arguments on entry: rcx, rdx, r8, r9, [40+rsp], [48+rsp] and a
; 32-bit value at [56+rsp].  The prologue parks rdi/rsi (callee-saved on
; Win64) in the caller-provided home slots and then moves the arguments
; into rdi, rsi, rdx, rcx, r8, r9, which the body uses as follows:
;   rdi result, rsi multiplicand, rdx table, rcx modulus,
;   r8  pointer to the reduction constant, r9d word count.
; Returns rax = 1.
bn_mul_mont_gather5 PROC PUBLIC |
|||
mov QWORD PTR[8+rsp],rdi ;WIN64 prologue |
|||
mov QWORD PTR[16+rsp],rsi |
|||
mov rax,rsp |
|||
$L$SEH_begin_bn_mul_mont_gather5:: |
|||
mov rdi,rcx |
|||
mov rsi,rdx |
|||
mov rdx,r8 |
|||
mov rcx,r9 |
|||
mov r8,QWORD PTR[40+rsp] |
|||
mov r9,QWORD PTR[48+rsp] |
|||
|
|||
|
|||
; Word counts that are a multiple of 4 and at least 8 use the 4-way
; unrolled code at $L$mul4x_enter; all others continue at $L$mul_enter.
test r9d,3 |
|||
jnz $L$mul_enter |
|||
cmp r9d,8 |
|||
jb $L$mul_enter |
|||
jmp $L$mul4x_enter |
|||
|
|||
; Prologue of the one-word-at-a-time path (Win64).
; Zero-extends the word count, reads the 32-bit table index from
; [56+rsp] before rsp moves, pushes the six general callee-saved registers
; and saves xmm6/xmm7 (callee-saved on Win64, used as gather masks) in a
; 40-byte area below them.  Then (r9 + 2) qwords of scratch are carved out,
; rsp is rounded down to a 1024-byte boundary, and the pre-allocation rsp
; is parked at scratch index r9 + 1.
ALIGN 16 |
|||
$L$mul_enter:: |
|||
mov r9d,r9d |
|||
mov r10d,DWORD PTR[56+rsp] |
|||
push rbx |
|||
push rbp |
|||
push r12 |
|||
push r13 |
|||
push r14 |
|||
push r15 |
|||
lea rsp,QWORD PTR[((-40))+rsp] |
|||
movaps XMMWORD PTR[rsp],xmm6 |
|||
movaps XMMWORD PTR[16+rsp],xmm7 |
|||
$L$mul_alloca:: |
|||
mov rax,rsp |
|||
lea r11,QWORD PTR[2+r9] |
|||
neg r11 |
|||
lea rsp,QWORD PTR[r11*8+rsp] |
|||
and rsp,-1024 |
|||
|
|||
mov QWORD PTR[8+r9*8+rsp],rax |
|||
; Set up the table gather and start the first pass.
;   r11 = index & 7         -> 8-byte column inside a 64-byte line
;   r10 = ~(index >> 3) & 3 -> starting qword inside $L$magic_masks
;   r12 = table + 96 + column*8; offsets -96/-32/32/96 then address the
;         same column in the four 64-byte lines of a 256-byte row
;   xmm4..xmm7 = four consecutive mask qwords
$L$mul_body:: |
|||
mov r12,rdx |
|||
mov r11,r10 |
|||
shr r10,3 |
|||
and r11,7 |
|||
not r10 |
|||
lea rax,QWORD PTR[$L$magic_masks] |
|||
and r10,3 |
|||
lea r12,QWORD PTR[96+r11*8+r12] |
|||
movq xmm4,QWORD PTR[r10*8+rax] |
|||
movq xmm5,QWORD PTR[8+r10*8+rax] |
|||
movq xmm6,QWORD PTR[16+r10*8+rax] |
|||
movq xmm7,QWORD PTR[24+r10*8+rax] |
|||
|
|||
; Gather one word: four masked loads ORed together, then step r12 to the
; next 256-byte row.
movq xmm0,QWORD PTR[((-96))+r12] |
|||
movq xmm1,QWORD PTR[((-32))+r12] |
|||
pand xmm0,xmm4 |
|||
movq xmm2,QWORD PTR[32+r12] |
|||
pand xmm1,xmm5 |
|||
movq xmm3,QWORD PTR[96+r12] |
|||
pand xmm2,xmm6 |
|||
por xmm0,xmm1 |
|||
pand xmm3,xmm7 |
|||
por xmm0,xmm2 |
|||
lea r12,QWORD PTR[256+r12] |
|||
por xmm0,xmm3 |
|||
|
|||
; Hand-encoded "movq rbx,xmm0": rbx = first gathered multiplier word.
DB 102,72,15,126,195 |
|||
|
|||
; r8 now holds the constant itself; rax = first multiplicand word.
mov r8,QWORD PTR[r8] |
|||
mov rax,QWORD PTR[rsi] |
|||
|
|||
; r14 = outer counter, r15 = inner counter.
xor r14,r14 |
|||
xor r15,r15 |
|||
|
|||
; The gather of the next multiplier word is interleaved with the integer
; multiplications from here on.
movq xmm0,QWORD PTR[((-96))+r12] |
|||
movq xmm1,QWORD PTR[((-32))+r12] |
|||
pand xmm0,xmm4 |
|||
movq xmm2,QWORD PTR[32+r12] |
|||
pand xmm1,xmm5 |
|||
|
|||
mov rbp,r8 |
|||
mul rbx |
|||
mov r10,rax |
|||
mov rax,QWORD PTR[rcx] |
|||
|
|||
movq xmm3,QWORD PTR[96+r12] |
|||
pand xmm2,xmm6 |
|||
por xmm0,xmm1 |
|||
pand xmm3,xmm7 |
|||
|
|||
; rbp = (low word of the first product) * constant: reduction multiplier
; for this pass.
imul rbp,r10 |
|||
mov r11,rdx |
|||
|
|||
por xmm0,xmm2 |
|||
lea r12,QWORD PTR[256+r12] |
|||
por xmm0,xmm3 |
|||
|
|||
; Only the carry of this addition is kept (r13); the low word is not
; stored.
mul rbp |
|||
add r10,rax |
|||
mov rax,QWORD PTR[8+rsi] |
|||
adc rdx,0 |
|||
mov r13,rdx |
|||
|
|||
lea r15,QWORD PTR[1+r15] |
|||
jmp $L$1st_enter |
|||
|
|||
; First pass over the words; nothing is accumulated from the scratch
; vector yet.  Two carry chains run in parallel:
;   r11/r10 : multiplicand[j] * rbx   (rbx = current multiplier word)
;   r13     : modulus[j]      * rbp   (rbp = reduction multiplier)
; Each iteration stores one finished word at [-16 + r15*8 + rsp]; r15 has
; already been advanced when the store happens.
ALIGN 16 |
|||
$L$1st:: |
|||
add r13,rax |
|||
mov rax,QWORD PTR[r15*8+rsi] |
|||
adc rdx,0 |
|||
add r13,r11 |
|||
mov r11,r10 |
|||
adc rdx,0 |
|||
mov QWORD PTR[((-16))+r15*8+rsp],r13 |
|||
mov r13,rdx |
|||
|
|||
$L$1st_enter:: |
|||
mul rbx |
|||
add r11,rax |
|||
mov rax,QWORD PTR[r15*8+rcx] |
|||
adc rdx,0 |
|||
lea r15,QWORD PTR[1+r15] |
|||
mov r10,rdx |
|||
|
|||
mul rbp |
|||
cmp r15,r9 |
|||
jne $L$1st |
|||
|
|||
; Hand-encoded "movq rbx,xmm0": multiplier word for the next pass.
DB 102,72,15,126,195 |
|||
|
|||
; Tail: last word, reload multiplicand[0], then the two top words
; (index r9-1 and the carry at index r9).
add r13,rax |
|||
mov rax,QWORD PTR[rsi] |
|||
adc rdx,0 |
|||
add r13,r11 |
|||
adc rdx,0 |
|||
mov QWORD PTR[((-16))+r15*8+rsp],r13 |
|||
mov r13,rdx |
|||
mov r11,r10 |
|||
|
|||
xor rdx,rdx |
|||
add r13,r11 |
|||
adc rdx,0 |
|||
mov QWORD PTR[((-8))+r9*8+rsp],r13 |
|||
mov QWORD PTR[r9*8+rsp],rdx |
|||
|
|||
lea r14,QWORD PTR[1+r14] |
|||
jmp $L$outer |
|||
; Head of every pass after the first.  rbx = current multiplier word,
; rax = multiplicand[0].  The low scratch word is folded in before the
; reduction multiplier rbp is derived; the gather of the following
; multiplier word is interleaved with the multiplications.
ALIGN 16 |
|||
$L$outer:: |
|||
xor r15,r15 |
|||
mov rbp,r8 |
|||
mov r10,QWORD PTR[rsp] |
|||
|
|||
movq xmm0,QWORD PTR[((-96))+r12] |
|||
movq xmm1,QWORD PTR[((-32))+r12] |
|||
pand xmm0,xmm4 |
|||
movq xmm2,QWORD PTR[32+r12] |
|||
pand xmm1,xmm5 |
|||
|
|||
mul rbx |
|||
add r10,rax |
|||
mov rax,QWORD PTR[rcx] |
|||
adc rdx,0 |
|||
|
|||
movq xmm3,QWORD PTR[96+r12] |
|||
pand xmm2,xmm6 |
|||
por xmm0,xmm1 |
|||
pand xmm3,xmm7 |
|||
|
|||
; rbp = (low accumulated word) * constant held in r8.
imul rbp,r10 |
|||
mov r11,rdx |
|||
|
|||
por xmm0,xmm2 |
|||
lea r12,QWORD PTR[256+r12] |
|||
por xmm0,xmm3 |
|||
|
|||
; Only the carry survives (r13); r10 is reloaded with the next scratch
; word.
mul rbp |
|||
add r10,rax |
|||
mov rax,QWORD PTR[8+rsi] |
|||
adc rdx,0 |
|||
mov r10,QWORD PTR[8+rsp] |
|||
mov r13,rdx |
|||
|
|||
lea r15,QWORD PTR[1+r15] |
|||
jmp $L$inner_enter |
|||
|
|||
; Inner loop of the later passes: the same two carry chains as the first
; pass, plus the previous scratch contents (r10).  Results are written
; back one slot lower than they were read.
ALIGN 16 |
|||
$L$inner:: |
|||
add r13,rax |
|||
mov rax,QWORD PTR[r15*8+rsi] |
|||
adc rdx,0 |
|||
add r13,r10 |
|||
mov r10,QWORD PTR[r15*8+rsp] |
|||
adc rdx,0 |
|||
mov QWORD PTR[((-16))+r15*8+rsp],r13 |
|||
mov r13,rdx |
|||
|
|||
$L$inner_enter:: |
|||
mul rbx |
|||
add r11,rax |
|||
mov rax,QWORD PTR[r15*8+rcx] |
|||
adc rdx,0 |
|||
add r10,r11 |
|||
mov r11,rdx |
|||
adc r11,0 |
|||
lea r15,QWORD PTR[1+r15] |
|||
|
|||
mul rbp |
|||
cmp r15,r9 |
|||
jne $L$inner |
|||
|
|||
; Hand-encoded "movq rbx,xmm0": multiplier word for the next pass.
DB 102,72,15,126,195 |
|||
|
|||
; Tail: last regular word, then the two top words including the carry word
; read from scratch index r9.
add r13,rax |
|||
mov rax,QWORD PTR[rsi] |
|||
adc rdx,0 |
|||
add r13,r10 |
|||
mov r10,QWORD PTR[r15*8+rsp] |
|||
adc rdx,0 |
|||
mov QWORD PTR[((-16))+r15*8+rsp],r13 |
|||
mov r13,rdx |
|||
|
|||
xor rdx,rdx |
|||
add r13,r11 |
|||
adc rdx,0 |
|||
add r13,r10 |
|||
adc rdx,0 |
|||
mov QWORD PTR[((-8))+r9*8+rsp],r13 |
|||
mov QWORD PTR[r9*8+rsp],rdx |
|||
|
|||
; Next multiplier word; signed compare, r9 was zero-extended from 32 bits.
lea r14,QWORD PTR[1+r14] |
|||
cmp r14,r9 |
|||
jl $L$outer |
|||
|
|||
; Final conditional subtraction, part 1: result = scratch - modulus.
; The xor clears CF for the first sbb; lea/dec/mov inside the loop leave
; CF alone, so the borrow propagates across iterations.
xor r14,r14 |
|||
mov rax,QWORD PTR[rsp] |
|||
lea rsi,QWORD PTR[rsp] |
|||
mov r15,r9 |
|||
jmp $L$sub |
|||
ALIGN 16 |
|||
$L$sub:: sbb rax,QWORD PTR[r14*8+rcx] |
|||
mov QWORD PTR[r14*8+rdi],rax |
|||
mov rax,QWORD PTR[8+r14*8+rsi] |
|||
lea r14,QWORD PTR[1+r14] |
|||
dec r15 |
|||
jnz $L$sub |
|||
|
|||
; rax = top carry word minus the final borrow, used as a mask to select
; the copy source without branching:
;   rsi = (scratch & mask) | (result & ~mask)
; NOTE(review): relies on rax being exactly 0 or all-ones -- confirm.
sbb rax,0 |
|||
xor r14,r14 |
|||
and rsi,rax |
|||
not rax |
|||
mov rcx,rdi |
|||
and rcx,rax |
|||
mov r15,r9 |
|||
or rsi,rcx |
|||
; Final conditional subtraction, part 2: copy r9 words from the selected
; source (rsi) to the result (rdi), overwriting each scratch word with its
; own index (r14) on the way.
ALIGN 16 |
|||
$L$copy:: |
|||
mov rax,QWORD PTR[r14*8+rsi] |
|||
mov QWORD PTR[r14*8+rsp],r14 |
|||
mov QWORD PTR[r14*8+rdi],rax |
|||
lea r14,QWORD PTR[1+r14] |
|||
sub r15,1 |
|||
jnz $L$copy |
|||
|
|||
; Epilogue: fetch the stack pointer parked at scratch index r9 + 1, return
; 1, restore xmm6/xmm7 from the 40-byte save area, then the six pushed
; registers, and finally rdi/rsi from the caller's home slots.
mov rsi,QWORD PTR[8+r9*8+rsp] |
|||
mov rax,1 |
|||
movaps xmm6,XMMWORD PTR[rsi] |
|||
movaps xmm7,XMMWORD PTR[16+rsi] |
|||
lea rsi,QWORD PTR[40+rsi] |
|||
mov r15,QWORD PTR[rsi] |
|||
mov r14,QWORD PTR[8+rsi] |
|||
mov r13,QWORD PTR[16+rsi] |
|||
mov r12,QWORD PTR[24+rsi] |
|||
mov rbp,QWORD PTR[32+rsi] |
|||
mov rbx,QWORD PTR[40+rsi] |
|||
lea rsp,QWORD PTR[48+rsi] |
|||
$L$mul_epilogue:: |
|||
mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue |
|||
mov rsi,QWORD PTR[16+rsp] |
|||
DB 0F3h,0C3h ;repret |
|||
$L$SEH_end_bn_mul_mont_gather5:: |
|||
bn_mul_mont_gather5 ENDP |
|||
|
|||
ALIGN 16 |
|||
; bn_mul4x_mont_gather5: 4-way unrolled multiply/reduce routine (the banner |
; string in this file names the module "Montgomery Multiplication with |
; scatter/gather"). Win64 entry: the first four arguments arrive in rcx, rdx, |
; r8, r9 and three more are read from the stack; they are remapped below to |
; rdi, rsi, rdx, rcx, r8, r9 and r10d. Returns rax = 1. |
; NOTE(review): the argument meanings (result, a, b-table, modulus, n0, num, |
; table index) are inferred from usage -- confirm against the C prototype. |
bn_mul4x_mont_gather5 PROC PRIVATE |
|||
; Spill rdi/rsi (callee-saved on Win64) above the return address. |
mov QWORD PTR[8+rsp],rdi ;WIN64 prologue |
|||
mov QWORD PTR[16+rsp],rsi |
|||
mov rax,rsp |
|||
$L$SEH_begin_bn_mul4x_mont_gather5:: |
|||
; Remap the incoming argument registers: rdi<-rcx, rsi<-rdx, rdx<-r8, |
; rcx<-r9, r8<-[40+rsp], r9<-[48+rsp]. |
mov rdi,rcx |
|||
mov rsi,rdx |
|||
mov rdx,r8 |
|||
mov rcx,r9 |
|||
mov r8,QWORD PTR[40+rsp] |
|||
mov r9,QWORD PTR[48+rsp] |
|||
|
|||
|
|||
$L$mul4x_enter:: |
|||
; r9 = zero-extended r9d (word count); r10d = stack argument at [56+rsp]. |
mov r9d,r9d |
|||
mov r10d,DWORD PTR[56+rsp] |
|||
; Save the six callee-saved GPRs used below. |
push rbx |
|||
push rbp |
|||
push r12 |
|||
push r13 |
|||
push r14 |
|||
push r15 |
|||
; Reserve 40 bytes and save xmm6/xmm7; both are reloaded in the epilogue. |
lea rsp,QWORD PTR[((-40))+rsp] |
|||
movaps XMMWORD PTR[rsp],xmm6 |
|||
movaps XMMWORD PTR[16+rsp],xmm7 |
|||
$L$mul4x_alloca:: |
|||
; rax = rsp after the register saves. Carve (r9+4)*8 bytes below it and |
; round rsp down to a 1024-byte boundary. |
mov rax,rsp |
|||
lea r11,QWORD PTR[4+r9] |
|||
neg r11 |
|||
lea rsp,QWORD PTR[r11*8+rsp] |
|||
and rsp,-1024 |
|||
|
|||
; Keep the pre-allocation rsp at [8+r9*8+rsp]; the epilogue reloads it. |
mov QWORD PTR[8+r9*8+rsp],rax |
|||
$L$mul4x_body:: |
|||
; Stash rdi at [16+r9*8+rsp]; rdi is used as scratch until the final pass. |
mov QWORD PTR[16+r9*8+rsp],rdi |
|||
; r12 = table pointer. r11 = r10 & 7 picks a qword column; (~(r10>>3)) & 3 |
; picks which four qwords of $L$magic_masks are loaded into xmm4..xmm7. |
mov r12,rdx |
|||
mov r11,r10 |
|||
shr r10,3 |
|||
and r11,7 |
|||
not r10 |
|||
lea rax,QWORD PTR[$L$magic_masks] |
|||
and r10,3 |
|||
lea r12,QWORD PTR[96+r11*8+r12] |
|||
movq xmm4,QWORD PTR[r10*8+rax] |
|||
movq xmm5,QWORD PTR[8+r10*8+rax] |
|||
movq xmm6,QWORD PTR[16+r10*8+rax] |
|||
movq xmm7,QWORD PTR[24+r10*8+rax] |
|||
|
|||
; Masked gather: four qwords 64 bytes apart are always read, ANDed with |
; xmm4..xmm7 and ORed into xmm0; r12 then advances by 256 bytes. |
movq xmm0,QWORD PTR[((-96))+r12] |
|||
movq xmm1,QWORD PTR[((-32))+r12] |
|||
pand xmm0,xmm4 |
|||
movq xmm2,QWORD PTR[32+r12] |
|||
pand xmm1,xmm5 |
|||
movq xmm3,QWORD PTR[96+r12] |
|||
pand xmm2,xmm6 |
|||
por xmm0,xmm1 |
|||
pand xmm3,xmm7 |
|||
por xmm0,xmm2 |
|||
lea r12,QWORD PTR[256+r12] |
|||
por xmm0,xmm3 |
|||
|
|||
; Bytes 66 48 0F 7E C3 encode "movq rbx,xmm0": rbx = gathered word. |
DB 102,72,15,126,195 |
|||
; r8 = qword that the fifth argument points to; rax = first qword at [rsi]. |
mov r8,QWORD PTR[r8] |
|||
mov rax,QWORD PTR[rsi] |
|||
|
|||
; r14 = outer-pass counter, r15 = word index; both start at zero. |
xor r14,r14 |
|||
xor r15,r15 |
|||
|
|||
; Begin gathering the next table word, interleaved with the multiplies. |
movq xmm0,QWORD PTR[((-96))+r12] |
|||
movq xmm1,QWORD PTR[((-32))+r12] |
|||
pand xmm0,xmm4 |
|||
movq xmm2,QWORD PTR[32+r12] |
|||
pand xmm1,xmm5 |
|||
|
|||
; r10 = low half of [rsi]*rbx. rbp = r8*r10 (after the imul below) is the |
; multiplier applied to the words at [rcx] for this pass. |
mov rbp,r8 |
|||
mul rbx |
|||
mov r10,rax |
|||
mov rax,QWORD PTR[rcx] |
|||
|
|||
movq xmm3,QWORD PTR[96+r12] |
|||
pand xmm2,xmm6 |
|||
por xmm0,xmm1 |
|||
pand xmm3,xmm7 |
|||
|
|||
imul rbp,r10 |
|||
mov r11,rdx |
|||
|
|||
por xmm0,xmm2 |
|||
lea r12,QWORD PTR[256+r12] |
|||
por xmm0,xmm3 |
|||
|
|||
mul rbp |
|||
add r10,rax |
|||
mov rax,QWORD PTR[8+rsi] |
|||
adc rdx,0 |
|||
mov rdi,rdx |
|||
|
|||
mul rbx |
|||
add r11,rax |
|||
mov rax,QWORD PTR[8+rcx] |
|||
adc rdx,0 |
|||
mov r10,rdx |
|||
|
|||
mul rbp |
|||
add rdi,rax |
|||
mov rax,QWORD PTR[16+rsi] |
|||
adc rdx,0 |
|||
add rdi,r11 |
|||
lea r15,QWORD PTR[4+r15] |
|||
adc rdx,0 |
|||
mov QWORD PTR[rsp],rdi |
|||
mov r13,rdx |
|||
jmp $L$1st4x |
|||
ALIGN 16 |
|||
; First pass: four words per iteration. Each step multiplies a word from |
; [rsi] by rbx and a word from [rcx] by rbp, then stores the sum to the |
; rsp buffer. Loops while r15 < r9. |
$L$1st4x:: |
|||
mul rbx |
|||
add r10,rax |
|||
mov rax,QWORD PTR[((-16))+r15*8+rcx] |
|||
adc rdx,0 |
|||
mov r11,rdx |
|||
|
|||
mul rbp |
|||
add r13,rax |
|||
mov rax,QWORD PTR[((-8))+r15*8+rsi] |
|||
adc rdx,0 |
|||
add r13,r10 |
|||
adc rdx,0 |
|||
mov QWORD PTR[((-24))+r15*8+rsp],r13 |
|||
mov rdi,rdx |
|||
|
|||
mul rbx |
|||
add r11,rax |
|||
mov rax,QWORD PTR[((-8))+r15*8+rcx] |
|||
adc rdx,0 |
|||
mov r10,rdx |
|||
|
|||
mul rbp |
|||
add rdi,rax |
|||
mov rax,QWORD PTR[r15*8+rsi] |
|||
adc rdx,0 |
|||
add rdi,r11 |
|||
adc rdx,0 |
|||
mov QWORD PTR[((-16))+r15*8+rsp],rdi |
|||
mov r13,rdx |
|||
|
|||
mul rbx |
|||
add r10,rax |
|||
mov rax,QWORD PTR[r15*8+rcx] |
|||
adc rdx,0 |
|||
mov r11,rdx |
|||
|
|||
mul rbp |
|||
add r13,rax |
|||
mov rax,QWORD PTR[8+r15*8+rsi] |
|||
adc rdx,0 |
|||
add r13,r10 |
|||
adc rdx,0 |
|||
mov QWORD PTR[((-8))+r15*8+rsp],r13 |
|||
mov rdi,rdx |
|||
|
|||
mul rbx |
|||
add r11,rax |
|||
mov rax,QWORD PTR[8+r15*8+rcx] |
|||
adc rdx,0 |
|||
lea r15,QWORD PTR[4+r15] |
|||
mov r10,rdx |
|||
|
|||
mul rbp |
|||
add rdi,rax |
|||
mov rax,QWORD PTR[((-16))+r15*8+rsi] |
|||
adc rdx,0 |
|||
add rdi,r11 |
|||
adc rdx,0 |
|||
mov QWORD PTR[((-32))+r15*8+rsp],rdi |
|||
mov r13,rdx |
|||
cmp r15,r9 |
|||
jl $L$1st4x |
|||
|
|||
; Tail of the first pass: the last two words. rax is reloaded from [rsi] |
; ready for the next pass. |
mul rbx |
|||
add r10,rax |
|||
mov rax,QWORD PTR[((-16))+r15*8+rcx] |
|||
adc rdx,0 |
|||
mov r11,rdx |
|||
|
|||
mul rbp |
|||
add r13,rax |
|||
mov rax,QWORD PTR[((-8))+r15*8+rsi] |
|||
adc rdx,0 |
|||
add r13,r10 |
|||
adc rdx,0 |
|||
mov QWORD PTR[((-24))+r15*8+rsp],r13 |
|||
mov rdi,rdx |
|||
|
|||
mul rbx |
|||
add r11,rax |
|||
mov rax,QWORD PTR[((-8))+r15*8+rcx] |
|||
adc rdx,0 |
|||
mov r10,rdx |
|||
|
|||
mul rbp |
|||
add rdi,rax |
|||
mov rax,QWORD PTR[rsi] |
|||
adc rdx,0 |
|||
add rdi,r11 |
|||
adc rdx,0 |
|||
mov QWORD PTR[((-16))+r15*8+rsp],rdi |
|||
mov r13,rdx |
|||
|
|||
; "movq rbx,xmm0": rbx = the word gathered during this pass. |
DB 102,72,15,126,195 |
|||
|
|||
; Add the two pending carries and store the top two words of the buffer. |
xor rdi,rdi |
|||
add r13,r10 |
|||
adc rdi,0 |
|||
mov QWORD PTR[((-8))+r15*8+rsp],r13 |
|||
mov QWORD PTR[r15*8+rsp],rdi |
|||
|
|||
lea r14,QWORD PTR[1+r14] |
|||
ALIGN 4 |
|||
; Outer loop: one pass per remaining gathered word (while r14 < r9). Each |
; pass also adds in the rsp buffer contents left by the previous pass. |
$L$outer4x:: |
|||
xor r15,r15 |
|||
movq xmm0,QWORD PTR[((-96))+r12] |
|||
movq xmm1,QWORD PTR[((-32))+r12] |
|||
pand xmm0,xmm4 |
|||
movq xmm2,QWORD PTR[32+r12] |
|||
pand xmm1,xmm5 |
|||
|
|||
mov r10,QWORD PTR[rsp] |
|||
mov rbp,r8 |
|||
mul rbx |
|||
add r10,rax |
|||
mov rax,QWORD PTR[rcx] |
|||
adc rdx,0 |
|||
|
|||
movq xmm3,QWORD PTR[96+r12] |
|||
pand xmm2,xmm6 |
|||
por xmm0,xmm1 |
|||
pand xmm3,xmm7 |
|||
|
|||
imul rbp,r10 |
|||
mov r11,rdx |
|||
|
|||
por xmm0,xmm2 |
|||
lea r12,QWORD PTR[256+r12] |
|||
por xmm0,xmm3 |
|||
|
|||
mul rbp |
|||
add r10,rax |
|||
mov rax,QWORD PTR[8+rsi] |
|||
adc rdx,0 |
|||
mov rdi,rdx |
|||
|
|||
mul rbx |
|||
add r11,rax |
|||
mov rax,QWORD PTR[8+rcx] |
|||
adc rdx,0 |
|||
add r11,QWORD PTR[8+rsp] |
|||
adc rdx,0 |
|||
mov r10,rdx |
|||
|
|||
mul rbp |
|||
add rdi,rax |
|||
mov rax,QWORD PTR[16+rsi] |
|||
adc rdx,0 |
|||
add rdi,r11 |
|||
lea r15,QWORD PTR[4+r15] |
|||
adc rdx,0 |
|||
mov r13,rdx |
|||
jmp $L$inner4x |
|||
ALIGN 16 |
|||
; Inner loop: same four-word pattern as $L$1st4x, with the previous buffer |
; word added to each product before it is stored back. |
$L$inner4x:: |
|||
mul rbx |
|||
add r10,rax |
|||
mov rax,QWORD PTR[((-16))+r15*8+rcx] |
|||
adc rdx,0 |
|||
add r10,QWORD PTR[((-16))+r15*8+rsp] |
|||
adc rdx,0 |
|||
mov r11,rdx |
|||
|
|||
mul rbp |
|||
add r13,rax |
|||
mov rax,QWORD PTR[((-8))+r15*8+rsi] |
|||
adc rdx,0 |
|||
add r13,r10 |
|||
adc rdx,0 |
|||
mov QWORD PTR[((-32))+r15*8+rsp],rdi |
|||
mov rdi,rdx |
|||
|
|||
mul rbx |
|||
add r11,rax |
|||
mov rax,QWORD PTR[((-8))+r15*8+rcx] |
|||
adc rdx,0 |
|||
add r11,QWORD PTR[((-8))+r15*8+rsp] |
|||
adc rdx,0 |
|||
mov r10,rdx |
|||
|
|||
mul rbp |
|||
add rdi,rax |
|||
mov rax,QWORD PTR[r15*8+rsi] |
|||
adc rdx,0 |
|||
add rdi,r11 |
|||
adc rdx,0 |
|||
mov QWORD PTR[((-24))+r15*8+rsp],r13 |
|||
mov r13,rdx |
|||
|
|||
mul rbx |
|||
add r10,rax |
|||
mov rax,QWORD PTR[r15*8+rcx] |
|||
adc rdx,0 |
|||
add r10,QWORD PTR[r15*8+rsp] |
|||
adc rdx,0 |
|||
mov r11,rdx |
|||
|
|||
mul rbp |
|||
add r13,rax |
|||
mov rax,QWORD PTR[8+r15*8+rsi] |
|||
adc rdx,0 |
|||
add r13,r10 |
|||
adc rdx,0 |
|||
mov QWORD PTR[((-16))+r15*8+rsp],rdi |
|||
mov rdi,rdx |
|||
|
|||
mul rbx |
|||
add r11,rax |
|||
mov rax,QWORD PTR[8+r15*8+rcx] |
|||
adc rdx,0 |
|||
add r11,QWORD PTR[8+r15*8+rsp] |
|||
adc rdx,0 |
|||
lea r15,QWORD PTR[4+r15] |
|||
mov r10,rdx |
|||
|
|||
mul rbp |
|||
add rdi,rax |
|||
mov rax,QWORD PTR[((-16))+r15*8+rsi] |
|||
adc rdx,0 |
|||
add rdi,r11 |
|||
adc rdx,0 |
|||
mov QWORD PTR[((-40))+r15*8+rsp],r13 |
|||
mov r13,rdx |
|||
cmp r15,r9 |
|||
jl $L$inner4x |
|||
|
|||
; Tail of the pass: last two words; r14 is bumped here (lea keeps flags). |
mul rbx |
|||
add r10,rax |
|||
mov rax,QWORD PTR[((-16))+r15*8+rcx] |
|||
adc rdx,0 |
|||
add r10,QWORD PTR[((-16))+r15*8+rsp] |
|||
adc rdx,0 |
|||
mov r11,rdx |
|||
|
|||
mul rbp |
|||
add r13,rax |
|||
mov rax,QWORD PTR[((-8))+r15*8+rsi] |
|||
adc rdx,0 |
|||
add r13,r10 |
|||
adc rdx,0 |
|||
mov QWORD PTR[((-32))+r15*8+rsp],rdi |
|||
mov rdi,rdx |
|||
|
|||
mul rbx |
|||
add r11,rax |
|||
mov rax,QWORD PTR[((-8))+r15*8+rcx] |
|||
adc rdx,0 |
|||
add r11,QWORD PTR[((-8))+r15*8+rsp] |
|||
adc rdx,0 |
|||
lea r14,QWORD PTR[1+r14] |
|||
mov r10,rdx |
|||
|
|||
mul rbp |
|||
add rdi,rax |
|||
mov rax,QWORD PTR[rsi] |
|||
adc rdx,0 |
|||
add rdi,r11 |
|||
adc rdx,0 |
|||
mov QWORD PTR[((-24))+r15*8+rsp],r13 |
|||
mov r13,rdx |
|||
|
|||
; "movq rbx,xmm0": rbx = the word gathered during this pass. |
DB 102,72,15,126,195 |
|||
mov QWORD PTR[((-16))+r15*8+rsp],rdi |
|||
|
|||
; Fold the carries plus the previous top word at [r9*8+rsp], then store the |
; new top two words. |
xor rdi,rdi |
|||
add r13,r10 |
|||
adc rdi,0 |
|||
add r13,QWORD PTR[r9*8+rsp] |
|||
adc rdi,0 |
|||
mov QWORD PTR[((-8))+r15*8+rsp],r13 |
|||
mov QWORD PTR[r15*8+rsp],rdi |
|||
|
|||
cmp r14,r9 |
|||
jl $L$outer4x |
|||
; All passes done. Reload the destination pointer saved at [16+r9*8+rsp] and |
; set up the subtraction of the words at [rcx] from the rsp buffer, written |
; to [rdi]. r9 becomes r9/4 (iteration count); xmm0 is cleared for the wipe. |
mov rdi,QWORD PTR[16+r9*8+rsp] |
|||
mov rax,QWORD PTR[rsp] |
|||
pxor xmm0,xmm0 |
|||
mov rdx,QWORD PTR[8+rsp] |
|||
shr r9,2 |
|||
lea rsi,QWORD PTR[rsp] |
|||
xor r14,r14 |
|||
|
|||
sub rax,QWORD PTR[rcx] |
|||
mov rbx,QWORD PTR[16+rsi] |
|||
mov rbp,QWORD PTR[24+rsi] |
|||
sbb rdx,QWORD PTR[8+rcx] |
|||
lea r15,QWORD PTR[((-1))+r9] |
|||
jmp $L$sub4x |
|||
ALIGN 16 |
|||
; Four words per iteration; the borrow is carried in CF across iterations |
; (mov and lea do not touch flags; dec leaves CF unchanged). |
$L$sub4x:: |
|||
mov QWORD PTR[r14*8+rdi],rax |
|||
mov QWORD PTR[8+r14*8+rdi],rdx |
|||
sbb rbx,QWORD PTR[16+r14*8+rcx] |
|||
mov rax,QWORD PTR[32+r14*8+rsi] |
|||
mov rdx,QWORD PTR[40+r14*8+rsi] |
|||
sbb rbp,QWORD PTR[24+r14*8+rcx] |
|||
mov QWORD PTR[16+r14*8+rdi],rbx |
|||
mov QWORD PTR[24+r14*8+rdi],rbp |
|||
sbb rax,QWORD PTR[32+r14*8+rcx] |
|||
mov rbx,QWORD PTR[48+r14*8+rsi] |
|||
mov rbp,QWORD PTR[56+r14*8+rsi] |
|||
sbb rdx,QWORD PTR[40+r14*8+rcx] |
|||
lea r14,QWORD PTR[4+r14] |
|||
dec r15 |
|||
jnz $L$sub4x |
|||
|
|||
mov QWORD PTR[r14*8+rdi],rax |
|||
mov rax,QWORD PTR[32+r14*8+rsi] |
|||
sbb rbx,QWORD PTR[16+r14*8+rcx] |
|||
mov QWORD PTR[8+r14*8+rdi],rdx |
|||
sbb rbp,QWORD PTR[24+r14*8+rcx] |
|||
mov QWORD PTR[16+r14*8+rdi],rbx |
|||
|
|||
; rax = top buffer word minus the final borrow (presumably 0 or all-ones). |
; It is used as a mask: rsi = (rsi AND rax) OR (rdi AND NOT rax), which |
; selects the copy source without a branch. |
sbb rax,0 |
|||
mov QWORD PTR[24+r14*8+rdi],rbp |
|||
xor r14,r14 |
|||
and rsi,rax |
|||
not rax |
|||
mov rcx,rdi |
|||
and rcx,rax |
|||
lea r15,QWORD PTR[((-1))+r9] |
|||
or rsi,rcx |
|||
|
|||
; Copy the selected source to [rdi] 16 bytes at a time while overwriting |
; the rsp buffer with zeros from xmm0. |
movdqu xmm1,XMMWORD PTR[rsi] |
|||
movdqa XMMWORD PTR[rsp],xmm0 |
|||
movdqu XMMWORD PTR[rdi],xmm1 |
|||
jmp $L$copy4x |
|||
ALIGN 16 |
|||
$L$copy4x:: |
|||
movdqu xmm2,XMMWORD PTR[16+r14*1+rsi] |
|||
movdqu xmm1,XMMWORD PTR[32+r14*1+rsi] |
|||
movdqa XMMWORD PTR[16+r14*1+rsp],xmm0 |
|||
movdqu XMMWORD PTR[16+r14*1+rdi],xmm2 |
|||
movdqa XMMWORD PTR[32+r14*1+rsp],xmm0 |
|||
movdqu XMMWORD PTR[32+r14*1+rdi],xmm1 |
|||
lea r14,QWORD PTR[32+r14] |
|||
dec r15 |
|||
jnz $L$copy4x |
|||
|
|||
; Undo the earlier shr so r9 is the word count again, finish the last 16 |
; bytes, then reload the rsp saved at [8+r9*8+rsp] and unwind the prologue. |
shl r9,2 |
|||
movdqu xmm2,XMMWORD PTR[16+r14*1+rsi] |
|||
movdqa XMMWORD PTR[16+r14*1+rsp],xmm0 |
|||
movdqu XMMWORD PTR[16+r14*1+rdi],xmm2 |
|||
mov rsi,QWORD PTR[8+r9*8+rsp] |
|||
mov rax,1 |
|||
movaps xmm6,XMMWORD PTR[rsi] |
|||
movaps xmm7,XMMWORD PTR[16+rsi] |
|||
lea rsi,QWORD PTR[40+rsi] |
|||
mov r15,QWORD PTR[rsi] |
|||
mov r14,QWORD PTR[8+rsi] |
|||
mov r13,QWORD PTR[16+rsi] |
|||
mov r12,QWORD PTR[24+rsi] |
|||
mov rbp,QWORD PTR[32+rsi] |
|||
mov rbx,QWORD PTR[40+rsi] |
|||
lea rsp,QWORD PTR[48+rsi] |
|||
$L$mul4x_epilogue:: |
|||
mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue |
|||
mov rsi,QWORD PTR[16+rsp] |
|||
DB 0F3h,0C3h ;repret |
|||
$L$SEH_end_bn_mul4x_mont_gather5:: |
|||
bn_mul4x_mont_gather5 ENDP |
|||
PUBLIC bn_scatter5 |
|||
|
|||
ALIGN 16 |
|||
; bn_scatter5: strided qword store (Win64 ABI, leaf, no stack use). |
; In:    rcx = source pointer, rdx = qword count, r8 = table base, |
;        r9 = slot index (scaled by 8 and added to r8). |
; Out:   nothing defined; rax and flags are clobbered. |
; Each source qword is written 256 bytes after the previous one. |
bn_scatter5 PROC PUBLIC |
|||
; A zero count means there is nothing to store. |
test rdx,rdx |
|||
jz $L$scatter_epilogue |
|||
; r8 = table base + index*8: first destination slot. |
lea r8,QWORD PTR[r8+r9*8] |
|||
$L$scatter:: |
|||
; Move one qword, then step the source by 8 and the destination by 256. |
mov rax,QWORD PTR[rcx] |
|||
mov QWORD PTR[r8],rax |
|||
add rcx,8 |
|||
add r8,256 |
|||
; dec sets ZF for the loop test exactly as the last count update must. |
dec rdx |
|||
jnz $L$scatter |
|||
$L$scatter_epilogue:: |
|||
DB 0F3h,0C3h ;repret |
|||
bn_scatter5 ENDP |
|||
|
|||
PUBLIC bn_gather5 |
|||
|
|||
ALIGN 16 |
|||
; bn_gather5: masked gather of qwords from a strided table (Win64 ABI). |
; In:    rcx = output pointer, rdx = qword count (must be non-zero: the loop |
;        tests the count only after the first store), r8 = table base, |
;        r9 = index (low 3 bits pick the qword column, bits 3-4 pick the mask). |
; Out:   rdx qwords written to [rcx]. |
; Clobb: rax, rcx, rdx, r8, r9, r11, xmm0-xmm5, flags. xmm6/xmm7 are used |
;        as masks, so they are saved in the prologue and restored on exit. |
; The prologue is emitted as raw bytes so that its length matches the unwind |
; codes in $L$SEH_info_bn_gather5. |
bn_gather5 PROC PUBLIC |
|||
$L$SEH_begin_bn_gather5:: |
|||
|
|||
; 48 83 EC 28 = sub rsp,0x28 |
DB 048h,083h,0ech,028h |
|||
|
|||
; 0F 29 34 24 = movaps XMMWORD PTR[rsp],xmm6 |
DB 00fh,029h,034h,024h |
|||
|
|||
; 0F 29 7C 24 10 = movaps XMMWORD PTR[16+rsp],xmm7 |
DB 00fh,029h,07ch,024h,010h |
|||
|
|||
; r11 = index & 7 (qword column); r9 = (~(index>>3)) & 3 selects which four |
; qwords of $L$magic_masks become the masks xmm4..xmm7. |
mov r11,r9 |
|||
shr r9,3 |
|||
and r11,7 |
|||
not r9 |
|||
lea rax,QWORD PTR[$L$magic_masks] |
|||
and r9,3 |
|||
lea r8,QWORD PTR[96+r11*8+r8] |
|||
movq xmm4,QWORD PTR[r9*8+rax] |
|||
movq xmm5,QWORD PTR[8+r9*8+rax] |
|||
movq xmm6,QWORD PTR[16+r9*8+rax] |
|||
movq xmm7,QWORD PTR[24+r9*8+rax] |
|||
jmp $L$gather |
|||
ALIGN 16 |
|||
; Per output qword: read four candidates 64 bytes apart, AND each with its |
; mask and OR them together, then advance the table pointer by 256 bytes. |
$L$gather:: |
|||
movq xmm0,QWORD PTR[((-96))+r8] |
|||
movq xmm1,QWORD PTR[((-32))+r8] |
|||
pand xmm0,xmm4 |
|||
movq xmm2,QWORD PTR[32+r8] |
|||
pand xmm1,xmm5 |
|||
movq xmm3,QWORD PTR[96+r8] |
|||
pand xmm2,xmm6 |
|||
por xmm0,xmm1 |
|||
pand xmm3,xmm7 |
|||
por xmm0,xmm2 |
|||
lea r8,QWORD PTR[256+r8] |
|||
por xmm0,xmm3 |
|||
|
|||
movq QWORD PTR[rcx],xmm0 |
|||
lea rcx,QWORD PTR[8+rcx] |
|||
sub rdx,1 |
|||
jnz $L$gather |
|||
; Restore the callee-saved xmm6/xmm7 from the slots written by the prologue. |
; The previous code stored the registers again here instead of loading them, |
; so the caller got the mask values back in xmm6/xmm7. |
movaps xmm6,XMMWORD PTR[rsp] |
|||
movaps xmm7,XMMWORD PTR[16+rsp] |
|||
lea rsp,QWORD PTR[40+rsp] |
|||
DB 0F3h,0C3h ;repret |
|||
$L$SEH_end_bn_gather5:: |
|||
bn_gather5 ENDP |
|||
ALIGN 64 |
|||
; Mask table: eight qwords, of which only the fourth is all-ones. The gather |
; code loads four consecutive qwords starting at qword offset 0..3, so |
; exactly one of the four masks it loads is all-ones. |
$L$magic_masks:: |
|||
DD 0,0,0,0,0,0,-1,-1 |
|||
DD 0,0,0,0,0,0,0,0 |
|||
; NUL-terminated ASCII banner: "Montgomery Multiplication with |
; scatter/gather for x86_64, CRYPTOGAMS by <appro@openssl.org>" |
DB 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 |
|||
DB 112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115 |
|||
DB 99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111 |
|||
DB 114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79 |
|||
DB 71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111 |
|||
DB 112,101,110,115,115,108,46,111,114,103,62,0 |
|||
EXTERN __imp_RtlVirtualUnwind:NEAR |
|||
|
|||
ALIGN 16 |
|||
; mul_handler: exception handler named by the .xdata entries of |
; bn_mul_mont_gather5 and bn_mul4x_mont_gather5. It reads two records |
; through r8 and r9 (NOTE(review): presumably the Win64 CONTEXT and |
; DISPATCHER_CONTEXT -- confirm the field offsets against winnt.h), works |
; out which frame was live at the faulting address, patches the saved |
; register values into the record at r8, copies that record to the buffer |
; at [40+r9], and calls RtlVirtualUnwind. Returns eax = 1. |
mul_handler PROC PRIVATE |
|||
; Preserve every register touched below, plus flags; 64 bytes of scratch |
; hold the home space and stack arguments for the call at the end. |
push rsi |
|||
push rdi |
|||
push rbx |
|||
push rbp |
|||
push r12 |
|||
push r13 |
|||
push r14 |
|||
push r15 |
|||
pushfq |
|||
sub rsp,64 |
|||
|
|||
; rax = [120+r8], rbx = [248+r8] (presumably the faulting Rax and Rip). |
mov rax,QWORD PTR[120+r8] |
|||
mov rbx,QWORD PTR[248+r8] |
|||
|
|||
; rsi = [8+r9] (presumably the image base); r11 = [56+r9] points at the |
; three imagerel label offsets that follow the handler in .xdata. |
mov rsi,QWORD PTR[8+r9] |
|||
mov r11,QWORD PTR[56+r9] |
|||
|
|||
; Fault before the first label (alloca): rax is used unchanged as the frame. |
mov r10d,DWORD PTR[r11] |
|||
lea r10,QWORD PTR[r10*1+rsi] |
|||
cmp rbx,r10 |
|||
jb $L$common_seh_tail |
|||
|
|||
; Fault between the first and second labels: the frame is rax+88. |
lea rax,QWORD PTR[88+rax] |
|||
|
|||
mov r10d,DWORD PTR[4+r11] |
|||
lea r10,QWORD PTR[r10*1+rsi] |
|||
cmp rbx,r10 |
|||
jb $L$common_seh_tail |
|||
|
|||
; Otherwise start from [152+r8] (presumably the faulting Rsp). At or past |
; the third label (epilogue) that value is already the frame. |
mov rax,QWORD PTR[152+r8] |
|||
|
|||
mov r10d,DWORD PTR[8+r11] |
|||
lea r10,QWORD PTR[r10*1+rsi] |
|||
cmp rbx,r10 |
|||
jae $L$common_seh_tail |
|||
|
|||
; Inside the body: r10 = [192+r8] (presumably R9, the word count). Fetch |
; the rsp value the function saved at [8+r10*8+rsp]. |
mov r10,QWORD PTR[192+r8] |
|||
mov rax,QWORD PTR[8+r10*8+rax] |
|||
|
|||
; Recover the xmm6/xmm7 images and the six pushed GPRs from that frame; |
; rax+88 is the stack pointer as it was on function entry. |
movaps xmm0,XMMWORD PTR[rax] |
|||
movaps xmm1,XMMWORD PTR[16+rax] |
|||
lea rax,QWORD PTR[88+rax] |
|||
|
|||
mov rbx,QWORD PTR[((-8))+rax] |
|||
mov rbp,QWORD PTR[((-16))+rax] |
|||
mov r12,QWORD PTR[((-24))+rax] |
|||
mov r13,QWORD PTR[((-32))+rax] |
|||
mov r14,QWORD PTR[((-40))+rax] |
|||
mov r15,QWORD PTR[((-48))+rax] |
|||
; Write the recovered values back into the record at r8. |
mov QWORD PTR[144+r8],rbx |
|||
mov QWORD PTR[160+r8],rbp |
|||
mov QWORD PTR[216+r8],r12 |
|||
mov QWORD PTR[224+r8],r13 |
|||
mov QWORD PTR[232+r8],r14 |
|||
mov QWORD PTR[240+r8],r15 |
|||
movups XMMWORD PTR[512+r8],xmm0 |
|||
movups XMMWORD PTR[528+r8],xmm1 |
|||
|
|||
$L$common_seh_tail:: |
|||
; The functions' WIN64 prologue spilled rdi/rsi at [8+rax]/[16+rax]. |
; Publish those and the frame pointer rax into the record at r8. |
mov rdi,QWORD PTR[8+rax] |
|||
mov rsi,QWORD PTR[16+rax] |
|||
mov QWORD PTR[152+r8],rax |
|||
mov QWORD PTR[168+r8],rsi |
|||
mov QWORD PTR[176+r8],rdi |
|||
|
|||
; Copy 154 qwords from the record at r8 to the buffer at [40+r9]. The DD |
; below encodes the bytes FC F3 48 A5 = cld ; rep movsq. |
mov rdi,QWORD PTR[40+r9] |
|||
mov rsi,r8 |
|||
mov ecx,154 |
|||
DD 0a548f3fch |
|||
|
|||
|
|||
; Call RtlVirtualUnwind: rcx = 0, rdx = [8+r9], r8 = [r9], r9 = [16+r9]; |
; stack slots 32..56 hold [40+r9], &[56+r9], &[24+r9] and 0. |
mov rsi,r9 |
|||
xor rcx,rcx |
|||
mov rdx,QWORD PTR[8+rsi] |
|||
mov r8,QWORD PTR[rsi] |
|||
mov r9,QWORD PTR[16+rsi] |
|||
mov r10,QWORD PTR[40+rsi] |
|||
lea r11,QWORD PTR[56+rsi] |
|||
lea r12,QWORD PTR[24+rsi] |
|||
mov QWORD PTR[32+rsp],r10 |
|||
mov QWORD PTR[40+rsp],r11 |
|||
mov QWORD PTR[48+rsp],r12 |
|||
mov QWORD PTR[56+rsp],rcx |
|||
call QWORD PTR[__imp_RtlVirtualUnwind] |
|||
|
|||
; Return 1 (presumably ExceptionContinueSearch) and restore the registers. |
mov eax,1 |
|||
add rsp,64 |
|||
popfq |
|||
pop r15 |
|||
pop r14 |
|||
pop r13 |
|||
pop r12 |
|||
pop rbp |
|||
pop rbx |
|||
pop rdi |
|||
pop rsi |
|||
DB 0F3h,0C3h ;repret |
|||
mul_handler ENDP |
|||
|
|||
.text$ ENDS |
|||
; Function table: one (begin, end, unwind-info) triple of image-relative |
; addresses per function that has unwind data. |
.pdata SEGMENT READONLY ALIGN(4) |
|||
ALIGN 4 |
|||
DD imagerel $L$SEH_begin_bn_mul_mont_gather5 |
|||
DD imagerel $L$SEH_end_bn_mul_mont_gather5 |
|||
DD imagerel $L$SEH_info_bn_mul_mont_gather5 |
|||
|
|||
DD imagerel $L$SEH_begin_bn_mul4x_mont_gather5 |
|||
DD imagerel $L$SEH_end_bn_mul4x_mont_gather5 |
|||
DD imagerel $L$SEH_info_bn_mul4x_mont_gather5 |
|||
|
|||
DD imagerel $L$SEH_begin_bn_gather5 |
|||
DD imagerel $L$SEH_end_bn_gather5 |
|||
DD imagerel $L$SEH_info_bn_gather5 |
|||
|
|||
.pdata ENDS |
|||
; Unwind information referenced from .pdata. |
.xdata SEGMENT READONLY ALIGN(8) |
|||
ALIGN 8 |
|||
; Both multiply routines use mul_handler. The three imagerel labels after |
; the handler address are the data mul_handler reads through [56+r9]. |
; NOTE(review): header byte 9 is presumably version 1 with the |
; exception-handler flag set -- confirm against the UNWIND_INFO layout. |
$L$SEH_info_bn_mul_mont_gather5:: |
|||
DB 9,0,0,0 |
|||
DD imagerel mul_handler |
|||
DD imagerel $L$mul_alloca,imagerel $L$mul_body,imagerel $L$mul_epilogue |
|||
|
|||
ALIGN 8 |
|||
$L$SEH_info_bn_mul4x_mont_gather5:: |
|||
DB 9,0,0,0 |
|||
DD imagerel mul_handler |
|||
DD imagerel $L$mul4x_alloca,imagerel $L$mul4x_body,imagerel $L$mul4x_epilogue |
|||
|
|||
ALIGN 8 |
|||
; bn_gather5 uses plain unwind codes for its 13-byte DB-encoded prologue. |
; NOTE(review): decoded per the UNWIND_CODE format -- confirm: |
;   01 0d 05 00 : version 1, prologue size 13, 5 code slots |
;   0d 78 01 00 : at offset 13, xmm7 saved at [rsp+16] |
;   08 68 00 00 : at offset 8, xmm6 saved at [rsp+0] |
;   04 42       : at offset 4, 40-byte stack allocation |
$L$SEH_info_bn_gather5:: |
|||
DB 001h,00dh,005h,000h |
|||
DB 00dh,078h,001h,000h |
|||
|
|||
DB 008h,068h,000h,000h |
|||
|
|||
DB 004h,042h,000h,000h |
|||
|
|||
ALIGN 8 |
|||
|
|||
.xdata ENDS |
|||
END |
Loading…
Reference in new issue