diff --git a/deps/openssl/asm/Makefile b/deps/openssl/asm/Makefile index bbe9d1ed00..eb6c64672a 100644 --- a/deps/openssl/asm/Makefile +++ b/deps/openssl/asm/Makefile @@ -26,6 +26,8 @@ OUTPUTS = \ x64-elf-gas/aes/aesni-sha1-x86_64.s \ x64-elf-gas/bn/modexp512-x86_64.s \ x64-elf-gas/bn/x86_64-mont.s \ + x64-elf-gas/bn/x86_64-mont5.s \ + x64-elf-gas/bn/x86_64-gf2m.s \ x64-elf-gas/camellia/cmll-x86_64.s \ x64-elf-gas/md5/md5-x86_64.s \ x64-elf-gas/rc4/rc4-x86_64.s \ @@ -58,6 +60,8 @@ OUTPUTS = \ x64-macosx-gas/aes/aesni-sha1-x86_64.s \ x64-macosx-gas/bn/modexp512-x86_64.s \ x64-macosx-gas/bn/x86_64-mont.s \ + x64-macosx-gas/bn/x86_64-mont5.s \ + x64-macosx-gas/bn/x86_64-gf2m.s \ x64-macosx-gas/camellia/cmll-x86_64.s \ x64-macosx-gas/md5/md5-x86_64.s \ x64-macosx-gas/rc4/rc4-x86_64.s \ @@ -90,6 +94,8 @@ OUTPUTS = \ x64-win32-masm/aes/aesni-sha1-x86_64.asm \ x64-win32-masm/bn/modexp512-x86_64.asm \ x64-win32-masm/bn/x86_64-mont.asm \ + x64-win32-masm/bn/x86_64-mont5.asm \ + x64-win32-masm/bn/x86_64-gf2m.asm \ x64-win32-masm/camellia/cmll-x86_64.asm \ x64-win32-masm/md5/md5-x86_64.asm \ x64-win32-masm/rc4/rc4-x86_64.asm \ @@ -129,6 +135,8 @@ x64-elf-gas/aes/aesni-x86_64.s: ../openssl/crypto/aes/asm/aesni-x86_64.pl x64-elf-gas/aes/aesni-sha1-x86_64.s: ../openssl/crypto/aes/asm/aesni-sha1-x86_64.pl x64-elf-gas/bn/modexp512-x86_64.s: ../openssl/crypto/bn/asm/modexp512-x86_64.pl x64-elf-gas/bn/x86_64-mont.s: ../openssl/crypto/bn/asm/x86_64-mont.pl +x64-elf-gas/bn/x86_64-mont5.s: ../openssl/crypto/bn/asm/x86_64-mont5.pl +x64-elf-gas/bn/x86_64-gf2m.s: ../openssl/crypto/bn/asm/x86_64-gf2m.pl x64-elf-gas/camellia/cmll-x86_64.s: ../openssl/crypto/camellia/asm/cmll-x86_64.pl x64-elf-gas/md5/md5-x86_64.s: ../openssl/crypto/md5/asm/md5-x86_64.pl x64-elf-gas/rc4/rc4-x86_64.s: ../openssl/crypto/rc4/asm/rc4-x86_64.pl @@ -143,6 +151,8 @@ x64-macosx-gas/aes/aesni-x86_64.s: ../openssl/crypto/aes/asm/aesni-x86_64.pl x64-macosx-gas/aes/aesni-sha1-x86_64.s: 
../openssl/crypto/aes/asm/aesni-sha1-x86_64.pl x64-macosx-gas/bn/modexp512-x86_64.s: ../openssl/crypto/bn/asm/modexp512-x86_64.pl x64-macosx-gas/bn/x86_64-mont.s: ../openssl/crypto/bn/asm/x86_64-mont.pl +x64-macosx-gas/bn/x86_64-mont5.s: ../openssl/crypto/bn/asm/x86_64-mont5.pl +x64-macosx-gas/bn/x86_64-gf2m.s: ../openssl/crypto/bn/asm/x86_64-gf2m.pl x64-macosx-gas/camellia/cmll-x86_64.s: ../openssl/crypto/camellia/asm/cmll-x86_64.pl x64-macosx-gas/md5/md5-x86_64.s: ../openssl/crypto/md5/asm/md5-x86_64.pl x64-macosx-gas/rc4/rc4-x86_64.s: ../openssl/crypto/rc4/asm/rc4-x86_64.pl @@ -157,6 +167,8 @@ x64-win32-masm/aes/aesni-x86_64.asm: ../openssl/crypto/aes/asm/aesni-x86_64.pl x64-win32-masm/aes/aesni-sha1-x86_64.asm: ../openssl/crypto/aes/asm/aesni-sha1-x86_64.pl x64-win32-masm/bn/modexp512-x86_64.asm: ../openssl/crypto/bn/asm/modexp512-x86_64.pl x64-win32-masm/bn/x86_64-mont.asm: ../openssl/crypto/bn/asm/x86_64-mont.pl +x64-win32-masm/bn/x86_64-mont5.asm: ../openssl/crypto/bn/asm/x86_64-mont5.pl +x64-win32-masm/bn/x86_64-gf2m.asm: ../openssl/crypto/bn/asm/x86_64-gf2m.pl x64-win32-masm/camellia/cmll-x86_64.asm: ../openssl/crypto/camellia/asm/cmll-x86_64.pl x64-win32-masm/md5/md5-x86_64.asm: ../openssl/crypto/md5/asm/md5-x86_64.pl x64-win32-masm/rc4/rc4-x86_64.asm: ../openssl/crypto/rc4/asm/rc4-x86_64.pl diff --git a/deps/openssl/asm/x64-elf-gas/bn/x86_64-gf2m.s b/deps/openssl/asm/x64-elf-gas/bn/x86_64-gf2m.s new file mode 100644 index 0000000000..4f84013f2b --- /dev/null +++ b/deps/openssl/asm/x64-elf-gas/bn/x86_64-gf2m.s @@ -0,0 +1,295 @@ +.text + + +.type _mul_1x1,@function +.align 16 +_mul_1x1: + subq $128+8,%rsp + movq $-1,%r9 + leaq (%rax,%rax,1),%rsi + shrq $3,%r9 + leaq (,%rax,4),%rdi + andq %rax,%r9 + leaq (,%rax,8),%r12 + sarq $63,%rax + leaq (%r9,%r9,1),%r10 + sarq $63,%rsi + leaq (,%r9,4),%r11 + andq %rbp,%rax + sarq $63,%rdi + movq %rax,%rdx + shlq $63,%rax + andq %rbp,%rsi + shrq $1,%rdx + movq %rsi,%rcx + shlq $62,%rsi + andq %rbp,%rdi + shrq $2,%rcx + 
xorq %rsi,%rax + movq %rdi,%rbx + shlq $61,%rdi + xorq %rcx,%rdx + shrq $3,%rbx + xorq %rdi,%rax + xorq %rbx,%rdx + + movq %r9,%r13 + movq $0,0(%rsp) + xorq %r10,%r13 + movq %r9,8(%rsp) + movq %r11,%r14 + movq %r10,16(%rsp) + xorq %r12,%r14 + movq %r13,24(%rsp) + + xorq %r11,%r9 + movq %r11,32(%rsp) + xorq %r11,%r10 + movq %r9,40(%rsp) + xorq %r11,%r13 + movq %r10,48(%rsp) + xorq %r14,%r9 + movq %r13,56(%rsp) + xorq %r14,%r10 + + movq %r12,64(%rsp) + xorq %r14,%r13 + movq %r9,72(%rsp) + xorq %r11,%r9 + movq %r10,80(%rsp) + xorq %r11,%r10 + movq %r13,88(%rsp) + + xorq %r11,%r13 + movq %r14,96(%rsp) + movq %r8,%rsi + movq %r9,104(%rsp) + andq %rbp,%rsi + movq %r10,112(%rsp) + shrq $4,%rbp + movq %r13,120(%rsp) + movq %r8,%rdi + andq %rbp,%rdi + shrq $4,%rbp + + movq (%rsp,%rsi,8),%xmm0 + movq %r8,%rsi + andq %rbp,%rsi + shrq $4,%rbp + movq (%rsp,%rdi,8),%rcx + movq %r8,%rdi + movq %rcx,%rbx + shlq $4,%rcx + andq %rbp,%rdi + movq (%rsp,%rsi,8),%xmm1 + shrq $60,%rbx + xorq %rcx,%rax + pslldq $1,%xmm1 + movq %r8,%rsi + shrq $4,%rbp + xorq %rbx,%rdx + andq %rbp,%rsi + shrq $4,%rbp + pxor %xmm1,%xmm0 + movq (%rsp,%rdi,8),%rcx + movq %r8,%rdi + movq %rcx,%rbx + shlq $12,%rcx + andq %rbp,%rdi + movq (%rsp,%rsi,8),%xmm1 + shrq $52,%rbx + xorq %rcx,%rax + pslldq $2,%xmm1 + movq %r8,%rsi + shrq $4,%rbp + xorq %rbx,%rdx + andq %rbp,%rsi + shrq $4,%rbp + pxor %xmm1,%xmm0 + movq (%rsp,%rdi,8),%rcx + movq %r8,%rdi + movq %rcx,%rbx + shlq $20,%rcx + andq %rbp,%rdi + movq (%rsp,%rsi,8),%xmm1 + shrq $44,%rbx + xorq %rcx,%rax + pslldq $3,%xmm1 + movq %r8,%rsi + shrq $4,%rbp + xorq %rbx,%rdx + andq %rbp,%rsi + shrq $4,%rbp + pxor %xmm1,%xmm0 + movq (%rsp,%rdi,8),%rcx + movq %r8,%rdi + movq %rcx,%rbx + shlq $28,%rcx + andq %rbp,%rdi + movq (%rsp,%rsi,8),%xmm1 + shrq $36,%rbx + xorq %rcx,%rax + pslldq $4,%xmm1 + movq %r8,%rsi + shrq $4,%rbp + xorq %rbx,%rdx + andq %rbp,%rsi + shrq $4,%rbp + pxor %xmm1,%xmm0 + movq (%rsp,%rdi,8),%rcx + movq %r8,%rdi + movq %rcx,%rbx + shlq $36,%rcx + andq 
%rbp,%rdi + movq (%rsp,%rsi,8),%xmm1 + shrq $28,%rbx + xorq %rcx,%rax + pslldq $5,%xmm1 + movq %r8,%rsi + shrq $4,%rbp + xorq %rbx,%rdx + andq %rbp,%rsi + shrq $4,%rbp + pxor %xmm1,%xmm0 + movq (%rsp,%rdi,8),%rcx + movq %r8,%rdi + movq %rcx,%rbx + shlq $44,%rcx + andq %rbp,%rdi + movq (%rsp,%rsi,8),%xmm1 + shrq $20,%rbx + xorq %rcx,%rax + pslldq $6,%xmm1 + movq %r8,%rsi + shrq $4,%rbp + xorq %rbx,%rdx + andq %rbp,%rsi + shrq $4,%rbp + pxor %xmm1,%xmm0 + movq (%rsp,%rdi,8),%rcx + movq %r8,%rdi + movq %rcx,%rbx + shlq $52,%rcx + andq %rbp,%rdi + movq (%rsp,%rsi,8),%xmm1 + shrq $12,%rbx + xorq %rcx,%rax + pslldq $7,%xmm1 + movq %r8,%rsi + shrq $4,%rbp + xorq %rbx,%rdx + andq %rbp,%rsi + shrq $4,%rbp + pxor %xmm1,%xmm0 + movq (%rsp,%rdi,8),%rcx + movq %rcx,%rbx + shlq $60,%rcx +.byte 102,72,15,126,198 + shrq $4,%rbx + xorq %rcx,%rax + psrldq $8,%xmm0 + xorq %rbx,%rdx +.byte 102,72,15,126,199 + xorq %rsi,%rax + xorq %rdi,%rdx + + addq $128+8,%rsp + .byte 0xf3,0xc3 +.Lend_mul_1x1: +.size _mul_1x1,.-_mul_1x1 + +.globl bn_GF2m_mul_2x2 +.type bn_GF2m_mul_2x2,@function +.align 16 +bn_GF2m_mul_2x2: + movq OPENSSL_ia32cap_P(%rip),%rax + btq $33,%rax + jnc .Lvanilla_mul_2x2 + +.byte 102,72,15,110,198 +.byte 102,72,15,110,201 +.byte 102,72,15,110,210 +.byte 102,73,15,110,216 + movdqa %xmm0,%xmm4 + movdqa %xmm1,%xmm5 +.byte 102,15,58,68,193,0 + pxor %xmm2,%xmm4 + pxor %xmm3,%xmm5 +.byte 102,15,58,68,211,0 +.byte 102,15,58,68,229,0 + xorps %xmm0,%xmm4 + xorps %xmm2,%xmm4 + movdqa %xmm4,%xmm5 + pslldq $8,%xmm4 + psrldq $8,%xmm5 + pxor %xmm4,%xmm2 + pxor %xmm5,%xmm0 + movdqu %xmm2,0(%rdi) + movdqu %xmm0,16(%rdi) + .byte 0xf3,0xc3 + +.align 16 +.Lvanilla_mul_2x2: + leaq -136(%rsp),%rsp + movq %r14,80(%rsp) + movq %r13,88(%rsp) + movq %r12,96(%rsp) + movq %rbp,104(%rsp) + movq %rbx,112(%rsp) +.Lbody_mul_2x2: + movq %rdi,32(%rsp) + movq %rsi,40(%rsp) + movq %rdx,48(%rsp) + movq %rcx,56(%rsp) + movq %r8,64(%rsp) + + movq $15,%r8 + movq %rsi,%rax + movq %rcx,%rbp + call _mul_1x1 + + movq 
%rax,16(%rsp) + movq %rdx,24(%rsp) + + movq 48(%rsp),%rax + movq 64(%rsp),%rbp + call _mul_1x1 + + movq %rax,0(%rsp) + movq %rdx,8(%rsp) + + movq 40(%rsp),%rax + movq 56(%rsp),%rbp + xorq 48(%rsp),%rax + xorq 64(%rsp),%rbp + call _mul_1x1 + + movq 0(%rsp),%rbx + movq 8(%rsp),%rcx + movq 16(%rsp),%rdi + movq 24(%rsp),%rsi + movq 32(%rsp),%rbp + + xorq %rdx,%rax + xorq %rcx,%rdx + xorq %rbx,%rax + movq %rbx,0(%rbp) + xorq %rdi,%rdx + movq %rsi,24(%rbp) + xorq %rsi,%rax + xorq %rsi,%rdx + xorq %rdx,%rax + movq %rdx,16(%rbp) + movq %rax,8(%rbp) + + movq 80(%rsp),%r14 + movq 88(%rsp),%r13 + movq 96(%rsp),%r12 + movq 104(%rsp),%rbp + movq 112(%rsp),%rbx + leaq 136(%rsp),%rsp + .byte 0xf3,0xc3 +.Lend_mul_2x2: +.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2 +.byte 71,70,40,50,94,109,41,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 16 diff --git a/deps/openssl/asm/x64-elf-gas/bn/x86_64-mont5.s b/deps/openssl/asm/x64-elf-gas/bn/x86_64-mont5.s new file mode 100644 index 0000000000..875911c0fd --- /dev/null +++ b/deps/openssl/asm/x64-elf-gas/bn/x86_64-mont5.s @@ -0,0 +1,785 @@ +.text + + +.globl bn_mul_mont_gather5 +.type bn_mul_mont_gather5,@function +.align 64 +bn_mul_mont_gather5: + testl $3,%r9d + jnz .Lmul_enter + cmpl $8,%r9d + jb .Lmul_enter + jmp .Lmul4x_enter + +.align 16 +.Lmul_enter: + movl %r9d,%r9d + movl 8(%rsp),%r10d + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rsp,%rax + leaq 2(%r9),%r11 + negq %r11 + leaq (%rsp,%r11,8),%rsp + andq $-1024,%rsp + + movq %rax,8(%rsp,%r9,8) +.Lmul_body: + movq %rdx,%r12 + movq %r10,%r11 + shrq $3,%r10 + andq $7,%r11 + notq %r10 + leaq .Lmagic_masks(%rip),%rax + andq $3,%r10 + leaq 96(%r12,%r11,8),%r12 + movq 0(%rax,%r10,8),%xmm4 + movq 8(%rax,%r10,8),%xmm5 + movq 16(%rax,%r10,8),%xmm6 + movq 24(%rax,%r10,8),%xmm7 + + movq 
-96(%r12),%xmm0 + movq -32(%r12),%xmm1 + pand %xmm4,%xmm0 + movq 32(%r12),%xmm2 + pand %xmm5,%xmm1 + movq 96(%r12),%xmm3 + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 + por %xmm2,%xmm0 + leaq 256(%r12),%r12 + por %xmm3,%xmm0 + +.byte 102,72,15,126,195 + + movq (%r8),%r8 + movq (%rsi),%rax + + xorq %r14,%r14 + xorq %r15,%r15 + + movq -96(%r12),%xmm0 + movq -32(%r12),%xmm1 + pand %xmm4,%xmm0 + movq 32(%r12),%xmm2 + pand %xmm5,%xmm1 + + movq %r8,%rbp + mulq %rbx + movq %rax,%r10 + movq (%rcx),%rax + + movq 96(%r12),%xmm3 + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 + + imulq %r10,%rbp + movq %rdx,%r11 + + por %xmm2,%xmm0 + leaq 256(%r12),%r12 + por %xmm3,%xmm0 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq %rdx,%r13 + + leaq 1(%r15),%r15 + jmp .L1st_enter + +.align 16 +.L1st: + addq %rax,%r13 + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%r13 + movq %r10,%r11 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + +.L1st_enter: + mulq %rbx + addq %rax,%r11 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + leaq 1(%r15),%r15 + movq %rdx,%r10 + + mulq %rbp + cmpq %r9,%r15 + jne .L1st + +.byte 102,72,15,126,195 + + addq %rax,%r13 + movq (%rsi),%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + movq %r10,%r11 + + xorq %rdx,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r9,8) + movq %rdx,(%rsp,%r9,8) + + leaq 1(%r14),%r14 + jmp .Louter +.align 16 +.Louter: + xorq %r15,%r15 + movq %r8,%rbp + movq (%rsp),%r10 + + movq -96(%r12),%xmm0 + movq -32(%r12),%xmm1 + pand %xmm4,%xmm0 + movq 32(%r12),%xmm2 + pand %xmm5,%xmm1 + + mulq %rbx + addq %rax,%r10 + movq (%rcx),%rax + adcq $0,%rdx + + movq 96(%r12),%xmm3 + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 + + imulq %r10,%rbp + movq %rdx,%r11 + + por %xmm2,%xmm0 + leaq 256(%r12),%r12 + por %xmm3,%xmm0 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq 8(%rsp),%r10 + movq %rdx,%r13 + + leaq 
1(%r15),%r15 + jmp .Linner_enter + +.align 16 +.Linner: + addq %rax,%r13 + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + movq (%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + +.Linner_enter: + mulq %rbx + addq %rax,%r11 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + addq %r11,%r10 + movq %rdx,%r11 + adcq $0,%r11 + leaq 1(%r15),%r15 + + mulq %rbp + cmpq %r9,%r15 + jne .Linner + +.byte 102,72,15,126,195 + + addq %rax,%r13 + movq (%rsi),%rax + adcq $0,%rdx + addq %r10,%r13 + movq (%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + + xorq %rdx,%rdx + addq %r11,%r13 + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r9,8) + movq %rdx,(%rsp,%r9,8) + + leaq 1(%r14),%r14 + cmpq %r9,%r14 + jl .Louter + + xorq %r14,%r14 + movq (%rsp),%rax + leaq (%rsp),%rsi + movq %r9,%r15 + jmp .Lsub +.align 16 +.Lsub: sbbq (%rcx,%r14,8),%rax + movq %rax,(%rdi,%r14,8) + movq 8(%rsi,%r14,8),%rax + leaq 1(%r14),%r14 + decq %r15 + jnz .Lsub + + sbbq $0,%rax + xorq %r14,%r14 + andq %rax,%rsi + notq %rax + movq %rdi,%rcx + andq %rax,%rcx + movq %r9,%r15 + orq %rcx,%rsi +.align 16 +.Lcopy: + movq (%rsi,%r14,8),%rax + movq %r14,(%rsp,%r14,8) + movq %rax,(%rdi,%r14,8) + leaq 1(%r14),%r14 + subq $1,%r15 + jnz .Lcopy + + movq 8(%rsp,%r9,8),%rsi + movq $1,%rax + movq (%rsi),%r15 + movq 8(%rsi),%r14 + movq 16(%rsi),%r13 + movq 24(%rsi),%r12 + movq 32(%rsi),%rbp + movq 40(%rsi),%rbx + leaq 48(%rsi),%rsp +.Lmul_epilogue: + .byte 0xf3,0xc3 +.size bn_mul_mont_gather5,.-bn_mul_mont_gather5 +.type bn_mul4x_mont_gather5,@function +.align 16 +bn_mul4x_mont_gather5: +.Lmul4x_enter: + movl %r9d,%r9d + movl 8(%rsp),%r10d + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rsp,%rax + leaq 4(%r9),%r11 + negq %r11 + leaq (%rsp,%r11,8),%rsp + andq $-1024,%rsp + + movq %rax,8(%rsp,%r9,8) +.Lmul4x_body: + movq %rdi,16(%rsp,%r9,8) + movq %rdx,%r12 + movq %r10,%r11 + shrq $3,%r10 + andq $7,%r11 + notq 
%r10 + leaq .Lmagic_masks(%rip),%rax + andq $3,%r10 + leaq 96(%r12,%r11,8),%r12 + movq 0(%rax,%r10,8),%xmm4 + movq 8(%rax,%r10,8),%xmm5 + movq 16(%rax,%r10,8),%xmm6 + movq 24(%rax,%r10,8),%xmm7 + + movq -96(%r12),%xmm0 + movq -32(%r12),%xmm1 + pand %xmm4,%xmm0 + movq 32(%r12),%xmm2 + pand %xmm5,%xmm1 + movq 96(%r12),%xmm3 + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 + por %xmm2,%xmm0 + leaq 256(%r12),%r12 + por %xmm3,%xmm0 + +.byte 102,72,15,126,195 + movq (%r8),%r8 + movq (%rsi),%rax + + xorq %r14,%r14 + xorq %r15,%r15 + + movq -96(%r12),%xmm0 + movq -32(%r12),%xmm1 + pand %xmm4,%xmm0 + movq 32(%r12),%xmm2 + pand %xmm5,%xmm1 + + movq %r8,%rbp + mulq %rbx + movq %rax,%r10 + movq (%rcx),%rax + + movq 96(%r12),%xmm3 + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 + + imulq %r10,%rbp + movq %rdx,%r11 + + por %xmm2,%xmm0 + leaq 256(%r12),%r12 + por %xmm3,%xmm0 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 4(%r15),%r15 + adcq $0,%rdx + movq %rdi,(%rsp) + movq %rdx,%r13 + jmp .L1st4x +.align 16 +.L1st4x: + mulq %rbx + addq %rax,%r10 + movq -16(%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%rsp,%r15,8) + movq %rdx,%r13 + + mulq %rbx + addq %rax,%r10 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq 8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + 
movq 8(%rcx,%r15,8),%rax + adcq $0,%rdx + leaq 4(%r15),%r15 + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq -16(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-32(%rsp,%r15,8) + movq %rdx,%r13 + cmpq %r9,%r15 + jl .L1st4x + + mulq %rbx + addq %rax,%r10 + movq -16(%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%rsp,%r15,8) + movq %rdx,%r13 + +.byte 102,72,15,126,195 + + xorq %rdi,%rdi + addq %r10,%r13 + adcq $0,%rdi + movq %r13,-8(%rsp,%r15,8) + movq %rdi,(%rsp,%r15,8) + + leaq 1(%r14),%r14 +.align 4 +.Louter4x: + xorq %r15,%r15 + movq -96(%r12),%xmm0 + movq -32(%r12),%xmm1 + pand %xmm4,%xmm0 + movq 32(%r12),%xmm2 + pand %xmm5,%xmm1 + + movq (%rsp),%r10 + movq %r8,%rbp + mulq %rbx + addq %rax,%r10 + movq (%rcx),%rax + adcq $0,%rdx + + movq 96(%r12),%xmm3 + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 + + imulq %r10,%rbp + movq %rdx,%r11 + + por %xmm2,%xmm0 + leaq 256(%r12),%r12 + por %xmm3,%xmm0 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + addq 8(%rsp),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 4(%r15),%r15 + adcq $0,%rdx + movq %rdx,%r13 + jmp .Linner4x +.align 16 +.Linner4x: + mulq %rbx + addq %rax,%r10 + movq -16(%rcx,%r15,8),%rax + adcq $0,%rdx + addq -16(%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdi,-32(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq 
%rax,%r11 + movq -8(%rcx,%r15,8),%rax + adcq $0,%rdx + addq -8(%rsp,%r15,8),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %r13,-24(%rsp,%r15,8) + movq %rdx,%r13 + + mulq %rbx + addq %rax,%r10 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + addq (%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq 8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdi,-16(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx,%r15,8),%rax + adcq $0,%rdx + addq 8(%rsp,%r15,8),%r11 + adcq $0,%rdx + leaq 4(%r15),%r15 + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq -16(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %r13,-40(%rsp,%r15,8) + movq %rdx,%r13 + cmpq %r9,%r15 + jl .Linner4x + + mulq %rbx + addq %rax,%r10 + movq -16(%rcx,%r15,8),%rax + adcq $0,%rdx + addq -16(%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdi,-32(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx,%r15,8),%rax + adcq $0,%rdx + addq -8(%rsp,%r15,8),%r11 + adcq $0,%rdx + leaq 1(%r14),%r14 + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %r13,-24(%rsp,%r15,8) + movq %rdx,%r13 + +.byte 102,72,15,126,195 + movq %rdi,-16(%rsp,%r15,8) + + xorq %rdi,%rdi + addq %r10,%r13 + adcq $0,%rdi + addq (%rsp,%r9,8),%r13 + adcq $0,%rdi + movq %r13,-8(%rsp,%r15,8) + movq %rdi,(%rsp,%r15,8) + + cmpq %r9,%r14 + jl .Louter4x + movq 16(%rsp,%r9,8),%rdi + movq 0(%rsp),%rax + pxor %xmm0,%xmm0 + movq 8(%rsp),%rdx + shrq $2,%r9 + leaq (%rsp),%rsi + xorq %r14,%r14 + + subq 0(%rcx),%rax + movq 16(%rsi),%rbx + movq 24(%rsi),%rbp + sbbq 8(%rcx),%rdx + leaq -1(%r9),%r15 + jmp .Lsub4x +.align 16 +.Lsub4x: + movq 
%rax,0(%rdi,%r14,8) + movq %rdx,8(%rdi,%r14,8) + sbbq 16(%rcx,%r14,8),%rbx + movq 32(%rsi,%r14,8),%rax + movq 40(%rsi,%r14,8),%rdx + sbbq 24(%rcx,%r14,8),%rbp + movq %rbx,16(%rdi,%r14,8) + movq %rbp,24(%rdi,%r14,8) + sbbq 32(%rcx,%r14,8),%rax + movq 48(%rsi,%r14,8),%rbx + movq 56(%rsi,%r14,8),%rbp + sbbq 40(%rcx,%r14,8),%rdx + leaq 4(%r14),%r14 + decq %r15 + jnz .Lsub4x + + movq %rax,0(%rdi,%r14,8) + movq 32(%rsi,%r14,8),%rax + sbbq 16(%rcx,%r14,8),%rbx + movq %rdx,8(%rdi,%r14,8) + sbbq 24(%rcx,%r14,8),%rbp + movq %rbx,16(%rdi,%r14,8) + + sbbq $0,%rax + movq %rbp,24(%rdi,%r14,8) + xorq %r14,%r14 + andq %rax,%rsi + notq %rax + movq %rdi,%rcx + andq %rax,%rcx + leaq -1(%r9),%r15 + orq %rcx,%rsi + + movdqu (%rsi),%xmm1 + movdqa %xmm0,(%rsp) + movdqu %xmm1,(%rdi) + jmp .Lcopy4x +.align 16 +.Lcopy4x: + movdqu 16(%rsi,%r14,1),%xmm2 + movdqu 32(%rsi,%r14,1),%xmm1 + movdqa %xmm0,16(%rsp,%r14,1) + movdqu %xmm2,16(%rdi,%r14,1) + movdqa %xmm0,32(%rsp,%r14,1) + movdqu %xmm1,32(%rdi,%r14,1) + leaq 32(%r14),%r14 + decq %r15 + jnz .Lcopy4x + + shlq $2,%r9 + movdqu 16(%rsi,%r14,1),%xmm2 + movdqa %xmm0,16(%rsp,%r14,1) + movdqu %xmm2,16(%rdi,%r14,1) + movq 8(%rsp,%r9,8),%rsi + movq $1,%rax + movq (%rsi),%r15 + movq 8(%rsi),%r14 + movq 16(%rsi),%r13 + movq 24(%rsi),%r12 + movq 32(%rsi),%rbp + movq 40(%rsi),%rbx + leaq 48(%rsi),%rsp +.Lmul4x_epilogue: + .byte 0xf3,0xc3 +.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 +.globl bn_scatter5 +.type bn_scatter5,@function +.align 16 +bn_scatter5: + cmpq $0,%rsi + jz .Lscatter_epilogue + leaq (%rdx,%rcx,8),%rdx +.Lscatter: + movq (%rdi),%rax + leaq 8(%rdi),%rdi + movq %rax,(%rdx) + leaq 256(%rdx),%rdx + subq $1,%rsi + jnz .Lscatter +.Lscatter_epilogue: + .byte 0xf3,0xc3 +.size bn_scatter5,.-bn_scatter5 + +.globl bn_gather5 +.type bn_gather5,@function +.align 16 +bn_gather5: + movq %rcx,%r11 + shrq $3,%rcx + andq $7,%r11 + notq %rcx + leaq .Lmagic_masks(%rip),%rax + andq $3,%rcx + leaq 96(%rdx,%r11,8),%rdx + movq 0(%rax,%rcx,8),%xmm4 + 
movq 8(%rax,%rcx,8),%xmm5 + movq 16(%rax,%rcx,8),%xmm6 + movq 24(%rax,%rcx,8),%xmm7 + jmp .Lgather +.align 16 +.Lgather: + movq -96(%rdx),%xmm0 + movq -32(%rdx),%xmm1 + pand %xmm4,%xmm0 + movq 32(%rdx),%xmm2 + pand %xmm5,%xmm1 + movq 96(%rdx),%xmm3 + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 + por %xmm2,%xmm0 + leaq 256(%rdx),%rdx + por %xmm3,%xmm0 + + movq %xmm0,(%rdi) + leaq 8(%rdi),%rdi + subq $1,%rsi + jnz .Lgather + .byte 0xf3,0xc3 +.LSEH_end_bn_gather5: +.size bn_gather5,.-bn_gather5 +.align 64 +.Lmagic_masks: +.long 0,0, 0,0, 0,0, -1,-1 +.long 0,0, 0,0, 0,0, 0,0 +.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 diff --git a/deps/openssl/asm/x64-macosx-gas/bn/x86_64-gf2m.s b/deps/openssl/asm/x64-macosx-gas/bn/x86_64-gf2m.s new file mode 100644 index 0000000000..59268018c4 --- /dev/null +++ b/deps/openssl/asm/x64-macosx-gas/bn/x86_64-gf2m.s @@ -0,0 +1,295 @@ +.text + + + +.p2align 4 +_mul_1x1: + subq $128+8,%rsp + movq $-1,%r9 + leaq (%rax,%rax,1),%rsi + shrq $3,%r9 + leaq (,%rax,4),%rdi + andq %rax,%r9 + leaq (,%rax,8),%r12 + sarq $63,%rax + leaq (%r9,%r9,1),%r10 + sarq $63,%rsi + leaq (,%r9,4),%r11 + andq %rbp,%rax + sarq $63,%rdi + movq %rax,%rdx + shlq $63,%rax + andq %rbp,%rsi + shrq $1,%rdx + movq %rsi,%rcx + shlq $62,%rsi + andq %rbp,%rdi + shrq $2,%rcx + xorq %rsi,%rax + movq %rdi,%rbx + shlq $61,%rdi + xorq %rcx,%rdx + shrq $3,%rbx + xorq %rdi,%rax + xorq %rbx,%rdx + + movq %r9,%r13 + movq $0,0(%rsp) + xorq %r10,%r13 + movq %r9,8(%rsp) + movq %r11,%r14 + movq %r10,16(%rsp) + xorq %r12,%r14 + movq %r13,24(%rsp) + + xorq %r11,%r9 + movq %r11,32(%rsp) + xorq %r11,%r10 + movq %r9,40(%rsp) + xorq %r11,%r13 + movq %r10,48(%rsp) + xorq %r14,%r9 + movq %r13,56(%rsp) + 
xorq %r14,%r10 + + movq %r12,64(%rsp) + xorq %r14,%r13 + movq %r9,72(%rsp) + xorq %r11,%r9 + movq %r10,80(%rsp) + xorq %r11,%r10 + movq %r13,88(%rsp) + + xorq %r11,%r13 + movq %r14,96(%rsp) + movq %r8,%rsi + movq %r9,104(%rsp) + andq %rbp,%rsi + movq %r10,112(%rsp) + shrq $4,%rbp + movq %r13,120(%rsp) + movq %r8,%rdi + andq %rbp,%rdi + shrq $4,%rbp + + movq (%rsp,%rsi,8),%xmm0 + movq %r8,%rsi + andq %rbp,%rsi + shrq $4,%rbp + movq (%rsp,%rdi,8),%rcx + movq %r8,%rdi + movq %rcx,%rbx + shlq $4,%rcx + andq %rbp,%rdi + movq (%rsp,%rsi,8),%xmm1 + shrq $60,%rbx + xorq %rcx,%rax + pslldq $1,%xmm1 + movq %r8,%rsi + shrq $4,%rbp + xorq %rbx,%rdx + andq %rbp,%rsi + shrq $4,%rbp + pxor %xmm1,%xmm0 + movq (%rsp,%rdi,8),%rcx + movq %r8,%rdi + movq %rcx,%rbx + shlq $12,%rcx + andq %rbp,%rdi + movq (%rsp,%rsi,8),%xmm1 + shrq $52,%rbx + xorq %rcx,%rax + pslldq $2,%xmm1 + movq %r8,%rsi + shrq $4,%rbp + xorq %rbx,%rdx + andq %rbp,%rsi + shrq $4,%rbp + pxor %xmm1,%xmm0 + movq (%rsp,%rdi,8),%rcx + movq %r8,%rdi + movq %rcx,%rbx + shlq $20,%rcx + andq %rbp,%rdi + movq (%rsp,%rsi,8),%xmm1 + shrq $44,%rbx + xorq %rcx,%rax + pslldq $3,%xmm1 + movq %r8,%rsi + shrq $4,%rbp + xorq %rbx,%rdx + andq %rbp,%rsi + shrq $4,%rbp + pxor %xmm1,%xmm0 + movq (%rsp,%rdi,8),%rcx + movq %r8,%rdi + movq %rcx,%rbx + shlq $28,%rcx + andq %rbp,%rdi + movq (%rsp,%rsi,8),%xmm1 + shrq $36,%rbx + xorq %rcx,%rax + pslldq $4,%xmm1 + movq %r8,%rsi + shrq $4,%rbp + xorq %rbx,%rdx + andq %rbp,%rsi + shrq $4,%rbp + pxor %xmm1,%xmm0 + movq (%rsp,%rdi,8),%rcx + movq %r8,%rdi + movq %rcx,%rbx + shlq $36,%rcx + andq %rbp,%rdi + movq (%rsp,%rsi,8),%xmm1 + shrq $28,%rbx + xorq %rcx,%rax + pslldq $5,%xmm1 + movq %r8,%rsi + shrq $4,%rbp + xorq %rbx,%rdx + andq %rbp,%rsi + shrq $4,%rbp + pxor %xmm1,%xmm0 + movq (%rsp,%rdi,8),%rcx + movq %r8,%rdi + movq %rcx,%rbx + shlq $44,%rcx + andq %rbp,%rdi + movq (%rsp,%rsi,8),%xmm1 + shrq $20,%rbx + xorq %rcx,%rax + pslldq $6,%xmm1 + movq %r8,%rsi + shrq $4,%rbp + xorq %rbx,%rdx + andq 
%rbp,%rsi + shrq $4,%rbp + pxor %xmm1,%xmm0 + movq (%rsp,%rdi,8),%rcx + movq %r8,%rdi + movq %rcx,%rbx + shlq $52,%rcx + andq %rbp,%rdi + movq (%rsp,%rsi,8),%xmm1 + shrq $12,%rbx + xorq %rcx,%rax + pslldq $7,%xmm1 + movq %r8,%rsi + shrq $4,%rbp + xorq %rbx,%rdx + andq %rbp,%rsi + shrq $4,%rbp + pxor %xmm1,%xmm0 + movq (%rsp,%rdi,8),%rcx + movq %rcx,%rbx + shlq $60,%rcx +.byte 102,72,15,126,198 + shrq $4,%rbx + xorq %rcx,%rax + psrldq $8,%xmm0 + xorq %rbx,%rdx +.byte 102,72,15,126,199 + xorq %rsi,%rax + xorq %rdi,%rdx + + addq $128+8,%rsp + .byte 0xf3,0xc3 +L$end_mul_1x1: + + +.globl _bn_GF2m_mul_2x2 + +.p2align 4 +_bn_GF2m_mul_2x2: + movq _OPENSSL_ia32cap_P(%rip),%rax + btq $33,%rax + jnc L$vanilla_mul_2x2 + +.byte 102,72,15,110,198 +.byte 102,72,15,110,201 +.byte 102,72,15,110,210 +.byte 102,73,15,110,216 + movdqa %xmm0,%xmm4 + movdqa %xmm1,%xmm5 +.byte 102,15,58,68,193,0 + pxor %xmm2,%xmm4 + pxor %xmm3,%xmm5 +.byte 102,15,58,68,211,0 +.byte 102,15,58,68,229,0 + xorps %xmm0,%xmm4 + xorps %xmm2,%xmm4 + movdqa %xmm4,%xmm5 + pslldq $8,%xmm4 + psrldq $8,%xmm5 + pxor %xmm4,%xmm2 + pxor %xmm5,%xmm0 + movdqu %xmm2,0(%rdi) + movdqu %xmm0,16(%rdi) + .byte 0xf3,0xc3 + +.p2align 4 +L$vanilla_mul_2x2: + leaq -136(%rsp),%rsp + movq %r14,80(%rsp) + movq %r13,88(%rsp) + movq %r12,96(%rsp) + movq %rbp,104(%rsp) + movq %rbx,112(%rsp) +L$body_mul_2x2: + movq %rdi,32(%rsp) + movq %rsi,40(%rsp) + movq %rdx,48(%rsp) + movq %rcx,56(%rsp) + movq %r8,64(%rsp) + + movq $15,%r8 + movq %rsi,%rax + movq %rcx,%rbp + call _mul_1x1 + + movq %rax,16(%rsp) + movq %rdx,24(%rsp) + + movq 48(%rsp),%rax + movq 64(%rsp),%rbp + call _mul_1x1 + + movq %rax,0(%rsp) + movq %rdx,8(%rsp) + + movq 40(%rsp),%rax + movq 56(%rsp),%rbp + xorq 48(%rsp),%rax + xorq 64(%rsp),%rbp + call _mul_1x1 + + movq 0(%rsp),%rbx + movq 8(%rsp),%rcx + movq 16(%rsp),%rdi + movq 24(%rsp),%rsi + movq 32(%rsp),%rbp + + xorq %rdx,%rax + xorq %rcx,%rdx + xorq %rbx,%rax + movq %rbx,0(%rbp) + xorq %rdi,%rdx + movq %rsi,24(%rbp) + xorq 
%rsi,%rax + xorq %rsi,%rdx + xorq %rdx,%rax + movq %rdx,16(%rbp) + movq %rax,8(%rbp) + + movq 80(%rsp),%r14 + movq 88(%rsp),%r13 + movq 96(%rsp),%r12 + movq 104(%rsp),%rbp + movq 112(%rsp),%rbx + leaq 136(%rsp),%rsp + .byte 0xf3,0xc3 +L$end_mul_2x2: + +.byte 71,70,40,50,94,109,41,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.p2align 4 diff --git a/deps/openssl/asm/x64-macosx-gas/bn/x86_64-mont5.s b/deps/openssl/asm/x64-macosx-gas/bn/x86_64-mont5.s new file mode 100644 index 0000000000..f3bfb046d3 --- /dev/null +++ b/deps/openssl/asm/x64-macosx-gas/bn/x86_64-mont5.s @@ -0,0 +1,785 @@ +.text + + +.globl _bn_mul_mont_gather5 + +.p2align 6 +_bn_mul_mont_gather5: + testl $3,%r9d + jnz L$mul_enter + cmpl $8,%r9d + jb L$mul_enter + jmp L$mul4x_enter + +.p2align 4 +L$mul_enter: + movl %r9d,%r9d + movl 8(%rsp),%r10d + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rsp,%rax + leaq 2(%r9),%r11 + negq %r11 + leaq (%rsp,%r11,8),%rsp + andq $-1024,%rsp + + movq %rax,8(%rsp,%r9,8) +L$mul_body: + movq %rdx,%r12 + movq %r10,%r11 + shrq $3,%r10 + andq $7,%r11 + notq %r10 + leaq L$magic_masks(%rip),%rax + andq $3,%r10 + leaq 96(%r12,%r11,8),%r12 + movq 0(%rax,%r10,8),%xmm4 + movq 8(%rax,%r10,8),%xmm5 + movq 16(%rax,%r10,8),%xmm6 + movq 24(%rax,%r10,8),%xmm7 + + movq -96(%r12),%xmm0 + movq -32(%r12),%xmm1 + pand %xmm4,%xmm0 + movq 32(%r12),%xmm2 + pand %xmm5,%xmm1 + movq 96(%r12),%xmm3 + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 + por %xmm2,%xmm0 + leaq 256(%r12),%r12 + por %xmm3,%xmm0 + +.byte 102,72,15,126,195 + + movq (%r8),%r8 + movq (%rsi),%rax + + xorq %r14,%r14 + xorq %r15,%r15 + + movq -96(%r12),%xmm0 + movq -32(%r12),%xmm1 + pand %xmm4,%xmm0 + movq 32(%r12),%xmm2 + pand %xmm5,%xmm1 + + movq %r8,%rbp + mulq %rbx + movq %rax,%r10 + movq (%rcx),%rax + + movq 
96(%r12),%xmm3 + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 + + imulq %r10,%rbp + movq %rdx,%r11 + + por %xmm2,%xmm0 + leaq 256(%r12),%r12 + por %xmm3,%xmm0 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq %rdx,%r13 + + leaq 1(%r15),%r15 + jmp L$1st_enter + +.p2align 4 +L$1st: + addq %rax,%r13 + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%r13 + movq %r10,%r11 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + +L$1st_enter: + mulq %rbx + addq %rax,%r11 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + leaq 1(%r15),%r15 + movq %rdx,%r10 + + mulq %rbp + cmpq %r9,%r15 + jne L$1st + +.byte 102,72,15,126,195 + + addq %rax,%r13 + movq (%rsi),%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + movq %r10,%r11 + + xorq %rdx,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r9,8) + movq %rdx,(%rsp,%r9,8) + + leaq 1(%r14),%r14 + jmp L$outer +.p2align 4 +L$outer: + xorq %r15,%r15 + movq %r8,%rbp + movq (%rsp),%r10 + + movq -96(%r12),%xmm0 + movq -32(%r12),%xmm1 + pand %xmm4,%xmm0 + movq 32(%r12),%xmm2 + pand %xmm5,%xmm1 + + mulq %rbx + addq %rax,%r10 + movq (%rcx),%rax + adcq $0,%rdx + + movq 96(%r12),%xmm3 + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 + + imulq %r10,%rbp + movq %rdx,%r11 + + por %xmm2,%xmm0 + leaq 256(%r12),%r12 + por %xmm3,%xmm0 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq 8(%rsp),%r10 + movq %rdx,%r13 + + leaq 1(%r15),%r15 + jmp L$inner_enter + +.p2align 4 +L$inner: + addq %rax,%r13 + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + movq (%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + +L$inner_enter: + mulq %rbx + addq %rax,%r11 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + addq %r11,%r10 + movq %rdx,%r11 + adcq $0,%r11 + leaq 1(%r15),%r15 + + mulq %rbp + cmpq %r9,%r15 + jne L$inner + +.byte 102,72,15,126,195 + + addq %rax,%r13 + movq (%rsi),%rax + adcq $0,%rdx + addq %r10,%r13 + 
movq (%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + + xorq %rdx,%rdx + addq %r11,%r13 + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r9,8) + movq %rdx,(%rsp,%r9,8) + + leaq 1(%r14),%r14 + cmpq %r9,%r14 + jl L$outer + + xorq %r14,%r14 + movq (%rsp),%rax + leaq (%rsp),%rsi + movq %r9,%r15 + jmp L$sub +.p2align 4 +L$sub: sbbq (%rcx,%r14,8),%rax + movq %rax,(%rdi,%r14,8) + movq 8(%rsi,%r14,8),%rax + leaq 1(%r14),%r14 + decq %r15 + jnz L$sub + + sbbq $0,%rax + xorq %r14,%r14 + andq %rax,%rsi + notq %rax + movq %rdi,%rcx + andq %rax,%rcx + movq %r9,%r15 + orq %rcx,%rsi +.p2align 4 +L$copy: + movq (%rsi,%r14,8),%rax + movq %r14,(%rsp,%r14,8) + movq %rax,(%rdi,%r14,8) + leaq 1(%r14),%r14 + subq $1,%r15 + jnz L$copy + + movq 8(%rsp,%r9,8),%rsi + movq $1,%rax + movq (%rsi),%r15 + movq 8(%rsi),%r14 + movq 16(%rsi),%r13 + movq 24(%rsi),%r12 + movq 32(%rsi),%rbp + movq 40(%rsi),%rbx + leaq 48(%rsi),%rsp +L$mul_epilogue: + .byte 0xf3,0xc3 + + +.p2align 4 +bn_mul4x_mont_gather5: +L$mul4x_enter: + movl %r9d,%r9d + movl 8(%rsp),%r10d + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rsp,%rax + leaq 4(%r9),%r11 + negq %r11 + leaq (%rsp,%r11,8),%rsp + andq $-1024,%rsp + + movq %rax,8(%rsp,%r9,8) +L$mul4x_body: + movq %rdi,16(%rsp,%r9,8) + movq %rdx,%r12 + movq %r10,%r11 + shrq $3,%r10 + andq $7,%r11 + notq %r10 + leaq L$magic_masks(%rip),%rax + andq $3,%r10 + leaq 96(%r12,%r11,8),%r12 + movq 0(%rax,%r10,8),%xmm4 + movq 8(%rax,%r10,8),%xmm5 + movq 16(%rax,%r10,8),%xmm6 + movq 24(%rax,%r10,8),%xmm7 + + movq -96(%r12),%xmm0 + movq -32(%r12),%xmm1 + pand %xmm4,%xmm0 + movq 32(%r12),%xmm2 + pand %xmm5,%xmm1 + movq 96(%r12),%xmm3 + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 + por %xmm2,%xmm0 + leaq 256(%r12),%r12 + por %xmm3,%xmm0 + +.byte 102,72,15,126,195 + movq (%r8),%r8 + movq (%rsi),%rax + + xorq %r14,%r14 + xorq %r15,%r15 + + movq -96(%r12),%xmm0 + movq -32(%r12),%xmm1 + pand 
%xmm4,%xmm0 + movq 32(%r12),%xmm2 + pand %xmm5,%xmm1 + + movq %r8,%rbp + mulq %rbx + movq %rax,%r10 + movq (%rcx),%rax + + movq 96(%r12),%xmm3 + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 + + imulq %r10,%rbp + movq %rdx,%r11 + + por %xmm2,%xmm0 + leaq 256(%r12),%r12 + por %xmm3,%xmm0 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 4(%r15),%r15 + adcq $0,%rdx + movq %rdi,(%rsp) + movq %rdx,%r13 + jmp L$1st4x +.p2align 4 +L$1st4x: + mulq %rbx + addq %rax,%r10 + movq -16(%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%rsp,%r15,8) + movq %rdx,%r13 + + mulq %rbx + addq %rax,%r10 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq 8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx,%r15,8),%rax + adcq $0,%rdx + leaq 4(%r15),%r15 + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq -16(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-32(%rsp,%r15,8) + movq %rdx,%r13 + cmpq %r9,%r15 + jl L$1st4x + + mulq %rbx + addq %rax,%r10 + movq -16(%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx,%r15,8),%rax + adcq $0,%rdx + movq 
%rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%rsp,%r15,8) + movq %rdx,%r13 + +.byte 102,72,15,126,195 + + xorq %rdi,%rdi + addq %r10,%r13 + adcq $0,%rdi + movq %r13,-8(%rsp,%r15,8) + movq %rdi,(%rsp,%r15,8) + + leaq 1(%r14),%r14 +.p2align 2 +L$outer4x: + xorq %r15,%r15 + movq -96(%r12),%xmm0 + movq -32(%r12),%xmm1 + pand %xmm4,%xmm0 + movq 32(%r12),%xmm2 + pand %xmm5,%xmm1 + + movq (%rsp),%r10 + movq %r8,%rbp + mulq %rbx + addq %rax,%r10 + movq (%rcx),%rax + adcq $0,%rdx + + movq 96(%r12),%xmm3 + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 + + imulq %r10,%rbp + movq %rdx,%r11 + + por %xmm2,%xmm0 + leaq 256(%r12),%r12 + por %xmm3,%xmm0 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + addq 8(%rsp),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 4(%r15),%r15 + adcq $0,%rdx + movq %rdx,%r13 + jmp L$inner4x +.p2align 4 +L$inner4x: + mulq %rbx + addq %rax,%r10 + movq -16(%rcx,%r15,8),%rax + adcq $0,%rdx + addq -16(%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdi,-32(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx,%r15,8),%rax + adcq $0,%rdx + addq -8(%rsp,%r15,8),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %r13,-24(%rsp,%r15,8) + movq %rdx,%r13 + + mulq %rbx + addq %rax,%r10 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + addq (%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq 8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdi,-16(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 
8(%rcx,%r15,8),%rax + adcq $0,%rdx + addq 8(%rsp,%r15,8),%r11 + adcq $0,%rdx + leaq 4(%r15),%r15 + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq -16(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %r13,-40(%rsp,%r15,8) + movq %rdx,%r13 + cmpq %r9,%r15 + jl L$inner4x + + mulq %rbx + addq %rax,%r10 + movq -16(%rcx,%r15,8),%rax + adcq $0,%rdx + addq -16(%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdi,-32(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx,%r15,8),%rax + adcq $0,%rdx + addq -8(%rsp,%r15,8),%r11 + adcq $0,%rdx + leaq 1(%r14),%r14 + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %r13,-24(%rsp,%r15,8) + movq %rdx,%r13 + +.byte 102,72,15,126,195 + movq %rdi,-16(%rsp,%r15,8) + + xorq %rdi,%rdi + addq %r10,%r13 + adcq $0,%rdi + addq (%rsp,%r9,8),%r13 + adcq $0,%rdi + movq %r13,-8(%rsp,%r15,8) + movq %rdi,(%rsp,%r15,8) + + cmpq %r9,%r14 + jl L$outer4x + movq 16(%rsp,%r9,8),%rdi + movq 0(%rsp),%rax + pxor %xmm0,%xmm0 + movq 8(%rsp),%rdx + shrq $2,%r9 + leaq (%rsp),%rsi + xorq %r14,%r14 + + subq 0(%rcx),%rax + movq 16(%rsi),%rbx + movq 24(%rsi),%rbp + sbbq 8(%rcx),%rdx + leaq -1(%r9),%r15 + jmp L$sub4x +.p2align 4 +L$sub4x: + movq %rax,0(%rdi,%r14,8) + movq %rdx,8(%rdi,%r14,8) + sbbq 16(%rcx,%r14,8),%rbx + movq 32(%rsi,%r14,8),%rax + movq 40(%rsi,%r14,8),%rdx + sbbq 24(%rcx,%r14,8),%rbp + movq %rbx,16(%rdi,%r14,8) + movq %rbp,24(%rdi,%r14,8) + sbbq 32(%rcx,%r14,8),%rax + movq 48(%rsi,%r14,8),%rbx + movq 56(%rsi,%r14,8),%rbp + sbbq 40(%rcx,%r14,8),%rdx + leaq 4(%r14),%r14 + decq %r15 + jnz L$sub4x + + movq %rax,0(%rdi,%r14,8) + movq 32(%rsi,%r14,8),%rax + sbbq 16(%rcx,%r14,8),%rbx + movq %rdx,8(%rdi,%r14,8) + sbbq 24(%rcx,%r14,8),%rbp + movq %rbx,16(%rdi,%r14,8) + + sbbq $0,%rax + movq %rbp,24(%rdi,%r14,8) + xorq 
%r14,%r14
+	andq	%rax,%rsi		# rax is the mask left by the preceding sbbq $0; keep the stack-temp address under it
+	notq	%rax
+	movq	%rdi,%rcx
+	andq	%rax,%rcx		# keep the result-buffer address under the inverted mask
+	leaq	-1(%r9),%r15		# r9 was shifted right by 2 above, so r15 = (count/4) - 1 loop trips
+	orq	%rcx,%rsi		# rsi = copy source, chosen between temp and result without a branch
+
+	movdqu	(%rsi),%xmm1
+	movdqa	%xmm0,(%rsp)		# xmm0 was cleared by pxor above: wipe the stack temp while copying
+	movdqu	%xmm1,(%rdi)
+	jmp	L$copy4x
+.p2align	4
+L$copy4x:
+	movdqu	16(%rsi,%r14,1),%xmm2	# r14 is a byte offset here; 32 bytes are handled per trip
+	movdqu	32(%rsi,%r14,1),%xmm1
+	movdqa	%xmm0,16(%rsp,%r14,1)
+	movdqu	%xmm2,16(%rdi,%r14,1)
+	movdqa	%xmm0,32(%rsp,%r14,1)
+	movdqu	%xmm1,32(%rdi,%r14,1)
+	leaq	32(%r14),%r14
+	decq	%r15
+	jnz	L$copy4x
+
+	shlq	$2,%r9			# undo the earlier shrq $2 so r9 is the word count again
+	movdqu	16(%rsi,%r14,1),%xmm2
+	movdqa	%xmm0,16(%rsp,%r14,1)
+	movdqu	%xmm2,16(%rdi,%r14,1)
+	movq	8(%rsp,%r9,8),%rsi	# stack pointer value stashed at this slot by the prologue
+	movq	$1,%rax			# return value 1
+	movq	(%rsi),%r15		# reload the six registers pushed on entry
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r13
+	movq	24(%rsi),%r12
+	movq	32(%rsi),%rbp
+	movq	40(%rsi),%rbx
+	leaq	48(%rsi),%rsp
+L$mul4x_epilogue:
+	.byte	0xf3,0xc3		# encoded "rep ret"
+# _bn_scatter5: rdi = source words, rsi = word count, rdx = table base, rcx = slot index.
+.globl	_bn_scatter5
+# Stores source word i at table + index*8 + i*256; returns at once when the count is 0.
+.p2align	4
+_bn_scatter5:
+	cmpq	$0,%rsi
+	jz	L$scatter_epilogue
+	leaq	(%rdx,%rcx,8),%rdx	# rdx = table + index*8
+L$scatter:
+	movq	(%rdi),%rax
+	leaq	8(%rdi),%rdi
+	movq	%rax,(%rdx)
+	leaq	256(%rdx),%rdx		# consecutive words of one entry sit 256 bytes apart
+	subq	$1,%rsi
+	jnz	L$scatter
+L$scatter_epilogue:
+	.byte	0xf3,0xc3		# encoded "rep ret"
+# _bn_gather5: rdi = output words, rsi = word count, rdx = table base, rcx = slot index.
+# Reads back what _bn_scatter5 stored. There is no zero-count guard here, so the count must be non-zero.
+.globl	_bn_gather5
+
+.p2align	4
+_bn_gather5:
+	movq	%rcx,%r11
+	shrq	$3,%rcx
+	andq	$7,%r11			# r11 = index & 7
+	notq	%rcx
+	leaq	L$magic_masks(%rip),%rax
+	andq	$3,%rcx			# rcx = ~(index >> 3) & 3 = 3 - ((index >> 3) & 3)
+	leaq	96(%rdx,%r11,8),%rdx	# rdx = table + (index & 7)*8, biased by +96
+	movq	0(%rax,%rcx,8),%xmm4	# four consecutive mask qwords; exactly one of them is all-ones
+	movq	8(%rax,%rcx,8),%xmm5
+	movq	16(%rax,%rcx,8),%xmm6
+	movq	24(%rax,%rcx,8),%xmm7
+	jmp	L$gather
+.p2align	4
+L$gather:
+	movq	-96(%rdx),%xmm0		# always load all four candidates, 64 bytes apart
+	movq	-32(%rdx),%xmm1
+	pand	%xmm4,%xmm0
+	movq	32(%rdx),%xmm2
+	pand	%xmm5,%xmm1
+	movq	96(%rdx),%xmm3
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+	por	%xmm2,%xmm0
+	leaq	256(%rdx),%rdx		# next word of the entry
+	por	%xmm3,%xmm0		# xmm0 = the one candidate whose mask is all-ones
+
+	movq	%xmm0,(%rdi)
+	leaq	8(%rdi),%rdi
+	subq	$1,%rsi
+	jnz	L$gather
+	.byte	0xf3,0xc3		# encoded "rep ret"
+L$SEH_end_bn_gather5:
+
+.p2align	6
+L$magic_masks:
+.long	0,0, 0,0, 0,0, -1,-1	# as qwords: 0, 0, 0, all-ones
+.long	0,0, 0,0, 0,0, 0,0	# as qwords: 0, 0, 0, 0
+.byte
77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 diff --git a/deps/openssl/asm/x64-win32-masm/bn/x86_64-gf2m.asm b/deps/openssl/asm/x64-win32-masm/bn/x86_64-gf2m.asm new file mode 100644 index 0000000000..a58049d3c8 --- /dev/null +++ b/deps/openssl/asm/x64-win32-masm/bn/x86_64-gf2m.asm @@ -0,0 +1,404 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(64) 'CODE' + + +ALIGN 16 +_mul_1x1 PROC PRIVATE + sub rsp,128+8 + mov r9,-1 + lea rsi,QWORD PTR[rax*1+rax] + shr r9,3 + lea rdi,QWORD PTR[rax*4] + and r9,rax + lea r12,QWORD PTR[rax*8] + sar rax,63 + lea r10,QWORD PTR[r9*1+r9] + sar rsi,63 + lea r11,QWORD PTR[r9*4] + and rax,rbp + sar rdi,63 + mov rdx,rax + shl rax,63 + and rsi,rbp + shr rdx,1 + mov rcx,rsi + shl rsi,62 + and rdi,rbp + shr rcx,2 + xor rax,rsi + mov rbx,rdi + shl rdi,61 + xor rdx,rcx + shr rbx,3 + xor rax,rdi + xor rdx,rbx + + mov r13,r9 + mov QWORD PTR[rsp],0 + xor r13,r10 + mov QWORD PTR[8+rsp],r9 + mov r14,r11 + mov QWORD PTR[16+rsp],r10 + xor r14,r12 + mov QWORD PTR[24+rsp],r13 + + xor r9,r11 + mov QWORD PTR[32+rsp],r11 + xor r10,r11 + mov QWORD PTR[40+rsp],r9 + xor r13,r11 + mov QWORD PTR[48+rsp],r10 + xor r9,r14 + mov QWORD PTR[56+rsp],r13 + xor r10,r14 + + mov QWORD PTR[64+rsp],r12 + xor r13,r14 + mov QWORD PTR[72+rsp],r9 + xor r9,r11 + mov QWORD PTR[80+rsp],r10 + xor r10,r11 + mov QWORD PTR[88+rsp],r13 + + xor r13,r11 + mov QWORD PTR[96+rsp],r14 + mov rsi,r8 + mov QWORD PTR[104+rsp],r9 + and rsi,rbp + mov QWORD PTR[112+rsp],r10 + shr rbp,4 + mov QWORD PTR[120+rsp],r13 + mov rdi,r8 + and rdi,rbp + shr rbp,4 + + movq xmm0,QWORD PTR[rsi*8+rsp] + mov rsi,r8 + and rsi,rbp + shr rbp,4 + mov rcx,QWORD PTR[rdi*8+rsp] + mov rdi,r8 + mov rbx,rcx + shl rcx,4 + and rdi,rbp + movq xmm1,QWORD 
PTR[rsi*8+rsp] + shr rbx,60 + xor rax,rcx + pslldq xmm1,1 + mov rsi,r8 + shr rbp,4 + xor rdx,rbx + and rsi,rbp + shr rbp,4 + pxor xmm0,xmm1 + mov rcx,QWORD PTR[rdi*8+rsp] + mov rdi,r8 + mov rbx,rcx + shl rcx,12 + and rdi,rbp + movq xmm1,QWORD PTR[rsi*8+rsp] + shr rbx,52 + xor rax,rcx + pslldq xmm1,2 + mov rsi,r8 + shr rbp,4 + xor rdx,rbx + and rsi,rbp + shr rbp,4 + pxor xmm0,xmm1 + mov rcx,QWORD PTR[rdi*8+rsp] + mov rdi,r8 + mov rbx,rcx + shl rcx,20 + and rdi,rbp + movq xmm1,QWORD PTR[rsi*8+rsp] + shr rbx,44 + xor rax,rcx + pslldq xmm1,3 + mov rsi,r8 + shr rbp,4 + xor rdx,rbx + and rsi,rbp + shr rbp,4 + pxor xmm0,xmm1 + mov rcx,QWORD PTR[rdi*8+rsp] + mov rdi,r8 + mov rbx,rcx + shl rcx,28 + and rdi,rbp + movq xmm1,QWORD PTR[rsi*8+rsp] + shr rbx,36 + xor rax,rcx + pslldq xmm1,4 + mov rsi,r8 + shr rbp,4 + xor rdx,rbx + and rsi,rbp + shr rbp,4 + pxor xmm0,xmm1 + mov rcx,QWORD PTR[rdi*8+rsp] + mov rdi,r8 + mov rbx,rcx + shl rcx,36 + and rdi,rbp + movq xmm1,QWORD PTR[rsi*8+rsp] + shr rbx,28 + xor rax,rcx + pslldq xmm1,5 + mov rsi,r8 + shr rbp,4 + xor rdx,rbx + and rsi,rbp + shr rbp,4 + pxor xmm0,xmm1 + mov rcx,QWORD PTR[rdi*8+rsp] + mov rdi,r8 + mov rbx,rcx + shl rcx,44 + and rdi,rbp + movq xmm1,QWORD PTR[rsi*8+rsp] + shr rbx,20 + xor rax,rcx + pslldq xmm1,6 + mov rsi,r8 + shr rbp,4 + xor rdx,rbx + and rsi,rbp + shr rbp,4 + pxor xmm0,xmm1 + mov rcx,QWORD PTR[rdi*8+rsp] + mov rdi,r8 + mov rbx,rcx + shl rcx,52 + and rdi,rbp + movq xmm1,QWORD PTR[rsi*8+rsp] + shr rbx,12 + xor rax,rcx + pslldq xmm1,7 + mov rsi,r8 + shr rbp,4 + xor rdx,rbx + and rsi,rbp + shr rbp,4 + pxor xmm0,xmm1 + mov rcx,QWORD PTR[rdi*8+rsp] + mov rbx,rcx + shl rcx,60 +DB 102,72,15,126,198 + shr rbx,4 + xor rax,rcx + psrldq xmm0,8 + xor rdx,rbx +DB 102,72,15,126,199 + xor rax,rsi + xor rdx,rdi + + add rsp,128+8 + DB 0F3h,0C3h ;repret +$L$end_mul_1x1:: +_mul_1x1 ENDP +EXTERN OPENSSL_ia32cap_P:NEAR +PUBLIC bn_GF2m_mul_2x2 + +ALIGN 16 +bn_GF2m_mul_2x2 PROC PUBLIC + mov rax,QWORD PTR[OPENSSL_ia32cap_P] + bt 
rax,33 + jnc $L$vanilla_mul_2x2 + +DB 102,72,15,110,194 +DB 102,73,15,110,201 +DB 102,73,15,110,208 + movq xmm3,QWORD PTR[40+rsp] + movdqa xmm4,xmm0 + movdqa xmm5,xmm1 +DB 102,15,58,68,193,0 + pxor xmm4,xmm2 + pxor xmm5,xmm3 +DB 102,15,58,68,211,0 +DB 102,15,58,68,229,0 + xorps xmm4,xmm0 + xorps xmm4,xmm2 + movdqa xmm5,xmm4 + pslldq xmm4,8 + psrldq xmm5,8 + pxor xmm2,xmm4 + pxor xmm0,xmm5 + movdqu XMMWORD PTR[rcx],xmm2 + movdqu XMMWORD PTR[16+rcx],xmm0 + DB 0F3h,0C3h ;repret + +ALIGN 16 +$L$vanilla_mul_2x2:: + lea rsp,QWORD PTR[((-136))+rsp] + mov r10,QWORD PTR[176+rsp] + mov QWORD PTR[120+rsp],rdi + mov QWORD PTR[128+rsp],rsi + mov QWORD PTR[80+rsp],r14 + mov QWORD PTR[88+rsp],r13 + mov QWORD PTR[96+rsp],r12 + mov QWORD PTR[104+rsp],rbp + mov QWORD PTR[112+rsp],rbx +$L$body_mul_2x2:: + mov QWORD PTR[32+rsp],rcx + mov QWORD PTR[40+rsp],rdx + mov QWORD PTR[48+rsp],r8 + mov QWORD PTR[56+rsp],r9 + mov QWORD PTR[64+rsp],r10 + + mov r8,0fh + mov rax,rdx + mov rbp,r9 + call _mul_1x1 + + mov QWORD PTR[16+rsp],rax + mov QWORD PTR[24+rsp],rdx + + mov rax,QWORD PTR[48+rsp] + mov rbp,QWORD PTR[64+rsp] + call _mul_1x1 + + mov QWORD PTR[rsp],rax + mov QWORD PTR[8+rsp],rdx + + mov rax,QWORD PTR[40+rsp] + mov rbp,QWORD PTR[56+rsp] + xor rax,QWORD PTR[48+rsp] + xor rbp,QWORD PTR[64+rsp] + call _mul_1x1 + + mov rbx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + mov rdi,QWORD PTR[16+rsp] + mov rsi,QWORD PTR[24+rsp] + mov rbp,QWORD PTR[32+rsp] + + xor rax,rdx + xor rdx,rcx + xor rax,rbx + mov QWORD PTR[rbp],rbx + xor rdx,rdi + mov QWORD PTR[24+rbp],rsi + xor rax,rsi + xor rdx,rsi + xor rax,rdx + mov QWORD PTR[16+rbp],rdx + mov QWORD PTR[8+rbp],rax + + mov r14,QWORD PTR[80+rsp] + mov r13,QWORD PTR[88+rsp] + mov r12,QWORD PTR[96+rsp] + mov rbp,QWORD PTR[104+rsp] + mov rbx,QWORD PTR[112+rsp] + mov rdi,QWORD PTR[120+rsp] + mov rsi,QWORD PTR[128+rsp] + lea rsp,QWORD PTR[136+rsp] + DB 0F3h,0C3h ;repret +$L$end_mul_2x2:: +bn_GF2m_mul_2x2 ENDP +DB 
71,70,40,50,94,109,41,32,77,117,108,116,105,112,108,105
+DB	99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54
+DB	52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
+DB	32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
+DB	111,114,103,62,0		; the DB run is the ASCII banner "GF(2^m) Multiplication for x86_64, CRYPTOGAMS by <appro@openssl.org>"
+ALIGN	16
+EXTERN	__imp_RtlVirtualUnwind:NEAR
+; se_handler: exception handler referenced from $L$SEH_info_2x2; it undoes the frame of bn_GF2m_mul_2x2's vanilla path.
+; NOTE(review): r8/r9 are presumably CONTEXT* / DISPATCHER_CONTEXT*, and field names below follow those layouts - confirm against winnt.h.
+ALIGN	16
+se_handler	PROC PRIVATE
+	push	rsi
+	push	rdi
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	pushfq
+	sub	rsp,64
+
+	mov	rax,QWORD PTR[152+r8]	; presumably context->Rsp
+	mov	rbx,QWORD PTR[248+r8]	; presumably context->Rip
+
+	lea	r10,QWORD PTR[$L$body_mul_2x2]
+	cmp	rbx,r10
+	jb	$L$in_prologue		; faulting address below the body label: register slots not trusted, skip the reload
+
+	mov	r14,QWORD PTR[80+rax]	; same stack slots the $L$vanilla_mul_2x2 prologue stores to
+	mov	r13,QWORD PTR[88+rax]
+	mov	r12,QWORD PTR[96+rax]
+	mov	rbp,QWORD PTR[104+rax]
+	mov	rbx,QWORD PTR[112+rax]
+	mov	rdi,QWORD PTR[120+rax]
+	mov	rsi,QWORD PTR[128+rax]
+
+	mov	QWORD PTR[144+r8],rbx	; write the recovered values back into the context record
+	mov	QWORD PTR[160+r8],rbp
+	mov	QWORD PTR[168+r8],rsi
+	mov	QWORD PTR[176+r8],rdi
+	mov	QWORD PTR[216+r8],r12
+	mov	QWORD PTR[224+r8],r13
+	mov	QWORD PTR[232+r8],r14
+
+$L$in_prologue::
+	lea	rax,QWORD PTR[136+rax]	; 136 = the frame size the function subtracts from rsp
+	mov	QWORD PTR[152+r8],rax
+
+	mov	rdi,QWORD PTR[40+r9]	; presumably disp->ContextRecord
+	mov	rsi,r8
+	mov	ecx,154			; 154 qwords = 1232 bytes
+	DD	0a548f3fch		; bytes FC F3 48 A5 = cld; rep movsq (copy rsi -> rdi)
+
+
+	mov	rsi,r9
+	xor	rcx,rcx			; first argument = 0
+	mov	rdx,QWORD PTR[8+rsi]
+	mov	r8,QWORD PTR[rsi]
+	mov	r9,QWORD PTR[16+rsi]
+	mov	r10,QWORD PTR[40+rsi]
+	lea	r11,QWORD PTR[56+rsi]
+	lea	r12,QWORD PTR[24+rsi]
+	mov	QWORD PTR[32+rsp],r10	; stack arguments start above the 32-byte shadow space
+	mov	QWORD PTR[40+rsp],r11
+	mov	QWORD PTR[48+rsp],r12
+	mov	QWORD PTR[56+rsp],rcx
+	call	QWORD PTR[__imp_RtlVirtualUnwind]
+
+	mov	eax,1			; presumably ExceptionContinueSearch - confirm
+	add	rsp,64
+	popfq
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rbp
+	pop	rbx
+	pop	rdi
+	pop	rsi
+	DB	0F3h,0C3h		;repret
+se_handler	ENDP
+
+.text$	ENDS
+.pdata	SEGMENT READONLY ALIGN(4)
+ALIGN	4
+	DD	imagerel _mul_1x1	; each .pdata entry: begin, end, unwind-info address
+	DD	imagerel $L$end_mul_1x1
+	DD	imagerel $L$SEH_info_1x1
+
+	DD	imagerel $L$vanilla_mul_2x2
+	DD	imagerel $L$end_mul_2x2
+	DD	imagerel $L$SEH_info_2x2
+.pdata	ENDS
+.xdata	SEGMENT READONLY ALIGN(8)
+ALIGN	8
+$L$SEH_info_1x1::
+DB	001h,007h,002h,000h	; presumably: version 1, 7-byte prologue, 2 unwind-code slots
+DB	007h,001h,011h,000h	; presumably: large stack alloc of 011h*8 = 136 bytes, matching "sub rsp,128+8" in _mul_1x1
+
+$L$SEH_info_2x2:: +DB 9,0,0,0 + DD imagerel se_handler + +.xdata ENDS +END diff --git a/deps/openssl/asm/x64-win32-masm/bn/x86_64-mont5.asm b/deps/openssl/asm/x64-win32-masm/bn/x86_64-mont5.asm new file mode 100644 index 0000000000..e43204eba6 --- /dev/null +++ b/deps/openssl/asm/x64-win32-masm/bn/x86_64-mont5.asm @@ -0,0 +1,990 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(64) 'CODE' + +PUBLIC bn_mul_mont_gather5 + +ALIGN 64 +bn_mul_mont_gather5 PROC PUBLIC + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_bn_mul_mont_gather5:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + mov r9,QWORD PTR[48+rsp] + + + test r9d,3 + jnz $L$mul_enter + cmp r9d,8 + jb $L$mul_enter + jmp $L$mul4x_enter + +ALIGN 16 +$L$mul_enter:: + mov r9d,r9d + mov r10d,DWORD PTR[56+rsp] + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + lea rsp,QWORD PTR[((-40))+rsp] + movaps XMMWORD PTR[rsp],xmm6 + movaps XMMWORD PTR[16+rsp],xmm7 +$L$mul_alloca:: + mov rax,rsp + lea r11,QWORD PTR[2+r9] + neg r11 + lea rsp,QWORD PTR[r11*8+rsp] + and rsp,-1024 + + mov QWORD PTR[8+r9*8+rsp],rax +$L$mul_body:: + mov r12,rdx + mov r11,r10 + shr r10,3 + and r11,7 + not r10 + lea rax,QWORD PTR[$L$magic_masks] + and r10,3 + lea r12,QWORD PTR[96+r11*8+r12] + movq xmm4,QWORD PTR[r10*8+rax] + movq xmm5,QWORD PTR[8+r10*8+rax] + movq xmm6,QWORD PTR[16+r10*8+rax] + movq xmm7,QWORD PTR[24+r10*8+rax] + + movq xmm0,QWORD PTR[((-96))+r12] + movq xmm1,QWORD PTR[((-32))+r12] + pand xmm0,xmm4 + movq xmm2,QWORD PTR[32+r12] + pand xmm1,xmm5 + movq xmm3,QWORD PTR[96+r12] + pand xmm2,xmm6 + por xmm0,xmm1 + pand xmm3,xmm7 + por xmm0,xmm2 + lea r12,QWORD PTR[256+r12] + por xmm0,xmm3 + +DB 102,72,15,126,195 + + mov r8,QWORD PTR[r8] + mov rax,QWORD PTR[rsi] + + xor r14,r14 + xor r15,r15 + + movq xmm0,QWORD PTR[((-96))+r12] + movq xmm1,QWORD PTR[((-32))+r12] + pand xmm0,xmm4 + movq xmm2,QWORD PTR[32+r12] + pand xmm1,xmm5 + + mov rbp,r8 + mul rbx 
+ mov r10,rax + mov rax,QWORD PTR[rcx] + + movq xmm3,QWORD PTR[96+r12] + pand xmm2,xmm6 + por xmm0,xmm1 + pand xmm3,xmm7 + + imul rbp,r10 + mov r11,rdx + + por xmm0,xmm2 + lea r12,QWORD PTR[256+r12] + por xmm0,xmm3 + + mul rbp + add r10,rax + mov rax,QWORD PTR[8+rsi] + adc rdx,0 + mov r13,rdx + + lea r15,QWORD PTR[1+r15] + jmp $L$1st_enter + +ALIGN 16 +$L$1st:: + add r13,rax + mov rax,QWORD PTR[r15*8+rsi] + adc rdx,0 + add r13,r11 + mov r11,r10 + adc rdx,0 + mov QWORD PTR[((-16))+r15*8+rsp],r13 + mov r13,rdx + +$L$1st_enter:: + mul rbx + add r11,rax + mov rax,QWORD PTR[r15*8+rcx] + adc rdx,0 + lea r15,QWORD PTR[1+r15] + mov r10,rdx + + mul rbp + cmp r15,r9 + jne $L$1st + +DB 102,72,15,126,195 + + add r13,rax + mov rax,QWORD PTR[rsi] + adc rdx,0 + add r13,r11 + adc rdx,0 + mov QWORD PTR[((-16))+r15*8+rsp],r13 + mov r13,rdx + mov r11,r10 + + xor rdx,rdx + add r13,r11 + adc rdx,0 + mov QWORD PTR[((-8))+r9*8+rsp],r13 + mov QWORD PTR[r9*8+rsp],rdx + + lea r14,QWORD PTR[1+r14] + jmp $L$outer +ALIGN 16 +$L$outer:: + xor r15,r15 + mov rbp,r8 + mov r10,QWORD PTR[rsp] + + movq xmm0,QWORD PTR[((-96))+r12] + movq xmm1,QWORD PTR[((-32))+r12] + pand xmm0,xmm4 + movq xmm2,QWORD PTR[32+r12] + pand xmm1,xmm5 + + mul rbx + add r10,rax + mov rax,QWORD PTR[rcx] + adc rdx,0 + + movq xmm3,QWORD PTR[96+r12] + pand xmm2,xmm6 + por xmm0,xmm1 + pand xmm3,xmm7 + + imul rbp,r10 + mov r11,rdx + + por xmm0,xmm2 + lea r12,QWORD PTR[256+r12] + por xmm0,xmm3 + + mul rbp + add r10,rax + mov rax,QWORD PTR[8+rsi] + adc rdx,0 + mov r10,QWORD PTR[8+rsp] + mov r13,rdx + + lea r15,QWORD PTR[1+r15] + jmp $L$inner_enter + +ALIGN 16 +$L$inner:: + add r13,rax + mov rax,QWORD PTR[r15*8+rsi] + adc rdx,0 + add r13,r10 + mov r10,QWORD PTR[r15*8+rsp] + adc rdx,0 + mov QWORD PTR[((-16))+r15*8+rsp],r13 + mov r13,rdx + +$L$inner_enter:: + mul rbx + add r11,rax + mov rax,QWORD PTR[r15*8+rcx] + adc rdx,0 + add r10,r11 + mov r11,rdx + adc r11,0 + lea r15,QWORD PTR[1+r15] + + mul rbp + cmp r15,r9 + jne $L$inner + +DB 
102,72,15,126,195 + + add r13,rax + mov rax,QWORD PTR[rsi] + adc rdx,0 + add r13,r10 + mov r10,QWORD PTR[r15*8+rsp] + adc rdx,0 + mov QWORD PTR[((-16))+r15*8+rsp],r13 + mov r13,rdx + + xor rdx,rdx + add r13,r11 + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD PTR[((-8))+r9*8+rsp],r13 + mov QWORD PTR[r9*8+rsp],rdx + + lea r14,QWORD PTR[1+r14] + cmp r14,r9 + jl $L$outer + + xor r14,r14 + mov rax,QWORD PTR[rsp] + lea rsi,QWORD PTR[rsp] + mov r15,r9 + jmp $L$sub +ALIGN 16 +$L$sub:: sbb rax,QWORD PTR[r14*8+rcx] + mov QWORD PTR[r14*8+rdi],rax + mov rax,QWORD PTR[8+r14*8+rsi] + lea r14,QWORD PTR[1+r14] + dec r15 + jnz $L$sub + + sbb rax,0 + xor r14,r14 + and rsi,rax + not rax + mov rcx,rdi + and rcx,rax + mov r15,r9 + or rsi,rcx +ALIGN 16 +$L$copy:: + mov rax,QWORD PTR[r14*8+rsi] + mov QWORD PTR[r14*8+rsp],r14 + mov QWORD PTR[r14*8+rdi],rax + lea r14,QWORD PTR[1+r14] + sub r15,1 + jnz $L$copy + + mov rsi,QWORD PTR[8+r9*8+rsp] + mov rax,1 + movaps xmm6,XMMWORD PTR[rsi] + movaps xmm7,XMMWORD PTR[16+rsi] + lea rsi,QWORD PTR[40+rsi] + mov r15,QWORD PTR[rsi] + mov r14,QWORD PTR[8+rsi] + mov r13,QWORD PTR[16+rsi] + mov r12,QWORD PTR[24+rsi] + mov rbp,QWORD PTR[32+rsi] + mov rbx,QWORD PTR[40+rsi] + lea rsp,QWORD PTR[48+rsi] +$L$mul_epilogue:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_bn_mul_mont_gather5:: +bn_mul_mont_gather5 ENDP + +ALIGN 16 +bn_mul4x_mont_gather5 PROC PRIVATE + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_bn_mul4x_mont_gather5:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + mov r9,QWORD PTR[48+rsp] + + +$L$mul4x_enter:: + mov r9d,r9d + mov r10d,DWORD PTR[56+rsp] + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + lea rsp,QWORD PTR[((-40))+rsp] + movaps XMMWORD PTR[rsp],xmm6 + movaps XMMWORD PTR[16+rsp],xmm7 +$L$mul4x_alloca:: + mov rax,rsp + lea r11,QWORD PTR[4+r9] + neg r11 + lea rsp,QWORD 
PTR[r11*8+rsp] + and rsp,-1024 + + mov QWORD PTR[8+r9*8+rsp],rax +$L$mul4x_body:: + mov QWORD PTR[16+r9*8+rsp],rdi + mov r12,rdx + mov r11,r10 + shr r10,3 + and r11,7 + not r10 + lea rax,QWORD PTR[$L$magic_masks] + and r10,3 + lea r12,QWORD PTR[96+r11*8+r12] + movq xmm4,QWORD PTR[r10*8+rax] + movq xmm5,QWORD PTR[8+r10*8+rax] + movq xmm6,QWORD PTR[16+r10*8+rax] + movq xmm7,QWORD PTR[24+r10*8+rax] + + movq xmm0,QWORD PTR[((-96))+r12] + movq xmm1,QWORD PTR[((-32))+r12] + pand xmm0,xmm4 + movq xmm2,QWORD PTR[32+r12] + pand xmm1,xmm5 + movq xmm3,QWORD PTR[96+r12] + pand xmm2,xmm6 + por xmm0,xmm1 + pand xmm3,xmm7 + por xmm0,xmm2 + lea r12,QWORD PTR[256+r12] + por xmm0,xmm3 + +DB 102,72,15,126,195 + mov r8,QWORD PTR[r8] + mov rax,QWORD PTR[rsi] + + xor r14,r14 + xor r15,r15 + + movq xmm0,QWORD PTR[((-96))+r12] + movq xmm1,QWORD PTR[((-32))+r12] + pand xmm0,xmm4 + movq xmm2,QWORD PTR[32+r12] + pand xmm1,xmm5 + + mov rbp,r8 + mul rbx + mov r10,rax + mov rax,QWORD PTR[rcx] + + movq xmm3,QWORD PTR[96+r12] + pand xmm2,xmm6 + por xmm0,xmm1 + pand xmm3,xmm7 + + imul rbp,r10 + mov r11,rdx + + por xmm0,xmm2 + lea r12,QWORD PTR[256+r12] + por xmm0,xmm3 + + mul rbp + add r10,rax + mov rax,QWORD PTR[8+rsi] + adc rdx,0 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD PTR[8+rcx] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD PTR[16+rsi] + adc rdx,0 + add rdi,r11 + lea r15,QWORD PTR[4+r15] + adc rdx,0 + mov QWORD PTR[rsp],rdi + mov r13,rdx + jmp $L$1st4x +ALIGN 16 +$L$1st4x:: + mul rbx + add r10,rax + mov rax,QWORD PTR[((-16))+r15*8+rcx] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD PTR[((-8))+r15*8+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD PTR[((-24))+r15*8+rsp],r13 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD PTR[((-8))+r15*8+rcx] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD PTR[r15*8+rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD PTR[((-16))+r15*8+rsp],rdi + mov r13,rdx + + mul 
rbx + add r10,rax + mov rax,QWORD PTR[r15*8+rcx] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD PTR[8+r15*8+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD PTR[((-8))+r15*8+rsp],r13 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD PTR[8+r15*8+rcx] + adc rdx,0 + lea r15,QWORD PTR[4+r15] + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD PTR[((-16))+r15*8+rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD PTR[((-32))+r15*8+rsp],rdi + mov r13,rdx + cmp r15,r9 + jl $L$1st4x + + mul rbx + add r10,rax + mov rax,QWORD PTR[((-16))+r15*8+rcx] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD PTR[((-8))+r15*8+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD PTR[((-24))+r15*8+rsp],r13 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD PTR[((-8))+r15*8+rcx] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD PTR[rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD PTR[((-16))+r15*8+rsp],rdi + mov r13,rdx + +DB 102,72,15,126,195 + + xor rdi,rdi + add r13,r10 + adc rdi,0 + mov QWORD PTR[((-8))+r15*8+rsp],r13 + mov QWORD PTR[r15*8+rsp],rdi + + lea r14,QWORD PTR[1+r14] +ALIGN 4 +$L$outer4x:: + xor r15,r15 + movq xmm0,QWORD PTR[((-96))+r12] + movq xmm1,QWORD PTR[((-32))+r12] + pand xmm0,xmm4 + movq xmm2,QWORD PTR[32+r12] + pand xmm1,xmm5 + + mov r10,QWORD PTR[rsp] + mov rbp,r8 + mul rbx + add r10,rax + mov rax,QWORD PTR[rcx] + adc rdx,0 + + movq xmm3,QWORD PTR[96+r12] + pand xmm2,xmm6 + por xmm0,xmm1 + pand xmm3,xmm7 + + imul rbp,r10 + mov r11,rdx + + por xmm0,xmm2 + lea r12,QWORD PTR[256+r12] + por xmm0,xmm3 + + mul rbp + add r10,rax + mov rax,QWORD PTR[8+rsi] + adc rdx,0 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD PTR[8+rcx] + adc rdx,0 + add r11,QWORD PTR[8+rsp] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD PTR[16+rsi] + adc rdx,0 + add rdi,r11 + lea r15,QWORD PTR[4+r15] + adc rdx,0 + mov r13,rdx + jmp $L$inner4x +ALIGN 16 +$L$inner4x:: + mul rbx + add 
r10,rax + mov rax,QWORD PTR[((-16))+r15*8+rcx] + adc rdx,0 + add r10,QWORD PTR[((-16))+r15*8+rsp] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD PTR[((-8))+r15*8+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD PTR[((-32))+r15*8+rsp],rdi + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD PTR[((-8))+r15*8+rcx] + adc rdx,0 + add r11,QWORD PTR[((-8))+r15*8+rsp] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD PTR[r15*8+rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD PTR[((-24))+r15*8+rsp],r13 + mov r13,rdx + + mul rbx + add r10,rax + mov rax,QWORD PTR[r15*8+rcx] + adc rdx,0 + add r10,QWORD PTR[r15*8+rsp] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD PTR[8+r15*8+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD PTR[((-16))+r15*8+rsp],rdi + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD PTR[8+r15*8+rcx] + adc rdx,0 + add r11,QWORD PTR[8+r15*8+rsp] + adc rdx,0 + lea r15,QWORD PTR[4+r15] + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD PTR[((-16))+r15*8+rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD PTR[((-40))+r15*8+rsp],r13 + mov r13,rdx + cmp r15,r9 + jl $L$inner4x + + mul rbx + add r10,rax + mov rax,QWORD PTR[((-16))+r15*8+rcx] + adc rdx,0 + add r10,QWORD PTR[((-16))+r15*8+rsp] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD PTR[((-8))+r15*8+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD PTR[((-32))+r15*8+rsp],rdi + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD PTR[((-8))+r15*8+rcx] + adc rdx,0 + add r11,QWORD PTR[((-8))+r15*8+rsp] + adc rdx,0 + lea r14,QWORD PTR[1+r14] + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD PTR[rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD PTR[((-24))+r15*8+rsp],r13 + mov r13,rdx + +DB 102,72,15,126,195 + mov QWORD PTR[((-16))+r15*8+rsp],rdi + + xor rdi,rdi + add r13,r10 + adc rdi,0 + add r13,QWORD PTR[r9*8+rsp] + adc rdi,0 + mov QWORD PTR[((-8))+r15*8+rsp],r13 + mov QWORD 
PTR[r15*8+rsp],rdi + + cmp r14,r9 + jl $L$outer4x + mov rdi,QWORD PTR[16+r9*8+rsp] + mov rax,QWORD PTR[rsp] + pxor xmm0,xmm0 + mov rdx,QWORD PTR[8+rsp] + shr r9,2 + lea rsi,QWORD PTR[rsp] + xor r14,r14 + + sub rax,QWORD PTR[rcx] + mov rbx,QWORD PTR[16+rsi] + mov rbp,QWORD PTR[24+rsi] + sbb rdx,QWORD PTR[8+rcx] + lea r15,QWORD PTR[((-1))+r9] + jmp $L$sub4x +ALIGN 16 +$L$sub4x:: + mov QWORD PTR[r14*8+rdi],rax + mov QWORD PTR[8+r14*8+rdi],rdx + sbb rbx,QWORD PTR[16+r14*8+rcx] + mov rax,QWORD PTR[32+r14*8+rsi] + mov rdx,QWORD PTR[40+r14*8+rsi] + sbb rbp,QWORD PTR[24+r14*8+rcx] + mov QWORD PTR[16+r14*8+rdi],rbx + mov QWORD PTR[24+r14*8+rdi],rbp + sbb rax,QWORD PTR[32+r14*8+rcx] + mov rbx,QWORD PTR[48+r14*8+rsi] + mov rbp,QWORD PTR[56+r14*8+rsi] + sbb rdx,QWORD PTR[40+r14*8+rcx] + lea r14,QWORD PTR[4+r14] + dec r15 + jnz $L$sub4x + + mov QWORD PTR[r14*8+rdi],rax + mov rax,QWORD PTR[32+r14*8+rsi] + sbb rbx,QWORD PTR[16+r14*8+rcx] + mov QWORD PTR[8+r14*8+rdi],rdx + sbb rbp,QWORD PTR[24+r14*8+rcx] + mov QWORD PTR[16+r14*8+rdi],rbx + + sbb rax,0 + mov QWORD PTR[24+r14*8+rdi],rbp + xor r14,r14 + and rsi,rax + not rax + mov rcx,rdi + and rcx,rax + lea r15,QWORD PTR[((-1))+r9] + or rsi,rcx + + movdqu xmm1,XMMWORD PTR[rsi] + movdqa XMMWORD PTR[rsp],xmm0 + movdqu XMMWORD PTR[rdi],xmm1 + jmp $L$copy4x +ALIGN 16 +$L$copy4x:: + movdqu xmm2,XMMWORD PTR[16+r14*1+rsi] + movdqu xmm1,XMMWORD PTR[32+r14*1+rsi] + movdqa XMMWORD PTR[16+r14*1+rsp],xmm0 + movdqu XMMWORD PTR[16+r14*1+rdi],xmm2 + movdqa XMMWORD PTR[32+r14*1+rsp],xmm0 + movdqu XMMWORD PTR[32+r14*1+rdi],xmm1 + lea r14,QWORD PTR[32+r14] + dec r15 + jnz $L$copy4x + + shl r9,2 + movdqu xmm2,XMMWORD PTR[16+r14*1+rsi] + movdqa XMMWORD PTR[16+r14*1+rsp],xmm0 + movdqu XMMWORD PTR[16+r14*1+rdi],xmm2 + mov rsi,QWORD PTR[8+r9*8+rsp] + mov rax,1 + movaps xmm6,XMMWORD PTR[rsi] + movaps xmm7,XMMWORD PTR[16+rsi] + lea rsi,QWORD PTR[40+rsi] + mov r15,QWORD PTR[rsi] + mov r14,QWORD PTR[8+rsi] + mov r13,QWORD PTR[16+rsi] + mov r12,QWORD 
PTR[24+rsi]
+	mov	rbp,QWORD PTR[32+rsi]
+	mov	rbx,QWORD PTR[40+rsi]
+	lea	rsp,QWORD PTR[48+rsi]
+$L$mul4x_epilogue::
+	mov	rdi,QWORD PTR[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD PTR[16+rsp]
+	DB	0F3h,0C3h		;repret
+$L$SEH_end_bn_mul4x_mont_gather5::
+bn_mul4x_mont_gather5	ENDP
+PUBLIC	bn_scatter5
+; bn_scatter5: rcx = source words, rdx = word count, r8 = table base, r9 = slot index; stores word i at table + index*8 + i*256.
+ALIGN	16
+bn_scatter5	PROC PUBLIC
+	cmp	rdx,0
+	jz	$L$scatter_epilogue	; nothing to store when the count is 0
+	lea	r8,QWORD PTR[r9*8+r8]	; r8 = table + index*8
+$L$scatter::
+	mov	rax,QWORD PTR[rcx]
+	lea	rcx,QWORD PTR[8+rcx]
+	mov	QWORD PTR[r8],rax
+	lea	r8,QWORD PTR[256+r8]	; consecutive words of one entry sit 256 bytes apart
+	sub	rdx,1
+	jnz	$L$scatter
+$L$scatter_epilogue::
+	DB	0F3h,0C3h		;repret
+bn_scatter5	ENDP
+; bn_gather5: rcx = output words, rdx = word count (must be non-zero: no guard), r8 = table base, r9 = slot index.
+PUBLIC	bn_gather5
+; xmm6/xmm7 are non-volatile on Win64: saved by the byte-encoded prologue and reloaded before return. This file is generated - mirror the epilogue fix in x86_64-mont5.pl.
+ALIGN	16
+bn_gather5	PROC PUBLIC
+$L$SEH_begin_bn_gather5::
+
+DB	048h,083h,0ech,028h		; sub rsp,40
+
+DB	00fh,029h,034h,024h		; movaps [rsp],xmm6      (save)
+
+DB	00fh,029h,07ch,024h,010h	; movaps [rsp+16],xmm7   (save)
+
+	mov	r11,r9
+	shr	r9,3
+	and	r11,7			; r11 = index & 7
+	not	r9
+	lea	rax,QWORD PTR[$L$magic_masks]
+	and	r9,3			; r9 = ~(index >> 3) & 3 = 3 - ((index >> 3) & 3)
+	lea	r8,QWORD PTR[96+r11*8+r8]	; r8 = table + (index & 7)*8, biased by +96
+	movq	xmm4,QWORD PTR[r9*8+rax]	; four consecutive mask qwords; exactly one is all-ones
+	movq	xmm5,QWORD PTR[8+r9*8+rax]
+	movq	xmm6,QWORD PTR[16+r9*8+rax]
+	movq	xmm7,QWORD PTR[24+r9*8+rax]
+	jmp	$L$gather
+ALIGN	16
+$L$gather::
+	movq	xmm0,QWORD PTR[((-96))+r8]	; always load all four candidates, 64 bytes apart
+	movq	xmm1,QWORD PTR[((-32))+r8]
+	pand	xmm0,xmm4
+	movq	xmm2,QWORD PTR[32+r8]
+	pand	xmm1,xmm5
+	movq	xmm3,QWORD PTR[96+r8]
+	pand	xmm2,xmm6
+	por	xmm0,xmm1
+	pand	xmm3,xmm7
+	por	xmm0,xmm2
+	lea	r8,QWORD PTR[256+r8]	; next word of the entry
+	por	xmm0,xmm3		; xmm0 = the one candidate whose mask is all-ones
+
+	movq	QWORD PTR[rcx],xmm0
+	lea	rcx,QWORD PTR[8+rcx]
+	sub	rdx,1
+	jnz	$L$gather
+	movaps	xmm6,XMMWORD PTR[rsp]	; reload caller's xmm6 (was a store, which left the mask in xmm6)
+	movaps	xmm7,XMMWORD PTR[16+rsp]	; reload caller's xmm7 (same fix)
+	lea	rsp,QWORD PTR[40+rsp]
+	DB	0F3h,0C3h		;repret
+$L$SEH_end_bn_gather5::
+bn_gather5	ENDP
+ALIGN	64
+$L$magic_masks::
+	DD	0,0,0,0,0,0,-1,-1	; as qwords: 0, 0, 0, all-ones
+	DD	0,0,0,0,0,0,0,0		; as qwords: 0, 0, 0, 0
+DB	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
+DB	112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115
+DB	99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111
+DB	114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79
+DB	71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111
+DB
112,101,110,115,115,108,46,111,114,103,62,0
+EXTERN __imp_RtlVirtualUnwind:NEAR
+
+;-----------------------------------------------------------------------
+; mul_handler -- exception handler named in the .xdata records of
+; bn_mul_mont_gather5 and bn_mul4x_mont_gather5 further down.
+;
+; In:    r8, r9 = base pointers of two records. The field names used in
+;        the comments below assume r8 -> CONTEXT and
+;        r9 -> DISPATCHER_CONTEXT with the x64 winnt.h layout.
+;        NOTE(review): confirm the offsets against winnt.h.
+; Out:   eax = 1 (presumably ExceptionContinueSearch)
+; Saves: rsi, rdi, rbx, rbp, r12-r15 and rflags are pushed on entry and
+;        popped on exit.
+; Stack: 9 pushes + 64 bytes; the 64 bytes hold 32 bytes of shadow space
+;        plus four stack arguments for RtlVirtualUnwind.
+;
+; The handler compares the faulting address with three code labels taken
+; from the handler data (alloca / body / epilogue, in .xdata order) to
+; decide how far the interrupted prologue had progressed, and patches the
+; register values in the record at r8 accordingly before unwinding.
+;-----------------------------------------------------------------------
+ALIGN 16
+mul_handler PROC PRIVATE
+    push rsi
+    push rdi
+    push rbx
+    push rbp
+    push r12
+    push r13
+    push r14
+    push r15
+    pushfq
+    sub rsp,64
+
+    mov rax,QWORD PTR[120+r8]         ; context->Rax
+    mov rbx,QWORD PTR[248+r8]         ; context->Rip
+
+    mov rsi,QWORD PTR[8+r9]           ; disp->ImageBase
+    mov r11,QWORD PTR[56+r9]          ; disp->HandlerData (three image-relative labels)
+
+    mov r10d,DWORD PTR[r11]           ; HandlerData[0]
+    lea r10,QWORD PTR[r10*1+rsi]      ; ... as an absolute address
+    cmp rbx,r10
+    jb $L$common_seh_tail             ; Rip below first label: use Rax unchanged
+
+    lea rax,QWORD PTR[88+rax]         ; presumably 40-byte XMM area + six 8-byte pushes
+
+    mov r10d,DWORD PTR[4+r11]         ; HandlerData[1]
+    lea r10,QWORD PTR[r10*1+rsi]
+    cmp rbx,r10
+    jb $L$common_seh_tail             ; Rip below second label
+
+    mov rax,QWORD PTR[152+r8]         ; context->Rsp
+
+    mov r10d,DWORD PTR[8+r11]         ; HandlerData[2]
+    lea r10,QWORD PTR[r10*1+rsi]
+    cmp rbx,r10
+    jae $L$common_seh_tail            ; Rip at/after third label
+
+    ; Rip is inside the function body: fetch the saved stack pointer from
+    ; the frame, indexed by context->R9.
+    mov r10,QWORD PTR[192+r8]         ; context->R9
+    mov rax,QWORD PTR[8+r10*8+rax]    ; saved stack pointer
+
+    movaps xmm0,XMMWORD PTR[rax]      ; two saved 16-byte values at its base
+    movaps xmm1,XMMWORD PTR[16+rax]
+    lea rax,QWORD PTR[88+rax]         ; step past the save area (see 88 above)
+
+    mov rbx,QWORD PTR[((-8))+rax]     ; six saved qwords just below rax
+    mov rbp,QWORD PTR[((-16))+rax]
+    mov r12,QWORD PTR[((-24))+rax]
+    mov r13,QWORD PTR[((-32))+rax]
+    mov r14,QWORD PTR[((-40))+rax]
+    mov r15,QWORD PTR[((-48))+rax]
+    mov QWORD PTR[144+r8],rbx         ; context->Rbx
+    mov QWORD PTR[160+r8],rbp         ; context->Rbp
+    mov QWORD PTR[216+r8],r12         ; context->R12
+    mov QWORD PTR[224+r8],r13         ; context->R13
+    mov QWORD PTR[232+r8],r14         ; context->R14
+    mov QWORD PTR[240+r8],r15         ; context->R15
+    movups XMMWORD PTR[512+r8],xmm0   ; context->Xmm6
+    movups XMMWORD PTR[528+r8],xmm1   ; context->Xmm7
+
+$L$common_seh_tail::
+    ; rax is now taken as the stack pointer at function entry; the two
+    ; qwords above it are read as the saved rdi/rsi.
+    mov rdi,QWORD PTR[8+rax]
+    mov rsi,QWORD PTR[16+rax]
+    mov QWORD PTR[152+r8],rax         ; context->Rsp
+    mov QWORD PTR[168+r8],rsi         ; context->Rsi
+    mov QWORD PTR[176+r8],rdi         ; context->Rdi
+
+    ; Copy 154 qwords (1232 bytes) from the record at r8 to the buffer
+    ; pointed to by [r9+40] (disp->ContextRecord).
+    mov rdi,QWORD PTR[40+r9]
+    mov rsi,r8
+    mov ecx,154
+    DD 0a548f3fch                     ; bytes FC F3 48 A5 = cld / rep movsq
+
+
+    ; Call RtlVirtualUnwind: four register arguments, four on the stack
+    ; above the 32-byte shadow area.
+    mov rsi,r9
+    xor rcx,rcx                       ; arg1 = 0
+    mov rdx,QWORD PTR[8+rsi]          ; arg2 = disp->ImageBase
+    mov r8,QWORD PTR[rsi]             ; arg3 = disp->ControlPc
+    mov r9,QWORD PTR[16+rsi]          ; arg4 = disp->FunctionEntry
+    mov r10,QWORD PTR[40+rsi]         ; disp->ContextRecord
+    lea r11,QWORD PTR[56+rsi]         ; &disp->HandlerData
+    lea r12,QWORD PTR[24+rsi]         ; &disp->EstablisherFrame
+    mov QWORD PTR[32+rsp],r10         ; arg5
+    mov QWORD PTR[40+rsp],r11         ; arg6
+    mov QWORD PTR[48+rsp],r12         ; arg7
+    mov QWORD PTR[56+rsp],rcx         ; arg8 = 0
+    call QWORD PTR[__imp_RtlVirtualUnwind]
+
+    mov eax,1                         ; return value
+    add rsp,64
+    popfq
+    pop r15
+    pop r14
+    pop r13
+    pop r12
+    pop rbp
+    pop rbx
+    pop rdi
+    pop rsi
+    DB 0F3h,0C3h ;repret
+mul_handler ENDP
+
+.text$ ENDS
+; Function table: one (begin, end, unwind-info) triple of image-relative
+; addresses per function.
+.pdata SEGMENT READONLY ALIGN(4)
+ALIGN 4
+    DD imagerel $L$SEH_begin_bn_mul_mont_gather5
+    DD imagerel $L$SEH_end_bn_mul_mont_gather5
+    DD imagerel $L$SEH_info_bn_mul_mont_gather5
+
+    DD imagerel $L$SEH_begin_bn_mul4x_mont_gather5
+    DD imagerel $L$SEH_end_bn_mul4x_mont_gather5
+    DD imagerel $L$SEH_info_bn_mul4x_mont_gather5
+
+    DD imagerel $L$SEH_begin_bn_gather5
+    DD imagerel $L$SEH_end_bn_gather5
+    DD imagerel $L$SEH_info_bn_gather5
+
+.pdata ENDS
+; Unwind information referenced from .pdata.
+.xdata SEGMENT READONLY ALIGN(8)
+ALIGN 8
+; Header byte 9, then zero prologue size / unwind codes -- presumably
+; version 1 with the exception-handler flag. Followed by the handler and
+; the three labels mul_handler reads as HandlerData[0..2].
+$L$SEH_info_bn_mul_mont_gather5::
+DB 9,0,0,0
+    DD imagerel mul_handler
+    DD imagerel $L$mul_alloca,imagerel $L$mul_body,imagerel $L$mul_epilogue
+
+ALIGN 8
+$L$SEH_info_bn_mul4x_mont_gather5::
+DB 9,0,0,0
+    DD imagerel mul_handler
+    DD imagerel $L$mul4x_alloca,imagerel $L$mul4x_body,imagerel $L$mul4x_epilogue
+
+ALIGN 8
+; bn_gather5 has no handler; its prologue is described by unwind codes.
+; Decoded per the x64 UNWIND_CODE format (NOTE(review): verify):
+; 13-byte prologue, 5 code slots.
+$L$SEH_info_bn_gather5::
+DB 001h,00dh,005h,000h
+DB 00dh,078h,001h,000h                ; offset 13: xmm7 saved at rsp+16
+
+DB 008h,068h,000h,000h                ; offset 8: xmm6 saved at rsp+0
+
+DB 004h,042h,000h,000h                ; offset 4: 40-byte stack allocation
+
+ALIGN 8
+
+.xdata ENDS
+END
diff --git a/deps/openssl/openssl.gyp b/deps/openssl/openssl.gyp index 5e2df4ba98..3becfca2aa 100644 --- a/deps/openssl/openssl.gyp +++ b/deps/openssl/openssl.gyp @@ -696,6 +696,7 @@ 'LIB_BN_ASM', 'MD5_ASM', 'OPENSSL_BN_ASM', + 'OPENSSL_BN_ASM_MONT', 'OPENSSL_CPUID_OBJ', 'RIP_ASM', 'RMD160_ASM', @@ -730,12 +731,18 @@ ] }], ['OS!="win" and OS!="mac" and target_arch=="x64"', { + 'defines': [ + 'OPENSSL_BN_ASM_MONT5', + 'OPENSSL_BN_ASM_GF2m', + ], 'sources': [ 'asm/x64-elf-gas/aes/aes-x86_64.s', 'asm/x64-elf-gas/aes/aesni-x86_64.s', 'asm/x64-elf-gas/aes/aesni-sha1-x86_64.s', 'asm/x64-elf-gas/bn/modexp512-x86_64.s', 'asm/x64-elf-gas/bn/x86_64-mont.s', + 'asm/x64-elf-gas/bn/x86_64-mont5.s', + 'asm/x64-elf-gas/bn/x86_64-gf2m.s', 'asm/x64-elf-gas/camellia/cmll-x86_64.s', 'asm/x64-elf-gas/md5/md5-x86_64.s', 'asm/x64-elf-gas/rc4/rc4-x86_64.s', @@ -779,12 +786,18 @@ ] }], ['OS=="mac" and target_arch=="x64"', { + 'defines': [ + 'OPENSSL_BN_ASM_MONT5', +
'OPENSSL_BN_ASM_GF2m', + ], 'sources': [ 'asm/x64-macosx-gas/aes/aes-x86_64.s', 'asm/x64-macosx-gas/aes/aesni-x86_64.s', 'asm/x64-macosx-gas/aes/aesni-sha1-x86_64.s', 'asm/x64-macosx-gas/bn/modexp512-x86_64.s', 'asm/x64-macosx-gas/bn/x86_64-mont.s', + 'asm/x64-macosx-gas/bn/x86_64-mont5.s', + 'asm/x64-macosx-gas/bn/x86_64-gf2m.s', 'asm/x64-macosx-gas/camellia/cmll-x86_64.s', 'asm/x64-macosx-gas/md5/md5-x86_64.s', 'asm/x64-macosx-gas/rc4/rc4-x86_64.s', @@ -847,12 +860,18 @@ ] }], ['OS=="win" and target_arch=="x64"', { + 'defines': [ + 'OPENSSL_BN_ASM_MONT5', + 'OPENSSL_BN_ASM_GF2m', + ], 'sources': [ 'asm/x64-win32-masm/aes/aes-x86_64.asm', 'asm/x64-win32-masm/aes/aesni-x86_64.asm', 'asm/x64-win32-masm/aes/aesni-sha1-x86_64.asm', 'asm/x64-win32-masm/bn/modexp512-x86_64.asm', 'asm/x64-win32-masm/bn/x86_64-mont.asm', + 'asm/x64-win32-masm/bn/x86_64-mont5.asm', + 'asm/x64-win32-masm/bn/x86_64-gf2m.asm', 'asm/x64-win32-masm/camellia/cmll-x86_64.asm', 'asm/x64-win32-masm/md5/md5-x86_64.asm', 'asm/x64-win32-masm/rc4/rc4-x86_64.asm',