
deps: update asm files for openssl-1.0.2b

The asm files are generated as follows (a minimal automation sketch follows the list):
  - In `deps/openssl/asm/`, run `make` with `CC=gcc` and `ASM=nasm`
  - In `deps/openssl/asm_obsolete/`, run `make` with no compiler environment variables set
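
A minimal Python sketch of the regeneration steps above, for illustration only; it is not part of this commit. The two directory paths and the CC/ASM settings come from the commit message, while the helper name `regen` and the use of `subprocess` are assumptions.

    import os
    import subprocess

    def regen(asm_dir, extra_env=None):
        """Run `make` in one of the deps/openssl asm directories."""
        env = os.environ.copy()
        if extra_env:
            env.update(extra_env)  # e.g. CC/ASM overrides for the nasm build
        subprocess.run(["make"], cwd=asm_dir, env=env, check=True)

    # deps/openssl/asm is built with CC=gcc and ASM=nasm
    regen("deps/openssl/asm", {"CC": "gcc", "ASM": "nasm"})
    # deps/openssl/asm_obsolete is built with no compiler env vars set
    regen("deps/openssl/asm_obsolete")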

Fixes: https://github.com/nodejs/io.js/issues/1921
PR-URL: https://github.com/nodejs/io.js/pull/1950
Reviewed-By: Fedor Indutny <fedor@indutny.com>
Reviewed-By: Ben Noordhuis <info@bnoordhuis.nl>
v2.3.1-release
Shigeki Ohtsu committed 10 years ago
commit 94804969b7
31 changed files (changed line counts in parentheses):
  1. deps/openssl/asm/arm-void-gas/aes/aesv8-armx.S (206)
  2. deps/openssl/asm/arm-void-gas/modes/ghash-armv4.S (2)
  3. deps/openssl/asm/arm-void-gas/modes/ghashv8-armx.S (202)
  4. deps/openssl/asm/arm-void-gas/sha/sha256-armv4.S (212)
  5. deps/openssl/asm/arm64-linux64-gas/aes/aesv8-armx.S (206)
  6. deps/openssl/asm/arm64-linux64-gas/modes/ghashv8-armx.S (200)
  7. deps/openssl/asm/x64-elf-gas/aes/aesni-x86_64.s (502)
  8. deps/openssl/asm/x64-elf-gas/bn/x86_64-mont5.s (13)
  9. deps/openssl/asm/x64-macosx-gas/aes/aesni-x86_64.s (502)
  10. deps/openssl/asm/x64-macosx-gas/bn/x86_64-mont5.s (13)
  11. deps/openssl/asm/x64-win32-masm/aes/aesni-x86_64.asm (522)
  12. deps/openssl/asm/x64-win32-masm/bn/x86_64-mont5.asm (13)
  13. deps/openssl/asm/x86-elf-gas/aes/aesni-x86.s (790)
  14. deps/openssl/asm/x86-macosx-gas/aes/aesni-x86.s (794)
  15. deps/openssl/asm/x86-win32-masm/aes/aesni-x86.asm (793)
  16. deps/openssl/asm_obsolete/arm-void-gas/aes/aesv8-armx.S (206)
  17. deps/openssl/asm_obsolete/arm-void-gas/modes/ghash-armv4.S (2)
  18. deps/openssl/asm_obsolete/arm-void-gas/modes/ghashv8-armx.S (202)
  19. deps/openssl/asm_obsolete/arm-void-gas/sha/sha256-armv4.S (212)
  20. deps/openssl/asm_obsolete/arm64-linux64-gas/aes/aesv8-armx.S (206)
  21. deps/openssl/asm_obsolete/arm64-linux64-gas/modes/ghashv8-armx.S (200)
  22. deps/openssl/asm_obsolete/x64-elf-gas/aes/aesni-x86_64.s (502)
  23. deps/openssl/asm_obsolete/x64-elf-gas/bn/x86_64-mont5.s (13)
  24. deps/openssl/asm_obsolete/x64-macosx-gas/aes/aesni-x86_64.s (502)
  25. deps/openssl/asm_obsolete/x64-macosx-gas/bn/x86_64-mont5.s (13)
  26. deps/openssl/asm_obsolete/x64-win32-masm/aes/aesni-sha256-x86_64.asm (71)
  27. deps/openssl/asm_obsolete/x64-win32-masm/aes/aesni-x86_64.asm (522)
  28. deps/openssl/asm_obsolete/x64-win32-masm/bn/x86_64-mont5.asm (13)
  29. deps/openssl/asm_obsolete/x86-elf-gas/aes/aesni-x86.s (790)
  30. deps/openssl/asm_obsolete/x86-macosx-gas/aes/aesni-x86.s (794)
  31. deps/openssl/asm_obsolete/x86-win32-masm/aes/aesni-x86.asm (793)

deps/openssl/asm/arm-void-gas/aes/aesv8-armx.S (206)

@ -230,17 +230,17 @@ aes_v8_encrypt:
.Loop_enc:
.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0
vld1.32 {q0},[r2]!
.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
vld1.32 {q0},[r2]!
subs r3,r3,#2
.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1
vld1.32 {q1},[r2]!
.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
vld1.32 {q1},[r2]!
bgt .Loop_enc
.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0
vld1.32 {q0},[r2]
.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
vld1.32 {q0},[r2]
.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1
veor q2,q2,q0
@ -259,17 +259,17 @@ aes_v8_decrypt:
.Loop_dec:
.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0
vld1.32 {q0},[r2]!
.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
vld1.32 {q0},[r2]!
subs r3,r3,#2
.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1
vld1.32 {q1},[r2]!
.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
vld1.32 {q1},[r2]!
bgt .Loop_dec
.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0
vld1.32 {q0},[r2]
.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
vld1.32 {q0},[r2]
.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1
veor q2,q2,q0
@ -313,16 +313,42 @@ aes_v8_cbc_encrypt:
veor q5,q8,q7
beq .Lcbc_enc128
vld1.32 {q2-q3},[r7]
add r7,r3,#16
add r6,r3,#16*4
add r12,r3,#16*5
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
add r14,r3,#16*6
add r3,r3,#16*7
b .Lenter_cbc_enc
.align 4
.Loop_cbc_enc:
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
vld1.32 {q8},[r7]!
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
subs r6,r6,#2
vst1.8 {q6},[r1]!
.Lenter_cbc_enc:
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
vld1.32 {q9},[r7]!
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
bgt .Loop_cbc_enc
.byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vld1.32 {q8},[r6]
cmp r5,#4
.byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vld1.32 {q9},[r12]
beq .Lcbc_enc192
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vld1.32 {q8},[r14]
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vld1.32 {q9},[r3]
nop
.Lcbc_enc192:
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
subs r2,r2,#16
@ -331,7 +357,6 @@ aes_v8_cbc_encrypt:
moveq r8,#0
.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
add r7,r3,#16
.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vld1.8 {q8},[r0],r8
@ -340,16 +365,14 @@ aes_v8_cbc_encrypt:
veor q8,q8,q5
.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vld1.32 {q9},[r7]! @ re-pre-load rndkey[1]
vld1.32 {q9},[r7] @ re-pre-load rndkey[1]
.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15
mov r6,r5
veor q6,q0,q7
vst1.8 {q6},[r1]!
bhs .Loop_cbc_enc
vst1.8 {q6},[r1]!
b .Lcbc_done
.align 5
@ -407,79 +430,78 @@ aes_v8_cbc_encrypt:
.Loop3x_cbc_dec:
.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8
.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
vld1.32 {q8},[r7]!
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
vld1.32 {q8},[r7]!
subs r6,r6,#2
.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9
.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
vld1.32 {q9},[r7]!
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
vld1.32 {q9},[r7]!
bgt .Loop3x_cbc_dec
.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8
.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
veor q4,q6,q7
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
veor q4,q6,q7
subs r2,r2,#0x30
veor q5,q2,q7
movlo r6,r2 @ r6, r6, is zero at this point
.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9
.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
veor q9,q3,q7
subs r2,r2,#0x30
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
vorr q6,q11,q11
movlo r6,r2 @ r6, r6, is zero at this point
.byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12
.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12
.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12
veor q9,q3,q7
add r0,r0,r6 @ r0 is adjusted in such way that
@ at exit from the loop q1-q10
@ are loaded with last "words"
vorr q6,q11,q11
mov r7,r3
.byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
mov r7,r3
.byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13
.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13
.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13
vld1.8 {q2},[r0]!
.byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
vld1.8 {q3},[r0]!
.byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14
.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14
.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14
vld1.8 {q11},[r0]!
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
vld1.32 {q8},[r7]! @ re-pre-load rndkey[0]
vld1.8 {q11},[r0]!
.byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15
.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15
.byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15
vld1.32 {q8},[r7]! @ re-pre-load rndkey[0]
add r6,r5,#2
veor q4,q4,q0
veor q5,q5,q1
veor q10,q10,q9
vld1.32 {q9},[r7]! @ re-pre-load rndkey[1]
vorr q0,q2,q2
vst1.8 {q4},[r1]!
vorr q1,q3,q3
vorr q0,q2,q2
vst1.8 {q5},[r1]!
vorr q1,q3,q3
vst1.8 {q10},[r1]!
vorr q10,q11,q11
bhs .Loop3x_cbc_dec
@ -490,39 +512,39 @@ aes_v8_cbc_encrypt:
.Lcbc_dec_tail:
.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
vld1.32 {q8},[r7]!
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
vld1.32 {q8},[r7]!
subs r6,r6,#2
.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
vld1.32 {q9},[r7]!
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
vld1.32 {q9},[r7]!
bgt .Lcbc_dec_tail
.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12
.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
cmn r2,#0x20
.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13
.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
veor q5,q6,q7
.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14
.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
veor q9,q3,q7
.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15
@ -590,70 +612,69 @@ aes_v8_ctr32_encrypt_blocks:
.align 4
.Loop3x_ctr32:
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8
vld1.32 {q8},[r7]!
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8
.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10
vld1.32 {q8},[r7]!
subs r6,r6,#2
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9
vld1.32 {q9},[r7]!
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9
.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10
vld1.32 {q9},[r7]!
bgt .Loop3x_ctr32
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8
mov r7,r3
.byte 0x80,0x83,0xb0,0xf3 @ aesmc q4,q0
vld1.8 {q2},[r0]!
.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
.byte 0x82,0xa3,0xb0,0xf3 @ aesmc q5,q1
.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10
vld1.8 {q2},[r0]!
vorr q0,q6,q6
.byte 0x22,0x83,0xb0,0xf3 @ aese q4,q9
.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8
.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10
vld1.8 {q3},[r0]!
.byte 0x22,0xa3,0xb0,0xf3 @ aese q5,q9
.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9
vorr q1,q6,q6
.byte 0x22,0x83,0xb0,0xf3 @ aese q4,q9
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
vld1.8 {q11},[r0]!
.byte 0x22,0xa3,0xb0,0xf3 @ aese q5,q9
.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
vld1.8 {q11},[r0]!
mov r7,r3
.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9
.byte 0xa4,0x23,0xf0,0xf3 @ aesmc q9,q10
vorr q10,q6,q6
add r9,r8,#1
.byte 0x28,0x83,0xb0,0xf3 @ aese q4,q12
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
.byte 0x28,0xa3,0xb0,0xf3 @ aese q5,q12
.byte 0x28,0x23,0xf0,0xf3 @ aese q9,q12
.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
veor q2,q2,q7
add r10,r8,#2
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
.byte 0x28,0x23,0xf0,0xf3 @ aese q9,q12
.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9
veor q3,q3,q7
add r8,r8,#3
.byte 0x2a,0x83,0xb0,0xf3 @ aese q4,q13
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
.byte 0x2a,0xa3,0xb0,0xf3 @ aese q5,q13
.byte 0x2a,0x23,0xf0,0xf3 @ aese q9,q13
.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
veor q11,q11,q7
rev r9,r9
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
vld1.32 {q8},[r7]! @ re-pre-load rndkey[0]
.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
.byte 0x2a,0x23,0xf0,0xf3 @ aese q9,q13
.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9
vmov.32 d1[1], r9
rev r10,r10
.byte 0x2c,0x83,0xb0,0xf3 @ aese q4,q14
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
.byte 0x2c,0xa3,0xb0,0xf3 @ aese q5,q14
.byte 0x2c,0x23,0xf0,0xf3 @ aese q9,q14
.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
vmov.32 d3[1], r10
rev r12,r8
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
.byte 0x2c,0x23,0xf0,0xf3 @ aese q9,q14
.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9
vmov.32 d21[1], r12
subs r2,r2,#3
@ -661,13 +682,14 @@ aes_v8_ctr32_encrypt_blocks:
.byte 0x2e,0xa3,0xb0,0xf3 @ aese q5,q15
.byte 0x2e,0x23,0xf0,0xf3 @ aese q9,q15
mov r6,r5
veor q2,q2,q4
vld1.32 {q8},[r7]! @ re-pre-load rndkey[0]
vst1.8 {q2},[r1]!
veor q3,q3,q5
mov r6,r5
vst1.8 {q3},[r1]!
veor q11,q11,q9
vld1.32 {q9},[r7]! @ re-pre-load rndkey[1]
vst1.8 {q2},[r1]!
vst1.8 {q3},[r1]!
vst1.8 {q11},[r1]!
bhs .Loop3x_ctr32
@ -679,40 +701,40 @@ aes_v8_ctr32_encrypt_blocks:
.Lctr32_tail:
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
vld1.32 {q8},[r7]!
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
vld1.32 {q8},[r7]!
subs r6,r6,#2
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
vld1.32 {q9},[r7]!
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
vld1.32 {q9},[r7]!
bgt .Lctr32_tail
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
vld1.8 {q2},[r0],r12
.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12
.byte 0x28,0x23,0xb0,0xf3 @ aese q1,q12
vld1.8 {q3},[r0]
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x28,0x23,0xb0,0xf3 @ aese q1,q12
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
vld1.8 {q3},[r0]
.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13
.byte 0x2a,0x23,0xb0,0xf3 @ aese q1,q13
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x2a,0x23,0xb0,0xf3 @ aese q1,q13
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
.byte 0x2c,0x23,0xb0,0xf3 @ aese q1,q14
veor q2,q2,q7
.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x2c,0x23,0xb0,0xf3 @ aese q1,q14
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
veor q3,q3,q7
.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15

deps/openssl/asm/arm-void-gas/modes/ghash-armv4.S (2)

@ -495,7 +495,7 @@ gcm_ghash_neon:
veor q10,q10,q9 @
vshl.i64 q9,q0,#63
veor q10, q10, q9 @
veor d1,d1,d20 @
veor d1,d1,d20 @
veor d4,d4,d21
vshr.u64 q10,q0,#1 @ 2nd phase

deps/openssl/asm/arm-void-gas/modes/ghashv8-armx.S (202)

@ -7,109 +7,223 @@
.type gcm_init_v8,%function
.align 4
gcm_init_v8:
vld1.64 {q9},[r1] @ load H
vmov.i8 q8,#0xe1
vld1.64 {q9},[r1] @ load input H
vmov.i8 q11,#0xe1
vshl.i64 q11,q11,#57 @ 0xc2.0
vext.8 q3,q9,q9,#8
vshl.i64 q8,q8,#57
vshr.u64 q10,q8,#63
vext.8 q8,q10,q8,#8 @ t0=0xc2....01
vshr.u64 q10,q11,#63
vdup.32 q9,d18[1]
vshr.u64 q11,q3,#63
vext.8 q8,q10,q11,#8 @ t0=0xc2....01
vshr.u64 q10,q3,#63
vshr.s32 q9,q9,#31 @ broadcast carry bit
vand q11,q11,q8
vand q10,q10,q8
vshl.i64 q3,q3,#1
vext.8 q11,q11,q11,#8
vext.8 q10,q10,q10,#8
vand q8,q8,q9
vorr q3,q3,q11 @ H<<<=1
veor q3,q3,q8 @ twisted H
vst1.64 {q3},[r0]
vorr q3,q3,q10 @ H<<<=1
veor q12,q3,q8 @ twisted H
vst1.64 {q12},[r0]! @ store Htable[0]
@ calculate H^2
vext.8 q8,q12,q12,#8 @ Karatsuba pre-processing
.byte 0xa8,0x0e,0xa8,0xf2 @ pmull q0,q12,q12
veor q8,q8,q12
.byte 0xa9,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q12
.byte 0xa0,0x2e,0xa0,0xf2 @ pmull q1,q8,q8
vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
veor q10,q0,q2
veor q1,q1,q9
veor q1,q1,q10
.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase
vmov d4,d3 @ Xh|Xm - 256-bit result
vmov d3,d0 @ Xm is rotated Xl
veor q0,q1,q10
vext.8 q10,q0,q0,#8 @ 2nd phase
.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
veor q10,q10,q2
veor q14,q0,q10
vext.8 q9,q14,q14,#8 @ Karatsuba pre-processing
veor q9,q9,q14
vext.8 q13,q8,q9,#8 @ pack Karatsuba pre-processed
vst1.64 {q13-q14},[r0] @ store Htable[1..2]
bx lr
.size gcm_init_v8,.-gcm_init_v8
.global gcm_gmult_v8
.type gcm_gmult_v8,%function
.align 4
gcm_gmult_v8:
vld1.64 {q9},[r0] @ load Xi
vmov.i8 q11,#0xe1
vld1.64 {q12},[r1] @ load twisted H
vld1.64 {q12-q13},[r1] @ load twisted H, ...
vshl.u64 q11,q11,#57
#ifndef __ARMEB__
vrev64.8 q9,q9
#endif
vext.8 q13,q12,q12,#8
mov r3,#0
vext.8 q3,q9,q9,#8
mov r12,#0
veor q13,q13,q12 @ Karatsuba pre-processing
mov r2,r0
b .Lgmult_v8
.size gcm_gmult_v8,.-gcm_gmult_v8
.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo
veor q9,q9,q3 @ Karatsuba pre-processing
.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi
.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
veor q10,q0,q2
veor q1,q1,q9
veor q1,q1,q10
.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction
vmov d4,d3 @ Xh|Xm - 256-bit result
vmov d3,d0 @ Xm is rotated Xl
veor q0,q1,q10
vext.8 q10,q0,q0,#8 @ 2nd phase of reduction
.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
veor q10,q10,q2
veor q0,q0,q10
#ifndef __ARMEB__
vrev64.8 q0,q0
#endif
vext.8 q0,q0,q0,#8
vst1.64 {q0},[r0] @ write out Xi
bx lr
.size gcm_gmult_v8,.-gcm_gmult_v8
.global gcm_ghash_v8
.type gcm_ghash_v8,%function
.align 4
gcm_ghash_v8:
vstmdb sp!,{d8-d15} @ 32-bit ABI says so
vld1.64 {q0},[r0] @ load [rotated] Xi
subs r3,r3,#16
@ "[rotated]" means that
@ loaded value would have
@ to be rotated in order to
@ make it appear as in
@ algorithm specification
subs r3,r3,#32 @ see if r3 is 32 or larger
mov r12,#16 @ r12 is used as post-
@ increment for input pointer;
@ as loop is modulo-scheduled
@ r12 is zeroed just in time
@ to preclude overstepping
@ inp[len], which means that
@ last block[s] are actually
@ loaded twice, but last
@ copy is not processed
vld1.64 {q12-q13},[r1]! @ load twisted H, ..., H^2
vmov.i8 q11,#0xe1
mov r12,#16
vld1.64 {q12},[r1] @ load twisted H
moveq r12,#0
vext.8 q0,q0,q0,#8
vshl.u64 q11,q11,#57
vld1.64 {q9},[r2],r12 @ load [rotated] inp
vext.8 q13,q12,q12,#8
vld1.64 {q14},[r1]
moveq r12,#0 @ is it time to zero r12?
vext.8 q0,q0,q0,#8 @ rotate Xi
vld1.64 {q8},[r2]! @ load [rotated] I[0]
vshl.u64 q11,q11,#57 @ compose 0xc2.0 constant
#ifndef __ARMEB__
vrev64.8 q8,q8
vrev64.8 q0,q0
#endif
vext.8 q3,q8,q8,#8 @ rotate I[0]
blo .Lodd_tail_v8 @ r3 was less than 32
vld1.64 {q9},[r2],r12 @ load [rotated] I[1]
#ifndef __ARMEB__
vrev64.8 q9,q9
#endif
veor q13,q13,q12 @ Karatsuba pre-processing
vext.8 q3,q9,q9,#8
b .Loop_v8
vext.8 q7,q9,q9,#8
veor q3,q3,q0 @ I[i]^=Xi
.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1
veor q9,q9,q7 @ Karatsuba pre-processing
.byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7
b .Loop_mod2x_v8
.align 4
.Loop_v8:
.Loop_mod2x_v8:
vext.8 q10,q3,q3,#8
subs r3,r3,#32 @ is there more data?
.byte 0x86,0x0e,0xac,0xf2 @ pmull q0,q14,q3 @ H^2.lo·Xi.lo
movlo r12,#0 @ is it time to zero r12?
.byte 0xa2,0xae,0xaa,0xf2 @ pmull q5,q13,q9
veor q10,q10,q3 @ Karatsuba pre-processing
.byte 0x87,0x4e,0xad,0xf2 @ pmull2 q2,q14,q3 @ H^2.hi·Xi.hi
veor q0,q0,q4 @ accumulate
.byte 0xa5,0x2e,0xab,0xf2 @ pmull2 q1,q13,q10 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
vld1.64 {q8},[r2],r12 @ load [rotated] I[i+2]
veor q2,q2,q6
moveq r12,#0 @ is it time to zero r12?
veor q1,q1,q5
vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
veor q10,q0,q2
veor q1,q1,q9
vld1.64 {q9},[r2],r12 @ load [rotated] I[i+3]
#ifndef __ARMEB__
vrev64.8 q8,q8
#endif
veor q1,q1,q10
.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction
#ifndef __ARMEB__
vrev64.8 q9,q9
#endif
vmov d4,d3 @ Xh|Xm - 256-bit result
vmov d3,d0 @ Xm is rotated Xl
vext.8 q7,q9,q9,#8
vext.8 q3,q8,q8,#8
veor q0,q1,q10
.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1
veor q3,q3,q2 @ accumulate q3 early
vext.8 q10,q0,q0,#8 @ 2nd phase of reduction
.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
veor q3,q3,q10
veor q9,q9,q7 @ Karatsuba pre-processing
veor q3,q3,q0
.byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7
bhs .Loop_mod2x_v8 @ there was at least 32 more bytes
veor q2,q2,q10
vext.8 q3,q8,q8,#8 @ re-construct q3
adds r3,r3,#32 @ re-construct r3
veor q0,q0,q2 @ re-construct q0
beq .Ldone_v8 @ is r3 zero?
.Lodd_tail_v8:
vext.8 q10,q0,q0,#8
veor q3,q3,q0 @ inp^=Xi
veor q9,q9,q10 @ q9 is rotated inp^Xi
veor q9,q8,q10 @ q9 is rotated inp^Xi
.Lgmult_v8:
.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo
veor q9,q9,q3 @ Karatsuba pre-processing
.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi
subs r3,r3,#16
.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
moveq r12,#0
vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
veor q10,q0,q2
veor q1,q1,q9
vld1.64 {q9},[r2],r12 @ load [rotated] inp
veor q1,q1,q10
.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase
.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction
vmov d4,d3 @ Xh|Xm - 256-bit result
vmov d3,d0 @ Xm is rotated Xl
#ifndef __ARMEB__
vrev64.8 q9,q9
#endif
veor q0,q1,q10
vext.8 q3,q9,q9,#8
vext.8 q10,q0,q0,#8 @ 2nd phase
vext.8 q10,q0,q0,#8 @ 2nd phase of reduction
.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
veor q10,q10,q2
veor q0,q0,q10
bhs .Loop_v8
.Ldone_v8:
#ifndef __ARMEB__
vrev64.8 q0,q0
#endif
vext.8 q0,q0,q0,#8
vst1.64 {q0},[r0] @ write out Xi
vldmia sp!,{d8-d15} @ 32-bit ABI says so
bx lr
.size gcm_ghash_v8,.-gcm_ghash_v8
.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"

deps/openssl/asm/arm-void-gas/sha/sha256-armv4.S (212)

@ -1,7 +1,59 @@
#include "arm_arch.h"
@ ====================================================================
@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@ project. The module is, however, dual licensed under OpenSSL and
@ CRYPTOGAMS licenses depending on where you obtain it. For further
@ details see http://www.openssl.org/~appro/cryptogams/.
@
@ Permission to use under GPL terms is granted.
@ ====================================================================
@ SHA256 block procedure for ARMv4. May 2007.
@ Performance is ~2x better than gcc 3.4 generated code and in "abso-
@ lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
@ byte [on single-issue Xscale PXA250 core].
@ July 2010.
@
@ Rescheduling for dual-issue pipeline resulted in 22% improvement on
@ Cortex A8 core and ~20 cycles per processed byte.
@ February 2011.
@
@ Profiler-assisted and platform-specific optimization resulted in 16%
@ improvement on Cortex A8 core and ~15.4 cycles per processed byte.
@ September 2013.
@
@ Add NEON implementation. On Cortex A8 it was measured to process one
@ byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
@ S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
@ code (meaning that latter performs sub-optimally, nothing was done
@ about it).
@ May 2014.
@
@ Add ARMv8 code path performing at 2.0 cpb on Apple A7.
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif
.text
#if __ARM_ARCH__<7
.code 32
#else
.syntax unified
# ifdef __thumb2__
.thumb
# else
.code 32
# endif
#endif
.type K256,%object
.align 5
@ -24,7 +76,7 @@ K256:
.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size K256,.-K256
.word 0 @ terminator
#if __ARM_MAX_ARCH__>=7
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-sha256_block_data_order
#endif
@ -33,9 +85,12 @@ K256:
.global sha256_block_data_order
.type sha256_block_data_order,%function
sha256_block_data_order:
#if __ARM_ARCH__<7
sub r3,pc,#8 @ sha256_block_data_order
add r2,r1,r2,lsl#6 @ len to point at the end of inp
#if __ARM_MAX_ARCH__>=7
#else
adr r3,sha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
tst r12,#ARMV8_SHA256
@ -43,6 +98,7 @@ sha256_block_data_order:
tst r12,#ARMV7_NEON
bne .LNEON
#endif
add r2,r1,r2,lsl#6 @ len to point at the end of inp
stmdb sp!,{r0,r1,r2,r4-r11,lr}
ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11}
sub r14,r3,#256+32 @ K256
@ -1736,6 +1792,9 @@ sha256_block_data_order:
eor r12,r12,r6 @ Maj(a,b,c)
add r4,r4,r0,ror#2 @ h+=Sigma0(a)
@ add r4,r4,r12 @ h+=Maj(a,b,c)
#if __ARM_ARCH__>=7
ite eq @ Thumb2 thing, sanity check in ARM
#endif
ldreq r3,[sp,#16*4] @ pull ctx
bne .Lrounds_16_xx
@ -1777,16 +1836,19 @@ sha256_block_data_order:
.arch armv7-a
.fpu neon
.global sha256_block_data_order_neon
.type sha256_block_data_order_neon,%function
.align 4
sha256_block_data_order_neon:
.LNEON:
stmdb sp!,{r4-r12,lr}
sub r11,sp,#16*4+16
adr r14,K256
bic r11,r11,#15 @ align for 128-bit stores
mov r12,sp
sub sp,sp,#16*4+16 @ alloca
sub r14,r3,#256+32 @ K256
bic sp,sp,#15 @ align for 128-bit stores
mov sp,r11 @ alloca
add r2,r1,r2,lsl#6 @ len to point at the end of inp
vld1.8 {q0},[r1]!
vld1.8 {q1},[r1]!
@ -2224,11 +2286,13 @@ sha256_block_data_order_neon:
ldr r0,[sp,#72]
sub r14,r14,#256 @ rewind r14
teq r1,r0
it eq
subeq r1,r1,#64 @ avoid SEGV
vld1.8 {q0},[r1]! @ load next input block
vld1.8 {q1},[r1]!
vld1.8 {q2},[r1]!
vld1.8 {q3},[r1]!
it ne
strne r1,[sp,#68]
mov r1,sp
add r11,r11,r2
@ -2542,23 +2606,38 @@ sha256_block_data_order_neon:
str r7,[r2],#4
stmia r2,{r8-r11}
ittte ne
movne r1,sp
ldrne r2,[sp,#0]
eorne r12,r12,r12
ldreq sp,[sp,#76] @ restore original sp
itt ne
eorne r3,r5,r6
bne .L_00_48
ldmia sp!,{r4-r12,pc}
.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
#if __ARM_MAX_ARCH__>=7
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
# ifdef __thumb2__
# define INST(a,b,c,d) .byte c,d|0xc,a,b
# else
# define INST(a,b,c,d) .byte a,b,c,d
# endif
.type sha256_block_data_order_armv8,%function
.align 5
sha256_block_data_order_armv8:
.LARMv8:
vld1.32 {q0,q1},[r0]
sub r3,r3,#sha256_block_data_order-K256
# ifdef __thumb2__
adr r3,.LARMv8
sub r3,r3,#.LARMv8-K256
# else
adrl r3,K256
# endif
add r2,r1,r2,lsl#6 @ len to point at the end of inp
.Loop_v8:
vld1.8 {q8-q9},[r1]!
@ -2573,114 +2652,115 @@ sha256_block_data_order_armv8:
teq r1,r2
vld1.32 {q13},[r3]!
vadd.i32 q12,q12,q8
.byte 0xe2,0x03,0xfa,0xf3 @ sha256su0 q8,q9
INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
vmov q2,q0
.byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12
.byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12
.byte 0xe6,0x0c,0x64,0xf3 @ sha256su1 q8,q10,q11
INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
vld1.32 {q12},[r3]!
vadd.i32 q13,q13,q9
.byte 0xe4,0x23,0xfa,0xf3 @ sha256su0 q9,q10
INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
vmov q2,q0
.byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13
.byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13
.byte 0xe0,0x2c,0x66,0xf3 @ sha256su1 q9,q11,q8
INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
vld1.32 {q13},[r3]!
vadd.i32 q12,q12,q10
.byte 0xe6,0x43,0xfa,0xf3 @ sha256su0 q10,q11
INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
vmov q2,q0
.byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12
.byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12
.byte 0xe2,0x4c,0x60,0xf3 @ sha256su1 q10,q8,q9
INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
vld1.32 {q12},[r3]!
vadd.i32 q13,q13,q11
.byte 0xe0,0x63,0xfa,0xf3 @ sha256su0 q11,q8
INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
vmov q2,q0
.byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13
.byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13
.byte 0xe4,0x6c,0x62,0xf3 @ sha256su1 q11,q9,q10
INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
vld1.32 {q13},[r3]!
vadd.i32 q12,q12,q8
.byte 0xe2,0x03,0xfa,0xf3 @ sha256su0 q8,q9
INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
vmov q2,q0
.byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12
.byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12
.byte 0xe6,0x0c,0x64,0xf3 @ sha256su1 q8,q10,q11
INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
vld1.32 {q12},[r3]!
vadd.i32 q13,q13,q9
.byte 0xe4,0x23,0xfa,0xf3 @ sha256su0 q9,q10
INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
vmov q2,q0
.byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13
.byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13
.byte 0xe0,0x2c,0x66,0xf3 @ sha256su1 q9,q11,q8
INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
vld1.32 {q13},[r3]!
vadd.i32 q12,q12,q10
.byte 0xe6,0x43,0xfa,0xf3 @ sha256su0 q10,q11
INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
vmov q2,q0
.byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12
.byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12
.byte 0xe2,0x4c,0x60,0xf3 @ sha256su1 q10,q8,q9
INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
vld1.32 {q12},[r3]!
vadd.i32 q13,q13,q11
.byte 0xe0,0x63,0xfa,0xf3 @ sha256su0 q11,q8
INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
vmov q2,q0
.byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13
.byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13
.byte 0xe4,0x6c,0x62,0xf3 @ sha256su1 q11,q9,q10
INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
vld1.32 {q13},[r3]!
vadd.i32 q12,q12,q8
.byte 0xe2,0x03,0xfa,0xf3 @ sha256su0 q8,q9
INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
vmov q2,q0
.byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12
.byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12
.byte 0xe6,0x0c,0x64,0xf3 @ sha256su1 q8,q10,q11
INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
vld1.32 {q12},[r3]!
vadd.i32 q13,q13,q9
.byte 0xe4,0x23,0xfa,0xf3 @ sha256su0 q9,q10
INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
vmov q2,q0
.byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13
.byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13
.byte 0xe0,0x2c,0x66,0xf3 @ sha256su1 q9,q11,q8
INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
vld1.32 {q13},[r3]!
vadd.i32 q12,q12,q10
.byte 0xe6,0x43,0xfa,0xf3 @ sha256su0 q10,q11
INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
vmov q2,q0
.byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12
.byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12
.byte 0xe2,0x4c,0x60,0xf3 @ sha256su1 q10,q8,q9
INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
vld1.32 {q12},[r3]!
vadd.i32 q13,q13,q11
.byte 0xe0,0x63,0xfa,0xf3 @ sha256su0 q11,q8
INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
vmov q2,q0
.byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13
.byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13
.byte 0xe4,0x6c,0x62,0xf3 @ sha256su1 q11,q9,q10
INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
vld1.32 {q13},[r3]!
vadd.i32 q12,q12,q8
vmov q2,q0
.byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12
.byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12
INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
vld1.32 {q12},[r3]!
vadd.i32 q13,q13,q9
vmov q2,q0
.byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13
.byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13
INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
vld1.32 {q13},[r3]
vadd.i32 q12,q12,q10
sub r3,r3,#256-16 @ rewind
vmov q2,q0
.byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12
.byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12
INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
vadd.i32 q13,q13,q11
vmov q2,q0
.byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13
.byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13
INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
vadd.i32 q0,q0,q14
vadd.i32 q1,q1,q15
it ne
bne .Loop_v8
vst1.32 {q0,q1},[r0]
@ -2690,6 +2770,6 @@ sha256_block_data_order_armv8:
#endif
.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.align 2
#if __ARM_MAX_ARCH__>=7
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm OPENSSL_armcap_P,4,4
#endif

deps/openssl/asm/arm64-linux64-gas/aes/aesv8-armx.S (206)

@ -227,17 +227,17 @@ aes_v8_encrypt:
.Loop_enc:
aese v2.16b,v0.16b
ld1 {v0.4s},[x2],#16
aesmc v2.16b,v2.16b
ld1 {v0.4s},[x2],#16
subs w3,w3,#2
aese v2.16b,v1.16b
ld1 {v1.4s},[x2],#16
aesmc v2.16b,v2.16b
ld1 {v1.4s},[x2],#16
b.gt .Loop_enc
aese v2.16b,v0.16b
ld1 {v0.4s},[x2]
aesmc v2.16b,v2.16b
ld1 {v0.4s},[x2]
aese v2.16b,v1.16b
eor v2.16b,v2.16b,v0.16b
@ -256,17 +256,17 @@ aes_v8_decrypt:
.Loop_dec:
aesd v2.16b,v0.16b
ld1 {v0.4s},[x2],#16
aesimc v2.16b,v2.16b
ld1 {v0.4s},[x2],#16
subs w3,w3,#2
aesd v2.16b,v1.16b
ld1 {v1.4s},[x2],#16
aesimc v2.16b,v2.16b
ld1 {v1.4s},[x2],#16
b.gt .Loop_dec
aesd v2.16b,v0.16b
ld1 {v0.4s},[x2]
aesimc v2.16b,v2.16b
ld1 {v0.4s},[x2]
aesd v2.16b,v1.16b
eor v2.16b,v2.16b,v0.16b
@ -308,16 +308,42 @@ aes_v8_cbc_encrypt:
eor v5.16b,v16.16b,v7.16b
b.eq .Lcbc_enc128
ld1 {v2.4s-v3.4s},[x7]
add x7,x3,#16
add x6,x3,#16*4
add x12,x3,#16*5
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
add x14,x3,#16*6
add x3,x3,#16*7
b .Lenter_cbc_enc
.align 4
.Loop_cbc_enc:
aese v0.16b,v16.16b
ld1 {v16.4s},[x7],#16
aesmc v0.16b,v0.16b
subs w6,w6,#2
st1 {v6.16b},[x1],#16
.Lenter_cbc_enc:
aese v0.16b,v17.16b
ld1 {v17.4s},[x7],#16
aesmc v0.16b,v0.16b
b.gt .Loop_cbc_enc
aese v0.16b,v2.16b
aesmc v0.16b,v0.16b
ld1 {v16.4s},[x6]
cmp w5,#4
aese v0.16b,v3.16b
aesmc v0.16b,v0.16b
ld1 {v17.4s},[x12]
b.eq .Lcbc_enc192
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
ld1 {v16.4s},[x14]
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
ld1 {v17.4s},[x3]
nop
.Lcbc_enc192:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
subs x2,x2,#16
@ -326,7 +352,6 @@ aes_v8_cbc_encrypt:
csel x8,xzr,x8,eq
aese v0.16b,v18.16b
aesmc v0.16b,v0.16b
add x7,x3,#16
aese v0.16b,v19.16b
aesmc v0.16b,v0.16b
ld1 {v16.16b},[x0],x8
@ -335,16 +360,14 @@ aes_v8_cbc_encrypt:
eor v16.16b,v16.16b,v5.16b
aese v0.16b,v21.16b
aesmc v0.16b,v0.16b
ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
ld1 {v17.4s},[x7] // re-pre-load rndkey[1]
aese v0.16b,v22.16b
aesmc v0.16b,v0.16b
aese v0.16b,v23.16b
mov w6,w5
eor v6.16b,v0.16b,v7.16b
st1 {v6.16b},[x1],#16
b.hs .Loop_cbc_enc
st1 {v6.16b},[x1],#16
b .Lcbc_done
.align 5
@ -402,79 +425,78 @@ aes_v8_cbc_encrypt:
.Loop3x_cbc_dec:
aesd v0.16b,v16.16b
aesd v1.16b,v16.16b
aesd v18.16b,v16.16b
ld1 {v16.4s},[x7],#16
aesimc v0.16b,v0.16b
aesd v1.16b,v16.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aesd v0.16b,v17.16b
aesd v1.16b,v17.16b
aesd v18.16b,v17.16b
ld1 {v17.4s},[x7],#16
aesimc v0.16b,v0.16b
aesd v1.16b,v17.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
ld1 {v17.4s},[x7],#16
b.gt .Loop3x_cbc_dec
aesd v0.16b,v16.16b
aesd v1.16b,v16.16b
aesd v18.16b,v16.16b
eor v4.16b,v6.16b,v7.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v16.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
eor v4.16b,v6.16b,v7.16b
subs x2,x2,#0x30
eor v5.16b,v2.16b,v7.16b
csel x6,x2,x6,lo // x6, w6, is zero at this point
aesd v0.16b,v17.16b
aesd v1.16b,v17.16b
aesd v18.16b,v17.16b
eor v17.16b,v3.16b,v7.16b
subs x2,x2,#0x30
aesimc v0.16b,v0.16b
aesd v1.16b,v17.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
orr v6.16b,v19.16b,v19.16b
csel x6,x2,x6,lo // x6, w6, is zero at this point
aesd v0.16b,v20.16b
aesd v1.16b,v20.16b
aesd v18.16b,v20.16b
eor v17.16b,v3.16b,v7.16b
add x0,x0,x6 // x0 is adjusted in such way that
// at exit from the loop v1.16b-v18.16b
// are loaded with last "words"
orr v6.16b,v19.16b,v19.16b
mov x7,x3
aesd v0.16b,v20.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v20.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v20.16b
aesimc v18.16b,v18.16b
mov x7,x3
aesd v0.16b,v21.16b
aesd v1.16b,v21.16b
aesd v18.16b,v21.16b
ld1 {v2.16b},[x0],#16
aesd v0.16b,v21.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v21.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v21.16b
aesimc v18.16b,v18.16b
ld1 {v3.16b},[x0],#16
aesd v0.16b,v22.16b
aesd v1.16b,v22.16b
aesd v18.16b,v22.16b
ld1 {v19.16b},[x0],#16
aesimc v0.16b,v0.16b
aesd v1.16b,v22.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v22.16b
aesimc v18.16b,v18.16b
ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
ld1 {v19.16b},[x0],#16
aesd v0.16b,v23.16b
aesd v1.16b,v23.16b
aesd v18.16b,v23.16b
ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
add w6,w5,#2
eor v4.16b,v4.16b,v0.16b
eor v5.16b,v5.16b,v1.16b
eor v18.16b,v18.16b,v17.16b
ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
orr v0.16b,v2.16b,v2.16b
st1 {v4.16b},[x1],#16
orr v1.16b,v3.16b,v3.16b
orr v0.16b,v2.16b,v2.16b
st1 {v5.16b},[x1],#16
orr v1.16b,v3.16b,v3.16b
st1 {v18.16b},[x1],#16
orr v18.16b,v19.16b,v19.16b
b.hs .Loop3x_cbc_dec
@ -485,39 +507,39 @@ aes_v8_cbc_encrypt:
.Lcbc_dec_tail:
aesd v1.16b,v16.16b
aesd v18.16b,v16.16b
ld1 {v16.4s},[x7],#16
aesimc v1.16b,v1.16b
aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aesd v1.16b,v17.16b
aesd v18.16b,v17.16b
ld1 {v17.4s},[x7],#16
aesimc v1.16b,v1.16b
aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
ld1 {v17.4s},[x7],#16
b.gt .Lcbc_dec_tail
aesd v1.16b,v16.16b
aesd v18.16b,v16.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
aesd v1.16b,v17.16b
aesd v18.16b,v17.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
aesd v1.16b,v20.16b
aesd v18.16b,v20.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v20.16b
aesimc v18.16b,v18.16b
cmn x2,#0x20
aesd v1.16b,v21.16b
aesd v18.16b,v21.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v21.16b
aesimc v18.16b,v18.16b
eor v5.16b,v6.16b,v7.16b
aesd v1.16b,v22.16b
aesd v18.16b,v22.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v22.16b
aesimc v18.16b,v18.16b
eor v17.16b,v3.16b,v7.16b
aesd v1.16b,v23.16b
@ -583,70 +605,69 @@ aes_v8_ctr32_encrypt_blocks:
.align 4
.Loop3x_ctr32:
aese v0.16b,v16.16b
aese v1.16b,v16.16b
aese v18.16b,v16.16b
ld1 {v16.4s},[x7],#16
aesmc v0.16b,v0.16b
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
aese v18.16b,v16.16b
aesmc v18.16b,v18.16b
ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aese v0.16b,v17.16b
aese v1.16b,v17.16b
aese v18.16b,v17.16b
ld1 {v17.4s},[x7],#16
aesmc v0.16b,v0.16b
aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
aese v18.16b,v17.16b
aesmc v18.16b,v18.16b
ld1 {v17.4s},[x7],#16
b.gt .Loop3x_ctr32
aese v0.16b,v16.16b
aese v1.16b,v16.16b
aese v18.16b,v16.16b
mov x7,x3
aesmc v4.16b,v0.16b
ld1 {v2.16b},[x0],#16
aese v1.16b,v16.16b
aesmc v5.16b,v1.16b
aesmc v18.16b,v18.16b
ld1 {v2.16b},[x0],#16
orr v0.16b,v6.16b,v6.16b
aese v4.16b,v17.16b
aese v18.16b,v16.16b
aesmc v18.16b,v18.16b
ld1 {v3.16b},[x0],#16
aese v5.16b,v17.16b
aese v18.16b,v17.16b
orr v1.16b,v6.16b,v6.16b
aese v4.16b,v17.16b
aesmc v4.16b,v4.16b
ld1 {v19.16b},[x0],#16
aese v5.16b,v17.16b
aesmc v5.16b,v5.16b
ld1 {v19.16b},[x0],#16
mov x7,x3
aese v18.16b,v17.16b
aesmc v17.16b,v18.16b
orr v18.16b,v6.16b,v6.16b
add w9,w8,#1
aese v4.16b,v20.16b
aesmc v4.16b,v4.16b
aese v5.16b,v20.16b
aese v17.16b,v20.16b
aesmc v5.16b,v5.16b
eor v2.16b,v2.16b,v7.16b
add w10,w8,#2
aesmc v4.16b,v4.16b
aesmc v5.16b,v5.16b
aese v17.16b,v20.16b
aesmc v17.16b,v17.16b
eor v3.16b,v3.16b,v7.16b
add w8,w8,#3
aese v4.16b,v21.16b
aesmc v4.16b,v4.16b
aese v5.16b,v21.16b
aese v17.16b,v21.16b
aesmc v5.16b,v5.16b
eor v19.16b,v19.16b,v7.16b
rev w9,w9
aesmc v4.16b,v4.16b
ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
aesmc v5.16b,v5.16b
aese v17.16b,v21.16b
aesmc v17.16b,v17.16b
mov v0.s[3], w9
rev w10,w10
aese v4.16b,v22.16b
aesmc v4.16b,v4.16b
aese v5.16b,v22.16b
aese v17.16b,v22.16b
aesmc v5.16b,v5.16b
mov v1.s[3], w10
rev w12,w8
aesmc v4.16b,v4.16b
aesmc v5.16b,v5.16b
aese v17.16b,v22.16b
aesmc v17.16b,v17.16b
mov v18.s[3], w12
subs x2,x2,#3
@ -654,13 +675,14 @@ aes_v8_ctr32_encrypt_blocks:
aese v5.16b,v23.16b
aese v17.16b,v23.16b
mov w6,w5
eor v2.16b,v2.16b,v4.16b
ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
st1 {v2.16b},[x1],#16
eor v3.16b,v3.16b,v5.16b
mov w6,w5
st1 {v3.16b},[x1],#16
eor v19.16b,v19.16b,v17.16b
ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
st1 {v2.16b},[x1],#16
st1 {v3.16b},[x1],#16
st1 {v19.16b},[x1],#16
b.hs .Loop3x_ctr32
@ -672,40 +694,40 @@ aes_v8_ctr32_encrypt_blocks:
.Lctr32_tail:
aese v0.16b,v16.16b
aese v1.16b,v16.16b
ld1 {v16.4s},[x7],#16
aesmc v0.16b,v0.16b
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aese v0.16b,v17.16b
aese v1.16b,v17.16b
ld1 {v17.4s},[x7],#16
aesmc v0.16b,v0.16b
aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
ld1 {v17.4s},[x7],#16
b.gt .Lctr32_tail
aese v0.16b,v16.16b
aese v1.16b,v16.16b
aesmc v0.16b,v0.16b
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
aese v0.16b,v17.16b
aese v1.16b,v17.16b
aesmc v0.16b,v0.16b
aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
ld1 {v2.16b},[x0],x12
aese v0.16b,v20.16b
aese v1.16b,v20.16b
ld1 {v3.16b},[x0]
aesmc v0.16b,v0.16b
aese v1.16b,v20.16b
aesmc v1.16b,v1.16b
ld1 {v3.16b},[x0]
aese v0.16b,v21.16b
aese v1.16b,v21.16b
aesmc v0.16b,v0.16b
aese v1.16b,v21.16b
aesmc v1.16b,v1.16b
aese v0.16b,v22.16b
aese v1.16b,v22.16b
eor v2.16b,v2.16b,v7.16b
aese v0.16b,v22.16b
aesmc v0.16b,v0.16b
aese v1.16b,v22.16b
aesmc v1.16b,v1.16b
eor v3.16b,v3.16b,v7.16b
aese v0.16b,v23.16b

deps/openssl/asm/arm64-linux64-gas/modes/ghashv8-armx.S (200)

@ -6,103 +6,215 @@
.type gcm_init_v8,%function
.align 4
gcm_init_v8:
ld1 {v17.2d},[x1] //load H
movi v16.16b,#0xe1
ld1 {v17.2d},[x1] //load input H
movi v19.16b,#0xe1
shl v19.2d,v19.2d,#57 //0xc2.0
ext v3.16b,v17.16b,v17.16b,#8
shl v16.2d,v16.2d,#57
ushr v18.2d,v16.2d,#63
ext v16.16b,v18.16b,v16.16b,#8 //t0=0xc2....01
ushr v18.2d,v19.2d,#63
dup v17.4s,v17.s[1]
ushr v19.2d,v3.2d,#63
ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01
ushr v18.2d,v3.2d,#63
sshr v17.4s,v17.4s,#31 //broadcast carry bit
and v19.16b,v19.16b,v16.16b
and v18.16b,v18.16b,v16.16b
shl v3.2d,v3.2d,#1
ext v19.16b,v19.16b,v19.16b,#8
ext v18.16b,v18.16b,v18.16b,#8
and v16.16b,v16.16b,v17.16b
orr v3.16b,v3.16b,v19.16b //H<<<=1
eor v3.16b,v3.16b,v16.16b //twisted H
st1 {v3.2d},[x0]
orr v3.16b,v3.16b,v18.16b //H<<<=1
eor v20.16b,v3.16b,v16.16b //twisted H
st1 {v20.2d},[x0],#16 //store Htable[0]
//calculate H^2
ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing
pmull v0.1q,v20.1d,v20.1d
eor v16.16b,v16.16b,v20.16b
pmull2 v2.1q,v20.2d,v20.2d
pmull v1.1q,v16.1d,v16.1d
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
eor v0.16b,v1.16b,v18.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v22.16b,v0.16b,v18.16b
ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing
eor v17.16b,v17.16b,v22.16b
ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
st1 {v21.2d-v22.2d},[x0] //store Htable[1..2]
ret
.size gcm_init_v8,.-gcm_init_v8
.global gcm_gmult_v8
.type gcm_gmult_v8,%function
.align 4
gcm_gmult_v8:
ld1 {v17.2d},[x0] //load Xi
movi v19.16b,#0xe1
ld1 {v20.2d},[x1] //load twisted H
ld1 {v20.2d-v21.2d},[x1] //load twisted H, ...
shl v19.2d,v19.2d,#57
#ifndef __ARMEB__
rev64 v17.16b,v17.16b
#endif
ext v21.16b,v20.16b,v20.16b,#8
mov x3,#0
ext v3.16b,v17.16b,v17.16b,#8
mov x12,#0
eor v21.16b,v21.16b,v20.16b //Karatsuba pre-processing
mov x2,x0
b .Lgmult_v8
.size gcm_gmult_v8,.-gcm_gmult_v8
pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
eor v0.16b,v1.16b,v18.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
#ifndef __ARMEB__
rev64 v0.16b,v0.16b
#endif
ext v0.16b,v0.16b,v0.16b,#8
st1 {v0.2d},[x0] //write out Xi
ret
.size gcm_gmult_v8,.-gcm_gmult_v8
.global gcm_ghash_v8
.type gcm_ghash_v8,%function
.align 4
gcm_ghash_v8:
ld1 {v0.2d},[x0] //load [rotated] Xi
subs x3,x3,#16
//"[rotated]" means that
//loaded value would have
//to be rotated in order to
//make it appear as in
//algorithm specification
subs x3,x3,#32 //see if x3 is 32 or larger
mov x12,#16 //x12 is used as post-
//increment for input pointer;
//as loop is modulo-scheduled
//x12 is zeroed just in time
//to preclude overstepping
//inp[len], which means that
//last block[s] are actually
//loaded twice, but last
//copy is not processed
ld1 {v20.2d-v21.2d},[x1],#32 //load twisted H, ..., H^2
movi v19.16b,#0xe1
mov x12,#16
ld1 {v20.2d},[x1] //load twisted H
csel x12,xzr,x12,eq
ext v0.16b,v0.16b,v0.16b,#8
shl v19.2d,v19.2d,#57
ld1 {v17.2d},[x2],x12 //load [rotated] inp
ext v21.16b,v20.16b,v20.16b,#8
ld1 {v22.2d},[x1]
csel x12,xzr,x12,eq //is it time to zero x12?
ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi
ld1 {v16.2d},[x2],#16 //load [rotated] I[0]
shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
#ifndef __ARMEB__
rev64 v16.16b,v16.16b
rev64 v0.16b,v0.16b
#endif
ext v3.16b,v16.16b,v16.16b,#8 //rotate I[0]
b.lo .Lodd_tail_v8 //x3 was less than 32
ld1 {v17.2d},[x2],x12 //load [rotated] I[1]
#ifndef __ARMEB__
rev64 v17.16b,v17.16b
#endif
eor v21.16b,v21.16b,v20.16b //Karatsuba pre-processing
ext v3.16b,v17.16b,v17.16b,#8
b .Loop_v8
ext v7.16b,v17.16b,v17.16b,#8
eor v3.16b,v3.16b,v0.16b //I[i]^=Xi
pmull v4.1q,v20.1d,v7.1d //H·Ii+1
eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
pmull2 v6.1q,v20.2d,v7.2d
b .Loop_mod2x_v8
.align 4
.Loop_v8:
.Loop_mod2x_v8:
ext v18.16b,v3.16b,v3.16b,#8
subs x3,x3,#32 //is there more data?
pmull v0.1q,v22.1d,v3.1d //H^2.lo·Xi.lo
csel x12,xzr,x12,lo //is it time to zero x12?
pmull v5.1q,v21.1d,v17.1d
eor v18.16b,v18.16b,v3.16b //Karatsuba pre-processing
pmull2 v2.1q,v22.2d,v3.2d //H^2.hi·Xi.hi
eor v0.16b,v0.16b,v4.16b //accumulate
pmull2 v1.1q,v21.2d,v18.2d //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
ld1 {v16.2d},[x2],x12 //load [rotated] I[i+2]
eor v2.16b,v2.16b,v6.16b
csel x12,xzr,x12,eq //is it time to zero x12?
eor v1.16b,v1.16b,v5.16b
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
ld1 {v17.2d},[x2],x12 //load [rotated] I[i+3]
#ifndef __ARMEB__
rev64 v16.16b,v16.16b
#endif
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
#ifndef __ARMEB__
rev64 v17.16b,v17.16b
#endif
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
ext v7.16b,v17.16b,v17.16b,#8
ext v3.16b,v16.16b,v16.16b,#8
eor v0.16b,v1.16b,v18.16b
pmull v4.1q,v20.1d,v7.1d //H·Ii+1
eor v3.16b,v3.16b,v2.16b //accumulate v3.16b early
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v3.16b,v3.16b,v18.16b
eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
eor v3.16b,v3.16b,v0.16b
pmull2 v6.1q,v20.2d,v7.2d
b.hs .Loop_mod2x_v8 //there was at least 32 more bytes
eor v2.16b,v2.16b,v18.16b
ext v3.16b,v16.16b,v16.16b,#8 //re-construct v3.16b
adds x3,x3,#32 //re-construct x3
eor v0.16b,v0.16b,v2.16b //re-construct v0.16b
b.eq .Ldone_v8 //is x3 zero?
.Lodd_tail_v8:
ext v18.16b,v0.16b,v0.16b,#8
eor v3.16b,v3.16b,v0.16b //inp^=Xi
eor v17.16b,v17.16b,v18.16b //v17.16b is rotated inp^Xi
eor v17.16b,v16.16b,v18.16b //v17.16b is rotated inp^Xi
.Lgmult_v8:
pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
subs x3,x3,#16
pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
csel x12,xzr,x12,eq
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
ld1 {v17.2d},[x2],x12 //load [rotated] inp
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
#ifndef __ARMEB__
rev64 v17.16b,v17.16b
#endif
eor v0.16b,v1.16b,v18.16b
ext v3.16b,v17.16b,v17.16b,#8
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
b.hs .Loop_v8
.Ldone_v8:
#ifndef __ARMEB__
rev64 v0.16b,v0.16b
#endif

deps/openssl/asm/x64-elf-gas/aes/aesni-x86_64.s (502)

@ -17,7 +17,10 @@ aesni_encrypt:
leaq 16(%rdx),%rdx
jnz .Loop_enc1_1
.byte 102,15,56,221,209
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
.byte 0xf3,0xc3
.size aesni_encrypt,.-aesni_encrypt
@ -38,7 +41,10 @@ aesni_decrypt:
leaq 16(%rdx),%rdx
jnz .Loop_dec1_2
.byte 102,15,56,223,209
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
.byte 0xf3,0xc3
.size aesni_decrypt, .-aesni_decrypt
.type _aesni_encrypt2,@function
@ -264,21 +270,18 @@ _aesni_encrypt6:
pxor %xmm0,%xmm6
.byte 102,15,56,220,225
pxor %xmm0,%xmm7
movups (%rcx,%rax,1),%xmm0
addq $16,%rax
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
movups -16(%rcx,%rax,1),%xmm0
jmp .Lenc_loop6_enter
.align 16
.Lenc_loop6:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
.Lenc_loop6_enter:
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
.Lenc_loop6_enter:
movups (%rcx,%rax,1),%xmm1
addq $32,%rax
.byte 102,15,56,220,208
@ -321,21 +324,18 @@ _aesni_decrypt6:
pxor %xmm0,%xmm6
.byte 102,15,56,222,225
pxor %xmm0,%xmm7
movups (%rcx,%rax,1),%xmm0
addq $16,%rax
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
movups -16(%rcx,%rax,1),%xmm0
jmp .Ldec_loop6_enter
.align 16
.Ldec_loop6:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
.Ldec_loop6_enter:
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
.Ldec_loop6_enter:
movups (%rcx,%rax,1),%xmm1
addq $32,%rax
.byte 102,15,56,222,208
@ -375,23 +375,18 @@ _aesni_encrypt8:
leaq 32(%rcx,%rax,1),%rcx
negq %rax
.byte 102,15,56,220,209
addq $16,%rax
pxor %xmm0,%xmm7
.byte 102,15,56,220,217
pxor %xmm0,%xmm8
.byte 102,15,56,220,217
pxor %xmm0,%xmm9
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
movups -16(%rcx,%rax,1),%xmm0
jmp .Lenc_loop8_enter
movups (%rcx,%rax,1),%xmm0
addq $16,%rax
jmp .Lenc_loop8_inner
.align 16
.Lenc_loop8:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.Lenc_loop8_inner:
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
@ -444,23 +439,18 @@ _aesni_decrypt8:
leaq 32(%rcx,%rax,1),%rcx
negq %rax
.byte 102,15,56,222,209
addq $16,%rax
pxor %xmm0,%xmm7
.byte 102,15,56,222,217
pxor %xmm0,%xmm8
.byte 102,15,56,222,217
pxor %xmm0,%xmm9
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
movups -16(%rcx,%rax,1),%xmm0
jmp .Ldec_loop8_enter
movups (%rcx,%rax,1),%xmm0
addq $16,%rax
jmp .Ldec_loop8_inner
.align 16
.Ldec_loop8:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.Ldec_loop8_inner:
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
@ -587,6 +577,7 @@ aesni_ecb_encrypt:
movups 80(%rdi),%xmm7
je .Lecb_enc_six
movdqu 96(%rdi),%xmm8
xorps %xmm9,%xmm9
call _aesni_encrypt8
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
@ -700,15 +691,23 @@ aesni_ecb_encrypt:
jnc .Lecb_dec_loop8
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
movq %r11,%rcx
movups %xmm3,16(%rsi)
pxor %xmm3,%xmm3
movl %r10d,%eax
movups %xmm4,32(%rsi)
pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
pxor %xmm7,%xmm7
movups %xmm8,96(%rsi)
pxor %xmm8,%xmm8
movups %xmm9,112(%rsi)
pxor %xmm9,%xmm9
leaq 128(%rsi),%rsi
addq $128,%rdx
jz .Lecb_ret
@ -731,14 +730,23 @@ aesni_ecb_encrypt:
je .Lecb_dec_six
movups 96(%rdi),%xmm8
movups (%rcx),%xmm0
xorps %xmm9,%xmm9
call _aesni_decrypt8
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
pxor %xmm7,%xmm7
movups %xmm8,96(%rsi)
pxor %xmm8,%xmm8
pxor %xmm9,%xmm9
jmp .Lecb_ret
.align 16
.Lecb_dec_one:
@ -754,49 +762,73 @@ aesni_ecb_encrypt:
jnz .Loop_dec1_4
.byte 102,15,56,223,209
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
jmp .Lecb_ret
.align 16
.Lecb_dec_two:
call _aesni_decrypt2
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
pxor %xmm3,%xmm3
jmp .Lecb_ret
.align 16
.Lecb_dec_three:
call _aesni_decrypt3
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
pxor %xmm4,%xmm4
jmp .Lecb_ret
.align 16
.Lecb_dec_four:
call _aesni_decrypt4
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
pxor %xmm5,%xmm5
jmp .Lecb_ret
.align 16
.Lecb_dec_five:
xorps %xmm7,%xmm7
call _aesni_decrypt6
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
pxor %xmm6,%xmm6
pxor %xmm7,%xmm7
jmp .Lecb_ret
.align 16
.Lecb_dec_six:
call _aesni_decrypt6
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
pxor %xmm7,%xmm7
.Lecb_ret:
xorps %xmm0,%xmm0
pxor %xmm1,%xmm1
.byte 0xf3,0xc3
.size aesni_ecb_encrypt,.-aesni_ecb_encrypt
.globl aesni_ccm64_encrypt_blocks
@ -853,7 +885,13 @@ aesni_ccm64_encrypt_blocks:
leaq 16(%rsi),%rsi
jnz .Lccm64_enc_outer
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
pxor %xmm2,%xmm2
movups %xmm3,(%r9)
pxor %xmm3,%xmm3
pxor %xmm8,%xmm8
pxor %xmm6,%xmm6
.byte 0xf3,0xc3
.size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
.globl aesni_ccm64_decrypt_blocks
@ -944,21 +982,56 @@ aesni_ccm64_decrypt_blocks:
leaq 16(%r11),%r11
jnz .Loop_enc1_6
.byte 102,15,56,221,217
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
pxor %xmm2,%xmm2
movups %xmm3,(%r9)
pxor %xmm3,%xmm3
pxor %xmm8,%xmm8
pxor %xmm6,%xmm6
.byte 0xf3,0xc3
.size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
.globl aesni_ctr32_encrypt_blocks
.type aesni_ctr32_encrypt_blocks,@function
.align 16
aesni_ctr32_encrypt_blocks:
cmpq $1,%rdx
jne .Lctr32_bulk
movups (%r8),%xmm2
movups (%rdi),%xmm3
movl 240(%rcx),%edx
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
.Loop_enc1_7:
.byte 102,15,56,220,209
decl %edx
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
jnz .Loop_enc1_7
.byte 102,15,56,221,209
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
xorps %xmm3,%xmm2
pxor %xmm3,%xmm3
movups %xmm2,(%rsi)
xorps %xmm2,%xmm2
jmp .Lctr32_epilogue
.align 16
.Lctr32_bulk:
leaq (%rsp),%rax
pushq %rbp
subq $128,%rsp
andq $-16,%rsp
leaq -8(%rax),%rbp
cmpq $1,%rdx
je .Lctr32_one_shortcut
movdqu (%r8),%xmm2
movdqu (%rcx),%xmm0
@ -1349,11 +1422,14 @@ aesni_ctr32_encrypt_blocks:
leaq -128(%rcx),%rcx
.Lctr32_tail:
leaq 16(%rcx),%rcx
cmpq $4,%rdx
jb .Lctr32_loop3
je .Lctr32_loop4
shll $4,%eax
movdqa 96(%rsp),%xmm8
pxor %xmm9,%xmm9
@ -1456,30 +1532,33 @@ aesni_ctr32_encrypt_blocks:
movups 32(%rdi),%xmm12
xorps %xmm12,%xmm4
movups %xmm4,32(%rsi)
jmp .Lctr32_done
.align 16
.Lctr32_one_shortcut:
movups (%r8),%xmm2
movups (%rdi),%xmm10
movl 240(%rcx),%eax
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
.Loop_enc1_7:
.byte 102,15,56,220,209
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
jnz .Loop_enc1_7
.byte 102,15,56,221,209
xorps %xmm10,%xmm2
movups %xmm2,(%rsi)
jmp .Lctr32_done
.align 16
.Lctr32_done:
xorps %xmm0,%xmm0
xorl %r11d,%r11d
pxor %xmm1,%xmm1
pxor %xmm2,%xmm2
pxor %xmm3,%xmm3
pxor %xmm4,%xmm4
pxor %xmm5,%xmm5
pxor %xmm6,%xmm6
pxor %xmm7,%xmm7
movaps %xmm0,0(%rsp)
pxor %xmm8,%xmm8
movaps %xmm0,16(%rsp)
pxor %xmm9,%xmm9
movaps %xmm0,32(%rsp)
pxor %xmm10,%xmm10
movaps %xmm0,48(%rsp)
pxor %xmm11,%xmm11
movaps %xmm0,64(%rsp)
pxor %xmm12,%xmm12
movaps %xmm0,80(%rsp)
pxor %xmm13,%xmm13
movaps %xmm0,96(%rsp)
pxor %xmm14,%xmm14
movaps %xmm0,112(%rsp)
pxor %xmm15,%xmm15
leaq (%rbp),%rsp
popq %rbp
.Lctr32_epilogue:
@ -1750,6 +1829,7 @@ aesni_xts_encrypt:
shrl $4,%eax
.Lxts_enc_short:
movl %eax,%r10d
pxor %xmm0,%xmm10
addq $96,%rdx
@ -1778,6 +1858,7 @@ aesni_xts_encrypt:
pxor %xmm12,%xmm4
pxor %xmm13,%xmm5
pxor %xmm14,%xmm6
pxor %xmm7,%xmm7
call _aesni_encrypt6
@ -1920,6 +2001,29 @@ aesni_xts_encrypt:
movups %xmm2,-16(%rsi)
.Lxts_enc_ret:
xorps %xmm0,%xmm0
pxor %xmm1,%xmm1
pxor %xmm2,%xmm2
pxor %xmm3,%xmm3
pxor %xmm4,%xmm4
pxor %xmm5,%xmm5
pxor %xmm6,%xmm6
pxor %xmm7,%xmm7
movaps %xmm0,0(%rsp)
pxor %xmm8,%xmm8
movaps %xmm0,16(%rsp)
pxor %xmm9,%xmm9
movaps %xmm0,32(%rsp)
pxor %xmm10,%xmm10
movaps %xmm0,48(%rsp)
pxor %xmm11,%xmm11
movaps %xmm0,64(%rsp)
pxor %xmm12,%xmm12
movaps %xmm0,80(%rsp)
pxor %xmm13,%xmm13
movaps %xmm0,96(%rsp)
pxor %xmm14,%xmm14
pxor %xmm15,%xmm15
leaq (%rbp),%rsp
popq %rbp
.Lxts_enc_epilogue:
@ -2196,6 +2300,7 @@ aesni_xts_decrypt:
shrl $4,%eax
.Lxts_dec_short:
movl %eax,%r10d
pxor %xmm0,%xmm10
pxor %xmm0,%xmm11
@ -2398,6 +2503,29 @@ aesni_xts_decrypt:
movups %xmm2,(%rsi)
.Lxts_dec_ret:
xorps %xmm0,%xmm0
pxor %xmm1,%xmm1
pxor %xmm2,%xmm2
pxor %xmm3,%xmm3
pxor %xmm4,%xmm4
pxor %xmm5,%xmm5
pxor %xmm6,%xmm6
pxor %xmm7,%xmm7
movaps %xmm0,0(%rsp)
pxor %xmm8,%xmm8
movaps %xmm0,16(%rsp)
pxor %xmm9,%xmm9
movaps %xmm0,32(%rsp)
pxor %xmm10,%xmm10
movaps %xmm0,48(%rsp)
pxor %xmm11,%xmm11
movaps %xmm0,64(%rsp)
pxor %xmm12,%xmm12
movaps %xmm0,80(%rsp)
pxor %xmm13,%xmm13
movaps %xmm0,96(%rsp)
pxor %xmm14,%xmm14
pxor %xmm15,%xmm15
leaq (%rbp),%rsp
popq %rbp
.Lxts_dec_epilogue:
@ -2446,7 +2574,11 @@ aesni_cbc_encrypt:
jnc .Lcbc_enc_loop
addq $16,%rdx
jnz .Lcbc_enc_tail
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
movups %xmm2,(%r8)
pxor %xmm2,%xmm2
pxor %xmm3,%xmm3
jmp .Lcbc_ret
.Lcbc_enc_tail:
@ -2466,6 +2598,35 @@ aesni_cbc_encrypt:
.align 16
.Lcbc_decrypt:
cmpq $16,%rdx
jne .Lcbc_decrypt_bulk
movdqu (%rdi),%xmm2
movdqu (%r8),%xmm3
movdqa %xmm2,%xmm4
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
.Loop_dec1_16:
.byte 102,15,56,222,209
decl %r10d
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
jnz .Loop_dec1_16
.byte 102,15,56,223,209
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
movdqu %xmm4,(%r8)
xorps %xmm3,%xmm2
pxor %xmm3,%xmm3
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
jmp .Lcbc_ret
.align 16
.Lcbc_decrypt_bulk:
leaq (%rsp),%rax
pushq %rbp
subq $16,%rsp
@ -2702,7 +2863,7 @@ aesni_cbc_encrypt:
movaps %xmm9,%xmm2
leaq -112(%rcx),%rcx
addq $112,%rdx
jle .Lcbc_dec_tail_collected
jle .Lcbc_dec_clear_tail_collected
movups %xmm9,(%rsi)
leaq 16(%rsi),%rsi
cmpq $80,%rdx
@ -2721,14 +2882,19 @@ aesni_cbc_encrypt:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
pxor %xmm5,%xmm5
pxor %xmm15,%xmm7
movdqu %xmm6,64(%rsi)
pxor %xmm6,%xmm6
leaq 80(%rsi),%rsi
movdqa %xmm7,%xmm2
pxor %xmm7,%xmm7
jmp .Lcbc_dec_tail_collected
.align 16
@ -2743,16 +2909,23 @@ aesni_cbc_encrypt:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
pxor %xmm5,%xmm5
pxor %xmm15,%xmm7
movdqu %xmm6,64(%rsi)
pxor %xmm6,%xmm6
pxor %xmm9,%xmm8
movdqu %xmm7,80(%rsi)
pxor %xmm7,%xmm7
leaq 96(%rsi),%rsi
movdqa %xmm8,%xmm2
pxor %xmm8,%xmm8
pxor %xmm9,%xmm9
jmp .Lcbc_dec_tail_collected
.align 16
@ -2796,7 +2969,7 @@ aesni_cbc_encrypt:
movdqa %xmm7,%xmm2
addq $80,%rdx
jle .Lcbc_dec_tail_collected
jle .Lcbc_dec_clear_tail_collected
movups %xmm7,(%rsi)
leaq 16(%rsi),%rsi
@ -2831,12 +3004,17 @@ aesni_cbc_encrypt:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
pxor %xmm5,%xmm5
leaq 64(%rsi),%rsi
movdqa %xmm6,%xmm2
pxor %xmm6,%xmm6
pxor %xmm7,%xmm7
subq $16,%rdx
jmp .Lcbc_dec_tail_collected
@ -2847,12 +3025,12 @@ aesni_cbc_encrypt:
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
.Loop_dec1_16:
.Loop_dec1_17:
.byte 102,15,56,222,209
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
jnz .Loop_dec1_16
jnz .Loop_dec1_17
.byte 102,15,56,223,209
xorps %xmm10,%xmm2
movaps %xmm11,%xmm10
@ -2866,6 +3044,7 @@ aesni_cbc_encrypt:
pxor %xmm11,%xmm3
movdqu %xmm2,(%rsi)
movdqa %xmm3,%xmm2
pxor %xmm3,%xmm3
leaq 16(%rsi),%rsi
jmp .Lcbc_dec_tail_collected
.align 16
@ -2878,7 +3057,9 @@ aesni_cbc_encrypt:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
pxor %xmm3,%xmm3
movdqa %xmm4,%xmm2
pxor %xmm4,%xmm4
leaq 32(%rsi),%rsi
jmp .Lcbc_dec_tail_collected
.align 16
@ -2891,29 +3072,45 @@ aesni_cbc_encrypt:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
pxor %xmm4,%xmm4
movdqa %xmm5,%xmm2
pxor %xmm5,%xmm5
leaq 48(%rsi),%rsi
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_clear_tail_collected:
pxor %xmm3,%xmm3
pxor %xmm4,%xmm4
pxor %xmm5,%xmm5
pxor %xmm6,%xmm6
pxor %xmm7,%xmm7
pxor %xmm8,%xmm8
pxor %xmm9,%xmm9
.Lcbc_dec_tail_collected:
movups %xmm10,(%r8)
andq $15,%rdx
jnz .Lcbc_dec_tail_partial
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
jmp .Lcbc_dec_ret
.align 16
.Lcbc_dec_tail_partial:
movaps %xmm2,(%rsp)
pxor %xmm2,%xmm2
movq $16,%rcx
movq %rsi,%rdi
subq %rdx,%rcx
leaq (%rsp),%rsi
.long 0x9066A4F3
movdqa %xmm2,(%rsp)
.Lcbc_dec_ret:
xorps %xmm0,%xmm0
pxor %xmm1,%xmm1
leaq (%rbp),%rsp
popq %rbp
.Lcbc_ret:
@ -2951,7 +3148,9 @@ aesni_set_decrypt_key:
movups (%rdx),%xmm0
.byte 102,15,56,219,192
pxor %xmm1,%xmm1
movups %xmm0,(%rdi)
pxor %xmm0,%xmm0
.Ldec_key_ret:
addq $8,%rsp
.byte 0xf3,0xc3
@ -2969,8 +3168,10 @@ __aesni_set_encrypt_key:
testq %rdx,%rdx
jz .Lenc_key_ret
movl $268437504,%r10d
movups (%rdi),%xmm0
xorps %xmm4,%xmm4
andl OPENSSL_ia32cap_P+4(%rip),%r10d
leaq 16(%rdx),%rax
cmpl $256,%esi
je .L14rounds
@ -2981,6 +3182,9 @@ __aesni_set_encrypt_key:
.L10rounds:
movl $9,%esi
cmpl $268435456,%r10d
je .L10rounds_alt
movups %xmm0,(%rdx)
.byte 102,15,58,223,200,1
call .Lkey_expansion_128_cold
@ -3007,10 +3211,80 @@ __aesni_set_encrypt_key:
xorl %eax,%eax
jmp .Lenc_key_ret
.align 16
.L10rounds_alt:
movdqa .Lkey_rotate(%rip),%xmm5
movl $8,%r10d
movdqa .Lkey_rcon1(%rip),%xmm4
movdqa %xmm0,%xmm2
movdqu %xmm0,(%rdx)
jmp .Loop_key128
.align 16
.Loop_key128:
.byte 102,15,56,0,197
.byte 102,15,56,221,196
pslld $1,%xmm4
leaq 16(%rax),%rax
movdqa %xmm2,%xmm3
pslldq $4,%xmm2
pxor %xmm2,%xmm3
pslldq $4,%xmm2
pxor %xmm2,%xmm3
pslldq $4,%xmm2
pxor %xmm3,%xmm2
pxor %xmm2,%xmm0
movdqu %xmm0,-16(%rax)
movdqa %xmm0,%xmm2
decl %r10d
jnz .Loop_key128
movdqa .Lkey_rcon1b(%rip),%xmm4
.byte 102,15,56,0,197
.byte 102,15,56,221,196
pslld $1,%xmm4
movdqa %xmm2,%xmm3
pslldq $4,%xmm2
pxor %xmm2,%xmm3
pslldq $4,%xmm2
pxor %xmm2,%xmm3
pslldq $4,%xmm2
pxor %xmm3,%xmm2
pxor %xmm2,%xmm0
movdqu %xmm0,(%rax)
movdqa %xmm0,%xmm2
.byte 102,15,56,0,197
.byte 102,15,56,221,196
movdqa %xmm2,%xmm3
pslldq $4,%xmm2
pxor %xmm2,%xmm3
pslldq $4,%xmm2
pxor %xmm2,%xmm3
pslldq $4,%xmm2
pxor %xmm3,%xmm2
pxor %xmm2,%xmm0
movdqu %xmm0,16(%rax)
movl %esi,96(%rax)
xorl %eax,%eax
jmp .Lenc_key_ret
.align 16
.L12rounds:
movq 16(%rdi),%xmm2
movl $11,%esi
cmpl $268435456,%r10d
je .L12rounds_alt
movups %xmm0,(%rdx)
.byte 102,15,58,223,202,1
call .Lkey_expansion_192a_cold
@ -3033,11 +3307,55 @@ __aesni_set_encrypt_key:
xorq %rax,%rax
jmp .Lenc_key_ret
.align 16
.L12rounds_alt:
movdqa .Lkey_rotate192(%rip),%xmm5
movdqa .Lkey_rcon1(%rip),%xmm4
movl $8,%r10d
movdqu %xmm0,(%rdx)
jmp .Loop_key192
.align 16
.Loop_key192:
movq %xmm2,0(%rax)
movdqa %xmm2,%xmm1
.byte 102,15,56,0,213
.byte 102,15,56,221,212
pslld $1,%xmm4
leaq 24(%rax),%rax
movdqa %xmm0,%xmm3
pslldq $4,%xmm0
pxor %xmm0,%xmm3
pslldq $4,%xmm0
pxor %xmm0,%xmm3
pslldq $4,%xmm0
pxor %xmm3,%xmm0
pshufd $255,%xmm0,%xmm3
pxor %xmm1,%xmm3
pslldq $4,%xmm1
pxor %xmm1,%xmm3
pxor %xmm2,%xmm0
pxor %xmm3,%xmm2
movdqu %xmm0,-16(%rax)
decl %r10d
jnz .Loop_key192
movl %esi,32(%rax)
xorl %eax,%eax
jmp .Lenc_key_ret
.align 16
.L14rounds:
movups 16(%rdi),%xmm2
movl $13,%esi
leaq 16(%rax),%rax
cmpl $268435456,%r10d
je .L14rounds_alt
movups %xmm0,(%rdx)
movups %xmm2,16(%rdx)
.byte 102,15,58,223,202,1
@ -3071,10 +3389,70 @@ __aesni_set_encrypt_key:
xorq %rax,%rax
jmp .Lenc_key_ret
.align 16
.L14rounds_alt:
movdqa .Lkey_rotate(%rip),%xmm5
movdqa .Lkey_rcon1(%rip),%xmm4
movl $7,%r10d
movdqu %xmm0,0(%rdx)
movdqa %xmm2,%xmm1
movdqu %xmm2,16(%rdx)
jmp .Loop_key256
.align 16
.Loop_key256:
.byte 102,15,56,0,213
.byte 102,15,56,221,212
movdqa %xmm0,%xmm3
pslldq $4,%xmm0
pxor %xmm0,%xmm3
pslldq $4,%xmm0
pxor %xmm0,%xmm3
pslldq $4,%xmm0
pxor %xmm3,%xmm0
pslld $1,%xmm4
pxor %xmm2,%xmm0
movdqu %xmm0,(%rax)
decl %r10d
jz .Ldone_key256
pshufd $255,%xmm0,%xmm2
pxor %xmm3,%xmm3
.byte 102,15,56,221,211
movdqa %xmm1,%xmm3
pslldq $4,%xmm1
pxor %xmm1,%xmm3
pslldq $4,%xmm1
pxor %xmm1,%xmm3
pslldq $4,%xmm1
pxor %xmm3,%xmm1
pxor %xmm1,%xmm2
movdqu %xmm2,16(%rax)
leaq 32(%rax),%rax
movdqa %xmm2,%xmm1
jmp .Loop_key256
.Ldone_key256:
movl %esi,16(%rax)
xorl %eax,%eax
jmp .Lenc_key_ret
.align 16
.Lbad_keybits:
movq $-2,%rax
.Lenc_key_ret:
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
pxor %xmm2,%xmm2
pxor %xmm3,%xmm3
pxor %xmm4,%xmm4
pxor %xmm5,%xmm5
addq $8,%rsp
.byte 0xf3,0xc3
.LSEH_end_set_encrypt_key:
@ -3160,6 +3538,14 @@ __aesni_set_encrypt_key:
.long 0x87,0,1,0
.Lincrement1:
.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.Lkey_rotate:
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
.Lkey_rotate192:
.long 0x04070605,0x04070605,0x04070605,0x04070605
.Lkey_rcon1:
.long 1,1,1,1
.Lkey_rcon1b:
.long 0x1b,0x1b,0x1b,0x1b
.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 64

13
deps/openssl/asm/x64-elf-gas/bn/x86_64-mont5.s

@ -2884,11 +2884,16 @@ sqrx8x_reduction:
.type bn_get_bits5,@function
.align 16
bn_get_bits5:
movq %rdi,%r10
leaq 0(%rdi),%r10
leaq 1(%rdi),%r11
movl %esi,%ecx
shrl $3,%esi
movzwl (%r10,%rsi,1),%eax
andl $7,%ecx
shrl $4,%esi
andl $15,%ecx
leal -8(%rcx),%eax
cmpl $11,%ecx
cmovaq %r11,%r10
cmoval %eax,%ecx
movzwl (%r10,%rsi,2),%eax
shrl %cl,%eax
andl $31,%eax
.byte 0xf3,0xc3

502
deps/openssl/asm/x64-macosx-gas/aes/aesni-x86_64.s

@ -17,7 +17,10 @@ L$oop_enc1_1:
leaq 16(%rdx),%rdx
jnz L$oop_enc1_1
.byte 102,15,56,221,209
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
.byte 0xf3,0xc3
@ -38,7 +41,10 @@ L$oop_dec1_2:
leaq 16(%rdx),%rdx
jnz L$oop_dec1_2
.byte 102,15,56,223,209
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
.byte 0xf3,0xc3
@ -264,21 +270,18 @@ _aesni_encrypt6:
pxor %xmm0,%xmm6
.byte 102,15,56,220,225
pxor %xmm0,%xmm7
movups (%rcx,%rax,1),%xmm0
addq $16,%rax
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
movups -16(%rcx,%rax,1),%xmm0
jmp L$enc_loop6_enter
.p2align 4
L$enc_loop6:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
L$enc_loop6_enter:
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
L$enc_loop6_enter:
movups (%rcx,%rax,1),%xmm1
addq $32,%rax
.byte 102,15,56,220,208
@ -321,21 +324,18 @@ _aesni_decrypt6:
pxor %xmm0,%xmm6
.byte 102,15,56,222,225
pxor %xmm0,%xmm7
movups (%rcx,%rax,1),%xmm0
addq $16,%rax
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
movups -16(%rcx,%rax,1),%xmm0
jmp L$dec_loop6_enter
.p2align 4
L$dec_loop6:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
L$dec_loop6_enter:
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
L$dec_loop6_enter:
movups (%rcx,%rax,1),%xmm1
addq $32,%rax
.byte 102,15,56,222,208
@ -375,23 +375,18 @@ _aesni_encrypt8:
leaq 32(%rcx,%rax,1),%rcx
negq %rax
.byte 102,15,56,220,209
addq $16,%rax
pxor %xmm0,%xmm7
.byte 102,15,56,220,217
pxor %xmm0,%xmm8
.byte 102,15,56,220,217
pxor %xmm0,%xmm9
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
movups -16(%rcx,%rax,1),%xmm0
jmp L$enc_loop8_enter
movups (%rcx,%rax,1),%xmm0
addq $16,%rax
jmp L$enc_loop8_inner
.p2align 4
L$enc_loop8:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
L$enc_loop8_inner:
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
@ -444,23 +439,18 @@ _aesni_decrypt8:
leaq 32(%rcx,%rax,1),%rcx
negq %rax
.byte 102,15,56,222,209
addq $16,%rax
pxor %xmm0,%xmm7
.byte 102,15,56,222,217
pxor %xmm0,%xmm8
.byte 102,15,56,222,217
pxor %xmm0,%xmm9
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
movups -16(%rcx,%rax,1),%xmm0
jmp L$dec_loop8_enter
movups (%rcx,%rax,1),%xmm0
addq $16,%rax
jmp L$dec_loop8_inner
.p2align 4
L$dec_loop8:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
L$dec_loop8_inner:
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
@ -587,6 +577,7 @@ L$ecb_enc_tail:
movups 80(%rdi),%xmm7
je L$ecb_enc_six
movdqu 96(%rdi),%xmm8
xorps %xmm9,%xmm9
call _aesni_encrypt8
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
@ -700,15 +691,23 @@ L$ecb_dec_loop8_enter:
jnc L$ecb_dec_loop8
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
movq %r11,%rcx
movups %xmm3,16(%rsi)
pxor %xmm3,%xmm3
movl %r10d,%eax
movups %xmm4,32(%rsi)
pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
pxor %xmm7,%xmm7
movups %xmm8,96(%rsi)
pxor %xmm8,%xmm8
movups %xmm9,112(%rsi)
pxor %xmm9,%xmm9
leaq 128(%rsi),%rsi
addq $128,%rdx
jz L$ecb_ret
@ -731,14 +730,23 @@ L$ecb_dec_tail:
je L$ecb_dec_six
movups 96(%rdi),%xmm8
movups (%rcx),%xmm0
xorps %xmm9,%xmm9
call _aesni_decrypt8
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
pxor %xmm7,%xmm7
movups %xmm8,96(%rsi)
pxor %xmm8,%xmm8
pxor %xmm9,%xmm9
jmp L$ecb_ret
.p2align 4
L$ecb_dec_one:
@ -754,49 +762,73 @@ L$oop_dec1_4:
jnz L$oop_dec1_4
.byte 102,15,56,223,209
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
jmp L$ecb_ret
.p2align 4
L$ecb_dec_two:
call _aesni_decrypt2
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
pxor %xmm3,%xmm3
jmp L$ecb_ret
.p2align 4
L$ecb_dec_three:
call _aesni_decrypt3
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
pxor %xmm4,%xmm4
jmp L$ecb_ret
.p2align 4
L$ecb_dec_four:
call _aesni_decrypt4
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
pxor %xmm5,%xmm5
jmp L$ecb_ret
.p2align 4
L$ecb_dec_five:
xorps %xmm7,%xmm7
call _aesni_decrypt6
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
pxor %xmm6,%xmm6
pxor %xmm7,%xmm7
jmp L$ecb_ret
.p2align 4
L$ecb_dec_six:
call _aesni_decrypt6
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
pxor %xmm7,%xmm7
L$ecb_ret:
xorps %xmm0,%xmm0
pxor %xmm1,%xmm1
.byte 0xf3,0xc3
.globl _aesni_ccm64_encrypt_blocks
@ -853,7 +885,13 @@ L$ccm64_enc2_loop:
leaq 16(%rsi),%rsi
jnz L$ccm64_enc_outer
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
pxor %xmm2,%xmm2
movups %xmm3,(%r9)
pxor %xmm3,%xmm3
pxor %xmm8,%xmm8
pxor %xmm6,%xmm6
.byte 0xf3,0xc3
.globl _aesni_ccm64_decrypt_blocks
@ -944,21 +982,56 @@ L$oop_enc1_6:
leaq 16(%r11),%r11
jnz L$oop_enc1_6
.byte 102,15,56,221,217
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
pxor %xmm2,%xmm2
movups %xmm3,(%r9)
pxor %xmm3,%xmm3
pxor %xmm8,%xmm8
pxor %xmm6,%xmm6
.byte 0xf3,0xc3
.globl _aesni_ctr32_encrypt_blocks
.p2align 4
_aesni_ctr32_encrypt_blocks:
cmpq $1,%rdx
jne L$ctr32_bulk
movups (%r8),%xmm2
movups (%rdi),%xmm3
movl 240(%rcx),%edx
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
L$oop_enc1_7:
.byte 102,15,56,220,209
decl %edx
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
jnz L$oop_enc1_7
.byte 102,15,56,221,209
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
xorps %xmm3,%xmm2
pxor %xmm3,%xmm3
movups %xmm2,(%rsi)
xorps %xmm2,%xmm2
jmp L$ctr32_epilogue
.p2align 4
L$ctr32_bulk:
leaq (%rsp),%rax
pushq %rbp
subq $128,%rsp
andq $-16,%rsp
leaq -8(%rax),%rbp
cmpq $1,%rdx
je L$ctr32_one_shortcut
movdqu (%r8),%xmm2
movdqu (%rcx),%xmm0
@ -1349,11 +1422,14 @@ L$ctr32_enc_done:
leaq -128(%rcx),%rcx
L$ctr32_tail:
leaq 16(%rcx),%rcx
cmpq $4,%rdx
jb L$ctr32_loop3
je L$ctr32_loop4
shll $4,%eax
movdqa 96(%rsp),%xmm8
pxor %xmm9,%xmm9
@ -1456,30 +1532,33 @@ L$ctr32_loop3:
movups 32(%rdi),%xmm12
xorps %xmm12,%xmm4
movups %xmm4,32(%rsi)
jmp L$ctr32_done
.p2align 4
L$ctr32_one_shortcut:
movups (%r8),%xmm2
movups (%rdi),%xmm10
movl 240(%rcx),%eax
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
L$oop_enc1_7:
.byte 102,15,56,220,209
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
jnz L$oop_enc1_7
.byte 102,15,56,221,209
xorps %xmm10,%xmm2
movups %xmm2,(%rsi)
jmp L$ctr32_done
.p2align 4
L$ctr32_done:
xorps %xmm0,%xmm0
xorl %r11d,%r11d
pxor %xmm1,%xmm1
pxor %xmm2,%xmm2
pxor %xmm3,%xmm3
pxor %xmm4,%xmm4
pxor %xmm5,%xmm5
pxor %xmm6,%xmm6
pxor %xmm7,%xmm7
movaps %xmm0,0(%rsp)
pxor %xmm8,%xmm8
movaps %xmm0,16(%rsp)
pxor %xmm9,%xmm9
movaps %xmm0,32(%rsp)
pxor %xmm10,%xmm10
movaps %xmm0,48(%rsp)
pxor %xmm11,%xmm11
movaps %xmm0,64(%rsp)
pxor %xmm12,%xmm12
movaps %xmm0,80(%rsp)
pxor %xmm13,%xmm13
movaps %xmm0,96(%rsp)
pxor %xmm14,%xmm14
movaps %xmm0,112(%rsp)
pxor %xmm15,%xmm15
leaq (%rbp),%rsp
popq %rbp
L$ctr32_epilogue:
@ -1750,6 +1829,7 @@ L$xts_enc_loop6:
shrl $4,%eax
L$xts_enc_short:
movl %eax,%r10d
pxor %xmm0,%xmm10
addq $96,%rdx
@ -1778,6 +1858,7 @@ L$xts_enc_short:
pxor %xmm12,%xmm4
pxor %xmm13,%xmm5
pxor %xmm14,%xmm6
pxor %xmm7,%xmm7
call _aesni_encrypt6
@ -1920,6 +2001,29 @@ L$oop_enc1_10:
movups %xmm2,-16(%rsi)
L$xts_enc_ret:
xorps %xmm0,%xmm0
pxor %xmm1,%xmm1
pxor %xmm2,%xmm2
pxor %xmm3,%xmm3
pxor %xmm4,%xmm4
pxor %xmm5,%xmm5
pxor %xmm6,%xmm6
pxor %xmm7,%xmm7
movaps %xmm0,0(%rsp)
pxor %xmm8,%xmm8
movaps %xmm0,16(%rsp)
pxor %xmm9,%xmm9
movaps %xmm0,32(%rsp)
pxor %xmm10,%xmm10
movaps %xmm0,48(%rsp)
pxor %xmm11,%xmm11
movaps %xmm0,64(%rsp)
pxor %xmm12,%xmm12
movaps %xmm0,80(%rsp)
pxor %xmm13,%xmm13
movaps %xmm0,96(%rsp)
pxor %xmm14,%xmm14
pxor %xmm15,%xmm15
leaq (%rbp),%rsp
popq %rbp
L$xts_enc_epilogue:
@ -2196,6 +2300,7 @@ L$xts_dec_loop6:
shrl $4,%eax
L$xts_dec_short:
movl %eax,%r10d
pxor %xmm0,%xmm10
pxor %xmm0,%xmm11
@ -2398,6 +2503,29 @@ L$oop_dec1_14:
movups %xmm2,(%rsi)
L$xts_dec_ret:
xorps %xmm0,%xmm0
pxor %xmm1,%xmm1
pxor %xmm2,%xmm2
pxor %xmm3,%xmm3
pxor %xmm4,%xmm4
pxor %xmm5,%xmm5
pxor %xmm6,%xmm6
pxor %xmm7,%xmm7
movaps %xmm0,0(%rsp)
pxor %xmm8,%xmm8
movaps %xmm0,16(%rsp)
pxor %xmm9,%xmm9
movaps %xmm0,32(%rsp)
pxor %xmm10,%xmm10
movaps %xmm0,48(%rsp)
pxor %xmm11,%xmm11
movaps %xmm0,64(%rsp)
pxor %xmm12,%xmm12
movaps %xmm0,80(%rsp)
pxor %xmm13,%xmm13
movaps %xmm0,96(%rsp)
pxor %xmm14,%xmm14
pxor %xmm15,%xmm15
leaq (%rbp),%rsp
popq %rbp
L$xts_dec_epilogue:
@ -2446,7 +2574,11 @@ L$oop_enc1_15:
jnc L$cbc_enc_loop
addq $16,%rdx
jnz L$cbc_enc_tail
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
movups %xmm2,(%r8)
pxor %xmm2,%xmm2
pxor %xmm3,%xmm3
jmp L$cbc_ret
L$cbc_enc_tail:
@ -2466,6 +2598,35 @@ L$cbc_enc_tail:
.p2align 4
L$cbc_decrypt:
cmpq $16,%rdx
jne L$cbc_decrypt_bulk
movdqu (%rdi),%xmm2
movdqu (%r8),%xmm3
movdqa %xmm2,%xmm4
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
L$oop_dec1_16:
.byte 102,15,56,222,209
decl %r10d
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
jnz L$oop_dec1_16
.byte 102,15,56,223,209
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
movdqu %xmm4,(%r8)
xorps %xmm3,%xmm2
pxor %xmm3,%xmm3
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
jmp L$cbc_ret
.p2align 4
L$cbc_decrypt_bulk:
leaq (%rsp),%rax
pushq %rbp
subq $16,%rsp
@ -2702,7 +2863,7 @@ L$cbc_dec_done:
movaps %xmm9,%xmm2
leaq -112(%rcx),%rcx
addq $112,%rdx
jle L$cbc_dec_tail_collected
jle L$cbc_dec_clear_tail_collected
movups %xmm9,(%rsi)
leaq 16(%rsi),%rsi
cmpq $80,%rdx
@ -2721,14 +2882,19 @@ L$cbc_dec_six_or_seven:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
pxor %xmm5,%xmm5
pxor %xmm15,%xmm7
movdqu %xmm6,64(%rsi)
pxor %xmm6,%xmm6
leaq 80(%rsi),%rsi
movdqa %xmm7,%xmm2
pxor %xmm7,%xmm7
jmp L$cbc_dec_tail_collected
.p2align 4
@ -2743,16 +2909,23 @@ L$cbc_dec_seven:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
pxor %xmm5,%xmm5
pxor %xmm15,%xmm7
movdqu %xmm6,64(%rsi)
pxor %xmm6,%xmm6
pxor %xmm9,%xmm8
movdqu %xmm7,80(%rsi)
pxor %xmm7,%xmm7
leaq 96(%rsi),%rsi
movdqa %xmm8,%xmm2
pxor %xmm8,%xmm8
pxor %xmm9,%xmm9
jmp L$cbc_dec_tail_collected
.p2align 4
@ -2796,7 +2969,7 @@ L$cbc_dec_loop6_enter:
movdqa %xmm7,%xmm2
addq $80,%rdx
jle L$cbc_dec_tail_collected
jle L$cbc_dec_clear_tail_collected
movups %xmm7,(%rsi)
leaq 16(%rsi),%rsi
@ -2831,12 +3004,17 @@ L$cbc_dec_tail:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
pxor %xmm5,%xmm5
leaq 64(%rsi),%rsi
movdqa %xmm6,%xmm2
pxor %xmm6,%xmm6
pxor %xmm7,%xmm7
subq $16,%rdx
jmp L$cbc_dec_tail_collected
@ -2847,12 +3025,12 @@ L$cbc_dec_one:
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
L$oop_dec1_16:
L$oop_dec1_17:
.byte 102,15,56,222,209
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
jnz L$oop_dec1_16
jnz L$oop_dec1_17
.byte 102,15,56,223,209
xorps %xmm10,%xmm2
movaps %xmm11,%xmm10
@ -2866,6 +3044,7 @@ L$cbc_dec_two:
pxor %xmm11,%xmm3
movdqu %xmm2,(%rsi)
movdqa %xmm3,%xmm2
pxor %xmm3,%xmm3
leaq 16(%rsi),%rsi
jmp L$cbc_dec_tail_collected
.p2align 4
@ -2878,7 +3057,9 @@ L$cbc_dec_three:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
pxor %xmm3,%xmm3
movdqa %xmm4,%xmm2
pxor %xmm4,%xmm4
leaq 32(%rsi),%rsi
jmp L$cbc_dec_tail_collected
.p2align 4
@ -2891,29 +3072,45 @@ L$cbc_dec_four:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
pxor %xmm4,%xmm4
movdqa %xmm5,%xmm2
pxor %xmm5,%xmm5
leaq 48(%rsi),%rsi
jmp L$cbc_dec_tail_collected
.p2align 4
L$cbc_dec_clear_tail_collected:
pxor %xmm3,%xmm3
pxor %xmm4,%xmm4
pxor %xmm5,%xmm5
pxor %xmm6,%xmm6
pxor %xmm7,%xmm7
pxor %xmm8,%xmm8
pxor %xmm9,%xmm9
L$cbc_dec_tail_collected:
movups %xmm10,(%r8)
andq $15,%rdx
jnz L$cbc_dec_tail_partial
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
jmp L$cbc_dec_ret
.p2align 4
L$cbc_dec_tail_partial:
movaps %xmm2,(%rsp)
pxor %xmm2,%xmm2
movq $16,%rcx
movq %rsi,%rdi
subq %rdx,%rcx
leaq (%rsp),%rsi
.long 0x9066A4F3
movdqa %xmm2,(%rsp)
L$cbc_dec_ret:
xorps %xmm0,%xmm0
pxor %xmm1,%xmm1
leaq (%rbp),%rsp
popq %rbp
L$cbc_ret:
@ -2951,7 +3148,9 @@ L$dec_key_inverse:
movups (%rdx),%xmm0
.byte 102,15,56,219,192
pxor %xmm1,%xmm1
movups %xmm0,(%rdi)
pxor %xmm0,%xmm0
L$dec_key_ret:
addq $8,%rsp
.byte 0xf3,0xc3
@ -2969,8 +3168,10 @@ __aesni_set_encrypt_key:
testq %rdx,%rdx
jz L$enc_key_ret
movl $268437504,%r10d
movups (%rdi),%xmm0
xorps %xmm4,%xmm4
andl _OPENSSL_ia32cap_P+4(%rip),%r10d
leaq 16(%rdx),%rax
cmpl $256,%esi
je L$14rounds
@ -2981,6 +3182,9 @@ __aesni_set_encrypt_key:
L$10rounds:
movl $9,%esi
cmpl $268435456,%r10d
je L$10rounds_alt
movups %xmm0,(%rdx)
.byte 102,15,58,223,200,1
call L$key_expansion_128_cold
@ -3007,10 +3211,80 @@ L$10rounds:
xorl %eax,%eax
jmp L$enc_key_ret
.p2align 4
L$10rounds_alt:
movdqa L$key_rotate(%rip),%xmm5
movl $8,%r10d
movdqa L$key_rcon1(%rip),%xmm4
movdqa %xmm0,%xmm2
movdqu %xmm0,(%rdx)
jmp L$oop_key128
.p2align 4
L$oop_key128:
.byte 102,15,56,0,197
.byte 102,15,56,221,196
pslld $1,%xmm4
leaq 16(%rax),%rax
movdqa %xmm2,%xmm3
pslldq $4,%xmm2
pxor %xmm2,%xmm3
pslldq $4,%xmm2
pxor %xmm2,%xmm3
pslldq $4,%xmm2
pxor %xmm3,%xmm2
pxor %xmm2,%xmm0
movdqu %xmm0,-16(%rax)
movdqa %xmm0,%xmm2
decl %r10d
jnz L$oop_key128
movdqa L$key_rcon1b(%rip),%xmm4
.byte 102,15,56,0,197
.byte 102,15,56,221,196
pslld $1,%xmm4
movdqa %xmm2,%xmm3
pslldq $4,%xmm2
pxor %xmm2,%xmm3
pslldq $4,%xmm2
pxor %xmm2,%xmm3
pslldq $4,%xmm2
pxor %xmm3,%xmm2
pxor %xmm2,%xmm0
movdqu %xmm0,(%rax)
movdqa %xmm0,%xmm2
.byte 102,15,56,0,197
.byte 102,15,56,221,196
movdqa %xmm2,%xmm3
pslldq $4,%xmm2
pxor %xmm2,%xmm3
pslldq $4,%xmm2
pxor %xmm2,%xmm3
pslldq $4,%xmm2
pxor %xmm3,%xmm2
pxor %xmm2,%xmm0
movdqu %xmm0,16(%rax)
movl %esi,96(%rax)
xorl %eax,%eax
jmp L$enc_key_ret
.p2align 4
L$12rounds:
movq 16(%rdi),%xmm2
movl $11,%esi
cmpl $268435456,%r10d
je L$12rounds_alt
movups %xmm0,(%rdx)
.byte 102,15,58,223,202,1
call L$key_expansion_192a_cold
@ -3033,11 +3307,55 @@ L$12rounds:
xorq %rax,%rax
jmp L$enc_key_ret
.p2align 4
L$12rounds_alt:
movdqa L$key_rotate192(%rip),%xmm5
movdqa L$key_rcon1(%rip),%xmm4
movl $8,%r10d
movdqu %xmm0,(%rdx)
jmp L$oop_key192
.p2align 4
L$oop_key192:
movq %xmm2,0(%rax)
movdqa %xmm2,%xmm1
.byte 102,15,56,0,213
.byte 102,15,56,221,212
pslld $1,%xmm4
leaq 24(%rax),%rax
movdqa %xmm0,%xmm3
pslldq $4,%xmm0
pxor %xmm0,%xmm3
pslldq $4,%xmm0
pxor %xmm0,%xmm3
pslldq $4,%xmm0
pxor %xmm3,%xmm0
pshufd $255,%xmm0,%xmm3
pxor %xmm1,%xmm3
pslldq $4,%xmm1
pxor %xmm1,%xmm3
pxor %xmm2,%xmm0
pxor %xmm3,%xmm2
movdqu %xmm0,-16(%rax)
decl %r10d
jnz L$oop_key192
movl %esi,32(%rax)
xorl %eax,%eax
jmp L$enc_key_ret
.p2align 4
L$14rounds:
movups 16(%rdi),%xmm2
movl $13,%esi
leaq 16(%rax),%rax
cmpl $268435456,%r10d
je L$14rounds_alt
movups %xmm0,(%rdx)
movups %xmm2,16(%rdx)
.byte 102,15,58,223,202,1
@ -3071,10 +3389,70 @@ L$14rounds:
xorq %rax,%rax
jmp L$enc_key_ret
.p2align 4
L$14rounds_alt:
movdqa L$key_rotate(%rip),%xmm5
movdqa L$key_rcon1(%rip),%xmm4
movl $7,%r10d
movdqu %xmm0,0(%rdx)
movdqa %xmm2,%xmm1
movdqu %xmm2,16(%rdx)
jmp L$oop_key256
.p2align 4
L$oop_key256:
.byte 102,15,56,0,213
.byte 102,15,56,221,212
movdqa %xmm0,%xmm3
pslldq $4,%xmm0
pxor %xmm0,%xmm3
pslldq $4,%xmm0
pxor %xmm0,%xmm3
pslldq $4,%xmm0
pxor %xmm3,%xmm0
pslld $1,%xmm4
pxor %xmm2,%xmm0
movdqu %xmm0,(%rax)
decl %r10d
jz L$done_key256
pshufd $255,%xmm0,%xmm2
pxor %xmm3,%xmm3
.byte 102,15,56,221,211
movdqa %xmm1,%xmm3
pslldq $4,%xmm1
pxor %xmm1,%xmm3
pslldq $4,%xmm1
pxor %xmm1,%xmm3
pslldq $4,%xmm1
pxor %xmm3,%xmm1
pxor %xmm1,%xmm2
movdqu %xmm2,16(%rax)
leaq 32(%rax),%rax
movdqa %xmm2,%xmm1
jmp L$oop_key256
L$done_key256:
movl %esi,16(%rax)
xorl %eax,%eax
jmp L$enc_key_ret
.p2align 4
L$bad_keybits:
movq $-2,%rax
L$enc_key_ret:
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
pxor %xmm2,%xmm2
pxor %xmm3,%xmm3
pxor %xmm4,%xmm4
pxor %xmm5,%xmm5
addq $8,%rsp
.byte 0xf3,0xc3
L$SEH_end_set_encrypt_key:
@ -3160,6 +3538,14 @@ L$xts_magic:
.long 0x87,0,1,0
L$increment1:
.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
L$key_rotate:
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
L$key_rotate192:
.long 0x04070605,0x04070605,0x04070605,0x04070605
L$key_rcon1:
.long 1,1,1,1
L$key_rcon1b:
.long 0x1b,0x1b,0x1b,0x1b
.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.p2align 6

13
deps/openssl/asm/x64-macosx-gas/bn/x86_64-mont5.s

@ -2884,11 +2884,16 @@ L$sqrx4x_sub:
.p2align 4
_bn_get_bits5:
movq %rdi,%r10
leaq 0(%rdi),%r10
leaq 1(%rdi),%r11
movl %esi,%ecx
shrl $3,%esi
movzwl (%r10,%rsi,1),%eax
andl $7,%ecx
shrl $4,%esi
andl $15,%ecx
leal -8(%rcx),%eax
cmpl $11,%ecx
cmovaq %r11,%r10
cmoval %eax,%ecx
movzwl (%r10,%rsi,2),%eax
shrl %cl,%eax
andl $31,%eax
.byte 0xf3,0xc3

522
deps/openssl/asm/x64-win32-masm/aes/aesni-x86_64.asm

@ -18,7 +18,10 @@ DB 102,15,56,220,209
lea r8,QWORD PTR[16+r8]
jnz $L$oop_enc1_1
DB 102,15,56,221,209
pxor xmm0,xmm0
pxor xmm1,xmm1
movups XMMWORD PTR[rdx],xmm2
pxor xmm2,xmm2
DB 0F3h,0C3h ;repret
aesni_encrypt ENDP
@ -39,7 +42,10 @@ DB 102,15,56,222,209
lea r8,QWORD PTR[16+r8]
jnz $L$oop_dec1_2
DB 102,15,56,223,209
pxor xmm0,xmm0
pxor xmm1,xmm1
movups XMMWORD PTR[rdx],xmm2
pxor xmm2,xmm2
DB 0F3h,0C3h ;repret
aesni_decrypt ENDP
@ -265,21 +271,18 @@ DB 102,15,56,220,217
pxor xmm6,xmm0
DB 102,15,56,220,225
pxor xmm7,xmm0
movups xmm0,XMMWORD PTR[rax*1+rcx]
add rax,16
DB 102,15,56,220,233
DB 102,15,56,220,241
DB 102,15,56,220,249
movups xmm0,XMMWORD PTR[((-16))+rax*1+rcx]
jmp $L$enc_loop6_enter
ALIGN 16
$L$enc_loop6::
DB 102,15,56,220,209
DB 102,15,56,220,217
DB 102,15,56,220,225
$L$enc_loop6_enter::
DB 102,15,56,220,233
DB 102,15,56,220,241
DB 102,15,56,220,249
$L$enc_loop6_enter::
movups xmm1,XMMWORD PTR[rax*1+rcx]
add rax,32
DB 102,15,56,220,208
@ -322,21 +325,18 @@ DB 102,15,56,222,217
pxor xmm6,xmm0
DB 102,15,56,222,225
pxor xmm7,xmm0
movups xmm0,XMMWORD PTR[rax*1+rcx]
add rax,16
DB 102,15,56,222,233
DB 102,15,56,222,241
DB 102,15,56,222,249
movups xmm0,XMMWORD PTR[((-16))+rax*1+rcx]
jmp $L$dec_loop6_enter
ALIGN 16
$L$dec_loop6::
DB 102,15,56,222,209
DB 102,15,56,222,217
DB 102,15,56,222,225
$L$dec_loop6_enter::
DB 102,15,56,222,233
DB 102,15,56,222,241
DB 102,15,56,222,249
$L$dec_loop6_enter::
movups xmm1,XMMWORD PTR[rax*1+rcx]
add rax,32
DB 102,15,56,222,208
@ -376,23 +376,18 @@ _aesni_encrypt8 PROC PRIVATE
lea rcx,QWORD PTR[32+rax*1+rcx]
neg rax
DB 102,15,56,220,209
add rax,16
pxor xmm7,xmm0
DB 102,15,56,220,217
pxor xmm8,xmm0
DB 102,15,56,220,217
pxor xmm9,xmm0
DB 102,15,56,220,225
DB 102,15,56,220,233
DB 102,15,56,220,241
DB 102,15,56,220,249
DB 102,68,15,56,220,193
DB 102,68,15,56,220,201
movups xmm0,XMMWORD PTR[((-16))+rax*1+rcx]
jmp $L$enc_loop8_enter
movups xmm0,XMMWORD PTR[rax*1+rcx]
add rax,16
jmp $L$enc_loop8_inner
ALIGN 16
$L$enc_loop8::
DB 102,15,56,220,209
DB 102,15,56,220,217
$L$enc_loop8_inner::
DB 102,15,56,220,225
DB 102,15,56,220,233
DB 102,15,56,220,241
@ -445,23 +440,18 @@ _aesni_decrypt8 PROC PRIVATE
lea rcx,QWORD PTR[32+rax*1+rcx]
neg rax
DB 102,15,56,222,209
add rax,16
pxor xmm7,xmm0
DB 102,15,56,222,217
pxor xmm8,xmm0
DB 102,15,56,222,217
pxor xmm9,xmm0
DB 102,15,56,222,225
DB 102,15,56,222,233
DB 102,15,56,222,241
DB 102,15,56,222,249
DB 102,68,15,56,222,193
DB 102,68,15,56,222,201
movups xmm0,XMMWORD PTR[((-16))+rax*1+rcx]
jmp $L$dec_loop8_enter
movups xmm0,XMMWORD PTR[rax*1+rcx]
add rax,16
jmp $L$dec_loop8_inner
ALIGN 16
$L$dec_loop8::
DB 102,15,56,222,209
DB 102,15,56,222,217
$L$dec_loop8_inner::
DB 102,15,56,222,225
DB 102,15,56,222,233
DB 102,15,56,222,241
@ -605,6 +595,7 @@ $L$ecb_enc_tail::
movups xmm7,XMMWORD PTR[80+rdi]
je $L$ecb_enc_six
movdqu xmm8,XMMWORD PTR[96+rdi]
xorps xmm9,xmm9
call _aesni_encrypt8
movups XMMWORD PTR[rsi],xmm2
movups XMMWORD PTR[16+rsi],xmm3
@ -718,15 +709,23 @@ $L$ecb_dec_loop8_enter::
jnc $L$ecb_dec_loop8
movups XMMWORD PTR[rsi],xmm2
pxor xmm2,xmm2
mov rcx,r11
movups XMMWORD PTR[16+rsi],xmm3
pxor xmm3,xmm3
mov eax,r10d
movups XMMWORD PTR[32+rsi],xmm4
pxor xmm4,xmm4
movups XMMWORD PTR[48+rsi],xmm5
pxor xmm5,xmm5
movups XMMWORD PTR[64+rsi],xmm6
pxor xmm6,xmm6
movups XMMWORD PTR[80+rsi],xmm7
pxor xmm7,xmm7
movups XMMWORD PTR[96+rsi],xmm8
pxor xmm8,xmm8
movups XMMWORD PTR[112+rsi],xmm9
pxor xmm9,xmm9
lea rsi,QWORD PTR[128+rsi]
add rdx,080h
jz $L$ecb_ret
@ -749,14 +748,23 @@ $L$ecb_dec_tail::
je $L$ecb_dec_six
movups xmm8,XMMWORD PTR[96+rdi]
movups xmm0,XMMWORD PTR[rcx]
xorps xmm9,xmm9
call _aesni_decrypt8
movups XMMWORD PTR[rsi],xmm2
pxor xmm2,xmm2
movups XMMWORD PTR[16+rsi],xmm3
pxor xmm3,xmm3
movups XMMWORD PTR[32+rsi],xmm4
pxor xmm4,xmm4
movups XMMWORD PTR[48+rsi],xmm5
pxor xmm5,xmm5
movups XMMWORD PTR[64+rsi],xmm6
pxor xmm6,xmm6
movups XMMWORD PTR[80+rsi],xmm7
pxor xmm7,xmm7
movups XMMWORD PTR[96+rsi],xmm8
pxor xmm8,xmm8
pxor xmm9,xmm9
jmp $L$ecb_ret
ALIGN 16
$L$ecb_dec_one::
@ -772,53 +780,81 @@ DB 102,15,56,222,209
jnz $L$oop_dec1_4
DB 102,15,56,223,209
movups XMMWORD PTR[rsi],xmm2
pxor xmm2,xmm2
jmp $L$ecb_ret
ALIGN 16
$L$ecb_dec_two::
call _aesni_decrypt2
movups XMMWORD PTR[rsi],xmm2
pxor xmm2,xmm2
movups XMMWORD PTR[16+rsi],xmm3
pxor xmm3,xmm3
jmp $L$ecb_ret
ALIGN 16
$L$ecb_dec_three::
call _aesni_decrypt3
movups XMMWORD PTR[rsi],xmm2
pxor xmm2,xmm2
movups XMMWORD PTR[16+rsi],xmm3
pxor xmm3,xmm3
movups XMMWORD PTR[32+rsi],xmm4
pxor xmm4,xmm4
jmp $L$ecb_ret
ALIGN 16
$L$ecb_dec_four::
call _aesni_decrypt4
movups XMMWORD PTR[rsi],xmm2
pxor xmm2,xmm2
movups XMMWORD PTR[16+rsi],xmm3
pxor xmm3,xmm3
movups XMMWORD PTR[32+rsi],xmm4
pxor xmm4,xmm4
movups XMMWORD PTR[48+rsi],xmm5
pxor xmm5,xmm5
jmp $L$ecb_ret
ALIGN 16
$L$ecb_dec_five::
xorps xmm7,xmm7
call _aesni_decrypt6
movups XMMWORD PTR[rsi],xmm2
pxor xmm2,xmm2
movups XMMWORD PTR[16+rsi],xmm3
pxor xmm3,xmm3
movups XMMWORD PTR[32+rsi],xmm4
pxor xmm4,xmm4
movups XMMWORD PTR[48+rsi],xmm5
pxor xmm5,xmm5
movups XMMWORD PTR[64+rsi],xmm6
pxor xmm6,xmm6
pxor xmm7,xmm7
jmp $L$ecb_ret
ALIGN 16
$L$ecb_dec_six::
call _aesni_decrypt6
movups XMMWORD PTR[rsi],xmm2
pxor xmm2,xmm2
movups XMMWORD PTR[16+rsi],xmm3
pxor xmm3,xmm3
movups XMMWORD PTR[32+rsi],xmm4
pxor xmm4,xmm4
movups XMMWORD PTR[48+rsi],xmm5
pxor xmm5,xmm5
movups XMMWORD PTR[64+rsi],xmm6
pxor xmm6,xmm6
movups XMMWORD PTR[80+rsi],xmm7
pxor xmm7,xmm7
$L$ecb_ret::
xorps xmm0,xmm0
pxor xmm1,xmm1
movaps xmm6,XMMWORD PTR[rsp]
movaps XMMWORD PTR[rsp],xmm0
movaps xmm7,XMMWORD PTR[16+rsp]
movaps XMMWORD PTR[16+rsp],xmm0
movaps xmm8,XMMWORD PTR[32+rsp]
movaps XMMWORD PTR[32+rsp],xmm0
movaps xmm9,XMMWORD PTR[48+rsp]
movaps XMMWORD PTR[48+rsp],xmm0
lea rsp,QWORD PTR[88+rsp]
$L$ecb_enc_ret::
mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue
@ -898,11 +934,21 @@ DB 102,15,56,0,215
lea rsi,QWORD PTR[16+rsi]
jnz $L$ccm64_enc_outer
pxor xmm0,xmm0
pxor xmm1,xmm1
pxor xmm2,xmm2
movups XMMWORD PTR[r9],xmm3
pxor xmm3,xmm3
pxor xmm8,xmm8
pxor xmm6,xmm6
movaps xmm6,XMMWORD PTR[rsp]
movaps XMMWORD PTR[rsp],xmm0
movaps xmm7,XMMWORD PTR[16+rsp]
movaps XMMWORD PTR[16+rsp],xmm0
movaps xmm8,XMMWORD PTR[32+rsp]
movaps XMMWORD PTR[32+rsp],xmm0
movaps xmm9,XMMWORD PTR[48+rsp]
movaps XMMWORD PTR[48+rsp],xmm0
lea rsp,QWORD PTR[88+rsp]
$L$ccm64_enc_ret::
mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue
@ -1016,11 +1062,21 @@ DB 102,15,56,220,217
lea r11,QWORD PTR[16+r11]
jnz $L$oop_enc1_6
DB 102,15,56,221,217
pxor xmm0,xmm0
pxor xmm1,xmm1
pxor xmm2,xmm2
movups XMMWORD PTR[r9],xmm3
pxor xmm3,xmm3
pxor xmm8,xmm8
pxor xmm6,xmm6
movaps xmm6,XMMWORD PTR[rsp]
movaps XMMWORD PTR[rsp],xmm0
movaps xmm7,XMMWORD PTR[16+rsp]
movaps XMMWORD PTR[16+rsp],xmm0
movaps xmm8,XMMWORD PTR[32+rsp]
movaps XMMWORD PTR[32+rsp],xmm0
movaps xmm9,XMMWORD PTR[48+rsp]
movaps XMMWORD PTR[48+rsp],xmm0
lea rsp,QWORD PTR[88+rsp]
$L$ccm64_dec_ret::
mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue
@ -1043,6 +1099,35 @@ $L$SEH_begin_aesni_ctr32_encrypt_blocks::
mov r8,QWORD PTR[40+rsp]
cmp rdx,1
jne $L$ctr32_bulk
movups xmm2,XMMWORD PTR[r8]
movups xmm3,XMMWORD PTR[rdi]
mov edx,DWORD PTR[240+rcx]
movups xmm0,XMMWORD PTR[rcx]
movups xmm1,XMMWORD PTR[16+rcx]
lea rcx,QWORD PTR[32+rcx]
xorps xmm2,xmm0
$L$oop_enc1_7::
DB 102,15,56,220,209
dec edx
movups xmm1,XMMWORD PTR[rcx]
lea rcx,QWORD PTR[16+rcx]
jnz $L$oop_enc1_7
DB 102,15,56,221,209
pxor xmm0,xmm0
pxor xmm1,xmm1
xorps xmm2,xmm3
pxor xmm3,xmm3
movups XMMWORD PTR[rsi],xmm2
xorps xmm2,xmm2
jmp $L$ctr32_epilogue
ALIGN 16
$L$ctr32_bulk::
lea rax,QWORD PTR[rsp]
push rbp
sub rsp,288
@ -1060,8 +1145,8 @@ $L$SEH_begin_aesni_ctr32_encrypt_blocks::
$L$ctr32_body::
lea rbp,QWORD PTR[((-8))+rax]
cmp rdx,1
je $L$ctr32_one_shortcut
movdqu xmm2,XMMWORD PTR[r8]
movdqu xmm0,XMMWORD PTR[rcx]
@ -1452,11 +1537,14 @@ DB 102,69,15,56,221,202
lea rcx,QWORD PTR[((-128))+rcx]
$L$ctr32_tail::
lea rcx,QWORD PTR[16+rcx]
cmp rdx,4
jb $L$ctr32_loop3
je $L$ctr32_loop4
shl eax,4
movdqa xmm8,XMMWORD PTR[96+rsp]
pxor xmm9,xmm9
@ -1559,40 +1647,43 @@ DB 102,15,56,221,225
movups xmm12,XMMWORD PTR[32+rdi]
xorps xmm4,xmm12
movups XMMWORD PTR[32+rsi],xmm4
jmp $L$ctr32_done
ALIGN 16
$L$ctr32_one_shortcut::
movups xmm2,XMMWORD PTR[r8]
movups xmm10,XMMWORD PTR[rdi]
mov eax,DWORD PTR[240+rcx]
movups xmm0,XMMWORD PTR[rcx]
movups xmm1,XMMWORD PTR[16+rcx]
lea rcx,QWORD PTR[32+rcx]
xorps xmm2,xmm0
$L$oop_enc1_7::
DB 102,15,56,220,209
dec eax
movups xmm1,XMMWORD PTR[rcx]
lea rcx,QWORD PTR[16+rcx]
jnz $L$oop_enc1_7
DB 102,15,56,221,209
xorps xmm2,xmm10
movups XMMWORD PTR[rsi],xmm2
jmp $L$ctr32_done
ALIGN 16
$L$ctr32_done::
xorps xmm0,xmm0
xor r11d,r11d
pxor xmm1,xmm1
pxor xmm2,xmm2
pxor xmm3,xmm3
pxor xmm4,xmm4
pxor xmm5,xmm5
movaps xmm6,XMMWORD PTR[((-160))+rbp]
movaps XMMWORD PTR[(-160)+rbp],xmm0
movaps xmm7,XMMWORD PTR[((-144))+rbp]
movaps XMMWORD PTR[(-144)+rbp],xmm0
movaps xmm8,XMMWORD PTR[((-128))+rbp]
movaps XMMWORD PTR[(-128)+rbp],xmm0
movaps xmm9,XMMWORD PTR[((-112))+rbp]
movaps XMMWORD PTR[(-112)+rbp],xmm0
movaps xmm10,XMMWORD PTR[((-96))+rbp]
movaps XMMWORD PTR[(-96)+rbp],xmm0
movaps xmm11,XMMWORD PTR[((-80))+rbp]
movaps XMMWORD PTR[(-80)+rbp],xmm0
movaps xmm12,XMMWORD PTR[((-64))+rbp]
movaps XMMWORD PTR[(-64)+rbp],xmm0
movaps xmm13,XMMWORD PTR[((-48))+rbp]
movaps XMMWORD PTR[(-48)+rbp],xmm0
movaps xmm14,XMMWORD PTR[((-32))+rbp]
movaps XMMWORD PTR[(-32)+rbp],xmm0
movaps xmm15,XMMWORD PTR[((-16))+rbp]
movaps XMMWORD PTR[(-16)+rbp],xmm0
movaps XMMWORD PTR[rsp],xmm0
movaps XMMWORD PTR[16+rsp],xmm0
movaps XMMWORD PTR[32+rsp],xmm0
movaps XMMWORD PTR[48+rsp],xmm0
movaps XMMWORD PTR[64+rsp],xmm0
movaps XMMWORD PTR[80+rsp],xmm0
movaps XMMWORD PTR[96+rsp],xmm0
movaps XMMWORD PTR[112+rsp],xmm0
lea rsp,QWORD PTR[rbp]
pop rbp
$L$ctr32_epilogue::
@ -1889,6 +1980,7 @@ DB 102,15,56,221,124,36,80
shr eax,4
$L$xts_enc_short::
mov r10d,eax
pxor xmm10,xmm0
add rdx,16*6
@ -1917,6 +2009,7 @@ $L$xts_enc_short::
pxor xmm4,xmm12
pxor xmm5,xmm13
pxor xmm6,xmm14
pxor xmm7,xmm7
call _aesni_encrypt6
@ -2059,16 +2152,39 @@ DB 102,15,56,221,209
movups XMMWORD PTR[(-16)+rsi],xmm2
$L$xts_enc_ret::
xorps xmm0,xmm0
pxor xmm1,xmm1
pxor xmm2,xmm2
pxor xmm3,xmm3
pxor xmm4,xmm4
pxor xmm5,xmm5
movaps xmm6,XMMWORD PTR[((-160))+rbp]
movaps XMMWORD PTR[(-160)+rbp],xmm0
movaps xmm7,XMMWORD PTR[((-144))+rbp]
movaps XMMWORD PTR[(-144)+rbp],xmm0
movaps xmm8,XMMWORD PTR[((-128))+rbp]
movaps XMMWORD PTR[(-128)+rbp],xmm0
movaps xmm9,XMMWORD PTR[((-112))+rbp]
movaps XMMWORD PTR[(-112)+rbp],xmm0
movaps xmm10,XMMWORD PTR[((-96))+rbp]
movaps XMMWORD PTR[(-96)+rbp],xmm0
movaps xmm11,XMMWORD PTR[((-80))+rbp]
movaps XMMWORD PTR[(-80)+rbp],xmm0
movaps xmm12,XMMWORD PTR[((-64))+rbp]
movaps XMMWORD PTR[(-64)+rbp],xmm0
movaps xmm13,XMMWORD PTR[((-48))+rbp]
movaps XMMWORD PTR[(-48)+rbp],xmm0
movaps xmm14,XMMWORD PTR[((-32))+rbp]
movaps XMMWORD PTR[(-32)+rbp],xmm0
movaps xmm15,XMMWORD PTR[((-16))+rbp]
movaps XMMWORD PTR[(-16)+rbp],xmm0
movaps XMMWORD PTR[rsp],xmm0
movaps XMMWORD PTR[16+rsp],xmm0
movaps XMMWORD PTR[32+rsp],xmm0
movaps XMMWORD PTR[48+rsp],xmm0
movaps XMMWORD PTR[64+rsp],xmm0
movaps XMMWORD PTR[80+rsp],xmm0
movaps XMMWORD PTR[96+rsp],xmm0
lea rsp,QWORD PTR[rbp]
pop rbp
$L$xts_enc_epilogue::
@ -2371,6 +2487,7 @@ DB 102,15,56,223,124,36,80
shr eax,4
$L$xts_dec_short::
mov r10d,eax
pxor xmm10,xmm0
pxor xmm11,xmm0
@ -2573,16 +2690,39 @@ DB 102,15,56,223,209
movups XMMWORD PTR[rsi],xmm2
$L$xts_dec_ret::
xorps xmm0,xmm0
pxor xmm1,xmm1
pxor xmm2,xmm2
pxor xmm3,xmm3
pxor xmm4,xmm4
pxor xmm5,xmm5
movaps xmm6,XMMWORD PTR[((-160))+rbp]
movaps XMMWORD PTR[(-160)+rbp],xmm0
movaps xmm7,XMMWORD PTR[((-144))+rbp]
movaps XMMWORD PTR[(-144)+rbp],xmm0
movaps xmm8,XMMWORD PTR[((-128))+rbp]
movaps XMMWORD PTR[(-128)+rbp],xmm0
movaps xmm9,XMMWORD PTR[((-112))+rbp]
movaps XMMWORD PTR[(-112)+rbp],xmm0
movaps xmm10,XMMWORD PTR[((-96))+rbp]
movaps XMMWORD PTR[(-96)+rbp],xmm0
movaps xmm11,XMMWORD PTR[((-80))+rbp]
movaps XMMWORD PTR[(-80)+rbp],xmm0
movaps xmm12,XMMWORD PTR[((-64))+rbp]
movaps XMMWORD PTR[(-64)+rbp],xmm0
movaps xmm13,XMMWORD PTR[((-48))+rbp]
movaps XMMWORD PTR[(-48)+rbp],xmm0
movaps xmm14,XMMWORD PTR[((-32))+rbp]
movaps XMMWORD PTR[(-32)+rbp],xmm0
movaps xmm15,XMMWORD PTR[((-16))+rbp]
movaps XMMWORD PTR[(-16)+rbp],xmm0
movaps XMMWORD PTR[rsp],xmm0
movaps XMMWORD PTR[16+rsp],xmm0
movaps XMMWORD PTR[32+rsp],xmm0
movaps XMMWORD PTR[48+rsp],xmm0
movaps XMMWORD PTR[64+rsp],xmm0
movaps XMMWORD PTR[80+rsp],xmm0
movaps XMMWORD PTR[96+rsp],xmm0
lea rsp,QWORD PTR[rbp]
pop rbp
$L$xts_dec_epilogue::
@ -2646,7 +2786,11 @@ DB 102,15,56,221,209
jnc $L$cbc_enc_loop
add rdx,16
jnz $L$cbc_enc_tail
pxor xmm0,xmm0
pxor xmm1,xmm1
movups XMMWORD PTR[r8],xmm2
pxor xmm2,xmm2
pxor xmm3,xmm3
jmp $L$cbc_ret
$L$cbc_enc_tail::
@ -2666,6 +2810,35 @@ $L$cbc_enc_tail::
ALIGN 16
$L$cbc_decrypt::
cmp rdx,16
jne $L$cbc_decrypt_bulk
movdqu xmm2,XMMWORD PTR[rdi]
movdqu xmm3,XMMWORD PTR[r8]
movdqa xmm4,xmm2
movups xmm0,XMMWORD PTR[rcx]
movups xmm1,XMMWORD PTR[16+rcx]
lea rcx,QWORD PTR[32+rcx]
xorps xmm2,xmm0
$L$oop_dec1_16::
DB 102,15,56,222,209
dec r10d
movups xmm1,XMMWORD PTR[rcx]
lea rcx,QWORD PTR[16+rcx]
jnz $L$oop_dec1_16
DB 102,15,56,223,209
pxor xmm0,xmm0
pxor xmm1,xmm1
movdqu XMMWORD PTR[r8],xmm4
xorps xmm2,xmm3
pxor xmm3,xmm3
movups XMMWORD PTR[rsi],xmm2
pxor xmm2,xmm2
jmp $L$cbc_ret
ALIGN 16
$L$cbc_decrypt_bulk::
lea rax,QWORD PTR[rsp]
push rbp
sub rsp,176
@ -2913,7 +3086,7 @@ DB 102,69,15,56,223,202
movaps xmm2,xmm9
lea rcx,QWORD PTR[((-112))+rcx]
add rdx,070h
jle $L$cbc_dec_tail_collected
jle $L$cbc_dec_clear_tail_collected
movups XMMWORD PTR[rsi],xmm9
lea rsi,QWORD PTR[16+rsi]
cmp rdx,050h
@ -2932,14 +3105,19 @@ $L$cbc_dec_six_or_seven::
movdqu XMMWORD PTR[rsi],xmm2
pxor xmm4,xmm12
movdqu XMMWORD PTR[16+rsi],xmm3
pxor xmm3,xmm3
pxor xmm5,xmm13
movdqu XMMWORD PTR[32+rsi],xmm4
pxor xmm4,xmm4
pxor xmm6,xmm14
movdqu XMMWORD PTR[48+rsi],xmm5
pxor xmm5,xmm5
pxor xmm7,xmm15
movdqu XMMWORD PTR[64+rsi],xmm6
pxor xmm6,xmm6
lea rsi,QWORD PTR[80+rsi]
movdqa xmm2,xmm7
pxor xmm7,xmm7
jmp $L$cbc_dec_tail_collected
ALIGN 16
@ -2954,16 +3132,23 @@ $L$cbc_dec_seven::
movdqu XMMWORD PTR[rsi],xmm2
pxor xmm4,xmm12
movdqu XMMWORD PTR[16+rsi],xmm3
pxor xmm3,xmm3
pxor xmm5,xmm13
movdqu XMMWORD PTR[32+rsi],xmm4
pxor xmm4,xmm4
pxor xmm6,xmm14
movdqu XMMWORD PTR[48+rsi],xmm5
pxor xmm5,xmm5
pxor xmm7,xmm15
movdqu XMMWORD PTR[64+rsi],xmm6
pxor xmm6,xmm6
pxor xmm8,xmm9
movdqu XMMWORD PTR[80+rsi],xmm7
pxor xmm7,xmm7
lea rsi,QWORD PTR[96+rsi]
movdqa xmm2,xmm8
pxor xmm8,xmm8
pxor xmm9,xmm9
jmp $L$cbc_dec_tail_collected
ALIGN 16
@ -3007,7 +3192,7 @@ $L$cbc_dec_loop6_enter::
movdqa xmm2,xmm7
add rdx,050h
jle $L$cbc_dec_tail_collected
jle $L$cbc_dec_clear_tail_collected
movups XMMWORD PTR[rsi],xmm7
lea rsi,QWORD PTR[16+rsi]
@ -3042,12 +3227,17 @@ $L$cbc_dec_tail::
movdqu XMMWORD PTR[rsi],xmm2
pxor xmm4,xmm12
movdqu XMMWORD PTR[16+rsi],xmm3
pxor xmm3,xmm3
pxor xmm5,xmm13
movdqu XMMWORD PTR[32+rsi],xmm4
pxor xmm4,xmm4
pxor xmm6,xmm14
movdqu XMMWORD PTR[48+rsi],xmm5
pxor xmm5,xmm5
lea rsi,QWORD PTR[64+rsi]
movdqa xmm2,xmm6
pxor xmm6,xmm6
pxor xmm7,xmm7
sub rdx,010h
jmp $L$cbc_dec_tail_collected
@ -3058,12 +3248,12 @@ $L$cbc_dec_one::
movups xmm1,XMMWORD PTR[16+rcx]
lea rcx,QWORD PTR[32+rcx]
xorps xmm2,xmm0
$L$oop_dec1_16::
$L$oop_dec1_17::
DB 102,15,56,222,209
dec eax
movups xmm1,XMMWORD PTR[rcx]
lea rcx,QWORD PTR[16+rcx]
jnz $L$oop_dec1_16
jnz $L$oop_dec1_17
DB 102,15,56,223,209
xorps xmm2,xmm10
movaps xmm10,xmm11
@ -3077,6 +3267,7 @@ $L$cbc_dec_two::
pxor xmm3,xmm11
movdqu XMMWORD PTR[rsi],xmm2
movdqa xmm2,xmm3
pxor xmm3,xmm3
lea rsi,QWORD PTR[16+rsi]
jmp $L$cbc_dec_tail_collected
ALIGN 16
@ -3089,7 +3280,9 @@ $L$cbc_dec_three::
movdqu XMMWORD PTR[rsi],xmm2
pxor xmm4,xmm12
movdqu XMMWORD PTR[16+rsi],xmm3
pxor xmm3,xmm3
movdqa xmm2,xmm4
pxor xmm4,xmm4
lea rsi,QWORD PTR[32+rsi]
jmp $L$cbc_dec_tail_collected
ALIGN 16
@ -3102,39 +3295,61 @@ $L$cbc_dec_four::
movdqu XMMWORD PTR[rsi],xmm2
pxor xmm4,xmm12
movdqu XMMWORD PTR[16+rsi],xmm3
pxor xmm3,xmm3
pxor xmm5,xmm13
movdqu XMMWORD PTR[32+rsi],xmm4
pxor xmm4,xmm4
movdqa xmm2,xmm5
pxor xmm5,xmm5
lea rsi,QWORD PTR[48+rsi]
jmp $L$cbc_dec_tail_collected
ALIGN 16
$L$cbc_dec_clear_tail_collected::
pxor xmm3,xmm3
pxor xmm4,xmm4
pxor xmm5,xmm5
$L$cbc_dec_tail_collected::
movups XMMWORD PTR[r8],xmm10
and rdx,15
jnz $L$cbc_dec_tail_partial
movups XMMWORD PTR[rsi],xmm2
pxor xmm2,xmm2
jmp $L$cbc_dec_ret
ALIGN 16
$L$cbc_dec_tail_partial::
movaps XMMWORD PTR[rsp],xmm2
pxor xmm2,xmm2
mov rcx,16
mov rdi,rsi
sub rcx,rdx
lea rsi,QWORD PTR[rsp]
DD 09066A4F3h
movdqa XMMWORD PTR[rsp],xmm2
$L$cbc_dec_ret::
xorps xmm0,xmm0
pxor xmm1,xmm1
movaps xmm6,XMMWORD PTR[16+rsp]
movaps XMMWORD PTR[16+rsp],xmm0
movaps xmm7,XMMWORD PTR[32+rsp]
movaps XMMWORD PTR[32+rsp],xmm0
movaps xmm8,XMMWORD PTR[48+rsp]
movaps XMMWORD PTR[48+rsp],xmm0
movaps xmm9,XMMWORD PTR[64+rsp]
movaps XMMWORD PTR[64+rsp],xmm0
movaps xmm10,XMMWORD PTR[80+rsp]
movaps XMMWORD PTR[80+rsp],xmm0
movaps xmm11,XMMWORD PTR[96+rsp]
movaps XMMWORD PTR[96+rsp],xmm0
movaps xmm12,XMMWORD PTR[112+rsp]
movaps XMMWORD PTR[112+rsp],xmm0
movaps xmm13,XMMWORD PTR[128+rsp]
movaps XMMWORD PTR[128+rsp],xmm0
movaps xmm14,XMMWORD PTR[144+rsp]
movaps XMMWORD PTR[144+rsp],xmm0
movaps xmm15,XMMWORD PTR[160+rsp]
movaps XMMWORD PTR[160+rsp],xmm0
lea rsp,QWORD PTR[rbp]
pop rbp
$L$cbc_ret::
@ -3175,7 +3390,9 @@ DB 102,15,56,219,201
movups xmm0,XMMWORD PTR[r8]
DB 102,15,56,219,192
pxor xmm1,xmm1
movups XMMWORD PTR[rcx],xmm0
pxor xmm0,xmm0
$L$dec_key_ret::
add rsp,8
DB 0F3h,0C3h ;repret
@ -3193,8 +3410,10 @@ DB 048h,083h,0ECh,008h
test r8,r8
jz $L$enc_key_ret
mov r10d,268437504
movups xmm0,XMMWORD PTR[rcx]
xorps xmm4,xmm4
and r10d,DWORD PTR[((OPENSSL_ia32cap_P+4))]
lea rax,QWORD PTR[16+r8]
cmp edx,256
je $L$14rounds
@ -3205,6 +3424,9 @@ DB 048h,083h,0ECh,008h
$L$10rounds::
mov edx,9
cmp r10d,268435456
je $L$10rounds_alt
movups XMMWORD PTR[r8],xmm0
DB 102,15,58,223,200,1
call $L$key_expansion_128_cold
@ -3231,10 +3453,80 @@ DB 102,15,58,223,200,54
xor eax,eax
jmp $L$enc_key_ret
ALIGN 16
$L$10rounds_alt::
movdqa xmm5,XMMWORD PTR[$L$key_rotate]
mov r10d,8
movdqa xmm4,XMMWORD PTR[$L$key_rcon1]
movdqa xmm2,xmm0
movdqu XMMWORD PTR[r8],xmm0
jmp $L$oop_key128
ALIGN 16
$L$oop_key128::
DB 102,15,56,0,197
DB 102,15,56,221,196
pslld xmm4,1
lea rax,QWORD PTR[16+rax]
movdqa xmm3,xmm2
pslldq xmm2,4
pxor xmm3,xmm2
pslldq xmm2,4
pxor xmm3,xmm2
pslldq xmm2,4
pxor xmm2,xmm3
pxor xmm0,xmm2
movdqu XMMWORD PTR[(-16)+rax],xmm0
movdqa xmm2,xmm0
dec r10d
jnz $L$oop_key128
movdqa xmm4,XMMWORD PTR[$L$key_rcon1b]
DB 102,15,56,0,197
DB 102,15,56,221,196
pslld xmm4,1
movdqa xmm3,xmm2
pslldq xmm2,4
pxor xmm3,xmm2
pslldq xmm2,4
pxor xmm3,xmm2
pslldq xmm2,4
pxor xmm2,xmm3
pxor xmm0,xmm2
movdqu XMMWORD PTR[rax],xmm0
movdqa xmm2,xmm0
DB 102,15,56,0,197
DB 102,15,56,221,196
movdqa xmm3,xmm2
pslldq xmm2,4
pxor xmm3,xmm2
pslldq xmm2,4
pxor xmm3,xmm2
pslldq xmm2,4
pxor xmm2,xmm3
pxor xmm0,xmm2
movdqu XMMWORD PTR[16+rax],xmm0
mov DWORD PTR[96+rax],edx
xor eax,eax
jmp $L$enc_key_ret
ALIGN 16
$L$12rounds::
movq xmm2,QWORD PTR[16+rcx]
mov edx,11
cmp r10d,268435456
je $L$12rounds_alt
movups XMMWORD PTR[r8],xmm0
DB 102,15,58,223,202,1
call $L$key_expansion_192a_cold
@ -3257,11 +3549,55 @@ DB 102,15,58,223,202,128
xor rax,rax
jmp $L$enc_key_ret
ALIGN 16
$L$12rounds_alt::
movdqa xmm5,XMMWORD PTR[$L$key_rotate192]
movdqa xmm4,XMMWORD PTR[$L$key_rcon1]
mov r10d,8
movdqu XMMWORD PTR[r8],xmm0
jmp $L$oop_key192
ALIGN 16
$L$oop_key192::
movq QWORD PTR[rax],xmm2
movdqa xmm1,xmm2
DB 102,15,56,0,213
DB 102,15,56,221,212
pslld xmm4,1
lea rax,QWORD PTR[24+rax]
movdqa xmm3,xmm0
pslldq xmm0,4
pxor xmm3,xmm0
pslldq xmm0,4
pxor xmm3,xmm0
pslldq xmm0,4
pxor xmm0,xmm3
pshufd xmm3,xmm0,0ffh
pxor xmm3,xmm1
pslldq xmm1,4
pxor xmm3,xmm1
pxor xmm0,xmm2
pxor xmm2,xmm3
movdqu XMMWORD PTR[(-16)+rax],xmm0
dec r10d
jnz $L$oop_key192
mov DWORD PTR[32+rax],edx
xor eax,eax
jmp $L$enc_key_ret
ALIGN 16
$L$14rounds::
movups xmm2,XMMWORD PTR[16+rcx]
mov edx,13
lea rax,QWORD PTR[16+rax]
cmp r10d,268435456
je $L$14rounds_alt
movups XMMWORD PTR[r8],xmm0
movups XMMWORD PTR[16+r8],xmm2
DB 102,15,58,223,202,1
@ -3295,10 +3631,70 @@ DB 102,15,58,223,202,64
xor rax,rax
jmp $L$enc_key_ret
ALIGN 16
$L$14rounds_alt::
movdqa xmm5,XMMWORD PTR[$L$key_rotate]
movdqa xmm4,XMMWORD PTR[$L$key_rcon1]
mov r10d,7
movdqu XMMWORD PTR[r8],xmm0
movdqa xmm1,xmm2
movdqu XMMWORD PTR[16+r8],xmm2
jmp $L$oop_key256
ALIGN 16
$L$oop_key256::
DB 102,15,56,0,213
DB 102,15,56,221,212
movdqa xmm3,xmm0
pslldq xmm0,4
pxor xmm3,xmm0
pslldq xmm0,4
pxor xmm3,xmm0
pslldq xmm0,4
pxor xmm0,xmm3
pslld xmm4,1
pxor xmm0,xmm2
movdqu XMMWORD PTR[rax],xmm0
dec r10d
jz $L$done_key256
pshufd xmm2,xmm0,0ffh
pxor xmm3,xmm3
DB 102,15,56,221,211
movdqa xmm3,xmm1
pslldq xmm1,4
pxor xmm3,xmm1
pslldq xmm1,4
pxor xmm3,xmm1
pslldq xmm1,4
pxor xmm1,xmm3
pxor xmm2,xmm1
movdqu XMMWORD PTR[16+rax],xmm2
lea rax,QWORD PTR[32+rax]
movdqa xmm1,xmm2
jmp $L$oop_key256
$L$done_key256::
mov DWORD PTR[16+rax],edx
xor eax,eax
jmp $L$enc_key_ret
ALIGN 16
$L$bad_keybits::
mov rax,-2
$L$enc_key_ret::
pxor xmm0,xmm0
pxor xmm1,xmm1
pxor xmm2,xmm2
pxor xmm3,xmm3
pxor xmm4,xmm4
pxor xmm5,xmm5
add rsp,8
DB 0F3h,0C3h ;repret
$L$SEH_end_set_encrypt_key::
@ -3384,6 +3780,14 @@ $L$xts_magic::
DD 087h,0,1,0
$L$increment1::
DB 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
$L$key_rotate::
DD 00c0f0e0dh,00c0f0e0dh,00c0f0e0dh,00c0f0e0dh
$L$key_rotate192::
DD 004070605h,004070605h,004070605h,004070605h
$L$key_rcon1::
DD 1,1,1,1
$L$key_rcon1b::
DD 01bh,01bh,01bh,01bh
DB 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
DB 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
@ -3489,7 +3893,7 @@ cbc_se_handler PROC PRIVATE
mov rax,QWORD PTR[152+r8]
mov rbx,QWORD PTR[248+r8]
lea r10,QWORD PTR[$L$cbc_decrypt]
lea r10,QWORD PTR[$L$cbc_decrypt_bulk]
cmp rbx,r10
jb $L$common_seh_tail

13
deps/openssl/asm/x64-win32-masm/bn/x86_64-mont5.asm

@ -3001,11 +3001,16 @@ PUBLIC bn_get_bits5
ALIGN 16
bn_get_bits5 PROC PUBLIC
mov r10,rcx
lea r10,QWORD PTR[rcx]
lea r11,QWORD PTR[1+rcx]
mov ecx,edx
shr edx,3
movzx eax,WORD PTR[rdx*1+r10]
and ecx,7
shr edx,4
and ecx,15
lea eax,DWORD PTR[((-8))+rcx]
cmp ecx,11
cmova r10,r11
cmova ecx,eax
movzx eax,WORD PTR[rdx*2+r10]
shr eax,cl
and eax,31
DB 0F3h,0C3h ;repret

790
deps/openssl/asm/x86-elf-gas/aes/aesni-x86.s

File diff suppressed because it is too large

794
deps/openssl/asm/x86-macosx-gas/aes/aesni-x86.s

File diff suppressed because it is too large

793
deps/openssl/asm/x86-win32-masm/aes/aesni-x86.asm

File diff suppressed because it is too large

206
deps/openssl/asm_obsolete/arm-void-gas/aes/aesv8-armx.S

@ -230,17 +230,17 @@ aes_v8_encrypt:
.Loop_enc:
.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0
vld1.32 {q0},[r2]!
.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
vld1.32 {q0},[r2]!
subs r3,r3,#2
.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1
vld1.32 {q1},[r2]!
.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
vld1.32 {q1},[r2]!
bgt .Loop_enc
.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0
vld1.32 {q0},[r2]
.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
vld1.32 {q0},[r2]
.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1
veor q2,q2,q0
@ -259,17 +259,17 @@ aes_v8_decrypt:
.Loop_dec:
.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0
vld1.32 {q0},[r2]!
.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
vld1.32 {q0},[r2]!
subs r3,r3,#2
.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1
vld1.32 {q1},[r2]!
.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
vld1.32 {q1},[r2]!
bgt .Loop_dec
.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0
vld1.32 {q0},[r2]
.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
vld1.32 {q0},[r2]
.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1
veor q2,q2,q0
@ -313,16 +313,42 @@ aes_v8_cbc_encrypt:
veor q5,q8,q7
beq .Lcbc_enc128
vld1.32 {q2-q3},[r7]
add r7,r3,#16
add r6,r3,#16*4
add r12,r3,#16*5
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
add r14,r3,#16*6
add r3,r3,#16*7
b .Lenter_cbc_enc
.align 4
.Loop_cbc_enc:
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
vld1.32 {q8},[r7]!
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
subs r6,r6,#2
vst1.8 {q6},[r1]!
.Lenter_cbc_enc:
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
vld1.32 {q9},[r7]!
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
bgt .Loop_cbc_enc
.byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vld1.32 {q8},[r6]
cmp r5,#4
.byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vld1.32 {q9},[r12]
beq .Lcbc_enc192
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vld1.32 {q8},[r14]
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vld1.32 {q9},[r3]
nop
.Lcbc_enc192:
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
subs r2,r2,#16
@ -331,7 +357,6 @@ aes_v8_cbc_encrypt:
moveq r8,#0
.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
add r7,r3,#16
.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vld1.8 {q8},[r0],r8
@ -340,16 +365,14 @@ aes_v8_cbc_encrypt:
veor q8,q8,q5
.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vld1.32 {q9},[r7]! @ re-pre-load rndkey[1]
vld1.32 {q9},[r7] @ re-pre-load rndkey[1]
.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15
mov r6,r5
veor q6,q0,q7
vst1.8 {q6},[r1]!
bhs .Loop_cbc_enc
vst1.8 {q6},[r1]!
b .Lcbc_done
.align 5
@ -407,79 +430,78 @@ aes_v8_cbc_encrypt:
.Loop3x_cbc_dec:
.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8
.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
vld1.32 {q8},[r7]!
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
vld1.32 {q8},[r7]!
subs r6,r6,#2
.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9
.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
vld1.32 {q9},[r7]!
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
vld1.32 {q9},[r7]!
bgt .Loop3x_cbc_dec
.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8
.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
veor q4,q6,q7
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
veor q4,q6,q7
subs r2,r2,#0x30
veor q5,q2,q7
movlo r6,r2 @ r6, r6, is zero at this point
.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9
.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
veor q9,q3,q7
subs r2,r2,#0x30
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
vorr q6,q11,q11
movlo r6,r2 @ r6, r6, is zero at this point
.byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12
.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12
.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12
veor q9,q3,q7
add r0,r0,r6 @ r0 is adjusted in such way that
@ at exit from the loop q1-q10
@ are loaded with last "words"
vorr q6,q11,q11
mov r7,r3
.byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
mov r7,r3
.byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13
.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13
.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13
vld1.8 {q2},[r0]!
.byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
vld1.8 {q3},[r0]!
.byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14
.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14
.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14
vld1.8 {q11},[r0]!
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
vld1.32 {q8},[r7]! @ re-pre-load rndkey[0]
vld1.8 {q11},[r0]!
.byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15
.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15
.byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15
vld1.32 {q8},[r7]! @ re-pre-load rndkey[0]
add r6,r5,#2
veor q4,q4,q0
veor q5,q5,q1
veor q10,q10,q9
vld1.32 {q9},[r7]! @ re-pre-load rndkey[1]
vorr q0,q2,q2
vst1.8 {q4},[r1]!
vorr q1,q3,q3
vorr q0,q2,q2
vst1.8 {q5},[r1]!
vorr q1,q3,q3
vst1.8 {q10},[r1]!
vorr q10,q11,q11
bhs .Loop3x_cbc_dec
@ -490,39 +512,39 @@ aes_v8_cbc_encrypt:
.Lcbc_dec_tail:
.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
vld1.32 {q8},[r7]!
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
vld1.32 {q8},[r7]!
subs r6,r6,#2
.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
vld1.32 {q9},[r7]!
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
vld1.32 {q9},[r7]!
bgt .Lcbc_dec_tail
.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12
.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
cmn r2,#0x20
.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13
.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
veor q5,q6,q7
.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14
.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
veor q9,q3,q7
.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15
@ -590,70 +612,69 @@ aes_v8_ctr32_encrypt_blocks:
.align 4
.Loop3x_ctr32:
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8
vld1.32 {q8},[r7]!
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8
.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10
vld1.32 {q8},[r7]!
subs r6,r6,#2
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9
vld1.32 {q9},[r7]!
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9
.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10
vld1.32 {q9},[r7]!
bgt .Loop3x_ctr32
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8
mov r7,r3
.byte 0x80,0x83,0xb0,0xf3 @ aesmc q4,q0
vld1.8 {q2},[r0]!
.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
.byte 0x82,0xa3,0xb0,0xf3 @ aesmc q5,q1
.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10
vld1.8 {q2},[r0]!
vorr q0,q6,q6
.byte 0x22,0x83,0xb0,0xf3 @ aese q4,q9
.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8
.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10
vld1.8 {q3},[r0]!
.byte 0x22,0xa3,0xb0,0xf3 @ aese q5,q9
.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9
vorr q1,q6,q6
.byte 0x22,0x83,0xb0,0xf3 @ aese q4,q9
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
vld1.8 {q11},[r0]!
.byte 0x22,0xa3,0xb0,0xf3 @ aese q5,q9
.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
vld1.8 {q11},[r0]!
mov r7,r3
.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9
.byte 0xa4,0x23,0xf0,0xf3 @ aesmc q9,q10
vorr q10,q6,q6
add r9,r8,#1
.byte 0x28,0x83,0xb0,0xf3 @ aese q4,q12
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
.byte 0x28,0xa3,0xb0,0xf3 @ aese q5,q12
.byte 0x28,0x23,0xf0,0xf3 @ aese q9,q12
.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
veor q2,q2,q7
add r10,r8,#2
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
.byte 0x28,0x23,0xf0,0xf3 @ aese q9,q12
.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9
veor q3,q3,q7
add r8,r8,#3
.byte 0x2a,0x83,0xb0,0xf3 @ aese q4,q13
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
.byte 0x2a,0xa3,0xb0,0xf3 @ aese q5,q13
.byte 0x2a,0x23,0xf0,0xf3 @ aese q9,q13
.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
veor q11,q11,q7
rev r9,r9
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
vld1.32 {q8},[r7]! @ re-pre-load rndkey[0]
.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
.byte 0x2a,0x23,0xf0,0xf3 @ aese q9,q13
.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9
vmov.32 d1[1], r9
rev r10,r10
.byte 0x2c,0x83,0xb0,0xf3 @ aese q4,q14
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
.byte 0x2c,0xa3,0xb0,0xf3 @ aese q5,q14
.byte 0x2c,0x23,0xf0,0xf3 @ aese q9,q14
.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
vmov.32 d3[1], r10
rev r12,r8
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
.byte 0x2c,0x23,0xf0,0xf3 @ aese q9,q14
.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9
vmov.32 d21[1], r12
subs r2,r2,#3
@ -661,13 +682,14 @@ aes_v8_ctr32_encrypt_blocks:
.byte 0x2e,0xa3,0xb0,0xf3 @ aese q5,q15
.byte 0x2e,0x23,0xf0,0xf3 @ aese q9,q15
mov r6,r5
veor q2,q2,q4
vld1.32 {q8},[r7]! @ re-pre-load rndkey[0]
vst1.8 {q2},[r1]!
veor q3,q3,q5
mov r6,r5
vst1.8 {q3},[r1]!
veor q11,q11,q9
vld1.32 {q9},[r7]! @ re-pre-load rndkey[1]
vst1.8 {q2},[r1]!
vst1.8 {q3},[r1]!
vst1.8 {q11},[r1]!
bhs .Loop3x_ctr32
@ -679,40 +701,40 @@ aes_v8_ctr32_encrypt_blocks:
.Lctr32_tail:
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
vld1.32 {q8},[r7]!
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
vld1.32 {q8},[r7]!
subs r6,r6,#2
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
vld1.32 {q9},[r7]!
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
vld1.32 {q9},[r7]!
bgt .Lctr32_tail
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
vld1.8 {q2},[r0],r12
.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12
.byte 0x28,0x23,0xb0,0xf3 @ aese q1,q12
vld1.8 {q3},[r0]
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x28,0x23,0xb0,0xf3 @ aese q1,q12
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
vld1.8 {q3},[r0]
.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13
.byte 0x2a,0x23,0xb0,0xf3 @ aese q1,q13
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x2a,0x23,0xb0,0xf3 @ aese q1,q13
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
.byte 0x2c,0x23,0xb0,0xf3 @ aese q1,q14
veor q2,q2,q7
.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x2c,0x23,0xb0,0xf3 @ aese q1,q14
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
veor q3,q3,q7
.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15

2
deps/openssl/asm_obsolete/arm-void-gas/modes/ghash-armv4.S

@ -495,7 +495,7 @@ gcm_ghash_neon:
veor q10,q10,q9 @
vshl.i64 q9,q0,#63
veor q10, q10, q9 @
veor d1,d1,d20 @
veor d1,d1,d20 @
veor d4,d4,d21
vshr.u64 q10,q0,#1 @ 2nd phase

202
deps/openssl/asm_obsolete/arm-void-gas/modes/ghashv8-armx.S

@ -7,109 +7,223 @@
.type gcm_init_v8,%function
.align 4
gcm_init_v8:
vld1.64 {q9},[r1] @ load H
vmov.i8 q8,#0xe1
vld1.64 {q9},[r1] @ load input H
vmov.i8 q11,#0xe1
vshl.i64 q11,q11,#57 @ 0xc2.0
vext.8 q3,q9,q9,#8
vshl.i64 q8,q8,#57
vshr.u64 q10,q8,#63
vext.8 q8,q10,q8,#8 @ t0=0xc2....01
vshr.u64 q10,q11,#63
vdup.32 q9,d18[1]
vshr.u64 q11,q3,#63
vext.8 q8,q10,q11,#8 @ t0=0xc2....01
vshr.u64 q10,q3,#63
vshr.s32 q9,q9,#31 @ broadcast carry bit
vand q11,q11,q8
vand q10,q10,q8
vshl.i64 q3,q3,#1
vext.8 q11,q11,q11,#8
vext.8 q10,q10,q10,#8
vand q8,q8,q9
vorr q3,q3,q11 @ H<<<=1
veor q3,q3,q8 @ twisted H
vst1.64 {q3},[r0]
vorr q3,q3,q10 @ H<<<=1
veor q12,q3,q8 @ twisted H
vst1.64 {q12},[r0]! @ store Htable[0]
@ calculate H^2
vext.8 q8,q12,q12,#8 @ Karatsuba pre-processing
.byte 0xa8,0x0e,0xa8,0xf2 @ pmull q0,q12,q12
veor q8,q8,q12
.byte 0xa9,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q12
.byte 0xa0,0x2e,0xa0,0xf2 @ pmull q1,q8,q8
vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
veor q10,q0,q2
veor q1,q1,q9
veor q1,q1,q10
.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase
vmov d4,d3 @ Xh|Xm - 256-bit result
vmov d3,d0 @ Xm is rotated Xl
veor q0,q1,q10
vext.8 q10,q0,q0,#8 @ 2nd phase
.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
veor q10,q10,q2
veor q14,q0,q10
vext.8 q9,q14,q14,#8 @ Karatsuba pre-processing
veor q9,q9,q14
vext.8 q13,q8,q9,#8 @ pack Karatsuba pre-processed
vst1.64 {q13-q14},[r0] @ store Htable[1..2]
bx lr
.size gcm_init_v8,.-gcm_init_v8
.global gcm_gmult_v8
.type gcm_gmult_v8,%function
.align 4
gcm_gmult_v8:
vld1.64 {q9},[r0] @ load Xi
vmov.i8 q11,#0xe1
vld1.64 {q12},[r1] @ load twisted H
vld1.64 {q12-q13},[r1] @ load twisted H, ...
vshl.u64 q11,q11,#57
#ifndef __ARMEB__
vrev64.8 q9,q9
#endif
vext.8 q13,q12,q12,#8
mov r3,#0
vext.8 q3,q9,q9,#8
mov r12,#0
veor q13,q13,q12 @ Karatsuba pre-processing
mov r2,r0
b .Lgmult_v8
.size gcm_gmult_v8,.-gcm_gmult_v8
.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo
veor q9,q9,q3 @ Karatsuba pre-processing
.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi
.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
veor q10,q0,q2
veor q1,q1,q9
veor q1,q1,q10
.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction
vmov d4,d3 @ Xh|Xm - 256-bit result
vmov d3,d0 @ Xm is rotated Xl
veor q0,q1,q10
vext.8 q10,q0,q0,#8 @ 2nd phase of reduction
.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
veor q10,q10,q2
veor q0,q0,q10
#ifndef __ARMEB__
vrev64.8 q0,q0
#endif
vext.8 q0,q0,q0,#8
vst1.64 {q0},[r0] @ write out Xi
bx lr
.size gcm_gmult_v8,.-gcm_gmult_v8
.global gcm_ghash_v8
.type gcm_ghash_v8,%function
.align 4
gcm_ghash_v8:
vstmdb sp!,{d8-d15} @ 32-bit ABI says so
vld1.64 {q0},[r0] @ load [rotated] Xi
subs r3,r3,#16
@ "[rotated]" means that
@ loaded value would have
@ to be rotated in order to
@ make it appear as in
@ algorithm specification
subs r3,r3,#32 @ see if r3 is 32 or larger
mov r12,#16 @ r12 is used as post-
@ increment for input pointer;
@ as loop is modulo-scheduled
@ r12 is zeroed just in time
@ to preclude overstepping
@ inp[len], which means that
@ last block[s] are actually
@ loaded twice, but last
@ copy is not processed
vld1.64 {q12-q13},[r1]! @ load twisted H, ..., H^2
vmov.i8 q11,#0xe1
mov r12,#16
vld1.64 {q12},[r1] @ load twisted H
moveq r12,#0
vext.8 q0,q0,q0,#8
vshl.u64 q11,q11,#57
vld1.64 {q9},[r2],r12 @ load [rotated] inp
vext.8 q13,q12,q12,#8
vld1.64 {q14},[r1]
moveq r12,#0 @ is it time to zero r12?
vext.8 q0,q0,q0,#8 @ rotate Xi
vld1.64 {q8},[r2]! @ load [rotated] I[0]
vshl.u64 q11,q11,#57 @ compose 0xc2.0 constant
#ifndef __ARMEB__
vrev64.8 q8,q8
vrev64.8 q0,q0
#endif
vext.8 q3,q8,q8,#8 @ rotate I[0]
blo .Lodd_tail_v8 @ r3 was less than 32
vld1.64 {q9},[r2],r12 @ load [rotated] I[1]
#ifndef __ARMEB__
vrev64.8 q9,q9
#endif
veor q13,q13,q12 @ Karatsuba pre-processing
vext.8 q3,q9,q9,#8
b .Loop_v8
vext.8 q7,q9,q9,#8
veor q3,q3,q0 @ I[i]^=Xi
.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1
veor q9,q9,q7 @ Karatsuba pre-processing
.byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7
b .Loop_mod2x_v8
.align 4
.Loop_v8:
.Loop_mod2x_v8:
vext.8 q10,q3,q3,#8
subs r3,r3,#32 @ is there more data?
.byte 0x86,0x0e,0xac,0xf2 @ pmull q0,q14,q3 @ H^2.lo·Xi.lo
movlo r12,#0 @ is it time to zero r12?
.byte 0xa2,0xae,0xaa,0xf2 @ pmull q5,q13,q9
veor q10,q10,q3 @ Karatsuba pre-processing
.byte 0x87,0x4e,0xad,0xf2 @ pmull2 q2,q14,q3 @ H^2.hi·Xi.hi
veor q0,q0,q4 @ accumulate
.byte 0xa5,0x2e,0xab,0xf2 @ pmull2 q1,q13,q10 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
vld1.64 {q8},[r2],r12 @ load [rotated] I[i+2]
veor q2,q2,q6
moveq r12,#0 @ is it time to zero r12?
veor q1,q1,q5
vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
veor q10,q0,q2
veor q1,q1,q9
vld1.64 {q9},[r2],r12 @ load [rotated] I[i+3]
#ifndef __ARMEB__
vrev64.8 q8,q8
#endif
veor q1,q1,q10
.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction
#ifndef __ARMEB__
vrev64.8 q9,q9
#endif
vmov d4,d3 @ Xh|Xm - 256-bit result
vmov d3,d0 @ Xm is rotated Xl
vext.8 q7,q9,q9,#8
vext.8 q3,q8,q8,#8
veor q0,q1,q10
.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1
veor q3,q3,q2 @ accumulate q3 early
vext.8 q10,q0,q0,#8 @ 2nd phase of reduction
.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
veor q3,q3,q10
veor q9,q9,q7 @ Karatsuba pre-processing
veor q3,q3,q0
.byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7
bhs .Loop_mod2x_v8 @ there was at least 32 more bytes
veor q2,q2,q10
vext.8 q3,q8,q8,#8 @ re-construct q3
adds r3,r3,#32 @ re-construct r3
veor q0,q0,q2 @ re-construct q0
beq .Ldone_v8 @ is r3 zero?
.Lodd_tail_v8:
vext.8 q10,q0,q0,#8
veor q3,q3,q0 @ inp^=Xi
veor q9,q9,q10 @ q9 is rotated inp^Xi
veor q9,q8,q10 @ q9 is rotated inp^Xi
.Lgmult_v8:
.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo
veor q9,q9,q3 @ Karatsuba pre-processing
.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi
subs r3,r3,#16
.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
moveq r12,#0
vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
veor q10,q0,q2
veor q1,q1,q9
vld1.64 {q9},[r2],r12 @ load [rotated] inp
veor q1,q1,q10
.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase
.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction
vmov d4,d3 @ Xh|Xm - 256-bit result
vmov d3,d0 @ Xm is rotated Xl
#ifndef __ARMEB__
vrev64.8 q9,q9
#endif
veor q0,q1,q10
vext.8 q3,q9,q9,#8
vext.8 q10,q0,q0,#8 @ 2nd phase
vext.8 q10,q0,q0,#8 @ 2nd phase of reduction
.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
veor q10,q10,q2
veor q0,q0,q10
bhs .Loop_v8
.Ldone_v8:
#ifndef __ARMEB__
vrev64.8 q0,q0
#endif
vext.8 q0,q0,q0,#8
vst1.64 {q0},[r0] @ write out Xi
vldmia sp!,{d8-d15} @ 32-bit ABI says so
bx lr
.size gcm_ghash_v8,.-gcm_ghash_v8
.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"

212
deps/openssl/asm_obsolete/arm-void-gas/sha/sha256-armv4.S

@ -1,7 +1,59 @@
#include "arm_arch.h"
@ ====================================================================
@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@ project. The module is, however, dual licensed under OpenSSL and
@ CRYPTOGAMS licenses depending on where you obtain it. For further
@ details see http://www.openssl.org/~appro/cryptogams/.
@
@ Permission to use under GPL terms is granted.
@ ====================================================================
@ SHA256 block procedure for ARMv4. May 2007.
@ Performance is ~2x better than gcc 3.4 generated code and in "abso-
@ lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
@ byte [on single-issue Xscale PXA250 core].
@ July 2010.
@
@ Rescheduling for dual-issue pipeline resulted in 22% improvement on
@ Cortex A8 core and ~20 cycles per processed byte.
@ February 2011.
@
@ Profiler-assisted and platform-specific optimization resulted in 16%
@ improvement on Cortex A8 core and ~15.4 cycles per processed byte.
@ September 2013.
@
@ Add NEON implementation. On Cortex A8 it was measured to process one
@ byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
@ S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
@ code (meaning that latter performs sub-optimally, nothing was done
@ about it).
@ May 2014.
@
@ Add ARMv8 code path performing at 2.0 cpb on Apple A7.
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif
.text
#if __ARM_ARCH__<7
.code 32
#else
.syntax unified
# ifdef __thumb2__
.thumb
# else
.code 32
# endif
#endif
.type K256,%object
.align 5
@ -24,7 +76,7 @@ K256:
.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size K256,.-K256
.word 0 @ terminator
#if __ARM_MAX_ARCH__>=7
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-sha256_block_data_order
#endif
@ -33,9 +85,12 @@ K256:
.global sha256_block_data_order
.type sha256_block_data_order,%function
sha256_block_data_order:
#if __ARM_ARCH__<7
sub r3,pc,#8 @ sha256_block_data_order
add r2,r1,r2,lsl#6 @ len to point at the end of inp
#if __ARM_MAX_ARCH__>=7
#else
adr r3,sha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
tst r12,#ARMV8_SHA256
@ -43,6 +98,7 @@ sha256_block_data_order:
tst r12,#ARMV7_NEON
bne .LNEON
#endif
add r2,r1,r2,lsl#6 @ len to point at the end of inp
stmdb sp!,{r0,r1,r2,r4-r11,lr}
ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11}
sub r14,r3,#256+32 @ K256
@ -1736,6 +1792,9 @@ sha256_block_data_order:
eor r12,r12,r6 @ Maj(a,b,c)
add r4,r4,r0,ror#2 @ h+=Sigma0(a)
@ add r4,r4,r12 @ h+=Maj(a,b,c)
#if __ARM_ARCH__>=7
ite eq @ Thumb2 thing, sanity check in ARM
#endif
ldreq r3,[sp,#16*4] @ pull ctx
bne .Lrounds_16_xx
@ -1777,16 +1836,19 @@ sha256_block_data_order:
.arch armv7-a
.fpu neon
.global sha256_block_data_order_neon
.type sha256_block_data_order_neon,%function
.align 4
sha256_block_data_order_neon:
.LNEON:
stmdb sp!,{r4-r12,lr}
sub r11,sp,#16*4+16
adr r14,K256
bic r11,r11,#15 @ align for 128-bit stores
mov r12,sp
sub sp,sp,#16*4+16 @ alloca
sub r14,r3,#256+32 @ K256
bic sp,sp,#15 @ align for 128-bit stores
mov sp,r11 @ alloca
add r2,r1,r2,lsl#6 @ len to point at the end of inp
vld1.8 {q0},[r1]!
vld1.8 {q1},[r1]!
@ -2224,11 +2286,13 @@ sha256_block_data_order_neon:
ldr r0,[sp,#72]
sub r14,r14,#256 @ rewind r14
teq r1,r0
it eq
subeq r1,r1,#64 @ avoid SEGV
vld1.8 {q0},[r1]! @ load next input block
vld1.8 {q1},[r1]!
vld1.8 {q2},[r1]!
vld1.8 {q3},[r1]!
it ne
strne r1,[sp,#68]
mov r1,sp
add r11,r11,r2
@ -2542,23 +2606,38 @@ sha256_block_data_order_neon:
str r7,[r2],#4
stmia r2,{r8-r11}
ittte ne
movne r1,sp
ldrne r2,[sp,#0]
eorne r12,r12,r12
ldreq sp,[sp,#76] @ restore original sp
itt ne
eorne r3,r5,r6
bne .L_00_48
ldmia sp!,{r4-r12,pc}
.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
#if __ARM_MAX_ARCH__>=7
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
# ifdef __thumb2__
# define INST(a,b,c,d) .byte c,d|0xc,a,b
# else
# define INST(a,b,c,d) .byte a,b,c,d
# endif
.type sha256_block_data_order_armv8,%function
.align 5
sha256_block_data_order_armv8:
.LARMv8:
vld1.32 {q0,q1},[r0]
sub r3,r3,#sha256_block_data_order-K256
# ifdef __thumb2__
adr r3,.LARMv8
sub r3,r3,#.LARMv8-K256
# else
adrl r3,K256
# endif
add r2,r1,r2,lsl#6 @ len to point at the end of inp
.Loop_v8:
vld1.8 {q8-q9},[r1]!
@ -2573,114 +2652,115 @@ sha256_block_data_order_armv8:
teq r1,r2
vld1.32 {q13},[r3]!
vadd.i32 q12,q12,q8
.byte 0xe2,0x03,0xfa,0xf3 @ sha256su0 q8,q9
INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
vmov q2,q0
.byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12
.byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12
.byte 0xe6,0x0c,0x64,0xf3 @ sha256su1 q8,q10,q11
INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
vld1.32 {q12},[r3]!
vadd.i32 q13,q13,q9
.byte 0xe4,0x23,0xfa,0xf3 @ sha256su0 q9,q10
INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
vmov q2,q0
.byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13
.byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13
.byte 0xe0,0x2c,0x66,0xf3 @ sha256su1 q9,q11,q8
INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
vld1.32 {q13},[r3]!
vadd.i32 q12,q12,q10
.byte 0xe6,0x43,0xfa,0xf3 @ sha256su0 q10,q11
INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
vmov q2,q0
.byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12
.byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12
.byte 0xe2,0x4c,0x60,0xf3 @ sha256su1 q10,q8,q9
INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
vld1.32 {q12},[r3]!
vadd.i32 q13,q13,q11
.byte 0xe0,0x63,0xfa,0xf3 @ sha256su0 q11,q8
INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
vmov q2,q0
.byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13
.byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13
.byte 0xe4,0x6c,0x62,0xf3 @ sha256su1 q11,q9,q10
INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
vld1.32 {q13},[r3]!
vadd.i32 q12,q12,q8
.byte 0xe2,0x03,0xfa,0xf3 @ sha256su0 q8,q9
INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
vmov q2,q0
.byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12
.byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12
.byte 0xe6,0x0c,0x64,0xf3 @ sha256su1 q8,q10,q11
INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
vld1.32 {q12},[r3]!
vadd.i32 q13,q13,q9
.byte 0xe4,0x23,0xfa,0xf3 @ sha256su0 q9,q10
INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
vmov q2,q0
.byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13
.byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13
.byte 0xe0,0x2c,0x66,0xf3 @ sha256su1 q9,q11,q8
INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
vld1.32 {q13},[r3]!
vadd.i32 q12,q12,q10
.byte 0xe6,0x43,0xfa,0xf3 @ sha256su0 q10,q11
INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
vmov q2,q0
.byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12
.byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12
.byte 0xe2,0x4c,0x60,0xf3 @ sha256su1 q10,q8,q9
INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
vld1.32 {q12},[r3]!
vadd.i32 q13,q13,q11
.byte 0xe0,0x63,0xfa,0xf3 @ sha256su0 q11,q8
INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
vmov q2,q0
.byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13
.byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13
.byte 0xe4,0x6c,0x62,0xf3 @ sha256su1 q11,q9,q10
INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
vld1.32 {q13},[r3]!
vadd.i32 q12,q12,q8
.byte 0xe2,0x03,0xfa,0xf3 @ sha256su0 q8,q9
INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
vmov q2,q0
.byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12
.byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12
.byte 0xe6,0x0c,0x64,0xf3 @ sha256su1 q8,q10,q11
INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
vld1.32 {q12},[r3]!
vadd.i32 q13,q13,q9
.byte 0xe4,0x23,0xfa,0xf3 @ sha256su0 q9,q10
INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
vmov q2,q0
.byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13
.byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13
.byte 0xe0,0x2c,0x66,0xf3 @ sha256su1 q9,q11,q8
INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
vld1.32 {q13},[r3]!
vadd.i32 q12,q12,q10
.byte 0xe6,0x43,0xfa,0xf3 @ sha256su0 q10,q11
INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
vmov q2,q0
.byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12
.byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12
.byte 0xe2,0x4c,0x60,0xf3 @ sha256su1 q10,q8,q9
INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
vld1.32 {q12},[r3]!
vadd.i32 q13,q13,q11
.byte 0xe0,0x63,0xfa,0xf3 @ sha256su0 q11,q8
INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
vmov q2,q0
.byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13
.byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13
.byte 0xe4,0x6c,0x62,0xf3 @ sha256su1 q11,q9,q10
INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
vld1.32 {q13},[r3]!
vadd.i32 q12,q12,q8
vmov q2,q0
.byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12
.byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12
INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
vld1.32 {q12},[r3]!
vadd.i32 q13,q13,q9
vmov q2,q0
.byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13
.byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13
INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
vld1.32 {q13},[r3]
vadd.i32 q12,q12,q10
sub r3,r3,#256-16 @ rewind
vmov q2,q0
.byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12
.byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12
INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
vadd.i32 q13,q13,q11
vmov q2,q0
.byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13
.byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13
INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
vadd.i32 q0,q0,q14
vadd.i32 q1,q1,q15
it ne
bne .Loop_v8
vst1.32 {q0,q1},[r0]
@ -2690,6 +2770,6 @@ sha256_block_data_order_armv8:
#endif
.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.align 2
#if __ARM_MAX_ARCH__>=7
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm OPENSSL_armcap_P,4,4
#endif

206
deps/openssl/asm_obsolete/arm64-linux64-gas/aes/aesv8-armx.S

@ -227,17 +227,17 @@ aes_v8_encrypt:
.Loop_enc:
aese v2.16b,v0.16b
ld1 {v0.4s},[x2],#16
aesmc v2.16b,v2.16b
ld1 {v0.4s},[x2],#16
subs w3,w3,#2
aese v2.16b,v1.16b
ld1 {v1.4s},[x2],#16
aesmc v2.16b,v2.16b
ld1 {v1.4s},[x2],#16
b.gt .Loop_enc
aese v2.16b,v0.16b
ld1 {v0.4s},[x2]
aesmc v2.16b,v2.16b
ld1 {v0.4s},[x2]
aese v2.16b,v1.16b
eor v2.16b,v2.16b,v0.16b
@ -256,17 +256,17 @@ aes_v8_decrypt:
.Loop_dec:
aesd v2.16b,v0.16b
ld1 {v0.4s},[x2],#16
aesimc v2.16b,v2.16b
ld1 {v0.4s},[x2],#16
subs w3,w3,#2
aesd v2.16b,v1.16b
ld1 {v1.4s},[x2],#16
aesimc v2.16b,v2.16b
ld1 {v1.4s},[x2],#16
b.gt .Loop_dec
aesd v2.16b,v0.16b
ld1 {v0.4s},[x2]
aesimc v2.16b,v2.16b
ld1 {v0.4s},[x2]
aesd v2.16b,v1.16b
eor v2.16b,v2.16b,v0.16b
@ -308,16 +308,42 @@ aes_v8_cbc_encrypt:
eor v5.16b,v16.16b,v7.16b
b.eq .Lcbc_enc128
ld1 {v2.4s-v3.4s},[x7]
add x7,x3,#16
add x6,x3,#16*4
add x12,x3,#16*5
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
add x14,x3,#16*6
add x3,x3,#16*7
b .Lenter_cbc_enc
.align 4
.Loop_cbc_enc:
aese v0.16b,v16.16b
ld1 {v16.4s},[x7],#16
aesmc v0.16b,v0.16b
subs w6,w6,#2
st1 {v6.16b},[x1],#16
.Lenter_cbc_enc:
aese v0.16b,v17.16b
ld1 {v17.4s},[x7],#16
aesmc v0.16b,v0.16b
b.gt .Loop_cbc_enc
aese v0.16b,v2.16b
aesmc v0.16b,v0.16b
ld1 {v16.4s},[x6]
cmp w5,#4
aese v0.16b,v3.16b
aesmc v0.16b,v0.16b
ld1 {v17.4s},[x12]
b.eq .Lcbc_enc192
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
ld1 {v16.4s},[x14]
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
ld1 {v17.4s},[x3]
nop
.Lcbc_enc192:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
subs x2,x2,#16
@ -326,7 +352,6 @@ aes_v8_cbc_encrypt:
csel x8,xzr,x8,eq
aese v0.16b,v18.16b
aesmc v0.16b,v0.16b
add x7,x3,#16
aese v0.16b,v19.16b
aesmc v0.16b,v0.16b
ld1 {v16.16b},[x0],x8
@ -335,16 +360,14 @@ aes_v8_cbc_encrypt:
eor v16.16b,v16.16b,v5.16b
aese v0.16b,v21.16b
aesmc v0.16b,v0.16b
ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
ld1 {v17.4s},[x7] // re-pre-load rndkey[1]
aese v0.16b,v22.16b
aesmc v0.16b,v0.16b
aese v0.16b,v23.16b
mov w6,w5
eor v6.16b,v0.16b,v7.16b
st1 {v6.16b},[x1],#16
b.hs .Loop_cbc_enc
st1 {v6.16b},[x1],#16
b .Lcbc_done
.align 5
@ -402,79 +425,78 @@ aes_v8_cbc_encrypt:
.Loop3x_cbc_dec:
aesd v0.16b,v16.16b
aesd v1.16b,v16.16b
aesd v18.16b,v16.16b
ld1 {v16.4s},[x7],#16
aesimc v0.16b,v0.16b
aesd v1.16b,v16.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aesd v0.16b,v17.16b
aesd v1.16b,v17.16b
aesd v18.16b,v17.16b
ld1 {v17.4s},[x7],#16
aesimc v0.16b,v0.16b
aesd v1.16b,v17.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
ld1 {v17.4s},[x7],#16
b.gt .Loop3x_cbc_dec
aesd v0.16b,v16.16b
aesd v1.16b,v16.16b
aesd v18.16b,v16.16b
eor v4.16b,v6.16b,v7.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v16.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
eor v4.16b,v6.16b,v7.16b
subs x2,x2,#0x30
eor v5.16b,v2.16b,v7.16b
csel x6,x2,x6,lo // x6, w6, is zero at this point
aesd v0.16b,v17.16b
aesd v1.16b,v17.16b
aesd v18.16b,v17.16b
eor v17.16b,v3.16b,v7.16b
subs x2,x2,#0x30
aesimc v0.16b,v0.16b
aesd v1.16b,v17.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
orr v6.16b,v19.16b,v19.16b
csel x6,x2,x6,lo // x6, w6, is zero at this point
aesd v0.16b,v20.16b
aesd v1.16b,v20.16b
aesd v18.16b,v20.16b
eor v17.16b,v3.16b,v7.16b
add x0,x0,x6 // x0 is adjusted in such way that
// at exit from the loop v1.16b-v18.16b
// are loaded with last "words"
orr v6.16b,v19.16b,v19.16b
mov x7,x3
aesd v0.16b,v20.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v20.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v20.16b
aesimc v18.16b,v18.16b
mov x7,x3
aesd v0.16b,v21.16b
aesd v1.16b,v21.16b
aesd v18.16b,v21.16b
ld1 {v2.16b},[x0],#16
aesd v0.16b,v21.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v21.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v21.16b
aesimc v18.16b,v18.16b
ld1 {v3.16b},[x0],#16
aesd v0.16b,v22.16b
aesd v1.16b,v22.16b
aesd v18.16b,v22.16b
ld1 {v19.16b},[x0],#16
aesimc v0.16b,v0.16b
aesd v1.16b,v22.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v22.16b
aesimc v18.16b,v18.16b
ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
ld1 {v19.16b},[x0],#16
aesd v0.16b,v23.16b
aesd v1.16b,v23.16b
aesd v18.16b,v23.16b
ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
add w6,w5,#2
eor v4.16b,v4.16b,v0.16b
eor v5.16b,v5.16b,v1.16b
eor v18.16b,v18.16b,v17.16b
ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
orr v0.16b,v2.16b,v2.16b
st1 {v4.16b},[x1],#16
orr v1.16b,v3.16b,v3.16b
orr v0.16b,v2.16b,v2.16b
st1 {v5.16b},[x1],#16
orr v1.16b,v3.16b,v3.16b
st1 {v18.16b},[x1],#16
orr v18.16b,v19.16b,v19.16b
b.hs .Loop3x_cbc_dec
@ -485,39 +507,39 @@ aes_v8_cbc_encrypt:
.Lcbc_dec_tail:
aesd v1.16b,v16.16b
aesd v18.16b,v16.16b
ld1 {v16.4s},[x7],#16
aesimc v1.16b,v1.16b
aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aesd v1.16b,v17.16b
aesd v18.16b,v17.16b
ld1 {v17.4s},[x7],#16
aesimc v1.16b,v1.16b
aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
ld1 {v17.4s},[x7],#16
b.gt .Lcbc_dec_tail
aesd v1.16b,v16.16b
aesd v18.16b,v16.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
aesd v1.16b,v17.16b
aesd v18.16b,v17.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
aesd v1.16b,v20.16b
aesd v18.16b,v20.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v20.16b
aesimc v18.16b,v18.16b
cmn x2,#0x20
aesd v1.16b,v21.16b
aesd v18.16b,v21.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v21.16b
aesimc v18.16b,v18.16b
eor v5.16b,v6.16b,v7.16b
aesd v1.16b,v22.16b
aesd v18.16b,v22.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v22.16b
aesimc v18.16b,v18.16b
eor v17.16b,v3.16b,v7.16b
aesd v1.16b,v23.16b
@ -583,70 +605,69 @@ aes_v8_ctr32_encrypt_blocks:
.align 4
.Loop3x_ctr32:
aese v0.16b,v16.16b
aese v1.16b,v16.16b
aese v18.16b,v16.16b
ld1 {v16.4s},[x7],#16
aesmc v0.16b,v0.16b
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
aese v18.16b,v16.16b
aesmc v18.16b,v18.16b
ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aese v0.16b,v17.16b
aese v1.16b,v17.16b
aese v18.16b,v17.16b
ld1 {v17.4s},[x7],#16
aesmc v0.16b,v0.16b
aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
aese v18.16b,v17.16b
aesmc v18.16b,v18.16b
ld1 {v17.4s},[x7],#16
b.gt .Loop3x_ctr32
aese v0.16b,v16.16b
aese v1.16b,v16.16b
aese v18.16b,v16.16b
mov x7,x3
aesmc v4.16b,v0.16b
ld1 {v2.16b},[x0],#16
aese v1.16b,v16.16b
aesmc v5.16b,v1.16b
aesmc v18.16b,v18.16b
ld1 {v2.16b},[x0],#16
orr v0.16b,v6.16b,v6.16b
aese v4.16b,v17.16b
aese v18.16b,v16.16b
aesmc v18.16b,v18.16b
ld1 {v3.16b},[x0],#16
aese v5.16b,v17.16b
aese v18.16b,v17.16b
orr v1.16b,v6.16b,v6.16b
aese v4.16b,v17.16b
aesmc v4.16b,v4.16b
ld1 {v19.16b},[x0],#16
aese v5.16b,v17.16b
aesmc v5.16b,v5.16b
ld1 {v19.16b},[x0],#16
mov x7,x3
aese v18.16b,v17.16b
aesmc v17.16b,v18.16b
orr v18.16b,v6.16b,v6.16b
add w9,w8,#1
aese v4.16b,v20.16b
aesmc v4.16b,v4.16b
aese v5.16b,v20.16b
aese v17.16b,v20.16b
aesmc v5.16b,v5.16b
eor v2.16b,v2.16b,v7.16b
add w10,w8,#2
aesmc v4.16b,v4.16b
aesmc v5.16b,v5.16b
aese v17.16b,v20.16b
aesmc v17.16b,v17.16b
eor v3.16b,v3.16b,v7.16b
add w8,w8,#3
aese v4.16b,v21.16b
aesmc v4.16b,v4.16b
aese v5.16b,v21.16b
aese v17.16b,v21.16b
aesmc v5.16b,v5.16b
eor v19.16b,v19.16b,v7.16b
rev w9,w9
aesmc v4.16b,v4.16b
ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
aesmc v5.16b,v5.16b
aese v17.16b,v21.16b
aesmc v17.16b,v17.16b
mov v0.s[3], w9
rev w10,w10
aese v4.16b,v22.16b
aesmc v4.16b,v4.16b
aese v5.16b,v22.16b
aese v17.16b,v22.16b
aesmc v5.16b,v5.16b
mov v1.s[3], w10
rev w12,w8
aesmc v4.16b,v4.16b
aesmc v5.16b,v5.16b
aese v17.16b,v22.16b
aesmc v17.16b,v17.16b
mov v18.s[3], w12
subs x2,x2,#3
@ -654,13 +675,14 @@ aes_v8_ctr32_encrypt_blocks:
aese v5.16b,v23.16b
aese v17.16b,v23.16b
mov w6,w5
eor v2.16b,v2.16b,v4.16b
ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
st1 {v2.16b},[x1],#16
eor v3.16b,v3.16b,v5.16b
mov w6,w5
st1 {v3.16b},[x1],#16
eor v19.16b,v19.16b,v17.16b
ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
st1 {v2.16b},[x1],#16
st1 {v3.16b},[x1],#16
st1 {v19.16b},[x1],#16
b.hs .Loop3x_ctr32
@ -672,40 +694,40 @@ aes_v8_ctr32_encrypt_blocks:
.Lctr32_tail:
aese v0.16b,v16.16b
aese v1.16b,v16.16b
ld1 {v16.4s},[x7],#16
aesmc v0.16b,v0.16b
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aese v0.16b,v17.16b
aese v1.16b,v17.16b
ld1 {v17.4s},[x7],#16
aesmc v0.16b,v0.16b
aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
ld1 {v17.4s},[x7],#16
b.gt .Lctr32_tail
aese v0.16b,v16.16b
aese v1.16b,v16.16b
aesmc v0.16b,v0.16b
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
aese v0.16b,v17.16b
aese v1.16b,v17.16b
aesmc v0.16b,v0.16b
aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
ld1 {v2.16b},[x0],x12
aese v0.16b,v20.16b
aese v1.16b,v20.16b
ld1 {v3.16b},[x0]
aesmc v0.16b,v0.16b
aese v1.16b,v20.16b
aesmc v1.16b,v1.16b
ld1 {v3.16b},[x0]
aese v0.16b,v21.16b
aese v1.16b,v21.16b
aesmc v0.16b,v0.16b
aese v1.16b,v21.16b
aesmc v1.16b,v1.16b
aese v0.16b,v22.16b
aese v1.16b,v22.16b
eor v2.16b,v2.16b,v7.16b
aese v0.16b,v22.16b
aesmc v0.16b,v0.16b
aese v1.16b,v22.16b
aesmc v1.16b,v1.16b
eor v3.16b,v3.16b,v7.16b
aese v0.16b,v23.16b

200
deps/openssl/asm_obsolete/arm64-linux64-gas/modes/ghashv8-armx.S

@ -6,103 +6,215 @@
.type gcm_init_v8,%function
.align 4
gcm_init_v8:
ld1 {v17.2d},[x1] //load H
movi v16.16b,#0xe1
ld1 {v17.2d},[x1] //load input H
movi v19.16b,#0xe1
shl v19.2d,v19.2d,#57 //0xc2.0
ext v3.16b,v17.16b,v17.16b,#8
shl v16.2d,v16.2d,#57
ushr v18.2d,v16.2d,#63
ext v16.16b,v18.16b,v16.16b,#8 //t0=0xc2....01
ushr v18.2d,v19.2d,#63
dup v17.4s,v17.s[1]
ushr v19.2d,v3.2d,#63
ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01
ushr v18.2d,v3.2d,#63
sshr v17.4s,v17.4s,#31 //broadcast carry bit
and v19.16b,v19.16b,v16.16b
and v18.16b,v18.16b,v16.16b
shl v3.2d,v3.2d,#1
ext v19.16b,v19.16b,v19.16b,#8
ext v18.16b,v18.16b,v18.16b,#8
and v16.16b,v16.16b,v17.16b
orr v3.16b,v3.16b,v19.16b //H<<<=1
eor v3.16b,v3.16b,v16.16b //twisted H
st1 {v3.2d},[x0]
orr v3.16b,v3.16b,v18.16b //H<<<=1
eor v20.16b,v3.16b,v16.16b //twisted H
st1 {v20.2d},[x0],#16 //store Htable[0]
//calculate H^2
ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing
pmull v0.1q,v20.1d,v20.1d
eor v16.16b,v16.16b,v20.16b
pmull2 v2.1q,v20.2d,v20.2d
pmull v1.1q,v16.1d,v16.1d
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
eor v0.16b,v1.16b,v18.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v22.16b,v0.16b,v18.16b
ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing
eor v17.16b,v17.16b,v22.16b
ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
st1 {v21.2d-v22.2d},[x0] //store Htable[1..2]
ret
.size gcm_init_v8,.-gcm_init_v8
.global gcm_gmult_v8
.type gcm_gmult_v8,%function
.align 4
gcm_gmult_v8:
ld1 {v17.2d},[x0] //load Xi
movi v19.16b,#0xe1
ld1 {v20.2d},[x1] //load twisted H
ld1 {v20.2d-v21.2d},[x1] //load twisted H, ...
shl v19.2d,v19.2d,#57
#ifndef __ARMEB__
rev64 v17.16b,v17.16b
#endif
ext v21.16b,v20.16b,v20.16b,#8
mov x3,#0
ext v3.16b,v17.16b,v17.16b,#8
mov x12,#0
eor v21.16b,v21.16b,v20.16b //Karatsuba pre-processing
mov x2,x0
b .Lgmult_v8
.size gcm_gmult_v8,.-gcm_gmult_v8
pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
eor v0.16b,v1.16b,v18.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
#ifndef __ARMEB__
rev64 v0.16b,v0.16b
#endif
ext v0.16b,v0.16b,v0.16b,#8
st1 {v0.2d},[x0] //write out Xi
ret
.size gcm_gmult_v8,.-gcm_gmult_v8
.global gcm_ghash_v8
.type gcm_ghash_v8,%function
.align 4
gcm_ghash_v8:
ld1 {v0.2d},[x0] //load [rotated] Xi
subs x3,x3,#16
//"[rotated]" means that
//loaded value would have
//to be rotated in order to
//make it appear as in
//algorithm specification
subs x3,x3,#32 //see if x3 is 32 or larger
mov x12,#16 //x12 is used as post-
//increment for input pointer;
//as loop is modulo-scheduled
//x12 is zeroed just in time
//to preclude overstepping
//inp[len], which means that
//last block[s] are actually
//loaded twice, but last
//copy is not processed
ld1 {v20.2d-v21.2d},[x1],#32 //load twisted H, ..., H^2
movi v19.16b,#0xe1
mov x12,#16
ld1 {v20.2d},[x1] //load twisted H
csel x12,xzr,x12,eq
ext v0.16b,v0.16b,v0.16b,#8
shl v19.2d,v19.2d,#57
ld1 {v17.2d},[x2],x12 //load [rotated] inp
ext v21.16b,v20.16b,v20.16b,#8
ld1 {v22.2d},[x1]
csel x12,xzr,x12,eq //is it time to zero x12?
ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi
ld1 {v16.2d},[x2],#16 //load [rotated] I[0]
shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
#ifndef __ARMEB__
rev64 v16.16b,v16.16b
rev64 v0.16b,v0.16b
#endif
ext v3.16b,v16.16b,v16.16b,#8 //rotate I[0]
b.lo .Lodd_tail_v8 //x3 was less than 32
ld1 {v17.2d},[x2],x12 //load [rotated] I[1]
#ifndef __ARMEB__
rev64 v17.16b,v17.16b
#endif
eor v21.16b,v21.16b,v20.16b //Karatsuba pre-processing
ext v3.16b,v17.16b,v17.16b,#8
b .Loop_v8
ext v7.16b,v17.16b,v17.16b,#8
eor v3.16b,v3.16b,v0.16b //I[i]^=Xi
pmull v4.1q,v20.1d,v7.1d //H·Ii+1
eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
pmull2 v6.1q,v20.2d,v7.2d
b .Loop_mod2x_v8
.align 4
.Loop_v8:
.Loop_mod2x_v8:
ext v18.16b,v3.16b,v3.16b,#8
subs x3,x3,#32 //is there more data?
pmull v0.1q,v22.1d,v3.1d //H^2.lo·Xi.lo
csel x12,xzr,x12,lo //is it time to zero x12?
pmull v5.1q,v21.1d,v17.1d
eor v18.16b,v18.16b,v3.16b //Karatsuba pre-processing
pmull2 v2.1q,v22.2d,v3.2d //H^2.hi·Xi.hi
eor v0.16b,v0.16b,v4.16b //accumulate
pmull2 v1.1q,v21.2d,v18.2d //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
ld1 {v16.2d},[x2],x12 //load [rotated] I[i+2]
eor v2.16b,v2.16b,v6.16b
csel x12,xzr,x12,eq //is it time to zero x12?
eor v1.16b,v1.16b,v5.16b
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
ld1 {v17.2d},[x2],x12 //load [rotated] I[i+3]
#ifndef __ARMEB__
rev64 v16.16b,v16.16b
#endif
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
#ifndef __ARMEB__
rev64 v17.16b,v17.16b
#endif
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
ext v7.16b,v17.16b,v17.16b,#8
ext v3.16b,v16.16b,v16.16b,#8
eor v0.16b,v1.16b,v18.16b
pmull v4.1q,v20.1d,v7.1d //H·Ii+1
eor v3.16b,v3.16b,v2.16b //accumulate v3.16b early
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v3.16b,v3.16b,v18.16b
eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
eor v3.16b,v3.16b,v0.16b
pmull2 v6.1q,v20.2d,v7.2d
b.hs .Loop_mod2x_v8 //there was at least 32 more bytes
eor v2.16b,v2.16b,v18.16b
ext v3.16b,v16.16b,v16.16b,#8 //re-construct v3.16b
adds x3,x3,#32 //re-construct x3
eor v0.16b,v0.16b,v2.16b //re-construct v0.16b
b.eq .Ldone_v8 //is x3 zero?
.Lodd_tail_v8:
ext v18.16b,v0.16b,v0.16b,#8
eor v3.16b,v3.16b,v0.16b //inp^=Xi
eor v17.16b,v17.16b,v18.16b //v17.16b is rotated inp^Xi
eor v17.16b,v16.16b,v18.16b //v17.16b is rotated inp^Xi
.Lgmult_v8:
pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
subs x3,x3,#16
pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
csel x12,xzr,x12,eq
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
ld1 {v17.2d},[x2],x12 //load [rotated] inp
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
#ifndef __ARMEB__
rev64 v17.16b,v17.16b
#endif
eor v0.16b,v1.16b,v18.16b
ext v3.16b,v17.16b,v17.16b,#8
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
b.hs .Loop_v8
.Ldone_v8:
#ifndef __ARMEB__
rev64 v0.16b,v0.16b
#endif

502
deps/openssl/asm_obsolete/x64-elf-gas/aes/aesni-x86_64.s

@ -17,7 +17,10 @@ aesni_encrypt:
leaq 16(%rdx),%rdx
jnz .Loop_enc1_1
.byte 102,15,56,221,209
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
.byte 0xf3,0xc3
.size aesni_encrypt,.-aesni_encrypt
@ -38,7 +41,10 @@ aesni_decrypt:
leaq 16(%rdx),%rdx
jnz .Loop_dec1_2
.byte 102,15,56,223,209
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
.byte 0xf3,0xc3
.size aesni_decrypt, .-aesni_decrypt
.type _aesni_encrypt2,@function
@ -264,21 +270,18 @@ _aesni_encrypt6:
pxor %xmm0,%xmm6
.byte 102,15,56,220,225
pxor %xmm0,%xmm7
movups (%rcx,%rax,1),%xmm0
addq $16,%rax
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
movups -16(%rcx,%rax,1),%xmm0
jmp .Lenc_loop6_enter
.align 16
.Lenc_loop6:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
.Lenc_loop6_enter:
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
.Lenc_loop6_enter:
movups (%rcx,%rax,1),%xmm1
addq $32,%rax
.byte 102,15,56,220,208
@ -321,21 +324,18 @@ _aesni_decrypt6:
pxor %xmm0,%xmm6
.byte 102,15,56,222,225
pxor %xmm0,%xmm7
movups (%rcx,%rax,1),%xmm0
addq $16,%rax
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
movups -16(%rcx,%rax,1),%xmm0
jmp .Ldec_loop6_enter
.align 16
.Ldec_loop6:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
.Ldec_loop6_enter:
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
.Ldec_loop6_enter:
movups (%rcx,%rax,1),%xmm1
addq $32,%rax
.byte 102,15,56,222,208
@ -375,23 +375,18 @@ _aesni_encrypt8:
leaq 32(%rcx,%rax,1),%rcx
negq %rax
.byte 102,15,56,220,209
addq $16,%rax
pxor %xmm0,%xmm7
.byte 102,15,56,220,217
pxor %xmm0,%xmm8
.byte 102,15,56,220,217
pxor %xmm0,%xmm9
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
movups -16(%rcx,%rax,1),%xmm0
jmp .Lenc_loop8_enter
movups (%rcx,%rax,1),%xmm0
addq $16,%rax
jmp .Lenc_loop8_inner
.align 16
.Lenc_loop8:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.Lenc_loop8_inner:
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
@ -444,23 +439,18 @@ _aesni_decrypt8:
leaq 32(%rcx,%rax,1),%rcx
negq %rax
.byte 102,15,56,222,209
addq $16,%rax
pxor %xmm0,%xmm7
.byte 102,15,56,222,217
pxor %xmm0,%xmm8
.byte 102,15,56,222,217
pxor %xmm0,%xmm9
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
movups -16(%rcx,%rax,1),%xmm0
jmp .Ldec_loop8_enter
movups (%rcx,%rax,1),%xmm0
addq $16,%rax
jmp .Ldec_loop8_inner
.align 16
.Ldec_loop8:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.Ldec_loop8_inner:
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
@ -587,6 +577,7 @@ aesni_ecb_encrypt:
movups 80(%rdi),%xmm7
je .Lecb_enc_six
movdqu 96(%rdi),%xmm8
xorps %xmm9,%xmm9
call _aesni_encrypt8
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
@ -700,15 +691,23 @@ aesni_ecb_encrypt:
jnc .Lecb_dec_loop8
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
movq %r11,%rcx
movups %xmm3,16(%rsi)
pxor %xmm3,%xmm3
movl %r10d,%eax
movups %xmm4,32(%rsi)
pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
pxor %xmm7,%xmm7
movups %xmm8,96(%rsi)
pxor %xmm8,%xmm8
movups %xmm9,112(%rsi)
pxor %xmm9,%xmm9
leaq 128(%rsi),%rsi
addq $128,%rdx
jz .Lecb_ret
@ -731,14 +730,23 @@ aesni_ecb_encrypt:
je .Lecb_dec_six
movups 96(%rdi),%xmm8
movups (%rcx),%xmm0
xorps %xmm9,%xmm9
call _aesni_decrypt8
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
pxor %xmm7,%xmm7
movups %xmm8,96(%rsi)
pxor %xmm8,%xmm8
pxor %xmm9,%xmm9
jmp .Lecb_ret
.align 16
.Lecb_dec_one:
@ -754,49 +762,73 @@ aesni_ecb_encrypt:
jnz .Loop_dec1_4
.byte 102,15,56,223,209
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
jmp .Lecb_ret
.align 16
.Lecb_dec_two:
call _aesni_decrypt2
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
pxor %xmm3,%xmm3
jmp .Lecb_ret
.align 16
.Lecb_dec_three:
call _aesni_decrypt3
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
pxor %xmm4,%xmm4
jmp .Lecb_ret
.align 16
.Lecb_dec_four:
call _aesni_decrypt4
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
pxor %xmm5,%xmm5
jmp .Lecb_ret
.align 16
.Lecb_dec_five:
xorps %xmm7,%xmm7
call _aesni_decrypt6
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
pxor %xmm6,%xmm6
pxor %xmm7,%xmm7
jmp .Lecb_ret
.align 16
.Lecb_dec_six:
call _aesni_decrypt6
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
pxor %xmm7,%xmm7
.Lecb_ret:
xorps %xmm0,%xmm0
pxor %xmm1,%xmm1
.byte 0xf3,0xc3
.size aesni_ecb_encrypt,.-aesni_ecb_encrypt
.globl aesni_ccm64_encrypt_blocks
@ -853,7 +885,13 @@ aesni_ccm64_encrypt_blocks:
leaq 16(%rsi),%rsi
jnz .Lccm64_enc_outer
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
pxor %xmm2,%xmm2
movups %xmm3,(%r9)
pxor %xmm3,%xmm3
pxor %xmm8,%xmm8
pxor %xmm6,%xmm6
.byte 0xf3,0xc3
.size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
.globl aesni_ccm64_decrypt_blocks
@ -944,21 +982,56 @@ aesni_ccm64_decrypt_blocks:
leaq 16(%r11),%r11
jnz .Loop_enc1_6
.byte 102,15,56,221,217
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
pxor %xmm2,%xmm2
movups %xmm3,(%r9)
pxor %xmm3,%xmm3
pxor %xmm8,%xmm8
pxor %xmm6,%xmm6
.byte 0xf3,0xc3
.size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
.globl aesni_ctr32_encrypt_blocks
.type aesni_ctr32_encrypt_blocks,@function
.align 16
aesni_ctr32_encrypt_blocks:
cmpq $1,%rdx
jne .Lctr32_bulk
movups (%r8),%xmm2
movups (%rdi),%xmm3
movl 240(%rcx),%edx
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
.Loop_enc1_7:
.byte 102,15,56,220,209
decl %edx
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
jnz .Loop_enc1_7
.byte 102,15,56,221,209
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
xorps %xmm3,%xmm2
pxor %xmm3,%xmm3
movups %xmm2,(%rsi)
xorps %xmm2,%xmm2
jmp .Lctr32_epilogue
.align 16
.Lctr32_bulk:
leaq (%rsp),%rax
pushq %rbp
subq $128,%rsp
andq $-16,%rsp
leaq -8(%rax),%rbp
cmpq $1,%rdx
je .Lctr32_one_shortcut
movdqu (%r8),%xmm2
movdqu (%rcx),%xmm0
@ -1349,11 +1422,14 @@ aesni_ctr32_encrypt_blocks:
leaq -128(%rcx),%rcx
.Lctr32_tail:
leaq 16(%rcx),%rcx
cmpq $4,%rdx
jb .Lctr32_loop3
je .Lctr32_loop4
shll $4,%eax
movdqa 96(%rsp),%xmm8
pxor %xmm9,%xmm9
@ -1456,30 +1532,33 @@ aesni_ctr32_encrypt_blocks:
movups 32(%rdi),%xmm12
xorps %xmm12,%xmm4
movups %xmm4,32(%rsi)
jmp .Lctr32_done
.align 16
.Lctr32_one_shortcut:
movups (%r8),%xmm2
movups (%rdi),%xmm10
movl 240(%rcx),%eax
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
.Loop_enc1_7:
.byte 102,15,56,220,209
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
jnz .Loop_enc1_7
.byte 102,15,56,221,209
xorps %xmm10,%xmm2
movups %xmm2,(%rsi)
jmp .Lctr32_done
.align 16
.Lctr32_done:
xorps %xmm0,%xmm0
xorl %r11d,%r11d
pxor %xmm1,%xmm1
pxor %xmm2,%xmm2
pxor %xmm3,%xmm3
pxor %xmm4,%xmm4
pxor %xmm5,%xmm5
pxor %xmm6,%xmm6
pxor %xmm7,%xmm7
movaps %xmm0,0(%rsp)
pxor %xmm8,%xmm8
movaps %xmm0,16(%rsp)
pxor %xmm9,%xmm9
movaps %xmm0,32(%rsp)
pxor %xmm10,%xmm10
movaps %xmm0,48(%rsp)
pxor %xmm11,%xmm11
movaps %xmm0,64(%rsp)
pxor %xmm12,%xmm12
movaps %xmm0,80(%rsp)
pxor %xmm13,%xmm13
movaps %xmm0,96(%rsp)
pxor %xmm14,%xmm14
movaps %xmm0,112(%rsp)
pxor %xmm15,%xmm15
leaq (%rbp),%rsp
popq %rbp
.Lctr32_epilogue:
@ -1750,6 +1829,7 @@ aesni_xts_encrypt:
shrl $4,%eax
.Lxts_enc_short:
movl %eax,%r10d
pxor %xmm0,%xmm10
addq $96,%rdx
@ -1778,6 +1858,7 @@ aesni_xts_encrypt:
pxor %xmm12,%xmm4
pxor %xmm13,%xmm5
pxor %xmm14,%xmm6
pxor %xmm7,%xmm7
call _aesni_encrypt6
@ -1920,6 +2001,29 @@ aesni_xts_encrypt:
movups %xmm2,-16(%rsi)
.Lxts_enc_ret:
xorps %xmm0,%xmm0
pxor %xmm1,%xmm1
pxor %xmm2,%xmm2
pxor %xmm3,%xmm3
pxor %xmm4,%xmm4
pxor %xmm5,%xmm5
pxor %xmm6,%xmm6
pxor %xmm7,%xmm7
movaps %xmm0,0(%rsp)
pxor %xmm8,%xmm8
movaps %xmm0,16(%rsp)
pxor %xmm9,%xmm9
movaps %xmm0,32(%rsp)
pxor %xmm10,%xmm10
movaps %xmm0,48(%rsp)
pxor %xmm11,%xmm11
movaps %xmm0,64(%rsp)
pxor %xmm12,%xmm12
movaps %xmm0,80(%rsp)
pxor %xmm13,%xmm13
movaps %xmm0,96(%rsp)
pxor %xmm14,%xmm14
pxor %xmm15,%xmm15
leaq (%rbp),%rsp
popq %rbp
.Lxts_enc_epilogue:
@ -2196,6 +2300,7 @@ aesni_xts_decrypt:
shrl $4,%eax
.Lxts_dec_short:
movl %eax,%r10d
pxor %xmm0,%xmm10
pxor %xmm0,%xmm11
@ -2398,6 +2503,29 @@ aesni_xts_decrypt:
movups %xmm2,(%rsi)
.Lxts_dec_ret:
xorps %xmm0,%xmm0
pxor %xmm1,%xmm1
pxor %xmm2,%xmm2
pxor %xmm3,%xmm3
pxor %xmm4,%xmm4
pxor %xmm5,%xmm5
pxor %xmm6,%xmm6
pxor %xmm7,%xmm7
movaps %xmm0,0(%rsp)
pxor %xmm8,%xmm8
movaps %xmm0,16(%rsp)
pxor %xmm9,%xmm9
movaps %xmm0,32(%rsp)
pxor %xmm10,%xmm10
movaps %xmm0,48(%rsp)
pxor %xmm11,%xmm11
movaps %xmm0,64(%rsp)
pxor %xmm12,%xmm12
movaps %xmm0,80(%rsp)
pxor %xmm13,%xmm13
movaps %xmm0,96(%rsp)
pxor %xmm14,%xmm14
pxor %xmm15,%xmm15
leaq (%rbp),%rsp
popq %rbp
.Lxts_dec_epilogue:
@ -2446,7 +2574,11 @@ aesni_cbc_encrypt:
jnc .Lcbc_enc_loop
addq $16,%rdx
jnz .Lcbc_enc_tail
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
movups %xmm2,(%r8)
pxor %xmm2,%xmm2
pxor %xmm3,%xmm3
jmp .Lcbc_ret
.Lcbc_enc_tail:
@ -2466,6 +2598,35 @@ aesni_cbc_encrypt:
.align 16
.Lcbc_decrypt:
cmpq $16,%rdx
jne .Lcbc_decrypt_bulk
movdqu (%rdi),%xmm2
movdqu (%r8),%xmm3
movdqa %xmm2,%xmm4
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
.Loop_dec1_16:
.byte 102,15,56,222,209
decl %r10d
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
jnz .Loop_dec1_16
.byte 102,15,56,223,209
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
movdqu %xmm4,(%r8)
xorps %xmm3,%xmm2
pxor %xmm3,%xmm3
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
jmp .Lcbc_ret
.align 16
.Lcbc_decrypt_bulk:
leaq (%rsp),%rax
pushq %rbp
subq $16,%rsp
@ -2702,7 +2863,7 @@ aesni_cbc_encrypt:
movaps %xmm9,%xmm2
leaq -112(%rcx),%rcx
addq $112,%rdx
jle .Lcbc_dec_tail_collected
jle .Lcbc_dec_clear_tail_collected
movups %xmm9,(%rsi)
leaq 16(%rsi),%rsi
cmpq $80,%rdx
@ -2721,14 +2882,19 @@ aesni_cbc_encrypt:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
pxor %xmm5,%xmm5
pxor %xmm15,%xmm7
movdqu %xmm6,64(%rsi)
pxor %xmm6,%xmm6
leaq 80(%rsi),%rsi
movdqa %xmm7,%xmm2
pxor %xmm7,%xmm7
jmp .Lcbc_dec_tail_collected
.align 16
@ -2743,16 +2909,23 @@ aesni_cbc_encrypt:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
pxor %xmm5,%xmm5
pxor %xmm15,%xmm7
movdqu %xmm6,64(%rsi)
pxor %xmm6,%xmm6
pxor %xmm9,%xmm8
movdqu %xmm7,80(%rsi)
pxor %xmm7,%xmm7
leaq 96(%rsi),%rsi
movdqa %xmm8,%xmm2
pxor %xmm8,%xmm8
pxor %xmm9,%xmm9
jmp .Lcbc_dec_tail_collected
.align 16
@ -2796,7 +2969,7 @@ aesni_cbc_encrypt:
movdqa %xmm7,%xmm2
addq $80,%rdx
jle .Lcbc_dec_tail_collected
jle .Lcbc_dec_clear_tail_collected
movups %xmm7,(%rsi)
leaq 16(%rsi),%rsi
@ -2831,12 +3004,17 @@ aesni_cbc_encrypt:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
pxor %xmm5,%xmm5
leaq 64(%rsi),%rsi
movdqa %xmm6,%xmm2
pxor %xmm6,%xmm6
pxor %xmm7,%xmm7
subq $16,%rdx
jmp .Lcbc_dec_tail_collected
@ -2847,12 +3025,12 @@ aesni_cbc_encrypt:
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
.Loop_dec1_16:
.Loop_dec1_17:
.byte 102,15,56,222,209
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
jnz .Loop_dec1_16
jnz .Loop_dec1_17
.byte 102,15,56,223,209
xorps %xmm10,%xmm2
movaps %xmm11,%xmm10
@ -2866,6 +3044,7 @@ aesni_cbc_encrypt:
pxor %xmm11,%xmm3
movdqu %xmm2,(%rsi)
movdqa %xmm3,%xmm2
pxor %xmm3,%xmm3
leaq 16(%rsi),%rsi
jmp .Lcbc_dec_tail_collected
.align 16
@ -2878,7 +3057,9 @@ aesni_cbc_encrypt:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
pxor %xmm3,%xmm3
movdqa %xmm4,%xmm2
pxor %xmm4,%xmm4
leaq 32(%rsi),%rsi
jmp .Lcbc_dec_tail_collected
.align 16
@ -2891,29 +3072,45 @@ aesni_cbc_encrypt:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
pxor %xmm4,%xmm4
movdqa %xmm5,%xmm2
pxor %xmm5,%xmm5
leaq 48(%rsi),%rsi
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_clear_tail_collected:
pxor %xmm3,%xmm3
pxor %xmm4,%xmm4
pxor %xmm5,%xmm5
pxor %xmm6,%xmm6
pxor %xmm7,%xmm7
pxor %xmm8,%xmm8
pxor %xmm9,%xmm9
.Lcbc_dec_tail_collected:
movups %xmm10,(%r8)
andq $15,%rdx
jnz .Lcbc_dec_tail_partial
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
jmp .Lcbc_dec_ret
.align 16
.Lcbc_dec_tail_partial:
movaps %xmm2,(%rsp)
pxor %xmm2,%xmm2
movq $16,%rcx
movq %rsi,%rdi
subq %rdx,%rcx
leaq (%rsp),%rsi
.long 0x9066A4F3
movdqa %xmm2,(%rsp)
.Lcbc_dec_ret:
xorps %xmm0,%xmm0
pxor %xmm1,%xmm1
leaq (%rbp),%rsp
popq %rbp
.Lcbc_ret:
@ -2951,7 +3148,9 @@ aesni_set_decrypt_key:
movups (%rdx),%xmm0
.byte 102,15,56,219,192
pxor %xmm1,%xmm1
movups %xmm0,(%rdi)
pxor %xmm0,%xmm0
.Ldec_key_ret:
addq $8,%rsp
.byte 0xf3,0xc3
@ -2969,8 +3168,10 @@ __aesni_set_encrypt_key:
testq %rdx,%rdx
jz .Lenc_key_ret
movl $268437504,%r10d
movups (%rdi),%xmm0
xorps %xmm4,%xmm4
andl OPENSSL_ia32cap_P+4(%rip),%r10d
leaq 16(%rdx),%rax
cmpl $256,%esi
je .L14rounds
@ -2981,6 +3182,9 @@ __aesni_set_encrypt_key:
.L10rounds:
movl $9,%esi
cmpl $268435456,%r10d
je .L10rounds_alt
movups %xmm0,(%rdx)
.byte 102,15,58,223,200,1
call .Lkey_expansion_128_cold
@ -3007,10 +3211,80 @@ __aesni_set_encrypt_key:
xorl %eax,%eax
jmp .Lenc_key_ret
.align 16
.L10rounds_alt:
movdqa .Lkey_rotate(%rip),%xmm5
movl $8,%r10d
movdqa .Lkey_rcon1(%rip),%xmm4
movdqa %xmm0,%xmm2
movdqu %xmm0,(%rdx)
jmp .Loop_key128
.align 16
.Loop_key128:
.byte 102,15,56,0,197
.byte 102,15,56,221,196
pslld $1,%xmm4
leaq 16(%rax),%rax
movdqa %xmm2,%xmm3
pslldq $4,%xmm2
pxor %xmm2,%xmm3
pslldq $4,%xmm2
pxor %xmm2,%xmm3
pslldq $4,%xmm2
pxor %xmm3,%xmm2
pxor %xmm2,%xmm0
movdqu %xmm0,-16(%rax)
movdqa %xmm0,%xmm2
decl %r10d
jnz .Loop_key128
movdqa .Lkey_rcon1b(%rip),%xmm4
.byte 102,15,56,0,197
.byte 102,15,56,221,196
pslld $1,%xmm4
movdqa %xmm2,%xmm3
pslldq $4,%xmm2
pxor %xmm2,%xmm3
pslldq $4,%xmm2
pxor %xmm2,%xmm3
pslldq $4,%xmm2
pxor %xmm3,%xmm2
pxor %xmm2,%xmm0
movdqu %xmm0,(%rax)
movdqa %xmm0,%xmm2
.byte 102,15,56,0,197
.byte 102,15,56,221,196
movdqa %xmm2,%xmm3
pslldq $4,%xmm2
pxor %xmm2,%xmm3
pslldq $4,%xmm2
pxor %xmm2,%xmm3
pslldq $4,%xmm2
pxor %xmm3,%xmm2
pxor %xmm2,%xmm0
movdqu %xmm0,16(%rax)
movl %esi,96(%rax)
xorl %eax,%eax
jmp .Lenc_key_ret
.align 16
.L12rounds:
movq 16(%rdi),%xmm2
movl $11,%esi
cmpl $268435456,%r10d
je .L12rounds_alt
movups %xmm0,(%rdx)
.byte 102,15,58,223,202,1
call .Lkey_expansion_192a_cold
@ -3033,11 +3307,55 @@ __aesni_set_encrypt_key:
xorq %rax,%rax
jmp .Lenc_key_ret
.align 16
.L12rounds_alt:
movdqa .Lkey_rotate192(%rip),%xmm5
movdqa .Lkey_rcon1(%rip),%xmm4
movl $8,%r10d
movdqu %xmm0,(%rdx)
jmp .Loop_key192
.align 16
.Loop_key192:
movq %xmm2,0(%rax)
movdqa %xmm2,%xmm1
.byte 102,15,56,0,213
.byte 102,15,56,221,212
pslld $1,%xmm4
leaq 24(%rax),%rax
movdqa %xmm0,%xmm3
pslldq $4,%xmm0
pxor %xmm0,%xmm3
pslldq $4,%xmm0
pxor %xmm0,%xmm3
pslldq $4,%xmm0
pxor %xmm3,%xmm0
pshufd $255,%xmm0,%xmm3
pxor %xmm1,%xmm3
pslldq $4,%xmm1
pxor %xmm1,%xmm3
pxor %xmm2,%xmm0
pxor %xmm3,%xmm2
movdqu %xmm0,-16(%rax)
decl %r10d
jnz .Loop_key192
movl %esi,32(%rax)
xorl %eax,%eax
jmp .Lenc_key_ret
.align 16
.L14rounds:
movups 16(%rdi),%xmm2
movl $13,%esi
leaq 16(%rax),%rax
cmpl $268435456,%r10d
je .L14rounds_alt
movups %xmm0,(%rdx)
movups %xmm2,16(%rdx)
.byte 102,15,58,223,202,1
@ -3071,10 +3389,70 @@ __aesni_set_encrypt_key:
xorq %rax,%rax
jmp .Lenc_key_ret
.align 16
.L14rounds_alt:
movdqa .Lkey_rotate(%rip),%xmm5
movdqa .Lkey_rcon1(%rip),%xmm4
movl $7,%r10d
movdqu %xmm0,0(%rdx)
movdqa %xmm2,%xmm1
movdqu %xmm2,16(%rdx)
jmp .Loop_key256
.align 16
.Loop_key256:
.byte 102,15,56,0,213
.byte 102,15,56,221,212
movdqa %xmm0,%xmm3
pslldq $4,%xmm0
pxor %xmm0,%xmm3
pslldq $4,%xmm0
pxor %xmm0,%xmm3
pslldq $4,%xmm0
pxor %xmm3,%xmm0
pslld $1,%xmm4
pxor %xmm2,%xmm0
movdqu %xmm0,(%rax)
decl %r10d
jz .Ldone_key256
pshufd $255,%xmm0,%xmm2
pxor %xmm3,%xmm3
.byte 102,15,56,221,211
movdqa %xmm1,%xmm3
pslldq $4,%xmm1
pxor %xmm1,%xmm3
pslldq $4,%xmm1
pxor %xmm1,%xmm3
pslldq $4,%xmm1
pxor %xmm3,%xmm1
pxor %xmm1,%xmm2
movdqu %xmm2,16(%rax)
leaq 32(%rax),%rax
movdqa %xmm2,%xmm1
jmp .Loop_key256
.Ldone_key256:
movl %esi,16(%rax)
xorl %eax,%eax
jmp .Lenc_key_ret
.align 16
.Lbad_keybits:
movq $-2,%rax
.Lenc_key_ret:
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
pxor %xmm2,%xmm2
pxor %xmm3,%xmm3
pxor %xmm4,%xmm4
pxor %xmm5,%xmm5
addq $8,%rsp
.byte 0xf3,0xc3
.LSEH_end_set_encrypt_key:
@ -3160,6 +3538,14 @@ __aesni_set_encrypt_key:
.long 0x87,0,1,0
.Lincrement1:
.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.Lkey_rotate:
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
.Lkey_rotate192:
.long 0x04070605,0x04070605,0x04070605,0x04070605
.Lkey_rcon1:
.long 1,1,1,1
.Lkey_rcon1b:
.long 0x1b,0x1b,0x1b,0x1b
.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 64

13
deps/openssl/asm_obsolete/x64-elf-gas/bn/x86_64-mont5.s

@ -1755,11 +1755,16 @@ bn_from_mont8x:
.type bn_get_bits5,@function
.align 16
bn_get_bits5:
movq %rdi,%r10
leaq 0(%rdi),%r10
leaq 1(%rdi),%r11
movl %esi,%ecx
shrl $3,%esi
movzwl (%r10,%rsi,1),%eax
andl $7,%ecx
shrl $4,%esi
andl $15,%ecx
leal -8(%rcx),%eax
cmpl $11,%ecx
cmovaq %r11,%r10
cmoval %eax,%ecx
movzwl (%r10,%rsi,2),%eax
shrl %cl,%eax
andl $31,%eax
.byte 0xf3,0xc3

502
deps/openssl/asm_obsolete/x64-macosx-gas/aes/aesni-x86_64.s

@ -17,7 +17,10 @@ L$oop_enc1_1:
leaq 16(%rdx),%rdx
jnz L$oop_enc1_1
.byte 102,15,56,221,209
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
.byte 0xf3,0xc3
@ -38,7 +41,10 @@ L$oop_dec1_2:
leaq 16(%rdx),%rdx
jnz L$oop_dec1_2
.byte 102,15,56,223,209
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
.byte 0xf3,0xc3
@ -264,21 +270,18 @@ _aesni_encrypt6:
pxor %xmm0,%xmm6
.byte 102,15,56,220,225
pxor %xmm0,%xmm7
movups (%rcx,%rax,1),%xmm0
addq $16,%rax
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
movups -16(%rcx,%rax,1),%xmm0
jmp L$enc_loop6_enter
.p2align 4
L$enc_loop6:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
L$enc_loop6_enter:
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
L$enc_loop6_enter:
movups (%rcx,%rax,1),%xmm1
addq $32,%rax
.byte 102,15,56,220,208
@ -321,21 +324,18 @@ _aesni_decrypt6:
pxor %xmm0,%xmm6
.byte 102,15,56,222,225
pxor %xmm0,%xmm7
movups (%rcx,%rax,1),%xmm0
addq $16,%rax
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
movups -16(%rcx,%rax,1),%xmm0
jmp L$dec_loop6_enter
.p2align 4
L$dec_loop6:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
L$dec_loop6_enter:
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
L$dec_loop6_enter:
movups (%rcx,%rax,1),%xmm1
addq $32,%rax
.byte 102,15,56,222,208
@ -375,23 +375,18 @@ _aesni_encrypt8:
leaq 32(%rcx,%rax,1),%rcx
negq %rax
.byte 102,15,56,220,209
addq $16,%rax
pxor %xmm0,%xmm7
.byte 102,15,56,220,217
pxor %xmm0,%xmm8
.byte 102,15,56,220,217
pxor %xmm0,%xmm9
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
movups -16(%rcx,%rax,1),%xmm0
jmp L$enc_loop8_enter
movups (%rcx,%rax,1),%xmm0
addq $16,%rax
jmp L$enc_loop8_inner
.p2align 4
L$enc_loop8:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
L$enc_loop8_inner:
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
@ -444,23 +439,18 @@ _aesni_decrypt8:
leaq 32(%rcx,%rax,1),%rcx
negq %rax
.byte 102,15,56,222,209
addq $16,%rax
pxor %xmm0,%xmm7
.byte 102,15,56,222,217
pxor %xmm0,%xmm8
.byte 102,15,56,222,217
pxor %xmm0,%xmm9
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
movups -16(%rcx,%rax,1),%xmm0
jmp L$dec_loop8_enter
movups (%rcx,%rax,1),%xmm0
addq $16,%rax
jmp L$dec_loop8_inner
.p2align 4
L$dec_loop8:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
L$dec_loop8_inner:
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
@ -587,6 +577,7 @@ L$ecb_enc_tail:
movups 80(%rdi),%xmm7
je L$ecb_enc_six
movdqu 96(%rdi),%xmm8
xorps %xmm9,%xmm9
call _aesni_encrypt8
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
@ -700,15 +691,23 @@ L$ecb_dec_loop8_enter:
jnc L$ecb_dec_loop8
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
movq %r11,%rcx
movups %xmm3,16(%rsi)
pxor %xmm3,%xmm3
movl %r10d,%eax
movups %xmm4,32(%rsi)
pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
pxor %xmm7,%xmm7
movups %xmm8,96(%rsi)
pxor %xmm8,%xmm8
movups %xmm9,112(%rsi)
pxor %xmm9,%xmm9
leaq 128(%rsi),%rsi
addq $128,%rdx
jz L$ecb_ret
@ -731,14 +730,23 @@ L$ecb_dec_tail:
je L$ecb_dec_six
movups 96(%rdi),%xmm8
movups (%rcx),%xmm0
xorps %xmm9,%xmm9
call _aesni_decrypt8
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
pxor %xmm7,%xmm7
movups %xmm8,96(%rsi)
pxor %xmm8,%xmm8
pxor %xmm9,%xmm9
jmp L$ecb_ret
.p2align 4
L$ecb_dec_one:
@ -754,49 +762,73 @@ L$oop_dec1_4:
jnz L$oop_dec1_4
.byte 102,15,56,223,209
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
jmp L$ecb_ret
.p2align 4
L$ecb_dec_two:
call _aesni_decrypt2
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
pxor %xmm3,%xmm3
jmp L$ecb_ret
.p2align 4
L$ecb_dec_three:
call _aesni_decrypt3
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
pxor %xmm4,%xmm4
jmp L$ecb_ret
.p2align 4
L$ecb_dec_four:
call _aesni_decrypt4
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
pxor %xmm5,%xmm5
jmp L$ecb_ret
.p2align 4
L$ecb_dec_five:
xorps %xmm7,%xmm7
call _aesni_decrypt6
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
pxor %xmm6,%xmm6
pxor %xmm7,%xmm7
jmp L$ecb_ret
.p2align 4
L$ecb_dec_six:
call _aesni_decrypt6
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
movups %xmm3,16(%rsi)
pxor %xmm3,%xmm3
movups %xmm4,32(%rsi)
pxor %xmm4,%xmm4
movups %xmm5,48(%rsi)
pxor %xmm5,%xmm5
movups %xmm6,64(%rsi)
pxor %xmm6,%xmm6
movups %xmm7,80(%rsi)
pxor %xmm7,%xmm7
L$ecb_ret:
xorps %xmm0,%xmm0
pxor %xmm1,%xmm1
.byte 0xf3,0xc3
.globl _aesni_ccm64_encrypt_blocks
@ -853,7 +885,13 @@ L$ccm64_enc2_loop:
leaq 16(%rsi),%rsi
jnz L$ccm64_enc_outer
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
pxor %xmm2,%xmm2
movups %xmm3,(%r9)
pxor %xmm3,%xmm3
pxor %xmm8,%xmm8
pxor %xmm6,%xmm6
.byte 0xf3,0xc3
.globl _aesni_ccm64_decrypt_blocks
@ -944,21 +982,56 @@ L$oop_enc1_6:
leaq 16(%r11),%r11
jnz L$oop_enc1_6
.byte 102,15,56,221,217
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
pxor %xmm2,%xmm2
movups %xmm3,(%r9)
pxor %xmm3,%xmm3
pxor %xmm8,%xmm8
pxor %xmm6,%xmm6
.byte 0xf3,0xc3
.globl _aesni_ctr32_encrypt_blocks
.p2align 4
_aesni_ctr32_encrypt_blocks:
cmpq $1,%rdx
jne L$ctr32_bulk
movups (%r8),%xmm2
movups (%rdi),%xmm3
movl 240(%rcx),%edx
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
L$oop_enc1_7:
.byte 102,15,56,220,209
decl %edx
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
jnz L$oop_enc1_7
.byte 102,15,56,221,209
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
xorps %xmm3,%xmm2
pxor %xmm3,%xmm3
movups %xmm2,(%rsi)
xorps %xmm2,%xmm2
jmp L$ctr32_epilogue
.p2align 4
L$ctr32_bulk:
leaq (%rsp),%rax
pushq %rbp
subq $128,%rsp
andq $-16,%rsp
leaq -8(%rax),%rbp
cmpq $1,%rdx
je L$ctr32_one_shortcut
movdqu (%r8),%xmm2
movdqu (%rcx),%xmm0
@ -1349,11 +1422,14 @@ L$ctr32_enc_done:
leaq -128(%rcx),%rcx
L$ctr32_tail:
leaq 16(%rcx),%rcx
cmpq $4,%rdx
jb L$ctr32_loop3
je L$ctr32_loop4
shll $4,%eax
movdqa 96(%rsp),%xmm8
pxor %xmm9,%xmm9
@ -1456,30 +1532,33 @@ L$ctr32_loop3:
movups 32(%rdi),%xmm12
xorps %xmm12,%xmm4
movups %xmm4,32(%rsi)
jmp L$ctr32_done
.p2align 4
L$ctr32_one_shortcut:
movups (%r8),%xmm2
movups (%rdi),%xmm10
movl 240(%rcx),%eax
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
L$oop_enc1_7:
.byte 102,15,56,220,209
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
jnz L$oop_enc1_7
.byte 102,15,56,221,209
xorps %xmm10,%xmm2
movups %xmm2,(%rsi)
jmp L$ctr32_done
.p2align 4
L$ctr32_done:
xorps %xmm0,%xmm0
xorl %r11d,%r11d
pxor %xmm1,%xmm1
pxor %xmm2,%xmm2
pxor %xmm3,%xmm3
pxor %xmm4,%xmm4
pxor %xmm5,%xmm5
pxor %xmm6,%xmm6
pxor %xmm7,%xmm7
movaps %xmm0,0(%rsp)
pxor %xmm8,%xmm8
movaps %xmm0,16(%rsp)
pxor %xmm9,%xmm9
movaps %xmm0,32(%rsp)
pxor %xmm10,%xmm10
movaps %xmm0,48(%rsp)
pxor %xmm11,%xmm11
movaps %xmm0,64(%rsp)
pxor %xmm12,%xmm12
movaps %xmm0,80(%rsp)
pxor %xmm13,%xmm13
movaps %xmm0,96(%rsp)
pxor %xmm14,%xmm14
movaps %xmm0,112(%rsp)
pxor %xmm15,%xmm15
leaq (%rbp),%rsp
popq %rbp
L$ctr32_epilogue:
@ -1750,6 +1829,7 @@ L$xts_enc_loop6:
shrl $4,%eax
L$xts_enc_short:
movl %eax,%r10d
pxor %xmm0,%xmm10
addq $96,%rdx
@ -1778,6 +1858,7 @@ L$xts_enc_short:
pxor %xmm12,%xmm4
pxor %xmm13,%xmm5
pxor %xmm14,%xmm6
pxor %xmm7,%xmm7
call _aesni_encrypt6
@ -1920,6 +2001,29 @@ L$oop_enc1_10:
movups %xmm2,-16(%rsi)
L$xts_enc_ret:
xorps %xmm0,%xmm0
pxor %xmm1,%xmm1
pxor %xmm2,%xmm2
pxor %xmm3,%xmm3
pxor %xmm4,%xmm4
pxor %xmm5,%xmm5
pxor %xmm6,%xmm6
pxor %xmm7,%xmm7
movaps %xmm0,0(%rsp)
pxor %xmm8,%xmm8
movaps %xmm0,16(%rsp)
pxor %xmm9,%xmm9
movaps %xmm0,32(%rsp)
pxor %xmm10,%xmm10
movaps %xmm0,48(%rsp)
pxor %xmm11,%xmm11
movaps %xmm0,64(%rsp)
pxor %xmm12,%xmm12
movaps %xmm0,80(%rsp)
pxor %xmm13,%xmm13
movaps %xmm0,96(%rsp)
pxor %xmm14,%xmm14
pxor %xmm15,%xmm15
leaq (%rbp),%rsp
popq %rbp
L$xts_enc_epilogue:
@ -2196,6 +2300,7 @@ L$xts_dec_loop6:
shrl $4,%eax
L$xts_dec_short:
movl %eax,%r10d
pxor %xmm0,%xmm10
pxor %xmm0,%xmm11
@ -2398,6 +2503,29 @@ L$oop_dec1_14:
movups %xmm2,(%rsi)
L$xts_dec_ret:
xorps %xmm0,%xmm0
pxor %xmm1,%xmm1
pxor %xmm2,%xmm2
pxor %xmm3,%xmm3
pxor %xmm4,%xmm4
pxor %xmm5,%xmm5
pxor %xmm6,%xmm6
pxor %xmm7,%xmm7
movaps %xmm0,0(%rsp)
pxor %xmm8,%xmm8
movaps %xmm0,16(%rsp)
pxor %xmm9,%xmm9
movaps %xmm0,32(%rsp)
pxor %xmm10,%xmm10
movaps %xmm0,48(%rsp)
pxor %xmm11,%xmm11
movaps %xmm0,64(%rsp)
pxor %xmm12,%xmm12
movaps %xmm0,80(%rsp)
pxor %xmm13,%xmm13
movaps %xmm0,96(%rsp)
pxor %xmm14,%xmm14
pxor %xmm15,%xmm15
leaq (%rbp),%rsp
popq %rbp
L$xts_dec_epilogue:
@ -2446,7 +2574,11 @@ L$oop_enc1_15:
jnc L$cbc_enc_loop
addq $16,%rdx
jnz L$cbc_enc_tail
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
movups %xmm2,(%r8)
pxor %xmm2,%xmm2
pxor %xmm3,%xmm3
jmp L$cbc_ret
L$cbc_enc_tail:
@ -2466,6 +2598,35 @@ L$cbc_enc_tail:
.p2align 4
L$cbc_decrypt:
cmpq $16,%rdx
jne L$cbc_decrypt_bulk
movdqu (%rdi),%xmm2
movdqu (%r8),%xmm3
movdqa %xmm2,%xmm4
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
L$oop_dec1_16:
.byte 102,15,56,222,209
decl %r10d
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
jnz L$oop_dec1_16
.byte 102,15,56,223,209
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
movdqu %xmm4,(%r8)
xorps %xmm3,%xmm2
pxor %xmm3,%xmm3
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
jmp L$cbc_ret
.p2align 4
L$cbc_decrypt_bulk:
leaq (%rsp),%rax
pushq %rbp
subq $16,%rsp
@ -2702,7 +2863,7 @@ L$cbc_dec_done:
movaps %xmm9,%xmm2
leaq -112(%rcx),%rcx
addq $112,%rdx
jle L$cbc_dec_tail_collected
jle L$cbc_dec_clear_tail_collected
movups %xmm9,(%rsi)
leaq 16(%rsi),%rsi
cmpq $80,%rdx
@ -2721,14 +2882,19 @@ L$cbc_dec_six_or_seven:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
pxor %xmm5,%xmm5
pxor %xmm15,%xmm7
movdqu %xmm6,64(%rsi)
pxor %xmm6,%xmm6
leaq 80(%rsi),%rsi
movdqa %xmm7,%xmm2
pxor %xmm7,%xmm7
jmp L$cbc_dec_tail_collected
.p2align 4
@ -2743,16 +2909,23 @@ L$cbc_dec_seven:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
pxor %xmm5,%xmm5
pxor %xmm15,%xmm7
movdqu %xmm6,64(%rsi)
pxor %xmm6,%xmm6
pxor %xmm9,%xmm8
movdqu %xmm7,80(%rsi)
pxor %xmm7,%xmm7
leaq 96(%rsi),%rsi
movdqa %xmm8,%xmm2
pxor %xmm8,%xmm8
pxor %xmm9,%xmm9
jmp L$cbc_dec_tail_collected
.p2align 4
@ -2796,7 +2969,7 @@ L$cbc_dec_loop6_enter:
movdqa %xmm7,%xmm2
addq $80,%rdx
jle L$cbc_dec_tail_collected
jle L$cbc_dec_clear_tail_collected
movups %xmm7,(%rsi)
leaq 16(%rsi),%rsi
@ -2831,12 +3004,17 @@ L$cbc_dec_tail:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
pxor %xmm4,%xmm4
pxor %xmm14,%xmm6
movdqu %xmm5,48(%rsi)
pxor %xmm5,%xmm5
leaq 64(%rsi),%rsi
movdqa %xmm6,%xmm2
pxor %xmm6,%xmm6
pxor %xmm7,%xmm7
subq $16,%rdx
jmp L$cbc_dec_tail_collected
@ -2847,12 +3025,12 @@ L$cbc_dec_one:
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
L$oop_dec1_16:
L$oop_dec1_17:
.byte 102,15,56,222,209
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
jnz L$oop_dec1_16
jnz L$oop_dec1_17
.byte 102,15,56,223,209
xorps %xmm10,%xmm2
movaps %xmm11,%xmm10
@ -2866,6 +3044,7 @@ L$cbc_dec_two:
pxor %xmm11,%xmm3
movdqu %xmm2,(%rsi)
movdqa %xmm3,%xmm2
pxor %xmm3,%xmm3
leaq 16(%rsi),%rsi
jmp L$cbc_dec_tail_collected
.p2align 4
@ -2878,7 +3057,9 @@ L$cbc_dec_three:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
pxor %xmm3,%xmm3
movdqa %xmm4,%xmm2
pxor %xmm4,%xmm4
leaq 32(%rsi),%rsi
jmp L$cbc_dec_tail_collected
.p2align 4
@ -2891,29 +3072,45 @@ L$cbc_dec_four:
movdqu %xmm2,(%rsi)
pxor %xmm12,%xmm4
movdqu %xmm3,16(%rsi)
pxor %xmm3,%xmm3
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
pxor %xmm4,%xmm4
movdqa %xmm5,%xmm2
pxor %xmm5,%xmm5
leaq 48(%rsi),%rsi
jmp L$cbc_dec_tail_collected
.p2align 4
L$cbc_dec_clear_tail_collected:
pxor %xmm3,%xmm3
pxor %xmm4,%xmm4
pxor %xmm5,%xmm5
pxor %xmm6,%xmm6
pxor %xmm7,%xmm7
pxor %xmm8,%xmm8
pxor %xmm9,%xmm9
L$cbc_dec_tail_collected:
movups %xmm10,(%r8)
andq $15,%rdx
jnz L$cbc_dec_tail_partial
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
jmp L$cbc_dec_ret
.p2align 4
L$cbc_dec_tail_partial:
movaps %xmm2,(%rsp)
pxor %xmm2,%xmm2
movq $16,%rcx
movq %rsi,%rdi
subq %rdx,%rcx
leaq (%rsp),%rsi
.long 0x9066A4F3
movdqa %xmm2,(%rsp)
L$cbc_dec_ret:
xorps %xmm0,%xmm0
pxor %xmm1,%xmm1
leaq (%rbp),%rsp
popq %rbp
L$cbc_ret:
@ -2951,7 +3148,9 @@ L$dec_key_inverse:
movups (%rdx),%xmm0
.byte 102,15,56,219,192
pxor %xmm1,%xmm1
movups %xmm0,(%rdi)
pxor %xmm0,%xmm0
L$dec_key_ret:
addq $8,%rsp
.byte 0xf3,0xc3
@ -2969,8 +3168,10 @@ __aesni_set_encrypt_key:
testq %rdx,%rdx
jz L$enc_key_ret
movl $268437504,%r10d
movups (%rdi),%xmm0
xorps %xmm4,%xmm4
andl _OPENSSL_ia32cap_P+4(%rip),%r10d
leaq 16(%rdx),%rax
cmpl $256,%esi
je L$14rounds
@ -2981,6 +3182,9 @@ __aesni_set_encrypt_key:
L$10rounds:
movl $9,%esi
cmpl $268435456,%r10d
je L$10rounds_alt
movups %xmm0,(%rdx)
.byte 102,15,58,223,200,1
call L$key_expansion_128_cold
@ -3007,10 +3211,80 @@ L$10rounds:
xorl %eax,%eax
jmp L$enc_key_ret
.p2align 4
L$10rounds_alt:
movdqa L$key_rotate(%rip),%xmm5
movl $8,%r10d
movdqa L$key_rcon1(%rip),%xmm4
movdqa %xmm0,%xmm2
movdqu %xmm0,(%rdx)
jmp L$oop_key128
.p2align 4
L$oop_key128:
.byte 102,15,56,0,197
.byte 102,15,56,221,196
pslld $1,%xmm4
leaq 16(%rax),%rax
movdqa %xmm2,%xmm3
pslldq $4,%xmm2
pxor %xmm2,%xmm3
pslldq $4,%xmm2
pxor %xmm2,%xmm3
pslldq $4,%xmm2
pxor %xmm3,%xmm2
pxor %xmm2,%xmm0
movdqu %xmm0,-16(%rax)
movdqa %xmm0,%xmm2
decl %r10d
jnz L$oop_key128
movdqa L$key_rcon1b(%rip),%xmm4
.byte 102,15,56,0,197
.byte 102,15,56,221,196
pslld $1,%xmm4
movdqa %xmm2,%xmm3
pslldq $4,%xmm2
pxor %xmm2,%xmm3
pslldq $4,%xmm2
pxor %xmm2,%xmm3
pslldq $4,%xmm2
pxor %xmm3,%xmm2
pxor %xmm2,%xmm0
movdqu %xmm0,(%rax)
movdqa %xmm0,%xmm2
.byte 102,15,56,0,197
.byte 102,15,56,221,196
movdqa %xmm2,%xmm3
pslldq $4,%xmm2
pxor %xmm2,%xmm3
pslldq $4,%xmm2
pxor %xmm2,%xmm3
pslldq $4,%xmm2
pxor %xmm3,%xmm2
pxor %xmm2,%xmm0
movdqu %xmm0,16(%rax)
movl %esi,96(%rax)
xorl %eax,%eax
jmp L$enc_key_ret
.p2align 4
L$12rounds:
movq 16(%rdi),%xmm2
movl $11,%esi
cmpl $268435456,%r10d
je L$12rounds_alt
movups %xmm0,(%rdx)
.byte 102,15,58,223,202,1
call L$key_expansion_192a_cold
@ -3033,11 +3307,55 @@ L$12rounds:
xorq %rax,%rax
jmp L$enc_key_ret
.p2align 4
L$12rounds_alt:
movdqa L$key_rotate192(%rip),%xmm5
movdqa L$key_rcon1(%rip),%xmm4
movl $8,%r10d
movdqu %xmm0,(%rdx)
jmp L$oop_key192
.p2align 4
L$oop_key192:
movq %xmm2,0(%rax)
movdqa %xmm2,%xmm1
.byte 102,15,56,0,213
.byte 102,15,56,221,212
pslld $1,%xmm4
leaq 24(%rax),%rax
movdqa %xmm0,%xmm3
pslldq $4,%xmm0
pxor %xmm0,%xmm3
pslldq $4,%xmm0
pxor %xmm0,%xmm3
pslldq $4,%xmm0
pxor %xmm3,%xmm0
pshufd $255,%xmm0,%xmm3
pxor %xmm1,%xmm3
pslldq $4,%xmm1
pxor %xmm1,%xmm3
pxor %xmm2,%xmm0
pxor %xmm3,%xmm2
movdqu %xmm0,-16(%rax)
decl %r10d
jnz L$oop_key192
movl %esi,32(%rax)
xorl %eax,%eax
jmp L$enc_key_ret
.p2align 4
L$14rounds:
movups 16(%rdi),%xmm2
movl $13,%esi
leaq 16(%rax),%rax
cmpl $268435456,%r10d
je L$14rounds_alt
movups %xmm0,(%rdx)
movups %xmm2,16(%rdx)
.byte 102,15,58,223,202,1
@ -3071,10 +3389,70 @@ L$14rounds:
xorq %rax,%rax
jmp L$enc_key_ret
.p2align 4
L$14rounds_alt:
movdqa L$key_rotate(%rip),%xmm5
movdqa L$key_rcon1(%rip),%xmm4
movl $7,%r10d
movdqu %xmm0,0(%rdx)
movdqa %xmm2,%xmm1
movdqu %xmm2,16(%rdx)
jmp L$oop_key256
.p2align 4
L$oop_key256:
.byte 102,15,56,0,213
.byte 102,15,56,221,212
movdqa %xmm0,%xmm3
pslldq $4,%xmm0
pxor %xmm0,%xmm3
pslldq $4,%xmm0
pxor %xmm0,%xmm3
pslldq $4,%xmm0
pxor %xmm3,%xmm0
pslld $1,%xmm4
pxor %xmm2,%xmm0
movdqu %xmm0,(%rax)
decl %r10d
jz L$done_key256
pshufd $255,%xmm0,%xmm2
pxor %xmm3,%xmm3
.byte 102,15,56,221,211
movdqa %xmm1,%xmm3
pslldq $4,%xmm1
pxor %xmm1,%xmm3
pslldq $4,%xmm1
pxor %xmm1,%xmm3
pslldq $4,%xmm1
pxor %xmm3,%xmm1
pxor %xmm1,%xmm2
movdqu %xmm2,16(%rax)
leaq 32(%rax),%rax
movdqa %xmm2,%xmm1
jmp L$oop_key256
L$done_key256:
movl %esi,16(%rax)
xorl %eax,%eax
jmp L$enc_key_ret
.p2align 4
L$bad_keybits:
movq $-2,%rax
L$enc_key_ret:
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
pxor %xmm2,%xmm2
pxor %xmm3,%xmm3
pxor %xmm4,%xmm4
pxor %xmm5,%xmm5
addq $8,%rsp
.byte 0xf3,0xc3
L$SEH_end_set_encrypt_key:
@ -3160,6 +3538,14 @@ L$xts_magic:
.long 0x87,0,1,0
L$increment1:
.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
L$key_rotate:
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
L$key_rotate192:
.long 0x04070605,0x04070605,0x04070605,0x04070605
L$key_rcon1:
.long 1,1,1,1
L$key_rcon1b:
.long 0x1b,0x1b,0x1b,0x1b
.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.p2align 6

13
deps/openssl/asm_obsolete/x64-macosx-gas/bn/x86_64-mont5.s

@ -1755,11 +1755,16 @@ L$from_epilogue:
.p2align 4
_bn_get_bits5:
movq %rdi,%r10
leaq 0(%rdi),%r10
leaq 1(%rdi),%r11
movl %esi,%ecx
shrl $3,%esi
movzwl (%r10,%rsi,1),%eax
andl $7,%ecx
shrl $4,%esi
andl $15,%ecx
leal -8(%rcx),%eax
cmpl $11,%ecx
cmovaq %r11,%r10
cmoval %eax,%ecx
movzwl (%r10,%rsi,2),%eax
shrl %cl,%eax
andl $31,%eax
.byte 0xf3,0xc3

71
deps/openssl/asm_obsolete/x64-win32-masm/aes/aesni-sha256-x86_64.asm

@ -60,77 +60,6 @@ DB 54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98
DB 121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108
DB 46,111,114,103,62,0
ALIGN 64
mov rsi,rax
mov rax,QWORD PTR[((64+56))+rax]
lea rax,QWORD PTR[48+rax]
mov rbx,QWORD PTR[((-8))+rax]
mov rbp,QWORD PTR[((-16))+rax]
mov r12,QWORD PTR[((-24))+rax]
mov r13,QWORD PTR[((-32))+rax]
mov r14,QWORD PTR[((-40))+rax]
mov r15,QWORD PTR[((-48))+rax]
mov QWORD PTR[144+r8],rbx
mov QWORD PTR[160+r8],rbp
mov QWORD PTR[216+r8],r12
mov QWORD PTR[224+r8],r13
mov QWORD PTR[232+r8],r14
mov QWORD PTR[240+r8],r15
lea rsi,QWORD PTR[((64+64))+rsi]
lea rdi,QWORD PTR[512+r8]
mov ecx,20
DD 0a548f3fch
$L$in_prologue::
mov rdi,QWORD PTR[8+rax]
mov rsi,QWORD PTR[16+rax]
mov QWORD PTR[152+r8],rax
mov QWORD PTR[168+r8],rsi
mov QWORD PTR[176+r8],rdi
mov rdi,QWORD PTR[40+r9]
mov rsi,r8
mov ecx,154
DD 0a548f3fch
mov rsi,r9
xor rcx,rcx
mov rdx,QWORD PTR[8+rsi]
mov r8,QWORD PTR[rsi]
mov r9,QWORD PTR[16+rsi]
mov r10,QWORD PTR[40+rsi]
lea r11,QWORD PTR[56+rsi]
lea r12,QWORD PTR[24+rsi]
mov QWORD PTR[32+rsp],r10
mov QWORD PTR[40+rsp],r11
mov QWORD PTR[48+rsp],r12
mov QWORD PTR[56+rsp],rcx
call QWORD PTR[__imp_RtlVirtualUnwind]
mov eax,1
add rsp,64
popfq
pop r15
pop r14
pop r13
pop r12
pop rbp
pop rbx
pop rdi
pop rsi
DB 0F3h,0C3h ;repret
.text$ ENDS
.pdata SEGMENT READONLY ALIGN(4)
DD imagerel $L$SEH_begin_aesni_cbc_sha256_enc_xop
DD imagerel $L$SEH_end_aesni_cbc_sha256_enc_xop
DD imagerel $L$SEH_info_aesni_cbc_sha256_enc_xop
DD imagerel $L$SEH_begin_aesni_cbc_sha256_enc_avx
DD imagerel $L$SEH_end_aesni_cbc_sha256_enc_avx
DD imagerel $L$SEH_info_aesni_cbc_sha256_enc_avx
.pdata ENDS
END

522
deps/openssl/asm_obsolete/x64-win32-masm/aes/aesni-x86_64.asm

@ -18,7 +18,10 @@ DB 102,15,56,220,209
lea r8,QWORD PTR[16+r8]
jnz $L$oop_enc1_1
DB 102,15,56,221,209
pxor xmm0,xmm0
pxor xmm1,xmm1
movups XMMWORD PTR[rdx],xmm2
pxor xmm2,xmm2
DB 0F3h,0C3h ;repret
aesni_encrypt ENDP
@ -39,7 +42,10 @@ DB 102,15,56,222,209
lea r8,QWORD PTR[16+r8]
jnz $L$oop_dec1_2
DB 102,15,56,223,209
pxor xmm0,xmm0
pxor xmm1,xmm1
movups XMMWORD PTR[rdx],xmm2
pxor xmm2,xmm2
DB 0F3h,0C3h ;repret
aesni_decrypt ENDP
@ -265,21 +271,18 @@ DB 102,15,56,220,217
pxor xmm6,xmm0
DB 102,15,56,220,225
pxor xmm7,xmm0
movups xmm0,XMMWORD PTR[rax*1+rcx]
add rax,16
DB 102,15,56,220,233
DB 102,15,56,220,241
DB 102,15,56,220,249
movups xmm0,XMMWORD PTR[((-16))+rax*1+rcx]
jmp $L$enc_loop6_enter
ALIGN 16
$L$enc_loop6::
DB 102,15,56,220,209
DB 102,15,56,220,217
DB 102,15,56,220,225
$L$enc_loop6_enter::
DB 102,15,56,220,233
DB 102,15,56,220,241
DB 102,15,56,220,249
$L$enc_loop6_enter::
movups xmm1,XMMWORD PTR[rax*1+rcx]
add rax,32
DB 102,15,56,220,208
@ -322,21 +325,18 @@ DB 102,15,56,222,217
pxor xmm6,xmm0
DB 102,15,56,222,225
pxor xmm7,xmm0
movups xmm0,XMMWORD PTR[rax*1+rcx]
add rax,16
DB 102,15,56,222,233
DB 102,15,56,222,241
DB 102,15,56,222,249
movups xmm0,XMMWORD PTR[((-16))+rax*1+rcx]
jmp $L$dec_loop6_enter
ALIGN 16
$L$dec_loop6::
DB 102,15,56,222,209
DB 102,15,56,222,217
DB 102,15,56,222,225
$L$dec_loop6_enter::
DB 102,15,56,222,233
DB 102,15,56,222,241
DB 102,15,56,222,249
$L$dec_loop6_enter::
movups xmm1,XMMWORD PTR[rax*1+rcx]
add rax,32
DB 102,15,56,222,208
@ -376,23 +376,18 @@ _aesni_encrypt8 PROC PRIVATE
lea rcx,QWORD PTR[32+rax*1+rcx]
neg rax
DB 102,15,56,220,209
add rax,16
pxor xmm7,xmm0
DB 102,15,56,220,217
pxor xmm8,xmm0
DB 102,15,56,220,217
pxor xmm9,xmm0
DB 102,15,56,220,225
DB 102,15,56,220,233
DB 102,15,56,220,241
DB 102,15,56,220,249
DB 102,68,15,56,220,193
DB 102,68,15,56,220,201
movups xmm0,XMMWORD PTR[((-16))+rax*1+rcx]
jmp $L$enc_loop8_enter
movups xmm0,XMMWORD PTR[rax*1+rcx]
add rax,16
jmp $L$enc_loop8_inner
ALIGN 16
$L$enc_loop8::
DB 102,15,56,220,209
DB 102,15,56,220,217
$L$enc_loop8_inner::
DB 102,15,56,220,225
DB 102,15,56,220,233
DB 102,15,56,220,241
@ -445,23 +440,18 @@ _aesni_decrypt8 PROC PRIVATE
lea rcx,QWORD PTR[32+rax*1+rcx]
neg rax
DB 102,15,56,222,209
add rax,16
pxor xmm7,xmm0
DB 102,15,56,222,217
pxor xmm8,xmm0
DB 102,15,56,222,217
pxor xmm9,xmm0
DB 102,15,56,222,225
DB 102,15,56,222,233
DB 102,15,56,222,241
DB 102,15,56,222,249
DB 102,68,15,56,222,193
DB 102,68,15,56,222,201
movups xmm0,XMMWORD PTR[((-16))+rax*1+rcx]
jmp $L$dec_loop8_enter
movups xmm0,XMMWORD PTR[rax*1+rcx]
add rax,16
jmp $L$dec_loop8_inner
ALIGN 16
$L$dec_loop8::
DB 102,15,56,222,209
DB 102,15,56,222,217
$L$dec_loop8_inner::
DB 102,15,56,222,225
DB 102,15,56,222,233
DB 102,15,56,222,241
@ -605,6 +595,7 @@ $L$ecb_enc_tail::
movups xmm7,XMMWORD PTR[80+rdi]
je $L$ecb_enc_six
movdqu xmm8,XMMWORD PTR[96+rdi]
xorps xmm9,xmm9
call _aesni_encrypt8
movups XMMWORD PTR[rsi],xmm2
movups XMMWORD PTR[16+rsi],xmm3
@ -718,15 +709,23 @@ $L$ecb_dec_loop8_enter::
jnc $L$ecb_dec_loop8
movups XMMWORD PTR[rsi],xmm2
pxor xmm2,xmm2
mov rcx,r11
movups XMMWORD PTR[16+rsi],xmm3
pxor xmm3,xmm3
mov eax,r10d
movups XMMWORD PTR[32+rsi],xmm4
pxor xmm4,xmm4
movups XMMWORD PTR[48+rsi],xmm5
pxor xmm5,xmm5
movups XMMWORD PTR[64+rsi],xmm6
pxor xmm6,xmm6
movups XMMWORD PTR[80+rsi],xmm7
pxor xmm7,xmm7
movups XMMWORD PTR[96+rsi],xmm8
pxor xmm8,xmm8
movups XMMWORD PTR[112+rsi],xmm9
pxor xmm9,xmm9
lea rsi,QWORD PTR[128+rsi]
add rdx,080h
jz $L$ecb_ret
@ -749,14 +748,23 @@ $L$ecb_dec_tail::
je $L$ecb_dec_six
movups xmm8,XMMWORD PTR[96+rdi]
movups xmm0,XMMWORD PTR[rcx]
xorps xmm9,xmm9
call _aesni_decrypt8
movups XMMWORD PTR[rsi],xmm2
pxor xmm2,xmm2
movups XMMWORD PTR[16+rsi],xmm3
pxor xmm3,xmm3
movups XMMWORD PTR[32+rsi],xmm4
pxor xmm4,xmm4
movups XMMWORD PTR[48+rsi],xmm5
pxor xmm5,xmm5
movups XMMWORD PTR[64+rsi],xmm6
pxor xmm6,xmm6
movups XMMWORD PTR[80+rsi],xmm7
pxor xmm7,xmm7
movups XMMWORD PTR[96+rsi],xmm8
pxor xmm8,xmm8
pxor xmm9,xmm9
jmp $L$ecb_ret
ALIGN 16
$L$ecb_dec_one::
@ -772,53 +780,81 @@ DB 102,15,56,222,209
jnz $L$oop_dec1_4
DB 102,15,56,223,209
movups XMMWORD PTR[rsi],xmm2
pxor xmm2,xmm2
jmp $L$ecb_ret
ALIGN 16
$L$ecb_dec_two::
call _aesni_decrypt2
movups XMMWORD PTR[rsi],xmm2
pxor xmm2,xmm2
movups XMMWORD PTR[16+rsi],xmm3
pxor xmm3,xmm3
jmp $L$ecb_ret
ALIGN 16
$L$ecb_dec_three::
call _aesni_decrypt3
movups XMMWORD PTR[rsi],xmm2
pxor xmm2,xmm2
movups XMMWORD PTR[16+rsi],xmm3
pxor xmm3,xmm3
movups XMMWORD PTR[32+rsi],xmm4
pxor xmm4,xmm4
jmp $L$ecb_ret
ALIGN 16
$L$ecb_dec_four::
call _aesni_decrypt4
movups XMMWORD PTR[rsi],xmm2
pxor xmm2,xmm2
movups XMMWORD PTR[16+rsi],xmm3
pxor xmm3,xmm3
movups XMMWORD PTR[32+rsi],xmm4
pxor xmm4,xmm4
movups XMMWORD PTR[48+rsi],xmm5
pxor xmm5,xmm5
jmp $L$ecb_ret
ALIGN 16
$L$ecb_dec_five::
xorps xmm7,xmm7
call _aesni_decrypt6
movups XMMWORD PTR[rsi],xmm2
pxor xmm2,xmm2
movups XMMWORD PTR[16+rsi],xmm3
pxor xmm3,xmm3
movups XMMWORD PTR[32+rsi],xmm4
pxor xmm4,xmm4
movups XMMWORD PTR[48+rsi],xmm5
pxor xmm5,xmm5
movups XMMWORD PTR[64+rsi],xmm6
pxor xmm6,xmm6
pxor xmm7,xmm7
jmp $L$ecb_ret
ALIGN 16
$L$ecb_dec_six::
call _aesni_decrypt6
movups XMMWORD PTR[rsi],xmm2
pxor xmm2,xmm2
movups XMMWORD PTR[16+rsi],xmm3
pxor xmm3,xmm3
movups XMMWORD PTR[32+rsi],xmm4
pxor xmm4,xmm4
movups XMMWORD PTR[48+rsi],xmm5
pxor xmm5,xmm5
movups XMMWORD PTR[64+rsi],xmm6
pxor xmm6,xmm6
movups XMMWORD PTR[80+rsi],xmm7
pxor xmm7,xmm7
$L$ecb_ret::
xorps xmm0,xmm0
pxor xmm1,xmm1
movaps xmm6,XMMWORD PTR[rsp]
movaps XMMWORD PTR[rsp],xmm0
movaps xmm7,XMMWORD PTR[16+rsp]
movaps XMMWORD PTR[16+rsp],xmm0
movaps xmm8,XMMWORD PTR[32+rsp]
movaps XMMWORD PTR[32+rsp],xmm0
movaps xmm9,XMMWORD PTR[48+rsp]
movaps XMMWORD PTR[48+rsp],xmm0
lea rsp,QWORD PTR[88+rsp]
$L$ecb_enc_ret::
mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue
@ -898,11 +934,21 @@ DB 102,15,56,0,215
lea rsi,QWORD PTR[16+rsi]
jnz $L$ccm64_enc_outer
pxor xmm0,xmm0
pxor xmm1,xmm1
pxor xmm2,xmm2
movups XMMWORD PTR[r9],xmm3
pxor xmm3,xmm3
pxor xmm8,xmm8
pxor xmm6,xmm6
movaps xmm6,XMMWORD PTR[rsp]
movaps XMMWORD PTR[rsp],xmm0
movaps xmm7,XMMWORD PTR[16+rsp]
movaps XMMWORD PTR[16+rsp],xmm0
movaps xmm8,XMMWORD PTR[32+rsp]
movaps XMMWORD PTR[32+rsp],xmm0
movaps xmm9,XMMWORD PTR[48+rsp]
movaps XMMWORD PTR[48+rsp],xmm0
lea rsp,QWORD PTR[88+rsp]
$L$ccm64_enc_ret::
mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue
@ -1016,11 +1062,21 @@ DB 102,15,56,220,217
lea r11,QWORD PTR[16+r11]
jnz $L$oop_enc1_6
DB 102,15,56,221,217
pxor xmm0,xmm0
pxor xmm1,xmm1
pxor xmm2,xmm2
movups XMMWORD PTR[r9],xmm3
pxor xmm3,xmm3
pxor xmm8,xmm8
pxor xmm6,xmm6
movaps xmm6,XMMWORD PTR[rsp]
movaps XMMWORD PTR[rsp],xmm0
movaps xmm7,XMMWORD PTR[16+rsp]
movaps XMMWORD PTR[16+rsp],xmm0
movaps xmm8,XMMWORD PTR[32+rsp]
movaps XMMWORD PTR[32+rsp],xmm0
movaps xmm9,XMMWORD PTR[48+rsp]
movaps XMMWORD PTR[48+rsp],xmm0
lea rsp,QWORD PTR[88+rsp]
$L$ccm64_dec_ret::
mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue
@ -1043,6 +1099,35 @@ $L$SEH_begin_aesni_ctr32_encrypt_blocks::
mov r8,QWORD PTR[40+rsp]
cmp rdx,1
jne $L$ctr32_bulk
movups xmm2,XMMWORD PTR[r8]
movups xmm3,XMMWORD PTR[rdi]
mov edx,DWORD PTR[240+rcx]
movups xmm0,XMMWORD PTR[rcx]
movups xmm1,XMMWORD PTR[16+rcx]
lea rcx,QWORD PTR[32+rcx]
xorps xmm2,xmm0
$L$oop_enc1_7::
DB 102,15,56,220,209
dec edx
movups xmm1,XMMWORD PTR[rcx]
lea rcx,QWORD PTR[16+rcx]
jnz $L$oop_enc1_7
DB 102,15,56,221,209
pxor xmm0,xmm0
pxor xmm1,xmm1
xorps xmm2,xmm3
pxor xmm3,xmm3
movups XMMWORD PTR[rsi],xmm2
xorps xmm2,xmm2
jmp $L$ctr32_epilogue
ALIGN 16
$L$ctr32_bulk::
lea rax,QWORD PTR[rsp]
push rbp
sub rsp,288
@ -1060,8 +1145,8 @@ $L$SEH_begin_aesni_ctr32_encrypt_blocks::
$L$ctr32_body::
lea rbp,QWORD PTR[((-8))+rax]
cmp rdx,1
je $L$ctr32_one_shortcut
movdqu xmm2,XMMWORD PTR[r8]
movdqu xmm0,XMMWORD PTR[rcx]
@ -1452,11 +1537,14 @@ DB 102,69,15,56,221,202
lea rcx,QWORD PTR[((-128))+rcx]
$L$ctr32_tail::
lea rcx,QWORD PTR[16+rcx]
cmp rdx,4
jb $L$ctr32_loop3
je $L$ctr32_loop4
shl eax,4
movdqa xmm8,XMMWORD PTR[96+rsp]
pxor xmm9,xmm9
@ -1559,40 +1647,43 @@ DB 102,15,56,221,225
movups xmm12,XMMWORD PTR[32+rdi]
xorps xmm4,xmm12
movups XMMWORD PTR[32+rsi],xmm4
jmp $L$ctr32_done
ALIGN 16
$L$ctr32_one_shortcut::
movups xmm2,XMMWORD PTR[r8]
movups xmm10,XMMWORD PTR[rdi]
mov eax,DWORD PTR[240+rcx]
movups xmm0,XMMWORD PTR[rcx]
movups xmm1,XMMWORD PTR[16+rcx]
lea rcx,QWORD PTR[32+rcx]
xorps xmm2,xmm0
$L$oop_enc1_7::
DB 102,15,56,220,209
dec eax
movups xmm1,XMMWORD PTR[rcx]
lea rcx,QWORD PTR[16+rcx]
jnz $L$oop_enc1_7
DB 102,15,56,221,209
xorps xmm2,xmm10
movups XMMWORD PTR[rsi],xmm2
jmp $L$ctr32_done
ALIGN 16
$L$ctr32_done::
xorps xmm0,xmm0
xor r11d,r11d
pxor xmm1,xmm1
pxor xmm2,xmm2
pxor xmm3,xmm3
pxor xmm4,xmm4
pxor xmm5,xmm5
movaps xmm6,XMMWORD PTR[((-160))+rbp]
movaps XMMWORD PTR[(-160)+rbp],xmm0
movaps xmm7,XMMWORD PTR[((-144))+rbp]
movaps XMMWORD PTR[(-144)+rbp],xmm0
movaps xmm8,XMMWORD PTR[((-128))+rbp]
movaps XMMWORD PTR[(-128)+rbp],xmm0
movaps xmm9,XMMWORD PTR[((-112))+rbp]
movaps XMMWORD PTR[(-112)+rbp],xmm0
movaps xmm10,XMMWORD PTR[((-96))+rbp]
movaps XMMWORD PTR[(-96)+rbp],xmm0
movaps xmm11,XMMWORD PTR[((-80))+rbp]
movaps XMMWORD PTR[(-80)+rbp],xmm0
movaps xmm12,XMMWORD PTR[((-64))+rbp]
movaps XMMWORD PTR[(-64)+rbp],xmm0
movaps xmm13,XMMWORD PTR[((-48))+rbp]
movaps XMMWORD PTR[(-48)+rbp],xmm0
movaps xmm14,XMMWORD PTR[((-32))+rbp]
movaps XMMWORD PTR[(-32)+rbp],xmm0
movaps xmm15,XMMWORD PTR[((-16))+rbp]
movaps XMMWORD PTR[(-16)+rbp],xmm0
movaps XMMWORD PTR[rsp],xmm0
movaps XMMWORD PTR[16+rsp],xmm0
movaps XMMWORD PTR[32+rsp],xmm0
movaps XMMWORD PTR[48+rsp],xmm0
movaps XMMWORD PTR[64+rsp],xmm0
movaps XMMWORD PTR[80+rsp],xmm0
movaps XMMWORD PTR[96+rsp],xmm0
movaps XMMWORD PTR[112+rsp],xmm0
lea rsp,QWORD PTR[rbp]
pop rbp
$L$ctr32_epilogue::
@ -1889,6 +1980,7 @@ DB 102,15,56,221,124,36,80
shr eax,4
$L$xts_enc_short::
mov r10d,eax
pxor xmm10,xmm0
add rdx,16*6
@ -1917,6 +2009,7 @@ $L$xts_enc_short::
pxor xmm4,xmm12
pxor xmm5,xmm13
pxor xmm6,xmm14
pxor xmm7,xmm7
call _aesni_encrypt6
@ -2059,16 +2152,39 @@ DB 102,15,56,221,209
movups XMMWORD PTR[(-16)+rsi],xmm2
$L$xts_enc_ret::
xorps xmm0,xmm0
pxor xmm1,xmm1
pxor xmm2,xmm2
pxor xmm3,xmm3
pxor xmm4,xmm4
pxor xmm5,xmm5
movaps xmm6,XMMWORD PTR[((-160))+rbp]
movaps XMMWORD PTR[(-160)+rbp],xmm0
movaps xmm7,XMMWORD PTR[((-144))+rbp]
movaps XMMWORD PTR[(-144)+rbp],xmm0
movaps xmm8,XMMWORD PTR[((-128))+rbp]
movaps XMMWORD PTR[(-128)+rbp],xmm0
movaps xmm9,XMMWORD PTR[((-112))+rbp]
movaps XMMWORD PTR[(-112)+rbp],xmm0
movaps xmm10,XMMWORD PTR[((-96))+rbp]
movaps XMMWORD PTR[(-96)+rbp],xmm0
movaps xmm11,XMMWORD PTR[((-80))+rbp]
movaps XMMWORD PTR[(-80)+rbp],xmm0
movaps xmm12,XMMWORD PTR[((-64))+rbp]
movaps XMMWORD PTR[(-64)+rbp],xmm0
movaps xmm13,XMMWORD PTR[((-48))+rbp]
movaps XMMWORD PTR[(-48)+rbp],xmm0
movaps xmm14,XMMWORD PTR[((-32))+rbp]
movaps XMMWORD PTR[(-32)+rbp],xmm0
movaps xmm15,XMMWORD PTR[((-16))+rbp]
movaps XMMWORD PTR[(-16)+rbp],xmm0
movaps XMMWORD PTR[rsp],xmm0
movaps XMMWORD PTR[16+rsp],xmm0
movaps XMMWORD PTR[32+rsp],xmm0
movaps XMMWORD PTR[48+rsp],xmm0
movaps XMMWORD PTR[64+rsp],xmm0
movaps XMMWORD PTR[80+rsp],xmm0
movaps XMMWORD PTR[96+rsp],xmm0
lea rsp,QWORD PTR[rbp]
pop rbp
$L$xts_enc_epilogue::
@ -2371,6 +2487,7 @@ DB 102,15,56,223,124,36,80
shr eax,4
$L$xts_dec_short::
mov r10d,eax
pxor xmm10,xmm0
pxor xmm11,xmm0
@ -2573,16 +2690,39 @@ DB 102,15,56,223,209
movups XMMWORD PTR[rsi],xmm2
$L$xts_dec_ret::
xorps xmm0,xmm0
pxor xmm1,xmm1
pxor xmm2,xmm2
pxor xmm3,xmm3
pxor xmm4,xmm4
pxor xmm5,xmm5
movaps xmm6,XMMWORD PTR[((-160))+rbp]
movaps XMMWORD PTR[(-160)+rbp],xmm0
movaps xmm7,XMMWORD PTR[((-144))+rbp]
movaps XMMWORD PTR[(-144)+rbp],xmm0
movaps xmm8,XMMWORD PTR[((-128))+rbp]
movaps XMMWORD PTR[(-128)+rbp],xmm0
movaps xmm9,XMMWORD PTR[((-112))+rbp]
movaps XMMWORD PTR[(-112)+rbp],xmm0
movaps xmm10,XMMWORD PTR[((-96))+rbp]
movaps XMMWORD PTR[(-96)+rbp],xmm0
movaps xmm11,XMMWORD PTR[((-80))+rbp]
movaps XMMWORD PTR[(-80)+rbp],xmm0
movaps xmm12,XMMWORD PTR[((-64))+rbp]
movaps XMMWORD PTR[(-64)+rbp],xmm0
movaps xmm13,XMMWORD PTR[((-48))+rbp]
movaps XMMWORD PTR[(-48)+rbp],xmm0
movaps xmm14,XMMWORD PTR[((-32))+rbp]
movaps XMMWORD PTR[(-32)+rbp],xmm0
movaps xmm15,XMMWORD PTR[((-16))+rbp]
movaps XMMWORD PTR[(-16)+rbp],xmm0
movaps XMMWORD PTR[rsp],xmm0
movaps XMMWORD PTR[16+rsp],xmm0
movaps XMMWORD PTR[32+rsp],xmm0
movaps XMMWORD PTR[48+rsp],xmm0
movaps XMMWORD PTR[64+rsp],xmm0
movaps XMMWORD PTR[80+rsp],xmm0
movaps XMMWORD PTR[96+rsp],xmm0
lea rsp,QWORD PTR[rbp]
pop rbp
$L$xts_dec_epilogue::
@ -2646,7 +2786,11 @@ DB 102,15,56,221,209
jnc $L$cbc_enc_loop
add rdx,16
jnz $L$cbc_enc_tail
pxor xmm0,xmm0
pxor xmm1,xmm1
movups XMMWORD PTR[r8],xmm2
pxor xmm2,xmm2
pxor xmm3,xmm3
jmp $L$cbc_ret
$L$cbc_enc_tail::
@ -2666,6 +2810,35 @@ $L$cbc_enc_tail::
ALIGN 16
$L$cbc_decrypt::
cmp rdx,16
jne $L$cbc_decrypt_bulk
movdqu xmm2,XMMWORD PTR[rdi]
movdqu xmm3,XMMWORD PTR[r8]
movdqa xmm4,xmm2
movups xmm0,XMMWORD PTR[rcx]
movups xmm1,XMMWORD PTR[16+rcx]
lea rcx,QWORD PTR[32+rcx]
xorps xmm2,xmm0
$L$oop_dec1_16::
DB 102,15,56,222,209
dec r10d
movups xmm1,XMMWORD PTR[rcx]
lea rcx,QWORD PTR[16+rcx]
jnz $L$oop_dec1_16
DB 102,15,56,223,209
pxor xmm0,xmm0
pxor xmm1,xmm1
movdqu XMMWORD PTR[r8],xmm4
xorps xmm2,xmm3
pxor xmm3,xmm3
movups XMMWORD PTR[rsi],xmm2
pxor xmm2,xmm2
jmp $L$cbc_ret
ALIGN 16
$L$cbc_decrypt_bulk::
lea rax,QWORD PTR[rsp]
push rbp
sub rsp,176
@ -2913,7 +3086,7 @@ DB 102,69,15,56,223,202
movaps xmm2,xmm9
lea rcx,QWORD PTR[((-112))+rcx]
add rdx,070h
jle $L$cbc_dec_tail_collected
jle $L$cbc_dec_clear_tail_collected
movups XMMWORD PTR[rsi],xmm9
lea rsi,QWORD PTR[16+rsi]
cmp rdx,050h
@ -2932,14 +3105,19 @@ $L$cbc_dec_six_or_seven::
movdqu XMMWORD PTR[rsi],xmm2
pxor xmm4,xmm12
movdqu XMMWORD PTR[16+rsi],xmm3
pxor xmm3,xmm3
pxor xmm5,xmm13
movdqu XMMWORD PTR[32+rsi],xmm4
pxor xmm4,xmm4
pxor xmm6,xmm14
movdqu XMMWORD PTR[48+rsi],xmm5
pxor xmm5,xmm5
pxor xmm7,xmm15
movdqu XMMWORD PTR[64+rsi],xmm6
pxor xmm6,xmm6
lea rsi,QWORD PTR[80+rsi]
movdqa xmm2,xmm7
pxor xmm7,xmm7
jmp $L$cbc_dec_tail_collected
ALIGN 16
@ -2954,16 +3132,23 @@ $L$cbc_dec_seven::
movdqu XMMWORD PTR[rsi],xmm2
pxor xmm4,xmm12
movdqu XMMWORD PTR[16+rsi],xmm3
pxor xmm3,xmm3
pxor xmm5,xmm13
movdqu XMMWORD PTR[32+rsi],xmm4
pxor xmm4,xmm4
pxor xmm6,xmm14
movdqu XMMWORD PTR[48+rsi],xmm5
pxor xmm5,xmm5
pxor xmm7,xmm15
movdqu XMMWORD PTR[64+rsi],xmm6
pxor xmm6,xmm6
pxor xmm8,xmm9
movdqu XMMWORD PTR[80+rsi],xmm7
pxor xmm7,xmm7
lea rsi,QWORD PTR[96+rsi]
movdqa xmm2,xmm8
pxor xmm8,xmm8
pxor xmm9,xmm9
jmp $L$cbc_dec_tail_collected
ALIGN 16
@ -3007,7 +3192,7 @@ $L$cbc_dec_loop6_enter::
movdqa xmm2,xmm7
add rdx,050h
jle $L$cbc_dec_tail_collected
jle $L$cbc_dec_clear_tail_collected
movups XMMWORD PTR[rsi],xmm7
lea rsi,QWORD PTR[16+rsi]
@ -3042,12 +3227,17 @@ $L$cbc_dec_tail::
movdqu XMMWORD PTR[rsi],xmm2
pxor xmm4,xmm12
movdqu XMMWORD PTR[16+rsi],xmm3
pxor xmm3,xmm3
pxor xmm5,xmm13
movdqu XMMWORD PTR[32+rsi],xmm4
pxor xmm4,xmm4
pxor xmm6,xmm14
movdqu XMMWORD PTR[48+rsi],xmm5
pxor xmm5,xmm5
lea rsi,QWORD PTR[64+rsi]
movdqa xmm2,xmm6
pxor xmm6,xmm6
pxor xmm7,xmm7
sub rdx,010h
jmp $L$cbc_dec_tail_collected
@ -3058,12 +3248,12 @@ $L$cbc_dec_one::
movups xmm1,XMMWORD PTR[16+rcx]
lea rcx,QWORD PTR[32+rcx]
xorps xmm2,xmm0
$L$oop_dec1_16::
$L$oop_dec1_17::
DB 102,15,56,222,209
dec eax
movups xmm1,XMMWORD PTR[rcx]
lea rcx,QWORD PTR[16+rcx]
jnz $L$oop_dec1_16
jnz $L$oop_dec1_17
DB 102,15,56,223,209
xorps xmm2,xmm10
movaps xmm10,xmm11
@ -3077,6 +3267,7 @@ $L$cbc_dec_two::
pxor xmm3,xmm11
movdqu XMMWORD PTR[rsi],xmm2
movdqa xmm2,xmm3
pxor xmm3,xmm3
lea rsi,QWORD PTR[16+rsi]
jmp $L$cbc_dec_tail_collected
ALIGN 16
@ -3089,7 +3280,9 @@ $L$cbc_dec_three::
movdqu XMMWORD PTR[rsi],xmm2
pxor xmm4,xmm12
movdqu XMMWORD PTR[16+rsi],xmm3
pxor xmm3,xmm3
movdqa xmm2,xmm4
pxor xmm4,xmm4
lea rsi,QWORD PTR[32+rsi]
jmp $L$cbc_dec_tail_collected
ALIGN 16
@ -3102,39 +3295,61 @@ $L$cbc_dec_four::
movdqu XMMWORD PTR[rsi],xmm2
pxor xmm4,xmm12
movdqu XMMWORD PTR[16+rsi],xmm3
pxor xmm3,xmm3
pxor xmm5,xmm13
movdqu XMMWORD PTR[32+rsi],xmm4
pxor xmm4,xmm4
movdqa xmm2,xmm5
pxor xmm5,xmm5
lea rsi,QWORD PTR[48+rsi]
jmp $L$cbc_dec_tail_collected
ALIGN 16
$L$cbc_dec_clear_tail_collected::
pxor xmm3,xmm3
pxor xmm4,xmm4
pxor xmm5,xmm5
$L$cbc_dec_tail_collected::
movups XMMWORD PTR[r8],xmm10
and rdx,15
jnz $L$cbc_dec_tail_partial
movups XMMWORD PTR[rsi],xmm2
pxor xmm2,xmm2
jmp $L$cbc_dec_ret
ALIGN 16
$L$cbc_dec_tail_partial::
movaps XMMWORD PTR[rsp],xmm2
pxor xmm2,xmm2
mov rcx,16
mov rdi,rsi
sub rcx,rdx
lea rsi,QWORD PTR[rsp]
DD 09066A4F3h
movdqa XMMWORD PTR[rsp],xmm2
$L$cbc_dec_ret::
xorps xmm0,xmm0
pxor xmm1,xmm1
movaps xmm6,XMMWORD PTR[16+rsp]
movaps XMMWORD PTR[16+rsp],xmm0
movaps xmm7,XMMWORD PTR[32+rsp]
movaps XMMWORD PTR[32+rsp],xmm0
movaps xmm8,XMMWORD PTR[48+rsp]
movaps XMMWORD PTR[48+rsp],xmm0
movaps xmm9,XMMWORD PTR[64+rsp]
movaps XMMWORD PTR[64+rsp],xmm0
movaps xmm10,XMMWORD PTR[80+rsp]
movaps XMMWORD PTR[80+rsp],xmm0
movaps xmm11,XMMWORD PTR[96+rsp]
movaps XMMWORD PTR[96+rsp],xmm0
movaps xmm12,XMMWORD PTR[112+rsp]
movaps XMMWORD PTR[112+rsp],xmm0
movaps xmm13,XMMWORD PTR[128+rsp]
movaps XMMWORD PTR[128+rsp],xmm0
movaps xmm14,XMMWORD PTR[144+rsp]
movaps XMMWORD PTR[144+rsp],xmm0
movaps xmm15,XMMWORD PTR[160+rsp]
movaps XMMWORD PTR[160+rsp],xmm0
lea rsp,QWORD PTR[rbp]
pop rbp
$L$cbc_ret::
@ -3175,7 +3390,9 @@ DB 102,15,56,219,201
movups xmm0,XMMWORD PTR[r8]
DB 102,15,56,219,192
pxor xmm1,xmm1
movups XMMWORD PTR[rcx],xmm0
pxor xmm0,xmm0
$L$dec_key_ret::
add rsp,8
DB 0F3h,0C3h ;repret
@ -3193,8 +3410,10 @@ DB 048h,083h,0ECh,008h
test r8,r8
jz $L$enc_key_ret
mov r10d,268437504
movups xmm0,XMMWORD PTR[rcx]
xorps xmm4,xmm4
and r10d,DWORD PTR[((OPENSSL_ia32cap_P+4))]
lea rax,QWORD PTR[16+r8]
cmp edx,256
je $L$14rounds
@ -3205,6 +3424,9 @@ DB 048h,083h,0ECh,008h
$L$10rounds::
mov edx,9
cmp r10d,268435456
je $L$10rounds_alt
movups XMMWORD PTR[r8],xmm0
DB 102,15,58,223,200,1
call $L$key_expansion_128_cold
@ -3231,10 +3453,80 @@ DB 102,15,58,223,200,54
xor eax,eax
jmp $L$enc_key_ret
ALIGN 16
$L$10rounds_alt::
movdqa xmm5,XMMWORD PTR[$L$key_rotate]
mov r10d,8
movdqa xmm4,XMMWORD PTR[$L$key_rcon1]
movdqa xmm2,xmm0
movdqu XMMWORD PTR[r8],xmm0
jmp $L$oop_key128
ALIGN 16
$L$oop_key128::
DB 102,15,56,0,197
DB 102,15,56,221,196
pslld xmm4,1
lea rax,QWORD PTR[16+rax]
movdqa xmm3,xmm2
pslldq xmm2,4
pxor xmm3,xmm2
pslldq xmm2,4
pxor xmm3,xmm2
pslldq xmm2,4
pxor xmm2,xmm3
pxor xmm0,xmm2
movdqu XMMWORD PTR[(-16)+rax],xmm0
movdqa xmm2,xmm0
dec r10d
jnz $L$oop_key128
movdqa xmm4,XMMWORD PTR[$L$key_rcon1b]
DB 102,15,56,0,197
DB 102,15,56,221,196
pslld xmm4,1
movdqa xmm3,xmm2
pslldq xmm2,4
pxor xmm3,xmm2
pslldq xmm2,4
pxor xmm3,xmm2
pslldq xmm2,4
pxor xmm2,xmm3
pxor xmm0,xmm2
movdqu XMMWORD PTR[rax],xmm0
movdqa xmm2,xmm0
DB 102,15,56,0,197
DB 102,15,56,221,196
movdqa xmm3,xmm2
pslldq xmm2,4
pxor xmm3,xmm2
pslldq xmm2,4
pxor xmm3,xmm2
pslldq xmm2,4
pxor xmm2,xmm3
pxor xmm0,xmm2
movdqu XMMWORD PTR[16+rax],xmm0
mov DWORD PTR[96+rax],edx
xor eax,eax
jmp $L$enc_key_ret
ALIGN 16
$L$12rounds::
movq xmm2,QWORD PTR[16+rcx]
mov edx,11
cmp r10d,268435456
je $L$12rounds_alt
movups XMMWORD PTR[r8],xmm0
DB 102,15,58,223,202,1
call $L$key_expansion_192a_cold
@ -3257,11 +3549,55 @@ DB 102,15,58,223,202,128
xor rax,rax
jmp $L$enc_key_ret
ALIGN 16
$L$12rounds_alt::
movdqa xmm5,XMMWORD PTR[$L$key_rotate192]
movdqa xmm4,XMMWORD PTR[$L$key_rcon1]
mov r10d,8
movdqu XMMWORD PTR[r8],xmm0
jmp $L$oop_key192
ALIGN 16
$L$oop_key192::
movq QWORD PTR[rax],xmm2
movdqa xmm1,xmm2
DB 102,15,56,0,213
DB 102,15,56,221,212
pslld xmm4,1
lea rax,QWORD PTR[24+rax]
movdqa xmm3,xmm0
pslldq xmm0,4
pxor xmm3,xmm0
pslldq xmm0,4
pxor xmm3,xmm0
pslldq xmm0,4
pxor xmm0,xmm3
pshufd xmm3,xmm0,0ffh
pxor xmm3,xmm1
pslldq xmm1,4
pxor xmm3,xmm1
pxor xmm0,xmm2
pxor xmm2,xmm3
movdqu XMMWORD PTR[(-16)+rax],xmm0
dec r10d
jnz $L$oop_key192
mov DWORD PTR[32+rax],edx
xor eax,eax
jmp $L$enc_key_ret
ALIGN 16
$L$14rounds::
movups xmm2,XMMWORD PTR[16+rcx]
mov edx,13
lea rax,QWORD PTR[16+rax]
cmp r10d,268435456
je $L$14rounds_alt
movups XMMWORD PTR[r8],xmm0
movups XMMWORD PTR[16+r8],xmm2
DB 102,15,58,223,202,1
@ -3295,10 +3631,70 @@ DB 102,15,58,223,202,64
xor rax,rax
jmp $L$enc_key_ret
ALIGN 16
$L$14rounds_alt::
movdqa xmm5,XMMWORD PTR[$L$key_rotate]
movdqa xmm4,XMMWORD PTR[$L$key_rcon1]
mov r10d,7
movdqu XMMWORD PTR[r8],xmm0
movdqa xmm1,xmm2
movdqu XMMWORD PTR[16+r8],xmm2
jmp $L$oop_key256
ALIGN 16
$L$oop_key256::
DB 102,15,56,0,213
DB 102,15,56,221,212
movdqa xmm3,xmm0
pslldq xmm0,4
pxor xmm3,xmm0
pslldq xmm0,4
pxor xmm3,xmm0
pslldq xmm0,4
pxor xmm0,xmm3
pslld xmm4,1
pxor xmm0,xmm2
movdqu XMMWORD PTR[rax],xmm0
dec r10d
jz $L$done_key256
pshufd xmm2,xmm0,0ffh
pxor xmm3,xmm3
DB 102,15,56,221,211
movdqa xmm3,xmm1
pslldq xmm1,4
pxor xmm3,xmm1
pslldq xmm1,4
pxor xmm3,xmm1
pslldq xmm1,4
pxor xmm1,xmm3
pxor xmm2,xmm1
movdqu XMMWORD PTR[16+rax],xmm2
lea rax,QWORD PTR[32+rax]
movdqa xmm1,xmm2
jmp $L$oop_key256
$L$done_key256::
mov DWORD PTR[16+rax],edx
xor eax,eax
jmp $L$enc_key_ret
ALIGN 16
$L$bad_keybits::
mov rax,-2
$L$enc_key_ret::
pxor xmm0,xmm0
pxor xmm1,xmm1
pxor xmm2,xmm2
pxor xmm3,xmm3
pxor xmm4,xmm4
pxor xmm5,xmm5
add rsp,8
DB 0F3h,0C3h ;repret
$L$SEH_end_set_encrypt_key::
@ -3384,6 +3780,14 @@ $L$xts_magic::
DD 087h,0,1,0
$L$increment1::
DB 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
$L$key_rotate::
DD 00c0f0e0dh,00c0f0e0dh,00c0f0e0dh,00c0f0e0dh
$L$key_rotate192::
DD 004070605h,004070605h,004070605h,004070605h
$L$key_rcon1::
DD 1,1,1,1
$L$key_rcon1b::
DD 01bh,01bh,01bh,01bh
DB 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
DB 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
@ -3489,7 +3893,7 @@ cbc_se_handler PROC PRIVATE
mov rax,QWORD PTR[152+r8]
mov rbx,QWORD PTR[248+r8]
lea r10,QWORD PTR[$L$cbc_decrypt]
lea r10,QWORD PTR[$L$cbc_decrypt_bulk]
cmp rbx,r10
jb $L$common_seh_tail

13
deps/openssl/asm_obsolete/x64-win32-masm/bn/x86_64-mont5.asm

@ -1832,11 +1832,16 @@ PUBLIC bn_get_bits5
ALIGN 16
bn_get_bits5 PROC PUBLIC
mov r10,rcx
lea r10,QWORD PTR[rcx]
lea r11,QWORD PTR[1+rcx]
mov ecx,edx
shr edx,3
movzx eax,WORD PTR[rdx*1+r10]
and ecx,7
shr edx,4
and ecx,15
lea eax,DWORD PTR[((-8))+rcx]
cmp ecx,11
cmova r10,r11
cmova ecx,eax
movzx eax,WORD PTR[rdx*2+r10]
shr eax,cl
and eax,31
DB 0F3h,0C3h ;repret

790
deps/openssl/asm_obsolete/x86-elf-gas/aes/aesni-x86.s

File diff suppressed because it is too large

794
deps/openssl/asm_obsolete/x86-macosx-gas/aes/aesni-x86.s

File diff suppressed because it is too large

793
deps/openssl/asm_obsolete/x86-win32-masm/aes/aesni-x86.asm

File diff suppressed because it is too large