diff --git a/deps/openssl/asm/arm-void-gas/aes/aesv8-armx.S b/deps/openssl/asm/arm-void-gas/aes/aesv8-armx.S index 732ba3d9c8..fd979d078f 100644 --- a/deps/openssl/asm/arm-void-gas/aes/aesv8-armx.S +++ b/deps/openssl/asm/arm-void-gas/aes/aesv8-armx.S @@ -230,17 +230,17 @@ aes_v8_encrypt: .Loop_enc: .byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0 - vld1.32 {q0},[r2]! .byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 + vld1.32 {q0},[r2]! subs r3,r3,#2 .byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1 - vld1.32 {q1},[r2]! .byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 + vld1.32 {q1},[r2]! bgt .Loop_enc .byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0 - vld1.32 {q0},[r2] .byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 + vld1.32 {q0},[r2] .byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1 veor q2,q2,q0 @@ -259,17 +259,17 @@ aes_v8_decrypt: .Loop_dec: .byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0 - vld1.32 {q0},[r2]! .byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 + vld1.32 {q0},[r2]! subs r3,r3,#2 .byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1 - vld1.32 {q1},[r2]! .byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 + vld1.32 {q1},[r2]! bgt .Loop_dec .byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0 - vld1.32 {q0},[r2] .byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 + vld1.32 {q0},[r2] .byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1 veor q2,q2,q0 @@ -313,16 +313,42 @@ aes_v8_cbc_encrypt: veor q5,q8,q7 beq .Lcbc_enc128 + vld1.32 {q2-q3},[r7] + add r7,r3,#16 + add r6,r3,#16*4 + add r12,r3,#16*5 + .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + add r14,r3,#16*6 + add r3,r3,#16*7 + b .Lenter_cbc_enc + +.align 4 .Loop_cbc_enc: .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 - vld1.32 {q8},[r7]! .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - subs r6,r6,#2 + vst1.8 {q6},[r1]! +.Lenter_cbc_enc: .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 - vld1.32 {q9},[r7]! 
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - bgt .Loop_cbc_enc + .byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vld1.32 {q8},[r6] + cmp r5,#4 + .byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vld1.32 {q9},[r12] + beq .Lcbc_enc192 + + .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vld1.32 {q8},[r14] + .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vld1.32 {q9},[r3] + nop +.Lcbc_enc192: .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 subs r2,r2,#16 @@ -331,7 +357,6 @@ aes_v8_cbc_encrypt: moveq r8,#0 .byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - add r7,r3,#16 .byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 vld1.8 {q8},[r0],r8 @@ -340,16 +365,14 @@ aes_v8_cbc_encrypt: veor q8,q8,q5 .byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vld1.32 {q9},[r7]! @ re-pre-load rndkey[1] + vld1.32 {q9},[r7] @ re-pre-load rndkey[1] .byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 .byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 - - mov r6,r5 veor q6,q0,q7 - vst1.8 {q6},[r1]! bhs .Loop_cbc_enc + vst1.8 {q6},[r1]! b .Lcbc_done .align 5 @@ -407,79 +430,78 @@ aes_v8_cbc_encrypt: .Loop3x_cbc_dec: .byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8 - .byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 - .byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 - vld1.32 {q8},[r7]! .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + .byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + vld1.32 {q8},[r7]! subs r6,r6,#2 .byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9 - .byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 - .byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 - vld1.32 {q9},[r7]! 
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + .byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + vld1.32 {q9},[r7]! bgt .Loop3x_cbc_dec .byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8 - .byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 - .byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 - veor q4,q6,q7 .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + .byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + veor q4,q6,q7 + subs r2,r2,#0x30 veor q5,q2,q7 + movlo r6,r2 @ r6, r6, is zero at this point .byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9 - .byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 - .byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 - veor q9,q3,q7 - subs r2,r2,#0x30 .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + .byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - vorr q6,q11,q11 - movlo r6,r2 @ r6, r6, is zero at this point - .byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12 - .byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12 - .byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12 + veor q9,q3,q7 add r0,r0,r6 @ r0 is adjusted in such way that @ at exit from the loop q1-q10 @ are loaded with last "words" + vorr q6,q11,q11 + mov r7,r3 + .byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12 .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12 .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + .byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - mov r7,r3 - .byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13 - .byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13 - .byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13 vld1.8 {q2},[r0]! 
+ .byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13 .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13 .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + .byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 vld1.8 {q3},[r0]! .byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14 - .byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14 - .byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14 - vld1.8 {q11},[r0]! .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14 .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + .byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - vld1.32 {q8},[r7]! @ re-pre-load rndkey[0] + vld1.8 {q11},[r0]! .byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15 .byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15 .byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15 - + vld1.32 {q8},[r7]! @ re-pre-load rndkey[0] add r6,r5,#2 veor q4,q4,q0 veor q5,q5,q1 veor q10,q10,q9 vld1.32 {q9},[r7]! @ re-pre-load rndkey[1] - vorr q0,q2,q2 vst1.8 {q4},[r1]! - vorr q1,q3,q3 + vorr q0,q2,q2 vst1.8 {q5},[r1]! + vorr q1,q3,q3 vst1.8 {q10},[r1]! vorr q10,q11,q11 bhs .Loop3x_cbc_dec @@ -490,39 +512,39 @@ aes_v8_cbc_encrypt: .Lcbc_dec_tail: .byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 - .byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 - vld1.32 {q8},[r7]! .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + .byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + vld1.32 {q8},[r7]! subs r6,r6,#2 .byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 - .byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 - vld1.32 {q9},[r7]! .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + .byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + vld1.32 {q9},[r7]! 
bgt .Lcbc_dec_tail .byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 - .byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + .byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 .byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 - .byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + .byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 .byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12 - .byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12 .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + .byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 cmn r2,#0x20 .byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13 - .byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13 .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + .byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 veor q5,q6,q7 .byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14 - .byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14 .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + .byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 veor q9,q3,q7 .byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15 @@ -590,70 +612,69 @@ aes_v8_ctr32_encrypt_blocks: .align 4 .Loop3x_ctr32: .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 - .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 - .byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8 - vld1.32 {q8},[r7]! .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + .byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8 .byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10 + vld1.32 {q8},[r7]! subs r6,r6,#2 .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 - .byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 - .byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9 - vld1.32 {q9},[r7]! .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + .byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9 .byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10 + vld1.32 {q9},[r7]! 
bgt .Loop3x_ctr32 .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 - .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 - .byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8 - mov r7,r3 .byte 0x80,0x83,0xb0,0xf3 @ aesmc q4,q0 - vld1.8 {q2},[r0]! + .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 .byte 0x82,0xa3,0xb0,0xf3 @ aesmc q5,q1 - .byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10 + vld1.8 {q2},[r0]! vorr q0,q6,q6 - .byte 0x22,0x83,0xb0,0xf3 @ aese q4,q9 + .byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8 + .byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10 vld1.8 {q3},[r0]! - .byte 0x22,0xa3,0xb0,0xf3 @ aese q5,q9 - .byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9 vorr q1,q6,q6 + .byte 0x22,0x83,0xb0,0xf3 @ aese q4,q9 .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 - vld1.8 {q11},[r0]! + .byte 0x22,0xa3,0xb0,0xf3 @ aese q5,q9 .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 + vld1.8 {q11},[r0]! + mov r7,r3 + .byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9 .byte 0xa4,0x23,0xf0,0xf3 @ aesmc q9,q10 vorr q10,q6,q6 add r9,r8,#1 .byte 0x28,0x83,0xb0,0xf3 @ aese q4,q12 + .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 .byte 0x28,0xa3,0xb0,0xf3 @ aese q5,q12 - .byte 0x28,0x23,0xf0,0xf3 @ aese q9,q12 + .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 veor q2,q2,q7 add r10,r8,#2 - .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 - .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 + .byte 0x28,0x23,0xf0,0xf3 @ aese q9,q12 .byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9 veor q3,q3,q7 add r8,r8,#3 .byte 0x2a,0x83,0xb0,0xf3 @ aese q4,q13 + .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 .byte 0x2a,0xa3,0xb0,0xf3 @ aese q5,q13 - .byte 0x2a,0x23,0xf0,0xf3 @ aese q9,q13 + .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 veor q11,q11,q7 rev r9,r9 - .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 - vld1.32 {q8},[r7]! 
@ re-pre-load rndkey[0] - .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 + .byte 0x2a,0x23,0xf0,0xf3 @ aese q9,q13 .byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9 vmov.32 d1[1], r9 rev r10,r10 .byte 0x2c,0x83,0xb0,0xf3 @ aese q4,q14 + .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 .byte 0x2c,0xa3,0xb0,0xf3 @ aese q5,q14 - .byte 0x2c,0x23,0xf0,0xf3 @ aese q9,q14 + .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 vmov.32 d3[1], r10 rev r12,r8 - .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 - .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 + .byte 0x2c,0x23,0xf0,0xf3 @ aese q9,q14 .byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9 vmov.32 d21[1], r12 subs r2,r2,#3 @@ -661,13 +682,14 @@ aes_v8_ctr32_encrypt_blocks: .byte 0x2e,0xa3,0xb0,0xf3 @ aese q5,q15 .byte 0x2e,0x23,0xf0,0xf3 @ aese q9,q15 - mov r6,r5 veor q2,q2,q4 + vld1.32 {q8},[r7]! @ re-pre-load rndkey[0] + vst1.8 {q2},[r1]! veor q3,q3,q5 + mov r6,r5 + vst1.8 {q3},[r1]! veor q11,q11,q9 vld1.32 {q9},[r7]! @ re-pre-load rndkey[1] - vst1.8 {q2},[r1]! - vst1.8 {q3},[r1]! vst1.8 {q11},[r1]! bhs .Loop3x_ctr32 @@ -679,40 +701,40 @@ aes_v8_ctr32_encrypt_blocks: .Lctr32_tail: .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 - .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 - vld1.32 {q8},[r7]! .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + vld1.32 {q8},[r7]! subs r6,r6,#2 .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 - .byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 - vld1.32 {q9},[r7]! .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + vld1.32 {q9},[r7]! 
bgt .Lctr32_tail .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 - .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 - .byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 vld1.8 {q2},[r0],r12 .byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 - .byte 0x28,0x23,0xb0,0xf3 @ aese q1,q12 - vld1.8 {q3},[r0] .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x28,0x23,0xb0,0xf3 @ aese q1,q12 .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + vld1.8 {q3},[r0] .byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 - .byte 0x2a,0x23,0xb0,0xf3 @ aese q1,q13 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x2a,0x23,0xb0,0xf3 @ aese q1,q13 .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 - .byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 - .byte 0x2c,0x23,0xb0,0xf3 @ aese q1,q14 veor q2,q2,q7 + .byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x2c,0x23,0xb0,0xf3 @ aese q1,q14 .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 veor q3,q3,q7 .byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 diff --git a/deps/openssl/asm/arm-void-gas/modes/ghash-armv4.S b/deps/openssl/asm/arm-void-gas/modes/ghash-armv4.S index d321235f79..c54f514997 100644 --- a/deps/openssl/asm/arm-void-gas/modes/ghash-armv4.S +++ b/deps/openssl/asm/arm-void-gas/modes/ghash-armv4.S @@ -495,7 +495,7 @@ gcm_ghash_neon: veor q10,q10,q9 @ vshl.i64 q9,q0,#63 veor q10, q10, q9 @ - veor d1,d1,d20 @ + veor d1,d1,d20 @ veor d4,d4,d21 vshr.u64 q10,q0,#1 @ 2nd phase diff --git a/deps/openssl/asm/arm-void-gas/modes/ghashv8-armx.S b/deps/openssl/asm/arm-void-gas/modes/ghashv8-armx.S index 570d9175c4..269574945f 100644 --- a/deps/openssl/asm/arm-void-gas/modes/ghashv8-armx.S +++ b/deps/openssl/asm/arm-void-gas/modes/ghashv8-armx.S @@ -7,109 +7,223 @@ .type gcm_init_v8,%function .align 4 gcm_init_v8: - vld1.64 {q9},[r1] 
@ load H - vmov.i8 q8,#0xe1 + vld1.64 {q9},[r1] @ load input H + vmov.i8 q11,#0xe1 + vshl.i64 q11,q11,#57 @ 0xc2.0 vext.8 q3,q9,q9,#8 - vshl.i64 q8,q8,#57 - vshr.u64 q10,q8,#63 - vext.8 q8,q10,q8,#8 @ t0=0xc2....01 + vshr.u64 q10,q11,#63 vdup.32 q9,d18[1] - vshr.u64 q11,q3,#63 + vext.8 q8,q10,q11,#8 @ t0=0xc2....01 + vshr.u64 q10,q3,#63 vshr.s32 q9,q9,#31 @ broadcast carry bit - vand q11,q11,q8 + vand q10,q10,q8 vshl.i64 q3,q3,#1 - vext.8 q11,q11,q11,#8 + vext.8 q10,q10,q10,#8 vand q8,q8,q9 - vorr q3,q3,q11 @ H<<<=1 - veor q3,q3,q8 @ twisted H - vst1.64 {q3},[r0] + vorr q3,q3,q10 @ H<<<=1 + veor q12,q3,q8 @ twisted H + vst1.64 {q12},[r0]! @ store Htable[0] + + @ calculate H^2 + vext.8 q8,q12,q12,#8 @ Karatsuba pre-processing + .byte 0xa8,0x0e,0xa8,0xf2 @ pmull q0,q12,q12 + veor q8,q8,q12 + .byte 0xa9,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q12 + .byte 0xa0,0x2e,0xa0,0xf2 @ pmull q1,q8,q8 + + vext.8 q9,q0,q2,#8 @ Karatsuba post-processing + veor q10,q0,q2 + veor q1,q1,q9 + veor q1,q1,q10 + .byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase + + vmov d4,d3 @ Xh|Xm - 256-bit result + vmov d3,d0 @ Xm is rotated Xl + veor q0,q1,q10 + + vext.8 q10,q0,q0,#8 @ 2nd phase + .byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 + veor q10,q10,q2 + veor q14,q0,q10 + + vext.8 q9,q14,q14,#8 @ Karatsuba pre-processing + veor q9,q9,q14 + vext.8 q13,q8,q9,#8 @ pack Karatsuba pre-processed + vst1.64 {q13-q14},[r0] @ store Htable[1..2] bx lr .size gcm_init_v8,.-gcm_init_v8 - .global gcm_gmult_v8 .type gcm_gmult_v8,%function .align 4 gcm_gmult_v8: vld1.64 {q9},[r0] @ load Xi vmov.i8 q11,#0xe1 - vld1.64 {q12},[r1] @ load twisted H + vld1.64 {q12-q13},[r1] @ load twisted H, ... 
vshl.u64 q11,q11,#57 #ifndef __ARMEB__ vrev64.8 q9,q9 #endif - vext.8 q13,q12,q12,#8 - mov r3,#0 vext.8 q3,q9,q9,#8 - mov r12,#0 - veor q13,q13,q12 @ Karatsuba pre-processing - mov r2,r0 - b .Lgmult_v8 -.size gcm_gmult_v8,.-gcm_gmult_v8 + .byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo + veor q9,q9,q3 @ Karatsuba pre-processing + .byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi + .byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) + + vext.8 q9,q0,q2,#8 @ Karatsuba post-processing + veor q10,q0,q2 + veor q1,q1,q9 + veor q1,q1,q10 + .byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction + + vmov d4,d3 @ Xh|Xm - 256-bit result + vmov d3,d0 @ Xm is rotated Xl + veor q0,q1,q10 + + vext.8 q10,q0,q0,#8 @ 2nd phase of reduction + .byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 + veor q10,q10,q2 + veor q0,q0,q10 + +#ifndef __ARMEB__ + vrev64.8 q0,q0 +#endif + vext.8 q0,q0,q0,#8 + vst1.64 {q0},[r0] @ write out Xi + + bx lr +.size gcm_gmult_v8,.-gcm_gmult_v8 .global gcm_ghash_v8 .type gcm_ghash_v8,%function .align 4 gcm_ghash_v8: + vstmdb sp!,{d8-d15} @ 32-bit ABI says so vld1.64 {q0},[r0] @ load [rotated] Xi - subs r3,r3,#16 + @ "[rotated]" means that + @ loaded value would have + @ to be rotated in order to + @ make it appear as in + @ alorithm specification + subs r3,r3,#32 @ see if r3 is 32 or larger + mov r12,#16 @ r12 is used as post- + @ increment for input pointer; + @ as loop is modulo-scheduled + @ r12 is zeroed just in time + @ to preclude oversteping + @ inp[len], which means that + @ last block[s] are actually + @ loaded twice, but last + @ copy is not processed + vld1.64 {q12-q13},[r1]! @ load twisted H, ..., H^2 vmov.i8 q11,#0xe1 - mov r12,#16 - vld1.64 {q12},[r1] @ load twisted H - moveq r12,#0 - vext.8 q0,q0,q0,#8 - vshl.u64 q11,q11,#57 - vld1.64 {q9},[r2],r12 @ load [rotated] inp - vext.8 q13,q12,q12,#8 + vld1.64 {q14},[r1] + moveq r12,#0 @ is it time to zero r12? 
+ vext.8 q0,q0,q0,#8 @ rotate Xi + vld1.64 {q8},[r2]! @ load [rotated] I[0] + vshl.u64 q11,q11,#57 @ compose 0xc2.0 constant #ifndef __ARMEB__ + vrev64.8 q8,q8 vrev64.8 q0,q0 +#endif + vext.8 q3,q8,q8,#8 @ rotate I[0] + blo .Lodd_tail_v8 @ r3 was less than 32 + vld1.64 {q9},[r2],r12 @ load [rotated] I[1] +#ifndef __ARMEB__ vrev64.8 q9,q9 #endif - veor q13,q13,q12 @ Karatsuba pre-processing - vext.8 q3,q9,q9,#8 - b .Loop_v8 + vext.8 q7,q9,q9,#8 + veor q3,q3,q0 @ I[i]^=Xi + .byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1 + veor q9,q9,q7 @ Karatsuba pre-processing + .byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7 + b .Loop_mod2x_v8 .align 4 -.Loop_v8: +.Loop_mod2x_v8: + vext.8 q10,q3,q3,#8 + subs r3,r3,#32 @ is there more data? + .byte 0x86,0x0e,0xac,0xf2 @ pmull q0,q14,q3 @ H^2.lo·Xi.lo + movlo r12,#0 @ is it time to zero r12? + + .byte 0xa2,0xae,0xaa,0xf2 @ pmull q5,q13,q9 + veor q10,q10,q3 @ Karatsuba pre-processing + .byte 0x87,0x4e,0xad,0xf2 @ pmull2 q2,q14,q3 @ H^2.hi·Xi.hi + veor q0,q0,q4 @ accumulate + .byte 0xa5,0x2e,0xab,0xf2 @ pmull2 q1,q13,q10 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) + vld1.64 {q8},[r2],r12 @ load [rotated] I[i+2] + + veor q2,q2,q6 + moveq r12,#0 @ is it time to zero r12? 
+ veor q1,q1,q5 + + vext.8 q9,q0,q2,#8 @ Karatsuba post-processing + veor q10,q0,q2 + veor q1,q1,q9 + vld1.64 {q9},[r2],r12 @ load [rotated] I[i+3] +#ifndef __ARMEB__ + vrev64.8 q8,q8 +#endif + veor q1,q1,q10 + .byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction + +#ifndef __ARMEB__ + vrev64.8 q9,q9 +#endif + vmov d4,d3 @ Xh|Xm - 256-bit result + vmov d3,d0 @ Xm is rotated Xl + vext.8 q7,q9,q9,#8 + vext.8 q3,q8,q8,#8 + veor q0,q1,q10 + .byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1 + veor q3,q3,q2 @ accumulate q3 early + + vext.8 q10,q0,q0,#8 @ 2nd phase of reduction + .byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 + veor q3,q3,q10 + veor q9,q9,q7 @ Karatsuba pre-processing + veor q3,q3,q0 + .byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7 + bhs .Loop_mod2x_v8 @ there was at least 32 more bytes + + veor q2,q2,q10 + vext.8 q3,q8,q8,#8 @ re-construct q3 + adds r3,r3,#32 @ re-construct r3 + veor q0,q0,q2 @ re-construct q0 + beq .Ldone_v8 @ is r3 zero? +.Lodd_tail_v8: vext.8 q10,q0,q0,#8 veor q3,q3,q0 @ inp^=Xi - veor q9,q9,q10 @ q9 is rotated inp^Xi + veor q9,q8,q10 @ q9 is rotated inp^Xi -.Lgmult_v8: .byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo veor q9,q9,q3 @ Karatsuba pre-processing .byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi - subs r3,r3,#16 .byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) - moveq r12,#0 vext.8 q9,q0,q2,#8 @ Karatsuba post-processing veor q10,q0,q2 veor q1,q1,q9 - vld1.64 {q9},[r2],r12 @ load [rotated] inp veor q1,q1,q10 - .byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase + .byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction vmov d4,d3 @ Xh|Xm - 256-bit result vmov d3,d0 @ Xm is rotated Xl -#ifndef __ARMEB__ - vrev64.8 q9,q9 -#endif veor q0,q1,q10 - vext.8 q3,q9,q9,#8 - vext.8 q10,q0,q0,#8 @ 2nd phase + vext.8 q10,q0,q0,#8 @ 2nd phase of reduction .byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 veor q10,q10,q2 veor q0,q0,q10 - bhs .Loop_v8 +.Ldone_v8: #ifndef 
__ARMEB__ vrev64.8 q0,q0 #endif vext.8 q0,q0,q0,#8 vst1.64 {q0},[r0] @ write out Xi + vldmia sp!,{d8-d15} @ 32-bit ABI says so bx lr .size gcm_ghash_v8,.-gcm_ghash_v8 .asciz "GHASH for ARMv8, CRYPTOGAMS by " diff --git a/deps/openssl/asm/arm-void-gas/sha/sha256-armv4.S b/deps/openssl/asm/arm-void-gas/sha/sha256-armv4.S index bf1ce4f997..683f1cc0c8 100644 --- a/deps/openssl/asm/arm-void-gas/sha/sha256-armv4.S +++ b/deps/openssl/asm/arm-void-gas/sha/sha256-armv4.S @@ -1,7 +1,59 @@ -#include "arm_arch.h" + +@ ==================================================================== +@ Written by Andy Polyakov for the OpenSSL +@ project. The module is, however, dual licensed under OpenSSL and +@ CRYPTOGAMS licenses depending on where you obtain it. For further +@ details see http://www.openssl.org/~appro/cryptogams/. +@ +@ Permission to use under GPL terms is granted. +@ ==================================================================== + +@ SHA256 block procedure for ARMv4. May 2007. + +@ Performance is ~2x better than gcc 3.4 generated code and in "abso- +@ lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per +@ byte [on single-issue Xscale PXA250 core]. + +@ July 2010. +@ +@ Rescheduling for dual-issue pipeline resulted in 22% improvement on +@ Cortex A8 core and ~20 cycles per processed byte. + +@ February 2011. +@ +@ Profiler-assisted and platform-specific optimization resulted in 16% +@ improvement on Cortex A8 core and ~15.4 cycles per processed byte. + +@ September 2013. +@ +@ Add NEON implementation. On Cortex A8 it was measured to process one +@ byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon +@ S4 does it in 12.5 cycles too, but it's 50% faster than integer-only +@ code (meaning that latter performs sub-optimally, nothing was done +@ about it). + +@ May 2014. +@ +@ Add ARMv8 code path performing at 2.0 cpb on Apple A7. 
+ +#ifndef __KERNEL__ +# include "arm_arch.h" +#else +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ 7 +#endif .text +#if __ARM_ARCH__<7 .code 32 +#else +.syntax unified +# ifdef __thumb2__ +.thumb +# else +.code 32 +# endif +#endif .type K256,%object .align 5 @@ -24,7 +76,7 @@ K256: .word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .size K256,.-K256 .word 0 @ terminator -#if __ARM_MAX_ARCH__>=7 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) .LOPENSSL_armcap: .word OPENSSL_armcap_P-sha256_block_data_order #endif @@ -33,9 +85,12 @@ K256: .global sha256_block_data_order .type sha256_block_data_order,%function sha256_block_data_order: +#if __ARM_ARCH__<7 sub r3,pc,#8 @ sha256_block_data_order - add r2,r1,r2,lsl#6 @ len to point at the end of inp -#if __ARM_MAX_ARCH__>=7 +#else + adr r3,sha256_block_data_order +#endif +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) ldr r12,.LOPENSSL_armcap ldr r12,[r3,r12] @ OPENSSL_armcap_P tst r12,#ARMV8_SHA256 @@ -43,6 +98,7 @@ sha256_block_data_order: tst r12,#ARMV7_NEON bne .LNEON #endif + add r2,r1,r2,lsl#6 @ len to point at the end of inp stmdb sp!,{r0,r1,r2,r4-r11,lr} ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11} sub r14,r3,#256+32 @ K256 @@ -1736,6 +1792,9 @@ sha256_block_data_order: eor r12,r12,r6 @ Maj(a,b,c) add r4,r4,r0,ror#2 @ h+=Sigma0(a) @ add r4,r4,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + ite eq @ Thumb2 thing, sanity check in ARM +#endif ldreq r3,[sp,#16*4] @ pull ctx bne .Lrounds_16_xx @@ -1777,16 +1836,19 @@ sha256_block_data_order: .arch armv7-a .fpu neon +.global sha256_block_data_order_neon .type sha256_block_data_order_neon,%function .align 4 sha256_block_data_order_neon: .LNEON: stmdb sp!,{r4-r12,lr} + sub r11,sp,#16*4+16 + adr r14,K256 + bic r11,r11,#15 @ align for 128-bit stores mov r12,sp - sub sp,sp,#16*4+16 @ alloca - sub r14,r3,#256+32 @ K256 - bic sp,sp,#15 @ align for 128-bit stores + mov sp,r11 @ alloca + add r2,r1,r2,lsl#6 @ len to point at the end of inp vld1.8 {q0},[r1]! 
vld1.8 {q1},[r1]! @@ -2224,11 +2286,13 @@ sha256_block_data_order_neon: ldr r0,[sp,#72] sub r14,r14,#256 @ rewind r14 teq r1,r0 + it eq subeq r1,r1,#64 @ avoid SEGV vld1.8 {q0},[r1]! @ load next input block vld1.8 {q1},[r1]! vld1.8 {q2},[r1]! vld1.8 {q3},[r1]! + it ne strne r1,[sp,#68] mov r1,sp add r11,r11,r2 @@ -2542,23 +2606,38 @@ sha256_block_data_order_neon: str r7,[r2],#4 stmia r2,{r8-r11} + ittte ne movne r1,sp ldrne r2,[sp,#0] eorne r12,r12,r12 ldreq sp,[sp,#76] @ restore original sp + itt ne eorne r3,r5,r6 bne .L_00_48 ldmia sp!,{r4-r12,pc} .size sha256_block_data_order_neon,.-sha256_block_data_order_neon #endif -#if __ARM_MAX_ARCH__>=7 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + +# ifdef __thumb2__ +# define INST(a,b,c,d) .byte c,d|0xc,a,b +# else +# define INST(a,b,c,d) .byte a,b,c,d +# endif + .type sha256_block_data_order_armv8,%function .align 5 sha256_block_data_order_armv8: .LARMv8: vld1.32 {q0,q1},[r0] - sub r3,r3,#sha256_block_data_order-K256 +# ifdef __thumb2__ + adr r3,.LARMv8 + sub r3,r3,#.LARMv8-K256 +# else + adrl r3,K256 +# endif + add r2,r1,r2,lsl#6 @ len to point at the end of inp .Loop_v8: vld1.8 {q8-q9},[r1]! @@ -2573,114 +2652,115 @@ sha256_block_data_order_armv8: teq r1,r2 vld1.32 {q13},[r3]! vadd.i32 q12,q12,q8 - .byte 0xe2,0x03,0xfa,0xf3 @ sha256su0 q8,q9 + INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 vmov q2,q0 - .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12 - .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12 - .byte 0xe6,0x0c,0x64,0xf3 @ sha256su1 q8,q10,q11 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 vld1.32 {q12},[r3]! 
vadd.i32 q13,q13,q9 - .byte 0xe4,0x23,0xfa,0xf3 @ sha256su0 q9,q10 + INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 vmov q2,q0 - .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13 - .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13 - .byte 0xe0,0x2c,0x66,0xf3 @ sha256su1 q9,q11,q8 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 vld1.32 {q13},[r3]! vadd.i32 q12,q12,q10 - .byte 0xe6,0x43,0xfa,0xf3 @ sha256su0 q10,q11 + INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 vmov q2,q0 - .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12 - .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12 - .byte 0xe2,0x4c,0x60,0xf3 @ sha256su1 q10,q8,q9 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 vld1.32 {q12},[r3]! vadd.i32 q13,q13,q11 - .byte 0xe0,0x63,0xfa,0xf3 @ sha256su0 q11,q8 + INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 vmov q2,q0 - .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13 - .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13 - .byte 0xe4,0x6c,0x62,0xf3 @ sha256su1 q11,q9,q10 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 vld1.32 {q13},[r3]! vadd.i32 q12,q12,q8 - .byte 0xe2,0x03,0xfa,0xf3 @ sha256su0 q8,q9 + INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 vmov q2,q0 - .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12 - .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12 - .byte 0xe6,0x0c,0x64,0xf3 @ sha256su1 q8,q10,q11 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 vld1.32 {q12},[r3]! 
vadd.i32 q13,q13,q9 - .byte 0xe4,0x23,0xfa,0xf3 @ sha256su0 q9,q10 + INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 vmov q2,q0 - .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13 - .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13 - .byte 0xe0,0x2c,0x66,0xf3 @ sha256su1 q9,q11,q8 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 vld1.32 {q13},[r3]! vadd.i32 q12,q12,q10 - .byte 0xe6,0x43,0xfa,0xf3 @ sha256su0 q10,q11 + INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 vmov q2,q0 - .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12 - .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12 - .byte 0xe2,0x4c,0x60,0xf3 @ sha256su1 q10,q8,q9 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 vld1.32 {q12},[r3]! vadd.i32 q13,q13,q11 - .byte 0xe0,0x63,0xfa,0xf3 @ sha256su0 q11,q8 + INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 vmov q2,q0 - .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13 - .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13 - .byte 0xe4,0x6c,0x62,0xf3 @ sha256su1 q11,q9,q10 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 vld1.32 {q13},[r3]! vadd.i32 q12,q12,q8 - .byte 0xe2,0x03,0xfa,0xf3 @ sha256su0 q8,q9 + INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 vmov q2,q0 - .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12 - .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12 - .byte 0xe6,0x0c,0x64,0xf3 @ sha256su1 q8,q10,q11 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 vld1.32 {q12},[r3]! 
vadd.i32 q13,q13,q9 - .byte 0xe4,0x23,0xfa,0xf3 @ sha256su0 q9,q10 + INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 vmov q2,q0 - .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13 - .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13 - .byte 0xe0,0x2c,0x66,0xf3 @ sha256su1 q9,q11,q8 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 vld1.32 {q13},[r3]! vadd.i32 q12,q12,q10 - .byte 0xe6,0x43,0xfa,0xf3 @ sha256su0 q10,q11 + INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 vmov q2,q0 - .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12 - .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12 - .byte 0xe2,0x4c,0x60,0xf3 @ sha256su1 q10,q8,q9 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 vld1.32 {q12},[r3]! vadd.i32 q13,q13,q11 - .byte 0xe0,0x63,0xfa,0xf3 @ sha256su0 q11,q8 + INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 vmov q2,q0 - .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13 - .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13 - .byte 0xe4,0x6c,0x62,0xf3 @ sha256su1 q11,q9,q10 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 vld1.32 {q13},[r3]! vadd.i32 q12,q12,q8 vmov q2,q0 - .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12 - .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 vld1.32 {q12},[r3]! 
vadd.i32 q13,q13,q9 vmov q2,q0 - .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13 - .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 vld1.32 {q13},[r3] vadd.i32 q12,q12,q10 sub r3,r3,#256-16 @ rewind vmov q2,q0 - .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12 - .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 vadd.i32 q13,q13,q11 vmov q2,q0 - .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13 - .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 vadd.i32 q0,q0,q14 vadd.i32 q1,q1,q15 + it ne bne .Loop_v8 vst1.32 {q0,q1},[r0] @@ -2690,6 +2770,6 @@ sha256_block_data_order_armv8: #endif .asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by " .align 2 -#if __ARM_MAX_ARCH__>=7 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) .comm OPENSSL_armcap_P,4,4 #endif diff --git a/deps/openssl/asm/arm64-linux64-gas/aes/aesv8-armx.S b/deps/openssl/asm/arm64-linux64-gas/aes/aesv8-armx.S index 0a4b1ac4c4..f5dd6cbb86 100644 --- a/deps/openssl/asm/arm64-linux64-gas/aes/aesv8-armx.S +++ b/deps/openssl/asm/arm64-linux64-gas/aes/aesv8-armx.S @@ -227,17 +227,17 @@ aes_v8_encrypt: .Loop_enc: aese v2.16b,v0.16b - ld1 {v0.4s},[x2],#16 aesmc v2.16b,v2.16b + ld1 {v0.4s},[x2],#16 subs w3,w3,#2 aese v2.16b,v1.16b - ld1 {v1.4s},[x2],#16 aesmc v2.16b,v2.16b + ld1 {v1.4s},[x2],#16 b.gt .Loop_enc aese v2.16b,v0.16b - ld1 {v0.4s},[x2] aesmc v2.16b,v2.16b + ld1 {v0.4s},[x2] aese v2.16b,v1.16b eor v2.16b,v2.16b,v0.16b @@ -256,17 +256,17 @@ aes_v8_decrypt: .Loop_dec: aesd v2.16b,v0.16b - ld1 {v0.4s},[x2],#16 aesimc v2.16b,v2.16b + ld1 {v0.4s},[x2],#16 subs w3,w3,#2 aesd v2.16b,v1.16b - ld1 {v1.4s},[x2],#16 aesimc v2.16b,v2.16b + ld1 {v1.4s},[x2],#16 b.gt .Loop_dec aesd v2.16b,v0.16b - ld1 {v0.4s},[x2] aesimc v2.16b,v2.16b + 
ld1 {v0.4s},[x2] aesd v2.16b,v1.16b eor v2.16b,v2.16b,v0.16b @@ -308,16 +308,42 @@ aes_v8_cbc_encrypt: eor v5.16b,v16.16b,v7.16b b.eq .Lcbc_enc128 + ld1 {v2.4s-v3.4s},[x7] + add x7,x3,#16 + add x6,x3,#16*4 + add x12,x3,#16*5 + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + add x14,x3,#16*6 + add x3,x3,#16*7 + b .Lenter_cbc_enc + +.align 4 .Loop_cbc_enc: aese v0.16b,v16.16b - ld1 {v16.4s},[x7],#16 aesmc v0.16b,v0.16b - subs w6,w6,#2 + st1 {v6.16b},[x1],#16 +.Lenter_cbc_enc: aese v0.16b,v17.16b - ld1 {v17.4s},[x7],#16 aesmc v0.16b,v0.16b - b.gt .Loop_cbc_enc + aese v0.16b,v2.16b + aesmc v0.16b,v0.16b + ld1 {v16.4s},[x6] + cmp w5,#4 + aese v0.16b,v3.16b + aesmc v0.16b,v0.16b + ld1 {v17.4s},[x12] + b.eq .Lcbc_enc192 + + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + ld1 {v16.4s},[x14] + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + ld1 {v17.4s},[x3] + nop +.Lcbc_enc192: aese v0.16b,v16.16b aesmc v0.16b,v0.16b subs x2,x2,#16 @@ -326,7 +352,6 @@ aes_v8_cbc_encrypt: csel x8,xzr,x8,eq aese v0.16b,v18.16b aesmc v0.16b,v0.16b - add x7,x3,#16 aese v0.16b,v19.16b aesmc v0.16b,v0.16b ld1 {v16.16b},[x0],x8 @@ -335,16 +360,14 @@ aes_v8_cbc_encrypt: eor v16.16b,v16.16b,v5.16b aese v0.16b,v21.16b aesmc v0.16b,v0.16b - ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] + ld1 {v17.4s},[x7] // re-pre-load rndkey[1] aese v0.16b,v22.16b aesmc v0.16b,v0.16b aese v0.16b,v23.16b - - mov w6,w5 eor v6.16b,v0.16b,v7.16b - st1 {v6.16b},[x1],#16 b.hs .Loop_cbc_enc + st1 {v6.16b},[x1],#16 b .Lcbc_done .align 5 @@ -402,79 +425,78 @@ aes_v8_cbc_encrypt: .Loop3x_cbc_dec: aesd v0.16b,v16.16b - aesd v1.16b,v16.16b - aesd v18.16b,v16.16b - ld1 {v16.4s},[x7],#16 aesimc v0.16b,v0.16b + aesd v1.16b,v16.16b aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b aesimc v18.16b,v18.16b + ld1 {v16.4s},[x7],#16 subs w6,w6,#2 aesd v0.16b,v17.16b - aesd v1.16b,v17.16b - aesd v18.16b,v17.16b - ld1 {v17.4s},[x7],#16 aesimc v0.16b,v0.16b + aesd v1.16b,v17.16b aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b aesimc v18.16b,v18.16b + ld1 
{v17.4s},[x7],#16 b.gt .Loop3x_cbc_dec aesd v0.16b,v16.16b - aesd v1.16b,v16.16b - aesd v18.16b,v16.16b - eor v4.16b,v6.16b,v7.16b aesimc v0.16b,v0.16b + aesd v1.16b,v16.16b aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b aesimc v18.16b,v18.16b + eor v4.16b,v6.16b,v7.16b + subs x2,x2,#0x30 eor v5.16b,v2.16b,v7.16b + csel x6,x2,x6,lo // x6, w6, is zero at this point aesd v0.16b,v17.16b - aesd v1.16b,v17.16b - aesd v18.16b,v17.16b - eor v17.16b,v3.16b,v7.16b - subs x2,x2,#0x30 aesimc v0.16b,v0.16b + aesd v1.16b,v17.16b aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b aesimc v18.16b,v18.16b - orr v6.16b,v19.16b,v19.16b - csel x6,x2,x6,lo // x6, w6, is zero at this point - aesd v0.16b,v20.16b - aesd v1.16b,v20.16b - aesd v18.16b,v20.16b + eor v17.16b,v3.16b,v7.16b add x0,x0,x6 // x0 is adjusted in such way that // at exit from the loop v1.16b-v18.16b // are loaded with last "words" + orr v6.16b,v19.16b,v19.16b + mov x7,x3 + aesd v0.16b,v20.16b aesimc v0.16b,v0.16b + aesd v1.16b,v20.16b aesimc v1.16b,v1.16b + aesd v18.16b,v20.16b aesimc v18.16b,v18.16b - mov x7,x3 - aesd v0.16b,v21.16b - aesd v1.16b,v21.16b - aesd v18.16b,v21.16b ld1 {v2.16b},[x0],#16 + aesd v0.16b,v21.16b aesimc v0.16b,v0.16b + aesd v1.16b,v21.16b aesimc v1.16b,v1.16b + aesd v18.16b,v21.16b aesimc v18.16b,v18.16b ld1 {v3.16b},[x0],#16 aesd v0.16b,v22.16b - aesd v1.16b,v22.16b - aesd v18.16b,v22.16b - ld1 {v19.16b},[x0],#16 aesimc v0.16b,v0.16b + aesd v1.16b,v22.16b aesimc v1.16b,v1.16b + aesd v18.16b,v22.16b aesimc v18.16b,v18.16b - ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + ld1 {v19.16b},[x0],#16 aesd v0.16b,v23.16b aesd v1.16b,v23.16b aesd v18.16b,v23.16b - + ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] add w6,w5,#2 eor v4.16b,v4.16b,v0.16b eor v5.16b,v5.16b,v1.16b eor v18.16b,v18.16b,v17.16b ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] - orr v0.16b,v2.16b,v2.16b st1 {v4.16b},[x1],#16 - orr v1.16b,v3.16b,v3.16b + orr v0.16b,v2.16b,v2.16b st1 {v5.16b},[x1],#16 + orr v1.16b,v3.16b,v3.16b st1 
{v18.16b},[x1],#16 orr v18.16b,v19.16b,v19.16b b.hs .Loop3x_cbc_dec @@ -485,39 +507,39 @@ aes_v8_cbc_encrypt: .Lcbc_dec_tail: aesd v1.16b,v16.16b - aesd v18.16b,v16.16b - ld1 {v16.4s},[x7],#16 aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b aesimc v18.16b,v18.16b + ld1 {v16.4s},[x7],#16 subs w6,w6,#2 aesd v1.16b,v17.16b - aesd v18.16b,v17.16b - ld1 {v17.4s},[x7],#16 aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b aesimc v18.16b,v18.16b + ld1 {v17.4s},[x7],#16 b.gt .Lcbc_dec_tail aesd v1.16b,v16.16b - aesd v18.16b,v16.16b aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b aesimc v18.16b,v18.16b aesd v1.16b,v17.16b - aesd v18.16b,v17.16b aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b aesimc v18.16b,v18.16b aesd v1.16b,v20.16b - aesd v18.16b,v20.16b aesimc v1.16b,v1.16b + aesd v18.16b,v20.16b aesimc v18.16b,v18.16b cmn x2,#0x20 aesd v1.16b,v21.16b - aesd v18.16b,v21.16b aesimc v1.16b,v1.16b + aesd v18.16b,v21.16b aesimc v18.16b,v18.16b eor v5.16b,v6.16b,v7.16b aesd v1.16b,v22.16b - aesd v18.16b,v22.16b aesimc v1.16b,v1.16b + aesd v18.16b,v22.16b aesimc v18.16b,v18.16b eor v17.16b,v3.16b,v7.16b aesd v1.16b,v23.16b @@ -583,70 +605,69 @@ aes_v8_ctr32_encrypt_blocks: .align 4 .Loop3x_ctr32: aese v0.16b,v16.16b - aese v1.16b,v16.16b - aese v18.16b,v16.16b - ld1 {v16.4s},[x7],#16 aesmc v0.16b,v0.16b + aese v1.16b,v16.16b aesmc v1.16b,v1.16b + aese v18.16b,v16.16b aesmc v18.16b,v18.16b + ld1 {v16.4s},[x7],#16 subs w6,w6,#2 aese v0.16b,v17.16b - aese v1.16b,v17.16b - aese v18.16b,v17.16b - ld1 {v17.4s},[x7],#16 aesmc v0.16b,v0.16b + aese v1.16b,v17.16b aesmc v1.16b,v1.16b + aese v18.16b,v17.16b aesmc v18.16b,v18.16b + ld1 {v17.4s},[x7],#16 b.gt .Loop3x_ctr32 aese v0.16b,v16.16b - aese v1.16b,v16.16b - aese v18.16b,v16.16b - mov x7,x3 aesmc v4.16b,v0.16b - ld1 {v2.16b},[x0],#16 + aese v1.16b,v16.16b aesmc v5.16b,v1.16b - aesmc v18.16b,v18.16b + ld1 {v2.16b},[x0],#16 orr v0.16b,v6.16b,v6.16b - aese v4.16b,v17.16b + aese v18.16b,v16.16b + aesmc v18.16b,v18.16b ld1 {v3.16b},[x0],#16 - aese 
v5.16b,v17.16b - aese v18.16b,v17.16b orr v1.16b,v6.16b,v6.16b + aese v4.16b,v17.16b aesmc v4.16b,v4.16b - ld1 {v19.16b},[x0],#16 + aese v5.16b,v17.16b aesmc v5.16b,v5.16b + ld1 {v19.16b},[x0],#16 + mov x7,x3 + aese v18.16b,v17.16b aesmc v17.16b,v18.16b orr v18.16b,v6.16b,v6.16b add w9,w8,#1 aese v4.16b,v20.16b + aesmc v4.16b,v4.16b aese v5.16b,v20.16b - aese v17.16b,v20.16b + aesmc v5.16b,v5.16b eor v2.16b,v2.16b,v7.16b add w10,w8,#2 - aesmc v4.16b,v4.16b - aesmc v5.16b,v5.16b + aese v17.16b,v20.16b aesmc v17.16b,v17.16b eor v3.16b,v3.16b,v7.16b add w8,w8,#3 aese v4.16b,v21.16b + aesmc v4.16b,v4.16b aese v5.16b,v21.16b - aese v17.16b,v21.16b + aesmc v5.16b,v5.16b eor v19.16b,v19.16b,v7.16b rev w9,w9 - aesmc v4.16b,v4.16b - ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] - aesmc v5.16b,v5.16b + aese v17.16b,v21.16b aesmc v17.16b,v17.16b mov v0.s[3], w9 rev w10,w10 aese v4.16b,v22.16b + aesmc v4.16b,v4.16b aese v5.16b,v22.16b - aese v17.16b,v22.16b + aesmc v5.16b,v5.16b mov v1.s[3], w10 rev w12,w8 - aesmc v4.16b,v4.16b - aesmc v5.16b,v5.16b + aese v17.16b,v22.16b aesmc v17.16b,v17.16b mov v18.s[3], w12 subs x2,x2,#3 @@ -654,13 +675,14 @@ aes_v8_ctr32_encrypt_blocks: aese v5.16b,v23.16b aese v17.16b,v23.16b - mov w6,w5 eor v2.16b,v2.16b,v4.16b + ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + st1 {v2.16b},[x1],#16 eor v3.16b,v3.16b,v5.16b + mov w6,w5 + st1 {v3.16b},[x1],#16 eor v19.16b,v19.16b,v17.16b ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] - st1 {v2.16b},[x1],#16 - st1 {v3.16b},[x1],#16 st1 {v19.16b},[x1],#16 b.hs .Loop3x_ctr32 @@ -672,40 +694,40 @@ aes_v8_ctr32_encrypt_blocks: .Lctr32_tail: aese v0.16b,v16.16b - aese v1.16b,v16.16b - ld1 {v16.4s},[x7],#16 aesmc v0.16b,v0.16b + aese v1.16b,v16.16b aesmc v1.16b,v1.16b + ld1 {v16.4s},[x7],#16 subs w6,w6,#2 aese v0.16b,v17.16b - aese v1.16b,v17.16b - ld1 {v17.4s},[x7],#16 aesmc v0.16b,v0.16b + aese v1.16b,v17.16b aesmc v1.16b,v1.16b + ld1 {v17.4s},[x7],#16 b.gt .Lctr32_tail aese v0.16b,v16.16b - aese 
v1.16b,v16.16b aesmc v0.16b,v0.16b + aese v1.16b,v16.16b aesmc v1.16b,v1.16b aese v0.16b,v17.16b - aese v1.16b,v17.16b aesmc v0.16b,v0.16b + aese v1.16b,v17.16b aesmc v1.16b,v1.16b ld1 {v2.16b},[x0],x12 aese v0.16b,v20.16b - aese v1.16b,v20.16b - ld1 {v3.16b},[x0] aesmc v0.16b,v0.16b + aese v1.16b,v20.16b aesmc v1.16b,v1.16b + ld1 {v3.16b},[x0] aese v0.16b,v21.16b - aese v1.16b,v21.16b aesmc v0.16b,v0.16b + aese v1.16b,v21.16b aesmc v1.16b,v1.16b - aese v0.16b,v22.16b - aese v1.16b,v22.16b eor v2.16b,v2.16b,v7.16b + aese v0.16b,v22.16b aesmc v0.16b,v0.16b + aese v1.16b,v22.16b aesmc v1.16b,v1.16b eor v3.16b,v3.16b,v7.16b aese v0.16b,v23.16b diff --git a/deps/openssl/asm/arm64-linux64-gas/modes/ghashv8-armx.S b/deps/openssl/asm/arm64-linux64-gas/modes/ghashv8-armx.S index 1bfb26340a..479007dc54 100644 --- a/deps/openssl/asm/arm64-linux64-gas/modes/ghashv8-armx.S +++ b/deps/openssl/asm/arm64-linux64-gas/modes/ghashv8-armx.S @@ -6,103 +6,215 @@ .type gcm_init_v8,%function .align 4 gcm_init_v8: - ld1 {v17.2d},[x1] //load H - movi v16.16b,#0xe1 + ld1 {v17.2d},[x1] //load input H + movi v19.16b,#0xe1 + shl v19.2d,v19.2d,#57 //0xc2.0 ext v3.16b,v17.16b,v17.16b,#8 - shl v16.2d,v16.2d,#57 - ushr v18.2d,v16.2d,#63 - ext v16.16b,v18.16b,v16.16b,#8 //t0=0xc2....01 + ushr v18.2d,v19.2d,#63 dup v17.4s,v17.s[1] - ushr v19.2d,v3.2d,#63 + ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01 + ushr v18.2d,v3.2d,#63 sshr v17.4s,v17.4s,#31 //broadcast carry bit - and v19.16b,v19.16b,v16.16b + and v18.16b,v18.16b,v16.16b shl v3.2d,v3.2d,#1 - ext v19.16b,v19.16b,v19.16b,#8 + ext v18.16b,v18.16b,v18.16b,#8 and v16.16b,v16.16b,v17.16b - orr v3.16b,v3.16b,v19.16b //H<<<=1 - eor v3.16b,v3.16b,v16.16b //twisted H - st1 {v3.2d},[x0] + orr v3.16b,v3.16b,v18.16b //H<<<=1 + eor v20.16b,v3.16b,v16.16b //twisted H + st1 {v20.2d},[x0],#16 //store Htable[0] + + //calculate H^2 + ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing + pmull v0.1q,v20.1d,v20.1d + eor v16.16b,v16.16b,v20.16b + pmull2 
v2.1q,v20.2d,v20.2d + pmull v1.1q,v16.1d,v16.1d + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v22.16b,v0.16b,v18.16b + + ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing + eor v17.16b,v17.16b,v22.16b + ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed + st1 {v21.2d-v22.2d},[x0] //store Htable[1..2] ret .size gcm_init_v8,.-gcm_init_v8 - .global gcm_gmult_v8 .type gcm_gmult_v8,%function .align 4 gcm_gmult_v8: ld1 {v17.2d},[x0] //load Xi movi v19.16b,#0xe1 - ld1 {v20.2d},[x1] //load twisted H + ld1 {v20.2d-v21.2d},[x1] //load twisted H, ... shl v19.2d,v19.2d,#57 #ifndef __ARMEB__ rev64 v17.16b,v17.16b #endif - ext v21.16b,v20.16b,v20.16b,#8 - mov x3,#0 ext v3.16b,v17.16b,v17.16b,#8 - mov x12,#0 - eor v21.16b,v21.16b,v20.16b //Karatsuba pre-processing - mov x2,x0 - b .Lgmult_v8 -.size gcm_gmult_v8,.-gcm_gmult_v8 + pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo + eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing + pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi + pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + +#ifndef __ARMEB__ + rev64 v0.16b,v0.16b +#endif + ext v0.16b,v0.16b,v0.16b,#8 + st1 {v0.2d},[x0] //write out Xi + + ret +.size gcm_gmult_v8,.-gcm_gmult_v8 .global gcm_ghash_v8 .type gcm_ghash_v8,%function .align 4 gcm_ghash_v8: 
ld1 {v0.2d},[x0] //load [rotated] Xi - subs x3,x3,#16 + //"[rotated]" means that + //loaded value would have + //to be rotated in order to + //make it appear as in + //algorithm specification + subs x3,x3,#32 //see if x3 is 32 or larger + mov x12,#16 //x12 is used as post- + //increment for input pointer; + //as loop is modulo-scheduled + //x12 is zeroed just in time + //to preclude overstepping + //inp[len], which means that + //last block[s] are actually + //loaded twice, but last + //copy is not processed + ld1 {v20.2d-v21.2d},[x1],#32 //load twisted H, ..., H^2 movi v19.16b,#0xe1 - mov x12,#16 - ld1 {v20.2d},[x1] //load twisted H - csel x12,xzr,x12,eq - ext v0.16b,v0.16b,v0.16b,#8 - shl v19.2d,v19.2d,#57 - ld1 {v17.2d},[x2],x12 //load [rotated] inp - ext v21.16b,v20.16b,v20.16b,#8 + ld1 {v22.2d},[x1] + csel x12,xzr,x12,eq //is it time to zero x12? + ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi + ld1 {v16.2d},[x2],#16 //load [rotated] I[0] + shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant #ifndef __ARMEB__ + rev64 v16.16b,v16.16b rev64 v0.16b,v0.16b +#endif + ext v3.16b,v16.16b,v16.16b,#8 //rotate I[0] + b.lo .Lodd_tail_v8 //x3 was less than 32 + ld1 {v17.2d},[x2],x12 //load [rotated] I[1] +#ifndef __ARMEB__ rev64 v17.16b,v17.16b #endif - eor v21.16b,v21.16b,v20.16b //Karatsuba pre-processing - ext v3.16b,v17.16b,v17.16b,#8 - b .Loop_v8 + ext v7.16b,v17.16b,v17.16b,#8 + eor v3.16b,v3.16b,v0.16b //I[i]^=Xi + pmull v4.1q,v20.1d,v7.1d //H·Ii+1 + eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing + pmull2 v6.1q,v20.2d,v7.2d + b .Loop_mod2x_v8 .align 4 -.Loop_v8: +.Loop_mod2x_v8: + ext v18.16b,v3.16b,v3.16b,#8 + subs x3,x3,#32 //is there more data? + pmull v0.1q,v22.1d,v3.1d //H^2.lo·Xi.lo + csel x12,xzr,x12,lo //is it time to zero x12? 
+ + pmull v5.1q,v21.1d,v17.1d + eor v18.16b,v18.16b,v3.16b //Karatsuba pre-processing + pmull2 v2.1q,v22.2d,v3.2d //H^2.hi·Xi.hi + eor v0.16b,v0.16b,v4.16b //accumulate + pmull2 v1.1q,v21.2d,v18.2d //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) + ld1 {v16.2d},[x2],x12 //load [rotated] I[i+2] + + eor v2.16b,v2.16b,v6.16b + csel x12,xzr,x12,eq //is it time to zero x12? + eor v1.16b,v1.16b,v5.16b + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + ld1 {v17.2d},[x2],x12 //load [rotated] I[i+3] +#ifndef __ARMEB__ + rev64 v16.16b,v16.16b +#endif + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + +#ifndef __ARMEB__ + rev64 v17.16b,v17.16b +#endif + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + ext v7.16b,v17.16b,v17.16b,#8 + ext v3.16b,v16.16b,v16.16b,#8 + eor v0.16b,v1.16b,v18.16b + pmull v4.1q,v20.1d,v7.1d //H·Ii+1 + eor v3.16b,v3.16b,v2.16b //accumulate v3.16b early + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v3.16b,v3.16b,v18.16b + eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing + eor v3.16b,v3.16b,v0.16b + pmull2 v6.1q,v20.2d,v7.2d + b.hs .Loop_mod2x_v8 //there was at least 32 more bytes + + eor v2.16b,v2.16b,v18.16b + ext v3.16b,v16.16b,v16.16b,#8 //re-construct v3.16b + adds x3,x3,#32 //re-construct x3 + eor v0.16b,v0.16b,v2.16b //re-construct v0.16b + b.eq .Ldone_v8 //is x3 zero? 
+.Lodd_tail_v8: ext v18.16b,v0.16b,v0.16b,#8 eor v3.16b,v3.16b,v0.16b //inp^=Xi - eor v17.16b,v17.16b,v18.16b //v17.16b is rotated inp^Xi + eor v17.16b,v16.16b,v18.16b //v17.16b is rotated inp^Xi -.Lgmult_v8: pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi - subs x3,x3,#16 pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) - csel x12,xzr,x12,eq ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing eor v18.16b,v0.16b,v2.16b eor v1.16b,v1.16b,v17.16b - ld1 {v17.2d},[x2],x12 //load [rotated] inp eor v1.16b,v1.16b,v18.16b - pmull v18.1q,v0.1d,v19.1d //1st phase + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction ins v2.d[0],v1.d[1] ins v1.d[1],v0.d[0] -#ifndef __ARMEB__ - rev64 v17.16b,v17.16b -#endif eor v0.16b,v1.16b,v18.16b - ext v3.16b,v17.16b,v17.16b,#8 - ext v18.16b,v0.16b,v0.16b,#8 //2nd phase + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction pmull v0.1q,v0.1d,v19.1d eor v18.16b,v18.16b,v2.16b eor v0.16b,v0.16b,v18.16b - b.hs .Loop_v8 +.Ldone_v8: #ifndef __ARMEB__ rev64 v0.16b,v0.16b #endif diff --git a/deps/openssl/asm/x64-elf-gas/aes/aesni-x86_64.s b/deps/openssl/asm/x64-elf-gas/aes/aesni-x86_64.s index 84708afbbb..6573fe4be3 100644 --- a/deps/openssl/asm/x64-elf-gas/aes/aesni-x86_64.s +++ b/deps/openssl/asm/x64-elf-gas/aes/aesni-x86_64.s @@ -17,7 +17,10 @@ aesni_encrypt: leaq 16(%rdx),%rdx jnz .Loop_enc1_1 .byte 102,15,56,221,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 .byte 0xf3,0xc3 .size aesni_encrypt,.-aesni_encrypt @@ -38,7 +41,10 @@ aesni_decrypt: leaq 16(%rdx),%rdx jnz .Loop_dec1_2 .byte 102,15,56,223,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 .byte 0xf3,0xc3 .size aesni_decrypt, .-aesni_decrypt .type _aesni_encrypt2,@function @@ -264,21 +270,18 @@ _aesni_encrypt6: pxor %xmm0,%xmm6 .byte 102,15,56,220,225 pxor %xmm0,%xmm7 + movups (%rcx,%rax,1),%xmm0 addq $16,%rax -.byte 
102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 - movups -16(%rcx,%rax,1),%xmm0 jmp .Lenc_loop6_enter .align 16 .Lenc_loop6: .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 +.Lenc_loop6_enter: .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 -.Lenc_loop6_enter: movups (%rcx,%rax,1),%xmm1 addq $32,%rax .byte 102,15,56,220,208 @@ -321,21 +324,18 @@ _aesni_decrypt6: pxor %xmm0,%xmm6 .byte 102,15,56,222,225 pxor %xmm0,%xmm7 + movups (%rcx,%rax,1),%xmm0 addq $16,%rax -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 - movups -16(%rcx,%rax,1),%xmm0 jmp .Ldec_loop6_enter .align 16 .Ldec_loop6: .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 +.Ldec_loop6_enter: .byte 102,15,56,222,233 .byte 102,15,56,222,241 .byte 102,15,56,222,249 -.Ldec_loop6_enter: movups (%rcx,%rax,1),%xmm1 addq $32,%rax .byte 102,15,56,222,208 @@ -375,23 +375,18 @@ _aesni_encrypt8: leaq 32(%rcx,%rax,1),%rcx negq %rax .byte 102,15,56,220,209 - addq $16,%rax pxor %xmm0,%xmm7 -.byte 102,15,56,220,217 pxor %xmm0,%xmm8 +.byte 102,15,56,220,217 pxor %xmm0,%xmm9 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 -.byte 102,68,15,56,220,193 -.byte 102,68,15,56,220,201 - movups -16(%rcx,%rax,1),%xmm0 - jmp .Lenc_loop8_enter + movups (%rcx,%rax,1),%xmm0 + addq $16,%rax + jmp .Lenc_loop8_inner .align 16 .Lenc_loop8: .byte 102,15,56,220,209 .byte 102,15,56,220,217 +.Lenc_loop8_inner: .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 @@ -444,23 +439,18 @@ _aesni_decrypt8: leaq 32(%rcx,%rax,1),%rcx negq %rax .byte 102,15,56,222,209 - addq $16,%rax pxor %xmm0,%xmm7 -.byte 102,15,56,222,217 pxor %xmm0,%xmm8 +.byte 102,15,56,222,217 pxor %xmm0,%xmm9 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 -.byte 102,68,15,56,222,193 -.byte 102,68,15,56,222,201 - movups -16(%rcx,%rax,1),%xmm0 
- jmp .Ldec_loop8_enter + movups (%rcx,%rax,1),%xmm0 + addq $16,%rax + jmp .Ldec_loop8_inner .align 16 .Ldec_loop8: .byte 102,15,56,222,209 .byte 102,15,56,222,217 +.Ldec_loop8_inner: .byte 102,15,56,222,225 .byte 102,15,56,222,233 .byte 102,15,56,222,241 @@ -587,6 +577,7 @@ aesni_ecb_encrypt: movups 80(%rdi),%xmm7 je .Lecb_enc_six movdqu 96(%rdi),%xmm8 + xorps %xmm9,%xmm9 call _aesni_encrypt8 movups %xmm2,(%rsi) movups %xmm3,16(%rsi) @@ -700,15 +691,23 @@ aesni_ecb_encrypt: jnc .Lecb_dec_loop8 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 movq %r11,%rcx movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 movl %r10d,%eax movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 movups %xmm5,48(%rsi) + pxor %xmm5,%xmm5 movups %xmm6,64(%rsi) + pxor %xmm6,%xmm6 movups %xmm7,80(%rsi) + pxor %xmm7,%xmm7 movups %xmm8,96(%rsi) + pxor %xmm8,%xmm8 movups %xmm9,112(%rsi) + pxor %xmm9,%xmm9 leaq 128(%rsi),%rsi addq $128,%rdx jz .Lecb_ret @@ -731,14 +730,23 @@ aesni_ecb_encrypt: je .Lecb_dec_six movups 96(%rdi),%xmm8 movups (%rcx),%xmm0 + xorps %xmm9,%xmm9 call _aesni_decrypt8 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 movups %xmm5,48(%rsi) + pxor %xmm5,%xmm5 movups %xmm6,64(%rsi) + pxor %xmm6,%xmm6 movups %xmm7,80(%rsi) + pxor %xmm7,%xmm7 movups %xmm8,96(%rsi) + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 jmp .Lecb_ret .align 16 .Lecb_dec_one: @@ -754,49 +762,73 @@ aesni_ecb_encrypt: jnz .Loop_dec1_4 .byte 102,15,56,223,209 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 jmp .Lecb_ret .align 16 .Lecb_dec_two: call _aesni_decrypt2 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 jmp .Lecb_ret .align 16 .Lecb_dec_three: call _aesni_decrypt3 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 jmp .Lecb_ret .align 16 .Lecb_dec_four: call _aesni_decrypt4 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 movups %xmm4,32(%rsi) + 
pxor %xmm4,%xmm4 movups %xmm5,48(%rsi) + pxor %xmm5,%xmm5 jmp .Lecb_ret .align 16 .Lecb_dec_five: xorps %xmm7,%xmm7 call _aesni_decrypt6 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 movups %xmm5,48(%rsi) + pxor %xmm5,%xmm5 movups %xmm6,64(%rsi) + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 jmp .Lecb_ret .align 16 .Lecb_dec_six: call _aesni_decrypt6 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 movups %xmm5,48(%rsi) + pxor %xmm5,%xmm5 movups %xmm6,64(%rsi) + pxor %xmm6,%xmm6 movups %xmm7,80(%rsi) + pxor %xmm7,%xmm7 .Lecb_ret: + xorps %xmm0,%xmm0 + pxor %xmm1,%xmm1 .byte 0xf3,0xc3 .size aesni_ecb_encrypt,.-aesni_ecb_encrypt .globl aesni_ccm64_encrypt_blocks @@ -853,7 +885,13 @@ aesni_ccm64_encrypt_blocks: leaq 16(%rsi),%rsi jnz .Lccm64_enc_outer + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 movups %xmm3,(%r9) + pxor %xmm3,%xmm3 + pxor %xmm8,%xmm8 + pxor %xmm6,%xmm6 .byte 0xf3,0xc3 .size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks .globl aesni_ccm64_decrypt_blocks @@ -944,21 +982,56 @@ aesni_ccm64_decrypt_blocks: leaq 16(%r11),%r11 jnz .Loop_enc1_6 .byte 102,15,56,221,217 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 movups %xmm3,(%r9) + pxor %xmm3,%xmm3 + pxor %xmm8,%xmm8 + pxor %xmm6,%xmm6 .byte 0xf3,0xc3 .size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks .globl aesni_ctr32_encrypt_blocks .type aesni_ctr32_encrypt_blocks,@function .align 16 aesni_ctr32_encrypt_blocks: + cmpq $1,%rdx + jne .Lctr32_bulk + + + + movups (%r8),%xmm2 + movups (%rdi),%xmm3 + movl 240(%rcx),%edx + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + xorps %xmm0,%xmm2 +.Loop_enc1_7: +.byte 102,15,56,220,209 + decl %edx + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz .Loop_enc1_7 +.byte 102,15,56,221,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + xorps %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movups 
%xmm2,(%rsi) + xorps %xmm2,%xmm2 + jmp .Lctr32_epilogue + +.align 16 +.Lctr32_bulk: leaq (%rsp),%rax pushq %rbp subq $128,%rsp andq $-16,%rsp leaq -8(%rax),%rbp - cmpq $1,%rdx - je .Lctr32_one_shortcut + + movdqu (%r8),%xmm2 movdqu (%rcx),%xmm0 @@ -1349,11 +1422,14 @@ aesni_ctr32_encrypt_blocks: leaq -128(%rcx),%rcx .Lctr32_tail: + + leaq 16(%rcx),%rcx cmpq $4,%rdx jb .Lctr32_loop3 je .Lctr32_loop4 + shll $4,%eax movdqa 96(%rsp),%xmm8 pxor %xmm9,%xmm9 @@ -1456,30 +1532,33 @@ aesni_ctr32_encrypt_blocks: movups 32(%rdi),%xmm12 xorps %xmm12,%xmm4 movups %xmm4,32(%rsi) - jmp .Lctr32_done -.align 16 -.Lctr32_one_shortcut: - movups (%r8),%xmm2 - movups (%rdi),%xmm10 - movl 240(%rcx),%eax - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -.Loop_enc1_7: -.byte 102,15,56,220,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz .Loop_enc1_7 -.byte 102,15,56,221,209 - xorps %xmm10,%xmm2 - movups %xmm2,(%rsi) - jmp .Lctr32_done - -.align 16 .Lctr32_done: + xorps %xmm0,%xmm0 + xorl %r11d,%r11d + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + movaps %xmm0,0(%rsp) + pxor %xmm8,%xmm8 + movaps %xmm0,16(%rsp) + pxor %xmm9,%xmm9 + movaps %xmm0,32(%rsp) + pxor %xmm10,%xmm10 + movaps %xmm0,48(%rsp) + pxor %xmm11,%xmm11 + movaps %xmm0,64(%rsp) + pxor %xmm12,%xmm12 + movaps %xmm0,80(%rsp) + pxor %xmm13,%xmm13 + movaps %xmm0,96(%rsp) + pxor %xmm14,%xmm14 + movaps %xmm0,112(%rsp) + pxor %xmm15,%xmm15 leaq (%rbp),%rsp popq %rbp .Lctr32_epilogue: @@ -1750,6 +1829,7 @@ aesni_xts_encrypt: shrl $4,%eax .Lxts_enc_short: + movl %eax,%r10d pxor %xmm0,%xmm10 addq $96,%rdx @@ -1778,6 +1858,7 @@ aesni_xts_encrypt: pxor %xmm12,%xmm4 pxor %xmm13,%xmm5 pxor %xmm14,%xmm6 + pxor %xmm7,%xmm7 call _aesni_encrypt6 @@ -1920,6 +2001,29 @@ aesni_xts_encrypt: movups %xmm2,-16(%rsi) .Lxts_enc_ret: + xorps %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor 
%xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + movaps %xmm0,0(%rsp) + pxor %xmm8,%xmm8 + movaps %xmm0,16(%rsp) + pxor %xmm9,%xmm9 + movaps %xmm0,32(%rsp) + pxor %xmm10,%xmm10 + movaps %xmm0,48(%rsp) + pxor %xmm11,%xmm11 + movaps %xmm0,64(%rsp) + pxor %xmm12,%xmm12 + movaps %xmm0,80(%rsp) + pxor %xmm13,%xmm13 + movaps %xmm0,96(%rsp) + pxor %xmm14,%xmm14 + pxor %xmm15,%xmm15 leaq (%rbp),%rsp popq %rbp .Lxts_enc_epilogue: @@ -2196,6 +2300,7 @@ aesni_xts_decrypt: shrl $4,%eax .Lxts_dec_short: + movl %eax,%r10d pxor %xmm0,%xmm10 pxor %xmm0,%xmm11 @@ -2398,6 +2503,29 @@ aesni_xts_decrypt: movups %xmm2,(%rsi) .Lxts_dec_ret: + xorps %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + movaps %xmm0,0(%rsp) + pxor %xmm8,%xmm8 + movaps %xmm0,16(%rsp) + pxor %xmm9,%xmm9 + movaps %xmm0,32(%rsp) + pxor %xmm10,%xmm10 + movaps %xmm0,48(%rsp) + pxor %xmm11,%xmm11 + movaps %xmm0,64(%rsp) + pxor %xmm12,%xmm12 + movaps %xmm0,80(%rsp) + pxor %xmm13,%xmm13 + movaps %xmm0,96(%rsp) + pxor %xmm14,%xmm14 + pxor %xmm15,%xmm15 leaq (%rbp),%rsp popq %rbp .Lxts_dec_epilogue: @@ -2446,7 +2574,11 @@ aesni_cbc_encrypt: jnc .Lcbc_enc_loop addq $16,%rdx jnz .Lcbc_enc_tail + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 movups %xmm2,(%r8) + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 jmp .Lcbc_ret .Lcbc_enc_tail: @@ -2466,6 +2598,35 @@ aesni_cbc_encrypt: .align 16 .Lcbc_decrypt: + cmpq $16,%rdx + jne .Lcbc_decrypt_bulk + + + + movdqu (%rdi),%xmm2 + movdqu (%r8),%xmm3 + movdqa %xmm2,%xmm4 + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + xorps %xmm0,%xmm2 +.Loop_dec1_16: +.byte 102,15,56,222,209 + decl %r10d + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz .Loop_dec1_16 +.byte 102,15,56,223,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movdqu %xmm4,(%r8) + xorps %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + jmp .Lcbc_ret +.align 16 +.Lcbc_decrypt_bulk: leaq 
(%rsp),%rax pushq %rbp subq $16,%rsp @@ -2702,7 +2863,7 @@ aesni_cbc_encrypt: movaps %xmm9,%xmm2 leaq -112(%rcx),%rcx addq $112,%rdx - jle .Lcbc_dec_tail_collected + jle .Lcbc_dec_clear_tail_collected movups %xmm9,(%rsi) leaq 16(%rsi),%rsi cmpq $80,%rdx @@ -2721,14 +2882,19 @@ aesni_cbc_encrypt: movdqu %xmm2,(%rsi) pxor %xmm12,%xmm4 movdqu %xmm3,16(%rsi) + pxor %xmm3,%xmm3 pxor %xmm13,%xmm5 movdqu %xmm4,32(%rsi) + pxor %xmm4,%xmm4 pxor %xmm14,%xmm6 movdqu %xmm5,48(%rsi) + pxor %xmm5,%xmm5 pxor %xmm15,%xmm7 movdqu %xmm6,64(%rsi) + pxor %xmm6,%xmm6 leaq 80(%rsi),%rsi movdqa %xmm7,%xmm2 + pxor %xmm7,%xmm7 jmp .Lcbc_dec_tail_collected .align 16 @@ -2743,16 +2909,23 @@ aesni_cbc_encrypt: movdqu %xmm2,(%rsi) pxor %xmm12,%xmm4 movdqu %xmm3,16(%rsi) + pxor %xmm3,%xmm3 pxor %xmm13,%xmm5 movdqu %xmm4,32(%rsi) + pxor %xmm4,%xmm4 pxor %xmm14,%xmm6 movdqu %xmm5,48(%rsi) + pxor %xmm5,%xmm5 pxor %xmm15,%xmm7 movdqu %xmm6,64(%rsi) + pxor %xmm6,%xmm6 pxor %xmm9,%xmm8 movdqu %xmm7,80(%rsi) + pxor %xmm7,%xmm7 leaq 96(%rsi),%rsi movdqa %xmm8,%xmm2 + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 jmp .Lcbc_dec_tail_collected .align 16 @@ -2796,7 +2969,7 @@ aesni_cbc_encrypt: movdqa %xmm7,%xmm2 addq $80,%rdx - jle .Lcbc_dec_tail_collected + jle .Lcbc_dec_clear_tail_collected movups %xmm7,(%rsi) leaq 16(%rsi),%rsi @@ -2831,12 +3004,17 @@ aesni_cbc_encrypt: movdqu %xmm2,(%rsi) pxor %xmm12,%xmm4 movdqu %xmm3,16(%rsi) + pxor %xmm3,%xmm3 pxor %xmm13,%xmm5 movdqu %xmm4,32(%rsi) + pxor %xmm4,%xmm4 pxor %xmm14,%xmm6 movdqu %xmm5,48(%rsi) + pxor %xmm5,%xmm5 leaq 64(%rsi),%rsi movdqa %xmm6,%xmm2 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 subq $16,%rdx jmp .Lcbc_dec_tail_collected @@ -2847,12 +3025,12 @@ aesni_cbc_encrypt: movups 16(%rcx),%xmm1 leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 -.Loop_dec1_16: +.Loop_dec1_17: .byte 102,15,56,222,209 decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_dec1_16 + jnz .Loop_dec1_17 .byte 102,15,56,223,209 xorps %xmm10,%xmm2 movaps %xmm11,%xmm10 @@ -2866,6 +3044,7 @@ 
aesni_cbc_encrypt: pxor %xmm11,%xmm3 movdqu %xmm2,(%rsi) movdqa %xmm3,%xmm2 + pxor %xmm3,%xmm3 leaq 16(%rsi),%rsi jmp .Lcbc_dec_tail_collected .align 16 @@ -2878,7 +3057,9 @@ aesni_cbc_encrypt: movdqu %xmm2,(%rsi) pxor %xmm12,%xmm4 movdqu %xmm3,16(%rsi) + pxor %xmm3,%xmm3 movdqa %xmm4,%xmm2 + pxor %xmm4,%xmm4 leaq 32(%rsi),%rsi jmp .Lcbc_dec_tail_collected .align 16 @@ -2891,29 +3072,45 @@ aesni_cbc_encrypt: movdqu %xmm2,(%rsi) pxor %xmm12,%xmm4 movdqu %xmm3,16(%rsi) + pxor %xmm3,%xmm3 pxor %xmm13,%xmm5 movdqu %xmm4,32(%rsi) + pxor %xmm4,%xmm4 movdqa %xmm5,%xmm2 + pxor %xmm5,%xmm5 leaq 48(%rsi),%rsi jmp .Lcbc_dec_tail_collected .align 16 +.Lcbc_dec_clear_tail_collected: + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 .Lcbc_dec_tail_collected: movups %xmm10,(%r8) andq $15,%rdx jnz .Lcbc_dec_tail_partial movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 jmp .Lcbc_dec_ret .align 16 .Lcbc_dec_tail_partial: movaps %xmm2,(%rsp) + pxor %xmm2,%xmm2 movq $16,%rcx movq %rsi,%rdi subq %rdx,%rcx leaq (%rsp),%rsi .long 0x9066A4F3 + movdqa %xmm2,(%rsp) .Lcbc_dec_ret: + xorps %xmm0,%xmm0 + pxor %xmm1,%xmm1 leaq (%rbp),%rsp popq %rbp .Lcbc_ret: @@ -2951,7 +3148,9 @@ aesni_set_decrypt_key: movups (%rdx),%xmm0 .byte 102,15,56,219,192 + pxor %xmm1,%xmm1 movups %xmm0,(%rdi) + pxor %xmm0,%xmm0 .Ldec_key_ret: addq $8,%rsp .byte 0xf3,0xc3 @@ -2969,8 +3168,10 @@ __aesni_set_encrypt_key: testq %rdx,%rdx jz .Lenc_key_ret + movl $268437504,%r10d movups (%rdi),%xmm0 xorps %xmm4,%xmm4 + andl OPENSSL_ia32cap_P+4(%rip),%r10d leaq 16(%rdx),%rax cmpl $256,%esi je .L14rounds @@ -2981,6 +3182,9 @@ __aesni_set_encrypt_key: .L10rounds: movl $9,%esi + cmpl $268435456,%r10d + je .L10rounds_alt + movups %xmm0,(%rdx) .byte 102,15,58,223,200,1 call .Lkey_expansion_128_cold @@ -3007,10 +3211,80 @@ __aesni_set_encrypt_key: xorl %eax,%eax jmp .Lenc_key_ret +.align 16 +.L10rounds_alt: + movdqa .Lkey_rotate(%rip),%xmm5 + movl $8,%r10d + 
movdqa .Lkey_rcon1(%rip),%xmm4 + movdqa %xmm0,%xmm2 + movdqu %xmm0,(%rdx) + jmp .Loop_key128 + +.align 16 +.Loop_key128: +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + leaq 16(%rax),%rax + + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,-16(%rax) + movdqa %xmm0,%xmm2 + + decl %r10d + jnz .Loop_key128 + + movdqa .Lkey_rcon1b(%rip),%xmm4 + +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,(%rax) + + movdqa %xmm0,%xmm2 +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,16(%rax) + + movl %esi,96(%rax) + xorl %eax,%eax + jmp .Lenc_key_ret + .align 16 .L12rounds: movq 16(%rdi),%xmm2 movl $11,%esi + cmpl $268435456,%r10d + je .L12rounds_alt + movups %xmm0,(%rdx) .byte 102,15,58,223,202,1 call .Lkey_expansion_192a_cold @@ -3033,11 +3307,55 @@ __aesni_set_encrypt_key: xorq %rax,%rax jmp .Lenc_key_ret +.align 16 +.L12rounds_alt: + movdqa .Lkey_rotate192(%rip),%xmm5 + movdqa .Lkey_rcon1(%rip),%xmm4 + movl $8,%r10d + movdqu %xmm0,(%rdx) + jmp .Loop_key192 + +.align 16 +.Loop_key192: + movq %xmm2,0(%rax) + movdqa %xmm2,%xmm1 +.byte 102,15,56,0,213 +.byte 102,15,56,221,212 + pslld $1,%xmm4 + leaq 24(%rax),%rax + + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + + pshufd $255,%xmm0,%xmm3 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + + pxor %xmm2,%xmm0 + pxor %xmm3,%xmm2 + movdqu %xmm0,-16(%rax) + + decl %r10d + jnz .Loop_key192 + + movl %esi,32(%rax) + xorl %eax,%eax + jmp .Lenc_key_ret + .align 16 
.L14rounds: movups 16(%rdi),%xmm2 movl $13,%esi leaq 16(%rax),%rax + cmpl $268435456,%r10d + je .L14rounds_alt + movups %xmm0,(%rdx) movups %xmm2,16(%rdx) .byte 102,15,58,223,202,1 @@ -3071,10 +3389,70 @@ __aesni_set_encrypt_key: xorq %rax,%rax jmp .Lenc_key_ret +.align 16 +.L14rounds_alt: + movdqa .Lkey_rotate(%rip),%xmm5 + movdqa .Lkey_rcon1(%rip),%xmm4 + movl $7,%r10d + movdqu %xmm0,0(%rdx) + movdqa %xmm2,%xmm1 + movdqu %xmm2,16(%rdx) + jmp .Loop_key256 + +.align 16 +.Loop_key256: +.byte 102,15,56,0,213 +.byte 102,15,56,221,212 + + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + pslld $1,%xmm4 + + pxor %xmm2,%xmm0 + movdqu %xmm0,(%rax) + + decl %r10d + jz .Ldone_key256 + + pshufd $255,%xmm0,%xmm2 + pxor %xmm3,%xmm3 +.byte 102,15,56,221,211 + + movdqa %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm3,%xmm1 + + pxor %xmm1,%xmm2 + movdqu %xmm2,16(%rax) + leaq 32(%rax),%rax + movdqa %xmm2,%xmm1 + + jmp .Loop_key256 + +.Ldone_key256: + movl %esi,16(%rax) + xorl %eax,%eax + jmp .Lenc_key_ret + .align 16 .Lbad_keybits: movq $-2,%rax .Lenc_key_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 addq $8,%rsp .byte 0xf3,0xc3 .LSEH_end_set_encrypt_key: @@ -3160,6 +3538,14 @@ __aesni_set_encrypt_key: .long 0x87,0,1,0 .Lincrement1: .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +.Lkey_rotate: +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d +.Lkey_rotate192: +.long 0x04070605,0x04070605,0x04070605,0x04070605 +.Lkey_rcon1: +.long 1,1,1,1 +.Lkey_rcon1b: +.long 0x1b,0x1b,0x1b,0x1b .byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 diff --git a/deps/openssl/asm/x64-elf-gas/bn/x86_64-mont5.s 
b/deps/openssl/asm/x64-elf-gas/bn/x86_64-mont5.s index 84dd72075d..db3fe399ab 100644 --- a/deps/openssl/asm/x64-elf-gas/bn/x86_64-mont5.s +++ b/deps/openssl/asm/x64-elf-gas/bn/x86_64-mont5.s @@ -2884,11 +2884,16 @@ sqrx8x_reduction: .type bn_get_bits5,@function .align 16 bn_get_bits5: - movq %rdi,%r10 + leaq 0(%rdi),%r10 + leaq 1(%rdi),%r11 movl %esi,%ecx - shrl $3,%esi - movzwl (%r10,%rsi,1),%eax - andl $7,%ecx + shrl $4,%esi + andl $15,%ecx + leal -8(%rcx),%eax + cmpl $11,%ecx + cmovaq %r11,%r10 + cmoval %eax,%ecx + movzwl (%r10,%rsi,2),%eax shrl %cl,%eax andl $31,%eax .byte 0xf3,0xc3 diff --git a/deps/openssl/asm/x64-macosx-gas/aes/aesni-x86_64.s b/deps/openssl/asm/x64-macosx-gas/aes/aesni-x86_64.s index 57509ae719..41ad80eebd 100644 --- a/deps/openssl/asm/x64-macosx-gas/aes/aesni-x86_64.s +++ b/deps/openssl/asm/x64-macosx-gas/aes/aesni-x86_64.s @@ -17,7 +17,10 @@ L$oop_enc1_1: leaq 16(%rdx),%rdx jnz L$oop_enc1_1 .byte 102,15,56,221,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 .byte 0xf3,0xc3 @@ -38,7 +41,10 @@ L$oop_dec1_2: leaq 16(%rdx),%rdx jnz L$oop_dec1_2 .byte 102,15,56,223,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 .byte 0xf3,0xc3 @@ -264,21 +270,18 @@ _aesni_encrypt6: pxor %xmm0,%xmm6 .byte 102,15,56,220,225 pxor %xmm0,%xmm7 + movups (%rcx,%rax,1),%xmm0 addq $16,%rax -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 - movups -16(%rcx,%rax,1),%xmm0 jmp L$enc_loop6_enter .p2align 4 L$enc_loop6: .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 +L$enc_loop6_enter: .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 -L$enc_loop6_enter: movups (%rcx,%rax,1),%xmm1 addq $32,%rax .byte 102,15,56,220,208 @@ -321,21 +324,18 @@ _aesni_decrypt6: pxor %xmm0,%xmm6 .byte 102,15,56,222,225 pxor %xmm0,%xmm7 + movups (%rcx,%rax,1),%xmm0 addq $16,%rax -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 - movups 
-16(%rcx,%rax,1),%xmm0 jmp L$dec_loop6_enter .p2align 4 L$dec_loop6: .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 +L$dec_loop6_enter: .byte 102,15,56,222,233 .byte 102,15,56,222,241 .byte 102,15,56,222,249 -L$dec_loop6_enter: movups (%rcx,%rax,1),%xmm1 addq $32,%rax .byte 102,15,56,222,208 @@ -375,23 +375,18 @@ _aesni_encrypt8: leaq 32(%rcx,%rax,1),%rcx negq %rax .byte 102,15,56,220,209 - addq $16,%rax pxor %xmm0,%xmm7 -.byte 102,15,56,220,217 pxor %xmm0,%xmm8 +.byte 102,15,56,220,217 pxor %xmm0,%xmm9 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 -.byte 102,68,15,56,220,193 -.byte 102,68,15,56,220,201 - movups -16(%rcx,%rax,1),%xmm0 - jmp L$enc_loop8_enter + movups (%rcx,%rax,1),%xmm0 + addq $16,%rax + jmp L$enc_loop8_inner .p2align 4 L$enc_loop8: .byte 102,15,56,220,209 .byte 102,15,56,220,217 +L$enc_loop8_inner: .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 @@ -444,23 +439,18 @@ _aesni_decrypt8: leaq 32(%rcx,%rax,1),%rcx negq %rax .byte 102,15,56,222,209 - addq $16,%rax pxor %xmm0,%xmm7 -.byte 102,15,56,222,217 pxor %xmm0,%xmm8 +.byte 102,15,56,222,217 pxor %xmm0,%xmm9 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 -.byte 102,68,15,56,222,193 -.byte 102,68,15,56,222,201 - movups -16(%rcx,%rax,1),%xmm0 - jmp L$dec_loop8_enter + movups (%rcx,%rax,1),%xmm0 + addq $16,%rax + jmp L$dec_loop8_inner .p2align 4 L$dec_loop8: .byte 102,15,56,222,209 .byte 102,15,56,222,217 +L$dec_loop8_inner: .byte 102,15,56,222,225 .byte 102,15,56,222,233 .byte 102,15,56,222,241 @@ -587,6 +577,7 @@ L$ecb_enc_tail: movups 80(%rdi),%xmm7 je L$ecb_enc_six movdqu 96(%rdi),%xmm8 + xorps %xmm9,%xmm9 call _aesni_encrypt8 movups %xmm2,(%rsi) movups %xmm3,16(%rsi) @@ -700,15 +691,23 @@ L$ecb_dec_loop8_enter: jnc L$ecb_dec_loop8 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 movq %r11,%rcx movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 movl %r10d,%eax 
movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 movups %xmm5,48(%rsi) + pxor %xmm5,%xmm5 movups %xmm6,64(%rsi) + pxor %xmm6,%xmm6 movups %xmm7,80(%rsi) + pxor %xmm7,%xmm7 movups %xmm8,96(%rsi) + pxor %xmm8,%xmm8 movups %xmm9,112(%rsi) + pxor %xmm9,%xmm9 leaq 128(%rsi),%rsi addq $128,%rdx jz L$ecb_ret @@ -731,14 +730,23 @@ L$ecb_dec_tail: je L$ecb_dec_six movups 96(%rdi),%xmm8 movups (%rcx),%xmm0 + xorps %xmm9,%xmm9 call _aesni_decrypt8 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 movups %xmm5,48(%rsi) + pxor %xmm5,%xmm5 movups %xmm6,64(%rsi) + pxor %xmm6,%xmm6 movups %xmm7,80(%rsi) + pxor %xmm7,%xmm7 movups %xmm8,96(%rsi) + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 jmp L$ecb_ret .p2align 4 L$ecb_dec_one: @@ -754,49 +762,73 @@ L$oop_dec1_4: jnz L$oop_dec1_4 .byte 102,15,56,223,209 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 jmp L$ecb_ret .p2align 4 L$ecb_dec_two: call _aesni_decrypt2 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 jmp L$ecb_ret .p2align 4 L$ecb_dec_three: call _aesni_decrypt3 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 jmp L$ecb_ret .p2align 4 L$ecb_dec_four: call _aesni_decrypt4 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 movups %xmm5,48(%rsi) + pxor %xmm5,%xmm5 jmp L$ecb_ret .p2align 4 L$ecb_dec_five: xorps %xmm7,%xmm7 call _aesni_decrypt6 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 movups %xmm5,48(%rsi) + pxor %xmm5,%xmm5 movups %xmm6,64(%rsi) + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 jmp L$ecb_ret .p2align 4 L$ecb_dec_six: call _aesni_decrypt6 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 movups %xmm5,48(%rsi) + pxor %xmm5,%xmm5 movups %xmm6,64(%rsi) + pxor 
%xmm6,%xmm6 movups %xmm7,80(%rsi) + pxor %xmm7,%xmm7 L$ecb_ret: + xorps %xmm0,%xmm0 + pxor %xmm1,%xmm1 .byte 0xf3,0xc3 .globl _aesni_ccm64_encrypt_blocks @@ -853,7 +885,13 @@ L$ccm64_enc2_loop: leaq 16(%rsi),%rsi jnz L$ccm64_enc_outer + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 movups %xmm3,(%r9) + pxor %xmm3,%xmm3 + pxor %xmm8,%xmm8 + pxor %xmm6,%xmm6 .byte 0xf3,0xc3 .globl _aesni_ccm64_decrypt_blocks @@ -944,21 +982,56 @@ L$oop_enc1_6: leaq 16(%r11),%r11 jnz L$oop_enc1_6 .byte 102,15,56,221,217 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 movups %xmm3,(%r9) + pxor %xmm3,%xmm3 + pxor %xmm8,%xmm8 + pxor %xmm6,%xmm6 .byte 0xf3,0xc3 .globl _aesni_ctr32_encrypt_blocks .p2align 4 _aesni_ctr32_encrypt_blocks: + cmpq $1,%rdx + jne L$ctr32_bulk + + + + movups (%r8),%xmm2 + movups (%rdi),%xmm3 + movl 240(%rcx),%edx + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + xorps %xmm0,%xmm2 +L$oop_enc1_7: +.byte 102,15,56,220,209 + decl %edx + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz L$oop_enc1_7 +.byte 102,15,56,221,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + xorps %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movups %xmm2,(%rsi) + xorps %xmm2,%xmm2 + jmp L$ctr32_epilogue + +.p2align 4 +L$ctr32_bulk: leaq (%rsp),%rax pushq %rbp subq $128,%rsp andq $-16,%rsp leaq -8(%rax),%rbp - cmpq $1,%rdx - je L$ctr32_one_shortcut + + movdqu (%r8),%xmm2 movdqu (%rcx),%xmm0 @@ -1349,11 +1422,14 @@ L$ctr32_enc_done: leaq -128(%rcx),%rcx L$ctr32_tail: + + leaq 16(%rcx),%rcx cmpq $4,%rdx jb L$ctr32_loop3 je L$ctr32_loop4 + shll $4,%eax movdqa 96(%rsp),%xmm8 pxor %xmm9,%xmm9 @@ -1456,30 +1532,33 @@ L$ctr32_loop3: movups 32(%rdi),%xmm12 xorps %xmm12,%xmm4 movups %xmm4,32(%rsi) - jmp L$ctr32_done -.p2align 4 -L$ctr32_one_shortcut: - movups (%r8),%xmm2 - movups (%rdi),%xmm10 - movl 240(%rcx),%eax - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -L$oop_enc1_7: -.byte 102,15,56,220,209 - decl %eax - movups (%rcx),%xmm1 - leaq 
16(%rcx),%rcx - jnz L$oop_enc1_7 -.byte 102,15,56,221,209 - xorps %xmm10,%xmm2 - movups %xmm2,(%rsi) - jmp L$ctr32_done - -.p2align 4 L$ctr32_done: + xorps %xmm0,%xmm0 + xorl %r11d,%r11d + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + movaps %xmm0,0(%rsp) + pxor %xmm8,%xmm8 + movaps %xmm0,16(%rsp) + pxor %xmm9,%xmm9 + movaps %xmm0,32(%rsp) + pxor %xmm10,%xmm10 + movaps %xmm0,48(%rsp) + pxor %xmm11,%xmm11 + movaps %xmm0,64(%rsp) + pxor %xmm12,%xmm12 + movaps %xmm0,80(%rsp) + pxor %xmm13,%xmm13 + movaps %xmm0,96(%rsp) + pxor %xmm14,%xmm14 + movaps %xmm0,112(%rsp) + pxor %xmm15,%xmm15 leaq (%rbp),%rsp popq %rbp L$ctr32_epilogue: @@ -1750,6 +1829,7 @@ L$xts_enc_loop6: shrl $4,%eax L$xts_enc_short: + movl %eax,%r10d pxor %xmm0,%xmm10 addq $96,%rdx @@ -1778,6 +1858,7 @@ L$xts_enc_short: pxor %xmm12,%xmm4 pxor %xmm13,%xmm5 pxor %xmm14,%xmm6 + pxor %xmm7,%xmm7 call _aesni_encrypt6 @@ -1920,6 +2001,29 @@ L$oop_enc1_10: movups %xmm2,-16(%rsi) L$xts_enc_ret: + xorps %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + movaps %xmm0,0(%rsp) + pxor %xmm8,%xmm8 + movaps %xmm0,16(%rsp) + pxor %xmm9,%xmm9 + movaps %xmm0,32(%rsp) + pxor %xmm10,%xmm10 + movaps %xmm0,48(%rsp) + pxor %xmm11,%xmm11 + movaps %xmm0,64(%rsp) + pxor %xmm12,%xmm12 + movaps %xmm0,80(%rsp) + pxor %xmm13,%xmm13 + movaps %xmm0,96(%rsp) + pxor %xmm14,%xmm14 + pxor %xmm15,%xmm15 leaq (%rbp),%rsp popq %rbp L$xts_enc_epilogue: @@ -2196,6 +2300,7 @@ L$xts_dec_loop6: shrl $4,%eax L$xts_dec_short: + movl %eax,%r10d pxor %xmm0,%xmm10 pxor %xmm0,%xmm11 @@ -2398,6 +2503,29 @@ L$oop_dec1_14: movups %xmm2,(%rsi) L$xts_dec_ret: + xorps %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + movaps %xmm0,0(%rsp) + pxor %xmm8,%xmm8 + movaps %xmm0,16(%rsp) + 
pxor %xmm9,%xmm9 + movaps %xmm0,32(%rsp) + pxor %xmm10,%xmm10 + movaps %xmm0,48(%rsp) + pxor %xmm11,%xmm11 + movaps %xmm0,64(%rsp) + pxor %xmm12,%xmm12 + movaps %xmm0,80(%rsp) + pxor %xmm13,%xmm13 + movaps %xmm0,96(%rsp) + pxor %xmm14,%xmm14 + pxor %xmm15,%xmm15 leaq (%rbp),%rsp popq %rbp L$xts_dec_epilogue: @@ -2446,7 +2574,11 @@ L$oop_enc1_15: jnc L$cbc_enc_loop addq $16,%rdx jnz L$cbc_enc_tail + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 movups %xmm2,(%r8) + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 jmp L$cbc_ret L$cbc_enc_tail: @@ -2466,6 +2598,35 @@ L$cbc_enc_tail: .p2align 4 L$cbc_decrypt: + cmpq $16,%rdx + jne L$cbc_decrypt_bulk + + + + movdqu (%rdi),%xmm2 + movdqu (%r8),%xmm3 + movdqa %xmm2,%xmm4 + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + xorps %xmm0,%xmm2 +L$oop_dec1_16: +.byte 102,15,56,222,209 + decl %r10d + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz L$oop_dec1_16 +.byte 102,15,56,223,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movdqu %xmm4,(%r8) + xorps %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + jmp L$cbc_ret +.p2align 4 +L$cbc_decrypt_bulk: leaq (%rsp),%rax pushq %rbp subq $16,%rsp @@ -2702,7 +2863,7 @@ L$cbc_dec_done: movaps %xmm9,%xmm2 leaq -112(%rcx),%rcx addq $112,%rdx - jle L$cbc_dec_tail_collected + jle L$cbc_dec_clear_tail_collected movups %xmm9,(%rsi) leaq 16(%rsi),%rsi cmpq $80,%rdx @@ -2721,14 +2882,19 @@ L$cbc_dec_six_or_seven: movdqu %xmm2,(%rsi) pxor %xmm12,%xmm4 movdqu %xmm3,16(%rsi) + pxor %xmm3,%xmm3 pxor %xmm13,%xmm5 movdqu %xmm4,32(%rsi) + pxor %xmm4,%xmm4 pxor %xmm14,%xmm6 movdqu %xmm5,48(%rsi) + pxor %xmm5,%xmm5 pxor %xmm15,%xmm7 movdqu %xmm6,64(%rsi) + pxor %xmm6,%xmm6 leaq 80(%rsi),%rsi movdqa %xmm7,%xmm2 + pxor %xmm7,%xmm7 jmp L$cbc_dec_tail_collected .p2align 4 @@ -2743,16 +2909,23 @@ L$cbc_dec_seven: movdqu %xmm2,(%rsi) pxor %xmm12,%xmm4 movdqu %xmm3,16(%rsi) + pxor %xmm3,%xmm3 pxor %xmm13,%xmm5 movdqu %xmm4,32(%rsi) + pxor %xmm4,%xmm4 pxor %xmm14,%xmm6 movdqu %xmm5,48(%rsi) + 
pxor %xmm5,%xmm5 pxor %xmm15,%xmm7 movdqu %xmm6,64(%rsi) + pxor %xmm6,%xmm6 pxor %xmm9,%xmm8 movdqu %xmm7,80(%rsi) + pxor %xmm7,%xmm7 leaq 96(%rsi),%rsi movdqa %xmm8,%xmm2 + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 jmp L$cbc_dec_tail_collected .p2align 4 @@ -2796,7 +2969,7 @@ L$cbc_dec_loop6_enter: movdqa %xmm7,%xmm2 addq $80,%rdx - jle L$cbc_dec_tail_collected + jle L$cbc_dec_clear_tail_collected movups %xmm7,(%rsi) leaq 16(%rsi),%rsi @@ -2831,12 +3004,17 @@ L$cbc_dec_tail: movdqu %xmm2,(%rsi) pxor %xmm12,%xmm4 movdqu %xmm3,16(%rsi) + pxor %xmm3,%xmm3 pxor %xmm13,%xmm5 movdqu %xmm4,32(%rsi) + pxor %xmm4,%xmm4 pxor %xmm14,%xmm6 movdqu %xmm5,48(%rsi) + pxor %xmm5,%xmm5 leaq 64(%rsi),%rsi movdqa %xmm6,%xmm2 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 subq $16,%rdx jmp L$cbc_dec_tail_collected @@ -2847,12 +3025,12 @@ L$cbc_dec_one: movups 16(%rcx),%xmm1 leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 -L$oop_dec1_16: +L$oop_dec1_17: .byte 102,15,56,222,209 decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz L$oop_dec1_16 + jnz L$oop_dec1_17 .byte 102,15,56,223,209 xorps %xmm10,%xmm2 movaps %xmm11,%xmm10 @@ -2866,6 +3044,7 @@ L$cbc_dec_two: pxor %xmm11,%xmm3 movdqu %xmm2,(%rsi) movdqa %xmm3,%xmm2 + pxor %xmm3,%xmm3 leaq 16(%rsi),%rsi jmp L$cbc_dec_tail_collected .p2align 4 @@ -2878,7 +3057,9 @@ L$cbc_dec_three: movdqu %xmm2,(%rsi) pxor %xmm12,%xmm4 movdqu %xmm3,16(%rsi) + pxor %xmm3,%xmm3 movdqa %xmm4,%xmm2 + pxor %xmm4,%xmm4 leaq 32(%rsi),%rsi jmp L$cbc_dec_tail_collected .p2align 4 @@ -2891,29 +3072,45 @@ L$cbc_dec_four: movdqu %xmm2,(%rsi) pxor %xmm12,%xmm4 movdqu %xmm3,16(%rsi) + pxor %xmm3,%xmm3 pxor %xmm13,%xmm5 movdqu %xmm4,32(%rsi) + pxor %xmm4,%xmm4 movdqa %xmm5,%xmm2 + pxor %xmm5,%xmm5 leaq 48(%rsi),%rsi jmp L$cbc_dec_tail_collected .p2align 4 +L$cbc_dec_clear_tail_collected: + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 L$cbc_dec_tail_collected: movups %xmm10,(%r8) andq $15,%rdx jnz 
L$cbc_dec_tail_partial movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 jmp L$cbc_dec_ret .p2align 4 L$cbc_dec_tail_partial: movaps %xmm2,(%rsp) + pxor %xmm2,%xmm2 movq $16,%rcx movq %rsi,%rdi subq %rdx,%rcx leaq (%rsp),%rsi .long 0x9066A4F3 + movdqa %xmm2,(%rsp) L$cbc_dec_ret: + xorps %xmm0,%xmm0 + pxor %xmm1,%xmm1 leaq (%rbp),%rsp popq %rbp L$cbc_ret: @@ -2951,7 +3148,9 @@ L$dec_key_inverse: movups (%rdx),%xmm0 .byte 102,15,56,219,192 + pxor %xmm1,%xmm1 movups %xmm0,(%rdi) + pxor %xmm0,%xmm0 L$dec_key_ret: addq $8,%rsp .byte 0xf3,0xc3 @@ -2969,8 +3168,10 @@ __aesni_set_encrypt_key: testq %rdx,%rdx jz L$enc_key_ret + movl $268437504,%r10d movups (%rdi),%xmm0 xorps %xmm4,%xmm4 + andl _OPENSSL_ia32cap_P+4(%rip),%r10d leaq 16(%rdx),%rax cmpl $256,%esi je L$14rounds @@ -2981,6 +3182,9 @@ __aesni_set_encrypt_key: L$10rounds: movl $9,%esi + cmpl $268435456,%r10d + je L$10rounds_alt + movups %xmm0,(%rdx) .byte 102,15,58,223,200,1 call L$key_expansion_128_cold @@ -3007,10 +3211,80 @@ L$10rounds: xorl %eax,%eax jmp L$enc_key_ret +.p2align 4 +L$10rounds_alt: + movdqa L$key_rotate(%rip),%xmm5 + movl $8,%r10d + movdqa L$key_rcon1(%rip),%xmm4 + movdqa %xmm0,%xmm2 + movdqu %xmm0,(%rdx) + jmp L$oop_key128 + +.p2align 4 +L$oop_key128: +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + leaq 16(%rax),%rax + + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,-16(%rax) + movdqa %xmm0,%xmm2 + + decl %r10d + jnz L$oop_key128 + + movdqa L$key_rcon1b(%rip),%xmm4 + +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,(%rax) + + movdqa %xmm0,%xmm2 +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 
+ pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,16(%rax) + + movl %esi,96(%rax) + xorl %eax,%eax + jmp L$enc_key_ret + .p2align 4 L$12rounds: movq 16(%rdi),%xmm2 movl $11,%esi + cmpl $268435456,%r10d + je L$12rounds_alt + movups %xmm0,(%rdx) .byte 102,15,58,223,202,1 call L$key_expansion_192a_cold @@ -3033,11 +3307,55 @@ L$12rounds: xorq %rax,%rax jmp L$enc_key_ret +.p2align 4 +L$12rounds_alt: + movdqa L$key_rotate192(%rip),%xmm5 + movdqa L$key_rcon1(%rip),%xmm4 + movl $8,%r10d + movdqu %xmm0,(%rdx) + jmp L$oop_key192 + +.p2align 4 +L$oop_key192: + movq %xmm2,0(%rax) + movdqa %xmm2,%xmm1 +.byte 102,15,56,0,213 +.byte 102,15,56,221,212 + pslld $1,%xmm4 + leaq 24(%rax),%rax + + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + + pshufd $255,%xmm0,%xmm3 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + + pxor %xmm2,%xmm0 + pxor %xmm3,%xmm2 + movdqu %xmm0,-16(%rax) + + decl %r10d + jnz L$oop_key192 + + movl %esi,32(%rax) + xorl %eax,%eax + jmp L$enc_key_ret + .p2align 4 L$14rounds: movups 16(%rdi),%xmm2 movl $13,%esi leaq 16(%rax),%rax + cmpl $268435456,%r10d + je L$14rounds_alt + movups %xmm0,(%rdx) movups %xmm2,16(%rdx) .byte 102,15,58,223,202,1 @@ -3071,10 +3389,70 @@ L$14rounds: xorq %rax,%rax jmp L$enc_key_ret +.p2align 4 +L$14rounds_alt: + movdqa L$key_rotate(%rip),%xmm5 + movdqa L$key_rcon1(%rip),%xmm4 + movl $7,%r10d + movdqu %xmm0,0(%rdx) + movdqa %xmm2,%xmm1 + movdqu %xmm2,16(%rdx) + jmp L$oop_key256 + +.p2align 4 +L$oop_key256: +.byte 102,15,56,0,213 +.byte 102,15,56,221,212 + + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + pslld $1,%xmm4 + + pxor %xmm2,%xmm0 + movdqu %xmm0,(%rax) + + decl %r10d + jz L$done_key256 + + pshufd $255,%xmm0,%xmm2 + pxor %xmm3,%xmm3 +.byte 102,15,56,221,211 + + movdqa %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 
+ pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm3,%xmm1 + + pxor %xmm1,%xmm2 + movdqu %xmm2,16(%rax) + leaq 32(%rax),%rax + movdqa %xmm2,%xmm1 + + jmp L$oop_key256 + +L$done_key256: + movl %esi,16(%rax) + xorl %eax,%eax + jmp L$enc_key_ret + .p2align 4 L$bad_keybits: movq $-2,%rax L$enc_key_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 addq $8,%rsp .byte 0xf3,0xc3 L$SEH_end_set_encrypt_key: @@ -3160,6 +3538,14 @@ L$xts_magic: .long 0x87,0,1,0 L$increment1: .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +L$key_rotate: +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d +L$key_rotate192: +.long 0x04070605,0x04070605,0x04070605,0x04070605 +L$key_rcon1: +.long 1,1,1,1 +L$key_rcon1b: +.long 0x1b,0x1b,0x1b,0x1b .byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .p2align 6 diff --git a/deps/openssl/asm/x64-macosx-gas/bn/x86_64-mont5.s b/deps/openssl/asm/x64-macosx-gas/bn/x86_64-mont5.s index 65cf9993d8..5470fb0336 100644 --- a/deps/openssl/asm/x64-macosx-gas/bn/x86_64-mont5.s +++ b/deps/openssl/asm/x64-macosx-gas/bn/x86_64-mont5.s @@ -2884,11 +2884,16 @@ L$sqrx4x_sub: .p2align 4 _bn_get_bits5: - movq %rdi,%r10 + leaq 0(%rdi),%r10 + leaq 1(%rdi),%r11 movl %esi,%ecx - shrl $3,%esi - movzwl (%r10,%rsi,1),%eax - andl $7,%ecx + shrl $4,%esi + andl $15,%ecx + leal -8(%rcx),%eax + cmpl $11,%ecx + cmovaq %r11,%r10 + cmoval %eax,%ecx + movzwl (%r10,%rsi,2),%eax shrl %cl,%eax andl $31,%eax .byte 0xf3,0xc3 diff --git a/deps/openssl/asm/x64-win32-masm/aes/aesni-x86_64.asm b/deps/openssl/asm/x64-win32-masm/aes/aesni-x86_64.asm index 53d8afc950..5e848125d6 100644 --- a/deps/openssl/asm/x64-win32-masm/aes/aesni-x86_64.asm +++ b/deps/openssl/asm/x64-win32-masm/aes/aesni-x86_64.asm @@ -18,7 +18,10 @@ DB 102,15,56,220,209 lea r8,QWORD PTR[16+r8] jnz $L$oop_enc1_1 DB 102,15,56,221,209 + 
pxor xmm0,xmm0 + pxor xmm1,xmm1 movups XMMWORD PTR[rdx],xmm2 + pxor xmm2,xmm2 DB 0F3h,0C3h ;repret aesni_encrypt ENDP @@ -39,7 +42,10 @@ DB 102,15,56,222,209 lea r8,QWORD PTR[16+r8] jnz $L$oop_dec1_2 DB 102,15,56,223,209 + pxor xmm0,xmm0 + pxor xmm1,xmm1 movups XMMWORD PTR[rdx],xmm2 + pxor xmm2,xmm2 DB 0F3h,0C3h ;repret aesni_decrypt ENDP @@ -265,21 +271,18 @@ DB 102,15,56,220,217 pxor xmm6,xmm0 DB 102,15,56,220,225 pxor xmm7,xmm0 + movups xmm0,XMMWORD PTR[rax*1+rcx] add rax,16 -DB 102,15,56,220,233 -DB 102,15,56,220,241 -DB 102,15,56,220,249 - movups xmm0,XMMWORD PTR[((-16))+rax*1+rcx] jmp $L$enc_loop6_enter ALIGN 16 $L$enc_loop6:: DB 102,15,56,220,209 DB 102,15,56,220,217 DB 102,15,56,220,225 +$L$enc_loop6_enter:: DB 102,15,56,220,233 DB 102,15,56,220,241 DB 102,15,56,220,249 -$L$enc_loop6_enter:: movups xmm1,XMMWORD PTR[rax*1+rcx] add rax,32 DB 102,15,56,220,208 @@ -322,21 +325,18 @@ DB 102,15,56,222,217 pxor xmm6,xmm0 DB 102,15,56,222,225 pxor xmm7,xmm0 + movups xmm0,XMMWORD PTR[rax*1+rcx] add rax,16 -DB 102,15,56,222,233 -DB 102,15,56,222,241 -DB 102,15,56,222,249 - movups xmm0,XMMWORD PTR[((-16))+rax*1+rcx] jmp $L$dec_loop6_enter ALIGN 16 $L$dec_loop6:: DB 102,15,56,222,209 DB 102,15,56,222,217 DB 102,15,56,222,225 +$L$dec_loop6_enter:: DB 102,15,56,222,233 DB 102,15,56,222,241 DB 102,15,56,222,249 -$L$dec_loop6_enter:: movups xmm1,XMMWORD PTR[rax*1+rcx] add rax,32 DB 102,15,56,222,208 @@ -376,23 +376,18 @@ _aesni_encrypt8 PROC PRIVATE lea rcx,QWORD PTR[32+rax*1+rcx] neg rax DB 102,15,56,220,209 - add rax,16 pxor xmm7,xmm0 -DB 102,15,56,220,217 pxor xmm8,xmm0 +DB 102,15,56,220,217 pxor xmm9,xmm0 -DB 102,15,56,220,225 -DB 102,15,56,220,233 -DB 102,15,56,220,241 -DB 102,15,56,220,249 -DB 102,68,15,56,220,193 -DB 102,68,15,56,220,201 - movups xmm0,XMMWORD PTR[((-16))+rax*1+rcx] - jmp $L$enc_loop8_enter + movups xmm0,XMMWORD PTR[rax*1+rcx] + add rax,16 + jmp $L$enc_loop8_inner ALIGN 16 $L$enc_loop8:: DB 102,15,56,220,209 DB 102,15,56,220,217 +$L$enc_loop8_inner:: 
DB 102,15,56,220,225 DB 102,15,56,220,233 DB 102,15,56,220,241 @@ -445,23 +440,18 @@ _aesni_decrypt8 PROC PRIVATE lea rcx,QWORD PTR[32+rax*1+rcx] neg rax DB 102,15,56,222,209 - add rax,16 pxor xmm7,xmm0 -DB 102,15,56,222,217 pxor xmm8,xmm0 +DB 102,15,56,222,217 pxor xmm9,xmm0 -DB 102,15,56,222,225 -DB 102,15,56,222,233 -DB 102,15,56,222,241 -DB 102,15,56,222,249 -DB 102,68,15,56,222,193 -DB 102,68,15,56,222,201 - movups xmm0,XMMWORD PTR[((-16))+rax*1+rcx] - jmp $L$dec_loop8_enter + movups xmm0,XMMWORD PTR[rax*1+rcx] + add rax,16 + jmp $L$dec_loop8_inner ALIGN 16 $L$dec_loop8:: DB 102,15,56,222,209 DB 102,15,56,222,217 +$L$dec_loop8_inner:: DB 102,15,56,222,225 DB 102,15,56,222,233 DB 102,15,56,222,241 @@ -605,6 +595,7 @@ $L$ecb_enc_tail:: movups xmm7,XMMWORD PTR[80+rdi] je $L$ecb_enc_six movdqu xmm8,XMMWORD PTR[96+rdi] + xorps xmm9,xmm9 call _aesni_encrypt8 movups XMMWORD PTR[rsi],xmm2 movups XMMWORD PTR[16+rsi],xmm3 @@ -718,15 +709,23 @@ $L$ecb_dec_loop8_enter:: jnc $L$ecb_dec_loop8 movups XMMWORD PTR[rsi],xmm2 + pxor xmm2,xmm2 mov rcx,r11 movups XMMWORD PTR[16+rsi],xmm3 + pxor xmm3,xmm3 mov eax,r10d movups XMMWORD PTR[32+rsi],xmm4 + pxor xmm4,xmm4 movups XMMWORD PTR[48+rsi],xmm5 + pxor xmm5,xmm5 movups XMMWORD PTR[64+rsi],xmm6 + pxor xmm6,xmm6 movups XMMWORD PTR[80+rsi],xmm7 + pxor xmm7,xmm7 movups XMMWORD PTR[96+rsi],xmm8 + pxor xmm8,xmm8 movups XMMWORD PTR[112+rsi],xmm9 + pxor xmm9,xmm9 lea rsi,QWORD PTR[128+rsi] add rdx,080h jz $L$ecb_ret @@ -749,14 +748,23 @@ $L$ecb_dec_tail:: je $L$ecb_dec_six movups xmm8,XMMWORD PTR[96+rdi] movups xmm0,XMMWORD PTR[rcx] + xorps xmm9,xmm9 call _aesni_decrypt8 movups XMMWORD PTR[rsi],xmm2 + pxor xmm2,xmm2 movups XMMWORD PTR[16+rsi],xmm3 + pxor xmm3,xmm3 movups XMMWORD PTR[32+rsi],xmm4 + pxor xmm4,xmm4 movups XMMWORD PTR[48+rsi],xmm5 + pxor xmm5,xmm5 movups XMMWORD PTR[64+rsi],xmm6 + pxor xmm6,xmm6 movups XMMWORD PTR[80+rsi],xmm7 + pxor xmm7,xmm7 movups XMMWORD PTR[96+rsi],xmm8 + pxor xmm8,xmm8 + pxor xmm9,xmm9 jmp $L$ecb_ret 
ALIGN 16 $L$ecb_dec_one:: @@ -772,53 +780,81 @@ DB 102,15,56,222,209 jnz $L$oop_dec1_4 DB 102,15,56,223,209 movups XMMWORD PTR[rsi],xmm2 + pxor xmm2,xmm2 jmp $L$ecb_ret ALIGN 16 $L$ecb_dec_two:: call _aesni_decrypt2 movups XMMWORD PTR[rsi],xmm2 + pxor xmm2,xmm2 movups XMMWORD PTR[16+rsi],xmm3 + pxor xmm3,xmm3 jmp $L$ecb_ret ALIGN 16 $L$ecb_dec_three:: call _aesni_decrypt3 movups XMMWORD PTR[rsi],xmm2 + pxor xmm2,xmm2 movups XMMWORD PTR[16+rsi],xmm3 + pxor xmm3,xmm3 movups XMMWORD PTR[32+rsi],xmm4 + pxor xmm4,xmm4 jmp $L$ecb_ret ALIGN 16 $L$ecb_dec_four:: call _aesni_decrypt4 movups XMMWORD PTR[rsi],xmm2 + pxor xmm2,xmm2 movups XMMWORD PTR[16+rsi],xmm3 + pxor xmm3,xmm3 movups XMMWORD PTR[32+rsi],xmm4 + pxor xmm4,xmm4 movups XMMWORD PTR[48+rsi],xmm5 + pxor xmm5,xmm5 jmp $L$ecb_ret ALIGN 16 $L$ecb_dec_five:: xorps xmm7,xmm7 call _aesni_decrypt6 movups XMMWORD PTR[rsi],xmm2 + pxor xmm2,xmm2 movups XMMWORD PTR[16+rsi],xmm3 + pxor xmm3,xmm3 movups XMMWORD PTR[32+rsi],xmm4 + pxor xmm4,xmm4 movups XMMWORD PTR[48+rsi],xmm5 + pxor xmm5,xmm5 movups XMMWORD PTR[64+rsi],xmm6 + pxor xmm6,xmm6 + pxor xmm7,xmm7 jmp $L$ecb_ret ALIGN 16 $L$ecb_dec_six:: call _aesni_decrypt6 movups XMMWORD PTR[rsi],xmm2 + pxor xmm2,xmm2 movups XMMWORD PTR[16+rsi],xmm3 + pxor xmm3,xmm3 movups XMMWORD PTR[32+rsi],xmm4 + pxor xmm4,xmm4 movups XMMWORD PTR[48+rsi],xmm5 + pxor xmm5,xmm5 movups XMMWORD PTR[64+rsi],xmm6 + pxor xmm6,xmm6 movups XMMWORD PTR[80+rsi],xmm7 + pxor xmm7,xmm7 $L$ecb_ret:: + xorps xmm0,xmm0 + pxor xmm1,xmm1 movaps xmm6,XMMWORD PTR[rsp] + movaps XMMWORD PTR[rsp],xmm0 movaps xmm7,XMMWORD PTR[16+rsp] + movaps XMMWORD PTR[16+rsp],xmm0 movaps xmm8,XMMWORD PTR[32+rsp] + movaps XMMWORD PTR[32+rsp],xmm0 movaps xmm9,XMMWORD PTR[48+rsp] + movaps XMMWORD PTR[48+rsp],xmm0 lea rsp,QWORD PTR[88+rsp] $L$ecb_enc_ret:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue @@ -898,11 +934,21 @@ DB 102,15,56,0,215 lea rsi,QWORD PTR[16+rsi] jnz $L$ccm64_enc_outer + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 
movups XMMWORD PTR[r9],xmm3 + pxor xmm3,xmm3 + pxor xmm8,xmm8 + pxor xmm6,xmm6 movaps xmm6,XMMWORD PTR[rsp] + movaps XMMWORD PTR[rsp],xmm0 movaps xmm7,XMMWORD PTR[16+rsp] + movaps XMMWORD PTR[16+rsp],xmm0 movaps xmm8,XMMWORD PTR[32+rsp] + movaps XMMWORD PTR[32+rsp],xmm0 movaps xmm9,XMMWORD PTR[48+rsp] + movaps XMMWORD PTR[48+rsp],xmm0 lea rsp,QWORD PTR[88+rsp] $L$ccm64_enc_ret:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue @@ -1016,11 +1062,21 @@ DB 102,15,56,220,217 lea r11,QWORD PTR[16+r11] jnz $L$oop_enc1_6 DB 102,15,56,221,217 + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 movups XMMWORD PTR[r9],xmm3 + pxor xmm3,xmm3 + pxor xmm8,xmm8 + pxor xmm6,xmm6 movaps xmm6,XMMWORD PTR[rsp] + movaps XMMWORD PTR[rsp],xmm0 movaps xmm7,XMMWORD PTR[16+rsp] + movaps XMMWORD PTR[16+rsp],xmm0 movaps xmm8,XMMWORD PTR[32+rsp] + movaps XMMWORD PTR[32+rsp],xmm0 movaps xmm9,XMMWORD PTR[48+rsp] + movaps XMMWORD PTR[48+rsp],xmm0 lea rsp,QWORD PTR[88+rsp] $L$ccm64_dec_ret:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue @@ -1043,6 +1099,35 @@ $L$SEH_begin_aesni_ctr32_encrypt_blocks:: mov r8,QWORD PTR[40+rsp] + cmp rdx,1 + jne $L$ctr32_bulk + + + + movups xmm2,XMMWORD PTR[r8] + movups xmm3,XMMWORD PTR[rdi] + mov edx,DWORD PTR[240+rcx] + movups xmm0,XMMWORD PTR[rcx] + movups xmm1,XMMWORD PTR[16+rcx] + lea rcx,QWORD PTR[32+rcx] + xorps xmm2,xmm0 +$L$oop_enc1_7:: +DB 102,15,56,220,209 + dec edx + movups xmm1,XMMWORD PTR[rcx] + lea rcx,QWORD PTR[16+rcx] + jnz $L$oop_enc1_7 +DB 102,15,56,221,209 + pxor xmm0,xmm0 + pxor xmm1,xmm1 + xorps xmm2,xmm3 + pxor xmm3,xmm3 + movups XMMWORD PTR[rsi],xmm2 + xorps xmm2,xmm2 + jmp $L$ctr32_epilogue + +ALIGN 16 +$L$ctr32_bulk:: lea rax,QWORD PTR[rsp] push rbp sub rsp,288 @@ -1060,8 +1145,8 @@ $L$SEH_begin_aesni_ctr32_encrypt_blocks:: $L$ctr32_body:: lea rbp,QWORD PTR[((-8))+rax] - cmp rdx,1 - je $L$ctr32_one_shortcut + + movdqu xmm2,XMMWORD PTR[r8] movdqu xmm0,XMMWORD PTR[rcx] @@ -1452,11 +1537,14 @@ DB 102,69,15,56,221,202 lea rcx,QWORD PTR[((-128))+rcx] 
$L$ctr32_tail:: + + lea rcx,QWORD PTR[16+rcx] cmp rdx,4 jb $L$ctr32_loop3 je $L$ctr32_loop4 + shl eax,4 movdqa xmm8,XMMWORD PTR[96+rsp] pxor xmm9,xmm9 @@ -1559,40 +1647,43 @@ DB 102,15,56,221,225 movups xmm12,XMMWORD PTR[32+rdi] xorps xmm4,xmm12 movups XMMWORD PTR[32+rsi],xmm4 - jmp $L$ctr32_done -ALIGN 16 -$L$ctr32_one_shortcut:: - movups xmm2,XMMWORD PTR[r8] - movups xmm10,XMMWORD PTR[rdi] - mov eax,DWORD PTR[240+rcx] - movups xmm0,XMMWORD PTR[rcx] - movups xmm1,XMMWORD PTR[16+rcx] - lea rcx,QWORD PTR[32+rcx] - xorps xmm2,xmm0 -$L$oop_enc1_7:: -DB 102,15,56,220,209 - dec eax - movups xmm1,XMMWORD PTR[rcx] - lea rcx,QWORD PTR[16+rcx] - jnz $L$oop_enc1_7 -DB 102,15,56,221,209 - xorps xmm2,xmm10 - movups XMMWORD PTR[rsi],xmm2 - jmp $L$ctr32_done - -ALIGN 16 $L$ctr32_done:: + xorps xmm0,xmm0 + xor r11d,r11d + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 movaps xmm6,XMMWORD PTR[((-160))+rbp] + movaps XMMWORD PTR[(-160)+rbp],xmm0 movaps xmm7,XMMWORD PTR[((-144))+rbp] + movaps XMMWORD PTR[(-144)+rbp],xmm0 movaps xmm8,XMMWORD PTR[((-128))+rbp] + movaps XMMWORD PTR[(-128)+rbp],xmm0 movaps xmm9,XMMWORD PTR[((-112))+rbp] + movaps XMMWORD PTR[(-112)+rbp],xmm0 movaps xmm10,XMMWORD PTR[((-96))+rbp] + movaps XMMWORD PTR[(-96)+rbp],xmm0 movaps xmm11,XMMWORD PTR[((-80))+rbp] + movaps XMMWORD PTR[(-80)+rbp],xmm0 movaps xmm12,XMMWORD PTR[((-64))+rbp] + movaps XMMWORD PTR[(-64)+rbp],xmm0 movaps xmm13,XMMWORD PTR[((-48))+rbp] + movaps XMMWORD PTR[(-48)+rbp],xmm0 movaps xmm14,XMMWORD PTR[((-32))+rbp] + movaps XMMWORD PTR[(-32)+rbp],xmm0 movaps xmm15,XMMWORD PTR[((-16))+rbp] + movaps XMMWORD PTR[(-16)+rbp],xmm0 + movaps XMMWORD PTR[rsp],xmm0 + movaps XMMWORD PTR[16+rsp],xmm0 + movaps XMMWORD PTR[32+rsp],xmm0 + movaps XMMWORD PTR[48+rsp],xmm0 + movaps XMMWORD PTR[64+rsp],xmm0 + movaps XMMWORD PTR[80+rsp],xmm0 + movaps XMMWORD PTR[96+rsp],xmm0 + movaps XMMWORD PTR[112+rsp],xmm0 lea rsp,QWORD PTR[rbp] pop rbp $L$ctr32_epilogue:: @@ -1889,6 +1980,7 @@ 
DB 102,15,56,221,124,36,80 shr eax,4 $L$xts_enc_short:: + mov r10d,eax pxor xmm10,xmm0 add rdx,16*6 @@ -1917,6 +2009,7 @@ $L$xts_enc_short:: pxor xmm4,xmm12 pxor xmm5,xmm13 pxor xmm6,xmm14 + pxor xmm7,xmm7 call _aesni_encrypt6 @@ -2059,16 +2152,39 @@ DB 102,15,56,221,209 movups XMMWORD PTR[(-16)+rsi],xmm2 $L$xts_enc_ret:: + xorps xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 movaps xmm6,XMMWORD PTR[((-160))+rbp] + movaps XMMWORD PTR[(-160)+rbp],xmm0 movaps xmm7,XMMWORD PTR[((-144))+rbp] + movaps XMMWORD PTR[(-144)+rbp],xmm0 movaps xmm8,XMMWORD PTR[((-128))+rbp] + movaps XMMWORD PTR[(-128)+rbp],xmm0 movaps xmm9,XMMWORD PTR[((-112))+rbp] + movaps XMMWORD PTR[(-112)+rbp],xmm0 movaps xmm10,XMMWORD PTR[((-96))+rbp] + movaps XMMWORD PTR[(-96)+rbp],xmm0 movaps xmm11,XMMWORD PTR[((-80))+rbp] + movaps XMMWORD PTR[(-80)+rbp],xmm0 movaps xmm12,XMMWORD PTR[((-64))+rbp] + movaps XMMWORD PTR[(-64)+rbp],xmm0 movaps xmm13,XMMWORD PTR[((-48))+rbp] + movaps XMMWORD PTR[(-48)+rbp],xmm0 movaps xmm14,XMMWORD PTR[((-32))+rbp] + movaps XMMWORD PTR[(-32)+rbp],xmm0 movaps xmm15,XMMWORD PTR[((-16))+rbp] + movaps XMMWORD PTR[(-16)+rbp],xmm0 + movaps XMMWORD PTR[rsp],xmm0 + movaps XMMWORD PTR[16+rsp],xmm0 + movaps XMMWORD PTR[32+rsp],xmm0 + movaps XMMWORD PTR[48+rsp],xmm0 + movaps XMMWORD PTR[64+rsp],xmm0 + movaps XMMWORD PTR[80+rsp],xmm0 + movaps XMMWORD PTR[96+rsp],xmm0 lea rsp,QWORD PTR[rbp] pop rbp $L$xts_enc_epilogue:: @@ -2371,6 +2487,7 @@ DB 102,15,56,223,124,36,80 shr eax,4 $L$xts_dec_short:: + mov r10d,eax pxor xmm10,xmm0 pxor xmm11,xmm0 @@ -2573,16 +2690,39 @@ DB 102,15,56,223,209 movups XMMWORD PTR[rsi],xmm2 $L$xts_dec_ret:: + xorps xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 movaps xmm6,XMMWORD PTR[((-160))+rbp] + movaps XMMWORD PTR[(-160)+rbp],xmm0 movaps xmm7,XMMWORD PTR[((-144))+rbp] + movaps XMMWORD PTR[(-144)+rbp],xmm0 movaps xmm8,XMMWORD PTR[((-128))+rbp] + movaps XMMWORD 
PTR[(-128)+rbp],xmm0 movaps xmm9,XMMWORD PTR[((-112))+rbp] + movaps XMMWORD PTR[(-112)+rbp],xmm0 movaps xmm10,XMMWORD PTR[((-96))+rbp] + movaps XMMWORD PTR[(-96)+rbp],xmm0 movaps xmm11,XMMWORD PTR[((-80))+rbp] + movaps XMMWORD PTR[(-80)+rbp],xmm0 movaps xmm12,XMMWORD PTR[((-64))+rbp] + movaps XMMWORD PTR[(-64)+rbp],xmm0 movaps xmm13,XMMWORD PTR[((-48))+rbp] + movaps XMMWORD PTR[(-48)+rbp],xmm0 movaps xmm14,XMMWORD PTR[((-32))+rbp] + movaps XMMWORD PTR[(-32)+rbp],xmm0 movaps xmm15,XMMWORD PTR[((-16))+rbp] + movaps XMMWORD PTR[(-16)+rbp],xmm0 + movaps XMMWORD PTR[rsp],xmm0 + movaps XMMWORD PTR[16+rsp],xmm0 + movaps XMMWORD PTR[32+rsp],xmm0 + movaps XMMWORD PTR[48+rsp],xmm0 + movaps XMMWORD PTR[64+rsp],xmm0 + movaps XMMWORD PTR[80+rsp],xmm0 + movaps XMMWORD PTR[96+rsp],xmm0 lea rsp,QWORD PTR[rbp] pop rbp $L$xts_dec_epilogue:: @@ -2646,7 +2786,11 @@ DB 102,15,56,221,209 jnc $L$cbc_enc_loop add rdx,16 jnz $L$cbc_enc_tail + pxor xmm0,xmm0 + pxor xmm1,xmm1 movups XMMWORD PTR[r8],xmm2 + pxor xmm2,xmm2 + pxor xmm3,xmm3 jmp $L$cbc_ret $L$cbc_enc_tail:: @@ -2666,6 +2810,35 @@ $L$cbc_enc_tail:: ALIGN 16 $L$cbc_decrypt:: + cmp rdx,16 + jne $L$cbc_decrypt_bulk + + + + movdqu xmm2,XMMWORD PTR[rdi] + movdqu xmm3,XMMWORD PTR[r8] + movdqa xmm4,xmm2 + movups xmm0,XMMWORD PTR[rcx] + movups xmm1,XMMWORD PTR[16+rcx] + lea rcx,QWORD PTR[32+rcx] + xorps xmm2,xmm0 +$L$oop_dec1_16:: +DB 102,15,56,222,209 + dec r10d + movups xmm1,XMMWORD PTR[rcx] + lea rcx,QWORD PTR[16+rcx] + jnz $L$oop_dec1_16 +DB 102,15,56,223,209 + pxor xmm0,xmm0 + pxor xmm1,xmm1 + movdqu XMMWORD PTR[r8],xmm4 + xorps xmm2,xmm3 + pxor xmm3,xmm3 + movups XMMWORD PTR[rsi],xmm2 + pxor xmm2,xmm2 + jmp $L$cbc_ret +ALIGN 16 +$L$cbc_decrypt_bulk:: lea rax,QWORD PTR[rsp] push rbp sub rsp,176 @@ -2913,7 +3086,7 @@ DB 102,69,15,56,223,202 movaps xmm2,xmm9 lea rcx,QWORD PTR[((-112))+rcx] add rdx,070h - jle $L$cbc_dec_tail_collected + jle $L$cbc_dec_clear_tail_collected movups XMMWORD PTR[rsi],xmm9 lea rsi,QWORD PTR[16+rsi] cmp 
rdx,050h @@ -2932,14 +3105,19 @@ $L$cbc_dec_six_or_seven:: movdqu XMMWORD PTR[rsi],xmm2 pxor xmm4,xmm12 movdqu XMMWORD PTR[16+rsi],xmm3 + pxor xmm3,xmm3 pxor xmm5,xmm13 movdqu XMMWORD PTR[32+rsi],xmm4 + pxor xmm4,xmm4 pxor xmm6,xmm14 movdqu XMMWORD PTR[48+rsi],xmm5 + pxor xmm5,xmm5 pxor xmm7,xmm15 movdqu XMMWORD PTR[64+rsi],xmm6 + pxor xmm6,xmm6 lea rsi,QWORD PTR[80+rsi] movdqa xmm2,xmm7 + pxor xmm7,xmm7 jmp $L$cbc_dec_tail_collected ALIGN 16 @@ -2954,16 +3132,23 @@ $L$cbc_dec_seven:: movdqu XMMWORD PTR[rsi],xmm2 pxor xmm4,xmm12 movdqu XMMWORD PTR[16+rsi],xmm3 + pxor xmm3,xmm3 pxor xmm5,xmm13 movdqu XMMWORD PTR[32+rsi],xmm4 + pxor xmm4,xmm4 pxor xmm6,xmm14 movdqu XMMWORD PTR[48+rsi],xmm5 + pxor xmm5,xmm5 pxor xmm7,xmm15 movdqu XMMWORD PTR[64+rsi],xmm6 + pxor xmm6,xmm6 pxor xmm8,xmm9 movdqu XMMWORD PTR[80+rsi],xmm7 + pxor xmm7,xmm7 lea rsi,QWORD PTR[96+rsi] movdqa xmm2,xmm8 + pxor xmm8,xmm8 + pxor xmm9,xmm9 jmp $L$cbc_dec_tail_collected ALIGN 16 @@ -3007,7 +3192,7 @@ $L$cbc_dec_loop6_enter:: movdqa xmm2,xmm7 add rdx,050h - jle $L$cbc_dec_tail_collected + jle $L$cbc_dec_clear_tail_collected movups XMMWORD PTR[rsi],xmm7 lea rsi,QWORD PTR[16+rsi] @@ -3042,12 +3227,17 @@ $L$cbc_dec_tail:: movdqu XMMWORD PTR[rsi],xmm2 pxor xmm4,xmm12 movdqu XMMWORD PTR[16+rsi],xmm3 + pxor xmm3,xmm3 pxor xmm5,xmm13 movdqu XMMWORD PTR[32+rsi],xmm4 + pxor xmm4,xmm4 pxor xmm6,xmm14 movdqu XMMWORD PTR[48+rsi],xmm5 + pxor xmm5,xmm5 lea rsi,QWORD PTR[64+rsi] movdqa xmm2,xmm6 + pxor xmm6,xmm6 + pxor xmm7,xmm7 sub rdx,010h jmp $L$cbc_dec_tail_collected @@ -3058,12 +3248,12 @@ $L$cbc_dec_one:: movups xmm1,XMMWORD PTR[16+rcx] lea rcx,QWORD PTR[32+rcx] xorps xmm2,xmm0 -$L$oop_dec1_16:: +$L$oop_dec1_17:: DB 102,15,56,222,209 dec eax movups xmm1,XMMWORD PTR[rcx] lea rcx,QWORD PTR[16+rcx] - jnz $L$oop_dec1_16 + jnz $L$oop_dec1_17 DB 102,15,56,223,209 xorps xmm2,xmm10 movaps xmm10,xmm11 @@ -3077,6 +3267,7 @@ $L$cbc_dec_two:: pxor xmm3,xmm11 movdqu XMMWORD PTR[rsi],xmm2 movdqa xmm2,xmm3 + pxor xmm3,xmm3 
lea rsi,QWORD PTR[16+rsi] jmp $L$cbc_dec_tail_collected ALIGN 16 @@ -3089,7 +3280,9 @@ $L$cbc_dec_three:: movdqu XMMWORD PTR[rsi],xmm2 pxor xmm4,xmm12 movdqu XMMWORD PTR[16+rsi],xmm3 + pxor xmm3,xmm3 movdqa xmm2,xmm4 + pxor xmm4,xmm4 lea rsi,QWORD PTR[32+rsi] jmp $L$cbc_dec_tail_collected ALIGN 16 @@ -3102,39 +3295,61 @@ $L$cbc_dec_four:: movdqu XMMWORD PTR[rsi],xmm2 pxor xmm4,xmm12 movdqu XMMWORD PTR[16+rsi],xmm3 + pxor xmm3,xmm3 pxor xmm5,xmm13 movdqu XMMWORD PTR[32+rsi],xmm4 + pxor xmm4,xmm4 movdqa xmm2,xmm5 + pxor xmm5,xmm5 lea rsi,QWORD PTR[48+rsi] jmp $L$cbc_dec_tail_collected ALIGN 16 +$L$cbc_dec_clear_tail_collected:: + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 $L$cbc_dec_tail_collected:: movups XMMWORD PTR[r8],xmm10 and rdx,15 jnz $L$cbc_dec_tail_partial movups XMMWORD PTR[rsi],xmm2 + pxor xmm2,xmm2 jmp $L$cbc_dec_ret ALIGN 16 $L$cbc_dec_tail_partial:: movaps XMMWORD PTR[rsp],xmm2 + pxor xmm2,xmm2 mov rcx,16 mov rdi,rsi sub rcx,rdx lea rsi,QWORD PTR[rsp] DD 09066A4F3h + movdqa XMMWORD PTR[rsp],xmm2 $L$cbc_dec_ret:: + xorps xmm0,xmm0 + pxor xmm1,xmm1 movaps xmm6,XMMWORD PTR[16+rsp] + movaps XMMWORD PTR[16+rsp],xmm0 movaps xmm7,XMMWORD PTR[32+rsp] + movaps XMMWORD PTR[32+rsp],xmm0 movaps xmm8,XMMWORD PTR[48+rsp] + movaps XMMWORD PTR[48+rsp],xmm0 movaps xmm9,XMMWORD PTR[64+rsp] + movaps XMMWORD PTR[64+rsp],xmm0 movaps xmm10,XMMWORD PTR[80+rsp] + movaps XMMWORD PTR[80+rsp],xmm0 movaps xmm11,XMMWORD PTR[96+rsp] + movaps XMMWORD PTR[96+rsp],xmm0 movaps xmm12,XMMWORD PTR[112+rsp] + movaps XMMWORD PTR[112+rsp],xmm0 movaps xmm13,XMMWORD PTR[128+rsp] + movaps XMMWORD PTR[128+rsp],xmm0 movaps xmm14,XMMWORD PTR[144+rsp] + movaps XMMWORD PTR[144+rsp],xmm0 movaps xmm15,XMMWORD PTR[160+rsp] + movaps XMMWORD PTR[160+rsp],xmm0 lea rsp,QWORD PTR[rbp] pop rbp $L$cbc_ret:: @@ -3175,7 +3390,9 @@ DB 102,15,56,219,201 movups xmm0,XMMWORD PTR[r8] DB 102,15,56,219,192 + pxor xmm1,xmm1 movups XMMWORD PTR[rcx],xmm0 + pxor xmm0,xmm0 $L$dec_key_ret:: add rsp,8 DB 0F3h,0C3h 
;repret @@ -3193,8 +3410,10 @@ DB 048h,083h,0ECh,008h test r8,r8 jz $L$enc_key_ret + mov r10d,268437504 movups xmm0,XMMWORD PTR[rcx] xorps xmm4,xmm4 + and r10d,DWORD PTR[((OPENSSL_ia32cap_P+4))] lea rax,QWORD PTR[16+r8] cmp edx,256 je $L$14rounds @@ -3205,6 +3424,9 @@ DB 048h,083h,0ECh,008h $L$10rounds:: mov edx,9 + cmp r10d,268435456 + je $L$10rounds_alt + movups XMMWORD PTR[r8],xmm0 DB 102,15,58,223,200,1 call $L$key_expansion_128_cold @@ -3231,10 +3453,80 @@ DB 102,15,58,223,200,54 xor eax,eax jmp $L$enc_key_ret +ALIGN 16 +$L$10rounds_alt:: + movdqa xmm5,XMMWORD PTR[$L$key_rotate] + mov r10d,8 + movdqa xmm4,XMMWORD PTR[$L$key_rcon1] + movdqa xmm2,xmm0 + movdqu XMMWORD PTR[r8],xmm0 + jmp $L$oop_key128 + +ALIGN 16 +$L$oop_key128:: +DB 102,15,56,0,197 +DB 102,15,56,221,196 + pslld xmm4,1 + lea rax,QWORD PTR[16+rax] + + movdqa xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm2,xmm3 + + pxor xmm0,xmm2 + movdqu XMMWORD PTR[(-16)+rax],xmm0 + movdqa xmm2,xmm0 + + dec r10d + jnz $L$oop_key128 + + movdqa xmm4,XMMWORD PTR[$L$key_rcon1b] + +DB 102,15,56,0,197 +DB 102,15,56,221,196 + pslld xmm4,1 + + movdqa xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm2,xmm3 + + pxor xmm0,xmm2 + movdqu XMMWORD PTR[rax],xmm0 + + movdqa xmm2,xmm0 +DB 102,15,56,0,197 +DB 102,15,56,221,196 + + movdqa xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm2,xmm3 + + pxor xmm0,xmm2 + movdqu XMMWORD PTR[16+rax],xmm0 + + mov DWORD PTR[96+rax],edx + xor eax,eax + jmp $L$enc_key_ret + ALIGN 16 $L$12rounds:: movq xmm2,QWORD PTR[16+rcx] mov edx,11 + cmp r10d,268435456 + je $L$12rounds_alt + movups XMMWORD PTR[r8],xmm0 DB 102,15,58,223,202,1 call $L$key_expansion_192a_cold @@ -3257,11 +3549,55 @@ DB 102,15,58,223,202,128 xor rax,rax jmp $L$enc_key_ret +ALIGN 16 +$L$12rounds_alt:: + movdqa xmm5,XMMWORD PTR[$L$key_rotate192] + movdqa xmm4,XMMWORD 
PTR[$L$key_rcon1] + mov r10d,8 + movdqu XMMWORD PTR[r8],xmm0 + jmp $L$oop_key192 + +ALIGN 16 +$L$oop_key192:: + movq QWORD PTR[rax],xmm2 + movdqa xmm1,xmm2 +DB 102,15,56,0,213 +DB 102,15,56,221,212 + pslld xmm4,1 + lea rax,QWORD PTR[24+rax] + + movdqa xmm3,xmm0 + pslldq xmm0,4 + pxor xmm3,xmm0 + pslldq xmm0,4 + pxor xmm3,xmm0 + pslldq xmm0,4 + pxor xmm0,xmm3 + + pshufd xmm3,xmm0,0ffh + pxor xmm3,xmm1 + pslldq xmm1,4 + pxor xmm3,xmm1 + + pxor xmm0,xmm2 + pxor xmm2,xmm3 + movdqu XMMWORD PTR[(-16)+rax],xmm0 + + dec r10d + jnz $L$oop_key192 + + mov DWORD PTR[32+rax],edx + xor eax,eax + jmp $L$enc_key_ret + ALIGN 16 $L$14rounds:: movups xmm2,XMMWORD PTR[16+rcx] mov edx,13 lea rax,QWORD PTR[16+rax] + cmp r10d,268435456 + je $L$14rounds_alt + movups XMMWORD PTR[r8],xmm0 movups XMMWORD PTR[16+r8],xmm2 DB 102,15,58,223,202,1 @@ -3295,10 +3631,70 @@ DB 102,15,58,223,202,64 xor rax,rax jmp $L$enc_key_ret +ALIGN 16 +$L$14rounds_alt:: + movdqa xmm5,XMMWORD PTR[$L$key_rotate] + movdqa xmm4,XMMWORD PTR[$L$key_rcon1] + mov r10d,7 + movdqu XMMWORD PTR[r8],xmm0 + movdqa xmm1,xmm2 + movdqu XMMWORD PTR[16+r8],xmm2 + jmp $L$oop_key256 + +ALIGN 16 +$L$oop_key256:: +DB 102,15,56,0,213 +DB 102,15,56,221,212 + + movdqa xmm3,xmm0 + pslldq xmm0,4 + pxor xmm3,xmm0 + pslldq xmm0,4 + pxor xmm3,xmm0 + pslldq xmm0,4 + pxor xmm0,xmm3 + pslld xmm4,1 + + pxor xmm0,xmm2 + movdqu XMMWORD PTR[rax],xmm0 + + dec r10d + jz $L$done_key256 + + pshufd xmm2,xmm0,0ffh + pxor xmm3,xmm3 +DB 102,15,56,221,211 + + movdqa xmm3,xmm1 + pslldq xmm1,4 + pxor xmm3,xmm1 + pslldq xmm1,4 + pxor xmm3,xmm1 + pslldq xmm1,4 + pxor xmm1,xmm3 + + pxor xmm2,xmm1 + movdqu XMMWORD PTR[16+rax],xmm2 + lea rax,QWORD PTR[32+rax] + movdqa xmm1,xmm2 + + jmp $L$oop_key256 + +$L$done_key256:: + mov DWORD PTR[16+rax],edx + xor eax,eax + jmp $L$enc_key_ret + ALIGN 16 $L$bad_keybits:: mov rax,-2 $L$enc_key_ret:: + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 add rsp,8 DB 0F3h,0C3h ;repret 
$L$SEH_end_set_encrypt_key:: @@ -3384,6 +3780,14 @@ $L$xts_magic:: DD 087h,0,1,0 $L$increment1:: DB 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +$L$key_rotate:: + DD 00c0f0e0dh,00c0f0e0dh,00c0f0e0dh,00c0f0e0dh +$L$key_rotate192:: + DD 004070605h,004070605h,004070605h,004070605h +$L$key_rcon1:: + DD 1,1,1,1 +$L$key_rcon1b:: + DD 01bh,01bh,01bh,01bh DB 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69 DB 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83 @@ -3489,7 +3893,7 @@ cbc_se_handler PROC PRIVATE mov rax,QWORD PTR[152+r8] mov rbx,QWORD PTR[248+r8] - lea r10,QWORD PTR[$L$cbc_decrypt] + lea r10,QWORD PTR[$L$cbc_decrypt_bulk] cmp rbx,r10 jb $L$common_seh_tail diff --git a/deps/openssl/asm/x64-win32-masm/bn/x86_64-mont5.asm b/deps/openssl/asm/x64-win32-masm/bn/x86_64-mont5.asm index 64a1b42cfe..9fdd91d016 100644 --- a/deps/openssl/asm/x64-win32-masm/bn/x86_64-mont5.asm +++ b/deps/openssl/asm/x64-win32-masm/bn/x86_64-mont5.asm @@ -3001,11 +3001,16 @@ PUBLIC bn_get_bits5 ALIGN 16 bn_get_bits5 PROC PUBLIC - mov r10,rcx + lea r10,QWORD PTR[rcx] + lea r11,QWORD PTR[1+rcx] mov ecx,edx - shr edx,3 - movzx eax,WORD PTR[rdx*1+r10] - and ecx,7 + shr edx,4 + and ecx,15 + lea eax,DWORD PTR[((-8))+rcx] + cmp ecx,11 + cmova r10,r11 + cmova ecx,eax + movzx eax,WORD PTR[rdx*2+r10] shr eax,cl and eax,31 DB 0F3h,0C3h ;repret diff --git a/deps/openssl/asm/x86-elf-gas/aes/aesni-x86.s b/deps/openssl/asm/x86-elf-gas/aes/aesni-x86.s index a68f7cdbe9..3bbc4e47d6 100644 --- a/deps/openssl/asm/x86-elf-gas/aes/aesni-x86.s +++ b/deps/openssl/asm/x86-elf-gas/aes/aesni-x86.s @@ -21,7 +21,10 @@ aesni_encrypt: leal 16(%edx),%edx jnz .L000enc1_loop_1 .byte 102,15,56,221,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 movups %xmm2,(%eax) + pxor %xmm2,%xmm2 ret .size aesni_encrypt,.-.L_aesni_encrypt_begin .globl aesni_decrypt @@ -45,7 +48,10 @@ aesni_decrypt: leal 16(%edx),%edx jnz .L001dec1_loop_2 .byte 102,15,56,223,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 movups %xmm2,(%eax) + pxor %xmm2,%xmm2 ret .size 
aesni_decrypt,.-.L_aesni_decrypt_begin .type _aesni_encrypt2,@function @@ -259,17 +265,15 @@ _aesni_encrypt6: negl %ecx .byte 102,15,56,220,225 pxor %xmm0,%xmm7 + movups (%edx,%ecx,1),%xmm0 addl $16,%ecx -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 - movups -16(%edx,%ecx,1),%xmm0 - jmp .L_aesni_encrypt6_enter + jmp .L008_aesni_encrypt6_inner .align 16 -.L008enc6_loop: +.L009enc6_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 +.L008_aesni_encrypt6_inner: .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 @@ -283,7 +287,7 @@ _aesni_encrypt6: .byte 102,15,56,220,240 .byte 102,15,56,220,248 movups -16(%edx,%ecx,1),%xmm0 - jnz .L008enc6_loop + jnz .L009enc6_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -315,17 +319,15 @@ _aesni_decrypt6: negl %ecx .byte 102,15,56,222,225 pxor %xmm0,%xmm7 + movups (%edx,%ecx,1),%xmm0 addl $16,%ecx -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 - movups -16(%edx,%ecx,1),%xmm0 - jmp .L_aesni_decrypt6_enter + jmp .L010_aesni_decrypt6_inner .align 16 -.L009dec6_loop: +.L011dec6_loop: .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 +.L010_aesni_decrypt6_inner: .byte 102,15,56,222,233 .byte 102,15,56,222,241 .byte 102,15,56,222,249 @@ -339,7 +341,7 @@ _aesni_decrypt6: .byte 102,15,56,222,240 .byte 102,15,56,222,248 movups -16(%edx,%ecx,1),%xmm0 - jnz .L009dec6_loop + jnz .L011dec6_loop .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -369,14 +371,14 @@ aesni_ecb_encrypt: movl 32(%esp),%edx movl 36(%esp),%ebx andl $-16,%eax - jz .L010ecb_ret + jz .L012ecb_ret movl 240(%edx),%ecx testl %ebx,%ebx - jz .L011ecb_decrypt + jz .L013ecb_decrypt movl %edx,%ebp movl %ecx,%ebx cmpl $96,%eax - jb .L012ecb_enc_tail + jb .L014ecb_enc_tail movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -385,9 +387,9 @@ aesni_ecb_encrypt: movdqu 80(%esi),%xmm7 leal 
96(%esi),%esi subl $96,%eax - jmp .L013ecb_enc_loop6_enter + jmp .L015ecb_enc_loop6_enter .align 16 -.L014ecb_enc_loop6: +.L016ecb_enc_loop6: movups %xmm2,(%edi) movdqu (%esi),%xmm2 movups %xmm3,16(%edi) @@ -402,12 +404,12 @@ aesni_ecb_encrypt: leal 96(%edi),%edi movdqu 80(%esi),%xmm7 leal 96(%esi),%esi -.L013ecb_enc_loop6_enter: +.L015ecb_enc_loop6_enter: call _aesni_encrypt6 movl %ebp,%edx movl %ebx,%ecx subl $96,%eax - jnc .L014ecb_enc_loop6 + jnc .L016ecb_enc_loop6 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) @@ -416,18 +418,18 @@ aesni_ecb_encrypt: movups %xmm7,80(%edi) leal 96(%edi),%edi addl $96,%eax - jz .L010ecb_ret -.L012ecb_enc_tail: + jz .L012ecb_ret +.L014ecb_enc_tail: movups (%esi),%xmm2 cmpl $32,%eax - jb .L015ecb_enc_one + jb .L017ecb_enc_one movups 16(%esi),%xmm3 - je .L016ecb_enc_two + je .L018ecb_enc_two movups 32(%esi),%xmm4 cmpl $64,%eax - jb .L017ecb_enc_three + jb .L019ecb_enc_three movups 48(%esi),%xmm5 - je .L018ecb_enc_four + je .L020ecb_enc_four movups 64(%esi),%xmm6 xorps %xmm7,%xmm7 call _aesni_encrypt6 @@ -436,49 +438,49 @@ aesni_ecb_encrypt: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp .L010ecb_ret + jmp .L012ecb_ret .align 16 -.L015ecb_enc_one: +.L017ecb_enc_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L019enc1_loop_3: +.L021enc1_loop_3: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L019enc1_loop_3 + jnz .L021enc1_loop_3 .byte 102,15,56,221,209 movups %xmm2,(%edi) - jmp .L010ecb_ret + jmp .L012ecb_ret .align 16 -.L016ecb_enc_two: +.L018ecb_enc_two: call _aesni_encrypt2 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp .L010ecb_ret + jmp .L012ecb_ret .align 16 -.L017ecb_enc_three: +.L019ecb_enc_three: call _aesni_encrypt3 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp .L010ecb_ret + jmp .L012ecb_ret .align 16 -.L018ecb_enc_four: +.L020ecb_enc_four: call _aesni_encrypt4 movups %xmm2,(%edi) movups 
%xmm3,16(%edi) movups %xmm4,32(%edi) movups %xmm5,48(%edi) - jmp .L010ecb_ret + jmp .L012ecb_ret .align 16 -.L011ecb_decrypt: +.L013ecb_decrypt: movl %edx,%ebp movl %ecx,%ebx cmpl $96,%eax - jb .L020ecb_dec_tail + jb .L022ecb_dec_tail movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -487,9 +489,9 @@ aesni_ecb_encrypt: movdqu 80(%esi),%xmm7 leal 96(%esi),%esi subl $96,%eax - jmp .L021ecb_dec_loop6_enter + jmp .L023ecb_dec_loop6_enter .align 16 -.L022ecb_dec_loop6: +.L024ecb_dec_loop6: movups %xmm2,(%edi) movdqu (%esi),%xmm2 movups %xmm3,16(%edi) @@ -504,12 +506,12 @@ aesni_ecb_encrypt: leal 96(%edi),%edi movdqu 80(%esi),%xmm7 leal 96(%esi),%esi -.L021ecb_dec_loop6_enter: +.L023ecb_dec_loop6_enter: call _aesni_decrypt6 movl %ebp,%edx movl %ebx,%ecx subl $96,%eax - jnc .L022ecb_dec_loop6 + jnc .L024ecb_dec_loop6 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) @@ -518,18 +520,18 @@ aesni_ecb_encrypt: movups %xmm7,80(%edi) leal 96(%edi),%edi addl $96,%eax - jz .L010ecb_ret -.L020ecb_dec_tail: + jz .L012ecb_ret +.L022ecb_dec_tail: movups (%esi),%xmm2 cmpl $32,%eax - jb .L023ecb_dec_one + jb .L025ecb_dec_one movups 16(%esi),%xmm3 - je .L024ecb_dec_two + je .L026ecb_dec_two movups 32(%esi),%xmm4 cmpl $64,%eax - jb .L025ecb_dec_three + jb .L027ecb_dec_three movups 48(%esi),%xmm5 - je .L026ecb_dec_four + je .L028ecb_dec_four movups 64(%esi),%xmm6 xorps %xmm7,%xmm7 call _aesni_decrypt6 @@ -538,43 +540,51 @@ aesni_ecb_encrypt: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp .L010ecb_ret + jmp .L012ecb_ret .align 16 -.L023ecb_dec_one: +.L025ecb_dec_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L027dec1_loop_4: +.L029dec1_loop_4: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L027dec1_loop_4 + jnz .L029dec1_loop_4 .byte 102,15,56,223,209 movups %xmm2,(%edi) - jmp .L010ecb_ret + jmp .L012ecb_ret .align 16 -.L024ecb_dec_two: +.L026ecb_dec_two: call 
_aesni_decrypt2 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp .L010ecb_ret + jmp .L012ecb_ret .align 16 -.L025ecb_dec_three: +.L027ecb_dec_three: call _aesni_decrypt3 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp .L010ecb_ret + jmp .L012ecb_ret .align 16 -.L026ecb_dec_four: +.L028ecb_dec_four: call _aesni_decrypt4 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) movups %xmm5,48(%edi) -.L010ecb_ret: +.L012ecb_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 popl %edi popl %esi popl %ebx @@ -621,7 +631,7 @@ aesni_ccm64_encrypt_blocks: leal 32(%edx,%ecx,1),%edx subl %ecx,%ebx .byte 102,15,56,0,253 -.L028ccm64_enc_outer: +.L030ccm64_enc_outer: movups (%ebp),%xmm0 movl %ebx,%ecx movups (%esi),%xmm6 @@ -630,7 +640,7 @@ aesni_ccm64_encrypt_blocks: xorps %xmm6,%xmm0 xorps %xmm0,%xmm3 movups 32(%ebp),%xmm0 -.L029ccm64_enc2_loop: +.L031ccm64_enc2_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 movups (%edx,%ecx,1),%xmm1 @@ -638,7 +648,7 @@ aesni_ccm64_encrypt_blocks: .byte 102,15,56,220,208 .byte 102,15,56,220,216 movups -16(%edx,%ecx,1),%xmm0 - jnz .L029ccm64_enc2_loop + jnz .L031ccm64_enc2_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 paddq 16(%esp),%xmm7 @@ -651,10 +661,18 @@ aesni_ccm64_encrypt_blocks: movups %xmm6,(%edi) .byte 102,15,56,0,213 leal 16(%edi),%edi - jnz .L028ccm64_enc_outer + jnz .L030ccm64_enc_outer movl 48(%esp),%esp movl 40(%esp),%edi movups %xmm3,(%edi) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 popl %edi popl %esi popl %ebx @@ -702,12 +720,12 @@ aesni_ccm64_decrypt_blocks: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L030enc1_loop_5: +.L032enc1_loop_5: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L030enc1_loop_5 + jnz .L032enc1_loop_5 
.byte 102,15,56,221,209 shll $4,%ebx movl $16,%ecx @@ -717,16 +735,16 @@ aesni_ccm64_decrypt_blocks: subl %ebx,%ecx leal 32(%ebp,%ebx,1),%edx movl %ecx,%ebx - jmp .L031ccm64_dec_outer + jmp .L033ccm64_dec_outer .align 16 -.L031ccm64_dec_outer: +.L033ccm64_dec_outer: xorps %xmm2,%xmm6 movdqa %xmm7,%xmm2 movups %xmm6,(%edi) leal 16(%edi),%edi .byte 102,15,56,0,213 subl $1,%eax - jz .L032ccm64_dec_break + jz .L034ccm64_dec_break movups (%ebp),%xmm0 movl %ebx,%ecx movups 16(%ebp),%xmm1 @@ -734,7 +752,7 @@ aesni_ccm64_decrypt_blocks: xorps %xmm0,%xmm2 xorps %xmm6,%xmm3 movups 32(%ebp),%xmm0 -.L033ccm64_dec2_loop: +.L035ccm64_dec2_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 movups (%edx,%ecx,1),%xmm1 @@ -742,7 +760,7 @@ aesni_ccm64_decrypt_blocks: .byte 102,15,56,220,208 .byte 102,15,56,220,216 movups -16(%edx,%ecx,1),%xmm0 - jnz .L033ccm64_dec2_loop + jnz .L035ccm64_dec2_loop movups (%esi),%xmm6 paddq 16(%esp),%xmm7 .byte 102,15,56,220,209 @@ -750,9 +768,9 @@ aesni_ccm64_decrypt_blocks: .byte 102,15,56,221,208 .byte 102,15,56,221,216 leal 16(%esi),%esi - jmp .L031ccm64_dec_outer + jmp .L033ccm64_dec_outer .align 16 -.L032ccm64_dec_break: +.L034ccm64_dec_break: movl 240(%ebp),%ecx movl %ebp,%edx movups (%edx),%xmm0 @@ -760,16 +778,24 @@ aesni_ccm64_decrypt_blocks: xorps %xmm0,%xmm6 leal 32(%edx),%edx xorps %xmm6,%xmm3 -.L034enc1_loop_6: +.L036enc1_loop_6: .byte 102,15,56,220,217 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L034enc1_loop_6 + jnz .L036enc1_loop_6 .byte 102,15,56,221,217 movl 48(%esp),%esp movl 40(%esp),%edi movups %xmm3,(%edi) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 popl %edi popl %esi popl %ebx @@ -795,7 +821,7 @@ aesni_ctr32_encrypt_blocks: andl $-16,%esp movl %ebp,80(%esp) cmpl $1,%eax - je .L035ctr32_one_shortcut + je .L037ctr32_one_shortcut movdqu (%ebx),%xmm7 movl $202182159,(%esp) movl $134810123,4(%esp) @@ -833,7 
+859,7 @@ aesni_ctr32_encrypt_blocks: pshufd $192,%xmm0,%xmm2 pshufd $128,%xmm0,%xmm3 cmpl $6,%eax - jb .L036ctr32_tail + jb .L038ctr32_tail pxor %xmm6,%xmm7 shll $4,%ecx movl $16,%ebx @@ -842,9 +868,9 @@ aesni_ctr32_encrypt_blocks: subl %ecx,%ebx leal 32(%edx,%ecx,1),%edx subl $6,%eax - jmp .L037ctr32_loop6 + jmp .L039ctr32_loop6 .align 16 -.L037ctr32_loop6: +.L039ctr32_loop6: pshufd $64,%xmm0,%xmm4 movdqa 32(%esp),%xmm0 pshufd $192,%xmm1,%xmm5 @@ -898,27 +924,27 @@ aesni_ctr32_encrypt_blocks: leal 96(%edi),%edi pshufd $128,%xmm0,%xmm3 subl $6,%eax - jnc .L037ctr32_loop6 + jnc .L039ctr32_loop6 addl $6,%eax - jz .L038ctr32_ret + jz .L040ctr32_ret movdqu (%ebp),%xmm7 movl %ebp,%edx pxor 32(%esp),%xmm7 movl 240(%ebp),%ecx -.L036ctr32_tail: +.L038ctr32_tail: por %xmm7,%xmm2 cmpl $2,%eax - jb .L039ctr32_one + jb .L041ctr32_one pshufd $64,%xmm0,%xmm4 por %xmm7,%xmm3 - je .L040ctr32_two + je .L042ctr32_two pshufd $192,%xmm1,%xmm5 por %xmm7,%xmm4 cmpl $4,%eax - jb .L041ctr32_three + jb .L043ctr32_three pshufd $128,%xmm1,%xmm6 por %xmm7,%xmm5 - je .L042ctr32_four + je .L044ctr32_four por %xmm7,%xmm6 call _aesni_encrypt6 movups (%esi),%xmm1 @@ -936,29 +962,29 @@ aesni_ctr32_encrypt_blocks: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp .L038ctr32_ret + jmp .L040ctr32_ret .align 16 -.L035ctr32_one_shortcut: +.L037ctr32_one_shortcut: movups (%ebx),%xmm2 movl 240(%edx),%ecx -.L039ctr32_one: +.L041ctr32_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L043enc1_loop_7: +.L045enc1_loop_7: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L043enc1_loop_7 + jnz .L045enc1_loop_7 .byte 102,15,56,221,209 movups (%esi),%xmm6 xorps %xmm2,%xmm6 movups %xmm6,(%edi) - jmp .L038ctr32_ret + jmp .L040ctr32_ret .align 16 -.L040ctr32_two: +.L042ctr32_two: call _aesni_encrypt2 movups (%esi),%xmm5 movups 16(%esi),%xmm6 @@ -966,9 +992,9 @@ aesni_ctr32_encrypt_blocks: xorps %xmm6,%xmm3 movups %xmm2,(%edi) 
movups %xmm3,16(%edi) - jmp .L038ctr32_ret + jmp .L040ctr32_ret .align 16 -.L041ctr32_three: +.L043ctr32_three: call _aesni_encrypt3 movups (%esi),%xmm5 movups 16(%esi),%xmm6 @@ -979,9 +1005,9 @@ aesni_ctr32_encrypt_blocks: xorps %xmm7,%xmm4 movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp .L038ctr32_ret + jmp .L040ctr32_ret .align 16 -.L042ctr32_four: +.L044ctr32_four: call _aesni_encrypt4 movups (%esi),%xmm6 movups 16(%esi),%xmm7 @@ -995,7 +1021,18 @@ aesni_ctr32_encrypt_blocks: xorps %xmm0,%xmm5 movups %xmm4,32(%edi) movups %xmm5,48(%edi) -.L038ctr32_ret: +.L040ctr32_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + movdqa %xmm0,32(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm0,48(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm0,64(%esp) + pxor %xmm7,%xmm7 movl 80(%esp),%esp popl %edi popl %esi @@ -1020,12 +1057,12 @@ aesni_xts_encrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L044enc1_loop_8: +.L046enc1_loop_8: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L044enc1_loop_8 + jnz .L046enc1_loop_8 .byte 102,15,56,221,209 movl 20(%esp),%esi movl 24(%esp),%edi @@ -1049,14 +1086,14 @@ aesni_xts_encrypt: movl %edx,%ebp movl %ecx,%ebx subl $96,%eax - jc .L045xts_enc_short + jc .L047xts_enc_short shll $4,%ecx movl $16,%ebx subl %ecx,%ebx leal 32(%edx,%ecx,1),%edx - jmp .L046xts_enc_loop6 + jmp .L048xts_enc_loop6 .align 16 -.L046xts_enc_loop6: +.L048xts_enc_loop6: pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,(%esp) @@ -1145,23 +1182,23 @@ aesni_xts_encrypt: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 subl $96,%eax - jnc .L046xts_enc_loop6 + jnc .L048xts_enc_loop6 movl 240(%ebp),%ecx movl %ebp,%edx movl %ecx,%ebx -.L045xts_enc_short: +.L047xts_enc_short: addl $96,%eax - jz .L047xts_enc_done6x + jz .L049xts_enc_done6x movdqa %xmm1,%xmm5 cmpl $32,%eax - jb .L048xts_enc_one + jb .L050xts_enc_one pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd 
%xmm1,%xmm0 pxor %xmm2,%xmm1 - je .L049xts_enc_two + je .L051xts_enc_two pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm6 @@ -1170,7 +1207,7 @@ aesni_xts_encrypt: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 cmpl $64,%eax - jb .L050xts_enc_three + jb .L052xts_enc_three pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm7 @@ -1180,7 +1217,7 @@ aesni_xts_encrypt: pxor %xmm2,%xmm1 movdqa %xmm5,(%esp) movdqa %xmm6,16(%esp) - je .L051xts_enc_four + je .L053xts_enc_four movdqa %xmm7,32(%esp) pshufd $19,%xmm0,%xmm7 movdqa %xmm1,48(%esp) @@ -1212,9 +1249,9 @@ aesni_xts_encrypt: movups %xmm5,48(%edi) movups %xmm6,64(%edi) leal 80(%edi),%edi - jmp .L052xts_enc_done + jmp .L054xts_enc_done .align 16 -.L048xts_enc_one: +.L050xts_enc_one: movups (%esi),%xmm2 leal 16(%esi),%esi xorps %xmm5,%xmm2 @@ -1222,20 +1259,20 @@ aesni_xts_encrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L053enc1_loop_9: +.L055enc1_loop_9: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L053enc1_loop_9 + jnz .L055enc1_loop_9 .byte 102,15,56,221,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) leal 16(%edi),%edi movdqa %xmm5,%xmm1 - jmp .L052xts_enc_done + jmp .L054xts_enc_done .align 16 -.L049xts_enc_two: +.L051xts_enc_two: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1249,9 +1286,9 @@ aesni_xts_encrypt: movups %xmm3,16(%edi) leal 32(%edi),%edi movdqa %xmm6,%xmm1 - jmp .L052xts_enc_done + jmp .L054xts_enc_done .align 16 -.L050xts_enc_three: +.L052xts_enc_three: movaps %xmm1,%xmm7 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1269,9 +1306,9 @@ aesni_xts_encrypt: movups %xmm4,32(%edi) leal 48(%edi),%edi movdqa %xmm7,%xmm1 - jmp .L052xts_enc_done + jmp .L054xts_enc_done .align 16 -.L051xts_enc_four: +.L053xts_enc_four: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1293,28 +1330,28 @@ aesni_xts_encrypt: movups %xmm5,48(%edi) leal 64(%edi),%edi movdqa %xmm6,%xmm1 - jmp .L052xts_enc_done + jmp .L054xts_enc_done .align 16 
-.L047xts_enc_done6x: +.L049xts_enc_done6x: movl 112(%esp),%eax andl $15,%eax - jz .L054xts_enc_ret + jz .L056xts_enc_ret movdqa %xmm1,%xmm5 movl %eax,112(%esp) - jmp .L055xts_enc_steal + jmp .L057xts_enc_steal .align 16 -.L052xts_enc_done: +.L054xts_enc_done: movl 112(%esp),%eax pxor %xmm0,%xmm0 andl $15,%eax - jz .L054xts_enc_ret + jz .L056xts_enc_ret pcmpgtd %xmm1,%xmm0 movl %eax,112(%esp) pshufd $19,%xmm0,%xmm5 paddq %xmm1,%xmm1 pand 96(%esp),%xmm5 pxor %xmm1,%xmm5 -.L055xts_enc_steal: +.L057xts_enc_steal: movzbl (%esi),%ecx movzbl -16(%edi),%edx leal 1(%esi),%esi @@ -1322,7 +1359,7 @@ aesni_xts_encrypt: movb %dl,(%edi) leal 1(%edi),%edi subl $1,%eax - jnz .L055xts_enc_steal + jnz .L057xts_enc_steal subl 112(%esp),%edi movl %ebp,%edx movl %ebx,%ecx @@ -1332,16 +1369,30 @@ aesni_xts_encrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L056enc1_loop_10: +.L058enc1_loop_10: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L056enc1_loop_10 + jnz .L058enc1_loop_10 .byte 102,15,56,221,209 xorps %xmm5,%xmm2 movups %xmm2,-16(%edi) -.L054xts_enc_ret: +.L056xts_enc_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + movdqa %xmm0,(%esp) + pxor %xmm3,%xmm3 + movdqa %xmm0,16(%esp) + pxor %xmm4,%xmm4 + movdqa %xmm0,32(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm0,48(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm0,64(%esp) + pxor %xmm7,%xmm7 + movdqa %xmm0,80(%esp) movl 116(%esp),%esp popl %edi popl %esi @@ -1366,12 +1417,12 @@ aesni_xts_decrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L057enc1_loop_11: +.L059enc1_loop_11: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L057enc1_loop_11 + jnz .L059enc1_loop_11 .byte 102,15,56,221,209 movl 20(%esp),%esi movl 24(%esp),%edi @@ -1400,14 +1451,14 @@ aesni_xts_decrypt: pcmpgtd %xmm1,%xmm0 andl $-16,%eax subl $96,%eax - jc .L058xts_dec_short + jc .L060xts_dec_short shll $4,%ecx movl $16,%ebx subl %ecx,%ebx leal 32(%edx,%ecx,1),%edx - 
jmp .L059xts_dec_loop6 + jmp .L061xts_dec_loop6 .align 16 -.L059xts_dec_loop6: +.L061xts_dec_loop6: pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,(%esp) @@ -1496,23 +1547,23 @@ aesni_xts_decrypt: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 subl $96,%eax - jnc .L059xts_dec_loop6 + jnc .L061xts_dec_loop6 movl 240(%ebp),%ecx movl %ebp,%edx movl %ecx,%ebx -.L058xts_dec_short: +.L060xts_dec_short: addl $96,%eax - jz .L060xts_dec_done6x + jz .L062xts_dec_done6x movdqa %xmm1,%xmm5 cmpl $32,%eax - jb .L061xts_dec_one + jb .L063xts_dec_one pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 - je .L062xts_dec_two + je .L064xts_dec_two pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm6 @@ -1521,7 +1572,7 @@ aesni_xts_decrypt: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 cmpl $64,%eax - jb .L063xts_dec_three + jb .L065xts_dec_three pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm7 @@ -1531,7 +1582,7 @@ aesni_xts_decrypt: pxor %xmm2,%xmm1 movdqa %xmm5,(%esp) movdqa %xmm6,16(%esp) - je .L064xts_dec_four + je .L066xts_dec_four movdqa %xmm7,32(%esp) pshufd $19,%xmm0,%xmm7 movdqa %xmm1,48(%esp) @@ -1563,9 +1614,9 @@ aesni_xts_decrypt: movups %xmm5,48(%edi) movups %xmm6,64(%edi) leal 80(%edi),%edi - jmp .L065xts_dec_done + jmp .L067xts_dec_done .align 16 -.L061xts_dec_one: +.L063xts_dec_one: movups (%esi),%xmm2 leal 16(%esi),%esi xorps %xmm5,%xmm2 @@ -1573,20 +1624,20 @@ aesni_xts_decrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L066dec1_loop_12: +.L068dec1_loop_12: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L066dec1_loop_12 + jnz .L068dec1_loop_12 .byte 102,15,56,223,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) leal 16(%edi),%edi movdqa %xmm5,%xmm1 - jmp .L065xts_dec_done + jmp .L067xts_dec_done .align 16 -.L062xts_dec_two: +.L064xts_dec_two: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1600,9 +1651,9 @@ aesni_xts_decrypt: movups 
%xmm3,16(%edi) leal 32(%edi),%edi movdqa %xmm6,%xmm1 - jmp .L065xts_dec_done + jmp .L067xts_dec_done .align 16 -.L063xts_dec_three: +.L065xts_dec_three: movaps %xmm1,%xmm7 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1620,9 +1671,9 @@ aesni_xts_decrypt: movups %xmm4,32(%edi) leal 48(%edi),%edi movdqa %xmm7,%xmm1 - jmp .L065xts_dec_done + jmp .L067xts_dec_done .align 16 -.L064xts_dec_four: +.L066xts_dec_four: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1644,20 +1695,20 @@ aesni_xts_decrypt: movups %xmm5,48(%edi) leal 64(%edi),%edi movdqa %xmm6,%xmm1 - jmp .L065xts_dec_done + jmp .L067xts_dec_done .align 16 -.L060xts_dec_done6x: +.L062xts_dec_done6x: movl 112(%esp),%eax andl $15,%eax - jz .L067xts_dec_ret + jz .L069xts_dec_ret movl %eax,112(%esp) - jmp .L068xts_dec_only_one_more + jmp .L070xts_dec_only_one_more .align 16 -.L065xts_dec_done: +.L067xts_dec_done: movl 112(%esp),%eax pxor %xmm0,%xmm0 andl $15,%eax - jz .L067xts_dec_ret + jz .L069xts_dec_ret pcmpgtd %xmm1,%xmm0 movl %eax,112(%esp) pshufd $19,%xmm0,%xmm2 @@ -1667,7 +1718,7 @@ aesni_xts_decrypt: pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 -.L068xts_dec_only_one_more: +.L070xts_dec_only_one_more: pshufd $19,%xmm0,%xmm5 movdqa %xmm1,%xmm6 paddq %xmm1,%xmm1 @@ -1681,16 +1732,16 @@ aesni_xts_decrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L069dec1_loop_13: +.L071dec1_loop_13: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L069dec1_loop_13 + jnz .L071dec1_loop_13 .byte 102,15,56,223,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) -.L070xts_dec_steal: +.L072xts_dec_steal: movzbl 16(%esi),%ecx movzbl (%edi),%edx leal 1(%esi),%esi @@ -1698,7 +1749,7 @@ aesni_xts_decrypt: movb %dl,16(%edi) leal 1(%edi),%edi subl $1,%eax - jnz .L070xts_dec_steal + jnz .L072xts_dec_steal subl 112(%esp),%edi movl %ebp,%edx movl %ebx,%ecx @@ -1708,16 +1759,30 @@ aesni_xts_decrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 
-.L071dec1_loop_14: +.L073dec1_loop_14: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L071dec1_loop_14 + jnz .L073dec1_loop_14 .byte 102,15,56,223,209 xorps %xmm6,%xmm2 movups %xmm2,(%edi) -.L067xts_dec_ret: +.L069xts_dec_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + movdqa %xmm0,(%esp) + pxor %xmm3,%xmm3 + movdqa %xmm0,16(%esp) + pxor %xmm4,%xmm4 + movdqa %xmm0,32(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm0,48(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm0,64(%esp) + pxor %xmm7,%xmm7 + movdqa %xmm0,80(%esp) movl 116(%esp),%esp popl %edi popl %esi @@ -1743,7 +1808,7 @@ aesni_cbc_encrypt: movl 32(%esp),%edx movl 36(%esp),%ebp testl %eax,%eax - jz .L072cbc_abort + jz .L074cbc_abort cmpl $0,40(%esp) xchgl %esp,%ebx movups (%ebp),%xmm7 @@ -1751,14 +1816,14 @@ aesni_cbc_encrypt: movl %edx,%ebp movl %ebx,16(%esp) movl %ecx,%ebx - je .L073cbc_decrypt + je .L075cbc_decrypt movaps %xmm7,%xmm2 cmpl $16,%eax - jb .L074cbc_enc_tail + jb .L076cbc_enc_tail subl $16,%eax - jmp .L075cbc_enc_loop + jmp .L077cbc_enc_loop .align 16 -.L075cbc_enc_loop: +.L077cbc_enc_loop: movups (%esi),%xmm7 leal 16(%esi),%esi movups (%edx),%xmm0 @@ -1766,24 +1831,25 @@ aesni_cbc_encrypt: xorps %xmm0,%xmm7 leal 32(%edx),%edx xorps %xmm7,%xmm2 -.L076enc1_loop_15: +.L078enc1_loop_15: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L076enc1_loop_15 + jnz .L078enc1_loop_15 .byte 102,15,56,221,209 movl %ebx,%ecx movl %ebp,%edx movups %xmm2,(%edi) leal 16(%edi),%edi subl $16,%eax - jnc .L075cbc_enc_loop + jnc .L077cbc_enc_loop addl $16,%eax - jnz .L074cbc_enc_tail + jnz .L076cbc_enc_tail movaps %xmm2,%xmm7 - jmp .L077cbc_ret -.L074cbc_enc_tail: + pxor %xmm2,%xmm2 + jmp .L079cbc_ret +.L076cbc_enc_tail: movl %eax,%ecx .long 2767451785 movl $16,%ecx @@ -1794,20 +1860,20 @@ aesni_cbc_encrypt: movl %ebx,%ecx movl %edi,%esi movl %ebp,%edx - jmp .L075cbc_enc_loop + jmp .L077cbc_enc_loop .align 16 -.L073cbc_decrypt: +.L075cbc_decrypt: cmpl 
$80,%eax - jbe .L078cbc_dec_tail + jbe .L080cbc_dec_tail movaps %xmm7,(%esp) subl $80,%eax - jmp .L079cbc_dec_loop6_enter + jmp .L081cbc_dec_loop6_enter .align 16 -.L080cbc_dec_loop6: +.L082cbc_dec_loop6: movaps %xmm0,(%esp) movups %xmm7,(%edi) leal 16(%edi),%edi -.L079cbc_dec_loop6_enter: +.L081cbc_dec_loop6_enter: movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -1837,28 +1903,28 @@ aesni_cbc_encrypt: movups %xmm6,64(%edi) leal 80(%edi),%edi subl $96,%eax - ja .L080cbc_dec_loop6 + ja .L082cbc_dec_loop6 movaps %xmm7,%xmm2 movaps %xmm0,%xmm7 addl $80,%eax - jle .L081cbc_dec_tail_collected + jle .L083cbc_dec_clear_tail_collected movups %xmm2,(%edi) leal 16(%edi),%edi -.L078cbc_dec_tail: +.L080cbc_dec_tail: movups (%esi),%xmm2 movaps %xmm2,%xmm6 cmpl $16,%eax - jbe .L082cbc_dec_one + jbe .L084cbc_dec_one movups 16(%esi),%xmm3 movaps %xmm3,%xmm5 cmpl $32,%eax - jbe .L083cbc_dec_two + jbe .L085cbc_dec_two movups 32(%esi),%xmm4 cmpl $48,%eax - jbe .L084cbc_dec_three + jbe .L086cbc_dec_three movups 48(%esi),%xmm5 cmpl $64,%eax - jbe .L085cbc_dec_four + jbe .L087cbc_dec_four movups 64(%esi),%xmm6 movaps %xmm7,(%esp) movups (%esi),%xmm2 @@ -1876,55 +1942,62 @@ aesni_cbc_encrypt: xorps %xmm0,%xmm6 movups %xmm2,(%edi) movups %xmm3,16(%edi) + pxor %xmm3,%xmm3 movups %xmm4,32(%edi) + pxor %xmm4,%xmm4 movups %xmm5,48(%edi) + pxor %xmm5,%xmm5 leal 64(%edi),%edi movaps %xmm6,%xmm2 + pxor %xmm6,%xmm6 subl $80,%eax - jmp .L081cbc_dec_tail_collected + jmp .L088cbc_dec_tail_collected .align 16 -.L082cbc_dec_one: +.L084cbc_dec_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L086dec1_loop_16: +.L089dec1_loop_16: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L086dec1_loop_16 + jnz .L089dec1_loop_16 .byte 102,15,56,223,209 xorps %xmm7,%xmm2 movaps %xmm6,%xmm7 subl $16,%eax - jmp .L081cbc_dec_tail_collected + jmp .L088cbc_dec_tail_collected .align 16 -.L083cbc_dec_two: +.L085cbc_dec_two: call 
_aesni_decrypt2 xorps %xmm7,%xmm2 xorps %xmm6,%xmm3 movups %xmm2,(%edi) movaps %xmm3,%xmm2 + pxor %xmm3,%xmm3 leal 16(%edi),%edi movaps %xmm5,%xmm7 subl $32,%eax - jmp .L081cbc_dec_tail_collected + jmp .L088cbc_dec_tail_collected .align 16 -.L084cbc_dec_three: +.L086cbc_dec_three: call _aesni_decrypt3 xorps %xmm7,%xmm2 xorps %xmm6,%xmm3 xorps %xmm5,%xmm4 movups %xmm2,(%edi) movaps %xmm4,%xmm2 + pxor %xmm4,%xmm4 movups %xmm3,16(%edi) + pxor %xmm3,%xmm3 leal 32(%edi),%edi movups 32(%esi),%xmm7 subl $48,%eax - jmp .L081cbc_dec_tail_collected + jmp .L088cbc_dec_tail_collected .align 16 -.L085cbc_dec_four: +.L087cbc_dec_four: call _aesni_decrypt4 movups 16(%esi),%xmm1 movups 32(%esi),%xmm0 @@ -1934,28 +2007,44 @@ aesni_cbc_encrypt: movups %xmm2,(%edi) xorps %xmm1,%xmm4 movups %xmm3,16(%edi) + pxor %xmm3,%xmm3 xorps %xmm0,%xmm5 movups %xmm4,32(%edi) + pxor %xmm4,%xmm4 leal 48(%edi),%edi movaps %xmm5,%xmm2 + pxor %xmm5,%xmm5 subl $64,%eax -.L081cbc_dec_tail_collected: + jmp .L088cbc_dec_tail_collected +.align 16 +.L083cbc_dec_clear_tail_collected: + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 +.L088cbc_dec_tail_collected: andl $15,%eax - jnz .L087cbc_dec_tail_partial + jnz .L090cbc_dec_tail_partial movups %xmm2,(%edi) - jmp .L077cbc_ret + pxor %xmm0,%xmm0 + jmp .L079cbc_ret .align 16 -.L087cbc_dec_tail_partial: +.L090cbc_dec_tail_partial: movaps %xmm2,(%esp) + pxor %xmm0,%xmm0 movl $16,%ecx movl %esp,%esi subl %eax,%ecx .long 2767451785 -.L077cbc_ret: + movdqa %xmm2,(%esp) +.L079cbc_ret: movl 16(%esp),%esp movl 36(%esp),%ebp + pxor %xmm2,%xmm2 + pxor %xmm1,%xmm1 movups %xmm7,(%ebp) -.L072cbc_abort: + pxor %xmm7,%xmm7 +.L074cbc_abort: popl %edi popl %esi popl %ebx @@ -1965,52 +2054,62 @@ aesni_cbc_encrypt: .type _aesni_set_encrypt_key,@function .align 16 _aesni_set_encrypt_key: + pushl %ebp + pushl %ebx testl %eax,%eax - jz .L088bad_pointer + jz .L091bad_pointer testl %edx,%edx - jz .L088bad_pointer + jz .L091bad_pointer + call .L092pic 
+.L092pic: + popl %ebx + leal .Lkey_const-.L092pic(%ebx),%ebx + leal OPENSSL_ia32cap_P,%ebp movups (%eax),%xmm0 xorps %xmm4,%xmm4 + movl 4(%ebp),%ebp leal 16(%edx),%edx + andl $268437504,%ebp cmpl $256,%ecx - je .L08914rounds + je .L09314rounds cmpl $192,%ecx - je .L09012rounds + je .L09412rounds cmpl $128,%ecx - jne .L091bad_keybits + jne .L095bad_keybits .align 16 -.L09210rounds: +.L09610rounds: + cmpl $268435456,%ebp + je .L09710rounds_alt movl $9,%ecx movups %xmm0,-16(%edx) .byte 102,15,58,223,200,1 - call .L093key_128_cold + call .L098key_128_cold .byte 102,15,58,223,200,2 - call .L094key_128 + call .L099key_128 .byte 102,15,58,223,200,4 - call .L094key_128 + call .L099key_128 .byte 102,15,58,223,200,8 - call .L094key_128 + call .L099key_128 .byte 102,15,58,223,200,16 - call .L094key_128 + call .L099key_128 .byte 102,15,58,223,200,32 - call .L094key_128 + call .L099key_128 .byte 102,15,58,223,200,64 - call .L094key_128 + call .L099key_128 .byte 102,15,58,223,200,128 - call .L094key_128 + call .L099key_128 .byte 102,15,58,223,200,27 - call .L094key_128 + call .L099key_128 .byte 102,15,58,223,200,54 - call .L094key_128 + call .L099key_128 movups %xmm0,(%edx) movl %ecx,80(%edx) - xorl %eax,%eax - ret + jmp .L100good_key .align 16 -.L094key_128: +.L099key_128: movups %xmm0,(%edx) leal 16(%edx),%edx -.L093key_128_cold: +.L098key_128_cold: shufps $16,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $140,%xmm0,%xmm4 @@ -2019,38 +2118,91 @@ _aesni_set_encrypt_key: xorps %xmm1,%xmm0 ret .align 16 -.L09012rounds: +.L09710rounds_alt: + movdqa (%ebx),%xmm5 + movl $8,%ecx + movdqa 32(%ebx),%xmm4 + movdqa %xmm0,%xmm2 + movdqu %xmm0,-16(%edx) +.L101loop_key128: +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + leal 16(%edx),%edx + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,-16(%edx) + movdqa %xmm0,%xmm2 + decl %ecx + jnz .L101loop_key128 + movdqa 
48(%ebx),%xmm4 +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,(%edx) + movdqa %xmm0,%xmm2 +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,16(%edx) + movl $9,%ecx + movl %ecx,96(%edx) + jmp .L100good_key +.align 16 +.L09412rounds: movq 16(%eax),%xmm2 + cmpl $268435456,%ebp + je .L10212rounds_alt movl $11,%ecx movups %xmm0,-16(%edx) .byte 102,15,58,223,202,1 - call .L095key_192a_cold + call .L103key_192a_cold .byte 102,15,58,223,202,2 - call .L096key_192b + call .L104key_192b .byte 102,15,58,223,202,4 - call .L097key_192a + call .L105key_192a .byte 102,15,58,223,202,8 - call .L096key_192b + call .L104key_192b .byte 102,15,58,223,202,16 - call .L097key_192a + call .L105key_192a .byte 102,15,58,223,202,32 - call .L096key_192b + call .L104key_192b .byte 102,15,58,223,202,64 - call .L097key_192a + call .L105key_192a .byte 102,15,58,223,202,128 - call .L096key_192b + call .L104key_192b movups %xmm0,(%edx) movl %ecx,48(%edx) - xorl %eax,%eax - ret + jmp .L100good_key .align 16 -.L097key_192a: +.L105key_192a: movups %xmm0,(%edx) leal 16(%edx),%edx .align 16 -.L095key_192a_cold: +.L103key_192a_cold: movaps %xmm2,%xmm5 -.L098key_192b_warm: +.L106key_192b_warm: shufps $16,%xmm0,%xmm4 movdqa %xmm2,%xmm3 xorps %xmm4,%xmm0 @@ -2064,56 +2216,90 @@ _aesni_set_encrypt_key: pxor %xmm3,%xmm2 ret .align 16 -.L096key_192b: +.L104key_192b: movaps %xmm0,%xmm3 shufps $68,%xmm0,%xmm5 movups %xmm5,(%edx) shufps $78,%xmm2,%xmm3 movups %xmm3,16(%edx) leal 32(%edx),%edx - jmp .L098key_192b_warm + jmp .L106key_192b_warm +.align 16 +.L10212rounds_alt: + movdqa 16(%ebx),%xmm5 + movdqa 32(%ebx),%xmm4 + movl $8,%ecx + movdqu %xmm0,-16(%edx) 
+.L107loop_key192: + movq %xmm2,(%edx) + movdqa %xmm2,%xmm1 +.byte 102,15,56,0,213 +.byte 102,15,56,221,212 + pslld $1,%xmm4 + leal 24(%edx),%edx + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + pshufd $255,%xmm0,%xmm3 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pxor %xmm2,%xmm0 + pxor %xmm3,%xmm2 + movdqu %xmm0,-16(%edx) + decl %ecx + jnz .L107loop_key192 + movl $11,%ecx + movl %ecx,32(%edx) + jmp .L100good_key .align 16 -.L08914rounds: +.L09314rounds: movups 16(%eax),%xmm2 - movl $13,%ecx leal 16(%edx),%edx + cmpl $268435456,%ebp + je .L10814rounds_alt + movl $13,%ecx movups %xmm0,-32(%edx) movups %xmm2,-16(%edx) .byte 102,15,58,223,202,1 - call .L099key_256a_cold + call .L109key_256a_cold .byte 102,15,58,223,200,1 - call .L100key_256b + call .L110key_256b .byte 102,15,58,223,202,2 - call .L101key_256a + call .L111key_256a .byte 102,15,58,223,200,2 - call .L100key_256b + call .L110key_256b .byte 102,15,58,223,202,4 - call .L101key_256a + call .L111key_256a .byte 102,15,58,223,200,4 - call .L100key_256b + call .L110key_256b .byte 102,15,58,223,202,8 - call .L101key_256a + call .L111key_256a .byte 102,15,58,223,200,8 - call .L100key_256b + call .L110key_256b .byte 102,15,58,223,202,16 - call .L101key_256a + call .L111key_256a .byte 102,15,58,223,200,16 - call .L100key_256b + call .L110key_256b .byte 102,15,58,223,202,32 - call .L101key_256a + call .L111key_256a .byte 102,15,58,223,200,32 - call .L100key_256b + call .L110key_256b .byte 102,15,58,223,202,64 - call .L101key_256a + call .L111key_256a movups %xmm0,(%edx) movl %ecx,16(%edx) xorl %eax,%eax - ret + jmp .L100good_key .align 16 -.L101key_256a: +.L111key_256a: movups %xmm2,(%edx) leal 16(%edx),%edx -.L099key_256a_cold: +.L109key_256a_cold: shufps $16,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $140,%xmm0,%xmm4 @@ -2122,7 +2308,7 @@ _aesni_set_encrypt_key: xorps %xmm1,%xmm0 ret .align 16 -.L100key_256b: +.L110key_256b: 
movups %xmm0,(%edx) leal 16(%edx),%edx shufps $16,%xmm2,%xmm4 @@ -2132,13 +2318,70 @@ _aesni_set_encrypt_key: shufps $170,%xmm1,%xmm1 xorps %xmm1,%xmm2 ret +.align 16 +.L10814rounds_alt: + movdqa (%ebx),%xmm5 + movdqa 32(%ebx),%xmm4 + movl $7,%ecx + movdqu %xmm0,-32(%edx) + movdqa %xmm2,%xmm1 + movdqu %xmm2,-16(%edx) +.L112loop_key256: +.byte 102,15,56,0,213 +.byte 102,15,56,221,212 + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + pslld $1,%xmm4 + pxor %xmm2,%xmm0 + movdqu %xmm0,(%edx) + decl %ecx + jz .L113done_key256 + pshufd $255,%xmm0,%xmm2 + pxor %xmm3,%xmm3 +.byte 102,15,56,221,211 + movdqa %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm3,%xmm1 + pxor %xmm1,%xmm2 + movdqu %xmm2,16(%edx) + leal 32(%edx),%edx + movdqa %xmm2,%xmm1 + jmp .L112loop_key256 +.L113done_key256: + movl $13,%ecx + movl %ecx,16(%edx) +.L100good_key: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + xorl %eax,%eax + popl %ebx + popl %ebp + ret .align 4 -.L088bad_pointer: +.L091bad_pointer: movl $-1,%eax + popl %ebx + popl %ebp ret .align 4 -.L091bad_keybits: +.L095bad_keybits: + pxor %xmm0,%xmm0 movl $-2,%eax + popl %ebx + popl %ebp ret .size _aesni_set_encrypt_key,.-_aesni_set_encrypt_key .globl aesni_set_encrypt_key @@ -2164,7 +2407,7 @@ aesni_set_decrypt_key: movl 12(%esp),%edx shll $4,%ecx testl %eax,%eax - jnz .L102dec_key_ret + jnz .L114dec_key_ret leal 16(%edx,%ecx,1),%eax movups (%edx),%xmm0 movups (%eax),%xmm1 @@ -2172,7 +2415,7 @@ aesni_set_decrypt_key: movups %xmm1,(%edx) leal 16(%edx),%edx leal -16(%eax),%eax -.L103dec_key_inverse: +.L115dec_key_inverse: movups (%edx),%xmm0 movups (%eax),%xmm1 .byte 102,15,56,219,192 @@ -2182,15 +2425,24 @@ aesni_set_decrypt_key: movups %xmm0,16(%eax) movups %xmm1,-16(%edx) cmpl %edx,%eax - ja .L103dec_key_inverse + ja 
.L115dec_key_inverse movups (%edx),%xmm0 .byte 102,15,56,219,192 movups %xmm0,(%edx) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 xorl %eax,%eax -.L102dec_key_ret: +.L114dec_key_ret: ret .size aesni_set_decrypt_key,.-.L_aesni_set_decrypt_key_begin +.align 64 +.Lkey_const: +.long 202313229,202313229,202313229,202313229 +.long 67569157,67569157,67569157,67569157 +.long 1,1,1,1 +.long 27,27,27,27 .byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69 .byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83 .byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 .byte 115,108,46,111,114,103,62,0 +.comm OPENSSL_ia32cap_P,16,4 diff --git a/deps/openssl/asm/x86-macosx-gas/aes/aesni-x86.s b/deps/openssl/asm/x86-macosx-gas/aes/aesni-x86.s index cecd5f83f7..c1f5aec62c 100644 --- a/deps/openssl/asm/x86-macosx-gas/aes/aesni-x86.s +++ b/deps/openssl/asm/x86-macosx-gas/aes/aesni-x86.s @@ -20,7 +20,10 @@ L000enc1_loop_1: leal 16(%edx),%edx jnz L000enc1_loop_1 .byte 102,15,56,221,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 movups %xmm2,(%eax) + pxor %xmm2,%xmm2 ret .globl _aesni_decrypt .align 4 @@ -42,7 +45,10 @@ L001dec1_loop_2: leal 16(%edx),%edx jnz L001dec1_loop_2 .byte 102,15,56,223,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 movups %xmm2,(%eax) + pxor %xmm2,%xmm2 ret .align 4 __aesni_encrypt2: @@ -242,17 +248,15 @@ __aesni_encrypt6: negl %ecx .byte 102,15,56,220,225 pxor %xmm0,%xmm7 + movups (%edx,%ecx,1),%xmm0 addl $16,%ecx -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 - movups -16(%edx,%ecx,1),%xmm0 - jmp L_aesni_encrypt6_enter + jmp L008_aesni_encrypt6_inner .align 4,0x90 -L008enc6_loop: +L009enc6_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 +L008_aesni_encrypt6_inner: .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 @@ -266,7 +270,7 @@ L_aesni_encrypt6_enter: .byte 102,15,56,220,240 .byte 102,15,56,220,248 movups -16(%edx,%ecx,1),%xmm0 - jnz L008enc6_loop + jnz L009enc6_loop .byte 
102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -296,17 +300,15 @@ __aesni_decrypt6: negl %ecx .byte 102,15,56,222,225 pxor %xmm0,%xmm7 + movups (%edx,%ecx,1),%xmm0 addl $16,%ecx -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 - movups -16(%edx,%ecx,1),%xmm0 - jmp L_aesni_decrypt6_enter + jmp L010_aesni_decrypt6_inner .align 4,0x90 -L009dec6_loop: +L011dec6_loop: .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 +L010_aesni_decrypt6_inner: .byte 102,15,56,222,233 .byte 102,15,56,222,241 .byte 102,15,56,222,249 @@ -320,7 +322,7 @@ L_aesni_decrypt6_enter: .byte 102,15,56,222,240 .byte 102,15,56,222,248 movups -16(%edx,%ecx,1),%xmm0 - jnz L009dec6_loop + jnz L011dec6_loop .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -348,14 +350,14 @@ L_aesni_ecb_encrypt_begin: movl 32(%esp),%edx movl 36(%esp),%ebx andl $-16,%eax - jz L010ecb_ret + jz L012ecb_ret movl 240(%edx),%ecx testl %ebx,%ebx - jz L011ecb_decrypt + jz L013ecb_decrypt movl %edx,%ebp movl %ecx,%ebx cmpl $96,%eax - jb L012ecb_enc_tail + jb L014ecb_enc_tail movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -364,9 +366,9 @@ L_aesni_ecb_encrypt_begin: movdqu 80(%esi),%xmm7 leal 96(%esi),%esi subl $96,%eax - jmp L013ecb_enc_loop6_enter + jmp L015ecb_enc_loop6_enter .align 4,0x90 -L014ecb_enc_loop6: +L016ecb_enc_loop6: movups %xmm2,(%edi) movdqu (%esi),%xmm2 movups %xmm3,16(%edi) @@ -381,12 +383,12 @@ L014ecb_enc_loop6: leal 96(%edi),%edi movdqu 80(%esi),%xmm7 leal 96(%esi),%esi -L013ecb_enc_loop6_enter: +L015ecb_enc_loop6_enter: call __aesni_encrypt6 movl %ebp,%edx movl %ebx,%ecx subl $96,%eax - jnc L014ecb_enc_loop6 + jnc L016ecb_enc_loop6 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) @@ -395,18 +397,18 @@ L013ecb_enc_loop6_enter: movups %xmm7,80(%edi) leal 96(%edi),%edi addl $96,%eax - jz L010ecb_ret -L012ecb_enc_tail: + jz L012ecb_ret +L014ecb_enc_tail: movups (%esi),%xmm2 cmpl 
$32,%eax - jb L015ecb_enc_one + jb L017ecb_enc_one movups 16(%esi),%xmm3 - je L016ecb_enc_two + je L018ecb_enc_two movups 32(%esi),%xmm4 cmpl $64,%eax - jb L017ecb_enc_three + jb L019ecb_enc_three movups 48(%esi),%xmm5 - je L018ecb_enc_four + je L020ecb_enc_four movups 64(%esi),%xmm6 xorps %xmm7,%xmm7 call __aesni_encrypt6 @@ -415,49 +417,49 @@ L012ecb_enc_tail: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp L010ecb_ret + jmp L012ecb_ret .align 4,0x90 -L015ecb_enc_one: +L017ecb_enc_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L019enc1_loop_3: +L021enc1_loop_3: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L019enc1_loop_3 + jnz L021enc1_loop_3 .byte 102,15,56,221,209 movups %xmm2,(%edi) - jmp L010ecb_ret + jmp L012ecb_ret .align 4,0x90 -L016ecb_enc_two: +L018ecb_enc_two: call __aesni_encrypt2 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp L010ecb_ret + jmp L012ecb_ret .align 4,0x90 -L017ecb_enc_three: +L019ecb_enc_three: call __aesni_encrypt3 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp L010ecb_ret + jmp L012ecb_ret .align 4,0x90 -L018ecb_enc_four: +L020ecb_enc_four: call __aesni_encrypt4 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) movups %xmm5,48(%edi) - jmp L010ecb_ret + jmp L012ecb_ret .align 4,0x90 -L011ecb_decrypt: +L013ecb_decrypt: movl %edx,%ebp movl %ecx,%ebx cmpl $96,%eax - jb L020ecb_dec_tail + jb L022ecb_dec_tail movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -466,9 +468,9 @@ L011ecb_decrypt: movdqu 80(%esi),%xmm7 leal 96(%esi),%esi subl $96,%eax - jmp L021ecb_dec_loop6_enter + jmp L023ecb_dec_loop6_enter .align 4,0x90 -L022ecb_dec_loop6: +L024ecb_dec_loop6: movups %xmm2,(%edi) movdqu (%esi),%xmm2 movups %xmm3,16(%edi) @@ -483,12 +485,12 @@ L022ecb_dec_loop6: leal 96(%edi),%edi movdqu 80(%esi),%xmm7 leal 96(%esi),%esi -L021ecb_dec_loop6_enter: +L023ecb_dec_loop6_enter: call __aesni_decrypt6 
movl %ebp,%edx movl %ebx,%ecx subl $96,%eax - jnc L022ecb_dec_loop6 + jnc L024ecb_dec_loop6 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) @@ -497,18 +499,18 @@ L021ecb_dec_loop6_enter: movups %xmm7,80(%edi) leal 96(%edi),%edi addl $96,%eax - jz L010ecb_ret -L020ecb_dec_tail: + jz L012ecb_ret +L022ecb_dec_tail: movups (%esi),%xmm2 cmpl $32,%eax - jb L023ecb_dec_one + jb L025ecb_dec_one movups 16(%esi),%xmm3 - je L024ecb_dec_two + je L026ecb_dec_two movups 32(%esi),%xmm4 cmpl $64,%eax - jb L025ecb_dec_three + jb L027ecb_dec_three movups 48(%esi),%xmm5 - je L026ecb_dec_four + je L028ecb_dec_four movups 64(%esi),%xmm6 xorps %xmm7,%xmm7 call __aesni_decrypt6 @@ -517,43 +519,51 @@ L020ecb_dec_tail: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp L010ecb_ret + jmp L012ecb_ret .align 4,0x90 -L023ecb_dec_one: +L025ecb_dec_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L027dec1_loop_4: +L029dec1_loop_4: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L027dec1_loop_4 + jnz L029dec1_loop_4 .byte 102,15,56,223,209 movups %xmm2,(%edi) - jmp L010ecb_ret + jmp L012ecb_ret .align 4,0x90 -L024ecb_dec_two: +L026ecb_dec_two: call __aesni_decrypt2 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp L010ecb_ret + jmp L012ecb_ret .align 4,0x90 -L025ecb_dec_three: +L027ecb_dec_three: call __aesni_decrypt3 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp L010ecb_ret + jmp L012ecb_ret .align 4,0x90 -L026ecb_dec_four: +L028ecb_dec_four: call __aesni_decrypt4 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) movups %xmm5,48(%edi) -L010ecb_ret: +L012ecb_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 popl %edi popl %esi popl %ebx @@ -598,7 +608,7 @@ L_aesni_ccm64_encrypt_blocks_begin: leal 32(%edx,%ecx,1),%edx subl %ecx,%ebx .byte 102,15,56,0,253 
-L028ccm64_enc_outer: +L030ccm64_enc_outer: movups (%ebp),%xmm0 movl %ebx,%ecx movups (%esi),%xmm6 @@ -607,7 +617,7 @@ L028ccm64_enc_outer: xorps %xmm6,%xmm0 xorps %xmm0,%xmm3 movups 32(%ebp),%xmm0 -L029ccm64_enc2_loop: +L031ccm64_enc2_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 movups (%edx,%ecx,1),%xmm1 @@ -615,7 +625,7 @@ L029ccm64_enc2_loop: .byte 102,15,56,220,208 .byte 102,15,56,220,216 movups -16(%edx,%ecx,1),%xmm0 - jnz L029ccm64_enc2_loop + jnz L031ccm64_enc2_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 paddq 16(%esp),%xmm7 @@ -628,10 +638,18 @@ L029ccm64_enc2_loop: movups %xmm6,(%edi) .byte 102,15,56,0,213 leal 16(%edi),%edi - jnz L028ccm64_enc_outer + jnz L030ccm64_enc_outer movl 48(%esp),%esp movl 40(%esp),%edi movups %xmm3,(%edi) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 popl %edi popl %esi popl %ebx @@ -677,12 +695,12 @@ L_aesni_ccm64_decrypt_blocks_begin: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L030enc1_loop_5: +L032enc1_loop_5: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L030enc1_loop_5 + jnz L032enc1_loop_5 .byte 102,15,56,221,209 shll $4,%ebx movl $16,%ecx @@ -692,16 +710,16 @@ L030enc1_loop_5: subl %ebx,%ecx leal 32(%ebp,%ebx,1),%edx movl %ecx,%ebx - jmp L031ccm64_dec_outer + jmp L033ccm64_dec_outer .align 4,0x90 -L031ccm64_dec_outer: +L033ccm64_dec_outer: xorps %xmm2,%xmm6 movdqa %xmm7,%xmm2 movups %xmm6,(%edi) leal 16(%edi),%edi .byte 102,15,56,0,213 subl $1,%eax - jz L032ccm64_dec_break + jz L034ccm64_dec_break movups (%ebp),%xmm0 movl %ebx,%ecx movups 16(%ebp),%xmm1 @@ -709,7 +727,7 @@ L031ccm64_dec_outer: xorps %xmm0,%xmm2 xorps %xmm6,%xmm3 movups 32(%ebp),%xmm0 -L033ccm64_dec2_loop: +L035ccm64_dec2_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 movups (%edx,%ecx,1),%xmm1 @@ -717,7 +735,7 @@ L033ccm64_dec2_loop: .byte 102,15,56,220,208 .byte 
102,15,56,220,216 movups -16(%edx,%ecx,1),%xmm0 - jnz L033ccm64_dec2_loop + jnz L035ccm64_dec2_loop movups (%esi),%xmm6 paddq 16(%esp),%xmm7 .byte 102,15,56,220,209 @@ -725,9 +743,9 @@ L033ccm64_dec2_loop: .byte 102,15,56,221,208 .byte 102,15,56,221,216 leal 16(%esi),%esi - jmp L031ccm64_dec_outer + jmp L033ccm64_dec_outer .align 4,0x90 -L032ccm64_dec_break: +L034ccm64_dec_break: movl 240(%ebp),%ecx movl %ebp,%edx movups (%edx),%xmm0 @@ -735,16 +753,24 @@ L032ccm64_dec_break: xorps %xmm0,%xmm6 leal 32(%edx),%edx xorps %xmm6,%xmm3 -L034enc1_loop_6: +L036enc1_loop_6: .byte 102,15,56,220,217 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L034enc1_loop_6 + jnz L036enc1_loop_6 .byte 102,15,56,221,217 movl 48(%esp),%esp movl 40(%esp),%edi movups %xmm3,(%edi) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 popl %edi popl %esi popl %ebx @@ -768,7 +794,7 @@ L_aesni_ctr32_encrypt_blocks_begin: andl $-16,%esp movl %ebp,80(%esp) cmpl $1,%eax - je L035ctr32_one_shortcut + je L037ctr32_one_shortcut movdqu (%ebx),%xmm7 movl $202182159,(%esp) movl $134810123,4(%esp) @@ -806,7 +832,7 @@ L_aesni_ctr32_encrypt_blocks_begin: pshufd $192,%xmm0,%xmm2 pshufd $128,%xmm0,%xmm3 cmpl $6,%eax - jb L036ctr32_tail + jb L038ctr32_tail pxor %xmm6,%xmm7 shll $4,%ecx movl $16,%ebx @@ -815,9 +841,9 @@ L_aesni_ctr32_encrypt_blocks_begin: subl %ecx,%ebx leal 32(%edx,%ecx,1),%edx subl $6,%eax - jmp L037ctr32_loop6 + jmp L039ctr32_loop6 .align 4,0x90 -L037ctr32_loop6: +L039ctr32_loop6: pshufd $64,%xmm0,%xmm4 movdqa 32(%esp),%xmm0 pshufd $192,%xmm1,%xmm5 @@ -871,27 +897,27 @@ L037ctr32_loop6: leal 96(%edi),%edi pshufd $128,%xmm0,%xmm3 subl $6,%eax - jnc L037ctr32_loop6 + jnc L039ctr32_loop6 addl $6,%eax - jz L038ctr32_ret + jz L040ctr32_ret movdqu (%ebp),%xmm7 movl %ebp,%edx pxor 32(%esp),%xmm7 movl 240(%ebp),%ecx -L036ctr32_tail: +L038ctr32_tail: por %xmm7,%xmm2 cmpl $2,%eax - jb L039ctr32_one 
+ jb L041ctr32_one pshufd $64,%xmm0,%xmm4 por %xmm7,%xmm3 - je L040ctr32_two + je L042ctr32_two pshufd $192,%xmm1,%xmm5 por %xmm7,%xmm4 cmpl $4,%eax - jb L041ctr32_three + jb L043ctr32_three pshufd $128,%xmm1,%xmm6 por %xmm7,%xmm5 - je L042ctr32_four + je L044ctr32_four por %xmm7,%xmm6 call __aesni_encrypt6 movups (%esi),%xmm1 @@ -909,29 +935,29 @@ L036ctr32_tail: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp L038ctr32_ret + jmp L040ctr32_ret .align 4,0x90 -L035ctr32_one_shortcut: +L037ctr32_one_shortcut: movups (%ebx),%xmm2 movl 240(%edx),%ecx -L039ctr32_one: +L041ctr32_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L043enc1_loop_7: +L045enc1_loop_7: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L043enc1_loop_7 + jnz L045enc1_loop_7 .byte 102,15,56,221,209 movups (%esi),%xmm6 xorps %xmm2,%xmm6 movups %xmm6,(%edi) - jmp L038ctr32_ret + jmp L040ctr32_ret .align 4,0x90 -L040ctr32_two: +L042ctr32_two: call __aesni_encrypt2 movups (%esi),%xmm5 movups 16(%esi),%xmm6 @@ -939,9 +965,9 @@ L040ctr32_two: xorps %xmm6,%xmm3 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp L038ctr32_ret + jmp L040ctr32_ret .align 4,0x90 -L041ctr32_three: +L043ctr32_three: call __aesni_encrypt3 movups (%esi),%xmm5 movups 16(%esi),%xmm6 @@ -952,9 +978,9 @@ L041ctr32_three: xorps %xmm7,%xmm4 movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp L038ctr32_ret + jmp L040ctr32_ret .align 4,0x90 -L042ctr32_four: +L044ctr32_four: call __aesni_encrypt4 movups (%esi),%xmm6 movups 16(%esi),%xmm7 @@ -968,7 +994,18 @@ L042ctr32_four: xorps %xmm0,%xmm5 movups %xmm4,32(%edi) movups %xmm5,48(%edi) -L038ctr32_ret: +L040ctr32_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + movdqa %xmm0,32(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm0,48(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm0,64(%esp) + pxor %xmm7,%xmm7 movl 80(%esp),%esp popl %edi popl %esi @@ -991,12 +1028,12 @@ 
L_aesni_xts_encrypt_begin: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L044enc1_loop_8: +L046enc1_loop_8: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L044enc1_loop_8 + jnz L046enc1_loop_8 .byte 102,15,56,221,209 movl 20(%esp),%esi movl 24(%esp),%edi @@ -1020,14 +1057,14 @@ L044enc1_loop_8: movl %edx,%ebp movl %ecx,%ebx subl $96,%eax - jc L045xts_enc_short + jc L047xts_enc_short shll $4,%ecx movl $16,%ebx subl %ecx,%ebx leal 32(%edx,%ecx,1),%edx - jmp L046xts_enc_loop6 + jmp L048xts_enc_loop6 .align 4,0x90 -L046xts_enc_loop6: +L048xts_enc_loop6: pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,(%esp) @@ -1116,23 +1153,23 @@ L046xts_enc_loop6: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 subl $96,%eax - jnc L046xts_enc_loop6 + jnc L048xts_enc_loop6 movl 240(%ebp),%ecx movl %ebp,%edx movl %ecx,%ebx -L045xts_enc_short: +L047xts_enc_short: addl $96,%eax - jz L047xts_enc_done6x + jz L049xts_enc_done6x movdqa %xmm1,%xmm5 cmpl $32,%eax - jb L048xts_enc_one + jb L050xts_enc_one pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 - je L049xts_enc_two + je L051xts_enc_two pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm6 @@ -1141,7 +1178,7 @@ L045xts_enc_short: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 cmpl $64,%eax - jb L050xts_enc_three + jb L052xts_enc_three pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm7 @@ -1151,7 +1188,7 @@ L045xts_enc_short: pxor %xmm2,%xmm1 movdqa %xmm5,(%esp) movdqa %xmm6,16(%esp) - je L051xts_enc_four + je L053xts_enc_four movdqa %xmm7,32(%esp) pshufd $19,%xmm0,%xmm7 movdqa %xmm1,48(%esp) @@ -1183,9 +1220,9 @@ L045xts_enc_short: movups %xmm5,48(%edi) movups %xmm6,64(%edi) leal 80(%edi),%edi - jmp L052xts_enc_done + jmp L054xts_enc_done .align 4,0x90 -L048xts_enc_one: +L050xts_enc_one: movups (%esi),%xmm2 leal 16(%esi),%esi xorps %xmm5,%xmm2 @@ -1193,20 +1230,20 @@ L048xts_enc_one: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps 
%xmm0,%xmm2 -L053enc1_loop_9: +L055enc1_loop_9: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L053enc1_loop_9 + jnz L055enc1_loop_9 .byte 102,15,56,221,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) leal 16(%edi),%edi movdqa %xmm5,%xmm1 - jmp L052xts_enc_done + jmp L054xts_enc_done .align 4,0x90 -L049xts_enc_two: +L051xts_enc_two: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1220,9 +1257,9 @@ L049xts_enc_two: movups %xmm3,16(%edi) leal 32(%edi),%edi movdqa %xmm6,%xmm1 - jmp L052xts_enc_done + jmp L054xts_enc_done .align 4,0x90 -L050xts_enc_three: +L052xts_enc_three: movaps %xmm1,%xmm7 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1240,9 +1277,9 @@ L050xts_enc_three: movups %xmm4,32(%edi) leal 48(%edi),%edi movdqa %xmm7,%xmm1 - jmp L052xts_enc_done + jmp L054xts_enc_done .align 4,0x90 -L051xts_enc_four: +L053xts_enc_four: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1264,28 +1301,28 @@ L051xts_enc_four: movups %xmm5,48(%edi) leal 64(%edi),%edi movdqa %xmm6,%xmm1 - jmp L052xts_enc_done + jmp L054xts_enc_done .align 4,0x90 -L047xts_enc_done6x: +L049xts_enc_done6x: movl 112(%esp),%eax andl $15,%eax - jz L054xts_enc_ret + jz L056xts_enc_ret movdqa %xmm1,%xmm5 movl %eax,112(%esp) - jmp L055xts_enc_steal + jmp L057xts_enc_steal .align 4,0x90 -L052xts_enc_done: +L054xts_enc_done: movl 112(%esp),%eax pxor %xmm0,%xmm0 andl $15,%eax - jz L054xts_enc_ret + jz L056xts_enc_ret pcmpgtd %xmm1,%xmm0 movl %eax,112(%esp) pshufd $19,%xmm0,%xmm5 paddq %xmm1,%xmm1 pand 96(%esp),%xmm5 pxor %xmm1,%xmm5 -L055xts_enc_steal: +L057xts_enc_steal: movzbl (%esi),%ecx movzbl -16(%edi),%edx leal 1(%esi),%esi @@ -1293,7 +1330,7 @@ L055xts_enc_steal: movb %dl,(%edi) leal 1(%edi),%edi subl $1,%eax - jnz L055xts_enc_steal + jnz L057xts_enc_steal subl 112(%esp),%edi movl %ebp,%edx movl %ebx,%ecx @@ -1303,16 +1340,30 @@ L055xts_enc_steal: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L056enc1_loop_10: +L058enc1_loop_10: 
.byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L056enc1_loop_10 + jnz L058enc1_loop_10 .byte 102,15,56,221,209 xorps %xmm5,%xmm2 movups %xmm2,-16(%edi) -L054xts_enc_ret: +L056xts_enc_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + movdqa %xmm0,(%esp) + pxor %xmm3,%xmm3 + movdqa %xmm0,16(%esp) + pxor %xmm4,%xmm4 + movdqa %xmm0,32(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm0,48(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm0,64(%esp) + pxor %xmm7,%xmm7 + movdqa %xmm0,80(%esp) movl 116(%esp),%esp popl %edi popl %esi @@ -1335,12 +1386,12 @@ L_aesni_xts_decrypt_begin: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L057enc1_loop_11: +L059enc1_loop_11: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L057enc1_loop_11 + jnz L059enc1_loop_11 .byte 102,15,56,221,209 movl 20(%esp),%esi movl 24(%esp),%edi @@ -1369,14 +1420,14 @@ L057enc1_loop_11: pcmpgtd %xmm1,%xmm0 andl $-16,%eax subl $96,%eax - jc L058xts_dec_short + jc L060xts_dec_short shll $4,%ecx movl $16,%ebx subl %ecx,%ebx leal 32(%edx,%ecx,1),%edx - jmp L059xts_dec_loop6 + jmp L061xts_dec_loop6 .align 4,0x90 -L059xts_dec_loop6: +L061xts_dec_loop6: pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,(%esp) @@ -1465,23 +1516,23 @@ L059xts_dec_loop6: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 subl $96,%eax - jnc L059xts_dec_loop6 + jnc L061xts_dec_loop6 movl 240(%ebp),%ecx movl %ebp,%edx movl %ecx,%ebx -L058xts_dec_short: +L060xts_dec_short: addl $96,%eax - jz L060xts_dec_done6x + jz L062xts_dec_done6x movdqa %xmm1,%xmm5 cmpl $32,%eax - jb L061xts_dec_one + jb L063xts_dec_one pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 - je L062xts_dec_two + je L064xts_dec_two pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm6 @@ -1490,7 +1541,7 @@ L058xts_dec_short: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 cmpl $64,%eax - jb L063xts_dec_three + jb L065xts_dec_three pshufd $19,%xmm0,%xmm2 pxor 
%xmm0,%xmm0 movdqa %xmm1,%xmm7 @@ -1500,7 +1551,7 @@ L058xts_dec_short: pxor %xmm2,%xmm1 movdqa %xmm5,(%esp) movdqa %xmm6,16(%esp) - je L064xts_dec_four + je L066xts_dec_four movdqa %xmm7,32(%esp) pshufd $19,%xmm0,%xmm7 movdqa %xmm1,48(%esp) @@ -1532,9 +1583,9 @@ L058xts_dec_short: movups %xmm5,48(%edi) movups %xmm6,64(%edi) leal 80(%edi),%edi - jmp L065xts_dec_done + jmp L067xts_dec_done .align 4,0x90 -L061xts_dec_one: +L063xts_dec_one: movups (%esi),%xmm2 leal 16(%esi),%esi xorps %xmm5,%xmm2 @@ -1542,20 +1593,20 @@ L061xts_dec_one: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L066dec1_loop_12: +L068dec1_loop_12: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L066dec1_loop_12 + jnz L068dec1_loop_12 .byte 102,15,56,223,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) leal 16(%edi),%edi movdqa %xmm5,%xmm1 - jmp L065xts_dec_done + jmp L067xts_dec_done .align 4,0x90 -L062xts_dec_two: +L064xts_dec_two: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1569,9 +1620,9 @@ L062xts_dec_two: movups %xmm3,16(%edi) leal 32(%edi),%edi movdqa %xmm6,%xmm1 - jmp L065xts_dec_done + jmp L067xts_dec_done .align 4,0x90 -L063xts_dec_three: +L065xts_dec_three: movaps %xmm1,%xmm7 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1589,9 +1640,9 @@ L063xts_dec_three: movups %xmm4,32(%edi) leal 48(%edi),%edi movdqa %xmm7,%xmm1 - jmp L065xts_dec_done + jmp L067xts_dec_done .align 4,0x90 -L064xts_dec_four: +L066xts_dec_four: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1613,20 +1664,20 @@ L064xts_dec_four: movups %xmm5,48(%edi) leal 64(%edi),%edi movdqa %xmm6,%xmm1 - jmp L065xts_dec_done + jmp L067xts_dec_done .align 4,0x90 -L060xts_dec_done6x: +L062xts_dec_done6x: movl 112(%esp),%eax andl $15,%eax - jz L067xts_dec_ret + jz L069xts_dec_ret movl %eax,112(%esp) - jmp L068xts_dec_only_one_more + jmp L070xts_dec_only_one_more .align 4,0x90 -L065xts_dec_done: +L067xts_dec_done: movl 112(%esp),%eax pxor %xmm0,%xmm0 andl $15,%eax 
- jz L067xts_dec_ret + jz L069xts_dec_ret pcmpgtd %xmm1,%xmm0 movl %eax,112(%esp) pshufd $19,%xmm0,%xmm2 @@ -1636,7 +1687,7 @@ L065xts_dec_done: pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 -L068xts_dec_only_one_more: +L070xts_dec_only_one_more: pshufd $19,%xmm0,%xmm5 movdqa %xmm1,%xmm6 paddq %xmm1,%xmm1 @@ -1650,16 +1701,16 @@ L068xts_dec_only_one_more: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L069dec1_loop_13: +L071dec1_loop_13: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L069dec1_loop_13 + jnz L071dec1_loop_13 .byte 102,15,56,223,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) -L070xts_dec_steal: +L072xts_dec_steal: movzbl 16(%esi),%ecx movzbl (%edi),%edx leal 1(%esi),%esi @@ -1667,7 +1718,7 @@ L070xts_dec_steal: movb %dl,16(%edi) leal 1(%edi),%edi subl $1,%eax - jnz L070xts_dec_steal + jnz L072xts_dec_steal subl 112(%esp),%edi movl %ebp,%edx movl %ebx,%ecx @@ -1677,16 +1728,30 @@ L070xts_dec_steal: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L071dec1_loop_14: +L073dec1_loop_14: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L071dec1_loop_14 + jnz L073dec1_loop_14 .byte 102,15,56,223,209 xorps %xmm6,%xmm2 movups %xmm2,(%edi) -L067xts_dec_ret: +L069xts_dec_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + movdqa %xmm0,(%esp) + pxor %xmm3,%xmm3 + movdqa %xmm0,16(%esp) + pxor %xmm4,%xmm4 + movdqa %xmm0,32(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm0,48(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm0,64(%esp) + pxor %xmm7,%xmm7 + movdqa %xmm0,80(%esp) movl 116(%esp),%esp popl %edi popl %esi @@ -1710,7 +1775,7 @@ L_aesni_cbc_encrypt_begin: movl 32(%esp),%edx movl 36(%esp),%ebp testl %eax,%eax - jz L072cbc_abort + jz L074cbc_abort cmpl $0,40(%esp) xchgl %esp,%ebx movups (%ebp),%xmm7 @@ -1718,14 +1783,14 @@ L_aesni_cbc_encrypt_begin: movl %edx,%ebp movl %ebx,16(%esp) movl %ecx,%ebx - je L073cbc_decrypt + je L075cbc_decrypt movaps %xmm7,%xmm2 cmpl $16,%eax - jb 
L074cbc_enc_tail + jb L076cbc_enc_tail subl $16,%eax - jmp L075cbc_enc_loop + jmp L077cbc_enc_loop .align 4,0x90 -L075cbc_enc_loop: +L077cbc_enc_loop: movups (%esi),%xmm7 leal 16(%esi),%esi movups (%edx),%xmm0 @@ -1733,24 +1798,25 @@ L075cbc_enc_loop: xorps %xmm0,%xmm7 leal 32(%edx),%edx xorps %xmm7,%xmm2 -L076enc1_loop_15: +L078enc1_loop_15: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L076enc1_loop_15 + jnz L078enc1_loop_15 .byte 102,15,56,221,209 movl %ebx,%ecx movl %ebp,%edx movups %xmm2,(%edi) leal 16(%edi),%edi subl $16,%eax - jnc L075cbc_enc_loop + jnc L077cbc_enc_loop addl $16,%eax - jnz L074cbc_enc_tail + jnz L076cbc_enc_tail movaps %xmm2,%xmm7 - jmp L077cbc_ret -L074cbc_enc_tail: + pxor %xmm2,%xmm2 + jmp L079cbc_ret +L076cbc_enc_tail: movl %eax,%ecx .long 2767451785 movl $16,%ecx @@ -1761,20 +1827,20 @@ L074cbc_enc_tail: movl %ebx,%ecx movl %edi,%esi movl %ebp,%edx - jmp L075cbc_enc_loop + jmp L077cbc_enc_loop .align 4,0x90 -L073cbc_decrypt: +L075cbc_decrypt: cmpl $80,%eax - jbe L078cbc_dec_tail + jbe L080cbc_dec_tail movaps %xmm7,(%esp) subl $80,%eax - jmp L079cbc_dec_loop6_enter + jmp L081cbc_dec_loop6_enter .align 4,0x90 -L080cbc_dec_loop6: +L082cbc_dec_loop6: movaps %xmm0,(%esp) movups %xmm7,(%edi) leal 16(%edi),%edi -L079cbc_dec_loop6_enter: +L081cbc_dec_loop6_enter: movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -1804,28 +1870,28 @@ L079cbc_dec_loop6_enter: movups %xmm6,64(%edi) leal 80(%edi),%edi subl $96,%eax - ja L080cbc_dec_loop6 + ja L082cbc_dec_loop6 movaps %xmm7,%xmm2 movaps %xmm0,%xmm7 addl $80,%eax - jle L081cbc_dec_tail_collected + jle L083cbc_dec_clear_tail_collected movups %xmm2,(%edi) leal 16(%edi),%edi -L078cbc_dec_tail: +L080cbc_dec_tail: movups (%esi),%xmm2 movaps %xmm2,%xmm6 cmpl $16,%eax - jbe L082cbc_dec_one + jbe L084cbc_dec_one movups 16(%esi),%xmm3 movaps %xmm3,%xmm5 cmpl $32,%eax - jbe L083cbc_dec_two + jbe L085cbc_dec_two movups 32(%esi),%xmm4 cmpl $48,%eax - jbe 
L084cbc_dec_three + jbe L086cbc_dec_three movups 48(%esi),%xmm5 cmpl $64,%eax - jbe L085cbc_dec_four + jbe L087cbc_dec_four movups 64(%esi),%xmm6 movaps %xmm7,(%esp) movups (%esi),%xmm2 @@ -1843,55 +1909,62 @@ L078cbc_dec_tail: xorps %xmm0,%xmm6 movups %xmm2,(%edi) movups %xmm3,16(%edi) + pxor %xmm3,%xmm3 movups %xmm4,32(%edi) + pxor %xmm4,%xmm4 movups %xmm5,48(%edi) + pxor %xmm5,%xmm5 leal 64(%edi),%edi movaps %xmm6,%xmm2 + pxor %xmm6,%xmm6 subl $80,%eax - jmp L081cbc_dec_tail_collected + jmp L088cbc_dec_tail_collected .align 4,0x90 -L082cbc_dec_one: +L084cbc_dec_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L086dec1_loop_16: +L089dec1_loop_16: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L086dec1_loop_16 + jnz L089dec1_loop_16 .byte 102,15,56,223,209 xorps %xmm7,%xmm2 movaps %xmm6,%xmm7 subl $16,%eax - jmp L081cbc_dec_tail_collected + jmp L088cbc_dec_tail_collected .align 4,0x90 -L083cbc_dec_two: +L085cbc_dec_two: call __aesni_decrypt2 xorps %xmm7,%xmm2 xorps %xmm6,%xmm3 movups %xmm2,(%edi) movaps %xmm3,%xmm2 + pxor %xmm3,%xmm3 leal 16(%edi),%edi movaps %xmm5,%xmm7 subl $32,%eax - jmp L081cbc_dec_tail_collected + jmp L088cbc_dec_tail_collected .align 4,0x90 -L084cbc_dec_three: +L086cbc_dec_three: call __aesni_decrypt3 xorps %xmm7,%xmm2 xorps %xmm6,%xmm3 xorps %xmm5,%xmm4 movups %xmm2,(%edi) movaps %xmm4,%xmm2 + pxor %xmm4,%xmm4 movups %xmm3,16(%edi) + pxor %xmm3,%xmm3 leal 32(%edi),%edi movups 32(%esi),%xmm7 subl $48,%eax - jmp L081cbc_dec_tail_collected + jmp L088cbc_dec_tail_collected .align 4,0x90 -L085cbc_dec_four: +L087cbc_dec_four: call __aesni_decrypt4 movups 16(%esi),%xmm1 movups 32(%esi),%xmm0 @@ -1901,28 +1974,44 @@ L085cbc_dec_four: movups %xmm2,(%edi) xorps %xmm1,%xmm4 movups %xmm3,16(%edi) + pxor %xmm3,%xmm3 xorps %xmm0,%xmm5 movups %xmm4,32(%edi) + pxor %xmm4,%xmm4 leal 48(%edi),%edi movaps %xmm5,%xmm2 + pxor %xmm5,%xmm5 subl $64,%eax -L081cbc_dec_tail_collected: + jmp 
L088cbc_dec_tail_collected +.align 4,0x90 +L083cbc_dec_clear_tail_collected: + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 +L088cbc_dec_tail_collected: andl $15,%eax - jnz L087cbc_dec_tail_partial + jnz L090cbc_dec_tail_partial movups %xmm2,(%edi) - jmp L077cbc_ret + pxor %xmm0,%xmm0 + jmp L079cbc_ret .align 4,0x90 -L087cbc_dec_tail_partial: +L090cbc_dec_tail_partial: movaps %xmm2,(%esp) + pxor %xmm0,%xmm0 movl $16,%ecx movl %esp,%esi subl %eax,%ecx .long 2767451785 -L077cbc_ret: + movdqa %xmm2,(%esp) +L079cbc_ret: movl 16(%esp),%esp movl 36(%esp),%ebp + pxor %xmm2,%xmm2 + pxor %xmm1,%xmm1 movups %xmm7,(%ebp) -L072cbc_abort: + pxor %xmm7,%xmm7 +L074cbc_abort: popl %edi popl %esi popl %ebx @@ -1930,52 +2019,62 @@ L072cbc_abort: ret .align 4 __aesni_set_encrypt_key: + pushl %ebp + pushl %ebx testl %eax,%eax - jz L088bad_pointer + jz L091bad_pointer testl %edx,%edx - jz L088bad_pointer + jz L091bad_pointer + call L092pic +L092pic: + popl %ebx + leal Lkey_const-L092pic(%ebx),%ebx + movl L_OPENSSL_ia32cap_P$non_lazy_ptr-Lkey_const(%ebx),%ebp movups (%eax),%xmm0 xorps %xmm4,%xmm4 + movl 4(%ebp),%ebp leal 16(%edx),%edx + andl $268437504,%ebp cmpl $256,%ecx - je L08914rounds + je L09314rounds cmpl $192,%ecx - je L09012rounds + je L09412rounds cmpl $128,%ecx - jne L091bad_keybits + jne L095bad_keybits .align 4,0x90 -L09210rounds: +L09610rounds: + cmpl $268435456,%ebp + je L09710rounds_alt movl $9,%ecx movups %xmm0,-16(%edx) .byte 102,15,58,223,200,1 - call L093key_128_cold + call L098key_128_cold .byte 102,15,58,223,200,2 - call L094key_128 + call L099key_128 .byte 102,15,58,223,200,4 - call L094key_128 + call L099key_128 .byte 102,15,58,223,200,8 - call L094key_128 + call L099key_128 .byte 102,15,58,223,200,16 - call L094key_128 + call L099key_128 .byte 102,15,58,223,200,32 - call L094key_128 + call L099key_128 .byte 102,15,58,223,200,64 - call L094key_128 + call L099key_128 .byte 102,15,58,223,200,128 - call L094key_128 + call L099key_128 
.byte 102,15,58,223,200,27 - call L094key_128 + call L099key_128 .byte 102,15,58,223,200,54 - call L094key_128 + call L099key_128 movups %xmm0,(%edx) movl %ecx,80(%edx) - xorl %eax,%eax - ret + jmp L100good_key .align 4,0x90 -L094key_128: +L099key_128: movups %xmm0,(%edx) leal 16(%edx),%edx -L093key_128_cold: +L098key_128_cold: shufps $16,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $140,%xmm0,%xmm4 @@ -1984,38 +2083,91 @@ L093key_128_cold: xorps %xmm1,%xmm0 ret .align 4,0x90 -L09012rounds: +L09710rounds_alt: + movdqa (%ebx),%xmm5 + movl $8,%ecx + movdqa 32(%ebx),%xmm4 + movdqa %xmm0,%xmm2 + movdqu %xmm0,-16(%edx) +L101loop_key128: +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + leal 16(%edx),%edx + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,-16(%edx) + movdqa %xmm0,%xmm2 + decl %ecx + jnz L101loop_key128 + movdqa 48(%ebx),%xmm4 +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,(%edx) + movdqa %xmm0,%xmm2 +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,16(%edx) + movl $9,%ecx + movl %ecx,96(%edx) + jmp L100good_key +.align 4,0x90 +L09412rounds: movq 16(%eax),%xmm2 + cmpl $268435456,%ebp + je L10212rounds_alt movl $11,%ecx movups %xmm0,-16(%edx) .byte 102,15,58,223,202,1 - call L095key_192a_cold + call L103key_192a_cold .byte 102,15,58,223,202,2 - call L096key_192b + call L104key_192b .byte 102,15,58,223,202,4 - call L097key_192a + call L105key_192a .byte 102,15,58,223,202,8 - call L096key_192b + call L104key_192b .byte 102,15,58,223,202,16 - call L097key_192a + call L105key_192a .byte 
102,15,58,223,202,32 - call L096key_192b + call L104key_192b .byte 102,15,58,223,202,64 - call L097key_192a + call L105key_192a .byte 102,15,58,223,202,128 - call L096key_192b + call L104key_192b movups %xmm0,(%edx) movl %ecx,48(%edx) - xorl %eax,%eax - ret + jmp L100good_key .align 4,0x90 -L097key_192a: +L105key_192a: movups %xmm0,(%edx) leal 16(%edx),%edx .align 4,0x90 -L095key_192a_cold: +L103key_192a_cold: movaps %xmm2,%xmm5 -L098key_192b_warm: +L106key_192b_warm: shufps $16,%xmm0,%xmm4 movdqa %xmm2,%xmm3 xorps %xmm4,%xmm0 @@ -2029,56 +2181,90 @@ L098key_192b_warm: pxor %xmm3,%xmm2 ret .align 4,0x90 -L096key_192b: +L104key_192b: movaps %xmm0,%xmm3 shufps $68,%xmm0,%xmm5 movups %xmm5,(%edx) shufps $78,%xmm2,%xmm3 movups %xmm3,16(%edx) leal 32(%edx),%edx - jmp L098key_192b_warm + jmp L106key_192b_warm .align 4,0x90 -L08914rounds: +L10212rounds_alt: + movdqa 16(%ebx),%xmm5 + movdqa 32(%ebx),%xmm4 + movl $8,%ecx + movdqu %xmm0,-16(%edx) +L107loop_key192: + movq %xmm2,(%edx) + movdqa %xmm2,%xmm1 +.byte 102,15,56,0,213 +.byte 102,15,56,221,212 + pslld $1,%xmm4 + leal 24(%edx),%edx + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + pshufd $255,%xmm0,%xmm3 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pxor %xmm2,%xmm0 + pxor %xmm3,%xmm2 + movdqu %xmm0,-16(%edx) + decl %ecx + jnz L107loop_key192 + movl $11,%ecx + movl %ecx,32(%edx) + jmp L100good_key +.align 4,0x90 +L09314rounds: movups 16(%eax),%xmm2 - movl $13,%ecx leal 16(%edx),%edx + cmpl $268435456,%ebp + je L10814rounds_alt + movl $13,%ecx movups %xmm0,-32(%edx) movups %xmm2,-16(%edx) .byte 102,15,58,223,202,1 - call L099key_256a_cold + call L109key_256a_cold .byte 102,15,58,223,200,1 - call L100key_256b + call L110key_256b .byte 102,15,58,223,202,2 - call L101key_256a + call L111key_256a .byte 102,15,58,223,200,2 - call L100key_256b + call L110key_256b .byte 102,15,58,223,202,4 - call L101key_256a + call L111key_256a 
.byte 102,15,58,223,200,4 - call L100key_256b + call L110key_256b .byte 102,15,58,223,202,8 - call L101key_256a + call L111key_256a .byte 102,15,58,223,200,8 - call L100key_256b + call L110key_256b .byte 102,15,58,223,202,16 - call L101key_256a + call L111key_256a .byte 102,15,58,223,200,16 - call L100key_256b + call L110key_256b .byte 102,15,58,223,202,32 - call L101key_256a + call L111key_256a .byte 102,15,58,223,200,32 - call L100key_256b + call L110key_256b .byte 102,15,58,223,202,64 - call L101key_256a + call L111key_256a movups %xmm0,(%edx) movl %ecx,16(%edx) xorl %eax,%eax - ret + jmp L100good_key .align 4,0x90 -L101key_256a: +L111key_256a: movups %xmm2,(%edx) leal 16(%edx),%edx -L099key_256a_cold: +L109key_256a_cold: shufps $16,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $140,%xmm0,%xmm4 @@ -2087,7 +2273,7 @@ L099key_256a_cold: xorps %xmm1,%xmm0 ret .align 4,0x90 -L100key_256b: +L110key_256b: movups %xmm0,(%edx) leal 16(%edx),%edx shufps $16,%xmm2,%xmm4 @@ -2097,13 +2283,70 @@ L100key_256b: shufps $170,%xmm1,%xmm1 xorps %xmm1,%xmm2 ret +.align 4,0x90 +L10814rounds_alt: + movdqa (%ebx),%xmm5 + movdqa 32(%ebx),%xmm4 + movl $7,%ecx + movdqu %xmm0,-32(%edx) + movdqa %xmm2,%xmm1 + movdqu %xmm2,-16(%edx) +L112loop_key256: +.byte 102,15,56,0,213 +.byte 102,15,56,221,212 + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + pslld $1,%xmm4 + pxor %xmm2,%xmm0 + movdqu %xmm0,(%edx) + decl %ecx + jz L113done_key256 + pshufd $255,%xmm0,%xmm2 + pxor %xmm3,%xmm3 +.byte 102,15,56,221,211 + movdqa %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm3,%xmm1 + pxor %xmm1,%xmm2 + movdqu %xmm2,16(%edx) + leal 32(%edx),%edx + movdqa %xmm2,%xmm1 + jmp L112loop_key256 +L113done_key256: + movl $13,%ecx + movl %ecx,16(%edx) +L100good_key: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor 
%xmm5,%xmm5 + xorl %eax,%eax + popl %ebx + popl %ebp + ret .align 2,0x90 -L088bad_pointer: +L091bad_pointer: movl $-1,%eax + popl %ebx + popl %ebp ret .align 2,0x90 -L091bad_keybits: +L095bad_keybits: + pxor %xmm0,%xmm0 movl $-2,%eax + popl %ebx + popl %ebp ret .globl _aesni_set_encrypt_key .align 4 @@ -2125,7 +2368,7 @@ L_aesni_set_decrypt_key_begin: movl 12(%esp),%edx shll $4,%ecx testl %eax,%eax - jnz L102dec_key_ret + jnz L114dec_key_ret leal 16(%edx,%ecx,1),%eax movups (%edx),%xmm0 movups (%eax),%xmm1 @@ -2133,7 +2376,7 @@ L_aesni_set_decrypt_key_begin: movups %xmm1,(%edx) leal 16(%edx),%edx leal -16(%eax),%eax -L103dec_key_inverse: +L115dec_key_inverse: movups (%edx),%xmm0 movups (%eax),%xmm1 .byte 102,15,56,219,192 @@ -2143,14 +2386,27 @@ L103dec_key_inverse: movups %xmm0,16(%eax) movups %xmm1,-16(%edx) cmpl %edx,%eax - ja L103dec_key_inverse + ja L115dec_key_inverse movups (%edx),%xmm0 .byte 102,15,56,219,192 movups %xmm0,(%edx) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 xorl %eax,%eax -L102dec_key_ret: +L114dec_key_ret: ret +.align 6,0x90 +Lkey_const: +.long 202313229,202313229,202313229,202313229 +.long 67569157,67569157,67569157,67569157 +.long 1,1,1,1 +.long 27,27,27,27 .byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69 .byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83 .byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 .byte 115,108,46,111,114,103,62,0 +.section __IMPORT,__pointers,non_lazy_symbol_pointers +L_OPENSSL_ia32cap_P$non_lazy_ptr: +.indirect_symbol _OPENSSL_ia32cap_P +.long 0 +.comm _OPENSSL_ia32cap_P,16,2 diff --git a/deps/openssl/asm/x86-win32-masm/aes/aesni-x86.asm b/deps/openssl/asm/x86-win32-masm/aes/aesni-x86.asm index 43fdb5a034..6511c21bcf 100644 --- a/deps/openssl/asm/x86-win32-masm/aes/aesni-x86.asm +++ b/deps/openssl/asm/x86-win32-masm/aes/aesni-x86.asm @@ -17,6 +17,7 @@ IF @Version LT 800 ELSE .text$ SEGMENT ALIGN(64) 'CODE' ENDIF +;EXTERN _OPENSSL_ia32cap_P:NEAR ALIGN 16 _aesni_encrypt PROC PUBLIC 
$L_aesni_encrypt_begin:: @@ -36,7 +37,10 @@ DB 102,15,56,220,209 lea edx,DWORD PTR 16[edx] jnz $L000enc1_loop_1 DB 102,15,56,221,209 + pxor xmm0,xmm0 + pxor xmm1,xmm1 movups XMMWORD PTR [eax],xmm2 + pxor xmm2,xmm2 ret _aesni_encrypt ENDP ALIGN 16 @@ -58,7 +62,10 @@ DB 102,15,56,222,209 lea edx,DWORD PTR 16[edx] jnz $L001dec1_loop_2 DB 102,15,56,223,209 + pxor xmm0,xmm0 + pxor xmm1,xmm1 movups XMMWORD PTR [eax],xmm2 + pxor xmm2,xmm2 ret _aesni_decrypt ENDP ALIGN 16 @@ -265,17 +272,15 @@ DB 102,15,56,220,217 neg ecx DB 102,15,56,220,225 pxor xmm7,xmm0 + movups xmm0,XMMWORD PTR [ecx*1+edx] add ecx,16 -DB 102,15,56,220,233 -DB 102,15,56,220,241 -DB 102,15,56,220,249 - movups xmm0,XMMWORD PTR [ecx*1+edx-16] - jmp $L_aesni_encrypt6_enter + jmp $L008_aesni_encrypt6_inner ALIGN 16 -$L008enc6_loop: +$L009enc6_loop: DB 102,15,56,220,209 DB 102,15,56,220,217 DB 102,15,56,220,225 +$L008_aesni_encrypt6_inner: DB 102,15,56,220,233 DB 102,15,56,220,241 DB 102,15,56,220,249 @@ -289,7 +294,7 @@ DB 102,15,56,220,232 DB 102,15,56,220,240 DB 102,15,56,220,248 movups xmm0,XMMWORD PTR [ecx*1+edx-16] - jnz $L008enc6_loop + jnz $L009enc6_loop DB 102,15,56,220,209 DB 102,15,56,220,217 DB 102,15,56,220,225 @@ -320,17 +325,15 @@ DB 102,15,56,222,217 neg ecx DB 102,15,56,222,225 pxor xmm7,xmm0 + movups xmm0,XMMWORD PTR [ecx*1+edx] add ecx,16 -DB 102,15,56,222,233 -DB 102,15,56,222,241 -DB 102,15,56,222,249 - movups xmm0,XMMWORD PTR [ecx*1+edx-16] - jmp $L_aesni_decrypt6_enter + jmp $L010_aesni_decrypt6_inner ALIGN 16 -$L009dec6_loop: +$L011dec6_loop: DB 102,15,56,222,209 DB 102,15,56,222,217 DB 102,15,56,222,225 +$L010_aesni_decrypt6_inner: DB 102,15,56,222,233 DB 102,15,56,222,241 DB 102,15,56,222,249 @@ -344,7 +347,7 @@ DB 102,15,56,222,232 DB 102,15,56,222,240 DB 102,15,56,222,248 movups xmm0,XMMWORD PTR [ecx*1+edx-16] - jnz $L009dec6_loop + jnz $L011dec6_loop DB 102,15,56,222,209 DB 102,15,56,222,217 DB 102,15,56,222,225 @@ -372,14 +375,14 @@ $L_aesni_ecb_encrypt_begin:: mov edx,DWORD PTR 
32[esp] mov ebx,DWORD PTR 36[esp] and eax,-16 - jz $L010ecb_ret + jz $L012ecb_ret mov ecx,DWORD PTR 240[edx] test ebx,ebx - jz $L011ecb_decrypt + jz $L013ecb_decrypt mov ebp,edx mov ebx,ecx cmp eax,96 - jb $L012ecb_enc_tail + jb $L014ecb_enc_tail movdqu xmm2,XMMWORD PTR [esi] movdqu xmm3,XMMWORD PTR 16[esi] movdqu xmm4,XMMWORD PTR 32[esi] @@ -388,9 +391,9 @@ $L_aesni_ecb_encrypt_begin:: movdqu xmm7,XMMWORD PTR 80[esi] lea esi,DWORD PTR 96[esi] sub eax,96 - jmp $L013ecb_enc_loop6_enter + jmp $L015ecb_enc_loop6_enter ALIGN 16 -$L014ecb_enc_loop6: +$L016ecb_enc_loop6: movups XMMWORD PTR [edi],xmm2 movdqu xmm2,XMMWORD PTR [esi] movups XMMWORD PTR 16[edi],xmm3 @@ -405,12 +408,12 @@ $L014ecb_enc_loop6: lea edi,DWORD PTR 96[edi] movdqu xmm7,XMMWORD PTR 80[esi] lea esi,DWORD PTR 96[esi] -$L013ecb_enc_loop6_enter: +$L015ecb_enc_loop6_enter: call __aesni_encrypt6 mov edx,ebp mov ecx,ebx sub eax,96 - jnc $L014ecb_enc_loop6 + jnc $L016ecb_enc_loop6 movups XMMWORD PTR [edi],xmm2 movups XMMWORD PTR 16[edi],xmm3 movups XMMWORD PTR 32[edi],xmm4 @@ -419,18 +422,18 @@ $L013ecb_enc_loop6_enter: movups XMMWORD PTR 80[edi],xmm7 lea edi,DWORD PTR 96[edi] add eax,96 - jz $L010ecb_ret -$L012ecb_enc_tail: + jz $L012ecb_ret +$L014ecb_enc_tail: movups xmm2,XMMWORD PTR [esi] cmp eax,32 - jb $L015ecb_enc_one + jb $L017ecb_enc_one movups xmm3,XMMWORD PTR 16[esi] - je $L016ecb_enc_two + je $L018ecb_enc_two movups xmm4,XMMWORD PTR 32[esi] cmp eax,64 - jb $L017ecb_enc_three + jb $L019ecb_enc_three movups xmm5,XMMWORD PTR 48[esi] - je $L018ecb_enc_four + je $L020ecb_enc_four movups xmm6,XMMWORD PTR 64[esi] xorps xmm7,xmm7 call __aesni_encrypt6 @@ -439,49 +442,49 @@ $L012ecb_enc_tail: movups XMMWORD PTR 32[edi],xmm4 movups XMMWORD PTR 48[edi],xmm5 movups XMMWORD PTR 64[edi],xmm6 - jmp $L010ecb_ret + jmp $L012ecb_ret ALIGN 16 -$L015ecb_enc_one: +$L017ecb_enc_one: movups xmm0,XMMWORD PTR [edx] movups xmm1,XMMWORD PTR 16[edx] lea edx,DWORD PTR 32[edx] xorps xmm2,xmm0 -$L019enc1_loop_3: 
+$L021enc1_loop_3: DB 102,15,56,220,209 dec ecx movups xmm1,XMMWORD PTR [edx] lea edx,DWORD PTR 16[edx] - jnz $L019enc1_loop_3 + jnz $L021enc1_loop_3 DB 102,15,56,221,209 movups XMMWORD PTR [edi],xmm2 - jmp $L010ecb_ret + jmp $L012ecb_ret ALIGN 16 -$L016ecb_enc_two: +$L018ecb_enc_two: call __aesni_encrypt2 movups XMMWORD PTR [edi],xmm2 movups XMMWORD PTR 16[edi],xmm3 - jmp $L010ecb_ret + jmp $L012ecb_ret ALIGN 16 -$L017ecb_enc_three: +$L019ecb_enc_three: call __aesni_encrypt3 movups XMMWORD PTR [edi],xmm2 movups XMMWORD PTR 16[edi],xmm3 movups XMMWORD PTR 32[edi],xmm4 - jmp $L010ecb_ret + jmp $L012ecb_ret ALIGN 16 -$L018ecb_enc_four: +$L020ecb_enc_four: call __aesni_encrypt4 movups XMMWORD PTR [edi],xmm2 movups XMMWORD PTR 16[edi],xmm3 movups XMMWORD PTR 32[edi],xmm4 movups XMMWORD PTR 48[edi],xmm5 - jmp $L010ecb_ret + jmp $L012ecb_ret ALIGN 16 -$L011ecb_decrypt: +$L013ecb_decrypt: mov ebp,edx mov ebx,ecx cmp eax,96 - jb $L020ecb_dec_tail + jb $L022ecb_dec_tail movdqu xmm2,XMMWORD PTR [esi] movdqu xmm3,XMMWORD PTR 16[esi] movdqu xmm4,XMMWORD PTR 32[esi] @@ -490,9 +493,9 @@ $L011ecb_decrypt: movdqu xmm7,XMMWORD PTR 80[esi] lea esi,DWORD PTR 96[esi] sub eax,96 - jmp $L021ecb_dec_loop6_enter + jmp $L023ecb_dec_loop6_enter ALIGN 16 -$L022ecb_dec_loop6: +$L024ecb_dec_loop6: movups XMMWORD PTR [edi],xmm2 movdqu xmm2,XMMWORD PTR [esi] movups XMMWORD PTR 16[edi],xmm3 @@ -507,12 +510,12 @@ $L022ecb_dec_loop6: lea edi,DWORD PTR 96[edi] movdqu xmm7,XMMWORD PTR 80[esi] lea esi,DWORD PTR 96[esi] -$L021ecb_dec_loop6_enter: +$L023ecb_dec_loop6_enter: call __aesni_decrypt6 mov edx,ebp mov ecx,ebx sub eax,96 - jnc $L022ecb_dec_loop6 + jnc $L024ecb_dec_loop6 movups XMMWORD PTR [edi],xmm2 movups XMMWORD PTR 16[edi],xmm3 movups XMMWORD PTR 32[edi],xmm4 @@ -521,18 +524,18 @@ $L021ecb_dec_loop6_enter: movups XMMWORD PTR 80[edi],xmm7 lea edi,DWORD PTR 96[edi] add eax,96 - jz $L010ecb_ret -$L020ecb_dec_tail: + jz $L012ecb_ret +$L022ecb_dec_tail: movups xmm2,XMMWORD PTR [esi] cmp eax,32 - 
jb $L023ecb_dec_one + jb $L025ecb_dec_one movups xmm3,XMMWORD PTR 16[esi] - je $L024ecb_dec_two + je $L026ecb_dec_two movups xmm4,XMMWORD PTR 32[esi] cmp eax,64 - jb $L025ecb_dec_three + jb $L027ecb_dec_three movups xmm5,XMMWORD PTR 48[esi] - je $L026ecb_dec_four + je $L028ecb_dec_four movups xmm6,XMMWORD PTR 64[esi] xorps xmm7,xmm7 call __aesni_decrypt6 @@ -541,43 +544,51 @@ $L020ecb_dec_tail: movups XMMWORD PTR 32[edi],xmm4 movups XMMWORD PTR 48[edi],xmm5 movups XMMWORD PTR 64[edi],xmm6 - jmp $L010ecb_ret + jmp $L012ecb_ret ALIGN 16 -$L023ecb_dec_one: +$L025ecb_dec_one: movups xmm0,XMMWORD PTR [edx] movups xmm1,XMMWORD PTR 16[edx] lea edx,DWORD PTR 32[edx] xorps xmm2,xmm0 -$L027dec1_loop_4: +$L029dec1_loop_4: DB 102,15,56,222,209 dec ecx movups xmm1,XMMWORD PTR [edx] lea edx,DWORD PTR 16[edx] - jnz $L027dec1_loop_4 + jnz $L029dec1_loop_4 DB 102,15,56,223,209 movups XMMWORD PTR [edi],xmm2 - jmp $L010ecb_ret + jmp $L012ecb_ret ALIGN 16 -$L024ecb_dec_two: +$L026ecb_dec_two: call __aesni_decrypt2 movups XMMWORD PTR [edi],xmm2 movups XMMWORD PTR 16[edi],xmm3 - jmp $L010ecb_ret + jmp $L012ecb_ret ALIGN 16 -$L025ecb_dec_three: +$L027ecb_dec_three: call __aesni_decrypt3 movups XMMWORD PTR [edi],xmm2 movups XMMWORD PTR 16[edi],xmm3 movups XMMWORD PTR 32[edi],xmm4 - jmp $L010ecb_ret + jmp $L012ecb_ret ALIGN 16 -$L026ecb_dec_four: +$L028ecb_dec_four: call __aesni_decrypt4 movups XMMWORD PTR [edi],xmm2 movups XMMWORD PTR 16[edi],xmm3 movups XMMWORD PTR 32[edi],xmm4 movups XMMWORD PTR 48[edi],xmm5 -$L010ecb_ret: +$L012ecb_ret: + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + pxor xmm6,xmm6 + pxor xmm7,xmm7 pop edi pop esi pop ebx @@ -622,7 +633,7 @@ $L_aesni_ccm64_encrypt_blocks_begin:: lea edx,DWORD PTR 32[ecx*1+edx] sub ebx,ecx DB 102,15,56,0,253 -$L028ccm64_enc_outer: +$L030ccm64_enc_outer: movups xmm0,XMMWORD PTR [ebp] mov ecx,ebx movups xmm6,XMMWORD PTR [esi] @@ -631,7 +642,7 @@ $L028ccm64_enc_outer: xorps xmm0,xmm6 
xorps xmm3,xmm0 movups xmm0,XMMWORD PTR 32[ebp] -$L029ccm64_enc2_loop: +$L031ccm64_enc2_loop: DB 102,15,56,220,209 DB 102,15,56,220,217 movups xmm1,XMMWORD PTR [ecx*1+edx] @@ -639,7 +650,7 @@ DB 102,15,56,220,217 DB 102,15,56,220,208 DB 102,15,56,220,216 movups xmm0,XMMWORD PTR [ecx*1+edx-16] - jnz $L029ccm64_enc2_loop + jnz $L031ccm64_enc2_loop DB 102,15,56,220,209 DB 102,15,56,220,217 paddq xmm7,XMMWORD PTR 16[esp] @@ -652,10 +663,18 @@ DB 102,15,56,221,216 movups XMMWORD PTR [edi],xmm6 DB 102,15,56,0,213 lea edi,DWORD PTR 16[edi] - jnz $L028ccm64_enc_outer + jnz $L030ccm64_enc_outer mov esp,DWORD PTR 48[esp] mov edi,DWORD PTR 40[esp] movups XMMWORD PTR [edi],xmm3 + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + pxor xmm6,xmm6 + pxor xmm7,xmm7 pop edi pop esi pop ebx @@ -701,12 +720,12 @@ DB 102,15,56,0,253 movups xmm1,XMMWORD PTR 16[edx] lea edx,DWORD PTR 32[edx] xorps xmm2,xmm0 -$L030enc1_loop_5: +$L032enc1_loop_5: DB 102,15,56,220,209 dec ecx movups xmm1,XMMWORD PTR [edx] lea edx,DWORD PTR 16[edx] - jnz $L030enc1_loop_5 + jnz $L032enc1_loop_5 DB 102,15,56,221,209 shl ebx,4 mov ecx,16 @@ -716,16 +735,16 @@ DB 102,15,56,221,209 sub ecx,ebx lea edx,DWORD PTR 32[ebx*1+ebp] mov ebx,ecx - jmp $L031ccm64_dec_outer + jmp $L033ccm64_dec_outer ALIGN 16 -$L031ccm64_dec_outer: +$L033ccm64_dec_outer: xorps xmm6,xmm2 movdqa xmm2,xmm7 movups XMMWORD PTR [edi],xmm6 lea edi,DWORD PTR 16[edi] DB 102,15,56,0,213 sub eax,1 - jz $L032ccm64_dec_break + jz $L034ccm64_dec_break movups xmm0,XMMWORD PTR [ebp] mov ecx,ebx movups xmm1,XMMWORD PTR 16[ebp] @@ -733,7 +752,7 @@ DB 102,15,56,0,213 xorps xmm2,xmm0 xorps xmm3,xmm6 movups xmm0,XMMWORD PTR 32[ebp] -$L033ccm64_dec2_loop: +$L035ccm64_dec2_loop: DB 102,15,56,220,209 DB 102,15,56,220,217 movups xmm1,XMMWORD PTR [ecx*1+edx] @@ -741,7 +760,7 @@ DB 102,15,56,220,217 DB 102,15,56,220,208 DB 102,15,56,220,216 movups xmm0,XMMWORD PTR [ecx*1+edx-16] - jnz $L033ccm64_dec2_loop + jnz 
$L035ccm64_dec2_loop movups xmm6,XMMWORD PTR [esi] paddq xmm7,XMMWORD PTR 16[esp] DB 102,15,56,220,209 @@ -749,9 +768,9 @@ DB 102,15,56,220,217 DB 102,15,56,221,208 DB 102,15,56,221,216 lea esi,QWORD PTR 16[esi] - jmp $L031ccm64_dec_outer + jmp $L033ccm64_dec_outer ALIGN 16 -$L032ccm64_dec_break: +$L034ccm64_dec_break: mov ecx,DWORD PTR 240[ebp] mov edx,ebp movups xmm0,XMMWORD PTR [edx] @@ -759,16 +778,24 @@ $L032ccm64_dec_break: xorps xmm6,xmm0 lea edx,DWORD PTR 32[edx] xorps xmm3,xmm6 -$L034enc1_loop_6: +$L036enc1_loop_6: DB 102,15,56,220,217 dec ecx movups xmm1,XMMWORD PTR [edx] lea edx,DWORD PTR 16[edx] - jnz $L034enc1_loop_6 + jnz $L036enc1_loop_6 DB 102,15,56,221,217 mov esp,DWORD PTR 48[esp] mov edi,DWORD PTR 40[esp] movups XMMWORD PTR [edi],xmm3 + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + pxor xmm6,xmm6 + pxor xmm7,xmm7 pop edi pop esi pop ebx @@ -792,7 +819,7 @@ $L_aesni_ctr32_encrypt_blocks_begin:: and esp,-16 mov DWORD PTR 80[esp],ebp cmp eax,1 - je $L035ctr32_one_shortcut + je $L037ctr32_one_shortcut movdqu xmm7,XMMWORD PTR [ebx] mov DWORD PTR [esp],202182159 mov DWORD PTR 4[esp],134810123 @@ -830,7 +857,7 @@ DB 102,15,56,0,202 pshufd xmm2,xmm0,192 pshufd xmm3,xmm0,128 cmp eax,6 - jb $L036ctr32_tail + jb $L038ctr32_tail pxor xmm7,xmm6 shl ecx,4 mov ebx,16 @@ -839,9 +866,9 @@ DB 102,15,56,0,202 sub ebx,ecx lea edx,DWORD PTR 32[ecx*1+edx] sub eax,6 - jmp $L037ctr32_loop6 + jmp $L039ctr32_loop6 ALIGN 16 -$L037ctr32_loop6: +$L039ctr32_loop6: pshufd xmm4,xmm0,64 movdqa xmm0,XMMWORD PTR 32[esp] pshufd xmm5,xmm1,192 @@ -895,27 +922,27 @@ DB 102,15,56,0,202 lea edi,DWORD PTR 96[edi] pshufd xmm3,xmm0,128 sub eax,6 - jnc $L037ctr32_loop6 + jnc $L039ctr32_loop6 add eax,6 - jz $L038ctr32_ret + jz $L040ctr32_ret movdqu xmm7,XMMWORD PTR [ebp] mov edx,ebp pxor xmm7,XMMWORD PTR 32[esp] mov ecx,DWORD PTR 240[ebp] -$L036ctr32_tail: +$L038ctr32_tail: por xmm2,xmm7 cmp eax,2 - jb $L039ctr32_one + jb $L041ctr32_one 
pshufd xmm4,xmm0,64 por xmm3,xmm7 - je $L040ctr32_two + je $L042ctr32_two pshufd xmm5,xmm1,192 por xmm4,xmm7 cmp eax,4 - jb $L041ctr32_three + jb $L043ctr32_three pshufd xmm6,xmm1,128 por xmm5,xmm7 - je $L042ctr32_four + je $L044ctr32_four por xmm6,xmm7 call __aesni_encrypt6 movups xmm1,XMMWORD PTR [esi] @@ -933,29 +960,29 @@ $L036ctr32_tail: movups XMMWORD PTR 32[edi],xmm4 movups XMMWORD PTR 48[edi],xmm5 movups XMMWORD PTR 64[edi],xmm6 - jmp $L038ctr32_ret + jmp $L040ctr32_ret ALIGN 16 -$L035ctr32_one_shortcut: +$L037ctr32_one_shortcut: movups xmm2,XMMWORD PTR [ebx] mov ecx,DWORD PTR 240[edx] -$L039ctr32_one: +$L041ctr32_one: movups xmm0,XMMWORD PTR [edx] movups xmm1,XMMWORD PTR 16[edx] lea edx,DWORD PTR 32[edx] xorps xmm2,xmm0 -$L043enc1_loop_7: +$L045enc1_loop_7: DB 102,15,56,220,209 dec ecx movups xmm1,XMMWORD PTR [edx] lea edx,DWORD PTR 16[edx] - jnz $L043enc1_loop_7 + jnz $L045enc1_loop_7 DB 102,15,56,221,209 movups xmm6,XMMWORD PTR [esi] xorps xmm6,xmm2 movups XMMWORD PTR [edi],xmm6 - jmp $L038ctr32_ret + jmp $L040ctr32_ret ALIGN 16 -$L040ctr32_two: +$L042ctr32_two: call __aesni_encrypt2 movups xmm5,XMMWORD PTR [esi] movups xmm6,XMMWORD PTR 16[esi] @@ -963,9 +990,9 @@ $L040ctr32_two: xorps xmm3,xmm6 movups XMMWORD PTR [edi],xmm2 movups XMMWORD PTR 16[edi],xmm3 - jmp $L038ctr32_ret + jmp $L040ctr32_ret ALIGN 16 -$L041ctr32_three: +$L043ctr32_three: call __aesni_encrypt3 movups xmm5,XMMWORD PTR [esi] movups xmm6,XMMWORD PTR 16[esi] @@ -976,9 +1003,9 @@ $L041ctr32_three: xorps xmm4,xmm7 movups XMMWORD PTR 16[edi],xmm3 movups XMMWORD PTR 32[edi],xmm4 - jmp $L038ctr32_ret + jmp $L040ctr32_ret ALIGN 16 -$L042ctr32_four: +$L044ctr32_four: call __aesni_encrypt4 movups xmm6,XMMWORD PTR [esi] movups xmm7,XMMWORD PTR 16[esi] @@ -992,7 +1019,18 @@ $L042ctr32_four: xorps xmm5,xmm0 movups XMMWORD PTR 32[edi],xmm4 movups XMMWORD PTR 48[edi],xmm5 -$L038ctr32_ret: +$L040ctr32_ret: + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + movdqa 
XMMWORD PTR 32[esp],xmm0 + pxor xmm5,xmm5 + movdqa XMMWORD PTR 48[esp],xmm0 + pxor xmm6,xmm6 + movdqa XMMWORD PTR 64[esp],xmm0 + pxor xmm7,xmm7 mov esp,DWORD PTR 80[esp] pop edi pop esi @@ -1015,12 +1053,12 @@ $L_aesni_xts_encrypt_begin:: movups xmm1,XMMWORD PTR 16[edx] lea edx,DWORD PTR 32[edx] xorps xmm2,xmm0 -$L044enc1_loop_8: +$L046enc1_loop_8: DB 102,15,56,220,209 dec ecx movups xmm1,XMMWORD PTR [edx] lea edx,DWORD PTR 16[edx] - jnz $L044enc1_loop_8 + jnz $L046enc1_loop_8 DB 102,15,56,221,209 mov esi,DWORD PTR 20[esp] mov edi,DWORD PTR 24[esp] @@ -1044,14 +1082,14 @@ DB 102,15,56,221,209 mov ebp,edx mov ebx,ecx sub eax,96 - jc $L045xts_enc_short + jc $L047xts_enc_short shl ecx,4 mov ebx,16 sub ebx,ecx lea edx,DWORD PTR 32[ecx*1+edx] - jmp $L046xts_enc_loop6 + jmp $L048xts_enc_loop6 ALIGN 16 -$L046xts_enc_loop6: +$L048xts_enc_loop6: pshufd xmm2,xmm0,19 pxor xmm0,xmm0 movdqa XMMWORD PTR [esp],xmm1 @@ -1140,23 +1178,23 @@ DB 102,15,56,220,249 pcmpgtd xmm0,xmm1 pxor xmm1,xmm2 sub eax,96 - jnc $L046xts_enc_loop6 + jnc $L048xts_enc_loop6 mov ecx,DWORD PTR 240[ebp] mov edx,ebp mov ebx,ecx -$L045xts_enc_short: +$L047xts_enc_short: add eax,96 - jz $L047xts_enc_done6x + jz $L049xts_enc_done6x movdqa xmm5,xmm1 cmp eax,32 - jb $L048xts_enc_one + jb $L050xts_enc_one pshufd xmm2,xmm0,19 pxor xmm0,xmm0 paddq xmm1,xmm1 pand xmm2,xmm3 pcmpgtd xmm0,xmm1 pxor xmm1,xmm2 - je $L049xts_enc_two + je $L051xts_enc_two pshufd xmm2,xmm0,19 pxor xmm0,xmm0 movdqa xmm6,xmm1 @@ -1165,7 +1203,7 @@ $L045xts_enc_short: pcmpgtd xmm0,xmm1 pxor xmm1,xmm2 cmp eax,64 - jb $L050xts_enc_three + jb $L052xts_enc_three pshufd xmm2,xmm0,19 pxor xmm0,xmm0 movdqa xmm7,xmm1 @@ -1175,7 +1213,7 @@ $L045xts_enc_short: pxor xmm1,xmm2 movdqa XMMWORD PTR [esp],xmm5 movdqa XMMWORD PTR 16[esp],xmm6 - je $L051xts_enc_four + je $L053xts_enc_four movdqa XMMWORD PTR 32[esp],xmm7 pshufd xmm7,xmm0,19 movdqa XMMWORD PTR 48[esp],xmm1 @@ -1207,9 +1245,9 @@ $L045xts_enc_short: movups XMMWORD PTR 48[edi],xmm5 movups XMMWORD 
PTR 64[edi],xmm6 lea edi,DWORD PTR 80[edi] - jmp $L052xts_enc_done + jmp $L054xts_enc_done ALIGN 16 -$L048xts_enc_one: +$L050xts_enc_one: movups xmm2,XMMWORD PTR [esi] lea esi,DWORD PTR 16[esi] xorps xmm2,xmm5 @@ -1217,20 +1255,20 @@ $L048xts_enc_one: movups xmm1,XMMWORD PTR 16[edx] lea edx,DWORD PTR 32[edx] xorps xmm2,xmm0 -$L053enc1_loop_9: +$L055enc1_loop_9: DB 102,15,56,220,209 dec ecx movups xmm1,XMMWORD PTR [edx] lea edx,DWORD PTR 16[edx] - jnz $L053enc1_loop_9 + jnz $L055enc1_loop_9 DB 102,15,56,221,209 xorps xmm2,xmm5 movups XMMWORD PTR [edi],xmm2 lea edi,DWORD PTR 16[edi] movdqa xmm1,xmm5 - jmp $L052xts_enc_done + jmp $L054xts_enc_done ALIGN 16 -$L049xts_enc_two: +$L051xts_enc_two: movaps xmm6,xmm1 movups xmm2,XMMWORD PTR [esi] movups xmm3,XMMWORD PTR 16[esi] @@ -1244,9 +1282,9 @@ $L049xts_enc_two: movups XMMWORD PTR 16[edi],xmm3 lea edi,DWORD PTR 32[edi] movdqa xmm1,xmm6 - jmp $L052xts_enc_done + jmp $L054xts_enc_done ALIGN 16 -$L050xts_enc_three: +$L052xts_enc_three: movaps xmm7,xmm1 movups xmm2,XMMWORD PTR [esi] movups xmm3,XMMWORD PTR 16[esi] @@ -1264,9 +1302,9 @@ $L050xts_enc_three: movups XMMWORD PTR 32[edi],xmm4 lea edi,DWORD PTR 48[edi] movdqa xmm1,xmm7 - jmp $L052xts_enc_done + jmp $L054xts_enc_done ALIGN 16 -$L051xts_enc_four: +$L053xts_enc_four: movaps xmm6,xmm1 movups xmm2,XMMWORD PTR [esi] movups xmm3,XMMWORD PTR 16[esi] @@ -1288,28 +1326,28 @@ $L051xts_enc_four: movups XMMWORD PTR 48[edi],xmm5 lea edi,DWORD PTR 64[edi] movdqa xmm1,xmm6 - jmp $L052xts_enc_done + jmp $L054xts_enc_done ALIGN 16 -$L047xts_enc_done6x: +$L049xts_enc_done6x: mov eax,DWORD PTR 112[esp] and eax,15 - jz $L054xts_enc_ret + jz $L056xts_enc_ret movdqa xmm5,xmm1 mov DWORD PTR 112[esp],eax - jmp $L055xts_enc_steal + jmp $L057xts_enc_steal ALIGN 16 -$L052xts_enc_done: +$L054xts_enc_done: mov eax,DWORD PTR 112[esp] pxor xmm0,xmm0 and eax,15 - jz $L054xts_enc_ret + jz $L056xts_enc_ret pcmpgtd xmm0,xmm1 mov DWORD PTR 112[esp],eax pshufd xmm5,xmm0,19 paddq xmm1,xmm1 pand 
xmm5,XMMWORD PTR 96[esp] pxor xmm5,xmm1 -$L055xts_enc_steal: +$L057xts_enc_steal: movzx ecx,BYTE PTR [esi] movzx edx,BYTE PTR [edi-16] lea esi,DWORD PTR 1[esi] @@ -1317,7 +1355,7 @@ $L055xts_enc_steal: mov BYTE PTR [edi],dl lea edi,DWORD PTR 1[edi] sub eax,1 - jnz $L055xts_enc_steal + jnz $L057xts_enc_steal sub edi,DWORD PTR 112[esp] mov edx,ebp mov ecx,ebx @@ -1327,16 +1365,30 @@ $L055xts_enc_steal: movups xmm1,XMMWORD PTR 16[edx] lea edx,DWORD PTR 32[edx] xorps xmm2,xmm0 -$L056enc1_loop_10: +$L058enc1_loop_10: DB 102,15,56,220,209 dec ecx movups xmm1,XMMWORD PTR [edx] lea edx,DWORD PTR 16[edx] - jnz $L056enc1_loop_10 + jnz $L058enc1_loop_10 DB 102,15,56,221,209 xorps xmm2,xmm5 movups XMMWORD PTR [edi-16],xmm2 -$L054xts_enc_ret: +$L056xts_enc_ret: + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + movdqa XMMWORD PTR [esp],xmm0 + pxor xmm3,xmm3 + movdqa XMMWORD PTR 16[esp],xmm0 + pxor xmm4,xmm4 + movdqa XMMWORD PTR 32[esp],xmm0 + pxor xmm5,xmm5 + movdqa XMMWORD PTR 48[esp],xmm0 + pxor xmm6,xmm6 + movdqa XMMWORD PTR 64[esp],xmm0 + pxor xmm7,xmm7 + movdqa XMMWORD PTR 80[esp],xmm0 mov esp,DWORD PTR 116[esp] pop edi pop esi @@ -1359,12 +1411,12 @@ $L_aesni_xts_decrypt_begin:: movups xmm1,XMMWORD PTR 16[edx] lea edx,DWORD PTR 32[edx] xorps xmm2,xmm0 -$L057enc1_loop_11: +$L059enc1_loop_11: DB 102,15,56,220,209 dec ecx movups xmm1,XMMWORD PTR [edx] lea edx,DWORD PTR 16[edx] - jnz $L057enc1_loop_11 + jnz $L059enc1_loop_11 DB 102,15,56,221,209 mov esi,DWORD PTR 20[esp] mov edi,DWORD PTR 24[esp] @@ -1393,14 +1445,14 @@ DB 102,15,56,221,209 pcmpgtd xmm0,xmm1 and eax,-16 sub eax,96 - jc $L058xts_dec_short + jc $L060xts_dec_short shl ecx,4 mov ebx,16 sub ebx,ecx lea edx,DWORD PTR 32[ecx*1+edx] - jmp $L059xts_dec_loop6 + jmp $L061xts_dec_loop6 ALIGN 16 -$L059xts_dec_loop6: +$L061xts_dec_loop6: pshufd xmm2,xmm0,19 pxor xmm0,xmm0 movdqa XMMWORD PTR [esp],xmm1 @@ -1489,23 +1541,23 @@ DB 102,15,56,222,249 pcmpgtd xmm0,xmm1 pxor xmm1,xmm2 sub eax,96 - jnc $L059xts_dec_loop6 + jnc 
$L061xts_dec_loop6 mov ecx,DWORD PTR 240[ebp] mov edx,ebp mov ebx,ecx -$L058xts_dec_short: +$L060xts_dec_short: add eax,96 - jz $L060xts_dec_done6x + jz $L062xts_dec_done6x movdqa xmm5,xmm1 cmp eax,32 - jb $L061xts_dec_one + jb $L063xts_dec_one pshufd xmm2,xmm0,19 pxor xmm0,xmm0 paddq xmm1,xmm1 pand xmm2,xmm3 pcmpgtd xmm0,xmm1 pxor xmm1,xmm2 - je $L062xts_dec_two + je $L064xts_dec_two pshufd xmm2,xmm0,19 pxor xmm0,xmm0 movdqa xmm6,xmm1 @@ -1514,7 +1566,7 @@ $L058xts_dec_short: pcmpgtd xmm0,xmm1 pxor xmm1,xmm2 cmp eax,64 - jb $L063xts_dec_three + jb $L065xts_dec_three pshufd xmm2,xmm0,19 pxor xmm0,xmm0 movdqa xmm7,xmm1 @@ -1524,7 +1576,7 @@ $L058xts_dec_short: pxor xmm1,xmm2 movdqa XMMWORD PTR [esp],xmm5 movdqa XMMWORD PTR 16[esp],xmm6 - je $L064xts_dec_four + je $L066xts_dec_four movdqa XMMWORD PTR 32[esp],xmm7 pshufd xmm7,xmm0,19 movdqa XMMWORD PTR 48[esp],xmm1 @@ -1556,9 +1608,9 @@ $L058xts_dec_short: movups XMMWORD PTR 48[edi],xmm5 movups XMMWORD PTR 64[edi],xmm6 lea edi,DWORD PTR 80[edi] - jmp $L065xts_dec_done + jmp $L067xts_dec_done ALIGN 16 -$L061xts_dec_one: +$L063xts_dec_one: movups xmm2,XMMWORD PTR [esi] lea esi,DWORD PTR 16[esi] xorps xmm2,xmm5 @@ -1566,20 +1618,20 @@ $L061xts_dec_one: movups xmm1,XMMWORD PTR 16[edx] lea edx,DWORD PTR 32[edx] xorps xmm2,xmm0 -$L066dec1_loop_12: +$L068dec1_loop_12: DB 102,15,56,222,209 dec ecx movups xmm1,XMMWORD PTR [edx] lea edx,DWORD PTR 16[edx] - jnz $L066dec1_loop_12 + jnz $L068dec1_loop_12 DB 102,15,56,223,209 xorps xmm2,xmm5 movups XMMWORD PTR [edi],xmm2 lea edi,DWORD PTR 16[edi] movdqa xmm1,xmm5 - jmp $L065xts_dec_done + jmp $L067xts_dec_done ALIGN 16 -$L062xts_dec_two: +$L064xts_dec_two: movaps xmm6,xmm1 movups xmm2,XMMWORD PTR [esi] movups xmm3,XMMWORD PTR 16[esi] @@ -1593,9 +1645,9 @@ $L062xts_dec_two: movups XMMWORD PTR 16[edi],xmm3 lea edi,DWORD PTR 32[edi] movdqa xmm1,xmm6 - jmp $L065xts_dec_done + jmp $L067xts_dec_done ALIGN 16 -$L063xts_dec_three: +$L065xts_dec_three: movaps xmm7,xmm1 movups xmm2,XMMWORD 
PTR [esi] movups xmm3,XMMWORD PTR 16[esi] @@ -1613,9 +1665,9 @@ $L063xts_dec_three: movups XMMWORD PTR 32[edi],xmm4 lea edi,DWORD PTR 48[edi] movdqa xmm1,xmm7 - jmp $L065xts_dec_done + jmp $L067xts_dec_done ALIGN 16 -$L064xts_dec_four: +$L066xts_dec_four: movaps xmm6,xmm1 movups xmm2,XMMWORD PTR [esi] movups xmm3,XMMWORD PTR 16[esi] @@ -1637,20 +1689,20 @@ $L064xts_dec_four: movups XMMWORD PTR 48[edi],xmm5 lea edi,DWORD PTR 64[edi] movdqa xmm1,xmm6 - jmp $L065xts_dec_done + jmp $L067xts_dec_done ALIGN 16 -$L060xts_dec_done6x: +$L062xts_dec_done6x: mov eax,DWORD PTR 112[esp] and eax,15 - jz $L067xts_dec_ret + jz $L069xts_dec_ret mov DWORD PTR 112[esp],eax - jmp $L068xts_dec_only_one_more + jmp $L070xts_dec_only_one_more ALIGN 16 -$L065xts_dec_done: +$L067xts_dec_done: mov eax,DWORD PTR 112[esp] pxor xmm0,xmm0 and eax,15 - jz $L067xts_dec_ret + jz $L069xts_dec_ret pcmpgtd xmm0,xmm1 mov DWORD PTR 112[esp],eax pshufd xmm2,xmm0,19 @@ -1660,7 +1712,7 @@ $L065xts_dec_done: pand xmm2,xmm3 pcmpgtd xmm0,xmm1 pxor xmm1,xmm2 -$L068xts_dec_only_one_more: +$L070xts_dec_only_one_more: pshufd xmm5,xmm0,19 movdqa xmm6,xmm1 paddq xmm1,xmm1 @@ -1674,16 +1726,16 @@ $L068xts_dec_only_one_more: movups xmm1,XMMWORD PTR 16[edx] lea edx,DWORD PTR 32[edx] xorps xmm2,xmm0 -$L069dec1_loop_13: +$L071dec1_loop_13: DB 102,15,56,222,209 dec ecx movups xmm1,XMMWORD PTR [edx] lea edx,DWORD PTR 16[edx] - jnz $L069dec1_loop_13 + jnz $L071dec1_loop_13 DB 102,15,56,223,209 xorps xmm2,xmm5 movups XMMWORD PTR [edi],xmm2 -$L070xts_dec_steal: +$L072xts_dec_steal: movzx ecx,BYTE PTR 16[esi] movzx edx,BYTE PTR [edi] lea esi,DWORD PTR 1[esi] @@ -1691,7 +1743,7 @@ $L070xts_dec_steal: mov BYTE PTR 16[edi],dl lea edi,DWORD PTR 1[edi] sub eax,1 - jnz $L070xts_dec_steal + jnz $L072xts_dec_steal sub edi,DWORD PTR 112[esp] mov edx,ebp mov ecx,ebx @@ -1701,16 +1753,30 @@ $L070xts_dec_steal: movups xmm1,XMMWORD PTR 16[edx] lea edx,DWORD PTR 32[edx] xorps xmm2,xmm0 -$L071dec1_loop_14: +$L073dec1_loop_14: DB 
102,15,56,222,209 dec ecx movups xmm1,XMMWORD PTR [edx] lea edx,DWORD PTR 16[edx] - jnz $L071dec1_loop_14 + jnz $L073dec1_loop_14 DB 102,15,56,223,209 xorps xmm2,xmm6 movups XMMWORD PTR [edi],xmm2 -$L067xts_dec_ret: +$L069xts_dec_ret: + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + movdqa XMMWORD PTR [esp],xmm0 + pxor xmm3,xmm3 + movdqa XMMWORD PTR 16[esp],xmm0 + pxor xmm4,xmm4 + movdqa XMMWORD PTR 32[esp],xmm0 + pxor xmm5,xmm5 + movdqa XMMWORD PTR 48[esp],xmm0 + pxor xmm6,xmm6 + movdqa XMMWORD PTR 64[esp],xmm0 + pxor xmm7,xmm7 + movdqa XMMWORD PTR 80[esp],xmm0 mov esp,DWORD PTR 116[esp] pop edi pop esi @@ -1734,7 +1800,7 @@ $L_aesni_cbc_encrypt_begin:: mov edx,DWORD PTR 32[esp] mov ebp,DWORD PTR 36[esp] test eax,eax - jz $L072cbc_abort + jz $L074cbc_abort cmp DWORD PTR 40[esp],0 xchg ebx,esp movups xmm7,XMMWORD PTR [ebp] @@ -1742,14 +1808,14 @@ $L_aesni_cbc_encrypt_begin:: mov ebp,edx mov DWORD PTR 16[esp],ebx mov ebx,ecx - je $L073cbc_decrypt + je $L075cbc_decrypt movaps xmm2,xmm7 cmp eax,16 - jb $L074cbc_enc_tail + jb $L076cbc_enc_tail sub eax,16 - jmp $L075cbc_enc_loop + jmp $L077cbc_enc_loop ALIGN 16 -$L075cbc_enc_loop: +$L077cbc_enc_loop: movups xmm7,XMMWORD PTR [esi] lea esi,DWORD PTR 16[esi] movups xmm0,XMMWORD PTR [edx] @@ -1757,24 +1823,25 @@ $L075cbc_enc_loop: xorps xmm7,xmm0 lea edx,DWORD PTR 32[edx] xorps xmm2,xmm7 -$L076enc1_loop_15: +$L078enc1_loop_15: DB 102,15,56,220,209 dec ecx movups xmm1,XMMWORD PTR [edx] lea edx,DWORD PTR 16[edx] - jnz $L076enc1_loop_15 + jnz $L078enc1_loop_15 DB 102,15,56,221,209 mov ecx,ebx mov edx,ebp movups XMMWORD PTR [edi],xmm2 lea edi,DWORD PTR 16[edi] sub eax,16 - jnc $L075cbc_enc_loop + jnc $L077cbc_enc_loop add eax,16 - jnz $L074cbc_enc_tail + jnz $L076cbc_enc_tail movaps xmm7,xmm2 - jmp $L077cbc_ret -$L074cbc_enc_tail: + pxor xmm2,xmm2 + jmp $L079cbc_ret +$L076cbc_enc_tail: mov ecx,eax DD 2767451785 mov ecx,16 @@ -1785,20 +1852,20 @@ DD 2868115081 mov ecx,ebx mov esi,edi mov edx,ebp - jmp $L075cbc_enc_loop + jmp 
$L077cbc_enc_loop ALIGN 16 -$L073cbc_decrypt: +$L075cbc_decrypt: cmp eax,80 - jbe $L078cbc_dec_tail + jbe $L080cbc_dec_tail movaps XMMWORD PTR [esp],xmm7 sub eax,80 - jmp $L079cbc_dec_loop6_enter + jmp $L081cbc_dec_loop6_enter ALIGN 16 -$L080cbc_dec_loop6: +$L082cbc_dec_loop6: movaps XMMWORD PTR [esp],xmm0 movups XMMWORD PTR [edi],xmm7 lea edi,DWORD PTR 16[edi] -$L079cbc_dec_loop6_enter: +$L081cbc_dec_loop6_enter: movdqu xmm2,XMMWORD PTR [esi] movdqu xmm3,XMMWORD PTR 16[esi] movdqu xmm4,XMMWORD PTR 32[esi] @@ -1828,28 +1895,28 @@ $L079cbc_dec_loop6_enter: movups XMMWORD PTR 64[edi],xmm6 lea edi,DWORD PTR 80[edi] sub eax,96 - ja $L080cbc_dec_loop6 + ja $L082cbc_dec_loop6 movaps xmm2,xmm7 movaps xmm7,xmm0 add eax,80 - jle $L081cbc_dec_tail_collected + jle $L083cbc_dec_clear_tail_collected movups XMMWORD PTR [edi],xmm2 lea edi,DWORD PTR 16[edi] -$L078cbc_dec_tail: +$L080cbc_dec_tail: movups xmm2,XMMWORD PTR [esi] movaps xmm6,xmm2 cmp eax,16 - jbe $L082cbc_dec_one + jbe $L084cbc_dec_one movups xmm3,XMMWORD PTR 16[esi] movaps xmm5,xmm3 cmp eax,32 - jbe $L083cbc_dec_two + jbe $L085cbc_dec_two movups xmm4,XMMWORD PTR 32[esi] cmp eax,48 - jbe $L084cbc_dec_three + jbe $L086cbc_dec_three movups xmm5,XMMWORD PTR 48[esi] cmp eax,64 - jbe $L085cbc_dec_four + jbe $L087cbc_dec_four movups xmm6,XMMWORD PTR 64[esi] movaps XMMWORD PTR [esp],xmm7 movups xmm2,XMMWORD PTR [esi] @@ -1867,55 +1934,62 @@ $L078cbc_dec_tail: xorps xmm6,xmm0 movups XMMWORD PTR [edi],xmm2 movups XMMWORD PTR 16[edi],xmm3 + pxor xmm3,xmm3 movups XMMWORD PTR 32[edi],xmm4 + pxor xmm4,xmm4 movups XMMWORD PTR 48[edi],xmm5 + pxor xmm5,xmm5 lea edi,DWORD PTR 64[edi] movaps xmm2,xmm6 + pxor xmm6,xmm6 sub eax,80 - jmp $L081cbc_dec_tail_collected + jmp $L088cbc_dec_tail_collected ALIGN 16 -$L082cbc_dec_one: +$L084cbc_dec_one: movups xmm0,XMMWORD PTR [edx] movups xmm1,XMMWORD PTR 16[edx] lea edx,DWORD PTR 32[edx] xorps xmm2,xmm0 -$L086dec1_loop_16: +$L089dec1_loop_16: DB 102,15,56,222,209 dec ecx movups xmm1,XMMWORD PTR 
[edx] lea edx,DWORD PTR 16[edx] - jnz $L086dec1_loop_16 + jnz $L089dec1_loop_16 DB 102,15,56,223,209 xorps xmm2,xmm7 movaps xmm7,xmm6 sub eax,16 - jmp $L081cbc_dec_tail_collected + jmp $L088cbc_dec_tail_collected ALIGN 16 -$L083cbc_dec_two: +$L085cbc_dec_two: call __aesni_decrypt2 xorps xmm2,xmm7 xorps xmm3,xmm6 movups XMMWORD PTR [edi],xmm2 movaps xmm2,xmm3 + pxor xmm3,xmm3 lea edi,DWORD PTR 16[edi] movaps xmm7,xmm5 sub eax,32 - jmp $L081cbc_dec_tail_collected + jmp $L088cbc_dec_tail_collected ALIGN 16 -$L084cbc_dec_three: +$L086cbc_dec_three: call __aesni_decrypt3 xorps xmm2,xmm7 xorps xmm3,xmm6 xorps xmm4,xmm5 movups XMMWORD PTR [edi],xmm2 movaps xmm2,xmm4 + pxor xmm4,xmm4 movups XMMWORD PTR 16[edi],xmm3 + pxor xmm3,xmm3 lea edi,DWORD PTR 32[edi] movups xmm7,XMMWORD PTR 32[esi] sub eax,48 - jmp $L081cbc_dec_tail_collected + jmp $L088cbc_dec_tail_collected ALIGN 16 -$L085cbc_dec_four: +$L087cbc_dec_four: call __aesni_decrypt4 movups xmm1,XMMWORD PTR 16[esi] movups xmm0,XMMWORD PTR 32[esi] @@ -1925,28 +1999,44 @@ $L085cbc_dec_four: movups XMMWORD PTR [edi],xmm2 xorps xmm4,xmm1 movups XMMWORD PTR 16[edi],xmm3 + pxor xmm3,xmm3 xorps xmm5,xmm0 movups XMMWORD PTR 32[edi],xmm4 + pxor xmm4,xmm4 lea edi,DWORD PTR 48[edi] movaps xmm2,xmm5 + pxor xmm5,xmm5 sub eax,64 -$L081cbc_dec_tail_collected: + jmp $L088cbc_dec_tail_collected +ALIGN 16 +$L083cbc_dec_clear_tail_collected: + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + pxor xmm6,xmm6 +$L088cbc_dec_tail_collected: and eax,15 - jnz $L087cbc_dec_tail_partial + jnz $L090cbc_dec_tail_partial movups XMMWORD PTR [edi],xmm2 - jmp $L077cbc_ret + pxor xmm0,xmm0 + jmp $L079cbc_ret ALIGN 16 -$L087cbc_dec_tail_partial: +$L090cbc_dec_tail_partial: movaps XMMWORD PTR [esp],xmm2 + pxor xmm0,xmm0 mov ecx,16 mov esi,esp sub ecx,eax DD 2767451785 -$L077cbc_ret: + movdqa XMMWORD PTR [esp],xmm2 +$L079cbc_ret: mov esp,DWORD PTR 16[esp] mov ebp,DWORD PTR 36[esp] + pxor xmm2,xmm2 + pxor xmm1,xmm1 movups XMMWORD PTR [ebp],xmm7 
-$L072cbc_abort: + pxor xmm7,xmm7 +$L074cbc_abort: pop edi pop esi pop ebx @@ -1955,52 +2045,62 @@ $L072cbc_abort: _aesni_cbc_encrypt ENDP ALIGN 16 __aesni_set_encrypt_key PROC PRIVATE + push ebp + push ebx test eax,eax - jz $L088bad_pointer + jz $L091bad_pointer test edx,edx - jz $L088bad_pointer + jz $L091bad_pointer + call $L092pic +$L092pic: + pop ebx + lea ebx,DWORD PTR ($Lkey_const-$L092pic)[ebx] + lea ebp,DWORD PTR _OPENSSL_ia32cap_P movups xmm0,XMMWORD PTR [eax] xorps xmm4,xmm4 + mov ebp,DWORD PTR 4[ebp] lea edx,DWORD PTR 16[edx] + and ebp,268437504 cmp ecx,256 - je $L08914rounds + je $L09314rounds cmp ecx,192 - je $L09012rounds + je $L09412rounds cmp ecx,128 - jne $L091bad_keybits + jne $L095bad_keybits ALIGN 16 -$L09210rounds: +$L09610rounds: + cmp ebp,268435456 + je $L09710rounds_alt mov ecx,9 movups XMMWORD PTR [edx-16],xmm0 DB 102,15,58,223,200,1 - call $L093key_128_cold + call $L098key_128_cold DB 102,15,58,223,200,2 - call $L094key_128 + call $L099key_128 DB 102,15,58,223,200,4 - call $L094key_128 + call $L099key_128 DB 102,15,58,223,200,8 - call $L094key_128 + call $L099key_128 DB 102,15,58,223,200,16 - call $L094key_128 + call $L099key_128 DB 102,15,58,223,200,32 - call $L094key_128 + call $L099key_128 DB 102,15,58,223,200,64 - call $L094key_128 + call $L099key_128 DB 102,15,58,223,200,128 - call $L094key_128 + call $L099key_128 DB 102,15,58,223,200,27 - call $L094key_128 + call $L099key_128 DB 102,15,58,223,200,54 - call $L094key_128 + call $L099key_128 movups XMMWORD PTR [edx],xmm0 mov DWORD PTR 80[edx],ecx - xor eax,eax - ret + jmp $L100good_key ALIGN 16 -$L094key_128: +$L099key_128: movups XMMWORD PTR [edx],xmm0 lea edx,DWORD PTR 16[edx] -$L093key_128_cold: +$L098key_128_cold: shufps xmm4,xmm0,16 xorps xmm0,xmm4 shufps xmm4,xmm0,140 @@ -2009,38 +2109,91 @@ $L093key_128_cold: xorps xmm0,xmm1 ret ALIGN 16 -$L09012rounds: +$L09710rounds_alt: + movdqa xmm5,XMMWORD PTR [ebx] + mov ecx,8 + movdqa xmm4,XMMWORD PTR 32[ebx] + movdqa xmm2,xmm0 + movdqu 
XMMWORD PTR [edx-16],xmm0 +$L101loop_key128: +DB 102,15,56,0,197 +DB 102,15,56,221,196 + pslld xmm4,1 + lea edx,DWORD PTR 16[edx] + movdqa xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm2,xmm3 + pxor xmm0,xmm2 + movdqu XMMWORD PTR [edx-16],xmm0 + movdqa xmm2,xmm0 + dec ecx + jnz $L101loop_key128 + movdqa xmm4,XMMWORD PTR 48[ebx] +DB 102,15,56,0,197 +DB 102,15,56,221,196 + pslld xmm4,1 + movdqa xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm2,xmm3 + pxor xmm0,xmm2 + movdqu XMMWORD PTR [edx],xmm0 + movdqa xmm2,xmm0 +DB 102,15,56,0,197 +DB 102,15,56,221,196 + movdqa xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm2,xmm3 + pxor xmm0,xmm2 + movdqu XMMWORD PTR 16[edx],xmm0 + mov ecx,9 + mov DWORD PTR 96[edx],ecx + jmp $L100good_key +ALIGN 16 +$L09412rounds: movq xmm2,QWORD PTR 16[eax] + cmp ebp,268435456 + je $L10212rounds_alt mov ecx,11 movups XMMWORD PTR [edx-16],xmm0 DB 102,15,58,223,202,1 - call $L095key_192a_cold + call $L103key_192a_cold DB 102,15,58,223,202,2 - call $L096key_192b + call $L104key_192b DB 102,15,58,223,202,4 - call $L097key_192a + call $L105key_192a DB 102,15,58,223,202,8 - call $L096key_192b + call $L104key_192b DB 102,15,58,223,202,16 - call $L097key_192a + call $L105key_192a DB 102,15,58,223,202,32 - call $L096key_192b + call $L104key_192b DB 102,15,58,223,202,64 - call $L097key_192a + call $L105key_192a DB 102,15,58,223,202,128 - call $L096key_192b + call $L104key_192b movups XMMWORD PTR [edx],xmm0 mov DWORD PTR 48[edx],ecx - xor eax,eax - ret + jmp $L100good_key ALIGN 16 -$L097key_192a: +$L105key_192a: movups XMMWORD PTR [edx],xmm0 lea edx,DWORD PTR 16[edx] ALIGN 16 -$L095key_192a_cold: +$L103key_192a_cold: movaps xmm5,xmm2 -$L098key_192b_warm: +$L106key_192b_warm: shufps xmm4,xmm0,16 movdqa xmm3,xmm2 xorps xmm0,xmm4 @@ -2054,56 +2207,90 @@ $L098key_192b_warm: pxor xmm2,xmm3 ret 
ALIGN 16 -$L096key_192b: +$L104key_192b: movaps xmm3,xmm0 shufps xmm5,xmm0,68 movups XMMWORD PTR [edx],xmm5 shufps xmm3,xmm2,78 movups XMMWORD PTR 16[edx],xmm3 lea edx,DWORD PTR 32[edx] - jmp $L098key_192b_warm + jmp $L106key_192b_warm +ALIGN 16 +$L10212rounds_alt: + movdqa xmm5,XMMWORD PTR 16[ebx] + movdqa xmm4,XMMWORD PTR 32[ebx] + mov ecx,8 + movdqu XMMWORD PTR [edx-16],xmm0 +$L107loop_key192: + movq QWORD PTR [edx],xmm2 + movdqa xmm1,xmm2 +DB 102,15,56,0,213 +DB 102,15,56,221,212 + pslld xmm4,1 + lea edx,DWORD PTR 24[edx] + movdqa xmm3,xmm0 + pslldq xmm0,4 + pxor xmm3,xmm0 + pslldq xmm0,4 + pxor xmm3,xmm0 + pslldq xmm0,4 + pxor xmm0,xmm3 + pshufd xmm3,xmm0,255 + pxor xmm3,xmm1 + pslldq xmm1,4 + pxor xmm3,xmm1 + pxor xmm0,xmm2 + pxor xmm2,xmm3 + movdqu XMMWORD PTR [edx-16],xmm0 + dec ecx + jnz $L107loop_key192 + mov ecx,11 + mov DWORD PTR 32[edx],ecx + jmp $L100good_key ALIGN 16 -$L08914rounds: +$L09314rounds: movups xmm2,XMMWORD PTR 16[eax] - mov ecx,13 lea edx,DWORD PTR 16[edx] + cmp ebp,268435456 + je $L10814rounds_alt + mov ecx,13 movups XMMWORD PTR [edx-32],xmm0 movups XMMWORD PTR [edx-16],xmm2 DB 102,15,58,223,202,1 - call $L099key_256a_cold + call $L109key_256a_cold DB 102,15,58,223,200,1 - call $L100key_256b + call $L110key_256b DB 102,15,58,223,202,2 - call $L101key_256a + call $L111key_256a DB 102,15,58,223,200,2 - call $L100key_256b + call $L110key_256b DB 102,15,58,223,202,4 - call $L101key_256a + call $L111key_256a DB 102,15,58,223,200,4 - call $L100key_256b + call $L110key_256b DB 102,15,58,223,202,8 - call $L101key_256a + call $L111key_256a DB 102,15,58,223,200,8 - call $L100key_256b + call $L110key_256b DB 102,15,58,223,202,16 - call $L101key_256a + call $L111key_256a DB 102,15,58,223,200,16 - call $L100key_256b + call $L110key_256b DB 102,15,58,223,202,32 - call $L101key_256a + call $L111key_256a DB 102,15,58,223,200,32 - call $L100key_256b + call $L110key_256b DB 102,15,58,223,202,64 - call $L101key_256a + call $L111key_256a movups XMMWORD PTR 
[edx],xmm0 mov DWORD PTR 16[edx],ecx xor eax,eax - ret + jmp $L100good_key ALIGN 16 -$L101key_256a: +$L111key_256a: movups XMMWORD PTR [edx],xmm2 lea edx,DWORD PTR 16[edx] -$L099key_256a_cold: +$L109key_256a_cold: shufps xmm4,xmm0,16 xorps xmm0,xmm4 shufps xmm4,xmm0,140 @@ -2112,7 +2299,7 @@ $L099key_256a_cold: xorps xmm0,xmm1 ret ALIGN 16 -$L100key_256b: +$L110key_256b: movups XMMWORD PTR [edx],xmm0 lea edx,DWORD PTR 16[edx] shufps xmm4,xmm2,16 @@ -2122,13 +2309,70 @@ $L100key_256b: shufps xmm1,xmm1,170 xorps xmm2,xmm1 ret +ALIGN 16 +$L10814rounds_alt: + movdqa xmm5,XMMWORD PTR [ebx] + movdqa xmm4,XMMWORD PTR 32[ebx] + mov ecx,7 + movdqu XMMWORD PTR [edx-32],xmm0 + movdqa xmm1,xmm2 + movdqu XMMWORD PTR [edx-16],xmm2 +$L112loop_key256: +DB 102,15,56,0,213 +DB 102,15,56,221,212 + movdqa xmm3,xmm0 + pslldq xmm0,4 + pxor xmm3,xmm0 + pslldq xmm0,4 + pxor xmm3,xmm0 + pslldq xmm0,4 + pxor xmm0,xmm3 + pslld xmm4,1 + pxor xmm0,xmm2 + movdqu XMMWORD PTR [edx],xmm0 + dec ecx + jz $L113done_key256 + pshufd xmm2,xmm0,255 + pxor xmm3,xmm3 +DB 102,15,56,221,211 + movdqa xmm3,xmm1 + pslldq xmm1,4 + pxor xmm3,xmm1 + pslldq xmm1,4 + pxor xmm3,xmm1 + pslldq xmm1,4 + pxor xmm1,xmm3 + pxor xmm2,xmm1 + movdqu XMMWORD PTR 16[edx],xmm2 + lea edx,DWORD PTR 32[edx] + movdqa xmm1,xmm2 + jmp $L112loop_key256 +$L113done_key256: + mov ecx,13 + mov DWORD PTR 16[edx],ecx +$L100good_key: + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + xor eax,eax + pop ebx + pop ebp + ret ALIGN 4 -$L088bad_pointer: +$L091bad_pointer: mov eax,-1 + pop ebx + pop ebp ret ALIGN 4 -$L091bad_keybits: +$L095bad_keybits: + pxor xmm0,xmm0 mov eax,-2 + pop ebx + pop ebp ret __aesni_set_encrypt_key ENDP ALIGN 16 @@ -2150,7 +2394,7 @@ $L_aesni_set_decrypt_key_begin:: mov edx,DWORD PTR 12[esp] shl ecx,4 test eax,eax - jnz $L102dec_key_ret + jnz $L114dec_key_ret lea eax,DWORD PTR 16[ecx*1+edx] movups xmm0,XMMWORD PTR [edx] movups xmm1,XMMWORD PTR [eax] @@ -2158,7 +2402,7 
@@ $L_aesni_set_decrypt_key_begin:: movups XMMWORD PTR [edx],xmm1 lea edx,DWORD PTR 16[edx] lea eax,DWORD PTR [eax-16] -$L103dec_key_inverse: +$L115dec_key_inverse: movups xmm0,XMMWORD PTR [edx] movups xmm1,XMMWORD PTR [eax] DB 102,15,56,219,192 @@ -2168,17 +2412,28 @@ DB 102,15,56,219,201 movups XMMWORD PTR 16[eax],xmm0 movups XMMWORD PTR [edx-16],xmm1 cmp eax,edx - ja $L103dec_key_inverse + ja $L115dec_key_inverse movups xmm0,XMMWORD PTR [edx] DB 102,15,56,219,192 movups XMMWORD PTR [edx],xmm0 + pxor xmm0,xmm0 + pxor xmm1,xmm1 xor eax,eax -$L102dec_key_ret: +$L114dec_key_ret: ret _aesni_set_decrypt_key ENDP +ALIGN 64 +$Lkey_const:: +DD 202313229,202313229,202313229,202313229 +DD 67569157,67569157,67569157,67569157 +DD 1,1,1,1 +DD 27,27,27,27 DB 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69 DB 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83 DB 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 DB 115,108,46,111,114,103,62,0 .text$ ENDS +.bss SEGMENT 'BSS' +COMM _OPENSSL_ia32cap_P:DWORD:4 +.bss ENDS END diff --git a/deps/openssl/asm_obsolete/arm-void-gas/aes/aesv8-armx.S b/deps/openssl/asm_obsolete/arm-void-gas/aes/aesv8-armx.S index 732ba3d9c8..fd979d078f 100644 --- a/deps/openssl/asm_obsolete/arm-void-gas/aes/aesv8-armx.S +++ b/deps/openssl/asm_obsolete/arm-void-gas/aes/aesv8-armx.S @@ -230,17 +230,17 @@ aes_v8_encrypt: .Loop_enc: .byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0 - vld1.32 {q0},[r2]! .byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 + vld1.32 {q0},[r2]! subs r3,r3,#2 .byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1 - vld1.32 {q1},[r2]! .byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 + vld1.32 {q1},[r2]! bgt .Loop_enc .byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0 - vld1.32 {q0},[r2] .byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 + vld1.32 {q0},[r2] .byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1 veor q2,q2,q0 @@ -259,17 +259,17 @@ aes_v8_decrypt: .Loop_dec: .byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0 - vld1.32 {q0},[r2]! .byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 + vld1.32 {q0},[r2]! 
subs r3,r3,#2 .byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1 - vld1.32 {q1},[r2]! .byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 + vld1.32 {q1},[r2]! bgt .Loop_dec .byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0 - vld1.32 {q0},[r2] .byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 + vld1.32 {q0},[r2] .byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1 veor q2,q2,q0 @@ -313,16 +313,42 @@ aes_v8_cbc_encrypt: veor q5,q8,q7 beq .Lcbc_enc128 + vld1.32 {q2-q3},[r7] + add r7,r3,#16 + add r6,r3,#16*4 + add r12,r3,#16*5 + .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + add r14,r3,#16*6 + add r3,r3,#16*7 + b .Lenter_cbc_enc + +.align 4 .Loop_cbc_enc: .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 - vld1.32 {q8},[r7]! .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - subs r6,r6,#2 + vst1.8 {q6},[r1]! +.Lenter_cbc_enc: .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 - vld1.32 {q9},[r7]! .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - bgt .Loop_cbc_enc + .byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vld1.32 {q8},[r6] + cmp r5,#4 + .byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vld1.32 {q9},[r12] + beq .Lcbc_enc192 + + .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vld1.32 {q8},[r14] + .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 + .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vld1.32 {q9},[r3] + nop +.Lcbc_enc192: .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 subs r2,r2,#16 @@ -331,7 +357,6 @@ aes_v8_cbc_encrypt: moveq r8,#0 .byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - add r7,r3,#16 .byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 vld1.8 {q8},[r0],r8 @@ -340,16 +365,14 @@ aes_v8_cbc_encrypt: veor q8,q8,q5 .byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vld1.32 {q9},[r7]! 
@ re-pre-load rndkey[1] + vld1.32 {q9},[r7] @ re-pre-load rndkey[1] .byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 .byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 - - mov r6,r5 veor q6,q0,q7 - vst1.8 {q6},[r1]! bhs .Loop_cbc_enc + vst1.8 {q6},[r1]! b .Lcbc_done .align 5 @@ -407,79 +430,78 @@ aes_v8_cbc_encrypt: .Loop3x_cbc_dec: .byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8 - .byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 - .byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 - vld1.32 {q8},[r7]! .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + .byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + vld1.32 {q8},[r7]! subs r6,r6,#2 .byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9 - .byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 - .byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 - vld1.32 {q9},[r7]! .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + .byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + vld1.32 {q9},[r7]! 
bgt .Loop3x_cbc_dec .byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8 - .byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 - .byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 - veor q4,q6,q7 .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + .byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + veor q4,q6,q7 + subs r2,r2,#0x30 veor q5,q2,q7 + movlo r6,r2 @ r6, r6, is zero at this point .byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9 - .byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 - .byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 - veor q9,q3,q7 - subs r2,r2,#0x30 .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + .byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - vorr q6,q11,q11 - movlo r6,r2 @ r6, r6, is zero at this point - .byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12 - .byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12 - .byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12 + veor q9,q3,q7 add r0,r0,r6 @ r0 is adjusted in such way that @ at exit from the loop q1-q10 @ are loaded with last "words" + vorr q6,q11,q11 + mov r7,r3 + .byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12 .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12 .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + .byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - mov r7,r3 - .byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13 - .byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13 - .byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13 vld1.8 {q2},[r0]! + .byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13 .byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13 .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + .byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 vld1.8 {q3},[r0]! .byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14 - .byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14 - .byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14 - vld1.8 {q11},[r0]! 
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + .byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14 .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + .byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - vld1.32 {q8},[r7]! @ re-pre-load rndkey[0] + vld1.8 {q11},[r0]! .byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15 .byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15 .byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15 - + vld1.32 {q8},[r7]! @ re-pre-load rndkey[0] add r6,r5,#2 veor q4,q4,q0 veor q5,q5,q1 veor q10,q10,q9 vld1.32 {q9},[r7]! @ re-pre-load rndkey[1] - vorr q0,q2,q2 vst1.8 {q4},[r1]! - vorr q1,q3,q3 + vorr q0,q2,q2 vst1.8 {q5},[r1]! + vorr q1,q3,q3 vst1.8 {q10},[r1]! vorr q10,q11,q11 bhs .Loop3x_cbc_dec @@ -490,39 +512,39 @@ aes_v8_cbc_encrypt: .Lcbc_dec_tail: .byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 - .byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 - vld1.32 {q8},[r7]! .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + .byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + vld1.32 {q8},[r7]! subs r6,r6,#2 .byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 - .byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 - vld1.32 {q9},[r7]! .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + .byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + vld1.32 {q9},[r7]! 
bgt .Lcbc_dec_tail .byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 - .byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + .byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 .byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 - .byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + .byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 .byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12 - .byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12 .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + .byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 cmn r2,#0x20 .byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13 - .byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13 .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + .byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 veor q5,q6,q7 .byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14 - .byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14 .byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + .byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14 .byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 veor q9,q3,q7 .byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15 @@ -590,70 +612,69 @@ aes_v8_ctr32_encrypt_blocks: .align 4 .Loop3x_ctr32: .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 - .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 - .byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8 - vld1.32 {q8},[r7]! .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + .byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8 .byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10 + vld1.32 {q8},[r7]! subs r6,r6,#2 .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 - .byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 - .byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9 - vld1.32 {q9},[r7]! .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + .byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9 .byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10 + vld1.32 {q9},[r7]! 
bgt .Loop3x_ctr32 .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 - .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 - .byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8 - mov r7,r3 .byte 0x80,0x83,0xb0,0xf3 @ aesmc q4,q0 - vld1.8 {q2},[r0]! + .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 .byte 0x82,0xa3,0xb0,0xf3 @ aesmc q5,q1 - .byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10 + vld1.8 {q2},[r0]! vorr q0,q6,q6 - .byte 0x22,0x83,0xb0,0xf3 @ aese q4,q9 + .byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8 + .byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10 vld1.8 {q3},[r0]! - .byte 0x22,0xa3,0xb0,0xf3 @ aese q5,q9 - .byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9 vorr q1,q6,q6 + .byte 0x22,0x83,0xb0,0xf3 @ aese q4,q9 .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 - vld1.8 {q11},[r0]! + .byte 0x22,0xa3,0xb0,0xf3 @ aese q5,q9 .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 + vld1.8 {q11},[r0]! + mov r7,r3 + .byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9 .byte 0xa4,0x23,0xf0,0xf3 @ aesmc q9,q10 vorr q10,q6,q6 add r9,r8,#1 .byte 0x28,0x83,0xb0,0xf3 @ aese q4,q12 + .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 .byte 0x28,0xa3,0xb0,0xf3 @ aese q5,q12 - .byte 0x28,0x23,0xf0,0xf3 @ aese q9,q12 + .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 veor q2,q2,q7 add r10,r8,#2 - .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 - .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 + .byte 0x28,0x23,0xf0,0xf3 @ aese q9,q12 .byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9 veor q3,q3,q7 add r8,r8,#3 .byte 0x2a,0x83,0xb0,0xf3 @ aese q4,q13 + .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 .byte 0x2a,0xa3,0xb0,0xf3 @ aese q5,q13 - .byte 0x2a,0x23,0xf0,0xf3 @ aese q9,q13 + .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 veor q11,q11,q7 rev r9,r9 - .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 - vld1.32 {q8},[r7]! 
@ re-pre-load rndkey[0] - .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 + .byte 0x2a,0x23,0xf0,0xf3 @ aese q9,q13 .byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9 vmov.32 d1[1], r9 rev r10,r10 .byte 0x2c,0x83,0xb0,0xf3 @ aese q4,q14 + .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 .byte 0x2c,0xa3,0xb0,0xf3 @ aese q5,q14 - .byte 0x2c,0x23,0xf0,0xf3 @ aese q9,q14 + .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 vmov.32 d3[1], r10 rev r12,r8 - .byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 - .byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 + .byte 0x2c,0x23,0xf0,0xf3 @ aese q9,q14 .byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9 vmov.32 d21[1], r12 subs r2,r2,#3 @@ -661,13 +682,14 @@ aes_v8_ctr32_encrypt_blocks: .byte 0x2e,0xa3,0xb0,0xf3 @ aese q5,q15 .byte 0x2e,0x23,0xf0,0xf3 @ aese q9,q15 - mov r6,r5 veor q2,q2,q4 + vld1.32 {q8},[r7]! @ re-pre-load rndkey[0] + vst1.8 {q2},[r1]! veor q3,q3,q5 + mov r6,r5 + vst1.8 {q3},[r1]! veor q11,q11,q9 vld1.32 {q9},[r7]! @ re-pre-load rndkey[1] - vst1.8 {q2},[r1]! - vst1.8 {q3},[r1]! vst1.8 {q11},[r1]! bhs .Loop3x_ctr32 @@ -679,40 +701,40 @@ aes_v8_ctr32_encrypt_blocks: .Lctr32_tail: .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 - .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 - vld1.32 {q8},[r7]! .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + vld1.32 {q8},[r7]! subs r6,r6,#2 .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 - .byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 - vld1.32 {q9},[r7]! .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + vld1.32 {q9},[r7]! 
bgt .Lctr32_tail .byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 - .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 .byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 - .byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 vld1.8 {q2},[r0],r12 .byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 - .byte 0x28,0x23,0xb0,0xf3 @ aese q1,q12 - vld1.8 {q3},[r0] .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x28,0x23,0xb0,0xf3 @ aese q1,q12 .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + vld1.8 {q3},[r0] .byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 - .byte 0x2a,0x23,0xb0,0xf3 @ aese q1,q13 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x2a,0x23,0xb0,0xf3 @ aese q1,q13 .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 - .byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 - .byte 0x2c,0x23,0xb0,0xf3 @ aese q1,q14 veor q2,q2,q7 + .byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 .byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + .byte 0x2c,0x23,0xb0,0xf3 @ aese q1,q14 .byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 veor q3,q3,q7 .byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 diff --git a/deps/openssl/asm_obsolete/arm-void-gas/modes/ghash-armv4.S b/deps/openssl/asm_obsolete/arm-void-gas/modes/ghash-armv4.S index d321235f79..c54f514997 100644 --- a/deps/openssl/asm_obsolete/arm-void-gas/modes/ghash-armv4.S +++ b/deps/openssl/asm_obsolete/arm-void-gas/modes/ghash-armv4.S @@ -495,7 +495,7 @@ gcm_ghash_neon: veor q10,q10,q9 @ vshl.i64 q9,q0,#63 veor q10, q10, q9 @ - veor d1,d1,d20 @ + veor d1,d1,d20 @ veor d4,d4,d21 vshr.u64 q10,q0,#1 @ 2nd phase diff --git a/deps/openssl/asm_obsolete/arm-void-gas/modes/ghashv8-armx.S b/deps/openssl/asm_obsolete/arm-void-gas/modes/ghashv8-armx.S index 570d9175c4..269574945f 100644 --- a/deps/openssl/asm_obsolete/arm-void-gas/modes/ghashv8-armx.S +++ b/deps/openssl/asm_obsolete/arm-void-gas/modes/ghashv8-armx.S @@ -7,109 +7,223 
@@ .type gcm_init_v8,%function .align 4 gcm_init_v8: - vld1.64 {q9},[r1] @ load H - vmov.i8 q8,#0xe1 + vld1.64 {q9},[r1] @ load input H + vmov.i8 q11,#0xe1 + vshl.i64 q11,q11,#57 @ 0xc2.0 vext.8 q3,q9,q9,#8 - vshl.i64 q8,q8,#57 - vshr.u64 q10,q8,#63 - vext.8 q8,q10,q8,#8 @ t0=0xc2....01 + vshr.u64 q10,q11,#63 vdup.32 q9,d18[1] - vshr.u64 q11,q3,#63 + vext.8 q8,q10,q11,#8 @ t0=0xc2....01 + vshr.u64 q10,q3,#63 vshr.s32 q9,q9,#31 @ broadcast carry bit - vand q11,q11,q8 + vand q10,q10,q8 vshl.i64 q3,q3,#1 - vext.8 q11,q11,q11,#8 + vext.8 q10,q10,q10,#8 vand q8,q8,q9 - vorr q3,q3,q11 @ H<<<=1 - veor q3,q3,q8 @ twisted H - vst1.64 {q3},[r0] + vorr q3,q3,q10 @ H<<<=1 + veor q12,q3,q8 @ twisted H + vst1.64 {q12},[r0]! @ store Htable[0] + + @ calculate H^2 + vext.8 q8,q12,q12,#8 @ Karatsuba pre-processing + .byte 0xa8,0x0e,0xa8,0xf2 @ pmull q0,q12,q12 + veor q8,q8,q12 + .byte 0xa9,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q12 + .byte 0xa0,0x2e,0xa0,0xf2 @ pmull q1,q8,q8 + + vext.8 q9,q0,q2,#8 @ Karatsuba post-processing + veor q10,q0,q2 + veor q1,q1,q9 + veor q1,q1,q10 + .byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase + + vmov d4,d3 @ Xh|Xm - 256-bit result + vmov d3,d0 @ Xm is rotated Xl + veor q0,q1,q10 + + vext.8 q10,q0,q0,#8 @ 2nd phase + .byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 + veor q10,q10,q2 + veor q14,q0,q10 + + vext.8 q9,q14,q14,#8 @ Karatsuba pre-processing + veor q9,q9,q14 + vext.8 q13,q8,q9,#8 @ pack Karatsuba pre-processed + vst1.64 {q13-q14},[r0] @ store Htable[1..2] bx lr .size gcm_init_v8,.-gcm_init_v8 - .global gcm_gmult_v8 .type gcm_gmult_v8,%function .align 4 gcm_gmult_v8: vld1.64 {q9},[r0] @ load Xi vmov.i8 q11,#0xe1 - vld1.64 {q12},[r1] @ load twisted H + vld1.64 {q12-q13},[r1] @ load twisted H, ... 
vshl.u64 q11,q11,#57 #ifndef __ARMEB__ vrev64.8 q9,q9 #endif - vext.8 q13,q12,q12,#8 - mov r3,#0 vext.8 q3,q9,q9,#8 - mov r12,#0 - veor q13,q13,q12 @ Karatsuba pre-processing - mov r2,r0 - b .Lgmult_v8 -.size gcm_gmult_v8,.-gcm_gmult_v8 + .byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo + veor q9,q9,q3 @ Karatsuba pre-processing + .byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi + .byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) + + vext.8 q9,q0,q2,#8 @ Karatsuba post-processing + veor q10,q0,q2 + veor q1,q1,q9 + veor q1,q1,q10 + .byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction + + vmov d4,d3 @ Xh|Xm - 256-bit result + vmov d3,d0 @ Xm is rotated Xl + veor q0,q1,q10 + + vext.8 q10,q0,q0,#8 @ 2nd phase of reduction + .byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 + veor q10,q10,q2 + veor q0,q0,q10 + +#ifndef __ARMEB__ + vrev64.8 q0,q0 +#endif + vext.8 q0,q0,q0,#8 + vst1.64 {q0},[r0] @ write out Xi + + bx lr +.size gcm_gmult_v8,.-gcm_gmult_v8 .global gcm_ghash_v8 .type gcm_ghash_v8,%function .align 4 gcm_ghash_v8: + vstmdb sp!,{d8-d15} @ 32-bit ABI says so vld1.64 {q0},[r0] @ load [rotated] Xi - subs r3,r3,#16 + @ "[rotated]" means that + @ loaded value would have + @ to be rotated in order to + @ make it appear as in + @ algorithm specification + subs r3,r3,#32 @ see if r3 is 32 or larger + mov r12,#16 @ r12 is used as post- + @ increment for input pointer; + @ as loop is modulo-scheduled + @ r12 is zeroed just in time + @ to preclude overstepping + @ inp[len], which means that + @ last block[s] are actually + @ loaded twice, but last + @ copy is not processed + vld1.64 {q12-q13},[r1]! @ load twisted H, ..., H^2 vmov.i8 q11,#0xe1 - mov r12,#16 - vld1.64 {q12},[r1] @ load twisted H - moveq r12,#0 - vext.8 q0,q0,q0,#8 - vshl.u64 q11,q11,#57 - vld1.64 {q9},[r2],r12 @ load [rotated] inp - vext.8 q13,q12,q12,#8 + vld1.64 {q14},[r1] + moveq r12,#0 @ is it time to zero r12? 
+ vext.8 q0,q0,q0,#8 @ rotate Xi + vld1.64 {q8},[r2]! @ load [rotated] I[0] + vshl.u64 q11,q11,#57 @ compose 0xc2.0 constant #ifndef __ARMEB__ + vrev64.8 q8,q8 vrev64.8 q0,q0 +#endif + vext.8 q3,q8,q8,#8 @ rotate I[0] + blo .Lodd_tail_v8 @ r3 was less than 32 + vld1.64 {q9},[r2],r12 @ load [rotated] I[1] +#ifndef __ARMEB__ vrev64.8 q9,q9 #endif - veor q13,q13,q12 @ Karatsuba pre-processing - vext.8 q3,q9,q9,#8 - b .Loop_v8 + vext.8 q7,q9,q9,#8 + veor q3,q3,q0 @ I[i]^=Xi + .byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1 + veor q9,q9,q7 @ Karatsuba pre-processing + .byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7 + b .Loop_mod2x_v8 .align 4 -.Loop_v8: +.Loop_mod2x_v8: + vext.8 q10,q3,q3,#8 + subs r3,r3,#32 @ is there more data? + .byte 0x86,0x0e,0xac,0xf2 @ pmull q0,q14,q3 @ H^2.lo·Xi.lo + movlo r12,#0 @ is it time to zero r12? + + .byte 0xa2,0xae,0xaa,0xf2 @ pmull q5,q13,q9 + veor q10,q10,q3 @ Karatsuba pre-processing + .byte 0x87,0x4e,0xad,0xf2 @ pmull2 q2,q14,q3 @ H^2.hi·Xi.hi + veor q0,q0,q4 @ accumulate + .byte 0xa5,0x2e,0xab,0xf2 @ pmull2 q1,q13,q10 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) + vld1.64 {q8},[r2],r12 @ load [rotated] I[i+2] + + veor q2,q2,q6 + moveq r12,#0 @ is it time to zero r12? 
+ veor q1,q1,q5 + + vext.8 q9,q0,q2,#8 @ Karatsuba post-processing + veor q10,q0,q2 + veor q1,q1,q9 + vld1.64 {q9},[r2],r12 @ load [rotated] I[i+3] +#ifndef __ARMEB__ + vrev64.8 q8,q8 +#endif + veor q1,q1,q10 + .byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction + +#ifndef __ARMEB__ + vrev64.8 q9,q9 +#endif + vmov d4,d3 @ Xh|Xm - 256-bit result + vmov d3,d0 @ Xm is rotated Xl + vext.8 q7,q9,q9,#8 + vext.8 q3,q8,q8,#8 + veor q0,q1,q10 + .byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1 + veor q3,q3,q2 @ accumulate q3 early + + vext.8 q10,q0,q0,#8 @ 2nd phase of reduction + .byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 + veor q3,q3,q10 + veor q9,q9,q7 @ Karatsuba pre-processing + veor q3,q3,q0 + .byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7 + bhs .Loop_mod2x_v8 @ there was at least 32 more bytes + + veor q2,q2,q10 + vext.8 q3,q8,q8,#8 @ re-construct q3 + adds r3,r3,#32 @ re-construct r3 + veor q0,q0,q2 @ re-construct q0 + beq .Ldone_v8 @ is r3 zero? +.Lodd_tail_v8: vext.8 q10,q0,q0,#8 veor q3,q3,q0 @ inp^=Xi - veor q9,q9,q10 @ q9 is rotated inp^Xi + veor q9,q8,q10 @ q9 is rotated inp^Xi -.Lgmult_v8: .byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo veor q9,q9,q3 @ Karatsuba pre-processing .byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi - subs r3,r3,#16 .byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) - moveq r12,#0 vext.8 q9,q0,q2,#8 @ Karatsuba post-processing veor q10,q0,q2 veor q1,q1,q9 - vld1.64 {q9},[r2],r12 @ load [rotated] inp veor q1,q1,q10 - .byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase + .byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction vmov d4,d3 @ Xh|Xm - 256-bit result vmov d3,d0 @ Xm is rotated Xl -#ifndef __ARMEB__ - vrev64.8 q9,q9 -#endif veor q0,q1,q10 - vext.8 q3,q9,q9,#8 - vext.8 q10,q0,q0,#8 @ 2nd phase + vext.8 q10,q0,q0,#8 @ 2nd phase of reduction .byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 veor q10,q10,q2 veor q0,q0,q10 - bhs .Loop_v8 +.Ldone_v8: #ifndef 
__ARMEB__ vrev64.8 q0,q0 #endif vext.8 q0,q0,q0,#8 vst1.64 {q0},[r0] @ write out Xi + vldmia sp!,{d8-d15} @ 32-bit ABI says so bx lr .size gcm_ghash_v8,.-gcm_ghash_v8 .asciz "GHASH for ARMv8, CRYPTOGAMS by " diff --git a/deps/openssl/asm_obsolete/arm-void-gas/sha/sha256-armv4.S b/deps/openssl/asm_obsolete/arm-void-gas/sha/sha256-armv4.S index bf1ce4f997..683f1cc0c8 100644 --- a/deps/openssl/asm_obsolete/arm-void-gas/sha/sha256-armv4.S +++ b/deps/openssl/asm_obsolete/arm-void-gas/sha/sha256-armv4.S @@ -1,7 +1,59 @@ -#include "arm_arch.h" + +@ ==================================================================== +@ Written by Andy Polyakov for the OpenSSL +@ project. The module is, however, dual licensed under OpenSSL and +@ CRYPTOGAMS licenses depending on where you obtain it. For further +@ details see http://www.openssl.org/~appro/cryptogams/. +@ +@ Permission to use under GPL terms is granted. +@ ==================================================================== + +@ SHA256 block procedure for ARMv4. May 2007. + +@ Performance is ~2x better than gcc 3.4 generated code and in "abso- +@ lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per +@ byte [on single-issue Xscale PXA250 core]. + +@ July 2010. +@ +@ Rescheduling for dual-issue pipeline resulted in 22% improvement on +@ Cortex A8 core and ~20 cycles per processed byte. + +@ February 2011. +@ +@ Profiler-assisted and platform-specific optimization resulted in 16% +@ improvement on Cortex A8 core and ~15.4 cycles per processed byte. + +@ September 2013. +@ +@ Add NEON implementation. On Cortex A8 it was measured to process one +@ byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon +@ S4 does it in 12.5 cycles too, but it's 50% faster than integer-only +@ code (meaning that latter performs sub-optimally, nothing was done +@ about it). + +@ May 2014. +@ +@ Add ARMv8 code path performing at 2.0 cpb on Apple A7. 
+ +#ifndef __KERNEL__ +# include "arm_arch.h" +#else +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ 7 +#endif .text +#if __ARM_ARCH__<7 .code 32 +#else +.syntax unified +# ifdef __thumb2__ +.thumb +# else +.code 32 +# endif +#endif .type K256,%object .align 5 @@ -24,7 +76,7 @@ K256: .word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .size K256,.-K256 .word 0 @ terminator -#if __ARM_MAX_ARCH__>=7 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) .LOPENSSL_armcap: .word OPENSSL_armcap_P-sha256_block_data_order #endif @@ -33,9 +85,12 @@ K256: .global sha256_block_data_order .type sha256_block_data_order,%function sha256_block_data_order: +#if __ARM_ARCH__<7 sub r3,pc,#8 @ sha256_block_data_order - add r2,r1,r2,lsl#6 @ len to point at the end of inp -#if __ARM_MAX_ARCH__>=7 +#else + adr r3,sha256_block_data_order +#endif +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) ldr r12,.LOPENSSL_armcap ldr r12,[r3,r12] @ OPENSSL_armcap_P tst r12,#ARMV8_SHA256 @@ -43,6 +98,7 @@ sha256_block_data_order: tst r12,#ARMV7_NEON bne .LNEON #endif + add r2,r1,r2,lsl#6 @ len to point at the end of inp stmdb sp!,{r0,r1,r2,r4-r11,lr} ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11} sub r14,r3,#256+32 @ K256 @@ -1736,6 +1792,9 @@ sha256_block_data_order: eor r12,r12,r6 @ Maj(a,b,c) add r4,r4,r0,ror#2 @ h+=Sigma0(a) @ add r4,r4,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + ite eq @ Thumb2 thing, sanity check in ARM +#endif ldreq r3,[sp,#16*4] @ pull ctx bne .Lrounds_16_xx @@ -1777,16 +1836,19 @@ sha256_block_data_order: .arch armv7-a .fpu neon +.global sha256_block_data_order_neon .type sha256_block_data_order_neon,%function .align 4 sha256_block_data_order_neon: .LNEON: stmdb sp!,{r4-r12,lr} + sub r11,sp,#16*4+16 + adr r14,K256 + bic r11,r11,#15 @ align for 128-bit stores mov r12,sp - sub sp,sp,#16*4+16 @ alloca - sub r14,r3,#256+32 @ K256 - bic sp,sp,#15 @ align for 128-bit stores + mov sp,r11 @ alloca + add r2,r1,r2,lsl#6 @ len to point at the end of inp vld1.8 {q0},[r1]! 
vld1.8 {q1},[r1]! @@ -2224,11 +2286,13 @@ sha256_block_data_order_neon: ldr r0,[sp,#72] sub r14,r14,#256 @ rewind r14 teq r1,r0 + it eq subeq r1,r1,#64 @ avoid SEGV vld1.8 {q0},[r1]! @ load next input block vld1.8 {q1},[r1]! vld1.8 {q2},[r1]! vld1.8 {q3},[r1]! + it ne strne r1,[sp,#68] mov r1,sp add r11,r11,r2 @@ -2542,23 +2606,38 @@ sha256_block_data_order_neon: str r7,[r2],#4 stmia r2,{r8-r11} + ittte ne movne r1,sp ldrne r2,[sp,#0] eorne r12,r12,r12 ldreq sp,[sp,#76] @ restore original sp + itt ne eorne r3,r5,r6 bne .L_00_48 ldmia sp!,{r4-r12,pc} .size sha256_block_data_order_neon,.-sha256_block_data_order_neon #endif -#if __ARM_MAX_ARCH__>=7 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + +# ifdef __thumb2__ +# define INST(a,b,c,d) .byte c,d|0xc,a,b +# else +# define INST(a,b,c,d) .byte a,b,c,d +# endif + .type sha256_block_data_order_armv8,%function .align 5 sha256_block_data_order_armv8: .LARMv8: vld1.32 {q0,q1},[r0] - sub r3,r3,#sha256_block_data_order-K256 +# ifdef __thumb2__ + adr r3,.LARMv8 + sub r3,r3,#.LARMv8-K256 +# else + adrl r3,K256 +# endif + add r2,r1,r2,lsl#6 @ len to point at the end of inp .Loop_v8: vld1.8 {q8-q9},[r1]! @@ -2573,114 +2652,115 @@ sha256_block_data_order_armv8: teq r1,r2 vld1.32 {q13},[r3]! vadd.i32 q12,q12,q8 - .byte 0xe2,0x03,0xfa,0xf3 @ sha256su0 q8,q9 + INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 vmov q2,q0 - .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12 - .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12 - .byte 0xe6,0x0c,0x64,0xf3 @ sha256su1 q8,q10,q11 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 vld1.32 {q12},[r3]! 
vadd.i32 q13,q13,q9 - .byte 0xe4,0x23,0xfa,0xf3 @ sha256su0 q9,q10 + INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 vmov q2,q0 - .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13 - .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13 - .byte 0xe0,0x2c,0x66,0xf3 @ sha256su1 q9,q11,q8 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 vld1.32 {q13},[r3]! vadd.i32 q12,q12,q10 - .byte 0xe6,0x43,0xfa,0xf3 @ sha256su0 q10,q11 + INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 vmov q2,q0 - .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12 - .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12 - .byte 0xe2,0x4c,0x60,0xf3 @ sha256su1 q10,q8,q9 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 vld1.32 {q12},[r3]! vadd.i32 q13,q13,q11 - .byte 0xe0,0x63,0xfa,0xf3 @ sha256su0 q11,q8 + INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 vmov q2,q0 - .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13 - .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13 - .byte 0xe4,0x6c,0x62,0xf3 @ sha256su1 q11,q9,q10 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 vld1.32 {q13},[r3]! vadd.i32 q12,q12,q8 - .byte 0xe2,0x03,0xfa,0xf3 @ sha256su0 q8,q9 + INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 vmov q2,q0 - .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12 - .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12 - .byte 0xe6,0x0c,0x64,0xf3 @ sha256su1 q8,q10,q11 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 vld1.32 {q12},[r3]! 
vadd.i32 q13,q13,q9 - .byte 0xe4,0x23,0xfa,0xf3 @ sha256su0 q9,q10 + INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 vmov q2,q0 - .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13 - .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13 - .byte 0xe0,0x2c,0x66,0xf3 @ sha256su1 q9,q11,q8 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 vld1.32 {q13},[r3]! vadd.i32 q12,q12,q10 - .byte 0xe6,0x43,0xfa,0xf3 @ sha256su0 q10,q11 + INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 vmov q2,q0 - .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12 - .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12 - .byte 0xe2,0x4c,0x60,0xf3 @ sha256su1 q10,q8,q9 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 vld1.32 {q12},[r3]! vadd.i32 q13,q13,q11 - .byte 0xe0,0x63,0xfa,0xf3 @ sha256su0 q11,q8 + INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 vmov q2,q0 - .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13 - .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13 - .byte 0xe4,0x6c,0x62,0xf3 @ sha256su1 q11,q9,q10 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 vld1.32 {q13},[r3]! vadd.i32 q12,q12,q8 - .byte 0xe2,0x03,0xfa,0xf3 @ sha256su0 q8,q9 + INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 vmov q2,q0 - .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12 - .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12 - .byte 0xe6,0x0c,0x64,0xf3 @ sha256su1 q8,q10,q11 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 vld1.32 {q12},[r3]! 
vadd.i32 q13,q13,q9 - .byte 0xe4,0x23,0xfa,0xf3 @ sha256su0 q9,q10 + INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 vmov q2,q0 - .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13 - .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13 - .byte 0xe0,0x2c,0x66,0xf3 @ sha256su1 q9,q11,q8 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 vld1.32 {q13},[r3]! vadd.i32 q12,q12,q10 - .byte 0xe6,0x43,0xfa,0xf3 @ sha256su0 q10,q11 + INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 vmov q2,q0 - .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12 - .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12 - .byte 0xe2,0x4c,0x60,0xf3 @ sha256su1 q10,q8,q9 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 vld1.32 {q12},[r3]! vadd.i32 q13,q13,q11 - .byte 0xe0,0x63,0xfa,0xf3 @ sha256su0 q11,q8 + INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 vmov q2,q0 - .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13 - .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13 - .byte 0xe4,0x6c,0x62,0xf3 @ sha256su1 q11,q9,q10 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 vld1.32 {q13},[r3]! vadd.i32 q12,q12,q8 vmov q2,q0 - .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12 - .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 vld1.32 {q12},[r3]! 
vadd.i32 q13,q13,q9 vmov q2,q0 - .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13 - .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 vld1.32 {q13},[r3] vadd.i32 q12,q12,q10 sub r3,r3,#256-16 @ rewind vmov q2,q0 - .byte 0x68,0x0c,0x02,0xf3 @ sha256h q0,q1,q12 - .byte 0x68,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q12 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 vadd.i32 q13,q13,q11 vmov q2,q0 - .byte 0x6a,0x0c,0x02,0xf3 @ sha256h q0,q1,q13 - .byte 0x6a,0x2c,0x14,0xf3 @ sha256h2 q1,q2,q13 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 vadd.i32 q0,q0,q14 vadd.i32 q1,q1,q15 + it ne bne .Loop_v8 vst1.32 {q0,q1},[r0] @@ -2690,6 +2770,6 @@ sha256_block_data_order_armv8: #endif .asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by " .align 2 -#if __ARM_MAX_ARCH__>=7 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) .comm OPENSSL_armcap_P,4,4 #endif diff --git a/deps/openssl/asm_obsolete/arm64-linux64-gas/aes/aesv8-armx.S b/deps/openssl/asm_obsolete/arm64-linux64-gas/aes/aesv8-armx.S index 0a4b1ac4c4..f5dd6cbb86 100644 --- a/deps/openssl/asm_obsolete/arm64-linux64-gas/aes/aesv8-armx.S +++ b/deps/openssl/asm_obsolete/arm64-linux64-gas/aes/aesv8-armx.S @@ -227,17 +227,17 @@ aes_v8_encrypt: .Loop_enc: aese v2.16b,v0.16b - ld1 {v0.4s},[x2],#16 aesmc v2.16b,v2.16b + ld1 {v0.4s},[x2],#16 subs w3,w3,#2 aese v2.16b,v1.16b - ld1 {v1.4s},[x2],#16 aesmc v2.16b,v2.16b + ld1 {v1.4s},[x2],#16 b.gt .Loop_enc aese v2.16b,v0.16b - ld1 {v0.4s},[x2] aesmc v2.16b,v2.16b + ld1 {v0.4s},[x2] aese v2.16b,v1.16b eor v2.16b,v2.16b,v0.16b @@ -256,17 +256,17 @@ aes_v8_decrypt: .Loop_dec: aesd v2.16b,v0.16b - ld1 {v0.4s},[x2],#16 aesimc v2.16b,v2.16b + ld1 {v0.4s},[x2],#16 subs w3,w3,#2 aesd v2.16b,v1.16b - ld1 {v1.4s},[x2],#16 aesimc v2.16b,v2.16b + ld1 {v1.4s},[x2],#16 b.gt .Loop_dec aesd v2.16b,v0.16b - ld1 
{v0.4s},[x2] aesimc v2.16b,v2.16b + ld1 {v0.4s},[x2] aesd v2.16b,v1.16b eor v2.16b,v2.16b,v0.16b @@ -308,16 +308,42 @@ aes_v8_cbc_encrypt: eor v5.16b,v16.16b,v7.16b b.eq .Lcbc_enc128 + ld1 {v2.4s-v3.4s},[x7] + add x7,x3,#16 + add x6,x3,#16*4 + add x12,x3,#16*5 + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + add x14,x3,#16*6 + add x3,x3,#16*7 + b .Lenter_cbc_enc + +.align 4 .Loop_cbc_enc: aese v0.16b,v16.16b - ld1 {v16.4s},[x7],#16 aesmc v0.16b,v0.16b - subs w6,w6,#2 + st1 {v6.16b},[x1],#16 +.Lenter_cbc_enc: aese v0.16b,v17.16b - ld1 {v17.4s},[x7],#16 aesmc v0.16b,v0.16b - b.gt .Loop_cbc_enc + aese v0.16b,v2.16b + aesmc v0.16b,v0.16b + ld1 {v16.4s},[x6] + cmp w5,#4 + aese v0.16b,v3.16b + aesmc v0.16b,v0.16b + ld1 {v17.4s},[x12] + b.eq .Lcbc_enc192 + + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + ld1 {v16.4s},[x14] + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + ld1 {v17.4s},[x3] + nop +.Lcbc_enc192: aese v0.16b,v16.16b aesmc v0.16b,v0.16b subs x2,x2,#16 @@ -326,7 +352,6 @@ aes_v8_cbc_encrypt: csel x8,xzr,x8,eq aese v0.16b,v18.16b aesmc v0.16b,v0.16b - add x7,x3,#16 aese v0.16b,v19.16b aesmc v0.16b,v0.16b ld1 {v16.16b},[x0],x8 @@ -335,16 +360,14 @@ aes_v8_cbc_encrypt: eor v16.16b,v16.16b,v5.16b aese v0.16b,v21.16b aesmc v0.16b,v0.16b - ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] + ld1 {v17.4s},[x7] // re-pre-load rndkey[1] aese v0.16b,v22.16b aesmc v0.16b,v0.16b aese v0.16b,v23.16b - - mov w6,w5 eor v6.16b,v0.16b,v7.16b - st1 {v6.16b},[x1],#16 b.hs .Loop_cbc_enc + st1 {v6.16b},[x1],#16 b .Lcbc_done .align 5 @@ -402,79 +425,78 @@ aes_v8_cbc_encrypt: .Loop3x_cbc_dec: aesd v0.16b,v16.16b - aesd v1.16b,v16.16b - aesd v18.16b,v16.16b - ld1 {v16.4s},[x7],#16 aesimc v0.16b,v0.16b + aesd v1.16b,v16.16b aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b aesimc v18.16b,v18.16b + ld1 {v16.4s},[x7],#16 subs w6,w6,#2 aesd v0.16b,v17.16b - aesd v1.16b,v17.16b - aesd v18.16b,v17.16b - ld1 {v17.4s},[x7],#16 aesimc v0.16b,v0.16b + aesd v1.16b,v17.16b aesimc v1.16b,v1.16b + aesd 
v18.16b,v17.16b aesimc v18.16b,v18.16b + ld1 {v17.4s},[x7],#16 b.gt .Loop3x_cbc_dec aesd v0.16b,v16.16b - aesd v1.16b,v16.16b - aesd v18.16b,v16.16b - eor v4.16b,v6.16b,v7.16b aesimc v0.16b,v0.16b + aesd v1.16b,v16.16b aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b aesimc v18.16b,v18.16b + eor v4.16b,v6.16b,v7.16b + subs x2,x2,#0x30 eor v5.16b,v2.16b,v7.16b + csel x6,x2,x6,lo // x6, w6, is zero at this point aesd v0.16b,v17.16b - aesd v1.16b,v17.16b - aesd v18.16b,v17.16b - eor v17.16b,v3.16b,v7.16b - subs x2,x2,#0x30 aesimc v0.16b,v0.16b + aesd v1.16b,v17.16b aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b aesimc v18.16b,v18.16b - orr v6.16b,v19.16b,v19.16b - csel x6,x2,x6,lo // x6, w6, is zero at this point - aesd v0.16b,v20.16b - aesd v1.16b,v20.16b - aesd v18.16b,v20.16b + eor v17.16b,v3.16b,v7.16b add x0,x0,x6 // x0 is adjusted in such way that // at exit from the loop v1.16b-v18.16b // are loaded with last "words" + orr v6.16b,v19.16b,v19.16b + mov x7,x3 + aesd v0.16b,v20.16b aesimc v0.16b,v0.16b + aesd v1.16b,v20.16b aesimc v1.16b,v1.16b + aesd v18.16b,v20.16b aesimc v18.16b,v18.16b - mov x7,x3 - aesd v0.16b,v21.16b - aesd v1.16b,v21.16b - aesd v18.16b,v21.16b ld1 {v2.16b},[x0],#16 + aesd v0.16b,v21.16b aesimc v0.16b,v0.16b + aesd v1.16b,v21.16b aesimc v1.16b,v1.16b + aesd v18.16b,v21.16b aesimc v18.16b,v18.16b ld1 {v3.16b},[x0],#16 aesd v0.16b,v22.16b - aesd v1.16b,v22.16b - aesd v18.16b,v22.16b - ld1 {v19.16b},[x0],#16 aesimc v0.16b,v0.16b + aesd v1.16b,v22.16b aesimc v1.16b,v1.16b + aesd v18.16b,v22.16b aesimc v18.16b,v18.16b - ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + ld1 {v19.16b},[x0],#16 aesd v0.16b,v23.16b aesd v1.16b,v23.16b aesd v18.16b,v23.16b - + ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] add w6,w5,#2 eor v4.16b,v4.16b,v0.16b eor v5.16b,v5.16b,v1.16b eor v18.16b,v18.16b,v17.16b ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] - orr v0.16b,v2.16b,v2.16b st1 {v4.16b},[x1],#16 - orr v1.16b,v3.16b,v3.16b + orr v0.16b,v2.16b,v2.16b st1 
{v5.16b},[x1],#16 + orr v1.16b,v3.16b,v3.16b st1 {v18.16b},[x1],#16 orr v18.16b,v19.16b,v19.16b b.hs .Loop3x_cbc_dec @@ -485,39 +507,39 @@ aes_v8_cbc_encrypt: .Lcbc_dec_tail: aesd v1.16b,v16.16b - aesd v18.16b,v16.16b - ld1 {v16.4s},[x7],#16 aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b aesimc v18.16b,v18.16b + ld1 {v16.4s},[x7],#16 subs w6,w6,#2 aesd v1.16b,v17.16b - aesd v18.16b,v17.16b - ld1 {v17.4s},[x7],#16 aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b aesimc v18.16b,v18.16b + ld1 {v17.4s},[x7],#16 b.gt .Lcbc_dec_tail aesd v1.16b,v16.16b - aesd v18.16b,v16.16b aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b aesimc v18.16b,v18.16b aesd v1.16b,v17.16b - aesd v18.16b,v17.16b aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b aesimc v18.16b,v18.16b aesd v1.16b,v20.16b - aesd v18.16b,v20.16b aesimc v1.16b,v1.16b + aesd v18.16b,v20.16b aesimc v18.16b,v18.16b cmn x2,#0x20 aesd v1.16b,v21.16b - aesd v18.16b,v21.16b aesimc v1.16b,v1.16b + aesd v18.16b,v21.16b aesimc v18.16b,v18.16b eor v5.16b,v6.16b,v7.16b aesd v1.16b,v22.16b - aesd v18.16b,v22.16b aesimc v1.16b,v1.16b + aesd v18.16b,v22.16b aesimc v18.16b,v18.16b eor v17.16b,v3.16b,v7.16b aesd v1.16b,v23.16b @@ -583,70 +605,69 @@ aes_v8_ctr32_encrypt_blocks: .align 4 .Loop3x_ctr32: aese v0.16b,v16.16b - aese v1.16b,v16.16b - aese v18.16b,v16.16b - ld1 {v16.4s},[x7],#16 aesmc v0.16b,v0.16b + aese v1.16b,v16.16b aesmc v1.16b,v1.16b + aese v18.16b,v16.16b aesmc v18.16b,v18.16b + ld1 {v16.4s},[x7],#16 subs w6,w6,#2 aese v0.16b,v17.16b - aese v1.16b,v17.16b - aese v18.16b,v17.16b - ld1 {v17.4s},[x7],#16 aesmc v0.16b,v0.16b + aese v1.16b,v17.16b aesmc v1.16b,v1.16b + aese v18.16b,v17.16b aesmc v18.16b,v18.16b + ld1 {v17.4s},[x7],#16 b.gt .Loop3x_ctr32 aese v0.16b,v16.16b - aese v1.16b,v16.16b - aese v18.16b,v16.16b - mov x7,x3 aesmc v4.16b,v0.16b - ld1 {v2.16b},[x0],#16 + aese v1.16b,v16.16b aesmc v5.16b,v1.16b - aesmc v18.16b,v18.16b + ld1 {v2.16b},[x0],#16 orr v0.16b,v6.16b,v6.16b - aese v4.16b,v17.16b + aese v18.16b,v16.16b + aesmc 
v18.16b,v18.16b ld1 {v3.16b},[x0],#16 - aese v5.16b,v17.16b - aese v18.16b,v17.16b orr v1.16b,v6.16b,v6.16b + aese v4.16b,v17.16b aesmc v4.16b,v4.16b - ld1 {v19.16b},[x0],#16 + aese v5.16b,v17.16b aesmc v5.16b,v5.16b + ld1 {v19.16b},[x0],#16 + mov x7,x3 + aese v18.16b,v17.16b aesmc v17.16b,v18.16b orr v18.16b,v6.16b,v6.16b add w9,w8,#1 aese v4.16b,v20.16b + aesmc v4.16b,v4.16b aese v5.16b,v20.16b - aese v17.16b,v20.16b + aesmc v5.16b,v5.16b eor v2.16b,v2.16b,v7.16b add w10,w8,#2 - aesmc v4.16b,v4.16b - aesmc v5.16b,v5.16b + aese v17.16b,v20.16b aesmc v17.16b,v17.16b eor v3.16b,v3.16b,v7.16b add w8,w8,#3 aese v4.16b,v21.16b + aesmc v4.16b,v4.16b aese v5.16b,v21.16b - aese v17.16b,v21.16b + aesmc v5.16b,v5.16b eor v19.16b,v19.16b,v7.16b rev w9,w9 - aesmc v4.16b,v4.16b - ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] - aesmc v5.16b,v5.16b + aese v17.16b,v21.16b aesmc v17.16b,v17.16b mov v0.s[3], w9 rev w10,w10 aese v4.16b,v22.16b + aesmc v4.16b,v4.16b aese v5.16b,v22.16b - aese v17.16b,v22.16b + aesmc v5.16b,v5.16b mov v1.s[3], w10 rev w12,w8 - aesmc v4.16b,v4.16b - aesmc v5.16b,v5.16b + aese v17.16b,v22.16b aesmc v17.16b,v17.16b mov v18.s[3], w12 subs x2,x2,#3 @@ -654,13 +675,14 @@ aes_v8_ctr32_encrypt_blocks: aese v5.16b,v23.16b aese v17.16b,v23.16b - mov w6,w5 eor v2.16b,v2.16b,v4.16b + ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + st1 {v2.16b},[x1],#16 eor v3.16b,v3.16b,v5.16b + mov w6,w5 + st1 {v3.16b},[x1],#16 eor v19.16b,v19.16b,v17.16b ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] - st1 {v2.16b},[x1],#16 - st1 {v3.16b},[x1],#16 st1 {v19.16b},[x1],#16 b.hs .Loop3x_ctr32 @@ -672,40 +694,40 @@ aes_v8_ctr32_encrypt_blocks: .Lctr32_tail: aese v0.16b,v16.16b - aese v1.16b,v16.16b - ld1 {v16.4s},[x7],#16 aesmc v0.16b,v0.16b + aese v1.16b,v16.16b aesmc v1.16b,v1.16b + ld1 {v16.4s},[x7],#16 subs w6,w6,#2 aese v0.16b,v17.16b - aese v1.16b,v17.16b - ld1 {v17.4s},[x7],#16 aesmc v0.16b,v0.16b + aese v1.16b,v17.16b aesmc v1.16b,v1.16b + ld1 {v17.4s},[x7],#16 b.gt 
.Lctr32_tail aese v0.16b,v16.16b - aese v1.16b,v16.16b aesmc v0.16b,v0.16b + aese v1.16b,v16.16b aesmc v1.16b,v1.16b aese v0.16b,v17.16b - aese v1.16b,v17.16b aesmc v0.16b,v0.16b + aese v1.16b,v17.16b aesmc v1.16b,v1.16b ld1 {v2.16b},[x0],x12 aese v0.16b,v20.16b - aese v1.16b,v20.16b - ld1 {v3.16b},[x0] aesmc v0.16b,v0.16b + aese v1.16b,v20.16b aesmc v1.16b,v1.16b + ld1 {v3.16b},[x0] aese v0.16b,v21.16b - aese v1.16b,v21.16b aesmc v0.16b,v0.16b + aese v1.16b,v21.16b aesmc v1.16b,v1.16b - aese v0.16b,v22.16b - aese v1.16b,v22.16b eor v2.16b,v2.16b,v7.16b + aese v0.16b,v22.16b aesmc v0.16b,v0.16b + aese v1.16b,v22.16b aesmc v1.16b,v1.16b eor v3.16b,v3.16b,v7.16b aese v0.16b,v23.16b diff --git a/deps/openssl/asm_obsolete/arm64-linux64-gas/modes/ghashv8-armx.S b/deps/openssl/asm_obsolete/arm64-linux64-gas/modes/ghashv8-armx.S index 1bfb26340a..479007dc54 100644 --- a/deps/openssl/asm_obsolete/arm64-linux64-gas/modes/ghashv8-armx.S +++ b/deps/openssl/asm_obsolete/arm64-linux64-gas/modes/ghashv8-armx.S @@ -6,103 +6,215 @@ .type gcm_init_v8,%function .align 4 gcm_init_v8: - ld1 {v17.2d},[x1] //load H - movi v16.16b,#0xe1 + ld1 {v17.2d},[x1] //load input H + movi v19.16b,#0xe1 + shl v19.2d,v19.2d,#57 //0xc2.0 ext v3.16b,v17.16b,v17.16b,#8 - shl v16.2d,v16.2d,#57 - ushr v18.2d,v16.2d,#63 - ext v16.16b,v18.16b,v16.16b,#8 //t0=0xc2....01 + ushr v18.2d,v19.2d,#63 dup v17.4s,v17.s[1] - ushr v19.2d,v3.2d,#63 + ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01 + ushr v18.2d,v3.2d,#63 sshr v17.4s,v17.4s,#31 //broadcast carry bit - and v19.16b,v19.16b,v16.16b + and v18.16b,v18.16b,v16.16b shl v3.2d,v3.2d,#1 - ext v19.16b,v19.16b,v19.16b,#8 + ext v18.16b,v18.16b,v18.16b,#8 and v16.16b,v16.16b,v17.16b - orr v3.16b,v3.16b,v19.16b //H<<<=1 - eor v3.16b,v3.16b,v16.16b //twisted H - st1 {v3.2d},[x0] + orr v3.16b,v3.16b,v18.16b //H<<<=1 + eor v20.16b,v3.16b,v16.16b //twisted H + st1 {v20.2d},[x0],#16 //store Htable[0] + + //calculate H^2 + ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba 
pre-processing + pmull v0.1q,v20.1d,v20.1d + eor v16.16b,v16.16b,v20.16b + pmull2 v2.1q,v20.2d,v20.2d + pmull v1.1q,v16.1d,v16.1d + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v22.16b,v0.16b,v18.16b + + ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing + eor v17.16b,v17.16b,v22.16b + ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed + st1 {v21.2d-v22.2d},[x0] //store Htable[1..2] ret .size gcm_init_v8,.-gcm_init_v8 - .global gcm_gmult_v8 .type gcm_gmult_v8,%function .align 4 gcm_gmult_v8: ld1 {v17.2d},[x0] //load Xi movi v19.16b,#0xe1 - ld1 {v20.2d},[x1] //load twisted H + ld1 {v20.2d-v21.2d},[x1] //load twisted H, ... shl v19.2d,v19.2d,#57 #ifndef __ARMEB__ rev64 v17.16b,v17.16b #endif - ext v21.16b,v20.16b,v20.16b,#8 - mov x3,#0 ext v3.16b,v17.16b,v17.16b,#8 - mov x12,#0 - eor v21.16b,v21.16b,v20.16b //Karatsuba pre-processing - mov x2,x0 - b .Lgmult_v8 -.size gcm_gmult_v8,.-gcm_gmult_v8 + pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo + eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing + pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi + pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + +#ifndef __ARMEB__ + rev64 v0.16b,v0.16b +#endif + ext v0.16b,v0.16b,v0.16b,#8 + st1 {v0.2d},[x0] //write out Xi + + ret +.size 
gcm_gmult_v8,.-gcm_gmult_v8 .global gcm_ghash_v8 .type gcm_ghash_v8,%function .align 4 gcm_ghash_v8: ld1 {v0.2d},[x0] //load [rotated] Xi - subs x3,x3,#16 + //"[rotated]" means that + //loaded value would have + //to be rotated in order to + //make it appear as in + //alorithm specification + subs x3,x3,#32 //see if x3 is 32 or larger + mov x12,#16 //x12 is used as post- + //increment for input pointer; + //as loop is modulo-scheduled + //x12 is zeroed just in time + //to preclude oversteping + //inp[len], which means that + //last block[s] are actually + //loaded twice, but last + //copy is not processed + ld1 {v20.2d-v21.2d},[x1],#32 //load twisted H, ..., H^2 movi v19.16b,#0xe1 - mov x12,#16 - ld1 {v20.2d},[x1] //load twisted H - csel x12,xzr,x12,eq - ext v0.16b,v0.16b,v0.16b,#8 - shl v19.2d,v19.2d,#57 - ld1 {v17.2d},[x2],x12 //load [rotated] inp - ext v21.16b,v20.16b,v20.16b,#8 + ld1 {v22.2d},[x1] + csel x12,xzr,x12,eq //is it time to zero x12? + ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi + ld1 {v16.2d},[x2],#16 //load [rotated] I[0] + shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant #ifndef __ARMEB__ + rev64 v16.16b,v16.16b rev64 v0.16b,v0.16b +#endif + ext v3.16b,v16.16b,v16.16b,#8 //rotate I[0] + b.lo .Lodd_tail_v8 //x3 was less than 32 + ld1 {v17.2d},[x2],x12 //load [rotated] I[1] +#ifndef __ARMEB__ rev64 v17.16b,v17.16b #endif - eor v21.16b,v21.16b,v20.16b //Karatsuba pre-processing - ext v3.16b,v17.16b,v17.16b,#8 - b .Loop_v8 + ext v7.16b,v17.16b,v17.16b,#8 + eor v3.16b,v3.16b,v0.16b //I[i]^=Xi + pmull v4.1q,v20.1d,v7.1d //H·Ii+1 + eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing + pmull2 v6.1q,v20.2d,v7.2d + b .Loop_mod2x_v8 .align 4 -.Loop_v8: +.Loop_mod2x_v8: + ext v18.16b,v3.16b,v3.16b,#8 + subs x3,x3,#32 //is there more data? + pmull v0.1q,v22.1d,v3.1d //H^2.lo·Xi.lo + csel x12,xzr,x12,lo //is it time to zero x12? 
+ + pmull v5.1q,v21.1d,v17.1d + eor v18.16b,v18.16b,v3.16b //Karatsuba pre-processing + pmull2 v2.1q,v22.2d,v3.2d //H^2.hi·Xi.hi + eor v0.16b,v0.16b,v4.16b //accumulate + pmull2 v1.1q,v21.2d,v18.2d //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) + ld1 {v16.2d},[x2],x12 //load [rotated] I[i+2] + + eor v2.16b,v2.16b,v6.16b + csel x12,xzr,x12,eq //is it time to zero x12? + eor v1.16b,v1.16b,v5.16b + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + ld1 {v17.2d},[x2],x12 //load [rotated] I[i+3] +#ifndef __ARMEB__ + rev64 v16.16b,v16.16b +#endif + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + +#ifndef __ARMEB__ + rev64 v17.16b,v17.16b +#endif + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + ext v7.16b,v17.16b,v17.16b,#8 + ext v3.16b,v16.16b,v16.16b,#8 + eor v0.16b,v1.16b,v18.16b + pmull v4.1q,v20.1d,v7.1d //H·Ii+1 + eor v3.16b,v3.16b,v2.16b //accumulate v3.16b early + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v3.16b,v3.16b,v18.16b + eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing + eor v3.16b,v3.16b,v0.16b + pmull2 v6.1q,v20.2d,v7.2d + b.hs .Loop_mod2x_v8 //there was at least 32 more bytes + + eor v2.16b,v2.16b,v18.16b + ext v3.16b,v16.16b,v16.16b,#8 //re-construct v3.16b + adds x3,x3,#32 //re-construct x3 + eor v0.16b,v0.16b,v2.16b //re-construct v0.16b + b.eq .Ldone_v8 //is x3 zero? 
+.Lodd_tail_v8: ext v18.16b,v0.16b,v0.16b,#8 eor v3.16b,v3.16b,v0.16b //inp^=Xi - eor v17.16b,v17.16b,v18.16b //v17.16b is rotated inp^Xi + eor v17.16b,v16.16b,v18.16b //v17.16b is rotated inp^Xi -.Lgmult_v8: pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi - subs x3,x3,#16 pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) - csel x12,xzr,x12,eq ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing eor v18.16b,v0.16b,v2.16b eor v1.16b,v1.16b,v17.16b - ld1 {v17.2d},[x2],x12 //load [rotated] inp eor v1.16b,v1.16b,v18.16b - pmull v18.1q,v0.1d,v19.1d //1st phase + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction ins v2.d[0],v1.d[1] ins v1.d[1],v0.d[0] -#ifndef __ARMEB__ - rev64 v17.16b,v17.16b -#endif eor v0.16b,v1.16b,v18.16b - ext v3.16b,v17.16b,v17.16b,#8 - ext v18.16b,v0.16b,v0.16b,#8 //2nd phase + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction pmull v0.1q,v0.1d,v19.1d eor v18.16b,v18.16b,v2.16b eor v0.16b,v0.16b,v18.16b - b.hs .Loop_v8 +.Ldone_v8: #ifndef __ARMEB__ rev64 v0.16b,v0.16b #endif diff --git a/deps/openssl/asm_obsolete/x64-elf-gas/aes/aesni-x86_64.s b/deps/openssl/asm_obsolete/x64-elf-gas/aes/aesni-x86_64.s index 84708afbbb..6573fe4be3 100644 --- a/deps/openssl/asm_obsolete/x64-elf-gas/aes/aesni-x86_64.s +++ b/deps/openssl/asm_obsolete/x64-elf-gas/aes/aesni-x86_64.s @@ -17,7 +17,10 @@ aesni_encrypt: leaq 16(%rdx),%rdx jnz .Loop_enc1_1 .byte 102,15,56,221,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 .byte 0xf3,0xc3 .size aesni_encrypt,.-aesni_encrypt @@ -38,7 +41,10 @@ aesni_decrypt: leaq 16(%rdx),%rdx jnz .Loop_dec1_2 .byte 102,15,56,223,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 .byte 0xf3,0xc3 .size aesni_decrypt, .-aesni_decrypt .type _aesni_encrypt2,@function @@ -264,21 +270,18 @@ _aesni_encrypt6: pxor %xmm0,%xmm6 .byte 102,15,56,220,225 pxor %xmm0,%xmm7 + movups 
(%rcx,%rax,1),%xmm0 addq $16,%rax -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 - movups -16(%rcx,%rax,1),%xmm0 jmp .Lenc_loop6_enter .align 16 .Lenc_loop6: .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 +.Lenc_loop6_enter: .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 -.Lenc_loop6_enter: movups (%rcx,%rax,1),%xmm1 addq $32,%rax .byte 102,15,56,220,208 @@ -321,21 +324,18 @@ _aesni_decrypt6: pxor %xmm0,%xmm6 .byte 102,15,56,222,225 pxor %xmm0,%xmm7 + movups (%rcx,%rax,1),%xmm0 addq $16,%rax -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 - movups -16(%rcx,%rax,1),%xmm0 jmp .Ldec_loop6_enter .align 16 .Ldec_loop6: .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 +.Ldec_loop6_enter: .byte 102,15,56,222,233 .byte 102,15,56,222,241 .byte 102,15,56,222,249 -.Ldec_loop6_enter: movups (%rcx,%rax,1),%xmm1 addq $32,%rax .byte 102,15,56,222,208 @@ -375,23 +375,18 @@ _aesni_encrypt8: leaq 32(%rcx,%rax,1),%rcx negq %rax .byte 102,15,56,220,209 - addq $16,%rax pxor %xmm0,%xmm7 -.byte 102,15,56,220,217 pxor %xmm0,%xmm8 +.byte 102,15,56,220,217 pxor %xmm0,%xmm9 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 -.byte 102,68,15,56,220,193 -.byte 102,68,15,56,220,201 - movups -16(%rcx,%rax,1),%xmm0 - jmp .Lenc_loop8_enter + movups (%rcx,%rax,1),%xmm0 + addq $16,%rax + jmp .Lenc_loop8_inner .align 16 .Lenc_loop8: .byte 102,15,56,220,209 .byte 102,15,56,220,217 +.Lenc_loop8_inner: .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 @@ -444,23 +439,18 @@ _aesni_decrypt8: leaq 32(%rcx,%rax,1),%rcx negq %rax .byte 102,15,56,222,209 - addq $16,%rax pxor %xmm0,%xmm7 -.byte 102,15,56,222,217 pxor %xmm0,%xmm8 +.byte 102,15,56,222,217 pxor %xmm0,%xmm9 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 -.byte 102,68,15,56,222,193 -.byte 
102,68,15,56,222,201 - movups -16(%rcx,%rax,1),%xmm0 - jmp .Ldec_loop8_enter + movups (%rcx,%rax,1),%xmm0 + addq $16,%rax + jmp .Ldec_loop8_inner .align 16 .Ldec_loop8: .byte 102,15,56,222,209 .byte 102,15,56,222,217 +.Ldec_loop8_inner: .byte 102,15,56,222,225 .byte 102,15,56,222,233 .byte 102,15,56,222,241 @@ -587,6 +577,7 @@ aesni_ecb_encrypt: movups 80(%rdi),%xmm7 je .Lecb_enc_six movdqu 96(%rdi),%xmm8 + xorps %xmm9,%xmm9 call _aesni_encrypt8 movups %xmm2,(%rsi) movups %xmm3,16(%rsi) @@ -700,15 +691,23 @@ aesni_ecb_encrypt: jnc .Lecb_dec_loop8 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 movq %r11,%rcx movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 movl %r10d,%eax movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 movups %xmm5,48(%rsi) + pxor %xmm5,%xmm5 movups %xmm6,64(%rsi) + pxor %xmm6,%xmm6 movups %xmm7,80(%rsi) + pxor %xmm7,%xmm7 movups %xmm8,96(%rsi) + pxor %xmm8,%xmm8 movups %xmm9,112(%rsi) + pxor %xmm9,%xmm9 leaq 128(%rsi),%rsi addq $128,%rdx jz .Lecb_ret @@ -731,14 +730,23 @@ aesni_ecb_encrypt: je .Lecb_dec_six movups 96(%rdi),%xmm8 movups (%rcx),%xmm0 + xorps %xmm9,%xmm9 call _aesni_decrypt8 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 movups %xmm5,48(%rsi) + pxor %xmm5,%xmm5 movups %xmm6,64(%rsi) + pxor %xmm6,%xmm6 movups %xmm7,80(%rsi) + pxor %xmm7,%xmm7 movups %xmm8,96(%rsi) + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 jmp .Lecb_ret .align 16 .Lecb_dec_one: @@ -754,49 +762,73 @@ aesni_ecb_encrypt: jnz .Loop_dec1_4 .byte 102,15,56,223,209 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 jmp .Lecb_ret .align 16 .Lecb_dec_two: call _aesni_decrypt2 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 jmp .Lecb_ret .align 16 .Lecb_dec_three: call _aesni_decrypt3 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 jmp .Lecb_ret .align 16 .Lecb_dec_four: call _aesni_decrypt4 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 movups 
%xmm3,16(%rsi) + pxor %xmm3,%xmm3 movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 movups %xmm5,48(%rsi) + pxor %xmm5,%xmm5 jmp .Lecb_ret .align 16 .Lecb_dec_five: xorps %xmm7,%xmm7 call _aesni_decrypt6 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 movups %xmm5,48(%rsi) + pxor %xmm5,%xmm5 movups %xmm6,64(%rsi) + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 jmp .Lecb_ret .align 16 .Lecb_dec_six: call _aesni_decrypt6 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 movups %xmm5,48(%rsi) + pxor %xmm5,%xmm5 movups %xmm6,64(%rsi) + pxor %xmm6,%xmm6 movups %xmm7,80(%rsi) + pxor %xmm7,%xmm7 .Lecb_ret: + xorps %xmm0,%xmm0 + pxor %xmm1,%xmm1 .byte 0xf3,0xc3 .size aesni_ecb_encrypt,.-aesni_ecb_encrypt .globl aesni_ccm64_encrypt_blocks @@ -853,7 +885,13 @@ aesni_ccm64_encrypt_blocks: leaq 16(%rsi),%rsi jnz .Lccm64_enc_outer + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 movups %xmm3,(%r9) + pxor %xmm3,%xmm3 + pxor %xmm8,%xmm8 + pxor %xmm6,%xmm6 .byte 0xf3,0xc3 .size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks .globl aesni_ccm64_decrypt_blocks @@ -944,21 +982,56 @@ aesni_ccm64_decrypt_blocks: leaq 16(%r11),%r11 jnz .Loop_enc1_6 .byte 102,15,56,221,217 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 movups %xmm3,(%r9) + pxor %xmm3,%xmm3 + pxor %xmm8,%xmm8 + pxor %xmm6,%xmm6 .byte 0xf3,0xc3 .size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks .globl aesni_ctr32_encrypt_blocks .type aesni_ctr32_encrypt_blocks,@function .align 16 aesni_ctr32_encrypt_blocks: + cmpq $1,%rdx + jne .Lctr32_bulk + + + + movups (%r8),%xmm2 + movups (%rdi),%xmm3 + movl 240(%rcx),%edx + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + xorps %xmm0,%xmm2 +.Loop_enc1_7: +.byte 102,15,56,220,209 + decl %edx + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz .Loop_enc1_7 +.byte 102,15,56,221,209 + pxor %xmm0,%xmm0 + pxor 
%xmm1,%xmm1 + xorps %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movups %xmm2,(%rsi) + xorps %xmm2,%xmm2 + jmp .Lctr32_epilogue + +.align 16 +.Lctr32_bulk: leaq (%rsp),%rax pushq %rbp subq $128,%rsp andq $-16,%rsp leaq -8(%rax),%rbp - cmpq $1,%rdx - je .Lctr32_one_shortcut + + movdqu (%r8),%xmm2 movdqu (%rcx),%xmm0 @@ -1349,11 +1422,14 @@ aesni_ctr32_encrypt_blocks: leaq -128(%rcx),%rcx .Lctr32_tail: + + leaq 16(%rcx),%rcx cmpq $4,%rdx jb .Lctr32_loop3 je .Lctr32_loop4 + shll $4,%eax movdqa 96(%rsp),%xmm8 pxor %xmm9,%xmm9 @@ -1456,30 +1532,33 @@ aesni_ctr32_encrypt_blocks: movups 32(%rdi),%xmm12 xorps %xmm12,%xmm4 movups %xmm4,32(%rsi) - jmp .Lctr32_done -.align 16 -.Lctr32_one_shortcut: - movups (%r8),%xmm2 - movups (%rdi),%xmm10 - movl 240(%rcx),%eax - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -.Loop_enc1_7: -.byte 102,15,56,220,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz .Loop_enc1_7 -.byte 102,15,56,221,209 - xorps %xmm10,%xmm2 - movups %xmm2,(%rsi) - jmp .Lctr32_done - -.align 16 .Lctr32_done: + xorps %xmm0,%xmm0 + xorl %r11d,%r11d + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + movaps %xmm0,0(%rsp) + pxor %xmm8,%xmm8 + movaps %xmm0,16(%rsp) + pxor %xmm9,%xmm9 + movaps %xmm0,32(%rsp) + pxor %xmm10,%xmm10 + movaps %xmm0,48(%rsp) + pxor %xmm11,%xmm11 + movaps %xmm0,64(%rsp) + pxor %xmm12,%xmm12 + movaps %xmm0,80(%rsp) + pxor %xmm13,%xmm13 + movaps %xmm0,96(%rsp) + pxor %xmm14,%xmm14 + movaps %xmm0,112(%rsp) + pxor %xmm15,%xmm15 leaq (%rbp),%rsp popq %rbp .Lctr32_epilogue: @@ -1750,6 +1829,7 @@ aesni_xts_encrypt: shrl $4,%eax .Lxts_enc_short: + movl %eax,%r10d pxor %xmm0,%xmm10 addq $96,%rdx @@ -1778,6 +1858,7 @@ aesni_xts_encrypt: pxor %xmm12,%xmm4 pxor %xmm13,%xmm5 pxor %xmm14,%xmm6 + pxor %xmm7,%xmm7 call _aesni_encrypt6 @@ -1920,6 +2001,29 @@ aesni_xts_encrypt: movups %xmm2,-16(%rsi) .Lxts_enc_ret: + xorps %xmm0,%xmm0 + 
pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + movaps %xmm0,0(%rsp) + pxor %xmm8,%xmm8 + movaps %xmm0,16(%rsp) + pxor %xmm9,%xmm9 + movaps %xmm0,32(%rsp) + pxor %xmm10,%xmm10 + movaps %xmm0,48(%rsp) + pxor %xmm11,%xmm11 + movaps %xmm0,64(%rsp) + pxor %xmm12,%xmm12 + movaps %xmm0,80(%rsp) + pxor %xmm13,%xmm13 + movaps %xmm0,96(%rsp) + pxor %xmm14,%xmm14 + pxor %xmm15,%xmm15 leaq (%rbp),%rsp popq %rbp .Lxts_enc_epilogue: @@ -2196,6 +2300,7 @@ aesni_xts_decrypt: shrl $4,%eax .Lxts_dec_short: + movl %eax,%r10d pxor %xmm0,%xmm10 pxor %xmm0,%xmm11 @@ -2398,6 +2503,29 @@ aesni_xts_decrypt: movups %xmm2,(%rsi) .Lxts_dec_ret: + xorps %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + movaps %xmm0,0(%rsp) + pxor %xmm8,%xmm8 + movaps %xmm0,16(%rsp) + pxor %xmm9,%xmm9 + movaps %xmm0,32(%rsp) + pxor %xmm10,%xmm10 + movaps %xmm0,48(%rsp) + pxor %xmm11,%xmm11 + movaps %xmm0,64(%rsp) + pxor %xmm12,%xmm12 + movaps %xmm0,80(%rsp) + pxor %xmm13,%xmm13 + movaps %xmm0,96(%rsp) + pxor %xmm14,%xmm14 + pxor %xmm15,%xmm15 leaq (%rbp),%rsp popq %rbp .Lxts_dec_epilogue: @@ -2446,7 +2574,11 @@ aesni_cbc_encrypt: jnc .Lcbc_enc_loop addq $16,%rdx jnz .Lcbc_enc_tail + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 movups %xmm2,(%r8) + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 jmp .Lcbc_ret .Lcbc_enc_tail: @@ -2466,6 +2598,35 @@ aesni_cbc_encrypt: .align 16 .Lcbc_decrypt: + cmpq $16,%rdx + jne .Lcbc_decrypt_bulk + + + + movdqu (%rdi),%xmm2 + movdqu (%r8),%xmm3 + movdqa %xmm2,%xmm4 + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + xorps %xmm0,%xmm2 +.Loop_dec1_16: +.byte 102,15,56,222,209 + decl %r10d + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz .Loop_dec1_16 +.byte 102,15,56,223,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movdqu %xmm4,(%r8) + xorps %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movups %xmm2,(%rsi) + pxor 
%xmm2,%xmm2 + jmp .Lcbc_ret +.align 16 +.Lcbc_decrypt_bulk: leaq (%rsp),%rax pushq %rbp subq $16,%rsp @@ -2702,7 +2863,7 @@ aesni_cbc_encrypt: movaps %xmm9,%xmm2 leaq -112(%rcx),%rcx addq $112,%rdx - jle .Lcbc_dec_tail_collected + jle .Lcbc_dec_clear_tail_collected movups %xmm9,(%rsi) leaq 16(%rsi),%rsi cmpq $80,%rdx @@ -2721,14 +2882,19 @@ aesni_cbc_encrypt: movdqu %xmm2,(%rsi) pxor %xmm12,%xmm4 movdqu %xmm3,16(%rsi) + pxor %xmm3,%xmm3 pxor %xmm13,%xmm5 movdqu %xmm4,32(%rsi) + pxor %xmm4,%xmm4 pxor %xmm14,%xmm6 movdqu %xmm5,48(%rsi) + pxor %xmm5,%xmm5 pxor %xmm15,%xmm7 movdqu %xmm6,64(%rsi) + pxor %xmm6,%xmm6 leaq 80(%rsi),%rsi movdqa %xmm7,%xmm2 + pxor %xmm7,%xmm7 jmp .Lcbc_dec_tail_collected .align 16 @@ -2743,16 +2909,23 @@ aesni_cbc_encrypt: movdqu %xmm2,(%rsi) pxor %xmm12,%xmm4 movdqu %xmm3,16(%rsi) + pxor %xmm3,%xmm3 pxor %xmm13,%xmm5 movdqu %xmm4,32(%rsi) + pxor %xmm4,%xmm4 pxor %xmm14,%xmm6 movdqu %xmm5,48(%rsi) + pxor %xmm5,%xmm5 pxor %xmm15,%xmm7 movdqu %xmm6,64(%rsi) + pxor %xmm6,%xmm6 pxor %xmm9,%xmm8 movdqu %xmm7,80(%rsi) + pxor %xmm7,%xmm7 leaq 96(%rsi),%rsi movdqa %xmm8,%xmm2 + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 jmp .Lcbc_dec_tail_collected .align 16 @@ -2796,7 +2969,7 @@ aesni_cbc_encrypt: movdqa %xmm7,%xmm2 addq $80,%rdx - jle .Lcbc_dec_tail_collected + jle .Lcbc_dec_clear_tail_collected movups %xmm7,(%rsi) leaq 16(%rsi),%rsi @@ -2831,12 +3004,17 @@ aesni_cbc_encrypt: movdqu %xmm2,(%rsi) pxor %xmm12,%xmm4 movdqu %xmm3,16(%rsi) + pxor %xmm3,%xmm3 pxor %xmm13,%xmm5 movdqu %xmm4,32(%rsi) + pxor %xmm4,%xmm4 pxor %xmm14,%xmm6 movdqu %xmm5,48(%rsi) + pxor %xmm5,%xmm5 leaq 64(%rsi),%rsi movdqa %xmm6,%xmm2 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 subq $16,%rdx jmp .Lcbc_dec_tail_collected @@ -2847,12 +3025,12 @@ aesni_cbc_encrypt: movups 16(%rcx),%xmm1 leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 -.Loop_dec1_16: +.Loop_dec1_17: .byte 102,15,56,222,209 decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz .Loop_dec1_16 + jnz .Loop_dec1_17 .byte 102,15,56,223,209 
xorps %xmm10,%xmm2 movaps %xmm11,%xmm10 @@ -2866,6 +3044,7 @@ aesni_cbc_encrypt: pxor %xmm11,%xmm3 movdqu %xmm2,(%rsi) movdqa %xmm3,%xmm2 + pxor %xmm3,%xmm3 leaq 16(%rsi),%rsi jmp .Lcbc_dec_tail_collected .align 16 @@ -2878,7 +3057,9 @@ aesni_cbc_encrypt: movdqu %xmm2,(%rsi) pxor %xmm12,%xmm4 movdqu %xmm3,16(%rsi) + pxor %xmm3,%xmm3 movdqa %xmm4,%xmm2 + pxor %xmm4,%xmm4 leaq 32(%rsi),%rsi jmp .Lcbc_dec_tail_collected .align 16 @@ -2891,29 +3072,45 @@ aesni_cbc_encrypt: movdqu %xmm2,(%rsi) pxor %xmm12,%xmm4 movdqu %xmm3,16(%rsi) + pxor %xmm3,%xmm3 pxor %xmm13,%xmm5 movdqu %xmm4,32(%rsi) + pxor %xmm4,%xmm4 movdqa %xmm5,%xmm2 + pxor %xmm5,%xmm5 leaq 48(%rsi),%rsi jmp .Lcbc_dec_tail_collected .align 16 +.Lcbc_dec_clear_tail_collected: + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 .Lcbc_dec_tail_collected: movups %xmm10,(%r8) andq $15,%rdx jnz .Lcbc_dec_tail_partial movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 jmp .Lcbc_dec_ret .align 16 .Lcbc_dec_tail_partial: movaps %xmm2,(%rsp) + pxor %xmm2,%xmm2 movq $16,%rcx movq %rsi,%rdi subq %rdx,%rcx leaq (%rsp),%rsi .long 0x9066A4F3 + movdqa %xmm2,(%rsp) .Lcbc_dec_ret: + xorps %xmm0,%xmm0 + pxor %xmm1,%xmm1 leaq (%rbp),%rsp popq %rbp .Lcbc_ret: @@ -2951,7 +3148,9 @@ aesni_set_decrypt_key: movups (%rdx),%xmm0 .byte 102,15,56,219,192 + pxor %xmm1,%xmm1 movups %xmm0,(%rdi) + pxor %xmm0,%xmm0 .Ldec_key_ret: addq $8,%rsp .byte 0xf3,0xc3 @@ -2969,8 +3168,10 @@ __aesni_set_encrypt_key: testq %rdx,%rdx jz .Lenc_key_ret + movl $268437504,%r10d movups (%rdi),%xmm0 xorps %xmm4,%xmm4 + andl OPENSSL_ia32cap_P+4(%rip),%r10d leaq 16(%rdx),%rax cmpl $256,%esi je .L14rounds @@ -2981,6 +3182,9 @@ __aesni_set_encrypt_key: .L10rounds: movl $9,%esi + cmpl $268435456,%r10d + je .L10rounds_alt + movups %xmm0,(%rdx) .byte 102,15,58,223,200,1 call .Lkey_expansion_128_cold @@ -3007,10 +3211,80 @@ __aesni_set_encrypt_key: xorl %eax,%eax jmp .Lenc_key_ret +.align 16 
+.L10rounds_alt: + movdqa .Lkey_rotate(%rip),%xmm5 + movl $8,%r10d + movdqa .Lkey_rcon1(%rip),%xmm4 + movdqa %xmm0,%xmm2 + movdqu %xmm0,(%rdx) + jmp .Loop_key128 + +.align 16 +.Loop_key128: +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + leaq 16(%rax),%rax + + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,-16(%rax) + movdqa %xmm0,%xmm2 + + decl %r10d + jnz .Loop_key128 + + movdqa .Lkey_rcon1b(%rip),%xmm4 + +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,(%rax) + + movdqa %xmm0,%xmm2 +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,16(%rax) + + movl %esi,96(%rax) + xorl %eax,%eax + jmp .Lenc_key_ret + .align 16 .L12rounds: movq 16(%rdi),%xmm2 movl $11,%esi + cmpl $268435456,%r10d + je .L12rounds_alt + movups %xmm0,(%rdx) .byte 102,15,58,223,202,1 call .Lkey_expansion_192a_cold @@ -3033,11 +3307,55 @@ __aesni_set_encrypt_key: xorq %rax,%rax jmp .Lenc_key_ret +.align 16 +.L12rounds_alt: + movdqa .Lkey_rotate192(%rip),%xmm5 + movdqa .Lkey_rcon1(%rip),%xmm4 + movl $8,%r10d + movdqu %xmm0,(%rdx) + jmp .Loop_key192 + +.align 16 +.Loop_key192: + movq %xmm2,0(%rax) + movdqa %xmm2,%xmm1 +.byte 102,15,56,0,213 +.byte 102,15,56,221,212 + pslld $1,%xmm4 + leaq 24(%rax),%rax + + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + + pshufd $255,%xmm0,%xmm3 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + + pxor %xmm2,%xmm0 + pxor %xmm3,%xmm2 + movdqu %xmm0,-16(%rax) + + decl %r10d + jnz .Loop_key192 + + 
movl %esi,32(%rax) + xorl %eax,%eax + jmp .Lenc_key_ret + .align 16 .L14rounds: movups 16(%rdi),%xmm2 movl $13,%esi leaq 16(%rax),%rax + cmpl $268435456,%r10d + je .L14rounds_alt + movups %xmm0,(%rdx) movups %xmm2,16(%rdx) .byte 102,15,58,223,202,1 @@ -3071,10 +3389,70 @@ __aesni_set_encrypt_key: xorq %rax,%rax jmp .Lenc_key_ret +.align 16 +.L14rounds_alt: + movdqa .Lkey_rotate(%rip),%xmm5 + movdqa .Lkey_rcon1(%rip),%xmm4 + movl $7,%r10d + movdqu %xmm0,0(%rdx) + movdqa %xmm2,%xmm1 + movdqu %xmm2,16(%rdx) + jmp .Loop_key256 + +.align 16 +.Loop_key256: +.byte 102,15,56,0,213 +.byte 102,15,56,221,212 + + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + pslld $1,%xmm4 + + pxor %xmm2,%xmm0 + movdqu %xmm0,(%rax) + + decl %r10d + jz .Ldone_key256 + + pshufd $255,%xmm0,%xmm2 + pxor %xmm3,%xmm3 +.byte 102,15,56,221,211 + + movdqa %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm3,%xmm1 + + pxor %xmm1,%xmm2 + movdqu %xmm2,16(%rax) + leaq 32(%rax),%rax + movdqa %xmm2,%xmm1 + + jmp .Loop_key256 + +.Ldone_key256: + movl %esi,16(%rax) + xorl %eax,%eax + jmp .Lenc_key_ret + .align 16 .Lbad_keybits: movq $-2,%rax .Lenc_key_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 addq $8,%rsp .byte 0xf3,0xc3 .LSEH_end_set_encrypt_key: @@ -3160,6 +3538,14 @@ __aesni_set_encrypt_key: .long 0x87,0,1,0 .Lincrement1: .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +.Lkey_rotate: +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d +.Lkey_rotate192: +.long 0x04070605,0x04070605,0x04070605,0x04070605 +.Lkey_rcon1: +.long 1,1,1,1 +.Lkey_rcon1b: +.long 0x1b,0x1b,0x1b,0x1b .byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 diff --git 
a/deps/openssl/asm_obsolete/x64-elf-gas/bn/x86_64-mont5.s b/deps/openssl/asm_obsolete/x64-elf-gas/bn/x86_64-mont5.s index 1bf368c7eb..5f98ff2237 100644 --- a/deps/openssl/asm_obsolete/x64-elf-gas/bn/x86_64-mont5.s +++ b/deps/openssl/asm_obsolete/x64-elf-gas/bn/x86_64-mont5.s @@ -1755,11 +1755,16 @@ bn_from_mont8x: .type bn_get_bits5,@function .align 16 bn_get_bits5: - movq %rdi,%r10 + leaq 0(%rdi),%r10 + leaq 1(%rdi),%r11 movl %esi,%ecx - shrl $3,%esi - movzwl (%r10,%rsi,1),%eax - andl $7,%ecx + shrl $4,%esi + andl $15,%ecx + leal -8(%rcx),%eax + cmpl $11,%ecx + cmovaq %r11,%r10 + cmoval %eax,%ecx + movzwl (%r10,%rsi,2),%eax shrl %cl,%eax andl $31,%eax .byte 0xf3,0xc3 diff --git a/deps/openssl/asm_obsolete/x64-macosx-gas/aes/aesni-x86_64.s b/deps/openssl/asm_obsolete/x64-macosx-gas/aes/aesni-x86_64.s index 57509ae719..41ad80eebd 100644 --- a/deps/openssl/asm_obsolete/x64-macosx-gas/aes/aesni-x86_64.s +++ b/deps/openssl/asm_obsolete/x64-macosx-gas/aes/aesni-x86_64.s @@ -17,7 +17,10 @@ L$oop_enc1_1: leaq 16(%rdx),%rdx jnz L$oop_enc1_1 .byte 102,15,56,221,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 .byte 0xf3,0xc3 @@ -38,7 +41,10 @@ L$oop_dec1_2: leaq 16(%rdx),%rdx jnz L$oop_dec1_2 .byte 102,15,56,223,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 .byte 0xf3,0xc3 @@ -264,21 +270,18 @@ _aesni_encrypt6: pxor %xmm0,%xmm6 .byte 102,15,56,220,225 pxor %xmm0,%xmm7 + movups (%rcx,%rax,1),%xmm0 addq $16,%rax -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 - movups -16(%rcx,%rax,1),%xmm0 jmp L$enc_loop6_enter .p2align 4 L$enc_loop6: .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 +L$enc_loop6_enter: .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 -L$enc_loop6_enter: movups (%rcx,%rax,1),%xmm1 addq $32,%rax .byte 102,15,56,220,208 @@ -321,21 +324,18 @@ _aesni_decrypt6: pxor %xmm0,%xmm6 .byte 102,15,56,222,225 pxor %xmm0,%xmm7 + movups 
(%rcx,%rax,1),%xmm0 addq $16,%rax -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 - movups -16(%rcx,%rax,1),%xmm0 jmp L$dec_loop6_enter .p2align 4 L$dec_loop6: .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 +L$dec_loop6_enter: .byte 102,15,56,222,233 .byte 102,15,56,222,241 .byte 102,15,56,222,249 -L$dec_loop6_enter: movups (%rcx,%rax,1),%xmm1 addq $32,%rax .byte 102,15,56,222,208 @@ -375,23 +375,18 @@ _aesni_encrypt8: leaq 32(%rcx,%rax,1),%rcx negq %rax .byte 102,15,56,220,209 - addq $16,%rax pxor %xmm0,%xmm7 -.byte 102,15,56,220,217 pxor %xmm0,%xmm8 +.byte 102,15,56,220,217 pxor %xmm0,%xmm9 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 -.byte 102,68,15,56,220,193 -.byte 102,68,15,56,220,201 - movups -16(%rcx,%rax,1),%xmm0 - jmp L$enc_loop8_enter + movups (%rcx,%rax,1),%xmm0 + addq $16,%rax + jmp L$enc_loop8_inner .p2align 4 L$enc_loop8: .byte 102,15,56,220,209 .byte 102,15,56,220,217 +L$enc_loop8_inner: .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 @@ -444,23 +439,18 @@ _aesni_decrypt8: leaq 32(%rcx,%rax,1),%rcx negq %rax .byte 102,15,56,222,209 - addq $16,%rax pxor %xmm0,%xmm7 -.byte 102,15,56,222,217 pxor %xmm0,%xmm8 +.byte 102,15,56,222,217 pxor %xmm0,%xmm9 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 -.byte 102,68,15,56,222,193 -.byte 102,68,15,56,222,201 - movups -16(%rcx,%rax,1),%xmm0 - jmp L$dec_loop8_enter + movups (%rcx,%rax,1),%xmm0 + addq $16,%rax + jmp L$dec_loop8_inner .p2align 4 L$dec_loop8: .byte 102,15,56,222,209 .byte 102,15,56,222,217 +L$dec_loop8_inner: .byte 102,15,56,222,225 .byte 102,15,56,222,233 .byte 102,15,56,222,241 @@ -587,6 +577,7 @@ L$ecb_enc_tail: movups 80(%rdi),%xmm7 je L$ecb_enc_six movdqu 96(%rdi),%xmm8 + xorps %xmm9,%xmm9 call _aesni_encrypt8 movups %xmm2,(%rsi) movups %xmm3,16(%rsi) @@ -700,15 +691,23 @@ L$ecb_dec_loop8_enter: jnc 
L$ecb_dec_loop8 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 movq %r11,%rcx movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 movl %r10d,%eax movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 movups %xmm5,48(%rsi) + pxor %xmm5,%xmm5 movups %xmm6,64(%rsi) + pxor %xmm6,%xmm6 movups %xmm7,80(%rsi) + pxor %xmm7,%xmm7 movups %xmm8,96(%rsi) + pxor %xmm8,%xmm8 movups %xmm9,112(%rsi) + pxor %xmm9,%xmm9 leaq 128(%rsi),%rsi addq $128,%rdx jz L$ecb_ret @@ -731,14 +730,23 @@ L$ecb_dec_tail: je L$ecb_dec_six movups 96(%rdi),%xmm8 movups (%rcx),%xmm0 + xorps %xmm9,%xmm9 call _aesni_decrypt8 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 movups %xmm5,48(%rsi) + pxor %xmm5,%xmm5 movups %xmm6,64(%rsi) + pxor %xmm6,%xmm6 movups %xmm7,80(%rsi) + pxor %xmm7,%xmm7 movups %xmm8,96(%rsi) + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 jmp L$ecb_ret .p2align 4 L$ecb_dec_one: @@ -754,49 +762,73 @@ L$oop_dec1_4: jnz L$oop_dec1_4 .byte 102,15,56,223,209 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 jmp L$ecb_ret .p2align 4 L$ecb_dec_two: call _aesni_decrypt2 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 jmp L$ecb_ret .p2align 4 L$ecb_dec_three: call _aesni_decrypt3 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 jmp L$ecb_ret .p2align 4 L$ecb_dec_four: call _aesni_decrypt4 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 movups %xmm5,48(%rsi) + pxor %xmm5,%xmm5 jmp L$ecb_ret .p2align 4 L$ecb_dec_five: xorps %xmm7,%xmm7 call _aesni_decrypt6 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 movups %xmm5,48(%rsi) + pxor %xmm5,%xmm5 movups %xmm6,64(%rsi) + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 jmp L$ecb_ret .p2align 4 L$ecb_dec_six: call _aesni_decrypt6 movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 movups %xmm3,16(%rsi) + pxor 
%xmm3,%xmm3 movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 movups %xmm5,48(%rsi) + pxor %xmm5,%xmm5 movups %xmm6,64(%rsi) + pxor %xmm6,%xmm6 movups %xmm7,80(%rsi) + pxor %xmm7,%xmm7 L$ecb_ret: + xorps %xmm0,%xmm0 + pxor %xmm1,%xmm1 .byte 0xf3,0xc3 .globl _aesni_ccm64_encrypt_blocks @@ -853,7 +885,13 @@ L$ccm64_enc2_loop: leaq 16(%rsi),%rsi jnz L$ccm64_enc_outer + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 movups %xmm3,(%r9) + pxor %xmm3,%xmm3 + pxor %xmm8,%xmm8 + pxor %xmm6,%xmm6 .byte 0xf3,0xc3 .globl _aesni_ccm64_decrypt_blocks @@ -944,21 +982,56 @@ L$oop_enc1_6: leaq 16(%r11),%r11 jnz L$oop_enc1_6 .byte 102,15,56,221,217 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 movups %xmm3,(%r9) + pxor %xmm3,%xmm3 + pxor %xmm8,%xmm8 + pxor %xmm6,%xmm6 .byte 0xf3,0xc3 .globl _aesni_ctr32_encrypt_blocks .p2align 4 _aesni_ctr32_encrypt_blocks: + cmpq $1,%rdx + jne L$ctr32_bulk + + + + movups (%r8),%xmm2 + movups (%rdi),%xmm3 + movl 240(%rcx),%edx + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + xorps %xmm0,%xmm2 +L$oop_enc1_7: +.byte 102,15,56,220,209 + decl %edx + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz L$oop_enc1_7 +.byte 102,15,56,221,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + xorps %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movups %xmm2,(%rsi) + xorps %xmm2,%xmm2 + jmp L$ctr32_epilogue + +.p2align 4 +L$ctr32_bulk: leaq (%rsp),%rax pushq %rbp subq $128,%rsp andq $-16,%rsp leaq -8(%rax),%rbp - cmpq $1,%rdx - je L$ctr32_one_shortcut + + movdqu (%r8),%xmm2 movdqu (%rcx),%xmm0 @@ -1349,11 +1422,14 @@ L$ctr32_enc_done: leaq -128(%rcx),%rcx L$ctr32_tail: + + leaq 16(%rcx),%rcx cmpq $4,%rdx jb L$ctr32_loop3 je L$ctr32_loop4 + shll $4,%eax movdqa 96(%rsp),%xmm8 pxor %xmm9,%xmm9 @@ -1456,30 +1532,33 @@ L$ctr32_loop3: movups 32(%rdi),%xmm12 xorps %xmm12,%xmm4 movups %xmm4,32(%rsi) - jmp L$ctr32_done -.p2align 4 -L$ctr32_one_shortcut: - movups (%r8),%xmm2 - movups (%rdi),%xmm10 - movl 240(%rcx),%eax - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - 
leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -L$oop_enc1_7: -.byte 102,15,56,220,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz L$oop_enc1_7 -.byte 102,15,56,221,209 - xorps %xmm10,%xmm2 - movups %xmm2,(%rsi) - jmp L$ctr32_done - -.p2align 4 L$ctr32_done: + xorps %xmm0,%xmm0 + xorl %r11d,%r11d + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + movaps %xmm0,0(%rsp) + pxor %xmm8,%xmm8 + movaps %xmm0,16(%rsp) + pxor %xmm9,%xmm9 + movaps %xmm0,32(%rsp) + pxor %xmm10,%xmm10 + movaps %xmm0,48(%rsp) + pxor %xmm11,%xmm11 + movaps %xmm0,64(%rsp) + pxor %xmm12,%xmm12 + movaps %xmm0,80(%rsp) + pxor %xmm13,%xmm13 + movaps %xmm0,96(%rsp) + pxor %xmm14,%xmm14 + movaps %xmm0,112(%rsp) + pxor %xmm15,%xmm15 leaq (%rbp),%rsp popq %rbp L$ctr32_epilogue: @@ -1750,6 +1829,7 @@ L$xts_enc_loop6: shrl $4,%eax L$xts_enc_short: + movl %eax,%r10d pxor %xmm0,%xmm10 addq $96,%rdx @@ -1778,6 +1858,7 @@ L$xts_enc_short: pxor %xmm12,%xmm4 pxor %xmm13,%xmm5 pxor %xmm14,%xmm6 + pxor %xmm7,%xmm7 call _aesni_encrypt6 @@ -1920,6 +2001,29 @@ L$oop_enc1_10: movups %xmm2,-16(%rsi) L$xts_enc_ret: + xorps %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + movaps %xmm0,0(%rsp) + pxor %xmm8,%xmm8 + movaps %xmm0,16(%rsp) + pxor %xmm9,%xmm9 + movaps %xmm0,32(%rsp) + pxor %xmm10,%xmm10 + movaps %xmm0,48(%rsp) + pxor %xmm11,%xmm11 + movaps %xmm0,64(%rsp) + pxor %xmm12,%xmm12 + movaps %xmm0,80(%rsp) + pxor %xmm13,%xmm13 + movaps %xmm0,96(%rsp) + pxor %xmm14,%xmm14 + pxor %xmm15,%xmm15 leaq (%rbp),%rsp popq %rbp L$xts_enc_epilogue: @@ -2196,6 +2300,7 @@ L$xts_dec_loop6: shrl $4,%eax L$xts_dec_short: + movl %eax,%r10d pxor %xmm0,%xmm10 pxor %xmm0,%xmm11 @@ -2398,6 +2503,29 @@ L$oop_dec1_14: movups %xmm2,(%rsi) L$xts_dec_ret: + xorps %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor 
%xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + movaps %xmm0,0(%rsp) + pxor %xmm8,%xmm8 + movaps %xmm0,16(%rsp) + pxor %xmm9,%xmm9 + movaps %xmm0,32(%rsp) + pxor %xmm10,%xmm10 + movaps %xmm0,48(%rsp) + pxor %xmm11,%xmm11 + movaps %xmm0,64(%rsp) + pxor %xmm12,%xmm12 + movaps %xmm0,80(%rsp) + pxor %xmm13,%xmm13 + movaps %xmm0,96(%rsp) + pxor %xmm14,%xmm14 + pxor %xmm15,%xmm15 leaq (%rbp),%rsp popq %rbp L$xts_dec_epilogue: @@ -2446,7 +2574,11 @@ L$oop_enc1_15: jnc L$cbc_enc_loop addq $16,%rdx jnz L$cbc_enc_tail + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 movups %xmm2,(%r8) + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 jmp L$cbc_ret L$cbc_enc_tail: @@ -2466,6 +2598,35 @@ L$cbc_enc_tail: .p2align 4 L$cbc_decrypt: + cmpq $16,%rdx + jne L$cbc_decrypt_bulk + + + + movdqu (%rdi),%xmm2 + movdqu (%r8),%xmm3 + movdqa %xmm2,%xmm4 + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + xorps %xmm0,%xmm2 +L$oop_dec1_16: +.byte 102,15,56,222,209 + decl %r10d + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz L$oop_dec1_16 +.byte 102,15,56,223,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movdqu %xmm4,(%r8) + xorps %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + jmp L$cbc_ret +.p2align 4 +L$cbc_decrypt_bulk: leaq (%rsp),%rax pushq %rbp subq $16,%rsp @@ -2702,7 +2863,7 @@ L$cbc_dec_done: movaps %xmm9,%xmm2 leaq -112(%rcx),%rcx addq $112,%rdx - jle L$cbc_dec_tail_collected + jle L$cbc_dec_clear_tail_collected movups %xmm9,(%rsi) leaq 16(%rsi),%rsi cmpq $80,%rdx @@ -2721,14 +2882,19 @@ L$cbc_dec_six_or_seven: movdqu %xmm2,(%rsi) pxor %xmm12,%xmm4 movdqu %xmm3,16(%rsi) + pxor %xmm3,%xmm3 pxor %xmm13,%xmm5 movdqu %xmm4,32(%rsi) + pxor %xmm4,%xmm4 pxor %xmm14,%xmm6 movdqu %xmm5,48(%rsi) + pxor %xmm5,%xmm5 pxor %xmm15,%xmm7 movdqu %xmm6,64(%rsi) + pxor %xmm6,%xmm6 leaq 80(%rsi),%rsi movdqa %xmm7,%xmm2 + pxor %xmm7,%xmm7 jmp L$cbc_dec_tail_collected .p2align 4 @@ -2743,16 +2909,23 @@ L$cbc_dec_seven: movdqu %xmm2,(%rsi) pxor %xmm12,%xmm4 movdqu %xmm3,16(%rsi) + 
pxor %xmm3,%xmm3 pxor %xmm13,%xmm5 movdqu %xmm4,32(%rsi) + pxor %xmm4,%xmm4 pxor %xmm14,%xmm6 movdqu %xmm5,48(%rsi) + pxor %xmm5,%xmm5 pxor %xmm15,%xmm7 movdqu %xmm6,64(%rsi) + pxor %xmm6,%xmm6 pxor %xmm9,%xmm8 movdqu %xmm7,80(%rsi) + pxor %xmm7,%xmm7 leaq 96(%rsi),%rsi movdqa %xmm8,%xmm2 + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 jmp L$cbc_dec_tail_collected .p2align 4 @@ -2796,7 +2969,7 @@ L$cbc_dec_loop6_enter: movdqa %xmm7,%xmm2 addq $80,%rdx - jle L$cbc_dec_tail_collected + jle L$cbc_dec_clear_tail_collected movups %xmm7,(%rsi) leaq 16(%rsi),%rsi @@ -2831,12 +3004,17 @@ L$cbc_dec_tail: movdqu %xmm2,(%rsi) pxor %xmm12,%xmm4 movdqu %xmm3,16(%rsi) + pxor %xmm3,%xmm3 pxor %xmm13,%xmm5 movdqu %xmm4,32(%rsi) + pxor %xmm4,%xmm4 pxor %xmm14,%xmm6 movdqu %xmm5,48(%rsi) + pxor %xmm5,%xmm5 leaq 64(%rsi),%rsi movdqa %xmm6,%xmm2 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 subq $16,%rdx jmp L$cbc_dec_tail_collected @@ -2847,12 +3025,12 @@ L$cbc_dec_one: movups 16(%rcx),%xmm1 leaq 32(%rcx),%rcx xorps %xmm0,%xmm2 -L$oop_dec1_16: +L$oop_dec1_17: .byte 102,15,56,222,209 decl %eax movups (%rcx),%xmm1 leaq 16(%rcx),%rcx - jnz L$oop_dec1_16 + jnz L$oop_dec1_17 .byte 102,15,56,223,209 xorps %xmm10,%xmm2 movaps %xmm11,%xmm10 @@ -2866,6 +3044,7 @@ L$cbc_dec_two: pxor %xmm11,%xmm3 movdqu %xmm2,(%rsi) movdqa %xmm3,%xmm2 + pxor %xmm3,%xmm3 leaq 16(%rsi),%rsi jmp L$cbc_dec_tail_collected .p2align 4 @@ -2878,7 +3057,9 @@ L$cbc_dec_three: movdqu %xmm2,(%rsi) pxor %xmm12,%xmm4 movdqu %xmm3,16(%rsi) + pxor %xmm3,%xmm3 movdqa %xmm4,%xmm2 + pxor %xmm4,%xmm4 leaq 32(%rsi),%rsi jmp L$cbc_dec_tail_collected .p2align 4 @@ -2891,29 +3072,45 @@ L$cbc_dec_four: movdqu %xmm2,(%rsi) pxor %xmm12,%xmm4 movdqu %xmm3,16(%rsi) + pxor %xmm3,%xmm3 pxor %xmm13,%xmm5 movdqu %xmm4,32(%rsi) + pxor %xmm4,%xmm4 movdqa %xmm5,%xmm2 + pxor %xmm5,%xmm5 leaq 48(%rsi),%rsi jmp L$cbc_dec_tail_collected .p2align 4 +L$cbc_dec_clear_tail_collected: + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor 
%xmm7,%xmm7 + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 L$cbc_dec_tail_collected: movups %xmm10,(%r8) andq $15,%rdx jnz L$cbc_dec_tail_partial movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 jmp L$cbc_dec_ret .p2align 4 L$cbc_dec_tail_partial: movaps %xmm2,(%rsp) + pxor %xmm2,%xmm2 movq $16,%rcx movq %rsi,%rdi subq %rdx,%rcx leaq (%rsp),%rsi .long 0x9066A4F3 + movdqa %xmm2,(%rsp) L$cbc_dec_ret: + xorps %xmm0,%xmm0 + pxor %xmm1,%xmm1 leaq (%rbp),%rsp popq %rbp L$cbc_ret: @@ -2951,7 +3148,9 @@ L$dec_key_inverse: movups (%rdx),%xmm0 .byte 102,15,56,219,192 + pxor %xmm1,%xmm1 movups %xmm0,(%rdi) + pxor %xmm0,%xmm0 L$dec_key_ret: addq $8,%rsp .byte 0xf3,0xc3 @@ -2969,8 +3168,10 @@ __aesni_set_encrypt_key: testq %rdx,%rdx jz L$enc_key_ret + movl $268437504,%r10d movups (%rdi),%xmm0 xorps %xmm4,%xmm4 + andl _OPENSSL_ia32cap_P+4(%rip),%r10d leaq 16(%rdx),%rax cmpl $256,%esi je L$14rounds @@ -2981,6 +3182,9 @@ __aesni_set_encrypt_key: L$10rounds: movl $9,%esi + cmpl $268435456,%r10d + je L$10rounds_alt + movups %xmm0,(%rdx) .byte 102,15,58,223,200,1 call L$key_expansion_128_cold @@ -3007,10 +3211,80 @@ L$10rounds: xorl %eax,%eax jmp L$enc_key_ret +.p2align 4 +L$10rounds_alt: + movdqa L$key_rotate(%rip),%xmm5 + movl $8,%r10d + movdqa L$key_rcon1(%rip),%xmm4 + movdqa %xmm0,%xmm2 + movdqu %xmm0,(%rdx) + jmp L$oop_key128 + +.p2align 4 +L$oop_key128: +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + leaq 16(%rax),%rax + + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,-16(%rax) + movdqa %xmm0,%xmm2 + + decl %r10d + jnz L$oop_key128 + + movdqa L$key_rcon1b(%rip),%xmm4 + +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,(%rax) + + movdqa %xmm0,%xmm2 +.byte 102,15,56,0,197 +.byte 
102,15,56,221,196 + + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,16(%rax) + + movl %esi,96(%rax) + xorl %eax,%eax + jmp L$enc_key_ret + .p2align 4 L$12rounds: movq 16(%rdi),%xmm2 movl $11,%esi + cmpl $268435456,%r10d + je L$12rounds_alt + movups %xmm0,(%rdx) .byte 102,15,58,223,202,1 call L$key_expansion_192a_cold @@ -3033,11 +3307,55 @@ L$12rounds: xorq %rax,%rax jmp L$enc_key_ret +.p2align 4 +L$12rounds_alt: + movdqa L$key_rotate192(%rip),%xmm5 + movdqa L$key_rcon1(%rip),%xmm4 + movl $8,%r10d + movdqu %xmm0,(%rdx) + jmp L$oop_key192 + +.p2align 4 +L$oop_key192: + movq %xmm2,0(%rax) + movdqa %xmm2,%xmm1 +.byte 102,15,56,0,213 +.byte 102,15,56,221,212 + pslld $1,%xmm4 + leaq 24(%rax),%rax + + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + + pshufd $255,%xmm0,%xmm3 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + + pxor %xmm2,%xmm0 + pxor %xmm3,%xmm2 + movdqu %xmm0,-16(%rax) + + decl %r10d + jnz L$oop_key192 + + movl %esi,32(%rax) + xorl %eax,%eax + jmp L$enc_key_ret + .p2align 4 L$14rounds: movups 16(%rdi),%xmm2 movl $13,%esi leaq 16(%rax),%rax + cmpl $268435456,%r10d + je L$14rounds_alt + movups %xmm0,(%rdx) movups %xmm2,16(%rdx) .byte 102,15,58,223,202,1 @@ -3071,10 +3389,70 @@ L$14rounds: xorq %rax,%rax jmp L$enc_key_ret +.p2align 4 +L$14rounds_alt: + movdqa L$key_rotate(%rip),%xmm5 + movdqa L$key_rcon1(%rip),%xmm4 + movl $7,%r10d + movdqu %xmm0,0(%rdx) + movdqa %xmm2,%xmm1 + movdqu %xmm2,16(%rdx) + jmp L$oop_key256 + +.p2align 4 +L$oop_key256: +.byte 102,15,56,0,213 +.byte 102,15,56,221,212 + + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + pslld $1,%xmm4 + + pxor %xmm2,%xmm0 + movdqu %xmm0,(%rax) + + decl %r10d + jz L$done_key256 + + pshufd $255,%xmm0,%xmm2 + pxor 
%xmm3,%xmm3 +.byte 102,15,56,221,211 + + movdqa %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm3,%xmm1 + + pxor %xmm1,%xmm2 + movdqu %xmm2,16(%rax) + leaq 32(%rax),%rax + movdqa %xmm2,%xmm1 + + jmp L$oop_key256 + +L$done_key256: + movl %esi,16(%rax) + xorl %eax,%eax + jmp L$enc_key_ret + .p2align 4 L$bad_keybits: movq $-2,%rax L$enc_key_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 addq $8,%rsp .byte 0xf3,0xc3 L$SEH_end_set_encrypt_key: @@ -3160,6 +3538,14 @@ L$xts_magic: .long 0x87,0,1,0 L$increment1: .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +L$key_rotate: +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d +L$key_rotate192: +.long 0x04070605,0x04070605,0x04070605,0x04070605 +L$key_rcon1: +.long 1,1,1,1 +L$key_rcon1b: +.long 0x1b,0x1b,0x1b,0x1b .byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .p2align 6 diff --git a/deps/openssl/asm_obsolete/x64-macosx-gas/bn/x86_64-mont5.s b/deps/openssl/asm_obsolete/x64-macosx-gas/bn/x86_64-mont5.s index ba4d62157c..049bf06473 100644 --- a/deps/openssl/asm_obsolete/x64-macosx-gas/bn/x86_64-mont5.s +++ b/deps/openssl/asm_obsolete/x64-macosx-gas/bn/x86_64-mont5.s @@ -1755,11 +1755,16 @@ L$from_epilogue: .p2align 4 _bn_get_bits5: - movq %rdi,%r10 + leaq 0(%rdi),%r10 + leaq 1(%rdi),%r11 movl %esi,%ecx - shrl $3,%esi - movzwl (%r10,%rsi,1),%eax - andl $7,%ecx + shrl $4,%esi + andl $15,%ecx + leal -8(%rcx),%eax + cmpl $11,%ecx + cmovaq %r11,%r10 + cmoval %eax,%ecx + movzwl (%r10,%rsi,2),%eax shrl %cl,%eax andl $31,%eax .byte 0xf3,0xc3 diff --git a/deps/openssl/asm_obsolete/x64-win32-masm/aes/aesni-sha256-x86_64.asm b/deps/openssl/asm_obsolete/x64-win32-masm/aes/aesni-sha256-x86_64.asm index 9473352638..34b554f9a9 100644 --- 
a/deps/openssl/asm_obsolete/x64-win32-masm/aes/aesni-sha256-x86_64.asm +++ b/deps/openssl/asm_obsolete/x64-win32-masm/aes/aesni-sha256-x86_64.asm @@ -60,77 +60,6 @@ DB 54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98 DB 121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108 DB 46,111,114,103,62,0 ALIGN 64 - mov rsi,rax - mov rax,QWORD PTR[((64+56))+rax] - lea rax,QWORD PTR[48+rax] - - mov rbx,QWORD PTR[((-8))+rax] - mov rbp,QWORD PTR[((-16))+rax] - mov r12,QWORD PTR[((-24))+rax] - mov r13,QWORD PTR[((-32))+rax] - mov r14,QWORD PTR[((-40))+rax] - mov r15,QWORD PTR[((-48))+rax] - mov QWORD PTR[144+r8],rbx - mov QWORD PTR[160+r8],rbp - mov QWORD PTR[216+r8],r12 - mov QWORD PTR[224+r8],r13 - mov QWORD PTR[232+r8],r14 - mov QWORD PTR[240+r8],r15 - - lea rsi,QWORD PTR[((64+64))+rsi] - lea rdi,QWORD PTR[512+r8] - mov ecx,20 - DD 0a548f3fch - -$L$in_prologue:: - mov rdi,QWORD PTR[8+rax] - mov rsi,QWORD PTR[16+rax] - mov QWORD PTR[152+r8],rax - mov QWORD PTR[168+r8],rsi - mov QWORD PTR[176+r8],rdi - - mov rdi,QWORD PTR[40+r9] - mov rsi,r8 - mov ecx,154 - DD 0a548f3fch - - mov rsi,r9 - xor rcx,rcx - mov rdx,QWORD PTR[8+rsi] - mov r8,QWORD PTR[rsi] - mov r9,QWORD PTR[16+rsi] - mov r10,QWORD PTR[40+rsi] - lea r11,QWORD PTR[56+rsi] - lea r12,QWORD PTR[24+rsi] - mov QWORD PTR[32+rsp],r10 - mov QWORD PTR[40+rsp],r11 - mov QWORD PTR[48+rsp],r12 - mov QWORD PTR[56+rsp],rcx - call QWORD PTR[__imp_RtlVirtualUnwind] - - mov eax,1 - add rsp,64 - popfq - pop r15 - pop r14 - pop r13 - pop r12 - pop rbp - pop rbx - pop rdi - pop rsi - DB 0F3h,0C3h ;repret - .text$ ENDS -.pdata SEGMENT READONLY ALIGN(4) - DD imagerel $L$SEH_begin_aesni_cbc_sha256_enc_xop - DD imagerel $L$SEH_end_aesni_cbc_sha256_enc_xop - DD imagerel $L$SEH_info_aesni_cbc_sha256_enc_xop - - DD imagerel $L$SEH_begin_aesni_cbc_sha256_enc_avx - DD imagerel $L$SEH_end_aesni_cbc_sha256_enc_avx - DD imagerel $L$SEH_info_aesni_cbc_sha256_enc_avx - -.pdata ENDS END diff --git 
a/deps/openssl/asm_obsolete/x64-win32-masm/aes/aesni-x86_64.asm b/deps/openssl/asm_obsolete/x64-win32-masm/aes/aesni-x86_64.asm index 53d8afc950..5e848125d6 100644 --- a/deps/openssl/asm_obsolete/x64-win32-masm/aes/aesni-x86_64.asm +++ b/deps/openssl/asm_obsolete/x64-win32-masm/aes/aesni-x86_64.asm @@ -18,7 +18,10 @@ DB 102,15,56,220,209 lea r8,QWORD PTR[16+r8] jnz $L$oop_enc1_1 DB 102,15,56,221,209 + pxor xmm0,xmm0 + pxor xmm1,xmm1 movups XMMWORD PTR[rdx],xmm2 + pxor xmm2,xmm2 DB 0F3h,0C3h ;repret aesni_encrypt ENDP @@ -39,7 +42,10 @@ DB 102,15,56,222,209 lea r8,QWORD PTR[16+r8] jnz $L$oop_dec1_2 DB 102,15,56,223,209 + pxor xmm0,xmm0 + pxor xmm1,xmm1 movups XMMWORD PTR[rdx],xmm2 + pxor xmm2,xmm2 DB 0F3h,0C3h ;repret aesni_decrypt ENDP @@ -265,21 +271,18 @@ DB 102,15,56,220,217 pxor xmm6,xmm0 DB 102,15,56,220,225 pxor xmm7,xmm0 + movups xmm0,XMMWORD PTR[rax*1+rcx] add rax,16 -DB 102,15,56,220,233 -DB 102,15,56,220,241 -DB 102,15,56,220,249 - movups xmm0,XMMWORD PTR[((-16))+rax*1+rcx] jmp $L$enc_loop6_enter ALIGN 16 $L$enc_loop6:: DB 102,15,56,220,209 DB 102,15,56,220,217 DB 102,15,56,220,225 +$L$enc_loop6_enter:: DB 102,15,56,220,233 DB 102,15,56,220,241 DB 102,15,56,220,249 -$L$enc_loop6_enter:: movups xmm1,XMMWORD PTR[rax*1+rcx] add rax,32 DB 102,15,56,220,208 @@ -322,21 +325,18 @@ DB 102,15,56,222,217 pxor xmm6,xmm0 DB 102,15,56,222,225 pxor xmm7,xmm0 + movups xmm0,XMMWORD PTR[rax*1+rcx] add rax,16 -DB 102,15,56,222,233 -DB 102,15,56,222,241 -DB 102,15,56,222,249 - movups xmm0,XMMWORD PTR[((-16))+rax*1+rcx] jmp $L$dec_loop6_enter ALIGN 16 $L$dec_loop6:: DB 102,15,56,222,209 DB 102,15,56,222,217 DB 102,15,56,222,225 +$L$dec_loop6_enter:: DB 102,15,56,222,233 DB 102,15,56,222,241 DB 102,15,56,222,249 -$L$dec_loop6_enter:: movups xmm1,XMMWORD PTR[rax*1+rcx] add rax,32 DB 102,15,56,222,208 @@ -376,23 +376,18 @@ _aesni_encrypt8 PROC PRIVATE lea rcx,QWORD PTR[32+rax*1+rcx] neg rax DB 102,15,56,220,209 - add rax,16 pxor xmm7,xmm0 -DB 102,15,56,220,217 pxor xmm8,xmm0 
+DB 102,15,56,220,217 pxor xmm9,xmm0 -DB 102,15,56,220,225 -DB 102,15,56,220,233 -DB 102,15,56,220,241 -DB 102,15,56,220,249 -DB 102,68,15,56,220,193 -DB 102,68,15,56,220,201 - movups xmm0,XMMWORD PTR[((-16))+rax*1+rcx] - jmp $L$enc_loop8_enter + movups xmm0,XMMWORD PTR[rax*1+rcx] + add rax,16 + jmp $L$enc_loop8_inner ALIGN 16 $L$enc_loop8:: DB 102,15,56,220,209 DB 102,15,56,220,217 +$L$enc_loop8_inner:: DB 102,15,56,220,225 DB 102,15,56,220,233 DB 102,15,56,220,241 @@ -445,23 +440,18 @@ _aesni_decrypt8 PROC PRIVATE lea rcx,QWORD PTR[32+rax*1+rcx] neg rax DB 102,15,56,222,209 - add rax,16 pxor xmm7,xmm0 -DB 102,15,56,222,217 pxor xmm8,xmm0 +DB 102,15,56,222,217 pxor xmm9,xmm0 -DB 102,15,56,222,225 -DB 102,15,56,222,233 -DB 102,15,56,222,241 -DB 102,15,56,222,249 -DB 102,68,15,56,222,193 -DB 102,68,15,56,222,201 - movups xmm0,XMMWORD PTR[((-16))+rax*1+rcx] - jmp $L$dec_loop8_enter + movups xmm0,XMMWORD PTR[rax*1+rcx] + add rax,16 + jmp $L$dec_loop8_inner ALIGN 16 $L$dec_loop8:: DB 102,15,56,222,209 DB 102,15,56,222,217 +$L$dec_loop8_inner:: DB 102,15,56,222,225 DB 102,15,56,222,233 DB 102,15,56,222,241 @@ -605,6 +595,7 @@ $L$ecb_enc_tail:: movups xmm7,XMMWORD PTR[80+rdi] je $L$ecb_enc_six movdqu xmm8,XMMWORD PTR[96+rdi] + xorps xmm9,xmm9 call _aesni_encrypt8 movups XMMWORD PTR[rsi],xmm2 movups XMMWORD PTR[16+rsi],xmm3 @@ -718,15 +709,23 @@ $L$ecb_dec_loop8_enter:: jnc $L$ecb_dec_loop8 movups XMMWORD PTR[rsi],xmm2 + pxor xmm2,xmm2 mov rcx,r11 movups XMMWORD PTR[16+rsi],xmm3 + pxor xmm3,xmm3 mov eax,r10d movups XMMWORD PTR[32+rsi],xmm4 + pxor xmm4,xmm4 movups XMMWORD PTR[48+rsi],xmm5 + pxor xmm5,xmm5 movups XMMWORD PTR[64+rsi],xmm6 + pxor xmm6,xmm6 movups XMMWORD PTR[80+rsi],xmm7 + pxor xmm7,xmm7 movups XMMWORD PTR[96+rsi],xmm8 + pxor xmm8,xmm8 movups XMMWORD PTR[112+rsi],xmm9 + pxor xmm9,xmm9 lea rsi,QWORD PTR[128+rsi] add rdx,080h jz $L$ecb_ret @@ -749,14 +748,23 @@ $L$ecb_dec_tail:: je $L$ecb_dec_six movups xmm8,XMMWORD PTR[96+rdi] movups xmm0,XMMWORD PTR[rcx] + 
xorps xmm9,xmm9 call _aesni_decrypt8 movups XMMWORD PTR[rsi],xmm2 + pxor xmm2,xmm2 movups XMMWORD PTR[16+rsi],xmm3 + pxor xmm3,xmm3 movups XMMWORD PTR[32+rsi],xmm4 + pxor xmm4,xmm4 movups XMMWORD PTR[48+rsi],xmm5 + pxor xmm5,xmm5 movups XMMWORD PTR[64+rsi],xmm6 + pxor xmm6,xmm6 movups XMMWORD PTR[80+rsi],xmm7 + pxor xmm7,xmm7 movups XMMWORD PTR[96+rsi],xmm8 + pxor xmm8,xmm8 + pxor xmm9,xmm9 jmp $L$ecb_ret ALIGN 16 $L$ecb_dec_one:: @@ -772,53 +780,81 @@ DB 102,15,56,222,209 jnz $L$oop_dec1_4 DB 102,15,56,223,209 movups XMMWORD PTR[rsi],xmm2 + pxor xmm2,xmm2 jmp $L$ecb_ret ALIGN 16 $L$ecb_dec_two:: call _aesni_decrypt2 movups XMMWORD PTR[rsi],xmm2 + pxor xmm2,xmm2 movups XMMWORD PTR[16+rsi],xmm3 + pxor xmm3,xmm3 jmp $L$ecb_ret ALIGN 16 $L$ecb_dec_three:: call _aesni_decrypt3 movups XMMWORD PTR[rsi],xmm2 + pxor xmm2,xmm2 movups XMMWORD PTR[16+rsi],xmm3 + pxor xmm3,xmm3 movups XMMWORD PTR[32+rsi],xmm4 + pxor xmm4,xmm4 jmp $L$ecb_ret ALIGN 16 $L$ecb_dec_four:: call _aesni_decrypt4 movups XMMWORD PTR[rsi],xmm2 + pxor xmm2,xmm2 movups XMMWORD PTR[16+rsi],xmm3 + pxor xmm3,xmm3 movups XMMWORD PTR[32+rsi],xmm4 + pxor xmm4,xmm4 movups XMMWORD PTR[48+rsi],xmm5 + pxor xmm5,xmm5 jmp $L$ecb_ret ALIGN 16 $L$ecb_dec_five:: xorps xmm7,xmm7 call _aesni_decrypt6 movups XMMWORD PTR[rsi],xmm2 + pxor xmm2,xmm2 movups XMMWORD PTR[16+rsi],xmm3 + pxor xmm3,xmm3 movups XMMWORD PTR[32+rsi],xmm4 + pxor xmm4,xmm4 movups XMMWORD PTR[48+rsi],xmm5 + pxor xmm5,xmm5 movups XMMWORD PTR[64+rsi],xmm6 + pxor xmm6,xmm6 + pxor xmm7,xmm7 jmp $L$ecb_ret ALIGN 16 $L$ecb_dec_six:: call _aesni_decrypt6 movups XMMWORD PTR[rsi],xmm2 + pxor xmm2,xmm2 movups XMMWORD PTR[16+rsi],xmm3 + pxor xmm3,xmm3 movups XMMWORD PTR[32+rsi],xmm4 + pxor xmm4,xmm4 movups XMMWORD PTR[48+rsi],xmm5 + pxor xmm5,xmm5 movups XMMWORD PTR[64+rsi],xmm6 + pxor xmm6,xmm6 movups XMMWORD PTR[80+rsi],xmm7 + pxor xmm7,xmm7 $L$ecb_ret:: + xorps xmm0,xmm0 + pxor xmm1,xmm1 movaps xmm6,XMMWORD PTR[rsp] + movaps XMMWORD PTR[rsp],xmm0 movaps 
xmm7,XMMWORD PTR[16+rsp] + movaps XMMWORD PTR[16+rsp],xmm0 movaps xmm8,XMMWORD PTR[32+rsp] + movaps XMMWORD PTR[32+rsp],xmm0 movaps xmm9,XMMWORD PTR[48+rsp] + movaps XMMWORD PTR[48+rsp],xmm0 lea rsp,QWORD PTR[88+rsp] $L$ecb_enc_ret:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue @@ -898,11 +934,21 @@ DB 102,15,56,0,215 lea rsi,QWORD PTR[16+rsi] jnz $L$ccm64_enc_outer + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 movups XMMWORD PTR[r9],xmm3 + pxor xmm3,xmm3 + pxor xmm8,xmm8 + pxor xmm6,xmm6 movaps xmm6,XMMWORD PTR[rsp] + movaps XMMWORD PTR[rsp],xmm0 movaps xmm7,XMMWORD PTR[16+rsp] + movaps XMMWORD PTR[16+rsp],xmm0 movaps xmm8,XMMWORD PTR[32+rsp] + movaps XMMWORD PTR[32+rsp],xmm0 movaps xmm9,XMMWORD PTR[48+rsp] + movaps XMMWORD PTR[48+rsp],xmm0 lea rsp,QWORD PTR[88+rsp] $L$ccm64_enc_ret:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue @@ -1016,11 +1062,21 @@ DB 102,15,56,220,217 lea r11,QWORD PTR[16+r11] jnz $L$oop_enc1_6 DB 102,15,56,221,217 + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 movups XMMWORD PTR[r9],xmm3 + pxor xmm3,xmm3 + pxor xmm8,xmm8 + pxor xmm6,xmm6 movaps xmm6,XMMWORD PTR[rsp] + movaps XMMWORD PTR[rsp],xmm0 movaps xmm7,XMMWORD PTR[16+rsp] + movaps XMMWORD PTR[16+rsp],xmm0 movaps xmm8,XMMWORD PTR[32+rsp] + movaps XMMWORD PTR[32+rsp],xmm0 movaps xmm9,XMMWORD PTR[48+rsp] + movaps XMMWORD PTR[48+rsp],xmm0 lea rsp,QWORD PTR[88+rsp] $L$ccm64_dec_ret:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue @@ -1043,6 +1099,35 @@ $L$SEH_begin_aesni_ctr32_encrypt_blocks:: mov r8,QWORD PTR[40+rsp] + cmp rdx,1 + jne $L$ctr32_bulk + + + + movups xmm2,XMMWORD PTR[r8] + movups xmm3,XMMWORD PTR[rdi] + mov edx,DWORD PTR[240+rcx] + movups xmm0,XMMWORD PTR[rcx] + movups xmm1,XMMWORD PTR[16+rcx] + lea rcx,QWORD PTR[32+rcx] + xorps xmm2,xmm0 +$L$oop_enc1_7:: +DB 102,15,56,220,209 + dec edx + movups xmm1,XMMWORD PTR[rcx] + lea rcx,QWORD PTR[16+rcx] + jnz $L$oop_enc1_7 +DB 102,15,56,221,209 + pxor xmm0,xmm0 + pxor xmm1,xmm1 + xorps xmm2,xmm3 + pxor xmm3,xmm3 + movups XMMWORD 
PTR[rsi],xmm2 + xorps xmm2,xmm2 + jmp $L$ctr32_epilogue + +ALIGN 16 +$L$ctr32_bulk:: lea rax,QWORD PTR[rsp] push rbp sub rsp,288 @@ -1060,8 +1145,8 @@ $L$SEH_begin_aesni_ctr32_encrypt_blocks:: $L$ctr32_body:: lea rbp,QWORD PTR[((-8))+rax] - cmp rdx,1 - je $L$ctr32_one_shortcut + + movdqu xmm2,XMMWORD PTR[r8] movdqu xmm0,XMMWORD PTR[rcx] @@ -1452,11 +1537,14 @@ DB 102,69,15,56,221,202 lea rcx,QWORD PTR[((-128))+rcx] $L$ctr32_tail:: + + lea rcx,QWORD PTR[16+rcx] cmp rdx,4 jb $L$ctr32_loop3 je $L$ctr32_loop4 + shl eax,4 movdqa xmm8,XMMWORD PTR[96+rsp] pxor xmm9,xmm9 @@ -1559,40 +1647,43 @@ DB 102,15,56,221,225 movups xmm12,XMMWORD PTR[32+rdi] xorps xmm4,xmm12 movups XMMWORD PTR[32+rsi],xmm4 - jmp $L$ctr32_done -ALIGN 16 -$L$ctr32_one_shortcut:: - movups xmm2,XMMWORD PTR[r8] - movups xmm10,XMMWORD PTR[rdi] - mov eax,DWORD PTR[240+rcx] - movups xmm0,XMMWORD PTR[rcx] - movups xmm1,XMMWORD PTR[16+rcx] - lea rcx,QWORD PTR[32+rcx] - xorps xmm2,xmm0 -$L$oop_enc1_7:: -DB 102,15,56,220,209 - dec eax - movups xmm1,XMMWORD PTR[rcx] - lea rcx,QWORD PTR[16+rcx] - jnz $L$oop_enc1_7 -DB 102,15,56,221,209 - xorps xmm2,xmm10 - movups XMMWORD PTR[rsi],xmm2 - jmp $L$ctr32_done - -ALIGN 16 $L$ctr32_done:: + xorps xmm0,xmm0 + xor r11d,r11d + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 movaps xmm6,XMMWORD PTR[((-160))+rbp] + movaps XMMWORD PTR[(-160)+rbp],xmm0 movaps xmm7,XMMWORD PTR[((-144))+rbp] + movaps XMMWORD PTR[(-144)+rbp],xmm0 movaps xmm8,XMMWORD PTR[((-128))+rbp] + movaps XMMWORD PTR[(-128)+rbp],xmm0 movaps xmm9,XMMWORD PTR[((-112))+rbp] + movaps XMMWORD PTR[(-112)+rbp],xmm0 movaps xmm10,XMMWORD PTR[((-96))+rbp] + movaps XMMWORD PTR[(-96)+rbp],xmm0 movaps xmm11,XMMWORD PTR[((-80))+rbp] + movaps XMMWORD PTR[(-80)+rbp],xmm0 movaps xmm12,XMMWORD PTR[((-64))+rbp] + movaps XMMWORD PTR[(-64)+rbp],xmm0 movaps xmm13,XMMWORD PTR[((-48))+rbp] + movaps XMMWORD PTR[(-48)+rbp],xmm0 movaps xmm14,XMMWORD PTR[((-32))+rbp] + movaps XMMWORD PTR[(-32)+rbp],xmm0 
movaps xmm15,XMMWORD PTR[((-16))+rbp] + movaps XMMWORD PTR[(-16)+rbp],xmm0 + movaps XMMWORD PTR[rsp],xmm0 + movaps XMMWORD PTR[16+rsp],xmm0 + movaps XMMWORD PTR[32+rsp],xmm0 + movaps XMMWORD PTR[48+rsp],xmm0 + movaps XMMWORD PTR[64+rsp],xmm0 + movaps XMMWORD PTR[80+rsp],xmm0 + movaps XMMWORD PTR[96+rsp],xmm0 + movaps XMMWORD PTR[112+rsp],xmm0 lea rsp,QWORD PTR[rbp] pop rbp $L$ctr32_epilogue:: @@ -1889,6 +1980,7 @@ DB 102,15,56,221,124,36,80 shr eax,4 $L$xts_enc_short:: + mov r10d,eax pxor xmm10,xmm0 add rdx,16*6 @@ -1917,6 +2009,7 @@ $L$xts_enc_short:: pxor xmm4,xmm12 pxor xmm5,xmm13 pxor xmm6,xmm14 + pxor xmm7,xmm7 call _aesni_encrypt6 @@ -2059,16 +2152,39 @@ DB 102,15,56,221,209 movups XMMWORD PTR[(-16)+rsi],xmm2 $L$xts_enc_ret:: + xorps xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 movaps xmm6,XMMWORD PTR[((-160))+rbp] + movaps XMMWORD PTR[(-160)+rbp],xmm0 movaps xmm7,XMMWORD PTR[((-144))+rbp] + movaps XMMWORD PTR[(-144)+rbp],xmm0 movaps xmm8,XMMWORD PTR[((-128))+rbp] + movaps XMMWORD PTR[(-128)+rbp],xmm0 movaps xmm9,XMMWORD PTR[((-112))+rbp] + movaps XMMWORD PTR[(-112)+rbp],xmm0 movaps xmm10,XMMWORD PTR[((-96))+rbp] + movaps XMMWORD PTR[(-96)+rbp],xmm0 movaps xmm11,XMMWORD PTR[((-80))+rbp] + movaps XMMWORD PTR[(-80)+rbp],xmm0 movaps xmm12,XMMWORD PTR[((-64))+rbp] + movaps XMMWORD PTR[(-64)+rbp],xmm0 movaps xmm13,XMMWORD PTR[((-48))+rbp] + movaps XMMWORD PTR[(-48)+rbp],xmm0 movaps xmm14,XMMWORD PTR[((-32))+rbp] + movaps XMMWORD PTR[(-32)+rbp],xmm0 movaps xmm15,XMMWORD PTR[((-16))+rbp] + movaps XMMWORD PTR[(-16)+rbp],xmm0 + movaps XMMWORD PTR[rsp],xmm0 + movaps XMMWORD PTR[16+rsp],xmm0 + movaps XMMWORD PTR[32+rsp],xmm0 + movaps XMMWORD PTR[48+rsp],xmm0 + movaps XMMWORD PTR[64+rsp],xmm0 + movaps XMMWORD PTR[80+rsp],xmm0 + movaps XMMWORD PTR[96+rsp],xmm0 lea rsp,QWORD PTR[rbp] pop rbp $L$xts_enc_epilogue:: @@ -2371,6 +2487,7 @@ DB 102,15,56,223,124,36,80 shr eax,4 $L$xts_dec_short:: + mov r10d,eax pxor xmm10,xmm0 
pxor xmm11,xmm0 @@ -2573,16 +2690,39 @@ DB 102,15,56,223,209 movups XMMWORD PTR[rsi],xmm2 $L$xts_dec_ret:: + xorps xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 movaps xmm6,XMMWORD PTR[((-160))+rbp] + movaps XMMWORD PTR[(-160)+rbp],xmm0 movaps xmm7,XMMWORD PTR[((-144))+rbp] + movaps XMMWORD PTR[(-144)+rbp],xmm0 movaps xmm8,XMMWORD PTR[((-128))+rbp] + movaps XMMWORD PTR[(-128)+rbp],xmm0 movaps xmm9,XMMWORD PTR[((-112))+rbp] + movaps XMMWORD PTR[(-112)+rbp],xmm0 movaps xmm10,XMMWORD PTR[((-96))+rbp] + movaps XMMWORD PTR[(-96)+rbp],xmm0 movaps xmm11,XMMWORD PTR[((-80))+rbp] + movaps XMMWORD PTR[(-80)+rbp],xmm0 movaps xmm12,XMMWORD PTR[((-64))+rbp] + movaps XMMWORD PTR[(-64)+rbp],xmm0 movaps xmm13,XMMWORD PTR[((-48))+rbp] + movaps XMMWORD PTR[(-48)+rbp],xmm0 movaps xmm14,XMMWORD PTR[((-32))+rbp] + movaps XMMWORD PTR[(-32)+rbp],xmm0 movaps xmm15,XMMWORD PTR[((-16))+rbp] + movaps XMMWORD PTR[(-16)+rbp],xmm0 + movaps XMMWORD PTR[rsp],xmm0 + movaps XMMWORD PTR[16+rsp],xmm0 + movaps XMMWORD PTR[32+rsp],xmm0 + movaps XMMWORD PTR[48+rsp],xmm0 + movaps XMMWORD PTR[64+rsp],xmm0 + movaps XMMWORD PTR[80+rsp],xmm0 + movaps XMMWORD PTR[96+rsp],xmm0 lea rsp,QWORD PTR[rbp] pop rbp $L$xts_dec_epilogue:: @@ -2646,7 +2786,11 @@ DB 102,15,56,221,209 jnc $L$cbc_enc_loop add rdx,16 jnz $L$cbc_enc_tail + pxor xmm0,xmm0 + pxor xmm1,xmm1 movups XMMWORD PTR[r8],xmm2 + pxor xmm2,xmm2 + pxor xmm3,xmm3 jmp $L$cbc_ret $L$cbc_enc_tail:: @@ -2666,6 +2810,35 @@ $L$cbc_enc_tail:: ALIGN 16 $L$cbc_decrypt:: + cmp rdx,16 + jne $L$cbc_decrypt_bulk + + + + movdqu xmm2,XMMWORD PTR[rdi] + movdqu xmm3,XMMWORD PTR[r8] + movdqa xmm4,xmm2 + movups xmm0,XMMWORD PTR[rcx] + movups xmm1,XMMWORD PTR[16+rcx] + lea rcx,QWORD PTR[32+rcx] + xorps xmm2,xmm0 +$L$oop_dec1_16:: +DB 102,15,56,222,209 + dec r10d + movups xmm1,XMMWORD PTR[rcx] + lea rcx,QWORD PTR[16+rcx] + jnz $L$oop_dec1_16 +DB 102,15,56,223,209 + pxor xmm0,xmm0 + pxor xmm1,xmm1 + movdqu XMMWORD PTR[r8],xmm4 + 
xorps xmm2,xmm3 + pxor xmm3,xmm3 + movups XMMWORD PTR[rsi],xmm2 + pxor xmm2,xmm2 + jmp $L$cbc_ret +ALIGN 16 +$L$cbc_decrypt_bulk:: lea rax,QWORD PTR[rsp] push rbp sub rsp,176 @@ -2913,7 +3086,7 @@ DB 102,69,15,56,223,202 movaps xmm2,xmm9 lea rcx,QWORD PTR[((-112))+rcx] add rdx,070h - jle $L$cbc_dec_tail_collected + jle $L$cbc_dec_clear_tail_collected movups XMMWORD PTR[rsi],xmm9 lea rsi,QWORD PTR[16+rsi] cmp rdx,050h @@ -2932,14 +3105,19 @@ $L$cbc_dec_six_or_seven:: movdqu XMMWORD PTR[rsi],xmm2 pxor xmm4,xmm12 movdqu XMMWORD PTR[16+rsi],xmm3 + pxor xmm3,xmm3 pxor xmm5,xmm13 movdqu XMMWORD PTR[32+rsi],xmm4 + pxor xmm4,xmm4 pxor xmm6,xmm14 movdqu XMMWORD PTR[48+rsi],xmm5 + pxor xmm5,xmm5 pxor xmm7,xmm15 movdqu XMMWORD PTR[64+rsi],xmm6 + pxor xmm6,xmm6 lea rsi,QWORD PTR[80+rsi] movdqa xmm2,xmm7 + pxor xmm7,xmm7 jmp $L$cbc_dec_tail_collected ALIGN 16 @@ -2954,16 +3132,23 @@ $L$cbc_dec_seven:: movdqu XMMWORD PTR[rsi],xmm2 pxor xmm4,xmm12 movdqu XMMWORD PTR[16+rsi],xmm3 + pxor xmm3,xmm3 pxor xmm5,xmm13 movdqu XMMWORD PTR[32+rsi],xmm4 + pxor xmm4,xmm4 pxor xmm6,xmm14 movdqu XMMWORD PTR[48+rsi],xmm5 + pxor xmm5,xmm5 pxor xmm7,xmm15 movdqu XMMWORD PTR[64+rsi],xmm6 + pxor xmm6,xmm6 pxor xmm8,xmm9 movdqu XMMWORD PTR[80+rsi],xmm7 + pxor xmm7,xmm7 lea rsi,QWORD PTR[96+rsi] movdqa xmm2,xmm8 + pxor xmm8,xmm8 + pxor xmm9,xmm9 jmp $L$cbc_dec_tail_collected ALIGN 16 @@ -3007,7 +3192,7 @@ $L$cbc_dec_loop6_enter:: movdqa xmm2,xmm7 add rdx,050h - jle $L$cbc_dec_tail_collected + jle $L$cbc_dec_clear_tail_collected movups XMMWORD PTR[rsi],xmm7 lea rsi,QWORD PTR[16+rsi] @@ -3042,12 +3227,17 @@ $L$cbc_dec_tail:: movdqu XMMWORD PTR[rsi],xmm2 pxor xmm4,xmm12 movdqu XMMWORD PTR[16+rsi],xmm3 + pxor xmm3,xmm3 pxor xmm5,xmm13 movdqu XMMWORD PTR[32+rsi],xmm4 + pxor xmm4,xmm4 pxor xmm6,xmm14 movdqu XMMWORD PTR[48+rsi],xmm5 + pxor xmm5,xmm5 lea rsi,QWORD PTR[64+rsi] movdqa xmm2,xmm6 + pxor xmm6,xmm6 + pxor xmm7,xmm7 sub rdx,010h jmp $L$cbc_dec_tail_collected @@ -3058,12 +3248,12 @@ $L$cbc_dec_one:: 
movups xmm1,XMMWORD PTR[16+rcx] lea rcx,QWORD PTR[32+rcx] xorps xmm2,xmm0 -$L$oop_dec1_16:: +$L$oop_dec1_17:: DB 102,15,56,222,209 dec eax movups xmm1,XMMWORD PTR[rcx] lea rcx,QWORD PTR[16+rcx] - jnz $L$oop_dec1_16 + jnz $L$oop_dec1_17 DB 102,15,56,223,209 xorps xmm2,xmm10 movaps xmm10,xmm11 @@ -3077,6 +3267,7 @@ $L$cbc_dec_two:: pxor xmm3,xmm11 movdqu XMMWORD PTR[rsi],xmm2 movdqa xmm2,xmm3 + pxor xmm3,xmm3 lea rsi,QWORD PTR[16+rsi] jmp $L$cbc_dec_tail_collected ALIGN 16 @@ -3089,7 +3280,9 @@ $L$cbc_dec_three:: movdqu XMMWORD PTR[rsi],xmm2 pxor xmm4,xmm12 movdqu XMMWORD PTR[16+rsi],xmm3 + pxor xmm3,xmm3 movdqa xmm2,xmm4 + pxor xmm4,xmm4 lea rsi,QWORD PTR[32+rsi] jmp $L$cbc_dec_tail_collected ALIGN 16 @@ -3102,39 +3295,61 @@ $L$cbc_dec_four:: movdqu XMMWORD PTR[rsi],xmm2 pxor xmm4,xmm12 movdqu XMMWORD PTR[16+rsi],xmm3 + pxor xmm3,xmm3 pxor xmm5,xmm13 movdqu XMMWORD PTR[32+rsi],xmm4 + pxor xmm4,xmm4 movdqa xmm2,xmm5 + pxor xmm5,xmm5 lea rsi,QWORD PTR[48+rsi] jmp $L$cbc_dec_tail_collected ALIGN 16 +$L$cbc_dec_clear_tail_collected:: + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 $L$cbc_dec_tail_collected:: movups XMMWORD PTR[r8],xmm10 and rdx,15 jnz $L$cbc_dec_tail_partial movups XMMWORD PTR[rsi],xmm2 + pxor xmm2,xmm2 jmp $L$cbc_dec_ret ALIGN 16 $L$cbc_dec_tail_partial:: movaps XMMWORD PTR[rsp],xmm2 + pxor xmm2,xmm2 mov rcx,16 mov rdi,rsi sub rcx,rdx lea rsi,QWORD PTR[rsp] DD 09066A4F3h + movdqa XMMWORD PTR[rsp],xmm2 $L$cbc_dec_ret:: + xorps xmm0,xmm0 + pxor xmm1,xmm1 movaps xmm6,XMMWORD PTR[16+rsp] + movaps XMMWORD PTR[16+rsp],xmm0 movaps xmm7,XMMWORD PTR[32+rsp] + movaps XMMWORD PTR[32+rsp],xmm0 movaps xmm8,XMMWORD PTR[48+rsp] + movaps XMMWORD PTR[48+rsp],xmm0 movaps xmm9,XMMWORD PTR[64+rsp] + movaps XMMWORD PTR[64+rsp],xmm0 movaps xmm10,XMMWORD PTR[80+rsp] + movaps XMMWORD PTR[80+rsp],xmm0 movaps xmm11,XMMWORD PTR[96+rsp] + movaps XMMWORD PTR[96+rsp],xmm0 movaps xmm12,XMMWORD PTR[112+rsp] + movaps XMMWORD PTR[112+rsp],xmm0 movaps xmm13,XMMWORD PTR[128+rsp] + 
movaps XMMWORD PTR[128+rsp],xmm0 movaps xmm14,XMMWORD PTR[144+rsp] + movaps XMMWORD PTR[144+rsp],xmm0 movaps xmm15,XMMWORD PTR[160+rsp] + movaps XMMWORD PTR[160+rsp],xmm0 lea rsp,QWORD PTR[rbp] pop rbp $L$cbc_ret:: @@ -3175,7 +3390,9 @@ DB 102,15,56,219,201 movups xmm0,XMMWORD PTR[r8] DB 102,15,56,219,192 + pxor xmm1,xmm1 movups XMMWORD PTR[rcx],xmm0 + pxor xmm0,xmm0 $L$dec_key_ret:: add rsp,8 DB 0F3h,0C3h ;repret @@ -3193,8 +3410,10 @@ DB 048h,083h,0ECh,008h test r8,r8 jz $L$enc_key_ret + mov r10d,268437504 movups xmm0,XMMWORD PTR[rcx] xorps xmm4,xmm4 + and r10d,DWORD PTR[((OPENSSL_ia32cap_P+4))] lea rax,QWORD PTR[16+r8] cmp edx,256 je $L$14rounds @@ -3205,6 +3424,9 @@ DB 048h,083h,0ECh,008h $L$10rounds:: mov edx,9 + cmp r10d,268435456 + je $L$10rounds_alt + movups XMMWORD PTR[r8],xmm0 DB 102,15,58,223,200,1 call $L$key_expansion_128_cold @@ -3231,10 +3453,80 @@ DB 102,15,58,223,200,54 xor eax,eax jmp $L$enc_key_ret +ALIGN 16 +$L$10rounds_alt:: + movdqa xmm5,XMMWORD PTR[$L$key_rotate] + mov r10d,8 + movdqa xmm4,XMMWORD PTR[$L$key_rcon1] + movdqa xmm2,xmm0 + movdqu XMMWORD PTR[r8],xmm0 + jmp $L$oop_key128 + +ALIGN 16 +$L$oop_key128:: +DB 102,15,56,0,197 +DB 102,15,56,221,196 + pslld xmm4,1 + lea rax,QWORD PTR[16+rax] + + movdqa xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm2,xmm3 + + pxor xmm0,xmm2 + movdqu XMMWORD PTR[(-16)+rax],xmm0 + movdqa xmm2,xmm0 + + dec r10d + jnz $L$oop_key128 + + movdqa xmm4,XMMWORD PTR[$L$key_rcon1b] + +DB 102,15,56,0,197 +DB 102,15,56,221,196 + pslld xmm4,1 + + movdqa xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm2,xmm3 + + pxor xmm0,xmm2 + movdqu XMMWORD PTR[rax],xmm0 + + movdqa xmm2,xmm0 +DB 102,15,56,0,197 +DB 102,15,56,221,196 + + movdqa xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm2,xmm3 + + pxor xmm0,xmm2 + movdqu XMMWORD PTR[16+rax],xmm0 + + mov DWORD 
PTR[96+rax],edx + xor eax,eax + jmp $L$enc_key_ret + ALIGN 16 $L$12rounds:: movq xmm2,QWORD PTR[16+rcx] mov edx,11 + cmp r10d,268435456 + je $L$12rounds_alt + movups XMMWORD PTR[r8],xmm0 DB 102,15,58,223,202,1 call $L$key_expansion_192a_cold @@ -3257,11 +3549,55 @@ DB 102,15,58,223,202,128 xor rax,rax jmp $L$enc_key_ret +ALIGN 16 +$L$12rounds_alt:: + movdqa xmm5,XMMWORD PTR[$L$key_rotate192] + movdqa xmm4,XMMWORD PTR[$L$key_rcon1] + mov r10d,8 + movdqu XMMWORD PTR[r8],xmm0 + jmp $L$oop_key192 + +ALIGN 16 +$L$oop_key192:: + movq QWORD PTR[rax],xmm2 + movdqa xmm1,xmm2 +DB 102,15,56,0,213 +DB 102,15,56,221,212 + pslld xmm4,1 + lea rax,QWORD PTR[24+rax] + + movdqa xmm3,xmm0 + pslldq xmm0,4 + pxor xmm3,xmm0 + pslldq xmm0,4 + pxor xmm3,xmm0 + pslldq xmm0,4 + pxor xmm0,xmm3 + + pshufd xmm3,xmm0,0ffh + pxor xmm3,xmm1 + pslldq xmm1,4 + pxor xmm3,xmm1 + + pxor xmm0,xmm2 + pxor xmm2,xmm3 + movdqu XMMWORD PTR[(-16)+rax],xmm0 + + dec r10d + jnz $L$oop_key192 + + mov DWORD PTR[32+rax],edx + xor eax,eax + jmp $L$enc_key_ret + ALIGN 16 $L$14rounds:: movups xmm2,XMMWORD PTR[16+rcx] mov edx,13 lea rax,QWORD PTR[16+rax] + cmp r10d,268435456 + je $L$14rounds_alt + movups XMMWORD PTR[r8],xmm0 movups XMMWORD PTR[16+r8],xmm2 DB 102,15,58,223,202,1 @@ -3295,10 +3631,70 @@ DB 102,15,58,223,202,64 xor rax,rax jmp $L$enc_key_ret +ALIGN 16 +$L$14rounds_alt:: + movdqa xmm5,XMMWORD PTR[$L$key_rotate] + movdqa xmm4,XMMWORD PTR[$L$key_rcon1] + mov r10d,7 + movdqu XMMWORD PTR[r8],xmm0 + movdqa xmm1,xmm2 + movdqu XMMWORD PTR[16+r8],xmm2 + jmp $L$oop_key256 + +ALIGN 16 +$L$oop_key256:: +DB 102,15,56,0,213 +DB 102,15,56,221,212 + + movdqa xmm3,xmm0 + pslldq xmm0,4 + pxor xmm3,xmm0 + pslldq xmm0,4 + pxor xmm3,xmm0 + pslldq xmm0,4 + pxor xmm0,xmm3 + pslld xmm4,1 + + pxor xmm0,xmm2 + movdqu XMMWORD PTR[rax],xmm0 + + dec r10d + jz $L$done_key256 + + pshufd xmm2,xmm0,0ffh + pxor xmm3,xmm3 +DB 102,15,56,221,211 + + movdqa xmm3,xmm1 + pslldq xmm1,4 + pxor xmm3,xmm1 + pslldq xmm1,4 + pxor xmm3,xmm1 + pslldq 
xmm1,4 + pxor xmm1,xmm3 + + pxor xmm2,xmm1 + movdqu XMMWORD PTR[16+rax],xmm2 + lea rax,QWORD PTR[32+rax] + movdqa xmm1,xmm2 + + jmp $L$oop_key256 + +$L$done_key256:: + mov DWORD PTR[16+rax],edx + xor eax,eax + jmp $L$enc_key_ret + ALIGN 16 $L$bad_keybits:: mov rax,-2 $L$enc_key_ret:: + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 add rsp,8 DB 0F3h,0C3h ;repret $L$SEH_end_set_encrypt_key:: @@ -3384,6 +3780,14 @@ $L$xts_magic:: DD 087h,0,1,0 $L$increment1:: DB 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +$L$key_rotate:: + DD 00c0f0e0dh,00c0f0e0dh,00c0f0e0dh,00c0f0e0dh +$L$key_rotate192:: + DD 004070605h,004070605h,004070605h,004070605h +$L$key_rcon1:: + DD 1,1,1,1 +$L$key_rcon1b:: + DD 01bh,01bh,01bh,01bh DB 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69 DB 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83 @@ -3489,7 +3893,7 @@ cbc_se_handler PROC PRIVATE mov rax,QWORD PTR[152+r8] mov rbx,QWORD PTR[248+r8] - lea r10,QWORD PTR[$L$cbc_decrypt] + lea r10,QWORD PTR[$L$cbc_decrypt_bulk] cmp rbx,r10 jb $L$common_seh_tail diff --git a/deps/openssl/asm_obsolete/x64-win32-masm/bn/x86_64-mont5.asm b/deps/openssl/asm_obsolete/x64-win32-masm/bn/x86_64-mont5.asm index c47130f44c..f690ba58d3 100644 --- a/deps/openssl/asm_obsolete/x64-win32-masm/bn/x86_64-mont5.asm +++ b/deps/openssl/asm_obsolete/x64-win32-masm/bn/x86_64-mont5.asm @@ -1832,11 +1832,16 @@ PUBLIC bn_get_bits5 ALIGN 16 bn_get_bits5 PROC PUBLIC - mov r10,rcx + lea r10,QWORD PTR[rcx] + lea r11,QWORD PTR[1+rcx] mov ecx,edx - shr edx,3 - movzx eax,WORD PTR[rdx*1+r10] - and ecx,7 + shr edx,4 + and ecx,15 + lea eax,DWORD PTR[((-8))+rcx] + cmp ecx,11 + cmova r10,r11 + cmova ecx,eax + movzx eax,WORD PTR[rdx*2+r10] shr eax,cl and eax,31 DB 0F3h,0C3h ;repret diff --git a/deps/openssl/asm_obsolete/x86-elf-gas/aes/aesni-x86.s b/deps/openssl/asm_obsolete/x86-elf-gas/aes/aesni-x86.s index a68f7cdbe9..3bbc4e47d6 100644 --- a/deps/openssl/asm_obsolete/x86-elf-gas/aes/aesni-x86.s +++ 
b/deps/openssl/asm_obsolete/x86-elf-gas/aes/aesni-x86.s @@ -21,7 +21,10 @@ aesni_encrypt: leal 16(%edx),%edx jnz .L000enc1_loop_1 .byte 102,15,56,221,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 movups %xmm2,(%eax) + pxor %xmm2,%xmm2 ret .size aesni_encrypt,.-.L_aesni_encrypt_begin .globl aesni_decrypt @@ -45,7 +48,10 @@ aesni_decrypt: leal 16(%edx),%edx jnz .L001dec1_loop_2 .byte 102,15,56,223,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 movups %xmm2,(%eax) + pxor %xmm2,%xmm2 ret .size aesni_decrypt,.-.L_aesni_decrypt_begin .type _aesni_encrypt2,@function @@ -259,17 +265,15 @@ _aesni_encrypt6: negl %ecx .byte 102,15,56,220,225 pxor %xmm0,%xmm7 + movups (%edx,%ecx,1),%xmm0 addl $16,%ecx -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 - movups -16(%edx,%ecx,1),%xmm0 - jmp .L_aesni_encrypt6_enter + jmp .L008_aesni_encrypt6_inner .align 16 -.L008enc6_loop: +.L009enc6_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 +.L008_aesni_encrypt6_inner: .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 @@ -283,7 +287,7 @@ _aesni_encrypt6: .byte 102,15,56,220,240 .byte 102,15,56,220,248 movups -16(%edx,%ecx,1),%xmm0 - jnz .L008enc6_loop + jnz .L009enc6_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -315,17 +319,15 @@ _aesni_decrypt6: negl %ecx .byte 102,15,56,222,225 pxor %xmm0,%xmm7 + movups (%edx,%ecx,1),%xmm0 addl $16,%ecx -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 - movups -16(%edx,%ecx,1),%xmm0 - jmp .L_aesni_decrypt6_enter + jmp .L010_aesni_decrypt6_inner .align 16 -.L009dec6_loop: +.L011dec6_loop: .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 +.L010_aesni_decrypt6_inner: .byte 102,15,56,222,233 .byte 102,15,56,222,241 .byte 102,15,56,222,249 @@ -339,7 +341,7 @@ _aesni_decrypt6: .byte 102,15,56,222,240 .byte 102,15,56,222,248 movups -16(%edx,%ecx,1),%xmm0 - jnz .L009dec6_loop + jnz .L011dec6_loop .byte 
102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -369,14 +371,14 @@ aesni_ecb_encrypt: movl 32(%esp),%edx movl 36(%esp),%ebx andl $-16,%eax - jz .L010ecb_ret + jz .L012ecb_ret movl 240(%edx),%ecx testl %ebx,%ebx - jz .L011ecb_decrypt + jz .L013ecb_decrypt movl %edx,%ebp movl %ecx,%ebx cmpl $96,%eax - jb .L012ecb_enc_tail + jb .L014ecb_enc_tail movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -385,9 +387,9 @@ aesni_ecb_encrypt: movdqu 80(%esi),%xmm7 leal 96(%esi),%esi subl $96,%eax - jmp .L013ecb_enc_loop6_enter + jmp .L015ecb_enc_loop6_enter .align 16 -.L014ecb_enc_loop6: +.L016ecb_enc_loop6: movups %xmm2,(%edi) movdqu (%esi),%xmm2 movups %xmm3,16(%edi) @@ -402,12 +404,12 @@ aesni_ecb_encrypt: leal 96(%edi),%edi movdqu 80(%esi),%xmm7 leal 96(%esi),%esi -.L013ecb_enc_loop6_enter: +.L015ecb_enc_loop6_enter: call _aesni_encrypt6 movl %ebp,%edx movl %ebx,%ecx subl $96,%eax - jnc .L014ecb_enc_loop6 + jnc .L016ecb_enc_loop6 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) @@ -416,18 +418,18 @@ aesni_ecb_encrypt: movups %xmm7,80(%edi) leal 96(%edi),%edi addl $96,%eax - jz .L010ecb_ret -.L012ecb_enc_tail: + jz .L012ecb_ret +.L014ecb_enc_tail: movups (%esi),%xmm2 cmpl $32,%eax - jb .L015ecb_enc_one + jb .L017ecb_enc_one movups 16(%esi),%xmm3 - je .L016ecb_enc_two + je .L018ecb_enc_two movups 32(%esi),%xmm4 cmpl $64,%eax - jb .L017ecb_enc_three + jb .L019ecb_enc_three movups 48(%esi),%xmm5 - je .L018ecb_enc_four + je .L020ecb_enc_four movups 64(%esi),%xmm6 xorps %xmm7,%xmm7 call _aesni_encrypt6 @@ -436,49 +438,49 @@ aesni_ecb_encrypt: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp .L010ecb_ret + jmp .L012ecb_ret .align 16 -.L015ecb_enc_one: +.L017ecb_enc_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L019enc1_loop_3: +.L021enc1_loop_3: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L019enc1_loop_3 + jnz .L021enc1_loop_3 .byte 
102,15,56,221,209 movups %xmm2,(%edi) - jmp .L010ecb_ret + jmp .L012ecb_ret .align 16 -.L016ecb_enc_two: +.L018ecb_enc_two: call _aesni_encrypt2 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp .L010ecb_ret + jmp .L012ecb_ret .align 16 -.L017ecb_enc_three: +.L019ecb_enc_three: call _aesni_encrypt3 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp .L010ecb_ret + jmp .L012ecb_ret .align 16 -.L018ecb_enc_four: +.L020ecb_enc_four: call _aesni_encrypt4 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) movups %xmm5,48(%edi) - jmp .L010ecb_ret + jmp .L012ecb_ret .align 16 -.L011ecb_decrypt: +.L013ecb_decrypt: movl %edx,%ebp movl %ecx,%ebx cmpl $96,%eax - jb .L020ecb_dec_tail + jb .L022ecb_dec_tail movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -487,9 +489,9 @@ aesni_ecb_encrypt: movdqu 80(%esi),%xmm7 leal 96(%esi),%esi subl $96,%eax - jmp .L021ecb_dec_loop6_enter + jmp .L023ecb_dec_loop6_enter .align 16 -.L022ecb_dec_loop6: +.L024ecb_dec_loop6: movups %xmm2,(%edi) movdqu (%esi),%xmm2 movups %xmm3,16(%edi) @@ -504,12 +506,12 @@ aesni_ecb_encrypt: leal 96(%edi),%edi movdqu 80(%esi),%xmm7 leal 96(%esi),%esi -.L021ecb_dec_loop6_enter: +.L023ecb_dec_loop6_enter: call _aesni_decrypt6 movl %ebp,%edx movl %ebx,%ecx subl $96,%eax - jnc .L022ecb_dec_loop6 + jnc .L024ecb_dec_loop6 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) @@ -518,18 +520,18 @@ aesni_ecb_encrypt: movups %xmm7,80(%edi) leal 96(%edi),%edi addl $96,%eax - jz .L010ecb_ret -.L020ecb_dec_tail: + jz .L012ecb_ret +.L022ecb_dec_tail: movups (%esi),%xmm2 cmpl $32,%eax - jb .L023ecb_dec_one + jb .L025ecb_dec_one movups 16(%esi),%xmm3 - je .L024ecb_dec_two + je .L026ecb_dec_two movups 32(%esi),%xmm4 cmpl $64,%eax - jb .L025ecb_dec_three + jb .L027ecb_dec_three movups 48(%esi),%xmm5 - je .L026ecb_dec_four + je .L028ecb_dec_four movups 64(%esi),%xmm6 xorps %xmm7,%xmm7 call _aesni_decrypt6 @@ -538,43 +540,51 @@ aesni_ecb_encrypt: movups %xmm4,32(%edi) movups 
%xmm5,48(%edi) movups %xmm6,64(%edi) - jmp .L010ecb_ret + jmp .L012ecb_ret .align 16 -.L023ecb_dec_one: +.L025ecb_dec_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L027dec1_loop_4: +.L029dec1_loop_4: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L027dec1_loop_4 + jnz .L029dec1_loop_4 .byte 102,15,56,223,209 movups %xmm2,(%edi) - jmp .L010ecb_ret + jmp .L012ecb_ret .align 16 -.L024ecb_dec_two: +.L026ecb_dec_two: call _aesni_decrypt2 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp .L010ecb_ret + jmp .L012ecb_ret .align 16 -.L025ecb_dec_three: +.L027ecb_dec_three: call _aesni_decrypt3 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp .L010ecb_ret + jmp .L012ecb_ret .align 16 -.L026ecb_dec_four: +.L028ecb_dec_four: call _aesni_decrypt4 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) movups %xmm5,48(%edi) -.L010ecb_ret: +.L012ecb_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 popl %edi popl %esi popl %ebx @@ -621,7 +631,7 @@ aesni_ccm64_encrypt_blocks: leal 32(%edx,%ecx,1),%edx subl %ecx,%ebx .byte 102,15,56,0,253 -.L028ccm64_enc_outer: +.L030ccm64_enc_outer: movups (%ebp),%xmm0 movl %ebx,%ecx movups (%esi),%xmm6 @@ -630,7 +640,7 @@ aesni_ccm64_encrypt_blocks: xorps %xmm6,%xmm0 xorps %xmm0,%xmm3 movups 32(%ebp),%xmm0 -.L029ccm64_enc2_loop: +.L031ccm64_enc2_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 movups (%edx,%ecx,1),%xmm1 @@ -638,7 +648,7 @@ aesni_ccm64_encrypt_blocks: .byte 102,15,56,220,208 .byte 102,15,56,220,216 movups -16(%edx,%ecx,1),%xmm0 - jnz .L029ccm64_enc2_loop + jnz .L031ccm64_enc2_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 paddq 16(%esp),%xmm7 @@ -651,10 +661,18 @@ aesni_ccm64_encrypt_blocks: movups %xmm6,(%edi) .byte 102,15,56,0,213 leal 16(%edi),%edi - jnz .L028ccm64_enc_outer + jnz .L030ccm64_enc_outer movl 
48(%esp),%esp movl 40(%esp),%edi movups %xmm3,(%edi) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 popl %edi popl %esi popl %ebx @@ -702,12 +720,12 @@ aesni_ccm64_decrypt_blocks: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L030enc1_loop_5: +.L032enc1_loop_5: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L030enc1_loop_5 + jnz .L032enc1_loop_5 .byte 102,15,56,221,209 shll $4,%ebx movl $16,%ecx @@ -717,16 +735,16 @@ aesni_ccm64_decrypt_blocks: subl %ebx,%ecx leal 32(%ebp,%ebx,1),%edx movl %ecx,%ebx - jmp .L031ccm64_dec_outer + jmp .L033ccm64_dec_outer .align 16 -.L031ccm64_dec_outer: +.L033ccm64_dec_outer: xorps %xmm2,%xmm6 movdqa %xmm7,%xmm2 movups %xmm6,(%edi) leal 16(%edi),%edi .byte 102,15,56,0,213 subl $1,%eax - jz .L032ccm64_dec_break + jz .L034ccm64_dec_break movups (%ebp),%xmm0 movl %ebx,%ecx movups 16(%ebp),%xmm1 @@ -734,7 +752,7 @@ aesni_ccm64_decrypt_blocks: xorps %xmm0,%xmm2 xorps %xmm6,%xmm3 movups 32(%ebp),%xmm0 -.L033ccm64_dec2_loop: +.L035ccm64_dec2_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 movups (%edx,%ecx,1),%xmm1 @@ -742,7 +760,7 @@ aesni_ccm64_decrypt_blocks: .byte 102,15,56,220,208 .byte 102,15,56,220,216 movups -16(%edx,%ecx,1),%xmm0 - jnz .L033ccm64_dec2_loop + jnz .L035ccm64_dec2_loop movups (%esi),%xmm6 paddq 16(%esp),%xmm7 .byte 102,15,56,220,209 @@ -750,9 +768,9 @@ aesni_ccm64_decrypt_blocks: .byte 102,15,56,221,208 .byte 102,15,56,221,216 leal 16(%esi),%esi - jmp .L031ccm64_dec_outer + jmp .L033ccm64_dec_outer .align 16 -.L032ccm64_dec_break: +.L034ccm64_dec_break: movl 240(%ebp),%ecx movl %ebp,%edx movups (%edx),%xmm0 @@ -760,16 +778,24 @@ aesni_ccm64_decrypt_blocks: xorps %xmm0,%xmm6 leal 32(%edx),%edx xorps %xmm6,%xmm3 -.L034enc1_loop_6: +.L036enc1_loop_6: .byte 102,15,56,220,217 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L034enc1_loop_6 + jnz 
.L036enc1_loop_6 .byte 102,15,56,221,217 movl 48(%esp),%esp movl 40(%esp),%edi movups %xmm3,(%edi) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 popl %edi popl %esi popl %ebx @@ -795,7 +821,7 @@ aesni_ctr32_encrypt_blocks: andl $-16,%esp movl %ebp,80(%esp) cmpl $1,%eax - je .L035ctr32_one_shortcut + je .L037ctr32_one_shortcut movdqu (%ebx),%xmm7 movl $202182159,(%esp) movl $134810123,4(%esp) @@ -833,7 +859,7 @@ aesni_ctr32_encrypt_blocks: pshufd $192,%xmm0,%xmm2 pshufd $128,%xmm0,%xmm3 cmpl $6,%eax - jb .L036ctr32_tail + jb .L038ctr32_tail pxor %xmm6,%xmm7 shll $4,%ecx movl $16,%ebx @@ -842,9 +868,9 @@ aesni_ctr32_encrypt_blocks: subl %ecx,%ebx leal 32(%edx,%ecx,1),%edx subl $6,%eax - jmp .L037ctr32_loop6 + jmp .L039ctr32_loop6 .align 16 -.L037ctr32_loop6: +.L039ctr32_loop6: pshufd $64,%xmm0,%xmm4 movdqa 32(%esp),%xmm0 pshufd $192,%xmm1,%xmm5 @@ -898,27 +924,27 @@ aesni_ctr32_encrypt_blocks: leal 96(%edi),%edi pshufd $128,%xmm0,%xmm3 subl $6,%eax - jnc .L037ctr32_loop6 + jnc .L039ctr32_loop6 addl $6,%eax - jz .L038ctr32_ret + jz .L040ctr32_ret movdqu (%ebp),%xmm7 movl %ebp,%edx pxor 32(%esp),%xmm7 movl 240(%ebp),%ecx -.L036ctr32_tail: +.L038ctr32_tail: por %xmm7,%xmm2 cmpl $2,%eax - jb .L039ctr32_one + jb .L041ctr32_one pshufd $64,%xmm0,%xmm4 por %xmm7,%xmm3 - je .L040ctr32_two + je .L042ctr32_two pshufd $192,%xmm1,%xmm5 por %xmm7,%xmm4 cmpl $4,%eax - jb .L041ctr32_three + jb .L043ctr32_three pshufd $128,%xmm1,%xmm6 por %xmm7,%xmm5 - je .L042ctr32_four + je .L044ctr32_four por %xmm7,%xmm6 call _aesni_encrypt6 movups (%esi),%xmm1 @@ -936,29 +962,29 @@ aesni_ctr32_encrypt_blocks: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp .L038ctr32_ret + jmp .L040ctr32_ret .align 16 -.L035ctr32_one_shortcut: +.L037ctr32_one_shortcut: movups (%ebx),%xmm2 movl 240(%edx),%ecx -.L039ctr32_one: +.L041ctr32_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 
leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L043enc1_loop_7: +.L045enc1_loop_7: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L043enc1_loop_7 + jnz .L045enc1_loop_7 .byte 102,15,56,221,209 movups (%esi),%xmm6 xorps %xmm2,%xmm6 movups %xmm6,(%edi) - jmp .L038ctr32_ret + jmp .L040ctr32_ret .align 16 -.L040ctr32_two: +.L042ctr32_two: call _aesni_encrypt2 movups (%esi),%xmm5 movups 16(%esi),%xmm6 @@ -966,9 +992,9 @@ aesni_ctr32_encrypt_blocks: xorps %xmm6,%xmm3 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp .L038ctr32_ret + jmp .L040ctr32_ret .align 16 -.L041ctr32_three: +.L043ctr32_three: call _aesni_encrypt3 movups (%esi),%xmm5 movups 16(%esi),%xmm6 @@ -979,9 +1005,9 @@ aesni_ctr32_encrypt_blocks: xorps %xmm7,%xmm4 movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp .L038ctr32_ret + jmp .L040ctr32_ret .align 16 -.L042ctr32_four: +.L044ctr32_four: call _aesni_encrypt4 movups (%esi),%xmm6 movups 16(%esi),%xmm7 @@ -995,7 +1021,18 @@ aesni_ctr32_encrypt_blocks: xorps %xmm0,%xmm5 movups %xmm4,32(%edi) movups %xmm5,48(%edi) -.L038ctr32_ret: +.L040ctr32_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + movdqa %xmm0,32(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm0,48(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm0,64(%esp) + pxor %xmm7,%xmm7 movl 80(%esp),%esp popl %edi popl %esi @@ -1020,12 +1057,12 @@ aesni_xts_encrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L044enc1_loop_8: +.L046enc1_loop_8: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L044enc1_loop_8 + jnz .L046enc1_loop_8 .byte 102,15,56,221,209 movl 20(%esp),%esi movl 24(%esp),%edi @@ -1049,14 +1086,14 @@ aesni_xts_encrypt: movl %edx,%ebp movl %ecx,%ebx subl $96,%eax - jc .L045xts_enc_short + jc .L047xts_enc_short shll $4,%ecx movl $16,%ebx subl %ecx,%ebx leal 32(%edx,%ecx,1),%edx - jmp .L046xts_enc_loop6 + jmp .L048xts_enc_loop6 .align 16 -.L046xts_enc_loop6: +.L048xts_enc_loop6: pshufd 
$19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,(%esp) @@ -1145,23 +1182,23 @@ aesni_xts_encrypt: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 subl $96,%eax - jnc .L046xts_enc_loop6 + jnc .L048xts_enc_loop6 movl 240(%ebp),%ecx movl %ebp,%edx movl %ecx,%ebx -.L045xts_enc_short: +.L047xts_enc_short: addl $96,%eax - jz .L047xts_enc_done6x + jz .L049xts_enc_done6x movdqa %xmm1,%xmm5 cmpl $32,%eax - jb .L048xts_enc_one + jb .L050xts_enc_one pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 - je .L049xts_enc_two + je .L051xts_enc_two pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm6 @@ -1170,7 +1207,7 @@ aesni_xts_encrypt: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 cmpl $64,%eax - jb .L050xts_enc_three + jb .L052xts_enc_three pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm7 @@ -1180,7 +1217,7 @@ aesni_xts_encrypt: pxor %xmm2,%xmm1 movdqa %xmm5,(%esp) movdqa %xmm6,16(%esp) - je .L051xts_enc_four + je .L053xts_enc_four movdqa %xmm7,32(%esp) pshufd $19,%xmm0,%xmm7 movdqa %xmm1,48(%esp) @@ -1212,9 +1249,9 @@ aesni_xts_encrypt: movups %xmm5,48(%edi) movups %xmm6,64(%edi) leal 80(%edi),%edi - jmp .L052xts_enc_done + jmp .L054xts_enc_done .align 16 -.L048xts_enc_one: +.L050xts_enc_one: movups (%esi),%xmm2 leal 16(%esi),%esi xorps %xmm5,%xmm2 @@ -1222,20 +1259,20 @@ aesni_xts_encrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L053enc1_loop_9: +.L055enc1_loop_9: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L053enc1_loop_9 + jnz .L055enc1_loop_9 .byte 102,15,56,221,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) leal 16(%edi),%edi movdqa %xmm5,%xmm1 - jmp .L052xts_enc_done + jmp .L054xts_enc_done .align 16 -.L049xts_enc_two: +.L051xts_enc_two: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1249,9 +1286,9 @@ aesni_xts_encrypt: movups %xmm3,16(%edi) leal 32(%edi),%edi movdqa %xmm6,%xmm1 - jmp .L052xts_enc_done + jmp .L054xts_enc_done .align 16 
-.L050xts_enc_three: +.L052xts_enc_three: movaps %xmm1,%xmm7 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1269,9 +1306,9 @@ aesni_xts_encrypt: movups %xmm4,32(%edi) leal 48(%edi),%edi movdqa %xmm7,%xmm1 - jmp .L052xts_enc_done + jmp .L054xts_enc_done .align 16 -.L051xts_enc_four: +.L053xts_enc_four: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1293,28 +1330,28 @@ aesni_xts_encrypt: movups %xmm5,48(%edi) leal 64(%edi),%edi movdqa %xmm6,%xmm1 - jmp .L052xts_enc_done + jmp .L054xts_enc_done .align 16 -.L047xts_enc_done6x: +.L049xts_enc_done6x: movl 112(%esp),%eax andl $15,%eax - jz .L054xts_enc_ret + jz .L056xts_enc_ret movdqa %xmm1,%xmm5 movl %eax,112(%esp) - jmp .L055xts_enc_steal + jmp .L057xts_enc_steal .align 16 -.L052xts_enc_done: +.L054xts_enc_done: movl 112(%esp),%eax pxor %xmm0,%xmm0 andl $15,%eax - jz .L054xts_enc_ret + jz .L056xts_enc_ret pcmpgtd %xmm1,%xmm0 movl %eax,112(%esp) pshufd $19,%xmm0,%xmm5 paddq %xmm1,%xmm1 pand 96(%esp),%xmm5 pxor %xmm1,%xmm5 -.L055xts_enc_steal: +.L057xts_enc_steal: movzbl (%esi),%ecx movzbl -16(%edi),%edx leal 1(%esi),%esi @@ -1322,7 +1359,7 @@ aesni_xts_encrypt: movb %dl,(%edi) leal 1(%edi),%edi subl $1,%eax - jnz .L055xts_enc_steal + jnz .L057xts_enc_steal subl 112(%esp),%edi movl %ebp,%edx movl %ebx,%ecx @@ -1332,16 +1369,30 @@ aesni_xts_encrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L056enc1_loop_10: +.L058enc1_loop_10: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L056enc1_loop_10 + jnz .L058enc1_loop_10 .byte 102,15,56,221,209 xorps %xmm5,%xmm2 movups %xmm2,-16(%edi) -.L054xts_enc_ret: +.L056xts_enc_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + movdqa %xmm0,(%esp) + pxor %xmm3,%xmm3 + movdqa %xmm0,16(%esp) + pxor %xmm4,%xmm4 + movdqa %xmm0,32(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm0,48(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm0,64(%esp) + pxor %xmm7,%xmm7 + movdqa %xmm0,80(%esp) movl 116(%esp),%esp popl %edi popl %esi @@ -1366,12 
+1417,12 @@ aesni_xts_decrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L057enc1_loop_11: +.L059enc1_loop_11: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L057enc1_loop_11 + jnz .L059enc1_loop_11 .byte 102,15,56,221,209 movl 20(%esp),%esi movl 24(%esp),%edi @@ -1400,14 +1451,14 @@ aesni_xts_decrypt: pcmpgtd %xmm1,%xmm0 andl $-16,%eax subl $96,%eax - jc .L058xts_dec_short + jc .L060xts_dec_short shll $4,%ecx movl $16,%ebx subl %ecx,%ebx leal 32(%edx,%ecx,1),%edx - jmp .L059xts_dec_loop6 + jmp .L061xts_dec_loop6 .align 16 -.L059xts_dec_loop6: +.L061xts_dec_loop6: pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,(%esp) @@ -1496,23 +1547,23 @@ aesni_xts_decrypt: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 subl $96,%eax - jnc .L059xts_dec_loop6 + jnc .L061xts_dec_loop6 movl 240(%ebp),%ecx movl %ebp,%edx movl %ecx,%ebx -.L058xts_dec_short: +.L060xts_dec_short: addl $96,%eax - jz .L060xts_dec_done6x + jz .L062xts_dec_done6x movdqa %xmm1,%xmm5 cmpl $32,%eax - jb .L061xts_dec_one + jb .L063xts_dec_one pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 - je .L062xts_dec_two + je .L064xts_dec_two pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm6 @@ -1521,7 +1572,7 @@ aesni_xts_decrypt: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 cmpl $64,%eax - jb .L063xts_dec_three + jb .L065xts_dec_three pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm7 @@ -1531,7 +1582,7 @@ aesni_xts_decrypt: pxor %xmm2,%xmm1 movdqa %xmm5,(%esp) movdqa %xmm6,16(%esp) - je .L064xts_dec_four + je .L066xts_dec_four movdqa %xmm7,32(%esp) pshufd $19,%xmm0,%xmm7 movdqa %xmm1,48(%esp) @@ -1563,9 +1614,9 @@ aesni_xts_decrypt: movups %xmm5,48(%edi) movups %xmm6,64(%edi) leal 80(%edi),%edi - jmp .L065xts_dec_done + jmp .L067xts_dec_done .align 16 -.L061xts_dec_one: +.L063xts_dec_one: movups (%esi),%xmm2 leal 16(%esi),%esi xorps %xmm5,%xmm2 @@ -1573,20 +1624,20 @@ aesni_xts_decrypt: movups 
16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L066dec1_loop_12: +.L068dec1_loop_12: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L066dec1_loop_12 + jnz .L068dec1_loop_12 .byte 102,15,56,223,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) leal 16(%edi),%edi movdqa %xmm5,%xmm1 - jmp .L065xts_dec_done + jmp .L067xts_dec_done .align 16 -.L062xts_dec_two: +.L064xts_dec_two: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1600,9 +1651,9 @@ aesni_xts_decrypt: movups %xmm3,16(%edi) leal 32(%edi),%edi movdqa %xmm6,%xmm1 - jmp .L065xts_dec_done + jmp .L067xts_dec_done .align 16 -.L063xts_dec_three: +.L065xts_dec_three: movaps %xmm1,%xmm7 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1620,9 +1671,9 @@ aesni_xts_decrypt: movups %xmm4,32(%edi) leal 48(%edi),%edi movdqa %xmm7,%xmm1 - jmp .L065xts_dec_done + jmp .L067xts_dec_done .align 16 -.L064xts_dec_four: +.L066xts_dec_four: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1644,20 +1695,20 @@ aesni_xts_decrypt: movups %xmm5,48(%edi) leal 64(%edi),%edi movdqa %xmm6,%xmm1 - jmp .L065xts_dec_done + jmp .L067xts_dec_done .align 16 -.L060xts_dec_done6x: +.L062xts_dec_done6x: movl 112(%esp),%eax andl $15,%eax - jz .L067xts_dec_ret + jz .L069xts_dec_ret movl %eax,112(%esp) - jmp .L068xts_dec_only_one_more + jmp .L070xts_dec_only_one_more .align 16 -.L065xts_dec_done: +.L067xts_dec_done: movl 112(%esp),%eax pxor %xmm0,%xmm0 andl $15,%eax - jz .L067xts_dec_ret + jz .L069xts_dec_ret pcmpgtd %xmm1,%xmm0 movl %eax,112(%esp) pshufd $19,%xmm0,%xmm2 @@ -1667,7 +1718,7 @@ aesni_xts_decrypt: pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 -.L068xts_dec_only_one_more: +.L070xts_dec_only_one_more: pshufd $19,%xmm0,%xmm5 movdqa %xmm1,%xmm6 paddq %xmm1,%xmm1 @@ -1681,16 +1732,16 @@ aesni_xts_decrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L069dec1_loop_13: +.L071dec1_loop_13: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - 
jnz .L069dec1_loop_13 + jnz .L071dec1_loop_13 .byte 102,15,56,223,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) -.L070xts_dec_steal: +.L072xts_dec_steal: movzbl 16(%esi),%ecx movzbl (%edi),%edx leal 1(%esi),%esi @@ -1698,7 +1749,7 @@ aesni_xts_decrypt: movb %dl,16(%edi) leal 1(%edi),%edi subl $1,%eax - jnz .L070xts_dec_steal + jnz .L072xts_dec_steal subl 112(%esp),%edi movl %ebp,%edx movl %ebx,%ecx @@ -1708,16 +1759,30 @@ aesni_xts_decrypt: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L071dec1_loop_14: +.L073dec1_loop_14: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L071dec1_loop_14 + jnz .L073dec1_loop_14 .byte 102,15,56,223,209 xorps %xmm6,%xmm2 movups %xmm2,(%edi) -.L067xts_dec_ret: +.L069xts_dec_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + movdqa %xmm0,(%esp) + pxor %xmm3,%xmm3 + movdqa %xmm0,16(%esp) + pxor %xmm4,%xmm4 + movdqa %xmm0,32(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm0,48(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm0,64(%esp) + pxor %xmm7,%xmm7 + movdqa %xmm0,80(%esp) movl 116(%esp),%esp popl %edi popl %esi @@ -1743,7 +1808,7 @@ aesni_cbc_encrypt: movl 32(%esp),%edx movl 36(%esp),%ebp testl %eax,%eax - jz .L072cbc_abort + jz .L074cbc_abort cmpl $0,40(%esp) xchgl %esp,%ebx movups (%ebp),%xmm7 @@ -1751,14 +1816,14 @@ aesni_cbc_encrypt: movl %edx,%ebp movl %ebx,16(%esp) movl %ecx,%ebx - je .L073cbc_decrypt + je .L075cbc_decrypt movaps %xmm7,%xmm2 cmpl $16,%eax - jb .L074cbc_enc_tail + jb .L076cbc_enc_tail subl $16,%eax - jmp .L075cbc_enc_loop + jmp .L077cbc_enc_loop .align 16 -.L075cbc_enc_loop: +.L077cbc_enc_loop: movups (%esi),%xmm7 leal 16(%esi),%esi movups (%edx),%xmm0 @@ -1766,24 +1831,25 @@ aesni_cbc_encrypt: xorps %xmm0,%xmm7 leal 32(%edx),%edx xorps %xmm7,%xmm2 -.L076enc1_loop_15: +.L078enc1_loop_15: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L076enc1_loop_15 + jnz .L078enc1_loop_15 .byte 102,15,56,221,209 movl %ebx,%ecx movl %ebp,%edx movups 
%xmm2,(%edi) leal 16(%edi),%edi subl $16,%eax - jnc .L075cbc_enc_loop + jnc .L077cbc_enc_loop addl $16,%eax - jnz .L074cbc_enc_tail + jnz .L076cbc_enc_tail movaps %xmm2,%xmm7 - jmp .L077cbc_ret -.L074cbc_enc_tail: + pxor %xmm2,%xmm2 + jmp .L079cbc_ret +.L076cbc_enc_tail: movl %eax,%ecx .long 2767451785 movl $16,%ecx @@ -1794,20 +1860,20 @@ aesni_cbc_encrypt: movl %ebx,%ecx movl %edi,%esi movl %ebp,%edx - jmp .L075cbc_enc_loop + jmp .L077cbc_enc_loop .align 16 -.L073cbc_decrypt: +.L075cbc_decrypt: cmpl $80,%eax - jbe .L078cbc_dec_tail + jbe .L080cbc_dec_tail movaps %xmm7,(%esp) subl $80,%eax - jmp .L079cbc_dec_loop6_enter + jmp .L081cbc_dec_loop6_enter .align 16 -.L080cbc_dec_loop6: +.L082cbc_dec_loop6: movaps %xmm0,(%esp) movups %xmm7,(%edi) leal 16(%edi),%edi -.L079cbc_dec_loop6_enter: +.L081cbc_dec_loop6_enter: movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -1837,28 +1903,28 @@ aesni_cbc_encrypt: movups %xmm6,64(%edi) leal 80(%edi),%edi subl $96,%eax - ja .L080cbc_dec_loop6 + ja .L082cbc_dec_loop6 movaps %xmm7,%xmm2 movaps %xmm0,%xmm7 addl $80,%eax - jle .L081cbc_dec_tail_collected + jle .L083cbc_dec_clear_tail_collected movups %xmm2,(%edi) leal 16(%edi),%edi -.L078cbc_dec_tail: +.L080cbc_dec_tail: movups (%esi),%xmm2 movaps %xmm2,%xmm6 cmpl $16,%eax - jbe .L082cbc_dec_one + jbe .L084cbc_dec_one movups 16(%esi),%xmm3 movaps %xmm3,%xmm5 cmpl $32,%eax - jbe .L083cbc_dec_two + jbe .L085cbc_dec_two movups 32(%esi),%xmm4 cmpl $48,%eax - jbe .L084cbc_dec_three + jbe .L086cbc_dec_three movups 48(%esi),%xmm5 cmpl $64,%eax - jbe .L085cbc_dec_four + jbe .L087cbc_dec_four movups 64(%esi),%xmm6 movaps %xmm7,(%esp) movups (%esi),%xmm2 @@ -1876,55 +1942,62 @@ aesni_cbc_encrypt: xorps %xmm0,%xmm6 movups %xmm2,(%edi) movups %xmm3,16(%edi) + pxor %xmm3,%xmm3 movups %xmm4,32(%edi) + pxor %xmm4,%xmm4 movups %xmm5,48(%edi) + pxor %xmm5,%xmm5 leal 64(%edi),%edi movaps %xmm6,%xmm2 + pxor %xmm6,%xmm6 subl $80,%eax - jmp .L081cbc_dec_tail_collected + jmp 
.L088cbc_dec_tail_collected .align 16 -.L082cbc_dec_one: +.L084cbc_dec_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -.L086dec1_loop_16: +.L089dec1_loop_16: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz .L086dec1_loop_16 + jnz .L089dec1_loop_16 .byte 102,15,56,223,209 xorps %xmm7,%xmm2 movaps %xmm6,%xmm7 subl $16,%eax - jmp .L081cbc_dec_tail_collected + jmp .L088cbc_dec_tail_collected .align 16 -.L083cbc_dec_two: +.L085cbc_dec_two: call _aesni_decrypt2 xorps %xmm7,%xmm2 xorps %xmm6,%xmm3 movups %xmm2,(%edi) movaps %xmm3,%xmm2 + pxor %xmm3,%xmm3 leal 16(%edi),%edi movaps %xmm5,%xmm7 subl $32,%eax - jmp .L081cbc_dec_tail_collected + jmp .L088cbc_dec_tail_collected .align 16 -.L084cbc_dec_three: +.L086cbc_dec_three: call _aesni_decrypt3 xorps %xmm7,%xmm2 xorps %xmm6,%xmm3 xorps %xmm5,%xmm4 movups %xmm2,(%edi) movaps %xmm4,%xmm2 + pxor %xmm4,%xmm4 movups %xmm3,16(%edi) + pxor %xmm3,%xmm3 leal 32(%edi),%edi movups 32(%esi),%xmm7 subl $48,%eax - jmp .L081cbc_dec_tail_collected + jmp .L088cbc_dec_tail_collected .align 16 -.L085cbc_dec_four: +.L087cbc_dec_four: call _aesni_decrypt4 movups 16(%esi),%xmm1 movups 32(%esi),%xmm0 @@ -1934,28 +2007,44 @@ aesni_cbc_encrypt: movups %xmm2,(%edi) xorps %xmm1,%xmm4 movups %xmm3,16(%edi) + pxor %xmm3,%xmm3 xorps %xmm0,%xmm5 movups %xmm4,32(%edi) + pxor %xmm4,%xmm4 leal 48(%edi),%edi movaps %xmm5,%xmm2 + pxor %xmm5,%xmm5 subl $64,%eax -.L081cbc_dec_tail_collected: + jmp .L088cbc_dec_tail_collected +.align 16 +.L083cbc_dec_clear_tail_collected: + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 +.L088cbc_dec_tail_collected: andl $15,%eax - jnz .L087cbc_dec_tail_partial + jnz .L090cbc_dec_tail_partial movups %xmm2,(%edi) - jmp .L077cbc_ret + pxor %xmm0,%xmm0 + jmp .L079cbc_ret .align 16 -.L087cbc_dec_tail_partial: +.L090cbc_dec_tail_partial: movaps %xmm2,(%esp) + pxor %xmm0,%xmm0 movl $16,%ecx movl %esp,%esi subl %eax,%ecx .long 2767451785 
-.L077cbc_ret: + movdqa %xmm2,(%esp) +.L079cbc_ret: movl 16(%esp),%esp movl 36(%esp),%ebp + pxor %xmm2,%xmm2 + pxor %xmm1,%xmm1 movups %xmm7,(%ebp) -.L072cbc_abort: + pxor %xmm7,%xmm7 +.L074cbc_abort: popl %edi popl %esi popl %ebx @@ -1965,52 +2054,62 @@ aesni_cbc_encrypt: .type _aesni_set_encrypt_key,@function .align 16 _aesni_set_encrypt_key: + pushl %ebp + pushl %ebx testl %eax,%eax - jz .L088bad_pointer + jz .L091bad_pointer testl %edx,%edx - jz .L088bad_pointer + jz .L091bad_pointer + call .L092pic +.L092pic: + popl %ebx + leal .Lkey_const-.L092pic(%ebx),%ebx + leal OPENSSL_ia32cap_P,%ebp movups (%eax),%xmm0 xorps %xmm4,%xmm4 + movl 4(%ebp),%ebp leal 16(%edx),%edx + andl $268437504,%ebp cmpl $256,%ecx - je .L08914rounds + je .L09314rounds cmpl $192,%ecx - je .L09012rounds + je .L09412rounds cmpl $128,%ecx - jne .L091bad_keybits + jne .L095bad_keybits .align 16 -.L09210rounds: +.L09610rounds: + cmpl $268435456,%ebp + je .L09710rounds_alt movl $9,%ecx movups %xmm0,-16(%edx) .byte 102,15,58,223,200,1 - call .L093key_128_cold + call .L098key_128_cold .byte 102,15,58,223,200,2 - call .L094key_128 + call .L099key_128 .byte 102,15,58,223,200,4 - call .L094key_128 + call .L099key_128 .byte 102,15,58,223,200,8 - call .L094key_128 + call .L099key_128 .byte 102,15,58,223,200,16 - call .L094key_128 + call .L099key_128 .byte 102,15,58,223,200,32 - call .L094key_128 + call .L099key_128 .byte 102,15,58,223,200,64 - call .L094key_128 + call .L099key_128 .byte 102,15,58,223,200,128 - call .L094key_128 + call .L099key_128 .byte 102,15,58,223,200,27 - call .L094key_128 + call .L099key_128 .byte 102,15,58,223,200,54 - call .L094key_128 + call .L099key_128 movups %xmm0,(%edx) movl %ecx,80(%edx) - xorl %eax,%eax - ret + jmp .L100good_key .align 16 -.L094key_128: +.L099key_128: movups %xmm0,(%edx) leal 16(%edx),%edx -.L093key_128_cold: +.L098key_128_cold: shufps $16,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $140,%xmm0,%xmm4 @@ -2019,38 +2118,91 @@ _aesni_set_encrypt_key: xorps 
%xmm1,%xmm0 ret .align 16 -.L09012rounds: +.L09710rounds_alt: + movdqa (%ebx),%xmm5 + movl $8,%ecx + movdqa 32(%ebx),%xmm4 + movdqa %xmm0,%xmm2 + movdqu %xmm0,-16(%edx) +.L101loop_key128: +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + leal 16(%edx),%edx + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,-16(%edx) + movdqa %xmm0,%xmm2 + decl %ecx + jnz .L101loop_key128 + movdqa 48(%ebx),%xmm4 +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,(%edx) + movdqa %xmm0,%xmm2 +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,16(%edx) + movl $9,%ecx + movl %ecx,96(%edx) + jmp .L100good_key +.align 16 +.L09412rounds: movq 16(%eax),%xmm2 + cmpl $268435456,%ebp + je .L10212rounds_alt movl $11,%ecx movups %xmm0,-16(%edx) .byte 102,15,58,223,202,1 - call .L095key_192a_cold + call .L103key_192a_cold .byte 102,15,58,223,202,2 - call .L096key_192b + call .L104key_192b .byte 102,15,58,223,202,4 - call .L097key_192a + call .L105key_192a .byte 102,15,58,223,202,8 - call .L096key_192b + call .L104key_192b .byte 102,15,58,223,202,16 - call .L097key_192a + call .L105key_192a .byte 102,15,58,223,202,32 - call .L096key_192b + call .L104key_192b .byte 102,15,58,223,202,64 - call .L097key_192a + call .L105key_192a .byte 102,15,58,223,202,128 - call .L096key_192b + call .L104key_192b movups %xmm0,(%edx) movl %ecx,48(%edx) - xorl %eax,%eax - ret + jmp .L100good_key .align 16 -.L097key_192a: +.L105key_192a: movups %xmm0,(%edx) leal 16(%edx),%edx .align 16 -.L095key_192a_cold: +.L103key_192a_cold: movaps %xmm2,%xmm5 
-.L098key_192b_warm: +.L106key_192b_warm: shufps $16,%xmm0,%xmm4 movdqa %xmm2,%xmm3 xorps %xmm4,%xmm0 @@ -2064,56 +2216,90 @@ _aesni_set_encrypt_key: pxor %xmm3,%xmm2 ret .align 16 -.L096key_192b: +.L104key_192b: movaps %xmm0,%xmm3 shufps $68,%xmm0,%xmm5 movups %xmm5,(%edx) shufps $78,%xmm2,%xmm3 movups %xmm3,16(%edx) leal 32(%edx),%edx - jmp .L098key_192b_warm + jmp .L106key_192b_warm +.align 16 +.L10212rounds_alt: + movdqa 16(%ebx),%xmm5 + movdqa 32(%ebx),%xmm4 + movl $8,%ecx + movdqu %xmm0,-16(%edx) +.L107loop_key192: + movq %xmm2,(%edx) + movdqa %xmm2,%xmm1 +.byte 102,15,56,0,213 +.byte 102,15,56,221,212 + pslld $1,%xmm4 + leal 24(%edx),%edx + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + pshufd $255,%xmm0,%xmm3 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pxor %xmm2,%xmm0 + pxor %xmm3,%xmm2 + movdqu %xmm0,-16(%edx) + decl %ecx + jnz .L107loop_key192 + movl $11,%ecx + movl %ecx,32(%edx) + jmp .L100good_key .align 16 -.L08914rounds: +.L09314rounds: movups 16(%eax),%xmm2 - movl $13,%ecx leal 16(%edx),%edx + cmpl $268435456,%ebp + je .L10814rounds_alt + movl $13,%ecx movups %xmm0,-32(%edx) movups %xmm2,-16(%edx) .byte 102,15,58,223,202,1 - call .L099key_256a_cold + call .L109key_256a_cold .byte 102,15,58,223,200,1 - call .L100key_256b + call .L110key_256b .byte 102,15,58,223,202,2 - call .L101key_256a + call .L111key_256a .byte 102,15,58,223,200,2 - call .L100key_256b + call .L110key_256b .byte 102,15,58,223,202,4 - call .L101key_256a + call .L111key_256a .byte 102,15,58,223,200,4 - call .L100key_256b + call .L110key_256b .byte 102,15,58,223,202,8 - call .L101key_256a + call .L111key_256a .byte 102,15,58,223,200,8 - call .L100key_256b + call .L110key_256b .byte 102,15,58,223,202,16 - call .L101key_256a + call .L111key_256a .byte 102,15,58,223,200,16 - call .L100key_256b + call .L110key_256b .byte 102,15,58,223,202,32 - call .L101key_256a + call .L111key_256a .byte 
102,15,58,223,200,32 - call .L100key_256b + call .L110key_256b .byte 102,15,58,223,202,64 - call .L101key_256a + call .L111key_256a movups %xmm0,(%edx) movl %ecx,16(%edx) xorl %eax,%eax - ret + jmp .L100good_key .align 16 -.L101key_256a: +.L111key_256a: movups %xmm2,(%edx) leal 16(%edx),%edx -.L099key_256a_cold: +.L109key_256a_cold: shufps $16,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $140,%xmm0,%xmm4 @@ -2122,7 +2308,7 @@ _aesni_set_encrypt_key: xorps %xmm1,%xmm0 ret .align 16 -.L100key_256b: +.L110key_256b: movups %xmm0,(%edx) leal 16(%edx),%edx shufps $16,%xmm2,%xmm4 @@ -2132,13 +2318,70 @@ _aesni_set_encrypt_key: shufps $170,%xmm1,%xmm1 xorps %xmm1,%xmm2 ret +.align 16 +.L10814rounds_alt: + movdqa (%ebx),%xmm5 + movdqa 32(%ebx),%xmm4 + movl $7,%ecx + movdqu %xmm0,-32(%edx) + movdqa %xmm2,%xmm1 + movdqu %xmm2,-16(%edx) +.L112loop_key256: +.byte 102,15,56,0,213 +.byte 102,15,56,221,212 + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + pslld $1,%xmm4 + pxor %xmm2,%xmm0 + movdqu %xmm0,(%edx) + decl %ecx + jz .L113done_key256 + pshufd $255,%xmm0,%xmm2 + pxor %xmm3,%xmm3 +.byte 102,15,56,221,211 + movdqa %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm3,%xmm1 + pxor %xmm1,%xmm2 + movdqu %xmm2,16(%edx) + leal 32(%edx),%edx + movdqa %xmm2,%xmm1 + jmp .L112loop_key256 +.L113done_key256: + movl $13,%ecx + movl %ecx,16(%edx) +.L100good_key: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + xorl %eax,%eax + popl %ebx + popl %ebp + ret .align 4 -.L088bad_pointer: +.L091bad_pointer: movl $-1,%eax + popl %ebx + popl %ebp ret .align 4 -.L091bad_keybits: +.L095bad_keybits: + pxor %xmm0,%xmm0 movl $-2,%eax + popl %ebx + popl %ebp ret .size _aesni_set_encrypt_key,.-_aesni_set_encrypt_key .globl aesni_set_encrypt_key @@ -2164,7 +2407,7 @@ aesni_set_decrypt_key: movl 
12(%esp),%edx shll $4,%ecx testl %eax,%eax - jnz .L102dec_key_ret + jnz .L114dec_key_ret leal 16(%edx,%ecx,1),%eax movups (%edx),%xmm0 movups (%eax),%xmm1 @@ -2172,7 +2415,7 @@ aesni_set_decrypt_key: movups %xmm1,(%edx) leal 16(%edx),%edx leal -16(%eax),%eax -.L103dec_key_inverse: +.L115dec_key_inverse: movups (%edx),%xmm0 movups (%eax),%xmm1 .byte 102,15,56,219,192 @@ -2182,15 +2425,24 @@ aesni_set_decrypt_key: movups %xmm0,16(%eax) movups %xmm1,-16(%edx) cmpl %edx,%eax - ja .L103dec_key_inverse + ja .L115dec_key_inverse movups (%edx),%xmm0 .byte 102,15,56,219,192 movups %xmm0,(%edx) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 xorl %eax,%eax -.L102dec_key_ret: +.L114dec_key_ret: ret .size aesni_set_decrypt_key,.-.L_aesni_set_decrypt_key_begin +.align 64 +.Lkey_const: +.long 202313229,202313229,202313229,202313229 +.long 67569157,67569157,67569157,67569157 +.long 1,1,1,1 +.long 27,27,27,27 .byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69 .byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83 .byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 .byte 115,108,46,111,114,103,62,0 +.comm OPENSSL_ia32cap_P,16,4 diff --git a/deps/openssl/asm_obsolete/x86-macosx-gas/aes/aesni-x86.s b/deps/openssl/asm_obsolete/x86-macosx-gas/aes/aesni-x86.s index cecd5f83f7..c1f5aec62c 100644 --- a/deps/openssl/asm_obsolete/x86-macosx-gas/aes/aesni-x86.s +++ b/deps/openssl/asm_obsolete/x86-macosx-gas/aes/aesni-x86.s @@ -20,7 +20,10 @@ L000enc1_loop_1: leal 16(%edx),%edx jnz L000enc1_loop_1 .byte 102,15,56,221,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 movups %xmm2,(%eax) + pxor %xmm2,%xmm2 ret .globl _aesni_decrypt .align 4 @@ -42,7 +45,10 @@ L001dec1_loop_2: leal 16(%edx),%edx jnz L001dec1_loop_2 .byte 102,15,56,223,209 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 movups %xmm2,(%eax) + pxor %xmm2,%xmm2 ret .align 4 __aesni_encrypt2: @@ -242,17 +248,15 @@ __aesni_encrypt6: negl %ecx .byte 102,15,56,220,225 pxor %xmm0,%xmm7 + movups (%edx,%ecx,1),%xmm0 addl $16,%ecx -.byte 
102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 - movups -16(%edx,%ecx,1),%xmm0 - jmp L_aesni_encrypt6_enter + jmp L008_aesni_encrypt6_inner .align 4,0x90 -L008enc6_loop: +L009enc6_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 +L008_aesni_encrypt6_inner: .byte 102,15,56,220,233 .byte 102,15,56,220,241 .byte 102,15,56,220,249 @@ -266,7 +270,7 @@ L_aesni_encrypt6_enter: .byte 102,15,56,220,240 .byte 102,15,56,220,248 movups -16(%edx,%ecx,1),%xmm0 - jnz L008enc6_loop + jnz L009enc6_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 @@ -296,17 +300,15 @@ __aesni_decrypt6: negl %ecx .byte 102,15,56,222,225 pxor %xmm0,%xmm7 + movups (%edx,%ecx,1),%xmm0 addl $16,%ecx -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 - movups -16(%edx,%ecx,1),%xmm0 - jmp L_aesni_decrypt6_enter + jmp L010_aesni_decrypt6_inner .align 4,0x90 -L009dec6_loop: +L011dec6_loop: .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 +L010_aesni_decrypt6_inner: .byte 102,15,56,222,233 .byte 102,15,56,222,241 .byte 102,15,56,222,249 @@ -320,7 +322,7 @@ L_aesni_decrypt6_enter: .byte 102,15,56,222,240 .byte 102,15,56,222,248 movups -16(%edx,%ecx,1),%xmm0 - jnz L009dec6_loop + jnz L011dec6_loop .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 @@ -348,14 +350,14 @@ L_aesni_ecb_encrypt_begin: movl 32(%esp),%edx movl 36(%esp),%ebx andl $-16,%eax - jz L010ecb_ret + jz L012ecb_ret movl 240(%edx),%ecx testl %ebx,%ebx - jz L011ecb_decrypt + jz L013ecb_decrypt movl %edx,%ebp movl %ecx,%ebx cmpl $96,%eax - jb L012ecb_enc_tail + jb L014ecb_enc_tail movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -364,9 +366,9 @@ L_aesni_ecb_encrypt_begin: movdqu 80(%esi),%xmm7 leal 96(%esi),%esi subl $96,%eax - jmp L013ecb_enc_loop6_enter + jmp L015ecb_enc_loop6_enter .align 4,0x90 -L014ecb_enc_loop6: +L016ecb_enc_loop6: movups %xmm2,(%edi) movdqu (%esi),%xmm2 movups 
%xmm3,16(%edi) @@ -381,12 +383,12 @@ L014ecb_enc_loop6: leal 96(%edi),%edi movdqu 80(%esi),%xmm7 leal 96(%esi),%esi -L013ecb_enc_loop6_enter: +L015ecb_enc_loop6_enter: call __aesni_encrypt6 movl %ebp,%edx movl %ebx,%ecx subl $96,%eax - jnc L014ecb_enc_loop6 + jnc L016ecb_enc_loop6 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) @@ -395,18 +397,18 @@ L013ecb_enc_loop6_enter: movups %xmm7,80(%edi) leal 96(%edi),%edi addl $96,%eax - jz L010ecb_ret -L012ecb_enc_tail: + jz L012ecb_ret +L014ecb_enc_tail: movups (%esi),%xmm2 cmpl $32,%eax - jb L015ecb_enc_one + jb L017ecb_enc_one movups 16(%esi),%xmm3 - je L016ecb_enc_two + je L018ecb_enc_two movups 32(%esi),%xmm4 cmpl $64,%eax - jb L017ecb_enc_three + jb L019ecb_enc_three movups 48(%esi),%xmm5 - je L018ecb_enc_four + je L020ecb_enc_four movups 64(%esi),%xmm6 xorps %xmm7,%xmm7 call __aesni_encrypt6 @@ -415,49 +417,49 @@ L012ecb_enc_tail: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp L010ecb_ret + jmp L012ecb_ret .align 4,0x90 -L015ecb_enc_one: +L017ecb_enc_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L019enc1_loop_3: +L021enc1_loop_3: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L019enc1_loop_3 + jnz L021enc1_loop_3 .byte 102,15,56,221,209 movups %xmm2,(%edi) - jmp L010ecb_ret + jmp L012ecb_ret .align 4,0x90 -L016ecb_enc_two: +L018ecb_enc_two: call __aesni_encrypt2 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp L010ecb_ret + jmp L012ecb_ret .align 4,0x90 -L017ecb_enc_three: +L019ecb_enc_three: call __aesni_encrypt3 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp L010ecb_ret + jmp L012ecb_ret .align 4,0x90 -L018ecb_enc_four: +L020ecb_enc_four: call __aesni_encrypt4 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) movups %xmm5,48(%edi) - jmp L010ecb_ret + jmp L012ecb_ret .align 4,0x90 -L011ecb_decrypt: +L013ecb_decrypt: movl %edx,%ebp movl %ecx,%ebx cmpl $96,%eax - jb 
L020ecb_dec_tail + jb L022ecb_dec_tail movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -466,9 +468,9 @@ L011ecb_decrypt: movdqu 80(%esi),%xmm7 leal 96(%esi),%esi subl $96,%eax - jmp L021ecb_dec_loop6_enter + jmp L023ecb_dec_loop6_enter .align 4,0x90 -L022ecb_dec_loop6: +L024ecb_dec_loop6: movups %xmm2,(%edi) movdqu (%esi),%xmm2 movups %xmm3,16(%edi) @@ -483,12 +485,12 @@ L022ecb_dec_loop6: leal 96(%edi),%edi movdqu 80(%esi),%xmm7 leal 96(%esi),%esi -L021ecb_dec_loop6_enter: +L023ecb_dec_loop6_enter: call __aesni_decrypt6 movl %ebp,%edx movl %ebx,%ecx subl $96,%eax - jnc L022ecb_dec_loop6 + jnc L024ecb_dec_loop6 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) @@ -497,18 +499,18 @@ L021ecb_dec_loop6_enter: movups %xmm7,80(%edi) leal 96(%edi),%edi addl $96,%eax - jz L010ecb_ret -L020ecb_dec_tail: + jz L012ecb_ret +L022ecb_dec_tail: movups (%esi),%xmm2 cmpl $32,%eax - jb L023ecb_dec_one + jb L025ecb_dec_one movups 16(%esi),%xmm3 - je L024ecb_dec_two + je L026ecb_dec_two movups 32(%esi),%xmm4 cmpl $64,%eax - jb L025ecb_dec_three + jb L027ecb_dec_three movups 48(%esi),%xmm5 - je L026ecb_dec_four + je L028ecb_dec_four movups 64(%esi),%xmm6 xorps %xmm7,%xmm7 call __aesni_decrypt6 @@ -517,43 +519,51 @@ L020ecb_dec_tail: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp L010ecb_ret + jmp L012ecb_ret .align 4,0x90 -L023ecb_dec_one: +L025ecb_dec_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L027dec1_loop_4: +L029dec1_loop_4: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L027dec1_loop_4 + jnz L029dec1_loop_4 .byte 102,15,56,223,209 movups %xmm2,(%edi) - jmp L010ecb_ret + jmp L012ecb_ret .align 4,0x90 -L024ecb_dec_two: +L026ecb_dec_two: call __aesni_decrypt2 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp L010ecb_ret + jmp L012ecb_ret .align 4,0x90 -L025ecb_dec_three: +L027ecb_dec_three: call __aesni_decrypt3 movups %xmm2,(%edi) movups %xmm3,16(%edi) 
movups %xmm4,32(%edi) - jmp L010ecb_ret + jmp L012ecb_ret .align 4,0x90 -L026ecb_dec_four: +L028ecb_dec_four: call __aesni_decrypt4 movups %xmm2,(%edi) movups %xmm3,16(%edi) movups %xmm4,32(%edi) movups %xmm5,48(%edi) -L010ecb_ret: +L012ecb_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 popl %edi popl %esi popl %ebx @@ -598,7 +608,7 @@ L_aesni_ccm64_encrypt_blocks_begin: leal 32(%edx,%ecx,1),%edx subl %ecx,%ebx .byte 102,15,56,0,253 -L028ccm64_enc_outer: +L030ccm64_enc_outer: movups (%ebp),%xmm0 movl %ebx,%ecx movups (%esi),%xmm6 @@ -607,7 +617,7 @@ L028ccm64_enc_outer: xorps %xmm6,%xmm0 xorps %xmm0,%xmm3 movups 32(%ebp),%xmm0 -L029ccm64_enc2_loop: +L031ccm64_enc2_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 movups (%edx,%ecx,1),%xmm1 @@ -615,7 +625,7 @@ L029ccm64_enc2_loop: .byte 102,15,56,220,208 .byte 102,15,56,220,216 movups -16(%edx,%ecx,1),%xmm0 - jnz L029ccm64_enc2_loop + jnz L031ccm64_enc2_loop .byte 102,15,56,220,209 .byte 102,15,56,220,217 paddq 16(%esp),%xmm7 @@ -628,10 +638,18 @@ L029ccm64_enc2_loop: movups %xmm6,(%edi) .byte 102,15,56,0,213 leal 16(%edi),%edi - jnz L028ccm64_enc_outer + jnz L030ccm64_enc_outer movl 48(%esp),%esp movl 40(%esp),%edi movups %xmm3,(%edi) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 popl %edi popl %esi popl %ebx @@ -677,12 +695,12 @@ L_aesni_ccm64_decrypt_blocks_begin: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L030enc1_loop_5: +L032enc1_loop_5: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L030enc1_loop_5 + jnz L032enc1_loop_5 .byte 102,15,56,221,209 shll $4,%ebx movl $16,%ecx @@ -692,16 +710,16 @@ L030enc1_loop_5: subl %ebx,%ecx leal 32(%ebp,%ebx,1),%edx movl %ecx,%ebx - jmp L031ccm64_dec_outer + jmp L033ccm64_dec_outer .align 4,0x90 
-L031ccm64_dec_outer: +L033ccm64_dec_outer: xorps %xmm2,%xmm6 movdqa %xmm7,%xmm2 movups %xmm6,(%edi) leal 16(%edi),%edi .byte 102,15,56,0,213 subl $1,%eax - jz L032ccm64_dec_break + jz L034ccm64_dec_break movups (%ebp),%xmm0 movl %ebx,%ecx movups 16(%ebp),%xmm1 @@ -709,7 +727,7 @@ L031ccm64_dec_outer: xorps %xmm0,%xmm2 xorps %xmm6,%xmm3 movups 32(%ebp),%xmm0 -L033ccm64_dec2_loop: +L035ccm64_dec2_loop: .byte 102,15,56,220,209 .byte 102,15,56,220,217 movups (%edx,%ecx,1),%xmm1 @@ -717,7 +735,7 @@ L033ccm64_dec2_loop: .byte 102,15,56,220,208 .byte 102,15,56,220,216 movups -16(%edx,%ecx,1),%xmm0 - jnz L033ccm64_dec2_loop + jnz L035ccm64_dec2_loop movups (%esi),%xmm6 paddq 16(%esp),%xmm7 .byte 102,15,56,220,209 @@ -725,9 +743,9 @@ L033ccm64_dec2_loop: .byte 102,15,56,221,208 .byte 102,15,56,221,216 leal 16(%esi),%esi - jmp L031ccm64_dec_outer + jmp L033ccm64_dec_outer .align 4,0x90 -L032ccm64_dec_break: +L034ccm64_dec_break: movl 240(%ebp),%ecx movl %ebp,%edx movups (%edx),%xmm0 @@ -735,16 +753,24 @@ L032ccm64_dec_break: xorps %xmm0,%xmm6 leal 32(%edx),%edx xorps %xmm6,%xmm3 -L034enc1_loop_6: +L036enc1_loop_6: .byte 102,15,56,220,217 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L034enc1_loop_6 + jnz L036enc1_loop_6 .byte 102,15,56,221,217 movl 48(%esp),%esp movl 40(%esp),%edi movups %xmm3,(%edi) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 popl %edi popl %esi popl %ebx @@ -768,7 +794,7 @@ L_aesni_ctr32_encrypt_blocks_begin: andl $-16,%esp movl %ebp,80(%esp) cmpl $1,%eax - je L035ctr32_one_shortcut + je L037ctr32_one_shortcut movdqu (%ebx),%xmm7 movl $202182159,(%esp) movl $134810123,4(%esp) @@ -806,7 +832,7 @@ L_aesni_ctr32_encrypt_blocks_begin: pshufd $192,%xmm0,%xmm2 pshufd $128,%xmm0,%xmm3 cmpl $6,%eax - jb L036ctr32_tail + jb L038ctr32_tail pxor %xmm6,%xmm7 shll $4,%ecx movl $16,%ebx @@ -815,9 +841,9 @@ L_aesni_ctr32_encrypt_blocks_begin: subl 
%ecx,%ebx leal 32(%edx,%ecx,1),%edx subl $6,%eax - jmp L037ctr32_loop6 + jmp L039ctr32_loop6 .align 4,0x90 -L037ctr32_loop6: +L039ctr32_loop6: pshufd $64,%xmm0,%xmm4 movdqa 32(%esp),%xmm0 pshufd $192,%xmm1,%xmm5 @@ -871,27 +897,27 @@ L037ctr32_loop6: leal 96(%edi),%edi pshufd $128,%xmm0,%xmm3 subl $6,%eax - jnc L037ctr32_loop6 + jnc L039ctr32_loop6 addl $6,%eax - jz L038ctr32_ret + jz L040ctr32_ret movdqu (%ebp),%xmm7 movl %ebp,%edx pxor 32(%esp),%xmm7 movl 240(%ebp),%ecx -L036ctr32_tail: +L038ctr32_tail: por %xmm7,%xmm2 cmpl $2,%eax - jb L039ctr32_one + jb L041ctr32_one pshufd $64,%xmm0,%xmm4 por %xmm7,%xmm3 - je L040ctr32_two + je L042ctr32_two pshufd $192,%xmm1,%xmm5 por %xmm7,%xmm4 cmpl $4,%eax - jb L041ctr32_three + jb L043ctr32_three pshufd $128,%xmm1,%xmm6 por %xmm7,%xmm5 - je L042ctr32_four + je L044ctr32_four por %xmm7,%xmm6 call __aesni_encrypt6 movups (%esi),%xmm1 @@ -909,29 +935,29 @@ L036ctr32_tail: movups %xmm4,32(%edi) movups %xmm5,48(%edi) movups %xmm6,64(%edi) - jmp L038ctr32_ret + jmp L040ctr32_ret .align 4,0x90 -L035ctr32_one_shortcut: +L037ctr32_one_shortcut: movups (%ebx),%xmm2 movl 240(%edx),%ecx -L039ctr32_one: +L041ctr32_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L043enc1_loop_7: +L045enc1_loop_7: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L043enc1_loop_7 + jnz L045enc1_loop_7 .byte 102,15,56,221,209 movups (%esi),%xmm6 xorps %xmm2,%xmm6 movups %xmm6,(%edi) - jmp L038ctr32_ret + jmp L040ctr32_ret .align 4,0x90 -L040ctr32_two: +L042ctr32_two: call __aesni_encrypt2 movups (%esi),%xmm5 movups 16(%esi),%xmm6 @@ -939,9 +965,9 @@ L040ctr32_two: xorps %xmm6,%xmm3 movups %xmm2,(%edi) movups %xmm3,16(%edi) - jmp L038ctr32_ret + jmp L040ctr32_ret .align 4,0x90 -L041ctr32_three: +L043ctr32_three: call __aesni_encrypt3 movups (%esi),%xmm5 movups 16(%esi),%xmm6 @@ -952,9 +978,9 @@ L041ctr32_three: xorps %xmm7,%xmm4 movups %xmm3,16(%edi) movups %xmm4,32(%edi) - jmp L038ctr32_ret 
+ jmp L040ctr32_ret .align 4,0x90 -L042ctr32_four: +L044ctr32_four: call __aesni_encrypt4 movups (%esi),%xmm6 movups 16(%esi),%xmm7 @@ -968,7 +994,18 @@ L042ctr32_four: xorps %xmm0,%xmm5 movups %xmm4,32(%edi) movups %xmm5,48(%edi) -L038ctr32_ret: +L040ctr32_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + movdqa %xmm0,32(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm0,48(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm0,64(%esp) + pxor %xmm7,%xmm7 movl 80(%esp),%esp popl %edi popl %esi @@ -991,12 +1028,12 @@ L_aesni_xts_encrypt_begin: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L044enc1_loop_8: +L046enc1_loop_8: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L044enc1_loop_8 + jnz L046enc1_loop_8 .byte 102,15,56,221,209 movl 20(%esp),%esi movl 24(%esp),%edi @@ -1020,14 +1057,14 @@ L044enc1_loop_8: movl %edx,%ebp movl %ecx,%ebx subl $96,%eax - jc L045xts_enc_short + jc L047xts_enc_short shll $4,%ecx movl $16,%ebx subl %ecx,%ebx leal 32(%edx,%ecx,1),%edx - jmp L046xts_enc_loop6 + jmp L048xts_enc_loop6 .align 4,0x90 -L046xts_enc_loop6: +L048xts_enc_loop6: pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,(%esp) @@ -1116,23 +1153,23 @@ L046xts_enc_loop6: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 subl $96,%eax - jnc L046xts_enc_loop6 + jnc L048xts_enc_loop6 movl 240(%ebp),%ecx movl %ebp,%edx movl %ecx,%ebx -L045xts_enc_short: +L047xts_enc_short: addl $96,%eax - jz L047xts_enc_done6x + jz L049xts_enc_done6x movdqa %xmm1,%xmm5 cmpl $32,%eax - jb L048xts_enc_one + jb L050xts_enc_one pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 - je L049xts_enc_two + je L051xts_enc_two pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm6 @@ -1141,7 +1178,7 @@ L045xts_enc_short: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 cmpl $64,%eax - jb L050xts_enc_three + jb L052xts_enc_three pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm7 @@ 
-1151,7 +1188,7 @@ L045xts_enc_short: pxor %xmm2,%xmm1 movdqa %xmm5,(%esp) movdqa %xmm6,16(%esp) - je L051xts_enc_four + je L053xts_enc_four movdqa %xmm7,32(%esp) pshufd $19,%xmm0,%xmm7 movdqa %xmm1,48(%esp) @@ -1183,9 +1220,9 @@ L045xts_enc_short: movups %xmm5,48(%edi) movups %xmm6,64(%edi) leal 80(%edi),%edi - jmp L052xts_enc_done + jmp L054xts_enc_done .align 4,0x90 -L048xts_enc_one: +L050xts_enc_one: movups (%esi),%xmm2 leal 16(%esi),%esi xorps %xmm5,%xmm2 @@ -1193,20 +1230,20 @@ L048xts_enc_one: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L053enc1_loop_9: +L055enc1_loop_9: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L053enc1_loop_9 + jnz L055enc1_loop_9 .byte 102,15,56,221,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) leal 16(%edi),%edi movdqa %xmm5,%xmm1 - jmp L052xts_enc_done + jmp L054xts_enc_done .align 4,0x90 -L049xts_enc_two: +L051xts_enc_two: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1220,9 +1257,9 @@ L049xts_enc_two: movups %xmm3,16(%edi) leal 32(%edi),%edi movdqa %xmm6,%xmm1 - jmp L052xts_enc_done + jmp L054xts_enc_done .align 4,0x90 -L050xts_enc_three: +L052xts_enc_three: movaps %xmm1,%xmm7 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1240,9 +1277,9 @@ L050xts_enc_three: movups %xmm4,32(%edi) leal 48(%edi),%edi movdqa %xmm7,%xmm1 - jmp L052xts_enc_done + jmp L054xts_enc_done .align 4,0x90 -L051xts_enc_four: +L053xts_enc_four: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1264,28 +1301,28 @@ L051xts_enc_four: movups %xmm5,48(%edi) leal 64(%edi),%edi movdqa %xmm6,%xmm1 - jmp L052xts_enc_done + jmp L054xts_enc_done .align 4,0x90 -L047xts_enc_done6x: +L049xts_enc_done6x: movl 112(%esp),%eax andl $15,%eax - jz L054xts_enc_ret + jz L056xts_enc_ret movdqa %xmm1,%xmm5 movl %eax,112(%esp) - jmp L055xts_enc_steal + jmp L057xts_enc_steal .align 4,0x90 -L052xts_enc_done: +L054xts_enc_done: movl 112(%esp),%eax pxor %xmm0,%xmm0 andl $15,%eax - jz L054xts_enc_ret + jz 
L056xts_enc_ret pcmpgtd %xmm1,%xmm0 movl %eax,112(%esp) pshufd $19,%xmm0,%xmm5 paddq %xmm1,%xmm1 pand 96(%esp),%xmm5 pxor %xmm1,%xmm5 -L055xts_enc_steal: +L057xts_enc_steal: movzbl (%esi),%ecx movzbl -16(%edi),%edx leal 1(%esi),%esi @@ -1293,7 +1330,7 @@ L055xts_enc_steal: movb %dl,(%edi) leal 1(%edi),%edi subl $1,%eax - jnz L055xts_enc_steal + jnz L057xts_enc_steal subl 112(%esp),%edi movl %ebp,%edx movl %ebx,%ecx @@ -1303,16 +1340,30 @@ L055xts_enc_steal: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L056enc1_loop_10: +L058enc1_loop_10: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L056enc1_loop_10 + jnz L058enc1_loop_10 .byte 102,15,56,221,209 xorps %xmm5,%xmm2 movups %xmm2,-16(%edi) -L054xts_enc_ret: +L056xts_enc_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + movdqa %xmm0,(%esp) + pxor %xmm3,%xmm3 + movdqa %xmm0,16(%esp) + pxor %xmm4,%xmm4 + movdqa %xmm0,32(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm0,48(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm0,64(%esp) + pxor %xmm7,%xmm7 + movdqa %xmm0,80(%esp) movl 116(%esp),%esp popl %edi popl %esi @@ -1335,12 +1386,12 @@ L_aesni_xts_decrypt_begin: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L057enc1_loop_11: +L059enc1_loop_11: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L057enc1_loop_11 + jnz L059enc1_loop_11 .byte 102,15,56,221,209 movl 20(%esp),%esi movl 24(%esp),%edi @@ -1369,14 +1420,14 @@ L057enc1_loop_11: pcmpgtd %xmm1,%xmm0 andl $-16,%eax subl $96,%eax - jc L058xts_dec_short + jc L060xts_dec_short shll $4,%ecx movl $16,%ebx subl %ecx,%ebx leal 32(%edx,%ecx,1),%edx - jmp L059xts_dec_loop6 + jmp L061xts_dec_loop6 .align 4,0x90 -L059xts_dec_loop6: +L061xts_dec_loop6: pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,(%esp) @@ -1465,23 +1516,23 @@ L059xts_dec_loop6: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 subl $96,%eax - jnc L059xts_dec_loop6 + jnc L061xts_dec_loop6 movl 240(%ebp),%ecx movl %ebp,%edx movl 
%ecx,%ebx -L058xts_dec_short: +L060xts_dec_short: addl $96,%eax - jz L060xts_dec_done6x + jz L062xts_dec_done6x movdqa %xmm1,%xmm5 cmpl $32,%eax - jb L061xts_dec_one + jb L063xts_dec_one pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 paddq %xmm1,%xmm1 pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 - je L062xts_dec_two + je L064xts_dec_two pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm6 @@ -1490,7 +1541,7 @@ L058xts_dec_short: pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 cmpl $64,%eax - jb L063xts_dec_three + jb L065xts_dec_three pshufd $19,%xmm0,%xmm2 pxor %xmm0,%xmm0 movdqa %xmm1,%xmm7 @@ -1500,7 +1551,7 @@ L058xts_dec_short: pxor %xmm2,%xmm1 movdqa %xmm5,(%esp) movdqa %xmm6,16(%esp) - je L064xts_dec_four + je L066xts_dec_four movdqa %xmm7,32(%esp) pshufd $19,%xmm0,%xmm7 movdqa %xmm1,48(%esp) @@ -1532,9 +1583,9 @@ L058xts_dec_short: movups %xmm5,48(%edi) movups %xmm6,64(%edi) leal 80(%edi),%edi - jmp L065xts_dec_done + jmp L067xts_dec_done .align 4,0x90 -L061xts_dec_one: +L063xts_dec_one: movups (%esi),%xmm2 leal 16(%esi),%esi xorps %xmm5,%xmm2 @@ -1542,20 +1593,20 @@ L061xts_dec_one: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L066dec1_loop_12: +L068dec1_loop_12: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L066dec1_loop_12 + jnz L068dec1_loop_12 .byte 102,15,56,223,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) leal 16(%edi),%edi movdqa %xmm5,%xmm1 - jmp L065xts_dec_done + jmp L067xts_dec_done .align 4,0x90 -L062xts_dec_two: +L064xts_dec_two: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1569,9 +1620,9 @@ L062xts_dec_two: movups %xmm3,16(%edi) leal 32(%edi),%edi movdqa %xmm6,%xmm1 - jmp L065xts_dec_done + jmp L067xts_dec_done .align 4,0x90 -L063xts_dec_three: +L065xts_dec_three: movaps %xmm1,%xmm7 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1589,9 +1640,9 @@ L063xts_dec_three: movups %xmm4,32(%edi) leal 48(%edi),%edi movdqa %xmm7,%xmm1 - jmp L065xts_dec_done + jmp L067xts_dec_done .align 4,0x90 
-L064xts_dec_four: +L066xts_dec_four: movaps %xmm1,%xmm6 movups (%esi),%xmm2 movups 16(%esi),%xmm3 @@ -1613,20 +1664,20 @@ L064xts_dec_four: movups %xmm5,48(%edi) leal 64(%edi),%edi movdqa %xmm6,%xmm1 - jmp L065xts_dec_done + jmp L067xts_dec_done .align 4,0x90 -L060xts_dec_done6x: +L062xts_dec_done6x: movl 112(%esp),%eax andl $15,%eax - jz L067xts_dec_ret + jz L069xts_dec_ret movl %eax,112(%esp) - jmp L068xts_dec_only_one_more + jmp L070xts_dec_only_one_more .align 4,0x90 -L065xts_dec_done: +L067xts_dec_done: movl 112(%esp),%eax pxor %xmm0,%xmm0 andl $15,%eax - jz L067xts_dec_ret + jz L069xts_dec_ret pcmpgtd %xmm1,%xmm0 movl %eax,112(%esp) pshufd $19,%xmm0,%xmm2 @@ -1636,7 +1687,7 @@ L065xts_dec_done: pand %xmm3,%xmm2 pcmpgtd %xmm1,%xmm0 pxor %xmm2,%xmm1 -L068xts_dec_only_one_more: +L070xts_dec_only_one_more: pshufd $19,%xmm0,%xmm5 movdqa %xmm1,%xmm6 paddq %xmm1,%xmm1 @@ -1650,16 +1701,16 @@ L068xts_dec_only_one_more: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L069dec1_loop_13: +L071dec1_loop_13: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L069dec1_loop_13 + jnz L071dec1_loop_13 .byte 102,15,56,223,209 xorps %xmm5,%xmm2 movups %xmm2,(%edi) -L070xts_dec_steal: +L072xts_dec_steal: movzbl 16(%esi),%ecx movzbl (%edi),%edx leal 1(%esi),%esi @@ -1667,7 +1718,7 @@ L070xts_dec_steal: movb %dl,16(%edi) leal 1(%edi),%edi subl $1,%eax - jnz L070xts_dec_steal + jnz L072xts_dec_steal subl 112(%esp),%edi movl %ebp,%edx movl %ebx,%ecx @@ -1677,16 +1728,30 @@ L070xts_dec_steal: movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L071dec1_loop_14: +L073dec1_loop_14: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L071dec1_loop_14 + jnz L073dec1_loop_14 .byte 102,15,56,223,209 xorps %xmm6,%xmm2 movups %xmm2,(%edi) -L067xts_dec_ret: +L069xts_dec_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + movdqa %xmm0,(%esp) + pxor %xmm3,%xmm3 + movdqa %xmm0,16(%esp) + pxor %xmm4,%xmm4 + 
movdqa %xmm0,32(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm0,48(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm0,64(%esp) + pxor %xmm7,%xmm7 + movdqa %xmm0,80(%esp) movl 116(%esp),%esp popl %edi popl %esi @@ -1710,7 +1775,7 @@ L_aesni_cbc_encrypt_begin: movl 32(%esp),%edx movl 36(%esp),%ebp testl %eax,%eax - jz L072cbc_abort + jz L074cbc_abort cmpl $0,40(%esp) xchgl %esp,%ebx movups (%ebp),%xmm7 @@ -1718,14 +1783,14 @@ L_aesni_cbc_encrypt_begin: movl %edx,%ebp movl %ebx,16(%esp) movl %ecx,%ebx - je L073cbc_decrypt + je L075cbc_decrypt movaps %xmm7,%xmm2 cmpl $16,%eax - jb L074cbc_enc_tail + jb L076cbc_enc_tail subl $16,%eax - jmp L075cbc_enc_loop + jmp L077cbc_enc_loop .align 4,0x90 -L075cbc_enc_loop: +L077cbc_enc_loop: movups (%esi),%xmm7 leal 16(%esi),%esi movups (%edx),%xmm0 @@ -1733,24 +1798,25 @@ L075cbc_enc_loop: xorps %xmm0,%xmm7 leal 32(%edx),%edx xorps %xmm7,%xmm2 -L076enc1_loop_15: +L078enc1_loop_15: .byte 102,15,56,220,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L076enc1_loop_15 + jnz L078enc1_loop_15 .byte 102,15,56,221,209 movl %ebx,%ecx movl %ebp,%edx movups %xmm2,(%edi) leal 16(%edi),%edi subl $16,%eax - jnc L075cbc_enc_loop + jnc L077cbc_enc_loop addl $16,%eax - jnz L074cbc_enc_tail + jnz L076cbc_enc_tail movaps %xmm2,%xmm7 - jmp L077cbc_ret -L074cbc_enc_tail: + pxor %xmm2,%xmm2 + jmp L079cbc_ret +L076cbc_enc_tail: movl %eax,%ecx .long 2767451785 movl $16,%ecx @@ -1761,20 +1827,20 @@ L074cbc_enc_tail: movl %ebx,%ecx movl %edi,%esi movl %ebp,%edx - jmp L075cbc_enc_loop + jmp L077cbc_enc_loop .align 4,0x90 -L073cbc_decrypt: +L075cbc_decrypt: cmpl $80,%eax - jbe L078cbc_dec_tail + jbe L080cbc_dec_tail movaps %xmm7,(%esp) subl $80,%eax - jmp L079cbc_dec_loop6_enter + jmp L081cbc_dec_loop6_enter .align 4,0x90 -L080cbc_dec_loop6: +L082cbc_dec_loop6: movaps %xmm0,(%esp) movups %xmm7,(%edi) leal 16(%edi),%edi -L079cbc_dec_loop6_enter: +L081cbc_dec_loop6_enter: movdqu (%esi),%xmm2 movdqu 16(%esi),%xmm3 movdqu 32(%esi),%xmm4 @@ -1804,28 +1870,28 @@ 
L079cbc_dec_loop6_enter: movups %xmm6,64(%edi) leal 80(%edi),%edi subl $96,%eax - ja L080cbc_dec_loop6 + ja L082cbc_dec_loop6 movaps %xmm7,%xmm2 movaps %xmm0,%xmm7 addl $80,%eax - jle L081cbc_dec_tail_collected + jle L083cbc_dec_clear_tail_collected movups %xmm2,(%edi) leal 16(%edi),%edi -L078cbc_dec_tail: +L080cbc_dec_tail: movups (%esi),%xmm2 movaps %xmm2,%xmm6 cmpl $16,%eax - jbe L082cbc_dec_one + jbe L084cbc_dec_one movups 16(%esi),%xmm3 movaps %xmm3,%xmm5 cmpl $32,%eax - jbe L083cbc_dec_two + jbe L085cbc_dec_two movups 32(%esi),%xmm4 cmpl $48,%eax - jbe L084cbc_dec_three + jbe L086cbc_dec_three movups 48(%esi),%xmm5 cmpl $64,%eax - jbe L085cbc_dec_four + jbe L087cbc_dec_four movups 64(%esi),%xmm6 movaps %xmm7,(%esp) movups (%esi),%xmm2 @@ -1843,55 +1909,62 @@ L078cbc_dec_tail: xorps %xmm0,%xmm6 movups %xmm2,(%edi) movups %xmm3,16(%edi) + pxor %xmm3,%xmm3 movups %xmm4,32(%edi) + pxor %xmm4,%xmm4 movups %xmm5,48(%edi) + pxor %xmm5,%xmm5 leal 64(%edi),%edi movaps %xmm6,%xmm2 + pxor %xmm6,%xmm6 subl $80,%eax - jmp L081cbc_dec_tail_collected + jmp L088cbc_dec_tail_collected .align 4,0x90 -L082cbc_dec_one: +L084cbc_dec_one: movups (%edx),%xmm0 movups 16(%edx),%xmm1 leal 32(%edx),%edx xorps %xmm0,%xmm2 -L086dec1_loop_16: +L089dec1_loop_16: .byte 102,15,56,222,209 decl %ecx movups (%edx),%xmm1 leal 16(%edx),%edx - jnz L086dec1_loop_16 + jnz L089dec1_loop_16 .byte 102,15,56,223,209 xorps %xmm7,%xmm2 movaps %xmm6,%xmm7 subl $16,%eax - jmp L081cbc_dec_tail_collected + jmp L088cbc_dec_tail_collected .align 4,0x90 -L083cbc_dec_two: +L085cbc_dec_two: call __aesni_decrypt2 xorps %xmm7,%xmm2 xorps %xmm6,%xmm3 movups %xmm2,(%edi) movaps %xmm3,%xmm2 + pxor %xmm3,%xmm3 leal 16(%edi),%edi movaps %xmm5,%xmm7 subl $32,%eax - jmp L081cbc_dec_tail_collected + jmp L088cbc_dec_tail_collected .align 4,0x90 -L084cbc_dec_three: +L086cbc_dec_three: call __aesni_decrypt3 xorps %xmm7,%xmm2 xorps %xmm6,%xmm3 xorps %xmm5,%xmm4 movups %xmm2,(%edi) movaps %xmm4,%xmm2 + pxor %xmm4,%xmm4 movups 
%xmm3,16(%edi) + pxor %xmm3,%xmm3 leal 32(%edi),%edi movups 32(%esi),%xmm7 subl $48,%eax - jmp L081cbc_dec_tail_collected + jmp L088cbc_dec_tail_collected .align 4,0x90 -L085cbc_dec_four: +L087cbc_dec_four: call __aesni_decrypt4 movups 16(%esi),%xmm1 movups 32(%esi),%xmm0 @@ -1901,28 +1974,44 @@ L085cbc_dec_four: movups %xmm2,(%edi) xorps %xmm1,%xmm4 movups %xmm3,16(%edi) + pxor %xmm3,%xmm3 xorps %xmm0,%xmm5 movups %xmm4,32(%edi) + pxor %xmm4,%xmm4 leal 48(%edi),%edi movaps %xmm5,%xmm2 + pxor %xmm5,%xmm5 subl $64,%eax -L081cbc_dec_tail_collected: + jmp L088cbc_dec_tail_collected +.align 4,0x90 +L083cbc_dec_clear_tail_collected: + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 +L088cbc_dec_tail_collected: andl $15,%eax - jnz L087cbc_dec_tail_partial + jnz L090cbc_dec_tail_partial movups %xmm2,(%edi) - jmp L077cbc_ret + pxor %xmm0,%xmm0 + jmp L079cbc_ret .align 4,0x90 -L087cbc_dec_tail_partial: +L090cbc_dec_tail_partial: movaps %xmm2,(%esp) + pxor %xmm0,%xmm0 movl $16,%ecx movl %esp,%esi subl %eax,%ecx .long 2767451785 -L077cbc_ret: + movdqa %xmm2,(%esp) +L079cbc_ret: movl 16(%esp),%esp movl 36(%esp),%ebp + pxor %xmm2,%xmm2 + pxor %xmm1,%xmm1 movups %xmm7,(%ebp) -L072cbc_abort: + pxor %xmm7,%xmm7 +L074cbc_abort: popl %edi popl %esi popl %ebx @@ -1930,52 +2019,62 @@ L072cbc_abort: ret .align 4 __aesni_set_encrypt_key: + pushl %ebp + pushl %ebx testl %eax,%eax - jz L088bad_pointer + jz L091bad_pointer testl %edx,%edx - jz L088bad_pointer + jz L091bad_pointer + call L092pic +L092pic: + popl %ebx + leal Lkey_const-L092pic(%ebx),%ebx + movl L_OPENSSL_ia32cap_P$non_lazy_ptr-Lkey_const(%ebx),%ebp movups (%eax),%xmm0 xorps %xmm4,%xmm4 + movl 4(%ebp),%ebp leal 16(%edx),%edx + andl $268437504,%ebp cmpl $256,%ecx - je L08914rounds + je L09314rounds cmpl $192,%ecx - je L09012rounds + je L09412rounds cmpl $128,%ecx - jne L091bad_keybits + jne L095bad_keybits .align 4,0x90 -L09210rounds: +L09610rounds: + cmpl $268435456,%ebp + je L09710rounds_alt movl 
$9,%ecx movups %xmm0,-16(%edx) .byte 102,15,58,223,200,1 - call L093key_128_cold + call L098key_128_cold .byte 102,15,58,223,200,2 - call L094key_128 + call L099key_128 .byte 102,15,58,223,200,4 - call L094key_128 + call L099key_128 .byte 102,15,58,223,200,8 - call L094key_128 + call L099key_128 .byte 102,15,58,223,200,16 - call L094key_128 + call L099key_128 .byte 102,15,58,223,200,32 - call L094key_128 + call L099key_128 .byte 102,15,58,223,200,64 - call L094key_128 + call L099key_128 .byte 102,15,58,223,200,128 - call L094key_128 + call L099key_128 .byte 102,15,58,223,200,27 - call L094key_128 + call L099key_128 .byte 102,15,58,223,200,54 - call L094key_128 + call L099key_128 movups %xmm0,(%edx) movl %ecx,80(%edx) - xorl %eax,%eax - ret + jmp L100good_key .align 4,0x90 -L094key_128: +L099key_128: movups %xmm0,(%edx) leal 16(%edx),%edx -L093key_128_cold: +L098key_128_cold: shufps $16,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $140,%xmm0,%xmm4 @@ -1984,38 +2083,91 @@ L093key_128_cold: xorps %xmm1,%xmm0 ret .align 4,0x90 -L09012rounds: +L09710rounds_alt: + movdqa (%ebx),%xmm5 + movl $8,%ecx + movdqa 32(%ebx),%xmm4 + movdqa %xmm0,%xmm2 + movdqu %xmm0,-16(%edx) +L101loop_key128: +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + leal 16(%edx),%edx + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,-16(%edx) + movdqa %xmm0,%xmm2 + decl %ecx + jnz L101loop_key128 + movdqa 48(%ebx),%xmm4 +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + pslld $1,%xmm4 + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,(%edx) + movdqa %xmm0,%xmm2 +.byte 102,15,56,0,197 +.byte 102,15,56,221,196 + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu 
%xmm0,16(%edx) + movl $9,%ecx + movl %ecx,96(%edx) + jmp L100good_key +.align 4,0x90 +L09412rounds: movq 16(%eax),%xmm2 + cmpl $268435456,%ebp + je L10212rounds_alt movl $11,%ecx movups %xmm0,-16(%edx) .byte 102,15,58,223,202,1 - call L095key_192a_cold + call L103key_192a_cold .byte 102,15,58,223,202,2 - call L096key_192b + call L104key_192b .byte 102,15,58,223,202,4 - call L097key_192a + call L105key_192a .byte 102,15,58,223,202,8 - call L096key_192b + call L104key_192b .byte 102,15,58,223,202,16 - call L097key_192a + call L105key_192a .byte 102,15,58,223,202,32 - call L096key_192b + call L104key_192b .byte 102,15,58,223,202,64 - call L097key_192a + call L105key_192a .byte 102,15,58,223,202,128 - call L096key_192b + call L104key_192b movups %xmm0,(%edx) movl %ecx,48(%edx) - xorl %eax,%eax - ret + jmp L100good_key .align 4,0x90 -L097key_192a: +L105key_192a: movups %xmm0,(%edx) leal 16(%edx),%edx .align 4,0x90 -L095key_192a_cold: +L103key_192a_cold: movaps %xmm2,%xmm5 -L098key_192b_warm: +L106key_192b_warm: shufps $16,%xmm0,%xmm4 movdqa %xmm2,%xmm3 xorps %xmm4,%xmm0 @@ -2029,56 +2181,90 @@ L098key_192b_warm: pxor %xmm3,%xmm2 ret .align 4,0x90 -L096key_192b: +L104key_192b: movaps %xmm0,%xmm3 shufps $68,%xmm0,%xmm5 movups %xmm5,(%edx) shufps $78,%xmm2,%xmm3 movups %xmm3,16(%edx) leal 32(%edx),%edx - jmp L098key_192b_warm + jmp L106key_192b_warm .align 4,0x90 -L08914rounds: +L10212rounds_alt: + movdqa 16(%ebx),%xmm5 + movdqa 32(%ebx),%xmm4 + movl $8,%ecx + movdqu %xmm0,-16(%edx) +L107loop_key192: + movq %xmm2,(%edx) + movdqa %xmm2,%xmm1 +.byte 102,15,56,0,213 +.byte 102,15,56,221,212 + pslld $1,%xmm4 + leal 24(%edx),%edx + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + pshufd $255,%xmm0,%xmm3 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pxor %xmm2,%xmm0 + pxor %xmm3,%xmm2 + movdqu %xmm0,-16(%edx) + decl %ecx + jnz L107loop_key192 + movl $11,%ecx + movl %ecx,32(%edx) + jmp 
L100good_key +.align 4,0x90 +L09314rounds: movups 16(%eax),%xmm2 - movl $13,%ecx leal 16(%edx),%edx + cmpl $268435456,%ebp + je L10814rounds_alt + movl $13,%ecx movups %xmm0,-32(%edx) movups %xmm2,-16(%edx) .byte 102,15,58,223,202,1 - call L099key_256a_cold + call L109key_256a_cold .byte 102,15,58,223,200,1 - call L100key_256b + call L110key_256b .byte 102,15,58,223,202,2 - call L101key_256a + call L111key_256a .byte 102,15,58,223,200,2 - call L100key_256b + call L110key_256b .byte 102,15,58,223,202,4 - call L101key_256a + call L111key_256a .byte 102,15,58,223,200,4 - call L100key_256b + call L110key_256b .byte 102,15,58,223,202,8 - call L101key_256a + call L111key_256a .byte 102,15,58,223,200,8 - call L100key_256b + call L110key_256b .byte 102,15,58,223,202,16 - call L101key_256a + call L111key_256a .byte 102,15,58,223,200,16 - call L100key_256b + call L110key_256b .byte 102,15,58,223,202,32 - call L101key_256a + call L111key_256a .byte 102,15,58,223,200,32 - call L100key_256b + call L110key_256b .byte 102,15,58,223,202,64 - call L101key_256a + call L111key_256a movups %xmm0,(%edx) movl %ecx,16(%edx) xorl %eax,%eax - ret + jmp L100good_key .align 4,0x90 -L101key_256a: +L111key_256a: movups %xmm2,(%edx) leal 16(%edx),%edx -L099key_256a_cold: +L109key_256a_cold: shufps $16,%xmm0,%xmm4 xorps %xmm4,%xmm0 shufps $140,%xmm0,%xmm4 @@ -2087,7 +2273,7 @@ L099key_256a_cold: xorps %xmm1,%xmm0 ret .align 4,0x90 -L100key_256b: +L110key_256b: movups %xmm0,(%edx) leal 16(%edx),%edx shufps $16,%xmm2,%xmm4 @@ -2097,13 +2283,70 @@ L100key_256b: shufps $170,%xmm1,%xmm1 xorps %xmm1,%xmm2 ret +.align 4,0x90 +L10814rounds_alt: + movdqa (%ebx),%xmm5 + movdqa 32(%ebx),%xmm4 + movl $7,%ecx + movdqu %xmm0,-32(%edx) + movdqa %xmm2,%xmm1 + movdqu %xmm2,-16(%edx) +L112loop_key256: +.byte 102,15,56,0,213 +.byte 102,15,56,221,212 + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + pslld $1,%xmm4 + pxor %xmm2,%xmm0 
+ movdqu %xmm0,(%edx) + decl %ecx + jz L113done_key256 + pshufd $255,%xmm0,%xmm2 + pxor %xmm3,%xmm3 +.byte 102,15,56,221,211 + movdqa %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm3,%xmm1 + pxor %xmm1,%xmm2 + movdqu %xmm2,16(%edx) + leal 32(%edx),%edx + movdqa %xmm2,%xmm1 + jmp L112loop_key256 +L113done_key256: + movl $13,%ecx + movl %ecx,16(%edx) +L100good_key: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + xorl %eax,%eax + popl %ebx + popl %ebp + ret .align 2,0x90 -L088bad_pointer: +L091bad_pointer: movl $-1,%eax + popl %ebx + popl %ebp ret .align 2,0x90 -L091bad_keybits: +L095bad_keybits: + pxor %xmm0,%xmm0 movl $-2,%eax + popl %ebx + popl %ebp ret .globl _aesni_set_encrypt_key .align 4 @@ -2125,7 +2368,7 @@ L_aesni_set_decrypt_key_begin: movl 12(%esp),%edx shll $4,%ecx testl %eax,%eax - jnz L102dec_key_ret + jnz L114dec_key_ret leal 16(%edx,%ecx,1),%eax movups (%edx),%xmm0 movups (%eax),%xmm1 @@ -2133,7 +2376,7 @@ L_aesni_set_decrypt_key_begin: movups %xmm1,(%edx) leal 16(%edx),%edx leal -16(%eax),%eax -L103dec_key_inverse: +L115dec_key_inverse: movups (%edx),%xmm0 movups (%eax),%xmm1 .byte 102,15,56,219,192 @@ -2143,14 +2386,27 @@ L103dec_key_inverse: movups %xmm0,16(%eax) movups %xmm1,-16(%edx) cmpl %edx,%eax - ja L103dec_key_inverse + ja L115dec_key_inverse movups (%edx),%xmm0 .byte 102,15,56,219,192 movups %xmm0,(%edx) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 xorl %eax,%eax -L102dec_key_ret: +L114dec_key_ret: ret +.align 6,0x90 +Lkey_const: +.long 202313229,202313229,202313229,202313229 +.long 67569157,67569157,67569157,67569157 +.long 1,1,1,1 +.long 27,27,27,27 .byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69 .byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83 .byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 .byte 115,108,46,111,114,103,62,0 +.section __IMPORT,__pointers,non_lazy_symbol_pointers 
+L_OPENSSL_ia32cap_P$non_lazy_ptr: +.indirect_symbol _OPENSSL_ia32cap_P +.long 0 +.comm _OPENSSL_ia32cap_P,16,2 diff --git a/deps/openssl/asm_obsolete/x86-win32-masm/aes/aesni-x86.asm b/deps/openssl/asm_obsolete/x86-win32-masm/aes/aesni-x86.asm index 43fdb5a034..6511c21bcf 100644 --- a/deps/openssl/asm_obsolete/x86-win32-masm/aes/aesni-x86.asm +++ b/deps/openssl/asm_obsolete/x86-win32-masm/aes/aesni-x86.asm @@ -17,6 +17,7 @@ IF @Version LT 800 ELSE .text$ SEGMENT ALIGN(64) 'CODE' ENDIF +;EXTERN _OPENSSL_ia32cap_P:NEAR ALIGN 16 _aesni_encrypt PROC PUBLIC $L_aesni_encrypt_begin:: @@ -36,7 +37,10 @@ DB 102,15,56,220,209 lea edx,DWORD PTR 16[edx] jnz $L000enc1_loop_1 DB 102,15,56,221,209 + pxor xmm0,xmm0 + pxor xmm1,xmm1 movups XMMWORD PTR [eax],xmm2 + pxor xmm2,xmm2 ret _aesni_encrypt ENDP ALIGN 16 @@ -58,7 +62,10 @@ DB 102,15,56,222,209 lea edx,DWORD PTR 16[edx] jnz $L001dec1_loop_2 DB 102,15,56,223,209 + pxor xmm0,xmm0 + pxor xmm1,xmm1 movups XMMWORD PTR [eax],xmm2 + pxor xmm2,xmm2 ret _aesni_decrypt ENDP ALIGN 16 @@ -265,17 +272,15 @@ DB 102,15,56,220,217 neg ecx DB 102,15,56,220,225 pxor xmm7,xmm0 + movups xmm0,XMMWORD PTR [ecx*1+edx] add ecx,16 -DB 102,15,56,220,233 -DB 102,15,56,220,241 -DB 102,15,56,220,249 - movups xmm0,XMMWORD PTR [ecx*1+edx-16] - jmp $L_aesni_encrypt6_enter + jmp $L008_aesni_encrypt6_inner ALIGN 16 -$L008enc6_loop: +$L009enc6_loop: DB 102,15,56,220,209 DB 102,15,56,220,217 DB 102,15,56,220,225 +$L008_aesni_encrypt6_inner: DB 102,15,56,220,233 DB 102,15,56,220,241 DB 102,15,56,220,249 @@ -289,7 +294,7 @@ DB 102,15,56,220,232 DB 102,15,56,220,240 DB 102,15,56,220,248 movups xmm0,XMMWORD PTR [ecx*1+edx-16] - jnz $L008enc6_loop + jnz $L009enc6_loop DB 102,15,56,220,209 DB 102,15,56,220,217 DB 102,15,56,220,225 @@ -320,17 +325,15 @@ DB 102,15,56,222,217 neg ecx DB 102,15,56,222,225 pxor xmm7,xmm0 + movups xmm0,XMMWORD PTR [ecx*1+edx] add ecx,16 -DB 102,15,56,222,233 -DB 102,15,56,222,241 -DB 102,15,56,222,249 - movups xmm0,XMMWORD PTR 
[ecx*1+edx-16] - jmp $L_aesni_decrypt6_enter + jmp $L010_aesni_decrypt6_inner ALIGN 16 -$L009dec6_loop: +$L011dec6_loop: DB 102,15,56,222,209 DB 102,15,56,222,217 DB 102,15,56,222,225 +$L010_aesni_decrypt6_inner: DB 102,15,56,222,233 DB 102,15,56,222,241 DB 102,15,56,222,249 @@ -344,7 +347,7 @@ DB 102,15,56,222,232 DB 102,15,56,222,240 DB 102,15,56,222,248 movups xmm0,XMMWORD PTR [ecx*1+edx-16] - jnz $L009dec6_loop + jnz $L011dec6_loop DB 102,15,56,222,209 DB 102,15,56,222,217 DB 102,15,56,222,225 @@ -372,14 +375,14 @@ $L_aesni_ecb_encrypt_begin:: mov edx,DWORD PTR 32[esp] mov ebx,DWORD PTR 36[esp] and eax,-16 - jz $L010ecb_ret + jz $L012ecb_ret mov ecx,DWORD PTR 240[edx] test ebx,ebx - jz $L011ecb_decrypt + jz $L013ecb_decrypt mov ebp,edx mov ebx,ecx cmp eax,96 - jb $L012ecb_enc_tail + jb $L014ecb_enc_tail movdqu xmm2,XMMWORD PTR [esi] movdqu xmm3,XMMWORD PTR 16[esi] movdqu xmm4,XMMWORD PTR 32[esi] @@ -388,9 +391,9 @@ $L_aesni_ecb_encrypt_begin:: movdqu xmm7,XMMWORD PTR 80[esi] lea esi,DWORD PTR 96[esi] sub eax,96 - jmp $L013ecb_enc_loop6_enter + jmp $L015ecb_enc_loop6_enter ALIGN 16 -$L014ecb_enc_loop6: +$L016ecb_enc_loop6: movups XMMWORD PTR [edi],xmm2 movdqu xmm2,XMMWORD PTR [esi] movups XMMWORD PTR 16[edi],xmm3 @@ -405,12 +408,12 @@ $L014ecb_enc_loop6: lea edi,DWORD PTR 96[edi] movdqu xmm7,XMMWORD PTR 80[esi] lea esi,DWORD PTR 96[esi] -$L013ecb_enc_loop6_enter: +$L015ecb_enc_loop6_enter: call __aesni_encrypt6 mov edx,ebp mov ecx,ebx sub eax,96 - jnc $L014ecb_enc_loop6 + jnc $L016ecb_enc_loop6 movups XMMWORD PTR [edi],xmm2 movups XMMWORD PTR 16[edi],xmm3 movups XMMWORD PTR 32[edi],xmm4 @@ -419,18 +422,18 @@ $L013ecb_enc_loop6_enter: movups XMMWORD PTR 80[edi],xmm7 lea edi,DWORD PTR 96[edi] add eax,96 - jz $L010ecb_ret -$L012ecb_enc_tail: + jz $L012ecb_ret +$L014ecb_enc_tail: movups xmm2,XMMWORD PTR [esi] cmp eax,32 - jb $L015ecb_enc_one + jb $L017ecb_enc_one movups xmm3,XMMWORD PTR 16[esi] - je $L016ecb_enc_two + je $L018ecb_enc_two movups xmm4,XMMWORD PTR 
32[esi] cmp eax,64 - jb $L017ecb_enc_three + jb $L019ecb_enc_three movups xmm5,XMMWORD PTR 48[esi] - je $L018ecb_enc_four + je $L020ecb_enc_four movups xmm6,XMMWORD PTR 64[esi] xorps xmm7,xmm7 call __aesni_encrypt6 @@ -439,49 +442,49 @@ $L012ecb_enc_tail: movups XMMWORD PTR 32[edi],xmm4 movups XMMWORD PTR 48[edi],xmm5 movups XMMWORD PTR 64[edi],xmm6 - jmp $L010ecb_ret + jmp $L012ecb_ret ALIGN 16 -$L015ecb_enc_one: +$L017ecb_enc_one: movups xmm0,XMMWORD PTR [edx] movups xmm1,XMMWORD PTR 16[edx] lea edx,DWORD PTR 32[edx] xorps xmm2,xmm0 -$L019enc1_loop_3: +$L021enc1_loop_3: DB 102,15,56,220,209 dec ecx movups xmm1,XMMWORD PTR [edx] lea edx,DWORD PTR 16[edx] - jnz $L019enc1_loop_3 + jnz $L021enc1_loop_3 DB 102,15,56,221,209 movups XMMWORD PTR [edi],xmm2 - jmp $L010ecb_ret + jmp $L012ecb_ret ALIGN 16 -$L016ecb_enc_two: +$L018ecb_enc_two: call __aesni_encrypt2 movups XMMWORD PTR [edi],xmm2 movups XMMWORD PTR 16[edi],xmm3 - jmp $L010ecb_ret + jmp $L012ecb_ret ALIGN 16 -$L017ecb_enc_three: +$L019ecb_enc_three: call __aesni_encrypt3 movups XMMWORD PTR [edi],xmm2 movups XMMWORD PTR 16[edi],xmm3 movups XMMWORD PTR 32[edi],xmm4 - jmp $L010ecb_ret + jmp $L012ecb_ret ALIGN 16 -$L018ecb_enc_four: +$L020ecb_enc_four: call __aesni_encrypt4 movups XMMWORD PTR [edi],xmm2 movups XMMWORD PTR 16[edi],xmm3 movups XMMWORD PTR 32[edi],xmm4 movups XMMWORD PTR 48[edi],xmm5 - jmp $L010ecb_ret + jmp $L012ecb_ret ALIGN 16 -$L011ecb_decrypt: +$L013ecb_decrypt: mov ebp,edx mov ebx,ecx cmp eax,96 - jb $L020ecb_dec_tail + jb $L022ecb_dec_tail movdqu xmm2,XMMWORD PTR [esi] movdqu xmm3,XMMWORD PTR 16[esi] movdqu xmm4,XMMWORD PTR 32[esi] @@ -490,9 +493,9 @@ $L011ecb_decrypt: movdqu xmm7,XMMWORD PTR 80[esi] lea esi,DWORD PTR 96[esi] sub eax,96 - jmp $L021ecb_dec_loop6_enter + jmp $L023ecb_dec_loop6_enter ALIGN 16 -$L022ecb_dec_loop6: +$L024ecb_dec_loop6: movups XMMWORD PTR [edi],xmm2 movdqu xmm2,XMMWORD PTR [esi] movups XMMWORD PTR 16[edi],xmm3 @@ -507,12 +510,12 @@ $L022ecb_dec_loop6: lea edi,DWORD 
PTR 96[edi] movdqu xmm7,XMMWORD PTR 80[esi] lea esi,DWORD PTR 96[esi] -$L021ecb_dec_loop6_enter: +$L023ecb_dec_loop6_enter: call __aesni_decrypt6 mov edx,ebp mov ecx,ebx sub eax,96 - jnc $L022ecb_dec_loop6 + jnc $L024ecb_dec_loop6 movups XMMWORD PTR [edi],xmm2 movups XMMWORD PTR 16[edi],xmm3 movups XMMWORD PTR 32[edi],xmm4 @@ -521,18 +524,18 @@ $L021ecb_dec_loop6_enter: movups XMMWORD PTR 80[edi],xmm7 lea edi,DWORD PTR 96[edi] add eax,96 - jz $L010ecb_ret -$L020ecb_dec_tail: + jz $L012ecb_ret +$L022ecb_dec_tail: movups xmm2,XMMWORD PTR [esi] cmp eax,32 - jb $L023ecb_dec_one + jb $L025ecb_dec_one movups xmm3,XMMWORD PTR 16[esi] - je $L024ecb_dec_two + je $L026ecb_dec_two movups xmm4,XMMWORD PTR 32[esi] cmp eax,64 - jb $L025ecb_dec_three + jb $L027ecb_dec_three movups xmm5,XMMWORD PTR 48[esi] - je $L026ecb_dec_four + je $L028ecb_dec_four movups xmm6,XMMWORD PTR 64[esi] xorps xmm7,xmm7 call __aesni_decrypt6 @@ -541,43 +544,51 @@ $L020ecb_dec_tail: movups XMMWORD PTR 32[edi],xmm4 movups XMMWORD PTR 48[edi],xmm5 movups XMMWORD PTR 64[edi],xmm6 - jmp $L010ecb_ret + jmp $L012ecb_ret ALIGN 16 -$L023ecb_dec_one: +$L025ecb_dec_one: movups xmm0,XMMWORD PTR [edx] movups xmm1,XMMWORD PTR 16[edx] lea edx,DWORD PTR 32[edx] xorps xmm2,xmm0 -$L027dec1_loop_4: +$L029dec1_loop_4: DB 102,15,56,222,209 dec ecx movups xmm1,XMMWORD PTR [edx] lea edx,DWORD PTR 16[edx] - jnz $L027dec1_loop_4 + jnz $L029dec1_loop_4 DB 102,15,56,223,209 movups XMMWORD PTR [edi],xmm2 - jmp $L010ecb_ret + jmp $L012ecb_ret ALIGN 16 -$L024ecb_dec_two: +$L026ecb_dec_two: call __aesni_decrypt2 movups XMMWORD PTR [edi],xmm2 movups XMMWORD PTR 16[edi],xmm3 - jmp $L010ecb_ret + jmp $L012ecb_ret ALIGN 16 -$L025ecb_dec_three: +$L027ecb_dec_three: call __aesni_decrypt3 movups XMMWORD PTR [edi],xmm2 movups XMMWORD PTR 16[edi],xmm3 movups XMMWORD PTR 32[edi],xmm4 - jmp $L010ecb_ret + jmp $L012ecb_ret ALIGN 16 -$L026ecb_dec_four: +$L028ecb_dec_four: call __aesni_decrypt4 movups XMMWORD PTR [edi],xmm2 movups XMMWORD PTR 
16[edi],xmm3 movups XMMWORD PTR 32[edi],xmm4 movups XMMWORD PTR 48[edi],xmm5 -$L010ecb_ret: +$L012ecb_ret: + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + pxor xmm6,xmm6 + pxor xmm7,xmm7 pop edi pop esi pop ebx @@ -622,7 +633,7 @@ $L_aesni_ccm64_encrypt_blocks_begin:: lea edx,DWORD PTR 32[ecx*1+edx] sub ebx,ecx DB 102,15,56,0,253 -$L028ccm64_enc_outer: +$L030ccm64_enc_outer: movups xmm0,XMMWORD PTR [ebp] mov ecx,ebx movups xmm6,XMMWORD PTR [esi] @@ -631,7 +642,7 @@ $L028ccm64_enc_outer: xorps xmm0,xmm6 xorps xmm3,xmm0 movups xmm0,XMMWORD PTR 32[ebp] -$L029ccm64_enc2_loop: +$L031ccm64_enc2_loop: DB 102,15,56,220,209 DB 102,15,56,220,217 movups xmm1,XMMWORD PTR [ecx*1+edx] @@ -639,7 +650,7 @@ DB 102,15,56,220,217 DB 102,15,56,220,208 DB 102,15,56,220,216 movups xmm0,XMMWORD PTR [ecx*1+edx-16] - jnz $L029ccm64_enc2_loop + jnz $L031ccm64_enc2_loop DB 102,15,56,220,209 DB 102,15,56,220,217 paddq xmm7,XMMWORD PTR 16[esp] @@ -652,10 +663,18 @@ DB 102,15,56,221,216 movups XMMWORD PTR [edi],xmm6 DB 102,15,56,0,213 lea edi,DWORD PTR 16[edi] - jnz $L028ccm64_enc_outer + jnz $L030ccm64_enc_outer mov esp,DWORD PTR 48[esp] mov edi,DWORD PTR 40[esp] movups XMMWORD PTR [edi],xmm3 + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + pxor xmm6,xmm6 + pxor xmm7,xmm7 pop edi pop esi pop ebx @@ -701,12 +720,12 @@ DB 102,15,56,0,253 movups xmm1,XMMWORD PTR 16[edx] lea edx,DWORD PTR 32[edx] xorps xmm2,xmm0 -$L030enc1_loop_5: +$L032enc1_loop_5: DB 102,15,56,220,209 dec ecx movups xmm1,XMMWORD PTR [edx] lea edx,DWORD PTR 16[edx] - jnz $L030enc1_loop_5 + jnz $L032enc1_loop_5 DB 102,15,56,221,209 shl ebx,4 mov ecx,16 @@ -716,16 +735,16 @@ DB 102,15,56,221,209 sub ecx,ebx lea edx,DWORD PTR 32[ebx*1+ebp] mov ebx,ecx - jmp $L031ccm64_dec_outer + jmp $L033ccm64_dec_outer ALIGN 16 -$L031ccm64_dec_outer: +$L033ccm64_dec_outer: xorps xmm6,xmm2 movdqa xmm2,xmm7 movups XMMWORD PTR [edi],xmm6 lea 
edi,DWORD PTR 16[edi] DB 102,15,56,0,213 sub eax,1 - jz $L032ccm64_dec_break + jz $L034ccm64_dec_break movups xmm0,XMMWORD PTR [ebp] mov ecx,ebx movups xmm1,XMMWORD PTR 16[ebp] @@ -733,7 +752,7 @@ DB 102,15,56,0,213 xorps xmm2,xmm0 xorps xmm3,xmm6 movups xmm0,XMMWORD PTR 32[ebp] -$L033ccm64_dec2_loop: +$L035ccm64_dec2_loop: DB 102,15,56,220,209 DB 102,15,56,220,217 movups xmm1,XMMWORD PTR [ecx*1+edx] @@ -741,7 +760,7 @@ DB 102,15,56,220,217 DB 102,15,56,220,208 DB 102,15,56,220,216 movups xmm0,XMMWORD PTR [ecx*1+edx-16] - jnz $L033ccm64_dec2_loop + jnz $L035ccm64_dec2_loop movups xmm6,XMMWORD PTR [esi] paddq xmm7,XMMWORD PTR 16[esp] DB 102,15,56,220,209 @@ -749,9 +768,9 @@ DB 102,15,56,220,217 DB 102,15,56,221,208 DB 102,15,56,221,216 lea esi,QWORD PTR 16[esi] - jmp $L031ccm64_dec_outer + jmp $L033ccm64_dec_outer ALIGN 16 -$L032ccm64_dec_break: +$L034ccm64_dec_break: mov ecx,DWORD PTR 240[ebp] mov edx,ebp movups xmm0,XMMWORD PTR [edx] @@ -759,16 +778,24 @@ $L032ccm64_dec_break: xorps xmm6,xmm0 lea edx,DWORD PTR 32[edx] xorps xmm3,xmm6 -$L034enc1_loop_6: +$L036enc1_loop_6: DB 102,15,56,220,217 dec ecx movups xmm1,XMMWORD PTR [edx] lea edx,DWORD PTR 16[edx] - jnz $L034enc1_loop_6 + jnz $L036enc1_loop_6 DB 102,15,56,221,217 mov esp,DWORD PTR 48[esp] mov edi,DWORD PTR 40[esp] movups XMMWORD PTR [edi],xmm3 + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + pxor xmm6,xmm6 + pxor xmm7,xmm7 pop edi pop esi pop ebx @@ -792,7 +819,7 @@ $L_aesni_ctr32_encrypt_blocks_begin:: and esp,-16 mov DWORD PTR 80[esp],ebp cmp eax,1 - je $L035ctr32_one_shortcut + je $L037ctr32_one_shortcut movdqu xmm7,XMMWORD PTR [ebx] mov DWORD PTR [esp],202182159 mov DWORD PTR 4[esp],134810123 @@ -830,7 +857,7 @@ DB 102,15,56,0,202 pshufd xmm2,xmm0,192 pshufd xmm3,xmm0,128 cmp eax,6 - jb $L036ctr32_tail + jb $L038ctr32_tail pxor xmm7,xmm6 shl ecx,4 mov ebx,16 @@ -839,9 +866,9 @@ DB 102,15,56,0,202 sub ebx,ecx lea edx,DWORD PTR 32[ecx*1+edx] sub eax,6 
- jmp $L037ctr32_loop6 + jmp $L039ctr32_loop6 ALIGN 16 -$L037ctr32_loop6: +$L039ctr32_loop6: pshufd xmm4,xmm0,64 movdqa xmm0,XMMWORD PTR 32[esp] pshufd xmm5,xmm1,192 @@ -895,27 +922,27 @@ DB 102,15,56,0,202 lea edi,DWORD PTR 96[edi] pshufd xmm3,xmm0,128 sub eax,6 - jnc $L037ctr32_loop6 + jnc $L039ctr32_loop6 add eax,6 - jz $L038ctr32_ret + jz $L040ctr32_ret movdqu xmm7,XMMWORD PTR [ebp] mov edx,ebp pxor xmm7,XMMWORD PTR 32[esp] mov ecx,DWORD PTR 240[ebp] -$L036ctr32_tail: +$L038ctr32_tail: por xmm2,xmm7 cmp eax,2 - jb $L039ctr32_one + jb $L041ctr32_one pshufd xmm4,xmm0,64 por xmm3,xmm7 - je $L040ctr32_two + je $L042ctr32_two pshufd xmm5,xmm1,192 por xmm4,xmm7 cmp eax,4 - jb $L041ctr32_three + jb $L043ctr32_three pshufd xmm6,xmm1,128 por xmm5,xmm7 - je $L042ctr32_four + je $L044ctr32_four por xmm6,xmm7 call __aesni_encrypt6 movups xmm1,XMMWORD PTR [esi] @@ -933,29 +960,29 @@ $L036ctr32_tail: movups XMMWORD PTR 32[edi],xmm4 movups XMMWORD PTR 48[edi],xmm5 movups XMMWORD PTR 64[edi],xmm6 - jmp $L038ctr32_ret + jmp $L040ctr32_ret ALIGN 16 -$L035ctr32_one_shortcut: +$L037ctr32_one_shortcut: movups xmm2,XMMWORD PTR [ebx] mov ecx,DWORD PTR 240[edx] -$L039ctr32_one: +$L041ctr32_one: movups xmm0,XMMWORD PTR [edx] movups xmm1,XMMWORD PTR 16[edx] lea edx,DWORD PTR 32[edx] xorps xmm2,xmm0 -$L043enc1_loop_7: +$L045enc1_loop_7: DB 102,15,56,220,209 dec ecx movups xmm1,XMMWORD PTR [edx] lea edx,DWORD PTR 16[edx] - jnz $L043enc1_loop_7 + jnz $L045enc1_loop_7 DB 102,15,56,221,209 movups xmm6,XMMWORD PTR [esi] xorps xmm6,xmm2 movups XMMWORD PTR [edi],xmm6 - jmp $L038ctr32_ret + jmp $L040ctr32_ret ALIGN 16 -$L040ctr32_two: +$L042ctr32_two: call __aesni_encrypt2 movups xmm5,XMMWORD PTR [esi] movups xmm6,XMMWORD PTR 16[esi] @@ -963,9 +990,9 @@ $L040ctr32_two: xorps xmm3,xmm6 movups XMMWORD PTR [edi],xmm2 movups XMMWORD PTR 16[edi],xmm3 - jmp $L038ctr32_ret + jmp $L040ctr32_ret ALIGN 16 -$L041ctr32_three: +$L043ctr32_three: call __aesni_encrypt3 movups xmm5,XMMWORD PTR [esi] movups 
xmm6,XMMWORD PTR 16[esi] @@ -976,9 +1003,9 @@ $L041ctr32_three: xorps xmm4,xmm7 movups XMMWORD PTR 16[edi],xmm3 movups XMMWORD PTR 32[edi],xmm4 - jmp $L038ctr32_ret + jmp $L040ctr32_ret ALIGN 16 -$L042ctr32_four: +$L044ctr32_four: call __aesni_encrypt4 movups xmm6,XMMWORD PTR [esi] movups xmm7,XMMWORD PTR 16[esi] @@ -992,7 +1019,18 @@ $L042ctr32_four: xorps xmm5,xmm0 movups XMMWORD PTR 32[edi],xmm4 movups XMMWORD PTR 48[edi],xmm5 -$L038ctr32_ret: +$L040ctr32_ret: + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + movdqa XMMWORD PTR 32[esp],xmm0 + pxor xmm5,xmm5 + movdqa XMMWORD PTR 48[esp],xmm0 + pxor xmm6,xmm6 + movdqa XMMWORD PTR 64[esp],xmm0 + pxor xmm7,xmm7 mov esp,DWORD PTR 80[esp] pop edi pop esi @@ -1015,12 +1053,12 @@ $L_aesni_xts_encrypt_begin:: movups xmm1,XMMWORD PTR 16[edx] lea edx,DWORD PTR 32[edx] xorps xmm2,xmm0 -$L044enc1_loop_8: +$L046enc1_loop_8: DB 102,15,56,220,209 dec ecx movups xmm1,XMMWORD PTR [edx] lea edx,DWORD PTR 16[edx] - jnz $L044enc1_loop_8 + jnz $L046enc1_loop_8 DB 102,15,56,221,209 mov esi,DWORD PTR 20[esp] mov edi,DWORD PTR 24[esp] @@ -1044,14 +1082,14 @@ DB 102,15,56,221,209 mov ebp,edx mov ebx,ecx sub eax,96 - jc $L045xts_enc_short + jc $L047xts_enc_short shl ecx,4 mov ebx,16 sub ebx,ecx lea edx,DWORD PTR 32[ecx*1+edx] - jmp $L046xts_enc_loop6 + jmp $L048xts_enc_loop6 ALIGN 16 -$L046xts_enc_loop6: +$L048xts_enc_loop6: pshufd xmm2,xmm0,19 pxor xmm0,xmm0 movdqa XMMWORD PTR [esp],xmm1 @@ -1140,23 +1178,23 @@ DB 102,15,56,220,249 pcmpgtd xmm0,xmm1 pxor xmm1,xmm2 sub eax,96 - jnc $L046xts_enc_loop6 + jnc $L048xts_enc_loop6 mov ecx,DWORD PTR 240[ebp] mov edx,ebp mov ebx,ecx -$L045xts_enc_short: +$L047xts_enc_short: add eax,96 - jz $L047xts_enc_done6x + jz $L049xts_enc_done6x movdqa xmm5,xmm1 cmp eax,32 - jb $L048xts_enc_one + jb $L050xts_enc_one pshufd xmm2,xmm0,19 pxor xmm0,xmm0 paddq xmm1,xmm1 pand xmm2,xmm3 pcmpgtd xmm0,xmm1 pxor xmm1,xmm2 - je $L049xts_enc_two + je $L051xts_enc_two pshufd 
xmm2,xmm0,19 pxor xmm0,xmm0 movdqa xmm6,xmm1 @@ -1165,7 +1203,7 @@ $L045xts_enc_short: pcmpgtd xmm0,xmm1 pxor xmm1,xmm2 cmp eax,64 - jb $L050xts_enc_three + jb $L052xts_enc_three pshufd xmm2,xmm0,19 pxor xmm0,xmm0 movdqa xmm7,xmm1 @@ -1175,7 +1213,7 @@ $L045xts_enc_short: pxor xmm1,xmm2 movdqa XMMWORD PTR [esp],xmm5 movdqa XMMWORD PTR 16[esp],xmm6 - je $L051xts_enc_four + je $L053xts_enc_four movdqa XMMWORD PTR 32[esp],xmm7 pshufd xmm7,xmm0,19 movdqa XMMWORD PTR 48[esp],xmm1 @@ -1207,9 +1245,9 @@ $L045xts_enc_short: movups XMMWORD PTR 48[edi],xmm5 movups XMMWORD PTR 64[edi],xmm6 lea edi,DWORD PTR 80[edi] - jmp $L052xts_enc_done + jmp $L054xts_enc_done ALIGN 16 -$L048xts_enc_one: +$L050xts_enc_one: movups xmm2,XMMWORD PTR [esi] lea esi,DWORD PTR 16[esi] xorps xmm2,xmm5 @@ -1217,20 +1255,20 @@ $L048xts_enc_one: movups xmm1,XMMWORD PTR 16[edx] lea edx,DWORD PTR 32[edx] xorps xmm2,xmm0 -$L053enc1_loop_9: +$L055enc1_loop_9: DB 102,15,56,220,209 dec ecx movups xmm1,XMMWORD PTR [edx] lea edx,DWORD PTR 16[edx] - jnz $L053enc1_loop_9 + jnz $L055enc1_loop_9 DB 102,15,56,221,209 xorps xmm2,xmm5 movups XMMWORD PTR [edi],xmm2 lea edi,DWORD PTR 16[edi] movdqa xmm1,xmm5 - jmp $L052xts_enc_done + jmp $L054xts_enc_done ALIGN 16 -$L049xts_enc_two: +$L051xts_enc_two: movaps xmm6,xmm1 movups xmm2,XMMWORD PTR [esi] movups xmm3,XMMWORD PTR 16[esi] @@ -1244,9 +1282,9 @@ $L049xts_enc_two: movups XMMWORD PTR 16[edi],xmm3 lea edi,DWORD PTR 32[edi] movdqa xmm1,xmm6 - jmp $L052xts_enc_done + jmp $L054xts_enc_done ALIGN 16 -$L050xts_enc_three: +$L052xts_enc_three: movaps xmm7,xmm1 movups xmm2,XMMWORD PTR [esi] movups xmm3,XMMWORD PTR 16[esi] @@ -1264,9 +1302,9 @@ $L050xts_enc_three: movups XMMWORD PTR 32[edi],xmm4 lea edi,DWORD PTR 48[edi] movdqa xmm1,xmm7 - jmp $L052xts_enc_done + jmp $L054xts_enc_done ALIGN 16 -$L051xts_enc_four: +$L053xts_enc_four: movaps xmm6,xmm1 movups xmm2,XMMWORD PTR [esi] movups xmm3,XMMWORD PTR 16[esi] @@ -1288,28 +1326,28 @@ $L051xts_enc_four: movups XMMWORD PTR 
48[edi],xmm5 lea edi,DWORD PTR 64[edi] movdqa xmm1,xmm6 - jmp $L052xts_enc_done + jmp $L054xts_enc_done ALIGN 16 -$L047xts_enc_done6x: +$L049xts_enc_done6x: mov eax,DWORD PTR 112[esp] and eax,15 - jz $L054xts_enc_ret + jz $L056xts_enc_ret movdqa xmm5,xmm1 mov DWORD PTR 112[esp],eax - jmp $L055xts_enc_steal + jmp $L057xts_enc_steal ALIGN 16 -$L052xts_enc_done: +$L054xts_enc_done: mov eax,DWORD PTR 112[esp] pxor xmm0,xmm0 and eax,15 - jz $L054xts_enc_ret + jz $L056xts_enc_ret pcmpgtd xmm0,xmm1 mov DWORD PTR 112[esp],eax pshufd xmm5,xmm0,19 paddq xmm1,xmm1 pand xmm5,XMMWORD PTR 96[esp] pxor xmm5,xmm1 -$L055xts_enc_steal: +$L057xts_enc_steal: movzx ecx,BYTE PTR [esi] movzx edx,BYTE PTR [edi-16] lea esi,DWORD PTR 1[esi] @@ -1317,7 +1355,7 @@ $L055xts_enc_steal: mov BYTE PTR [edi],dl lea edi,DWORD PTR 1[edi] sub eax,1 - jnz $L055xts_enc_steal + jnz $L057xts_enc_steal sub edi,DWORD PTR 112[esp] mov edx,ebp mov ecx,ebx @@ -1327,16 +1365,30 @@ $L055xts_enc_steal: movups xmm1,XMMWORD PTR 16[edx] lea edx,DWORD PTR 32[edx] xorps xmm2,xmm0 -$L056enc1_loop_10: +$L058enc1_loop_10: DB 102,15,56,220,209 dec ecx movups xmm1,XMMWORD PTR [edx] lea edx,DWORD PTR 16[edx] - jnz $L056enc1_loop_10 + jnz $L058enc1_loop_10 DB 102,15,56,221,209 xorps xmm2,xmm5 movups XMMWORD PTR [edi-16],xmm2 -$L054xts_enc_ret: +$L056xts_enc_ret: + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + movdqa XMMWORD PTR [esp],xmm0 + pxor xmm3,xmm3 + movdqa XMMWORD PTR 16[esp],xmm0 + pxor xmm4,xmm4 + movdqa XMMWORD PTR 32[esp],xmm0 + pxor xmm5,xmm5 + movdqa XMMWORD PTR 48[esp],xmm0 + pxor xmm6,xmm6 + movdqa XMMWORD PTR 64[esp],xmm0 + pxor xmm7,xmm7 + movdqa XMMWORD PTR 80[esp],xmm0 mov esp,DWORD PTR 116[esp] pop edi pop esi @@ -1359,12 +1411,12 @@ $L_aesni_xts_decrypt_begin:: movups xmm1,XMMWORD PTR 16[edx] lea edx,DWORD PTR 32[edx] xorps xmm2,xmm0 -$L057enc1_loop_11: +$L059enc1_loop_11: DB 102,15,56,220,209 dec ecx movups xmm1,XMMWORD PTR [edx] lea edx,DWORD PTR 16[edx] - jnz $L057enc1_loop_11 + jnz 
$L059enc1_loop_11 DB 102,15,56,221,209 mov esi,DWORD PTR 20[esp] mov edi,DWORD PTR 24[esp] @@ -1393,14 +1445,14 @@ DB 102,15,56,221,209 pcmpgtd xmm0,xmm1 and eax,-16 sub eax,96 - jc $L058xts_dec_short + jc $L060xts_dec_short shl ecx,4 mov ebx,16 sub ebx,ecx lea edx,DWORD PTR 32[ecx*1+edx] - jmp $L059xts_dec_loop6 + jmp $L061xts_dec_loop6 ALIGN 16 -$L059xts_dec_loop6: +$L061xts_dec_loop6: pshufd xmm2,xmm0,19 pxor xmm0,xmm0 movdqa XMMWORD PTR [esp],xmm1 @@ -1489,23 +1541,23 @@ DB 102,15,56,222,249 pcmpgtd xmm0,xmm1 pxor xmm1,xmm2 sub eax,96 - jnc $L059xts_dec_loop6 + jnc $L061xts_dec_loop6 mov ecx,DWORD PTR 240[ebp] mov edx,ebp mov ebx,ecx -$L058xts_dec_short: +$L060xts_dec_short: add eax,96 - jz $L060xts_dec_done6x + jz $L062xts_dec_done6x movdqa xmm5,xmm1 cmp eax,32 - jb $L061xts_dec_one + jb $L063xts_dec_one pshufd xmm2,xmm0,19 pxor xmm0,xmm0 paddq xmm1,xmm1 pand xmm2,xmm3 pcmpgtd xmm0,xmm1 pxor xmm1,xmm2 - je $L062xts_dec_two + je $L064xts_dec_two pshufd xmm2,xmm0,19 pxor xmm0,xmm0 movdqa xmm6,xmm1 @@ -1514,7 +1566,7 @@ $L058xts_dec_short: pcmpgtd xmm0,xmm1 pxor xmm1,xmm2 cmp eax,64 - jb $L063xts_dec_three + jb $L065xts_dec_three pshufd xmm2,xmm0,19 pxor xmm0,xmm0 movdqa xmm7,xmm1 @@ -1524,7 +1576,7 @@ $L058xts_dec_short: pxor xmm1,xmm2 movdqa XMMWORD PTR [esp],xmm5 movdqa XMMWORD PTR 16[esp],xmm6 - je $L064xts_dec_four + je $L066xts_dec_four movdqa XMMWORD PTR 32[esp],xmm7 pshufd xmm7,xmm0,19 movdqa XMMWORD PTR 48[esp],xmm1 @@ -1556,9 +1608,9 @@ $L058xts_dec_short: movups XMMWORD PTR 48[edi],xmm5 movups XMMWORD PTR 64[edi],xmm6 lea edi,DWORD PTR 80[edi] - jmp $L065xts_dec_done + jmp $L067xts_dec_done ALIGN 16 -$L061xts_dec_one: +$L063xts_dec_one: movups xmm2,XMMWORD PTR [esi] lea esi,DWORD PTR 16[esi] xorps xmm2,xmm5 @@ -1566,20 +1618,20 @@ $L061xts_dec_one: movups xmm1,XMMWORD PTR 16[edx] lea edx,DWORD PTR 32[edx] xorps xmm2,xmm0 -$L066dec1_loop_12: +$L068dec1_loop_12: DB 102,15,56,222,209 dec ecx movups xmm1,XMMWORD PTR [edx] lea edx,DWORD PTR 16[edx] - jnz 
$L066dec1_loop_12 + jnz $L068dec1_loop_12 DB 102,15,56,223,209 xorps xmm2,xmm5 movups XMMWORD PTR [edi],xmm2 lea edi,DWORD PTR 16[edi] movdqa xmm1,xmm5 - jmp $L065xts_dec_done + jmp $L067xts_dec_done ALIGN 16 -$L062xts_dec_two: +$L064xts_dec_two: movaps xmm6,xmm1 movups xmm2,XMMWORD PTR [esi] movups xmm3,XMMWORD PTR 16[esi] @@ -1593,9 +1645,9 @@ $L062xts_dec_two: movups XMMWORD PTR 16[edi],xmm3 lea edi,DWORD PTR 32[edi] movdqa xmm1,xmm6 - jmp $L065xts_dec_done + jmp $L067xts_dec_done ALIGN 16 -$L063xts_dec_three: +$L065xts_dec_three: movaps xmm7,xmm1 movups xmm2,XMMWORD PTR [esi] movups xmm3,XMMWORD PTR 16[esi] @@ -1613,9 +1665,9 @@ $L063xts_dec_three: movups XMMWORD PTR 32[edi],xmm4 lea edi,DWORD PTR 48[edi] movdqa xmm1,xmm7 - jmp $L065xts_dec_done + jmp $L067xts_dec_done ALIGN 16 -$L064xts_dec_four: +$L066xts_dec_four: movaps xmm6,xmm1 movups xmm2,XMMWORD PTR [esi] movups xmm3,XMMWORD PTR 16[esi] @@ -1637,20 +1689,20 @@ $L064xts_dec_four: movups XMMWORD PTR 48[edi],xmm5 lea edi,DWORD PTR 64[edi] movdqa xmm1,xmm6 - jmp $L065xts_dec_done + jmp $L067xts_dec_done ALIGN 16 -$L060xts_dec_done6x: +$L062xts_dec_done6x: mov eax,DWORD PTR 112[esp] and eax,15 - jz $L067xts_dec_ret + jz $L069xts_dec_ret mov DWORD PTR 112[esp],eax - jmp $L068xts_dec_only_one_more + jmp $L070xts_dec_only_one_more ALIGN 16 -$L065xts_dec_done: +$L067xts_dec_done: mov eax,DWORD PTR 112[esp] pxor xmm0,xmm0 and eax,15 - jz $L067xts_dec_ret + jz $L069xts_dec_ret pcmpgtd xmm0,xmm1 mov DWORD PTR 112[esp],eax pshufd xmm2,xmm0,19 @@ -1660,7 +1712,7 @@ $L065xts_dec_done: pand xmm2,xmm3 pcmpgtd xmm0,xmm1 pxor xmm1,xmm2 -$L068xts_dec_only_one_more: +$L070xts_dec_only_one_more: pshufd xmm5,xmm0,19 movdqa xmm6,xmm1 paddq xmm1,xmm1 @@ -1674,16 +1726,16 @@ $L068xts_dec_only_one_more: movups xmm1,XMMWORD PTR 16[edx] lea edx,DWORD PTR 32[edx] xorps xmm2,xmm0 -$L069dec1_loop_13: +$L071dec1_loop_13: DB 102,15,56,222,209 dec ecx movups xmm1,XMMWORD PTR [edx] lea edx,DWORD PTR 16[edx] - jnz $L069dec1_loop_13 + jnz 
$L071dec1_loop_13 DB 102,15,56,223,209 xorps xmm2,xmm5 movups XMMWORD PTR [edi],xmm2 -$L070xts_dec_steal: +$L072xts_dec_steal: movzx ecx,BYTE PTR 16[esi] movzx edx,BYTE PTR [edi] lea esi,DWORD PTR 1[esi] @@ -1691,7 +1743,7 @@ $L070xts_dec_steal: mov BYTE PTR 16[edi],dl lea edi,DWORD PTR 1[edi] sub eax,1 - jnz $L070xts_dec_steal + jnz $L072xts_dec_steal sub edi,DWORD PTR 112[esp] mov edx,ebp mov ecx,ebx @@ -1701,16 +1753,30 @@ $L070xts_dec_steal: movups xmm1,XMMWORD PTR 16[edx] lea edx,DWORD PTR 32[edx] xorps xmm2,xmm0 -$L071dec1_loop_14: +$L073dec1_loop_14: DB 102,15,56,222,209 dec ecx movups xmm1,XMMWORD PTR [edx] lea edx,DWORD PTR 16[edx] - jnz $L071dec1_loop_14 + jnz $L073dec1_loop_14 DB 102,15,56,223,209 xorps xmm2,xmm6 movups XMMWORD PTR [edi],xmm2 -$L067xts_dec_ret: +$L069xts_dec_ret: + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + movdqa XMMWORD PTR [esp],xmm0 + pxor xmm3,xmm3 + movdqa XMMWORD PTR 16[esp],xmm0 + pxor xmm4,xmm4 + movdqa XMMWORD PTR 32[esp],xmm0 + pxor xmm5,xmm5 + movdqa XMMWORD PTR 48[esp],xmm0 + pxor xmm6,xmm6 + movdqa XMMWORD PTR 64[esp],xmm0 + pxor xmm7,xmm7 + movdqa XMMWORD PTR 80[esp],xmm0 mov esp,DWORD PTR 116[esp] pop edi pop esi @@ -1734,7 +1800,7 @@ $L_aesni_cbc_encrypt_begin:: mov edx,DWORD PTR 32[esp] mov ebp,DWORD PTR 36[esp] test eax,eax - jz $L072cbc_abort + jz $L074cbc_abort cmp DWORD PTR 40[esp],0 xchg ebx,esp movups xmm7,XMMWORD PTR [ebp] @@ -1742,14 +1808,14 @@ $L_aesni_cbc_encrypt_begin:: mov ebp,edx mov DWORD PTR 16[esp],ebx mov ebx,ecx - je $L073cbc_decrypt + je $L075cbc_decrypt movaps xmm2,xmm7 cmp eax,16 - jb $L074cbc_enc_tail + jb $L076cbc_enc_tail sub eax,16 - jmp $L075cbc_enc_loop + jmp $L077cbc_enc_loop ALIGN 16 -$L075cbc_enc_loop: +$L077cbc_enc_loop: movups xmm7,XMMWORD PTR [esi] lea esi,DWORD PTR 16[esi] movups xmm0,XMMWORD PTR [edx] @@ -1757,24 +1823,25 @@ $L075cbc_enc_loop: xorps xmm7,xmm0 lea edx,DWORD PTR 32[edx] xorps xmm2,xmm7 -$L076enc1_loop_15: +$L078enc1_loop_15: DB 102,15,56,220,209 dec ecx movups 
xmm1,XMMWORD PTR [edx] lea edx,DWORD PTR 16[edx] - jnz $L076enc1_loop_15 + jnz $L078enc1_loop_15 DB 102,15,56,221,209 mov ecx,ebx mov edx,ebp movups XMMWORD PTR [edi],xmm2 lea edi,DWORD PTR 16[edi] sub eax,16 - jnc $L075cbc_enc_loop + jnc $L077cbc_enc_loop add eax,16 - jnz $L074cbc_enc_tail + jnz $L076cbc_enc_tail movaps xmm7,xmm2 - jmp $L077cbc_ret -$L074cbc_enc_tail: + pxor xmm2,xmm2 + jmp $L079cbc_ret +$L076cbc_enc_tail: mov ecx,eax DD 2767451785 mov ecx,16 @@ -1785,20 +1852,20 @@ DD 2868115081 mov ecx,ebx mov esi,edi mov edx,ebp - jmp $L075cbc_enc_loop + jmp $L077cbc_enc_loop ALIGN 16 -$L073cbc_decrypt: +$L075cbc_decrypt: cmp eax,80 - jbe $L078cbc_dec_tail + jbe $L080cbc_dec_tail movaps XMMWORD PTR [esp],xmm7 sub eax,80 - jmp $L079cbc_dec_loop6_enter + jmp $L081cbc_dec_loop6_enter ALIGN 16 -$L080cbc_dec_loop6: +$L082cbc_dec_loop6: movaps XMMWORD PTR [esp],xmm0 movups XMMWORD PTR [edi],xmm7 lea edi,DWORD PTR 16[edi] -$L079cbc_dec_loop6_enter: +$L081cbc_dec_loop6_enter: movdqu xmm2,XMMWORD PTR [esi] movdqu xmm3,XMMWORD PTR 16[esi] movdqu xmm4,XMMWORD PTR 32[esi] @@ -1828,28 +1895,28 @@ $L079cbc_dec_loop6_enter: movups XMMWORD PTR 64[edi],xmm6 lea edi,DWORD PTR 80[edi] sub eax,96 - ja $L080cbc_dec_loop6 + ja $L082cbc_dec_loop6 movaps xmm2,xmm7 movaps xmm7,xmm0 add eax,80 - jle $L081cbc_dec_tail_collected + jle $L083cbc_dec_clear_tail_collected movups XMMWORD PTR [edi],xmm2 lea edi,DWORD PTR 16[edi] -$L078cbc_dec_tail: +$L080cbc_dec_tail: movups xmm2,XMMWORD PTR [esi] movaps xmm6,xmm2 cmp eax,16 - jbe $L082cbc_dec_one + jbe $L084cbc_dec_one movups xmm3,XMMWORD PTR 16[esi] movaps xmm5,xmm3 cmp eax,32 - jbe $L083cbc_dec_two + jbe $L085cbc_dec_two movups xmm4,XMMWORD PTR 32[esi] cmp eax,48 - jbe $L084cbc_dec_three + jbe $L086cbc_dec_three movups xmm5,XMMWORD PTR 48[esi] cmp eax,64 - jbe $L085cbc_dec_four + jbe $L087cbc_dec_four movups xmm6,XMMWORD PTR 64[esi] movaps XMMWORD PTR [esp],xmm7 movups xmm2,XMMWORD PTR [esi] @@ -1867,55 +1934,62 @@ $L078cbc_dec_tail: xorps 
xmm6,xmm0 movups XMMWORD PTR [edi],xmm2 movups XMMWORD PTR 16[edi],xmm3 + pxor xmm3,xmm3 movups XMMWORD PTR 32[edi],xmm4 + pxor xmm4,xmm4 movups XMMWORD PTR 48[edi],xmm5 + pxor xmm5,xmm5 lea edi,DWORD PTR 64[edi] movaps xmm2,xmm6 + pxor xmm6,xmm6 sub eax,80 - jmp $L081cbc_dec_tail_collected + jmp $L088cbc_dec_tail_collected ALIGN 16 -$L082cbc_dec_one: +$L084cbc_dec_one: movups xmm0,XMMWORD PTR [edx] movups xmm1,XMMWORD PTR 16[edx] lea edx,DWORD PTR 32[edx] xorps xmm2,xmm0 -$L086dec1_loop_16: +$L089dec1_loop_16: DB 102,15,56,222,209 dec ecx movups xmm1,XMMWORD PTR [edx] lea edx,DWORD PTR 16[edx] - jnz $L086dec1_loop_16 + jnz $L089dec1_loop_16 DB 102,15,56,223,209 xorps xmm2,xmm7 movaps xmm7,xmm6 sub eax,16 - jmp $L081cbc_dec_tail_collected + jmp $L088cbc_dec_tail_collected ALIGN 16 -$L083cbc_dec_two: +$L085cbc_dec_two: call __aesni_decrypt2 xorps xmm2,xmm7 xorps xmm3,xmm6 movups XMMWORD PTR [edi],xmm2 movaps xmm2,xmm3 + pxor xmm3,xmm3 lea edi,DWORD PTR 16[edi] movaps xmm7,xmm5 sub eax,32 - jmp $L081cbc_dec_tail_collected + jmp $L088cbc_dec_tail_collected ALIGN 16 -$L084cbc_dec_three: +$L086cbc_dec_three: call __aesni_decrypt3 xorps xmm2,xmm7 xorps xmm3,xmm6 xorps xmm4,xmm5 movups XMMWORD PTR [edi],xmm2 movaps xmm2,xmm4 + pxor xmm4,xmm4 movups XMMWORD PTR 16[edi],xmm3 + pxor xmm3,xmm3 lea edi,DWORD PTR 32[edi] movups xmm7,XMMWORD PTR 32[esi] sub eax,48 - jmp $L081cbc_dec_tail_collected + jmp $L088cbc_dec_tail_collected ALIGN 16 -$L085cbc_dec_four: +$L087cbc_dec_four: call __aesni_decrypt4 movups xmm1,XMMWORD PTR 16[esi] movups xmm0,XMMWORD PTR 32[esi] @@ -1925,28 +1999,44 @@ $L085cbc_dec_four: movups XMMWORD PTR [edi],xmm2 xorps xmm4,xmm1 movups XMMWORD PTR 16[edi],xmm3 + pxor xmm3,xmm3 xorps xmm5,xmm0 movups XMMWORD PTR 32[edi],xmm4 + pxor xmm4,xmm4 lea edi,DWORD PTR 48[edi] movaps xmm2,xmm5 + pxor xmm5,xmm5 sub eax,64 -$L081cbc_dec_tail_collected: + jmp $L088cbc_dec_tail_collected +ALIGN 16 +$L083cbc_dec_clear_tail_collected: + pxor xmm3,xmm3 + pxor xmm4,xmm4 + 
pxor xmm5,xmm5 + pxor xmm6,xmm6 +$L088cbc_dec_tail_collected: and eax,15 - jnz $L087cbc_dec_tail_partial + jnz $L090cbc_dec_tail_partial movups XMMWORD PTR [edi],xmm2 - jmp $L077cbc_ret + pxor xmm0,xmm0 + jmp $L079cbc_ret ALIGN 16 -$L087cbc_dec_tail_partial: +$L090cbc_dec_tail_partial: movaps XMMWORD PTR [esp],xmm2 + pxor xmm0,xmm0 mov ecx,16 mov esi,esp sub ecx,eax DD 2767451785 -$L077cbc_ret: + movdqa XMMWORD PTR [esp],xmm2 +$L079cbc_ret: mov esp,DWORD PTR 16[esp] mov ebp,DWORD PTR 36[esp] + pxor xmm2,xmm2 + pxor xmm1,xmm1 movups XMMWORD PTR [ebp],xmm7 -$L072cbc_abort: + pxor xmm7,xmm7 +$L074cbc_abort: pop edi pop esi pop ebx @@ -1955,52 +2045,62 @@ $L072cbc_abort: _aesni_cbc_encrypt ENDP ALIGN 16 __aesni_set_encrypt_key PROC PRIVATE + push ebp + push ebx test eax,eax - jz $L088bad_pointer + jz $L091bad_pointer test edx,edx - jz $L088bad_pointer + jz $L091bad_pointer + call $L092pic +$L092pic: + pop ebx + lea ebx,DWORD PTR ($Lkey_const-$L092pic)[ebx] + lea ebp,DWORD PTR _OPENSSL_ia32cap_P movups xmm0,XMMWORD PTR [eax] xorps xmm4,xmm4 + mov ebp,DWORD PTR 4[ebp] lea edx,DWORD PTR 16[edx] + and ebp,268437504 cmp ecx,256 - je $L08914rounds + je $L09314rounds cmp ecx,192 - je $L09012rounds + je $L09412rounds cmp ecx,128 - jne $L091bad_keybits + jne $L095bad_keybits ALIGN 16 -$L09210rounds: +$L09610rounds: + cmp ebp,268435456 + je $L09710rounds_alt mov ecx,9 movups XMMWORD PTR [edx-16],xmm0 DB 102,15,58,223,200,1 - call $L093key_128_cold + call $L098key_128_cold DB 102,15,58,223,200,2 - call $L094key_128 + call $L099key_128 DB 102,15,58,223,200,4 - call $L094key_128 + call $L099key_128 DB 102,15,58,223,200,8 - call $L094key_128 + call $L099key_128 DB 102,15,58,223,200,16 - call $L094key_128 + call $L099key_128 DB 102,15,58,223,200,32 - call $L094key_128 + call $L099key_128 DB 102,15,58,223,200,64 - call $L094key_128 + call $L099key_128 DB 102,15,58,223,200,128 - call $L094key_128 + call $L099key_128 DB 102,15,58,223,200,27 - call $L094key_128 + call $L099key_128 DB 
102,15,58,223,200,54 - call $L094key_128 + call $L099key_128 movups XMMWORD PTR [edx],xmm0 mov DWORD PTR 80[edx],ecx - xor eax,eax - ret + jmp $L100good_key ALIGN 16 -$L094key_128: +$L099key_128: movups XMMWORD PTR [edx],xmm0 lea edx,DWORD PTR 16[edx] -$L093key_128_cold: +$L098key_128_cold: shufps xmm4,xmm0,16 xorps xmm0,xmm4 shufps xmm4,xmm0,140 @@ -2009,38 +2109,91 @@ $L093key_128_cold: xorps xmm0,xmm1 ret ALIGN 16 -$L09012rounds: +$L09710rounds_alt: + movdqa xmm5,XMMWORD PTR [ebx] + mov ecx,8 + movdqa xmm4,XMMWORD PTR 32[ebx] + movdqa xmm2,xmm0 + movdqu XMMWORD PTR [edx-16],xmm0 +$L101loop_key128: +DB 102,15,56,0,197 +DB 102,15,56,221,196 + pslld xmm4,1 + lea edx,DWORD PTR 16[edx] + movdqa xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm2,xmm3 + pxor xmm0,xmm2 + movdqu XMMWORD PTR [edx-16],xmm0 + movdqa xmm2,xmm0 + dec ecx + jnz $L101loop_key128 + movdqa xmm4,XMMWORD PTR 48[ebx] +DB 102,15,56,0,197 +DB 102,15,56,221,196 + pslld xmm4,1 + movdqa xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm2,xmm3 + pxor xmm0,xmm2 + movdqu XMMWORD PTR [edx],xmm0 + movdqa xmm2,xmm0 +DB 102,15,56,0,197 +DB 102,15,56,221,196 + movdqa xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm2,xmm3 + pxor xmm0,xmm2 + movdqu XMMWORD PTR 16[edx],xmm0 + mov ecx,9 + mov DWORD PTR 96[edx],ecx + jmp $L100good_key +ALIGN 16 +$L09412rounds: movq xmm2,QWORD PTR 16[eax] + cmp ebp,268435456 + je $L10212rounds_alt mov ecx,11 movups XMMWORD PTR [edx-16],xmm0 DB 102,15,58,223,202,1 - call $L095key_192a_cold + call $L103key_192a_cold DB 102,15,58,223,202,2 - call $L096key_192b + call $L104key_192b DB 102,15,58,223,202,4 - call $L097key_192a + call $L105key_192a DB 102,15,58,223,202,8 - call $L096key_192b + call $L104key_192b DB 102,15,58,223,202,16 - call $L097key_192a + call $L105key_192a DB 102,15,58,223,202,32 - call $L096key_192b + call 
$L104key_192b DB 102,15,58,223,202,64 - call $L097key_192a + call $L105key_192a DB 102,15,58,223,202,128 - call $L096key_192b + call $L104key_192b movups XMMWORD PTR [edx],xmm0 mov DWORD PTR 48[edx],ecx - xor eax,eax - ret + jmp $L100good_key ALIGN 16 -$L097key_192a: +$L105key_192a: movups XMMWORD PTR [edx],xmm0 lea edx,DWORD PTR 16[edx] ALIGN 16 -$L095key_192a_cold: +$L103key_192a_cold: movaps xmm5,xmm2 -$L098key_192b_warm: +$L106key_192b_warm: shufps xmm4,xmm0,16 movdqa xmm3,xmm2 xorps xmm0,xmm4 @@ -2054,56 +2207,90 @@ $L098key_192b_warm: pxor xmm2,xmm3 ret ALIGN 16 -$L096key_192b: +$L104key_192b: movaps xmm3,xmm0 shufps xmm5,xmm0,68 movups XMMWORD PTR [edx],xmm5 shufps xmm3,xmm2,78 movups XMMWORD PTR 16[edx],xmm3 lea edx,DWORD PTR 32[edx] - jmp $L098key_192b_warm + jmp $L106key_192b_warm +ALIGN 16 +$L10212rounds_alt: + movdqa xmm5,XMMWORD PTR 16[ebx] + movdqa xmm4,XMMWORD PTR 32[ebx] + mov ecx,8 + movdqu XMMWORD PTR [edx-16],xmm0 +$L107loop_key192: + movq QWORD PTR [edx],xmm2 + movdqa xmm1,xmm2 +DB 102,15,56,0,213 +DB 102,15,56,221,212 + pslld xmm4,1 + lea edx,DWORD PTR 24[edx] + movdqa xmm3,xmm0 + pslldq xmm0,4 + pxor xmm3,xmm0 + pslldq xmm0,4 + pxor xmm3,xmm0 + pslldq xmm0,4 + pxor xmm0,xmm3 + pshufd xmm3,xmm0,255 + pxor xmm3,xmm1 + pslldq xmm1,4 + pxor xmm3,xmm1 + pxor xmm0,xmm2 + pxor xmm2,xmm3 + movdqu XMMWORD PTR [edx-16],xmm0 + dec ecx + jnz $L107loop_key192 + mov ecx,11 + mov DWORD PTR 32[edx],ecx + jmp $L100good_key ALIGN 16 -$L08914rounds: +$L09314rounds: movups xmm2,XMMWORD PTR 16[eax] - mov ecx,13 lea edx,DWORD PTR 16[edx] + cmp ebp,268435456 + je $L10814rounds_alt + mov ecx,13 movups XMMWORD PTR [edx-32],xmm0 movups XMMWORD PTR [edx-16],xmm2 DB 102,15,58,223,202,1 - call $L099key_256a_cold + call $L109key_256a_cold DB 102,15,58,223,200,1 - call $L100key_256b + call $L110key_256b DB 102,15,58,223,202,2 - call $L101key_256a + call $L111key_256a DB 102,15,58,223,200,2 - call $L100key_256b + call $L110key_256b DB 102,15,58,223,202,4 - call $L101key_256a 
+ call $L111key_256a DB 102,15,58,223,200,4 - call $L100key_256b + call $L110key_256b DB 102,15,58,223,202,8 - call $L101key_256a + call $L111key_256a DB 102,15,58,223,200,8 - call $L100key_256b + call $L110key_256b DB 102,15,58,223,202,16 - call $L101key_256a + call $L111key_256a DB 102,15,58,223,200,16 - call $L100key_256b + call $L110key_256b DB 102,15,58,223,202,32 - call $L101key_256a + call $L111key_256a DB 102,15,58,223,200,32 - call $L100key_256b + call $L110key_256b DB 102,15,58,223,202,64 - call $L101key_256a + call $L111key_256a movups XMMWORD PTR [edx],xmm0 mov DWORD PTR 16[edx],ecx xor eax,eax - ret + jmp $L100good_key ALIGN 16 -$L101key_256a: +$L111key_256a: movups XMMWORD PTR [edx],xmm2 lea edx,DWORD PTR 16[edx] -$L099key_256a_cold: +$L109key_256a_cold: shufps xmm4,xmm0,16 xorps xmm0,xmm4 shufps xmm4,xmm0,140 @@ -2112,7 +2299,7 @@ $L099key_256a_cold: xorps xmm0,xmm1 ret ALIGN 16 -$L100key_256b: +$L110key_256b: movups XMMWORD PTR [edx],xmm0 lea edx,DWORD PTR 16[edx] shufps xmm4,xmm2,16 @@ -2122,13 +2309,70 @@ $L100key_256b: shufps xmm1,xmm1,170 xorps xmm2,xmm1 ret +ALIGN 16 +$L10814rounds_alt: + movdqa xmm5,XMMWORD PTR [ebx] + movdqa xmm4,XMMWORD PTR 32[ebx] + mov ecx,7 + movdqu XMMWORD PTR [edx-32],xmm0 + movdqa xmm1,xmm2 + movdqu XMMWORD PTR [edx-16],xmm2 +$L112loop_key256: +DB 102,15,56,0,213 +DB 102,15,56,221,212 + movdqa xmm3,xmm0 + pslldq xmm0,4 + pxor xmm3,xmm0 + pslldq xmm0,4 + pxor xmm3,xmm0 + pslldq xmm0,4 + pxor xmm0,xmm3 + pslld xmm4,1 + pxor xmm0,xmm2 + movdqu XMMWORD PTR [edx],xmm0 + dec ecx + jz $L113done_key256 + pshufd xmm2,xmm0,255 + pxor xmm3,xmm3 +DB 102,15,56,221,211 + movdqa xmm3,xmm1 + pslldq xmm1,4 + pxor xmm3,xmm1 + pslldq xmm1,4 + pxor xmm3,xmm1 + pslldq xmm1,4 + pxor xmm1,xmm3 + pxor xmm2,xmm1 + movdqu XMMWORD PTR 16[edx],xmm2 + lea edx,DWORD PTR 32[edx] + movdqa xmm1,xmm2 + jmp $L112loop_key256 +$L113done_key256: + mov ecx,13 + mov DWORD PTR 16[edx],ecx +$L100good_key: + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + 
pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + xor eax,eax + pop ebx + pop ebp + ret ALIGN 4 -$L088bad_pointer: +$L091bad_pointer: mov eax,-1 + pop ebx + pop ebp ret ALIGN 4 -$L091bad_keybits: +$L095bad_keybits: + pxor xmm0,xmm0 mov eax,-2 + pop ebx + pop ebp ret __aesni_set_encrypt_key ENDP ALIGN 16 @@ -2150,7 +2394,7 @@ $L_aesni_set_decrypt_key_begin:: mov edx,DWORD PTR 12[esp] shl ecx,4 test eax,eax - jnz $L102dec_key_ret + jnz $L114dec_key_ret lea eax,DWORD PTR 16[ecx*1+edx] movups xmm0,XMMWORD PTR [edx] movups xmm1,XMMWORD PTR [eax] @@ -2158,7 +2402,7 @@ $L_aesni_set_decrypt_key_begin:: movups XMMWORD PTR [edx],xmm1 lea edx,DWORD PTR 16[edx] lea eax,DWORD PTR [eax-16] -$L103dec_key_inverse: +$L115dec_key_inverse: movups xmm0,XMMWORD PTR [edx] movups xmm1,XMMWORD PTR [eax] DB 102,15,56,219,192 @@ -2168,17 +2412,28 @@ DB 102,15,56,219,201 movups XMMWORD PTR 16[eax],xmm0 movups XMMWORD PTR [edx-16],xmm1 cmp eax,edx - ja $L103dec_key_inverse + ja $L115dec_key_inverse movups xmm0,XMMWORD PTR [edx] DB 102,15,56,219,192 movups XMMWORD PTR [edx],xmm0 + pxor xmm0,xmm0 + pxor xmm1,xmm1 xor eax,eax -$L102dec_key_ret: +$L114dec_key_ret: ret _aesni_set_decrypt_key ENDP +ALIGN 64 +$Lkey_const:: +DD 202313229,202313229,202313229,202313229 +DD 67569157,67569157,67569157,67569157 +DD 1,1,1,1 +DD 27,27,27,27 DB 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69 DB 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83 DB 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 DB 115,108,46,111,114,103,62,0 .text$ ENDS +.bss SEGMENT 'BSS' +COMM _OPENSSL_ia32cap_P:DWORD:4 +.bss ENDS END