;; Added by Diederik Huys, March 2013
	;;
	;; Provided public procedures:
	;; 	secp256k1_fe_mul_inner
	;; 	secp256k1_fe_sqr_inner
	;;
	;; Needed tools: YASM (http://yasm.tortall.net)
	;;
	;; 

	BITS 64

	;;  Procedure ExSetMult
	;;  Register Layout:
	;;  INPUT: 	rdi	= a->n
	;; 	   	rsi  	= b->n
	;; 	   	rdx  	= r->a
	;; 
	;;  INTERNAL:	rdx:rax  = multiplication accumulator
	;; 		r9:r8    = c
	;; 		r10-r13  = t0-t3
	;; 		r14	 = b.n[0] / t4
	;; 		r15	 = b.n[1] / t5
	;; 		rbx	 = b.n[2] / t6
	;; 		rcx	 = b.n[3] / t7
	;; 		rbp	 = Constant 0FFFFFFFFFFFFFh / t8
	;; 		rsi	 = b.n / b.n[4] / t9

	GLOBAL secp256k1_fe_mul_inner
	ALIGN 32
secp256k1_fe_mul_inner:
	push rbp
	push rbx
	push r12
	push r13
	push r14
	push r15
	push rdx
	mov r14,[rsi+8*0]	; preload b.n[0]. This will be the case until
				; b.n[0] is no longer needed, then we reassign
				; r14 to t4
	;; c=a.n[0] * b.n[0]
   	mov rax,[rdi+0*8]	; load a.n[0]
	mov rbp,0FFFFFFFFFFFFFh
	mul r14			; rdx:rax=a.n[0]*b.n[0]
	mov r15,[rsi+1*8]
	mov r10,rbp		; load modulus into target register for t0
	mov r8,rax
	and r10,rax		; only need lower qword of c
	shrd r8,rdx,52
	xor r9,r9		; c < 2^64, so we ditch the HO part 

	;; c+=a.n[0] * b.n[1] + a.n[1] * b.n[0]
	mov rax,[rdi+0*8]
	mul r15			
	add r8,rax
	adc r9,rdx

	mov rax,[rdi+1*8]
	mul r14			
	mov r11,rbp
	mov rbx,[rsi+2*8]
	add r8,rax
	adc r9,rdx
	and r11,r8
	shrd r8,r9,52
	xor r9,r9
	
	;; c+=a.n[0 1 2] * b.n[2 1 0]
	mov rax,[rdi+0*8]
	mul rbx			
	add r8,rax
	adc r9,rdx

	mov rax,[rdi+1*8]
	mul r15			
	add r8,rax
	adc r9,rdx

	mov rax,[rdi+2*8]
	mul r14
	mov r12,rbp		
	mov rcx,[rsi+3*8]
	add r8,rax
	adc r9,rdx
	and r12,r8		
	shrd r8,r9,52
	xor r9,r9		

	;; c+=a.n[0 1 2 3] * b.n[3 2 1 0]
	mov rax,[rdi+0*8]
	mul rcx			
	add r8,rax
	adc r9,rdx

	mov rax,[rdi+1*8]
	mul rbx			
	add r8,rax
	adc r9,rdx

	mov rax,[rdi+2*8]
	mul r15			
	add r8,rax
	adc r9,rdx
	
	mov rax,[rdi+3*8]
	mul r14			
	mov r13,rbp             
	mov rsi,[rsi+4*8]	; load b.n[4] and destroy pointer
	add r8,rax
	adc r9,rdx
	and r13,r8

	shrd r8,r9,52
	xor r9,r9		


	;; c+=a.n[0 1 2 3 4] * b.n[4 3 2 1 0]
	mov rax,[rdi+0*8]
	mul rsi
	add r8,rax
	adc r9,rdx

	mov rax,[rdi+1*8]
	mul rcx
	add r8,rax
	adc r9,rdx

	mov rax,[rdi+2*8]
	mul rbx			
	add r8,rax
	adc r9,rdx

	mov rax,[rdi+3*8]
	mul r15			
	add r8,rax
	adc r9,rdx

	mov rax,[rdi+4*8]
	mul r14			
	mov r14,rbp             ; load modulus into t4 and destroy a.n[0]
	add r8,rax
	adc r9,rdx
	and r14,r8
	shrd r8,r9,52
	xor r9,r9		

	;; c+=a.n[1 2 3 4] * b.n[4 3 2 1]
	mov rax,[rdi+1*8]
	mul rsi
	add r8,rax
	adc r9,rdx

	mov rax,[rdi+2*8]
	mul rcx
	add r8,rax
	adc r9,rdx

	mov rax,[rdi+3*8]
	mul rbx
	add r8,rax
	adc r9,rdx

	mov rax,[rdi+4*8]
	mul r15
	mov r15,rbp		
	add r8,rax
	adc r9,rdx

	and r15,r8
	shrd r8,r9,52
	xor r9,r9		

	;; c+=a.n[2 3 4] * b.n[4 3 2]
	mov rax,[rdi+2*8]
	mul rsi
	add r8,rax
	adc r9,rdx

	mov rax,[rdi+3*8]
	mul rcx
	add r8,rax
	adc r9,rdx

	mov rax,[rdi+4*8]
	mul rbx
	mov rbx,rbp		
	add r8,rax
	adc r9,rdx

	and rbx,r8		
	shrd r8,r9,52
	xor r9,r9		

	;; c+=a.n[3 4] * b.n[4 3]
	mov rax,[rdi+3*8]
	mul rsi
	add r8,rax
	adc r9,rdx

	mov rax,[rdi+4*8]
	mul rcx
	mov rcx,rbp		
	add r8,rax
	adc r9,rdx
	and rcx,r8		
	shrd r8,r9,52
	xor r9,r9		

	;; c+=a.n[4] * b.n[4]
	mov rax,[rdi+4*8]
	mul rsi
	;; mov rbp,rbp		; modulus already there!
	add r8,rax
	adc r9,rdx
	and rbp,r8 
	shrd r8,r9,52
	xor r9,r9		

	mov rsi,r8		; load c into t9 and destroy b.n[4]

	;; *******************************************************
common_exit_norm:
	mov rdi,01000003D10h	; load constant

	mov rax,r15		; get t5
	mul rdi
	add rax,r10    		; +t0
	adc rdx,0
	mov r10,0FFFFFFFFFFFFFh ; modulus. Sadly, we ran out of registers!
	mov r8,rax		; +c
	and r10,rax
	shrd r8,rdx,52
	xor r9,r9

	mov rax,rbx		; get t6
	mul rdi
	add rax,r11		; +t1
	adc rdx,0
	mov r11,0FFFFFFFFFFFFFh ; modulus
	add r8,rax		; +c
	adc r9,rdx
	and r11,r8
	shrd r8,r9,52
	xor r9,r9

	mov rax,rcx    		; get t7
	mul rdi
	add rax,r12		; +t2
	adc rdx,0
	pop rbx			; retrieve pointer to this.n	
	mov r12,0FFFFFFFFFFFFFh ; modulus
	add r8,rax		; +c
	adc r9,rdx
	and r12,r8
	mov [rbx+2*8],r12	; mov into this.n[2]
	shrd r8,r9,52
	xor r9,r9
	
	mov rax,rbp    		; get t8
	mul rdi
	add rax,r13    		; +t3
	adc rdx,0
	mov r13,0FFFFFFFFFFFFFh ; modulus
	add r8,rax		; +c
	adc r9,rdx
	and r13,r8
	mov [rbx+3*8],r13	; -> this.n[3]
	shrd r8,r9,52
	xor r9,r9
	
	mov rax,rsi    		; get t9
	mul rdi
	add rax,r14    		; +t4
	adc rdx,0
	mov r14,0FFFFFFFFFFFFh	; !!!
	add r8,rax		; +c
	adc r9,rdx
	and r14,r8
	mov [rbx+4*8],r14	; -> this.n[4]
	shrd r8,r9,48		; !!!
	xor r9,r9
	
	mov rax,01000003D1h
	mul r8		
	add rax,r10
	adc rdx,0
	mov r10,0FFFFFFFFFFFFFh ; modulus
	mov r8,rax
	and rax,r10
	shrd r8,rdx,52
	mov [rbx+0*8],rax	; -> this.n[0]
	add r8,r11
	mov [rbx+1*8],r8	; -> this.n[1]

	pop r15
	pop r14
	pop r13
	pop r12
	pop rbx
	pop rbp
	ret

	
	;;  PROC ExSetSquare
	;;  Register Layout:
	;;  INPUT: 	rdi	 = a.n
	;; 	   	rsi  	 = this.a
	;;  INTERNAL:	rdx:rax  = multiplication accumulator
	;; 		r9:r8    = c
	;; 		r10-r13  = t0-t3
	;; 		r14	 = a.n[0] / t4
	;; 		r15	 = a.n[1] / t5
	;; 		rbx	 = a.n[2] / t6
	;; 		rcx	 = a.n[3] / t7
	;; 		rbp	 = 0FFFFFFFFFFFFFh / t8
	;; 		rsi	 = a.n[4] / t9
	GLOBAL secp256k1_fe_sqr_inner
	ALIGN 32
secp256k1_fe_sqr_inner:
	push rbp
	push rbx
	push r12
	push r13
	push r14
	push r15
	push rsi
	mov rbp,0FFFFFFFFFFFFFh
	
	;; c=a.n[0] * a.n[0]
   	mov r14,[rdi+0*8]	; r14=a.n[0]
	mov r10,rbp		; modulus 
	mov rax,r14
	mul rax
	mov r15,[rdi+1*8]	; a.n[1]
	add r14,r14		; r14=2*a.n[0]
	mov r8,rax
	and r10,rax		; only need lower qword
	shrd r8,rdx,52
	xor r9,r9

	;; c+=2*a.n[0] * a.n[1]
	mov rax,r14		; r14=2*a.n[0]
	mul r15
	mov rbx,[rdi+2*8]	; rbx=a.n[2]
	mov r11,rbp 		; modulus
	add r8,rax
	adc r9,rdx
	and r11,r8
	shrd r8,r9,52
	xor r9,r9
	
	;; c+=2*a.n[0]*a.n[2]+a.n[1]*a.n[1]
	mov rax,r14
	mul rbx
	add r8,rax
	adc r9,rdx

	mov rax,r15
	mov r12,rbp		; modulus
	mul rax
	mov rcx,[rdi+3*8]	; rcx=a.n[3]
	add r15,r15		; r15=a.n[1]*2
	add r8,rax
	adc r9,rdx
	and r12,r8		; only need lower dword
	shrd r8,r9,52
	xor r9,r9		

	;; c+=2*a.n[0]*a.n[3]+2*a.n[1]*a.n[2]
	mov rax,r14
	mul rcx
	add r8,rax
	adc r9,rdx

	mov rax,r15		; rax=2*a.n[1]
	mov r13,rbp		; modulus
	mul rbx
	mov rsi,[rdi+4*8]	; rsi=a.n[4]
	add r8,rax
	adc r9,rdx
	and r13,r8
	shrd r8,r9,52
	xor r9,r9		

	;; c+=2*a.n[0]*a.n[4]+2*a.n[1]*a.n[3]+a.n[2]*a.n[2]
	mov rax,r14		; last time we need 2*a.n[0]
	mul rsi
	add r8,rax
	adc r9,rdx

	mov rax,r15
	mul rcx
	mov r14,rbp		; modulus
	add r8,rax
	adc r9,rdx

	mov rax,rbx
	mul rax
	add rbx,rbx		; rcx=2*a.n[2]
	add r8,rax
	adc r9,rdx
	and r14,r8
	shrd r8,r9,52
	xor r9,r9		

	;; c+=2*a.n[1]*a.n[4]+2*a.n[2]*a.n[3]
	mov rax,r15		; last time we need 2*a.n[1]
	mul rsi
	add r8,rax
	adc r9,rdx

	mov rax,rbx
	mul rcx
	mov r15,rbp		; modulus
	add r8,rax
	adc r9,rdx
	and r15,r8
	shrd r8,r9,52
	xor r9,r9		

	;; c+=2*a.n[2]*a.n[4]+a.n[3]*a.n[3]
	mov rax,rbx		; last time we need 2*a.n[2]
	mul rsi
	add r8,rax
	adc r9,rdx

	mov rax,rcx		; a.n[3]
	mul rax
	mov rbx,rbp		; modulus
	add r8,rax
	adc r9,rdx
	and rbx,r8		; only need lower dword
	lea rax,[2*rcx]
	shrd r8,r9,52
	xor r9,r9		

	;; c+=2*a.n[3]*a.n[4]
	mul rsi
	mov rcx,rbp		; modulus
	add r8,rax
	adc r9,rdx
	and rcx,r8		; only need lower dword
	shrd r8,r9,52
	xor r9,r9		

	;; c+=a.n[4]*a.n[4]
	mov rax,rsi
	mul rax
	;; mov rbp,rbp		; modulus is already there!
	add r8,rax
	adc r9,rdx
	and rbp,r8 
	shrd r8,r9,52
	xor r9,r9		

	mov rsi,r8

	;; *******************************************************
	jmp common_exit_norm
	end