From 004686b6e5ec2536689a5303686ee28c8fc619ed Mon Sep 17 00:00:00 2001 From: Garrett Bodley Date: Sat, 20 Jul 2024 20:23:05 -0400 Subject: [PATCH] crypto/internal/nistec: Avo port of p256_asm_amd64.s This implementation utilizes the same registers found in the reference implementation, aiming to produce a minimal semantic diff between the Avo-generated output and the original hand-written assembly. To verify the Avo implementation, the reference and Avo-generated assembly files are fed to `go tool asm`, capturing the debug output into corresponding temp files. The debug output contains supplementary metadata (line numbers, instruction offsets, and source file references) that must be removed in order to obtain a semantic diff of the two files. This is accomplished via a small utility script written in awk. The reference assembly file does not specify a frame size for a number of the defined assembly functions. Avo automatically infers the frame size when generating the TEXT directive, leading to a diff on those lines. Commands used to verify Avo output: GOROOT=$(go env GOROOT) ASM_PATH="src/crypto/internal/nistec/p256_asm_amd64.s" REFERENCE="54fe0fd43fcf8609666c16ae6d15ed92873b1564" go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \ <(git cat-file -p "$REFERENCE:$ASM_PATH") \ > /tmp/reference.s go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \ "$ASM_PATH" \ > /tmp/avo.s normalize(){ awk '{ $1=$2=$3=""; print substr($0,4) }' } diff <(normalize < /tmp/reference.s) <(normalize < /tmp/avo.s) 1c1 < TEXT .p256OrdLittleToBig(SB), NOSPLIT, $0 --- > TEXT .p256OrdLittleToBig(SB), NOSPLIT, $0-16 3c3 < TEXT .p256OrdBigToLittle(SB), NOSPLIT, $0 --- > TEXT .p256OrdBigToLittle(SB), NOSPLIT, $0-16 5c5 < TEXT .p256LittleToBig(SB), NOSPLIT, $0 --- > TEXT .p256LittleToBig(SB), NOSPLIT, $0-16 7c7 < TEXT .p256BigToLittle(SB), NOSPLIT, $0 --- > TEXT .p256BigToLittle(SB), NOSPLIT, $0-16 23c23 < TEXT .p256MovCond(SB), NOSPLIT, $0 --- > TEXT .p256MovCond(SB), NOSPLIT, $0-32 74c74 < TEXT .p256NegCond(SB), NOSPLIT, $0 --- > TEXT .p256NegCond(SB), NOSPLIT, $0-16 99c99 < TEXT .p256Sqr(SB), NOSPLIT, $0 --- > TEXT .p256Sqr(SB), NOSPLIT, $0-24 234c234 < TEXT .p256Mul(SB), NOSPLIT, $0 --- > TEXT .p256Mul(SB), NOSPLIT, $0-24 401c401 < TEXT .p256FromMont(SB), NOSPLIT, $0 --- > TEXT .p256FromMont(SB), NOSPLIT, $0-16 465c465 < TEXT .p256Select(SB), NOSPLIT, $0 --- > TEXT .p256Select(SB), NOSPLIT, $0-24 513c513 < TEXT .p256SelectAffine(SB), NOSPLIT, $0 --- > TEXT .p256SelectAffine(SB), NOSPLIT, $0-24 566c566 < TEXT .p256OrdMul(SB), NOSPLIT, $0 --- > TEXT .p256OrdMul(SB), NOSPLIT, $0-24 806c806 < TEXT .p256OrdSqr(SB), NOSPLIT, $0 --- > TEXT .p256OrdSqr(SB), NOSPLIT, $0-24 Change-Id: I610b097c573b9d9018f0e26bc2afde5edb3f954b Reviewed-on: https://go-review.googlesource.com/c/go/+/599875 Reviewed-by: Dmitri Shuralyov LUCI-TryBot-Result: Go LUCI Reviewed-by: Filippo Valsorda Reviewed-by: Roland Shoemaker --- .../compile/internal/types2/stdlib_test.go | 1 + src/crypto/internal/nistec/_asm/go.mod | 11 + src/crypto/internal/nistec/_asm/go.sum | 8 + .../internal/nistec/_asm/p256_asm_amd64.go | 2788 +++++++++++ src/crypto/internal/nistec/p256_asm_amd64.s | 4457 +++++++++-------- src/go/types/stdlib_test.go | 1 + 6 files changed, 5089 insertions(+), 2177 deletions(-) create mode 100644 src/crypto/internal/nistec/_asm/go.mod create mode 100644 src/crypto/internal/nistec/_asm/go.sum create mode 100644 src/crypto/internal/nistec/_asm/p256_asm_amd64.go diff --git a/src/cmd/compile/internal/types2/stdlib_test.go 
b/src/cmd/compile/internal/types2/stdlib_test.go index 4d7e9b1ae0a9c..108c0629c6aa8 100644 --- a/src/cmd/compile/internal/types2/stdlib_test.go +++ b/src/cmd/compile/internal/types2/stdlib_test.go @@ -359,6 +359,7 @@ var excluded = map[string]bool{ "crypto/aes/_asm/standard": true, "crypto/internal/bigmod/_asm": true, "crypto/internal/edwards25519/field/_asm": true, + "crypto/internal/nistec/_asm": true, "crypto/md5/_asm": true, "crypto/sha1/_asm": true, "crypto/sha256/_asm": true, diff --git a/src/crypto/internal/nistec/_asm/go.mod b/src/crypto/internal/nistec/_asm/go.mod new file mode 100644 index 0000000000000..116284483d2b6 --- /dev/null +++ b/src/crypto/internal/nistec/_asm/go.mod @@ -0,0 +1,11 @@ +module std/crypto/internal/nistec/_asm + +go 1.24 + +require github.com/mmcloughlin/avo v0.6.0 + +require ( + golang.org/x/mod v0.20.0 // indirect + golang.org/x/sync v0.8.0 // indirect + golang.org/x/tools v0.24.0 // indirect +) diff --git a/src/crypto/internal/nistec/_asm/go.sum b/src/crypto/internal/nistec/_asm/go.sum new file mode 100644 index 0000000000000..76af484b2eba3 --- /dev/null +++ b/src/crypto/internal/nistec/_asm/go.sum @@ -0,0 +1,8 @@ +github.com/mmcloughlin/avo v0.6.0 h1:QH6FU8SKoTLaVs80GA8TJuLNkUYl4VokHKlPhVDg4YY= +github.com/mmcloughlin/avo v0.6.0/go.mod h1:8CoAGaCSYXtCPR+8y18Y9aB/kxb8JSS6FRI7mSkvD+8= +golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0= +golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= +golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24= +golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ= diff --git a/src/crypto/internal/nistec/_asm/p256_asm_amd64.go b/src/crypto/internal/nistec/_asm/p256_asm_amd64.go new file mode 100644 index 0000000000000..4413516aacab1 --- /dev/null +++ b/src/crypto/internal/nistec/_asm/p256_asm_amd64.go @@ -0,0 +1,2788 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This file contains constant-time, 64-bit assembly implementation of +// P256. The optimizations performed here are described in detail in: +// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with +// 256-bit primes" +// https://link.springer.com/article/10.1007%2Fs13389-014-0090-x +// https://eprint.iacr.org/2013/816.pdf + +package main + +import ( + "os" + "strings" + + . "github.com/mmcloughlin/avo/build" + "github.com/mmcloughlin/avo/ir" + . "github.com/mmcloughlin/avo/operand" + . "github.com/mmcloughlin/avo/reg" +) + +//go:generate go run . -out ../p256_asm_amd64.s -pkg nistec + +var ( + res_ptr GPPhysical = RDI + x_ptr = RSI + y_ptr = RCX +) + +// These variables have been versioned as they get redfined in the reference implementation. +// This is done to produce a minimal semantic diff. 
+var ( + acc0_v1 GPPhysical = R8 + acc1_v1 = R9 + acc2_v1 = R10 + acc3_v1 = R11 + acc4_v1 = R12 + acc5_v1 = R13 + t0_v1 = R14 + t1_v1 = R15 +) + +func main() { + Package("crypto/internal/nistec") + ConstraintExpr("!purego") + p256OrdLittleToBig() + p256OrdBigToLittle() + p256LittleToBig() + p256BigToLittle() + p256MovCond() + p256NegCond() + p256Sqr() + p256Mul() + p256FromMont() + p256Select() + p256SelectAffine() + p256OrdMul() + p256OrdSqr() + p256SubInternal() + p256MulInternal() + p256SqrInternal() + p256PointAddAffineAsm() + p256IsZero() + p256PointAddAsm() + p256PointDoubleAsm() + Generate() + + internalFunctions := []string{ + "·p256SubInternal", + "·p256MulInternal", + "·p256SqrInternal", + "·p256IsZero", + } + removePeskyUnicodeDot(internalFunctions, "../p256_asm_amd64.s") +} + +// Implements: +// +// func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement) +func p256OrdLittleToBig() { + Implement("p256OrdLittleToBig") + Attributes(NOSPLIT) + // Hack to get Avo to output: + // JMP ·p256BigToLittle(SB) + Instruction(&ir.Instruction{ + Opcode: "JMP", + Operands: []Op{ + LabelRef("·p256BigToLittle(SB)"), + }, + }) +} + +// Implements: +// +// func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte) +func p256OrdBigToLittle() { + Implement("p256OrdBigToLittle") + Attributes(NOSPLIT) + // Hack to get Avo to output: + // JMP ·p256BigToLittle(SB) + Instruction(&ir.Instruction{ + Opcode: "JMP", + Operands: []Op{ + LabelRef("·p256BigToLittle(SB)"), + }, + }) +} + +// Implements +// +// func p256LittleToBig(res *[32]byte, in *p256Element) +func p256LittleToBig() { + Implement("p256LittleToBig") + Attributes(NOSPLIT) + // Hack to get Avo to output: + // JMP ·p256BigToLittle(SB) + Instruction(&ir.Instruction{ + Opcode: "JMP", + Operands: []Op{ + LabelRef("·p256BigToLittle(SB)"), + }, + }) +} + +// Implements: +// +// func p256BigToLittle(res *p256Element, in *[32]byte) +func p256BigToLittle() { + Implement("p256BigToLittle") + Attributes(NOSPLIT) + + Load(Param("res"), res_ptr) + Load(Param("in"), x_ptr) + + MOVQ(Mem{Base: x_ptr}.Offset(8*0), acc0_v1) + MOVQ(Mem{Base: x_ptr}.Offset(8*1), acc1_v1) + MOVQ(Mem{Base: x_ptr}.Offset(8*2), acc2_v1) + MOVQ(Mem{Base: x_ptr}.Offset(8*3), acc3_v1) + + BSWAPQ(acc0_v1) + BSWAPQ(acc1_v1) + BSWAPQ(acc2_v1) + BSWAPQ(acc3_v1) + + MOVQ(acc3_v1, Mem{Base: res_ptr}.Offset(8*0)) + MOVQ(acc2_v1, Mem{Base: res_ptr}.Offset(8*1)) + MOVQ(acc1_v1, Mem{Base: res_ptr}.Offset(8*2)) + MOVQ(acc0_v1, Mem{Base: res_ptr}.Offset(8*3)) + + RET() +} + +// Implements: +// +// func p256MovCond(res, a, b *P256Point, cond int) +func p256MovCond() { + Implement("p256MovCond") + Attributes(NOSPLIT) + + Load(Param("res"), res_ptr) + Load(Param("a"), x_ptr) + Load(Param("b"), y_ptr) + Load(Param("cond"), X12) + + PXOR(X13, X13) + PSHUFD(Imm(0), X12, X12) + PCMPEQL(X13, X12) + + MOVOU(X12, X0) + MOVOU(Mem{Base: x_ptr}.Offset(16*0), X6) + PANDN(X6, X0) + MOVOU(X12, X1) + MOVOU(Mem{Base: x_ptr}.Offset(16*1), X7) + PANDN(X7, X1) + MOVOU(X12, X2) + MOVOU(Mem{Base: x_ptr}.Offset(16*2), X8) + PANDN(X8, X2) + MOVOU(X12, X3) + MOVOU(Mem{Base: x_ptr}.Offset(16*3), X9) + PANDN(X9, X3) + MOVOU(X12, X4) + MOVOU(Mem{Base: x_ptr}.Offset(16*4), X10) + PANDN(X10, X4) + MOVOU(X12, X5) + MOVOU(Mem{Base: x_ptr}.Offset(16*5), X11) + PANDN(X11, X5) + + MOVOU(Mem{Base: y_ptr}.Offset(16*0), X6) + MOVOU(Mem{Base: y_ptr}.Offset(16*1), X7) + MOVOU(Mem{Base: y_ptr}.Offset(16*2), X8) + MOVOU(Mem{Base: y_ptr}.Offset(16*3), X9) + MOVOU(Mem{Base: y_ptr}.Offset(16*4), X10) + MOVOU(Mem{Base: y_ptr}.Offset(16*5), 
X11) + + PAND(X12, X6) + PAND(X12, X7) + PAND(X12, X8) + PAND(X12, X9) + PAND(X12, X10) + PAND(X12, X11) + + PXOR(X6, X0) + PXOR(X7, X1) + PXOR(X8, X2) + PXOR(X9, X3) + PXOR(X10, X4) + PXOR(X11, X5) + + MOVOU(X0, Mem{Base: res_ptr}.Offset(16*0)) + MOVOU(X1, Mem{Base: res_ptr}.Offset(16*1)) + MOVOU(X2, Mem{Base: res_ptr}.Offset(16*2)) + MOVOU(X3, Mem{Base: res_ptr}.Offset(16*3)) + MOVOU(X4, Mem{Base: res_ptr}.Offset(16*4)) + MOVOU(X5, Mem{Base: res_ptr}.Offset(16*5)) + + RET() +} + +// Implements: +// +// func p256NegCond(val *p256Element, cond int) +func p256NegCond() { + Implement("p256NegCond") + Attributes(NOSPLIT) + + Load(Param("val"), res_ptr) + Load(Param("cond"), t0_v1) + + Comment("acc = poly") + MOVQ(I32(-1), acc0_v1) + p256const0 := p256const0_DATA() + MOVQ(p256const0, acc1_v1) + MOVQ(I32(0), acc2_v1) + p256const1 := p256const1_DATA() + MOVQ(p256const1, acc3_v1) + + Comment("Load the original value") + MOVQ(Mem{Base: res_ptr}.Offset(8*0), acc5_v1) + MOVQ(Mem{Base: res_ptr}.Offset(8*1), x_ptr) + MOVQ(Mem{Base: res_ptr}.Offset(8*2), y_ptr) + MOVQ(Mem{Base: res_ptr}.Offset(8*3), t1_v1) + + Comment("Speculatively subtract") + SUBQ(acc5_v1, acc0_v1) + SBBQ(x_ptr, acc1_v1) + SBBQ(y_ptr, acc2_v1) + SBBQ(t1_v1, acc3_v1) + + Comment("If condition is 0, keep original value") + TESTQ(t0_v1, t0_v1) + CMOVQEQ(acc5_v1, acc0_v1) + CMOVQEQ(x_ptr, acc1_v1) + CMOVQEQ(y_ptr, acc2_v1) + CMOVQEQ(t1_v1, acc3_v1) + + Comment("Store result") + MOVQ(acc0_v1, Mem{Base: res_ptr}.Offset(8*0)) + MOVQ(acc1_v1, Mem{Base: res_ptr}.Offset(8*1)) + MOVQ(acc2_v1, Mem{Base: res_ptr}.Offset(8*2)) + MOVQ(acc3_v1, Mem{Base: res_ptr}.Offset(8*3)) + + RET() +} + +// Implements: +// +// func p256Sqr(res, in *p256Element, n int) +func p256Sqr() { + Implement("p256Sqr") + Attributes(NOSPLIT) + + Load(Param("res"), res_ptr) + Load(Param("in"), x_ptr) + Load(Param("n"), RBX) + + Label("sqrLoop") + + Comment("y[1:] * y[0]") + MOVQ(Mem{Base: x_ptr}.Offset(8*0), t0_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*1), RAX) + MULQ(t0_v1) + MOVQ(RAX, acc1_v1) + MOVQ(RDX, acc2_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX) + MULQ(t0_v1) + ADDQ(RAX, acc2_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, acc3_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX) + MULQ(t0_v1) + ADDQ(RAX, acc3_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, acc4_v1) + + Comment("y[2:] * y[1]") + MOVQ(Mem{Base: x_ptr}.Offset(8*1), t0_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX) + MULQ(t0_v1) + ADDQ(RAX, acc3_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, t1_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX) + MULQ(t0_v1) + ADDQ(t1_v1, acc4_v1) + ADCQ(Imm(0), RDX) + ADDQ(RAX, acc4_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, acc5_v1) + + Comment("y[3] * y[2]") + MOVQ(Mem{Base: x_ptr}.Offset(8*2), t0_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX) + MULQ(t0_v1) + ADDQ(RAX, acc5_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, y_ptr) + XORQ(t1_v1, t1_v1) + + Comment("*2") + ADDQ(acc1_v1, acc1_v1) + ADCQ(acc2_v1, acc2_v1) + ADCQ(acc3_v1, acc3_v1) + ADCQ(acc4_v1, acc4_v1) + ADCQ(acc5_v1, acc5_v1) + ADCQ(y_ptr, y_ptr) + ADCQ(Imm(0), t1_v1) + + Comment("Missing products") + MOVQ(Mem{Base: x_ptr}.Offset(8*0), RAX) + MULQ(RAX) + MOVQ(RAX, acc0_v1) + MOVQ(RDX, t0_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*1), RAX) + MULQ(RAX) + ADDQ(t0_v1, acc1_v1) + ADCQ(RAX, acc2_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, t0_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX) + MULQ(RAX) + ADDQ(t0_v1, acc3_v1) + ADCQ(RAX, acc4_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, t0_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX) + MULQ(RAX) + ADDQ(t0_v1, acc5_v1) + ADCQ(RAX, 
y_ptr) + ADCQ(RDX, t1_v1) + MOVQ(t1_v1, x_ptr) + + Comment("First reduction step") + MOVQ(acc0_v1, RAX) + MOVQ(acc0_v1, t1_v1) + SHLQ(Imm(32), acc0_v1) + + p256const1 := p256const1_DATA() + MULQ(p256const1) + + SHRQ(Imm(32), t1_v1) + ADDQ(acc0_v1, acc1_v1) + ADCQ(t1_v1, acc2_v1) + ADCQ(RAX, acc3_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, acc0_v1) + + Comment("Second reduction step") + MOVQ(acc1_v1, RAX) + MOVQ(acc1_v1, t1_v1) + SHLQ(Imm(32), acc1_v1) + MULQ(p256const1) + SHRQ(Imm(32), t1_v1) + ADDQ(acc1_v1, acc2_v1) + ADCQ(t1_v1, acc3_v1) + ADCQ(RAX, acc0_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, acc1_v1) + + Comment("Third reduction step") + MOVQ(acc2_v1, RAX) + MOVQ(acc2_v1, t1_v1) + SHLQ(Imm(32), acc2_v1) + MULQ(p256const1) + SHRQ(Imm(32), t1_v1) + ADDQ(acc2_v1, acc3_v1) + ADCQ(t1_v1, acc0_v1) + ADCQ(RAX, acc1_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, acc2_v1) + + Comment("Last reduction step") + XORQ(t0_v1, t0_v1) + MOVQ(acc3_v1, RAX) + MOVQ(acc3_v1, t1_v1) + SHLQ(Imm(32), acc3_v1) + MULQ(p256const1) + SHRQ(Imm(32), t1_v1) + ADDQ(acc3_v1, acc0_v1) + ADCQ(t1_v1, acc1_v1) + ADCQ(RAX, acc2_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, acc3_v1) + + Comment("Add bits [511:256] of the sqr result") + ADCQ(acc4_v1, acc0_v1) + ADCQ(acc5_v1, acc1_v1) + ADCQ(y_ptr, acc2_v1) + ADCQ(x_ptr, acc3_v1) + ADCQ(Imm(0), t0_v1) + + MOVQ(acc0_v1, acc4_v1) + MOVQ(acc1_v1, acc5_v1) + MOVQ(acc2_v1, y_ptr) + MOVQ(acc3_v1, t1_v1) + + Comment("Subtract p256") + SUBQ(I8(-1), acc0_v1) + + p256const0 := p256const0_DATA() + SBBQ(p256const0, acc1_v1) + SBBQ(Imm(0), acc2_v1) + SBBQ(p256const1, acc3_v1) + SBBQ(Imm(0), t0_v1) + + CMOVQCS(acc4_v1, acc0_v1) + CMOVQCS(acc5_v1, acc1_v1) + CMOVQCS(y_ptr, acc2_v1) + CMOVQCS(t1_v1, acc3_v1) + + MOVQ(acc0_v1, Mem{Base: res_ptr}.Offset(8*0)) + MOVQ(acc1_v1, Mem{Base: res_ptr}.Offset(8*1)) + MOVQ(acc2_v1, Mem{Base: res_ptr}.Offset(8*2)) + MOVQ(acc3_v1, Mem{Base: res_ptr}.Offset(8*3)) + MOVQ(res_ptr, x_ptr) + DECQ(RBX) + JNE(LabelRef("sqrLoop")) + + RET() +} + +// Implements: +// +// func p256Mul(res, in1, in2 *p256Element) +func p256Mul() { + Implement("p256Mul") + Attributes(NOSPLIT) + + Load(Param("res"), res_ptr) + Load(Param("in1"), x_ptr) + Load(Param("in2"), y_ptr) + + Comment("x * y[0]") + MOVQ(Mem{Base: y_ptr}.Offset(8*0), t0_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*0), RAX) + MULQ(t0_v1) + MOVQ(RAX, acc0_v1) + MOVQ(RDX, acc1_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*1), RAX) + MULQ(t0_v1) + ADDQ(RAX, acc1_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, acc2_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX) + MULQ(t0_v1) + ADDQ(RAX, acc2_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, acc3_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX) + MULQ(t0_v1) + ADDQ(RAX, acc3_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, acc4_v1) + XORQ(acc5_v1, acc5_v1) + + Comment("First reduction step") + MOVQ(acc0_v1, RAX) + MOVQ(acc0_v1, t1_v1) + SHLQ(Imm(32), acc0_v1) + p256const1 := p256const1_DATA() + MULQ(p256const1) + SHRQ(Imm(32), t1_v1) + ADDQ(acc0_v1, acc1_v1) + ADCQ(t1_v1, acc2_v1) + ADCQ(RAX, acc3_v1) + ADCQ(RDX, acc4_v1) + ADCQ(Imm(0), acc5_v1) + XORQ(acc0_v1, acc0_v1) + + Comment("x * y[1]") + MOVQ(Mem{Base: y_ptr}.Offset(8*1), t0_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*0), RAX) + MULQ(t0_v1) + ADDQ(RAX, acc1_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, t1_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*1), RAX) + MULQ(t0_v1) + ADDQ(t1_v1, acc2_v1) + ADCQ(Imm(0), RDX) + ADDQ(RAX, acc2_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, t1_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX) + MULQ(t0_v1) + ADDQ(t1_v1, acc3_v1) + ADCQ(Imm(0), RDX) + ADDQ(RAX, acc3_v1) + ADCQ(Imm(0), RDX) + 
MOVQ(RDX, t1_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX) + MULQ(t0_v1) + ADDQ(t1_v1, acc4_v1) + ADCQ(Imm(0), RDX) + ADDQ(RAX, acc4_v1) + ADCQ(RDX, acc5_v1) + ADCQ(Imm(0), acc0_v1) + + Comment("Second reduction step") + MOVQ(acc1_v1, RAX) + MOVQ(acc1_v1, t1_v1) + SHLQ(Imm(32), acc1_v1) + MULQ(p256const1) + SHRQ(Imm(32), t1_v1) + ADDQ(acc1_v1, acc2_v1) + ADCQ(t1_v1, acc3_v1) + ADCQ(RAX, acc4_v1) + ADCQ(RDX, acc5_v1) + ADCQ(Imm(0), acc0_v1) + XORQ(acc1_v1, acc1_v1) + + Comment("x * y[2]") + MOVQ(Mem{Base: y_ptr}.Offset(8*2), t0_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*0), RAX) + MULQ(t0_v1) + ADDQ(RAX, acc2_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, t1_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*1), RAX) + MULQ(t0_v1) + ADDQ(t1_v1, acc3_v1) + ADCQ(Imm(0), RDX) + ADDQ(RAX, acc3_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, t1_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX) + MULQ(t0_v1) + ADDQ(t1_v1, acc4_v1) + ADCQ(Imm(0), RDX) + ADDQ(RAX, acc4_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, t1_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX) + MULQ(t0_v1) + ADDQ(t1_v1, acc5_v1) + ADCQ(Imm(0), RDX) + ADDQ(RAX, acc5_v1) + ADCQ(RDX, acc0_v1) + ADCQ(Imm(0), acc1_v1) + + Comment("Third reduction step") + MOVQ(acc2_v1, RAX) + MOVQ(acc2_v1, t1_v1) + SHLQ(Imm(32), acc2_v1) + MULQ(p256const1) + SHRQ(Imm(32), t1_v1) + ADDQ(acc2_v1, acc3_v1) + ADCQ(t1_v1, acc4_v1) + ADCQ(RAX, acc5_v1) + ADCQ(RDX, acc0_v1) + ADCQ(Imm(0), acc1_v1) + XORQ(acc2_v1, acc2_v1) + Comment("x * y[3]") + + MOVQ(Mem{Base: y_ptr}.Offset(8*3), t0_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*0), RAX) + MULQ(t0_v1) + ADDQ(RAX, acc3_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, t1_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*1), RAX) + MULQ(t0_v1) + ADDQ(t1_v1, acc4_v1) + ADCQ(Imm(0), RDX) + ADDQ(RAX, acc4_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, t1_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX) + MULQ(t0_v1) + ADDQ(t1_v1, acc5_v1) + ADCQ(Imm(0), RDX) + ADDQ(RAX, acc5_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, t1_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX) + MULQ(t0_v1) + ADDQ(t1_v1, acc0_v1) + ADCQ(Imm(0), RDX) + ADDQ(RAX, acc0_v1) + ADCQ(RDX, acc1_v1) + ADCQ(Imm(0), acc2_v1) + + Comment("Last reduction step") + MOVQ(acc3_v1, RAX) + MOVQ(acc3_v1, t1_v1) + SHLQ(Imm(32), acc3_v1) + MULQ(p256const1) + SHRQ(Imm(32), t1_v1) + ADDQ(acc3_v1, acc4_v1) + ADCQ(t1_v1, acc5_v1) + ADCQ(RAX, acc0_v1) + ADCQ(RDX, acc1_v1) + ADCQ(Imm(0), acc2_v1) + + Comment("Copy result [255:0]") + MOVQ(acc4_v1, x_ptr) + MOVQ(acc5_v1, acc3_v1) + MOVQ(acc0_v1, t0_v1) + MOVQ(acc1_v1, t1_v1) + + Comment("Subtract p256") + SUBQ(I8(-1), acc4_v1) + p256const0 := p256const0_DATA() + SBBQ(p256const0, acc5_v1) + SBBQ(Imm(0), acc0_v1) + // SBBQ p256const1<>(SB), acc1_v1 + SBBQ(p256const1, acc1_v1) + SBBQ(Imm(0), acc2_v1) + + CMOVQCS(x_ptr, acc4_v1) + CMOVQCS(acc3_v1, acc5_v1) + CMOVQCS(t0_v1, acc0_v1) + CMOVQCS(t1_v1, acc1_v1) + + MOVQ(acc4_v1, Mem{Base: res_ptr}.Offset(8*0)) + MOVQ(acc5_v1, Mem{Base: res_ptr}.Offset(8*1)) + MOVQ(acc0_v1, Mem{Base: res_ptr}.Offset(8*2)) + MOVQ(acc1_v1, Mem{Base: res_ptr}.Offset(8*3)) + + RET() +} + +// Implements: +// +// func p256FromMont(res, in *p256Element) +func p256FromMont() { + Implement("p256FromMont") + Attributes(NOSPLIT) + + Load(Param("res"), res_ptr) + Load(Param("in"), x_ptr) + + MOVQ(Mem{Base: x_ptr}.Offset(8*0), acc0_v1) + MOVQ(Mem{Base: x_ptr}.Offset(8*1), acc1_v1) + MOVQ(Mem{Base: x_ptr}.Offset(8*2), acc2_v1) + MOVQ(Mem{Base: x_ptr}.Offset(8*3), acc3_v1) + XORQ(acc4_v1, acc4_v1) + + Comment("Only reduce, no multiplications are needed") + Comment("First stage") + MOVQ(acc0_v1, RAX) + 
MOVQ(acc0_v1, t1_v1) + SHLQ(Imm(32), acc0_v1) + p256const1 := p256const1_DATA() + MULQ(p256const1) + SHRQ(Imm(32), t1_v1) + ADDQ(acc0_v1, acc1_v1) + ADCQ(t1_v1, acc2_v1) + ADCQ(RAX, acc3_v1) + ADCQ(RDX, acc4_v1) + XORQ(acc5_v1, acc5_v1) + + Comment("Second stage") + MOVQ(acc1_v1, RAX) + MOVQ(acc1_v1, t1_v1) + SHLQ(Imm(32), acc1_v1) + MULQ(p256const1) + SHRQ(Imm(32), t1_v1) + ADDQ(acc1_v1, acc2_v1) + ADCQ(t1_v1, acc3_v1) + ADCQ(RAX, acc4_v1) + ADCQ(RDX, acc5_v1) + XORQ(acc0_v1, acc0_v1) + + Comment("Third stage") + MOVQ(acc2_v1, RAX) + MOVQ(acc2_v1, t1_v1) + SHLQ(Imm(32), acc2_v1) + MULQ(p256const1) + SHRQ(Imm(32), t1_v1) + ADDQ(acc2_v1, acc3_v1) + ADCQ(t1_v1, acc4_v1) + ADCQ(RAX, acc5_v1) + ADCQ(RDX, acc0_v1) + XORQ(acc1_v1, acc1_v1) + + Comment("Last stage") + MOVQ(acc3_v1, RAX) + MOVQ(acc3_v1, t1_v1) + SHLQ(Imm(32), acc3_v1) + MULQ(p256const1) + SHRQ(Imm(32), t1_v1) + ADDQ(acc3_v1, acc4_v1) + ADCQ(t1_v1, acc5_v1) + ADCQ(RAX, acc0_v1) + ADCQ(RDX, acc1_v1) + + MOVQ(acc4_v1, x_ptr) + MOVQ(acc5_v1, acc3_v1) + MOVQ(acc0_v1, t0_v1) + MOVQ(acc1_v1, t1_v1) + + SUBQ(I8(-1), acc4_v1) + p256const0 := p256const0_DATA() + SBBQ(p256const0, acc5_v1) + SBBQ(Imm(0), acc0_v1) + SBBQ(p256const1, acc1_v1) + + CMOVQCS(x_ptr, acc4_v1) + CMOVQCS(acc3_v1, acc5_v1) + CMOVQCS(t0_v1, acc0_v1) + CMOVQCS(t1_v1, acc1_v1) + + MOVQ(acc4_v1, Mem{Base: res_ptr}.Offset(8*0)) + MOVQ(acc5_v1, Mem{Base: res_ptr}.Offset(8*1)) + MOVQ(acc0_v1, Mem{Base: res_ptr}.Offset(8*2)) + MOVQ(acc1_v1, Mem{Base: res_ptr}.Offset(8*3)) + + RET() +} + +// Implements: +// +// func p256Select(res *P256Point, table *p256Table, idx int) +func p256Select() { + Implement("p256Select") + Attributes(NOSPLIT) + + Load(Param("idx"), RAX) + Load(Param("table"), RDI) + Load(Param("res"), RDX) + + PXOR(X15, X15) // X15 = 0 + PCMPEQL(X14, X14) // X14 = -1 + PSUBL(X14, X15) // X15 = 1 + // Force Avo to emit: + // MOVL AX, X14 + Instruction(&ir.Instruction{ + Opcode: "MOVL", + Operands: []Op{ + EAX, X14, + }, + }) + PSHUFD(Imm(0), X14, X14) + + PXOR(X0, X0) + PXOR(X1, X1) + PXOR(X2, X2) + PXOR(X3, X3) + PXOR(X4, X4) + PXOR(X5, X5) + MOVQ(U32(16), RAX) + + MOVOU(X15, X13) + + Label("loop_select") + + MOVOU(X13, X12) + PADDL(X15, X13) + PCMPEQL(X14, X12) + + MOVOU(Mem{Base: DI}.Offset(16*0), X6) + MOVOU(Mem{Base: DI}.Offset(16*1), X7) + MOVOU(Mem{Base: DI}.Offset(16*2), X8) + MOVOU(Mem{Base: DI}.Offset(16*3), X9) + MOVOU(Mem{Base: DI}.Offset(16*4), X10) + MOVOU(Mem{Base: DI}.Offset(16*5), X11) + ADDQ(U8(16*6), RDI) + + PAND(X12, X6) + PAND(X12, X7) + PAND(X12, X8) + PAND(X12, X9) + PAND(X12, X10) + PAND(X12, X11) + + PXOR(X6, X0) + PXOR(X7, X1) + PXOR(X8, X2) + PXOR(X9, X3) + PXOR(X10, X4) + PXOR(X11, X5) + + DECQ(RAX) + JNE(LabelRef("loop_select")) + + MOVOU(X0, Mem{Base: DX}.Offset(16*0)) + MOVOU(X1, Mem{Base: DX}.Offset(16*1)) + MOVOU(X2, Mem{Base: DX}.Offset(16*2)) + MOVOU(X3, Mem{Base: DX}.Offset(16*3)) + MOVOU(X4, Mem{Base: DX}.Offset(16*4)) + MOVOU(X5, Mem{Base: DX}.Offset(16*5)) + + RET() +} + +// Implements: +// +// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int) +func p256SelectAffine() { + Implement("p256SelectAffine") + Attributes(NOSPLIT) + + Load(Param("idx"), RAX) + Load(Param("table"), RDI) + Load(Param("res"), RDX) + + PXOR(X15, X15) // X15 = 0 + PCMPEQL(X14, X14) // X14 = -1 + PSUBL(X14, X15) // X15 = 1 + + // Hack to get Avo to emit: + // MOVL AX, X14 + Instruction(&ir.Instruction{Opcode: "MOVL", Operands: []Op{RAX, X14}}) + + PSHUFD(Imm(0), X14, X14) + + PXOR(X0, X0) + PXOR(X1, X1) + PXOR(X2, X2) + PXOR(X3, X3) 
+ MOVQ(U32(16), RAX) + + MOVOU(X15, X13) + + Label("loop_select_base") + + MOVOU(X13, X12) + PADDL(X15, X13) + PCMPEQL(X14, X12) + + MOVOU(Mem{Base: DI}.Offset(16*0), X4) + MOVOU(Mem{Base: DI}.Offset(16*1), X5) + MOVOU(Mem{Base: DI}.Offset(16*2), X6) + MOVOU(Mem{Base: DI}.Offset(16*3), X7) + + MOVOU(Mem{Base: DI}.Offset(16*4), X8) + MOVOU(Mem{Base: DI}.Offset(16*5), X9) + MOVOU(Mem{Base: DI}.Offset(16*6), X10) + MOVOU(Mem{Base: DI}.Offset(16*7), X11) + + ADDQ(Imm(16*8), RDI) + + PAND(X12, X4) + PAND(X12, X5) + PAND(X12, X6) + PAND(X12, X7) + + MOVOU(X13, X12) + PADDL(X15, X13) + PCMPEQL(X14, X12) + + PAND(X12, X8) + PAND(X12, X9) + PAND(X12, X10) + PAND(X12, X11) + + PXOR(X4, X0) + PXOR(X5, X1) + PXOR(X6, X2) + PXOR(X7, X3) + + PXOR(X8, X0) + PXOR(X9, X1) + PXOR(X10, X2) + PXOR(X11, X3) + + DECQ(RAX) + JNE(LabelRef("loop_select_base")) + + MOVOU(X0, Mem{Base: DX}.Offset(16*0)) + MOVOU(X1, Mem{Base: DX}.Offset(16*1)) + MOVOU(X2, Mem{Base: DX}.Offset(16*2)) + MOVOU(X3, Mem{Base: DX}.Offset(16*3)) + + RET() +} + +// Implements: +// +// func p256OrdMul(res, in1, in2 *p256OrdElement) +func p256OrdMul() { + Implement("p256OrdMul") + Attributes(NOSPLIT) + + Load(Param("res"), res_ptr) + Load(Param("in1"), x_ptr) + Load(Param("in2"), y_ptr) + + Comment("x * y[0]") + MOVQ(Mem{Base: y_ptr}.Offset(8*0), t0_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*0), RAX) + MULQ(t0_v1) + MOVQ(RAX, acc0_v1) + MOVQ(RDX, acc1_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*1), RAX) + MULQ(t0_v1) + ADDQ(RAX, acc1_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, acc2_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX) + MULQ(t0_v1) + ADDQ(RAX, acc2_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, acc3_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX) + MULQ(t0_v1) + ADDQ(RAX, acc3_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, acc4_v1) + XORQ(acc5_v1, acc5_v1) + + Comment("First reduction step") + MOVQ(acc0_v1, RAX) + p256ordK0 := p256ordK0_DATA() + MULQ(p256ordK0) + MOVQ(RAX, t0_v1) + + p256ord := p256ord_DATA() + MOVQ(p256ord.Offset(0x00), RAX) + MULQ(t0_v1) + ADDQ(RAX, acc0_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, t1_v1) + + MOVQ(p256ord.Offset(0x08), RAX) + MULQ(t0_v1) + ADDQ(t1_v1, acc1_v1) + ADCQ(Imm(0), RDX) + ADDQ(RAX, acc1_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, t1_v1) + + MOVQ(p256ord.Offset(0x10), RAX) + MULQ(t0_v1) + ADDQ(t1_v1, acc2_v1) + ADCQ(Imm(0), RDX) + ADDQ(RAX, acc2_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, t1_v1) + + MOVQ(p256ord.Offset(0x18), RAX) + MULQ(t0_v1) + ADDQ(t1_v1, acc3_v1) + ADCQ(Imm(0), RDX) + ADDQ(RAX, acc3_v1) + ADCQ(RDX, acc4_v1) + ADCQ(Imm(0), acc5_v1) + + Comment("x * y[1]") + MOVQ(Mem{Base: y_ptr}.Offset(8*1), t0_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*0), RAX) + MULQ(t0_v1) + ADDQ(RAX, acc1_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, t1_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*1), RAX) + MULQ(t0_v1) + ADDQ(t1_v1, acc2_v1) + ADCQ(Imm(0), RDX) + ADDQ(RAX, acc2_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, t1_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX) + MULQ(t0_v1) + ADDQ(t1_v1, acc3_v1) + ADCQ(Imm(0), RDX) + ADDQ(RAX, acc3_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, t1_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX) + MULQ(t0_v1) + ADDQ(t1_v1, acc4_v1) + ADCQ(Imm(0), RDX) + ADDQ(RAX, acc4_v1) + ADCQ(RDX, acc5_v1) + ADCQ(Imm(0), acc0_v1) + + Comment("Second reduction step") + MOVQ(acc1_v1, RAX) + MULQ(p256ordK0) + MOVQ(RAX, t0_v1) + + MOVQ(p256ord.Offset(0x00), RAX) + MULQ(t0_v1) + ADDQ(RAX, acc1_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, t1_v1) + + MOVQ(p256ord.Offset(0x08), RAX) + MULQ(t0_v1) + ADDQ(t1_v1, acc2_v1) + ADCQ(Imm(0), RDX) + ADDQ(RAX, acc2_v1) + ADCQ(Imm(0), RDX) + 
MOVQ(RDX, t1_v1) + + MOVQ(p256ord.Offset(0x10), RAX) + MULQ(t0_v1) + ADDQ(t1_v1, acc3_v1) + ADCQ(Imm(0), RDX) + ADDQ(RAX, acc3_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, t1_v1) + + MOVQ(p256ord.Offset(0x18), RAX) + MULQ(t0_v1) + ADDQ(t1_v1, acc4_v1) + ADCQ(Imm(0), RDX) + ADDQ(RAX, acc4_v1) + ADCQ(RDX, acc5_v1) + ADCQ(Imm(0), acc0_v1) + + Comment("x * y[2]") + MOVQ(Mem{Base: y_ptr}.Offset(8*2), t0_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*0), RAX) + MULQ(t0_v1) + ADDQ(RAX, acc2_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, t1_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*1), RAX) + MULQ(t0_v1) + ADDQ(t1_v1, acc3_v1) + ADCQ(Imm(0), RDX) + ADDQ(RAX, acc3_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, t1_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX) + MULQ(t0_v1) + ADDQ(t1_v1, acc4_v1) + ADCQ(Imm(0), RDX) + ADDQ(RAX, acc4_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, t1_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX) + MULQ(t0_v1) + ADDQ(t1_v1, acc5_v1) + ADCQ(Imm(0), RDX) + ADDQ(RAX, acc5_v1) + ADCQ(RDX, acc0_v1) + ADCQ(Imm(0), acc1_v1) + + Comment("Third reduction step") + MOVQ(acc2_v1, RAX) + MULQ(p256ordK0) + MOVQ(RAX, t0_v1) + + MOVQ(p256ord.Offset(0x00), RAX) + MULQ(t0_v1) + ADDQ(RAX, acc2_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, t1_v1) + + MOVQ(p256ord.Offset(0x08), RAX) + MULQ(t0_v1) + ADDQ(t1_v1, acc3_v1) + ADCQ(Imm(0), RDX) + ADDQ(RAX, acc3_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, t1_v1) + + MOVQ(p256ord.Offset(0x10), RAX) + MULQ(t0_v1) + ADDQ(t1_v1, acc4_v1) + ADCQ(Imm(0), RDX) + ADDQ(RAX, acc4_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, t1_v1) + + MOVQ(p256ord.Offset(0x18), RAX) + MULQ(t0_v1) + ADDQ(t1_v1, acc5_v1) + ADCQ(Imm(0), RDX) + ADDQ(RAX, acc5_v1) + ADCQ(RDX, acc0_v1) + ADCQ(Imm(0), acc1_v1) + + Comment("x * y[3]") + MOVQ(Mem{Base: y_ptr}.Offset(8*3), t0_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*0), RAX) + MULQ(t0_v1) + ADDQ(RAX, acc3_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, t1_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*1), RAX) + MULQ(t0_v1) + ADDQ(t1_v1, acc4_v1) + ADCQ(Imm(0), RDX) + ADDQ(RAX, acc4_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, t1_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX) + MULQ(t0_v1) + ADDQ(t1_v1, acc5_v1) + ADCQ(Imm(0), RDX) + ADDQ(RAX, acc5_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, t1_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX) + MULQ(t0_v1) + ADDQ(t1_v1, acc0_v1) + ADCQ(Imm(0), RDX) + ADDQ(RAX, acc0_v1) + ADCQ(RDX, acc1_v1) + ADCQ(Imm(0), acc2_v1) + + Comment("Last reduction step") + MOVQ(acc3_v1, RAX) + MULQ(p256ordK0) + MOVQ(RAX, t0_v1) + + MOVQ(p256ord.Offset(0x00), RAX) + MULQ(t0_v1) + ADDQ(RAX, acc3_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, t1_v1) + + MOVQ(p256ord.Offset(0x08), RAX) + MULQ(t0_v1) + ADDQ(t1_v1, acc4_v1) + ADCQ(Imm(0), RDX) + ADDQ(RAX, acc4_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, t1_v1) + + MOVQ(p256ord.Offset(0x10), RAX) + MULQ(t0_v1) + ADDQ(t1_v1, acc5_v1) + ADCQ(Imm(0), RDX) + ADDQ(RAX, acc5_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, t1_v1) + + MOVQ(p256ord.Offset(0x18), RAX) + MULQ(t0_v1) + ADDQ(t1_v1, acc0_v1) + ADCQ(Imm(0), RDX) + ADDQ(RAX, acc0_v1) + ADCQ(RDX, acc1_v1) + ADCQ(Imm(0), acc2_v1) + + Comment("Copy result [255:0]") + MOVQ(acc4_v1, x_ptr) + MOVQ(acc5_v1, acc3_v1) + MOVQ(acc0_v1, t0_v1) + MOVQ(acc1_v1, t1_v1) + + Comment("Subtract p256") + SUBQ(p256ord.Offset(0x00), acc4_v1) + SBBQ(p256ord.Offset(0x08), acc5_v1) + SBBQ(p256ord.Offset(0x10), acc0_v1) + SBBQ(p256ord.Offset(0x18), acc1_v1) + SBBQ(Imm(0), acc2_v1) + + CMOVQCS(x_ptr, acc4_v1) + CMOVQCS(acc3_v1, acc5_v1) + CMOVQCS(t0_v1, acc0_v1) + CMOVQCS(t1_v1, acc1_v1) + + MOVQ(acc4_v1, Mem{Base: res_ptr}.Offset(8*0)) + MOVQ(acc5_v1, Mem{Base: 
res_ptr}.Offset(8*1)) + MOVQ(acc0_v1, Mem{Base: res_ptr}.Offset(8*2)) + MOVQ(acc1_v1, Mem{Base: res_ptr}.Offset(8*3)) + + RET() +} + +// Implements: +// +// func p256OrdSqr(res, in *p256OrdElement, n int) +func p256OrdSqr() { + Implement("p256OrdSqr") + Attributes(NOSPLIT) + + Load(Param("res"), res_ptr) + Load(Param("in"), x_ptr) + Load(Param("n"), RBX) + + Label("ordSqrLoop") + + Comment("y[1:] * y[0]") + MOVQ(Mem{Base: x_ptr}.Offset(8*0), t0_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*1), RAX) + MULQ(t0_v1) + MOVQ(RAX, acc1_v1) + MOVQ(RDX, acc2_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX) + MULQ(t0_v1) + ADDQ(RAX, acc2_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, acc3_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX) + MULQ(t0_v1) + ADDQ(RAX, acc3_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, acc4_v1) + + Comment("y[2:] * y[1]") + MOVQ(Mem{Base: x_ptr}.Offset(8*1), t0_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX) + MULQ(t0_v1) + ADDQ(RAX, acc3_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, t1_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX) + MULQ(t0_v1) + ADDQ(t1_v1, acc4_v1) + ADCQ(Imm(0), RDX) + ADDQ(RAX, acc4_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, acc5_v1) + + Comment("y[3] * y[2]") + MOVQ(Mem{Base: x_ptr}.Offset(8*2), t0_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX) + MULQ(t0_v1) + ADDQ(RAX, acc5_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, y_ptr) + XORQ(t1_v1, t1_v1) + + Comment("*2") + ADDQ(acc1_v1, acc1_v1) + ADCQ(acc2_v1, acc2_v1) + ADCQ(acc3_v1, acc3_v1) + ADCQ(acc4_v1, acc4_v1) + ADCQ(acc5_v1, acc5_v1) + ADCQ(y_ptr, y_ptr) + ADCQ(Imm(0), t1_v1) + + Comment("Missing products") + MOVQ(Mem{Base: x_ptr}.Offset(8*0), RAX) + MULQ(RAX) + MOVQ(RAX, acc0_v1) + MOVQ(RDX, t0_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*1), RAX) + MULQ(RAX) + ADDQ(t0_v1, acc1_v1) + ADCQ(RAX, acc2_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, t0_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX) + MULQ(RAX) + ADDQ(t0_v1, acc3_v1) + ADCQ(RAX, acc4_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, t0_v1) + + MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX) + MULQ(RAX) + ADDQ(t0_v1, acc5_v1) + ADCQ(RAX, y_ptr) + ADCQ(RDX, t1_v1) + MOVQ(t1_v1, x_ptr) + + Comment("First reduction step") + MOVQ(acc0_v1, RAX) + p256ordK0 := p256ordK0_DATA() + MULQ(p256ordK0) + MOVQ(RAX, t0_v1) + + p256ord := p256ord_DATA() + MOVQ(p256ord.Offset(0x00), RAX) + MULQ(t0_v1) + ADDQ(RAX, acc0_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, t1_v1) + + MOVQ(p256ord.Offset(0x08), RAX) + MULQ(t0_v1) + ADDQ(t1_v1, acc1_v1) + ADCQ(Imm(0), RDX) + ADDQ(RAX, acc1_v1) + + MOVQ(t0_v1, t1_v1) + ADCQ(RDX, acc2_v1) + ADCQ(Imm(0), t1_v1) + SUBQ(t0_v1, acc2_v1) + SBBQ(Imm(0), t1_v1) + + MOVQ(t0_v1, RAX) + MOVQ(t0_v1, RDX) + MOVQ(t0_v1, acc0_v1) + SHLQ(Imm(32), RAX) + SHRQ(Imm(32), RDX) + + ADDQ(t1_v1, acc3_v1) + ADCQ(Imm(0), acc0_v1) + SUBQ(RAX, acc3_v1) + SBBQ(RDX, acc0_v1) + + Comment("Second reduction step") + MOVQ(acc1_v1, RAX) + MULQ(p256ordK0) + MOVQ(RAX, t0_v1) + + MOVQ(p256ord.Offset(0x00), RAX) + MULQ(t0_v1) + ADDQ(RAX, acc1_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, t1_v1) + + MOVQ(p256ord.Offset(0x08), RAX) + MULQ(t0_v1) + ADDQ(t1_v1, acc2_v1) + ADCQ(Imm(0), RDX) + ADDQ(RAX, acc2_v1) + + MOVQ(t0_v1, t1_v1) + ADCQ(RDX, acc3_v1) + ADCQ(Imm(0), t1_v1) + SUBQ(t0_v1, acc3_v1) + SBBQ(Imm(0), t1_v1) + + MOVQ(t0_v1, RAX) + MOVQ(t0_v1, RDX) + MOVQ(t0_v1, acc1_v1) + SHLQ(Imm(32), RAX) + SHRQ(Imm(32), RDX) + + ADDQ(t1_v1, acc0_v1) + ADCQ(Imm(0), acc1_v1) + SUBQ(RAX, acc0_v1) + SBBQ(RDX, acc1_v1) + + Comment("Third reduction step") + MOVQ(acc2_v1, RAX) + MULQ(p256ordK0) + MOVQ(RAX, t0_v1) + + MOVQ(p256ord.Offset(0x00), RAX) + MULQ(t0_v1) + ADDQ(RAX, 
acc2_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, t1_v1) + + MOVQ(p256ord.Offset(0x08), RAX) + MULQ(t0_v1) + ADDQ(t1_v1, acc3_v1) + ADCQ(Imm(0), RDX) + ADDQ(RAX, acc3_v1) + + MOVQ(t0_v1, t1_v1) + ADCQ(RDX, acc0_v1) + ADCQ(Imm(0), t1_v1) + SUBQ(t0_v1, acc0_v1) + SBBQ(Imm(0), t1_v1) + + MOVQ(t0_v1, RAX) + MOVQ(t0_v1, RDX) + MOVQ(t0_v1, acc2_v1) + SHLQ(Imm(32), RAX) + SHRQ(Imm(32), RDX) + + ADDQ(t1_v1, acc1_v1) + ADCQ(Imm(0), acc2_v1) + SUBQ(RAX, acc1_v1) + SBBQ(RDX, acc2_v1) + + Comment("Last reduction step") + MOVQ(acc3_v1, RAX) + MULQ(p256ordK0) + MOVQ(RAX, t0_v1) + + MOVQ(p256ord.Offset(0x00), RAX) + MULQ(t0_v1) + ADDQ(RAX, acc3_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, t1_v1) + + MOVQ(p256ord.Offset(0x08), RAX) + MULQ(t0_v1) + ADDQ(t1_v1, acc0_v1) + ADCQ(Imm(0), RDX) + ADDQ(RAX, acc0_v1) + ADCQ(Imm(0), RDX) + MOVQ(RDX, t1_v1) + + MOVQ(t0_v1, t1_v1) + ADCQ(RDX, acc1_v1) + ADCQ(Imm(0), t1_v1) + SUBQ(t0_v1, acc1_v1) + SBBQ(Imm(0), t1_v1) + + MOVQ(t0_v1, RAX) + MOVQ(t0_v1, RDX) + MOVQ(t0_v1, acc3_v1) + SHLQ(Imm(32), RAX) + SHRQ(Imm(32), RDX) + + ADDQ(t1_v1, acc2_v1) + ADCQ(Imm(0), acc3_v1) + SUBQ(RAX, acc2_v1) + SBBQ(RDX, acc3_v1) + XORQ(t0_v1, t0_v1) + + Comment("Add bits [511:256] of the sqr result") + ADCQ(acc4_v1, acc0_v1) + ADCQ(acc5_v1, acc1_v1) + ADCQ(y_ptr, acc2_v1) + ADCQ(x_ptr, acc3_v1) + ADCQ(Imm(0), t0_v1) + + MOVQ(acc0_v1, acc4_v1) + MOVQ(acc1_v1, acc5_v1) + MOVQ(acc2_v1, y_ptr) + MOVQ(acc3_v1, t1_v1) + + Comment("Subtract p256") + SUBQ(p256ord.Offset(0x00), acc0_v1) + SBBQ(p256ord.Offset(0x08), acc1_v1) + SBBQ(p256ord.Offset(0x10), acc2_v1) + SBBQ(p256ord.Offset(0x18), acc3_v1) + SBBQ(Imm(0), t0_v1) + + CMOVQCS(acc4_v1, acc0_v1) + CMOVQCS(acc5_v1, acc1_v1) + CMOVQCS(y_ptr, acc2_v1) + CMOVQCS(t1_v1, acc3_v1) + + MOVQ(acc0_v1, Mem{Base: res_ptr}.Offset(8*0)) + MOVQ(acc1_v1, Mem{Base: res_ptr}.Offset(8*1)) + MOVQ(acc2_v1, Mem{Base: res_ptr}.Offset(8*2)) + MOVQ(acc3_v1, Mem{Base: res_ptr}.Offset(8*3)) + MOVQ(res_ptr, x_ptr) + DECQ(RBX) + JNE(LabelRef("ordSqrLoop")) + + RET() +} + +// These variables have been versioned as they get redfined in the reference implementation. +// This is done to produce a minimal semantic diff. 
+var ( + mul0_v2 = RAX + mul1_v2 = RDX + acc0_v2 = RBX + acc1_v2 = RCX + acc2_v2 = R8 + acc3_v2 = R9 + acc4_v2 = R10 + acc5_v2 = R11 + acc6_v2 = R12 + acc7_v2 = R13 + t0_v2 = R14 + t1_v2 = R15 + t2_v2 = RDI + t3_v2 = RSI + hlp_v2 = RBP +) + +func p256SubInternal() { + Function("p256SubInternal") + Attributes(NOSPLIT) + + XORQ(mul0_v2, mul0_v2) + SUBQ(t0_v2, acc4_v2) + SBBQ(t1_v2, acc5_v2) + SBBQ(t2_v2, acc6_v2) + SBBQ(t3_v2, acc7_v2) + SBBQ(Imm(0), mul0_v2) + + MOVQ(acc4_v2, acc0_v2) + MOVQ(acc5_v2, acc1_v2) + MOVQ(acc6_v2, acc2_v2) + MOVQ(acc7_v2, acc3_v2) + + ADDQ(I8(-1), acc4_v2) + p256const0 := p256const0_DATA() + ADCQ(p256const0, acc5_v2) + ADCQ(Imm(0), acc6_v2) + p256const1 := p256const1_DATA() + ADCQ(p256const1, acc7_v2) + ANDQ(Imm(1), mul0_v2) + + CMOVQEQ(acc0_v2, acc4_v2) + CMOVQEQ(acc1_v2, acc5_v2) + CMOVQEQ(acc2_v2, acc6_v2) + CMOVQEQ(acc3_v2, acc7_v2) + + RET() +} + +func p256MulInternal() { + Function("p256MulInternal") + Attributes(NOSPLIT) + + MOVQ(acc4_v2, mul0_v2) + MULQ(t0_v2) + MOVQ(mul0_v2, acc0_v2) + MOVQ(mul1_v2, acc1_v2) + + MOVQ(acc4_v2, mul0_v2) + MULQ(t1_v2) + ADDQ(mul0_v2, acc1_v2) + ADCQ(Imm(0), mul1_v2) + MOVQ(mul1_v2, acc2_v2) + + MOVQ(acc4_v2, mul0_v2) + MULQ(t2_v2) + ADDQ(mul0_v2, acc2_v2) + ADCQ(Imm(0), mul1_v2) + MOVQ(mul1_v2, acc3_v2) + + MOVQ(acc4_v2, mul0_v2) + MULQ(t3_v2) + ADDQ(mul0_v2, acc3_v2) + ADCQ(Imm(0), mul1_v2) + MOVQ(mul1_v2, acc4_v2) + + MOVQ(acc5_v2, mul0_v2) + MULQ(t0_v2) + ADDQ(mul0_v2, acc1_v2) + ADCQ(Imm(0), mul1_v2) + MOVQ(mul1_v2, hlp_v2) + + MOVQ(acc5_v2, mul0_v2) + MULQ(t1_v2) + ADDQ(hlp_v2, acc2_v2) + ADCQ(Imm(0), mul1_v2) + ADDQ(mul0_v2, acc2_v2) + ADCQ(Imm(0), mul1_v2) + MOVQ(mul1_v2, hlp_v2) + + MOVQ(acc5_v2, mul0_v2) + MULQ(t2_v2) + ADDQ(hlp_v2, acc3_v2) + ADCQ(Imm(0), mul1_v2) + ADDQ(mul0_v2, acc3_v2) + ADCQ(Imm(0), mul1_v2) + MOVQ(mul1_v2, hlp_v2) + + MOVQ(acc5_v2, mul0_v2) + MULQ(t3_v2) + ADDQ(hlp_v2, acc4_v2) + ADCQ(Imm(0), mul1_v2) + ADDQ(mul0_v2, acc4_v2) + ADCQ(Imm(0), mul1_v2) + MOVQ(mul1_v2, acc5_v2) + + MOVQ(acc6_v2, mul0_v2) + MULQ(t0_v2) + ADDQ(mul0_v2, acc2_v2) + ADCQ(Imm(0), mul1_v2) + MOVQ(mul1_v2, hlp_v2) + + MOVQ(acc6_v2, mul0_v2) + MULQ(t1_v2) + ADDQ(hlp_v2, acc3_v2) + ADCQ(Imm(0), mul1_v2) + ADDQ(mul0_v2, acc3_v2) + ADCQ(Imm(0), mul1_v2) + MOVQ(mul1_v2, hlp_v2) + + MOVQ(acc6_v2, mul0_v2) + MULQ(t2_v2) + ADDQ(hlp_v2, acc4_v2) + ADCQ(Imm(0), mul1_v2) + ADDQ(mul0_v2, acc4_v2) + ADCQ(Imm(0), mul1_v2) + MOVQ(mul1_v2, hlp_v2) + + MOVQ(acc6_v2, mul0_v2) + MULQ(t3_v2) + ADDQ(hlp_v2, acc5_v2) + ADCQ(Imm(0), mul1_v2) + ADDQ(mul0_v2, acc5_v2) + ADCQ(Imm(0), mul1_v2) + MOVQ(mul1_v2, acc6_v2) + + MOVQ(acc7_v2, mul0_v2) + MULQ(t0_v2) + ADDQ(mul0_v2, acc3_v2) + ADCQ(Imm(0), mul1_v2) + MOVQ(mul1_v2, hlp_v2) + + MOVQ(acc7_v2, mul0_v2) + MULQ(t1_v2) + ADDQ(hlp_v2, acc4_v2) + ADCQ(Imm(0), mul1_v2) + ADDQ(mul0_v2, acc4_v2) + ADCQ(Imm(0), mul1_v2) + MOVQ(mul1_v2, hlp_v2) + + MOVQ(acc7_v2, mul0_v2) + MULQ(t2_v2) + ADDQ(hlp_v2, acc5_v2) + ADCQ(Imm(0), mul1_v2) + ADDQ(mul0_v2, acc5_v2) + ADCQ(Imm(0), mul1_v2) + MOVQ(mul1_v2, hlp_v2) + + MOVQ(acc7_v2, mul0_v2) + MULQ(t3_v2) + ADDQ(hlp_v2, acc6_v2) + ADCQ(Imm(0), mul1_v2) + ADDQ(mul0_v2, acc6_v2) + ADCQ(Imm(0), mul1_v2) + MOVQ(mul1_v2, acc7_v2) + + Comment("First reduction step") + MOVQ(acc0_v2, mul0_v2) + MOVQ(acc0_v2, hlp_v2) + SHLQ(Imm(32), acc0_v2) + p256const1 := p256const1_DATA() + MULQ(p256const1) + SHRQ(Imm(32), hlp_v2) + ADDQ(acc0_v2, acc1_v2) + ADCQ(hlp_v2, acc2_v2) + ADCQ(mul0_v2, acc3_v2) + ADCQ(Imm(0), mul1_v2) + MOVQ(mul1_v2, acc0_v2) + + Comment("Second reduction step") 
+ MOVQ(acc1_v2, mul0_v2) + MOVQ(acc1_v2, hlp_v2) + SHLQ(Imm(32), acc1_v2) + MULQ(p256const1) + SHRQ(Imm(32), hlp_v2) + ADDQ(acc1_v2, acc2_v2) + ADCQ(hlp_v2, acc3_v2) + ADCQ(mul0_v2, acc0_v2) + ADCQ(Imm(0), mul1_v2) + MOVQ(mul1_v2, acc1_v2) + + Comment("Third reduction step") + MOVQ(acc2_v2, mul0_v2) + MOVQ(acc2_v2, hlp_v2) + SHLQ(Imm(32), acc2_v2) + MULQ(p256const1) + SHRQ(Imm(32), hlp_v2) + ADDQ(acc2_v2, acc3_v2) + ADCQ(hlp_v2, acc0_v2) + ADCQ(mul0_v2, acc1_v2) + ADCQ(Imm(0), mul1_v2) + MOVQ(mul1_v2, acc2_v2) + + Comment("Last reduction step") + MOVQ(acc3_v2, mul0_v2) + MOVQ(acc3_v2, hlp_v2) + SHLQ(Imm(32), acc3_v2) + MULQ(p256const1) + SHRQ(Imm(32), hlp_v2) + ADDQ(acc3_v2, acc0_v2) + ADCQ(hlp_v2, acc1_v2) + ADCQ(mul0_v2, acc2_v2) + ADCQ(Imm(0), mul1_v2) + MOVQ(mul1_v2, acc3_v2) + MOVQ(U32(0), RBP) + + Comment("Add bits [511:256] of the result") + ADCQ(acc0_v2, acc4_v2) + ADCQ(acc1_v2, acc5_v2) + ADCQ(acc2_v2, acc6_v2) + ADCQ(acc3_v2, acc7_v2) + ADCQ(Imm(0), hlp_v2) + + Comment("Copy result") + MOVQ(acc4_v2, acc0_v2) + MOVQ(acc5_v2, acc1_v2) + MOVQ(acc6_v2, acc2_v2) + MOVQ(acc7_v2, acc3_v2) + + Comment("Subtract p256") + SUBQ(I8(-1), acc4_v2) + p256const0 := p256const0_DATA() + SBBQ(p256const0, acc5_v2) + SBBQ(Imm(0), acc6_v2) + SBBQ(p256const1, acc7_v2) + SBBQ(Imm(0), hlp_v2) + + Comment("If the result of the subtraction is negative, restore the previous result") + CMOVQCS(acc0_v2, acc4_v2) + CMOVQCS(acc1_v2, acc5_v2) + CMOVQCS(acc2_v2, acc6_v2) + CMOVQCS(acc3_v2, acc7_v2) + + RET() +} + +func p256SqrInternal() { + Function("p256SqrInternal") + Attributes(NOSPLIT) + + MOVQ(acc4_v2, mul0_v2) + MULQ(acc5_v2) + MOVQ(mul0_v2, acc1_v2) + MOVQ(mul1_v2, acc2_v2) + + MOVQ(acc4_v2, mul0_v2) + MULQ(acc6_v2) + ADDQ(mul0_v2, acc2_v2) + ADCQ(Imm(0), mul1_v2) + MOVQ(mul1_v2, acc3_v2) + + MOVQ(acc4_v2, mul0_v2) + MULQ(acc7_v2) + ADDQ(mul0_v2, acc3_v2) + ADCQ(Imm(0), mul1_v2) + MOVQ(mul1_v2, t0_v2) + + MOVQ(acc5_v2, mul0_v2) + MULQ(acc6_v2) + ADDQ(mul0_v2, acc3_v2) + ADCQ(Imm(0), mul1_v2) + MOVQ(mul1_v2, hlp_v2) + + MOVQ(acc5_v2, mul0_v2) + MULQ(acc7_v2) + ADDQ(hlp_v2, t0_v2) + ADCQ(Imm(0), mul1_v2) + ADDQ(mul0_v2, t0_v2) + ADCQ(Imm(0), mul1_v2) + MOVQ(mul1_v2, t1_v2) + + MOVQ(acc6_v2, mul0_v2) + MULQ(acc7_v2) + ADDQ(mul0_v2, t1_v2) + ADCQ(Imm(0), mul1_v2) + MOVQ(mul1_v2, t2_v2) + XORQ(t3_v2, t3_v2) + + Comment("*2") + ADDQ(acc1_v2, acc1_v2) + ADCQ(acc2_v2, acc2_v2) + ADCQ(acc3_v2, acc3_v2) + ADCQ(t0_v2, t0_v2) + ADCQ(t1_v2, t1_v2) + ADCQ(t2_v2, t2_v2) + ADCQ(Imm(0), t3_v2) + + Comment("Missing products") + MOVQ(acc4_v2, mul0_v2) + MULQ(mul0_v2) + MOVQ(mul0_v2, acc0_v2) + MOVQ(RDX, acc4_v2) + + MOVQ(acc5_v2, mul0_v2) + MULQ(mul0_v2) + ADDQ(acc4_v2, acc1_v2) + ADCQ(mul0_v2, acc2_v2) + ADCQ(Imm(0), RDX) + MOVQ(RDX, acc4_v2) + + MOVQ(acc6_v2, mul0_v2) + MULQ(mul0_v2) + ADDQ(acc4_v2, acc3_v2) + ADCQ(mul0_v2, t0_v2) + ADCQ(Imm(0), RDX) + MOVQ(RDX, acc4_v2) + + MOVQ(acc7_v2, mul0_v2) + MULQ(mul0_v2) + ADDQ(acc4_v2, t1_v2) + ADCQ(mul0_v2, t2_v2) + ADCQ(RDX, t3_v2) + + Comment("First reduction step") + MOVQ(acc0_v2, mul0_v2) + MOVQ(acc0_v2, hlp_v2) + SHLQ(Imm(32), acc0_v2) + p256const1 := p256const1_DATA() + MULQ(p256const1) + SHRQ(Imm(32), hlp_v2) + ADDQ(acc0_v2, acc1_v2) + ADCQ(hlp_v2, acc2_v2) + ADCQ(mul0_v2, acc3_v2) + ADCQ(Imm(0), mul1_v2) + MOVQ(mul1_v2, acc0_v2) + + Comment("Second reduction step") + MOVQ(acc1_v2, mul0_v2) + MOVQ(acc1_v2, hlp_v2) + SHLQ(Imm(32), acc1_v2) + MULQ(p256const1) + SHRQ(Imm(32), hlp_v2) + ADDQ(acc1_v2, acc2_v2) + ADCQ(hlp_v2, acc3_v2) + ADCQ(mul0_v2, acc0_v2) + ADCQ(Imm(0), 
mul1_v2) + MOVQ(mul1_v2, acc1_v2) + + Comment("Third reduction step") + MOVQ(acc2_v2, mul0_v2) + MOVQ(acc2_v2, hlp_v2) + SHLQ(Imm(32), acc2_v2) + MULQ(p256const1) + SHRQ(Imm(32), hlp_v2) + ADDQ(acc2_v2, acc3_v2) + ADCQ(hlp_v2, acc0_v2) + ADCQ(mul0_v2, acc1_v2) + ADCQ(Imm(0), mul1_v2) + MOVQ(mul1_v2, acc2_v2) + + Comment("Last reduction step") + MOVQ(acc3_v2, mul0_v2) + MOVQ(acc3_v2, hlp_v2) + SHLQ(Imm(32), acc3_v2) + MULQ(p256const1) + SHRQ(Imm(32), hlp_v2) + ADDQ(acc3_v2, acc0_v2) + ADCQ(hlp_v2, acc1_v2) + ADCQ(mul0_v2, acc2_v2) + ADCQ(Imm(0), mul1_v2) + MOVQ(mul1_v2, acc3_v2) + MOVQ(U32(0), RBP) + + Comment("Add bits [511:256] of the result") + ADCQ(acc0_v2, t0_v2) + ADCQ(acc1_v2, t1_v2) + ADCQ(acc2_v2, t2_v2) + ADCQ(acc3_v2, t3_v2) + ADCQ(Imm(0), hlp_v2) + + Comment("Copy result") + MOVQ(t0_v2, acc4_v2) + MOVQ(t1_v2, acc5_v2) + MOVQ(t2_v2, acc6_v2) + MOVQ(t3_v2, acc7_v2) + + Comment("Subtract p256") + SUBQ(I8(-1), acc4_v2) + p256const0 := p256const0_DATA() + SBBQ(p256const0, acc5_v2) + SBBQ(Imm(0), acc6_v2) + SBBQ(p256const1, acc7_v2) + SBBQ(Imm(0), hlp_v2) + + Comment("If the result of the subtraction is negative, restore the previous result") + CMOVQCS(t0_v2, acc4_v2) + CMOVQCS(t1_v2, acc5_v2) + CMOVQCS(t2_v2, acc6_v2) + CMOVQCS(t3_v2, acc7_v2) + + RET() +} + +func p256MulBy2Inline() { + XORQ(mul0_v2, mul0_v2) + ADDQ(acc4_v2, acc4_v2) + ADCQ(acc5_v2, acc5_v2) + ADCQ(acc6_v2, acc6_v2) + ADCQ(acc7_v2, acc7_v2) + ADCQ(I8(0), mul0_v2) + MOVQ(acc4_v2, t0_v2) + MOVQ(acc5_v2, t1_v2) + MOVQ(acc6_v2, t2_v2) + MOVQ(acc7_v2, t3_v2) + SUBQ(I8(-1), t0_v2) + p256const0 := p256const0_DATA() + SBBQ(p256const0, t1_v2) + SBBQ(I8(0), t2_v2) + p256const1 := p256const1_DATA() + SBBQ(p256const1, t3_v2) + SBBQ(I8(0), mul0_v2) + CMOVQCS(acc4_v2, t0_v2) + CMOVQCS(acc5_v2, t1_v2) + CMOVQCS(acc6_v2, t2_v2) + CMOVQCS(acc7_v2, t3_v2) +} + +func p256AddInline() { + XORQ(mul0_v2, mul0_v2) + ADDQ(t0_v2, acc4_v2) + ADCQ(t1_v2, acc5_v2) + ADCQ(t2_v2, acc6_v2) + ADCQ(t3_v2, acc7_v2) + ADCQ(I8(0), mul0_v2) + MOVQ(acc4_v2, t0_v2) + MOVQ(acc5_v2, t1_v2) + MOVQ(acc6_v2, t2_v2) + MOVQ(acc7_v2, t3_v2) + SUBQ(I8(-1), t0_v2) + p256const0 := p256const0_DATA() + SBBQ(p256const0, t1_v2) + SBBQ(I8(0), t2_v2) + p256const1 := p256const1_DATA() + SBBQ(p256const1, t3_v2) + SBBQ(I8(0), mul0_v2) + CMOVQCS(acc4_v2, t0_v2) + CMOVQCS(acc5_v2, t1_v2) + CMOVQCS(acc6_v2, t2_v2) + CMOVQCS(acc7_v2, t3_v2) +} + +/* ---------------------------------------*/ + +type MemFunc func(off int) Mem + +func LDacc(src MemFunc) { + MOVQ(src(8*0), acc4_v2) + MOVQ(src(8*1), acc5_v2) + MOVQ(src(8*2), acc6_v2) + MOVQ(src(8*3), acc7_v2) +} + +func LDt(src MemFunc) { + MOVQ(src(8*0), t0_v2) + MOVQ(src(8*1), t1_v2) + MOVQ(src(8*2), t2_v2) + MOVQ(src(8*3), t3_v2) +} + +func ST(dst MemFunc) { + MOVQ(acc4_v2, dst(8*0)) + MOVQ(acc5_v2, dst(8*1)) + MOVQ(acc6_v2, dst(8*2)) + MOVQ(acc7_v2, dst(8*3)) +} + +func STt(dst MemFunc) { + MOVQ(t0_v2, dst(8*0)) + MOVQ(t1_v2, dst(8*1)) + MOVQ(t2_v2, dst(8*2)) + MOVQ(t3_v2, dst(8*3)) +} + +func acc2t() { + MOVQ(acc4_v2, t0_v2) + MOVQ(acc5_v2, t1_v2) + MOVQ(acc6_v2, t2_v2) + MOVQ(acc7_v2, t3_v2) +} + +func t2acc() { + MOVQ(t0_v2, acc4_v2) + MOVQ(t1_v2, acc5_v2) + MOVQ(t2_v2, acc6_v2) + MOVQ(t3_v2, acc7_v2) +} + +/* ---------------------------------------*/ + +// These functions exist as #define macros in the reference implementation. +// +// In the reference assembly, these macros are later undefined and redefined. +// They are implemented here as versioned functions. 
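For context, a sketch of the correspondence this comment describes (the exact macro spelling in the reference file may differ slightly): the hand-written assembly names each stack slot with a preprocessor macro along the lines of

	#define x1in(off) (32*0 + off)(SP)

and later #undef's and redefines the same names for the next point routine. Avo has no preprocessor pass, so every (re)definition becomes a distinct, version-suffixed Go helper returning the equivalent Mem operand, as defined below.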
+ +func x1in_v1(off int) Mem { return Mem{Base: SP}.Offset(32*0 + off) } +func y1in_v1(off int) Mem { return Mem{Base: SP}.Offset(32*1 + off) } +func z1in_v1(off int) Mem { return Mem{Base: SP}.Offset(32*2 + off) } +func x2in_v1(off int) Mem { return Mem{Base: SP}.Offset(32*3 + off) } +func y2in_v1(off int) Mem { return Mem{Base: SP}.Offset(32*4 + off) } +func xout_v1(off int) Mem { return Mem{Base: SP}.Offset(32*5 + off) } +func yout_v1(off int) Mem { return Mem{Base: SP}.Offset(32*6 + off) } +func zout_v1(off int) Mem { return Mem{Base: SP}.Offset(32*7 + off) } +func s2_v1(off int) Mem { return Mem{Base: SP}.Offset(32*8 + off) } +func z1sqr_v1(off int) Mem { return Mem{Base: SP}.Offset(32*9 + off) } +func h_v1(off int) Mem { return Mem{Base: SP}.Offset(32*10 + off) } +func r_v1(off int) Mem { return Mem{Base: SP}.Offset(32*11 + off) } +func hsqr_v1(off int) Mem { return Mem{Base: SP}.Offset(32*12 + off) } +func rsqr_v1(off int) Mem { return Mem{Base: SP}.Offset(32*13 + off) } +func hcub_v1(off int) Mem { return Mem{Base: SP}.Offset(32*14 + off) } + +var ( + rptr_v1 Mem = Mem{Base: SP}.Offset(32*15 + 0) + sel_save_v1 = Mem{Base: SP}.Offset(32*15 + 8) + zero_save_v1 = Mem{Base: SP}.Offset(32*15 + 8 + 4) +) + +// Implements: +// +// func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int) +func p256PointAddAffineAsm() { + Implement("p256PointAddAffineAsm") + AllocLocal(512) + + Load(Param("res"), RAX) + Load(Param("in1"), RBX) + Load(Param("in2"), RCX) + Load(Param("sign"), RDX) + Load(Param("sel"), t1_v2) + Load(Param("zero"), t2_v2) + + MOVOU(Mem{Base: BX}.Offset(16*0), X0) + MOVOU(Mem{Base: BX}.Offset(16*1), X1) + MOVOU(Mem{Base: BX}.Offset(16*2), X2) + MOVOU(Mem{Base: BX}.Offset(16*3), X3) + MOVOU(Mem{Base: BX}.Offset(16*4), X4) + MOVOU(Mem{Base: BX}.Offset(16*5), X5) + + MOVOU(X0, x1in_v1(16*0)) + MOVOU(X1, x1in_v1(16*1)) + MOVOU(X2, y1in_v1(16*0)) + MOVOU(X3, y1in_v1(16*1)) + MOVOU(X4, z1in_v1(16*0)) + MOVOU(X5, z1in_v1(16*1)) + + MOVOU(Mem{Base: CX}.Offset(16*0), X0) + MOVOU(Mem{Base: CX}.Offset(16*1), X1) + + MOVOU(X0, x2in_v1(16*0)) + MOVOU(X1, x2in_v1(16*1)) + + Comment("Store pointer to result") + MOVQ(mul0_v2, rptr_v1) + + // Hack to get Avo to emit: + // MOVL t1, sel_save_v1 + Instruction(&ir.Instruction{ + Opcode: "MOVL", + Operands: []Op{t1_v2, sel_save_v1}, + }) + + // Hack to get Avo to emit: + // MOVL t2_v2, zero_save_v1 + Instruction(&ir.Instruction{ + Opcode: "MOVL", + Operands: []Op{t2_v2, zero_save_v1}, + }) + + Comment("Negate y2in based on sign") + MOVQ(Mem{Base: CX}.Offset(16*2+8*0), acc4_v2) + MOVQ(Mem{Base: CX}.Offset(16*2+8*1), acc5_v2) + MOVQ(Mem{Base: CX}.Offset(16*2+8*2), acc6_v2) + MOVQ(Mem{Base: CX}.Offset(16*2+8*3), acc7_v2) + MOVQ(I32(-1), acc0_v2) + p256const0 := p256const0_DATA() + MOVQ(p256const0, acc1_v2) + MOVQ(U32(0), acc2_v2) + p256const1 := p256const1_DATA() + MOVQ(p256const1, acc3_v2) + XORQ(mul0_v2, mul0_v2) + + Comment("Speculatively subtract") + SUBQ(acc4_v2, acc0_v2) + SBBQ(acc5_v2, acc1_v2) + SBBQ(acc6_v2, acc2_v2) + SBBQ(acc7_v2, acc3_v2) + SBBQ(Imm(0), mul0_v2) + MOVQ(acc0_v2, t0_v2) + MOVQ(acc1_v2, t1_v2) + MOVQ(acc2_v2, t2_v2) + MOVQ(acc3_v2, t3_v2) + + Comment("Add in case the operand was > p256") + ADDQ(I8(-1), acc0_v2) + ADCQ(p256const0, acc1_v2) + ADCQ(Imm(0), acc2_v2) + ADCQ(p256const1, acc3_v2) + ADCQ(Imm(0), mul0_v2) + CMOVQNE(t0_v2, acc0_v2) + CMOVQNE(t1_v2, acc1_v2) + CMOVQNE(t2_v2, acc2_v2) + CMOVQNE(t3_v2, acc3_v2) + + Comment("If condition is 0, keep original value") + TESTQ(RDX, RDX) + 
CMOVQEQ(acc4_v2, acc0_v2) + CMOVQEQ(acc5_v2, acc1_v2) + CMOVQEQ(acc6_v2, acc2_v2) + CMOVQEQ(acc7_v2, acc3_v2) + + Comment("Store result") + MOVQ(acc0_v2, y2in_v1(8*0)) + MOVQ(acc1_v2, y2in_v1(8*1)) + MOVQ(acc2_v2, y2in_v1(8*2)) + MOVQ(acc3_v2, y2in_v1(8*3)) + + Comment("Begin point add") + LDacc(z1in_v1) + CALL(LabelRef("p256SqrInternal(SB)")) // z1ˆ2 + ST(z1sqr_v1) + + LDt(x2in_v1) + CALL(LabelRef("p256MulInternal(SB)")) // x2 * z1ˆ2 + + LDt(x1in_v1) + CALL(LabelRef("p256SubInternal(SB)")) // h = u2 - u1) + ST(h_v1) + + LDt(z1in_v1) + CALL(LabelRef("p256MulInternal(SB)")) // z3 = h * z1 + ST(zout_v1) + + LDacc(z1sqr_v1) + CALL(LabelRef("p256MulInternal(SB)")) // z1ˆ3 + + LDt(y2in_v1) + CALL(LabelRef("p256MulInternal(SB)")) // s2 = y2 * z1ˆ3 + ST(s2_v1) + + LDt(y1in_v1) + CALL(LabelRef("p256SubInternal(SB)")) // r = s2 - s1) + ST(r_v1) + + CALL(LabelRef("p256SqrInternal(SB)")) // rsqr = rˆ2 + ST(rsqr_v1) + + LDacc(h_v1) + CALL(LabelRef("p256SqrInternal(SB)")) // hsqr = hˆ2 + ST(hsqr_v1) + + LDt(h_v1) + CALL(LabelRef("p256MulInternal(SB)")) // hcub = hˆ3 + ST(hcub_v1) + + LDt(y1in_v1) + CALL(LabelRef("p256MulInternal(SB)")) // y1 * hˆ3 + ST(s2_v1) + + LDacc(x1in_v1) + LDt(hsqr_v1) + CALL(LabelRef("p256MulInternal(SB)")) // u1 * hˆ2 + ST(h_v1) + + p256MulBy2Inline() // u1 * hˆ2 * 2, inline + LDacc(rsqr_v1) + CALL(LabelRef("p256SubInternal(SB)")) // rˆ2 - u1 * hˆ2 * 2) + + LDt(hcub_v1) + CALL(LabelRef("p256SubInternal(SB)")) + ST(xout_v1) + + MOVQ(acc4_v2, t0_v2) + MOVQ(acc5_v2, t1_v2) + MOVQ(acc6_v2, t2_v2) + MOVQ(acc7_v2, t3_v2) + LDacc(h_v1) + CALL(LabelRef("p256SubInternal(SB)")) + + LDt(r_v1) + CALL(LabelRef("p256MulInternal(SB)")) + + LDt(s2_v1) + CALL(LabelRef("p256SubInternal(SB)")) + ST(yout_v1) + + Comment("Load stored values from stack") + MOVQ(rptr_v1, RAX) + MOVL(sel_save_v1, EBX) + MOVL(zero_save_v1, ECX) + + Comment("The result is not valid if (sel == 0), conditional choose") + MOVOU(xout_v1(16*0), X0) + MOVOU(xout_v1(16*1), X1) + MOVOU(yout_v1(16*0), X2) + MOVOU(yout_v1(16*1), X3) + MOVOU(zout_v1(16*0), X4) + MOVOU(zout_v1(16*1), X5) + + // Hack to get Avo to emit: + // MOVL BX, X6 + Instruction(&ir.Instruction{ + Opcode: "MOVL", + Operands: []Op{EBX, X6}, + }) + + // Hack to get Avo to emit: + // MOVL CX, X7 + Instruction(&ir.Instruction{ + Opcode: "MOVL", + Operands: []Op{ECX, X7}, + }) + + PXOR(X8, X8) + PCMPEQL(X9, X9) + + PSHUFD(Imm(0), X6, X6) + PSHUFD(Imm(0), X7, X7) + + PCMPEQL(X8, X6) + PCMPEQL(X8, X7) + + MOVOU(X6, X15) + PANDN(X9, X15) + + MOVOU(x1in_v1(16*0), X9) + MOVOU(x1in_v1(16*1), X10) + MOVOU(y1in_v1(16*0), X11) + MOVOU(y1in_v1(16*1), X12) + MOVOU(z1in_v1(16*0), X13) + MOVOU(z1in_v1(16*1), X14) + + PAND(X15, X0) + PAND(X15, X1) + PAND(X15, X2) + PAND(X15, X3) + PAND(X15, X4) + PAND(X15, X5) + + PAND(X6, X9) + PAND(X6, X10) + PAND(X6, X11) + PAND(X6, X12) + PAND(X6, X13) + PAND(X6, X14) + + PXOR(X9, X0) + PXOR(X10, X1) + PXOR(X11, X2) + PXOR(X12, X3) + PXOR(X13, X4) + PXOR(X14, X5) + + Comment("Similarly if zero == 0") + PCMPEQL(X9, X9) + MOVOU(X7, X15) + PANDN(X9, X15) + + MOVOU(x2in_v1(16*0), X9) + MOVOU(x2in_v1(16*1), X10) + MOVOU(y2in_v1(16*0), X11) + MOVOU(y2in_v1(16*1), X12) + p256one := p256one_DATA() + MOVOU(p256one.Offset(0x00), X13) + MOVOU(p256one.Offset(0x10), X14) + + PAND(X15, X0) + PAND(X15, X1) + PAND(X15, X2) + PAND(X15, X3) + PAND(X15, X4) + PAND(X15, X5) + + PAND(X7, X9) + PAND(X7, X10) + PAND(X7, X11) + PAND(X7, X12) + PAND(X7, X13) + PAND(X7, X14) + + PXOR(X9, X0) + PXOR(X10, X1) + PXOR(X11, X2) + PXOR(X12, X3) + PXOR(X13, X4) + PXOR(X14, 
X5) + + Comment("Finally output the result") + MOVOU(X0, Mem{Base: AX}.Offset(16*0)) + MOVOU(X1, Mem{Base: AX}.Offset(16*1)) + MOVOU(X2, Mem{Base: AX}.Offset(16*2)) + MOVOU(X3, Mem{Base: AX}.Offset(16*3)) + MOVOU(X4, Mem{Base: AX}.Offset(16*4)) + MOVOU(X5, Mem{Base: AX}.Offset(16*5)) + MOVQ(U32(0), rptr_v1) + + RET() +} + +// p256IsZero returns 1 in AX if [acc4..acc7] represents zero and zero +// otherwise. It writes to [acc4..acc7], t0 and t1. +func p256IsZero() { + Function("p256IsZero") + Attributes(NOSPLIT) + + Comment("AX contains a flag that is set if the input is zero.") + XORQ(RAX, RAX) + MOVQ(U32(1), t1_v2) + + Comment("Check whether [acc4..acc7] are all zero.") + MOVQ(acc4_v2, t0_v2) + ORQ(acc5_v2, t0_v2) + ORQ(acc6_v2, t0_v2) + ORQ(acc7_v2, t0_v2) + + Comment("Set the zero flag if so. (CMOV of a constant to a register doesn't") + Comment("appear to be supported in Go. Thus t1 = 1.)") + CMOVQEQ(t1_v2, RAX) + + Comment("XOR [acc4..acc7] with P and compare with zero again.") + XORQ(I8(-1), acc4_v2) + p256const0 := p256const0_DATA() + XORQ(p256const0, acc5_v2) + p256const1 := p256const1_DATA() + XORQ(p256const1, acc7_v2) + ORQ(acc5_v2, acc4_v2) + ORQ(acc6_v2, acc4_v2) + ORQ(acc7_v2, acc4_v2) + + Comment("Set the zero flag if so.") + CMOVQEQ(t1_v2, RAX) + RET() +} + +func x1in_v2(off int) Mem { return Mem{Base: SP}.Offset(32*0 + off) } +func y1in_v2(off int) Mem { return Mem{Base: SP}.Offset(32*1 + off) } +func z1in_v2(off int) Mem { return Mem{Base: SP}.Offset(32*2 + off) } +func x2in_v2(off int) Mem { return Mem{Base: SP}.Offset(32*3 + off) } +func y2in_v2(off int) Mem { return Mem{Base: SP}.Offset(32*4 + off) } +func z2in_v2(off int) Mem { return Mem{Base: SP}.Offset(32*5 + off) } + +func xout_v2(off int) Mem { return Mem{Base: SP}.Offset(32*6 + off) } +func yout_v2(off int) Mem { return Mem{Base: SP}.Offset(32*7 + off) } +func zout_v2(off int) Mem { return Mem{Base: SP}.Offset(32*8 + off) } + +func u1_v2(off int) Mem { return Mem{Base: SP}.Offset(32*9 + off) } +func u2_v2(off int) Mem { return Mem{Base: SP}.Offset(32*10 + off) } +func s1_v2(off int) Mem { return Mem{Base: SP}.Offset(32*11 + off) } +func s2_v2(off int) Mem { return Mem{Base: SP}.Offset(32*12 + off) } +func z1sqr_v2(off int) Mem { return Mem{Base: SP}.Offset(32*13 + off) } +func z2sqr_v2(off int) Mem { return Mem{Base: SP}.Offset(32*14 + off) } +func h_v2(off int) Mem { return Mem{Base: SP}.Offset(32*15 + off) } +func r_v2(off int) Mem { return Mem{Base: SP}.Offset(32*16 + off) } +func hsqr_v2(off int) Mem { return Mem{Base: SP}.Offset(32*17 + off) } +func rsqr_v2(off int) Mem { return Mem{Base: SP}.Offset(32*18 + off) } +func hcub_v2(off int) Mem { return Mem{Base: SP}.Offset(32*19 + off) } + +var ( + rptr_v2 Mem = Mem{Base: SP}.Offset(32 * 20) + points_eq_v2 = Mem{Base: SP}.Offset(32*20 + 8) +) + +// Implements: +// +// func p256PointAddAsm(res, in1, in2 *P256Point) int +// +// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl +func p256PointAddAsm() { + Implement("p256PointAddAsm") + AllocLocal(680) + + Comment("Move input to stack in order to free registers") + Load(Param("res"), RAX) + Load(Param("in1"), RBX) + Load(Param("in2"), RCX) + + MOVOU(Mem{Base: BX}.Offset(16*0), X0) + MOVOU(Mem{Base: BX}.Offset(16*1), X1) + MOVOU(Mem{Base: BX}.Offset(16*2), X2) + MOVOU(Mem{Base: BX}.Offset(16*3), X3) + MOVOU(Mem{Base: BX}.Offset(16*4), X4) + MOVOU(Mem{Base: BX}.Offset(16*5), X5) + + MOVOU(X0, x1in_v2(16*0)) + MOVOU(X1, x1in_v2(16*1)) + MOVOU(X2, y1in_v2(16*0)) + MOVOU(X3, 
y1in_v2(16*1)) + MOVOU(X4, z1in_v2(16*0)) + MOVOU(X5, z1in_v2(16*1)) + + MOVOU(Mem{Base: CX}.Offset(16*0), X0) + MOVOU(Mem{Base: CX}.Offset(16*1), X1) + MOVOU(Mem{Base: CX}.Offset(16*2), X2) + MOVOU(Mem{Base: CX}.Offset(16*3), X3) + MOVOU(Mem{Base: CX}.Offset(16*4), X4) + MOVOU(Mem{Base: CX}.Offset(16*5), X5) + + MOVOU(X0, x2in_v2(16*0)) + MOVOU(X1, x2in_v2(16*1)) + MOVOU(X2, y2in_v2(16*0)) + MOVOU(X3, y2in_v2(16*1)) + MOVOU(X4, z2in_v2(16*0)) + MOVOU(X5, z2in_v2(16*1)) + + Comment("Store pointer to result") + MOVQ(RAX, rptr_v2) + + Comment("Begin point add") + LDacc(z2in_v2) + CALL(LabelRef("p256SqrInternal(SB)")) // z2ˆ2 + ST(z2sqr_v2) + LDt(z2in_v2) + CALL(LabelRef("p256MulInternal(SB)")) // z2ˆ3 + LDt(y1in_v2) + CALL(LabelRef("p256MulInternal(SB)")) // s1 = z2ˆ3*y1 + ST(s1_v2) + + LDacc(z1in_v2) + CALL(LabelRef("p256SqrInternal(SB)")) // z1ˆ2 + ST(z1sqr_v2) + LDt(z1in_v2) + CALL(LabelRef("p256MulInternal(SB)")) // z1ˆ3 + LDt(y2in_v2) + CALL(LabelRef("p256MulInternal(SB)")) // s2 = z1ˆ3*y2 + ST(s2_v2) + + LDt(s1_v2) + CALL(LabelRef("p256SubInternal(SB)")) // r = s2 - s1 + ST(r_v2) + CALL(LabelRef("p256IsZero(SB)")) + MOVQ(RAX, points_eq_v2) + + LDacc(z2sqr_v2) + LDt(x1in_v2) + CALL(LabelRef("p256MulInternal(SB)")) // u1 = x1 * z2ˆ2 + ST(u1_v2) + LDacc(z1sqr_v2) + LDt(x2in_v2) + CALL(LabelRef("p256MulInternal(SB)")) // u2 = x2 * z1ˆ2 + ST(u2_v2) + + LDt(u1_v2) + CALL(LabelRef("p256SubInternal(SB)")) // h = u2 - u1 + ST(h_v2) + CALL(LabelRef("p256IsZero(SB)")) + ANDQ(points_eq_v2, RAX) + MOVQ(RAX, points_eq_v2) + + LDacc(r_v2) + CALL(LabelRef("p256SqrInternal(SB)")) // rsqr = rˆ2 + ST(rsqr_v2) + + LDacc(h_v2) + CALL(LabelRef("p256SqrInternal(SB)")) // hsqr = hˆ2 + ST(hsqr_v2) + + LDt(h_v2) + CALL(LabelRef("p256MulInternal(SB)")) // hcub = hˆ3 + ST(hcub_v2) + + LDt(s1_v2) + CALL(LabelRef("p256MulInternal(SB)")) + ST(s2_v2) + + LDacc(z1in_v2) + LDt(z2in_v2) + CALL(LabelRef("p256MulInternal(SB)")) // z1 * z2 + LDt(h_v2) + CALL(LabelRef("p256MulInternal(SB)")) // z1 * z2 * h + ST(zout_v2) + + LDacc(hsqr_v2) + LDt(u1_v2) + CALL(LabelRef("p256MulInternal(SB)")) // hˆ2 * u1 + ST(u2_v2) + + p256MulBy2Inline() // u1 * hˆ2 * 2, inline + LDacc(rsqr_v2) + CALL(LabelRef("p256SubInternal(SB)")) // rˆ2 - u1 * hˆ2 * 2 + + LDt(hcub_v2) + CALL(LabelRef("p256SubInternal(SB)")) + ST(xout_v2) + + MOVQ(acc4_v2, t0_v2) + MOVQ(acc5_v2, t1_v2) + MOVQ(acc6_v2, t2_v2) + MOVQ(acc7_v2, t3_v2) + LDacc(u2_v2) + CALL(LabelRef("p256SubInternal(SB)")) + + LDt(r_v2) + CALL(LabelRef("p256MulInternal(SB)")) + + LDt(s2_v2) + CALL(LabelRef("p256SubInternal(SB)")) + ST(yout_v2) + + MOVOU(xout_v2(16*0), X0) + MOVOU(xout_v2(16*1), X1) + MOVOU(yout_v2(16*0), X2) + MOVOU(yout_v2(16*1), X3) + MOVOU(zout_v2(16*0), X4) + MOVOU(zout_v2(16*1), X5) + + Comment("Finally output the result") + MOVQ(rptr_v2, RAX) + MOVQ(U32(0), rptr_v2) + MOVOU(X0, Mem{Base: AX}.Offset(16*0)) + MOVOU(X1, Mem{Base: AX}.Offset(16*1)) + MOVOU(X2, Mem{Base: AX}.Offset(16*2)) + MOVOU(X3, Mem{Base: AX}.Offset(16*3)) + MOVOU(X4, Mem{Base: AX}.Offset(16*4)) + MOVOU(X5, Mem{Base: AX}.Offset(16*5)) + + MOVQ(points_eq_v2, RAX) + ret := NewParamAddr("ret", 24) + MOVQ(RAX, ret) + + RET() +} + +func x(off int) Mem { return Mem{Base: SP}.Offset(32*0 + off) } +func y(off int) Mem { return Mem{Base: SP}.Offset(32*1 + off) } +func z(off int) Mem { return Mem{Base: SP}.Offset(32*2 + off) } + +func s(off int) Mem { return Mem{Base: SP}.Offset(32*3 + off) } +func m(off int) Mem { return Mem{Base: SP}.Offset(32*4 + off) } +func zsqr(off int) Mem { return Mem{Base: SP}.Offset(32*5 
+ off) } +func tmp(off int) Mem { return Mem{Base: SP}.Offset(32*6 + off) } + +var rptr_v3 = Mem{Base: SP}.Offset(32 * 7) + +// Implements: +// +// func p256PointDoubleAsm(res, in *P256Point) +func p256PointDoubleAsm() { + Implement("p256PointDoubleAsm") + Attributes(NOSPLIT) + AllocLocal(256) + + Load(Param("res"), RAX) + Load(Param("in"), RBX) + + MOVOU(Mem{Base: BX}.Offset(16*0), X0) + MOVOU(Mem{Base: BX}.Offset(16*1), X1) + MOVOU(Mem{Base: BX}.Offset(16*2), X2) + MOVOU(Mem{Base: BX}.Offset(16*3), X3) + MOVOU(Mem{Base: BX}.Offset(16*4), X4) + MOVOU(Mem{Base: BX}.Offset(16*5), X5) + + MOVOU(X0, x(16*0)) + MOVOU(X1, x(16*1)) + MOVOU(X2, y(16*0)) + MOVOU(X3, y(16*1)) + MOVOU(X4, z(16*0)) + MOVOU(X5, z(16*1)) + + Comment("Store pointer to result") + MOVQ(RAX, rptr_v3) + + Comment("Begin point double") + LDacc(z) + CALL(LabelRef("p256SqrInternal(SB)")) + ST(zsqr) + + LDt(x) + p256AddInline() + STt(m) + + LDacc(z) + LDt(y) + CALL(LabelRef("p256MulInternal(SB)")) + p256MulBy2Inline() + MOVQ(rptr_v3, RAX) + + Comment("Store z") + MOVQ(t0_v2, Mem{Base: AX}.Offset(16*4+8*0)) + MOVQ(t1_v2, Mem{Base: AX}.Offset(16*4+8*1)) + MOVQ(t2_v2, Mem{Base: AX}.Offset(16*4+8*2)) + MOVQ(t3_v2, Mem{Base: AX}.Offset(16*4+8*3)) + + LDacc(x) + LDt(zsqr) + CALL(LabelRef("p256SubInternal(SB)")) + LDt(m) + CALL(LabelRef("p256MulInternal(SB)")) + ST(m) + + Comment("Multiply by 3") + p256MulBy2Inline() + LDacc(m) + p256AddInline() + STt(m) + Comment("////////////////////////") + LDacc(y) + p256MulBy2Inline() + t2acc() + CALL(LabelRef("p256SqrInternal(SB)")) + ST(s) + CALL(LabelRef("p256SqrInternal(SB)")) + + Comment("Divide by 2") + XORQ(mul0_v2, mul0_v2) + MOVQ(acc4_v2, t0_v2) + MOVQ(acc5_v2, t1_v2) + MOVQ(acc6_v2, t2_v2) + MOVQ(acc7_v2, t3_v2) + + ADDQ(I8(-1), acc4_v2) + p256const0 := p256const0_DATA() + ADCQ(p256const0, acc5_v2) + ADCQ(Imm(0), acc6_v2) + p256const1 := p256const1_DATA() + ADCQ(p256const1, acc7_v2) + ADCQ(Imm(0), mul0_v2) + TESTQ(U32(1), t0_v2) + + CMOVQEQ(t0_v2, acc4_v2) + CMOVQEQ(t1_v2, acc5_v2) + CMOVQEQ(t2_v2, acc6_v2) + CMOVQEQ(t3_v2, acc7_v2) + ANDQ(t0_v2, mul0_v2) + + SHRQ(Imm(1), acc5_v2, acc4_v2) + SHRQ(Imm(1), acc6_v2, acc5_v2) + SHRQ(Imm(1), acc7_v2, acc6_v2) + SHRQ(Imm(1), mul0_v2, acc7_v2) + ST(y) + Comment("/////////////////////////") + LDacc(x) + LDt(s) + CALL(LabelRef("p256MulInternal(SB)")) + ST(s) + p256MulBy2Inline() + STt(tmp) + + LDacc(m) + CALL(LabelRef("p256SqrInternal(SB)")) + LDt(tmp) + CALL(LabelRef("p256SubInternal(SB)")) + + MOVQ(rptr_v3, RAX) + + Comment("Store x") + MOVQ(acc4_v2, Mem{Base: AX}.Offset(16*0+8*0)) + MOVQ(acc5_v2, Mem{Base: AX}.Offset(16*0+8*1)) + MOVQ(acc6_v2, Mem{Base: AX}.Offset(16*0+8*2)) + MOVQ(acc7_v2, Mem{Base: AX}.Offset(16*0+8*3)) + + acc2t() + LDacc(s) + CALL(LabelRef("p256SubInternal(SB)")) + + LDt(m) + CALL(LabelRef("p256MulInternal(SB)")) + + LDt(y) + CALL(LabelRef("p256SubInternal(SB)")) + MOVQ(rptr_v3, RAX) + + Comment("Store y") + MOVQ(acc4_v2, Mem{Base: AX}.Offset(16*2+8*0)) + MOVQ(acc5_v2, Mem{Base: AX}.Offset(16*2+8*1)) + MOVQ(acc6_v2, Mem{Base: AX}.Offset(16*2+8*2)) + MOVQ(acc7_v2, Mem{Base: AX}.Offset(16*2+8*3)) + Comment("///////////////////////") + MOVQ(U32(0), rptr_v3) + + RET() +} + +// #----------------------------DATA SECTION-----------------------------------## + +// Pointers for memoizing Data section symbols +var p256const0_ptr, p256const1_ptr, p256ordK0_ptr, p256ord_ptr, p256one_ptr *Mem + +func p256const0_DATA() Mem { + if p256const0_ptr != nil { + return *p256const0_ptr + } + + p256const0 := GLOBL("p256const0", 8) + 
p256const0_ptr = &p256const0 + DATA(0, U64(0x00000000ffffffff)) + return p256const0 +} + +func p256const1_DATA() Mem { + if p256const1_ptr != nil { + return *p256const1_ptr + } + + p256const1 := GLOBL("p256const1", 8) + p256const1_ptr = &p256const1 + DATA(0, U64(0xffffffff00000001)) + return p256const1 +} + +func p256ordK0_DATA() Mem { + if p256ordK0_ptr != nil { + return *p256ordK0_ptr + } + + p256ordK0 := GLOBL("p256ordK0", 8) + p256ordK0_ptr = &p256ordK0 + DATA(0, U64(0xccd1c8aaee00bc4f)) + return p256ordK0 +} + +var p256ordConstants = [4]uint64{ + 0xf3b9cac2fc632551, + 0xbce6faada7179e84, + 0xffffffffffffffff, + 0xffffffff00000000, +} + +func p256ord_DATA() Mem { + if p256ord_ptr != nil { + return *p256ord_ptr + } + + p256ord := GLOBL("p256ord", 8) + p256ord_ptr = &p256ord + + for i, k := range p256ordConstants { + DATA(i*8, U64(k)) + } + + return p256ord +} + +var p256oneConstants = [4]uint64{ + 0x0000000000000001, + 0xffffffff00000000, + 0xffffffffffffffff, + 0x00000000fffffffe, +} + +func p256one_DATA() Mem { + if p256one_ptr != nil { + return *p256one_ptr + } + + p256one := GLOBL("p256one", 8) + p256one_ptr = &p256one + + for i, k := range p256oneConstants { + DATA(i*8, U64(k)) + } + + return p256one +} + +const ThatPeskyUnicodeDot = "\u00b7" + +// removePeskyUnicodeDot strips the dot from the relevant TEXT directives such that they +// can exist as internal assembly functions +// +// Avo v0.6.0 does not support the generation of internal assembly functions. Go's unicode +// dot tells the compiler to link a TEXT symbol to a function in the current Go package +// (or another package if specified). Avo unconditionally prepends the unicode dot to all +// TEXT symbols, making it impossible to emit an internal function without this hack. +// +// There is a pending PR to add internal functions to Avo: +// https://github.com/mmcloughlin/avo/pull/443 +// +// If merged it should allow the usage of InternalFunction("NAME") for the specified functions +func removePeskyUnicodeDot(internalFunctions []string, target string) { + bytes, err := os.ReadFile(target) + if err != nil { + panic(err) + } + + content := string(bytes) + + for _, from := range internalFunctions { + to := strings.ReplaceAll(from, ThatPeskyUnicodeDot, "") + content = strings.ReplaceAll(content, from, to) + } + + err = os.WriteFile(target, []byte(content), 0644) + if err != nil { + panic(err) + } +} diff --git a/src/crypto/internal/nistec/p256_asm_amd64.s b/src/crypto/internal/nistec/p256_asm_amd64.s index f5c008319bd7f..501e094266a05 100644 --- a/src/crypto/internal/nistec/p256_asm_amd64.s +++ b/src/crypto/internal/nistec/p256_asm_amd64.s @@ -1,2352 +1,2455 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. +// Code generated by command: go run p256_asm_amd64.go -out ../p256_asm_amd64.s -pkg nistec. DO NOT EDIT. //go:build !purego -// This file contains constant-time, 64-bit assembly implementation of -// P256. 
The optimizations performed here are described in detail in: -// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with -// 256-bit primes" -// https://link.springer.com/article/10.1007%2Fs13389-014-0090-x -// https://eprint.iacr.org/2013/816.pdf - #include "textflag.h" -#define res_ptr DI -#define x_ptr SI -#define y_ptr CX - -#define acc0 R8 -#define acc1 R9 -#define acc2 R10 -#define acc3 R11 -#define acc4 R12 -#define acc5 R13 -#define t0 R14 -#define t1 R15 - -DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff -DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001 -DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f -DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551 -DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84 -DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff -DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000 -DATA p256one<>+0x00(SB)/8, $0x0000000000000001 -DATA p256one<>+0x08(SB)/8, $0xffffffff00000000 -DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff -DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe -GLOBL p256const0<>(SB), 8, $8 -GLOBL p256const1<>(SB), 8, $8 -GLOBL p256ordK0<>(SB), 8, $8 -GLOBL p256ord<>(SB), 8, $32 -GLOBL p256one<>(SB), 8, $32 - -/* ---------------------------------------*/ // func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement) -TEXT ·p256OrdLittleToBig(SB),NOSPLIT,$0 +TEXT ·p256OrdLittleToBig(SB), NOSPLIT, $0-16 JMP ·p256BigToLittle(SB) -/* ---------------------------------------*/ + // func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte) -TEXT ·p256OrdBigToLittle(SB),NOSPLIT,$0 +TEXT ·p256OrdBigToLittle(SB), NOSPLIT, $0-16 JMP ·p256BigToLittle(SB) -/* ---------------------------------------*/ + // func p256LittleToBig(res *[32]byte, in *p256Element) -TEXT ·p256LittleToBig(SB),NOSPLIT,$0 +TEXT ·p256LittleToBig(SB), NOSPLIT, $0-16 JMP ·p256BigToLittle(SB) -/* ---------------------------------------*/ -// func p256BigToLittle(res *p256Element, in *[32]byte) -TEXT ·p256BigToLittle(SB),NOSPLIT,$0 - MOVQ res+0(FP), res_ptr - MOVQ in+8(FP), x_ptr - - MOVQ (8*0)(x_ptr), acc0 - MOVQ (8*1)(x_ptr), acc1 - MOVQ (8*2)(x_ptr), acc2 - MOVQ (8*3)(x_ptr), acc3 - - BSWAPQ acc0 - BSWAPQ acc1 - BSWAPQ acc2 - BSWAPQ acc3 - - MOVQ acc3, (8*0)(res_ptr) - MOVQ acc2, (8*1)(res_ptr) - MOVQ acc1, (8*2)(res_ptr) - MOVQ acc0, (8*3)(res_ptr) +// func p256BigToLittle(res *p256Element, in *[32]byte) +TEXT ·p256BigToLittle(SB), NOSPLIT, $0-16 + MOVQ res+0(FP), DI + MOVQ in+8(FP), SI + MOVQ (SI), R8 + MOVQ 8(SI), R9 + MOVQ 16(SI), R10 + MOVQ 24(SI), R11 + BSWAPQ R8 + BSWAPQ R9 + BSWAPQ R10 + BSWAPQ R11 + MOVQ R11, (DI) + MOVQ R10, 8(DI) + MOVQ R9, 16(DI) + MOVQ R8, 24(DI) RET -/* ---------------------------------------*/ -// func p256MovCond(res, a, b *P256Point, cond int) -TEXT ·p256MovCond(SB),NOSPLIT,$0 - MOVQ res+0(FP), res_ptr - MOVQ a+8(FP), x_ptr - MOVQ b+16(FP), y_ptr - MOVQ cond+24(FP), X12 - - PXOR X13, X13 - PSHUFD $0, X12, X12 - PCMPEQL X13, X12 - - MOVOU X12, X0 - MOVOU (16*0)(x_ptr), X6 - PANDN X6, X0 - MOVOU X12, X1 - MOVOU (16*1)(x_ptr), X7 - PANDN X7, X1 - MOVOU X12, X2 - MOVOU (16*2)(x_ptr), X8 - PANDN X8, X2 - MOVOU X12, X3 - MOVOU (16*3)(x_ptr), X9 - PANDN X9, X3 - MOVOU X12, X4 - MOVOU (16*4)(x_ptr), X10 - PANDN X10, X4 - MOVOU X12, X5 - MOVOU (16*5)(x_ptr), X11 - PANDN X11, X5 - - MOVOU (16*0)(y_ptr), X6 - MOVOU (16*1)(y_ptr), X7 - MOVOU (16*2)(y_ptr), X8 - MOVOU (16*3)(y_ptr), X9 - MOVOU (16*4)(y_ptr), X10 - MOVOU (16*5)(y_ptr), X11 - - PAND X12, X6 - PAND X12, X7 - PAND X12, X8 - PAND X12, X9 - PAND X12, X10 - PAND X12, X11 - - PXOR X6, X0 - PXOR X7, X1 - 
PXOR X8, X2 - PXOR X9, X3 - PXOR X10, X4 - PXOR X11, X5 - - MOVOU X0, (16*0)(res_ptr) - MOVOU X1, (16*1)(res_ptr) - MOVOU X2, (16*2)(res_ptr) - MOVOU X3, (16*3)(res_ptr) - MOVOU X4, (16*4)(res_ptr) - MOVOU X5, (16*5)(res_ptr) +// func p256MovCond(res *P256Point, a *P256Point, b *P256Point, cond int) +// Requires: SSE2 +TEXT ·p256MovCond(SB), NOSPLIT, $0-32 + MOVQ res+0(FP), DI + MOVQ a+8(FP), SI + MOVQ b+16(FP), CX + MOVQ cond+24(FP), X12 + PXOR X13, X13 + PSHUFD $0x00, X12, X12 + PCMPEQL X13, X12 + MOVOU X12, X0 + MOVOU (SI), X6 + PANDN X6, X0 + MOVOU X12, X1 + MOVOU 16(SI), X7 + PANDN X7, X1 + MOVOU X12, X2 + MOVOU 32(SI), X8 + PANDN X8, X2 + MOVOU X12, X3 + MOVOU 48(SI), X9 + PANDN X9, X3 + MOVOU X12, X4 + MOVOU 64(SI), X10 + PANDN X10, X4 + MOVOU X12, X5 + MOVOU 80(SI), X11 + PANDN X11, X5 + MOVOU (CX), X6 + MOVOU 16(CX), X7 + MOVOU 32(CX), X8 + MOVOU 48(CX), X9 + MOVOU 64(CX), X10 + MOVOU 80(CX), X11 + PAND X12, X6 + PAND X12, X7 + PAND X12, X8 + PAND X12, X9 + PAND X12, X10 + PAND X12, X11 + PXOR X6, X0 + PXOR X7, X1 + PXOR X8, X2 + PXOR X9, X3 + PXOR X10, X4 + PXOR X11, X5 + MOVOU X0, (DI) + MOVOU X1, 16(DI) + MOVOU X2, 32(DI) + MOVOU X3, 48(DI) + MOVOU X4, 64(DI) + MOVOU X5, 80(DI) RET -/* ---------------------------------------*/ + // func p256NegCond(val *p256Element, cond int) -TEXT ·p256NegCond(SB),NOSPLIT,$0 - MOVQ val+0(FP), res_ptr - MOVQ cond+8(FP), t0 +// Requires: CMOV +TEXT ·p256NegCond(SB), NOSPLIT, $0-16 + MOVQ val+0(FP), DI + MOVQ cond+8(FP), R14 + // acc = poly - MOVQ $-1, acc0 - MOVQ p256const0<>(SB), acc1 - MOVQ $0, acc2 - MOVQ p256const1<>(SB), acc3 + MOVQ $-1, R8 + MOVQ p256const0<>+0(SB), R9 + MOVQ $+0, R10 + MOVQ p256const1<>+0(SB), R11 + // Load the original value - MOVQ (8*0)(res_ptr), acc5 - MOVQ (8*1)(res_ptr), x_ptr - MOVQ (8*2)(res_ptr), y_ptr - MOVQ (8*3)(res_ptr), t1 + MOVQ (DI), R13 + MOVQ 8(DI), SI + MOVQ 16(DI), CX + MOVQ 24(DI), R15 + // Speculatively subtract - SUBQ acc5, acc0 - SBBQ x_ptr, acc1 - SBBQ y_ptr, acc2 - SBBQ t1, acc3 + SUBQ R13, R8 + SBBQ SI, R9 + SBBQ CX, R10 + SBBQ R15, R11 + // If condition is 0, keep original value - TESTQ t0, t0 - CMOVQEQ acc5, acc0 - CMOVQEQ x_ptr, acc1 - CMOVQEQ y_ptr, acc2 - CMOVQEQ t1, acc3 - // Store result - MOVQ acc0, (8*0)(res_ptr) - MOVQ acc1, (8*1)(res_ptr) - MOVQ acc2, (8*2)(res_ptr) - MOVQ acc3, (8*3)(res_ptr) + TESTQ R14, R14 + CMOVQEQ R13, R8 + CMOVQEQ SI, R9 + CMOVQEQ CX, R10 + CMOVQEQ R15, R11 + // Store result + MOVQ R8, (DI) + MOVQ R9, 8(DI) + MOVQ R10, 16(DI) + MOVQ R11, 24(DI) RET -/* ---------------------------------------*/ -// func p256Sqr(res, in *p256Element, n int) -TEXT ·p256Sqr(SB),NOSPLIT,$0 - MOVQ res+0(FP), res_ptr - MOVQ in+8(FP), x_ptr + +DATA p256const0<>+0(SB)/8, $0x00000000ffffffff +GLOBL p256const0<>(SB), RODATA, $8 + +DATA p256const1<>+0(SB)/8, $0xffffffff00000001 +GLOBL p256const1<>(SB), RODATA, $8 + +// func p256Sqr(res *p256Element, in *p256Element, n int) +// Requires: CMOV +TEXT ·p256Sqr(SB), NOSPLIT, $0-24 + MOVQ res+0(FP), DI + MOVQ in+8(FP), SI MOVQ n+16(FP), BX sqrLoop: - // y[1:] * y[0] - MOVQ (8*0)(x_ptr), t0 - - MOVQ (8*1)(x_ptr), AX - MULQ t0 - MOVQ AX, acc1 - MOVQ DX, acc2 - - MOVQ (8*2)(x_ptr), AX - MULQ t0 - ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, acc3 - - MOVQ (8*3)(x_ptr), AX - MULQ t0 - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, acc4 + MOVQ (SI), R14 + MOVQ 8(SI), AX + MULQ R14 + MOVQ AX, R9 + MOVQ DX, R10 + MOVQ 16(SI), AX + MULQ R14 + ADDQ AX, R10 + ADCQ $0x00, DX + MOVQ DX, R11 + MOVQ 24(SI), AX + MULQ R14 + ADDQ AX, R11 + ADCQ $0x00, DX + MOVQ DX, R12 + // 
y[2:] * y[1] - MOVQ (8*1)(x_ptr), t0 - - MOVQ (8*2)(x_ptr), AX - MULQ t0 - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*3)(x_ptr), AX - MULQ t0 - ADDQ t1, acc4 - ADCQ $0, DX - ADDQ AX, acc4 - ADCQ $0, DX - MOVQ DX, acc5 + MOVQ 8(SI), R14 + MOVQ 16(SI), AX + MULQ R14 + ADDQ AX, R11 + ADCQ $0x00, DX + MOVQ DX, R15 + MOVQ 24(SI), AX + MULQ R14 + ADDQ R15, R12 + ADCQ $0x00, DX + ADDQ AX, R12 + ADCQ $0x00, DX + MOVQ DX, R13 + // y[3] * y[2] - MOVQ (8*2)(x_ptr), t0 - - MOVQ (8*3)(x_ptr), AX - MULQ t0 - ADDQ AX, acc5 - ADCQ $0, DX - MOVQ DX, y_ptr - XORQ t1, t1 + MOVQ 16(SI), R14 + MOVQ 24(SI), AX + MULQ R14 + ADDQ AX, R13 + ADCQ $0x00, DX + MOVQ DX, CX + XORQ R15, R15 + // *2 - ADDQ acc1, acc1 - ADCQ acc2, acc2 - ADCQ acc3, acc3 - ADCQ acc4, acc4 - ADCQ acc5, acc5 - ADCQ y_ptr, y_ptr - ADCQ $0, t1 + ADDQ R9, R9 + ADCQ R10, R10 + ADCQ R11, R11 + ADCQ R12, R12 + ADCQ R13, R13 + ADCQ CX, CX + ADCQ $0x00, R15 + // Missing products - MOVQ (8*0)(x_ptr), AX + MOVQ (SI), AX MULQ AX - MOVQ AX, acc0 - MOVQ DX, t0 - - MOVQ (8*1)(x_ptr), AX + MOVQ AX, R8 + MOVQ DX, R14 + MOVQ 8(SI), AX MULQ AX - ADDQ t0, acc1 - ADCQ AX, acc2 - ADCQ $0, DX - MOVQ DX, t0 - - MOVQ (8*2)(x_ptr), AX + ADDQ R14, R9 + ADCQ AX, R10 + ADCQ $0x00, DX + MOVQ DX, R14 + MOVQ 16(SI), AX MULQ AX - ADDQ t0, acc3 - ADCQ AX, acc4 - ADCQ $0, DX - MOVQ DX, t0 - - MOVQ (8*3)(x_ptr), AX + ADDQ R14, R11 + ADCQ AX, R12 + ADCQ $0x00, DX + MOVQ DX, R14 + MOVQ 24(SI), AX MULQ AX - ADDQ t0, acc5 - ADCQ AX, y_ptr - ADCQ DX, t1 - MOVQ t1, x_ptr + ADDQ R14, R13 + ADCQ AX, CX + ADCQ DX, R15 + MOVQ R15, SI + // First reduction step - MOVQ acc0, AX - MOVQ acc0, t1 - SHLQ $32, acc0 - MULQ p256const1<>(SB) - SHRQ $32, t1 - ADDQ acc0, acc1 - ADCQ t1, acc2 - ADCQ AX, acc3 - ADCQ $0, DX - MOVQ DX, acc0 + MOVQ R8, AX + MOVQ R8, R15 + SHLQ $0x20, R8 + MULQ p256const1<>+0(SB) + SHRQ $0x20, R15 + ADDQ R8, R9 + ADCQ R15, R10 + ADCQ AX, R11 + ADCQ $0x00, DX + MOVQ DX, R8 + // Second reduction step - MOVQ acc1, AX - MOVQ acc1, t1 - SHLQ $32, acc1 - MULQ p256const1<>(SB) - SHRQ $32, t1 - ADDQ acc1, acc2 - ADCQ t1, acc3 - ADCQ AX, acc0 - ADCQ $0, DX - MOVQ DX, acc1 + MOVQ R9, AX + MOVQ R9, R15 + SHLQ $0x20, R9 + MULQ p256const1<>+0(SB) + SHRQ $0x20, R15 + ADDQ R9, R10 + ADCQ R15, R11 + ADCQ AX, R8 + ADCQ $0x00, DX + MOVQ DX, R9 + // Third reduction step - MOVQ acc2, AX - MOVQ acc2, t1 - SHLQ $32, acc2 - MULQ p256const1<>(SB) - SHRQ $32, t1 - ADDQ acc2, acc3 - ADCQ t1, acc0 - ADCQ AX, acc1 - ADCQ $0, DX - MOVQ DX, acc2 + MOVQ R10, AX + MOVQ R10, R15 + SHLQ $0x20, R10 + MULQ p256const1<>+0(SB) + SHRQ $0x20, R15 + ADDQ R10, R11 + ADCQ R15, R8 + ADCQ AX, R9 + ADCQ $0x00, DX + MOVQ DX, R10 + // Last reduction step - XORQ t0, t0 - MOVQ acc3, AX - MOVQ acc3, t1 - SHLQ $32, acc3 - MULQ p256const1<>(SB) - SHRQ $32, t1 - ADDQ acc3, acc0 - ADCQ t1, acc1 - ADCQ AX, acc2 - ADCQ $0, DX - MOVQ DX, acc3 + XORQ R14, R14 + MOVQ R11, AX + MOVQ R11, R15 + SHLQ $0x20, R11 + MULQ p256const1<>+0(SB) + SHRQ $0x20, R15 + ADDQ R11, R8 + ADCQ R15, R9 + ADCQ AX, R10 + ADCQ $0x00, DX + MOVQ DX, R11 + // Add bits [511:256] of the sqr result - ADCQ acc4, acc0 - ADCQ acc5, acc1 - ADCQ y_ptr, acc2 - ADCQ x_ptr, acc3 - ADCQ $0, t0 - - MOVQ acc0, acc4 - MOVQ acc1, acc5 - MOVQ acc2, y_ptr - MOVQ acc3, t1 - // Subtract p256 - SUBQ $-1, acc0 - SBBQ p256const0<>(SB) ,acc1 - SBBQ $0, acc2 - SBBQ p256const1<>(SB), acc3 - SBBQ $0, t0 - - CMOVQCS acc4, acc0 - CMOVQCS acc5, acc1 - CMOVQCS y_ptr, acc2 - CMOVQCS t1, acc3 - - MOVQ acc0, (8*0)(res_ptr) - MOVQ acc1, (8*1)(res_ptr) - MOVQ acc2, (8*2)(res_ptr) - 
MOVQ acc3, (8*3)(res_ptr) - MOVQ res_ptr, x_ptr - DECQ BX - JNE sqrLoop + ADCQ R12, R8 + ADCQ R13, R9 + ADCQ CX, R10 + ADCQ SI, R11 + ADCQ $0x00, R14 + MOVQ R8, R12 + MOVQ R9, R13 + MOVQ R10, CX + MOVQ R11, R15 + // Subtract p256 + SUBQ $-1, R8 + SBBQ p256const0<>+0(SB), R9 + SBBQ $0x00, R10 + SBBQ p256const1<>+0(SB), R11 + SBBQ $0x00, R14 + CMOVQCS R12, R8 + CMOVQCS R13, R9 + CMOVQCS CX, R10 + CMOVQCS R15, R11 + MOVQ R8, (DI) + MOVQ R9, 8(DI) + MOVQ R10, 16(DI) + MOVQ R11, 24(DI) + MOVQ DI, SI + DECQ BX + JNE sqrLoop RET -/* ---------------------------------------*/ -// func p256Mul(res, in1, in2 *p256Element) -TEXT ·p256Mul(SB),NOSPLIT,$0 - MOVQ res+0(FP), res_ptr - MOVQ in1+8(FP), x_ptr - MOVQ in2+16(FP), y_ptr + +// func p256Mul(res *p256Element, in1 *p256Element, in2 *p256Element) +// Requires: CMOV +TEXT ·p256Mul(SB), NOSPLIT, $0-24 + MOVQ res+0(FP), DI + MOVQ in1+8(FP), SI + MOVQ in2+16(FP), CX + // x * y[0] - MOVQ (8*0)(y_ptr), t0 - - MOVQ (8*0)(x_ptr), AX - MULQ t0 - MOVQ AX, acc0 - MOVQ DX, acc1 - - MOVQ (8*1)(x_ptr), AX - MULQ t0 - ADDQ AX, acc1 - ADCQ $0, DX - MOVQ DX, acc2 - - MOVQ (8*2)(x_ptr), AX - MULQ t0 - ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, acc3 - - MOVQ (8*3)(x_ptr), AX - MULQ t0 - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, acc4 - XORQ acc5, acc5 + MOVQ (CX), R14 + MOVQ (SI), AX + MULQ R14 + MOVQ AX, R8 + MOVQ DX, R9 + MOVQ 8(SI), AX + MULQ R14 + ADDQ AX, R9 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 16(SI), AX + MULQ R14 + ADDQ AX, R10 + ADCQ $0x00, DX + MOVQ DX, R11 + MOVQ 24(SI), AX + MULQ R14 + ADDQ AX, R11 + ADCQ $0x00, DX + MOVQ DX, R12 + XORQ R13, R13 + // First reduction step - MOVQ acc0, AX - MOVQ acc0, t1 - SHLQ $32, acc0 - MULQ p256const1<>(SB) - SHRQ $32, t1 - ADDQ acc0, acc1 - ADCQ t1, acc2 - ADCQ AX, acc3 - ADCQ DX, acc4 - ADCQ $0, acc5 - XORQ acc0, acc0 + MOVQ R8, AX + MOVQ R8, R15 + SHLQ $0x20, R8 + MULQ p256const1<>+0(SB) + SHRQ $0x20, R15 + ADDQ R8, R9 + ADCQ R15, R10 + ADCQ AX, R11 + ADCQ DX, R12 + ADCQ $0x00, R13 + XORQ R8, R8 + // x * y[1] - MOVQ (8*1)(y_ptr), t0 - - MOVQ (8*0)(x_ptr), AX - MULQ t0 - ADDQ AX, acc1 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*1)(x_ptr), AX - MULQ t0 - ADDQ t1, acc2 - ADCQ $0, DX - ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*2)(x_ptr), AX - MULQ t0 - ADDQ t1, acc3 - ADCQ $0, DX - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*3)(x_ptr), AX - MULQ t0 - ADDQ t1, acc4 - ADCQ $0, DX - ADDQ AX, acc4 - ADCQ DX, acc5 - ADCQ $0, acc0 + MOVQ 8(CX), R14 + MOVQ (SI), AX + MULQ R14 + ADDQ AX, R9 + ADCQ $0x00, DX + MOVQ DX, R15 + MOVQ 8(SI), AX + MULQ R14 + ADDQ R15, R10 + ADCQ $0x00, DX + ADDQ AX, R10 + ADCQ $0x00, DX + MOVQ DX, R15 + MOVQ 16(SI), AX + MULQ R14 + ADDQ R15, R11 + ADCQ $0x00, DX + ADDQ AX, R11 + ADCQ $0x00, DX + MOVQ DX, R15 + MOVQ 24(SI), AX + MULQ R14 + ADDQ R15, R12 + ADCQ $0x00, DX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ $0x00, R8 + // Second reduction step - MOVQ acc1, AX - MOVQ acc1, t1 - SHLQ $32, acc1 - MULQ p256const1<>(SB) - SHRQ $32, t1 - ADDQ acc1, acc2 - ADCQ t1, acc3 - ADCQ AX, acc4 - ADCQ DX, acc5 - ADCQ $0, acc0 - XORQ acc1, acc1 + MOVQ R9, AX + MOVQ R9, R15 + SHLQ $0x20, R9 + MULQ p256const1<>+0(SB) + SHRQ $0x20, R15 + ADDQ R9, R10 + ADCQ R15, R11 + ADCQ AX, R12 + ADCQ DX, R13 + ADCQ $0x00, R8 + XORQ R9, R9 + // x * y[2] - MOVQ (8*2)(y_ptr), t0 - - MOVQ (8*0)(x_ptr), AX - MULQ t0 - ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*1)(x_ptr), AX - MULQ t0 - ADDQ t1, acc3 - ADCQ $0, DX - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*2)(x_ptr), AX - MULQ t0 - ADDQ t1, acc4 - ADCQ $0, DX 
- ADDQ AX, acc4 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*3)(x_ptr), AX - MULQ t0 - ADDQ t1, acc5 - ADCQ $0, DX - ADDQ AX, acc5 - ADCQ DX, acc0 - ADCQ $0, acc1 + MOVQ 16(CX), R14 + MOVQ (SI), AX + MULQ R14 + ADDQ AX, R10 + ADCQ $0x00, DX + MOVQ DX, R15 + MOVQ 8(SI), AX + MULQ R14 + ADDQ R15, R11 + ADCQ $0x00, DX + ADDQ AX, R11 + ADCQ $0x00, DX + MOVQ DX, R15 + MOVQ 16(SI), AX + MULQ R14 + ADDQ R15, R12 + ADCQ $0x00, DX + ADDQ AX, R12 + ADCQ $0x00, DX + MOVQ DX, R15 + MOVQ 24(SI), AX + MULQ R14 + ADDQ R15, R13 + ADCQ $0x00, DX + ADDQ AX, R13 + ADCQ DX, R8 + ADCQ $0x00, R9 + // Third reduction step - MOVQ acc2, AX - MOVQ acc2, t1 - SHLQ $32, acc2 - MULQ p256const1<>(SB) - SHRQ $32, t1 - ADDQ acc2, acc3 - ADCQ t1, acc4 - ADCQ AX, acc5 - ADCQ DX, acc0 - ADCQ $0, acc1 - XORQ acc2, acc2 + MOVQ R10, AX + MOVQ R10, R15 + SHLQ $0x20, R10 + MULQ p256const1<>+0(SB) + SHRQ $0x20, R15 + ADDQ R10, R11 + ADCQ R15, R12 + ADCQ AX, R13 + ADCQ DX, R8 + ADCQ $0x00, R9 + XORQ R10, R10 + // x * y[3] - MOVQ (8*3)(y_ptr), t0 - - MOVQ (8*0)(x_ptr), AX - MULQ t0 - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*1)(x_ptr), AX - MULQ t0 - ADDQ t1, acc4 - ADCQ $0, DX - ADDQ AX, acc4 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*2)(x_ptr), AX - MULQ t0 - ADDQ t1, acc5 - ADCQ $0, DX - ADDQ AX, acc5 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*3)(x_ptr), AX - MULQ t0 - ADDQ t1, acc0 - ADCQ $0, DX - ADDQ AX, acc0 - ADCQ DX, acc1 - ADCQ $0, acc2 + MOVQ 24(CX), R14 + MOVQ (SI), AX + MULQ R14 + ADDQ AX, R11 + ADCQ $0x00, DX + MOVQ DX, R15 + MOVQ 8(SI), AX + MULQ R14 + ADDQ R15, R12 + ADCQ $0x00, DX + ADDQ AX, R12 + ADCQ $0x00, DX + MOVQ DX, R15 + MOVQ 16(SI), AX + MULQ R14 + ADDQ R15, R13 + ADCQ $0x00, DX + ADDQ AX, R13 + ADCQ $0x00, DX + MOVQ DX, R15 + MOVQ 24(SI), AX + MULQ R14 + ADDQ R15, R8 + ADCQ $0x00, DX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ $0x00, R10 + // Last reduction step - MOVQ acc3, AX - MOVQ acc3, t1 - SHLQ $32, acc3 - MULQ p256const1<>(SB) - SHRQ $32, t1 - ADDQ acc3, acc4 - ADCQ t1, acc5 - ADCQ AX, acc0 - ADCQ DX, acc1 - ADCQ $0, acc2 + MOVQ R11, AX + MOVQ R11, R15 + SHLQ $0x20, R11 + MULQ p256const1<>+0(SB) + SHRQ $0x20, R15 + ADDQ R11, R12 + ADCQ R15, R13 + ADCQ AX, R8 + ADCQ DX, R9 + ADCQ $0x00, R10 + // Copy result [255:0] - MOVQ acc4, x_ptr - MOVQ acc5, acc3 - MOVQ acc0, t0 - MOVQ acc1, t1 - // Subtract p256 - SUBQ $-1, acc4 - SBBQ p256const0<>(SB) ,acc5 - SBBQ $0, acc0 - SBBQ p256const1<>(SB), acc1 - SBBQ $0, acc2 - - CMOVQCS x_ptr, acc4 - CMOVQCS acc3, acc5 - CMOVQCS t0, acc0 - CMOVQCS t1, acc1 - - MOVQ acc4, (8*0)(res_ptr) - MOVQ acc5, (8*1)(res_ptr) - MOVQ acc0, (8*2)(res_ptr) - MOVQ acc1, (8*3)(res_ptr) + MOVQ R12, SI + MOVQ R13, R11 + MOVQ R8, R14 + MOVQ R9, R15 + // Subtract p256 + SUBQ $-1, R12 + SBBQ p256const0<>+0(SB), R13 + SBBQ $0x00, R8 + SBBQ p256const1<>+0(SB), R9 + SBBQ $0x00, R10 + CMOVQCS SI, R12 + CMOVQCS R11, R13 + CMOVQCS R14, R8 + CMOVQCS R15, R9 + MOVQ R12, (DI) + MOVQ R13, 8(DI) + MOVQ R8, 16(DI) + MOVQ R9, 24(DI) RET -/* ---------------------------------------*/ -// func p256FromMont(res, in *p256Element) -TEXT ·p256FromMont(SB),NOSPLIT,$0 - MOVQ res+0(FP), res_ptr - MOVQ in+8(FP), x_ptr - - MOVQ (8*0)(x_ptr), acc0 - MOVQ (8*1)(x_ptr), acc1 - MOVQ (8*2)(x_ptr), acc2 - MOVQ (8*3)(x_ptr), acc3 - XORQ acc4, acc4 + +// func p256FromMont(res *p256Element, in *p256Element) +// Requires: CMOV +TEXT ·p256FromMont(SB), NOSPLIT, $0-16 + MOVQ res+0(FP), DI + MOVQ in+8(FP), SI + MOVQ (SI), R8 + MOVQ 8(SI), R9 + MOVQ 16(SI), R10 + MOVQ 24(SI), R11 + XORQ R12, R12 // Only reduce, no multiplications are 
needed // First stage - MOVQ acc0, AX - MOVQ acc0, t1 - SHLQ $32, acc0 - MULQ p256const1<>(SB) - SHRQ $32, t1 - ADDQ acc0, acc1 - ADCQ t1, acc2 - ADCQ AX, acc3 - ADCQ DX, acc4 - XORQ acc5, acc5 + MOVQ R8, AX + MOVQ R8, R15 + SHLQ $0x20, R8 + MULQ p256const1<>+0(SB) + SHRQ $0x20, R15 + ADDQ R8, R9 + ADCQ R15, R10 + ADCQ AX, R11 + ADCQ DX, R12 + XORQ R13, R13 + // Second stage - MOVQ acc1, AX - MOVQ acc1, t1 - SHLQ $32, acc1 - MULQ p256const1<>(SB) - SHRQ $32, t1 - ADDQ acc1, acc2 - ADCQ t1, acc3 - ADCQ AX, acc4 - ADCQ DX, acc5 - XORQ acc0, acc0 + MOVQ R9, AX + MOVQ R9, R15 + SHLQ $0x20, R9 + MULQ p256const1<>+0(SB) + SHRQ $0x20, R15 + ADDQ R9, R10 + ADCQ R15, R11 + ADCQ AX, R12 + ADCQ DX, R13 + XORQ R8, R8 + // Third stage - MOVQ acc2, AX - MOVQ acc2, t1 - SHLQ $32, acc2 - MULQ p256const1<>(SB) - SHRQ $32, t1 - ADDQ acc2, acc3 - ADCQ t1, acc4 - ADCQ AX, acc5 - ADCQ DX, acc0 - XORQ acc1, acc1 - // Last stage - MOVQ acc3, AX - MOVQ acc3, t1 - SHLQ $32, acc3 - MULQ p256const1<>(SB) - SHRQ $32, t1 - ADDQ acc3, acc4 - ADCQ t1, acc5 - ADCQ AX, acc0 - ADCQ DX, acc1 - - MOVQ acc4, x_ptr - MOVQ acc5, acc3 - MOVQ acc0, t0 - MOVQ acc1, t1 - - SUBQ $-1, acc4 - SBBQ p256const0<>(SB), acc5 - SBBQ $0, acc0 - SBBQ p256const1<>(SB), acc1 - - CMOVQCS x_ptr, acc4 - CMOVQCS acc3, acc5 - CMOVQCS t0, acc0 - CMOVQCS t1, acc1 - - MOVQ acc4, (8*0)(res_ptr) - MOVQ acc5, (8*1)(res_ptr) - MOVQ acc0, (8*2)(res_ptr) - MOVQ acc1, (8*3)(res_ptr) + MOVQ R10, AX + MOVQ R10, R15 + SHLQ $0x20, R10 + MULQ p256const1<>+0(SB) + SHRQ $0x20, R15 + ADDQ R10, R11 + ADCQ R15, R12 + ADCQ AX, R13 + ADCQ DX, R8 + XORQ R9, R9 + // Last stage + MOVQ R11, AX + MOVQ R11, R15 + SHLQ $0x20, R11 + MULQ p256const1<>+0(SB) + SHRQ $0x20, R15 + ADDQ R11, R12 + ADCQ R15, R13 + ADCQ AX, R8 + ADCQ DX, R9 + MOVQ R12, SI + MOVQ R13, R11 + MOVQ R8, R14 + MOVQ R9, R15 + SUBQ $-1, R12 + SBBQ p256const0<>+0(SB), R13 + SBBQ $0x00, R8 + SBBQ p256const1<>+0(SB), R9 + CMOVQCS SI, R12 + CMOVQCS R11, R13 + CMOVQCS R14, R8 + CMOVQCS R15, R9 + MOVQ R12, (DI) + MOVQ R13, 8(DI) + MOVQ R8, 16(DI) + MOVQ R9, 24(DI) RET -/* ---------------------------------------*/ + // func p256Select(res *P256Point, table *p256Table, idx int) -TEXT ·p256Select(SB),NOSPLIT,$0 - MOVQ idx+16(FP),AX - MOVQ table+8(FP),DI - MOVQ res+0(FP),DX - - PXOR X15, X15 // X15 = 0 - PCMPEQL X14, X14 // X14 = -1 - PSUBL X14, X15 // X15 = 1 - MOVL AX, X14 - PSHUFD $0, X14, X14 - - PXOR X0, X0 - PXOR X1, X1 - PXOR X2, X2 - PXOR X3, X3 - PXOR X4, X4 - PXOR X5, X5 - MOVQ $16, AX - - MOVOU X15, X13 +// Requires: SSE2 +TEXT ·p256Select(SB), NOSPLIT, $0-24 + MOVQ idx+16(FP), AX + MOVQ table+8(FP), DI + MOVQ res+0(FP), DX + PXOR X15, X15 + PCMPEQL X14, X14 + PSUBL X14, X15 + MOVL AX, X14 + PSHUFD $0x00, X14, X14 + PXOR X0, X0 + PXOR X1, X1 + PXOR X2, X2 + PXOR X3, X3 + PXOR X4, X4 + PXOR X5, X5 + MOVQ $0x00000010, AX + MOVOU X15, X13 loop_select: - - MOVOU X13, X12 - PADDL X15, X13 - PCMPEQL X14, X12 - - MOVOU (16*0)(DI), X6 - MOVOU (16*1)(DI), X7 - MOVOU (16*2)(DI), X8 - MOVOU (16*3)(DI), X9 - MOVOU (16*4)(DI), X10 - MOVOU (16*5)(DI), X11 - ADDQ $(16*6), DI - - PAND X12, X6 - PAND X12, X7 - PAND X12, X8 - PAND X12, X9 - PAND X12, X10 - PAND X12, X11 - - PXOR X6, X0 - PXOR X7, X1 - PXOR X8, X2 - PXOR X9, X3 - PXOR X10, X4 - PXOR X11, X5 - - DECQ AX - JNE loop_select - - MOVOU X0, (16*0)(DX) - MOVOU X1, (16*1)(DX) - MOVOU X2, (16*2)(DX) - MOVOU X3, (16*3)(DX) - MOVOU X4, (16*4)(DX) - MOVOU X5, (16*5)(DX) - + MOVOU X13, X12 + PADDL X15, X13 + PCMPEQL X14, X12 + MOVOU (DI), X6 + MOVOU 16(DI), X7 + MOVOU 32(DI), 
X8 + MOVOU 48(DI), X9 + MOVOU 64(DI), X10 + MOVOU 80(DI), X11 + ADDQ $0x60, DI + PAND X12, X6 + PAND X12, X7 + PAND X12, X8 + PAND X12, X9 + PAND X12, X10 + PAND X12, X11 + PXOR X6, X0 + PXOR X7, X1 + PXOR X8, X2 + PXOR X9, X3 + PXOR X10, X4 + PXOR X11, X5 + DECQ AX + JNE loop_select + MOVOU X0, (DX) + MOVOU X1, 16(DX) + MOVOU X2, 32(DX) + MOVOU X3, 48(DX) + MOVOU X4, 64(DX) + MOVOU X5, 80(DX) RET -/* ---------------------------------------*/ -// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int) -TEXT ·p256SelectAffine(SB),NOSPLIT,$0 - MOVQ idx+16(FP),AX - MOVQ table+8(FP),DI - MOVQ res+0(FP),DX - - PXOR X15, X15 // X15 = 0 - PCMPEQL X14, X14 // X14 = -1 - PSUBL X14, X15 // X15 = 1 - MOVL AX, X14 - PSHUFD $0, X14, X14 - PXOR X0, X0 - PXOR X1, X1 - PXOR X2, X2 - PXOR X3, X3 - MOVQ $16, AX - - MOVOU X15, X13 +// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int) +// Requires: SSE2 +TEXT ·p256SelectAffine(SB), NOSPLIT, $0-24 + MOVQ idx+16(FP), AX + MOVQ table+8(FP), DI + MOVQ res+0(FP), DX + PXOR X15, X15 + PCMPEQL X14, X14 + PSUBL X14, X15 + MOVL AX, X14 + PSHUFD $0x00, X14, X14 + PXOR X0, X0 + PXOR X1, X1 + PXOR X2, X2 + PXOR X3, X3 + MOVQ $0x00000010, AX + MOVOU X15, X13 loop_select_base: + MOVOU X13, X12 + PADDL X15, X13 + PCMPEQL X14, X12 + MOVOU (DI), X4 + MOVOU 16(DI), X5 + MOVOU 32(DI), X6 + MOVOU 48(DI), X7 + MOVOU 64(DI), X8 + MOVOU 80(DI), X9 + MOVOU 96(DI), X10 + MOVOU 112(DI), X11 + ADDQ $0x80, DI + PAND X12, X4 + PAND X12, X5 + PAND X12, X6 + PAND X12, X7 + MOVOU X13, X12 + PADDL X15, X13 + PCMPEQL X14, X12 + PAND X12, X8 + PAND X12, X9 + PAND X12, X10 + PAND X12, X11 + PXOR X4, X0 + PXOR X5, X1 + PXOR X6, X2 + PXOR X7, X3 + PXOR X8, X0 + PXOR X9, X1 + PXOR X10, X2 + PXOR X11, X3 + DECQ AX + JNE loop_select_base + MOVOU X0, (DX) + MOVOU X1, 16(DX) + MOVOU X2, 32(DX) + MOVOU X3, 48(DX) + RET - MOVOU X13, X12 - PADDL X15, X13 - PCMPEQL X14, X12 - - MOVOU (16*0)(DI), X4 - MOVOU (16*1)(DI), X5 - MOVOU (16*2)(DI), X6 - MOVOU (16*3)(DI), X7 - - MOVOU (16*4)(DI), X8 - MOVOU (16*5)(DI), X9 - MOVOU (16*6)(DI), X10 - MOVOU (16*7)(DI), X11 - - ADDQ $(16*8), DI - - PAND X12, X4 - PAND X12, X5 - PAND X12, X6 - PAND X12, X7 - - MOVOU X13, X12 - PADDL X15, X13 - PCMPEQL X14, X12 - - PAND X12, X8 - PAND X12, X9 - PAND X12, X10 - PAND X12, X11 - - PXOR X4, X0 - PXOR X5, X1 - PXOR X6, X2 - PXOR X7, X3 - - PXOR X8, X0 - PXOR X9, X1 - PXOR X10, X2 - PXOR X11, X3 - - DECQ AX - JNE loop_select_base - - MOVOU X0, (16*0)(DX) - MOVOU X1, (16*1)(DX) - MOVOU X2, (16*2)(DX) - MOVOU X3, (16*3)(DX) +// func p256OrdMul(res *p256OrdElement, in1 *p256OrdElement, in2 *p256OrdElement) +// Requires: CMOV +TEXT ·p256OrdMul(SB), NOSPLIT, $0-24 + MOVQ res+0(FP), DI + MOVQ in1+8(FP), SI + MOVQ in2+16(FP), CX - RET -/* ---------------------------------------*/ -// func p256OrdMul(res, in1, in2 *p256OrdElement) -TEXT ·p256OrdMul(SB),NOSPLIT,$0 - MOVQ res+0(FP), res_ptr - MOVQ in1+8(FP), x_ptr - MOVQ in2+16(FP), y_ptr // x * y[0] - MOVQ (8*0)(y_ptr), t0 - - MOVQ (8*0)(x_ptr), AX - MULQ t0 - MOVQ AX, acc0 - MOVQ DX, acc1 - - MOVQ (8*1)(x_ptr), AX - MULQ t0 - ADDQ AX, acc1 - ADCQ $0, DX - MOVQ DX, acc2 - - MOVQ (8*2)(x_ptr), AX - MULQ t0 - ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, acc3 - - MOVQ (8*3)(x_ptr), AX - MULQ t0 - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, acc4 - XORQ acc5, acc5 + MOVQ (CX), R14 + MOVQ (SI), AX + MULQ R14 + MOVQ AX, R8 + MOVQ DX, R9 + MOVQ 8(SI), AX + MULQ R14 + ADDQ AX, R9 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 16(SI), AX + MULQ R14 + ADDQ AX, R10 + 
ADCQ $0x00, DX + MOVQ DX, R11 + MOVQ 24(SI), AX + MULQ R14 + ADDQ AX, R11 + ADCQ $0x00, DX + MOVQ DX, R12 + XORQ R13, R13 + // First reduction step - MOVQ acc0, AX - MULQ p256ordK0<>(SB) - MOVQ AX, t0 - - MOVQ p256ord<>+0x00(SB), AX - MULQ t0 - ADDQ AX, acc0 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ p256ord<>+0x08(SB), AX - MULQ t0 - ADDQ t1, acc1 - ADCQ $0, DX - ADDQ AX, acc1 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ p256ord<>+0x10(SB), AX - MULQ t0 - ADDQ t1, acc2 - ADCQ $0, DX - ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ p256ord<>+0x18(SB), AX - MULQ t0 - ADDQ t1, acc3 - ADCQ $0, DX - ADDQ AX, acc3 - ADCQ DX, acc4 - ADCQ $0, acc5 + MOVQ R8, AX + MULQ p256ordK0<>+0(SB) + MOVQ AX, R14 + MOVQ p256ord<>+0(SB), AX + MULQ R14 + ADDQ AX, R8 + ADCQ $0x00, DX + MOVQ DX, R15 + MOVQ p256ord<>+8(SB), AX + MULQ R14 + ADDQ R15, R9 + ADCQ $0x00, DX + ADDQ AX, R9 + ADCQ $0x00, DX + MOVQ DX, R15 + MOVQ p256ord<>+16(SB), AX + MULQ R14 + ADDQ R15, R10 + ADCQ $0x00, DX + ADDQ AX, R10 + ADCQ $0x00, DX + MOVQ DX, R15 + MOVQ p256ord<>+24(SB), AX + MULQ R14 + ADDQ R15, R11 + ADCQ $0x00, DX + ADDQ AX, R11 + ADCQ DX, R12 + ADCQ $0x00, R13 + // x * y[1] - MOVQ (8*1)(y_ptr), t0 - - MOVQ (8*0)(x_ptr), AX - MULQ t0 - ADDQ AX, acc1 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*1)(x_ptr), AX - MULQ t0 - ADDQ t1, acc2 - ADCQ $0, DX - ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*2)(x_ptr), AX - MULQ t0 - ADDQ t1, acc3 - ADCQ $0, DX - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*3)(x_ptr), AX - MULQ t0 - ADDQ t1, acc4 - ADCQ $0, DX - ADDQ AX, acc4 - ADCQ DX, acc5 - ADCQ $0, acc0 + MOVQ 8(CX), R14 + MOVQ (SI), AX + MULQ R14 + ADDQ AX, R9 + ADCQ $0x00, DX + MOVQ DX, R15 + MOVQ 8(SI), AX + MULQ R14 + ADDQ R15, R10 + ADCQ $0x00, DX + ADDQ AX, R10 + ADCQ $0x00, DX + MOVQ DX, R15 + MOVQ 16(SI), AX + MULQ R14 + ADDQ R15, R11 + ADCQ $0x00, DX + ADDQ AX, R11 + ADCQ $0x00, DX + MOVQ DX, R15 + MOVQ 24(SI), AX + MULQ R14 + ADDQ R15, R12 + ADCQ $0x00, DX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ $0x00, R8 + // Second reduction step - MOVQ acc1, AX - MULQ p256ordK0<>(SB) - MOVQ AX, t0 - - MOVQ p256ord<>+0x00(SB), AX - MULQ t0 - ADDQ AX, acc1 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ p256ord<>+0x08(SB), AX - MULQ t0 - ADDQ t1, acc2 - ADCQ $0, DX - ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ p256ord<>+0x10(SB), AX - MULQ t0 - ADDQ t1, acc3 - ADCQ $0, DX - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ p256ord<>+0x18(SB), AX - MULQ t0 - ADDQ t1, acc4 - ADCQ $0, DX - ADDQ AX, acc4 - ADCQ DX, acc5 - ADCQ $0, acc0 + MOVQ R9, AX + MULQ p256ordK0<>+0(SB) + MOVQ AX, R14 + MOVQ p256ord<>+0(SB), AX + MULQ R14 + ADDQ AX, R9 + ADCQ $0x00, DX + MOVQ DX, R15 + MOVQ p256ord<>+8(SB), AX + MULQ R14 + ADDQ R15, R10 + ADCQ $0x00, DX + ADDQ AX, R10 + ADCQ $0x00, DX + MOVQ DX, R15 + MOVQ p256ord<>+16(SB), AX + MULQ R14 + ADDQ R15, R11 + ADCQ $0x00, DX + ADDQ AX, R11 + ADCQ $0x00, DX + MOVQ DX, R15 + MOVQ p256ord<>+24(SB), AX + MULQ R14 + ADDQ R15, R12 + ADCQ $0x00, DX + ADDQ AX, R12 + ADCQ DX, R13 + ADCQ $0x00, R8 + // x * y[2] - MOVQ (8*2)(y_ptr), t0 - - MOVQ (8*0)(x_ptr), AX - MULQ t0 - ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*1)(x_ptr), AX - MULQ t0 - ADDQ t1, acc3 - ADCQ $0, DX - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*2)(x_ptr), AX - MULQ t0 - ADDQ t1, acc4 - ADCQ $0, DX - ADDQ AX, acc4 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*3)(x_ptr), AX - MULQ t0 - ADDQ t1, acc5 - ADCQ $0, DX - ADDQ AX, acc5 - ADCQ DX, acc0 - ADCQ $0, acc1 + MOVQ 16(CX), R14 + MOVQ (SI), AX + MULQ R14 + ADDQ AX, R10 + ADCQ $0x00, DX + MOVQ DX, 
R15 + MOVQ 8(SI), AX + MULQ R14 + ADDQ R15, R11 + ADCQ $0x00, DX + ADDQ AX, R11 + ADCQ $0x00, DX + MOVQ DX, R15 + MOVQ 16(SI), AX + MULQ R14 + ADDQ R15, R12 + ADCQ $0x00, DX + ADDQ AX, R12 + ADCQ $0x00, DX + MOVQ DX, R15 + MOVQ 24(SI), AX + MULQ R14 + ADDQ R15, R13 + ADCQ $0x00, DX + ADDQ AX, R13 + ADCQ DX, R8 + ADCQ $0x00, R9 + // Third reduction step - MOVQ acc2, AX - MULQ p256ordK0<>(SB) - MOVQ AX, t0 - - MOVQ p256ord<>+0x00(SB), AX - MULQ t0 - ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ p256ord<>+0x08(SB), AX - MULQ t0 - ADDQ t1, acc3 - ADCQ $0, DX - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ p256ord<>+0x10(SB), AX - MULQ t0 - ADDQ t1, acc4 - ADCQ $0, DX - ADDQ AX, acc4 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ p256ord<>+0x18(SB), AX - MULQ t0 - ADDQ t1, acc5 - ADCQ $0, DX - ADDQ AX, acc5 - ADCQ DX, acc0 - ADCQ $0, acc1 + MOVQ R10, AX + MULQ p256ordK0<>+0(SB) + MOVQ AX, R14 + MOVQ p256ord<>+0(SB), AX + MULQ R14 + ADDQ AX, R10 + ADCQ $0x00, DX + MOVQ DX, R15 + MOVQ p256ord<>+8(SB), AX + MULQ R14 + ADDQ R15, R11 + ADCQ $0x00, DX + ADDQ AX, R11 + ADCQ $0x00, DX + MOVQ DX, R15 + MOVQ p256ord<>+16(SB), AX + MULQ R14 + ADDQ R15, R12 + ADCQ $0x00, DX + ADDQ AX, R12 + ADCQ $0x00, DX + MOVQ DX, R15 + MOVQ p256ord<>+24(SB), AX + MULQ R14 + ADDQ R15, R13 + ADCQ $0x00, DX + ADDQ AX, R13 + ADCQ DX, R8 + ADCQ $0x00, R9 + // x * y[3] - MOVQ (8*3)(y_ptr), t0 - - MOVQ (8*0)(x_ptr), AX - MULQ t0 - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*1)(x_ptr), AX - MULQ t0 - ADDQ t1, acc4 - ADCQ $0, DX - ADDQ AX, acc4 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*2)(x_ptr), AX - MULQ t0 - ADDQ t1, acc5 - ADCQ $0, DX - ADDQ AX, acc5 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*3)(x_ptr), AX - MULQ t0 - ADDQ t1, acc0 - ADCQ $0, DX - ADDQ AX, acc0 - ADCQ DX, acc1 - ADCQ $0, acc2 + MOVQ 24(CX), R14 + MOVQ (SI), AX + MULQ R14 + ADDQ AX, R11 + ADCQ $0x00, DX + MOVQ DX, R15 + MOVQ 8(SI), AX + MULQ R14 + ADDQ R15, R12 + ADCQ $0x00, DX + ADDQ AX, R12 + ADCQ $0x00, DX + MOVQ DX, R15 + MOVQ 16(SI), AX + MULQ R14 + ADDQ R15, R13 + ADCQ $0x00, DX + ADDQ AX, R13 + ADCQ $0x00, DX + MOVQ DX, R15 + MOVQ 24(SI), AX + MULQ R14 + ADDQ R15, R8 + ADCQ $0x00, DX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ $0x00, R10 + // Last reduction step - MOVQ acc3, AX - MULQ p256ordK0<>(SB) - MOVQ AX, t0 - - MOVQ p256ord<>+0x00(SB), AX - MULQ t0 - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ p256ord<>+0x08(SB), AX - MULQ t0 - ADDQ t1, acc4 - ADCQ $0, DX - ADDQ AX, acc4 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ p256ord<>+0x10(SB), AX - MULQ t0 - ADDQ t1, acc5 - ADCQ $0, DX - ADDQ AX, acc5 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ p256ord<>+0x18(SB), AX - MULQ t0 - ADDQ t1, acc0 - ADCQ $0, DX - ADDQ AX, acc0 - ADCQ DX, acc1 - ADCQ $0, acc2 + MOVQ R11, AX + MULQ p256ordK0<>+0(SB) + MOVQ AX, R14 + MOVQ p256ord<>+0(SB), AX + MULQ R14 + ADDQ AX, R11 + ADCQ $0x00, DX + MOVQ DX, R15 + MOVQ p256ord<>+8(SB), AX + MULQ R14 + ADDQ R15, R12 + ADCQ $0x00, DX + ADDQ AX, R12 + ADCQ $0x00, DX + MOVQ DX, R15 + MOVQ p256ord<>+16(SB), AX + MULQ R14 + ADDQ R15, R13 + ADCQ $0x00, DX + ADDQ AX, R13 + ADCQ $0x00, DX + MOVQ DX, R15 + MOVQ p256ord<>+24(SB), AX + MULQ R14 + ADDQ R15, R8 + ADCQ $0x00, DX + ADDQ AX, R8 + ADCQ DX, R9 + ADCQ $0x00, R10 + // Copy result [255:0] - MOVQ acc4, x_ptr - MOVQ acc5, acc3 - MOVQ acc0, t0 - MOVQ acc1, t1 - // Subtract p256 - SUBQ p256ord<>+0x00(SB), acc4 - SBBQ p256ord<>+0x08(SB) ,acc5 - SBBQ p256ord<>+0x10(SB), acc0 - SBBQ p256ord<>+0x18(SB), acc1 - SBBQ $0, acc2 - - CMOVQCS x_ptr, acc4 - CMOVQCS acc3, acc5 - CMOVQCS t0, acc0 - CMOVQCS t1, 
acc1 - - MOVQ acc4, (8*0)(res_ptr) - MOVQ acc5, (8*1)(res_ptr) - MOVQ acc0, (8*2)(res_ptr) - MOVQ acc1, (8*3)(res_ptr) + MOVQ R12, SI + MOVQ R13, R11 + MOVQ R8, R14 + MOVQ R9, R15 + // Subtract p256 + SUBQ p256ord<>+0(SB), R12 + SBBQ p256ord<>+8(SB), R13 + SBBQ p256ord<>+16(SB), R8 + SBBQ p256ord<>+24(SB), R9 + SBBQ $0x00, R10 + CMOVQCS SI, R12 + CMOVQCS R11, R13 + CMOVQCS R14, R8 + CMOVQCS R15, R9 + MOVQ R12, (DI) + MOVQ R13, 8(DI) + MOVQ R8, 16(DI) + MOVQ R9, 24(DI) RET -/* ---------------------------------------*/ -// func p256OrdSqr(res, in *p256OrdElement, n int) -TEXT ·p256OrdSqr(SB),NOSPLIT,$0 - MOVQ res+0(FP), res_ptr - MOVQ in+8(FP), x_ptr + +DATA p256ordK0<>+0(SB)/8, $0xccd1c8aaee00bc4f +GLOBL p256ordK0<>(SB), RODATA, $8 + +DATA p256ord<>+0(SB)/8, $0xf3b9cac2fc632551 +DATA p256ord<>+8(SB)/8, $0xbce6faada7179e84 +DATA p256ord<>+16(SB)/8, $0xffffffffffffffff +DATA p256ord<>+24(SB)/8, $0xffffffff00000000 +GLOBL p256ord<>(SB), RODATA, $32 + +// func p256OrdSqr(res *p256OrdElement, in *p256OrdElement, n int) +// Requires: CMOV +TEXT ·p256OrdSqr(SB), NOSPLIT, $0-24 + MOVQ res+0(FP), DI + MOVQ in+8(FP), SI MOVQ n+16(FP), BX ordSqrLoop: - // y[1:] * y[0] - MOVQ (8*0)(x_ptr), t0 - - MOVQ (8*1)(x_ptr), AX - MULQ t0 - MOVQ AX, acc1 - MOVQ DX, acc2 - - MOVQ (8*2)(x_ptr), AX - MULQ t0 - ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, acc3 - - MOVQ (8*3)(x_ptr), AX - MULQ t0 - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, acc4 + MOVQ (SI), R14 + MOVQ 8(SI), AX + MULQ R14 + MOVQ AX, R9 + MOVQ DX, R10 + MOVQ 16(SI), AX + MULQ R14 + ADDQ AX, R10 + ADCQ $0x00, DX + MOVQ DX, R11 + MOVQ 24(SI), AX + MULQ R14 + ADDQ AX, R11 + ADCQ $0x00, DX + MOVQ DX, R12 + // y[2:] * y[1] - MOVQ (8*1)(x_ptr), t0 - - MOVQ (8*2)(x_ptr), AX - MULQ t0 - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ (8*3)(x_ptr), AX - MULQ t0 - ADDQ t1, acc4 - ADCQ $0, DX - ADDQ AX, acc4 - ADCQ $0, DX - MOVQ DX, acc5 + MOVQ 8(SI), R14 + MOVQ 16(SI), AX + MULQ R14 + ADDQ AX, R11 + ADCQ $0x00, DX + MOVQ DX, R15 + MOVQ 24(SI), AX + MULQ R14 + ADDQ R15, R12 + ADCQ $0x00, DX + ADDQ AX, R12 + ADCQ $0x00, DX + MOVQ DX, R13 + // y[3] * y[2] - MOVQ (8*2)(x_ptr), t0 - - MOVQ (8*3)(x_ptr), AX - MULQ t0 - ADDQ AX, acc5 - ADCQ $0, DX - MOVQ DX, y_ptr - XORQ t1, t1 + MOVQ 16(SI), R14 + MOVQ 24(SI), AX + MULQ R14 + ADDQ AX, R13 + ADCQ $0x00, DX + MOVQ DX, CX + XORQ R15, R15 + // *2 - ADDQ acc1, acc1 - ADCQ acc2, acc2 - ADCQ acc3, acc3 - ADCQ acc4, acc4 - ADCQ acc5, acc5 - ADCQ y_ptr, y_ptr - ADCQ $0, t1 + ADDQ R9, R9 + ADCQ R10, R10 + ADCQ R11, R11 + ADCQ R12, R12 + ADCQ R13, R13 + ADCQ CX, CX + ADCQ $0x00, R15 + // Missing products - MOVQ (8*0)(x_ptr), AX + MOVQ (SI), AX MULQ AX - MOVQ AX, acc0 - MOVQ DX, t0 - - MOVQ (8*1)(x_ptr), AX + MOVQ AX, R8 + MOVQ DX, R14 + MOVQ 8(SI), AX MULQ AX - ADDQ t0, acc1 - ADCQ AX, acc2 - ADCQ $0, DX - MOVQ DX, t0 - - MOVQ (8*2)(x_ptr), AX + ADDQ R14, R9 + ADCQ AX, R10 + ADCQ $0x00, DX + MOVQ DX, R14 + MOVQ 16(SI), AX MULQ AX - ADDQ t0, acc3 - ADCQ AX, acc4 - ADCQ $0, DX - MOVQ DX, t0 - - MOVQ (8*3)(x_ptr), AX + ADDQ R14, R11 + ADCQ AX, R12 + ADCQ $0x00, DX + MOVQ DX, R14 + MOVQ 24(SI), AX MULQ AX - ADDQ t0, acc5 - ADCQ AX, y_ptr - ADCQ DX, t1 - MOVQ t1, x_ptr + ADDQ R14, R13 + ADCQ AX, CX + ADCQ DX, R15 + MOVQ R15, SI + // First reduction step - MOVQ acc0, AX - MULQ p256ordK0<>(SB) - MOVQ AX, t0 - - MOVQ p256ord<>+0x00(SB), AX - MULQ t0 - ADDQ AX, acc0 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ p256ord<>+0x08(SB), AX - MULQ t0 - ADDQ t1, acc1 - ADCQ $0, DX - ADDQ AX, acc1 - - MOVQ t0, t1 - ADCQ DX, acc2 - ADCQ $0, t1 - SUBQ t0, acc2 - 
SBBQ $0, t1 - - MOVQ t0, AX - MOVQ t0, DX - MOVQ t0, acc0 - SHLQ $32, AX - SHRQ $32, DX - - ADDQ t1, acc3 - ADCQ $0, acc0 - SUBQ AX, acc3 - SBBQ DX, acc0 + MOVQ R8, AX + MULQ p256ordK0<>+0(SB) + MOVQ AX, R14 + MOVQ p256ord<>+0(SB), AX + MULQ R14 + ADDQ AX, R8 + ADCQ $0x00, DX + MOVQ DX, R15 + MOVQ p256ord<>+8(SB), AX + MULQ R14 + ADDQ R15, R9 + ADCQ $0x00, DX + ADDQ AX, R9 + MOVQ R14, R15 + ADCQ DX, R10 + ADCQ $0x00, R15 + SUBQ R14, R10 + SBBQ $0x00, R15 + MOVQ R14, AX + MOVQ R14, DX + MOVQ R14, R8 + SHLQ $0x20, AX + SHRQ $0x20, DX + ADDQ R15, R11 + ADCQ $0x00, R8 + SUBQ AX, R11 + SBBQ DX, R8 + // Second reduction step - MOVQ acc1, AX - MULQ p256ordK0<>(SB) - MOVQ AX, t0 - - MOVQ p256ord<>+0x00(SB), AX - MULQ t0 - ADDQ AX, acc1 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ p256ord<>+0x08(SB), AX - MULQ t0 - ADDQ t1, acc2 - ADCQ $0, DX - ADDQ AX, acc2 - - MOVQ t0, t1 - ADCQ DX, acc3 - ADCQ $0, t1 - SUBQ t0, acc3 - SBBQ $0, t1 - - MOVQ t0, AX - MOVQ t0, DX - MOVQ t0, acc1 - SHLQ $32, AX - SHRQ $32, DX - - ADDQ t1, acc0 - ADCQ $0, acc1 - SUBQ AX, acc0 - SBBQ DX, acc1 + MOVQ R9, AX + MULQ p256ordK0<>+0(SB) + MOVQ AX, R14 + MOVQ p256ord<>+0(SB), AX + MULQ R14 + ADDQ AX, R9 + ADCQ $0x00, DX + MOVQ DX, R15 + MOVQ p256ord<>+8(SB), AX + MULQ R14 + ADDQ R15, R10 + ADCQ $0x00, DX + ADDQ AX, R10 + MOVQ R14, R15 + ADCQ DX, R11 + ADCQ $0x00, R15 + SUBQ R14, R11 + SBBQ $0x00, R15 + MOVQ R14, AX + MOVQ R14, DX + MOVQ R14, R9 + SHLQ $0x20, AX + SHRQ $0x20, DX + ADDQ R15, R8 + ADCQ $0x00, R9 + SUBQ AX, R8 + SBBQ DX, R9 + // Third reduction step - MOVQ acc2, AX - MULQ p256ordK0<>(SB) - MOVQ AX, t0 - - MOVQ p256ord<>+0x00(SB), AX - MULQ t0 - ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ p256ord<>+0x08(SB), AX - MULQ t0 - ADDQ t1, acc3 - ADCQ $0, DX - ADDQ AX, acc3 - - MOVQ t0, t1 - ADCQ DX, acc0 - ADCQ $0, t1 - SUBQ t0, acc0 - SBBQ $0, t1 - - MOVQ t0, AX - MOVQ t0, DX - MOVQ t0, acc2 - SHLQ $32, AX - SHRQ $32, DX - - ADDQ t1, acc1 - ADCQ $0, acc2 - SUBQ AX, acc1 - SBBQ DX, acc2 + MOVQ R10, AX + MULQ p256ordK0<>+0(SB) + MOVQ AX, R14 + MOVQ p256ord<>+0(SB), AX + MULQ R14 + ADDQ AX, R10 + ADCQ $0x00, DX + MOVQ DX, R15 + MOVQ p256ord<>+8(SB), AX + MULQ R14 + ADDQ R15, R11 + ADCQ $0x00, DX + ADDQ AX, R11 + MOVQ R14, R15 + ADCQ DX, R8 + ADCQ $0x00, R15 + SUBQ R14, R8 + SBBQ $0x00, R15 + MOVQ R14, AX + MOVQ R14, DX + MOVQ R14, R10 + SHLQ $0x20, AX + SHRQ $0x20, DX + ADDQ R15, R9 + ADCQ $0x00, R10 + SUBQ AX, R9 + SBBQ DX, R10 + // Last reduction step - MOVQ acc3, AX - MULQ p256ordK0<>(SB) - MOVQ AX, t0 - - MOVQ p256ord<>+0x00(SB), AX - MULQ t0 - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ p256ord<>+0x08(SB), AX - MULQ t0 - ADDQ t1, acc0 - ADCQ $0, DX - ADDQ AX, acc0 - ADCQ $0, DX - MOVQ DX, t1 - - MOVQ t0, t1 - ADCQ DX, acc1 - ADCQ $0, t1 - SUBQ t0, acc1 - SBBQ $0, t1 - - MOVQ t0, AX - MOVQ t0, DX - MOVQ t0, acc3 - SHLQ $32, AX - SHRQ $32, DX - - ADDQ t1, acc2 - ADCQ $0, acc3 - SUBQ AX, acc2 - SBBQ DX, acc3 - XORQ t0, t0 + MOVQ R11, AX + MULQ p256ordK0<>+0(SB) + MOVQ AX, R14 + MOVQ p256ord<>+0(SB), AX + MULQ R14 + ADDQ AX, R11 + ADCQ $0x00, DX + MOVQ DX, R15 + MOVQ p256ord<>+8(SB), AX + MULQ R14 + ADDQ R15, R8 + ADCQ $0x00, DX + ADDQ AX, R8 + ADCQ $0x00, DX + MOVQ DX, R15 + MOVQ R14, R15 + ADCQ DX, R9 + ADCQ $0x00, R15 + SUBQ R14, R9 + SBBQ $0x00, R15 + MOVQ R14, AX + MOVQ R14, DX + MOVQ R14, R11 + SHLQ $0x20, AX + SHRQ $0x20, DX + ADDQ R15, R10 + ADCQ $0x00, R11 + SUBQ AX, R10 + SBBQ DX, R11 + XORQ R14, R14 + // Add bits [511:256] of the sqr result - ADCQ acc4, acc0 - ADCQ acc5, acc1 - ADCQ y_ptr, acc2 - ADCQ 
x_ptr, acc3 - ADCQ $0, t0 - - MOVQ acc0, acc4 - MOVQ acc1, acc5 - MOVQ acc2, y_ptr - MOVQ acc3, t1 - // Subtract p256 - SUBQ p256ord<>+0x00(SB), acc0 - SBBQ p256ord<>+0x08(SB) ,acc1 - SBBQ p256ord<>+0x10(SB), acc2 - SBBQ p256ord<>+0x18(SB), acc3 - SBBQ $0, t0 - - CMOVQCS acc4, acc0 - CMOVQCS acc5, acc1 - CMOVQCS y_ptr, acc2 - CMOVQCS t1, acc3 - - MOVQ acc0, (8*0)(res_ptr) - MOVQ acc1, (8*1)(res_ptr) - MOVQ acc2, (8*2)(res_ptr) - MOVQ acc3, (8*3)(res_ptr) - MOVQ res_ptr, x_ptr - DECQ BX - JNE ordSqrLoop + ADCQ R12, R8 + ADCQ R13, R9 + ADCQ CX, R10 + ADCQ SI, R11 + ADCQ $0x00, R14 + MOVQ R8, R12 + MOVQ R9, R13 + MOVQ R10, CX + MOVQ R11, R15 + // Subtract p256 + SUBQ p256ord<>+0(SB), R8 + SBBQ p256ord<>+8(SB), R9 + SBBQ p256ord<>+16(SB), R10 + SBBQ p256ord<>+24(SB), R11 + SBBQ $0x00, R14 + CMOVQCS R12, R8 + CMOVQCS R13, R9 + CMOVQCS CX, R10 + CMOVQCS R15, R11 + MOVQ R8, (DI) + MOVQ R9, 8(DI) + MOVQ R10, 16(DI) + MOVQ R11, 24(DI) + MOVQ DI, SI + DECQ BX + JNE ordSqrLoop RET -/* ---------------------------------------*/ -#undef res_ptr -#undef x_ptr -#undef y_ptr - -#undef acc0 -#undef acc1 -#undef acc2 -#undef acc3 -#undef acc4 -#undef acc5 -#undef t0 -#undef t1 -/* ---------------------------------------*/ -#define mul0 AX -#define mul1 DX -#define acc0 BX -#define acc1 CX -#define acc2 R8 -#define acc3 R9 -#define acc4 R10 -#define acc5 R11 -#define acc6 R12 -#define acc7 R13 -#define t0 R14 -#define t1 R15 -#define t2 DI -#define t3 SI -#define hlp BP -/* ---------------------------------------*/ -TEXT p256SubInternal(SB),NOSPLIT,$0 - XORQ mul0, mul0 - SUBQ t0, acc4 - SBBQ t1, acc5 - SBBQ t2, acc6 - SBBQ t3, acc7 - SBBQ $0, mul0 - - MOVQ acc4, acc0 - MOVQ acc5, acc1 - MOVQ acc6, acc2 - MOVQ acc7, acc3 - - ADDQ $-1, acc4 - ADCQ p256const0<>(SB), acc5 - ADCQ $0, acc6 - ADCQ p256const1<>(SB), acc7 - ANDQ $1, mul0 - - CMOVQEQ acc0, acc4 - CMOVQEQ acc1, acc5 - CMOVQEQ acc2, acc6 - CMOVQEQ acc3, acc7 +// func p256SubInternal() +// Requires: CMOV +TEXT p256SubInternal(SB), NOSPLIT, $0 + XORQ AX, AX + SUBQ R14, R10 + SBBQ R15, R11 + SBBQ DI, R12 + SBBQ SI, R13 + SBBQ $0x00, AX + MOVQ R10, BX + MOVQ R11, CX + MOVQ R12, R8 + MOVQ R13, R9 + ADDQ $-1, R10 + ADCQ p256const0<>+0(SB), R11 + ADCQ $0x00, R12 + ADCQ p256const1<>+0(SB), R13 + ANDQ $0x01, AX + CMOVQEQ BX, R10 + CMOVQEQ CX, R11 + CMOVQEQ R8, R12 + CMOVQEQ R9, R13 RET -/* ---------------------------------------*/ -TEXT p256MulInternal(SB),NOSPLIT,$8 - MOVQ acc4, mul0 - MULQ t0 - MOVQ mul0, acc0 - MOVQ mul1, acc1 - - MOVQ acc4, mul0 - MULQ t1 - ADDQ mul0, acc1 - ADCQ $0, mul1 - MOVQ mul1, acc2 - - MOVQ acc4, mul0 - MULQ t2 - ADDQ mul0, acc2 - ADCQ $0, mul1 - MOVQ mul1, acc3 - - MOVQ acc4, mul0 - MULQ t3 - ADDQ mul0, acc3 - ADCQ $0, mul1 - MOVQ mul1, acc4 - - MOVQ acc5, mul0 - MULQ t0 - ADDQ mul0, acc1 - ADCQ $0, mul1 - MOVQ mul1, hlp - - MOVQ acc5, mul0 - MULQ t1 - ADDQ hlp, acc2 - ADCQ $0, mul1 - ADDQ mul0, acc2 - ADCQ $0, mul1 - MOVQ mul1, hlp - - MOVQ acc5, mul0 - MULQ t2 - ADDQ hlp, acc3 - ADCQ $0, mul1 - ADDQ mul0, acc3 - ADCQ $0, mul1 - MOVQ mul1, hlp - - MOVQ acc5, mul0 - MULQ t3 - ADDQ hlp, acc4 - ADCQ $0, mul1 - ADDQ mul0, acc4 - ADCQ $0, mul1 - MOVQ mul1, acc5 - - MOVQ acc6, mul0 - MULQ t0 - ADDQ mul0, acc2 - ADCQ $0, mul1 - MOVQ mul1, hlp - - MOVQ acc6, mul0 - MULQ t1 - ADDQ hlp, acc3 - ADCQ $0, mul1 - ADDQ mul0, acc3 - ADCQ $0, mul1 - MOVQ mul1, hlp - - MOVQ acc6, mul0 - MULQ t2 - ADDQ hlp, acc4 - ADCQ $0, mul1 - ADDQ mul0, acc4 - ADCQ $0, mul1 - MOVQ mul1, hlp - - MOVQ acc6, mul0 - MULQ t3 - ADDQ hlp, acc5 - ADCQ $0, mul1 - ADDQ 
mul0, acc5 - ADCQ $0, mul1 - MOVQ mul1, acc6 - - MOVQ acc7, mul0 - MULQ t0 - ADDQ mul0, acc3 - ADCQ $0, mul1 - MOVQ mul1, hlp - - MOVQ acc7, mul0 - MULQ t1 - ADDQ hlp, acc4 - ADCQ $0, mul1 - ADDQ mul0, acc4 - ADCQ $0, mul1 - MOVQ mul1, hlp - - MOVQ acc7, mul0 - MULQ t2 - ADDQ hlp, acc5 - ADCQ $0, mul1 - ADDQ mul0, acc5 - ADCQ $0, mul1 - MOVQ mul1, hlp - - MOVQ acc7, mul0 - MULQ t3 - ADDQ hlp, acc6 - ADCQ $0, mul1 - ADDQ mul0, acc6 - ADCQ $0, mul1 - MOVQ mul1, acc7 + +// func p256MulInternal() +// Requires: CMOV +TEXT p256MulInternal(SB), NOSPLIT, $8 + MOVQ R10, AX + MULQ R14 + MOVQ AX, BX + MOVQ DX, CX + MOVQ R10, AX + MULQ R15 + ADDQ AX, CX + ADCQ $0x00, DX + MOVQ DX, R8 + MOVQ R10, AX + MULQ DI + ADDQ AX, R8 + ADCQ $0x00, DX + MOVQ DX, R9 + MOVQ R10, AX + MULQ SI + ADDQ AX, R9 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ R11, AX + MULQ R14 + ADDQ AX, CX + ADCQ $0x00, DX + MOVQ DX, BP + MOVQ R11, AX + MULQ R15 + ADDQ BP, R8 + ADCQ $0x00, DX + ADDQ AX, R8 + ADCQ $0x00, DX + MOVQ DX, BP + MOVQ R11, AX + MULQ DI + ADDQ BP, R9 + ADCQ $0x00, DX + ADDQ AX, R9 + ADCQ $0x00, DX + MOVQ DX, BP + MOVQ R11, AX + MULQ SI + ADDQ BP, R10 + ADCQ $0x00, DX + ADDQ AX, R10 + ADCQ $0x00, DX + MOVQ DX, R11 + MOVQ R12, AX + MULQ R14 + ADDQ AX, R8 + ADCQ $0x00, DX + MOVQ DX, BP + MOVQ R12, AX + MULQ R15 + ADDQ BP, R9 + ADCQ $0x00, DX + ADDQ AX, R9 + ADCQ $0x00, DX + MOVQ DX, BP + MOVQ R12, AX + MULQ DI + ADDQ BP, R10 + ADCQ $0x00, DX + ADDQ AX, R10 + ADCQ $0x00, DX + MOVQ DX, BP + MOVQ R12, AX + MULQ SI + ADDQ BP, R11 + ADCQ $0x00, DX + ADDQ AX, R11 + ADCQ $0x00, DX + MOVQ DX, R12 + MOVQ R13, AX + MULQ R14 + ADDQ AX, R9 + ADCQ $0x00, DX + MOVQ DX, BP + MOVQ R13, AX + MULQ R15 + ADDQ BP, R10 + ADCQ $0x00, DX + ADDQ AX, R10 + ADCQ $0x00, DX + MOVQ DX, BP + MOVQ R13, AX + MULQ DI + ADDQ BP, R11 + ADCQ $0x00, DX + ADDQ AX, R11 + ADCQ $0x00, DX + MOVQ DX, BP + MOVQ R13, AX + MULQ SI + ADDQ BP, R12 + ADCQ $0x00, DX + ADDQ AX, R12 + ADCQ $0x00, DX + MOVQ DX, R13 + // First reduction step - MOVQ acc0, mul0 - MOVQ acc0, hlp - SHLQ $32, acc0 - MULQ p256const1<>(SB) - SHRQ $32, hlp - ADDQ acc0, acc1 - ADCQ hlp, acc2 - ADCQ mul0, acc3 - ADCQ $0, mul1 - MOVQ mul1, acc0 + MOVQ BX, AX + MOVQ BX, BP + SHLQ $0x20, BX + MULQ p256const1<>+0(SB) + SHRQ $0x20, BP + ADDQ BX, CX + ADCQ BP, R8 + ADCQ AX, R9 + ADCQ $0x00, DX + MOVQ DX, BX + // Second reduction step - MOVQ acc1, mul0 - MOVQ acc1, hlp - SHLQ $32, acc1 - MULQ p256const1<>(SB) - SHRQ $32, hlp - ADDQ acc1, acc2 - ADCQ hlp, acc3 - ADCQ mul0, acc0 - ADCQ $0, mul1 - MOVQ mul1, acc1 + MOVQ CX, AX + MOVQ CX, BP + SHLQ $0x20, CX + MULQ p256const1<>+0(SB) + SHRQ $0x20, BP + ADDQ CX, R8 + ADCQ BP, R9 + ADCQ AX, BX + ADCQ $0x00, DX + MOVQ DX, CX + // Third reduction step - MOVQ acc2, mul0 - MOVQ acc2, hlp - SHLQ $32, acc2 - MULQ p256const1<>(SB) - SHRQ $32, hlp - ADDQ acc2, acc3 - ADCQ hlp, acc0 - ADCQ mul0, acc1 - ADCQ $0, mul1 - MOVQ mul1, acc2 + MOVQ R8, AX + MOVQ R8, BP + SHLQ $0x20, R8 + MULQ p256const1<>+0(SB) + SHRQ $0x20, BP + ADDQ R8, R9 + ADCQ BP, BX + ADCQ AX, CX + ADCQ $0x00, DX + MOVQ DX, R8 + // Last reduction step - MOVQ acc3, mul0 - MOVQ acc3, hlp - SHLQ $32, acc3 - MULQ p256const1<>(SB) - SHRQ $32, hlp - ADDQ acc3, acc0 - ADCQ hlp, acc1 - ADCQ mul0, acc2 - ADCQ $0, mul1 - MOVQ mul1, acc3 - MOVQ $0, BP + MOVQ R9, AX + MOVQ R9, BP + SHLQ $0x20, R9 + MULQ p256const1<>+0(SB) + SHRQ $0x20, BP + ADDQ R9, BX + ADCQ BP, CX + ADCQ AX, R8 + ADCQ $0x00, DX + MOVQ DX, R9 + MOVQ $0x00000000, BP + // Add bits [511:256] of the result - ADCQ acc0, acc4 - ADCQ acc1, acc5 - ADCQ acc2, acc6 
- ADCQ acc3, acc7 - ADCQ $0, hlp + ADCQ BX, R10 + ADCQ CX, R11 + ADCQ R8, R12 + ADCQ R9, R13 + ADCQ $0x00, BP + // Copy result - MOVQ acc4, acc0 - MOVQ acc5, acc1 - MOVQ acc6, acc2 - MOVQ acc7, acc3 + MOVQ R10, BX + MOVQ R11, CX + MOVQ R12, R8 + MOVQ R13, R9 + // Subtract p256 - SUBQ $-1, acc4 - SBBQ p256const0<>(SB) ,acc5 - SBBQ $0, acc6 - SBBQ p256const1<>(SB), acc7 - SBBQ $0, hlp - // If the result of the subtraction is negative, restore the previous result - CMOVQCS acc0, acc4 - CMOVQCS acc1, acc5 - CMOVQCS acc2, acc6 - CMOVQCS acc3, acc7 + SUBQ $-1, R10 + SBBQ p256const0<>+0(SB), R11 + SBBQ $0x00, R12 + SBBQ p256const1<>+0(SB), R13 + SBBQ $0x00, BP + // If the result of the subtraction is negative, restore the previous result + CMOVQCS BX, R10 + CMOVQCS CX, R11 + CMOVQCS R8, R12 + CMOVQCS R9, R13 RET -/* ---------------------------------------*/ -TEXT p256SqrInternal(SB),NOSPLIT,$8 - - MOVQ acc4, mul0 - MULQ acc5 - MOVQ mul0, acc1 - MOVQ mul1, acc2 - - MOVQ acc4, mul0 - MULQ acc6 - ADDQ mul0, acc2 - ADCQ $0, mul1 - MOVQ mul1, acc3 - - MOVQ acc4, mul0 - MULQ acc7 - ADDQ mul0, acc3 - ADCQ $0, mul1 - MOVQ mul1, t0 - - MOVQ acc5, mul0 - MULQ acc6 - ADDQ mul0, acc3 - ADCQ $0, mul1 - MOVQ mul1, hlp - - MOVQ acc5, mul0 - MULQ acc7 - ADDQ hlp, t0 - ADCQ $0, mul1 - ADDQ mul0, t0 - ADCQ $0, mul1 - MOVQ mul1, t1 - - MOVQ acc6, mul0 - MULQ acc7 - ADDQ mul0, t1 - ADCQ $0, mul1 - MOVQ mul1, t2 - XORQ t3, t3 + +// func p256SqrInternal() +// Requires: CMOV +TEXT p256SqrInternal(SB), NOSPLIT, $8 + MOVQ R10, AX + MULQ R11 + MOVQ AX, CX + MOVQ DX, R8 + MOVQ R10, AX + MULQ R12 + ADDQ AX, R8 + ADCQ $0x00, DX + MOVQ DX, R9 + MOVQ R10, AX + MULQ R13 + ADDQ AX, R9 + ADCQ $0x00, DX + MOVQ DX, R14 + MOVQ R11, AX + MULQ R12 + ADDQ AX, R9 + ADCQ $0x00, DX + MOVQ DX, BP + MOVQ R11, AX + MULQ R13 + ADDQ BP, R14 + ADCQ $0x00, DX + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R15 + MOVQ R12, AX + MULQ R13 + ADDQ AX, R15 + ADCQ $0x00, DX + MOVQ DX, DI + XORQ SI, SI + // *2 - ADDQ acc1, acc1 - ADCQ acc2, acc2 - ADCQ acc3, acc3 - ADCQ t0, t0 - ADCQ t1, t1 - ADCQ t2, t2 - ADCQ $0, t3 + ADDQ CX, CX + ADCQ R8, R8 + ADCQ R9, R9 + ADCQ R14, R14 + ADCQ R15, R15 + ADCQ DI, DI + ADCQ $0x00, SI + // Missing products - MOVQ acc4, mul0 - MULQ mul0 - MOVQ mul0, acc0 - MOVQ DX, acc4 - - MOVQ acc5, mul0 - MULQ mul0 - ADDQ acc4, acc1 - ADCQ mul0, acc2 - ADCQ $0, DX - MOVQ DX, acc4 - - MOVQ acc6, mul0 - MULQ mul0 - ADDQ acc4, acc3 - ADCQ mul0, t0 - ADCQ $0, DX - MOVQ DX, acc4 - - MOVQ acc7, mul0 - MULQ mul0 - ADDQ acc4, t1 - ADCQ mul0, t2 - ADCQ DX, t3 + MOVQ R10, AX + MULQ AX + MOVQ AX, BX + MOVQ DX, R10 + MOVQ R11, AX + MULQ AX + ADDQ R10, CX + ADCQ AX, R8 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ R12, AX + MULQ AX + ADDQ R10, R9 + ADCQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ R13, AX + MULQ AX + ADDQ R10, R15 + ADCQ AX, DI + ADCQ DX, SI + // First reduction step - MOVQ acc0, mul0 - MOVQ acc0, hlp - SHLQ $32, acc0 - MULQ p256const1<>(SB) - SHRQ $32, hlp - ADDQ acc0, acc1 - ADCQ hlp, acc2 - ADCQ mul0, acc3 - ADCQ $0, mul1 - MOVQ mul1, acc0 + MOVQ BX, AX + MOVQ BX, BP + SHLQ $0x20, BX + MULQ p256const1<>+0(SB) + SHRQ $0x20, BP + ADDQ BX, CX + ADCQ BP, R8 + ADCQ AX, R9 + ADCQ $0x00, DX + MOVQ DX, BX + // Second reduction step - MOVQ acc1, mul0 - MOVQ acc1, hlp - SHLQ $32, acc1 - MULQ p256const1<>(SB) - SHRQ $32, hlp - ADDQ acc1, acc2 - ADCQ hlp, acc3 - ADCQ mul0, acc0 - ADCQ $0, mul1 - MOVQ mul1, acc1 + MOVQ CX, AX + MOVQ CX, BP + SHLQ $0x20, CX + MULQ p256const1<>+0(SB) + SHRQ $0x20, BP + ADDQ CX, R8 + ADCQ BP, R9 + ADCQ AX, BX + ADCQ 
$0x00, DX + MOVQ DX, CX + // Third reduction step - MOVQ acc2, mul0 - MOVQ acc2, hlp - SHLQ $32, acc2 - MULQ p256const1<>(SB) - SHRQ $32, hlp - ADDQ acc2, acc3 - ADCQ hlp, acc0 - ADCQ mul0, acc1 - ADCQ $0, mul1 - MOVQ mul1, acc2 + MOVQ R8, AX + MOVQ R8, BP + SHLQ $0x20, R8 + MULQ p256const1<>+0(SB) + SHRQ $0x20, BP + ADDQ R8, R9 + ADCQ BP, BX + ADCQ AX, CX + ADCQ $0x00, DX + MOVQ DX, R8 + // Last reduction step - MOVQ acc3, mul0 - MOVQ acc3, hlp - SHLQ $32, acc3 - MULQ p256const1<>(SB) - SHRQ $32, hlp - ADDQ acc3, acc0 - ADCQ hlp, acc1 - ADCQ mul0, acc2 - ADCQ $0, mul1 - MOVQ mul1, acc3 - MOVQ $0, BP + MOVQ R9, AX + MOVQ R9, BP + SHLQ $0x20, R9 + MULQ p256const1<>+0(SB) + SHRQ $0x20, BP + ADDQ R9, BX + ADCQ BP, CX + ADCQ AX, R8 + ADCQ $0x00, DX + MOVQ DX, R9 + MOVQ $0x00000000, BP + // Add bits [511:256] of the result - ADCQ acc0, t0 - ADCQ acc1, t1 - ADCQ acc2, t2 - ADCQ acc3, t3 - ADCQ $0, hlp + ADCQ BX, R14 + ADCQ CX, R15 + ADCQ R8, DI + ADCQ R9, SI + ADCQ $0x00, BP + // Copy result - MOVQ t0, acc4 - MOVQ t1, acc5 - MOVQ t2, acc6 - MOVQ t3, acc7 + MOVQ R14, R10 + MOVQ R15, R11 + MOVQ DI, R12 + MOVQ SI, R13 + // Subtract p256 - SUBQ $-1, acc4 - SBBQ p256const0<>(SB) ,acc5 - SBBQ $0, acc6 - SBBQ p256const1<>(SB), acc7 - SBBQ $0, hlp - // If the result of the subtraction is negative, restore the previous result - CMOVQCS t0, acc4 - CMOVQCS t1, acc5 - CMOVQCS t2, acc6 - CMOVQCS t3, acc7 + SUBQ $-1, R10 + SBBQ p256const0<>+0(SB), R11 + SBBQ $0x00, R12 + SBBQ p256const1<>+0(SB), R13 + SBBQ $0x00, BP + // If the result of the subtraction is negative, restore the previous result + CMOVQCS R14, R10 + CMOVQCS R15, R11 + CMOVQCS DI, R12 + CMOVQCS SI, R13 RET -/* ---------------------------------------*/ -#define p256MulBy2Inline\ - XORQ mul0, mul0;\ - ADDQ acc4, acc4;\ - ADCQ acc5, acc5;\ - ADCQ acc6, acc6;\ - ADCQ acc7, acc7;\ - ADCQ $0, mul0;\ - MOVQ acc4, t0;\ - MOVQ acc5, t1;\ - MOVQ acc6, t2;\ - MOVQ acc7, t3;\ - SUBQ $-1, t0;\ - SBBQ p256const0<>(SB), t1;\ - SBBQ $0, t2;\ - SBBQ p256const1<>(SB), t3;\ - SBBQ $0, mul0;\ - CMOVQCS acc4, t0;\ - CMOVQCS acc5, t1;\ - CMOVQCS acc6, t2;\ - CMOVQCS acc7, t3; -/* ---------------------------------------*/ -#define p256AddInline \ - XORQ mul0, mul0;\ - ADDQ t0, acc4;\ - ADCQ t1, acc5;\ - ADCQ t2, acc6;\ - ADCQ t3, acc7;\ - ADCQ $0, mul0;\ - MOVQ acc4, t0;\ - MOVQ acc5, t1;\ - MOVQ acc6, t2;\ - MOVQ acc7, t3;\ - SUBQ $-1, t0;\ - SBBQ p256const0<>(SB), t1;\ - SBBQ $0, t2;\ - SBBQ p256const1<>(SB), t3;\ - SBBQ $0, mul0;\ - CMOVQCS acc4, t0;\ - CMOVQCS acc5, t1;\ - CMOVQCS acc6, t2;\ - CMOVQCS acc7, t3; -/* ---------------------------------------*/ -#define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7 -#define LDt(src) MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3 -#define ST(dst) MOVQ acc4, dst(8*0); MOVQ acc5, dst(8*1); MOVQ acc6, dst(8*2); MOVQ acc7, dst(8*3) -#define STt(dst) MOVQ t0, dst(8*0); MOVQ t1, dst(8*1); MOVQ t2, dst(8*2); MOVQ t3, dst(8*3) -#define acc2t MOVQ acc4, t0; MOVQ acc5, t1; MOVQ acc6, t2; MOVQ acc7, t3 -#define t2acc MOVQ t0, acc4; MOVQ t1, acc5; MOVQ t2, acc6; MOVQ t3, acc7 -/* ---------------------------------------*/ -#define x1in(off) (32*0 + off)(SP) -#define y1in(off) (32*1 + off)(SP) -#define z1in(off) (32*2 + off)(SP) -#define x2in(off) (32*3 + off)(SP) -#define y2in(off) (32*4 + off)(SP) -#define xout(off) (32*5 + off)(SP) -#define yout(off) (32*6 + off)(SP) -#define zout(off) (32*7 + off)(SP) -#define s2(off) (32*8 + off)(SP) -#define z1sqr(off) 
(32*9 + off)(SP) -#define h(off) (32*10 + off)(SP) -#define r(off) (32*11 + off)(SP) -#define hsqr(off) (32*12 + off)(SP) -#define rsqr(off) (32*13 + off)(SP) -#define hcub(off) (32*14 + off)(SP) -#define rptr (32*15)(SP) -#define sel_save (32*15 + 8)(SP) -#define zero_save (32*15 + 8 + 4)(SP) - -// func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int) -TEXT ·p256PointAddAffineAsm(SB),0,$512-48 - // Move input to stack in order to free registers - MOVQ res+0(FP), AX - MOVQ in1+8(FP), BX - MOVQ in2+16(FP), CX - MOVQ sign+24(FP), DX - MOVQ sel+32(FP), t1 - MOVQ zero+40(FP), t2 - - MOVOU (16*0)(BX), X0 - MOVOU (16*1)(BX), X1 - MOVOU (16*2)(BX), X2 - MOVOU (16*3)(BX), X3 - MOVOU (16*4)(BX), X4 - MOVOU (16*5)(BX), X5 - - MOVOU X0, x1in(16*0) - MOVOU X1, x1in(16*1) - MOVOU X2, y1in(16*0) - MOVOU X3, y1in(16*1) - MOVOU X4, z1in(16*0) - MOVOU X5, z1in(16*1) - - MOVOU (16*0)(CX), X0 - MOVOU (16*1)(CX), X1 - - MOVOU X0, x2in(16*0) - MOVOU X1, x2in(16*1) - // Store pointer to result - MOVQ mul0, rptr - MOVL t1, sel_save - MOVL t2, zero_save - // Negate y2in based on sign - MOVQ (16*2 + 8*0)(CX), acc4 - MOVQ (16*2 + 8*1)(CX), acc5 - MOVQ (16*2 + 8*2)(CX), acc6 - MOVQ (16*2 + 8*3)(CX), acc7 - MOVQ $-1, acc0 - MOVQ p256const0<>(SB), acc1 - MOVQ $0, acc2 - MOVQ p256const1<>(SB), acc3 - XORQ mul0, mul0 - // Speculatively subtract - SUBQ acc4, acc0 - SBBQ acc5, acc1 - SBBQ acc6, acc2 - SBBQ acc7, acc3 - SBBQ $0, mul0 - MOVQ acc0, t0 - MOVQ acc1, t1 - MOVQ acc2, t2 - MOVQ acc3, t3 - // Add in case the operand was > p256 - ADDQ $-1, acc0 - ADCQ p256const0<>(SB), acc1 - ADCQ $0, acc2 - ADCQ p256const1<>(SB), acc3 - ADCQ $0, mul0 - CMOVQNE t0, acc0 - CMOVQNE t1, acc1 - CMOVQNE t2, acc2 - CMOVQNE t3, acc3 - // If condition is 0, keep original value - TESTQ DX, DX - CMOVQEQ acc4, acc0 - CMOVQEQ acc5, acc1 - CMOVQEQ acc6, acc2 - CMOVQEQ acc7, acc3 - // Store result - MOVQ acc0, y2in(8*0) - MOVQ acc1, y2in(8*1) - MOVQ acc2, y2in(8*2) - MOVQ acc3, y2in(8*3) - // Begin point add - LDacc (z1in) - CALL p256SqrInternal(SB) // z1ˆ2 - ST (z1sqr) - - LDt (x2in) - CALL p256MulInternal(SB) // x2 * z1ˆ2 - - LDt (x1in) - CALL p256SubInternal(SB) // h = u2 - u1 - ST (h) - - LDt (z1in) - CALL p256MulInternal(SB) // z3 = h * z1 - ST (zout) - - LDacc (z1sqr) - CALL p256MulInternal(SB) // z1ˆ3 - LDt (y2in) - CALL p256MulInternal(SB) // s2 = y2 * z1ˆ3 - ST (s2) +// func p256PointAddAffineAsm(res *P256Point, in1 *P256Point, in2 *p256AffinePoint, sign int, sel int, zero int) +// Requires: CMOV, SSE2 +TEXT ·p256PointAddAffineAsm(SB), $512-48 + MOVQ res+0(FP), AX + MOVQ in1+8(FP), BX + MOVQ in2+16(FP), CX + MOVQ sign+24(FP), DX + MOVQ sel+32(FP), R15 + MOVQ zero+40(FP), DI + MOVOU (BX), X0 + MOVOU 16(BX), X1 + MOVOU 32(BX), X2 + MOVOU 48(BX), X3 + MOVOU 64(BX), X4 + MOVOU 80(BX), X5 + MOVOU X0, (SP) + MOVOU X1, 16(SP) + MOVOU X2, 32(SP) + MOVOU X3, 48(SP) + MOVOU X4, 64(SP) + MOVOU X5, 80(SP) + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU X0, 96(SP) + MOVOU X1, 112(SP) - LDt (y1in) - CALL p256SubInternal(SB) // r = s2 - s1 - ST (r) - - CALL p256SqrInternal(SB) // rsqr = rˆ2 - ST (rsqr) - - LDacc (h) - CALL p256SqrInternal(SB) // hsqr = hˆ2 - ST (hsqr) - - LDt (h) - CALL p256MulInternal(SB) // hcub = hˆ3 - ST (hcub) + // Store pointer to result + MOVQ AX, 480(SP) + MOVL R15, 488(SP) + MOVL DI, 492(SP) - LDt (y1in) - CALL p256MulInternal(SB) // y1 * hˆ3 - ST (s2) + // Negate y2in based on sign + MOVQ 32(CX), R10 + MOVQ 40(CX), R11 + MOVQ 48(CX), R12 + MOVQ 56(CX), R13 + MOVQ $-1, BX + MOVQ 
p256const0<>+0(SB), CX + MOVQ $0x00000000, R8 + MOVQ p256const1<>+0(SB), R9 + XORQ AX, AX - LDacc (x1in) - LDt (hsqr) - CALL p256MulInternal(SB) // u1 * hˆ2 - ST (h) + // Speculatively subtract + SUBQ R10, BX + SBBQ R11, CX + SBBQ R12, R8 + SBBQ R13, R9 + SBBQ $0x00, AX + MOVQ BX, R14 + MOVQ CX, R15 + MOVQ R8, DI + MOVQ R9, SI - p256MulBy2Inline // u1 * hˆ2 * 2, inline - LDacc (rsqr) - CALL p256SubInternal(SB) // rˆ2 - u1 * hˆ2 * 2 + // Add in case the operand was > p256 + ADDQ $-1, BX + ADCQ p256const0<>+0(SB), CX + ADCQ $0x00, R8 + ADCQ p256const1<>+0(SB), R9 + ADCQ $0x00, AX + CMOVQNE R14, BX + CMOVQNE R15, CX + CMOVQNE DI, R8 + CMOVQNE SI, R9 - LDt (hcub) - CALL p256SubInternal(SB) - ST (xout) + // If condition is 0, keep original value + TESTQ DX, DX + CMOVQEQ R10, BX + CMOVQEQ R11, CX + CMOVQEQ R12, R8 + CMOVQEQ R13, R9 - MOVQ acc4, t0 - MOVQ acc5, t1 - MOVQ acc6, t2 - MOVQ acc7, t3 - LDacc (h) - CALL p256SubInternal(SB) + // Store result + MOVQ BX, 128(SP) + MOVQ CX, 136(SP) + MOVQ R8, 144(SP) + MOVQ R9, 152(SP) - LDt (r) - CALL p256MulInternal(SB) + // Begin point add + MOVQ 64(SP), R10 + MOVQ 72(SP), R11 + MOVQ 80(SP), R12 + MOVQ 88(SP), R13 + CALL p256SqrInternal(SB) + MOVQ R10, 288(SP) + MOVQ R11, 296(SP) + MOVQ R12, 304(SP) + MOVQ R13, 312(SP) + MOVQ 96(SP), R14 + MOVQ 104(SP), R15 + MOVQ 112(SP), DI + MOVQ 120(SP), SI + CALL p256MulInternal(SB) + MOVQ (SP), R14 + MOVQ 8(SP), R15 + MOVQ 16(SP), DI + MOVQ 24(SP), SI + CALL p256SubInternal(SB) + MOVQ R10, 320(SP) + MOVQ R11, 328(SP) + MOVQ R12, 336(SP) + MOVQ R13, 344(SP) + MOVQ 64(SP), R14 + MOVQ 72(SP), R15 + MOVQ 80(SP), DI + MOVQ 88(SP), SI + CALL p256MulInternal(SB) + MOVQ R10, 224(SP) + MOVQ R11, 232(SP) + MOVQ R12, 240(SP) + MOVQ R13, 248(SP) + MOVQ 288(SP), R10 + MOVQ 296(SP), R11 + MOVQ 304(SP), R12 + MOVQ 312(SP), R13 + CALL p256MulInternal(SB) + MOVQ 128(SP), R14 + MOVQ 136(SP), R15 + MOVQ 144(SP), DI + MOVQ 152(SP), SI + CALL p256MulInternal(SB) + MOVQ R10, 256(SP) + MOVQ R11, 264(SP) + MOVQ R12, 272(SP) + MOVQ R13, 280(SP) + MOVQ 32(SP), R14 + MOVQ 40(SP), R15 + MOVQ 48(SP), DI + MOVQ 56(SP), SI + CALL p256SubInternal(SB) + MOVQ R10, 352(SP) + MOVQ R11, 360(SP) + MOVQ R12, 368(SP) + MOVQ R13, 376(SP) + CALL p256SqrInternal(SB) + MOVQ R10, 416(SP) + MOVQ R11, 424(SP) + MOVQ R12, 432(SP) + MOVQ R13, 440(SP) + MOVQ 320(SP), R10 + MOVQ 328(SP), R11 + MOVQ 336(SP), R12 + MOVQ 344(SP), R13 + CALL p256SqrInternal(SB) + MOVQ R10, 384(SP) + MOVQ R11, 392(SP) + MOVQ R12, 400(SP) + MOVQ R13, 408(SP) + MOVQ 320(SP), R14 + MOVQ 328(SP), R15 + MOVQ 336(SP), DI + MOVQ 344(SP), SI + CALL p256MulInternal(SB) + MOVQ R10, 448(SP) + MOVQ R11, 456(SP) + MOVQ R12, 464(SP) + MOVQ R13, 472(SP) + MOVQ 32(SP), R14 + MOVQ 40(SP), R15 + MOVQ 48(SP), DI + MOVQ 56(SP), SI + CALL p256MulInternal(SB) + MOVQ R10, 256(SP) + MOVQ R11, 264(SP) + MOVQ R12, 272(SP) + MOVQ R13, 280(SP) + MOVQ (SP), R10 + MOVQ 8(SP), R11 + MOVQ 16(SP), R12 + MOVQ 24(SP), R13 + MOVQ 384(SP), R14 + MOVQ 392(SP), R15 + MOVQ 400(SP), DI + MOVQ 408(SP), SI + CALL p256MulInternal(SB) + MOVQ R10, 320(SP) + MOVQ R11, 328(SP) + MOVQ R12, 336(SP) + MOVQ R13, 344(SP) + XORQ AX, AX + ADDQ R10, R10 + ADCQ R11, R11 + ADCQ R12, R12 + ADCQ R13, R13 + ADCQ $+0, AX + MOVQ R10, R14 + MOVQ R11, R15 + MOVQ R12, DI + MOVQ R13, SI + SUBQ $-1, R14 + SBBQ p256const0<>+0(SB), R15 + SBBQ $+0, DI + SBBQ p256const1<>+0(SB), SI + SBBQ $+0, AX + CMOVQCS R10, R14 + CMOVQCS R11, R15 + CMOVQCS R12, DI + CMOVQCS R13, SI + MOVQ 416(SP), R10 + MOVQ 424(SP), R11 + MOVQ 432(SP), R12 + MOVQ 440(SP), R13 + CALL 
p256SubInternal(SB) + MOVQ 448(SP), R14 + MOVQ 456(SP), R15 + MOVQ 464(SP), DI + MOVQ 472(SP), SI + CALL p256SubInternal(SB) + MOVQ R10, 160(SP) + MOVQ R11, 168(SP) + MOVQ R12, 176(SP) + MOVQ R13, 184(SP) + MOVQ R10, R14 + MOVQ R11, R15 + MOVQ R12, DI + MOVQ R13, SI + MOVQ 320(SP), R10 + MOVQ 328(SP), R11 + MOVQ 336(SP), R12 + MOVQ 344(SP), R13 + CALL p256SubInternal(SB) + MOVQ 352(SP), R14 + MOVQ 360(SP), R15 + MOVQ 368(SP), DI + MOVQ 376(SP), SI + CALL p256MulInternal(SB) + MOVQ 256(SP), R14 + MOVQ 264(SP), R15 + MOVQ 272(SP), DI + MOVQ 280(SP), SI + CALL p256SubInternal(SB) + MOVQ R10, 192(SP) + MOVQ R11, 200(SP) + MOVQ R12, 208(SP) + MOVQ R13, 216(SP) - LDt (s2) - CALL p256SubInternal(SB) - ST (yout) // Load stored values from stack - MOVQ rptr, AX - MOVL sel_save, BX - MOVL zero_save, CX - // The result is not valid if (sel == 0), conditional choose - MOVOU xout(16*0), X0 - MOVOU xout(16*1), X1 - MOVOU yout(16*0), X2 - MOVOU yout(16*1), X3 - MOVOU zout(16*0), X4 - MOVOU zout(16*1), X5 - - MOVL BX, X6 - MOVL CX, X7 + MOVQ 480(SP), AX + MOVL 488(SP), BX + MOVL 492(SP), CX - PXOR X8, X8 + // The result is not valid if (sel == 0), conditional choose + MOVOU 160(SP), X0 + MOVOU 176(SP), X1 + MOVOU 192(SP), X2 + MOVOU 208(SP), X3 + MOVOU 224(SP), X4 + MOVOU 240(SP), X5 + MOVL BX, X6 + MOVL CX, X7 + PXOR X8, X8 PCMPEQL X9, X9 - - PSHUFD $0, X6, X6 - PSHUFD $0, X7, X7 - + PSHUFD $0x00, X6, X6 + PSHUFD $0x00, X7, X7 PCMPEQL X8, X6 PCMPEQL X8, X7 + MOVOU X6, X15 + PANDN X9, X15 + MOVOU (SP), X9 + MOVOU 16(SP), X10 + MOVOU 32(SP), X11 + MOVOU 48(SP), X12 + MOVOU 64(SP), X13 + MOVOU 80(SP), X14 + PAND X15, X0 + PAND X15, X1 + PAND X15, X2 + PAND X15, X3 + PAND X15, X4 + PAND X15, X5 + PAND X6, X9 + PAND X6, X10 + PAND X6, X11 + PAND X6, X12 + PAND X6, X13 + PAND X6, X14 + PXOR X9, X0 + PXOR X10, X1 + PXOR X11, X2 + PXOR X12, X3 + PXOR X13, X4 + PXOR X14, X5 - MOVOU X6, X15 - PANDN X9, X15 - - MOVOU x1in(16*0), X9 - MOVOU x1in(16*1), X10 - MOVOU y1in(16*0), X11 - MOVOU y1in(16*1), X12 - MOVOU z1in(16*0), X13 - MOVOU z1in(16*1), X14 - - PAND X15, X0 - PAND X15, X1 - PAND X15, X2 - PAND X15, X3 - PAND X15, X4 - PAND X15, X5 - - PAND X6, X9 - PAND X6, X10 - PAND X6, X11 - PAND X6, X12 - PAND X6, X13 - PAND X6, X14 - - PXOR X9, X0 - PXOR X10, X1 - PXOR X11, X2 - PXOR X12, X3 - PXOR X13, X4 - PXOR X14, X5 // Similarly if zero == 0 PCMPEQL X9, X9 - MOVOU X7, X15 - PANDN X9, X15 - - MOVOU x2in(16*0), X9 - MOVOU x2in(16*1), X10 - MOVOU y2in(16*0), X11 - MOVOU y2in(16*1), X12 - MOVOU p256one<>+0x00(SB), X13 - MOVOU p256one<>+0x10(SB), X14 - - PAND X15, X0 - PAND X15, X1 - PAND X15, X2 - PAND X15, X3 - PAND X15, X4 - PAND X15, X5 - - PAND X7, X9 - PAND X7, X10 - PAND X7, X11 - PAND X7, X12 - PAND X7, X13 - PAND X7, X14 - - PXOR X9, X0 - PXOR X10, X1 - PXOR X11, X2 - PXOR X12, X3 - PXOR X13, X4 - PXOR X14, X5 - // Finally output the result - MOVOU X0, (16*0)(AX) - MOVOU X1, (16*1)(AX) - MOVOU X2, (16*2)(AX) - MOVOU X3, (16*3)(AX) - MOVOU X4, (16*4)(AX) - MOVOU X5, (16*5)(AX) - MOVQ $0, rptr + MOVOU X7, X15 + PANDN X9, X15 + MOVOU 96(SP), X9 + MOVOU 112(SP), X10 + MOVOU 128(SP), X11 + MOVOU 144(SP), X12 + MOVOU p256one<>+0(SB), X13 + MOVOU p256one<>+16(SB), X14 + PAND X15, X0 + PAND X15, X1 + PAND X15, X2 + PAND X15, X3 + PAND X15, X4 + PAND X15, X5 + PAND X7, X9 + PAND X7, X10 + PAND X7, X11 + PAND X7, X12 + PAND X7, X13 + PAND X7, X14 + PXOR X9, X0 + PXOR X10, X1 + PXOR X11, X2 + PXOR X12, X3 + PXOR X13, X4 + PXOR X14, X5 + // Finally output the result + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, 32(AX) 
+ MOVOU X3, 48(AX) + MOVOU X4, 64(AX) + MOVOU X5, 80(AX) + MOVQ $0x00000000, 480(SP) RET -#undef x1in -#undef y1in -#undef z1in -#undef x2in -#undef y2in -#undef xout -#undef yout -#undef zout -#undef s2 -#undef z1sqr -#undef h -#undef r -#undef hsqr -#undef rsqr -#undef hcub -#undef rptr -#undef sel_save -#undef zero_save - -// p256IsZero returns 1 in AX if [acc4..acc7] represents zero and zero -// otherwise. It writes to [acc4..acc7], t0 and t1. -TEXT p256IsZero(SB),NOSPLIT,$0 + +DATA p256one<>+0(SB)/8, $0x0000000000000001 +DATA p256one<>+8(SB)/8, $0xffffffff00000000 +DATA p256one<>+16(SB)/8, $0xffffffffffffffff +DATA p256one<>+24(SB)/8, $0x00000000fffffffe +GLOBL p256one<>(SB), RODATA, $32 + +// func p256IsZero() +// Requires: CMOV +TEXT p256IsZero(SB), NOSPLIT, $0 // AX contains a flag that is set if the input is zero. XORQ AX, AX - MOVQ $1, t1 + MOVQ $0x00000001, R15 // Check whether [acc4..acc7] are all zero. - MOVQ acc4, t0 - ORQ acc5, t0 - ORQ acc6, t0 - ORQ acc7, t0 + MOVQ R10, R14 + ORQ R11, R14 + ORQ R12, R14 + ORQ R13, R14 // Set the zero flag if so. (CMOV of a constant to a register doesn't // appear to be supported in Go. Thus t1 = 1.) - CMOVQEQ t1, AX + CMOVQEQ R15, AX // XOR [acc4..acc7] with P and compare with zero again. - XORQ $-1, acc4 - XORQ p256const0<>(SB), acc5 - XORQ p256const1<>(SB), acc7 - ORQ acc5, acc4 - ORQ acc6, acc4 - ORQ acc7, acc4 + XORQ $-1, R10 + XORQ p256const0<>+0(SB), R11 + XORQ p256const1<>+0(SB), R13 + ORQ R11, R10 + ORQ R12, R10 + ORQ R13, R10 // Set the zero flag if so. - CMOVQEQ t1, AX + CMOVQEQ R15, AX RET -/* ---------------------------------------*/ -#define x1in(off) (32*0 + off)(SP) -#define y1in(off) (32*1 + off)(SP) -#define z1in(off) (32*2 + off)(SP) -#define x2in(off) (32*3 + off)(SP) -#define y2in(off) (32*4 + off)(SP) -#define z2in(off) (32*5 + off)(SP) - -#define xout(off) (32*6 + off)(SP) -#define yout(off) (32*7 + off)(SP) -#define zout(off) (32*8 + off)(SP) - -#define u1(off) (32*9 + off)(SP) -#define u2(off) (32*10 + off)(SP) -#define s1(off) (32*11 + off)(SP) -#define s2(off) (32*12 + off)(SP) -#define z1sqr(off) (32*13 + off)(SP) -#define z2sqr(off) (32*14 + off)(SP) -#define h(off) (32*15 + off)(SP) -#define r(off) (32*16 + off)(SP) -#define hsqr(off) (32*17 + off)(SP) -#define rsqr(off) (32*18 + off)(SP) -#define hcub(off) (32*19 + off)(SP) -#define rptr (32*20)(SP) -#define points_eq (32*20+8)(SP) - -//func p256PointAddAsm(res, in1, in2 *P256Point) int -TEXT ·p256PointAddAsm(SB),0,$680-32 - // See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl +// func p256PointAddAsm(res *P256Point, in1 *P256Point, in2 *P256Point) int +// Requires: CMOV, SSE2 +TEXT ·p256PointAddAsm(SB), $680-32 // Move input to stack in order to free registers - MOVQ res+0(FP), AX - MOVQ in1+8(FP), BX - MOVQ in2+16(FP), CX + MOVQ res+0(FP), AX + MOVQ in1+8(FP), BX + MOVQ in2+16(FP), CX + MOVOU (BX), X0 + MOVOU 16(BX), X1 + MOVOU 32(BX), X2 + MOVOU 48(BX), X3 + MOVOU 64(BX), X4 + MOVOU 80(BX), X5 + MOVOU X0, (SP) + MOVOU X1, 16(SP) + MOVOU X2, 32(SP) + MOVOU X3, 48(SP) + MOVOU X4, 64(SP) + MOVOU X5, 80(SP) + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU 32(CX), X2 + MOVOU 48(CX), X3 + MOVOU 64(CX), X4 + MOVOU 80(CX), X5 + MOVOU X0, 96(SP) + MOVOU X1, 112(SP) + MOVOU X2, 128(SP) + MOVOU X3, 144(SP) + MOVOU X4, 160(SP) + MOVOU X5, 176(SP) - MOVOU (16*0)(BX), X0 - MOVOU (16*1)(BX), X1 - MOVOU (16*2)(BX), X2 - MOVOU (16*3)(BX), X3 - MOVOU (16*4)(BX), X4 - MOVOU (16*5)(BX), X5 - - MOVOU X0, x1in(16*0) - MOVOU X1, x1in(16*1) - MOVOU 
X2, y1in(16*0) - MOVOU X3, y1in(16*1) - MOVOU X4, z1in(16*0) - MOVOU X5, z1in(16*1) - - MOVOU (16*0)(CX), X0 - MOVOU (16*1)(CX), X1 - MOVOU (16*2)(CX), X2 - MOVOU (16*3)(CX), X3 - MOVOU (16*4)(CX), X4 - MOVOU (16*5)(CX), X5 - - MOVOU X0, x2in(16*0) - MOVOU X1, x2in(16*1) - MOVOU X2, y2in(16*0) - MOVOU X3, y2in(16*1) - MOVOU X4, z2in(16*0) - MOVOU X5, z2in(16*1) // Store pointer to result - MOVQ AX, rptr - // Begin point add - LDacc (z2in) - CALL p256SqrInternal(SB) // z2ˆ2 - ST (z2sqr) - LDt (z2in) - CALL p256MulInternal(SB) // z2ˆ3 - LDt (y1in) - CALL p256MulInternal(SB) // s1 = z2ˆ3*y1 - ST (s1) - - LDacc (z1in) - CALL p256SqrInternal(SB) // z1ˆ2 - ST (z1sqr) - LDt (z1in) - CALL p256MulInternal(SB) // z1ˆ3 - LDt (y2in) - CALL p256MulInternal(SB) // s2 = z1ˆ3*y2 - ST (s2) - - LDt (s1) - CALL p256SubInternal(SB) // r = s2 - s1 - ST (r) - CALL p256IsZero(SB) - MOVQ AX, points_eq - - LDacc (z2sqr) - LDt (x1in) - CALL p256MulInternal(SB) // u1 = x1 * z2ˆ2 - ST (u1) - LDacc (z1sqr) - LDt (x2in) - CALL p256MulInternal(SB) // u2 = x2 * z1ˆ2 - ST (u2) - - LDt (u1) - CALL p256SubInternal(SB) // h = u2 - u1 - ST (h) - CALL p256IsZero(SB) - ANDQ points_eq, AX - MOVQ AX, points_eq - - LDacc (r) - CALL p256SqrInternal(SB) // rsqr = rˆ2 - ST (rsqr) - - LDacc (h) - CALL p256SqrInternal(SB) // hsqr = hˆ2 - ST (hsqr) - - LDt (h) - CALL p256MulInternal(SB) // hcub = hˆ3 - ST (hcub) - - LDt (s1) - CALL p256MulInternal(SB) - ST (s2) - - LDacc (z1in) - LDt (z2in) - CALL p256MulInternal(SB) // z1 * z2 - LDt (h) - CALL p256MulInternal(SB) // z1 * z2 * h - ST (zout) - - LDacc (hsqr) - LDt (u1) - CALL p256MulInternal(SB) // hˆ2 * u1 - ST (u2) - - p256MulBy2Inline // u1 * hˆ2 * 2, inline - LDacc (rsqr) - CALL p256SubInternal(SB) // rˆ2 - u1 * hˆ2 * 2 + MOVQ AX, 640(SP) - LDt (hcub) - CALL p256SubInternal(SB) - ST (xout) - - MOVQ acc4, t0 - MOVQ acc5, t1 - MOVQ acc6, t2 - MOVQ acc7, t3 - LDacc (u2) - CALL p256SubInternal(SB) - - LDt (r) - CALL p256MulInternal(SB) + // Begin point add + MOVQ 160(SP), R10 + MOVQ 168(SP), R11 + MOVQ 176(SP), R12 + MOVQ 184(SP), R13 + CALL p256SqrInternal(SB) + MOVQ R10, 448(SP) + MOVQ R11, 456(SP) + MOVQ R12, 464(SP) + MOVQ R13, 472(SP) + MOVQ 160(SP), R14 + MOVQ 168(SP), R15 + MOVQ 176(SP), DI + MOVQ 184(SP), SI + CALL p256MulInternal(SB) + MOVQ 32(SP), R14 + MOVQ 40(SP), R15 + MOVQ 48(SP), DI + MOVQ 56(SP), SI + CALL p256MulInternal(SB) + MOVQ R10, 352(SP) + MOVQ R11, 360(SP) + MOVQ R12, 368(SP) + MOVQ R13, 376(SP) + MOVQ 64(SP), R10 + MOVQ 72(SP), R11 + MOVQ 80(SP), R12 + MOVQ 88(SP), R13 + CALL p256SqrInternal(SB) + MOVQ R10, 416(SP) + MOVQ R11, 424(SP) + MOVQ R12, 432(SP) + MOVQ R13, 440(SP) + MOVQ 64(SP), R14 + MOVQ 72(SP), R15 + MOVQ 80(SP), DI + MOVQ 88(SP), SI + CALL p256MulInternal(SB) + MOVQ 128(SP), R14 + MOVQ 136(SP), R15 + MOVQ 144(SP), DI + MOVQ 152(SP), SI + CALL p256MulInternal(SB) + MOVQ R10, 384(SP) + MOVQ R11, 392(SP) + MOVQ R12, 400(SP) + MOVQ R13, 408(SP) + MOVQ 352(SP), R14 + MOVQ 360(SP), R15 + MOVQ 368(SP), DI + MOVQ 376(SP), SI + CALL p256SubInternal(SB) + MOVQ R10, 512(SP) + MOVQ R11, 520(SP) + MOVQ R12, 528(SP) + MOVQ R13, 536(SP) + CALL p256IsZero(SB) + MOVQ AX, 648(SP) + MOVQ 448(SP), R10 + MOVQ 456(SP), R11 + MOVQ 464(SP), R12 + MOVQ 472(SP), R13 + MOVQ (SP), R14 + MOVQ 8(SP), R15 + MOVQ 16(SP), DI + MOVQ 24(SP), SI + CALL p256MulInternal(SB) + MOVQ R10, 288(SP) + MOVQ R11, 296(SP) + MOVQ R12, 304(SP) + MOVQ R13, 312(SP) + MOVQ 416(SP), R10 + MOVQ 424(SP), R11 + MOVQ 432(SP), R12 + MOVQ 440(SP), R13 + MOVQ 96(SP), R14 + MOVQ 104(SP), R15 + MOVQ 112(SP), 
DI + MOVQ 120(SP), SI + CALL p256MulInternal(SB) + MOVQ R10, 320(SP) + MOVQ R11, 328(SP) + MOVQ R12, 336(SP) + MOVQ R13, 344(SP) + MOVQ 288(SP), R14 + MOVQ 296(SP), R15 + MOVQ 304(SP), DI + MOVQ 312(SP), SI + CALL p256SubInternal(SB) + MOVQ R10, 480(SP) + MOVQ R11, 488(SP) + MOVQ R12, 496(SP) + MOVQ R13, 504(SP) + CALL p256IsZero(SB) + ANDQ 648(SP), AX + MOVQ AX, 648(SP) + MOVQ 512(SP), R10 + MOVQ 520(SP), R11 + MOVQ 528(SP), R12 + MOVQ 536(SP), R13 + CALL p256SqrInternal(SB) + MOVQ R10, 576(SP) + MOVQ R11, 584(SP) + MOVQ R12, 592(SP) + MOVQ R13, 600(SP) + MOVQ 480(SP), R10 + MOVQ 488(SP), R11 + MOVQ 496(SP), R12 + MOVQ 504(SP), R13 + CALL p256SqrInternal(SB) + MOVQ R10, 544(SP) + MOVQ R11, 552(SP) + MOVQ R12, 560(SP) + MOVQ R13, 568(SP) + MOVQ 480(SP), R14 + MOVQ 488(SP), R15 + MOVQ 496(SP), DI + MOVQ 504(SP), SI + CALL p256MulInternal(SB) + MOVQ R10, 608(SP) + MOVQ R11, 616(SP) + MOVQ R12, 624(SP) + MOVQ R13, 632(SP) + MOVQ 352(SP), R14 + MOVQ 360(SP), R15 + MOVQ 368(SP), DI + MOVQ 376(SP), SI + CALL p256MulInternal(SB) + MOVQ R10, 384(SP) + MOVQ R11, 392(SP) + MOVQ R12, 400(SP) + MOVQ R13, 408(SP) + MOVQ 64(SP), R10 + MOVQ 72(SP), R11 + MOVQ 80(SP), R12 + MOVQ 88(SP), R13 + MOVQ 160(SP), R14 + MOVQ 168(SP), R15 + MOVQ 176(SP), DI + MOVQ 184(SP), SI + CALL p256MulInternal(SB) + MOVQ 480(SP), R14 + MOVQ 488(SP), R15 + MOVQ 496(SP), DI + MOVQ 504(SP), SI + CALL p256MulInternal(SB) + MOVQ R10, 256(SP) + MOVQ R11, 264(SP) + MOVQ R12, 272(SP) + MOVQ R13, 280(SP) + MOVQ 544(SP), R10 + MOVQ 552(SP), R11 + MOVQ 560(SP), R12 + MOVQ 568(SP), R13 + MOVQ 288(SP), R14 + MOVQ 296(SP), R15 + MOVQ 304(SP), DI + MOVQ 312(SP), SI + CALL p256MulInternal(SB) + MOVQ R10, 320(SP) + MOVQ R11, 328(SP) + MOVQ R12, 336(SP) + MOVQ R13, 344(SP) + XORQ AX, AX + ADDQ R10, R10 + ADCQ R11, R11 + ADCQ R12, R12 + ADCQ R13, R13 + ADCQ $+0, AX + MOVQ R10, R14 + MOVQ R11, R15 + MOVQ R12, DI + MOVQ R13, SI + SUBQ $-1, R14 + SBBQ p256const0<>+0(SB), R15 + SBBQ $+0, DI + SBBQ p256const1<>+0(SB), SI + SBBQ $+0, AX + CMOVQCS R10, R14 + CMOVQCS R11, R15 + CMOVQCS R12, DI + CMOVQCS R13, SI + MOVQ 576(SP), R10 + MOVQ 584(SP), R11 + MOVQ 592(SP), R12 + MOVQ 600(SP), R13 + CALL p256SubInternal(SB) + MOVQ 608(SP), R14 + MOVQ 616(SP), R15 + MOVQ 624(SP), DI + MOVQ 632(SP), SI + CALL p256SubInternal(SB) + MOVQ R10, 192(SP) + MOVQ R11, 200(SP) + MOVQ R12, 208(SP) + MOVQ R13, 216(SP) + MOVQ R10, R14 + MOVQ R11, R15 + MOVQ R12, DI + MOVQ R13, SI + MOVQ 320(SP), R10 + MOVQ 328(SP), R11 + MOVQ 336(SP), R12 + MOVQ 344(SP), R13 + CALL p256SubInternal(SB) + MOVQ 512(SP), R14 + MOVQ 520(SP), R15 + MOVQ 528(SP), DI + MOVQ 536(SP), SI + CALL p256MulInternal(SB) + MOVQ 384(SP), R14 + MOVQ 392(SP), R15 + MOVQ 400(SP), DI + MOVQ 408(SP), SI + CALL p256SubInternal(SB) + MOVQ R10, 224(SP) + MOVQ R11, 232(SP) + MOVQ R12, 240(SP) + MOVQ R13, 248(SP) + MOVOU 192(SP), X0 + MOVOU 208(SP), X1 + MOVOU 224(SP), X2 + MOVOU 240(SP), X3 + MOVOU 256(SP), X4 + MOVOU 272(SP), X5 - LDt (s2) - CALL p256SubInternal(SB) - ST (yout) - - MOVOU xout(16*0), X0 - MOVOU xout(16*1), X1 - MOVOU yout(16*0), X2 - MOVOU yout(16*1), X3 - MOVOU zout(16*0), X4 - MOVOU zout(16*1), X5 // Finally output the result - MOVQ rptr, AX - MOVQ $0, rptr - MOVOU X0, (16*0)(AX) - MOVOU X1, (16*1)(AX) - MOVOU X2, (16*2)(AX) - MOVOU X3, (16*3)(AX) - MOVOU X4, (16*4)(AX) - MOVOU X5, (16*5)(AX) - - MOVQ points_eq, AX - MOVQ AX, ret+24(FP) - + MOVQ 640(SP), AX + MOVQ $0x00000000, 640(SP) + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, 32(AX) + MOVOU X3, 48(AX) + MOVOU X4, 64(AX) + MOVOU X5, 80(AX) + 
MOVQ 648(SP), AX + MOVQ AX, ret+24(FP) RET -#undef x1in -#undef y1in -#undef z1in -#undef x2in -#undef y2in -#undef z2in -#undef xout -#undef yout -#undef zout -#undef s1 -#undef s2 -#undef u1 -#undef u2 -#undef z1sqr -#undef z2sqr -#undef h -#undef r -#undef hsqr -#undef rsqr -#undef hcub -#undef rptr -/* ---------------------------------------*/ -#define x(off) (32*0 + off)(SP) -#define y(off) (32*1 + off)(SP) -#define z(off) (32*2 + off)(SP) - -#define s(off) (32*3 + off)(SP) -#define m(off) (32*4 + off)(SP) -#define zsqr(off) (32*5 + off)(SP) -#define tmp(off) (32*6 + off)(SP) -#define rptr (32*7)(SP) - -//func p256PointDoubleAsm(res, in *P256Point) -TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$256-16 - // Move input to stack in order to free registers - MOVQ res+0(FP), AX - MOVQ in+8(FP), BX - - MOVOU (16*0)(BX), X0 - MOVOU (16*1)(BX), X1 - MOVOU (16*2)(BX), X2 - MOVOU (16*3)(BX), X3 - MOVOU (16*4)(BX), X4 - MOVOU (16*5)(BX), X5 - - MOVOU X0, x(16*0) - MOVOU X1, x(16*1) - MOVOU X2, y(16*0) - MOVOU X3, y(16*1) - MOVOU X4, z(16*0) - MOVOU X5, z(16*1) + +// func p256PointDoubleAsm(res *P256Point, in *P256Point) +// Requires: CMOV, SSE2 +TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $256-16 + MOVQ res+0(FP), AX + MOVQ in+8(FP), BX + MOVOU (BX), X0 + MOVOU 16(BX), X1 + MOVOU 32(BX), X2 + MOVOU 48(BX), X3 + MOVOU 64(BX), X4 + MOVOU 80(BX), X5 + MOVOU X0, (SP) + MOVOU X1, 16(SP) + MOVOU X2, 32(SP) + MOVOU X3, 48(SP) + MOVOU X4, 64(SP) + MOVOU X5, 80(SP) + // Store pointer to result - MOVQ AX, rptr - // Begin point double - LDacc (z) - CALL p256SqrInternal(SB) - ST (zsqr) + MOVQ AX, 224(SP) - LDt (x) - p256AddInline - STt (m) + // Begin point double + MOVQ 64(SP), R10 + MOVQ 72(SP), R11 + MOVQ 80(SP), R12 + MOVQ 88(SP), R13 + CALL p256SqrInternal(SB) + MOVQ R10, 160(SP) + MOVQ R11, 168(SP) + MOVQ R12, 176(SP) + MOVQ R13, 184(SP) + MOVQ (SP), R14 + MOVQ 8(SP), R15 + MOVQ 16(SP), DI + MOVQ 24(SP), SI + XORQ AX, AX + ADDQ R14, R10 + ADCQ R15, R11 + ADCQ DI, R12 + ADCQ SI, R13 + ADCQ $+0, AX + MOVQ R10, R14 + MOVQ R11, R15 + MOVQ R12, DI + MOVQ R13, SI + SUBQ $-1, R14 + SBBQ p256const0<>+0(SB), R15 + SBBQ $+0, DI + SBBQ p256const1<>+0(SB), SI + SBBQ $+0, AX + CMOVQCS R10, R14 + CMOVQCS R11, R15 + CMOVQCS R12, DI + CMOVQCS R13, SI + MOVQ R14, 128(SP) + MOVQ R15, 136(SP) + MOVQ DI, 144(SP) + MOVQ SI, 152(SP) + MOVQ 64(SP), R10 + MOVQ 72(SP), R11 + MOVQ 80(SP), R12 + MOVQ 88(SP), R13 + MOVQ 32(SP), R14 + MOVQ 40(SP), R15 + MOVQ 48(SP), DI + MOVQ 56(SP), SI + CALL p256MulInternal(SB) + XORQ AX, AX + ADDQ R10, R10 + ADCQ R11, R11 + ADCQ R12, R12 + ADCQ R13, R13 + ADCQ $+0, AX + MOVQ R10, R14 + MOVQ R11, R15 + MOVQ R12, DI + MOVQ R13, SI + SUBQ $-1, R14 + SBBQ p256const0<>+0(SB), R15 + SBBQ $+0, DI + SBBQ p256const1<>+0(SB), SI + SBBQ $+0, AX + CMOVQCS R10, R14 + CMOVQCS R11, R15 + CMOVQCS R12, DI + CMOVQCS R13, SI + MOVQ 224(SP), AX - LDacc (z) - LDt (y) - CALL p256MulInternal(SB) - p256MulBy2Inline - MOVQ rptr, AX // Store z - MOVQ t0, (16*4 + 8*0)(AX) - MOVQ t1, (16*4 + 8*1)(AX) - MOVQ t2, (16*4 + 8*2)(AX) - MOVQ t3, (16*4 + 8*3)(AX) - - LDacc (x) - LDt (zsqr) + MOVQ R14, 64(AX) + MOVQ R15, 72(AX) + MOVQ DI, 80(AX) + MOVQ SI, 88(AX) + MOVQ (SP), R10 + MOVQ 8(SP), R11 + MOVQ 16(SP), R12 + MOVQ 24(SP), R13 + MOVQ 160(SP), R14 + MOVQ 168(SP), R15 + MOVQ 176(SP), DI + MOVQ 184(SP), SI CALL p256SubInternal(SB) - LDt (m) + MOVQ 128(SP), R14 + MOVQ 136(SP), R15 + MOVQ 144(SP), DI + MOVQ 152(SP), SI CALL p256MulInternal(SB) - ST (m) + MOVQ R10, 128(SP) + MOVQ R11, 136(SP) + MOVQ R12, 144(SP) + MOVQ R13, 152(SP) + // 
Multiply by 3 - p256MulBy2Inline - LDacc (m) - p256AddInline - STt (m) - //////////////////////// - LDacc (y) - p256MulBy2Inline - t2acc - CALL p256SqrInternal(SB) - ST (s) - CALL p256SqrInternal(SB) - // Divide by 2 - XORQ mul0, mul0 - MOVQ acc4, t0 - MOVQ acc5, t1 - MOVQ acc6, t2 - MOVQ acc7, t3 - - ADDQ $-1, acc4 - ADCQ p256const0<>(SB), acc5 - ADCQ $0, acc6 - ADCQ p256const1<>(SB), acc7 - ADCQ $0, mul0 - TESTQ $1, t0 - - CMOVQEQ t0, acc4 - CMOVQEQ t1, acc5 - CMOVQEQ t2, acc6 - CMOVQEQ t3, acc7 - ANDQ t0, mul0 - - SHRQ $1, acc5, acc4 - SHRQ $1, acc6, acc5 - SHRQ $1, acc7, acc6 - SHRQ $1, mul0, acc7 - ST (y) - ///////////////////////// - LDacc (x) - LDt (s) - CALL p256MulInternal(SB) - ST (s) - p256MulBy2Inline - STt (tmp) + XORQ AX, AX + ADDQ R10, R10 + ADCQ R11, R11 + ADCQ R12, R12 + ADCQ R13, R13 + ADCQ $+0, AX + MOVQ R10, R14 + MOVQ R11, R15 + MOVQ R12, DI + MOVQ R13, SI + SUBQ $-1, R14 + SBBQ p256const0<>+0(SB), R15 + SBBQ $+0, DI + SBBQ p256const1<>+0(SB), SI + SBBQ $+0, AX + CMOVQCS R10, R14 + CMOVQCS R11, R15 + CMOVQCS R12, DI + CMOVQCS R13, SI + MOVQ 128(SP), R10 + MOVQ 136(SP), R11 + MOVQ 144(SP), R12 + MOVQ 152(SP), R13 + XORQ AX, AX + ADDQ R14, R10 + ADCQ R15, R11 + ADCQ DI, R12 + ADCQ SI, R13 + ADCQ $+0, AX + MOVQ R10, R14 + MOVQ R11, R15 + MOVQ R12, DI + MOVQ R13, SI + SUBQ $-1, R14 + SBBQ p256const0<>+0(SB), R15 + SBBQ $+0, DI + SBBQ p256const1<>+0(SB), SI + SBBQ $+0, AX + CMOVQCS R10, R14 + CMOVQCS R11, R15 + CMOVQCS R12, DI + CMOVQCS R13, SI + MOVQ R14, 128(SP) + MOVQ R15, 136(SP) + MOVQ DI, 144(SP) + MOVQ SI, 152(SP) + + // //////////////////////// + MOVQ 32(SP), R10 + MOVQ 40(SP), R11 + MOVQ 48(SP), R12 + MOVQ 56(SP), R13 + XORQ AX, AX + ADDQ R10, R10 + ADCQ R11, R11 + ADCQ R12, R12 + ADCQ R13, R13 + ADCQ $+0, AX + MOVQ R10, R14 + MOVQ R11, R15 + MOVQ R12, DI + MOVQ R13, SI + SUBQ $-1, R14 + SBBQ p256const0<>+0(SB), R15 + SBBQ $+0, DI + SBBQ p256const1<>+0(SB), SI + SBBQ $+0, AX + CMOVQCS R10, R14 + CMOVQCS R11, R15 + CMOVQCS R12, DI + CMOVQCS R13, SI + MOVQ R14, R10 + MOVQ R15, R11 + MOVQ DI, R12 + MOVQ SI, R13 + CALL p256SqrInternal(SB) + MOVQ R10, 96(SP) + MOVQ R11, 104(SP) + MOVQ R12, 112(SP) + MOVQ R13, 120(SP) + CALL p256SqrInternal(SB) - LDacc (m) - CALL p256SqrInternal(SB) - LDt (tmp) - CALL p256SubInternal(SB) + // Divide by 2 + XORQ AX, AX + MOVQ R10, R14 + MOVQ R11, R15 + MOVQ R12, DI + MOVQ R13, SI + ADDQ $-1, R10 + ADCQ p256const0<>+0(SB), R11 + ADCQ $0x00, R12 + ADCQ p256const1<>+0(SB), R13 + ADCQ $0x00, AX + TESTQ $0x00000001, R14 + CMOVQEQ R14, R10 + CMOVQEQ R15, R11 + CMOVQEQ DI, R12 + CMOVQEQ SI, R13 + ANDQ R14, AX + SHRQ $0x01, R11, R10 + SHRQ $0x01, R12, R11 + SHRQ $0x01, R13, R12 + SHRQ $0x01, AX, R13 + MOVQ R10, 32(SP) + MOVQ R11, 40(SP) + MOVQ R12, 48(SP) + MOVQ R13, 56(SP) + + // ///////////////////////// + MOVQ (SP), R10 + MOVQ 8(SP), R11 + MOVQ 16(SP), R12 + MOVQ 24(SP), R13 + MOVQ 96(SP), R14 + MOVQ 104(SP), R15 + MOVQ 112(SP), DI + MOVQ 120(SP), SI + CALL p256MulInternal(SB) + MOVQ R10, 96(SP) + MOVQ R11, 104(SP) + MOVQ R12, 112(SP) + MOVQ R13, 120(SP) + XORQ AX, AX + ADDQ R10, R10 + ADCQ R11, R11 + ADCQ R12, R12 + ADCQ R13, R13 + ADCQ $+0, AX + MOVQ R10, R14 + MOVQ R11, R15 + MOVQ R12, DI + MOVQ R13, SI + SUBQ $-1, R14 + SBBQ p256const0<>+0(SB), R15 + SBBQ $+0, DI + SBBQ p256const1<>+0(SB), SI + SBBQ $+0, AX + CMOVQCS R10, R14 + CMOVQCS R11, R15 + CMOVQCS R12, DI + CMOVQCS R13, SI + MOVQ R14, 192(SP) + MOVQ R15, 200(SP) + MOVQ DI, 208(SP) + MOVQ SI, 216(SP) + MOVQ 128(SP), R10 + MOVQ 136(SP), R11 + MOVQ 144(SP), R12 + MOVQ 152(SP), R13 + 
CALL p256SqrInternal(SB) + MOVQ 192(SP), R14 + MOVQ 200(SP), R15 + MOVQ 208(SP), DI + MOVQ 216(SP), SI + CALL p256SubInternal(SB) + MOVQ 224(SP), AX - MOVQ rptr, AX // Store x - MOVQ acc4, (16*0 + 8*0)(AX) - MOVQ acc5, (16*0 + 8*1)(AX) - MOVQ acc6, (16*0 + 8*2)(AX) - MOVQ acc7, (16*0 + 8*3)(AX) - - acc2t - LDacc (s) + MOVQ R10, (AX) + MOVQ R11, 8(AX) + MOVQ R12, 16(AX) + MOVQ R13, 24(AX) + MOVQ R10, R14 + MOVQ R11, R15 + MOVQ R12, DI + MOVQ R13, SI + MOVQ 96(SP), R10 + MOVQ 104(SP), R11 + MOVQ 112(SP), R12 + MOVQ 120(SP), R13 CALL p256SubInternal(SB) - - LDt (m) + MOVQ 128(SP), R14 + MOVQ 136(SP), R15 + MOVQ 144(SP), DI + MOVQ 152(SP), SI CALL p256MulInternal(SB) - - LDt (y) + MOVQ 32(SP), R14 + MOVQ 40(SP), R15 + MOVQ 48(SP), DI + MOVQ 56(SP), SI CALL p256SubInternal(SB) - MOVQ rptr, AX + MOVQ 224(SP), AX + // Store y - MOVQ acc4, (16*2 + 8*0)(AX) - MOVQ acc5, (16*2 + 8*1)(AX) - MOVQ acc6, (16*2 + 8*2)(AX) - MOVQ acc7, (16*2 + 8*3)(AX) - /////////////////////// - MOVQ $0, rptr + MOVQ R10, 32(AX) + MOVQ R11, 40(AX) + MOVQ R12, 48(AX) + MOVQ R13, 56(AX) + // /////////////////////// + MOVQ $0x00000000, 224(SP) RET -/* ---------------------------------------*/ diff --git a/src/go/types/stdlib_test.go b/src/go/types/stdlib_test.go index 549eeba8f317b..4dd33a863e8eb 100644 --- a/src/go/types/stdlib_test.go +++ b/src/go/types/stdlib_test.go @@ -361,6 +361,7 @@ var excluded = map[string]bool{ "crypto/aes/_asm/standard": true, "crypto/internal/bigmod/_asm": true, "crypto/internal/edwards25519/field/_asm": true, + "crypto/internal/nistec/_asm": true, "crypto/md5/_asm": true, "crypto/sha1/_asm": true, "crypto/sha256/_asm": true,
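
For readers unfamiliar with Avo, the following stand-alone sketch (not part of this CL; the function name, file layout, and scratch-register choices are hypothetical) illustrates how the "speculatively subtract p256, then CMOV the original value back on borrow" pattern that recurs throughout the generated assembly above is written as plain Go using Avo's build, operand, and reg packages. To stay short it keeps the prime's non-trivial limbs in registers and omits the carry limb that the real generator threads through BP/AX, so it demonstrates the technique rather than reproducing the CL's code:

// p256_reduce_sketch.go
//
// Hypothetical, stand-alone Avo example (not part of this CL).
package main

import (
	. "github.com/mmcloughlin/avo/build"
	. "github.com/mmcloughlin/avo/operand"
	. "github.com/mmcloughlin/avo/reg"
)

func main() {
	TEXT("p256CondSubSketch", NOSPLIT, "func()")
	Doc("p256CondSubSketch conditionally subtracts p256 from the 256-bit value held in R10..R13 (sketch only).")

	// p256 = 2^256 - 2^224 + 2^192 + 2^96 - 1. Limb 0 is all ones and limb 2
	// is zero, so they are folded into the immediates below; limbs 1 and 3 are
	// loaded into scratch registers here, whereas the real generator reads
	// them from the p256const0<> / p256const1<> RODATA symbols.
	MOVQ(Imm(0x00000000ffffffff), RBX)
	MOVQ(Imm(0xffffffff00000001), RCX)

	Comment("Copy the accumulator, then speculatively subtract p256")
	MOVQ(R10, R14)
	MOVQ(R11, R15)
	MOVQ(R12, RDI)
	MOVQ(R13, RSI)
	SUBQ(I32(-1), R14)
	SBBQ(RBX, R15)
	SBBQ(Imm(0), RDI)
	SBBQ(RCX, RSI)

	Comment("If the subtraction borrowed, the input was already < p256: keep it")
	CMOVQCS(R10, R14)
	CMOVQCS(R11, R15)
	CMOVQCS(R12, RDI)
	CMOVQCS(R13, RSI)
	RET()

	Generate()
}

Running such a file with the usual avo flags (for example `go run . -out sketch.s -pkg nistec`, mirroring the go:generate line used by the real generator) emits a TEXT block whose SUBQ/SBBQ/CMOVQCS body matches the reduction sequences visible in the diff above, branch-free and therefore constant time; writing the port against the same physical registers is what keeps the semantic diff against the hand-written assembly minimal.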