Commit d9b6a72

crypto/aes: speedup CTR mode on AMD64 and ARM64
The implementation runs up to 8 AES instructions in different registers one after another in ASM code. Because the CPU has instruction pipelining and the instructions do not depend on each other, they can run in parallel with this layout of the code. This results in a significant speedup compared to the regular implementation, in which blocks are processed in the same registers, so the AES instructions cannot run in parallel. GCM mode already uses this approach.

The type implementing ctrAble has most of its code in the XORKeyStreamAt() method, which takes an additional argument, offset. This makes it usable in a stateless way and allows jumping to any location in the stream.

AES CTR benchmark delta:

$ go test crypto/cipher -bench 'BenchmarkAESCTR*'

AMD64. Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz

name                 old time/op  new time/op  delta
BenchmarkAESCTR1K-2  1259ns       266.9ns      -78.8%
BenchmarkAESCTR8K-2  9859ns       1953ns       -80.1%

ARM64. ARM Neoverse-N1 (AWS EC2 t4g.small instance)

name                 old time/op  new time/op  delta
BenchmarkAESCTR1K-2  1098ns       481.1ns      -56.2%
BenchmarkAESCTR8K-2  8447ns       3452ns       -59.1%

Original issue: #20967

Investigation and initial implementation: https://github.com/mmcloughlin/aesnix/

Full implementation in external repo: https://github.com/starius/aesctrat
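For illustration, a minimal caller-side sketch of how the stateless XORKeyStreamAt could be reached through a type assertion; the ctrAt interface name and the offset value below are hypothetical, and cipher.Stream itself only exposes XORKeyStream:

package main

import (
	"crypto/aes"
	"crypto/cipher"
	"fmt"
)

// ctrAt is a hypothetical caller-side interface used only to reach the
// optional XORKeyStreamAt method through a type assertion.
type ctrAt interface {
	XORKeyStreamAt(dst, src []byte, offset uint64)
}

func main() {
	key := make([]byte, 16) // demo key; error handling omitted for brevity
	iv := make([]byte, 16)
	block, _ := aes.NewCipher(key)
	stream := cipher.NewCTR(block, iv)

	buf := make([]byte, 64)
	if s, ok := stream.(ctrAt); ok {
		// Jump straight to byte offset 1<<20 of the keystream without
		// advancing the stream's internal offset.
		s.XORKeyStreamAt(buf, buf, 1<<20)
	} else {
		// Fallback: only sequential access through XORKeyStream.
		stream.XORKeyStream(buf, buf)
	}
	fmt.Printf("%x\n", buf[:8])
}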
1 parent be0b2a3 commit d9b6a72

5 files changed: +1687 -0 lines

src/crypto/aes/ctr_multiblock.go (+130 lines)
@@ -0,0 +1,130 @@
// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build amd64 || arm64

package aes

import (
	"crypto/cipher"
	"crypto/internal/subtle"
	"fmt"
)

// defined in ctr_multiblock_*.s

//go:generate sh -c "PYTHONIOENCODING=utf8 python ctr_multiblock_amd64_gen.py 1,2,4,8 > ctr_multiblock_amd64.s"
//go:generate sh -c "PYTHONIOENCODING=utf8 python ctr_multiblock_arm64_gen.py 1,2,4,8 > ctr_multiblock_arm64.s"

//go:noescape
func rev16Asm(iv *byte)

//go:noescape
func ctrBlocks1Asm(nr int, xk *uint32, dst, src, ivRev *byte, blockIndex uint64)

//go:noescape
func ctrBlocks2Asm(nr int, xk *uint32, dst, src, ivRev *byte, blockIndex uint64)

//go:noescape
func ctrBlocks4Asm(nr int, xk *uint32, dst, src, ivRev *byte, blockIndex uint64)

//go:noescape
func ctrBlocks8Asm(nr int, xk *uint32, dst, src, ivRev *byte, blockIndex uint64)

type aesCtrWithIV struct {
	enc    []uint32
	rounds int
	ivRev  [BlockSize]byte
	offset uint64
}

// NewCTR implements crypto/cipher.ctrAble so that crypto/cipher.NewCTR
// will use the optimised implementation in this file when possible.
func (c *aesCipherAsm) NewCTR(iv []byte) cipher.Stream {
	if len(iv) != BlockSize {
		panic(fmt.Sprintf("bad IV length: %d", len(iv)))
	}

	// Reverse IV once, because it is needed in reversed form
	// in all subsequent ASM calls.
	var ivRev [BlockSize]byte
	copy(ivRev[:], iv)
	rev16Asm(&ivRev[0])

	return &aesCtrWithIV{
		enc:    c.enc,
		rounds: len(c.enc)/4 - 1,
		ivRev:  ivRev,
		offset: 0,
	}
}

func (c *aesCtrWithIV) XORKeyStream(dst, src []byte) {
	c.XORKeyStreamAt(dst, src, c.offset)
	c.offset += uint64(len(src))
}

func (c *aesCtrWithIV) XORKeyStreamAt(dst, src []byte, offset uint64) {
	if len(dst) < len(src) {
		panic("len(dst) < len(src)")
	}
	dst = dst[:len(src)]

	if subtle.InexactOverlap(dst, src) {
		panic("crypto/aes: invalid buffer overlap")
	}

	offsetMod16 := offset % BlockSize

	if offsetMod16 != 0 {
		// We have a partial block in the beginning.
		plaintext := make([]byte, BlockSize)
		copy(plaintext[offsetMod16:BlockSize], src)
		ciphertext := make([]byte, BlockSize)
		ctrBlocks1Asm(c.rounds, &c.enc[0], &ciphertext[0], &plaintext[0], &c.ivRev[0], offset/BlockSize)
		progress := BlockSize - offsetMod16
		if progress > uint64(len(src)) {
			progress = uint64(len(src))
		}
		copy(dst[:progress], ciphertext[offsetMod16:BlockSize])
		src = src[progress:]
		dst = dst[progress:]
		offset += progress
	}

	for len(src) >= 8*BlockSize {
		ctrBlocks8Asm(c.rounds, &c.enc[0], &dst[0], &src[0], &c.ivRev[0], offset/BlockSize)
		src = src[8*BlockSize:]
		dst = dst[8*BlockSize:]
		offset += 8 * BlockSize
	}
	// The remaining 4-, 2-, and 1-block tails can each occur at most once, so use if, not for.
	if len(src) >= 4*BlockSize {
		ctrBlocks4Asm(c.rounds, &c.enc[0], &dst[0], &src[0], &c.ivRev[0], offset/BlockSize)
		src = src[4*BlockSize:]
		dst = dst[4*BlockSize:]
		offset += 4 * BlockSize
	}
	if len(src) >= 2*BlockSize {
		ctrBlocks2Asm(c.rounds, &c.enc[0], &dst[0], &src[0], &c.ivRev[0], offset/BlockSize)
		src = src[2*BlockSize:]
		dst = dst[2*BlockSize:]
		offset += 2 * BlockSize
	}
	if len(src) >= 1*BlockSize {
		ctrBlocks1Asm(c.rounds, &c.enc[0], &dst[0], &src[0], &c.ivRev[0], offset/BlockSize)
		src = src[1*BlockSize:]
		dst = dst[1*BlockSize:]
		offset += 1 * BlockSize
	}

	if len(src) != 0 {
		// We have a partial block in the end.
		plaintext := make([]byte, BlockSize)
		copy(plaintext, src)
		ciphertext := make([]byte, BlockSize)
		ctrBlocks1Asm(c.rounds, &c.enc[0], &ciphertext[0], &plaintext[0], &c.ivRev[0], offset/BlockSize)
		copy(dst, ciphertext)
	}
}
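For reference, a pure-Go model of what a single ctrBlocks1Asm call is assumed to compute: dst = src XOR E_K(IV + blockIndex), with the 16-byte IV treated as a big-endian 128-bit counter. The counter arithmetic and the ctrBlockRef helper below are illustrative assumptions based on standard CTR semantics, not code from this change:

package main

import (
	"crypto/aes"
	"crypto/cipher"
	"encoding/binary"
	"fmt"
)

// ctrBlockRef models one keystream block at position blockIndex:
// dst = src XOR E_K(counter), where counter = IV + blockIndex and the
// IV is read as a big-endian 128-bit integer (an assumption, not the ASM).
func ctrBlockRef(b cipher.Block, iv []byte, blockIndex uint64, dst, src []byte) {
	var counter [16]byte
	copy(counter[:], iv)
	hi := binary.BigEndian.Uint64(counter[:8])
	lo := binary.BigEndian.Uint64(counter[8:])
	sum := lo + blockIndex
	if sum < lo { // overflow of the low half carries into the high half
		hi++
	}
	binary.BigEndian.PutUint64(counter[:8], hi)
	binary.BigEndian.PutUint64(counter[8:], sum)

	var keystream [16]byte
	b.Encrypt(keystream[:], counter[:])
	for i := 0; i < 16; i++ {
		dst[i] = src[i] ^ keystream[i]
	}
}

func main() {
	key := make([]byte, 16) // demo key; error handling omitted for brevity
	iv := make([]byte, 16)
	b, _ := aes.NewCipher(key)
	src := make([]byte, 16)
	dst := make([]byte, 16)
	ctrBlockRef(b, iv, 2, dst, src) // keystream block at index 2
	fmt.Printf("%x\n", dst)
}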
