forked from aklomp/base64
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbase64_neon64.c
145 lines (123 loc) · 4.97 KB
/
base64_neon64.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
/*
* ARM NEON-accelerated base64 codec (arm64 version).
*
* Note there is no separate decoder for ARM64 because there would be no performance difference.
*/
#include "base64.h"
#if defined(__ARM_NEON) && defined(__LP64__)
#include <arm_neon.h>
/*
 * Encode a chunk of a base64 stream using ARM64 NEON instructions.
 *
 * state   Streaming state. state->bytes (0, 1 or 2 input bytes pending from a
 *         previous call) and state->carry (their leftover bits) are read on
 *         entry and written back on exit. The encoding tables are also read
 *         from this struct.
 * src     Input bytes.
 * srclen  Number of input bytes available at src.
 * out     Output buffer. It is assumed to be large enough; no bounds check is
 *         done here. Roughly 4/3 of srclen plus a few bytes is needed.
 * outlen  Receives the number of bytes written to out.
 *
 * No '=' padding is written by this function; a pending partial group is
 * only recorded in the state.
 */
void
base64_stream_encode_neon64 (struct base64_state *state, const char *const src, size_t srclen, char *const out, size_t *const outlen)
{
	/*
	 * Byte indices for vqtbl1q_u8. Each group of three input bytes becomes
	 * one 32-bit big-endian word with the third byte duplicated in the
	 * lowest position. The duplicate already sits at its final destination,
	 * so it can be included later by masking alone, without shifting.
	 * The workset must be big-endian, otherwise the shifted bits do not
	 * carry over properly among adjacent bytes.
	 */
	static const uint8_t shuffle_idx[16] = {
		 2,  2,  1,  0,
		 5,  5,  4,  3,
		 8,  8,  7,  6,
		11, 11, 10,  9
	};

	const unsigned char *c = (const unsigned char *) src;
	char *o = out;

	/* Work on local copies and write the results back once at the end: */
	size_t outl = 0;
	struct base64_state st;

	/*
	 * Store the entire encoding table into 4 128-bit vectors; we copy
	 * from a transposed version of the table to match the de-interleaving
	 * that vld4q_u8 performs.
	 */
	const uint8x16x4_t venc4 = vld4q_u8((const unsigned char *) state->base64_table_enc_T);
	const uint8x16_t shuffle = vld1q_u8(shuffle_idx);

	/* One 6-bit mask per output byte position inside a 32-bit word: */
	const uint32x4_t mask_byte0 = vdupq_n_u32(0x3F000000);
	const uint32x4_t mask_byte1 = vdupq_n_u32(0x003F0000);
	const uint32x4_t mask_byte2 = vdupq_n_u32(0x00003F00);
	const uint32x4_t mask_byte3 = vdupq_n_u32(0x0000003F);

	st.bytes = state->bytes;
	st.carry = state->carry;

	/* Turn three bytes into four 6-bit numbers: */
	/* in[0] = 00111111 */
	/* in[1] = 00112222 */
	/* in[2] = 00222233 */
	/* in[3] = 00333333 */

	/*
	 * Duff's device, a for() loop inside a switch() statement. Legal!
	 * The switch jumps into the middle of the loop body so that a group
	 * left unfinished by the previous call is completed first.
	 */
	switch (st.bytes)
	{
		for (;;)
		{
		case 0:
			/*
			 * Vector path: each iteration loads 16 bytes, consumes
			 * the first 12 and stores 16 output bytes. Requiring 16
			 * remaining bytes keeps the load inside the input.
			 */
			while (srclen >= 16)
			{
				uint8x16_t in, idx;
				uint32x4_t words, res;

				/* Load string: */
				in = vld1q_u8(c);

				/* Reorder to 32-bit big-endian words: */
				words = vreinterpretq_u32_u8(vqtbl1q_u8(in, shuffle));

				/* Shift bits by 2, mask in only the first byte: */
				res = vandq_u32(vshrq_n_u32(words, 2), mask_byte0);

				/* Shift bits by 4, mask in only the second byte: */
				res = vorrq_u32(res, vandq_u32(vshrq_n_u32(words, 4), mask_byte1));

				/* Shift bits by 6, mask in only the third byte: */
				res = vorrq_u32(res, vandq_u32(vshrq_n_u32(words, 6), mask_byte2));

				/* No shift necessary for the fourth byte because we
				 * duplicated the third byte to this position; just mask: */
				res = vorrq_u32(res, vandq_u32(words, mask_byte3));

				/* Reorder to 32-bit little-endian (reverse the bytes
				 * inside every 32-bit word): */
				idx = vrev32q_u8(vreinterpretq_u8_u32(res));

				/* ARM64 allows lookup in a 64 byte table: translate
				 * each 6-bit index and store the 16 characters. */
				vst1q_u8((uint8_t *) o, vqtbl4q_u8(venc4, idx));

				c += 12;	/* 3 * 4 bytes of input  */
				o += 16;	/* 4 * 4 bytes of output */
				outl += 16;
				srclen -= 12;
			}
			/* Scalar path, first byte of a group: */
			if (srclen-- == 0) {
				break;
			}
			*o++ = state->base64_table_enc[*c >> 2];
			st.carry = (*c++ << 4) & 0x30;
			st.bytes++;
			outl += 1;
			/* fallthrough */

		case 1:
			/* Second byte of a group: */
			if (srclen-- == 0) {
				break;
			}
			*o++ = state->base64_table_enc[st.carry | (*c >> 4)];
			st.carry = (*c++ << 2) & 0x3C;
			st.bytes++;
			outl += 1;
			/* fallthrough */

		case 2:
			/* Third byte completes the group and yields two characters: */
			if (srclen-- == 0) {
				break;
			}
			*o++ = state->base64_table_enc[st.carry | (*c >> 6)];
			*o++ = state->base64_table_enc[*c++ & 0x3F];
			st.bytes = 0;
			outl += 2;
		}
	}
	state->bytes = st.bytes;
	state->carry = st.carry;
	*outlen = outl;
}
#else /* defined(__ARM_NEON) && defined(__LP64__) */
/*
 * Stub used when the file is not compiled for ARM64 with NEON.
 *
 * It performs no encoding and writes nothing to out. It reports zero bytes
 * written through outlen so that a caller never reads an indeterminate
 * length. All other arguments are ignored.
 */
void
base64_stream_encode_neon64 (struct base64_state *state, const char *const src, size_t srclen, char *const out, size_t *const outlen)
{
	(void) state;
	(void) src;
	(void) srclen;
	(void) out;

	/* Tolerate a null outlen: this stub has never dereferenced it. */
	if (outlen) {
		*outlen = 0;
	}
}
#endif /* defined(__ARM_NEON) && defined(__LP64__) */