commit f8ed7d5
xplshn
·
2026-06-10 19:00:17 +0000 UTC
parent c264a7c
fix: sed, ls Signed-off-by: xplshn <anto@xplshn.com.ar>
6 files changed,
+4314,
-24
M
Makefile
+10,
-6
1@@ -264,7 +264,8 @@ PSEUDO_BIN_ALL =\
2 cmd/pseudo/whoami\
3 cmd/pseudo/xinstall\
4 cmd/pseudo/yes\
5- cmd/pseudo/base64
6+ cmd/pseudo/base64\
7+ cmd/extra/b3sum
8
9 MAKEOBJ =\
10 cmd/posix/make/defaults.o\
11@@ -431,6 +432,7 @@ BIN_whoami_1 = cmd/pseudo/whoami
12 BIN_xinstall_1 = cmd/pseudo/xinstall
13 BIN_yes_1 = cmd/pseudo/yes
14 BIN_base64_1 = cmd/pseudo/base64
15+BIN_b3sum_1 = cmd/extra/b3sum
16
17 BIN_make_tool_1 = cmd/posix/make/make
18
19@@ -596,7 +598,12 @@ PSEUDO_BIN = \
20 $(BIN_whoami_$(BUILD_PSEUDO_WHOAMI)) \
21 $(BIN_xinstall_$(BUILD_PSEUDO_XINSTALL)) \
22 $(BIN_yes_$(BUILD_PSEUDO_YES)) \
23- $(BIN_base64_$(BUILD_PSEUDO_BASE64))
24+ $(BIN_base64_$(BUILD_PSEUDO_BASE64)) \
25+ $(BIN_b3sum_$(BUILD_PSEUDO_B3SUM)) \
26+ $(BIN_ar_$(BUILD_DEV_AR)) \
27+ $(BIN_as_$(BUILD_DEV_CC)) \
28+ $(BIN_ld_$(BUILD_DEV_LD)) \
29+ $(BIN_cc_$(BUILD_DEV_CC))
30
31 MAKE_BIN = $(BIN_make_tool_$(BUILD_MAKE_MAKE))
32
33@@ -647,7 +654,7 @@ box: $(LIB)
34 scripts/mkbox
35
36 clean:
37- rm -f shared/libutf/*.o shared/libutil/*.o cmd/posix/make/*.o cmd/posix/awk/*.o cmd/posix/sh/*.o cmd/dev/ar/*.o cmd/dev/ld/*.o
38+ rm -f shared/libutf/*.o shared/libutil/*.o cmd/posix/make/*.o cmd/posix/awk/*.o cmd/posix/sh/*.o cmd/extra/*.o
39 rm -f $(POSIX_BIN_ALL) $(LINUX_BIN_ALL) $(NET_BIN_ALL) $(XSI_BIN_ALL) $(PSEUDO_BIN_ALL) $(LIB)
40 rm -f cmd/posix/make/make cmd/posix/getconf.h cmd/posix/bc.c
41 rm -f cmd/posix/awk/awk cmd/posix/awk/maketab cmd/posix/awk/awkgram.tab.c cmd/posix/awk/awkgram.tab.h cmd/posix/awk/proctab.c
42@@ -747,6 +754,3 @@ cmd/posix/sh/%.o: cmd/posix/sh/%.c
43
44 cmd/posix/sh/sh: $(SHOBJ) $(LIB)
45 $(CC) $(LDFLAGS) -o $@ $(SHOBJ) $(LIB) $(LDLIBS)
46-
47-cmd/pseudo/base64: cmd/pseudo/base64.o $(LIB)
48- $(CC) $(LDFLAGS) -o $@ cmd/pseudo/base64.o $(LIB) $(LDLIBS)
+4029,
-0
1@@ -0,0 +1,4029 @@
2+/* See LICENSE file for copyright and license details. */
3+#include "util.h"
4+#include "arg.h"
5+
6+#include <assert.h>
7+#include <immintrin.h>
8+#include <stdint.h>
9+#include <stdio.h>
10+#include <stdlib.h>
11+#include <string.h>
12+
13+#define BLAKE3_VERSION_STRING "1.5.0"
14+#define BLAKE3_KEY_LEN 32
15+#define BLAKE3_OUT_LEN 32
16+#define BLAKE3_BLOCK_LEN 64
17+#define BLAKE3_CHUNK_LEN 1024
18+#define BLAKE3_MAX_DEPTH 54
19+
20+#if defined(__x86_64__)
21+#define MAX_SIMD_DEGREE 16
22+#else
23+#define MAX_SIMD_DEGREE 1
24+#endif
25+
26+#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2)
27+
28+enum Blake3Flags {
29+ CHUNK_START = 1 << 0,
30+ CHUNK_END = 1 << 1,
31+ PARENT = 1 << 2,
32+ ROOT = 1 << 3,
33+ KEYED_HASH = 1 << 4,
34+ DERIVE_KEY_CONTEXT = 1 << 5,
35+ DERIVE_KEY_MATERIAL = 1 << 6
36+};
37+
38+struct Blake3ChunkState {
39+ uint32_t cv[8];
40+ uint64_t chunk_counter;
41+ uint8_t buf[BLAKE3_BLOCK_LEN];
42+ uint8_t buf_len;
43+ uint8_t blocks_compressed;
44+ uint8_t flags;
45+};
46+
47+struct Blake3Hasher {
48+ uint32_t key[8];
49+ struct Blake3ChunkState chunk;
50+ uint8_t cv_stack_len;
51+ uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN];
52+};
53+
54+void blake3_hasher_update(struct Blake3Hasher *self, const void *input, size_t input_len);
55+void blake3_hasher_finalize(const struct Blake3Hasher *self, uint8_t *out, size_t out_len);
56+
57+struct Output {
58+ uint32_t input_cv[8];
59+ uint64_t counter;
60+ uint8_t block[BLAKE3_BLOCK_LEN];
61+ uint8_t block_len;
62+ uint8_t flags;
63+};
64+
65+static const uint32_t IV[8] = {
66+ 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
67+ 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
68+};
69+
70+static const uint8_t MSG_SCHEDULE[7][16] = {
71+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
72+ {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8},
73+ {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1},
74+ {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6},
75+ {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4},
76+ {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7},
77+ {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13},
78+};
79+
80+static inline uint32_t
81+load32(const void *src)
82+{
83+ const uint8_t *p = (const uint8_t *)src;
84+
85+ return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) |
86+ ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24);
87+}
88+
89+static inline void
90+store32(void *dst, uint32_t w)
91+{
92+ uint8_t *p = (uint8_t *)dst;
93+
94+ p[0] = (uint8_t)(w >> 0);
95+ p[1] = (uint8_t)(w >> 8);
96+ p[2] = (uint8_t)(w >> 16);
97+ p[3] = (uint8_t)(w >> 24);
98+}
99+
100+static inline void
101+store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8])
102+{
103+ store32(&bytes_out[0 * 4], cv_words[0]);
104+ store32(&bytes_out[1 * 4], cv_words[1]);
105+ store32(&bytes_out[2 * 4], cv_words[2]);
106+ store32(&bytes_out[3 * 4], cv_words[3]);
107+ store32(&bytes_out[4 * 4], cv_words[4]);
108+ store32(&bytes_out[5 * 4], cv_words[5]);
109+ store32(&bytes_out[6 * 4], cv_words[6]);
110+ store32(&bytes_out[7 * 4], cv_words[7]);
111+}
112+
113+static inline uint32_t
114+counter_low(uint64_t counter)
115+{
116+ return (uint32_t)counter;
117+}
118+
119+static inline uint32_t
120+counter_high(uint64_t counter)
121+{
122+ return (uint32_t)(counter >> 32);
123+}
124+
125+/* forward declarations */
126+#if defined(__x86_64__)
127+void blake3_compress_in_place_sse2(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags);
128+void blake3_compress_xof_sse2(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]);
129+void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out);
130+
131+void blake3_compress_in_place_sse41(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags);
132+void blake3_compress_xof_sse41(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]);
133+void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out);
134+
135+void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out);
136+
137+void blake3_compress_in_place_avx512(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags);
138+void blake3_compress_xof_avx512(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]);
139+void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out);
140+#endif
141+
142+void blake3_compress_in_place_portable(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags);
143+void blake3_compress_xof_portable(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]);
144+void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out);
145+
146+void blake3_compress_in_place(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags);
147+void blake3_compress_xof(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]);
148+void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out);
149+
150+/* portable implementations */
151+static inline uint32_t
152+rotr32(uint32_t w, uint32_t c)
153+{
154+ return (w >> c) | (w << (32 - c));
155+}
156+
157+static inline void
158+g_portable(uint32_t *state, size_t a, size_t b, size_t c, size_t d, uint32_t x, uint32_t y)
159+{
160+ state[a] = state[a] + state[b] + x;
161+ state[d] = rotr32(state[d] ^ state[a], 16);
162+ state[c] = state[c] + state[d];
163+ state[b] = rotr32(state[b] ^ state[c], 12);
164+ state[a] = state[a] + state[b] + y;
165+ state[d] = rotr32(state[d] ^ state[a], 8);
166+ state[c] = state[c] + state[d];
167+ state[b] = rotr32(state[b] ^ state[c], 7);
168+}
169+
170+static inline void
171+round_fn_portable(uint32_t state[16], const uint32_t *msg, size_t round)
172+{
173+ const uint8_t *schedule = MSG_SCHEDULE[round];
174+
175+ g_portable(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]);
176+ g_portable(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]);
177+ g_portable(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]);
178+ g_portable(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]);
179+
180+ g_portable(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]);
181+ g_portable(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]);
182+ g_portable(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]);
183+ g_portable(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]);
184+}
185+
186+static inline void
187+compress_pre_portable(uint32_t state[16], const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
188+{
189+ uint32_t block_words[16];
190+
191+ block_words[0] = load32(block + 4 * 0);
192+ block_words[1] = load32(block + 4 * 1);
193+ block_words[2] = load32(block + 4 * 2);
194+ block_words[3] = load32(block + 4 * 3);
195+ block_words[4] = load32(block + 4 * 4);
196+ block_words[5] = load32(block + 4 * 5);
197+ block_words[6] = load32(block + 4 * 6);
198+ block_words[7] = load32(block + 4 * 7);
199+ block_words[8] = load32(block + 4 * 8);
200+ block_words[9] = load32(block + 4 * 9);
201+ block_words[10] = load32(block + 4 * 10);
202+ block_words[11] = load32(block + 4 * 11);
203+ block_words[12] = load32(block + 4 * 12);
204+ block_words[13] = load32(block + 4 * 13);
205+ block_words[14] = load32(block + 4 * 14);
206+ block_words[15] = load32(block + 4 * 15);
207+
208+ state[0] = cv[0];
209+ state[1] = cv[1];
210+ state[2] = cv[2];
211+ state[3] = cv[3];
212+ state[4] = cv[4];
213+ state[5] = cv[5];
214+ state[6] = cv[6];
215+ state[7] = cv[7];
216+ state[8] = IV[0];
217+ state[9] = IV[1];
218+ state[10] = IV[2];
219+ state[11] = IV[3];
220+ state[12] = counter_low(counter);
221+ state[13] = counter_high(counter);
222+ state[14] = (uint32_t)block_len;
223+ state[15] = (uint32_t)flags;
224+
225+ round_fn_portable(state, &block_words[0], 0);
226+ round_fn_portable(state, &block_words[0], 1);
227+ round_fn_portable(state, &block_words[0], 2);
228+ round_fn_portable(state, &block_words[0], 3);
229+ round_fn_portable(state, &block_words[0], 4);
230+ round_fn_portable(state, &block_words[0], 5);
231+ round_fn_portable(state, &block_words[0], 6);
232+}
233+
234+void
235+blake3_compress_in_place_portable(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
236+{
237+ uint32_t state[16];
238+
239+ compress_pre_portable(state, cv, block, block_len, counter, flags);
240+ cv[0] = state[0] ^ state[8];
241+ cv[1] = state[1] ^ state[9];
242+ cv[2] = state[2] ^ state[10];
243+ cv[3] = state[3] ^ state[11];
244+ cv[4] = state[4] ^ state[12];
245+ cv[5] = state[5] ^ state[13];
246+ cv[6] = state[6] ^ state[14];
247+ cv[7] = state[7] ^ state[15];
248+}
249+
250+void
251+blake3_compress_xof_portable(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64])
252+{
253+ uint32_t state[16];
254+
255+ compress_pre_portable(state, cv, block, block_len, counter, flags);
256+
257+ store32(&out[0 * 4], state[0] ^ state[8]);
258+ store32(&out[1 * 4], state[1] ^ state[9]);
259+ store32(&out[2 * 4], state[2] ^ state[10]);
260+ store32(&out[3 * 4], state[3] ^ state[11]);
261+ store32(&out[4 * 4], state[4] ^ state[12]);
262+ store32(&out[5 * 4], state[5] ^ state[13]);
263+ store32(&out[6 * 4], state[6] ^ state[14]);
264+ store32(&out[7 * 4], state[7] ^ state[15]);
265+ store32(&out[8 * 4], state[8] ^ cv[0]);
266+ store32(&out[9 * 4], state[9] ^ cv[1]);
267+ store32(&out[10 * 4], state[10] ^ cv[2]);
268+ store32(&out[11 * 4], state[11] ^ cv[3]);
269+ store32(&out[12 * 4], state[12] ^ cv[4]);
270+ store32(&out[13 * 4], state[13] ^ cv[5]);
271+ store32(&out[14 * 4], state[14] ^ cv[6]);
272+ store32(&out[15 * 4], state[15] ^ cv[7]);
273+}
274+
275+static inline void
276+hash_one_portable(const uint8_t *input, size_t blocks, const uint32_t key[8], uint64_t counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN])
277+{
278+ uint32_t cv[8];
279+ uint8_t block_flags;
280+
281+ memcpy(cv, key, BLAKE3_KEY_LEN);
282+ block_flags = flags | flags_start;
283+ while (blocks > 0) {
284+ if (blocks == 1) {
285+ block_flags |= flags_end;
286+ }
287+ blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter, block_flags);
288+ input = &input[BLAKE3_BLOCK_LEN];
289+ blocks -= 1;
290+ block_flags = flags;
291+ }
292+ store_cv_words(out, cv);
293+}
294+
295+void
296+blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out)
297+{
298+ while (num_inputs > 0) {
299+ hash_one_portable(inputs[0], blocks, key, counter, flags, flags_start, flags_end, out);
300+ if (increment_counter) {
301+ counter += 1;
302+ }
303+ inputs += 1;
304+ num_inputs -= 1;
305+ out = &out[BLAKE3_OUT_LEN];
306+ }
307+}
308+
309+/* cpu features detection */
310+enum {
311+ SSE2 = 1 << 0,
312+ SSE41 = 1 << 1,
313+ AVX2 = 1 << 2,
314+ AVX512 = 1 << 3
315+};
316+
317+static int blake3_cpu_features = 0;
318+static int blake3_cpu_detected = 0;
319+
320+#if defined(__x86_64__)
321+#include <cpuid.h>
322+
323+static void
324+blake3_cpuid(uint32_t out[4], uint32_t id, uint32_t sid)
325+{
326+ __cpuid_count(id, sid, out[0], out[1], out[2], out[3]);
327+}
328+
329+static uint64_t
330+blake3_xgetbv(void)
331+{
332+ uint32_t eax, edx;
333+
334+ __asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0));
335+ return ((uint64_t)edx << 32) | eax;
336+}
337+#endif
338+
339+static void
340+blake3_detect_cpu_features(void)
341+{
342+#if defined(__x86_64__)
343+ enum { EAX, EBX, ECX, EDX };
344+ uint32_t regs[4];
345+ uint64_t xcr0;
346+ int features = 0;
347+
348+ blake3_cpuid(regs, 1, 0);
349+ if (regs[EDX] & (1UL << 26))
350+ features |= SSE2;
351+ if (regs[ECX] & (1UL << 19))
352+ features |= SSE41;
353+ /* osxsave */
354+ if (regs[ECX] & (1UL << 27)) {
355+ blake3_cpuid(regs, 0, 0);
356+ if (regs[EAX] >= 7) {
357+ blake3_cpuid(regs, 7, 0);
358+ xcr0 = blake3_xgetbv();
359+ /* avx2 and xcr0 sse, avx */
360+ if ((regs[EBX] & (1UL << 5)) && (xcr0 & 0x06) == 0x06)
361+ features |= AVX2;
362+ /* avx512f, avx512vl and xcr0 opmask, zmm_hi256, hi16_zmm */
363+ if ((regs[EBX] & (1UL << 31 | 1UL << 16)) && (xcr0 & 0xe0) == 0xe0)
364+ features |= AVX512;
365+ }
366+ }
367+ blake3_cpu_features = features;
368+#endif
369+ blake3_cpu_detected = 1;
370+}
371+
372+#if defined(__x86_64__)
373+__attribute__((constructor))
374+static void
375+blake3_init_cpu(void)
376+{
377+ if (!blake3_cpu_detected)
378+ blake3_detect_cpu_features();
379+}
380+#endif
381+
382+#if defined(__x86_64__)
383+#pragma GCC push_options
384+#pragma GCC target("sse2")
385+#define DEGREE_SSE2 4
386+
387+#define _mm_shuffle_ps2(a, b, c) \
388+ (_mm_castps_si128( \
389+ _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))
390+
391+static inline __m128i
392+loadu_sse2(const uint8_t src[16])
393+{
394+ return _mm_loadu_si128((const __m128i *)src);
395+}
396+
397+static inline void
398+storeu_sse2(__m128i src, uint8_t dest[16])
399+{
400+ _mm_storeu_si128((__m128i *)dest, src);
401+}
402+
403+static inline __m128i
404+addv_sse2(__m128i a, __m128i b)
405+{
406+ return _mm_add_epi32(a, b);
407+}
408+
409+static inline __m128i
410+xorv_sse2(__m128i a, __m128i b)
411+{
412+ return _mm_xor_si128(a, b);
413+}
414+
415+static inline __m128i
416+set1_sse2(uint32_t x)
417+{
418+ return _mm_set1_epi32((int32_t)x);
419+}
420+
421+static inline __m128i
422+set4_sse2(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
423+{
424+ return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d);
425+}
426+
427+static inline __m128i
428+rot16_sse2(__m128i x)
429+{
430+ return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0xB1), 0xB1);
431+}
432+
433+static inline __m128i
434+rot12_sse2(__m128i x)
435+{
436+ return xorv_sse2(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12));
437+}
438+
439+static inline __m128i
440+rot8_sse2(__m128i x)
441+{
442+ return xorv_sse2(_mm_srli_epi32(x, 8), _mm_slli_epi32(x, 32 - 8));
443+}
444+
445+static inline __m128i
446+rot7_sse2(__m128i x)
447+{
448+ return xorv_sse2(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7));
449+}
450+
451+static inline void
452+g1_sse2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, __m128i m)
453+{
454+ *row0 = addv_sse2(addv_sse2(*row0, m), *row1);
455+ *row3 = xorv_sse2(*row3, *row0);
456+ *row3 = rot16_sse2(*row3);
457+ *row2 = addv_sse2(*row2, *row3);
458+ *row1 = xorv_sse2(*row1, *row2);
459+ *row1 = rot12_sse2(*row1);
460+}
461+
462+static inline void
463+g2_sse2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, __m128i m)
464+{
465+ *row0 = addv_sse2(addv_sse2(*row0, m), *row1);
466+ *row3 = xorv_sse2(*row3, *row0);
467+ *row3 = rot8_sse2(*row3);
468+ *row2 = addv_sse2(*row2, *row3);
469+ *row1 = xorv_sse2(*row1, *row2);
470+ *row1 = rot7_sse2(*row1);
471+}
472+
473+static inline void
474+diagonalize_sse2(__m128i *row0, __m128i *row2, __m128i *row3)
475+{
476+ *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3));
477+ *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
478+ *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1));
479+}
480+
481+static inline void
482+undiagonalize_sse2(__m128i *row0, __m128i *row2, __m128i *row3)
483+{
484+ *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1));
485+ *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
486+ *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
487+}
488+
489+static inline __m128i
490+blend_epi16_sse2(__m128i a, __m128i b, const int16_t imm8)
491+{
492+ const __m128i bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
493+ __m128i mask = _mm_set1_epi16(imm8);
494+
495+ mask = _mm_and_si128(mask, bits);
496+ mask = _mm_cmpeq_epi16(mask, bits);
497+ return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a));
498+}
499+
500+static inline void
501+compress_pre_sse2(__m128i rows[4], const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
502+{
503+ __m128i m0, m1, m2, m3;
504+ __m128i t0, t1, t2, t3, tt;
505+
506+ rows[0] = loadu_sse2((uint8_t *)&cv[0]);
507+ rows[1] = loadu_sse2((uint8_t *)&cv[4]);
508+ rows[2] = set4_sse2(IV[0], IV[1], IV[2], IV[3]);
509+ rows[3] = set4_sse2(counter_low(counter), counter_high(counter), (uint32_t)block_len, (uint32_t)flags);
510+
511+ m0 = loadu_sse2(&block[sizeof(__m128i) * 0]);
512+ m1 = loadu_sse2(&block[sizeof(__m128i) * 1]);
513+ m2 = loadu_sse2(&block[sizeof(__m128i) * 2]);
514+ m3 = loadu_sse2(&block[sizeof(__m128i) * 3]);
515+
516+ /* round 1 */
517+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0));
518+ g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t0);
519+ t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1));
520+ g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
521+ diagonalize_sse2(&rows[0], &rows[2], &rows[3]);
522+ t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0));
523+ t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));
524+ g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t2);
525+ t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1));
526+ t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3));
527+ g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
528+ undiagonalize_sse2(&rows[0], &rows[2], &rows[3]);
529+ m0 = t0;
530+ m1 = t1;
531+ m2 = t2;
532+ m3 = t3;
533+
534+ /* round 2 */
535+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
536+ t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
537+ g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t0);
538+ t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
539+ tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
540+ t1 = blend_epi16_sse2(tt, t1, 0xCC);
541+ g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
542+ diagonalize_sse2(&rows[0], &rows[2], &rows[3]);
543+ t2 = _mm_unpacklo_epi64(m3, m1);
544+ tt = blend_epi16_sse2(t2, m2, 0xC0);
545+ t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
546+ g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t2);
547+ t3 = _mm_unpackhi_epi32(m1, m3);
548+ tt = _mm_unpacklo_epi32(m2, t3);
549+ t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
550+ g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
551+ undiagonalize_sse2(&rows[0], &rows[2], &rows[3]);
552+ m0 = t0;
553+ m1 = t1;
554+ m2 = t2;
555+ m3 = t3;
556+
557+ /* round 3 */
558+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
559+ t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
560+ g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t0);
561+ t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
562+ tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
563+ t1 = blend_epi16_sse2(tt, t1, 0xCC);
564+ g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
565+ diagonalize_sse2(&rows[0], &rows[2], &rows[3]);
566+ t2 = _mm_unpacklo_epi64(m3, m1);
567+ tt = blend_epi16_sse2(t2, m2, 0xC0);
568+ t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
569+ g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t2);
570+ t3 = _mm_unpackhi_epi32(m1, m3);
571+ tt = _mm_unpacklo_epi32(m2, t3);
572+ t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
573+ g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
574+ undiagonalize_sse2(&rows[0], &rows[2], &rows[3]);
575+ m0 = t0;
576+ m1 = t1;
577+ m2 = t2;
578+ m3 = t3;
579+
580+ /* round 4 */
581+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
582+ t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
583+ g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t0);
584+ t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
585+ tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
586+ t1 = blend_epi16_sse2(tt, t1, 0xCC);
587+ g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
588+ diagonalize_sse2(&rows[0], &rows[2], &rows[3]);
589+ t2 = _mm_unpacklo_epi64(m3, m1);
590+ tt = blend_epi16_sse2(t2, m2, 0xC0);
591+ t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
592+ g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t2);
593+ t3 = _mm_unpackhi_epi32(m1, m3);
594+ tt = _mm_unpacklo_epi32(m2, t3);
595+ t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
596+ g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
597+ undiagonalize_sse2(&rows[0], &rows[2], &rows[3]);
598+ m0 = t0;
599+ m1 = t1;
600+ m2 = t2;
601+ m3 = t3;
602+
603+ /* round 5 */
604+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
605+ t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
606+ g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t0);
607+ t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
608+ tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
609+ t1 = blend_epi16_sse2(tt, t1, 0xCC);
610+ g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
611+ diagonalize_sse2(&rows[0], &rows[2], &rows[3]);
612+ t2 = _mm_unpacklo_epi64(m3, m1);
613+ tt = blend_epi16_sse2(t2, m2, 0xC0);
614+ t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
615+ g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t2);
616+ t3 = _mm_unpackhi_epi32(m1, m3);
617+ tt = _mm_unpacklo_epi32(m2, t3);
618+ t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
619+ g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
620+ undiagonalize_sse2(&rows[0], &rows[2], &rows[3]);
621+ m0 = t0;
622+ m1 = t1;
623+ m2 = t2;
624+ m3 = t3;
625+
626+ /* round 6 */
627+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
628+ t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
629+ g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t0);
630+ t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
631+ tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
632+ t1 = blend_epi16_sse2(tt, t1, 0xCC);
633+ g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
634+ diagonalize_sse2(&rows[0], &rows[2], &rows[3]);
635+ t2 = _mm_unpacklo_epi64(m3, m1);
636+ tt = blend_epi16_sse2(t2, m2, 0xC0);
637+ t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
638+ g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t2);
639+ t3 = _mm_unpackhi_epi32(m1, m3);
640+ tt = _mm_unpacklo_epi32(m2, t3);
641+ t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
642+ g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
643+ undiagonalize_sse2(&rows[0], &rows[2], &rows[3]);
644+ m0 = t0;
645+ m1 = t1;
646+ m2 = t2;
647+ m3 = t3;
648+
649+ /* round 7 */
650+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
651+ t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
652+ g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t0);
653+ t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
654+ tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
655+ t1 = blend_epi16_sse2(tt, t1, 0xCC);
656+ g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
657+ diagonalize_sse2(&rows[0], &rows[2], &rows[3]);
658+ t2 = _mm_unpacklo_epi64(m3, m1);
659+ tt = blend_epi16_sse2(t2, m2, 0xC0);
660+ t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
661+ g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t2);
662+ t3 = _mm_unpackhi_epi32(m1, m3);
663+ tt = _mm_unpacklo_epi32(m2, t3);
664+ t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
665+ g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
666+ undiagonalize_sse2(&rows[0], &rows[2], &rows[3]);
667+}
668+
669+void
670+blake3_compress_in_place_sse2(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
671+{
672+ __m128i rows[4];
673+
674+ compress_pre_sse2(rows, cv, block, block_len, counter, flags);
675+ storeu_sse2(xorv_sse2(rows[0], rows[2]), (uint8_t *)&cv[0]);
676+ storeu_sse2(xorv_sse2(rows[1], rows[3]), (uint8_t *)&cv[4]);
677+}
678+
679+void
680+blake3_compress_xof_sse2(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out_buf[64])
681+{
682+ __m128i rows[4];
683+
684+ compress_pre_sse2(rows, cv, block, block_len, counter, flags);
685+ storeu_sse2(xorv_sse2(rows[0], rows[2]), &out_buf[0]);
686+ storeu_sse2(xorv_sse2(rows[1], rows[3]), &out_buf[16]);
687+ storeu_sse2(xorv_sse2(rows[2], loadu_sse2((uint8_t *)&cv[0])), &out_buf[32]);
688+ storeu_sse2(xorv_sse2(rows[3], loadu_sse2((uint8_t *)&cv[4])), &out_buf[48]);
689+}
690+
691+static inline void
692+round_fn_sse2(__m128i v[16], __m128i m[16], size_t r)
693+{
694+ v[0] = addv_sse2(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
695+ v[1] = addv_sse2(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
696+ v[2] = addv_sse2(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
697+ v[3] = addv_sse2(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
698+ v[0] = addv_sse2(v[0], v[4]);
699+ v[1] = addv_sse2(v[1], v[5]);
700+ v[2] = addv_sse2(v[2], v[6]);
701+ v[3] = addv_sse2(v[3], v[7]);
702+ v[12] = xorv_sse2(v[12], v[0]);
703+ v[13] = xorv_sse2(v[13], v[1]);
704+ v[14] = xorv_sse2(v[14], v[2]);
705+ v[15] = xorv_sse2(v[15], v[3]);
706+ v[12] = rot16_sse2(v[12]);
707+ v[13] = rot16_sse2(v[13]);
708+ v[14] = rot16_sse2(v[14]);
709+ v[15] = rot16_sse2(v[15]);
710+ v[8] = addv_sse2(v[8], v[12]);
711+ v[9] = addv_sse2(v[9], v[13]);
712+ v[10] = addv_sse2(v[10], v[14]);
713+ v[11] = addv_sse2(v[11], v[15]);
714+ v[4] = xorv_sse2(v[4], v[8]);
715+ v[5] = xorv_sse2(v[5], v[9]);
716+ v[6] = xorv_sse2(v[6], v[10]);
717+ v[7] = xorv_sse2(v[7], v[11]);
718+ v[4] = rot12_sse2(v[4]);
719+ v[5] = rot12_sse2(v[5]);
720+ v[6] = rot12_sse2(v[6]);
721+ v[7] = rot12_sse2(v[7]);
722+ v[0] = addv_sse2(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
723+ v[1] = addv_sse2(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
724+ v[2] = addv_sse2(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
725+ v[3] = addv_sse2(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
726+ v[0] = addv_sse2(v[0], v[4]);
727+ v[1] = addv_sse2(v[1], v[5]);
728+ v[2] = addv_sse2(v[2], v[6]);
729+ v[3] = addv_sse2(v[3], v[7]);
730+ v[12] = xorv_sse2(v[12], v[0]);
731+ v[13] = xorv_sse2(v[13], v[1]);
732+ v[14] = xorv_sse2(v[14], v[2]);
733+ v[15] = xorv_sse2(v[15], v[3]);
734+ v[12] = rot8_sse2(v[12]);
735+ v[13] = rot8_sse2(v[13]);
736+ v[14] = rot8_sse2(v[14]);
737+ v[15] = rot8_sse2(v[15]);
738+ v[8] = addv_sse2(v[8], v[12]);
739+ v[9] = addv_sse2(v[9], v[13]);
740+ v[10] = addv_sse2(v[10], v[14]);
741+ v[11] = addv_sse2(v[11], v[15]);
742+ v[4] = xorv_sse2(v[4], v[8]);
743+ v[5] = xorv_sse2(v[5], v[9]);
744+ v[6] = xorv_sse2(v[6], v[10]);
745+ v[7] = xorv_sse2(v[7], v[11]);
746+ v[4] = rot7_sse2(v[4]);
747+ v[5] = rot7_sse2(v[5]);
748+ v[6] = rot7_sse2(v[6]);
749+ v[7] = rot7_sse2(v[7]);
750+
751+ v[0] = addv_sse2(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
752+ v[1] = addv_sse2(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
753+ v[2] = addv_sse2(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
754+ v[3] = addv_sse2(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
755+ v[0] = addv_sse2(v[0], v[5]);
756+ v[1] = addv_sse2(v[1], v[6]);
757+ v[2] = addv_sse2(v[2], v[7]);
758+ v[3] = addv_sse2(v[3], v[4]);
759+ v[15] = xorv_sse2(v[15], v[0]);
760+ v[12] = xorv_sse2(v[12], v[1]);
761+ v[13] = xorv_sse2(v[13], v[2]);
762+ v[14] = xorv_sse2(v[14], v[3]);
763+ v[15] = rot16_sse2(v[15]);
764+ v[12] = rot16_sse2(v[12]);
765+ v[13] = rot16_sse2(v[13]);
766+ v[14] = rot16_sse2(v[14]);
767+ v[10] = addv_sse2(v[10], v[15]);
768+ v[11] = addv_sse2(v[11], v[12]);
769+ v[8] = addv_sse2(v[8], v[13]);
770+ v[9] = addv_sse2(v[9], v[14]);
771+ v[5] = xorv_sse2(v[5], v[10]);
772+ v[6] = xorv_sse2(v[6], v[11]);
773+ v[7] = xorv_sse2(v[7], v[8]);
774+ v[4] = xorv_sse2(v[4], v[9]);
775+ v[5] = rot12_sse2(v[5]);
776+ v[6] = rot12_sse2(v[6]);
777+ v[7] = rot12_sse2(v[7]);
778+ v[4] = rot12_sse2(v[4]);
779+ v[0] = addv_sse2(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
780+ v[1] = addv_sse2(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
781+ v[2] = addv_sse2(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
782+ v[3] = addv_sse2(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
783+ v[0] = addv_sse2(v[0], v[5]);
784+ v[1] = addv_sse2(v[1], v[6]);
785+ v[2] = addv_sse2(v[2], v[7]);
786+ v[3] = addv_sse2(v[3], v[4]);
787+ v[15] = xorv_sse2(v[15], v[0]);
788+ v[12] = xorv_sse2(v[12], v[1]);
789+ v[13] = xorv_sse2(v[13], v[2]);
790+ v[14] = xorv_sse2(v[14], v[3]);
791+ v[15] = rot8_sse2(v[15]);
792+ v[12] = rot8_sse2(v[12]);
793+ v[13] = rot8_sse2(v[13]);
794+ v[14] = rot8_sse2(v[14]);
795+ v[10] = addv_sse2(v[10], v[15]);
796+ v[11] = addv_sse2(v[11], v[12]);
797+ v[8] = addv_sse2(v[8], v[13]);
798+ v[9] = addv_sse2(v[9], v[14]);
799+ v[5] = xorv_sse2(v[5], v[10]);
800+ v[6] = xorv_sse2(v[6], v[11]);
801+ v[7] = xorv_sse2(v[7], v[8]);
802+ v[4] = xorv_sse2(v[4], v[9]);
803+ v[5] = rot7_sse2(v[5]);
804+ v[6] = rot7_sse2(v[6]);
805+ v[7] = rot7_sse2(v[7]);
806+ v[4] = rot7_sse2(v[4]);
807+}
808+
809+static inline void
810+transpose_vecs_sse2(__m128i vecs[DEGREE_SSE2])
811+{
812+ __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
813+ __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
814+ __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
815+ __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
816+
817+ __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
818+ __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
819+ __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
820+ __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);
821+
822+ vecs[0] = abcd_0;
823+ vecs[1] = abcd_1;
824+ vecs[2] = abcd_2;
825+ vecs[3] = abcd_3;
826+}
827+
828+static inline void
829+transpose_msg_vecs_sse2(const uint8_t *const *inputs, size_t block_offset, __m128i out_msg[16])
830+{
831+ size_t i;
832+
833+ out_msg[0] = loadu_sse2(&inputs[0][block_offset + 0 * sizeof(__m128i)]);
834+ out_msg[1] = loadu_sse2(&inputs[1][block_offset + 0 * sizeof(__m128i)]);
835+ out_msg[2] = loadu_sse2(&inputs[2][block_offset + 0 * sizeof(__m128i)]);
836+ out_msg[3] = loadu_sse2(&inputs[3][block_offset + 0 * sizeof(__m128i)]);
837+ out_msg[4] = loadu_sse2(&inputs[0][block_offset + 1 * sizeof(__m128i)]);
838+ out_msg[5] = loadu_sse2(&inputs[1][block_offset + 1 * sizeof(__m128i)]);
839+ out_msg[6] = loadu_sse2(&inputs[2][block_offset + 1 * sizeof(__m128i)]);
840+ out_msg[7] = loadu_sse2(&inputs[3][block_offset + 1 * sizeof(__m128i)]);
841+ out_msg[8] = loadu_sse2(&inputs[0][block_offset + 2 * sizeof(__m128i)]);
842+ out_msg[9] = loadu_sse2(&inputs[1][block_offset + 2 * sizeof(__m128i)]);
843+ out_msg[10] = loadu_sse2(&inputs[2][block_offset + 2 * sizeof(__m128i)]);
844+ out_msg[11] = loadu_sse2(&inputs[3][block_offset + 2 * sizeof(__m128i)]);
845+ out_msg[12] = loadu_sse2(&inputs[0][block_offset + 3 * sizeof(__m128i)]);
846+ out_msg[13] = loadu_sse2(&inputs[1][block_offset + 3 * sizeof(__m128i)]);
847+ out_msg[14] = loadu_sse2(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
848+ out_msg[15] = loadu_sse2(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
849+
850+ for (i = 0; i < 4; i++) {
851+ _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
852+ }
853+ transpose_vecs_sse2(&out_msg[0]);
854+ transpose_vecs_sse2(&out_msg[4]);
855+ transpose_vecs_sse2(&out_msg[8]);
856+ transpose_vecs_sse2(&out_msg[12]);
857+}
858+
859+static inline void
860+load_counters_sse2(uint64_t counter, int increment_counter, __m128i *out_lo, __m128i *out_hi)
861+{
862+ const __m128i mask = _mm_set1_epi32(-increment_counter);
863+ const __m128i add0 = _mm_set_epi32(3, 2, 1, 0);
864+ const __m128i add1 = _mm_and_si128(mask, add0);
865+ __m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1);
866+ __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)),
867+ _mm_xor_si128( l, _mm_set1_epi32(0x80000000)));
868+ __m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry);
869+
870+ *out_lo = l;
871+ *out_hi = h;
872+}
873+
874+void
875+blake3_hash4_sse2(const uint8_t *const *inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out_bytes)
876+{
877+ __m128i h_vecs[8];
878+ __m128i counter_low_vec, counter_high_vec;
879+ uint8_t block_flags;
880+ size_t block;
881+
882+ h_vecs[0] = set1_sse2(key[0]);
883+ h_vecs[1] = set1_sse2(key[1]);
884+ h_vecs[2] = set1_sse2(key[2]);
885+ h_vecs[3] = set1_sse2(key[3]);
886+ h_vecs[4] = set1_sse2(key[4]);
887+ h_vecs[5] = set1_sse2(key[5]);
888+ h_vecs[6] = set1_sse2(key[6]);
889+ h_vecs[7] = set1_sse2(key[7]);
890+
891+ load_counters_sse2(counter, increment_counter, &counter_low_vec, &counter_high_vec);
892+ block_flags = flags | flags_start;
893+
894+ for (block = 0; block < blocks; block++) {
895+ __m128i block_len_vec;
896+ __m128i block_flags_vec;
897+ __m128i msg_vecs[16];
898+ __m128i v[16];
899+
900+ if (block + 1 == blocks) {
901+ block_flags |= flags_end;
902+ }
903+ block_len_vec = set1_sse2(BLAKE3_BLOCK_LEN);
904+ block_flags_vec = set1_sse2(block_flags);
905+ transpose_msg_vecs_sse2(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
906+
907+ v[0] = h_vecs[0];
908+ v[1] = h_vecs[1];
909+ v[2] = h_vecs[2];
910+ v[3] = h_vecs[3];
911+ v[4] = h_vecs[4];
912+ v[5] = h_vecs[5];
913+ v[6] = h_vecs[6];
914+ v[7] = h_vecs[7];
915+ v[8] = set1_sse2(IV[0]);
916+ v[9] = set1_sse2(IV[1]);
917+ v[10] = set1_sse2(IV[2]);
918+ v[11] = set1_sse2(IV[3]);
919+ v[12] = counter_low_vec;
920+ v[13] = counter_high_vec;
921+ v[14] = block_len_vec;
922+ v[15] = block_flags_vec;
923+
924+ round_fn_sse2(v, msg_vecs, 0);
925+ round_fn_sse2(v, msg_vecs, 1);
926+ round_fn_sse2(v, msg_vecs, 2);
927+ round_fn_sse2(v, msg_vecs, 3);
928+ round_fn_sse2(v, msg_vecs, 4);
929+ round_fn_sse2(v, msg_vecs, 5);
930+ round_fn_sse2(v, msg_vecs, 6);
931+
932+ h_vecs[0] = xorv_sse2(v[0], v[8]);
933+ h_vecs[1] = xorv_sse2(v[1], v[9]);
934+ h_vecs[2] = xorv_sse2(v[2], v[10]);
935+ h_vecs[3] = xorv_sse2(v[3], v[11]);
936+ h_vecs[4] = xorv_sse2(v[4], v[12]);
937+ h_vecs[5] = xorv_sse2(v[5], v[13]);
938+ h_vecs[6] = xorv_sse2(v[6], v[14]);
939+ h_vecs[7] = xorv_sse2(v[7], v[15]);
940+
941+ block_flags = flags;
942+ }
943+
944+ transpose_vecs_sse2(&h_vecs[0]);
945+ transpose_vecs_sse2(&h_vecs[4]);
946+ storeu_sse2(h_vecs[0], &out_bytes[0 * sizeof(__m128i)]);
947+ storeu_sse2(h_vecs[4], &out_bytes[1 * sizeof(__m128i)]);
948+ storeu_sse2(h_vecs[1], &out_bytes[2 * sizeof(__m128i)]);
949+ storeu_sse2(h_vecs[5], &out_bytes[3 * sizeof(__m128i)]);
950+ storeu_sse2(h_vecs[2], &out_bytes[4 * sizeof(__m128i)]);
951+ storeu_sse2(h_vecs[6], &out_bytes[5 * sizeof(__m128i)]);
952+ storeu_sse2(h_vecs[3], &out_bytes[6 * sizeof(__m128i)]);
953+ storeu_sse2(h_vecs[7], &out_bytes[7 * sizeof(__m128i)]);
954+}
955+
956+static inline void
957+hash_one_sse2(const uint8_t *input, size_t blocks, const uint32_t key[8], uint64_t counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t out_bytes[BLAKE3_OUT_LEN])
958+{
959+ uint32_t cv[8];
960+ uint8_t block_flags;
961+
962+ memcpy(cv, key, BLAKE3_KEY_LEN);
963+ block_flags = flags | flags_start;
964+ while (blocks > 0) {
965+ if (blocks == 1) {
966+ block_flags |= flags_end;
967+ }
968+ blake3_compress_in_place_sse2(cv, input, BLAKE3_BLOCK_LEN, counter, block_flags);
969+ input = &input[BLAKE3_BLOCK_LEN];
970+ blocks -= 1;
971+ block_flags = flags;
972+ }
973+ memcpy(out_bytes, cv, BLAKE3_OUT_LEN);
974+}
975+
976+void
977+blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out_bytes)
978+{
979+ while (num_inputs >= DEGREE_SSE2) {
980+ blake3_hash4_sse2(inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out_bytes);
981+ if (increment_counter) {
982+ counter += DEGREE_SSE2;
983+ }
984+ inputs += DEGREE_SSE2;
985+ num_inputs -= DEGREE_SSE2;
986+ out_bytes = &out_bytes[DEGREE_SSE2 * BLAKE3_OUT_LEN];
987+ }
988+ while (num_inputs > 0) {
989+ hash_one_sse2(inputs[0], blocks, key, counter, flags, flags_start, flags_end, out_bytes);
990+ if (increment_counter) {
991+ counter += 1;
992+ }
993+ inputs += 1;
994+ num_inputs -= 1;
995+ out_bytes = &out_bytes[BLAKE3_OUT_LEN];
996+ }
997+}
998+#pragma GCC pop_options
999+
1000+#pragma GCC push_options
1001+#pragma GCC target("sse4.1")
1002+#define DEGREE_SSE41 4
1003+
1004+#define _mm_shuffle_ps2_sse41(a, b, c) \
1005+ (_mm_castps_si128( \
1006+ _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))
1007+
1008+static inline __m128i
1009+loadu_sse41(const uint8_t src[16])
1010+{
1011+ return _mm_loadu_si128((const __m128i *)src);
1012+}
1013+
1014+static inline void
1015+storeu_sse41(__m128i src, uint8_t dest[16])
1016+{
1017+ _mm_storeu_si128((__m128i *)dest, src);
1018+}
1019+
1020+static inline __m128i
1021+addv_sse41(__m128i a, __m128i b)
1022+{
1023+ return _mm_add_epi32(a, b);
1024+}
1025+
1026+static inline __m128i
1027+xorv_sse41(__m128i a, __m128i b)
1028+{
1029+ return _mm_xor_si128(a, b);
1030+}
1031+
1032+static inline __m128i
1033+set1_sse41(uint32_t x)
1034+{
1035+ return _mm_set1_epi32((int32_t)x);
1036+}
1037+
1038+static inline __m128i
1039+set4_sse41(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
1040+{
1041+ return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d);
1042+}
1043+
1044+static inline __m128i
1045+rot16_sse41(__m128i x)
1046+{
1047+ return _mm_shuffle_epi8(x, _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2));
1048+}
1049+
1050+static inline __m128i
1051+rot12_sse41(__m128i x)
1052+{
1053+ return xorv_sse41(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12));
1054+}
1055+
1056+static inline __m128i
1057+rot8_sse41(__m128i x)
1058+{
1059+ return _mm_shuffle_epi8(x, _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1));
1060+}
1061+
1062+static inline __m128i
1063+rot7_sse41(__m128i x)
1064+{
1065+ return xorv_sse41(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7));
1066+}
1067+
1068+static inline void
1069+g1_sse41(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, __m128i m)
1070+{
1071+ *row0 = addv_sse41(addv_sse41(*row0, m), *row1);
1072+ *row3 = xorv_sse41(*row3, *row0);
1073+ *row3 = rot16_sse41(*row3);
1074+ *row2 = addv_sse41(*row2, *row3);
1075+ *row1 = xorv_sse41(*row1, *row2);
1076+ *row1 = rot12_sse41(*row1);
1077+}
1078+
1079+static inline void
1080+g2_sse41(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, __m128i m)
1081+{
1082+ *row0 = addv_sse41(addv_sse41(*row0, m), *row1);
1083+ *row3 = xorv_sse41(*row3, *row0);
1084+ *row3 = rot8_sse41(*row3);
1085+ *row2 = addv_sse41(*row2, *row3);
1086+ *row1 = xorv_sse41(*row1, *row2);
1087+ *row1 = rot7_sse41(*row1);
1088+}
1089+
1090+static inline void
1091+diagonalize_sse41(__m128i *row0, __m128i *row2, __m128i *row3)
1092+{
1093+ *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3));
1094+ *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
1095+ *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1));
1096+}
1097+
1098+static inline void
1099+undiagonalize_sse41(__m128i *row0, __m128i *row2, __m128i *row3)
1100+{
1101+ *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1));
1102+ *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
1103+ *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
1104+}
1105+
1106+static inline void
1107+compress_pre_sse41(__m128i rows[4], const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
1108+{
1109+ __m128i m0, m1, m2, m3;
1110+ __m128i t0, t1, t2, t3, tt;
1111+
1112+ rows[0] = loadu_sse41((uint8_t *)&cv[0]);
1113+ rows[1] = loadu_sse41((uint8_t *)&cv[4]);
1114+ rows[2] = set4_sse41(IV[0], IV[1], IV[2], IV[3]);
1115+ rows[3] = set4_sse41(counter_low(counter), counter_high(counter), (uint32_t)block_len, (uint32_t)flags);
1116+
1117+ m0 = loadu_sse41(&block[sizeof(__m128i) * 0]);
1118+ m1 = loadu_sse41(&block[sizeof(__m128i) * 1]);
1119+ m2 = loadu_sse41(&block[sizeof(__m128i) * 2]);
1120+ m3 = loadu_sse41(&block[sizeof(__m128i) * 3]);
1121+
1122+ /* round 1 */
1123+ t0 = _mm_shuffle_ps2_sse41(m0, m1, _MM_SHUFFLE(2, 0, 2, 0));
1124+ g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t0);
1125+ t1 = _mm_shuffle_ps2_sse41(m0, m1, _MM_SHUFFLE(3, 1, 3, 1));
1126+ g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t1);
1127+ diagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1128+ t2 = _mm_shuffle_ps2_sse41(m2, m3, _MM_SHUFFLE(2, 0, 2, 0));
1129+ t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));
1130+ g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t2);
1131+ t3 = _mm_shuffle_ps2_sse41(m2, m3, _MM_SHUFFLE(3, 1, 3, 1));
1132+ t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3));
1133+ g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t3);
1134+ undiagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1135+ m0 = t0;
1136+ m1 = t1;
1137+ m2 = t2;
1138+ m3 = t3;
1139+
1140+ /* round 2 */
1141+ t0 = _mm_shuffle_ps2_sse41(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
1142+ t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
1143+ g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t0);
1144+ t1 = _mm_shuffle_ps2_sse41(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
1145+ tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
1146+ t1 = _mm_blend_epi16(tt, t1, 0xCC);
1147+ g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t1);
1148+ diagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1149+ t2 = _mm_unpacklo_epi64(m3, m1);
1150+ tt = _mm_blend_epi16(t2, m2, 0xC0);
1151+ t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
1152+ g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t2);
1153+ t3 = _mm_unpackhi_epi32(m1, m3);
1154+ tt = _mm_unpacklo_epi32(m2, t3);
1155+ t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
1156+ g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t3);
1157+ undiagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1158+ m0 = t0;
1159+ m1 = t1;
1160+ m2 = t2;
1161+ m3 = t3;
1162+
1163+ /* round 3 */
1164+ t0 = _mm_shuffle_ps2_sse41(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
1165+ t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
1166+ g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t0);
1167+ t1 = _mm_shuffle_ps2_sse41(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
1168+ tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
1169+ t1 = _mm_blend_epi16(tt, t1, 0xCC);
1170+ g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t1);
1171+ diagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1172+ t2 = _mm_unpacklo_epi64(m3, m1);
1173+ tt = _mm_blend_epi16(t2, m2, 0xC0);
1174+ t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
1175+ g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t2);
1176+ t3 = _mm_unpackhi_epi32(m1, m3);
1177+ tt = _mm_unpacklo_epi32(m2, t3);
1178+ t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
1179+ g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t3);
1180+ undiagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1181+ m0 = t0;
1182+ m1 = t1;
1183+ m2 = t2;
1184+ m3 = t3;
1185+
1186+ /* round 4 */
1187+ t0 = _mm_shuffle_ps2_sse41(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
1188+ t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
1189+ g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t0);
1190+ t1 = _mm_shuffle_ps2_sse41(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
1191+ tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
1192+ t1 = _mm_blend_epi16(tt, t1, 0xCC);
1193+ g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t1);
1194+ diagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1195+ t2 = _mm_unpacklo_epi64(m3, m1);
1196+ tt = _mm_blend_epi16(t2, m2, 0xC0);
1197+ t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
1198+ g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t2);
1199+ t3 = _mm_unpackhi_epi32(m1, m3);
1200+ tt = _mm_unpacklo_epi32(m2, t3);
1201+ t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
1202+ g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t3);
1203+ undiagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1204+ m0 = t0;
1205+ m1 = t1;
1206+ m2 = t2;
1207+ m3 = t3;
1208+
1209+ /* round 5 */
1210+ t0 = _mm_shuffle_ps2_sse41(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
1211+ t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
1212+ g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t0);
1213+ t1 = _mm_shuffle_ps2_sse41(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
1214+ tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
1215+ t1 = _mm_blend_epi16(tt, t1, 0xCC);
1216+ g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t1);
1217+ diagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1218+ t2 = _mm_unpacklo_epi64(m3, m1);
1219+ tt = _mm_blend_epi16(t2, m2, 0xC0);
1220+ t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
1221+ g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t2);
1222+ t3 = _mm_unpackhi_epi32(m1, m3);
1223+ tt = _mm_unpacklo_epi32(m2, t3);
1224+ t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
1225+ g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t3);
1226+ undiagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1227+ m0 = t0;
1228+ m1 = t1;
1229+ m2 = t2;
1230+ m3 = t3;
1231+
1232+ /* round 6 */
1233+ t0 = _mm_shuffle_ps2_sse41(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
1234+ t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
1235+ g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t0);
1236+ t1 = _mm_shuffle_ps2_sse41(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
1237+ tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
1238+ t1 = _mm_blend_epi16(tt, t1, 0xCC);
1239+ g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t1);
1240+ diagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1241+ t2 = _mm_unpacklo_epi64(m3, m1);
1242+ tt = _mm_blend_epi16(t2, m2, 0xC0);
1243+ t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
1244+ g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t2);
1245+ t3 = _mm_unpackhi_epi32(m1, m3);
1246+ tt = _mm_unpacklo_epi32(m2, t3);
1247+ t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
1248+ g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t3);
1249+ undiagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1250+ m0 = t0;
1251+ m1 = t1;
1252+ m2 = t2;
1253+ m3 = t3;
1254+
1255+ /* round 7 */
1256+ t0 = _mm_shuffle_ps2_sse41(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
1257+ t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
1258+ g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t0);
1259+ t1 = _mm_shuffle_ps2_sse41(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
1260+ tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
1261+ t1 = _mm_blend_epi16(tt, t1, 0xCC);
1262+ g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t1);
1263+ diagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1264+ t2 = _mm_unpacklo_epi64(m3, m1);
1265+ tt = _mm_blend_epi16(t2, m2, 0xC0);
1266+ t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
1267+ g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t2);
1268+ t3 = _mm_unpackhi_epi32(m1, m3);
1269+ tt = _mm_unpacklo_epi32(m2, t3);
1270+ t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
1271+ g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t3);
1272+ undiagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1273+}
1274+
1275+void
1276+blake3_compress_in_place_sse41(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
1277+{
1278+ __m128i rows[4];
1279+
1280+ compress_pre_sse41(rows, cv, block, block_len, counter, flags);
1281+ storeu_sse41(xorv_sse41(rows[0], rows[2]), (uint8_t *)&cv[0]);
1282+ storeu_sse41(xorv_sse41(rows[1], rows[3]), (uint8_t *)&cv[4]);
1283+}
1284+
1285+void
1286+blake3_compress_xof_sse41(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out_buf[64])
1287+{
1288+ __m128i rows[4];
1289+
1290+ compress_pre_sse41(rows, cv, block, block_len, counter, flags);
1291+ storeu_sse41(xorv_sse41(rows[0], rows[2]), &out_buf[0]);
1292+ storeu_sse41(xorv_sse41(rows[1], rows[3]), &out_buf[16]);
1293+ storeu_sse41(xorv_sse41(rows[2], loadu_sse41((uint8_t *)&cv[0])), &out_buf[32]);
1294+ storeu_sse41(xorv_sse41(rows[3], loadu_sse41((uint8_t *)&cv[4])), &out_buf[48]);
1295+}
1296+
1297+static inline void
1298+round_fn_sse41(__m128i v[16], __m128i m[16], size_t r)
1299+{
1300+ v[0] = addv_sse41(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
1301+ v[1] = addv_sse41(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
1302+ v[2] = addv_sse41(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
1303+ v[3] = addv_sse41(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
1304+ v[0] = addv_sse41(v[0], v[4]);
1305+ v[1] = addv_sse41(v[1], v[5]);
1306+ v[2] = addv_sse41(v[2], v[6]);
1307+ v[3] = addv_sse41(v[3], v[7]);
1308+ v[12] = xorv_sse41(v[12], v[0]);
1309+ v[13] = xorv_sse41(v[13], v[1]);
1310+ v[14] = xorv_sse41(v[14], v[2]);
1311+ v[15] = xorv_sse41(v[15], v[3]);
1312+ v[12] = rot16_sse41(v[12]);
1313+ v[13] = rot16_sse41(v[13]);
1314+ v[14] = rot16_sse41(v[14]);
1315+ v[15] = rot16_sse41(v[15]);
1316+ v[8] = addv_sse41(v[8], v[12]);
1317+ v[9] = addv_sse41(v[9], v[13]);
1318+ v[10] = addv_sse41(v[10], v[14]);
1319+ v[11] = addv_sse41(v[11], v[15]);
1320+ v[4] = xorv_sse41(v[4], v[8]);
1321+ v[5] = xorv_sse41(v[5], v[9]);
1322+ v[6] = xorv_sse41(v[6], v[10]);
1323+ v[7] = xorv_sse41(v[7], v[11]);
1324+ v[4] = rot12_sse41(v[4]);
1325+ v[5] = rot12_sse41(v[5]);
1326+ v[6] = rot12_sse41(v[6]);
1327+ v[7] = rot12_sse41(v[7]);
1328+ v[0] = addv_sse41(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
1329+ v[1] = addv_sse41(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
1330+ v[2] = addv_sse41(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
1331+ v[3] = addv_sse41(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
1332+ v[0] = addv_sse41(v[0], v[4]);
1333+ v[1] = addv_sse41(v[1], v[5]);
1334+ v[2] = addv_sse41(v[2], v[6]);
1335+ v[3] = addv_sse41(v[3], v[7]);
1336+ v[12] = xorv_sse41(v[12], v[0]);
1337+ v[13] = xorv_sse41(v[13], v[1]);
1338+ v[14] = xorv_sse41(v[14], v[2]);
1339+ v[15] = xorv_sse41(v[15], v[3]);
1340+ v[12] = rot8_sse41(v[12]);
1341+ v[13] = rot8_sse41(v[13]);
1342+ v[14] = rot8_sse41(v[14]);
1343+ v[15] = rot8_sse41(v[15]);
1344+ v[8] = addv_sse41(v[8], v[12]);
1345+ v[9] = addv_sse41(v[9], v[13]);
1346+ v[10] = addv_sse41(v[10], v[14]);
1347+ v[11] = addv_sse41(v[11], v[15]);
1348+ v[4] = xorv_sse41(v[4], v[8]);
1349+ v[5] = xorv_sse41(v[5], v[9]);
1350+ v[6] = xorv_sse41(v[6], v[10]);
1351+ v[7] = xorv_sse41(v[7], v[11]);
1352+ v[4] = rot7_sse41(v[4]);
1353+ v[5] = rot7_sse41(v[5]);
1354+ v[6] = rot7_sse41(v[6]);
1355+ v[7] = rot7_sse41(v[7]);
1356+
1357+ v[0] = addv_sse41(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
1358+ v[1] = addv_sse41(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
1359+ v[2] = addv_sse41(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
1360+ v[3] = addv_sse41(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
1361+ v[0] = addv_sse41(v[0], v[5]);
1362+ v[1] = addv_sse41(v[1], v[6]);
1363+ v[2] = addv_sse41(v[2], v[7]);
1364+ v[3] = addv_sse41(v[3], v[4]);
1365+ v[15] = xorv_sse41(v[15], v[0]);
1366+ v[12] = xorv_sse41(v[12], v[1]);
1367+ v[13] = xorv_sse41(v[13], v[2]);
1368+ v[14] = xorv_sse41(v[14], v[3]);
1369+ v[15] = rot16_sse41(v[15]);
1370+ v[12] = rot16_sse41(v[12]);
1371+ v[13] = rot16_sse41(v[13]);
1372+ v[14] = rot16_sse41(v[14]);
1373+ v[10] = addv_sse41(v[10], v[15]);
1374+ v[11] = addv_sse41(v[11], v[12]);
1375+ v[8] = addv_sse41(v[8], v[13]);
1376+ v[9] = addv_sse41(v[9], v[14]);
1377+ v[5] = xorv_sse41(v[5], v[10]);
1378+ v[6] = xorv_sse41(v[6], v[11]);
1379+ v[7] = xorv_sse41(v[7], v[8]);
1380+ v[4] = xorv_sse41(v[4], v[9]);
1381+ v[5] = rot12_sse41(v[5]);
1382+ v[6] = rot12_sse41(v[6]);
1383+ v[7] = rot12_sse41(v[7]);
1384+ v[4] = rot12_sse41(v[4]);
1385+ v[0] = addv_sse41(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
1386+ v[1] = addv_sse41(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
1387+ v[2] = addv_sse41(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
1388+ v[3] = addv_sse41(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
1389+ v[0] = addv_sse41(v[0], v[5]);
1390+ v[1] = addv_sse41(v[1], v[6]);
1391+ v[2] = addv_sse41(v[2], v[7]);
1392+ v[3] = addv_sse41(v[3], v[4]);
1393+ v[15] = xorv_sse41(v[15], v[0]);
1394+ v[12] = xorv_sse41(v[12], v[1]);
1395+ v[13] = xorv_sse41(v[13], v[2]);
1396+ v[14] = xorv_sse41(v[14], v[3]);
1397+ v[15] = rot8_sse41(v[15]);
1398+ v[12] = rot8_sse41(v[12]);
1399+ v[13] = rot8_sse41(v[13]);
1400+ v[14] = rot8_sse41(v[14]);
1401+ v[10] = addv_sse41(v[10], v[15]);
1402+ v[11] = addv_sse41(v[11], v[12]);
1403+ v[8] = addv_sse41(v[8], v[13]);
1404+ v[9] = addv_sse41(v[9], v[14]);
1405+ v[5] = xorv_sse41(v[5], v[10]);
1406+ v[6] = xorv_sse41(v[6], v[11]);
1407+ v[7] = xorv_sse41(v[7], v[8]);
1408+ v[4] = xorv_sse41(v[4], v[9]);
1409+ v[5] = rot7_sse41(v[5]);
1410+ v[6] = rot7_sse41(v[6]);
1411+ v[7] = rot7_sse41(v[7]);
1412+ v[4] = rot7_sse41(v[4]);
1413+}
1414+
1415+static inline void
1416+transpose_vecs_sse41(__m128i vecs[DEGREE_SSE41])
1417+{
1418+ __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
1419+ __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
1420+ __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
1421+ __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
1422+
1423+ __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
1424+ __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
1425+ __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
1426+ __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);
1427+
1428+ vecs[0] = abcd_0;
1429+ vecs[1] = abcd_1;
1430+ vecs[2] = abcd_2;
1431+ vecs[3] = abcd_3;
1432+}
1433+
1434+static inline void
1435+transpose_msg_vecs_sse41(const uint8_t *const *inputs, size_t block_offset, __m128i out_msg[16])
1436+{
1437+ size_t i;
1438+
1439+ out_msg[0] = loadu_sse41(&inputs[0][block_offset + 0 * sizeof(__m128i)]);
1440+ out_msg[1] = loadu_sse41(&inputs[1][block_offset + 0 * sizeof(__m128i)]);
1441+ out_msg[2] = loadu_sse41(&inputs[2][block_offset + 0 * sizeof(__m128i)]);
1442+ out_msg[3] = loadu_sse41(&inputs[3][block_offset + 0 * sizeof(__m128i)]);
1443+ out_msg[4] = loadu_sse41(&inputs[0][block_offset + 1 * sizeof(__m128i)]);
1444+ out_msg[5] = loadu_sse41(&inputs[1][block_offset + 1 * sizeof(__m128i)]);
1445+ out_msg[6] = loadu_sse41(&inputs[2][block_offset + 1 * sizeof(__m128i)]);
1446+ out_msg[7] = loadu_sse41(&inputs[3][block_offset + 1 * sizeof(__m128i)]);
1447+ out_msg[8] = loadu_sse41(&inputs[0][block_offset + 2 * sizeof(__m128i)]);
1448+ out_msg[9] = loadu_sse41(&inputs[1][block_offset + 2 * sizeof(__m128i)]);
1449+ out_msg[10] = loadu_sse41(&inputs[2][block_offset + 2 * sizeof(__m128i)]);
1450+ out_msg[11] = loadu_sse41(&inputs[3][block_offset + 2 * sizeof(__m128i)]);
1451+ out_msg[12] = loadu_sse41(&inputs[0][block_offset + 3 * sizeof(__m128i)]);
1452+ out_msg[13] = loadu_sse41(&inputs[1][block_offset + 3 * sizeof(__m128i)]);
1453+ out_msg[14] = loadu_sse41(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
1454+ out_msg[15] = loadu_sse41(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
1455+
1456+ for (i = 0; i < 4; i++) {
1457+ _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
1458+ }
1459+ transpose_vecs_sse41(&out_msg[0]);
1460+ transpose_vecs_sse41(&out_msg[4]);
1461+ transpose_vecs_sse41(&out_msg[8]);
1462+ transpose_vecs_sse41(&out_msg[12]);
1463+}
1464+
1465+static inline void
1466+load_counters_sse41(uint64_t counter, int increment_counter, __m128i *out_lo, __m128i *out_hi)
1467+{
1468+ const __m128i mask = _mm_set1_epi32(-increment_counter);
1469+ const __m128i add0 = _mm_set_epi32(3, 2, 1, 0);
1470+ const __m128i add1 = _mm_and_si128(mask, add0);
1471+ __m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1);
1472+ __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)),
1473+ _mm_xor_si128( l, _mm_set1_epi32(0x80000000)));
1474+ __m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry);
1475+
1476+ *out_lo = l;
1477+ *out_hi = h;
1478+}
1479+
1480+void
1481+blake3_hash4_sse41(const uint8_t *const *inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out_bytes)
1482+{
1483+ __m128i h_vecs[8];
1484+ __m128i counter_low_vec, counter_high_vec;
1485+ uint8_t block_flags;
1486+ size_t block;
1487+
1488+ h_vecs[0] = set1_sse41(key[0]);
1489+ h_vecs[1] = set1_sse41(key[1]);
1490+ h_vecs[2] = set1_sse41(key[2]);
1491+ h_vecs[3] = set1_sse41(key[3]);
1492+ h_vecs[4] = set1_sse41(key[4]);
1493+ h_vecs[5] = set1_sse41(key[5]);
1494+ h_vecs[6] = set1_sse41(key[6]);
1495+ h_vecs[7] = set1_sse41(key[7]);
1496+
1497+ load_counters_sse41(counter, increment_counter, &counter_low_vec, &counter_high_vec);
1498+ block_flags = flags | flags_start;
1499+
1500+ for (block = 0; block < blocks; block++) {
1501+ __m128i block_len_vec;
1502+ __m128i block_flags_vec;
1503+ __m128i msg_vecs[16];
1504+ __m128i v[16];
1505+
1506+ if (block + 1 == blocks) {
1507+ block_flags |= flags_end;
1508+ }
1509+ block_len_vec = set1_sse41(BLAKE3_BLOCK_LEN);
1510+ block_flags_vec = set1_sse41(block_flags);
1511+ transpose_msg_vecs_sse41(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
1512+
1513+ v[0] = h_vecs[0];
1514+ v[1] = h_vecs[1];
1515+ v[2] = h_vecs[2];
1516+ v[3] = h_vecs[3];
1517+ v[4] = h_vecs[4];
1518+ v[5] = h_vecs[5];
1519+ v[6] = h_vecs[6];
1520+ v[7] = h_vecs[7];
1521+ v[8] = set1_sse41(IV[0]);
1522+ v[9] = set1_sse41(IV[1]);
1523+ v[10] = set1_sse41(IV[2]);
1524+ v[11] = set1_sse41(IV[3]);
1525+ v[12] = counter_low_vec;
1526+ v[13] = counter_high_vec;
1527+ v[14] = block_len_vec;
1528+ v[15] = block_flags_vec;
1529+
1530+ round_fn_sse41(v, msg_vecs, 0);
1531+ round_fn_sse41(v, msg_vecs, 1);
1532+ round_fn_sse41(v, msg_vecs, 2);
1533+ round_fn_sse41(v, msg_vecs, 3);
1534+ round_fn_sse41(v, msg_vecs, 4);
1535+ round_fn_sse41(v, msg_vecs, 5);
1536+ round_fn_sse41(v, msg_vecs, 6);
1537+
1538+ h_vecs[0] = xorv_sse41(v[0], v[8]);
1539+ h_vecs[1] = xorv_sse41(v[1], v[9]);
1540+ h_vecs[2] = xorv_sse41(v[2], v[10]);
1541+ h_vecs[3] = xorv_sse41(v[3], v[11]);
1542+ h_vecs[4] = xorv_sse41(v[4], v[12]);
1543+ h_vecs[5] = xorv_sse41(v[5], v[13]);
1544+ h_vecs[6] = xorv_sse41(v[6], v[14]);
1545+ h_vecs[7] = xorv_sse41(v[7], v[15]);
1546+
1547+ block_flags = flags;
1548+ }
1549+
1550+ transpose_vecs_sse41(&h_vecs[0]);
1551+ transpose_vecs_sse41(&h_vecs[4]);
1552+ storeu_sse41(h_vecs[0], &out_bytes[0 * sizeof(__m128i)]);
1553+ storeu_sse41(h_vecs[4], &out_bytes[1 * sizeof(__m128i)]);
1554+ storeu_sse41(h_vecs[1], &out_bytes[2 * sizeof(__m128i)]);
1555+ storeu_sse41(h_vecs[5], &out_bytes[3 * sizeof(__m128i)]);
1556+ storeu_sse41(h_vecs[2], &out_bytes[4 * sizeof(__m128i)]);
1557+ storeu_sse41(h_vecs[6], &out_bytes[5 * sizeof(__m128i)]);
1558+ storeu_sse41(h_vecs[3], &out_bytes[6 * sizeof(__m128i)]);
1559+ storeu_sse41(h_vecs[7], &out_bytes[7 * sizeof(__m128i)]);
1560+}
1561+
1562+static inline void
1563+hash_one_sse41(const uint8_t *input, size_t blocks, const uint32_t key[8], uint64_t counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t out_bytes[BLAKE3_OUT_LEN])
1564+{
1565+ uint32_t cv[8];
1566+ uint8_t block_flags;
1567+
1568+ memcpy(cv, key, BLAKE3_KEY_LEN);
1569+ block_flags = flags | flags_start;
1570+ while (blocks > 0) {
1571+ if (blocks == 1) {
1572+ block_flags |= flags_end;
1573+ }
1574+ blake3_compress_in_place_sse41(cv, input, BLAKE3_BLOCK_LEN, counter, block_flags);
1575+ input = &input[BLAKE3_BLOCK_LEN];
1576+ blocks -= 1;
1577+ block_flags = flags;
1578+ }
1579+ memcpy(out_bytes, cv, BLAKE3_OUT_LEN);
1580+}
1581+
1582+void
1583+blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out_bytes)
1584+{
1585+ while (num_inputs >= DEGREE_SSE41) {
1586+ blake3_hash4_sse41(inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out_bytes);
1587+ if (increment_counter) {
1588+ counter += DEGREE_SSE41;
1589+ }
1590+ inputs += DEGREE_SSE41;
1591+ num_inputs -= DEGREE_SSE41;
1592+ out_bytes = &out_bytes[DEGREE_SSE41 * BLAKE3_OUT_LEN];
1593+ }
1594+ while (num_inputs > 0) {
1595+ hash_one_sse41(inputs[0], blocks, key, counter, flags, flags_start, flags_end, out_bytes);
1596+ if (increment_counter) {
1597+ counter += 1;
1598+ }
1599+ inputs += 1;
1600+ num_inputs -= 1;
1601+ out_bytes = &out_bytes[BLAKE3_OUT_LEN];
1602+ }
1603+}
1604+#pragma GCC pop_options
1605+
1606+#pragma GCC push_options
1607+#pragma GCC target("avx2")
1608+#define DEGREE_AVX2 8
1609+
1610+static inline __m256i
1611+loadu_avx2(const uint8_t src[32])
1612+{
1613+ return _mm256_loadu_si256((const __m256i *)src);
1614+}
1615+
1616+static inline void
1617+storeu_avx2(__m256i src, uint8_t dest[32])
1618+{
1619+ _mm256_storeu_si256((__m256i *)dest, src);
1620+}
1621+
1622+static inline __m256i
1623+addv_avx2(__m256i a, __m256i b)
1624+{
1625+ return _mm256_add_epi32(a, b);
1626+}
1627+
1628+static inline __m256i
1629+xorv_avx2(__m256i a, __m256i b)
1630+{
1631+ return _mm256_xor_si256(a, b);
1632+}
1633+
1634+static inline __m256i
1635+set1_avx2(uint32_t x)
1636+{
1637+ return _mm256_set1_epi32((int32_t)x);
1638+}
1639+
1640+static inline __m256i
1641+rot16_avx2(__m256i x)
1642+{
1643+ return _mm256_shuffle_epi8(x, _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2,
1644+ 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2));
1645+}
1646+
1647+static inline __m256i
1648+rot12_avx2(__m256i x)
1649+{
1650+ return _mm256_or_si256(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12));
1651+}
1652+
1653+static inline __m256i
1654+rot8_avx2(__m256i x)
1655+{
1656+ return _mm256_shuffle_epi8(x, _mm256_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1,
1657+ 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1));
1658+}
1659+
1660+static inline __m256i
1661+rot7_avx2(__m256i x)
1662+{
1663+ return _mm256_or_si256(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7));
1664+}
1665+
1666+static inline void
1667+round_fn_avx2(__m256i v[16], __m256i m[16], size_t r)
1668+{
1669+ v[0] = addv_avx2(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
1670+ v[1] = addv_avx2(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
1671+ v[2] = addv_avx2(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
1672+ v[3] = addv_avx2(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
1673+ v[0] = addv_avx2(v[0], v[4]);
1674+ v[1] = addv_avx2(v[1], v[5]);
1675+ v[2] = addv_avx2(v[2], v[6]);
1676+ v[3] = addv_avx2(v[3], v[7]);
1677+ v[12] = xorv_avx2(v[12], v[0]);
1678+ v[13] = xorv_avx2(v[13], v[1]);
1679+ v[14] = xorv_avx2(v[14], v[2]);
1680+ v[15] = xorv_avx2(v[15], v[3]);
1681+ v[12] = rot16_avx2(v[12]);
1682+ v[13] = rot16_avx2(v[13]);
1683+ v[14] = rot16_avx2(v[14]);
1684+ v[15] = rot16_avx2(v[15]);
1685+ v[8] = addv_avx2(v[8], v[12]);
1686+ v[9] = addv_avx2(v[9], v[13]);
1687+ v[10] = addv_avx2(v[10], v[14]);
1688+ v[11] = addv_avx2(v[11], v[15]);
1689+ v[4] = xorv_avx2(v[4], v[8]);
1690+ v[5] = xorv_avx2(v[5], v[9]);
1691+ v[6] = xorv_avx2(v[6], v[10]);
1692+ v[7] = xorv_avx2(v[7], v[11]);
1693+ v[4] = rot12_avx2(v[4]);
1694+ v[5] = rot12_avx2(v[5]);
1695+ v[6] = rot12_avx2(v[6]);
1696+ v[7] = rot12_avx2(v[7]);
1697+ v[0] = addv_avx2(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
1698+ v[1] = addv_avx2(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
1699+ v[2] = addv_avx2(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
1700+ v[3] = addv_avx2(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
1701+ v[0] = addv_avx2(v[0], v[4]);
1702+ v[1] = addv_avx2(v[1], v[5]);
1703+ v[2] = addv_avx2(v[2], v[6]);
1704+ v[3] = addv_avx2(v[3], v[7]);
1705+ v[12] = xorv_avx2(v[12], v[0]);
1706+ v[13] = xorv_avx2(v[13], v[1]);
1707+ v[14] = xorv_avx2(v[14], v[2]);
1708+ v[15] = xorv_avx2(v[15], v[3]);
1709+ v[12] = rot8_avx2(v[12]);
1710+ v[13] = rot8_avx2(v[13]);
1711+ v[14] = rot8_avx2(v[14]);
1712+ v[15] = rot8_avx2(v[15]);
1713+ v[8] = addv_avx2(v[8], v[12]);
1714+ v[9] = addv_avx2(v[9], v[13]);
1715+ v[10] = addv_avx2(v[10], v[14]);
1716+ v[11] = addv_avx2(v[11], v[15]);
1717+ v[4] = xorv_avx2(v[4], v[8]);
1718+ v[5] = xorv_avx2(v[5], v[9]);
1719+ v[6] = xorv_avx2(v[6], v[10]);
1720+ v[7] = xorv_avx2(v[7], v[11]);
1721+ v[4] = rot7_avx2(v[4]);
1722+ v[5] = rot7_avx2(v[5]);
1723+ v[6] = rot7_avx2(v[6]);
1724+ v[7] = rot7_avx2(v[7]);
1725+
1726+ v[0] = addv_avx2(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
1727+ v[1] = addv_avx2(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
1728+ v[2] = addv_avx2(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
1729+ v[3] = addv_avx2(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
1730+ v[0] = addv_avx2(v[0], v[5]);
1731+ v[1] = addv_avx2(v[1], v[6]);
1732+ v[2] = addv_avx2(v[2], v[7]);
1733+ v[3] = addv_avx2(v[3], v[4]);
1734+ v[15] = xorv_avx2(v[15], v[0]);
1735+ v[12] = xorv_avx2(v[12], v[1]);
1736+ v[13] = xorv_avx2(v[13], v[2]);
1737+ v[14] = xorv_avx2(v[14], v[3]);
1738+ v[15] = rot16_avx2(v[15]);
1739+ v[12] = rot16_avx2(v[12]);
1740+ v[13] = rot16_avx2(v[13]);
1741+ v[14] = rot16_avx2(v[14]);
1742+ v[10] = addv_avx2(v[10], v[15]);
1743+ v[11] = addv_avx2(v[11], v[12]);
1744+ v[8] = addv_avx2(v[8], v[13]);
1745+ v[9] = addv_avx2(v[9], v[14]);
1746+ v[5] = xorv_avx2(v[5], v[10]);
1747+ v[6] = xorv_avx2(v[6], v[11]);
1748+ v[7] = xorv_avx2(v[7], v[8]);
1749+ v[4] = xorv_avx2(v[4], v[9]);
1750+ v[5] = rot12_avx2(v[5]);
1751+ v[6] = rot12_avx2(v[6]);
1752+ v[7] = rot12_avx2(v[7]);
1753+ v[4] = rot12_avx2(v[4]);
1754+ v[0] = addv_avx2(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
1755+ v[1] = addv_avx2(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
1756+ v[2] = addv_avx2(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
1757+ v[3] = addv_avx2(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
1758+ v[0] = addv_avx2(v[0], v[5]);
1759+ v[1] = addv_avx2(v[1], v[6]);
1760+ v[2] = addv_avx2(v[2], v[7]);
1761+ v[3] = addv_avx2(v[3], v[4]);
1762+ v[15] = xorv_avx2(v[15], v[0]);
1763+ v[12] = xorv_avx2(v[12], v[1]);
1764+ v[13] = xorv_avx2(v[13], v[2]);
1765+ v[14] = xorv_avx2(v[14], v[3]);
1766+ v[15] = rot8_avx2(v[15]);
1767+ v[12] = rot8_avx2(v[12]);
1768+ v[13] = rot8_avx2(v[13]);
1769+ v[14] = rot8_avx2(v[14]);
1770+ v[10] = addv_avx2(v[10], v[15]);
1771+ v[11] = addv_avx2(v[11], v[12]);
1772+ v[8] = addv_avx2(v[8], v[13]);
1773+ v[9] = addv_avx2(v[9], v[14]);
1774+ v[5] = xorv_avx2(v[5], v[10]);
1775+ v[6] = xorv_avx2(v[6], v[11]);
1776+ v[7] = xorv_avx2(v[7], v[8]);
1777+ v[4] = xorv_avx2(v[4], v[9]);
1778+ v[5] = rot7_avx2(v[5]);
1779+ v[6] = rot7_avx2(v[6]);
1780+ v[7] = rot7_avx2(v[7]);
1781+ v[4] = rot7_avx2(v[4]);
1782+}
1783+
1784+static inline void
1785+transpose_vecs_avx2(__m256i vecs[DEGREE_AVX2])
1786+{
1787+ __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]);
1788+ __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]);
1789+ __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]);
1790+ __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]);
1791+ __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]);
1792+ __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]);
1793+ __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
1794+ __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);
1795+
1796+ __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
1797+ __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
1798+ __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367);
1799+ __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367);
1800+ __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145);
1801+ __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145);
1802+ __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367);
1803+ __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367);
1804+
1805+ vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20);
1806+ vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20);
1807+ vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20);
1808+ vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20);
1809+ vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31);
1810+ vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31);
1811+ vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31);
1812+ vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31);
1813+}
1814+
1815+static inline void
1816+transpose_msg_vecs_avx2(const uint8_t *const *inputs, size_t block_offset, __m256i out_msg[16])
1817+{
1818+ size_t i;
1819+
1820+ out_msg[0] = loadu_avx2(&inputs[0][block_offset + 0 * sizeof(__m256i)]);
1821+ out_msg[1] = loadu_avx2(&inputs[1][block_offset + 0 * sizeof(__m256i)]);
1822+ out_msg[2] = loadu_avx2(&inputs[2][block_offset + 0 * sizeof(__m256i)]);
1823+ out_msg[3] = loadu_avx2(&inputs[3][block_offset + 0 * sizeof(__m256i)]);
1824+ out_msg[4] = loadu_avx2(&inputs[4][block_offset + 0 * sizeof(__m256i)]);
1825+ out_msg[5] = loadu_avx2(&inputs[5][block_offset + 0 * sizeof(__m256i)]);
1826+ out_msg[6] = loadu_avx2(&inputs[6][block_offset + 0 * sizeof(__m256i)]);
1827+ out_msg[7] = loadu_avx2(&inputs[7][block_offset + 0 * sizeof(__m256i)]);
1828+ out_msg[8] = loadu_avx2(&inputs[0][block_offset + 1 * sizeof(__m256i)]);
1829+ out_msg[9] = loadu_avx2(&inputs[1][block_offset + 1 * sizeof(__m256i)]);
1830+ out_msg[10] = loadu_avx2(&inputs[2][block_offset + 1 * sizeof(__m256i)]);
1831+ out_msg[11] = loadu_avx2(&inputs[3][block_offset + 1 * sizeof(__m256i)]);
1832+ out_msg[12] = loadu_avx2(&inputs[4][block_offset + 1 * sizeof(__m256i)]);
1833+ out_msg[13] = loadu_avx2(&inputs[5][block_offset + 1 * sizeof(__m256i)]);
1834+ out_msg[14] = loadu_avx2(&inputs[6][block_offset + 1 * sizeof(__m256i)]);
1835+ out_msg[15] = loadu_avx2(&inputs[7][block_offset + 1 * sizeof(__m256i)]);
1836+
1837+ for (i = 0; i < 8; i++) {
1838+ _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
1839+ }
1840+ transpose_vecs_avx2(&out_msg[0]);
1841+ transpose_vecs_avx2(&out_msg[8]);
1842+}
1843+
1844+static inline void
1845+load_counters_avx2(uint64_t counter, int increment_counter, __m256i *out_lo, __m256i *out_hi)
1846+{
1847+ const __m256i mask = _mm256_set1_epi32(-increment_counter);
1848+ const __m256i add0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
1849+ const __m256i add1 = _mm256_and_si256(mask, add0);
1850+ __m256i l = _mm256_add_epi32(_mm256_set1_epi32((int32_t)counter), add1);
1851+ __m256i carry = _mm256_xor_si256(add1, _mm256_set1_epi32(0x80000000));
1852+ __m256i comp = _mm256_xor_si256(l, _mm256_set1_epi32(0x80000000));
1853+ __m256i gt = _mm256_cmpgt_epi32(carry, comp);
1854+ __m256i h = _mm256_sub_epi32(_mm256_set1_epi32((int32_t)(counter >> 32)), gt);
1855+
1856+ *out_lo = l;
1857+ *out_hi = h;
1858+}
1859+
1860+void
1861+blake3_hash8_avx2(const uint8_t *const *inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out_bytes)
1862+{
1863+ __m256i h_vecs[8];
1864+ __m256i counter_low_vec, counter_high_vec;
1865+ uint8_t block_flags;
1866+ size_t block;
1867+
1868+ h_vecs[0] = set1_avx2(key[0]);
1869+ h_vecs[1] = set1_avx2(key[1]);
1870+ h_vecs[2] = set1_avx2(key[2]);
1871+ h_vecs[3] = set1_avx2(key[3]);
1872+ h_vecs[4] = set1_avx2(key[4]);
1873+ h_vecs[5] = set1_avx2(key[5]);
1874+ h_vecs[6] = set1_avx2(key[6]);
1875+ h_vecs[7] = set1_avx2(key[7]);
1876+
1877+ load_counters_avx2(counter, increment_counter, &counter_low_vec, &counter_high_vec);
1878+ block_flags = flags | flags_start;
1879+
1880+ for (block = 0; block < blocks; block++) {
1881+ __m256i block_len_vec;
1882+ __m256i block_flags_vec;
1883+ __m256i msg_vecs[16];
1884+ __m256i v[16];
1885+
1886+ if (block + 1 == blocks) {
1887+ block_flags |= flags_end;
1888+ }
1889+ block_len_vec = set1_avx2(BLAKE3_BLOCK_LEN);
1890+ block_flags_vec = set1_avx2(block_flags);
1891+ transpose_msg_vecs_avx2(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
1892+
1893+ v[0] = h_vecs[0];
1894+ v[1] = h_vecs[1];
1895+ v[2] = h_vecs[2];
1896+ v[3] = h_vecs[3];
1897+ v[4] = h_vecs[4];
1898+ v[5] = h_vecs[5];
1899+ v[6] = h_vecs[6];
1900+ v[7] = h_vecs[7];
1901+ v[8] = set1_avx2(IV[0]);
1902+ v[9] = set1_avx2(IV[1]);
1903+ v[10] = set1_avx2(IV[2]);
1904+ v[11] = set1_avx2(IV[3]);
1905+ v[12] = counter_low_vec;
1906+ v[13] = counter_high_vec;
1907+ v[14] = block_len_vec;
1908+ v[15] = block_flags_vec;
1909+
1910+ round_fn_avx2(v, msg_vecs, 0);
1911+ round_fn_avx2(v, msg_vecs, 1);
1912+ round_fn_avx2(v, msg_vecs, 2);
1913+ round_fn_avx2(v, msg_vecs, 3);
1914+ round_fn_avx2(v, msg_vecs, 4);
1915+ round_fn_avx2(v, msg_vecs, 5);
1916+ round_fn_avx2(v, msg_vecs, 6);
1917+
1918+ h_vecs[0] = xorv_avx2(v[0], v[8]);
1919+ h_vecs[1] = xorv_avx2(v[1], v[9]);
1920+ h_vecs[2] = xorv_avx2(v[2], v[10]);
1921+ h_vecs[3] = xorv_avx2(v[3], v[11]);
1922+ h_vecs[4] = xorv_avx2(v[4], v[12]);
1923+ h_vecs[5] = xorv_avx2(v[5], v[13]);
1924+ h_vecs[6] = xorv_avx2(v[6], v[14]);
1925+ h_vecs[7] = xorv_avx2(v[7], v[15]);
1926+
1927+ block_flags = flags;
1928+ }
1929+
1930+ transpose_vecs_avx2(h_vecs);
1931+ storeu_avx2(h_vecs[0], &out_bytes[0 * sizeof(__m256i)]);
1932+ storeu_avx2(h_vecs[1], &out_bytes[1 * sizeof(__m256i)]);
1933+ storeu_avx2(h_vecs[2], &out_bytes[2 * sizeof(__m256i)]);
1934+ storeu_avx2(h_vecs[3], &out_bytes[3 * sizeof(__m256i)]);
1935+ storeu_avx2(h_vecs[4], &out_bytes[4 * sizeof(__m256i)]);
1936+ storeu_avx2(h_vecs[5], &out_bytes[5 * sizeof(__m256i)]);
1937+ storeu_avx2(h_vecs[6], &out_bytes[6 * sizeof(__m256i)]);
1938+ storeu_avx2(h_vecs[7], &out_bytes[7 * sizeof(__m256i)]);
1939+}
1940+
1941+void
1942+blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out_bytes)
1943+{
1944+ while (num_inputs >= DEGREE_AVX2) {
1945+ blake3_hash8_avx2(inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out_bytes);
1946+ if (increment_counter) {
1947+ counter += DEGREE_AVX2;
1948+ }
1949+ inputs += DEGREE_AVX2;
1950+ num_inputs -= DEGREE_AVX2;
1951+ out_bytes = &out_bytes[DEGREE_AVX2 * BLAKE3_OUT_LEN];
1952+ }
1953+ blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out_bytes);
1954+}
1955+#pragma GCC pop_options
1956+
1957+#pragma GCC push_options
1958+#pragma GCC target("avx512f,avx512vl")
1959+static inline __m128i
1960+loadu_128_avx512(const uint8_t src[16])
1961+{
1962+ return _mm_loadu_si128((const __m128i *)src);
1963+}
1964+
1965+static inline __m256i
1966+loadu_256_avx512(const uint8_t src[32])
1967+{
1968+ return _mm256_loadu_si256((const __m256i *)src);
1969+}
1970+
1971+static inline __m512i
1972+loadu_512_avx512(const uint8_t src[64])
1973+{
1974+ return _mm512_loadu_si512((const __m512i *)src);
1975+}
1976+
1977+static inline void
1978+storeu_128_avx512(__m128i src, uint8_t dest[16])
1979+{
1980+ _mm_storeu_si128((__m128i *)dest, src);
1981+}
1982+
1983+static inline void
1984+storeu_256_avx512(__m256i src, uint8_t dest[32])
1985+{
1986+ _mm256_storeu_si256((__m256i *)dest, src);
1987+}
1988+
1989+static inline __m128i
1990+add_128_avx512(__m128i a, __m128i b)
1991+{
1992+ return _mm_add_epi32(a, b);
1993+}
1994+
1995+static inline __m256i
1996+add_256_avx512(__m256i a, __m256i b)
1997+{
1998+ return _mm256_add_epi32(a, b);
1999+}
2000+
2001+static inline __m512i
2002+add_512_avx512(__m512i a, __m512i b)
2003+{
2004+ return _mm512_add_epi32(a, b);
2005+}
2006+
2007+static inline __m128i
2008+xor_128_avx512(__m128i a, __m128i b)
2009+{
2010+ return _mm_xor_si128(a, b);
2011+}
2012+
2013+static inline __m256i
2014+xor_256_avx512(__m256i a, __m256i b)
2015+{
2016+ return _mm256_xor_si256(a, b);
2017+}
2018+
2019+static inline __m512i
2020+xor_512_avx512(__m512i a, __m512i b)
2021+{
2022+ return _mm512_xor_si512(a, b);
2023+}
2024+
2025+static inline __m128i
2026+set1_128_avx512(uint32_t x)
2027+{
2028+ return _mm_set1_epi32((int32_t)x);
2029+}
2030+
2031+static inline __m256i
2032+set1_256_avx512(uint32_t x)
2033+{
2034+ return _mm256_set1_epi32((int32_t)x);
2035+}
2036+
2037+static inline __m512i
2038+set1_512_avx512(uint32_t x)
2039+{
2040+ return _mm512_set1_epi32((int32_t)x);
2041+}
2042+
2043+static inline __m128i
2044+set4_avx512(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
2045+{
2046+ return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d);
2047+}
2048+
2049+static inline __m128i
2050+rot16_128_avx512(__m128i x)
2051+{
2052+ return _mm_ror_epi32(x, 16);
2053+}
2054+
2055+static inline __m256i
2056+rot16_256_avx512(__m256i x)
2057+{
2058+ return _mm256_ror_epi32(x, 16);
2059+}
2060+
2061+static inline __m512i
2062+rot16_512_avx512(__m512i x)
2063+{
2064+ return _mm512_ror_epi32(x, 16);
2065+}
2066+
2067+static inline __m128i
2068+rot12_128_avx512(__m128i x)
2069+{
2070+ return _mm_ror_epi32(x, 12);
2071+}
2072+
2073+static inline __m256i
2074+rot12_256_avx512(__m256i x)
2075+{
2076+ return _mm256_ror_epi32(x, 12);
2077+}
2078+
2079+static inline __m512i
2080+rot12_512_avx512(__m512i x)
2081+{
2082+ return _mm512_ror_epi32(x, 12);
2083+}
2084+
2085+static inline __m128i
2086+rot8_128_avx512(__m128i x)
2087+{
2088+ return _mm_ror_epi32(x, 8);
2089+}
2090+
2091+static inline __m256i
2092+rot8_256_avx512(__m256i x)
2093+{
2094+ return _mm256_ror_epi32(x, 8);
2095+}
2096+
2097+static inline __m512i
2098+rot8_512_avx512(__m512i x)
2099+{
2100+ return _mm512_ror_epi32(x, 8);
2101+}
2102+
2103+static inline __m128i
2104+rot7_128_avx512(__m128i x)
2105+{
2106+ return _mm_ror_epi32(x, 7);
2107+}
2108+
2109+static inline __m256i
2110+rot7_256_avx512(__m256i x)
2111+{
2112+ return _mm256_ror_epi32(x, 7);
2113+}
2114+
2115+static inline __m512i
2116+rot7_512_avx512(__m512i x)
2117+{
2118+ return _mm512_ror_epi32(x, 7);
2119+}
2120+
2121+static inline void
2122+g1_avx512(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, __m128i m)
2123+{
2124+ *row0 = add_128_avx512(add_128_avx512(*row0, m), *row1);
2125+ *row3 = xor_128_avx512(*row3, *row0);
2126+ *row3 = rot16_128_avx512(*row3);
2127+ *row2 = add_128_avx512(*row2, *row3);
2128+ *row1 = xor_128_avx512(*row1, *row2);
2129+ *row1 = rot12_128_avx512(*row1);
2130+}
2131+
2132+static inline void
2133+g2_avx512(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, __m128i m)
2134+{
2135+ *row0 = add_128_avx512(add_128_avx512(*row0, m), *row1);
2136+ *row3 = xor_128_avx512(*row3, *row0);
2137+ *row3 = rot8_128_avx512(*row3);
2138+ *row2 = add_128_avx512(*row2, *row3);
2139+ *row1 = xor_128_avx512(*row1, *row2);
2140+ *row1 = rot7_128_avx512(*row1);
2141+}
2142+
2143+static inline void
2144+diagonalize_avx512(__m128i *row0, __m128i *row2, __m128i *row3)
2145+{
2146+ *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3));
2147+ *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
2148+ *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1));
2149+}
2150+
2151+static inline void
2152+undiagonalize_avx512(__m128i *row0, __m128i *row2, __m128i *row3)
2153+{
2154+ *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1));
2155+ *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
2156+ *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
2157+}
2158+
2159+static inline void
2160+compress_pre_avx512(__m128i rows[4], const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
2161+{
2162+ __m128i m0, m1, m2, m3;
2163+ __m128i t0, t1, t2, t3;
2164+
2165+ rows[0] = loadu_128_avx512((const uint8_t *)&cv[0]);
2166+ rows[1] = loadu_128_avx512((const uint8_t *)&cv[4]);
2167+ rows[2] = set4_avx512(IV[0], IV[1], IV[2], IV[3]);
2168+ rows[3] = set4_avx512(counter_low(counter), counter_high(counter), (uint32_t)block_len, (uint32_t)flags);
2169+
2170+ m0 = loadu_128_avx512(&block[sizeof(__m128i) * 0]);
2171+ m1 = loadu_128_avx512(&block[sizeof(__m128i) * 1]);
2172+ m2 = loadu_128_avx512(&block[sizeof(__m128i) * 2]);
2173+ m3 = loadu_128_avx512(&block[sizeof(__m128i) * 3]);
2174+
2175+ /* round 1 */
2176+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0));
2177+ g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t0);
2178+ t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1));
2179+ g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t1);
2180+ diagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2181+ t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0));
2182+ t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));
2183+ g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t2);
2184+ t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1));
2185+ t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3));
2186+ g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t3);
2187+ undiagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2188+ m0 = t0;
2189+ m1 = t1;
2190+ m2 = t2;
2191+ m3 = t3;
2192+
2193+ /* round 2 */
2194+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
2195+ t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
2196+ g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t0);
2197+ t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
2198+ t1 = _mm_blend_epi16(m0, t1, 0xCC);
2199+ g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t1);
2200+ diagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2201+ t2 = _mm_unpacklo_epi64(m3, m1);
2202+ t2 = _mm_blend_epi16(t2, m2, 0xC0);
2203+ t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1, 3, 2, 0));
2204+ g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t2);
2205+ t3 = _mm_unpackhi_epi32(m1, m3);
2206+ t3 = _mm_unpacklo_epi32(m2, t3);
2207+ t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(0, 1, 3, 2));
2208+ g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t3);
2209+ undiagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2210+ m0 = t0;
2211+ m1 = t1;
2212+ m2 = t2;
2213+ m3 = t3;
2214+
2215+ /* round 3 */
2216+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
2217+ t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
2218+ g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t0);
2219+ t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
2220+ t1 = _mm_blend_epi16(m0, t1, 0xCC);
2221+ g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t1);
2222+ diagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2223+ t2 = _mm_unpacklo_epi64(m3, m1);
2224+ t2 = _mm_blend_epi16(t2, m2, 0xC0);
2225+ t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1, 3, 2, 0));
2226+ g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t2);
2227+ t3 = _mm_unpackhi_epi32(m1, m3);
2228+ t3 = _mm_unpacklo_epi32(m2, t3);
2229+ t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(0, 1, 3, 2));
2230+ g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t3);
2231+ undiagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2232+ m0 = t0;
2233+ m1 = t1;
2234+ m2 = t2;
2235+ m3 = t3;
2236+
2237+ /* round 4 */
2238+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
2239+ t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
2240+ g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t0);
2241+ t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
2242+ t1 = _mm_blend_epi16(m0, t1, 0xCC);
2243+ g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t1);
2244+ diagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2245+ t2 = _mm_unpacklo_epi64(m3, m1);
2246+ t2 = _mm_blend_epi16(t2, m2, 0xC0);
2247+ t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1, 3, 2, 0));
2248+ g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t2);
2249+ t3 = _mm_unpackhi_epi32(m1, m3);
2250+ t3 = _mm_unpacklo_epi32(m2, t3);
2251+ t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(0, 1, 3, 2));
2252+ g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t3);
2253+ undiagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2254+ m0 = t0;
2255+ m1 = t1;
2256+ m2 = t2;
2257+ m3 = t3;
2258+
2259+ /* round 5 */
2260+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
2261+ t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
2262+ g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t0);
2263+ t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
2264+ t1 = _mm_blend_epi16(m0, t1, 0xCC);
2265+ g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t1);
2266+ diagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2267+ t2 = _mm_unpacklo_epi64(m3, m1);
2268+ t2 = _mm_blend_epi16(t2, m2, 0xC0);
2269+ t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1, 3, 2, 0));
2270+ g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t2);
2271+ t3 = _mm_unpackhi_epi32(m1, m3);
2272+ t3 = _mm_unpacklo_epi32(m2, t3);
2273+ t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(0, 1, 3, 2));
2274+ g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t3);
2275+ undiagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2276+ m0 = t0;
2277+ m1 = t1;
2278+ m2 = t2;
2279+ m3 = t3;
2280+
2281+ /* round 6 */
2282+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
2283+ t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
2284+ g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t0);
2285+ t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
2286+ t1 = _mm_blend_epi16(m0, t1, 0xCC);
2287+ g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t1);
2288+ diagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2289+ t2 = _mm_unpacklo_epi64(m3, m1);
2290+ t2 = _mm_blend_epi16(t2, m2, 0xC0);
2291+ t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1, 3, 2, 0));
2292+ g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t2);
2293+ t3 = _mm_unpackhi_epi32(m1, m3);
2294+ t3 = _mm_unpacklo_epi32(m2, t3);
2295+ t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(0, 1, 3, 2));
2296+ g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t3);
2297+ undiagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2298+ m0 = t0;
2299+ m1 = t1;
2300+ m2 = t2;
2301+ m3 = t3;
2302+
2303+ /* round 7 */
2304+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
2305+ t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
2306+ g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t0);
2307+ t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
2308+ t1 = _mm_blend_epi16(m0, t1, 0xCC);
2309+ g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t1);
2310+ diagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2311+ t2 = _mm_unpacklo_epi64(m3, m1);
2312+ t2 = _mm_blend_epi16(t2, m2, 0xC0);
2313+ t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1, 3, 2, 0));
2314+ g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t2);
2315+ t3 = _mm_unpackhi_epi32(m1, m3);
2316+ t3 = _mm_unpacklo_epi32(m2, t3);
2317+ t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(0, 1, 3, 2));
2318+ g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t3);
2319+ undiagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2320+}
2321+
2322+void
2323+blake3_compress_in_place_avx512(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
2324+{
2325+ __m128i rows[4];
2326+
2327+ compress_pre_avx512(rows, cv, block, block_len, counter, flags);
2328+ storeu_128_avx512(xor_128_avx512(rows[0], rows[2]), (uint8_t *)&cv[0]);
2329+ storeu_128_avx512(xor_128_avx512(rows[1], rows[3]), (uint8_t *)&cv[4]);
2330+}
2331+
2332+void
2333+blake3_compress_xof_avx512(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out_buf[64])
2334+{
2335+ __m128i rows[4];
2336+
2337+ compress_pre_avx512(rows, cv, block, block_len, counter, flags);
2338+ storeu_128_avx512(xor_128_avx512(rows[0], rows[2]), &out_buf[0]);
2339+ storeu_128_avx512(xor_128_avx512(rows[1], rows[3]), &out_buf[16]);
2340+ storeu_128_avx512(xor_128_avx512(rows[2], loadu_128_avx512((const uint8_t *)&cv[0])), &out_buf[32]);
2341+ storeu_128_avx512(xor_128_avx512(rows[3], loadu_128_avx512((const uint8_t *)&cv[4])), &out_buf[48]);
2342+}
2343+
2344+static inline void
2345+round_fn4_avx512(__m128i v[16], __m128i m[16], size_t r)
2346+{
2347+ v[0] = add_128_avx512(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
2348+ v[1] = add_128_avx512(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
2349+ v[2] = add_128_avx512(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
2350+ v[3] = add_128_avx512(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
2351+ v[0] = add_128_avx512(v[0], v[4]);
2352+ v[1] = add_128_avx512(v[1], v[5]);
2353+ v[2] = add_128_avx512(v[2], v[6]);
2354+ v[3] = add_128_avx512(v[3], v[7]);
2355+ v[12] = xor_128_avx512(v[12], v[0]);
2356+ v[13] = xor_128_avx512(v[13], v[1]);
2357+ v[14] = xor_128_avx512(v[14], v[2]);
2358+ v[15] = xor_128_avx512(v[15], v[3]);
2359+ v[12] = rot16_128_avx512(v[12]);
2360+ v[13] = rot16_128_avx512(v[13]);
2361+ v[14] = rot16_128_avx512(v[14]);
2362+ v[15] = rot16_128_avx512(v[15]);
2363+ v[8] = add_128_avx512(v[8], v[12]);
2364+ v[9] = add_128_avx512(v[9], v[13]);
2365+ v[10] = add_128_avx512(v[10], v[14]);
2366+ v[11] = add_128_avx512(v[11], v[15]);
2367+ v[4] = xor_128_avx512(v[4], v[8]);
2368+ v[5] = xor_128_avx512(v[5], v[9]);
2369+ v[6] = xor_128_avx512(v[6], v[10]);
2370+ v[7] = xor_128_avx512(v[7], v[11]);
2371+ v[4] = rot12_128_avx512(v[4]);
2372+ v[5] = rot12_128_avx512(v[5]);
2373+ v[6] = rot12_128_avx512(v[6]);
2374+ v[7] = rot12_128_avx512(v[7]);
2375+ v[0] = add_128_avx512(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
2376+ v[1] = add_128_avx512(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
2377+ v[2] = add_128_avx512(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
2378+ v[3] = add_128_avx512(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
2379+ v[0] = add_128_avx512(v[0], v[4]);
2380+ v[1] = add_128_avx512(v[1], v[5]);
2381+ v[2] = add_128_avx512(v[2], v[6]);
2382+ v[3] = add_128_avx512(v[3], v[7]);
2383+ v[12] = xor_128_avx512(v[12], v[0]);
2384+ v[13] = xor_128_avx512(v[13], v[1]);
2385+ v[14] = xor_128_avx512(v[14], v[2]);
2386+ v[15] = xor_128_avx512(v[15], v[3]);
2387+ v[12] = rot8_128_avx512(v[12]);
2388+ v[13] = rot8_128_avx512(v[13]);
2389+ v[14] = rot8_128_avx512(v[14]);
2390+ v[15] = rot8_128_avx512(v[15]);
2391+ v[8] = add_128_avx512(v[8], v[12]);
2392+ v[9] = add_128_avx512(v[9], v[13]);
2393+ v[10] = add_128_avx512(v[10], v[14]);
2394+ v[11] = add_128_avx512(v[11], v[15]);
2395+ v[4] = xor_128_avx512(v[4], v[8]);
2396+ v[5] = xor_128_avx512(v[5], v[9]);
2397+ v[6] = xor_128_avx512(v[6], v[10]);
2398+ v[7] = xor_128_avx512(v[7], v[11]);
2399+ v[4] = rot7_128_avx512(v[4]);
2400+ v[5] = rot7_128_avx512(v[5]);
2401+ v[6] = rot7_128_avx512(v[6]);
2402+ v[7] = rot7_128_avx512(v[7]);
2403+
2404+ v[0] = add_128_avx512(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
2405+ v[1] = add_128_avx512(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
2406+ v[2] = add_128_avx512(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
2407+ v[3] = add_128_avx512(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
2408+ v[0] = add_128_avx512(v[0], v[5]);
2409+ v[1] = add_128_avx512(v[1], v[6]);
2410+ v[2] = add_128_avx512(v[2], v[7]);
2411+ v[3] = add_128_avx512(v[3], v[4]);
2412+ v[15] = xor_128_avx512(v[15], v[0]);
2413+ v[12] = xor_128_avx512(v[12], v[1]);
2414+ v[13] = xor_128_avx512(v[13], v[2]);
2415+ v[14] = xor_128_avx512(v[14], v[3]);
2416+ v[15] = rot16_128_avx512(v[15]);
2417+ v[12] = rot16_128_avx512(v[12]);
2418+ v[13] = rot16_128_avx512(v[13]);
2419+ v[14] = rot16_128_avx512(v[14]);
2420+ v[10] = add_128_avx512(v[10], v[15]);
2421+ v[11] = add_128_avx512(v[11], v[12]);
2422+ v[8] = add_128_avx512(v[8], v[13]);
2423+ v[9] = add_128_avx512(v[9], v[14]);
2424+ v[5] = xor_128_avx512(v[5], v[10]);
2425+ v[6] = xor_128_avx512(v[6], v[11]);
2426+ v[7] = xor_128_avx512(v[7], v[8]);
2427+ v[4] = xor_128_avx512(v[4], v[9]);
2428+ v[5] = rot12_128_avx512(v[5]);
2429+ v[6] = rot12_128_avx512(v[6]);
2430+ v[7] = rot12_128_avx512(v[7]);
2431+ v[4] = rot12_128_avx512(v[4]);
2432+ v[0] = add_128_avx512(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
2433+ v[1] = add_128_avx512(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
2434+ v[2] = add_128_avx512(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
2435+ v[3] = add_128_avx512(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
2436+ v[0] = add_128_avx512(v[0], v[5]);
2437+ v[1] = add_128_avx512(v[1], v[6]);
2438+ v[2] = add_128_avx512(v[2], v[7]);
2439+ v[3] = add_128_avx512(v[3], v[4]);
2440+ v[15] = xor_128_avx512(v[15], v[0]);
2441+ v[12] = xor_128_avx512(v[12], v[1]);
2442+ v[13] = xor_128_avx512(v[13], v[2]);
2443+ v[14] = xor_128_avx512(v[14], v[3]);
2444+ v[15] = rot8_128_avx512(v[15]);
2445+ v[12] = rot8_128_avx512(v[12]);
2446+ v[13] = rot8_128_avx512(v[13]);
2447+ v[14] = rot8_128_avx512(v[14]);
2448+ v[10] = add_128_avx512(v[10], v[15]);
2449+ v[11] = add_128_avx512(v[11], v[12]);
2450+ v[8] = add_128_avx512(v[8], v[13]);
2451+ v[9] = add_128_avx512(v[9], v[14]);
2452+ v[5] = xor_128_avx512(v[5], v[10]);
2453+ v[6] = xor_128_avx512(v[6], v[11]);
2454+ v[7] = xor_128_avx512(v[7], v[8]);
2455+ v[4] = xor_128_avx512(v[4], v[9]);
2456+ v[5] = rot7_128_avx512(v[5]);
2457+ v[6] = rot7_128_avx512(v[6]);
2458+ v[7] = rot7_128_avx512(v[7]);
2459+ v[4] = rot7_128_avx512(v[4]);
2460+}
2461+
2462+static inline void
2463+round_fn8_avx512(__m256i v[16], __m256i m[16], size_t r)
2464+{
2465+ v[0] = add_256_avx512(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
2466+ v[1] = add_256_avx512(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
2467+ v[2] = add_256_avx512(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
2468+ v[3] = add_256_avx512(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
2469+ v[0] = add_256_avx512(v[0], v[4]);
2470+ v[1] = add_256_avx512(v[1], v[5]);
2471+ v[2] = add_256_avx512(v[2], v[6]);
2472+ v[3] = add_256_avx512(v[3], v[7]);
2473+ v[12] = xor_256_avx512(v[12], v[0]);
2474+ v[13] = xor_256_avx512(v[13], v[1]);
2475+ v[14] = xor_256_avx512(v[14], v[2]);
2476+ v[15] = xor_256_avx512(v[15], v[3]);
2477+ v[12] = rot16_256_avx512(v[12]);
2478+ v[13] = rot16_256_avx512(v[13]);
2479+ v[14] = rot16_256_avx512(v[14]);
2480+ v[15] = rot16_256_avx512(v[15]);
2481+ v[8] = add_256_avx512(v[8], v[12]);
2482+ v[9] = add_256_avx512(v[9], v[13]);
2483+ v[10] = add_256_avx512(v[10], v[14]);
2484+ v[11] = add_256_avx512(v[11], v[15]);
2485+ v[4] = xor_256_avx512(v[4], v[8]);
2486+ v[5] = xor_256_avx512(v[5], v[9]);
2487+ v[6] = xor_256_avx512(v[6], v[10]);
2488+ v[7] = xor_256_avx512(v[7], v[11]);
2489+ v[4] = rot12_256_avx512(v[4]);
2490+ v[5] = rot12_256_avx512(v[5]);
2491+ v[6] = rot12_256_avx512(v[6]);
2492+ v[7] = rot12_256_avx512(v[7]);
2493+ v[0] = add_256_avx512(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
2494+ v[1] = add_256_avx512(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
2495+ v[2] = add_256_avx512(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
2496+ v[3] = add_256_avx512(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
2497+ v[0] = add_256_avx512(v[0], v[4]);
2498+ v[1] = add_256_avx512(v[1], v[5]);
2499+ v[2] = add_256_avx512(v[2], v[6]);
2500+ v[3] = add_256_avx512(v[3], v[7]);
2501+ v[12] = xor_256_avx512(v[12], v[0]);
2502+ v[13] = xor_256_avx512(v[13], v[1]);
2503+ v[14] = xor_256_avx512(v[14], v[2]);
2504+ v[15] = xor_256_avx512(v[15], v[3]);
2505+ v[12] = rot8_256_avx512(v[12]);
2506+ v[13] = rot8_256_avx512(v[13]);
2507+ v[14] = rot8_256_avx512(v[14]);
2508+ v[15] = rot8_256_avx512(v[15]);
2509+ v[8] = add_256_avx512(v[8], v[12]);
2510+ v[9] = add_256_avx512(v[9], v[13]);
2511+ v[10] = add_256_avx512(v[10], v[14]);
2512+ v[11] = add_256_avx512(v[11], v[15]);
2513+ v[4] = xor_256_avx512(v[4], v[8]);
2514+ v[5] = xor_256_avx512(v[5], v[9]);
2515+ v[6] = xor_256_avx512(v[6], v[10]);
2516+ v[7] = xor_256_avx512(v[7], v[11]);
2517+ v[4] = rot7_256_avx512(v[4]);
2518+ v[5] = rot7_256_avx512(v[5]);
2519+ v[6] = rot7_256_avx512(v[6]);
2520+ v[7] = rot7_256_avx512(v[7]);
2521+
2522+ v[0] = add_256_avx512(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
2523+ v[1] = add_256_avx512(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
2524+ v[2] = add_256_avx512(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
2525+ v[3] = add_256_avx512(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
2526+ v[0] = add_256_avx512(v[0], v[5]);
2527+ v[1] = add_256_avx512(v[1], v[6]);
2528+ v[2] = add_256_avx512(v[2], v[7]);
2529+ v[3] = add_256_avx512(v[3], v[4]);
2530+ v[15] = xor_256_avx512(v[15], v[0]);
2531+ v[12] = xor_256_avx512(v[12], v[1]);
2532+ v[13] = xor_256_avx512(v[13], v[2]);
2533+ v[14] = xor_256_avx512(v[14], v[3]);
2534+ v[15] = rot16_256_avx512(v[15]);
2535+ v[12] = rot16_256_avx512(v[12]);
2536+ v[13] = rot16_256_avx512(v[13]);
2537+ v[14] = rot16_256_avx512(v[14]);
2538+ v[10] = add_256_avx512(v[10], v[15]);
2539+ v[11] = add_256_avx512(v[11], v[12]);
2540+ v[8] = add_256_avx512(v[8], v[13]);
2541+ v[9] = add_256_avx512(v[9], v[14]);
2542+ v[5] = xor_256_avx512(v[5], v[10]);
2543+ v[6] = xor_256_avx512(v[6], v[11]);
2544+ v[7] = xor_256_avx512(v[7], v[8]);
2545+ v[4] = xor_256_avx512(v[4], v[9]);
2546+ v[5] = rot12_256_avx512(v[5]);
2547+ v[6] = rot12_256_avx512(v[6]);
2548+ v[7] = rot12_256_avx512(v[7]);
2549+ v[4] = rot12_256_avx512(v[4]);
2550+ v[0] = add_256_avx512(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
2551+ v[1] = add_256_avx512(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
2552+ v[2] = add_256_avx512(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
2553+ v[3] = add_256_avx512(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
2554+ v[0] = add_256_avx512(v[0], v[5]);
2555+ v[1] = add_256_avx512(v[1], v[6]);
2556+ v[2] = add_256_avx512(v[2], v[7]);
2557+ v[3] = add_256_avx512(v[3], v[4]);
2558+ v[15] = xor_256_avx512(v[15], v[0]);
2559+ v[12] = xor_256_avx512(v[12], v[1]);
2560+ v[13] = xor_256_avx512(v[13], v[2]);
2561+ v[14] = xor_256_avx512(v[14], v[3]);
2562+ v[15] = rot8_256_avx512(v[15]);
2563+ v[12] = rot8_256_avx512(v[12]);
2564+ v[13] = rot8_256_avx512(v[13]);
2565+ v[14] = rot8_256_avx512(v[14]);
2566+ v[10] = add_256_avx512(v[10], v[15]);
2567+ v[11] = add_256_avx512(v[11], v[12]);
2568+ v[8] = add_256_avx512(v[8], v[13]);
2569+ v[9] = add_256_avx512(v[9], v[14]);
2570+ v[5] = xor_256_avx512(v[5], v[10]);
2571+ v[6] = xor_256_avx512(v[6], v[11]);
2572+ v[7] = xor_256_avx512(v[7], v[8]);
2573+ v[4] = xor_256_avx512(v[4], v[9]);
2574+ v[5] = rot7_256_avx512(v[5]);
2575+ v[6] = rot7_256_avx512(v[6]);
2576+ v[7] = rot7_256_avx512(v[7]);
2577+ v[4] = rot7_256_avx512(v[4]);
2578+}
2579+
2580+static inline void
2581+round_fn16_avx512(__m512i v[16], __m512i m[16], size_t r)
2582+{
2583+ v[0] = add_512_avx512(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
2584+ v[1] = add_512_avx512(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
2585+ v[2] = add_512_avx512(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
2586+ v[3] = add_512_avx512(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
2587+ v[0] = add_512_avx512(v[0], v[4]);
2588+ v[1] = add_512_avx512(v[1], v[5]);
2589+ v[2] = add_512_avx512(v[2], v[6]);
2590+ v[3] = add_512_avx512(v[3], v[7]);
2591+ v[12] = xor_512_avx512(v[12], v[0]);
2592+ v[13] = xor_512_avx512(v[13], v[1]);
2593+ v[14] = xor_512_avx512(v[14], v[2]);
2594+ v[15] = xor_512_avx512(v[15], v[3]);
2595+ v[12] = rot16_512_avx512(v[12]);
2596+ v[13] = rot16_512_avx512(v[13]);
2597+ v[14] = rot16_512_avx512(v[14]);
2598+ v[15] = rot16_512_avx512(v[15]);
2599+ v[8] = add_512_avx512(v[8], v[12]);
2600+ v[9] = add_512_avx512(v[9], v[13]);
2601+ v[10] = add_512_avx512(v[10], v[14]);
2602+ v[11] = add_512_avx512(v[11], v[15]);
2603+ v[4] = xor_512_avx512(v[4], v[8]);
2604+ v[5] = xor_512_avx512(v[5], v[9]);
2605+ v[6] = xor_512_avx512(v[6], v[10]);
2606+ v[7] = xor_512_avx512(v[7], v[11]);
2607+ v[4] = rot12_512_avx512(v[4]);
2608+ v[5] = rot12_512_avx512(v[5]);
2609+ v[6] = rot12_512_avx512(v[6]);
2610+ v[7] = rot12_512_avx512(v[7]);
2611+ v[0] = add_512_avx512(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
2612+ v[1] = add_512_avx512(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
2613+ v[2] = add_512_avx512(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
2614+ v[3] = add_512_avx512(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
2615+ v[0] = add_512_avx512(v[0], v[4]);
2616+ v[1] = add_512_avx512(v[1], v[5]);
2617+ v[2] = add_512_avx512(v[2], v[6]);
2618+ v[3] = add_512_avx512(v[3], v[7]);
2619+ v[12] = xor_512_avx512(v[12], v[0]);
2620+ v[13] = xor_512_avx512(v[13], v[1]);
2621+ v[14] = xor_512_avx512(v[14], v[2]);
2622+ v[15] = xor_512_avx512(v[15], v[3]);
2623+ v[12] = rot8_512_avx512(v[12]);
2624+ v[13] = rot8_512_avx512(v[13]);
2625+ v[14] = rot8_512_avx512(v[14]);
2626+ v[15] = rot8_512_avx512(v[15]);
2627+ v[8] = add_512_avx512(v[8], v[12]);
2628+ v[9] = add_512_avx512(v[9], v[13]);
2629+ v[10] = add_512_avx512(v[10], v[14]);
2630+ v[11] = add_512_avx512(v[11], v[15]);
2631+ v[4] = xor_512_avx512(v[4], v[8]);
2632+ v[5] = xor_512_avx512(v[5], v[9]);
2633+ v[6] = xor_512_avx512(v[6], v[10]);
2634+ v[7] = xor_512_avx512(v[7], v[11]);
2635+ v[4] = rot7_512_avx512(v[4]);
2636+ v[5] = rot7_512_avx512(v[5]);
2637+ v[6] = rot7_512_avx512(v[6]);
2638+ v[7] = rot7_512_avx512(v[7]);
2639+
2640+ v[0] = add_512_avx512(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
2641+ v[1] = add_512_avx512(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
2642+ v[2] = add_512_avx512(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
2643+ v[3] = add_512_avx512(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
2644+ v[0] = add_512_avx512(v[0], v[5]);
2645+ v[1] = add_512_avx512(v[1], v[6]);
2646+ v[2] = add_512_avx512(v[2], v[7]);
2647+ v[3] = add_512_avx512(v[3], v[4]);
2648+ v[15] = xor_512_avx512(v[15], v[0]);
2649+ v[12] = xor_512_avx512(v[12], v[1]);
2650+ v[13] = xor_512_avx512(v[13], v[2]);
2651+ v[14] = xor_512_avx512(v[14], v[3]);
2652+ v[15] = rot16_512_avx512(v[15]);
2653+ v[12] = rot16_512_avx512(v[12]);
2654+ v[13] = rot16_512_avx512(v[13]);
2655+ v[14] = rot16_512_avx512(v[14]);
2656+ v[10] = add_512_avx512(v[10], v[15]);
2657+ v[11] = add_512_avx512(v[11], v[12]);
2658+ v[8] = add_512_avx512(v[8], v[13]);
2659+ v[9] = add_512_avx512(v[9], v[14]);
2660+ v[5] = xor_512_avx512(v[5], v[10]);
2661+ v[6] = xor_512_avx512(v[6], v[11]);
2662+ v[7] = xor_512_avx512(v[7], v[8]);
2663+ v[4] = xor_512_avx512(v[4], v[9]);
2664+ v[5] = rot12_512_avx512(v[5]);
2665+ v[6] = rot12_512_avx512(v[6]);
2666+ v[7] = rot12_512_avx512(v[7]);
2667+ v[4] = rot12_512_avx512(v[4]);
2668+ v[0] = add_512_avx512(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
2669+ v[1] = add_512_avx512(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
2670+ v[2] = add_512_avx512(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
2671+ v[3] = add_512_avx512(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
2672+ v[0] = add_512_avx512(v[0], v[5]);
2673+ v[1] = add_512_avx512(v[1], v[6]);
2674+ v[2] = add_512_avx512(v[2], v[7]);
2675+ v[3] = add_512_avx512(v[3], v[4]);
2676+ v[15] = xor_512_avx512(v[15], v[0]);
2677+ v[12] = xor_512_avx512(v[12], v[1]);
2678+ v[13] = xor_512_avx512(v[13], v[2]);
2679+ v[14] = xor_512_avx512(v[14], v[3]);
2680+ v[15] = rot8_512_avx512(v[15]);
2681+ v[12] = rot8_512_avx512(v[12]);
2682+ v[13] = rot8_512_avx512(v[13]);
2683+ v[14] = rot8_512_avx512(v[14]);
2684+ v[10] = add_512_avx512(v[10], v[15]);
2685+ v[11] = add_512_avx512(v[11], v[12]);
2686+ v[8] = add_512_avx512(v[8], v[13]);
2687+ v[9] = add_512_avx512(v[9], v[14]);
2688+ v[5] = xor_512_avx512(v[5], v[10]);
2689+ v[6] = xor_512_avx512(v[6], v[11]);
2690+ v[7] = xor_512_avx512(v[7], v[8]);
2691+ v[4] = xor_512_avx512(v[4], v[9]);
2692+ v[5] = rot7_512_avx512(v[5]);
2693+ v[6] = rot7_512_avx512(v[6]);
2694+ v[7] = rot7_512_avx512(v[7]);
2695+ v[4] = rot7_512_avx512(v[4]);
2696+}
2697+
2698+#define LO_IMM8 0x88
2699+#define HI_IMM8 0xdd
2700+
2701+static inline __m512i
2702+unpack_lo_128_avx512(__m512i a, __m512i b)
2703+{
2704+ return _mm512_shuffle_i32x4(a, b, LO_IMM8);
2705+}
2706+
2707+static inline __m512i
2708+unpack_hi_128_avx512(__m512i a, __m512i b)
2709+{
2710+ return _mm512_shuffle_i32x4(a, b, HI_IMM8);
2711+}
2712+
2713+static inline void
2714+transpose_vecs_512_avx512(__m512i vecs[16])
2715+{
2716+ __m512i ab_0 = _mm512_unpacklo_epi32(vecs[0], vecs[1]);
2717+ __m512i ab_2 = _mm512_unpackhi_epi32(vecs[0], vecs[1]);
2718+ __m512i cd_0 = _mm512_unpacklo_epi32(vecs[2], vecs[3]);
2719+ __m512i cd_2 = _mm512_unpackhi_epi32(vecs[2], vecs[3]);
2720+ __m512i ef_0 = _mm512_unpacklo_epi32(vecs[4], vecs[5]);
2721+ __m512i ef_2 = _mm512_unpackhi_epi32(vecs[4], vecs[5]);
2722+ __m512i gh_0 = _mm512_unpacklo_epi32(vecs[6], vecs[7]);
2723+ __m512i gh_2 = _mm512_unpackhi_epi32(vecs[6], vecs[7]);
2724+ __m512i ij_0 = _mm512_unpacklo_epi32(vecs[8], vecs[9]);
2725+ __m512i ij_2 = _mm512_unpackhi_epi32(vecs[8], vecs[9]);
2726+ __m512i kl_0 = _mm512_unpacklo_epi32(vecs[10], vecs[11]);
2727+ __m512i kl_2 = _mm512_unpackhi_epi32(vecs[10], vecs[11]);
2728+ __m512i mn_0 = _mm512_unpacklo_epi32(vecs[12], vecs[13]);
2729+ __m512i mn_2 = _mm512_unpackhi_epi32(vecs[12], vecs[13]);
2730+ __m512i op_0 = _mm512_unpacklo_epi32(vecs[14], vecs[15]);
2731+ __m512i op_2 = _mm512_unpackhi_epi32(vecs[14], vecs[15]);
2732+
2733+ __m512i abcd_0 = _mm512_unpacklo_epi64(ab_0, cd_0);
2734+ __m512i abcd_1 = _mm512_unpackhi_epi64(ab_0, cd_0);
2735+ __m512i abcd_2 = _mm512_unpacklo_epi64(ab_2, cd_2);
2736+ __m512i abcd_3 = _mm512_unpackhi_epi64(ab_2, cd_2);
2737+ __m512i efgh_0 = _mm512_unpacklo_epi64(ef_0, gh_0);
2738+ __m512i efgh_1 = _mm512_unpackhi_epi64(ef_0, gh_0);
2739+ __m512i efgh_2 = _mm512_unpacklo_epi64(ef_2, gh_2);
2740+ __m512i efgh_3 = _mm512_unpackhi_epi64(ef_2, gh_2);
2741+ __m512i ijkl_0 = _mm512_unpacklo_epi64(ij_0, kl_0);
2742+ __m512i ijkl_1 = _mm512_unpackhi_epi64(ij_0, kl_0);
2743+ __m512i ijkl_2 = _mm512_unpacklo_epi64(ij_2, kl_2);
2744+ __m512i ijkl_3 = _mm512_unpackhi_epi64(ij_2, kl_2);
2745+ __m512i mnop_0 = _mm512_unpacklo_epi64(mn_0, op_0);
2746+ __m512i mnop_1 = _mm512_unpackhi_epi64(mn_0, op_0);
2747+ __m512i mnop_2 = _mm512_unpacklo_epi64(mn_2, op_2);
2748+ __m512i mnop_3 = _mm512_unpackhi_epi64(mn_2, op_2);
2749+
2750+ __m512i abcdefgh_0 = unpack_lo_128_avx512(abcd_0, efgh_0);
2751+ __m512i abcdefgh_1 = unpack_lo_128_avx512(abcd_1, efgh_1);
2752+ __m512i abcdefgh_2 = unpack_lo_128_avx512(abcd_2, efgh_2);
2753+ __m512i abcdefgh_3 = unpack_lo_128_avx512(abcd_3, efgh_3);
2754+ __m512i abcdefgh_4 = unpack_hi_128_avx512(abcd_0, efgh_0);
2755+ __m512i abcdefgh_5 = unpack_hi_128_avx512(abcd_1, efgh_1);
2756+ __m512i abcdefgh_6 = unpack_hi_128_avx512(abcd_2, efgh_2);
2757+ __m512i abcdefgh_7 = unpack_hi_128_avx512(abcd_3, efgh_3);
2758+ __m512i ijklmnop_0 = unpack_lo_128_avx512(ijkl_0, mnop_0);
2759+ __m512i ijklmnop_1 = unpack_lo_128_avx512(ijkl_1, mnop_1);
2760+ __m512i ijklmnop_2 = unpack_lo_128_avx512(ijkl_2, mnop_2);
2761+ __m512i ijklmnop_3 = unpack_lo_128_avx512(ijkl_3, mnop_3);
2762+ __m512i ijklmnop_4 = unpack_hi_128_avx512(ijkl_0, mnop_0);
2763+ __m512i ijklmnop_5 = unpack_hi_128_avx512(ijkl_1, mnop_1);
2764+ __m512i ijklmnop_6 = unpack_hi_128_avx512(ijkl_2, mnop_2);
2765+ __m512i ijklmnop_7 = unpack_hi_128_avx512(ijkl_3, mnop_3);
2766+
2767+ vecs[0] = unpack_lo_128_avx512(abcdefgh_0, ijklmnop_0);
2768+ vecs[1] = unpack_lo_128_avx512(abcdefgh_1, ijklmnop_1);
2769+ vecs[2] = unpack_lo_128_avx512(abcdefgh_2, ijklmnop_2);
2770+ vecs[3] = unpack_lo_128_avx512(abcdefgh_3, ijklmnop_3);
2771+ vecs[4] = unpack_lo_128_avx512(abcdefgh_4, ijklmnop_4);
2772+ vecs[5] = unpack_lo_128_avx512(abcdefgh_5, ijklmnop_5);
2773+ vecs[6] = unpack_lo_128_avx512(abcdefgh_6, ijklmnop_6);
2774+ vecs[7] = unpack_lo_128_avx512(abcdefgh_7, ijklmnop_7);
2775+ vecs[8] = unpack_hi_128_avx512(abcdefgh_0, ijklmnop_0);
2776+ vecs[9] = unpack_hi_128_avx512(abcdefgh_1, ijklmnop_1);
2777+ vecs[10] = unpack_hi_128_avx512(abcdefgh_2, ijklmnop_2);
2778+ vecs[11] = unpack_hi_128_avx512(abcdefgh_3, ijklmnop_3);
2779+ vecs[12] = unpack_hi_128_avx512(abcdefgh_4, ijklmnop_4);
2780+ vecs[13] = unpack_hi_128_avx512(abcdefgh_5, ijklmnop_5);
2781+ vecs[14] = unpack_hi_128_avx512(abcdefgh_6, ijklmnop_6);
2782+ vecs[15] = unpack_hi_128_avx512(abcdefgh_7, ijklmnop_7);
2783+}
2784+
2785+static inline void
2786+transpose_msg_vecs16_avx512(const uint8_t *const *inputs, size_t block_offset, __m512i out_msg[16])
2787+{
2788+ size_t i;
2789+
2790+ out_msg[0] = loadu_512_avx512(&inputs[0][block_offset]);
2791+ out_msg[1] = loadu_512_avx512(&inputs[1][block_offset]);
2792+ out_msg[2] = loadu_512_avx512(&inputs[2][block_offset]);
2793+ out_msg[3] = loadu_512_avx512(&inputs[3][block_offset]);
2794+ out_msg[4] = loadu_512_avx512(&inputs[4][block_offset]);
2795+ out_msg[5] = loadu_512_avx512(&inputs[5][block_offset]);
2796+ out_msg[6] = loadu_512_avx512(&inputs[6][block_offset]);
2797+ out_msg[7] = loadu_512_avx512(&inputs[7][block_offset]);
2798+ out_msg[8] = loadu_512_avx512(&inputs[8][block_offset]);
2799+ out_msg[9] = loadu_512_avx512(&inputs[9][block_offset]);
2800+ out_msg[10] = loadu_512_avx512(&inputs[10][block_offset]);
2801+ out_msg[11] = loadu_512_avx512(&inputs[11][block_offset]);
2802+ out_msg[12] = loadu_512_avx512(&inputs[12][block_offset]);
2803+ out_msg[13] = loadu_512_avx512(&inputs[13][block_offset]);
2804+ out_msg[14] = loadu_512_avx512(&inputs[14][block_offset]);
2805+ out_msg[15] = loadu_512_avx512(&inputs[15][block_offset]);
2806+
2807+ for (i = 0; i < 16; i++) {
2808+ _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
2809+ }
2810+ transpose_vecs_512_avx512(out_msg);
2811+}
2812+
2813+static inline void
2814+load_counters16_avx512(uint64_t counter, int increment_counter, __m512i *out_lo, __m512i *out_hi)
2815+{
2816+ const __m512i mask = _mm512_set1_epi32(-increment_counter);
2817+ const __m512i deltas = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
2818+ const __m512i masked_deltas = _mm512_and_si512(deltas, mask);
2819+ const __m512i low_words = _mm512_add_epi32(_mm512_set1_epi32((int32_t)counter), masked_deltas);
2820+ const __m512i carries = _mm512_srli_epi32(_mm512_andnot_si512(low_words, _mm512_set1_epi32((int32_t)counter)), 31);
2821+ const __m512i high_words = _mm512_add_epi32(_mm512_set1_epi32((int32_t)(counter >> 32)), carries);
2822+
2823+ *out_lo = low_words;
2824+ *out_hi = high_words;
2825+}
2826+
2827+void
2828+blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out_bytes)
2829+{
2830+ __m512i h_vecs[8];
2831+ __m512i counter_low_vec, counter_high_vec;
2832+ uint8_t block_flags;
2833+ size_t block;
2834+
2835+ h_vecs[0] = set1_512_avx512(key[0]);
2836+ h_vecs[1] = set1_512_avx512(key[1]);
2837+ h_vecs[2] = set1_512_avx512(key[2]);
2838+ h_vecs[3] = set1_512_avx512(key[3]);
2839+ h_vecs[4] = set1_512_avx512(key[4]);
2840+ h_vecs[5] = set1_512_avx512(key[5]);
2841+ h_vecs[6] = set1_512_avx512(key[6]);
2842+ h_vecs[7] = set1_512_avx512(key[7]);
2843+
2844+ load_counters16_avx512(counter, increment_counter, &counter_low_vec, &counter_high_vec);
2845+ block_flags = flags | flags_start;
2846+
2847+ for (block = 0; block < blocks; block++) {
2848+ __m512i block_len_vec;
2849+ __m512i block_flags_vec;
2850+ __m512i msg_vecs[16];
2851+ __m512i v[16];
2852+
2853+ if (block + 1 == blocks) {
2854+ block_flags |= flags_end;
2855+ }
2856+ block_len_vec = set1_512_avx512(BLAKE3_BLOCK_LEN);
2857+ block_flags_vec = set1_512_avx512(block_flags);
2858+ transpose_msg_vecs16_avx512(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
2859+
2860+ v[0] = h_vecs[0];
2861+ v[1] = h_vecs[1];
2862+ v[2] = h_vecs[2];
2863+ v[3] = h_vecs[3];
2864+ v[4] = h_vecs[4];
2865+ v[5] = h_vecs[5];
2866+ v[6] = h_vecs[6];
2867+ v[7] = h_vecs[7];
2868+ v[8] = set1_512_avx512(IV[0]);
2869+ v[9] = set1_512_avx512(IV[1]);
2870+ v[10] = set1_512_avx512(IV[2]);
2871+ v[11] = set1_512_avx512(IV[3]);
2872+ v[12] = counter_low_vec;
2873+ v[13] = counter_high_vec;
2874+ v[14] = block_len_vec;
2875+ v[15] = block_flags_vec;
2876+
2877+ round_fn16_avx512(v, msg_vecs, 0);
2878+ round_fn16_avx512(v, msg_vecs, 1);
2879+ round_fn16_avx512(v, msg_vecs, 2);
2880+ round_fn16_avx512(v, msg_vecs, 3);
2881+ round_fn16_avx512(v, msg_vecs, 4);
2882+ round_fn16_avx512(v, msg_vecs, 5);
2883+ round_fn16_avx512(v, msg_vecs, 6);
2884+
2885+ h_vecs[0] = xor_512_avx512(v[0], v[8]);
2886+ h_vecs[1] = xor_512_avx512(v[1], v[9]);
2887+ h_vecs[2] = xor_512_avx512(v[2], v[10]);
2888+ h_vecs[3] = xor_512_avx512(v[3], v[11]);
2889+ h_vecs[4] = xor_512_avx512(v[4], v[12]);
2890+ h_vecs[5] = xor_512_avx512(v[5], v[13]);
2891+ h_vecs[6] = xor_512_avx512(v[6], v[14]);
2892+ h_vecs[7] = xor_512_avx512(v[7], v[15]);
2893+
2894+ block_flags = flags;
2895+ }
2896+
2897+ __m512i padded[16] = {
2898+ h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
2899+ h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
2900+ set1_512_avx512(0), set1_512_avx512(0), set1_512_avx512(0), set1_512_avx512(0),
2901+ set1_512_avx512(0), set1_512_avx512(0), set1_512_avx512(0), set1_512_avx512(0),
2902+ };
2903+ transpose_vecs_512_avx512(padded);
2904+ _mm256_mask_storeu_epi32(&out_bytes[0 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[0]));
2905+ _mm256_mask_storeu_epi32(&out_bytes[1 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[1]));
2906+ _mm256_mask_storeu_epi32(&out_bytes[2 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[2]));
2907+ _mm256_mask_storeu_epi32(&out_bytes[3 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[3]));
2908+ _mm256_mask_storeu_epi32(&out_bytes[4 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[4]));
2909+ _mm256_mask_storeu_epi32(&out_bytes[5 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[5]));
2910+ _mm256_mask_storeu_epi32(&out_bytes[6 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[6]));
2911+ _mm256_mask_storeu_epi32(&out_bytes[7 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[7]));
2912+ _mm256_mask_storeu_epi32(&out_bytes[8 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[8]));
2913+ _mm256_mask_storeu_epi32(&out_bytes[9 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[9]));
2914+ _mm256_mask_storeu_epi32(&out_bytes[10 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[10]));
2915+ _mm256_mask_storeu_epi32(&out_bytes[11 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[11]));
2916+ _mm256_mask_storeu_epi32(&out_bytes[12 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[12]));
2917+ _mm256_mask_storeu_epi32(&out_bytes[13 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[13]));
2918+ _mm256_mask_storeu_epi32(&out_bytes[14 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[14]));
2919+ _mm256_mask_storeu_epi32(&out_bytes[15 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[15]));
2920+}
2921+
2922+static inline void
2923+transpose_msg_vecs8_avx512(const uint8_t *const *inputs, size_t block_offset, __m256i out_msg[16])
2924+{
2925+ out_msg[0] = loadu_256_avx512(&inputs[0][block_offset]);
2926+ out_msg[1] = loadu_256_avx512(&inputs[1][block_offset]);
2927+ out_msg[2] = loadu_256_avx512(&inputs[2][block_offset]);
2928+ out_msg[3] = loadu_256_avx512(&inputs[3][block_offset]);
2929+ out_msg[4] = loadu_256_avx512(&inputs[4][block_offset]);
2930+ out_msg[5] = loadu_256_avx512(&inputs[5][block_offset]);
2931+ out_msg[6] = loadu_256_avx512(&inputs[6][block_offset]);
2932+ out_msg[7] = loadu_256_avx512(&inputs[7][block_offset]);
2933+ out_msg[8] = loadu_256_avx512(&inputs[0][block_offset + 32]);
2934+ out_msg[9] = loadu_256_avx512(&inputs[1][block_offset + 32]);
2935+ out_msg[10] = loadu_256_avx512(&inputs[2][block_offset + 32]);
2936+ out_msg[11] = loadu_256_avx512(&inputs[3][block_offset + 32]);
2937+ out_msg[12] = loadu_256_avx512(&inputs[4][block_offset + 32]);
2938+ out_msg[13] = loadu_256_avx512(&inputs[5][block_offset + 32]);
2939+ out_msg[14] = loadu_256_avx512(&inputs[6][block_offset + 32]);
2940+ out_msg[15] = loadu_256_avx512(&inputs[7][block_offset + 32]);
2941+
2942+ transpose_vecs_avx2(out_msg);
2943+}
2944+
2945+static inline void
2946+load_counters8_avx512(uint64_t counter, int increment_counter, __m256i *out_lo, __m256i *out_hi)
2947+{
2948+ const __m256i mask = _mm256_set1_epi32(-increment_counter);
2949+ const __m256i add0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
2950+ const __m256i add1 = _mm256_and_si256(mask, add0);
2951+ __m256i l = _mm256_add_epi32(_mm256_set1_epi32((int32_t)counter), add1);
2952+ __m256i carry = _mm256_srli_epi32(_mm256_andnot_si256(l, _mm256_set1_epi32((int32_t)counter)), 31);
2953+ __m256i h = _mm256_add_epi32(_mm256_set1_epi32((int32_t)(counter >> 32)), carry);
2954+
2955+ *out_lo = l;
2956+ *out_hi = h;
2957+}
2958+
2959+void
2960+blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out_bytes)
2961+{
2962+ __m256i h_vecs[8];
2963+ __m256i counter_low_vec, counter_high_vec;
2964+ uint8_t block_flags;
2965+ size_t block;
2966+
2967+ h_vecs[0] = set1_256_avx512(key[0]);
2968+ h_vecs[1] = set1_256_avx512(key[1]);
2969+ h_vecs[2] = set1_256_avx512(key[2]);
2970+ h_vecs[3] = set1_256_avx512(key[3]);
2971+ h_vecs[4] = set1_256_avx512(key[4]);
2972+ h_vecs[5] = set1_256_avx512(key[5]);
2973+ h_vecs[6] = set1_256_avx512(key[6]);
2974+ h_vecs[7] = set1_256_avx512(key[7]);
2975+
2976+ load_counters8_avx512(counter, increment_counter, &counter_low_vec, &counter_high_vec);
2977+ block_flags = flags | flags_start;
2978+
2979+ for (block = 0; block < blocks; block++) {
2980+ __m256i block_len_vec;
2981+ __m256i block_flags_vec;
2982+ __m256i msg_vecs[16];
2983+ __m256i v[16];
2984+
2985+ if (block + 1 == blocks) {
2986+ block_flags |= flags_end;
2987+ }
2988+ block_len_vec = set1_256_avx512(BLAKE3_BLOCK_LEN);
2989+ block_flags_vec = set1_256_avx512(block_flags);
2990+ transpose_msg_vecs8_avx512(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
2991+
2992+ v[0] = h_vecs[0];
2993+ v[1] = h_vecs[1];
2994+ v[2] = h_vecs[2];
2995+ v[3] = h_vecs[3];
2996+ v[4] = h_vecs[4];
2997+ v[5] = h_vecs[5];
2998+ v[6] = h_vecs[6];
2999+ v[7] = h_vecs[7];
3000+ v[8] = set1_256_avx512(IV[0]);
3001+ v[9] = set1_256_avx512(IV[1]);
3002+ v[10] = set1_256_avx512(IV[2]);
3003+ v[11] = set1_256_avx512(IV[3]);
3004+ v[12] = counter_low_vec;
3005+ v[13] = counter_high_vec;
3006+ v[14] = block_len_vec;
3007+ v[15] = block_flags_vec;
3008+
3009+ round_fn8_avx512(v, msg_vecs, 0);
3010+ round_fn8_avx512(v, msg_vecs, 1);
3011+ round_fn8_avx512(v, msg_vecs, 2);
3012+ round_fn8_avx512(v, msg_vecs, 3);
3013+ round_fn8_avx512(v, msg_vecs, 4);
3014+ round_fn8_avx512(v, msg_vecs, 5);
3015+ round_fn8_avx512(v, msg_vecs, 6);
3016+
3017+ h_vecs[0] = xor_256_avx512(v[0], v[8]);
3018+ h_vecs[1] = xor_256_avx512(v[1], v[9]);
3019+ h_vecs[2] = xor_256_avx512(v[2], v[10]);
3020+ h_vecs[3] = xor_256_avx512(v[3], v[11]);
3021+ h_vecs[4] = xor_256_avx512(v[4], v[12]);
3022+ h_vecs[5] = xor_256_avx512(v[5], v[13]);
3023+ h_vecs[6] = xor_256_avx512(v[6], v[14]);
3024+ h_vecs[7] = xor_256_avx512(v[7], v[15]);
3025+
3026+ block_flags = flags;
3027+ }
3028+
3029+ transpose_vecs_avx2(h_vecs);
3030+ storeu_256_avx512(h_vecs[0], &out_bytes[0 * sizeof(__m256i)]);
3031+ storeu_256_avx512(h_vecs[1], &out_bytes[1 * sizeof(__m256i)]);
3032+ storeu_256_avx512(h_vecs[2], &out_bytes[2 * sizeof(__m256i)]);
3033+ storeu_256_avx512(h_vecs[3], &out_bytes[3 * sizeof(__m256i)]);
3034+ storeu_256_avx512(h_vecs[4], &out_bytes[4 * sizeof(__m256i)]);
3035+ storeu_256_avx512(h_vecs[5], &out_bytes[5 * sizeof(__m256i)]);
3036+ storeu_256_avx512(h_vecs[6], &out_bytes[6 * sizeof(__m256i)]);
3037+ storeu_256_avx512(h_vecs[7], &out_bytes[7 * sizeof(__m256i)]);
3038+}
3039+
3040+static inline void
3041+transpose_msg_vecs4_avx512(const uint8_t *const *inputs, size_t block_offset, __m128i out_msg[16])
3042+{
3043+ out_msg[0] = loadu_128_avx512(&inputs[0][block_offset]);
3044+ out_msg[1] = loadu_128_avx512(&inputs[1][block_offset]);
3045+ out_msg[2] = loadu_128_avx512(&inputs[2][block_offset]);
3046+ out_msg[3] = loadu_128_avx512(&inputs[3][block_offset]);
3047+ out_msg[4] = loadu_128_avx512(&inputs[0][block_offset + 16]);
3048+ out_msg[5] = loadu_128_avx512(&inputs[1][block_offset + 16]);
3049+ out_msg[6] = loadu_128_avx512(&inputs[2][block_offset + 16]);
3050+ out_msg[7] = loadu_128_avx512(&inputs[3][block_offset + 16]);
3051+ out_msg[8] = loadu_128_avx512(&inputs[0][block_offset + 32]);
3052+ out_msg[9] = loadu_128_avx512(&inputs[1][block_offset + 32]);
3053+ out_msg[10] = loadu_128_avx512(&inputs[2][block_offset + 32]);
3054+ out_msg[11] = loadu_128_avx512(&inputs[3][block_offset + 32]);
3055+ out_msg[12] = loadu_128_avx512(&inputs[0][block_offset + 48]);
3056+ out_msg[13] = loadu_128_avx512(&inputs[1][block_offset + 48]);
3057+ out_msg[14] = loadu_128_avx512(&inputs[2][block_offset + 48]);
3058+ out_msg[15] = loadu_128_avx512(&inputs[3][block_offset + 48]);
3059+
3060+ transpose_vecs_sse2(out_msg);
3061+}
3062+
3063+static inline void
3064+load_counters4_avx512(uint64_t counter, int increment_counter, __m128i *out_lo, __m128i *out_hi)
3065+{
3066+ const __m128i mask = _mm_set1_epi32(-increment_counter);
3067+ const __m128i add0 = _mm_set_epi32(3, 2, 1, 0);
3068+ const __m128i add1 = _mm_and_si128(mask, add0);
3069+ __m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1);
3070+ __m128i carry = _mm_srli_epi32(_mm_andnot_si128(l, _mm_set1_epi32((int32_t)counter)), 31);
3071+ __m128i h = _mm_add_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry);
3072+
3073+ *out_lo = l;
3074+ *out_hi = h;
3075+}
3076+
3077+void
3078+blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out_bytes)
3079+{
3080+ __m128i h_vecs[8];
3081+ __m128i counter_low_vec, counter_high_vec;
3082+ uint8_t block_flags;
3083+ size_t block;
3084+
3085+ h_vecs[0] = set1_128_avx512(key[0]);
3086+ h_vecs[1] = set1_128_avx512(key[1]);
3087+ h_vecs[2] = set1_128_avx512(key[2]);
3088+ h_vecs[3] = set1_128_avx512(key[3]);
3089+ h_vecs[4] = set1_128_avx512(key[4]);
3090+ h_vecs[5] = set1_128_avx512(key[5]);
3091+ h_vecs[6] = set1_128_avx512(key[6]);
3092+ h_vecs[7] = set1_128_avx512(key[7]);
3093+
3094+ load_counters4_avx512(counter, increment_counter, &counter_low_vec, &counter_high_vec);
3095+ block_flags = flags | flags_start;
3096+
3097+ for (block = 0; block < blocks; block++) {
3098+ __m128i block_len_vec;
3099+ __m128i block_flags_vec;
3100+ __m128i msg_vecs[16];
3101+ __m128i v[16];
3102+
3103+ if (block + 1 == blocks) {
3104+ block_flags |= flags_end;
3105+ }
3106+ block_len_vec = set1_128_avx512(BLAKE3_BLOCK_LEN);
3107+ block_flags_vec = set1_128_avx512(block_flags);
3108+ transpose_msg_vecs4_avx512(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
3109+
3110+ v[0] = h_vecs[0];
3111+ v[1] = h_vecs[1];
3112+ v[2] = h_vecs[2];
3113+ v[3] = h_vecs[3];
3114+ v[4] = h_vecs[4];
3115+ v[5] = h_vecs[5];
3116+ v[6] = h_vecs[6];
3117+ v[7] = h_vecs[7];
3118+ v[8] = set1_128_avx512(IV[0]);
3119+ v[9] = set1_128_avx512(IV[1]);
3120+ v[10] = set1_128_avx512(IV[2]);
3121+ v[11] = set1_128_avx512(IV[3]);
3122+ v[12] = counter_low_vec;
3123+ v[13] = counter_high_vec;
3124+ v[14] = block_len_vec;
3125+ v[15] = block_flags_vec;
3126+
3127+ round_fn4_avx512(v, msg_vecs, 0);
3128+ round_fn4_avx512(v, msg_vecs, 1);
3129+ round_fn4_avx512(v, msg_vecs, 2);
3130+ round_fn4_avx512(v, msg_vecs, 3);
3131+ round_fn4_avx512(v, msg_vecs, 4);
3132+ round_fn4_avx512(v, msg_vecs, 5);
3133+ round_fn4_avx512(v, msg_vecs, 6);
3134+
3135+ h_vecs[0] = xor_128_avx512(v[0], v[8]);
3136+ h_vecs[1] = xor_128_avx512(v[1], v[9]);
3137+ h_vecs[2] = xor_128_avx512(v[2], v[10]);
3138+ h_vecs[3] = xor_128_avx512(v[3], v[11]);
3139+ h_vecs[4] = xor_128_avx512(v[4], v[12]);
3140+ h_vecs[5] = xor_128_avx512(v[5], v[13]);
3141+ h_vecs[6] = xor_128_avx512(v[6], v[14]);
3142+ h_vecs[7] = xor_128_avx512(v[7], v[15]);
3143+
3144+ block_flags = flags;
3145+ }
3146+
3147+ transpose_vecs_sse2(h_vecs);
3148+ storeu_128_avx512(h_vecs[0], &out_bytes[0 * sizeof(__m128i)]);
3149+ storeu_128_avx512(h_vecs[1], &out_bytes[2 * sizeof(__m128i)]);
3150+ storeu_128_avx512(h_vecs[2], &out_bytes[4 * sizeof(__m128i)]);
3151+ storeu_128_avx512(h_vecs[3], &out_bytes[6 * sizeof(__m128i)]);
3152+ storeu_128_avx512(h_vecs[4], &out_bytes[1 * sizeof(__m128i)]);
3153+ storeu_128_avx512(h_vecs[5], &out_bytes[3 * sizeof(__m128i)]);
3154+ storeu_128_avx512(h_vecs[6], &out_bytes[5 * sizeof(__m128i)]);
3155+ storeu_128_avx512(h_vecs[7], &out_bytes[7 * sizeof(__m128i)]);
3156+}
3157+
3158+static inline void
3159+hash_one_avx512(const uint8_t *input, size_t blocks, const uint32_t key[8], uint64_t counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t out_bytes[BLAKE3_OUT_LEN])
3160+{
3161+ uint32_t cv[8];
3162+ uint8_t block_flags;
3163+
3164+ memcpy(cv, key, BLAKE3_KEY_LEN);
3165+ block_flags = flags | flags_start;
3166+ while (blocks > 0) {
3167+ if (blocks == 1) {
3168+ block_flags |= flags_end;
3169+ }
3170+ blake3_compress_in_place_avx512(cv, input, BLAKE3_BLOCK_LEN, counter, block_flags);
3171+ input = &input[BLAKE3_BLOCK_LEN];
3172+ blocks -= 1;
3173+ block_flags = flags;
3174+ }
3175+ memcpy(out_bytes, cv, BLAKE3_OUT_LEN);
3176+}
3177+
3178+void
3179+blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out_bytes)
3180+{
3181+ while (num_inputs >= 16) {
3182+ blake3_hash16_avx512(inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out_bytes);
3183+ if (increment_counter) {
3184+ counter += 16;
3185+ }
3186+ inputs += 16;
3187+ num_inputs -= 16;
3188+ out_bytes = &out_bytes[16 * BLAKE3_OUT_LEN];
3189+ }
3190+ while (num_inputs >= 8) {
3191+ blake3_hash8_avx512(inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out_bytes);
3192+ if (increment_counter) {
3193+ counter += 8;
3194+ }
3195+ inputs += 8;
3196+ num_inputs -= 8;
3197+ out_bytes = &out_bytes[8 * BLAKE3_OUT_LEN];
3198+ }
3199+ while (num_inputs >= 4) {
3200+ blake3_hash4_avx512(inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out_bytes);
3201+ if (increment_counter) {
3202+ counter += 4;
3203+ }
3204+ inputs += 4;
3205+ num_inputs -= 4;
3206+ out_bytes = &out_bytes[4 * BLAKE3_OUT_LEN];
3207+ }
3208+ while (num_inputs > 0) {
3209+ hash_one_avx512(inputs[0], blocks, key, counter, flags, flags_start, flags_end, out_bytes);
3210+ if (increment_counter) {
3211+ counter += 1;
3212+ }
3213+ inputs += 1;
3214+ num_inputs -= 1;
3215+ out_bytes = &out_bytes[BLAKE3_OUT_LEN];
3216+ }
3217+}
3218+#pragma GCC pop_options
3219+#endif
3220+
3221+/* dispatch functions */
3222+void
3223+blake3_compress_in_place(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
3224+{
3225+#if defined(__x86_64__)
3226+ if (!blake3_cpu_detected)
3227+ blake3_detect_cpu_features();
3228+ if (blake3_cpu_features & AVX512) {
3229+ blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
3230+ return;
3231+ }
3232+ if (blake3_cpu_features & SSE41) {
3233+ blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
3234+ return;
3235+ }
3236+ if (blake3_cpu_features & SSE2) {
3237+ blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
3238+ return;
3239+ }
3240+#endif
3241+ blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
3242+}
3243+
3244+void
3245+blake3_compress_xof(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64])
3246+{
3247+#if defined(__x86_64__)
3248+ if (!blake3_cpu_detected)
3249+ blake3_detect_cpu_features();
3250+ if (blake3_cpu_features & AVX512) {
3251+ blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
3252+ return;
3253+ }
3254+ if (blake3_cpu_features & SSE41) {
3255+ blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
3256+ return;
3257+ }
3258+ if (blake3_cpu_features & SSE2) {
3259+ blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
3260+ return;
3261+ }
3262+#endif
3263+ blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
3264+}
3265+
3266+void
3267+blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out)
3268+{
3269+#if defined(__x86_64__)
3270+ if (!blake3_cpu_detected)
3271+ blake3_detect_cpu_features();
3272+ if (blake3_cpu_features & AVX512) {
3273+ blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out);
3274+ return;
3275+ }
3276+ if (blake3_cpu_features & AVX2) {
3277+ blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out);
3278+ return;
3279+ }
3280+ if (blake3_cpu_features & SSE41) {
3281+ blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out);
3282+ return;
3283+ }
3284+ if (blake3_cpu_features & SSE2) {
3285+ blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out);
3286+ return;
3287+ }
3288+#endif
3289+ blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out);
3290+}
3291+
3292+size_t
3293+blake3_simd_degree(void)
3294+{
3295+#if defined(__x86_64__)
3296+ if (!blake3_cpu_detected)
3297+ blake3_detect_cpu_features();
3298+ if (blake3_cpu_features & AVX512)
3299+ return 16;
3300+ if (blake3_cpu_features & AVX2)
3301+ return 8;
3302+ if (blake3_cpu_features & SSE41)
3303+ return 4;
3304+ if (blake3_cpu_features & SSE2)
3305+ return 4;
3306+#endif
3307+ return 1;
3308+}
3309+
3310+/* core hasher implementation */
3311+const char *
3312+blake3_version(void)
3313+{
3314+ return BLAKE3_VERSION_STRING;
3315+}
3316+
3317+static inline void
3318+chunk_state_init(struct Blake3ChunkState *self, const uint32_t key[8], uint8_t flags)
3319+{
3320+ memcpy(self->cv, key, BLAKE3_KEY_LEN);
3321+ self->chunk_counter = 0;
3322+ memset(self->buf, 0, BLAKE3_BLOCK_LEN);
3323+ self->buf_len = 0;
3324+ self->blocks_compressed = 0;
3325+ self->flags = flags;
3326+}
3327+
3328+static inline void
3329+chunk_state_reset(struct Blake3ChunkState *self, const uint32_t key[8], uint64_t chunk_counter)
3330+{
3331+ memcpy(self->cv, key, BLAKE3_KEY_LEN);
3332+ self->chunk_counter = chunk_counter;
3333+ self->blocks_compressed = 0;
3334+ memset(self->buf, 0, BLAKE3_BLOCK_LEN);
3335+ self->buf_len = 0;
3336+}
3337+
3338+static inline size_t
3339+chunk_state_len(const struct Blake3ChunkState *self)
3340+{
3341+ return (BLAKE3_BLOCK_LEN * (size_t)self->blocks_compressed) + ((size_t)self->buf_len);
3342+}
3343+
3344+static inline size_t
3345+chunk_state_fill_buf(struct Blake3ChunkState *self, const uint8_t *input, size_t input_len)
3346+{
3347+ size_t take = BLAKE3_BLOCK_LEN - ((size_t)self->buf_len);
3348+ uint8_t *dest;
3349+
3350+ if (take > input_len) {
3351+ take = input_len;
3352+ }
3353+ dest = self->buf + ((size_t)self->buf_len);
3354+ memcpy(dest, input, take);
3355+ self->buf_len += (uint8_t)take;
3356+ return take;
3357+}
3358+
3359+static inline uint8_t
3360+chunk_state_maybe_start_flag(const struct Blake3ChunkState *self)
3361+{
3362+ if (self->blocks_compressed == 0) {
3363+ return CHUNK_START;
3364+ } else {
3365+ return 0;
3366+ }
3367+}
3368+
3369+static inline struct Output
3370+make_output(const uint32_t input_cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
3371+{
3372+ struct Output ret;
3373+
3374+ memcpy(ret.input_cv, input_cv, 32);
3375+ memcpy(ret.block, block, BLAKE3_BLOCK_LEN);
3376+ ret.block_len = block_len;
3377+ ret.counter = counter;
3378+ ret.flags = flags;
3379+ return ret;
3380+}
3381+
3382+static inline void
3383+output_chaining_value(const struct Output *self, uint8_t cv[32])
3384+{
3385+ uint32_t cv_words[8];
3386+
3387+ memcpy(cv_words, self->input_cv, 32);
3388+ blake3_compress_in_place(cv_words, self->block, self->block_len, self->counter, self->flags);
3389+ store_cv_words(cv, cv_words);
3390+}
3391+
3392+static inline void
3393+output_root_bytes(const struct Output *self, uint64_t seek, uint8_t *out, size_t out_len)
3394+{
3395+ uint64_t output_block_counter = seek / 64;
3396+ size_t offset_within_block = seek % 64;
3397+ uint8_t wide_buf[64];
3398+ size_t available_bytes;
3399+ size_t memcpy_len;
3400+
3401+ while (out_len > 0) {
3402+ blake3_compress_xof(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, wide_buf);
3403+ available_bytes = 64 - offset_within_block;
3404+ if (out_len > available_bytes) {
3405+ memcpy_len = available_bytes;
3406+ } else {
3407+ memcpy_len = out_len;
3408+ }
3409+ memcpy(out, wide_buf + offset_within_block, memcpy_len);
3410+ out += memcpy_len;
3411+ out_len -= memcpy_len;
3412+ output_block_counter += 1;
3413+ offset_within_block = 0;
3414+ }
3415+}
3416+
3417+static inline void
3418+chunk_state_update(struct Blake3ChunkState *self, const uint8_t *input, size_t input_len)
3419+{
3420+ size_t take;
3421+
3422+ if (self->buf_len > 0) {
3423+ take = chunk_state_fill_buf(self, input, input_len);
3424+ input += take;
3425+ input_len -= take;
3426+ if (input_len > 0) {
3427+ blake3_compress_in_place(self->cv, self->buf, BLAKE3_BLOCK_LEN, self->chunk_counter, self->flags | chunk_state_maybe_start_flag(self));
3428+ self->blocks_compressed += 1;
3429+ self->buf_len = 0;
3430+ memset(self->buf, 0, BLAKE3_BLOCK_LEN);
3431+ }
3432+ }
3433+
3434+ while (input_len > BLAKE3_BLOCK_LEN) {
3435+ blake3_compress_in_place(self->cv, input, BLAKE3_BLOCK_LEN, self->chunk_counter, self->flags | chunk_state_maybe_start_flag(self));
3436+ self->blocks_compressed += 1;
3437+ input += BLAKE3_BLOCK_LEN;
3438+ input_len -= BLAKE3_BLOCK_LEN;
3439+ }
3440+
3441+ take = chunk_state_fill_buf(self, input, input_len);
3442+ input += take;
3443+ input_len -= take;
3444+}
3445+
3446+static inline struct Output
3447+chunk_state_output(const struct Blake3ChunkState *self)
3448+{
3449+ uint8_t block_flags = self->flags | chunk_state_maybe_start_flag(self) | CHUNK_END;
3450+
3451+ return make_output(self->cv, self->buf, self->buf_len, self->chunk_counter, block_flags);
3452+}
3453+
3454+static inline struct Output
3455+parent_output(const uint8_t block[BLAKE3_BLOCK_LEN], const uint32_t key[8], uint8_t flags)
3456+{
3457+ return make_output(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT);
3458+}
3459+
3460+static unsigned int
3461+highest_one(uint64_t x)
3462+{
3463+#if defined(__GNUC__) || defined(__clang__)
3464+ return 63 ^ __builtin_clzll(x);
3465+#else
3466+ unsigned int c = 0;
3467+ if (x & 0xffffffff00000000ULL) { x >>= 32; c += 32; }
3468+ if (x & 0x00000000ffff0000ULL) { x >>= 16; c += 16; }
3469+ if (x & 0x000000000000ff00ULL) { x >>= 8; c += 8; }
3470+ if (x & 0x00000000000000f0ULL) { x >>= 4; c += 4; }
3471+ if (x & 0x000000000000000cULL) { x >>= 2; c += 2; }
3472+ if (x & 0x0000000000000002ULL) { c += 1; }
3473+ return c;
3474+#endif
3475+}
3476+
3477+static inline uint64_t
3478+round_down_to_power_of_2(uint64_t x)
3479+{
3480+ return 1ULL << highest_one(x | 1);
3481+}
3482+
3483+static inline size_t
3484+left_len(size_t content_len)
3485+{
3486+ size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN;
3487+
3488+ return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN;
3489+}
3490+
3491+static inline size_t
3492+compress_chunks_parallel(const uint8_t *input, size_t input_len, const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t *out)
3493+{
3494+ const uint8_t *chunks_array[MAX_SIMD_DEGREE];
3495+ size_t input_position = 0;
3496+ size_t chunks_array_len = 0;
3497+
3498+ assert(0 < input_len);
3499+ assert(input_len <= MAX_SIMD_DEGREE * BLAKE3_CHUNK_LEN);
3500+
3501+ while (input_len - input_position >= BLAKE3_CHUNK_LEN) {
3502+ chunks_array[chunks_array_len] = &input[input_position];
3503+ input_position += BLAKE3_CHUNK_LEN;
3504+ chunks_array_len += 1;
3505+ }
3506+
3507+ blake3_hash_many(chunks_array, chunks_array_len, BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key, chunk_counter, 1, flags, CHUNK_START, CHUNK_END, out);
3508+
3509+ if (input_len > input_position) {
3510+ uint64_t counter = chunk_counter + (uint64_t)chunks_array_len;
3511+ struct Blake3ChunkState chunk_state;
3512+ struct Output output;
3513+
3514+ chunk_state_init(&chunk_state, key, flags);
3515+ chunk_state.chunk_counter = counter;
3516+ chunk_state_update(&chunk_state, &input[input_position], input_len - input_position);
3517+ output = chunk_state_output(&chunk_state);
3518+ output_chaining_value(&output, &out[chunks_array_len * BLAKE3_OUT_LEN]);
3519+ return chunks_array_len + 1;
3520+ } else {
3521+ return chunks_array_len;
3522+ }
3523+}
3524+
3525+static inline size_t
3526+compress_parents_parallel(const uint8_t *child_chaining_values, size_t num_chaining_values, const uint32_t key[8], uint8_t flags, uint8_t *out)
3527+{
3528+ const uint8_t *parents_array[MAX_SIMD_DEGREE_OR_2];
3529+ size_t parents_array_len = 0;
3530+
3531+ assert(2 <= num_chaining_values);
3532+ assert(num_chaining_values <= 2 * MAX_SIMD_DEGREE_OR_2);
3533+
3534+ while (num_chaining_values - (2 * parents_array_len) >= 2) {
3535+ parents_array[parents_array_len] = &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN];
3536+ parents_array_len += 1;
3537+ }
3538+
3539+ blake3_hash_many(parents_array, parents_array_len, 1, key, 0, 0, flags | PARENT, 0, 0, out);
3540+
3541+ if (num_chaining_values > 2 * parents_array_len) {
3542+ memcpy(&out[parents_array_len * BLAKE3_OUT_LEN], &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN], BLAKE3_OUT_LEN);
3543+ return parents_array_len + 1;
3544+ } else {
3545+ return parents_array_len;
3546+ }
3547+}
3548+
3549+static inline size_t
3550+blake3_compress_subtree_wide(const uint8_t *input, size_t input_len, const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t *out)
3551+{
3552+ size_t degree;
3553+
3554+ if (input_len <= (size_t)blake3_simd_degree() * BLAKE3_CHUNK_LEN) {
3555+ return compress_chunks_parallel(input, input_len, key, chunk_counter, flags, out);
3556+ }
3557+
3558+ degree = blake3_simd_degree();
3559+ if (degree > 1) {
3560+ size_t child_len = round_down_to_power_of_2(input_len - 1) / degree;
3561+ size_t cvs_written = 0;
3562+
3563+ if (child_len < BLAKE3_CHUNK_LEN) {
3564+ child_len = BLAKE3_CHUNK_LEN;
3565+ }
3566+
3567+ while (input_len > 0) {
3568+ size_t take = child_len;
3569+ size_t sub_cvs;
3570+
3571+ if (take > input_len) {
3572+ take = input_len;
3573+ }
3574+ sub_cvs = blake3_compress_subtree_wide(input, take, key, chunk_counter, flags, &out[cvs_written * BLAKE3_OUT_LEN]);
3575+ cvs_written += sub_cvs;
3576+ chunk_counter += take / BLAKE3_CHUNK_LEN;
3577+ input += take;
3578+ input_len -= take;
3579+ }
3580+
3581+ while (cvs_written > 2) {
3582+ cvs_written = compress_parents_parallel(out, cvs_written, key, flags, out);
3583+ }
3584+ return cvs_written;
3585+ } else {
3586+ /* fallback when simd degree is 1 */
3587+ size_t left = left_len(input_len);
3588+ size_t right = input_len - left;
3589+ size_t left_cvs = blake3_compress_subtree_wide(input, left, key, chunk_counter, flags, out);
3590+ size_t right_cvs = blake3_compress_subtree_wide(input + left, right, key, chunk_counter + (left / BLAKE3_CHUNK_LEN), flags, &out[left_cvs * BLAKE3_OUT_LEN]);
3591+
3592+ return left_cvs + right_cvs;
3593+ }
3594+}
3595+
3596+static void
3597+compress_subtree_to_parent_node(const uint8_t *input, size_t input_len, const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN])
3598+{
3599+ uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
3600+ size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key, chunk_counter, flags, cv_array);
3601+
3602+ assert(num_cvs >= 2);
3603+ while (num_cvs > 2) {
3604+ num_cvs = compress_parents_parallel(cv_array, num_cvs, key, flags, cv_array);
3605+ }
3606+ memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
3607+}
3608+
3609+static inline void
3610+hasher_init_base(struct Blake3Hasher *self, const uint32_t key[8], uint8_t flags)
3611+{
3612+ memcpy(self->key, key, BLAKE3_KEY_LEN);
3613+ chunk_state_init(&self->chunk, key, flags);
3614+ self->cv_stack_len = 0;
3615+}
3616+
3617+void
3618+blake3_hasher_init(struct Blake3Hasher *self)
3619+{
3620+ if (!blake3_cpu_detected)
3621+ blake3_detect_cpu_features();
3622+ hasher_init_base(self, IV, 0);
3623+}
3624+
3625+static inline void
3626+load_key_words(const uint8_t key[BLAKE3_KEY_LEN], uint32_t key_words[8])
3627+{
3628+ key_words[0] = load32(&key[0 * 4]);
3629+ key_words[1] = load32(&key[1 * 4]);
3630+ key_words[2] = load32(&key[2 * 4]);
3631+ key_words[3] = load32(&key[3 * 4]);
3632+ key_words[4] = load32(&key[4 * 4]);
3633+ key_words[5] = load32(&key[5 * 4]);
3634+ key_words[6] = load32(&key[6 * 4]);
3635+ key_words[7] = load32(&key[7 * 4]);
3636+}
3637+
3638+void
3639+blake3_hasher_init_keyed(struct Blake3Hasher *self, const uint8_t key[BLAKE3_KEY_LEN])
3640+{
3641+ uint32_t key_words[8];
3642+
3643+ load_key_words(key, key_words);
3644+ hasher_init_base(self, key_words, KEYED_HASH);
3645+}
3646+
3647+void
3648+blake3_hasher_init_derive_key_raw(struct Blake3Hasher *self, const void *context, size_t context_len)
3649+{
3650+ struct Blake3Hasher context_hasher;
3651+ uint8_t context_key[BLAKE3_KEY_LEN];
3652+ uint32_t context_key_words[8];
3653+
3654+ hasher_init_base(&context_hasher, IV, DERIVE_KEY_CONTEXT);
3655+ blake3_hasher_update(&context_hasher, context, context_len);
3656+ blake3_hasher_finalize(&context_hasher, context_key, BLAKE3_KEY_LEN);
3657+ load_key_words(context_key, context_key_words);
3658+ hasher_init_base(self, context_key_words, DERIVE_KEY_MATERIAL);
3659+}
3660+
3661+static inline unsigned int
3662+popcnt(uint64_t x)
3663+{
3664+#if defined(__GNUC__) || defined(__clang__)
3665+ return __builtin_popcountll(x);
3666+#else
3667+ unsigned int count = 0;
3668+ while (x != 0) {
3669+ count += 1;
3670+ x &= x - 1;
3671+ }
3672+ return count;
3673+#endif
3674+}
3675+
3676+void
3677+blake3_hasher_init_derive_key(struct Blake3Hasher *self, const char *context)
3678+{
3679+ blake3_hasher_init_derive_key_raw(self, context, strlen(context));
3680+}
3681+
3682+static inline void
3683+hasher_merge_cv_stack(struct Blake3Hasher *self, uint64_t total_len)
3684+{
3685+ size_t post_merge_stack_len = (size_t)popcnt(total_len);
3686+
3687+ while (self->cv_stack_len > post_merge_stack_len) {
3688+ uint8_t *parent_node = &self->cv_stack[(self->cv_stack_len - 2) * BLAKE3_OUT_LEN];
3689+ struct Output output = parent_output(parent_node, self->key, self->chunk.flags);
3690+
3691+ output_chaining_value(&output, parent_node);
3692+ self->cv_stack_len -= 1;
3693+ }
3694+}
3695+
3696+static inline void
3697+hasher_push_cv(struct Blake3Hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN], uint64_t chunk_counter)
3698+{
3699+ hasher_merge_cv_stack(self, chunk_counter);
3700+ memcpy(&self->cv_stack[self->cv_stack_len * BLAKE3_OUT_LEN], new_cv, BLAKE3_OUT_LEN);
3701+ self->cv_stack_len += 1;
3702+}
3703+
3704+void
3705+blake3_hasher_update(struct Blake3Hasher *self, const void *input, size_t input_len)
3706+{
3707+ const uint8_t *input_bytes;
3708+
3709+ if (input_len == 0) {
3710+ return;
3711+ }
3712+
3713+ input_bytes = (const uint8_t *)input;
3714+
3715+ if (chunk_state_len(&self->chunk) > 0) {
3716+ size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&self->chunk);
3717+
3718+ if (take > input_len) {
3719+ take = input_len;
3720+ }
3721+ chunk_state_update(&self->chunk, input_bytes, take);
3722+ input_bytes += take;
3723+ input_len -= take;
3724+ if (input_len > 0) {
3725+ struct Output output = chunk_state_output(&self->chunk);
3726+ uint8_t chunk_cv[32];
3727+
3728+ output_chaining_value(&output, chunk_cv);
3729+ hasher_push_cv(self, chunk_cv, self->chunk.chunk_counter);
3730+ chunk_state_reset(&self->chunk, self->key, self->chunk.chunk_counter + 1);
3731+ } else {
3732+ return;
3733+ }
3734+ }
3735+
3736+ while (input_len > BLAKE3_CHUNK_LEN) {
3737+ size_t subtree_len = round_down_to_power_of_2(input_len);
3738+ uint64_t count_so_far = self->chunk.chunk_counter * BLAKE3_CHUNK_LEN;
3739+ uint64_t subtree_chunks;
3740+
3741+ while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) {
3742+ subtree_len /= 2;
3743+ }
3744+ subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN;
3745+ if (subtree_len <= BLAKE3_CHUNK_LEN) {
3746+ struct Blake3ChunkState chunk_state;
3747+ struct Output output;
3748+ uint8_t cv[BLAKE3_OUT_LEN];
3749+
3750+ chunk_state_init(&chunk_state, self->key, self->chunk.flags);
3751+ chunk_state.chunk_counter = self->chunk.chunk_counter;
3752+ chunk_state_update(&chunk_state, input_bytes, subtree_len);
3753+ output = chunk_state_output(&chunk_state);
3754+ output_chaining_value(&output, cv);
3755+ hasher_push_cv(self, cv, chunk_state.chunk_counter);
3756+ } else {
3757+ uint8_t cv_pair[2 * BLAKE3_OUT_LEN];
3758+
3759+ compress_subtree_to_parent_node(input_bytes, subtree_len, self->key, self->chunk.chunk_counter, self->chunk.flags, cv_pair);
3760+ hasher_push_cv(self, cv_pair, self->chunk.chunk_counter);
3761+ hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN], self->chunk.chunk_counter + (subtree_chunks / 2));
3762+ }
3763+ self->chunk.chunk_counter += subtree_chunks;
3764+ input_bytes += subtree_len;
3765+ input_len -= subtree_len;
3766+ }
3767+
3768+ if (input_len > 0) {
3769+ chunk_state_update(&self->chunk, input_bytes, input_len);
3770+ hasher_merge_cv_stack(self, self->chunk.chunk_counter);
3771+ }
3772+}
3773+
3774+void
3775+blake3_hasher_finalize_seek(const struct Blake3Hasher *self, uint64_t seek, uint8_t *out, size_t out_len)
3776+{
3777+ struct Output output;
3778+ size_t cvs_remaining;
3779+
3780+ if (out_len == 0) {
3781+ return;
3782+ }
3783+
3784+ if (self->cv_stack_len == 0) {
3785+ output = chunk_state_output(&self->chunk);
3786+ output_root_bytes(&output, seek, out, out_len);
3787+ return;
3788+ }
3789+
3790+ if (chunk_state_len(&self->chunk) > 0) {
3791+ cvs_remaining = self->cv_stack_len;
3792+ output = chunk_state_output(&self->chunk);
3793+ } else {
3794+ cvs_remaining = self->cv_stack_len - 2;
3795+ output = parent_output(&self->cv_stack[cvs_remaining * 32], self->key, self->chunk.flags);
3796+ }
3797+
3798+ while (cvs_remaining > 0) {
3799+ uint8_t parent_block[BLAKE3_BLOCK_LEN];
3800+
3801+ cvs_remaining -= 1;
3802+ memcpy(parent_block, &self->cv_stack[cvs_remaining * 32], 32);
3803+ output_chaining_value(&output, &parent_block[32]);
3804+ output = parent_output(parent_block, self->key, self->chunk.flags);
3805+ }
3806+ output_root_bytes(&output, seek, out, out_len);
3807+}
3808+
3809+void
3810+blake3_hasher_finalize(const struct Blake3Hasher *self, uint8_t *out, size_t out_len)
3811+{
3812+ blake3_hasher_finalize_seek(self, 0, out, out_len);
3813+}
3814+
3815+void
3816+blake3_hasher_reset(struct Blake3Hasher *self)
3817+{
3818+ chunk_state_reset(&self->chunk, self->key, 0);
3819+ self->cv_stack_len = 0;
3820+}
3821+
3822+/* utility implementation */
3823+static unsigned char *out;
3824+static size_t outlen = BLAKE3_OUT_LEN;
3825+
3826+static void
3827+usage(void)
3828+{
3829+ fprintf(stderr, "usage: %s [-bct] [-l length] [file ...]\n", argv0);
3830+ exit(1);
3831+}
3832+
3833+static int
3834+sumfile(const char *name, FILE *file, unsigned char *out_buf, size_t out_len)
3835+{
3836+ char buf[16384];
3837+ struct Blake3Hasher ctx;
3838+ size_t len;
3839+
3840+ blake3_hasher_init(&ctx);
3841+ do {
3842+ len = fread(buf, 1, sizeof(buf), file);
3843+ if (len > 0)
3844+ blake3_hasher_update(&ctx, buf, len);
3845+ } while (len == sizeof(buf));
3846+
3847+ if (ferror(file)) {
3848+ fprintf(stderr, "%s: read %s: ", argv0, name);
3849+ perror(NULL);
3850+ return 1;
3851+ }
3852+ blake3_hasher_finalize(&ctx, out_buf, out_len);
3853+ return 0;
3854+}
3855+
3856+static int
3857+sum(const char *name, FILE *file)
3858+{
3859+ size_t i;
3860+
3861+ if (sumfile(name, file, out, outlen) != 0)
3862+ return 1;
3863+ for (i = 0; i < outlen; i++)
3864+ printf("%02x", out[i]);
3865+ printf(" %s\n", name);
3866+ return 0;
3867+}
3868+
3869+static int
3870+hexval(int c)
3871+{
3872+ if ('0' <= c && c <= '9')
3873+ return c - '0';
3874+ if ('a' <= c && c <= 'f')
3875+ return c - 'a' + 10;
3876+ if ('A' <= c && c <= 'F')
3877+ return c - 'A' + 10;
3878+ return -1;
3879+}
3880+
3881+static int
3882+checkfile(const char *name, const char *mode, const char *str, unsigned char *out_buf, size_t len)
3883+{
3884+ FILE *file;
3885+ int c1, c2;
3886+ size_t i;
3887+
3888+ file = fopen(name, mode);
3889+ if (!file) {
3890+ fprintf(stderr, "%s: open %s: ", argv0, name);
3891+ perror(NULL);
3892+ return 1;
3893+ }
3894+ sumfile(name, file, out_buf, len);
3895+ fclose(file);
3896+
3897+ for (i = 0; i < len; i++) {
3898+ c1 = hexval(str[i * 2]);
3899+ c2 = hexval(str[i * 2 + 1]);
3900+ if (c1 == -1 || c2 == -1) {
3901+ fprintf(stderr, "%s: skipping invalid checksum line\n", argv0);
3902+ return 1;
3903+ }
3904+ if (out_buf[i] != (c1 << 4 | c2)) {
3905+ printf("%s: FAILED\n", name);
3906+ return 1;
3907+ }
3908+ }
3909+ printf("%s: OK\n", name);
3910+ return 0;
3911+}
3912+
3913+static int
3914+check(const char *name, FILE *file)
3915+{
3916+ const char *mode;
3917+ char buf[8192], *pos, *end;
3918+ size_t len;
3919+ int ret = 0, skip = 0;
3920+
3921+ buf[sizeof(buf) - 2] = 0;
3922+ while (fgets(buf, sizeof(buf), file)) {
3923+ if (buf[sizeof(buf) - 2]) {
3924+ fprintf(stderr, "%s: skipping line that is too long\n", argv0);
3925+ buf[sizeof(buf) - 2] = 0;
3926+ skip = 1;
3927+ ret = 1;
3928+ continue;
3929+ }
3930+ if (skip) {
3931+ skip = 0;
3932+ continue;
3933+ }
3934+ pos = strchr(buf, ' ');
3935+ if (!pos || pos == buf || (pos[1] != ' ' && pos[1] != '*') || (pos - buf) & 1) {
3936+ fprintf(stderr, "%s: skipping invalid checksum line\n", argv0);
3937+ ret = 1;
3938+ continue;
3939+ }
3940+ mode = pos[1] == ' ' ? "r" : "rb";
3941+ len = (pos - buf) / 2;
3942+ if (len > outlen) {
3943+ outlen = len;
3944+ free(out);
3945+ out = malloc(len);
3946+ if (!out) {
3947+ perror(argv0);
3948+ return 1;
3949+ }
3950+ }
3951+ *pos = '\0';
3952+ pos += 2;
3953+ end = strchr(pos, '\n');
3954+ if (end)
3955+ *end = '\0';
3956+ ret |= checkfile(pos, mode, buf, out, len);
3957+ }
3958+ if (ferror(file)) {
3959+ fprintf(stderr, "%s: read %s: ", argv0, name);
3960+ perror(NULL);
3961+ ret = 1;
3962+ }
3963+ return ret;
3964+}
3965+
3966+int
3967+main(int argc, char *argv[])
3968+{
3969+ int (*func)(const char *, FILE *) = sum;
3970+ FILE *file;
3971+ char *end;
3972+ const char *name, *mode = NULL;
3973+ int ret = 0;
3974+
3975+ ARGBEGIN {
3976+ case 'b':
3977+ mode = "rb";
3978+ break;
3979+ case 'c':
3980+ func = check;
3981+ break;
3982+ case 'l':
3983+ outlen = strtoul(EARGF(usage()), &end, 10);
3984+ if (*end)
3985+ usage();
3986+ break;
3987+ case 't':
3988+ mode = "r";
3989+ break;
3990+ default:
3991+ usage();
3992+ } ARGEND
3993+
3994+ out = malloc(outlen);
3995+ if (!out) {
3996+ perror(NULL);
3997+ return 1;
3998+ }
3999+
4000+ if (argc == 0) {
4001+ if (!mode || strcmp(mode, "r") == 0 || freopen(NULL, mode, stdin)) {
4002+ ret |= func("<stdin>", stdin);
4003+ } else {
4004+ fprintf(stderr, "%s: reopen stdin: ", argv0);
4005+ perror(NULL);
4006+ ret = 1;
4007+ }
4008+ } else {
4009+ if (!mode)
4010+ mode = "r";
4011+ for (; argc > 0; argc--, argv++) {
4012+ name = *argv;
4013+ file = fopen(name, mode);
4014+ if (file) {
4015+ ret |= func(name, file);
4016+ fclose(file);
4017+ } else {
4018+ fprintf(stderr, "%s: open %s: ", argv0, name);
4019+ perror(NULL);
4020+ ret = 1;
4021+ }
4022+ }
4023+ }
4024+
4025+ free(out);
4026+ if (fshut(stdin, "<stdin>") | fshut(stdout, "<stdout>"))
4027+ ret = 1;
4028+
4029+ return ret;
4030+}
+233,
-10
1@@ -58,7 +58,27 @@ static int first = 1;
2 static char sort = 0;
3 static int showdirs;
4
5+static int Cflag = 0;
6+static int one_flag = 0;
7+static int termwidth = 80;
8+
9+#if FEATURE_LS_COLOR
10+#define COLOR_DIR "\033[1;34m"
11+#define COLOR_LNK "\033[1;36m"
12+#define COLOR_FIFO "\033[33m"
13+#define COLOR_SOCK "\033[1;35m"
14+#define COLOR_DEV "\033[1;33m"
15+#define COLOR_EXE "\033[1;32m"
16+#define COLOR_RST "\033[0m"
17+
18+enum { COLOR_NEVER, COLOR_ALWAYS, COLOR_AUTO };
19+static int color_mode = COLOR_NEVER;
20+#endif
21+
22 static void ls(const char *, const struct entry *, int);
23+static void printname_colored(const char *, mode_t);
24+static void printcols(const struct entry *, size_t);
25+static void output(const struct entry *);
26
27 static void
28 mkent(struct entry *ent, char *path, int dostat, int follow)
29@@ -132,6 +152,155 @@ printname(const char *name)
30 }
31 }
32
33+#if FEATURE_LS_COLOR
34+static int
35+should_color(void)
36+{
37+ if (color_mode == COLOR_ALWAYS)
38+ return 1;
39+ if (color_mode == COLOR_AUTO)
40+ return isatty(STDOUT_FILENO);
41+ return 0;
42+}
43+#endif
44+
45+static void
46+printname_colored(const char *name, mode_t mode)
47+{
48+#if FEATURE_LS_COLOR
49+ int need_reset = 0;
50+
51+ if (should_color()) {
52+ if (S_ISDIR(mode)) {
53+ fputs(COLOR_DIR, stdout);
54+ need_reset = 1;
55+ } else if (S_ISLNK(mode)) {
56+ fputs(COLOR_LNK, stdout);
57+ need_reset = 1;
58+ } else if (S_ISFIFO(mode)) {
59+ fputs(COLOR_FIFO, stdout);
60+ need_reset = 1;
61+ } else if (S_ISSOCK(mode)) {
62+ fputs(COLOR_SOCK, stdout);
63+ need_reset = 1;
64+ } else if (S_ISBLK(mode) || S_ISCHR(mode)) {
65+ fputs(COLOR_DEV, stdout);
66+ need_reset = 1;
67+ } else if (S_ISREG(mode) && (mode & (S_IXUSR | S_IXGRP | S_IXOTH))) {
68+ fputs(COLOR_EXE, stdout);
69+ need_reset = 1;
70+ }
71+ }
72+ printname(name);
73+ if (need_reset)
74+ fputs(COLOR_RST, stdout);
75+#else
76+ (void)mode;
77+ printname(name);
78+#endif
79+}
80+
81+#include <sys/ioctl.h>
82+
83+static void
84+gettermwidth(void)
85+{
86+ struct winsize ws;
87+
88+ if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &ws) == 0 && ws.ws_col > 0)
89+ termwidth = ws.ws_col;
90+ else
91+ termwidth = 80;
92+}
93+
94+static size_t
95+entrywidth(const struct entry *ent)
96+{
97+ size_t w;
98+ char buf[32];
99+
100+ w = utflen(ent->name);
101+ if (iflag) {
102+ snprintf(buf, sizeof(buf), "%lu ", (unsigned long)ent->ino);
103+ w += strlen(buf);
104+ }
105+ w += strlen(indicator(ent->mode));
106+ return w;
107+}
108+
109+static void
110+printcols(const struct entry *ents, size_t n)
111+{
112+ int i, r, c, ncols, nrows, total_width;
113+ int *colwidths;
114+ int maxcols;
115+
116+ if (n == 0)
117+ return;
118+
119+ gettermwidth();
120+
121+ colwidths = ecalloc(n, sizeof(*colwidths));
122+
123+ maxcols = termwidth / 2;
124+ if (maxcols > (int)n)
125+ maxcols = n;
126+
127+ for (ncols = maxcols; ncols > 1; ncols--) {
128+ nrows = (n + ncols - 1) / ncols;
129+ total_width = 0;
130+
131+ for (c = 0; c < ncols; c++) {
132+ int maxw = 0;
133+ for (r = 0; r < nrows; r++) {
134+ int idx = c * nrows + r;
135+ if (idx < (int)n) {
136+ int w = entrywidth(&ents[idx]);
137+ if (w > maxw)
138+ maxw = w;
139+ }
140+ }
141+ colwidths[c] = maxw;
142+ total_width += maxw;
143+ }
144+ total_width += 2 * (ncols - 1);
145+
146+ if (total_width < termwidth)
147+ break;
148+ }
149+
150+ if (ncols <= 1) {
151+ for (i = 0; i < (int)n; i++) {
152+ output(&ents[i]);
153+ }
154+ free(colwidths);
155+ return;
156+ }
157+
158+ nrows = (n + ncols - 1) / ncols;
159+ for (r = 0; r < nrows; r++) {
160+ for (c = 0; c < ncols; c++) {
161+ int idx = c * nrows + r;
162+ if (idx < (int)n) {
163+ int w = entrywidth(&ents[idx]);
164+ if (iflag)
165+ printf("%lu ", (unsigned long)ents[idx].ino);
166+ printname_colored(ents[idx].name, ents[idx].mode);
167+ fputs(indicator(ents[idx].mode), stdout);
168+
169+ if (c < ncols - 1 && (c + 1) * nrows + r < (int)n) {
170+ int pad = colwidths[c] - w + 2;
171+ while (pad-- > 0)
172+ putchar(' ');
173+ }
174+ }
175+ }
176+ putchar('\n');
177+ }
178+
179+ free(colwidths);
180+}
181+
182 static void
183 output(const struct entry *ent)
184 {
185@@ -145,7 +314,7 @@ output(const struct entry *ent)
186 if (iflag)
187 printf("%lu ", (unsigned long)ent->ino);
188 if (!lflag) {
189- printname(ent->name);
190+ printname_colored(ent->name, ent->mode);
191 puts(indicator(ent->mode));
192 return;
193 }
194@@ -208,13 +377,15 @@ output(const struct entry *ent)
195 else
196 printf("%10lu ", (unsigned long)ent->size);
197 printf("%s ", buf);
198- printname(ent->name);
199+ printname_colored(ent->name, ent->mode);
200 fputs(indicator(ent->mode), stdout);
201 if (S_ISLNK(ent->mode)) {
202 if ((len = readlink(ent->name, buf, sizeof(buf) - 1)) < 0)
203 eprintf("readlink %s:", ent->name);
204 buf[len] = '\0';
205- printf(" -> %s%s", buf, indicator(ent->tmode));
206+ printf(" -> ");
207+ printname_colored(buf, ent->tmode);
208+ fputs(indicator(ent->tmode), stdout);
209 }
210 putchar('\n');
211 }
212@@ -281,8 +452,12 @@ lsdir(const char *path, const struct entry *dir)
213 printname(dir->name);
214 puts(":");
215 }
216- for (i = 0; i < n; i++)
217- output(&ents[i]);
218+ if (!lflag && Cflag) {
219+ printcols(ents, n);
220+ } else {
221+ for (i = 0; i < n; i++)
222+ output(&ents[i]);
223+ }
224
225 if (Rflag) {
226 if (snprintf(prefix, PATH_MAX, "%s%s/", path, dir->name) >=
227@@ -362,7 +537,7 @@ ls(const char *path, const struct entry *ent, int listdir)
228 static void
229 usage(void)
230 {
231- eprintf("usage: %s [-1AacdFfHhiLlnpqRrtUu] [file ...]\n", argv0);
232+ eprintf("usage: %s [-1ACacdFfHhiLlnpqRrtUu] [file ...]\n", argv0);
233 }
234
235 int
236@@ -371,12 +546,18 @@ main(int argc, char *argv[])
237 struct entry ent, *dents, *fents;
238 size_t i, ds, fs;
239
240+ if (isatty(STDOUT_FILENO))
241+ Cflag = 1;
242+ else
243+ one_flag = 1;
244+
245 tree = ereallocarray(NULL, PATH_MAX, sizeof(*tree));
246
247 ARGBEGIN {
248 case '1':
249- /* force output to 1 entry per line */
250- qflag = 1;
251+ one_flag = 1;
252+ Cflag = 0;
253+ lflag = 0;
254 break;
255 case 'A':
256 Aflag = 1;
257@@ -388,6 +569,11 @@ main(int argc, char *argv[])
258 cflag = 1;
259 uflag = 0;
260 break;
261+ case 'C':
262+ Cflag = 1;
263+ one_flag = 0;
264+ lflag = 0;
265+ break;
266 case 'd':
267 dflag = 1;
268 break;
269@@ -413,10 +599,14 @@ main(int argc, char *argv[])
270 break;
271 case 'l':
272 lflag = 1;
273+ Cflag = 0;
274+ one_flag = 0;
275 break;
276 case 'n':
277 lflag = 1;
278 nflag = 1;
279+ Cflag = 0;
280+ one_flag = 0;
281 break;
282 case 'p':
283 pflag = 1;
284@@ -443,6 +633,35 @@ main(int argc, char *argv[])
285 uflag = 1;
286 cflag = 0;
287 break;
288+ case '-':
289+#if FEATURE_LS_COLOR
290+ if (strncmp(argv[0], "-color", 6) == 0) {
291+ char *val = NULL;
292+ if (argv[0][6] == '=') {
293+ val = &argv[0][7];
294+ } else if (argv[0][6] == '\0') {
295+ val = "always";
296+ }
297+ if (val) {
298+ if (strcmp(val, "always") == 0)
299+ color_mode = COLOR_ALWAYS;
300+ else if (strcmp(val, "never") == 0)
301+ color_mode = COLOR_NEVER;
302+ else if (strcmp(val, "auto") == 0)
303+ color_mode = COLOR_AUTO;
304+ else {
305+ fprintf(stderr, "ls: invalid --color value: %s\n", val);
306+ usage();
307+ }
308+ }
309+ brk_ = 1;
310+ } else {
311+ usage();
312+ }
313+#else
314+ usage();
315+#endif
316+ break;
317 default:
318 usage();
319 } ARGEND
320@@ -478,8 +697,12 @@ main(int argc, char *argv[])
321 qsort(fents, fs, sizeof(ent), entcmp);
322 qsort(dents, ds, sizeof(ent), entcmp);
323
324- for (i = 0; i < fs; ++i)
325- ls("", &fents[i], 0);
326+ if (!lflag && Cflag && fs > 0) {
327+ printcols(fents, fs);
328+ } else {
329+ for (i = 0; i < fs; ++i)
330+ ls("", &fents[i], 0);
331+ }
332 free(fents);
333 if (fs && ds)
334 putchar('\n');
+30,
-7
1@@ -169,6 +169,7 @@ static int is_eof(FILE *f);
2 static void do_writes(void);
3 static void write_file(char *path, FILE *out);
4 static void check_puts(char *s, FILE *f);
5+static void write_patt(char *s, FILE *f);
6 static void update_ranges(Cmd *beg, Cmd *end);
7
8 /* Sed functions */
9@@ -216,6 +217,9 @@ static Vec wfiles; /* holds Wfile*. files for w and s///w commands */
10 static Cmd *prog, *pc; /* Program, program counter */
11 static size_t pcap;
12 static size_t lineno;
13+#if FEATURE_SED_PRESERVE_NEWLINE
14+static int hadnl = 1;
15+#endif
16
17 static regex_t *lastre; /* last used regex for empty regex search */
18 static char **files; /* list of file names from argv */
19@@ -479,6 +483,10 @@ read_line(FILE *f, String *s)
20 eprintf("getline:");
21 return EOF;
22 }
23+#if FEATURE_SED_PRESERVE_NEWLINE
24+ if (len > 0)
25+ hadnl = (s->str[len - 1] == '\n');
26+#endif
27 if (s->str[--len] == '\n')
28 s->str[len] = '\0';
29 return 0;
30@@ -1227,6 +1235,21 @@ check_puts(char *s, FILE *f)
31 eprintf("fputs:");
32 }
33
34+static void
35+write_patt(char *s, FILE *f)
36+{
37+#if FEATURE_SED_PRESERVE_NEWLINE
38+ if (s && fputs(s, f) == EOF)
39+ eprintf("fputs:");
40+ if (hadnl) {
41+ if (fputs("\n", f) == EOF)
42+ eprintf("fputs:");
43+ }
44+#else
45+ check_puts(s, f);
46+#endif
47+}
48+
49 /* iterate from beg to end updating ranges so we don't miss any commands
50 * e.g. sed -n '1d;1,3p' should still print lines 2 and 3
51 */
52@@ -1391,7 +1414,7 @@ cmd_n(Cmd *c)
53 return;
54
55 if (!gflags.n)
56- check_puts(patt.str, stdout);
57+ write_patt(patt.str, stdout);
58 do_writes();
59 new_line();
60 }
61@@ -1409,7 +1432,7 @@ static void
62 cmd_p(Cmd *c)
63 {
64 if (in_range(c))
65- check_puts(patt.str, stdout);
66+ write_patt(patt.str, stdout);
67 }
68
69 static void
70@@ -1423,7 +1446,7 @@ cmd_P(Cmd *c)
71 if ((p = strchr(patt.str, '\n')))
72 *p = '\0';
73
74- check_puts(patt.str, stdout);
75+ write_patt(patt.str, stdout);
76
77 if (p)
78 *p = '\n';
79@@ -1551,9 +1574,9 @@ cmd_s(Cmd *c)
80 genbuf = tmp;
81
82 if (c->u.s.p)
83- check_puts(patt.str, stdout);
84+ write_patt(patt.str, stdout);
85 if (c->u.s.file)
86- check_puts(patt.str, c->u.s.file);
87+ write_patt(patt.str, c->u.s.file);
88 }
89
90 static void
91@@ -1572,7 +1595,7 @@ static void
92 cmd_w(Cmd *c)
93 {
94 if (in_range(c))
95- check_puts(patt.str, c->u.file);
96+ write_patt(patt.str, c->u.file);
97 }
98
99 static void
100@@ -1662,7 +1685,7 @@ cmd_last(Cmd *c)
101 {
102 (void)c;
103 if (!gflags.n)
104- check_puts(patt.str, stdout);
105+ write_patt(patt.str, stdout);
106 do_writes();
107 new_next();
108 }
+6,
-1
1@@ -178,6 +178,7 @@ BUILD_PSEUDO_WHOAMI = $(BUILD_PSEUDO)
2 BUILD_PSEUDO_XINSTALL = $(BUILD_PSEUDO)
3 BUILD_PSEUDO_YES = $(BUILD_PSEUDO)
4 BUILD_PSEUDO_BASE64 = $(BUILD_PSEUDO)
5+BUILD_PSEUDO_B3SUM = $(BUILD_PSEUDO)
6
7 # make tool
8 BUILD_MAKE_MAKE = $(BUILD_MAKE)
9@@ -214,6 +215,8 @@ FEATURE_SH_ULIMIT = 1
10 FEATURE_SH_SETVAR = 1
11 FEATURE_SH_WORDEXP = 1
12 FEATURE_CAL_EXT = 1
13+FEATURE_SED_PRESERVE_NEWLINE = 1
14+FEATURE_LS_COLOR = 1
15
16 CPPFLAGS =\
17 -Ishared\
18@@ -255,7 +258,9 @@ CPPFLAGS =\
19 -DFEATURE_SH_ULIMIT=$(FEATURE_SH_ULIMIT)\
20 -DFEATURE_SH_SETVAR=$(FEATURE_SH_SETVAR)\
21 -DFEATURE_SH_WORDEXP=$(FEATURE_SH_WORDEXP)\
22- -DFEATURE_CAL_EXT=$(FEATURE_CAL_EXT)
23+ -DFEATURE_CAL_EXT=$(FEATURE_CAL_EXT)\
24+ -DFEATURE_SED_PRESERVE_NEWLINE=$(FEATURE_SED_PRESERVE_NEWLINE)\
25+ -DFEATURE_LS_COLOR=$(FEATURE_LS_COLOR)
26
27 CFLAGS = -std=c99 -Wall -Wextra -pedantic
28 LDFLAGS =
1@@ -81,3 +81,9 @@
2 #ifndef FEATURE_OD_ENDIAN
3 #define FEATURE_OD_ENDIAN 1
4 #endif
5+#ifndef FEATURE_SED_PRESERVE_NEWLINE
6+#define FEATURE_SED_PRESERVE_NEWLINE 1
7+#endif
8+#ifndef FEATURE_LS_COLOR
9+#define FEATURE_LS_COLOR 1
10+#endif