1/* See LICENSE file for copyright and license details. */
2
3#include "util.h"
4#include "arg.h"
5
6#include <assert.h>
7#include <immintrin.h>
8#include <stdint.h>
9#include <stdio.h>
10#include <stdlib.h>
11#include <string.h>
12
/* Version string; not referenced in this part of the file. */
#define BLAKE3_VERSION_STRING "1.5.0"
/* Key size in bytes: eight 32-bit words (see the memcpy in hash_one_portable). */
#define BLAKE3_KEY_LEN 32
/* Size in bytes of one chaining value / default digest written by the hash_many kernels. */
#define BLAKE3_OUT_LEN 32
/* Size in bytes of one compression-function input block (sixteen 32-bit words). */
#define BLAKE3_BLOCK_LEN 64
/* NOTE(review): presumably the chunk size in bytes; not used in this part of the file. */
#define BLAKE3_CHUNK_LEN 1024
/* Sizes the cv_stack array in struct Blake3Hasher (MAX_DEPTH + 1 entries). */
#define BLAKE3_MAX_DEPTH 54

/*
 * Upper bound on SIMD parallelism: 16 on x86-64, 1 everywhere else.
 * NOTE(review): presumably the number of inputs the widest kernel hashes at
 * once; the users of this macro are not visible here.
 */
#if defined(__x86_64__)
#define MAX_SIMD_DEGREE 16
#else
#define MAX_SIMD_DEGREE 1
#endif

/* MAX_SIMD_DEGREE, but never less than 2. */
#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2)
27
/*
 * Bit flags for the compression function. The OR of the selected flags
 * is loaded into state word 15 (see compress_pre_portable). Each flag
 * occupies its own bit so that they can be combined freely.
 */
enum Blake3Flags {
	CHUNK_START         = 0x01,
	CHUNK_END           = 0x02,
	PARENT              = 0x04,
	ROOT                = 0x08,
	KEYED_HASH          = 0x10,
	DERIVE_KEY_CONTEXT  = 0x20,
	DERIVE_KEY_MATERIAL = 0x40
};
37
/*
 * Per-chunk hashing state.
 * NOTE(review): the code that uses this struct is not visible in this part
 * of the file; the field descriptions below are inferred from the names and
 * types and should be confirmed against blake3_hasher_update().
 */
struct Blake3ChunkState {
 uint32_t cv[8]; /* eight 32-bit chaining-value words */
 uint64_t chunk_counter; /* presumably the index of the current chunk */
 uint8_t buf[BLAKE3_BLOCK_LEN]; /* room for exactly one input block */
 uint8_t buf_len; /* presumably the number of valid bytes in buf */
 uint8_t blocks_compressed; /* presumably blocks already compressed in this chunk */
 uint8_t flags; /* presumably a combination of enum Blake3Flags bits */
};
46
/*
 * Top-level incremental hasher state.
 * NOTE(review): the code that uses this struct is not visible in this part
 * of the file; field roles are inferred from names and types.
 */
struct Blake3Hasher {
 uint32_t key[8]; /* eight 32-bit key words */
 struct Blake3ChunkState chunk; /* state of the chunk currently being filled */
 uint8_t cv_stack_len; /* presumably the number of entries in use in cv_stack */
 /* room for BLAKE3_MAX_DEPTH + 1 entries of BLAKE3_OUT_LEN bytes each */
 uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN];
};
53
/*
 * Incremental hashing entry points.
 * NOTE(review): the definitions are not visible in this part of the file;
 * judging by names and signatures, update absorbs input_len bytes from
 * input and finalize writes out_len bytes of output to out without
 * modifying the hasher (it takes a const pointer) - confirm.
 */
void blake3_hasher_update(struct Blake3Hasher *self, const void *input, size_t input_len);
void blake3_hasher_finalize(const struct Blake3Hasher *self, uint8_t *out, size_t out_len);
56
/*
 * A captured set of compression-function arguments: the fields match the
 * cv, counter, block, block_len and flags parameters of
 * blake3_compress_xof().
 * NOTE(review): the code that fills and consumes this struct is not visible
 * here - confirm the intended use.
 */
struct Output {
 uint32_t input_cv[8]; /* input chaining value, eight 32-bit words */
 uint64_t counter; /* 64-bit counter argument */
 uint8_t block[BLAKE3_BLOCK_LEN]; /* one input block */
 uint8_t block_len; /* block length argument */
 uint8_t flags; /* flags argument */
};
64
/*
 * Initialization constants. The first four words are loaded into state
 * words 8..11 at the start of every compression (see compress_pre_portable
 * and compress_pre_sse2); the last four are not used in this part of the
 * file.
 */
static const uint32_t IV[8] = {
 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
};
69
/*
 * Message word order for each of the seven rounds: MSG_SCHEDULE[r][i] is
 * the index (0..15) of the message word consumed at position i of round r
 * (see round_fn_portable and round_fn_sse2). Round 0 uses the words in
 * their natural order.
 */
static const uint8_t MSG_SCHEDULE[7][16] = {
 {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
 {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8},
 {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1},
 {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6},
 {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4},
 {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7},
 {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13},
};
79
/*
 * Read four bytes at src as a little-endian 32-bit word. Works byte by
 * byte, so src needs no particular alignment and the result does not
 * depend on host endianness.
 */
static inline uint32_t
load32(const void *src)
{
	const uint8_t *bytes = src;
	uint32_t word = 0;
	unsigned int i;

	for (i = 0; i < 4; i++)
		word |= (uint32_t)bytes[i] << (8 * i);
	return word;
}
88
/*
 * Write w to the four bytes at dst in little-endian order. Works byte by
 * byte, so dst needs no particular alignment.
 */
static inline void
store32(void *dst, uint32_t w)
{
	uint8_t *bytes = dst;
	unsigned int i;

	for (i = 0; i < 4; i++)
		bytes[i] = (uint8_t)(w >> (8 * i));
}
99
/*
 * Serialize the eight 32-bit words of a chaining value into 32 bytes,
 * each word in little-endian order.
 *
 * bytes_out: destination, at least 32 bytes.
 * cv_words:  the eight words to store; only read, hence const.
 */
static inline void
store_cv_words(uint8_t bytes_out[32], const uint32_t cv_words[8])
{
	size_t i;

	for (i = 0; i < 8; i++)
		store32(&bytes_out[i * 4], cv_words[i]);
}
112
/* Return the least significant 32 bits of the 64-bit counter. */
static inline uint32_t
counter_low(uint64_t counter)
{
	const uint64_t low_mask = 0xFFFFFFFFu;

	return (uint32_t)(counter & low_mask);
}
118
/* Return the most significant 32 bits of the 64-bit counter. */
static inline uint32_t
counter_high(uint64_t counter)
{
	const uint64_t upper = counter >> 32;

	return (uint32_t)upper;
}
124
/* forward declarations: per-instruction-set kernels (x86-64 only), the portable fallbacks, and the unsuffixed entry points */
126#if defined(__x86_64__)
127void blake3_compress_in_place_sse2(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags);
128void blake3_compress_xof_sse2(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]);
129void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out);
130
131void blake3_compress_in_place_sse41(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags);
132void blake3_compress_xof_sse41(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]);
133void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out);
134
135void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out);
136
137void blake3_compress_in_place_avx512(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags);
138void blake3_compress_xof_avx512(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]);
139void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out);
140#endif
141
142void blake3_compress_in_place_portable(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags);
143void blake3_compress_xof_portable(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]);
144void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out);
145
146void blake3_compress_in_place(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags);
147void blake3_compress_xof(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]);
148void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out);
149
150/* portable implementations */
/*
 * Rotate the 32-bit word w right by c bits.
 *
 * Both shift counts are reduced modulo 32, so c == 0 (or any multiple of
 * 32) returns w unchanged instead of shifting by the full width of the
 * type, which is undefined behaviour in C.
 */
static inline uint32_t
rotr32(uint32_t w, uint32_t c)
{
	c &= 31;
	return (w >> c) | (w << ((32 - c) & 31));
}
156
/*
 * The quarter-round mixing function: mixes message words x and y into the
 * four state words selected by the indices a, b, c and d, using
 * add / xor / rotate steps with rotation amounts 16, 12, 8 and 7.
 */
static inline void
g_portable(uint32_t *state, size_t a, size_t b, size_t c, size_t d, uint32_t x, uint32_t y)
{
	uint32_t *const va = &state[a];
	uint32_t *const vb = &state[b];
	uint32_t *const vc = &state[c];
	uint32_t *const vd = &state[d];

	/* first half: absorb x */
	*va += *vb + x;
	*vd = rotr32(*vd ^ *va, 16);
	*vc += *vd;
	*vb = rotr32(*vb ^ *vc, 12);

	/* second half: absorb y */
	*va += *vb + y;
	*vd = rotr32(*vd ^ *va, 8);
	*vc += *vd;
	*vb = rotr32(*vb ^ *vc, 7);
}
169
170static inline void
171round_fn_portable(uint32_t state[16], const uint32_t *msg, size_t round)
172{
173 const uint8_t *schedule = MSG_SCHEDULE[round];
174
175 g_portable(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]);
176 g_portable(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]);
177 g_portable(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]);
178 g_portable(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]);
179
180 g_portable(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]);
181 g_portable(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]);
182 g_portable(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]);
183 g_portable(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]);
184}
185
186static inline void
187compress_pre_portable(uint32_t state[16], const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
188{
189 uint32_t block_words[16];
190
191 block_words[0] = load32(block + 4 * 0);
192 block_words[1] = load32(block + 4 * 1);
193 block_words[2] = load32(block + 4 * 2);
194 block_words[3] = load32(block + 4 * 3);
195 block_words[4] = load32(block + 4 * 4);
196 block_words[5] = load32(block + 4 * 5);
197 block_words[6] = load32(block + 4 * 6);
198 block_words[7] = load32(block + 4 * 7);
199 block_words[8] = load32(block + 4 * 8);
200 block_words[9] = load32(block + 4 * 9);
201 block_words[10] = load32(block + 4 * 10);
202 block_words[11] = load32(block + 4 * 11);
203 block_words[12] = load32(block + 4 * 12);
204 block_words[13] = load32(block + 4 * 13);
205 block_words[14] = load32(block + 4 * 14);
206 block_words[15] = load32(block + 4 * 15);
207
208 state[0] = cv[0];
209 state[1] = cv[1];
210 state[2] = cv[2];
211 state[3] = cv[3];
212 state[4] = cv[4];
213 state[5] = cv[5];
214 state[6] = cv[6];
215 state[7] = cv[7];
216 state[8] = IV[0];
217 state[9] = IV[1];
218 state[10] = IV[2];
219 state[11] = IV[3];
220 state[12] = counter_low(counter);
221 state[13] = counter_high(counter);
222 state[14] = (uint32_t)block_len;
223 state[15] = (uint32_t)flags;
224
225 round_fn_portable(state, &block_words[0], 0);
226 round_fn_portable(state, &block_words[0], 1);
227 round_fn_portable(state, &block_words[0], 2);
228 round_fn_portable(state, &block_words[0], 3);
229 round_fn_portable(state, &block_words[0], 4);
230 round_fn_portable(state, &block_words[0], 5);
231 round_fn_portable(state, &block_words[0], 6);
232}
233
234void
235blake3_compress_in_place_portable(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
236{
237 uint32_t state[16];
238
239 compress_pre_portable(state, cv, block, block_len, counter, flags);
240 cv[0] = state[0] ^ state[8];
241 cv[1] = state[1] ^ state[9];
242 cv[2] = state[2] ^ state[10];
243 cv[3] = state[3] ^ state[11];
244 cv[4] = state[4] ^ state[12];
245 cv[5] = state[5] ^ state[13];
246 cv[6] = state[6] ^ state[14];
247 cv[7] = state[7] ^ state[15];
248}
249
250void
251blake3_compress_xof_portable(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64])
252{
253 uint32_t state[16];
254
255 compress_pre_portable(state, cv, block, block_len, counter, flags);
256
257 store32(&out[0 * 4], state[0] ^ state[8]);
258 store32(&out[1 * 4], state[1] ^ state[9]);
259 store32(&out[2 * 4], state[2] ^ state[10]);
260 store32(&out[3 * 4], state[3] ^ state[11]);
261 store32(&out[4 * 4], state[4] ^ state[12]);
262 store32(&out[5 * 4], state[5] ^ state[13]);
263 store32(&out[6 * 4], state[6] ^ state[14]);
264 store32(&out[7 * 4], state[7] ^ state[15]);
265 store32(&out[8 * 4], state[8] ^ cv[0]);
266 store32(&out[9 * 4], state[9] ^ cv[1]);
267 store32(&out[10 * 4], state[10] ^ cv[2]);
268 store32(&out[11 * 4], state[11] ^ cv[3]);
269 store32(&out[12 * 4], state[12] ^ cv[4]);
270 store32(&out[13 * 4], state[13] ^ cv[5]);
271 store32(&out[14 * 4], state[14] ^ cv[6]);
272 store32(&out[15 * 4], state[15] ^ cv[7]);
273}
274
275static inline void
276hash_one_portable(const uint8_t *input, size_t blocks, const uint32_t key[8], uint64_t counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN])
277{
278 uint32_t cv[8];
279 uint8_t block_flags;
280
281 memcpy(cv, key, BLAKE3_KEY_LEN);
282 block_flags = flags | flags_start;
283 while (blocks > 0) {
284 if (blocks == 1) {
285 block_flags |= flags_end;
286 }
287 blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter, block_flags);
288 input = &input[BLAKE3_BLOCK_LEN];
289 blocks -= 1;
290 block_flags = flags;
291 }
292 store_cv_words(out, cv);
293}
294
295void
296blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out)
297{
298 while (num_inputs > 0) {
299 hash_one_portable(inputs[0], blocks, key, counter, flags, flags_start, flags_end, out);
300 if (increment_counter) {
301 counter += 1;
302 }
303 inputs += 1;
304 num_inputs -= 1;
305 out = &out[BLAKE3_OUT_LEN];
306 }
307}
308
309/* cpu features detection */
/* One bit per instruction-set extension; OR-ed together in blake3_cpu_features. */
enum {
	SSE2   = 0x1,
	SSE41  = 0x2,
	AVX2   = 0x4,
	AVX512 = 0x8
};
316
/* Bitmask of the SSE2/SSE41/AVX2/AVX512 flags; written by blake3_detect_cpu_features(). */
static int blake3_cpu_features = 0;
/* Set to 1 once blake3_detect_cpu_features() has run; blake3_init_cpu() checks it to avoid detecting twice. */
static int blake3_cpu_detected = 0;
319
320#if defined(__x86_64__)
321#include <cpuid.h>
322
/*
 * Run CPUID for leaf `id`, sub-leaf `sid`, and store the resulting
 * EAX, EBX, ECX and EDX values in out[0..3] in that order.
 */
static void
blake3_cpuid(uint32_t out[4], uint32_t id, uint32_t sid)
{
	uint32_t eax, ebx, ecx, edx;

	__cpuid_count(id, sid, eax, ebx, ecx, edx);
	out[0] = eax;
	out[1] = ebx;
	out[2] = ecx;
	out[3] = edx;
}
328
/*
 * Execute XGETBV with ECX = 0 and return the 64-bit result (EDX in the
 * upper half, EAX in the lower half). The caller treats the value as XCR0
 * and only calls this after CPUID has reported OSXSAVE
 * (see blake3_detect_cpu_features).
 */
static uint64_t
blake3_xgetbv(void)
{
 uint32_t eax, edx;

 /* "c"(0) selects register 0; the result comes back in EDX:EAX */
 __asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0));
 return ((uint64_t)edx << 32) | eax;
}
337#endif
338
339static void
340blake3_detect_cpu_features(void)
341{
342#if defined(__x86_64__)
343 enum { EAX, EBX, ECX, EDX };
344 uint32_t regs[4];
345 uint64_t xcr0;
346 int features = 0;
347
348 blake3_cpuid(regs, 1, 0);
349 if (regs[EDX] & (1UL << 26))
350 features |= SSE2;
351 if (regs[ECX] & (1UL << 19))
352 features |= SSE41;
353 /* osxsave */
354 if (regs[ECX] & (1UL << 27)) {
355 blake3_cpuid(regs, 0, 0);
356 if (regs[EAX] >= 7) {
357 blake3_cpuid(regs, 7, 0);
358 xcr0 = blake3_xgetbv();
359 /* avx2 and xcr0 sse, avx */
360 if ((regs[EBX] & (1UL << 5)) && (xcr0 & 0x06) == 0x06)
361 features |= AVX2;
362 /* avx512f, avx512vl and xcr0 opmask, zmm_hi256, hi16_zmm */
363 if ((regs[EBX] & (1UL << 31 | 1UL << 16)) && (xcr0 & 0xe0) == 0xe0)
364 features |= AVX512;
365 }
366 }
367 blake3_cpu_features = features;
368#endif
369 blake3_cpu_detected = 1;
370}
371
372#if defined(__x86_64__)
373__attribute__((constructor))
374static void
375blake3_init_cpu(void)
376{
377 if (!blake3_cpu_detected)
378 blake3_detect_cpu_features();
379}
380#endif
381
382#if defined(__x86_64__)
/* Compile everything from here on with SSE2 enabled; push_options saves the previous target options. */
#pragma GCC push_options
#pragma GCC target("sse2")
/* Number of vectors handled by transpose_vecs_sse2; the SSE2 kernels load from four inputs at a time. */
#define DEGREE_SSE2 4

/*
 * _mm_shuffle_ps applied to integer vectors: casts a and b to float
 * vectors, shuffles them with the immediate c, and casts the result back.
 */
#define _mm_shuffle_ps2(a, b, c) \
 (_mm_castps_si128( \
 _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))
390
/* Load 16 bytes from src into a vector; src may be unaligned. */
static inline __m128i
loadu_sse2(const uint8_t src[16])
{
	const __m128i *unaligned = (const __m128i *)src;

	return _mm_loadu_si128(unaligned);
}
396
/* Store the 16 bytes of src at dest; dest may be unaligned. */
static inline void
storeu_sse2(__m128i src, uint8_t dest[16])
{
	__m128i *unaligned = (__m128i *)dest;

	_mm_storeu_si128(unaligned, src);
}
402
/* Lane-wise 32-bit addition (wraps modulo 2^32 in each lane). */
static inline __m128i
addv_sse2(__m128i a, __m128i b)
{
	const __m128i sum = _mm_add_epi32(a, b);

	return sum;
}
408
/* Bitwise XOR of two 128-bit vectors. */
static inline __m128i
xorv_sse2(__m128i a, __m128i b)
{
	const __m128i mixed = _mm_xor_si128(a, b);

	return mixed;
}
414
/* Broadcast the 32-bit value x into all four lanes. */
static inline __m128i
set1_sse2(uint32_t x)
{
	const int32_t lane = (int32_t)x;

	return _mm_set_epi32(lane, lane, lane, lane);
}
420
/* Build a vector with a in lane 0, b in lane 1, c in lane 2 and d in lane 3. */
static inline __m128i
set4_sse2(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
{
	/* _mm_set_epi32 takes its arguments from the highest lane down */
	return _mm_set_epi32((int32_t)d, (int32_t)c, (int32_t)b, (int32_t)a);
}
426
/*
 * Rotate every 32-bit lane by 16 bits. Done by swapping the two 16-bit
 * halves of each lane, first in the low 64 bits and then in the high 64.
 */
static inline __m128i
rot16_sse2(__m128i x)
{
	const __m128i low_swapped = _mm_shufflelo_epi16(x, _MM_SHUFFLE(2, 3, 0, 1));

	return _mm_shufflehi_epi16(low_swapped, _MM_SHUFFLE(2, 3, 0, 1));
}
432
433static inline __m128i
434rot12_sse2(__m128i x)
435{
436 return xorv_sse2(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12));
437}
438
439static inline __m128i
440rot8_sse2(__m128i x)
441{
442 return xorv_sse2(_mm_srli_epi32(x, 8), _mm_slli_epi32(x, 32 - 8));
443}
444
445static inline __m128i
446rot7_sse2(__m128i x)
447{
448 return xorv_sse2(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7));
449}
450
451static inline void
452g1_sse2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, __m128i m)
453{
454 *row0 = addv_sse2(addv_sse2(*row0, m), *row1);
455 *row3 = xorv_sse2(*row3, *row0);
456 *row3 = rot16_sse2(*row3);
457 *row2 = addv_sse2(*row2, *row3);
458 *row1 = xorv_sse2(*row1, *row2);
459 *row1 = rot12_sse2(*row1);
460}
461
462static inline void
463g2_sse2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, __m128i m)
464{
465 *row0 = addv_sse2(addv_sse2(*row0, m), *row1);
466 *row3 = xorv_sse2(*row3, *row0);
467 *row3 = rot8_sse2(*row3);
468 *row2 = addv_sse2(*row2, *row3);
469 *row1 = xorv_sse2(*row1, *row2);
470 *row1 = rot7_sse2(*row1);
471}
472
/*
 * Rotate the lanes of row0, row3 and row2 by one, two and three positions
 * respectively so that the following column-wise mixing step operates on
 * the diagonals of the 4x4 state. undiagonalize_sse2 reverses this.
 */
static inline void
diagonalize_sse2(__m128i *row0, __m128i *row2, __m128i *row3)
{
	enum {
		LANES_UP_1 = _MM_SHUFFLE(2, 1, 0, 3),   /* result = {x3, x0, x1, x2} */
		LANES_SWAP = _MM_SHUFFLE(1, 0, 3, 2),   /* result = {x2, x3, x0, x1} */
		LANES_DOWN_1 = _MM_SHUFFLE(0, 3, 2, 1)  /* result = {x1, x2, x3, x0} */
	};

	*row0 = _mm_shuffle_epi32(*row0, LANES_UP_1);
	*row3 = _mm_shuffle_epi32(*row3, LANES_SWAP);
	*row2 = _mm_shuffle_epi32(*row2, LANES_DOWN_1);
}
480
/*
 * Inverse of diagonalize_sse2: rotate the lanes of row0, row3 and row2
 * back so the state is in column order again.
 */
static inline void
undiagonalize_sse2(__m128i *row0, __m128i *row2, __m128i *row3)
{
	enum {
		LANES_DOWN_1 = _MM_SHUFFLE(0, 3, 2, 1), /* result = {x1, x2, x3, x0} */
		LANES_SWAP = _MM_SHUFFLE(1, 0, 3, 2),   /* result = {x2, x3, x0, x1} */
		LANES_UP_1 = _MM_SHUFFLE(2, 1, 0, 3)    /* result = {x3, x0, x1, x2} */
	};

	*row0 = _mm_shuffle_epi32(*row0, LANES_DOWN_1);
	*row3 = _mm_shuffle_epi32(*row3, LANES_SWAP);
	*row2 = _mm_shuffle_epi32(*row2, LANES_UP_1);
}
488
/*
 * Select 16-bit lanes from a or b: lane i of the result comes from b when
 * bit i of imm8 is set and from a otherwise. Built from SSE2 compare and
 * mask operations only.
 */
static inline __m128i
blend_epi16_sse2(__m128i a, __m128i b, const int16_t imm8)
{
	/* lane i holds the single bit (1 << i) */
	const __m128i lane_bit = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
	const __m128i selected = _mm_and_si128(_mm_set1_epi16(imm8), lane_bit);
	/* all-ones in every lane whose bit is set in imm8 */
	const __m128i take_b = _mm_cmpeq_epi16(selected, lane_bit);
	const __m128i from_b = _mm_and_si128(take_b, b);
	const __m128i from_a = _mm_andnot_si128(take_b, a);

	return _mm_or_si128(from_b, from_a);
}
499
/*
 * SSE2 core of the single-block compression function. The 16-word state is
 * held as four vectors of four 32-bit lanes each:
 *   rows[0] = cv[0..3], rows[1] = cv[4..7], rows[2] = IV[0..3],
 *   rows[3] = { counter low, counter high, block_len, flags }.
 * The 64-byte block is loaded as four message vectors m0..m3.
 *
 * Every round has the same shape: g1 and g2 on the columns, diagonalize,
 * g1 and g2 on the diagonals, undiagonalize. The t0..t3 vectors are the
 * message words arranged for the four mixing calls of that round; at the
 * end of rounds 1-6 they replace m0..m3, so each round's shuffles start
 * from the previous round's arrangement. Rounds 2-7 use identical shuffle
 * code.
 *
 * On return rows[] holds the state after seven rounds; the final
 * feed-forward XOR is done by the callers.
 */
static inline void
compress_pre_sse2(__m128i rows[4], const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
{
 __m128i m0, m1, m2, m3;
 __m128i t0, t1, t2, t3, tt;

 /* initial state */
 rows[0] = loadu_sse2((uint8_t *)&cv[0]);
 rows[1] = loadu_sse2((uint8_t *)&cv[4]);
 rows[2] = set4_sse2(IV[0], IV[1], IV[2], IV[3]);
 rows[3] = set4_sse2(counter_low(counter), counter_high(counter), (uint32_t)block_len, (uint32_t)flags);

 /* message block as four vectors of four words */
 m0 = loadu_sse2(&block[sizeof(__m128i) * 0]);
 m1 = loadu_sse2(&block[sizeof(__m128i) * 1]);
 m2 = loadu_sse2(&block[sizeof(__m128i) * 2]);
 m3 = loadu_sse2(&block[sizeof(__m128i) * 3]);

 /* round 1 */
 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0));
 g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t0);
 t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1));
 g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
 diagonalize_sse2(&rows[0], &rows[2], &rows[3]);
 t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0));
 t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));
 g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t2);
 t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1));
 t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3));
 g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
 undiagonalize_sse2(&rows[0], &rows[2], &rows[3]);
 /* carry this round's message arrangement into the next round */
 m0 = t0;
 m1 = t1;
 m2 = t2;
 m3 = t3;

 /* round 2 */
 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
 g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t0);
 t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
 t1 = blend_epi16_sse2(tt, t1, 0xCC);
 g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
 diagonalize_sse2(&rows[0], &rows[2], &rows[3]);
 t2 = _mm_unpacklo_epi64(m3, m1);
 tt = blend_epi16_sse2(t2, m2, 0xC0);
 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
 g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t2);
 t3 = _mm_unpackhi_epi32(m1, m3);
 tt = _mm_unpacklo_epi32(m2, t3);
 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
 g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
 undiagonalize_sse2(&rows[0], &rows[2], &rows[3]);
 m0 = t0;
 m1 = t1;
 m2 = t2;
 m3 = t3;

 /* round 3 */
 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
 g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t0);
 t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
 t1 = blend_epi16_sse2(tt, t1, 0xCC);
 g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
 diagonalize_sse2(&rows[0], &rows[2], &rows[3]);
 t2 = _mm_unpacklo_epi64(m3, m1);
 tt = blend_epi16_sse2(t2, m2, 0xC0);
 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
 g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t2);
 t3 = _mm_unpackhi_epi32(m1, m3);
 tt = _mm_unpacklo_epi32(m2, t3);
 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
 g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
 undiagonalize_sse2(&rows[0], &rows[2], &rows[3]);
 m0 = t0;
 m1 = t1;
 m2 = t2;
 m3 = t3;

 /* round 4 */
 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
 g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t0);
 t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
 t1 = blend_epi16_sse2(tt, t1, 0xCC);
 g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
 diagonalize_sse2(&rows[0], &rows[2], &rows[3]);
 t2 = _mm_unpacklo_epi64(m3, m1);
 tt = blend_epi16_sse2(t2, m2, 0xC0);
 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
 g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t2);
 t3 = _mm_unpackhi_epi32(m1, m3);
 tt = _mm_unpacklo_epi32(m2, t3);
 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
 g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
 undiagonalize_sse2(&rows[0], &rows[2], &rows[3]);
 m0 = t0;
 m1 = t1;
 m2 = t2;
 m3 = t3;

 /* round 5 */
 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
 g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t0);
 t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
 t1 = blend_epi16_sse2(tt, t1, 0xCC);
 g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
 diagonalize_sse2(&rows[0], &rows[2], &rows[3]);
 t2 = _mm_unpacklo_epi64(m3, m1);
 tt = blend_epi16_sse2(t2, m2, 0xC0);
 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
 g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t2);
 t3 = _mm_unpackhi_epi32(m1, m3);
 tt = _mm_unpacklo_epi32(m2, t3);
 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
 g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
 undiagonalize_sse2(&rows[0], &rows[2], &rows[3]);
 m0 = t0;
 m1 = t1;
 m2 = t2;
 m3 = t3;

 /* round 6 */
 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
 g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t0);
 t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
 t1 = blend_epi16_sse2(tt, t1, 0xCC);
 g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
 diagonalize_sse2(&rows[0], &rows[2], &rows[3]);
 t2 = _mm_unpacklo_epi64(m3, m1);
 tt = blend_epi16_sse2(t2, m2, 0xC0);
 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
 g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t2);
 t3 = _mm_unpackhi_epi32(m1, m3);
 tt = _mm_unpacklo_epi32(m2, t3);
 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
 g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
 undiagonalize_sse2(&rows[0], &rows[2], &rows[3]);
 m0 = t0;
 m1 = t1;
 m2 = t2;
 m3 = t3;

 /* round 7 */
 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
 g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t0);
 t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
 t1 = blend_epi16_sse2(tt, t1, 0xCC);
 g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
 diagonalize_sse2(&rows[0], &rows[2], &rows[3]);
 t2 = _mm_unpacklo_epi64(m3, m1);
 tt = blend_epi16_sse2(t2, m2, 0xC0);
 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
 g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t2);
 t3 = _mm_unpackhi_epi32(m1, m3);
 tt = _mm_unpacklo_epi32(m2, t3);
 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
 g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
 undiagonalize_sse2(&rows[0], &rows[2], &rows[3]);
}
668
669void
670blake3_compress_in_place_sse2(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
671{
672 __m128i rows[4];
673
674 compress_pre_sse2(rows, cv, block, block_len, counter, flags);
675 storeu_sse2(xorv_sse2(rows[0], rows[2]), (uint8_t *)&cv[0]);
676 storeu_sse2(xorv_sse2(rows[1], rows[3]), (uint8_t *)&cv[4]);
677}
678
679void
680blake3_compress_xof_sse2(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out_buf[64])
681{
682 __m128i rows[4];
683
684 compress_pre_sse2(rows, cv, block, block_len, counter, flags);
685 storeu_sse2(xorv_sse2(rows[0], rows[2]), &out_buf[0]);
686 storeu_sse2(xorv_sse2(rows[1], rows[3]), &out_buf[16]);
687 storeu_sse2(xorv_sse2(rows[2], loadu_sse2((uint8_t *)&cv[0])), &out_buf[32]);
688 storeu_sse2(xorv_sse2(rows[3], loadu_sse2((uint8_t *)&cv[4])), &out_buf[48]);
689}
690
/*
 * One round of the compression function on four independent inputs at
 * once. v[i] holds state word i and m[j] holds message word j, each with
 * one 32-bit lane per input (see transpose_msg_vecs_sse2 and
 * blake3_hash4_sse2). r selects the row of MSG_SCHEDULE that gives the
 * message word order for this round.
 *
 * The first half mixes the four columns (v[i], v[i+4], v[i+8], v[i+12]);
 * the second half mixes the four diagonals. The statements of the four
 * mixes are interleaved step by step.
 */
static inline void
round_fn_sse2(__m128i v[16], __m128i m[16], size_t r)
{
 /* column step, first half: add message words 0/2/4/6, rotate by 16 and 12 */
 v[0] = addv_sse2(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
 v[1] = addv_sse2(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
 v[2] = addv_sse2(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
 v[3] = addv_sse2(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
 v[0] = addv_sse2(v[0], v[4]);
 v[1] = addv_sse2(v[1], v[5]);
 v[2] = addv_sse2(v[2], v[6]);
 v[3] = addv_sse2(v[3], v[7]);
 v[12] = xorv_sse2(v[12], v[0]);
 v[13] = xorv_sse2(v[13], v[1]);
 v[14] = xorv_sse2(v[14], v[2]);
 v[15] = xorv_sse2(v[15], v[3]);
 v[12] = rot16_sse2(v[12]);
 v[13] = rot16_sse2(v[13]);
 v[14] = rot16_sse2(v[14]);
 v[15] = rot16_sse2(v[15]);
 v[8] = addv_sse2(v[8], v[12]);
 v[9] = addv_sse2(v[9], v[13]);
 v[10] = addv_sse2(v[10], v[14]);
 v[11] = addv_sse2(v[11], v[15]);
 v[4] = xorv_sse2(v[4], v[8]);
 v[5] = xorv_sse2(v[5], v[9]);
 v[6] = xorv_sse2(v[6], v[10]);
 v[7] = xorv_sse2(v[7], v[11]);
 v[4] = rot12_sse2(v[4]);
 v[5] = rot12_sse2(v[5]);
 v[6] = rot12_sse2(v[6]);
 v[7] = rot12_sse2(v[7]);
 /* column step, second half: add message words 1/3/5/7, rotate by 8 and 7 */
 v[0] = addv_sse2(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
 v[1] = addv_sse2(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
 v[2] = addv_sse2(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
 v[3] = addv_sse2(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
 v[0] = addv_sse2(v[0], v[4]);
 v[1] = addv_sse2(v[1], v[5]);
 v[2] = addv_sse2(v[2], v[6]);
 v[3] = addv_sse2(v[3], v[7]);
 v[12] = xorv_sse2(v[12], v[0]);
 v[13] = xorv_sse2(v[13], v[1]);
 v[14] = xorv_sse2(v[14], v[2]);
 v[15] = xorv_sse2(v[15], v[3]);
 v[12] = rot8_sse2(v[12]);
 v[13] = rot8_sse2(v[13]);
 v[14] = rot8_sse2(v[14]);
 v[15] = rot8_sse2(v[15]);
 v[8] = addv_sse2(v[8], v[12]);
 v[9] = addv_sse2(v[9], v[13]);
 v[10] = addv_sse2(v[10], v[14]);
 v[11] = addv_sse2(v[11], v[15]);
 v[4] = xorv_sse2(v[4], v[8]);
 v[5] = xorv_sse2(v[5], v[9]);
 v[6] = xorv_sse2(v[6], v[10]);
 v[7] = xorv_sse2(v[7], v[11]);
 v[4] = rot7_sse2(v[4]);
 v[5] = rot7_sse2(v[5]);
 v[6] = rot7_sse2(v[6]);
 v[7] = rot7_sse2(v[7]);

 /* diagonal step, first half: (0,5,10,15) (1,6,11,12) (2,7,8,13) (3,4,9,14), message words 8/10/12/14 */
 v[0] = addv_sse2(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
 v[1] = addv_sse2(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
 v[2] = addv_sse2(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
 v[3] = addv_sse2(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
 v[0] = addv_sse2(v[0], v[5]);
 v[1] = addv_sse2(v[1], v[6]);
 v[2] = addv_sse2(v[2], v[7]);
 v[3] = addv_sse2(v[3], v[4]);
 v[15] = xorv_sse2(v[15], v[0]);
 v[12] = xorv_sse2(v[12], v[1]);
 v[13] = xorv_sse2(v[13], v[2]);
 v[14] = xorv_sse2(v[14], v[3]);
 v[15] = rot16_sse2(v[15]);
 v[12] = rot16_sse2(v[12]);
 v[13] = rot16_sse2(v[13]);
 v[14] = rot16_sse2(v[14]);
 v[10] = addv_sse2(v[10], v[15]);
 v[11] = addv_sse2(v[11], v[12]);
 v[8] = addv_sse2(v[8], v[13]);
 v[9] = addv_sse2(v[9], v[14]);
 v[5] = xorv_sse2(v[5], v[10]);
 v[6] = xorv_sse2(v[6], v[11]);
 v[7] = xorv_sse2(v[7], v[8]);
 v[4] = xorv_sse2(v[4], v[9]);
 v[5] = rot12_sse2(v[5]);
 v[6] = rot12_sse2(v[6]);
 v[7] = rot12_sse2(v[7]);
 v[4] = rot12_sse2(v[4]);
 /* diagonal step, second half: message words 9/11/13/15, rotate by 8 and 7 */
 v[0] = addv_sse2(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
 v[1] = addv_sse2(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
 v[2] = addv_sse2(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
 v[3] = addv_sse2(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
 v[0] = addv_sse2(v[0], v[5]);
 v[1] = addv_sse2(v[1], v[6]);
 v[2] = addv_sse2(v[2], v[7]);
 v[3] = addv_sse2(v[3], v[4]);
 v[15] = xorv_sse2(v[15], v[0]);
 v[12] = xorv_sse2(v[12], v[1]);
 v[13] = xorv_sse2(v[13], v[2]);
 v[14] = xorv_sse2(v[14], v[3]);
 v[15] = rot8_sse2(v[15]);
 v[12] = rot8_sse2(v[12]);
 v[13] = rot8_sse2(v[13]);
 v[14] = rot8_sse2(v[14]);
 v[10] = addv_sse2(v[10], v[15]);
 v[11] = addv_sse2(v[11], v[12]);
 v[8] = addv_sse2(v[8], v[13]);
 v[9] = addv_sse2(v[9], v[14]);
 v[5] = xorv_sse2(v[5], v[10]);
 v[6] = xorv_sse2(v[6], v[11]);
 v[7] = xorv_sse2(v[7], v[8]);
 v[4] = xorv_sse2(v[4], v[9]);
 v[5] = rot7_sse2(v[5]);
 v[6] = rot7_sse2(v[6]);
 v[7] = rot7_sse2(v[7]);
 v[4] = rot7_sse2(v[4]);
}
808
809static inline void
810transpose_vecs_sse2(__m128i vecs[DEGREE_SSE2])
811{
812 __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
813 __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
814 __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
815 __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
816
817 __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
818 __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
819 __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
820 __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);
821
822 vecs[0] = abcd_0;
823 vecs[1] = abcd_1;
824 vecs[2] = abcd_2;
825 vecs[3] = abcd_3;
826}
827
828static inline void
829transpose_msg_vecs_sse2(const uint8_t *const *inputs, size_t block_offset, __m128i out_msg[16])
830{
831 size_t i;
832
833 out_msg[0] = loadu_sse2(&inputs[0][block_offset + 0 * sizeof(__m128i)]);
834 out_msg[1] = loadu_sse2(&inputs[1][block_offset + 0 * sizeof(__m128i)]);
835 out_msg[2] = loadu_sse2(&inputs[2][block_offset + 0 * sizeof(__m128i)]);
836 out_msg[3] = loadu_sse2(&inputs[3][block_offset + 0 * sizeof(__m128i)]);
837 out_msg[4] = loadu_sse2(&inputs[0][block_offset + 1 * sizeof(__m128i)]);
838 out_msg[5] = loadu_sse2(&inputs[1][block_offset + 1 * sizeof(__m128i)]);
839 out_msg[6] = loadu_sse2(&inputs[2][block_offset + 1 * sizeof(__m128i)]);
840 out_msg[7] = loadu_sse2(&inputs[3][block_offset + 1 * sizeof(__m128i)]);
841 out_msg[8] = loadu_sse2(&inputs[0][block_offset + 2 * sizeof(__m128i)]);
842 out_msg[9] = loadu_sse2(&inputs[1][block_offset + 2 * sizeof(__m128i)]);
843 out_msg[10] = loadu_sse2(&inputs[2][block_offset + 2 * sizeof(__m128i)]);
844 out_msg[11] = loadu_sse2(&inputs[3][block_offset + 2 * sizeof(__m128i)]);
845 out_msg[12] = loadu_sse2(&inputs[0][block_offset + 3 * sizeof(__m128i)]);
846 out_msg[13] = loadu_sse2(&inputs[1][block_offset + 3 * sizeof(__m128i)]);
847 out_msg[14] = loadu_sse2(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
848 out_msg[15] = loadu_sse2(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
849
850 for (i = 0; i < 4; i++) {
851 _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
852 }
853 transpose_vecs_sse2(&out_msg[0]);
854 transpose_vecs_sse2(&out_msg[4]);
855 transpose_vecs_sse2(&out_msg[8]);
856 transpose_vecs_sse2(&out_msg[12]);
857}
858
/*
 * Build the per-lane 64-bit counters for four parallel inputs, split into
 * their low and high 32-bit halves.
 *
 * counter:           counter of the first input.
 * increment_counter: zero gives every lane the same counter; any non-zero
 *                    value gives lane i the counter `counter + i`, matching
 *                    the portable kernel, which also treats any non-zero
 *                    value as true.
 * out_lo, out_hi:    receive the low and high counter words of each lane.
 */
static inline void
load_counters_sse2(uint64_t counter, int increment_counter, __m128i *out_lo, __m128i *out_hi)
{
	/* all-ones when incrementing, all-zeros otherwise */
	const __m128i mask = _mm_set1_epi32(-(int32_t)(increment_counter != 0));
	const __m128i add0 = _mm_set_epi32(3, 2, 1, 0);
	const __m128i add1 = _mm_and_si128(mask, add0);
	/* flipping the sign bit turns the signed compare into an unsigned one */
	const __m128i sign = _mm_set1_epi32(INT32_MIN);
	__m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1);
	/* the low word wrapped exactly when the addend exceeds the sum (unsigned) */
	__m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, sign),
	                                _mm_xor_si128(l, sign));
	/* carry lanes are -1, so subtracting adds one to the high word */
	__m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry);

	*out_lo = l;
	*out_hi = h;
}
873
874void
875blake3_hash4_sse2(const uint8_t *const *inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out_bytes)
876{
877 __m128i h_vecs[8];
878 __m128i counter_low_vec, counter_high_vec;
879 uint8_t block_flags;
880 size_t block;
881
882 h_vecs[0] = set1_sse2(key[0]);
883 h_vecs[1] = set1_sse2(key[1]);
884 h_vecs[2] = set1_sse2(key[2]);
885 h_vecs[3] = set1_sse2(key[3]);
886 h_vecs[4] = set1_sse2(key[4]);
887 h_vecs[5] = set1_sse2(key[5]);
888 h_vecs[6] = set1_sse2(key[6]);
889 h_vecs[7] = set1_sse2(key[7]);
890
891 load_counters_sse2(counter, increment_counter, &counter_low_vec, &counter_high_vec);
892 block_flags = flags | flags_start;
893
894 for (block = 0; block < blocks; block++) {
895 __m128i block_len_vec;
896 __m128i block_flags_vec;
897 __m128i msg_vecs[16];
898 __m128i v[16];
899
900 if (block + 1 == blocks) {
901 block_flags |= flags_end;
902 }
903 block_len_vec = set1_sse2(BLAKE3_BLOCK_LEN);
904 block_flags_vec = set1_sse2(block_flags);
905 transpose_msg_vecs_sse2(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
906
907 v[0] = h_vecs[0];
908 v[1] = h_vecs[1];
909 v[2] = h_vecs[2];
910 v[3] = h_vecs[3];
911 v[4] = h_vecs[4];
912 v[5] = h_vecs[5];
913 v[6] = h_vecs[6];
914 v[7] = h_vecs[7];
915 v[8] = set1_sse2(IV[0]);
916 v[9] = set1_sse2(IV[1]);
917 v[10] = set1_sse2(IV[2]);
918 v[11] = set1_sse2(IV[3]);
919 v[12] = counter_low_vec;
920 v[13] = counter_high_vec;
921 v[14] = block_len_vec;
922 v[15] = block_flags_vec;
923
924 round_fn_sse2(v, msg_vecs, 0);
925 round_fn_sse2(v, msg_vecs, 1);
926 round_fn_sse2(v, msg_vecs, 2);
927 round_fn_sse2(v, msg_vecs, 3);
928 round_fn_sse2(v, msg_vecs, 4);
929 round_fn_sse2(v, msg_vecs, 5);
930 round_fn_sse2(v, msg_vecs, 6);
931
932 h_vecs[0] = xorv_sse2(v[0], v[8]);
933 h_vecs[1] = xorv_sse2(v[1], v[9]);
934 h_vecs[2] = xorv_sse2(v[2], v[10]);
935 h_vecs[3] = xorv_sse2(v[3], v[11]);
936 h_vecs[4] = xorv_sse2(v[4], v[12]);
937 h_vecs[5] = xorv_sse2(v[5], v[13]);
938 h_vecs[6] = xorv_sse2(v[6], v[14]);
939 h_vecs[7] = xorv_sse2(v[7], v[15]);
940
941 block_flags = flags;
942 }
943
944 transpose_vecs_sse2(&h_vecs[0]);
945 transpose_vecs_sse2(&h_vecs[4]);
946 storeu_sse2(h_vecs[0], &out_bytes[0 * sizeof(__m128i)]);
947 storeu_sse2(h_vecs[4], &out_bytes[1 * sizeof(__m128i)]);
948 storeu_sse2(h_vecs[1], &out_bytes[2 * sizeof(__m128i)]);
949 storeu_sse2(h_vecs[5], &out_bytes[3 * sizeof(__m128i)]);
950 storeu_sse2(h_vecs[2], &out_bytes[4 * sizeof(__m128i)]);
951 storeu_sse2(h_vecs[6], &out_bytes[5 * sizeof(__m128i)]);
952 storeu_sse2(h_vecs[3], &out_bytes[6 * sizeof(__m128i)]);
953 storeu_sse2(h_vecs[7], &out_bytes[7 * sizeof(__m128i)]);
954}
955
956static inline void
957hash_one_sse2(const uint8_t *input, size_t blocks, const uint32_t key[8], uint64_t counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t out_bytes[BLAKE3_OUT_LEN])
958{
959 uint32_t cv[8];
960 uint8_t block_flags;
961
962 memcpy(cv, key, BLAKE3_KEY_LEN);
963 block_flags = flags | flags_start;
964 while (blocks > 0) {
965 if (blocks == 1) {
966 block_flags |= flags_end;
967 }
968 blake3_compress_in_place_sse2(cv, input, BLAKE3_BLOCK_LEN, counter, block_flags);
969 input = &input[BLAKE3_BLOCK_LEN];
970 blocks -= 1;
971 block_flags = flags;
972 }
973 memcpy(out_bytes, cv, BLAKE3_OUT_LEN);
974}
975
976void
977blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out_bytes)
978{
979 while (num_inputs >= DEGREE_SSE2) {
980 blake3_hash4_sse2(inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out_bytes);
981 if (increment_counter) {
982 counter += DEGREE_SSE2;
983 }
984 inputs += DEGREE_SSE2;
985 num_inputs -= DEGREE_SSE2;
986 out_bytes = &out_bytes[DEGREE_SSE2 * BLAKE3_OUT_LEN];
987 }
988 while (num_inputs > 0) {
989 hash_one_sse2(inputs[0], blocks, key, counter, flags, flags_start, flags_end, out_bytes);
990 if (increment_counter) {
991 counter += 1;
992 }
993 inputs += 1;
994 num_inputs -= 1;
995 out_bytes = &out_bytes[BLAKE3_OUT_LEN];
996 }
997}
998#pragma GCC pop_options
999
/*
 * SSE4.1 code path.  The target pragma enables SSE4.1 code generation for
 * the definitions that follow, up to the matching "#pragma GCC pop_options".
 */
#pragma GCC push_options
#pragma GCC target("sse4.1")
/* Number of inputs consumed by one blake3_hash4_sse41 call. */
#define DEGREE_SSE41 4

/*
 * Shuffle the 32-bit lanes of two integer vectors with the single-precision
 * shuffle _mm_shuffle_ps: the operands are cast to float vectors, shuffled
 * with the immediate `c`, and the result is cast back to __m128i.
 */
#define _mm_shuffle_ps2_sse41(a, b, c) \
 (_mm_castps_si128( \
 _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))
1007
/* Load 16 bytes from src into a vector; src need not be aligned. */
static inline __m128i
loadu_sse41(const uint8_t src[16])
{
	const __m128i *from = (const __m128i *)src;

	return _mm_loadu_si128(from);
}
1013
/* Store the 16 bytes of src to dest; dest need not be aligned. */
static inline void
storeu_sse41(__m128i src, uint8_t dest[16])
{
	__m128i *to = (__m128i *)dest;

	_mm_storeu_si128(to, src);
}
1019
/* Lane-wise 32-bit addition (wrapping) of a and b. */
static inline __m128i
addv_sse41(__m128i a, __m128i b)
{
	const __m128i sum = _mm_add_epi32(a, b);

	return sum;
}
1025
/* Bitwise XOR of a and b. */
static inline __m128i
xorv_sse41(__m128i a, __m128i b)
{
	const __m128i mixed = _mm_xor_si128(a, b);

	return mixed;
}
1031
/* Broadcast the 32-bit word x to all four lanes. */
static inline __m128i
set1_sse41(uint32_t x)
{
	const int32_t word = (int32_t)x;

	return _mm_set1_epi32(word);
}
1037
/* Build a vector whose lanes 0..3 hold a, b, c, d in that order. */
static inline __m128i
set4_sse41(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
{
	/* _mm_set_epi32 takes its arguments from the highest lane down */
	return _mm_set_epi32((int32_t)d, (int32_t)c, (int32_t)b, (int32_t)a);
}
1043
/* Rotate every 32-bit lane of x right by 16 bits using a byte shuffle. */
static inline __m128i
rot16_sse41(__m128i x)
{
	/* result byte i is taken from source byte idx[i]; halves of each word swap */
	const __m128i idx = _mm_setr_epi8(2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13);

	return _mm_shuffle_epi8(x, idx);
}
1049
1050static inline __m128i
1051rot12_sse41(__m128i x)
1052{
1053 return xorv_sse41(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12));
1054}
1055
/* Rotate every 32-bit lane of x right by 8 bits using a byte shuffle. */
static inline __m128i
rot8_sse41(__m128i x)
{
	/* result byte i is taken from source byte idx[i] */
	const __m128i idx = _mm_setr_epi8(1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12);

	return _mm_shuffle_epi8(x, idx);
}
1061
1062static inline __m128i
1063rot7_sse41(__m128i x)
1064{
1065 return xorv_sse41(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7));
1066}
1067
1068static inline void
1069g1_sse41(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, __m128i m)
1070{
1071 *row0 = addv_sse41(addv_sse41(*row0, m), *row1);
1072 *row3 = xorv_sse41(*row3, *row0);
1073 *row3 = rot16_sse41(*row3);
1074 *row2 = addv_sse41(*row2, *row3);
1075 *row1 = xorv_sse41(*row1, *row2);
1076 *row1 = rot12_sse41(*row1);
1077}
1078
1079static inline void
1080g2_sse41(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, __m128i m)
1081{
1082 *row0 = addv_sse41(addv_sse41(*row0, m), *row1);
1083 *row3 = xorv_sse41(*row3, *row0);
1084 *row3 = rot8_sse41(*row3);
1085 *row2 = addv_sse41(*row2, *row3);
1086 *row1 = xorv_sse41(*row1, *row2);
1087 *row1 = rot7_sse41(*row1);
1088}
1089
/*
 * Rotate the lanes of three rows so that the diagonals of the 4x4 state
 * line up as columns: row0 moves lane i to lane i+1, row3 swaps its
 * halves, row2 moves lane i to lane i-1 (all modulo 4).
 */
static inline void
diagonalize_sse41(__m128i *row0, __m128i *row2, __m128i *row3)
{
	__m128i a, c, d;

	a = *row0;
	*row0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(2, 1, 0, 3));
	d = *row3;
	*row3 = _mm_shuffle_epi32(d, _MM_SHUFFLE(1, 0, 3, 2));
	c = *row2;
	*row2 = _mm_shuffle_epi32(c, _MM_SHUFFLE(0, 3, 2, 1));
}
1097
/*
 * Inverse of diagonalize_sse41: row0 moves lane i to lane i-1, row3 swaps
 * its halves, row2 moves lane i to lane i+1 (all modulo 4).
 */
static inline void
undiagonalize_sse41(__m128i *row0, __m128i *row2, __m128i *row3)
{
	__m128i a, c, d;

	a = *row0;
	*row0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 3, 2, 1));
	d = *row3;
	*row3 = _mm_shuffle_epi32(d, _MM_SHUFFLE(1, 0, 3, 2));
	c = *row2;
	*row2 = _mm_shuffle_epi32(c, _MM_SHUFFLE(2, 1, 0, 3));
}
1105
1106static inline void
1107compress_pre_sse41(__m128i rows[4], const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
1108{
1109 __m128i m0, m1, m2, m3;
1110 __m128i t0, t1, t2, t3, tt;
1111
1112 rows[0] = loadu_sse41((uint8_t *)&cv[0]);
1113 rows[1] = loadu_sse41((uint8_t *)&cv[4]);
1114 rows[2] = set4_sse41(IV[0], IV[1], IV[2], IV[3]);
1115 rows[3] = set4_sse41(counter_low(counter), counter_high(counter), (uint32_t)block_len, (uint32_t)flags);
1116
1117 m0 = loadu_sse41(&block[sizeof(__m128i) * 0]);
1118 m1 = loadu_sse41(&block[sizeof(__m128i) * 1]);
1119 m2 = loadu_sse41(&block[sizeof(__m128i) * 2]);
1120 m3 = loadu_sse41(&block[sizeof(__m128i) * 3]);
1121
1122 /* round 1 */
1123 t0 = _mm_shuffle_ps2_sse41(m0, m1, _MM_SHUFFLE(2, 0, 2, 0));
1124 g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t0);
1125 t1 = _mm_shuffle_ps2_sse41(m0, m1, _MM_SHUFFLE(3, 1, 3, 1));
1126 g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t1);
1127 diagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1128 t2 = _mm_shuffle_ps2_sse41(m2, m3, _MM_SHUFFLE(2, 0, 2, 0));
1129 t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));
1130 g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t2);
1131 t3 = _mm_shuffle_ps2_sse41(m2, m3, _MM_SHUFFLE(3, 1, 3, 1));
1132 t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3));
1133 g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t3);
1134 undiagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1135 m0 = t0;
1136 m1 = t1;
1137 m2 = t2;
1138 m3 = t3;
1139
1140 /* round 2 */
1141 t0 = _mm_shuffle_ps2_sse41(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
1142 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
1143 g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t0);
1144 t1 = _mm_shuffle_ps2_sse41(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
1145 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
1146 t1 = _mm_blend_epi16(tt, t1, 0xCC);
1147 g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t1);
1148 diagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1149 t2 = _mm_unpacklo_epi64(m3, m1);
1150 tt = _mm_blend_epi16(t2, m2, 0xC0);
1151 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
1152 g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t2);
1153 t3 = _mm_unpackhi_epi32(m1, m3);
1154 tt = _mm_unpacklo_epi32(m2, t3);
1155 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
1156 g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t3);
1157 undiagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1158 m0 = t0;
1159 m1 = t1;
1160 m2 = t2;
1161 m3 = t3;
1162
1163 /* round 3 */
1164 t0 = _mm_shuffle_ps2_sse41(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
1165 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
1166 g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t0);
1167 t1 = _mm_shuffle_ps2_sse41(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
1168 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
1169 t1 = _mm_blend_epi16(tt, t1, 0xCC);
1170 g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t1);
1171 diagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1172 t2 = _mm_unpacklo_epi64(m3, m1);
1173 tt = _mm_blend_epi16(t2, m2, 0xC0);
1174 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
1175 g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t2);
1176 t3 = _mm_unpackhi_epi32(m1, m3);
1177 tt = _mm_unpacklo_epi32(m2, t3);
1178 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
1179 g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t3);
1180 undiagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1181 m0 = t0;
1182 m1 = t1;
1183 m2 = t2;
1184 m3 = t3;
1185
1186 /* round 4 */
1187 t0 = _mm_shuffle_ps2_sse41(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
1188 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
1189 g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t0);
1190 t1 = _mm_shuffle_ps2_sse41(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
1191 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
1192 t1 = _mm_blend_epi16(tt, t1, 0xCC);
1193 g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t1);
1194 diagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1195 t2 = _mm_unpacklo_epi64(m3, m1);
1196 tt = _mm_blend_epi16(t2, m2, 0xC0);
1197 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
1198 g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t2);
1199 t3 = _mm_unpackhi_epi32(m1, m3);
1200 tt = _mm_unpacklo_epi32(m2, t3);
1201 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
1202 g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t3);
1203 undiagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1204 m0 = t0;
1205 m1 = t1;
1206 m2 = t2;
1207 m3 = t3;
1208
1209 /* round 5 */
1210 t0 = _mm_shuffle_ps2_sse41(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
1211 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
1212 g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t0);
1213 t1 = _mm_shuffle_ps2_sse41(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
1214 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
1215 t1 = _mm_blend_epi16(tt, t1, 0xCC);
1216 g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t1);
1217 diagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1218 t2 = _mm_unpacklo_epi64(m3, m1);
1219 tt = _mm_blend_epi16(t2, m2, 0xC0);
1220 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
1221 g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t2);
1222 t3 = _mm_unpackhi_epi32(m1, m3);
1223 tt = _mm_unpacklo_epi32(m2, t3);
1224 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
1225 g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t3);
1226 undiagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1227 m0 = t0;
1228 m1 = t1;
1229 m2 = t2;
1230 m3 = t3;
1231
1232 /* round 6 */
1233 t0 = _mm_shuffle_ps2_sse41(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
1234 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
1235 g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t0);
1236 t1 = _mm_shuffle_ps2_sse41(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
1237 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
1238 t1 = _mm_blend_epi16(tt, t1, 0xCC);
1239 g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t1);
1240 diagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1241 t2 = _mm_unpacklo_epi64(m3, m1);
1242 tt = _mm_blend_epi16(t2, m2, 0xC0);
1243 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
1244 g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t2);
1245 t3 = _mm_unpackhi_epi32(m1, m3);
1246 tt = _mm_unpacklo_epi32(m2, t3);
1247 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
1248 g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t3);
1249 undiagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1250 m0 = t0;
1251 m1 = t1;
1252 m2 = t2;
1253 m3 = t3;
1254
1255 /* round 7 */
1256 t0 = _mm_shuffle_ps2_sse41(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
1257 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
1258 g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t0);
1259 t1 = _mm_shuffle_ps2_sse41(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
1260 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
1261 t1 = _mm_blend_epi16(tt, t1, 0xCC);
1262 g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t1);
1263 diagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1264 t2 = _mm_unpacklo_epi64(m3, m1);
1265 tt = _mm_blend_epi16(t2, m2, 0xC0);
1266 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
1267 g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t2);
1268 t3 = _mm_unpackhi_epi32(m1, m3);
1269 tt = _mm_unpacklo_epi32(m2, t3);
1270 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
1271 g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t3);
1272 undiagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1273}
1274
1275void
1276blake3_compress_in_place_sse41(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
1277{
1278 __m128i rows[4];
1279
1280 compress_pre_sse41(rows, cv, block, block_len, counter, flags);
1281 storeu_sse41(xorv_sse41(rows[0], rows[2]), (uint8_t *)&cv[0]);
1282 storeu_sse41(xorv_sse41(rows[1], rows[3]), (uint8_t *)&cv[4]);
1283}
1284
1285void
1286blake3_compress_xof_sse41(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out_buf[64])
1287{
1288 __m128i rows[4];
1289
1290 compress_pre_sse41(rows, cv, block, block_len, counter, flags);
1291 storeu_sse41(xorv_sse41(rows[0], rows[2]), &out_buf[0]);
1292 storeu_sse41(xorv_sse41(rows[1], rows[3]), &out_buf[16]);
1293 storeu_sse41(xorv_sse41(rows[2], loadu_sse41((uint8_t *)&cv[0])), &out_buf[32]);
1294 storeu_sse41(xorv_sse41(rows[3], loadu_sse41((uint8_t *)&cv[4])), &out_buf[48]);
1295}
1296
1297static inline void
1298round_fn_sse41(__m128i v[16], __m128i m[16], size_t r)
1299{
1300 v[0] = addv_sse41(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
1301 v[1] = addv_sse41(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
1302 v[2] = addv_sse41(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
1303 v[3] = addv_sse41(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
1304 v[0] = addv_sse41(v[0], v[4]);
1305 v[1] = addv_sse41(v[1], v[5]);
1306 v[2] = addv_sse41(v[2], v[6]);
1307 v[3] = addv_sse41(v[3], v[7]);
1308 v[12] = xorv_sse41(v[12], v[0]);
1309 v[13] = xorv_sse41(v[13], v[1]);
1310 v[14] = xorv_sse41(v[14], v[2]);
1311 v[15] = xorv_sse41(v[15], v[3]);
1312 v[12] = rot16_sse41(v[12]);
1313 v[13] = rot16_sse41(v[13]);
1314 v[14] = rot16_sse41(v[14]);
1315 v[15] = rot16_sse41(v[15]);
1316 v[8] = addv_sse41(v[8], v[12]);
1317 v[9] = addv_sse41(v[9], v[13]);
1318 v[10] = addv_sse41(v[10], v[14]);
1319 v[11] = addv_sse41(v[11], v[15]);
1320 v[4] = xorv_sse41(v[4], v[8]);
1321 v[5] = xorv_sse41(v[5], v[9]);
1322 v[6] = xorv_sse41(v[6], v[10]);
1323 v[7] = xorv_sse41(v[7], v[11]);
1324 v[4] = rot12_sse41(v[4]);
1325 v[5] = rot12_sse41(v[5]);
1326 v[6] = rot12_sse41(v[6]);
1327 v[7] = rot12_sse41(v[7]);
1328 v[0] = addv_sse41(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
1329 v[1] = addv_sse41(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
1330 v[2] = addv_sse41(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
1331 v[3] = addv_sse41(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
1332 v[0] = addv_sse41(v[0], v[4]);
1333 v[1] = addv_sse41(v[1], v[5]);
1334 v[2] = addv_sse41(v[2], v[6]);
1335 v[3] = addv_sse41(v[3], v[7]);
1336 v[12] = xorv_sse41(v[12], v[0]);
1337 v[13] = xorv_sse41(v[13], v[1]);
1338 v[14] = xorv_sse41(v[14], v[2]);
1339 v[15] = xorv_sse41(v[15], v[3]);
1340 v[12] = rot8_sse41(v[12]);
1341 v[13] = rot8_sse41(v[13]);
1342 v[14] = rot8_sse41(v[14]);
1343 v[15] = rot8_sse41(v[15]);
1344 v[8] = addv_sse41(v[8], v[12]);
1345 v[9] = addv_sse41(v[9], v[13]);
1346 v[10] = addv_sse41(v[10], v[14]);
1347 v[11] = addv_sse41(v[11], v[15]);
1348 v[4] = xorv_sse41(v[4], v[8]);
1349 v[5] = xorv_sse41(v[5], v[9]);
1350 v[6] = xorv_sse41(v[6], v[10]);
1351 v[7] = xorv_sse41(v[7], v[11]);
1352 v[4] = rot7_sse41(v[4]);
1353 v[5] = rot7_sse41(v[5]);
1354 v[6] = rot7_sse41(v[6]);
1355 v[7] = rot7_sse41(v[7]);
1356
1357 v[0] = addv_sse41(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
1358 v[1] = addv_sse41(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
1359 v[2] = addv_sse41(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
1360 v[3] = addv_sse41(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
1361 v[0] = addv_sse41(v[0], v[5]);
1362 v[1] = addv_sse41(v[1], v[6]);
1363 v[2] = addv_sse41(v[2], v[7]);
1364 v[3] = addv_sse41(v[3], v[4]);
1365 v[15] = xorv_sse41(v[15], v[0]);
1366 v[12] = xorv_sse41(v[12], v[1]);
1367 v[13] = xorv_sse41(v[13], v[2]);
1368 v[14] = xorv_sse41(v[14], v[3]);
1369 v[15] = rot16_sse41(v[15]);
1370 v[12] = rot16_sse41(v[12]);
1371 v[13] = rot16_sse41(v[13]);
1372 v[14] = rot16_sse41(v[14]);
1373 v[10] = addv_sse41(v[10], v[15]);
1374 v[11] = addv_sse41(v[11], v[12]);
1375 v[8] = addv_sse41(v[8], v[13]);
1376 v[9] = addv_sse41(v[9], v[14]);
1377 v[5] = xorv_sse41(v[5], v[10]);
1378 v[6] = xorv_sse41(v[6], v[11]);
1379 v[7] = xorv_sse41(v[7], v[8]);
1380 v[4] = xorv_sse41(v[4], v[9]);
1381 v[5] = rot12_sse41(v[5]);
1382 v[6] = rot12_sse41(v[6]);
1383 v[7] = rot12_sse41(v[7]);
1384 v[4] = rot12_sse41(v[4]);
1385 v[0] = addv_sse41(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
1386 v[1] = addv_sse41(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
1387 v[2] = addv_sse41(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
1388 v[3] = addv_sse41(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
1389 v[0] = addv_sse41(v[0], v[5]);
1390 v[1] = addv_sse41(v[1], v[6]);
1391 v[2] = addv_sse41(v[2], v[7]);
1392 v[3] = addv_sse41(v[3], v[4]);
1393 v[15] = xorv_sse41(v[15], v[0]);
1394 v[12] = xorv_sse41(v[12], v[1]);
1395 v[13] = xorv_sse41(v[13], v[2]);
1396 v[14] = xorv_sse41(v[14], v[3]);
1397 v[15] = rot8_sse41(v[15]);
1398 v[12] = rot8_sse41(v[12]);
1399 v[13] = rot8_sse41(v[13]);
1400 v[14] = rot8_sse41(v[14]);
1401 v[10] = addv_sse41(v[10], v[15]);
1402 v[11] = addv_sse41(v[11], v[12]);
1403 v[8] = addv_sse41(v[8], v[13]);
1404 v[9] = addv_sse41(v[9], v[14]);
1405 v[5] = xorv_sse41(v[5], v[10]);
1406 v[6] = xorv_sse41(v[6], v[11]);
1407 v[7] = xorv_sse41(v[7], v[8]);
1408 v[4] = xorv_sse41(v[4], v[9]);
1409 v[5] = rot7_sse41(v[5]);
1410 v[6] = rot7_sse41(v[6]);
1411 v[7] = rot7_sse41(v[7]);
1412 v[4] = rot7_sse41(v[4]);
1413}
1414
1415static inline void
1416transpose_vecs_sse41(__m128i vecs[DEGREE_SSE41])
1417{
1418 __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
1419 __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
1420 __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
1421 __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
1422
1423 __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
1424 __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
1425 __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
1426 __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);
1427
1428 vecs[0] = abcd_0;
1429 vecs[1] = abcd_1;
1430 vecs[2] = abcd_2;
1431 vecs[3] = abcd_3;
1432}
1433
1434static inline void
1435transpose_msg_vecs_sse41(const uint8_t *const *inputs, size_t block_offset, __m128i out_msg[16])
1436{
1437 size_t i;
1438
1439 out_msg[0] = loadu_sse41(&inputs[0][block_offset + 0 * sizeof(__m128i)]);
1440 out_msg[1] = loadu_sse41(&inputs[1][block_offset + 0 * sizeof(__m128i)]);
1441 out_msg[2] = loadu_sse41(&inputs[2][block_offset + 0 * sizeof(__m128i)]);
1442 out_msg[3] = loadu_sse41(&inputs[3][block_offset + 0 * sizeof(__m128i)]);
1443 out_msg[4] = loadu_sse41(&inputs[0][block_offset + 1 * sizeof(__m128i)]);
1444 out_msg[5] = loadu_sse41(&inputs[1][block_offset + 1 * sizeof(__m128i)]);
1445 out_msg[6] = loadu_sse41(&inputs[2][block_offset + 1 * sizeof(__m128i)]);
1446 out_msg[7] = loadu_sse41(&inputs[3][block_offset + 1 * sizeof(__m128i)]);
1447 out_msg[8] = loadu_sse41(&inputs[0][block_offset + 2 * sizeof(__m128i)]);
1448 out_msg[9] = loadu_sse41(&inputs[1][block_offset + 2 * sizeof(__m128i)]);
1449 out_msg[10] = loadu_sse41(&inputs[2][block_offset + 2 * sizeof(__m128i)]);
1450 out_msg[11] = loadu_sse41(&inputs[3][block_offset + 2 * sizeof(__m128i)]);
1451 out_msg[12] = loadu_sse41(&inputs[0][block_offset + 3 * sizeof(__m128i)]);
1452 out_msg[13] = loadu_sse41(&inputs[1][block_offset + 3 * sizeof(__m128i)]);
1453 out_msg[14] = loadu_sse41(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
1454 out_msg[15] = loadu_sse41(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
1455
1456 for (i = 0; i < 4; i++) {
1457 _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
1458 }
1459 transpose_vecs_sse41(&out_msg[0]);
1460 transpose_vecs_sse41(&out_msg[4]);
1461 transpose_vecs_sse41(&out_msg[8]);
1462 transpose_vecs_sse41(&out_msg[12]);
1463}
1464
/*
 * Build the per-lane 64-bit counters for a 4-way hash, split into their
 * low and high 32-bit halves.
 *
 * counter            counter value for lane 0
 * increment_counter  zero: every lane receives `counter`;
 *                    non-zero: lane i receives `counter + i`
 * out_lo, out_hi     receive the low / high 32 bits of each lane's counter
 */
static inline void
load_counters_sse41(uint64_t counter, int increment_counter, __m128i *out_lo, __m128i *out_hi)
{
	/*
	 * Normalise the flag to 0/1 before negating it.  The callers test it
	 * with plain truthiness, so any non-zero value has to yield an
	 * all-ones mask; negating e.g. 2 directly would clear bit 0 of the
	 * lane offsets.
	 */
	const __m128i mask = _mm_set1_epi32(-(int32_t)(increment_counter != 0));
	const __m128i lane = _mm_and_si128(mask, _mm_set_epi32(3, 2, 1, 0));
	const __m128i sign = _mm_set1_epi32(INT32_MIN);
	const __m128i lo = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), lane);
	/*
	 * The low half wrapped iff lo < lane as unsigned values.  The compare
	 * is signed, so flip the sign bits of both operands first.
	 */
	const __m128i wrapped = _mm_cmpgt_epi32(_mm_xor_si128(lane, sign),
	                                        _mm_xor_si128(lo, sign));

	*out_lo = lo;
	/* wrapped lanes hold -1, so subtracting them adds the carry */
	*out_hi = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), wrapped);
}
1479
1480void
1481blake3_hash4_sse41(const uint8_t *const *inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out_bytes)
1482{
1483 __m128i h_vecs[8];
1484 __m128i counter_low_vec, counter_high_vec;
1485 uint8_t block_flags;
1486 size_t block;
1487
1488 h_vecs[0] = set1_sse41(key[0]);
1489 h_vecs[1] = set1_sse41(key[1]);
1490 h_vecs[2] = set1_sse41(key[2]);
1491 h_vecs[3] = set1_sse41(key[3]);
1492 h_vecs[4] = set1_sse41(key[4]);
1493 h_vecs[5] = set1_sse41(key[5]);
1494 h_vecs[6] = set1_sse41(key[6]);
1495 h_vecs[7] = set1_sse41(key[7]);
1496
1497 load_counters_sse41(counter, increment_counter, &counter_low_vec, &counter_high_vec);
1498 block_flags = flags | flags_start;
1499
1500 for (block = 0; block < blocks; block++) {
1501 __m128i block_len_vec;
1502 __m128i block_flags_vec;
1503 __m128i msg_vecs[16];
1504 __m128i v[16];
1505
1506 if (block + 1 == blocks) {
1507 block_flags |= flags_end;
1508 }
1509 block_len_vec = set1_sse41(BLAKE3_BLOCK_LEN);
1510 block_flags_vec = set1_sse41(block_flags);
1511 transpose_msg_vecs_sse41(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
1512
1513 v[0] = h_vecs[0];
1514 v[1] = h_vecs[1];
1515 v[2] = h_vecs[2];
1516 v[3] = h_vecs[3];
1517 v[4] = h_vecs[4];
1518 v[5] = h_vecs[5];
1519 v[6] = h_vecs[6];
1520 v[7] = h_vecs[7];
1521 v[8] = set1_sse41(IV[0]);
1522 v[9] = set1_sse41(IV[1]);
1523 v[10] = set1_sse41(IV[2]);
1524 v[11] = set1_sse41(IV[3]);
1525 v[12] = counter_low_vec;
1526 v[13] = counter_high_vec;
1527 v[14] = block_len_vec;
1528 v[15] = block_flags_vec;
1529
1530 round_fn_sse41(v, msg_vecs, 0);
1531 round_fn_sse41(v, msg_vecs, 1);
1532 round_fn_sse41(v, msg_vecs, 2);
1533 round_fn_sse41(v, msg_vecs, 3);
1534 round_fn_sse41(v, msg_vecs, 4);
1535 round_fn_sse41(v, msg_vecs, 5);
1536 round_fn_sse41(v, msg_vecs, 6);
1537
1538 h_vecs[0] = xorv_sse41(v[0], v[8]);
1539 h_vecs[1] = xorv_sse41(v[1], v[9]);
1540 h_vecs[2] = xorv_sse41(v[2], v[10]);
1541 h_vecs[3] = xorv_sse41(v[3], v[11]);
1542 h_vecs[4] = xorv_sse41(v[4], v[12]);
1543 h_vecs[5] = xorv_sse41(v[5], v[13]);
1544 h_vecs[6] = xorv_sse41(v[6], v[14]);
1545 h_vecs[7] = xorv_sse41(v[7], v[15]);
1546
1547 block_flags = flags;
1548 }
1549
1550 transpose_vecs_sse41(&h_vecs[0]);
1551 transpose_vecs_sse41(&h_vecs[4]);
1552 storeu_sse41(h_vecs[0], &out_bytes[0 * sizeof(__m128i)]);
1553 storeu_sse41(h_vecs[4], &out_bytes[1 * sizeof(__m128i)]);
1554 storeu_sse41(h_vecs[1], &out_bytes[2 * sizeof(__m128i)]);
1555 storeu_sse41(h_vecs[5], &out_bytes[3 * sizeof(__m128i)]);
1556 storeu_sse41(h_vecs[2], &out_bytes[4 * sizeof(__m128i)]);
1557 storeu_sse41(h_vecs[6], &out_bytes[5 * sizeof(__m128i)]);
1558 storeu_sse41(h_vecs[3], &out_bytes[6 * sizeof(__m128i)]);
1559 storeu_sse41(h_vecs[7], &out_bytes[7 * sizeof(__m128i)]);
1560}
1561
1562static inline void
1563hash_one_sse41(const uint8_t *input, size_t blocks, const uint32_t key[8], uint64_t counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t out_bytes[BLAKE3_OUT_LEN])
1564{
1565 uint32_t cv[8];
1566 uint8_t block_flags;
1567
1568 memcpy(cv, key, BLAKE3_KEY_LEN);
1569 block_flags = flags | flags_start;
1570 while (blocks > 0) {
1571 if (blocks == 1) {
1572 block_flags |= flags_end;
1573 }
1574 blake3_compress_in_place_sse41(cv, input, BLAKE3_BLOCK_LEN, counter, block_flags);
1575 input = &input[BLAKE3_BLOCK_LEN];
1576 blocks -= 1;
1577 block_flags = flags;
1578 }
1579 memcpy(out_bytes, cv, BLAKE3_OUT_LEN);
1580}
1581
1582void
1583blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out_bytes)
1584{
1585 while (num_inputs >= DEGREE_SSE41) {
1586 blake3_hash4_sse41(inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out_bytes);
1587 if (increment_counter) {
1588 counter += DEGREE_SSE41;
1589 }
1590 inputs += DEGREE_SSE41;
1591 num_inputs -= DEGREE_SSE41;
1592 out_bytes = &out_bytes[DEGREE_SSE41 * BLAKE3_OUT_LEN];
1593 }
1594 while (num_inputs > 0) {
1595 hash_one_sse41(inputs[0], blocks, key, counter, flags, flags_start, flags_end, out_bytes);
1596 if (increment_counter) {
1597 counter += 1;
1598 }
1599 inputs += 1;
1600 num_inputs -= 1;
1601 out_bytes = &out_bytes[BLAKE3_OUT_LEN];
1602 }
1603}
1604#pragma GCC pop_options
1605
/*
 * AVX2 code path.  The target pragma enables AVX2 code generation for the
 * definitions that follow, up to the matching "#pragma GCC pop_options".
 */
#pragma GCC push_options
#pragma GCC target("avx2")
/*
 * Number of vectors handled by transpose_vecs_avx2; presumably also the
 * number of inputs hashed in parallel by this path -- TODO confirm against
 * the AVX2 hash function.
 */
#define DEGREE_AVX2 8
1609
/* Load 32 bytes from src into a vector; src need not be aligned. */
static inline __m256i
loadu_avx2(const uint8_t src[32])
{
	const __m256i *from = (const __m256i *)src;

	return _mm256_loadu_si256(from);
}
1615
/* Store the 32 bytes of src to dest; dest need not be aligned. */
static inline void
storeu_avx2(__m256i src, uint8_t dest[32])
{
	__m256i *to = (__m256i *)dest;

	_mm256_storeu_si256(to, src);
}
1621
/* Lane-wise 32-bit addition (wrapping) of a and b. */
static inline __m256i
addv_avx2(__m256i a, __m256i b)
{
	const __m256i sum = _mm256_add_epi32(a, b);

	return sum;
}
1627
/* Bitwise XOR of a and b. */
static inline __m256i
xorv_avx2(__m256i a, __m256i b)
{
	const __m256i mixed = _mm256_xor_si256(a, b);

	return mixed;
}
1633
/* Broadcast the 32-bit word x to all eight lanes. */
static inline __m256i
set1_avx2(uint32_t x)
{
	const int32_t word = (int32_t)x;

	return _mm256_set1_epi32(word);
}
1639
/* Rotate every 32-bit lane of x right by 16 bits using a byte shuffle. */
static inline __m256i
rot16_avx2(__m256i x)
{
	/*
	 * result byte i is taken from source byte idx[i]; the same pattern is
	 * given for both 128-bit halves
	 */
	const __m256i idx = _mm256_setr_epi8(2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13,
	                                     2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13);

	return _mm256_shuffle_epi8(x, idx);
}
1646
/* Rotate every 32-bit lane of x right by 12 bits. */
static inline __m256i
rot12_avx2(__m256i x)
{
	const __m256i low = _mm256_srli_epi32(x, 12);
	const __m256i high = _mm256_slli_epi32(x, 32 - 12);

	return _mm256_or_si256(low, high);
}
1652
/* Rotate every 32-bit lane of x right by 8 bits using a byte shuffle. */
static inline __m256i
rot8_avx2(__m256i x)
{
	/*
	 * result byte i is taken from source byte idx[i]; the same pattern is
	 * given for both 128-bit halves
	 */
	const __m256i idx = _mm256_setr_epi8(1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12,
	                                     1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12);

	return _mm256_shuffle_epi8(x, idx);
}
1659
/* Rotate every 32-bit lane of x right by 7 bits. */
static inline __m256i
rot7_avx2(__m256i x)
{
	const __m256i low = _mm256_srli_epi32(x, 7);
	const __m256i high = _mm256_slli_epi32(x, 32 - 7);

	return _mm256_or_si256(low, high);
}
1665
1666static inline void
1667round_fn_avx2(__m256i v[16], __m256i m[16], size_t r)
1668{
1669 v[0] = addv_avx2(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
1670 v[1] = addv_avx2(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
1671 v[2] = addv_avx2(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
1672 v[3] = addv_avx2(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
1673 v[0] = addv_avx2(v[0], v[4]);
1674 v[1] = addv_avx2(v[1], v[5]);
1675 v[2] = addv_avx2(v[2], v[6]);
1676 v[3] = addv_avx2(v[3], v[7]);
1677 v[12] = xorv_avx2(v[12], v[0]);
1678 v[13] = xorv_avx2(v[13], v[1]);
1679 v[14] = xorv_avx2(v[14], v[2]);
1680 v[15] = xorv_avx2(v[15], v[3]);
1681 v[12] = rot16_avx2(v[12]);
1682 v[13] = rot16_avx2(v[13]);
1683 v[14] = rot16_avx2(v[14]);
1684 v[15] = rot16_avx2(v[15]);
1685 v[8] = addv_avx2(v[8], v[12]);
1686 v[9] = addv_avx2(v[9], v[13]);
1687 v[10] = addv_avx2(v[10], v[14]);
1688 v[11] = addv_avx2(v[11], v[15]);
1689 v[4] = xorv_avx2(v[4], v[8]);
1690 v[5] = xorv_avx2(v[5], v[9]);
1691 v[6] = xorv_avx2(v[6], v[10]);
1692 v[7] = xorv_avx2(v[7], v[11]);
1693 v[4] = rot12_avx2(v[4]);
1694 v[5] = rot12_avx2(v[5]);
1695 v[6] = rot12_avx2(v[6]);
1696 v[7] = rot12_avx2(v[7]);
1697 v[0] = addv_avx2(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
1698 v[1] = addv_avx2(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
1699 v[2] = addv_avx2(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
1700 v[3] = addv_avx2(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
1701 v[0] = addv_avx2(v[0], v[4]);
1702 v[1] = addv_avx2(v[1], v[5]);
1703 v[2] = addv_avx2(v[2], v[6]);
1704 v[3] = addv_avx2(v[3], v[7]);
1705 v[12] = xorv_avx2(v[12], v[0]);
1706 v[13] = xorv_avx2(v[13], v[1]);
1707 v[14] = xorv_avx2(v[14], v[2]);
1708 v[15] = xorv_avx2(v[15], v[3]);
1709 v[12] = rot8_avx2(v[12]);
1710 v[13] = rot8_avx2(v[13]);
1711 v[14] = rot8_avx2(v[14]);
1712 v[15] = rot8_avx2(v[15]);
1713 v[8] = addv_avx2(v[8], v[12]);
1714 v[9] = addv_avx2(v[9], v[13]);
1715 v[10] = addv_avx2(v[10], v[14]);
1716 v[11] = addv_avx2(v[11], v[15]);
1717 v[4] = xorv_avx2(v[4], v[8]);
1718 v[5] = xorv_avx2(v[5], v[9]);
1719 v[6] = xorv_avx2(v[6], v[10]);
1720 v[7] = xorv_avx2(v[7], v[11]);
1721 v[4] = rot7_avx2(v[4]);
1722 v[5] = rot7_avx2(v[5]);
1723 v[6] = rot7_avx2(v[6]);
1724 v[7] = rot7_avx2(v[7]);
1725
1726 v[0] = addv_avx2(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
1727 v[1] = addv_avx2(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
1728 v[2] = addv_avx2(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
1729 v[3] = addv_avx2(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
1730 v[0] = addv_avx2(v[0], v[5]);
1731 v[1] = addv_avx2(v[1], v[6]);
1732 v[2] = addv_avx2(v[2], v[7]);
1733 v[3] = addv_avx2(v[3], v[4]);
1734 v[15] = xorv_avx2(v[15], v[0]);
1735 v[12] = xorv_avx2(v[12], v[1]);
1736 v[13] = xorv_avx2(v[13], v[2]);
1737 v[14] = xorv_avx2(v[14], v[3]);
1738 v[15] = rot16_avx2(v[15]);
1739 v[12] = rot16_avx2(v[12]);
1740 v[13] = rot16_avx2(v[13]);
1741 v[14] = rot16_avx2(v[14]);
1742 v[10] = addv_avx2(v[10], v[15]);
1743 v[11] = addv_avx2(v[11], v[12]);
1744 v[8] = addv_avx2(v[8], v[13]);
1745 v[9] = addv_avx2(v[9], v[14]);
1746 v[5] = xorv_avx2(v[5], v[10]);
1747 v[6] = xorv_avx2(v[6], v[11]);
1748 v[7] = xorv_avx2(v[7], v[8]);
1749 v[4] = xorv_avx2(v[4], v[9]);
1750 v[5] = rot12_avx2(v[5]);
1751 v[6] = rot12_avx2(v[6]);
1752 v[7] = rot12_avx2(v[7]);
1753 v[4] = rot12_avx2(v[4]);
1754 v[0] = addv_avx2(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
1755 v[1] = addv_avx2(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
1756 v[2] = addv_avx2(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
1757 v[3] = addv_avx2(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
1758 v[0] = addv_avx2(v[0], v[5]);
1759 v[1] = addv_avx2(v[1], v[6]);
1760 v[2] = addv_avx2(v[2], v[7]);
1761 v[3] = addv_avx2(v[3], v[4]);
1762 v[15] = xorv_avx2(v[15], v[0]);
1763 v[12] = xorv_avx2(v[12], v[1]);
1764 v[13] = xorv_avx2(v[13], v[2]);
1765 v[14] = xorv_avx2(v[14], v[3]);
1766 v[15] = rot8_avx2(v[15]);
1767 v[12] = rot8_avx2(v[12]);
1768 v[13] = rot8_avx2(v[13]);
1769 v[14] = rot8_avx2(v[14]);
1770 v[10] = addv_avx2(v[10], v[15]);
1771 v[11] = addv_avx2(v[11], v[12]);
1772 v[8] = addv_avx2(v[8], v[13]);
1773 v[9] = addv_avx2(v[9], v[14]);
1774 v[5] = xorv_avx2(v[5], v[10]);
1775 v[6] = xorv_avx2(v[6], v[11]);
1776 v[7] = xorv_avx2(v[7], v[8]);
1777 v[4] = xorv_avx2(v[4], v[9]);
1778 v[5] = rot7_avx2(v[5]);
1779 v[6] = rot7_avx2(v[6]);
1780 v[7] = rot7_avx2(v[7]);
1781 v[4] = rot7_avx2(v[4]);
1782}
1783
1784static inline void
1785transpose_vecs_avx2(__m256i vecs[DEGREE_AVX2])
1786{
1787 __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]);
1788 __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]);
1789 __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]);
1790 __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]);
1791 __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]);
1792 __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]);
1793 __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
1794 __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);
1795
1796 __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
1797 __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
1798 __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367);
1799 __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367);
1800 __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145);
1801 __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145);
1802 __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367);
1803 __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367);
1804
1805 vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20);
1806 vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20);
1807 vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20);
1808 vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20);
1809 vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31);
1810 vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31);
1811 vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31);
1812 vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31);
1813}
1814
1815static inline void
1816transpose_msg_vecs_avx2(const uint8_t *const *inputs, size_t block_offset, __m256i out_msg[16])
1817{
1818 size_t i;
1819
1820 out_msg[0] = loadu_avx2(&inputs[0][block_offset + 0 * sizeof(__m256i)]);
1821 out_msg[1] = loadu_avx2(&inputs[1][block_offset + 0 * sizeof(__m256i)]);
1822 out_msg[2] = loadu_avx2(&inputs[2][block_offset + 0 * sizeof(__m256i)]);
1823 out_msg[3] = loadu_avx2(&inputs[3][block_offset + 0 * sizeof(__m256i)]);
1824 out_msg[4] = loadu_avx2(&inputs[4][block_offset + 0 * sizeof(__m256i)]);
1825 out_msg[5] = loadu_avx2(&inputs[5][block_offset + 0 * sizeof(__m256i)]);
1826 out_msg[6] = loadu_avx2(&inputs[6][block_offset + 0 * sizeof(__m256i)]);
1827 out_msg[7] = loadu_avx2(&inputs[7][block_offset + 0 * sizeof(__m256i)]);
1828 out_msg[8] = loadu_avx2(&inputs[0][block_offset + 1 * sizeof(__m256i)]);
1829 out_msg[9] = loadu_avx2(&inputs[1][block_offset + 1 * sizeof(__m256i)]);
1830 out_msg[10] = loadu_avx2(&inputs[2][block_offset + 1 * sizeof(__m256i)]);
1831 out_msg[11] = loadu_avx2(&inputs[3][block_offset + 1 * sizeof(__m256i)]);
1832 out_msg[12] = loadu_avx2(&inputs[4][block_offset + 1 * sizeof(__m256i)]);
1833 out_msg[13] = loadu_avx2(&inputs[5][block_offset + 1 * sizeof(__m256i)]);
1834 out_msg[14] = loadu_avx2(&inputs[6][block_offset + 1 * sizeof(__m256i)]);
1835 out_msg[15] = loadu_avx2(&inputs[7][block_offset + 1 * sizeof(__m256i)]);
1836
1837 for (i = 0; i < 8; i++) {
1838 _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
1839 }
1840 transpose_vecs_avx2(&out_msg[0]);
1841 transpose_vecs_avx2(&out_msg[8]);
1842}
1843
/*
 * Build the per-lane 64-bit block counters for eight parallel inputs,
 * split into low and high 32-bit halves.
 *
 * counter:           counter value for lane 0.
 * increment_counter: zero gives every lane the same counter; any
 *                    non-zero value gives lane i the value counter + i.
 * out_lo / out_hi:   receive the low / high 32 bits of each lane.
 */
static inline void
load_counters_avx2(uint64_t counter, int increment_counter, __m256i *out_lo, __m256i *out_hi)
{
	/*
	 * Normalise the flag to 0 or 1 before negating so that every truthy
	 * value produces an all-ones mask; plain -increment_counter is only
	 * all-ones when the flag is exactly 1.
	 */
	const __m256i mask = _mm256_set1_epi32(-(int32_t)(increment_counter != 0));
	const __m256i add0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
	const __m256i add1 = _mm256_and_si256(mask, add0);
	/* Flipping the sign bit turns the signed compare into an unsigned one. */
	const __m256i sign = _mm256_set1_epi32(INT32_MIN);
	__m256i l = _mm256_add_epi32(_mm256_set1_epi32((int32_t)counter), add1);
	/* The low half wrapped exactly when the sum is (unsigned) below the addend. */
	__m256i carry = _mm256_cmpgt_epi32(_mm256_xor_si256(add1, sign), _mm256_xor_si256(l, sign));
	/* carry is -1 in wrapped lanes, so subtracting it adds one to the high half. */
	__m256i h = _mm256_sub_epi32(_mm256_set1_epi32((int32_t)(counter >> 32)), carry);

	*out_lo = l;
	*out_hi = h;
}
1859
1860void
1861blake3_hash8_avx2(const uint8_t *const *inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out_bytes)
1862{
1863 __m256i h_vecs[8];
1864 __m256i counter_low_vec, counter_high_vec;
1865 uint8_t block_flags;
1866 size_t block;
1867
1868 h_vecs[0] = set1_avx2(key[0]);
1869 h_vecs[1] = set1_avx2(key[1]);
1870 h_vecs[2] = set1_avx2(key[2]);
1871 h_vecs[3] = set1_avx2(key[3]);
1872 h_vecs[4] = set1_avx2(key[4]);
1873 h_vecs[5] = set1_avx2(key[5]);
1874 h_vecs[6] = set1_avx2(key[6]);
1875 h_vecs[7] = set1_avx2(key[7]);
1876
1877 load_counters_avx2(counter, increment_counter, &counter_low_vec, &counter_high_vec);
1878 block_flags = flags | flags_start;
1879
1880 for (block = 0; block < blocks; block++) {
1881 __m256i block_len_vec;
1882 __m256i block_flags_vec;
1883 __m256i msg_vecs[16];
1884 __m256i v[16];
1885
1886 if (block + 1 == blocks) {
1887 block_flags |= flags_end;
1888 }
1889 block_len_vec = set1_avx2(BLAKE3_BLOCK_LEN);
1890 block_flags_vec = set1_avx2(block_flags);
1891 transpose_msg_vecs_avx2(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
1892
1893 v[0] = h_vecs[0];
1894 v[1] = h_vecs[1];
1895 v[2] = h_vecs[2];
1896 v[3] = h_vecs[3];
1897 v[4] = h_vecs[4];
1898 v[5] = h_vecs[5];
1899 v[6] = h_vecs[6];
1900 v[7] = h_vecs[7];
1901 v[8] = set1_avx2(IV[0]);
1902 v[9] = set1_avx2(IV[1]);
1903 v[10] = set1_avx2(IV[2]);
1904 v[11] = set1_avx2(IV[3]);
1905 v[12] = counter_low_vec;
1906 v[13] = counter_high_vec;
1907 v[14] = block_len_vec;
1908 v[15] = block_flags_vec;
1909
1910 round_fn_avx2(v, msg_vecs, 0);
1911 round_fn_avx2(v, msg_vecs, 1);
1912 round_fn_avx2(v, msg_vecs, 2);
1913 round_fn_avx2(v, msg_vecs, 3);
1914 round_fn_avx2(v, msg_vecs, 4);
1915 round_fn_avx2(v, msg_vecs, 5);
1916 round_fn_avx2(v, msg_vecs, 6);
1917
1918 h_vecs[0] = xorv_avx2(v[0], v[8]);
1919 h_vecs[1] = xorv_avx2(v[1], v[9]);
1920 h_vecs[2] = xorv_avx2(v[2], v[10]);
1921 h_vecs[3] = xorv_avx2(v[3], v[11]);
1922 h_vecs[4] = xorv_avx2(v[4], v[12]);
1923 h_vecs[5] = xorv_avx2(v[5], v[13]);
1924 h_vecs[6] = xorv_avx2(v[6], v[14]);
1925 h_vecs[7] = xorv_avx2(v[7], v[15]);
1926
1927 block_flags = flags;
1928 }
1929
1930 transpose_vecs_avx2(h_vecs);
1931 storeu_avx2(h_vecs[0], &out_bytes[0 * sizeof(__m256i)]);
1932 storeu_avx2(h_vecs[1], &out_bytes[1 * sizeof(__m256i)]);
1933 storeu_avx2(h_vecs[2], &out_bytes[2 * sizeof(__m256i)]);
1934 storeu_avx2(h_vecs[3], &out_bytes[3 * sizeof(__m256i)]);
1935 storeu_avx2(h_vecs[4], &out_bytes[4 * sizeof(__m256i)]);
1936 storeu_avx2(h_vecs[5], &out_bytes[5 * sizeof(__m256i)]);
1937 storeu_avx2(h_vecs[6], &out_bytes[6 * sizeof(__m256i)]);
1938 storeu_avx2(h_vecs[7], &out_bytes[7 * sizeof(__m256i)]);
1939}
1940
1941void
1942blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out_bytes)
1943{
1944 while (num_inputs >= DEGREE_AVX2) {
1945 blake3_hash8_avx2(inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out_bytes);
1946 if (increment_counter) {
1947 counter += DEGREE_AVX2;
1948 }
1949 inputs += DEGREE_AVX2;
1950 num_inputs -= DEGREE_AVX2;
1951 out_bytes = &out_bytes[DEGREE_AVX2 * BLAKE3_OUT_LEN];
1952 }
1953 blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out_bytes);
1954}
1955#pragma GCC pop_options
1956
1957#pragma GCC push_options
1958#pragma GCC target("avx512f,avx512vl")
/* Load 16 bytes from src; no alignment is required. */
static inline __m128i
loadu_128_avx512(const uint8_t src[16])
{
	const __m128i *p = (const __m128i *)src;

	return _mm_loadu_si128(p);
}
1964
/* Load 32 bytes from src; no alignment is required. */
static inline __m256i
loadu_256_avx512(const uint8_t src[32])
{
	const __m256i *p = (const __m256i *)src;

	return _mm256_loadu_si256(p);
}
1970
/* Load 64 bytes from src; no alignment is required. */
static inline __m512i
loadu_512_avx512(const uint8_t src[64])
{
	const __m512i *p = (const __m512i *)src;

	return _mm512_loadu_si512(p);
}
1976
/* Store the 16 bytes of src to dest; no alignment is required. */
static inline void
storeu_128_avx512(__m128i src, uint8_t dest[16])
{
	__m128i *p = (__m128i *)dest;

	_mm_storeu_si128(p, src);
}
1982
/* Store the 32 bytes of src to dest; no alignment is required. */
static inline void
storeu_256_avx512(__m256i src, uint8_t dest[32])
{
	__m256i *p = (__m256i *)dest;

	_mm256_storeu_si256(p, src);
}
1988
/* Lane-wise wrapping addition of four 32-bit words. */
static inline __m128i
add_128_avx512(__m128i a, __m128i b)
{
	const __m128i sum = _mm_add_epi32(a, b);

	return sum;
}
1994
/* Lane-wise wrapping addition of eight 32-bit words. */
static inline __m256i
add_256_avx512(__m256i a, __m256i b)
{
	const __m256i sum = _mm256_add_epi32(a, b);

	return sum;
}
2000
/* Lane-wise wrapping addition of sixteen 32-bit words. */
static inline __m512i
add_512_avx512(__m512i a, __m512i b)
{
	const __m512i sum = _mm512_add_epi32(a, b);

	return sum;
}
2006
/* Bitwise XOR of two 128-bit vectors. */
static inline __m128i
xor_128_avx512(__m128i a, __m128i b)
{
	const __m128i mixed = _mm_xor_si128(a, b);

	return mixed;
}
2012
/* Bitwise XOR of two 256-bit vectors. */
static inline __m256i
xor_256_avx512(__m256i a, __m256i b)
{
	const __m256i mixed = _mm256_xor_si256(a, b);

	return mixed;
}
2018
/* Bitwise XOR of two 512-bit vectors. */
static inline __m512i
xor_512_avx512(__m512i a, __m512i b)
{
	const __m512i mixed = _mm512_xor_si512(a, b);

	return mixed;
}
2024
/* Broadcast the 32-bit value x to all four lanes. */
static inline __m128i
set1_128_avx512(uint32_t x)
{
	const int32_t word = (int32_t)x;

	return _mm_set1_epi32(word);
}
2030
/* Broadcast the 32-bit value x to all eight lanes. */
static inline __m256i
set1_256_avx512(uint32_t x)
{
	const int32_t word = (int32_t)x;

	return _mm256_set1_epi32(word);
}
2036
/* Broadcast the 32-bit value x to all sixteen lanes. */
static inline __m512i
set1_512_avx512(uint32_t x)
{
	const int32_t word = (int32_t)x;

	return _mm512_set1_epi32(word);
}
2042
/* Pack four 32-bit values into one vector; a lands in the lowest lane. */
static inline __m128i
set4_avx512(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
{
	const int32_t w0 = (int32_t)a;
	const int32_t w1 = (int32_t)b;
	const int32_t w2 = (int32_t)c;
	const int32_t w3 = (int32_t)d;

	return _mm_setr_epi32(w0, w1, w2, w3);
}
2048
/* Rotate each of the four 32-bit lanes right by 16 bits. */
static inline __m128i
rot16_128_avx512(__m128i x)
{
	const __m128i rotated = _mm_ror_epi32(x, 16);

	return rotated;
}
2054
/* Rotate each of the eight 32-bit lanes right by 16 bits. */
static inline __m256i
rot16_256_avx512(__m256i x)
{
	const __m256i rotated = _mm256_ror_epi32(x, 16);

	return rotated;
}
2060
/* Rotate each of the sixteen 32-bit lanes right by 16 bits. */
static inline __m512i
rot16_512_avx512(__m512i x)
{
	const __m512i rotated = _mm512_ror_epi32(x, 16);

	return rotated;
}
2066
/* Rotate each of the four 32-bit lanes right by 12 bits. */
static inline __m128i
rot12_128_avx512(__m128i x)
{
	const __m128i rotated = _mm_ror_epi32(x, 12);

	return rotated;
}
2072
/* Rotate each of the eight 32-bit lanes right by 12 bits. */
static inline __m256i
rot12_256_avx512(__m256i x)
{
	const __m256i rotated = _mm256_ror_epi32(x, 12);

	return rotated;
}
2078
/* Rotate each of the sixteen 32-bit lanes right by 12 bits. */
static inline __m512i
rot12_512_avx512(__m512i x)
{
	const __m512i rotated = _mm512_ror_epi32(x, 12);

	return rotated;
}
2084
/* Rotate each of the four 32-bit lanes right by 8 bits. */
static inline __m128i
rot8_128_avx512(__m128i x)
{
	const __m128i rotated = _mm_ror_epi32(x, 8);

	return rotated;
}
2090
/* Rotate each of the eight 32-bit lanes right by 8 bits. */
static inline __m256i
rot8_256_avx512(__m256i x)
{
	const __m256i rotated = _mm256_ror_epi32(x, 8);

	return rotated;
}
2096
/* Rotate each of the sixteen 32-bit lanes right by 8 bits. */
static inline __m512i
rot8_512_avx512(__m512i x)
{
	const __m512i rotated = _mm512_ror_epi32(x, 8);

	return rotated;
}
2102
/* Rotate each of the four 32-bit lanes right by 7 bits. */
static inline __m128i
rot7_128_avx512(__m128i x)
{
	const __m128i rotated = _mm_ror_epi32(x, 7);

	return rotated;
}
2108
/* Rotate each of the eight 32-bit lanes right by 7 bits. */
static inline __m256i
rot7_256_avx512(__m256i x)
{
	const __m256i rotated = _mm256_ror_epi32(x, 7);

	return rotated;
}
2114
/* Rotate each of the sixteen 32-bit lanes right by 7 bits. */
static inline __m512i
rot7_512_avx512(__m512i x)
{
	const __m512i rotated = _mm512_ror_epi32(x, 7);

	return rotated;
}
2120
2121static inline void
2122g1_avx512(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, __m128i m)
2123{
2124 *row0 = add_128_avx512(add_128_avx512(*row0, m), *row1);
2125 *row3 = xor_128_avx512(*row3, *row0);
2126 *row3 = rot16_128_avx512(*row3);
2127 *row2 = add_128_avx512(*row2, *row3);
2128 *row1 = xor_128_avx512(*row1, *row2);
2129 *row1 = rot12_128_avx512(*row1);
2130}
2131
2132static inline void
2133g2_avx512(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, __m128i m)
2134{
2135 *row0 = add_128_avx512(add_128_avx512(*row0, m), *row1);
2136 *row3 = xor_128_avx512(*row3, *row0);
2137 *row3 = rot8_128_avx512(*row3);
2138 *row2 = add_128_avx512(*row2, *row3);
2139 *row1 = xor_128_avx512(*row1, *row2);
2140 *row1 = rot7_128_avx512(*row1);
2141}
2142
/*
 * Rotate the lanes of rows 0, 3 and 2 so that the state diagonals line
 * up as columns: row0 moves up one lane (lane 3 wraps to lane 0), row3
 * by two lanes, and row2 by three lanes. Row 1 stays where it is,
 * which is why it is not a parameter.
 */
static inline void
diagonalize_avx512(__m128i *row0, __m128i *row2, __m128i *row3)
{
	const __m128i r0 = *row0;
	const __m128i r3 = *row3;
	const __m128i r2 = *row2;

	*row0 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(2, 1, 0, 3));
	*row3 = _mm_shuffle_epi32(r3, _MM_SHUFFLE(1, 0, 3, 2));
	*row2 = _mm_shuffle_epi32(r2, _MM_SHUFFLE(0, 3, 2, 1));
}
2150
/*
 * Inverse of diagonalize_avx512: rotate the lanes of rows 0, 3 and 2
 * back so that the columns are columns again.
 */
static inline void
undiagonalize_avx512(__m128i *row0, __m128i *row2, __m128i *row3)
{
	const __m128i r0 = *row0;
	const __m128i r3 = *row3;
	const __m128i r2 = *row2;

	*row0 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(0, 3, 2, 1));
	*row3 = _mm_shuffle_epi32(r3, _MM_SHUFFLE(1, 0, 3, 2));
	*row2 = _mm_shuffle_epi32(r2, _MM_SHUFFLE(2, 1, 0, 3));
}
2158
2159static inline void
2160compress_pre_avx512(__m128i rows[4], const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
2161{
2162 __m128i m0, m1, m2, m3;
2163 __m128i t0, t1, t2, t3;
2164
2165 rows[0] = loadu_128_avx512((const uint8_t *)&cv[0]);
2166 rows[1] = loadu_128_avx512((const uint8_t *)&cv[4]);
2167 rows[2] = set4_avx512(IV[0], IV[1], IV[2], IV[3]);
2168 rows[3] = set4_avx512(counter_low(counter), counter_high(counter), (uint32_t)block_len, (uint32_t)flags);
2169
2170 m0 = loadu_128_avx512(&block[sizeof(__m128i) * 0]);
2171 m1 = loadu_128_avx512(&block[sizeof(__m128i) * 1]);
2172 m2 = loadu_128_avx512(&block[sizeof(__m128i) * 2]);
2173 m3 = loadu_128_avx512(&block[sizeof(__m128i) * 3]);
2174
2175 /* round 1 */
2176 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0));
2177 g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t0);
2178 t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1));
2179 g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t1);
2180 diagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2181 t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0));
2182 t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));
2183 g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t2);
2184 t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1));
2185 t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3));
2186 g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t3);
2187 undiagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2188 m0 = t0;
2189 m1 = t1;
2190 m2 = t2;
2191 m3 = t3;
2192
2193 /* round 2 */
2194 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
2195 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
2196 g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t0);
2197 t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
2198 t1 = _mm_blend_epi16(m0, t1, 0xCC);
2199 g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t1);
2200 diagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2201 t2 = _mm_unpacklo_epi64(m3, m1);
2202 t2 = _mm_blend_epi16(t2, m2, 0xC0);
2203 t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1, 3, 2, 0));
2204 g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t2);
2205 t3 = _mm_unpackhi_epi32(m1, m3);
2206 t3 = _mm_unpacklo_epi32(m2, t3);
2207 t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(0, 1, 3, 2));
2208 g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t3);
2209 undiagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2210 m0 = t0;
2211 m1 = t1;
2212 m2 = t2;
2213 m3 = t3;
2214
2215 /* round 3 */
2216 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
2217 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
2218 g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t0);
2219 t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
2220 t1 = _mm_blend_epi16(m0, t1, 0xCC);
2221 g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t1);
2222 diagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2223 t2 = _mm_unpacklo_epi64(m3, m1);
2224 t2 = _mm_blend_epi16(t2, m2, 0xC0);
2225 t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1, 3, 2, 0));
2226 g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t2);
2227 t3 = _mm_unpackhi_epi32(m1, m3);
2228 t3 = _mm_unpacklo_epi32(m2, t3);
2229 t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(0, 1, 3, 2));
2230 g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t3);
2231 undiagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2232 m0 = t0;
2233 m1 = t1;
2234 m2 = t2;
2235 m3 = t3;
2236
2237 /* round 4 */
2238 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
2239 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
2240 g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t0);
2241 t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
2242 t1 = _mm_blend_epi16(m0, t1, 0xCC);
2243 g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t1);
2244 diagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2245 t2 = _mm_unpacklo_epi64(m3, m1);
2246 t2 = _mm_blend_epi16(t2, m2, 0xC0);
2247 t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1, 3, 2, 0));
2248 g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t2);
2249 t3 = _mm_unpackhi_epi32(m1, m3);
2250 t3 = _mm_unpacklo_epi32(m2, t3);
2251 t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(0, 1, 3, 2));
2252 g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t3);
2253 undiagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2254 m0 = t0;
2255 m1 = t1;
2256 m2 = t2;
2257 m3 = t3;
2258
2259 /* round 5 */
2260 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
2261 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
2262 g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t0);
2263 t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
2264 t1 = _mm_blend_epi16(m0, t1, 0xCC);
2265 g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t1);
2266 diagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2267 t2 = _mm_unpacklo_epi64(m3, m1);
2268 t2 = _mm_blend_epi16(t2, m2, 0xC0);
2269 t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1, 3, 2, 0));
2270 g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t2);
2271 t3 = _mm_unpackhi_epi32(m1, m3);
2272 t3 = _mm_unpacklo_epi32(m2, t3);
2273 t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(0, 1, 3, 2));
2274 g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t3);
2275 undiagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2276 m0 = t0;
2277 m1 = t1;
2278 m2 = t2;
2279 m3 = t3;
2280
2281 /* round 6 */
2282 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
2283 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
2284 g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t0);
2285 t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
2286 t1 = _mm_blend_epi16(m0, t1, 0xCC);
2287 g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t1);
2288 diagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2289 t2 = _mm_unpacklo_epi64(m3, m1);
2290 t2 = _mm_blend_epi16(t2, m2, 0xC0);
2291 t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1, 3, 2, 0));
2292 g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t2);
2293 t3 = _mm_unpackhi_epi32(m1, m3);
2294 t3 = _mm_unpacklo_epi32(m2, t3);
2295 t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(0, 1, 3, 2));
2296 g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t3);
2297 undiagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2298 m0 = t0;
2299 m1 = t1;
2300 m2 = t2;
2301 m3 = t3;
2302
2303 /* round 7 */
2304 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
2305 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
2306 g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t0);
2307 t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
2308 t1 = _mm_blend_epi16(m0, t1, 0xCC);
2309 g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t1);
2310 diagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2311 t2 = _mm_unpacklo_epi64(m3, m1);
2312 t2 = _mm_blend_epi16(t2, m2, 0xC0);
2313 t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1, 3, 2, 0));
2314 g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t2);
2315 t3 = _mm_unpackhi_epi32(m1, m3);
2316 t3 = _mm_unpacklo_epi32(m2, t3);
2317 t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(0, 1, 3, 2));
2318 g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t3);
2319 undiagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2320}
2321
2322void
2323blake3_compress_in_place_avx512(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
2324{
2325 __m128i rows[4];
2326
2327 compress_pre_avx512(rows, cv, block, block_len, counter, flags);
2328 storeu_128_avx512(xor_128_avx512(rows[0], rows[2]), (uint8_t *)&cv[0]);
2329 storeu_128_avx512(xor_128_avx512(rows[1], rows[3]), (uint8_t *)&cv[4]);
2330}
2331
2332void
2333blake3_compress_xof_avx512(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out_buf[64])
2334{
2335 __m128i rows[4];
2336
2337 compress_pre_avx512(rows, cv, block, block_len, counter, flags);
2338 storeu_128_avx512(xor_128_avx512(rows[0], rows[2]), &out_buf[0]);
2339 storeu_128_avx512(xor_128_avx512(rows[1], rows[3]), &out_buf[16]);
2340 storeu_128_avx512(xor_128_avx512(rows[2], loadu_128_avx512((const uint8_t *)&cv[0])), &out_buf[32]);
2341 storeu_128_avx512(xor_128_avx512(rows[3], loadu_128_avx512((const uint8_t *)&cv[4])), &out_buf[48]);
2342}
2343
2344static inline void
2345round_fn4_avx512(__m128i v[16], __m128i m[16], size_t r)
2346{
2347 v[0] = add_128_avx512(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
2348 v[1] = add_128_avx512(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
2349 v[2] = add_128_avx512(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
2350 v[3] = add_128_avx512(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
2351 v[0] = add_128_avx512(v[0], v[4]);
2352 v[1] = add_128_avx512(v[1], v[5]);
2353 v[2] = add_128_avx512(v[2], v[6]);
2354 v[3] = add_128_avx512(v[3], v[7]);
2355 v[12] = xor_128_avx512(v[12], v[0]);
2356 v[13] = xor_128_avx512(v[13], v[1]);
2357 v[14] = xor_128_avx512(v[14], v[2]);
2358 v[15] = xor_128_avx512(v[15], v[3]);
2359 v[12] = rot16_128_avx512(v[12]);
2360 v[13] = rot16_128_avx512(v[13]);
2361 v[14] = rot16_128_avx512(v[14]);
2362 v[15] = rot16_128_avx512(v[15]);
2363 v[8] = add_128_avx512(v[8], v[12]);
2364 v[9] = add_128_avx512(v[9], v[13]);
2365 v[10] = add_128_avx512(v[10], v[14]);
2366 v[11] = add_128_avx512(v[11], v[15]);
2367 v[4] = xor_128_avx512(v[4], v[8]);
2368 v[5] = xor_128_avx512(v[5], v[9]);
2369 v[6] = xor_128_avx512(v[6], v[10]);
2370 v[7] = xor_128_avx512(v[7], v[11]);
2371 v[4] = rot12_128_avx512(v[4]);
2372 v[5] = rot12_128_avx512(v[5]);
2373 v[6] = rot12_128_avx512(v[6]);
2374 v[7] = rot12_128_avx512(v[7]);
2375 v[0] = add_128_avx512(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
2376 v[1] = add_128_avx512(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
2377 v[2] = add_128_avx512(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
2378 v[3] = add_128_avx512(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
2379 v[0] = add_128_avx512(v[0], v[4]);
2380 v[1] = add_128_avx512(v[1], v[5]);
2381 v[2] = add_128_avx512(v[2], v[6]);
2382 v[3] = add_128_avx512(v[3], v[7]);
2383 v[12] = xor_128_avx512(v[12], v[0]);
2384 v[13] = xor_128_avx512(v[13], v[1]);
2385 v[14] = xor_128_avx512(v[14], v[2]);
2386 v[15] = xor_128_avx512(v[15], v[3]);
2387 v[12] = rot8_128_avx512(v[12]);
2388 v[13] = rot8_128_avx512(v[13]);
2389 v[14] = rot8_128_avx512(v[14]);
2390 v[15] = rot8_128_avx512(v[15]);
2391 v[8] = add_128_avx512(v[8], v[12]);
2392 v[9] = add_128_avx512(v[9], v[13]);
2393 v[10] = add_128_avx512(v[10], v[14]);
2394 v[11] = add_128_avx512(v[11], v[15]);
2395 v[4] = xor_128_avx512(v[4], v[8]);
2396 v[5] = xor_128_avx512(v[5], v[9]);
2397 v[6] = xor_128_avx512(v[6], v[10]);
2398 v[7] = xor_128_avx512(v[7], v[11]);
2399 v[4] = rot7_128_avx512(v[4]);
2400 v[5] = rot7_128_avx512(v[5]);
2401 v[6] = rot7_128_avx512(v[6]);
2402 v[7] = rot7_128_avx512(v[7]);
2403
2404 v[0] = add_128_avx512(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
2405 v[1] = add_128_avx512(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
2406 v[2] = add_128_avx512(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
2407 v[3] = add_128_avx512(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
2408 v[0] = add_128_avx512(v[0], v[5]);
2409 v[1] = add_128_avx512(v[1], v[6]);
2410 v[2] = add_128_avx512(v[2], v[7]);
2411 v[3] = add_128_avx512(v[3], v[4]);
2412 v[15] = xor_128_avx512(v[15], v[0]);
2413 v[12] = xor_128_avx512(v[12], v[1]);
2414 v[13] = xor_128_avx512(v[13], v[2]);
2415 v[14] = xor_128_avx512(v[14], v[3]);
2416 v[15] = rot16_128_avx512(v[15]);
2417 v[12] = rot16_128_avx512(v[12]);
2418 v[13] = rot16_128_avx512(v[13]);
2419 v[14] = rot16_128_avx512(v[14]);
2420 v[10] = add_128_avx512(v[10], v[15]);
2421 v[11] = add_128_avx512(v[11], v[12]);
2422 v[8] = add_128_avx512(v[8], v[13]);
2423 v[9] = add_128_avx512(v[9], v[14]);
2424 v[5] = xor_128_avx512(v[5], v[10]);
2425 v[6] = xor_128_avx512(v[6], v[11]);
2426 v[7] = xor_128_avx512(v[7], v[8]);
2427 v[4] = xor_128_avx512(v[4], v[9]);
2428 v[5] = rot12_128_avx512(v[5]);
2429 v[6] = rot12_128_avx512(v[6]);
2430 v[7] = rot12_128_avx512(v[7]);
2431 v[4] = rot12_128_avx512(v[4]);
2432 v[0] = add_128_avx512(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
2433 v[1] = add_128_avx512(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
2434 v[2] = add_128_avx512(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
2435 v[3] = add_128_avx512(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
2436 v[0] = add_128_avx512(v[0], v[5]);
2437 v[1] = add_128_avx512(v[1], v[6]);
2438 v[2] = add_128_avx512(v[2], v[7]);
2439 v[3] = add_128_avx512(v[3], v[4]);
2440 v[15] = xor_128_avx512(v[15], v[0]);
2441 v[12] = xor_128_avx512(v[12], v[1]);
2442 v[13] = xor_128_avx512(v[13], v[2]);
2443 v[14] = xor_128_avx512(v[14], v[3]);
2444 v[15] = rot8_128_avx512(v[15]);
2445 v[12] = rot8_128_avx512(v[12]);
2446 v[13] = rot8_128_avx512(v[13]);
2447 v[14] = rot8_128_avx512(v[14]);
2448 v[10] = add_128_avx512(v[10], v[15]);
2449 v[11] = add_128_avx512(v[11], v[12]);
2450 v[8] = add_128_avx512(v[8], v[13]);
2451 v[9] = add_128_avx512(v[9], v[14]);
2452 v[5] = xor_128_avx512(v[5], v[10]);
2453 v[6] = xor_128_avx512(v[6], v[11]);
2454 v[7] = xor_128_avx512(v[7], v[8]);
2455 v[4] = xor_128_avx512(v[4], v[9]);
2456 v[5] = rot7_128_avx512(v[5]);
2457 v[6] = rot7_128_avx512(v[6]);
2458 v[7] = rot7_128_avx512(v[7]);
2459 v[4] = rot7_128_avx512(v[4]);
2460}
2461
2462static inline void
2463round_fn8_avx512(__m256i v[16], __m256i m[16], size_t r)
2464{
2465 v[0] = add_256_avx512(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
2466 v[1] = add_256_avx512(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
2467 v[2] = add_256_avx512(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
2468 v[3] = add_256_avx512(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
2469 v[0] = add_256_avx512(v[0], v[4]);
2470 v[1] = add_256_avx512(v[1], v[5]);
2471 v[2] = add_256_avx512(v[2], v[6]);
2472 v[3] = add_256_avx512(v[3], v[7]);
2473 v[12] = xor_256_avx512(v[12], v[0]);
2474 v[13] = xor_256_avx512(v[13], v[1]);
2475 v[14] = xor_256_avx512(v[14], v[2]);
2476 v[15] = xor_256_avx512(v[15], v[3]);
2477 v[12] = rot16_256_avx512(v[12]);
2478 v[13] = rot16_256_avx512(v[13]);
2479 v[14] = rot16_256_avx512(v[14]);
2480 v[15] = rot16_256_avx512(v[15]);
2481 v[8] = add_256_avx512(v[8], v[12]);
2482 v[9] = add_256_avx512(v[9], v[13]);
2483 v[10] = add_256_avx512(v[10], v[14]);
2484 v[11] = add_256_avx512(v[11], v[15]);
2485 v[4] = xor_256_avx512(v[4], v[8]);
2486 v[5] = xor_256_avx512(v[5], v[9]);
2487 v[6] = xor_256_avx512(v[6], v[10]);
2488 v[7] = xor_256_avx512(v[7], v[11]);
2489 v[4] = rot12_256_avx512(v[4]);
2490 v[5] = rot12_256_avx512(v[5]);
2491 v[6] = rot12_256_avx512(v[6]);
2492 v[7] = rot12_256_avx512(v[7]);
2493 v[0] = add_256_avx512(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
2494 v[1] = add_256_avx512(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
2495 v[2] = add_256_avx512(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
2496 v[3] = add_256_avx512(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
2497 v[0] = add_256_avx512(v[0], v[4]);
2498 v[1] = add_256_avx512(v[1], v[5]);
2499 v[2] = add_256_avx512(v[2], v[6]);
2500 v[3] = add_256_avx512(v[3], v[7]);
2501 v[12] = xor_256_avx512(v[12], v[0]);
2502 v[13] = xor_256_avx512(v[13], v[1]);
2503 v[14] = xor_256_avx512(v[14], v[2]);
2504 v[15] = xor_256_avx512(v[15], v[3]);
2505 v[12] = rot8_256_avx512(v[12]);
2506 v[13] = rot8_256_avx512(v[13]);
2507 v[14] = rot8_256_avx512(v[14]);
2508 v[15] = rot8_256_avx512(v[15]);
2509 v[8] = add_256_avx512(v[8], v[12]);
2510 v[9] = add_256_avx512(v[9], v[13]);
2511 v[10] = add_256_avx512(v[10], v[14]);
2512 v[11] = add_256_avx512(v[11], v[15]);
2513 v[4] = xor_256_avx512(v[4], v[8]);
2514 v[5] = xor_256_avx512(v[5], v[9]);
2515 v[6] = xor_256_avx512(v[6], v[10]);
2516 v[7] = xor_256_avx512(v[7], v[11]);
2517 v[4] = rot7_256_avx512(v[4]);
2518 v[5] = rot7_256_avx512(v[5]);
2519 v[6] = rot7_256_avx512(v[6]);
2520 v[7] = rot7_256_avx512(v[7]);
2521
2522 v[0] = add_256_avx512(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
2523 v[1] = add_256_avx512(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
2524 v[2] = add_256_avx512(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
2525 v[3] = add_256_avx512(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
2526 v[0] = add_256_avx512(v[0], v[5]);
2527 v[1] = add_256_avx512(v[1], v[6]);
2528 v[2] = add_256_avx512(v[2], v[7]);
2529 v[3] = add_256_avx512(v[3], v[4]);
2530 v[15] = xor_256_avx512(v[15], v[0]);
2531 v[12] = xor_256_avx512(v[12], v[1]);
2532 v[13] = xor_256_avx512(v[13], v[2]);
2533 v[14] = xor_256_avx512(v[14], v[3]);
2534 v[15] = rot16_256_avx512(v[15]);
2535 v[12] = rot16_256_avx512(v[12]);
2536 v[13] = rot16_256_avx512(v[13]);
2537 v[14] = rot16_256_avx512(v[14]);
2538 v[10] = add_256_avx512(v[10], v[15]);
2539 v[11] = add_256_avx512(v[11], v[12]);
2540 v[8] = add_256_avx512(v[8], v[13]);
2541 v[9] = add_256_avx512(v[9], v[14]);
2542 v[5] = xor_256_avx512(v[5], v[10]);
2543 v[6] = xor_256_avx512(v[6], v[11]);
2544 v[7] = xor_256_avx512(v[7], v[8]);
2545 v[4] = xor_256_avx512(v[4], v[9]);
2546 v[5] = rot12_256_avx512(v[5]);
2547 v[6] = rot12_256_avx512(v[6]);
2548 v[7] = rot12_256_avx512(v[7]);
2549 v[4] = rot12_256_avx512(v[4]);
2550 v[0] = add_256_avx512(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
2551 v[1] = add_256_avx512(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
2552 v[2] = add_256_avx512(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
2553 v[3] = add_256_avx512(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
2554 v[0] = add_256_avx512(v[0], v[5]);
2555 v[1] = add_256_avx512(v[1], v[6]);
2556 v[2] = add_256_avx512(v[2], v[7]);
2557 v[3] = add_256_avx512(v[3], v[4]);
2558 v[15] = xor_256_avx512(v[15], v[0]);
2559 v[12] = xor_256_avx512(v[12], v[1]);
2560 v[13] = xor_256_avx512(v[13], v[2]);
2561 v[14] = xor_256_avx512(v[14], v[3]);
2562 v[15] = rot8_256_avx512(v[15]);
2563 v[12] = rot8_256_avx512(v[12]);
2564 v[13] = rot8_256_avx512(v[13]);
2565 v[14] = rot8_256_avx512(v[14]);
2566 v[10] = add_256_avx512(v[10], v[15]);
2567 v[11] = add_256_avx512(v[11], v[12]);
2568 v[8] = add_256_avx512(v[8], v[13]);
2569 v[9] = add_256_avx512(v[9], v[14]);
2570 v[5] = xor_256_avx512(v[5], v[10]);
2571 v[6] = xor_256_avx512(v[6], v[11]);
2572 v[7] = xor_256_avx512(v[7], v[8]);
2573 v[4] = xor_256_avx512(v[4], v[9]);
2574 v[5] = rot7_256_avx512(v[5]);
2575 v[6] = rot7_256_avx512(v[6]);
2576 v[7] = rot7_256_avx512(v[7]);
2577 v[4] = rot7_256_avx512(v[4]);
2578}
2579
/*
 * Apply one BLAKE3 round to a 16-word state held in sixteen vectors.
 *
 * v: the sixteen state words; updated in place.
 * m: the sixteen message words, in the same layout as v.
 * r: round number; selects the row of MSG_SCHEDULE that decides which
 *    message word feeds each step.
 *
 * The first half mixes the four columns (v[i], v[i+4], v[i+8], v[i+12]),
 * the second half mixes the four diagonals.  Within each half the four
 * independent mixes are interleaved statement by statement.  The
 * add/xor/rotN helpers are defined elsewhere in the file; presumably they
 * operate on each 32-bit lane (rotN rotating right by N bits).
 */
static inline void
round_fn16_avx512(__m512i v[16], __m512i m[16], size_t r)
{
	/* column step, first half: message words at schedule slots 0,2,4,6 */
	v[0] = add_512_avx512(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
	v[1] = add_512_avx512(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
	v[2] = add_512_avx512(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
	v[3] = add_512_avx512(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
	v[0] = add_512_avx512(v[0], v[4]);
	v[1] = add_512_avx512(v[1], v[5]);
	v[2] = add_512_avx512(v[2], v[6]);
	v[3] = add_512_avx512(v[3], v[7]);
	v[12] = xor_512_avx512(v[12], v[0]);
	v[13] = xor_512_avx512(v[13], v[1]);
	v[14] = xor_512_avx512(v[14], v[2]);
	v[15] = xor_512_avx512(v[15], v[3]);
	v[12] = rot16_512_avx512(v[12]);
	v[13] = rot16_512_avx512(v[13]);
	v[14] = rot16_512_avx512(v[14]);
	v[15] = rot16_512_avx512(v[15]);
	v[8] = add_512_avx512(v[8], v[12]);
	v[9] = add_512_avx512(v[9], v[13]);
	v[10] = add_512_avx512(v[10], v[14]);
	v[11] = add_512_avx512(v[11], v[15]);
	v[4] = xor_512_avx512(v[4], v[8]);
	v[5] = xor_512_avx512(v[5], v[9]);
	v[6] = xor_512_avx512(v[6], v[10]);
	v[7] = xor_512_avx512(v[7], v[11]);
	v[4] = rot12_512_avx512(v[4]);
	v[5] = rot12_512_avx512(v[5]);
	v[6] = rot12_512_avx512(v[6]);
	v[7] = rot12_512_avx512(v[7]);
	/* column step, second half: message words at schedule slots 1,3,5,7 */
	v[0] = add_512_avx512(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
	v[1] = add_512_avx512(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
	v[2] = add_512_avx512(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
	v[3] = add_512_avx512(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
	v[0] = add_512_avx512(v[0], v[4]);
	v[1] = add_512_avx512(v[1], v[5]);
	v[2] = add_512_avx512(v[2], v[6]);
	v[3] = add_512_avx512(v[3], v[7]);
	v[12] = xor_512_avx512(v[12], v[0]);
	v[13] = xor_512_avx512(v[13], v[1]);
	v[14] = xor_512_avx512(v[14], v[2]);
	v[15] = xor_512_avx512(v[15], v[3]);
	v[12] = rot8_512_avx512(v[12]);
	v[13] = rot8_512_avx512(v[13]);
	v[14] = rot8_512_avx512(v[14]);
	v[15] = rot8_512_avx512(v[15]);
	v[8] = add_512_avx512(v[8], v[12]);
	v[9] = add_512_avx512(v[9], v[13]);
	v[10] = add_512_avx512(v[10], v[14]);
	v[11] = add_512_avx512(v[11], v[15]);
	v[4] = xor_512_avx512(v[4], v[8]);
	v[5] = xor_512_avx512(v[5], v[9]);
	v[6] = xor_512_avx512(v[6], v[10]);
	v[7] = xor_512_avx512(v[7], v[11]);
	v[4] = rot7_512_avx512(v[4]);
	v[5] = rot7_512_avx512(v[5]);
	v[6] = rot7_512_avx512(v[6]);
	v[7] = rot7_512_avx512(v[7]);

	/*
	 * diagonal step, first half: message words at schedule slots
	 * 8,10,12,14; the diagonals are (0,5,10,15), (1,6,11,12),
	 * (2,7,8,13) and (3,4,9,14)
	 */
	v[0] = add_512_avx512(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
	v[1] = add_512_avx512(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
	v[2] = add_512_avx512(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
	v[3] = add_512_avx512(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
	v[0] = add_512_avx512(v[0], v[5]);
	v[1] = add_512_avx512(v[1], v[6]);
	v[2] = add_512_avx512(v[2], v[7]);
	v[3] = add_512_avx512(v[3], v[4]);
	v[15] = xor_512_avx512(v[15], v[0]);
	v[12] = xor_512_avx512(v[12], v[1]);
	v[13] = xor_512_avx512(v[13], v[2]);
	v[14] = xor_512_avx512(v[14], v[3]);
	v[15] = rot16_512_avx512(v[15]);
	v[12] = rot16_512_avx512(v[12]);
	v[13] = rot16_512_avx512(v[13]);
	v[14] = rot16_512_avx512(v[14]);
	v[10] = add_512_avx512(v[10], v[15]);
	v[11] = add_512_avx512(v[11], v[12]);
	v[8] = add_512_avx512(v[8], v[13]);
	v[9] = add_512_avx512(v[9], v[14]);
	v[5] = xor_512_avx512(v[5], v[10]);
	v[6] = xor_512_avx512(v[6], v[11]);
	v[7] = xor_512_avx512(v[7], v[8]);
	v[4] = xor_512_avx512(v[4], v[9]);
	v[5] = rot12_512_avx512(v[5]);
	v[6] = rot12_512_avx512(v[6]);
	v[7] = rot12_512_avx512(v[7]);
	v[4] = rot12_512_avx512(v[4]);
	/* diagonal step, second half: message words at schedule slots 9,11,13,15 */
	v[0] = add_512_avx512(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
	v[1] = add_512_avx512(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
	v[2] = add_512_avx512(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
	v[3] = add_512_avx512(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
	v[0] = add_512_avx512(v[0], v[5]);
	v[1] = add_512_avx512(v[1], v[6]);
	v[2] = add_512_avx512(v[2], v[7]);
	v[3] = add_512_avx512(v[3], v[4]);
	v[15] = xor_512_avx512(v[15], v[0]);
	v[12] = xor_512_avx512(v[12], v[1]);
	v[13] = xor_512_avx512(v[13], v[2]);
	v[14] = xor_512_avx512(v[14], v[3]);
	v[15] = rot8_512_avx512(v[15]);
	v[12] = rot8_512_avx512(v[12]);
	v[13] = rot8_512_avx512(v[13]);
	v[14] = rot8_512_avx512(v[14]);
	v[10] = add_512_avx512(v[10], v[15]);
	v[11] = add_512_avx512(v[11], v[12]);
	v[8] = add_512_avx512(v[8], v[13]);
	v[9] = add_512_avx512(v[9], v[14]);
	v[5] = xor_512_avx512(v[5], v[10]);
	v[6] = xor_512_avx512(v[6], v[11]);
	v[7] = xor_512_avx512(v[7], v[8]);
	v[4] = xor_512_avx512(v[4], v[9]);
	v[5] = rot7_512_avx512(v[5]);
	v[6] = rot7_512_avx512(v[6]);
	v[7] = rot7_512_avx512(v[7]);
	v[4] = rot7_512_avx512(v[4]);
}
2697
/*
 * Immediates for _mm512_shuffle_i32x4(a, b, imm8).  Each 2-bit field of
 * imm8 selects one 128-bit lane; the two low fields index into a, the
 * two high fields index into b:
 *   LO_IMM8 = 0b10_00_10_00 -> { a[0], a[2], b[0], b[2] }
 *   HI_IMM8 = 0b11_01_11_01 -> { a[1], a[3], b[1], b[3] }
 */
#define LO_IMM8 0x88
#define HI_IMM8 0xdd
2700
2701static inline __m512i
2702unpack_lo_128_avx512(__m512i a, __m512i b)
2703{
2704 return _mm512_shuffle_i32x4(a, b, LO_IMM8);
2705}
2706
2707static inline __m512i
2708unpack_hi_128_avx512(__m512i a, __m512i b)
2709{
2710 return _mm512_shuffle_i32x4(a, b, HI_IMM8);
2711}
2712
/*
 * Transpose, in place, the 16x16 matrix of 32-bit words stored in
 * vecs[0..15] (one row per vector).  The work is done in four shuffle
 * stages of increasing granularity: 32-bit interleave of adjacent rows,
 * 64-bit interleave of those pairs, then two rounds of 128-bit lane
 * selection via unpack_lo_128_avx512/unpack_hi_128_avx512.
 */
static inline void
transpose_vecs_512_avx512(__m512i vecs[16])
{
	/* stage 1: interleave 32-bit words of row pairs (0,1), (2,3), ... */
	__m512i ab_0 = _mm512_unpacklo_epi32(vecs[0], vecs[1]);
	__m512i ab_2 = _mm512_unpackhi_epi32(vecs[0], vecs[1]);
	__m512i cd_0 = _mm512_unpacklo_epi32(vecs[2], vecs[3]);
	__m512i cd_2 = _mm512_unpackhi_epi32(vecs[2], vecs[3]);
	__m512i ef_0 = _mm512_unpacklo_epi32(vecs[4], vecs[5]);
	__m512i ef_2 = _mm512_unpackhi_epi32(vecs[4], vecs[5]);
	__m512i gh_0 = _mm512_unpacklo_epi32(vecs[6], vecs[7]);
	__m512i gh_2 = _mm512_unpackhi_epi32(vecs[6], vecs[7]);
	__m512i ij_0 = _mm512_unpacklo_epi32(vecs[8], vecs[9]);
	__m512i ij_2 = _mm512_unpackhi_epi32(vecs[8], vecs[9]);
	__m512i kl_0 = _mm512_unpacklo_epi32(vecs[10], vecs[11]);
	__m512i kl_2 = _mm512_unpackhi_epi32(vecs[10], vecs[11]);
	__m512i mn_0 = _mm512_unpacklo_epi32(vecs[12], vecs[13]);
	__m512i mn_2 = _mm512_unpackhi_epi32(vecs[12], vecs[13]);
	__m512i op_0 = _mm512_unpacklo_epi32(vecs[14], vecs[15]);
	__m512i op_2 = _mm512_unpackhi_epi32(vecs[14], vecs[15]);

	/* stage 2: interleave 64-bit units, combining four rows at a time */
	__m512i abcd_0 = _mm512_unpacklo_epi64(ab_0, cd_0);
	__m512i abcd_1 = _mm512_unpackhi_epi64(ab_0, cd_0);
	__m512i abcd_2 = _mm512_unpacklo_epi64(ab_2, cd_2);
	__m512i abcd_3 = _mm512_unpackhi_epi64(ab_2, cd_2);
	__m512i efgh_0 = _mm512_unpacklo_epi64(ef_0, gh_0);
	__m512i efgh_1 = _mm512_unpackhi_epi64(ef_0, gh_0);
	__m512i efgh_2 = _mm512_unpacklo_epi64(ef_2, gh_2);
	__m512i efgh_3 = _mm512_unpackhi_epi64(ef_2, gh_2);
	__m512i ijkl_0 = _mm512_unpacklo_epi64(ij_0, kl_0);
	__m512i ijkl_1 = _mm512_unpackhi_epi64(ij_0, kl_0);
	__m512i ijkl_2 = _mm512_unpacklo_epi64(ij_2, kl_2);
	__m512i ijkl_3 = _mm512_unpackhi_epi64(ij_2, kl_2);
	__m512i mnop_0 = _mm512_unpacklo_epi64(mn_0, op_0);
	__m512i mnop_1 = _mm512_unpackhi_epi64(mn_0, op_0);
	__m512i mnop_2 = _mm512_unpacklo_epi64(mn_2, op_2);
	__m512i mnop_3 = _mm512_unpackhi_epi64(mn_2, op_2);

	/* stage 3: pick even/odd 128-bit lanes, combining eight rows at a time */
	__m512i abcdefgh_0 = unpack_lo_128_avx512(abcd_0, efgh_0);
	__m512i abcdefgh_1 = unpack_lo_128_avx512(abcd_1, efgh_1);
	__m512i abcdefgh_2 = unpack_lo_128_avx512(abcd_2, efgh_2);
	__m512i abcdefgh_3 = unpack_lo_128_avx512(abcd_3, efgh_3);
	__m512i abcdefgh_4 = unpack_hi_128_avx512(abcd_0, efgh_0);
	__m512i abcdefgh_5 = unpack_hi_128_avx512(abcd_1, efgh_1);
	__m512i abcdefgh_6 = unpack_hi_128_avx512(abcd_2, efgh_2);
	__m512i abcdefgh_7 = unpack_hi_128_avx512(abcd_3, efgh_3);
	__m512i ijklmnop_0 = unpack_lo_128_avx512(ijkl_0, mnop_0);
	__m512i ijklmnop_1 = unpack_lo_128_avx512(ijkl_1, mnop_1);
	__m512i ijklmnop_2 = unpack_lo_128_avx512(ijkl_2, mnop_2);
	__m512i ijklmnop_3 = unpack_lo_128_avx512(ijkl_3, mnop_3);
	__m512i ijklmnop_4 = unpack_hi_128_avx512(ijkl_0, mnop_0);
	__m512i ijklmnop_5 = unpack_hi_128_avx512(ijkl_1, mnop_1);
	__m512i ijklmnop_6 = unpack_hi_128_avx512(ijkl_2, mnop_2);
	__m512i ijklmnop_7 = unpack_hi_128_avx512(ijkl_3, mnop_3);

	/* stage 4: same lane selection once more, combining all sixteen rows */
	vecs[0] = unpack_lo_128_avx512(abcdefgh_0, ijklmnop_0);
	vecs[1] = unpack_lo_128_avx512(abcdefgh_1, ijklmnop_1);
	vecs[2] = unpack_lo_128_avx512(abcdefgh_2, ijklmnop_2);
	vecs[3] = unpack_lo_128_avx512(abcdefgh_3, ijklmnop_3);
	vecs[4] = unpack_lo_128_avx512(abcdefgh_4, ijklmnop_4);
	vecs[5] = unpack_lo_128_avx512(abcdefgh_5, ijklmnop_5);
	vecs[6] = unpack_lo_128_avx512(abcdefgh_6, ijklmnop_6);
	vecs[7] = unpack_lo_128_avx512(abcdefgh_7, ijklmnop_7);
	vecs[8] = unpack_hi_128_avx512(abcdefgh_0, ijklmnop_0);
	vecs[9] = unpack_hi_128_avx512(abcdefgh_1, ijklmnop_1);
	vecs[10] = unpack_hi_128_avx512(abcdefgh_2, ijklmnop_2);
	vecs[11] = unpack_hi_128_avx512(abcdefgh_3, ijklmnop_3);
	vecs[12] = unpack_hi_128_avx512(abcdefgh_4, ijklmnop_4);
	vecs[13] = unpack_hi_128_avx512(abcdefgh_5, ijklmnop_5);
	vecs[14] = unpack_hi_128_avx512(abcdefgh_6, ijklmnop_6);
	vecs[15] = unpack_hi_128_avx512(abcdefgh_7, ijklmnop_7);
}
2784
2785static inline void
2786transpose_msg_vecs16_avx512(const uint8_t *const *inputs, size_t block_offset, __m512i out_msg[16])
2787{
2788 size_t i;
2789
2790 out_msg[0] = loadu_512_avx512(&inputs[0][block_offset]);
2791 out_msg[1] = loadu_512_avx512(&inputs[1][block_offset]);
2792 out_msg[2] = loadu_512_avx512(&inputs[2][block_offset]);
2793 out_msg[3] = loadu_512_avx512(&inputs[3][block_offset]);
2794 out_msg[4] = loadu_512_avx512(&inputs[4][block_offset]);
2795 out_msg[5] = loadu_512_avx512(&inputs[5][block_offset]);
2796 out_msg[6] = loadu_512_avx512(&inputs[6][block_offset]);
2797 out_msg[7] = loadu_512_avx512(&inputs[7][block_offset]);
2798 out_msg[8] = loadu_512_avx512(&inputs[8][block_offset]);
2799 out_msg[9] = loadu_512_avx512(&inputs[9][block_offset]);
2800 out_msg[10] = loadu_512_avx512(&inputs[10][block_offset]);
2801 out_msg[11] = loadu_512_avx512(&inputs[11][block_offset]);
2802 out_msg[12] = loadu_512_avx512(&inputs[12][block_offset]);
2803 out_msg[13] = loadu_512_avx512(&inputs[13][block_offset]);
2804 out_msg[14] = loadu_512_avx512(&inputs[14][block_offset]);
2805 out_msg[15] = loadu_512_avx512(&inputs[15][block_offset]);
2806
2807 for (i = 0; i < 16; i++) {
2808 _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
2809 }
2810 transpose_vecs_512_avx512(out_msg);
2811}
2812
/*
 * Build the per-lane 64-bit block counters for a 16-way hash, split
 * into low and high 32-bit halves.
 *
 * counter:           counter for lane 0.
 * increment_counter: 1 to give lane i the value counter + i, 0 to give
 *                    every lane the same counter.  It is negated to
 *                    form a bit mask, so it must be exactly 0 or 1.
 * out_lo, out_hi:    receive the low and high words of each lane.
 */
static inline void
load_counters16_avx512(uint64_t counter, int increment_counter, __m512i *out_lo, __m512i *out_hi)
{
	const __m512i base_lo = _mm512_set1_epi32((int32_t)counter);
	const __m512i base_hi = _mm512_set1_epi32((int32_t)(counter >> 32));
	const __m512i lane_idx = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
	const __m512i enable = _mm512_set1_epi32(-increment_counter);
	__m512i step, lo, wrapped, hi;

	step = _mm512_and_si512(lane_idx, enable);
	lo = _mm512_add_epi32(base_lo, step);
	/*
	 * The low word wrapped exactly when its top bit was set before
	 * the addition and is clear afterwards.
	 */
	wrapped = _mm512_srli_epi32(_mm512_andnot_si512(lo, base_lo), 31);
	hi = _mm512_add_epi32(base_hi, wrapped);

	*out_lo = lo;
	*out_hi = hi;
}
2826
2827void
2828blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out_bytes)
2829{
2830 __m512i h_vecs[8];
2831 __m512i counter_low_vec, counter_high_vec;
2832 uint8_t block_flags;
2833 size_t block;
2834
2835 h_vecs[0] = set1_512_avx512(key[0]);
2836 h_vecs[1] = set1_512_avx512(key[1]);
2837 h_vecs[2] = set1_512_avx512(key[2]);
2838 h_vecs[3] = set1_512_avx512(key[3]);
2839 h_vecs[4] = set1_512_avx512(key[4]);
2840 h_vecs[5] = set1_512_avx512(key[5]);
2841 h_vecs[6] = set1_512_avx512(key[6]);
2842 h_vecs[7] = set1_512_avx512(key[7]);
2843
2844 load_counters16_avx512(counter, increment_counter, &counter_low_vec, &counter_high_vec);
2845 block_flags = flags | flags_start;
2846
2847 for (block = 0; block < blocks; block++) {
2848 __m512i block_len_vec;
2849 __m512i block_flags_vec;
2850 __m512i msg_vecs[16];
2851 __m512i v[16];
2852
2853 if (block + 1 == blocks) {
2854 block_flags |= flags_end;
2855 }
2856 block_len_vec = set1_512_avx512(BLAKE3_BLOCK_LEN);
2857 block_flags_vec = set1_512_avx512(block_flags);
2858 transpose_msg_vecs16_avx512(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
2859
2860 v[0] = h_vecs[0];
2861 v[1] = h_vecs[1];
2862 v[2] = h_vecs[2];
2863 v[3] = h_vecs[3];
2864 v[4] = h_vecs[4];
2865 v[5] = h_vecs[5];
2866 v[6] = h_vecs[6];
2867 v[7] = h_vecs[7];
2868 v[8] = set1_512_avx512(IV[0]);
2869 v[9] = set1_512_avx512(IV[1]);
2870 v[10] = set1_512_avx512(IV[2]);
2871 v[11] = set1_512_avx512(IV[3]);
2872 v[12] = counter_low_vec;
2873 v[13] = counter_high_vec;
2874 v[14] = block_len_vec;
2875 v[15] = block_flags_vec;
2876
2877 round_fn16_avx512(v, msg_vecs, 0);
2878 round_fn16_avx512(v, msg_vecs, 1);
2879 round_fn16_avx512(v, msg_vecs, 2);
2880 round_fn16_avx512(v, msg_vecs, 3);
2881 round_fn16_avx512(v, msg_vecs, 4);
2882 round_fn16_avx512(v, msg_vecs, 5);
2883 round_fn16_avx512(v, msg_vecs, 6);
2884
2885 h_vecs[0] = xor_512_avx512(v[0], v[8]);
2886 h_vecs[1] = xor_512_avx512(v[1], v[9]);
2887 h_vecs[2] = xor_512_avx512(v[2], v[10]);
2888 h_vecs[3] = xor_512_avx512(v[3], v[11]);
2889 h_vecs[4] = xor_512_avx512(v[4], v[12]);
2890 h_vecs[5] = xor_512_avx512(v[5], v[13]);
2891 h_vecs[6] = xor_512_avx512(v[6], v[14]);
2892 h_vecs[7] = xor_512_avx512(v[7], v[15]);
2893
2894 block_flags = flags;
2895 }
2896
2897 __m512i padded[16] = {
2898 h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
2899 h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
2900 set1_512_avx512(0), set1_512_avx512(0), set1_512_avx512(0), set1_512_avx512(0),
2901 set1_512_avx512(0), set1_512_avx512(0), set1_512_avx512(0), set1_512_avx512(0),
2902 };
2903 transpose_vecs_512_avx512(padded);
2904 _mm256_mask_storeu_epi32(&out_bytes[0 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[0]));
2905 _mm256_mask_storeu_epi32(&out_bytes[1 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[1]));
2906 _mm256_mask_storeu_epi32(&out_bytes[2 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[2]));
2907 _mm256_mask_storeu_epi32(&out_bytes[3 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[3]));
2908 _mm256_mask_storeu_epi32(&out_bytes[4 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[4]));
2909 _mm256_mask_storeu_epi32(&out_bytes[5 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[5]));
2910 _mm256_mask_storeu_epi32(&out_bytes[6 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[6]));
2911 _mm256_mask_storeu_epi32(&out_bytes[7 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[7]));
2912 _mm256_mask_storeu_epi32(&out_bytes[8 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[8]));
2913 _mm256_mask_storeu_epi32(&out_bytes[9 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[9]));
2914 _mm256_mask_storeu_epi32(&out_bytes[10 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[10]));
2915 _mm256_mask_storeu_epi32(&out_bytes[11 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[11]));
2916 _mm256_mask_storeu_epi32(&out_bytes[12 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[12]));
2917 _mm256_mask_storeu_epi32(&out_bytes[13 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[13]));
2918 _mm256_mask_storeu_epi32(&out_bytes[14 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[14]));
2919 _mm256_mask_storeu_epi32(&out_bytes[15 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[15]));
2920}
2921
2922static inline void
2923transpose_msg_vecs8_avx512(const uint8_t *const *inputs, size_t block_offset, __m256i out_msg[16])
2924{
2925 out_msg[0] = loadu_256_avx512(&inputs[0][block_offset]);
2926 out_msg[1] = loadu_256_avx512(&inputs[1][block_offset]);
2927 out_msg[2] = loadu_256_avx512(&inputs[2][block_offset]);
2928 out_msg[3] = loadu_256_avx512(&inputs[3][block_offset]);
2929 out_msg[4] = loadu_256_avx512(&inputs[4][block_offset]);
2930 out_msg[5] = loadu_256_avx512(&inputs[5][block_offset]);
2931 out_msg[6] = loadu_256_avx512(&inputs[6][block_offset]);
2932 out_msg[7] = loadu_256_avx512(&inputs[7][block_offset]);
2933 out_msg[8] = loadu_256_avx512(&inputs[0][block_offset + 32]);
2934 out_msg[9] = loadu_256_avx512(&inputs[1][block_offset + 32]);
2935 out_msg[10] = loadu_256_avx512(&inputs[2][block_offset + 32]);
2936 out_msg[11] = loadu_256_avx512(&inputs[3][block_offset + 32]);
2937 out_msg[12] = loadu_256_avx512(&inputs[4][block_offset + 32]);
2938 out_msg[13] = loadu_256_avx512(&inputs[5][block_offset + 32]);
2939 out_msg[14] = loadu_256_avx512(&inputs[6][block_offset + 32]);
2940 out_msg[15] = loadu_256_avx512(&inputs[7][block_offset + 32]);
2941
2942 transpose_vecs_avx2(out_msg);
2943}
2944
/*
 * Build the per-lane 64-bit block counters for an 8-way hash, split
 * into low and high 32-bit halves.
 *
 * counter:           counter for lane 0.
 * increment_counter: 1 to give lane i the value counter + i, 0 to give
 *                    every lane the same counter.  It is negated to
 *                    form a bit mask, so it must be exactly 0 or 1.
 * out_lo, out_hi:    receive the low and high words of each lane.
 */
static inline void
load_counters8_avx512(uint64_t counter, int increment_counter, __m256i *out_lo, __m256i *out_hi)
{
	const __m256i base_lo = _mm256_set1_epi32((int32_t)counter);
	const __m256i base_hi = _mm256_set1_epi32((int32_t)(counter >> 32));
	const __m256i lane_idx = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
	const __m256i enable = _mm256_set1_epi32(-increment_counter);
	__m256i step, lo, wrapped, hi;

	step = _mm256_and_si256(enable, lane_idx);
	lo = _mm256_add_epi32(base_lo, step);
	/* wrap-around happened iff the top bit went from 1 to 0 */
	wrapped = _mm256_srli_epi32(_mm256_andnot_si256(lo, base_lo), 31);
	hi = _mm256_add_epi32(base_hi, wrapped);

	*out_lo = lo;
	*out_hi = hi;
}
2958
2959void
2960blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out_bytes)
2961{
2962 __m256i h_vecs[8];
2963 __m256i counter_low_vec, counter_high_vec;
2964 uint8_t block_flags;
2965 size_t block;
2966
2967 h_vecs[0] = set1_256_avx512(key[0]);
2968 h_vecs[1] = set1_256_avx512(key[1]);
2969 h_vecs[2] = set1_256_avx512(key[2]);
2970 h_vecs[3] = set1_256_avx512(key[3]);
2971 h_vecs[4] = set1_256_avx512(key[4]);
2972 h_vecs[5] = set1_256_avx512(key[5]);
2973 h_vecs[6] = set1_256_avx512(key[6]);
2974 h_vecs[7] = set1_256_avx512(key[7]);
2975
2976 load_counters8_avx512(counter, increment_counter, &counter_low_vec, &counter_high_vec);
2977 block_flags = flags | flags_start;
2978
2979 for (block = 0; block < blocks; block++) {
2980 __m256i block_len_vec;
2981 __m256i block_flags_vec;
2982 __m256i msg_vecs[16];
2983 __m256i v[16];
2984
2985 if (block + 1 == blocks) {
2986 block_flags |= flags_end;
2987 }
2988 block_len_vec = set1_256_avx512(BLAKE3_BLOCK_LEN);
2989 block_flags_vec = set1_256_avx512(block_flags);
2990 transpose_msg_vecs8_avx512(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
2991
2992 v[0] = h_vecs[0];
2993 v[1] = h_vecs[1];
2994 v[2] = h_vecs[2];
2995 v[3] = h_vecs[3];
2996 v[4] = h_vecs[4];
2997 v[5] = h_vecs[5];
2998 v[6] = h_vecs[6];
2999 v[7] = h_vecs[7];
3000 v[8] = set1_256_avx512(IV[0]);
3001 v[9] = set1_256_avx512(IV[1]);
3002 v[10] = set1_256_avx512(IV[2]);
3003 v[11] = set1_256_avx512(IV[3]);
3004 v[12] = counter_low_vec;
3005 v[13] = counter_high_vec;
3006 v[14] = block_len_vec;
3007 v[15] = block_flags_vec;
3008
3009 round_fn8_avx512(v, msg_vecs, 0);
3010 round_fn8_avx512(v, msg_vecs, 1);
3011 round_fn8_avx512(v, msg_vecs, 2);
3012 round_fn8_avx512(v, msg_vecs, 3);
3013 round_fn8_avx512(v, msg_vecs, 4);
3014 round_fn8_avx512(v, msg_vecs, 5);
3015 round_fn8_avx512(v, msg_vecs, 6);
3016
3017 h_vecs[0] = xor_256_avx512(v[0], v[8]);
3018 h_vecs[1] = xor_256_avx512(v[1], v[9]);
3019 h_vecs[2] = xor_256_avx512(v[2], v[10]);
3020 h_vecs[3] = xor_256_avx512(v[3], v[11]);
3021 h_vecs[4] = xor_256_avx512(v[4], v[12]);
3022 h_vecs[5] = xor_256_avx512(v[5], v[13]);
3023 h_vecs[6] = xor_256_avx512(v[6], v[14]);
3024 h_vecs[7] = xor_256_avx512(v[7], v[15]);
3025
3026 block_flags = flags;
3027 }
3028
3029 transpose_vecs_avx2(h_vecs);
3030 storeu_256_avx512(h_vecs[0], &out_bytes[0 * sizeof(__m256i)]);
3031 storeu_256_avx512(h_vecs[1], &out_bytes[1 * sizeof(__m256i)]);
3032 storeu_256_avx512(h_vecs[2], &out_bytes[2 * sizeof(__m256i)]);
3033 storeu_256_avx512(h_vecs[3], &out_bytes[3 * sizeof(__m256i)]);
3034 storeu_256_avx512(h_vecs[4], &out_bytes[4 * sizeof(__m256i)]);
3035 storeu_256_avx512(h_vecs[5], &out_bytes[5 * sizeof(__m256i)]);
3036 storeu_256_avx512(h_vecs[6], &out_bytes[6 * sizeof(__m256i)]);
3037 storeu_256_avx512(h_vecs[7], &out_bytes[7 * sizeof(__m256i)]);
3038}
3039
3040static inline void
3041transpose_msg_vecs4_avx512(const uint8_t *const *inputs, size_t block_offset, __m128i out_msg[16])
3042{
3043 out_msg[0] = loadu_128_avx512(&inputs[0][block_offset]);
3044 out_msg[1] = loadu_128_avx512(&inputs[1][block_offset]);
3045 out_msg[2] = loadu_128_avx512(&inputs[2][block_offset]);
3046 out_msg[3] = loadu_128_avx512(&inputs[3][block_offset]);
3047 out_msg[4] = loadu_128_avx512(&inputs[0][block_offset + 16]);
3048 out_msg[5] = loadu_128_avx512(&inputs[1][block_offset + 16]);
3049 out_msg[6] = loadu_128_avx512(&inputs[2][block_offset + 16]);
3050 out_msg[7] = loadu_128_avx512(&inputs[3][block_offset + 16]);
3051 out_msg[8] = loadu_128_avx512(&inputs[0][block_offset + 32]);
3052 out_msg[9] = loadu_128_avx512(&inputs[1][block_offset + 32]);
3053 out_msg[10] = loadu_128_avx512(&inputs[2][block_offset + 32]);
3054 out_msg[11] = loadu_128_avx512(&inputs[3][block_offset + 32]);
3055 out_msg[12] = loadu_128_avx512(&inputs[0][block_offset + 48]);
3056 out_msg[13] = loadu_128_avx512(&inputs[1][block_offset + 48]);
3057 out_msg[14] = loadu_128_avx512(&inputs[2][block_offset + 48]);
3058 out_msg[15] = loadu_128_avx512(&inputs[3][block_offset + 48]);
3059
3060 transpose_vecs_sse2(out_msg);
3061}
3062
/*
 * Build the per-lane 64-bit block counters for a 4-way hash, split
 * into low and high 32-bit halves.
 *
 * counter:           counter for lane 0.
 * increment_counter: 1 to give lane i the value counter + i, 0 to give
 *                    every lane the same counter.  It is negated to
 *                    form a bit mask, so it must be exactly 0 or 1.
 * out_lo, out_hi:    receive the low and high words of each lane.
 */
static inline void
load_counters4_avx512(uint64_t counter, int increment_counter, __m128i *out_lo, __m128i *out_hi)
{
	const __m128i base_lo = _mm_set1_epi32((int32_t)counter);
	const __m128i base_hi = _mm_set1_epi32((int32_t)(counter >> 32));
	const __m128i lane_idx = _mm_set_epi32(3, 2, 1, 0);
	const __m128i enable = _mm_set1_epi32(-increment_counter);
	__m128i step, lo, wrapped, hi;

	step = _mm_and_si128(enable, lane_idx);
	lo = _mm_add_epi32(base_lo, step);
	/* wrap-around happened iff the top bit went from 1 to 0 */
	wrapped = _mm_srli_epi32(_mm_andnot_si128(lo, base_lo), 31);
	hi = _mm_add_epi32(base_hi, wrapped);

	*out_lo = lo;
	*out_hi = hi;
}
3076
3077void
3078blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out_bytes)
3079{
3080 __m128i h_vecs[8];
3081 __m128i counter_low_vec, counter_high_vec;
3082 uint8_t block_flags;
3083 size_t block;
3084
3085 h_vecs[0] = set1_128_avx512(key[0]);
3086 h_vecs[1] = set1_128_avx512(key[1]);
3087 h_vecs[2] = set1_128_avx512(key[2]);
3088 h_vecs[3] = set1_128_avx512(key[3]);
3089 h_vecs[4] = set1_128_avx512(key[4]);
3090 h_vecs[5] = set1_128_avx512(key[5]);
3091 h_vecs[6] = set1_128_avx512(key[6]);
3092 h_vecs[7] = set1_128_avx512(key[7]);
3093
3094 load_counters4_avx512(counter, increment_counter, &counter_low_vec, &counter_high_vec);
3095 block_flags = flags | flags_start;
3096
3097 for (block = 0; block < blocks; block++) {
3098 __m128i block_len_vec;
3099 __m128i block_flags_vec;
3100 __m128i msg_vecs[16];
3101 __m128i v[16];
3102
3103 if (block + 1 == blocks) {
3104 block_flags |= flags_end;
3105 }
3106 block_len_vec = set1_128_avx512(BLAKE3_BLOCK_LEN);
3107 block_flags_vec = set1_128_avx512(block_flags);
3108 transpose_msg_vecs4_avx512(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
3109
3110 v[0] = h_vecs[0];
3111 v[1] = h_vecs[1];
3112 v[2] = h_vecs[2];
3113 v[3] = h_vecs[3];
3114 v[4] = h_vecs[4];
3115 v[5] = h_vecs[5];
3116 v[6] = h_vecs[6];
3117 v[7] = h_vecs[7];
3118 v[8] = set1_128_avx512(IV[0]);
3119 v[9] = set1_128_avx512(IV[1]);
3120 v[10] = set1_128_avx512(IV[2]);
3121 v[11] = set1_128_avx512(IV[3]);
3122 v[12] = counter_low_vec;
3123 v[13] = counter_high_vec;
3124 v[14] = block_len_vec;
3125 v[15] = block_flags_vec;
3126
3127 round_fn4_avx512(v, msg_vecs, 0);
3128 round_fn4_avx512(v, msg_vecs, 1);
3129 round_fn4_avx512(v, msg_vecs, 2);
3130 round_fn4_avx512(v, msg_vecs, 3);
3131 round_fn4_avx512(v, msg_vecs, 4);
3132 round_fn4_avx512(v, msg_vecs, 5);
3133 round_fn4_avx512(v, msg_vecs, 6);
3134
3135 h_vecs[0] = xor_128_avx512(v[0], v[8]);
3136 h_vecs[1] = xor_128_avx512(v[1], v[9]);
3137 h_vecs[2] = xor_128_avx512(v[2], v[10]);
3138 h_vecs[3] = xor_128_avx512(v[3], v[11]);
3139 h_vecs[4] = xor_128_avx512(v[4], v[12]);
3140 h_vecs[5] = xor_128_avx512(v[5], v[13]);
3141 h_vecs[6] = xor_128_avx512(v[6], v[14]);
3142 h_vecs[7] = xor_128_avx512(v[7], v[15]);
3143
3144 block_flags = flags;
3145 }
3146
3147 transpose_vecs_sse2(h_vecs);
3148 storeu_128_avx512(h_vecs[0], &out_bytes[0 * sizeof(__m128i)]);
3149 storeu_128_avx512(h_vecs[1], &out_bytes[2 * sizeof(__m128i)]);
3150 storeu_128_avx512(h_vecs[2], &out_bytes[4 * sizeof(__m128i)]);
3151 storeu_128_avx512(h_vecs[3], &out_bytes[6 * sizeof(__m128i)]);
3152 storeu_128_avx512(h_vecs[4], &out_bytes[1 * sizeof(__m128i)]);
3153 storeu_128_avx512(h_vecs[5], &out_bytes[3 * sizeof(__m128i)]);
3154 storeu_128_avx512(h_vecs[6], &out_bytes[5 * sizeof(__m128i)]);
3155 storeu_128_avx512(h_vecs[7], &out_bytes[7 * sizeof(__m128i)]);
3156}
3157
3158static inline void
3159hash_one_avx512(const uint8_t *input, size_t blocks, const uint32_t key[8], uint64_t counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t out_bytes[BLAKE3_OUT_LEN])
3160{
3161 uint32_t cv[8];
3162 uint8_t block_flags;
3163
3164 memcpy(cv, key, BLAKE3_KEY_LEN);
3165 block_flags = flags | flags_start;
3166 while (blocks > 0) {
3167 if (blocks == 1) {
3168 block_flags |= flags_end;
3169 }
3170 blake3_compress_in_place_avx512(cv, input, BLAKE3_BLOCK_LEN, counter, block_flags);
3171 input = &input[BLAKE3_BLOCK_LEN];
3172 blocks -= 1;
3173 block_flags = flags;
3174 }
3175 memcpy(out_bytes, cv, BLAKE3_OUT_LEN);
3176}
3177
3178void
3179blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out_bytes)
3180{
3181 while (num_inputs >= 16) {
3182 blake3_hash16_avx512(inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out_bytes);
3183 if (increment_counter) {
3184 counter += 16;
3185 }
3186 inputs += 16;
3187 num_inputs -= 16;
3188 out_bytes = &out_bytes[16 * BLAKE3_OUT_LEN];
3189 }
3190 while (num_inputs >= 8) {
3191 blake3_hash8_avx512(inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out_bytes);
3192 if (increment_counter) {
3193 counter += 8;
3194 }
3195 inputs += 8;
3196 num_inputs -= 8;
3197 out_bytes = &out_bytes[8 * BLAKE3_OUT_LEN];
3198 }
3199 while (num_inputs >= 4) {
3200 blake3_hash4_avx512(inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out_bytes);
3201 if (increment_counter) {
3202 counter += 4;
3203 }
3204 inputs += 4;
3205 num_inputs -= 4;
3206 out_bytes = &out_bytes[4 * BLAKE3_OUT_LEN];
3207 }
3208 while (num_inputs > 0) {
3209 hash_one_avx512(inputs[0], blocks, key, counter, flags, flags_start, flags_end, out_bytes);
3210 if (increment_counter) {
3211 counter += 1;
3212 }
3213 inputs += 1;
3214 num_inputs -= 1;
3215 out_bytes = &out_bytes[BLAKE3_OUT_LEN];
3216 }
3217}
3218#pragma GCC pop_options
3219#endif
3220
3221/* dispatch functions */
3222void
3223blake3_compress_in_place(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
3224{
3225#if defined(__x86_64__)
3226 if (!blake3_cpu_detected)
3227 blake3_detect_cpu_features();
3228 if (blake3_cpu_features & AVX512) {
3229 blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
3230 return;
3231 }
3232 if (blake3_cpu_features & SSE41) {
3233 blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
3234 return;
3235 }
3236 if (blake3_cpu_features & SSE2) {
3237 blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
3238 return;
3239 }
3240#endif
3241 blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
3242}
3243
3244void
3245blake3_compress_xof(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64])
3246{
3247#if defined(__x86_64__)
3248 if (!blake3_cpu_detected)
3249 blake3_detect_cpu_features();
3250 if (blake3_cpu_features & AVX512) {
3251 blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
3252 return;
3253 }
3254 if (blake3_cpu_features & SSE41) {
3255 blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
3256 return;
3257 }
3258 if (blake3_cpu_features & SSE2) {
3259 blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
3260 return;
3261 }
3262#endif
3263 blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
3264}
3265
3266void
3267blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out)
3268{
3269#if defined(__x86_64__)
3270 if (!blake3_cpu_detected)
3271 blake3_detect_cpu_features();
3272 if (blake3_cpu_features & AVX512) {
3273 blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out);
3274 return;
3275 }
3276 if (blake3_cpu_features & AVX2) {
3277 blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out);
3278 return;
3279 }
3280 if (blake3_cpu_features & SSE41) {
3281 blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out);
3282 return;
3283 }
3284 if (blake3_cpu_features & SSE2) {
3285 blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out);
3286 return;
3287 }
3288#endif
3289 blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out);
3290}
3291
3292size_t
3293blake3_simd_degree(void)
3294{
3295#if defined(__x86_64__)
3296 if (!blake3_cpu_detected)
3297 blake3_detect_cpu_features();
3298 if (blake3_cpu_features & AVX512)
3299 return 16;
3300 if (blake3_cpu_features & AVX2)
3301 return 8;
3302 if (blake3_cpu_features & SSE41)
3303 return 4;
3304 if (blake3_cpu_features & SSE2)
3305 return 4;
3306#endif
3307 return 1;
3308}
3309
3310/* core hasher implementation */
3311const char *
3312blake3_version(void)
3313{
3314 return BLAKE3_VERSION_STRING;
3315}
3316
3317static inline void
3318chunk_state_init(struct Blake3ChunkState *self, const uint32_t key[8], uint8_t flags)
3319{
3320 memcpy(self->cv, key, BLAKE3_KEY_LEN);
3321 self->chunk_counter = 0;
3322 memset(self->buf, 0, BLAKE3_BLOCK_LEN);
3323 self->buf_len = 0;
3324 self->blocks_compressed = 0;
3325 self->flags = flags;
3326}
3327
3328static inline void
3329chunk_state_reset(struct Blake3ChunkState *self, const uint32_t key[8], uint64_t chunk_counter)
3330{
3331 memcpy(self->cv, key, BLAKE3_KEY_LEN);
3332 self->chunk_counter = chunk_counter;
3333 self->blocks_compressed = 0;
3334 memset(self->buf, 0, BLAKE3_BLOCK_LEN);
3335 self->buf_len = 0;
3336}
3337
3338static inline size_t
3339chunk_state_len(const struct Blake3ChunkState *self)
3340{
3341 return (BLAKE3_BLOCK_LEN * (size_t)self->blocks_compressed) + ((size_t)self->buf_len);
3342}
3343
3344static inline size_t
3345chunk_state_fill_buf(struct Blake3ChunkState *self, const uint8_t *input, size_t input_len)
3346{
3347 size_t take = BLAKE3_BLOCK_LEN - ((size_t)self->buf_len);
3348 uint8_t *dest;
3349
3350 if (take > input_len) {
3351 take = input_len;
3352 }
3353 dest = self->buf + ((size_t)self->buf_len);
3354 memcpy(dest, input, take);
3355 self->buf_len += (uint8_t)take;
3356 return take;
3357}
3358
3359static inline uint8_t
3360chunk_state_maybe_start_flag(const struct Blake3ChunkState *self)
3361{
3362 if (self->blocks_compressed == 0) {
3363 return CHUNK_START;
3364 } else {
3365 return 0;
3366 }
3367}
3368
3369static inline struct Output
3370make_output(const uint32_t input_cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
3371{
3372 struct Output ret;
3373
3374 memcpy(ret.input_cv, input_cv, 32);
3375 memcpy(ret.block, block, BLAKE3_BLOCK_LEN);
3376 ret.block_len = block_len;
3377 ret.counter = counter;
3378 ret.flags = flags;
3379 return ret;
3380}
3381
3382static inline void
3383output_chaining_value(const struct Output *self, uint8_t cv[32])
3384{
3385 uint32_t cv_words[8];
3386
3387 memcpy(cv_words, self->input_cv, 32);
3388 blake3_compress_in_place(cv_words, self->block, self->block_len, self->counter, self->flags);
3389 store_cv_words(cv, cv_words);
3390}
3391
3392static inline void
3393output_root_bytes(const struct Output *self, uint64_t seek, uint8_t *out, size_t out_len)
3394{
3395 uint64_t output_block_counter = seek / 64;
3396 size_t offset_within_block = seek % 64;
3397 uint8_t wide_buf[64];
3398 size_t available_bytes;
3399 size_t memcpy_len;
3400
3401 while (out_len > 0) {
3402 blake3_compress_xof(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, wide_buf);
3403 available_bytes = 64 - offset_within_block;
3404 if (out_len > available_bytes) {
3405 memcpy_len = available_bytes;
3406 } else {
3407 memcpy_len = out_len;
3408 }
3409 memcpy(out, wide_buf + offset_within_block, memcpy_len);
3410 out += memcpy_len;
3411 out_len -= memcpy_len;
3412 output_block_counter += 1;
3413 offset_within_block = 0;
3414 }
3415}
3416
3417static inline void
3418chunk_state_update(struct Blake3ChunkState *self, const uint8_t *input, size_t input_len)
3419{
3420 size_t take;
3421
3422 if (self->buf_len > 0) {
3423 take = chunk_state_fill_buf(self, input, input_len);
3424 input += take;
3425 input_len -= take;
3426 if (input_len > 0) {
3427 blake3_compress_in_place(self->cv, self->buf, BLAKE3_BLOCK_LEN, self->chunk_counter, self->flags | chunk_state_maybe_start_flag(self));
3428 self->blocks_compressed += 1;
3429 self->buf_len = 0;
3430 memset(self->buf, 0, BLAKE3_BLOCK_LEN);
3431 }
3432 }
3433
3434 while (input_len > BLAKE3_BLOCK_LEN) {
3435 blake3_compress_in_place(self->cv, input, BLAKE3_BLOCK_LEN, self->chunk_counter, self->flags | chunk_state_maybe_start_flag(self));
3436 self->blocks_compressed += 1;
3437 input += BLAKE3_BLOCK_LEN;
3438 input_len -= BLAKE3_BLOCK_LEN;
3439 }
3440
3441 take = chunk_state_fill_buf(self, input, input_len);
3442 input += take;
3443 input_len -= take;
3444}
3445
3446static inline struct Output
3447chunk_state_output(const struct Blake3ChunkState *self)
3448{
3449 uint8_t block_flags = self->flags | chunk_state_maybe_start_flag(self) | CHUNK_END;
3450
3451 return make_output(self->cv, self->buf, self->buf_len, self->chunk_counter, block_flags);
3452}
3453
3454static inline struct Output
3455parent_output(const uint8_t block[BLAKE3_BLOCK_LEN], const uint32_t key[8], uint8_t flags)
3456{
3457 return make_output(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT);
3458}
3459
/*
 * Index (0..63) of the most significant set bit of `x`.
 * `x` must be non-zero: __builtin_clzll(0) is undefined.
 */
static unsigned int
highest_one(uint64_t x)
{
#if defined(__GNUC__) || defined(__clang__)
	return (unsigned int)(63 - __builtin_clzll(x));
#else
	unsigned int pos = 0;
	unsigned int shift;

	/* binary search: halve the window that can still hold the top bit */
	for (shift = 32; shift > 0; shift >>= 1) {
		if (x >> shift) {
			x >>= shift;
			pos += shift;
		}
	}
	return pos;
#endif
}
3476
/*
 * Largest power of two that is <= x.  Bit 0 is forced on before the
 * lookup, so an argument of 0 yields 1.
 */
static inline uint64_t
round_down_to_power_of_2(uint64_t x)
{
	unsigned int top = highest_one(x | 1);

	return 1ULL << top;
}
3482
3483static inline size_t
3484left_len(size_t content_len)
3485{
3486 size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN;
3487
3488 return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN;
3489}
3490
3491static inline size_t
3492compress_chunks_parallel(const uint8_t *input, size_t input_len, const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t *out)
3493{
3494 const uint8_t *chunks_array[MAX_SIMD_DEGREE];
3495 size_t input_position = 0;
3496 size_t chunks_array_len = 0;
3497
3498 assert(0 < input_len);
3499 assert(input_len <= MAX_SIMD_DEGREE * BLAKE3_CHUNK_LEN);
3500
3501 while (input_len - input_position >= BLAKE3_CHUNK_LEN) {
3502 chunks_array[chunks_array_len] = &input[input_position];
3503 input_position += BLAKE3_CHUNK_LEN;
3504 chunks_array_len += 1;
3505 }
3506
3507 blake3_hash_many(chunks_array, chunks_array_len, BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key, chunk_counter, 1, flags, CHUNK_START, CHUNK_END, out);
3508
3509 if (input_len > input_position) {
3510 uint64_t counter = chunk_counter + (uint64_t)chunks_array_len;
3511 struct Blake3ChunkState chunk_state;
3512 struct Output output;
3513
3514 chunk_state_init(&chunk_state, key, flags);
3515 chunk_state.chunk_counter = counter;
3516 chunk_state_update(&chunk_state, &input[input_position], input_len - input_position);
3517 output = chunk_state_output(&chunk_state);
3518 output_chaining_value(&output, &out[chunks_array_len * BLAKE3_OUT_LEN]);
3519 return chunks_array_len + 1;
3520 } else {
3521 return chunks_array_len;
3522 }
3523}
3524
3525static inline size_t
3526compress_parents_parallel(const uint8_t *child_chaining_values, size_t num_chaining_values, const uint32_t key[8], uint8_t flags, uint8_t *out)
3527{
3528 const uint8_t *parents_array[MAX_SIMD_DEGREE_OR_2];
3529 size_t parents_array_len = 0;
3530
3531 assert(2 <= num_chaining_values);
3532 assert(num_chaining_values <= 2 * MAX_SIMD_DEGREE_OR_2);
3533
3534 while (num_chaining_values - (2 * parents_array_len) >= 2) {
3535 parents_array[parents_array_len] = &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN];
3536 parents_array_len += 1;
3537 }
3538
3539 blake3_hash_many(parents_array, parents_array_len, 1, key, 0, 0, flags | PARENT, 0, 0, out);
3540
3541 if (num_chaining_values > 2 * parents_array_len) {
3542 memcpy(&out[parents_array_len * BLAKE3_OUT_LEN], &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN], BLAKE3_OUT_LEN);
3543 return parents_array_len + 1;
3544 } else {
3545 return parents_array_len;
3546 }
3547}
3548
3549static inline size_t
3550blake3_compress_subtree_wide(const uint8_t *input, size_t input_len, const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t *out)
3551{
3552 size_t degree;
3553
3554 if (input_len <= (size_t)blake3_simd_degree() * BLAKE3_CHUNK_LEN) {
3555 return compress_chunks_parallel(input, input_len, key, chunk_counter, flags, out);
3556 }
3557
3558 degree = blake3_simd_degree();
3559 if (degree > 1) {
3560 size_t child_len = round_down_to_power_of_2(input_len - 1) / degree;
3561 size_t cvs_written = 0;
3562
3563 if (child_len < BLAKE3_CHUNK_LEN) {
3564 child_len = BLAKE3_CHUNK_LEN;
3565 }
3566
3567 while (input_len > 0) {
3568 size_t take = child_len;
3569 size_t sub_cvs;
3570
3571 if (take > input_len) {
3572 take = input_len;
3573 }
3574 sub_cvs = blake3_compress_subtree_wide(input, take, key, chunk_counter, flags, &out[cvs_written * BLAKE3_OUT_LEN]);
3575 cvs_written += sub_cvs;
3576 chunk_counter += take / BLAKE3_CHUNK_LEN;
3577 input += take;
3578 input_len -= take;
3579 }
3580
3581 while (cvs_written > 2) {
3582 cvs_written = compress_parents_parallel(out, cvs_written, key, flags, out);
3583 }
3584 return cvs_written;
3585 } else {
3586 /* fallback when simd degree is 1 */
3587 size_t left = left_len(input_len);
3588 size_t right = input_len - left;
3589 size_t left_cvs = blake3_compress_subtree_wide(input, left, key, chunk_counter, flags, out);
3590 size_t right_cvs = blake3_compress_subtree_wide(input + left, right, key, chunk_counter + (left / BLAKE3_CHUNK_LEN), flags, &out[left_cvs * BLAKE3_OUT_LEN]);
3591
3592 return left_cvs + right_cvs;
3593 }
3594}
3595
3596static void
3597compress_subtree_to_parent_node(const uint8_t *input, size_t input_len, const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN])
3598{
3599 uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
3600 size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key, chunk_counter, flags, cv_array);
3601
3602 assert(num_cvs >= 2);
3603 while (num_cvs > 2) {
3604 num_cvs = compress_parents_parallel(cv_array, num_cvs, key, flags, cv_array);
3605 }
3606 memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
3607}
3608
3609static inline void
3610hasher_init_base(struct Blake3Hasher *self, const uint32_t key[8], uint8_t flags)
3611{
3612 memcpy(self->key, key, BLAKE3_KEY_LEN);
3613 chunk_state_init(&self->chunk, key, flags);
3614 self->cv_stack_len = 0;
3615}
3616
3617void
3618blake3_hasher_init(struct Blake3Hasher *self)
3619{
3620 if (!blake3_cpu_detected)
3621 blake3_detect_cpu_features();
3622 hasher_init_base(self, IV, 0);
3623}
3624
3625static inline void
3626load_key_words(const uint8_t key[BLAKE3_KEY_LEN], uint32_t key_words[8])
3627{
3628 key_words[0] = load32(&key[0 * 4]);
3629 key_words[1] = load32(&key[1 * 4]);
3630 key_words[2] = load32(&key[2 * 4]);
3631 key_words[3] = load32(&key[3 * 4]);
3632 key_words[4] = load32(&key[4 * 4]);
3633 key_words[5] = load32(&key[5 * 4]);
3634 key_words[6] = load32(&key[6 * 4]);
3635 key_words[7] = load32(&key[7 * 4]);
3636}
3637
3638void
3639blake3_hasher_init_keyed(struct Blake3Hasher *self, const uint8_t key[BLAKE3_KEY_LEN])
3640{
3641 uint32_t key_words[8];
3642
3643 load_key_words(key, key_words);
3644 hasher_init_base(self, key_words, KEYED_HASH);
3645}
3646
3647void
3648blake3_hasher_init_derive_key_raw(struct Blake3Hasher *self, const void *context, size_t context_len)
3649{
3650 struct Blake3Hasher context_hasher;
3651 uint8_t context_key[BLAKE3_KEY_LEN];
3652 uint32_t context_key_words[8];
3653
3654 hasher_init_base(&context_hasher, IV, DERIVE_KEY_CONTEXT);
3655 blake3_hasher_update(&context_hasher, context, context_len);
3656 blake3_hasher_finalize(&context_hasher, context_key, BLAKE3_KEY_LEN);
3657 load_key_words(context_key, context_key_words);
3658 hasher_init_base(self, context_key_words, DERIVE_KEY_MATERIAL);
3659}
3660
/* Number of set bits in `x`. */
static inline unsigned int
popcnt(uint64_t x)
{
#if defined(__GNUC__) || defined(__clang__)
	return (unsigned int)__builtin_popcountll(x);
#else
	unsigned int n;

	/* each round clears the lowest set bit */
	for (n = 0; x != 0; n++)
		x &= x - 1;
	return n;
#endif
}
3675
/*
 * Convenience wrapper around blake3_hasher_init_derive_key_raw() for a
 * NUL-terminated context string (the terminator is not hashed).
 */
void
blake3_hasher_init_derive_key(struct Blake3Hasher *self, const char *context)
{
	size_t context_len = strlen(context);

	blake3_hasher_init_derive_key_raw(self, context, context_len);
}
3681
3682static inline void
3683hasher_merge_cv_stack(struct Blake3Hasher *self, uint64_t total_len)
3684{
3685 size_t post_merge_stack_len = (size_t)popcnt(total_len);
3686
3687 while (self->cv_stack_len > post_merge_stack_len) {
3688 uint8_t *parent_node = &self->cv_stack[(self->cv_stack_len - 2) * BLAKE3_OUT_LEN];
3689 struct Output output = parent_output(parent_node, self->key, self->chunk.flags);
3690
3691 output_chaining_value(&output, parent_node);
3692 self->cv_stack_len -= 1;
3693 }
3694}
3695
3696static inline void
3697hasher_push_cv(struct Blake3Hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN], uint64_t chunk_counter)
3698{
3699 hasher_merge_cv_stack(self, chunk_counter);
3700 memcpy(&self->cv_stack[self->cv_stack_len * BLAKE3_OUT_LEN], new_cv, BLAKE3_OUT_LEN);
3701 self->cv_stack_len += 1;
3702}
3703
3704void
3705blake3_hasher_update(struct Blake3Hasher *self, const void *input, size_t input_len)
3706{
3707 const uint8_t *input_bytes;
3708
3709 if (input_len == 0) {
3710 return;
3711 }
3712
3713 input_bytes = (const uint8_t *)input;
3714
3715 if (chunk_state_len(&self->chunk) > 0) {
3716 size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&self->chunk);
3717
3718 if (take > input_len) {
3719 take = input_len;
3720 }
3721 chunk_state_update(&self->chunk, input_bytes, take);
3722 input_bytes += take;
3723 input_len -= take;
3724 if (input_len > 0) {
3725 struct Output output = chunk_state_output(&self->chunk);
3726 uint8_t chunk_cv[32];
3727
3728 output_chaining_value(&output, chunk_cv);
3729 hasher_push_cv(self, chunk_cv, self->chunk.chunk_counter);
3730 chunk_state_reset(&self->chunk, self->key, self->chunk.chunk_counter + 1);
3731 } else {
3732 return;
3733 }
3734 }
3735
3736 while (input_len > BLAKE3_CHUNK_LEN) {
3737 size_t subtree_len = round_down_to_power_of_2(input_len);
3738 uint64_t count_so_far = self->chunk.chunk_counter * BLAKE3_CHUNK_LEN;
3739 uint64_t subtree_chunks;
3740
3741 while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) {
3742 subtree_len /= 2;
3743 }
3744 subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN;
3745 if (subtree_len <= BLAKE3_CHUNK_LEN) {
3746 struct Blake3ChunkState chunk_state;
3747 struct Output output;
3748 uint8_t cv[BLAKE3_OUT_LEN];
3749
3750 chunk_state_init(&chunk_state, self->key, self->chunk.flags);
3751 chunk_state.chunk_counter = self->chunk.chunk_counter;
3752 chunk_state_update(&chunk_state, input_bytes, subtree_len);
3753 output = chunk_state_output(&chunk_state);
3754 output_chaining_value(&output, cv);
3755 hasher_push_cv(self, cv, chunk_state.chunk_counter);
3756 } else {
3757 uint8_t cv_pair[2 * BLAKE3_OUT_LEN];
3758
3759 compress_subtree_to_parent_node(input_bytes, subtree_len, self->key, self->chunk.chunk_counter, self->chunk.flags, cv_pair);
3760 hasher_push_cv(self, cv_pair, self->chunk.chunk_counter);
3761 hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN], self->chunk.chunk_counter + (subtree_chunks / 2));
3762 }
3763 self->chunk.chunk_counter += subtree_chunks;
3764 input_bytes += subtree_len;
3765 input_len -= subtree_len;
3766 }
3767
3768 if (input_len > 0) {
3769 chunk_state_update(&self->chunk, input_bytes, input_len);
3770 hasher_merge_cv_stack(self, self->chunk.chunk_counter);
3771 }
3772}
3773
3774void
3775blake3_hasher_finalize_seek(const struct Blake3Hasher *self, uint64_t seek, uint8_t *out, size_t out_len)
3776{
3777 struct Output output;
3778 size_t cvs_remaining;
3779
3780 if (out_len == 0) {
3781 return;
3782 }
3783
3784 if (self->cv_stack_len == 0) {
3785 output = chunk_state_output(&self->chunk);
3786 output_root_bytes(&output, seek, out, out_len);
3787 return;
3788 }
3789
3790 if (chunk_state_len(&self->chunk) > 0) {
3791 cvs_remaining = self->cv_stack_len;
3792 output = chunk_state_output(&self->chunk);
3793 } else {
3794 cvs_remaining = self->cv_stack_len - 2;
3795 output = parent_output(&self->cv_stack[cvs_remaining * 32], self->key, self->chunk.flags);
3796 }
3797
3798 while (cvs_remaining > 0) {
3799 uint8_t parent_block[BLAKE3_BLOCK_LEN];
3800
3801 cvs_remaining -= 1;
3802 memcpy(parent_block, &self->cv_stack[cvs_remaining * 32], 32);
3803 output_chaining_value(&output, &parent_block[32]);
3804 output = parent_output(parent_block, self->key, self->chunk.flags);
3805 }
3806 output_root_bytes(&output, seek, out, out_len);
3807}
3808
/*
 * Write the first `out_len` bytes of hash output to `out`; equivalent
 * to blake3_hasher_finalize_seek() with a seek offset of 0.
 */
void
blake3_hasher_finalize(const struct Blake3Hasher *self, uint8_t *out, size_t out_len)
{
	const uint64_t start_of_output = 0;

	blake3_hasher_finalize_seek(self, start_of_output, out, out_len);
}
3814
3815void
3816blake3_hasher_reset(struct Blake3Hasher *self)
3817{
3818 chunk_state_reset(&self->chunk, self->key, 0);
3819 self->cv_stack_len = 0;
3820}
3821
3822/* utility implementation */
3823static unsigned char *out;
3824static size_t outlen = BLAKE3_OUT_LEN;
3825
3826static void
3827usage(void)
3828{
3829 fprintf(stderr, "usage: %s [-bct] [-l length] [file ...]\n", argv0);
3830 exit(1);
3831}
3832
3833static int
3834sumfile(const char *name, FILE *file, unsigned char *out_buf, size_t out_len)
3835{
3836 char buf[16384];
3837 struct Blake3Hasher ctx;
3838 size_t len;
3839
3840 blake3_hasher_init(&ctx);
3841 do {
3842 len = fread(buf, 1, sizeof(buf), file);
3843 if (len > 0)
3844 blake3_hasher_update(&ctx, buf, len);
3845 } while (len == sizeof(buf));
3846
3847 if (ferror(file)) {
3848 fprintf(stderr, "%s: read %s: ", argv0, name);
3849 perror(NULL);
3850 return 1;
3851 }
3852 blake3_hasher_finalize(&ctx, out_buf, out_len);
3853 return 0;
3854}
3855
3856static int
3857sum(const char *name, FILE *file)
3858{
3859 size_t i;
3860
3861 if (sumfile(name, file, out, outlen) != 0)
3862 return 1;
3863 for (i = 0; i < outlen; i++)
3864 printf("%02x", out[i]);
3865 printf(" %s\n", name);
3866 return 0;
3867}
3868
/*
 * Value (0..15) of the hexadecimal digit `c`, accepting both upper and
 * lower case letters; -1 if `c` is not a hexadecimal digit.
 */
static int
hexval(int c)
{
	static const char digits[] = "0123456789abcdef";
	int v;

	/* fold upper-case letters onto their lower-case counterparts */
	if ('A' <= c && c <= 'F')
		c += 'a' - 'A';

	for (v = 0; v < 16; v++) {
		if (digits[v] == c)
			return v;
	}
	return -1;
}
3880
3881static int
3882checkfile(const char *name, const char *mode, const char *str, unsigned char *out_buf, size_t len)
3883{
3884 FILE *file;
3885 int c1, c2;
3886 size_t i;
3887
3888 file = fopen(name, mode);
3889 if (!file) {
3890 fprintf(stderr, "%s: open %s: ", argv0, name);
3891 perror(NULL);
3892 return 1;
3893 }
3894 sumfile(name, file, out_buf, len);
3895 fclose(file);
3896
3897 for (i = 0; i < len; i++) {
3898 c1 = hexval(str[i * 2]);
3899 c2 = hexval(str[i * 2 + 1]);
3900 if (c1 == -1 || c2 == -1) {
3901 fprintf(stderr, "%s: skipping invalid checksum line\n", argv0);
3902 return 1;
3903 }
3904 if (out_buf[i] != (c1 << 4 | c2)) {
3905 printf("%s: FAILED\n", name);
3906 return 1;
3907 }
3908 }
3909 printf("%s: OK\n", name);
3910 return 0;
3911}
3912
3913static int
3914check(const char *name, FILE *file)
3915{
3916 const char *mode;
3917 char buf[8192], *pos, *end;
3918 size_t len;
3919 int ret = 0, skip = 0;
3920
3921 buf[sizeof(buf) - 2] = 0;
3922 while (fgets(buf, sizeof(buf), file)) {
3923 if (buf[sizeof(buf) - 2]) {
3924 fprintf(stderr, "%s: skipping line that is too long\n", argv0);
3925 buf[sizeof(buf) - 2] = 0;
3926 skip = 1;
3927 ret = 1;
3928 continue;
3929 }
3930 if (skip) {
3931 skip = 0;
3932 continue;
3933 }
3934 pos = strchr(buf, ' ');
3935 if (!pos || pos == buf || (pos[1] != ' ' && pos[1] != '*') || (pos - buf) & 1) {
3936 fprintf(stderr, "%s: skipping invalid checksum line\n", argv0);
3937 ret = 1;
3938 continue;
3939 }
3940 mode = pos[1] == ' ' ? "r" : "rb";
3941 len = (pos - buf) / 2;
3942 if (len > outlen) {
3943 outlen = len;
3944 free(out);
3945 out = malloc(len);
3946 if (!out) {
3947 perror(argv0);
3948 return 1;
3949 }
3950 }
3951 *pos = '\0';
3952 pos += 2;
3953 end = strchr(pos, '\n');
3954 if (end)
3955 *end = '\0';
3956 ret |= checkfile(pos, mode, buf, out, len);
3957 }
3958 if (ferror(file)) {
3959 fprintf(stderr, "%s: read %s: ", argv0, name);
3960 perror(NULL);
3961 ret = 1;
3962 }
3963 return ret;
3964}
3965
3966// ?man b3sum: compute blake3 checksums
3967// ?man arguments: file ...
3968// ?man compute and check blake3 message digests
3969int
3970main(int argc, char *argv[])
3971{
3972 int (*func)(const char *, FILE *) = sum;
3973 FILE *file;
3974 char *end;
3975 const char *name, *mode = NULL;
3976 int ret = 0;
3977
3978 ARGBEGIN {
3979 // ?man -b: read in binary mode
3980 case 'b':
3981 mode = "rb";
3982 break;
3983 // ?man -c: check blake3 sums from file
3984 case 'c':
3985 func = check;
3986 break;
3987 // ?man -l:str: -l length: output digest length in bytes
3988 case 'l':
3989 outlen = strtoul(EARGF(usage()), &end, 10);
3990 if (*end)
3991 usage();
3992 break;
3993 // ?man -t: read in text mode
3994 case 't':
3995 mode = "r";
3996 break;
3997 default:
3998 usage();
3999 } ARGEND
4000
4001 out = malloc(outlen);
4002 if (!out) {
4003 perror(NULL);
4004 return 1;
4005 }
4006
4007 if (argc == 0) {
4008 if (!mode || strcmp(mode, "r") == 0 || freopen(NULL, mode, stdin)) {
4009 ret |= func("<stdin>", stdin);
4010 } else {
4011 fprintf(stderr, "%s: reopen stdin: ", argv0);
4012 perror(NULL);
4013 ret = 1;
4014 }
4015 } else {
4016 if (!mode)
4017 mode = "r";
4018 for (; argc > 0; argc--, argv++) {
4019 name = *argv;
4020 file = fopen(name, mode);
4021 if (file) {
4022 ret |= func(name, file);
4023 fclose(file);
4024 } else {
4025 fprintf(stderr, "%s: open %s: ", argv0, name);
4026 perror(NULL);
4027 ret = 1;
4028 }
4029 }
4030 }
4031
4032 free(out);
4033 if (fshut(stdin, "<stdin>") | fshut(stdout, "<stdout>"))
4034 ret = 1;
4035
4036 return ret;
4037}