commit f8ed7d5

xplshn  ·  2026-06-10 19:00:17 +0000 UTC
parent c264a7c
fix: sed, ls

Signed-off-by: xplshn <anto@xplshn.com.ar>
6 files changed,  +4314, -24
+10, -6
 1@@ -264,7 +264,8 @@ PSEUDO_BIN_ALL =\
 2 	cmd/pseudo/whoami\
 3 	cmd/pseudo/xinstall\
 4 	cmd/pseudo/yes\
 5-	cmd/pseudo/base64
 6+	cmd/pseudo/base64\
 7+	cmd/extra/b3sum
 8 
 9 MAKEOBJ =\
10 	cmd/posix/make/defaults.o\
11@@ -431,6 +432,7 @@ BIN_whoami_1 = cmd/pseudo/whoami
12 BIN_xinstall_1 = cmd/pseudo/xinstall
13 BIN_yes_1 = cmd/pseudo/yes
14 BIN_base64_1 = cmd/pseudo/base64
15+BIN_b3sum_1 = cmd/extra/b3sum
16 
17 BIN_make_tool_1 = cmd/posix/make/make
18 
19@@ -596,7 +598,12 @@ PSEUDO_BIN = \
20 	$(BIN_whoami_$(BUILD_PSEUDO_WHOAMI)) \
21 	$(BIN_xinstall_$(BUILD_PSEUDO_XINSTALL)) \
22 	$(BIN_yes_$(BUILD_PSEUDO_YES)) \
23-	$(BIN_base64_$(BUILD_PSEUDO_BASE64))
24+	$(BIN_base64_$(BUILD_PSEUDO_BASE64)) \
25+	$(BIN_b3sum_$(BUILD_PSEUDO_B3SUM)) \
26+	$(BIN_ar_$(BUILD_DEV_AR)) \
27+	$(BIN_as_$(BUILD_DEV_CC)) \
28+	$(BIN_ld_$(BUILD_DEV_LD)) \
29+	$(BIN_cc_$(BUILD_DEV_CC))
30 
31 MAKE_BIN = $(BIN_make_tool_$(BUILD_MAKE_MAKE))
32 
33@@ -647,7 +654,7 @@ box: $(LIB)
34 	scripts/mkbox
35 
36 clean:
37-	rm -f shared/libutf/*.o shared/libutil/*.o cmd/posix/make/*.o cmd/posix/awk/*.o cmd/posix/sh/*.o cmd/dev/ar/*.o cmd/dev/ld/*.o
38+	rm -f shared/libutf/*.o shared/libutil/*.o cmd/posix/make/*.o cmd/posix/awk/*.o cmd/posix/sh/*.o cmd/extra/*.o
39 	rm -f $(POSIX_BIN_ALL) $(LINUX_BIN_ALL) $(NET_BIN_ALL) $(XSI_BIN_ALL) $(PSEUDO_BIN_ALL) $(LIB)
40 	rm -f cmd/posix/make/make cmd/posix/getconf.h cmd/posix/bc.c
41 	rm -f cmd/posix/awk/awk cmd/posix/awk/maketab cmd/posix/awk/awkgram.tab.c cmd/posix/awk/awkgram.tab.h cmd/posix/awk/proctab.c
42@@ -747,6 +754,3 @@ cmd/posix/sh/%.o: cmd/posix/sh/%.c
43 
44 cmd/posix/sh/sh: $(SHOBJ) $(LIB)
45 	$(CC) $(LDFLAGS) -o $@ $(SHOBJ) $(LIB) $(LDLIBS)
46-
47-cmd/pseudo/base64: cmd/pseudo/base64.o $(LIB)
48-	$(CC) $(LDFLAGS) -o $@ cmd/pseudo/base64.o $(LIB) $(LDLIBS)
+4029, -0
   1@@ -0,0 +1,4029 @@
   2+/* See LICENSE file for copyright and license details. */
   3+#include "util.h"
   4+#include "arg.h"
   5+
   6+#include <assert.h>
   7+#include <immintrin.h>
   8+#include <stdint.h>
   9+#include <stdio.h>
  10+#include <stdlib.h>
  11+#include <string.h>
  12+
  13+#define BLAKE3_VERSION_STRING "1.5.0"
  14+#define BLAKE3_KEY_LEN        32
  15+#define BLAKE3_OUT_LEN        32
  16+#define BLAKE3_BLOCK_LEN      64
  17+#define BLAKE3_CHUNK_LEN      1024
  18+#define BLAKE3_MAX_DEPTH      54
  19+
  20+#if defined(__x86_64__)
  21+#define MAX_SIMD_DEGREE 16
  22+#else
  23+#define MAX_SIMD_DEGREE 1
  24+#endif
  25+
  26+#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2)
  27+
  28+enum Blake3Flags {
  29+	CHUNK_START         = 1 << 0,
  30+	CHUNK_END           = 1 << 1,
  31+	PARENT              = 1 << 2,
  32+	ROOT                = 1 << 3,
  33+	KEYED_HASH          = 1 << 4,
  34+	DERIVE_KEY_CONTEXT  = 1 << 5,
  35+	DERIVE_KEY_MATERIAL = 1 << 6
  36+};
  37+
  38+struct Blake3ChunkState {
  39+	uint32_t cv[8];
  40+	uint64_t chunk_counter;
  41+	uint8_t buf[BLAKE3_BLOCK_LEN];
  42+	uint8_t buf_len;
  43+	uint8_t blocks_compressed;
  44+	uint8_t flags;
  45+};
  46+
  47+struct Blake3Hasher {
  48+	uint32_t key[8];
  49+	struct Blake3ChunkState chunk;
  50+	uint8_t cv_stack_len;
  51+	uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN];
  52+};
  53+
  54+void blake3_hasher_update(struct Blake3Hasher *self, const void *input, size_t input_len);
  55+void blake3_hasher_finalize(const struct Blake3Hasher *self, uint8_t *out, size_t out_len);
  56+
  57+struct Output {
  58+	uint32_t input_cv[8];
  59+	uint64_t counter;
  60+	uint8_t block[BLAKE3_BLOCK_LEN];
  61+	uint8_t block_len;
  62+	uint8_t flags;
  63+};
  64+
  65+static const uint32_t IV[8] = {
  66+	0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
  67+	0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
  68+};
  69+
  70+static const uint8_t MSG_SCHEDULE[7][16] = {
  71+	{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
  72+	{2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8},
  73+	{3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1},
  74+	{10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6},
  75+	{12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4},
  76+	{9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7},
  77+	{11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13},
  78+};
  79+
  80+static inline uint32_t
  81+load32(const void *src)
  82+{
  83+	const uint8_t *p = (const uint8_t *)src;
  84+
  85+	return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) |
  86+	       ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24);
  87+}
  88+
  89+static inline void
  90+store32(void *dst, uint32_t w)
  91+{
  92+	uint8_t *p = (uint8_t *)dst;
  93+
  94+	p[0] = (uint8_t)(w >> 0);
  95+	p[1] = (uint8_t)(w >> 8);
  96+	p[2] = (uint8_t)(w >> 16);
  97+	p[3] = (uint8_t)(w >> 24);
  98+}
  99+
 100+static inline void
 101+store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8])
 102+{
 103+	store32(&bytes_out[0 * 4], cv_words[0]);
 104+	store32(&bytes_out[1 * 4], cv_words[1]);
 105+	store32(&bytes_out[2 * 4], cv_words[2]);
 106+	store32(&bytes_out[3 * 4], cv_words[3]);
 107+	store32(&bytes_out[4 * 4], cv_words[4]);
 108+	store32(&bytes_out[5 * 4], cv_words[5]);
 109+	store32(&bytes_out[6 * 4], cv_words[6]);
 110+	store32(&bytes_out[7 * 4], cv_words[7]);
 111+}
 112+
 113+static inline uint32_t
 114+counter_low(uint64_t counter)
 115+{
 116+	return (uint32_t)counter;
 117+}
 118+
 119+static inline uint32_t
 120+counter_high(uint64_t counter)
 121+{
 122+	return (uint32_t)(counter >> 32);
 123+}
 124+
 125+/* forward declarations */
 126+#if defined(__x86_64__)
 127+void blake3_compress_in_place_sse2(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags);
 128+void blake3_compress_xof_sse2(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]);
 129+void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out);
 130+
 131+void blake3_compress_in_place_sse41(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags);
 132+void blake3_compress_xof_sse41(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]);
 133+void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out);
 134+
 135+void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out);
 136+
 137+void blake3_compress_in_place_avx512(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags);
 138+void blake3_compress_xof_avx512(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]);
 139+void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out);
 140+#endif
 141+
 142+void blake3_compress_in_place_portable(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags);
 143+void blake3_compress_xof_portable(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]);
 144+void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out);
 145+
 146+void blake3_compress_in_place(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags);
 147+void blake3_compress_xof(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]);
 148+void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out);
 149+
 150+/* portable implementations */
 151+static inline uint32_t
 152+rotr32(uint32_t w, uint32_t c)
 153+{
 154+	return (w >> c) | (w << (32 - c));
 155+}
 156+
 157+static inline void
 158+g_portable(uint32_t *state, size_t a, size_t b, size_t c, size_t d, uint32_t x, uint32_t y)
 159+{
 160+	state[a] = state[a] + state[b] + x;
 161+	state[d] = rotr32(state[d] ^ state[a], 16);
 162+	state[c] = state[c] + state[d];
 163+	state[b] = rotr32(state[b] ^ state[c], 12);
 164+	state[a] = state[a] + state[b] + y;
 165+	state[d] = rotr32(state[d] ^ state[a], 8);
 166+	state[c] = state[c] + state[d];
 167+	state[b] = rotr32(state[b] ^ state[c], 7);
 168+}
 169+
 170+static inline void
 171+round_fn_portable(uint32_t state[16], const uint32_t *msg, size_t round)
 172+{
 173+	const uint8_t *schedule = MSG_SCHEDULE[round];
 174+
 175+	g_portable(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]);
 176+	g_portable(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]);
 177+	g_portable(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]);
 178+	g_portable(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]);
 179+
 180+	g_portable(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]);
 181+	g_portable(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]);
 182+	g_portable(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]);
 183+	g_portable(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]);
 184+}
 185+
 186+static inline void
 187+compress_pre_portable(uint32_t state[16], const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
 188+{
 189+	uint32_t block_words[16];
 190+
 191+	block_words[0] = load32(block + 4 * 0);
 192+	block_words[1] = load32(block + 4 * 1);
 193+	block_words[2] = load32(block + 4 * 2);
 194+	block_words[3] = load32(block + 4 * 3);
 195+	block_words[4] = load32(block + 4 * 4);
 196+	block_words[5] = load32(block + 4 * 5);
 197+	block_words[6] = load32(block + 4 * 6);
 198+	block_words[7] = load32(block + 4 * 7);
 199+	block_words[8] = load32(block + 4 * 8);
 200+	block_words[9] = load32(block + 4 * 9);
 201+	block_words[10] = load32(block + 4 * 10);
 202+	block_words[11] = load32(block + 4 * 11);
 203+	block_words[12] = load32(block + 4 * 12);
 204+	block_words[13] = load32(block + 4 * 13);
 205+	block_words[14] = load32(block + 4 * 14);
 206+	block_words[15] = load32(block + 4 * 15);
 207+
 208+	state[0] = cv[0];
 209+	state[1] = cv[1];
 210+	state[2] = cv[2];
 211+	state[3] = cv[3];
 212+	state[4] = cv[4];
 213+	state[5] = cv[5];
 214+	state[6] = cv[6];
 215+	state[7] = cv[7];
 216+	state[8] = IV[0];
 217+	state[9] = IV[1];
 218+	state[10] = IV[2];
 219+	state[11] = IV[3];
 220+	state[12] = counter_low(counter);
 221+	state[13] = counter_high(counter);
 222+	state[14] = (uint32_t)block_len;
 223+	state[15] = (uint32_t)flags;
 224+
 225+	round_fn_portable(state, &block_words[0], 0);
 226+	round_fn_portable(state, &block_words[0], 1);
 227+	round_fn_portable(state, &block_words[0], 2);
 228+	round_fn_portable(state, &block_words[0], 3);
 229+	round_fn_portable(state, &block_words[0], 4);
 230+	round_fn_portable(state, &block_words[0], 5);
 231+	round_fn_portable(state, &block_words[0], 6);
 232+}
 233+
 234+void
 235+blake3_compress_in_place_portable(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
 236+{
 237+	uint32_t state[16];
 238+
 239+	compress_pre_portable(state, cv, block, block_len, counter, flags);
 240+	cv[0] = state[0] ^ state[8];
 241+	cv[1] = state[1] ^ state[9];
 242+	cv[2] = state[2] ^ state[10];
 243+	cv[3] = state[3] ^ state[11];
 244+	cv[4] = state[4] ^ state[12];
 245+	cv[5] = state[5] ^ state[13];
 246+	cv[6] = state[6] ^ state[14];
 247+	cv[7] = state[7] ^ state[15];
 248+}
 249+
 250+void
 251+blake3_compress_xof_portable(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64])
 252+{
 253+	uint32_t state[16];
 254+
 255+	compress_pre_portable(state, cv, block, block_len, counter, flags);
 256+
 257+	store32(&out[0 * 4], state[0] ^ state[8]);
 258+	store32(&out[1 * 4], state[1] ^ state[9]);
 259+	store32(&out[2 * 4], state[2] ^ state[10]);
 260+	store32(&out[3 * 4], state[3] ^ state[11]);
 261+	store32(&out[4 * 4], state[4] ^ state[12]);
 262+	store32(&out[5 * 4], state[5] ^ state[13]);
 263+	store32(&out[6 * 4], state[6] ^ state[14]);
 264+	store32(&out[7 * 4], state[7] ^ state[15]);
 265+	store32(&out[8 * 4], state[8] ^ cv[0]);
 266+	store32(&out[9 * 4], state[9] ^ cv[1]);
 267+	store32(&out[10 * 4], state[10] ^ cv[2]);
 268+	store32(&out[11 * 4], state[11] ^ cv[3]);
 269+	store32(&out[12 * 4], state[12] ^ cv[4]);
 270+	store32(&out[13 * 4], state[13] ^ cv[5]);
 271+	store32(&out[14 * 4], state[14] ^ cv[6]);
 272+	store32(&out[15 * 4], state[15] ^ cv[7]);
 273+}
 274+
 275+static inline void
 276+hash_one_portable(const uint8_t *input, size_t blocks, const uint32_t key[8], uint64_t counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN])
 277+{
 278+	uint32_t cv[8];
 279+	uint8_t block_flags;
 280+
 281+	memcpy(cv, key, BLAKE3_KEY_LEN);
 282+	block_flags = flags | flags_start;
 283+	while (blocks > 0) {
 284+		if (blocks == 1) {
 285+			block_flags |= flags_end;
 286+		}
 287+		blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter, block_flags);
 288+		input = &input[BLAKE3_BLOCK_LEN];
 289+		blocks -= 1;
 290+		block_flags = flags;
 291+	}
 292+	store_cv_words(out, cv);
 293+}
 294+
 295+void
 296+blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out)
 297+{
 298+	while (num_inputs > 0) {
 299+		hash_one_portable(inputs[0], blocks, key, counter, flags, flags_start, flags_end, out);
 300+		if (increment_counter) {
 301+			counter += 1;
 302+		}
 303+		inputs += 1;
 304+		num_inputs -= 1;
 305+		out = &out[BLAKE3_OUT_LEN];
 306+	}
 307+}
 308+
 309+/* cpu features detection */
 310+enum {
 311+	SSE2   = 1 << 0,
 312+	SSE41  = 1 << 1,
 313+	AVX2   = 1 << 2,
 314+	AVX512 = 1 << 3
 315+};
 316+
 317+static int blake3_cpu_features = 0;
 318+static int blake3_cpu_detected = 0;
 319+
 320+#if defined(__x86_64__)
 321+#include <cpuid.h>
 322+
 323+static void
 324+blake3_cpuid(uint32_t out[4], uint32_t id, uint32_t sid)
 325+{
 326+	__cpuid_count(id, sid, out[0], out[1], out[2], out[3]);
 327+}
 328+
 329+static uint64_t
 330+blake3_xgetbv(void)
 331+{
 332+	uint32_t eax, edx;
 333+
 334+	__asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0));
 335+	return ((uint64_t)edx << 32) | eax;
 336+}
 337+#endif
 338+
 339+static void
 340+blake3_detect_cpu_features(void)
 341+{
 342+#if defined(__x86_64__)
 343+	enum { EAX, EBX, ECX, EDX };
 344+	uint32_t regs[4];
 345+	uint64_t xcr0;
 346+	int features = 0;
 347+
 348+	blake3_cpuid(regs, 1, 0);
 349+	if (regs[EDX] & (1UL << 26))
 350+		features |= SSE2;
 351+	if (regs[ECX] & (1UL << 19))
 352+		features |= SSE41;
 353+	/* osxsave */
 354+	if (regs[ECX] & (1UL << 27)) {
 355+		blake3_cpuid(regs, 0, 0);
 356+		if (regs[EAX] >= 7) {
 357+			blake3_cpuid(regs, 7, 0);
 358+			xcr0 = blake3_xgetbv();
 359+			/* avx2 and xcr0 sse, avx */
 360+			if ((regs[EBX] & (1UL << 5)) && (xcr0 & 0x06) == 0x06)
 361+				features |= AVX2;
 362+			/* avx512f, avx512vl and xcr0 opmask, zmm_hi256, hi16_zmm */
 363+			if ((regs[EBX] & (1UL << 31 | 1UL << 16)) && (xcr0 & 0xe0) == 0xe0)
 364+				features |= AVX512;
 365+		}
 366+	}
 367+	blake3_cpu_features = features;
 368+#endif
 369+	blake3_cpu_detected = 1;
 370+}
 371+
 372+#if defined(__x86_64__)
 373+__attribute__((constructor))
 374+static void
 375+blake3_init_cpu(void)
 376+{
 377+	if (!blake3_cpu_detected)
 378+		blake3_detect_cpu_features();
 379+}
 380+#endif
 381+
 382+#if defined(__x86_64__)
 383+#pragma GCC push_options
 384+#pragma GCC target("sse2")
 385+#define DEGREE_SSE2 4
 386+
 387+#define _mm_shuffle_ps2(a, b, c)                                               \
 388+  (_mm_castps_si128(                                                           \
 389+      _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))
 390+
 391+static inline __m128i
 392+loadu_sse2(const uint8_t src[16])
 393+{
 394+	return _mm_loadu_si128((const __m128i *)src);
 395+}
 396+
 397+static inline void
 398+storeu_sse2(__m128i src, uint8_t dest[16])
 399+{
 400+	_mm_storeu_si128((__m128i *)dest, src);
 401+}
 402+
 403+static inline __m128i
 404+addv_sse2(__m128i a, __m128i b)
 405+{
 406+	return _mm_add_epi32(a, b);
 407+}
 408+
 409+static inline __m128i
 410+xorv_sse2(__m128i a, __m128i b)
 411+{
 412+	return _mm_xor_si128(a, b);
 413+}
 414+
 415+static inline __m128i
 416+set1_sse2(uint32_t x)
 417+{
 418+	return _mm_set1_epi32((int32_t)x);
 419+}
 420+
 421+static inline __m128i
 422+set4_sse2(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
 423+{
 424+	return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d);
 425+}
 426+
 427+static inline __m128i
 428+rot16_sse2(__m128i x)
 429+{
 430+	return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0xB1), 0xB1);
 431+}
 432+
 433+static inline __m128i
 434+rot12_sse2(__m128i x)
 435+{
 436+	return xorv_sse2(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12));
 437+}
 438+
 439+static inline __m128i
 440+rot8_sse2(__m128i x)
 441+{
 442+	return xorv_sse2(_mm_srli_epi32(x, 8), _mm_slli_epi32(x, 32 - 8));
 443+}
 444+
 445+static inline __m128i
 446+rot7_sse2(__m128i x)
 447+{
 448+	return xorv_sse2(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7));
 449+}
 450+
 451+static inline void
 452+g1_sse2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, __m128i m)
 453+{
 454+	*row0 = addv_sse2(addv_sse2(*row0, m), *row1);
 455+	*row3 = xorv_sse2(*row3, *row0);
 456+	*row3 = rot16_sse2(*row3);
 457+	*row2 = addv_sse2(*row2, *row3);
 458+	*row1 = xorv_sse2(*row1, *row2);
 459+	*row1 = rot12_sse2(*row1);
 460+}
 461+
 462+static inline void
 463+g2_sse2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, __m128i m)
 464+{
 465+	*row0 = addv_sse2(addv_sse2(*row0, m), *row1);
 466+	*row3 = xorv_sse2(*row3, *row0);
 467+	*row3 = rot8_sse2(*row3);
 468+	*row2 = addv_sse2(*row2, *row3);
 469+	*row1 = xorv_sse2(*row1, *row2);
 470+	*row1 = rot7_sse2(*row1);
 471+}
 472+
 473+static inline void
 474+diagonalize_sse2(__m128i *row0, __m128i *row2, __m128i *row3)
 475+{
 476+	*row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3));
 477+	*row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
 478+	*row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1));
 479+}
 480+
 481+static inline void
 482+undiagonalize_sse2(__m128i *row0, __m128i *row2, __m128i *row3)
 483+{
 484+	*row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1));
 485+	*row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
 486+	*row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
 487+}
 488+
 489+static inline __m128i
 490+blend_epi16_sse2(__m128i a, __m128i b, const int16_t imm8)
 491+{
 492+	const __m128i bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
 493+	__m128i mask = _mm_set1_epi16(imm8);
 494+
 495+	mask = _mm_and_si128(mask, bits);
 496+	mask = _mm_cmpeq_epi16(mask, bits);
 497+	return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a));
 498+}
 499+
 500+static inline void
 501+compress_pre_sse2(__m128i rows[4], const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
 502+{
 503+	__m128i m0, m1, m2, m3;
 504+	__m128i t0, t1, t2, t3, tt;
 505+
 506+	rows[0] = loadu_sse2((uint8_t *)&cv[0]);
 507+	rows[1] = loadu_sse2((uint8_t *)&cv[4]);
 508+	rows[2] = set4_sse2(IV[0], IV[1], IV[2], IV[3]);
 509+	rows[3] = set4_sse2(counter_low(counter), counter_high(counter), (uint32_t)block_len, (uint32_t)flags);
 510+
 511+	m0 = loadu_sse2(&block[sizeof(__m128i) * 0]);
 512+	m1 = loadu_sse2(&block[sizeof(__m128i) * 1]);
 513+	m2 = loadu_sse2(&block[sizeof(__m128i) * 2]);
 514+	m3 = loadu_sse2(&block[sizeof(__m128i) * 3]);
 515+
 516+	/* round 1 */
 517+	t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0));
 518+	g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t0);
 519+	t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1));
 520+	g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
 521+	diagonalize_sse2(&rows[0], &rows[2], &rows[3]);
 522+	t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0));
 523+	t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));
 524+	g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t2);
 525+	t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1));
 526+	t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3));
 527+	g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
 528+	undiagonalize_sse2(&rows[0], &rows[2], &rows[3]);
 529+	m0 = t0;
 530+	m1 = t1;
 531+	m2 = t2;
 532+	m3 = t3;
 533+
 534+	/* round 2 */
 535+	t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
 536+	t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
 537+	g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t0);
 538+	t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
 539+	tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
 540+	t1 = blend_epi16_sse2(tt, t1, 0xCC);
 541+	g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
 542+	diagonalize_sse2(&rows[0], &rows[2], &rows[3]);
 543+	t2 = _mm_unpacklo_epi64(m3, m1);
 544+	tt = blend_epi16_sse2(t2, m2, 0xC0);
 545+	t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
 546+	g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t2);
 547+	t3 = _mm_unpackhi_epi32(m1, m3);
 548+	tt = _mm_unpacklo_epi32(m2, t3);
 549+	t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
 550+	g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
 551+	undiagonalize_sse2(&rows[0], &rows[2], &rows[3]);
 552+	m0 = t0;
 553+	m1 = t1;
 554+	m2 = t2;
 555+	m3 = t3;
 556+
 557+	/* round 3 */
 558+	t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
 559+	t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
 560+	g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t0);
 561+	t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
 562+	tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
 563+	t1 = blend_epi16_sse2(tt, t1, 0xCC);
 564+	g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
 565+	diagonalize_sse2(&rows[0], &rows[2], &rows[3]);
 566+	t2 = _mm_unpacklo_epi64(m3, m1);
 567+	tt = blend_epi16_sse2(t2, m2, 0xC0);
 568+	t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
 569+	g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t2);
 570+	t3 = _mm_unpackhi_epi32(m1, m3);
 571+	tt = _mm_unpacklo_epi32(m2, t3);
 572+	t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
 573+	g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
 574+	undiagonalize_sse2(&rows[0], &rows[2], &rows[3]);
 575+	m0 = t0;
 576+	m1 = t1;
 577+	m2 = t2;
 578+	m3 = t3;
 579+
 580+	/* round 4 */
 581+	t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
 582+	t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
 583+	g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t0);
 584+	t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
 585+	tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
 586+	t1 = blend_epi16_sse2(tt, t1, 0xCC);
 587+	g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
 588+	diagonalize_sse2(&rows[0], &rows[2], &rows[3]);
 589+	t2 = _mm_unpacklo_epi64(m3, m1);
 590+	tt = blend_epi16_sse2(t2, m2, 0xC0);
 591+	t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
 592+	g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t2);
 593+	t3 = _mm_unpackhi_epi32(m1, m3);
 594+	tt = _mm_unpacklo_epi32(m2, t3);
 595+	t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
 596+	g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
 597+	undiagonalize_sse2(&rows[0], &rows[2], &rows[3]);
 598+	m0 = t0;
 599+	m1 = t1;
 600+	m2 = t2;
 601+	m3 = t3;
 602+
 603+	/* round 5 */
 604+	t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
 605+	t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
 606+	g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t0);
 607+	t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
 608+	tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
 609+	t1 = blend_epi16_sse2(tt, t1, 0xCC);
 610+	g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
 611+	diagonalize_sse2(&rows[0], &rows[2], &rows[3]);
 612+	t2 = _mm_unpacklo_epi64(m3, m1);
 613+	tt = blend_epi16_sse2(t2, m2, 0xC0);
 614+	t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
 615+	g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t2);
 616+	t3 = _mm_unpackhi_epi32(m1, m3);
 617+	tt = _mm_unpacklo_epi32(m2, t3);
 618+	t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
 619+	g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
 620+	undiagonalize_sse2(&rows[0], &rows[2], &rows[3]);
 621+	m0 = t0;
 622+	m1 = t1;
 623+	m2 = t2;
 624+	m3 = t3;
 625+
 626+	/* round 6 */
 627+	t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
 628+	t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
 629+	g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t0);
 630+	t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
 631+	tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
 632+	t1 = blend_epi16_sse2(tt, t1, 0xCC);
 633+	g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
 634+	diagonalize_sse2(&rows[0], &rows[2], &rows[3]);
 635+	t2 = _mm_unpacklo_epi64(m3, m1);
 636+	tt = blend_epi16_sse2(t2, m2, 0xC0);
 637+	t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
 638+	g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t2);
 639+	t3 = _mm_unpackhi_epi32(m1, m3);
 640+	tt = _mm_unpacklo_epi32(m2, t3);
 641+	t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
 642+	g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
 643+	undiagonalize_sse2(&rows[0], &rows[2], &rows[3]);
 644+	m0 = t0;
 645+	m1 = t1;
 646+	m2 = t2;
 647+	m3 = t3;
 648+
 649+	/* round 7 */
 650+	t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
 651+	t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
 652+	g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t0);
 653+	t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
 654+	tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
 655+	t1 = blend_epi16_sse2(tt, t1, 0xCC);
 656+	g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
 657+	diagonalize_sse2(&rows[0], &rows[2], &rows[3]);
 658+	t2 = _mm_unpacklo_epi64(m3, m1);
 659+	tt = blend_epi16_sse2(t2, m2, 0xC0);
 660+	t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
 661+	g1_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t2);
 662+	t3 = _mm_unpackhi_epi32(m1, m3);
 663+	tt = _mm_unpacklo_epi32(m2, t3);
 664+	t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
 665+	g2_sse2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
 666+	undiagonalize_sse2(&rows[0], &rows[2], &rows[3]);
 667+}
 668+
 669+void
 670+blake3_compress_in_place_sse2(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
 671+{
 672+	__m128i rows[4];
 673+
 674+	compress_pre_sse2(rows, cv, block, block_len, counter, flags);
 675+	storeu_sse2(xorv_sse2(rows[0], rows[2]), (uint8_t *)&cv[0]);
 676+	storeu_sse2(xorv_sse2(rows[1], rows[3]), (uint8_t *)&cv[4]);
 677+}
 678+
 679+void
 680+blake3_compress_xof_sse2(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out_buf[64])
 681+{
 682+	__m128i rows[4];
 683+
 684+	compress_pre_sse2(rows, cv, block, block_len, counter, flags);
 685+	storeu_sse2(xorv_sse2(rows[0], rows[2]), &out_buf[0]);
 686+	storeu_sse2(xorv_sse2(rows[1], rows[3]), &out_buf[16]);
 687+	storeu_sse2(xorv_sse2(rows[2], loadu_sse2((uint8_t *)&cv[0])), &out_buf[32]);
 688+	storeu_sse2(xorv_sse2(rows[3], loadu_sse2((uint8_t *)&cv[4])), &out_buf[48]);
 689+}
 690+
 691+static inline void
 692+round_fn_sse2(__m128i v[16], __m128i m[16], size_t r)
 693+{
 694+	v[0] = addv_sse2(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
 695+	v[1] = addv_sse2(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
 696+	v[2] = addv_sse2(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
 697+	v[3] = addv_sse2(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
 698+	v[0] = addv_sse2(v[0], v[4]);
 699+	v[1] = addv_sse2(v[1], v[5]);
 700+	v[2] = addv_sse2(v[2], v[6]);
 701+	v[3] = addv_sse2(v[3], v[7]);
 702+	v[12] = xorv_sse2(v[12], v[0]);
 703+	v[13] = xorv_sse2(v[13], v[1]);
 704+	v[14] = xorv_sse2(v[14], v[2]);
 705+	v[15] = xorv_sse2(v[15], v[3]);
 706+	v[12] = rot16_sse2(v[12]);
 707+	v[13] = rot16_sse2(v[13]);
 708+	v[14] = rot16_sse2(v[14]);
 709+	v[15] = rot16_sse2(v[15]);
 710+	v[8] = addv_sse2(v[8], v[12]);
 711+	v[9] = addv_sse2(v[9], v[13]);
 712+	v[10] = addv_sse2(v[10], v[14]);
 713+	v[11] = addv_sse2(v[11], v[15]);
 714+	v[4] = xorv_sse2(v[4], v[8]);
 715+	v[5] = xorv_sse2(v[5], v[9]);
 716+	v[6] = xorv_sse2(v[6], v[10]);
 717+	v[7] = xorv_sse2(v[7], v[11]);
 718+	v[4] = rot12_sse2(v[4]);
 719+	v[5] = rot12_sse2(v[5]);
 720+	v[6] = rot12_sse2(v[6]);
 721+	v[7] = rot12_sse2(v[7]);
 722+	v[0] = addv_sse2(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
 723+	v[1] = addv_sse2(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
 724+	v[2] = addv_sse2(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
 725+	v[3] = addv_sse2(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
 726+	v[0] = addv_sse2(v[0], v[4]);
 727+	v[1] = addv_sse2(v[1], v[5]);
 728+	v[2] = addv_sse2(v[2], v[6]);
 729+	v[3] = addv_sse2(v[3], v[7]);
 730+	v[12] = xorv_sse2(v[12], v[0]);
 731+	v[13] = xorv_sse2(v[13], v[1]);
 732+	v[14] = xorv_sse2(v[14], v[2]);
 733+	v[15] = xorv_sse2(v[15], v[3]);
 734+	v[12] = rot8_sse2(v[12]);
 735+	v[13] = rot8_sse2(v[13]);
 736+	v[14] = rot8_sse2(v[14]);
 737+	v[15] = rot8_sse2(v[15]);
 738+	v[8] = addv_sse2(v[8], v[12]);
 739+	v[9] = addv_sse2(v[9], v[13]);
 740+	v[10] = addv_sse2(v[10], v[14]);
 741+	v[11] = addv_sse2(v[11], v[15]);
 742+	v[4] = xorv_sse2(v[4], v[8]);
 743+	v[5] = xorv_sse2(v[5], v[9]);
 744+	v[6] = xorv_sse2(v[6], v[10]);
 745+	v[7] = xorv_sse2(v[7], v[11]);
 746+	v[4] = rot7_sse2(v[4]);
 747+	v[5] = rot7_sse2(v[5]);
 748+	v[6] = rot7_sse2(v[6]);
 749+	v[7] = rot7_sse2(v[7]);
 750+
 751+	v[0] = addv_sse2(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
 752+	v[1] = addv_sse2(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
 753+	v[2] = addv_sse2(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
 754+	v[3] = addv_sse2(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
 755+	v[0] = addv_sse2(v[0], v[5]);
 756+	v[1] = addv_sse2(v[1], v[6]);
 757+	v[2] = addv_sse2(v[2], v[7]);
 758+	v[3] = addv_sse2(v[3], v[4]);
 759+	v[15] = xorv_sse2(v[15], v[0]);
 760+	v[12] = xorv_sse2(v[12], v[1]);
 761+	v[13] = xorv_sse2(v[13], v[2]);
 762+	v[14] = xorv_sse2(v[14], v[3]);
 763+	v[15] = rot16_sse2(v[15]);
 764+	v[12] = rot16_sse2(v[12]);
 765+	v[13] = rot16_sse2(v[13]);
 766+	v[14] = rot16_sse2(v[14]);
 767+	v[10] = addv_sse2(v[10], v[15]);
 768+	v[11] = addv_sse2(v[11], v[12]);
 769+	v[8] = addv_sse2(v[8], v[13]);
 770+	v[9] = addv_sse2(v[9], v[14]);
 771+	v[5] = xorv_sse2(v[5], v[10]);
 772+	v[6] = xorv_sse2(v[6], v[11]);
 773+	v[7] = xorv_sse2(v[7], v[8]);
 774+	v[4] = xorv_sse2(v[4], v[9]);
 775+	v[5] = rot12_sse2(v[5]);
 776+	v[6] = rot12_sse2(v[6]);
 777+	v[7] = rot12_sse2(v[7]);
 778+	v[4] = rot12_sse2(v[4]);
 779+	v[0] = addv_sse2(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
 780+	v[1] = addv_sse2(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
 781+	v[2] = addv_sse2(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
 782+	v[3] = addv_sse2(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
 783+	v[0] = addv_sse2(v[0], v[5]);
 784+	v[1] = addv_sse2(v[1], v[6]);
 785+	v[2] = addv_sse2(v[2], v[7]);
 786+	v[3] = addv_sse2(v[3], v[4]);
 787+	v[15] = xorv_sse2(v[15], v[0]);
 788+	v[12] = xorv_sse2(v[12], v[1]);
 789+	v[13] = xorv_sse2(v[13], v[2]);
 790+	v[14] = xorv_sse2(v[14], v[3]);
 791+	v[15] = rot8_sse2(v[15]);
 792+	v[12] = rot8_sse2(v[12]);
 793+	v[13] = rot8_sse2(v[13]);
 794+	v[14] = rot8_sse2(v[14]);
 795+	v[10] = addv_sse2(v[10], v[15]);
 796+	v[11] = addv_sse2(v[11], v[12]);
 797+	v[8] = addv_sse2(v[8], v[13]);
 798+	v[9] = addv_sse2(v[9], v[14]);
 799+	v[5] = xorv_sse2(v[5], v[10]);
 800+	v[6] = xorv_sse2(v[6], v[11]);
 801+	v[7] = xorv_sse2(v[7], v[8]);
 802+	v[4] = xorv_sse2(v[4], v[9]);
 803+	v[5] = rot7_sse2(v[5]);
 804+	v[6] = rot7_sse2(v[6]);
 805+	v[7] = rot7_sse2(v[7]);
 806+	v[4] = rot7_sse2(v[4]);
 807+}
 808+
 809+static inline void
 810+transpose_vecs_sse2(__m128i vecs[DEGREE_SSE2])
 811+{
 812+	__m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
 813+	__m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
 814+	__m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
 815+	__m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
 816+
 817+	__m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
 818+	__m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
 819+	__m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
 820+	__m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);
 821+
 822+	vecs[0] = abcd_0;
 823+	vecs[1] = abcd_1;
 824+	vecs[2] = abcd_2;
 825+	vecs[3] = abcd_3;
 826+}
 827+
 828+static inline void
 829+transpose_msg_vecs_sse2(const uint8_t *const *inputs, size_t block_offset, __m128i out_msg[16])
 830+{
 831+	size_t i;
 832+
 833+	out_msg[0] = loadu_sse2(&inputs[0][block_offset + 0 * sizeof(__m128i)]);
 834+	out_msg[1] = loadu_sse2(&inputs[1][block_offset + 0 * sizeof(__m128i)]);
 835+	out_msg[2] = loadu_sse2(&inputs[2][block_offset + 0 * sizeof(__m128i)]);
 836+	out_msg[3] = loadu_sse2(&inputs[3][block_offset + 0 * sizeof(__m128i)]);
 837+	out_msg[4] = loadu_sse2(&inputs[0][block_offset + 1 * sizeof(__m128i)]);
 838+	out_msg[5] = loadu_sse2(&inputs[1][block_offset + 1 * sizeof(__m128i)]);
 839+	out_msg[6] = loadu_sse2(&inputs[2][block_offset + 1 * sizeof(__m128i)]);
 840+	out_msg[7] = loadu_sse2(&inputs[3][block_offset + 1 * sizeof(__m128i)]);
 841+	out_msg[8] = loadu_sse2(&inputs[0][block_offset + 2 * sizeof(__m128i)]);
 842+	out_msg[9] = loadu_sse2(&inputs[1][block_offset + 2 * sizeof(__m128i)]);
 843+	out_msg[10] = loadu_sse2(&inputs[2][block_offset + 2 * sizeof(__m128i)]);
 844+	out_msg[11] = loadu_sse2(&inputs[3][block_offset + 2 * sizeof(__m128i)]);
 845+	out_msg[12] = loadu_sse2(&inputs[0][block_offset + 3 * sizeof(__m128i)]);
 846+	out_msg[13] = loadu_sse2(&inputs[1][block_offset + 3 * sizeof(__m128i)]);
 847+	out_msg[14] = loadu_sse2(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
 848+	out_msg[15] = loadu_sse2(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
 849+
 850+	for (i = 0; i < 4; i++) {
 851+		_mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
 852+	}
 853+	transpose_vecs_sse2(&out_msg[0]);
 854+	transpose_vecs_sse2(&out_msg[4]);
 855+	transpose_vecs_sse2(&out_msg[8]);
 856+	transpose_vecs_sse2(&out_msg[12]);
 857+}
 858+
 859+static inline void
 860+load_counters_sse2(uint64_t counter, int increment_counter, __m128i *out_lo, __m128i *out_hi)
 861+{
 862+	const __m128i mask = _mm_set1_epi32(-increment_counter);
 863+	const __m128i add0 = _mm_set_epi32(3, 2, 1, 0);
 864+	const __m128i add1 = _mm_and_si128(mask, add0);
 865+	__m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1);
 866+	__m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)), 
 867+	                                _mm_xor_si128(   l, _mm_set1_epi32(0x80000000)));
 868+	__m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry);
 869+
 870+	*out_lo = l;
 871+	*out_hi = h;
 872+}
 873+
 874+void
 875+blake3_hash4_sse2(const uint8_t *const *inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out_bytes)
 876+{
 877+	__m128i h_vecs[8];
 878+	__m128i counter_low_vec, counter_high_vec;
 879+	uint8_t block_flags;
 880+	size_t block;
 881+
 882+	h_vecs[0] = set1_sse2(key[0]);
 883+	h_vecs[1] = set1_sse2(key[1]);
 884+	h_vecs[2] = set1_sse2(key[2]);
 885+	h_vecs[3] = set1_sse2(key[3]);
 886+	h_vecs[4] = set1_sse2(key[4]);
 887+	h_vecs[5] = set1_sse2(key[5]);
 888+	h_vecs[6] = set1_sse2(key[6]);
 889+	h_vecs[7] = set1_sse2(key[7]);
 890+
 891+	load_counters_sse2(counter, increment_counter, &counter_low_vec, &counter_high_vec);
 892+	block_flags = flags | flags_start;
 893+
 894+	for (block = 0; block < blocks; block++) {
 895+		__m128i block_len_vec;
 896+		__m128i block_flags_vec;
 897+		__m128i msg_vecs[16];
 898+		__m128i v[16];
 899+
 900+		if (block + 1 == blocks) {
 901+			block_flags |= flags_end;
 902+		}
 903+		block_len_vec = set1_sse2(BLAKE3_BLOCK_LEN);
 904+		block_flags_vec = set1_sse2(block_flags);
 905+		transpose_msg_vecs_sse2(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
 906+
 907+		v[0] = h_vecs[0];
 908+		v[1] = h_vecs[1];
 909+		v[2] = h_vecs[2];
 910+		v[3] = h_vecs[3];
 911+		v[4] = h_vecs[4];
 912+		v[5] = h_vecs[5];
 913+		v[6] = h_vecs[6];
 914+		v[7] = h_vecs[7];
 915+		v[8] = set1_sse2(IV[0]);
 916+		v[9] = set1_sse2(IV[1]);
 917+		v[10] = set1_sse2(IV[2]);
 918+		v[11] = set1_sse2(IV[3]);
 919+		v[12] = counter_low_vec;
 920+		v[13] = counter_high_vec;
 921+		v[14] = block_len_vec;
 922+		v[15] = block_flags_vec;
 923+
 924+		round_fn_sse2(v, msg_vecs, 0);
 925+		round_fn_sse2(v, msg_vecs, 1);
 926+		round_fn_sse2(v, msg_vecs, 2);
 927+		round_fn_sse2(v, msg_vecs, 3);
 928+		round_fn_sse2(v, msg_vecs, 4);
 929+		round_fn_sse2(v, msg_vecs, 5);
 930+		round_fn_sse2(v, msg_vecs, 6);
 931+
 932+		h_vecs[0] = xorv_sse2(v[0], v[8]);
 933+		h_vecs[1] = xorv_sse2(v[1], v[9]);
 934+		h_vecs[2] = xorv_sse2(v[2], v[10]);
 935+		h_vecs[3] = xorv_sse2(v[3], v[11]);
 936+		h_vecs[4] = xorv_sse2(v[4], v[12]);
 937+		h_vecs[5] = xorv_sse2(v[5], v[13]);
 938+		h_vecs[6] = xorv_sse2(v[6], v[14]);
 939+		h_vecs[7] = xorv_sse2(v[7], v[15]);
 940+
 941+		block_flags = flags;
 942+	}
 943+
 944+	transpose_vecs_sse2(&h_vecs[0]);
 945+	transpose_vecs_sse2(&h_vecs[4]);
 946+	storeu_sse2(h_vecs[0], &out_bytes[0 * sizeof(__m128i)]);
 947+	storeu_sse2(h_vecs[4], &out_bytes[1 * sizeof(__m128i)]);
 948+	storeu_sse2(h_vecs[1], &out_bytes[2 * sizeof(__m128i)]);
 949+	storeu_sse2(h_vecs[5], &out_bytes[3 * sizeof(__m128i)]);
 950+	storeu_sse2(h_vecs[2], &out_bytes[4 * sizeof(__m128i)]);
 951+	storeu_sse2(h_vecs[6], &out_bytes[5 * sizeof(__m128i)]);
 952+	storeu_sse2(h_vecs[3], &out_bytes[6 * sizeof(__m128i)]);
 953+	storeu_sse2(h_vecs[7], &out_bytes[7 * sizeof(__m128i)]);
 954+}
 955+
 956+static inline void
 957+hash_one_sse2(const uint8_t *input, size_t blocks, const uint32_t key[8], uint64_t counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t out_bytes[BLAKE3_OUT_LEN])
 958+{
 959+	uint32_t cv[8];
 960+	uint8_t block_flags;
 961+
 962+	memcpy(cv, key, BLAKE3_KEY_LEN);
 963+	block_flags = flags | flags_start;
 964+	while (blocks > 0) {
 965+		if (blocks == 1) {
 966+			block_flags |= flags_end;
 967+		}
 968+		blake3_compress_in_place_sse2(cv, input, BLAKE3_BLOCK_LEN, counter, block_flags);
 969+		input = &input[BLAKE3_BLOCK_LEN];
 970+		blocks -= 1;
 971+		block_flags = flags;
 972+	}
 973+	memcpy(out_bytes, cv, BLAKE3_OUT_LEN);
 974+}
 975+
 976+void
 977+blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out_bytes)
 978+{
 979+	while (num_inputs >= DEGREE_SSE2) {
 980+		blake3_hash4_sse2(inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out_bytes);
 981+		if (increment_counter) {
 982+			counter += DEGREE_SSE2;
 983+		}
 984+		inputs += DEGREE_SSE2;
 985+		num_inputs -= DEGREE_SSE2;
 986+		out_bytes = &out_bytes[DEGREE_SSE2 * BLAKE3_OUT_LEN];
 987+	}
 988+	while (num_inputs > 0) {
 989+		hash_one_sse2(inputs[0], blocks, key, counter, flags, flags_start, flags_end, out_bytes);
 990+		if (increment_counter) {
 991+			counter += 1;
 992+		}
 993+		inputs += 1;
 994+		num_inputs -= 1;
 995+		out_bytes = &out_bytes[BLAKE3_OUT_LEN];
 996+	}
 997+}
 998+#pragma GCC pop_options
 999+
1000+#pragma GCC push_options
1001+#pragma GCC target("sse4.1")
1002+#define DEGREE_SSE41 4
1003+
1004+#define _mm_shuffle_ps2_sse41(a, b, c)                                         \
1005+  (_mm_castps_si128(                                                           \
1006+      _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))
1007+
1008+static inline __m128i
1009+loadu_sse41(const uint8_t src[16])
1010+{
1011+	return _mm_loadu_si128((const __m128i *)src);
1012+}
1013+
1014+static inline void
1015+storeu_sse41(__m128i src, uint8_t dest[16])
1016+{
1017+	_mm_storeu_si128((__m128i *)dest, src);
1018+}
1019+
1020+static inline __m128i
1021+addv_sse41(__m128i a, __m128i b)
1022+{
1023+	return _mm_add_epi32(a, b);
1024+}
1025+
1026+static inline __m128i
1027+xorv_sse41(__m128i a, __m128i b)
1028+{
1029+	return _mm_xor_si128(a, b);
1030+}
1031+
1032+static inline __m128i
1033+set1_sse41(uint32_t x)
1034+{
1035+	return _mm_set1_epi32((int32_t)x);
1036+}
1037+
1038+static inline __m128i
1039+set4_sse41(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
1040+{
1041+	return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d);
1042+}
1043+
1044+static inline __m128i
1045+rot16_sse41(__m128i x)
1046+{
1047+	return _mm_shuffle_epi8(x, _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2));
1048+}
1049+
1050+static inline __m128i
1051+rot12_sse41(__m128i x)
1052+{
1053+	return xorv_sse41(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12));
1054+}
1055+
1056+static inline __m128i
1057+rot8_sse41(__m128i x)
1058+{
1059+	return _mm_shuffle_epi8(x, _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1));
1060+}
1061+
1062+static inline __m128i
1063+rot7_sse41(__m128i x)
1064+{
1065+	return xorv_sse41(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7));
1066+}
1067+
1068+static inline void
1069+g1_sse41(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, __m128i m)
1070+{
1071+	*row0 = addv_sse41(addv_sse41(*row0, m), *row1);
1072+	*row3 = xorv_sse41(*row3, *row0);
1073+	*row3 = rot16_sse41(*row3);
1074+	*row2 = addv_sse41(*row2, *row3);
1075+	*row1 = xorv_sse41(*row1, *row2);
1076+	*row1 = rot12_sse41(*row1);
1077+}
1078+
1079+static inline void
1080+g2_sse41(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, __m128i m)
1081+{
1082+	*row0 = addv_sse41(addv_sse41(*row0, m), *row1);
1083+	*row3 = xorv_sse41(*row3, *row0);
1084+	*row3 = rot8_sse41(*row3);
1085+	*row2 = addv_sse41(*row2, *row3);
1086+	*row1 = xorv_sse41(*row1, *row2);
1087+	*row1 = rot7_sse41(*row1);
1088+}
1089+
1090+static inline void
1091+diagonalize_sse41(__m128i *row0, __m128i *row2, __m128i *row3)
1092+{
1093+	*row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3));
1094+	*row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
1095+	*row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1));
1096+}
1097+
1098+static inline void
1099+undiagonalize_sse41(__m128i *row0, __m128i *row2, __m128i *row3)
1100+{
1101+	*row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1));
1102+	*row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
1103+	*row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
1104+}
1105+
1106+static inline void
1107+compress_pre_sse41(__m128i rows[4], const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
1108+{
1109+	__m128i m0, m1, m2, m3;
1110+	__m128i t0, t1, t2, t3, tt;
1111+
1112+	rows[0] = loadu_sse41((uint8_t *)&cv[0]);
1113+	rows[1] = loadu_sse41((uint8_t *)&cv[4]);
1114+	rows[2] = set4_sse41(IV[0], IV[1], IV[2], IV[3]);
1115+	rows[3] = set4_sse41(counter_low(counter), counter_high(counter), (uint32_t)block_len, (uint32_t)flags);
1116+
1117+	m0 = loadu_sse41(&block[sizeof(__m128i) * 0]);
1118+	m1 = loadu_sse41(&block[sizeof(__m128i) * 1]);
1119+	m2 = loadu_sse41(&block[sizeof(__m128i) * 2]);
1120+	m3 = loadu_sse41(&block[sizeof(__m128i) * 3]);
1121+
1122+	/* round 1 */
1123+	t0 = _mm_shuffle_ps2_sse41(m0, m1, _MM_SHUFFLE(2, 0, 2, 0));
1124+	g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t0);
1125+	t1 = _mm_shuffle_ps2_sse41(m0, m1, _MM_SHUFFLE(3, 1, 3, 1));
1126+	g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t1);
1127+	diagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1128+	t2 = _mm_shuffle_ps2_sse41(m2, m3, _MM_SHUFFLE(2, 0, 2, 0));
1129+	t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));
1130+	g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t2);
1131+	t3 = _mm_shuffle_ps2_sse41(m2, m3, _MM_SHUFFLE(3, 1, 3, 1));
1132+	t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3));
1133+	g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t3);
1134+	undiagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1135+	m0 = t0;
1136+	m1 = t1;
1137+	m2 = t2;
1138+	m3 = t3;
1139+
1140+	/* round 2 */
1141+	t0 = _mm_shuffle_ps2_sse41(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
1142+	t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
1143+	g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t0);
1144+	t1 = _mm_shuffle_ps2_sse41(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
1145+	tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
1146+	t1 = _mm_blend_epi16(tt, t1, 0xCC);
1147+	g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t1);
1148+	diagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1149+	t2 = _mm_unpacklo_epi64(m3, m1);
1150+	tt = _mm_blend_epi16(t2, m2, 0xC0);
1151+	t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
1152+	g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t2);
1153+	t3 = _mm_unpackhi_epi32(m1, m3);
1154+	tt = _mm_unpacklo_epi32(m2, t3);
1155+	t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
1156+	g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t3);
1157+	undiagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1158+	m0 = t0;
1159+	m1 = t1;
1160+	m2 = t2;
1161+	m3 = t3;
1162+
1163+	/* round 3 */
1164+	t0 = _mm_shuffle_ps2_sse41(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
1165+	t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
1166+	g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t0);
1167+	t1 = _mm_shuffle_ps2_sse41(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
1168+	tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
1169+	t1 = _mm_blend_epi16(tt, t1, 0xCC);
1170+	g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t1);
1171+	diagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1172+	t2 = _mm_unpacklo_epi64(m3, m1);
1173+	tt = _mm_blend_epi16(t2, m2, 0xC0);
1174+	t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
1175+	g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t2);
1176+	t3 = _mm_unpackhi_epi32(m1, m3);
1177+	tt = _mm_unpacklo_epi32(m2, t3);
1178+	t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
1179+	g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t3);
1180+	undiagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1181+	m0 = t0;
1182+	m1 = t1;
1183+	m2 = t2;
1184+	m3 = t3;
1185+
1186+	/* round 4 */
1187+	t0 = _mm_shuffle_ps2_sse41(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
1188+	t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
1189+	g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t0);
1190+	t1 = _mm_shuffle_ps2_sse41(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
1191+	tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
1192+	t1 = _mm_blend_epi16(tt, t1, 0xCC);
1193+	g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t1);
1194+	diagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1195+	t2 = _mm_unpacklo_epi64(m3, m1);
1196+	tt = _mm_blend_epi16(t2, m2, 0xC0);
1197+	t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
1198+	g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t2);
1199+	t3 = _mm_unpackhi_epi32(m1, m3);
1200+	tt = _mm_unpacklo_epi32(m2, t3);
1201+	t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
1202+	g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t3);
1203+	undiagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1204+	m0 = t0;
1205+	m1 = t1;
1206+	m2 = t2;
1207+	m3 = t3;
1208+
1209+	/* round 5 */
1210+	t0 = _mm_shuffle_ps2_sse41(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
1211+	t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
1212+	g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t0);
1213+	t1 = _mm_shuffle_ps2_sse41(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
1214+	tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
1215+	t1 = _mm_blend_epi16(tt, t1, 0xCC);
1216+	g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t1);
1217+	diagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1218+	t2 = _mm_unpacklo_epi64(m3, m1);
1219+	tt = _mm_blend_epi16(t2, m2, 0xC0);
1220+	t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
1221+	g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t2);
1222+	t3 = _mm_unpackhi_epi32(m1, m3);
1223+	tt = _mm_unpacklo_epi32(m2, t3);
1224+	t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
1225+	g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t3);
1226+	undiagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1227+	m0 = t0;
1228+	m1 = t1;
1229+	m2 = t2;
1230+	m3 = t3;
1231+
1232+	/* round 6 */
1233+	t0 = _mm_shuffle_ps2_sse41(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
1234+	t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
1235+	g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t0);
1236+	t1 = _mm_shuffle_ps2_sse41(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
1237+	tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
1238+	t1 = _mm_blend_epi16(tt, t1, 0xCC);
1239+	g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t1);
1240+	diagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1241+	t2 = _mm_unpacklo_epi64(m3, m1);
1242+	tt = _mm_blend_epi16(t2, m2, 0xC0);
1243+	t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
1244+	g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t2);
1245+	t3 = _mm_unpackhi_epi32(m1, m3);
1246+	tt = _mm_unpacklo_epi32(m2, t3);
1247+	t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
1248+	g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t3);
1249+	undiagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1250+	m0 = t0;
1251+	m1 = t1;
1252+	m2 = t2;
1253+	m3 = t3;
1254+
1255+	/* round 7 */
1256+	t0 = _mm_shuffle_ps2_sse41(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
1257+	t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
1258+	g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t0);
1259+	t1 = _mm_shuffle_ps2_sse41(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
1260+	tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
1261+	t1 = _mm_blend_epi16(tt, t1, 0xCC);
1262+	g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t1);
1263+	diagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1264+	t2 = _mm_unpacklo_epi64(m3, m1);
1265+	tt = _mm_blend_epi16(t2, m2, 0xC0);
1266+	t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
1267+	g1_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t2);
1268+	t3 = _mm_unpackhi_epi32(m1, m3);
1269+	tt = _mm_unpacklo_epi32(m2, t3);
1270+	t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
1271+	g2_sse41(&rows[0], &rows[1], &rows[2], &rows[3], t3);
1272+	undiagonalize_sse41(&rows[0], &rows[2], &rows[3]);
1273+}
1274+
1275+void
1276+blake3_compress_in_place_sse41(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
1277+{
1278+	__m128i rows[4];
1279+
1280+	compress_pre_sse41(rows, cv, block, block_len, counter, flags);
1281+	storeu_sse41(xorv_sse41(rows[0], rows[2]), (uint8_t *)&cv[0]);
1282+	storeu_sse41(xorv_sse41(rows[1], rows[3]), (uint8_t *)&cv[4]);
1283+}
1284+
1285+void
1286+blake3_compress_xof_sse41(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out_buf[64])
1287+{
1288+	__m128i rows[4];
1289+
1290+	compress_pre_sse41(rows, cv, block, block_len, counter, flags);
1291+	storeu_sse41(xorv_sse41(rows[0], rows[2]), &out_buf[0]);
1292+	storeu_sse41(xorv_sse41(rows[1], rows[3]), &out_buf[16]);
1293+	storeu_sse41(xorv_sse41(rows[2], loadu_sse41((uint8_t *)&cv[0])), &out_buf[32]);
1294+	storeu_sse41(xorv_sse41(rows[3], loadu_sse41((uint8_t *)&cv[4])), &out_buf[48]);
1295+}
1296+
1297+static inline void
1298+round_fn_sse41(__m128i v[16], __m128i m[16], size_t r)
1299+{
1300+	v[0] = addv_sse41(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
1301+	v[1] = addv_sse41(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
1302+	v[2] = addv_sse41(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
1303+	v[3] = addv_sse41(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
1304+	v[0] = addv_sse41(v[0], v[4]);
1305+	v[1] = addv_sse41(v[1], v[5]);
1306+	v[2] = addv_sse41(v[2], v[6]);
1307+	v[3] = addv_sse41(v[3], v[7]);
1308+	v[12] = xorv_sse41(v[12], v[0]);
1309+	v[13] = xorv_sse41(v[13], v[1]);
1310+	v[14] = xorv_sse41(v[14], v[2]);
1311+	v[15] = xorv_sse41(v[15], v[3]);
1312+	v[12] = rot16_sse41(v[12]);
1313+	v[13] = rot16_sse41(v[13]);
1314+	v[14] = rot16_sse41(v[14]);
1315+	v[15] = rot16_sse41(v[15]);
1316+	v[8] = addv_sse41(v[8], v[12]);
1317+	v[9] = addv_sse41(v[9], v[13]);
1318+	v[10] = addv_sse41(v[10], v[14]);
1319+	v[11] = addv_sse41(v[11], v[15]);
1320+	v[4] = xorv_sse41(v[4], v[8]);
1321+	v[5] = xorv_sse41(v[5], v[9]);
1322+	v[6] = xorv_sse41(v[6], v[10]);
1323+	v[7] = xorv_sse41(v[7], v[11]);
1324+	v[4] = rot12_sse41(v[4]);
1325+	v[5] = rot12_sse41(v[5]);
1326+	v[6] = rot12_sse41(v[6]);
1327+	v[7] = rot12_sse41(v[7]);
1328+	v[0] = addv_sse41(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
1329+	v[1] = addv_sse41(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
1330+	v[2] = addv_sse41(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
1331+	v[3] = addv_sse41(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
1332+	v[0] = addv_sse41(v[0], v[4]);
1333+	v[1] = addv_sse41(v[1], v[5]);
1334+	v[2] = addv_sse41(v[2], v[6]);
1335+	v[3] = addv_sse41(v[3], v[7]);
1336+	v[12] = xorv_sse41(v[12], v[0]);
1337+	v[13] = xorv_sse41(v[13], v[1]);
1338+	v[14] = xorv_sse41(v[14], v[2]);
1339+	v[15] = xorv_sse41(v[15], v[3]);
1340+	v[12] = rot8_sse41(v[12]);
1341+	v[13] = rot8_sse41(v[13]);
1342+	v[14] = rot8_sse41(v[14]);
1343+	v[15] = rot8_sse41(v[15]);
1344+	v[8] = addv_sse41(v[8], v[12]);
1345+	v[9] = addv_sse41(v[9], v[13]);
1346+	v[10] = addv_sse41(v[10], v[14]);
1347+	v[11] = addv_sse41(v[11], v[15]);
1348+	v[4] = xorv_sse41(v[4], v[8]);
1349+	v[5] = xorv_sse41(v[5], v[9]);
1350+	v[6] = xorv_sse41(v[6], v[10]);
1351+	v[7] = xorv_sse41(v[7], v[11]);
1352+	v[4] = rot7_sse41(v[4]);
1353+	v[5] = rot7_sse41(v[5]);
1354+	v[6] = rot7_sse41(v[6]);
1355+	v[7] = rot7_sse41(v[7]);
1356+
1357+	v[0] = addv_sse41(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
1358+	v[1] = addv_sse41(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
1359+	v[2] = addv_sse41(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
1360+	v[3] = addv_sse41(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
1361+	v[0] = addv_sse41(v[0], v[5]);
1362+	v[1] = addv_sse41(v[1], v[6]);
1363+	v[2] = addv_sse41(v[2], v[7]);
1364+	v[3] = addv_sse41(v[3], v[4]);
1365+	v[15] = xorv_sse41(v[15], v[0]);
1366+	v[12] = xorv_sse41(v[12], v[1]);
1367+	v[13] = xorv_sse41(v[13], v[2]);
1368+	v[14] = xorv_sse41(v[14], v[3]);
1369+	v[15] = rot16_sse41(v[15]);
1370+	v[12] = rot16_sse41(v[12]);
1371+	v[13] = rot16_sse41(v[13]);
1372+	v[14] = rot16_sse41(v[14]);
1373+	v[10] = addv_sse41(v[10], v[15]);
1374+	v[11] = addv_sse41(v[11], v[12]);
1375+	v[8] = addv_sse41(v[8], v[13]);
1376+	v[9] = addv_sse41(v[9], v[14]);
1377+	v[5] = xorv_sse41(v[5], v[10]);
1378+	v[6] = xorv_sse41(v[6], v[11]);
1379+	v[7] = xorv_sse41(v[7], v[8]);
1380+	v[4] = xorv_sse41(v[4], v[9]);
1381+	v[5] = rot12_sse41(v[5]);
1382+	v[6] = rot12_sse41(v[6]);
1383+	v[7] = rot12_sse41(v[7]);
1384+	v[4] = rot12_sse41(v[4]);
1385+	v[0] = addv_sse41(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
1386+	v[1] = addv_sse41(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
1387+	v[2] = addv_sse41(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
1388+	v[3] = addv_sse41(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
1389+	v[0] = addv_sse41(v[0], v[5]);
1390+	v[1] = addv_sse41(v[1], v[6]);
1391+	v[2] = addv_sse41(v[2], v[7]);
1392+	v[3] = addv_sse41(v[3], v[4]);
1393+	v[15] = xorv_sse41(v[15], v[0]);
1394+	v[12] = xorv_sse41(v[12], v[1]);
1395+	v[13] = xorv_sse41(v[13], v[2]);
1396+	v[14] = xorv_sse41(v[14], v[3]);
1397+	v[15] = rot8_sse41(v[15]);
1398+	v[12] = rot8_sse41(v[12]);
1399+	v[13] = rot8_sse41(v[13]);
1400+	v[14] = rot8_sse41(v[14]);
1401+	v[10] = addv_sse41(v[10], v[15]);
1402+	v[11] = addv_sse41(v[11], v[12]);
1403+	v[8] = addv_sse41(v[8], v[13]);
1404+	v[9] = addv_sse41(v[9], v[14]);
1405+	v[5] = xorv_sse41(v[5], v[10]);
1406+	v[6] = xorv_sse41(v[6], v[11]);
1407+	v[7] = xorv_sse41(v[7], v[8]);
1408+	v[4] = xorv_sse41(v[4], v[9]);
1409+	v[5] = rot7_sse41(v[5]);
1410+	v[6] = rot7_sse41(v[6]);
1411+	v[7] = rot7_sse41(v[7]);
1412+	v[4] = rot7_sse41(v[4]);
1413+}
1414+
1415+static inline void
1416+transpose_vecs_sse41(__m128i vecs[DEGREE_SSE41])
1417+{
1418+	__m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
1419+	__m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
1420+	__m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
1421+	__m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
1422+
1423+	__m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
1424+	__m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
1425+	__m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
1426+	__m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);
1427+
1428+	vecs[0] = abcd_0;
1429+	vecs[1] = abcd_1;
1430+	vecs[2] = abcd_2;
1431+	vecs[3] = abcd_3;
1432+}
1433+
1434+static inline void
1435+transpose_msg_vecs_sse41(const uint8_t *const *inputs, size_t block_offset, __m128i out_msg[16])
1436+{
1437+	size_t i;
1438+
1439+	out_msg[0] = loadu_sse41(&inputs[0][block_offset + 0 * sizeof(__m128i)]);
1440+	out_msg[1] = loadu_sse41(&inputs[1][block_offset + 0 * sizeof(__m128i)]);
1441+	out_msg[2] = loadu_sse41(&inputs[2][block_offset + 0 * sizeof(__m128i)]);
1442+	out_msg[3] = loadu_sse41(&inputs[3][block_offset + 0 * sizeof(__m128i)]);
1443+	out_msg[4] = loadu_sse41(&inputs[0][block_offset + 1 * sizeof(__m128i)]);
1444+	out_msg[5] = loadu_sse41(&inputs[1][block_offset + 1 * sizeof(__m128i)]);
1445+	out_msg[6] = loadu_sse41(&inputs[2][block_offset + 1 * sizeof(__m128i)]);
1446+	out_msg[7] = loadu_sse41(&inputs[3][block_offset + 1 * sizeof(__m128i)]);
1447+	out_msg[8] = loadu_sse41(&inputs[0][block_offset + 2 * sizeof(__m128i)]);
1448+	out_msg[9] = loadu_sse41(&inputs[1][block_offset + 2 * sizeof(__m128i)]);
1449+	out_msg[10] = loadu_sse41(&inputs[2][block_offset + 2 * sizeof(__m128i)]);
1450+	out_msg[11] = loadu_sse41(&inputs[3][block_offset + 2 * sizeof(__m128i)]);
1451+	out_msg[12] = loadu_sse41(&inputs[0][block_offset + 3 * sizeof(__m128i)]);
1452+	out_msg[13] = loadu_sse41(&inputs[1][block_offset + 3 * sizeof(__m128i)]);
1453+	out_msg[14] = loadu_sse41(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
1454+	out_msg[15] = loadu_sse41(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
1455+
1456+	for (i = 0; i < 4; i++) {
1457+		_mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
1458+	}
1459+	transpose_vecs_sse41(&out_msg[0]);
1460+	transpose_vecs_sse41(&out_msg[4]);
1461+	transpose_vecs_sse41(&out_msg[8]);
1462+	transpose_vecs_sse41(&out_msg[12]);
1463+}
1464+
1465+static inline void
1466+load_counters_sse41(uint64_t counter, int increment_counter, __m128i *out_lo, __m128i *out_hi)
1467+{
1468+	const __m128i mask = _mm_set1_epi32(-increment_counter);
1469+	const __m128i add0 = _mm_set_epi32(3, 2, 1, 0);
1470+	const __m128i add1 = _mm_and_si128(mask, add0);
1471+	__m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1);
1472+	__m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)), 
1473+	                                _mm_xor_si128(   l, _mm_set1_epi32(0x80000000)));
1474+	__m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry);
1475+
1476+	*out_lo = l;
1477+	*out_hi = h;
1478+}
1479+
1480+void
1481+blake3_hash4_sse41(const uint8_t *const *inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out_bytes)
1482+{
1483+	__m128i h_vecs[8];
1484+	__m128i counter_low_vec, counter_high_vec;
1485+	uint8_t block_flags;
1486+	size_t block;
1487+
1488+	h_vecs[0] = set1_sse41(key[0]);
1489+	h_vecs[1] = set1_sse41(key[1]);
1490+	h_vecs[2] = set1_sse41(key[2]);
1491+	h_vecs[3] = set1_sse41(key[3]);
1492+	h_vecs[4] = set1_sse41(key[4]);
1493+	h_vecs[5] = set1_sse41(key[5]);
1494+	h_vecs[6] = set1_sse41(key[6]);
1495+	h_vecs[7] = set1_sse41(key[7]);
1496+
1497+	load_counters_sse41(counter, increment_counter, &counter_low_vec, &counter_high_vec);
1498+	block_flags = flags | flags_start;
1499+
1500+	for (block = 0; block < blocks; block++) {
1501+		__m128i block_len_vec;
1502+		__m128i block_flags_vec;
1503+		__m128i msg_vecs[16];
1504+		__m128i v[16];
1505+
1506+		if (block + 1 == blocks) {
1507+			block_flags |= flags_end;
1508+		}
1509+		block_len_vec = set1_sse41(BLAKE3_BLOCK_LEN);
1510+		block_flags_vec = set1_sse41(block_flags);
1511+		transpose_msg_vecs_sse41(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
1512+
1513+		v[0] = h_vecs[0];
1514+		v[1] = h_vecs[1];
1515+		v[2] = h_vecs[2];
1516+		v[3] = h_vecs[3];
1517+		v[4] = h_vecs[4];
1518+		v[5] = h_vecs[5];
1519+		v[6] = h_vecs[6];
1520+		v[7] = h_vecs[7];
1521+		v[8] = set1_sse41(IV[0]);
1522+		v[9] = set1_sse41(IV[1]);
1523+		v[10] = set1_sse41(IV[2]);
1524+		v[11] = set1_sse41(IV[3]);
1525+		v[12] = counter_low_vec;
1526+		v[13] = counter_high_vec;
1527+		v[14] = block_len_vec;
1528+		v[15] = block_flags_vec;
1529+
1530+		round_fn_sse41(v, msg_vecs, 0);
1531+		round_fn_sse41(v, msg_vecs, 1);
1532+		round_fn_sse41(v, msg_vecs, 2);
1533+		round_fn_sse41(v, msg_vecs, 3);
1534+		round_fn_sse41(v, msg_vecs, 4);
1535+		round_fn_sse41(v, msg_vecs, 5);
1536+		round_fn_sse41(v, msg_vecs, 6);
1537+
1538+		h_vecs[0] = xorv_sse41(v[0], v[8]);
1539+		h_vecs[1] = xorv_sse41(v[1], v[9]);
1540+		h_vecs[2] = xorv_sse41(v[2], v[10]);
1541+		h_vecs[3] = xorv_sse41(v[3], v[11]);
1542+		h_vecs[4] = xorv_sse41(v[4], v[12]);
1543+		h_vecs[5] = xorv_sse41(v[5], v[13]);
1544+		h_vecs[6] = xorv_sse41(v[6], v[14]);
1545+		h_vecs[7] = xorv_sse41(v[7], v[15]);
1546+
1547+		block_flags = flags;
1548+	}
1549+
1550+	transpose_vecs_sse41(&h_vecs[0]);
1551+	transpose_vecs_sse41(&h_vecs[4]);
1552+	storeu_sse41(h_vecs[0], &out_bytes[0 * sizeof(__m128i)]);
1553+	storeu_sse41(h_vecs[4], &out_bytes[1 * sizeof(__m128i)]);
1554+	storeu_sse41(h_vecs[1], &out_bytes[2 * sizeof(__m128i)]);
1555+	storeu_sse41(h_vecs[5], &out_bytes[3 * sizeof(__m128i)]);
1556+	storeu_sse41(h_vecs[2], &out_bytes[4 * sizeof(__m128i)]);
1557+	storeu_sse41(h_vecs[6], &out_bytes[5 * sizeof(__m128i)]);
1558+	storeu_sse41(h_vecs[3], &out_bytes[6 * sizeof(__m128i)]);
1559+	storeu_sse41(h_vecs[7], &out_bytes[7 * sizeof(__m128i)]);
1560+}
1561+
1562+static inline void
1563+hash_one_sse41(const uint8_t *input, size_t blocks, const uint32_t key[8], uint64_t counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t out_bytes[BLAKE3_OUT_LEN])
1564+{
1565+	uint32_t cv[8];
1566+	uint8_t block_flags;
1567+
1568+	memcpy(cv, key, BLAKE3_KEY_LEN);
1569+	block_flags = flags | flags_start;
1570+	while (blocks > 0) {
1571+		if (blocks == 1) {
1572+			block_flags |= flags_end;
1573+		}
1574+		blake3_compress_in_place_sse41(cv, input, BLAKE3_BLOCK_LEN, counter, block_flags);
1575+		input = &input[BLAKE3_BLOCK_LEN];
1576+		blocks -= 1;
1577+		block_flags = flags;
1578+	}
1579+	memcpy(out_bytes, cv, BLAKE3_OUT_LEN);
1580+}
1581+
1582+void
1583+blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out_bytes)
1584+{
1585+	while (num_inputs >= DEGREE_SSE41) {
1586+		blake3_hash4_sse41(inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out_bytes);
1587+		if (increment_counter) {
1588+			counter += DEGREE_SSE41;
1589+		}
1590+		inputs += DEGREE_SSE41;
1591+		num_inputs -= DEGREE_SSE41;
1592+		out_bytes = &out_bytes[DEGREE_SSE41 * BLAKE3_OUT_LEN];
1593+	}
1594+	while (num_inputs > 0) {
1595+		hash_one_sse41(inputs[0], blocks, key, counter, flags, flags_start, flags_end, out_bytes);
1596+		if (increment_counter) {
1597+			counter += 1;
1598+		}
1599+		inputs += 1;
1600+		num_inputs -= 1;
1601+		out_bytes = &out_bytes[BLAKE3_OUT_LEN];
1602+	}
1603+}
1604+#pragma GCC pop_options
1605+
1606+#pragma GCC push_options
1607+#pragma GCC target("avx2")
1608+#define DEGREE_AVX2 8
1609+
1610+static inline __m256i
1611+loadu_avx2(const uint8_t src[32])
1612+{
1613+	return _mm256_loadu_si256((const __m256i *)src);
1614+}
1615+
1616+static inline void
1617+storeu_avx2(__m256i src, uint8_t dest[32])
1618+{
1619+	_mm256_storeu_si256((__m256i *)dest, src);
1620+}
1621+
1622+static inline __m256i
1623+addv_avx2(__m256i a, __m256i b)
1624+{
1625+	return _mm256_add_epi32(a, b);
1626+}
1627+
1628+static inline __m256i
1629+xorv_avx2(__m256i a, __m256i b)
1630+{
1631+	return _mm256_xor_si256(a, b);
1632+}
1633+
1634+static inline __m256i
1635+set1_avx2(uint32_t x)
1636+{
1637+	return _mm256_set1_epi32((int32_t)x);
1638+}
1639+
1640+static inline __m256i
1641+rot16_avx2(__m256i x)
1642+{
1643+	return _mm256_shuffle_epi8(x, _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2,
1644+	                                              13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2));
1645+}
1646+
1647+static inline __m256i
1648+rot12_avx2(__m256i x)
1649+{
1650+	return _mm256_or_si256(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12));
1651+}
1652+
1653+static inline __m256i
1654+rot8_avx2(__m256i x)
1655+{
1656+	return _mm256_shuffle_epi8(x, _mm256_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1,
1657+	                                             12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1));
1658+}
1659+
1660+static inline __m256i
1661+rot7_avx2(__m256i x)
1662+{
1663+	return _mm256_or_si256(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7));
1664+}
1665+
1666+static inline void
1667+round_fn_avx2(__m256i v[16], __m256i m[16], size_t r)
1668+{
1669+	v[0] = addv_avx2(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
1670+	v[1] = addv_avx2(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
1671+	v[2] = addv_avx2(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
1672+	v[3] = addv_avx2(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
1673+	v[0] = addv_avx2(v[0], v[4]);
1674+	v[1] = addv_avx2(v[1], v[5]);
1675+	v[2] = addv_avx2(v[2], v[6]);
1676+	v[3] = addv_avx2(v[3], v[7]);
1677+	v[12] = xorv_avx2(v[12], v[0]);
1678+	v[13] = xorv_avx2(v[13], v[1]);
1679+	v[14] = xorv_avx2(v[14], v[2]);
1680+	v[15] = xorv_avx2(v[15], v[3]);
1681+	v[12] = rot16_avx2(v[12]);
1682+	v[13] = rot16_avx2(v[13]);
1683+	v[14] = rot16_avx2(v[14]);
1684+	v[15] = rot16_avx2(v[15]);
1685+	v[8] = addv_avx2(v[8], v[12]);
1686+	v[9] = addv_avx2(v[9], v[13]);
1687+	v[10] = addv_avx2(v[10], v[14]);
1688+	v[11] = addv_avx2(v[11], v[15]);
1689+	v[4] = xorv_avx2(v[4], v[8]);
1690+	v[5] = xorv_avx2(v[5], v[9]);
1691+	v[6] = xorv_avx2(v[6], v[10]);
1692+	v[7] = xorv_avx2(v[7], v[11]);
1693+	v[4] = rot12_avx2(v[4]);
1694+	v[5] = rot12_avx2(v[5]);
1695+	v[6] = rot12_avx2(v[6]);
1696+	v[7] = rot12_avx2(v[7]);
1697+	v[0] = addv_avx2(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
1698+	v[1] = addv_avx2(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
1699+	v[2] = addv_avx2(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
1700+	v[3] = addv_avx2(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
1701+	v[0] = addv_avx2(v[0], v[4]);
1702+	v[1] = addv_avx2(v[1], v[5]);
1703+	v[2] = addv_avx2(v[2], v[6]);
1704+	v[3] = addv_avx2(v[3], v[7]);
1705+	v[12] = xorv_avx2(v[12], v[0]);
1706+	v[13] = xorv_avx2(v[13], v[1]);
1707+	v[14] = xorv_avx2(v[14], v[2]);
1708+	v[15] = xorv_avx2(v[15], v[3]);
1709+	v[12] = rot8_avx2(v[12]);
1710+	v[13] = rot8_avx2(v[13]);
1711+	v[14] = rot8_avx2(v[14]);
1712+	v[15] = rot8_avx2(v[15]);
1713+	v[8] = addv_avx2(v[8], v[12]);
1714+	v[9] = addv_avx2(v[9], v[13]);
1715+	v[10] = addv_avx2(v[10], v[14]);
1716+	v[11] = addv_avx2(v[11], v[15]);
1717+	v[4] = xorv_avx2(v[4], v[8]);
1718+	v[5] = xorv_avx2(v[5], v[9]);
1719+	v[6] = xorv_avx2(v[6], v[10]);
1720+	v[7] = xorv_avx2(v[7], v[11]);
1721+	v[4] = rot7_avx2(v[4]);
1722+	v[5] = rot7_avx2(v[5]);
1723+	v[6] = rot7_avx2(v[6]);
1724+	v[7] = rot7_avx2(v[7]);
1725+
1726+	v[0] = addv_avx2(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
1727+	v[1] = addv_avx2(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
1728+	v[2] = addv_avx2(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
1729+	v[3] = addv_avx2(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
1730+	v[0] = addv_avx2(v[0], v[5]);
1731+	v[1] = addv_avx2(v[1], v[6]);
1732+	v[2] = addv_avx2(v[2], v[7]);
1733+	v[3] = addv_avx2(v[3], v[4]);
1734+	v[15] = xorv_avx2(v[15], v[0]);
1735+	v[12] = xorv_avx2(v[12], v[1]);
1736+	v[13] = xorv_avx2(v[13], v[2]);
1737+	v[14] = xorv_avx2(v[14], v[3]);
1738+	v[15] = rot16_avx2(v[15]);
1739+	v[12] = rot16_avx2(v[12]);
1740+	v[13] = rot16_avx2(v[13]);
1741+	v[14] = rot16_avx2(v[14]);
1742+	v[10] = addv_avx2(v[10], v[15]);
1743+	v[11] = addv_avx2(v[11], v[12]);
1744+	v[8] = addv_avx2(v[8], v[13]);
1745+	v[9] = addv_avx2(v[9], v[14]);
1746+	v[5] = xorv_avx2(v[5], v[10]);
1747+	v[6] = xorv_avx2(v[6], v[11]);
1748+	v[7] = xorv_avx2(v[7], v[8]);
1749+	v[4] = xorv_avx2(v[4], v[9]);
1750+	v[5] = rot12_avx2(v[5]);
1751+	v[6] = rot12_avx2(v[6]);
1752+	v[7] = rot12_avx2(v[7]);
1753+	v[4] = rot12_avx2(v[4]);
1754+	v[0] = addv_avx2(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
1755+	v[1] = addv_avx2(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
1756+	v[2] = addv_avx2(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
1757+	v[3] = addv_avx2(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
1758+	v[0] = addv_avx2(v[0], v[5]);
1759+	v[1] = addv_avx2(v[1], v[6]);
1760+	v[2] = addv_avx2(v[2], v[7]);
1761+	v[3] = addv_avx2(v[3], v[4]);
1762+	v[15] = xorv_avx2(v[15], v[0]);
1763+	v[12] = xorv_avx2(v[12], v[1]);
1764+	v[13] = xorv_avx2(v[13], v[2]);
1765+	v[14] = xorv_avx2(v[14], v[3]);
1766+	v[15] = rot8_avx2(v[15]);
1767+	v[12] = rot8_avx2(v[12]);
1768+	v[13] = rot8_avx2(v[13]);
1769+	v[14] = rot8_avx2(v[14]);
1770+	v[10] = addv_avx2(v[10], v[15]);
1771+	v[11] = addv_avx2(v[11], v[12]);
1772+	v[8] = addv_avx2(v[8], v[13]);
1773+	v[9] = addv_avx2(v[9], v[14]);
1774+	v[5] = xorv_avx2(v[5], v[10]);
1775+	v[6] = xorv_avx2(v[6], v[11]);
1776+	v[7] = xorv_avx2(v[7], v[8]);
1777+	v[4] = xorv_avx2(v[4], v[9]);
1778+	v[5] = rot7_avx2(v[5]);
1779+	v[6] = rot7_avx2(v[6]);
1780+	v[7] = rot7_avx2(v[7]);
1781+	v[4] = rot7_avx2(v[4]);
1782+}
1783+
1784+static inline void
1785+transpose_vecs_avx2(__m256i vecs[DEGREE_AVX2])
1786+{
1787+	__m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]);
1788+	__m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]);
1789+	__m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]);
1790+	__m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]);
1791+	__m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]);
1792+	__m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]);
1793+	__m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
1794+	__m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);
1795+
1796+	__m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
1797+	__m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
1798+	__m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367);
1799+	__m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367);
1800+	__m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145);
1801+	__m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145);
1802+	__m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367);
1803+	__m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367);
1804+
1805+	vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20);
1806+	vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20);
1807+	vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20);
1808+	vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20);
1809+	vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31);
1810+	vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31);
1811+	vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31);
1812+	vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31);
1813+}
1814+
1815+static inline void
1816+transpose_msg_vecs_avx2(const uint8_t *const *inputs, size_t block_offset, __m256i out_msg[16])
1817+{
1818+	size_t i;
1819+
1820+	out_msg[0] = loadu_avx2(&inputs[0][block_offset + 0 * sizeof(__m256i)]);
1821+	out_msg[1] = loadu_avx2(&inputs[1][block_offset + 0 * sizeof(__m256i)]);
1822+	out_msg[2] = loadu_avx2(&inputs[2][block_offset + 0 * sizeof(__m256i)]);
1823+	out_msg[3] = loadu_avx2(&inputs[3][block_offset + 0 * sizeof(__m256i)]);
1824+	out_msg[4] = loadu_avx2(&inputs[4][block_offset + 0 * sizeof(__m256i)]);
1825+	out_msg[5] = loadu_avx2(&inputs[5][block_offset + 0 * sizeof(__m256i)]);
1826+	out_msg[6] = loadu_avx2(&inputs[6][block_offset + 0 * sizeof(__m256i)]);
1827+	out_msg[7] = loadu_avx2(&inputs[7][block_offset + 0 * sizeof(__m256i)]);
1828+	out_msg[8] = loadu_avx2(&inputs[0][block_offset + 1 * sizeof(__m256i)]);
1829+	out_msg[9] = loadu_avx2(&inputs[1][block_offset + 1 * sizeof(__m256i)]);
1830+	out_msg[10] = loadu_avx2(&inputs[2][block_offset + 1 * sizeof(__m256i)]);
1831+	out_msg[11] = loadu_avx2(&inputs[3][block_offset + 1 * sizeof(__m256i)]);
1832+	out_msg[12] = loadu_avx2(&inputs[4][block_offset + 1 * sizeof(__m256i)]);
1833+	out_msg[13] = loadu_avx2(&inputs[5][block_offset + 1 * sizeof(__m256i)]);
1834+	out_msg[14] = loadu_avx2(&inputs[6][block_offset + 1 * sizeof(__m256i)]);
1835+	out_msg[15] = loadu_avx2(&inputs[7][block_offset + 1 * sizeof(__m256i)]);
1836+
1837+	for (i = 0; i < 8; i++) {
1838+		_mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
1839+	}
1840+	transpose_vecs_avx2(&out_msg[0]);
1841+	transpose_vecs_avx2(&out_msg[8]);
1842+}
1843+
1844+static inline void
1845+load_counters_avx2(uint64_t counter, int increment_counter, __m256i *out_lo, __m256i *out_hi)
1846+{
1847+	const __m256i mask = _mm256_set1_epi32(-increment_counter);
1848+	const __m256i add0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
1849+	const __m256i add1 = _mm256_and_si256(mask, add0);
1850+	__m256i l = _mm256_add_epi32(_mm256_set1_epi32((int32_t)counter), add1);
1851+	__m256i carry = _mm256_xor_si256(add1, _mm256_set1_epi32(0x80000000));
1852+	__m256i comp = _mm256_xor_si256(l, _mm256_set1_epi32(0x80000000));
1853+	__m256i gt = _mm256_cmpgt_epi32(carry, comp);
1854+	__m256i h = _mm256_sub_epi32(_mm256_set1_epi32((int32_t)(counter >> 32)), gt);
1855+
1856+	*out_lo = l;
1857+	*out_hi = h;
1858+}
1859+
1860+void
1861+blake3_hash8_avx2(const uint8_t *const *inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out_bytes)
1862+{
1863+	__m256i h_vecs[8];
1864+	__m256i counter_low_vec, counter_high_vec;
1865+	uint8_t block_flags;
1866+	size_t block;
1867+
1868+	h_vecs[0] = set1_avx2(key[0]);
1869+	h_vecs[1] = set1_avx2(key[1]);
1870+	h_vecs[2] = set1_avx2(key[2]);
1871+	h_vecs[3] = set1_avx2(key[3]);
1872+	h_vecs[4] = set1_avx2(key[4]);
1873+	h_vecs[5] = set1_avx2(key[5]);
1874+	h_vecs[6] = set1_avx2(key[6]);
1875+	h_vecs[7] = set1_avx2(key[7]);
1876+
1877+	load_counters_avx2(counter, increment_counter, &counter_low_vec, &counter_high_vec);
1878+	block_flags = flags | flags_start;
1879+
1880+	for (block = 0; block < blocks; block++) {
1881+		__m256i block_len_vec;
1882+		__m256i block_flags_vec;
1883+		__m256i msg_vecs[16];
1884+		__m256i v[16];
1885+
1886+		if (block + 1 == blocks) {
1887+			block_flags |= flags_end;
1888+		}
1889+		block_len_vec = set1_avx2(BLAKE3_BLOCK_LEN);
1890+		block_flags_vec = set1_avx2(block_flags);
1891+		transpose_msg_vecs_avx2(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
1892+
1893+		v[0] = h_vecs[0];
1894+		v[1] = h_vecs[1];
1895+		v[2] = h_vecs[2];
1896+		v[3] = h_vecs[3];
1897+		v[4] = h_vecs[4];
1898+		v[5] = h_vecs[5];
1899+		v[6] = h_vecs[6];
1900+		v[7] = h_vecs[7];
1901+		v[8] = set1_avx2(IV[0]);
1902+		v[9] = set1_avx2(IV[1]);
1903+		v[10] = set1_avx2(IV[2]);
1904+		v[11] = set1_avx2(IV[3]);
1905+		v[12] = counter_low_vec;
1906+		v[13] = counter_high_vec;
1907+		v[14] = block_len_vec;
1908+		v[15] = block_flags_vec;
1909+
1910+		round_fn_avx2(v, msg_vecs, 0);
1911+		round_fn_avx2(v, msg_vecs, 1);
1912+		round_fn_avx2(v, msg_vecs, 2);
1913+		round_fn_avx2(v, msg_vecs, 3);
1914+		round_fn_avx2(v, msg_vecs, 4);
1915+		round_fn_avx2(v, msg_vecs, 5);
1916+		round_fn_avx2(v, msg_vecs, 6);
1917+
1918+		h_vecs[0] = xorv_avx2(v[0], v[8]);
1919+		h_vecs[1] = xorv_avx2(v[1], v[9]);
1920+		h_vecs[2] = xorv_avx2(v[2], v[10]);
1921+		h_vecs[3] = xorv_avx2(v[3], v[11]);
1922+		h_vecs[4] = xorv_avx2(v[4], v[12]);
1923+		h_vecs[5] = xorv_avx2(v[5], v[13]);
1924+		h_vecs[6] = xorv_avx2(v[6], v[14]);
1925+		h_vecs[7] = xorv_avx2(v[7], v[15]);
1926+
1927+		block_flags = flags;
1928+	}
1929+
1930+	transpose_vecs_avx2(h_vecs);
1931+	storeu_avx2(h_vecs[0], &out_bytes[0 * sizeof(__m256i)]);
1932+	storeu_avx2(h_vecs[1], &out_bytes[1 * sizeof(__m256i)]);
1933+	storeu_avx2(h_vecs[2], &out_bytes[2 * sizeof(__m256i)]);
1934+	storeu_avx2(h_vecs[3], &out_bytes[3 * sizeof(__m256i)]);
1935+	storeu_avx2(h_vecs[4], &out_bytes[4 * sizeof(__m256i)]);
1936+	storeu_avx2(h_vecs[5], &out_bytes[5 * sizeof(__m256i)]);
1937+	storeu_avx2(h_vecs[6], &out_bytes[6 * sizeof(__m256i)]);
1938+	storeu_avx2(h_vecs[7], &out_bytes[7 * sizeof(__m256i)]);
1939+}
1940+
1941+void
1942+blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out_bytes)
1943+{
1944+	while (num_inputs >= DEGREE_AVX2) {
1945+		blake3_hash8_avx2(inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out_bytes);
1946+		if (increment_counter) {
1947+			counter += DEGREE_AVX2;
1948+		}
1949+		inputs += DEGREE_AVX2;
1950+		num_inputs -= DEGREE_AVX2;
1951+		out_bytes = &out_bytes[DEGREE_AVX2 * BLAKE3_OUT_LEN];
1952+	}
1953+	blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out_bytes);
1954+}
1955+#pragma GCC pop_options
1956+
1957+#pragma GCC push_options
1958+#pragma GCC target("avx512f,avx512vl")
1959+static inline __m128i
1960+loadu_128_avx512(const uint8_t src[16])
1961+{
1962+	return _mm_loadu_si128((const __m128i *)src);
1963+}
1964+
1965+static inline __m256i
1966+loadu_256_avx512(const uint8_t src[32])
1967+{
1968+	return _mm256_loadu_si256((const __m256i *)src);
1969+}
1970+
1971+static inline __m512i
1972+loadu_512_avx512(const uint8_t src[64])
1973+{
1974+	return _mm512_loadu_si512((const __m512i *)src);
1975+}
1976+
1977+static inline void
1978+storeu_128_avx512(__m128i src, uint8_t dest[16])
1979+{
1980+	_mm_storeu_si128((__m128i *)dest, src);
1981+}
1982+
1983+static inline void
1984+storeu_256_avx512(__m256i src, uint8_t dest[32])
1985+{
1986+	_mm256_storeu_si256((__m256i *)dest, src);
1987+}
1988+
1989+static inline __m128i
1990+add_128_avx512(__m128i a, __m128i b)
1991+{
1992+	return _mm_add_epi32(a, b);
1993+}
1994+
1995+static inline __m256i
1996+add_256_avx512(__m256i a, __m256i b)
1997+{
1998+	return _mm256_add_epi32(a, b);
1999+}
2000+
2001+static inline __m512i
2002+add_512_avx512(__m512i a, __m512i b)
2003+{
2004+	return _mm512_add_epi32(a, b);
2005+}
2006+
2007+static inline __m128i
2008+xor_128_avx512(__m128i a, __m128i b)
2009+{
2010+	return _mm_xor_si128(a, b);
2011+}
2012+
2013+static inline __m256i
2014+xor_256_avx512(__m256i a, __m256i b)
2015+{
2016+	return _mm256_xor_si256(a, b);
2017+}
2018+
2019+static inline __m512i
2020+xor_512_avx512(__m512i a, __m512i b)
2021+{
2022+	return _mm512_xor_si512(a, b);
2023+}
2024+
2025+static inline __m128i
2026+set1_128_avx512(uint32_t x)
2027+{
2028+	return _mm_set1_epi32((int32_t)x);
2029+}
2030+
2031+static inline __m256i
2032+set1_256_avx512(uint32_t x)
2033+{
2034+	return _mm256_set1_epi32((int32_t)x);
2035+}
2036+
2037+static inline __m512i
2038+set1_512_avx512(uint32_t x)
2039+{
2040+	return _mm512_set1_epi32((int32_t)x);
2041+}
2042+
2043+static inline __m128i
2044+set4_avx512(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
2045+{
2046+	return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d);
2047+}
2048+
2049+static inline __m128i
2050+rot16_128_avx512(__m128i x)
2051+{
2052+	return _mm_ror_epi32(x, 16);
2053+}
2054+
2055+static inline __m256i
2056+rot16_256_avx512(__m256i x)
2057+{
2058+	return _mm256_ror_epi32(x, 16);
2059+}
2060+
2061+static inline __m512i
2062+rot16_512_avx512(__m512i x)
2063+{
2064+	return _mm512_ror_epi32(x, 16);
2065+}
2066+
2067+static inline __m128i
2068+rot12_128_avx512(__m128i x)
2069+{
2070+	return _mm_ror_epi32(x, 12);
2071+}
2072+
2073+static inline __m256i
2074+rot12_256_avx512(__m256i x)
2075+{
2076+	return _mm256_ror_epi32(x, 12);
2077+}
2078+
2079+static inline __m512i
2080+rot12_512_avx512(__m512i x)
2081+{
2082+	return _mm512_ror_epi32(x, 12);
2083+}
2084+
2085+static inline __m128i
2086+rot8_128_avx512(__m128i x)
2087+{
2088+	return _mm_ror_epi32(x, 8);
2089+}
2090+
2091+static inline __m256i
2092+rot8_256_avx512(__m256i x)
2093+{
2094+	return _mm256_ror_epi32(x, 8);
2095+}
2096+
2097+static inline __m512i
2098+rot8_512_avx512(__m512i x)
2099+{
2100+	return _mm512_ror_epi32(x, 8);
2101+}
2102+
2103+static inline __m128i
2104+rot7_128_avx512(__m128i x)
2105+{
2106+	return _mm_ror_epi32(x, 7);
2107+}
2108+
2109+static inline __m256i
2110+rot7_256_avx512(__m256i x)
2111+{
2112+	return _mm256_ror_epi32(x, 7);
2113+}
2114+
2115+static inline __m512i
2116+rot7_512_avx512(__m512i x)
2117+{
2118+	return _mm512_ror_epi32(x, 7);
2119+}
2120+
2121+static inline void
2122+g1_avx512(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, __m128i m)
2123+{
2124+	*row0 = add_128_avx512(add_128_avx512(*row0, m), *row1);
2125+	*row3 = xor_128_avx512(*row3, *row0);
2126+	*row3 = rot16_128_avx512(*row3);
2127+	*row2 = add_128_avx512(*row2, *row3);
2128+	*row1 = xor_128_avx512(*row1, *row2);
2129+	*row1 = rot12_128_avx512(*row1);
2130+}
2131+
2132+static inline void
2133+g2_avx512(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, __m128i m)
2134+{
2135+	*row0 = add_128_avx512(add_128_avx512(*row0, m), *row1);
2136+	*row3 = xor_128_avx512(*row3, *row0);
2137+	*row3 = rot8_128_avx512(*row3);
2138+	*row2 = add_128_avx512(*row2, *row3);
2139+	*row1 = xor_128_avx512(*row1, *row2);
2140+	*row1 = rot7_128_avx512(*row1);
2141+}
2142+
2143+static inline void
2144+diagonalize_avx512(__m128i *row0, __m128i *row2, __m128i *row3)
2145+{
2146+	*row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3));
2147+	*row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
2148+	*row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1));
2149+}
2150+
2151+static inline void
2152+undiagonalize_avx512(__m128i *row0, __m128i *row2, __m128i *row3)
2153+{
2154+	*row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1));
2155+	*row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
2156+	*row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
2157+}
2158+
2159+static inline void
2160+compress_pre_avx512(__m128i rows[4], const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
2161+{
2162+	__m128i m0, m1, m2, m3;
2163+	__m128i t0, t1, t2, t3;
2164+
2165+	rows[0] = loadu_128_avx512((const uint8_t *)&cv[0]);
2166+	rows[1] = loadu_128_avx512((const uint8_t *)&cv[4]);
2167+	rows[2] = set4_avx512(IV[0], IV[1], IV[2], IV[3]);
2168+	rows[3] = set4_avx512(counter_low(counter), counter_high(counter), (uint32_t)block_len, (uint32_t)flags);
2169+
2170+	m0 = loadu_128_avx512(&block[sizeof(__m128i) * 0]);
2171+	m1 = loadu_128_avx512(&block[sizeof(__m128i) * 1]);
2172+	m2 = loadu_128_avx512(&block[sizeof(__m128i) * 2]);
2173+	m3 = loadu_128_avx512(&block[sizeof(__m128i) * 3]);
2174+
2175+	/* round 1 */
2176+	t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0));
2177+	g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t0);
2178+	t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1));
2179+	g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t1);
2180+	diagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2181+	t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0));
2182+	t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));
2183+	g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t2);
2184+	t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1));
2185+	t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3));
2186+	g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t3);
2187+	undiagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2188+	m0 = t0;
2189+	m1 = t1;
2190+	m2 = t2;
2191+	m3 = t3;
2192+
2193+	/* round 2 */
2194+	t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
2195+	t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
2196+	g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t0);
2197+	t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
2198+	t1 = _mm_blend_epi16(m0, t1, 0xCC);
2199+	g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t1);
2200+	diagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2201+	t2 = _mm_unpacklo_epi64(m3, m1);
2202+	t2 = _mm_blend_epi16(t2, m2, 0xC0);
2203+	t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1, 3, 2, 0));
2204+	g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t2);
2205+	t3 = _mm_unpackhi_epi32(m1, m3);
2206+	t3 = _mm_unpacklo_epi32(m2, t3);
2207+	t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(0, 1, 3, 2));
2208+	g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t3);
2209+	undiagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2210+	m0 = t0;
2211+	m1 = t1;
2212+	m2 = t2;
2213+	m3 = t3;
2214+
2215+	/* round 3 */
2216+	t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
2217+	t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
2218+	g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t0);
2219+	t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
2220+	t1 = _mm_blend_epi16(m0, t1, 0xCC);
2221+	g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t1);
2222+	diagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2223+	t2 = _mm_unpacklo_epi64(m3, m1);
2224+	t2 = _mm_blend_epi16(t2, m2, 0xC0);
2225+	t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1, 3, 2, 0));
2226+	g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t2);
2227+	t3 = _mm_unpackhi_epi32(m1, m3);
2228+	t3 = _mm_unpacklo_epi32(m2, t3);
2229+	t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(0, 1, 3, 2));
2230+	g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t3);
2231+	undiagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2232+	m0 = t0;
2233+	m1 = t1;
2234+	m2 = t2;
2235+	m3 = t3;
2236+
2237+	/* round 4 */
2238+	t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
2239+	t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
2240+	g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t0);
2241+	t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
2242+	t1 = _mm_blend_epi16(m0, t1, 0xCC);
2243+	g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t1);
2244+	diagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2245+	t2 = _mm_unpacklo_epi64(m3, m1);
2246+	t2 = _mm_blend_epi16(t2, m2, 0xC0);
2247+	t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1, 3, 2, 0));
2248+	g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t2);
2249+	t3 = _mm_unpackhi_epi32(m1, m3);
2250+	t3 = _mm_unpacklo_epi32(m2, t3);
2251+	t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(0, 1, 3, 2));
2252+	g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t3);
2253+	undiagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2254+	m0 = t0;
2255+	m1 = t1;
2256+	m2 = t2;
2257+	m3 = t3;
2258+
2259+	/* round 5 */
2260+	t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
2261+	t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
2262+	g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t0);
2263+	t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
2264+	t1 = _mm_blend_epi16(m0, t1, 0xCC);
2265+	g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t1);
2266+	diagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2267+	t2 = _mm_unpacklo_epi64(m3, m1);
2268+	t2 = _mm_blend_epi16(t2, m2, 0xC0);
2269+	t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1, 3, 2, 0));
2270+	g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t2);
2271+	t3 = _mm_unpackhi_epi32(m1, m3);
2272+	t3 = _mm_unpacklo_epi32(m2, t3);
2273+	t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(0, 1, 3, 2));
2274+	g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t3);
2275+	undiagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2276+	m0 = t0;
2277+	m1 = t1;
2278+	m2 = t2;
2279+	m3 = t3;
2280+
2281+	/* round 6 */
2282+	t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
2283+	t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
2284+	g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t0);
2285+	t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
2286+	t1 = _mm_blend_epi16(m0, t1, 0xCC);
2287+	g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t1);
2288+	diagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2289+	t2 = _mm_unpacklo_epi64(m3, m1);
2290+	t2 = _mm_blend_epi16(t2, m2, 0xC0);
2291+	t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1, 3, 2, 0));
2292+	g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t2);
2293+	t3 = _mm_unpackhi_epi32(m1, m3);
2294+	t3 = _mm_unpacklo_epi32(m2, t3);
2295+	t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(0, 1, 3, 2));
2296+	g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t3);
2297+	undiagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2298+	m0 = t0;
2299+	m1 = t1;
2300+	m2 = t2;
2301+	m3 = t3;
2302+
2303+	/* round 7 */
2304+	t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
2305+	t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
2306+	g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t0);
2307+	t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
2308+	t1 = _mm_blend_epi16(m0, t1, 0xCC);
2309+	g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t1);
2310+	diagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2311+	t2 = _mm_unpacklo_epi64(m3, m1);
2312+	t2 = _mm_blend_epi16(t2, m2, 0xC0);
2313+	t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1, 3, 2, 0));
2314+	g1_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t2);
2315+	t3 = _mm_unpackhi_epi32(m1, m3);
2316+	t3 = _mm_unpacklo_epi32(m2, t3);
2317+	t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(0, 1, 3, 2));
2318+	g2_avx512(&rows[0], &rows[1], &rows[2], &rows[3], t3);
2319+	undiagonalize_avx512(&rows[0], &rows[2], &rows[3]);
2320+}
2321+
2322+void
2323+blake3_compress_in_place_avx512(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
2324+{
2325+	__m128i rows[4];
2326+
2327+	compress_pre_avx512(rows, cv, block, block_len, counter, flags);
2328+	storeu_128_avx512(xor_128_avx512(rows[0], rows[2]), (uint8_t *)&cv[0]);
2329+	storeu_128_avx512(xor_128_avx512(rows[1], rows[3]), (uint8_t *)&cv[4]);
2330+}
2331+
2332+void
2333+blake3_compress_xof_avx512(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out_buf[64])
2334+{
2335+	__m128i rows[4];
2336+
2337+	compress_pre_avx512(rows, cv, block, block_len, counter, flags);
2338+	storeu_128_avx512(xor_128_avx512(rows[0], rows[2]), &out_buf[0]);
2339+	storeu_128_avx512(xor_128_avx512(rows[1], rows[3]), &out_buf[16]);
2340+	storeu_128_avx512(xor_128_avx512(rows[2], loadu_128_avx512((const uint8_t *)&cv[0])), &out_buf[32]);
2341+	storeu_128_avx512(xor_128_avx512(rows[3], loadu_128_avx512((const uint8_t *)&cv[4])), &out_buf[48]);
2342+}
2343+
2344+static inline void
2345+round_fn4_avx512(__m128i v[16], __m128i m[16], size_t r)
2346+{
2347+	v[0] = add_128_avx512(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
2348+	v[1] = add_128_avx512(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
2349+	v[2] = add_128_avx512(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
2350+	v[3] = add_128_avx512(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
2351+	v[0] = add_128_avx512(v[0], v[4]);
2352+	v[1] = add_128_avx512(v[1], v[5]);
2353+	v[2] = add_128_avx512(v[2], v[6]);
2354+	v[3] = add_128_avx512(v[3], v[7]);
2355+	v[12] = xor_128_avx512(v[12], v[0]);
2356+	v[13] = xor_128_avx512(v[13], v[1]);
2357+	v[14] = xor_128_avx512(v[14], v[2]);
2358+	v[15] = xor_128_avx512(v[15], v[3]);
2359+	v[12] = rot16_128_avx512(v[12]);
2360+	v[13] = rot16_128_avx512(v[13]);
2361+	v[14] = rot16_128_avx512(v[14]);
2362+	v[15] = rot16_128_avx512(v[15]);
2363+	v[8] = add_128_avx512(v[8], v[12]);
2364+	v[9] = add_128_avx512(v[9], v[13]);
2365+	v[10] = add_128_avx512(v[10], v[14]);
2366+	v[11] = add_128_avx512(v[11], v[15]);
2367+	v[4] = xor_128_avx512(v[4], v[8]);
2368+	v[5] = xor_128_avx512(v[5], v[9]);
2369+	v[6] = xor_128_avx512(v[6], v[10]);
2370+	v[7] = xor_128_avx512(v[7], v[11]);
2371+	v[4] = rot12_128_avx512(v[4]);
2372+	v[5] = rot12_128_avx512(v[5]);
2373+	v[6] = rot12_128_avx512(v[6]);
2374+	v[7] = rot12_128_avx512(v[7]);
2375+	v[0] = add_128_avx512(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
2376+	v[1] = add_128_avx512(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
2377+	v[2] = add_128_avx512(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
2378+	v[3] = add_128_avx512(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
2379+	v[0] = add_128_avx512(v[0], v[4]);
2380+	v[1] = add_128_avx512(v[1], v[5]);
2381+	v[2] = add_128_avx512(v[2], v[6]);
2382+	v[3] = add_128_avx512(v[3], v[7]);
2383+	v[12] = xor_128_avx512(v[12], v[0]);
2384+	v[13] = xor_128_avx512(v[13], v[1]);
2385+	v[14] = xor_128_avx512(v[14], v[2]);
2386+	v[15] = xor_128_avx512(v[15], v[3]);
2387+	v[12] = rot8_128_avx512(v[12]);
2388+	v[13] = rot8_128_avx512(v[13]);
2389+	v[14] = rot8_128_avx512(v[14]);
2390+	v[15] = rot8_128_avx512(v[15]);
2391+	v[8] = add_128_avx512(v[8], v[12]);
2392+	v[9] = add_128_avx512(v[9], v[13]);
2393+	v[10] = add_128_avx512(v[10], v[14]);
2394+	v[11] = add_128_avx512(v[11], v[15]);
2395+	v[4] = xor_128_avx512(v[4], v[8]);
2396+	v[5] = xor_128_avx512(v[5], v[9]);
2397+	v[6] = xor_128_avx512(v[6], v[10]);
2398+	v[7] = xor_128_avx512(v[7], v[11]);
2399+	v[4] = rot7_128_avx512(v[4]);
2400+	v[5] = rot7_128_avx512(v[5]);
2401+	v[6] = rot7_128_avx512(v[6]);
2402+	v[7] = rot7_128_avx512(v[7]);
2403+
2404+	v[0] = add_128_avx512(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
2405+	v[1] = add_128_avx512(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
2406+	v[2] = add_128_avx512(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
2407+	v[3] = add_128_avx512(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
2408+	v[0] = add_128_avx512(v[0], v[5]);
2409+	v[1] = add_128_avx512(v[1], v[6]);
2410+	v[2] = add_128_avx512(v[2], v[7]);
2411+	v[3] = add_128_avx512(v[3], v[4]);
2412+	v[15] = xor_128_avx512(v[15], v[0]);
2413+	v[12] = xor_128_avx512(v[12], v[1]);
2414+	v[13] = xor_128_avx512(v[13], v[2]);
2415+	v[14] = xor_128_avx512(v[14], v[3]);
2416+	v[15] = rot16_128_avx512(v[15]);
2417+	v[12] = rot16_128_avx512(v[12]);
2418+	v[13] = rot16_128_avx512(v[13]);
2419+	v[14] = rot16_128_avx512(v[14]);
2420+	v[10] = add_128_avx512(v[10], v[15]);
2421+	v[11] = add_128_avx512(v[11], v[12]);
2422+	v[8] = add_128_avx512(v[8], v[13]);
2423+	v[9] = add_128_avx512(v[9], v[14]);
2424+	v[5] = xor_128_avx512(v[5], v[10]);
2425+	v[6] = xor_128_avx512(v[6], v[11]);
2426+	v[7] = xor_128_avx512(v[7], v[8]);
2427+	v[4] = xor_128_avx512(v[4], v[9]);
2428+	v[5] = rot12_128_avx512(v[5]);
2429+	v[6] = rot12_128_avx512(v[6]);
2430+	v[7] = rot12_128_avx512(v[7]);
2431+	v[4] = rot12_128_avx512(v[4]);
2432+	v[0] = add_128_avx512(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
2433+	v[1] = add_128_avx512(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
2434+	v[2] = add_128_avx512(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
2435+	v[3] = add_128_avx512(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
2436+	v[0] = add_128_avx512(v[0], v[5]);
2437+	v[1] = add_128_avx512(v[1], v[6]);
2438+	v[2] = add_128_avx512(v[2], v[7]);
2439+	v[3] = add_128_avx512(v[3], v[4]);
2440+	v[15] = xor_128_avx512(v[15], v[0]);
2441+	v[12] = xor_128_avx512(v[12], v[1]);
2442+	v[13] = xor_128_avx512(v[13], v[2]);
2443+	v[14] = xor_128_avx512(v[14], v[3]);
2444+	v[15] = rot8_128_avx512(v[15]);
2445+	v[12] = rot8_128_avx512(v[12]);
2446+	v[13] = rot8_128_avx512(v[13]);
2447+	v[14] = rot8_128_avx512(v[14]);
2448+	v[10] = add_128_avx512(v[10], v[15]);
2449+	v[11] = add_128_avx512(v[11], v[12]);
2450+	v[8] = add_128_avx512(v[8], v[13]);
2451+	v[9] = add_128_avx512(v[9], v[14]);
2452+	v[5] = xor_128_avx512(v[5], v[10]);
2453+	v[6] = xor_128_avx512(v[6], v[11]);
2454+	v[7] = xor_128_avx512(v[7], v[8]);
2455+	v[4] = xor_128_avx512(v[4], v[9]);
2456+	v[5] = rot7_128_avx512(v[5]);
2457+	v[6] = rot7_128_avx512(v[6]);
2458+	v[7] = rot7_128_avx512(v[7]);
2459+	v[4] = rot7_128_avx512(v[4]);
2460+}
2461+
2462+static inline void
2463+round_fn8_avx512(__m256i v[16], __m256i m[16], size_t r)
2464+{
2465+	v[0] = add_256_avx512(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
2466+	v[1] = add_256_avx512(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
2467+	v[2] = add_256_avx512(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
2468+	v[3] = add_256_avx512(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
2469+	v[0] = add_256_avx512(v[0], v[4]);
2470+	v[1] = add_256_avx512(v[1], v[5]);
2471+	v[2] = add_256_avx512(v[2], v[6]);
2472+	v[3] = add_256_avx512(v[3], v[7]);
2473+	v[12] = xor_256_avx512(v[12], v[0]);
2474+	v[13] = xor_256_avx512(v[13], v[1]);
2475+	v[14] = xor_256_avx512(v[14], v[2]);
2476+	v[15] = xor_256_avx512(v[15], v[3]);
2477+	v[12] = rot16_256_avx512(v[12]);
2478+	v[13] = rot16_256_avx512(v[13]);
2479+	v[14] = rot16_256_avx512(v[14]);
2480+	v[15] = rot16_256_avx512(v[15]);
2481+	v[8] = add_256_avx512(v[8], v[12]);
2482+	v[9] = add_256_avx512(v[9], v[13]);
2483+	v[10] = add_256_avx512(v[10], v[14]);
2484+	v[11] = add_256_avx512(v[11], v[15]);
2485+	v[4] = xor_256_avx512(v[4], v[8]);
2486+	v[5] = xor_256_avx512(v[5], v[9]);
2487+	v[6] = xor_256_avx512(v[6], v[10]);
2488+	v[7] = xor_256_avx512(v[7], v[11]);
2489+	v[4] = rot12_256_avx512(v[4]);
2490+	v[5] = rot12_256_avx512(v[5]);
2491+	v[6] = rot12_256_avx512(v[6]);
2492+	v[7] = rot12_256_avx512(v[7]);
2493+	v[0] = add_256_avx512(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
2494+	v[1] = add_256_avx512(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
2495+	v[2] = add_256_avx512(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
2496+	v[3] = add_256_avx512(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
2497+	v[0] = add_256_avx512(v[0], v[4]);
2498+	v[1] = add_256_avx512(v[1], v[5]);
2499+	v[2] = add_256_avx512(v[2], v[6]);
2500+	v[3] = add_256_avx512(v[3], v[7]);
2501+	v[12] = xor_256_avx512(v[12], v[0]);
2502+	v[13] = xor_256_avx512(v[13], v[1]);
2503+	v[14] = xor_256_avx512(v[14], v[2]);
2504+	v[15] = xor_256_avx512(v[15], v[3]);
2505+	v[12] = rot8_256_avx512(v[12]);
2506+	v[13] = rot8_256_avx512(v[13]);
2507+	v[14] = rot8_256_avx512(v[14]);
2508+	v[15] = rot8_256_avx512(v[15]);
2509+	v[8] = add_256_avx512(v[8], v[12]);
2510+	v[9] = add_256_avx512(v[9], v[13]);
2511+	v[10] = add_256_avx512(v[10], v[14]);
2512+	v[11] = add_256_avx512(v[11], v[15]);
2513+	v[4] = xor_256_avx512(v[4], v[8]);
2514+	v[5] = xor_256_avx512(v[5], v[9]);
2515+	v[6] = xor_256_avx512(v[6], v[10]);
2516+	v[7] = xor_256_avx512(v[7], v[11]);
2517+	v[4] = rot7_256_avx512(v[4]);
2518+	v[5] = rot7_256_avx512(v[5]);
2519+	v[6] = rot7_256_avx512(v[6]);
2520+	v[7] = rot7_256_avx512(v[7]);
2521+
2522+	v[0] = add_256_avx512(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
2523+	v[1] = add_256_avx512(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
2524+	v[2] = add_256_avx512(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
2525+	v[3] = add_256_avx512(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
2526+	v[0] = add_256_avx512(v[0], v[5]);
2527+	v[1] = add_256_avx512(v[1], v[6]);
2528+	v[2] = add_256_avx512(v[2], v[7]);
2529+	v[3] = add_256_avx512(v[3], v[4]);
2530+	v[15] = xor_256_avx512(v[15], v[0]);
2531+	v[12] = xor_256_avx512(v[12], v[1]);
2532+	v[13] = xor_256_avx512(v[13], v[2]);
2533+	v[14] = xor_256_avx512(v[14], v[3]);
2534+	v[15] = rot16_256_avx512(v[15]);
2535+	v[12] = rot16_256_avx512(v[12]);
2536+	v[13] = rot16_256_avx512(v[13]);
2537+	v[14] = rot16_256_avx512(v[14]);
2538+	v[10] = add_256_avx512(v[10], v[15]);
2539+	v[11] = add_256_avx512(v[11], v[12]);
2540+	v[8] = add_256_avx512(v[8], v[13]);
2541+	v[9] = add_256_avx512(v[9], v[14]);
2542+	v[5] = xor_256_avx512(v[5], v[10]);
2543+	v[6] = xor_256_avx512(v[6], v[11]);
2544+	v[7] = xor_256_avx512(v[7], v[8]);
2545+	v[4] = xor_256_avx512(v[4], v[9]);
2546+	v[5] = rot12_256_avx512(v[5]);
2547+	v[6] = rot12_256_avx512(v[6]);
2548+	v[7] = rot12_256_avx512(v[7]);
2549+	v[4] = rot12_256_avx512(v[4]);
2550+	v[0] = add_256_avx512(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
2551+	v[1] = add_256_avx512(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
2552+	v[2] = add_256_avx512(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
2553+	v[3] = add_256_avx512(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
2554+	v[0] = add_256_avx512(v[0], v[5]);
2555+	v[1] = add_256_avx512(v[1], v[6]);
2556+	v[2] = add_256_avx512(v[2], v[7]);
2557+	v[3] = add_256_avx512(v[3], v[4]);
2558+	v[15] = xor_256_avx512(v[15], v[0]);
2559+	v[12] = xor_256_avx512(v[12], v[1]);
2560+	v[13] = xor_256_avx512(v[13], v[2]);
2561+	v[14] = xor_256_avx512(v[14], v[3]);
2562+	v[15] = rot8_256_avx512(v[15]);
2563+	v[12] = rot8_256_avx512(v[12]);
2564+	v[13] = rot8_256_avx512(v[13]);
2565+	v[14] = rot8_256_avx512(v[14]);
2566+	v[10] = add_256_avx512(v[10], v[15]);
2567+	v[11] = add_256_avx512(v[11], v[12]);
2568+	v[8] = add_256_avx512(v[8], v[13]);
2569+	v[9] = add_256_avx512(v[9], v[14]);
2570+	v[5] = xor_256_avx512(v[5], v[10]);
2571+	v[6] = xor_256_avx512(v[6], v[11]);
2572+	v[7] = xor_256_avx512(v[7], v[8]);
2573+	v[4] = xor_256_avx512(v[4], v[9]);
2574+	v[5] = rot7_256_avx512(v[5]);
2575+	v[6] = rot7_256_avx512(v[6]);
2576+	v[7] = rot7_256_avx512(v[7]);
2577+	v[4] = rot7_256_avx512(v[4]);
2578+}
2579+
2580+static inline void
2581+round_fn16_avx512(__m512i v[16], __m512i m[16], size_t r)
2582+{
2583+	v[0] = add_512_avx512(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
2584+	v[1] = add_512_avx512(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
2585+	v[2] = add_512_avx512(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
2586+	v[3] = add_512_avx512(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
2587+	v[0] = add_512_avx512(v[0], v[4]);
2588+	v[1] = add_512_avx512(v[1], v[5]);
2589+	v[2] = add_512_avx512(v[2], v[6]);
2590+	v[3] = add_512_avx512(v[3], v[7]);
2591+	v[12] = xor_512_avx512(v[12], v[0]);
2592+	v[13] = xor_512_avx512(v[13], v[1]);
2593+	v[14] = xor_512_avx512(v[14], v[2]);
2594+	v[15] = xor_512_avx512(v[15], v[3]);
2595+	v[12] = rot16_512_avx512(v[12]);
2596+	v[13] = rot16_512_avx512(v[13]);
2597+	v[14] = rot16_512_avx512(v[14]);
2598+	v[15] = rot16_512_avx512(v[15]);
2599+	v[8] = add_512_avx512(v[8], v[12]);
2600+	v[9] = add_512_avx512(v[9], v[13]);
2601+	v[10] = add_512_avx512(v[10], v[14]);
2602+	v[11] = add_512_avx512(v[11], v[15]);
2603+	v[4] = xor_512_avx512(v[4], v[8]);
2604+	v[5] = xor_512_avx512(v[5], v[9]);
2605+	v[6] = xor_512_avx512(v[6], v[10]);
2606+	v[7] = xor_512_avx512(v[7], v[11]);
2607+	v[4] = rot12_512_avx512(v[4]);
2608+	v[5] = rot12_512_avx512(v[5]);
2609+	v[6] = rot12_512_avx512(v[6]);
2610+	v[7] = rot12_512_avx512(v[7]);
2611+	v[0] = add_512_avx512(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
2612+	v[1] = add_512_avx512(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
2613+	v[2] = add_512_avx512(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
2614+	v[3] = add_512_avx512(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
2615+	v[0] = add_512_avx512(v[0], v[4]);
2616+	v[1] = add_512_avx512(v[1], v[5]);
2617+	v[2] = add_512_avx512(v[2], v[6]);
2618+	v[3] = add_512_avx512(v[3], v[7]);
2619+	v[12] = xor_512_avx512(v[12], v[0]);
2620+	v[13] = xor_512_avx512(v[13], v[1]);
2621+	v[14] = xor_512_avx512(v[14], v[2]);
2622+	v[15] = xor_512_avx512(v[15], v[3]);
2623+	v[12] = rot8_512_avx512(v[12]);
2624+	v[13] = rot8_512_avx512(v[13]);
2625+	v[14] = rot8_512_avx512(v[14]);
2626+	v[15] = rot8_512_avx512(v[15]);
2627+	v[8] = add_512_avx512(v[8], v[12]);
2628+	v[9] = add_512_avx512(v[9], v[13]);
2629+	v[10] = add_512_avx512(v[10], v[14]);
2630+	v[11] = add_512_avx512(v[11], v[15]);
2631+	v[4] = xor_512_avx512(v[4], v[8]);
2632+	v[5] = xor_512_avx512(v[5], v[9]);
2633+	v[6] = xor_512_avx512(v[6], v[10]);
2634+	v[7] = xor_512_avx512(v[7], v[11]);
2635+	v[4] = rot7_512_avx512(v[4]);
2636+	v[5] = rot7_512_avx512(v[5]);
2637+	v[6] = rot7_512_avx512(v[6]);
2638+	v[7] = rot7_512_avx512(v[7]);
2639+
2640+	v[0] = add_512_avx512(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
2641+	v[1] = add_512_avx512(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
2642+	v[2] = add_512_avx512(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
2643+	v[3] = add_512_avx512(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
2644+	v[0] = add_512_avx512(v[0], v[5]);
2645+	v[1] = add_512_avx512(v[1], v[6]);
2646+	v[2] = add_512_avx512(v[2], v[7]);
2647+	v[3] = add_512_avx512(v[3], v[4]);
2648+	v[15] = xor_512_avx512(v[15], v[0]);
2649+	v[12] = xor_512_avx512(v[12], v[1]);
2650+	v[13] = xor_512_avx512(v[13], v[2]);
2651+	v[14] = xor_512_avx512(v[14], v[3]);
2652+	v[15] = rot16_512_avx512(v[15]);
2653+	v[12] = rot16_512_avx512(v[12]);
2654+	v[13] = rot16_512_avx512(v[13]);
2655+	v[14] = rot16_512_avx512(v[14]);
2656+	v[10] = add_512_avx512(v[10], v[15]);
2657+	v[11] = add_512_avx512(v[11], v[12]);
2658+	v[8] = add_512_avx512(v[8], v[13]);
2659+	v[9] = add_512_avx512(v[9], v[14]);
2660+	v[5] = xor_512_avx512(v[5], v[10]);
2661+	v[6] = xor_512_avx512(v[6], v[11]);
2662+	v[7] = xor_512_avx512(v[7], v[8]);
2663+	v[4] = xor_512_avx512(v[4], v[9]);
2664+	v[5] = rot12_512_avx512(v[5]);
2665+	v[6] = rot12_512_avx512(v[6]);
2666+	v[7] = rot12_512_avx512(v[7]);
2667+	v[4] = rot12_512_avx512(v[4]);
2668+	v[0] = add_512_avx512(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
2669+	v[1] = add_512_avx512(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
2670+	v[2] = add_512_avx512(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
2671+	v[3] = add_512_avx512(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
2672+	v[0] = add_512_avx512(v[0], v[5]);
2673+	v[1] = add_512_avx512(v[1], v[6]);
2674+	v[2] = add_512_avx512(v[2], v[7]);
2675+	v[3] = add_512_avx512(v[3], v[4]);
2676+	v[15] = xor_512_avx512(v[15], v[0]);
2677+	v[12] = xor_512_avx512(v[12], v[1]);
2678+	v[13] = xor_512_avx512(v[13], v[2]);
2679+	v[14] = xor_512_avx512(v[14], v[3]);
2680+	v[15] = rot8_512_avx512(v[15]);
2681+	v[12] = rot8_512_avx512(v[12]);
2682+	v[13] = rot8_512_avx512(v[13]);
2683+	v[14] = rot8_512_avx512(v[14]);
2684+	v[10] = add_512_avx512(v[10], v[15]);
2685+	v[11] = add_512_avx512(v[11], v[12]);
2686+	v[8] = add_512_avx512(v[8], v[13]);
2687+	v[9] = add_512_avx512(v[9], v[14]);
2688+	v[5] = xor_512_avx512(v[5], v[10]);
2689+	v[6] = xor_512_avx512(v[6], v[11]);
2690+	v[7] = xor_512_avx512(v[7], v[8]);
2691+	v[4] = xor_512_avx512(v[4], v[9]);
2692+	v[5] = rot7_512_avx512(v[5]);
2693+	v[6] = rot7_512_avx512(v[6]);
2694+	v[7] = rot7_512_avx512(v[7]);
2695+	v[4] = rot7_512_avx512(v[4]);
2696+}
2697+
2698+#define LO_IMM8 0x88
2699+#define HI_IMM8 0xdd
2700+
2701+static inline __m512i
2702+unpack_lo_128_avx512(__m512i a, __m512i b)
2703+{
2704+	return _mm512_shuffle_i32x4(a, b, LO_IMM8);
2705+}
2706+
2707+static inline __m512i
2708+unpack_hi_128_avx512(__m512i a, __m512i b)
2709+{
2710+	return _mm512_shuffle_i32x4(a, b, HI_IMM8);
2711+}
2712+
2713+static inline void
2714+transpose_vecs_512_avx512(__m512i vecs[16])
2715+{
2716+	__m512i ab_0 = _mm512_unpacklo_epi32(vecs[0], vecs[1]);
2717+	__m512i ab_2 = _mm512_unpackhi_epi32(vecs[0], vecs[1]);
2718+	__m512i cd_0 = _mm512_unpacklo_epi32(vecs[2], vecs[3]);
2719+	__m512i cd_2 = _mm512_unpackhi_epi32(vecs[2], vecs[3]);
2720+	__m512i ef_0 = _mm512_unpacklo_epi32(vecs[4], vecs[5]);
2721+	__m512i ef_2 = _mm512_unpackhi_epi32(vecs[4], vecs[5]);
2722+	__m512i gh_0 = _mm512_unpacklo_epi32(vecs[6], vecs[7]);
2723+	__m512i gh_2 = _mm512_unpackhi_epi32(vecs[6], vecs[7]);
2724+	__m512i ij_0 = _mm512_unpacklo_epi32(vecs[8], vecs[9]);
2725+	__m512i ij_2 = _mm512_unpackhi_epi32(vecs[8], vecs[9]);
2726+	__m512i kl_0 = _mm512_unpacklo_epi32(vecs[10], vecs[11]);
2727+	__m512i kl_2 = _mm512_unpackhi_epi32(vecs[10], vecs[11]);
2728+	__m512i mn_0 = _mm512_unpacklo_epi32(vecs[12], vecs[13]);
2729+	__m512i mn_2 = _mm512_unpackhi_epi32(vecs[12], vecs[13]);
2730+	__m512i op_0 = _mm512_unpacklo_epi32(vecs[14], vecs[15]);
2731+	__m512i op_2 = _mm512_unpackhi_epi32(vecs[14], vecs[15]);
2732+
2733+	__m512i abcd_0 = _mm512_unpacklo_epi64(ab_0, cd_0);
2734+	__m512i abcd_1 = _mm512_unpackhi_epi64(ab_0, cd_0);
2735+	__m512i abcd_2 = _mm512_unpacklo_epi64(ab_2, cd_2);
2736+	__m512i abcd_3 = _mm512_unpackhi_epi64(ab_2, cd_2);
2737+	__m512i efgh_0 = _mm512_unpacklo_epi64(ef_0, gh_0);
2738+	__m512i efgh_1 = _mm512_unpackhi_epi64(ef_0, gh_0);
2739+	__m512i efgh_2 = _mm512_unpacklo_epi64(ef_2, gh_2);
2740+	__m512i efgh_3 = _mm512_unpackhi_epi64(ef_2, gh_2);
2741+	__m512i ijkl_0 = _mm512_unpacklo_epi64(ij_0, kl_0);
2742+	__m512i ijkl_1 = _mm512_unpackhi_epi64(ij_0, kl_0);
2743+	__m512i ijkl_2 = _mm512_unpacklo_epi64(ij_2, kl_2);
2744+	__m512i ijkl_3 = _mm512_unpackhi_epi64(ij_2, kl_2);
2745+	__m512i mnop_0 = _mm512_unpacklo_epi64(mn_0, op_0);
2746+	__m512i mnop_1 = _mm512_unpackhi_epi64(mn_0, op_0);
2747+	__m512i mnop_2 = _mm512_unpacklo_epi64(mn_2, op_2);
2748+	__m512i mnop_3 = _mm512_unpackhi_epi64(mn_2, op_2);
2749+
2750+	__m512i abcdefgh_0 = unpack_lo_128_avx512(abcd_0, efgh_0);
2751+	__m512i abcdefgh_1 = unpack_lo_128_avx512(abcd_1, efgh_1);
2752+	__m512i abcdefgh_2 = unpack_lo_128_avx512(abcd_2, efgh_2);
2753+	__m512i abcdefgh_3 = unpack_lo_128_avx512(abcd_3, efgh_3);
2754+	__m512i abcdefgh_4 = unpack_hi_128_avx512(abcd_0, efgh_0);
2755+	__m512i abcdefgh_5 = unpack_hi_128_avx512(abcd_1, efgh_1);
2756+	__m512i abcdefgh_6 = unpack_hi_128_avx512(abcd_2, efgh_2);
2757+	__m512i abcdefgh_7 = unpack_hi_128_avx512(abcd_3, efgh_3);
2758+	__m512i ijklmnop_0 = unpack_lo_128_avx512(ijkl_0, mnop_0);
2759+	__m512i ijklmnop_1 = unpack_lo_128_avx512(ijkl_1, mnop_1);
2760+	__m512i ijklmnop_2 = unpack_lo_128_avx512(ijkl_2, mnop_2);
2761+	__m512i ijklmnop_3 = unpack_lo_128_avx512(ijkl_3, mnop_3);
2762+	__m512i ijklmnop_4 = unpack_hi_128_avx512(ijkl_0, mnop_0);
2763+	__m512i ijklmnop_5 = unpack_hi_128_avx512(ijkl_1, mnop_1);
2764+	__m512i ijklmnop_6 = unpack_hi_128_avx512(ijkl_2, mnop_2);
2765+	__m512i ijklmnop_7 = unpack_hi_128_avx512(ijkl_3, mnop_3);
2766+
2767+	vecs[0] = unpack_lo_128_avx512(abcdefgh_0, ijklmnop_0);
2768+	vecs[1] = unpack_lo_128_avx512(abcdefgh_1, ijklmnop_1);
2769+	vecs[2] = unpack_lo_128_avx512(abcdefgh_2, ijklmnop_2);
2770+	vecs[3] = unpack_lo_128_avx512(abcdefgh_3, ijklmnop_3);
2771+	vecs[4] = unpack_lo_128_avx512(abcdefgh_4, ijklmnop_4);
2772+	vecs[5] = unpack_lo_128_avx512(abcdefgh_5, ijklmnop_5);
2773+	vecs[6] = unpack_lo_128_avx512(abcdefgh_6, ijklmnop_6);
2774+	vecs[7] = unpack_lo_128_avx512(abcdefgh_7, ijklmnop_7);
2775+	vecs[8] = unpack_hi_128_avx512(abcdefgh_0, ijklmnop_0);
2776+	vecs[9] = unpack_hi_128_avx512(abcdefgh_1, ijklmnop_1);
2777+	vecs[10] = unpack_hi_128_avx512(abcdefgh_2, ijklmnop_2);
2778+	vecs[11] = unpack_hi_128_avx512(abcdefgh_3, ijklmnop_3);
2779+	vecs[12] = unpack_hi_128_avx512(abcdefgh_4, ijklmnop_4);
2780+	vecs[13] = unpack_hi_128_avx512(abcdefgh_5, ijklmnop_5);
2781+	vecs[14] = unpack_hi_128_avx512(abcdefgh_6, ijklmnop_6);
2782+	vecs[15] = unpack_hi_128_avx512(abcdefgh_7, ijklmnop_7);
2783+}
2784+
2785+static inline void
2786+transpose_msg_vecs16_avx512(const uint8_t *const *inputs, size_t block_offset, __m512i out_msg[16])
2787+{
2788+	size_t i;
2789+
2790+	out_msg[0] = loadu_512_avx512(&inputs[0][block_offset]);
2791+	out_msg[1] = loadu_512_avx512(&inputs[1][block_offset]);
2792+	out_msg[2] = loadu_512_avx512(&inputs[2][block_offset]);
2793+	out_msg[3] = loadu_512_avx512(&inputs[3][block_offset]);
2794+	out_msg[4] = loadu_512_avx512(&inputs[4][block_offset]);
2795+	out_msg[5] = loadu_512_avx512(&inputs[5][block_offset]);
2796+	out_msg[6] = loadu_512_avx512(&inputs[6][block_offset]);
2797+	out_msg[7] = loadu_512_avx512(&inputs[7][block_offset]);
2798+	out_msg[8] = loadu_512_avx512(&inputs[8][block_offset]);
2799+	out_msg[9] = loadu_512_avx512(&inputs[9][block_offset]);
2800+	out_msg[10] = loadu_512_avx512(&inputs[10][block_offset]);
2801+	out_msg[11] = loadu_512_avx512(&inputs[11][block_offset]);
2802+	out_msg[12] = loadu_512_avx512(&inputs[12][block_offset]);
2803+	out_msg[13] = loadu_512_avx512(&inputs[13][block_offset]);
2804+	out_msg[14] = loadu_512_avx512(&inputs[14][block_offset]);
2805+	out_msg[15] = loadu_512_avx512(&inputs[15][block_offset]);
2806+
2807+	for (i = 0; i < 16; i++) {
2808+		_mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
2809+	}
2810+	transpose_vecs_512_avx512(out_msg);
2811+}
2812+
2813+static inline void
2814+load_counters16_avx512(uint64_t counter, int increment_counter, __m512i *out_lo, __m512i *out_hi)
2815+{
2816+	const __m512i mask = _mm512_set1_epi32(-increment_counter);
2817+	const __m512i deltas = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
2818+	const __m512i masked_deltas = _mm512_and_si512(deltas, mask);
2819+	const __m512i low_words = _mm512_add_epi32(_mm512_set1_epi32((int32_t)counter), masked_deltas);
2820+	const __m512i carries = _mm512_srli_epi32(_mm512_andnot_si512(low_words, _mm512_set1_epi32((int32_t)counter)), 31);
2821+	const __m512i high_words = _mm512_add_epi32(_mm512_set1_epi32((int32_t)(counter >> 32)), carries);
2822+
2823+	*out_lo = low_words;
2824+	*out_hi = high_words;
2825+}
2826+
2827+void
2828+blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out_bytes)
2829+{
2830+	__m512i h_vecs[8];
2831+	__m512i counter_low_vec, counter_high_vec;
2832+	uint8_t block_flags;
2833+	size_t block;
2834+
2835+	h_vecs[0] = set1_512_avx512(key[0]);
2836+	h_vecs[1] = set1_512_avx512(key[1]);
2837+	h_vecs[2] = set1_512_avx512(key[2]);
2838+	h_vecs[3] = set1_512_avx512(key[3]);
2839+	h_vecs[4] = set1_512_avx512(key[4]);
2840+	h_vecs[5] = set1_512_avx512(key[5]);
2841+	h_vecs[6] = set1_512_avx512(key[6]);
2842+	h_vecs[7] = set1_512_avx512(key[7]);
2843+
2844+	load_counters16_avx512(counter, increment_counter, &counter_low_vec, &counter_high_vec);
2845+	block_flags = flags | flags_start;
2846+
2847+	for (block = 0; block < blocks; block++) {
2848+		__m512i block_len_vec;
2849+		__m512i block_flags_vec;
2850+		__m512i msg_vecs[16];
2851+		__m512i v[16];
2852+
2853+		if (block + 1 == blocks) {
2854+			block_flags |= flags_end;
2855+		}
2856+		block_len_vec = set1_512_avx512(BLAKE3_BLOCK_LEN);
2857+		block_flags_vec = set1_512_avx512(block_flags);
2858+		transpose_msg_vecs16_avx512(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
2859+
2860+		v[0] = h_vecs[0];
2861+		v[1] = h_vecs[1];
2862+		v[2] = h_vecs[2];
2863+		v[3] = h_vecs[3];
2864+		v[4] = h_vecs[4];
2865+		v[5] = h_vecs[5];
2866+		v[6] = h_vecs[6];
2867+		v[7] = h_vecs[7];
2868+		v[8] = set1_512_avx512(IV[0]);
2869+		v[9] = set1_512_avx512(IV[1]);
2870+		v[10] = set1_512_avx512(IV[2]);
2871+		v[11] = set1_512_avx512(IV[3]);
2872+		v[12] = counter_low_vec;
2873+		v[13] = counter_high_vec;
2874+		v[14] = block_len_vec;
2875+		v[15] = block_flags_vec;
2876+
2877+		round_fn16_avx512(v, msg_vecs, 0);
2878+		round_fn16_avx512(v, msg_vecs, 1);
2879+		round_fn16_avx512(v, msg_vecs, 2);
2880+		round_fn16_avx512(v, msg_vecs, 3);
2881+		round_fn16_avx512(v, msg_vecs, 4);
2882+		round_fn16_avx512(v, msg_vecs, 5);
2883+		round_fn16_avx512(v, msg_vecs, 6);
2884+
2885+		h_vecs[0] = xor_512_avx512(v[0], v[8]);
2886+		h_vecs[1] = xor_512_avx512(v[1], v[9]);
2887+		h_vecs[2] = xor_512_avx512(v[2], v[10]);
2888+		h_vecs[3] = xor_512_avx512(v[3], v[11]);
2889+		h_vecs[4] = xor_512_avx512(v[4], v[12]);
2890+		h_vecs[5] = xor_512_avx512(v[5], v[13]);
2891+		h_vecs[6] = xor_512_avx512(v[6], v[14]);
2892+		h_vecs[7] = xor_512_avx512(v[7], v[15]);
2893+
2894+		block_flags = flags;
2895+	}
2896+
2897+	__m512i padded[16] = {
2898+		h_vecs[0],   h_vecs[1],   h_vecs[2],   h_vecs[3],
2899+		h_vecs[4],   h_vecs[5],   h_vecs[6],   h_vecs[7],
2900+		set1_512_avx512(0), set1_512_avx512(0), set1_512_avx512(0), set1_512_avx512(0),
2901+		set1_512_avx512(0), set1_512_avx512(0), set1_512_avx512(0), set1_512_avx512(0),
2902+	};
2903+	transpose_vecs_512_avx512(padded);
2904+	_mm256_mask_storeu_epi32(&out_bytes[0 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[0]));
2905+	_mm256_mask_storeu_epi32(&out_bytes[1 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[1]));
2906+	_mm256_mask_storeu_epi32(&out_bytes[2 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[2]));
2907+	_mm256_mask_storeu_epi32(&out_bytes[3 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[3]));
2908+	_mm256_mask_storeu_epi32(&out_bytes[4 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[4]));
2909+	_mm256_mask_storeu_epi32(&out_bytes[5 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[5]));
2910+	_mm256_mask_storeu_epi32(&out_bytes[6 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[6]));
2911+	_mm256_mask_storeu_epi32(&out_bytes[7 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[7]));
2912+	_mm256_mask_storeu_epi32(&out_bytes[8 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[8]));
2913+	_mm256_mask_storeu_epi32(&out_bytes[9 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[9]));
2914+	_mm256_mask_storeu_epi32(&out_bytes[10 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[10]));
2915+	_mm256_mask_storeu_epi32(&out_bytes[11 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[11]));
2916+	_mm256_mask_storeu_epi32(&out_bytes[12 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[12]));
2917+	_mm256_mask_storeu_epi32(&out_bytes[13 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[13]));
2918+	_mm256_mask_storeu_epi32(&out_bytes[14 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[14]));
2919+	_mm256_mask_storeu_epi32(&out_bytes[15 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[15]));
2920+}
2921+
2922+static inline void
2923+transpose_msg_vecs8_avx512(const uint8_t *const *inputs, size_t block_offset, __m256i out_msg[16])
2924+{
2925+	out_msg[0] = loadu_256_avx512(&inputs[0][block_offset]);
2926+	out_msg[1] = loadu_256_avx512(&inputs[1][block_offset]);
2927+	out_msg[2] = loadu_256_avx512(&inputs[2][block_offset]);
2928+	out_msg[3] = loadu_256_avx512(&inputs[3][block_offset]);
2929+	out_msg[4] = loadu_256_avx512(&inputs[4][block_offset]);
2930+	out_msg[5] = loadu_256_avx512(&inputs[5][block_offset]);
2931+	out_msg[6] = loadu_256_avx512(&inputs[6][block_offset]);
2932+	out_msg[7] = loadu_256_avx512(&inputs[7][block_offset]);
2933+	out_msg[8] = loadu_256_avx512(&inputs[0][block_offset + 32]);
2934+	out_msg[9] = loadu_256_avx512(&inputs[1][block_offset + 32]);
2935+	out_msg[10] = loadu_256_avx512(&inputs[2][block_offset + 32]);
2936+	out_msg[11] = loadu_256_avx512(&inputs[3][block_offset + 32]);
2937+	out_msg[12] = loadu_256_avx512(&inputs[4][block_offset + 32]);
2938+	out_msg[13] = loadu_256_avx512(&inputs[5][block_offset + 32]);
2939+	out_msg[14] = loadu_256_avx512(&inputs[6][block_offset + 32]);
2940+	out_msg[15] = loadu_256_avx512(&inputs[7][block_offset + 32]);
2941+
2942+	transpose_vecs_avx2(out_msg);
2943+}
2944+
2945+static inline void
2946+load_counters8_avx512(uint64_t counter, int increment_counter, __m256i *out_lo, __m256i *out_hi)
2947+{
2948+	const __m256i mask = _mm256_set1_epi32(-increment_counter);
2949+	const __m256i add0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
2950+	const __m256i add1 = _mm256_and_si256(mask, add0);
2951+	__m256i l = _mm256_add_epi32(_mm256_set1_epi32((int32_t)counter), add1);
2952+	__m256i carry = _mm256_srli_epi32(_mm256_andnot_si256(l, _mm256_set1_epi32((int32_t)counter)), 31);
2953+	__m256i h = _mm256_add_epi32(_mm256_set1_epi32((int32_t)(counter >> 32)), carry);
2954+
2955+	*out_lo = l;
2956+	*out_hi = h;
2957+}
2958+
2959+void
2960+blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out_bytes)
2961+{
2962+	__m256i h_vecs[8];
2963+	__m256i counter_low_vec, counter_high_vec;
2964+	uint8_t block_flags;
2965+	size_t block;
2966+
2967+	h_vecs[0] = set1_256_avx512(key[0]);
2968+	h_vecs[1] = set1_256_avx512(key[1]);
2969+	h_vecs[2] = set1_256_avx512(key[2]);
2970+	h_vecs[3] = set1_256_avx512(key[3]);
2971+	h_vecs[4] = set1_256_avx512(key[4]);
2972+	h_vecs[5] = set1_256_avx512(key[5]);
2973+	h_vecs[6] = set1_256_avx512(key[6]);
2974+	h_vecs[7] = set1_256_avx512(key[7]);
2975+
2976+	load_counters8_avx512(counter, increment_counter, &counter_low_vec, &counter_high_vec);
2977+	block_flags = flags | flags_start;
2978+
2979+	for (block = 0; block < blocks; block++) {
2980+		__m256i block_len_vec;
2981+		__m256i block_flags_vec;
2982+		__m256i msg_vecs[16];
2983+		__m256i v[16];
2984+
2985+		if (block + 1 == blocks) {
2986+			block_flags |= flags_end;
2987+		}
2988+		block_len_vec = set1_256_avx512(BLAKE3_BLOCK_LEN);
2989+		block_flags_vec = set1_256_avx512(block_flags);
2990+		transpose_msg_vecs8_avx512(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
2991+
2992+		v[0] = h_vecs[0];
2993+		v[1] = h_vecs[1];
2994+		v[2] = h_vecs[2];
2995+		v[3] = h_vecs[3];
2996+		v[4] = h_vecs[4];
2997+		v[5] = h_vecs[5];
2998+		v[6] = h_vecs[6];
2999+		v[7] = h_vecs[7];
3000+		v[8] = set1_256_avx512(IV[0]);
3001+		v[9] = set1_256_avx512(IV[1]);
3002+		v[10] = set1_256_avx512(IV[2]);
3003+		v[11] = set1_256_avx512(IV[3]);
3004+		v[12] = counter_low_vec;
3005+		v[13] = counter_high_vec;
3006+		v[14] = block_len_vec;
3007+		v[15] = block_flags_vec;
3008+
3009+		round_fn8_avx512(v, msg_vecs, 0);
3010+		round_fn8_avx512(v, msg_vecs, 1);
3011+		round_fn8_avx512(v, msg_vecs, 2);
3012+		round_fn8_avx512(v, msg_vecs, 3);
3013+		round_fn8_avx512(v, msg_vecs, 4);
3014+		round_fn8_avx512(v, msg_vecs, 5);
3015+		round_fn8_avx512(v, msg_vecs, 6);
3016+
3017+		h_vecs[0] = xor_256_avx512(v[0], v[8]);
3018+		h_vecs[1] = xor_256_avx512(v[1], v[9]);
3019+		h_vecs[2] = xor_256_avx512(v[2], v[10]);
3020+		h_vecs[3] = xor_256_avx512(v[3], v[11]);
3021+		h_vecs[4] = xor_256_avx512(v[4], v[12]);
3022+		h_vecs[5] = xor_256_avx512(v[5], v[13]);
3023+		h_vecs[6] = xor_256_avx512(v[6], v[14]);
3024+		h_vecs[7] = xor_256_avx512(v[7], v[15]);
3025+
3026+		block_flags = flags;
3027+	}
3028+
3029+	transpose_vecs_avx2(h_vecs);
3030+	storeu_256_avx512(h_vecs[0], &out_bytes[0 * sizeof(__m256i)]);
3031+	storeu_256_avx512(h_vecs[1], &out_bytes[1 * sizeof(__m256i)]);
3032+	storeu_256_avx512(h_vecs[2], &out_bytes[2 * sizeof(__m256i)]);
3033+	storeu_256_avx512(h_vecs[3], &out_bytes[3 * sizeof(__m256i)]);
3034+	storeu_256_avx512(h_vecs[4], &out_bytes[4 * sizeof(__m256i)]);
3035+	storeu_256_avx512(h_vecs[5], &out_bytes[5 * sizeof(__m256i)]);
3036+	storeu_256_avx512(h_vecs[6], &out_bytes[6 * sizeof(__m256i)]);
3037+	storeu_256_avx512(h_vecs[7], &out_bytes[7 * sizeof(__m256i)]);
3038+}
3039+
3040+static inline void
3041+transpose_msg_vecs4_avx512(const uint8_t *const *inputs, size_t block_offset, __m128i out_msg[16])
3042+{
3043+	out_msg[0] = loadu_128_avx512(&inputs[0][block_offset]);
3044+	out_msg[1] = loadu_128_avx512(&inputs[1][block_offset]);
3045+	out_msg[2] = loadu_128_avx512(&inputs[2][block_offset]);
3046+	out_msg[3] = loadu_128_avx512(&inputs[3][block_offset]);
3047+	out_msg[4] = loadu_128_avx512(&inputs[0][block_offset + 16]);
3048+	out_msg[5] = loadu_128_avx512(&inputs[1][block_offset + 16]);
3049+	out_msg[6] = loadu_128_avx512(&inputs[2][block_offset + 16]);
3050+	out_msg[7] = loadu_128_avx512(&inputs[3][block_offset + 16]);
3051+	out_msg[8] = loadu_128_avx512(&inputs[0][block_offset + 32]);
3052+	out_msg[9] = loadu_128_avx512(&inputs[1][block_offset + 32]);
3053+	out_msg[10] = loadu_128_avx512(&inputs[2][block_offset + 32]);
3054+	out_msg[11] = loadu_128_avx512(&inputs[3][block_offset + 32]);
3055+	out_msg[12] = loadu_128_avx512(&inputs[0][block_offset + 48]);
3056+	out_msg[13] = loadu_128_avx512(&inputs[1][block_offset + 48]);
3057+	out_msg[14] = loadu_128_avx512(&inputs[2][block_offset + 48]);
3058+	out_msg[15] = loadu_128_avx512(&inputs[3][block_offset + 48]);
3059+
3060+	transpose_vecs_sse2(out_msg);
3061+}
3062+
3063+static inline void
3064+load_counters4_avx512(uint64_t counter, int increment_counter, __m128i *out_lo, __m128i *out_hi)
3065+{
3066+	const __m128i mask = _mm_set1_epi32(-increment_counter);
3067+	const __m128i add0 = _mm_set_epi32(3, 2, 1, 0);
3068+	const __m128i add1 = _mm_and_si128(mask, add0);
3069+	__m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1);
3070+	__m128i carry = _mm_srli_epi32(_mm_andnot_si128(l, _mm_set1_epi32((int32_t)counter)), 31);
3071+	__m128i h = _mm_add_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry);
3072+
3073+	*out_lo = l;
3074+	*out_hi = h;
3075+}
3076+
3077+void
3078+blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out_bytes)
3079+{
3080+	__m128i h_vecs[8];
3081+	__m128i counter_low_vec, counter_high_vec;
3082+	uint8_t block_flags;
3083+	size_t block;
3084+
3085+	h_vecs[0] = set1_128_avx512(key[0]);
3086+	h_vecs[1] = set1_128_avx512(key[1]);
3087+	h_vecs[2] = set1_128_avx512(key[2]);
3088+	h_vecs[3] = set1_128_avx512(key[3]);
3089+	h_vecs[4] = set1_128_avx512(key[4]);
3090+	h_vecs[5] = set1_128_avx512(key[5]);
3091+	h_vecs[6] = set1_128_avx512(key[6]);
3092+	h_vecs[7] = set1_128_avx512(key[7]);
3093+
3094+	load_counters4_avx512(counter, increment_counter, &counter_low_vec, &counter_high_vec);
3095+	block_flags = flags | flags_start;
3096+
3097+	for (block = 0; block < blocks; block++) {
3098+		__m128i block_len_vec;
3099+		__m128i block_flags_vec;
3100+		__m128i msg_vecs[16];
3101+		__m128i v[16];
3102+
3103+		if (block + 1 == blocks) {
3104+			block_flags |= flags_end;
3105+		}
3106+		block_len_vec = set1_128_avx512(BLAKE3_BLOCK_LEN);
3107+		block_flags_vec = set1_128_avx512(block_flags);
3108+		transpose_msg_vecs4_avx512(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
3109+
3110+		v[0] = h_vecs[0];
3111+		v[1] = h_vecs[1];
3112+		v[2] = h_vecs[2];
3113+		v[3] = h_vecs[3];
3114+		v[4] = h_vecs[4];
3115+		v[5] = h_vecs[5];
3116+		v[6] = h_vecs[6];
3117+		v[7] = h_vecs[7];
3118+		v[8] = set1_128_avx512(IV[0]);
3119+		v[9] = set1_128_avx512(IV[1]);
3120+		v[10] = set1_128_avx512(IV[2]);
3121+		v[11] = set1_128_avx512(IV[3]);
3122+		v[12] = counter_low_vec;
3123+		v[13] = counter_high_vec;
3124+		v[14] = block_len_vec;
3125+		v[15] = block_flags_vec;
3126+
3127+		round_fn4_avx512(v, msg_vecs, 0);
3128+		round_fn4_avx512(v, msg_vecs, 1);
3129+		round_fn4_avx512(v, msg_vecs, 2);
3130+		round_fn4_avx512(v, msg_vecs, 3);
3131+		round_fn4_avx512(v, msg_vecs, 4);
3132+		round_fn4_avx512(v, msg_vecs, 5);
3133+		round_fn4_avx512(v, msg_vecs, 6);
3134+
3135+		h_vecs[0] = xor_128_avx512(v[0], v[8]);
3136+		h_vecs[1] = xor_128_avx512(v[1], v[9]);
3137+		h_vecs[2] = xor_128_avx512(v[2], v[10]);
3138+		h_vecs[3] = xor_128_avx512(v[3], v[11]);
3139+		h_vecs[4] = xor_128_avx512(v[4], v[12]);
3140+		h_vecs[5] = xor_128_avx512(v[5], v[13]);
3141+		h_vecs[6] = xor_128_avx512(v[6], v[14]);
3142+		h_vecs[7] = xor_128_avx512(v[7], v[15]);
3143+
3144+		block_flags = flags;
3145+	}
3146+
3147+	transpose_vecs_sse2(h_vecs);
3148+	storeu_128_avx512(h_vecs[0], &out_bytes[0 * sizeof(__m128i)]);
3149+	storeu_128_avx512(h_vecs[1], &out_bytes[2 * sizeof(__m128i)]);
3150+	storeu_128_avx512(h_vecs[2], &out_bytes[4 * sizeof(__m128i)]);
3151+	storeu_128_avx512(h_vecs[3], &out_bytes[6 * sizeof(__m128i)]);
3152+	storeu_128_avx512(h_vecs[4], &out_bytes[1 * sizeof(__m128i)]);
3153+	storeu_128_avx512(h_vecs[5], &out_bytes[3 * sizeof(__m128i)]);
3154+	storeu_128_avx512(h_vecs[6], &out_bytes[5 * sizeof(__m128i)]);
3155+	storeu_128_avx512(h_vecs[7], &out_bytes[7 * sizeof(__m128i)]);
3156+}
3157+
3158+static inline void
3159+hash_one_avx512(const uint8_t *input, size_t blocks, const uint32_t key[8], uint64_t counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t out_bytes[BLAKE3_OUT_LEN])
3160+{
3161+	uint32_t cv[8];
3162+	uint8_t block_flags;
3163+
3164+	memcpy(cv, key, BLAKE3_KEY_LEN);
3165+	block_flags = flags | flags_start;
3166+	while (blocks > 0) {
3167+		if (blocks == 1) {
3168+			block_flags |= flags_end;
3169+		}
3170+		blake3_compress_in_place_avx512(cv, input, BLAKE3_BLOCK_LEN, counter, block_flags);
3171+		input = &input[BLAKE3_BLOCK_LEN];
3172+		blocks -= 1;
3173+		block_flags = flags;
3174+	}
3175+	memcpy(out_bytes, cv, BLAKE3_OUT_LEN);
3176+}
3177+
3178+void
3179+blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out_bytes)
3180+{
3181+	while (num_inputs >= 16) {
3182+		blake3_hash16_avx512(inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out_bytes);
3183+		if (increment_counter) {
3184+			counter += 16;
3185+		}
3186+		inputs += 16;
3187+		num_inputs -= 16;
3188+		out_bytes = &out_bytes[16 * BLAKE3_OUT_LEN];
3189+	}
3190+	while (num_inputs >= 8) {
3191+		blake3_hash8_avx512(inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out_bytes);
3192+		if (increment_counter) {
3193+			counter += 8;
3194+		}
3195+		inputs += 8;
3196+		num_inputs -= 8;
3197+		out_bytes = &out_bytes[8 * BLAKE3_OUT_LEN];
3198+	}
3199+	while (num_inputs >= 4) {
3200+		blake3_hash4_avx512(inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out_bytes);
3201+		if (increment_counter) {
3202+			counter += 4;
3203+		}
3204+		inputs += 4;
3205+		num_inputs -= 4;
3206+		out_bytes = &out_bytes[4 * BLAKE3_OUT_LEN];
3207+	}
3208+	while (num_inputs > 0) {
3209+		hash_one_avx512(inputs[0], blocks, key, counter, flags, flags_start, flags_end, out_bytes);
3210+		if (increment_counter) {
3211+			counter += 1;
3212+		}
3213+		inputs += 1;
3214+		num_inputs -= 1;
3215+		out_bytes = &out_bytes[BLAKE3_OUT_LEN];
3216+	}
3217+}
3218+#pragma GCC pop_options
3219+#endif
3220+
3221+/* dispatch functions */
3222+void
3223+blake3_compress_in_place(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
3224+{
3225+#if defined(__x86_64__)
3226+	if (!blake3_cpu_detected)
3227+		blake3_detect_cpu_features();
3228+	if (blake3_cpu_features & AVX512) {
3229+		blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
3230+		return;
3231+	}
3232+	if (blake3_cpu_features & SSE41) {
3233+		blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
3234+		return;
3235+	}
3236+	if (blake3_cpu_features & SSE2) {
3237+		blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
3238+		return;
3239+	}
3240+#endif
3241+	blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
3242+}
3243+
3244+void
3245+blake3_compress_xof(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64])
3246+{
3247+#if defined(__x86_64__)
3248+	if (!blake3_cpu_detected)
3249+		blake3_detect_cpu_features();
3250+	if (blake3_cpu_features & AVX512) {
3251+		blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
3252+		return;
3253+	}
3254+	if (blake3_cpu_features & SSE41) {
3255+		blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
3256+		return;
3257+	}
3258+	if (blake3_cpu_features & SSE2) {
3259+		blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
3260+		return;
3261+	}
3262+#endif
3263+	blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
3264+}
3265+
3266+void
3267+blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, int increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out)
3268+{
3269+#if defined(__x86_64__)
3270+	if (!blake3_cpu_detected)
3271+		blake3_detect_cpu_features();
3272+	if (blake3_cpu_features & AVX512) {
3273+		blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out);
3274+		return;
3275+	}
3276+	if (blake3_cpu_features & AVX2) {
3277+		blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out);
3278+		return;
3279+	}
3280+	if (blake3_cpu_features & SSE41) {
3281+		blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out);
3282+		return;
3283+	}
3284+	if (blake3_cpu_features & SSE2) {
3285+		blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out);
3286+		return;
3287+	}
3288+#endif
3289+	blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out);
3290+}
3291+
3292+size_t
3293+blake3_simd_degree(void)
3294+{
3295+#if defined(__x86_64__)
3296+	if (!blake3_cpu_detected)
3297+		blake3_detect_cpu_features();
3298+	if (blake3_cpu_features & AVX512)
3299+		return 16;
3300+	if (blake3_cpu_features & AVX2)
3301+		return 8;
3302+	if (blake3_cpu_features & SSE41)
3303+		return 4;
3304+	if (blake3_cpu_features & SSE2)
3305+		return 4;
3306+#endif
3307+	return 1;
3308+}
3309+
3310+/* core hasher implementation */
3311+const char *
3312+blake3_version(void)
3313+{
3314+	return BLAKE3_VERSION_STRING;
3315+}
3316+
3317+static inline void
3318+chunk_state_init(struct Blake3ChunkState *self, const uint32_t key[8], uint8_t flags)
3319+{
3320+	memcpy(self->cv, key, BLAKE3_KEY_LEN);
3321+	self->chunk_counter = 0;
3322+	memset(self->buf, 0, BLAKE3_BLOCK_LEN);
3323+	self->buf_len = 0;
3324+	self->blocks_compressed = 0;
3325+	self->flags = flags;
3326+}
3327+
3328+static inline void
3329+chunk_state_reset(struct Blake3ChunkState *self, const uint32_t key[8], uint64_t chunk_counter)
3330+{
3331+	memcpy(self->cv, key, BLAKE3_KEY_LEN);
3332+	self->chunk_counter = chunk_counter;
3333+	self->blocks_compressed = 0;
3334+	memset(self->buf, 0, BLAKE3_BLOCK_LEN);
3335+	self->buf_len = 0;
3336+}
3337+
3338+static inline size_t
3339+chunk_state_len(const struct Blake3ChunkState *self)
3340+{
3341+	return (BLAKE3_BLOCK_LEN * (size_t)self->blocks_compressed) + ((size_t)self->buf_len);
3342+}
3343+
3344+static inline size_t
3345+chunk_state_fill_buf(struct Blake3ChunkState *self, const uint8_t *input, size_t input_len)
3346+{
3347+	size_t take = BLAKE3_BLOCK_LEN - ((size_t)self->buf_len);
3348+	uint8_t *dest;
3349+
3350+	if (take > input_len) {
3351+		take = input_len;
3352+	}
3353+	dest = self->buf + ((size_t)self->buf_len);
3354+	memcpy(dest, input, take);
3355+	self->buf_len += (uint8_t)take;
3356+	return take;
3357+}
3358+
3359+static inline uint8_t
3360+chunk_state_maybe_start_flag(const struct Blake3ChunkState *self)
3361+{
3362+	if (self->blocks_compressed == 0) {
3363+		return CHUNK_START;
3364+	} else {
3365+		return 0;
3366+	}
3367+}
3368+
3369+static inline struct Output
3370+make_output(const uint32_t input_cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
3371+{
3372+	struct Output ret;
3373+
3374+	memcpy(ret.input_cv, input_cv, 32);
3375+	memcpy(ret.block, block, BLAKE3_BLOCK_LEN);
3376+	ret.block_len = block_len;
3377+	ret.counter = counter;
3378+	ret.flags = flags;
3379+	return ret;
3380+}
3381+
3382+static inline void
3383+output_chaining_value(const struct Output *self, uint8_t cv[32])
3384+{
3385+	uint32_t cv_words[8];
3386+
3387+	memcpy(cv_words, self->input_cv, 32);
3388+	blake3_compress_in_place(cv_words, self->block, self->block_len, self->counter, self->flags);
3389+	store_cv_words(cv, cv_words);
3390+}
3391+
3392+static inline void
3393+output_root_bytes(const struct Output *self, uint64_t seek, uint8_t *out, size_t out_len)
3394+{
3395+	uint64_t output_block_counter = seek / 64;
3396+	size_t offset_within_block = seek % 64;
3397+	uint8_t wide_buf[64];
3398+	size_t available_bytes;
3399+	size_t memcpy_len;
3400+
3401+	while (out_len > 0) {
3402+		blake3_compress_xof(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, wide_buf);
3403+		available_bytes = 64 - offset_within_block;
3404+		if (out_len > available_bytes) {
3405+			memcpy_len = available_bytes;
3406+		} else {
3407+			memcpy_len = out_len;
3408+		}
3409+		memcpy(out, wide_buf + offset_within_block, memcpy_len);
3410+		out += memcpy_len;
3411+		out_len -= memcpy_len;
3412+		output_block_counter += 1;
3413+		offset_within_block = 0;
3414+	}
3415+}
3416+
3417+static inline void
3418+chunk_state_update(struct Blake3ChunkState *self, const uint8_t *input, size_t input_len)
3419+{
3420+	size_t take;
3421+
3422+	if (self->buf_len > 0) {
3423+		take = chunk_state_fill_buf(self, input, input_len);
3424+		input += take;
3425+		input_len -= take;
3426+		if (input_len > 0) {
3427+			blake3_compress_in_place(self->cv, self->buf, BLAKE3_BLOCK_LEN, self->chunk_counter, self->flags | chunk_state_maybe_start_flag(self));
3428+			self->blocks_compressed += 1;
3429+			self->buf_len = 0;
3430+			memset(self->buf, 0, BLAKE3_BLOCK_LEN);
3431+		}
3432+	}
3433+
3434+	while (input_len > BLAKE3_BLOCK_LEN) {
3435+		blake3_compress_in_place(self->cv, input, BLAKE3_BLOCK_LEN, self->chunk_counter, self->flags | chunk_state_maybe_start_flag(self));
3436+		self->blocks_compressed += 1;
3437+		input += BLAKE3_BLOCK_LEN;
3438+		input_len -= BLAKE3_BLOCK_LEN;
3439+	}
3440+
3441+	take = chunk_state_fill_buf(self, input, input_len);
3442+	input += take;
3443+	input_len -= take;
3444+}
3445+
3446+static inline struct Output
3447+chunk_state_output(const struct Blake3ChunkState *self)
3448+{
3449+	uint8_t block_flags = self->flags | chunk_state_maybe_start_flag(self) | CHUNK_END;
3450+
3451+	return make_output(self->cv, self->buf, self->buf_len, self->chunk_counter, block_flags);
3452+}
3453+
3454+static inline struct Output
3455+parent_output(const uint8_t block[BLAKE3_BLOCK_LEN], const uint32_t key[8], uint8_t flags)
3456+{
3457+	return make_output(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT);
3458+}
3459+
3460+static unsigned int
3461+highest_one(uint64_t x)
3462+{
3463+#if defined(__GNUC__) || defined(__clang__)
3464+	return 63 ^ __builtin_clzll(x);
3465+#else
3466+	unsigned int c = 0;
3467+	if (x & 0xffffffff00000000ULL) { x >>= 32; c += 32; }
3468+	if (x & 0x00000000ffff0000ULL) { x >>= 16; c += 16; }
3469+	if (x & 0x000000000000ff00ULL) { x >>=  8; c +=  8; }
3470+	if (x & 0x00000000000000f0ULL) { x >>=  4; c +=  4; }
3471+	if (x & 0x000000000000000cULL) { x >>=  2; c +=  2; }
3472+	if (x & 0x0000000000000002ULL) {           c +=  1; }
3473+	return c;
3474+#endif
3475+}
3476+
3477+static inline uint64_t
3478+round_down_to_power_of_2(uint64_t x)
3479+{
3480+	return 1ULL << highest_one(x | 1);
3481+}
3482+
3483+static inline size_t
3484+left_len(size_t content_len)
3485+{
3486+	size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN;
3487+
3488+	return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN;
3489+}
3490+
3491+static inline size_t
3492+compress_chunks_parallel(const uint8_t *input, size_t input_len, const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t *out)
3493+{
3494+	const uint8_t *chunks_array[MAX_SIMD_DEGREE];
3495+	size_t input_position = 0;
3496+	size_t chunks_array_len = 0;
3497+
3498+	assert(0 < input_len);
3499+	assert(input_len <= MAX_SIMD_DEGREE * BLAKE3_CHUNK_LEN);
3500+
3501+	while (input_len - input_position >= BLAKE3_CHUNK_LEN) {
3502+		chunks_array[chunks_array_len] = &input[input_position];
3503+		input_position += BLAKE3_CHUNK_LEN;
3504+		chunks_array_len += 1;
3505+	}
3506+
3507+	blake3_hash_many(chunks_array, chunks_array_len, BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key, chunk_counter, 1, flags, CHUNK_START, CHUNK_END, out);
3508+
3509+	if (input_len > input_position) {
3510+		uint64_t counter = chunk_counter + (uint64_t)chunks_array_len;
3511+		struct Blake3ChunkState chunk_state;
3512+		struct Output output;
3513+
3514+		chunk_state_init(&chunk_state, key, flags);
3515+		chunk_state.chunk_counter = counter;
3516+		chunk_state_update(&chunk_state, &input[input_position], input_len - input_position);
3517+		output = chunk_state_output(&chunk_state);
3518+		output_chaining_value(&output, &out[chunks_array_len * BLAKE3_OUT_LEN]);
3519+		return chunks_array_len + 1;
3520+	} else {
3521+		return chunks_array_len;
3522+	}
3523+}
3524+
3525+static inline size_t
3526+compress_parents_parallel(const uint8_t *child_chaining_values, size_t num_chaining_values, const uint32_t key[8], uint8_t flags, uint8_t *out)
3527+{
3528+	const uint8_t *parents_array[MAX_SIMD_DEGREE_OR_2];
3529+	size_t parents_array_len = 0;
3530+
3531+	assert(2 <= num_chaining_values);
3532+	assert(num_chaining_values <= 2 * MAX_SIMD_DEGREE_OR_2);
3533+
3534+	while (num_chaining_values - (2 * parents_array_len) >= 2) {
3535+		parents_array[parents_array_len] = &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN];
3536+		parents_array_len += 1;
3537+	}
3538+
3539+	blake3_hash_many(parents_array, parents_array_len, 1, key, 0, 0, flags | PARENT, 0, 0, out);
3540+
3541+	if (num_chaining_values > 2 * parents_array_len) {
3542+		memcpy(&out[parents_array_len * BLAKE3_OUT_LEN], &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN], BLAKE3_OUT_LEN);
3543+		return parents_array_len + 1;
3544+	} else {
3545+		return parents_array_len;
3546+	}
3547+}
3548+
3549+static inline size_t
3550+blake3_compress_subtree_wide(const uint8_t *input, size_t input_len, const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t *out)
3551+{
3552+	size_t degree;
3553+
3554+	if (input_len <= (size_t)blake3_simd_degree() * BLAKE3_CHUNK_LEN) {
3555+		return compress_chunks_parallel(input, input_len, key, chunk_counter, flags, out);
3556+	}
3557+
3558+	degree = blake3_simd_degree();
3559+	if (degree > 1) {
3560+		size_t child_len = round_down_to_power_of_2(input_len - 1) / degree;
3561+		size_t cvs_written = 0;
3562+
3563+		if (child_len < BLAKE3_CHUNK_LEN) {
3564+			child_len = BLAKE3_CHUNK_LEN;
3565+		}
3566+
3567+		while (input_len > 0) {
3568+			size_t take = child_len;
3569+			size_t sub_cvs;
3570+
3571+			if (take > input_len) {
3572+				take = input_len;
3573+			}
3574+			sub_cvs = blake3_compress_subtree_wide(input, take, key, chunk_counter, flags, &out[cvs_written * BLAKE3_OUT_LEN]);
3575+			cvs_written += sub_cvs;
3576+			chunk_counter += take / BLAKE3_CHUNK_LEN;
3577+			input += take;
3578+			input_len -= take;
3579+		}
3580+
3581+		while (cvs_written > 2) {
3582+			cvs_written = compress_parents_parallel(out, cvs_written, key, flags, out);
3583+		}
3584+		return cvs_written;
3585+	} else {
3586+		/* fallback when simd degree is 1 */
3587+		size_t left = left_len(input_len);
3588+		size_t right = input_len - left;
3589+		size_t left_cvs = blake3_compress_subtree_wide(input, left, key, chunk_counter, flags, out);
3590+		size_t right_cvs = blake3_compress_subtree_wide(input + left, right, key, chunk_counter + (left / BLAKE3_CHUNK_LEN), flags, &out[left_cvs * BLAKE3_OUT_LEN]);
3591+
3592+		return left_cvs + right_cvs;
3593+	}
3594+}
3595+
3596+static void
3597+compress_subtree_to_parent_node(const uint8_t *input, size_t input_len, const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN])
3598+{
3599+	uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
3600+	size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key, chunk_counter, flags, cv_array);
3601+
3602+	assert(num_cvs >= 2);
3603+	while (num_cvs > 2) {
3604+		num_cvs = compress_parents_parallel(cv_array, num_cvs, key, flags, cv_array);
3605+	}
3606+	memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
3607+}
3608+
3609+static inline void
3610+hasher_init_base(struct Blake3Hasher *self, const uint32_t key[8], uint8_t flags)
3611+{
3612+	memcpy(self->key, key, BLAKE3_KEY_LEN);
3613+	chunk_state_init(&self->chunk, key, flags);
3614+	self->cv_stack_len = 0;
3615+}
3616+
3617+void
3618+blake3_hasher_init(struct Blake3Hasher *self)
3619+{
3620+	if (!blake3_cpu_detected)
3621+		blake3_detect_cpu_features();
3622+	hasher_init_base(self, IV, 0);
3623+}
3624+
3625+static inline void
3626+load_key_words(const uint8_t key[BLAKE3_KEY_LEN], uint32_t key_words[8])
3627+{
3628+	key_words[0] = load32(&key[0 * 4]);
3629+	key_words[1] = load32(&key[1 * 4]);
3630+	key_words[2] = load32(&key[2 * 4]);
3631+	key_words[3] = load32(&key[3 * 4]);
3632+	key_words[4] = load32(&key[4 * 4]);
3633+	key_words[5] = load32(&key[5 * 4]);
3634+	key_words[6] = load32(&key[6 * 4]);
3635+	key_words[7] = load32(&key[7 * 4]);
3636+}
3637+
3638+void
3639+blake3_hasher_init_keyed(struct Blake3Hasher *self, const uint8_t key[BLAKE3_KEY_LEN])
3640+{
3641+	uint32_t key_words[8];
3642+
3643+	load_key_words(key, key_words);
3644+	hasher_init_base(self, key_words, KEYED_HASH);
3645+}
3646+
3647+void
3648+blake3_hasher_init_derive_key_raw(struct Blake3Hasher *self, const void *context, size_t context_len)
3649+{
3650+	struct Blake3Hasher context_hasher;
3651+	uint8_t context_key[BLAKE3_KEY_LEN];
3652+	uint32_t context_key_words[8];
3653+
3654+	hasher_init_base(&context_hasher, IV, DERIVE_KEY_CONTEXT);
3655+	blake3_hasher_update(&context_hasher, context, context_len);
3656+	blake3_hasher_finalize(&context_hasher, context_key, BLAKE3_KEY_LEN);
3657+	load_key_words(context_key, context_key_words);
3658+	hasher_init_base(self, context_key_words, DERIVE_KEY_MATERIAL);
3659+}
3660+
3661+static inline unsigned int
3662+popcnt(uint64_t x)
3663+{
3664+#if defined(__GNUC__) || defined(__clang__)
3665+	return __builtin_popcountll(x);
3666+#else
3667+	unsigned int count = 0;
3668+	while (x != 0) {
3669+		count += 1;
3670+		x &= x - 1;
3671+	}
3672+	return count;
3673+#endif
3674+}
3675+
3676+void
3677+blake3_hasher_init_derive_key(struct Blake3Hasher *self, const char *context)
3678+{
3679+	blake3_hasher_init_derive_key_raw(self, context, strlen(context));
3680+}
3681+
3682+static inline void
3683+hasher_merge_cv_stack(struct Blake3Hasher *self, uint64_t total_len)
3684+{
3685+	size_t post_merge_stack_len = (size_t)popcnt(total_len);
3686+
3687+	while (self->cv_stack_len > post_merge_stack_len) {
3688+		uint8_t *parent_node = &self->cv_stack[(self->cv_stack_len - 2) * BLAKE3_OUT_LEN];
3689+		struct Output output = parent_output(parent_node, self->key, self->chunk.flags);
3690+
3691+		output_chaining_value(&output, parent_node);
3692+		self->cv_stack_len -= 1;
3693+	}
3694+}
3695+
3696+static inline void
3697+hasher_push_cv(struct Blake3Hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN], uint64_t chunk_counter)
3698+{
3699+	hasher_merge_cv_stack(self, chunk_counter);
3700+	memcpy(&self->cv_stack[self->cv_stack_len * BLAKE3_OUT_LEN], new_cv, BLAKE3_OUT_LEN);
3701+	self->cv_stack_len += 1;
3702+}
3703+
3704+void
3705+blake3_hasher_update(struct Blake3Hasher *self, const void *input, size_t input_len)
3706+{
3707+	const uint8_t *input_bytes;
3708+
3709+	if (input_len == 0) {
3710+		return;
3711+	}
3712+
3713+	input_bytes = (const uint8_t *)input;
3714+
3715+	if (chunk_state_len(&self->chunk) > 0) {
3716+		size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&self->chunk);
3717+
3718+		if (take > input_len) {
3719+			take = input_len;
3720+		}
3721+		chunk_state_update(&self->chunk, input_bytes, take);
3722+		input_bytes += take;
3723+		input_len -= take;
3724+		if (input_len > 0) {
3725+			struct Output output = chunk_state_output(&self->chunk);
3726+			uint8_t chunk_cv[32];
3727+
3728+			output_chaining_value(&output, chunk_cv);
3729+			hasher_push_cv(self, chunk_cv, self->chunk.chunk_counter);
3730+			chunk_state_reset(&self->chunk, self->key, self->chunk.chunk_counter + 1);
3731+		} else {
3732+			return;
3733+		}
3734+	}
3735+
3736+	while (input_len > BLAKE3_CHUNK_LEN) {
3737+		size_t subtree_len = round_down_to_power_of_2(input_len);
3738+		uint64_t count_so_far = self->chunk.chunk_counter * BLAKE3_CHUNK_LEN;
3739+		uint64_t subtree_chunks;
3740+
3741+		while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) {
3742+			subtree_len /= 2;
3743+		}
3744+		subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN;
3745+		if (subtree_len <= BLAKE3_CHUNK_LEN) {
3746+			struct Blake3ChunkState chunk_state;
3747+			struct Output output;
3748+			uint8_t cv[BLAKE3_OUT_LEN];
3749+
3750+			chunk_state_init(&chunk_state, self->key, self->chunk.flags);
3751+			chunk_state.chunk_counter = self->chunk.chunk_counter;
3752+			chunk_state_update(&chunk_state, input_bytes, subtree_len);
3753+			output = chunk_state_output(&chunk_state);
3754+			output_chaining_value(&output, cv);
3755+			hasher_push_cv(self, cv, chunk_state.chunk_counter);
3756+		} else {
3757+			uint8_t cv_pair[2 * BLAKE3_OUT_LEN];
3758+
3759+			compress_subtree_to_parent_node(input_bytes, subtree_len, self->key, self->chunk.chunk_counter, self->chunk.flags, cv_pair);
3760+			hasher_push_cv(self, cv_pair, self->chunk.chunk_counter);
3761+			hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN], self->chunk.chunk_counter + (subtree_chunks / 2));
3762+		}
3763+		self->chunk.chunk_counter += subtree_chunks;
3764+		input_bytes += subtree_len;
3765+		input_len -= subtree_len;
3766+	}
3767+
3768+	if (input_len > 0) {
3769+		chunk_state_update(&self->chunk, input_bytes, input_len);
3770+		hasher_merge_cv_stack(self, self->chunk.chunk_counter);
3771+	}
3772+}
3773+
3774+void
3775+blake3_hasher_finalize_seek(const struct Blake3Hasher *self, uint64_t seek, uint8_t *out, size_t out_len)
3776+{
3777+	struct Output output;
3778+	size_t cvs_remaining;
3779+
3780+	if (out_len == 0) {
3781+		return;
3782+	}
3783+
3784+	if (self->cv_stack_len == 0) {
3785+		output = chunk_state_output(&self->chunk);
3786+		output_root_bytes(&output, seek, out, out_len);
3787+		return;
3788+	}
3789+
3790+	if (chunk_state_len(&self->chunk) > 0) {
3791+		cvs_remaining = self->cv_stack_len;
3792+		output = chunk_state_output(&self->chunk);
3793+	} else {
3794+		cvs_remaining = self->cv_stack_len - 2;
3795+		output = parent_output(&self->cv_stack[cvs_remaining * 32], self->key, self->chunk.flags);
3796+	}
3797+
3798+	while (cvs_remaining > 0) {
3799+		uint8_t parent_block[BLAKE3_BLOCK_LEN];
3800+
3801+		cvs_remaining -= 1;
3802+		memcpy(parent_block, &self->cv_stack[cvs_remaining * 32], 32);
3803+		output_chaining_value(&output, &parent_block[32]);
3804+		output = parent_output(parent_block, self->key, self->chunk.flags);
3805+	}
3806+	output_root_bytes(&output, seek, out, out_len);
3807+}
3808+
3809+void
3810+blake3_hasher_finalize(const struct Blake3Hasher *self, uint8_t *out, size_t out_len)
3811+{
3812+	blake3_hasher_finalize_seek(self, 0, out, out_len);
3813+}
3814+
3815+void
3816+blake3_hasher_reset(struct Blake3Hasher *self)
3817+{
3818+	chunk_state_reset(&self->chunk, self->key, 0);
3819+	self->cv_stack_len = 0;
3820+}
3821+
3822+/* utility implementation */
3823+static unsigned char *out;
3824+static size_t outlen = BLAKE3_OUT_LEN;
3825+
3826+static void
3827+usage(void)
3828+{
3829+	fprintf(stderr, "usage: %s [-bct] [-l length] [file ...]\n", argv0);
3830+	exit(1);
3831+}
3832+
3833+static int
3834+sumfile(const char *name, FILE *file, unsigned char *out_buf, size_t out_len)
3835+{
3836+	char buf[16384];
3837+	struct Blake3Hasher ctx;
3838+	size_t len;
3839+
3840+	blake3_hasher_init(&ctx);
3841+	do {
3842+		len = fread(buf, 1, sizeof(buf), file);
3843+		if (len > 0)
3844+			blake3_hasher_update(&ctx, buf, len);
3845+	} while (len == sizeof(buf));
3846+
3847+	if (ferror(file)) {
3848+		fprintf(stderr, "%s: read %s: ", argv0, name);
3849+		perror(NULL);
3850+		return 1;
3851+	}
3852+	blake3_hasher_finalize(&ctx, out_buf, out_len);
3853+	return 0;
3854+}
3855+
3856+static int
3857+sum(const char *name, FILE *file)
3858+{
3859+	size_t i;
3860+
3861+	if (sumfile(name, file, out, outlen) != 0)
3862+		return 1;
3863+	for (i = 0; i < outlen; i++)
3864+		printf("%02x", out[i]);
3865+	printf("  %s\n", name);
3866+	return 0;
3867+}
3868+
3869+static int
3870+hexval(int c)
3871+{
3872+	if ('0' <= c && c <= '9')
3873+		return c - '0';
3874+	if ('a' <= c && c <= 'f')
3875+		return c - 'a' + 10;
3876+	if ('A' <= c && c <= 'F')
3877+		return c - 'A' + 10;
3878+	return -1;
3879+}
3880+
3881+static int
3882+checkfile(const char *name, const char *mode, const char *str, unsigned char *out_buf, size_t len)
3883+{
3884+	FILE *file;
3885+	int c1, c2;
3886+	size_t i;
3887+
3888+	file = fopen(name, mode);
3889+	if (!file) {
3890+		fprintf(stderr, "%s: open %s: ", argv0, name);
3891+		perror(NULL);
3892+		return 1;
3893+	}
3894+	sumfile(name, file, out_buf, len);
3895+	fclose(file);
3896+
3897+	for (i = 0; i < len; i++) {
3898+		c1 = hexval(str[i * 2]);
3899+		c2 = hexval(str[i * 2 + 1]);
3900+		if (c1 == -1 || c2 == -1) {
3901+			fprintf(stderr, "%s: skipping invalid checksum line\n", argv0);
3902+			return 1;
3903+		}
3904+		if (out_buf[i] != (c1 << 4 | c2)) {
3905+			printf("%s: FAILED\n", name);
3906+			return 1;
3907+		}
3908+	}
3909+	printf("%s: OK\n", name);
3910+	return 0;
3911+}
3912+
3913+static int
3914+check(const char *name, FILE *file)
3915+{
3916+	const char *mode;
3917+	char buf[8192], *pos, *end;
3918+	size_t len;
3919+	int ret = 0, skip = 0;
3920+
3921+	buf[sizeof(buf) - 2] = 0;
3922+	while (fgets(buf, sizeof(buf), file)) {
3923+		if (buf[sizeof(buf) - 2]) {
3924+			fprintf(stderr, "%s: skipping line that is too long\n", argv0);
3925+			buf[sizeof(buf) - 2] = 0;
3926+			skip = 1;
3927+			ret = 1;
3928+			continue;
3929+		}
3930+		if (skip) {
3931+			skip = 0;
3932+			continue;
3933+		}
3934+		pos = strchr(buf, ' ');
3935+		if (!pos || pos == buf || (pos[1] != ' ' && pos[1] != '*') || (pos - buf) & 1) {
3936+			fprintf(stderr, "%s: skipping invalid checksum line\n", argv0);
3937+			ret = 1;
3938+			continue;
3939+		}
3940+		mode = pos[1] == ' ' ? "r" : "rb";
3941+		len = (pos - buf) / 2;
3942+		if (len > outlen) {
3943+			outlen = len;
3944+			free(out);
3945+			out = malloc(len);
3946+			if (!out) {
3947+				perror(argv0);
3948+				return 1;
3949+			}
3950+		}
3951+		*pos = '\0';
3952+		pos += 2;
3953+		end = strchr(pos, '\n');
3954+		if (end)
3955+			*end = '\0';
3956+		ret |= checkfile(pos, mode, buf, out, len);
3957+	}
3958+	if (ferror(file)) {
3959+		fprintf(stderr, "%s: read %s: ", argv0, name);
3960+		perror(NULL);
3961+		ret = 1;
3962+	}
3963+	return ret;
3964+}
3965+
3966+int
3967+main(int argc, char *argv[])
3968+{
3969+	int (*func)(const char *, FILE *) = sum;
3970+	FILE *file;
3971+	char *end;
3972+	const char *name, *mode = NULL;
3973+	int ret = 0;
3974+
3975+	ARGBEGIN {
3976+	case 'b':
3977+		mode = "rb";
3978+		break;
3979+	case 'c':
3980+		func = check;
3981+		break;
3982+	case 'l':
3983+		outlen = strtoul(EARGF(usage()), &end, 10);
3984+		if (*end)
3985+			usage();
3986+		break;
3987+	case 't':
3988+		mode = "r";
3989+		break;
3990+	default:
3991+		usage();
3992+	} ARGEND
3993+
3994+	out = malloc(outlen);
3995+	if (!out) {
3996+		perror(NULL);
3997+		return 1;
3998+	}
3999+
4000+	if (argc == 0) {
4001+		if (!mode || strcmp(mode, "r") == 0 || freopen(NULL, mode, stdin)) {
4002+			ret |= func("<stdin>", stdin);
4003+		} else {
4004+			fprintf(stderr, "%s: reopen stdin: ", argv0);
4005+			perror(NULL);
4006+			ret = 1;
4007+		}
4008+	} else {
4009+		if (!mode)
4010+			mode = "r";
4011+		for (; argc > 0; argc--, argv++) {
4012+			name = *argv;
4013+			file = fopen(name, mode);
4014+			if (file) {
4015+				ret |= func(name, file);
4016+				fclose(file);
4017+			} else {
4018+				fprintf(stderr, "%s: open %s: ", argv0, name);
4019+				perror(NULL);
4020+				ret = 1;
4021+			}
4022+		}
4023+	}
4024+
4025+	free(out);
4026+	if (fshut(stdin, "<stdin>") | fshut(stdout, "<stdout>"))
4027+		ret = 1;
4028+
4029+	return ret;
4030+}
+233, -10
  1@@ -58,7 +58,27 @@ static int first = 1;
  2 static char sort = 0;
  3 static int showdirs;
  4 
  5+static int Cflag   = 0;
  6+static int one_flag = 0;
  7+static int termwidth = 80;
  8+
  9+#if FEATURE_LS_COLOR
 10+#define COLOR_DIR	"\033[1;34m"
 11+#define COLOR_LNK	"\033[1;36m"
 12+#define COLOR_FIFO	"\033[33m"
 13+#define COLOR_SOCK	"\033[1;35m"
 14+#define COLOR_DEV	"\033[1;33m"
 15+#define COLOR_EXE	"\033[1;32m"
 16+#define COLOR_RST	"\033[0m"
 17+
 18+enum { COLOR_NEVER, COLOR_ALWAYS, COLOR_AUTO };
 19+static int color_mode = COLOR_NEVER;
 20+#endif
 21+
 22 static void ls(const char *, const struct entry *, int);
 23+static void printname_colored(const char *, mode_t);
 24+static void printcols(const struct entry *, size_t);
 25+static void output(const struct entry *);
 26 
 27 static void
 28 mkent(struct entry *ent, char *path, int dostat, int follow)
 29@@ -132,6 +152,155 @@ printname(const char *name)
 30 	}
 31 }
 32 
 33+#if FEATURE_LS_COLOR
 34+static int
 35+should_color(void)
 36+{
 37+	if (color_mode == COLOR_ALWAYS)
 38+		return 1;
 39+	if (color_mode == COLOR_AUTO)
 40+		return isatty(STDOUT_FILENO);
 41+	return 0;
 42+}
 43+#endif
 44+
 45+static void
 46+printname_colored(const char *name, mode_t mode)
 47+{
 48+#if FEATURE_LS_COLOR
 49+	int need_reset = 0;
 50+
 51+	if (should_color()) {
 52+		if (S_ISDIR(mode)) {
 53+			fputs(COLOR_DIR, stdout);
 54+			need_reset = 1;
 55+		} else if (S_ISLNK(mode)) {
 56+			fputs(COLOR_LNK, stdout);
 57+			need_reset = 1;
 58+		} else if (S_ISFIFO(mode)) {
 59+			fputs(COLOR_FIFO, stdout);
 60+			need_reset = 1;
 61+		} else if (S_ISSOCK(mode)) {
 62+			fputs(COLOR_SOCK, stdout);
 63+			need_reset = 1;
 64+		} else if (S_ISBLK(mode) || S_ISCHR(mode)) {
 65+			fputs(COLOR_DEV, stdout);
 66+			need_reset = 1;
 67+		} else if (S_ISREG(mode) && (mode & (S_IXUSR | S_IXGRP | S_IXOTH))) {
 68+			fputs(COLOR_EXE, stdout);
 69+			need_reset = 1;
 70+		}
 71+	}
 72+	printname(name);
 73+	if (need_reset)
 74+		fputs(COLOR_RST, stdout);
 75+#else
 76+	(void)mode;
 77+	printname(name);
 78+#endif
 79+}
 80+
 81+#include <sys/ioctl.h>
 82+
 83+static void
 84+gettermwidth(void)
 85+{
 86+	struct winsize ws;
 87+
 88+	if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &ws) == 0 && ws.ws_col > 0)
 89+		termwidth = ws.ws_col;
 90+	else
 91+		termwidth = 80;
 92+}
 93+
 94+static size_t
 95+entrywidth(const struct entry *ent)
 96+{
 97+	size_t w;
 98+	char buf[32];
 99+
100+	w = utflen(ent->name);
101+	if (iflag) {
102+		snprintf(buf, sizeof(buf), "%lu ", (unsigned long)ent->ino);
103+		w += strlen(buf);
104+	}
105+	w += strlen(indicator(ent->mode));
106+	return w;
107+}
108+
109+static void
110+printcols(const struct entry *ents, size_t n)
111+{
112+	int i, r, c, ncols, nrows, total_width;
113+	int *colwidths;
114+	int maxcols;
115+
116+	if (n == 0)
117+		return;
118+
119+	gettermwidth();
120+
121+	colwidths = ecalloc(n, sizeof(*colwidths));
122+
123+	maxcols = termwidth / 2;
124+	if (maxcols > (int)n)
125+		maxcols = n;
126+
127+	for (ncols = maxcols; ncols > 1; ncols--) {
128+		nrows = (n + ncols - 1) / ncols;
129+		total_width = 0;
130+
131+		for (c = 0; c < ncols; c++) {
132+			int maxw = 0;
133+			for (r = 0; r < nrows; r++) {
134+				int idx = c * nrows + r;
135+				if (idx < (int)n) {
136+					int w = entrywidth(&ents[idx]);
137+					if (w > maxw)
138+						maxw = w;
139+				}
140+			}
141+			colwidths[c] = maxw;
142+			total_width += maxw;
143+		}
144+		total_width += 2 * (ncols - 1);
145+
146+		if (total_width < termwidth)
147+			break;
148+	}
149+
150+	if (ncols <= 1) {
151+		for (i = 0; i < (int)n; i++) {
152+			output(&ents[i]);
153+		}
154+		free(colwidths);
155+		return;
156+	}
157+
158+	nrows = (n + ncols - 1) / ncols;
159+	for (r = 0; r < nrows; r++) {
160+		for (c = 0; c < ncols; c++) {
161+			int idx = c * nrows + r;
162+			if (idx < (int)n) {
163+				int w = entrywidth(&ents[idx]);
164+				if (iflag)
165+					printf("%lu ", (unsigned long)ents[idx].ino);
166+				printname_colored(ents[idx].name, ents[idx].mode);
167+				fputs(indicator(ents[idx].mode), stdout);
168+				
169+				if (c < ncols - 1 && (c + 1) * nrows + r < (int)n) {
170+					int pad = colwidths[c] - w + 2;
171+					while (pad-- > 0)
172+						putchar(' ');
173+				}
174+			}
175+		}
176+		putchar('\n');
177+	}
178+
179+	free(colwidths);
180+}
181+
182 static void
183 output(const struct entry *ent)
184 {
185@@ -145,7 +314,7 @@ output(const struct entry *ent)
186 	if (iflag)
187 		printf("%lu ", (unsigned long)ent->ino);
188 	if (!lflag) {
189-		printname(ent->name);
190+		printname_colored(ent->name, ent->mode);
191 		puts(indicator(ent->mode));
192 		return;
193 	}
194@@ -208,13 +377,15 @@ output(const struct entry *ent)
195 	else
196 		printf("%10lu ", (unsigned long)ent->size);
197 	printf("%s ", buf);
198-	printname(ent->name);
199+	printname_colored(ent->name, ent->mode);
200 	fputs(indicator(ent->mode), stdout);
201 	if (S_ISLNK(ent->mode)) {
202 		if ((len = readlink(ent->name, buf, sizeof(buf) - 1)) < 0)
203 			eprintf("readlink %s:", ent->name);
204 		buf[len] = '\0';
205-		printf(" -> %s%s", buf, indicator(ent->tmode));
206+		printf(" -> ");
207+		printname_colored(buf, ent->tmode);
208+		fputs(indicator(ent->tmode), stdout);
209 	}
210 	putchar('\n');
211 }
212@@ -281,8 +452,12 @@ lsdir(const char *path, const struct entry *dir)
213 		printname(dir->name);
214 		puts(":");
215 	}
216-	for (i = 0; i < n; i++)
217-		output(&ents[i]);
218+	if (!lflag && Cflag) {
219+		printcols(ents, n);
220+	} else {
221+		for (i = 0; i < n; i++)
222+			output(&ents[i]);
223+	}
224 
225 	if (Rflag) {
226 		if (snprintf(prefix, PATH_MAX, "%s%s/", path, dir->name) >=
227@@ -362,7 +537,7 @@ ls(const char *path, const struct entry *ent, int listdir)
228 static void
229 usage(void)
230 {
231-	eprintf("usage: %s [-1AacdFfHhiLlnpqRrtUu] [file ...]\n", argv0);
232+	eprintf("usage: %s [-1ACacdFfHhiLlnpqRrtUu] [file ...]\n", argv0);
233 }
234 
235 int
236@@ -371,12 +546,18 @@ main(int argc, char *argv[])
237 	struct entry ent, *dents, *fents;
238 	size_t i, ds, fs;
239 
240+	if (isatty(STDOUT_FILENO))
241+		Cflag = 1;
242+	else
243+		one_flag = 1;
244+
245 	tree = ereallocarray(NULL, PATH_MAX, sizeof(*tree));
246 
247 	ARGBEGIN {
248 	case '1':
249-		/* force output to 1 entry per line */
250-		qflag = 1;
251+		one_flag = 1;
252+		Cflag = 0;
253+		lflag = 0;
254 		break;
255 	case 'A':
256 		Aflag = 1;
257@@ -388,6 +569,11 @@ main(int argc, char *argv[])
258 		cflag = 1;
259 		uflag = 0;
260 		break;
261+	case 'C':
262+		Cflag = 1;
263+		one_flag = 0;
264+		lflag = 0;
265+		break;
266 	case 'd':
267 		dflag = 1;
268 		break;
269@@ -413,10 +599,14 @@ main(int argc, char *argv[])
270 		break;
271 	case 'l':
272 		lflag = 1;
273+		Cflag = 0;
274+		one_flag = 0;
275 		break;
276 	case 'n':
277 		lflag = 1;
278 		nflag = 1;
279+		Cflag = 0;
280+		one_flag = 0;
281 		break;
282 	case 'p':
283 		pflag = 1;
284@@ -443,6 +633,35 @@ main(int argc, char *argv[])
285 		uflag = 1;
286 		cflag = 0;
287 		break;
288+	case '-':
289+#if FEATURE_LS_COLOR
290+		if (strncmp(argv[0], "-color", 6) == 0) {
291+			char *val = NULL;
292+			if (argv[0][6] == '=') {
293+				val = &argv[0][7];
294+			} else if (argv[0][6] == '\0') {
295+				val = "always";
296+			}
297+			if (val) {
298+				if (strcmp(val, "always") == 0)
299+					color_mode = COLOR_ALWAYS;
300+				else if (strcmp(val, "never") == 0)
301+					color_mode = COLOR_NEVER;
302+				else if (strcmp(val, "auto") == 0)
303+					color_mode = COLOR_AUTO;
304+				else {
305+					fprintf(stderr, "ls: invalid --color value: %s\n", val);
306+					usage();
307+				}
308+			}
309+			brk_ = 1;
310+		} else {
311+			usage();
312+		}
313+#else
314+		usage();
315+#endif
316+		break;
317 	default:
318 		usage();
319 	} ARGEND
320@@ -478,8 +697,12 @@ main(int argc, char *argv[])
321 		qsort(fents, fs, sizeof(ent), entcmp);
322 		qsort(dents, ds, sizeof(ent), entcmp);
323 
324-		for (i = 0; i < fs; ++i)
325-			ls("", &fents[i], 0);
326+		if (!lflag && Cflag && fs > 0) {
327+			printcols(fents, fs);
328+		} else {
329+			for (i = 0; i < fs; ++i)
330+				ls("", &fents[i], 0);
331+		}
332 		free(fents);
333 		if (fs && ds)
334 			putchar('\n');
+30, -7
  1@@ -169,6 +169,7 @@ static int is_eof(FILE *f);
  2 static void do_writes(void);
  3 static void write_file(char *path, FILE *out);
  4 static void check_puts(char *s, FILE *f);
  5+static void write_patt(char *s, FILE *f);
  6 static void update_ranges(Cmd *beg, Cmd *end);
  7 
  8 /* Sed functions */
  9@@ -216,6 +217,9 @@ static Vec wfiles; /* holds Wfile*. files for w and s///w commands */
 10 static Cmd   *prog, *pc; /* Program, program counter */
 11 static size_t pcap;
 12 static size_t lineno;
 13+#if FEATURE_SED_PRESERVE_NEWLINE
 14+static int hadnl = 1;
 15+#endif
 16 
 17 static regex_t *lastre; /* last used regex for empty regex search */
 18 static char   **files;  /* list of file names from argv */
 19@@ -479,6 +483,10 @@ read_line(FILE *f, String *s)
 20 			eprintf("getline:");
 21 		return EOF;
 22 	}
 23+#if FEATURE_SED_PRESERVE_NEWLINE
 24+	if (len > 0)
 25+		hadnl = (s->str[len - 1] == '\n');
 26+#endif
 27 	if (s->str[--len] == '\n')
 28 		s->str[len] = '\0';
 29 	return 0;
 30@@ -1227,6 +1235,21 @@ check_puts(char *s, FILE *f)
 31 		eprintf("fputs:");
 32 }
 33 
 34+static void
 35+write_patt(char *s, FILE *f)
 36+{
 37+#if FEATURE_SED_PRESERVE_NEWLINE
 38+	if (s && fputs(s, f) == EOF)
 39+		eprintf("fputs:");
 40+	if (hadnl) {
 41+		if (fputs("\n", f) == EOF)
 42+			eprintf("fputs:");
 43+	}
 44+#else
 45+	check_puts(s, f);
 46+#endif
 47+}
 48+
 49 /* iterate from beg to end updating ranges so we don't miss any commands
 50  * e.g. sed -n '1d;1,3p' should still print lines 2 and 3
 51  */
 52@@ -1391,7 +1414,7 @@ cmd_n(Cmd *c)
 53 		return;
 54 
 55 	if (!gflags.n)
 56-		check_puts(patt.str, stdout);
 57+		write_patt(patt.str, stdout);
 58 	do_writes();
 59 	new_line();
 60 }
 61@@ -1409,7 +1432,7 @@ static void
 62 cmd_p(Cmd *c)
 63 {
 64 	if (in_range(c))
 65-		check_puts(patt.str, stdout);
 66+		write_patt(patt.str, stdout);
 67 }
 68 
 69 static void
 70@@ -1423,7 +1446,7 @@ cmd_P(Cmd *c)
 71 	if ((p = strchr(patt.str, '\n')))
 72 		*p = '\0';
 73 
 74-	check_puts(patt.str, stdout);
 75+	write_patt(patt.str, stdout);
 76 
 77 	if (p)
 78 		*p = '\n';
 79@@ -1551,9 +1574,9 @@ cmd_s(Cmd *c)
 80 	genbuf = tmp;
 81 
 82 	if (c->u.s.p)
 83-		check_puts(patt.str, stdout);
 84+		write_patt(patt.str, stdout);
 85 	if (c->u.s.file)
 86-		check_puts(patt.str, c->u.s.file);
 87+		write_patt(patt.str, c->u.s.file);
 88 }
 89 
 90 static void
 91@@ -1572,7 +1595,7 @@ static void
 92 cmd_w(Cmd *c)
 93 {
 94 	if (in_range(c))
 95-		check_puts(patt.str, c->u.file);
 96+		write_patt(patt.str, c->u.file);
 97 }
 98 
 99 static void
100@@ -1662,7 +1685,7 @@ cmd_last(Cmd *c)
101 {
102 	(void)c;
103 	if (!gflags.n)
104-		check_puts(patt.str, stdout);
105+		write_patt(patt.str, stdout);
106 	do_writes();
107 	new_next();
108 }
+6, -1
 1@@ -178,6 +178,7 @@ BUILD_PSEUDO_WHOAMI = $(BUILD_PSEUDO)
 2 BUILD_PSEUDO_XINSTALL = $(BUILD_PSEUDO)
 3 BUILD_PSEUDO_YES = $(BUILD_PSEUDO)
 4 BUILD_PSEUDO_BASE64 = $(BUILD_PSEUDO)
 5+BUILD_PSEUDO_B3SUM = $(BUILD_PSEUDO)
 6 
 7 # make tool
 8 BUILD_MAKE_MAKE = $(BUILD_MAKE)
 9@@ -214,6 +215,8 @@ FEATURE_SH_ULIMIT       = 1
10 FEATURE_SH_SETVAR       = 1
11 FEATURE_SH_WORDEXP      = 1
12 FEATURE_CAL_EXT         = 1
13+FEATURE_SED_PRESERVE_NEWLINE = 1
14+FEATURE_LS_COLOR             = 1
15 
16 CPPFLAGS =\
17 	-Ishared\
18@@ -255,7 +258,9 @@ CPPFLAGS =\
19 	-DFEATURE_SH_ULIMIT=$(FEATURE_SH_ULIMIT)\
20 	-DFEATURE_SH_SETVAR=$(FEATURE_SH_SETVAR)\
21 	-DFEATURE_SH_WORDEXP=$(FEATURE_SH_WORDEXP)\
22-	-DFEATURE_CAL_EXT=$(FEATURE_CAL_EXT)
23+	-DFEATURE_CAL_EXT=$(FEATURE_CAL_EXT)\
24+	-DFEATURE_SED_PRESERVE_NEWLINE=$(FEATURE_SED_PRESERVE_NEWLINE)\
25+	-DFEATURE_LS_COLOR=$(FEATURE_LS_COLOR)
26 
27 CFLAGS   = -std=c99 -Wall -Wextra -pedantic
28 LDFLAGS  =
+6, -0
 1@@ -81,3 +81,9 @@
 2 #ifndef FEATURE_OD_ENDIAN
 3 #define FEATURE_OD_ENDIAN       1
 4 #endif
 5+#ifndef FEATURE_SED_PRESERVE_NEWLINE
 6+#define FEATURE_SED_PRESERVE_NEWLINE 1
 7+#endif
 8+#ifndef FEATURE_LS_COLOR
 9+#define FEATURE_LS_COLOR             1
10+#endif