commit 9faf0d2

pascalecu  ·  2026-02-19 21:21:11 +0000 UTC
parent 67d10f2
reformat everything
93 files changed,  +24395, -18432
+64, -60
  1@@ -33,8 +33,8 @@
  2 #include <stdlib.h>
  3 #include <string.h>
  4 #include <sys/mman.h>
  5-#include <sys/types.h>
  6 #include <sys/stat.h>
  7+#include <sys/types.h>
  8 
  9 #define min(a, b) ((a) < (b) ? (a) : (b))
 10 #define max(a, b) ((a) > (b) ? (a) : (b))
 11@@ -53,7 +53,7 @@ struct glyph {
 12 static struct {
 13 	int count;
 14 	struct glyph *glyphs;
 15-} extracted_font = { 0, NULL };
 16+} extracted_font = {0, NULL};
 17 
 18 #define PCF_PROPERTIES (1 << 0)
 19 #define PCF_ACCELERATORS (1 << 1)
 20@@ -136,10 +136,8 @@ handle_compressed_metrics(int32_t count, struct compressed_metrics *m)
 21 	int i;
 22 	for (i = 0; i < count; ++i) {
 23 		struct glyph *glyph = &extracted_font.glyphs[i];
 24-		glyph->left_bearing =
 25-		    ((int16_t)m[i].left_sided_bearing) - 0x80;
 26-		glyph->right_bearing =
 27-		    ((int16_t)m[i].right_side_bearing) - 0x80;
 28+		glyph->left_bearing = ((int16_t)m[i].left_sided_bearing) - 0x80;
 29+		glyph->right_bearing = ((int16_t)m[i].right_side_bearing) - 0x80;
 30 		glyph->width = ((int16_t)m[i].character_width) - 0x80;
 31 		glyph->ascent = ((int16_t)m[i].character_ascent) - 0x80;
 32 		glyph->descent = ((int16_t)m[i].character_descent) - 0x80;
 33@@ -161,9 +159,8 @@ handle_metrics(void *metricbuf)
 34 	if ((metrics->format & PCF_FORMAT_MASK) == PCF_DEFAULT_FORMAT) {
 35 		fprintf(stderr, "todo...\n");
 36 	} else if ((metrics->format & PCF_FORMAT_MASK) == PCF_COMPRESSED_METRICS) {
 37-		handle_compressed_metrics(
 38-		    metrics->compressed.count,
 39-		    &metrics->compressed.compressed_metrics[0]);
 40+		handle_compressed_metrics(metrics->compressed.count,
 41+		                          &metrics->compressed.compressed_metrics[0]);
 42 	} else {
 43 		fprintf(stderr, "incompatible format\n");
 44 		abort();
 45@@ -181,8 +178,8 @@ handle_glyph_names(struct glyph_names *names)
 46 
 47 	fprintf(stderr, "glyph names format %x\n", names->format);
 48 
 49-	char *names_start = ((char *)names) + sizeof(struct glyph_names)
 50-	                    + (names->glyph_count + 1) * sizeof(int32_t);
 51+	char *names_start = ((char *)names) + sizeof(struct glyph_names) +
 52+	                    (names->glyph_count + 1) * sizeof(int32_t);
 53 
 54 	int i;
 55 	for (i = 0; i < names->glyph_count; ++i) {
 56@@ -210,8 +207,8 @@ handle_bitmaps(struct bitmaps *bitmaps)
 57 		abort();
 58 	}
 59 
 60-	char *bitmaps_start = ((char *)bitmaps) + sizeof(struct bitmaps)
 61-	                      + (bitmaps->glyph_count + 4) * sizeof(int32_t);
 62+	char *bitmaps_start = ((char *)bitmaps) + sizeof(struct bitmaps) +
 63+	                      (bitmaps->glyph_count + 4) * sizeof(int32_t);
 64 
 65 	for (unsigned i = 0; i < bitmaps->glyph_count; ++i) {
 66 		int32_t offset = bitmaps->offsets[i];
 67@@ -247,8 +244,9 @@ get_glyph_pixel(struct glyph *glyph, int x, int y)
 68 	int absx = glyph->hotx + x;
 69 	int absy = glyph->hoty + y;
 70 
 71-	if (absx < 0 || absx >= glyph->width || absy < 0 || absy >= glyph->height)
 72+	if (absx < 0 || absx >= glyph->width || absy < 0 || absy >= glyph->height) {
 73 		return 0;
 74+	}
 75 
 76 	int stride = (glyph->width + 31) / 32 * 4;
 77 	unsigned char block = glyph->data[absy * stride + (absx / 8)];
 78@@ -275,8 +273,7 @@ add_pixel(uint32_t pixel)
 79 	if (data_buffer.size == data_buffer.capacity) {
 80 		data_buffer.capacity *= 2;
 81 		data_buffer.data =
 82-		    realloc(data_buffer.data,
 83-		            sizeof(uint32_t) * data_buffer.capacity);
 84+		    realloc(data_buffer.data, sizeof(uint32_t) * data_buffer.capacity);
 85 	}
 86 	data_buffer.data[data_buffer.size++] = pixel;
 87 }
 88@@ -296,8 +293,7 @@ reconstruct_glyph(struct glyph *cursor, struct glyph *mask, char *name,
 89 	int maxx = max(cursor->right_bearing, mask->right_bearing);
 90 
 91 	int miny = min(-cursor->hoty, -mask->hoty);
 92-	int maxy = max(cursor->height - cursor->hoty,
 93-	               mask->height - mask->hoty);
 94+	int maxy = max(cursor->height - cursor->hoty, mask->height - mask->hoty);
 95 
 96 	int width = maxx - minx;
 97 	int height = maxy - miny;
 98@@ -315,10 +311,11 @@ reconstruct_glyph(struct glyph *cursor, struct glyph *mask, char *name,
 99 			char alpha = get_glyph_pixel(mask, x, y);
100 			if (alpha) {
101 				char color = get_glyph_pixel(cursor, x, y);
102-				if (color)
103+				if (color) {
104 					add_pixel(0xff000000);
105-				else
106+				} else {
107 					add_pixel(0xffffffff);
108+				}
109 			} else {
110 				add_pixel(0);
111 			}
112@@ -326,25 +323,33 @@ reconstruct_glyph(struct glyph *cursor, struct glyph *mask, char *name,
113 	}
114 }
115 
116-/* From http://cgit.freedesktop.org/xorg/lib/libXfont/tree/src/builtins/fonts.c */
117+/* From http://cgit.freedesktop.org/xorg/lib/libXfont/tree/src/builtins/fonts.c
118+ */
119 static const char cursor_licence[] =
120     "/*\n"
121     "* Copyright 1999 SuSE, Inc.\n"
122     "*\n"
123-    "* Permission to use, copy, modify, distribute, and sell this software and its\n"
124-    "* documentation for any purpose is hereby granted without fee, provided that\n"
125+    "* Permission to use, copy, modify, distribute, and sell this software and "
126+    "its\n"
127+    "* documentation for any purpose is hereby granted without fee, provided "
128+    "that\n"
129     "* the above copyright notice appear in all copies and that both that\n"
130     "* copyright notice and this permission notice appear in supporting\n"
131     "* documentation, and that the name of SuSE not be used in advertising or\n"
132     "* publicity pertaining to distribution of the software without specific,\n"
133     "* written prior permission.  SuSE makes no representations about the\n"
134-    "* suitability of this software for any purpose.  It is provided \"as is\"\n"
135+    "* suitability of this software for any purpose.  It is provided \"as "
136+    "is\"\n"
137     "* without express or implied warranty.\n"
138     "*\n"
139-    "* SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL\n"
140-    "* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE\n"
141-    "* BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES\n"
142-    "* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION\n"
143+    "* SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING "
144+    "ALL\n"
145+    "* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL "
146+    "SuSE\n"
147+    "* BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY "
148+    "DAMAGES\n"
149+    "* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN "
150+    "ACTION\n"
151     "* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN\n"
152     "* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.\n"
153     "*\n"
154@@ -368,16 +373,18 @@ write_output_file(FILE *file, struct reconstructed_glyph *glyphs, int n)
155 
156 		for (j = 0; j < size; ++j) {
157 			fprintf(file, "0x%08x, ", data[j]);
158-			if (++counter % 6 == 0)
159+			if (++counter % 6 == 0) {
160 				fprintf(file, "\n\t");
161+			}
162 		}
163 	}
164 	fprintf(file, "\n};\n\n");
165 
166 	fputs("enum cursor_type {\n", file);
167 
168-	for (i = 0; i < n; ++i)
169+	for (i = 0; i < n; ++i) {
170 		fprintf(file, "\tcursor_%s,\n", glyphs[i].name);
171+	}
172 
173 	fputs("};\n\n", file);
174 
175@@ -388,11 +395,11 @@ write_output_file(FILE *file, struct reconstructed_glyph *glyphs, int n)
176 	        "\tsize_t offset;\n"
177 	        "} cursor_metadata[] = {\n");
178 
179-	for (i = 0; i < n; ++i)
180-		fprintf(file, "\t{ %d, %d, %d, %d, %zu }, /* %s */\n",
181-		        glyphs[i].width, glyphs[i].height,
182-		        glyphs[i].hotspot_x, glyphs[i].hotspot_y,
183+	for (i = 0; i < n; ++i) {
184+		fprintf(file, "\t{ %d, %d, %d, %d, %zu }, /* %s */\n", glyphs[i].width,
185+		        glyphs[i].height, glyphs[i].hotspot_x, glyphs[i].hotspot_y,
186 		        glyphs[i].offset, glyphs[i].name);
187+	}
188 
189 	fputs("};\n", file);
190 }
191@@ -408,7 +415,8 @@ find_mask_glyph(char *name)
192 	for (i = 0; i < extracted_font.count; ++i) {
193 		struct glyph *g = &extracted_font.glyphs[i];
194 		int l2 = strlen(g->name);
195-		if ((l2 == len + masklen) && (memcmp(g->name, name, len) == 0) && (memcmp(g->name + len, mask, masklen) == 0)) {
196+		if ((l2 == len + masklen) && (memcmp(g->name, name, len) == 0) &&
197+		    (memcmp(g->name + len, mask, masklen) == 0)) {
198 			return g;
199 		}
200 	}
201@@ -428,43 +436,41 @@ find_cursor_and_mask(const char *name,
202 
203 	for (i = 0; i < extracted_font.count && (!*mask || !*cursor); ++i) {
204 		struct glyph *g = &extracted_font.glyphs[i];
205-		if (!strcmp(name, g->name))
206+		if (!strcmp(name, g->name)) {
207 			*cursor = g;
208-		else if (!strcmp(mask_name, g->name))
209+		} else if (!strcmp(mask_name, g->name)) {
210 			*mask = g;
211+		}
212 	}
213 }
214 
215 static struct {
216 	char *target_name, *source_name;
217-} interesting_cursors[] = {
218-	{ "bottom_left_corner", "bottom_left_corner" },
219-	{ "bottom_right_corner", "bottom_right_corner" },
220-	{ "bottom_side", "bottom_side" },
221-	{ "grabbing", "fleur" },
222-	{ "left_ptr", "left_ptr" },
223-	{ "left_side", "left_side" },
224-	{ "right_side", "right_side" },
225-	{ "top_left_corner", "top_left_corner" },
226-	{ "top_right_corner", "top_right_corner" },
227-	{ "top_side", "top_side" },
228-	{ "xterm", "xterm" },
229-	{ "hand1", "hand1" },
230-	{ "watch", "watch" }
231-};
232+} interesting_cursors[] = {{"bottom_left_corner", "bottom_left_corner"},
233+                           {"bottom_right_corner", "bottom_right_corner"},
234+                           {"bottom_side", "bottom_side"},
235+                           {"grabbing", "fleur"},
236+                           {"left_ptr", "left_ptr"},
237+                           {"left_side", "left_side"},
238+                           {"right_side", "right_side"},
239+                           {"top_left_corner", "top_left_corner"},
240+                           {"top_right_corner", "top_right_corner"},
241+                           {"top_side", "top_side"},
242+                           {"xterm", "xterm"},
243+                           {"hand1", "hand1"},
244+                           {"watch", "watch"}};
245 
246 static void
247 output_interesting_cursors(FILE *file)
248 {
249 	int i;
250 	int n = sizeof(interesting_cursors) / sizeof(interesting_cursors[0]);
251-	struct reconstructed_glyph *glyphs =
252-	    malloc(n * sizeof(*glyphs));
253+	struct reconstructed_glyph *glyphs = malloc(n * sizeof(*glyphs));
254 
255 	for (i = 0; i < n; ++i) {
256 		struct glyph *cursor, *mask;
257-		find_cursor_and_mask(interesting_cursors[i].source_name,
258-		                     &cursor, &mask);
259+		find_cursor_and_mask(interesting_cursors[i].source_name, &cursor,
260+		                     &mask);
261 		if (!cursor) {
262 			fprintf(stderr, "no cursor for %s\n",
263 			        interesting_cursors[i].source_name);
264@@ -475,8 +481,7 @@ output_interesting_cursors(FILE *file)
265 			        interesting_cursors[i].source_name);
266 			abort();
267 		}
268-		reconstruct_glyph(cursor, mask,
269-		                  interesting_cursors[i].target_name,
270+		reconstruct_glyph(cursor, mask, interesting_cursors[i].target_name,
271 		                  &glyphs[i]);
272 	}
273 
274@@ -496,8 +501,7 @@ main(int argc, char *argv[])
275 
276 	fstat(fd, &filestat);
277 
278-	void *fontbuf = mmap(NULL, filestat.st_size, PROT_READ,
279-	                     MAP_PRIVATE, fd, 0);
280+	void *fontbuf = mmap(NULL, filestat.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
281 
282 	handle_pcf(fontbuf);
283 
+45, -36
  1@@ -39,8 +39,8 @@ struct window {
  2 	struct wl_list link;
  3 };
  4 
  5-static const char *terminal_command[] = { "st-wl", NULL };
  6-static const char *dmenu_command[] = { "dmenu_run-wl", NULL };
  7+static const char *terminal_command[] = {"st-wl", NULL};
  8+static const char *dmenu_command[] = {"dmenu_run-wl", NULL};
  9 static const uint32_t border_width = 1;
 10 static const uint32_t border_color_active = 0xff333388;
 11 static const uint32_t border_color_normal = 0xff888888;
 12@@ -60,27 +60,29 @@ arrange(struct screen *screen)
 13 	struct swc_rectangle geometry;
 14 	struct swc_rectangle *screen_geometry = &screen->swc->usable_geometry;
 15 
 16-	if (screen->num_windows == 0)
 17+	if (screen->num_windows == 0) {
 18 		return;
 19+	}
 20 
 21 	num_columns = ceil(sqrt(screen->num_windows));
 22 	num_rows = screen->num_windows / num_columns + 1;
 23 	window = wl_container_of(screen->windows.next, window, link);
 24 
 25 	for (column_index = 0; &window->link != &screen->windows; ++column_index) {
 26-		geometry.x = screen_geometry->x + border_width
 27-		             + screen_geometry->width * column_index / num_columns;
 28-		geometry.width = screen_geometry->width / num_columns
 29-		                 - 2 * border_width;
 30+		geometry.x = screen_geometry->x + border_width +
 31+		             screen_geometry->width * column_index / num_columns;
 32+		geometry.width =
 33+		    screen_geometry->width / num_columns - 2 * border_width;
 34 
 35-		if (column_index == screen->num_windows % num_columns)
 36+		if (column_index == screen->num_windows % num_columns) {
 37 			--num_rows;
 38+		}
 39 
 40 		for (row_index = 0; row_index < num_rows; ++row_index) {
 41-			geometry.y = screen_geometry->y + border_width
 42-			             + screen_geometry->height * row_index / num_rows;
 43-			geometry.height = screen_geometry->height / num_rows
 44-			                  - 2 * border_width;
 45+			geometry.y = screen_geometry->y + border_width +
 46+			             screen_geometry->height * row_index / num_rows;
 47+			geometry.height =
 48+			    screen_geometry->height / num_rows - 2 * border_width;
 49 
 50 			swc_window_set_geometry(window->swc, &geometry);
 51 			window = wl_container_of(window->link.next, window, link);
 52@@ -112,15 +114,16 @@ static void
 53 focus(struct window *window)
 54 {
 55 	if (focused_window) {
 56-		swc_window_set_border(focused_window->swc,
 57-		                      border_color_normal, border_width);
 58+		swc_window_set_border(focused_window->swc, border_color_normal,
 59+		                      border_width);
 60 	}
 61 
 62 	if (window) {
 63 		swc_window_set_border(window->swc, border_color_active, border_width);
 64 		swc_window_focus(window->swc);
 65-	} else
 66+	} else {
 67 		swc_window_focus(NULL);
 68+	}
 69 
 70 	focused_window = window;
 71 }
 72@@ -145,8 +148,8 @@ screen_entered(void *data)
 73 }
 74 
 75 static const struct swc_screen_handler screen_handler = {
 76-	.usable_geometry_changed = &screen_usable_geometry_changed,
 77-	.entered = &screen_entered,
 78+    .usable_geometry_changed = &screen_usable_geometry_changed,
 79+    .entered = &screen_entered,
 80 };
 81 
 82 static void
 83@@ -159,11 +162,11 @@ window_destroy(void *data)
 84 		next_focus = wl_container_of(window->link.next, window, link);
 85 
 86 		if (&next_focus->link == &window->screen->windows) {
 87-			next_focus = wl_container_of(window->link.prev,
 88-			                             window, link);
 89+			next_focus = wl_container_of(window->link.prev, window, link);
 90 
 91-			if (&next_focus->link == &window->screen->windows)
 92+			if (&next_focus->link == &window->screen->windows) {
 93 				next_focus = NULL;
 94+			}
 95 		}
 96 
 97 		focus(next_focus);
 98@@ -182,8 +185,8 @@ window_entered(void *data)
 99 }
100 
101 static const struct swc_window_handler window_handler = {
102-	.destroy = &window_destroy,
103-	.entered = &window_entered,
104+    .destroy = &window_destroy,
105+    .entered = &window_entered,
106 };
107 
108 static void
109@@ -193,8 +196,9 @@ new_screen(struct swc_screen *swc)
110 
111 	screen = malloc(sizeof(*screen));
112 
113-	if (!screen)
114+	if (!screen) {
115 		return;
116+	}
117 
118 	screen->swc = swc;
119 	screen->num_windows = 0;
120@@ -210,8 +214,9 @@ new_window(struct swc_window *swc)
121 
122 	window = malloc(sizeof(*window));
123 
124-	if (!window)
125+	if (!window) {
126 		return;
127+	}
128 
129 	window->swc = swc;
130 	window->screen = NULL;
131@@ -221,15 +226,16 @@ new_window(struct swc_window *swc)
132 	focus(window);
133 }
134 
135-const struct swc_manager manager = { &new_screen, &new_window };
136+const struct swc_manager manager = {&new_screen, &new_window};
137 
138 static void
139 spawn(void *data, uint32_t time, uint32_t value, uint32_t state)
140 {
141 	char *const *command = data;
142 
143-	if (state != WL_KEYBOARD_KEY_STATE_PRESSED)
144+	if (state != WL_KEYBOARD_KEY_STATE_PRESSED) {
145 		return;
146+	}
147 
148 	if (fork() == 0) {
149 		execvp(command[0], command);
150@@ -240,8 +246,9 @@ spawn(void *data, uint32_t time, uint32_t value, uint32_t state)
151 static void
152 quit(void *data, uint32_t time, uint32_t value, uint32_t state)
153 {
154-	if (state != WL_KEYBOARD_KEY_STATE_PRESSED)
155+	if (state != WL_KEYBOARD_KEY_STATE_PRESSED) {
156 		return;
157+	}
158 
159 	wl_display_terminate(display);
160 }
161@@ -252,23 +259,25 @@ main(int argc, char *argv[])
162 	const char *socket;
163 
164 	display = wl_display_create();
165-	if (!display)
166+	if (!display) {
167 		return EXIT_FAILURE;
168+	}
169 
170 	socket = wl_display_add_socket_auto(display);
171-	if (!socket)
172+	if (!socket) {
173 		return EXIT_FAILURE;
174+	}
175 	setenv("WAYLAND_DISPLAY", socket, 1);
176 
177-	if (!swc_initialize(display, NULL, &manager))
178+	if (!swc_initialize(display, NULL, &manager)) {
179 		return EXIT_FAILURE;
180+	}
181 
182-	swc_add_binding(SWC_BINDING_KEY, SWC_MOD_LOGO, XKB_KEY_Return,
183-	                &spawn, terminal_command);
184-	swc_add_binding(SWC_BINDING_KEY, SWC_MOD_LOGO, XKB_KEY_r,
185-	                &spawn, dmenu_command);
186-	swc_add_binding(SWC_BINDING_KEY, SWC_MOD_LOGO, XKB_KEY_q,
187-	                &quit, NULL);
188+	swc_add_binding(SWC_BINDING_KEY, SWC_MOD_LOGO, XKB_KEY_Return, &spawn,
189+	                terminal_command);
190+	swc_add_binding(SWC_BINDING_KEY, SWC_MOD_LOGO, XKB_KEY_r, &spawn,
191+	                dmenu_command);
192+	swc_add_binding(SWC_BINDING_KEY, SWC_MOD_LOGO, XKB_KEY_q, &quit, NULL);
193 
194 	event_loop = wl_display_get_event_loop(display);
195 	wl_display_run(display);
+2, -2
 1@@ -25,9 +25,9 @@
 2  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 3  * SOFTWARE.
 4  */
 5-#include <sys/sysmacros.h>
 6-#include <linux/major.h>
 7 #include "devmajor.h"
 8+#include <linux/major.h>
 9+#include <sys/sysmacros.h>
10 
11 #ifndef DRM_MAJOR
12 #define DRM_MAJOR 226
+8, -6
 1@@ -20,19 +20,22 @@
 2  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 3  * SOFTWARE.
 4  */
 5-#include <sys/stat.h>
 6-#include <stdlib.h>
 7 #include "devmajor.h"
 8+#include <stdlib.h>
 9+#include <sys/stat.h>
10 
11 bool
12 device_is_input(dev_t rdev)
13 {
14-	if (major(rdev) == getdevmajor("wskbd", S_IFCHR))
15+	if (major(rdev) == getdevmajor("wskbd", S_IFCHR)) {
16 		return true;
17-	if (major(rdev) == getdevmajor("wsmouse", S_IFCHR))
18+	}
19+	if (major(rdev) == getdevmajor("wsmouse", S_IFCHR)) {
20 		return true;
21-	if (major(rdev) == getdevmajor("wsmux", S_IFCHR))
22+	}
23+	if (major(rdev) == getdevmajor("wsmux", S_IFCHR)) {
24 		return true;
25+	}
26 	return false;
27 }
28 
29@@ -47,4 +50,3 @@ device_is_drm(dev_t rdev)
30 {
31 	return major(rdev) == getdevmajor("drm", S_IFCHR);
32 }
33-
+14, -11
 1@@ -21,10 +21,10 @@
 2  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 3  * SOFTWARE.
 4  */
 5-#include <sys/stat.h>
 6+#include "devmajor.h"
 7 #include <stdlib.h>
 8 #include <string.h>
 9-#include "devmajor.h"
10+#include <sys/stat.h>
11 
12 static bool
13 devname_is(dev_t rdev, const char *prefix)
14@@ -33,8 +33,9 @@ devname_is(dev_t rdev, const char *prefix)
15 	size_t len;
16 
17 	name = devname(rdev, S_IFCHR);
18-	if (!name || name[0] == '?' || name[1] == '?')
19+	if (!name || name[0] == '?' || name[1] == '?') {
20 		return false;
21+	}
22 
23 	len = strlen(prefix);
24 	return strncmp(name, prefix, len) == 0;
25@@ -43,12 +44,15 @@ devname_is(dev_t rdev, const char *prefix)
26 bool
27 device_is_input(dev_t rdev)
28 {
29-	if (devname_is(rdev, "wskbd"))
30+	if (devname_is(rdev, "wskbd")) {
31 		return true;
32-	if (devname_is(rdev, "wsmouse"))
33+	}
34+	if (devname_is(rdev, "wsmouse")) {
35 		return true;
36-	if (devname_is(rdev, "wsmux"))
37+	}
38+	if (devname_is(rdev, "wsmux")) {
39 		return true;
40+	}
41 	return false;
42 }
43 
44@@ -64,11 +68,10 @@ device_is_drm(dev_t rdev)
45 	const char *n;
46 
47 	n = devname(rdev, S_IFCHR);
48-	if (!n)
49+	if (!n) {
50 		return false;
51+	}
52 
53-	return
54-		strncmp(n, "drm", 3) == 0 ||
55-		strncmp(n, "dri/card", 8) == 0 ||
56-		strncmp(n, "dri/renderD", 11) == 0;
57+	return strncmp(n, "drm", 3) == 0 || strncmp(n, "dri/card", 8) == 0 ||
58+	       strncmp(n, "dri/renderD", 11) == 0;
59 }
+1, -1
 1@@ -24,8 +24,8 @@
 2 #ifndef DEVMAJOR_H
 3 #define DEVMAJOR_H
 4 
 5-#include <sys/stat.h>
 6 #include <stdbool.h>
 7+#include <sys/stat.h>
 8 
 9 bool device_is_input(dev_t);
10 
+119, -69
  1@@ -25,13 +25,14 @@
  2  * SOFTWARE.
  3  */
  4 
  5-#include "protocol.h"
  6 #include "devmajor.h"
  7+#include "protocol.h"
  8 
  9 #include <errno.h>
 10 #include <fcntl.h>
 11 #include <limits.h>
 12 #include <poll.h>
 13+#include <signal.h>
 14 #include <spawn.h>
 15 #include <stdbool.h>
 16 #include <stdio.h>
 17@@ -39,13 +40,12 @@
 18 #include <stdnoreturn.h>
 19 #include <string.h>
 20 #include <unistd.h>
 21-#include <signal.h>
 22 
 23+#include <sys/ioctl.h>
 24 #include <sys/socket.h>
 25 #include <sys/stat.h>
 26-#include <sys/wait.h>
 27-#include <sys/ioctl.h>
 28 #include <sys/types.h>
 29+#include <sys/wait.h>
 30 #ifdef __linux__
 31 #include <sys/sysmacros.h>
 32 #endif
 33@@ -69,8 +69,10 @@
 34 
 35 #define ARRAY_LENGTH(array) (sizeof(array) / sizeof(array)[0])
 36 
 37-static void activate(void);
 38-static void deactivate(void);
 39+static void
 40+activate(void);
 41+static void
 42+deactivate(void);
 43 
 44 static bool nflag;
 45 static int sigfd[2], sock[2];
 46@@ -86,9 +88,11 @@ static struct {
 47 	long console_mode;
 48 } original_vt_state;
 49 
 50-static void cleanup(void);
 51+static void
 52+cleanup(void);
 53 
 54-static noreturn void usage(const char *name)
 55+static noreturn void
 56+usage(const char *name)
 57 {
 58 	fprintf(stderr, "usage: %s [-n] [-t tty] [--] server [args...]\n", name);
 59 	exit(2);
 60@@ -103,8 +107,9 @@ die(const char *format, ...)
 61 	vfprintf(stderr, format, args);
 62 	va_end(args);
 63 
 64-	if (format[0] && format[strlen(format) - 1] == ':')
 65+	if (format[0] && format[strlen(format) - 1] == ':') {
 66 		fprintf(stderr, " %s", strerror(errno));
 67+	}
 68 	fputc('\n', stderr);
 69 
 70 	cleanup();
 71@@ -117,8 +122,9 @@ start_devices(void)
 72 	int i;
 73 
 74 	for (i = 0; i < num_drm_fds; ++i) {
 75-		if (drmSetMaster(drm_fds[i]) < 0)
 76+		if (drmSetMaster(drm_fds[i]) < 0) {
 77 			die("failed to set DRM master");
 78+		}
 79 	}
 80 }
 81 
 82@@ -128,13 +134,16 @@ stop_devices(bool fatal)
 83 	int i;
 84 
 85 	for (i = 0; i < num_drm_fds; ++i) {
 86-		if (drmDropMaster(drm_fds[i]) < 0 && fatal)
 87+		if (drmDropMaster(drm_fds[i]) < 0 && fatal) {
 88 			die("drmDropMaster:");
 89+		}
 90 	}
 91 	for (i = 0; i < num_input_fds; ++i) {
 92 #ifdef EVIOCREVOKE
 93-		if (ioctl(input_fds[i], EVIOCREVOKE, 0) < 0 && errno != ENODEV && fatal)
 94+		if (ioctl(input_fds[i], EVIOCREVOKE, 0) < 0 && errno != ENODEV &&
 95+		    fatal) {
 96 			die("ioctl EVIOCREVOKE:");
 97+		}
 98 #endif
 99 		close(input_fds[i]);
100 	}
101@@ -148,11 +157,12 @@ cleanup(void)
102 	struct vt_mode mode = {.mode = VT_AUTO};
103 #endif
104 
105-	if (!original_vt_state.altered)
106+	if (!original_vt_state.altered) {
107 		return;
108+	}
109 
110-	/* Stop devices before switching the VT to make sure we have released the DRM
111-	 * device before the next session tries to claim it. */
112+	/* Stop devices before switching the VT to make sure we have released the
113+	 * DRM device before the next session tries to claim it. */
114 	stop_devices(false);
115 
116 	/* Cleanup VT */
117@@ -202,19 +212,20 @@ handle_socket_data(int socket)
118 	struct swc_launch_event response;
119 	char path[PATH_MAX];
120 	struct iovec request_iov[2] = {
121-		{.iov_base = &request, .iov_len = sizeof(request)},
122-		{.iov_base = path, .iov_len = sizeof(path)},
123+	    {.iov_base = &request, .iov_len = sizeof(request)},
124+	    {.iov_base = path, .iov_len = sizeof(path)},
125 	};
126 	struct iovec response_iov[1] = {
127-		{.iov_base = &response, .iov_len = sizeof(response)},
128+	    {.iov_base = &response, .iov_len = sizeof(response)},
129 	};
130 	int fd = -1;
131 	struct stat st;
132 	ssize_t size;
133 
134 	size = receive_fd(socket, &fd, request_iov, 2);
135-	if (size == -1 || size == 0 || size < sizeof(request))
136+	if (size == -1 || size == 0 || size < sizeof(request)) {
137 		return;
138+	}
139 	size -= sizeof(request);
140 
141 	response.type = SWC_LAUNCH_EVENT_RESPONSE;
142@@ -226,7 +237,8 @@ handle_socket_data(int socket)
143 			fprintf(stderr, "path is not NULL terminated\n");
144 			goto fail;
145 		}
146-		if ((request.flags & (O_ACCMODE|O_NONBLOCK|O_CLOEXEC)) != request.flags) {
147+		if ((request.flags & (O_ACCMODE | O_NONBLOCK | O_CLOEXEC)) !=
148+		    request.flags) {
149 			fprintf(stderr, "invalid open flags\n");
150 			goto fail;
151 		}
152@@ -242,8 +254,9 @@ handle_socket_data(int socket)
153 		}
154 
155 		if (device_is_input(st.st_rdev)) {
156-			if (!active)
157+			if (!active) {
158 				goto fail;
159+			}
160 			if (num_input_fds == ARRAY_LENGTH(input_fds)) {
161 				fprintf(stderr, "too many input devices opened\n");
162 				goto fail;
163@@ -261,11 +274,14 @@ handle_socket_data(int socket)
164 		}
165 		break;
166 	case SWC_LAUNCH_REQUEST_ACTIVATE_VT:
167-		if (!active)
168+		if (!active) {
169 			goto fail;
170+		}
171 
172-		if (ioctl(tty_fd, VT_ACTIVATE, request.vt) == -1)
173-			fprintf(stderr, "failed to activate VT %d: %s\n", request.vt, strerror(errno));
174+		if (ioctl(tty_fd, VT_ACTIVATE, request.vt) == -1) {
175+			fprintf(stderr, "failed to activate VT %d: %s\n", request.vt,
176+			        strerror(errno));
177+		}
178 		break;
179 	default:
180 		fprintf(stderr, "unknown request %u\n", request.type);
181@@ -277,8 +293,9 @@ handle_socket_data(int socket)
182 
183 fail:
184 	response.success = false;
185-	if (fd != -1)
186+	if (fd != -1) {
187 		close(fd);
188+	}
189 	fd = -1;
190 done:
191 	send_fd(socket, fd, response_iov, 1);
192@@ -288,33 +305,41 @@ static void
193 find_vt(char *vt, size_t size)
194 {
195 #if defined(__NetBSD__)
196-	if (snprintf(vt, size, "/dev/ttyE1") >= size)
197+	if (snprintf(vt, size, "/dev/ttyE1") >= size) {
198 		die("VT number is too large");
199+	}
200 #elif defined(__OpenBSD__)
201 	const char *tty;
202 	tty = ttyname(STDIN_FILENO);
203-	if (!tty || strncmp(tty, "/dev/ttyC", 8) != 0)
204+	if (!tty || strncmp(tty, "/dev/ttyC", 8) != 0) {
205 		die("must be run from wscons VT (/dev/ttyC*)");
206-	if (snprintf(vt, size, "%s", tty) >= size)
207+	}
208+	if (snprintf(vt, size, "%s", tty) >= size) {
209 		die("VT number is too large");
210+	}
211 #else
212 	char *vtnr;
213 	int tty0_fd, vt_num;
214 
215-	/* If we are running from an existing X or wayland session, always open a new
216-	 * VT instead of using the current one. */
217-	if (getenv("DISPLAY") || getenv("WAYLAND_DISPLAY") || !(vtnr = getenv("XDG_VTNR"))) {
218+	/* If we are running from an existing X or wayland session, always open a
219+	 * new VT instead of using the current one. */
220+	if (getenv("DISPLAY") || getenv("WAYLAND_DISPLAY") ||
221+	    !(vtnr = getenv("XDG_VTNR"))) {
222 		tty0_fd = open("/dev/tty0", O_RDWR);
223-		if (tty0_fd == -1)
224+		if (tty0_fd == -1) {
225 			die("open /dev/tty0:");
226-		if (ioctl(tty0_fd, VT_OPENQRY, &vt_num) != 0)
227+		}
228+		if (ioctl(tty0_fd, VT_OPENQRY, &vt_num) != 0) {
229 			die("VT open query failed:");
230+		}
231 		close(tty0_fd);
232-		if (snprintf(vt, size, "/dev/tty%d", vt_num) >= size)
233+		if (snprintf(vt, size, "/dev/tty%d", vt_num) >= size) {
234 			die("VT number is too large");
235+		}
236 	} else {
237-		if (snprintf(vt, size, "/dev/tty%s", vtnr) >= size)
238+		if (snprintf(vt, size, "/dev/tty%s", vtnr) >= size) {
239 			die("XDG_VTNR is too long");
240+		}
241 	}
242 #endif
243 }
244@@ -326,12 +351,15 @@ open_tty(const char *tty_name)
245 	int fd;
246 
247 	/* Check if we are already running on the desired VT */
248-	if ((stdin_tty = ttyname(STDIN_FILENO)) && strcmp(tty_name, stdin_tty) == 0)
249+	if ((stdin_tty = ttyname(STDIN_FILENO)) &&
250+	    strcmp(tty_name, stdin_tty) == 0) {
251 		return STDIN_FILENO;
252+	}
253 
254 	fd = open(tty_name, O_RDWR | O_NOCTTY);
255-	if (fd < 0)
256+	if (fd < 0) {
257 		die("open %s:", tty_name);
258+	}
259 
260 	return fd;
261 }
262@@ -344,29 +372,30 @@ setup_tty(int fd)
263 #ifndef __OpenBSD__
264 	struct vt_stat state;
265 	struct vt_mode mode = {
266-		.mode = VT_PROCESS,
267-		.relsig = SIGUSR1,
268-		.acqsig = SIGUSR2
269-	};
270+	    .mode = VT_PROCESS, .relsig = SIGUSR1, .acqsig = SIGUSR2};
271 #endif
272 
273-	if (fstat(fd, &st) == -1)
274+	if (fstat(fd, &st) == -1) {
275 		die("failed to stat TTY fd:");
276+	}
277 	vt = minor(st.st_rdev);
278 
279 #ifdef __OpenBSD__
280-	if (!device_is_tty(st.st_rdev))
281+	if (!device_is_tty(st.st_rdev)) {
282 		die("not a valid VT");
283+	}
284 #else
285-	if (!device_is_tty(st.st_rdev) || vt == 0)
286+	if (!device_is_tty(st.st_rdev) || vt == 0) {
287 		die("not a valid VT");
288+	}
289 #endif
290 
291 #ifdef __OpenBSD__
292 	/* OpenBSD wscons has no VT_GETSTATE */
293 #else
294-	if (ioctl(fd, VT_GETSTATE, &state) == -1)
295+	if (ioctl(fd, VT_GETSTATE, &state) == -1) {
296 		die("failed to get the current VT state:");
297+	}
298 #endif
299 
300 #ifndef __OpenBSD__
301@@ -376,18 +405,21 @@ setup_tty(int fd)
302 #endif
303 
304 #ifdef KDGETMODE
305-	if (ioctl(fd, KDGKBMODE, &original_vt_state.kb_mode))
306+	if (ioctl(fd, KDGKBMODE, &original_vt_state.kb_mode)) {
307 		die("failed to get keyboard mode:");
308-	if (ioctl(fd, KDGETMODE, &original_vt_state.console_mode))
309+	}
310+	if (ioctl(fd, KDGETMODE, &original_vt_state.console_mode)) {
311 		die("failed to get console mode:");
312+	}
313 #else
314 	original_vt_state.kb_mode = K_XLATE;
315 	original_vt_state.console_mode = KD_TEXT;
316 #endif
317 
318 #ifdef K_OFF
319-	if (ioctl(fd, KDSKBMODE, K_OFF) == -1)
320+	if (ioctl(fd, KDSKBMODE, K_OFF) == -1) {
321 		die("failed to set keyboard mode to K_OFF:");
322+	}
323 #endif
324 	if (ioctl(fd, KDSETMODE, KD_GRAPHICS) == -1) {
325 		perror("KDSETMODE KD_GRAPHICS");
326@@ -445,7 +477,7 @@ setup_tty(int fd)
327 
328 #ifndef __OpenBSD__
329 error2:
330-	mode = (struct vt_mode){.mode = VT_AUTO };
331+	mode = (struct vt_mode){.mode = VT_AUTO};
332 	ioctl(fd, VT_SETMODE, &mode);
333 error1:
334 	ioctl(fd, KDSKBMODE, original_vt_state.kb_mode);
335@@ -456,25 +488,29 @@ error0:
336 }
337 
338 static void
339-run(int fd) {
340+run(int fd)
341+{
342 	struct pollfd fds[] = {
343-		{.fd = fd, .events = POLLIN},
344-		{.fd = sigfd[0], .events = POLLIN},
345+	    {.fd = fd, .events = POLLIN},
346+	    {.fd = sigfd[0], .events = POLLIN},
347 	};
348 	int status;
349 	char sig;
350 
351 	for (;;) {
352 		if (poll(fds, ARRAY_LENGTH(fds), -1) < 0) {
353-			if (errno == EINTR)
354+			if (errno == EINTR) {
355 				continue;
356+			}
357 			die("poll:");
358 		}
359-		if (fds[0].revents)
360+		if (fds[0].revents) {
361 			handle_socket_data(fd);
362+		}
363 		if (fds[1].revents) {
364-			if (read(sigfd[0], &sig, 1) <= 0)
365+			if (read(sigfd[0], &sig, 1) <= 0) {
366 				continue;
367+			}
368 			switch (sig) {
369 			case SIGCHLD:
370 				wait(&status);
371@@ -500,8 +536,8 @@ main(int argc, char *argv[])
372 	int option;
373 	char *vt = NULL, buf[64];
374 	struct sigaction action = {
375-		.sa_handler = handle_signal,
376-		.sa_flags = SA_RESTART,
377+	    .sa_handler = handle_signal,
378+	    .sa_flags = SA_RESTART,
379 	};
380 	sigset_t set;
381 	pid_t pid;
382@@ -520,22 +556,29 @@ main(int argc, char *argv[])
383 		}
384 	}
385 
386-	if (argc - optind < 1)
387+	if (argc - optind < 1) {
388 		usage(argv[0]);
389+	}
390 
391-	if (socketpair(AF_LOCAL, SOCK_SEQPACKET, 0, sock) == -1)
392+	if (socketpair(AF_LOCAL, SOCK_SEQPACKET, 0, sock) == -1) {
393 		die("socketpair:");
394-	if (fcntl(sock[0], F_SETFD, FD_CLOEXEC) == -1)
395+	}
396+	if (fcntl(sock[0], F_SETFD, FD_CLOEXEC) == -1) {
397 		die("failed set CLOEXEC on socket:");
398+	}
399 
400-	if (pipe2(sigfd, O_CLOEXEC) == -1)
401+	if (pipe2(sigfd, O_CLOEXEC) == -1) {
402 		die("pipe:");
403-	if (sigaction(SIGCHLD, &action, NULL) == -1)
404+	}
405+	if (sigaction(SIGCHLD, &action, NULL) == -1) {
406 		die("sigaction SIGCHLD:");
407-	if (sigaction(SIGUSR1, &action, NULL) == -1)
408+	}
409+	if (sigaction(SIGUSR1, &action, NULL) == -1) {
410 		die("sigaction SIGUSR1:");
411-	if (sigaction(SIGUSR2, &action, NULL) == -1)
412+	}
413+	if (sigaction(SIGUSR2, &action, NULL) == -1) {
414 		die("sigaction SIGUSR2:");
415+	}
416 
417 	sigfillset(&set);
418 	sigdelset(&set, SIGCHLD);
419@@ -559,21 +602,28 @@ main(int argc, char *argv[])
420 	if (!getenv("XDG_RUNTIME_DIR")) {
421 		uid_t uid = getuid();
422 		snprintf(buf, sizeof(buf), "/tmp/XDG_RUNTIME_DIR_%d", uid);
423-		if (mkdir(buf, 0700) == -1 && errno != EEXIST)
424+		if (mkdir(buf, 0700) == -1 && errno != EEXIST) {
425 			die("mkdir %s:", buf);
426+		}
427 		setenv("XDG_RUNTIME_DIR", buf, 1);
428 		fprintf(stderr, "set XDG_RUNTIME_DIR=%s\n", buf);
429 	}
430 
431-	if ((errno = posix_spawnattr_init(&attr)))
432+	if ((errno = posix_spawnattr_init(&attr))) {
433 		die("posix_spawnattr_init:");
434-	if ((errno = posix_spawnattr_setflags(&attr, POSIX_SPAWN_RESETIDS|POSIX_SPAWN_SETSIGMASK)))
435+	}
436+	if ((errno = posix_spawnattr_setflags(&attr, POSIX_SPAWN_RESETIDS |
437+	                                                 POSIX_SPAWN_SETSIGMASK))) {
438 		die("posix_spawnattr_setflags:");
439+	}
440 	sigemptyset(&set);
441-	if ((errno = posix_spawnattr_setsigmask(&attr, &set)))
442+	if ((errno = posix_spawnattr_setsigmask(&attr, &set))) {
443 		die("posix_spawnattr_setsigmask:");
444-	if ((errno = posix_spawnp(&pid, argv[optind], NULL, &attr, argv + optind, environ)))
445+	}
446+	if ((errno = posix_spawnp(&pid, argv[optind], NULL, &attr, argv + optind,
447+	                          environ))) {
448 		die("posix_spawnp %s:", argv[optind]);
449+	}
450 	posix_spawnattr_destroy(&attr);
451 
452 	close(sock[1]);
+15, -13
 1@@ -1,24 +1,23 @@
 2 #include "protocol.h"
 3 
 4-#include <sys/socket.h>
 5 #include <stdio.h>
 6 #include <string.h>
 7+#include <sys/socket.h>
 8 
 9 ssize_t
10 send_fd(int socket, int fd, struct iovec *iov, int iovlen)
11 {
12 	char control[CMSG_SPACE(sizeof(fd))];
13 	struct msghdr message = {
14-		.msg_name = NULL,
15-		.msg_namelen = 0,
16-		.msg_iov = iov,
17-		.msg_iovlen = iovlen,
18+	    .msg_name = NULL,
19+	    .msg_namelen = 0,
20+	    .msg_iov = iov,
21+	    .msg_iovlen = iovlen,
22 	};
23 	struct cmsghdr *cmsg;
24 
25 	if (fd != -1) {
26-		message.msg_control = control,
27-		message.msg_controllen = sizeof(control);
28+		message.msg_control = control, message.msg_controllen = sizeof(control);
29 
30 		cmsg = CMSG_FIRSTHDR(&message);
31 		cmsg->cmsg_len = CMSG_LEN(sizeof(fd));
32@@ -40,10 +39,10 @@ receive_fd(int socket, int *fd, struct iovec *iov, int iovlen)
33 	ssize_t size;
34 	char control[CMSG_SPACE(sizeof(*fd))];
35 	struct msghdr message = {
36-		.msg_name = NULL,
37-		.msg_namelen = 0,
38-		.msg_iov = iov,
39-		.msg_iovlen = iovlen,
40+	    .msg_name = NULL,
41+	    .msg_namelen = 0,
42+	    .msg_iov = iov,
43+	    .msg_iovlen = iovlen,
44 	};
45 	struct cmsghdr *cmsg;
46 
47@@ -54,12 +53,15 @@ receive_fd(int socket, int *fd, struct iovec *iov, int iovlen)
48 	}
49 
50 	size = recvmsg(socket, &message, MSG_CMSG_CLOEXEC);
51-	if (size < 0)
52+	if (size < 0) {
53 		return -1;
54+	}
55 
56 	cmsg = CMSG_FIRSTHDR(&message);
57-	if (fd && cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(*fd)) && cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS)
58+	if (fd && cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(*fd)) &&
59+	    cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
60 		memcpy(fd, CMSG_DATA(cmsg), sizeof(*fd));
61+	}
62 
63 	return size;
64 }
+4, -2
 1@@ -65,7 +65,9 @@ struct swc_launch_event {
 2 	};
 3 };
 4 
 5-ssize_t send_fd(int socket, int fd, struct iovec *iov, int iovlen);
 6-ssize_t receive_fd(int socket, int *fd, struct iovec *iov, int iovlen);
 7+ssize_t
 8+send_fd(int socket, int fd, struct iovec *iov, int iovlen);
 9+ssize_t
10+receive_fd(int socket, int *fd, struct iovec *iov, int iovlen);
11 
12 #endif
+59, -31
  1@@ -21,12 +21,12 @@
  2  * SOFTWARE.
  3  */
  4 
  5-#include "swc.h"
  6 #include "bindings.h"
  7 #include "internal.h"
  8 #include "keyboard.h"
  9 #include "pointer.h"
 10 #include "seat.h"
 11+#include "swc.h"
 12 #include "util.h"
 13 
 14 #include <errno.h>
 15@@ -47,26 +47,32 @@ struct axis_binding {
 16 	void *data;
 17 };
 18 
 19-static bool handle_key(struct keyboard *keyboard, uint32_t time, struct key *key, uint32_t state);
 20+static bool
 21+handle_key(struct keyboard *keyboard, uint32_t time, struct key *key,
 22+           uint32_t state);
 23 
 24 static struct keyboard_handler key_binding_handler = {
 25-	.key = handle_key,
 26+    .key = handle_key,
 27 };
 28 
 29-static bool handle_button(struct pointer_handler *handler, uint32_t time, struct button *button, uint32_t state);
 30-static bool handle_axis(struct pointer_handler *handler, uint32_t time, enum wl_pointer_axis axis,
 31-                        enum wl_pointer_axis_source source, wl_fixed_t value, int value120);
 32+static bool
 33+handle_button(struct pointer_handler *handler, uint32_t time,
 34+              struct button *button, uint32_t state);
 35+static bool
 36+handle_axis(struct pointer_handler *handler, uint32_t time,
 37+            enum wl_pointer_axis axis, enum wl_pointer_axis_source source,
 38+            wl_fixed_t value, int value120);
 39 
 40 static struct pointer_handler button_binding_handler = {
 41-	.button = handle_button,
 42-	.axis = handle_axis,
 43+    .button = handle_button,
 44+    .axis = handle_axis,
 45 };
 46 
 47 static struct wl_array key_bindings, button_bindings, axis_bindings;
 48 
 49 const struct swc_bindings swc_bindings = {
 50-	.keyboard_handler = &key_binding_handler,
 51-	.pointer_handler = &button_binding_handler,
 52+    .keyboard_handler = &key_binding_handler,
 53+    .pointer_handler = &button_binding_handler,
 54 };
 55 
 56 static struct binding *
 57@@ -74,9 +80,12 @@ find_binding(struct wl_array *bindings, uint32_t modifiers, uint32_t value)
 58 {
 59 	struct binding *binding;
 60 
 61-	wl_array_for_each (binding, bindings) {
 62-		if (binding->value == value && (binding->modifiers == modifiers || binding->modifiers == SWC_MOD_ANY))
 63+	wl_array_for_each(binding, bindings)
 64+	{
 65+		if (binding->value == value && (binding->modifiers == modifiers ||
 66+		                                binding->modifiers == SWC_MOD_ANY)) {
 67 			return binding;
 68+		}
 69 	}
 70 
 71 	return NULL;
 72@@ -93,18 +102,21 @@ find_key_binding(uint32_t modifiers, uint32_t key)
 73 	keysym = xkb_state_key_get_one_sym(xkb->state, XKB_KEY(key));
 74 	binding = find_binding(&key_bindings, modifiers, keysym);
 75 
 76-	if (binding)
 77+	if (binding) {
 78 		return binding;
 79+	}
 80 
 81 	xkb_layout_index_t layout;
 82 	const xkb_keysym_t *keysyms;
 83 
 84 	/* Then try the keysym associated with shift-level 0 for the key. */
 85 	layout = xkb_state_key_get_layout(xkb->state, XKB_KEY(key));
 86-	xkb_keymap_key_get_syms_by_level(xkb->keymap.map, XKB_KEY(key), layout, 0, &keysyms);
 87+	xkb_keymap_key_get_syms_by_level(xkb->keymap.map, XKB_KEY(key), layout, 0,
 88+	                                 &keysyms);
 89 
 90-	if (!keysyms)
 91+	if (!keysyms) {
 92 		return NULL;
 93+	}
 94 
 95 	binding = find_binding(&key_bindings, modifiers, keysyms[0]);
 96 
 97@@ -122,25 +134,31 @@ find_axis_binding(uint32_t modifiers, uint32_t axis)
 98 {
 99 	struct axis_binding *binding;
100 
101-	wl_array_for_each (binding, &axis_bindings) {
102-		if (binding->axis == axis && (binding->modifiers == modifiers || binding->modifiers == SWC_MOD_ANY))
103+	wl_array_for_each(binding, &axis_bindings)
104+	{
105+		if (binding->axis == axis && (binding->modifiers == modifiers ||
106+		                              binding->modifiers == SWC_MOD_ANY)) {
107 			return binding;
108+		}
109 	}
110 
111 	return NULL;
112 }
113 
114 static bool
115-handle_binding(uint32_t time, struct press *press, uint32_t state, struct binding *(*find_binding)(uint32_t, uint32_t))
116+handle_binding(uint32_t time, struct press *press, uint32_t state,
117+               struct binding *(*find_binding)(uint32_t, uint32_t))
118 {
119 	struct binding *binding;
120-	uint32_t modifiers = swc.seat && swc.seat->keyboard ? swc.seat->keyboard->modifiers : 0;
121+	uint32_t modifiers =
122+	    swc.seat && swc.seat->keyboard ? swc.seat->keyboard->modifiers : 0;
123 
124 	if (state) {
125 		binding = find_binding(modifiers, press->value);
126 
127-		if (!binding)
128+		if (!binding) {
129 			return false;
130+		}
131 
132 		press->data = binding;
133 	} else {
134@@ -153,35 +171,41 @@ handle_binding(uint32_t time, struct press *press, uint32_t state, struct bindin
135 }
136 
137 bool
138-handle_key(struct keyboard *keyboard, uint32_t time, struct key *key, uint32_t state)
139+handle_key(struct keyboard *keyboard, uint32_t time, struct key *key,
140+           uint32_t state)
141 {
142 	return handle_binding(time, &key->press, state, &find_key_binding);
143 }
144 
145 bool
146-handle_button(struct pointer_handler *handler, uint32_t time, struct button *button, uint32_t state)
147+handle_button(struct pointer_handler *handler, uint32_t time,
148+              struct button *button, uint32_t state)
149 {
150 	return handle_binding(time, &button->press, state, &find_button_binding);
151 }
152 
153 bool
154-handle_axis(struct pointer_handler *handler, uint32_t time, enum wl_pointer_axis axis,
155-            enum wl_pointer_axis_source source, wl_fixed_t value, int value120)
156+handle_axis(struct pointer_handler *handler, uint32_t time,
157+            enum wl_pointer_axis axis, enum wl_pointer_axis_source source,
158+            wl_fixed_t value, int value120)
159 {
160 	(void)handler;
161 	(void)source;
162 
163-	uint32_t modifiers = swc.seat && swc.seat->keyboard ? swc.seat->keyboard->modifiers : 0;
164+	uint32_t modifiers =
165+	    swc.seat && swc.seat->keyboard ? swc.seat->keyboard->modifiers : 0;
166 	struct axis_binding *binding = find_axis_binding(modifiers, axis);
167 	int32_t delta120 = value120;
168 
169-	if (!binding || !binding->handler)
170+	if (!binding || !binding->handler) {
171 		return false;
172+	}
173 
174 	if (!delta120 && value) {
175 		delta120 = (int32_t)(wl_fixed_to_double(value) * 120.0);
176-		if (!delta120)
177+		if (!delta120) {
178 			delta120 = value > 0 ? 1 : -1;
179+		}
180 	}
181 
182 	binding->handler(binding->data, time, axis, delta120);
183@@ -207,7 +231,8 @@ bindings_finalize(void)
184 }
185 
186 EXPORT int
187-swc_add_binding(enum swc_binding_type type, uint32_t modifiers, uint32_t value, swc_binding_handler handler, void *data)
188+swc_add_binding(enum swc_binding_type type, uint32_t modifiers, uint32_t value,
189+                swc_binding_handler handler, void *data)
190 {
191 	struct binding *binding;
192 	struct wl_array *bindings;
193@@ -223,8 +248,9 @@ swc_add_binding(enum swc_binding_type type, uint32_t modifiers, uint32_t value,
194 		return -EINVAL;
195 	}
196 
197-	if (!(binding = wl_array_add(bindings, sizeof(*binding))))
198+	if (!(binding = wl_array_add(bindings, sizeof(*binding)))) {
199 		return -ENOMEM;
200+	}
201 
202 	binding->value = value;
203 	binding->modifiers = modifiers;
204@@ -235,12 +261,14 @@ swc_add_binding(enum swc_binding_type type, uint32_t modifiers, uint32_t value,
205 }
206 
207 EXPORT int
208-swc_add_axis_binding(uint32_t modifiers, uint32_t axis, swc_axis_binding_handler handler, void *data)
209+swc_add_axis_binding(uint32_t modifiers, uint32_t axis,
210+                     swc_axis_binding_handler handler, void *data)
211 {
212 	struct axis_binding *binding;
213 
214-	if (!(binding = wl_array_add(&axis_bindings, sizeof(*binding))))
215+	if (!(binding = wl_array_add(&axis_bindings, sizeof(*binding)))) {
216 		return -ENOMEM;
217+	}
218 
219 	binding->axis = axis;
220 	binding->modifiers = modifiers;
+4, -2
 1@@ -31,7 +31,9 @@ struct swc_bindings {
 2 	struct pointer_handler *pointer_handler;
 3 };
 4 
 5-bool bindings_initialize(void);
 6-void bindings_finalize(void);
 7+bool
 8+bindings_initialize(void);
 9+void
10+bindings_finalize(void);
11 
12 #endif
+459, -287
   1@@ -27,7 +27,6 @@
   2  * SOFTWARE.
   3  */
   4 
   5-#include "swc.h"
   6 #include "compositor.h"
   7 #include "data_device_manager.h"
   8 #include "drm.h"
   9@@ -40,29 +39,32 @@
  10 #include "screen.h"
  11 #include "seat.h"
  12 #include "shm.h"
  13-#include "surface.h"
  14 #include "subsurface.h"
  15+#include "surface.h"
  16+#include "swc.h"
  17 #include "util.h"
  18 #include "view.h"
  19 #include "wallpaper.h"
  20 #include "window.h"
  21 
  22-#include <errno.h>
  23-#include <stdlib.h>
  24-#include <stdio.h>
  25 #include <assert.h>
  26+#include <errno.h>
  27 #include <limits.h>
  28-#include <wld/wld.h>
  29+#include <stdio.h>
  30+#include <stdlib.h>
  31 #include <wld/drm.h>
  32+#include <wld/wld.h>
  33 #include <xkbcommon/xkbcommon-keysyms.h>
  34 
  35 static inline int32_t
  36 clamp_i32(int64_t v)
  37 {
  38-	if (v > INT32_MAX)
  39+	if (v > INT32_MAX) {
  40 		return INT32_MAX;
  41-	if (v < INT32_MIN)
  42+	}
  43+	if (v < INT32_MIN) {
  44 		return INT32_MIN;
  45+	}
  46 	return (int32_t)v;
  47 }
  48 
  49@@ -71,10 +73,12 @@ span_u32(int32_t a, int32_t b)
  50 {
  51 	int64_t d = (int64_t)b - (int64_t)a;
  52 
  53-	if (d <= 0)
  54+	if (d <= 0) {
  55 		return 0;
  56-	if (d > UINT32_MAX)
  57+	}
  58+	if (d > UINT32_MAX) {
  59 		return UINT32_MAX;
  60+	}
  61 	return (uint32_t)d;
  62 }
  63 
  64@@ -88,13 +92,18 @@ struct target {
  65 	struct wl_listener screen_destroy_listener;
  66 };
  67 
  68-static bool handle_motion(struct pointer_handler *handler, uint32_t time, wl_fixed_t x, wl_fixed_t y);
  69-static bool handle_button(struct pointer_handler *handler, uint32_t time, struct button *button, uint32_t state);
  70-static void perform_update(void *data);
  71+static bool
  72+handle_motion(struct pointer_handler *handler, uint32_t time, wl_fixed_t x,
  73+              wl_fixed_t y);
  74+static bool
  75+handle_button(struct pointer_handler *handler, uint32_t time,
  76+              struct button *button, uint32_t state);
  77+static void
  78+perform_update(void *data);
  79 
  80 static struct pointer_handler pointer_handler = {
  81-	.motion = handle_motion,
  82-	.button = handle_button,
  83+    .motion = handle_motion,
  84+    .button = handle_button,
  85 };
  86 
  87 static struct {
  88@@ -102,7 +111,8 @@ static struct {
  89 	pixman_region32_t damage, opaque;
  90 	struct wl_listener swc_listener;
  91 
  92-	/* A mask of screens that have been repainted but are waiting on a page flip. */
  93+	/* A mask of screens that have been repainted but are waiting on a page
  94+	 * flip. */
  95 	uint32_t pending_flips;
  96 
  97 	/* A mask of screens that are scheduled to be repainted on the next idle. */
  98@@ -126,13 +136,14 @@ static struct {
  99 } overlay;
 100 
 101 struct swc_compositor swc_compositor = {
 102-	.pointer_handler = &pointer_handler,
 103+    .pointer_handler = &pointer_handler,
 104 };
 105 
 106 static void
 107 handle_screen_destroy(struct wl_listener *listener, void *data)
 108 {
 109-	struct target *target = wl_container_of(listener, target, screen_destroy_listener);
 110+	struct target *target =
 111+	    wl_container_of(listener, target, screen_destroy_listener);
 112 
 113 	wld_destroy_surface(target->surface);
 114 	free(target);
 115@@ -141,10 +152,12 @@ handle_screen_destroy(struct wl_listener *listener, void *data)
 116 static struct target *
 117 target_get(struct screen *screen)
 118 {
 119-	struct wl_listener *listener = wl_signal_get(&screen->destroy_signal, &handle_screen_destroy);
 120+	struct wl_listener *listener =
 121+	    wl_signal_get(&screen->destroy_signal, &handle_screen_destroy);
 122 	struct target *target;
 123 
 124-	return listener ? wl_container_of(listener, target, screen_destroy_listener) : NULL;
 125+	return listener ? wl_container_of(listener, target, screen_destroy_listener)
 126+	                : NULL;
 127 }
 128 
 129 static void
 130@@ -155,26 +168,30 @@ handle_screen_frame(struct view_handler *handler, uint32_t time)
 131 
 132 	compositor.pending_flips &= ~target->mask;
 133 
 134-	wl_list_for_each (view, &compositor.views, link) {
 135-		if (view->visible && view->base.screens & target->mask)
 136+	wl_list_for_each(view, &compositor.views, link)
 137+	{
 138+		if (view->visible && view->base.screens & target->mask) {
 139 			view_frame(&view->base, time);
 140+		}
 141 	}
 142 
 143-	if (target->current_buffer)
 144+	if (target->current_buffer) {
 145 		wld_surface_release(target->surface, target->current_buffer);
 146+	}
 147 
 148 	target->current_buffer = target->next_buffer;
 149 
 150-	/* If we had scheduled updates that couldn't run because we were waiting on a
 151-	 * page flip, run them now. If the compositor is currently updating, then the
 152-	 * frame finished immediately, and we can be sure that there are no pending
 153-	 * updates. */
 154-	if (compositor.scheduled_updates && !compositor.updating)
 155+	/* If we had scheduled updates that couldn't run because we were waiting on
 156+	 * a page flip, run them now. If the compositor is currently updating, then
 157+	 * the frame finished immediately, and we can be sure that there are no
 158+	 * pending updates. */
 159+	if (compositor.scheduled_updates && !compositor.updating) {
 160 		perform_update(NULL);
 161+	}
 162 }
 163 
 164 static const struct view_handler_impl screen_view_handler = {
 165-	.frame = handle_screen_frame,
 166+    .frame = handle_screen_frame,
 167 };
 168 
 169 static int
 170@@ -190,13 +207,17 @@ target_new(struct screen *screen)
 171 	struct target *target;
 172 	struct swc_rectangle *geom = &screen->base.geometry;
 173 
 174-	if (!(target = malloc(sizeof(*target))))
 175+	if (!(target = malloc(sizeof(*target)))) {
 176 		goto error0;
 177+	}
 178 
 179-	target->surface = wld_create_surface(swc.drm->context, geom->width, geom->height, WLD_FORMAT_XRGB8888, WLD_DRM_FLAG_SCANOUT);
 180+	target->surface =
 181+	    wld_create_surface(swc.drm->context, geom->width, geom->height,
 182+	                       WLD_FORMAT_XRGB8888, WLD_DRM_FLAG_SCANOUT);
 183 
 184-	if (!target->surface)
 185+	if (!target->surface) {
 186 		goto error1;
 187+	}
 188 
 189 	target->view = &screen->planes.primary.view;
 190 	target->view_handler.impl = &screen_view_handler;
 191@@ -218,34 +239,41 @@ error0:
 192 /* Rendering {{{ */
 193 
 194 static void
 195-repaint_view(struct target *target, struct compositor_view *view, pixman_region32_t *damage)
 196+repaint_view(struct target *target, struct compositor_view *view,
 197+             pixman_region32_t *damage)
 198 {
 199-	pixman_region32_t geom_region, buffer_region, border_region, view_damage, buffer_damage, border_damage;
 200-	const struct swc_rectangle *geom = &view->base.geometry, *target_geom = &target->view->geometry;
 201+	pixman_region32_t geom_region, buffer_region, border_region, view_damage,
 202+	    buffer_damage, border_damage;
 203+	const struct swc_rectangle *geom = &view->base.geometry,
 204+	                           *target_geom = &target->view->geometry;
 205 	int32_t buf_x, buf_y;
 206 	uint32_t buf_w, buf_h;
 207 	int64_t total_border;
 208 
 209-	if (!view->base.buffer)
 210+	if (!view->base.buffer) {
 211 		return;
 212+	}
 213 
 214 	buf_w = view->base.buffer->width;
 215 	buf_h = view->base.buffer->height;
 216 	buf_x = geom->x - view->buffer_offset_x;
 217 	buf_y = geom->y - view->buffer_offset_y;
 218 
 219-	total_border = (int64_t)view->border.outwidth + (int64_t)view->border.inwidth;
 220-	pixman_region32_init_rect(&geom_region, geom->x, geom->y, geom->width, geom->height);
 221+	total_border =
 222+	    (int64_t)view->border.outwidth + (int64_t)view->border.inwidth;
 223+	pixman_region32_init_rect(&geom_region, geom->x, geom->y, geom->width,
 224+	                          geom->height);
 225 	if (view->window) {
 226-		pixman_region32_init_rect(&buffer_region, geom->x, geom->y, geom->width, geom->height);
 227+		pixman_region32_init_rect(&buffer_region, geom->x, geom->y, geom->width,
 228+		                          geom->height);
 229 	} else {
 230 		pixman_region32_init_rect(&buffer_region, buf_x, buf_y, buf_w, buf_h);
 231 	}
 232 	pixman_region32_init_rect(&border_region,
 233-		geom->x - (int32_t)total_border,
 234-		geom->y - (int32_t)total_border,
 235-		geom->width + (uint32_t)(2 * total_border),
 236-		geom->height + (uint32_t)(2 * total_border));
 237+	                          geom->x - (int32_t)total_border,
 238+	                          geom->y - (int32_t)total_border,
 239+	                          geom->width + (uint32_t)(2 * total_border),
 240+	                          geom->height + (uint32_t)(2 * total_border));
 241 	pixman_region32_subtract(&border_region, &border_region, &geom_region);
 242 	pixman_region32_init_with_extents(&view_damage, &view->extents);
 243 	pixman_region32_init(&buffer_damage);
 244@@ -257,20 +285,22 @@ repaint_view(struct target *target, struct compositor_view *view, pixman_region3
 245 	pixman_region32_intersect(&buffer_damage, &view_damage, &buffer_region);
 246 
 247 	if (pixman_region32_not_empty(&buffer_damage)) {
 248-		pixman_region32_translate(&buffer_damage, -geom->x + view->buffer_offset_x, -geom->y + view->buffer_offset_y);
 249-		wld_copy_region(swc.drm->renderer, view->buffer,
 250-			buf_x - target_geom->x, buf_y - target_geom->y, &buffer_damage);
 251+		pixman_region32_translate(&buffer_damage,
 252+		                          -geom->x + view->buffer_offset_x,
 253+		                          -geom->y + view->buffer_offset_y);
 254+		wld_copy_region(swc.drm->renderer, view->buffer, buf_x - target_geom->x,
 255+		                buf_y - target_geom->y, &buffer_damage);
 256 	}
 257 
 258 	pixman_region32_fini(&view_damage);
 259 	pixman_region32_fini(&buffer_damage);
 260 
 261 	pixman_region32_t in_rect;
 262-	pixman_region32_init_rect(&in_rect, 
 263-			geom->x - view->border.inwidth, 
 264-			geom->y - view->border.inwidth, 
 265-			geom->width + (2 * view->border.inwidth), 
 266-			geom->height + (2 * view->border.inwidth));
 267+	pixman_region32_init_rect(&in_rect,
 268+	                          geom->x - view->border.inwidth,
 269+	                          geom->y - view->border.inwidth,
 270+	                          geom->width + (2 * view->border.inwidth),
 271+	                          geom->height + (2 * view->border.inwidth));
 272 
 273 	pixman_region32_t out_border;
 274 	pixman_region32_init(&out_border);
 275@@ -280,14 +310,15 @@ repaint_view(struct target *target, struct compositor_view *view, pixman_region3
 276 	pixman_region32_init(&in_border);
 277 	pixman_region32_subtract(&in_border, &in_rect, &geom_region);
 278 	pixman_region32_intersect(&in_border, &in_border, &border_damage);
 279-		
 280+
 281 	pixman_region32_fini(&geom_region);
 282 	pixman_region32_fini(&buffer_region);
 283 	pixman_region32_fini(&border_region);
 284 
 285 	/* Draw border */
 286 	if (view->border.outwidth > 0 && pixman_region32_not_empty(&out_border)) {
 287-		pixman_region32_translate(&out_border, -target_geom->x, -target_geom->y);
 288+		pixman_region32_translate(&out_border, -target_geom->x,
 289+		                          -target_geom->y);
 290 		wld_fill_region(swc.drm->renderer, view->border.outcolor, &out_border);
 291 	}
 292 
 293@@ -300,11 +331,12 @@ repaint_view(struct target *target, struct compositor_view *view, pixman_region3
 294 	pixman_region32_fini(&in_rect);
 295 	pixman_region32_fini(&out_border);
 296 	pixman_region32_fini(&in_border);
 297-
 298 }
 299 
 300 static void
 301-renderer_repaint(struct target *target, pixman_region32_t *damage, pixman_region32_t *base_damage, struct wl_list *views, struct screen *screen)
 302+renderer_repaint(struct target *target, pixman_region32_t *damage,
 303+                 pixman_region32_t *base_damage, struct wl_list *views,
 304+                 struct screen *screen)
 305 {
 306 	struct compositor_view *view;
 307 	const struct swc_rectangle *target_geom = &target->view->geometry;
 308@@ -318,54 +350,66 @@ renderer_repaint(struct target *target, pixman_region32_t *damage, pixman_region
 309 	if (pixman_region32_not_empty(base_damage)) {
 310 		struct wld_buffer *background = swc_wallpaper_buffer_for_screen(screen);
 311 
 312-		pixman_region32_translate(base_damage, -target->view->geometry.x, -target->view->geometry.y);
 313-		
 314-		if (background)
 315+		pixman_region32_translate(base_damage, -target->view->geometry.x,
 316+		                          -target->view->geometry.y);
 317+
 318+		if (background) {
 319 			wld_copy_region(swc.drm->renderer, background, 0, 0, base_damage);
 320+		}
 321 
 322-		else
 323+		else {
 324 			wld_fill_region(swc.drm->renderer, bgcolor, base_damage);
 325+		}
 326 	}
 327 
 328-	wl_list_for_each_reverse (view, views, link) {
 329-		if (view->visible && view->base.screens & target->mask)
 330+	wl_list_for_each_reverse(view, views, link)
 331+	{
 332+		if (view->visible && view->base.screens & target->mask) {
 333 			repaint_view(target, view, damage);
 334+		}
 335 	}
 336 
 337 	if (overlay.active && overlay.border_width > 0) {
 338 		int32_t x = overlay.x - target_geom->x;
 339 		int32_t y = overlay.y - target_geom->y;
 340-		uint32_t w = overlay.width, h = overlay.height, bw = overlay.border_width;
 341+		uint32_t w = overlay.width, h = overlay.height,
 342+		         bw = overlay.border_width;
 343 		int32_t tx = (int32_t)target_geom->width;
 344 		int32_t ty = (int32_t)target_geom->height;
 345 
 346-		/* draw box as 4 rectangles with wld */
 347-		#define CLAMP_LOW(v, lo) ((v) < (lo) ? (lo) : (v))
 348-		#define CLAMP_HIGH(v, hi) ((v) > (hi) ? (hi) : (v))
 349-		#define DRAW_CLIPPED(rx, ry, rw, rh) do { \
 350-			int32_t _x1 = CLAMP_LOW((rx), 0); \
 351-			int32_t _y1 = CLAMP_LOW((ry), 0); \
 352-			int32_t _x2 = CLAMP_HIGH((rx) + (int32_t)(rw), tx); \
 353-			int32_t _y2 = CLAMP_HIGH((ry) + (int32_t)(rh), ty); \
 354-			if (_x2 > _x1 && _y2 > _y1) \
 355-				wld_fill_rectangle(swc.drm->renderer, overlay.color, _x1, _y1, (uint32_t)(_x2 - _x1), (uint32_t)(_y2 - _y1)); \
 356-		} while (0)
 357+/* draw box as 4 rectangles with wld */
 358+#define CLAMP_LOW(v, lo) ((v) < (lo) ? (lo) : (v))
 359+#define CLAMP_HIGH(v, hi) ((v) > (hi) ? (hi) : (v))
 360+#define DRAW_CLIPPED(rx, ry, rw, rh)                                           \
 361+	do {                                                                       \
 362+		int32_t _x1 = CLAMP_LOW((rx), 0);                                      \
 363+		int32_t _y1 = CLAMP_LOW((ry), 0);                                      \
 364+		int32_t _x2 = CLAMP_HIGH((rx) + (int32_t)(rw), tx);                    \
 365+		int32_t _y2 = CLAMP_HIGH((ry) + (int32_t)(rh), ty);                    \
 366+		if (_x2 > _x1 && _y2 > _y1)                                            \
 367+			wld_fill_rectangle(swc.drm->renderer, overlay.color, _x1, _y1,     \
 368+			                   (uint32_t)(_x2 - _x1), (uint32_t)(_y2 - _y1));  \
 369+	} while (0)
 370 
 371 		if (w > 0 && h > 0) {
 372-			if (bw > w)
 373+			if (bw > w) {
 374 				bw = w;
 375-			if (bw > h)
 376+			}
 377+			if (bw > h) {
 378 				bw = h;
 379+			}
 380 
 381-			DRAW_CLIPPED(x, y, (int32_t)w, (int32_t)bw);                              /* top */
 382-			DRAW_CLIPPED(x, y + (int32_t)h - (int32_t)bw, (int32_t)w, (int32_t)bw);   /* bottom */
 383-			DRAW_CLIPPED(x, y, (int32_t)bw, (int32_t)h);                              /* left */
 384-			DRAW_CLIPPED(x + (int32_t)w - (int32_t)bw, y, (int32_t)bw, (int32_t)h);   /* right */
 385+			DRAW_CLIPPED(x, y, (int32_t)w, (int32_t)bw); /* top */
 386+			DRAW_CLIPPED(x, y + (int32_t)h - (int32_t)bw, (int32_t)w,
 387+			             (int32_t)bw);                   /* bottom */
 388+			DRAW_CLIPPED(x, y, (int32_t)bw, (int32_t)h); /* left */
 389+			DRAW_CLIPPED(x + (int32_t)w - (int32_t)bw, y, (int32_t)bw,
 390+			             (int32_t)h); /* right */
 391 		}
 392 
 393-		#undef DRAW_CLIPPED
 394-		#undef CLAMP_HIGH
 395-		#undef CLAMP_LOW
 396+#undef DRAW_CLIPPED
 397+#undef CLAMP_HIGH
 398+#undef CLAMP_LOW
 399 	}
 400 
 401 	wld_flush(swc.drm->renderer);
 402@@ -376,19 +420,26 @@ renderer_attach(struct compositor_view *view, struct wld_buffer *client_buffer)
 403 {
 404 	struct wld_buffer *buffer;
 405 	bool was_proxy = view->buffer != view->base.buffer;
 406-	bool needs_proxy = client_buffer && !(wld_capabilities(swc.drm->renderer, client_buffer) & WLD_CAPABILITY_READ);
 407-	bool resized = view->buffer && client_buffer && (view->buffer->width != client_buffer->width || view->buffer->height != client_buffer->height);
 408+	bool needs_proxy =
 409+	    client_buffer && !(wld_capabilities(swc.drm->renderer, client_buffer) &
 410+	                       WLD_CAPABILITY_READ);
 411+	bool resized = view->buffer && client_buffer &&
 412+	               (view->buffer->width != client_buffer->width ||
 413+	                view->buffer->height != client_buffer->height);
 414 
 415 	if (client_buffer) {
 416-		/* Create a proxy buffer if necessary (for example a hardware buffer backing
 417-		 * a SHM buffer). */
 418+		/* Create a proxy buffer if necessary (for example a hardware buffer
 419+		 * backing a SHM buffer). */
 420 		if (needs_proxy) {
 421 			if (!was_proxy || resized) {
 422 				DEBUG("Creating a proxy buffer\n");
 423-				buffer = wld_create_buffer(swc.drm->context, client_buffer->width, client_buffer->height, client_buffer->format, WLD_FLAG_MAP);
 424+				buffer = wld_create_buffer(
 425+				    swc.drm->context, client_buffer->width,
 426+				    client_buffer->height, client_buffer->format, WLD_FLAG_MAP);
 427 
 428-				if (!buffer)
 429+				if (!buffer) {
 430 					return -ENOMEM;
 431+				}
 432 			} else {
 433 				/* Otherwise we can keep the original proxy buffer. */
 434 				buffer = view->buffer;
 435@@ -402,8 +453,10 @@ renderer_attach(struct compositor_view *view, struct wld_buffer *client_buffer)
 436 
 437 	/* If we no longer need a proxy buffer, or the original buffer is of a
 438 	 * different size, destroy the old proxy image. */
 439-	if (view->buffer && ((!needs_proxy && was_proxy) || (needs_proxy && resized)))
 440+	if (view->buffer &&
 441+	    ((!needs_proxy && was_proxy) || (needs_proxy && resized))) {
 442 		wld_buffer_unreference(view->buffer);
 443+	}
 444 
 445 	view->buffer = buffer;
 446 
 447@@ -413,11 +466,13 @@ renderer_attach(struct compositor_view *view, struct wld_buffer *client_buffer)
 448 static void
 449 renderer_flush_view(struct compositor_view *view)
 450 {
 451-	if (view->buffer == view->base.buffer)
 452+	if (view->buffer == view->base.buffer) {
 453 		return;
 454+	}
 455 
 456 	wld_set_target_buffer(swc.shm->renderer, view->buffer);
 457-	wld_copy_region(swc.shm->renderer, view->base.buffer, 0, 0, &view->surface->state.damage);
 458+	wld_copy_region(swc.shm->renderer, view->base.buffer, 0, 0,
 459+	                &view->surface->state.damage);
 460 	wld_flush(swc.shm->renderer);
 461 }
 462 
 463@@ -434,7 +489,8 @@ damage_below_view(struct compositor_view *view)
 464 	pixman_region32_t damage_below;
 465 
 466 	pixman_region32_init_with_extents(&damage_below, &view->extents);
 467-	pixman_region32_union(&compositor.damage, &compositor.damage, &damage_below);
 468+	pixman_region32_union(&compositor.damage, &compositor.damage,
 469+	                      &damage_below);
 470 	pixman_region32_fini(&damage_below);
 471 }
 472 
 473@@ -452,7 +508,8 @@ damage_view(struct compositor_view *view)
 474 static void
 475 update_extents(struct compositor_view *view)
 476 {
 477-	int64_t total_border = (int64_t)view->border.outwidth + (int64_t)view->border.inwidth;
 478+	int64_t total_border =
 479+	    (int64_t)view->border.outwidth + (int64_t)view->border.inwidth;
 480 	int64_t geom_x = view->base.geometry.x;
 481 	int64_t geom_y = view->base.geometry.y;
 482 	int64_t geom_w = view->base.geometry.width;
 483@@ -465,46 +522,55 @@ update_extents(struct compositor_view *view)
 484 
 485 	int64_t buffer_x1 = geom_x - view->buffer_offset_x;
 486 	int64_t buffer_y1 = geom_y - view->buffer_offset_y;
 487-	int64_t buffer_x2 = buffer_x1 + (view->base.buffer ? view->base.buffer->width : (uint32_t)geom_w);
 488-	int64_t buffer_y2 = buffer_y1 + (view->base.buffer ? view->base.buffer->height : (uint32_t)geom_h);
 489+	int64_t buffer_x2 =
 490+	    buffer_x1 +
 491+	    (view->base.buffer ? view->base.buffer->width : (uint32_t)geom_w);
 492+	int64_t buffer_y2 =
 493+	    buffer_y1 +
 494+	    (view->base.buffer ? view->base.buffer->height : (uint32_t)geom_h);
 495 
 496 	view->extents.x1 = clamp_i32(MIN(border_x1, buffer_x1));
 497 	view->extents.y1 = clamp_i32(MIN(border_y1, buffer_y1));
 498 	view->extents.x2 = clamp_i32(MAX(border_x2, buffer_x2));
 499 	view->extents.y2 = clamp_i32(MAX(border_y2, buffer_y2));
 500 
 501-	if (view->extents.x2 < view->extents.x1)
 502+	if (view->extents.x2 < view->extents.x1) {
 503 		view->extents.x2 = view->extents.x1;
 504-	if (view->extents.y2 < view->extents.y1)
 505+	}
 506+	if (view->extents.y2 < view->extents.y1) {
 507 		view->extents.y2 = view->extents.y1;
 508+	}
 509 
 510 	/* Damage border. */
 511 	view->border.damaged_border1 = true;
 512 	view->border.damaged_border2 = true;
 513 }
 514 
 515-
 516 static void
 517 schedule_updates(uint32_t screens)
 518 {
 519-	if (compositor.scheduled_updates == 0)
 520+	if (compositor.scheduled_updates == 0) {
 521 		wl_event_loop_add_idle(swc.event_loop, &perform_update, NULL);
 522+	}
 523 
 524 	if (screens == -1) {
 525 		struct screen *screen;
 526 
 527 		screens = 0;
 528-		wl_list_for_each (screen, &swc.screens, link)
 529-			screens |= screen_mask(screen);
 530+		wl_list_for_each(screen, &swc.screens, link) screens |=
 531+		    screen_mask(screen);
 532 	}
 533 
 534-	/* when zoomed, force full screen damage since actual area differs from world coords */
 535+	/* when zoomed, force full screen damage since actual area differs from
 536+	 * world coords */
 537 	if (compositor.zoom != 1.0f) {
 538 		struct screen *screen;
 539-		wl_list_for_each (screen, &swc.screens, link) {
 540-			pixman_region32_union_rect(&compositor.damage, &compositor.damage,
 541-				screen->base.geometry.x, screen->base.geometry.y,
 542-				screen->base.geometry.width, screen->base.geometry.height);
 543+		wl_list_for_each(screen, &swc.screens, link)
 544+		{
 545+			pixman_region32_union_rect(
 546+			    &compositor.damage, &compositor.damage, screen->base.geometry.x,
 547+			    screen->base.geometry.y, screen->base.geometry.width,
 548+			    screen->base.geometry.height);
 549 			screens |= screen_mask(screen);
 550 		}
 551 	}
 552@@ -517,38 +583,47 @@ compositor_damage_all(void)
 553 {
 554 	struct screen *screen;
 555 
 556-	if (!compositor.initialized)
 557+	if (!compositor.initialized) {
 558 		return;
 559+	}
 560 
 561-	wl_list_for_each (screen, &swc.screens, link) {
 562-		pixman_region32_union_rect(&compositor.damage, &compositor.damage,
 563-			screen->base.geometry.x, screen->base.geometry.y,
 564-			screen->base.geometry.width, screen->base.geometry.height);
 565+	wl_list_for_each(screen, &swc.screens, link)
 566+	{
 567+		pixman_region32_union_rect(
 568+		    &compositor.damage, &compositor.damage, screen->base.geometry.x,
 569+		    screen->base.geometry.y, screen->base.geometry.width,
 570+		    screen->base.geometry.height);
 571 	}
 572 
 573 	schedule_updates(-1);
 574 }
 575 
 576 static void
 577-overlay_damage_region(int32_t x, int32_t y, uint32_t width, uint32_t height, uint32_t border_width)
 578+overlay_damage_region(int32_t x, int32_t y, uint32_t width, uint32_t height,
 579+                      uint32_t border_width)
 580 {
 581 	(void)border_width;
 582-	pixman_region32_union_rect(&compositor.damage, &compositor.damage, x, y, width, height);
 583+	pixman_region32_union_rect(&compositor.damage, &compositor.damage, x, y,
 584+	                           width, height);
 585 }
 586 
 587 EXPORT void
 588-swc_overlay_set_box(int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color, uint32_t border_width)
 589+swc_overlay_set_box(int32_t x1, int32_t y1, int32_t x2, int32_t y2,
 590+                    uint32_t color, uint32_t border_width)
 591 {
 592 	int32_t x = x1 < x2 ? x1 : x2;
 593 	int32_t y = y1 < y2 ? y1 : y2;
 594 	uint32_t width = (uint32_t)abs(x2 - x1);
 595 	uint32_t height = (uint32_t)abs(y2 - y1);
 596 
 597-	if (border_width == 0)
 598+	if (border_width == 0) {
 599 		border_width = 1;
 600+	}
 601 
 602-	if (overlay.active)
 603-		overlay_damage_region(overlay.x, overlay.y, overlay.width, overlay.height, overlay.border_width);
 604+	if (overlay.active) {
 605+		overlay_damage_region(overlay.x, overlay.y, overlay.width,
 606+		                      overlay.height, overlay.border_width);
 607+	}
 608 
 609 	overlay.active = true;
 610 	overlay.x = x;
 611@@ -558,17 +633,20 @@ swc_overlay_set_box(int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t col
 612 	overlay.color = color;
 613 	overlay.border_width = border_width;
 614 
 615-	overlay_damage_region(overlay.x, overlay.y, overlay.width, overlay.height, overlay.border_width);
 616+	overlay_damage_region(overlay.x, overlay.y, overlay.width, overlay.height,
 617+	                      overlay.border_width);
 618 	schedule_updates(-1);
 619 }
 620 
 621 EXPORT void
 622 swc_overlay_clear(void)
 623 {
 624-	if (!overlay.active)
 625+	if (!overlay.active) {
 626 		return;
 627+	}
 628 
 629-	overlay_damage_region(overlay.x, overlay.y, overlay.width, overlay.height, overlay.border_width);
 630+	overlay_damage_region(overlay.x, overlay.y, overlay.width, overlay.height,
 631+	                      overlay.border_width);
 632 	overlay.active = false;
 633 	schedule_updates(-1);
 634 }
 635@@ -576,10 +654,12 @@ swc_overlay_clear(void)
 636 EXPORT void
 637 swc_set_zoom(float level)
 638 {
 639-	if (level < 0.1f)
 640+	if (level < 0.1f) {
 641 		level = 0.1f;
 642-	if (level > 10.0f)
 643+	}
 644+	if (level > 10.0f) {
 645 		level = 10.0f;
 646+	}
 647 
 648 	if (compositor.zoom != level) {
 649 		compositor.zoom = level;
 650@@ -620,10 +700,11 @@ render_zoomed_to_shm(struct screen *screen, float zoom)
 651 	struct compositor_view *view;
 652 	struct wld_buffer *background;
 653 
 654-	struct wld_buffer *buffer = wld_create_buffer(swc.shm->context,
 655-		width, height, WLD_FORMAT_XRGB8888, WLD_FLAG_MAP);
 656-	if (!buffer)
 657+	struct wld_buffer *buffer = wld_create_buffer(
 658+	    swc.shm->context, width, height, WLD_FORMAT_XRGB8888, WLD_FLAG_MAP);
 659+	if (!buffer) {
 660 		return NULL;
 661+	}
 662 
 663 	if (!wld_set_target_buffer(swc.shm->renderer, buffer)) {
 664 		wld_buffer_unreference(buffer);
 665@@ -633,10 +714,11 @@ render_zoomed_to_shm(struct screen *screen, float zoom)
 666 	pixman_region32_t full;
 667 	pixman_region32_init_rect(&full, 0, 0, width, height);
 668 	background = swc_wallpaper_buffer_for_screen(screen);
 669-	if (background)
 670+	if (background) {
 671 		wld_copy_region(swc.shm->renderer, background, 0, 0, &full);
 672-	else
 673+	} else {
 674 		wld_fill_region(swc.shm->renderer, bgcolor, &full);
 675+	}
 676 	pixman_region32_fini(&full);
 677 	wld_flush(swc.shm->renderer);
 678 
 679@@ -646,9 +728,8 @@ render_zoomed_to_shm(struct screen *screen, float zoom)
 680 	}
 681 
 682 	pixman_image_t *dst_img = pixman_image_create_bits(
 683-		wld_to_pixman_format(buffer->format),
 684-		buffer->width, buffer->height,
 685-		buffer->map, buffer->pitch);
 686+	    wld_to_pixman_format(buffer->format), buffer->width, buffer->height,
 687+	    buffer->map, buffer->pitch);
 688 
 689 	if (!dst_img) {
 690 		wld_unmap(buffer);
 691@@ -657,17 +738,21 @@ render_zoomed_to_shm(struct screen *screen, float zoom)
 692 	}
 693 
 694 	/* render each view with scaling */
 695-	wl_list_for_each_reverse(view, &compositor.views, link) {
 696+	wl_list_for_each_reverse(view, &compositor.views, link)
 697+	{
 698 		struct wld_buffer *src = view->buffer;
 699 		const struct swc_rectangle *geom = &view->base.geometry;
 700 
 701-		if (!src)
 702+		if (!src) {
 703 			continue;
 704+		}
 705 
 706-		if (!(wld_capabilities(swc.shm->renderer, src) & WLD_CAPABILITY_READ))
 707+		if (!(wld_capabilities(swc.shm->renderer, src) & WLD_CAPABILITY_READ)) {
 708 			src = view->base.buffer;
 709-		if (!src)
 710+		}
 711+		if (!src) {
 712 			continue;
 713+		}
 714 
 715 		/* maths     zoom position and size */
 716 		float zoomed_x = (geom->x - cx) * zoom + width / 2.0f;
 717@@ -679,9 +764,12 @@ render_zoomed_to_shm(struct screen *screen, float zoom)
 718 		float border_in = view->border.inwidth * zoom;
 719 		float total_border = border_out + border_in;
 720 
 721-		if (zoomed_x + zoomed_w + total_border < 0 || zoomed_x - total_border >= (int32_t)width ||
 722-		    zoomed_y + zoomed_h + total_border < 0 || zoomed_y - total_border >= (int32_t)height)
 723+		if (zoomed_x + zoomed_w + total_border < 0 ||
 724+		    zoomed_x - total_border >= (int32_t)width ||
 725+		    zoomed_y + zoomed_h + total_border < 0 ||
 726+		    zoomed_y - total_border >= (int32_t)height) {
 727 			continue;
 728+		}
 729 
 730 		if (view->border.outwidth > 0 && border_out >= 1) {
 731 			int32_t bx = (int32_t)(zoomed_x - total_border);
 732@@ -691,21 +779,21 @@ render_zoomed_to_shm(struct screen *screen, float zoom)
 733 			int32_t bo = (int32_t)border_out;
 734 
 735 			pixman_color_t color = {
 736-				.red = ((view->border.outcolor >> 16) & 0xff) * 257,
 737-				.green = ((view->border.outcolor >> 8) & 0xff) * 257,
 738-				.blue = (view->border.outcolor & 0xff) * 257,
 739-				.alpha = 0xffff
 740-			};
 741+			    .red = ((view->border.outcolor >> 16) & 0xff) * 257,
 742+			    .green = ((view->border.outcolor >> 8) & 0xff) * 257,
 743+			    .blue = (view->border.outcolor & 0xff) * 257,
 744+			    .alpha = 0xffff};
 745 			pixman_image_t *fill = pixman_image_create_solid_fill(&color);
 746 			if (fill) {
 747-				pixman_image_composite32(PIXMAN_OP_OVER, fill, NULL, dst_img,
 748-					0, 0, 0, 0, bx, by, bw, bo);
 749-				pixman_image_composite32(PIXMAN_OP_OVER, fill, NULL, dst_img,
 750-					0, 0, 0, 0, bx, by + bh - bo, bw, bo);
 751-				pixman_image_composite32(PIXMAN_OP_OVER, fill, NULL, dst_img,
 752-					0, 0, 0, 0, bx, by + bo, bo, bh - 2 * bo);
 753-				pixman_image_composite32(PIXMAN_OP_OVER, fill, NULL, dst_img,
 754-					0, 0, 0, 0, bx + bw - bo, by + bo, bo, bh - 2 * bo);
 755+				pixman_image_composite32(PIXMAN_OP_OVER, fill, NULL, dst_img, 0,
 756+				                         0, 0, 0, bx, by, bw, bo);
 757+				pixman_image_composite32(PIXMAN_OP_OVER, fill, NULL, dst_img, 0,
 758+				                         0, 0, 0, bx, by + bh - bo, bw, bo);
 759+				pixman_image_composite32(PIXMAN_OP_OVER, fill, NULL, dst_img, 0,
 760+				                         0, 0, 0, bx, by + bo, bo, bh - 2 * bo);
 761+				pixman_image_composite32(PIXMAN_OP_OVER, fill, NULL, dst_img, 0,
 762+				                         0, 0, 0, bx + bw - bo, by + bo, bo,
 763+				                         bh - 2 * bo);
 764 				pixman_image_unref(fill);
 765 			}
 766 		}
 767@@ -718,32 +806,32 @@ render_zoomed_to_shm(struct screen *screen, float zoom)
 768 			int32_t bi = (int32_t)border_in;
 769 
 770 			pixman_color_t color = {
 771-				.red = ((view->border.incolor >> 16) & 0xff) * 257,
 772-				.green = ((view->border.incolor >> 8) & 0xff) * 257,
 773-				.blue = (view->border.incolor & 0xff) * 257,
 774-				.alpha = 0xffff
 775-			};
 776+			    .red = ((view->border.incolor >> 16) & 0xff) * 257,
 777+			    .green = ((view->border.incolor >> 8) & 0xff) * 257,
 778+			    .blue = (view->border.incolor & 0xff) * 257,
 779+			    .alpha = 0xffff};
 780 			pixman_image_t *fill = pixman_image_create_solid_fill(&color);
 781 			if (fill) {
 782-				pixman_image_composite32(PIXMAN_OP_OVER, fill, NULL, dst_img,
 783-					0, 0, 0, 0, bx, by, bw, bi);
 784-				pixman_image_composite32(PIXMAN_OP_OVER, fill, NULL, dst_img,
 785-					0, 0, 0, 0, bx, by + bh - bi, bw, bi);
 786-				pixman_image_composite32(PIXMAN_OP_OVER, fill, NULL, dst_img,
 787-					0, 0, 0, 0, bx, by + bi, bi, bh - 2 * bi);
 788-				pixman_image_composite32(PIXMAN_OP_OVER, fill, NULL, dst_img,
 789-					0, 0, 0, 0, bx + bw - bi, by + bi, bi, bh - 2 * bi);
 790+				pixman_image_composite32(PIXMAN_OP_OVER, fill, NULL, dst_img, 0,
 791+				                         0, 0, 0, bx, by, bw, bi);
 792+				pixman_image_composite32(PIXMAN_OP_OVER, fill, NULL, dst_img, 0,
 793+				                         0, 0, 0, bx, by + bh - bi, bw, bi);
 794+				pixman_image_composite32(PIXMAN_OP_OVER, fill, NULL, dst_img, 0,
 795+				                         0, 0, 0, bx, by + bi, bi, bh - 2 * bi);
 796+				pixman_image_composite32(PIXMAN_OP_OVER, fill, NULL, dst_img, 0,
 797+				                         0, 0, 0, bx + bw - bi, by + bi, bi,
 798+				                         bh - 2 * bi);
 799 				pixman_image_unref(fill);
 800 			}
 801 		}
 802 
 803-		if (!wld_map(src))
 804+		if (!wld_map(src)) {
 805 			continue;
 806+		}
 807 
 808 		pixman_image_t *src_img = pixman_image_create_bits(
 809-			wld_to_pixman_format(src->format),
 810-			src->width, src->height,
 811-			src->map, src->pitch);
 812+		    wld_to_pixman_format(src->format), src->width, src->height,
 813+		    src->map, src->pitch);
 814 
 815 		if (src_img) {
 816 			pixman_transform_t transform;
 817@@ -753,11 +841,10 @@ render_zoomed_to_shm(struct screen *screen, float zoom)
 818 			pixman_image_set_transform(src_img, &transform);
 819 			pixman_image_set_filter(src_img, PIXMAN_FILTER_BILINEAR, NULL, 0);
 820 
 821-			pixman_image_composite32(PIXMAN_OP_OVER,
 822-				src_img, NULL, dst_img,
 823-				0, 0, 0, 0,
 824-				(int32_t)zoomed_x, (int32_t)zoomed_y,
 825-				(int32_t)(zoomed_w + 1), (int32_t)(zoomed_h + 1));
 826+			pixman_image_composite32(PIXMAN_OP_OVER, src_img, NULL, dst_img, 0,
 827+			                         0, 0, 0, (int32_t)zoomed_x,
 828+			                         (int32_t)zoomed_y, (int32_t)(zoomed_w + 1),
 829+			                         (int32_t)(zoomed_h + 1));
 830 
 831 			pixman_image_unref(src_img);
 832 		}
 833@@ -776,8 +863,9 @@ update(struct view *base)
 834 {
 835 	struct compositor_view *view = (void *)base;
 836 
 837-	if (!swc.active || !view->visible)
 838+	if (!swc.active || !view->visible) {
 839 		return false;
 840+	}
 841 
 842 	schedule_updates(view->base.screens);
 843 
 844@@ -795,8 +883,9 @@ attach(struct view *base, struct wld_buffer *buffer)
 845 	uint32_t new_height = buffer ? buffer->height : 0;
 846 	int ret;
 847 
 848-	if ((ret = renderer_attach(view, buffer)) < 0)
 849+	if ((ret = renderer_attach(view, buffer)) < 0) {
 850 		return ret;
 851+	}
 852 
 853 	/* Schedule updates on the screens the view was previously
 854 	 * visible on. */
 855@@ -853,8 +942,8 @@ move(struct view *base, int32_t x, int32_t y)
 856 		update_extents(view);
 857 
 858 		if (view->visible) {
 859-			/* Assume worst-case no clipping until we draw the next frame (in case the
 860-			 * surface gets moved again before that). */
 861+			/* Assume worst-case no clipping until we draw the next frame (in
 862+			 * case the surface gets moved again before that). */
 863 			pixman_region32_init(&view->clip);
 864 
 865 			view_update_screens(&view->base);
 866@@ -867,9 +956,9 @@ move(struct view *base, int32_t x, int32_t y)
 867 }
 868 
 869 static const struct view_impl view_impl = {
 870-	.update = update,
 871-	.attach = attach,
 872-	.move = move,
 873+    .update = update,
 874+    .attach = attach,
 875+    .move = move,
 876 };
 877 
 878 static struct compositor_view *
 879@@ -879,29 +968,33 @@ view_at(int32_t x, int32_t y)
 880 	struct swc_rectangle *geom;
 881 	struct swc_rectangle buffer_geom;
 882 
 883-	wl_list_for_each (view, &compositor.views, link) {
 884-		if (!view->visible)
 885+	wl_list_for_each(view, &compositor.views, link)
 886+	{
 887+		if (!view->visible) {
 888 			continue;
 889+		}
 890 
 891 		geom = &view->base.geometry;
 892 		if (view->window) {
 893-			if (!rectangle_contains_point(geom, x, y))
 894+			if (!rectangle_contains_point(geom, x, y)) {
 895 				continue;
 896+			}
 897 		} else if (view->base.buffer) {
 898 			buffer_geom.x = geom->x - view->buffer_offset_x;
 899 			buffer_geom.y = geom->y - view->buffer_offset_y;
 900 			buffer_geom.width = view->base.buffer->width;
 901 			buffer_geom.height = view->base.buffer->height;
 902-			if (!rectangle_contains_point(&buffer_geom, x, y))
 903+			if (!rectangle_contains_point(&buffer_geom, x, y)) {
 904 				continue;
 905+			}
 906 		} else if (!rectangle_contains_point(geom, x, y)) {
 907 			continue;
 908 		}
 909 
 910 		if (pixman_region32_contains_point(&view->surface->state.input,
 911 		                                   x - geom->x + view->buffer_offset_x,
 912-		                                   y - geom->y + view->buffer_offset_y, NULL))
 913-		{
 914+		                                   y - geom->y + view->buffer_offset_y,
 915+		                                   NULL)) {
 916 			return view;
 917 		}
 918 	}
 919@@ -912,8 +1005,9 @@ view_at(int32_t x, int32_t y)
 920 static struct compositor_view *
 921 window_view(struct compositor_view *view)
 922 {
 923-	while (view && !view->window && view->parent && view->parent != view)
 924+	while (view && !view->window && view->parent && view->parent != view) {
 925 		view = view->parent;
 926+	}
 927 	return (view && view->window) ? view : NULL;
 928 }
 929 
 930@@ -925,14 +1019,17 @@ raise_window(struct compositor_view *view)
 931 	uint32_t screens;
 932 
 933 	view = window_view(view);
 934-	if (!view || !view->visible)
 935+	if (!view || !view->visible) {
 936 		return;
 937+	}
 938 
 939 	top_window = NULL;
 940 	insert_after = &compositor.views;
 941-	wl_list_for_each (other, &compositor.views, link) {
 942-		if (!other->visible)
 943+	wl_list_for_each(other, &compositor.views, link)
 944+	{
 945+		if (!other->visible) {
 946 			continue;
 947+		}
 948 
 949 		if (other->window) {
 950 			top_window = other;
 951@@ -941,8 +1038,9 @@ raise_window(struct compositor_view *view)
 952 		insert_after = &other->link;
 953 	}
 954 
 955-	if (view == top_window)
 956+	if (view == top_window) {
 957 		return;
 958+	}
 959 
 960 	screens = view->base.screens;
 961 
 962@@ -970,8 +1068,9 @@ view_for_window(struct swc_window *base)
 963 {
 964 	struct window *window;
 965 
 966-	if (!base)
 967+	if (!base) {
 968 		return NULL;
 969+	}
 970 
 971 	window = (struct window *)base;
 972 	return window->view;
 973@@ -986,8 +1085,9 @@ prev_window_view(struct compositor_view *view)
 974 	for (link = view->link.prev; link != &compositor.views; link = link->prev) {
 975 		other = wl_container_of(link, other, link);
 976 
 977-		if (other->visible && other->window)
 978+		if (other->visible && other->window) {
 979 			return other;
 980+		}
 981 	}
 982 
 983 	return NULL;
 984@@ -1002,8 +1102,9 @@ next_window_view(struct compositor_view *view)
 985 	for (link = view->link.next; link != &compositor.views; link = link->next) {
 986 		other = wl_container_of(link, other, link);
 987 
 988-		if (other->visible && other->window)
 989+		if (other->visible && other->window) {
 990 			return other;
 991+		}
 992 	}
 993 
 994 	return NULL;
 995@@ -1039,19 +1140,22 @@ swc_window_stack(struct swc_window *window, int32_t direction)
 996 	struct compositor_view *view = view_for_window(window);
 997 	struct compositor_view *other = NULL;
 998 
 999-	if (!view || !view->visible || direction == 0)
1000+	if (!view || !view->visible || direction == 0) {
1001 		return;
1002+	}
1003 
1004 	if (direction < 0) {
1005 		other = prev_window_view(view);
1006-		if (!other)
1007+		if (!other) {
1008 			return;
1009+		}
1010 		wl_list_remove(&view->link);
1011 		wl_list_insert(other->link.prev, &view->link);
1012 	} else {
1013 		other = next_window_view(view);
1014-		if (!other)
1015+		if (!other) {
1016 			return;
1017+		}
1018 		wl_list_remove(&view->link);
1019 		wl_list_insert(&other->link, &view->link);
1020 	}
1021@@ -1066,8 +1170,9 @@ compositor_create_view(struct surface *surface)
1022 
1023 	view = malloc(sizeof(*view));
1024 
1025-	if (!view)
1026+	if (!view) {
1027 		return NULL;
1028+	}
1029 
1030 	view_initialize(&view->base, &view_impl);
1031 	view->surface = surface;
1032@@ -1114,30 +1219,36 @@ compositor_view(struct view *view)
1033 }
1034 
1035 void
1036-compositor_view_set_parent(struct compositor_view *view, struct compositor_view *parent)
1037+compositor_view_set_parent(struct compositor_view *view,
1038+                           struct compositor_view *parent)
1039 {
1040 	view->parent = parent;
1041 
1042-	if (parent->visible)
1043+	if (parent->visible) {
1044 		compositor_view_show(view);
1045-	else
1046+	} else {
1047 		compositor_view_hide(view);
1048+	}
1049 }
1050 
1051 void
1052-compositor_view_restack(struct compositor_view *view, struct compositor_view *sibling, bool above)
1053+compositor_view_restack(struct compositor_view *view,
1054+                        struct compositor_view *sibling, bool above)
1055 {
1056-	if (!view || !sibling || view == sibling)
1057+	if (!view || !sibling || view == sibling) {
1058 		return;
1059+	}
1060 
1061 	if (above) {
1062-		if (view->link.next == &sibling->link)
1063+		if (view->link.next == &sibling->link) {
1064 			return;
1065+		}
1066 		wl_list_remove(&view->link);
1067 		wl_list_insert(sibling->link.prev, &view->link);
1068 	} else {
1069-		if (view->link.prev == &sibling->link)
1070+		if (view->link.prev == &sibling->link) {
1071 			return;
1072+		}
1073 		wl_list_remove(&view->link);
1074 		wl_list_insert(&sibling->link, &view->link);
1075 	}
1076@@ -1151,13 +1262,15 @@ compositor_view_show(struct compositor_view *view)
1077 	struct compositor_view *other;
1078 	struct subsurface *subsurface;
1079 
1080-	if (view->visible)
1081+	if (view->visible) {
1082 		return;
1083+	}
1084 
1085 	subsurface = view->surface ? view->surface->subsurface : NULL;
1086 	if (subsurface) {
1087-		if (!subsurface->added || !view->surface->state.buffer)
1088+		if (!subsurface->added || !view->surface->state.buffer) {
1089 			return;
1090+		}
1091 	}
1092 
1093 	view->visible = true;
1094@@ -1169,9 +1282,11 @@ compositor_view_show(struct compositor_view *view)
1095 	damage_view(view);
1096 	update(&view->base);
1097 
1098-	wl_list_for_each (other, &compositor.views, link) {
1099-		if (other->parent == view)
1100+	wl_list_for_each(other, &compositor.views, link)
1101+	{
1102+		if (other->parent == view) {
1103 			compositor_view_show(other);
1104+		}
1105 	}
1106 }
1107 
1108@@ -1180,8 +1295,9 @@ compositor_view_hide(struct compositor_view *view)
1109 {
1110 	struct compositor_view *other;
1111 
1112-	if (!view->visible)
1113+	if (!view->visible) {
1114 		return;
1115+	}
1116 
1117 	/* Update all the screens the view was on. */
1118 	update(&view->base);
1119@@ -1190,17 +1306,21 @@ compositor_view_hide(struct compositor_view *view)
1120 	view_set_screens(&view->base, 0);
1121 	view->visible = false;
1122 
1123-	wl_list_for_each (other, &compositor.views, link) {
1124-		if (other->parent == view)
1125+	wl_list_for_each(other, &compositor.views, link)
1126+	{
1127+		if (other->parent == view) {
1128 			compositor_view_hide(other);
1129+		}
1130 	}
1131 }
1132 
1133 void
1134-compositor_view_set_border_width(struct compositor_view *view, uint32_t outwidth, uint32_t inwidth)
1135+compositor_view_set_border_width(struct compositor_view *view,
1136+                                 uint32_t outwidth, uint32_t inwidth)
1137 {
1138-	if (view->border.outwidth == outwidth && view->border.inwidth == inwidth)
1139+	if (view->border.outwidth == outwidth && view->border.inwidth == inwidth) {
1140 		return;
1141+	}
1142 
1143 	view->border.outwidth = outwidth;
1144 	view->border.damaged_border1 = true;
1145@@ -1215,17 +1335,18 @@ compositor_view_set_border_width(struct compositor_view *view, uint32_t outwidth
1146 }
1147 
1148 void
1149-compositor_view_set_border_color(struct compositor_view *view, uint32_t outcolor, uint32_t incolor)
1150+compositor_view_set_border_color(struct compositor_view *view,
1151+                                 uint32_t outcolor, uint32_t incolor)
1152 {
1153-	if (view->border.outcolor == outcolor && view->border.incolor == incolor)
1154+	if (view->border.outcolor == outcolor && view->border.incolor == incolor) {
1155 		return;
1156+	}
1157 
1158 	view->border.outcolor = outcolor;
1159 	view->border.damaged_border1 = true;
1160 
1161 	view->border.incolor = incolor;
1162 	view->border.damaged_border2 = true;
1163-	
1164 
1165 	/* XXX: Damage above surface for transparent surfaces? */
1166 
1167@@ -1245,14 +1366,17 @@ calculate_damage(void)
1168 	pixman_region32_init(&surface_opaque);
1169 
1170 	/* Go through views top-down to calculate clipping regions. */
1171-	wl_list_for_each (view, &compositor.views, link) {
1172-		if (!view->visible)
1173+	wl_list_for_each(view, &compositor.views, link)
1174+	{
1175+		if (!view->visible) {
1176 			continue;
1177+		}
1178 
1179 		geom = &view->base.geometry;
1180 		pixman_region32_t view_region;
1181 
1182-		pixman_region32_init_rect(&view_region, geom->x, geom->y, geom->width, geom->height);
1183+		pixman_region32_init_rect(&view_region, geom->x, geom->y, geom->width,
1184+		                          geom->height);
1185 
1186 		/* Clip the surface by the opaque region covering it. */
1187 		pixman_region32_copy(&view->clip, &compositor.opaque);
1188@@ -1262,10 +1386,12 @@ calculate_damage(void)
1189 		pixman_region32_translate(&surface_opaque,
1190 		                          geom->x - view->buffer_offset_x,
1191 		                          geom->y - view->buffer_offset_y);
1192-		pixman_region32_intersect(&surface_opaque, &surface_opaque, &view_region);
1193+		pixman_region32_intersect(&surface_opaque, &surface_opaque,
1194+		                          &view_region);
1195 
1196 		/* Add the surface's opaque region to the accumulated opaque region. */
1197-		pixman_region32_union(&compositor.opaque, &compositor.opaque, &surface_opaque);
1198+		pixman_region32_union(&compositor.opaque, &compositor.opaque,
1199+		                      &surface_opaque);
1200 
1201 		surface_damage = &view->surface->state.damage;
1202 
1203@@ -1278,27 +1404,30 @@ calculate_damage(void)
1204 			                          geom->y - view->buffer_offset_y);
1205 
1206 			/* Add the surface damage to the compositor damage. */
1207-			pixman_region32_union(&compositor.damage, &compositor.damage, surface_damage);
1208+			pixman_region32_union(&compositor.damage, &compositor.damage,
1209+			                      surface_damage);
1210 			pixman_region32_clear(surface_damage);
1211 		}
1212 
1213-	                /* redraw entire thingy if either */
1214-			if (view->border.damaged_border1 || view->border.damaged_border2) {
1215-				pixman_region32_t border_region;
1216+		/* redraw entire thingy if either */
1217+		if (view->border.damaged_border1 || view->border.damaged_border2) {
1218+			pixman_region32_t border_region;
1219 
1220-				pixman_region32_init_with_extents(&border_region, &view->extents);
1221+			pixman_region32_init_with_extents(&border_region, &view->extents);
1222 
1223-				pixman_region32_subtract(&border_region, &border_region, &view_region);
1224+			pixman_region32_subtract(&border_region, &border_region,
1225+			                         &view_region);
1226 
1227-				pixman_region32_union(&compositor.damage, &compositor.damage, &border_region);
1228+			pixman_region32_union(&compositor.damage, &compositor.damage,
1229+			                      &border_region);
1230 
1231-				pixman_region32_fini(&border_region);
1232+			pixman_region32_fini(&border_region);
1233 
1234-				view->border.damaged_border1 = false;
1235-				view->border.damaged_border2 = false;
1236-			}
1237+			view->border.damaged_border1 = false;
1238+			view->border.damaged_border2 = false;
1239+		}
1240 
1241-			pixman_region32_fini(&view_region);
1242+		pixman_region32_fini(&view_region);
1243 	}
1244 
1245 	pixman_region32_fini(&surface_opaque);
1246@@ -1311,14 +1440,17 @@ update_screen(struct screen *screen)
1247 	const struct swc_rectangle *geom = &screen->base.geometry;
1248 	pixman_region32_t damage, *total_damage;
1249 
1250-	if (!(compositor.scheduled_updates & screen_mask(screen)))
1251+	if (!(compositor.scheduled_updates & screen_mask(screen))) {
1252 		return;
1253+	}
1254 
1255-	if (!(target = target_get(screen)))
1256+	if (!(target = target_get(screen))) {
1257 		return;
1258+	}
1259 
1260 	pixman_region32_init(&damage);
1261-	pixman_region32_intersect_rect(&damage, &compositor.damage, geom->x, geom->y, geom->width, geom->height);
1262+	pixman_region32_intersect_rect(&damage, &compositor.damage, geom->x,
1263+	                               geom->y, geom->width, geom->height);
1264 	pixman_region32_translate(&damage, -geom->x, -geom->y);
1265 	total_damage = wld_surface_damage(target->surface, &damage);
1266 
1267@@ -1332,9 +1464,11 @@ update_screen(struct screen *screen)
1268 	if (compositor.zoom != 1.0f) {
1269 		pixman_region32_fini(&damage);
1270 
1271-		struct wld_buffer *zoomed = render_zoomed_to_shm(screen, compositor.zoom);
1272-		if (!zoomed)
1273+		struct wld_buffer *zoomed =
1274+		    render_zoomed_to_shm(screen, compositor.zoom);
1275+		if (!zoomed) {
1276 			return;
1277+		}
1278 
1279 		pixman_region32_t full;
1280 		pixman_region32_init_rect(&full, 0, 0, geom->width, geom->height);
1281@@ -1350,15 +1484,16 @@ update_screen(struct screen *screen)
1282 		pixman_region32_translate(&damage, geom->x, geom->y);
1283 		pixman_region32_init(&base_damage);
1284 		pixman_region32_subtract(&base_damage, &damage, &compositor.opaque);
1285-		renderer_repaint(target, &damage, &base_damage, &compositor.views, screen);
1286+		renderer_repaint(target, &damage, &base_damage, &compositor.views,
1287+		                 screen);
1288 		pixman_region32_fini(&damage);
1289 		pixman_region32_fini(&base_damage);
1290 	}
1291 
1292 	switch (target_swap_buffers(target)) {
1293 	case -EACCES:
1294-		/* If we get an EACCES, it is because this session is being deactivated, but
1295-		 * we haven't yet received the deactivate signal from swc-launch. */
1296+		/* If we get an EACCES, it is because this session is being deactivated,
1297+		 * but we haven't yet received the deactivate signal from swc-launch. */
1298 		swc_deactivate();
1299 		break;
1300 	case 0:
1301@@ -1373,16 +1508,16 @@ perform_update(void *data)
1302 	struct screen *screen;
1303 	uint32_t updates = compositor.scheduled_updates & ~compositor.pending_flips;
1304 
1305-	if (!swc.active || !updates)
1306+	if (!swc.active || !updates) {
1307 		return;
1308+	}
1309 
1310 	DEBUG("Performing update\n");
1311 
1312 	compositor.updating = true;
1313 	calculate_damage();
1314 
1315-	wl_list_for_each (screen, &swc.screens, link)
1316-		update_screen(screen);
1317+	wl_list_for_each(screen, &swc.screens, link) update_screen(screen);
1318 
1319 	/* XXX: Should assert that all damage was covered by some output */
1320 	pixman_region32_clear(&compositor.damage);
1321@@ -1391,13 +1526,15 @@ perform_update(void *data)
1322 }
1323 
1324 bool
1325-handle_motion(struct pointer_handler *handler, uint32_t time, wl_fixed_t fx, wl_fixed_t fy)
1326+handle_motion(struct pointer_handler *handler, uint32_t time, wl_fixed_t fx,
1327+              wl_fixed_t fy)
1328 {
1329 	int32_t x = wl_fixed_to_int(fx), y = wl_fixed_to_int(fy);
1330 
1331 	/* If buttons are pressed, don't change pointer focus. */
1332-	if (swc.seat->pointer->buttons.size > 0)
1333+	if (swc.seat->pointer->buttons.size > 0) {
1334 		return false;
1335+	}
1336 
1337 	struct compositor_view *view = view_at(x, y);
1338 
1339@@ -1407,14 +1544,16 @@ handle_motion(struct pointer_handler *handler, uint32_t time, wl_fixed_t fx, wl_
1340 }
1341 
1342 static bool
1343-handle_button(struct pointer_handler *handler, uint32_t time, struct button *button, uint32_t state)
1344+handle_button(struct pointer_handler *handler, uint32_t time,
1345+              struct button *button, uint32_t state)
1346 {
1347 	(void)handler;
1348 	(void)time;
1349 	(void)button;
1350 
1351-	if (state != WL_POINTER_BUTTON_STATE_PRESSED)
1352+	if (state != WL_POINTER_BUTTON_STATE_PRESSED) {
1353 		return false;
1354+	}
1355 
1356 	int32_t x = wl_fixed_to_int(swc.seat->pointer->x);
1357 	int32_t y = wl_fixed_to_int(swc.seat->pointer->y);
1358@@ -1429,8 +1568,9 @@ handle_button(struct pointer_handler *handler, uint32_t time, struct button *but
1359 static void
1360 handle_terminate(void *data, uint32_t time, uint32_t value, uint32_t state)
1361 {
1362-	if (state == WL_KEYBOARD_KEY_STATE_PRESSED)
1363+	if (state == WL_KEYBOARD_KEY_STATE_PRESSED) {
1364 		wl_display_terminate(swc.display);
1365+	}
1366 }
1367 
1368 static void
1369@@ -1438,8 +1578,9 @@ handle_switch_vt(void *data, uint32_t time, uint32_t value, uint32_t state)
1370 {
1371 	uint8_t vt = value - XKB_KEY_XF86Switch_VT_1 + 1;
1372 
1373-	if (state == WL_KEYBOARD_KEY_STATE_PRESSED)
1374+	if (state == WL_KEYBOARD_KEY_STATE_PRESSED) {
1375 		launch_activate_vt(vt);
1376+	}
1377 }
1378 
1379 static void
1380@@ -1458,7 +1599,8 @@ handle_swc_event(struct wl_listener *listener, void *data)
1381 }
1382 
1383 static void
1384-create_surface(struct wl_client *client, struct wl_resource *resource, uint32_t id)
1385+create_surface(struct wl_client *client, struct wl_resource *resource,
1386+               uint32_t id)
1387 {
1388 	struct surface *surface;
1389 
1390@@ -1474,23 +1616,27 @@ create_surface(struct wl_client *client, struct wl_resource *resource, uint32_t
1391 }
1392 
1393 static void
1394-create_region(struct wl_client *client, struct wl_resource *resource, uint32_t id)
1395+create_region(struct wl_client *client, struct wl_resource *resource,
1396+              uint32_t id)
1397 {
1398-	if (!region_new(client, wl_resource_get_version(resource), id))
1399+	if (!region_new(client, wl_resource_get_version(resource), id)) {
1400 		wl_resource_post_no_memory(resource);
1401+	}
1402 }
1403 
1404 static const struct wl_compositor_interface compositor_impl = {
1405-	.create_surface = create_surface,
1406-	.create_region = create_region,
1407+    .create_surface = create_surface,
1408+    .create_region = create_region,
1409 };
1410 
1411 static void
1412-bind_compositor(struct wl_client *client, void *data, uint32_t version, uint32_t id)
1413+bind_compositor(struct wl_client *client, void *data, uint32_t version,
1414+                uint32_t id)
1415 {
1416 	struct wl_resource *resource;
1417 
1418-	resource = wl_resource_create(client, &wl_compositor_interface, version, id);
1419+	resource =
1420+	    wl_resource_create(client, &wl_compositor_interface, version, id);
1421 	if (!resource) {
1422 		wl_client_post_no_memory(client);
1423 		return;
1424@@ -1504,10 +1650,12 @@ compositor_initialize(void)
1425 	struct screen *screen;
1426 	uint32_t keysym;
1427 
1428-	compositor.global = wl_global_create(swc.display, &wl_compositor_interface, 4, NULL, &bind_compositor);
1429+	compositor.global = wl_global_create(swc.display, &wl_compositor_interface,
1430+	                                     4, NULL, &bind_compositor);
1431 
1432-	if (!compositor.global)
1433+	if (!compositor.global) {
1434 		return false;
1435+	}
1436 
1437 	compositor.scheduled_updates = 0;
1438 	compositor.pending_flips = 0;
1439@@ -1521,15 +1669,19 @@ compositor_initialize(void)
1440 	compositor.swc_listener.notify = &handle_swc_event;
1441 	wl_signal_add(&swc.event_signal, &compositor.swc_listener);
1442 
1443-	wl_list_for_each (screen, &swc.screens, link)
1444-		target_new(screen);
1445-	if (swc.active)
1446+	wl_list_for_each(screen, &swc.screens, link) target_new(screen);
1447+	if (swc.active) {
1448 		schedule_updates(-1);
1449+	}
1450 
1451-	swc_add_binding(SWC_BINDING_KEY, SWC_MOD_CTRL | SWC_MOD_ALT, XKB_KEY_BackSpace, &handle_terminate, NULL);
1452+	swc_add_binding(SWC_BINDING_KEY, SWC_MOD_CTRL | SWC_MOD_ALT,
1453+	                XKB_KEY_BackSpace, &handle_terminate, NULL);
1454 
1455-	for (keysym = XKB_KEY_XF86Switch_VT_1; keysym <= XKB_KEY_XF86Switch_VT_12; ++keysym)
1456-		swc_add_binding(SWC_BINDING_KEY, SWC_MOD_ANY, keysym, &handle_switch_vt, NULL);
1457+	for (keysym = XKB_KEY_XF86Switch_VT_1; keysym <= XKB_KEY_XF86Switch_VT_12;
1458+	     ++keysym) {
1459+		swc_add_binding(SWC_BINDING_KEY, SWC_MOD_ANY, keysym, &handle_switch_vt,
1460+		                NULL);
1461+	}
1462 
1463 	compositor.initialized = true;
1464 
1465@@ -1541,8 +1693,9 @@ compositor_finalize(void)
1466 {
1467 	compositor.initialized = false;
1468 
1469-	if (compositor.zoom_buffer)
1470+	if (compositor.zoom_buffer) {
1471 		wld_buffer_unreference(compositor.zoom_buffer);
1472+	}
1473 	pixman_region32_fini(&compositor.damage);
1474 	pixman_region32_fini(&compositor.opaque);
1475 	wl_global_destroy(compositor.global);
1476@@ -1552,8 +1705,9 @@ struct wld_buffer *
1477 compositor_get_buffer(struct screen *screen)
1478 {
1479 	struct target *target = target_get(screen);
1480-	if (!target)
1481+	if (!target) {
1482 		return NULL;
1483+	}
1484 	return target->current_buffer;
1485 }
1486 
1487@@ -1572,8 +1726,9 @@ compositor_render_to_shm(struct screen *screen)
1488 	/* create shm buf */
1489 	buffer = wld_create_buffer(swc.shm->context, width, height,
1490 	                           WLD_FORMAT_ARGB8888, WLD_FLAG_MAP);
1491-	if (!buffer)
1492+	if (!buffer) {
1493 		return NULL;
1494+	}
1495 
1496 	caps = wld_capabilities(swc.shm->renderer, buffer);
1497 	if (!(caps & WLD_CAPABILITY_WRITE) ||
1498@@ -1584,52 +1739,63 @@ compositor_render_to_shm(struct screen *screen)
1499 
1500 	/* set reigon */
1501 	pixman_region32_init_rect(&region, 0, 0, width, height);
1502-	pixman_region32_init_rect(&damage, screen->base.geometry.x, screen->base.geometry.y, width, height);
1503+	pixman_region32_init_rect(&damage, screen->base.geometry.x,
1504+	                          screen->base.geometry.y, width, height);
1505 
1506 	/* background */
1507 	background = swc_wallpaper_buffer_for_screen(screen);
1508-	if (background)
1509+	if (background) {
1510 		wld_copy_region(swc.shm->renderer, background, 0, 0, &region);
1511-	else
1512+	} else {
1513 		wld_fill_region(swc.shm->renderer, bgcolor, &region);
1514+	}
1515 
1516-	wl_list_for_each_reverse(view, &compositor.views, link) {
1517+	wl_list_for_each_reverse(view, &compositor.views, link)
1518+	{
1519 		struct wld_buffer *src = view->buffer;
1520 
1521-		if (!view->visible)
1522+		if (!view->visible) {
1523 			continue;
1524+		}
1525 
1526-		if (src && !(wld_capabilities(swc.shm->renderer, src) & WLD_CAPABILITY_READ))
1527+		if (src &&
1528+		    !(wld_capabilities(swc.shm->renderer, src) & WLD_CAPABILITY_READ)) {
1529 			src = view->base.buffer;
1530+		}
1531 
1532-		if (src && (wld_capabilities(swc.shm->renderer, src) & WLD_CAPABILITY_READ)) {
1533+		if (src &&
1534+		    (wld_capabilities(swc.shm->renderer, src) & WLD_CAPABILITY_READ)) {
1535 			int32_t x = view->base.geometry.x - screen->base.geometry.x;
1536 			int32_t y = view->base.geometry.y - screen->base.geometry.y;
1537 
1538-			wld_copy_rectangle(swc.shm->renderer, src,
1539-			                   x, y, 0, 0,
1540-			                   view->base.geometry.width, view->base.geometry.height);
1541+			wld_copy_rectangle(swc.shm->renderer, src, x, y, 0, 0,
1542+			                   view->base.geometry.width,
1543+			                   view->base.geometry.height);
1544 		}
1545 
1546-		if ((view->border.outwidth > 0 || view->border.inwidth > 0) && view->base.buffer) {
1547+		if ((view->border.outwidth > 0 || view->border.inwidth > 0) &&
1548+		    view->base.buffer) {
1549 			pixman_region32_t view_region, view_damage, border_damage;
1550 			const struct swc_rectangle *geom = &view->base.geometry;
1551 			const struct swc_rectangle *target_geom = &screen->base.geometry;
1552 
1553-			pixman_region32_init_rect(&view_region, geom->x, geom->y, geom->width, geom->height);
1554+			pixman_region32_init_rect(&view_region, geom->x, geom->y,
1555+			                          geom->width, geom->height);
1556 			pixman_region32_init_with_extents(&view_damage, &view->extents);
1557 			pixman_region32_init(&border_damage);
1558 
1559 			pixman_region32_intersect(&view_damage, &view_damage, &damage);
1560 			pixman_region32_subtract(&view_damage, &view_damage, &view->clip);
1561-			pixman_region32_subtract(&border_damage, &view_damage, &view_region);
1562+			pixman_region32_subtract(&border_damage, &view_damage,
1563+			                         &view_region);
1564 
1565 			pixman_region32_t in_rect;
1566 			pixman_region32_init_rect(&in_rect,
1567 			                          geom->x - view->border.inwidth,
1568 			                          geom->y - view->border.inwidth,
1569 			                          geom->width + (2 * view->border.inwidth),
1570-			                          geom->height + (2 * view->border.inwidth));
1571+			                          geom->height +
1572+			                              (2 * view->border.inwidth));
1573 
1574 			pixman_region32_t out_border;
1575 			pixman_region32_init(&out_border);
1576@@ -1640,14 +1806,20 @@ compositor_render_to_shm(struct screen *screen)
1577 			pixman_region32_subtract(&in_border, &in_rect, &view_region);
1578 			pixman_region32_intersect(&in_border, &in_border, &border_damage);
1579 
1580-			if (view->border.outwidth > 0 && pixman_region32_not_empty(&out_border)) {
1581-				pixman_region32_translate(&out_border, -target_geom->x, -target_geom->y);
1582-				wld_fill_region(swc.shm->renderer, view->border.outcolor, &out_border);
1583+			if (view->border.outwidth > 0 &&
1584+			    pixman_region32_not_empty(&out_border)) {
1585+				pixman_region32_translate(&out_border, -target_geom->x,
1586+				                          -target_geom->y);
1587+				wld_fill_region(swc.shm->renderer, view->border.outcolor,
1588+				                &out_border);
1589 			}
1590 
1591-			if (view->border.inwidth > 0 && pixman_region32_not_empty(&in_border)) {
1592-				pixman_region32_translate(&in_border, -target_geom->x, -target_geom->y);
1593-				wld_fill_region(swc.shm->renderer, view->border.incolor, &in_border);
1594+			if (view->border.inwidth > 0 &&
1595+			    pixman_region32_not_empty(&in_border)) {
1596+				pixman_region32_translate(&in_border, -target_geom->x,
1597+				                          -target_geom->y);
1598+				wld_fill_region(swc.shm->renderer, view->border.incolor,
1599+				                &in_border);
1600 			}
1601 
1602 			pixman_region32_fini(&border_damage);
+39, -21
  1@@ -26,8 +26,8 @@
  2 
  3 #include "view.h"
  4 
  5-#include <stdbool.h>
  6 #include <pixman.h>
  7+#include <stdbool.h>
  8 #include <wayland-server.h>
  9 
 10 struct screen;
 11@@ -46,9 +46,12 @@ struct swc_compositor {
 12 	} signal;
 13 };
 14 
 15-bool compositor_initialize(void);
 16-void compositor_finalize(void);
 17-void compositor_damage_all(void);
 18+bool
 19+compositor_initialize(void);
 20+void
 21+compositor_finalize(void);
 22+void
 23+compositor_damage_all(void);
 24 
 25 struct compositor_view {
 26 	struct view base;
 27@@ -74,11 +77,11 @@ struct compositor_view {
 28 		uint32_t outcolor;
 29 
 30 		bool damaged_border1;
 31-		
 32+
 33 		/* sir, a second border has hit the compositor! */
 34 		uint32_t inwidth;
 35 		uint32_t incolor;
 36-	
 37+
 38 		bool damaged_border2;
 39 	} border;
 40 
 41@@ -86,34 +89,49 @@ struct compositor_view {
 42 	struct wl_signal destroy_signal;
 43 };
 44 
 45-struct compositor_view *compositor_create_view(struct surface *surface);
 46+struct compositor_view *
 47+compositor_create_view(struct surface *surface);
 48 
 49-void compositor_view_destroy(struct compositor_view *view);
 50+void
 51+compositor_view_destroy(struct compositor_view *view);
 52 
 53 /**
 54  * Returns view as a compositor_view, or NULL if view is not a compositor_view.
 55  */
 56-struct compositor_view *compositor_view(struct view *view);
 57-
 58-void compositor_view_set_parent(struct compositor_view *view, struct compositor_view *parent);
 59-void compositor_view_restack(struct compositor_view *view, struct compositor_view *sibling, bool above);
 60-
 61-void compositor_view_show(struct compositor_view *view);
 62-void compositor_view_hide(struct compositor_view *view);
 63-
 64-void compositor_view_set_border_color(struct compositor_view *view, uint32_t outcolor, uint32_t incolor);
 65-void compositor_view_set_border_width(struct compositor_view *view, uint32_t outwidth, uint32_t inwidth);
 66+struct compositor_view *
 67+compositor_view(struct view *view);
 68+
 69+void
 70+compositor_view_set_parent(struct compositor_view *view,
 71+                           struct compositor_view *parent);
 72+void
 73+compositor_view_restack(struct compositor_view *view,
 74+                        struct compositor_view *sibling, bool above);
 75+
 76+void
 77+compositor_view_show(struct compositor_view *view);
 78+void
 79+compositor_view_hide(struct compositor_view *view);
 80+
 81+void
 82+compositor_view_set_border_color(struct compositor_view *view,
 83+                                 uint32_t outcolor, uint32_t incolor);
 84+void
 85+compositor_view_set_border_width(struct compositor_view *view,
 86+                                 uint32_t outwidth, uint32_t inwidth);
 87 
 88 /**
 89  * get the current composited buffer for a screen for screenshotss.
 90  * returns null if no buffer
 91  */
 92-struct wld_buffer *compositor_get_buffer(struct screen *screen);
 93+struct wld_buffer *
 94+compositor_get_buffer(struct screen *screen);
 95 
 96 /**
 97- * render the compositor scene into a shm buffer 
 98+ * render the compositor scene into a shm buffer
 99  * caller must free with wld_buffer_unreference()
100  */
101-struct wld_buffer *compositor_render_to_shm(struct screen *screen);
102+struct wld_buffer *
103+compositor_render_to_shm(struct screen *screen);
104 
105 #endif
+43, -29
  1@@ -36,48 +36,55 @@ struct data {
  2 };
  3 
  4 static void
  5-offer_accept(struct wl_client *client, struct wl_resource *offer, uint32_t serial, const char *mime_type)
  6+offer_accept(struct wl_client *client, struct wl_resource *offer,
  7+             uint32_t serial, const char *mime_type)
  8 {
  9 	struct data *data = wl_resource_get_user_data(offer);
 10 
 11 	/* Protect against expired data_offers being used. */
 12-	if (!data)
 13+	if (!data) {
 14 		return;
 15+	}
 16 
 17 	wl_data_source_send_target(data->source, mime_type);
 18 }
 19 
 20 static void
 21-offer_receive(struct wl_client *client, struct wl_resource *offer, const char *mime_type, int fd)
 22+offer_receive(struct wl_client *client, struct wl_resource *offer,
 23+              const char *mime_type, int fd)
 24 {
 25 	struct data *data = wl_resource_get_user_data(offer);
 26 
 27 	/* Protect against expired data_offers being used. */
 28-	if (!data)
 29+	if (!data) {
 30 		return;
 31+	}
 32 
 33 	wl_data_source_send_send(data->source, mime_type, fd);
 34 	close(fd);
 35 }
 36 
 37 static const struct wl_data_offer_interface data_offer_impl = {
 38-	.accept = offer_accept,
 39-	.receive = offer_receive,
 40-	.destroy = destroy_resource,
 41+    .accept = offer_accept,
 42+    .receive = offer_receive,
 43+    .destroy = destroy_resource,
 44 };
 45 
 46 static void
 47-source_offer(struct wl_client *client, struct wl_resource *source, const char *mime_type)
 48+source_offer(struct wl_client *client, struct wl_resource *source,
 49+             const char *mime_type)
 50 {
 51 	struct data *data = wl_resource_get_user_data(source);
 52 	char *s, **dst;
 53 
 54 	s = strdup(mime_type);
 55-	if (!s)
 56+	if (!s) {
 57 		goto error0;
 58+	}
 59 	dst = wl_array_add(&data->mime_types, sizeof(*dst));
 60-	if (!dst)
 61+	if (!dst) {
 62 		goto error1;
 63+	}
 64 	*dst = s;
 65 	return;
 66 
 67@@ -88,8 +95,8 @@ error0:
 68 }
 69 
 70 static const struct wl_data_source_interface data_source_impl = {
 71-	.offer = source_offer,
 72-	.destroy = destroy_resource,
 73+    .offer = source_offer,
 74+    .destroy = destroy_resource,
 75 };
 76 
 77 static void
 78@@ -99,19 +106,19 @@ data_destroy(struct wl_resource *source)
 79 	struct wl_resource *offer;
 80 	char **mime_type;
 81 
 82-	wl_array_for_each (mime_type, &data->mime_types)
 83-		free(*mime_type);
 84+	wl_array_for_each(mime_type, &data->mime_types) free(*mime_type);
 85 	wl_array_release(&data->mime_types);
 86 
 87 	/* After this data_source is destroyed, each of the data_offer objects
 88-	 * associated with the data_source has a pointer to a free'd struct. We can't
 89-	 * destroy the resources because this results in a segfault on the client when
 90-	 * it correctly tries to call data_source.destroy. However, a misbehaving
 91-	 * client could still attempt to call accept or receive on the data_offer,
 92-	 * which would crash the server.
 93+	 * associated with the data_source has a pointer to a free'd struct. We
 94+	 * can't destroy the resources because this results in a segfault on the
 95+	 * client when it correctly tries to call data_source.destroy. However, a
 96+	 * misbehaving client could still attempt to call accept or receive on the
 97+	 * data_offer, which would crash the server.
 98 	 *
 99 	 * So, we clear the user data on each of the offers to protect us. */
100-	wl_resource_for_each (offer, &data->offers) {
101+	wl_resource_for_each(offer, &data->offers)
102+	{
103 		wl_resource_set_user_data(offer, NULL);
104 		wl_resource_set_destructor(offer, NULL);
105 	}
106@@ -125,15 +132,19 @@ data_source_new(struct wl_client *client, uint32_t version, uint32_t id)
107 	struct data *data;
108 
109 	data = malloc(sizeof(*data));
110-	if (!data)
111+	if (!data) {
112 		goto error0;
113+	}
114 	wl_array_init(&data->mime_types);
115 	wl_list_init(&data->offers);
116 
117-	data->source = wl_resource_create(client, &wl_data_source_interface, version, id);
118-	if (!data->source)
119+	data->source =
120+	    wl_resource_create(client, &wl_data_source_interface, version, id);
121+	if (!data->source) {
122 		goto error1;
123-	wl_resource_set_implementation(data->source, &data_source_impl, data, &data_destroy);
124+	}
125+	wl_resource_set_implementation(data->source, &data_source_impl, data,
126+	                               &data_destroy);
127 
128 	return data->source;
129 
130@@ -144,15 +155,18 @@ error0:
131 }
132 
133 struct wl_resource *
134-data_offer_new(struct wl_client *client, struct wl_resource *source, uint32_t version)
135+data_offer_new(struct wl_client *client, struct wl_resource *source,
136+               uint32_t version)
137 {
138 	struct data *data = wl_resource_get_user_data(source);
139 	struct wl_resource *offer;
140 
141 	offer = wl_resource_create(client, &wl_data_offer_interface, version, 0);
142-	if (!offer)
143+	if (!offer) {
144 		return NULL;
145-	wl_resource_set_implementation(offer, &data_offer_impl, data, &remove_resource);
146+	}
147+	wl_resource_set_implementation(offer, &data_offer_impl, data,
148+	                               &remove_resource);
149 	wl_list_insert(&data->offers, wl_resource_get_link(offer));
150 
151 	return offer;
152@@ -164,6 +178,6 @@ data_send_mime_types(struct wl_resource *source, struct wl_resource *offer)
153 	struct data *data = wl_resource_get_user_data(source);
154 	char **mime_type;
155 
156-	wl_array_for_each (mime_type, &data->mime_types)
157-		wl_data_offer_send_offer(offer, *mime_type);
158+	wl_array_for_each(mime_type, &data->mime_types)
159+	    wl_data_offer_send_offer(offer, *mime_type);
160 }
+7, -3
 1@@ -28,8 +28,12 @@
 2 
 3 struct wl_client;
 4 
 5-struct wl_resource *data_source_new(struct wl_client *client, uint32_t version, uint32_t id);
 6-struct wl_resource *data_offer_new(struct wl_client *client, struct wl_resource *source, uint32_t version);
 7-void data_send_mime_types(struct wl_resource *source, struct wl_resource *offer);
 8+struct wl_resource *
 9+data_source_new(struct wl_client *client, uint32_t version, uint32_t id);
10+struct wl_resource *
11+data_offer_new(struct wl_client *client, struct wl_resource *source,
12+               uint32_t version);
13+void
14+data_send_mime_types(struct wl_resource *source, struct wl_resource *offer);
15 
16 #endif
+41, -23
  1@@ -28,20 +28,23 @@
  2 
  3 static void
  4 start_drag(struct wl_client *client, struct wl_resource *resource,
  5-           struct wl_resource *source_resource, struct wl_resource *origin_resource,
  6+           struct wl_resource *source_resource,
  7+           struct wl_resource *origin_resource,
  8            struct wl_resource *icon_resource, uint32_t serial)
  9 {
 10 	/* XXX: Implement */
 11 }
 12 
 13 static void
 14-set_selection(struct wl_client *client, struct wl_resource *resource, struct wl_resource *data_source, uint32_t serial)
 15+set_selection(struct wl_client *client, struct wl_resource *resource,
 16+              struct wl_resource *data_source, uint32_t serial)
 17 {
 18 	struct data_device *data_device = wl_resource_get_user_data(resource);
 19 
 20 	/* Check if this data source is already the current selection. */
 21-	if (data_source == data_device->selection)
 22+	if (data_source == data_device->selection) {
 23 		return;
 24+	}
 25 
 26 	if (data_device->selection) {
 27 		wl_data_source_send_cancelled(data_device->selection);
 28@@ -50,25 +53,30 @@ set_selection(struct wl_client *client, struct wl_resource *resource, struct wl_
 29 
 30 	data_device->selection = data_source;
 31 
 32-	if (data_source)
 33-		wl_resource_add_destroy_listener(data_source, &data_device->selection_destroy_listener);
 34+	if (data_source) {
 35+		wl_resource_add_destroy_listener(
 36+		    data_source, &data_device->selection_destroy_listener);
 37+	}
 38 
 39-	send_event(&data_device->event_signal, DATA_DEVICE_EVENT_SELECTION_CHANGED, NULL);
 40+	send_event(&data_device->event_signal, DATA_DEVICE_EVENT_SELECTION_CHANGED,
 41+	           NULL);
 42 }
 43 
 44 static const struct wl_data_device_interface data_device_impl = {
 45-	.start_drag = start_drag,
 46-	.set_selection = set_selection,
 47-	.release = destroy_resource,
 48+    .start_drag = start_drag,
 49+    .set_selection = set_selection,
 50+    .release = destroy_resource,
 51 };
 52 
 53 static void
 54 handle_selection_destroy(struct wl_listener *listener, void *data)
 55 {
 56-	struct data_device *data_device = wl_container_of(listener, data_device, selection_destroy_listener);
 57+	struct data_device *data_device =
 58+	    wl_container_of(listener, data_device, selection_destroy_listener);
 59 
 60 	data_device->selection = NULL;
 61-	send_event(&data_device->event_signal, DATA_DEVICE_EVENT_SELECTION_CHANGED, NULL);
 62+	send_event(&data_device->event_signal, DATA_DEVICE_EVENT_SELECTION_CHANGED,
 63+	           NULL);
 64 }
 65 
 66 struct data_device *
 67@@ -77,8 +85,9 @@ data_device_create(void)
 68 	struct data_device *data_device;
 69 
 70 	data_device = malloc(sizeof(*data_device));
 71-	if (!data_device)
 72+	if (!data_device) {
 73 		return NULL;
 74+	}
 75 	data_device->selection = NULL;
 76 	data_device->selection_destroy_listener.notify = &handle_selection_destroy;
 77 	wl_signal_init(&data_device->event_signal);
 78@@ -92,33 +101,39 @@ data_device_destroy(struct data_device *data_device)
 79 {
 80 	struct wl_resource *resource, *tmp;
 81 
 82-	wl_list_for_each_safe (resource, tmp, &data_device->resources, link)
 83-		wl_resource_destroy(resource);
 84+	wl_list_for_each_safe(resource, tmp, &data_device->resources, link)
 85+	    wl_resource_destroy(resource);
 86 	free(data_device);
 87 }
 88 
 89 struct wl_resource *
 90-data_device_bind(struct data_device *data_device, struct wl_client *client, uint32_t version, uint32_t id)
 91+data_device_bind(struct data_device *data_device, struct wl_client *client,
 92+                 uint32_t version, uint32_t id)
 93 {
 94 	struct wl_resource *resource;
 95 
 96-	resource = wl_resource_create(client, &wl_data_device_interface, version, id);
 97-	if (!resource)
 98+	resource =
 99+	    wl_resource_create(client, &wl_data_device_interface, version, id);
100+	if (!resource) {
101 		return NULL;
102-	wl_resource_set_implementation(resource, &data_device_impl, data_device, &remove_resource);
103+	}
104+	wl_resource_set_implementation(resource, &data_device_impl, data_device,
105+	                               &remove_resource);
106 	wl_list_insert(&data_device->resources, &resource->link);
107 
108 	return resource;
109 }
110 
111 static struct wl_resource *
112-new_offer(struct wl_resource *resource, struct wl_client *client, struct wl_resource *source)
113+new_offer(struct wl_resource *resource, struct wl_client *client,
114+          struct wl_resource *source)
115 {
116 	struct wl_resource *offer;
117 
118 	offer = data_offer_new(client, source, wl_resource_get_version(resource));
119-	if (!offer)
120+	if (!offer) {
121 		return NULL;
122+	}
123 	wl_data_device_send_data_offer(resource, offer);
124 	data_send_mime_types(source, offer);
125 
126@@ -126,7 +141,8 @@ new_offer(struct wl_resource *resource, struct wl_client *client, struct wl_reso
127 }
128 
129 void
130-data_device_offer_selection(struct data_device *data_device, struct wl_client *client)
131+data_device_offer_selection(struct data_device *data_device,
132+                            struct wl_client *client)
133 {
134 	struct wl_resource *resource;
135 	struct wl_resource *offer = NULL;
136@@ -135,12 +151,14 @@ data_device_offer_selection(struct data_device *data_device, struct wl_client *c
137 	resource = wl_resource_find_for_client(&data_device->resources, client);
138 
139 	/* If the client does not have a data device, there is nothing to do. */
140-	if (!resource)
141+	if (!resource) {
142 		return;
143+	}
144 
145 	/* If we have a selection, create a new offer for the client. */
146-	if (data_device->selection)
147+	if (data_device->selection) {
148 		offer = new_offer(resource, client, data_device->selection);
149+	}
150 
151 	wl_data_device_send_selection(resource, offer);
152 }
+11, -7
 1@@ -27,9 +27,7 @@
 2 #include <stdbool.h>
 3 #include <wayland-server.h>
 4 
 5-enum {
 6-	DATA_DEVICE_EVENT_SELECTION_CHANGED
 7-};
 8+enum { DATA_DEVICE_EVENT_SELECTION_CHANGED };
 9 
10 struct data_device {
11 	/* The data source corresponding to the current selection. */
12@@ -40,9 +38,15 @@ struct data_device {
13 	struct wl_list resources;
14 };
15 
16-struct data_device *data_device_create(void);
17-void data_device_destroy(struct data_device *data_device);
18-struct wl_resource *data_device_bind(struct data_device *data_device, struct wl_client *client, uint32_t version, uint32_t id);
19-void data_device_offer_selection(struct data_device *data_device, struct wl_client *client);
20+struct data_device *
21+data_device_create(void);
22+void
23+data_device_destroy(struct data_device *data_device);
24+struct wl_resource *
25+data_device_bind(struct data_device *data_device, struct wl_client *client,
26+                 uint32_t version, uint32_t id);
27+void
28+data_device_offer_selection(struct data_device *data_device,
29+                            struct wl_client *client);
30 
31 #endif
+21, -11
 1@@ -28,41 +28,51 @@
 2 #include "seat.h"
 3 
 4 static void
 5-create_data_source(struct wl_client *client, struct wl_resource *resource, uint32_t id)
 6+create_data_source(struct wl_client *client, struct wl_resource *resource,
 7+                   uint32_t id)
 8 {
 9-	if (!data_source_new(client, wl_resource_get_version(resource), id))
10+	if (!data_source_new(client, wl_resource_get_version(resource), id)) {
11 		wl_resource_post_no_memory(resource);
12+	}
13 }
14 
15 static void
16-get_data_device(struct wl_client *client, struct wl_resource *resource, uint32_t id, struct wl_resource *seat_resource)
17+get_data_device(struct wl_client *client, struct wl_resource *resource,
18+                uint32_t id, struct wl_resource *seat_resource)
19 {
20 	struct swc_seat *seat = wl_resource_get_user_data(seat_resource);
21 
22-	if (!data_device_bind(seat->data_device, client, wl_resource_get_version(resource), id))
23+	if (!data_device_bind(seat->data_device, client,
24+	                      wl_resource_get_version(resource), id)) {
25 		wl_resource_post_no_memory(resource);
26+	}
27 }
28 
29-static const struct wl_data_device_manager_interface data_device_manager_impl = {
30-	.create_data_source = create_data_source,
31-	.get_data_device = get_data_device,
32+static const struct wl_data_device_manager_interface data_device_manager_impl =
33+    {
34+        .create_data_source = create_data_source,
35+        .get_data_device = get_data_device,
36 };
37 
38 static void
39-bind_data_device_manager(struct wl_client *client, void *data, uint32_t version, uint32_t id)
40+bind_data_device_manager(struct wl_client *client, void *data, uint32_t version,
41+                         uint32_t id)
42 {
43 	struct wl_resource *resource;
44 
45-	resource = wl_resource_create(client, &wl_data_device_manager_interface, version, id);
46+	resource = wl_resource_create(client, &wl_data_device_manager_interface,
47+	                              version, id);
48 	if (!resource) {
49 		wl_client_post_no_memory(client);
50 		return;
51 	}
52-	wl_resource_set_implementation(resource, &data_device_manager_impl, NULL, NULL);
53+	wl_resource_set_implementation(resource, &data_device_manager_impl, NULL,
54+	                               NULL);
55 }
56 
57 struct wl_global *
58 data_device_manager_create(struct wl_display *display)
59 {
60-	return wl_global_create(display, &wl_data_device_manager_interface, 2, NULL, &bind_data_device_manager);
61+	return wl_global_create(display, &wl_data_device_manager_interface, 2, NULL,
62+	                        &bind_data_device_manager);
63 }
+2, -1
1@@ -26,6 +26,7 @@
2 
3 struct wl_display;
4 
5-struct wl_global *data_device_manager_create(struct wl_display *display);
6+struct wl_global *
7+data_device_manager_create(struct wl_display *display);
8 
9 #endif
+74, -39
  1@@ -27,13 +27,13 @@
  2 #include "util.h"
  3 #include "wayland_buffer.h"
  4 
  5+#include "linux-dmabuf-unstable-v1-server-protocol.h"
  6+#include <drm_fourcc.h>
  7 #include <stdint.h>
  8 #include <stdlib.h>
  9-#include <drm_fourcc.h>
 10 #include <unistd.h>
 11-#include <wld/wld.h>
 12 #include <wld/drm.h>
 13-#include "linux-dmabuf-unstable-v1-server-protocol.h"
 14+#include <wld/wld.h>
 15 
 16 struct params {
 17 	struct wl_resource *resource;
 18@@ -45,20 +45,28 @@ struct params {
 19 };
 20 
 21 static void
 22-add(struct wl_client *client, struct wl_resource *resource, int32_t fd, uint32_t i, uint32_t offset, uint32_t stride, uint32_t modifier_hi, uint32_t modifier_lo)
 23+add(struct wl_client *client, struct wl_resource *resource, int32_t fd,
 24+    uint32_t i, uint32_t offset, uint32_t stride, uint32_t modifier_hi,
 25+    uint32_t modifier_lo)
 26 {
 27 	struct params *params = wl_resource_get_user_data(resource);
 28 
 29 	if (params->created) {
 30-		wl_resource_post_error(resource, ZWP_LINUX_BUFFER_PARAMS_V1_ERROR_ALREADY_USED, "buffer already created");
 31+		wl_resource_post_error(resource,
 32+		                       ZWP_LINUX_BUFFER_PARAMS_V1_ERROR_ALREADY_USED,
 33+		                       "buffer already created");
 34 		return;
 35 	}
 36 	if (i > ARRAY_LENGTH(params->fd)) {
 37-		wl_resource_post_error(resource, ZWP_LINUX_BUFFER_PARAMS_V1_ERROR_PLANE_IDX, "plane index too large");
 38+		wl_resource_post_error(resource,
 39+		                       ZWP_LINUX_BUFFER_PARAMS_V1_ERROR_PLANE_IDX,
 40+		                       "plane index too large");
 41 		return;
 42 	}
 43 	if (params->fd[i] != -1) {
 44-		wl_resource_post_error(resource, ZWP_LINUX_BUFFER_PARAMS_V1_ERROR_PLANE_SET, "buffer plane already set");
 45+		wl_resource_post_error(resource,
 46+		                       ZWP_LINUX_BUFFER_PARAMS_V1_ERROR_PLANE_SET,
 47+		                       "buffer plane already set");
 48 		return;
 49 	}
 50 	params->fd[i] = fd;
 51@@ -68,8 +76,9 @@ add(struct wl_client *client, struct wl_resource *resource, int32_t fd, uint32_t
 52 }
 53 
 54 static void
 55-create_immed(struct wl_client *client, struct wl_resource *resource, uint32_t id,
 56-             int32_t width, int32_t height, uint32_t format, uint32_t flags)
 57+create_immed(struct wl_client *client, struct wl_resource *resource,
 58+             uint32_t id, int32_t width, int32_t height, uint32_t format,
 59+             uint32_t flags)
 60 {
 61 	struct params *params = wl_resource_get_user_data(resource);
 62 	struct wld_buffer *buffer;
 63@@ -78,7 +87,9 @@ create_immed(struct wl_client *client, struct wl_resource *resource, uint32_t id
 64 	int num_planes, i;
 65 
 66 	if (params->created) {
 67-		wl_resource_post_error(resource, ZWP_LINUX_BUFFER_PARAMS_V1_ERROR_ALREADY_USED, "buffer already created");
 68+		wl_resource_post_error(resource,
 69+		                       ZWP_LINUX_BUFFER_PARAMS_V1_ERROR_ALREADY_USED,
 70+		                       "buffer already created");
 71 		return;
 72 	}
 73 	params->created = true;
 74@@ -88,49 +99,62 @@ create_immed(struct wl_client *client, struct wl_resource *resource, uint32_t id
 75 		num_planes = 1;
 76 		break;
 77 	default:
 78-		wl_resource_post_error(resource, ZWP_LINUX_BUFFER_PARAMS_V1_ERROR_INVALID_FORMAT, "unsupported format %#" PRIx32, format);
 79+		wl_resource_post_error(resource,
 80+		                       ZWP_LINUX_BUFFER_PARAMS_V1_ERROR_INVALID_FORMAT,
 81+		                       "unsupported format %#" PRIx32, format);
 82 		return;
 83 	}
 84 	for (i = 0; i < num_planes; ++i) {
 85-		if (params->fd[i] == -1)
 86-			wl_resource_post_error(resource, ZWP_LINUX_BUFFER_PARAMS_V1_ERROR_INCOMPLETE, "missing plane %d", i);
 87+		if (params->fd[i] == -1) {
 88+			wl_resource_post_error(resource,
 89+			                       ZWP_LINUX_BUFFER_PARAMS_V1_ERROR_INCOMPLETE,
 90+			                       "missing plane %d", i);
 91+		}
 92 	}
 93 	for (; i < ARRAY_LENGTH(params->fd); ++i) {
 94-		if (params->fd[i] != -1)
 95-			wl_resource_post_error(resource, ZWP_LINUX_BUFFER_PARAMS_V1_ERROR_INCOMPLETE, "too many planes");
 96+		if (params->fd[i] != -1) {
 97+			wl_resource_post_error(resource,
 98+			                       ZWP_LINUX_BUFFER_PARAMS_V1_ERROR_INCOMPLETE,
 99+			                       "too many planes");
100+		}
101 	}
102 	object.i = params->fd[0];
103-	buffer = wld_import_buffer(swc.drm->context, WLD_DRM_OBJECT_PRIME_FD, object, width, height, format, params->stride[0]);
104+	buffer =
105+	    wld_import_buffer(swc.drm->context, WLD_DRM_OBJECT_PRIME_FD, object,
106+	                      width, height, format, params->stride[0]);
107 	for (i = 0; i < num_planes; ++i) {
108 		close(params->fd[i]);
109 		params->fd[i] = -1;
110 	}
111-	if (!buffer)
112+	if (!buffer) {
113 		zwp_linux_buffer_params_v1_send_failed(resource);
114+	}
115 
116 	buffer_resource = wayland_buffer_create_resource(client, 1, id, buffer);
117 	if (!buffer_resource) {
118-		if (buffer)
119+		if (buffer) {
120 			wld_buffer_unreference(buffer);
121+		}
122 		wl_resource_post_no_memory(resource);
123 		return;
124 	}
125-	if (id == 0 && buffer)
126+	if (id == 0 && buffer) {
127 		zwp_linux_buffer_params_v1_send_created(resource, buffer_resource);
128+	}
129 }
130 
131 static void
132-create(struct wl_client *client, struct wl_resource *resource,
133-       int32_t width, int32_t height, uint32_t format, uint32_t flags)
134+create(struct wl_client *client, struct wl_resource *resource, int32_t width,
135+       int32_t height, uint32_t format, uint32_t flags)
136 {
137 	create_immed(client, resource, 0, width, height, format, flags);
138 }
139 
140 static const struct zwp_linux_buffer_params_v1_interface params_impl = {
141-	.destroy = destroy_resource,
142-	.add = add,
143-	.create = create,
144-	.create_immed = create_immed,
145+    .destroy = destroy_resource,
146+    .add = add,
147+    .create = create,
148+    .create_immed = create_immed,
149 };
150 
151 static void
152@@ -139,26 +163,34 @@ params_destroy(struct wl_resource *resource)
153 	struct params *params = wl_resource_get_user_data(resource);
154 	int i;
155 
156-	for (i = 0; i < ARRAY_LENGTH(params->fd); ++i)
157+	for (i = 0; i < ARRAY_LENGTH(params->fd); ++i) {
158 		close(params->fd[i]);
159+	}
160 }
161 
162 static void
163-create_params(struct wl_client *client, struct wl_resource *resource, uint32_t id)
164+create_params(struct wl_client *client, struct wl_resource *resource,
165+              uint32_t id)
166 {
167 	struct params *params;
168 	int i;
169 
170 	params = malloc(sizeof(*params));
171-	if (!params)
172+	if (!params) {
173 		goto error0;
174+	}
175 	params->created = false;
176-	params->resource = wl_resource_create(client, &zwp_linux_buffer_params_v1_interface, wl_resource_get_version(resource), id);
177-	if (!params->resource)
178+	params->resource =
179+	    wl_resource_create(client, &zwp_linux_buffer_params_v1_interface,
180+	                       wl_resource_get_version(resource), id);
181+	if (!params->resource) {
182 		goto error1;
183-	for (i = 0; i < ARRAY_LENGTH(params->fd); ++i)
184+	}
185+	for (i = 0; i < ARRAY_LENGTH(params->fd); ++i) {
186 		params->fd[i] = -1;
187-	wl_resource_set_implementation(params->resource, &params_impl, params, params_destroy);
188+	}
189+	wl_resource_set_implementation(params->resource, &params_impl, params,
190+	                               params_destroy);
191 	return;
192 
193 error1:
194@@ -168,22 +200,23 @@ error0:
195 }
196 
197 static const struct zwp_linux_dmabuf_v1_interface dmabuf_impl = {
198-	.destroy = destroy_resource,
199-	.create_params = create_params,
200+    .destroy = destroy_resource,
201+    .create_params = create_params,
202 };
203 
204 static void
205 bind_dmabuf(struct wl_client *client, void *data, uint32_t version, uint32_t id)
206 {
207 	static const uint32_t formats[] = {
208-		DRM_FORMAT_XRGB8888,
209-		DRM_FORMAT_ARGB8888,
210+	    DRM_FORMAT_XRGB8888,
211+	    DRM_FORMAT_ARGB8888,
212 	};
213 	uint64_t modifier = DRM_FORMAT_MOD_INVALID;
214 	struct wl_resource *resource;
215 	size_t i;
216 
217-	resource = wl_resource_create(client, &zwp_linux_dmabuf_v1_interface, version, id);
218+	resource =
219+	    wl_resource_create(client, &zwp_linux_dmabuf_v1_interface, version, id);
220 	if (!resource) {
221 		wl_client_post_no_memory(client);
222 		return;
223@@ -192,7 +225,8 @@ bind_dmabuf(struct wl_client *client, void *data, uint32_t version, uint32_t id)
224 	for (i = 0; i < ARRAY_LENGTH(formats); ++i) {
225 		if (version >= 3) {
226 			/* TODO: need a way to query DRM modifiers of wld */
227-			zwp_linux_dmabuf_v1_send_modifier(resource, formats[i], modifier >> 32, modifier & 0xffffffff);
228+			zwp_linux_dmabuf_v1_send_modifier(
229+			    resource, formats[i], modifier >> 32, modifier & 0xffffffff);
230 		} else {
231 			zwp_linux_dmabuf_v1_send_format(resource, formats[i]);
232 		}
233@@ -202,5 +236,6 @@ bind_dmabuf(struct wl_client *client, void *data, uint32_t version, uint32_t id)
234 struct wl_global *
235 swc_dmabuf_create(struct wl_display *display)
236 {
237-	return wl_global_create(display, &zwp_linux_dmabuf_v1_interface, 3, NULL, &bind_dmabuf);
238+	return wl_global_create(display, &zwp_linux_dmabuf_v1_interface, 3, NULL,
239+	                        &bind_dmabuf);
240 }
+2, -1
1@@ -26,6 +26,7 @@
2 
3 struct wl_display;
4 
5-struct wl_global *swc_dmabuf_create(struct wl_display *display);
6+struct wl_global *
7+swc_dmabuf_create(struct wl_display *display);
8 
9 #endif
+103, -67
  1@@ -32,21 +32,21 @@
  2 #include "util.h"
  3 #include "wayland_buffer.h"
  4 
  5+#include "wayland-drm-server-protocol.h"
  6 #include <dirent.h>
  7+#include <drm.h>
  8 #include <errno.h>
  9+#include <fcntl.h>
 10 #include <limits.h>
 11 #include <stdio.h>
 12 #include <stdlib.h>
 13 #include <string.h>
 14 #include <strings.h>
 15-#include <fcntl.h>
 16 #include <unistd.h>
 17-#include <drm.h>
 18-#include <xf86drm.h>
 19-#include <wld/wld.h>
 20-#include <wld/drm.h>
 21 #include <wayland-server.h>
 22-#include "wayland-drm-server-protocol.h"
 23+#include <wld/drm.h>
 24+#include <wld/wld.h>
 25+#include <xf86drm.h>
 26 
 27 struct swc_drm swc_drm;
 28 
 29@@ -59,49 +59,58 @@ static struct {
 30 } drm;
 31 
 32 static void
 33-authenticate(struct wl_client *client, struct wl_resource *resource, uint32_t magic)
 34+authenticate(struct wl_client *client, struct wl_resource *resource,
 35+             uint32_t magic)
 36 {
 37 	wl_drm_send_authenticated(resource);
 38 }
 39 
 40 static void
 41-create_buffer(struct wl_client *client, struct wl_resource *resource, uint32_t id,
 42-              uint32_t name, int32_t width, int32_t height, uint32_t stride, uint32_t format)
 43+create_buffer(struct wl_client *client, struct wl_resource *resource,
 44+              uint32_t id, uint32_t name, int32_t width, int32_t height,
 45+              uint32_t stride, uint32_t format)
 46 {
 47-	wl_resource_post_error(resource, WL_DRM_ERROR_INVALID_NAME, "GEM names are not supported, use a PRIME fd instead");
 48+	wl_resource_post_error(
 49+	    resource, WL_DRM_ERROR_INVALID_NAME,
 50+	    "GEM names are not supported, use a PRIME fd instead");
 51 }
 52 
 53 static void
 54-create_planar_buffer(struct wl_client *client, struct wl_resource *resource, uint32_t id,
 55-                     uint32_t name, int32_t width, int32_t height, uint32_t format,
 56-                     int32_t offset0, int32_t stride0,
 57-                     int32_t offset1, int32_t stride1,
 58-                     int32_t offset2, int32_t stride2)
 59+create_planar_buffer(struct wl_client *client, struct wl_resource *resource,
 60+                     uint32_t id, uint32_t name, int32_t width, int32_t height,
 61+                     uint32_t format, int32_t offset0, int32_t stride0,
 62+                     int32_t offset1, int32_t stride1, int32_t offset2,
 63+                     int32_t stride2)
 64 {
 65-	wl_resource_post_error(resource, WL_DRM_ERROR_INVALID_FORMAT, "planar buffers are not supported\n");
 66+	wl_resource_post_error(resource, WL_DRM_ERROR_INVALID_FORMAT,
 67+	                       "planar buffers are not supported\n");
 68 }
 69 
 70 static void
 71-create_prime_buffer(struct wl_client *client, struct wl_resource *resource, uint32_t id,
 72-                    int32_t fd, int32_t width, int32_t height, uint32_t format,
 73-                    int32_t offset0, int32_t stride0,
 74-                    int32_t offset1, int32_t stride1,
 75-                    int32_t offset2, int32_t stride2)
 76+create_prime_buffer(struct wl_client *client, struct wl_resource *resource,
 77+                    uint32_t id, int32_t fd, int32_t width, int32_t height,
 78+                    uint32_t format, int32_t offset0, int32_t stride0,
 79+                    int32_t offset1, int32_t stride1, int32_t offset2,
 80+                    int32_t stride2)
 81 {
 82 	struct wld_buffer *buffer;
 83 	struct wl_resource *buffer_resource;
 84-	union wld_object object = { .i = fd };
 85+	union wld_object object = {.i = fd};
 86 
 87-	buffer = wld_import_buffer(swc.drm->context, WLD_DRM_OBJECT_PRIME_FD, object, width, height, format, stride0);
 88+	buffer = wld_import_buffer(swc.drm->context, WLD_DRM_OBJECT_PRIME_FD,
 89+	                           object, width, height, format, stride0);
 90 	close(fd);
 91 
 92-	if (!buffer)
 93+	if (!buffer) {
 94 		goto error0;
 95+	}
 96 
 97-	buffer_resource = wayland_buffer_create_resource(client, wl_resource_get_version(resource), id, buffer);
 98+	buffer_resource = wayland_buffer_create_resource(
 99+	    client, wl_resource_get_version(resource), id, buffer);
100 
101-	if (!buffer_resource)
102+	if (!buffer_resource) {
103 		goto error1;
104+	}
105 
106 	return;
107 
108@@ -112,10 +121,10 @@ error0:
109 }
110 
111 static const struct wl_drm_interface drm_impl = {
112-	.authenticate = authenticate,
113-	.create_buffer = create_buffer,
114-	.create_planar_buffer = create_planar_buffer,
115-	.create_prime_buffer = create_prime_buffer,
116+    .authenticate = authenticate,
117+    .create_buffer = create_buffer,
118+    .create_planar_buffer = create_planar_buffer,
119+    .create_prime_buffer = create_prime_buffer,
120 };
121 
122 static int
123@@ -136,11 +145,13 @@ find_primary_drm_device(char *path, size_t size)
124 
125 	num_cards = scandir("/dev/dri", &cards, &select_card, &alphasort);
126 
127-	if (num_cards == -1)
128+	if (num_cards == -1) {
129 		return false;
130+	}
131 
132 	for (index = 0; index < num_cards; ++index) {
133-		snprintf(path, size, "/sys/class/drm/%s/device/boot_vga", cards[index]->d_name);
134+		snprintf(path, size, "/sys/class/drm/%s/device/boot_vga",
135+		         cards[index]->d_name);
136 
137 		if ((file = fopen(path, "r"))) {
138 			ret = fscanf(file, "%hhu", &boot_vga);
139@@ -154,26 +165,30 @@ find_primary_drm_device(char *path, size_t size)
140 			}
141 		}
142 
143-		if (!card)
144+		if (!card) {
145 			card = cards[index];
146-		else
147+		} else {
148 			free(cards[index]);
149+		}
150 	}
151 
152 	free(cards);
153 
154-	if (!card)
155+	if (!card) {
156 		return false;
157+	}
158 
159-	if (snprintf(path, size, "/dev/dri/%s", card->d_name) >= size)
160+	if (snprintf(path, size, "/dev/dri/%s", card->d_name) >= size) {
161 		return false;
162+	}
163 
164 	free(card);
165 	return true;
166 }
167 
168 static bool
169-find_available_crtc(drmModeRes *resources, drmModeConnector *connector, uint32_t taken_crtcs, int *crtc_index)
170+find_available_crtc(drmModeRes *resources, drmModeConnector *connector,
171+                    uint32_t taken_crtcs, int *crtc_index)
172 {
173 	int i, j;
174 	uint32_t possible_crtcs;
175@@ -196,12 +211,14 @@ find_available_crtc(drmModeRes *resources, drmModeConnector *connector, uint32_t
176 }
177 
178 static void
179-handle_vblank(int fd, unsigned int sequence, unsigned int sec, unsigned int usec, void *data)
180+handle_vblank(int fd, unsigned int sequence, unsigned int sec,
181+              unsigned int usec, void *data)
182 {
183 }
184 
185 static void
186-handle_page_flip(int fd, unsigned int sequence, unsigned int sec, unsigned int usec, unsigned int crtc_id, void *data)
187+handle_page_flip(int fd, unsigned int sequence, unsigned int sec,
188+                 unsigned int usec, unsigned int crtc_id, void *data)
189 {
190 	struct drm_handler *handler = data;
191 
192@@ -209,9 +226,9 @@ handle_page_flip(int fd, unsigned int sequence, unsigned int sec, unsigned int u
193 }
194 
195 static drmEventContext event_context = {
196-	.version = DRM_EVENT_CONTEXT_VERSION,
197-	.vblank_handler = handle_vblank,
198-	.page_flip_handler2 = handle_page_flip,
199+    .version = DRM_EVENT_CONTEXT_VERSION,
200+    .vblank_handler = handle_vblank,
201+    .page_flip_handler2 = handle_page_flip,
202 };
203 
204 static int
205@@ -233,8 +250,9 @@ bind_drm(struct wl_client *client, void *data, uint32_t version, uint32_t id)
206 	}
207 	wl_resource_set_implementation(resource, &drm_impl, NULL, NULL);
208 
209-	if (version >= 2)
210+	if (version >= 2) {
211 		wl_drm_send_capabilities(resource, WL_DRM_CAPABILITY_PRIME);
212+	}
213 
214 	wl_drm_send_device(resource, drm.path);
215 	wl_drm_send_format(resource, WL_DRM_FORMAT_XRGB8888);
216@@ -261,11 +279,13 @@ drm_initialize(void)
217 		ERROR("Could not enable DRM universal planes\n");
218 		goto error1;
219 	}
220-	if (drmGetCap(swc.drm->fd, DRM_CAP_CURSOR_WIDTH, &val) < 0)
221+	if (drmGetCap(swc.drm->fd, DRM_CAP_CURSOR_WIDTH, &val) < 0) {
222 		val = 64;
223+	}
224 	swc.drm->cursor_w = val;
225-	if (drmGetCap(swc.drm->fd, DRM_CAP_CURSOR_HEIGHT, &val) < 0)
226+	if (drmGetCap(swc.drm->fd, DRM_CAP_CURSOR_HEIGHT, &val) < 0) {
227 		val = 64;
228+	}
229 	swc.drm->cursor_h = val;
230 
231 	drm.path = drmGetRenderDeviceNameFromFd(swc.drm->fd);
232@@ -284,7 +304,8 @@ drm_initialize(void)
233 		goto error2;
234 	}
235 
236-	drm.event_source = wl_event_loop_add_fd(swc.event_loop, swc.drm->fd, WL_EVENT_READABLE, &handle_data, NULL);
237+	drm.event_source = wl_event_loop_add_fd(
238+	    swc.event_loop, swc.drm->fd, WL_EVENT_READABLE, &handle_data, NULL);
239 
240 	if (!drm.event_source) {
241 		ERROR("Could not create DRM event source\n");
242@@ -292,7 +313,8 @@ drm_initialize(void)
243 	}
244 
245 	if (!wld_drm_is_dumb(swc.drm->context)) {
246-		drm.global = wl_global_create(swc.display, &wl_drm_interface, 2, NULL, &bind_drm);
247+		drm.global = wl_global_create(swc.display, &wl_drm_interface, 2, NULL,
248+		                              &bind_drm);
249 		if (!drm.global) {
250 			ERROR("Could not create wl_drm global\n");
251 			goto error4;
252@@ -321,8 +343,9 @@ error0:
253 void
254 drm_finalize(void)
255 {
256-	if (drm.global)
257+	if (drm.global) {
258 		wl_global_destroy(drm.global);
259+	}
260 	wl_event_source_remove(drm.event_source);
261 	wld_destroy_renderer(swc.drm->renderer);
262 	wld_destroy_context(swc.drm->context);
263@@ -349,8 +372,9 @@ drm_create_screens(struct wl_list *screens)
264 	wl_list_init(&planes);
265 	for (i = 0; i < plane_ids->count_planes; ++i) {
266 		plane = plane_new(plane_ids->planes[i]);
267-		if (plane)
268+		if (plane) {
269 			wl_list_insert(&planes, &plane->link);
270+		}
271 	}
272 	drmModeFreePlaneResources(plane_ids);
273 
274@@ -359,33 +383,40 @@ drm_create_screens(struct wl_list *screens)
275 		ERROR("Could not get DRM resources\n");
276 		return false;
277 	}
278-	for (i = 0; i < resources->count_connectors; ++i, drmModeFreeConnector(connector)) {
279+	for (i = 0; i < resources->count_connectors;
280+	     ++i, drmModeFreeConnector(connector)) {
281 		connector = drmModeGetConnector(swc.drm->fd, resources->connectors[i]);
282 
283 		if (connector->connection == DRM_MODE_CONNECTED) {
284 			int crtc_index;
285 
286-			if (!find_available_crtc(resources, connector, taken_crtcs, &crtc_index)) {
287+			if (!find_available_crtc(resources, connector, taken_crtcs,
288+			                         &crtc_index)) {
289 				WARNING("Could not find CRTC for connector %d\n", i);
290 				continue;
291 			}
292 
293 			cursor_plane = NULL;
294-			wl_list_for_each (plane, &planes, link) {
295-				if (plane->type == DRM_PLANE_TYPE_CURSOR && plane->possible_crtcs & 1 << crtc_index) {
296+			wl_list_for_each(plane, &planes, link)
297+			{
298+				if (plane->type == DRM_PLANE_TYPE_CURSOR &&
299+				    plane->possible_crtcs & 1 << crtc_index) {
300 					wl_list_remove(&plane->link);
301 					cursor_plane = plane;
302 					break;
303 				}
304 			}
305 			if (!cursor_plane) {
306-				WARNING("Could not find cursor plane for CRTC %d\n", crtc_index);
307+				WARNING("Could not find cursor plane for CRTC %d\n",
308+				        crtc_index);
309 			}
310 
311-			if (!(output = output_new(connector)))
312+			if (!(output = output_new(connector))) {
313 				continue;
314+			}
315 
316-			output->screen = screen_new(resources->crtcs[crtc_index], output, cursor_plane);
317+			output->screen =
318+			    screen_new(resources->crtcs[crtc_index], output, cursor_plane);
319 			output->screen->id = crtc_index;
320 			taken_crtcs |= 1 << crtc_index;
321 
322@@ -397,9 +428,7 @@ drm_create_screens(struct wl_list *screens)
323 	return true;
324 }
325 
326-enum {
327-	WLD_USER_OBJECT_FRAMEBUFFER = WLD_USER_ID
328-};
329+enum { WLD_USER_OBJECT_FRAMEBUFFER = WLD_USER_ID };
330 
331 struct framebuffer {
332 	struct wld_exporter exporter;
333@@ -408,9 +437,11 @@ struct framebuffer {
334 };
335 
336 static bool
337-framebuffer_export(struct wld_exporter *exporter, struct wld_buffer *buffer, uint32_t type, union wld_object *object)
338+framebuffer_export(struct wld_exporter *exporter, struct wld_buffer *buffer,
339+                   uint32_t type, union wld_object *object)
340 {
341-	struct framebuffer *framebuffer = wl_container_of(exporter, framebuffer, exporter);
342+	struct framebuffer *framebuffer =
343+	    wl_container_of(exporter, framebuffer, exporter);
344 
345 	switch (type) {
346 	case WLD_USER_OBJECT_FRAMEBUFFER:
347@@ -426,7 +457,8 @@ framebuffer_export(struct wld_exporter *exporter, struct wld_buffer *buffer, uin
348 static void
349 framebuffer_destroy(struct wld_destructor *destructor)
350 {
351-	struct framebuffer *framebuffer = wl_container_of(destructor, framebuffer, destructor);
352+	struct framebuffer *framebuffer =
353+	    wl_container_of(destructor, framebuffer, destructor);
354 
355 	drmModeRmFB(swc.drm->fd, framebuffer->id);
356 	free(framebuffer);
357@@ -439,22 +471,26 @@ drm_get_framebuffer(struct wld_buffer *buffer)
358 	union wld_object object;
359 	int ret;
360 
361-	if (!buffer)
362+	if (!buffer) {
363 		return 0;
364+	}
365 
366-	if (wld_export(buffer, WLD_USER_OBJECT_FRAMEBUFFER, &object))
367+	if (wld_export(buffer, WLD_USER_OBJECT_FRAMEBUFFER, &object)) {
368 		return object.u32;
369+	}
370 
371 	if (!wld_export(buffer, WLD_DRM_OBJECT_HANDLE, &object)) {
372 		ERROR("Could not get buffer handle\n");
373 		return 0;
374 	}
375 
376-	if (!(framebuffer = malloc(sizeof(*framebuffer))))
377+	if (!(framebuffer = malloc(sizeof(*framebuffer)))) {
378 		return 0;
379+	}
380 
381-	ret = drmModeAddFB2(swc.drm->fd, buffer->width, buffer->height, buffer->format,
382-	                    (uint32_t[4]){object.u32}, (uint32_t[4]){buffer->pitch}, (uint32_t[4]){0},
383+	ret = drmModeAddFB2(swc.drm->fd, buffer->width, buffer->height,
384+	                    buffer->format, (uint32_t[4]){object.u32},
385+	                    (uint32_t[4]){buffer->pitch}, (uint32_t[4]){0},
386 	                    &framebuffer->id, 0);
387 	if (ret < 0) {
388 		free(framebuffer);
+8, -4
 1@@ -18,10 +18,14 @@ struct swc_drm {
 2 	struct wld_renderer *renderer;
 3 };
 4 
 5-bool drm_initialize(void);
 6-void drm_finalize(void);
 7+bool
 8+drm_initialize(void);
 9+void
10+drm_finalize(void);
11 
12-bool drm_create_screens(struct wl_list *screens);
13-uint32_t drm_get_framebuffer(struct wld_buffer *buffer);
14+bool
15+drm_create_screens(struct wl_list *screens);
16+uint32_t
17+drm_get_framebuffer(struct wld_buffer *buffer);
18 
19 #endif
+1, -1
1@@ -29,7 +29,7 @@ struct event {
2 static inline void
3 send_event(struct wl_signal *signal, uint32_t type, void *event_data)
4 {
5-	struct event event = { .type = type, .data = event_data };
6+	struct event event = {.type = type, .data = event_data};
7 	wl_signal_emit(signal, &event);
8 }
9 
+26, -13
  1@@ -35,26 +35,32 @@ focus(struct input_focus *input_focus, struct compositor_view *view)
  2 
  3 	if (view) {
  4 		client = wl_resource_get_client(view->surface->resource);
  5-		wl_resource_for_each_safe (resource, tmp, &input_focus->inactive) {
  6+		wl_resource_for_each_safe(resource, tmp, &input_focus->inactive)
  7+		{
  8 			if (wl_resource_get_client(resource) == client) {
  9 				wl_list_remove(wl_resource_get_link(resource));
 10-				wl_list_insert(&input_focus->active, wl_resource_get_link(resource));
 11+				wl_list_insert(&input_focus->active,
 12+				               wl_resource_get_link(resource));
 13 			}
 14 		}
 15-		wl_signal_add(&view->destroy_signal, &input_focus->view_destroy_listener);
 16+		wl_signal_add(&view->destroy_signal,
 17+		              &input_focus->view_destroy_listener);
 18 	}
 19 
 20 	input_focus->client = client;
 21 	input_focus->view = view;
 22-	input_focus->handler->enter(input_focus->handler, &input_focus->active, view);
 23+	input_focus->handler->enter(input_focus->handler, &input_focus->active,
 24+	                            view);
 25 }
 26 
 27 static void
 28 unfocus(struct input_focus *input_focus)
 29 {
 30-	if (input_focus->view)
 31+	if (input_focus->view) {
 32 		wl_list_remove(&input_focus->view_destroy_listener.link);
 33-	input_focus->handler->leave(input_focus->handler, &input_focus->active, input_focus->view);
 34+	}
 35+	input_focus->handler->leave(input_focus->handler, &input_focus->active,
 36+	                            input_focus->view);
 37 	wl_list_insert_list(&input_focus->inactive, &input_focus->active);
 38 	wl_list_init(&input_focus->active);
 39 }
 40@@ -62,7 +68,8 @@ unfocus(struct input_focus *input_focus)
 41 static void
 42 handle_focus_view_destroy(struct wl_listener *listener, void *data)
 43 {
 44-	struct input_focus *input_focus = wl_container_of(listener, input_focus, view_destroy_listener);
 45+	struct input_focus *input_focus =
 46+	    wl_container_of(listener, input_focus, view_destroy_listener);
 47 
 48 	/* XXX: Should this call unfocus? */
 49 	wl_list_insert_list(&input_focus->inactive, &input_focus->active);
 50@@ -72,7 +79,8 @@ handle_focus_view_destroy(struct wl_listener *listener, void *data)
 51 }
 52 
 53 bool
 54-input_focus_initialize(struct input_focus *input_focus, struct input_focus_handler *handler)
 55+input_focus_initialize(struct input_focus *input_focus,
 56+                       struct input_focus_handler *handler)
 57 {
 58 	input_focus->client = NULL;
 59 	input_focus->view = NULL;
 60@@ -93,16 +101,19 @@ input_focus_finalize(struct input_focus *input_focus)
 61 }
 62 
 63 void
 64-input_focus_add_resource(struct input_focus *input_focus, struct wl_resource *resource)
 65+input_focus_add_resource(struct input_focus *input_focus,
 66+                         struct wl_resource *resource)
 67 {
 68 	struct wl_list resources, *target = &input_focus->inactive;
 69 
 70 	wl_list_init(&resources);
 71 	wl_list_insert(&resources, wl_resource_get_link(resource));
 72 
 73-	/* If this new input resource corresponds to the focused client, send an enter event. */
 74+	/* If this new input resource corresponds to the focused client, send an
 75+	 * enter event. */
 76 	if (wl_resource_get_client(resource) == input_focus->client) {
 77-		input_focus->handler->enter(input_focus->handler, &resources, input_focus->view);
 78+		input_focus->handler->enter(input_focus->handler, &resources,
 79+		                            input_focus->view);
 80 		target = &input_focus->active;
 81 	}
 82 
 83@@ -110,7 +121,8 @@ input_focus_add_resource(struct input_focus *input_focus, struct wl_resource *re
 84 }
 85 
 86 void
 87-input_focus_remove_resource(struct input_focus *input_focus, struct wl_resource *resource)
 88+input_focus_remove_resource(struct input_focus *input_focus,
 89+                            struct wl_resource *resource)
 90 {
 91 	wl_list_remove(wl_resource_get_link(resource));
 92 }
 93@@ -120,8 +132,9 @@ input_focus_set(struct input_focus *input_focus, struct compositor_view *view)
 94 {
 95 	struct input_focus_event_data data;
 96 
 97-	if (view == input_focus->view)
 98+	if (view == input_focus->view) {
 99 		return;
100+	}
101 
102 	data.old = input_focus->view;
103 	data.new = view;
+18, -10
 1@@ -29,17 +29,17 @@
 2 
 3 /* Focus {{{ */
 4 
 5-enum {
 6-	INPUT_FOCUS_EVENT_CHANGED
 7-};
 8+enum { INPUT_FOCUS_EVENT_CHANGED };
 9 
10 struct input_focus_event_data {
11 	struct compositor_view *old, *new;
12 };
13 
14 struct input_focus_handler {
15-	void (*enter)(struct input_focus_handler *handler, struct wl_list *resources, struct compositor_view *view);
16-	void (*leave)(struct input_focus_handler *handler, struct wl_list *resources, struct compositor_view *view);
17+	void (*enter)(struct input_focus_handler *handler,
18+	              struct wl_list *resources, struct compositor_view *view);
19+	void (*leave)(struct input_focus_handler *handler,
20+	              struct wl_list *resources, struct compositor_view *view);
21 };
22 
23 struct input_focus {
24@@ -53,11 +53,19 @@ struct input_focus {
25 	struct wl_signal event_signal;
26 };
27 
28-bool input_focus_initialize(struct input_focus *input_focus, struct input_focus_handler *input_handler);
29-void input_focus_finalize(struct input_focus *input_focus);
30-void input_focus_add_resource(struct input_focus *input_focus, struct wl_resource *resource);
31-void input_focus_remove_resource(struct input_focus *input_focus, struct wl_resource *resource);
32-void input_focus_set(struct input_focus *input_focus, struct compositor_view *view);
33+bool
34+input_focus_initialize(struct input_focus *input_focus,
35+                       struct input_focus_handler *input_handler);
36+void
37+input_focus_finalize(struct input_focus *input_focus);
38+void
39+input_focus_add_resource(struct input_focus *input_focus,
40+                         struct wl_resource *resource);
41+void
42+input_focus_remove_resource(struct input_focus *input_focus,
43+                            struct wl_resource *resource);
44+void
45+input_focus_set(struct input_focus *input_focus, struct compositor_view *view);
46 
47 /* }}} */
48 
+5, -3
 1@@ -24,8 +24,8 @@
 2 #ifndef SWC_INTERNAL_H
 3 #define SWC_INTERNAL_H
 4 
 5-#include <wayland-server.h>
 6 #include <stdbool.h>
 7+#include <wayland-server.h>
 8 
 9 enum {
10 	SWC_EVENT_ACTIVATED,
11@@ -62,7 +62,9 @@ struct swc {
12 
13 extern struct swc swc;
14 
15-void swc_activate(void);
16-void swc_deactivate(void);
17+void
18+swc_activate(void);
19+void
20+swc_deactivate(void);
21 
22 #endif
+26, -14
 1@@ -24,11 +24,12 @@
 2 #include "kde_decoration.h"
 3 #include "util.h"
 4 
 5-#include <wayland-server.h>
 6 #include "server-decoration-server-protocol.h"
 7+#include <wayland-server.h>
 8 
 9 static void
10-request_mode(struct wl_client *client, struct wl_resource *resource, uint32_t mode)
11+request_mode(struct wl_client *client, struct wl_resource *resource,
12+             uint32_t mode)
13 {
14 	/* Server is required to send back the mode requested by
15 	 * the client, we just don't plan to do anything with it. */
16@@ -36,44 +37,55 @@ request_mode(struct wl_client *client, struct wl_resource *resource, uint32_t mo
17 }
18 
19 static const struct org_kde_kwin_server_decoration_interface decoration_impl = {
20-	.release = destroy_resource,
21-	.request_mode = request_mode,
22+    .release = destroy_resource,
23+    .request_mode = request_mode,
24 };
25 
26 static void
27-create(struct wl_client *client, struct wl_resource *resource, uint32_t id, struct wl_resource *toplevel_resource)
28+create(struct wl_client *client, struct wl_resource *resource, uint32_t id,
29+       struct wl_resource *toplevel_resource)
30 {
31 	struct wl_resource *decoration;
32 
33-	decoration = wl_resource_create(client, &org_kde_kwin_server_decoration_interface, wl_resource_get_version(resource), id);
34+	decoration =
35+	    wl_resource_create(client, &org_kde_kwin_server_decoration_interface,
36+	                       wl_resource_get_version(resource), id);
37 	if (!decoration) {
38 		wl_resource_post_no_memory(resource);
39 		return;
40 	}
41 	wl_resource_set_implementation(decoration, &decoration_impl, NULL, NULL);
42-	org_kde_kwin_server_decoration_send_mode(decoration, ORG_KDE_KWIN_SERVER_DECORATION_MANAGER_MODE_SERVER);
43+	org_kde_kwin_server_decoration_send_mode(
44+	    decoration, ORG_KDE_KWIN_SERVER_DECORATION_MANAGER_MODE_SERVER);
45 }
46 
47-static const struct org_kde_kwin_server_decoration_manager_interface decoration_manager_impl = {
48-	.create = create,
49+static const struct org_kde_kwin_server_decoration_manager_interface
50+    decoration_manager_impl = {
51+        .create = create,
52 };
53 
54 static void
55-bind_decoration_manager(struct wl_client *client, void *data, uint32_t version, uint32_t id)
56+bind_decoration_manager(struct wl_client *client, void *data, uint32_t version,
57+                        uint32_t id)
58 {
59 	struct wl_resource *resource;
60 
61-	resource = wl_resource_create(client, &org_kde_kwin_server_decoration_manager_interface, version, id);
62+	resource = wl_resource_create(
63+	    client, &org_kde_kwin_server_decoration_manager_interface, version, id);
64 	if (!resource) {
65 		wl_client_post_no_memory(client);
66 		return;
67 	}
68-	wl_resource_set_implementation(resource, &decoration_manager_impl, NULL, NULL);
69-	org_kde_kwin_server_decoration_manager_send_default_mode(resource, ORG_KDE_KWIN_SERVER_DECORATION_MANAGER_MODE_SERVER);
70+	wl_resource_set_implementation(resource, &decoration_manager_impl, NULL,
71+	                               NULL);
72+	org_kde_kwin_server_decoration_manager_send_default_mode(
73+	    resource, ORG_KDE_KWIN_SERVER_DECORATION_MANAGER_MODE_SERVER);
74 }
75 
76 struct wl_global *
77 kde_decoration_manager_create(struct wl_display *display)
78 {
79-	return wl_global_create(display, &org_kde_kwin_server_decoration_manager_interface, 1, NULL, &bind_decoration_manager);
80+	return wl_global_create(display,
81+	                        &org_kde_kwin_server_decoration_manager_interface,
82+	                        1, NULL, &bind_decoration_manager);
83 }
+2, -1
1@@ -26,6 +26,7 @@
2 
3 struct wl_display;
4 
5-struct wl_global *kde_decoration_manager_create(struct wl_display *display);
6+struct wl_global *
7+kde_decoration_manager_create(struct wl_display *display);
8 
9 #endif
+141, -67
  1@@ -25,11 +25,11 @@
  2  * SOFTWARE.
  3  */
  4 
  5-#include "swc.h"
  6+#include "keyboard.h"
  7 #include "compositor.h"
  8 #include "internal.h"
  9-#include "keyboard.h"
 10 #include "surface.h"
 11+#include "swc.h"
 12 #include "util.h"
 13 
 14 #include <assert.h>
 15@@ -44,44 +44,61 @@
 16 static const int repeat_delay = 500, repeat_rate = 40;
 17 
 18 static void
 19-enter(struct input_focus_handler *handler, struct wl_list *resources, struct compositor_view *view)
 20+enter(struct input_focus_handler *handler,
 21+      struct wl_list *resources,
 22+      struct compositor_view *view)
 23 {
 24-	struct keyboard *keyboard = wl_container_of(handler, keyboard, focus_handler);
 25+	struct keyboard *keyboard =
 26+	    wl_container_of(handler, keyboard, focus_handler);
 27 	struct keyboard_modifier_state *state = &keyboard->modifier_state;
 28 	struct wl_resource *resource;
 29 	uint32_t serial;
 30 
 31 	serial = wl_display_next_serial(swc.display);
 32-	wl_resource_for_each (resource, resources) {
 33-		wl_keyboard_send_modifiers(resource, serial, state->depressed, state->locked, state->latched, state->group);
 34-		wl_keyboard_send_enter(resource, serial, view->surface->resource, &keyboard->client_keys);
 35+	wl_resource_for_each(resource, resources)
 36+	{
 37+		wl_keyboard_send_modifiers(resource,
 38+		                           serial,
 39+		                           state->depressed,
 40+		                           state->locked,
 41+		                           state->latched,
 42+		                           state->group);
 43+		wl_keyboard_send_enter(
 44+		    resource, serial, view->surface->resource, &keyboard->client_keys);
 45 	}
 46 }
 47 
 48 static void
 49-leave(struct input_focus_handler *handler, struct wl_list *resources, struct compositor_view *view)
 50+leave(struct input_focus_handler *handler,
 51+      struct wl_list *resources,
 52+      struct compositor_view *view)
 53 {
 54 	struct wl_resource *resource;
 55 	uint32_t serial;
 56 
 57 	serial = wl_display_next_serial(swc.display);
 58-	wl_resource_for_each (resource, resources)
 59-		wl_keyboard_send_leave(resource, serial, view->surface->resource);
 60+	wl_resource_for_each(resource, resources)
 61+	    wl_keyboard_send_leave(resource, serial, view->surface->resource);
 62 }
 63 
 64 static bool
 65-client_handle_key(struct keyboard *keyboard, uint32_t time, struct key *key, uint32_t state)
 66+client_handle_key(struct keyboard *keyboard,
 67+                  uint32_t time,
 68+                  struct key *key,
 69+                  uint32_t state)
 70 {
 71 	uint32_t *value;
 72 	struct wl_resource *resource;
 73 
 74 	if (state == WL_KEYBOARD_KEY_STATE_PRESSED) {
 75-		if (!(value = wl_array_add(&keyboard->client_keys, sizeof(*value))))
 76+		if (!(value = wl_array_add(&keyboard->client_keys, sizeof(*value)))) {
 77 			return false;
 78+		}
 79 
 80 		*value = key->press.value;
 81 	} else {
 82-		wl_array_for_each (value, &keyboard->client_keys) {
 83+		wl_array_for_each(value, &keyboard->client_keys)
 84+		{
 85 			if (*value == key->press.value) {
 86 				array_remove(&keyboard->client_keys, value, sizeof(*value));
 87 				break;
 88@@ -89,23 +106,31 @@ client_handle_key(struct keyboard *keyboard, uint32_t time, struct key *key, uin
 89 		}
 90 	}
 91 
 92-	wl_resource_for_each (resource, &keyboard->focus.active)
 93-		wl_keyboard_send_key(resource, key->press.serial, time, key->press.value, state);
 94+	wl_resource_for_each(resource, &keyboard->focus.active)
 95+	    wl_keyboard_send_key(
 96+	        resource, key->press.serial, time, key->press.value, state);
 97 	return true;
 98 }
 99 
100 static bool
101-client_handle_modifiers(struct keyboard *keyboard, const struct keyboard_modifier_state *state)
102+client_handle_modifiers(struct keyboard *keyboard,
103+                        const struct keyboard_modifier_state *state)
104 {
105 	struct wl_resource *resource;
106 	uint32_t serial;
107 
108-	if (wl_list_empty(&keyboard->focus.active))
109+	if (wl_list_empty(&keyboard->focus.active)) {
110 		return false;
111+	}
112 
113 	serial = wl_display_next_serial(swc.display);
114-	wl_resource_for_each (resource, &keyboard->focus.active)
115-		wl_keyboard_send_modifiers(resource, serial, state->depressed, state->locked, state->latched, state->group);
116+	wl_resource_for_each(resource, &keyboard->focus.active)
117+	    wl_keyboard_send_modifiers(resource,
118+	                               serial,
119+	                               state->depressed,
120+	                               state->locked,
121+	                               state->latched,
122+	                               state->group);
123 	return true;
124 }
125 
126@@ -117,24 +142,33 @@ update_keymap(struct xkb *xkb)
127 	char *keymap_string;
128 	int ret;
129 
130-	if (!(keymap_directory = getenv("XDG_RUNTIME_DIR")))
131+	if (!(keymap_directory = getenv("XDG_RUNTIME_DIR"))) {
132 		keymap_directory = "/tmp";
133+	}
134 
135-	xkb->indices.ctrl = xkb_keymap_mod_get_index(xkb->keymap.map, XKB_MOD_NAME_CTRL);
136-	xkb->indices.alt = xkb_keymap_mod_get_index(xkb->keymap.map, XKB_MOD_NAME_ALT);
137-	xkb->indices.super = xkb_keymap_mod_get_index(xkb->keymap.map, XKB_MOD_NAME_LOGO);
138-	xkb->indices.shift = xkb_keymap_mod_get_index(xkb->keymap.map, XKB_MOD_NAME_SHIFT);
139+	xkb->indices.ctrl =
140+	    xkb_keymap_mod_get_index(xkb->keymap.map, XKB_MOD_NAME_CTRL);
141+	xkb->indices.alt =
142+	    xkb_keymap_mod_get_index(xkb->keymap.map, XKB_MOD_NAME_ALT);
143+	xkb->indices.super =
144+	    xkb_keymap_mod_get_index(xkb->keymap.map, XKB_MOD_NAME_LOGO);
145+	xkb->indices.shift =
146+	    xkb_keymap_mod_get_index(xkb->keymap.map, XKB_MOD_NAME_SHIFT);
147 
148 	/* In order to send the keymap to clients, we must first convert it to a
149 	 * string and then mmap it to a file. */
150-	keymap_string = xkb_keymap_get_as_string(xkb->keymap.map, XKB_KEYMAP_FORMAT_TEXT_V1);
151+	keymap_string =
152+	    xkb_keymap_get_as_string(xkb->keymap.map, XKB_KEYMAP_FORMAT_TEXT_V1);
153 
154 	if (!keymap_string) {
155 		WARNING("Could not get XKB keymap as a string\n");
156 		goto error0;
157 	}
158 
159-	ret = snprintf(keymap_path, sizeof(keymap_path), "%s/swc-xkb-keymap-XXXXXX", keymap_directory);
160+	ret = snprintf(keymap_path,
161+	               sizeof(keymap_path),
162+	               "%s/swc-xkb-keymap-XXXXXX",
163+	               keymap_directory);
164 	if (ret < 0 || (size_t)ret >= sizeof(keymap_path)) {
165 		WARNING("Could not determine XKB keymap path\n");
166 		goto error1;
167@@ -165,7 +199,12 @@ update_keymap(struct xkb *xkb)
168 	}
169 #endif
170 
171-	xkb->keymap.area = mmap(NULL, xkb->keymap.size, PROT_READ | PROT_WRITE, MAP_SHARED, xkb->keymap.fd, 0);
172+	xkb->keymap.area = mmap(NULL,
173+	                        xkb->keymap.size,
174+	                        PROT_READ | PROT_WRITE,
175+	                        MAP_SHARED,
176+	                        xkb->keymap.fd,
177+	                        0);
178 
179 	if (xkb->keymap.area == MAP_FAILED) {
180 		WARNING("Could not mmap XKB keymap string\n");
181@@ -192,8 +231,9 @@ keyboard_create(struct xkb_rule_names *names)
182 	struct xkb *xkb;
183 
184 	keyboard = malloc(sizeof(*keyboard));
185-	if (!keyboard)
186+	if (!keyboard) {
187 		goto error0;
188+	}
189 
190 	xkb = &keyboard->xkb;
191 	if (!(xkb->context = xkb_context_new(0))) {
192@@ -201,7 +241,8 @@ keyboard_create(struct xkb_rule_names *names)
193 		goto error1;
194 	}
195 
196-	if (!(xkb->keymap.map = xkb_keymap_new_from_names(xkb->context, names, 0))) {
197+	if (!(xkb->keymap.map =
198+	          xkb_keymap_new_from_names(xkb->context, names, 0))) {
199 		ERROR("Could not create XKB keymap\n");
200 		goto error2;
201 	}
202@@ -216,8 +257,9 @@ keyboard_create(struct xkb_rule_names *names)
203 		goto error4;
204 	}
205 
206-	if (!input_focus_initialize(&keyboard->focus, &keyboard->focus_handler))
207+	if (!input_focus_initialize(&keyboard->focus, &keyboard->focus_handler)) {
208 		goto error4;
209+	}
210 
211 	keyboard->modifier_state = (struct keyboard_modifier_state){0};
212 	keyboard->modifiers = 0;
213@@ -266,13 +308,15 @@ keyboard_reset(struct keyboard *keyboard)
214 	struct xkb_state *state;
215 
216 	/* Send simulated key release events for all current key handlers. */
217-	wl_array_for_each (key, &keyboard->keys) {
218+	wl_array_for_each(key, &keyboard->keys)
219+	{
220 		if (key->handler) {
221 			key->press.serial = wl_display_next_serial(swc.display);
222-			key->handler->key(keyboard, time, key, WL_KEYBOARD_KEY_STATE_RELEASED);
223-			/* Don't bother updating the XKB state because we will be resetting it
224-			 * later on and it is unlikely that a key handler cares about the keyboard
225-			 * state for release events. */
226+			key->handler->key(
227+			    keyboard, time, key, WL_KEYBOARD_KEY_STATE_RELEASED);
228+			/* Don't bother updating the XKB state because we will be resetting
229+			 * it later on and it is unlikely that a key handler cares about the
230+			 * keyboard state for release events. */
231 		}
232 	}
233 
234@@ -304,7 +348,7 @@ keyboard_set_focus(struct keyboard *keyboard, struct compositor_view *view)
235 }
236 
237 static const struct wl_keyboard_interface keyboard_impl = {
238-	.release = destroy_resource,
239+    .release = destroy_resource,
240 };
241 
242 static void
243@@ -315,28 +359,42 @@ unbind(struct wl_resource *resource)
244 }
245 
246 struct wl_resource *
247-keyboard_bind(struct keyboard *keyboard, struct wl_client *client, uint32_t version, uint32_t id)
248+keyboard_bind(struct keyboard *keyboard,
249+              struct wl_client *client,
250+              uint32_t version,
251+              uint32_t id)
252 {
253 	struct wl_resource *client_resource;
254 
255-	client_resource = wl_resource_create(client, &wl_keyboard_interface, version, id);
256-	if (!client_resource)
257+	client_resource =
258+	    wl_resource_create(client, &wl_keyboard_interface, version, id);
259+	if (!client_resource) {
260 		return NULL;
261-	wl_resource_set_implementation(client_resource, &keyboard_impl, keyboard, &unbind);
262+	}
263+	wl_resource_set_implementation(
264+	    client_resource, &keyboard_impl, keyboard, &unbind);
265 
266 	/* Subtract one to remove terminating NULL character. */
267-	wl_keyboard_send_keymap(client_resource, WL_KEYBOARD_KEYMAP_FORMAT_XKB_V1, keyboard->xkb.keymap.fd, keyboard->xkb.keymap.size - 1);
268+	wl_keyboard_send_keymap(client_resource,
269+	                        WL_KEYBOARD_KEYMAP_FORMAT_XKB_V1,
270+	                        keyboard->xkb.keymap.fd,
271+	                        keyboard->xkb.keymap.size - 1);
272 
273 	input_focus_add_resource(&keyboard->focus, client_resource);
274 
275-	if (version >= 4)
276-		wl_keyboard_send_repeat_info(client_resource, repeat_rate, repeat_delay);
277+	if (version >= 4) {
278+		wl_keyboard_send_repeat_info(
279+		    client_resource, repeat_rate, repeat_delay);
280+	}
281 
282 	return client_resource;
283 }
284 
285 void
286-keyboard_handle_key(struct keyboard *keyboard, uint32_t time, uint32_t value, uint32_t state)
287+keyboard_handle_key(struct keyboard *keyboard,
288+                    uint32_t time,
289+                    uint32_t value,
290+                    uint32_t state)
291 {
292 	struct key *key;
293 	struct keyboard_modifier_state modifier_state;
294@@ -348,11 +406,13 @@ keyboard_handle_key(struct keyboard *keyboard, uint32_t time, uint32_t value, ui
295 	serial = wl_display_next_serial(swc.display);
296 
297 	/* First handle key release events associated with a particular handler. */
298-	wl_array_for_each (key, &keyboard->keys) {
299+	wl_array_for_each(key, &keyboard->keys)
300+	{
301 		if (key->press.value == value) {
302 			/* Ignore repeat events. */
303-			if (state == WL_KEYBOARD_KEY_STATE_PRESSED)
304+			if (state == WL_KEYBOARD_KEY_STATE_PRESSED) {
305 				return;
306+			}
307 
308 			if (key->handler) {
309 				key->press.serial = serial;
310@@ -365,18 +425,21 @@ keyboard_handle_key(struct keyboard *keyboard, uint32_t time, uint32_t value, ui
311 	}
312 
313 	/* If we get a unpaired release event, just ignore it. */
314-	if (state == WL_KEYBOARD_KEY_STATE_RELEASED)
315+	if (state == WL_KEYBOARD_KEY_STATE_RELEASED) {
316 		return;
317+	}
318 
319-	if (!(key = wl_array_add(&keyboard->keys, sizeof(*key))))
320+	if (!(key = wl_array_add(&keyboard->keys, sizeof(*key)))) {
321 		goto update_xkb_state;
322+	}
323 
324 	key->press.value = value;
325 	key->press.serial = serial;
326 	key->handler = NULL;
327 
328 	/* Go through handlers to see if any will accept this key event. */
329-	wl_list_for_each (handler, &keyboard->handlers, link) {
330+	wl_list_for_each(handler, &keyboard->handlers, link)
331+	{
332 		if (handler->key && handler->key(keyboard, time, key, state)) {
333 			key->handler = handler;
334 			break;
335@@ -385,37 +448,48 @@ keyboard_handle_key(struct keyboard *keyboard, uint32_t time, uint32_t value, ui
336 
337 	/* Update XKB state. */
338 update_xkb_state:
339-	direction = state == WL_KEYBOARD_KEY_STATE_PRESSED ? XKB_KEY_DOWN : XKB_KEY_UP;
340+	direction =
341+	    state == WL_KEYBOARD_KEY_STATE_PRESSED ? XKB_KEY_DOWN : XKB_KEY_UP;
342 	xkb_state_update_key(xkb->state, XKB_KEY(value), direction);
343 
344-	modifier_state.depressed = xkb_state_serialize_mods(xkb->state, XKB_STATE_DEPRESSED);
345-	modifier_state.latched = xkb_state_serialize_mods(xkb->state, XKB_STATE_LATCHED);
346-	modifier_state.locked = xkb_state_serialize_mods(xkb->state, XKB_STATE_LOCKED);
347-	modifier_state.group = xkb_state_serialize_layout(xkb->state, XKB_STATE_LAYOUT_EFFECTIVE);
348-
349-	if (modifier_state.depressed != keyboard->modifier_state.depressed
350-	 || modifier_state.latched != keyboard->modifier_state.latched
351-	 || modifier_state.locked != keyboard->modifier_state.locked
352-	 || modifier_state.group != keyboard->modifier_state.group)
353-	{
354-		uint32_t mods_active = modifier_state.depressed | modifier_state.latched;
355+	modifier_state.depressed =
356+	    xkb_state_serialize_mods(xkb->state, XKB_STATE_DEPRESSED);
357+	modifier_state.latched =
358+	    xkb_state_serialize_mods(xkb->state, XKB_STATE_LATCHED);
359+	modifier_state.locked =
360+	    xkb_state_serialize_mods(xkb->state, XKB_STATE_LOCKED);
361+	modifier_state.group =
362+	    xkb_state_serialize_layout(xkb->state, XKB_STATE_LAYOUT_EFFECTIVE);
363+
364+	if (modifier_state.depressed != keyboard->modifier_state.depressed ||
365+	    modifier_state.latched != keyboard->modifier_state.latched ||
366+	    modifier_state.locked != keyboard->modifier_state.locked ||
367+	    modifier_state.group != keyboard->modifier_state.group) {
368+		uint32_t mods_active =
369+		    modifier_state.depressed | modifier_state.latched;
370 
371 		/* Update keyboard modifier state. */
372 		keyboard->modifier_state = modifier_state;
373 		keyboard->modifiers = 0;
374-		if (mods_active & (1 << keyboard->xkb.indices.ctrl))
375+		if (mods_active & (1 << keyboard->xkb.indices.ctrl)) {
376 			keyboard->modifiers |= SWC_MOD_CTRL;
377-		if (mods_active & (1 << keyboard->xkb.indices.alt))
378+		}
379+		if (mods_active & (1 << keyboard->xkb.indices.alt)) {
380 			keyboard->modifiers |= SWC_MOD_ALT;
381-		if (mods_active & (1 << keyboard->xkb.indices.super))
382+		}
383+		if (mods_active & (1 << keyboard->xkb.indices.super)) {
384 			keyboard->modifiers |= SWC_MOD_LOGO;
385-		if (mods_active & (1 << keyboard->xkb.indices.shift))
386+		}
387+		if (mods_active & (1 << keyboard->xkb.indices.shift)) {
388 			keyboard->modifiers |= SWC_MOD_SHIFT;
389+		}
390 
391 		/* Run any modifier handlers. */
392-		wl_list_for_each (handler, &keyboard->handlers, link) {
393-			if (handler->modifiers)
394+		wl_list_for_each(handler, &keyboard->handlers, link)
395+		{
396+			if (handler->modifiers) {
397 				handler->modifiers(keyboard, &modifier_state);
398+			}
399 		}
400 	}
401 }
+19, -9
 1@@ -26,8 +26,8 @@
 2 
 3 #include "input.h"
 4 
 5-#include <xkbcommon/xkbcommon.h>
 6 #include <wayland-util.h>
 7+#include <xkbcommon/xkbcommon.h>
 8 
 9 /* Keycodes are offset by 8 in XKB. */
10 #define XKB_KEY(key) ((key) + 8)
11@@ -48,8 +48,10 @@ struct keyboard_modifier_state {
12 };
13 
14 struct keyboard_handler {
15-	bool (*key)(struct keyboard *keyboard, uint32_t time, struct key *key, uint32_t state);
16-	bool (*modifiers)(struct keyboard *keyboard, const struct keyboard_modifier_state *state);
17+	bool (*key)(struct keyboard *keyboard, uint32_t time, struct key *key,
18+	            uint32_t state);
19+	bool (*modifiers)(struct keyboard *keyboard,
20+	                  const struct keyboard_modifier_state *state);
21 
22 	struct wl_list link;
23 };
24@@ -84,11 +86,19 @@ struct keyboard {
25 	uint32_t modifiers;
26 };
27 
28-struct keyboard *keyboard_create(struct xkb_rule_names *names);
29-void keyboard_destroy(struct keyboard *keyboard);
30-bool keyboard_reset(struct keyboard *keyboard);
31-void keyboard_set_focus(struct keyboard *keyboard, struct compositor_view *view);
32-struct wl_resource *keyboard_bind(struct keyboard *keyboard, struct wl_client *client, uint32_t version, uint32_t id);
33-void keyboard_handle_key(struct keyboard *keyboard, uint32_t time, uint32_t key, uint32_t state);
34+struct keyboard *
35+keyboard_create(struct xkb_rule_names *names);
36+void
37+keyboard_destroy(struct keyboard *keyboard);
38+bool
39+keyboard_reset(struct keyboard *keyboard);
40+void
41+keyboard_set_focus(struct keyboard *keyboard, struct compositor_view *view);
42+struct wl_resource *
43+keyboard_bind(struct keyboard *keyboard, struct wl_client *client,
44+              uint32_t version, uint32_t id);
45+void
46+keyboard_handle_key(struct keyboard *keyboard, uint32_t time, uint32_t key,
47+                    uint32_t state);
48 
49 #endif
+28, -16
  1@@ -27,8 +27,8 @@
  2 #include "launch/protocol.h"
  3 #include "util.h"
  4 
  5-#include <sys/uio.h>
  6 #include <fcntl.h>
  7+#include <sys/uio.h>
  8 #include <unistd.h>
  9 #include <wayland-server.h>
 10 
 11@@ -60,11 +60,12 @@ handle_data(int fd, uint32_t mask, void *data)
 12 {
 13 	struct swc_launch_event event;
 14 	struct iovec iov[1] = {
 15-		{.iov_base = &event, .iov_len = sizeof(event)},
 16+	    {.iov_base = &event, .iov_len = sizeof(event)},
 17 	};
 18 
 19-	if (receive_fd(fd, NULL, iov, 1) != -1)
 20+	if (receive_fd(fd, NULL, iov, 1) != -1) {
 21 		handle_event(&event);
 22+	}
 23 	return 1;
 24 }
 25 
 26@@ -73,20 +74,25 @@ launch_initialize(void)
 27 {
 28 	char *socket_string, *end;
 29 
 30-	if (!(socket_string = getenv(SWC_LAUNCH_SOCKET_ENV)))
 31+	if (!(socket_string = getenv(SWC_LAUNCH_SOCKET_ENV))) {
 32 		return false;
 33+	}
 34 
 35 	launch.socket = strtol(socket_string, &end, 10);
 36-	if (*end != '\0')
 37+	if (*end != '\0') {
 38 		return false;
 39+	}
 40 
 41 	unsetenv(SWC_LAUNCH_SOCKET_ENV);
 42-	if (fcntl(launch.socket, F_SETFD, FD_CLOEXEC) < 0)
 43+	if (fcntl(launch.socket, F_SETFD, FD_CLOEXEC) < 0) {
 44 		return false;
 45+	}
 46 
 47-	launch.source = wl_event_loop_add_fd(swc.event_loop, launch.socket, WL_EVENT_READABLE, &handle_data, NULL);
 48-	if (!launch.source)
 49+	launch.source = wl_event_loop_add_fd(swc.event_loop, launch.socket,
 50+	                                     WL_EVENT_READABLE, &handle_data, NULL);
 51+	if (!launch.source) {
 52 		return false;
 53+	}
 54 
 55 	return true;
 56 }
 57@@ -99,24 +105,28 @@ launch_finalize(void)
 58 }
 59 
 60 static bool
 61-send_request(struct swc_launch_request *request, const void *data, size_t size, struct swc_launch_event *event, int out_fd, int *in_fd)
 62+send_request(struct swc_launch_request *request, const void *data, size_t size,
 63+             struct swc_launch_event *event, int out_fd, int *in_fd)
 64 {
 65 	struct iovec request_iov[2] = {
 66-		{.iov_base = request, .iov_len = sizeof(*request)},
 67-		{.iov_base = (void *)data, .iov_len = size},
 68+	    {.iov_base = request, .iov_len = sizeof(*request)},
 69+	    {.iov_base = (void *)data, .iov_len = size},
 70 	};
 71 	struct iovec response_iov[1] = {
 72-		{.iov_base = event, .iov_len = sizeof(*event)},
 73+	    {.iov_base = event, .iov_len = sizeof(*event)},
 74 	};
 75 
 76 	request->serial = ++launch.next_serial;
 77 
 78-	if (send_fd(launch.socket, out_fd, request_iov, 1 + (size > 0)) == -1)
 79+	if (send_fd(launch.socket, out_fd, request_iov, 1 + (size > 0)) == -1) {
 80 		return false;
 81+	}
 82 
 83 	while (receive_fd(launch.socket, in_fd, response_iov, 1) != -1) {
 84-		if (event->type == SWC_LAUNCH_EVENT_RESPONSE && event->serial == request->serial)
 85+		if (event->type == SWC_LAUNCH_EVENT_RESPONSE &&
 86+		    event->serial == request->serial) {
 87 			return true;
 88+		}
 89 		handle_event(event);
 90 	}
 91 
 92@@ -133,8 +143,9 @@ launch_open_device(const char *path, int flags)
 93 	request.type = SWC_LAUNCH_REQUEST_OPEN_DEVICE;
 94 	request.flags = flags;
 95 
 96-	if (!send_request(&request, path, strlen(path) + 1, &response, -1, &fd))
 97+	if (!send_request(&request, path, strlen(path) + 1, &response, -1, &fd)) {
 98 		return -1;
 99+	}
100 
101 	return fd;
102 }
103@@ -148,8 +159,9 @@ launch_activate_vt(unsigned vt)
104 	request.type = SWC_LAUNCH_REQUEST_ACTIVATE_VT;
105 	request.vt = vt;
106 
107-	if (!send_request(&request, NULL, 0, &response, -1, NULL))
108+	if (!send_request(&request, NULL, 0, &response, -1, NULL)) {
109 		return false;
110+	}
111 
112 	return response.success;
113 }
+8, -4
 1@@ -26,9 +26,13 @@
 2 
 3 #include <stdbool.h>
 4 
 5-bool launch_initialize(void);
 6-void launch_finalize(void);
 7-int launch_open_device(const char *path, int flags);
 8-bool launch_activate_vt(unsigned vt);
 9+bool
10+launch_initialize(void);
11+void
12+launch_finalize(void);
13+int
14+launch_open_device(const char *path, int flags);
15+bool
16+launch_activate_vt(unsigned vt);
17 
18 #endif
+2, -3
 1@@ -37,7 +37,6 @@ mode_initialize(struct mode *mode, drmModeModeInfo *mode_info)
 2 bool
 3 mode_equal(const struct mode *mode1, const struct mode *mode2)
 4 {
 5-	return mode1->width == mode2->width
 6-	    && mode1->height == mode2->height
 7-	    && mode1->refresh == mode2->refresh;
 8+	return mode1->width == mode2->width && mode1->height == mode2->height &&
 9+	       mode1->refresh == mode2->refresh;
10 }
+4, -2
 1@@ -38,7 +38,9 @@ struct mode {
 2 	drmModeModeInfo info;
 3 };
 4 
 5-bool mode_initialize(struct mode *mode, drmModeModeInfo *mode_info);
 6-bool mode_equal(const struct mode *mode1, const struct mode *mode2);
 7+bool
 8+mode_initialize(struct mode *mode, drmModeModeInfo *mode_info);
 9+bool
10+mode_equal(const struct mode *mode1, const struct mode *mode2);
11 
12 #endif
+30, -17
  1@@ -5,14 +5,14 @@
  2 #include "screen.h"
  3 #include "util.h"
  4 
  5+#include <drm.h>
  6 #include <stdio.h>
  7 #include <stdlib.h>
  8 #include <string.h>
  9-#include <drm.h>
 10 #include <xf86drm.h>
 11 
 12 static const struct wl_output_interface output_impl = {
 13-	.release = destroy_resource,
 14+    .release = destroy_resource,
 15 };
 16 
 17 static void
 18@@ -31,25 +31,32 @@ bind_output(struct wl_client *client, void *data, uint32_t version, uint32_t id)
 19 		return;
 20 	}
 21 
 22-	wl_resource_set_implementation(resource, &output_impl, output, &remove_resource);
 23+	wl_resource_set_implementation(resource, &output_impl, output,
 24+	                               &remove_resource);
 25 	wl_list_insert(&output->resources, wl_resource_get_link(resource));
 26 
 27-	wl_output_send_geometry(resource, screen->base.geometry.x, screen->base.geometry.y,
 28-	                        output->physical_width, output->physical_height,
 29-	                        0, "unknown", "unknown", WL_OUTPUT_TRANSFORM_NORMAL);
 30+	wl_output_send_geometry(resource, screen->base.geometry.x,
 31+	                        screen->base.geometry.y, output->physical_width,
 32+	                        output->physical_height, 0, "unknown", "unknown",
 33+	                        WL_OUTPUT_TRANSFORM_NORMAL);
 34 
 35-	wl_array_for_each (mode, &output->modes) {
 36+	wl_array_for_each(mode, &output->modes)
 37+	{
 38 		flags = 0;
 39-		if (mode->preferred)
 40+		if (mode->preferred) {
 41 			flags |= WL_OUTPUT_MODE_PREFERRED;
 42-		if (mode_equal(&screen->planes.primary.mode, mode))
 43+		}
 44+		if (mode_equal(&screen->planes.primary.mode, mode)) {
 45 			flags |= WL_OUTPUT_MODE_CURRENT;
 46+		}
 47 
 48-		wl_output_send_mode(resource, flags, mode->width, mode->height, mode->refresh);
 49+		wl_output_send_mode(resource, flags, mode->width, mode->height,
 50+		                    mode->refresh);
 51 	}
 52 
 53-	if (version >= 2)
 54+	if (version >= 2) {
 55 		wl_output_send_done(resource);
 56+	}
 57 }
 58 
 59 struct output *
 60@@ -64,7 +71,8 @@ output_new(drmModeConnectorPtr connector)
 61 		goto error0;
 62 	}
 63 
 64-	output->global = wl_global_create(swc.display, &wl_output_interface, 3, output, &bind_output);
 65+	output->global = wl_global_create(swc.display, &wl_output_interface, 3,
 66+	                                  output, &bind_output);
 67 
 68 	if (!output->global) {
 69 		ERROR("Failed to create output global\n");
 70@@ -82,22 +90,27 @@ output_new(drmModeConnectorPtr connector)
 71 
 72 	output->connector = connector->connector_id;
 73 
 74-	if (connector->count_modes == 0)
 75+	if (connector->count_modes == 0) {
 76 		goto error2;
 77+	}
 78 
 79-	modes = wl_array_add(&output->modes, connector->count_modes * sizeof(*modes));
 80-	if (!modes)
 81+	modes =
 82+	    wl_array_add(&output->modes, connector->count_modes * sizeof(*modes));
 83+	if (!modes) {
 84 		goto error2;
 85+	}
 86 
 87 	for (i = 0; i < connector->count_modes; ++i) {
 88 		mode_initialize(&modes[i], &connector->modes[i]);
 89 
 90-		if (modes[i].preferred)
 91+		if (modes[i].preferred) {
 92 			output->preferred_mode = &modes[i];
 93+		}
 94 	}
 95 
 96-	if (!output->preferred_mode)
 97+	if (!output->preferred_mode) {
 98 		output->preferred_mode = &modes[0];
 99+	}
100 
101 	return output;
102 
+5, -3
 1@@ -1,8 +1,8 @@
 2 #ifndef SWC_OUTPUT_H
 3 #define SWC_OUTPUT_H
 4 
 5-#include <stdint.h>
 6 #include <pixman.h>
 7+#include <stdint.h>
 8 #include <wayland-util.h>
 9 #include <xf86drmMode.h>
10 
11@@ -27,7 +27,9 @@ struct output {
12 	struct wl_list link;
13 };
14 
15-struct output *output_new(drmModeConnector *connector);
16-void output_destroy(struct output *output);
17+struct output *
18+output_new(drmModeConnector *connector);
19+void
20+output_destroy(struct output *output);
21 
22 #endif
+50, -34
  1@@ -32,9 +32,9 @@
  2 #include "util.h"
  3 #include "view.h"
  4 
  5+#include "swc-server-protocol.h"
  6 #include <assert.h>
  7 #include <stdlib.h>
  8-#include "swc-server-protocol.h"
  9 
 10 struct panel {
 11 	struct wl_resource *resource;
 12@@ -53,7 +53,8 @@ static void
 13 update_position(struct panel *panel)
 14 {
 15 	int32_t x, y;
 16-	struct swc_rectangle *screen = &panel->screen->base.geometry, *view = &panel->view->base.geometry;
 17+	struct swc_rectangle *screen = &panel->screen->base.geometry,
 18+	                     *view = &panel->view->base.geometry;
 19 
 20 	switch (panel->edge) {
 21 	case SWC_PANEL_EDGE_TOP:
 22@@ -80,16 +81,18 @@ update_position(struct panel *panel)
 23 }
 24 
 25 static void
 26-dock(struct wl_client *client, struct wl_resource *resource, uint32_t edge, struct wl_resource *screen_resource, uint32_t focus)
 27+dock(struct wl_client *client, struct wl_resource *resource, uint32_t edge,
 28+     struct wl_resource *screen_resource, uint32_t focus)
 29 {
 30 	struct panel *panel = wl_resource_get_user_data(resource);
 31 	struct screen *screen;
 32 	uint32_t length;
 33 
 34-	if (screen_resource)
 35+	if (screen_resource) {
 36 		screen = wl_resource_get_user_data(screen_resource);
 37-	else
 38+	} else {
 39 		screen = wl_container_of(swc.screens.next, screen, link);
 40+	}
 41 
 42 	switch (edge) {
 43 	case SWC_PANEL_EDGE_TOP:
 44@@ -117,64 +120,69 @@ dock(struct wl_client *client, struct wl_resource *resource, uint32_t edge, stru
 45 	compositor_view_show(panel->view);
 46 	wl_list_insert(&screen->modifiers, &panel->modifier.link);
 47 
 48-	if (focus)
 49+	if (focus) {
 50 		keyboard_set_focus(swc.seat->keyboard, panel->view);
 51+	}
 52 
 53 	swc_panel_send_docked(resource, length);
 54 }
 55 
 56 static void
 57-set_offset(struct wl_client *client, struct wl_resource *resource, uint32_t offset)
 58+set_offset(struct wl_client *client, struct wl_resource *resource,
 59+           uint32_t offset)
 60 {
 61 	struct panel *panel = wl_resource_get_user_data(resource);
 62 
 63 	panel->offset = offset;
 64-	if (panel->docked)
 65+	if (panel->docked) {
 66 		update_position(panel);
 67+	}
 68 }
 69 
 70 static void
 71-set_strut(struct wl_client *client, struct wl_resource *resource, uint32_t size, uint32_t begin, uint32_t end)
 72+set_strut(struct wl_client *client, struct wl_resource *resource, uint32_t size,
 73+          uint32_t begin, uint32_t end)
 74 {
 75 	struct panel *panel = wl_resource_get_user_data(resource);
 76 
 77 	panel->strut_size = size;
 78-	if (panel->docked)
 79+	if (panel->docked) {
 80 		screen_update_usable_geometry(panel->screen);
 81+	}
 82 }
 83 
 84 static const struct swc_panel_interface panel_impl = {
 85-	.dock = dock,
 86-	.set_offset = set_offset,
 87-	.set_strut = set_strut,
 88+    .dock = dock,
 89+    .set_offset = set_offset,
 90+    .set_strut = set_strut,
 91 };
 92 
 93 static void
 94-handle_resize(struct view_handler *handler, uint32_t old_width, uint32_t old_height)
 95+handle_resize(struct view_handler *handler, uint32_t old_width,
 96+              uint32_t old_height)
 97 {
 98 	struct panel *panel = wl_container_of(handler, panel, view_handler);
 99 	update_position(panel);
100 }
101 
102 static const struct view_handler_impl view_handler_impl = {
103-	.resize = handle_resize,
104+    .resize = handle_resize,
105 };
106 
107 static void
108-modify(struct screen_modifier *modifier, const struct swc_rectangle *geom, pixman_region32_t *usable)
109+modify(struct screen_modifier *modifier, const struct swc_rectangle *geom,
110+       pixman_region32_t *usable)
111 {
112 	struct panel *panel = wl_container_of(modifier, panel, modifier);
113-	pixman_box32_t box = {
114-		.x1 = geom->x,
115-		.y1 = geom->y,
116-		.x2 = geom->x + geom->width,
117-		.y2 = geom->y + geom->height
118-	};
119+	pixman_box32_t box = {.x1 = geom->x,
120+	                      .y1 = geom->y,
121+	                      .x2 = geom->x + geom->width,
122+	                      .y2 = geom->y + geom->height};
123 
124 	assert(panel->docked);
125 
126-	DEBUG("Original geometry { x1: %d, y1: %d, x2: %d, y2: %d }\n",
127-	      box.x1, box.y1, box.x2, box.y2);
128+	DEBUG("Original geometry { x1: %d, y1: %d, x2: %d, y2: %d }\n", box.x1,
129+	      box.y1, box.x2, box.y2);
130 
131 	switch (panel->edge) {
132 	case SWC_PANEL_EDGE_TOP:
133@@ -191,8 +199,8 @@ modify(struct screen_modifier *modifier, const struct swc_rectangle *geom, pixma
134 		break;
135 	}
136 
137-	DEBUG("Usable region { x1: %d, y1: %d, x2: %d, y2: %d }\n",
138-	      box.x1, box.y1, box.x2, box.y2);
139+	DEBUG("Usable region { x1: %d, y1: %d, x2: %d, y2: %d }\n", box.x1, box.y1,
140+	      box.x2, box.y2);
141 
142 	pixman_region32_reset(usable, &box);
143 }
144@@ -214,29 +222,36 @@ destroy_panel(struct wl_resource *resource)
145 static void
146 handle_surface_destroy(struct wl_listener *listener, void *data)
147 {
148-	struct panel *panel = wl_container_of(listener, panel, surface_destroy_listener);
149+	struct panel *panel =
150+	    wl_container_of(listener, panel, surface_destroy_listener);
151 	wl_resource_destroy(panel->resource);
152 }
153 
154 struct panel *
155-panel_new(struct wl_client *client, uint32_t version, uint32_t id, struct surface *surface)
156+panel_new(struct wl_client *client, uint32_t version, uint32_t id,
157+          struct surface *surface)
158 {
159 	struct panel *panel;
160 
161 	panel = malloc(sizeof(*panel));
162 
163-	if (!panel)
164+	if (!panel) {
165 		goto error0;
166+	}
167 
168-	panel->resource = wl_resource_create(client, &swc_panel_interface, version, id);
169+	panel->resource =
170+	    wl_resource_create(client, &swc_panel_interface, version, id);
171 
172-	if (!panel->resource)
173+	if (!panel->resource) {
174 		goto error1;
175+	}
176 
177-	if (!(panel->view = compositor_create_view(surface)))
178+	if (!(panel->view = compositor_create_view(surface))) {
179 		goto error2;
180+	}
181 
182-	wl_resource_set_implementation(panel->resource, &panel_impl, panel, &destroy_panel);
183+	wl_resource_set_implementation(panel->resource, &panel_impl, panel,
184+	                               &destroy_panel);
185 	panel->surface_destroy_listener.notify = &handle_surface_destroy;
186 	panel->view_handler.impl = &view_handler_impl;
187 	panel->modifier.modify = &modify;
188@@ -245,7 +260,8 @@ panel_new(struct wl_client *client, uint32_t version, uint32_t id, struct surfac
189 	panel->strut_size = 0;
190 	panel->docked = false;
191 	wl_list_insert(&panel->view->base.handlers, &panel->view_handler.link);
192-	wl_resource_add_destroy_listener(surface->resource, &panel->surface_destroy_listener);
193+	wl_resource_add_destroy_listener(surface->resource,
194+	                                 &panel->surface_destroy_listener);
195 
196 	return panel;
197 
+3, -1
 1@@ -29,6 +29,8 @@
 2 struct surface;
 3 struct wl_client;
 4 
 5-struct panel *panel_new(struct wl_client *client, uint32_t version, uint32_t id, struct surface *surface);
 6+struct panel *
 7+panel_new(struct wl_client *client, uint32_t version, uint32_t id,
 8+          struct surface *surface);
 9 
10 #endif
+12, -7
 1@@ -25,28 +25,32 @@
 2 #include "internal.h"
 3 #include "panel.h"
 4 
 5-#include <wayland-server.h>
 6 #include "swc-server-protocol.h"
 7+#include <wayland-server.h>
 8 
 9 static void
10-create_panel(struct wl_client *client, struct wl_resource *resource, uint32_t id, struct wl_resource *surface_resource)
11+create_panel(struct wl_client *client, struct wl_resource *resource,
12+             uint32_t id, struct wl_resource *surface_resource)
13 {
14 	struct surface *surface = wl_resource_get_user_data(surface_resource);
15 
16-	if (!panel_new(client, wl_resource_get_version(resource), id, surface))
17+	if (!panel_new(client, wl_resource_get_version(resource), id, surface)) {
18 		wl_client_post_no_memory(client);
19+	}
20 }
21 
22 static const struct swc_panel_manager_interface panel_manager_impl = {
23-	.create_panel = create_panel,
24+    .create_panel = create_panel,
25 };
26 
27 static void
28-bind_panel_manager(struct wl_client *client, void *data, uint32_t version, uint32_t id)
29+bind_panel_manager(struct wl_client *client, void *data, uint32_t version,
30+                   uint32_t id)
31 {
32 	struct wl_resource *resource;
33 
34-	resource = wl_resource_create(client, &swc_panel_manager_interface, version, id);
35+	resource =
36+	    wl_resource_create(client, &swc_panel_manager_interface, version, id);
37 	if (!resource) {
38 		wl_client_post_no_memory(client);
39 		return;
40@@ -57,5 +61,6 @@ bind_panel_manager(struct wl_client *client, void *data, uint32_t version, uint3
41 struct wl_global *
42 panel_manager_create(struct wl_display *display)
43 {
44-	return wl_global_create(display, &swc_panel_manager_interface, 1, NULL, &bind_panel_manager);
45+	return wl_global_create(display, &swc_panel_manager_interface, 1, NULL,
46+	                        &bind_panel_manager);
47 }
+2, -1
1@@ -26,6 +26,7 @@
2 
3 struct wl_display;
4 
5-struct wl_global *panel_manager_create(struct wl_display *display);
6+struct wl_global *
7+panel_manager_create(struct wl_display *display);
8 
9 #endif
+24, -22
  1@@ -22,16 +22,16 @@
  2  */
  3 
  4 #include "plane.h"
  5-#include "event.h"
  6 #include "drm.h"
  7+#include "event.h"
  8 #include "internal.h"
  9 #include "screen.h"
 10 #include "util.h"
 11 
 12 #include <errno.h>
 13 #include <stdlib.h>
 14-#include <wld/wld.h>
 15 #include <wld/drm.h>
 16+#include <wld/wld.h>
 17 #include <xf86drmMode.h>
 18 
 19 enum plane_property {
 20@@ -54,13 +54,16 @@ update(struct view *view)
 21 	struct plane *plane = wl_container_of(view, plane, view);
 22 	uint32_t x, y, w, h;
 23 
 24-	if (!plane->screen)
 25+	if (!plane->screen) {
 26 		return false;
 27+	}
 28 	x = view->geometry.x - plane->screen->base.geometry.x;
 29 	y = view->geometry.y - plane->screen->base.geometry.y;
 30 	w = view->geometry.width;
 31 	h = view->geometry.height;
 32-	if (swc.active && drmModeSetPlane(swc.drm->fd, plane->id, plane->screen->crtc, plane->fb, 0, x, y, w, h, 0, 0, w << 16, h << 16) < 0) {
 33+	if (swc.active &&
 34+	    drmModeSetPlane(swc.drm->fd, plane->id, plane->screen->crtc, plane->fb,
 35+	                    0, x, y, w, h, 0, 0, w << 16, h << 16) < 0) {
 36 		ERROR("Could not set cursor: %s\n", strerror(errno));
 37 		return false;
 38 	}
 39@@ -86,32 +89,28 @@ move(struct view *view, int32_t x, int32_t y)
 40 }
 41 
 42 static const struct view_impl view_impl = {
 43-	.update = update,
 44-	.attach = attach,
 45-	.move = move,
 46+    .update = update,
 47+    .attach = attach,
 48+    .move = move,
 49 };
 50 
 51 static enum plane_property
 52 find_prop(const char *name)
 53 {
 54 	static const char property_names[][16] = {
 55-		[PLANE_TYPE]        = "type",
 56-		[PLANE_IN_FENCE_FD] = "IN_FENCE_FD",
 57-		[PLANE_CRTC_ID]     = "CRTC_ID",
 58-		[PLANE_CRTC_X]      = "CRTC_X",
 59-		[PLANE_CRTC_Y]      = "CRTC_Y",
 60-		[PLANE_CRTC_W]      = "CRTC_W",
 61-		[PLANE_CRTC_H]      = "CRTC_H",
 62-		[PLANE_SRC_X]       = "SRC_X",
 63-		[PLANE_SRC_Y]       = "SRC_Y",
 64-		[PLANE_SRC_W]       = "SRC_W",
 65-		[PLANE_SRC_H]       = "SRC_H",
 66+	    [PLANE_TYPE] = "type",       [PLANE_IN_FENCE_FD] = "IN_FENCE_FD",
 67+	    [PLANE_CRTC_ID] = "CRTC_ID", [PLANE_CRTC_X] = "CRTC_X",
 68+	    [PLANE_CRTC_Y] = "CRTC_Y",   [PLANE_CRTC_W] = "CRTC_W",
 69+	    [PLANE_CRTC_H] = "CRTC_H",   [PLANE_SRC_X] = "SRC_X",
 70+	    [PLANE_SRC_Y] = "SRC_Y",     [PLANE_SRC_W] = "SRC_W",
 71+	    [PLANE_SRC_H] = "SRC_H",
 72 	};
 73 	size_t i;
 74 
 75 	for (i = 0; i < ARRAY_LENGTH(property_names); ++i) {
 76-		if (strcmp(name, property_names[i]) == 0)
 77+		if (strcmp(name, property_names[i]) == 0) {
 78 			return i;
 79+		}
 80 	}
 81 	return -1;
 82 }
 83@@ -139,11 +138,13 @@ plane_new(uint32_t id)
 84 	drmModePlane *drm_plane;
 85 
 86 	plane = malloc(sizeof(*plane));
 87-	if (!plane)
 88+	if (!plane) {
 89 		goto error0;
 90+	}
 91 	drm_plane = drmModeGetPlane(swc.drm->fd, id);
 92-	if (!drm_plane)
 93+	if (!drm_plane) {
 94 		goto error1;
 95+	}
 96 	plane->id = id;
 97 	plane->fb = 0;
 98 	plane->screen = NULL;
 99@@ -153,8 +154,9 @@ plane_new(uint32_t id)
100 	props = drmModeObjectGetProperties(swc.drm->fd, id, DRM_MODE_OBJECT_PLANE);
101 	for (i = 0; i < props->count_props; ++i, drmModeFreeProperty(prop)) {
102 		prop = drmModeGetProperty(swc.drm->fd, props->props[i]);
103-		if (prop && find_prop(prop->name) == PLANE_TYPE)
104+		if (prop && find_prop(prop->name) == PLANE_TYPE) {
105 			plane->type = props->prop_values[i];
106+		}
107 	}
108 	plane->swc_listener.notify = &handle_swc_event;
109 	wl_signal_add(&swc.event_signal, &plane->swc_listener);
+4, -2
 1@@ -39,7 +39,9 @@ struct plane {
 2 	struct wl_list link;
 3 };
 4 
 5-struct plane *plane_new(uint32_t id);
 6-void plane_destroy(struct plane *plane);
 7+struct plane *
 8+plane_new(uint32_t id);
 9+void
10+plane_destroy(struct plane *plane);
11 
12 #endif
+193, -105
  1@@ -23,6 +23,7 @@
  2 
  3 #include "pointer.h"
  4 #include "compositor.h"
  5+#include "cursor/cursor_data.h"
  6 #include "event.h"
  7 #include "internal.h"
  8 #include "plane.h"
  9@@ -31,7 +32,6 @@
 10 #include "shm.h"
 11 #include "surface.h"
 12 #include "util.h"
 13-#include "cursor/cursor_data.h"
 14 
 15 #include <assert.h>
 16 #include <stdio.h>
 17@@ -55,15 +55,19 @@ swc_pointer_send_button(uint32_t time, uint32_t button, uint32_t state)
 18 	struct wl_resource *resource;
 19 	uint32_t serial;
 20 
 21-	if (!pointer || wl_list_empty(&pointer->focus.active))
 22+	if (!pointer || wl_list_empty(&pointer->focus.active)) {
 23 		return;
 24+	}
 25 
 26 	serial = wl_display_next_serial(swc.display);
 27-	wl_resource_for_each (resource, &pointer->focus.active)
 28-		wl_pointer_send_button(resource, serial, time, button, state);
 29-	wl_resource_for_each (resource, &pointer->focus.active) {
 30-		if (wl_resource_get_version(resource) >= WL_POINTER_FRAME_SINCE_VERSION)
 31+	wl_resource_for_each(resource, &pointer->focus.active)
 32+	    wl_pointer_send_button(resource, serial, time, button, state);
 33+	wl_resource_for_each(resource, &pointer->focus.active)
 34+	{
 35+		if (wl_resource_get_version(resource) >=
 36+		    WL_POINTER_FRAME_SINCE_VERSION) {
 37 			wl_pointer_send_frame(resource);
 38+		}
 39 	}
 40 	pointer->client_axis_source = -1;
 41 }
 42@@ -75,38 +79,47 @@ swc_pointer_send_axis(uint32_t time, uint32_t axis, int32_t value120)
 43 	struct wl_resource *resource;
 44 	wl_fixed_t value;
 45 
 46-	if (!pointer || wl_list_empty(&pointer->focus.active))
 47+	if (!pointer || wl_list_empty(&pointer->focus.active)) {
 48 		return;
 49+	}
 50 
 51 	value = wl_fixed_from_double((double)value120 / 120.0);
 52 
 53-	wl_resource_for_each (resource, &pointer->focus.active) {
 54+	wl_resource_for_each(resource, &pointer->focus.active)
 55+	{
 56 		int ver = wl_resource_get_version(resource);
 57 
 58-		if (ver >= WL_POINTER_AXIS_SOURCE_SINCE_VERSION)
 59+		if (ver >= WL_POINTER_AXIS_SOURCE_SINCE_VERSION) {
 60 			wl_pointer_send_axis_source(resource, WL_POINTER_AXIS_SOURCE_WHEEL);
 61+		}
 62 		if (value120) {
 63-			if (ver >= WL_POINTER_AXIS_VALUE120_SINCE_VERSION)
 64+			if (ver >= WL_POINTER_AXIS_VALUE120_SINCE_VERSION) {
 65 				wl_pointer_send_axis_value120(resource, axis, value120);
 66-			else if (ver >= WL_POINTER_AXIS_DISCRETE_SINCE_VERSION)
 67+			} else if (ver >= WL_POINTER_AXIS_DISCRETE_SINCE_VERSION) {
 68 				wl_pointer_send_axis_discrete(resource, axis, value120 / 120);
 69+			}
 70 		}
 71 
 72-		if (value)
 73+		if (value) {
 74 			wl_pointer_send_axis(resource, time, axis, value);
 75-		else if (ver >= WL_POINTER_AXIS_STOP_SINCE_VERSION)
 76+		} else if (ver >= WL_POINTER_AXIS_STOP_SINCE_VERSION) {
 77 			wl_pointer_send_axis_stop(resource, time, axis);
 78+		}
 79 	}
 80 
 81-	wl_resource_for_each (resource, &pointer->focus.active) {
 82-		if (wl_resource_get_version(resource) >= WL_POINTER_FRAME_SINCE_VERSION)
 83+	wl_resource_for_each(resource, &pointer->focus.active)
 84+	{
 85+		if (wl_resource_get_version(resource) >=
 86+		    WL_POINTER_FRAME_SINCE_VERSION) {
 87 			wl_pointer_send_frame(resource);
 88+		}
 89 	}
 90 	pointer->client_axis_source = -1;
 91 }
 92 
 93 static void
 94-enter(struct input_focus_handler *handler, struct wl_list *resources, struct compositor_view *view)
 95+enter(struct input_focus_handler *handler, struct wl_list *resources,
 96+      struct compositor_view *view)
 97 {
 98 	struct pointer *pointer = wl_container_of(handler, pointer, focus_handler);
 99 	struct wl_resource *resource;
100@@ -124,25 +137,27 @@ enter(struct input_focus_handler *handler, struct wl_list *resources, struct com
101 	origin_y = view->base.geometry.y - view->buffer_offset_y;
102 	surface_x = pointer->x - wl_fixed_from_int(origin_x);
103 	surface_y = pointer->y - wl_fixed_from_int(origin_y);
104-	wl_resource_for_each (resource, resources)
105-		wl_pointer_send_enter(resource, serial, view->surface->resource, surface_x, surface_y);
106+	wl_resource_for_each(resource, resources) wl_pointer_send_enter(
107+	    resource, serial, view->surface->resource, surface_x, surface_y);
108 }
109 
110 static void
111-leave(struct input_focus_handler *handler, struct wl_list *resources, struct compositor_view *view)
112+leave(struct input_focus_handler *handler, struct wl_list *resources,
113+      struct compositor_view *view)
114 {
115 	struct wl_resource *resource;
116 	uint32_t serial;
117 
118 	serial = wl_display_next_serial(swc.display);
119-	wl_resource_for_each (resource, resources)
120-		wl_pointer_send_leave(resource, serial, view->surface->resource);
121+	wl_resource_for_each(resource, resources)
122+	    wl_pointer_send_leave(resource, serial, view->surface->resource);
123 }
124 
125 static void
126 handle_cursor_surface_destroy(struct wl_listener *listener, void *data)
127 {
128-	struct pointer *pointer = wl_container_of(listener, pointer, cursor.destroy_listener);
129+	struct pointer *pointer =
130+	    wl_container_of(listener, pointer, cursor.destroy_listener);
131 
132 	view_attach(&pointer->cursor.view, NULL);
133 	pointer->cursor.surface = NULL;
134@@ -162,27 +177,36 @@ attach(struct view *view, struct wld_buffer *buffer)
135 	struct surface *surface = pointer->cursor.surface;
136 	struct screen *screen;
137 
138-	if (surface && !pixman_region32_not_empty(&surface->state.damage))
139+	if (surface && !pixman_region32_not_empty(&surface->state.damage)) {
140 		return 0;
141+	}
142 
143 	wld_set_target_buffer(swc.shm->renderer, pointer->cursor.buffer);
144-	wld_fill_rectangle(swc.shm->renderer, 0x00000000, 0, 0, pointer->cursor.buffer->width, pointer->cursor.buffer->height);
145+	wld_fill_rectangle(swc.shm->renderer, 0x00000000, 0, 0,
146+	                   pointer->cursor.buffer->width,
147+	                   pointer->cursor.buffer->height);
148 
149-	if (buffer)
150-		wld_copy_rectangle(swc.shm->renderer, buffer, 0, 0, 0, 0, buffer->width, buffer->height);
151+	if (buffer) {
152+		wld_copy_rectangle(swc.shm->renderer, buffer, 0, 0, 0, 0, buffer->width,
153+		                   buffer->height);
154+	}
155 
156 	wld_flush(swc.shm->renderer);
157 
158-	if (surface)
159+	if (surface) {
160 		pixman_region32_clear(&surface->state.damage);
161+	}
162 
163 	/* TODO: Send an early release to the buffer */
164 
165-	if (view_set_size_from_buffer(view, buffer))
166+	if (view_set_size_from_buffer(view, buffer)) {
167 		view_update_screens(view);
168+	}
169 
170-	wl_list_for_each (screen, &swc.screens, link) {
171-		view_attach(&screen->planes.cursor->view, buffer ? pointer->cursor.buffer : NULL);
172+	wl_list_for_each(screen, &swc.screens, link)
173+	{
174+		view_attach(&screen->planes.cursor->view,
175+		            buffer ? pointer->cursor.buffer : NULL);
176 		view_update(&screen->planes.cursor->view);
177 	}
178 
179@@ -194,11 +218,14 @@ move(struct view *view, int32_t x, int32_t y)
180 {
181 	struct screen *screen;
182 
183-	if (view_set_position(view, x, y))
184+	if (view_set_position(view, x, y)) {
185 		view_update_screens(view);
186+	}
187 
188-	wl_list_for_each (screen, &swc.screens, link) {
189-		view_move(&screen->planes.cursor->view, view->geometry.x, view->geometry.y);
190+	wl_list_for_each(screen, &swc.screens, link)
191+	{
192+		view_move(&screen->planes.cursor->view, view->geometry.x,
193+		          view->geometry.y);
194 		view_update(&screen->planes.cursor->view);
195 	}
196 
197@@ -206,9 +233,9 @@ move(struct view *view, int32_t x, int32_t y)
198 }
199 
200 static const struct view_impl view_impl = {
201-	.update = update,
202-	.attach = attach,
203-	.move = move,
204+    .update = update,
205+    .attach = attach,
206+    .move = move,
207 };
208 
209 static inline void
210@@ -223,8 +250,9 @@ update_cursor(struct pointer *pointer)
211 static void
212 drop_client_cursor_surface(struct pointer *pointer)
213 {
214-	if (!pointer || !pointer->cursor.surface)
215+	if (!pointer || !pointer->cursor.surface) {
216 		return;
217+	}
218 	surface_set_view(pointer->cursor.surface, NULL);
219 	wl_list_remove(&pointer->cursor.destroy_listener.link);
220 	pointer->cursor.surface = NULL;
221@@ -233,8 +261,9 @@ drop_client_cursor_surface(struct pointer *pointer)
222 static void
223 apply_cursor_override(struct pointer *pointer)
224 {
225-	if (!pointer || pointer->cursor.surface)
226+	if (!pointer || pointer->cursor.surface) {
227 		return;
228+	}
229 
230 	pointer_set_cursor(pointer, cursor_left_ptr);
231 }
232@@ -257,23 +286,25 @@ swc_set_cursor_mode(enum swc_cursor_mode mode)
233 	struct pointer *pointer = swc.seat ? swc.seat->pointer : NULL;
234 
235 	cursor_mode = mode;
236-	if (cursor_mode == SWC_CURSOR_MODE_COMPOSITOR)
237+	if (cursor_mode == SWC_CURSOR_MODE_COMPOSITOR) {
238 		drop_client_cursor_surface(pointer);
239+	}
240 	apply_cursor_override(pointer);
241 }
242 
243 EXPORT void
244-swc_set_cursor_image(enum swc_cursor_kind kind,
245-                     const uint32_t *argb8888,
246-                     uint32_t width, uint32_t height,
247-                     int32_t hotspot_x, int32_t hotspot_y)
248+swc_set_cursor_image(enum swc_cursor_kind kind, const uint32_t *argb8888,
249+                     uint32_t width, uint32_t height, int32_t hotspot_x,
250+                     int32_t hotspot_y)
251 {
252 	struct pointer *pointer = swc.seat ? swc.seat->pointer : NULL;
253 
254-	if (kind < 0 || kind >= (int)ARRAY_LENGTH(cursor_images))
255+	if (kind < 0 || kind >= (int)ARRAY_LENGTH(cursor_images)) {
256 		return;
257-	if (!argb8888 || width == 0 || height == 0)
258+	}
259+	if (!argb8888 || width == 0 || height == 0) {
260 		return;
261+	}
262 
263 	cursor_images[kind].data = argb8888;
264 	cursor_images[kind].width = width;
265@@ -282,8 +313,9 @@ swc_set_cursor_image(enum swc_cursor_kind kind,
266 	cursor_images[kind].hotspot_y = hotspot_y;
267 	cursor_images[kind].active = true;
268 
269-	if (cursor_mode == SWC_CURSOR_MODE_COMPOSITOR)
270+	if (cursor_mode == SWC_CURSOR_MODE_COMPOSITOR) {
271 		drop_client_cursor_surface(pointer);
272+	}
273 	apply_cursor_override(pointer);
274 }
275 
276@@ -292,8 +324,9 @@ swc_clear_cursor_image(enum swc_cursor_kind kind)
277 {
278 	struct pointer *pointer = swc.seat ? swc.seat->pointer : NULL;
279 
280-	if (kind < 0 || kind >= (int)ARRAY_LENGTH(cursor_images))
281+	if (kind < 0 || kind >= (int)ARRAY_LENGTH(cursor_images)) {
282 		return;
283+	}
284 
285 	cursor_images[kind].active = false;
286 	cursor_images[kind].data = NULL;
287@@ -306,13 +339,14 @@ pointer_set_cursor(struct pointer *pointer, uint32_t id)
288 {
289 	struct cursor *cursor = &cursor_metadata[id];
290 	const uint32_t *data = cursor_data;
291-	union wld_object object = { .ptr = &cursor_data[cursor->offset] };
292+	union wld_object object = {.ptr = &cursor_data[cursor->offset]};
293 	struct wld_buffer *buffer;
294 
295 	if (id == cursor_left_ptr) {
296 		enum swc_cursor_kind kind = cursor_override;
297-		if (kind < 0 || kind >= (int)ARRAY_LENGTH(cursor_images))
298+		if (kind < 0 || kind >= (int)ARRAY_LENGTH(cursor_images)) {
299 			kind = SWC_CURSOR_DEFAULT;
300+		}
301 
302 		if (cursor_images[kind].active) {
303 			static struct cursor custom_cursor;
304@@ -328,8 +362,9 @@ pointer_set_cursor(struct pointer *pointer, uint32_t id)
305 		}
306 	}
307 
308-	if (pointer->cursor.internal_buffer)
309+	if (pointer->cursor.internal_buffer) {
310 		wld_buffer_unreference(pointer->cursor.internal_buffer);
311+	}
312 	if (pointer->cursor.surface) {
313 		surface_set_view(pointer->cursor.surface, NULL);
314 		wl_list_remove(&pointer->cursor.destroy_listener.link);
315@@ -337,9 +372,11 @@ pointer_set_cursor(struct pointer *pointer, uint32_t id)
316 	}
317 
318 	buffer = wld_import_buffer(swc.shm->context, WLD_OBJECT_DATA, object,
319-	                           cursor->width, cursor->height, WLD_FORMAT_ARGB8888, cursor->width * 4);
320-	if (!buffer)
321+	                           cursor->width, cursor->height,
322+	                           WLD_FORMAT_ARGB8888, cursor->width * 4);
323+	if (!buffer) {
324 		WARNING("Failed to create cursor buffer\n");
325+	}
326 	pointer->cursor.internal_buffer = buffer;
327 	pointer->cursor.hotspot.x = cursor->hotspot_x;
328 	pointer->cursor.hotspot.y = cursor->hotspot_y;
329@@ -348,48 +385,59 @@ pointer_set_cursor(struct pointer *pointer, uint32_t id)
330 }
331 
332 static bool
333-client_handle_motion(struct pointer_handler *handler, uint32_t time, wl_fixed_t x, wl_fixed_t y)
334+client_handle_motion(struct pointer_handler *handler, uint32_t time,
335+                     wl_fixed_t x, wl_fixed_t y)
336 {
337 	struct pointer *pointer = wl_container_of(handler, pointer, client_handler);
338 	struct wl_resource *resource;
339 	wl_fixed_t sx, sy;
340 	int32_t origin_x, origin_y;
341 
342-	if (wl_list_empty(&pointer->focus.active))
343+	if (wl_list_empty(&pointer->focus.active)) {
344 		return false;
345+	}
346 
347-	origin_x = pointer->focus.view->base.geometry.x - pointer->focus.view->buffer_offset_x;
348-	origin_y = pointer->focus.view->base.geometry.y - pointer->focus.view->buffer_offset_y;
349+	origin_x = pointer->focus.view->base.geometry.x -
350+	           pointer->focus.view->buffer_offset_x;
351+	origin_y = pointer->focus.view->base.geometry.y -
352+	           pointer->focus.view->buffer_offset_y;
353 	sx = x - wl_fixed_from_int(origin_x);
354 	sy = y - wl_fixed_from_int(origin_y);
355-	wl_resource_for_each (resource, &pointer->focus.active)
356-		wl_pointer_send_motion(resource, time, sx, sy);
357+	wl_resource_for_each(resource, &pointer->focus.active)
358+	    wl_pointer_send_motion(resource, time, sx, sy);
359 	return true;
360 }
361 
362 static bool
363-client_handle_button(struct pointer_handler *handler, uint32_t time, struct button *button, uint32_t state)
364+client_handle_button(struct pointer_handler *handler, uint32_t time,
365+                     struct button *button, uint32_t state)
366 {
367 	struct pointer *pointer = wl_container_of(handler, pointer, client_handler);
368 	struct wl_resource *resource;
369 
370-	if (wl_list_empty(&pointer->focus.active))
371+	if (wl_list_empty(&pointer->focus.active)) {
372 		return false;
373+	}
374 
375-	wl_resource_for_each (resource, &pointer->focus.active)
376-		wl_pointer_send_button(resource, button->press.serial, time, button->press.value, state);
377+	wl_resource_for_each(resource, &pointer->focus.active)
378+	    wl_pointer_send_button(resource, button->press.serial, time,
379+	                           button->press.value, state);
380 	return true;
381 }
382 
383 static bool
384-client_handle_axis(struct pointer_handler *handler, uint32_t time, enum wl_pointer_axis axis, enum wl_pointer_axis_source source, wl_fixed_t value, int value120)
385+client_handle_axis(struct pointer_handler *handler, uint32_t time,
386+                   enum wl_pointer_axis axis,
387+                   enum wl_pointer_axis_source source, wl_fixed_t value,
388+                   int value120)
389 {
390 	struct pointer *pointer = wl_container_of(handler, pointer, client_handler);
391 	struct wl_resource *resource;
392 	int ver;
393 
394-	if (wl_list_empty(&pointer->focus.active))
395+	if (wl_list_empty(&pointer->focus.active)) {
396 		return false;
397+	}
398 
399 	if (pointer->client_axis_source != -1) {
400 		assert(pointer->client_axis_source == source);
401@@ -398,20 +446,24 @@ client_handle_axis(struct pointer_handler *handler, uint32_t time, enum wl_point
402 		pointer->client_axis_source = source;
403 	}
404 
405-	wl_resource_for_each (resource, &pointer->focus.active) {
406+	wl_resource_for_each(resource, &pointer->focus.active)
407+	{
408 		ver = wl_resource_get_version(resource);
409-		if (source != -1 && ver >= WL_POINTER_AXIS_SOURCE_SINCE_VERSION)
410+		if (source != -1 && ver >= WL_POINTER_AXIS_SOURCE_SINCE_VERSION) {
411 			wl_pointer_send_axis_source(resource, source);
412+		}
413 		if (value120) {
414-			if (ver >= WL_POINTER_AXIS_VALUE120_SINCE_VERSION)
415+			if (ver >= WL_POINTER_AXIS_VALUE120_SINCE_VERSION) {
416 				wl_pointer_send_axis_value120(resource, axis, value120);
417-			else if (ver >= WL_POINTER_AXIS_DISCRETE_SINCE_VERSION)
418+			} else if (ver >= WL_POINTER_AXIS_DISCRETE_SINCE_VERSION) {
419 				wl_pointer_send_axis_discrete(resource, axis, value120 / 120);
420+			}
421 		}
422-		if (value)
423+		if (value) {
424 			wl_pointer_send_axis(resource, time, axis, value);
425-		else if (ver >= WL_POINTER_AXIS_STOP_SINCE_VERSION)
426+		} else if (ver >= WL_POINTER_AXIS_STOP_SINCE_VERSION) {
427 			wl_pointer_send_axis_stop(resource, time, axis);
428+		}
429 	}
430 	return true;
431 }
432@@ -422,9 +474,12 @@ client_handle_frame(struct pointer_handler *handler)
433 	struct pointer *pointer = wl_container_of(handler, pointer, client_handler);
434 	struct wl_resource *resource;
435 
436-	wl_resource_for_each (resource, &pointer->focus.active) {
437-		if (wl_resource_get_version(resource) >= WL_POINTER_FRAME_SINCE_VERSION)
438+	wl_resource_for_each(resource, &pointer->focus.active)
439+	{
440+		if (wl_resource_get_version(resource) >=
441+		    WL_POINTER_FRAME_SINCE_VERSION) {
442 			wl_pointer_send_frame(resource);
443+		}
444 	}
445 	pointer->client_axis_source = -1;
446 }
447@@ -454,16 +509,19 @@ pointer_initialize(struct pointer *pointer)
448 	view_initialize(&pointer->cursor.view, &view_impl);
449 	pointer->cursor.surface = NULL;
450 	pointer->cursor.destroy_listener.notify = &handle_cursor_surface_destroy;
451-	pointer->cursor.buffer = wld_create_buffer(swc.drm->context, swc.drm->cursor_w, swc.drm->cursor_h, WLD_FORMAT_ARGB8888, WLD_FLAG_MAP | WLD_FLAG_CURSOR);
452+	pointer->cursor.buffer = wld_create_buffer(
453+	    swc.drm->context, swc.drm->cursor_w, swc.drm->cursor_h,
454+	    WLD_FORMAT_ARGB8888, WLD_FLAG_MAP | WLD_FLAG_CURSOR);
455 	pointer->cursor.internal_buffer = NULL;
456 
457-	if (!pointer->cursor.buffer)
458+	if (!pointer->cursor.buffer) {
459 		return false;
460+	}
461 
462 	pointer_set_cursor(pointer, cursor_left_ptr);
463 
464-	wl_list_for_each (screen, &swc.screens, link)
465-		view_attach(&screen->planes.cursor->view, pointer->cursor.buffer);
466+	wl_list_for_each(screen, &swc.screens, link)
467+	    view_attach(&screen->planes.cursor->view, pointer->cursor.buffer);
468 
469 	input_focus_initialize(&pointer->focus, &pointer->focus_handler);
470 	pixman_region32_init(&pointer->region);
471@@ -496,7 +554,8 @@ clip_position(struct pointer *pointer, wl_fixed_t fx, wl_fixed_t fy)
472 	last_y = wl_fixed_to_int(pointer->y);
473 
474 	if (!pixman_region32_contains_point(&pointer->region, x, y, NULL)) {
475-		if (!pixman_region32_contains_point(&pointer->region, last_x, last_y, &box)) {
476+		if (!pixman_region32_contains_point(&pointer->region, last_x, last_y,
477+		                                    &box)) {
478 			WARNING("cursor is not in the visible screen area\n");
479 			pointer->x = 0;
480 			pointer->y = 0;
481@@ -521,40 +580,46 @@ pointer_set_region(struct pointer *pointer, pixman_region32_t *region)
482 
483 static void
484 set_cursor(struct wl_client *client, struct wl_resource *resource,
485-           uint32_t serial, struct wl_resource *surface_resource, int32_t hotspot_x, int32_t hotspot_y)
486+           uint32_t serial, struct wl_resource *surface_resource,
487+           int32_t hotspot_x, int32_t hotspot_y)
488 {
489 	struct pointer *pointer = wl_resource_get_user_data(resource);
490 	struct surface *surface;
491 
492 	(void)serial;
493 
494-	if (client != pointer->focus.client)
495+	if (client != pointer->focus.client) {
496 		return;
497+	}
498 
499 	/* If forcing compositor cursor, ignore client cursor surfaces. */
500-	if (cursor_mode == SWC_CURSOR_MODE_COMPOSITOR || cursor_override != SWC_CURSOR_DEFAULT)
501+	if (cursor_mode == SWC_CURSOR_MODE_COMPOSITOR ||
502+	    cursor_override != SWC_CURSOR_DEFAULT) {
503 		return;
504+	}
505 
506 	if (pointer->cursor.surface) {
507 		surface_set_view(pointer->cursor.surface, NULL);
508 		wl_list_remove(&pointer->cursor.destroy_listener.link);
509 	}
510 
511-	surface = surface_resource ? wl_resource_get_user_data(surface_resource) : NULL;
512+	surface =
513+	    surface_resource ? wl_resource_get_user_data(surface_resource) : NULL;
514 	pointer->cursor.surface = surface;
515 	pointer->cursor.hotspot.x = hotspot_x;
516 	pointer->cursor.hotspot.y = hotspot_y;
517 
518 	if (surface) {
519 		surface_set_view(surface, &pointer->cursor.view);
520-		wl_resource_add_destroy_listener(surface->resource, &pointer->cursor.destroy_listener);
521+		wl_resource_add_destroy_listener(surface->resource,
522+		                                 &pointer->cursor.destroy_listener);
523 		update_cursor(pointer);
524 	}
525 }
526 
527 static const struct wl_pointer_interface pointer_impl = {
528-	.set_cursor = set_cursor,
529-	.release = destroy_resource,
530+    .set_cursor = set_cursor,
531+    .release = destroy_resource,
532 };
533 
534 static void
535@@ -565,14 +630,18 @@ unbind(struct wl_resource *resource)
536 }
537 
538 struct wl_resource *
539-pointer_bind(struct pointer *pointer, struct wl_client *client, uint32_t version, uint32_t id)
540+pointer_bind(struct pointer *pointer, struct wl_client *client,
541+             uint32_t version, uint32_t id)
542 {
543 	struct wl_resource *client_resource;
544 
545-	client_resource = wl_resource_create(client, &wl_pointer_interface, version, id);
546-	if (!client_resource)
547+	client_resource =
548+	    wl_resource_create(client, &wl_pointer_interface, version, id);
549+	if (!client_resource) {
550 		return NULL;
551-	wl_resource_set_implementation(client_resource, &pointer_impl, pointer, &unbind);
552+	}
553+	wl_resource_set_implementation(client_resource, &pointer_impl, pointer,
554+	                               &unbind);
555 	input_focus_add_resource(&pointer->focus, client_resource);
556 
557 	return client_resource;
558@@ -583,16 +652,19 @@ pointer_get_button(struct pointer *pointer, uint32_t serial)
559 {
560 	struct button *button;
561 
562-	wl_array_for_each (button, &pointer->buttons) {
563-		if (button->press.serial == serial)
564+	wl_array_for_each(button, &pointer->buttons)
565+	{
566+		if (button->press.serial == serial) {
567 			return button;
568+		}
569 	}
570 
571 	return NULL;
572 }
573 
574 void
575-pointer_handle_button(struct pointer *pointer, uint32_t time, uint32_t value, uint32_t state)
576+pointer_handle_button(struct pointer *pointer, uint32_t time, uint32_t value,
577+                      uint32_t state)
578 {
579 	struct pointer_handler *handler;
580 	struct button *button;
581@@ -601,11 +673,13 @@ pointer_handle_button(struct pointer *pointer, uint32_t time, uint32_t value, ui
582 	serial = wl_display_next_serial(swc.display);
583 
584 	if (state == WL_POINTER_BUTTON_STATE_RELEASED) {
585-		wl_array_for_each (button, &pointer->buttons) {
586+		wl_array_for_each(button, &pointer->buttons)
587+		{
588 			if (button->press.value == value) {
589 				if (button->handler) {
590 					button->press.serial = serial;
591-					button->handler->button(button->handler, time, button, state);
592+					button->handler->button(button->handler, time, button,
593+					                        state);
594 					button->handler->pending = true;
595 				}
596 
597@@ -616,15 +690,18 @@ pointer_handle_button(struct pointer *pointer, uint32_t time, uint32_t value, ui
598 	} else {
599 		button = wl_array_add(&pointer->buttons, sizeof(*button));
600 
601-		if (!button)
602+		if (!button) {
603 			return;
604+		}
605 
606 		button->press.value = value;
607 		button->press.serial = serial;
608 		button->handler = NULL;
609 
610-		wl_list_for_each (handler, &pointer->handlers, link) {
611-			if (handler->button && handler->button(handler, time, button, state)) {
612+		wl_list_for_each(handler, &pointer->handlers, link)
613+		{
614+			if (handler->button &&
615+			    handler->button(handler, time, button, state)) {
616 				button->handler = handler;
617 				handler->pending = true;
618 				break;
619@@ -634,12 +711,17 @@ pointer_handle_button(struct pointer *pointer, uint32_t time, uint32_t value, ui
620 }
621 
622 void
623-pointer_handle_axis(struct pointer *pointer, uint32_t time, enum wl_pointer_axis axis, enum wl_pointer_axis_source source, wl_fixed_t value, int value120)
624+pointer_handle_axis(struct pointer *pointer, uint32_t time,
625+                    enum wl_pointer_axis axis,
626+                    enum wl_pointer_axis_source source, wl_fixed_t value,
627+                    int value120)
628 {
629 	struct pointer_handler *handler;
630 
631-	wl_list_for_each (handler, &pointer->handlers, link) {
632-		if (handler->axis && handler->axis(handler, time, axis, source, value, value120)) {
633+	wl_list_for_each(handler, &pointer->handlers, link)
634+	{
635+		if (handler->axis &&
636+		    handler->axis(handler, time, axis, source, value, value120)) {
637 			handler->pending = true;
638 			break;
639 		}
640@@ -647,20 +729,25 @@ pointer_handle_axis(struct pointer *pointer, uint32_t time, enum wl_pointer_axis
641 }
642 
643 void
644-pointer_handle_relative_motion(struct pointer *pointer, uint32_t time, wl_fixed_t dx, wl_fixed_t dy)
645+pointer_handle_relative_motion(struct pointer *pointer, uint32_t time,
646+                               wl_fixed_t dx, wl_fixed_t dy)
647 {
648-	pointer_handle_absolute_motion(pointer, time, pointer->x + dx, pointer->y + dy);
649+	pointer_handle_absolute_motion(pointer, time, pointer->x + dx,
650+	                               pointer->y + dy);
651 }
652 
653 void
654-pointer_handle_absolute_motion(struct pointer *pointer, uint32_t time, wl_fixed_t x, wl_fixed_t y)
655+pointer_handle_absolute_motion(struct pointer *pointer, uint32_t time,
656+                               wl_fixed_t x, wl_fixed_t y)
657 {
658 	struct pointer_handler *handler;
659 
660 	clip_position(pointer, x, y);
661 
662-	wl_list_for_each (handler, &pointer->handlers, link) {
663-		if (handler->motion && handler->motion(handler, time, pointer->x, pointer->y)) {
664+	wl_list_for_each(handler, &pointer->handlers, link)
665+	{
666+		if (handler->motion &&
667+		    handler->motion(handler, time, pointer->x, pointer->y)) {
668 			handler->pending = true;
669 			break;
670 		}
671@@ -674,7 +761,8 @@ pointer_handle_frame(struct pointer *pointer)
672 {
673 	struct pointer_handler *handler;
674 
675-	wl_list_for_each (handler, &pointer->handlers, link) {
676+	wl_list_for_each(handler, &pointer->handlers, link)
677+	{
678 		if (handler->pending && handler->frame) {
679 			handler->frame(handler);
680 			handler->pending = false;
+40, -17
 1@@ -36,9 +36,13 @@ struct button {
 2 };
 3 
 4 struct pointer_handler {
 5-	bool (*motion)(struct pointer_handler *handler, uint32_t time, wl_fixed_t x, wl_fixed_t y);
 6-	bool (*button)(struct pointer_handler *handler, uint32_t time, struct button *button, uint32_t state);
 7-	bool (*axis)(struct pointer_handler *handler, uint32_t time, enum wl_pointer_axis axis, enum wl_pointer_axis_source source, wl_fixed_t value, int value120);
 8+	bool (*motion)(struct pointer_handler *handler, uint32_t time, wl_fixed_t x,
 9+	               wl_fixed_t y);
10+	bool (*button)(struct pointer_handler *handler, uint32_t time,
11+	               struct button *button, uint32_t state);
12+	bool (*axis)(struct pointer_handler *handler, uint32_t time,
13+	             enum wl_pointer_axis axis, enum wl_pointer_axis_source source,
14+	             wl_fixed_t value, int value120);
15 	void (*frame)(struct pointer_handler *handler);
16 
17 	int pending;
18@@ -72,19 +76,38 @@ struct pointer {
19 	pixman_region32_t region;
20 };
21 
22-bool pointer_initialize(struct pointer *pointer);
23-void pointer_finalize(struct pointer *pointer);
24-void pointer_set_focus(struct pointer *pointer, struct compositor_view *view);
25-void pointer_set_region(struct pointer *pointer, pixman_region32_t *region);
26-void pointer_set_cursor(struct pointer *pointer, uint32_t id);
27-
28-struct button *pointer_get_button(struct pointer *pointer, uint32_t serial);
29-
30-struct wl_resource *pointer_bind(struct pointer *pointer, struct wl_client *client, uint32_t version, uint32_t id);
31-void pointer_handle_button(struct pointer *pointer, uint32_t time, uint32_t button, uint32_t state);
32-void pointer_handle_axis(struct pointer *pointer, uint32_t time, enum wl_pointer_axis axis, enum wl_pointer_axis_source source, wl_fixed_t value, int value120);
33-void pointer_handle_relative_motion(struct pointer *pointer, uint32_t time, wl_fixed_t dx, wl_fixed_t dy);
34-void pointer_handle_absolute_motion(struct pointer *pointer, uint32_t time, wl_fixed_t x, wl_fixed_t y);
35-void pointer_handle_frame(struct pointer *pointer);
36+bool
37+pointer_initialize(struct pointer *pointer);
38+void
39+pointer_finalize(struct pointer *pointer);
40+void
41+pointer_set_focus(struct pointer *pointer, struct compositor_view *view);
42+void
43+pointer_set_region(struct pointer *pointer, pixman_region32_t *region);
44+void
45+pointer_set_cursor(struct pointer *pointer, uint32_t id);
46+
47+struct button *
48+pointer_get_button(struct pointer *pointer, uint32_t serial);
49+
50+struct wl_resource *
51+pointer_bind(struct pointer *pointer, struct wl_client *client,
52+             uint32_t version, uint32_t id);
53+void
54+pointer_handle_button(struct pointer *pointer, uint32_t time, uint32_t button,
55+                      uint32_t state);
56+void
57+pointer_handle_axis(struct pointer *pointer, uint32_t time,
58+                    enum wl_pointer_axis axis,
59+                    enum wl_pointer_axis_source source, wl_fixed_t value,
60+                    int value120);
61+void
62+pointer_handle_relative_motion(struct pointer *pointer, uint32_t time,
63+                               wl_fixed_t dx, wl_fixed_t dy);
64+void
65+pointer_handle_absolute_motion(struct pointer *pointer, uint32_t time,
66+                               wl_fixed_t x, wl_fixed_t y);
67+void
68+pointer_handle_frame(struct pointer *pointer);
69 
70 #endif
+24, -13
  1@@ -29,8 +29,8 @@
  2 #include "util.h"
  3 
  4 #include <errno.h>
  5-#include <wld/wld.h>
  6 #include <wld/drm.h>
  7+#include <wld/wld.h>
  8 #include <xf86drm.h>
  9 #include <xf86drmMode.h>
 10 
 11@@ -57,17 +57,21 @@ attach(struct view *view, struct wld_buffer *buffer)
 12 
 13 	fb = drm_get_framebuffer(buffer);
 14 	if (plane->need_modeset) {
 15-		ret = drmModeSetCrtc(swc.drm->fd, plane->crtc, fb, 0, 0, plane->connectors.data, plane->connectors.size / 4, &plane->mode.info);
 16+		ret = drmModeSetCrtc(swc.drm->fd, plane->crtc, fb, 0, 0,
 17+		                     plane->connectors.data, plane->connectors.size / 4,
 18+		                     &plane->mode.info);
 19 
 20 		if (ret == 0) {
 21 			wl_event_loop_add_idle(swc.event_loop, &send_frame, plane);
 22 			plane->need_modeset = false;
 23 		} else {
 24-			ERROR("Could not set CRTC to next framebuffer: %s\n", strerror(-ret));
 25+			ERROR("Could not set CRTC to next framebuffer: %s\n",
 26+			      strerror(-ret));
 27 			return ret;
 28 		}
 29 	} else {
 30-		ret = drmModePageFlip(swc.drm->fd, plane->crtc, fb, DRM_MODE_PAGE_FLIP_EVENT, &plane->drm_handler);
 31+		ret = drmModePageFlip(swc.drm->fd, plane->crtc, fb,
 32+		                      DRM_MODE_PAGE_FLIP_EVENT, &plane->drm_handler);
 33 
 34 		if (ret < 0) {
 35 			ERROR("Page flip failed: %s\n", strerror(errno));
 36@@ -86,9 +90,9 @@ move(struct view *view, int32_t x, int32_t y)
 37 }
 38 
 39 static const struct view_impl view_impl = {
 40-	.update = update,
 41-	.attach = attach,
 42-	.move = move,
 43+    .update = update,
 44+    .attach = attach,
 45+    .move = move,
 46 };
 47 
 48 static void
 49@@ -102,7 +106,8 @@ static void
 50 handle_swc_event(struct wl_listener *listener, void *data)
 51 {
 52 	struct event *event = data;
 53-	struct primary_plane *plane = wl_container_of(listener, plane, swc_listener);
 54+	struct primary_plane *plane =
 55+	    wl_container_of(listener, plane, swc_listener);
 56 
 57 	switch (event->type) {
 58 	case SWC_EVENT_ACTIVATED:
 59@@ -112,24 +117,29 @@ handle_swc_event(struct wl_listener *listener, void *data)
 60 }
 61 
 62 bool
 63-primary_plane_initialize(struct primary_plane *plane, uint32_t crtc, struct mode *mode, uint32_t *connectors, uint32_t num_connectors)
 64+primary_plane_initialize(struct primary_plane *plane, uint32_t crtc,
 65+                         struct mode *mode, uint32_t *connectors,
 66+                         uint32_t num_connectors)
 67 {
 68 	uint32_t *plane_connectors;
 69 
 70 	if (!(plane->original_crtc_state = drmModeGetCrtc(swc.drm->fd, crtc))) {
 71-		ERROR("Failed to get CRTC state for CRTC %u: %s\n", crtc, strerror(errno));
 72+		ERROR("Failed to get CRTC state for CRTC %u: %s\n", crtc,
 73+		      strerror(errno));
 74 		goto error0;
 75 	}
 76 
 77 	wl_array_init(&plane->connectors);
 78-	plane_connectors = wl_array_add(&plane->connectors, num_connectors * sizeof(connectors[0]));
 79+	plane_connectors = wl_array_add(&plane->connectors,
 80+	                                num_connectors * sizeof(connectors[0]));
 81 
 82 	if (!plane_connectors) {
 83 		ERROR("Failed to allocate connector array\n");
 84 		goto error1;
 85 	}
 86 
 87-	memcpy(plane_connectors, connectors, num_connectors * sizeof(connectors[0]));
 88+	memcpy(plane_connectors, connectors,
 89+	       num_connectors * sizeof(connectors[0]));
 90 	plane->crtc = crtc;
 91 	plane->need_modeset = true;
 92 	view_initialize(&plane->view, &view_impl);
 93@@ -153,6 +163,7 @@ primary_plane_finalize(struct primary_plane *plane)
 94 {
 95 	wl_array_release(&plane->connectors);
 96 	drmModeCrtcPtr crtc = plane->original_crtc_state;
 97-	drmModeSetCrtc(swc.drm->fd, crtc->crtc_id, crtc->buffer_id, crtc->x, crtc->y, NULL, 0, &crtc->mode);
 98+	drmModeSetCrtc(swc.drm->fd, crtc->crtc_id, crtc->buffer_id, crtc->x,
 99+	               crtc->y, NULL, 0, &crtc->mode);
100 	drmModeFreeCrtc(crtc);
101 }
+7, -3
 1@@ -28,8 +28,8 @@
 2 #include "mode.h"
 3 #include "view.h"
 4 
 5-#include <stdint.h>
 6 #include <stdbool.h>
 7+#include <stdint.h>
 8 #include <wayland-server.h>
 9 
10 struct primary_plane {
11@@ -43,7 +43,11 @@ struct primary_plane {
12 	struct wl_listener swc_listener;
13 };
14 
15-bool primary_plane_initialize(struct primary_plane *plane, uint32_t crtc, struct mode *mode, uint32_t *connectors, uint32_t num_connectors);
16-void primary_plane_finalize(struct primary_plane *plane);
17+bool
18+primary_plane_initialize(struct primary_plane *plane, uint32_t crtc,
19+                         struct mode *mode, uint32_t *connectors,
20+                         uint32_t num_connectors);
21+void
22+primary_plane_finalize(struct primary_plane *plane);
23 
24 #endif
+13, -8
 1@@ -6,7 +6,8 @@
 2 #include <wayland-server.h>
 3 
 4 static void
 5-add(struct wl_client *client, struct wl_resource *resource, int32_t x, int32_t y, int32_t width, int32_t height)
 6+add(struct wl_client *client, struct wl_resource *resource, int32_t x,
 7+    int32_t y, int32_t width, int32_t height)
 8 {
 9 	pixman_region32_t *region = wl_resource_get_user_data(resource);
10 
11@@ -14,7 +15,8 @@ add(struct wl_client *client, struct wl_resource *resource, int32_t x, int32_t y
12 }
13 
14 static void
15-subtract(struct wl_client *client, struct wl_resource *resource, int32_t x, int32_t y, int32_t width, int32_t height)
16+subtract(struct wl_client *client, struct wl_resource *resource, int32_t x,
17+         int32_t y, int32_t width, int32_t height)
18 {
19 	pixman_region32_t *region = wl_resource_get_user_data(resource);
20 	pixman_region32_t operand;
21@@ -24,9 +26,9 @@ subtract(struct wl_client *client, struct wl_resource *resource, int32_t x, int3
22 }
23 
24 static const struct wl_region_interface region_impl = {
25-	.destroy = destroy_resource,
26-	.add = add,
27-	.subtract = subtract,
28+    .destroy = destroy_resource,
29+    .add = add,
30+    .subtract = subtract,
31 };
32 
33 static void
34@@ -45,13 +47,16 @@ region_new(struct wl_client *client, uint32_t version, uint32_t id)
35 	struct wl_resource *resource;
36 
37 	region = malloc(sizeof(*region));
38-	if (!region)
39+	if (!region) {
40 		goto error0;
41+	}
42 
43 	resource = wl_resource_create(client, &wl_region_interface, version, id);
44-	if (!resource)
45+	if (!resource) {
46 		goto error1;
47-	wl_resource_set_implementation(resource, &region_impl, region, &region_destroy);
48+	}
49+	wl_resource_set_implementation(resource, &region_impl, region,
50+	                               &region_destroy);
51 
52 	pixman_region32_init(region);
53 
+2, -1
1@@ -5,6 +5,7 @@
2 
3 struct wl_client;
4 
5-struct wl_resource *region_new(struct wl_client *client, uint32_t version, uint32_t id);
6+struct wl_resource *
7+region_new(struct wl_client *client, uint32_t version, uint32_t id);
8 
9 #endif
+51, -28
  1@@ -31,22 +31,28 @@
  2 #include "pointer.h"
  3 #include "util.h"
  4 
  5-#include <stdlib.h>
  6 #include "swc-server-protocol.h"
  7+#include <stdlib.h>
  8 
  9 #define INTERNAL(s) ((struct screen *)(s))
 10 
 11 static struct screen *active_screen;
 12 static const struct swc_screen_handler null_handler;
 13 
 14-static bool handle_motion(struct pointer_handler *handler, uint32_t time, wl_fixed_t x, wl_fixed_t y);
 15+static bool
 16+handle_motion(struct pointer_handler *handler,
 17+              uint32_t time,
 18+              wl_fixed_t x,
 19+              wl_fixed_t y);
 20 
 21 struct pointer_handler screens_pointer_handler = {
 22-	.motion = handle_motion,
 23+    .motion = handle_motion,
 24 };
 25 
 26 EXPORT void
 27-swc_screen_set_handler(struct swc_screen *base, const struct swc_screen_handler *handler, void *data)
 28+swc_screen_set_handler(struct swc_screen *base,
 29+                       const struct swc_screen_handler *handler,
 30+                       void *data)
 31 {
 32 	struct screen *screen = INTERNAL(base);
 33 
 34@@ -59,11 +65,13 @@ screens_initialize(void)
 35 {
 36 	wl_list_init(&swc.screens);
 37 
 38-	if (!drm_create_screens(&swc.screens))
 39+	if (!drm_create_screens(&swc.screens)) {
 40 		return false;
 41+	}
 42 
 43-	if (wl_list_empty(&swc.screens))
 44+	if (wl_list_empty(&swc.screens)) {
 45 		return false;
 46+	}
 47 
 48 	return true;
 49 }
 50@@ -73,8 +81,8 @@ screens_finalize(void)
 51 {
 52 	struct screen *screen, *tmp;
 53 
 54-	wl_list_for_each_safe (screen, tmp, &swc.screens, link)
 55-		screen_destroy(screen);
 56+	wl_list_for_each_safe(screen, tmp, &swc.screens, link)
 57+	    screen_destroy(screen);
 58 }
 59 
 60 static void
 61@@ -101,13 +109,15 @@ screen_new(uint32_t crtc, struct output *output, struct plane *cursor_plane)
 62 	int32_t x = 0;
 63 
 64 	/* Simple heuristic for initial screen positioning. */
 65-	wl_list_for_each (screen, &swc.screens, link)
 66-		x = MAX(x, screen->base.geometry.x + screen->base.geometry.width);
 67+	wl_list_for_each(screen, &swc.screens, link) x =
 68+	    MAX(x, screen->base.geometry.x + screen->base.geometry.width);
 69 
 70-	if (!(screen = malloc(sizeof(*screen))))
 71+	if (!(screen = malloc(sizeof(*screen)))) {
 72 		goto error0;
 73+	}
 74 
 75-	screen->global = wl_global_create(swc.display, &swc_screen_interface, 1, screen, &bind_screen);
 76+	screen->global = wl_global_create(
 77+	    swc.display, &swc_screen_interface, 1, screen, &bind_screen);
 78 
 79 	if (!screen->global) {
 80 		ERROR("Failed to create screen global\n");
 81@@ -116,7 +126,11 @@ screen_new(uint32_t crtc, struct output *output, struct plane *cursor_plane)
 82 
 83 	screen->crtc = crtc;
 84 
 85-	if (!primary_plane_initialize(&screen->planes.primary, crtc, output->preferred_mode, &output->connector, 1)) {
 86+	if (!primary_plane_initialize(&screen->planes.primary,
 87+	                              crtc,
 88+	                              output->preferred_mode,
 89+	                              &output->connector,
 90+	                              1)) {
 91 		ERROR("Failed to initialize primary plane\n");
 92 		goto error2;
 93 	}
 94@@ -152,13 +166,15 @@ screen_destroy(struct screen *screen)
 95 {
 96 	struct output *output, *next;
 97 
 98-	if (active_screen == screen)
 99+	if (active_screen == screen) {
100 		active_screen = NULL;
101-	if (screen->handler->destroy)
102+	}
103+	if (screen->handler->destroy) {
104 		screen->handler->destroy(screen->handler_data);
105+	}
106 	wl_signal_emit(&screen->destroy_signal, NULL);
107-	wl_list_for_each_safe (output, next, &screen->outputs, link)
108-		output_destroy(output);
109+	wl_list_for_each_safe(output, next, &screen->outputs, link)
110+	    output_destroy(output);
111 	primary_plane_finalize(&screen->planes.primary);
112 	plane_destroy(screen->planes.cursor);
113 	free(screen);
114@@ -174,44 +190,51 @@ screen_update_usable_geometry(struct screen *screen)
115 
116 	DEBUG("Updating usable geometry\n");
117 
118-	pixman_region32_init_rect(&total_usable, geom->x, geom->y, geom->width, geom->height);
119+	pixman_region32_init_rect(
120+	    &total_usable, geom->x, geom->y, geom->width, geom->height);
121 	pixman_region32_init(&usable);
122 
123-	wl_list_for_each (modifier, &screen->modifiers, link) {
124+	wl_list_for_each(modifier, &screen->modifiers, link)
125+	{
126 		modifier->modify(modifier, geom, &usable);
127 		pixman_region32_intersect(&total_usable, &total_usable, &usable);
128 	}
129 
130 	extents = pixman_region32_extents(&total_usable);
131 
132-	if (extents->x1 != screen->base.usable_geometry.x
133-	 || extents->y1 != screen->base.usable_geometry.y
134-	 || (extents->x2 - extents->x1) != screen->base.usable_geometry.width
135-	 || (extents->y2 - extents->y1) != screen->base.usable_geometry.height)
136-	{
137+	if (extents->x1 != screen->base.usable_geometry.x ||
138+	    extents->y1 != screen->base.usable_geometry.y ||
139+	    (extents->x2 - extents->x1) != screen->base.usable_geometry.width ||
140+	    (extents->y2 - extents->y1) != screen->base.usable_geometry.height) {
141 		screen->base.usable_geometry.x = extents->x1;
142 		screen->base.usable_geometry.y = extents->y1;
143 		screen->base.usable_geometry.width = extents->x2 - extents->x1;
144 		screen->base.usable_geometry.height = extents->y2 - extents->y1;
145 
146-		if (screen->handler->usable_geometry_changed)
147+		if (screen->handler->usable_geometry_changed) {
148 			screen->handler->usable_geometry_changed(screen->handler_data);
149+		}
150 	}
151 }
152 
153 bool
154-handle_motion(struct pointer_handler *handler, uint32_t time, wl_fixed_t fx, wl_fixed_t fy)
155+handle_motion(struct pointer_handler *handler,
156+              uint32_t time,
157+              wl_fixed_t fx,
158+              wl_fixed_t fy)
159 {
160 	struct screen *screen;
161 	int32_t x = wl_fixed_to_int(fx), y = wl_fixed_to_int(fy);
162 
163-	wl_list_for_each (screen, &swc.screens, link) {
164+	wl_list_for_each(screen, &swc.screens, link)
165+	{
166 		if (rectangle_contains_point(&screen->base.geometry, x, y)) {
167 			if (screen != active_screen) {
168 				active_screen = screen;
169 
170-				if (screen->handler->entered)
171+				if (screen->handler->entered) {
172 					screen->handler->entered(screen->handler_data);
173+				}
174 			}
175 			break;
176 		}
+14, -7
 1@@ -24,8 +24,8 @@
 2 #ifndef SWC_SCREEN_H
 3 #define SWC_SCREEN_H
 4 
 5-#include "swc.h"
 6 #include "primary_plane.h"
 7+#include "swc.h"
 8 
 9 #include <wayland-util.h>
10 
11@@ -37,7 +37,9 @@ struct screen_modifier {
12 	 * Takes the screen geometry and sets 'usable' to the usable region of the
13 	 * screen. 'usable' is an already initialized pixman region.
14 	 */
15-	void (*modify)(struct screen_modifier *modifier, const struct swc_rectangle *geometry, struct pixman_region32 *usable);
16+	void (*modify)(struct screen_modifier *modifier,
17+	               const struct swc_rectangle *geometry,
18+	               struct pixman_region32 *usable);
19 
20 	struct wl_list link;
21 };
22@@ -64,11 +66,15 @@ struct screen {
23 	struct wl_list link;
24 };
25 
26-bool screens_initialize(void);
27-void screens_finalize(void);
28+bool
29+screens_initialize(void);
30+void
31+screens_finalize(void);
32 
33-struct screen *screen_new(uint32_t crtc, struct output *output, struct plane *cursor_plane);
34-void screen_destroy(struct screen *screen);
35+struct screen *
36+screen_new(uint32_t crtc, struct output *output, struct plane *cursor_plane);
37+void
38+screen_destroy(struct screen *screen);
39 
40 static inline uint32_t
41 screen_mask(struct screen *screen)
42@@ -76,6 +82,7 @@ screen_mask(struct screen *screen)
43 	return 1 << screen->id;
44 }
45 
46-void screen_update_usable_geometry(struct screen *screen);
47+void
48+screen_update_usable_geometry(struct screen *screen);
49 
50 #endif
+173, -95
  1@@ -1,4 +1,3 @@
  2-#include "seat.h"
  3 #include "compositor.h"
  4 #include "data_device.h"
  5 #include "event.h"
  6@@ -7,9 +6,11 @@
  7 #include "launch.h"
  8 #include "pointer.h"
  9 #include "screen.h"
 10+#include "seat.h"
 11 #include "surface.h"
 12 #include "util.h"
 13 
 14+#include <ctype.h>
 15 #include <dirent.h>
 16 #include <errno.h>
 17 #include <fcntl.h>
 18@@ -19,9 +20,8 @@
 19 #include <stdio.h>
 20 #include <stdlib.h>
 21 #include <string.h>
 22-#include <unistd.h>
 23-#include <ctype.h>
 24 #include <sys/ioctl.h>
 25+#include <unistd.h>
 26 
 27 #include <linux/input.h>
 28 
 29@@ -67,15 +67,18 @@ struct seat {
 30 static void
 31 handle_keyboard_focus_event(struct wl_listener *listener, void *data)
 32 {
 33-	struct seat *seat = wl_container_of(listener, seat, keyboard_focus_listener);
 34+	struct seat *seat =
 35+	    wl_container_of(listener, seat, keyboard_focus_listener);
 36 	struct event *ev = data;
 37 	struct input_focus_event_data *event_data = ev->data;
 38 
 39-	if (ev->type != INPUT_FOCUS_EVENT_CHANGED)
 40+	if (ev->type != INPUT_FOCUS_EVENT_CHANGED) {
 41 		return;
 42+	}
 43 
 44 	if (event_data->new) {
 45-		struct wl_client *client = wl_resource_get_client(event_data->new->surface->resource);
 46+		struct wl_client *client =
 47+		    wl_resource_get_client(event_data->new->surface->resource);
 48 
 49 		/* offer the selection to the new focus */
 50 		data_device_offer_selection(seat->base.data_device, client);
 51@@ -88,11 +91,14 @@ handle_data_device_event(struct wl_listener *listener, void *data)
 52 	struct seat *seat = wl_container_of(listener, seat, data_device_listener);
 53 	struct event *ev = data;
 54 
 55-	if (ev->type != DATA_DEVICE_EVENT_SELECTION_CHANGED)
 56+	if (ev->type != DATA_DEVICE_EVENT_SELECTION_CHANGED) {
 57 		return;
 58+	}
 59 
 60-	if (seat->base.keyboard->focus.client)
 61-		data_device_offer_selection(seat->base.data_device, seat->base.keyboard->focus.client);
 62+	if (seat->base.keyboard->focus.client) {
 63+		data_device_offer_selection(seat->base.data_device,
 64+		                            seat->base.keyboard->focus.client);
 65+	}
 66 }
 67 
 68 static void
 69@@ -122,11 +128,13 @@ get_pointer(struct wl_client *client, struct wl_resource *resource, uint32_t id)
 70 }
 71 
 72 static void
 73-get_keyboard(struct wl_client *client, struct wl_resource *resource, uint32_t id)
 74+get_keyboard(struct wl_client *client, struct wl_resource *resource,
 75+             uint32_t id)
 76 {
 77 	struct seat *seat = wl_resource_get_user_data(resource);
 78 
 79-	keyboard_bind(seat->base.keyboard, client, wl_resource_get_version(resource), id);
 80+	keyboard_bind(seat->base.keyboard, client,
 81+	              wl_resource_get_version(resource), id);
 82 }
 83 
 84 static void
 85@@ -135,9 +143,9 @@ get_touch(struct wl_client *client, struct wl_resource *resource, uint32_t id)
 86 }
 87 
 88 static struct wl_seat_interface seat_impl = {
 89-	.get_pointer = get_pointer,
 90-	.get_keyboard = get_keyboard,
 91-	.get_touch = get_touch,
 92+    .get_pointer = get_pointer,
 93+    .get_keyboard = get_keyboard,
 94+    .get_touch = get_touch,
 95 };
 96 
 97 static void
 98@@ -146,15 +154,18 @@ bind_seat(struct wl_client *client, void *data, uint32_t version, uint32_t id)
 99 	struct seat *seat = data;
100 	struct wl_resource *resource;
101 
102-	if (version > 4)
103+	if (version > 4) {
104 		version = 4;
105+	}
106 
107 	resource = wl_resource_create(client, &wl_seat_interface, version, id);
108-	wl_resource_set_implementation(resource, &seat_impl, seat, &remove_resource);
109+	wl_resource_set_implementation(resource, &seat_impl, seat,
110+	                               &remove_resource);
111 	wl_list_insert(&seat->resources, wl_resource_get_link(resource));
112 
113-	if (version >= 2)
114+	if (version >= 2) {
115 		wl_seat_send_name(resource, seat->name);
116+	}
117 
118 	wl_seat_send_capabilities(resource, seat->capabilities);
119 }
120@@ -171,16 +182,20 @@ handle_evdev_key(struct seat *seat, const struct input_event *ev)
121 	uint32_t state;
122 	uint32_t time = event_time_ms(ev);
123 
124-	if (ev->value == 2)
125+	if (ev->value == 2) {
126 		return;
127+	}
128 
129-	if (ev->code >= BTN_MISC)
130+	if (ev->code >= BTN_MISC) {
131 		pointer_handle_button(seat->base.pointer, time, ev->code,
132-			ev->value ? WL_POINTER_BUTTON_STATE_PRESSED : WL_POINTER_BUTTON_STATE_RELEASED);
133-	else {
134-		if (ev->code > 255)
135+		                      ev->value ? WL_POINTER_BUTTON_STATE_PRESSED
136+		                                : WL_POINTER_BUTTON_STATE_RELEASED);
137+	} else {
138+		if (ev->code > 255) {
139 			return;
140-		state = (ev->value ? WL_KEYBOARD_KEY_STATE_PRESSED : WL_KEYBOARD_KEY_STATE_RELEASED);
141+		}
142+		state = (ev->value ? WL_KEYBOARD_KEY_STATE_PRESSED
143+		                   : WL_KEYBOARD_KEY_STATE_RELEASED);
144 		keyboard_handle_key(seat->base.keyboard, time, ev->code, state);
145 	}
146 }
147@@ -193,18 +208,24 @@ handle_evdev_rel(struct seat *seat, const struct input_event *ev)
148 
149 	switch (ev->code) {
150 	case REL_X:
151-		pointer_handle_relative_motion(seat->base.pointer, time, wl_fixed_from_int(ev->value), 0);
152+		pointer_handle_relative_motion(seat->base.pointer, time,
153+		                               wl_fixed_from_int(ev->value), 0);
154 		break;
155 	case REL_Y:
156-		pointer_handle_relative_motion(seat->base.pointer, time, 0, wl_fixed_from_int(ev->value));
157+		pointer_handle_relative_motion(seat->base.pointer, time, 0,
158+		                               wl_fixed_from_int(ev->value));
159 		break;
160 	case REL_WHEEL:
161 		value = wl_fixed_from_int(ev->value * 10);
162-		pointer_handle_axis(seat->base.pointer, time, WL_POINTER_AXIS_VERTICAL_SCROLL, WL_POINTER_AXIS_SOURCE_WHEEL, value, ev->value * 120);
163+		pointer_handle_axis(
164+		    seat->base.pointer, time, WL_POINTER_AXIS_VERTICAL_SCROLL,
165+		    WL_POINTER_AXIS_SOURCE_WHEEL, value, ev->value * 120);
166 		break;
167 	case REL_HWHEEL:
168 		value = wl_fixed_from_int(ev->value * 10);
169-		pointer_handle_axis(seat->base.pointer, time, WL_POINTER_AXIS_HORIZONTAL_SCROLL, WL_POINTER_AXIS_SOURCE_WHEEL, value, ev->value * 120);
170+		pointer_handle_axis(
171+		    seat->base.pointer, time, WL_POINTER_AXIS_HORIZONTAL_SCROLL,
172+		    WL_POINTER_AXIS_SOURCE_WHEEL, value, ev->value * 120);
173 		break;
174 	default:
175 		break;
176@@ -229,8 +250,10 @@ handle_evdev_abs(struct seat *seat, const struct input_event *ev)
177 		return;
178 	}
179 
180-	if (seat->abs_initialized)
181-		pointer_handle_absolute_motion(seat->base.pointer, time, seat->abs_x, seat->abs_y);
182+	if (seat->abs_initialized) {
183+		pointer_handle_absolute_motion(seat->base.pointer, time, seat->abs_x,
184+		                               seat->abs_y);
185+	}
186 }
187 
188 static int
189@@ -243,12 +266,14 @@ handle_evdev_data(int fd, uint32_t mask, void *data)
190 	while (!seat->ignore) {
191 		n = read(fd, &ev, sizeof(ev));
192 		if (n == -1) {
193-			if (errno == EAGAIN || errno == EINTR)
194+			if (errno == EAGAIN || errno == EINTR) {
195 				break;
196+			}
197 			return 0;
198 		}
199-		if (n != (ssize_t)sizeof(ev))
200+		if (n != (ssize_t)sizeof(ev)) {
201 			break;
202+		}
203 
204 		switch (ev.type) {
205 		case EV_KEY:
206@@ -261,8 +286,9 @@ handle_evdev_data(int fd, uint32_t mask, void *data)
207 			handle_evdev_abs(seat, &ev);
208 			break;
209 		case EV_SYN:
210-			if (ev.code == SYN_REPORT)
211+			if (ev.code == SYN_REPORT) {
212 				pointer_handle_frame(seat->base.pointer);
213+			}
214 			break;
215 		default:
216 			break;
217@@ -275,7 +301,9 @@ handle_evdev_data(int fd, uint32_t mask, void *data)
218 static bool
219 test_bit(const unsigned long *bits, size_t bit)
220 {
221-	return (bits[bit / (sizeof(unsigned long) * 8)] >> (bit % (sizeof(unsigned long) * 8))) & 1;
222+	return (bits[bit / (sizeof(unsigned long) * 8)] >>
223+	        (bit % (sizeof(unsigned long) * 8))) &
224+	       1;
225 }
226 
227 static bool
228@@ -284,8 +312,9 @@ contains_ci(const char *haystack, const char *needle)
229 	size_t nlen;
230 	const char *h;
231 
232-	if (!haystack || !needle || !*needle)
233+	if (!haystack || !needle || !*needle) {
234 		return false;
235+	}
236 
237 	nlen = strlen(needle);
238 	for (h = haystack; *h; ++h) {
239@@ -293,11 +322,13 @@ contains_ci(const char *haystack, const char *needle)
240 		for (i = 0; i < nlen; ++i) {
241 			unsigned char hc = (unsigned char)h[i];
242 			unsigned char nc = (unsigned char)needle[i];
243-			if (!h[i] || tolower(hc) != tolower(nc))
244+			if (!h[i] || tolower(hc) != tolower(nc)) {
245 				break;
246+			}
247 		}
248-		if (i == nlen)
249+		if (i == nlen) {
250 			return true;
251+		}
252 	}
253 	return false;
254 }
255@@ -305,52 +336,63 @@ contains_ci(const char *haystack, const char *needle)
256 static bool
257 is_keyboard_device(int fd)
258 {
259-	unsigned long ev_bits[(EV_MAX + 8 * sizeof(unsigned long) - 1) / (8 * sizeof(unsigned long))];
260-	unsigned long key_bits[(KEY_MAX + 8 * sizeof(unsigned long) - 1) / (8 * sizeof(unsigned long))];
261+	unsigned long ev_bits[(EV_MAX + 8 * sizeof(unsigned long) - 1) /
262+	                      (8 * sizeof(unsigned long))];
263+	unsigned long key_bits[(KEY_MAX + 8 * sizeof(unsigned long) - 1) /
264+	                       (8 * sizeof(unsigned long))];
265 
266 	memset(ev_bits, 0, sizeof(ev_bits));
267 	memset(key_bits, 0, sizeof(key_bits));
268 
269-	if (ioctl(fd, EVIOCGBIT(0, sizeof(ev_bits)), ev_bits) < 0)
270+	if (ioctl(fd, EVIOCGBIT(0, sizeof(ev_bits)), ev_bits) < 0) {
271 		return false;
272-	if (!test_bit(ev_bits, EV_KEY))
273+	}
274+	if (!test_bit(ev_bits, EV_KEY)) {
275 		return false;
276-	if (ioctl(fd, EVIOCGBIT(EV_KEY, sizeof(key_bits)), key_bits) < 0)
277+	}
278+	if (ioctl(fd, EVIOCGBIT(EV_KEY, sizeof(key_bits)), key_bits) < 0) {
279 		return false;
280+	}
281 
282-	return test_bit(key_bits, KEY_A) &&
283-		test_bit(key_bits, KEY_Z) &&
284-		test_bit(key_bits, KEY_ENTER) &&
285-		test_bit(key_bits, KEY_ESC) &&
286-		test_bit(key_bits, KEY_SPACE);
287+	return test_bit(key_bits, KEY_A) && test_bit(key_bits, KEY_Z) &&
288+	       test_bit(key_bits, KEY_ENTER) && test_bit(key_bits, KEY_ESC) &&
289+	       test_bit(key_bits, KEY_SPACE);
290 }
291 
292 static bool
293 is_pointer_device(int fd)
294 {
295-	unsigned long ev_bits[(EV_MAX + 8 * sizeof(unsigned long) - 1) / (8 * sizeof(unsigned long))];
296-	unsigned long rel_bits[(REL_MAX + 8 * sizeof(unsigned long) - 1) / (8 * sizeof(unsigned long))];
297-	unsigned long key_bits[(KEY_MAX + 8 * sizeof(unsigned long) - 1) / (8 * sizeof(unsigned long))];
298+	unsigned long ev_bits[(EV_MAX + 8 * sizeof(unsigned long) - 1) /
299+	                      (8 * sizeof(unsigned long))];
300+	unsigned long rel_bits[(REL_MAX + 8 * sizeof(unsigned long) - 1) /
301+	                       (8 * sizeof(unsigned long))];
302+	unsigned long key_bits[(KEY_MAX + 8 * sizeof(unsigned long) - 1) /
303+	                       (8 * sizeof(unsigned long))];
304 
305 	memset(ev_bits, 0, sizeof(ev_bits));
306 	memset(rel_bits, 0, sizeof(rel_bits));
307 	memset(key_bits, 0, sizeof(key_bits));
308 
309-	if (ioctl(fd, EVIOCGBIT(0, sizeof(ev_bits)), ev_bits) < 0)
310+	if (ioctl(fd, EVIOCGBIT(0, sizeof(ev_bits)), ev_bits) < 0) {
311 		return false;
312+	}
313 
314 	if (test_bit(ev_bits, EV_REL)) {
315-		if (ioctl(fd, EVIOCGBIT(EV_REL, sizeof(rel_bits)), rel_bits) < 0)
316+		if (ioctl(fd, EVIOCGBIT(EV_REL, sizeof(rel_bits)), rel_bits) < 0) {
317 			return false;
318-		if (test_bit(rel_bits, REL_X) && test_bit(rel_bits, REL_Y))
319+		}
320+		if (test_bit(rel_bits, REL_X) && test_bit(rel_bits, REL_Y)) {
321 			return true;
322+		}
323 	}
324 
325 	if (test_bit(ev_bits, EV_KEY)) {
326-		if (ioctl(fd, EVIOCGBIT(EV_KEY, sizeof(key_bits)), key_bits) < 0)
327+		if (ioctl(fd, EVIOCGBIT(EV_KEY, sizeof(key_bits)), key_bits) < 0) {
328 			return false;
329-		if (test_bit(key_bits, BTN_LEFT) && test_bit(key_bits, BTN_RIGHT))
330+		}
331+		if (test_bit(key_bits, BTN_LEFT) && test_bit(key_bits, BTN_RIGHT)) {
332 			return true;
333+		}
334 	}
335 
336 	return false;
337@@ -367,7 +409,8 @@ static int
338 score_candidate(int fd, bool want_keyboard, const char *id_name)
339 {
340 	char name[256];
341-	unsigned long ev_bits[(EV_MAX + 8 * sizeof(unsigned long) - 1) / (8 * sizeof(unsigned long))];
342+	unsigned long ev_bits[(EV_MAX + 8 * sizeof(unsigned long) - 1) /
343+	                      (8 * sizeof(unsigned long))];
344 	bool is_kbd;
345 	bool is_ptr;
346 	int score = 10;
347@@ -375,45 +418,59 @@ score_candidate(int fd, bool want_keyboard, const char *id_name)
348 	is_kbd = is_keyboard_device(fd);
349 	is_ptr = is_pointer_device(fd);
350 
351-	if (want_keyboard && !is_kbd)
352+	if (want_keyboard && !is_kbd) {
353 		return -1;
354-	if (!want_keyboard && !is_ptr)
355+	}
356+	if (!want_keyboard && !is_ptr) {
357 		return -1;
358+	}
359 
360-	if (ioctl(fd, EVIOCGNAME(sizeof(name)), name) < 0)
361+	if (ioctl(fd, EVIOCGNAME(sizeof(name)), name) < 0) {
362 		name[0] = '\0';
363+	}
364 
365-	if (!get_ev_bits(fd, ev_bits, sizeof(ev_bits)))
366+	if (!get_ev_bits(fd, ev_bits, sizeof(ev_bits))) {
367 		memset(ev_bits, 0, sizeof(ev_bits));
368+	}
369 
370 	if (want_keyboard) {
371-		if (is_ptr)
372+		if (is_ptr) {
373 			score -= 6;
374-		if (contains_ci(id_name, "mouse") || contains_ci(name, "mouse"))
375+		}
376+		if (contains_ci(id_name, "mouse") || contains_ci(name, "mouse")) {
377 			score -= 12;
378-		if (contains_ci(id_name, "kbd") || contains_ci(id_name, "keyboard"))
379+		}
380+		if (contains_ci(id_name, "kbd") || contains_ci(id_name, "keyboard")) {
381 			score += 4;
382-		if (contains_ci(name, "keyboard"))
383+		}
384+		if (contains_ci(name, "keyboard")) {
385 			score += 2;
386-		if (test_bit(ev_bits, EV_LED))
387+		}
388+		if (test_bit(ev_bits, EV_LED)) {
389 			score += 3;
390-		if (test_bit(ev_bits, EV_REP))
391+		}
392+		if (test_bit(ev_bits, EV_REP)) {
393 			score += 1;
394+		}
395 	} else {
396-		if (contains_ci(id_name, "mouse") || contains_ci(name, "mouse"))
397+		if (contains_ci(id_name, "mouse") || contains_ci(name, "mouse")) {
398 			score += 4;
399-		if (contains_ci(id_name, "kbd") || contains_ci(id_name, "keyboard"))
400+		}
401+		if (contains_ci(id_name, "kbd") || contains_ci(id_name, "keyboard")) {
402 			score -= 6;
403-		if (contains_ci(name, "keyboard"))
404+		}
405+		if (contains_ci(name, "keyboard")) {
406 			score -= 4;
407+		}
408 	}
409 
410 	return score;
411 }
412 
413 static bool
414-pick_best_device(const char *dir_path, const char *name_prefix, const char *name_substr,
415-                 bool want_keyboard, char *out, size_t out_len)
416+pick_best_device(const char *dir_path, const char *name_prefix,
417+                 const char *name_substr, bool want_keyboard, char *out,
418+                 size_t out_len)
419 {
420 	DIR *dir;
421 	struct dirent *ent;
422@@ -422,25 +479,30 @@ pick_best_device(const char *dir_path, const char *name_prefix, const char *name
423 	size_t prefix_len = name_prefix ? strlen(name_prefix) : 0;
424 
425 	dir = opendir(dir_path);
426-	if (!dir)
427+	if (!dir) {
428 		return false;
429+	}
430 
431 	while ((ent = readdir(dir)) != NULL) {
432 		char path[PATH_MAX];
433 		int fd;
434 		int score;
435 
436-		if (ent->d_name[0] == '.')
437+		if (ent->d_name[0] == '.') {
438 			continue;
439-		if (name_prefix && strncmp(ent->d_name, name_prefix, prefix_len) != 0)
440+		}
441+		if (name_prefix && strncmp(ent->d_name, name_prefix, prefix_len) != 0) {
442 			continue;
443-		if (name_substr && !strstr(ent->d_name, name_substr))
444+		}
445+		if (name_substr && !strstr(ent->d_name, name_substr)) {
446 			continue;
447+		}
448 
449 		snprintf(path, sizeof(path), "%s/%s", dir_path, ent->d_name);
450 		fd = launch_open_device(path, O_RDONLY | O_NONBLOCK);
451-		if (fd == -1)
452+		if (fd == -1) {
453 			continue;
454+		}
455 
456 		score = score_candidate(fd, want_keyboard, ent->d_name);
457 		if (score < 0) {
458@@ -469,19 +531,27 @@ initialize_evdev(struct seat *seat)
459 	const char *kbd_dev = EVDEV_KBD_DEVICE;
460 	const char *mouse_dev = EVDEV_POINTER_DEVICE;
461 
462-	if (pick_best_device("/dev/input/by-id", NULL, "event-kbd", true, kbd_path, sizeof(kbd_path)))
463+	if (pick_best_device("/dev/input/by-id", NULL, "event-kbd", true, kbd_path,
464+	                     sizeof(kbd_path))) {
465 		kbd_dev = kbd_path;
466-	else if (pick_best_device("/dev/input/by-path", NULL, "event-kbd", true, kbd_path, sizeof(kbd_path)))
467+	} else if (pick_best_device("/dev/input/by-path", NULL, "event-kbd", true,
468+	                            kbd_path, sizeof(kbd_path))) {
469 		kbd_dev = kbd_path;
470-	else if (pick_best_device("/dev/input", "event", NULL, true, kbd_path, sizeof(kbd_path)))
471+	} else if (pick_best_device("/dev/input", "event", NULL, true, kbd_path,
472+	                            sizeof(kbd_path))) {
473 		kbd_dev = kbd_path;
474+	}
475 
476-	if (pick_best_device("/dev/input/by-id", NULL, "event-mouse", false, mouse_path, sizeof(mouse_path)))
477+	if (pick_best_device("/dev/input/by-id", NULL, "event-mouse", false,
478+	                     mouse_path, sizeof(mouse_path))) {
479 		mouse_dev = mouse_path;
480-	else if (pick_best_device("/dev/input/by-path", NULL, "event-mouse", false, mouse_path, sizeof(mouse_path)))
481+	} else if (pick_best_device("/dev/input/by-path", NULL, "event-mouse",
482+	                            false, mouse_path, sizeof(mouse_path))) {
483 		mouse_dev = mouse_path;
484-	else if (pick_best_device("/dev/input", "event", NULL, false, mouse_path, sizeof(mouse_path)))
485+	} else if (pick_best_device("/dev/input", "event", NULL, false, mouse_path,
486+	                            sizeof(mouse_path))) {
487 		mouse_dev = mouse_path;
488+	}
489 
490 	DEBUG("evdev devices: keyboard=%s pointer=%s\n", kbd_dev, mouse_dev);
491 
492@@ -517,8 +587,9 @@ seat_create(struct wl_display *display, const char *seat_name)
493 	struct seat *seat;
494 
495 	seat = malloc(sizeof(*seat));
496-	if (!seat)
497+	if (!seat) {
498 		goto error0;
499+	}
500 
501 	memset(&seat->names, 0, sizeof(seat->names));
502 	seat->names.rules = "base";
503@@ -533,13 +604,17 @@ seat_create(struct wl_display *display, const char *seat_name)
504 		goto error1;
505 	}
506 
507-	if (!initialize_evdev(seat))
508+	if (!initialize_evdev(seat)) {
509 		goto error2;
510+	}
511 
512-	seat->global = wl_global_create(display, &wl_seat_interface, 4, seat, &bind_seat);
513-	if (!seat->global)
514+	seat->global =
515+	    wl_global_create(display, &wl_seat_interface, 4, seat, &bind_seat);
516+	if (!seat->global) {
517 		goto error2;
518-	seat->capabilities = WL_SEAT_CAPABILITY_KEYBOARD | WL_SEAT_CAPABILITY_POINTER;
519+	}
520+	seat->capabilities =
521+	    WL_SEAT_CAPABILITY_KEYBOARD | WL_SEAT_CAPABILITY_POINTER;
522 	wl_list_init(&seat->resources);
523 
524 	seat->swc_listener.notify = &handle_swc_event;
525@@ -551,7 +626,8 @@ seat_create(struct wl_display *display, const char *seat_name)
526 		goto error3;
527 	}
528 	seat->data_device_listener.notify = &handle_data_device_event;
529-	wl_signal_add(&seat->base.data_device->event_signal, &seat->data_device_listener);
530+	wl_signal_add(&seat->base.data_device->event_signal,
531+	              &seat->data_device_listener);
532 
533 	seat->base.keyboard = keyboard_create(&seat->names);
534 	if (!seat->base.keyboard) {
535@@ -559,7 +635,8 @@ seat_create(struct wl_display *display, const char *seat_name)
536 		goto error4;
537 	}
538 	seat->keyboard_focus_listener.notify = handle_keyboard_focus_event;
539-	wl_signal_add(&seat->base.keyboard->focus.event_signal, &seat->keyboard_focus_listener);
540+	wl_signal_add(&seat->base.keyboard->focus.event_signal,
541+	              &seat->keyboard_focus_listener);
542 
543 	if (!pointer_initialize(&seat->pointer)) {
544 		ERROR("Could not initialize pointer\n");
545@@ -567,13 +644,13 @@ seat_create(struct wl_display *display, const char *seat_name)
546 	}
547 	seat->base.pointer = &seat->pointer;
548 
549-	seat->kbd_source = wl_event_loop_add_fd
550-		(swc.event_loop, seat->kbd_fd, WL_EVENT_READABLE,
551-		 &handle_evdev_data, seat);
552+	seat->kbd_source =
553+	    wl_event_loop_add_fd(swc.event_loop, seat->kbd_fd, WL_EVENT_READABLE,
554+	                         &handle_evdev_data, seat);
555 	if (!seat->shared_fd) {
556-		seat->mouse_source = wl_event_loop_add_fd
557-			(swc.event_loop, seat->mouse_fd, WL_EVENT_READABLE,
558-			 &handle_evdev_data, seat);
559+		seat->mouse_source =
560+		    wl_event_loop_add_fd(swc.event_loop, seat->mouse_fd,
561+		                         WL_EVENT_READABLE, &handle_evdev_data, seat);
562 	} else {
563 		seat->mouse_source = NULL;
564 	}
565@@ -601,8 +678,9 @@ seat_destroy(struct swc_seat *seat_base)
566 {
567 	struct seat *seat = wl_container_of(seat_base, seat, base);
568 
569-	if (seat->mouse_source)
570+	if (seat->mouse_source) {
571 		wl_event_source_remove(seat->mouse_source);
572+	}
573 	wl_event_source_remove(seat->kbd_source);
574 	if (seat->mouse_source) {
575 		close(seat->mouse_fd);
+71, -62
  1@@ -22,9 +22,6 @@
  2  * SOFTWARE.
  3  */
  4 
  5-#include "wscons/atKeynames.h"
  6-#include "wscons/bsd_KbdMap.h"
  7-#include "seat.h"
  8 #include "compositor.h"
  9 #include "data_device.h"
 10 #include "event.h"
 11@@ -33,15 +30,18 @@
 12 #include "launch.h"
 13 #include "pointer.h"
 14 #include "screen.h"
 15+#include "seat.h"
 16 #include "surface.h"
 17 #include "util.h"
 18+#include "wscons/atKeynames.h"
 19+#include "wscons/bsd_KbdMap.h"
 20 
 21+#include <errno.h>
 22+#include <fcntl.h>
 23 #include <stdbool.h>
 24 #include <stdio.h>
 25 #include <stdlib.h>
 26 #include <string.h>
 27-#include <errno.h>
 28-#include <fcntl.h>
 29 #include <unistd.h>
 30 
 31 #include <dev/wscons/wsconsio.h>
 32@@ -51,35 +51,21 @@
 33 /* Map wscons encodings to libxkbcommon layout names. */
 34 struct ws_xkb_map {
 35 	const int ws;
 36-	const char * const xkb;
 37+	const char *const xkb;
 38 };
 39 
 40 static const struct ws_xkb_map ws_xkb_encodings[] = {
 41-	{ KB_UK, "gb" },
 42-	{ KB_BE, "be" },
 43+    {KB_UK, "gb"}, {KB_BE, "be"},
 44 #ifdef KB_CZ
 45-	{ KB_CZ, "cz" },
 46+    {KB_CZ, "cz"},
 47 #endif
 48-	{ KB_DK, "dk" },
 49-	{ KB_NL, "nl" },
 50-	{ KB_DE, "de" },
 51+    {KB_DK, "dk"}, {KB_NL, "nl"}, {KB_DE, "de"},
 52 #ifdef KB_GR
 53-	{ KB_GR, "gr" },
 54+    {KB_GR, "gr"},
 55 #endif
 56-	{ KB_HU, "hu" },
 57-	{ KB_IT, "it" },
 58-	{ KB_JP, "jp" },
 59-	{ KB_NO, "no" },
 60-	{ KB_PL, "pl" },
 61-	{ KB_PT, "pt" },
 62-	{ KB_RU, "ru" },
 63-	{ KB_ES, "es" },
 64-	{ KB_SV, "sv" },
 65-	{ KB_SG, "sg" },
 66-	{ KB_TR, "tr" },
 67-	{ KB_UA, "ua" },
 68-	{ -1, NULL }
 69-};
 70+    {KB_HU, "hu"}, {KB_IT, "it"}, {KB_JP, "jp"}, {KB_NO, "no"}, {KB_PL, "pl"},
 71+    {KB_PT, "pt"}, {KB_RU, "ru"}, {KB_ES, "es"}, {KB_SV, "sv"}, {KB_SG, "sg"},
 72+    {KB_TR, "tr"}, {KB_UA, "ua"}, {-1, NULL}};
 73 
 74 struct seat {
 75 	struct swc_seat base;
 76@@ -111,15 +97,18 @@ struct seat {
 77 static void
 78 handle_keyboard_focus_event(struct wl_listener *listener, void *data)
 79 {
 80-	struct seat *seat = wl_container_of(listener, seat, keyboard_focus_listener);
 81+	struct seat *seat =
 82+	    wl_container_of(listener, seat, keyboard_focus_listener);
 83 	struct event *ev = data;
 84 	struct input_focus_event_data *event_data = ev->data;
 85 
 86-	if (ev->type != INPUT_FOCUS_EVENT_CHANGED)
 87+	if (ev->type != INPUT_FOCUS_EVENT_CHANGED) {
 88 		return;
 89+	}
 90 
 91 	if (event_data->new) {
 92-		struct wl_client *client = wl_resource_get_client(event_data->new->surface->resource);
 93+		struct wl_client *client =
 94+		    wl_resource_get_client(event_data->new->surface->resource);
 95 
 96 		/* Offer the selection to the new focus. */
 97 		data_device_offer_selection(seat->base.data_device, client);
 98@@ -132,11 +121,14 @@ handle_data_device_event(struct wl_listener *listener, void *data)
 99 	struct seat *seat = wl_container_of(listener, seat, data_device_listener);
100 	struct event *ev = data;
101 
102-	if (ev->type != DATA_DEVICE_EVENT_SELECTION_CHANGED)
103+	if (ev->type != DATA_DEVICE_EVENT_SELECTION_CHANGED) {
104 		return;
105+	}
106 
107-	if (seat->base.keyboard->focus.client)
108-		data_device_offer_selection(seat->base.data_device, seat->base.keyboard->focus.client);
109+	if (seat->base.keyboard->focus.client) {
110+		data_device_offer_selection(seat->base.data_device,
111+		                            seat->base.keyboard->focus.client);
112+	}
113 }
114 
115 static void
116@@ -166,11 +158,13 @@ get_pointer(struct wl_client *client, struct wl_resource *resource, uint32_t id)
117 }
118 
119 static void
120-get_keyboard(struct wl_client *client, struct wl_resource *resource, uint32_t id)
121+get_keyboard(struct wl_client *client, struct wl_resource *resource,
122+             uint32_t id)
123 {
124 	struct seat *seat = wl_resource_get_user_data(resource);
125 
126-	keyboard_bind(seat->base.keyboard, client, wl_resource_get_version(resource), id);
127+	keyboard_bind(seat->base.keyboard, client,
128+	              wl_resource_get_version(resource), id);
129 }
130 
131 static void
132@@ -180,9 +174,9 @@ get_touch(struct wl_client *client, struct wl_resource *resource, uint32_t id)
133 }
134 
135 static struct wl_seat_interface seat_impl = {
136-	.get_pointer = get_pointer,
137-	.get_keyboard = get_keyboard,
138-	.get_touch = get_touch,
139+    .get_pointer = get_pointer,
140+    .get_keyboard = get_keyboard,
141+    .get_touch = get_touch,
142 };
143 
144 static void
145@@ -191,15 +185,18 @@ bind_seat(struct wl_client *client, void *data, uint32_t version, uint32_t id)
146 	struct seat *seat = data;
147 	struct wl_resource *resource;
148 
149-	if (version > 4)
150+	if (version > 4) {
151 		version = 4;
152+	}
153 
154 	resource = wl_resource_create(client, &wl_seat_interface, version, id);
155-	wl_resource_set_implementation(resource, &seat_impl, seat, &remove_resource);
156+	wl_resource_set_implementation(resource, &seat_impl, seat,
157+	                               &remove_resource);
158 	wl_list_insert(&seat->resources, wl_resource_get_link(resource));
159 
160-	if (version >= 2)
161+	if (version >= 2) {
162 		wl_seat_send_name(resource, seat->name);
163+	}
164 
165 	wl_seat_send_capabilities(resource, seat->capabilities);
166 }
167@@ -321,11 +318,13 @@ initialize_wscons(struct seat *seat)
168 	int kbd_ver = WSKBDIO_EVENT_VERSION;
169 #endif
170 
171-	if ((seat->mouse_fd = launch_open_device("/dev/wsmouse", O_RDWR | O_NONBLOCK)) == -1) {
172+	if ((seat->mouse_fd =
173+	         launch_open_device("/dev/wsmouse", O_RDWR | O_NONBLOCK)) == -1) {
174 		ERROR("Could not open mouse device\n");
175 		goto error0;
176 	}
177-	if ((seat->kbd_fd = launch_open_device("/dev/wskbd", O_RDWR | O_NONBLOCK)) == -1) {
178+	if ((seat->kbd_fd =
179+	         launch_open_device("/dev/wskbd", O_RDWR | O_NONBLOCK)) == -1) {
180 		ERROR("Could not open keyboard device\n");
181 		goto error1;
182 	}
183@@ -340,16 +339,20 @@ initialize_wscons(struct seat *seat)
184 	/* set devices to nativemode to receive events */
185 #ifdef WSMOUSEIO_SETMODE
186 	{
187-		int mode = WSMOUSE_COMPAT;  /* use compat mode; it sends events */
188-		if (ioctl(seat->mouse_fd, WSMOUSEIO_SETMODE, &mode) == -1)
189-			fprintf(stderr, "wscons: WSMOUSEIO_SETMODE failed: %s\n", strerror(errno));
190+		int mode = WSMOUSE_COMPAT; /* use compat mode; it sends events */
191+		if (ioctl(seat->mouse_fd, WSMOUSEIO_SETMODE, &mode) == -1) {
192+			fprintf(stderr, "wscons: WSMOUSEIO_SETMODE failed: %s\n",
193+			        strerror(errno));
194+		}
195 	}
196 #endif /* WSMOUSEIO_SETMODE */
197 #ifdef WSKBDIO_SETMODE
198 	{
199-		int mode = WSKBD_TRANSLATED;  /* use translated mode for key events */
200-		if (ioctl(seat->kbd_fd, WSKBDIO_SETMODE, &mode) == -1)
201-			fprintf(stderr, "wscons: WSKBDIO_SETMODE failed: %s\n", strerror(errno));
202+		int mode = WSKBD_TRANSLATED; /* use translated mode for key events */
203+		if (ioctl(seat->kbd_fd, WSKBDIO_SETMODE, &mode) == -1) {
204+			fprintf(stderr, "wscons: WSKBDIO_SETMODE failed: %s\n",
205+			        strerror(errno));
206+		}
207 	}
208 #endif /* WSKBDIO_SETMODE */
209 
210@@ -397,8 +400,9 @@ seat_create(struct wl_display *display, const char *seat_name)
211 	struct seat *seat;
212 
213 	seat = malloc(sizeof(*seat));
214-	if (!seat)
215+	if (!seat) {
216 		goto error0;
217+	}
218 
219 	seat->ignore = false;
220 	memset(&seat->names, 0, sizeof(seat->names));
221@@ -413,13 +417,17 @@ seat_create(struct wl_display *display, const char *seat_name)
222 		goto error1;
223 	}
224 
225-	if (!initialize_wscons(seat))
226+	if (!initialize_wscons(seat)) {
227 		goto error2;
228+	}
229 
230-	seat->global = wl_global_create(display, &wl_seat_interface, 4, seat, &bind_seat);
231-	if (!seat->global)
232+	seat->global =
233+	    wl_global_create(display, &wl_seat_interface, 4, seat, &bind_seat);
234+	if (!seat->global) {
235 		goto error2;
236-	seat->capabilities = WL_SEAT_CAPABILITY_KEYBOARD | WL_SEAT_CAPABILITY_POINTER;
237+	}
238+	seat->capabilities =
239+	    WL_SEAT_CAPABILITY_KEYBOARD | WL_SEAT_CAPABILITY_POINTER;
240 	wl_list_init(&seat->resources);
241 
242 	seat->swc_listener.notify = &handle_swc_event;
243@@ -431,7 +439,8 @@ seat_create(struct wl_display *display, const char *seat_name)
244 		goto error3;
245 	}
246 	seat->data_device_listener.notify = &handle_data_device_event;
247-	wl_signal_add(&seat->base.data_device->event_signal, &seat->data_device_listener);
248+	wl_signal_add(&seat->base.data_device->event_signal,
249+	              &seat->data_device_listener);
250 
251 	seat->base.keyboard = keyboard_create(&seat->names);
252 	if (!seat->base.keyboard) {
253@@ -439,7 +448,8 @@ seat_create(struct wl_display *display, const char *seat_name)
254 		goto error4;
255 	}
256 	seat->keyboard_focus_listener.notify = handle_keyboard_focus_event;
257-	wl_signal_add(&seat->base.keyboard->focus.event_signal, &seat->keyboard_focus_listener);
258+	wl_signal_add(&seat->base.keyboard->focus.event_signal,
259+	              &seat->keyboard_focus_listener);
260 
261 	if (!pointer_initialize(&seat->pointer)) {
262 		ERROR("Could not initialize pointer\n");
263@@ -447,12 +457,11 @@ seat_create(struct wl_display *display, const char *seat_name)
264 	}
265 	seat->base.pointer = &seat->pointer;
266 
267-	seat->kbd_source = wl_event_loop_add_fd
268-		(swc.event_loop, seat->kbd_fd, WL_EVENT_READABLE,
269-		 &handle_ws_data, seat);
270-	seat->mouse_source = wl_event_loop_add_fd
271-		(swc.event_loop, seat->mouse_fd, WL_EVENT_READABLE,
272-		 &handle_ws_data, seat);
273+	seat->kbd_source = wl_event_loop_add_fd(
274+	    swc.event_loop, seat->kbd_fd, WL_EVENT_READABLE, &handle_ws_data, seat);
275+	seat->mouse_source =
276+	    wl_event_loop_add_fd(swc.event_loop, seat->mouse_fd, WL_EVENT_READABLE,
277+	                         &handle_ws_data, seat);
278 
279 	return &seat->base;
280 
+100, -49
  1@@ -43,11 +43,11 @@
  2 #include <libinput.h>
  3 #include <linux/input.h>
  4 #ifdef ENABLE_LIBUDEV
  5-# include <libudev.h>
  6+#include <libudev.h>
  7 #endif
  8 
  9 #ifndef NETLINK_MASK
 10-# define NETLINK_MASK 4
 11+#define NETLINK_MASK 4
 12 #endif
 13 
 14 struct seat {
 15@@ -76,15 +76,18 @@ struct seat {
 16 static void
 17 handle_keyboard_focus_event(struct wl_listener *listener, void *data)
 18 {
 19-	struct seat *seat = wl_container_of(listener, seat, keyboard_focus_listener);
 20+	struct seat *seat =
 21+	    wl_container_of(listener, seat, keyboard_focus_listener);
 22 	struct event *ev = data;
 23 	struct input_focus_event_data *event_data = ev->data;
 24 
 25-	if (ev->type != INPUT_FOCUS_EVENT_CHANGED)
 26+	if (ev->type != INPUT_FOCUS_EVENT_CHANGED) {
 27 		return;
 28+	}
 29 
 30 	if (event_data->new) {
 31-		struct wl_client *client = wl_resource_get_client(event_data->new->surface->resource);
 32+		struct wl_client *client =
 33+		    wl_resource_get_client(event_data->new->surface->resource);
 34 
 35 		/* Offer the selection to the new focus. */
 36 		data_device_offer_selection(seat->base.data_device, client);
 37@@ -97,11 +100,14 @@ handle_data_device_event(struct wl_listener *listener, void *data)
 38 	struct seat *seat = wl_container_of(listener, seat, data_device_listener);
 39 	struct event *ev = data;
 40 
 41-	if (ev->type != DATA_DEVICE_EVENT_SELECTION_CHANGED)
 42+	if (ev->type != DATA_DEVICE_EVENT_SELECTION_CHANGED) {
 43 		return;
 44+	}
 45 
 46-	if (seat->base.keyboard->focus.client)
 47-		data_device_offer_selection(seat->base.data_device, seat->base.keyboard->focus.client);
 48+	if (seat->base.keyboard->focus.client) {
 49+		data_device_offer_selection(seat->base.data_device,
 50+		                            seat->base.keyboard->focus.client);
 51+	}
 52 }
 53 
 54 static void
 55@@ -116,8 +122,9 @@ handle_swc_event(struct wl_listener *listener, void *data)
 56 		keyboard_reset(seat->base.keyboard);
 57 		break;
 58 	case SWC_EVENT_ACTIVATED:
 59-		if (libinput_resume(seat->libinput) != 0)
 60+		if (libinput_resume(seat->libinput) != 0) {
 61 			WARNING("Failed to resume libinput context\n");
 62+		}
 63 		break;
 64 	}
 65 }
 66@@ -128,17 +135,22 @@ get_pointer(struct wl_client *client, struct wl_resource *resource, uint32_t id)
 67 {
 68 	struct seat *seat = wl_resource_get_user_data(resource);
 69 
 70-	if (!pointer_bind(&seat->pointer, client, wl_resource_get_version(resource), id))
 71+	if (!pointer_bind(&seat->pointer, client, wl_resource_get_version(resource),
 72+	                  id)) {
 73 		wl_resource_post_no_memory(resource);
 74+	}
 75 }
 76 
 77 static void
 78-get_keyboard(struct wl_client *client, struct wl_resource *resource, uint32_t id)
 79+get_keyboard(struct wl_client *client, struct wl_resource *resource,
 80+             uint32_t id)
 81 {
 82 	struct seat *seat = wl_resource_get_user_data(resource);
 83 
 84-	if (!keyboard_bind(seat->base.keyboard, client, wl_resource_get_version(resource), id))
 85+	if (!keyboard_bind(seat->base.keyboard, client,
 86+	                   wl_resource_get_version(resource), id)) {
 87 		wl_resource_post_no_memory(resource);
 88+	}
 89 }
 90 
 91 static void
 92@@ -148,9 +160,9 @@ get_touch(struct wl_client *client, struct wl_resource *resource, uint32_t id)
 93 }
 94 
 95 static const struct wl_seat_interface seat_impl = {
 96-	.get_pointer = get_pointer,
 97-	.get_keyboard = get_keyboard,
 98-	.get_touch = get_touch,
 99+    .get_pointer = get_pointer,
100+    .get_keyboard = get_keyboard,
101+    .get_touch = get_touch,
102 };
103 
104 static void
105@@ -164,11 +176,13 @@ bind_seat(struct wl_client *client, void *data, uint32_t version, uint32_t id)
106 		wl_client_post_no_memory(client);
107 		return;
108 	}
109-	wl_resource_set_implementation(resource, &seat_impl, seat, &remove_resource);
110+	wl_resource_set_implementation(resource, &seat_impl, seat,
111+	                               &remove_resource);
112 	wl_list_insert(&seat->resources, wl_resource_get_link(resource));
113 
114-	if (version >= 2)
115+	if (version >= 2) {
116 		wl_seat_send_name(resource, seat->name);
117+	}
118 
119 	wl_seat_send_capabilities(resource, seat->capabilities);
120 }
121@@ -178,12 +192,13 @@ update_capabilities(struct seat *seat, uint32_t capabilities)
122 {
123 	struct wl_resource *resource;
124 
125-	if (!(~seat->capabilities & capabilities))
126+	if (!(~seat->capabilities & capabilities)) {
127 		return;
128+	}
129 
130 	seat->capabilities |= capabilities;
131 	wl_list_for_each(resource, &seat->resources, link)
132-		wl_seat_send_capabilities(resource, seat->capabilities);
133+	    wl_seat_send_capabilities(resource, seat->capabilities);
134 }
135 
136 static int
137@@ -199,8 +214,8 @@ close_restricted(int fd, void *user_data)
138 }
139 
140 const struct libinput_interface libinput_interface = {
141-	.open_restricted = open_restricted,
142-	.close_restricted = close_restricted,
143+    .open_restricted = open_restricted,
144+    .close_restricted = close_restricted,
145 };
146 
147 static uint32_t
148@@ -208,10 +223,12 @@ device_capabilities(struct libinput_device *device)
149 {
150 	uint32_t capabilities = 0;
151 
152-	if (libinput_device_has_capability(device, LIBINPUT_DEVICE_CAP_KEYBOARD))
153+	if (libinput_device_has_capability(device, LIBINPUT_DEVICE_CAP_KEYBOARD)) {
154 		capabilities |= WL_SEAT_CAPABILITY_KEYBOARD;
155-	if (libinput_device_has_capability(device, LIBINPUT_DEVICE_CAP_POINTER))
156+	}
157+	if (libinput_device_has_capability(device, LIBINPUT_DEVICE_CAP_POINTER)) {
158 		capabilities |= WL_SEAT_CAPABILITY_POINTER;
159+	}
160 	/* TODO: Add touch device support
161 	 * if (libinput_device_has_capability(device, LIBINPUT_DEVICE_CAP_TOUCH))
162 	 * 	capabilities |= WL_SEAT_CAPABILITY_TOUCH;
163@@ -247,8 +264,9 @@ handle_libinput_data(int fd, uint32_t mask, void *data)
164 		case LIBINPUT_EVENT_DEVICE_ADDED:
165 			device = libinput_event_get_device(generic_event);
166 			update_capabilities(seat, device_capabilities(device));
167-			if (swc.manager->new_device)
168+			if (swc.manager->new_device) {
169 				swc.manager->new_device(device);
170+			}
171 			break;
172 		case LIBINPUT_EVENT_KEYBOARD_KEY:
173 			event.k = libinput_event_get_keyboard_event(generic_event);
174@@ -270,8 +288,12 @@ handle_libinput_data(int fd, uint32_t mask, void *data)
175 			rect = &screen->base.geometry;
176 			event.p = libinput_event_get_pointer_event(generic_event);
177 			time = libinput_event_pointer_get_time(event.p);
178-			x = wl_fixed_from_double(libinput_event_pointer_get_absolute_x_transformed(event.p, rect->width));
179-			y = wl_fixed_from_double(libinput_event_pointer_get_absolute_y_transformed(event.p, rect->height));
180+			x = wl_fixed_from_double(
181+			    libinput_event_pointer_get_absolute_x_transformed(event.p,
182+			                                                      rect->width));
183+			y = wl_fixed_from_double(
184+			    libinput_event_pointer_get_absolute_y_transformed(
185+			        event.p, rect->height));
186 			pointer_handle_absolute_motion(&seat->pointer, time, x, y);
187 			pointer_handle_frame(&seat->pointer);
188 			break;
189@@ -282,15 +304,19 @@ handle_libinput_data(int fd, uint32_t mask, void *data)
190 			state = libinput_event_pointer_get_button_state(event.p);
191 			pointer_handle_button(&seat->pointer, time, key, state);
192 			if (state == LIBINPUT_BUTTON_STATE_PRESSED) {
193-		                /* qemu generates GEAR_UP/GEAR_DOWN events on scroll, so pass
194+				/* qemu generates GEAR_UP/GEAR_DOWN events on scroll, so pass
195 				 * those through as axis events. */
196 				source = WL_POINTER_AXIS_SOURCE_WHEEL;
197 				switch (key) {
198 				case BTN_GEAR_DOWN:
199-					pointer_handle_axis(&seat->pointer, time, WL_POINTER_AXIS_VERTICAL_SCROLL, source, wl_fixed_from_int(10), 120);
200+					pointer_handle_axis(&seat->pointer, time,
201+					                    WL_POINTER_AXIS_VERTICAL_SCROLL, source,
202+					                    wl_fixed_from_int(10), 120);
203 					break;
204 				case BTN_GEAR_UP:
205-					pointer_handle_axis(&seat->pointer, time, WL_POINTER_AXIS_VERTICAL_SCROLL, source, wl_fixed_from_int(-10), -120);
206+					pointer_handle_axis(&seat->pointer, time,
207+					                    WL_POINTER_AXIS_VERTICAL_SCROLL, source,
208+					                    wl_fixed_from_int(-10), -120);
209 					break;
210 				}
211 			}
212@@ -309,17 +335,31 @@ handle_libinput_data(int fd, uint32_t mask, void *data)
213 			event.p = libinput_event_get_pointer_event(generic_event);
214 			time = libinput_event_pointer_get_time(event.p);
215 			value120 = 0;
216-			if (libinput_event_pointer_has_axis(event.p, LIBINPUT_POINTER_AXIS_SCROLL_VERTICAL)) {
217-				value = wl_fixed_from_double(libinput_event_pointer_get_scroll_value(event.p, LIBINPUT_POINTER_AXIS_SCROLL_VERTICAL));
218-				if (source == WL_POINTER_AXIS_SOURCE_WHEEL)
219-					value120 = libinput_event_pointer_get_scroll_value_v120(event.p, LIBINPUT_POINTER_AXIS_SCROLL_VERTICAL);
220-				pointer_handle_axis(&seat->pointer, time, WL_POINTER_AXIS_VERTICAL_SCROLL, source, value, value120);
221+			if (libinput_event_pointer_has_axis(
222+			        event.p, LIBINPUT_POINTER_AXIS_SCROLL_VERTICAL)) {
223+				value = wl_fixed_from_double(
224+				    libinput_event_pointer_get_scroll_value(
225+				        event.p, LIBINPUT_POINTER_AXIS_SCROLL_VERTICAL));
226+				if (source == WL_POINTER_AXIS_SOURCE_WHEEL) {
227+					value120 = libinput_event_pointer_get_scroll_value_v120(
228+					    event.p, LIBINPUT_POINTER_AXIS_SCROLL_VERTICAL);
229+				}
230+				pointer_handle_axis(&seat->pointer, time,
231+				                    WL_POINTER_AXIS_VERTICAL_SCROLL, source,
232+				                    value, value120);
233 			}
234-			if (libinput_event_pointer_has_axis(event.p, LIBINPUT_POINTER_AXIS_SCROLL_HORIZONTAL)) {
235-				value = wl_fixed_from_double(libinput_event_pointer_get_scroll_value(event.p, LIBINPUT_POINTER_AXIS_SCROLL_HORIZONTAL));
236-				if (source == WL_POINTER_AXIS_SOURCE_WHEEL)
237-					value120 = libinput_event_pointer_get_scroll_value_v120(event.p, LIBINPUT_POINTER_AXIS_SCROLL_HORIZONTAL);
238-				pointer_handle_axis(&seat->pointer, time, WL_POINTER_AXIS_HORIZONTAL_SCROLL, source, value, value120);
239+			if (libinput_event_pointer_has_axis(
240+			        event.p, LIBINPUT_POINTER_AXIS_SCROLL_HORIZONTAL)) {
241+				value = wl_fixed_from_double(
242+				    libinput_event_pointer_get_scroll_value(
243+				        event.p, LIBINPUT_POINTER_AXIS_SCROLL_HORIZONTAL));
244+				if (source == WL_POINTER_AXIS_SOURCE_WHEEL) {
245+					value120 = libinput_event_pointer_get_scroll_value_v120(
246+					    event.p, LIBINPUT_POINTER_AXIS_SCROLL_HORIZONTAL);
247+				}
248+				pointer_handle_axis(&seat->pointer, time,
249+				                    WL_POINTER_AXIS_HORIZONTAL_SCROLL, source,
250+				                    value, value120);
251 			}
252 			pointer_handle_frame(&seat->pointer);
253 			break;
254@@ -342,9 +382,11 @@ initialize_libinput(struct seat *seat)
255 		goto error0;
256 	}
257 
258-	seat->libinput = libinput_udev_create_context(&libinput_interface, NULL, seat->udev);
259+	seat->libinput =
260+	    libinput_udev_create_context(&libinput_interface, NULL, seat->udev);
261 #else
262-	seat->libinput = libinput_netlink_create_context(&libinput_interface, NULL, NETLINK_MASK);
263+	seat->libinput = libinput_netlink_create_context(&libinput_interface, NULL,
264+	                                                 NETLINK_MASK);
265 #endif
266 
267 	if (!seat->libinput) {
268@@ -364,14 +406,17 @@ initialize_libinput(struct seat *seat)
269 	}
270 #endif
271 
272-	seat->libinput_source = wl_event_loop_add_fd(swc.event_loop, libinput_get_fd(seat->libinput), WL_EVENT_READABLE, &handle_libinput_data, seat);
273+	seat->libinput_source =
274+	    wl_event_loop_add_fd(swc.event_loop, libinput_get_fd(seat->libinput),
275+	                         WL_EVENT_READABLE, &handle_libinput_data, seat);
276 	if (!seat->libinput_source) {
277 		ERROR("Could not create event source for libinput\n");
278 		goto error2;
279 	}
280 
281-	if (!swc.active)
282+	if (!swc.active) {
283 		libinput_suspend(seat->libinput);
284+	}
285 
286 	return true;
287 
288@@ -391,16 +436,19 @@ seat_create(struct wl_display *display, const char *seat_name)
289 	struct seat *seat;
290 
291 	seat = malloc(sizeof(*seat));
292-	if (!seat)
293+	if (!seat) {
294 		goto error0;
295+	}
296 	seat->name = strdup(seat_name);
297 	if (!seat->name) {
298 		ERROR("Could not allocate seat name string\n");
299 		goto error1;
300 	}
301-	seat->global = wl_global_create(display, &wl_seat_interface, 8, seat, &bind_seat);
302-	if (!seat->global)
303+	seat->global =
304+	    wl_global_create(display, &wl_seat_interface, 8, seat, &bind_seat);
305+	if (!seat->global) {
306 		goto error2;
307+	}
308 	seat->capabilities = 0;
309 	wl_list_init(&seat->resources);
310 
311@@ -413,7 +461,8 @@ seat_create(struct wl_display *display, const char *seat_name)
312 		goto error3;
313 	}
314 	seat->data_device_listener.notify = &handle_data_device_event;
315-	wl_signal_add(&seat->base.data_device->event_signal, &seat->data_device_listener);
316+	wl_signal_add(&seat->base.data_device->event_signal,
317+	              &seat->data_device_listener);
318 
319 	seat->base.keyboard = keyboard_create(NULL);
320 	if (!seat->base.keyboard) {
321@@ -421,7 +470,8 @@ seat_create(struct wl_display *display, const char *seat_name)
322 		goto error4;
323 	}
324 	seat->keyboard_focus_listener.notify = handle_keyboard_focus_event;
325-	wl_signal_add(&seat->base.keyboard->focus.event_signal, &seat->keyboard_focus_listener);
326+	wl_signal_add(&seat->base.keyboard->focus.event_signal,
327+	              &seat->keyboard_focus_listener);
328 
329 	if (!pointer_initialize(&seat->pointer)) {
330 		ERROR("Could not initialize pointer\n");
331@@ -429,8 +479,9 @@ seat_create(struct wl_display *display, const char *seat_name)
332 	}
333 	seat->base.pointer = &seat->pointer;
334 
335-	if (!initialize_libinput(seat))
336+	if (!initialize_libinput(seat)) {
337 		goto error6;
338+	}
339 
340 	return &seat->base;
341 
+4, -2
 1@@ -32,7 +32,9 @@ struct swc_seat {
 2 	struct data_device *data_device;
 3 };
 4 
 5-struct swc_seat *seat_create(struct wl_display *display, const char *name);
 6-void seat_destroy(struct swc_seat *seat);
 7+struct swc_seat *
 8+seat_create(struct wl_display *display, const char *name);
 9+void
10+seat_destroy(struct swc_seat *seat);
11 
12 #endif
+7, -4
 1@@ -28,19 +28,22 @@
 2 #include <wayland-server.h>
 3 
 4 static void
 5-get_shell_surface(struct wl_client *client, struct wl_resource *resource, uint32_t id, struct wl_resource *surface_resource)
 6+get_shell_surface(struct wl_client *client, struct wl_resource *resource,
 7+                  uint32_t id, struct wl_resource *surface_resource)
 8 {
 9 	struct surface *surface = wl_resource_get_user_data(surface_resource);
10 	struct shell_surface *shell_surface;
11 
12-	shell_surface = shell_surface_new(client, wl_resource_get_version(resource), id, surface);
13+	shell_surface = shell_surface_new(client, wl_resource_get_version(resource),
14+	                                  id, surface);
15 
16-	if (!shell_surface)
17+	if (!shell_surface) {
18 		wl_resource_post_no_memory(resource);
19+	}
20 }
21 
22 static const struct wl_shell_interface shell_implementation = {
23-	.get_shell_surface = get_shell_surface,
24+    .get_shell_surface = get_shell_surface,
25 };
26 
27 static void
+2, -1
1@@ -26,6 +26,7 @@
2 
3 struct wl_display;
4 
5-struct wl_global *shell_create(struct wl_display *display);
6+struct wl_global *
7+shell_create(struct wl_display *display);
8 
9 #endif
+63, -37
  1@@ -33,8 +33,8 @@
  2 #include "view.h"
  3 #include "window.h"
  4 
  5-#include <stdlib.h>
  6 #include <signal.h>
  7+#include <stdlib.h>
  8 
  9 struct shell_surface {
 10 	struct window window;
 11@@ -46,9 +46,11 @@ struct shell_surface {
 12 static void
 13 configure(struct window *window, uint32_t width, uint32_t height)
 14 {
 15-	struct shell_surface *shell_surface = wl_container_of(window, shell_surface, window);
 16+	struct shell_surface *shell_surface =
 17+	    wl_container_of(window, shell_surface, window);
 18 
 19-	wl_shell_surface_send_configure(shell_surface->resource, WL_SHELL_SURFACE_RESIZE_NONE, width, height);
 20+	wl_shell_surface_send_configure(
 21+	    shell_surface->resource, WL_SHELL_SURFACE_RESIZE_NONE, width, height);
 22 
 23 	/* wl_shell does not support acknowledging configures. */
 24 	window->configure.acknowledged = true;
 25@@ -57,7 +59,8 @@ configure(struct window *window, uint32_t width, uint32_t height)
 26 static void
 27 close_(struct window *window)
 28 {
 29-	struct shell_surface *shell_surface = wl_container_of(window, shell_surface, window);
 30+	struct shell_surface *shell_surface =
 31+	    wl_container_of(window, shell_surface, window);
 32 	struct wl_client *client;
 33 	pid_t pid;
 34 
 35@@ -67,8 +70,8 @@ close_(struct window *window)
 36 }
 37 
 38 static const struct window_impl window_impl = {
 39-	.configure = configure,
 40-	.close = close_,
 41+    .configure = configure,
 42+    .close = close_,
 43 };
 44 
 45 static void
 46@@ -77,13 +80,15 @@ pong(struct wl_client *client, struct wl_resource *resource, uint32_t serial)
 47 }
 48 
 49 static void
 50-move(struct wl_client *client, struct wl_resource *resource, struct wl_resource *seat_resource, uint32_t serial)
 51+move(struct wl_client *client, struct wl_resource *resource,
 52+     struct wl_resource *seat_resource, uint32_t serial)
 53 {
 54 	struct shell_surface *shell_surface = wl_resource_get_user_data(resource);
 55 	struct button *button;
 56 
 57-	if (!(button = pointer_get_button(swc.seat->pointer, serial)))
 58+	if (!(button = pointer_get_button(swc.seat->pointer, serial))) {
 59 		return;
 60+	}
 61 
 62 	window_begin_move(&shell_surface->window, button);
 63 }
 64@@ -95,8 +100,9 @@ resize(struct wl_client *client, struct wl_resource *resource,
 65 	struct shell_surface *shell_surface = wl_resource_get_user_data(resource);
 66 	struct button *button;
 67 
 68-	if (!(button = pointer_get_button(swc.seat->pointer, serial)))
 69+	if (!(button = pointer_get_button(swc.seat->pointer, serial))) {
 70 		return;
 71+	}
 72 
 73 	window_begin_resize(&shell_surface->window, edges, button);
 74 }
 75@@ -112,14 +118,16 @@ set_toplevel(struct wl_client *client, struct wl_resource *resource)
 76 
 77 static void
 78 set_transient(struct wl_client *client, struct wl_resource *resource,
 79-              struct wl_resource *parent_resource, int32_t x, int32_t y, uint32_t flags)
 80+              struct wl_resource *parent_resource, int32_t x, int32_t y,
 81+              uint32_t flags)
 82 {
 83 	struct shell_surface *shell_surface = wl_resource_get_user_data(resource);
 84 	struct surface *parent_surface = wl_resource_get_user_data(parent_resource);
 85 	struct compositor_view *parent_view = compositor_view(parent_surface->view);
 86 
 87-	if (!parent_view || !parent_view->window)
 88+	if (!parent_view || !parent_view->window) {
 89 		return;
 90+	}
 91 
 92 	window_manage(&shell_surface->window);
 93 	window_set_parent(&shell_surface->window, parent_view->window);
 94@@ -127,13 +135,16 @@ set_transient(struct wl_client *client, struct wl_resource *resource,
 95 
 96 static void
 97 set_fullscreen(struct wl_client *client, struct wl_resource *resource,
 98-               uint32_t method, uint32_t framerate, struct wl_resource *output_resource)
 99+               uint32_t method, uint32_t framerate,
100+               struct wl_resource *output_resource)
101 {
102 	struct shell_surface *shell_surface = wl_resource_get_user_data(resource);
103-	struct output *output = output_resource ? wl_resource_get_user_data(output_resource) : NULL;
104+	struct output *output =
105+	    output_resource ? wl_resource_get_user_data(output_resource) : NULL;
106 	struct screen *screen;
107 
108-	screen = output ? output->screen : wl_container_of(swc.screens.next, screen, link);
109+	screen = output ? output->screen
110+	                : wl_container_of(swc.screens.next, screen, link);
111 
112 	/* TODO: Handle fullscreen windows. */
113 
114@@ -144,22 +155,27 @@ set_fullscreen(struct wl_client *client, struct wl_resource *resource,
115 static void
116 set_popup(struct wl_client *client, struct wl_resource *resource,
117           struct wl_resource *seat_resource, uint32_t serial,
118-          struct wl_resource *parent_resource, int32_t x, int32_t y, uint32_t flags)
119+          struct wl_resource *parent_resource, int32_t x, int32_t y,
120+          uint32_t flags)
121 {
122 	struct shell_surface *shell_surface = wl_resource_get_user_data(resource);
123 	struct surface *parent_surface = wl_resource_get_user_data(parent_resource);
124 	struct compositor_view *parent_view = compositor_view(parent_surface->view);
125 
126-	if (!parent_view || !parent_view->window)
127+	if (!parent_view || !parent_view->window) {
128 		return;
129+	}
130 
131 	window_unmanage(&shell_surface->window);
132 	window_set_parent(&shell_surface->window, parent_view->window);
133-	view_move(&shell_surface->window.view->base, parent_view->base.geometry.x + x, parent_view->base.geometry.y + y);
134+	view_move(&shell_surface->window.view->base,
135+	          parent_view->base.geometry.x + x,
136+	          parent_view->base.geometry.y + y);
137 }
138 
139 static void
140-set_maximized(struct wl_client *client, struct wl_resource *resource, struct wl_resource *output_resource)
141+set_maximized(struct wl_client *client, struct wl_resource *resource,
142+              struct wl_resource *output_resource)
143 {
144 	struct shell_surface *shell_surface = wl_resource_get_user_data(resource);
145 
146@@ -170,36 +186,39 @@ set_maximized(struct wl_client *client, struct wl_resource *resource, struct wl_
147 }
148 
149 static void
150-set_title(struct wl_client *client, struct wl_resource *resource, const char *title)
151+set_title(struct wl_client *client, struct wl_resource *resource,
152+          const char *title)
153 {
154 	struct shell_surface *shell_surface = wl_resource_get_user_data(resource);
155 	window_set_title(&shell_surface->window, title, -1);
156 }
157 
158 static void
159-set_class(struct wl_client *client, struct wl_resource *resource, const char *class)
160+set_class(struct wl_client *client, struct wl_resource *resource,
161+          const char *class)
162 {
163 	struct shell_surface *shell_surface = wl_resource_get_user_data(resource);
164 	window_set_app_id(&shell_surface->window, class);
165 }
166 
167 static const struct wl_shell_surface_interface shell_surface_implementation = {
168-	.pong = pong,
169-	.move = move,
170-	.resize = resize,
171-	.set_toplevel = set_toplevel,
172-	.set_transient = set_transient,
173-	.set_fullscreen = set_fullscreen,
174-	.set_popup = set_popup,
175-	.set_maximized = set_maximized,
176-	.set_title = set_title,
177-	.set_class = set_class,
178+    .pong = pong,
179+    .move = move,
180+    .resize = resize,
181+    .set_toplevel = set_toplevel,
182+    .set_transient = set_transient,
183+    .set_fullscreen = set_fullscreen,
184+    .set_popup = set_popup,
185+    .set_maximized = set_maximized,
186+    .set_title = set_title,
187+    .set_class = set_class,
188 };
189 
190 static void
191 handle_surface_destroy(struct wl_listener *listener, void *data)
192 {
193-	struct shell_surface *shell_surface = wl_container_of(listener, shell_surface, surface_destroy_listener);
194+	struct shell_surface *shell_surface =
195+	    wl_container_of(listener, shell_surface, surface_destroy_listener);
196 	wl_resource_destroy(shell_surface->resource);
197 }
198 
199@@ -213,24 +232,31 @@ destroy_shell_surface(struct wl_resource *resource)
200 }
201 
202 struct shell_surface *
203-shell_surface_new(struct wl_client *client, uint32_t version, uint32_t id, struct surface *surface)
204+shell_surface_new(struct wl_client *client, uint32_t version, uint32_t id,
205+                  struct surface *surface)
206 {
207 	struct shell_surface *shell_surface;
208 
209 	shell_surface = malloc(sizeof(*shell_surface));
210 
211-	if (!shell_surface)
212+	if (!shell_surface) {
213 		goto error0;
214+	}
215 
216-	shell_surface->resource = wl_resource_create(client, &wl_shell_surface_interface, version, id);
217+	shell_surface->resource =
218+	    wl_resource_create(client, &wl_shell_surface_interface, version, id);
219 
220-	if (!shell_surface->resource)
221+	if (!shell_surface->resource) {
222 		goto error1;
223+	}
224 
225-	wl_resource_set_implementation(shell_surface->resource, &shell_surface_implementation, shell_surface, &destroy_shell_surface);
226+	wl_resource_set_implementation(shell_surface->resource,
227+	                               &shell_surface_implementation, shell_surface,
228+	                               &destroy_shell_surface);
229 	window_initialize(&shell_surface->window, &window_impl, surface);
230 	shell_surface->surface_destroy_listener.notify = &handle_surface_destroy;
231-	wl_resource_add_destroy_listener(surface->resource, &shell_surface->surface_destroy_listener);
232+	wl_resource_add_destroy_listener(surface->resource,
233+	                                 &shell_surface->surface_destroy_listener);
234 
235 	return shell_surface;
236 
+3, -1
 1@@ -29,6 +29,8 @@
 2 struct surface;
 3 struct wl_client;
 4 
 5-struct shell_surface *shell_surface_new(struct wl_client *client, uint32_t version, uint32_t id, struct surface *surface);
 6+struct shell_surface *
 7+shell_surface_new(struct wl_client *client, uint32_t version, uint32_t id,
 8+                  struct surface *surface);
 9 
10 #endif
+51, -29
  1@@ -65,8 +65,9 @@ swc_mremap(struct pool *pool, void *oldp, size_t oldsize, size_t newsize)
  2 	void *newp;
  3 
  4 	newp = mmap(NULL, newsize, PROT_READ, MAP_SHARED, pool->fd, 0);
  5-	if (newp == MAP_FAILED)
  6+	if (newp == MAP_FAILED) {
  7 		return MAP_FAILED;
  8+	}
  9 
 10 	(void)munmap(oldp, oldsize);
 11 	return newp;
 12@@ -76,8 +77,9 @@ swc_mremap(struct pool *pool, void *oldp, size_t oldsize, size_t newsize)
 13 static void
 14 unref_pool(struct pool *pool)
 15 {
 16-	if (--pool->references > 0)
 17+	if (--pool->references > 0) {
 18 		return;
 19+	}
 20 
 21 	munmap(pool->data, pool->size);
 22 	close(pool->fd);
 23@@ -94,7 +96,8 @@ destroy_pool_resource(struct wl_resource *resource)
 24 static void
 25 handle_buffer_destroy(struct wld_destructor *destructor)
 26 {
 27-	struct pool_reference *reference = wl_container_of(destructor, reference, destructor);
 28+	struct pool_reference *reference =
 29+	    wl_container_of(destructor, reference, destructor);
 30 	unref_pool(reference->pool);
 31 }
 32 
 33@@ -113,7 +116,8 @@ format_shm_to_wld(uint32_t format)
 34 
 35 static void
 36 create_buffer(struct wl_client *client, struct wl_resource *resource,
 37-              uint32_t id, int32_t offset, int32_t width, int32_t height, int32_t stride, uint32_t format)
 38+              uint32_t id, int32_t offset, int32_t width, int32_t height,
 39+              int32_t stride, uint32_t format)
 40 {
 41 	struct pool *pool = wl_resource_get_user_data(resource);
 42 	struct pool_reference *reference;
 43@@ -122,23 +126,30 @@ create_buffer(struct wl_client *client, struct wl_resource *resource,
 44 	union wld_object object;
 45 
 46 	if (offset > pool->size || offset < 0) {
 47-		wl_resource_post_error(resource, WL_SHM_ERROR_INVALID_STRIDE, "offset is too big or negative");
 48+		wl_resource_post_error(resource, WL_SHM_ERROR_INVALID_STRIDE,
 49+		                       "offset is too big or negative");
 50 		return;
 51 	}
 52 
 53 	object.ptr = (void *)((uintptr_t)pool->data + offset);
 54-	buffer = wld_import_buffer(pool->shm->context, WLD_OBJECT_DATA, object, width, height, format_shm_to_wld(format), stride);
 55+	buffer =
 56+	    wld_import_buffer(pool->shm->context, WLD_OBJECT_DATA, object, width,
 57+	                      height, format_shm_to_wld(format), stride);
 58 
 59-	if (!buffer)
 60+	if (!buffer) {
 61 		goto error0;
 62+	}
 63 
 64-	buffer_resource = wayland_buffer_create_resource(client, wl_resource_get_version(resource), id, buffer);
 65+	buffer_resource = wayland_buffer_create_resource(
 66+	    client, wl_resource_get_version(resource), id, buffer);
 67 
 68-	if (!buffer_resource)
 69+	if (!buffer_resource) {
 70 		goto error1;
 71+	}
 72 
 73-	if (!(reference = malloc(sizeof(*reference))))
 74+	if (!(reference = malloc(sizeof(*reference)))) {
 75 		goto error2;
 76+	}
 77 
 78 	reference->pool = pool;
 79 	reference->destructor.destroy = &handle_buffer_destroy;
 80@@ -163,17 +174,20 @@ resize(struct wl_client *client, struct wl_resource *resource, int32_t size)
 81 	struct stat st;
 82 
 83 	if (fstat(pool->fd, &st) != 0) {
 84-		wl_resource_post_error(resource, WL_SHM_ERROR_INVALID_FD, "fstat failed: %s", strerror(errno));
 85+		wl_resource_post_error(resource, WL_SHM_ERROR_INVALID_FD,
 86+		                       "fstat failed: %s", strerror(errno));
 87 		return;
 88 	}
 89 	if (st.st_size < size) {
 90 		if (ftruncate(pool->fd, size) != 0) {
 91 			int saved = errno;
 92 			/* some clients seal memfd  if size is already fine, allo */
 93-			if ((saved == EPERM || saved == EACCES) && fstat(pool->fd, &st) == 0 && st.st_size >= size) {
 94+			if ((saved == EPERM || saved == EACCES) &&
 95+			    fstat(pool->fd, &st) == 0 && st.st_size >= size) {
 96 				goto remap;
 97 			}
 98-			wl_resource_post_error(resource, WL_SHM_ERROR_INVALID_FD, "ftruncate failed: %s", strerror(saved));
 99+			wl_resource_post_error(resource, WL_SHM_ERROR_INVALID_FD,
100+			                       "ftruncate failed: %s", strerror(saved));
101 			return;
102 		}
103 	}
104@@ -181,7 +195,8 @@ resize(struct wl_client *client, struct wl_resource *resource, int32_t size)
105 remap:
106 	data = swc_mremap(pool, pool->data, pool->size, size);
107 	if (data == MAP_FAILED) {
108-		wl_resource_post_error(resource, WL_SHM_ERROR_INVALID_FD, "mremap failed: %s", strerror(errno));
109+		wl_resource_post_error(resource, WL_SHM_ERROR_INVALID_FD,
110+		                       "mremap failed: %s", strerror(errno));
111 		return;
112 	}
113 	pool->data = data;
114@@ -189,13 +204,14 @@ remap:
115 }
116 
117 static const struct wl_shm_pool_interface shm_pool_impl = {
118-	.create_buffer = create_buffer,
119-	.destroy = destroy_resource,
120-	.resize = resize,
121+    .create_buffer = create_buffer,
122+    .destroy = destroy_resource,
123+    .resize = resize,
124 };
125 
126 static void
127-create_pool(struct wl_client *client, struct wl_resource *resource, uint32_t id, int32_t fd, int32_t size)
128+create_pool(struct wl_client *client, struct wl_resource *resource, uint32_t id,
129+            int32_t fd, int32_t size)
130 {
131 	struct swc_shm *shm = wl_resource_get_user_data(resource);
132 	struct pool *pool;
133@@ -206,15 +222,18 @@ create_pool(struct wl_client *client, struct wl_resource *resource, uint32_t id,
134 		goto error0;
135 	}
136 	pool->shm = shm;
137-	pool->resource = wl_resource_create(client, &wl_shm_pool_interface, wl_resource_get_version(resource), id);
138+	pool->resource = wl_resource_create(client, &wl_shm_pool_interface,
139+	                                    wl_resource_get_version(resource), id);
140 	if (!pool->resource) {
141 		wl_resource_post_no_memory(resource);
142 		goto error1;
143 	}
144-	wl_resource_set_implementation(pool->resource, &shm_pool_impl, pool, &destroy_pool_resource);
145+	wl_resource_set_implementation(pool->resource, &shm_pool_impl, pool,
146+	                               &destroy_pool_resource);
147 	pool->data = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);
148 	if (pool->data == MAP_FAILED) {
149-		wl_resource_post_error(resource, WL_SHM_ERROR_INVALID_FD, "mmap failed: %s", strerror(errno));
150+		wl_resource_post_error(resource, WL_SHM_ERROR_INVALID_FD,
151+		                       "mmap failed: %s", strerror(errno));
152 		goto error2;
153 	}
154 	/* close(fd); */
155@@ -231,9 +250,7 @@ error0:
156 	close(fd);
157 }
158 
159-static const struct wl_shm_interface shm_impl = {
160-	.create_pool = &create_pool
161-};
162+static const struct wl_shm_interface shm_impl = {.create_pool = &create_pool};
163 
164 static void
165 bind_shm(struct wl_client *client, void *data, uint32_t version, uint32_t id)
166@@ -258,17 +275,22 @@ shm_create(struct wl_display *display)
167 	struct swc_shm *shm;
168 
169 	shm = malloc(sizeof(*shm));
170-	if (!shm)
171+	if (!shm) {
172 		goto error0;
173+	}
174 	shm->context = wld_pixman_create_context();
175-	if (!shm->context)
176+	if (!shm->context) {
177 		goto error1;
178+	}
179 	shm->renderer = wld_create_renderer(shm->context);
180-	if (!shm->renderer)
181+	if (!shm->renderer) {
182 		goto error2;
183-	shm->global = wl_global_create(display, &wl_shm_interface, 1, shm, &bind_shm);
184-	if (!shm->global)
185+	}
186+	shm->global =
187+	    wl_global_create(display, &wl_shm_interface, 1, shm, &bind_shm);
188+	if (!shm->global) {
189 		goto error3;
190+	}
191 
192 	return shm;
193 
+4, -2
 1@@ -32,7 +32,9 @@ struct swc_shm {
 2 	struct wld_renderer *renderer;
 3 };
 4 
 5-struct swc_shm *shm_create(struct wl_display *display);
 6-void shm_destroy(struct swc_shm *shm);
 7+struct swc_shm *
 8+shm_create(struct wl_display *display);
 9+void
10+shm_destroy(struct swc_shm *shm);
11 
12 #endif
+34, -24
  1@@ -1,19 +1,19 @@
  2 #include "snap.h"
  3+#include "compositor.h"
  4 #include "internal.h"
  5+#include "pointer.h"
  6 #include "screen.h"
  7-#include "compositor.h"
  8-#include "shm.h"
  9 #include "seat.h"
 10-#include "pointer.h"
 11+#include "shm.h"
 12 
 13+#include "swc_snap-server-protocol.h"
 14+#include <stdint.h>
 15 #include <stdio.h>
 16 #include <stdlib.h>
 17-#include <stdint.h>
 18 #include <string.h>
 19 #include <unistd.h>
 20 #include <wayland-server.h>
 21 #include <wld/wld.h>
 22-#include "swc_snap-server-protocol.h"
 23 
 24 static void
 25 ppm(int fd, const uint8_t *pixels, uint32_t width, uint32_t height,
 26@@ -34,19 +34,15 @@ ppm(int fd, const uint8_t *pixels, uint32_t width, uint32_t height,
 27 
 28 		for (uint32_t x = 0; x < width; x++) {
 29 			uint32_t pixel = row[x];
 30-			unsigned char rgb[3] = {
 31-				(pixel >> 16) & 0xFF,  
 32-				(pixel >> 8) & 0xFF,   
 33-				pixel & 0xFF           
 34-			};
 35+			unsigned char rgb[3] = {(pixel >> 16) & 0xFF, (pixel >> 8) & 0xFF,
 36+			                        pixel & 0xFF};
 37 			fwrite(rgb, 1, 3, f);
 38 		}
 39 	}
 40 
 41-	fclose(f); 
 42+	fclose(f);
 43 }
 44 
 45-
 46 /* get cursor */
 47 static void
 48 cursor(uint8_t *dst, uint32_t dst_width, uint32_t dst_height,
 49@@ -59,15 +55,18 @@ cursor(uint8_t *dst, uint32_t dst_width, uint32_t dst_height,
 50 	int32_t src_x = 0, src_y = 0;
 51 	uint32_t copy_w, copy_h;
 52 
 53-	if (!pointer || !pointer->cursor.buffer || !pointer->cursor.view.buffer)
 54+	if (!pointer || !pointer->cursor.buffer || !pointer->cursor.view.buffer) {
 55 		return;
 56+	}
 57 
 58-	if (!(pointer->cursor.view.screens & screen_mask(screen)))
 59+	if (!(pointer->cursor.view.screens & screen_mask(screen))) {
 60 		return;
 61+	}
 62 
 63 	cursor_buf = pointer->cursor.buffer;
 64-	if (!wld_map(cursor_buf) || !cursor_buf->map)
 65+	if (!wld_map(cursor_buf) || !cursor_buf->map) {
 66 		return;
 67+	}
 68 
 69 	dst_x = pointer->cursor.view.geometry.x - screen->base.geometry.x;
 70 	dst_y = pointer->cursor.view.geometry.y - screen->base.geometry.y;
 71@@ -89,24 +88,32 @@ cursor(uint8_t *dst, uint32_t dst_width, uint32_t dst_height,
 72 	}
 73 
 74 	copy_w = cursor_buf->width - (uint32_t)src_x;
 75-	if (copy_w > dst_width - (uint32_t)dst_x)
 76+	if (copy_w > dst_width - (uint32_t)dst_x) {
 77 		copy_w = dst_width - (uint32_t)dst_x;
 78+	}
 79 	copy_h = cursor_buf->height - (uint32_t)src_y;
 80-	if (copy_h > dst_height - (uint32_t)dst_y)
 81+	if (copy_h > dst_height - (uint32_t)dst_y) {
 82 		copy_h = dst_height - (uint32_t)dst_y;
 83+	}
 84 
 85 	src = cursor_buf->map;
 86 
 87 	for (uint32_t y = 0; y < copy_h; y++) {
 88-		const uint32_t *src_row = (const uint32_t *)(src + ((size_t)(src_y + (int32_t)y) * cursor_buf->pitch)) + src_x;
 89-		uint32_t *dst_row = (uint32_t *)(dst + ((size_t)(dst_y + (int32_t)y) * dst_pitch)) + dst_x;
 90+		const uint32_t *src_row =
 91+		    (const uint32_t *)(src + ((size_t)(src_y + (int32_t)y) *
 92+		                              cursor_buf->pitch)) +
 93+		    src_x;
 94+		uint32_t *dst_row =
 95+		    (uint32_t *)(dst + ((size_t)(dst_y + (int32_t)y) * dst_pitch)) +
 96+		    dst_x;
 97 
 98 		for (uint32_t x = 0; x < copy_w; x++) {
 99 			uint32_t src_px = src_row[x];
100 			uint32_t a = src_px >> 24;
101 
102-			if (a == 0)
103+			if (a == 0) {
104 				continue;
105+			}
106 			if (a == 255) {
107 				dst_row[x] = 0xFF000000 | (src_px & 0x00FFFFFF);
108 				continue;
109@@ -114,9 +121,12 @@ cursor(uint8_t *dst, uint32_t dst_width, uint32_t dst_height,
110 
111 			uint32_t dst_px = dst_row[x];
112 			uint32_t inv = 255 - a;
113-			uint32_t r = ((src_px >> 16) & 0xFF) + ((((dst_px >> 16) & 0xFF) * inv + 127) / 255);
114-			uint32_t g = ((src_px >> 8) & 0xFF) + ((((dst_px >> 8) & 0xFF) * inv + 127) / 255);
115-			uint32_t b = (src_px & 0xFF) + (((dst_px & 0xFF) * inv + 127) / 255);
116+			uint32_t r = ((src_px >> 16) & 0xFF) +
117+			             ((((dst_px >> 16) & 0xFF) * inv + 127) / 255);
118+			uint32_t g = ((src_px >> 8) & 0xFF) +
119+			             ((((dst_px >> 8) & 0xFF) * inv + 127) / 255);
120+			uint32_t b =
121+			    (src_px & 0xFF) + (((dst_px & 0xFF) * inv + 127) / 255);
122 
123 			dst_row[x] = 0xFF000000 | (r << 16) | (g << 8) | b;
124 		}
125@@ -170,7 +180,7 @@ capture(struct wl_client *client, struct wl_resource *resource, int32_t fd)
126 }
127 
128 static const struct swc_snap_interface snap_impl = {
129-	.capture = capture,
130+    .capture = capture,
131 };
132 
133 static void
+2, -1
1@@ -4,6 +4,7 @@
2 struct wl_display;
3 struct wl_global;
4 
5-struct wl_global *snap_manager_create(struct wl_display *display);
6+struct wl_global *
7+snap_manager_create(struct wl_display *display);
8 
9 #endif
+22, -13
 1@@ -21,11 +21,11 @@
 2  * SOFTWARE.
 3  */
 4 
 5-#include "swc.h"
 6-#include "internal.h"
 7 #include "subcompositor.h"
 8+#include "internal.h"
 9 #include "subsurface.h"
10 #include "surface.h"
11+#include "swc.h"
12 #include "util.h"
13 
14 static bool
15@@ -33,8 +33,9 @@ is_descendant_of(struct surface *ancestor, struct surface *surface)
16 {
17 	while (surface && surface->subsurface) {
18 		surface = surface->subsurface->parent;
19-		if (surface == ancestor)
20+		if (surface == ancestor) {
21 			return true;
22+		}
23 	}
24 
25 	return false;
26@@ -42,28 +43,33 @@ is_descendant_of(struct surface *ancestor, struct surface *surface)
27 
28 static void
29 get_subsurface(struct wl_client *client, struct wl_resource *resource,
30-               uint32_t id, struct wl_resource *surface_resource, struct wl_resource *parent_resource)
31+               uint32_t id, struct wl_resource *surface_resource,
32+               struct wl_resource *parent_resource)
33 {
34 	struct subsurface *subsurface;
35 	struct surface *surface = wl_resource_get_user_data(surface_resource);
36 	struct surface *parent = wl_resource_get_user_data(parent_resource);
37 
38 	if (!surface || !parent) {
39-		wl_resource_post_error(resource, WL_SUBCOMPOSITOR_ERROR_BAD_SURFACE, "invalid surface");
40+		wl_resource_post_error(resource, WL_SUBCOMPOSITOR_ERROR_BAD_SURFACE,
41+		                       "invalid surface");
42 		return;
43 	}
44 
45 	if (surface == parent || is_descendant_of(surface, parent)) {
46-		wl_resource_post_error(resource, WL_SUBCOMPOSITOR_ERROR_BAD_PARENT, "invalid parent surface");
47+		wl_resource_post_error(resource, WL_SUBCOMPOSITOR_ERROR_BAD_PARENT,
48+		                       "invalid parent surface");
49 		return;
50 	}
51 
52 	if (surface->subsurface) {
53-		wl_resource_post_error(resource, WL_SUBCOMPOSITOR_ERROR_BAD_SURFACE, "surface already has a subsurface role");
54+		wl_resource_post_error(resource, WL_SUBCOMPOSITOR_ERROR_BAD_SURFACE,
55+		                       "surface already has a subsurface role");
56 		return;
57 	}
58 
59-	subsurface = subsurface_new(client, wl_resource_get_version(resource), id, surface, parent);
60+	subsurface = subsurface_new(client, wl_resource_get_version(resource), id,
61+	                            surface, parent);
62 
63 	if (!subsurface) {
64 		wl_resource_post_no_memory(resource);
65@@ -73,16 +79,18 @@ get_subsurface(struct wl_client *client, struct wl_resource *resource,
66 }
67 
68 static const struct wl_subcompositor_interface subcompositor_impl = {
69-	.destroy = destroy_resource,
70-	.get_subsurface = get_subsurface,
71+    .destroy = destroy_resource,
72+    .get_subsurface = get_subsurface,
73 };
74 
75 static void
76-bind_subcompositor(struct wl_client *client, void *data, uint32_t version, uint32_t id)
77+bind_subcompositor(struct wl_client *client, void *data, uint32_t version,
78+                   uint32_t id)
79 {
80 	struct wl_resource *resource;
81 
82-	resource = wl_resource_create(client, &wl_subcompositor_interface, version, id);
83+	resource =
84+	    wl_resource_create(client, &wl_subcompositor_interface, version, id);
85 	if (!resource) {
86 		wl_client_post_no_memory(client);
87 		return;
88@@ -93,5 +101,6 @@ bind_subcompositor(struct wl_client *client, void *data, uint32_t version, uint3
89 struct wl_global *
90 subcompositor_create(struct wl_display *display)
91 {
92-	return wl_global_create(display, &wl_subcompositor_interface, 1, NULL, &bind_subcompositor);
93+	return wl_global_create(display, &wl_subcompositor_interface, 1, NULL,
94+	                        &bind_subcompositor);
95 }
+2, -1
1@@ -26,6 +26,7 @@
2 
3 struct wl_display;
4 
5-struct wl_global *subcompositor_create(struct wl_display *display);
6+struct wl_global *
7+subcompositor_create(struct wl_display *display);
8 
9 #endif
+147, -81
  1@@ -34,10 +34,12 @@ bool
  2 subsurface_is_synchronized(const struct subsurface *subsurface)
  3 {
  4 	while (subsurface) {
  5-		if (subsurface->sync)
  6+		if (subsurface->sync) {
  7 			return true;
  8-		if (!subsurface->parent)
  9+		}
 10+		if (!subsurface->parent) {
 11 			return false;
 12+		}
 13 		subsurface = subsurface->parent->subsurface;
 14 	}
 15 
 16@@ -50,17 +52,21 @@ subsurface_update_position(struct subsurface *subsurface)
 17 	struct compositor_view *parent_view;
 18 	struct compositor_view *view;
 19 
 20-	if (!subsurface->surface || !subsurface->parent)
 21+	if (!subsurface->surface || !subsurface->parent) {
 22 		return;
 23+	}
 24 
 25 	view = compositor_view(subsurface->surface->view);
 26 	parent_view = compositor_view(subsurface->parent->view);
 27-	if (!view || !parent_view)
 28+	if (!view || !parent_view) {
 29 		return;
 30+	}
 31 
 32 	view_move(&view->base,
 33-	          parent_view->base.geometry.x + subsurface->x - parent_view->buffer_offset_x,
 34-	          parent_view->base.geometry.y + subsurface->y - parent_view->buffer_offset_y);
 35+	          parent_view->base.geometry.x + subsurface->x -
 36+	              parent_view->buffer_offset_x,
 37+	          parent_view->base.geometry.y + subsurface->y -
 38+	              parent_view->buffer_offset_y);
 39 }
 40 
 41 static void
 42@@ -78,29 +84,35 @@ subsurface_update_visibility(struct subsurface *subsurface)
 43 	struct compositor_view *view;
 44 	struct compositor_view *parent_view;
 45 
 46-	if (!subsurface || !subsurface->surface || !subsurface->parent)
 47+	if (!subsurface || !subsurface->surface || !subsurface->parent) {
 48 		return;
 49+	}
 50 
 51 	view = compositor_view(subsurface->surface->view);
 52 	parent_view = compositor_view(subsurface->parent->view);
 53-	if (!view || !parent_view)
 54+	if (!view || !parent_view) {
 55 		return;
 56+	}
 57 
 58-	if (subsurface->added && parent_view->visible && subsurface->surface->state.buffer)
 59+	if (subsurface->added && parent_view->visible &&
 60+	    subsurface->surface->state.buffer) {
 61 		compositor_view_show(view);
 62-	else
 63+	} else {
 64 		compositor_view_hide(view);
 65+	}
 66 }
 67 
 68 static void
 69 handle_parent_view_change(struct view_handler *handler)
 70 {
 71-	struct subsurface *subsurface = wl_container_of(handler, subsurface, parent_view_handler);
 72+	struct subsurface *subsurface =
 73+	    wl_container_of(handler, subsurface, parent_view_handler);
 74 	subsurface_update_position(subsurface);
 75 }
 76 
 77 static void
 78-handle_parent_view_resize(struct view_handler *handler, uint32_t old_width, uint32_t old_height)
 79+handle_parent_view_resize(struct view_handler *handler, uint32_t old_width,
 80+                          uint32_t old_height)
 81 {
 82 	(void)old_width;
 83 	(void)old_height;
 84@@ -108,9 +120,9 @@ handle_parent_view_resize(struct view_handler *handler, uint32_t old_width, uint
 85 }
 86 
 87 static const struct view_handler_impl parent_view_handler_impl = {
 88-	.attach = handle_parent_view_change,
 89-	.move = handle_parent_view_change,
 90-	.resize = handle_parent_view_resize,
 91+    .attach = handle_parent_view_change,
 92+    .move = handle_parent_view_change,
 93+    .resize = handle_parent_view_resize,
 94 };
 95 
 96 static struct subsurface *
 97@@ -119,17 +131,24 @@ subsurface_find_sibling(struct subsurface *subsurface, struct surface *surface)
 98 	struct surface *parent = subsurface->parent;
 99 	struct subsurface *sibling;
100 
101-	if (!parent)
102+	if (!parent) {
103 		return NULL;
104+	}
105 
106-	wl_list_for_each (sibling, &parent->pending.state.subsurfaces_below, pending_link) {
107-		if (sibling->surface == surface && sibling != subsurface)
108+	wl_list_for_each(sibling, &parent->pending.state.subsurfaces_below,
109+	                 pending_link)
110+	{
111+		if (sibling->surface == surface && sibling != subsurface) {
112 			return sibling;
113+		}
114 	}
115 
116-	wl_list_for_each (sibling, &parent->pending.state.subsurfaces_above, pending_link) {
117-		if (sibling->surface == surface && sibling != subsurface)
118+	wl_list_for_each(sibling, &parent->pending.state.subsurfaces_above,
119+	                 pending_link)
120+	{
121+		if (sibling->surface == surface && sibling != subsurface) {
122 			return sibling;
123+		}
124 	}
125 
126 	return NULL;
127@@ -141,8 +160,10 @@ is_valid_sibling(struct subsurface *subsurface, struct surface *sibling_surface,
128 {
129 	struct subsurface *sibling;
130 
131-	if (!subsurface->parent || !sibling_surface || sibling_surface == subsurface->surface)
132+	if (!subsurface->parent || !sibling_surface ||
133+	    sibling_surface == subsurface->surface) {
134 		return false;
135+	}
136 
137 	if (sibling_surface == subsurface->parent) {
138 		*sibling_subsurface = NULL;
139@@ -150,8 +171,9 @@ is_valid_sibling(struct subsurface *subsurface, struct surface *sibling_surface,
140 	}
141 
142 	sibling = subsurface_find_sibling(subsurface, sibling_surface);
143-	if (!sibling)
144+	if (!sibling) {
145 		return false;
146+	}
147 
148 	*sibling_subsurface = sibling;
149 	return true;
150@@ -161,20 +183,24 @@ static void
151 handle_surface_destroy(struct wl_listener *listener, void *data)
152 {
153 	(void)data;
154-	struct subsurface *subsurface = wl_container_of(listener, subsurface, surface_destroy_listener);
155-	if (subsurface->resource)
156+	struct subsurface *subsurface =
157+	    wl_container_of(listener, subsurface, surface_destroy_listener);
158+	if (subsurface->resource) {
159 		wl_resource_destroy(subsurface->resource);
160+	}
161 }
162 
163 static void
164 handle_parent_destroy(struct wl_listener *listener, void *data)
165 {
166 	(void)data;
167-	struct subsurface *subsurface = wl_container_of(listener, subsurface, parent_destroy_listener);
168+	struct subsurface *subsurface =
169+	    wl_container_of(listener, subsurface, parent_destroy_listener);
170 	struct compositor_view *view = NULL;
171 
172-	if (subsurface->surface && subsurface->surface->view)
173+	if (subsurface->surface && subsurface->surface->view) {
174 		view = compositor_view(subsurface->surface->view);
175+	}
176 
177 	if (view) {
178 		view->parent = NULL;
179@@ -198,7 +224,8 @@ handle_parent_destroy(struct wl_listener *listener, void *data)
180 }
181 
182 static void
183-set_position(struct wl_client *client, struct wl_resource *resource, int32_t x, int32_t y)
184+set_position(struct wl_client *client, struct wl_resource *resource, int32_t x,
185+             int32_t y)
186 {
187 	(void)client;
188 	struct subsurface *subsurface = wl_resource_get_user_data(resource);
189@@ -209,46 +236,56 @@ set_position(struct wl_client *client, struct wl_resource *resource, int32_t x,
190 }
191 
192 static void
193-place_above(struct wl_client *client, struct wl_resource *resource, struct wl_resource *sibling_resource)
194+place_above(struct wl_client *client, struct wl_resource *resource,
195+            struct wl_resource *sibling_resource)
196 {
197 	(void)client;
198 	struct subsurface *subsurface = wl_resource_get_user_data(resource);
199-	struct surface *sibling_surface = wl_resource_get_user_data(sibling_resource);
200+	struct surface *sibling_surface =
201+	    wl_resource_get_user_data(sibling_resource);
202 	struct subsurface *sibling_subsurface;
203 
204 	if (!is_valid_sibling(subsurface, sibling_surface, &sibling_subsurface)) {
205-		wl_resource_post_error(resource, WL_SUBSURFACE_ERROR_BAD_SURFACE, "invalid sibling surface");
206+		wl_resource_post_error(resource, WL_SUBSURFACE_ERROR_BAD_SURFACE,
207+		                       "invalid sibling surface");
208 		return;
209 	}
210 
211 	if (!sibling_subsurface) {
212 		wl_list_remove(&subsurface->pending_link);
213-		wl_list_insert(&subsurface->parent->pending.state.subsurfaces_above, &subsurface->pending_link);
214+		wl_list_insert(&subsurface->parent->pending.state.subsurfaces_above,
215+		               &subsurface->pending_link);
216 	} else {
217 		wl_list_remove(&subsurface->pending_link);
218-		wl_list_insert(&sibling_subsurface->pending_link, &subsurface->pending_link);
219+		wl_list_insert(&sibling_subsurface->pending_link,
220+		               &subsurface->pending_link);
221 	}
222 }
223 
224 static void
225-place_below(struct wl_client *client, struct wl_resource *resource, struct wl_resource *sibling_resource)
226+place_below(struct wl_client *client, struct wl_resource *resource,
227+            struct wl_resource *sibling_resource)
228 {
229 	(void)client;
230 	struct subsurface *subsurface = wl_resource_get_user_data(resource);
231-	struct surface *sibling_surface = wl_resource_get_user_data(sibling_resource);
232+	struct surface *sibling_surface =
233+	    wl_resource_get_user_data(sibling_resource);
234 	struct subsurface *sibling_subsurface;
235 
236 	if (!is_valid_sibling(subsurface, sibling_surface, &sibling_subsurface)) {
237-		wl_resource_post_error(resource, WL_SUBSURFACE_ERROR_BAD_SURFACE, "invalid sibling surface");
238+		wl_resource_post_error(resource, WL_SUBSURFACE_ERROR_BAD_SURFACE,
239+		                       "invalid sibling surface");
240 		return;
241 	}
242 
243 	if (!sibling_subsurface) {
244 		wl_list_remove(&subsurface->pending_link);
245-		wl_list_insert(subsurface->parent->pending.state.subsurfaces_below.prev, &subsurface->pending_link);
246+		wl_list_insert(subsurface->parent->pending.state.subsurfaces_below.prev,
247+		               &subsurface->pending_link);
248 	} else {
249 		wl_list_remove(&subsurface->pending_link);
250-		wl_list_insert(sibling_subsurface->pending_link.prev, &subsurface->pending_link);
251+		wl_list_insert(sibling_subsurface->pending_link.prev,
252+		               &subsurface->pending_link);
253 	}
254 }
255 
256@@ -269,11 +306,8 @@ set_desync(struct wl_client *client, struct wl_resource *resource)
257 
258 	subsurface->sync = false;
259 
260-	if (synchronized
261-	 && !subsurface_is_synchronized(subsurface)
262-	 && subsurface->pending
263-	 && subsurface->surface)
264-	{
265+	if (synchronized && !subsurface_is_synchronized(subsurface) &&
266+	    subsurface->pending && subsurface->surface) {
267 		surface_commit_pending(subsurface->surface);
268 	}
269 }
270@@ -286,53 +320,67 @@ subsurface_parent_commit(struct surface *parent)
271 	struct compositor_view *reference;
272 	struct compositor_view *child_view;
273 
274-	if (!parent)
275+	if (!parent) {
276 		return;
277+	}
278 
279-	wl_list_for_each (child, &parent->subsurfaces, link)
280-		list_remove_if_linked(&child->current_link);
281+	wl_list_for_each(child, &parent->subsurfaces, link)
282+	    list_remove_if_linked(&child->current_link);
283 
284 	wl_list_init(&parent->state.subsurfaces_below);
285 	wl_list_init(&parent->state.subsurfaces_above);
286 
287-	wl_list_for_each (child, &parent->pending.state.subsurfaces_below, pending_link)
288-		wl_list_insert(parent->state.subsurfaces_below.prev, &child->current_link);
289+	wl_list_for_each(child, &parent->pending.state.subsurfaces_below,
290+	                 pending_link)
291+	    wl_list_insert(parent->state.subsurfaces_below.prev,
292+	                   &child->current_link);
293 
294-	wl_list_for_each (child, &parent->pending.state.subsurfaces_above, pending_link)
295-		wl_list_insert(parent->state.subsurfaces_above.prev, &child->current_link);
296+	wl_list_for_each(child, &parent->pending.state.subsurfaces_above,
297+	                 pending_link)
298+	    wl_list_insert(parent->state.subsurfaces_above.prev,
299+	                   &child->current_link);
300 
301 	parent_view = parent->view ? compositor_view(parent->view) : NULL;
302 	if (parent_view) {
303 		reference = parent_view;
304-		wl_list_for_each_reverse (child, &parent->state.subsurfaces_below, current_link) {
305-			if (!child->surface || !child->surface->view)
306+		wl_list_for_each_reverse(child, &parent->state.subsurfaces_below,
307+		                         current_link)
308+		{
309+			if (!child->surface || !child->surface->view) {
310 				continue;
311+			}
312 
313 			child_view = compositor_view(child->surface->view);
314-			if (!child_view)
315+			if (!child_view) {
316 				continue;
317+			}
318 
319 			compositor_view_restack(child_view, reference, false);
320 			reference = child_view;
321 		}
322 
323 		reference = parent_view;
324-		wl_list_for_each (child, &parent->state.subsurfaces_above, current_link) {
325-			if (!child->surface || !child->surface->view)
326+		wl_list_for_each(child, &parent->state.subsurfaces_above, current_link)
327+		{
328+			if (!child->surface || !child->surface->view) {
329 				continue;
330+			}
331 
332 			child_view = compositor_view(child->surface->view);
333-			if (!child_view)
334+			if (!child_view) {
335 				continue;
336+			}
337 
338 			compositor_view_restack(child_view, reference, true);
339 			reference = child_view;
340 		}
341 	}
342 
343-	wl_list_for_each (child, &parent->subsurfaces, link) {
344-		if (!child->pending_position)
345+	wl_list_for_each(child, &parent->subsurfaces, link)
346+	{
347+		if (!child->pending_position) {
348 			continue;
349+		}
350 
351 		child->x = child->pending_x;
352 		child->y = child->pending_y;
353@@ -340,25 +388,29 @@ subsurface_parent_commit(struct surface *parent)
354 		subsurface_update_position(child);
355 	}
356 
357-	wl_list_for_each (child, &parent->state.subsurfaces_below, current_link) {
358-		if (!child->added)
359+	wl_list_for_each(child, &parent->state.subsurfaces_below, current_link)
360+	{
361+		if (!child->added) {
362 			child->added = true;
363+		}
364 		subsurface_update_visibility(child);
365 	}
366-	wl_list_for_each (child, &parent->state.subsurfaces_above, current_link) {
367-		if (!child->added)
368+	wl_list_for_each(child, &parent->state.subsurfaces_above, current_link)
369+	{
370+		if (!child->added) {
371 			child->added = true;
372+		}
373 		subsurface_update_visibility(child);
374 	}
375 }
376 
377 static const struct wl_subsurface_interface subsurface_impl = {
378-	.destroy = destroy_resource,
379-	.set_position = set_position,
380-	.place_above = place_above,
381-	.place_below = place_below,
382-	.set_sync = set_sync,
383-	.set_desync = set_desync,
384+    .destroy = destroy_resource,
385+    .set_position = set_position,
386+    .place_above = place_above,
387+    .place_below = place_below,
388+    .set_sync = set_sync,
389+    .set_desync = set_desync,
390 };
391 
392 static void
393@@ -367,8 +419,9 @@ subsurface_destroy(struct wl_resource *resource)
394 	struct subsurface *subsurface = wl_resource_get_user_data(resource);
395 
396 	if (subsurface->surface) {
397-		if (subsurface->surface->subsurface == subsurface)
398+		if (subsurface->surface->subsurface == subsurface) {
399 			subsurface->surface->subsurface = NULL;
400+		}
401 	}
402 
403 	if (!wl_list_empty(&subsurface->parent_destroy_listener.link)) {
404@@ -394,9 +447,11 @@ subsurface_destroy(struct wl_resource *resource)
405 	list_remove_if_linked(&subsurface->current_link);
406 
407 	if (subsurface->surface && subsurface->surface->view) {
408-		struct compositor_view *view = compositor_view(subsurface->surface->view);
409-		if (view && !view->window)
410+		struct compositor_view *view =
411+		    compositor_view(subsurface->surface->view);
412+		if (view && !view->window) {
413 			compositor_view_destroy(view);
414+		}
415 	}
416 
417 	free(subsurface);
418@@ -410,15 +465,19 @@ subsurface_new(struct wl_client *client, uint32_t version, uint32_t id,
419 	struct compositor_view *parent_view;
420 	struct compositor_view *view;
421 
422-	if (!(subsurface = malloc(sizeof(*subsurface))))
423+	if (!(subsurface = malloc(sizeof(*subsurface)))) {
424 		goto error0;
425+	}
426 
427-	subsurface->resource = wl_resource_create(client, &wl_subsurface_interface, version, id);
428+	subsurface->resource =
429+	    wl_resource_create(client, &wl_subsurface_interface, version, id);
430 
431-	if (!subsurface->resource)
432+	if (!subsurface->resource) {
433 		goto error1;
434+	}
435 
436-	wl_resource_set_implementation(subsurface->resource, &subsurface_impl, subsurface, &subsurface_destroy);
437+	wl_resource_set_implementation(subsurface->resource, &subsurface_impl,
438+	                               subsurface, &subsurface_destroy);
439 
440 	subsurface->surface = surface;
441 	subsurface->parent = parent;
442@@ -439,30 +498,37 @@ subsurface_new(struct wl_client *client, uint32_t version, uint32_t id,
443 	wl_list_init(&subsurface->pending_link);
444 	wl_list_init(&subsurface->current_link);
445 
446-	if (!surface->view)
447+	if (!surface->view) {
448 		compositor_create_view(surface);
449-	if (!parent->view)
450+	}
451+	if (!parent->view) {
452 		compositor_create_view(parent);
453+	}
454 
455 	parent_view = compositor_view(parent->view);
456 	view = compositor_view(surface->view);
457-	if (!parent_view || !view)
458+	if (!parent_view || !view) {
459 		goto error2;
460+	}
461 
462 	compositor_view_set_parent(view, parent_view);
463 	wl_list_remove(&view->link);
464 	wl_list_insert(parent_view->link.prev, &view->link);
465 
466-	wl_list_insert(&parent_view->base.handlers, &subsurface->parent_view_handler.link);
467+	wl_list_insert(&parent_view->base.handlers,
468+	               &subsurface->parent_view_handler.link);
469 	subsurface_update_position(subsurface);
470 	wl_list_insert(&parent->subsurfaces, &subsurface->link);
471-	wl_list_insert(parent->pending.state.subsurfaces_above.prev, &subsurface->pending_link);
472+	wl_list_insert(parent->pending.state.subsurfaces_above.prev,
473+	               &subsurface->pending_link);
474 	subsurface_update_visibility(subsurface);
475 
476 	subsurface->surface_destroy_listener.notify = handle_surface_destroy;
477-	wl_resource_add_destroy_listener(surface->resource, &subsurface->surface_destroy_listener);
478+	wl_resource_add_destroy_listener(surface->resource,
479+	                                 &subsurface->surface_destroy_listener);
480 	subsurface->parent_destroy_listener.notify = handle_parent_destroy;
481-	wl_resource_add_destroy_listener(parent->resource, &subsurface->parent_destroy_listener);
482+	wl_resource_add_destroy_listener(parent->resource,
483+	                                 &subsurface->parent_destroy_listener);
484 
485 	return subsurface;
486 
+9, -5
 1@@ -51,11 +51,15 @@ struct subsurface {
 2 	bool added;
 3 };
 4 
 5-bool subsurface_is_synchronized(const struct subsurface *subsurface);
 6-void subsurface_update_visibility(struct subsurface *subsurface);
 7-void subsurface_parent_commit(struct surface *parent);
 8+bool
 9+subsurface_is_synchronized(const struct subsurface *subsurface);
10+void
11+subsurface_update_visibility(struct subsurface *subsurface);
12+void
13+subsurface_parent_commit(struct surface *parent);
14 
15-struct subsurface *subsurface_new(struct wl_client *client, uint32_t version, uint32_t id,
16-                                  struct surface *surface, struct surface *parent);
17+struct subsurface *
18+subsurface_new(struct wl_client *client, uint32_t version, uint32_t id,
19+               struct surface *surface, struct surface *parent);
20 
21 #endif
+109, -61
  1@@ -32,8 +32,8 @@
  2 #include "view.h"
  3 #include "wayland_buffer.h"
  4 
  5-#include <stdlib.h>
  6 #include <stdio.h>
  7+#include <stdlib.h>
  8 #include <wld/wld.h>
  9 
 10 /**
 11@@ -68,16 +68,17 @@ state_finalize(struct surface_state *state)
 12 {
 13 	struct wl_resource *resource, *tmp;
 14 
 15-	if (state->buffer)
 16+	if (state->buffer) {
 17 		wl_list_remove(&state->buffer_destroy_listener.link);
 18+	}
 19 
 20 	pixman_region32_fini(&state->damage);
 21 	pixman_region32_fini(&state->opaque);
 22 	pixman_region32_fini(&state->input);
 23 
 24 	/* Remove all leftover callbacks. */
 25-	wl_list_for_each_safe (resource, tmp, &state->frame_callbacks, link)
 26-		wl_resource_destroy(resource);
 27+	wl_list_for_each_safe(resource, tmp, &state->frame_callbacks, link)
 28+	    wl_resource_destroy(resource);
 29 }
 30 
 31 /**
 32@@ -89,11 +90,14 @@ state_set_buffer(struct surface_state *state, struct wl_resource *resource)
 33 {
 34 	struct wld_buffer *buffer = resource ? wayland_buffer_get(resource) : NULL;
 35 
 36-	if (state->buffer)
 37+	if (state->buffer) {
 38 		wl_list_remove(&state->buffer_destroy_listener.link);
 39+	}
 40 
 41-	if (buffer)
 42-		wl_resource_add_destroy_listener(resource, &state->buffer_destroy_listener);
 43+	if (buffer) {
 44+		wl_resource_add_destroy_listener(resource,
 45+		                                 &state->buffer_destroy_listener);
 46+	}
 47 
 48 	state->buffer = buffer;
 49 	state->buffer_resource = resource;
 50@@ -105,7 +109,8 @@ handle_frame(struct view_handler *handler, uint32_t time)
 51 	struct surface *surface = wl_container_of(handler, surface, view_handler);
 52 	struct wl_resource *resource, *tmp;
 53 
 54-	wl_list_for_each_safe (resource, tmp, &surface->state.frame_callbacks, link) {
 55+	wl_list_for_each_safe(resource, tmp, &surface->state.frame_callbacks, link)
 56+	{
 57 		wl_callback_send_done(resource, time);
 58 		wl_resource_destroy(resource);
 59 	}
 60@@ -124,26 +129,30 @@ handle_screens(struct view_handler *handler, uint32_t entered, uint32_t left)
 61 
 62 	client = wl_resource_get_client(surface->resource);
 63 
 64-	wl_list_for_each (screen, &swc.screens, link) {
 65-		if (!((entered | left) & screen_mask(screen)))
 66+	wl_list_for_each(screen, &swc.screens, link)
 67+	{
 68+		if (!((entered | left) & screen_mask(screen))) {
 69 			continue;
 70+		}
 71 
 72-		wl_list_for_each (output, &screen->outputs, link) {
 73+		wl_list_for_each(output, &screen->outputs, link)
 74+		{
 75 			resource = wl_resource_find_for_client(&output->resources, client);
 76 
 77 			if (resource) {
 78-				if (entered & screen_mask(screen))
 79+				if (entered & screen_mask(screen)) {
 80 					wl_surface_send_enter(surface->resource, resource);
 81-				else if (left & screen_mask(screen))
 82+				} else if (left & screen_mask(screen)) {
 83 					wl_surface_send_leave(surface->resource, resource);
 84+				}
 85 			}
 86 		}
 87 	}
 88 }
 89 
 90 static const struct view_handler_impl view_handler_impl = {
 91-	.frame = handle_frame,
 92-	.screens = handle_screens,
 93+    .frame = handle_frame,
 94+    .screens = handle_screens,
 95 };
 96 
 97 static void
 98@@ -160,12 +169,15 @@ attach(struct wl_client *client, struct wl_resource *resource,
 99 }
100 
101 static void
102-damage(struct wl_client *client, struct wl_resource *resource, int32_t x, int32_t y, int32_t width, int32_t height)
103+damage(struct wl_client *client, struct wl_resource *resource, int32_t x,
104+       int32_t y, int32_t width, int32_t height)
105 {
106 	struct surface *surface = wl_resource_get_user_data(resource);
107 
108 	surface->pending.commit |= SURFACE_COMMIT_DAMAGE;
109-	pixman_region32_union_rect(&surface->pending.state.damage, &surface->pending.state.damage, x, y, width, height);
110+	pixman_region32_union_rect(&surface->pending.state.damage,
111+	                           &surface->pending.state.damage, x, y, width,
112+	                           height);
113 }
114 
115 static void
116@@ -174,18 +186,22 @@ frame(struct wl_client *client, struct wl_resource *resource, uint32_t id)
117 	struct surface *surface = wl_resource_get_user_data(resource);
118 	struct wl_resource *callback_resource;
119 
120-	callback_resource = wl_resource_create(client, &wl_callback_interface, 1, id);
121+	callback_resource =
122+	    wl_resource_create(client, &wl_callback_interface, 1, id);
123 	if (!callback_resource) {
124 		wl_resource_post_no_memory(resource);
125 		return;
126 	}
127 	surface->pending.commit |= SURFACE_COMMIT_FRAME;
128-	wl_resource_set_implementation(callback_resource, NULL, NULL, &remove_resource);
129-	wl_list_insert(surface->pending.state.frame_callbacks.prev, wl_resource_get_link(callback_resource));
130+	wl_resource_set_implementation(callback_resource, NULL, NULL,
131+	                               &remove_resource);
132+	wl_list_insert(surface->pending.state.frame_callbacks.prev,
133+	               wl_resource_get_link(callback_resource));
134 }
135 
136 static void
137-set_opaque_region(struct wl_client *client, struct wl_resource *resource, struct wl_resource *region_resource)
138+set_opaque_region(struct wl_client *client, struct wl_resource *resource,
139+                  struct wl_resource *region_resource)
140 {
141 	struct surface *surface = wl_resource_get_user_data(resource);
142 
143@@ -200,7 +216,8 @@ set_opaque_region(struct wl_client *client, struct wl_resource *resource, struct
144 }
145 
146 static void
147-set_input_region(struct wl_client *client, struct wl_resource *resource, struct wl_resource *region_resource)
148+set_input_region(struct wl_client *client, struct wl_resource *resource,
149+                 struct wl_resource *region_resource)
150 {
151 	struct surface *surface = wl_resource_get_user_data(resource);
152 
153@@ -217,7 +234,9 @@ set_input_region(struct wl_client *client, struct wl_resource *resource, struct
154 static inline void
155 trim_region(pixman_region32_t *region, struct wld_buffer *buffer)
156 {
157-	pixman_region32_intersect_rect(region, region, 0, 0, buffer ? buffer->width : 0, buffer ? buffer->height : 0);
158+	pixman_region32_intersect_rect(region, region, 0, 0,
159+	                               buffer ? buffer->width : 0,
160+	                               buffer ? buffer->height : 0);
161 }
162 
163 static void
164@@ -227,31 +246,40 @@ surface_apply_pending(struct surface *surface, bool flush_children)
165 
166 	/* Attach */
167 	if (surface->pending.commit & SURFACE_COMMIT_ATTACH) {
168-		if (surface->state.buffer && surface->state.buffer != surface->pending.state.buffer)
169+		if (surface->state.buffer &&
170+		    surface->state.buffer != surface->pending.state.buffer) {
171 			wl_buffer_send_release(surface->state.buffer_resource);
172+		}
173 
174-		state_set_buffer(&surface->state, surface->pending.state.buffer_resource);
175+		state_set_buffer(&surface->state,
176+		                 surface->pending.state.buffer_resource);
177 	}
178 
179 	buffer = surface->state.buffer;
180 
181 	/* Damage */
182 	if (surface->pending.commit & SURFACE_COMMIT_DAMAGE) {
183-		pixman_region32_union(&surface->state.damage, &surface->state.damage, &surface->pending.state.damage);
184+		pixman_region32_union(&surface->state.damage, &surface->state.damage,
185+		                      &surface->pending.state.damage);
186 		pixman_region32_clear(&surface->pending.state.damage);
187 	}
188 
189 	/* Opaque */
190-	if (surface->pending.commit & SURFACE_COMMIT_OPAQUE)
191-		pixman_region32_copy(&surface->state.opaque, &surface->pending.state.opaque);
192+	if (surface->pending.commit & SURFACE_COMMIT_OPAQUE) {
193+		pixman_region32_copy(&surface->state.opaque,
194+		                     &surface->pending.state.opaque);
195+	}
196 
197 	/* Input */
198-	if (surface->pending.commit & SURFACE_COMMIT_INPUT)
199-		pixman_region32_copy(&surface->state.input, &surface->pending.state.input);
200+	if (surface->pending.commit & SURFACE_COMMIT_INPUT) {
201+		pixman_region32_copy(&surface->state.input,
202+		                     &surface->pending.state.input);
203+	}
204 
205 	/* Frame */
206 	if (surface->pending.commit & SURFACE_COMMIT_FRAME) {
207-		wl_list_insert_list(&surface->state.frame_callbacks, &surface->pending.state.frame_callbacks);
208+		wl_list_insert_list(&surface->state.frame_callbacks,
209+		                    &surface->pending.state.frame_callbacks);
210 		wl_list_init(&surface->pending.state.frame_callbacks);
211 	}
212 
213@@ -259,28 +287,34 @@ surface_apply_pending(struct surface *surface, bool flush_children)
214 	trim_region(&surface->state.opaque, buffer);
215 
216 	if (surface->view) {
217-		if (surface->pending.commit & SURFACE_COMMIT_ATTACH)
218+		if (surface->pending.commit & SURFACE_COMMIT_ATTACH) {
219 			view_attach(surface->view, buffer);
220+		}
221 		view_update(surface->view);
222 	}
223 
224 	surface->pending.commit = 0;
225 
226-	if (surface->subsurface)
227+	if (surface->subsurface) {
228 		surface->subsurface->pending = false;
229+	}
230 
231-	if (surface->subsurface)
232+	if (surface->subsurface) {
233 		subsurface_update_visibility(surface->subsurface);
234+	}
235 
236 	subsurface_parent_commit(surface);
237 
238 	if (flush_children) {
239 		struct subsurface *child;
240-		wl_list_for_each (child, &surface->subsurfaces, link) {
241-			if (!child->pending || !subsurface_is_synchronized(child))
242+		wl_list_for_each(child, &surface->subsurfaces, link)
243+		{
244+			if (!child->pending || !subsurface_is_synchronized(child)) {
245 				continue;
246-			if (child->surface)
247+			}
248+			if (child->surface) {
249 				surface_apply_pending(child->surface, true);
250+			}
251 		}
252 	}
253 }
254@@ -290,7 +324,8 @@ commit(struct wl_client *client, struct wl_resource *resource)
255 {
256 	struct surface *surface = wl_resource_get_user_data(resource);
257 
258-	if (surface->subsurface && subsurface_is_synchronized(surface->subsurface)) {
259+	if (surface->subsurface &&
260+	    subsurface_is_synchronized(surface->subsurface)) {
261 		surface->subsurface->pending = true;
262 		return;
263 	}
264@@ -299,38 +334,44 @@ commit(struct wl_client *client, struct wl_resource *resource)
265 }
266 
267 static void
268-set_buffer_transform(struct wl_client *client, struct wl_resource *surface, int32_t transform)
269+set_buffer_transform(struct wl_client *client, struct wl_resource *surface,
270+                     int32_t transform)
271 {
272 	if (transform != WL_OUTPUT_TRANSFORM_NORMAL) {
273 		wl_resource_post_error(surface, WL_SURFACE_ERROR_INVALID_TRANSFORM,
274-		                       "buffer transform %" PRId32 " not supported", transform);
275+		                       "buffer transform %" PRId32 " not supported",
276+		                       transform);
277 	}
278 }
279 
280 static void
281-set_buffer_scale(struct wl_client *client, struct wl_resource *surface, int32_t scale)
282+set_buffer_scale(struct wl_client *client, struct wl_resource *surface,
283+                 int32_t scale)
284 {
285-	if (scale != 1)
286-		wl_resource_post_error(surface, WL_SURFACE_ERROR_INVALID_SCALE, "buffer scale not supported");
287+	if (scale != 1) {
288+		wl_resource_post_error(surface, WL_SURFACE_ERROR_INVALID_SCALE,
289+		                       "buffer scale not supported");
290+	}
291 }
292 
293 static void
294-damage_buffer(struct wl_client *client, struct wl_resource *surface, int32_t x, int32_t y, int32_t w, int32_t h)
295+damage_buffer(struct wl_client *client, struct wl_resource *surface, int32_t x,
296+              int32_t y, int32_t w, int32_t h)
297 {
298 	damage(client, surface, x, y, w, h);
299 }
300 
301 static const struct wl_surface_interface surface_impl = {
302-	.destroy = destroy_resource,
303-	.attach = attach,
304-	.damage = damage,
305-	.frame = frame,
306-	.set_opaque_region = set_opaque_region,
307-	.set_input_region = set_input_region,
308-	.commit = commit,
309-	.set_buffer_transform = set_buffer_transform,
310-	.set_buffer_scale = set_buffer_scale,
311-	.damage_buffer = damage_buffer,
312+    .destroy = destroy_resource,
313+    .attach = attach,
314+    .damage = damage,
315+    .frame = frame,
316+    .set_opaque_region = set_opaque_region,
317+    .set_input_region = set_input_region,
318+    .commit = commit,
319+    .set_buffer_transform = set_buffer_transform,
320+    .set_buffer_scale = set_buffer_scale,
321+    .damage_buffer = damage_buffer,
322 };
323 
324 static void
325@@ -341,8 +382,9 @@ surface_destroy(struct wl_resource *resource)
326 	state_finalize(&surface->state);
327 	state_finalize(&surface->pending.state);
328 
329-	if (surface->view)
330+	if (surface->view) {
331 		wl_list_remove(&surface->view_handler.link);
332+	}
333 
334 	free(surface);
335 }
336@@ -360,13 +402,17 @@ surface_new(struct wl_client *client, uint32_t version, uint32_t id)
337 	struct surface *surface;
338 
339 	surface = malloc(sizeof(*surface));
340-	if (!surface)
341+	if (!surface) {
342 		goto error0;
343+	}
344 
345-	surface->resource = wl_resource_create(client, &wl_surface_interface, version, id);
346-	if (!surface->resource)
347+	surface->resource =
348+	    wl_resource_create(client, &wl_surface_interface, version, id);
349+	if (!surface->resource) {
350 		goto error1;
351-	wl_resource_set_implementation(surface->resource, &surface_impl, surface, &surface_destroy);
352+	}
353+	wl_resource_set_implementation(surface->resource, &surface_impl, surface,
354+	                               &surface_destroy);
355 
356 	/* Initialize the surface. */
357 	surface->pending.commit = 0;
358@@ -395,11 +441,13 @@ error0:
359 void
360 surface_set_view(struct surface *surface, struct view *view)
361 {
362-	if (surface->view == view)
363+	if (surface->view == view) {
364 		return;
365+	}
366 
367-	if (surface->view)
368+	if (surface->view) {
369 		wl_list_remove(&surface->view_handler.link);
370+	}
371 
372 	surface->view = view;
373 
+6, -3
 1@@ -82,8 +82,11 @@ struct surface {
 2 	bool window_geometry_applied;
 3 };
 4 
 5-struct surface *surface_new(struct wl_client *client, uint32_t version, uint32_t id);
 6-void surface_set_view(struct surface *surface, struct view *view);
 7-void surface_commit_pending(struct surface *surface);
 8+struct surface *
 9+surface_new(struct wl_client *client, uint32_t version, uint32_t id);
10+void
11+surface_set_view(struct surface *surface, struct view *view);
12+void
13+surface_commit_pending(struct surface *surface);
14 
15 #endif
+36, -21
  1@@ -28,9 +28,9 @@
  2 #include "drm.h"
  3 #include "event.h"
  4 #include "internal.h"
  5-#include "launch.h"
  6 #include "kde_decoration.h"
  7 #include "keyboard.h"
  8+#include "launch.h"
  9 #include "panel_manager.h"
 10 #include "pointer.h"
 11 #include "screen.h"
 12@@ -45,7 +45,7 @@
 13 #include "xdg_decoration.h"
 14 #include "xdg_shell.h"
 15 #ifdef ENABLE_XWAYLAND
 16-# include "xserver.h"
 17+#include "xserver.h"
 18 #endif
 19 
 20 extern struct swc_launch swc_launch;
 21@@ -59,11 +59,11 @@ extern struct swc_xserver swc_xserver;
 22 extern struct pointer_handler screens_pointer_handler;
 23 
 24 struct swc swc = {
 25-	.bindings = &swc_bindings,
 26-	.compositor = &swc_compositor,
 27-	.drm = &swc_drm,
 28+    .bindings = &swc_bindings,
 29+    .compositor = &swc_compositor,
 30+    .drm = &swc_drm,
 31 #ifdef ENABLE_XWAYLAND
 32-	.xserver = &swc_xserver,
 33+    .xserver = &swc_xserver,
 34 #endif
 35 };
 36 
 37@@ -74,18 +74,24 @@ setup_compositor(void)
 38 	struct screen *screen;
 39 	struct swc_rectangle *geom;
 40 
 41-	wl_list_insert(&swc.seat->keyboard->handlers, &swc.bindings->keyboard_handler->link);
 42-	wl_list_insert(&swc.seat->pointer->handlers, &swc.bindings->pointer_handler->link);
 43-	wl_list_insert(&swc.seat->pointer->handlers, &swc.compositor->pointer_handler->link);
 44+	wl_list_insert(&swc.seat->keyboard->handlers,
 45+	               &swc.bindings->keyboard_handler->link);
 46+	wl_list_insert(&swc.seat->pointer->handlers,
 47+	               &swc.bindings->pointer_handler->link);
 48+	wl_list_insert(&swc.seat->pointer->handlers,
 49+	               &swc.compositor->pointer_handler->link);
 50 	wl_list_insert(&swc.seat->pointer->handlers, &screens_pointer_handler.link);
 51-	wl_signal_add(&swc.seat->pointer->focus.event_signal, &window_enter_listener);
 52+	wl_signal_add(&swc.seat->pointer->focus.event_signal,
 53+	              &window_enter_listener);
 54 
 55 	/* Calculate pointer region */
 56 	pixman_region32_init(&pointer_region);
 57 
 58-	wl_list_for_each (screen, &swc.screens, link) {
 59+	wl_list_for_each(screen, &swc.screens, link)
 60+	{
 61 		geom = &screen->base.geometry;
 62-		pixman_region32_union_rect(&pointer_region, &pointer_region, geom->x, geom->y, geom->width, geom->height);
 63+		pixman_region32_union_rect(&pointer_region, &pointer_region, geom->x,
 64+		                           geom->y, geom->width, geom->height);
 65 	}
 66 
 67 	pointer_set_region(swc.seat->pointer, &pointer_region);
 68@@ -97,8 +103,9 @@ swc_activate(void)
 69 {
 70 	swc.active = true;
 71 	send_event(&swc.event_signal, SWC_EVENT_ACTIVATED, NULL);
 72-	if (swc.manager->activate)
 73+	if (swc.manager->activate) {
 74 		swc.manager->activate();
 75+	}
 76 }
 77 
 78 void
 79@@ -106,34 +113,42 @@ swc_deactivate(void)
 80 {
 81 	swc.active = false;
 82 	send_event(&swc.event_signal, SWC_EVENT_DEACTIVATED, NULL);
 83-	if (swc.manager->deactivate)
 84+	if (swc.manager->deactivate) {
 85 		swc.manager->deactivate();
 86+	}
 87 }
 88 
 89 EXPORT bool
 90 swc_cursor_position(int32_t *x, int32_t *y)
 91 {
 92-	if (x)
 93+	if (x) {
 94 		*x = 0;
 95-	if (y)
 96+	}
 97+	if (y) {
 98 		*y = 0;
 99+	}
100 
101-	if (!swc.seat || !swc.seat->pointer)
102+	if (!swc.seat || !swc.seat->pointer) {
103 		return false;
104+	}
105 
106-	if (x)
107+	if (x) {
108 		*x = swc.seat->pointer->x;
109-	if (y)
110+	}
111+	if (y) {
112 		*y = swc.seat->pointer->y;
113+	}
114 
115 	return true;
116 }
117 
118 EXPORT bool
119-swc_initialize(struct wl_display *display, struct wl_event_loop *event_loop, const struct swc_manager *manager)
120+swc_initialize(struct wl_display *display, struct wl_event_loop *event_loop,
121+               const struct swc_manager *manager)
122 {
123 	swc.display = display;
124-	swc.event_loop = event_loop ? event_loop : wl_display_get_event_loop(display);
125+	swc.event_loop =
126+	    event_loop ? event_loop : wl_display_get_event_loop(display);
127 	swc.manager = manager;
128 	const char *default_seat = "seat0";
129 	wl_signal_init(&swc.event_signal);
+103, -50
  1@@ -45,7 +45,8 @@ struct wld_buffer;
  2  * wayland headers
  3  *
  4  */
  5-bool swc_cursor_position(int32_t *x, int32_t *y);
  6+bool
  7+swc_cursor_position(int32_t *x, int32_t *y);
  8 
  9 /**
 10  * Send a pointer button event to the currently focused client.
 11@@ -53,7 +54,8 @@ bool swc_cursor_position(int32_t *x, int32_t *y);
 12  * This is intended for window managers which intercept button events (for
 13  * example for mouse chords) but want normal clicks to still reach clients.
 14  */
 15-void swc_pointer_send_button(uint32_t time, uint32_t button, uint32_t state);
 16+void
 17+swc_pointer_send_button(uint32_t time, uint32_t button, uint32_t state);
 18 
 19 /**
 20  * Send a pointer axis event to the currently focused client.
 21@@ -63,7 +65,8 @@ void swc_pointer_send_button(uint32_t time, uint32_t button, uint32_t state);
 22  *
 23  * value120 uses the wl_pointer "120 units" convention.
 24  */
 25-void swc_pointer_send_axis(uint32_t time, uint32_t axis, int32_t value120);
 26+void
 27+swc_pointer_send_axis(uint32_t time, uint32_t axis, int32_t value120);
 28 
 29 /* Cursor control (compositor-internal cursor) */
 30 enum swc_cursor_kind {
 31@@ -85,15 +88,18 @@ enum swc_cursor_mode {
 32 /**
 33  * override the compositor's internal cursor
 34  *
 35- * this is intended for window managers to show mode cursors (move/resize/select) like the ones in hevel
 36- * If a client has set its own cursor surface, swc may ignore the override.
 37+ * this is intended for window managers to show mode cursors
 38+ * (move/resize/select) like the ones in hevel If a client has set its own
 39+ * cursor surface, swc may ignore the override.
 40  */
 41-void swc_set_cursor(enum swc_cursor_kind kind);
 42+void
 43+swc_set_cursor(enum swc_cursor_kind kind);
 44 
 45 /**
 46  * control whether client cursor surfaces are honored
 47  */
 48-void swc_set_cursor_mode(enum swc_cursor_mode mode);
 49+void
 50+swc_set_cursor_mode(enum swc_cursor_mode mode);
 51 
 52 /**
 53  * set a custom argb8888 cursor image for a given kind
 54@@ -101,12 +107,13 @@ void swc_set_cursor_mode(enum swc_cursor_mode mode);
 55  * `argb8888` is a pointer to `width*height` pixels in ARGB8888 order.
 56  * the caller has to keep the pixel memory alive for as long as it may be used
 57  */
 58-void swc_set_cursor_image(enum swc_cursor_kind kind,
 59-                          const uint32_t *argb8888,
 60-                          uint32_t width, uint32_t height,
 61-                          int32_t hotspot_x, int32_t hotspot_y);
 62+void
 63+swc_set_cursor_image(enum swc_cursor_kind kind, const uint32_t *argb8888,
 64+                     uint32_t width, uint32_t height, int32_t hotspot_x,
 65+                     int32_t hotspot_y);
 66 
 67-void swc_clear_cursor_image(enum swc_cursor_kind kind);
 68+void
 69+swc_clear_cursor_image(enum swc_cursor_kind kind);
 70 
 71 /**
 72  * draw [or update] a simple box overlay
 73@@ -115,12 +122,15 @@ void swc_clear_cursor_image(enum swc_cursor_kind kind);
 74  * coordinates. this draws only the border. Call swc_overlay_clear() to remove
 75  * it
 76  */
 77-void swc_overlay_set_box(int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color, uint32_t border_width);
 78+void
 79+swc_overlay_set_box(int32_t x1, int32_t y1, int32_t x2, int32_t y2,
 80+                    uint32_t color, uint32_t border_width);
 81 
 82 /**
 83  * Clear the current overlay, if any.
 84  */
 85-void swc_overlay_clear(void);
 86+void
 87+swc_overlay_clear(void);
 88 
 89 /**
 90  * Set the compositor zoom level.
 91@@ -128,12 +138,14 @@ void swc_overlay_clear(void);
 92  * 1.0 = normal, >1.0 = zoomed in, <1.0 = zoomed out
 93  * Uses software (pixman) scaling.
 94  */
 95-void swc_set_zoom(float level);
 96+void
 97+swc_set_zoom(float level);
 98 
 99 /**
100  * Get the current zoom level.
101  */
102-float swc_get_zoom(void);
103+float
104+swc_get_zoom(void);
105 
106 /* Rectangles {{{ */
107 
108@@ -160,8 +172,8 @@ struct swc_screen_handler {
109 	void (*geometry_changed)(void *data);
110 
111 	/**
112-	 * Called when the geometry of the screen available for laying out windows has
113-	 * changed.
114+	 * Called when the geometry of the screen available for laying out windows
115+	 * has changed.
116 	 *
117 	 * A window manager should respond by making sure all visible windows are
118 	 * within this area.
119@@ -189,7 +201,9 @@ struct swc_screen {
120 /**
121  * Set the handler associated with this screen.
122  */
123-void swc_screen_set_handler(struct swc_screen *screen, const struct swc_screen_handler *handler, void *data);
124+void
125+swc_screen_set_handler(struct swc_screen *screen,
126+                       const struct swc_screen_handler *handler, void *data);
127 
128 /* }}} */
129 
130@@ -260,29 +274,35 @@ struct swc_window {
131 /**
132  * Set the handler associated with this window.
133  */
134-void swc_window_set_handler(struct swc_window *window, const struct swc_window_handler *handler, void *data);
135+void
136+swc_window_set_handler(struct swc_window *window,
137+                       const struct swc_window_handler *handler, void *data);
138 
139 /**
140  * Request that the specified window close.
141  */
142-void swc_window_close(struct swc_window *window);
143+void
144+swc_window_close(struct swc_window *window);
145 
146 /**
147  * Make the specified window visible.
148  */
149-void swc_window_show(struct swc_window *window);
150+void
151+swc_window_show(struct swc_window *window);
152 
153 /**
154  * Make the specified window hidden.
155  */
156-void swc_window_hide(struct swc_window *window);
157+void
158+swc_window_hide(struct swc_window *window);
159 
160 /**
161  * Set the keyboard focus to the specified window.
162  *
163  * If window is NULL, the keyboard will have no focus.
164  */
165-void swc_window_focus(struct swc_window *window);
166+void
167+swc_window_focus(struct swc_window *window);
168 
169 /**
170  * Sets the window to stacked mode.
171@@ -293,7 +313,8 @@ void swc_window_focus(struct swc_window *window);
172  *
173  * Use of this mode is required to allow interactive moving and resizing.
174  */
175-void swc_window_set_stacked(struct swc_window *window);
176+void
177+swc_window_set_stacked(struct swc_window *window);
178 
179 /**
180  * Sets the window to tiled mode.
181@@ -304,12 +325,14 @@ void swc_window_set_stacked(struct swc_window *window);
182  *
183  * It is invalid to interactively move or resize a window in tiled mode.
184  */
185-void swc_window_set_tiled(struct swc_window *window);
186+void
187+swc_window_set_tiled(struct swc_window *window);
188 
189 /**
190  * Sets the window to fullscreen mode.
191  */
192-void swc_window_set_fullscreen(struct swc_window *window, struct swc_screen *screen);
193+void
194+swc_window_set_fullscreen(struct swc_window *window, struct swc_screen *screen);
195 
196 /**
197  * Set the window's position.
198@@ -317,7 +340,8 @@ void swc_window_set_fullscreen(struct swc_window *window, struct swc_screen *scr
199  * The x and y coordinates refer to the top-left corner of the actual contents
200  * of the window and should be adjusted for the border size.
201  */
202-void swc_window_set_position(struct swc_window *window, int32_t x, int32_t y);
203+void
204+swc_window_set_position(struct swc_window *window, int32_t x, int32_t y);
205 
206 /**
207  * Set the window's size.
208@@ -325,7 +349,8 @@ void swc_window_set_position(struct swc_window *window, int32_t x, int32_t y);
209  * The width and height refer to the dimension of the actual contents of the
210  * window and should be adjusted for the border size.
211  */
212-void swc_window_set_size(struct swc_window *window, uint32_t width, uint32_t height);
213+void
214+swc_window_set_size(struct swc_window *window, uint32_t width, uint32_t height);
215 
216 /**
217  * Set the window's size and position.
218@@ -333,39 +358,49 @@ void swc_window_set_size(struct swc_window *window, uint32_t width, uint32_t hei
219  * This is a convenience function that is equivalent to calling
220  * swc_window_set_size and then swc_window_set_position.
221  */
222-void swc_window_set_geometry(struct swc_window *window, const struct swc_rectangle *geometry);
223+void
224+swc_window_set_geometry(struct swc_window *window,
225+                        const struct swc_rectangle *geometry);
226 
227 /**
228  * Get the window's current geometry in compositor-global coordinates.
229  */
230-bool swc_window_get_geometry(const struct swc_window *window, struct swc_rectangle *geometry);
231+bool
232+swc_window_get_geometry(const struct swc_window *window,
233+                        struct swc_rectangle *geometry);
234 
235 /**
236  * Get the pid of the client that owns this window
237  *
238  * returns pid, or 0 if unavailable
239  */
240-pid_t swc_window_get_pid(struct swc_window *window);
241+pid_t
242+swc_window_get_pid(struct swc_window *window);
243 
244 /**
245  * Set the window's border color and width.
246  *
247  * NOTE: The window's geometry remains unchanged, and should be updated if a
248  *       fixed top-left corner of the border is desired.
249- * 
250+ *
251  * info from dalem: unsure how much double borders break!
252  */
253-void swc_window_set_border(struct swc_window *window, uint32_t inner_border_color, uint32_t inner_border_width, uint32_t outer_border_color, uint32_t outer_border_width);
254+void
255+swc_window_set_border(struct swc_window *window, uint32_t inner_border_color,
256+                      uint32_t inner_border_width, uint32_t outer_border_color,
257+                      uint32_t outer_border_width);
258 
259 /**
260  * Begin an interactive move of the specified window.
261  */
262-void swc_window_begin_move(struct swc_window *window);
263+void
264+swc_window_begin_move(struct swc_window *window);
265 
266 /**
267  * End an interactive move of the specified window.
268  */
269-void swc_window_end_move(struct swc_window *window);
270+void
271+swc_window_end_move(struct swc_window *window);
272 
273 enum {
274 	SWC_WINDOW_EDGE_AUTO = 0,
275@@ -378,19 +413,22 @@ enum {
276 /**
277  * Begin an interactive resize of the specified window.
278  */
279-void swc_window_begin_resize(struct swc_window *window, uint32_t edges);
280+void
281+swc_window_begin_resize(struct swc_window *window, uint32_t edges);
282 
283 /**
284  * End an interactive resize of the specified window.
285  */
286-void swc_window_end_resize(struct swc_window *window);
287+void
288+swc_window_end_resize(struct swc_window *window);
289 
290 /**
291  * returns the topmost window at any given compositor global coordinates
292  *
293  * returns null if there is no window at that point
294  */
295-struct swc_window *swc_window_at(int32_t x, int32_t y);
296+struct swc_window *
297+swc_window_at(int32_t x, int32_t y);
298 
299 /**
300  * move a window in the stacking order by one step
301@@ -398,7 +436,8 @@ struct swc_window *swc_window_at(int32_t x, int32_t y);
302  * direction < 0 moves the window towards the front (higher)
303  * direction > 0 moves the window towards the back (lower)
304  */
305-void swc_window_stack(struct swc_window *window, int32_t direction);
306+void
307+swc_window_stack(struct swc_window *window, int32_t direction);
308 
309 /* }}} */
310 
311@@ -417,15 +456,19 @@ enum swc_binding_type {
312 	SWC_BINDING_BUTTON,
313 };
314 
315-typedef void (*swc_binding_handler)(void *data, uint32_t time, uint32_t value, uint32_t state);
316-typedef void (*swc_axis_binding_handler)(void *data, uint32_t time, uint32_t axis, int32_t value120);
317+typedef void (*swc_binding_handler)(void *data, uint32_t time, uint32_t value,
318+                                    uint32_t state);
319+typedef void (*swc_axis_binding_handler)(void *data, uint32_t time,
320+                                         uint32_t axis, int32_t value120);
321 
322 /**
323  * Register a new input binding.
324  *
325  * Returns 0 on success, negative error code otherwise.
326  */
327-int swc_add_binding(enum swc_binding_type type, uint32_t modifiers, uint32_t value, swc_binding_handler handler, void *data);
328+int
329+swc_add_binding(enum swc_binding_type type, uint32_t modifiers, uint32_t value,
330+                swc_binding_handler handler, void *data);
331 
332 /**
333  * register a new pointer axis binding
334@@ -433,7 +476,9 @@ int swc_add_binding(enum swc_binding_type type, uint32_t modifiers, uint32_t val
335  * this will intercept axis events from clients; use swc_pointer_send_axis()
336  * from the handler to forward events when appropriate
337  */
338-int swc_add_axis_binding(uint32_t modifiers, uint32_t axis, swc_axis_binding_handler handler, void *data);
339+int
340+swc_add_axis_binding(uint32_t modifiers, uint32_t axis,
341+                     swc_axis_binding_handler handler, void *data);
342 
343 /* }}} */
344 
345@@ -442,14 +487,17 @@ int swc_add_axis_binding(uint32_t modifiers, uint32_t axis, swc_axis_binding_han
346 /**
347  * Set fallback wallpaper buffer for all screens that dom't have an override.
348  */
349-void swc_wallpaper_set_buffer(struct wld_buffer *buffer);
350+void
351+swc_wallpaper_set_buffer(struct wld_buffer *buffer);
352 
353 /**
354  * Set wallpaper buffer for specified screen id.
355  *
356  * Passing NULL clears the override for that screen.
357  */
358-void swc_wallpaper_set_buffer_for_screen(uint8_t screen_id, struct wld_buffer *buffer);
359+void
360+swc_wallpaper_set_buffer_for_screen(uint8_t screen_id,
361+                                    struct wld_buffer *buffer);
362 
363 /**
364  * Set wallpaper to a single color
365@@ -458,7 +506,8 @@ void swc_wallpaper_set_buffer_for_screen(uint8_t screen_id, struct wld_buffer *b
366  */
367 
368 extern uint32_t bgcolor;
369-void swc_wallpaper_color_set(uint32_t color);
370+void
371+swc_wallpaper_color_set(uint32_t color);
372 
373 /* }}} */
374 
375@@ -483,7 +532,8 @@ struct swc_manager {
376 	void (*new_device)(struct libinput_device *device);
377 
378 	/**
379-	 * Called when the session gets activated (for example, startup or VT switch).
380+	 * Called when the session gets activated (for example, startup or VT
381+	 * switch).
382 	 */
383 	void (*activate)(void);
384 
385@@ -497,12 +547,15 @@ struct swc_manager {
386  * Initializes the compositor using the specified display, event_loop, and
387  * manager.
388  */
389-bool swc_initialize(struct wl_display *display, struct wl_event_loop *event_loop, const struct swc_manager *manager);
390+bool
391+swc_initialize(struct wl_display *display, struct wl_event_loop *event_loop,
392+               const struct swc_manager *manager);
393 
394 /**
395  * Stops the compositor, releasing any used resources.
396  */
397-void swc_finalize(void);
398+void
399+swc_finalize(void);
400 
401 #ifdef __cplusplus
402 }
+4, -2
 1@@ -26,8 +26,10 @@
 2 #include <wayland-server.h>
 3 
 4 pixman_box32_t infinite_extents = {
 5-	.x1 = INT32_MIN, .y1 = INT32_MIN,
 6-	.x2 = INT32_MAX, .y2 = INT32_MAX,
 7+    .x1 = INT32_MIN,
 8+    .y1 = INT32_MIN,
 9+    .x2 = INT32_MAX,
10+    .y2 = INT32_MAX,
11 };
12 
13 void
+27, -21
 1@@ -26,27 +26,26 @@
 2 
 3 #include "swc.h"
 4 
 5-#include <stdlib.h>
 6-#include <stdio.h>
 7+#include <pixman.h>
 8 #include <stdbool.h>
 9+#include <stdio.h>
10+#include <stdlib.h>
11 #include <string.h>
12 #include <sys/time.h>
13-#include <pixman.h>
14 #include <wayland-util.h>
15 
16 #define EXPORT __attribute__((visibility("default")))
17 
18 #if ENABLE_DEBUG
19-#define MESSAGE_SOURCE \
20-	fprintf(stderr, "[swc:%s:%d] ", __FILE__, __LINE__);
21+#define MESSAGE_SOURCE fprintf(stderr, "[swc:%s:%d] ", __FILE__, __LINE__);
22 #else
23 #define MESSAGE_SOURCE
24 #endif
25 
26-#define MESSAGE(type, format, ...) \
27-	do { \
28-		MESSAGE_SOURCE \
29-		fprintf(stderr, type ": " format, ##__VA_ARGS__); \
30+#define MESSAGE(type, format, ...)                                             \
31+	do {                                                                       \
32+		MESSAGE_SOURCE                                                         \
33+		fprintf(stderr, type ": " format, ##__VA_ARGS__);                      \
34 	} while (false)
35 
36 #define WARNING(format, ...) MESSAGE("WARNING", format, ##__VA_ARGS__)
37@@ -66,8 +65,10 @@
38 struct wl_resource;
39 struct wl_client;
40 
41-void remove_resource(struct wl_resource *resource);
42-void destroy_resource(struct wl_client *client, struct wl_resource *resource);
43+void
44+remove_resource(struct wl_resource *resource);
45+void
46+destroy_resource(struct wl_client *client, struct wl_resource *resource);
47 
48 static inline uint32_t
49 get_time(void)
50@@ -81,27 +82,32 @@ get_time(void)
51 extern pixman_box32_t infinite_extents;
52 
53 static inline bool
54-rectangle_contains_point(const struct swc_rectangle *rectangle, int32_t x, int32_t y)
55+rectangle_contains_point(const struct swc_rectangle *rectangle,
56+                         int32_t x,
57+                         int32_t y)
58 {
59-	return x > rectangle->x && x < rectangle->x + rectangle->width
60-	       && y > rectangle->y && y < rectangle->y + rectangle->height;
61+	return x > rectangle->x && x < rectangle->x + rectangle->width &&
62+	       y > rectangle->y && y < rectangle->y + rectangle->height;
63 }
64 
65 static inline bool
66-rectangle_overlap(const struct swc_rectangle *r1, const struct swc_rectangle *r2)
67+rectangle_overlap(const struct swc_rectangle *r1,
68+                  const struct swc_rectangle *r2)
69 {
70-	return (MAX(r1->x + r1->width, r2->x + r2->width) - MIN(r1->x, r2->x)
71-	        < r1->width + r2->width)
72-	       && (MAX(r1->y + r1->height, r2->y + r2->height) - MIN(r1->y, r2->y)
73-	           < r1->height + r2->height);
74+	return (MAX(r1->x + r1->width, r2->x + r2->width) - MIN(r1->x, r2->x) <
75+	        r1->width + r2->width) &&
76+	       (MAX(r1->y + r1->height, r2->y + r2->height) - MIN(r1->y, r2->y) <
77+	        r1->height + r2->height);
78 }
79 
80 static inline void
81 array_remove(struct wl_array *array, void *item, size_t size)
82 {
83-	size_t bytes = array->size - ((intptr_t)item + size - (intptr_t)array->data);
84-	if (bytes > 0)
85+	size_t bytes =
86+	    array->size - ((intptr_t)item + size - (intptr_t)array->data);
87+	if (bytes > 0) {
88 		memmove(item, (void *)((intptr_t)item + size), bytes);
89+	}
90 	array->size -= size;
91 }
92 
+31, -18
  1@@ -29,12 +29,13 @@
  2 
  3 #include <wld/wld.h>
  4 
  5-#define HANDLE(view, handler, method, ...) \
  6-	do { \
  7-		wl_list_for_each (handler, &view->handlers, link) { \
  8-			if (handler->impl->method) \
  9-				handler->impl->method(handler, ##__VA_ARGS__); \
 10-		} \
 11+#define HANDLE(view, handler, method, ...)                                     \
 12+	do {                                                                       \
 13+		wl_list_for_each(handler, &view->handlers, link)                       \
 14+		{                                                                      \
 15+			if (handler->impl->method)                                         \
 16+				handler->impl->method(handler, ##__VA_ARGS__);                 \
 17+		}                                                                      \
 18 	} while (0)
 19 
 20 void
 21@@ -53,8 +54,9 @@ view_initialize(struct view *view, const struct view_impl *impl)
 22 void
 23 view_finalize(struct view *view)
 24 {
 25-	if (view->buffer)
 26+	if (view->buffer) {
 27 		wld_buffer_unreference(view->buffer);
 28+	}
 29 }
 30 
 31 int
 32@@ -63,14 +65,17 @@ view_attach(struct view *view, struct wld_buffer *buffer)
 33 	int ret;
 34 	struct view_handler *handler;
 35 
 36-	if ((ret = view->impl->attach(view, buffer)) < 0)
 37+	if ((ret = view->impl->attach(view, buffer)) < 0) {
 38 		return ret;
 39+	}
 40 
 41-	if (view->buffer)
 42+	if (view->buffer) {
 43 		wld_buffer_unreference(view->buffer);
 44+	}
 45 
 46-	if (buffer)
 47+	if (buffer) {
 48 		wld_buffer_reference(buffer);
 49+	}
 50 
 51 	view->buffer = buffer;
 52 	HANDLE(view, handler, attach);
 53@@ -95,8 +100,9 @@ view_set_position(struct view *view, int32_t x, int32_t y)
 54 {
 55 	struct view_handler *handler;
 56 
 57-	if (x == view->geometry.x && y == view->geometry.y)
 58+	if (x == view->geometry.x && y == view->geometry.y) {
 59 		return false;
 60+	}
 61 
 62 	view->geometry.x = x;
 63 	view->geometry.y = y;
 64@@ -110,10 +116,12 @@ view_set_size(struct view *view, uint32_t width, uint32_t height)
 65 {
 66 	struct view_handler *handler;
 67 
 68-	if (view->geometry.width == width && view->geometry.height == height)
 69+	if (view->geometry.width == width && view->geometry.height == height) {
 70 		return false;
 71+	}
 72 
 73-	uint32_t old_width = view->geometry.width, old_height = view->geometry.height;
 74+	uint32_t old_width = view->geometry.width,
 75+	         old_height = view->geometry.height;
 76 
 77 	view->geometry.width = width;
 78 	view->geometry.height = height;
 79@@ -125,16 +133,19 @@ view_set_size(struct view *view, uint32_t width, uint32_t height)
 80 bool
 81 view_set_size_from_buffer(struct view *view, struct wld_buffer *buffer)
 82 {
 83-	return view_set_size(view, buffer ? buffer->width : 0, buffer ? buffer->height : 0);
 84+	return view_set_size(view, buffer ? buffer->width : 0,
 85+	                     buffer ? buffer->height : 0);
 86 }
 87 
 88 void
 89 view_set_screens(struct view *view, uint32_t screens)
 90 {
 91-	if (view->screens == screens)
 92+	if (view->screens == screens) {
 93 		return;
 94+	}
 95 
 96-	uint32_t entered = screens & ~view->screens, left = view->screens & ~screens;
 97+	uint32_t entered = screens & ~view->screens,
 98+	         left = view->screens & ~screens;
 99 	struct view_handler *handler;
100 
101 	view->screens = screens;
102@@ -147,9 +158,11 @@ view_update_screens(struct view *view)
103 	uint32_t screens = 0;
104 	struct screen *screen;
105 
106-	wl_list_for_each (screen, &swc.screens, link) {
107-		if (rectangle_overlap(&screen->base.geometry, &view->geometry))
108+	wl_list_for_each(screen, &swc.screens, link)
109+	{
110+		if (rectangle_overlap(&screen->base.geometry, &view->geometry)) {
111 			screens |= screen_mask(screen);
112+		}
113 	}
114 
115 	view_set_screens(view, screens);
+27, -14
 1@@ -74,9 +74,11 @@ struct view_handler_impl {
 2 	/* Called after the view's position changes. */
 3 	void (*move)(struct view_handler *handler);
 4 	/* Called after the view's size changes. */
 5-	void (*resize)(struct view_handler *handler, uint32_t old_width, uint32_t old_height);
 6+	void (*resize)(struct view_handler *handler, uint32_t old_width,
 7+	               uint32_t old_height);
 8 	/* Called when the set of screens the view is visible on changes. */
 9-	void (*screens)(struct view_handler *handler, uint32_t left, uint32_t entered);
10+	void (*screens)(struct view_handler *handler, uint32_t left,
11+	                uint32_t entered);
12 };
13 
14 /**
15@@ -86,39 +88,49 @@ struct view_handler_impl {
16  *
17  * @return 0 on success, negative error code otherwise.
18  */
19-int view_attach(struct view *view, struct wld_buffer *buffer);
20+int
21+view_attach(struct view *view, struct wld_buffer *buffer);
22 
23 /**
24  * Display a new frame consisting of the currently attached buffer.
25  *
26  * @return Whether or not the update succeeds.
27  */
28-bool view_update(struct view *view);
29+bool
30+view_update(struct view *view);
31 
32 /**
33  * Move the view to the specified coordinates, if supported.
34  *
35  * @return Whether or not the move succeeds.
36  */
37-bool view_move(struct view *view, int32_t x, int32_t y);
38+bool
39+view_move(struct view *view, int32_t x, int32_t y);
40 
41 /**** For internal view use only ****/
42 
43 /**
44  * Initialize a new view with the specified implementation.
45  */
46-void view_initialize(struct view *view, const struct view_impl *impl);
47+void
48+view_initialize(struct view *view, const struct view_impl *impl);
49 
50 /**
51  * Release any resources associated with this view.
52  */
53-void view_finalize(struct view *view);
54-
55-bool view_set_position(struct view *view, int32_t x, int32_t y);
56-bool view_set_size(struct view *view, uint32_t width, uint32_t height);
57-bool view_set_size_from_buffer(struct view *view, struct wld_buffer *bufer);
58-void view_set_screens(struct view *view, uint32_t screens);
59-void view_update_screens(struct view *view);
60+void
61+view_finalize(struct view *view);
62+
63+bool
64+view_set_position(struct view *view, int32_t x, int32_t y);
65+bool
66+view_set_size(struct view *view, uint32_t width, uint32_t height);
67+bool
68+view_set_size_from_buffer(struct view *view, struct wld_buffer *bufer);
69+void
70+view_set_screens(struct view *view, uint32_t screens);
71+void
72+view_update_screens(struct view *view);
73 
74 /**
75  * Send a new frame event through the view's event signal.
76@@ -127,6 +139,7 @@ void view_update_screens(struct view *view);
77  * the user. If time information is not available, get_time() can be passed
78  * instead.
79  */
80-void view_frame(struct view *view, uint32_t time);
81+void
82+view_frame(struct view *view, uint32_t time);
83 
84 #endif
+24, -16
  1@@ -1,12 +1,12 @@
  2 #include <wld/wld.h>
  3 
  4-#include "swc.h"
  5 #include "compositor.h"
  6 #include "screen.h"
  7+#include "swc.h"
  8+#include "swc_wallpaper-server-protocol.h"
  9 #include "util.h"
 10-#include "wayland_buffer.h"
 11 #include "wallpaper.h"
 12-#include "swc_wallpaper-server-protocol.h"
 13+#include "wayland_buffer.h"
 14 
 15 #define MAX_WALLPAPER_SCREENS 32
 16 
 17@@ -17,10 +17,12 @@ uint32_t bgcolor = 0xff000000;
 18 static void
 19 set_buffer_slot(struct wld_buffer **slot, struct wld_buffer *buffer)
 20 {
 21-	if (buffer)
 22+	if (buffer) {
 23 		wld_buffer_reference(buffer);
 24-	if (*slot)
 25+	}
 26+	if (*slot) {
 27 		wld_buffer_unreference(*slot);
 28+	}
 29 
 30 	*slot = buffer;
 31 }
 32@@ -28,10 +30,10 @@ set_buffer_slot(struct wld_buffer **slot, struct wld_buffer *buffer)
 33 struct wld_buffer *
 34 swc_wallpaper_buffer_for_screen(struct screen *screen)
 35 {
 36-	if (screen
 37-	 && screen->id < ARRAY_LENGTH(screen_wallbuf)
 38-	 && screen_wallbuf[screen->id])
 39+	if (screen && screen->id < ARRAY_LENGTH(screen_wallbuf) &&
 40+	    screen_wallbuf[screen->id]) {
 41 		return screen_wallbuf[screen->id];
 42+	}
 43 
 44 	return wallbuf;
 45 }
 46@@ -44,10 +46,12 @@ swc_wallpaper_set_buffer(struct wld_buffer *buffer)
 47 }
 48 
 49 EXPORT void
 50-swc_wallpaper_set_buffer_for_screen(uint8_t screen_id, struct wld_buffer *buffer)
 51+swc_wallpaper_set_buffer_for_screen(uint8_t screen_id,
 52+                                    struct wld_buffer *buffer)
 53 {
 54-	if (screen_id >= ARRAY_LENGTH(screen_wallbuf))
 55+	if (screen_id >= ARRAY_LENGTH(screen_wallbuf)) {
 56 		return;
 57+	}
 58 
 59 	set_buffer_slot(&screen_wallbuf[screen_id], buffer);
 60 	compositor_damage_all();
 61@@ -82,25 +86,28 @@ set_buffer(struct wl_client *client, struct wl_resource *resource,
 62 		return;
 63 	}
 64 
 65-	if (screen_id < 0 || screen_id >= ARRAY_LENGTH(screen_wallbuf))
 66+	if (screen_id < 0 || screen_id >= ARRAY_LENGTH(screen_wallbuf)) {
 67 		return;
 68+	}
 69 
 70 	swc_wallpaper_set_buffer_for_screen((uint8_t)screen_id, buffer);
 71 }
 72 
 73 static const struct swc_wallpaper_interface wallpaper_impl = {
 74-	.destroy = destroy_resource,
 75-	.set_buffer = set_buffer,
 76+    .destroy = destroy_resource,
 77+    .set_buffer = set_buffer,
 78 };
 79 
 80 static void
 81-bind_wallpaper(struct wl_client *client, void *data, uint32_t version, uint32_t id)
 82+bind_wallpaper(struct wl_client *client, void *data, uint32_t version,
 83+               uint32_t id)
 84 {
 85 	(void)data;
 86 
 87 	struct wl_resource *resource;
 88 
 89-	resource = wl_resource_create(client, &swc_wallpaper_interface, version, id);
 90+	resource =
 91+	    wl_resource_create(client, &swc_wallpaper_interface, version, id);
 92 	if (!resource) {
 93 		wl_client_post_no_memory(client);
 94 		return;
 95@@ -112,5 +119,6 @@ bind_wallpaper(struct wl_client *client, void *data, uint32_t version, uint32_t
 96 struct wl_global *
 97 swc_wallpaper_manager_create(struct wl_display *display)
 98 {
 99-	return wl_global_create(display, &swc_wallpaper_interface, 1, NULL, bind_wallpaper);
100+	return wl_global_create(display, &swc_wallpaper_interface, 1, NULL,
101+	                        bind_wallpaper);
102 }
+4, -2
 1@@ -6,7 +6,9 @@ struct wl_global;
 2 struct wld_buffer;
 3 struct screen;
 4 
 5-struct wl_global *swc_wallpaper_manager_create(struct wl_display *display);
 6-struct wld_buffer *swc_wallpaper_buffer_for_screen(struct screen *screen);
 7+struct wl_global *
 8+swc_wallpaper_manager_create(struct wl_display *display);
 9+struct wld_buffer *
10+swc_wallpaper_buffer_for_screen(struct screen *screen);
11 
12 #endif
+10, -6
 1@@ -26,18 +26,19 @@
 2 #include "shm.h"
 3 #include "util.h"
 4 
 5-#include <wld/wld.h>
 6 #include <wld/pixman.h>
 7+#include <wld/wld.h>
 8 
 9 static const struct wl_buffer_interface buffer_impl = {
10-	.destroy = destroy_resource,
11+    .destroy = destroy_resource,
12 };
13 
14 struct wld_buffer *
15 wayland_buffer_get(struct wl_resource *resource)
16 {
17-	if (wl_resource_instance_of(resource, &wl_buffer_interface, &buffer_impl))
18+	if (wl_resource_instance_of(resource, &wl_buffer_interface, &buffer_impl)) {
19 		return wl_resource_get_user_data(resource);
20+	}
21 
22 	return NULL;
23 }
24@@ -50,12 +51,15 @@ destroy_buffer(struct wl_resource *resource)
25 }
26 
27 struct wl_resource *
28-wayland_buffer_create_resource(struct wl_client *client, uint32_t version, uint32_t id, struct wld_buffer *buffer)
29+wayland_buffer_create_resource(struct wl_client *client, uint32_t version,
30+                               uint32_t id, struct wld_buffer *buffer)
31 {
32 	struct wl_resource *resource;
33 
34 	resource = wl_resource_create(client, &wl_buffer_interface, version, id);
35-	if (resource)
36-		wl_resource_set_implementation(resource, &buffer_impl, buffer, &destroy_buffer);
37+	if (resource) {
38+		wl_resource_set_implementation(resource, &buffer_impl, buffer,
39+		                               &destroy_buffer);
40+	}
41 	return resource;
42 }
+5, -2
 1@@ -29,7 +29,10 @@
 2 struct wl_client;
 3 struct wl_resource;
 4 
 5-struct wld_buffer *wayland_buffer_get(struct wl_resource *resource);
 6-struct wl_resource *wayland_buffer_create_resource(struct wl_client *client, uint32_t version, uint32_t id, struct wld_buffer *buffer);
 7+struct wld_buffer *
 8+wayland_buffer_get(struct wl_resource *resource);
 9+struct wl_resource *
10+wayland_buffer_create_resource(struct wl_client *client, uint32_t version,
11+                               uint32_t id, struct wld_buffer *buffer);
12 
13 #endif
+167, -86
  1@@ -45,11 +45,13 @@ static const struct swc_window_handler null_handler;
  2 static bool
  3 should_throttle_motion(uint32_t throttle_ms, uint32_t *last_time, uint32_t time)
  4 {
  5-	if (!throttle_ms)
  6+	if (!throttle_ms) {
  7 		return false;
  8+	}
  9 
 10-	if (*last_time && time - *last_time < throttle_ms)
 11+	if (*last_time && time - *last_time < throttle_ms) {
 12 		return true;
 13+	}
 14 
 15 	*last_time = time;
 16 	return false;
 17@@ -58,31 +60,39 @@ should_throttle_motion(uint32_t throttle_ms, uint32_t *last_time, uint32_t time)
 18 static uint32_t
 19 clamp_dimension(int32_t value, uint32_t min, uint32_t max)
 20 {
 21-	if (value < 0)
 22+	if (value < 0) {
 23 		value = 0;
 24+	}
 25 
 26-	if (min && value < min)
 27+	if (min && value < min) {
 28 		value = min;
 29+	}
 30 
 31 	if (max) {
 32-		if (min && max < min)
 33+		if (min && max < min) {
 34 			max = min;
 35+		}
 36 
 37-		if (value > max)
 38+		if (value > max) {
 39 			value = max;
 40+		}
 41 	}
 42 
 43-	if (value > UINT32_MAX)
 44+	if (value > UINT32_MAX) {
 45 		value = UINT32_MAX;
 46+	}
 47 
 48 	return value;
 49 }
 50 
 51 static void
 52-clamp_window_size(const struct window *window, uint32_t *width, uint32_t *height)
 53+clamp_window_size(const struct window *window, uint32_t *width,
 54+                  uint32_t *height)
 55 {
 56-	*width = clamp_dimension(*width, window->base.min_width, window->base.max_width);
 57-	*height = clamp_dimension(*height, window->base.min_height, window->base.max_height);
 58+	*width =
 59+	    clamp_dimension(*width, window->base.min_width, window->base.max_width);
 60+	*height = clamp_dimension(*height, window->base.min_height,
 61+	                          window->base.max_height);
 62 }
 63 
 64 static void
 65@@ -92,22 +102,26 @@ handle_window_enter(struct wl_listener *listener, void *data)
 66 	struct input_focus_event_data *event_data = event->data;
 67 	struct window *window;
 68 
 69-	if (event->type != INPUT_FOCUS_EVENT_CHANGED)
 70+	if (event->type != INPUT_FOCUS_EVENT_CHANGED) {
 71 		return;
 72+	}
 73 
 74-	if (!event_data->new || !(window = event_data->new->window))
 75+	if (!event_data->new || !(window = event_data->new->window)) {
 76 		return;
 77+	}
 78 
 79-	if (window->handler->entered)
 80+	if (window->handler->entered) {
 81 		window->handler->entered(window->handler_data);
 82+	}
 83 }
 84 
 85 struct wl_listener window_enter_listener = {
 86-	.notify = handle_window_enter,
 87+    .notify = handle_window_enter,
 88 };
 89 
 90 static void
 91-begin_interaction(struct window_pointer_interaction *interaction, struct button *button)
 92+begin_interaction(struct window_pointer_interaction *interaction,
 93+                  struct button *button)
 94 {
 95 	if (button) {
 96 		/* Store the serial of the button press so we are able to cancel the
 97@@ -124,10 +138,12 @@ begin_interaction(struct window_pointer_interaction *interaction, struct button
 98 }
 99 
100 static void
101-end_interaction(struct window_pointer_interaction *interaction, struct button *button)
102+end_interaction(struct window_pointer_interaction *interaction,
103+                struct button *button)
104 {
105-	if (!interaction->active)
106+	if (!interaction->active) {
107 		return;
108+	}
109 
110 	if (interaction->original_handler) {
111 		if (!button) {
112@@ -139,7 +155,9 @@ end_interaction(struct window_pointer_interaction *interaction, struct button *b
113 			}
114 		}
115 
116-		interaction->original_handler->button(interaction->original_handler, get_time(), button, WL_POINTER_BUTTON_STATE_RELEASED);
117+		interaction->original_handler->button(interaction->original_handler,
118+		                                      get_time(), button,
119+		                                      WL_POINTER_BUTTON_STATE_RELEASED);
120 	}
121 
122 remove:
123@@ -151,8 +169,9 @@ static void
124 flush(struct window *window)
125 {
126 	if (window->move.pending) {
127-		if (window->impl->move)
128+		if (window->impl->move) {
129 			window->impl->move(window, window->move.x, window->move.y);
130+		}
131 
132 		view_move(&window->view->base, window->move.x, window->move.y);
133 		window->move.pending = false;
134@@ -160,7 +179,8 @@ flush(struct window *window)
135 }
136 
137 EXPORT void
138-swc_window_set_handler(struct swc_window *base, const struct swc_window_handler *handler, void *data)
139+swc_window_set_handler(struct swc_window *base,
140+                       const struct swc_window_handler *handler, void *data)
141 {
142 	struct window *window = INTERNAL(base);
143 
144@@ -173,8 +193,9 @@ swc_window_close(struct swc_window *base)
145 {
146 	struct window *window = INTERNAL(base);
147 
148-	if (window->impl->close)
149+	if (window->impl->close) {
150 		window->impl->close(window);
151+	}
152 }
153 
154 EXPORT void
155@@ -193,18 +214,22 @@ EXPORT void
156 swc_window_focus(struct swc_window *base)
157 {
158 	struct window *window = INTERNAL(base);
159-	struct compositor_view *new = window ? window->view : NULL, *old = swc.seat->keyboard->focus.view;
160+	struct compositor_view *new = window ? window->view : NULL,
161+	                       *old = swc.seat->keyboard->focus.view;
162 
163-	if (new == old)
164+	if (new == old) {
165 		return;
166+	}
167 
168 	/* Focus the new window before unfocusing the old one in case both are X11
169 	 * windows so the xwl_window implementation can handle this transition
170 	 * correctly. */
171-	if (window && window->impl->focus)
172+	if (window && window->impl->focus) {
173 		window->impl->focus(window);
174-	if (old && old->window && old->window->impl->unfocus)
175+	}
176+	if (old && old->window && old->window->impl->unfocus) {
177 		old->window->impl->unfocus(old->window);
178+	}
179 
180 	keyboard_set_focus(swc.seat->keyboard, new);
181 }
182@@ -218,8 +243,9 @@ swc_window_set_stacked(struct swc_window *base)
183 	window->configure.pending = false;
184 	window->configure.width = 0;
185 	window->configure.height = 0;
186-	if (window->impl->set_mode)
187+	if (window->impl->set_mode) {
188 		window->impl->set_mode(window, WINDOW_MODE_STACKED);
189+	}
190 	window->mode = WINDOW_MODE_STACKED;
191 }
192 
193@@ -230,8 +256,9 @@ swc_window_set_tiled(struct swc_window *base)
194 
195 	end_interaction(&window->move.interaction, NULL);
196 	end_interaction(&window->resize.interaction, NULL);
197-	if (window->impl->set_mode)
198+	if (window->impl->set_mode) {
199 		window->impl->set_mode(window, WINDOW_MODE_TILED);
200+	}
201 	window->mode = WINDOW_MODE_TILED;
202 }
203 
204@@ -242,14 +269,15 @@ swc_window_set_fullscreen(struct swc_window *base, struct swc_screen *screen)
205 
206 	struct swc_rectangle geom;
207 	swc_window_get_geometry(base, &geom);
208-	
209+
210 	if (window->mode != WINDOW_MODE_FULLSCREEN) {
211 		window->prev.geom = geom;
212 		window->prev.mode = window->mode;
213 		swc_window_set_geometry(base, &screen->usable_geometry);
214 
215-		if (window->impl->set_mode)
216+		if (window->impl->set_mode) {
217 			window->impl->set_mode(window, WINDOW_MODE_FULLSCREEN);
218+		}
219 		window->mode = WINDOW_MODE_FULLSCREEN;
220 	}
221 
222@@ -275,8 +303,9 @@ swc_window_set_position(struct swc_window *base, int32_t x, int32_t y)
223 	window->move.pending = true;
224 
225 	/* If we don't have a configure pending, perform the move now. */
226-	if (!window->configure.pending)
227+	if (!window->configure.pending) {
228 		flush(window);
229+	}
230 }
231 
232 EXPORT void
233@@ -287,9 +316,10 @@ swc_window_set_size(struct swc_window *base, uint32_t width, uint32_t height)
234 
235 	clamp_window_size(window, &width, &height);
236 
237-	if ((window->configure.pending && width == window->configure.width && height == window->configure.height)
238-	 || (!window->configure.pending && width == geom->width && height == geom->height))
239-	{
240+	if ((window->configure.pending && width == window->configure.width &&
241+	     height == window->configure.height) ||
242+	    (!window->configure.pending && width == geom->width &&
243+	     height == geom->height)) {
244 		return;
245 	}
246 
247@@ -303,35 +333,40 @@ swc_window_set_size(struct swc_window *base, uint32_t width, uint32_t height)
248 }
249 
250 EXPORT void
251-swc_window_set_geometry(struct swc_window *window, const struct swc_rectangle *geometry)
252+swc_window_set_geometry(struct swc_window *window,
253+                        const struct swc_rectangle *geometry)
254 {
255 	swc_window_set_size(window, geometry->width, geometry->height);
256 	swc_window_set_position(window, geometry->x, geometry->y);
257 }
258 
259 EXPORT bool
260-swc_window_get_geometry(const struct swc_window *base, struct swc_rectangle *geometry)
261+swc_window_get_geometry(const struct swc_window *base,
262+                        struct swc_rectangle *geometry)
263 {
264 	struct window *window = INTERNAL((struct swc_window *)base);
265 
266-	if (!window || !geometry)
267+	if (!window || !geometry) {
268 		return false;
269+	}
270 
271 	*geometry = window->view->base.geometry;
272 	return true;
273 }
274 
275 EXPORT void
276-swc_window_set_border(struct swc_window *window, uint32_t inner_border_color, uint32_t inner_border_width, 
277-		uint32_t outer_border_color, uint32_t outer_border_width)
278+swc_window_set_border(struct swc_window *window, uint32_t inner_border_color,
279+                      uint32_t inner_border_width, uint32_t outer_border_color,
280+                      uint32_t outer_border_width)
281 {
282 	struct compositor_view *view = INTERNAL(window)->view;
283 
284-	compositor_view_set_border_color(view, outer_border_color, inner_border_color);
285-	compositor_view_set_border_width(view, outer_border_width, inner_border_width);
286+	compositor_view_set_border_color(view, outer_border_color,
287+	                                 inner_border_color);
288+	compositor_view_set_border_width(view, outer_border_width,
289+	                                 inner_border_width);
290 }
291 
292-
293 EXPORT void
294 swc_window_begin_move(struct swc_window *window)
295 {
296@@ -357,12 +392,16 @@ swc_window_end_resize(struct swc_window *window)
297 }
298 
299 static bool
300-move_motion(struct pointer_handler *handler, uint32_t time, wl_fixed_t fx, wl_fixed_t fy)
301+move_motion(struct pointer_handler *handler, uint32_t time, wl_fixed_t fx,
302+            wl_fixed_t fy)
303 {
304-	struct window *window = wl_container_of(handler, window, move.interaction.handler);
305+	struct window *window =
306+	    wl_container_of(handler, window, move.interaction.handler);
307 
308-	if (should_throttle_motion(window->base.motion_throttle_ms, &window->move.last_time, time))
309+	if (should_throttle_motion(window->base.motion_throttle_ms,
310+	                           &window->move.last_time, time)) {
311 		return true;
312+	}
313 
314 	int32_t x = wl_fixed_to_int(fx) + window->move.offset.x,
315 	        y = wl_fixed_to_int(fy) + window->move.offset.y;
316@@ -372,24 +411,30 @@ move_motion(struct pointer_handler *handler, uint32_t time, wl_fixed_t fx, wl_fi
317 }
318 
319 static bool
320-resize_motion(struct pointer_handler *handler, uint32_t time, wl_fixed_t fx, wl_fixed_t fy)
321+resize_motion(struct pointer_handler *handler, uint32_t time, wl_fixed_t fx,
322+              wl_fixed_t fy)
323 {
324-	struct window *window = wl_container_of(handler, window, resize.interaction.handler);
325+	struct window *window =
326+	    wl_container_of(handler, window, resize.interaction.handler);
327 	const struct swc_rectangle *geometry = &window->view->base.geometry;
328 	uint32_t width = geometry->width, height = geometry->height;
329 
330-	if (should_throttle_motion(window->base.motion_throttle_ms, &window->resize.last_time, time))
331+	if (should_throttle_motion(window->base.motion_throttle_ms,
332+	                           &window->resize.last_time, time)) {
333 		return true;
334+	}
335 
336-	if (window->resize.edges & SWC_WINDOW_EDGE_LEFT)
337+	if (window->resize.edges & SWC_WINDOW_EDGE_LEFT) {
338 		width -= wl_fixed_to_int(fx) + window->resize.offset.x - geometry->x;
339-	else if (window->resize.edges & SWC_WINDOW_EDGE_RIGHT)
340+	} else if (window->resize.edges & SWC_WINDOW_EDGE_RIGHT) {
341 		width = wl_fixed_to_int(fx) + window->resize.offset.x - geometry->x;
342+	}
343 
344-	if (window->resize.edges & SWC_WINDOW_EDGE_TOP)
345+	if (window->resize.edges & SWC_WINDOW_EDGE_TOP) {
346 		height -= wl_fixed_to_int(fy) + window->resize.offset.y - geometry->y;
347-	else if (window->resize.edges & SWC_WINDOW_EDGE_BOTTOM)
348+	} else if (window->resize.edges & SWC_WINDOW_EDGE_BOTTOM) {
349 		height = wl_fixed_to_int(fy) + window->resize.offset.y - geometry->y;
350+	}
351 
352 	clamp_window_size(window, &width, &height);
353 	window->impl->configure(window, width, height);
354@@ -398,12 +443,16 @@ resize_motion(struct pointer_handler *handler, uint32_t time, wl_fixed_t fx, wl_
355 }
356 
357 static bool
358-handle_button(struct pointer_handler *handler, uint32_t time, struct button *button, uint32_t state)
359+handle_button(struct pointer_handler *handler, uint32_t time,
360+              struct button *button, uint32_t state)
361 {
362-	struct window_pointer_interaction *interaction = wl_container_of(handler, interaction, handler);
363+	struct window_pointer_interaction *interaction =
364+	    wl_container_of(handler, interaction, handler);
365 
366-	if (state != WL_POINTER_BUTTON_STATE_RELEASED || !interaction->original_handler)
367+	if (state != WL_POINTER_BUTTON_STATE_RELEASED ||
368+	    !interaction->original_handler) {
369 		return false;
370+	}
371 
372 	end_interaction(interaction, button);
373 	return true;
374@@ -414,36 +463,42 @@ handle_attach(struct view_handler *handler)
375 {
376 	struct window *window = wl_container_of(handler, window, view_handler);
377 
378-	if (window->configure.acknowledged)
379+	if (window->configure.acknowledged) {
380 		flush(window);
381+	}
382 	window->configure.pending = false;
383 }
384 
385 static void
386-handle_resize(struct view_handler *handler, uint32_t old_width, uint32_t old_height)
387+handle_resize(struct view_handler *handler, uint32_t old_width,
388+              uint32_t old_height)
389 {
390 	struct window *window = wl_container_of(handler, window, view_handler);
391 
392-	if (window->resize.interaction.active && window->resize.edges & (SWC_WINDOW_EDGE_TOP | SWC_WINDOW_EDGE_LEFT)) {
393+	if (window->resize.interaction.active &&
394+	    window->resize.edges & (SWC_WINDOW_EDGE_TOP | SWC_WINDOW_EDGE_LEFT)) {
395 		const struct swc_rectangle *geometry = &window->view->base.geometry;
396 		int32_t x = geometry->x, y = geometry->y;
397 
398-		if (window->resize.edges & SWC_WINDOW_EDGE_LEFT)
399+		if (window->resize.edges & SWC_WINDOW_EDGE_LEFT) {
400 			x += old_width - geometry->width;
401-		if (window->resize.edges & SWC_WINDOW_EDGE_TOP)
402+		}
403+		if (window->resize.edges & SWC_WINDOW_EDGE_TOP) {
404 			y += old_height - geometry->height;
405+		}
406 
407 		view_move(&window->view->base, x, y);
408 	}
409 }
410 
411 static const struct view_handler_impl view_handler_impl = {
412-	.attach = handle_attach,
413-	.resize = handle_resize,
414+    .attach = handle_attach,
415+    .resize = handle_resize,
416 };
417 
418 bool
419-window_initialize(struct window *window, const struct window_impl *impl, struct surface *surface)
420+window_initialize(struct window *window, const struct window_impl *impl,
421+                  struct surface *surface)
422 {
423 	DEBUG("Initializing window, %p\n", window);
424 
425@@ -453,11 +508,13 @@ window_initialize(struct window *window, const struct window_impl *impl, struct
426 
427 	if (surface->view) {
428 		window->view = compositor_view(surface->view);
429-		if (!window->view || window->view->window)
430+		if (!window->view || window->view->window) {
431 			return false;
432+		}
433 	} else {
434-		if (!(window->view = compositor_create_view(surface)))
435+		if (!(window->view = compositor_create_view(surface))) {
436 			return false;
437+		}
438 	}
439 
440 	window->impl = impl;
441@@ -475,16 +532,16 @@ window_initialize(struct window *window, const struct window_impl *impl, struct
442 	window->move.last_time = 0;
443 	window->move.interaction.active = false;
444 	window->move.interaction.handler = (struct pointer_handler){
445-		.motion = move_motion,
446-		.button = handle_button,
447+	    .motion = move_motion,
448+	    .button = handle_button,
449 	};
450 	window->configure.pending = false;
451 	window->configure.width = 0;
452 	window->configure.height = 0;
453 	window->resize.interaction.active = false;
454 	window->resize.interaction.handler = (struct pointer_handler){
455-		.motion = resize_motion,
456-		.button = handle_button,
457+	    .motion = resize_motion,
458+	    .button = handle_button,
459 	};
460 	window->resize.last_time = 0;
461 
462@@ -507,8 +564,9 @@ window_finalize(struct window *window)
463 void
464 window_manage(struct window *window)
465 {
466-	if (window->managed)
467+	if (window->managed) {
468 		return;
469+	}
470 
471 	swc.manager->new_window(&window->base);
472 	window->managed = true;
473@@ -517,11 +575,13 @@ window_manage(struct window *window)
474 void
475 window_unmanage(struct window *window)
476 {
477-	if (!window->managed)
478+	if (!window->managed) {
479 		return;
480+	}
481 
482-	if (window->handler->destroy)
483+	if (window->handler->destroy) {
484 		window->handler->destroy(window->handler_data);
485+	}
486 	window->handler = &null_handler;
487 	window->managed = false;
488 }
489@@ -532,8 +592,9 @@ window_set_title(struct window *window, const char *title, size_t length)
490 	free(window->base.title);
491 	window->base.title = strndup(title, length);
492 
493-	if (window->handler->title_changed)
494+	if (window->handler->title_changed) {
495 		window->handler->title_changed(window->handler_data);
496+	}
497 }
498 
499 void
500@@ -542,31 +603,37 @@ window_set_app_id(struct window *window, const char *app_id)
501 	free(window->base.app_id);
502 	window->base.app_id = strdup(app_id);
503 
504-	if (window->handler->app_id_changed)
505+	if (window->handler->app_id_changed) {
506 		window->handler->app_id_changed(window->handler_data);
507+	}
508 }
509 
510 void
511 window_set_parent(struct window *window, struct window *parent)
512 {
513-	if (window->base.parent == &parent->base)
514+	if (window->base.parent == &parent->base) {
515 		return;
516+	}
517 
518 	compositor_view_set_parent(window->view, parent->view);
519 	window->base.parent = &parent->base;
520 
521-	if (window->handler->parent_changed)
522+	if (window->handler->parent_changed) {
523 		window->handler->parent_changed(window->handler_data);
524+	}
525 }
526 
527 void
528 window_begin_move(struct window *window, struct button *button)
529 {
530-	if (window->mode != WINDOW_MODE_STACKED && window->handler->move)
531+	if (window->mode != WINDOW_MODE_STACKED && window->handler->move) {
532 		window->handler->move(window->handler_data);
533+	}
534 
535-	if (window->mode != WINDOW_MODE_STACKED || window->move.interaction.active)
536+	if (window->mode != WINDOW_MODE_STACKED ||
537+	    window->move.interaction.active) {
538 		return;
539+	}
540 
541 	struct swc_rectangle *geometry = &window->view->base.geometry;
542 	int32_t px = wl_fixed_to_int(swc.seat->pointer->x),
543@@ -579,13 +646,17 @@ window_begin_move(struct window *window, struct button *button)
544 }
545 
546 void
547-window_begin_resize(struct window *window, uint32_t edges, struct button *button)
548+window_begin_resize(struct window *window, uint32_t edges,
549+                    struct button *button)
550 {
551-	if (window->mode != WINDOW_MODE_STACKED && window->handler->resize)
552+	if (window->mode != WINDOW_MODE_STACKED && window->handler->resize) {
553 		window->handler->resize(window->handler_data);
554+	}
555 
556-	if (window->mode != WINDOW_MODE_STACKED || window->resize.interaction.active)
557+	if (window->mode != WINDOW_MODE_STACKED ||
558+	    window->resize.interaction.active) {
559 		return;
560+	}
561 
562 	struct swc_rectangle *geometry = &window->view->base.geometry;
563 	int32_t px = wl_fixed_to_int(swc.seat->pointer->x),
564@@ -595,12 +666,20 @@ window_begin_resize(struct window *window, uint32_t edges, struct button *button
565 	window->resize.last_time = 0;
566 
567 	if (!edges) {
568-		edges |= (px < geometry->x + geometry->width / 2) ? SWC_WINDOW_EDGE_LEFT : SWC_WINDOW_EDGE_RIGHT;
569-		edges |= (py < geometry->y + geometry->height / 2) ? SWC_WINDOW_EDGE_TOP : SWC_WINDOW_EDGE_BOTTOM;
570+		edges |= (px < geometry->x + geometry->width / 2)
571+		             ? SWC_WINDOW_EDGE_LEFT
572+		             : SWC_WINDOW_EDGE_RIGHT;
573+		edges |= (py < geometry->y + geometry->height / 2)
574+		             ? SWC_WINDOW_EDGE_TOP
575+		             : SWC_WINDOW_EDGE_BOTTOM;
576 	}
577 
578-	window->resize.offset.x = geometry->x - px + ((edges & SWC_WINDOW_EDGE_RIGHT) ? geometry->width : 0);
579-	window->resize.offset.y = geometry->y - py + ((edges & SWC_WINDOW_EDGE_BOTTOM) ? geometry->height : 0);
580+	window->resize.offset.x =
581+	    geometry->x - px +
582+	    ((edges & SWC_WINDOW_EDGE_RIGHT) ? geometry->width : 0);
583+	window->resize.offset.y =
584+	    geometry->y - py +
585+	    ((edges & SWC_WINDOW_EDGE_BOTTOM) ? geometry->height : 0);
586 	window->resize.edges = edges;
587 }
588 
589@@ -614,12 +693,14 @@ swc_window_get_pid(struct swc_window *base)
590 	uid_t uid;
591 	gid_t gid;
592 
593-	if (!window || !window->view || !window->view->surface)
594+	if (!window || !window->view || !window->view->surface) {
595 		return 0;
596+	}
597 
598 	surface = window->view->surface;
599-	if (!surface->resource)
600+	if (!surface->resource) {
601 		return 0;
602+	}
603 
604 	client = wl_resource_get_client(surface->resource);
605 	wl_client_get_credentials(client, &pid, &uid, &gid);
+22, -11
 1@@ -24,8 +24,8 @@
 2 #ifndef SWC_WINDOW_H
 3 #define SWC_WINDOW_H
 4 
 5-#include "swc.h"
 6 #include "pointer.h"
 7+#include "swc.h"
 8 
 9 #include <stdint.h>
10 #include <wayland-server.h>
11@@ -52,7 +52,7 @@ struct window {
12 	struct view_handler view_handler;
13 	bool managed;
14 	unsigned mode;
15-	
16+
17 	struct {
18 		struct swc_rectangle geom;
19 		unsigned mode;
20@@ -95,14 +95,25 @@ struct window_impl {
21 
22 extern struct wl_listener window_enter_listener;
23 
24-bool window_initialize(struct window *window, const struct window_impl *impl, struct surface *surface);
25-void window_finalize(struct window *window);
26-void window_manage(struct window *window);
27-void window_unmanage(struct window *window);
28-void window_set_title(struct window *window, const char *title, size_t length);
29-void window_set_app_id(struct window *window, const char *app_id);
30-void window_set_parent(struct window *window, struct window *parent);
31-void window_begin_move(struct window *window, struct button *button);
32-void window_begin_resize(struct window *window, uint32_t edges, struct button *button);
33+bool
34+window_initialize(struct window *window, const struct window_impl *impl,
35+                  struct surface *surface);
36+void
37+window_finalize(struct window *window);
38+void
39+window_manage(struct window *window);
40+void
41+window_unmanage(struct window *window);
42+void
43+window_set_title(struct window *window, const char *title, size_t length);
44+void
45+window_set_app_id(struct window *window, const char *app_id);
46+void
47+window_set_parent(struct window *window, struct window *parent);
48+void
49+window_begin_move(struct window *window, struct button *button);
50+void
51+window_begin_resize(struct window *window, uint32_t edges,
52+                    struct button *button);
53 
54 #endif
+171, -171
  1@@ -51,7 +51,7 @@
  2 #define _ATKEYNAMES_H
  3 
  4 #define XK_TECHNICAL
  5-#define	XK_KATAKANA
  6+#define XK_KATAKANA
  7 
  8 /*
  9  * NOTE: The AT/MF keyboards can generate (via the 8042) two (MF: three)
 10@@ -77,151 +77,151 @@
 11  *      ----------------   ---------- -------    ------    ------
 12  */
 13 
 14-#define KEY_Escape       /* Escape                0x01  */    1  
 15-#define KEY_1            /* 1           !         0x02  */    2 
 16-#define KEY_2            /* 2           @         0x03  */    3 
 17-#define KEY_3            /* 3           #         0x04  */    4 
 18-#define KEY_4            /* 4           $         0x05  */    5 
 19-#define KEY_5            /* 5           %         0x06  */    6 
 20-#define KEY_6            /* 6           ^         0x07  */    7 
 21-#define KEY_7            /* 7           &         0x08  */    8 
 22-#define KEY_8            /* 8           *         0x09  */    9 
 23-#define KEY_9            /* 9           (         0x0a  */   10 
 24-#define KEY_0            /* 0           )         0x0b  */   11 
 25-#define KEY_Minus        /* - (Minus)   _ (Under) 0x0c  */   12
 26-#define KEY_Equal        /* = (Equal)   +         0x0d  */   13 
 27-#define KEY_BackSpace    /* Back Space            0x0e  */   14 
 28-#define KEY_Tab          /* Tab                   0x0f  */   15
 29-#define KEY_Q            /* Q                     0x10  */   16
 30-#define KEY_W            /* W                     0x11  */   17
 31-#define KEY_E            /* E                     0x12  */   18
 32-#define KEY_R            /* R                     0x13  */   19
 33-#define KEY_T            /* T                     0x14  */   20
 34-#define KEY_Y            /* Y                     0x15  */   21
 35-#define KEY_U            /* U                     0x16  */   22
 36-#define KEY_I            /* I                     0x17  */   23
 37-#define KEY_O            /* O                     0x18  */   24
 38-#define KEY_P            /* P                     0x19  */   25
 39-#define KEY_LBrace       /* [           {         0x1a  */   26
 40-#define KEY_RBrace       /* ]           }         0x1b  */   27 
 41-#define KEY_Enter        /* Enter                 0x1c  */   28
 42-#define KEY_LCtrl        /* Ctrl(left)            0x1d  */   29
 43-#define KEY_A            /* A                     0x1e  */   30
 44-#define KEY_S            /* S                     0x1f  */   31
 45-#define KEY_D            /* D                     0x20  */   32 
 46-#define KEY_F            /* F                     0x21  */   33
 47-#define KEY_G            /* G                     0x22  */   34
 48-#define KEY_H            /* H                     0x23  */   35
 49-#define KEY_J            /* J                     0x24  */   36
 50-#define KEY_K            /* K                     0x25  */   37
 51-#define KEY_L            /* L                     0x26  */   38
 52-#define KEY_SemiColon    /* ;(SemiColon) :(Colon) 0x27  */   39
 53-#define KEY_Quote        /* ' (Apostr)  " (Quote) 0x28  */   40
 54-#define KEY_Tilde        /* ` (Accent)  ~ (Tilde) 0x29  */   41
 55-#define KEY_ShiftL       /* Shift(left)           0x2a  */   42
 56-#define KEY_BSlash       /* \(BckSlash) |(VertBar)0x2b  */   43
 57-#define KEY_Z            /* Z                     0x2c  */   44
 58-#define KEY_X            /* X                     0x2d  */   45
 59-#define KEY_C            /* C                     0x2e  */   46
 60-#define KEY_V            /* V                     0x2f  */   47
 61-#define KEY_B            /* B                     0x30  */   48
 62-#define KEY_N            /* N                     0x31  */   49
 63-#define KEY_M            /* M                     0x32  */   50
 64-#define KEY_Comma        /* , (Comma)   < (Less)  0x33  */   51
 65-#define KEY_Period       /* . (Period)  >(Greater)0x34  */   52
 66-#define KEY_Slash        /* / (Slash)   ?         0x35  */   53
 67-#define KEY_ShiftR       /* Shift(right)          0x36  */   54
 68-#define KEY_KP_Multiply  /* *                     0x37  */   55
 69-#define KEY_Alt          /* Alt(left)             0x38  */   56
 70-#define KEY_Space        /*   (SpaceBar)          0x39  */   57
 71-#define KEY_CapsLock     /* CapsLock              0x3a  */   58
 72-#define KEY_F1           /* F1                    0x3b  */   59
 73-#define KEY_F2           /* F2                    0x3c  */   60
 74-#define KEY_F3           /* F3                    0x3d  */   61
 75-#define KEY_F4           /* F4                    0x3e  */   62
 76-#define KEY_F5           /* F5                    0x3f  */   63
 77-#define KEY_F6           /* F6                    0x40  */   64
 78-#define KEY_F7           /* F7                    0x41  */   65
 79-#define KEY_F8           /* F8                    0x42  */   66
 80-#define KEY_F9           /* F9                    0x43  */   67
 81-#define KEY_F10          /* F10                   0x44  */   68
 82-#define KEY_NumLock      /* NumLock               0x45  */   69
 83-#define KEY_ScrollLock   /* ScrollLock            0x46  */   70
 84-#define KEY_KP_7         /* 7           Home      0x47  */   71 
 85-#define KEY_KP_8         /* 8           Up        0x48  */   72 
 86-#define KEY_KP_9         /* 9           PgUp      0x49  */   73 
 87-#define KEY_KP_Minus     /* - (Minus)             0x4a  */   74
 88-#define KEY_KP_4         /* 4           Left      0x4b  */   75
 89-#define KEY_KP_5         /* 5                     0x4c  */   76
 90-#define KEY_KP_6         /* 6           Right     0x4d  */   77
 91-#define KEY_KP_Plus      /* + (Plus)              0x4e  */   78
 92-#define KEY_KP_1         /* 1           End       0x4f  */   79
 93-#define KEY_KP_2         /* 2           Down      0x50  */   80
 94-#define KEY_KP_3         /* 3           PgDown    0x51  */   81
 95-#define KEY_KP_0         /* 0           Insert    0x52  */   82
 96-#define KEY_KP_Decimal   /* . (Decimal) Delete    0x53  */   83 
 97-#define KEY_SysReqest    /* SysReqest             0x54  */   84
 98-                         /* NOTUSED               0x55  */
 99-#define KEY_Less         /* < (Less)   >(Greater) 0x56  */   86
100-#define KEY_F11          /* F11                   0x57  */   87
101-#define KEY_F12          /* F12                   0x58  */   88
102+#define KEY_Escape /* Escape                0x01  */ 1
103+#define KEY_1 /* 1           !         0x02  */ 2
104+#define KEY_2 /* 2           @         0x03  */ 3
105+#define KEY_3 /* 3           #         0x04  */ 4
106+#define KEY_4 /* 4           $         0x05  */ 5
107+#define KEY_5 /* 5           %         0x06  */ 6
108+#define KEY_6 /* 6           ^         0x07  */ 7
109+#define KEY_7 /* 7           &         0x08  */ 8
110+#define KEY_8 /* 8           *         0x09  */ 9
111+#define KEY_9 /* 9           (         0x0a  */ 10
112+#define KEY_0 /* 0           )         0x0b  */ 11
113+#define KEY_Minus /* - (Minus)   _ (Under) 0x0c  */ 12
114+#define KEY_Equal /* = (Equal)   +         0x0d  */ 13
115+#define KEY_BackSpace /* Back Space            0x0e  */ 14
116+#define KEY_Tab /* Tab                   0x0f  */ 15
117+#define KEY_Q /* Q                     0x10  */ 16
118+#define KEY_W /* W                     0x11  */ 17
119+#define KEY_E /* E                     0x12  */ 18
120+#define KEY_R /* R                     0x13  */ 19
121+#define KEY_T /* T                     0x14  */ 20
122+#define KEY_Y /* Y                     0x15  */ 21
123+#define KEY_U /* U                     0x16  */ 22
124+#define KEY_I /* I                     0x17  */ 23
125+#define KEY_O /* O                     0x18  */ 24
126+#define KEY_P /* P                     0x19  */ 25
127+#define KEY_LBrace /* [           {         0x1a  */ 26
128+#define KEY_RBrace /* ]           }         0x1b  */ 27
129+#define KEY_Enter /* Enter                 0x1c  */ 28
130+#define KEY_LCtrl /* Ctrl(left)            0x1d  */ 29
131+#define KEY_A /* A                     0x1e  */ 30
132+#define KEY_S /* S                     0x1f  */ 31
133+#define KEY_D /* D                     0x20  */ 32
134+#define KEY_F /* F                     0x21  */ 33
135+#define KEY_G /* G                     0x22  */ 34
136+#define KEY_H /* H                     0x23  */ 35
137+#define KEY_J /* J                     0x24  */ 36
138+#define KEY_K /* K                     0x25  */ 37
139+#define KEY_L /* L                     0x26  */ 38
140+#define KEY_SemiColon /* ;(SemiColon) :(Colon) 0x27  */ 39
141+#define KEY_Quote /* ' (Apostr)  " (Quote) 0x28  */ 40
142+#define KEY_Tilde /* ` (Accent)  ~ (Tilde) 0x29  */ 41
143+#define KEY_ShiftL /* Shift(left)           0x2a  */ 42
144+#define KEY_BSlash /* \(BckSlash) |(VertBar)0x2b  */ 43
145+#define KEY_Z /* Z                     0x2c  */ 44
146+#define KEY_X /* X                     0x2d  */ 45
147+#define KEY_C /* C                     0x2e  */ 46
148+#define KEY_V /* V                     0x2f  */ 47
149+#define KEY_B /* B                     0x30  */ 48
150+#define KEY_N /* N                     0x31  */ 49
151+#define KEY_M /* M                     0x32  */ 50
152+#define KEY_Comma /* , (Comma)   < (Less)  0x33  */ 51
153+#define KEY_Period /* . (Period)  >(Greater)0x34  */ 52
154+#define KEY_Slash /* / (Slash)   ?         0x35  */ 53
155+#define KEY_ShiftR /* Shift(right)          0x36  */ 54
156+#define KEY_KP_Multiply /* *                     0x37  */ 55
157+#define KEY_Alt /* Alt(left)             0x38  */ 56
158+#define KEY_Space /*   (SpaceBar)          0x39  */ 57
159+#define KEY_CapsLock /* CapsLock              0x3a  */ 58
160+#define KEY_F1 /* F1                    0x3b  */ 59
161+#define KEY_F2 /* F2                    0x3c  */ 60
162+#define KEY_F3 /* F3                    0x3d  */ 61
163+#define KEY_F4 /* F4                    0x3e  */ 62
164+#define KEY_F5 /* F5                    0x3f  */ 63
165+#define KEY_F6 /* F6                    0x40  */ 64
166+#define KEY_F7 /* F7                    0x41  */ 65
167+#define KEY_F8 /* F8                    0x42  */ 66
168+#define KEY_F9 /* F9                    0x43  */ 67
169+#define KEY_F10 /* F10                   0x44  */ 68
170+#define KEY_NumLock /* NumLock               0x45  */ 69
171+#define KEY_ScrollLock /* ScrollLock            0x46  */ 70
172+#define KEY_KP_7 /* 7           Home      0x47  */ 71
173+#define KEY_KP_8 /* 8           Up        0x48  */ 72
174+#define KEY_KP_9 /* 9           PgUp      0x49  */ 73
175+#define KEY_KP_Minus /* - (Minus)             0x4a  */ 74
176+#define KEY_KP_4 /* 4           Left      0x4b  */ 75
177+#define KEY_KP_5 /* 5                     0x4c  */ 76
178+#define KEY_KP_6 /* 6           Right     0x4d  */ 77
179+#define KEY_KP_Plus /* + (Plus)              0x4e  */ 78
180+#define KEY_KP_1 /* 1           End       0x4f  */ 79
181+#define KEY_KP_2 /* 2           Down      0x50  */ 80
182+#define KEY_KP_3 /* 3           PgDown    0x51  */ 81
183+#define KEY_KP_0 /* 0           Insert    0x52  */ 82
184+#define KEY_KP_Decimal /* . (Decimal) Delete    0x53  */ 83
185+#define KEY_SysReqest /* SysRequest            0x54  */ 84
186+/* NOTUSED               0x55  */
187+#define KEY_Less /* < (Less)   >(Greater) 0x56  */ 86
188+#define KEY_F11 /* F11                   0x57  */ 87
189+#define KEY_F12 /* F12                   0x58  */ 88
190 
191-#define KEY_Prefix0      /* special               0x60  */   96
192-#define KEY_Prefix1      /* specail               0x61  */   97
193+#define KEY_Prefix0 /* special               0x60  */ 96
194+#define KEY_Prefix1 /* special               0x61  */ 97
195 
196 /*
197  * The 'scancodes' below are generated by the server, because the MF101/102
198  * keyboard sends them as sequence of other scancodes
199  */
200-#define KEY_Home         /* Home                  0x59  */   89
201-#define KEY_Up           /* Up                    0x5a  */   90
202-#define KEY_PgUp         /* PgUp                  0x5b  */   91
203-#define KEY_Left         /* Left                  0x5c  */   92
204-#define KEY_Begin        /* Begin                 0x5d  */   93
205-#define KEY_Right        /* Right                 0x5e  */   94
206-#define KEY_End          /* End                   0x5f  */   95
207-#define KEY_Down         /* Down                  0x60  */   96
208-#define KEY_PgDown       /* PgDown                0x61  */   97
209-#define KEY_Insert       /* Insert                0x62  */   98
210-#define KEY_Delete       /* Delete                0x63  */   99
211-#define KEY_KP_Enter     /* Enter                 0x64  */  100
212-#define KEY_RCtrl        /* Ctrl(right)           0x65  */  101
213-#define KEY_Pause        /* Pause                 0x66  */  102
214-#define KEY_Print        /* Print                 0x67  */  103
215-#define KEY_KP_Divide    /* Divide                0x68  */  104
216-#define KEY_AltLang      /* AtlLang(right)        0x69  */  105
217-#define KEY_Break        /* Break                 0x6a  */  106
218-#define KEY_LMeta        /* Left Meta             0x6b  */  107
219-#define KEY_RMeta        /* Right Meta            0x6c  */  108
220-#define KEY_Menu         /* Menu                  0x6d  */  109
221-#define KEY_F13          /* F13                   0x6e  */  110
222-#define KEY_F14          /* F14                   0x6f  */  111
223-#define KEY_F15          /* F15                   0x70  */  112
224-#define KEY_HKTG         /* Hirugana/Katakana tog 0x70  */  112
225-#define KEY_F16          /* F16                   0x71  */  113
226-#define KEY_F17          /* F17                   0x72  */  114
227-#define KEY_KP_DEC       /* KP_DEC                0x73  */  115
228-#define KEY_BSlash2      /* \           _         0x73  */  115
229-#define KEY_KP_Equal	 /* Equal (Keypad)        0x76  */  118
230-#define KEY_XFER         /* Kanji Transfer        0x79  */  121
231-#define KEY_NFER         /* No Kanji Transfer     0x7b  */  123
232-#define KEY_Yen          /* Yen                   0x7d  */  125
233+#define KEY_Home /* Home                  0x59  */ 89
234+#define KEY_Up /* Up                    0x5a  */ 90
235+#define KEY_PgUp /* PgUp                  0x5b  */ 91
236+#define KEY_Left /* Left                  0x5c  */ 92
237+#define KEY_Begin /* Begin                 0x5d  */ 93
238+#define KEY_Right /* Right                 0x5e  */ 94
239+#define KEY_End /* End                   0x5f  */ 95
240+#define KEY_Down /* Down                  0x60  */ 96
241+#define KEY_PgDown /* PgDown                0x61  */ 97
242+#define KEY_Insert /* Insert                0x62  */ 98
243+#define KEY_Delete /* Delete                0x63  */ 99
244+#define KEY_KP_Enter /* Enter                 0x64  */ 100
245+#define KEY_RCtrl /* Ctrl(right)           0x65  */ 101
246+#define KEY_Pause /* Pause                 0x66  */ 102
247+#define KEY_Print /* Print                 0x67  */ 103
248+#define KEY_KP_Divide /* Divide                0x68  */ 104
249+#define KEY_AltLang /* AltLang(right)        0x69  */ 105
250+#define KEY_Break /* Break                 0x6a  */ 106
251+#define KEY_LMeta /* Left Meta             0x6b  */ 107
252+#define KEY_RMeta /* Right Meta            0x6c  */ 108
253+#define KEY_Menu /* Menu                  0x6d  */ 109
254+#define KEY_F13 /* F13                   0x6e  */ 110
255+#define KEY_F14 /* F14                   0x6f  */ 111
256+#define KEY_F15 /* F15                   0x70  */ 112
257+#define KEY_HKTG /* Hiragana/Katakana tog 0x70  */ 112
258+#define KEY_F16 /* F16                   0x71  */ 113
259+#define KEY_F17 /* F17                   0x72  */ 114
260+#define KEY_KP_DEC /* KP_DEC                0x73  */ 115
261+#define KEY_BSlash2 /* \           _         0x73  */ 115
262+#define KEY_KP_Equal /* Equal (Keypad)        0x76  */ 118
263+#define KEY_XFER /* Kanji Transfer        0x79  */ 121
264+#define KEY_NFER /* No Kanji Transfer     0x7b  */ 123
265+#define KEY_Yen /* Yen                   0x7d  */ 125
266 
267-#define KEY_Power        /* Power Key             0x84  */  132
268-#define KEY_Mute         /* Audio Mute            0x85  */  133
269-#define KEY_AudioLower   /* Audio Lower           0x86  */  134
270-#define KEY_AudioRaise   /* Audio Raise           0x87  */  135
271-#define KEY_Help         /* Help                  0x88  */  136
272-#define KEY_L1           /* Stop                  0x89  */  137
273-#define KEY_L2           /* Again                 0x8a  */  138
274-#define KEY_L3           /* Props                 0x8b  */  139
275-#define KEY_L4           /* Undo                  0x8c  */  140
276-#define KEY_L5           /* Front                 0x8d  */  141
277-#define KEY_L6           /* Copy                  0x8e  */  142
278-#define KEY_L7           /* Open                  0x8f  */  143
279-#define KEY_L8           /* Paste                 0x90  */  144
280-#define KEY_L9           /* Find                  0x91  */  145
281-#define KEY_L10          /* Cut                   0x92  */  146
282+#define KEY_Power /* Power Key             0x84  */ 132
283+#define KEY_Mute /* Audio Mute            0x85  */ 133
284+#define KEY_AudioLower /* Audio Lower           0x86  */ 134
285+#define KEY_AudioRaise /* Audio Raise           0x87  */ 135
286+#define KEY_Help /* Help                  0x88  */ 136
287+#define KEY_L1 /* Stop                  0x89  */ 137
288+#define KEY_L2 /* Again                 0x8a  */ 138
289+#define KEY_L3 /* Props                 0x8b  */ 139
290+#define KEY_L4 /* Undo                  0x8c  */ 140
291+#define KEY_L5 /* Front                 0x8d  */ 141
292+#define KEY_L6 /* Copy                  0x8e  */ 142
293+#define KEY_L7 /* Open                  0x8f  */ 143
294+#define KEY_L8 /* Paste                 0x90  */ 144
295+#define KEY_L9 /* Find                  0x91  */ 145
296+#define KEY_L10 /* Cut                   0x92  */ 146
297 
298 /*
299  * Fake 'scancodes' in the following ranges are generated for 2-byte
300@@ -243,39 +243,39 @@
301  * 0x59-0x5f,0x62-0x76.  These are used for some extra keys on some keyboards.
302  */
303 
304-#define KEY_0x59		0x95
305-#define KEY_0x5A		0xA2
306-#define KEY_0x5B		0xAD
307-#define KEY_0x5C		KEY_KP_EQUAL
308-#define KEY_0x5D		0xAE
309-#define KEY_0x5E		0xAF
310-#define KEY_0x5F		0xB0
311-#define KEY_0x62		0xB5
312-#define KEY_0x63		0xB6
313-#define KEY_0x64		0xB7
314-#define KEY_0x65		0xB8
315-#define KEY_0x66		0xB9
316-#define KEY_0x67		0xBE
317-#define KEY_0x68		0xBF
318-#define KEY_0x69		0xC0
319-#define KEY_0x6A		0xC1
320-#define KEY_0x6B		0xC3
321-#define KEY_0x6C		0xC4
322-#define KEY_0x6D		0xC5
323-#define KEY_0x6E		0xC6
324-#define KEY_0x6F		0xC7
325-#define KEY_0x70		0xC8
326-#define KEY_0x71		0xC9
327-#define KEY_0x72		0xCA
328-#define KEY_0x73		0xCB
329-#define KEY_0x74		0xD3
330-#define KEY_0x75		0xD4
331-#define KEY_0x76		0xD5
332-#define KEY_R_0xF4		0xF4
333-#define KEY_R_0xF5		0xF5
334+#define KEY_0x59 0x95
335+#define KEY_0x5A 0xA2
336+#define KEY_0x5B 0xAD
337+#define KEY_0x5C KEY_KP_EQUAL
338+#define KEY_0x5D 0xAE
339+#define KEY_0x5E 0xAF
340+#define KEY_0x5F 0xB0
341+#define KEY_0x62 0xB5
342+#define KEY_0x63 0xB6
343+#define KEY_0x64 0xB7
344+#define KEY_0x65 0xB8
345+#define KEY_0x66 0xB9
346+#define KEY_0x67 0xBE
347+#define KEY_0x68 0xBF
348+#define KEY_0x69 0xC0
349+#define KEY_0x6A 0xC1
350+#define KEY_0x6B 0xC3
351+#define KEY_0x6C 0xC4
352+#define KEY_0x6D 0xC5
353+#define KEY_0x6E 0xC6
354+#define KEY_0x6F 0xC7
355+#define KEY_0x70 0xC8
356+#define KEY_0x71 0xC9
357+#define KEY_0x72 0xCA
358+#define KEY_0x73 0xCB
359+#define KEY_0x74 0xD3
360+#define KEY_0x75 0xD4
361+#define KEY_0x76 0xD5
362+#define KEY_R_0xF4 0xF4
363+#define KEY_R_0xF5 0xF5
364 
365 /* These are for "notused" and "unknown" entries in translation maps. */
366-#define KEY_NOTUSED	  0
367-#define KEY_UNKNOWN	255
368+#define KEY_NOTUSED 0
369+#define KEY_UNKNOWN 255
370 
371 #endif /* _ATKEYNAMES_H */
+458, -458
  1@@ -9,477 +9,477 @@
  2 #include <stdint.h>
  3 
  4 static uint8_t wsUsbMap[] = {
  5-	/* 0 */ KEY_NOTUSED,
  6-	/* 1 */ KEY_NOTUSED,
  7-	/* 2 */ KEY_NOTUSED,
  8-	/* 3 */ KEY_NOTUSED,
  9-	/* 4 */ KEY_A,		
 10-	/* 5 */ KEY_B,
 11-	/* 6 */ KEY_C,
 12-	/* 7 */ KEY_D,
 13-	/* 8 */ KEY_E,
 14-	/* 9 */ KEY_F,
 15-	/* 10 */ KEY_G,
 16-	/* 11 */ KEY_H,
 17-	/* 12 */ KEY_I,
 18-	/* 13 */ KEY_J,
 19-	/* 14 */ KEY_K,
 20-	/* 15 */ KEY_L,
 21-	/* 16 */ KEY_M,
 22-	/* 17 */ KEY_N,
 23-	/* 18 */ KEY_O,
 24-	/* 19 */ KEY_P,
 25-	/* 20 */ KEY_Q,
 26-	/* 21 */ KEY_R,
 27-	/* 22 */ KEY_S,
 28-	/* 23 */ KEY_T,
 29-	/* 24 */ KEY_U,
 30-	/* 25 */ KEY_V,
 31-	/* 26 */ KEY_W,
 32-	/* 27 */ KEY_X,
 33-	/* 28 */ KEY_Y,
 34-	/* 29 */ KEY_Z,
 35-	/* 30 */ KEY_1,		/* 1 !*/
 36-	/* 31 */ KEY_2,		/* 2 @ */
 37-	/* 32 */ KEY_3,		/* 3 # */
 38-	/* 33 */ KEY_4,		/* 4 $ */
 39-	/* 34 */ KEY_5,		/* 5 % */
 40-	/* 35 */ KEY_6,		/* 6 ^ */
 41-	/* 36 */ KEY_7,		/* 7 & */
 42-	/* 37 */ KEY_8,		/* 8 * */
 43-	/* 38 */ KEY_9,		/* 9 ( */
 44-	/* 39 */ KEY_0,		/* 0 ) */
 45-	/* 40 */ KEY_Enter,	/* Return  */
 46-	/* 41 */ KEY_Escape,	/* Escape */
 47-	/* 42 */ KEY_BackSpace,	/* Backspace Delete */
 48-	/* 43 */ KEY_Tab,	/* Tab */
 49-	/* 44 */ KEY_Space,	/* Space */
 50-	/* 45 */ KEY_Minus,	/* - _ */
 51-	/* 46 */ KEY_Equal,	/* = + */
 52-	/* 47 */ KEY_LBrace,	/* [ { */
 53-	/* 48 */ KEY_RBrace,	/* ] } */
 54-	/* 49 */ KEY_BSlash,	/* \ | */
 55-	/* 50 */ KEY_BSlash,    /* \ _ # ~ on some keyboards */
 56-	/* 51 */ KEY_SemiColon,	/* ; : */
 57-	/* 52 */ KEY_Quote,	/* ' " */
 58-	/* 53 */ KEY_Tilde,	/* ` ~ */
 59-	/* 54 */ KEY_Comma,	/* , <  */
 60-	/* 55 */ KEY_Period,	/* . > */
 61-	/* 56 */ KEY_Slash,	/* / ? */
 62-	/* 57 */ KEY_CapsLock,	/* Caps Lock */
 63-	/* 58 */ KEY_F1,		/* F1 */
 64-	/* 59 */ KEY_F2,		/* F2 */
 65-	/* 60 */ KEY_F3,		/* F3 */
 66-	/* 61 */ KEY_F4,		/* F4 */
 67-	/* 62 */ KEY_F5,		/* F5 */
 68-	/* 63 */ KEY_F6,		/* F6 */
 69-	/* 64 */ KEY_F7,		/* F7 */
 70-	/* 65 */ KEY_F8,		/* F8 */
 71-	/* 66 */ KEY_F9,		/* F9 */
 72-	/* 67 */ KEY_F10,	/* F10 */
 73-	/* 68 */ KEY_F11,	/* F11 */
 74-	/* 69 */ KEY_F12,	/* F12 */
 75-	/* 70 */ KEY_Print,	/* PrintScrn SysReq */
 76-	/* 71 */ KEY_ScrollLock,	/* Scroll Lock */
 77-	/* 72 */ KEY_Pause,	/* Pause Break */
 78-	/* 73 */ KEY_Insert,	/* Insert XXX  Help on some Mac Keyboards */
 79-	/* 74 */ KEY_Home,	/* Home */
 80-	/* 75 */ KEY_PgUp,	/* Page Up */
 81-	/* 76 */ KEY_Delete,	/* Delete */
 82-	/* 77 */ KEY_End,	/* End */
 83-	/* 78 */ KEY_PgDown,	/* Page Down */
 84-	/* 79 */ KEY_Right,	/* Right Arrow */
 85-	/* 80 */ KEY_Left,	/* Left Arrow */
 86-	/* 81 */ KEY_Down,	/* Down Arrow */
 87-	/* 82 */ KEY_Up,		/* Up Arrow */
 88-	/* 83 */ KEY_NumLock,	/* Num Lock */
 89-	/* 84 */ KEY_KP_Divide,	/* Keypad / */
 90-	/* 85 */ KEY_KP_Multiply, /* Keypad * */
 91-	/* 86 */ KEY_KP_Minus,	/* Keypad - */
 92-	/* 87 */ KEY_KP_Plus,	/* Keypad + */
 93-	/* 88 */ KEY_KP_Enter,	/* Keypad Enter */
 94-	/* 89 */ KEY_KP_1,	/* Keypad 1 End */
 95-	/* 90 */ KEY_KP_2,	/* Keypad 2 Down */
 96-	/* 91 */ KEY_KP_3,	/* Keypad 3 Pg Down */
 97-	/* 92 */ KEY_KP_4,	/* Keypad 4 Left  */
 98-	/* 93 */ KEY_KP_5,	/* Keypad 5 */
 99-	/* 94 */ KEY_KP_6,	/* Keypad 6 */
100-	/* 95 */ KEY_KP_7,	/* Keypad 7 Home */
101-	/* 96 */ KEY_KP_8,	/* Keypad 8 Up */
102-	/* 97 */ KEY_KP_9,	/* KEypad 9 Pg Up */
103-	/* 98 */ KEY_KP_0,	/* Keypad 0 Ins */
104-	/* 99 */ KEY_KP_Decimal,	/* Keypad . Del */
105-	/* 100 */ KEY_Less,	/* < > on some keyboards */
106-	/* 101 */ KEY_Menu,	/* Menu */
107-	/* 102 */ KEY_Power,	/* sleep key on Sun USB */
108-	/* 103 */ KEY_KP_Equal, /* Keypad = on Mac keyboards */
109-	/* 104 */ KEY_F13,
110-	/* 105 */ KEY_F14,
111-	/* 106 */ KEY_F15,
112-	/* 107 */ KEY_F16,
113-	/* 108 */ KEY_NOTUSED,
114-	/* 109 */ KEY_Power,
115-	/* 110 */ KEY_NOTUSED,
116-	/* 111 */ KEY_NOTUSED,
117-	/* 112 */ KEY_NOTUSED,
118-	/* 113 */ KEY_NOTUSED,
119-	/* 114 */ KEY_NOTUSED,
120-	/* 115 */ KEY_NOTUSED,
121-	/* 116 */ KEY_L7,
122-	/* 117 */ KEY_Help,
123-	/* 118 */ KEY_L3,
124-	/* 119 */ KEY_L5,
125-	/* 120 */ KEY_L1,
126-	/* 121 */ KEY_L2,
127-	/* 122 */ KEY_L4,
128-	/* 123 */ KEY_L10,
129-	/* 124 */ KEY_L6,
130-	/* 125 */ KEY_L8,
131-	/* 126 */ KEY_L9,
132-	/* 127 */ KEY_Mute,
133-	/* 128 */ KEY_AudioRaise,
134-	/* 129 */ KEY_AudioLower,
135-	/* 130 */ KEY_NOTUSED,
136-	/* 131 */ KEY_NOTUSED,
137-	/* 132 */ KEY_NOTUSED,
138-	/* 133 */ KEY_NOTUSED,
139-	/* 134 */ KEY_NOTUSED,
140+    /* 0 */ KEY_NOTUSED,
141+    /* 1 */ KEY_NOTUSED,
142+    /* 2 */ KEY_NOTUSED,
143+    /* 3 */ KEY_NOTUSED,
144+    /* 4 */ KEY_A,
145+    /* 5 */ KEY_B,
146+    /* 6 */ KEY_C,
147+    /* 7 */ KEY_D,
148+    /* 8 */ KEY_E,
149+    /* 9 */ KEY_F,
150+    /* 10 */ KEY_G,
151+    /* 11 */ KEY_H,
152+    /* 12 */ KEY_I,
153+    /* 13 */ KEY_J,
154+    /* 14 */ KEY_K,
155+    /* 15 */ KEY_L,
156+    /* 16 */ KEY_M,
157+    /* 17 */ KEY_N,
158+    /* 18 */ KEY_O,
159+    /* 19 */ KEY_P,
160+    /* 20 */ KEY_Q,
161+    /* 21 */ KEY_R,
162+    /* 22 */ KEY_S,
163+    /* 23 */ KEY_T,
164+    /* 24 */ KEY_U,
165+    /* 25 */ KEY_V,
166+    /* 26 */ KEY_W,
167+    /* 27 */ KEY_X,
168+    /* 28 */ KEY_Y,
169+    /* 29 */ KEY_Z,
170+    /* 30 */ KEY_1,           /* 1 ! */
171+    /* 31 */ KEY_2,           /* 2 @ */
172+    /* 32 */ KEY_3,           /* 3 # */
173+    /* 33 */ KEY_4,           /* 4 $ */
174+    /* 34 */ KEY_5,           /* 5 % */
175+    /* 35 */ KEY_6,           /* 6 ^ */
176+    /* 36 */ KEY_7,           /* 7 & */
177+    /* 37 */ KEY_8,           /* 8 * */
178+    /* 38 */ KEY_9,           /* 9 ( */
179+    /* 39 */ KEY_0,           /* 0 ) */
180+    /* 40 */ KEY_Enter,       /* Return  */
181+    /* 41 */ KEY_Escape,      /* Escape */
182+    /* 42 */ KEY_BackSpace,   /* Backspace Delete */
183+    /* 43 */ KEY_Tab,         /* Tab */
184+    /* 44 */ KEY_Space,       /* Space */
185+    /* 45 */ KEY_Minus,       /* - _ */
186+    /* 46 */ KEY_Equal,       /* = + */
187+    /* 47 */ KEY_LBrace,      /* [ { */
188+    /* 48 */ KEY_RBrace,      /* ] } */
189+    /* 49 */ KEY_BSlash,      /* \ | */
190+    /* 50 */ KEY_BSlash,      /* \ _ # ~ on some keyboards */
191+    /* 51 */ KEY_SemiColon,   /* ; : */
192+    /* 52 */ KEY_Quote,       /* ' " */
193+    /* 53 */ KEY_Tilde,       /* ` ~ */
194+    /* 54 */ KEY_Comma,       /* , <  */
195+    /* 55 */ KEY_Period,      /* . > */
196+    /* 56 */ KEY_Slash,       /* / ? */
197+    /* 57 */ KEY_CapsLock,    /* Caps Lock */
198+    /* 58 */ KEY_F1,          /* F1 */
199+    /* 59 */ KEY_F2,          /* F2 */
200+    /* 60 */ KEY_F3,          /* F3 */
201+    /* 61 */ KEY_F4,          /* F4 */
202+    /* 62 */ KEY_F5,          /* F5 */
203+    /* 63 */ KEY_F6,          /* F6 */
204+    /* 64 */ KEY_F7,          /* F7 */
205+    /* 65 */ KEY_F8,          /* F8 */
206+    /* 66 */ KEY_F9,          /* F9 */
207+    /* 67 */ KEY_F10,         /* F10 */
208+    /* 68 */ KEY_F11,         /* F11 */
209+    /* 69 */ KEY_F12,         /* F12 */
210+    /* 70 */ KEY_Print,       /* PrintScrn SysReq */
211+    /* 71 */ KEY_ScrollLock,  /* Scroll Lock */
212+    /* 72 */ KEY_Pause,       /* Pause Break */
213+    /* 73 */ KEY_Insert,      /* Insert XXX  Help on some Mac Keyboards */
214+    /* 74 */ KEY_Home,        /* Home */
215+    /* 75 */ KEY_PgUp,        /* Page Up */
216+    /* 76 */ KEY_Delete,      /* Delete */
217+    /* 77 */ KEY_End,         /* End */
218+    /* 78 */ KEY_PgDown,      /* Page Down */
219+    /* 79 */ KEY_Right,       /* Right Arrow */
220+    /* 80 */ KEY_Left,        /* Left Arrow */
221+    /* 81 */ KEY_Down,        /* Down Arrow */
222+    /* 82 */ KEY_Up,          /* Up Arrow */
223+    /* 83 */ KEY_NumLock,     /* Num Lock */
224+    /* 84 */ KEY_KP_Divide,   /* Keypad / */
225+    /* 85 */ KEY_KP_Multiply, /* Keypad * */
226+    /* 86 */ KEY_KP_Minus,    /* Keypad - */
227+    /* 87 */ KEY_KP_Plus,     /* Keypad + */
228+    /* 88 */ KEY_KP_Enter,    /* Keypad Enter */
229+    /* 89 */ KEY_KP_1,        /* Keypad 1 End */
230+    /* 90 */ KEY_KP_2,        /* Keypad 2 Down */
231+    /* 91 */ KEY_KP_3,        /* Keypad 3 Pg Down */
232+    /* 92 */ KEY_KP_4,        /* Keypad 4 Left  */
233+    /* 93 */ KEY_KP_5,        /* Keypad 5 */
234+    /* 94 */ KEY_KP_6,        /* Keypad 6 */
235+    /* 95 */ KEY_KP_7,        /* Keypad 7 Home */
236+    /* 96 */ KEY_KP_8,        /* Keypad 8 Up */
237+    /* 97 */ KEY_KP_9,        /* Keypad 9 Pg Up */
238+    /* 98 */ KEY_KP_0,        /* Keypad 0 Ins */
239+    /* 99 */ KEY_KP_Decimal,  /* Keypad . Del */
240+    /* 100 */ KEY_Less,       /* < > on some keyboards */
241+    /* 101 */ KEY_Menu,       /* Menu */
242+    /* 102 */ KEY_Power,      /* sleep key on Sun USB */
243+    /* 103 */ KEY_KP_Equal,   /* Keypad = on Mac keyboards */
244+    /* 104 */ KEY_F13,
245+    /* 105 */ KEY_F14,
246+    /* 106 */ KEY_F15,
247+    /* 107 */ KEY_F16,
248+    /* 108 */ KEY_NOTUSED,
249+    /* 109 */ KEY_Power,
250+    /* 110 */ KEY_NOTUSED,
251+    /* 111 */ KEY_NOTUSED,
252+    /* 112 */ KEY_NOTUSED,
253+    /* 113 */ KEY_NOTUSED,
254+    /* 114 */ KEY_NOTUSED,
255+    /* 115 */ KEY_NOTUSED,
256+    /* 116 */ KEY_L7,
257+    /* 117 */ KEY_Help,
258+    /* 118 */ KEY_L3,
259+    /* 119 */ KEY_L5,
260+    /* 120 */ KEY_L1,
261+    /* 121 */ KEY_L2,
262+    /* 122 */ KEY_L4,
263+    /* 123 */ KEY_L10,
264+    /* 124 */ KEY_L6,
265+    /* 125 */ KEY_L8,
266+    /* 126 */ KEY_L9,
267+    /* 127 */ KEY_Mute,
268+    /* 128 */ KEY_AudioRaise,
269+    /* 129 */ KEY_AudioLower,
270+    /* 130 */ KEY_NOTUSED,
271+    /* 131 */ KEY_NOTUSED,
272+    /* 132 */ KEY_NOTUSED,
273+    /* 133 */ KEY_NOTUSED,
274+    /* 134 */ KEY_NOTUSED,
275 /*
276  * Special keycodes for Japanese keyboards
277  * Override atKeyname HKTG and BSlash2 code to unique values for JP106 keyboards
278  */
279 #undef KEY_HKTG
280-#define KEY_HKTG	200	/* Japanese Hiragana Katakana Toggle */
281+#define KEY_HKTG 200 /* Japanese Hiragana Katakana Toggle */
282 #undef KEY_BSlash2
283-#define KEY_BSlash2	203	/* Japanese '\_' key */
284+#define KEY_BSlash2 203 /* Japanese '\_' key */
285 
286-	/* 135 */ KEY_BSlash2,	/* Japanese 106 kbd: '\_' */
287-	/* 136 */ KEY_HKTG,	/* Japanese 106 kbd: Hiragana Katakana toggle */
288-	/* 137 */ KEY_Yen,	/* Japanese 106 kbd: '\|' */
289-	/* 138 */ KEY_XFER,	/* Japanese 106 kbd: Henkan */
290-	/* 139 */ KEY_NFER,	/* Japanese 106 kbd: Muhenkan */
291-	/* 140 */ KEY_NOTUSED,
292-	/* 141 */ KEY_NOTUSED,
293-	/* 142 */ KEY_NOTUSED,
294-	/* 143 */ KEY_NOTUSED,
295+    /* 135 */ KEY_BSlash2, /* Japanese 106 kbd: '\_' */
296+    /* 136 */ KEY_HKTG,    /* Japanese 106 kbd: Hiragana Katakana toggle */
297+    /* 137 */ KEY_Yen,     /* Japanese 106 kbd: '\|' */
298+    /* 138 */ KEY_XFER,    /* Japanese 106 kbd: Henkan */
299+    /* 139 */ KEY_NFER,    /* Japanese 106 kbd: Muhenkan */
300+    /* 140 */ KEY_NOTUSED,
301+    /* 141 */ KEY_NOTUSED,
302+    /* 142 */ KEY_NOTUSED,
303+    /* 143 */ KEY_NOTUSED,
304 /*
305  * Special keycodes for Korean keyboards
306  * Define Hangul and Hangul_Hanja unique key codes
307  * These keys also use KANA and EISU on some Macintosh Japanese USB keyboards
308  */
309-#define KEY_Hangul		201	/* Also KANA Key on Mac JP USB kbd */
310-#define KEY_Hangul_Hanja	202	/* Also EISU Key on Mac JP USB kbd */
311-	/* 144 */ KEY_Hangul,		/* Korean 106 kbd: Hangul */
312-	/* 145 */ KEY_Hangul_Hanja,	/* Korean 106 kbd: Hangul Hanja */
313-	/* 146 */ KEY_NOTUSED,
314-	/* 147 */ KEY_NOTUSED,
315-	/* 148 */ KEY_NOTUSED,
316-	/* 149 */ KEY_NOTUSED,
317-	/* 150 */ KEY_NOTUSED,
318-	/* 151 */ KEY_NOTUSED,
319-	/* 152 */ KEY_NOTUSED,
320-	/* 153 */ KEY_NOTUSED,
321-	/* 154 */ KEY_NOTUSED,
322-	/* 155 */ KEY_NOTUSED,
323-	/* 156 */ KEY_NOTUSED,
324-	/* 157 */ KEY_NOTUSED,
325-	/* 158 */ KEY_NOTUSED,
326-	/* 159 */ KEY_NOTUSED,
327-	/* 160 */ KEY_NOTUSED,
328-	/* 161 */ KEY_NOTUSED,
329-	/* 162 */ KEY_NOTUSED,
330-	/* 163 */ KEY_NOTUSED,
331-	/* 164 */ KEY_NOTUSED,
332-	/* 165 */ KEY_NOTUSED,
333-	/* 166 */ KEY_NOTUSED,
334-	/* 167 */ KEY_NOTUSED,
335-	/* 168 */ KEY_NOTUSED,
336-	/* 169 */ KEY_NOTUSED,
337-	/* 170 */ KEY_NOTUSED,
338-	/* 171 */ KEY_NOTUSED,
339-	/* 172 */ KEY_NOTUSED,
340-	/* 173 */ KEY_NOTUSED,
341-	/* 174 */ KEY_NOTUSED,
342-	/* 175 */ KEY_NOTUSED,
343-	/* 176 */ KEY_NOTUSED,
344-	/* 177 */ KEY_NOTUSED,
345-	/* 178 */ KEY_NOTUSED,
346-	/* 179 */ KEY_NOTUSED,
347-	/* 180 */ KEY_NOTUSED,
348-	/* 181 */ KEY_NOTUSED,
349-	/* 182 */ KEY_NOTUSED,
350-	/* 183 */ KEY_NOTUSED,
351-	/* 184 */ KEY_NOTUSED,
352-	/* 185 */ KEY_NOTUSED,
353-	/* 186 */ KEY_NOTUSED,
354-	/* 187 */ KEY_NOTUSED,
355-	/* 188 */ KEY_NOTUSED,
356-	/* 189 */ KEY_NOTUSED,
357-	/* 190 */ KEY_NOTUSED,
358-	/* 191 */ KEY_NOTUSED,
359-	/* 192 */ KEY_NOTUSED,
360-	/* 193 */ KEY_NOTUSED,
361-	/* 194 */ KEY_NOTUSED,
362-	/* 195 */ KEY_NOTUSED,
363-	/* 196 */ KEY_NOTUSED,
364-	/* 197 */ KEY_NOTUSED,
365-	/* 198 */ KEY_NOTUSED,
366-	/* 199 */ KEY_NOTUSED,
367-	/* 200 */ KEY_NOTUSED,
368-	/* 201 */ KEY_NOTUSED,
369-	/* 202 */ KEY_NOTUSED,
370-	/* 203 */ KEY_NOTUSED,
371-	/* 204 */ KEY_NOTUSED,
372-	/* 205 */ KEY_NOTUSED,
373-	/* 206 */ KEY_NOTUSED,
374-	/* 207 */ KEY_NOTUSED,
375-	/* 208 */ KEY_NOTUSED,
376-	/* 209 */ KEY_NOTUSED,
377-	/* 210 */ KEY_NOTUSED,
378-	/* 211 */ KEY_NOTUSED,
379-	/* 212 */ KEY_NOTUSED,
380-	/* 213 */ KEY_NOTUSED,
381-	/* 214 */ KEY_NOTUSED,
382-	/* 215 */ KEY_NOTUSED,
383-	/* 216 */ KEY_NOTUSED,
384-	/* 217 */ KEY_NOTUSED,
385-	/* 218 */ KEY_NOTUSED,
386-	/* 219 */ KEY_NOTUSED,
387-	/* 220 */ KEY_NOTUSED,
388-	/* 221 */ KEY_NOTUSED,
389-	/* 222 */ KEY_NOTUSED,
390-	/* 223 */ KEY_NOTUSED,
391-	/* 224 */ KEY_LCtrl,	/* Left Control */
392-	/* 225 */ KEY_ShiftL,	/* Left Shift */
393-	/* 226 */ KEY_Alt,	/* Left Alt */
394-	/* 227 */ KEY_LMeta,	/* Left Meta */
395-	/* 228 */ KEY_RCtrl,	/* Right Control */
396-	/* 229 */ KEY_ShiftR,	/* Right Shift */
397-	/* 230 */ KEY_AltLang,	/* Right Alt, AKA AltGr */
398-	/* 231 */ KEY_LMeta,	/* Right Meta XXX */
399+#define KEY_Hangul 201          /* Also KANA Key on Mac JP USB kbd */
400+#define KEY_Hangul_Hanja 202    /* Also EISU Key on Mac JP USB kbd */
401+    /* 144 */ KEY_Hangul,       /* Korean 106 kbd: Hangul */
402+    /* 145 */ KEY_Hangul_Hanja, /* Korean 106 kbd: Hangul Hanja */
403+    /* 146 */ KEY_NOTUSED,
404+    /* 147 */ KEY_NOTUSED,
405+    /* 148 */ KEY_NOTUSED,
406+    /* 149 */ KEY_NOTUSED,
407+    /* 150 */ KEY_NOTUSED,
408+    /* 151 */ KEY_NOTUSED,
409+    /* 152 */ KEY_NOTUSED,
410+    /* 153 */ KEY_NOTUSED,
411+    /* 154 */ KEY_NOTUSED,
412+    /* 155 */ KEY_NOTUSED,
413+    /* 156 */ KEY_NOTUSED,
414+    /* 157 */ KEY_NOTUSED,
415+    /* 158 */ KEY_NOTUSED,
416+    /* 159 */ KEY_NOTUSED,
417+    /* 160 */ KEY_NOTUSED,
418+    /* 161 */ KEY_NOTUSED,
419+    /* 162 */ KEY_NOTUSED,
420+    /* 163 */ KEY_NOTUSED,
421+    /* 164 */ KEY_NOTUSED,
422+    /* 165 */ KEY_NOTUSED,
423+    /* 166 */ KEY_NOTUSED,
424+    /* 167 */ KEY_NOTUSED,
425+    /* 168 */ KEY_NOTUSED,
426+    /* 169 */ KEY_NOTUSED,
427+    /* 170 */ KEY_NOTUSED,
428+    /* 171 */ KEY_NOTUSED,
429+    /* 172 */ KEY_NOTUSED,
430+    /* 173 */ KEY_NOTUSED,
431+    /* 174 */ KEY_NOTUSED,
432+    /* 175 */ KEY_NOTUSED,
433+    /* 176 */ KEY_NOTUSED,
434+    /* 177 */ KEY_NOTUSED,
435+    /* 178 */ KEY_NOTUSED,
436+    /* 179 */ KEY_NOTUSED,
437+    /* 180 */ KEY_NOTUSED,
438+    /* 181 */ KEY_NOTUSED,
439+    /* 182 */ KEY_NOTUSED,
440+    /* 183 */ KEY_NOTUSED,
441+    /* 184 */ KEY_NOTUSED,
442+    /* 185 */ KEY_NOTUSED,
443+    /* 186 */ KEY_NOTUSED,
444+    /* 187 */ KEY_NOTUSED,
445+    /* 188 */ KEY_NOTUSED,
446+    /* 189 */ KEY_NOTUSED,
447+    /* 190 */ KEY_NOTUSED,
448+    /* 191 */ KEY_NOTUSED,
449+    /* 192 */ KEY_NOTUSED,
450+    /* 193 */ KEY_NOTUSED,
451+    /* 194 */ KEY_NOTUSED,
452+    /* 195 */ KEY_NOTUSED,
453+    /* 196 */ KEY_NOTUSED,
454+    /* 197 */ KEY_NOTUSED,
455+    /* 198 */ KEY_NOTUSED,
456+    /* 199 */ KEY_NOTUSED,
457+    /* 200 */ KEY_NOTUSED,
458+    /* 201 */ KEY_NOTUSED,
459+    /* 202 */ KEY_NOTUSED,
460+    /* 203 */ KEY_NOTUSED,
461+    /* 204 */ KEY_NOTUSED,
462+    /* 205 */ KEY_NOTUSED,
463+    /* 206 */ KEY_NOTUSED,
464+    /* 207 */ KEY_NOTUSED,
465+    /* 208 */ KEY_NOTUSED,
466+    /* 209 */ KEY_NOTUSED,
467+    /* 210 */ KEY_NOTUSED,
468+    /* 211 */ KEY_NOTUSED,
469+    /* 212 */ KEY_NOTUSED,
470+    /* 213 */ KEY_NOTUSED,
471+    /* 214 */ KEY_NOTUSED,
472+    /* 215 */ KEY_NOTUSED,
473+    /* 216 */ KEY_NOTUSED,
474+    /* 217 */ KEY_NOTUSED,
475+    /* 218 */ KEY_NOTUSED,
476+    /* 219 */ KEY_NOTUSED,
477+    /* 220 */ KEY_NOTUSED,
478+    /* 221 */ KEY_NOTUSED,
479+    /* 222 */ KEY_NOTUSED,
480+    /* 223 */ KEY_NOTUSED,
481+    /* 224 */ KEY_LCtrl,   /* Left Control */
482+    /* 225 */ KEY_ShiftL,  /* Left Shift */
483+    /* 226 */ KEY_Alt,     /* Left Alt */
484+    /* 227 */ KEY_LMeta,   /* Left Meta */
485+    /* 228 */ KEY_RCtrl,   /* Right Control */
486+    /* 229 */ KEY_ShiftR,  /* Right Shift */
487+    /* 230 */ KEY_AltLang, /* Right Alt, AKA AltGr */
488+    /* 231 */ KEY_LMeta,   /* Right Meta XXX */
489 };
490 
491 static uint8_t wsXtMap[] = {
492-	/* 0 */ KEY_NOTUSED,
493-	/* 1 */ KEY_Escape,
494-	/* 2 */ KEY_1,
495-	/* 3 */ KEY_2,
496-	/* 4 */ KEY_3,
497-	/* 5 */ KEY_4,
498-	/* 6 */ KEY_5,
499-	/* 7 */ KEY_6,
500-	/* 8 */ KEY_7,
501-	/* 9 */ KEY_8,
502-	/* 10 */ KEY_9,
503-	/* 11 */ KEY_0,
504-	/* 12 */ KEY_Minus,
505-	/* 13 */ KEY_Equal,
506-	/* 14 */ KEY_BackSpace,
507-	/* 15 */ KEY_Tab,
508-	/* 16 */ KEY_Q,
509-	/* 17 */ KEY_W,
510-	/* 18 */ KEY_E,
511-	/* 19 */ KEY_R,
512-	/* 20 */ KEY_T,
513-	/* 21 */ KEY_Y,
514-	/* 22 */ KEY_U,
515-	/* 23 */ KEY_I,
516-	/* 24 */ KEY_O,
517-	/* 25 */ KEY_P,
518-	/* 26 */ KEY_LBrace,
519-	/* 27 */ KEY_RBrace,
520-	/* 28 */ KEY_Enter,
521-	/* 29 */ KEY_LCtrl,
522-	/* 30 */ KEY_A,
523-	/* 31 */ KEY_S,
524-	/* 32 */ KEY_D,
525-	/* 33 */ KEY_F,
526-	/* 34 */ KEY_G,
527-	/* 35 */ KEY_H,
528-	/* 36 */ KEY_J,
529-	/* 37 */ KEY_K,
530-	/* 38 */ KEY_L,
531-	/* 39 */ KEY_SemiColon,
532-	/* 40 */ KEY_Quote,
533-	/* 41 */ KEY_Tilde,
534-	/* 42 */ KEY_ShiftL,
535-	/* 43 */ KEY_BSlash,
536-	/* 44 */ KEY_Z,
537-	/* 45 */ KEY_X,
538-	/* 46 */ KEY_C,
539-	/* 47 */ KEY_V,
540-	/* 48 */ KEY_B,
541-	/* 49 */ KEY_N,
542-	/* 50 */ KEY_M,
543-	/* 51 */ KEY_Comma,
544-	/* 52 */ KEY_Period,
545-	/* 53 */ KEY_Slash,
546-	/* 54 */ KEY_ShiftR,
547-	/* 55 */ KEY_KP_Multiply,
548-	/* 56 */ KEY_Alt,
549-	/* 57 */ KEY_Space,
550-	/* 58 */ KEY_CapsLock,
551-	/* 59 */ KEY_F1,
552-	/* 60 */ KEY_F2,
553-	/* 61 */ KEY_F3,
554-	/* 62 */ KEY_F4,
555-	/* 63 */ KEY_F5,
556-	/* 64 */ KEY_F6,
557-	/* 65 */ KEY_F7,
558-	/* 66 */ KEY_F8,
559-	/* 67 */ KEY_F9,
560-	/* 68 */ KEY_F10,
561-	/* 69 */ KEY_NumLock,
562-	/* 70 */ KEY_ScrollLock,
563-	/* 71 */ KEY_KP_7,
564-	/* 72 */ KEY_KP_8,
565-	/* 73 */ KEY_KP_9,
566-	/* 74 */ KEY_KP_Minus,
567-	/* 75 */ KEY_KP_4,
568-	/* 76 */ KEY_KP_5,
569-	/* 77 */ KEY_KP_6,
570-	/* 78 */ KEY_KP_Plus,
571-	/* 79 */ KEY_KP_1,
572-	/* 80 */ KEY_KP_2,
573-	/* 81 */ KEY_KP_3,
574-	/* 82 */ KEY_KP_0,
575-	/* 83 */ KEY_KP_Decimal,
576-	/* 84 */ KEY_NOTUSED,
577-	/* 85 */ KEY_NOTUSED,
578-	/* 86 */ KEY_Less,	/* backslash on uk, < on german */
579-	/* 87 */ KEY_F11,
580-	/* 88 */ KEY_F12,
581-	/* 89 */ KEY_NOTUSED,
582-	/* 90 */ KEY_NOTUSED,
583-	/* 91 */ KEY_NOTUSED,
584-	/* 92 */ KEY_NOTUSED,
585-	/* 93 */ KEY_NOTUSED,
586-	/* 94 */ KEY_NOTUSED,
587-	/* 95 */ KEY_NOTUSED,
588-	/* 96 */ KEY_NOTUSED,
589-	/* 97 */ KEY_NOTUSED,
590-	/* 98 */ KEY_NOTUSED,
591-	/* 99 */ KEY_NOTUSED,
592-	/* 100 */ KEY_NOTUSED,
593-	/* 101 */ KEY_NOTUSED,
594-	/* 102 */ KEY_NOTUSED,
595-	/* 103 */ KEY_NOTUSED,
596-	/* 104 */ KEY_NOTUSED,
597-	/* 105 */ KEY_NOTUSED,
598-	/* 106 */ KEY_NOTUSED,
599-	/* 107 */ KEY_NOTUSED,
600-	/* 108 */ KEY_NOTUSED,
601-	/* 109 */ KEY_NOTUSED,
602-	/* 110 */ KEY_NOTUSED,
603-	/* 111 */ KEY_NOTUSED,
604-	/* 112 */ KEY_NOTUSED,
605-	/* 113 */ KEY_NOTUSED,
606-	/* 114 */ KEY_NOTUSED,
607-	/* 115 */ KEY_NOTUSED,
608-	/* 116 */ KEY_NOTUSED,
609-	/* 117 */ KEY_NOTUSED,
610-	/* 118 */ KEY_NOTUSED,
611-	/* 119 */ KEY_NOTUSED,
612-	/* 120 */ KEY_NOTUSED,
613-	/* 121 */ KEY_NOTUSED,
614-	/* 122 */ KEY_NOTUSED,
615-	/* 123 */ KEY_NOTUSED,
616-	/* 124 */ KEY_NOTUSED,
617-	/* 125 */ KEY_NOTUSED,
618-	/* 126 */ KEY_NOTUSED,
619-	/* 127 */ KEY_Pause,
620-	/* 128 */ KEY_NOTUSED,
621-	/* 129 */ KEY_NOTUSED,
622-	/* 130 */ KEY_NOTUSED,
623-	/* 131 */ KEY_NOTUSED,
624-	/* 132 */ KEY_NOTUSED,
625-	/* 133 */ KEY_NOTUSED,
626-	/* 134 */ KEY_NOTUSED,
627-	/* 135 */ KEY_NOTUSED,
628-	/* 136 */ KEY_NOTUSED,
629-	/* 137 */ KEY_NOTUSED,
630-	/* 138 */ KEY_NOTUSED,
631-	/* 139 */ KEY_NOTUSED,
632-	/* 140 */ KEY_NOTUSED,
633-	/* 141 */ KEY_NOTUSED,
634-	/* 142 */ KEY_NOTUSED,
635-	/* 143 */ KEY_NOTUSED,
636-	/* 144 */ KEY_NOTUSED,
637-	/* 145 */ KEY_NOTUSED,
638-	/* 146 */ KEY_NOTUSED,
639-	/* 147 */ KEY_NOTUSED,
640-	/* 148 */ KEY_NOTUSED,
641-	/* 149 */ KEY_NOTUSED,
642-	/* 150 */ KEY_NOTUSED,
643-	/* 151 */ KEY_NOTUSED,
644-	/* 152 */ KEY_NOTUSED,
645-	/* 153 */ KEY_NOTUSED,
646-	/* 154 */ KEY_NOTUSED,
647-	/* 155 */ KEY_NOTUSED,
648-	/* 156 */ KEY_KP_Enter,
649-	/* 157 */ KEY_RCtrl,
650-	/* 158 */ KEY_NOTUSED,
651-	/* 159 */ KEY_NOTUSED,
652-	/* 160 */ KEY_Mute,
653-	/* 161 */ KEY_NOTUSED,
654-	/* 162 */ KEY_NOTUSED,
655-	/* 163 */ KEY_NOTUSED,
656-	/* 164 */ KEY_NOTUSED,
657-	/* 165 */ KEY_NOTUSED,
658-	/* 166 */ KEY_NOTUSED,
659-	/* 167 */ KEY_NOTUSED,
660-	/* 168 */ KEY_NOTUSED,
661-	/* 169 */ KEY_NOTUSED,
662-	/* 170 */ KEY_Print,
663-	/* 171 */ KEY_NOTUSED,
664-	/* 172 */ KEY_NOTUSED,
665-	/* 173 */ KEY_NOTUSED,
666-	/* 174 */ KEY_AudioLower,
667-	/* 175 */ KEY_AudioRaise,
668-	/* 176 */ KEY_NOTUSED,
669-	/* 177 */ KEY_NOTUSED,
670-	/* 178 */ KEY_NOTUSED,
671-	/* 179 */ KEY_NOTUSED,
672-	/* 180 */ KEY_NOTUSED,
673-	/* 181 */ KEY_KP_Divide,
674-	/* 182 */ KEY_NOTUSED,
675-	/* 183 */ KEY_Print,
676-	/* 184 */ KEY_AltLang,
677-	/* 185 */ KEY_NOTUSED,
678-	/* 186 */ KEY_NOTUSED,
679-	/* 187 */ KEY_NOTUSED,
680-	/* 188 */ KEY_NOTUSED,
681-	/* 189 */ KEY_NOTUSED,
682-	/* 190 */ KEY_NOTUSED,
683-	/* 191 */ KEY_NOTUSED,
684-	/* 192 */ KEY_NOTUSED,
685-	/* 193 */ KEY_NOTUSED,
686-	/* 194 */ KEY_NOTUSED,
687-	/* 195 */ KEY_NOTUSED,
688-	/* 196 */ KEY_NOTUSED,
689-	/* 197 */ KEY_NOTUSED,
690-	/* 198 */ KEY_NOTUSED,
691-	/* 199 */ KEY_Home,
692-	/* 200 */ KEY_Up,
693-	/* 201 */ KEY_PgUp,
694-	/* 202 */ KEY_NOTUSED,
695-	/* 203 */ KEY_Left,
696-	/* 204 */ KEY_NOTUSED,
697-	/* 205 */ KEY_Right,
698-	/* 206 */ KEY_NOTUSED,
699-	/* 207 */ KEY_End,
700-	/* 208 */ KEY_Down,
701-	/* 209 */ KEY_PgDown,
702-	/* 210 */ KEY_Insert,
703-	/* 211 */ KEY_Delete,
704-	/* 212 */ KEY_NOTUSED,
705-	/* 213 */ KEY_NOTUSED,
706-	/* 214 */ KEY_NOTUSED,
707-	/* 215 */ KEY_NOTUSED,
708-	/* 216 */ KEY_NOTUSED,
709-	/* 217 */ KEY_NOTUSED,
710-	/* 218 */ KEY_NOTUSED,
711-	/* 219 */ KEY_LMeta,
712-	/* 220 */ KEY_RMeta,
713-	/* 221 */ KEY_Menu,
714+    /* 0 */ KEY_NOTUSED,
715+    /* 1 */ KEY_Escape,
716+    /* 2 */ KEY_1,
717+    /* 3 */ KEY_2,
718+    /* 4 */ KEY_3,
719+    /* 5 */ KEY_4,
720+    /* 6 */ KEY_5,
721+    /* 7 */ KEY_6,
722+    /* 8 */ KEY_7,
723+    /* 9 */ KEY_8,
724+    /* 10 */ KEY_9,
725+    /* 11 */ KEY_0,
726+    /* 12 */ KEY_Minus,
727+    /* 13 */ KEY_Equal,
728+    /* 14 */ KEY_BackSpace,
729+    /* 15 */ KEY_Tab,
730+    /* 16 */ KEY_Q,
731+    /* 17 */ KEY_W,
732+    /* 18 */ KEY_E,
733+    /* 19 */ KEY_R,
734+    /* 20 */ KEY_T,
735+    /* 21 */ KEY_Y,
736+    /* 22 */ KEY_U,
737+    /* 23 */ KEY_I,
738+    /* 24 */ KEY_O,
739+    /* 25 */ KEY_P,
740+    /* 26 */ KEY_LBrace,
741+    /* 27 */ KEY_RBrace,
742+    /* 28 */ KEY_Enter,
743+    /* 29 */ KEY_LCtrl,
744+    /* 30 */ KEY_A,
745+    /* 31 */ KEY_S,
746+    /* 32 */ KEY_D,
747+    /* 33 */ KEY_F,
748+    /* 34 */ KEY_G,
749+    /* 35 */ KEY_H,
750+    /* 36 */ KEY_J,
751+    /* 37 */ KEY_K,
752+    /* 38 */ KEY_L,
753+    /* 39 */ KEY_SemiColon,
754+    /* 40 */ KEY_Quote,
755+    /* 41 */ KEY_Tilde,
756+    /* 42 */ KEY_ShiftL,
757+    /* 43 */ KEY_BSlash,
758+    /* 44 */ KEY_Z,
759+    /* 45 */ KEY_X,
760+    /* 46 */ KEY_C,
761+    /* 47 */ KEY_V,
762+    /* 48 */ KEY_B,
763+    /* 49 */ KEY_N,
764+    /* 50 */ KEY_M,
765+    /* 51 */ KEY_Comma,
766+    /* 52 */ KEY_Period,
767+    /* 53 */ KEY_Slash,
768+    /* 54 */ KEY_ShiftR,
769+    /* 55 */ KEY_KP_Multiply,
770+    /* 56 */ KEY_Alt,
771+    /* 57 */ KEY_Space,
772+    /* 58 */ KEY_CapsLock,
773+    /* 59 */ KEY_F1,
774+    /* 60 */ KEY_F2,
775+    /* 61 */ KEY_F3,
776+    /* 62 */ KEY_F4,
777+    /* 63 */ KEY_F5,
778+    /* 64 */ KEY_F6,
779+    /* 65 */ KEY_F7,
780+    /* 66 */ KEY_F8,
781+    /* 67 */ KEY_F9,
782+    /* 68 */ KEY_F10,
783+    /* 69 */ KEY_NumLock,
784+    /* 70 */ KEY_ScrollLock,
785+    /* 71 */ KEY_KP_7,
786+    /* 72 */ KEY_KP_8,
787+    /* 73 */ KEY_KP_9,
788+    /* 74 */ KEY_KP_Minus,
789+    /* 75 */ KEY_KP_4,
790+    /* 76 */ KEY_KP_5,
791+    /* 77 */ KEY_KP_6,
792+    /* 78 */ KEY_KP_Plus,
793+    /* 79 */ KEY_KP_1,
794+    /* 80 */ KEY_KP_2,
795+    /* 81 */ KEY_KP_3,
796+    /* 82 */ KEY_KP_0,
797+    /* 83 */ KEY_KP_Decimal,
798+    /* 84 */ KEY_NOTUSED,
799+    /* 85 */ KEY_NOTUSED,
800+    /* 86 */ KEY_Less, /* backslash on uk, < on german */
801+    /* 87 */ KEY_F11,
802+    /* 88 */ KEY_F12,
803+    /* 89 */ KEY_NOTUSED,
804+    /* 90 */ KEY_NOTUSED,
805+    /* 91 */ KEY_NOTUSED,
806+    /* 92 */ KEY_NOTUSED,
807+    /* 93 */ KEY_NOTUSED,
808+    /* 94 */ KEY_NOTUSED,
809+    /* 95 */ KEY_NOTUSED,
810+    /* 96 */ KEY_NOTUSED,
811+    /* 97 */ KEY_NOTUSED,
812+    /* 98 */ KEY_NOTUSED,
813+    /* 99 */ KEY_NOTUSED,
814+    /* 100 */ KEY_NOTUSED,
815+    /* 101 */ KEY_NOTUSED,
816+    /* 102 */ KEY_NOTUSED,
817+    /* 103 */ KEY_NOTUSED,
818+    /* 104 */ KEY_NOTUSED,
819+    /* 105 */ KEY_NOTUSED,
820+    /* 106 */ KEY_NOTUSED,
821+    /* 107 */ KEY_NOTUSED,
822+    /* 108 */ KEY_NOTUSED,
823+    /* 109 */ KEY_NOTUSED,
824+    /* 110 */ KEY_NOTUSED,
825+    /* 111 */ KEY_NOTUSED,
826+    /* 112 */ KEY_NOTUSED,
827+    /* 113 */ KEY_NOTUSED,
828+    /* 114 */ KEY_NOTUSED,
829+    /* 115 */ KEY_NOTUSED,
830+    /* 116 */ KEY_NOTUSED,
831+    /* 117 */ KEY_NOTUSED,
832+    /* 118 */ KEY_NOTUSED,
833+    /* 119 */ KEY_NOTUSED,
834+    /* 120 */ KEY_NOTUSED,
835+    /* 121 */ KEY_NOTUSED,
836+    /* 122 */ KEY_NOTUSED,
837+    /* 123 */ KEY_NOTUSED,
838+    /* 124 */ KEY_NOTUSED,
839+    /* 125 */ KEY_NOTUSED,
840+    /* 126 */ KEY_NOTUSED,
841+    /* 127 */ KEY_Pause,
842+    /* 128 */ KEY_NOTUSED,
843+    /* 129 */ KEY_NOTUSED,
844+    /* 130 */ KEY_NOTUSED,
845+    /* 131 */ KEY_NOTUSED,
846+    /* 132 */ KEY_NOTUSED,
847+    /* 133 */ KEY_NOTUSED,
848+    /* 134 */ KEY_NOTUSED,
849+    /* 135 */ KEY_NOTUSED,
850+    /* 136 */ KEY_NOTUSED,
851+    /* 137 */ KEY_NOTUSED,
852+    /* 138 */ KEY_NOTUSED,
853+    /* 139 */ KEY_NOTUSED,
854+    /* 140 */ KEY_NOTUSED,
855+    /* 141 */ KEY_NOTUSED,
856+    /* 142 */ KEY_NOTUSED,
857+    /* 143 */ KEY_NOTUSED,
858+    /* 144 */ KEY_NOTUSED,
859+    /* 145 */ KEY_NOTUSED,
860+    /* 146 */ KEY_NOTUSED,
861+    /* 147 */ KEY_NOTUSED,
862+    /* 148 */ KEY_NOTUSED,
863+    /* 149 */ KEY_NOTUSED,
864+    /* 150 */ KEY_NOTUSED,
865+    /* 151 */ KEY_NOTUSED,
866+    /* 152 */ KEY_NOTUSED,
867+    /* 153 */ KEY_NOTUSED,
868+    /* 154 */ KEY_NOTUSED,
869+    /* 155 */ KEY_NOTUSED,
870+    /* 156 */ KEY_KP_Enter,
871+    /* 157 */ KEY_RCtrl,
872+    /* 158 */ KEY_NOTUSED,
873+    /* 159 */ KEY_NOTUSED,
874+    /* 160 */ KEY_Mute,
875+    /* 161 */ KEY_NOTUSED,
876+    /* 162 */ KEY_NOTUSED,
877+    /* 163 */ KEY_NOTUSED,
878+    /* 164 */ KEY_NOTUSED,
879+    /* 165 */ KEY_NOTUSED,
880+    /* 166 */ KEY_NOTUSED,
881+    /* 167 */ KEY_NOTUSED,
882+    /* 168 */ KEY_NOTUSED,
883+    /* 169 */ KEY_NOTUSED,
884+    /* 170 */ KEY_Print,
885+    /* 171 */ KEY_NOTUSED,
886+    /* 172 */ KEY_NOTUSED,
887+    /* 173 */ KEY_NOTUSED,
888+    /* 174 */ KEY_AudioLower,
889+    /* 175 */ KEY_AudioRaise,
890+    /* 176 */ KEY_NOTUSED,
891+    /* 177 */ KEY_NOTUSED,
892+    /* 178 */ KEY_NOTUSED,
893+    /* 179 */ KEY_NOTUSED,
894+    /* 180 */ KEY_NOTUSED,
895+    /* 181 */ KEY_KP_Divide,
896+    /* 182 */ KEY_NOTUSED,
897+    /* 183 */ KEY_Print,
898+    /* 184 */ KEY_AltLang,
899+    /* 185 */ KEY_NOTUSED,
900+    /* 186 */ KEY_NOTUSED,
901+    /* 187 */ KEY_NOTUSED,
902+    /* 188 */ KEY_NOTUSED,
903+    /* 189 */ KEY_NOTUSED,
904+    /* 190 */ KEY_NOTUSED,
905+    /* 191 */ KEY_NOTUSED,
906+    /* 192 */ KEY_NOTUSED,
907+    /* 193 */ KEY_NOTUSED,
908+    /* 194 */ KEY_NOTUSED,
909+    /* 195 */ KEY_NOTUSED,
910+    /* 196 */ KEY_NOTUSED,
911+    /* 197 */ KEY_NOTUSED,
912+    /* 198 */ KEY_NOTUSED,
913+    /* 199 */ KEY_Home,
914+    /* 200 */ KEY_Up,
915+    /* 201 */ KEY_PgUp,
916+    /* 202 */ KEY_NOTUSED,
917+    /* 203 */ KEY_Left,
918+    /* 204 */ KEY_NOTUSED,
919+    /* 205 */ KEY_Right,
920+    /* 206 */ KEY_NOTUSED,
921+    /* 207 */ KEY_End,
922+    /* 208 */ KEY_Down,
923+    /* 209 */ KEY_PgDown,
924+    /* 210 */ KEY_Insert,
925+    /* 211 */ KEY_Delete,
926+    /* 212 */ KEY_NOTUSED,
927+    /* 213 */ KEY_NOTUSED,
928+    /* 214 */ KEY_NOTUSED,
929+    /* 215 */ KEY_NOTUSED,
930+    /* 216 */ KEY_NOTUSED,
931+    /* 217 */ KEY_NOTUSED,
932+    /* 218 */ KEY_NOTUSED,
933+    /* 219 */ KEY_LMeta,
934+    /* 220 */ KEY_RMeta,
935+    /* 221 */ KEY_Menu,
936 };
+35, -20
  1@@ -24,8 +24,8 @@
  2 #include "xdg_decoration.h"
  3 #include "util.h"
  4 
  5-#include <wayland-server.h>
  6 #include "xdg-decoration-unstable-v1-server-protocol.h"
  7+#include <wayland-server.h>
  8 
  9 struct xdg_toplevel_decoration {
 10 	struct wl_resource *resource;
 11@@ -43,15 +43,16 @@ unset_mode(struct wl_client *client, struct wl_resource *resource)
 12 }
 13 
 14 static const struct zxdg_toplevel_decoration_v1_interface decoration_impl = {
 15-	.destroy = destroy_resource,
 16-	.set_mode = set_mode,
 17-	.unset_mode = unset_mode,
 18+    .destroy = destroy_resource,
 19+    .set_mode = set_mode,
 20+    .unset_mode = unset_mode,
 21 };
 22 
 23 static void
 24 handle_toplevel_destroy(struct wl_listener *listener, void *data)
 25 {
 26-	struct xdg_toplevel_decoration *decoration = wl_container_of(listener, decoration, toplevel_destroy_listener);
 27+	struct xdg_toplevel_decoration *decoration =
 28+	    wl_container_of(listener, decoration, toplevel_destroy_listener);
 29 
 30 	wl_resource_destroy(decoration->resource);
 31 }
 32@@ -59,27 +60,36 @@ handle_toplevel_destroy(struct wl_listener *listener, void *data)
 33 static void
 34 decoration_destroy(struct wl_resource *resource)
 35 {
 36-	struct xdg_toplevel_decoration *decoration = wl_resource_get_user_data(resource);
 37+	struct xdg_toplevel_decoration *decoration =
 38+	    wl_resource_get_user_data(resource);
 39 
 40 	wl_list_remove(&decoration->toplevel_destroy_listener.link);
 41 	free(decoration);
 42 }
 43 
 44 static void
 45-get_toplevel_decoration(struct wl_client *client, struct wl_resource *resource, uint32_t id, struct wl_resource *toplevel_resource)
 46+get_toplevel_decoration(struct wl_client *client, struct wl_resource *resource,
 47+                        uint32_t id, struct wl_resource *toplevel_resource)
 48 {
 49 	struct xdg_toplevel_decoration *decoration;
 50 
 51 	decoration = malloc(sizeof(*decoration));
 52-	if (!decoration)
 53+	if (!decoration) {
 54 		goto error0;
 55-	decoration->resource = wl_resource_create(client, &zxdg_toplevel_decoration_v1_interface, wl_resource_get_version(resource), id);
 56-	if (!decoration->resource)
 57+	}
 58+	decoration->resource =
 59+	    wl_resource_create(client, &zxdg_toplevel_decoration_v1_interface,
 60+	                       wl_resource_get_version(resource), id);
 61+	if (!decoration->resource) {
 62 		goto error1;
 63+	}
 64 	decoration->toplevel_destroy_listener.notify = &handle_toplevel_destroy;
 65-	wl_resource_add_destroy_listener(toplevel_resource, &decoration->toplevel_destroy_listener);
 66-	wl_resource_set_implementation(decoration->resource, &decoration_impl, decoration, decoration_destroy);
 67-	zxdg_toplevel_decoration_v1_send_configure(decoration->resource, ZXDG_TOPLEVEL_DECORATION_V1_MODE_SERVER_SIDE);
 68+	wl_resource_add_destroy_listener(toplevel_resource,
 69+	                                 &decoration->toplevel_destroy_listener);
 70+	wl_resource_set_implementation(decoration->resource, &decoration_impl,
 71+	                               decoration, decoration_destroy);
 72+	zxdg_toplevel_decoration_v1_send_configure(
 73+	    decoration->resource, ZXDG_TOPLEVEL_DECORATION_V1_MODE_SERVER_SIDE);
 74 	return;
 75 
 76 error1:
 77@@ -88,26 +98,31 @@ error0:
 78 	wl_resource_post_no_memory(resource);
 79 }
 80 
 81-static const struct zxdg_decoration_manager_v1_interface decoration_manager_impl = {
 82-	.destroy = destroy_resource,
 83-	.get_toplevel_decoration = get_toplevel_decoration,
 84+static const struct zxdg_decoration_manager_v1_interface
 85+    decoration_manager_impl = {
 86+        .destroy = destroy_resource,
 87+        .get_toplevel_decoration = get_toplevel_decoration,
 88 };
 89 
 90 static void
 91-bind_decoration_manager(struct wl_client *client, void *data, uint32_t version, uint32_t id)
 92+bind_decoration_manager(struct wl_client *client, void *data, uint32_t version,
 93+                        uint32_t id)
 94 {
 95 	struct wl_resource *resource;
 96 
 97-	resource = wl_resource_create(client, &zxdg_decoration_manager_v1_interface, version, id);
 98+	resource = wl_resource_create(client, &zxdg_decoration_manager_v1_interface,
 99+	                              version, id);
100 	if (!resource) {
101 		wl_client_post_no_memory(client);
102 		return;
103 	}
104-	wl_resource_set_implementation(resource, &decoration_manager_impl, NULL, NULL);
105+	wl_resource_set_implementation(resource, &decoration_manager_impl, NULL,
106+	                               NULL);
107 }
108 
109 struct wl_global *
110 xdg_decoration_manager_create(struct wl_display *display)
111 {
112-	return wl_global_create(display, &zxdg_decoration_manager_v1_interface, 1, NULL, &bind_decoration_manager);
113+	return wl_global_create(display, &zxdg_decoration_manager_v1_interface, 1,
114+	                        NULL, &bind_decoration_manager);
115 }
+2, -1
1@@ -26,6 +26,7 @@
2 
3 struct wl_display;
4 
5-struct wl_global *xdg_decoration_manager_create(struct wl_display *display);
6+struct wl_global *
7+xdg_decoration_manager_create(struct wl_display *display);
8 
9 #endif
+208, -124
  1@@ -22,17 +22,17 @@
  2  */
  3 
  4 #include "xdg_shell.h"
  5-#include "internal.h"
  6 #include "compositor.h"
  7+#include "internal.h"
  8 #include "seat.h"
  9 #include "surface.h"
 10 #include "util.h"
 11 #include "window.h"
 12 
 13+#include "xdg-shell-server-protocol.h"
 14 #include <assert.h>
 15 #include <stdlib.h>
 16 #include <wayland-server.h>
 17-#include "xdg-shell-server-protocol.h"
 18 
 19 struct xdg_surface {
 20 	struct wl_resource *resource, *role;
 21@@ -75,12 +75,14 @@ destroy_positioner(struct wl_resource *resource)
 22 }
 23 
 24 static void
 25-set_size(struct wl_client *client, struct wl_resource *resource, int32_t width, int32_t height)
 26+set_size(struct wl_client *client, struct wl_resource *resource, int32_t width,
 27+         int32_t height)
 28 {
 29 	struct xdg_positioner *positioner = wl_resource_get_user_data(resource);
 30 
 31 	if (width <= 0 || height <= 0) {
 32-		wl_resource_post_error(resource, XDG_POSITIONER_ERROR_INVALID_INPUT, "invalid size");
 33+		wl_resource_post_error(resource, XDG_POSITIONER_ERROR_INVALID_INPUT,
 34+		                       "invalid size");
 35 		return;
 36 	}
 37 	positioner->width = width;
 38@@ -88,12 +90,14 @@ set_size(struct wl_client *client, struct wl_resource *resource, int32_t width,
 39 }
 40 
 41 static void
 42-set_anchor_rect(struct wl_client *client, struct wl_resource *resource, int32_t x, int32_t y, int32_t width, int32_t height)
 43+set_anchor_rect(struct wl_client *client, struct wl_resource *resource,
 44+                int32_t x, int32_t y, int32_t width, int32_t height)
 45 {
 46 	struct xdg_positioner *positioner = wl_resource_get_user_data(resource);
 47 
 48 	if (width <= 0 || height <= 0) {
 49-		wl_resource_post_error(resource, XDG_POSITIONER_ERROR_INVALID_INPUT, "invalid anchor size");
 50+		wl_resource_post_error(resource, XDG_POSITIONER_ERROR_INVALID_INPUT,
 51+		                       "invalid anchor size");
 52 		return;
 53 	}
 54 	positioner->anchor_x = x;
 55@@ -103,7 +107,8 @@ set_anchor_rect(struct wl_client *client, struct wl_resource *resource, int32_t
 56 }
 57 
 58 static void
 59-set_anchor(struct wl_client *client, struct wl_resource *resource, uint32_t anchor)
 60+set_anchor(struct wl_client *client, struct wl_resource *resource,
 61+           uint32_t anchor)
 62 {
 63 	struct xdg_positioner *positioner = wl_resource_get_user_data(resource);
 64 
 65@@ -111,7 +116,8 @@ set_anchor(struct wl_client *client, struct wl_resource *resource, uint32_t anch
 66 }
 67 
 68 static void
 69-set_gravity(struct wl_client *client, struct wl_resource *resource, uint32_t gravity)
 70+set_gravity(struct wl_client *client, struct wl_resource *resource,
 71+            uint32_t gravity)
 72 {
 73 	struct xdg_positioner *positioner = wl_resource_get_user_data(resource);
 74 
 75@@ -119,7 +125,8 @@ set_gravity(struct wl_client *client, struct wl_resource *resource, uint32_t gra
 76 }
 77 
 78 static void
 79-set_constraint_adjustment(struct wl_client *client, struct wl_resource *resource, uint32_t constraint)
 80+set_constraint_adjustment(struct wl_client *client,
 81+                          struct wl_resource *resource, uint32_t constraint)
 82 {
 83 	struct xdg_positioner *positioner = wl_resource_get_user_data(resource);
 84 
 85@@ -127,7 +134,8 @@ set_constraint_adjustment(struct wl_client *client, struct wl_resource *resource
 86 }
 87 
 88 static void
 89-set_offset(struct wl_client *client, struct wl_resource *resource, int32_t x, int32_t y)
 90+set_offset(struct wl_client *client, struct wl_resource *resource, int32_t x,
 91+           int32_t y)
 92 {
 93 	struct xdg_positioner *positioner = wl_resource_get_user_data(resource);
 94 
 95@@ -136,23 +144,23 @@ set_offset(struct wl_client *client, struct wl_resource *resource, int32_t x, in
 96 }
 97 
 98 static const struct xdg_positioner_interface positioner_impl = {
 99-	.destroy = destroy_resource,
100-	.set_size = set_size,
101-	.set_anchor_rect = set_anchor_rect,
102-	.set_anchor = set_anchor,
103-	.set_gravity = set_gravity,
104-	.set_constraint_adjustment = set_constraint_adjustment,
105-	.set_offset = set_offset,
106+    .destroy = destroy_resource,
107+    .set_size = set_size,
108+    .set_anchor_rect = set_anchor_rect,
109+    .set_anchor = set_anchor,
110+    .set_gravity = set_gravity,
111+    .set_constraint_adjustment = set_constraint_adjustment,
112+    .set_offset = set_offset,
113 };
114 
115 static struct swc_rectangle
116 calculate_position(struct xdg_positioner *positioner)
117 {
118 	struct swc_rectangle r = {
119-		.x = positioner->offset_x,
120-		.y = positioner->offset_y,
121-		.width = positioner->width,
122-		.height = positioner->height,
123+	    .x = positioner->offset_x,
124+	    .y = positioner->offset_y,
125+	    .width = positioner->width,
126+	    .height = positioner->height,
127 	};
128 
129 	switch (positioner->anchor) {
130@@ -229,9 +237,11 @@ add_state(struct xdg_toplevel *toplevel, uint32_t state)
131 {
132 	uint32_t *current_state;
133 
134-	wl_array_for_each (current_state, &toplevel->states) {
135-		if (*current_state == state)
136+	wl_array_for_each(current_state, &toplevel->states)
137+	{
138+		if (*current_state == state) {
139 			return false;
140+		}
141 	}
142 
143 	if (!(current_state = wl_array_add(&toplevel->states, sizeof(state)))) {
144@@ -248,7 +258,8 @@ remove_state(struct xdg_toplevel *toplevel, uint32_t state)
145 {
146 	uint32_t *current_state;
147 
148-	wl_array_for_each (current_state, &toplevel->states) {
149+	wl_array_for_each(current_state, &toplevel->states)
150+	{
151 		if (*current_state == state) {
152 			array_remove(&toplevel->states, current_state, sizeof(state));
153 			return true;
154@@ -259,15 +270,19 @@ remove_state(struct xdg_toplevel *toplevel, uint32_t state)
155 }
156 
157 static uint32_t
158-send_configure(struct xdg_toplevel *toplevel, int32_t width, int32_t height) {
159+send_configure(struct xdg_toplevel *toplevel, int32_t width, int32_t height)
160+{
161 	uint32_t serial = wl_display_next_serial(swc.display);
162 
163-	if (width < 0)
164+	if (width < 0) {
165 		width = toplevel->window.configure.width;
166-	if (height < 0)
167+	}
168+	if (height < 0) {
169 		height = toplevel->window.configure.height;
170+	}
171 
172-	xdg_toplevel_send_configure(toplevel->resource, width, height, &toplevel->states);
173+	xdg_toplevel_send_configure(toplevel->resource, width, height,
174+	                            &toplevel->states);
175 	xdg_surface_send_configure(toplevel->xdg_surface->resource, serial);
176 
177 	return serial;
178@@ -279,7 +294,8 @@ configure(struct window *window, uint32_t width, uint32_t height)
179 	struct xdg_toplevel *toplevel = wl_container_of(window, toplevel, window);
180 
181 	window->configure.acknowledged = false;
182-	toplevel->xdg_surface->configure_serial = send_configure(toplevel, width, height);
183+	toplevel->xdg_surface->configure_serial =
184+	    send_configure(toplevel, width, height);
185 }
186 
187 static void
188@@ -291,7 +307,8 @@ focus(struct window *window)
189 
190 	add_state(toplevel, XDG_TOPLEVEL_STATE_ACTIVATED);
191 	/* dont send  0x0 on focus change */
192-	send_configure(toplevel, width ? (int32_t)width : -1, height ? (int32_t)height : -1);
193+	send_configure(toplevel, width ? (int32_t)width : -1,
194+	               height ? (int32_t)height : -1);
195 }
196 
197 static void
198@@ -302,7 +319,8 @@ unfocus(struct window *window)
199 	uint32_t height = window->view->base.geometry.height;
200 
201 	remove_state(toplevel, XDG_TOPLEVEL_STATE_ACTIVATED);
202-	send_configure(toplevel, width ? (int32_t)width : -1, height ? (int32_t)height : -1);
203+	send_configure(toplevel, width ? (int32_t)width : -1,
204+	               height ? (int32_t)height : -1);
205 }
206 
207 static void
208@@ -340,71 +358,84 @@ set_mode(struct window *window, unsigned mode)
209 }
210 
211 static const struct window_impl toplevel_window_impl = {
212-	.configure = configure,
213-	.focus = focus,
214-	.unfocus = unfocus,
215-	.close = close_,
216-	.set_mode = set_mode,
217+    .configure = configure,
218+    .focus = focus,
219+    .unfocus = unfocus,
220+    .close = close_,
221+    .set_mode = set_mode,
222 };
223 
224 static void
225-set_parent(struct wl_client *client, struct wl_resource *resource, struct wl_resource *parent_resource)
226+set_parent(struct wl_client *client, struct wl_resource *resource,
227+           struct wl_resource *parent_resource)
228 {
229-	struct xdg_toplevel *toplevel = wl_resource_get_user_data(resource), *parent = NULL;
230+	struct xdg_toplevel *toplevel = wl_resource_get_user_data(resource),
231+	                    *parent = NULL;
232 
233-	if (parent_resource)
234+	if (parent_resource) {
235 		parent = wl_resource_get_user_data(parent_resource);
236+	}
237 	window_set_parent(&toplevel->window, parent ? &parent->window : NULL);
238 }
239 
240 static void
241-set_title(struct wl_client *client, struct wl_resource *resource, const char *title)
242+set_title(struct wl_client *client, struct wl_resource *resource,
243+          const char *title)
244 {
245 	struct xdg_toplevel *toplevel = wl_resource_get_user_data(resource);
246 	window_set_title(&toplevel->window, title, -1);
247 }
248 
249 static void
250-set_app_id(struct wl_client *client, struct wl_resource *resource, const char *app_id)
251+set_app_id(struct wl_client *client, struct wl_resource *resource,
252+           const char *app_id)
253 {
254 	struct xdg_toplevel *toplevel = wl_resource_get_user_data(resource);
255 	window_set_app_id(&toplevel->window, app_id);
256 }
257 
258 static void
259-show_window_menu(struct wl_client *client, struct wl_resource *resource, struct wl_resource *seat, uint32_t serial, int32_t x, int32_t y)
260+show_window_menu(struct wl_client *client, struct wl_resource *resource,
261+                 struct wl_resource *seat, uint32_t serial, int32_t x,
262+                 int32_t y)
263 {
264 }
265 
266 static void
267-move(struct wl_client *client, struct wl_resource *resource, struct wl_resource *seat, uint32_t serial)
268+move(struct wl_client *client, struct wl_resource *resource,
269+     struct wl_resource *seat, uint32_t serial)
270 {
271 	struct xdg_toplevel *toplevel = wl_resource_get_user_data(resource);
272 	struct button *button;
273 
274 	button = pointer_get_button(swc.seat->pointer, serial);
275-	if (button)
276+	if (button) {
277 		window_begin_move(&toplevel->window, button);
278+	}
279 }
280 
281 static void
282-resize(struct wl_client *client, struct wl_resource *resource, struct wl_resource *seat, uint32_t serial, uint32_t edges)
283+resize(struct wl_client *client, struct wl_resource *resource,
284+       struct wl_resource *seat, uint32_t serial, uint32_t edges)
285 {
286 	struct xdg_toplevel *toplevel = wl_resource_get_user_data(resource);
287 	struct button *button;
288 
289 	button = pointer_get_button(swc.seat->pointer, serial);
290-	if (button)
291+	if (button) {
292 		window_begin_resize(&toplevel->window, edges, button);
293+	}
294 }
295 
296 static void
297-set_max_size(struct wl_client *client, struct wl_resource *resource, int32_t width, int32_t height)
298+set_max_size(struct wl_client *client, struct wl_resource *resource,
299+             int32_t width, int32_t height)
300 {
301 }
302 
303 static void
304-set_min_size(struct wl_client *client, struct wl_resource *resource, int32_t width, int32_t height)
305+set_min_size(struct wl_client *client, struct wl_resource *resource,
306+             int32_t width, int32_t height)
307 {
308 }
309 
310@@ -419,7 +450,8 @@ unset_maximized(struct wl_client *client, struct wl_resource *resource)
311 }
312 
313 static void
314-set_fullscreen(struct wl_client *client, struct wl_resource *resource, struct wl_resource *output)
315+set_fullscreen(struct wl_client *client, struct wl_resource *resource,
316+               struct wl_resource *output)
317 {
318 }
319 
320@@ -434,37 +466,43 @@ set_minimized(struct wl_client *client, struct wl_resource *resource)
321 }
322 
323 static const struct xdg_toplevel_interface toplevel_impl = {
324-	.destroy = destroy_resource,
325-	.set_parent = set_parent,
326-	.set_title = set_title,
327-	.set_app_id = set_app_id,
328-	.show_window_menu = show_window_menu,
329-	.move = move,
330-	.resize = resize,
331-	.set_max_size = set_max_size,
332-	.set_min_size = set_min_size,
333-	.set_maximized = set_maximized,
334-	.unset_maximized = unset_maximized,
335-	.set_fullscreen = set_fullscreen,
336-	.unset_fullscreen = unset_fullscreen,
337-	.set_minimized = set_minimized,
338+    .destroy = destroy_resource,
339+    .set_parent = set_parent,
340+    .set_title = set_title,
341+    .set_app_id = set_app_id,
342+    .show_window_menu = show_window_menu,
343+    .move = move,
344+    .resize = resize,
345+    .set_max_size = set_max_size,
346+    .set_min_size = set_min_size,
347+    .set_maximized = set_maximized,
348+    .unset_maximized = unset_maximized,
349+    .set_fullscreen = set_fullscreen,
350+    .unset_fullscreen = unset_fullscreen,
351+    .set_minimized = set_minimized,
352 };
353 
354 static struct xdg_toplevel *
355-xdg_toplevel_new(struct wl_client *client, uint32_t version, uint32_t id, struct xdg_surface *xdg_surface)
356+xdg_toplevel_new(struct wl_client *client, uint32_t version, uint32_t id,
357+                 struct xdg_surface *xdg_surface)
358 {
359 	struct xdg_toplevel *toplevel;
360 
361 	toplevel = malloc(sizeof(*toplevel));
362-	if (!toplevel)
363+	if (!toplevel) {
364 		goto error0;
365+	}
366 	toplevel->xdg_surface = xdg_surface;
367-	toplevel->resource = wl_resource_create(client, &xdg_toplevel_interface, version, id);
368-	if (!toplevel->resource)
369+	toplevel->resource =
370+	    wl_resource_create(client, &xdg_toplevel_interface, version, id);
371+	if (!toplevel->resource) {
372 		goto error1;
373-	window_initialize(&toplevel->window, &toplevel_window_impl, xdg_surface->surface);
374+	}
375+	window_initialize(&toplevel->window, &toplevel_window_impl,
376+	                  xdg_surface->surface);
377 	wl_array_init(&toplevel->states);
378-	wl_resource_set_implementation(toplevel->resource, &toplevel_impl, toplevel, &destroy_toplevel);
379+	wl_resource_set_implementation(toplevel->resource, &toplevel_impl, toplevel,
380+	                               &destroy_toplevel);
381 	window_manage(&toplevel->window);
382 
383 	return toplevel;
384@@ -486,42 +524,54 @@ destroy_popup(struct wl_resource *resource)
385 }
386 
387 static void
388-grab(struct wl_client *client, struct wl_resource *resource, struct wl_resource *seat, uint32_t serial)
389+grab(struct wl_client *client, struct wl_resource *resource,
390+     struct wl_resource *seat, uint32_t serial)
391 {
392 }
393 
394 static const struct xdg_popup_interface popup_impl = {
395-	.destroy = destroy_resource,
396-	.grab = grab,
397+    .destroy = destroy_resource,
398+    .grab = grab,
399 };
400 
401 static struct xdg_popup *
402-xdg_popup_new(struct wl_client *client, uint32_t version, uint32_t id, struct xdg_surface *xdg_surface, struct xdg_surface *parent, struct xdg_positioner *positioner)
403+xdg_popup_new(struct wl_client *client, uint32_t version, uint32_t id,
404+              struct xdg_surface *xdg_surface, struct xdg_surface *parent,
405+              struct xdg_positioner *positioner)
406 {
407 	struct xdg_popup *popup;
408-	struct compositor_view *parent_view = compositor_view(parent->surface->view);
409+	struct compositor_view *parent_view =
410+	    compositor_view(parent->surface->view);
411 	uint32_t serial = wl_display_next_serial(swc.display);
412 	struct swc_rectangle rect;
413 
414-	if (!parent_view)
415+	if (!parent_view) {
416 		goto error0;
417+	}
418 	popup = malloc(sizeof(*popup));
419-	if (!popup)
420+	if (!popup) {
421 		goto error0;
422+	}
423 	popup->xdg_surface = xdg_surface;
424 	popup->positioner = *positioner;
425-	popup->resource = wl_resource_create(client, &xdg_popup_interface, version, id);
426-	if (!popup->resource)
427+	popup->resource =
428+	    wl_resource_create(client, &xdg_popup_interface, version, id);
429+	if (!popup->resource) {
430 		goto error1;
431-	wl_resource_set_implementation(popup->resource, &popup_impl, popup, &destroy_popup);
432+	}
433+	wl_resource_set_implementation(popup->resource, &popup_impl, popup,
434+	                               &destroy_popup);
435 	popup->view = compositor_create_view(xdg_surface->surface);
436-	if (!popup->view)
437+	if (!popup->view) {
438 		goto error2;
439+	}
440 
441 	rect = calculate_position(positioner);
442 	compositor_view_set_parent(popup->view, parent_view);
443-	view_move(&popup->view->base, parent_view->base.geometry.x + rect.x, parent_view->base.geometry.y + rect.y);
444-	xdg_popup_send_configure(popup->resource, rect.x, rect.y, rect.width, rect.height);
445+	view_move(&popup->view->base, parent_view->base.geometry.x + rect.x,
446+	          parent_view->base.geometry.y + rect.y);
447+	xdg_popup_send_configure(popup->resource, rect.x, rect.y, rect.width,
448+	                         rect.height);
449 	xdg_surface_send_configure(xdg_surface->resource, serial);
450 
451 	return popup;
452@@ -536,60 +586,74 @@ error0:
453 
454 /* xdg_surface */
455 static void
456-get_toplevel(struct wl_client *client, struct wl_resource *resource, uint32_t id)
457+get_toplevel(struct wl_client *client, struct wl_resource *resource,
458+             uint32_t id)
459 {
460 	struct xdg_surface *xdg_surface = wl_resource_get_user_data(resource);
461 	struct xdg_toplevel *toplevel;
462 
463 	if (xdg_surface->role) {
464-		wl_resource_post_error(resource, XDG_WM_BASE_ERROR_ROLE, "surface already has a role");
465+		wl_resource_post_error(resource, XDG_WM_BASE_ERROR_ROLE,
466+		                       "surface already has a role");
467 		return;
468 	}
469-	toplevel = xdg_toplevel_new(client, wl_resource_get_version(resource), id, xdg_surface);
470+	toplevel = xdg_toplevel_new(client, wl_resource_get_version(resource), id,
471+	                            xdg_surface);
472 	if (!toplevel) {
473 		wl_client_post_no_memory(client);
474 		return;
475 	}
476 	xdg_surface->role = toplevel->resource;
477-	wl_resource_add_destroy_listener(xdg_surface->role, &xdg_surface->role_destroy_listener);
478+	wl_resource_add_destroy_listener(xdg_surface->role,
479+	                                 &xdg_surface->role_destroy_listener);
480 }
481 
482 static void
483-get_popup(struct wl_client *client, struct wl_resource *resource, uint32_t id, struct wl_resource *parent_resource, struct wl_resource *positioner_resource)
484+get_popup(struct wl_client *client, struct wl_resource *resource, uint32_t id,
485+          struct wl_resource *parent_resource,
486+          struct wl_resource *positioner_resource)
487 {
488 	struct xdg_surface *xdg_surface = wl_resource_get_user_data(resource);
489 	struct xdg_surface *parent = wl_resource_get_user_data(parent_resource);
490-	struct xdg_positioner *positioner = wl_resource_get_user_data(positioner_resource);
491+	struct xdg_positioner *positioner =
492+	    wl_resource_get_user_data(positioner_resource);
493 	struct xdg_popup *popup;
494 
495 	if (xdg_surface->role) {
496-		wl_resource_post_error(resource, XDG_WM_BASE_ERROR_ROLE, "surface already has a role");
497+		wl_resource_post_error(resource, XDG_WM_BASE_ERROR_ROLE,
498+		                       "surface already has a role");
499 		return;
500 	}
501-	popup = xdg_popup_new(client, wl_resource_get_version(resource), id, xdg_surface, parent, positioner);
502+	popup = xdg_popup_new(client, wl_resource_get_version(resource), id,
503+	                      xdg_surface, parent, positioner);
504 	if (!popup) {
505 		wl_client_post_no_memory(client);
506 		return;
507 	}
508 	xdg_surface->role = popup->resource;
509-	wl_resource_add_destroy_listener(xdg_surface->role, &xdg_surface->role_destroy_listener);
510+	wl_resource_add_destroy_listener(xdg_surface->role,
511+	                                 &xdg_surface->role_destroy_listener);
512 }
513 
514 static void
515-ack_configure(struct wl_client *client, struct wl_resource *resource, uint32_t serial)
516+ack_configure(struct wl_client *client, struct wl_resource *resource,
517+              uint32_t serial)
518 {
519 	struct xdg_surface *xdg_surface = wl_resource_get_user_data(resource);
520 	struct window *window;
521 
522-	if (!xdg_surface->role)
523+	if (!xdg_surface->role) {
524 		return;
525+	}
526 	window = wl_resource_get_user_data(xdg_surface->role);
527-	if (window && serial == xdg_surface->configure_serial)
528+	if (window && serial == xdg_surface->configure_serial) {
529 		window->configure.acknowledged = true;
530+	}
531 }
532 
533 static void
534-set_window_geometry(struct wl_client *client, struct wl_resource *resource, int32_t x, int32_t y, int32_t width, int32_t height)
535+set_window_geometry(struct wl_client *client, struct wl_resource *resource,
536+                    int32_t x, int32_t y, int32_t width, int32_t height)
537 {
538 	(void)client;
539 	struct xdg_surface *xdg_surface = wl_resource_get_user_data(resource);
540@@ -605,7 +669,8 @@ set_window_geometry(struct wl_client *client, struct wl_resource *resource, int3
541 	surface->window_y = y;
542 	surface->window_width = width;
543 	surface->window_height = height;
544-	if (!surface->window_geometry_applied && surface->view && (x != 0 || y != 0)) {
545+	if (!surface->window_geometry_applied && surface->view &&
546+	    (x != 0 || y != 0)) {
547 		struct swc_rectangle *geom = &surface->view->geometry;
548 		view_move(surface->view, geom->x - x, geom->y - y);
549 		surface->window_geometry_applied = true;
550@@ -613,17 +678,18 @@ set_window_geometry(struct wl_client *client, struct wl_resource *resource, int3
551 }
552 
553 static const struct xdg_surface_interface xdg_surface_impl = {
554-	.destroy = destroy_resource,
555-	.get_toplevel = get_toplevel,
556-	.get_popup = get_popup,
557-	.ack_configure = ack_configure,
558-	.set_window_geometry = set_window_geometry,
559+    .destroy = destroy_resource,
560+    .get_toplevel = get_toplevel,
561+    .get_popup = get_popup,
562+    .ack_configure = ack_configure,
563+    .set_window_geometry = set_window_geometry,
564 };
565 
566 static void
567 handle_surface_destroy(struct wl_listener *listener, void *data)
568 {
569-	struct xdg_surface *xdg_surface = wl_container_of(listener, xdg_surface, surface_destroy_listener);
570+	struct xdg_surface *xdg_surface =
571+	    wl_container_of(listener, xdg_surface, surface_destroy_listener);
572 
573 	wl_resource_destroy(xdg_surface->resource);
574 }
575@@ -631,7 +697,8 @@ handle_surface_destroy(struct wl_listener *listener, void *data)
576 static void
577 handle_role_destroy(struct wl_listener *listener, void *data)
578 {
579-	struct xdg_surface *xdg_surface = wl_container_of(listener, xdg_surface, role_destroy_listener);
580+	struct xdg_surface *xdg_surface =
581+	    wl_container_of(listener, xdg_surface, role_destroy_listener);
582 
583 	xdg_surface->role = NULL;
584 }
585@@ -642,28 +709,35 @@ destroy_xdg_surface(struct wl_resource *resource)
586 	struct xdg_surface *xdg_surface = wl_resource_get_user_data(resource);
587 
588 	wl_list_remove(&xdg_surface->surface_destroy_listener.link);
589-	if (xdg_surface->role)
590+	if (xdg_surface->role) {
591 		wl_resource_destroy(xdg_surface->role);
592+	}
593 	free(xdg_surface);
594 }
595 
596 static struct xdg_surface *
597-xdg_surface_new(struct wl_client *client, uint32_t version, uint32_t id, struct surface *surface)
598+xdg_surface_new(struct wl_client *client, uint32_t version, uint32_t id,
599+                struct surface *surface)
600 {
601 	struct xdg_surface *xdg_surface;
602 
603 	xdg_surface = malloc(sizeof(*xdg_surface));
604-	if (!xdg_surface)
605+	if (!xdg_surface) {
606 		goto error0;
607-	xdg_surface->resource = wl_resource_create(client, &xdg_surface_interface, version, id);
608-	if (!xdg_surface->resource)
609+	}
610+	xdg_surface->resource =
611+	    wl_resource_create(client, &xdg_surface_interface, version, id);
612+	if (!xdg_surface->resource) {
613 		goto error1;
614+	}
615 	xdg_surface->surface = surface;
616 	xdg_surface->surface_destroy_listener.notify = &handle_surface_destroy;
617 	xdg_surface->role = NULL;
618 	xdg_surface->role_destroy_listener.notify = &handle_role_destroy;
619-	wl_resource_add_destroy_listener(surface->resource, &xdg_surface->surface_destroy_listener);
620-	wl_resource_set_implementation(xdg_surface->resource, &xdg_surface_impl, xdg_surface, destroy_xdg_surface);
621+	wl_resource_add_destroy_listener(surface->resource,
622+	                                 &xdg_surface->surface_destroy_listener);
623+	wl_resource_set_implementation(xdg_surface->resource, &xdg_surface_impl,
624+	                               xdg_surface, destroy_xdg_surface);
625 
626 	return xdg_surface;
627 
628@@ -675,21 +749,26 @@ error0:
629 
630 /* xdg_shell */
631 static void
632-create_positioner(struct wl_client *client, struct wl_resource *resource, uint32_t id)
633+create_positioner(struct wl_client *client, struct wl_resource *resource,
634+                  uint32_t id)
635 {
636 	struct xdg_positioner *positioner;
637 	struct wl_resource *positioner_resource;
638 	uint32_t version;
639 
640 	positioner = calloc(1, sizeof(*positioner));
641-	if (!positioner)
642+	if (!positioner) {
643 		goto error0;
644+	}
645 
646 	version = wl_resource_get_version(resource);
647-	positioner_resource = wl_resource_create(client, &xdg_positioner_interface, version, id);
648-	if (!positioner_resource)
649+	positioner_resource =
650+	    wl_resource_create(client, &xdg_positioner_interface, version, id);
651+	if (!positioner_resource) {
652 		goto error1;
653-	wl_resource_set_implementation(positioner_resource, &positioner_impl, positioner, &destroy_positioner);
654+	}
655+	wl_resource_set_implementation(positioner_resource, &positioner_impl,
656+	                               positioner, &destroy_positioner);
657 	return;
658 
659 error1:
660@@ -699,14 +778,17 @@ error0:
661 }
662 
663 static void
664-get_xdg_surface(struct wl_client *client, struct wl_resource *resource, uint32_t id, struct wl_resource *surface_resource)
665+get_xdg_surface(struct wl_client *client, struct wl_resource *resource,
666+                uint32_t id, struct wl_resource *surface_resource)
667 {
668 	struct xdg_surface *xdg_surface;
669 	struct surface *surface = wl_resource_get_user_data(surface_resource);
670 
671-	xdg_surface = xdg_surface_new(client, wl_resource_get_version(resource), id, surface);
672-	if (!xdg_surface)
673+	xdg_surface =
674+	    xdg_surface_new(client, wl_resource_get_version(resource), id, surface);
675+	if (!xdg_surface) {
676 		wl_client_post_no_memory(client);
677+	}
678 }
679 
680 static void
681@@ -715,14 +797,15 @@ pong(struct wl_client *client, struct wl_resource *resource, uint32_t serial)
682 }
683 
684 static const struct xdg_wm_base_interface wm_base_impl = {
685-	.destroy = destroy_resource,
686-	.create_positioner = create_positioner,
687-	.get_xdg_surface = get_xdg_surface,
688-	.pong = pong,
689+    .destroy = destroy_resource,
690+    .create_positioner = create_positioner,
691+    .get_xdg_surface = get_xdg_surface,
692+    .pong = pong,
693 };
694 
695 static void
696-bind_wm_base(struct wl_client *client, void *data, uint32_t version, uint32_t id)
697+bind_wm_base(struct wl_client *client, void *data, uint32_t version,
698+             uint32_t id)
699 {
700 	struct wl_resource *resource;
701 
702@@ -737,5 +820,6 @@ bind_wm_base(struct wl_client *client, void *data, uint32_t version, uint32_t id
703 struct wl_global *
704 xdg_shell_create(struct wl_display *display)
705 {
706-	return wl_global_create(display, &xdg_wm_base_interface, 1, NULL, &bind_wm_base);
707+	return wl_global_create(display, &xdg_wm_base_interface, 1, NULL,
708+	                        &bind_wm_base);
709 }
+2, -1
1@@ -26,6 +26,7 @@
2 
3 struct wl_display;
4 
5-struct wl_global *xdg_shell_create(struct wl_display *display);
6+struct wl_global *
7+xdg_shell_create(struct wl_display *display);
8 
9 #endif
+60, -42
  1@@ -30,15 +30,15 @@
  2 #include "util.h"
  3 #include "xwm.h"
  4 
  5+#include <errno.h>
  6+#include <fcntl.h>
  7 #include <signal.h>
  8-#include <stdlib.h>
  9 #include <stdio.h>
 10-#include <unistd.h>
 11-#include <fcntl.h>
 12-#include <errno.h>
 13-#include <sys/stat.h>
 14+#include <stdlib.h>
 15 #include <sys/socket.h>
 16+#include <sys/stat.h>
 17 #include <sys/un.h>
 18+#include <unistd.h>
 19 #include <wayland-server.h>
 20 
 21 #define LOCK_FMT "/tmp/.X%d-lock"
 22@@ -61,24 +61,28 @@ open_socket(struct sockaddr_un *addr)
 23 {
 24 	int fd;
 25 
 26-	if ((fd = socket(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0)) < 0)
 27+	if ((fd = socket(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0)) < 0) {
 28 		goto error0;
 29+	}
 30 
 31 	/* Unlink the socket location in case it was being used by a process which
 32 	 * left around a stale lockfile. */
 33 	unlink(addr->sun_path);
 34 
 35-	if (bind(fd, (struct sockaddr *)addr, sizeof(*addr)) < 0)
 36+	if (bind(fd, (struct sockaddr *)addr, sizeof(*addr)) < 0) {
 37 		goto error1;
 38+	}
 39 
 40-	if (listen(fd, 1) < 0)
 41+	if (listen(fd, 1) < 0) {
 42 		goto error2;
 43+	}
 44 
 45 	return fd;
 46 
 47 error2:
 48-	if (addr->sun_path[0])
 49+	if (addr->sun_path[0]) {
 50 		unlink(addr->sun_path);
 51+	}
 52 error1:
 53 	close(fd);
 54 error0:
 55@@ -116,22 +120,27 @@ begin:
 56 		pid_t owner;
 57 
 58 		/* Check if the owning process is still alive. */
 59-		if ((lock_fd = open(lock_name, O_RDONLY)) == -1)
 60+		if ((lock_fd = open(lock_name, O_RDONLY)) == -1) {
 61 			goto retry0;
 62+		}
 63 
 64-		if (read(lock_fd, pid, sizeof(pid) - 1) != sizeof(pid) - 1)
 65+		if (read(lock_fd, pid, sizeof(pid) - 1) != sizeof(pid) - 1) {
 66 			goto retry0;
 67+		}
 68 
 69 		owner = strtol(pid, &end, 10);
 70 
 71-		if (end != pid + 10)
 72+		if (end != pid + 10) {
 73 			goto retry0;
 74+		}
 75 
 76-		if (kill(owner, 0) == 0 || errno != ESRCH)
 77+		if (kill(owner, 0) == 0 || errno != ESRCH) {
 78 			goto retry0;
 79+		}
 80 
 81-		if (unlink(lock_name) != 0)
 82+		if (unlink(lock_name) != 0) {
 83 			goto retry0;
 84+		}
 85 
 86 		goto begin;
 87 	}
 88@@ -148,17 +157,21 @@ begin:
 89 
 90 	/* Bind to abstract socket */
 91 	addr.sun_path[0] = '\0';
 92-	snprintf(addr.sun_path + 1, sizeof(addr.sun_path) - 1, SOCKET_FMT, xserver.display);
 93-	if ((xserver.abstract_fd = open_socket(&addr)) < 0)
 94+	snprintf(addr.sun_path + 1, sizeof(addr.sun_path) - 1, SOCKET_FMT,
 95+	         xserver.display);
 96+	if ((xserver.abstract_fd = open_socket(&addr)) < 0) {
 97 		goto retry1;
 98+	}
 99 
100 	/* Bind to unix socket */
101 	mkdir(SOCKET_DIR, 0777);
102 	snprintf(addr.sun_path, sizeof(addr.sun_path), SOCKET_FMT, xserver.display);
103-	if ((xserver.unix_fd = open_socket(&addr)) < 0)
104+	if ((xserver.unix_fd = open_socket(&addr)) < 0) {
105 		goto retry2;
106+	}
107 
108-	snprintf(xserver.display_name, sizeof(xserver.display_name), ":%d", xserver.display);
109+	snprintf(xserver.display_name, sizeof(xserver.display_name), ":%d",
110+	         xserver.display);
111 	setenv("DISPLAY", xserver.display_name, true);
112 
113 	return true;
114@@ -196,12 +209,13 @@ handle_usr1(int signal_number, void *data)
115 }
116 
117 static void
118-handle_client_destroy(struct wl_listener *listener, void *data) {
119+handle_client_destroy(struct wl_listener *listener, void *data)
120+{
121 	swc_xserver.client = NULL;
122 }
123 
124 static struct wl_listener client_destroy_listener = {
125-	.notify = handle_client_destroy,
126+    .notify = handle_client_destroy,
127 };
128 
129 bool
130@@ -215,7 +229,8 @@ xserver_initialize(void)
131 		goto error0;
132 	}
133 
134-	xserver.usr1_source = wl_event_loop_add_signal(swc.event_loop, SIGUSR1, &handle_usr1, NULL);
135+	xserver.usr1_source =
136+	    wl_event_loop_add_signal(swc.event_loop, SIGUSR1, &handle_usr1, NULL);
137 
138 	if (!xserver.usr1_source) {
139 		ERROR("Failed to create SIGUSR1 event source\n");
140@@ -234,56 +249,57 @@ xserver_initialize(void)
141 		goto error3;
142 	}
143 
144-	if (!(swc_xserver.client = wl_client_create(swc.display, wl[0])))
145+	if (!(swc_xserver.client = wl_client_create(swc.display, wl[0]))) {
146 		goto error4;
147+	}
148 
149-	wl_client_add_destroy_listener(swc_xserver.client, &client_destroy_listener);
150+	wl_client_add_destroy_listener(swc_xserver.client,
151+	                               &client_destroy_listener);
152 	xserver.wm_fd = wm[0];
153 
154 	/* Start the X server */
155 	switch (fork()) {
156 	case 0: {
157-		int fds[] = { wl[1], wm[1], xserver.abstract_fd, xserver.unix_fd };
158+		int fds[] = {wl[1], wm[1], xserver.abstract_fd, xserver.unix_fd};
159 		char strings[ARRAY_LENGTH(fds)][16];
160 		unsigned index;
161-		struct sigaction action = {.sa_handler = SIG_IGN };
162+		struct sigaction action = {.sa_handler = SIG_IGN};
163 
164-		/* Unset the FD_CLOEXEC flag on the FDs that will get passed to Xwayland. */
165+		/* Unset the FD_CLOEXEC flag on the FDs that will get passed to
166+		 * Xwayland. */
167 		for (index = 0; index < ARRAY_LENGTH(fds); ++index) {
168 			if (fcntl(fds[index], F_SETFD, 0) != 0) {
169 				ERROR("fcntl() failed: %s\n", strerror(errno));
170 				goto fail;
171 			}
172 
173-			if (snprintf(strings[index], sizeof(strings[index]), "%d", fds[index]) >= sizeof(strings[index])) {
174+			if (snprintf(strings[index], sizeof(strings[index]), "%d",
175+			             fds[index]) >= sizeof(strings[index])) {
176 				ERROR("FD is too large\n");
177 				goto fail;
178 			}
179 		}
180 
181-		/* Ignore the USR1 signal so that Xwayland will send a USR1 signal to the
182-		 * parent process (us) after it finishes initializing. See Xserver(1) for
183-		 * more details. */
184+		/* Ignore the USR1 signal so that Xwayland will send a USR1 signal to
185+		 * the parent process (us) after it finishes initializing. See
186+		 * Xserver(1) for more details. */
187 		if (sigaction(SIGUSR1, &action, NULL) != 0) {
188-			ERROR("Failed to set SIGUSR1 handler to SIG_IGN: %s\n", strerror(errno));
189+			ERROR("Failed to set SIGUSR1 handler to SIG_IGN: %s\n",
190+			      strerror(errno));
191 			goto fail;
192 		}
193 
194 		setenv("WAYLAND_SOCKET", strings[0], true);
195-		execlp("Xwayland", "Xwayland",
196-		       xserver.display_name,
197-		       "-rootless",
198-		       "-terminate",
199-		       "-listen", strings[2],
200-		       "-listen", strings[3],
201-		       "-wm", strings[1],
202-		       NULL);
203+		execlp("Xwayland", "Xwayland", xserver.display_name, "-rootless",
204+		       "-terminate", "-listen", strings[2], "-listen", strings[3],
205+		       "-wm", strings[1], NULL);
206 
207 	fail:
208 		exit(EXIT_FAILURE);
209 	}
210 	case -1:
211-		ERROR("fork() failed when trying to start X server: %s\n", strerror(errno));
212+		ERROR("fork() failed when trying to start X server: %s\n",
213+		      strerror(errno));
214 		goto error5;
215 	}
216 
217@@ -311,9 +327,11 @@ error0:
218 void
219 xserver_finalize(void)
220 {
221-	if (xserver.xwm_initialized)
222+	if (xserver.xwm_initialized) {
223 		xwm_finalize();
224-	if (swc_xserver.client)
225+	}
226+	if (swc_xserver.client) {
227 		wl_client_destroy(swc_xserver.client);
228+	}
229 	close_display();
230 }
+4, -2
 1@@ -30,7 +30,9 @@ struct swc_xserver {
 2 	struct wl_client *client;
 3 };
 4 
 5-bool xserver_initialize(void);
 6-void xserver_finalize(void);
 7+bool
 8+xserver_initialize(void);
 9+void
10+xserver_finalize(void);
11 
12 #endif
+102, -67
  1@@ -69,14 +69,12 @@ static struct {
  2 		xcb_intern_atom_cookie_t cookie;
  3 		xcb_atom_t value;
  4 	} atoms[4];
  5-} xwm = {
  6-	.atoms = {
  7-		[ATOM_WL_SURFACE_ID] = {"WL_SURFACE_ID"},
  8-		[ATOM_WM_DELETE_WINDOW] = {"WM_DELETE_WINDOW"},
  9-		[ATOM_WM_PROTOCOLS] = {"WM_PROTOCOLS"},
 10-		[ATOM_WM_S0] = {"WM_S0"},
 11-	}
 12-};
 13+} xwm = {.atoms = {
 14+             [ATOM_WL_SURFACE_ID] = {"WL_SURFACE_ID"},
 15+             [ATOM_WM_DELETE_WINDOW] = {"WM_DELETE_WINDOW"},
 16+             [ATOM_WM_PROTOCOLS] = {"WM_PROTOCOLS"},
 17+             [ATOM_WM_S0] = {"WM_S0"},
 18+         }};
 19 
 20 static void
 21 update_name(struct xwl_window *xwl_window)
 22@@ -86,8 +84,10 @@ update_name(struct xwl_window *xwl_window)
 23 
 24 	wm_name_cookie = xcb_ewmh_get_wm_name(&xwm.ewmh, xwl_window->id);
 25 
 26-	if (xcb_ewmh_get_wm_name_reply(&xwm.ewmh, wm_name_cookie, &wm_name_reply, NULL)) {
 27-		window_set_title(&xwl_window->window, wm_name_reply.strings, wm_name_reply.strings_len);
 28+	if (xcb_ewmh_get_wm_name_reply(&xwm.ewmh, wm_name_cookie, &wm_name_reply,
 29+	                               NULL)) {
 30+		window_set_title(&xwl_window->window, wm_name_reply.strings,
 31+		                 wm_name_reply.strings_len);
 32 		xcb_ewmh_get_utf8_strings_reply_wipe(&wm_name_reply);
 33 	} else {
 34 		window_set_title(&xwl_window->window, NULL, 0);
 35@@ -101,15 +101,19 @@ update_protocols(struct xwl_window *xwl_window)
 36 	xcb_icccm_get_wm_protocols_reply_t reply;
 37 	unsigned index;
 38 
 39-	cookie = xcb_icccm_get_wm_protocols(xwm.connection, xwl_window->id, xwm.atoms[ATOM_WM_PROTOCOLS].value);
 40+	cookie = xcb_icccm_get_wm_protocols(xwm.connection, xwl_window->id,
 41+	                                    xwm.atoms[ATOM_WM_PROTOCOLS].value);
 42 	xwl_window->supports_delete = true;
 43 
 44-	if (!xcb_icccm_get_wm_protocols_reply(xwm.connection, cookie, &reply, NULL))
 45+	if (!xcb_icccm_get_wm_protocols_reply(xwm.connection, cookie, &reply,
 46+	                                      NULL)) {
 47 		return;
 48+	}
 49 
 50 	for (index = 0; index < reply.atoms_len; ++index) {
 51-		if (reply.atoms[index] == xwm.atoms[ATOM_WM_DELETE_WINDOW].value)
 52+		if (reply.atoms[index] == xwm.atoms[ATOM_WM_DELETE_WINDOW].value) {
 53 			xwl_window->supports_delete = true;
 54+		}
 55 	}
 56 
 57 	xcb_icccm_get_wm_protocols_reply_wipe(&reply);
 58@@ -120,9 +124,11 @@ find_window(struct wl_list *list, xcb_window_t id)
 59 {
 60 	struct xwl_window *window;
 61 
 62-	wl_list_for_each (window, list, link) {
 63-		if (window->id == id)
 64+	wl_list_for_each(window, list, link)
 65+	{
 66+		if (window->id == id) {
 67 			return window;
 68+		}
 69 	}
 70 
 71 	return NULL;
 72@@ -133,9 +139,11 @@ find_window_by_surface_id(struct wl_list *list, uint32_t id)
 73 {
 74 	struct xwl_window *window;
 75 
 76-	wl_list_for_each (window, list, link) {
 77-		if (window->surface_id == id)
 78+	wl_list_for_each(window, list, link)
 79+	{
 80+		if (window->surface_id == id) {
 81 			return window;
 82+		}
 83 	}
 84 
 85 	return NULL;
 86@@ -175,7 +183,8 @@ focus(struct window *window)
 87 {
 88 	struct xwl_window *xwl_window = wl_container_of(window, xwl_window, window);
 89 
 90-	xcb_set_input_focus(xwm.connection, XCB_INPUT_FOCUS_NONE, xwl_window->id, XCB_CURRENT_TIME);
 91+	xcb_set_input_focus(xwm.connection, XCB_INPUT_FOCUS_NONE, xwl_window->id,
 92+	                    XCB_CURRENT_TIME);
 93 	xcb_flush(xwm.connection);
 94 	xwm.focus = xwl_window;
 95 }
 96@@ -185,12 +194,13 @@ unfocus(struct window *window)
 97 {
 98 	struct xwl_window *xwl_window = wl_container_of(window, xwl_window, window);
 99 
100-	/* If the window we are unfocusing is the latest xwl_window to be focused, we
101-	 * know we have transitioned to some other window type, so the X11 focus can
102-	 * be set to XCB_NONE. Otherwise, we have transitioned to another X11 window,
103-	 * and the X11 focus has already been updated. */
104+	/* If the window we are unfocusing is the latest xwl_window to be focused,
105+	 * we know we have transitioned to some other window type, so the X11 focus
106+	 * can be set to XCB_NONE. Otherwise, we have transitioned to another X11
107+	 * window, and the X11 focus has already been updated. */
108 	if (xwl_window == xwm.focus) {
109-		xcb_set_input_focus(xwm.connection, XCB_INPUT_FOCUS_NONE, XCB_NONE, XCB_CURRENT_TIME);
110+		xcb_set_input_focus(xwm.connection, XCB_INPUT_FOCUS_NONE, XCB_NONE,
111+		                    XCB_CURRENT_TIME);
112 		xcb_flush(xwm.connection);
113 	}
114 }
115@@ -202,17 +212,19 @@ close_(struct window *window)
116 
117 	if (xwl_window->supports_delete) {
118 		xcb_client_message_event_t event = {
119-			.response_type = XCB_CLIENT_MESSAGE,
120-			.format = 32,
121-			.window = xwl_window->id,
122-			.type = xwm.atoms[ATOM_WM_PROTOCOLS].value,
123-			.data.data32 = {
124-				xwm.atoms[ATOM_WM_DELETE_WINDOW].value,
125-				XCB_CURRENT_TIME,
126-			},
127+		    .response_type = XCB_CLIENT_MESSAGE,
128+		    .format = 32,
129+		    .window = xwl_window->id,
130+		    .type = xwm.atoms[ATOM_WM_PROTOCOLS].value,
131+		    .data.data32 =
132+		        {
133+		            xwm.atoms[ATOM_WM_DELETE_WINDOW].value,
134+		            XCB_CURRENT_TIME,
135+		        },
136 		};
137 
138-		xcb_send_event(xwm.connection, false, xwl_window->id, XCB_EVENT_MASK_NO_EVENT, (const char *)&event);
139+		xcb_send_event(xwm.connection, false, xwl_window->id,
140+		               XCB_EVENT_MASK_NO_EVENT, (const char *)&event);
141 	} else {
142 		xcb_kill_client(xwm.connection, xwl_window->id);
143 	}
144@@ -221,20 +233,22 @@ close_(struct window *window)
145 }
146 
147 static const struct window_impl xwl_window_handler = {
148-	.move = move,
149-	.configure = configure,
150-	.focus = focus,
151-	.unfocus = unfocus,
152-	.close = close_,
153+    .move = move,
154+    .configure = configure,
155+    .focus = focus,
156+    .unfocus = unfocus,
157+    .close = close_,
158 };
159 
160 static void
161 handle_surface_destroy(struct wl_listener *listener, void *data)
162 {
163-	struct xwl_window *xwl_window = wl_container_of(listener, xwl_window, surface_destroy_listener);
164+	struct xwl_window *xwl_window =
165+	    wl_container_of(listener, xwl_window, surface_destroy_listener);
166 
167-	if (xwm.focus == xwl_window)
168+	if (xwm.focus == xwl_window) {
169 		xwm.focus = NULL;
170+	}
171 
172 	window_finalize(&xwl_window->window);
173 	wl_list_remove(&xwl_window->link);
174@@ -250,19 +264,23 @@ manage_window(struct xwl_window *xwl_window)
175 	xcb_get_geometry_cookie_t geometry_cookie;
176 	xcb_get_geometry_reply_t *geometry_reply;
177 
178-	resource = wl_client_get_object(swc.xserver->client, xwl_window->surface_id);
179+	resource =
180+	    wl_client_get_object(swc.xserver->client, xwl_window->surface_id);
181 
182-	if (!resource)
183+	if (!resource) {
184 		return false;
185+	}
186 
187 	surface = wl_resource_get_user_data(resource);
188 	geometry_cookie = xcb_get_geometry(xwm.connection, xwl_window->id);
189 
190 	window_initialize(&xwl_window->window, &xwl_window_handler, surface);
191 	xwl_window->surface_destroy_listener.notify = &handle_surface_destroy;
192-	wl_resource_add_destroy_listener(surface->resource, &xwl_window->surface_destroy_listener);
193+	wl_resource_add_destroy_listener(surface->resource,
194+	                                 &xwl_window->surface_destroy_listener);
195 
196-	if ((geometry_reply = xcb_get_geometry_reply(xwm.connection, geometry_cookie, NULL))) {
197+	if ((geometry_reply =
198+	         xcb_get_geometry_reply(xwm.connection, geometry_cookie, NULL))) {
199 		view_move(surface->view, geometry_reply->x, geometry_reply->y);
200 		free(geometry_reply);
201 	}
202@@ -274,7 +292,8 @@ manage_window(struct xwl_window *xwl_window)
203 
204 		mask = XCB_CW_EVENT_MASK;
205 		values[0] = XCB_EVENT_MASK_PROPERTY_CHANGE;
206-		xcb_change_window_attributes(xwm.connection, xwl_window->id, mask, values);
207+		xcb_change_window_attributes(xwm.connection, xwl_window->id, mask,
208+		                             values);
209 		mask = XCB_CONFIG_WINDOW_BORDER_WIDTH;
210 		values[0] = 0;
211 		xcb_configure_window(xwm.connection, xwl_window->id, mask, values);
212@@ -295,17 +314,18 @@ handle_new_surface(struct wl_listener *listener, void *data)
213 	struct surface *surface = data;
214 	struct xwl_window *window;
215 
216-	window = find_window_by_surface_id(&xwm.unpaired_windows, wl_resource_get_id(surface->resource));
217+	window = find_window_by_surface_id(&xwm.unpaired_windows,
218+	                                   wl_resource_get_id(surface->resource));
219 
220-	if (!window)
221+	if (!window) {
222 		return;
223+	}
224 
225 	manage_window(window);
226 }
227 
228-static struct wl_listener new_surface_listener = {
229-	.notify = &handle_new_surface
230-};
231+static struct wl_listener new_surface_listener = {.notify =
232+                                                      &handle_new_surface};
233 
234 /* X event handlers */
235 static void
236@@ -313,8 +333,9 @@ create_notify(xcb_create_notify_event_t *event)
237 {
238 	struct xwl_window *xwl_window;
239 
240-	if (!(xwl_window = malloc(sizeof *xwl_window)))
241+	if (!(xwl_window = malloc(sizeof *xwl_window))) {
242 		return;
243+	}
244 
245 	xwl_window->id = event->window;
246 	xwl_window->surface_id = 0;
247@@ -330,7 +351,8 @@ destroy_notify(xcb_destroy_notify_event_t *event)
248 	if ((xwl_window = find_window(&xwm.windows, event->window))) {
249 		wl_list_remove(&xwl_window->surface_destroy_listener.link);
250 		window_finalize(&xwl_window->window);
251-	} else if (!(xwl_window = find_window(&xwm.unpaired_windows, event->window))) {
252+	} else if (!(xwl_window =
253+	                 find_window(&xwm.unpaired_windows, event->window))) {
254 		return;
255 	}
256 
257@@ -354,13 +376,16 @@ property_notify(xcb_property_notify_event_t *event)
258 {
259 	struct xwl_window *xwl_window;
260 
261-	if (!(xwl_window = find_window(&xwm.windows, event->window)))
262+	if (!(xwl_window = find_window(&xwm.windows, event->window))) {
263 		return;
264+	}
265 
266-	if (event->atom == xwm.ewmh._NET_WM_NAME && event->state == XCB_PROPERTY_NEW_VALUE)
267+	if (event->atom == xwm.ewmh._NET_WM_NAME &&
268+	    event->state == XCB_PROPERTY_NEW_VALUE) {
269 		update_name(xwl_window);
270-	else if (event->atom == xwm.atoms[ATOM_WM_PROTOCOLS].value)
271+	} else if (event->atom == xwm.atoms[ATOM_WM_PROTOCOLS].value) {
272 		update_protocols(xwl_window);
273+	}
274 }
275 
276 static void
277@@ -369,8 +394,9 @@ client_message(xcb_client_message_event_t *event)
278 	if (event->type == xwm.atoms[ATOM_WL_SURFACE_ID].value) {
279 		struct xwl_window *xwl_window;
280 
281-		if (!(xwl_window = find_window(&xwm.unpaired_windows, event->window)))
282+		if (!(xwl_window = find_window(&xwm.unpaired_windows, event->window))) {
283 			return;
284+		}
285 
286 		xwl_window->surface_id = event->data.data32[0];
287 		manage_window(xwl_window);
288@@ -446,7 +472,8 @@ xwm_initialize(int fd)
289 
290 	for (index = 0; index < ARRAY_LENGTH(xwm.atoms); ++index) {
291 		name = xwm.atoms[index].name;
292-		xwm.atoms[index].cookie = xcb_intern_atom(xwm.connection, 0, strlen(name), name);
293+		xwm.atoms[index].cookie =
294+		    xcb_intern_atom(xwm.connection, 0, strlen(name), name);
295 	}
296 
297 	setup = xcb_get_setup(xwm.connection);
298@@ -455,10 +482,13 @@ xwm_initialize(int fd)
299 
300 	/* Try to select for substructure redirect. */
301 	mask = XCB_CW_EVENT_MASK;
302-	values[0] = XCB_EVENT_MASK_SUBSTRUCTURE_NOTIFY | XCB_EVENT_MASK_SUBSTRUCTURE_REDIRECT;
303-	change_attributes_cookie = xcb_change_window_attributes(xwm.connection, xwm.screen->root, mask, values);
304+	values[0] = XCB_EVENT_MASK_SUBSTRUCTURE_NOTIFY |
305+	            XCB_EVENT_MASK_SUBSTRUCTURE_REDIRECT;
306+	change_attributes_cookie = xcb_change_window_attributes(
307+	    xwm.connection, xwm.screen->root, mask, values);
308 
309-	xwm.source = wl_event_loop_add_fd(swc.event_loop, fd, WL_EVENT_READABLE, &connection_data, NULL);
310+	xwm.source = wl_event_loop_add_fd(swc.event_loop, fd, WL_EVENT_READABLE,
311+	                                  &connection_data, NULL);
312 	wl_list_init(&xwm.windows);
313 	wl_list_init(&xwm.unpaired_windows);
314 
315@@ -467,14 +497,16 @@ xwm_initialize(int fd)
316 		goto error2;
317 	}
318 
319-	composite_extension = xcb_get_extension_data(xwm.connection, &xcb_composite_id);
320+	composite_extension =
321+	    xcb_get_extension_data(xwm.connection, &xcb_composite_id);
322 
323 	if (!composite_extension->present) {
324 		ERROR("xwm: X server does not have composite extension\n");
325 		goto error3;
326 	}
327 
328-	redirect_subwindows_cookie = xcb_composite_redirect_subwindows_checked(xwm.connection, xwm.screen->root, XCB_COMPOSITE_REDIRECT_MANUAL);
329+	redirect_subwindows_cookie = xcb_composite_redirect_subwindows_checked(
330+	    xwm.connection, xwm.screen->root, XCB_COMPOSITE_REDIRECT_MANUAL);
331 
332 	if ((error = xcb_request_check(xwm.connection, change_attributes_cookie))) {
333 		ERROR("xwm: Another window manager is running\n");
334@@ -482,16 +514,17 @@ xwm_initialize(int fd)
335 		goto error3;
336 	}
337 
338-	if ((error = xcb_request_check(xwm.connection, redirect_subwindows_cookie))) {
339+	if ((error =
340+	         xcb_request_check(xwm.connection, redirect_subwindows_cookie))) {
341 		ERROR("xwm: Could not redirect subwindows of root for compositing\n");
342 		free(error);
343 		goto error3;
344 	}
345 
346 	xwm.window = xcb_generate_id(xwm.connection);
347-	xcb_create_window(xwm.connection, 0, xwm.window, xwm.screen->root,
348-	                  0, 0, 1, 1, 0, XCB_WINDOW_CLASS_INPUT_ONLY,
349-	                  XCB_COPY_FROM_PARENT, 0, NULL);
350+	xcb_create_window(xwm.connection, 0, xwm.window, xwm.screen->root, 0, 0, 1,
351+	                  1, 0, XCB_WINDOW_CLASS_INPUT_ONLY, XCB_COPY_FROM_PARENT,
352+	                  0, NULL);
353 
354 	xcb_ewmh_init_atoms_replies(&xwm.ewmh, ewmh_cookies, &error);
355 
356@@ -501,7 +534,8 @@ xwm_initialize(int fd)
357 	}
358 
359 	for (index = 0; index < ARRAY_LENGTH(xwm.atoms); ++index) {
360-		atom_reply = xcb_intern_atom_reply(xwm.connection, xwm.atoms[index].cookie, &error);
361+		atom_reply = xcb_intern_atom_reply(xwm.connection,
362+		                                   xwm.atoms[index].cookie, &error);
363 
364 		if (error) {
365 			ERROR("xwm: Failed to get atom reply: %u\n", error->error_code);
366@@ -512,7 +546,8 @@ xwm_initialize(int fd)
367 		free(atom_reply);
368 	}
369 
370-	xcb_set_selection_owner(xwm.connection, xwm.window, xwm.atoms[ATOM_WM_S0].value, XCB_CURRENT_TIME);
371+	xcb_set_selection_owner(xwm.connection, xwm.window,
372+	                        xwm.atoms[ATOM_WM_S0].value, XCB_CURRENT_TIME);
373 	xcb_flush(xwm.connection);
374 
375 	wl_signal_add(&swc.compositor->signal.new_surface, &new_surface_listener);
+4, -2
 1@@ -26,7 +26,9 @@
 2 
 3 #include <stdbool.h>
 4 
 5-bool xwm_initialize(int fd);
 6-void xwm_finalize(void);
 7+bool
 8+xwm_initialize(int fd);
 9+void
10+xwm_finalize(void);
11 
12 #endif
+8344, -6457
    1@@ -3,7 +3,8 @@
    2 
    3    Do this:
    4       #define STB_IMAGE_IMPLEMENTATION
    5-   before you include this file in *one* C or C++ file to create the implementation.
    6+   before you include this file in *one* C or C++ file to create the
    7+implementation.
    8 
    9    // i.e. it should look like this:
   10    #include ...
   11@@ -13,15 +14,16 @@
   12    #include "stb_image.h"
   13 
   14    You can #define STBI_ASSERT(x) before the #include to avoid using assert.h.
   15-   And #define STBI_MALLOC, STBI_REALLOC, and STBI_FREE to avoid using malloc,realloc,free
   16+   And #define STBI_MALLOC, STBI_REALLOC, and STBI_FREE to avoid using
   17+malloc,realloc,free
   18 
   19 
   20    QUICK NOTES:
   21       Primarily of interest to game developers and other people who can
   22           avoid problematic images and only need the trivial interface
   23 
   24-      JPEG baseline & progressive (12 bpc/arithmetic not supported, same as stock IJG lib)
   25-      PNG 1/2/4/8/16-bit-per-channel
   26+      JPEG baseline & progressive (12 bpc/arithmetic not supported, same as
   27+stock IJG lib) PNG 1/2/4/8/16-bit-per-channel
   28 
   29       TGA (not sure what subset, if a subset)
   30       BMP non-1bpp, non-RLE
   31@@ -51,23 +53,19 @@ RECENT REVISION HISTORY:
   32       2.30  (2024-05-31) avoid erroneous gcc warning
   33       2.29  (2023-05-xx) optimizations
   34       2.28  (2023-01-29) many error fixes, security errors, just tons of stuff
   35-      2.27  (2021-07-11) document stbi_info better, 16-bit PNM support, bug fixes
   36-      2.26  (2020-07-13) many minor fixes
   37-      2.25  (2020-02-02) fix warnings
   38-      2.24  (2020-02-02) fix warnings; thread-local failure_reason and flip_vertically
   39-      2.23  (2019-08-11) fix clang static analysis warning
   40-      2.22  (2019-03-04) gif fixes, fix warnings
   41-      2.21  (2019-02-25) fix typo in comment
   42-      2.20  (2019-02-07) support utf8 filenames in Windows; fix warnings and platform ifdefs
   43-      2.19  (2018-02-11) fix warning
   44-      2.18  (2018-01-30) fix warnings
   45-      2.17  (2018-01-29) bugfix, 1-bit BMP, 16-bitness query, fix warnings
   46-      2.16  (2017-07-23) all functions have 16-bit variants; optimizations; bugfixes
   47-      2.15  (2017-03-18) fix png-1,2,4; all Imagenet JPGs; no runtime SSE detection on GCC
   48-      2.14  (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet JPGs
   49-      2.13  (2016-12-04) experimental 16-bit API, only for PNG so far; fixes
   50-      2.12  (2016-04-02) fix typo in 2.11 PSD fix that caused crashes
   51-      2.11  (2016-04-02) 16-bit PNGS; enable SSE2 in non-gcc x64
   52+      2.27  (2021-07-11) document stbi_info better, 16-bit PNM support, bug
   53+fixes 2.26  (2020-07-13) many minor fixes 2.25  (2020-02-02) fix warnings 2.24
   54+(2020-02-02) fix warnings; thread-local failure_reason and flip_vertically 2.23
   55+(2019-08-11) fix clang static analysis warning 2.22  (2019-03-04) gif fixes, fix
   56+warnings 2.21  (2019-02-25) fix typo in comment 2.20  (2019-02-07) support utf8
   57+filenames in Windows; fix warnings and platform ifdefs 2.19  (2018-02-11) fix
   58+warning 2.18  (2018-01-30) fix warnings 2.17  (2018-01-29) bugfix, 1-bit BMP,
   59+16-bitness query, fix warnings 2.16  (2017-07-23) all functions have 16-bit
   60+variants; optimizations; bugfixes 2.15  (2017-03-18) fix png-1,2,4; all Imagenet
   61+JPGs; no runtime SSE detection on GCC 2.14  (2017-03-03) remove deprecated
   62+STBI_JPEG_OLD; fixes for Imagenet JPGs 2.13  (2016-12-04) experimental 16-bit
   63+API, only for PNG so far; fixes 2.12  (2016-04-02) fix typo in 2.11 PSD fix that
   64+caused crashes 2.11  (2016-04-02) 16-bit PNGS; enable SSE2 in non-gcc x64
   65                          RGB-format JPEG; remove white matting in PSD;
   66                          allocate large structures on the stack;
   67                          correct channel count for PNG & BMP
   68@@ -90,40 +88,39 @@ RECENT REVISION HISTORY:
   69     github:urraka (animated gif)           Junggon Kim (PNM comments)
   70     Christopher Forseth (animated gif)     Daniel Gibson (16-bit TGA)
   71                                            socks-the-fox (16-bit PNG)
   72-                                           Jeremy Sawicki (handle all ImageNet JPGs)
   73- Optimizations & bugfixes                  Mikhail Morozov (1-bit BMP)
   74+                                           Jeremy Sawicki (handle all ImageNet
   75+JPGs) Optimizations & bugfixes                  Mikhail Morozov (1-bit BMP)
   76     Fabian "ryg" Giesen                    Anael Seghezzi (is-16-bit query)
   77     Arseny Kapoulkine                      Simon Breuss (16-bit PNM)
   78     John-Mark Allen
   79     Carmelo J Fdez-Aguera
   80 
   81  Bug & warning fixes
   82-    Marc LeBlanc            David Woo          Guillaume George     Martins Mozeiko
   83-    Christpher Lloyd        Jerry Jansson      Joseph Thomson       Blazej Dariusz Roszkowski
   84-    Phil Jordan                                Dave Moore           Roy Eltham
   85-    Hayaki Saito            Nathan Reed        Won Chun
   86-    Luke Graham             Johan Duparc       Nick Verigakis       the Horde3D community
   87-    Thomas Ruf              Ronny Chevalier                         github:rlyeh
   88-    Janez Zemva             John Bartholomew   Michal Cichon        github:romigrou
   89-    Jonathan Blow           Ken Hamada         Tero Hanninen        github:svdijk
   90-    Eugene Golushkov        Laurent Gomila     Cort Stratton        github:snagar
   91-    Aruelien Pocheville     Sergio Gonzalez    Thibault Reuille     github:Zelex
   92-    Cass Everitt            Ryamond Barbiero                        github:grim210
   93-    Paul Du Bois            Engin Manap        Aldo Culquicondor    github:sammyhw
   94-    Philipp Wiesemann       Dale Weiler        Oriol Ferrer Mesia   github:phprus
   95-    Josh Tobin              Neil Bickford      Matthew Gregan       github:poppolopoppo
   96-    Julian Raschke          Gregory Mullen     Christian Floisand   github:darealshinji
   97-    Baldur Karlsson         Kevin Schmidt      JR Smith             github:Michaelangel007
   98-                            Brad Weinberger    Matvey Cherevko      github:mosra
   99-    Luca Sas                Alexander Veselov  Zack Middleton       [reserved]
  100-    Ryan C. Gordon          [reserved]                              [reserved]
  101-                     DO NOT ADD YOUR NAME HERE
  102+    Marc LeBlanc            David Woo          Guillaume George     Martins
  103+Mozeiko Christpher Lloyd        Jerry Jansson      Joseph Thomson       Blazej
  104+Dariusz Roszkowski Phil Jordan                                Dave Moore Roy
  105+Eltham Hayaki Saito            Nathan Reed        Won Chun Luke Graham Johan
  106+Duparc       Nick Verigakis       the Horde3D community Thomas Ruf Ronny
  107+Chevalier                         github:rlyeh Janez Zemva             John
  108+Bartholomew   Michal Cichon        github:romigrou Jonathan Blow           Ken
  109+Hamada         Tero Hanninen        github:svdijk Eugene Golushkov Laurent
  110+Gomila     Cort Stratton        github:snagar Aruelien Pocheville     Sergio
  111+Gonzalez    Thibault Reuille     github:Zelex Cass Everitt            Ryamond
  112+Barbiero                        github:grim210 Paul Du Bois            Engin
  113+Manap        Aldo Culquicondor    github:sammyhw Philipp Wiesemann       Dale
  114+Weiler        Oriol Ferrer Mesia   github:phprus Josh Tobin              Neil
  115+Bickford      Matthew Gregan       github:poppolopoppo Julian Raschke Gregory
  116+Mullen     Christian Floisand   github:darealshinji Baldur Karlsson Kevin
  117+Schmidt      JR Smith             github:Michaelangel007 Brad Weinberger Matvey
  118+Cherevko      github:mosra Luca Sas                Alexander Veselov  Zack
  119+Middleton       [reserved] Ryan C. Gordon          [reserved] [reserved] DO NOT
  120+ADD YOUR NAME HERE
  121 
  122                      Jacko Dirks
  123 
  124-  To add your name to the credits, pick a random blank space in the middle and fill it.
  125-  80% of merge conflicts on stb PRs are due to people adding their name at the end
  126-  of the credits.
  127+  To add your name to the credits, pick a random blank space in the middle and
  128+fill it. 80% of merge conflicts on stb PRs are due to people adding their name
  129+at the end of the credits.
  130 */
  131 
  132 #ifndef STBI_INCLUDE_STB_IMAGE_H
  133@@ -142,14 +139,15 @@ RECENT REVISION HISTORY:
  134 //    // ... process data if not NULL ...
  135 //    // ... x = width, y = height, n = # 8-bit components per pixel ...
  136 //    // ... replace '0' with '1'..'4' to force that many components per pixel
  137-//    // ... but 'n' will always be the number that it would have been if you said 0
  138-//    stbi_image_free(data);
  139+//    // ... but 'n' will always be the number that it would have been if you
  140+//    said 0 stbi_image_free(data);
  141 //
  142 // Standard parameters:
  143 //    int *x                 -- outputs image width in pixels
  144 //    int *y                 -- outputs image height in pixels
  145 //    int *channels_in_file  -- outputs # of image components in image file
  146-//    int desired_channels   -- if non-zero, # of image components requested in result
  147+//    int desired_channels   -- if non-zero, # of image components requested in
  148+//    result
  149 //
  150 // The return value from an image loader is an 'unsigned char *' which points
  151 // to the pixel data, or NULL on an allocation failure or if the image is
  152@@ -177,8 +175,8 @@ RECENT REVISION HISTORY:
  153 // and *x, *y, *channels_in_file will be unchanged. The function
  154 // stbi_failure_reason() can be queried for an extremely brief, end-user
  155 // unfriendly explanation of why the load failed. Define STBI_NO_FAILURE_STRINGS
  156-// to avoid compiling these strings at all, and STBI_FAILURE_USERMSG to get slightly
  157-// more user-friendly ones.
  158+// to avoid compiling these strings at all, and STBI_FAILURE_USERMSG to get
  159+// slightly more user-friendly ones.
  160 //
  161 // Paletted PNG, BMP, GIF, and PIC images are automatically depalettized.
  162 //
  163@@ -228,11 +226,12 @@ RECENT REVISION HISTORY:
  164 //    2. easy to maintain
  165 //    3. good performance
  166 //
  167-// Sometimes I let "good performance" creep up in priority over "easy to maintain",
  168-// and for best performance I may provide less-easy-to-use APIs that give higher
  169-// performance, in addition to the easy-to-use ones. Nevertheless, it's important
  170-// to keep in mind that from the standpoint of you, a client of this library,
  171-// all you care about is #1 and #3, and stb libraries DO NOT emphasize #3 above all.
  172+// Sometimes I let "good performance" creep up in priority over "easy to
  173+// maintain", and for best performance I may provide less-easy-to-use APIs that
  174+// give higher performance, in addition to the easy-to-use ones. Nevertheless,
  175+// it's important to keep in mind that from the standpoint of you, a client of
  176+// this library, all you care about is #1 and #3, and stb libraries DO NOT
  177+// emphasize #3 above all.
  178 //
  179 // Some secondary priorities arise directly from the first two, some of which
  180 // provide more explicit reasons why performance can't be emphasized.
  181@@ -251,7 +250,8 @@ RECENT REVISION HISTORY:
  182 // overhead.
  183 //
  184 // The three functions you must define are "read" (reads some bytes of data),
  185-// "skip" (skips some bytes of data), "eof" (reports if the stream is at the end).
  186+// "skip" (skips some bytes of data), "eof" (reports if the stream is at the
  187+// end).
  188 //
  189 // ===========================================================================
  190 //
  191@@ -279,10 +279,11 @@ RECENT REVISION HISTORY:
  192 // HDR image support   (disable by defining STBI_NO_HDR)
  193 //
  194 // stb_image supports loading HDR images in general, and currently the Radiance
  195-// .HDR file format specifically. You can still load any file through the existing
  196-// interface; if you attempt to load an HDR file, it will be automatically remapped
  197-// to LDR, assuming gamma 2.2 and an arbitrary scale factor defaulting to 1;
  198-// both of these constants can be reconfigured through this interface:
  199+// .HDR file format specifically. You can still load any file through the
  200+// existing interface; if you attempt to load an HDR file, it will be
  201+// automatically remapped to LDR, assuming gamma 2.2 and an arbitrary scale
  202+// factor defaulting to 1; both of these constants can be reconfigured through
  203+// this interface:
  204 //
  205 //     stbi_hdr_to_ldr_gamma(2.2f);
  206 //     stbi_hdr_to_ldr_scale(1.0f);
  207@@ -373,14 +374,13 @@ RECENT REVISION HISTORY:
  208 
  209 #define STBI_VERSION 1
  210 
  211-enum
  212-{
  213-   STBI_default = 0, // only used for desired_channels
  214+enum {
  215+	STBI_default = 0, // only used for desired_channels
  216 
  217-   STBI_grey       = 1,
  218-   STBI_grey_alpha = 2,
  219-   STBI_rgb        = 3,
  220-   STBI_rgb_alpha  = 4
  221+	STBI_grey = 1,
  222+	STBI_grey_alpha = 2,
  223+	STBI_rgb = 3,
  224+	STBI_rgb_alpha = 4
  225 };
  226 
  227 #include <stdlib.h>
  228@@ -408,11 +408,13 @@ extern "C" {
  229 // load image by filename, open file, or memory buffer
  230 //
  231 
  232-typedef struct
  233-{
  234-   int      (*read)  (void *user,char *data,int size);   // fill 'data' with 'size' bytes.  return number of bytes actually read
  235-   void     (*skip)  (void *user,int n);                 // skip the next 'n' bytes, or 'unget' the last -n bytes if negative
  236-   int      (*eof)   (void *user);                       // returns nonzero if we are at end of file/data
  237+typedef struct {
  238+	int (*read)(void *user, char *data,
  239+	            int size); // fill 'data' with 'size' bytes.  return number of
  240+	                       // bytes actually read
  241+	void (*skip)(void *user, int n); // skip the next 'n' bytes, or 'unget' the
  242+	                                 // last -n bytes if negative
  243+	int (*eof)(void *user); // returns nonzero if we are at end of file/data
  244 } stbi_io_callbacks;
  245 
  246 ////////////////////////////////////
  247@@ -420,21 +422,34 @@ typedef struct
  248 // 8-bits-per-channel interface
  249 //
  250 
  251-STBIDEF stbi_uc *stbi_load_from_memory   (stbi_uc           const *buffer, int len   , int *x, int *y, int *channels_in_file, int desired_channels);
  252-STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk  , void *user, int *x, int *y, int *channels_in_file, int desired_channels);
  253+STBIDEF stbi_uc *
  254+stbi_load_from_memory(stbi_uc const *buffer, int len, int *x, int *y,
  255+                      int *channels_in_file, int desired_channels);
  256+STBIDEF stbi_uc *
  257+stbi_load_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x,
  258+                         int *y, int *channels_in_file, int desired_channels);
  259 
  260 #ifndef STBI_NO_STDIO
  261-STBIDEF stbi_uc *stbi_load            (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
  262-STBIDEF stbi_uc *stbi_load_from_file  (FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
  263-// for stbi_load_from_file, file pointer is left pointing immediately after image
  264+STBIDEF stbi_uc *
  265+stbi_load(char const *filename, int *x, int *y, int *channels_in_file,
  266+          int desired_channels);
  267+STBIDEF stbi_uc *
  268+stbi_load_from_file(FILE *f, int *x, int *y, int *channels_in_file,
  269+                    int desired_channels);
  270+// for stbi_load_from_file, file pointer is left pointing immediately after
  271+// image
  272 #endif
  273 
  274 #ifndef STBI_NO_GIF
  275-STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len, int **delays, int *x, int *y, int *z, int *comp, int req_comp);
  276+STBIDEF stbi_uc *
  277+stbi_load_gif_from_memory(stbi_uc const *buffer, int len, int **delays, int *x,
  278+                          int *y, int *z, int *comp, int req_comp);
  279 #endif
  280 
  281 #ifdef STBI_WINDOWS_UTF8
  282-STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input);
  283+STBIDEF int
  284+stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen,
  285+                           const wchar_t *input);
  286 #endif
  287 
  288 ////////////////////////////////////
  289@@ -442,12 +457,21 @@ STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wch
  290 // 16-bits-per-channel interface
  291 //
  292 
  293-STBIDEF stbi_us *stbi_load_16_from_memory   (stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels);
  294-STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels);
  295+STBIDEF stbi_us *
  296+stbi_load_16_from_memory(stbi_uc const *buffer, int len, int *x, int *y,
  297+                         int *channels_in_file, int desired_channels);
  298+STBIDEF stbi_us *
  299+stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x,
  300+                            int *y, int *channels_in_file,
  301+                            int desired_channels);
  302 
  303 #ifndef STBI_NO_STDIO
  304-STBIDEF stbi_us *stbi_load_16          (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
  305-STBIDEF stbi_us *stbi_load_from_file_16(FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
  306+STBIDEF stbi_us *
  307+stbi_load_16(char const *filename, int *x, int *y, int *channels_in_file,
  308+             int desired_channels);
  309+STBIDEF stbi_us *
  310+stbi_load_from_file_16(FILE *f, int *x, int *y, int *channels_in_file,
  311+                       int desired_channels);
  312 #endif
  313 
  314 ////////////////////////////////////
  315@@ -455,85 +479,126 @@ STBIDEF stbi_us *stbi_load_from_file_16(FILE *f, int *x, int *y, int *channels_i
  316 // float-per-channel interface
  317 //
  318 #ifndef STBI_NO_LINEAR
  319-   STBIDEF float *stbi_loadf_from_memory     (stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels);
  320-   STBIDEF float *stbi_loadf_from_callbacks  (stbi_io_callbacks const *clbk, void *user, int *x, int *y,  int *channels_in_file, int desired_channels);
  321+STBIDEF float *
  322+stbi_loadf_from_memory(stbi_uc const *buffer, int len, int *x, int *y,
  323+                       int *channels_in_file, int desired_channels);
  324+STBIDEF float *
  325+stbi_loadf_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x,
  326+                          int *y, int *channels_in_file, int desired_channels);
  327 
  328-   #ifndef STBI_NO_STDIO
  329-   STBIDEF float *stbi_loadf            (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
  330-   STBIDEF float *stbi_loadf_from_file  (FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
  331-   #endif
  332+#ifndef STBI_NO_STDIO
  333+STBIDEF float *
  334+stbi_loadf(char const *filename, int *x, int *y, int *channels_in_file,
  335+           int desired_channels);
  336+STBIDEF float *
  337+stbi_loadf_from_file(FILE *f, int *x, int *y, int *channels_in_file,
  338+                     int desired_channels);
  339+#endif
  340 #endif
  341 
  342 #ifndef STBI_NO_HDR
  343-   STBIDEF void   stbi_hdr_to_ldr_gamma(float gamma);
  344-   STBIDEF void   stbi_hdr_to_ldr_scale(float scale);
  345+STBIDEF void
  346+stbi_hdr_to_ldr_gamma(float gamma);
  347+STBIDEF void
  348+stbi_hdr_to_ldr_scale(float scale);
  349 #endif // STBI_NO_HDR
  350 
  351 #ifndef STBI_NO_LINEAR
  352-   STBIDEF void   stbi_ldr_to_hdr_gamma(float gamma);
  353-   STBIDEF void   stbi_ldr_to_hdr_scale(float scale);
  354+STBIDEF void
  355+stbi_ldr_to_hdr_gamma(float gamma);
  356+STBIDEF void
  357+stbi_ldr_to_hdr_scale(float scale);
  358 #endif // STBI_NO_LINEAR
  359 
  360 // stbi_is_hdr is always defined, but always returns false if STBI_NO_HDR
  361-STBIDEF int    stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user);
  362-STBIDEF int    stbi_is_hdr_from_memory(stbi_uc const *buffer, int len);
  363+STBIDEF int
  364+stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user);
  365+STBIDEF int
  366+stbi_is_hdr_from_memory(stbi_uc const *buffer, int len);
  367 #ifndef STBI_NO_STDIO
  368-STBIDEF int      stbi_is_hdr          (char const *filename);
  369-STBIDEF int      stbi_is_hdr_from_file(FILE *f);
  370+STBIDEF int
  371+stbi_is_hdr(char const *filename);
  372+STBIDEF int
  373+stbi_is_hdr_from_file(FILE *f);
  374 #endif // STBI_NO_STDIO
  375 
  376-
  377 // get a VERY brief reason for failure
  378 // on most compilers (and ALL modern mainstream compilers) this is threadsafe
  379-STBIDEF const char *stbi_failure_reason  (void);
  380+STBIDEF const char *
  381+stbi_failure_reason(void);
  382 
  383 // free the loaded image -- this is just free()
  384-STBIDEF void     stbi_image_free      (void *retval_from_stbi_load);
  385+STBIDEF void
  386+stbi_image_free(void *retval_from_stbi_load);
  387 
  388 // get image dimensions & components without fully decoding
  389-STBIDEF int      stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp);
  390-STBIDEF int      stbi_info_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp);
  391-STBIDEF int      stbi_is_16_bit_from_memory(stbi_uc const *buffer, int len);
  392-STBIDEF int      stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *clbk, void *user);
  393+STBIDEF int
  394+stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y,
  395+                      int *comp);
  396+STBIDEF int
  397+stbi_info_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x,
  398+                         int *y, int *comp);
  399+STBIDEF int
  400+stbi_is_16_bit_from_memory(stbi_uc const *buffer, int len);
  401+STBIDEF int
  402+stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *clbk, void *user);
  403 
  404 #ifndef STBI_NO_STDIO
  405-STBIDEF int      stbi_info               (char const *filename,     int *x, int *y, int *comp);
  406-STBIDEF int      stbi_info_from_file     (FILE *f,                  int *x, int *y, int *comp);
  407-STBIDEF int      stbi_is_16_bit          (char const *filename);
  408-STBIDEF int      stbi_is_16_bit_from_file(FILE *f);
  409+STBIDEF int
  410+stbi_info(char const *filename, int *x, int *y, int *comp);
  411+STBIDEF int
  412+stbi_info_from_file(FILE *f, int *x, int *y, int *comp);
  413+STBIDEF int
  414+stbi_is_16_bit(char const *filename);
  415+STBIDEF int
  416+stbi_is_16_bit_from_file(FILE *f);
  417 #endif
  418 
  419-
  420-
  421 // for image formats that explicitly notate that they have premultiplied alpha,
  422 // we just return the colors as stored in the file. set this flag to force
  423 // unpremultiplication. results are undefined if the unpremultiply overflow.
  424-STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply);
  425+STBIDEF void
  426+stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply);
  427 
  428 // indicate whether we should process iphone images back to canonical format,
  429 // or just pass them through "as-is"
  430-STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert);
  431-
  432-// flip the image vertically, so the first pixel in the output array is the bottom left
  433-STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip);
  434-
  435-// as above, but only applies to images loaded on the thread that calls the function
  436-// this function is only available if your compiler supports thread-local variables;
  437-// calling it will fail to link if your compiler doesn't
  438-STBIDEF void stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply);
  439-STBIDEF void stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert);
  440-STBIDEF void stbi_set_flip_vertically_on_load_thread(int flag_true_if_should_flip);
  441+STBIDEF void
  442+stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert);
  443+
  444+// flip the image vertically, so the first pixel in the output array is the
  445+// bottom left
  446+STBIDEF void
  447+stbi_set_flip_vertically_on_load(int flag_true_if_should_flip);
  448+
  449+// as above, but only applies to images loaded on the thread that calls the
  450+// function this function is only available if your compiler supports
  451+// thread-local variables; calling it will fail to link if your compiler doesn't
  452+STBIDEF void
  453+stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply);
  454+STBIDEF void
  455+stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert);
  456+STBIDEF void
  457+stbi_set_flip_vertically_on_load_thread(int flag_true_if_should_flip);
  458 
  459 // ZLIB client - used by PNG, available for other purposes
  460 
  461-STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size, int *outlen);
  462-STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, int len, int initial_size, int *outlen, int parse_header);
  463-STBIDEF char *stbi_zlib_decode_malloc(const char *buffer, int len, int *outlen);
  464-STBIDEF int   stbi_zlib_decode_buffer(char *obuffer, int olen, const char *ibuffer, int ilen);
  465-
  466-STBIDEF char *stbi_zlib_decode_noheader_malloc(const char *buffer, int len, int *outlen);
  467-STBIDEF int   stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char *ibuffer, int ilen);
  468-
  469+STBIDEF char *
  470+stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size,
  471+                                  int *outlen);
  472+STBIDEF char *
  473+stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, int len,
  474+                                             int initial_size, int *outlen,
  475+                                             int parse_header);
  476+STBIDEF char *
  477+stbi_zlib_decode_malloc(const char *buffer, int len, int *outlen);
  478+STBIDEF int
  479+stbi_zlib_decode_buffer(char *obuffer, int olen, const char *ibuffer, int ilen);
  480+
  481+STBIDEF char *
  482+stbi_zlib_decode_noheader_malloc(const char *buffer, int len, int *outlen);
  483+STBIDEF int
  484+stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char *ibuffer,
  485+                                 int ilen);
  486 
  487 #ifdef __cplusplus
  488 }
  489@@ -546,52 +611,53 @@ STBIDEF int   stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const ch
  490 
  491 #ifdef STB_IMAGE_IMPLEMENTATION
  492 
  493-#if defined(STBI_ONLY_JPEG) || defined(STBI_ONLY_PNG) || defined(STBI_ONLY_BMP) \
  494-  || defined(STBI_ONLY_TGA) || defined(STBI_ONLY_GIF) || defined(STBI_ONLY_PSD) \
  495-  || defined(STBI_ONLY_HDR) || defined(STBI_ONLY_PIC) || defined(STBI_ONLY_PNM) \
  496-  || defined(STBI_ONLY_ZLIB)
  497-   #ifndef STBI_ONLY_JPEG
  498-   #define STBI_NO_JPEG
  499-   #endif
  500-   #ifndef STBI_ONLY_PNG
  501-   #define STBI_NO_PNG
  502-   #endif
  503-   #ifndef STBI_ONLY_BMP
  504-   #define STBI_NO_BMP
  505-   #endif
  506-   #ifndef STBI_ONLY_PSD
  507-   #define STBI_NO_PSD
  508-   #endif
  509-   #ifndef STBI_ONLY_TGA
  510-   #define STBI_NO_TGA
  511-   #endif
  512-   #ifndef STBI_ONLY_GIF
  513-   #define STBI_NO_GIF
  514-   #endif
  515-   #ifndef STBI_ONLY_HDR
  516-   #define STBI_NO_HDR
  517-   #endif
  518-   #ifndef STBI_ONLY_PIC
  519-   #define STBI_NO_PIC
  520-   #endif
  521-   #ifndef STBI_ONLY_PNM
  522-   #define STBI_NO_PNM
  523-   #endif
  524-#endif
  525-
  526-#if defined(STBI_NO_PNG) && !defined(STBI_SUPPORT_ZLIB) && !defined(STBI_NO_ZLIB)
  527-#define STBI_NO_ZLIB
  528+#if defined(STBI_ONLY_JPEG) || defined(STBI_ONLY_PNG) ||                       \
  529+    defined(STBI_ONLY_BMP) || defined(STBI_ONLY_TGA) ||                        \
  530+    defined(STBI_ONLY_GIF) || defined(STBI_ONLY_PSD) ||                        \
  531+    defined(STBI_ONLY_HDR) || defined(STBI_ONLY_PIC) ||                        \
  532+    defined(STBI_ONLY_PNM) || defined(STBI_ONLY_ZLIB)
  533+#ifndef STBI_ONLY_JPEG
  534+#define STBI_NO_JPEG
  535+#endif
  536+#ifndef STBI_ONLY_PNG
  537+#define STBI_NO_PNG
  538+#endif
  539+#ifndef STBI_ONLY_BMP
  540+#define STBI_NO_BMP
  541+#endif
  542+#ifndef STBI_ONLY_PSD
  543+#define STBI_NO_PSD
  544+#endif
  545+#ifndef STBI_ONLY_TGA
  546+#define STBI_NO_TGA
  547+#endif
  548+#ifndef STBI_ONLY_GIF
  549+#define STBI_NO_GIF
  550+#endif
  551+#ifndef STBI_ONLY_HDR
  552+#define STBI_NO_HDR
  553+#endif
  554+#ifndef STBI_ONLY_PIC
  555+#define STBI_NO_PIC
  556+#endif
  557+#ifndef STBI_ONLY_PNM
  558+#define STBI_NO_PNM
  559+#endif
  560 #endif
  561 
  562+#if defined(STBI_NO_PNG) && !defined(STBI_SUPPORT_ZLIB) &&                     \
  563+    !defined(STBI_NO_ZLIB)
  564+#define STBI_NO_ZLIB
  565+#endif
  566 
  567+#include <limits.h>
  568 #include <stdarg.h>
  569 #include <stddef.h> // ptrdiff_t on osx
  570 #include <stdlib.h>
  571 #include <string.h>
  572-#include <limits.h>
  573 
  574 #if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
  575-#include <math.h>  // ldexp, pow
  576+#include <math.h> // ldexp, pow
  577 #endif
  578 
  579 #ifndef STBI_NO_STDIO
  580@@ -609,55 +675,55 @@ STBIDEF int   stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const ch
  581 #define STBI_EXTERN extern
  582 #endif
  583 
  584-
  585 #ifndef _MSC_VER
  586-   #ifdef __cplusplus
  587-   #define stbi_inline inline
  588-   #else
  589-   #define stbi_inline
  590-   #endif
  591+#ifdef __cplusplus
  592+#define stbi_inline inline
  593+#else
  594+#define stbi_inline
  595+#endif
  596 #else
  597-   #define stbi_inline __forceinline
  598+#define stbi_inline __forceinline
  599 #endif
  600 
  601 #ifndef STBI_NO_THREAD_LOCALS
  602-   #if defined(__cplusplus) &&  __cplusplus >= 201103L
  603-      #define STBI_THREAD_LOCAL       thread_local
  604-   #elif defined(__GNUC__) && __GNUC__ < 5
  605-      #define STBI_THREAD_LOCAL       __thread
  606-   #elif defined(_MSC_VER)
  607-      #define STBI_THREAD_LOCAL       __declspec(thread)
  608-   #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__)
  609-      #define STBI_THREAD_LOCAL       _Thread_local
  610-   #endif
  611-
  612-   #ifndef STBI_THREAD_LOCAL
  613-      #if defined(__GNUC__)
  614-        #define STBI_THREAD_LOCAL       __thread
  615-      #endif
  616-   #endif
  617+#if defined(__cplusplus) && __cplusplus >= 201103L
  618+#define STBI_THREAD_LOCAL thread_local
  619+#elif defined(__GNUC__) && __GNUC__ < 5
  620+#define STBI_THREAD_LOCAL __thread
  621+#elif defined(_MSC_VER)
  622+#define STBI_THREAD_LOCAL __declspec(thread)
  623+#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L &&              \
  624+    !defined(__STDC_NO_THREADS__)
  625+#define STBI_THREAD_LOCAL _Thread_local
  626+#endif
  627+
  628+#ifndef STBI_THREAD_LOCAL
  629+#if defined(__GNUC__)
  630+#define STBI_THREAD_LOCAL __thread
  631+#endif
  632+#endif
  633 #endif
  634 
  635 #if defined(_MSC_VER) || defined(__SYMBIAN32__)
  636 typedef unsigned short stbi__uint16;
  637-typedef   signed short stbi__int16;
  638-typedef unsigned int   stbi__uint32;
  639-typedef   signed int   stbi__int32;
  640+typedef signed short stbi__int16;
  641+typedef unsigned int stbi__uint32;
  642+typedef signed int stbi__int32;
  643 #else
  644 #include <stdint.h>
  645 typedef uint16_t stbi__uint16;
  646-typedef int16_t  stbi__int16;
  647+typedef int16_t stbi__int16;
  648 typedef uint32_t stbi__uint32;
  649-typedef int32_t  stbi__int32;
  650+typedef int32_t stbi__int32;
  651 #endif
  652 
  653 // should produce compiler error if size is wrong
  654-typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1];
  655+typedef unsigned char validate_uint32[sizeof(stbi__uint32) == 4 ? 1 : -1];
  656 
  657 #ifdef _MSC_VER
  658-#define STBI_NOTUSED(v)  (void)(v)
  659+#define STBI_NOTUSED(v) (void)(v)
  660 #else
  661-#define STBI_NOTUSED(v)  (void)sizeof(v)
  662+#define STBI_NOTUSED(v) (void)sizeof(v)
  663 #endif
  664 
  665 #ifdef _MSC_VER
  666@@ -665,27 +731,30 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1];
  667 #endif
  668 
  669 #ifdef STBI_HAS_LROTL
  670-   #define stbi_lrot(x,y)  _lrotl(x,y)
  671+#define stbi_lrot(x, y) _lrotl(x, y)
  672 #else
  673-   #define stbi_lrot(x,y)  (((x) << (y)) | ((x) >> (-(y) & 31)))
  674+#define stbi_lrot(x, y) (((x) << (y)) | ((x) >> (-(y) & 31)))
  675 #endif
  676 
  677-#if defined(STBI_MALLOC) && defined(STBI_FREE) && (defined(STBI_REALLOC) || defined(STBI_REALLOC_SIZED))
  678+#if defined(STBI_MALLOC) && defined(STBI_FREE) &&                              \
  679+    (defined(STBI_REALLOC) || defined(STBI_REALLOC_SIZED))
  680 // ok
  681-#elif !defined(STBI_MALLOC) && !defined(STBI_FREE) && !defined(STBI_REALLOC) && !defined(STBI_REALLOC_SIZED)
  682+#elif !defined(STBI_MALLOC) && !defined(STBI_FREE) &&                          \
  683+    !defined(STBI_REALLOC) && !defined(STBI_REALLOC_SIZED)
  684 // ok
  685 #else
  686-#error "Must define all or none of STBI_MALLOC, STBI_FREE, and STBI_REALLOC (or STBI_REALLOC_SIZED)."
  687+#error                                                                         \
  688+    "Must define all or none of STBI_MALLOC, STBI_FREE, and STBI_REALLOC (or STBI_REALLOC_SIZED)."
  689 #endif
  690 
  691 #ifndef STBI_MALLOC
  692-#define STBI_MALLOC(sz)           malloc(sz)
  693-#define STBI_REALLOC(p,newsz)     realloc(p,newsz)
  694-#define STBI_FREE(p)              free(p)
  695+#define STBI_MALLOC(sz) malloc(sz)
  696+#define STBI_REALLOC(p, newsz) realloc(p, newsz)
  697+#define STBI_FREE(p) free(p)
  698 #endif
  699 
  700 #ifndef STBI_REALLOC_SIZED
  701-#define STBI_REALLOC_SIZED(p,oldsz,newsz) STBI_REALLOC(p,newsz)
  702+#define STBI_REALLOC_SIZED(p, oldsz, newsz) STBI_REALLOC(p, newsz)
  703 #endif
  704 
  705 // x86/x64 detection
  706@@ -695,7 +764,8 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1];
  707 #define STBI__X86_TARGET
  708 #endif
  709 
  710-#if defined(__GNUC__) && defined(STBI__X86_TARGET) && !defined(__SSE2__) && !defined(STBI_NO_SIMD)
  711+#if defined(__GNUC__) && defined(STBI__X86_TARGET) && !defined(__SSE2__) &&    \
  712+    !defined(STBI_NO_SIMD)
  713 // gcc doesn't support sse2 intrinsics unless you compile with -msse2,
  714 // which in turn means it gets to use SSE2 everywhere. This is unfortunate,
  715 // but previous attempts to provide the SSE2 functions with runtime
  716@@ -706,8 +776,10 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1];
  717 #define STBI_NO_SIMD
  718 #endif
  719 
  720-#if defined(__MINGW32__) && defined(STBI__X86_TARGET) && !defined(STBI_MINGW_ENABLE_SSE2) && !defined(STBI_NO_SIMD)
  721-// Note that __MINGW32__ doesn't actually mean 32-bit, so we have to avoid STBI__X64_TARGET
  722+#if defined(__MINGW32__) && defined(STBI__X86_TARGET) &&                       \
  723+    !defined(STBI_MINGW_ENABLE_SSE2) && !defined(STBI_NO_SIMD)
  724+// Note that __MINGW32__ doesn't actually mean 32-bit, so we have to avoid
  725+// STBI__X64_TARGET
  726 //
  727 // 32-bit MinGW wants ESP to be 16-byte aligned, but this is not in the
  728 // Windows ABI and VC++ as well as Windows DLLs don't maintain that invariant.
  729@@ -717,44 +789,49 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1];
  730 // See https://github.com/nothings/stb/issues/81 for more information.
  731 //
  732 // So default to no SSE2 on 32-bit MinGW. If you've read this far and added
  733-// -mstackrealign to your build settings, feel free to #define STBI_MINGW_ENABLE_SSE2.
  734+// -mstackrealign to your build settings, feel free to #define
  735+// STBI_MINGW_ENABLE_SSE2.
  736 #define STBI_NO_SIMD
  737 #endif
  738 
  739-#if !defined(STBI_NO_SIMD) && (defined(STBI__X86_TARGET) || defined(STBI__X64_TARGET))
  740+#if !defined(STBI_NO_SIMD) &&                                                  \
  741+    (defined(STBI__X86_TARGET) || defined(STBI__X64_TARGET))
  742 #define STBI_SSE2
  743 #include <emmintrin.h>
  744 
  745 #ifdef _MSC_VER
  746 
  747-#if _MSC_VER >= 1400  // not VC6
  748-#include <intrin.h> // __cpuid
  749-static int stbi__cpuid3(void)
  750+#if _MSC_VER >= 1400 // not VC6
  751+#include <intrin.h>  // __cpuid
  752+static int
  753+stbi__cpuid3(void)
  754 {
  755-   int info[4];
  756-   __cpuid(info,1);
  757-   return info[3];
  758+	int info[4];
  759+	__cpuid(info, 1);
  760+	return info[3];
  761 }
  762 #else
  763-static int stbi__cpuid3(void)
  764+static int
  765+stbi__cpuid3(void)
  766 {
  767-   int res;
  768-   __asm {
  769+	int res;
  770+	__asm {
  771       mov  eax,1
  772       cpuid
  773       mov  res,edx
  774-   }
  775-   return res;
  776+	}
  777+	return res;
  778 }
  779 #endif
  780 
  781 #define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name
  782 
  783 #if !defined(STBI_NO_JPEG) && defined(STBI_SSE2)
  784-static int stbi__sse2_available(void)
  785+static int
  786+stbi__sse2_available(void)
  787 {
  788-   int info3 = stbi__cpuid3();
  789-   return ((info3 >> 26) & 1) != 0;
  790+	int info3 = stbi__cpuid3();
  791+	return ((info3 >> 26) & 1) != 0;
  792 }
  793 #endif
  794 
  795@@ -762,12 +839,13 @@ static int stbi__sse2_available(void)
  796 #define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
  797 
  798 #if !defined(STBI_NO_JPEG) && defined(STBI_SSE2)
  799-static int stbi__sse2_available(void)
  800+static int
  801+stbi__sse2_available(void)
  802 {
  803-   // If we're even attempting to compile this on GCC/Clang, that means
  804-   // -msse2 is on, which means the compiler is allowed to use SSE2
  805-   // instructions at will, and so are we.
  806-   return 1;
  807+	// If we're even attempting to compile this on GCC/Clang, that means
  808+	// -msse2 is on, which means the compiler is allowed to use SSE2
  809+	// instructions at will, and so are we.
  810+	return 1;
  811 }
  812 #endif
  813 
  814@@ -802,189 +880,234 @@ static int stbi__sse2_available(void)
  815 
  816 // stbi__context structure is our basic context used by all images, so it
  817 // contains all the IO context, plus some basic image information
  818-typedef struct
  819-{
  820-   stbi__uint32 img_x, img_y;
  821-   int img_n, img_out_n;
  822+typedef struct {
  823+	stbi__uint32 img_x, img_y;
  824+	int img_n, img_out_n;
  825 
  826-   stbi_io_callbacks io;
  827-   void *io_user_data;
  828+	stbi_io_callbacks io;
  829+	void *io_user_data;
  830 
  831-   int read_from_callbacks;
  832-   int buflen;
  833-   stbi_uc buffer_start[128];
  834-   int callback_already_read;
  835+	int read_from_callbacks;
  836+	int buflen;
  837+	stbi_uc buffer_start[128];
  838+	int callback_already_read;
  839 
  840-   stbi_uc *img_buffer, *img_buffer_end;
  841-   stbi_uc *img_buffer_original, *img_buffer_original_end;
  842+	stbi_uc *img_buffer, *img_buffer_end;
  843+	stbi_uc *img_buffer_original, *img_buffer_original_end;
  844 } stbi__context;
  845 
  846-
  847-static void stbi__refill_buffer(stbi__context *s);
  848+static void
  849+stbi__refill_buffer(stbi__context *s);
  850 
  851 // initialize a memory-decode context
  852-static void stbi__start_mem(stbi__context *s, stbi_uc const *buffer, int len)
  853+static void
  854+stbi__start_mem(stbi__context *s, stbi_uc const *buffer, int len)
  855 {
  856-   s->io.read = NULL;
  857-   s->read_from_callbacks = 0;
  858-   s->callback_already_read = 0;
  859-   s->img_buffer = s->img_buffer_original = (stbi_uc *) buffer;
  860-   s->img_buffer_end = s->img_buffer_original_end = (stbi_uc *) buffer+len;
  861+	s->io.read = NULL;
  862+	s->read_from_callbacks = 0;
  863+	s->callback_already_read = 0;
  864+	s->img_buffer = s->img_buffer_original = (stbi_uc *)buffer;
  865+	s->img_buffer_end = s->img_buffer_original_end = (stbi_uc *)buffer + len;
  866 }
  867 
  868 // initialize a callback-based context
  869-static void stbi__start_callbacks(stbi__context *s, stbi_io_callbacks *c, void *user)
  870+static void
  871+stbi__start_callbacks(stbi__context *s, stbi_io_callbacks *c, void *user)
  872 {
  873-   s->io = *c;
  874-   s->io_user_data = user;
  875-   s->buflen = sizeof(s->buffer_start);
  876-   s->read_from_callbacks = 1;
  877-   s->callback_already_read = 0;
  878-   s->img_buffer = s->img_buffer_original = s->buffer_start;
  879-   stbi__refill_buffer(s);
  880-   s->img_buffer_original_end = s->img_buffer_end;
  881+	s->io = *c;
  882+	s->io_user_data = user;
  883+	s->buflen = sizeof(s->buffer_start);
  884+	s->read_from_callbacks = 1;
  885+	s->callback_already_read = 0;
  886+	s->img_buffer = s->img_buffer_original = s->buffer_start;
  887+	stbi__refill_buffer(s);
  888+	s->img_buffer_original_end = s->img_buffer_end;
  889 }
  890 
  891 #ifndef STBI_NO_STDIO
  892 
  893-static int stbi__stdio_read(void *user, char *data, int size)
  894+static int
  895+stbi__stdio_read(void *user, char *data, int size)
  896 {
  897-   return (int) fread(data,1,size,(FILE*) user);
  898+	return (int)fread(data, 1, size, (FILE *)user);
  899 }
  900 
  901-static void stbi__stdio_skip(void *user, int n)
  902+static void
  903+stbi__stdio_skip(void *user, int n)
  904 {
  905-   int ch;
  906-   fseek((FILE*) user, n, SEEK_CUR);
  907-   ch = fgetc((FILE*) user);  /* have to read a byte to reset feof()'s flag */
  908-   if (ch != EOF) {
  909-      ungetc(ch, (FILE *) user);  /* push byte back onto stream if valid. */
  910-   }
  911+	int ch;
  912+	fseek((FILE *)user, n, SEEK_CUR);
  913+	ch = fgetc((FILE *)user); /* have to read a byte to reset feof()'s flag */
  914+	if (ch != EOF) {
  915+		ungetc(ch, (FILE *)user); /* push byte back onto stream if valid. */
  916+	}
  917 }
  918 
  919-static int stbi__stdio_eof(void *user)
  920+static int
  921+stbi__stdio_eof(void *user)
  922 {
  923-   return feof((FILE*) user) || ferror((FILE *) user);
  924+	return feof((FILE *)user) || ferror((FILE *)user);
  925 }
  926 
  927-static stbi_io_callbacks stbi__stdio_callbacks =
  928-{
  929-   stbi__stdio_read,
  930-   stbi__stdio_skip,
  931-   stbi__stdio_eof,
  932+static stbi_io_callbacks stbi__stdio_callbacks = {
  933+    stbi__stdio_read,
  934+    stbi__stdio_skip,
  935+    stbi__stdio_eof,
  936 };
  937 
  938-static void stbi__start_file(stbi__context *s, FILE *f)
  939+static void
  940+stbi__start_file(stbi__context *s, FILE *f)
  941 {
  942-   stbi__start_callbacks(s, &stbi__stdio_callbacks, (void *) f);
  943+	stbi__start_callbacks(s, &stbi__stdio_callbacks, (void *)f);
  944 }
  945 
  946-//static void stop_file(stbi__context *s) { }
  947+// static void stop_file(stbi__context *s) { }
  948 
  949 #endif // !STBI_NO_STDIO
  950 
  951-static void stbi__rewind(stbi__context *s)
  952+static void
  953+stbi__rewind(stbi__context *s)
  954 {
  955-   // conceptually rewind SHOULD rewind to the beginning of the stream,
  956-   // but we just rewind to the beginning of the initial buffer, because
  957-   // we only use it after doing 'test', which only ever looks at at most 92 bytes
  958-   s->img_buffer = s->img_buffer_original;
  959-   s->img_buffer_end = s->img_buffer_original_end;
  960+	// conceptually rewind SHOULD rewind to the beginning of the stream,
  961+	// but we just rewind to the beginning of the initial buffer, because
  962+	// we only use it after doing 'test', which only ever looks at at most 92
  963+	// bytes
  964+	s->img_buffer = s->img_buffer_original;
  965+	s->img_buffer_end = s->img_buffer_original_end;
  966 }
  967 
  968-enum
  969-{
  970-   STBI_ORDER_RGB,
  971-   STBI_ORDER_BGR
  972-};
  973+enum { STBI_ORDER_RGB, STBI_ORDER_BGR };
  974 
  975-typedef struct
  976-{
  977-   int bits_per_channel;
  978-   int num_channels;
  979-   int channel_order;
  980+typedef struct {
  981+	int bits_per_channel;
  982+	int num_channels;
  983+	int channel_order;
  984 } stbi__result_info;
  985 
  986 #ifndef STBI_NO_JPEG
  987-static int      stbi__jpeg_test(stbi__context *s);
  988-static void    *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
  989-static int      stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp);
  990+static int
  991+stbi__jpeg_test(stbi__context *s);
  992+static void *
  993+stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp,
  994+                stbi__result_info *ri);
  995+static int
  996+stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp);
  997 #endif
  998 
  999 #ifndef STBI_NO_PNG
 1000-static int      stbi__png_test(stbi__context *s);
 1001-static void    *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
 1002-static int      stbi__png_info(stbi__context *s, int *x, int *y, int *comp);
 1003-static int      stbi__png_is16(stbi__context *s);
 1004+static int
 1005+stbi__png_test(stbi__context *s);
 1006+static void *
 1007+stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp,
 1008+               stbi__result_info *ri);
 1009+static int
 1010+stbi__png_info(stbi__context *s, int *x, int *y, int *comp);
 1011+static int
 1012+stbi__png_is16(stbi__context *s);
 1013 #endif
 1014 
 1015 #ifndef STBI_NO_BMP
 1016-static int      stbi__bmp_test(stbi__context *s);
 1017-static void    *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
 1018-static int      stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp);
 1019+static int
 1020+stbi__bmp_test(stbi__context *s);
 1021+static void *
 1022+stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp,
 1023+               stbi__result_info *ri);
 1024+static int
 1025+stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp);
 1026 #endif
 1027 
 1028 #ifndef STBI_NO_TGA
 1029-static int      stbi__tga_test(stbi__context *s);
 1030-static void    *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
 1031-static int      stbi__tga_info(stbi__context *s, int *x, int *y, int *comp);
 1032+static int
 1033+stbi__tga_test(stbi__context *s);
 1034+static void *
 1035+stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp,
 1036+               stbi__result_info *ri);
 1037+static int
 1038+stbi__tga_info(stbi__context *s, int *x, int *y, int *comp);
 1039 #endif
 1040 
 1041 #ifndef STBI_NO_PSD
 1042-static int      stbi__psd_test(stbi__context *s);
 1043-static void    *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc);
 1044-static int      stbi__psd_info(stbi__context *s, int *x, int *y, int *comp);
 1045-static int      stbi__psd_is16(stbi__context *s);
 1046+static int
 1047+stbi__psd_test(stbi__context *s);
 1048+static void *
 1049+stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp,
 1050+               stbi__result_info *ri, int bpc);
 1051+static int
 1052+stbi__psd_info(stbi__context *s, int *x, int *y, int *comp);
 1053+static int
 1054+stbi__psd_is16(stbi__context *s);
 1055 #endif
 1056 
 1057 #ifndef STBI_NO_HDR
 1058-static int      stbi__hdr_test(stbi__context *s);
 1059-static float   *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
 1060-static int      stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp);
 1061+static int
 1062+stbi__hdr_test(stbi__context *s);
 1063+static float *
 1064+stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp,
 1065+               stbi__result_info *ri);
 1066+static int
 1067+stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp);
 1068 #endif
 1069 
 1070 #ifndef STBI_NO_PIC
 1071-static int      stbi__pic_test(stbi__context *s);
 1072-static void    *stbi__pic_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
 1073-static int      stbi__pic_info(stbi__context *s, int *x, int *y, int *comp);
 1074+static int
 1075+stbi__pic_test(stbi__context *s);
 1076+static void *
 1077+stbi__pic_load(stbi__context *s, int *x, int *y, int *comp, int req_comp,
 1078+               stbi__result_info *ri);
 1079+static int
 1080+stbi__pic_info(stbi__context *s, int *x, int *y, int *comp);
 1081 #endif
 1082 
 1083 #ifndef STBI_NO_GIF
 1084-static int      stbi__gif_test(stbi__context *s);
 1085-static void    *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
 1086-static void    *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp);
 1087-static int      stbi__gif_info(stbi__context *s, int *x, int *y, int *comp);
 1088+static int
 1089+stbi__gif_test(stbi__context *s);
 1090+static void *
 1091+stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp,
 1092+               stbi__result_info *ri);
 1093+static void *
 1094+stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z,
 1095+                    int *comp, int req_comp);
 1096+static int
 1097+stbi__gif_info(stbi__context *s, int *x, int *y, int *comp);
 1098 #endif
 1099 
 1100 #ifndef STBI_NO_PNM
 1101-static int      stbi__pnm_test(stbi__context *s);
 1102-static void    *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
 1103-static int      stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp);
 1104-static int      stbi__pnm_is16(stbi__context *s);
 1105+static int
 1106+stbi__pnm_test(stbi__context *s);
 1107+static void *
 1108+stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp,
 1109+               stbi__result_info *ri);
 1110+static int
 1111+stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp);
 1112+static int
 1113+stbi__pnm_is16(stbi__context *s);
 1114 #endif
 1115 
 1116 static
 1117 #ifdef STBI_THREAD_LOCAL
 1118-STBI_THREAD_LOCAL
 1119+    STBI_THREAD_LOCAL
 1120 #endif
 1121-const char *stbi__g_failure_reason;
 1122+    const char *stbi__g_failure_reason;
 1123 
 1124-STBIDEF const char *stbi_failure_reason(void)
 1125+STBIDEF const char *
 1126+stbi_failure_reason(void)
 1127 {
 1128-   return stbi__g_failure_reason;
 1129+	return stbi__g_failure_reason;
 1130 }
 1131 
 1132 #ifndef STBI_NO_FAILURE_STRINGS
 1133-static int stbi__err(const char *str)
 1134+static int
 1135+stbi__err(const char *str)
 1136 {
 1137-   stbi__g_failure_reason = str;
 1138-   return 0;
 1139+	stbi__g_failure_reason = str;
 1140+	return 0;
 1141 }
 1142 #endif
 1143 
 1144-static void *stbi__malloc(size_t size)
 1145+static void *
 1146+stbi__malloc(size_t size)
 1147 {
 1148-    return STBI_MALLOC(size);
 1149+	return STBI_MALLOC(size);
 1150 }
 1151 
 1152 // stb_image uses ints pervasively, including for offset calculations.
 1153@@ -999,88 +1122,128 @@ static void *stbi__malloc(size_t size)
 1154 
 1155 // return 1 if the sum is valid, 0 on overflow.
 1156 // negative terms are considered invalid.
 1157-static int stbi__addsizes_valid(int a, int b)
 1158+static int
 1159+stbi__addsizes_valid(int a, int b)
 1160 {
 1161-   if (b < 0) return 0;
 1162-   // now 0 <= b <= INT_MAX, hence also
 1163-   // 0 <= INT_MAX - b <= INTMAX.
 1164-   // And "a + b <= INT_MAX" (which might overflow) is the
 1165-   // same as a <= INT_MAX - b (no overflow)
 1166-   return a <= INT_MAX - b;
 1167+	if (b < 0) {
 1168+		return 0;
 1169+	}
 1170+	// now 0 <= b <= INT_MAX, hence also
 1171+	// 0 <= INT_MAX - b <= INTMAX.
 1172+	// And "a + b <= INT_MAX" (which might overflow) is the
 1173+	// same as a <= INT_MAX - b (no overflow)
 1174+	return a <= INT_MAX - b;
 1175 }
 1176 
 1177 // returns 1 if the product is valid, 0 on overflow.
 1178 // negative factors are considered invalid.
 1179-static int stbi__mul2sizes_valid(int a, int b)
 1180+static int
 1181+stbi__mul2sizes_valid(int a, int b)
 1182 {
 1183-   if (a < 0 || b < 0) return 0;
 1184-   if (b == 0) return 1; // mul-by-0 is always safe
 1185-   // portable way to check for no overflows in a*b
 1186-   return a <= INT_MAX/b;
 1187+	if (a < 0 || b < 0) {
 1188+		return 0;
 1189+	}
 1190+	if (b == 0) {
 1191+		return 1; // mul-by-0 is always safe
 1192+	}
 1193+	// portable way to check for no overflows in a*b
 1194+	return a <= INT_MAX / b;
 1195 }
 1196 
 1197-#if !defined(STBI_NO_JPEG) || !defined(STBI_NO_PNG) || !defined(STBI_NO_TGA) || !defined(STBI_NO_HDR)
 1198+#if !defined(STBI_NO_JPEG) || !defined(STBI_NO_PNG) ||                         \
 1199+    !defined(STBI_NO_TGA) || !defined(STBI_NO_HDR)
 1200 // returns 1 if "a*b + add" has no negative terms/factors and doesn't overflow
 1201-static int stbi__mad2sizes_valid(int a, int b, int add)
 1202+static int
 1203+stbi__mad2sizes_valid(int a, int b, int add)
 1204 {
 1205-   return stbi__mul2sizes_valid(a, b) && stbi__addsizes_valid(a*b, add);
 1206+	return stbi__mul2sizes_valid(a, b) && stbi__addsizes_valid(a * b, add);
 1207 }
 1208 #endif
 1209 
 1210 // returns 1 if "a*b*c + add" has no negative terms/factors and doesn't overflow
 1211-static int stbi__mad3sizes_valid(int a, int b, int c, int add)
 1212+static int
 1213+stbi__mad3sizes_valid(int a, int b, int c, int add)
 1214 {
 1215-   return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a*b, c) &&
 1216-      stbi__addsizes_valid(a*b*c, add);
 1217+	return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a * b, c) &&
 1218+	       stbi__addsizes_valid(a * b * c, add);
 1219 }
 1220 
 1221-// returns 1 if "a*b*c*d + add" has no negative terms/factors and doesn't overflow
 1222+// returns 1 if "a*b*c*d + add" has no negative terms/factors and doesn't
 1223+// overflow
 1224 #if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM)
 1225-static int stbi__mad4sizes_valid(int a, int b, int c, int d, int add)
 1226+static int
 1227+stbi__mad4sizes_valid(int a, int b, int c, int d, int add)
 1228 {
 1229-   return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a*b, c) &&
 1230-      stbi__mul2sizes_valid(a*b*c, d) && stbi__addsizes_valid(a*b*c*d, add);
 1231+	return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a * b, c) &&
 1232+	       stbi__mul2sizes_valid(a * b * c, d) &&
 1233+	       stbi__addsizes_valid(a * b * c * d, add);
 1234 }
 1235 #endif
 1236 
 1237-#if !defined(STBI_NO_JPEG) || !defined(STBI_NO_PNG) || !defined(STBI_NO_TGA) || !defined(STBI_NO_HDR)
 1238+#if !defined(STBI_NO_JPEG) || !defined(STBI_NO_PNG) ||                         \
 1239+    !defined(STBI_NO_TGA) || !defined(STBI_NO_HDR)
 1240 // mallocs with size overflow checking
 1241-static void *stbi__malloc_mad2(int a, int b, int add)
 1242+static void *
 1243+stbi__malloc_mad2(int a, int b, int add)
 1244 {
 1245-   if (!stbi__mad2sizes_valid(a, b, add)) return NULL;
 1246-   return stbi__malloc(a*b + add);
 1247+	if (!stbi__mad2sizes_valid(a, b, add)) {
 1248+		return NULL;
 1249+	}
 1250+	return stbi__malloc(a * b + add);
 1251 }
 1252 #endif
 1253 
 1254-static void *stbi__malloc_mad3(int a, int b, int c, int add)
 1255+static void *
 1256+stbi__malloc_mad3(int a, int b, int c, int add)
 1257 {
 1258-   if (!stbi__mad3sizes_valid(a, b, c, add)) return NULL;
 1259-   return stbi__malloc(a*b*c + add);
 1260+	if (!stbi__mad3sizes_valid(a, b, c, add)) {
 1261+		return NULL;
 1262+	}
 1263+	return stbi__malloc(a * b * c + add);
 1264 }
 1265 
 1266 #if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM)
 1267-static void *stbi__malloc_mad4(int a, int b, int c, int d, int add)
 1268+static void *
 1269+stbi__malloc_mad4(int a, int b, int c, int d, int add)
 1270 {
 1271-   if (!stbi__mad4sizes_valid(a, b, c, d, add)) return NULL;
 1272-   return stbi__malloc(a*b*c*d + add);
 1273+	if (!stbi__mad4sizes_valid(a, b, c, d, add)) {
 1274+		return NULL;
 1275+	}
 1276+	return stbi__malloc(a * b * c * d + add);
 1277 }
 1278 #endif
 1279 
 1280-// returns 1 if the sum of two signed ints is valid (between -2^31 and 2^31-1 inclusive), 0 on overflow.
 1281-static int stbi__addints_valid(int a, int b)
 1282+// returns 1 if the sum of two signed ints is valid (between -2^31 and 2^31-1
 1283+// inclusive), 0 on overflow.
 1284+static int
 1285+stbi__addints_valid(int a, int b)
 1286 {
 1287-   if ((a >= 0) != (b >= 0)) return 1; // a and b have different signs, so no overflow
 1288-   if (a < 0 && b < 0) return a >= INT_MIN - b; // same as a + b >= INT_MIN; INT_MIN - b cannot overflow since b < 0.
 1289-   return a <= INT_MAX - b;
 1290+	if ((a >= 0) != (b >= 0)) {
 1291+		return 1; // a and b have different signs, so no overflow
 1292+	}
 1293+	if (a < 0 && b < 0) {
 1294+		return a >= INT_MIN - b; // same as a + b >= INT_MIN; INT_MIN - b cannot
 1295+		                         // overflow since b < 0.
 1296+	}
 1297+	return a <= INT_MAX - b;
 1298 }
 1299 
 1300 // returns 1 if the product of two ints fits in a signed short, 0 on overflow.
 1301-static int stbi__mul2shorts_valid(int a, int b)
 1302-{
 1303-   if (b == 0 || b == -1) return 1; // multiplication by 0 is always 0; check for -1 so SHRT_MIN/b doesn't overflow
 1304-   if ((a >= 0) == (b >= 0)) return a <= SHRT_MAX/b; // product is positive, so similar to mul2sizes_valid
 1305-   if (b < 0) return a <= SHRT_MIN / b; // same as a * b >= SHRT_MIN
 1306-   return a >= SHRT_MIN / b;
 1307+static int
 1308+stbi__mul2shorts_valid(int a, int b)
 1309+{
 1310+	if (b == 0 || b == -1) {
 1311+		return 1; // multiplication by 0 is always 0; check for -1 so SHRT_MIN/b
 1312+		          // doesn't overflow
 1313+	}
 1314+	if ((a >= 0) == (b >= 0)) {
 1315+		return a <= SHRT_MAX /
 1316+		                b; // product is positive, so similar to mul2sizes_valid
 1317+	}
 1318+	if (b < 0) {
 1319+		return a <= SHRT_MIN / b; // same as a * b >= SHRT_MIN
 1320+	}
 1321+	return a >= SHRT_MIN / b;
 1322 }
 1323 
 1324 // stbi__err - error
 1325@@ -1088,423 +1251,524 @@ static int stbi__mul2shorts_valid(int a, int b)
 1326 // stbi__errpuc - error returning pointer to unsigned char
 1327 
 1328 #ifdef STBI_NO_FAILURE_STRINGS
 1329-   #define stbi__err(x,y)  0
 1330+#define stbi__err(x, y) 0
 1331 #elif defined(STBI_FAILURE_USERMSG)
 1332-   #define stbi__err(x,y)  stbi__err(y)
 1333+#define stbi__err(x, y) stbi__err(y)
 1334 #else
 1335-   #define stbi__err(x,y)  stbi__err(x)
 1336+#define stbi__err(x, y) stbi__err(x)
 1337 #endif
 1338 
 1339-#define stbi__errpf(x,y)   ((float *)(size_t) (stbi__err(x,y)?NULL:NULL))
 1340-#define stbi__errpuc(x,y)  ((unsigned char *)(size_t) (stbi__err(x,y)?NULL:NULL))
 1341+#define stbi__errpf(x, y) ((float *)(size_t)(stbi__err(x, y) ? NULL : NULL))
 1342+#define stbi__errpuc(x, y)                                                     \
 1343+	((unsigned char *)(size_t)(stbi__err(x, y) ? NULL : NULL))
 1344 
 1345-STBIDEF void stbi_image_free(void *retval_from_stbi_load)
 1346+STBIDEF void
 1347+stbi_image_free(void *retval_from_stbi_load)
 1348 {
 1349-   STBI_FREE(retval_from_stbi_load);
 1350+	STBI_FREE(retval_from_stbi_load);
 1351 }
 1352 
 1353 #ifndef STBI_NO_LINEAR
 1354-static float   *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp);
 1355+static float *
 1356+stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp);
 1357 #endif
 1358 
 1359 #ifndef STBI_NO_HDR
 1360-static stbi_uc *stbi__hdr_to_ldr(float   *data, int x, int y, int comp);
 1361+static stbi_uc *
 1362+stbi__hdr_to_ldr(float *data, int x, int y, int comp);
 1363 #endif
 1364 
 1365 static int stbi__vertically_flip_on_load_global = 0;
 1366 
 1367-STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip)
 1368+STBIDEF void
 1369+stbi_set_flip_vertically_on_load(int flag_true_if_should_flip)
 1370 {
 1371-   stbi__vertically_flip_on_load_global = flag_true_if_should_flip;
 1372+	stbi__vertically_flip_on_load_global = flag_true_if_should_flip;
 1373 }
 1374 
 1375 #ifndef STBI_THREAD_LOCAL
 1376-#define stbi__vertically_flip_on_load  stbi__vertically_flip_on_load_global
 1377+#define stbi__vertically_flip_on_load stbi__vertically_flip_on_load_global
 1378 #else
 1379-static STBI_THREAD_LOCAL int stbi__vertically_flip_on_load_local, stbi__vertically_flip_on_load_set;
 1380+static STBI_THREAD_LOCAL int stbi__vertically_flip_on_load_local,
 1381+    stbi__vertically_flip_on_load_set;
 1382 
 1383-STBIDEF void stbi_set_flip_vertically_on_load_thread(int flag_true_if_should_flip)
 1384+STBIDEF void
 1385+stbi_set_flip_vertically_on_load_thread(int flag_true_if_should_flip)
 1386 {
 1387-   stbi__vertically_flip_on_load_local = flag_true_if_should_flip;
 1388-   stbi__vertically_flip_on_load_set = 1;
 1389+	stbi__vertically_flip_on_load_local = flag_true_if_should_flip;
 1390+	stbi__vertically_flip_on_load_set = 1;
 1391 }
 1392 
 1393-#define stbi__vertically_flip_on_load  (stbi__vertically_flip_on_load_set       \
 1394-                                         ? stbi__vertically_flip_on_load_local  \
 1395-                                         : stbi__vertically_flip_on_load_global)
 1396+#define stbi__vertically_flip_on_load                                          \
 1397+	(stbi__vertically_flip_on_load_set ? stbi__vertically_flip_on_load_local   \
 1398+	                                   : stbi__vertically_flip_on_load_global)
 1399 #endif // STBI_THREAD_LOCAL
 1400 
 1401-static void *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc)
 1402-{
 1403-   memset(ri, 0, sizeof(*ri)); // make sure it's initialized if we add new fields
 1404-   ri->bits_per_channel = 8; // default is 8 so most paths don't have to be changed
 1405-   ri->channel_order = STBI_ORDER_RGB; // all current input & output are this, but this is here so we can add BGR order
 1406-   ri->num_channels = 0;
 1407-
 1408-   // test the formats with a very explicit header first (at least a FOURCC
 1409-   // or distinctive magic number first)
 1410-   #ifndef STBI_NO_PNG
 1411-   if (stbi__png_test(s))  return stbi__png_load(s,x,y,comp,req_comp, ri);
 1412-   #endif
 1413-   #ifndef STBI_NO_BMP
 1414-   if (stbi__bmp_test(s))  return stbi__bmp_load(s,x,y,comp,req_comp, ri);
 1415-   #endif
 1416-   #ifndef STBI_NO_GIF
 1417-   if (stbi__gif_test(s))  return stbi__gif_load(s,x,y,comp,req_comp, ri);
 1418-   #endif
 1419-   #ifndef STBI_NO_PSD
 1420-   if (stbi__psd_test(s))  return stbi__psd_load(s,x,y,comp,req_comp, ri, bpc);
 1421-   #else
 1422-   STBI_NOTUSED(bpc);
 1423-   #endif
 1424-   #ifndef STBI_NO_PIC
 1425-   if (stbi__pic_test(s))  return stbi__pic_load(s,x,y,comp,req_comp, ri);
 1426-   #endif
 1427-
 1428-   // then the formats that can end up attempting to load with just 1 or 2
 1429-   // bytes matching expectations; these are prone to false positives, so
 1430-   // try them later
 1431-   #ifndef STBI_NO_JPEG
 1432-   if (stbi__jpeg_test(s)) return stbi__jpeg_load(s,x,y,comp,req_comp, ri);
 1433-   #endif
 1434-   #ifndef STBI_NO_PNM
 1435-   if (stbi__pnm_test(s))  return stbi__pnm_load(s,x,y,comp,req_comp, ri);
 1436-   #endif
 1437-
 1438-   #ifndef STBI_NO_HDR
 1439-   if (stbi__hdr_test(s)) {
 1440-      float *hdr = stbi__hdr_load(s, x,y,comp,req_comp, ri);
 1441-      return stbi__hdr_to_ldr(hdr, *x, *y, req_comp ? req_comp : *comp);
 1442-   }
 1443-   #endif
 1444-
 1445-   #ifndef STBI_NO_TGA
 1446-   // test tga last because it's a crappy test!
 1447-   if (stbi__tga_test(s))
 1448-      return stbi__tga_load(s,x,y,comp,req_comp, ri);
 1449-   #endif
 1450-
 1451-   return stbi__errpuc("unknown image type", "Image not of any known type, or corrupt");
 1452-}
 1453-
 1454-static stbi_uc *stbi__convert_16_to_8(stbi__uint16 *orig, int w, int h, int channels)
 1455-{
 1456-   int i;
 1457-   int img_len = w * h * channels;
 1458-   stbi_uc *reduced;
 1459-
 1460-   reduced = (stbi_uc *) stbi__malloc(img_len);
 1461-   if (reduced == NULL) return stbi__errpuc("outofmem", "Out of memory");
 1462-
 1463-   for (i = 0; i < img_len; ++i)
 1464-      reduced[i] = (stbi_uc)((orig[i] >> 8) & 0xFF); // top half of each byte is sufficient approx of 16->8 bit scaling
 1465-
 1466-   STBI_FREE(orig);
 1467-   return reduced;
 1468-}
 1469-
 1470-static stbi__uint16 *stbi__convert_8_to_16(stbi_uc *orig, int w, int h, int channels)
 1471-{
 1472-   int i;
 1473-   int img_len = w * h * channels;
 1474-   stbi__uint16 *enlarged;
 1475-
 1476-   enlarged = (stbi__uint16 *) stbi__malloc(img_len*2);
 1477-   if (enlarged == NULL) return (stbi__uint16 *) stbi__errpuc("outofmem", "Out of memory");
 1478-
 1479-   for (i = 0; i < img_len; ++i)
 1480-      enlarged[i] = (stbi__uint16)((orig[i] << 8) + orig[i]); // replicate to high and low byte, maps 0->0, 255->0xffff
 1481-
 1482-   STBI_FREE(orig);
 1483-   return enlarged;
 1484-}
 1485-
 1486-static void stbi__vertical_flip(void *image, int w, int h, int bytes_per_pixel)
 1487-{
 1488-   int row;
 1489-   size_t bytes_per_row = (size_t)w * bytes_per_pixel;
 1490-   stbi_uc temp[2048];
 1491-   stbi_uc *bytes = (stbi_uc *)image;
 1492-
 1493-   for (row = 0; row < (h>>1); row++) {
 1494-      stbi_uc *row0 = bytes + row*bytes_per_row;
 1495-      stbi_uc *row1 = bytes + (h - row - 1)*bytes_per_row;
 1496-      // swap row0 with row1
 1497-      size_t bytes_left = bytes_per_row;
 1498-      while (bytes_left) {
 1499-         size_t bytes_copy = (bytes_left < sizeof(temp)) ? bytes_left : sizeof(temp);
 1500-         memcpy(temp, row0, bytes_copy);
 1501-         memcpy(row0, row1, bytes_copy);
 1502-         memcpy(row1, temp, bytes_copy);
 1503-         row0 += bytes_copy;
 1504-         row1 += bytes_copy;
 1505-         bytes_left -= bytes_copy;
 1506-      }
 1507-   }
 1508+static void *
 1509+stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int req_comp,
 1510+                stbi__result_info *ri, int bpc)
 1511+{
 1512+	memset(ri, 0,
 1513+	       sizeof(*ri)); // make sure it's initialized if we add new fields
 1514+	ri->bits_per_channel =
 1515+	    8; // default is 8 so most paths don't have to be changed
 1516+	ri->channel_order =
 1517+	    STBI_ORDER_RGB; // all current input & output are this, but this is here
 1518+	                    // so we can add BGR order
 1519+	ri->num_channels = 0;
 1520+
 1521+// test the formats with a very explicit header first (at least a FOURCC
 1522+// or distinctive magic number first)
 1523+#ifndef STBI_NO_PNG
 1524+	if (stbi__png_test(s)) {
 1525+		return stbi__png_load(s, x, y, comp, req_comp, ri);
 1526+	}
 1527+#endif
 1528+#ifndef STBI_NO_BMP
 1529+	if (stbi__bmp_test(s)) {
 1530+		return stbi__bmp_load(s, x, y, comp, req_comp, ri);
 1531+	}
 1532+#endif
 1533+#ifndef STBI_NO_GIF
 1534+	if (stbi__gif_test(s)) {
 1535+		return stbi__gif_load(s, x, y, comp, req_comp, ri);
 1536+	}
 1537+#endif
 1538+#ifndef STBI_NO_PSD
 1539+	if (stbi__psd_test(s)) {
 1540+		return stbi__psd_load(s, x, y, comp, req_comp, ri, bpc);
 1541+	}
 1542+#else
 1543+	STBI_NOTUSED(bpc);
 1544+#endif
 1545+#ifndef STBI_NO_PIC
 1546+	if (stbi__pic_test(s)) {
 1547+		return stbi__pic_load(s, x, y, comp, req_comp, ri);
 1548+	}
 1549+#endif
 1550+
 1551+// then the formats that can end up attempting to load with just 1 or 2
 1552+// bytes matching expectations; these are prone to false positives, so
 1553+// try them later
 1554+#ifndef STBI_NO_JPEG
 1555+	if (stbi__jpeg_test(s)) {
 1556+		return stbi__jpeg_load(s, x, y, comp, req_comp, ri);
 1557+	}
 1558+#endif
 1559+#ifndef STBI_NO_PNM
 1560+	if (stbi__pnm_test(s)) {
 1561+		return stbi__pnm_load(s, x, y, comp, req_comp, ri);
 1562+	}
 1563+#endif
 1564+
 1565+#ifndef STBI_NO_HDR
 1566+	if (stbi__hdr_test(s)) {
 1567+		float *hdr = stbi__hdr_load(s, x, y, comp, req_comp, ri);
 1568+		return stbi__hdr_to_ldr(hdr, *x, *y, req_comp ? req_comp : *comp);
 1569+	}
 1570+#endif
 1571+
 1572+#ifndef STBI_NO_TGA
 1573+	// test tga last because it's a crappy test!
 1574+	if (stbi__tga_test(s)) {
 1575+		return stbi__tga_load(s, x, y, comp, req_comp, ri);
 1576+	}
 1577+#endif
 1578+
 1579+	return stbi__errpuc("unknown image type",
 1580+	                    "Image not of any known type, or corrupt");
 1581+}
 1582+
 1583+static stbi_uc *
 1584+stbi__convert_16_to_8(stbi__uint16 *orig, int w, int h, int channels)
 1585+{
 1586+	int i;
 1587+	int img_len = w * h * channels;
 1588+	stbi_uc *reduced;
 1589+
 1590+	reduced = (stbi_uc *)stbi__malloc(img_len);
 1591+	if (reduced == NULL) {
 1592+		return stbi__errpuc("outofmem", "Out of memory");
 1593+	}
 1594+
 1595+	for (i = 0; i < img_len; ++i) {
 1596+		reduced[i] = (stbi_uc)((orig[i] >> 8) &
 1597+		                       0xFF); // top half of each byte is sufficient
 1598+		                              // approx of 16->8 bit scaling
 1599+	}
 1600+
 1601+	STBI_FREE(orig);
 1602+	return reduced;
 1603+}
 1604+
 1605+static stbi__uint16 *
 1606+stbi__convert_8_to_16(stbi_uc *orig, int w, int h, int channels)
 1607+{
 1608+	int i;
 1609+	int img_len = w * h * channels;
 1610+	stbi__uint16 *enlarged;
 1611+
 1612+	enlarged = (stbi__uint16 *)stbi__malloc(img_len * 2);
 1613+	if (enlarged == NULL) {
 1614+		return (stbi__uint16 *)stbi__errpuc("outofmem", "Out of memory");
 1615+	}
 1616+
 1617+	for (i = 0; i < img_len; ++i) {
 1618+		enlarged[i] = (stbi__uint16)((orig[i] << 8) +
 1619+		                             orig[i]); // replicate to high and low
 1620+		                                       // byte, maps 0->0, 255->0xffff
 1621+	}
 1622+
 1623+	STBI_FREE(orig);
 1624+	return enlarged;
 1625+}
 1626+
 1627+static void
 1628+stbi__vertical_flip(void *image, int w, int h, int bytes_per_pixel)
 1629+{
 1630+	int row;
 1631+	size_t bytes_per_row = (size_t)w * bytes_per_pixel;
 1632+	stbi_uc temp[2048];
 1633+	stbi_uc *bytes = (stbi_uc *)image;
 1634+
 1635+	for (row = 0; row < (h >> 1); row++) {
 1636+		stbi_uc *row0 = bytes + row * bytes_per_row;
 1637+		stbi_uc *row1 = bytes + (h - row - 1) * bytes_per_row;
 1638+		// swap row0 with row1
 1639+		size_t bytes_left = bytes_per_row;
 1640+		while (bytes_left) {
 1641+			size_t bytes_copy =
 1642+			    (bytes_left < sizeof(temp)) ? bytes_left : sizeof(temp);
 1643+			memcpy(temp, row0, bytes_copy);
 1644+			memcpy(row0, row1, bytes_copy);
 1645+			memcpy(row1, temp, bytes_copy);
 1646+			row0 += bytes_copy;
 1647+			row1 += bytes_copy;
 1648+			bytes_left -= bytes_copy;
 1649+		}
 1650+	}
 1651 }
 1652 
 1653 #ifndef STBI_NO_GIF
 1654-static void stbi__vertical_flip_slices(void *image, int w, int h, int z, int bytes_per_pixel)
 1655+static void
 1656+stbi__vertical_flip_slices(void *image, int w, int h, int z,
 1657+                           int bytes_per_pixel)
 1658 {
 1659-   int slice;
 1660-   int slice_size = w * h * bytes_per_pixel;
 1661+	int slice;
 1662+	int slice_size = w * h * bytes_per_pixel;
 1663 
 1664-   stbi_uc *bytes = (stbi_uc *)image;
 1665-   for (slice = 0; slice < z; ++slice) {
 1666-      stbi__vertical_flip(bytes, w, h, bytes_per_pixel);
 1667-      bytes += slice_size;
 1668-   }
 1669+	stbi_uc *bytes = (stbi_uc *)image;
 1670+	for (slice = 0; slice < z; ++slice) {
 1671+		stbi__vertical_flip(bytes, w, h, bytes_per_pixel);
 1672+		bytes += slice_size;
 1673+	}
 1674 }
 1675 #endif
 1676 
 1677-static unsigned char *stbi__load_and_postprocess_8bit(stbi__context *s, int *x, int *y, int *comp, int req_comp)
 1678+static unsigned char *
 1679+stbi__load_and_postprocess_8bit(stbi__context *s, int *x, int *y, int *comp,
 1680+                                int req_comp)
 1681 {
 1682-   stbi__result_info ri;
 1683-   void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 8);
 1684+	stbi__result_info ri;
 1685+	void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 8);
 1686 
 1687-   if (result == NULL)
 1688-      return NULL;
 1689+	if (result == NULL) {
 1690+		return NULL;
 1691+	}
 1692 
 1693-   // it is the responsibility of the loaders to make sure we get either 8 or 16 bit.
 1694-   STBI_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16);
 1695+	// it is the responsibility of the loaders to make sure we get either 8 or
 1696+	// 16 bit.
 1697+	STBI_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16);
 1698 
 1699-   if (ri.bits_per_channel != 8) {
 1700-      result = stbi__convert_16_to_8((stbi__uint16 *) result, *x, *y, req_comp == 0 ? *comp : req_comp);
 1701-      ri.bits_per_channel = 8;
 1702-   }
 1703+	if (ri.bits_per_channel != 8) {
 1704+		result = stbi__convert_16_to_8((stbi__uint16 *)result, *x, *y,
 1705+		                               req_comp == 0 ? *comp : req_comp);
 1706+		ri.bits_per_channel = 8;
 1707+	}
 1708 
 1709-   // @TODO: move stbi__convert_format to here
 1710+	// @TODO: move stbi__convert_format to here
 1711 
 1712-   if (stbi__vertically_flip_on_load) {
 1713-      int channels = req_comp ? req_comp : *comp;
 1714-      stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi_uc));
 1715-   }
 1716+	if (stbi__vertically_flip_on_load) {
 1717+		int channels = req_comp ? req_comp : *comp;
 1718+		stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi_uc));
 1719+	}
 1720 
 1721-   return (unsigned char *) result;
 1722+	return (unsigned char *)result;
 1723 }
 1724 
 1725-static stbi__uint16 *stbi__load_and_postprocess_16bit(stbi__context *s, int *x, int *y, int *comp, int req_comp)
 1726+static stbi__uint16 *
 1727+stbi__load_and_postprocess_16bit(stbi__context *s, int *x, int *y, int *comp,
 1728+                                 int req_comp)
 1729 {
 1730-   stbi__result_info ri;
 1731-   void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 16);
 1732+	stbi__result_info ri;
 1733+	void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 16);
 1734 
 1735-   if (result == NULL)
 1736-      return NULL;
 1737+	if (result == NULL) {
 1738+		return NULL;
 1739+	}
 1740 
 1741-   // it is the responsibility of the loaders to make sure we get either 8 or 16 bit.
 1742-   STBI_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16);
 1743+	// it is the responsibility of the loaders to make sure we get either 8 or
 1744+	// 16 bit.
 1745+	STBI_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16);
 1746 
 1747-   if (ri.bits_per_channel != 16) {
 1748-      result = stbi__convert_8_to_16((stbi_uc *) result, *x, *y, req_comp == 0 ? *comp : req_comp);
 1749-      ri.bits_per_channel = 16;
 1750-   }
 1751+	if (ri.bits_per_channel != 16) {
 1752+		result = stbi__convert_8_to_16((stbi_uc *)result, *x, *y,
 1753+		                               req_comp == 0 ? *comp : req_comp);
 1754+		ri.bits_per_channel = 16;
 1755+	}
 1756 
 1757-   // @TODO: move stbi__convert_format16 to here
 1758-   // @TODO: special case RGB-to-Y (and RGBA-to-YA) for 8-bit-to-16-bit case to keep more precision
 1759+	// @TODO: move stbi__convert_format16 to here
 1760+	// @TODO: special case RGB-to-Y (and RGBA-to-YA) for 8-bit-to-16-bit case to
 1761+	// keep more precision
 1762 
 1763-   if (stbi__vertically_flip_on_load) {
 1764-      int channels = req_comp ? req_comp : *comp;
 1765-      stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi__uint16));
 1766-   }
 1767+	if (stbi__vertically_flip_on_load) {
 1768+		int channels = req_comp ? req_comp : *comp;
 1769+		stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi__uint16));
 1770+	}
 1771 
 1772-   return (stbi__uint16 *) result;
 1773+	return (stbi__uint16 *)result;
 1774 }
 1775 
 1776 #if !defined(STBI_NO_HDR) && !defined(STBI_NO_LINEAR)
 1777-static void stbi__float_postprocess(float *result, int *x, int *y, int *comp, int req_comp)
 1778+static void
 1779+stbi__float_postprocess(float *result, int *x, int *y, int *comp, int req_comp)
 1780 {
 1781-   if (stbi__vertically_flip_on_load && result != NULL) {
 1782-      int channels = req_comp ? req_comp : *comp;
 1783-      stbi__vertical_flip(result, *x, *y, channels * sizeof(float));
 1784-   }
 1785+	if (stbi__vertically_flip_on_load && result != NULL) {
 1786+		int channels = req_comp ? req_comp : *comp;
 1787+		stbi__vertical_flip(result, *x, *y, channels * sizeof(float));
 1788+	}
 1789 }
 1790 #endif
 1791 
 1792 #ifndef STBI_NO_STDIO
 1793 
 1794 #if defined(_WIN32) && defined(STBI_WINDOWS_UTF8)
 1795-STBI_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(unsigned int cp, unsigned long flags, const char *str, int cbmb, wchar_t *widestr, int cchwide);
 1796-STBI_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide, char *str, int cbmb, const char *defchar, int *used_default);
 1797+STBI_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(
 1798+    unsigned int cp, unsigned long flags, const char *str, int cbmb,
 1799+    wchar_t *widestr, int cchwide);
 1800+STBI_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(
 1801+    unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide,
 1802+    char *str, int cbmb, const char *defchar, int *used_default);
 1803 #endif
 1804 
 1805 #if defined(_WIN32) && defined(STBI_WINDOWS_UTF8)
 1806-STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input)
 1807+STBIDEF int
 1808+stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t *input)
 1809 {
 1810-	return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL);
 1811+	return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer,
 1812+	                           (int)bufferlen, NULL, NULL);
 1813 }
 1814 #endif
 1815 
 1816-static FILE *stbi__fopen(char const *filename, char const *mode)
 1817+static FILE *
 1818+stbi__fopen(char const *filename, char const *mode)
 1819 {
 1820-   FILE *f;
 1821+	FILE *f;
 1822 #if defined(_WIN32) && defined(STBI_WINDOWS_UTF8)
 1823-   wchar_t wMode[64];
 1824-   wchar_t wFilename[1024];
 1825-	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename)))
 1826-      return 0;
 1827-
 1828-	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode)))
 1829-      return 0;
 1830+	wchar_t wMode[64];
 1831+	wchar_t wFilename[1024];
 1832+	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename,
 1833+	                             sizeof(wFilename) / sizeof(*wFilename))) {
 1834+		return 0;
 1835+	}
 1836+
 1837+	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode,
 1838+	                             sizeof(wMode) / sizeof(*wMode))) {
 1839+		return 0;
 1840+	}
 1841 
 1842 #if defined(_MSC_VER) && _MSC_VER >= 1400
 1843-	if (0 != _wfopen_s(&f, wFilename, wMode))
 1844+	if (0 != _wfopen_s(&f, wFilename, wMode)) {
 1845 		f = 0;
 1846+	}
 1847 #else
 1848-   f = _wfopen(wFilename, wMode);
 1849+	f = _wfopen(wFilename, wMode);
 1850 #endif
 1851 
 1852 #elif defined(_MSC_VER) && _MSC_VER >= 1400
 1853-   if (0 != fopen_s(&f, filename, mode))
 1854-      f=0;
 1855+	if (0 != fopen_s(&f, filename, mode)) {
 1856+		f = 0;
 1857+	}
 1858 #else
 1859-   f = fopen(filename, mode);
 1860+	f = fopen(filename, mode);
 1861 #endif
 1862-   return f;
 1863+	return f;
 1864 }
 1865 
 1866-
 1867-STBIDEF stbi_uc *stbi_load(char const *filename, int *x, int *y, int *comp, int req_comp)
 1868+STBIDEF stbi_uc *
 1869+stbi_load(char const *filename, int *x, int *y, int *comp, int req_comp)
 1870 {
 1871-   FILE *f = stbi__fopen(filename, "rb");
 1872-   unsigned char *result;
 1873-   if (!f) return stbi__errpuc("can't fopen", "Unable to open file");
 1874-   result = stbi_load_from_file(f,x,y,comp,req_comp);
 1875-   fclose(f);
 1876-   return result;
 1877+	FILE *f = stbi__fopen(filename, "rb");
 1878+	unsigned char *result;
 1879+	if (!f) {
 1880+		return stbi__errpuc("can't fopen", "Unable to open file");
 1881+	}
 1882+	result = stbi_load_from_file(f, x, y, comp, req_comp);
 1883+	fclose(f);
 1884+	return result;
 1885 }
 1886 
 1887-STBIDEF stbi_uc *stbi_load_from_file(FILE *f, int *x, int *y, int *comp, int req_comp)
 1888+STBIDEF stbi_uc *
 1889+stbi_load_from_file(FILE *f, int *x, int *y, int *comp, int req_comp)
 1890 {
 1891-   unsigned char *result;
 1892-   stbi__context s;
 1893-   stbi__start_file(&s,f);
 1894-   result = stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp);
 1895-   if (result) {
 1896-      // need to 'unget' all the characters in the IO buffer
 1897-      fseek(f, - (int) (s.img_buffer_end - s.img_buffer), SEEK_CUR);
 1898-   }
 1899-   return result;
 1900+	unsigned char *result;
 1901+	stbi__context s;
 1902+	stbi__start_file(&s, f);
 1903+	result = stbi__load_and_postprocess_8bit(&s, x, y, comp, req_comp);
 1904+	if (result) {
 1905+		// need to 'unget' all the characters in the IO buffer
 1906+		fseek(f, -(int)(s.img_buffer_end - s.img_buffer), SEEK_CUR);
 1907+	}
 1908+	return result;
 1909 }
 1910 
 1911-STBIDEF stbi__uint16 *stbi_load_from_file_16(FILE *f, int *x, int *y, int *comp, int req_comp)
 1912+STBIDEF stbi__uint16 *
 1913+stbi_load_from_file_16(FILE *f, int *x, int *y, int *comp, int req_comp)
 1914 {
 1915-   stbi__uint16 *result;
 1916-   stbi__context s;
 1917-   stbi__start_file(&s,f);
 1918-   result = stbi__load_and_postprocess_16bit(&s,x,y,comp,req_comp);
 1919-   if (result) {
 1920-      // need to 'unget' all the characters in the IO buffer
 1921-      fseek(f, - (int) (s.img_buffer_end - s.img_buffer), SEEK_CUR);
 1922-   }
 1923-   return result;
 1924+	stbi__uint16 *result;
 1925+	stbi__context s;
 1926+	stbi__start_file(&s, f);
 1927+	result = stbi__load_and_postprocess_16bit(&s, x, y, comp, req_comp);
 1928+	if (result) {
 1929+		// need to 'unget' all the characters in the IO buffer
 1930+		fseek(f, -(int)(s.img_buffer_end - s.img_buffer), SEEK_CUR);
 1931+	}
 1932+	return result;
 1933 }
 1934 
 1935-STBIDEF stbi_us *stbi_load_16(char const *filename, int *x, int *y, int *comp, int req_comp)
 1936+STBIDEF stbi_us *
 1937+stbi_load_16(char const *filename, int *x, int *y, int *comp, int req_comp)
 1938 {
 1939-   FILE *f = stbi__fopen(filename, "rb");
 1940-   stbi__uint16 *result;
 1941-   if (!f) return (stbi_us *) stbi__errpuc("can't fopen", "Unable to open file");
 1942-   result = stbi_load_from_file_16(f,x,y,comp,req_comp);
 1943-   fclose(f);
 1944-   return result;
 1945+	FILE *f = stbi__fopen(filename, "rb");
 1946+	stbi__uint16 *result;
 1947+	if (!f) {
 1948+		return (stbi_us *)stbi__errpuc("can't fopen", "Unable to open file");
 1949+	}
 1950+	result = stbi_load_from_file_16(f, x, y, comp, req_comp);
 1951+	fclose(f);
 1952+	return result;
 1953 }
 1954 
 1955+#endif //! STBI_NO_STDIO
 1956 
 1957-#endif //!STBI_NO_STDIO
 1958-
 1959-STBIDEF stbi_us *stbi_load_16_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels)
 1960+STBIDEF stbi_us *
 1961+stbi_load_16_from_memory(stbi_uc const *buffer, int len, int *x, int *y,
 1962+                         int *channels_in_file, int desired_channels)
 1963 {
 1964-   stbi__context s;
 1965-   stbi__start_mem(&s,buffer,len);
 1966-   return stbi__load_and_postprocess_16bit(&s,x,y,channels_in_file,desired_channels);
 1967+	stbi__context s;
 1968+	stbi__start_mem(&s, buffer, len);
 1969+	return stbi__load_and_postprocess_16bit(&s, x, y, channels_in_file,
 1970+	                                        desired_channels);
 1971 }
 1972 
 1973-STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels)
 1974+STBIDEF stbi_us *
 1975+stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x,
 1976+                            int *y, int *channels_in_file, int desired_channels)
 1977 {
 1978-   stbi__context s;
 1979-   stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user);
 1980-   return stbi__load_and_postprocess_16bit(&s,x,y,channels_in_file,desired_channels);
 1981+	stbi__context s;
 1982+	stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user);
 1983+	return stbi__load_and_postprocess_16bit(&s, x, y, channels_in_file,
 1984+	                                        desired_channels);
 1985 }
 1986 
 1987-STBIDEF stbi_uc *stbi_load_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp)
 1988+STBIDEF stbi_uc *
 1989+stbi_load_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp,
 1990+                      int req_comp)
 1991 {
 1992-   stbi__context s;
 1993-   stbi__start_mem(&s,buffer,len);
 1994-   return stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp);
 1995+	stbi__context s;
 1996+	stbi__start_mem(&s, buffer, len);
 1997+	return stbi__load_and_postprocess_8bit(&s, x, y, comp, req_comp);
 1998 }
 1999 
 2000-STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp)
 2001+STBIDEF stbi_uc *
 2002+stbi_load_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x,
 2003+                         int *y, int *comp, int req_comp)
 2004 {
 2005-   stbi__context s;
 2006-   stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user);
 2007-   return stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp);
 2008+	stbi__context s;
 2009+	stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user);
 2010+	return stbi__load_and_postprocess_8bit(&s, x, y, comp, req_comp);
 2011 }
 2012 
 2013 #ifndef STBI_NO_GIF
 2014-STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len, int **delays, int *x, int *y, int *z, int *comp, int req_comp)
 2015+STBIDEF stbi_uc *
 2016+stbi_load_gif_from_memory(stbi_uc const *buffer, int len, int **delays, int *x,
 2017+                          int *y, int *z, int *comp, int req_comp)
 2018 {
 2019-   unsigned char *result;
 2020-   stbi__context s;
 2021-   stbi__start_mem(&s,buffer,len);
 2022+	unsigned char *result;
 2023+	stbi__context s;
 2024+	stbi__start_mem(&s, buffer, len);
 2025 
 2026-   result = (unsigned char*) stbi__load_gif_main(&s, delays, x, y, z, comp, req_comp);
 2027-   if (stbi__vertically_flip_on_load) {
 2028-      stbi__vertical_flip_slices( result, *x, *y, *z, *comp );
 2029-   }
 2030+	result = (unsigned char *)stbi__load_gif_main(&s, delays, x, y, z, comp,
 2031+	                                              req_comp);
 2032+	if (stbi__vertically_flip_on_load) {
 2033+		stbi__vertical_flip_slices(result, *x, *y, *z, *comp);
 2034+	}
 2035 
 2036-   return result;
 2037+	return result;
 2038 }
 2039 #endif
 2040 
 2041 #ifndef STBI_NO_LINEAR
 2042-static float *stbi__loadf_main(stbi__context *s, int *x, int *y, int *comp, int req_comp)
 2043+static float *
 2044+stbi__loadf_main(stbi__context *s, int *x, int *y, int *comp, int req_comp)
 2045 {
 2046-   unsigned char *data;
 2047-   #ifndef STBI_NO_HDR
 2048-   if (stbi__hdr_test(s)) {
 2049-      stbi__result_info ri;
 2050-      float *hdr_data = stbi__hdr_load(s,x,y,comp,req_comp, &ri);
 2051-      if (hdr_data)
 2052-         stbi__float_postprocess(hdr_data,x,y,comp,req_comp);
 2053-      return hdr_data;
 2054-   }
 2055-   #endif
 2056-   data = stbi__load_and_postprocess_8bit(s, x, y, comp, req_comp);
 2057-   if (data)
 2058-      return stbi__ldr_to_hdr(data, *x, *y, req_comp ? req_comp : *comp);
 2059-   return stbi__errpf("unknown image type", "Image not of any known type, or corrupt");
 2060+	unsigned char *data;
 2061+#ifndef STBI_NO_HDR
 2062+	if (stbi__hdr_test(s)) {
 2063+		stbi__result_info ri;
 2064+		float *hdr_data = stbi__hdr_load(s, x, y, comp, req_comp, &ri);
 2065+		if (hdr_data) {
 2066+			stbi__float_postprocess(hdr_data, x, y, comp, req_comp);
 2067+		}
 2068+		return hdr_data;
 2069+	}
 2070+#endif
 2071+	data = stbi__load_and_postprocess_8bit(s, x, y, comp, req_comp);
 2072+	if (data) {
 2073+		return stbi__ldr_to_hdr(data, *x, *y, req_comp ? req_comp : *comp);
 2074+	}
 2075+	return stbi__errpf("unknown image type",
 2076+	                   "Image not of any known type, or corrupt");
 2077 }
 2078 
 2079-STBIDEF float *stbi_loadf_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp)
 2080+STBIDEF float *
 2081+stbi_loadf_from_memory(stbi_uc const *buffer, int len, int *x, int *y,
 2082+                       int *comp, int req_comp)
 2083 {
 2084-   stbi__context s;
 2085-   stbi__start_mem(&s,buffer,len);
 2086-   return stbi__loadf_main(&s,x,y,comp,req_comp);
 2087+	stbi__context s;
 2088+	stbi__start_mem(&s, buffer, len);
 2089+	return stbi__loadf_main(&s, x, y, comp, req_comp);
 2090 }
 2091 
 2092-STBIDEF float *stbi_loadf_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp)
 2093+STBIDEF float *
 2094+stbi_loadf_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x,
 2095+                          int *y, int *comp, int req_comp)
 2096 {
 2097-   stbi__context s;
 2098-   stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user);
 2099-   return stbi__loadf_main(&s,x,y,comp,req_comp);
 2100+	stbi__context s;
 2101+	stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user);
 2102+	return stbi__loadf_main(&s, x, y, comp, req_comp);
 2103 }
 2104 
 2105 #ifndef STBI_NO_STDIO
 2106-STBIDEF float *stbi_loadf(char const *filename, int *x, int *y, int *comp, int req_comp)
 2107+STBIDEF float *
 2108+stbi_loadf(char const *filename, int *x, int *y, int *comp, int req_comp)
 2109 {
 2110-   float *result;
 2111-   FILE *f = stbi__fopen(filename, "rb");
 2112-   if (!f) return stbi__errpf("can't fopen", "Unable to open file");
 2113-   result = stbi_loadf_from_file(f,x,y,comp,req_comp);
 2114-   fclose(f);
 2115-   return result;
 2116+	float *result;
 2117+	FILE *f = stbi__fopen(filename, "rb");
 2118+	if (!f) {
 2119+		return stbi__errpf("can't fopen", "Unable to open file");
 2120+	}
 2121+	result = stbi_loadf_from_file(f, x, y, comp, req_comp);
 2122+	fclose(f);
 2123+	return result;
 2124 }
 2125 
 2126-STBIDEF float *stbi_loadf_from_file(FILE *f, int *x, int *y, int *comp, int req_comp)
 2127+STBIDEF float *
 2128+stbi_loadf_from_file(FILE *f, int *x, int *y, int *comp, int req_comp)
 2129 {
 2130-   stbi__context s;
 2131-   stbi__start_file(&s,f);
 2132-   return stbi__loadf_main(&s,x,y,comp,req_comp);
 2133+	stbi__context s;
 2134+	stbi__start_file(&s, f);
 2135+	return stbi__loadf_main(&s, x, y, comp, req_comp);
 2136 }
 2137 #endif // !STBI_NO_STDIO
 2138 
 2139@@ -1514,222 +1778,262 @@ STBIDEF float *stbi_loadf_from_file(FILE *f, int *x, int *y, int *comp, int req_
 2140 // defined, for API simplicity; if STBI_NO_LINEAR is defined, it always
 2141 // reports false!
 2142 
 2143-STBIDEF int stbi_is_hdr_from_memory(stbi_uc const *buffer, int len)
 2144+STBIDEF int
 2145+stbi_is_hdr_from_memory(stbi_uc const *buffer, int len)
 2146 {
 2147-   #ifndef STBI_NO_HDR
 2148-   stbi__context s;
 2149-   stbi__start_mem(&s,buffer,len);
 2150-   return stbi__hdr_test(&s);
 2151-   #else
 2152-   STBI_NOTUSED(buffer);
 2153-   STBI_NOTUSED(len);
 2154-   return 0;
 2155-   #endif
 2156+#ifndef STBI_NO_HDR
 2157+	stbi__context s;
 2158+	stbi__start_mem(&s, buffer, len);
 2159+	return stbi__hdr_test(&s);
 2160+#else
 2161+	STBI_NOTUSED(buffer);
 2162+	STBI_NOTUSED(len);
 2163+	return 0;
 2164+#endif
 2165 }
 2166 
 2167 #ifndef STBI_NO_STDIO
 2168-STBIDEF int      stbi_is_hdr          (char const *filename)
 2169-{
 2170-   FILE *f = stbi__fopen(filename, "rb");
 2171-   int result=0;
 2172-   if (f) {
 2173-      result = stbi_is_hdr_from_file(f);
 2174-      fclose(f);
 2175-   }
 2176-   return result;
 2177-}
 2178-
 2179-STBIDEF int stbi_is_hdr_from_file(FILE *f)
 2180-{
 2181-   #ifndef STBI_NO_HDR
 2182-   long pos = ftell(f);
 2183-   int res;
 2184-   stbi__context s;
 2185-   stbi__start_file(&s,f);
 2186-   res = stbi__hdr_test(&s);
 2187-   fseek(f, pos, SEEK_SET);
 2188-   return res;
 2189-   #else
 2190-   STBI_NOTUSED(f);
 2191-   return 0;
 2192-   #endif
 2193+STBIDEF int
 2194+stbi_is_hdr(char const *filename)
 2195+{
 2196+	FILE *f = stbi__fopen(filename, "rb");
 2197+	int result = 0;
 2198+	if (f) {
 2199+		result = stbi_is_hdr_from_file(f);
 2200+		fclose(f);
 2201+	}
 2202+	return result;
 2203+}
 2204+
 2205+STBIDEF int
 2206+stbi_is_hdr_from_file(FILE *f)
 2207+{
 2208+#ifndef STBI_NO_HDR
 2209+	long pos = ftell(f);
 2210+	int res;
 2211+	stbi__context s;
 2212+	stbi__start_file(&s, f);
 2213+	res = stbi__hdr_test(&s);
 2214+	fseek(f, pos, SEEK_SET);
 2215+	return res;
 2216+#else
 2217+	STBI_NOTUSED(f);
 2218+	return 0;
 2219+#endif
 2220 }
 2221 #endif // !STBI_NO_STDIO
 2222 
 2223-STBIDEF int      stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user)
 2224+STBIDEF int
 2225+stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user)
 2226 {
 2227-   #ifndef STBI_NO_HDR
 2228-   stbi__context s;
 2229-   stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user);
 2230-   return stbi__hdr_test(&s);
 2231-   #else
 2232-   STBI_NOTUSED(clbk);
 2233-   STBI_NOTUSED(user);
 2234-   return 0;
 2235-   #endif
 2236+#ifndef STBI_NO_HDR
 2237+	stbi__context s;
 2238+	stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user);
 2239+	return stbi__hdr_test(&s);
 2240+#else
 2241+	STBI_NOTUSED(clbk);
 2242+	STBI_NOTUSED(user);
 2243+	return 0;
 2244+#endif
 2245 }
 2246 
 2247 #ifndef STBI_NO_LINEAR
 2248-static float stbi__l2h_gamma=2.2f, stbi__l2h_scale=1.0f;
 2249+static float stbi__l2h_gamma = 2.2f, stbi__l2h_scale = 1.0f;
 2250 
 2251-STBIDEF void   stbi_ldr_to_hdr_gamma(float gamma) { stbi__l2h_gamma = gamma; }
 2252-STBIDEF void   stbi_ldr_to_hdr_scale(float scale) { stbi__l2h_scale = scale; }
 2253+STBIDEF void
 2254+stbi_ldr_to_hdr_gamma(float gamma)
 2255+{
 2256+	stbi__l2h_gamma = gamma;
 2257+}
 2258+STBIDEF void
 2259+stbi_ldr_to_hdr_scale(float scale)
 2260+{
 2261+	stbi__l2h_scale = scale;
 2262+}
 2263 #endif
 2264 
 2265-static float stbi__h2l_gamma_i=1.0f/2.2f, stbi__h2l_scale_i=1.0f;
 2266-
 2267-STBIDEF void   stbi_hdr_to_ldr_gamma(float gamma) { stbi__h2l_gamma_i = 1/gamma; }
 2268-STBIDEF void   stbi_hdr_to_ldr_scale(float scale) { stbi__h2l_scale_i = 1/scale; }
 2269+static float stbi__h2l_gamma_i = 1.0f / 2.2f, stbi__h2l_scale_i = 1.0f;
 2270 
 2271+STBIDEF void
 2272+stbi_hdr_to_ldr_gamma(float gamma)
 2273+{
 2274+	stbi__h2l_gamma_i = 1 / gamma;
 2275+}
 2276+STBIDEF void
 2277+stbi_hdr_to_ldr_scale(float scale)
 2278+{
 2279+	stbi__h2l_scale_i = 1 / scale;
 2280+}
 2281 
 2282 //////////////////////////////////////////////////////////////////////////////
 2283 //
 2284 // Common code used by all image loaders
 2285 //
 2286 
 2287-enum
 2288-{
 2289-   STBI__SCAN_load=0,
 2290-   STBI__SCAN_type,
 2291-   STBI__SCAN_header
 2292-};
 2293-
 2294-static void stbi__refill_buffer(stbi__context *s)
 2295-{
 2296-   int n = (s->io.read)(s->io_user_data,(char*)s->buffer_start,s->buflen);
 2297-   s->callback_already_read += (int) (s->img_buffer - s->img_buffer_original);
 2298-   if (n == 0) {
 2299-      // at end of file, treat same as if from memory, but need to handle case
 2300-      // where s->img_buffer isn't pointing to safe memory, e.g. 0-byte file
 2301-      s->read_from_callbacks = 0;
 2302-      s->img_buffer = s->buffer_start;
 2303-      s->img_buffer_end = s->buffer_start+1;
 2304-      *s->img_buffer = 0;
 2305-   } else {
 2306-      s->img_buffer = s->buffer_start;
 2307-      s->img_buffer_end = s->buffer_start + n;
 2308-   }
 2309-}
 2310-
 2311-stbi_inline static stbi_uc stbi__get8(stbi__context *s)
 2312-{
 2313-   if (s->img_buffer < s->img_buffer_end)
 2314-      return *s->img_buffer++;
 2315-   if (s->read_from_callbacks) {
 2316-      stbi__refill_buffer(s);
 2317-      return *s->img_buffer++;
 2318-   }
 2319-   return 0;
 2320-}
 2321-
 2322-#if defined(STBI_NO_JPEG) && defined(STBI_NO_HDR) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM)
 2323+enum { STBI__SCAN_load = 0, STBI__SCAN_type, STBI__SCAN_header };
 2324+
 2325+static void
 2326+stbi__refill_buffer(stbi__context *s)
 2327+{
 2328+	int n = (s->io.read)(s->io_user_data, (char *)s->buffer_start, s->buflen);
 2329+	s->callback_already_read += (int)(s->img_buffer - s->img_buffer_original);
 2330+	if (n == 0) {
 2331+		// at end of file, treat same as if from memory, but need to handle case
 2332+		// where s->img_buffer isn't pointing to safe memory, e.g. 0-byte file
 2333+		s->read_from_callbacks = 0;
 2334+		s->img_buffer = s->buffer_start;
 2335+		s->img_buffer_end = s->buffer_start + 1;
 2336+		*s->img_buffer = 0;
 2337+	} else {
 2338+		s->img_buffer = s->buffer_start;
 2339+		s->img_buffer_end = s->buffer_start + n;
 2340+	}
 2341+}
 2342+
 2343+stbi_inline static stbi_uc
 2344+stbi__get8(stbi__context *s)
 2345+{
 2346+	if (s->img_buffer < s->img_buffer_end) {
 2347+		return *s->img_buffer++;
 2348+	}
 2349+	if (s->read_from_callbacks) {
 2350+		stbi__refill_buffer(s);
 2351+		return *s->img_buffer++;
 2352+	}
 2353+	return 0;
 2354+}
 2355+
 2356+#if defined(STBI_NO_JPEG) && defined(STBI_NO_HDR) && defined(STBI_NO_PIC) &&   \
 2357+    defined(STBI_NO_PNM)
 2358 // nothing
 2359 #else
 2360-stbi_inline static int stbi__at_eof(stbi__context *s)
 2361-{
 2362-   if (s->io.read) {
 2363-      if (!(s->io.eof)(s->io_user_data)) return 0;
 2364-      // if feof() is true, check if buffer = end
 2365-      // special case: we've only got the special 0 character at the end
 2366-      if (s->read_from_callbacks == 0) return 1;
 2367-   }
 2368-
 2369-   return s->img_buffer >= s->img_buffer_end;
 2370+stbi_inline static int
 2371+stbi__at_eof(stbi__context *s)
 2372+{
 2373+	if (s->io.read) {
 2374+		if (!(s->io.eof)(s->io_user_data)) {
 2375+			return 0;
 2376+		}
 2377+		// if feof() is true, check if buffer = end
 2378+		// special case: we've only got the special 0 character at the end
 2379+		if (s->read_from_callbacks == 0) {
 2380+			return 1;
 2381+		}
 2382+	}
 2383+
 2384+	return s->img_buffer >= s->img_buffer_end;
 2385 }
 2386 #endif
 2387 
 2388-#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC)
 2389+#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_BMP) &&   \
 2390+    defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) &&    \
 2391+    defined(STBI_NO_PIC)
 2392 // nothing
 2393 #else
 2394-static void stbi__skip(stbi__context *s, int n)
 2395-{
 2396-   if (n == 0) return;  // already there!
 2397-   if (n < 0) {
 2398-      s->img_buffer = s->img_buffer_end;
 2399-      return;
 2400-   }
 2401-   if (s->io.read) {
 2402-      int blen = (int) (s->img_buffer_end - s->img_buffer);
 2403-      if (blen < n) {
 2404-         s->img_buffer = s->img_buffer_end;
 2405-         (s->io.skip)(s->io_user_data, n - blen);
 2406-         return;
 2407-      }
 2408-   }
 2409-   s->img_buffer += n;
 2410-}
 2411-#endif
 2412-
 2413-#if defined(STBI_NO_PNG) && defined(STBI_NO_TGA) && defined(STBI_NO_HDR) && defined(STBI_NO_PNM)
 2414+static void
 2415+stbi__skip(stbi__context *s, int n)
 2416+{
 2417+	if (n == 0) {
 2418+		return; // already there!
 2419+	}
 2420+	if (n < 0) {
 2421+		s->img_buffer = s->img_buffer_end;
 2422+		return;
 2423+	}
 2424+	if (s->io.read) {
 2425+		int blen = (int)(s->img_buffer_end - s->img_buffer);
 2426+		if (blen < n) {
 2427+			s->img_buffer = s->img_buffer_end;
 2428+			(s->io.skip)(s->io_user_data, n - blen);
 2429+			return;
 2430+		}
 2431+	}
 2432+	s->img_buffer += n;
 2433+}
 2434+#endif
 2435+
 2436+#if defined(STBI_NO_PNG) && defined(STBI_NO_TGA) && defined(STBI_NO_HDR) &&    \
 2437+    defined(STBI_NO_PNM)
 2438 // nothing
 2439 #else
 2440-static int stbi__getn(stbi__context *s, stbi_uc *buffer, int n)
 2441-{
 2442-   if (s->io.read) {
 2443-      int blen = (int) (s->img_buffer_end - s->img_buffer);
 2444-      if (blen < n) {
 2445-         int res, count;
 2446-
 2447-         memcpy(buffer, s->img_buffer, blen);
 2448-
 2449-         count = (s->io.read)(s->io_user_data, (char*) buffer + blen, n - blen);
 2450-         res = (count == (n-blen));
 2451-         s->img_buffer = s->img_buffer_end;
 2452-         return res;
 2453-      }
 2454-   }
 2455-
 2456-   if (s->img_buffer+n <= s->img_buffer_end) {
 2457-      memcpy(buffer, s->img_buffer, n);
 2458-      s->img_buffer += n;
 2459-      return 1;
 2460-   } else
 2461-      return 0;
 2462+static int
 2463+stbi__getn(stbi__context *s, stbi_uc *buffer, int n)
 2464+{
 2465+	if (s->io.read) {
 2466+		int blen = (int)(s->img_buffer_end - s->img_buffer);
 2467+		if (blen < n) {
 2468+			int res, count;
 2469+
 2470+			memcpy(buffer, s->img_buffer, blen);
 2471+
 2472+			count =
 2473+			    (s->io.read)(s->io_user_data, (char *)buffer + blen, n - blen);
 2474+			res = (count == (n - blen));
 2475+			s->img_buffer = s->img_buffer_end;
 2476+			return res;
 2477+		}
 2478+	}
 2479+
 2480+	if (s->img_buffer + n <= s->img_buffer_end) {
 2481+		memcpy(buffer, s->img_buffer, n);
 2482+		s->img_buffer += n;
 2483+		return 1;
 2484+	} else {
 2485+		return 0;
 2486+	}
 2487 }
 2488 #endif
 2489 
 2490-#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_PSD) && defined(STBI_NO_PIC)
 2491+#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_PSD) &&   \
 2492+    defined(STBI_NO_PIC)
 2493 // nothing
 2494 #else
 2495-static int stbi__get16be(stbi__context *s)
 2496+static int
 2497+stbi__get16be(stbi__context *s)
 2498 {
 2499-   int z = stbi__get8(s);
 2500-   return (z << 8) + stbi__get8(s);
 2501+	int z = stbi__get8(s);
 2502+	return (z << 8) + stbi__get8(s);
 2503 }
 2504 #endif
 2505 
 2506 #if defined(STBI_NO_PNG) && defined(STBI_NO_PSD) && defined(STBI_NO_PIC)
 2507 // nothing
 2508 #else
 2509-static stbi__uint32 stbi__get32be(stbi__context *s)
 2510+static stbi__uint32
 2511+stbi__get32be(stbi__context *s)
 2512 {
 2513-   stbi__uint32 z = stbi__get16be(s);
 2514-   return (z << 16) + stbi__get16be(s);
 2515+	stbi__uint32 z = stbi__get16be(s);
 2516+	return (z << 16) + stbi__get16be(s);
 2517 }
 2518 #endif
 2519 
 2520 #if defined(STBI_NO_BMP) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF)
 2521 // nothing
 2522 #else
 2523-static int stbi__get16le(stbi__context *s)
 2524+static int
 2525+stbi__get16le(stbi__context *s)
 2526 {
 2527-   int z = stbi__get8(s);
 2528-   return z + (stbi__get8(s) << 8);
 2529+	int z = stbi__get8(s);
 2530+	return z + (stbi__get8(s) << 8);
 2531 }
 2532 #endif
 2533 
 2534 #ifndef STBI_NO_BMP
 2535-static stbi__uint32 stbi__get32le(stbi__context *s)
 2536+static stbi__uint32
 2537+stbi__get32le(stbi__context *s)
 2538 {
 2539-   stbi__uint32 z = stbi__get16le(s);
 2540-   z += (stbi__uint32)stbi__get16le(s) << 16;
 2541-   return z;
 2542+	stbi__uint32 z = stbi__get16le(s);
 2543+	z += (stbi__uint32)stbi__get16le(s) << 16;
 2544+	return z;
 2545 }
 2546 #endif
 2547 
 2548-#define STBI__BYTECAST(x)  ((stbi_uc) ((x) & 255))  // truncate int to byte without warnings
 2549+#define STBI__BYTECAST(x)                                                      \
 2550+	((stbi_uc)((x) & 255)) // truncate int to byte without warnings
 2551 
 2552-#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM)
 2553+#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_BMP) &&   \
 2554+    defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) &&    \
 2555+    defined(STBI_NO_PIC) && defined(STBI_NO_PNM)
 2556 // nothing
 2557 #else
 2558 //////////////////////////////////////////////////////////////////////////////
 2559@@ -1743,169 +2047,327 @@ static stbi__uint32 stbi__get32le(stbi__context *s)
 2560 //  assume data buffer is malloced, so malloc a new one and free that one
 2561 //  only failure mode is malloc failing
 2562 
 2563-static stbi_uc stbi__compute_y(int r, int g, int b)
 2564+static stbi_uc
 2565+stbi__compute_y(int r, int g, int b)
 2566 {
 2567-   return (stbi_uc) (((r*77) + (g*150) +  (29*b)) >> 8);
 2568+	return (stbi_uc)(((r * 77) + (g * 150) + (29 * b)) >> 8);
 2569 }
 2570 #endif
 2571 
 2572-#if defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM)
 2573+#if defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) &&    \
 2574+    defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC) &&    \
 2575+    defined(STBI_NO_PNM)
 2576 // nothing
 2577 #else
 2578-static unsigned char *stbi__convert_format(unsigned char *data, int img_n, int req_comp, unsigned int x, unsigned int y)
 2579-{
 2580-   int i,j;
 2581-   unsigned char *good;
 2582-
 2583-   if (req_comp == img_n) return data;
 2584-   STBI_ASSERT(req_comp >= 1 && req_comp <= 4);
 2585-
 2586-   good = (unsigned char *) stbi__malloc_mad3(req_comp, x, y, 0);
 2587-   if (good == NULL) {
 2588-      STBI_FREE(data);
 2589-      return stbi__errpuc("outofmem", "Out of memory");
 2590-   }
 2591-
 2592-   for (j=0; j < (int) y; ++j) {
 2593-      unsigned char *src  = data + j * x * img_n   ;
 2594-      unsigned char *dest = good + j * x * req_comp;
 2595-
 2596-      #define STBI__COMBO(a,b)  ((a)*8+(b))
 2597-      #define STBI__CASE(a,b)   case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b)
 2598-      // convert source image with img_n components to one with req_comp components;
 2599-      // avoid switch per pixel, so use switch per scanline and massive macros
 2600-      switch (STBI__COMBO(img_n, req_comp)) {
 2601-         STBI__CASE(1,2) { dest[0]=src[0]; dest[1]=255;                                     } break;
 2602-         STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0];                                  } break;
 2603-         STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=255;                     } break;
 2604-         STBI__CASE(2,1) { dest[0]=src[0];                                                  } break;
 2605-         STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0];                                  } break;
 2606-         STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=src[1];                  } break;
 2607-         STBI__CASE(3,4) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];dest[3]=255;        } break;
 2608-         STBI__CASE(3,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]);                   } break;
 2609-         STBI__CASE(3,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); dest[1] = 255;    } break;
 2610-         STBI__CASE(4,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]);                   } break;
 2611-         STBI__CASE(4,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); dest[1] = src[3]; } break;
 2612-         STBI__CASE(4,3) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];                    } break;
 2613-         default: STBI_ASSERT(0); STBI_FREE(data); STBI_FREE(good); return stbi__errpuc("unsupported", "Unsupported format conversion");
 2614-      }
 2615-      #undef STBI__CASE
 2616-   }
 2617-
 2618-   STBI_FREE(data);
 2619-   return good;
 2620+static unsigned char *
 2621+stbi__convert_format(unsigned char *data, int img_n, int req_comp,
 2622+                     unsigned int x, unsigned int y)
 2623+{
 2624+	int i, j;
 2625+	unsigned char *good;
 2626+
 2627+	if (req_comp == img_n) {
 2628+		return data;
 2629+	}
 2630+	STBI_ASSERT(req_comp >= 1 && req_comp <= 4);
 2631+
 2632+	good = (unsigned char *)stbi__malloc_mad3(req_comp, x, y, 0);
 2633+	if (good == NULL) {
 2634+		STBI_FREE(data);
 2635+		return stbi__errpuc("outofmem", "Out of memory");
 2636+	}
 2637+
 2638+	for (j = 0; j < (int)y; ++j) {
 2639+		unsigned char *src = data + j * x * img_n;
 2640+		unsigned char *dest = good + j * x * req_comp;
 2641+
 2642+#define STBI__COMBO(a, b) ((a) * 8 + (b))
 2643+#define STBI__CASE(a, b)                                                       \
 2644+	case STBI__COMBO(a, b):                                                    \
 2645+		for (i = x - 1; i >= 0; --i, src += a, dest += b)
 2646+		// convert source image with img_n components to one with req_comp
 2647+		// components; avoid switch per pixel, so use switch per scanline and
 2648+		// massive macros
 2649+		switch (STBI__COMBO(img_n, req_comp)) {
 2650+			STBI__CASE(1, 2)
 2651+			{
 2652+				dest[0] = src[0];
 2653+				dest[1] = 255;
 2654+			}
 2655+			break;
 2656+			STBI__CASE(1, 3) { dest[0] = dest[1] = dest[2] = src[0]; }
 2657+			break;
 2658+			STBI__CASE(1, 4)
 2659+			{
 2660+				dest[0] = dest[1] = dest[2] = src[0];
 2661+				dest[3] = 255;
 2662+			}
 2663+			break;
 2664+			STBI__CASE(2, 1) { dest[0] = src[0]; }
 2665+			break;
 2666+			STBI__CASE(2, 3) { dest[0] = dest[1] = dest[2] = src[0]; }
 2667+			break;
 2668+			STBI__CASE(2, 4)
 2669+			{
 2670+				dest[0] = dest[1] = dest[2] = src[0];
 2671+				dest[3] = src[1];
 2672+			}
 2673+			break;
 2674+			STBI__CASE(3, 4)
 2675+			{
 2676+				dest[0] = src[0];
 2677+				dest[1] = src[1];
 2678+				dest[2] = src[2];
 2679+				dest[3] = 255;
 2680+			}
 2681+			break;
 2682+			STBI__CASE(3, 1)
 2683+			{
 2684+				dest[0] = stbi__compute_y(src[0], src[1], src[2]);
 2685+			}
 2686+			break;
 2687+			STBI__CASE(3, 2)
 2688+			{
 2689+				dest[0] = stbi__compute_y(src[0], src[1], src[2]);
 2690+				dest[1] = 255;
 2691+			}
 2692+			break;
 2693+			STBI__CASE(4, 1)
 2694+			{
 2695+				dest[0] = stbi__compute_y(src[0], src[1], src[2]);
 2696+			}
 2697+			break;
 2698+			STBI__CASE(4, 2)
 2699+			{
 2700+				dest[0] = stbi__compute_y(src[0], src[1], src[2]);
 2701+				dest[1] = src[3];
 2702+			}
 2703+			break;
 2704+			STBI__CASE(4, 3)
 2705+			{
 2706+				dest[0] = src[0];
 2707+				dest[1] = src[1];
 2708+				dest[2] = src[2];
 2709+			}
 2710+			break;
 2711+		default:
 2712+			STBI_ASSERT(0);
 2713+			STBI_FREE(data);
 2714+			STBI_FREE(good);
 2715+			return stbi__errpuc("unsupported", "Unsupported format conversion");
 2716+		}
 2717+#undef STBI__CASE
 2718+	}
 2719+
 2720+	STBI_FREE(data);
 2721+	return good;
 2722 }
 2723 #endif
 2724 
 2725 #if defined(STBI_NO_PNG) && defined(STBI_NO_PSD)
 2726 // nothing
 2727 #else
 2728-static stbi__uint16 stbi__compute_y_16(int r, int g, int b)
 2729+static stbi__uint16
 2730+stbi__compute_y_16(int r, int g, int b)
 2731 {
 2732-   return (stbi__uint16) (((r*77) + (g*150) +  (29*b)) >> 8);
 2733+	return (stbi__uint16)(((r * 77) + (g * 150) + (29 * b)) >> 8);
 2734 }
 2735 #endif
 2736 
 2737 #if defined(STBI_NO_PNG) && defined(STBI_NO_PSD)
 2738 // nothing
 2739 #else
 2740-static stbi__uint16 *stbi__convert_format16(stbi__uint16 *data, int img_n, int req_comp, unsigned int x, unsigned int y)
 2741-{
 2742-   int i,j;
 2743-   stbi__uint16 *good;
 2744-
 2745-   if (req_comp == img_n) return data;
 2746-   STBI_ASSERT(req_comp >= 1 && req_comp <= 4);
 2747-
 2748-   good = (stbi__uint16 *) stbi__malloc(req_comp * x * y * 2);
 2749-   if (good == NULL) {
 2750-      STBI_FREE(data);
 2751-      return (stbi__uint16 *) stbi__errpuc("outofmem", "Out of memory");
 2752-   }
 2753-
 2754-   for (j=0; j < (int) y; ++j) {
 2755-      stbi__uint16 *src  = data + j * x * img_n   ;
 2756-      stbi__uint16 *dest = good + j * x * req_comp;
 2757-
 2758-      #define STBI__COMBO(a,b)  ((a)*8+(b))
 2759-      #define STBI__CASE(a,b)   case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b)
 2760-      // convert source image with img_n components to one with req_comp components;
 2761-      // avoid switch per pixel, so use switch per scanline and massive macros
 2762-      switch (STBI__COMBO(img_n, req_comp)) {
 2763-         STBI__CASE(1,2) { dest[0]=src[0]; dest[1]=0xffff;                                     } break;
 2764-         STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0];                                     } break;
 2765-         STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=0xffff;                     } break;
 2766-         STBI__CASE(2,1) { dest[0]=src[0];                                                     } break;
 2767-         STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0];                                     } break;
 2768-         STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=src[1];                     } break;
 2769-         STBI__CASE(3,4) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];dest[3]=0xffff;        } break;
 2770-         STBI__CASE(3,1) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]);                   } break;
 2771-         STBI__CASE(3,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); dest[1] = 0xffff; } break;
 2772-         STBI__CASE(4,1) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]);                   } break;
 2773-         STBI__CASE(4,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); dest[1] = src[3]; } break;
 2774-         STBI__CASE(4,3) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];                       } break;
 2775-         default: STBI_ASSERT(0); STBI_FREE(data); STBI_FREE(good); return (stbi__uint16*) stbi__errpuc("unsupported", "Unsupported format conversion");
 2776-      }
 2777-      #undef STBI__CASE
 2778-   }
 2779-
 2780-   STBI_FREE(data);
 2781-   return good;
 2782+static stbi__uint16 *
 2783+stbi__convert_format16(stbi__uint16 *data, int img_n, int req_comp,
 2784+                       unsigned int x, unsigned int y)
 2785+{
 2786+	int i, j;
 2787+	stbi__uint16 *good;
 2788+
 2789+	if (req_comp == img_n) {
 2790+		return data;
 2791+	}
 2792+	STBI_ASSERT(req_comp >= 1 && req_comp <= 4);
 2793+
 2794+	good = (stbi__uint16 *)stbi__malloc(req_comp * x * y * 2);
 2795+	if (good == NULL) {
 2796+		STBI_FREE(data);
 2797+		return (stbi__uint16 *)stbi__errpuc("outofmem", "Out of memory");
 2798+	}
 2799+
 2800+	for (j = 0; j < (int)y; ++j) {
 2801+		stbi__uint16 *src = data + j * x * img_n;
 2802+		stbi__uint16 *dest = good + j * x * req_comp;
 2803+
 2804+#define STBI__COMBO(a, b) ((a) * 8 + (b))
 2805+#define STBI__CASE(a, b)                                                       \
 2806+	case STBI__COMBO(a, b):                                                    \
 2807+		for (i = x - 1; i >= 0; --i, src += a, dest += b)
 2808+		// convert source image with img_n components to one with req_comp
 2809+		// components; avoid switch per pixel, so use switch per scanline and
 2810+		// massive macros
 2811+		switch (STBI__COMBO(img_n, req_comp)) {
 2812+			STBI__CASE(1, 2)
 2813+			{
 2814+				dest[0] = src[0];
 2815+				dest[1] = 0xffff;
 2816+			}
 2817+			break;
 2818+			STBI__CASE(1, 3) { dest[0] = dest[1] = dest[2] = src[0]; }
 2819+			break;
 2820+			STBI__CASE(1, 4)
 2821+			{
 2822+				dest[0] = dest[1] = dest[2] = src[0];
 2823+				dest[3] = 0xffff;
 2824+			}
 2825+			break;
 2826+			STBI__CASE(2, 1) { dest[0] = src[0]; }
 2827+			break;
 2828+			STBI__CASE(2, 3) { dest[0] = dest[1] = dest[2] = src[0]; }
 2829+			break;
 2830+			STBI__CASE(2, 4)
 2831+			{
 2832+				dest[0] = dest[1] = dest[2] = src[0];
 2833+				dest[3] = src[1];
 2834+			}
 2835+			break;
 2836+			STBI__CASE(3, 4)
 2837+			{
 2838+				dest[0] = src[0];
 2839+				dest[1] = src[1];
 2840+				dest[2] = src[2];
 2841+				dest[3] = 0xffff;
 2842+			}
 2843+			break;
 2844+			STBI__CASE(3, 1)
 2845+			{
 2846+				dest[0] = stbi__compute_y_16(src[0], src[1], src[2]);
 2847+			}
 2848+			break;
 2849+			STBI__CASE(3, 2)
 2850+			{
 2851+				dest[0] = stbi__compute_y_16(src[0], src[1], src[2]);
 2852+				dest[1] = 0xffff;
 2853+			}
 2854+			break;
 2855+			STBI__CASE(4, 1)
 2856+			{
 2857+				dest[0] = stbi__compute_y_16(src[0], src[1], src[2]);
 2858+			}
 2859+			break;
 2860+			STBI__CASE(4, 2)
 2861+			{
 2862+				dest[0] = stbi__compute_y_16(src[0], src[1], src[2]);
 2863+				dest[1] = src[3];
 2864+			}
 2865+			break;
 2866+			STBI__CASE(4, 3)
 2867+			{
 2868+				dest[0] = src[0];
 2869+				dest[1] = src[1];
 2870+				dest[2] = src[2];
 2871+			}
 2872+			break;
 2873+		default:
 2874+			STBI_ASSERT(0);
 2875+			STBI_FREE(data);
 2876+			STBI_FREE(good);
 2877+			return (stbi__uint16 *)stbi__errpuc(
 2878+			    "unsupported", "Unsupported format conversion");
 2879+		}
 2880+#undef STBI__CASE
 2881+	}
 2882+
 2883+	STBI_FREE(data);
 2884+	return good;
 2885 }
 2886 #endif
 2887 
 2888 #ifndef STBI_NO_LINEAR
 2889-static float   *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp)
 2890-{
 2891-   int i,k,n;
 2892-   float *output;
 2893-   if (!data) return NULL;
 2894-   output = (float *) stbi__malloc_mad4(x, y, comp, sizeof(float), 0);
 2895-   if (output == NULL) { STBI_FREE(data); return stbi__errpf("outofmem", "Out of memory"); }
 2896-   // compute number of non-alpha components
 2897-   if (comp & 1) n = comp; else n = comp-1;
 2898-   for (i=0; i < x*y; ++i) {
 2899-      for (k=0; k < n; ++k) {
 2900-         output[i*comp + k] = (float) (pow(data[i*comp+k]/255.0f, stbi__l2h_gamma) * stbi__l2h_scale);
 2901-      }
 2902-   }
 2903-   if (n < comp) {
 2904-      for (i=0; i < x*y; ++i) {
 2905-         output[i*comp + n] = data[i*comp + n]/255.0f;
 2906-      }
 2907-   }
 2908-   STBI_FREE(data);
 2909-   return output;
 2910+static float *
 2911+stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp)
 2912+{
 2913+	int i, k, n;
 2914+	float *output;
 2915+	if (!data) {
 2916+		return NULL;
 2917+	}
 2918+	output = (float *)stbi__malloc_mad4(x, y, comp, sizeof(float), 0);
 2919+	if (output == NULL) {
 2920+		STBI_FREE(data);
 2921+		return stbi__errpf("outofmem", "Out of memory");
 2922+	}
 2923+	// compute number of non-alpha components
 2924+	if (comp & 1) {
 2925+		n = comp;
 2926+	} else {
 2927+		n = comp - 1;
 2928+	}
 2929+	for (i = 0; i < x * y; ++i) {
 2930+		for (k = 0; k < n; ++k) {
 2931+			output[i * comp + k] =
 2932+			    (float)(pow(data[i * comp + k] / 255.0f, stbi__l2h_gamma) *
 2933+			            stbi__l2h_scale);
 2934+		}
 2935+	}
 2936+	if (n < comp) {
 2937+		for (i = 0; i < x * y; ++i) {
 2938+			output[i * comp + n] = data[i * comp + n] / 255.0f;
 2939+		}
 2940+	}
 2941+	STBI_FREE(data);
 2942+	return output;
 2943 }
 2944 #endif
 2945 
 2946 #ifndef STBI_NO_HDR
 2947-#define stbi__float2int(x)   ((int) (x))
 2948-static stbi_uc *stbi__hdr_to_ldr(float   *data, int x, int y, int comp)
 2949-{
 2950-   int i,k,n;
 2951-   stbi_uc *output;
 2952-   if (!data) return NULL;
 2953-   output = (stbi_uc *) stbi__malloc_mad3(x, y, comp, 0);
 2954-   if (output == NULL) { STBI_FREE(data); return stbi__errpuc("outofmem", "Out of memory"); }
 2955-   // compute number of non-alpha components
 2956-   if (comp & 1) n = comp; else n = comp-1;
 2957-   for (i=0; i < x*y; ++i) {
 2958-      for (k=0; k < n; ++k) {
 2959-         float z = (float) pow(data[i*comp+k]*stbi__h2l_scale_i, stbi__h2l_gamma_i) * 255 + 0.5f;
 2960-         if (z < 0) z = 0;
 2961-         if (z > 255) z = 255;
 2962-         output[i*comp + k] = (stbi_uc) stbi__float2int(z);
 2963-      }
 2964-      if (k < comp) {
 2965-         float z = data[i*comp+k] * 255 + 0.5f;
 2966-         if (z < 0) z = 0;
 2967-         if (z > 255) z = 255;
 2968-         output[i*comp + k] = (stbi_uc) stbi__float2int(z);
 2969-      }
 2970-   }
 2971-   STBI_FREE(data);
 2972-   return output;
 2973+#define stbi__float2int(x) ((int)(x))
 2974+static stbi_uc *
 2975+stbi__hdr_to_ldr(float *data, int x, int y, int comp)
 2976+{
 2977+	int i, k, n;
 2978+	stbi_uc *output;
 2979+	if (!data) {
 2980+		return NULL;
 2981+	}
 2982+	output = (stbi_uc *)stbi__malloc_mad3(x, y, comp, 0);
 2983+	if (output == NULL) {
 2984+		STBI_FREE(data);
 2985+		return stbi__errpuc("outofmem", "Out of memory");
 2986+	}
 2987+	// compute number of non-alpha components
 2988+	if (comp & 1) {
 2989+		n = comp;
 2990+	} else {
 2991+		n = comp - 1;
 2992+	}
 2993+	for (i = 0; i < x * y; ++i) {
 2994+		for (k = 0; k < n; ++k) {
 2995+			float z = (float)pow(data[i * comp + k] * stbi__h2l_scale_i,
 2996+			                     stbi__h2l_gamma_i) *
 2997+			              255 +
 2998+			          0.5f;
 2999+			if (z < 0) {
 3000+				z = 0;
 3001+			}
 3002+			if (z > 255) {
 3003+				z = 255;
 3004+			}
 3005+			output[i * comp + k] = (stbi_uc)stbi__float2int(z);
 3006+		}
 3007+		if (k < comp) {
 3008+			float z = data[i * comp + k] * 255 + 0.5f;
 3009+			if (z < 0) {
 3010+				z = 0;
 3011+			}
 3012+			if (z > 255) {
 3013+				z = 255;
 3014+			}
 3015+			output[i * comp + k] = (stbi_uc)stbi__float2int(z);
 3016+		}
 3017+	}
 3018+	STBI_FREE(data);
 3019+	return output;
 3020 }
 3021 #endif
 3022 
 3023@@ -1933,763 +2395,899 @@ static stbi_uc *stbi__hdr_to_ldr(float   *data, int x, int y, int comp)
 3024 #ifndef STBI_NO_JPEG
 3025 
 3026 // huffman decoding acceleration
 3027-#define FAST_BITS   9  // larger handles more cases; smaller stomps less cache
 3028-
 3029-typedef struct
 3030-{
 3031-   stbi_uc  fast[1 << FAST_BITS];
 3032-   // weirdly, repacking this into AoS is a 10% speed loss, instead of a win
 3033-   stbi__uint16 code[256];
 3034-   stbi_uc  values[256];
 3035-   stbi_uc  size[257];
 3036-   unsigned int maxcode[18];
 3037-   int    delta[17];   // old 'firstsymbol' - old 'firstcode'
 3038+#define FAST_BITS 9 // larger handles more cases; smaller stomps less cache
 3039+
 3040+typedef struct {
 3041+	stbi_uc fast[1 << FAST_BITS];
 3042+	// weirdly, repacking this into AoS is a 10% speed loss, instead of a win
 3043+	stbi__uint16 code[256];
 3044+	stbi_uc values[256];
 3045+	stbi_uc size[257];
 3046+	unsigned int maxcode[18];
 3047+	int delta[17]; // old 'firstsymbol' - old 'firstcode'
 3048 } stbi__huffman;
 3049 
 3050-typedef struct
 3051-{
 3052-   stbi__context *s;
 3053-   stbi__huffman huff_dc[4];
 3054-   stbi__huffman huff_ac[4];
 3055-   stbi__uint16 dequant[4][64];
 3056-   stbi__int16 fast_ac[4][1 << FAST_BITS];
 3057-
 3058-// sizes for components, interleaved MCUs
 3059-   int img_h_max, img_v_max;
 3060-   int img_mcu_x, img_mcu_y;
 3061-   int img_mcu_w, img_mcu_h;
 3062-
 3063-// definition of jpeg image component
 3064-   struct
 3065-   {
 3066-      int id;
 3067-      int h,v;
 3068-      int tq;
 3069-      int hd,ha;
 3070-      int dc_pred;
 3071-
 3072-      int x,y,w2,h2;
 3073-      stbi_uc *data;
 3074-      void *raw_data, *raw_coeff;
 3075-      stbi_uc *linebuf;
 3076-      short   *coeff;   // progressive only
 3077-      int      coeff_w, coeff_h; // number of 8x8 coefficient blocks
 3078-   } img_comp[4];
 3079-
 3080-   stbi__uint32   code_buffer; // jpeg entropy-coded buffer
 3081-   int            code_bits;   // number of valid bits
 3082-   unsigned char  marker;      // marker seen while filling entropy buffer
 3083-   int            nomore;      // flag if we saw a marker so must stop
 3084-
 3085-   int            progressive;
 3086-   int            spec_start;
 3087-   int            spec_end;
 3088-   int            succ_high;
 3089-   int            succ_low;
 3090-   int            eob_run;
 3091-   int            jfif;
 3092-   int            app14_color_transform; // Adobe APP14 tag
 3093-   int            rgb;
 3094-
 3095-   int scan_n, order[4];
 3096-   int restart_interval, todo;
 3097-
 3098-// kernels
 3099-   void (*idct_block_kernel)(stbi_uc *out, int out_stride, short data[64]);
 3100-   void (*YCbCr_to_RGB_kernel)(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step);
 3101-   stbi_uc *(*resample_row_hv_2_kernel)(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs);
 3102+typedef struct {
 3103+	stbi__context *s;
 3104+	stbi__huffman huff_dc[4];
 3105+	stbi__huffman huff_ac[4];
 3106+	stbi__uint16 dequant[4][64];
 3107+	stbi__int16 fast_ac[4][1 << FAST_BITS];
 3108+
 3109+	// sizes for components, interleaved MCUs
 3110+	int img_h_max, img_v_max;
 3111+	int img_mcu_x, img_mcu_y;
 3112+	int img_mcu_w, img_mcu_h;
 3113+
 3114+	// definition of jpeg image component
 3115+	struct {
 3116+		int id;
 3117+		int h, v;
 3118+		int tq;
 3119+		int hd, ha;
 3120+		int dc_pred;
 3121+
 3122+		int x, y, w2, h2;
 3123+		stbi_uc *data;
 3124+		void *raw_data, *raw_coeff;
 3125+		stbi_uc *linebuf;
 3126+		short *coeff;         // progressive only
 3127+		int coeff_w, coeff_h; // number of 8x8 coefficient blocks
 3128+	} img_comp[4];
 3129+
 3130+	stbi__uint32 code_buffer; // jpeg entropy-coded buffer
 3131+	int code_bits;            // number of valid bits
 3132+	unsigned char marker;     // marker seen while filling entropy buffer
 3133+	int nomore;               // flag if we saw a marker so must stop
 3134+
 3135+	int progressive;
 3136+	int spec_start;
 3137+	int spec_end;
 3138+	int succ_high;
 3139+	int succ_low;
 3140+	int eob_run;
 3141+	int jfif;
 3142+	int app14_color_transform; // Adobe APP14 tag
 3143+	int rgb;
 3144+
 3145+	int scan_n, order[4];
 3146+	int restart_interval, todo;
 3147+
 3148+	// kernels
 3149+	void (*idct_block_kernel)(stbi_uc *out, int out_stride, short data[64]);
 3150+	void (*YCbCr_to_RGB_kernel)(stbi_uc *out, const stbi_uc *y,
 3151+	                            const stbi_uc *pcb, const stbi_uc *pcr,
 3152+	                            int count, int step);
 3153+	stbi_uc *(*resample_row_hv_2_kernel)(stbi_uc *out, stbi_uc *in_near,
 3154+	                                     stbi_uc *in_far, int w, int hs);
 3155 } stbi__jpeg;
 3156 
 3157-static int stbi__build_huffman(stbi__huffman *h, int *count)
 3158-{
 3159-   int i,j,k=0;
 3160-   unsigned int code;
 3161-   // build size list for each symbol (from JPEG spec)
 3162-   for (i=0; i < 16; ++i) {
 3163-      for (j=0; j < count[i]; ++j) {
 3164-         h->size[k++] = (stbi_uc) (i+1);
 3165-         if(k >= 257) return stbi__err("bad size list","Corrupt JPEG");
 3166-      }
 3167-   }
 3168-   h->size[k] = 0;
 3169-
 3170-   // compute actual symbols (from jpeg spec)
 3171-   code = 0;
 3172-   k = 0;
 3173-   for(j=1; j <= 16; ++j) {
 3174-      // compute delta to add to code to compute symbol id
 3175-      h->delta[j] = k - code;
 3176-      if (h->size[k] == j) {
 3177-         while (h->size[k] == j)
 3178-            h->code[k++] = (stbi__uint16) (code++);
 3179-         if (code-1 >= (1u << j)) return stbi__err("bad code lengths","Corrupt JPEG");
 3180-      }
 3181-      // compute largest code + 1 for this size, preshifted as needed later
 3182-      h->maxcode[j] = code << (16-j);
 3183-      code <<= 1;
 3184-   }
 3185-   h->maxcode[j] = 0xffffffff;
 3186-
 3187-   // build non-spec acceleration table; 255 is flag for not-accelerated
 3188-   memset(h->fast, 255, 1 << FAST_BITS);
 3189-   for (i=0; i < k; ++i) {
 3190-      int s = h->size[i];
 3191-      if (s <= FAST_BITS) {
 3192-         int c = h->code[i] << (FAST_BITS-s);
 3193-         int m = 1 << (FAST_BITS-s);
 3194-         for (j=0; j < m; ++j) {
 3195-            h->fast[c+j] = (stbi_uc) i;
 3196-         }
 3197-      }
 3198-   }
 3199-   return 1;
 3200+static int
 3201+stbi__build_huffman(stbi__huffman *h, int *count)
 3202+{
 3203+	int i, j, k = 0;
 3204+	unsigned int code;
 3205+	// build size list for each symbol (from JPEG spec)
 3206+	for (i = 0; i < 16; ++i) {
 3207+		for (j = 0; j < count[i]; ++j) {
 3208+			h->size[k++] = (stbi_uc)(i + 1);
 3209+			if (k >= 257) {
 3210+				return stbi__err("bad size list", "Corrupt JPEG");
 3211+			}
 3212+		}
 3213+	}
 3214+	h->size[k] = 0;
 3215+
 3216+	// compute actual symbols (from jpeg spec)
 3217+	code = 0;
 3218+	k = 0;
 3219+	for (j = 1; j <= 16; ++j) {
 3220+		// compute delta to add to code to compute symbol id
 3221+		h->delta[j] = k - code;
 3222+		if (h->size[k] == j) {
 3223+			while (h->size[k] == j) {
 3224+				h->code[k++] = (stbi__uint16)(code++);
 3225+			}
 3226+			if (code - 1 >= (1u << j)) {
 3227+				return stbi__err("bad code lengths", "Corrupt JPEG");
 3228+			}
 3229+		}
 3230+		// compute largest code + 1 for this size, preshifted as needed later
 3231+		h->maxcode[j] = code << (16 - j);
 3232+		code <<= 1;
 3233+	}
 3234+	h->maxcode[j] = 0xffffffff;
 3235+
 3236+	// build non-spec acceleration table; 255 is flag for not-accelerated
 3237+	memset(h->fast, 255, 1 << FAST_BITS);
 3238+	for (i = 0; i < k; ++i) {
 3239+		int s = h->size[i];
 3240+		if (s <= FAST_BITS) {
 3241+			int c = h->code[i] << (FAST_BITS - s);
 3242+			int m = 1 << (FAST_BITS - s);
 3243+			for (j = 0; j < m; ++j) {
 3244+				h->fast[c + j] = (stbi_uc)i;
 3245+			}
 3246+		}
 3247+	}
 3248+	return 1;
 3249 }
 3250 
 3251 // build a table that decodes both magnitude and value of small ACs in
 3252 // one go.
 3253-static void stbi__build_fast_ac(stbi__int16 *fast_ac, stbi__huffman *h)
 3254-{
 3255-   int i;
 3256-   for (i=0; i < (1 << FAST_BITS); ++i) {
 3257-      stbi_uc fast = h->fast[i];
 3258-      fast_ac[i] = 0;
 3259-      if (fast < 255) {
 3260-         int rs = h->values[fast];
 3261-         int run = (rs >> 4) & 15;
 3262-         int magbits = rs & 15;
 3263-         int len = h->size[fast];
 3264-
 3265-         if (magbits && len + magbits <= FAST_BITS) {
 3266-            // magnitude code followed by receive_extend code
 3267-            int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits);
 3268-            int m = 1 << (magbits - 1);
 3269-            if (k < m) k += (~0U << magbits) + 1;
 3270-            // if the result is small enough, we can fit it in fast_ac table
 3271-            if (k >= -128 && k <= 127)
 3272-               fast_ac[i] = (stbi__int16) ((k * 256) + (run * 16) + (len + magbits));
 3273-         }
 3274-      }
 3275-   }
 3276-}
 3277-
 3278-static void stbi__grow_buffer_unsafe(stbi__jpeg *j)
 3279-{
 3280-   do {
 3281-      unsigned int b = j->nomore ? 0 : stbi__get8(j->s);
 3282-      if (b == 0xff) {
 3283-         int c = stbi__get8(j->s);
 3284-         while (c == 0xff) c = stbi__get8(j->s); // consume fill bytes
 3285-         if (c != 0) {
 3286-            j->marker = (unsigned char) c;
 3287-            j->nomore = 1;
 3288-            return;
 3289-         }
 3290-      }
 3291-      j->code_buffer |= b << (24 - j->code_bits);
 3292-      j->code_bits += 8;
 3293-   } while (j->code_bits <= 24);
 3294+static void
 3295+stbi__build_fast_ac(stbi__int16 *fast_ac, stbi__huffman *h)
 3296+{
 3297+	int i;
 3298+	for (i = 0; i < (1 << FAST_BITS); ++i) {
 3299+		stbi_uc fast = h->fast[i];
 3300+		fast_ac[i] = 0;
 3301+		if (fast < 255) {
 3302+			int rs = h->values[fast];
 3303+			int run = (rs >> 4) & 15;
 3304+			int magbits = rs & 15;
 3305+			int len = h->size[fast];
 3306+
 3307+			if (magbits && len + magbits <= FAST_BITS) {
 3308+				// magnitude code followed by receive_extend code
 3309+				int k = ((i << len) & ((1 << FAST_BITS) - 1)) >>
 3310+				        (FAST_BITS - magbits);
 3311+				int m = 1 << (magbits - 1);
 3312+				if (k < m) {
 3313+					k += (~0U << magbits) + 1;
 3314+				}
 3315+				// if the result is small enough, we can fit it in fast_ac table
 3316+				if (k >= -128 && k <= 127) {
 3317+					fast_ac[i] =
 3318+					    (stbi__int16)((k * 256) + (run * 16) + (len + magbits));
 3319+				}
 3320+			}
 3321+		}
 3322+	}
 3323+}
 3324+
 3325+static void
 3326+stbi__grow_buffer_unsafe(stbi__jpeg *j)
 3327+{
 3328+	do {
 3329+		unsigned int b = j->nomore ? 0 : stbi__get8(j->s);
 3330+		if (b == 0xff) {
 3331+			int c = stbi__get8(j->s);
 3332+			while (c == 0xff) {
 3333+				c = stbi__get8(j->s); // consume fill bytes
 3334+			}
 3335+			if (c != 0) {
 3336+				j->marker = (unsigned char)c;
 3337+				j->nomore = 1;
 3338+				return;
 3339+			}
 3340+		}
 3341+		j->code_buffer |= b << (24 - j->code_bits);
 3342+		j->code_bits += 8;
 3343+	} while (j->code_bits <= 24);
 3344 }
 3345 
 3346 // (1 << n) - 1
 3347-static const stbi__uint32 stbi__bmask[17]={0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535};
 3348+static const stbi__uint32 stbi__bmask[17] = {
 3349+    0,   1,    3,    7,    15,   31,    63,    127,  255,
 3350+    511, 1023, 2047, 4095, 8191, 16383, 32767, 65535};
 3351 
 3352 // decode a jpeg huffman value from the bitstream
 3353-stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h)
 3354-{
 3355-   unsigned int temp;
 3356-   int c,k;
 3357-
 3358-   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
 3359-
 3360-   // look at the top FAST_BITS and determine what symbol ID it is,
 3361-   // if the code is <= FAST_BITS
 3362-   c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
 3363-   k = h->fast[c];
 3364-   if (k < 255) {
 3365-      int s = h->size[k];
 3366-      if (s > j->code_bits)
 3367-         return -1;
 3368-      j->code_buffer <<= s;
 3369-      j->code_bits -= s;
 3370-      return h->values[k];
 3371-   }
 3372-
 3373-   // naive test is to shift the code_buffer down so k bits are
 3374-   // valid, then test against maxcode. To speed this up, we've
 3375-   // preshifted maxcode left so that it has (16-k) 0s at the
 3376-   // end; in other words, regardless of the number of bits, it
 3377-   // wants to be compared against something shifted to have 16;
 3378-   // that way we don't need to shift inside the loop.
 3379-   temp = j->code_buffer >> 16;
 3380-   for (k=FAST_BITS+1 ; ; ++k)
 3381-      if (temp < h->maxcode[k])
 3382-         break;
 3383-   if (k == 17) {
 3384-      // error! code not found
 3385-      j->code_bits -= 16;
 3386-      return -1;
 3387-   }
 3388-
 3389-   if (k > j->code_bits)
 3390-      return -1;
 3391-
 3392-   // convert the huffman code to the symbol id
 3393-   c = ((j->code_buffer >> (32 - k)) & stbi__bmask[k]) + h->delta[k];
 3394-   if(c < 0 || c >= 256) // symbol id out of bounds!
 3395-       return -1;
 3396-   STBI_ASSERT((((j->code_buffer) >> (32 - h->size[c])) & stbi__bmask[h->size[c]]) == h->code[c]);
 3397-
 3398-   // convert the id to a symbol
 3399-   j->code_bits -= k;
 3400-   j->code_buffer <<= k;
 3401-   return h->values[c];
 3402+stbi_inline static int
 3403+stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h)
 3404+{
 3405+	unsigned int temp;
 3406+	int c, k;
 3407+
 3408+	if (j->code_bits < 16) {
 3409+		stbi__grow_buffer_unsafe(j);
 3410+	}
 3411+
 3412+	// look at the top FAST_BITS and determine what symbol ID it is,
 3413+	// if the code is <= FAST_BITS
 3414+	c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1);
 3415+	k = h->fast[c];
 3416+	if (k < 255) {
 3417+		int s = h->size[k];
 3418+		if (s > j->code_bits) {
 3419+			return -1;
 3420+		}
 3421+		j->code_buffer <<= s;
 3422+		j->code_bits -= s;
 3423+		return h->values[k];
 3424+	}
 3425+
 3426+	// naive test is to shift the code_buffer down so k bits are
 3427+	// valid, then test against maxcode. To speed this up, we've
 3428+	// preshifted maxcode left so that it has (16-k) 0s at the
 3429+	// end; in other words, regardless of the number of bits, it
 3430+	// wants to be compared against something shifted to have 16;
 3431+	// that way we don't need to shift inside the loop.
 3432+	temp = j->code_buffer >> 16;
 3433+	for (k = FAST_BITS + 1;; ++k) {
 3434+		if (temp < h->maxcode[k]) {
 3435+			break;
 3436+		}
 3437+	}
 3438+	if (k == 17) {
 3439+		// error! code not found
 3440+		j->code_bits -= 16;
 3441+		return -1;
 3442+	}
 3443+
 3444+	if (k > j->code_bits) {
 3445+		return -1;
 3446+	}
 3447+
 3448+	// convert the huffman code to the symbol id
 3449+	c = ((j->code_buffer >> (32 - k)) & stbi__bmask[k]) + h->delta[k];
 3450+	if (c < 0 || c >= 256) { // symbol id out of bounds!
 3451+		return -1;
 3452+	}
 3453+	STBI_ASSERT((((j->code_buffer) >> (32 - h->size[c])) &
 3454+	             stbi__bmask[h->size[c]]) == h->code[c]);
 3455+
 3456+	// convert the id to a symbol
 3457+	j->code_bits -= k;
 3458+	j->code_buffer <<= k;
 3459+	return h->values[c];
 3460 }
 3461 
 3462 // bias[n] = (-1<<n) + 1
 3463-static const int stbi__jbias[16] = {0,-1,-3,-7,-15,-31,-63,-127,-255,-511,-1023,-2047,-4095,-8191,-16383,-32767};
 3464+static const int stbi__jbias[16] = {0,     -1,    -3,     -7,    -15,   -31,
 3465+                                    -63,   -127,  -255,   -511,  -1023, -2047,
 3466+                                    -4095, -8191, -16383, -32767};
 3467 
 3468 // combined JPEG 'receive' and JPEG 'extend', since baseline
 3469 // always extends everything it receives.
 3470-stbi_inline static int stbi__extend_receive(stbi__jpeg *j, int n)
 3471-{
 3472-   unsigned int k;
 3473-   int sgn;
 3474-   if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
 3475-   if (j->code_bits < n) return 0; // ran out of bits from stream, return 0s intead of continuing
 3476-
 3477-   sgn = j->code_buffer >> 31; // sign bit always in MSB; 0 if MSB clear (positive), 1 if MSB set (negative)
 3478-   k = stbi_lrot(j->code_buffer, n);
 3479-   j->code_buffer = k & ~stbi__bmask[n];
 3480-   k &= stbi__bmask[n];
 3481-   j->code_bits -= n;
 3482-   return k + (stbi__jbias[n] & (sgn - 1));
 3483+stbi_inline static int
 3484+stbi__extend_receive(stbi__jpeg *j, int n)
 3485+{
 3486+	unsigned int k;
 3487+	int sgn;
 3488+	if (j->code_bits < n) {
 3489+		stbi__grow_buffer_unsafe(j);
 3490+	}
 3491+	if (j->code_bits < n) {
 3492+		return 0; // ran out of bits from stream, return 0s intead of continuing
 3493+	}
 3494+
 3495+	sgn = j->code_buffer >> 31; // sign bit always in MSB; 0 if MSB clear
 3496+	                            // (positive), 1 if MSB set (negative)
 3497+	k = stbi_lrot(j->code_buffer, n);
 3498+	j->code_buffer = k & ~stbi__bmask[n];
 3499+	k &= stbi__bmask[n];
 3500+	j->code_bits -= n;
 3501+	return k + (stbi__jbias[n] & (sgn - 1));
 3502 }
 3503 
 3504 // get some unsigned bits
 3505-stbi_inline static int stbi__jpeg_get_bits(stbi__jpeg *j, int n)
 3506-{
 3507-   unsigned int k;
 3508-   if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
 3509-   if (j->code_bits < n) return 0; // ran out of bits from stream, return 0s intead of continuing
 3510-   k = stbi_lrot(j->code_buffer, n);
 3511-   j->code_buffer = k & ~stbi__bmask[n];
 3512-   k &= stbi__bmask[n];
 3513-   j->code_bits -= n;
 3514-   return k;
 3515-}
 3516-
 3517-stbi_inline static int stbi__jpeg_get_bit(stbi__jpeg *j)
 3518-{
 3519-   unsigned int k;
 3520-   if (j->code_bits < 1) stbi__grow_buffer_unsafe(j);
 3521-   if (j->code_bits < 1) return 0; // ran out of bits from stream, return 0s intead of continuing
 3522-   k = j->code_buffer;
 3523-   j->code_buffer <<= 1;
 3524-   --j->code_bits;
 3525-   return k & 0x80000000;
 3526+stbi_inline static int
 3527+stbi__jpeg_get_bits(stbi__jpeg *j, int n)
 3528+{
 3529+	unsigned int k;
 3530+	if (j->code_bits < n) {
 3531+		stbi__grow_buffer_unsafe(j);
 3532+	}
 3533+	if (j->code_bits < n) {
 3534+		return 0; // ran out of bits from stream, return 0s intead of continuing
 3535+	}
 3536+	k = stbi_lrot(j->code_buffer, n);
 3537+	j->code_buffer = k & ~stbi__bmask[n];
 3538+	k &= stbi__bmask[n];
 3539+	j->code_bits -= n;
 3540+	return k;
 3541+}
 3542+
 3543+stbi_inline static int
 3544+stbi__jpeg_get_bit(stbi__jpeg *j)
 3545+{
 3546+	unsigned int k;
 3547+	if (j->code_bits < 1) {
 3548+		stbi__grow_buffer_unsafe(j);
 3549+	}
 3550+	if (j->code_bits < 1) {
 3551+		return 0; // ran out of bits from stream, return 0s intead of continuing
 3552+	}
 3553+	k = j->code_buffer;
 3554+	j->code_buffer <<= 1;
 3555+	--j->code_bits;
 3556+	return k & 0x80000000;
 3557 }
 3558 
 3559 // given a value that's at position X in the zigzag stream,
 3560 // where does it appear in the 8x8 matrix coded as row-major?
 3561-static const stbi_uc stbi__jpeg_dezigzag[64+15] =
 3562-{
 3563-    0,  1,  8, 16,  9,  2,  3, 10,
 3564-   17, 24, 32, 25, 18, 11,  4,  5,
 3565-   12, 19, 26, 33, 40, 48, 41, 34,
 3566-   27, 20, 13,  6,  7, 14, 21, 28,
 3567-   35, 42, 49, 56, 57, 50, 43, 36,
 3568-   29, 22, 15, 23, 30, 37, 44, 51,
 3569-   58, 59, 52, 45, 38, 31, 39, 46,
 3570-   53, 60, 61, 54, 47, 55, 62, 63,
 3571-   // let corrupt input sample past end
 3572-   63, 63, 63, 63, 63, 63, 63, 63,
 3573-   63, 63, 63, 63, 63, 63, 63
 3574-};
 3575+static const stbi_uc stbi__jpeg_dezigzag[64 + 15] = {
 3576+    0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, 12, 19, 26, 33, 40,
 3577+    48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28, 35, 42, 49, 56, 57, 50, 43, 36,
 3578+    29, 22, 15, 23, 30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61,
 3579+    54, 47, 55, 62, 63,
 3580+    // let corrupt input sample past end
 3581+    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63};
 3582 
 3583 // decode one 64-entry block--
 3584-static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman *hdc, stbi__huffman *hac, stbi__int16 *fac, int b, stbi__uint16 *dequant)
 3585-{
 3586-   int diff,dc,k;
 3587-   int t;
 3588-
 3589-   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
 3590-   t = stbi__jpeg_huff_decode(j, hdc);
 3591-   if (t < 0 || t > 15) return stbi__err("bad huffman code","Corrupt JPEG");
 3592-
 3593-   // 0 all the ac values now so we can do it 32-bits at a time
 3594-   memset(data,0,64*sizeof(data[0]));
 3595-
 3596-   diff = t ? stbi__extend_receive(j, t) : 0;
 3597-   if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) return stbi__err("bad delta","Corrupt JPEG");
 3598-   dc = j->img_comp[b].dc_pred + diff;
 3599-   j->img_comp[b].dc_pred = dc;
 3600-   if (!stbi__mul2shorts_valid(dc, dequant[0])) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
 3601-   data[0] = (short) (dc * dequant[0]);
 3602-
 3603-   // decode AC components, see JPEG spec
 3604-   k = 1;
 3605-   do {
 3606-      unsigned int zig;
 3607-      int c,r,s;
 3608-      if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
 3609-      c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
 3610-      r = fac[c];
 3611-      if (r) { // fast-AC path
 3612-         k += (r >> 4) & 15; // run
 3613-         s = r & 15; // combined length
 3614-         if (s > j->code_bits) return stbi__err("bad huffman code", "Combined length longer than code bits available");
 3615-         j->code_buffer <<= s;
 3616-         j->code_bits -= s;
 3617-         // decode into unzigzag'd location
 3618-         zig = stbi__jpeg_dezigzag[k++];
 3619-         data[zig] = (short) ((r >> 8) * dequant[zig]);
 3620-      } else {
 3621-         int rs = stbi__jpeg_huff_decode(j, hac);
 3622-         if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
 3623-         s = rs & 15;
 3624-         r = rs >> 4;
 3625-         if (s == 0) {
 3626-            if (rs != 0xf0) break; // end block
 3627-            k += 16;
 3628-         } else {
 3629-            k += r;
 3630-            // decode into unzigzag'd location
 3631-            zig = stbi__jpeg_dezigzag[k++];
 3632-            data[zig] = (short) (stbi__extend_receive(j,s) * dequant[zig]);
 3633-         }
 3634-      }
 3635-   } while (k < 64);
 3636-   return 1;
 3637-}
 3638-
 3639-static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg *j, short data[64], stbi__huffman *hdc, int b)
 3640-{
 3641-   int diff,dc;
 3642-   int t;
 3643-   if (j->spec_end != 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
 3644-
 3645-   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
 3646-
 3647-   if (j->succ_high == 0) {
 3648-      // first scan for DC coefficient, must be first
 3649-      memset(data,0,64*sizeof(data[0])); // 0 all the ac values now
 3650-      t = stbi__jpeg_huff_decode(j, hdc);
 3651-      if (t < 0 || t > 15) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
 3652-      diff = t ? stbi__extend_receive(j, t) : 0;
 3653-
 3654-      if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) return stbi__err("bad delta", "Corrupt JPEG");
 3655-      dc = j->img_comp[b].dc_pred + diff;
 3656-      j->img_comp[b].dc_pred = dc;
 3657-      if (!stbi__mul2shorts_valid(dc, 1 << j->succ_low)) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
 3658-      data[0] = (short) (dc * (1 << j->succ_low));
 3659-   } else {
 3660-      // refinement scan for DC coefficient
 3661-      if (stbi__jpeg_get_bit(j))
 3662-         data[0] += (short) (1 << j->succ_low);
 3663-   }
 3664-   return 1;
 3665+static int
 3666+stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman *hdc,
 3667+                        stbi__huffman *hac, stbi__int16 *fac, int b,
 3668+                        stbi__uint16 *dequant)
 3669+{
 3670+	int diff, dc, k;
 3671+	int t;
 3672+
 3673+	if (j->code_bits < 16) {
 3674+		stbi__grow_buffer_unsafe(j);
 3675+	}
 3676+	t = stbi__jpeg_huff_decode(j, hdc);
 3677+	if (t < 0 || t > 15) {
 3678+		return stbi__err("bad huffman code", "Corrupt JPEG");
 3679+	}
 3680+
 3681+	// 0 all the ac values now so we can do it 32-bits at a time
 3682+	memset(data, 0, 64 * sizeof(data[0]));
 3683+
 3684+	diff = t ? stbi__extend_receive(j, t) : 0;
 3685+	if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) {
 3686+		return stbi__err("bad delta", "Corrupt JPEG");
 3687+	}
 3688+	dc = j->img_comp[b].dc_pred + diff;
 3689+	j->img_comp[b].dc_pred = dc;
 3690+	if (!stbi__mul2shorts_valid(dc, dequant[0])) {
 3691+		return stbi__err("can't merge dc and ac", "Corrupt JPEG");
 3692+	}
 3693+	data[0] = (short)(dc * dequant[0]);
 3694+
 3695+	// decode AC components, see JPEG spec
 3696+	k = 1;
 3697+	do {
 3698+		unsigned int zig;
 3699+		int c, r, s;
 3700+		if (j->code_bits < 16) {
 3701+			stbi__grow_buffer_unsafe(j);
 3702+		}
 3703+		c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1);
 3704+		r = fac[c];
 3705+		if (r) {                // fast-AC path
 3706+			k += (r >> 4) & 15; // run
 3707+			s = r & 15;         // combined length
 3708+			if (s > j->code_bits) {
 3709+				return stbi__err(
 3710+				    "bad huffman code",
 3711+				    "Combined length longer than code bits available");
 3712+			}
 3713+			j->code_buffer <<= s;
 3714+			j->code_bits -= s;
 3715+			// decode into unzigzag'd location
 3716+			zig = stbi__jpeg_dezigzag[k++];
 3717+			data[zig] = (short)((r >> 8) * dequant[zig]);
 3718+		} else {
 3719+			int rs = stbi__jpeg_huff_decode(j, hac);
 3720+			if (rs < 0) {
 3721+				return stbi__err("bad huffman code", "Corrupt JPEG");
 3722+			}
 3723+			s = rs & 15;
 3724+			r = rs >> 4;
 3725+			if (s == 0) {
 3726+				if (rs != 0xf0) {
 3727+					break; // end block
 3728+				}
 3729+				k += 16;
 3730+			} else {
 3731+				k += r;
 3732+				// decode into unzigzag'd location
 3733+				zig = stbi__jpeg_dezigzag[k++];
 3734+				data[zig] = (short)(stbi__extend_receive(j, s) * dequant[zig]);
 3735+			}
 3736+		}
 3737+	} while (k < 64);
 3738+	return 1;
 3739+}
 3740+
 3741+static int
 3742+stbi__jpeg_decode_block_prog_dc(stbi__jpeg *j, short data[64],
 3743+                                stbi__huffman *hdc, int b)
 3744+{
 3745+	int diff, dc;
 3746+	int t;
 3747+	if (j->spec_end != 0) {
 3748+		return stbi__err("can't merge dc and ac", "Corrupt JPEG");
 3749+	}
 3750+
 3751+	if (j->code_bits < 16) {
 3752+		stbi__grow_buffer_unsafe(j);
 3753+	}
 3754+
 3755+	if (j->succ_high == 0) {
 3756+		// first scan for DC coefficient, must be first
 3757+		memset(data, 0, 64 * sizeof(data[0])); // 0 all the ac values now
 3758+		t = stbi__jpeg_huff_decode(j, hdc);
 3759+		if (t < 0 || t > 15) {
 3760+			return stbi__err("can't merge dc and ac", "Corrupt JPEG");
 3761+		}
 3762+		diff = t ? stbi__extend_receive(j, t) : 0;
 3763+
 3764+		if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) {
 3765+			return stbi__err("bad delta", "Corrupt JPEG");
 3766+		}
 3767+		dc = j->img_comp[b].dc_pred + diff;
 3768+		j->img_comp[b].dc_pred = dc;
 3769+		if (!stbi__mul2shorts_valid(dc, 1 << j->succ_low)) {
 3770+			return stbi__err("can't merge dc and ac", "Corrupt JPEG");
 3771+		}
 3772+		data[0] = (short)(dc * (1 << j->succ_low));
 3773+	} else {
 3774+		// refinement scan for DC coefficient
 3775+		if (stbi__jpeg_get_bit(j)) {
 3776+			data[0] += (short)(1 << j->succ_low);
 3777+		}
 3778+	}
 3779+	return 1;
 3780 }
 3781 
 3782 // @OPTIMIZE: store non-zigzagged during the decode passes,
 3783 // and only de-zigzag when dequantizing
 3784-static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64], stbi__huffman *hac, stbi__int16 *fac)
 3785-{
 3786-   int k;
 3787-   if (j->spec_start == 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
 3788-
 3789-   if (j->succ_high == 0) {
 3790-      int shift = j->succ_low;
 3791-
 3792-      if (j->eob_run) {
 3793-         --j->eob_run;
 3794-         return 1;
 3795-      }
 3796-
 3797-      k = j->spec_start;
 3798-      do {
 3799-         unsigned int zig;
 3800-         int c,r,s;
 3801-         if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
 3802-         c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
 3803-         r = fac[c];
 3804-         if (r) { // fast-AC path
 3805-            k += (r >> 4) & 15; // run
 3806-            s = r & 15; // combined length
 3807-            if (s > j->code_bits) return stbi__err("bad huffman code", "Combined length longer than code bits available");
 3808-            j->code_buffer <<= s;
 3809-            j->code_bits -= s;
 3810-            zig = stbi__jpeg_dezigzag[k++];
 3811-            data[zig] = (short) ((r >> 8) * (1 << shift));
 3812-         } else {
 3813-            int rs = stbi__jpeg_huff_decode(j, hac);
 3814-            if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
 3815-            s = rs & 15;
 3816-            r = rs >> 4;
 3817-            if (s == 0) {
 3818-               if (r < 15) {
 3819-                  j->eob_run = (1 << r);
 3820-                  if (r)
 3821-                     j->eob_run += stbi__jpeg_get_bits(j, r);
 3822-                  --j->eob_run;
 3823-                  break;
 3824-               }
 3825-               k += 16;
 3826-            } else {
 3827-               k += r;
 3828-               zig = stbi__jpeg_dezigzag[k++];
 3829-               data[zig] = (short) (stbi__extend_receive(j,s) * (1 << shift));
 3830-            }
 3831-         }
 3832-      } while (k <= j->spec_end);
 3833-   } else {
 3834-      // refinement scan for these AC coefficients
 3835-
 3836-      short bit = (short) (1 << j->succ_low);
 3837-
 3838-      if (j->eob_run) {
 3839-         --j->eob_run;
 3840-         for (k = j->spec_start; k <= j->spec_end; ++k) {
 3841-            short *p = &data[stbi__jpeg_dezigzag[k]];
 3842-            if (*p != 0)
 3843-               if (stbi__jpeg_get_bit(j))
 3844-                  if ((*p & bit)==0) {
 3845-                     if (*p > 0)
 3846-                        *p += bit;
 3847-                     else
 3848-                        *p -= bit;
 3849-                  }
 3850-         }
 3851-      } else {
 3852-         k = j->spec_start;
 3853-         do {
 3854-            int r,s;
 3855-            int rs = stbi__jpeg_huff_decode(j, hac); // @OPTIMIZE see if we can use the fast path here, advance-by-r is so slow, eh
 3856-            if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
 3857-            s = rs & 15;
 3858-            r = rs >> 4;
 3859-            if (s == 0) {
 3860-               if (r < 15) {
 3861-                  j->eob_run = (1 << r) - 1;
 3862-                  if (r)
 3863-                     j->eob_run += stbi__jpeg_get_bits(j, r);
 3864-                  r = 64; // force end of block
 3865-               } else {
 3866-                  // r=15 s=0 should write 16 0s, so we just do
 3867-                  // a run of 15 0s and then write s (which is 0),
 3868-                  // so we don't have to do anything special here
 3869-               }
 3870-            } else {
 3871-               if (s != 1) return stbi__err("bad huffman code", "Corrupt JPEG");
 3872-               // sign bit
 3873-               if (stbi__jpeg_get_bit(j))
 3874-                  s = bit;
 3875-               else
 3876-                  s = -bit;
 3877-            }
 3878-
 3879-            // advance by r
 3880-            while (k <= j->spec_end) {
 3881-               short *p = &data[stbi__jpeg_dezigzag[k++]];
 3882-               if (*p != 0) {
 3883-                  if (stbi__jpeg_get_bit(j))
 3884-                     if ((*p & bit)==0) {
 3885-                        if (*p > 0)
 3886-                           *p += bit;
 3887-                        else
 3888-                           *p -= bit;
 3889-                     }
 3890-               } else {
 3891-                  if (r == 0) {
 3892-                     *p = (short) s;
 3893-                     break;
 3894-                  }
 3895-                  --r;
 3896-               }
 3897-            }
 3898-         } while (k <= j->spec_end);
 3899-      }
 3900-   }
 3901-   return 1;
 3902+static int
 3903+stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64],
 3904+                                stbi__huffman *hac, stbi__int16 *fac)
 3905+{
 3906+	int k;
 3907+	if (j->spec_start == 0) {
 3908+		return stbi__err("can't merge dc and ac", "Corrupt JPEG");
 3909+	}
 3910+
 3911+	if (j->succ_high == 0) {
 3912+		int shift = j->succ_low;
 3913+
 3914+		if (j->eob_run) {
 3915+			--j->eob_run;
 3916+			return 1;
 3917+		}
 3918+
 3919+		k = j->spec_start;
 3920+		do {
 3921+			unsigned int zig;
 3922+			int c, r, s;
 3923+			if (j->code_bits < 16) {
 3924+				stbi__grow_buffer_unsafe(j);
 3925+			}
 3926+			c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1);
 3927+			r = fac[c];
 3928+			if (r) {                // fast-AC path
 3929+				k += (r >> 4) & 15; // run
 3930+				s = r & 15;         // combined length
 3931+				if (s > j->code_bits) {
 3932+					return stbi__err(
 3933+					    "bad huffman code",
 3934+					    "Combined length longer than code bits available");
 3935+				}
 3936+				j->code_buffer <<= s;
 3937+				j->code_bits -= s;
 3938+				zig = stbi__jpeg_dezigzag[k++];
 3939+				data[zig] = (short)((r >> 8) * (1 << shift));
 3940+			} else {
 3941+				int rs = stbi__jpeg_huff_decode(j, hac);
 3942+				if (rs < 0) {
 3943+					return stbi__err("bad huffman code", "Corrupt JPEG");
 3944+				}
 3945+				s = rs & 15;
 3946+				r = rs >> 4;
 3947+				if (s == 0) {
 3948+					if (r < 15) {
 3949+						j->eob_run = (1 << r);
 3950+						if (r) {
 3951+							j->eob_run += stbi__jpeg_get_bits(j, r);
 3952+						}
 3953+						--j->eob_run;
 3954+						break;
 3955+					}
 3956+					k += 16;
 3957+				} else {
 3958+					k += r;
 3959+					zig = stbi__jpeg_dezigzag[k++];
 3960+					data[zig] =
 3961+					    (short)(stbi__extend_receive(j, s) * (1 << shift));
 3962+				}
 3963+			}
 3964+		} while (k <= j->spec_end);
 3965+	} else {
 3966+		// refinement scan for these AC coefficients
 3967+
 3968+		short bit = (short)(1 << j->succ_low);
 3969+
 3970+		if (j->eob_run) {
 3971+			--j->eob_run;
 3972+			for (k = j->spec_start; k <= j->spec_end; ++k) {
 3973+				short *p = &data[stbi__jpeg_dezigzag[k]];
 3974+				if (*p != 0) {
 3975+					if (stbi__jpeg_get_bit(j)) {
 3976+						if ((*p & bit) == 0) {
 3977+							if (*p > 0) {
 3978+								*p += bit;
 3979+							} else {
 3980+								*p -= bit;
 3981+							}
 3982+						}
 3983+					}
 3984+				}
 3985+			}
 3986+		} else {
 3987+			k = j->spec_start;
 3988+			do {
 3989+				int r, s;
 3990+				int rs = stbi__jpeg_huff_decode(
 3991+				    j, hac); // @OPTIMIZE see if we can use the fast path here,
 3992+				             // advance-by-r is so slow, eh
 3993+				if (rs < 0) {
 3994+					return stbi__err("bad huffman code", "Corrupt JPEG");
 3995+				}
 3996+				s = rs & 15;
 3997+				r = rs >> 4;
 3998+				if (s == 0) {
 3999+					if (r < 15) {
 4000+						j->eob_run = (1 << r) - 1;
 4001+						if (r) {
 4002+							j->eob_run += stbi__jpeg_get_bits(j, r);
 4003+						}
 4004+						r = 64; // force end of block
 4005+					} else {
 4006+						// r=15 s=0 should write 16 0s, so we just do
 4007+						// a run of 15 0s and then write s (which is 0),
 4008+						// so we don't have to do anything special here
 4009+					}
 4010+				} else {
 4011+					if (s != 1) {
 4012+						return stbi__err("bad huffman code", "Corrupt JPEG");
 4013+					}
 4014+					// sign bit
 4015+					if (stbi__jpeg_get_bit(j)) {
 4016+						s = bit;
 4017+					} else {
 4018+						s = -bit;
 4019+					}
 4020+				}
 4021+
 4022+				// advance by r
 4023+				while (k <= j->spec_end) {
 4024+					short *p = &data[stbi__jpeg_dezigzag[k++]];
 4025+					if (*p != 0) {
 4026+						if (stbi__jpeg_get_bit(j)) {
 4027+							if ((*p & bit) == 0) {
 4028+								if (*p > 0) {
 4029+									*p += bit;
 4030+								} else {
 4031+									*p -= bit;
 4032+								}
 4033+							}
 4034+						}
 4035+					} else {
 4036+						if (r == 0) {
 4037+							*p = (short)s;
 4038+							break;
 4039+						}
 4040+						--r;
 4041+					}
 4042+				}
 4043+			} while (k <= j->spec_end);
 4044+		}
 4045+	}
 4046+	return 1;
 4047 }
 4048 
 4049 // clamp an int to the 0..255 range and return it as an stbi_uc
 4050-stbi_inline static stbi_uc stbi__clamp(int x)
 4051+stbi_inline static stbi_uc
 4052+stbi__clamp(int x)
 4053 {
 4054-   // trick to use a single test to catch both cases
 4055-   if ((unsigned int) x > 255) {
 4056-      if (x < 0) return 0;
 4057-      if (x > 255) return 255;
 4058-   }
 4059-   return (stbi_uc) x;
 4060+	// trick to use a single test to catch both cases (x < 0 and x > 255)
 4061+	if ((unsigned int)x > 255) { // a negative x converts to a large unsigned value
 4062+		if (x < 0) {
 4063+			return 0; // below the range
 4064+		}
 4065+		if (x > 255) {
 4066+			return 255; // above the range
 4067+		}
 4068+	}
 4069+	return (stbi_uc)x; // x is already within 0..255 here
 4070 }
 4071 
 4072-#define stbi__f2f(x)  ((int) (((x) * 4096 + 0.5)))
 4073-#define stbi__fsh(x)  ((x) * 4096)
 4074+#define stbi__f2f(x) ((int)(((x) * 4096 + 0.5))) // scale constant x by 4096 (1<<12), add 0.5, truncate to int
 4075+#define stbi__fsh(x) ((x) * 4096) // scale x by 4096, the same factor stbi__f2f applies
 4076 
 4077 // derived from jidctint -- DCT_ISLOW
 4078-#define STBI__IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7) \
 4079-   int t0,t1,t2,t3,p1,p2,p3,p4,p5,x0,x1,x2,x3; \
 4080-   p2 = s2;                                    \
 4081-   p3 = s6;                                    \
 4082-   p1 = (p2+p3) * stbi__f2f(0.5411961f);       \
 4083-   t2 = p1 + p3*stbi__f2f(-1.847759065f);      \
 4084-   t3 = p1 + p2*stbi__f2f( 0.765366865f);      \
 4085-   p2 = s0;                                    \
 4086-   p3 = s4;                                    \
 4087-   t0 = stbi__fsh(p2+p3);                      \
 4088-   t1 = stbi__fsh(p2-p3);                      \
 4089-   x0 = t0+t3;                                 \
 4090-   x3 = t0-t3;                                 \
 4091-   x1 = t1+t2;                                 \
 4092-   x2 = t1-t2;                                 \
 4093-   t0 = s7;                                    \
 4094-   t1 = s5;                                    \
 4095-   t2 = s3;                                    \
 4096-   t3 = s1;                                    \
 4097-   p3 = t0+t2;                                 \
 4098-   p4 = t1+t3;                                 \
 4099-   p1 = t0+t3;                                 \
 4100-   p2 = t1+t2;                                 \
 4101-   p5 = (p3+p4)*stbi__f2f( 1.175875602f);      \
 4102-   t0 = t0*stbi__f2f( 0.298631336f);           \
 4103-   t1 = t1*stbi__f2f( 2.053119869f);           \
 4104-   t2 = t2*stbi__f2f( 3.072711026f);           \
 4105-   t3 = t3*stbi__f2f( 1.501321110f);           \
 4106-   p1 = p5 + p1*stbi__f2f(-0.899976223f);      \
 4107-   p2 = p5 + p2*stbi__f2f(-2.562915447f);      \
 4108-   p3 = p3*stbi__f2f(-1.961570560f);           \
 4109-   p4 = p4*stbi__f2f(-0.390180644f);           \
 4110-   t3 += p1+p4;                                \
 4111-   t2 += p2+p3;                                \
 4112-   t1 += p2+p4;                                \
 4113-   t0 += p1+p3;
 4114-
 4115-static void stbi__idct_block(stbi_uc *out, int out_stride, short data[64])
 4116-{
 4117-   int i,val[64],*v=val;
 4118-   stbi_uc *o;
 4119-   short *d = data;
 4120-
 4121-   // columns
 4122-   for (i=0; i < 8; ++i,++d, ++v) {
 4123-      // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing
 4124-      if (d[ 8]==0 && d[16]==0 && d[24]==0 && d[32]==0
 4125-           && d[40]==0 && d[48]==0 && d[56]==0) {
 4126-         //    no shortcut                 0     seconds
 4127-         //    (1|2|3|4|5|6|7)==0          0     seconds
 4128-         //    all separate               -0.047 seconds
 4129-         //    1 && 2|3 && 4|5 && 6|7:    -0.047 seconds
 4130-         int dcterm = d[0]*4;
 4131-         v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
 4132-      } else {
 4133-         STBI__IDCT_1D(d[ 0],d[ 8],d[16],d[24],d[32],d[40],d[48],d[56])
 4134-         // constants scaled things up by 1<<12; let's bring them back
 4135-         // down, but keep 2 extra bits of precision
 4136-         x0 += 512; x1 += 512; x2 += 512; x3 += 512;
 4137-         v[ 0] = (x0+t3) >> 10;
 4138-         v[56] = (x0-t3) >> 10;
 4139-         v[ 8] = (x1+t2) >> 10;
 4140-         v[48] = (x1-t2) >> 10;
 4141-         v[16] = (x2+t1) >> 10;
 4142-         v[40] = (x2-t1) >> 10;
 4143-         v[24] = (x3+t0) >> 10;
 4144-         v[32] = (x3-t0) >> 10;
 4145-      }
 4146-   }
 4147-
 4148-   for (i=0, v=val, o=out; i < 8; ++i,v+=8,o+=out_stride) {
 4149-      // no fast case since the first 1D IDCT spread components out
 4150-      STBI__IDCT_1D(v[0],v[1],v[2],v[3],v[4],v[5],v[6],v[7])
 4151-      // constants scaled things up by 1<<12, plus we had 1<<2 from first
 4152-      // loop, plus horizontal and vertical each scale by sqrt(8) so together
 4153-      // we've got an extra 1<<3, so 1<<17 total we need to remove.
 4154-      // so we want to round that, which means adding 0.5 * 1<<17,
 4155-      // aka 65536. Also, we'll end up with -128 to 127 that we want
 4156-      // to encode as 0..255 by adding 128, so we'll add that before the shift
 4157-      x0 += 65536 + (128<<17);
 4158-      x1 += 65536 + (128<<17);
 4159-      x2 += 65536 + (128<<17);
 4160-      x3 += 65536 + (128<<17);
 4161-      // tried computing the shifts into temps, or'ing the temps to see
 4162-      // if any were out of range, but that was slower
 4163-      o[0] = stbi__clamp((x0+t3) >> 17);
 4164-      o[7] = stbi__clamp((x0-t3) >> 17);
 4165-      o[1] = stbi__clamp((x1+t2) >> 17);
 4166-      o[6] = stbi__clamp((x1-t2) >> 17);
 4167-      o[2] = stbi__clamp((x2+t1) >> 17);
 4168-      o[5] = stbi__clamp((x2-t1) >> 17);
 4169-      o[3] = stbi__clamp((x3+t0) >> 17);
 4170-      o[4] = stbi__clamp((x3-t0) >> 17);
 4171-   }
 4172+#define STBI__IDCT_1D(s0, s1, s2, s3, s4, s5, s6, s7) /* one 1D IDCT pass */   \
 4173+	int t0, t1, t2, t3, p1, p2, p3, p4, p5, x0, x1, x2, x3; /* new locals */   \
 4174+	p2 = s2; /* even part: inputs s2, s6, s0, s4 */                            \
 4175+	p3 = s6;                                                                   \
 4176+	p1 = (p2 + p3) * stbi__f2f(0.5411961f);                                    \
 4177+	t2 = p1 + p3 * stbi__f2f(-1.847759065f);                                   \
 4178+	t3 = p1 + p2 * stbi__f2f(0.765366865f);                                    \
 4179+	p2 = s0;                                                                   \
 4180+	p3 = s4;                                                                   \
 4181+	t0 = stbi__fsh(p2 + p3); /* same 4096 scale as stbi__f2f */                \
 4182+	t1 = stbi__fsh(p2 - p3);                                                   \
 4183+	x0 = t0 + t3;                                                              \
 4184+	x3 = t0 - t3;                                                              \
 4185+	x1 = t1 + t2;                                                              \
 4186+	x2 = t1 - t2;                                                              \
 4187+	t0 = s7; /* odd part: inputs s7, s5, s3, s1 */                             \
 4188+	t1 = s5;                                                                   \
 4189+	t2 = s3;                                                                   \
 4190+	t3 = s1;                                                                   \
 4191+	p3 = t0 + t2;                                                              \
 4192+	p4 = t1 + t3;                                                              \
 4193+	p1 = t0 + t3;                                                              \
 4194+	p2 = t1 + t2;                                                              \
 4195+	p5 = (p3 + p4) * stbi__f2f(1.175875602f);                                  \
 4196+	t0 = t0 * stbi__f2f(0.298631336f);                                         \
 4197+	t1 = t1 * stbi__f2f(2.053119869f);                                         \
 4198+	t2 = t2 * stbi__f2f(3.072711026f);                                         \
 4199+	t3 = t3 * stbi__f2f(1.501321110f);                                         \
 4200+	p1 = p5 + p1 * stbi__f2f(-0.899976223f);                                   \
 4201+	p2 = p5 + p2 * stbi__f2f(-2.562915447f);                                   \
 4202+	p3 = p3 * stbi__f2f(-1.961570560f);                                        \
 4203+	p4 = p4 * stbi__f2f(-0.390180644f);                                        \
 4204+	t3 += p1 + p4;                                                             \
 4205+	t2 += p2 + p3;                                                             \
 4206+	t1 += p2 + p4;                                                             \
 4207+	t0 += p1 + p3; /* leaves x0..x3 and t0..t3 for the expansion site to combine */
 4208+// 8x8 integer IDCT of data[64]; writes 8 rows of 8 bytes to out, out_stride apart
 4209+static void
 4210+stbi__idct_block(stbi_uc *out, int out_stride, short data[64])
 4211+{
 4212+	int i, val[64], *v = val; // val: column-pass results, read back by the row pass
 4213+	stbi_uc *o; // start of the output row being written
 4214+	short *d = data;
 4215+
 4216+	// columns: 1D IDCT down each of the 8 columns of data (stride 8) into val
 4217+	for (i = 0; i < 8; ++i, ++d, ++v) {
 4218+		// if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing
 4219+		if (d[8] == 0 && d[16] == 0 && d[24] == 0 && d[32] == 0 && d[40] == 0 &&
 4220+		    d[48] == 0 && d[56] == 0) {
 4221+			//    no shortcut                 0     seconds
 4222+			//    (1|2|3|4|5|6|7)==0          0     seconds
 4223+			//    all separate               -0.047 seconds
 4224+			//    1 && 2|3 && 4|5 && 6|7:    -0.047 seconds
 4225+			int dcterm = d[0] * 4; // only d[0] can be nonzero; *4 matches the 2 extra bits the general path keeps
 4226+			v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] =
 4227+			    dcterm;
 4228+		} else {
 4229+			STBI__IDCT_1D(d[0], d[8], d[16], d[24], d[32], d[40], d[48], d[56])
 4230+			// constants scaled things up by 1<<12; let's bring them back
 4231+			// down, but keep 2 extra bits of precision
 4232+			x0 += 512;
 4233+			x1 += 512;
 4234+			x2 += 512;
 4235+			x3 += 512;
 4236+			v[0] = (x0 + t3) >> 10;
 4237+			v[56] = (x0 - t3) >> 10;
 4238+			v[8] = (x1 + t2) >> 10;
 4239+			v[48] = (x1 - t2) >> 10;
 4240+			v[16] = (x2 + t1) >> 10;
 4241+			v[40] = (x2 - t1) >> 10;
 4242+			v[24] = (x3 + t0) >> 10;
 4243+			v[32] = (x3 - t0) >> 10;
 4244+		}
 4245+	}
 4246+
 4247+	for (i = 0, v = val, o = out; i < 8; ++i, v += 8, o += out_stride) {
 4248+		// rows: no fast case since the first 1D IDCT spread components out
 4249+		STBI__IDCT_1D(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7])
 4250+		// constants scaled things up by 1<<12, plus we had 1<<2 from first
 4251+		// loop, plus horizontal and vertical each scale by sqrt(8) so together
 4252+		// we've got an extra 1<<3, so 1<<17 total we need to remove.
 4253+		// so we want to round that, which means adding 0.5 * 1<<17,
 4254+		// aka 65536. Also, we'll end up with -128 to 127 that we want
 4255+		// to encode as 0..255 by adding 128, so we'll add that before the shift
 4256+		x0 += 65536 + (128 << 17);
 4257+		x1 += 65536 + (128 << 17);
 4258+		x2 += 65536 + (128 << 17);
 4259+		x3 += 65536 + (128 << 17);
 4260+		// tried computing the shifts into temps, or'ing the temps to see
 4261+		// if any were out of range, but that was slower
 4262+		o[0] = stbi__clamp((x0 + t3) >> 17);
 4263+		o[7] = stbi__clamp((x0 - t3) >> 17);
 4264+		o[1] = stbi__clamp((x1 + t2) >> 17);
 4265+		o[6] = stbi__clamp((x1 - t2) >> 17);
 4266+		o[2] = stbi__clamp((x2 + t1) >> 17);
 4267+		o[5] = stbi__clamp((x2 - t1) >> 17);
 4268+		o[3] = stbi__clamp((x3 + t0) >> 17);
 4269+		o[4] = stbi__clamp((x3 - t0) >> 17);
 4270+	}
 4271 }
 4272 
 4273 #ifdef STBI_SSE2
 4274 // sse2 integer IDCT. not the fastest possible implementation but it
 4275 // produces bit-identical results to the generic C version so it's
 4276 // fully "transparent".
 4277-static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64])
 4278-{
 4279-   // This is constructed to match our regular (generic) integer IDCT exactly.
 4280-   __m128i row0, row1, row2, row3, row4, row5, row6, row7;
 4281-   __m128i tmp;
 4282-
 4283-   // dot product constant: even elems=x, odd elems=y
 4284-   #define dct_const(x,y)  _mm_setr_epi16((x),(y),(x),(y),(x),(y),(x),(y))
 4285-
 4286-   // out(0) = c0[even]*x + c0[odd]*y   (c0, x, y 16-bit, out 32-bit)
 4287-   // out(1) = c1[even]*x + c1[odd]*y
 4288-   #define dct_rot(out0,out1, x,y,c0,c1) \
 4289-      __m128i c0##lo = _mm_unpacklo_epi16((x),(y)); \
 4290-      __m128i c0##hi = _mm_unpackhi_epi16((x),(y)); \
 4291-      __m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \
 4292-      __m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \
 4293-      __m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \
 4294-      __m128i out1##_h = _mm_madd_epi16(c0##hi, c1)
 4295-
 4296-   // out = in << 12  (in 16-bit, out 32-bit)
 4297-   #define dct_widen(out, in) \
 4298-      __m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \
 4299-      __m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4)
 4300-
 4301-   // wide add
 4302-   #define dct_wadd(out, a, b) \
 4303-      __m128i out##_l = _mm_add_epi32(a##_l, b##_l); \
 4304-      __m128i out##_h = _mm_add_epi32(a##_h, b##_h)
 4305-
 4306-   // wide sub
 4307-   #define dct_wsub(out, a, b) \
 4308-      __m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \
 4309-      __m128i out##_h = _mm_sub_epi32(a##_h, b##_h)
 4310-
 4311-   // butterfly a/b, add bias, then shift by "s" and pack
 4312-   #define dct_bfly32o(out0, out1, a,b,bias,s) \
 4313-      { \
 4314-         __m128i abiased_l = _mm_add_epi32(a##_l, bias); \
 4315-         __m128i abiased_h = _mm_add_epi32(a##_h, bias); \
 4316-         dct_wadd(sum, abiased, b); \
 4317-         dct_wsub(dif, abiased, b); \
 4318-         out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \
 4319-         out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \
 4320-      }
 4321-
 4322-   // 8-bit interleave step (for transposes)
 4323-   #define dct_interleave8(a, b) \
 4324-      tmp = a; \
 4325-      a = _mm_unpacklo_epi8(a, b); \
 4326-      b = _mm_unpackhi_epi8(tmp, b)
 4327-
 4328-   // 16-bit interleave step (for transposes)
 4329-   #define dct_interleave16(a, b) \
 4330-      tmp = a; \
 4331-      a = _mm_unpacklo_epi16(a, b); \
 4332-      b = _mm_unpackhi_epi16(tmp, b)
 4333-
 4334-   #define dct_pass(bias,shift) \
 4335-      { \
 4336-         /* even part */ \
 4337-         dct_rot(t2e,t3e, row2,row6, rot0_0,rot0_1); \
 4338-         __m128i sum04 = _mm_add_epi16(row0, row4); \
 4339-         __m128i dif04 = _mm_sub_epi16(row0, row4); \
 4340-         dct_widen(t0e, sum04); \
 4341-         dct_widen(t1e, dif04); \
 4342-         dct_wadd(x0, t0e, t3e); \
 4343-         dct_wsub(x3, t0e, t3e); \
 4344-         dct_wadd(x1, t1e, t2e); \
 4345-         dct_wsub(x2, t1e, t2e); \
 4346-         /* odd part */ \
 4347-         dct_rot(y0o,y2o, row7,row3, rot2_0,rot2_1); \
 4348-         dct_rot(y1o,y3o, row5,row1, rot3_0,rot3_1); \
 4349-         __m128i sum17 = _mm_add_epi16(row1, row7); \
 4350-         __m128i sum35 = _mm_add_epi16(row3, row5); \
 4351-         dct_rot(y4o,y5o, sum17,sum35, rot1_0,rot1_1); \
 4352-         dct_wadd(x4, y0o, y4o); \
 4353-         dct_wadd(x5, y1o, y5o); \
 4354-         dct_wadd(x6, y2o, y5o); \
 4355-         dct_wadd(x7, y3o, y4o); \
 4356-         dct_bfly32o(row0,row7, x0,x7,bias,shift); \
 4357-         dct_bfly32o(row1,row6, x1,x6,bias,shift); \
 4358-         dct_bfly32o(row2,row5, x2,x5,bias,shift); \
 4359-         dct_bfly32o(row3,row4, x3,x4,bias,shift); \
 4360-      }
 4361-
 4362-   __m128i rot0_0 = dct_const(stbi__f2f(0.5411961f), stbi__f2f(0.5411961f) + stbi__f2f(-1.847759065f));
 4363-   __m128i rot0_1 = dct_const(stbi__f2f(0.5411961f) + stbi__f2f( 0.765366865f), stbi__f2f(0.5411961f));
 4364-   __m128i rot1_0 = dct_const(stbi__f2f(1.175875602f) + stbi__f2f(-0.899976223f), stbi__f2f(1.175875602f));
 4365-   __m128i rot1_1 = dct_const(stbi__f2f(1.175875602f), stbi__f2f(1.175875602f) + stbi__f2f(-2.562915447f));
 4366-   __m128i rot2_0 = dct_const(stbi__f2f(-1.961570560f) + stbi__f2f( 0.298631336f), stbi__f2f(-1.961570560f));
 4367-   __m128i rot2_1 = dct_const(stbi__f2f(-1.961570560f), stbi__f2f(-1.961570560f) + stbi__f2f( 3.072711026f));
 4368-   __m128i rot3_0 = dct_const(stbi__f2f(-0.390180644f) + stbi__f2f( 2.053119869f), stbi__f2f(-0.390180644f));
 4369-   __m128i rot3_1 = dct_const(stbi__f2f(-0.390180644f), stbi__f2f(-0.390180644f) + stbi__f2f( 1.501321110f));
 4370-
 4371-   // rounding biases in column/row passes, see stbi__idct_block for explanation.
 4372-   __m128i bias_0 = _mm_set1_epi32(512);
 4373-   __m128i bias_1 = _mm_set1_epi32(65536 + (128<<17));
 4374-
 4375-   // load
 4376-   row0 = _mm_load_si128((const __m128i *) (data + 0*8));
 4377-   row1 = _mm_load_si128((const __m128i *) (data + 1*8));
 4378-   row2 = _mm_load_si128((const __m128i *) (data + 2*8));
 4379-   row3 = _mm_load_si128((const __m128i *) (data + 3*8));
 4380-   row4 = _mm_load_si128((const __m128i *) (data + 4*8));
 4381-   row5 = _mm_load_si128((const __m128i *) (data + 5*8));
 4382-   row6 = _mm_load_si128((const __m128i *) (data + 6*8));
 4383-   row7 = _mm_load_si128((const __m128i *) (data + 7*8));
 4384-
 4385-   // column pass
 4386-   dct_pass(bias_0, 10);
 4387-
 4388-   {
 4389-      // 16bit 8x8 transpose pass 1
 4390-      dct_interleave16(row0, row4);
 4391-      dct_interleave16(row1, row5);
 4392-      dct_interleave16(row2, row6);
 4393-      dct_interleave16(row3, row7);
 4394-
 4395-      // transpose pass 2
 4396-      dct_interleave16(row0, row2);
 4397-      dct_interleave16(row1, row3);
 4398-      dct_interleave16(row4, row6);
 4399-      dct_interleave16(row5, row7);
 4400-
 4401-      // transpose pass 3
 4402-      dct_interleave16(row0, row1);
 4403-      dct_interleave16(row2, row3);
 4404-      dct_interleave16(row4, row5);
 4405-      dct_interleave16(row6, row7);
 4406-   }
 4407-
 4408-   // row pass
 4409-   dct_pass(bias_1, 17);
 4410-
 4411-   {
 4412-      // pack
 4413-      __m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7
 4414-      __m128i p1 = _mm_packus_epi16(row2, row3);
 4415-      __m128i p2 = _mm_packus_epi16(row4, row5);
 4416-      __m128i p3 = _mm_packus_epi16(row6, row7);
 4417-
 4418-      // 8bit 8x8 transpose pass 1
 4419-      dct_interleave8(p0, p2); // a0e0a1e1...
 4420-      dct_interleave8(p1, p3); // c0g0c1g1...
 4421-
 4422-      // transpose pass 2
 4423-      dct_interleave8(p0, p1); // a0c0e0g0...
 4424-      dct_interleave8(p2, p3); // b0d0f0h0...
 4425-
 4426-      // transpose pass 3
 4427-      dct_interleave8(p0, p2); // a0b0c0d0...
 4428-      dct_interleave8(p1, p3); // a4b4c4d4...
 4429-
 4430-      // store
 4431-      _mm_storel_epi64((__m128i *) out, p0); out += out_stride;
 4432-      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p0, 0x4e)); out += out_stride;
 4433-      _mm_storel_epi64((__m128i *) out, p2); out += out_stride;
 4434-      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p2, 0x4e)); out += out_stride;
 4435-      _mm_storel_epi64((__m128i *) out, p1); out += out_stride;
 4436-      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p1, 0x4e)); out += out_stride;
 4437-      _mm_storel_epi64((__m128i *) out, p3); out += out_stride;
 4438-      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p3, 0x4e));
 4439-   }
 4440+static void
 4441+stbi__idct_simd(stbi_uc *out, int out_stride, short data[64])
 4442+{
 4443+	// This is constructed to match our regular (generic) integer IDCT exactly.
 4444+	__m128i row0, row1, row2, row3, row4, row5, row6, row7;
 4445+	__m128i tmp;
 4446+
 4447+// dot product constant: even elems=x, odd elems=y
 4448+#define dct_const(x, y) _mm_setr_epi16((x), (y), (x), (y), (x), (y), (x), (y))
 4449+
 4450+// out(0) = c0[even]*x + c0[odd]*y   (c0, x, y 16-bit, out 32-bit)
 4451+// out(1) = c1[even]*x + c1[odd]*y
 4452+#define dct_rot(out0, out1, x, y, c0, c1)                                      \
 4453+	__m128i c0##lo = _mm_unpacklo_epi16((x), (y));                             \
 4454+	__m128i c0##hi = _mm_unpackhi_epi16((x), (y));                             \
 4455+	__m128i out0##_l = _mm_madd_epi16(c0##lo, c0);                             \
 4456+	__m128i out0##_h = _mm_madd_epi16(c0##hi, c0);                             \
 4457+	__m128i out1##_l = _mm_madd_epi16(c0##lo, c1);                             \
 4458+	__m128i out1##_h = _mm_madd_epi16(c0##hi, c1)
 4459+
 4460+// out = in << 12  (in 16-bit, out 32-bit)
 4461+#define dct_widen(out, in)                                                     \
 4462+	__m128i out##_l =                                                          \
 4463+	    _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4);      \
 4464+	__m128i out##_h =                                                          \
 4465+	    _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4)
 4466+
 4467+// wide add
 4468+#define dct_wadd(out, a, b)                                                    \
 4469+	__m128i out##_l = _mm_add_epi32(a##_l, b##_l);                             \
 4470+	__m128i out##_h = _mm_add_epi32(a##_h, b##_h)
 4471+
 4472+// wide sub
 4473+#define dct_wsub(out, a, b)                                                    \
 4474+	__m128i out##_l = _mm_sub_epi32(a##_l, b##_l);                             \
 4475+	__m128i out##_h = _mm_sub_epi32(a##_h, b##_h)
 4476+
 4477+// butterfly a/b, add bias, then shift by "s" and pack
 4478+#define dct_bfly32o(out0, out1, a, b, bias, s)                                 \
 4479+	{                                                                          \
 4480+		__m128i abiased_l = _mm_add_epi32(a##_l, bias);                        \
 4481+		__m128i abiased_h = _mm_add_epi32(a##_h, bias);                        \
 4482+		dct_wadd(sum, abiased, b);                                             \
 4483+		dct_wsub(dif, abiased, b);                                             \
 4484+		out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s),                       \
 4485+		                       _mm_srai_epi32(sum_h, s));                      \
 4486+		out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s),                       \
 4487+		                       _mm_srai_epi32(dif_h, s));                      \
 4488+	}
 4489+
 4490+// 8-bit interleave step (for transposes)
 4491+#define dct_interleave8(a, b)                                                  \
 4492+	tmp = a;                                                                   \
 4493+	a = _mm_unpacklo_epi8(a, b);                                               \
 4494+	b = _mm_unpackhi_epi8(tmp, b)
 4495+
 4496+// 16-bit interleave step (for transposes)
 4497+#define dct_interleave16(a, b)                                                 \
 4498+	tmp = a;                                                                   \
 4499+	a = _mm_unpacklo_epi16(a, b);                                              \
 4500+	b = _mm_unpackhi_epi16(tmp, b)
 4501+
 4502+#define dct_pass(bias, shift)                                                  \
 4503+	{                                                                          \
 4504+		/* even part */                                                        \
 4505+		dct_rot(t2e, t3e, row2, row6, rot0_0, rot0_1);                         \
 4506+		__m128i sum04 = _mm_add_epi16(row0, row4);                             \
 4507+		__m128i dif04 = _mm_sub_epi16(row0, row4);                             \
 4508+		dct_widen(t0e, sum04);                                                 \
 4509+		dct_widen(t1e, dif04);                                                 \
 4510+		dct_wadd(x0, t0e, t3e);                                                \
 4511+		dct_wsub(x3, t0e, t3e);                                                \
 4512+		dct_wadd(x1, t1e, t2e);                                                \
 4513+		dct_wsub(x2, t1e, t2e);                                                \
 4514+		/* odd part */                                                         \
 4515+		dct_rot(y0o, y2o, row7, row3, rot2_0, rot2_1);                         \
 4516+		dct_rot(y1o, y3o, row5, row1, rot3_0, rot3_1);                         \
 4517+		__m128i sum17 = _mm_add_epi16(row1, row7);                             \
 4518+		__m128i sum35 = _mm_add_epi16(row3, row5);                             \
 4519+		dct_rot(y4o, y5o, sum17, sum35, rot1_0, rot1_1);                       \
 4520+		dct_wadd(x4, y0o, y4o);                                                \
 4521+		dct_wadd(x5, y1o, y5o);                                                \
 4522+		dct_wadd(x6, y2o, y5o);                                                \
 4523+		dct_wadd(x7, y3o, y4o);                                                \
 4524+		dct_bfly32o(row0, row7, x0, x7, bias, shift);                          \
 4525+		dct_bfly32o(row1, row6, x1, x6, bias, shift);                          \
 4526+		dct_bfly32o(row2, row5, x2, x5, bias, shift);                          \
 4527+		dct_bfly32o(row3, row4, x3, x4, bias, shift);                          \
 4528+	}
 4529+
 4530+	__m128i rot0_0 =
 4531+	    dct_const(stbi__f2f(0.5411961f),
 4532+	              stbi__f2f(0.5411961f) + stbi__f2f(-1.847759065f));
 4533+	__m128i rot0_1 = dct_const(stbi__f2f(0.5411961f) + stbi__f2f(0.765366865f),
 4534+	                           stbi__f2f(0.5411961f));
 4535+	__m128i rot1_0 =
 4536+	    dct_const(stbi__f2f(1.175875602f) + stbi__f2f(-0.899976223f),
 4537+	              stbi__f2f(1.175875602f));
 4538+	__m128i rot1_1 =
 4539+	    dct_const(stbi__f2f(1.175875602f),
 4540+	              stbi__f2f(1.175875602f) + stbi__f2f(-2.562915447f));
 4541+	__m128i rot2_0 =
 4542+	    dct_const(stbi__f2f(-1.961570560f) + stbi__f2f(0.298631336f),
 4543+	              stbi__f2f(-1.961570560f));
 4544+	__m128i rot2_1 =
 4545+	    dct_const(stbi__f2f(-1.961570560f),
 4546+	              stbi__f2f(-1.961570560f) + stbi__f2f(3.072711026f));
 4547+	__m128i rot3_0 =
 4548+	    dct_const(stbi__f2f(-0.390180644f) + stbi__f2f(2.053119869f),
 4549+	              stbi__f2f(-0.390180644f));
 4550+	__m128i rot3_1 =
 4551+	    dct_const(stbi__f2f(-0.390180644f),
 4552+	              stbi__f2f(-0.390180644f) + stbi__f2f(1.501321110f));
 4553+
 4554+	// rounding biases in column/row passes, see stbi__idct_block for
 4555+	// explanation.
 4556+	__m128i bias_0 = _mm_set1_epi32(512);
 4557+	__m128i bias_1 = _mm_set1_epi32(65536 + (128 << 17));
 4558+
 4559+	// load
 4560+	row0 = _mm_load_si128((const __m128i *)(data + 0 * 8));
 4561+	row1 = _mm_load_si128((const __m128i *)(data + 1 * 8));
 4562+	row2 = _mm_load_si128((const __m128i *)(data + 2 * 8));
 4563+	row3 = _mm_load_si128((const __m128i *)(data + 3 * 8));
 4564+	row4 = _mm_load_si128((const __m128i *)(data + 4 * 8));
 4565+	row5 = _mm_load_si128((const __m128i *)(data + 5 * 8));
 4566+	row6 = _mm_load_si128((const __m128i *)(data + 6 * 8));
 4567+	row7 = _mm_load_si128((const __m128i *)(data + 7 * 8));
 4568+
 4569+	// column pass
 4570+	dct_pass(bias_0, 10);
 4571+
 4572+	{
 4573+		// 16bit 8x8 transpose pass 1
 4574+		dct_interleave16(row0, row4);
 4575+		dct_interleave16(row1, row5);
 4576+		dct_interleave16(row2, row6);
 4577+		dct_interleave16(row3, row7);
 4578+
 4579+		// transpose pass 2
 4580+		dct_interleave16(row0, row2);
 4581+		dct_interleave16(row1, row3);
 4582+		dct_interleave16(row4, row6);
 4583+		dct_interleave16(row5, row7);
 4584+
 4585+		// transpose pass 3
 4586+		dct_interleave16(row0, row1);
 4587+		dct_interleave16(row2, row3);
 4588+		dct_interleave16(row4, row5);
 4589+		dct_interleave16(row6, row7);
 4590+	}
 4591+
 4592+	// row pass
 4593+	dct_pass(bias_1, 17);
 4594+
 4595+	{
 4596+		// pack
 4597+		__m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7
 4598+		__m128i p1 = _mm_packus_epi16(row2, row3);
 4599+		__m128i p2 = _mm_packus_epi16(row4, row5);
 4600+		__m128i p3 = _mm_packus_epi16(row6, row7);
 4601+
 4602+		// 8bit 8x8 transpose pass 1
 4603+		dct_interleave8(p0, p2); // a0e0a1e1...
 4604+		dct_interleave8(p1, p3); // c0g0c1g1...
 4605+
 4606+		// transpose pass 2
 4607+		dct_interleave8(p0, p1); // a0c0e0g0...
 4608+		dct_interleave8(p2, p3); // b0d0f0h0...
 4609+
 4610+		// transpose pass 3
 4611+		dct_interleave8(p0, p2); // a0b0c0d0...
 4612+		dct_interleave8(p1, p3); // a4b4c4d4...
 4613+
 4614+		// store
 4615+		_mm_storel_epi64((__m128i *)out, p0);
 4616+		out += out_stride;
 4617+		_mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p0, 0x4e));
 4618+		out += out_stride;
 4619+		_mm_storel_epi64((__m128i *)out, p2);
 4620+		out += out_stride;
 4621+		_mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p2, 0x4e));
 4622+		out += out_stride;
 4623+		_mm_storel_epi64((__m128i *)out, p1);
 4624+		out += out_stride;
 4625+		_mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p1, 0x4e));
 4626+		out += out_stride;
 4627+		_mm_storel_epi64((__m128i *)out, p3);
 4628+		out += out_stride;
 4629+		_mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p3, 0x4e));
 4630+	}
 4631 
 4632 #undef dct_const
 4633 #undef dct_rot
 4634@@ -2708,198 +3306,240 @@ static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64])
 4635 
 4636 // NEON integer IDCT. should produce bit-identical
 4637 // results to the generic C version.
 4638-static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64])
 4639-{
 4640-   int16x8_t row0, row1, row2, row3, row4, row5, row6, row7;
 4641-
 4642-   int16x4_t rot0_0 = vdup_n_s16(stbi__f2f(0.5411961f));
 4643-   int16x4_t rot0_1 = vdup_n_s16(stbi__f2f(-1.847759065f));
 4644-   int16x4_t rot0_2 = vdup_n_s16(stbi__f2f( 0.765366865f));
 4645-   int16x4_t rot1_0 = vdup_n_s16(stbi__f2f( 1.175875602f));
 4646-   int16x4_t rot1_1 = vdup_n_s16(stbi__f2f(-0.899976223f));
 4647-   int16x4_t rot1_2 = vdup_n_s16(stbi__f2f(-2.562915447f));
 4648-   int16x4_t rot2_0 = vdup_n_s16(stbi__f2f(-1.961570560f));
 4649-   int16x4_t rot2_1 = vdup_n_s16(stbi__f2f(-0.390180644f));
 4650-   int16x4_t rot3_0 = vdup_n_s16(stbi__f2f( 0.298631336f));
 4651-   int16x4_t rot3_1 = vdup_n_s16(stbi__f2f( 2.053119869f));
 4652-   int16x4_t rot3_2 = vdup_n_s16(stbi__f2f( 3.072711026f));
 4653-   int16x4_t rot3_3 = vdup_n_s16(stbi__f2f( 1.501321110f));
 4654-
 4655-#define dct_long_mul(out, inq, coeff) \
 4656-   int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \
 4657-   int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff)
 4658-
 4659-#define dct_long_mac(out, acc, inq, coeff) \
 4660-   int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff); \
 4661-   int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff)
 4662-
 4663-#define dct_widen(out, inq) \
 4664-   int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \
 4665-   int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12)
 4666+static void
 4667+stbi__idct_simd(stbi_uc *out, int out_stride, short data[64])
 4668+{
 4669+	int16x8_t row0, row1, row2, row3, row4, row5, row6, row7;
 4670+
 4671+	int16x4_t rot0_0 = vdup_n_s16(stbi__f2f(0.5411961f));
 4672+	int16x4_t rot0_1 = vdup_n_s16(stbi__f2f(-1.847759065f));
 4673+	int16x4_t rot0_2 = vdup_n_s16(stbi__f2f(0.765366865f));
 4674+	int16x4_t rot1_0 = vdup_n_s16(stbi__f2f(1.175875602f));
 4675+	int16x4_t rot1_1 = vdup_n_s16(stbi__f2f(-0.899976223f));
 4676+	int16x4_t rot1_2 = vdup_n_s16(stbi__f2f(-2.562915447f));
 4677+	int16x4_t rot2_0 = vdup_n_s16(stbi__f2f(-1.961570560f));
 4678+	int16x4_t rot2_1 = vdup_n_s16(stbi__f2f(-0.390180644f));
 4679+	int16x4_t rot3_0 = vdup_n_s16(stbi__f2f(0.298631336f));
 4680+	int16x4_t rot3_1 = vdup_n_s16(stbi__f2f(2.053119869f));
 4681+	int16x4_t rot3_2 = vdup_n_s16(stbi__f2f(3.072711026f));
 4682+	int16x4_t rot3_3 = vdup_n_s16(stbi__f2f(1.501321110f));
 4683+
 4684+#define dct_long_mul(out, inq, coeff)                                          \
 4685+	int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff);                   \
 4686+	int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff)
 4687+
 4688+#define dct_long_mac(out, acc, inq, coeff)                                     \
 4689+	int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff);          \
 4690+	int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff)
 4691+
 4692+#define dct_widen(out, inq)                                                    \
 4693+	int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12);                    \
 4694+	int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12)
 4695 
 4696 // wide add
 4697-#define dct_wadd(out, a, b) \
 4698-   int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \
 4699-   int32x4_t out##_h = vaddq_s32(a##_h, b##_h)
 4700+#define dct_wadd(out, a, b)                                                    \
 4701+	int32x4_t out##_l = vaddq_s32(a##_l, b##_l);                               \
 4702+	int32x4_t out##_h = vaddq_s32(a##_h, b##_h)
 4703 
 4704 // wide sub
 4705-#define dct_wsub(out, a, b) \
 4706-   int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \
 4707-   int32x4_t out##_h = vsubq_s32(a##_h, b##_h)
 4708+#define dct_wsub(out, a, b)                                                    \
 4709+	int32x4_t out##_l = vsubq_s32(a##_l, b##_l);                               \
 4710+	int32x4_t out##_h = vsubq_s32(a##_h, b##_h)
 4711 
 4712 // butterfly a/b, then shift using "shiftop" by "s" and pack
 4713-#define dct_bfly32o(out0,out1, a,b,shiftop,s) \
 4714-   { \
 4715-      dct_wadd(sum, a, b); \
 4716-      dct_wsub(dif, a, b); \
 4717-      out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \
 4718-      out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \
 4719-   }
 4720-
 4721-#define dct_pass(shiftop, shift) \
 4722-   { \
 4723-      /* even part */ \
 4724-      int16x8_t sum26 = vaddq_s16(row2, row6); \
 4725-      dct_long_mul(p1e, sum26, rot0_0); \
 4726-      dct_long_mac(t2e, p1e, row6, rot0_1); \
 4727-      dct_long_mac(t3e, p1e, row2, rot0_2); \
 4728-      int16x8_t sum04 = vaddq_s16(row0, row4); \
 4729-      int16x8_t dif04 = vsubq_s16(row0, row4); \
 4730-      dct_widen(t0e, sum04); \
 4731-      dct_widen(t1e, dif04); \
 4732-      dct_wadd(x0, t0e, t3e); \
 4733-      dct_wsub(x3, t0e, t3e); \
 4734-      dct_wadd(x1, t1e, t2e); \
 4735-      dct_wsub(x2, t1e, t2e); \
 4736-      /* odd part */ \
 4737-      int16x8_t sum15 = vaddq_s16(row1, row5); \
 4738-      int16x8_t sum17 = vaddq_s16(row1, row7); \
 4739-      int16x8_t sum35 = vaddq_s16(row3, row5); \
 4740-      int16x8_t sum37 = vaddq_s16(row3, row7); \
 4741-      int16x8_t sumodd = vaddq_s16(sum17, sum35); \
 4742-      dct_long_mul(p5o, sumodd, rot1_0); \
 4743-      dct_long_mac(p1o, p5o, sum17, rot1_1); \
 4744-      dct_long_mac(p2o, p5o, sum35, rot1_2); \
 4745-      dct_long_mul(p3o, sum37, rot2_0); \
 4746-      dct_long_mul(p4o, sum15, rot2_1); \
 4747-      dct_wadd(sump13o, p1o, p3o); \
 4748-      dct_wadd(sump24o, p2o, p4o); \
 4749-      dct_wadd(sump23o, p2o, p3o); \
 4750-      dct_wadd(sump14o, p1o, p4o); \
 4751-      dct_long_mac(x4, sump13o, row7, rot3_0); \
 4752-      dct_long_mac(x5, sump24o, row5, rot3_1); \
 4753-      dct_long_mac(x6, sump23o, row3, rot3_2); \
 4754-      dct_long_mac(x7, sump14o, row1, rot3_3); \
 4755-      dct_bfly32o(row0,row7, x0,x7,shiftop,shift); \
 4756-      dct_bfly32o(row1,row6, x1,x6,shiftop,shift); \
 4757-      dct_bfly32o(row2,row5, x2,x5,shiftop,shift); \
 4758-      dct_bfly32o(row3,row4, x3,x4,shiftop,shift); \
 4759-   }
 4760-
 4761-   // load
 4762-   row0 = vld1q_s16(data + 0*8);
 4763-   row1 = vld1q_s16(data + 1*8);
 4764-   row2 = vld1q_s16(data + 2*8);
 4765-   row3 = vld1q_s16(data + 3*8);
 4766-   row4 = vld1q_s16(data + 4*8);
 4767-   row5 = vld1q_s16(data + 5*8);
 4768-   row6 = vld1q_s16(data + 6*8);
 4769-   row7 = vld1q_s16(data + 7*8);
 4770-
 4771-   // add DC bias
 4772-   row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0));
 4773-
 4774-   // column pass
 4775-   dct_pass(vrshrn_n_s32, 10);
 4776-
 4777-   // 16bit 8x8 transpose
 4778-   {
 4779+#define dct_bfly32o(out0, out1, a, b, shiftop, s)                              \
 4780+	{                                                                          \
 4781+		dct_wadd(sum, a, b);                                                   \
 4782+		dct_wsub(dif, a, b);                                                   \
 4783+		out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s));             \
 4784+		out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s));             \
 4785+	}
 4786+
 4787+#define dct_pass(shiftop, shift)                                               \
 4788+	{                                                                          \
 4789+		/* even part */                                                        \
 4790+		int16x8_t sum26 = vaddq_s16(row2, row6);                               \
 4791+		dct_long_mul(p1e, sum26, rot0_0);                                      \
 4792+		dct_long_mac(t2e, p1e, row6, rot0_1);                                  \
 4793+		dct_long_mac(t3e, p1e, row2, rot0_2);                                  \
 4794+		int16x8_t sum04 = vaddq_s16(row0, row4);                               \
 4795+		int16x8_t dif04 = vsubq_s16(row0, row4);                               \
 4796+		dct_widen(t0e, sum04);                                                 \
 4797+		dct_widen(t1e, dif04);                                                 \
 4798+		dct_wadd(x0, t0e, t3e);                                                \
 4799+		dct_wsub(x3, t0e, t3e);                                                \
 4800+		dct_wadd(x1, t1e, t2e);                                                \
 4801+		dct_wsub(x2, t1e, t2e);                                                \
 4802+		/* odd part */                                                         \
 4803+		int16x8_t sum15 = vaddq_s16(row1, row5);                               \
 4804+		int16x8_t sum17 = vaddq_s16(row1, row7);                               \
 4805+		int16x8_t sum35 = vaddq_s16(row3, row5);                               \
 4806+		int16x8_t sum37 = vaddq_s16(row3, row7);                               \
 4807+		int16x8_t sumodd = vaddq_s16(sum17, sum35);                            \
 4808+		dct_long_mul(p5o, sumodd, rot1_0);                                     \
 4809+		dct_long_mac(p1o, p5o, sum17, rot1_1);                                 \
 4810+		dct_long_mac(p2o, p5o, sum35, rot1_2);                                 \
 4811+		dct_long_mul(p3o, sum37, rot2_0);                                      \
 4812+		dct_long_mul(p4o, sum15, rot2_1);                                      \
 4813+		dct_wadd(sump13o, p1o, p3o);                                           \
 4814+		dct_wadd(sump24o, p2o, p4o);                                           \
 4815+		dct_wadd(sump23o, p2o, p3o);                                           \
 4816+		dct_wadd(sump14o, p1o, p4o);                                           \
 4817+		dct_long_mac(x4, sump13o, row7, rot3_0);                               \
 4818+		dct_long_mac(x5, sump24o, row5, rot3_1);                               \
 4819+		dct_long_mac(x6, sump23o, row3, rot3_2);                               \
 4820+		dct_long_mac(x7, sump14o, row1, rot3_3);                               \
 4821+		dct_bfly32o(row0, row7, x0, x7, shiftop, shift);                       \
 4822+		dct_bfly32o(row1, row6, x1, x6, shiftop, shift);                       \
 4823+		dct_bfly32o(row2, row5, x2, x5, shiftop, shift);                       \
 4824+		dct_bfly32o(row3, row4, x3, x4, shiftop, shift);                       \
 4825+	}
 4826+
 4827+	// load
 4828+	row0 = vld1q_s16(data + 0 * 8);
 4829+	row1 = vld1q_s16(data + 1 * 8);
 4830+	row2 = vld1q_s16(data + 2 * 8);
 4831+	row3 = vld1q_s16(data + 3 * 8);
 4832+	row4 = vld1q_s16(data + 4 * 8);
 4833+	row5 = vld1q_s16(data + 5 * 8);
 4834+	row6 = vld1q_s16(data + 6 * 8);
 4835+	row7 = vld1q_s16(data + 7 * 8);
 4836+
 4837+	// add DC bias
 4838+	row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0));
 4839+
 4840+	// column pass
 4841+	dct_pass(vrshrn_n_s32, 10);
 4842+
 4843+	// 16bit 8x8 transpose
 4844+	{
 4845 // these three map to a single VTRN.16, VTRN.32, and VSWP, respectively.
 4846 // whether compilers actually get this is another story, sadly.
 4847-#define dct_trn16(x, y) { int16x8x2_t t = vtrnq_s16(x, y); x = t.val[0]; y = t.val[1]; }
 4848-#define dct_trn32(x, y) { int32x4x2_t t = vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); x = vreinterpretq_s16_s32(t.val[0]); y = vreinterpretq_s16_s32(t.val[1]); }
 4849-#define dct_trn64(x, y) { int16x8_t x0 = x; int16x8_t y0 = y; x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); }
 4850-
 4851-      // pass 1
 4852-      dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6
 4853-      dct_trn16(row2, row3);
 4854-      dct_trn16(row4, row5);
 4855-      dct_trn16(row6, row7);
 4856-
 4857-      // pass 2
 4858-      dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4
 4859-      dct_trn32(row1, row3);
 4860-      dct_trn32(row4, row6);
 4861-      dct_trn32(row5, row7);
 4862-
 4863-      // pass 3
 4864-      dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0
 4865-      dct_trn64(row1, row5);
 4866-      dct_trn64(row2, row6);
 4867-      dct_trn64(row3, row7);
 4868+#define dct_trn16(x, y)                                                        \
 4869+	{                                                                          \
 4870+		int16x8x2_t t = vtrnq_s16(x, y);                                       \
 4871+		x = t.val[0];                                                          \
 4872+		y = t.val[1];                                                          \
 4873+	}
 4874+#define dct_trn32(x, y)                                                        \
 4875+	{                                                                          \
 4876+		int32x4x2_t t =                                                        \
 4877+		    vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y));     \
 4878+		x = vreinterpretq_s16_s32(t.val[0]);                                   \
 4879+		y = vreinterpretq_s16_s32(t.val[1]);                                   \
 4880+	}
 4881+#define dct_trn64(x, y)                                                        \
 4882+	{                                                                          \
 4883+		int16x8_t x0 = x;                                                      \
 4884+		int16x8_t y0 = y;                                                      \
 4885+		x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0));                  \
 4886+		y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0));                \
 4887+	}
 4888+
 4889+		// pass 1
 4890+		dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6
 4891+		dct_trn16(row2, row3);
 4892+		dct_trn16(row4, row5);
 4893+		dct_trn16(row6, row7);
 4894+
 4895+		// pass 2
 4896+		dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4
 4897+		dct_trn32(row1, row3);
 4898+		dct_trn32(row4, row6);
 4899+		dct_trn32(row5, row7);
 4900+
 4901+		// pass 3
 4902+		dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0
 4903+		dct_trn64(row1, row5);
 4904+		dct_trn64(row2, row6);
 4905+		dct_trn64(row3, row7);
 4906 
 4907 #undef dct_trn16
 4908 #undef dct_trn32
 4909 #undef dct_trn64
 4910-   }
 4911-
 4912-   // row pass
 4913-   // vrshrn_n_s32 only supports shifts up to 16, we need
 4914-   // 17. so do a non-rounding shift of 16 first then follow
 4915-   // up with a rounding shift by 1.
 4916-   dct_pass(vshrn_n_s32, 16);
 4917-
 4918-   {
 4919-      // pack and round
 4920-      uint8x8_t p0 = vqrshrun_n_s16(row0, 1);
 4921-      uint8x8_t p1 = vqrshrun_n_s16(row1, 1);
 4922-      uint8x8_t p2 = vqrshrun_n_s16(row2, 1);
 4923-      uint8x8_t p3 = vqrshrun_n_s16(row3, 1);
 4924-      uint8x8_t p4 = vqrshrun_n_s16(row4, 1);
 4925-      uint8x8_t p5 = vqrshrun_n_s16(row5, 1);
 4926-      uint8x8_t p6 = vqrshrun_n_s16(row6, 1);
 4927-      uint8x8_t p7 = vqrshrun_n_s16(row7, 1);
 4928-
 4929-      // again, these can translate into one instruction, but often don't.
 4930-#define dct_trn8_8(x, y) { uint8x8x2_t t = vtrn_u8(x, y); x = t.val[0]; y = t.val[1]; }
 4931-#define dct_trn8_16(x, y) { uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); x = vreinterpret_u8_u16(t.val[0]); y = vreinterpret_u8_u16(t.val[1]); }
 4932-#define dct_trn8_32(x, y) { uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); x = vreinterpret_u8_u32(t.val[0]); y = vreinterpret_u8_u32(t.val[1]); }
 4933-
 4934-      // sadly can't use interleaved stores here since we only write
 4935-      // 8 bytes to each scan line!
 4936-
 4937-      // 8x8 8-bit transpose pass 1
 4938-      dct_trn8_8(p0, p1);
 4939-      dct_trn8_8(p2, p3);
 4940-      dct_trn8_8(p4, p5);
 4941-      dct_trn8_8(p6, p7);
 4942-
 4943-      // pass 2
 4944-      dct_trn8_16(p0, p2);
 4945-      dct_trn8_16(p1, p3);
 4946-      dct_trn8_16(p4, p6);
 4947-      dct_trn8_16(p5, p7);
 4948-
 4949-      // pass 3
 4950-      dct_trn8_32(p0, p4);
 4951-      dct_trn8_32(p1, p5);
 4952-      dct_trn8_32(p2, p6);
 4953-      dct_trn8_32(p3, p7);
 4954-
 4955-      // store
 4956-      vst1_u8(out, p0); out += out_stride;
 4957-      vst1_u8(out, p1); out += out_stride;
 4958-      vst1_u8(out, p2); out += out_stride;
 4959-      vst1_u8(out, p3); out += out_stride;
 4960-      vst1_u8(out, p4); out += out_stride;
 4961-      vst1_u8(out, p5); out += out_stride;
 4962-      vst1_u8(out, p6); out += out_stride;
 4963-      vst1_u8(out, p7);
 4964+	}
 4965+
 4966+	// row pass
 4967+	// vrshrn_n_s32 only supports shifts up to 16, we need
 4968+	// 17. so do a non-rounding shift of 16 first then follow
 4969+	// up with a rounding shift by 1.
 4970+	dct_pass(vshrn_n_s32, 16);
 4971+
 4972+	{
 4973+		// pack and round
 4974+		uint8x8_t p0 = vqrshrun_n_s16(row0, 1);
 4975+		uint8x8_t p1 = vqrshrun_n_s16(row1, 1);
 4976+		uint8x8_t p2 = vqrshrun_n_s16(row2, 1);
 4977+		uint8x8_t p3 = vqrshrun_n_s16(row3, 1);
 4978+		uint8x8_t p4 = vqrshrun_n_s16(row4, 1);
 4979+		uint8x8_t p5 = vqrshrun_n_s16(row5, 1);
 4980+		uint8x8_t p6 = vqrshrun_n_s16(row6, 1);
 4981+		uint8x8_t p7 = vqrshrun_n_s16(row7, 1);
 4982+
 4983+		// again, these can translate into one instruction, but often don't.
 4984+#define dct_trn8_8(x, y)                                                       \
 4985+	{                                                                          \
 4986+		uint8x8x2_t t = vtrn_u8(x, y);                                         \
 4987+		x = t.val[0];                                                          \
 4988+		y = t.val[1];                                                          \
 4989+	}
 4990+#define dct_trn8_16(x, y)                                                      \
 4991+	{                                                                          \
 4992+		uint16x4x2_t t =                                                       \
 4993+		    vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y));          \
 4994+		x = vreinterpret_u8_u16(t.val[0]);                                     \
 4995+		y = vreinterpret_u8_u16(t.val[1]);                                     \
 4996+	}
 4997+#define dct_trn8_32(x, y)                                                      \
 4998+	{                                                                          \
 4999+		uint32x2x2_t t =                                                       \
 5000+		    vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y));          \
 5001+		x = vreinterpret_u8_u32(t.val[0]);                                     \
 5002+		y = vreinterpret_u8_u32(t.val[1]);                                     \
 5003+	}
 5004+
 5005+		// sadly can't use interleaved stores here since we only write
 5006+		// 8 bytes to each scan line!
 5007+
 5008+		// 8x8 8-bit transpose pass 1
 5009+		dct_trn8_8(p0, p1);
 5010+		dct_trn8_8(p2, p3);
 5011+		dct_trn8_8(p4, p5);
 5012+		dct_trn8_8(p6, p7);
 5013+
 5014+		// pass 2
 5015+		dct_trn8_16(p0, p2);
 5016+		dct_trn8_16(p1, p3);
 5017+		dct_trn8_16(p4, p6);
 5018+		dct_trn8_16(p5, p7);
 5019+
 5020+		// pass 3
 5021+		dct_trn8_32(p0, p4);
 5022+		dct_trn8_32(p1, p5);
 5023+		dct_trn8_32(p2, p6);
 5024+		dct_trn8_32(p3, p7);
 5025+
 5026+		// store
 5027+		vst1_u8(out, p0);
 5028+		out += out_stride;
 5029+		vst1_u8(out, p1);
 5030+		out += out_stride;
 5031+		vst1_u8(out, p2);
 5032+		out += out_stride;
 5033+		vst1_u8(out, p3);
 5034+		out += out_stride;
 5035+		vst1_u8(out, p4);
 5036+		out += out_stride;
 5037+		vst1_u8(out, p5);
 5038+		out += out_stride;
 5039+		vst1_u8(out, p6);
 5040+		out += out_stride;
 5041+		vst1_u8(out, p7);
 5042 
 5043 #undef dct_trn8_8
 5044 #undef dct_trn8_16
 5045 #undef dct_trn8_32
 5046-   }
 5047+	}
 5048 
 5049 #undef dct_long_mul
 5050 #undef dct_long_mac
 5051@@ -2912,1169 +3552,1498 @@ static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64])
 5052 
 5053 #endif // STBI_NEON
 5054 
 5055-#define STBI__MARKER_none  0xff
 5056+#define STBI__MARKER_none 0xff
 5057 // if there's a pending marker from the entropy stream, return that
 5058 // otherwise, fetch from the stream and get a marker. if there's no
 5059 // marker, return 0xff, which is never a valid marker value
 5060-static stbi_uc stbi__get_marker(stbi__jpeg *j)
 5061-{
 5062-   stbi_uc x;
 5063-   if (j->marker != STBI__MARKER_none) { x = j->marker; j->marker = STBI__MARKER_none; return x; }
 5064-   x = stbi__get8(j->s);
 5065-   if (x != 0xff) return STBI__MARKER_none;
 5066-   while (x == 0xff)
 5067-      x = stbi__get8(j->s); // consume repeated 0xff fill bytes
 5068-   return x;
 5069+static stbi_uc
 5070+stbi__get_marker(stbi__jpeg *j)
 5071+{
 5072+	stbi_uc x;
 5073+	if (j->marker != STBI__MARKER_none) {
 5074+		x = j->marker;
 5075+		j->marker = STBI__MARKER_none;
 5076+		return x;
 5077+	}
 5078+	x = stbi__get8(j->s);
 5079+	if (x != 0xff) {
 5080+		return STBI__MARKER_none;
 5081+	}
 5082+	while (x == 0xff) {
 5083+		x = stbi__get8(j->s); // consume repeated 0xff fill bytes
 5084+	}
 5085+	return x;
 5086 }
 5087 
 5088 // in each scan, we'll have scan_n components, and the order
 5089 // of the components is specified by order[]
 5090-#define STBI__RESTART(x)     ((x) >= 0xd0 && (x) <= 0xd7)
 5091+#define STBI__RESTART(x) ((x) >= 0xd0 && (x) <= 0xd7)
 5092 
 5093 // after a restart interval, stbi__jpeg_reset the entropy decoder and
 5094 // the dc prediction
 5095-static void stbi__jpeg_reset(stbi__jpeg *j)
 5096-{
 5097-   j->code_bits = 0;
 5098-   j->code_buffer = 0;
 5099-   j->nomore = 0;
 5100-   j->img_comp[0].dc_pred = j->img_comp[1].dc_pred = j->img_comp[2].dc_pred = j->img_comp[3].dc_pred = 0;
 5101-   j->marker = STBI__MARKER_none;
 5102-   j->todo = j->restart_interval ? j->restart_interval : 0x7fffffff;
 5103-   j->eob_run = 0;
 5104-   // no more than 1<<31 MCUs if no restart_interal? that's plenty safe,
 5105-   // since we don't even allow 1<<30 pixels
 5106-}
 5107-
 5108-static int stbi__parse_entropy_coded_data(stbi__jpeg *z)
 5109-{
 5110-   stbi__jpeg_reset(z);
 5111-   if (!z->progressive) {
 5112-      if (z->scan_n == 1) {
 5113-         int i,j;
 5114-         STBI_SIMD_ALIGN(short, data[64]);
 5115-         int n = z->order[0];
 5116-         // non-interleaved data, we just need to process one block at a time,
 5117-         // in trivial scanline order
 5118-         // number of blocks to do just depends on how many actual "pixels" this
 5119-         // component has, independent of interleaved MCU blocking and such
 5120-         int w = (z->img_comp[n].x+7) >> 3;
 5121-         int h = (z->img_comp[n].y+7) >> 3;
 5122-         for (j=0; j < h; ++j) {
 5123-            for (i=0; i < w; ++i) {
 5124-               int ha = z->img_comp[n].ha;
 5125-               if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0;
 5126-               z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data);
 5127-               // every data block is an MCU, so countdown the restart interval
 5128-               if (--z->todo <= 0) {
 5129-                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
 5130-                  // if it's NOT a restart, then just bail, so we get corrupt data
 5131-                  // rather than no data
 5132-                  if (!STBI__RESTART(z->marker)) return 1;
 5133-                  stbi__jpeg_reset(z);
 5134-               }
 5135-            }
 5136-         }
 5137-         return 1;
 5138-      } else { // interleaved
 5139-         int i,j,k,x,y;
 5140-         STBI_SIMD_ALIGN(short, data[64]);
 5141-         for (j=0; j < z->img_mcu_y; ++j) {
 5142-            for (i=0; i < z->img_mcu_x; ++i) {
 5143-               // scan an interleaved mcu... process scan_n components in order
 5144-               for (k=0; k < z->scan_n; ++k) {
 5145-                  int n = z->order[k];
 5146-                  // scan out an mcu's worth of this component; that's just determined
 5147-                  // by the basic H and V specified for the component
 5148-                  for (y=0; y < z->img_comp[n].v; ++y) {
 5149-                     for (x=0; x < z->img_comp[n].h; ++x) {
 5150-                        int x2 = (i*z->img_comp[n].h + x)*8;
 5151-                        int y2 = (j*z->img_comp[n].v + y)*8;
 5152-                        int ha = z->img_comp[n].ha;
 5153-                        if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0;
 5154-                        z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*y2+x2, z->img_comp[n].w2, data);
 5155-                     }
 5156-                  }
 5157-               }
 5158-               // after all interleaved components, that's an interleaved MCU,
 5159-               // so now count down the restart interval
 5160-               if (--z->todo <= 0) {
 5161-                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
 5162-                  if (!STBI__RESTART(z->marker)) return 1;
 5163-                  stbi__jpeg_reset(z);
 5164-               }
 5165-            }
 5166-         }
 5167-         return 1;
 5168-      }
 5169-   } else {
 5170-      if (z->scan_n == 1) {
 5171-         int i,j;
 5172-         int n = z->order[0];
 5173-         // non-interleaved data, we just need to process one block at a time,
 5174-         // in trivial scanline order
 5175-         // number of blocks to do just depends on how many actual "pixels" this
 5176-         // component has, independent of interleaved MCU blocking and such
 5177-         int w = (z->img_comp[n].x+7) >> 3;
 5178-         int h = (z->img_comp[n].y+7) >> 3;
 5179-         for (j=0; j < h; ++j) {
 5180-            for (i=0; i < w; ++i) {
 5181-               short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
 5182-               if (z->spec_start == 0) {
 5183-                  if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
 5184-                     return 0;
 5185-               } else {
 5186-                  int ha = z->img_comp[n].ha;
 5187-                  if (!stbi__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha]))
 5188-                     return 0;
 5189-               }
 5190-               // every data block is an MCU, so countdown the restart interval
 5191-               if (--z->todo <= 0) {
 5192-                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
 5193-                  if (!STBI__RESTART(z->marker)) return 1;
 5194-                  stbi__jpeg_reset(z);
 5195-               }
 5196-            }
 5197-         }
 5198-         return 1;
 5199-      } else { // interleaved
 5200-         int i,j,k,x,y;
 5201-         for (j=0; j < z->img_mcu_y; ++j) {
 5202-            for (i=0; i < z->img_mcu_x; ++i) {
 5203-               // scan an interleaved mcu... process scan_n components in order
 5204-               for (k=0; k < z->scan_n; ++k) {
 5205-                  int n = z->order[k];
 5206-                  // scan out an mcu's worth of this component; that's just determined
 5207-                  // by the basic H and V specified for the component
 5208-                  for (y=0; y < z->img_comp[n].v; ++y) {
 5209-                     for (x=0; x < z->img_comp[n].h; ++x) {
 5210-                        int x2 = (i*z->img_comp[n].h + x);
 5211-                        int y2 = (j*z->img_comp[n].v + y);
 5212-                        short *data = z->img_comp[n].coeff + 64 * (x2 + y2 * z->img_comp[n].coeff_w);
 5213-                        if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
 5214-                           return 0;
 5215-                     }
 5216-                  }
 5217-               }
 5218-               // after all interleaved components, that's an interleaved MCU,
 5219-               // so now count down the restart interval
 5220-               if (--z->todo <= 0) {
 5221-                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
 5222-                  if (!STBI__RESTART(z->marker)) return 1;
 5223-                  stbi__jpeg_reset(z);
 5224-               }
 5225-            }
 5226-         }
 5227-         return 1;
 5228-      }
 5229-   }
 5230-}
 5231-
 5232-static void stbi__jpeg_dequantize(short *data, stbi__uint16 *dequant)
 5233-{
 5234-   int i;
 5235-   for (i=0; i < 64; ++i)
 5236-      data[i] *= dequant[i];
 5237-}
 5238-
 5239-static void stbi__jpeg_finish(stbi__jpeg *z)
 5240-{
 5241-   if (z->progressive) {
 5242-      // dequantize and idct the data
 5243-      int i,j,n;
 5244-      for (n=0; n < z->s->img_n; ++n) {
 5245-         int w = (z->img_comp[n].x+7) >> 3;
 5246-         int h = (z->img_comp[n].y+7) >> 3;
 5247-         for (j=0; j < h; ++j) {
 5248-            for (i=0; i < w; ++i) {
 5249-               short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
 5250-               stbi__jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]);
 5251-               z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data);
 5252-            }
 5253-         }
 5254-      }
 5255-   }
 5256-}
 5257-
 5258-static int stbi__process_marker(stbi__jpeg *z, int m)
 5259-{
 5260-   int L;
 5261-   switch (m) {
 5262-      case STBI__MARKER_none: // no marker found
 5263-         return stbi__err("expected marker","Corrupt JPEG");
 5264-
 5265-      case 0xDD: // DRI - specify restart interval
 5266-         if (stbi__get16be(z->s) != 4) return stbi__err("bad DRI len","Corrupt JPEG");
 5267-         z->restart_interval = stbi__get16be(z->s);
 5268-         return 1;
 5269-
 5270-      case 0xDB: // DQT - define quantization table
 5271-         L = stbi__get16be(z->s)-2;
 5272-         while (L > 0) {
 5273-            int q = stbi__get8(z->s);
 5274-            int p = q >> 4, sixteen = (p != 0);
 5275-            int t = q & 15,i;
 5276-            if (p != 0 && p != 1) return stbi__err("bad DQT type","Corrupt JPEG");
 5277-            if (t > 3) return stbi__err("bad DQT table","Corrupt JPEG");
 5278-
 5279-            for (i=0; i < 64; ++i)
 5280-               z->dequant[t][stbi__jpeg_dezigzag[i]] = (stbi__uint16)(sixteen ? stbi__get16be(z->s) : stbi__get8(z->s));
 5281-            L -= (sixteen ? 129 : 65);
 5282-         }
 5283-         return L==0;
 5284-
 5285-      case 0xC4: // DHT - define huffman table
 5286-         L = stbi__get16be(z->s)-2;
 5287-         while (L > 0) {
 5288-            stbi_uc *v;
 5289-            int sizes[16],i,n=0;
 5290-            int q = stbi__get8(z->s);
 5291-            int tc = q >> 4;
 5292-            int th = q & 15;
 5293-            if (tc > 1 || th > 3) return stbi__err("bad DHT header","Corrupt JPEG");
 5294-            for (i=0; i < 16; ++i) {
 5295-               sizes[i] = stbi__get8(z->s);
 5296-               n += sizes[i];
 5297-            }
 5298-            if(n > 256) return stbi__err("bad DHT header","Corrupt JPEG"); // Loop over i < n would write past end of values!
 5299-            L -= 17;
 5300-            if (tc == 0) {
 5301-               if (!stbi__build_huffman(z->huff_dc+th, sizes)) return 0;
 5302-               v = z->huff_dc[th].values;
 5303-            } else {
 5304-               if (!stbi__build_huffman(z->huff_ac+th, sizes)) return 0;
 5305-               v = z->huff_ac[th].values;
 5306-            }
 5307-            for (i=0; i < n; ++i)
 5308-               v[i] = stbi__get8(z->s);
 5309-            if (tc != 0)
 5310-               stbi__build_fast_ac(z->fast_ac[th], z->huff_ac + th);
 5311-            L -= n;
 5312-         }
 5313-         return L==0;
 5314-   }
 5315-
 5316-   // check for comment block or APP blocks
 5317-   if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) {
 5318-      L = stbi__get16be(z->s);
 5319-      if (L < 2) {
 5320-         if (m == 0xFE)
 5321-            return stbi__err("bad COM len","Corrupt JPEG");
 5322-         else
 5323-            return stbi__err("bad APP len","Corrupt JPEG");
 5324-      }
 5325-      L -= 2;
 5326-
 5327-      if (m == 0xE0 && L >= 5) { // JFIF APP0 segment
 5328-         static const unsigned char tag[5] = {'J','F','I','F','\0'};
 5329-         int ok = 1;
 5330-         int i;
 5331-         for (i=0; i < 5; ++i)
 5332-            if (stbi__get8(z->s) != tag[i])
 5333-               ok = 0;
 5334-         L -= 5;
 5335-         if (ok)
 5336-            z->jfif = 1;
 5337-      } else if (m == 0xEE && L >= 12) { // Adobe APP14 segment
 5338-         static const unsigned char tag[6] = {'A','d','o','b','e','\0'};
 5339-         int ok = 1;
 5340-         int i;
 5341-         for (i=0; i < 6; ++i)
 5342-            if (stbi__get8(z->s) != tag[i])
 5343-               ok = 0;
 5344-         L -= 6;
 5345-         if (ok) {
 5346-            stbi__get8(z->s); // version
 5347-            stbi__get16be(z->s); // flags0
 5348-            stbi__get16be(z->s); // flags1
 5349-            z->app14_color_transform = stbi__get8(z->s); // color transform
 5350-            L -= 6;
 5351-         }
 5352-      }
 5353-
 5354-      stbi__skip(z->s, L);
 5355-      return 1;
 5356-   }
 5357-
 5358-   return stbi__err("unknown marker","Corrupt JPEG");
 5359+static void
 5360+stbi__jpeg_reset(stbi__jpeg *j)
 5361+{
 5362+	j->code_bits = 0;
 5363+	j->code_buffer = 0;
 5364+	j->nomore = 0;
 5365+	j->img_comp[0].dc_pred = j->img_comp[1].dc_pred = j->img_comp[2].dc_pred =
 5366+	    j->img_comp[3].dc_pred = 0;
 5367+	j->marker = STBI__MARKER_none;
 5368+	j->todo = j->restart_interval ? j->restart_interval : 0x7fffffff;
 5369+	j->eob_run = 0;
 5370+	// no more than 1<<31 MCUs if no restart_interal? that's plenty safe,
 5371+	// since we don't even allow 1<<30 pixels
 5372+}
 5373+
 5374+static int
 5375+stbi__parse_entropy_coded_data(stbi__jpeg *z)
 5376+{
 5377+	stbi__jpeg_reset(z);
 5378+	if (!z->progressive) {
 5379+		if (z->scan_n == 1) {
 5380+			int i, j;
 5381+			STBI_SIMD_ALIGN(short, data[64]);
 5382+			int n = z->order[0];
 5383+			// non-interleaved data, we just need to process one block at a
 5384+			// time, in trivial scanline order number of blocks to do just
 5385+			// depends on how many actual "pixels" this component has,
 5386+			// independent of interleaved MCU blocking and such
 5387+			int w = (z->img_comp[n].x + 7) >> 3;
 5388+			int h = (z->img_comp[n].y + 7) >> 3;
 5389+			for (j = 0; j < h; ++j) {
 5390+				for (i = 0; i < w; ++i) {
 5391+					int ha = z->img_comp[n].ha;
 5392+					if (!stbi__jpeg_decode_block(
 5393+					        z, data, z->huff_dc + z->img_comp[n].hd,
 5394+					        z->huff_ac + ha, z->fast_ac[ha], n,
 5395+					        z->dequant[z->img_comp[n].tq])) {
 5396+						return 0;
 5397+					}
 5398+					z->idct_block_kernel(z->img_comp[n].data +
 5399+					                         z->img_comp[n].w2 * j * 8 + i * 8,
 5400+					                     z->img_comp[n].w2, data);
 5401+					// every data block is an MCU, so countdown the restart
 5402+					// interval
 5403+					if (--z->todo <= 0) {
 5404+						if (z->code_bits < 24) {
 5405+							stbi__grow_buffer_unsafe(z);
 5406+						}
 5407+						// if it's NOT a restart, then just bail, so we get
 5408+						// corrupt data rather than no data
 5409+						if (!STBI__RESTART(z->marker)) {
 5410+							return 1;
 5411+						}
 5412+						stbi__jpeg_reset(z);
 5413+					}
 5414+				}
 5415+			}
 5416+			return 1;
 5417+		} else { // interleaved
 5418+			int i, j, k, x, y;
 5419+			STBI_SIMD_ALIGN(short, data[64]);
 5420+			for (j = 0; j < z->img_mcu_y; ++j) {
 5421+				for (i = 0; i < z->img_mcu_x; ++i) {
 5422+					// scan an interleaved mcu... process scan_n components in
 5423+					// order
 5424+					for (k = 0; k < z->scan_n; ++k) {
 5425+						int n = z->order[k];
 5426+						// scan out an mcu's worth of this component; that's
 5427+						// just determined by the basic H and V specified for
 5428+						// the component
 5429+						for (y = 0; y < z->img_comp[n].v; ++y) {
 5430+							for (x = 0; x < z->img_comp[n].h; ++x) {
 5431+								int x2 = (i * z->img_comp[n].h + x) * 8;
 5432+								int y2 = (j * z->img_comp[n].v + y) * 8;
 5433+								int ha = z->img_comp[n].ha;
 5434+								if (!stbi__jpeg_decode_block(
 5435+								        z, data, z->huff_dc + z->img_comp[n].hd,
 5436+								        z->huff_ac + ha, z->fast_ac[ha], n,
 5437+								        z->dequant[z->img_comp[n].tq])) {
 5438+									return 0;
 5439+								}
 5440+								z->idct_block_kernel(
 5441+								    z->img_comp[n].data +
 5442+								        z->img_comp[n].w2 * y2 + x2,
 5443+								    z->img_comp[n].w2, data);
 5444+							}
 5445+						}
 5446+					}
 5447+					// after all interleaved components, that's an interleaved
 5448+					// MCU, so now count down the restart interval
 5449+					if (--z->todo <= 0) {
 5450+						if (z->code_bits < 24) {
 5451+							stbi__grow_buffer_unsafe(z);
 5452+						}
 5453+						if (!STBI__RESTART(z->marker)) {
 5454+							return 1;
 5455+						}
 5456+						stbi__jpeg_reset(z);
 5457+					}
 5458+				}
 5459+			}
 5460+			return 1;
 5461+		}
 5462+	} else {
 5463+		if (z->scan_n == 1) {
 5464+			int i, j;
 5465+			int n = z->order[0];
 5466+			// non-interleaved data, we just need to process one block at a
 5467+			// time, in trivial scanline order number of blocks to do just
 5468+			// depends on how many actual "pixels" this component has,
 5469+			// independent of interleaved MCU blocking and such
 5470+			int w = (z->img_comp[n].x + 7) >> 3;
 5471+			int h = (z->img_comp[n].y + 7) >> 3;
 5472+			for (j = 0; j < h; ++j) {
 5473+				for (i = 0; i < w; ++i) {
 5474+					short *data = z->img_comp[n].coeff +
 5475+					              64 * (i + j * z->img_comp[n].coeff_w);
 5476+					if (z->spec_start == 0) {
 5477+						if (!stbi__jpeg_decode_block_prog_dc(
 5478+						        z, data, &z->huff_dc[z->img_comp[n].hd], n)) {
 5479+							return 0;
 5480+						}
 5481+					} else {
 5482+						int ha = z->img_comp[n].ha;
 5483+						if (!stbi__jpeg_decode_block_prog_ac(
 5484+						        z, data, &z->huff_ac[ha], z->fast_ac[ha])) {
 5485+							return 0;
 5486+						}
 5487+					}
 5488+					// every data block is an MCU, so countdown the restart
 5489+					// interval
 5490+					if (--z->todo <= 0) {
 5491+						if (z->code_bits < 24) {
 5492+							stbi__grow_buffer_unsafe(z);
 5493+						}
 5494+						if (!STBI__RESTART(z->marker)) {
 5495+							return 1;
 5496+						}
 5497+						stbi__jpeg_reset(z);
 5498+					}
 5499+				}
 5500+			}
 5501+			return 1;
 5502+		} else { // interleaved
 5503+			int i, j, k, x, y;
 5504+			for (j = 0; j < z->img_mcu_y; ++j) {
 5505+				for (i = 0; i < z->img_mcu_x; ++i) {
 5506+					// scan an interleaved mcu... process scan_n components in
 5507+					// order
 5508+					for (k = 0; k < z->scan_n; ++k) {
 5509+						int n = z->order[k];
 5510+						// scan out an mcu's worth of this component; that's
 5511+						// just determined by the basic H and V specified for
 5512+						// the component
 5513+						for (y = 0; y < z->img_comp[n].v; ++y) {
 5514+							for (x = 0; x < z->img_comp[n].h; ++x) {
 5515+								int x2 = (i * z->img_comp[n].h + x);
 5516+								int y2 = (j * z->img_comp[n].v + y);
 5517+								short *data =
 5518+								    z->img_comp[n].coeff +
 5519+								    64 * (x2 + y2 * z->img_comp[n].coeff_w);
 5520+								if (!stbi__jpeg_decode_block_prog_dc(
 5521+								        z, data, &z->huff_dc[z->img_comp[n].hd],
 5522+								        n)) {
 5523+									return 0;
 5524+								}
 5525+							}
 5526+						}
 5527+					}
 5528+					// after all interleaved components, that's an interleaved
 5529+					// MCU, so now count down the restart interval
 5530+					if (--z->todo <= 0) {
 5531+						if (z->code_bits < 24) {
 5532+							stbi__grow_buffer_unsafe(z);
 5533+						}
 5534+						if (!STBI__RESTART(z->marker)) {
 5535+							return 1;
 5536+						}
 5537+						stbi__jpeg_reset(z);
 5538+					}
 5539+				}
 5540+			}
 5541+			return 1;
 5542+		}
 5543+	}
 5544+}
 5545+
 5546+static void
 5547+stbi__jpeg_dequantize(short *data, stbi__uint16 *dequant)
 5548+{
 5549+	int i;
 5550+	for (i = 0; i < 64; ++i) {
 5551+		data[i] *= dequant[i];
 5552+	}
 5553+}
 5554+
 5555+static void
 5556+stbi__jpeg_finish(stbi__jpeg *z)
 5557+{
 5558+	if (z->progressive) {
 5559+		// dequantize and idct the data
 5560+		int i, j, n;
 5561+		for (n = 0; n < z->s->img_n; ++n) {
 5562+			int w = (z->img_comp[n].x + 7) >> 3;
 5563+			int h = (z->img_comp[n].y + 7) >> 3;
 5564+			for (j = 0; j < h; ++j) {
 5565+				for (i = 0; i < w; ++i) {
 5566+					short *data = z->img_comp[n].coeff +
 5567+					              64 * (i + j * z->img_comp[n].coeff_w);
 5568+					stbi__jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]);
 5569+					z->idct_block_kernel(z->img_comp[n].data +
 5570+					                         z->img_comp[n].w2 * j * 8 + i * 8,
 5571+					                     z->img_comp[n].w2, data);
 5572+				}
 5573+			}
 5574+		}
 5575+	}
 5576+}
 5577+
 5578+static int
 5579+stbi__process_marker(stbi__jpeg *z, int m)
 5580+{
 5581+	int L;
 5582+	switch (m) {
 5583+	case STBI__MARKER_none: // no marker found
 5584+		return stbi__err("expected marker", "Corrupt JPEG");
 5585+
 5586+	case 0xDD: // DRI - specify restart interval
 5587+		if (stbi__get16be(z->s) != 4) {
 5588+			return stbi__err("bad DRI len", "Corrupt JPEG");
 5589+		}
 5590+		z->restart_interval = stbi__get16be(z->s);
 5591+		return 1;
 5592+
 5593+	case 0xDB: // DQT - define quantization table
 5594+		L = stbi__get16be(z->s) - 2;
 5595+		while (L > 0) {
 5596+			int q = stbi__get8(z->s);
 5597+			int p = q >> 4, sixteen = (p != 0);
 5598+			int t = q & 15, i;
 5599+			if (p != 0 && p != 1) {
 5600+				return stbi__err("bad DQT type", "Corrupt JPEG");
 5601+			}
 5602+			if (t > 3) {
 5603+				return stbi__err("bad DQT table", "Corrupt JPEG");
 5604+			}
 5605+
 5606+			for (i = 0; i < 64; ++i) {
 5607+				z->dequant[t][stbi__jpeg_dezigzag[i]] =
 5608+				    (stbi__uint16)(sixteen ? stbi__get16be(z->s)
 5609+				                           : stbi__get8(z->s));
 5610+			}
 5611+			L -= (sixteen ? 129 : 65);
 5612+		}
 5613+		return L == 0;
 5614+
 5615+	case 0xC4: // DHT - define huffman table
 5616+		L = stbi__get16be(z->s) - 2;
 5617+		while (L > 0) {
 5618+			stbi_uc *v;
 5619+			int sizes[16], i, n = 0;
 5620+			int q = stbi__get8(z->s);
 5621+			int tc = q >> 4;
 5622+			int th = q & 15;
 5623+			if (tc > 1 || th > 3) {
 5624+				return stbi__err("bad DHT header", "Corrupt JPEG");
 5625+			}
 5626+			for (i = 0; i < 16; ++i) {
 5627+				sizes[i] = stbi__get8(z->s);
 5628+				n += sizes[i];
 5629+			}
 5630+			if (n > 256) {
 5631+				return stbi__err("bad DHT header",
 5632+				                 "Corrupt JPEG"); // Loop over i < n would write
 5633+				                                  // past end of values!
 5634+			}
 5635+			L -= 17;
 5636+			if (tc == 0) {
 5637+				if (!stbi__build_huffman(z->huff_dc + th, sizes)) {
 5638+					return 0;
 5639+				}
 5640+				v = z->huff_dc[th].values;
 5641+			} else {
 5642+				if (!stbi__build_huffman(z->huff_ac + th, sizes)) {
 5643+					return 0;
 5644+				}
 5645+				v = z->huff_ac[th].values;
 5646+			}
 5647+			for (i = 0; i < n; ++i) {
 5648+				v[i] = stbi__get8(z->s);
 5649+			}
 5650+			if (tc != 0) {
 5651+				stbi__build_fast_ac(z->fast_ac[th], z->huff_ac + th);
 5652+			}
 5653+			L -= n;
 5654+		}
 5655+		return L == 0;
 5656+	}
 5657+
 5658+	// check for comment block or APP blocks
 5659+	if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) {
 5660+		L = stbi__get16be(z->s);
 5661+		if (L < 2) {
 5662+			if (m == 0xFE) {
 5663+				return stbi__err("bad COM len", "Corrupt JPEG");
 5664+			} else {
 5665+				return stbi__err("bad APP len", "Corrupt JPEG");
 5666+			}
 5667+		}
 5668+		L -= 2;
 5669+
 5670+		if (m == 0xE0 && L >= 5) { // JFIF APP0 segment
 5671+			static const unsigned char tag[5] = {'J', 'F', 'I', 'F', '\0'};
 5672+			int ok = 1;
 5673+			int i;
 5674+			for (i = 0; i < 5; ++i) {
 5675+				if (stbi__get8(z->s) != tag[i]) {
 5676+					ok = 0;
 5677+				}
 5678+			}
 5679+			L -= 5;
 5680+			if (ok) {
 5681+				z->jfif = 1;
 5682+			}
 5683+		} else if (m == 0xEE && L >= 12) { // Adobe APP14 segment
 5684+			static const unsigned char tag[6] = {'A', 'd', 'o', 'b', 'e', '\0'};
 5685+			int ok = 1;
 5686+			int i;
 5687+			for (i = 0; i < 6; ++i) {
 5688+				if (stbi__get8(z->s) != tag[i]) {
 5689+					ok = 0;
 5690+				}
 5691+			}
 5692+			L -= 6;
 5693+			if (ok) {
 5694+				stbi__get8(z->s);                            // version
 5695+				stbi__get16be(z->s);                         // flags0
 5696+				stbi__get16be(z->s);                         // flags1
 5697+				z->app14_color_transform = stbi__get8(z->s); // color transform
 5698+				L -= 6;
 5699+			}
 5700+		}
 5701+
 5702+		stbi__skip(z->s, L);
 5703+		return 1;
 5704+	}
 5705+
 5706+	return stbi__err("unknown marker", "Corrupt JPEG");
 5707 }
 5708 
 5709 // after we see SOS
 5710-static int stbi__process_scan_header(stbi__jpeg *z)
 5711-{
 5712-   int i;
 5713-   int Ls = stbi__get16be(z->s);
 5714-   z->scan_n = stbi__get8(z->s);
 5715-   if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int) z->s->img_n) return stbi__err("bad SOS component count","Corrupt JPEG");
 5716-   if (Ls != 6+2*z->scan_n) return stbi__err("bad SOS len","Corrupt JPEG");
 5717-   for (i=0; i < z->scan_n; ++i) {
 5718-      int id = stbi__get8(z->s), which;
 5719-      int q = stbi__get8(z->s);
 5720-      for (which = 0; which < z->s->img_n; ++which)
 5721-         if (z->img_comp[which].id == id)
 5722-            break;
 5723-      if (which == z->s->img_n) return 0; // no match
 5724-      z->img_comp[which].hd = q >> 4;   if (z->img_comp[which].hd > 3) return stbi__err("bad DC huff","Corrupt JPEG");
 5725-      z->img_comp[which].ha = q & 15;   if (z->img_comp[which].ha > 3) return stbi__err("bad AC huff","Corrupt JPEG");
 5726-      z->order[i] = which;
 5727-   }
 5728-
 5729-   {
 5730-      int aa;
 5731-      z->spec_start = stbi__get8(z->s);
 5732-      z->spec_end   = stbi__get8(z->s); // should be 63, but might be 0
 5733-      aa = stbi__get8(z->s);
 5734-      z->succ_high = (aa >> 4);
 5735-      z->succ_low  = (aa & 15);
 5736-      if (z->progressive) {
 5737-         if (z->spec_start > 63 || z->spec_end > 63  || z->spec_start > z->spec_end || z->succ_high > 13 || z->succ_low > 13)
 5738-            return stbi__err("bad SOS", "Corrupt JPEG");
 5739-      } else {
 5740-         if (z->spec_start != 0) return stbi__err("bad SOS","Corrupt JPEG");
 5741-         if (z->succ_high != 0 || z->succ_low != 0) return stbi__err("bad SOS","Corrupt JPEG");
 5742-         z->spec_end = 63;
 5743-      }
 5744-   }
 5745-
 5746-   return 1;
 5747-}
 5748-
 5749-static int stbi__free_jpeg_components(stbi__jpeg *z, int ncomp, int why)
 5750-{
 5751-   int i;
 5752-   for (i=0; i < ncomp; ++i) {
 5753-      if (z->img_comp[i].raw_data) {
 5754-         STBI_FREE(z->img_comp[i].raw_data);
 5755-         z->img_comp[i].raw_data = NULL;
 5756-         z->img_comp[i].data = NULL;
 5757-      }
 5758-      if (z->img_comp[i].raw_coeff) {
 5759-         STBI_FREE(z->img_comp[i].raw_coeff);
 5760-         z->img_comp[i].raw_coeff = 0;
 5761-         z->img_comp[i].coeff = 0;
 5762-      }
 5763-      if (z->img_comp[i].linebuf) {
 5764-         STBI_FREE(z->img_comp[i].linebuf);
 5765-         z->img_comp[i].linebuf = NULL;
 5766-      }
 5767-   }
 5768-   return why;
 5769-}
 5770-
 5771-static int stbi__process_frame_header(stbi__jpeg *z, int scan)
 5772-{
 5773-   stbi__context *s = z->s;
 5774-   int Lf,p,i,q, h_max=1,v_max=1,c;
 5775-   Lf = stbi__get16be(s);         if (Lf < 11) return stbi__err("bad SOF len","Corrupt JPEG"); // JPEG
 5776-   p  = stbi__get8(s);            if (p != 8) return stbi__err("only 8-bit","JPEG format not supported: 8-bit only"); // JPEG baseline
 5777-   s->img_y = stbi__get16be(s);   if (s->img_y == 0) return stbi__err("no header height", "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG
 5778-   s->img_x = stbi__get16be(s);   if (s->img_x == 0) return stbi__err("0 width","Corrupt JPEG"); // JPEG requires
 5779-   if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
 5780-   if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
 5781-   c = stbi__get8(s);
 5782-   if (c != 3 && c != 1 && c != 4) return stbi__err("bad component count","Corrupt JPEG");
 5783-   s->img_n = c;
 5784-   for (i=0; i < c; ++i) {
 5785-      z->img_comp[i].data = NULL;
 5786-      z->img_comp[i].linebuf = NULL;
 5787-   }
 5788-
 5789-   if (Lf != 8+3*s->img_n) return stbi__err("bad SOF len","Corrupt JPEG");
 5790-
 5791-   z->rgb = 0;
 5792-   for (i=0; i < s->img_n; ++i) {
 5793-      static const unsigned char rgb[3] = { 'R', 'G', 'B' };
 5794-      z->img_comp[i].id = stbi__get8(s);
 5795-      if (s->img_n == 3 && z->img_comp[i].id == rgb[i])
 5796-         ++z->rgb;
 5797-      q = stbi__get8(s);
 5798-      z->img_comp[i].h = (q >> 4);  if (!z->img_comp[i].h || z->img_comp[i].h > 4) return stbi__err("bad H","Corrupt JPEG");
 5799-      z->img_comp[i].v = q & 15;    if (!z->img_comp[i].v || z->img_comp[i].v > 4) return stbi__err("bad V","Corrupt JPEG");
 5800-      z->img_comp[i].tq = stbi__get8(s);  if (z->img_comp[i].tq > 3) return stbi__err("bad TQ","Corrupt JPEG");
 5801-   }
 5802-
 5803-   if (scan != STBI__SCAN_load) return 1;
 5804-
 5805-   if (!stbi__mad3sizes_valid(s->img_x, s->img_y, s->img_n, 0)) return stbi__err("too large", "Image too large to decode");
 5806-
 5807-   for (i=0; i < s->img_n; ++i) {
 5808-      if (z->img_comp[i].h > h_max) h_max = z->img_comp[i].h;
 5809-      if (z->img_comp[i].v > v_max) v_max = z->img_comp[i].v;
 5810-   }
 5811-
 5812-   // check that plane subsampling factors are integer ratios; our resamplers can't deal with fractional ratios
 5813-   // and I've never seen a non-corrupted JPEG file actually use them
 5814-   for (i=0; i < s->img_n; ++i) {
 5815-      if (h_max % z->img_comp[i].h != 0) return stbi__err("bad H","Corrupt JPEG");
 5816-      if (v_max % z->img_comp[i].v != 0) return stbi__err("bad V","Corrupt JPEG");
 5817-   }
 5818-
 5819-   // compute interleaved mcu info
 5820-   z->img_h_max = h_max;
 5821-   z->img_v_max = v_max;
 5822-   z->img_mcu_w = h_max * 8;
 5823-   z->img_mcu_h = v_max * 8;
 5824-   // these sizes can't be more than 17 bits
 5825-   z->img_mcu_x = (s->img_x + z->img_mcu_w-1) / z->img_mcu_w;
 5826-   z->img_mcu_y = (s->img_y + z->img_mcu_h-1) / z->img_mcu_h;
 5827-
 5828-   for (i=0; i < s->img_n; ++i) {
 5829-      // number of effective pixels (e.g. for non-interleaved MCU)
 5830-      z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max-1) / h_max;
 5831-      z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max-1) / v_max;
 5832-      // to simplify generation, we'll allocate enough memory to decode
 5833-      // the bogus oversized data from using interleaved MCUs and their
 5834-      // big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
 5835-      // discard the extra data until colorspace conversion
 5836-      //
 5837-      // img_mcu_x, img_mcu_y: <=17 bits; comp[i].h and .v are <=4 (checked earlier)
 5838-      // so these muls can't overflow with 32-bit ints (which we require)
 5839-      z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8;
 5840-      z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8;
 5841-      z->img_comp[i].coeff = 0;
 5842-      z->img_comp[i].raw_coeff = 0;
 5843-      z->img_comp[i].linebuf = NULL;
 5844-      z->img_comp[i].raw_data = stbi__malloc_mad2(z->img_comp[i].w2, z->img_comp[i].h2, 15);
 5845-      if (z->img_comp[i].raw_data == NULL)
 5846-         return stbi__free_jpeg_components(z, i+1, stbi__err("outofmem", "Out of memory"));
 5847-      // align blocks for idct using mmx/sse
 5848-      z->img_comp[i].data = (stbi_uc*) (((size_t) z->img_comp[i].raw_data + 15) & ~15);
 5849-      if (z->progressive) {
 5850-         // w2, h2 are multiples of 8 (see above)
 5851-         z->img_comp[i].coeff_w = z->img_comp[i].w2 / 8;
 5852-         z->img_comp[i].coeff_h = z->img_comp[i].h2 / 8;
 5853-         z->img_comp[i].raw_coeff = stbi__malloc_mad3(z->img_comp[i].w2, z->img_comp[i].h2, sizeof(short), 15);
 5854-         if (z->img_comp[i].raw_coeff == NULL)
 5855-            return stbi__free_jpeg_components(z, i+1, stbi__err("outofmem", "Out of memory"));
 5856-         z->img_comp[i].coeff = (short*) (((size_t) z->img_comp[i].raw_coeff + 15) & ~15);
 5857-      }
 5858-   }
 5859-
 5860-   return 1;
 5861+static int
 5862+stbi__process_scan_header(stbi__jpeg *z)
 5863+{
 5864+	int i;
 5865+	int Ls = stbi__get16be(z->s);
 5866+	z->scan_n = stbi__get8(z->s);
 5867+	if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int)z->s->img_n) {
 5868+		return stbi__err("bad SOS component count", "Corrupt JPEG");
 5869+	}
 5870+	if (Ls != 6 + 2 * z->scan_n) {
 5871+		return stbi__err("bad SOS len", "Corrupt JPEG");
 5872+	}
 5873+	for (i = 0; i < z->scan_n; ++i) {
 5874+		int id = stbi__get8(z->s), which;
 5875+		int q = stbi__get8(z->s);
 5876+		for (which = 0; which < z->s->img_n; ++which) {
 5877+			if (z->img_comp[which].id == id) {
 5878+				break;
 5879+			}
 5880+		}
 5881+		if (which == z->s->img_n) {
 5882+			return 0; // no match
 5883+		}
 5884+		z->img_comp[which].hd = q >> 4;
 5885+		if (z->img_comp[which].hd > 3) {
 5886+			return stbi__err("bad DC huff", "Corrupt JPEG");
 5887+		}
 5888+		z->img_comp[which].ha = q & 15;
 5889+		if (z->img_comp[which].ha > 3) {
 5890+			return stbi__err("bad AC huff", "Corrupt JPEG");
 5891+		}
 5892+		z->order[i] = which;
 5893+	}
 5894+
 5895+	{
 5896+		int aa;
 5897+		z->spec_start = stbi__get8(z->s);
 5898+		z->spec_end = stbi__get8(z->s); // should be 63, but might be 0
 5899+		aa = stbi__get8(z->s);
 5900+		z->succ_high = (aa >> 4);
 5901+		z->succ_low = (aa & 15);
 5902+		if (z->progressive) {
 5903+			if (z->spec_start > 63 || z->spec_end > 63 ||
 5904+			    z->spec_start > z->spec_end || z->succ_high > 13 ||
 5905+			    z->succ_low > 13) {
 5906+				return stbi__err("bad SOS", "Corrupt JPEG");
 5907+			}
 5908+		} else {
 5909+			if (z->spec_start != 0) {
 5910+				return stbi__err("bad SOS", "Corrupt JPEG");
 5911+			}
 5912+			if (z->succ_high != 0 || z->succ_low != 0) {
 5913+				return stbi__err("bad SOS", "Corrupt JPEG");
 5914+			}
 5915+			z->spec_end = 63;
 5916+		}
 5917+	}
 5918+
 5919+	return 1;
 5920+}
 5921+
 5922+static int
 5923+stbi__free_jpeg_components(stbi__jpeg *z, int ncomp, int why)
 5924+{
 5925+	int i;
 5926+	for (i = 0; i < ncomp; ++i) {
 5927+		if (z->img_comp[i].raw_data) {
 5928+			STBI_FREE(z->img_comp[i].raw_data);
 5929+			z->img_comp[i].raw_data = NULL;
 5930+			z->img_comp[i].data = NULL;
 5931+		}
 5932+		if (z->img_comp[i].raw_coeff) {
 5933+			STBI_FREE(z->img_comp[i].raw_coeff);
 5934+			z->img_comp[i].raw_coeff = 0;
 5935+			z->img_comp[i].coeff = 0;
 5936+		}
 5937+		if (z->img_comp[i].linebuf) {
 5938+			STBI_FREE(z->img_comp[i].linebuf);
 5939+			z->img_comp[i].linebuf = NULL;
 5940+		}
 5941+	}
 5942+	return why;
 5943+}
 5944+
 5945+static int
 5946+stbi__process_frame_header(stbi__jpeg *z, int scan)
 5947+{
 5948+	stbi__context *s = z->s;
 5949+	int Lf, p, i, q, h_max = 1, v_max = 1, c;
 5950+	Lf = stbi__get16be(s);
 5951+	if (Lf < 11) {
 5952+		return stbi__err("bad SOF len", "Corrupt JPEG"); // JPEG
 5953+	}
 5954+	p = stbi__get8(s);
 5955+	if (p != 8) {
 5956+		return stbi__err(
 5957+		    "only 8-bit",
 5958+		    "JPEG format not supported: 8-bit only"); // JPEG baseline
 5959+	}
 5960+	s->img_y = stbi__get16be(s);
 5961+	if (s->img_y == 0) {
 5962+		return stbi__err(
 5963+		    "no header height",
 5964+		    "JPEG format not supported: delayed height"); // Legal, but we don't
 5965+		                                                  // handle it--but
 5966+		                                                  // neither does IJG
 5967+	}
 5968+	s->img_x = stbi__get16be(s);
 5969+	if (s->img_x == 0) {
 5970+		return stbi__err("0 width", "Corrupt JPEG"); // JPEG requires
 5971+	}
 5972+	if (s->img_y > STBI_MAX_DIMENSIONS) {
 5973+		return stbi__err("too large", "Very large image (corrupt?)");
 5974+	}
 5975+	if (s->img_x > STBI_MAX_DIMENSIONS) {
 5976+		return stbi__err("too large", "Very large image (corrupt?)");
 5977+	}
 5978+	c = stbi__get8(s);
 5979+	if (c != 3 && c != 1 && c != 4) {
 5980+		return stbi__err("bad component count", "Corrupt JPEG");
 5981+	}
 5982+	s->img_n = c;
 5983+	for (i = 0; i < c; ++i) {
 5984+		z->img_comp[i].data = NULL;
 5985+		z->img_comp[i].linebuf = NULL;
 5986+	}
 5987+
 5988+	if (Lf != 8 + 3 * s->img_n) {
 5989+		return stbi__err("bad SOF len", "Corrupt JPEG");
 5990+	}
 5991+
 5992+	z->rgb = 0;
 5993+	for (i = 0; i < s->img_n; ++i) {
 5994+		static const unsigned char rgb[3] = {'R', 'G', 'B'};
 5995+		z->img_comp[i].id = stbi__get8(s);
 5996+		if (s->img_n == 3 && z->img_comp[i].id == rgb[i]) {
 5997+			++z->rgb;
 5998+		}
 5999+		q = stbi__get8(s);
 6000+		z->img_comp[i].h = (q >> 4);
 6001+		if (!z->img_comp[i].h || z->img_comp[i].h > 4) {
 6002+			return stbi__err("bad H", "Corrupt JPEG");
 6003+		}
 6004+		z->img_comp[i].v = q & 15;
 6005+		if (!z->img_comp[i].v || z->img_comp[i].v > 4) {
 6006+			return stbi__err("bad V", "Corrupt JPEG");
 6007+		}
 6008+		z->img_comp[i].tq = stbi__get8(s);
 6009+		if (z->img_comp[i].tq > 3) {
 6010+			return stbi__err("bad TQ", "Corrupt JPEG");
 6011+		}
 6012+	}
 6013+
 6014+	if (scan != STBI__SCAN_load) {
 6015+		return 1;
 6016+	}
 6017+
 6018+	if (!stbi__mad3sizes_valid(s->img_x, s->img_y, s->img_n, 0)) {
 6019+		return stbi__err("too large", "Image too large to decode");
 6020+	}
 6021+
 6022+	for (i = 0; i < s->img_n; ++i) {
 6023+		if (z->img_comp[i].h > h_max) {
 6024+			h_max = z->img_comp[i].h;
 6025+		}
 6026+		if (z->img_comp[i].v > v_max) {
 6027+			v_max = z->img_comp[i].v;
 6028+		}
 6029+	}
 6030+
 6031+	// check that plane subsampling factors are integer ratios; our resamplers
 6032+	// can't deal with fractional ratios and I've never seen a non-corrupted
 6033+	// JPEG file actually use them
 6034+	for (i = 0; i < s->img_n; ++i) {
 6035+		if (h_max % z->img_comp[i].h != 0) {
 6036+			return stbi__err("bad H", "Corrupt JPEG");
 6037+		}
 6038+		if (v_max % z->img_comp[i].v != 0) {
 6039+			return stbi__err("bad V", "Corrupt JPEG");
 6040+		}
 6041+	}
 6042+
 6043+	// compute interleaved mcu info
 6044+	z->img_h_max = h_max;
 6045+	z->img_v_max = v_max;
 6046+	z->img_mcu_w = h_max * 8;
 6047+	z->img_mcu_h = v_max * 8;
 6048+	// these sizes can't be more than 17 bits
 6049+	z->img_mcu_x = (s->img_x + z->img_mcu_w - 1) / z->img_mcu_w;
 6050+	z->img_mcu_y = (s->img_y + z->img_mcu_h - 1) / z->img_mcu_h;
 6051+
 6052+	for (i = 0; i < s->img_n; ++i) {
 6053+		// number of effective pixels (e.g. for non-interleaved MCU)
 6054+		z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max - 1) / h_max;
 6055+		z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max - 1) / v_max;
 6056+		// to simplify generation, we'll allocate enough memory to decode
 6057+		// the bogus oversized data from using interleaved MCUs and their
 6058+		// big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
 6059+		// discard the extra data until colorspace conversion
 6060+		//
 6061+		// img_mcu_x, img_mcu_y: <=17 bits; comp[i].h and .v are <=4 (checked
 6062+		// earlier) so these muls can't overflow with 32-bit ints (which we
 6063+		// require)
 6064+		z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8;
 6065+		z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8;
 6066+		z->img_comp[i].coeff = 0;
 6067+		z->img_comp[i].raw_coeff = 0;
 6068+		z->img_comp[i].linebuf = NULL;
 6069+		z->img_comp[i].raw_data =
 6070+		    stbi__malloc_mad2(z->img_comp[i].w2, z->img_comp[i].h2, 15);
 6071+		if (z->img_comp[i].raw_data == NULL) {
 6072+			return stbi__free_jpeg_components(
 6073+			    z, i + 1, stbi__err("outofmem", "Out of memory"));
 6074+		}
 6075+		// align blocks for idct using mmx/sse
 6076+		z->img_comp[i].data =
 6077+		    (stbi_uc *)(((size_t)z->img_comp[i].raw_data + 15) & ~15);
 6078+		if (z->progressive) {
 6079+			// w2, h2 are multiples of 8 (see above)
 6080+			z->img_comp[i].coeff_w = z->img_comp[i].w2 / 8;
 6081+			z->img_comp[i].coeff_h = z->img_comp[i].h2 / 8;
 6082+			z->img_comp[i].raw_coeff = stbi__malloc_mad3(
 6083+			    z->img_comp[i].w2, z->img_comp[i].h2, sizeof(short), 15);
 6084+			if (z->img_comp[i].raw_coeff == NULL) {
 6085+				return stbi__free_jpeg_components(
 6086+				    z, i + 1, stbi__err("outofmem", "Out of memory"));
 6087+			}
 6088+			z->img_comp[i].coeff =
 6089+			    (short *)(((size_t)z->img_comp[i].raw_coeff + 15) & ~15);
 6090+		}
 6091+	}
 6092+
 6093+	return 1;
 6094 }
 6095 
 6096 // use comparisons since in some cases we handle more than one case (e.g. SOF)
 6097-#define stbi__DNL(x)         ((x) == 0xdc)
 6098-#define stbi__SOI(x)         ((x) == 0xd8)
 6099-#define stbi__EOI(x)         ((x) == 0xd9)
 6100-#define stbi__SOF(x)         ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2)
 6101-#define stbi__SOS(x)         ((x) == 0xda)
 6102-
 6103-#define stbi__SOF_progressive(x)   ((x) == 0xc2)
 6104-
 6105-static int stbi__decode_jpeg_header(stbi__jpeg *z, int scan)
 6106-{
 6107-   int m;
 6108-   z->jfif = 0;
 6109-   z->app14_color_transform = -1; // valid values are 0,1,2
 6110-   z->marker = STBI__MARKER_none; // initialize cached marker to empty
 6111-   m = stbi__get_marker(z);
 6112-   if (!stbi__SOI(m)) return stbi__err("no SOI","Corrupt JPEG");
 6113-   if (scan == STBI__SCAN_type) return 1;
 6114-   m = stbi__get_marker(z);
 6115-   while (!stbi__SOF(m)) {
 6116-      if (!stbi__process_marker(z,m)) return 0;
 6117-      m = stbi__get_marker(z);
 6118-      while (m == STBI__MARKER_none) {
 6119-         // some files have extra padding after their blocks, so ok, we'll scan
 6120-         if (stbi__at_eof(z->s)) return stbi__err("no SOF", "Corrupt JPEG");
 6121-         m = stbi__get_marker(z);
 6122-      }
 6123-   }
 6124-   z->progressive = stbi__SOF_progressive(m);
 6125-   if (!stbi__process_frame_header(z, scan)) return 0;
 6126-   return 1;
 6127-}
 6128-
 6129-static stbi_uc stbi__skip_jpeg_junk_at_end(stbi__jpeg *j)
 6130-{
 6131-   // some JPEGs have junk at end, skip over it but if we find what looks
 6132-   // like a valid marker, resume there
 6133-   while (!stbi__at_eof(j->s)) {
 6134-      stbi_uc x = stbi__get8(j->s);
 6135-      while (x == 0xff) { // might be a marker
 6136-         if (stbi__at_eof(j->s)) return STBI__MARKER_none;
 6137-         x = stbi__get8(j->s);
 6138-         if (x != 0x00 && x != 0xff) {
 6139-            // not a stuffed zero or lead-in to another marker, looks
 6140-            // like an actual marker, return it
 6141-            return x;
 6142-         }
 6143-         // stuffed zero has x=0 now which ends the loop, meaning we go
 6144-         // back to regular scan loop.
 6145-         // repeated 0xff keeps trying to read the next byte of the marker.
 6146-      }
 6147-   }
 6148-   return STBI__MARKER_none;
 6149+#define stbi__DNL(x) ((x) == 0xdc)
 6150+#define stbi__SOI(x) ((x) == 0xd8)
 6151+#define stbi__EOI(x) ((x) == 0xd9)
 6152+#define stbi__SOF(x) ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2)
 6153+#define stbi__SOS(x) ((x) == 0xda)
 6154+
 6155+#define stbi__SOF_progressive(x) ((x) == 0xc2)
 6156+
 6157+static int
 6158+stbi__decode_jpeg_header(stbi__jpeg *z, int scan)
 6159+{
 6160+	int m;
 6161+	z->jfif = 0;
 6162+	z->app14_color_transform = -1; // valid values are 0,1,2
 6163+	z->marker = STBI__MARKER_none; // initialize cached marker to empty
 6164+	m = stbi__get_marker(z);
 6165+	if (!stbi__SOI(m)) {
 6166+		return stbi__err("no SOI", "Corrupt JPEG");
 6167+	}
 6168+	if (scan == STBI__SCAN_type) {
 6169+		return 1;
 6170+	}
 6171+	m = stbi__get_marker(z);
 6172+	while (!stbi__SOF(m)) {
 6173+		if (!stbi__process_marker(z, m)) {
 6174+			return 0;
 6175+		}
 6176+		m = stbi__get_marker(z);
 6177+		while (m == STBI__MARKER_none) {
 6178+			// some files have extra padding after their blocks, so ok, we'll
 6179+			// scan
 6180+			if (stbi__at_eof(z->s)) {
 6181+				return stbi__err("no SOF", "Corrupt JPEG");
 6182+			}
 6183+			m = stbi__get_marker(z);
 6184+		}
 6185+	}
 6186+	z->progressive = stbi__SOF_progressive(m);
 6187+	if (!stbi__process_frame_header(z, scan)) {
 6188+		return 0;
 6189+	}
 6190+	return 1;
 6191+}
 6192+
 6193+static stbi_uc
 6194+stbi__skip_jpeg_junk_at_end(stbi__jpeg *j)
 6195+{
 6196+	// some JPEGs have junk at end, skip over it but if we find what looks
 6197+	// like a valid marker, resume there
 6198+	while (!stbi__at_eof(j->s)) {
 6199+		stbi_uc x = stbi__get8(j->s);
 6200+		while (x == 0xff) { // might be a marker
 6201+			if (stbi__at_eof(j->s)) {
 6202+				return STBI__MARKER_none;
 6203+			}
 6204+			x = stbi__get8(j->s);
 6205+			if (x != 0x00 && x != 0xff) {
 6206+				// not a stuffed zero or lead-in to another marker, looks
 6207+				// like an actual marker, return it
 6208+				return x;
 6209+			}
 6210+			// stuffed zero has x=0 now which ends the loop, meaning we go
 6211+			// back to regular scan loop.
 6212+			// repeated 0xff keeps trying to read the next byte of the marker.
 6213+		}
 6214+	}
 6215+	return STBI__MARKER_none;
 6216 }
 6217 
 6218 // decode image to YCbCr format
 6219-static int stbi__decode_jpeg_image(stbi__jpeg *j)
 6220-{
 6221-   int m;
 6222-   for (m = 0; m < 4; m++) {
 6223-      j->img_comp[m].raw_data = NULL;
 6224-      j->img_comp[m].raw_coeff = NULL;
 6225-   }
 6226-   j->restart_interval = 0;
 6227-   if (!stbi__decode_jpeg_header(j, STBI__SCAN_load)) return 0;
 6228-   m = stbi__get_marker(j);
 6229-   while (!stbi__EOI(m)) {
 6230-      if (stbi__SOS(m)) {
 6231-         if (!stbi__process_scan_header(j)) return 0;
 6232-         if (!stbi__parse_entropy_coded_data(j)) return 0;
 6233-         if (j->marker == STBI__MARKER_none ) {
 6234-         j->marker = stbi__skip_jpeg_junk_at_end(j);
 6235-            // if we reach eof without hitting a marker, stbi__get_marker() below will fail and we'll eventually return 0
 6236-         }
 6237-         m = stbi__get_marker(j);
 6238-         if (STBI__RESTART(m))
 6239-            m = stbi__get_marker(j);
 6240-      } else if (stbi__DNL(m)) {
 6241-         int Ld = stbi__get16be(j->s);
 6242-         stbi__uint32 NL = stbi__get16be(j->s);
 6243-         if (Ld != 4) return stbi__err("bad DNL len", "Corrupt JPEG");
 6244-         if (NL != j->s->img_y) return stbi__err("bad DNL height", "Corrupt JPEG");
 6245-         m = stbi__get_marker(j);
 6246-      } else {
 6247-         if (!stbi__process_marker(j, m)) return 1;
 6248-         m = stbi__get_marker(j);
 6249-      }
 6250-   }
 6251-   if (j->progressive)
 6252-      stbi__jpeg_finish(j);
 6253-   return 1;
 6254+static int
 6255+stbi__decode_jpeg_image(stbi__jpeg *j)
 6256+{
 6257+	int m;
 6258+	for (m = 0; m < 4; m++) {
 6259+		j->img_comp[m].raw_data = NULL;
 6260+		j->img_comp[m].raw_coeff = NULL;
 6261+	}
 6262+	j->restart_interval = 0;
 6263+	if (!stbi__decode_jpeg_header(j, STBI__SCAN_load)) {
 6264+		return 0;
 6265+	}
 6266+	m = stbi__get_marker(j);
 6267+	while (!stbi__EOI(m)) {
 6268+		if (stbi__SOS(m)) {
 6269+			if (!stbi__process_scan_header(j)) {
 6270+				return 0;
 6271+			}
 6272+			if (!stbi__parse_entropy_coded_data(j)) {
 6273+				return 0;
 6274+			}
 6275+			if (j->marker == STBI__MARKER_none) {
 6276+				j->marker = stbi__skip_jpeg_junk_at_end(j);
 6277+				// if we reach eof without hitting a marker, stbi__get_marker()
 6278+				// below will fail and we'll eventually return 0
 6279+			}
 6280+			m = stbi__get_marker(j);
 6281+			if (STBI__RESTART(m)) {
 6282+				m = stbi__get_marker(j);
 6283+			}
 6284+		} else if (stbi__DNL(m)) {
 6285+			int Ld = stbi__get16be(j->s);
 6286+			stbi__uint32 NL = stbi__get16be(j->s);
 6287+			if (Ld != 4) {
 6288+				return stbi__err("bad DNL len", "Corrupt JPEG");
 6289+			}
 6290+			if (NL != j->s->img_y) {
 6291+				return stbi__err("bad DNL height", "Corrupt JPEG");
 6292+			}
 6293+			m = stbi__get_marker(j);
 6294+		} else {
 6295+			if (!stbi__process_marker(j, m)) {
 6296+				return 1;
 6297+			}
 6298+			m = stbi__get_marker(j);
 6299+		}
 6300+	}
 6301+	if (j->progressive) {
 6302+		stbi__jpeg_finish(j);
 6303+	}
 6304+	return 1;
 6305 }
 6306 
 6307 // static jfif-centered resampling (across block boundaries)
 6308 
 6309 typedef stbi_uc *(*resample_row_func)(stbi_uc *out, stbi_uc *in0, stbi_uc *in1,
 6310-                                    int w, int hs);
 6311+                                      int w, int hs);
 6312 
 6313-#define stbi__div4(x) ((stbi_uc) ((x) >> 2))
 6314+#define stbi__div4(x) ((stbi_uc)((x) >> 2))
 6315 
 6316-static stbi_uc *resample_row_1(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
 6317+static stbi_uc *
 6318+resample_row_1(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
 6319 {
 6320-   STBI_NOTUSED(out);
 6321-   STBI_NOTUSED(in_far);
 6322-   STBI_NOTUSED(w);
 6323-   STBI_NOTUSED(hs);
 6324-   return in_near;
 6325+	STBI_NOTUSED(out);
 6326+	STBI_NOTUSED(in_far);
 6327+	STBI_NOTUSED(w);
 6328+	STBI_NOTUSED(hs);
 6329+	return in_near;
 6330 }
 6331 
 6332-static stbi_uc* stbi__resample_row_v_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
 6333+static stbi_uc *
 6334+stbi__resample_row_v_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w,
 6335+                       int hs)
 6336 {
 6337-   // need to generate two samples vertically for every one in input
 6338-   int i;
 6339-   STBI_NOTUSED(hs);
 6340-   for (i=0; i < w; ++i)
 6341-      out[i] = stbi__div4(3*in_near[i] + in_far[i] + 2);
 6342-   return out;
 6343+	// need to generate two samples vertically for every one in input
 6344+	int i;
 6345+	STBI_NOTUSED(hs);
 6346+	for (i = 0; i < w; ++i) {
 6347+		out[i] = stbi__div4(3 * in_near[i] + in_far[i] + 2);
 6348+	}
 6349+	return out;
 6350 }
 6351 
 6352-static stbi_uc*  stbi__resample_row_h_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
 6353+static stbi_uc *
 6354+stbi__resample_row_h_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w,
 6355+                       int hs)
 6356 {
 6357-   // need to generate two samples horizontally for every one in input
 6358-   int i;
 6359-   stbi_uc *input = in_near;
 6360+	// need to generate two samples horizontally for every one in input
 6361+	int i;
 6362+	stbi_uc *input = in_near;
 6363 
 6364-   if (w == 1) {
 6365-      // if only one sample, can't do any interpolation
 6366-      out[0] = out[1] = input[0];
 6367-      return out;
 6368-   }
 6369+	if (w == 1) {
 6370+		// if only one sample, can't do any interpolation
 6371+		out[0] = out[1] = input[0];
 6372+		return out;
 6373+	}
 6374 
 6375-   out[0] = input[0];
 6376-   out[1] = stbi__div4(input[0]*3 + input[1] + 2);
 6377-   for (i=1; i < w-1; ++i) {
 6378-      int n = 3*input[i]+2;
 6379-      out[i*2+0] = stbi__div4(n+input[i-1]);
 6380-      out[i*2+1] = stbi__div4(n+input[i+1]);
 6381-   }
 6382-   out[i*2+0] = stbi__div4(input[w-2]*3 + input[w-1] + 2);
 6383-   out[i*2+1] = input[w-1];
 6384+	out[0] = input[0];
 6385+	out[1] = stbi__div4(input[0] * 3 + input[1] + 2);
 6386+	for (i = 1; i < w - 1; ++i) {
 6387+		int n = 3 * input[i] + 2;
 6388+		out[i * 2 + 0] = stbi__div4(n + input[i - 1]);
 6389+		out[i * 2 + 1] = stbi__div4(n + input[i + 1]);
 6390+	}
 6391+	out[i * 2 + 0] = stbi__div4(input[w - 2] * 3 + input[w - 1] + 2);
 6392+	out[i * 2 + 1] = input[w - 1];
 6393 
 6394-   STBI_NOTUSED(in_far);
 6395-   STBI_NOTUSED(hs);
 6396+	STBI_NOTUSED(in_far);
 6397+	STBI_NOTUSED(hs);
 6398 
 6399-   return out;
 6400+	return out;
 6401 }
 6402 
 6403-#define stbi__div16(x) ((stbi_uc) ((x) >> 4))
 6404+#define stbi__div16(x) ((stbi_uc)((x) >> 4))
 6405 
 6406-static stbi_uc *stbi__resample_row_hv_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
 6407+static stbi_uc *
 6408+stbi__resample_row_hv_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w,
 6409+                        int hs)
 6410 {
 6411-   // need to generate 2x2 samples for every one in input
 6412-   int i,t0,t1;
 6413-   if (w == 1) {
 6414-      out[0] = out[1] = stbi__div4(3*in_near[0] + in_far[0] + 2);
 6415-      return out;
 6416-   }
 6417+	// need to generate 2x2 samples for every one in input
 6418+	int i, t0, t1;
 6419+	if (w == 1) {
 6420+		out[0] = out[1] = stbi__div4(3 * in_near[0] + in_far[0] + 2);
 6421+		return out;
 6422+	}
 6423 
 6424-   t1 = 3*in_near[0] + in_far[0];
 6425-   out[0] = stbi__div4(t1+2);
 6426-   for (i=1; i < w; ++i) {
 6427-      t0 = t1;
 6428-      t1 = 3*in_near[i]+in_far[i];
 6429-      out[i*2-1] = stbi__div16(3*t0 + t1 + 8);
 6430-      out[i*2  ] = stbi__div16(3*t1 + t0 + 8);
 6431-   }
 6432-   out[w*2-1] = stbi__div4(t1+2);
 6433+	t1 = 3 * in_near[0] + in_far[0];
 6434+	out[0] = stbi__div4(t1 + 2);
 6435+	for (i = 1; i < w; ++i) {
 6436+		t0 = t1;
 6437+		t1 = 3 * in_near[i] + in_far[i];
 6438+		out[i * 2 - 1] = stbi__div16(3 * t0 + t1 + 8);
 6439+		out[i * 2] = stbi__div16(3 * t1 + t0 + 8);
 6440+	}
 6441+	out[w * 2 - 1] = stbi__div4(t1 + 2);
 6442 
 6443-   STBI_NOTUSED(hs);
 6444+	STBI_NOTUSED(hs);
 6445 
 6446-   return out;
 6447+	return out;
 6448 }
 6449 
 6450 #if defined(STBI_SSE2) || defined(STBI_NEON)
 6451-static stbi_uc *stbi__resample_row_hv_2_simd(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
 6452-{
 6453-   // need to generate 2x2 samples for every one in input
 6454-   int i=0,t0,t1;
 6455-
 6456-   if (w == 1) {
 6457-      out[0] = out[1] = stbi__div4(3*in_near[0] + in_far[0] + 2);
 6458-      return out;
 6459-   }
 6460-
 6461-   t1 = 3*in_near[0] + in_far[0];
 6462-   // process groups of 8 pixels for as long as we can.
 6463-   // note we can't handle the last pixel in a row in this loop
 6464-   // because we need to handle the filter boundary conditions.
 6465-   for (; i < ((w-1) & ~7); i += 8) {
 6466+static stbi_uc *
 6467+stbi__resample_row_hv_2_simd(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far,
 6468+                             int w, int hs)
 6469+{
 6470+	// need to generate 2x2 samples for every one in input
 6471+	int i = 0, t0, t1;
 6472+
 6473+	if (w == 1) {
 6474+		out[0] = out[1] = stbi__div4(3 * in_near[0] + in_far[0] + 2);
 6475+		return out;
 6476+	}
 6477+
 6478+	t1 = 3 * in_near[0] + in_far[0];
 6479+	// process groups of 8 pixels for as long as we can.
 6480+	// note we can't handle the last pixel in a row in this loop
 6481+	// because we need to handle the filter boundary conditions.
 6482+	for (; i < ((w - 1) & ~7); i += 8) {
 6483 #if defined(STBI_SSE2)
 6484-      // load and perform the vertical filtering pass
 6485-      // this uses 3*x + y = 4*x + (y - x)
 6486-      __m128i zero  = _mm_setzero_si128();
 6487-      __m128i farb  = _mm_loadl_epi64((__m128i *) (in_far + i));
 6488-      __m128i nearb = _mm_loadl_epi64((__m128i *) (in_near + i));
 6489-      __m128i farw  = _mm_unpacklo_epi8(farb, zero);
 6490-      __m128i nearw = _mm_unpacklo_epi8(nearb, zero);
 6491-      __m128i diff  = _mm_sub_epi16(farw, nearw);
 6492-      __m128i nears = _mm_slli_epi16(nearw, 2);
 6493-      __m128i curr  = _mm_add_epi16(nears, diff); // current row
 6494-
 6495-      // horizontal filter works the same based on shifted vers of current
 6496-      // row. "prev" is current row shifted right by 1 pixel; we need to
 6497-      // insert the previous pixel value (from t1).
 6498-      // "next" is current row shifted left by 1 pixel, with first pixel
 6499-      // of next block of 8 pixels added in.
 6500-      __m128i prv0 = _mm_slli_si128(curr, 2);
 6501-      __m128i nxt0 = _mm_srli_si128(curr, 2);
 6502-      __m128i prev = _mm_insert_epi16(prv0, t1, 0);
 6503-      __m128i next = _mm_insert_epi16(nxt0, 3*in_near[i+8] + in_far[i+8], 7);
 6504-
 6505-      // horizontal filter, polyphase implementation since it's convenient:
 6506-      // even pixels = 3*cur + prev = cur*4 + (prev - cur)
 6507-      // odd  pixels = 3*cur + next = cur*4 + (next - cur)
 6508-      // note the shared term.
 6509-      __m128i bias  = _mm_set1_epi16(8);
 6510-      __m128i curs = _mm_slli_epi16(curr, 2);
 6511-      __m128i prvd = _mm_sub_epi16(prev, curr);
 6512-      __m128i nxtd = _mm_sub_epi16(next, curr);
 6513-      __m128i curb = _mm_add_epi16(curs, bias);
 6514-      __m128i even = _mm_add_epi16(prvd, curb);
 6515-      __m128i odd  = _mm_add_epi16(nxtd, curb);
 6516-
 6517-      // interleave even and odd pixels, then undo scaling.
 6518-      __m128i int0 = _mm_unpacklo_epi16(even, odd);
 6519-      __m128i int1 = _mm_unpackhi_epi16(even, odd);
 6520-      __m128i de0  = _mm_srli_epi16(int0, 4);
 6521-      __m128i de1  = _mm_srli_epi16(int1, 4);
 6522-
 6523-      // pack and write output
 6524-      __m128i outv = _mm_packus_epi16(de0, de1);
 6525-      _mm_storeu_si128((__m128i *) (out + i*2), outv);
 6526+		// load and perform the vertical filtering pass
 6527+		// this uses 3*x + y = 4*x + (y - x)
 6528+		__m128i zero = _mm_setzero_si128();
 6529+		__m128i farb = _mm_loadl_epi64((__m128i *)(in_far + i));
 6530+		__m128i nearb = _mm_loadl_epi64((__m128i *)(in_near + i));
 6531+		__m128i farw = _mm_unpacklo_epi8(farb, zero);
 6532+		__m128i nearw = _mm_unpacklo_epi8(nearb, zero);
 6533+		__m128i diff = _mm_sub_epi16(farw, nearw);
 6534+		__m128i nears = _mm_slli_epi16(nearw, 2);
 6535+		__m128i curr = _mm_add_epi16(nears, diff); // current row
 6536+
 6537+		// horizontal filter works the same based on shifted vers of current
 6538+		// row. "prev" is current row shifted right by 1 pixel; we need to
 6539+		// insert the previous pixel value (from t1).
 6540+		// "next" is current row shifted left by 1 pixel, with first pixel
 6541+		// of next block of 8 pixels added in.
 6542+		__m128i prv0 = _mm_slli_si128(curr, 2);
 6543+		__m128i nxt0 = _mm_srli_si128(curr, 2);
 6544+		__m128i prev = _mm_insert_epi16(prv0, t1, 0);
 6545+		__m128i next =
 6546+		    _mm_insert_epi16(nxt0, 3 * in_near[i + 8] + in_far[i + 8], 7);
 6547+
 6548+		// horizontal filter, polyphase implementation since it's convenient:
 6549+		// even pixels = 3*cur + prev = cur*4 + (prev - cur)
 6550+		// odd  pixels = 3*cur + next = cur*4 + (next - cur)
 6551+		// note the shared term.
 6552+		__m128i bias = _mm_set1_epi16(8);
 6553+		__m128i curs = _mm_slli_epi16(curr, 2);
 6554+		__m128i prvd = _mm_sub_epi16(prev, curr);
 6555+		__m128i nxtd = _mm_sub_epi16(next, curr);
 6556+		__m128i curb = _mm_add_epi16(curs, bias);
 6557+		__m128i even = _mm_add_epi16(prvd, curb);
 6558+		__m128i odd = _mm_add_epi16(nxtd, curb);
 6559+
 6560+		// interleave even and odd pixels, then undo scaling.
 6561+		__m128i int0 = _mm_unpacklo_epi16(even, odd);
 6562+		__m128i int1 = _mm_unpackhi_epi16(even, odd);
 6563+		__m128i de0 = _mm_srli_epi16(int0, 4);
 6564+		__m128i de1 = _mm_srli_epi16(int1, 4);
 6565+
 6566+		// pack and write output
 6567+		__m128i outv = _mm_packus_epi16(de0, de1);
 6568+		_mm_storeu_si128((__m128i *)(out + i * 2), outv);
 6569 #elif defined(STBI_NEON)
 6570-      // load and perform the vertical filtering pass
 6571-      // this uses 3*x + y = 4*x + (y - x)
 6572-      uint8x8_t farb  = vld1_u8(in_far + i);
 6573-      uint8x8_t nearb = vld1_u8(in_near + i);
 6574-      int16x8_t diff  = vreinterpretq_s16_u16(vsubl_u8(farb, nearb));
 6575-      int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2));
 6576-      int16x8_t curr  = vaddq_s16(nears, diff); // current row
 6577-
 6578-      // horizontal filter works the same based on shifted vers of current
 6579-      // row. "prev" is current row shifted right by 1 pixel; we need to
 6580-      // insert the previous pixel value (from t1).
 6581-      // "next" is current row shifted left by 1 pixel, with first pixel
 6582-      // of next block of 8 pixels added in.
 6583-      int16x8_t prv0 = vextq_s16(curr, curr, 7);
 6584-      int16x8_t nxt0 = vextq_s16(curr, curr, 1);
 6585-      int16x8_t prev = vsetq_lane_s16(t1, prv0, 0);
 6586-      int16x8_t next = vsetq_lane_s16(3*in_near[i+8] + in_far[i+8], nxt0, 7);
 6587-
 6588-      // horizontal filter, polyphase implementation since it's convenient:
 6589-      // even pixels = 3*cur + prev = cur*4 + (prev - cur)
 6590-      // odd  pixels = 3*cur + next = cur*4 + (next - cur)
 6591-      // note the shared term.
 6592-      int16x8_t curs = vshlq_n_s16(curr, 2);
 6593-      int16x8_t prvd = vsubq_s16(prev, curr);
 6594-      int16x8_t nxtd = vsubq_s16(next, curr);
 6595-      int16x8_t even = vaddq_s16(curs, prvd);
 6596-      int16x8_t odd  = vaddq_s16(curs, nxtd);
 6597-
 6598-      // undo scaling and round, then store with even/odd phases interleaved
 6599-      uint8x8x2_t o;
 6600-      o.val[0] = vqrshrun_n_s16(even, 4);
 6601-      o.val[1] = vqrshrun_n_s16(odd,  4);
 6602-      vst2_u8(out + i*2, o);
 6603-#endif
 6604-
 6605-      // "previous" value for next iter
 6606-      t1 = 3*in_near[i+7] + in_far[i+7];
 6607-   }
 6608-
 6609-   t0 = t1;
 6610-   t1 = 3*in_near[i] + in_far[i];
 6611-   out[i*2] = stbi__div16(3*t1 + t0 + 8);
 6612-
 6613-   for (++i; i < w; ++i) {
 6614-      t0 = t1;
 6615-      t1 = 3*in_near[i]+in_far[i];
 6616-      out[i*2-1] = stbi__div16(3*t0 + t1 + 8);
 6617-      out[i*2  ] = stbi__div16(3*t1 + t0 + 8);
 6618-   }
 6619-   out[w*2-1] = stbi__div4(t1+2);
 6620-
 6621-   STBI_NOTUSED(hs);
 6622-
 6623-   return out;
 6624-}
 6625-#endif
 6626-
 6627-static stbi_uc *stbi__resample_row_generic(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
 6628-{
 6629-   // resample with nearest-neighbor
 6630-   int i,j;
 6631-   STBI_NOTUSED(in_far);
 6632-   for (i=0; i < w; ++i)
 6633-      for (j=0; j < hs; ++j)
 6634-         out[i*hs+j] = in_near[i];
 6635-   return out;
 6636+		// load and perform the vertical filtering pass
 6637+		// this uses 3*x + y = 4*x + (y - x)
 6638+		uint8x8_t farb = vld1_u8(in_far + i);
 6639+		uint8x8_t nearb = vld1_u8(in_near + i);
 6640+		int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(farb, nearb));
 6641+		int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2));
 6642+		int16x8_t curr = vaddq_s16(nears, diff); // current row
 6643+
 6644+		// horizontal filter works the same based on shifted vers of current
 6645+		// row. "prev" is current row shifted right by 1 pixel; we need to
 6646+		// insert the previous pixel value (from t1).
 6647+		// "next" is current row shifted left by 1 pixel, with first pixel
 6648+		// of next block of 8 pixels added in.
 6649+		int16x8_t prv0 = vextq_s16(curr, curr, 7);
 6650+		int16x8_t nxt0 = vextq_s16(curr, curr, 1);
 6651+		int16x8_t prev = vsetq_lane_s16(t1, prv0, 0);
 6652+		int16x8_t next =
 6653+		    vsetq_lane_s16(3 * in_near[i + 8] + in_far[i + 8], nxt0, 7);
 6654+
 6655+		// horizontal filter, polyphase implementation since it's convenient:
 6656+		// even pixels = 3*cur + prev = cur*4 + (prev - cur)
 6657+		// odd  pixels = 3*cur + next = cur*4 + (next - cur)
 6658+		// note the shared term.
 6659+		int16x8_t curs = vshlq_n_s16(curr, 2);
 6660+		int16x8_t prvd = vsubq_s16(prev, curr);
 6661+		int16x8_t nxtd = vsubq_s16(next, curr);
 6662+		int16x8_t even = vaddq_s16(curs, prvd);
 6663+		int16x8_t odd = vaddq_s16(curs, nxtd);
 6664+
 6665+		// undo scaling and round, then store with even/odd phases interleaved
 6666+		uint8x8x2_t o;
 6667+		o.val[0] = vqrshrun_n_s16(even, 4);
 6668+		o.val[1] = vqrshrun_n_s16(odd, 4);
 6669+		vst2_u8(out + i * 2, o);
 6670+#endif
 6671+
 6672+		// "previous" value for next iter
 6673+		t1 = 3 * in_near[i + 7] + in_far[i + 7];
 6674+	}
 6675+
 6676+	t0 = t1;
 6677+	t1 = 3 * in_near[i] + in_far[i];
 6678+	out[i * 2] = stbi__div16(3 * t1 + t0 + 8);
 6679+
 6680+	for (++i; i < w; ++i) {
 6681+		t0 = t1;
 6682+		t1 = 3 * in_near[i] + in_far[i];
 6683+		out[i * 2 - 1] = stbi__div16(3 * t0 + t1 + 8);
 6684+		out[i * 2] = stbi__div16(3 * t1 + t0 + 8);
 6685+	}
 6686+	out[w * 2 - 1] = stbi__div4(t1 + 2);
 6687+
 6688+	STBI_NOTUSED(hs);
 6689+
 6690+	return out;
 6691+}
 6692+#endif
 6693+
 6694+static stbi_uc *
 6695+stbi__resample_row_generic(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far,
 6696+                           int w, int hs)
 6697+{
 6698+	// resample with nearest-neighbor
 6699+	int i, j;
 6700+	STBI_NOTUSED(in_far);
 6701+	for (i = 0; i < w; ++i) {
 6702+		for (j = 0; j < hs; ++j) {
 6703+			out[i * hs + j] = in_near[i];
 6704+		}
 6705+	}
 6706+	return out;
 6707 }
 6708 
 6709 // this is a reduced-precision calculation of YCbCr-to-RGB introduced
 6710 // to make sure the code produces the same results in both SIMD and scalar
 6711-#define stbi__float2fixed(x)  (((int) ((x) * 4096.0f + 0.5f)) << 8)
 6712-static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step)
 6713-{
 6714-   int i;
 6715-   for (i=0; i < count; ++i) {
 6716-      int y_fixed = (y[i] << 20) + (1<<19); // rounding
 6717-      int r,g,b;
 6718-      int cr = pcr[i] - 128;
 6719-      int cb = pcb[i] - 128;
 6720-      r = y_fixed +  cr* stbi__float2fixed(1.40200f);
 6721-      g = y_fixed + (cr*-stbi__float2fixed(0.71414f)) + ((cb*-stbi__float2fixed(0.34414f)) & 0xffff0000);
 6722-      b = y_fixed                                     +   cb* stbi__float2fixed(1.77200f);
 6723-      r >>= 20;
 6724-      g >>= 20;
 6725-      b >>= 20;
 6726-      if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; }
 6727-      if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; }
 6728-      if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; }
 6729-      out[0] = (stbi_uc)r;
 6730-      out[1] = (stbi_uc)g;
 6731-      out[2] = (stbi_uc)b;
 6732-      out[3] = 255;
 6733-      out += step;
 6734-   }
 6735+#define stbi__float2fixed(x) (((int)((x) * 4096.0f + 0.5f)) << 8)
 6736+static void
 6737+stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb,
 6738+                       const stbi_uc *pcr, int count, int step)
 6739+{
 6740+	int i;
 6741+	for (i = 0; i < count; ++i) {
 6742+		int y_fixed = (y[i] << 20) + (1 << 19); // rounding
 6743+		int r, g, b;
 6744+		int cr = pcr[i] - 128;
 6745+		int cb = pcb[i] - 128;
 6746+		r = y_fixed + cr * stbi__float2fixed(1.40200f);
 6747+		g = y_fixed + (cr * -stbi__float2fixed(0.71414f)) +
 6748+		    ((cb * -stbi__float2fixed(0.34414f)) & 0xffff0000);
 6749+		b = y_fixed + cb * stbi__float2fixed(1.77200f);
 6750+		r >>= 20;
 6751+		g >>= 20;
 6752+		b >>= 20;
 6753+		if ((unsigned)r > 255) {
 6754+			if (r < 0) {
 6755+				r = 0;
 6756+			} else {
 6757+				r = 255;
 6758+			}
 6759+		}
 6760+		if ((unsigned)g > 255) {
 6761+			if (g < 0) {
 6762+				g = 0;
 6763+			} else {
 6764+				g = 255;
 6765+			}
 6766+		}
 6767+		if ((unsigned)b > 255) {
 6768+			if (b < 0) {
 6769+				b = 0;
 6770+			} else {
 6771+				b = 255;
 6772+			}
 6773+		}
 6774+		out[0] = (stbi_uc)r;
 6775+		out[1] = (stbi_uc)g;
 6776+		out[2] = (stbi_uc)b;
 6777+		out[3] = 255;
 6778+		out += step;
 6779+	}
 6780 }
 6781 
 6782 #if defined(STBI_SSE2) || defined(STBI_NEON)
 6783-static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, stbi_uc const *pcb, stbi_uc const *pcr, int count, int step)
 6784+static void
 6785+stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, stbi_uc const *pcb,
 6786+                        stbi_uc const *pcr, int count, int step)
 6787 {
 6788-   int i = 0;
 6789+	int i = 0;
 6790 
 6791 #ifdef STBI_SSE2
 6792-   // step == 3 is pretty ugly on the final interleave, and i'm not convinced
 6793-   // it's useful in practice (you wouldn't use it for textures, for example).
 6794-   // so just accelerate step == 4 case.
 6795-   if (step == 4) {
 6796-      // this is a fairly straightforward implementation and not super-optimized.
 6797-      __m128i signflip  = _mm_set1_epi8(-0x80);
 6798-      __m128i cr_const0 = _mm_set1_epi16(   (short) ( 1.40200f*4096.0f+0.5f));
 6799-      __m128i cr_const1 = _mm_set1_epi16( - (short) ( 0.71414f*4096.0f+0.5f));
 6800-      __m128i cb_const0 = _mm_set1_epi16( - (short) ( 0.34414f*4096.0f+0.5f));
 6801-      __m128i cb_const1 = _mm_set1_epi16(   (short) ( 1.77200f*4096.0f+0.5f));
 6802-      __m128i y_bias = _mm_set1_epi8((char) (unsigned char) 128);
 6803-      __m128i xw = _mm_set1_epi16(255); // alpha channel
 6804-
 6805-      for (; i+7 < count; i += 8) {
 6806-         // load
 6807-         __m128i y_bytes = _mm_loadl_epi64((__m128i *) (y+i));
 6808-         __m128i cr_bytes = _mm_loadl_epi64((__m128i *) (pcr+i));
 6809-         __m128i cb_bytes = _mm_loadl_epi64((__m128i *) (pcb+i));
 6810-         __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128
 6811-         __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128
 6812-
 6813-         // unpack to short (and left-shift cr, cb by 8)
 6814-         __m128i yw  = _mm_unpacklo_epi8(y_bias, y_bytes);
 6815-         __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased);
 6816-         __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased);
 6817-
 6818-         // color transform
 6819-         __m128i yws = _mm_srli_epi16(yw, 4);
 6820-         __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw);
 6821-         __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw);
 6822-         __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1);
 6823-         __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1);
 6824-         __m128i rws = _mm_add_epi16(cr0, yws);
 6825-         __m128i gwt = _mm_add_epi16(cb0, yws);
 6826-         __m128i bws = _mm_add_epi16(yws, cb1);
 6827-         __m128i gws = _mm_add_epi16(gwt, cr1);
 6828-
 6829-         // descale
 6830-         __m128i rw = _mm_srai_epi16(rws, 4);
 6831-         __m128i bw = _mm_srai_epi16(bws, 4);
 6832-         __m128i gw = _mm_srai_epi16(gws, 4);
 6833-
 6834-         // back to byte, set up for transpose
 6835-         __m128i brb = _mm_packus_epi16(rw, bw);
 6836-         __m128i gxb = _mm_packus_epi16(gw, xw);
 6837-
 6838-         // transpose to interleave channels
 6839-         __m128i t0 = _mm_unpacklo_epi8(brb, gxb);
 6840-         __m128i t1 = _mm_unpackhi_epi8(brb, gxb);
 6841-         __m128i o0 = _mm_unpacklo_epi16(t0, t1);
 6842-         __m128i o1 = _mm_unpackhi_epi16(t0, t1);
 6843-
 6844-         // store
 6845-         _mm_storeu_si128((__m128i *) (out + 0), o0);
 6846-         _mm_storeu_si128((__m128i *) (out + 16), o1);
 6847-         out += 32;
 6848-      }
 6849-   }
 6850+	// step == 3 is pretty ugly on the final interleave, and i'm not convinced
 6851+	// it's useful in practice (you wouldn't use it for textures, for example).
 6852+	// so just accelerate step == 4 case.
 6853+	if (step == 4) {
 6854+		// this is a fairly straightforward implementation and not
 6855+		// super-optimized.
 6856+		__m128i signflip = _mm_set1_epi8(-0x80);
 6857+		__m128i cr_const0 = _mm_set1_epi16((short)(1.40200f * 4096.0f + 0.5f));
 6858+		__m128i cr_const1 = _mm_set1_epi16(-(short)(0.71414f * 4096.0f + 0.5f));
 6859+		__m128i cb_const0 = _mm_set1_epi16(-(short)(0.34414f * 4096.0f + 0.5f));
 6860+		__m128i cb_const1 = _mm_set1_epi16((short)(1.77200f * 4096.0f + 0.5f));
 6861+		__m128i y_bias = _mm_set1_epi8((char)(unsigned char)128);
 6862+		__m128i xw = _mm_set1_epi16(255); // alpha channel
 6863+
 6864+		for (; i + 7 < count; i += 8) {
 6865+			// load
 6866+			__m128i y_bytes = _mm_loadl_epi64((__m128i *)(y + i));
 6867+			__m128i cr_bytes = _mm_loadl_epi64((__m128i *)(pcr + i));
 6868+			__m128i cb_bytes = _mm_loadl_epi64((__m128i *)(pcb + i));
 6869+			__m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128
 6870+			__m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128
 6871+
 6872+			// unpack to short (and left-shift cr, cb by 8)
 6873+			__m128i yw = _mm_unpacklo_epi8(y_bias, y_bytes);
 6874+			__m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased);
 6875+			__m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased);
 6876+
 6877+			// color transform
 6878+			__m128i yws = _mm_srli_epi16(yw, 4);
 6879+			__m128i cr0 = _mm_mulhi_epi16(cr_const0, crw);
 6880+			__m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw);
 6881+			__m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1);
 6882+			__m128i cr1 = _mm_mulhi_epi16(crw, cr_const1);
 6883+			__m128i rws = _mm_add_epi16(cr0, yws);
 6884+			__m128i gwt = _mm_add_epi16(cb0, yws);
 6885+			__m128i bws = _mm_add_epi16(yws, cb1);
 6886+			__m128i gws = _mm_add_epi16(gwt, cr1);
 6887+
 6888+			// descale
 6889+			__m128i rw = _mm_srai_epi16(rws, 4);
 6890+			__m128i bw = _mm_srai_epi16(bws, 4);
 6891+			__m128i gw = _mm_srai_epi16(gws, 4);
 6892+
 6893+			// back to byte, set up for transpose
 6894+			__m128i brb = _mm_packus_epi16(rw, bw);
 6895+			__m128i gxb = _mm_packus_epi16(gw, xw);
 6896+
 6897+			// transpose to interleave channels
 6898+			__m128i t0 = _mm_unpacklo_epi8(brb, gxb);
 6899+			__m128i t1 = _mm_unpackhi_epi8(brb, gxb);
 6900+			__m128i o0 = _mm_unpacklo_epi16(t0, t1);
 6901+			__m128i o1 = _mm_unpackhi_epi16(t0, t1);
 6902+
 6903+			// store
 6904+			_mm_storeu_si128((__m128i *)(out + 0), o0);
 6905+			_mm_storeu_si128((__m128i *)(out + 16), o1);
 6906+			out += 32;
 6907+		}
 6908+	}
 6909 #endif
 6910 
 6911 #ifdef STBI_NEON
 6912-   // in this version, step=3 support would be easy to add. but is there demand?
 6913-   if (step == 4) {
 6914-      // this is a fairly straightforward implementation and not super-optimized.
 6915-      uint8x8_t signflip = vdup_n_u8(0x80);
 6916-      int16x8_t cr_const0 = vdupq_n_s16(   (short) ( 1.40200f*4096.0f+0.5f));
 6917-      int16x8_t cr_const1 = vdupq_n_s16( - (short) ( 0.71414f*4096.0f+0.5f));
 6918-      int16x8_t cb_const0 = vdupq_n_s16( - (short) ( 0.34414f*4096.0f+0.5f));
 6919-      int16x8_t cb_const1 = vdupq_n_s16(   (short) ( 1.77200f*4096.0f+0.5f));
 6920-
 6921-      for (; i+7 < count; i += 8) {
 6922-         // load
 6923-         uint8x8_t y_bytes  = vld1_u8(y + i);
 6924-         uint8x8_t cr_bytes = vld1_u8(pcr + i);
 6925-         uint8x8_t cb_bytes = vld1_u8(pcb + i);
 6926-         int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip));
 6927-         int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip));
 6928-
 6929-         // expand to s16
 6930-         int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4));
 6931-         int16x8_t crw = vshll_n_s8(cr_biased, 7);
 6932-         int16x8_t cbw = vshll_n_s8(cb_biased, 7);
 6933-
 6934-         // color transform
 6935-         int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0);
 6936-         int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0);
 6937-         int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1);
 6938-         int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1);
 6939-         int16x8_t rws = vaddq_s16(yws, cr0);
 6940-         int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1);
 6941-         int16x8_t bws = vaddq_s16(yws, cb1);
 6942-
 6943-         // undo scaling, round, convert to byte
 6944-         uint8x8x4_t o;
 6945-         o.val[0] = vqrshrun_n_s16(rws, 4);
 6946-         o.val[1] = vqrshrun_n_s16(gws, 4);
 6947-         o.val[2] = vqrshrun_n_s16(bws, 4);
 6948-         o.val[3] = vdup_n_u8(255);
 6949-
 6950-         // store, interleaving r/g/b/a
 6951-         vst4_u8(out, o);
 6952-         out += 8*4;
 6953-      }
 6954-   }
 6955-#endif
 6956-
 6957-   for (; i < count; ++i) {
 6958-      int y_fixed = (y[i] << 20) + (1<<19); // rounding
 6959-      int r,g,b;
 6960-      int cr = pcr[i] - 128;
 6961-      int cb = pcb[i] - 128;
 6962-      r = y_fixed + cr* stbi__float2fixed(1.40200f);
 6963-      g = y_fixed + cr*-stbi__float2fixed(0.71414f) + ((cb*-stbi__float2fixed(0.34414f)) & 0xffff0000);
 6964-      b = y_fixed                                   +   cb* stbi__float2fixed(1.77200f);
 6965-      r >>= 20;
 6966-      g >>= 20;
 6967-      b >>= 20;
 6968-      if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; }
 6969-      if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; }
 6970-      if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; }
 6971-      out[0] = (stbi_uc)r;
 6972-      out[1] = (stbi_uc)g;
 6973-      out[2] = (stbi_uc)b;
 6974-      out[3] = 255;
 6975-      out += step;
 6976-   }
 6977+	// in this version, step=3 support would be easy to add. but is there
 6978+	// demand?
 6979+	if (step == 4) {
 6980+		// this is a fairly straightforward implementation and not
 6981+		// super-optimized.
 6982+		uint8x8_t signflip = vdup_n_u8(0x80);
 6983+		int16x8_t cr_const0 = vdupq_n_s16((short)(1.40200f * 4096.0f + 0.5f));
 6984+		int16x8_t cr_const1 = vdupq_n_s16(-(short)(0.71414f * 4096.0f + 0.5f));
 6985+		int16x8_t cb_const0 = vdupq_n_s16(-(short)(0.34414f * 4096.0f + 0.5f));
 6986+		int16x8_t cb_const1 = vdupq_n_s16((short)(1.77200f * 4096.0f + 0.5f));
 6987+
 6988+		for (; i + 7 < count; i += 8) {
 6989+			// load
 6990+			uint8x8_t y_bytes = vld1_u8(y + i);
 6991+			uint8x8_t cr_bytes = vld1_u8(pcr + i);
 6992+			uint8x8_t cb_bytes = vld1_u8(pcb + i);
 6993+			int8x8_t cr_biased =
 6994+			    vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip));
 6995+			int8x8_t cb_biased =
 6996+			    vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip));
 6997+
 6998+			// expand to s16
 6999+			int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4));
 7000+			int16x8_t crw = vshll_n_s8(cr_biased, 7);
 7001+			int16x8_t cbw = vshll_n_s8(cb_biased, 7);
 7002+
 7003+			// color transform
 7004+			int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0);
 7005+			int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0);
 7006+			int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1);
 7007+			int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1);
 7008+			int16x8_t rws = vaddq_s16(yws, cr0);
 7009+			int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1);
 7010+			int16x8_t bws = vaddq_s16(yws, cb1);
 7011+
 7012+			// undo scaling, round, convert to byte
 7013+			uint8x8x4_t o;
 7014+			o.val[0] = vqrshrun_n_s16(rws, 4);
 7015+			o.val[1] = vqrshrun_n_s16(gws, 4);
 7016+			o.val[2] = vqrshrun_n_s16(bws, 4);
 7017+			o.val[3] = vdup_n_u8(255);
 7018+
 7019+			// store, interleaving r/g/b/a
 7020+			vst4_u8(out, o);
 7021+			out += 8 * 4;
 7022+		}
 7023+	}
 7024+#endif
 7025+
 7026+	for (; i < count; ++i) {
 7027+		int y_fixed = (y[i] << 20) + (1 << 19); // rounding
 7028+		int r, g, b;
 7029+		int cr = pcr[i] - 128;
 7030+		int cb = pcb[i] - 128;
 7031+		r = y_fixed + cr * stbi__float2fixed(1.40200f);
 7032+		g = y_fixed + cr * -stbi__float2fixed(0.71414f) +
 7033+		    ((cb * -stbi__float2fixed(0.34414f)) & 0xffff0000);
 7034+		b = y_fixed + cb * stbi__float2fixed(1.77200f);
 7035+		r >>= 20;
 7036+		g >>= 20;
 7037+		b >>= 20;
 7038+		if ((unsigned)r > 255) {
 7039+			if (r < 0) {
 7040+				r = 0;
 7041+			} else {
 7042+				r = 255;
 7043+			}
 7044+		}
 7045+		if ((unsigned)g > 255) {
 7046+			if (g < 0) {
 7047+				g = 0;
 7048+			} else {
 7049+				g = 255;
 7050+			}
 7051+		}
 7052+		if ((unsigned)b > 255) {
 7053+			if (b < 0) {
 7054+				b = 0;
 7055+			} else {
 7056+				b = 255;
 7057+			}
 7058+		}
 7059+		out[0] = (stbi_uc)r;
 7060+		out[1] = (stbi_uc)g;
 7061+		out[2] = (stbi_uc)b;
 7062+		out[3] = 255;
 7063+		out += step;
 7064+	}
 7065 }
 7066 #endif
 7067 
 7068 // set up the kernels
 7069-static void stbi__setup_jpeg(stbi__jpeg *j)
 7070+static void
 7071+stbi__setup_jpeg(stbi__jpeg *j)
 7072 {
 7073-   j->idct_block_kernel = stbi__idct_block;
 7074-   j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_row;
 7075-   j->resample_row_hv_2_kernel = stbi__resample_row_hv_2;
 7076+	j->idct_block_kernel = stbi__idct_block;
 7077+	j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_row;
 7078+	j->resample_row_hv_2_kernel = stbi__resample_row_hv_2;
 7079 
 7080 #ifdef STBI_SSE2
 7081-   if (stbi__sse2_available()) {
 7082-      j->idct_block_kernel = stbi__idct_simd;
 7083-      j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
 7084-      j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
 7085-   }
 7086+	if (stbi__sse2_available()) {
 7087+		j->idct_block_kernel = stbi__idct_simd;
 7088+		j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
 7089+		j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
 7090+	}
 7091 #endif
 7092 
 7093 #ifdef STBI_NEON
 7094-   j->idct_block_kernel = stbi__idct_simd;
 7095-   j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
 7096-   j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
 7097+	j->idct_block_kernel = stbi__idct_simd;
 7098+	j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
 7099+	j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
 7100 #endif
 7101 }
 7102 
 7103 // clean up the temporary component buffers
 7104-static void stbi__cleanup_jpeg(stbi__jpeg *j)
 7105+static void
 7106+stbi__cleanup_jpeg(stbi__jpeg *j)
 7107 {
 7108-   stbi__free_jpeg_components(j, j->s->img_n, 0);
 7109+	stbi__free_jpeg_components(j, j->s->img_n, 0);
 7110 }
 7111 
 7112-typedef struct
 7113-{
 7114-   resample_row_func resample;
 7115-   stbi_uc *line0,*line1;
 7116-   int hs,vs;   // expansion factor in each axis
 7117-   int w_lores; // horizontal pixels pre-expansion
 7118-   int ystep;   // how far through vertical expansion we are
 7119-   int ypos;    // which pre-expansion row we're on
 7120+typedef struct {
 7121+	resample_row_func resample;
 7122+	stbi_uc *line0, *line1;
 7123+	int hs, vs;  // expansion factor in each axis
 7124+	int w_lores; // horizontal pixels pre-expansion
 7125+	int ystep;   // how far through vertical expansion we are
 7126+	int ypos;    // which pre-expansion row we're on
 7127 } stbi__resample;
 7128 
 7129 // fast 0..255 * 0..255 => 0..255 rounded multiplication
 7130-static stbi_uc stbi__blinn_8x8(stbi_uc x, stbi_uc y)
 7131-{
 7132-   unsigned int t = x*y + 128;
 7133-   return (stbi_uc) ((t + (t >>8)) >> 8);
 7134-}
 7135-
 7136-static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp, int req_comp)
 7137-{
 7138-   int n, decode_n, is_rgb;
 7139-   z->s->img_n = 0; // make stbi__cleanup_jpeg safe
 7140-
 7141-   // validate req_comp
 7142-   if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error");
 7143-
 7144-   // load a jpeg image from whichever source, but leave in YCbCr format
 7145-   if (!stbi__decode_jpeg_image(z)) { stbi__cleanup_jpeg(z); return NULL; }
 7146-
 7147-   // determine actual number of components to generate
 7148-   n = req_comp ? req_comp : z->s->img_n >= 3 ? 3 : 1;
 7149-
 7150-   is_rgb = z->s->img_n == 3 && (z->rgb == 3 || (z->app14_color_transform == 0 && !z->jfif));
 7151-
 7152-   if (z->s->img_n == 3 && n < 3 && !is_rgb)
 7153-      decode_n = 1;
 7154-   else
 7155-      decode_n = z->s->img_n;
 7156-
 7157-   // nothing to do if no components requested; check this now to avoid
 7158-   // accessing uninitialized coutput[0] later
 7159-   if (decode_n <= 0) { stbi__cleanup_jpeg(z); return NULL; }
 7160-
 7161-   // resample and color-convert
 7162-   {
 7163-      int k;
 7164-      unsigned int i,j;
 7165-      stbi_uc *output;
 7166-      stbi_uc *coutput[4] = { NULL, NULL, NULL, NULL };
 7167-
 7168-      stbi__resample res_comp[4];
 7169-
 7170-      for (k=0; k < decode_n; ++k) {
 7171-         stbi__resample *r = &res_comp[k];
 7172-
 7173-         // allocate line buffer big enough for upsampling off the edges
 7174-         // with upsample factor of 4
 7175-         z->img_comp[k].linebuf = (stbi_uc *) stbi__malloc(z->s->img_x + 3);
 7176-         if (!z->img_comp[k].linebuf) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); }
 7177-
 7178-         r->hs      = z->img_h_max / z->img_comp[k].h;
 7179-         r->vs      = z->img_v_max / z->img_comp[k].v;
 7180-         r->ystep   = r->vs >> 1;
 7181-         r->w_lores = (z->s->img_x + r->hs-1) / r->hs;
 7182-         r->ypos    = 0;
 7183-         r->line0   = r->line1 = z->img_comp[k].data;
 7184-
 7185-         if      (r->hs == 1 && r->vs == 1) r->resample = resample_row_1;
 7186-         else if (r->hs == 1 && r->vs == 2) r->resample = stbi__resample_row_v_2;
 7187-         else if (r->hs == 2 && r->vs == 1) r->resample = stbi__resample_row_h_2;
 7188-         else if (r->hs == 2 && r->vs == 2) r->resample = z->resample_row_hv_2_kernel;
 7189-         else                               r->resample = stbi__resample_row_generic;
 7190-      }
 7191-
 7192-      // can't error after this so, this is safe
 7193-      output = (stbi_uc *) stbi__malloc_mad3(n, z->s->img_x, z->s->img_y, 1);
 7194-      if (!output) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); }
 7195-
 7196-      // now go ahead and resample
 7197-      for (j=0; j < z->s->img_y; ++j) {
 7198-         stbi_uc *out = output + n * z->s->img_x * j;
 7199-         for (k=0; k < decode_n; ++k) {
 7200-            stbi__resample *r = &res_comp[k];
 7201-            int y_bot = r->ystep >= (r->vs >> 1);
 7202-            coutput[k] = r->resample(z->img_comp[k].linebuf,
 7203-                                     y_bot ? r->line1 : r->line0,
 7204-                                     y_bot ? r->line0 : r->line1,
 7205-                                     r->w_lores, r->hs);
 7206-            if (++r->ystep >= r->vs) {
 7207-               r->ystep = 0;
 7208-               r->line0 = r->line1;
 7209-               if (++r->ypos < z->img_comp[k].y)
 7210-                  r->line1 += z->img_comp[k].w2;
 7211-            }
 7212-         }
 7213-         if (n >= 3) {
 7214-            stbi_uc *y = coutput[0];
 7215-            if (z->s->img_n == 3) {
 7216-               if (is_rgb) {
 7217-                  for (i=0; i < z->s->img_x; ++i) {
 7218-                     out[0] = y[i];
 7219-                     out[1] = coutput[1][i];
 7220-                     out[2] = coutput[2][i];
 7221-                     out[3] = 255;
 7222-                     out += n;
 7223-                  }
 7224-               } else {
 7225-                  z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
 7226-               }
 7227-            } else if (z->s->img_n == 4) {
 7228-               if (z->app14_color_transform == 0) { // CMYK
 7229-                  for (i=0; i < z->s->img_x; ++i) {
 7230-                     stbi_uc m = coutput[3][i];
 7231-                     out[0] = stbi__blinn_8x8(coutput[0][i], m);
 7232-                     out[1] = stbi__blinn_8x8(coutput[1][i], m);
 7233-                     out[2] = stbi__blinn_8x8(coutput[2][i], m);
 7234-                     out[3] = 255;
 7235-                     out += n;
 7236-                  }
 7237-               } else if (z->app14_color_transform == 2) { // YCCK
 7238-                  z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
 7239-                  for (i=0; i < z->s->img_x; ++i) {
 7240-                     stbi_uc m = coutput[3][i];
 7241-                     out[0] = stbi__blinn_8x8(255 - out[0], m);
 7242-                     out[1] = stbi__blinn_8x8(255 - out[1], m);
 7243-                     out[2] = stbi__blinn_8x8(255 - out[2], m);
 7244-                     out += n;
 7245-                  }
 7246-               } else { // YCbCr + alpha?  Ignore the fourth channel for now
 7247-                  z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
 7248-               }
 7249-            } else
 7250-               for (i=0; i < z->s->img_x; ++i) {
 7251-                  out[0] = out[1] = out[2] = y[i];
 7252-                  out[3] = 255; // not used if n==3
 7253-                  out += n;
 7254-               }
 7255-         } else {
 7256-            if (is_rgb) {
 7257-               if (n == 1)
 7258-                  for (i=0; i < z->s->img_x; ++i)
 7259-                     *out++ = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]);
 7260-               else {
 7261-                  for (i=0; i < z->s->img_x; ++i, out += 2) {
 7262-                     out[0] = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]);
 7263-                     out[1] = 255;
 7264-                  }
 7265-               }
 7266-            } else if (z->s->img_n == 4 && z->app14_color_transform == 0) {
 7267-               for (i=0; i < z->s->img_x; ++i) {
 7268-                  stbi_uc m = coutput[3][i];
 7269-                  stbi_uc r = stbi__blinn_8x8(coutput[0][i], m);
 7270-                  stbi_uc g = stbi__blinn_8x8(coutput[1][i], m);
 7271-                  stbi_uc b = stbi__blinn_8x8(coutput[2][i], m);
 7272-                  out[0] = stbi__compute_y(r, g, b);
 7273-                  out[1] = 255;
 7274-                  out += n;
 7275-               }
 7276-            } else if (z->s->img_n == 4 && z->app14_color_transform == 2) {
 7277-               for (i=0; i < z->s->img_x; ++i) {
 7278-                  out[0] = stbi__blinn_8x8(255 - coutput[0][i], coutput[3][i]);
 7279-                  out[1] = 255;
 7280-                  out += n;
 7281-               }
 7282-            } else {
 7283-               stbi_uc *y = coutput[0];
 7284-               if (n == 1)
 7285-                  for (i=0; i < z->s->img_x; ++i) out[i] = y[i];
 7286-               else
 7287-                  for (i=0; i < z->s->img_x; ++i) { *out++ = y[i]; *out++ = 255; }
 7288-            }
 7289-         }
 7290-      }
 7291-      stbi__cleanup_jpeg(z);
 7292-      *out_x = z->s->img_x;
 7293-      *out_y = z->s->img_y;
 7294-      if (comp) *comp = z->s->img_n >= 3 ? 3 : 1; // report original components, not output
 7295-      return output;
 7296-   }
 7297-}
 7298-
 7299-static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
 7300-{
 7301-   unsigned char* result;
 7302-   stbi__jpeg* j = (stbi__jpeg*) stbi__malloc(sizeof(stbi__jpeg));
 7303-   if (!j) return stbi__errpuc("outofmem", "Out of memory");
 7304-   memset(j, 0, sizeof(stbi__jpeg));
 7305-   STBI_NOTUSED(ri);
 7306-   j->s = s;
 7307-   stbi__setup_jpeg(j);
 7308-   result = load_jpeg_image(j, x,y,comp,req_comp);
 7309-   STBI_FREE(j);
 7310-   return result;
 7311-}
 7312-
 7313-static int stbi__jpeg_test(stbi__context *s)
 7314-{
 7315-   int r;
 7316-   stbi__jpeg* j = (stbi__jpeg*)stbi__malloc(sizeof(stbi__jpeg));
 7317-   if (!j) return stbi__err("outofmem", "Out of memory");
 7318-   memset(j, 0, sizeof(stbi__jpeg));
 7319-   j->s = s;
 7320-   stbi__setup_jpeg(j);
 7321-   r = stbi__decode_jpeg_header(j, STBI__SCAN_type);
 7322-   stbi__rewind(s);
 7323-   STBI_FREE(j);
 7324-   return r;
 7325-}
 7326-
 7327-static int stbi__jpeg_info_raw(stbi__jpeg *j, int *x, int *y, int *comp)
 7328-{
 7329-   if (!stbi__decode_jpeg_header(j, STBI__SCAN_header)) {
 7330-      stbi__rewind( j->s );
 7331-      return 0;
 7332-   }
 7333-   if (x) *x = j->s->img_x;
 7334-   if (y) *y = j->s->img_y;
 7335-   if (comp) *comp = j->s->img_n >= 3 ? 3 : 1;
 7336-   return 1;
 7337-}
 7338-
 7339-static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp)
 7340-{
 7341-   int result;
 7342-   stbi__jpeg* j = (stbi__jpeg*) (stbi__malloc(sizeof(stbi__jpeg)));
 7343-   if (!j) return stbi__err("outofmem", "Out of memory");
 7344-   memset(j, 0, sizeof(stbi__jpeg));
 7345-   j->s = s;
 7346-   result = stbi__jpeg_info_raw(j, x, y, comp);
 7347-   STBI_FREE(j);
 7348-   return result;
 7349+static stbi_uc
 7350+stbi__blinn_8x8(stbi_uc x, stbi_uc y)
 7351+{
 7352+	unsigned int t = x * y + 128;
 7353+	return (stbi_uc)((t + (t >> 8)) >> 8);
 7354+}
 7355+
 7356+static stbi_uc *
 7357+load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp, int req_comp)
 7358+{
 7359+	int n, decode_n, is_rgb;
 7360+	z->s->img_n = 0; // make stbi__cleanup_jpeg safe
 7361+
 7362+	// validate req_comp
 7363+	if (req_comp < 0 || req_comp > 4) {
 7364+		return stbi__errpuc("bad req_comp", "Internal error");
 7365+	}
 7366+
 7367+	// load a jpeg image from whichever source, but leave in YCbCr format
 7368+	if (!stbi__decode_jpeg_image(z)) {
 7369+		stbi__cleanup_jpeg(z);
 7370+		return NULL;
 7371+	}
 7372+
 7373+	// determine actual number of components to generate
 7374+	n = req_comp ? req_comp : z->s->img_n >= 3 ? 3 : 1;
 7375+
 7376+	is_rgb = z->s->img_n == 3 &&
 7377+	         (z->rgb == 3 || (z->app14_color_transform == 0 && !z->jfif));
 7378+
 7379+	if (z->s->img_n == 3 && n < 3 && !is_rgb) {
 7380+		decode_n = 1;
 7381+	} else {
 7382+		decode_n = z->s->img_n;
 7383+	}
 7384+
 7385+	// nothing to do if no components requested; check this now to avoid
 7386+	// accessing uninitialized coutput[0] later
 7387+	if (decode_n <= 0) {
 7388+		stbi__cleanup_jpeg(z);
 7389+		return NULL;
 7390+	}
 7391+
 7392+	// resample and color-convert
 7393+	{
 7394+		int k;
 7395+		unsigned int i, j;
 7396+		stbi_uc *output;
 7397+		stbi_uc *coutput[4] = {NULL, NULL, NULL, NULL};
 7398+
 7399+		stbi__resample res_comp[4];
 7400+
 7401+		for (k = 0; k < decode_n; ++k) {
 7402+			stbi__resample *r = &res_comp[k];
 7403+
 7404+			// allocate line buffer big enough for upsampling off the edges
 7405+			// with upsample factor of 4
 7406+			z->img_comp[k].linebuf = (stbi_uc *)stbi__malloc(z->s->img_x + 3);
 7407+			if (!z->img_comp[k].linebuf) {
 7408+				stbi__cleanup_jpeg(z);
 7409+				return stbi__errpuc("outofmem", "Out of memory");
 7410+			}
 7411+
 7412+			r->hs = z->img_h_max / z->img_comp[k].h;
 7413+			r->vs = z->img_v_max / z->img_comp[k].v;
 7414+			r->ystep = r->vs >> 1;
 7415+			r->w_lores = (z->s->img_x + r->hs - 1) / r->hs;
 7416+			r->ypos = 0;
 7417+			r->line0 = r->line1 = z->img_comp[k].data;
 7418+
 7419+			if (r->hs == 1 && r->vs == 1) {
 7420+				r->resample = resample_row_1;
 7421+			} else if (r->hs == 1 && r->vs == 2) {
 7422+				r->resample = stbi__resample_row_v_2;
 7423+			} else if (r->hs == 2 && r->vs == 1) {
 7424+				r->resample = stbi__resample_row_h_2;
 7425+			} else if (r->hs == 2 && r->vs == 2) {
 7426+				r->resample = z->resample_row_hv_2_kernel;
 7427+			} else {
 7428+				r->resample = stbi__resample_row_generic;
 7429+			}
 7430+		}
 7431+
 7432+		// can't error after this so, this is safe
 7433+		output = (stbi_uc *)stbi__malloc_mad3(n, z->s->img_x, z->s->img_y, 1);
 7434+		if (!output) {
 7435+			stbi__cleanup_jpeg(z);
 7436+			return stbi__errpuc("outofmem", "Out of memory");
 7437+		}
 7438+
 7439+		// now go ahead and resample
 7440+		for (j = 0; j < z->s->img_y; ++j) {
 7441+			stbi_uc *out = output + n * z->s->img_x * j;
 7442+			for (k = 0; k < decode_n; ++k) {
 7443+				stbi__resample *r = &res_comp[k];
 7444+				int y_bot = r->ystep >= (r->vs >> 1);
 7445+				coutput[k] = r->resample(
 7446+				    z->img_comp[k].linebuf, y_bot ? r->line1 : r->line0,
 7447+				    y_bot ? r->line0 : r->line1, r->w_lores, r->hs);
 7448+				if (++r->ystep >= r->vs) {
 7449+					r->ystep = 0;
 7450+					r->line0 = r->line1;
 7451+					if (++r->ypos < z->img_comp[k].y) {
 7452+						r->line1 += z->img_comp[k].w2;
 7453+					}
 7454+				}
 7455+			}
 7456+			if (n >= 3) {
 7457+				stbi_uc *y = coutput[0];
 7458+				if (z->s->img_n == 3) {
 7459+					if (is_rgb) {
 7460+						for (i = 0; i < z->s->img_x; ++i) {
 7461+							out[0] = y[i];
 7462+							out[1] = coutput[1][i];
 7463+							out[2] = coutput[2][i];
 7464+							out[3] = 255;
 7465+							out += n;
 7466+						}
 7467+					} else {
 7468+						z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2],
 7469+						                       z->s->img_x, n);
 7470+					}
 7471+				} else if (z->s->img_n == 4) {
 7472+					if (z->app14_color_transform == 0) { // CMYK
 7473+						for (i = 0; i < z->s->img_x; ++i) {
 7474+							stbi_uc m = coutput[3][i];
 7475+							out[0] = stbi__blinn_8x8(coutput[0][i], m);
 7476+							out[1] = stbi__blinn_8x8(coutput[1][i], m);
 7477+							out[2] = stbi__blinn_8x8(coutput[2][i], m);
 7478+							out[3] = 255;
 7479+							out += n;
 7480+						}
 7481+					} else if (z->app14_color_transform == 2) { // YCCK
 7482+						z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2],
 7483+						                       z->s->img_x, n);
 7484+						for (i = 0; i < z->s->img_x; ++i) {
 7485+							stbi_uc m = coutput[3][i];
 7486+							out[0] = stbi__blinn_8x8(255 - out[0], m);
 7487+							out[1] = stbi__blinn_8x8(255 - out[1], m);
 7488+							out[2] = stbi__blinn_8x8(255 - out[2], m);
 7489+							out += n;
 7490+						}
 7491+					} else { // YCbCr + alpha?  Ignore the fourth channel for
 7492+						     // now
 7493+						z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2],
 7494+						                       z->s->img_x, n);
 7495+					}
 7496+				} else {
 7497+					for (i = 0; i < z->s->img_x; ++i) {
 7498+						out[0] = out[1] = out[2] = y[i];
 7499+						out[3] = 255; // not used if n==3
 7500+						out += n;
 7501+					}
 7502+				}
 7503+			} else {
 7504+				if (is_rgb) {
 7505+					if (n == 1) {
 7506+						for (i = 0; i < z->s->img_x; ++i) {
 7507+							*out++ = stbi__compute_y(
 7508+							    coutput[0][i], coutput[1][i], coutput[2][i]);
 7509+						}
 7510+					} else {
 7511+						for (i = 0; i < z->s->img_x; ++i, out += 2) {
 7512+							out[0] = stbi__compute_y(
 7513+							    coutput[0][i], coutput[1][i], coutput[2][i]);
 7514+							out[1] = 255;
 7515+						}
 7516+					}
 7517+				} else if (z->s->img_n == 4 && z->app14_color_transform == 0) {
 7518+					for (i = 0; i < z->s->img_x; ++i) {
 7519+						stbi_uc m = coutput[3][i];
 7520+						stbi_uc r = stbi__blinn_8x8(coutput[0][i], m);
 7521+						stbi_uc g = stbi__blinn_8x8(coutput[1][i], m);
 7522+						stbi_uc b = stbi__blinn_8x8(coutput[2][i], m);
 7523+						out[0] = stbi__compute_y(r, g, b);
 7524+						out[1] = 255;
 7525+						out += n;
 7526+					}
 7527+				} else if (z->s->img_n == 4 && z->app14_color_transform == 2) {
 7528+					for (i = 0; i < z->s->img_x; ++i) {
 7529+						out[0] =
 7530+						    stbi__blinn_8x8(255 - coutput[0][i], coutput[3][i]);
 7531+						out[1] = 255;
 7532+						out += n;
 7533+					}
 7534+				} else {
 7535+					stbi_uc *y = coutput[0];
 7536+					if (n == 1) {
 7537+						for (i = 0; i < z->s->img_x; ++i) {
 7538+							out[i] = y[i];
 7539+						}
 7540+					} else {
 7541+						for (i = 0; i < z->s->img_x; ++i) {
 7542+							*out++ = y[i];
 7543+							*out++ = 255;
 7544+						}
 7545+					}
 7546+				}
 7547+			}
 7548+		}
 7549+		stbi__cleanup_jpeg(z);
 7550+		*out_x = z->s->img_x;
 7551+		*out_y = z->s->img_y;
 7552+		if (comp) {
 7553+			*comp = z->s->img_n >= 3
 7554+			            ? 3
 7555+			            : 1; // report original components, not output
 7556+		}
 7557+		return output;
 7558+	}
 7559+}
 7560+
 7561+static void *
 7562+stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp,
 7563+                stbi__result_info *ri)
 7564+{
 7565+	unsigned char *result;
 7566+	stbi__jpeg *j = (stbi__jpeg *)stbi__malloc(sizeof(stbi__jpeg));
 7567+	if (!j) {
 7568+		return stbi__errpuc("outofmem", "Out of memory");
 7569+	}
 7570+	memset(j, 0, sizeof(stbi__jpeg));
 7571+	STBI_NOTUSED(ri);
 7572+	j->s = s;
 7573+	stbi__setup_jpeg(j);
 7574+	result = load_jpeg_image(j, x, y, comp, req_comp);
 7575+	STBI_FREE(j);
 7576+	return result;
 7577+}
 7578+
 7579+static int
 7580+stbi__jpeg_test(stbi__context *s)
 7581+{
 7582+	int r;
 7583+	stbi__jpeg *j = (stbi__jpeg *)stbi__malloc(sizeof(stbi__jpeg));
 7584+	if (!j) {
 7585+		return stbi__err("outofmem", "Out of memory");
 7586+	}
 7587+	memset(j, 0, sizeof(stbi__jpeg));
 7588+	j->s = s;
 7589+	stbi__setup_jpeg(j);
 7590+	r = stbi__decode_jpeg_header(j, STBI__SCAN_type);
 7591+	stbi__rewind(s);
 7592+	STBI_FREE(j);
 7593+	return r;
 7594+}
 7595+
 7596+static int
 7597+stbi__jpeg_info_raw(stbi__jpeg *j, int *x, int *y, int *comp)
 7598+{
 7599+	if (!stbi__decode_jpeg_header(j, STBI__SCAN_header)) {
 7600+		stbi__rewind(j->s);
 7601+		return 0;
 7602+	}
 7603+	if (x) {
 7604+		*x = j->s->img_x;
 7605+	}
 7606+	if (y) {
 7607+		*y = j->s->img_y;
 7608+	}
 7609+	if (comp) {
 7610+		*comp = j->s->img_n >= 3 ? 3 : 1;
 7611+	}
 7612+	return 1;
 7613+}
 7614+
 7615+static int
 7616+stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp)
 7617+{
 7618+	int result;
 7619+	stbi__jpeg *j = (stbi__jpeg *)(stbi__malloc(sizeof(stbi__jpeg)));
 7620+	if (!j) {
 7621+		return stbi__err("outofmem", "Out of memory");
 7622+	}
 7623+	memset(j, 0, sizeof(stbi__jpeg));
 7624+	j->s = s;
 7625+	result = stbi__jpeg_info_raw(j, x, y, comp);
 7626+	STBI_FREE(j);
 7627+	return result;
 7628 }
 7629 #endif
 7630 
 7631@@ -4088,84 +5057,92 @@ static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp)
 7632 #ifndef STBI_NO_ZLIB
 7633 
 7634 // fast-way is faster to check than jpeg huffman, but slow way is slower
 7635-#define STBI__ZFAST_BITS  9 // accelerate all cases in default tables
 7636-#define STBI__ZFAST_MASK  ((1 << STBI__ZFAST_BITS) - 1)
 7637+#define STBI__ZFAST_BITS 9 // accelerate all cases in default tables
 7638+#define STBI__ZFAST_MASK ((1 << STBI__ZFAST_BITS) - 1)
 7639 #define STBI__ZNSYMS 288 // number of symbols in literal/length alphabet
 7640 
 7641 // zlib-style huffman encoding
 7642 // (jpegs packs from left, zlib from right, so can't share code)
 7643-typedef struct
 7644-{
 7645-   stbi__uint16 fast[1 << STBI__ZFAST_BITS];
 7646-   stbi__uint16 firstcode[16];
 7647-   int maxcode[17];
 7648-   stbi__uint16 firstsymbol[16];
 7649-   stbi_uc  size[STBI__ZNSYMS];
 7650-   stbi__uint16 value[STBI__ZNSYMS];
 7651+typedef struct {
 7652+	stbi__uint16 fast[1 << STBI__ZFAST_BITS];
 7653+	stbi__uint16 firstcode[16];
 7654+	int maxcode[17];
 7655+	stbi__uint16 firstsymbol[16];
 7656+	stbi_uc size[STBI__ZNSYMS];
 7657+	stbi__uint16 value[STBI__ZNSYMS];
 7658 } stbi__zhuffman;
 7659 
 7660-stbi_inline static int stbi__bitreverse16(int n)
 7661-{
 7662-  n = ((n & 0xAAAA) >>  1) | ((n & 0x5555) << 1);
 7663-  n = ((n & 0xCCCC) >>  2) | ((n & 0x3333) << 2);
 7664-  n = ((n & 0xF0F0) >>  4) | ((n & 0x0F0F) << 4);
 7665-  n = ((n & 0xFF00) >>  8) | ((n & 0x00FF) << 8);
 7666-  return n;
 7667-}
 7668-
 7669-stbi_inline static int stbi__bit_reverse(int v, int bits)
 7670-{
 7671-   STBI_ASSERT(bits <= 16);
 7672-   // to bit reverse n bits, reverse 16 and shift
 7673-   // e.g. 11 bits, bit reverse and shift away 5
 7674-   return stbi__bitreverse16(v) >> (16-bits);
 7675-}
 7676-
 7677-static int stbi__zbuild_huffman(stbi__zhuffman *z, const stbi_uc *sizelist, int num)
 7678-{
 7679-   int i,k=0;
 7680-   int code, next_code[16], sizes[17];
 7681-
 7682-   // DEFLATE spec for generating codes
 7683-   memset(sizes, 0, sizeof(sizes));
 7684-   memset(z->fast, 0, sizeof(z->fast));
 7685-   for (i=0; i < num; ++i)
 7686-      ++sizes[sizelist[i]];
 7687-   sizes[0] = 0;
 7688-   for (i=1; i < 16; ++i)
 7689-      if (sizes[i] > (1 << i))
 7690-         return stbi__err("bad sizes", "Corrupt PNG");
 7691-   code = 0;
 7692-   for (i=1; i < 16; ++i) {
 7693-      next_code[i] = code;
 7694-      z->firstcode[i] = (stbi__uint16) code;
 7695-      z->firstsymbol[i] = (stbi__uint16) k;
 7696-      code = (code + sizes[i]);
 7697-      if (sizes[i])
 7698-         if (code-1 >= (1 << i)) return stbi__err("bad codelengths","Corrupt PNG");
 7699-      z->maxcode[i] = code << (16-i); // preshift for inner loop
 7700-      code <<= 1;
 7701-      k += sizes[i];
 7702-   }
 7703-   z->maxcode[16] = 0x10000; // sentinel
 7704-   for (i=0; i < num; ++i) {
 7705-      int s = sizelist[i];
 7706-      if (s) {
 7707-         int c = next_code[s] - z->firstcode[s] + z->firstsymbol[s];
 7708-         stbi__uint16 fastv = (stbi__uint16) ((s << 9) | i);
 7709-         z->size [c] = (stbi_uc     ) s;
 7710-         z->value[c] = (stbi__uint16) i;
 7711-         if (s <= STBI__ZFAST_BITS) {
 7712-            int j = stbi__bit_reverse(next_code[s],s);
 7713-            while (j < (1 << STBI__ZFAST_BITS)) {
 7714-               z->fast[j] = fastv;
 7715-               j += (1 << s);
 7716-            }
 7717-         }
 7718-         ++next_code[s];
 7719-      }
 7720-   }
 7721-   return 1;
 7722+stbi_inline static int
 7723+stbi__bitreverse16(int n)
 7724+{
 7725+	n = ((n & 0xAAAA) >> 1) | ((n & 0x5555) << 1);
 7726+	n = ((n & 0xCCCC) >> 2) | ((n & 0x3333) << 2);
 7727+	n = ((n & 0xF0F0) >> 4) | ((n & 0x0F0F) << 4);
 7728+	n = ((n & 0xFF00) >> 8) | ((n & 0x00FF) << 8);
 7729+	return n;
 7730+}
 7731+
 7732+stbi_inline static int
 7733+stbi__bit_reverse(int v, int bits)
 7734+{
 7735+	STBI_ASSERT(bits <= 16);
 7736+	// to bit reverse n bits, reverse 16 and shift
 7737+	// e.g. 11 bits, bit reverse and shift away 5
 7738+	return stbi__bitreverse16(v) >> (16 - bits);
 7739+}
 7740+
 7741+static int
 7742+stbi__zbuild_huffman(stbi__zhuffman *z, const stbi_uc *sizelist, int num)
 7743+{
 7744+	int i, k = 0;
 7745+	int code, next_code[16], sizes[17];
 7746+
 7747+	// DEFLATE spec for generating codes
 7748+	memset(sizes, 0, sizeof(sizes));
 7749+	memset(z->fast, 0, sizeof(z->fast));
 7750+	for (i = 0; i < num; ++i) {
 7751+		++sizes[sizelist[i]];
 7752+	}
 7753+	sizes[0] = 0;
 7754+	for (i = 1; i < 16; ++i) {
 7755+		if (sizes[i] > (1 << i)) {
 7756+			return stbi__err("bad sizes", "Corrupt PNG");
 7757+		}
 7758+	}
 7759+	code = 0;
 7760+	for (i = 1; i < 16; ++i) {
 7761+		next_code[i] = code;
 7762+		z->firstcode[i] = (stbi__uint16)code;
 7763+		z->firstsymbol[i] = (stbi__uint16)k;
 7764+		code = (code + sizes[i]);
 7765+		if (sizes[i]) {
 7766+			if (code - 1 >= (1 << i)) {
 7767+				return stbi__err("bad codelengths", "Corrupt PNG");
 7768+			}
 7769+		}
 7770+		z->maxcode[i] = code << (16 - i); // preshift for inner loop
 7771+		code <<= 1;
 7772+		k += sizes[i];
 7773+	}
 7774+	z->maxcode[16] = 0x10000; // sentinel
 7775+	for (i = 0; i < num; ++i) {
 7776+		int s = sizelist[i];
 7777+		if (s) {
 7778+			int c = next_code[s] - z->firstcode[s] + z->firstsymbol[s];
 7779+			stbi__uint16 fastv = (stbi__uint16)((s << 9) | i);
 7780+			z->size[c] = (stbi_uc)s;
 7781+			z->value[c] = (stbi__uint16)i;
 7782+			if (s <= STBI__ZFAST_BITS) {
 7783+				int j = stbi__bit_reverse(next_code[s], s);
 7784+				while (j < (1 << STBI__ZFAST_BITS)) {
 7785+					z->fast[j] = fastv;
 7786+					j += (1 << s);
 7787+				}
 7788+			}
 7789+			++next_code[s];
 7790+		}
 7791+	}
 7792+	return 1;
 7793 }
 7794 
 7795 // zlib-from-memory implementation for PNG reading
 7796@@ -4174,297 +5151,397 @@ static int stbi__zbuild_huffman(stbi__zhuffman *z, const stbi_uc *sizelist, int
 7797 //    we require PNG read all the IDATs and combine them into a single
 7798 //    memory buffer
 7799 
 7800-typedef struct
 7801-{
 7802-   stbi_uc *zbuffer, *zbuffer_end;
 7803-   int num_bits;
 7804-   int hit_zeof_once;
 7805-   stbi__uint32 code_buffer;
 7806+typedef struct {
 7807+	stbi_uc *zbuffer, *zbuffer_end;
 7808+	int num_bits;
 7809+	int hit_zeof_once;
 7810+	stbi__uint32 code_buffer;
 7811 
 7812-   char *zout;
 7813-   char *zout_start;
 7814-   char *zout_end;
 7815-   int   z_expandable;
 7816+	char *zout;
 7817+	char *zout_start;
 7818+	char *zout_end;
 7819+	int z_expandable;
 7820 
 7821-   stbi__zhuffman z_length, z_distance;
 7822+	stbi__zhuffman z_length, z_distance;
 7823 } stbi__zbuf;
 7824 
 7825-stbi_inline static int stbi__zeof(stbi__zbuf *z)
 7826-{
 7827-   return (z->zbuffer >= z->zbuffer_end);
 7828-}
 7829-
 7830-stbi_inline static stbi_uc stbi__zget8(stbi__zbuf *z)
 7831-{
 7832-   return stbi__zeof(z) ? 0 : *z->zbuffer++;
 7833-}
 7834-
 7835-static void stbi__fill_bits(stbi__zbuf *z)
 7836-{
 7837-   do {
 7838-      if (z->code_buffer >= (1U << z->num_bits)) {
 7839-        z->zbuffer = z->zbuffer_end;  /* treat this as EOF so we fail. */
 7840-        return;
 7841-      }
 7842-      z->code_buffer |= (unsigned int) stbi__zget8(z) << z->num_bits;
 7843-      z->num_bits += 8;
 7844-   } while (z->num_bits <= 24);
 7845-}
 7846-
 7847-stbi_inline static unsigned int stbi__zreceive(stbi__zbuf *z, int n)
 7848-{
 7849-   unsigned int k;
 7850-   if (z->num_bits < n) stbi__fill_bits(z);
 7851-   k = z->code_buffer & ((1 << n) - 1);
 7852-   z->code_buffer >>= n;
 7853-   z->num_bits -= n;
 7854-   return k;
 7855-}
 7856-
 7857-static int stbi__zhuffman_decode_slowpath(stbi__zbuf *a, stbi__zhuffman *z)
 7858-{
 7859-   int b,s,k;
 7860-   // not resolved by fast table, so compute it the slow way
 7861-   // use jpeg approach, which requires MSbits at top
 7862-   k = stbi__bit_reverse(a->code_buffer, 16);
 7863-   for (s=STBI__ZFAST_BITS+1; ; ++s)
 7864-      if (k < z->maxcode[s])
 7865-         break;
 7866-   if (s >= 16) return -1; // invalid code!
 7867-   // code size is s, so:
 7868-   b = (k >> (16-s)) - z->firstcode[s] + z->firstsymbol[s];
 7869-   if (b >= STBI__ZNSYMS) return -1; // some data was corrupt somewhere!
 7870-   if (z->size[b] != s) return -1;  // was originally an assert, but report failure instead.
 7871-   a->code_buffer >>= s;
 7872-   a->num_bits -= s;
 7873-   return z->value[b];
 7874-}
 7875-
 7876-stbi_inline static int stbi__zhuffman_decode(stbi__zbuf *a, stbi__zhuffman *z)
 7877-{
 7878-   int b,s;
 7879-   if (a->num_bits < 16) {
 7880-      if (stbi__zeof(a)) {
 7881-         if (!a->hit_zeof_once) {
 7882-            // This is the first time we hit eof, insert 16 extra padding btis
 7883-            // to allow us to keep going; if we actually consume any of them
 7884-            // though, that is invalid data. This is caught later.
 7885-            a->hit_zeof_once = 1;
 7886-            a->num_bits += 16; // add 16 implicit zero bits
 7887-         } else {
 7888-            // We already inserted our extra 16 padding bits and are again
 7889-            // out, this stream is actually prematurely terminated.
 7890-            return -1;
 7891-         }
 7892-      } else {
 7893-         stbi__fill_bits(a);
 7894-      }
 7895-   }
 7896-   b = z->fast[a->code_buffer & STBI__ZFAST_MASK];
 7897-   if (b) {
 7898-      s = b >> 9;
 7899-      a->code_buffer >>= s;
 7900-      a->num_bits -= s;
 7901-      return b & 511;
 7902-   }
 7903-   return stbi__zhuffman_decode_slowpath(a, z);
 7904-}
 7905-
 7906-static int stbi__zexpand(stbi__zbuf *z, char *zout, int n)  // need to make room for n bytes
 7907-{
 7908-   char *q;
 7909-   unsigned int cur, limit, old_limit;
 7910-   z->zout = zout;
 7911-   if (!z->z_expandable) return stbi__err("output buffer limit","Corrupt PNG");
 7912-   cur   = (unsigned int) (z->zout - z->zout_start);
 7913-   limit = old_limit = (unsigned) (z->zout_end - z->zout_start);
 7914-   if (UINT_MAX - cur < (unsigned) n) return stbi__err("outofmem", "Out of memory");
 7915-   while (cur + n > limit) {
 7916-      if(limit > UINT_MAX / 2) return stbi__err("outofmem", "Out of memory");
 7917-      limit *= 2;
 7918-   }
 7919-   q = (char *) STBI_REALLOC_SIZED(z->zout_start, old_limit, limit);
 7920-   STBI_NOTUSED(old_limit);
 7921-   if (q == NULL) return stbi__err("outofmem", "Out of memory");
 7922-   z->zout_start = q;
 7923-   z->zout       = q + cur;
 7924-   z->zout_end   = q + limit;
 7925-   return 1;
 7926+stbi_inline static int
 7927+stbi__zeof(stbi__zbuf *z)
 7928+{
 7929+	return (z->zbuffer >= z->zbuffer_end);
 7930+}
 7931+
 7932+stbi_inline static stbi_uc
 7933+stbi__zget8(stbi__zbuf *z)
 7934+{
 7935+	return stbi__zeof(z) ? 0 : *z->zbuffer++;
 7936+}
 7937+
 7938+static void
 7939+stbi__fill_bits(stbi__zbuf *z)
 7940+{
 7941+	do {
 7942+		if (z->code_buffer >= (1U << z->num_bits)) {
 7943+			z->zbuffer = z->zbuffer_end; /* treat this as EOF so we fail. */
 7944+			return;
 7945+		}
 7946+		z->code_buffer |= (unsigned int)stbi__zget8(z) << z->num_bits;
 7947+		z->num_bits += 8;
 7948+	} while (z->num_bits <= 24);
 7949+}
 7950+
 7951+stbi_inline static unsigned int
 7952+stbi__zreceive(stbi__zbuf *z, int n)
 7953+{
 7954+	unsigned int k;
 7955+	if (z->num_bits < n) {
 7956+		stbi__fill_bits(z);
 7957+	}
 7958+	k = z->code_buffer & ((1 << n) - 1);
 7959+	z->code_buffer >>= n;
 7960+	z->num_bits -= n;
 7961+	return k;
 7962+}
 7963+
 7964+static int
 7965+stbi__zhuffman_decode_slowpath(stbi__zbuf *a, stbi__zhuffman *z)
 7966+{
 7967+	int b, s, k;
 7968+	// not resolved by fast table, so compute it the slow way
 7969+	// use jpeg approach, which requires MSbits at top
 7970+	k = stbi__bit_reverse(a->code_buffer, 16);
 7971+	for (s = STBI__ZFAST_BITS + 1;; ++s) {
 7972+		if (k < z->maxcode[s]) {
 7973+			break;
 7974+		}
 7975+	}
 7976+	if (s >= 16) {
 7977+		return -1; // invalid code!
 7978+	}
 7979+	// code size is s, so:
 7980+	b = (k >> (16 - s)) - z->firstcode[s] + z->firstsymbol[s];
 7981+	if (b >= STBI__ZNSYMS) {
 7982+		return -1; // some data was corrupt somewhere!
 7983+	}
 7984+	if (z->size[b] != s) {
 7985+		return -1; // was originally an assert, but report failure instead.
 7986+	}
 7987+	a->code_buffer >>= s;
 7988+	a->num_bits -= s;
 7989+	return z->value[b];
 7990+}
 7991+
 7992+stbi_inline static int
 7993+stbi__zhuffman_decode(stbi__zbuf *a, stbi__zhuffman *z)
 7994+{
 7995+	int b, s;
 7996+	if (a->num_bits < 16) {
 7997+		if (stbi__zeof(a)) {
 7998+			if (!a->hit_zeof_once) {
 7999+				// This is the first time we hit eof, insert 16 extra padding
 8000+				// btis to allow us to keep going; if we actually consume any of
 8001+				// them though, that is invalid data. This is caught later.
 8002+				a->hit_zeof_once = 1;
 8003+				a->num_bits += 16; // add 16 implicit zero bits
 8004+			} else {
 8005+				// We already inserted our extra 16 padding bits and are again
 8006+				// out, this stream is actually prematurely terminated.
 8007+				return -1;
 8008+			}
 8009+		} else {
 8010+			stbi__fill_bits(a);
 8011+		}
 8012+	}
 8013+	b = z->fast[a->code_buffer & STBI__ZFAST_MASK];
 8014+	if (b) {
 8015+		s = b >> 9;
 8016+		a->code_buffer >>= s;
 8017+		a->num_bits -= s;
 8018+		return b & 511;
 8019+	}
 8020+	return stbi__zhuffman_decode_slowpath(a, z);
 8021+}
 8022+
 8023+static int
 8024+stbi__zexpand(stbi__zbuf *z, char *zout, int n) // need to make room for n bytes
 8025+{
 8026+	char *q;
 8027+	unsigned int cur, limit, old_limit;
 8028+	z->zout = zout;
 8029+	if (!z->z_expandable) {
 8030+		return stbi__err("output buffer limit", "Corrupt PNG");
 8031+	}
 8032+	cur = (unsigned int)(z->zout - z->zout_start);
 8033+	limit = old_limit = (unsigned)(z->zout_end - z->zout_start);
 8034+	if (UINT_MAX - cur < (unsigned)n) {
 8035+		return stbi__err("outofmem", "Out of memory");
 8036+	}
 8037+	while (cur + n > limit) {
 8038+		if (limit > UINT_MAX / 2) {
 8039+			return stbi__err("outofmem", "Out of memory");
 8040+		}
 8041+		limit *= 2;
 8042+	}
 8043+	q = (char *)STBI_REALLOC_SIZED(z->zout_start, old_limit, limit);
 8044+	STBI_NOTUSED(old_limit);
 8045+	if (q == NULL) {
 8046+		return stbi__err("outofmem", "Out of memory");
 8047+	}
 8048+	z->zout_start = q;
 8049+	z->zout = q + cur;
 8050+	z->zout_end = q + limit;
 8051+	return 1;
 8052 }
 8053 
 8054 static const int stbi__zlength_base[31] = {
 8055-   3,4,5,6,7,8,9,10,11,13,
 8056-   15,17,19,23,27,31,35,43,51,59,
 8057-   67,83,99,115,131,163,195,227,258,0,0 };
 8058-
 8059-static const int stbi__zlength_extra[31]=
 8060-{ 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0,0,0 };
 8061-
 8062-static const int stbi__zdist_base[32] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,
 8063-257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577,0,0};
 8064-
 8065-static const int stbi__zdist_extra[32] =
 8066-{ 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13};
 8067-
 8068-static int stbi__parse_huffman_block(stbi__zbuf *a)
 8069-{
 8070-   char *zout = a->zout;
 8071-   for(;;) {
 8072-      int z = stbi__zhuffman_decode(a, &a->z_length);
 8073-      if (z < 256) {
 8074-         if (z < 0) return stbi__err("bad huffman code","Corrupt PNG"); // error in huffman codes
 8075-         if (zout >= a->zout_end) {
 8076-            if (!stbi__zexpand(a, zout, 1)) return 0;
 8077-            zout = a->zout;
 8078-         }
 8079-         *zout++ = (char) z;
 8080-      } else {
 8081-         stbi_uc *p;
 8082-         int len,dist;
 8083-         if (z == 256) {
 8084-            a->zout = zout;
 8085-            if (a->hit_zeof_once && a->num_bits < 16) {
 8086-               // The first time we hit zeof, we inserted 16 extra zero bits into our bit
 8087-               // buffer so the decoder can just do its speculative decoding. But if we
 8088-               // actually consumed any of those bits (which is the case when num_bits < 16),
 8089-               // the stream actually read past the end so it is malformed.
 8090-               return stbi__err("unexpected end","Corrupt PNG");
 8091-            }
 8092-            return 1;
 8093-         }
 8094-         if (z >= 286) return stbi__err("bad huffman code","Corrupt PNG"); // per DEFLATE, length codes 286 and 287 must not appear in compressed data
 8095-         z -= 257;
 8096-         len = stbi__zlength_base[z];
 8097-         if (stbi__zlength_extra[z]) len += stbi__zreceive(a, stbi__zlength_extra[z]);
 8098-         z = stbi__zhuffman_decode(a, &a->z_distance);
 8099-         if (z < 0 || z >= 30) return stbi__err("bad huffman code","Corrupt PNG"); // per DEFLATE, distance codes 30 and 31 must not appear in compressed data
 8100-         dist = stbi__zdist_base[z];
 8101-         if (stbi__zdist_extra[z]) dist += stbi__zreceive(a, stbi__zdist_extra[z]);
 8102-         if (zout - a->zout_start < dist) return stbi__err("bad dist","Corrupt PNG");
 8103-         if (len > a->zout_end - zout) {
 8104-            if (!stbi__zexpand(a, zout, len)) return 0;
 8105-            zout = a->zout;
 8106-         }
 8107-         p = (stbi_uc *) (zout - dist);
 8108-         if (dist == 1) { // run of one byte; common in images.
 8109-            stbi_uc v = *p;
 8110-            if (len) { do *zout++ = v; while (--len); }
 8111-         } else {
 8112-            if (len) { do *zout++ = *p++; while (--len); }
 8113-         }
 8114-      }
 8115-   }
 8116-}
 8117-
 8118-static int stbi__compute_huffman_codes(stbi__zbuf *a)
 8119-{
 8120-   static const stbi_uc length_dezigzag[19] = { 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 };
 8121-   stbi__zhuffman z_codelength;
 8122-   stbi_uc lencodes[286+32+137];//padding for maximum single op
 8123-   stbi_uc codelength_sizes[19];
 8124-   int i,n;
 8125-
 8126-   int hlit  = stbi__zreceive(a,5) + 257;
 8127-   int hdist = stbi__zreceive(a,5) + 1;
 8128-   int hclen = stbi__zreceive(a,4) + 4;
 8129-   int ntot  = hlit + hdist;
 8130-
 8131-   memset(codelength_sizes, 0, sizeof(codelength_sizes));
 8132-   for (i=0; i < hclen; ++i) {
 8133-      int s = stbi__zreceive(a,3);
 8134-      codelength_sizes[length_dezigzag[i]] = (stbi_uc) s;
 8135-   }
 8136-   if (!stbi__zbuild_huffman(&z_codelength, codelength_sizes, 19)) return 0;
 8137-
 8138-   n = 0;
 8139-   while (n < ntot) {
 8140-      int c = stbi__zhuffman_decode(a, &z_codelength);
 8141-      if (c < 0 || c >= 19) return stbi__err("bad codelengths", "Corrupt PNG");
 8142-      if (c < 16)
 8143-         lencodes[n++] = (stbi_uc) c;
 8144-      else {
 8145-         stbi_uc fill = 0;
 8146-         if (c == 16) {
 8147-            c = stbi__zreceive(a,2)+3;
 8148-            if (n == 0) return stbi__err("bad codelengths", "Corrupt PNG");
 8149-            fill = lencodes[n-1];
 8150-         } else if (c == 17) {
 8151-            c = stbi__zreceive(a,3)+3;
 8152-         } else if (c == 18) {
 8153-            c = stbi__zreceive(a,7)+11;
 8154-         } else {
 8155-            return stbi__err("bad codelengths", "Corrupt PNG");
 8156-         }
 8157-         if (ntot - n < c) return stbi__err("bad codelengths", "Corrupt PNG");
 8158-         memset(lencodes+n, fill, c);
 8159-         n += c;
 8160-      }
 8161-   }
 8162-   if (n != ntot) return stbi__err("bad codelengths","Corrupt PNG");
 8163-   if (!stbi__zbuild_huffman(&a->z_length, lencodes, hlit)) return 0;
 8164-   if (!stbi__zbuild_huffman(&a->z_distance, lencodes+hlit, hdist)) return 0;
 8165-   return 1;
 8166-}
 8167-
 8168-static int stbi__parse_uncompressed_block(stbi__zbuf *a)
 8169-{
 8170-   stbi_uc header[4];
 8171-   int len,nlen,k;
 8172-   if (a->num_bits & 7)
 8173-      stbi__zreceive(a, a->num_bits & 7); // discard
 8174-   // drain the bit-packed data into header
 8175-   k = 0;
 8176-   while (a->num_bits > 0) {
 8177-      header[k++] = (stbi_uc) (a->code_buffer & 255); // suppress MSVC run-time check
 8178-      a->code_buffer >>= 8;
 8179-      a->num_bits -= 8;
 8180-   }
 8181-   if (a->num_bits < 0) return stbi__err("zlib corrupt","Corrupt PNG");
 8182-   // now fill header the normal way
 8183-   while (k < 4)
 8184-      header[k++] = stbi__zget8(a);
 8185-   len  = header[1] * 256 + header[0];
 8186-   nlen = header[3] * 256 + header[2];
 8187-   if (nlen != (len ^ 0xffff)) return stbi__err("zlib corrupt","Corrupt PNG");
 8188-   if (a->zbuffer + len > a->zbuffer_end) return stbi__err("read past buffer","Corrupt PNG");
 8189-   if (a->zout + len > a->zout_end)
 8190-      if (!stbi__zexpand(a, a->zout, len)) return 0;
 8191-   memcpy(a->zout, a->zbuffer, len);
 8192-   a->zbuffer += len;
 8193-   a->zout += len;
 8194-   return 1;
 8195-}
 8196-
 8197-static int stbi__parse_zlib_header(stbi__zbuf *a)
 8198-{
 8199-   int cmf   = stbi__zget8(a);
 8200-   int cm    = cmf & 15;
 8201-   /* int cinfo = cmf >> 4; */
 8202-   int flg   = stbi__zget8(a);
 8203-   if (stbi__zeof(a)) return stbi__err("bad zlib header","Corrupt PNG"); // zlib spec
 8204-   if ((cmf*256+flg) % 31 != 0) return stbi__err("bad zlib header","Corrupt PNG"); // zlib spec
 8205-   if (flg & 32) return stbi__err("no preset dict","Corrupt PNG"); // preset dictionary not allowed in png
 8206-   if (cm != 8) return stbi__err("bad compression","Corrupt PNG"); // DEFLATE required for png
 8207-   // window = 1 << (8 + cinfo)... but who cares, we fully buffer output
 8208-   return 1;
 8209-}
 8210-
 8211-static const stbi_uc stbi__zdefault_length[STBI__ZNSYMS] =
 8212-{
 8213-   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
 8214-   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
 8215-   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
 8216-   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
 8217-   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
 8218-   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
 8219-   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
 8220-   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
 8221-   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8
 8222-};
 8223-static const stbi_uc stbi__zdefault_distance[32] =
 8224-{
 8225-   5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
 8226-};
 8227+    3,  4,  5,  6,  7,  8,  9,  10,  11,  13,  15,  17,  19,  23, 27, 31,
 8228+    35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0,  0};
 8229+
 8230+static const int stbi__zlength_extra[31] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
 8231+                                            1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4,
 8232+                                            4, 4, 5, 5, 5, 5, 0, 0, 0};
 8233+
 8234+static const int stbi__zdist_base[32] = {
 8235+    1,    2,    3,    4,    5,    7,     9,     13,    17,  25,   33,
 8236+    49,   65,   97,   129,  193,  257,   385,   513,   769, 1025, 1537,
 8237+    2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577, 0,   0};
 8238+
 8239+static const int stbi__zdist_extra[32] = {0, 0, 0,  0,  1,  1,  2,  2,  3,  3,
 8240+                                          4, 4, 5,  5,  6,  6,  7,  7,  8,  8,
 8241+                                          9, 9, 10, 10, 11, 11, 12, 12, 13, 13};
 8242+
 8243+static int
 8244+stbi__parse_huffman_block(stbi__zbuf *a)
 8245+{
 8246+	char *zout = a->zout;
 8247+	for (;;) {
 8248+		int z = stbi__zhuffman_decode(a, &a->z_length);
 8249+		if (z < 256) {
 8250+			if (z < 0) {
 8251+				return stbi__err("bad huffman code",
 8252+				                 "Corrupt PNG"); // error in huffman codes
 8253+			}
 8254+			if (zout >= a->zout_end) {
 8255+				if (!stbi__zexpand(a, zout, 1)) {
 8256+					return 0;
 8257+				}
 8258+				zout = a->zout;
 8259+			}
 8260+			*zout++ = (char)z;
 8261+		} else {
 8262+			stbi_uc *p;
 8263+			int len, dist;
 8264+			if (z == 256) {
 8265+				a->zout = zout;
 8266+				if (a->hit_zeof_once && a->num_bits < 16) {
 8267+					// The first time we hit zeof, we inserted 16 extra zero
 8268+					// bits into our bit buffer so the decoder can just do its
 8269+					// speculative decoding. But if we actually consumed any of
 8270+					// those bits (which is the case when num_bits < 16), the
 8271+					// stream actually read past the end so it is malformed.
 8272+					return stbi__err("unexpected end", "Corrupt PNG");
 8273+				}
 8274+				return 1;
 8275+			}
 8276+			if (z >= 286) {
 8277+				return stbi__err(
 8278+				    "bad huffman code",
 8279+				    "Corrupt PNG"); // per DEFLATE, length codes 286 and 287
 8280+				                    // must not appear in compressed data
 8281+			}
 8282+			z -= 257;
 8283+			len = stbi__zlength_base[z];
 8284+			if (stbi__zlength_extra[z]) {
 8285+				len += stbi__zreceive(a, stbi__zlength_extra[z]);
 8286+			}
 8287+			z = stbi__zhuffman_decode(a, &a->z_distance);
 8288+			if (z < 0 || z >= 30) {
 8289+				return stbi__err(
 8290+				    "bad huffman code",
 8291+				    "Corrupt PNG"); // per DEFLATE, distance codes 30 and 31
 8292+				                    // must not appear in compressed data
 8293+			}
 8294+			dist = stbi__zdist_base[z];
 8295+			if (stbi__zdist_extra[z]) {
 8296+				dist += stbi__zreceive(a, stbi__zdist_extra[z]);
 8297+			}
 8298+			if (zout - a->zout_start < dist) {
 8299+				return stbi__err("bad dist", "Corrupt PNG");
 8300+			}
 8301+			if (len > a->zout_end - zout) {
 8302+				if (!stbi__zexpand(a, zout, len)) {
 8303+					return 0;
 8304+				}
 8305+				zout = a->zout;
 8306+			}
 8307+			p = (stbi_uc *)(zout - dist);
 8308+			if (dist == 1) { // run of one byte; common in images.
 8309+				stbi_uc v = *p;
 8310+				if (len) {
 8311+					do {
 8312+						*zout++ = v;
 8313+					} while (--len);
 8314+				}
 8315+			} else {
 8316+				if (len) {
 8317+					do {
 8318+						*zout++ = *p++;
 8319+					} while (--len);
 8320+				}
 8321+			}
 8322+		}
 8323+	}
 8324+}
 8325+
 8326+static int
 8327+stbi__compute_huffman_codes(stbi__zbuf *a)
 8328+{
 8329+	static const stbi_uc length_dezigzag[19] = {
 8330+	    16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15};
 8331+	stbi__zhuffman z_codelength;
 8332+	stbi_uc lencodes[286 + 32 + 137]; // padding for maximum single op
 8333+	stbi_uc codelength_sizes[19];
 8334+	int i, n;
 8335+
 8336+	int hlit = stbi__zreceive(a, 5) + 257;
 8337+	int hdist = stbi__zreceive(a, 5) + 1;
 8338+	int hclen = stbi__zreceive(a, 4) + 4;
 8339+	int ntot = hlit + hdist;
 8340+
 8341+	memset(codelength_sizes, 0, sizeof(codelength_sizes));
 8342+	for (i = 0; i < hclen; ++i) {
 8343+		int s = stbi__zreceive(a, 3);
 8344+		codelength_sizes[length_dezigzag[i]] = (stbi_uc)s;
 8345+	}
 8346+	if (!stbi__zbuild_huffman(&z_codelength, codelength_sizes, 19)) {
 8347+		return 0;
 8348+	}
 8349+
 8350+	n = 0;
 8351+	while (n < ntot) {
 8352+		int c = stbi__zhuffman_decode(a, &z_codelength);
 8353+		if (c < 0 || c >= 19) {
 8354+			return stbi__err("bad codelengths", "Corrupt PNG");
 8355+		}
 8356+		if (c < 16) {
 8357+			lencodes[n++] = (stbi_uc)c;
 8358+		} else {
 8359+			stbi_uc fill = 0;
 8360+			if (c == 16) {
 8361+				c = stbi__zreceive(a, 2) + 3;
 8362+				if (n == 0) {
 8363+					return stbi__err("bad codelengths", "Corrupt PNG");
 8364+				}
 8365+				fill = lencodes[n - 1];
 8366+			} else if (c == 17) {
 8367+				c = stbi__zreceive(a, 3) + 3;
 8368+			} else if (c == 18) {
 8369+				c = stbi__zreceive(a, 7) + 11;
 8370+			} else {
 8371+				return stbi__err("bad codelengths", "Corrupt PNG");
 8372+			}
 8373+			if (ntot - n < c) {
 8374+				return stbi__err("bad codelengths", "Corrupt PNG");
 8375+			}
 8376+			memset(lencodes + n, fill, c);
 8377+			n += c;
 8378+		}
 8379+	}
 8380+	if (n != ntot) {
 8381+		return stbi__err("bad codelengths", "Corrupt PNG");
 8382+	}
 8383+	if (!stbi__zbuild_huffman(&a->z_length, lencodes, hlit)) {
 8384+		return 0;
 8385+	}
 8386+	if (!stbi__zbuild_huffman(&a->z_distance, lencodes + hlit, hdist)) {
 8387+		return 0;
 8388+	}
 8389+	return 1;
 8390+}
 8391+
 8392+static int
 8393+stbi__parse_uncompressed_block(stbi__zbuf *a)
 8394+{
 8395+	stbi_uc header[4];
 8396+	int len, nlen, k;
 8397+	if (a->num_bits & 7) {
 8398+		stbi__zreceive(a, a->num_bits & 7); // discard
 8399+	}
 8400+	// drain the bit-packed data into header
 8401+	k = 0;
 8402+	while (a->num_bits > 0) {
 8403+		header[k++] =
 8404+		    (stbi_uc)(a->code_buffer & 255); // suppress MSVC run-time check
 8405+		a->code_buffer >>= 8;
 8406+		a->num_bits -= 8;
 8407+	}
 8408+	if (a->num_bits < 0) {
 8409+		return stbi__err("zlib corrupt", "Corrupt PNG");
 8410+	}
 8411+	// now fill header the normal way
 8412+	while (k < 4) {
 8413+		header[k++] = stbi__zget8(a);
 8414+	}
 8415+	len = header[1] * 256 + header[0];
 8416+	nlen = header[3] * 256 + header[2];
 8417+	if (nlen != (len ^ 0xffff)) {
 8418+		return stbi__err("zlib corrupt", "Corrupt PNG");
 8419+	}
 8420+	if (a->zbuffer + len > a->zbuffer_end) {
 8421+		return stbi__err("read past buffer", "Corrupt PNG");
 8422+	}
 8423+	if (a->zout + len > a->zout_end) {
 8424+		if (!stbi__zexpand(a, a->zout, len)) {
 8425+			return 0;
 8426+		}
 8427+	}
 8428+	memcpy(a->zout, a->zbuffer, len);
 8429+	a->zbuffer += len;
 8430+	a->zout += len;
 8431+	return 1;
 8432+}
 8433+
 8434+static int
 8435+stbi__parse_zlib_header(stbi__zbuf *a)
 8436+{
 8437+	int cmf = stbi__zget8(a);
 8438+	int cm = cmf & 15;
 8439+	/* int cinfo = cmf >> 4; */
 8440+	int flg = stbi__zget8(a);
 8441+	if (stbi__zeof(a)) {
 8442+		return stbi__err("bad zlib header", "Corrupt PNG"); // zlib spec
 8443+	}
 8444+	if ((cmf * 256 + flg) % 31 != 0) {
 8445+		return stbi__err("bad zlib header", "Corrupt PNG"); // zlib spec
 8446+	}
 8447+	if (flg & 32) {
 8448+		return stbi__err("no preset dict",
 8449+		                 "Corrupt PNG"); // preset dictionary not allowed in png
 8450+	}
 8451+	if (cm != 8) {
 8452+		return stbi__err("bad compression",
 8453+		                 "Corrupt PNG"); // DEFLATE required for png
 8454+	}
 8455+	// window = 1 << (8 + cinfo)... but who cares, we fully buffer output
 8456+	return 1;
 8457+}
 8458+
 8459+static const stbi_uc stbi__zdefault_length[STBI__ZNSYMS] = {
 8460+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
 8461+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
 8462+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
 8463+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
 8464+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
 8465+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
 8466+    9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
 8467+    9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
 8468+    9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
 8469+    9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
 8470+    9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 7, 7, 7, 7, 7, 7, 7, 7,
 8471+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8};
 8472+static const stbi_uc stbi__zdefault_distance[32] = {
 8473+    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 8474+    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
 8475 /*
 8476 Init algorithm:
 8477 {
 8478@@ -4478,118 +5555,159 @@ Init algorithm:
 8479 }
 8480 */
 8481 
 8482-static int stbi__parse_zlib(stbi__zbuf *a, int parse_header)
 8483-{
 8484-   int final, type;
 8485-   if (parse_header)
 8486-      if (!stbi__parse_zlib_header(a)) return 0;
 8487-   a->num_bits = 0;
 8488-   a->code_buffer = 0;
 8489-   a->hit_zeof_once = 0;
 8490-   do {
 8491-      final = stbi__zreceive(a,1);
 8492-      type = stbi__zreceive(a,2);
 8493-      if (type == 0) {
 8494-         if (!stbi__parse_uncompressed_block(a)) return 0;
 8495-      } else if (type == 3) {
 8496-         return 0;
 8497-      } else {
 8498-         if (type == 1) {
 8499-            // use fixed code lengths
 8500-            if (!stbi__zbuild_huffman(&a->z_length  , stbi__zdefault_length  , STBI__ZNSYMS)) return 0;
 8501-            if (!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance,  32)) return 0;
 8502-         } else {
 8503-            if (!stbi__compute_huffman_codes(a)) return 0;
 8504-         }
 8505-         if (!stbi__parse_huffman_block(a)) return 0;
 8506-      }
 8507-   } while (!final);
 8508-   return 1;
 8509-}
 8510-
 8511-static int stbi__do_zlib(stbi__zbuf *a, char *obuf, int olen, int exp, int parse_header)
 8512-{
 8513-   a->zout_start = obuf;
 8514-   a->zout       = obuf;
 8515-   a->zout_end   = obuf + olen;
 8516-   a->z_expandable = exp;
 8517-
 8518-   return stbi__parse_zlib(a, parse_header);
 8519-}
 8520-
 8521-STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size, int *outlen)
 8522-{
 8523-   stbi__zbuf a;
 8524-   char *p = (char *) stbi__malloc(initial_size);
 8525-   if (p == NULL) return NULL;
 8526-   a.zbuffer = (stbi_uc *) buffer;
 8527-   a.zbuffer_end = (stbi_uc *) buffer + len;
 8528-   if (stbi__do_zlib(&a, p, initial_size, 1, 1)) {
 8529-      if (outlen) *outlen = (int) (a.zout - a.zout_start);
 8530-      return a.zout_start;
 8531-   } else {
 8532-      STBI_FREE(a.zout_start);
 8533-      return NULL;
 8534-   }
 8535-}
 8536-
 8537-STBIDEF char *stbi_zlib_decode_malloc(char const *buffer, int len, int *outlen)
 8538-{
 8539-   return stbi_zlib_decode_malloc_guesssize(buffer, len, 16384, outlen);
 8540-}
 8541-
 8542-STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, int len, int initial_size, int *outlen, int parse_header)
 8543-{
 8544-   stbi__zbuf a;
 8545-   char *p = (char *) stbi__malloc(initial_size);
 8546-   if (p == NULL) return NULL;
 8547-   a.zbuffer = (stbi_uc *) buffer;
 8548-   a.zbuffer_end = (stbi_uc *) buffer + len;
 8549-   if (stbi__do_zlib(&a, p, initial_size, 1, parse_header)) {
 8550-      if (outlen) *outlen = (int) (a.zout - a.zout_start);
 8551-      return a.zout_start;
 8552-   } else {
 8553-      STBI_FREE(a.zout_start);
 8554-      return NULL;
 8555-   }
 8556-}
 8557-
 8558-STBIDEF int stbi_zlib_decode_buffer(char *obuffer, int olen, char const *ibuffer, int ilen)
 8559-{
 8560-   stbi__zbuf a;
 8561-   a.zbuffer = (stbi_uc *) ibuffer;
 8562-   a.zbuffer_end = (stbi_uc *) ibuffer + ilen;
 8563-   if (stbi__do_zlib(&a, obuffer, olen, 0, 1))
 8564-      return (int) (a.zout - a.zout_start);
 8565-   else
 8566-      return -1;
 8567-}
 8568-
 8569-STBIDEF char *stbi_zlib_decode_noheader_malloc(char const *buffer, int len, int *outlen)
 8570-{
 8571-   stbi__zbuf a;
 8572-   char *p = (char *) stbi__malloc(16384);
 8573-   if (p == NULL) return NULL;
 8574-   a.zbuffer = (stbi_uc *) buffer;
 8575-   a.zbuffer_end = (stbi_uc *) buffer+len;
 8576-   if (stbi__do_zlib(&a, p, 16384, 1, 0)) {
 8577-      if (outlen) *outlen = (int) (a.zout - a.zout_start);
 8578-      return a.zout_start;
 8579-   } else {
 8580-      STBI_FREE(a.zout_start);
 8581-      return NULL;
 8582-   }
 8583-}
 8584-
 8585-STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char *ibuffer, int ilen)
 8586-{
 8587-   stbi__zbuf a;
 8588-   a.zbuffer = (stbi_uc *) ibuffer;
 8589-   a.zbuffer_end = (stbi_uc *) ibuffer + ilen;
 8590-   if (stbi__do_zlib(&a, obuffer, olen, 0, 0))
 8591-      return (int) (a.zout - a.zout_start);
 8592-   else
 8593-      return -1;
 8594+static int
 8595+stbi__parse_zlib(stbi__zbuf *a, int parse_header)
 8596+{
 8597+	int final, type;
 8598+	if (parse_header) {
 8599+		if (!stbi__parse_zlib_header(a)) {
 8600+			return 0;
 8601+		}
 8602+	}
 8603+	a->num_bits = 0;
 8604+	a->code_buffer = 0;
 8605+	a->hit_zeof_once = 0;
 8606+	do {
 8607+		final = stbi__zreceive(a, 1);
 8608+		type = stbi__zreceive(a, 2);
 8609+		if (type == 0) {
 8610+			if (!stbi__parse_uncompressed_block(a)) {
 8611+				return 0;
 8612+			}
 8613+		} else if (type == 3) {
 8614+			return 0;
 8615+		} else {
 8616+			if (type == 1) {
 8617+				// use fixed code lengths
 8618+				if (!stbi__zbuild_huffman(&a->z_length, stbi__zdefault_length,
 8619+				                          STBI__ZNSYMS)) {
 8620+					return 0;
 8621+				}
 8622+				if (!stbi__zbuild_huffman(&a->z_distance,
 8623+				                          stbi__zdefault_distance, 32)) {
 8624+					return 0;
 8625+				}
 8626+			} else {
 8627+				if (!stbi__compute_huffman_codes(a)) {
 8628+					return 0;
 8629+				}
 8630+			}
 8631+			if (!stbi__parse_huffman_block(a)) {
 8632+				return 0;
 8633+			}
 8634+		}
 8635+	} while (!final);
 8636+	return 1;
 8637+}
 8638+
 8639+static int
 8640+stbi__do_zlib(stbi__zbuf *a, char *obuf, int olen, int exp, int parse_header)
 8641+{
 8642+	a->zout_start = obuf;
 8643+	a->zout = obuf;
 8644+	a->zout_end = obuf + olen;
 8645+	a->z_expandable = exp;
 8646+
 8647+	return stbi__parse_zlib(a, parse_header);
 8648+}
 8649+
 8650+STBIDEF char *
 8651+stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size,
 8652+                                  int *outlen)
 8653+{
 8654+	stbi__zbuf a;
 8655+	char *p = (char *)stbi__malloc(initial_size);
 8656+	if (p == NULL) {
 8657+		return NULL;
 8658+	}
 8659+	a.zbuffer = (stbi_uc *)buffer;
 8660+	a.zbuffer_end = (stbi_uc *)buffer + len;
 8661+	if (stbi__do_zlib(&a, p, initial_size, 1, 1)) {
 8662+		if (outlen) {
 8663+			*outlen = (int)(a.zout - a.zout_start);
 8664+		}
 8665+		return a.zout_start;
 8666+	} else {
 8667+		STBI_FREE(a.zout_start);
 8668+		return NULL;
 8669+	}
 8670+}
 8671+
 8672+STBIDEF char *
 8673+stbi_zlib_decode_malloc(char const *buffer, int len, int *outlen)
 8674+{
 8675+	return stbi_zlib_decode_malloc_guesssize(buffer, len, 16384, outlen);
 8676+}
 8677+
 8678+STBIDEF char *
 8679+stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, int len,
 8680+                                             int initial_size, int *outlen,
 8681+                                             int parse_header)
 8682+{
 8683+	stbi__zbuf a;
 8684+	char *p = (char *)stbi__malloc(initial_size);
 8685+	if (p == NULL) {
 8686+		return NULL;
 8687+	}
 8688+	a.zbuffer = (stbi_uc *)buffer;
 8689+	a.zbuffer_end = (stbi_uc *)buffer + len;
 8690+	if (stbi__do_zlib(&a, p, initial_size, 1, parse_header)) {
 8691+		if (outlen) {
 8692+			*outlen = (int)(a.zout - a.zout_start);
 8693+		}
 8694+		return a.zout_start;
 8695+	} else {
 8696+		STBI_FREE(a.zout_start);
 8697+		return NULL;
 8698+	}
 8699+}
 8700+
 8701+STBIDEF int
 8702+stbi_zlib_decode_buffer(char *obuffer, int olen, char const *ibuffer, int ilen)
 8703+{
 8704+	stbi__zbuf a;
 8705+	a.zbuffer = (stbi_uc *)ibuffer;
 8706+	a.zbuffer_end = (stbi_uc *)ibuffer + ilen;
 8707+	if (stbi__do_zlib(&a, obuffer, olen, 0, 1)) {
 8708+		return (int)(a.zout - a.zout_start);
 8709+	} else {
 8710+		return -1;
 8711+	}
 8712+}
 8713+
 8714+STBIDEF char *
 8715+stbi_zlib_decode_noheader_malloc(char const *buffer, int len, int *outlen)
 8716+{
 8717+	stbi__zbuf a;
 8718+	char *p = (char *)stbi__malloc(16384);
 8719+	if (p == NULL) {
 8720+		return NULL;
 8721+	}
 8722+	a.zbuffer = (stbi_uc *)buffer;
 8723+	a.zbuffer_end = (stbi_uc *)buffer + len;
 8724+	if (stbi__do_zlib(&a, p, 16384, 1, 0)) {
 8725+		if (outlen) {
 8726+			*outlen = (int)(a.zout - a.zout_start);
 8727+		}
 8728+		return a.zout_start;
 8729+	} else {
 8730+		STBI_FREE(a.zout_start);
 8731+		return NULL;
 8732+	}
 8733+}
 8734+
 8735+STBIDEF int
 8736+stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char *ibuffer,
 8737+                                 int ilen)
 8738+{
 8739+	stbi__zbuf a;
 8740+	a.zbuffer = (stbi_uc *)ibuffer;
 8741+	a.zbuffer_end = (stbi_uc *)ibuffer + ilen;
 8742+	if (stbi__do_zlib(&a, obuffer, olen, 0, 0)) {
 8743+		return (int)(a.zout - a.zout_start);
 8744+	} else {
 8745+		return -1;
 8746+	}
 8747 }
 8748 #endif
 8749 
 8750@@ -4604,1131 +5722,1498 @@ STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char
 8751 //      - uses stb_zlib, a PD zlib implementation with fast huffman decoding
 8752 
 8753 #ifndef STBI_NO_PNG
 8754-typedef struct
 8755-{
 8756-   stbi__uint32 length;
 8757-   stbi__uint32 type;
 8758+typedef struct {
 8759+	stbi__uint32 length;
 8760+	stbi__uint32 type;
 8761 } stbi__pngchunk;
 8762 
 8763-static stbi__pngchunk stbi__get_chunk_header(stbi__context *s)
 8764+static stbi__pngchunk
 8765+stbi__get_chunk_header(stbi__context *s)
 8766 {
 8767-   stbi__pngchunk c;
 8768-   c.length = stbi__get32be(s);
 8769-   c.type   = stbi__get32be(s);
 8770-   return c;
 8771+	stbi__pngchunk c;
 8772+	c.length = stbi__get32be(s);
 8773+	c.type = stbi__get32be(s);
 8774+	return c;
 8775 }
 8776 
 8777-static int stbi__check_png_header(stbi__context *s)
 8778+static int
 8779+stbi__check_png_header(stbi__context *s)
 8780 {
 8781-   static const stbi_uc png_sig[8] = { 137,80,78,71,13,10,26,10 };
 8782-   int i;
 8783-   for (i=0; i < 8; ++i)
 8784-      if (stbi__get8(s) != png_sig[i]) return stbi__err("bad png sig","Not a PNG");
 8785-   return 1;
 8786+	static const stbi_uc png_sig[8] = {137, 80, 78, 71, 13, 10, 26, 10};
 8787+	int i;
 8788+	for (i = 0; i < 8; ++i) {
 8789+		if (stbi__get8(s) != png_sig[i]) {
 8790+			return stbi__err("bad png sig", "Not a PNG");
 8791+		}
 8792+	}
 8793+	return 1;
 8794 }
 8795 
 8796-typedef struct
 8797-{
 8798-   stbi__context *s;
 8799-   stbi_uc *idata, *expanded, *out;
 8800-   int depth;
 8801+typedef struct {
 8802+	stbi__context *s;
 8803+	stbi_uc *idata, *expanded, *out;
 8804+	int depth;
 8805 } stbi__png;
 8806 
 8807-
 8808 enum {
 8809-   STBI__F_none=0,
 8810-   STBI__F_sub=1,
 8811-   STBI__F_up=2,
 8812-   STBI__F_avg=3,
 8813-   STBI__F_paeth=4,
 8814-   // synthetic filter used for first scanline to avoid needing a dummy row of 0s
 8815-   STBI__F_avg_first
 8816+	STBI__F_none = 0,
 8817+	STBI__F_sub = 1,
 8818+	STBI__F_up = 2,
 8819+	STBI__F_avg = 3,
 8820+	STBI__F_paeth = 4,
 8821+	// synthetic filter used for first scanline to avoid needing a dummy row of
 8822+	// 0s
 8823+	STBI__F_avg_first
 8824 };
 8825 
 8826-static stbi_uc first_row_filter[5] =
 8827-{
 8828-   STBI__F_none,
 8829-   STBI__F_sub,
 8830-   STBI__F_none,
 8831-   STBI__F_avg_first,
 8832-   STBI__F_sub // Paeth with b=c=0 turns out to be equivalent to sub
 8833+static stbi_uc first_row_filter[5] = {
 8834+    STBI__F_none, STBI__F_sub, STBI__F_none, STBI__F_avg_first,
 8835+    STBI__F_sub // Paeth with b=c=0 turns out to be equivalent to sub
 8836 };
 8837 
 8838-static int stbi__paeth(int a, int b, int c)
 8839+static int
 8840+stbi__paeth(int a, int b, int c)
 8841 {
 8842-   // This formulation looks very different from the reference in the PNG spec, but is
 8843-   // actually equivalent and has favorable data dependencies and admits straightforward
 8844-   // generation of branch-free code, which helps performance significantly.
 8845-   int thresh = c*3 - (a + b);
 8846-   int lo = a < b ? a : b;
 8847-   int hi = a < b ? b : a;
 8848-   int t0 = (hi <= thresh) ? lo : c;
 8849-   int t1 = (thresh <= lo) ? hi : t0;
 8850-   return t1;
 8851+	// This formulation looks very different from the reference in the PNG spec,
 8852+	// but is actually equivalent and has favorable data dependencies and admits
 8853+	// straightforward generation of branch-free code, which helps performance
 8854+	// significantly.
 8855+	int thresh = c * 3 - (a + b);
 8856+	int lo = a < b ? a : b;
 8857+	int hi = a < b ? b : a;
 8858+	int t0 = (hi <= thresh) ? lo : c;
 8859+	int t1 = (thresh <= lo) ? hi : t0;
 8860+	return t1;
 8861 }
 8862 
 8863-static const stbi_uc stbi__depth_scale_table[9] = { 0, 0xff, 0x55, 0, 0x11, 0,0,0, 0x01 };
 8864+static const stbi_uc stbi__depth_scale_table[9] = {0, 0xff, 0x55, 0,   0x11,
 8865+                                                   0, 0,    0,    0x01};
 8866 
 8867 // adds an extra all-255 alpha channel
 8868 // dest == src is legal
 8869 // img_n must be 1 or 3
 8870-static void stbi__create_png_alpha_expand8(stbi_uc *dest, stbi_uc *src, stbi__uint32 x, int img_n)
 8871-{
 8872-   int i;
 8873-   // must process data backwards since we allow dest==src
 8874-   if (img_n == 1) {
 8875-      for (i=x-1; i >= 0; --i) {
 8876-         dest[i*2+1] = 255;
 8877-         dest[i*2+0] = src[i];
 8878-      }
 8879-   } else {
 8880-      STBI_ASSERT(img_n == 3);
 8881-      for (i=x-1; i >= 0; --i) {
 8882-         dest[i*4+3] = 255;
 8883-         dest[i*4+2] = src[i*3+2];
 8884-         dest[i*4+1] = src[i*3+1];
 8885-         dest[i*4+0] = src[i*3+0];
 8886-      }
 8887-   }
 8888+static void
 8889+stbi__create_png_alpha_expand8(stbi_uc *dest, stbi_uc *src, stbi__uint32 x,
 8890+                               int img_n)
 8891+{
 8892+	int i;
 8893+	// must process data backwards since we allow dest==src
 8894+	if (img_n == 1) {
 8895+		for (i = x - 1; i >= 0; --i) {
 8896+			dest[i * 2 + 1] = 255;
 8897+			dest[i * 2 + 0] = src[i];
 8898+		}
 8899+	} else {
 8900+		STBI_ASSERT(img_n == 3);
 8901+		for (i = x - 1; i >= 0; --i) {
 8902+			dest[i * 4 + 3] = 255;
 8903+			dest[i * 4 + 2] = src[i * 3 + 2];
 8904+			dest[i * 4 + 1] = src[i * 3 + 1];
 8905+			dest[i * 4 + 0] = src[i * 3 + 0];
 8906+		}
 8907+	}
 8908 }
 8909 
 8910 // create the png data from post-deflated data
 8911-static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 raw_len, int out_n, stbi__uint32 x, stbi__uint32 y, int depth, int color)
 8912-{
 8913-   int bytes = (depth == 16 ? 2 : 1);
 8914-   stbi__context *s = a->s;
 8915-   stbi__uint32 i,j,stride = x*out_n*bytes;
 8916-   stbi__uint32 img_len, img_width_bytes;
 8917-   stbi_uc *filter_buf;
 8918-   int all_ok = 1;
 8919-   int k;
 8920-   int img_n = s->img_n; // copy it into a local for later
 8921-
 8922-   int output_bytes = out_n*bytes;
 8923-   int filter_bytes = img_n*bytes;
 8924-   int width = x;
 8925-
 8926-   STBI_ASSERT(out_n == s->img_n || out_n == s->img_n+1);
 8927-   a->out = (stbi_uc *) stbi__malloc_mad3(x, y, output_bytes, 0); // extra bytes to write off the end into
 8928-   if (!a->out) return stbi__err("outofmem", "Out of memory");
 8929-
 8930-   // note: error exits here don't need to clean up a->out individually,
 8931-   // stbi__do_png always does on error.
 8932-   if (!stbi__mad3sizes_valid(img_n, x, depth, 7)) return stbi__err("too large", "Corrupt PNG");
 8933-   img_width_bytes = (((img_n * x * depth) + 7) >> 3);
 8934-   if (!stbi__mad2sizes_valid(img_width_bytes, y, img_width_bytes)) return stbi__err("too large", "Corrupt PNG");
 8935-   img_len = (img_width_bytes + 1) * y;
 8936-
 8937-   // we used to check for exact match between raw_len and img_len on non-interlaced PNGs,
 8938-   // but issue #276 reported a PNG in the wild that had extra data at the end (all zeros),
 8939-   // so just check for raw_len < img_len always.
 8940-   if (raw_len < img_len) return stbi__err("not enough pixels","Corrupt PNG");
 8941-
 8942-   // Allocate two scan lines worth of filter workspace buffer.
 8943-   filter_buf = (stbi_uc *) stbi__malloc_mad2(img_width_bytes, 2, 0);
 8944-   if (!filter_buf) return stbi__err("outofmem", "Out of memory");
 8945-
 8946-   // Filtering for low-bit-depth images
 8947-   if (depth < 8) {
 8948-      filter_bytes = 1;
 8949-      width = img_width_bytes;
 8950-   }
 8951-
 8952-   for (j=0; j < y; ++j) {
 8953-      // cur/prior filter buffers alternate
 8954-      stbi_uc *cur = filter_buf + (j & 1)*img_width_bytes;
 8955-      stbi_uc *prior = filter_buf + (~j & 1)*img_width_bytes;
 8956-      stbi_uc *dest = a->out + stride*j;
 8957-      int nk = width * filter_bytes;
 8958-      int filter = *raw++;
 8959-
 8960-      // check filter type
 8961-      if (filter > 4) {
 8962-         all_ok = stbi__err("invalid filter","Corrupt PNG");
 8963-         break;
 8964-      }
 8965-
 8966-      // if first row, use special filter that doesn't sample previous row
 8967-      if (j == 0) filter = first_row_filter[filter];
 8968-
 8969-      // perform actual filtering
 8970-      switch (filter) {
 8971-      case STBI__F_none:
 8972-         memcpy(cur, raw, nk);
 8973-         break;
 8974-      case STBI__F_sub:
 8975-         memcpy(cur, raw, filter_bytes);
 8976-         for (k = filter_bytes; k < nk; ++k)
 8977-            cur[k] = STBI__BYTECAST(raw[k] + cur[k-filter_bytes]);
 8978-         break;
 8979-      case STBI__F_up:
 8980-         for (k = 0; k < nk; ++k)
 8981-            cur[k] = STBI__BYTECAST(raw[k] + prior[k]);
 8982-         break;
 8983-      case STBI__F_avg:
 8984-         for (k = 0; k < filter_bytes; ++k)
 8985-            cur[k] = STBI__BYTECAST(raw[k] + (prior[k]>>1));
 8986-         for (k = filter_bytes; k < nk; ++k)
 8987-            cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k-filter_bytes])>>1));
 8988-         break;
 8989-      case STBI__F_paeth:
 8990-         for (k = 0; k < filter_bytes; ++k)
 8991-            cur[k] = STBI__BYTECAST(raw[k] + prior[k]); // prior[k] == stbi__paeth(0,prior[k],0)
 8992-         for (k = filter_bytes; k < nk; ++k)
 8993-            cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes], prior[k], prior[k-filter_bytes]));
 8994-         break;
 8995-      case STBI__F_avg_first:
 8996-         memcpy(cur, raw, filter_bytes);
 8997-         for (k = filter_bytes; k < nk; ++k)
 8998-            cur[k] = STBI__BYTECAST(raw[k] + (cur[k-filter_bytes] >> 1));
 8999-         break;
 9000-      }
 9001-
 9002-      raw += nk;
 9003-
 9004-      // expand decoded bits in cur to dest, also adding an extra alpha channel if desired
 9005-      if (depth < 8) {
 9006-         stbi_uc scale = (color == 0) ? stbi__depth_scale_table[depth] : 1; // scale grayscale values to 0..255 range
 9007-         stbi_uc *in = cur;
 9008-         stbi_uc *out = dest;
 9009-         stbi_uc inb = 0;
 9010-         stbi__uint32 nsmp = x*img_n;
 9011-
 9012-         // expand bits to bytes first
 9013-         if (depth == 4) {
 9014-            for (i=0; i < nsmp; ++i) {
 9015-               if ((i & 1) == 0) inb = *in++;
 9016-               *out++ = scale * (inb >> 4);
 9017-               inb <<= 4;
 9018-            }
 9019-         } else if (depth == 2) {
 9020-            for (i=0; i < nsmp; ++i) {
 9021-               if ((i & 3) == 0) inb = *in++;
 9022-               *out++ = scale * (inb >> 6);
 9023-               inb <<= 2;
 9024-            }
 9025-         } else {
 9026-            STBI_ASSERT(depth == 1);
 9027-            for (i=0; i < nsmp; ++i) {
 9028-               if ((i & 7) == 0) inb = *in++;
 9029-               *out++ = scale * (inb >> 7);
 9030-               inb <<= 1;
 9031-            }
 9032-         }
 9033-
 9034-         // insert alpha=255 values if desired
 9035-         if (img_n != out_n)
 9036-            stbi__create_png_alpha_expand8(dest, dest, x, img_n);
 9037-      } else if (depth == 8) {
 9038-         if (img_n == out_n)
 9039-            memcpy(dest, cur, x*img_n);
 9040-         else
 9041-            stbi__create_png_alpha_expand8(dest, cur, x, img_n);
 9042-      } else if (depth == 16) {
 9043-         // convert the image data from big-endian to platform-native
 9044-         stbi__uint16 *dest16 = (stbi__uint16*)dest;
 9045-         stbi__uint32 nsmp = x*img_n;
 9046-
 9047-         if (img_n == out_n) {
 9048-            for (i = 0; i < nsmp; ++i, ++dest16, cur += 2)
 9049-               *dest16 = (cur[0] << 8) | cur[1];
 9050-         } else {
 9051-            STBI_ASSERT(img_n+1 == out_n);
 9052-            if (img_n == 1) {
 9053-               for (i = 0; i < x; ++i, dest16 += 2, cur += 2) {
 9054-                  dest16[0] = (cur[0] << 8) | cur[1];
 9055-                  dest16[1] = 0xffff;
 9056-               }
 9057-            } else {
 9058-               STBI_ASSERT(img_n == 3);
 9059-               for (i = 0; i < x; ++i, dest16 += 4, cur += 6) {
 9060-                  dest16[0] = (cur[0] << 8) | cur[1];
 9061-                  dest16[1] = (cur[2] << 8) | cur[3];
 9062-                  dest16[2] = (cur[4] << 8) | cur[5];
 9063-                  dest16[3] = 0xffff;
 9064-               }
 9065-            }
 9066-         }
 9067-      }
 9068-   }
 9069-
 9070-   STBI_FREE(filter_buf);
 9071-   if (!all_ok) return 0;
 9072-
 9073-   return 1;
 9074-}
 9075-
 9076-static int stbi__create_png_image(stbi__png *a, stbi_uc *image_data, stbi__uint32 image_data_len, int out_n, int depth, int color, int interlaced)
 9077-{
 9078-   int bytes = (depth == 16 ? 2 : 1);
 9079-   int out_bytes = out_n * bytes;
 9080-   stbi_uc *final;
 9081-   int p;
 9082-   if (!interlaced)
 9083-      return stbi__create_png_image_raw(a, image_data, image_data_len, out_n, a->s->img_x, a->s->img_y, depth, color);
 9084-
 9085-   // de-interlacing
 9086-   final = (stbi_uc *) stbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0);
 9087-   if (!final) return stbi__err("outofmem", "Out of memory");
 9088-   for (p=0; p < 7; ++p) {
 9089-      int xorig[] = { 0,4,0,2,0,1,0 };
 9090-      int yorig[] = { 0,0,4,0,2,0,1 };
 9091-      int xspc[]  = { 8,8,4,4,2,2,1 };
 9092-      int yspc[]  = { 8,8,8,4,4,2,2 };
 9093-      int i,j,x,y;
 9094-      // pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1
 9095-      x = (a->s->img_x - xorig[p] + xspc[p]-1) / xspc[p];
 9096-      y = (a->s->img_y - yorig[p] + yspc[p]-1) / yspc[p];
 9097-      if (x && y) {
 9098-         stbi__uint32 img_len = ((((a->s->img_n * x * depth) + 7) >> 3) + 1) * y;
 9099-         if (!stbi__create_png_image_raw(a, image_data, image_data_len, out_n, x, y, depth, color)) {
 9100-            STBI_FREE(final);
 9101-            return 0;
 9102-         }
 9103-         for (j=0; j < y; ++j) {
 9104-            for (i=0; i < x; ++i) {
 9105-               int out_y = j*yspc[p]+yorig[p];
 9106-               int out_x = i*xspc[p]+xorig[p];
 9107-               memcpy(final + out_y*a->s->img_x*out_bytes + out_x*out_bytes,
 9108-                      a->out + (j*x+i)*out_bytes, out_bytes);
 9109-            }
 9110-         }
 9111-         STBI_FREE(a->out);
 9112-         image_data += img_len;
 9113-         image_data_len -= img_len;
 9114-      }
 9115-   }
 9116-   a->out = final;
 9117-
 9118-   return 1;
 9119-}
 9120-
 9121-static int stbi__compute_transparency(stbi__png *z, stbi_uc tc[3], int out_n)
 9122-{
 9123-   stbi__context *s = z->s;
 9124-   stbi__uint32 i, pixel_count = s->img_x * s->img_y;
 9125-   stbi_uc *p = z->out;
 9126-
 9127-   // compute color-based transparency, assuming we've
 9128-   // already got 255 as the alpha value in the output
 9129-   STBI_ASSERT(out_n == 2 || out_n == 4);
 9130-
 9131-   if (out_n == 2) {
 9132-      for (i=0; i < pixel_count; ++i) {
 9133-         p[1] = (p[0] == tc[0] ? 0 : 255);
 9134-         p += 2;
 9135-      }
 9136-   } else {
 9137-      for (i=0; i < pixel_count; ++i) {
 9138-         if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2])
 9139-            p[3] = 0;
 9140-         p += 4;
 9141-      }
 9142-   }
 9143-   return 1;
 9144-}
 9145-
 9146-static int stbi__compute_transparency16(stbi__png *z, stbi__uint16 tc[3], int out_n)
 9147-{
 9148-   stbi__context *s = z->s;
 9149-   stbi__uint32 i, pixel_count = s->img_x * s->img_y;
 9150-   stbi__uint16 *p = (stbi__uint16*) z->out;
 9151-
 9152-   // compute color-based transparency, assuming we've
 9153-   // already got 65535 as the alpha value in the output
 9154-   STBI_ASSERT(out_n == 2 || out_n == 4);
 9155-
 9156-   if (out_n == 2) {
 9157-      for (i = 0; i < pixel_count; ++i) {
 9158-         p[1] = (p[0] == tc[0] ? 0 : 65535);
 9159-         p += 2;
 9160-      }
 9161-   } else {
 9162-      for (i = 0; i < pixel_count; ++i) {
 9163-         if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2])
 9164-            p[3] = 0;
 9165-         p += 4;
 9166-      }
 9167-   }
 9168-   return 1;
 9169-}
 9170-
 9171-static int stbi__expand_png_palette(stbi__png *a, stbi_uc *palette, int len, int pal_img_n)
 9172-{
 9173-   stbi__uint32 i, pixel_count = a->s->img_x * a->s->img_y;
 9174-   stbi_uc *p, *temp_out, *orig = a->out;
 9175-
 9176-   p = (stbi_uc *) stbi__malloc_mad2(pixel_count, pal_img_n, 0);
 9177-   if (p == NULL) return stbi__err("outofmem", "Out of memory");
 9178-
 9179-   // between here and free(out) below, exitting would leak
 9180-   temp_out = p;
 9181-
 9182-   if (pal_img_n == 3) {
 9183-      for (i=0; i < pixel_count; ++i) {
 9184-         int n = orig[i]*4;
 9185-         p[0] = palette[n  ];
 9186-         p[1] = palette[n+1];
 9187-         p[2] = palette[n+2];
 9188-         p += 3;
 9189-      }
 9190-   } else {
 9191-      for (i=0; i < pixel_count; ++i) {
 9192-         int n = orig[i]*4;
 9193-         p[0] = palette[n  ];
 9194-         p[1] = palette[n+1];
 9195-         p[2] = palette[n+2];
 9196-         p[3] = palette[n+3];
 9197-         p += 4;
 9198-      }
 9199-   }
 9200-   STBI_FREE(a->out);
 9201-   a->out = temp_out;
 9202-
 9203-   STBI_NOTUSED(len);
 9204-
 9205-   return 1;
 9206+static int
 9207+stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 raw_len,
 9208+                           int out_n, stbi__uint32 x, stbi__uint32 y, int depth,
 9209+                           int color)
 9210+{
 9211+	int bytes = (depth == 16 ? 2 : 1);
 9212+	stbi__context *s = a->s;
 9213+	stbi__uint32 i, j, stride = x * out_n * bytes;
 9214+	stbi__uint32 img_len, img_width_bytes;
 9215+	stbi_uc *filter_buf;
 9216+	int all_ok = 1;
 9217+	int k;
 9218+	int img_n = s->img_n; // copy it into a local for later
 9219+
 9220+	int output_bytes = out_n * bytes;
 9221+	int filter_bytes = img_n * bytes;
 9222+	int width = x;
 9223+
 9224+	STBI_ASSERT(out_n == s->img_n || out_n == s->img_n + 1);
 9225+	a->out = (stbi_uc *)stbi__malloc_mad3(
 9226+	    x, y, output_bytes, 0); // extra bytes to write off the end into
 9227+	if (!a->out) {
 9228+		return stbi__err("outofmem", "Out of memory");
 9229+	}
 9230+
 9231+	// note: error exits here don't need to clean up a->out individually,
 9232+	// stbi__do_png always does on error.
 9233+	if (!stbi__mad3sizes_valid(img_n, x, depth, 7)) {
 9234+		return stbi__err("too large", "Corrupt PNG");
 9235+	}
 9236+	img_width_bytes = (((img_n * x * depth) + 7) >> 3);
 9237+	if (!stbi__mad2sizes_valid(img_width_bytes, y, img_width_bytes)) {
 9238+		return stbi__err("too large", "Corrupt PNG");
 9239+	}
 9240+	img_len = (img_width_bytes + 1) * y;
 9241+
 9242+	// we used to check for exact match between raw_len and img_len on
 9243+	// non-interlaced PNGs, but issue #276 reported a PNG in the wild that had
 9244+	// extra data at the end (all zeros), so just check for raw_len < img_len
 9245+	// always.
 9246+	if (raw_len < img_len) {
 9247+		return stbi__err("not enough pixels", "Corrupt PNG");
 9248+	}
 9249+
 9250+	// Allocate two scan lines worth of filter workspace buffer.
 9251+	filter_buf = (stbi_uc *)stbi__malloc_mad2(img_width_bytes, 2, 0);
 9252+	if (!filter_buf) {
 9253+		return stbi__err("outofmem", "Out of memory");
 9254+	}
 9255+
 9256+	// Filtering for low-bit-depth images
 9257+	if (depth < 8) {
 9258+		filter_bytes = 1;
 9259+		width = img_width_bytes;
 9260+	}
 9261+
 9262+	for (j = 0; j < y; ++j) {
 9263+		// cur/prior filter buffers alternate
 9264+		stbi_uc *cur = filter_buf + (j & 1) * img_width_bytes;
 9265+		stbi_uc *prior = filter_buf + (~j & 1) * img_width_bytes;
 9266+		stbi_uc *dest = a->out + stride * j;
 9267+		int nk = width * filter_bytes;
 9268+		int filter = *raw++;
 9269+
 9270+		// check filter type
 9271+		if (filter > 4) {
 9272+			all_ok = stbi__err("invalid filter", "Corrupt PNG");
 9273+			break;
 9274+		}
 9275+
 9276+		// if first row, use special filter that doesn't sample previous row
 9277+		if (j == 0) {
 9278+			filter = first_row_filter[filter];
 9279+		}
 9280+
 9281+		// perform actual filtering
 9282+		switch (filter) {
 9283+		case STBI__F_none:
 9284+			memcpy(cur, raw, nk);
 9285+			break;
 9286+		case STBI__F_sub:
 9287+			memcpy(cur, raw, filter_bytes);
 9288+			for (k = filter_bytes; k < nk; ++k) {
 9289+				cur[k] = STBI__BYTECAST(raw[k] + cur[k - filter_bytes]);
 9290+			}
 9291+			break;
 9292+		case STBI__F_up:
 9293+			for (k = 0; k < nk; ++k) {
 9294+				cur[k] = STBI__BYTECAST(raw[k] + prior[k]);
 9295+			}
 9296+			break;
 9297+		case STBI__F_avg:
 9298+			for (k = 0; k < filter_bytes; ++k) {
 9299+				cur[k] = STBI__BYTECAST(raw[k] + (prior[k] >> 1));
 9300+			}
 9301+			for (k = filter_bytes; k < nk; ++k) {
 9302+				cur[k] = STBI__BYTECAST(
 9303+				    raw[k] + ((prior[k] + cur[k - filter_bytes]) >> 1));
 9304+			}
 9305+			break;
 9306+		case STBI__F_paeth:
 9307+			for (k = 0; k < filter_bytes; ++k) {
 9308+				cur[k] = STBI__BYTECAST(
 9309+				    raw[k] + prior[k]); // prior[k] == stbi__paeth(0,prior[k],0)
 9310+			}
 9311+			for (k = filter_bytes; k < nk; ++k) {
 9312+				cur[k] = STBI__BYTECAST(
 9313+				    raw[k] + stbi__paeth(cur[k - filter_bytes], prior[k],
 9314+				                         prior[k - filter_bytes]));
 9315+			}
 9316+			break;
 9317+		case STBI__F_avg_first:
 9318+			memcpy(cur, raw, filter_bytes);
 9319+			for (k = filter_bytes; k < nk; ++k) {
 9320+				cur[k] = STBI__BYTECAST(raw[k] + (cur[k - filter_bytes] >> 1));
 9321+			}
 9322+			break;
 9323+		}
 9324+
 9325+		raw += nk;
 9326+
 9327+		// expand decoded bits in cur to dest, also adding an extra alpha
 9328+		// channel if desired
 9329+		if (depth < 8) {
 9330+			stbi_uc scale = (color == 0)
 9331+			                    ? stbi__depth_scale_table[depth]
 9332+			                    : 1; // scale grayscale values to 0..255 range
 9333+			stbi_uc *in = cur;
 9334+			stbi_uc *out = dest;
 9335+			stbi_uc inb = 0;
 9336+			stbi__uint32 nsmp = x * img_n;
 9337+
 9338+			// expand bits to bytes first
 9339+			if (depth == 4) {
 9340+				for (i = 0; i < nsmp; ++i) {
 9341+					if ((i & 1) == 0) {
 9342+						inb = *in++;
 9343+					}
 9344+					*out++ = scale * (inb >> 4);
 9345+					inb <<= 4;
 9346+				}
 9347+			} else if (depth == 2) {
 9348+				for (i = 0; i < nsmp; ++i) {
 9349+					if ((i & 3) == 0) {
 9350+						inb = *in++;
 9351+					}
 9352+					*out++ = scale * (inb >> 6);
 9353+					inb <<= 2;
 9354+				}
 9355+			} else {
 9356+				STBI_ASSERT(depth == 1);
 9357+				for (i = 0; i < nsmp; ++i) {
 9358+					if ((i & 7) == 0) {
 9359+						inb = *in++;
 9360+					}
 9361+					*out++ = scale * (inb >> 7);
 9362+					inb <<= 1;
 9363+				}
 9364+			}
 9365+
 9366+			// insert alpha=255 values if desired
 9367+			if (img_n != out_n) {
 9368+				stbi__create_png_alpha_expand8(dest, dest, x, img_n);
 9369+			}
 9370+		} else if (depth == 8) {
 9371+			if (img_n == out_n) {
 9372+				memcpy(dest, cur, x * img_n);
 9373+			} else {
 9374+				stbi__create_png_alpha_expand8(dest, cur, x, img_n);
 9375+			}
 9376+		} else if (depth == 16) {
 9377+			// convert the image data from big-endian to platform-native
 9378+			stbi__uint16 *dest16 = (stbi__uint16 *)dest;
 9379+			stbi__uint32 nsmp = x * img_n;
 9380+
 9381+			if (img_n == out_n) {
 9382+				for (i = 0; i < nsmp; ++i, ++dest16, cur += 2) {
 9383+					*dest16 = (cur[0] << 8) | cur[1];
 9384+				}
 9385+			} else {
 9386+				STBI_ASSERT(img_n + 1 == out_n);
 9387+				if (img_n == 1) {
 9388+					for (i = 0; i < x; ++i, dest16 += 2, cur += 2) {
 9389+						dest16[0] = (cur[0] << 8) | cur[1];
 9390+						dest16[1] = 0xffff;
 9391+					}
 9392+				} else {
 9393+					STBI_ASSERT(img_n == 3);
 9394+					for (i = 0; i < x; ++i, dest16 += 4, cur += 6) {
 9395+						dest16[0] = (cur[0] << 8) | cur[1];
 9396+						dest16[1] = (cur[2] << 8) | cur[3];
 9397+						dest16[2] = (cur[4] << 8) | cur[5];
 9398+						dest16[3] = 0xffff;
 9399+					}
 9400+				}
 9401+			}
 9402+		}
 9403+	}
 9404+
 9405+	STBI_FREE(filter_buf);
 9406+	if (!all_ok) {
 9407+		return 0;
 9408+	}
 9409+
 9410+	return 1;
 9411+}
 9412+
 9413+static int
 9414+stbi__create_png_image(stbi__png *a, stbi_uc *image_data,
 9415+                       stbi__uint32 image_data_len, int out_n, int depth,
 9416+                       int color, int interlaced)
 9417+{
 9418+	int bytes = (depth == 16 ? 2 : 1);
 9419+	int out_bytes = out_n * bytes;
 9420+	stbi_uc *final;
 9421+	int p;
 9422+	if (!interlaced) {
 9423+		return stbi__create_png_image_raw(a, image_data, image_data_len, out_n,
 9424+		                                  a->s->img_x, a->s->img_y, depth,
 9425+		                                  color);
 9426+	}
 9427+
 9428+	// de-interlacing
 9429+	final =
 9430+	    (stbi_uc *)stbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0);
 9431+	if (!final) {
 9432+		return stbi__err("outofmem", "Out of memory");
 9433+	}
 9434+	for (p = 0; p < 7; ++p) {
 9435+		int xorig[] = {0, 4, 0, 2, 0, 1, 0};
 9436+		int yorig[] = {0, 0, 4, 0, 2, 0, 1};
 9437+		int xspc[] = {8, 8, 4, 4, 2, 2, 1};
 9438+		int yspc[] = {8, 8, 8, 4, 4, 2, 2};
 9439+		int i, j, x, y;
 9440+		// pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1
 9441+		x = (a->s->img_x - xorig[p] + xspc[p] - 1) / xspc[p];
 9442+		y = (a->s->img_y - yorig[p] + yspc[p] - 1) / yspc[p];
 9443+		if (x && y) {
 9444+			stbi__uint32 img_len =
 9445+			    ((((a->s->img_n * x * depth) + 7) >> 3) + 1) * y;
 9446+			if (!stbi__create_png_image_raw(a, image_data, image_data_len,
 9447+			                                out_n, x, y, depth, color)) {
 9448+				STBI_FREE(final);
 9449+				return 0;
 9450+			}
 9451+			for (j = 0; j < y; ++j) {
 9452+				for (i = 0; i < x; ++i) {
 9453+					int out_y = j * yspc[p] + yorig[p];
 9454+					int out_x = i * xspc[p] + xorig[p];
 9455+					memcpy(final + out_y * a->s->img_x * out_bytes +
 9456+					           out_x * out_bytes,
 9457+					       a->out + (j * x + i) * out_bytes, out_bytes);
 9458+				}
 9459+			}
 9460+			STBI_FREE(a->out);
 9461+			image_data += img_len;
 9462+			image_data_len -= img_len;
 9463+		}
 9464+	}
 9465+	a->out = final;
 9466+
 9467+	return 1;
 9468+}
 9469+
 9470+static int
 9471+stbi__compute_transparency(stbi__png *z, stbi_uc tc[3], int out_n)
 9472+{
 9473+	stbi__context *s = z->s;
 9474+	stbi__uint32 i, pixel_count = s->img_x * s->img_y;
 9475+	stbi_uc *p = z->out;
 9476+
 9477+	// compute color-based transparency, assuming we've
 9478+	// already got 255 as the alpha value in the output
 9479+	STBI_ASSERT(out_n == 2 || out_n == 4);
 9480+
 9481+	if (out_n == 2) {
 9482+		for (i = 0; i < pixel_count; ++i) {
 9483+			p[1] = (p[0] == tc[0] ? 0 : 255);
 9484+			p += 2;
 9485+		}
 9486+	} else {
 9487+		for (i = 0; i < pixel_count; ++i) {
 9488+			if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2]) {
 9489+				p[3] = 0;
 9490+			}
 9491+			p += 4;
 9492+		}
 9493+	}
 9494+	return 1;
 9495+}
 9496+
 9497+static int
 9498+stbi__compute_transparency16(stbi__png *z, stbi__uint16 tc[3], int out_n)
 9499+{
 9500+	stbi__context *s = z->s;
 9501+	stbi__uint32 i, pixel_count = s->img_x * s->img_y;
 9502+	stbi__uint16 *p = (stbi__uint16 *)z->out;
 9503+
 9504+	// compute color-based transparency, assuming we've
 9505+	// already got 65535 as the alpha value in the output
 9506+	STBI_ASSERT(out_n == 2 || out_n == 4);
 9507+
 9508+	if (out_n == 2) {
 9509+		for (i = 0; i < pixel_count; ++i) {
 9510+			p[1] = (p[0] == tc[0] ? 0 : 65535);
 9511+			p += 2;
 9512+		}
 9513+	} else {
 9514+		for (i = 0; i < pixel_count; ++i) {
 9515+			if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2]) {
 9516+				p[3] = 0;
 9517+			}
 9518+			p += 4;
 9519+		}
 9520+	}
 9521+	return 1;
 9522+}
 9523+
 9524+static int
 9525+stbi__expand_png_palette(stbi__png *a, stbi_uc *palette, int len, int pal_img_n)
 9526+{
 9527+	stbi__uint32 i, pixel_count = a->s->img_x * a->s->img_y;
 9528+	stbi_uc *p, *temp_out, *orig = a->out;
 9529+
 9530+	p = (stbi_uc *)stbi__malloc_mad2(pixel_count, pal_img_n, 0);
 9531+	if (p == NULL) {
 9532+		return stbi__err("outofmem", "Out of memory");
 9533+	}
 9534+
 9535+	// between here and free(out) below, exitting would leak
 9536+	temp_out = p;
 9537+
 9538+	if (pal_img_n == 3) {
 9539+		for (i = 0; i < pixel_count; ++i) {
 9540+			int n = orig[i] * 4;
 9541+			p[0] = palette[n];
 9542+			p[1] = palette[n + 1];
 9543+			p[2] = palette[n + 2];
 9544+			p += 3;
 9545+		}
 9546+	} else {
 9547+		for (i = 0; i < pixel_count; ++i) {
 9548+			int n = orig[i] * 4;
 9549+			p[0] = palette[n];
 9550+			p[1] = palette[n + 1];
 9551+			p[2] = palette[n + 2];
 9552+			p[3] = palette[n + 3];
 9553+			p += 4;
 9554+		}
 9555+	}
 9556+	STBI_FREE(a->out);
 9557+	a->out = temp_out;
 9558+
 9559+	STBI_NOTUSED(len);
 9560+
 9561+	return 1;
 9562 }
 9563 
 9564 static int stbi__unpremultiply_on_load_global = 0;
 9565 static int stbi__de_iphone_flag_global = 0;
 9566 
 9567-STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply)
 9568+STBIDEF void
 9569+stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply)
 9570 {
 9571-   stbi__unpremultiply_on_load_global = flag_true_if_should_unpremultiply;
 9572+	stbi__unpremultiply_on_load_global = flag_true_if_should_unpremultiply;
 9573 }
 9574 
 9575-STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert)
 9576+STBIDEF void
 9577+stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert)
 9578 {
 9579-   stbi__de_iphone_flag_global = flag_true_if_should_convert;
 9580+	stbi__de_iphone_flag_global = flag_true_if_should_convert;
 9581 }
 9582 
 9583 #ifndef STBI_THREAD_LOCAL
 9584-#define stbi__unpremultiply_on_load  stbi__unpremultiply_on_load_global
 9585-#define stbi__de_iphone_flag  stbi__de_iphone_flag_global
 9586+#define stbi__unpremultiply_on_load stbi__unpremultiply_on_load_global
 9587+#define stbi__de_iphone_flag stbi__de_iphone_flag_global
 9588 #else
 9589-static STBI_THREAD_LOCAL int stbi__unpremultiply_on_load_local, stbi__unpremultiply_on_load_set;
 9590-static STBI_THREAD_LOCAL int stbi__de_iphone_flag_local, stbi__de_iphone_flag_set;
 9591+static STBI_THREAD_LOCAL int stbi__unpremultiply_on_load_local,
 9592+    stbi__unpremultiply_on_load_set;
 9593+static STBI_THREAD_LOCAL int stbi__de_iphone_flag_local,
 9594+    stbi__de_iphone_flag_set;
 9595 
 9596-STBIDEF void stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply)
 9597+STBIDEF void
 9598+stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply)
 9599 {
 9600-   stbi__unpremultiply_on_load_local = flag_true_if_should_unpremultiply;
 9601-   stbi__unpremultiply_on_load_set = 1;
 9602+	stbi__unpremultiply_on_load_local = flag_true_if_should_unpremultiply;
 9603+	stbi__unpremultiply_on_load_set = 1;
 9604 }
 9605 
 9606-STBIDEF void stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert)
 9607+STBIDEF void
 9608+stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert)
 9609 {
 9610-   stbi__de_iphone_flag_local = flag_true_if_should_convert;
 9611-   stbi__de_iphone_flag_set = 1;
 9612+	stbi__de_iphone_flag_local = flag_true_if_should_convert;
 9613+	stbi__de_iphone_flag_set = 1;
 9614 }
 9615 
 9616-#define stbi__unpremultiply_on_load  (stbi__unpremultiply_on_load_set           \
 9617-                                       ? stbi__unpremultiply_on_load_local      \
 9618-                                       : stbi__unpremultiply_on_load_global)
 9619-#define stbi__de_iphone_flag  (stbi__de_iphone_flag_set                         \
 9620-                                ? stbi__de_iphone_flag_local                    \
 9621-                                : stbi__de_iphone_flag_global)
 9622+#define stbi__unpremultiply_on_load                                            \
 9623+	(stbi__unpremultiply_on_load_set ? stbi__unpremultiply_on_load_local       \
 9624+	                                 : stbi__unpremultiply_on_load_global)
 9625+#define stbi__de_iphone_flag                                                   \
 9626+	(stbi__de_iphone_flag_set ? stbi__de_iphone_flag_local                     \
 9627+	                          : stbi__de_iphone_flag_global)
 9628 #endif // STBI_THREAD_LOCAL
 9629 
 9630-static void stbi__de_iphone(stbi__png *z)
 9631-{
 9632-   stbi__context *s = z->s;
 9633-   stbi__uint32 i, pixel_count = s->img_x * s->img_y;
 9634-   stbi_uc *p = z->out;
 9635-
 9636-   if (s->img_out_n == 3) {  // convert bgr to rgb
 9637-      for (i=0; i < pixel_count; ++i) {
 9638-         stbi_uc t = p[0];
 9639-         p[0] = p[2];
 9640-         p[2] = t;
 9641-         p += 3;
 9642-      }
 9643-   } else {
 9644-      STBI_ASSERT(s->img_out_n == 4);
 9645-      if (stbi__unpremultiply_on_load) {
 9646-         // convert bgr to rgb and unpremultiply
 9647-         for (i=0; i < pixel_count; ++i) {
 9648-            stbi_uc a = p[3];
 9649-            stbi_uc t = p[0];
 9650-            if (a) {
 9651-               stbi_uc half = a / 2;
 9652-               p[0] = (p[2] * 255 + half) / a;
 9653-               p[1] = (p[1] * 255 + half) / a;
 9654-               p[2] = ( t   * 255 + half) / a;
 9655-            } else {
 9656-               p[0] = p[2];
 9657-               p[2] = t;
 9658-            }
 9659-            p += 4;
 9660-         }
 9661-      } else {
 9662-         // convert bgr to rgb
 9663-         for (i=0; i < pixel_count; ++i) {
 9664-            stbi_uc t = p[0];
 9665-            p[0] = p[2];
 9666-            p[2] = t;
 9667-            p += 4;
 9668-         }
 9669-      }
 9670-   }
 9671-}
 9672-
 9673-#define STBI__PNG_TYPE(a,b,c,d)  (((unsigned) (a) << 24) + ((unsigned) (b) << 16) + ((unsigned) (c) << 8) + (unsigned) (d))
 9674-
 9675-static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
 9676-{
 9677-   stbi_uc palette[1024], pal_img_n=0;
 9678-   stbi_uc has_trans=0, tc[3]={0};
 9679-   stbi__uint16 tc16[3];
 9680-   stbi__uint32 ioff=0, idata_limit=0, i, pal_len=0;
 9681-   int first=1,k,interlace=0, color=0, is_iphone=0;
 9682-   stbi__context *s = z->s;
 9683-
 9684-   z->expanded = NULL;
 9685-   z->idata = NULL;
 9686-   z->out = NULL;
 9687-
 9688-   if (!stbi__check_png_header(s)) return 0;
 9689-
 9690-   if (scan == STBI__SCAN_type) return 1;
 9691-
 9692-   for (;;) {
 9693-      stbi__pngchunk c = stbi__get_chunk_header(s);
 9694-      switch (c.type) {
 9695-         case STBI__PNG_TYPE('C','g','B','I'):
 9696-            is_iphone = 1;
 9697-            stbi__skip(s, c.length);
 9698-            break;
 9699-         case STBI__PNG_TYPE('I','H','D','R'): {
 9700-            int comp,filter;
 9701-            if (!first) return stbi__err("multiple IHDR","Corrupt PNG");
 9702-            first = 0;
 9703-            if (c.length != 13) return stbi__err("bad IHDR len","Corrupt PNG");
 9704-            s->img_x = stbi__get32be(s);
 9705-            s->img_y = stbi__get32be(s);
 9706-            if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
 9707-            if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
 9708-            z->depth = stbi__get8(s);  if (z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 && z->depth != 16)  return stbi__err("1/2/4/8/16-bit only","PNG not supported: 1/2/4/8/16-bit only");
 9709-            color = stbi__get8(s);  if (color > 6)         return stbi__err("bad ctype","Corrupt PNG");
 9710-            if (color == 3 && z->depth == 16)                  return stbi__err("bad ctype","Corrupt PNG");
 9711-            if (color == 3) pal_img_n = 3; else if (color & 1) return stbi__err("bad ctype","Corrupt PNG");
 9712-            comp  = stbi__get8(s);  if (comp) return stbi__err("bad comp method","Corrupt PNG");
 9713-            filter= stbi__get8(s);  if (filter) return stbi__err("bad filter method","Corrupt PNG");
 9714-            interlace = stbi__get8(s); if (interlace>1) return stbi__err("bad interlace method","Corrupt PNG");
 9715-            if (!s->img_x || !s->img_y) return stbi__err("0-pixel image","Corrupt PNG");
 9716-            if (!pal_img_n) {
 9717-               s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0);
 9718-               if ((1 << 30) / s->img_x / s->img_n < s->img_y) return stbi__err("too large", "Image too large to decode");
 9719-            } else {
 9720-               // if paletted, then pal_n is our final components, and
 9721-               // img_n is # components to decompress/filter.
 9722-               s->img_n = 1;
 9723-               if ((1 << 30) / s->img_x / 4 < s->img_y) return stbi__err("too large","Corrupt PNG");
 9724-            }
 9725-            // even with SCAN_header, have to scan to see if we have a tRNS
 9726-            break;
 9727-         }
 9728-
 9729-         case STBI__PNG_TYPE('P','L','T','E'):  {
 9730-            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
 9731-            if (c.length > 256*3) return stbi__err("invalid PLTE","Corrupt PNG");
 9732-            pal_len = c.length / 3;
 9733-            if (pal_len * 3 != c.length) return stbi__err("invalid PLTE","Corrupt PNG");
 9734-            for (i=0; i < pal_len; ++i) {
 9735-               palette[i*4+0] = stbi__get8(s);
 9736-               palette[i*4+1] = stbi__get8(s);
 9737-               palette[i*4+2] = stbi__get8(s);
 9738-               palette[i*4+3] = 255;
 9739-            }
 9740-            break;
 9741-         }
 9742-
 9743-         case STBI__PNG_TYPE('t','R','N','S'): {
 9744-            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
 9745-            if (z->idata) return stbi__err("tRNS after IDAT","Corrupt PNG");
 9746-            if (pal_img_n) {
 9747-               if (scan == STBI__SCAN_header) { s->img_n = 4; return 1; }
 9748-               if (pal_len == 0) return stbi__err("tRNS before PLTE","Corrupt PNG");
 9749-               if (c.length > pal_len) return stbi__err("bad tRNS len","Corrupt PNG");
 9750-               pal_img_n = 4;
 9751-               for (i=0; i < c.length; ++i)
 9752-                  palette[i*4+3] = stbi__get8(s);
 9753-            } else {
 9754-               if (!(s->img_n & 1)) return stbi__err("tRNS with alpha","Corrupt PNG");
 9755-               if (c.length != (stbi__uint32) s->img_n*2) return stbi__err("bad tRNS len","Corrupt PNG");
 9756-               has_trans = 1;
 9757-               // non-paletted with tRNS = constant alpha. if header-scanning, we can stop now.
 9758-               if (scan == STBI__SCAN_header) { ++s->img_n; return 1; }
 9759-               if (z->depth == 16) {
 9760-                  for (k = 0; k < s->img_n && k < 3; ++k) // extra loop test to suppress false GCC warning
 9761-                     tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is
 9762-               } else {
 9763-                  for (k = 0; k < s->img_n && k < 3; ++k)
 9764-                     tc[k] = (stbi_uc)(stbi__get16be(s) & 255) * stbi__depth_scale_table[z->depth]; // non 8-bit images will be larger
 9765-               }
 9766-            }
 9767-            break;
 9768-         }
 9769-
 9770-         case STBI__PNG_TYPE('I','D','A','T'): {
 9771-            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
 9772-            if (pal_img_n && !pal_len) return stbi__err("no PLTE","Corrupt PNG");
 9773-            if (scan == STBI__SCAN_header) {
 9774-               // header scan definitely stops at first IDAT
 9775-               if (pal_img_n)
 9776-                  s->img_n = pal_img_n;
 9777-               return 1;
 9778-            }
 9779-            if (c.length > (1u << 30)) return stbi__err("IDAT size limit", "IDAT section larger than 2^30 bytes");
 9780-            if ((int)(ioff + c.length) < (int)ioff) return 0;
 9781-            if (ioff + c.length > idata_limit) {
 9782-               stbi__uint32 idata_limit_old = idata_limit;
 9783-               stbi_uc *p;
 9784-               if (idata_limit == 0) idata_limit = c.length > 4096 ? c.length : 4096;
 9785-               while (ioff + c.length > idata_limit)
 9786-                  idata_limit *= 2;
 9787-               STBI_NOTUSED(idata_limit_old);
 9788-               p = (stbi_uc *) STBI_REALLOC_SIZED(z->idata, idata_limit_old, idata_limit); if (p == NULL) return stbi__err("outofmem", "Out of memory");
 9789-               z->idata = p;
 9790-            }
 9791-            if (!stbi__getn(s, z->idata+ioff,c.length)) return stbi__err("outofdata","Corrupt PNG");
 9792-            ioff += c.length;
 9793-            break;
 9794-         }
 9795-
 9796-         case STBI__PNG_TYPE('I','E','N','D'): {
 9797-            stbi__uint32 raw_len, bpl;
 9798-            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
 9799-            if (scan != STBI__SCAN_load) return 1;
 9800-            if (z->idata == NULL) return stbi__err("no IDAT","Corrupt PNG");
 9801-            // initial guess for decoded data size to avoid unnecessary reallocs
 9802-            bpl = (s->img_x * z->depth + 7) / 8; // bytes per line, per component
 9803-            raw_len = bpl * s->img_y * s->img_n /* pixels */ + s->img_y /* filter mode per row */;
 9804-            z->expanded = (stbi_uc *) stbi_zlib_decode_malloc_guesssize_headerflag((char *) z->idata, ioff, raw_len, (int *) &raw_len, !is_iphone);
 9805-            if (z->expanded == NULL) return 0; // zlib should set error
 9806-            STBI_FREE(z->idata); z->idata = NULL;
 9807-            if ((req_comp == s->img_n+1 && req_comp != 3 && !pal_img_n) || has_trans)
 9808-               s->img_out_n = s->img_n+1;
 9809-            else
 9810-               s->img_out_n = s->img_n;
 9811-            if (!stbi__create_png_image(z, z->expanded, raw_len, s->img_out_n, z->depth, color, interlace)) return 0;
 9812-            if (has_trans) {
 9813-               if (z->depth == 16) {
 9814-                  if (!stbi__compute_transparency16(z, tc16, s->img_out_n)) return 0;
 9815-               } else {
 9816-                  if (!stbi__compute_transparency(z, tc, s->img_out_n)) return 0;
 9817-               }
 9818-            }
 9819-            if (is_iphone && stbi__de_iphone_flag && s->img_out_n > 2)
 9820-               stbi__de_iphone(z);
 9821-            if (pal_img_n) {
 9822-               // pal_img_n == 3 or 4
 9823-               s->img_n = pal_img_n; // record the actual colors we had
 9824-               s->img_out_n = pal_img_n;
 9825-               if (req_comp >= 3) s->img_out_n = req_comp;
 9826-               if (!stbi__expand_png_palette(z, palette, pal_len, s->img_out_n))
 9827-                  return 0;
 9828-            } else if (has_trans) {
 9829-               // non-paletted image with tRNS -> source image has (constant) alpha
 9830-               ++s->img_n;
 9831-            }
 9832-            STBI_FREE(z->expanded); z->expanded = NULL;
 9833-            // end of PNG chunk, read and skip CRC
 9834-            stbi__get32be(s);
 9835-            return 1;
 9836-         }
 9837-
 9838-         default:
 9839-            // if critical, fail
 9840-            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
 9841-            if ((c.type & (1 << 29)) == 0) {
 9842-               #ifndef STBI_NO_FAILURE_STRINGS
 9843-               // not threadsafe
 9844-               static char invalid_chunk[] = "XXXX PNG chunk not known";
 9845-               invalid_chunk[0] = STBI__BYTECAST(c.type >> 24);
 9846-               invalid_chunk[1] = STBI__BYTECAST(c.type >> 16);
 9847-               invalid_chunk[2] = STBI__BYTECAST(c.type >>  8);
 9848-               invalid_chunk[3] = STBI__BYTECAST(c.type >>  0);
 9849-               #endif
 9850-               return stbi__err(invalid_chunk, "PNG not supported: unknown PNG chunk type");
 9851-            }
 9852-            stbi__skip(s, c.length);
 9853-            break;
 9854-      }
 9855-      // end of PNG chunk, read and skip CRC
 9856-      stbi__get32be(s);
 9857-   }
 9858-}
 9859-
 9860-static void *stbi__do_png(stbi__png *p, int *x, int *y, int *n, int req_comp, stbi__result_info *ri)
 9861-{
 9862-   void *result=NULL;
 9863-   if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error");
 9864-   if (stbi__parse_png_file(p, STBI__SCAN_load, req_comp)) {
 9865-      if (p->depth <= 8)
 9866-         ri->bits_per_channel = 8;
 9867-      else if (p->depth == 16)
 9868-         ri->bits_per_channel = 16;
 9869-      else
 9870-         return stbi__errpuc("bad bits_per_channel", "PNG not supported: unsupported color depth");
 9871-      result = p->out;
 9872-      p->out = NULL;
 9873-      if (req_comp && req_comp != p->s->img_out_n) {
 9874-         if (ri->bits_per_channel == 8)
 9875-            result = stbi__convert_format((unsigned char *) result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
 9876-         else
 9877-            result = stbi__convert_format16((stbi__uint16 *) result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
 9878-         p->s->img_out_n = req_comp;
 9879-         if (result == NULL) return result;
 9880-      }
 9881-      *x = p->s->img_x;
 9882-      *y = p->s->img_y;
 9883-      if (n) *n = p->s->img_n;
 9884-   }
 9885-   STBI_FREE(p->out);      p->out      = NULL;
 9886-   STBI_FREE(p->expanded); p->expanded = NULL;
 9887-   STBI_FREE(p->idata);    p->idata    = NULL;
 9888-
 9889-   return result;
 9890-}
 9891-
 9892-static void *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
 9893-{
 9894-   stbi__png p;
 9895-   p.s = s;
 9896-   return stbi__do_png(&p, x,y,comp,req_comp, ri);
 9897-}
 9898-
 9899-static int stbi__png_test(stbi__context *s)
 9900-{
 9901-   int r;
 9902-   r = stbi__check_png_header(s);
 9903-   stbi__rewind(s);
 9904-   return r;
 9905-}
 9906-
 9907-static int stbi__png_info_raw(stbi__png *p, int *x, int *y, int *comp)
 9908-{
 9909-   if (!stbi__parse_png_file(p, STBI__SCAN_header, 0)) {
 9910-      stbi__rewind( p->s );
 9911-      return 0;
 9912-   }
 9913-   if (x) *x = p->s->img_x;
 9914-   if (y) *y = p->s->img_y;
 9915-   if (comp) *comp = p->s->img_n;
 9916-   return 1;
 9917-}
 9918-
 9919-static int stbi__png_info(stbi__context *s, int *x, int *y, int *comp)
 9920-{
 9921-   stbi__png p;
 9922-   p.s = s;
 9923-   return stbi__png_info_raw(&p, x, y, comp);
 9924-}
 9925-
 9926-static int stbi__png_is16(stbi__context *s)
 9927-{
 9928-   stbi__png p;
 9929-   p.s = s;
 9930-   if (!stbi__png_info_raw(&p, NULL, NULL, NULL))
 9931-	   return 0;
 9932-   if (p.depth != 16) {
 9933-      stbi__rewind(p.s);
 9934-      return 0;
 9935-   }
 9936-   return 1;
 9937+static void
 9938+stbi__de_iphone(stbi__png *z)
 9939+{
 9940+	stbi__context *s = z->s;
 9941+	stbi__uint32 i, pixel_count = s->img_x * s->img_y;
 9942+	stbi_uc *p = z->out;
 9943+
 9944+	if (s->img_out_n == 3) { // convert bgr to rgb
 9945+		for (i = 0; i < pixel_count; ++i) {
 9946+			stbi_uc t = p[0];
 9947+			p[0] = p[2];
 9948+			p[2] = t;
 9949+			p += 3;
 9950+		}
 9951+	} else {
 9952+		STBI_ASSERT(s->img_out_n == 4);
 9953+		if (stbi__unpremultiply_on_load) {
 9954+			// convert bgr to rgb and unpremultiply
 9955+			for (i = 0; i < pixel_count; ++i) {
 9956+				stbi_uc a = p[3];
 9957+				stbi_uc t = p[0];
 9958+				if (a) {
 9959+					stbi_uc half = a / 2;
 9960+					p[0] = (p[2] * 255 + half) / a;
 9961+					p[1] = (p[1] * 255 + half) / a;
 9962+					p[2] = (t * 255 + half) / a;
 9963+				} else {
 9964+					p[0] = p[2];
 9965+					p[2] = t;
 9966+				}
 9967+				p += 4;
 9968+			}
 9969+		} else {
 9970+			// convert bgr to rgb
 9971+			for (i = 0; i < pixel_count; ++i) {
 9972+				stbi_uc t = p[0];
 9973+				p[0] = p[2];
 9974+				p[2] = t;
 9975+				p += 4;
 9976+			}
 9977+		}
 9978+	}
 9979+}
 9980+
 9981+#define STBI__PNG_TYPE(a, b, c, d)                                             \
 9982+	(((unsigned)(a) << 24) + ((unsigned)(b) << 16) + ((unsigned)(c) << 8) +    \
 9983+	 (unsigned)(d))
 9984+
 9985+static int
 9986+stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
 9987+{
 9988+	stbi_uc palette[1024], pal_img_n = 0;
 9989+	stbi_uc has_trans = 0, tc[3] = {0};
 9990+	stbi__uint16 tc16[3];
 9991+	stbi__uint32 ioff = 0, idata_limit = 0, i, pal_len = 0;
 9992+	int first = 1, k, interlace = 0, color = 0, is_iphone = 0;
 9993+	stbi__context *s = z->s;
 9994+
 9995+	z->expanded = NULL;
 9996+	z->idata = NULL;
 9997+	z->out = NULL;
 9998+
 9999+	if (!stbi__check_png_header(s)) {
10000+		return 0;
10001+	}
10002+
10003+	if (scan == STBI__SCAN_type) {
10004+		return 1;
10005+	}
10006+
10007+	for (;;) {
10008+		stbi__pngchunk c = stbi__get_chunk_header(s);
10009+		switch (c.type) {
10010+		case STBI__PNG_TYPE('C', 'g', 'B', 'I'):
10011+			is_iphone = 1;
10012+			stbi__skip(s, c.length);
10013+			break;
10014+		case STBI__PNG_TYPE('I', 'H', 'D', 'R'): {
10015+			int comp, filter;
10016+			if (!first) {
10017+				return stbi__err("multiple IHDR", "Corrupt PNG");
10018+			}
10019+			first = 0;
10020+			if (c.length != 13) {
10021+				return stbi__err("bad IHDR len", "Corrupt PNG");
10022+			}
10023+			s->img_x = stbi__get32be(s);
10024+			s->img_y = stbi__get32be(s);
10025+			if (s->img_y > STBI_MAX_DIMENSIONS) {
10026+				return stbi__err("too large", "Very large image (corrupt?)");
10027+			}
10028+			if (s->img_x > STBI_MAX_DIMENSIONS) {
10029+				return stbi__err("too large", "Very large image (corrupt?)");
10030+			}
10031+			z->depth = stbi__get8(s);
10032+			if (z->depth != 1 && z->depth != 2 && z->depth != 4 &&
10033+			    z->depth != 8 && z->depth != 16) {
10034+				return stbi__err("1/2/4/8/16-bit only",
10035+				                 "PNG not supported: 1/2/4/8/16-bit only");
10036+			}
10037+			color = stbi__get8(s);
10038+			if (color > 6) {
10039+				return stbi__err("bad ctype", "Corrupt PNG");
10040+			}
10041+			if (color == 3 && z->depth == 16) {
10042+				return stbi__err("bad ctype", "Corrupt PNG");
10043+			}
10044+			if (color == 3) {
10045+				pal_img_n = 3;
10046+			} else if (color & 1) {
10047+				return stbi__err("bad ctype", "Corrupt PNG");
10048+			}
10049+			comp = stbi__get8(s);
10050+			if (comp) {
10051+				return stbi__err("bad comp method", "Corrupt PNG");
10052+			}
10053+			filter = stbi__get8(s);
10054+			if (filter) {
10055+				return stbi__err("bad filter method", "Corrupt PNG");
10056+			}
10057+			interlace = stbi__get8(s);
10058+			if (interlace > 1) {
10059+				return stbi__err("bad interlace method", "Corrupt PNG");
10060+			}
10061+			if (!s->img_x || !s->img_y) {
10062+				return stbi__err("0-pixel image", "Corrupt PNG");
10063+			}
10064+			if (!pal_img_n) {
10065+				s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0);
10066+				if ((1 << 30) / s->img_x / s->img_n < s->img_y) {
10067+					return stbi__err("too large", "Image too large to decode");
10068+				}
10069+			} else {
10070+				// if paletted, then pal_n is our final components, and
10071+				// img_n is # components to decompress/filter.
10072+				s->img_n = 1;
10073+				if ((1 << 30) / s->img_x / 4 < s->img_y) {
10074+					return stbi__err("too large", "Corrupt PNG");
10075+				}
10076+			}
10077+			// even with SCAN_header, have to scan to see if we have a tRNS
10078+			break;
10079+		}
10080+
10081+		case STBI__PNG_TYPE('P', 'L', 'T', 'E'): {
10082+			if (first) {
10083+				return stbi__err("first not IHDR", "Corrupt PNG");
10084+			}
10085+			if (c.length > 256 * 3) {
10086+				return stbi__err("invalid PLTE", "Corrupt PNG");
10087+			}
10088+			pal_len = c.length / 3;
10089+			if (pal_len * 3 != c.length) {
10090+				return stbi__err("invalid PLTE", "Corrupt PNG");
10091+			}
10092+			for (i = 0; i < pal_len; ++i) {
10093+				palette[i * 4 + 0] = stbi__get8(s);
10094+				palette[i * 4 + 1] = stbi__get8(s);
10095+				palette[i * 4 + 2] = stbi__get8(s);
10096+				palette[i * 4 + 3] = 255;
10097+			}
10098+			break;
10099+		}
10100+
10101+		case STBI__PNG_TYPE('t', 'R', 'N', 'S'): {
10102+			if (first) {
10103+				return stbi__err("first not IHDR", "Corrupt PNG");
10104+			}
10105+			if (z->idata) {
10106+				return stbi__err("tRNS after IDAT", "Corrupt PNG");
10107+			}
10108+			if (pal_img_n) {
10109+				if (scan == STBI__SCAN_header) {
10110+					s->img_n = 4;
10111+					return 1;
10112+				}
10113+				if (pal_len == 0) {
10114+					return stbi__err("tRNS before PLTE", "Corrupt PNG");
10115+				}
10116+				if (c.length > pal_len) {
10117+					return stbi__err("bad tRNS len", "Corrupt PNG");
10118+				}
10119+				pal_img_n = 4;
10120+				for (i = 0; i < c.length; ++i) {
10121+					palette[i * 4 + 3] = stbi__get8(s);
10122+				}
10123+			} else {
10124+				if (!(s->img_n & 1)) {
10125+					return stbi__err("tRNS with alpha", "Corrupt PNG");
10126+				}
10127+				if (c.length != (stbi__uint32)s->img_n * 2) {
10128+					return stbi__err("bad tRNS len", "Corrupt PNG");
10129+				}
10130+				has_trans = 1;
10131+				// non-paletted with tRNS = constant alpha. if header-scanning,
10132+				// we can stop now.
10133+				if (scan == STBI__SCAN_header) {
10134+					++s->img_n;
10135+					return 1;
10136+				}
10137+				if (z->depth == 16) {
10138+					for (k = 0; k < s->img_n && k < 3;
10139+					     ++k) { // extra loop test to suppress false GCC warning
10140+						tc16[k] = (stbi__uint16)stbi__get16be(
10141+						    s); // copy the values as-is
10142+					}
10143+				} else {
10144+					for (k = 0; k < s->img_n && k < 3; ++k) {
10145+						tc[k] =
10146+						    (stbi_uc)(stbi__get16be(s) & 255) *
10147+						    stbi__depth_scale_table
10148+						        [z->depth]; // non 8-bit images will be larger
10149+					}
10150+				}
10151+			}
10152+			break;
10153+		}
10154+
10155+		case STBI__PNG_TYPE('I', 'D', 'A', 'T'): {
10156+			if (first) {
10157+				return stbi__err("first not IHDR", "Corrupt PNG");
10158+			}
10159+			if (pal_img_n && !pal_len) {
10160+				return stbi__err("no PLTE", "Corrupt PNG");
10161+			}
10162+			if (scan == STBI__SCAN_header) {
10163+				// header scan definitely stops at first IDAT
10164+				if (pal_img_n) {
10165+					s->img_n = pal_img_n;
10166+				}
10167+				return 1;
10168+			}
10169+			if (c.length > (1u << 30)) {
10170+				return stbi__err("IDAT size limit",
10171+				                 "IDAT section larger than 2^30 bytes");
10172+			}
10173+			if ((int)(ioff + c.length) < (int)ioff) {
10174+				return 0;
10175+			}
10176+			if (ioff + c.length > idata_limit) {
10177+				stbi__uint32 idata_limit_old = idata_limit;
10178+				stbi_uc *p;
10179+				if (idata_limit == 0) {
10180+					idata_limit = c.length > 4096 ? c.length : 4096;
10181+				}
10182+				while (ioff + c.length > idata_limit) {
10183+					idata_limit *= 2;
10184+				}
10185+				STBI_NOTUSED(idata_limit_old);
10186+				p = (stbi_uc *)STBI_REALLOC_SIZED(z->idata, idata_limit_old,
10187+				                                  idata_limit);
10188+				if (p == NULL) {
10189+					return stbi__err("outofmem", "Out of memory");
10190+				}
10191+				z->idata = p;
10192+			}
10193+			if (!stbi__getn(s, z->idata + ioff, c.length)) {
10194+				return stbi__err("outofdata", "Corrupt PNG");
10195+			}
10196+			ioff += c.length;
10197+			break;
10198+		}
10199+
10200+		case STBI__PNG_TYPE('I', 'E', 'N', 'D'): {
10201+			stbi__uint32 raw_len, bpl;
10202+			if (first) {
10203+				return stbi__err("first not IHDR", "Corrupt PNG");
10204+			}
10205+			if (scan != STBI__SCAN_load) {
10206+				return 1;
10207+			}
10208+			if (z->idata == NULL) {
10209+				return stbi__err("no IDAT", "Corrupt PNG");
10210+			}
10211+			// initial guess for decoded data size to avoid unnecessary reallocs
10212+			bpl =
10213+			    (s->img_x * z->depth + 7) / 8; // bytes per line, per component
10214+			raw_len = bpl * s->img_y * s->img_n /* pixels */ +
10215+			          s->img_y /* filter mode per row */;
10216+			z->expanded =
10217+			    (stbi_uc *)stbi_zlib_decode_malloc_guesssize_headerflag(
10218+			        (char *)z->idata, ioff, raw_len, (int *)&raw_len,
10219+			        !is_iphone);
10220+			if (z->expanded == NULL) {
10221+				return 0; // zlib should set error
10222+			}
10223+			STBI_FREE(z->idata);
10224+			z->idata = NULL;
10225+			if ((req_comp == s->img_n + 1 && req_comp != 3 && !pal_img_n) ||
10226+			    has_trans) {
10227+				s->img_out_n = s->img_n + 1;
10228+			} else {
10229+				s->img_out_n = s->img_n;
10230+			}
10231+			if (!stbi__create_png_image(z, z->expanded, raw_len, s->img_out_n,
10232+			                            z->depth, color, interlace)) {
10233+				return 0;
10234+			}
10235+			if (has_trans) {
10236+				if (z->depth == 16) {
10237+					if (!stbi__compute_transparency16(z, tc16, s->img_out_n)) {
10238+						return 0;
10239+					}
10240+				} else {
10241+					if (!stbi__compute_transparency(z, tc, s->img_out_n)) {
10242+						return 0;
10243+					}
10244+				}
10245+			}
10246+			if (is_iphone && stbi__de_iphone_flag && s->img_out_n > 2) {
10247+				stbi__de_iphone(z);
10248+			}
10249+			if (pal_img_n) {
10250+				// pal_img_n == 3 or 4
10251+				s->img_n = pal_img_n; // record the actual colors we had
10252+				s->img_out_n = pal_img_n;
10253+				if (req_comp >= 3) {
10254+					s->img_out_n = req_comp;
10255+				}
10256+				if (!stbi__expand_png_palette(z, palette, pal_len,
10257+				                              s->img_out_n)) {
10258+					return 0;
10259+				}
10260+			} else if (has_trans) {
10261+				// non-paletted image with tRNS -> source image has (constant)
10262+				// alpha
10263+				++s->img_n;
10264+			}
10265+			STBI_FREE(z->expanded);
10266+			z->expanded = NULL;
10267+			// end of PNG chunk, read and skip CRC
10268+			stbi__get32be(s);
10269+			return 1;
10270+		}
10271+
10272+		default:
10273+			// if critical, fail
10274+			if (first) {
10275+				return stbi__err("first not IHDR", "Corrupt PNG");
10276+			}
10277+			if ((c.type & (1 << 29)) == 0) {
10278+#ifndef STBI_NO_FAILURE_STRINGS
10279+				// not threadsafe
10280+				static char invalid_chunk[] = "XXXX PNG chunk not known";
10281+				invalid_chunk[0] = STBI__BYTECAST(c.type >> 24);
10282+				invalid_chunk[1] = STBI__BYTECAST(c.type >> 16);
10283+				invalid_chunk[2] = STBI__BYTECAST(c.type >> 8);
10284+				invalid_chunk[3] = STBI__BYTECAST(c.type >> 0);
10285+#endif
10286+				return stbi__err(invalid_chunk,
10287+				                 "PNG not supported: unknown PNG chunk type");
10288+			}
10289+			stbi__skip(s, c.length);
10290+			break;
10291+		}
10292+		// end of PNG chunk, read and skip CRC
10293+		stbi__get32be(s);
10294+	}
10295+}
10296+
10297+static void *
10298+stbi__do_png(stbi__png *p, int *x, int *y, int *n, int req_comp,
10299+             stbi__result_info *ri)
10300+{
10301+	void *result = NULL;
10302+	if (req_comp < 0 || req_comp > 4) {
10303+		return stbi__errpuc("bad req_comp", "Internal error");
10304+	}
10305+	if (stbi__parse_png_file(p, STBI__SCAN_load, req_comp)) {
10306+		if (p->depth <= 8) {
10307+			ri->bits_per_channel = 8;
10308+		} else if (p->depth == 16) {
10309+			ri->bits_per_channel = 16;
10310+		} else {
10311+			return stbi__errpuc("bad bits_per_channel",
10312+			                    "PNG not supported: unsupported color depth");
10313+		}
10314+		result = p->out;
10315+		p->out = NULL;
10316+		if (req_comp && req_comp != p->s->img_out_n) {
10317+			if (ri->bits_per_channel == 8) {
10318+				result = stbi__convert_format((unsigned char *)result,
10319+				                              p->s->img_out_n, req_comp,
10320+				                              p->s->img_x, p->s->img_y);
10321+			} else {
10322+				result = stbi__convert_format16((stbi__uint16 *)result,
10323+				                                p->s->img_out_n, req_comp,
10324+				                                p->s->img_x, p->s->img_y);
10325+			}
10326+			p->s->img_out_n = req_comp;
10327+			if (result == NULL) {
10328+				return result;
10329+			}
10330+		}
10331+		*x = p->s->img_x;
10332+		*y = p->s->img_y;
10333+		if (n) {
10334+			*n = p->s->img_n;
10335+		}
10336+	}
10337+	STBI_FREE(p->out);
10338+	p->out = NULL;
10339+	STBI_FREE(p->expanded);
10340+	p->expanded = NULL;
10341+	STBI_FREE(p->idata);
10342+	p->idata = NULL;
10343+
10344+	return result;
10345+}
10346+
10347+static void *
10348+stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp,
10349+               stbi__result_info *ri)
10350+{
10351+	stbi__png p;
10352+	p.s = s;
10353+	return stbi__do_png(&p, x, y, comp, req_comp, ri);
10354+}
10355+
10356+static int
10357+stbi__png_test(stbi__context *s)
10358+{
10359+	int r;
10360+	r = stbi__check_png_header(s);
10361+	stbi__rewind(s);
10362+	return r;
10363+}
10364+
10365+static int
10366+stbi__png_info_raw(stbi__png *p, int *x, int *y, int *comp)
10367+{
10368+	if (!stbi__parse_png_file(p, STBI__SCAN_header, 0)) {
10369+		stbi__rewind(p->s);
10370+		return 0;
10371+	}
10372+	if (x) {
10373+		*x = p->s->img_x;
10374+	}
10375+	if (y) {
10376+		*y = p->s->img_y;
10377+	}
10378+	if (comp) {
10379+		*comp = p->s->img_n;
10380+	}
10381+	return 1;
10382+}
10383+
10384+static int
10385+stbi__png_info(stbi__context *s, int *x, int *y, int *comp)
10386+{
10387+	stbi__png p;
10388+	p.s = s;
10389+	return stbi__png_info_raw(&p, x, y, comp);
10390+}
10391+
10392+static int
10393+stbi__png_is16(stbi__context *s)
10394+{
10395+	stbi__png p;
10396+	p.s = s;
10397+	if (!stbi__png_info_raw(&p, NULL, NULL, NULL)) {
10398+		return 0;
10399+	}
10400+	if (p.depth != 16) {
10401+		stbi__rewind(p.s);
10402+		return 0;
10403+	}
10404+	return 1;
10405 }
10406 #endif
10407 
10408 // Microsoft/Windows BMP image
10409 
10410 #ifndef STBI_NO_BMP
10411-static int stbi__bmp_test_raw(stbi__context *s)
10412-{
10413-   int r;
10414-   int sz;
10415-   if (stbi__get8(s) != 'B') return 0;
10416-   if (stbi__get8(s) != 'M') return 0;
10417-   stbi__get32le(s); // discard filesize
10418-   stbi__get16le(s); // discard reserved
10419-   stbi__get16le(s); // discard reserved
10420-   stbi__get32le(s); // discard data offset
10421-   sz = stbi__get32le(s);
10422-   r = (sz == 12 || sz == 40 || sz == 56 || sz == 108 || sz == 124);
10423-   return r;
10424+static int
10425+stbi__bmp_test_raw(stbi__context *s)
10426+{
10427+	int r;
10428+	int sz;
10429+	if (stbi__get8(s) != 'B') {
10430+		return 0;
10431+	}
10432+	if (stbi__get8(s) != 'M') {
10433+		return 0;
10434+	}
10435+	stbi__get32le(s); // discard filesize
10436+	stbi__get16le(s); // discard reserved
10437+	stbi__get16le(s); // discard reserved
10438+	stbi__get32le(s); // discard data offset
10439+	sz = stbi__get32le(s);
10440+	r = (sz == 12 || sz == 40 || sz == 56 || sz == 108 || sz == 124);
10441+	return r;
10442+}
10443+
10444+static int
10445+stbi__bmp_test(stbi__context *s)
10446+{
10447+	int r = stbi__bmp_test_raw(s);
10448+	stbi__rewind(s);
10449+	return r;
10450 }
10451 
10452-static int stbi__bmp_test(stbi__context *s)
10453-{
10454-   int r = stbi__bmp_test_raw(s);
10455-   stbi__rewind(s);
10456-   return r;
10457-}
10458-
10459-
10460 // returns 0..31 for the highest set bit
10461-static int stbi__high_bit(unsigned int z)
10462-{
10463-   int n=0;
10464-   if (z == 0) return -1;
10465-   if (z >= 0x10000) { n += 16; z >>= 16; }
10466-   if (z >= 0x00100) { n +=  8; z >>=  8; }
10467-   if (z >= 0x00010) { n +=  4; z >>=  4; }
10468-   if (z >= 0x00004) { n +=  2; z >>=  2; }
10469-   if (z >= 0x00002) { n +=  1;/* >>=  1;*/ }
10470-   return n;
10471-}
10472-
10473-static int stbi__bitcount(unsigned int a)
10474-{
10475-   a = (a & 0x55555555) + ((a >>  1) & 0x55555555); // max 2
10476-   a = (a & 0x33333333) + ((a >>  2) & 0x33333333); // max 4
10477-   a = (a + (a >> 4)) & 0x0f0f0f0f; // max 8 per 4, now 8 bits
10478-   a = (a + (a >> 8)); // max 16 per 8 bits
10479-   a = (a + (a >> 16)); // max 32 per 8 bits
10480-   return a & 0xff;
10481+static int
10482+stbi__high_bit(unsigned int z)
10483+{
10484+	int n = 0;
10485+	if (z == 0) {
10486+		return -1;
10487+	}
10488+	if (z >= 0x10000) {
10489+		n += 16;
10490+		z >>= 16;
10491+	}
10492+	if (z >= 0x00100) {
10493+		n += 8;
10494+		z >>= 8;
10495+	}
10496+	if (z >= 0x00010) {
10497+		n += 4;
10498+		z >>= 4;
10499+	}
10500+	if (z >= 0x00004) {
10501+		n += 2;
10502+		z >>= 2;
10503+	}
10504+	if (z >= 0x00002) {
10505+		n += 1; /* >>=  1;*/
10506+	}
10507+	return n;
10508+}
10509+
10510+static int
10511+stbi__bitcount(unsigned int a)
10512+{
10513+	a = (a & 0x55555555) + ((a >> 1) & 0x55555555); // max 2
10514+	a = (a & 0x33333333) + ((a >> 2) & 0x33333333); // max 4
10515+	a = (a + (a >> 4)) & 0x0f0f0f0f;                // max 8 per 4, now 8 bits
10516+	a = (a + (a >> 8));                             // max 16 per 8 bits
10517+	a = (a + (a >> 16));                            // max 32 per 8 bits
10518+	return a & 0xff;
10519 }
10520 
10521 // extract an arbitrarily-aligned N-bit value (N=bits)
10522 // from v, and then make it 8-bits long and fractionally
10523 // extend it to full full range.
10524-static int stbi__shiftsigned(unsigned int v, int shift, int bits)
10525-{
10526-   static unsigned int mul_table[9] = {
10527-      0,
10528-      0xff/*0b11111111*/, 0x55/*0b01010101*/, 0x49/*0b01001001*/, 0x11/*0b00010001*/,
10529-      0x21/*0b00100001*/, 0x41/*0b01000001*/, 0x81/*0b10000001*/, 0x01/*0b00000001*/,
10530-   };
10531-   static unsigned int shift_table[9] = {
10532-      0, 0,0,1,0,2,4,6,0,
10533-   };
10534-   if (shift < 0)
10535-      v <<= -shift;
10536-   else
10537-      v >>= shift;
10538-   STBI_ASSERT(v < 256);
10539-   v >>= (8-bits);
10540-   STBI_ASSERT(bits >= 0 && bits <= 8);
10541-   return (int) ((unsigned) v * mul_table[bits]) >> shift_table[bits];
10542-}
10543-
10544-typedef struct
10545-{
10546-   int bpp, offset, hsz;
10547-   unsigned int mr,mg,mb,ma, all_a;
10548-   int extra_read;
10549+static int
10550+stbi__shiftsigned(unsigned int v, int shift, int bits)
10551+{
10552+	static unsigned int mul_table[9] = {
10553+	    0,
10554+	    0xff /*0b11111111*/,
10555+	    0x55 /*0b01010101*/,
10556+	    0x49 /*0b01001001*/,
10557+	    0x11 /*0b00010001*/,
10558+	    0x21 /*0b00100001*/,
10559+	    0x41 /*0b01000001*/,
10560+	    0x81 /*0b10000001*/,
10561+	    0x01 /*0b00000001*/,
10562+	};
10563+	static unsigned int shift_table[9] = {
10564+	    0, 0, 0, 1, 0, 2, 4, 6, 0,
10565+	};
10566+	if (shift < 0) {
10567+		v <<= -shift;
10568+	} else {
10569+		v >>= shift;
10570+	}
10571+	STBI_ASSERT(v < 256);
10572+	v >>= (8 - bits);
10573+	STBI_ASSERT(bits >= 0 && bits <= 8);
10574+	return (int)((unsigned)v * mul_table[bits]) >> shift_table[bits];
10575+}
10576+
10577+typedef struct {
10578+	int bpp, offset, hsz;
10579+	unsigned int mr, mg, mb, ma, all_a;
10580+	int extra_read;
10581 } stbi__bmp_data;
10582 
10583-static int stbi__bmp_set_mask_defaults(stbi__bmp_data *info, int compress)
10584-{
10585-   // BI_BITFIELDS specifies masks explicitly, don't override
10586-   if (compress == 3)
10587-      return 1;
10588-
10589-   if (compress == 0) {
10590-      if (info->bpp == 16) {
10591-         info->mr = 31u << 10;
10592-         info->mg = 31u <<  5;
10593-         info->mb = 31u <<  0;
10594-      } else if (info->bpp == 32) {
10595-         info->mr = 0xffu << 16;
10596-         info->mg = 0xffu <<  8;
10597-         info->mb = 0xffu <<  0;
10598-         info->ma = 0xffu << 24;
10599-         info->all_a = 0; // if all_a is 0 at end, then we loaded alpha channel but it was all 0
10600-      } else {
10601-         // otherwise, use defaults, which is all-0
10602-         info->mr = info->mg = info->mb = info->ma = 0;
10603-      }
10604-      return 1;
10605-   }
10606-   return 0; // error
10607-}
10608-
10609-static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info)
10610-{
10611-   int hsz;
10612-   if (stbi__get8(s) != 'B' || stbi__get8(s) != 'M') return stbi__errpuc("not BMP", "Corrupt BMP");
10613-   stbi__get32le(s); // discard filesize
10614-   stbi__get16le(s); // discard reserved
10615-   stbi__get16le(s); // discard reserved
10616-   info->offset = stbi__get32le(s);
10617-   info->hsz = hsz = stbi__get32le(s);
10618-   info->mr = info->mg = info->mb = info->ma = 0;
10619-   info->extra_read = 14;
10620-
10621-   if (info->offset < 0) return stbi__errpuc("bad BMP", "bad BMP");
10622-
10623-   if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124) return stbi__errpuc("unknown BMP", "BMP type not supported: unknown");
10624-   if (hsz == 12) {
10625-      s->img_x = stbi__get16le(s);
10626-      s->img_y = stbi__get16le(s);
10627-   } else {
10628-      s->img_x = stbi__get32le(s);
10629-      s->img_y = stbi__get32le(s);
10630-   }
10631-   if (stbi__get16le(s) != 1) return stbi__errpuc("bad BMP", "bad BMP");
10632-   info->bpp = stbi__get16le(s);
10633-   if (hsz != 12) {
10634-      int compress = stbi__get32le(s);
10635-      if (compress == 1 || compress == 2) return stbi__errpuc("BMP RLE", "BMP type not supported: RLE");
10636-      if (compress >= 4) return stbi__errpuc("BMP JPEG/PNG", "BMP type not supported: unsupported compression"); // this includes PNG/JPEG modes
10637-      if (compress == 3 && info->bpp != 16 && info->bpp != 32) return stbi__errpuc("bad BMP", "bad BMP"); // bitfields requires 16 or 32 bits/pixel
10638-      stbi__get32le(s); // discard sizeof
10639-      stbi__get32le(s); // discard hres
10640-      stbi__get32le(s); // discard vres
10641-      stbi__get32le(s); // discard colorsused
10642-      stbi__get32le(s); // discard max important
10643-      if (hsz == 40 || hsz == 56) {
10644-         if (hsz == 56) {
10645-            stbi__get32le(s);
10646-            stbi__get32le(s);
10647-            stbi__get32le(s);
10648-            stbi__get32le(s);
10649-         }
10650-         if (info->bpp == 16 || info->bpp == 32) {
10651-            if (compress == 0) {
10652-               stbi__bmp_set_mask_defaults(info, compress);
10653-            } else if (compress == 3) {
10654-               info->mr = stbi__get32le(s);
10655-               info->mg = stbi__get32le(s);
10656-               info->mb = stbi__get32le(s);
10657-               info->extra_read += 12;
10658-               // not documented, but generated by photoshop and handled by mspaint
10659-               if (info->mr == info->mg && info->mg == info->mb) {
10660-                  // ?!?!?
10661-                  return stbi__errpuc("bad BMP", "bad BMP");
10662-               }
10663-            } else
10664-               return stbi__errpuc("bad BMP", "bad BMP");
10665-         }
10666-      } else {
10667-         // V4/V5 header
10668-         int i;
10669-         if (hsz != 108 && hsz != 124)
10670-            return stbi__errpuc("bad BMP", "bad BMP");
10671-         info->mr = stbi__get32le(s);
10672-         info->mg = stbi__get32le(s);
10673-         info->mb = stbi__get32le(s);
10674-         info->ma = stbi__get32le(s);
10675-         if (compress != 3) // override mr/mg/mb unless in BI_BITFIELDS mode, as per docs
10676-            stbi__bmp_set_mask_defaults(info, compress);
10677-         stbi__get32le(s); // discard color space
10678-         for (i=0; i < 12; ++i)
10679-            stbi__get32le(s); // discard color space parameters
10680-         if (hsz == 124) {
10681-            stbi__get32le(s); // discard rendering intent
10682-            stbi__get32le(s); // discard offset of profile data
10683-            stbi__get32le(s); // discard size of profile data
10684-            stbi__get32le(s); // discard reserved
10685-         }
10686-      }
10687-   }
10688-   return (void *) 1;
10689-}
10690-
10691-
10692-static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
10693-{
10694-   stbi_uc *out;
10695-   unsigned int mr=0,mg=0,mb=0,ma=0, all_a;
10696-   stbi_uc pal[256][4];
10697-   int psize=0,i,j,width;
10698-   int flip_vertically, pad, target;
10699-   stbi__bmp_data info;
10700-   STBI_NOTUSED(ri);
10701-
10702-   info.all_a = 255;
10703-   if (stbi__bmp_parse_header(s, &info) == NULL)
10704-      return NULL; // error code already set
10705-
10706-   flip_vertically = ((int) s->img_y) > 0;
10707-   s->img_y = abs((int) s->img_y);
10708-
10709-   if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
10710-   if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
10711-
10712-   mr = info.mr;
10713-   mg = info.mg;
10714-   mb = info.mb;
10715-   ma = info.ma;
10716-   all_a = info.all_a;
10717-
10718-   if (info.hsz == 12) {
10719-      if (info.bpp < 24)
10720-         psize = (info.offset - info.extra_read - 24) / 3;
10721-   } else {
10722-      if (info.bpp < 16)
10723-         psize = (info.offset - info.extra_read - info.hsz) >> 2;
10724-   }
10725-   if (psize == 0) {
10726-      // accept some number of extra bytes after the header, but if the offset points either to before
10727-      // the header ends or implies a large amount of extra data, reject the file as malformed
10728-      int bytes_read_so_far = s->callback_already_read + (int)(s->img_buffer - s->img_buffer_original);
10729-      int header_limit = 1024; // max we actually read is below 256 bytes currently.
10730-      int extra_data_limit = 256*4; // what ordinarily goes here is a palette; 256 entries*4 bytes is its max size.
10731-      if (bytes_read_so_far <= 0 || bytes_read_so_far > header_limit) {
10732-         return stbi__errpuc("bad header", "Corrupt BMP");
10733-      }
10734-      // we established that bytes_read_so_far is positive and sensible.
10735-      // the first half of this test rejects offsets that are either too small positives, or
10736-      // negative, and guarantees that info.offset >= bytes_read_so_far > 0. this in turn
10737-      // ensures the number computed in the second half of the test can't overflow.
10738-      if (info.offset < bytes_read_so_far || info.offset - bytes_read_so_far > extra_data_limit) {
10739-         return stbi__errpuc("bad offset", "Corrupt BMP");
10740-      } else {
10741-         stbi__skip(s, info.offset - bytes_read_so_far);
10742-      }
10743-   }
10744-
10745-   if (info.bpp == 24 && ma == 0xff000000)
10746-      s->img_n = 3;
10747-   else
10748-      s->img_n = ma ? 4 : 3;
10749-   if (req_comp && req_comp >= 3) // we can directly decode 3 or 4
10750-      target = req_comp;
10751-   else
10752-      target = s->img_n; // if they want monochrome, we'll post-convert
10753-
10754-   // sanity-check size
10755-   if (!stbi__mad3sizes_valid(target, s->img_x, s->img_y, 0))
10756-      return stbi__errpuc("too large", "Corrupt BMP");
10757-
10758-   out = (stbi_uc *) stbi__malloc_mad3(target, s->img_x, s->img_y, 0);
10759-   if (!out) return stbi__errpuc("outofmem", "Out of memory");
10760-   if (info.bpp < 16) {
10761-      int z=0;
10762-      if (psize == 0 || psize > 256) { STBI_FREE(out); return stbi__errpuc("invalid", "Corrupt BMP"); }
10763-      for (i=0; i < psize; ++i) {
10764-         pal[i][2] = stbi__get8(s);
10765-         pal[i][1] = stbi__get8(s);
10766-         pal[i][0] = stbi__get8(s);
10767-         if (info.hsz != 12) stbi__get8(s);
10768-         pal[i][3] = 255;
10769-      }
10770-      stbi__skip(s, info.offset - info.extra_read - info.hsz - psize * (info.hsz == 12 ? 3 : 4));
10771-      if (info.bpp == 1) width = (s->img_x + 7) >> 3;
10772-      else if (info.bpp == 4) width = (s->img_x + 1) >> 1;
10773-      else if (info.bpp == 8) width = s->img_x;
10774-      else { STBI_FREE(out); return stbi__errpuc("bad bpp", "Corrupt BMP"); }
10775-      pad = (-width)&3;
10776-      if (info.bpp == 1) {
10777-         for (j=0; j < (int) s->img_y; ++j) {
10778-            int bit_offset = 7, v = stbi__get8(s);
10779-            for (i=0; i < (int) s->img_x; ++i) {
10780-               int color = (v>>bit_offset)&0x1;
10781-               out[z++] = pal[color][0];
10782-               out[z++] = pal[color][1];
10783-               out[z++] = pal[color][2];
10784-               if (target == 4) out[z++] = 255;
10785-               if (i+1 == (int) s->img_x) break;
10786-               if((--bit_offset) < 0) {
10787-                  bit_offset = 7;
10788-                  v = stbi__get8(s);
10789-               }
10790-            }
10791-            stbi__skip(s, pad);
10792-         }
10793-      } else {
10794-         for (j=0; j < (int) s->img_y; ++j) {
10795-            for (i=0; i < (int) s->img_x; i += 2) {
10796-               int v=stbi__get8(s),v2=0;
10797-               if (info.bpp == 4) {
10798-                  v2 = v & 15;
10799-                  v >>= 4;
10800-               }
10801-               out[z++] = pal[v][0];
10802-               out[z++] = pal[v][1];
10803-               out[z++] = pal[v][2];
10804-               if (target == 4) out[z++] = 255;
10805-               if (i+1 == (int) s->img_x) break;
10806-               v = (info.bpp == 8) ? stbi__get8(s) : v2;
10807-               out[z++] = pal[v][0];
10808-               out[z++] = pal[v][1];
10809-               out[z++] = pal[v][2];
10810-               if (target == 4) out[z++] = 255;
10811-            }
10812-            stbi__skip(s, pad);
10813-         }
10814-      }
10815-   } else {
10816-      int rshift=0,gshift=0,bshift=0,ashift=0,rcount=0,gcount=0,bcount=0,acount=0;
10817-      int z = 0;
10818-      int easy=0;
10819-      stbi__skip(s, info.offset - info.extra_read - info.hsz);
10820-      if (info.bpp == 24) width = 3 * s->img_x;
10821-      else if (info.bpp == 16) width = 2*s->img_x;
10822-      else /* bpp = 32 and pad = 0 */ width=0;
10823-      pad = (-width) & 3;
10824-      if (info.bpp == 24) {
10825-         easy = 1;
10826-      } else if (info.bpp == 32) {
10827-         if (mb == 0xff && mg == 0xff00 && mr == 0x00ff0000 && ma == 0xff000000)
10828-            easy = 2;
10829-      }
10830-      if (!easy) {
10831-         if (!mr || !mg || !mb) { STBI_FREE(out); return stbi__errpuc("bad masks", "Corrupt BMP"); }
10832-         // right shift amt to put high bit in position #7
10833-         rshift = stbi__high_bit(mr)-7; rcount = stbi__bitcount(mr);
10834-         gshift = stbi__high_bit(mg)-7; gcount = stbi__bitcount(mg);
10835-         bshift = stbi__high_bit(mb)-7; bcount = stbi__bitcount(mb);
10836-         ashift = stbi__high_bit(ma)-7; acount = stbi__bitcount(ma);
10837-         if (rcount > 8 || gcount > 8 || bcount > 8 || acount > 8) { STBI_FREE(out); return stbi__errpuc("bad masks", "Corrupt BMP"); }
10838-      }
10839-      for (j=0; j < (int) s->img_y; ++j) {
10840-         if (easy) {
10841-            for (i=0; i < (int) s->img_x; ++i) {
10842-               unsigned char a;
10843-               out[z+2] = stbi__get8(s);
10844-               out[z+1] = stbi__get8(s);
10845-               out[z+0] = stbi__get8(s);
10846-               z += 3;
10847-               a = (easy == 2 ? stbi__get8(s) : 255);
10848-               all_a |= a;
10849-               if (target == 4) out[z++] = a;
10850-            }
10851-         } else {
10852-            int bpp = info.bpp;
10853-            for (i=0; i < (int) s->img_x; ++i) {
10854-               stbi__uint32 v = (bpp == 16 ? (stbi__uint32) stbi__get16le(s) : stbi__get32le(s));
10855-               unsigned int a;
10856-               out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mr, rshift, rcount));
10857-               out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mg, gshift, gcount));
10858-               out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mb, bshift, bcount));
10859-               a = (ma ? stbi__shiftsigned(v & ma, ashift, acount) : 255);
10860-               all_a |= a;
10861-               if (target == 4) out[z++] = STBI__BYTECAST(a);
10862-            }
10863-         }
10864-         stbi__skip(s, pad);
10865-      }
10866-   }
10867-
10868-   // if alpha channel is all 0s, replace with all 255s
10869-   if (target == 4 && all_a == 0)
10870-      for (i=4*s->img_x*s->img_y-1; i >= 0; i -= 4)
10871-         out[i] = 255;
10872-
10873-   if (flip_vertically) {
10874-      stbi_uc t;
10875-      for (j=0; j < (int) s->img_y>>1; ++j) {
10876-         stbi_uc *p1 = out +      j     *s->img_x*target;
10877-         stbi_uc *p2 = out + (s->img_y-1-j)*s->img_x*target;
10878-         for (i=0; i < (int) s->img_x*target; ++i) {
10879-            t = p1[i]; p1[i] = p2[i]; p2[i] = t;
10880-         }
10881-      }
10882-   }
10883-
10884-   if (req_comp && req_comp != target) {
10885-      out = stbi__convert_format(out, target, req_comp, s->img_x, s->img_y);
10886-      if (out == NULL) return out; // stbi__convert_format frees input on failure
10887-   }
10888-
10889-   *x = s->img_x;
10890-   *y = s->img_y;
10891-   if (comp) *comp = s->img_n;
10892-   return out;
10893+static int
10894+stbi__bmp_set_mask_defaults(stbi__bmp_data *info, int compress)
10895+{
10896+	// BI_BITFIELDS specifies masks explicitly, don't override
10897+	if (compress == 3) {
10898+		return 1;
10899+	}
10900+
10901+	if (compress == 0) {
10902+		if (info->bpp == 16) {
10903+			info->mr = 31u << 10;
10904+			info->mg = 31u << 5;
10905+			info->mb = 31u << 0;
10906+		} else if (info->bpp == 32) {
10907+			info->mr = 0xffu << 16;
10908+			info->mg = 0xffu << 8;
10909+			info->mb = 0xffu << 0;
10910+			info->ma = 0xffu << 24;
10911+			info->all_a = 0; // if all_a is 0 at end, then we loaded alpha
10912+			                 // channel but it was all 0
10913+		} else {
10914+			// otherwise, use defaults, which is all-0
10915+			info->mr = info->mg = info->mb = info->ma = 0;
10916+		}
10917+		return 1;
10918+	}
10919+	return 0; // error
10920+}
10921+
10922+static void *
10923+stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info)
10924+{
10925+	int hsz;
10926+	if (stbi__get8(s) != 'B' || stbi__get8(s) != 'M') {
10927+		return stbi__errpuc("not BMP", "Corrupt BMP");
10928+	}
10929+	stbi__get32le(s); // discard filesize
10930+	stbi__get16le(s); // discard reserved
10931+	stbi__get16le(s); // discard reserved
10932+	info->offset = stbi__get32le(s);
10933+	info->hsz = hsz = stbi__get32le(s);
10934+	info->mr = info->mg = info->mb = info->ma = 0;
10935+	info->extra_read = 14;
10936+
10937+	if (info->offset < 0) {
10938+		return stbi__errpuc("bad BMP", "bad BMP");
10939+	}
10940+
10941+	if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124) {
10942+		return stbi__errpuc("unknown BMP", "BMP type not supported: unknown");
10943+	}
10944+	if (hsz == 12) {
10945+		s->img_x = stbi__get16le(s);
10946+		s->img_y = stbi__get16le(s);
10947+	} else {
10948+		s->img_x = stbi__get32le(s);
10949+		s->img_y = stbi__get32le(s);
10950+	}
10951+	if (stbi__get16le(s) != 1) {
10952+		return stbi__errpuc("bad BMP", "bad BMP");
10953+	}
10954+	info->bpp = stbi__get16le(s);
10955+	if (hsz != 12) {
10956+		int compress = stbi__get32le(s);
10957+		if (compress == 1 || compress == 2) {
10958+			return stbi__errpuc("BMP RLE", "BMP type not supported: RLE");
10959+		}
10960+		if (compress >= 4) {
10961+			return stbi__errpuc(
10962+			    "BMP JPEG/PNG",
10963+			    "BMP type not supported: unsupported compression"); // this
10964+			                                                        // includes
10965+			                                                        // PNG/JPEG
10966+			                                                        // modes
10967+		}
10968+		if (compress == 3 && info->bpp != 16 && info->bpp != 32) {
10969+			return stbi__errpuc(
10970+			    "bad BMP", "bad BMP"); // bitfields requires 16 or 32 bits/pixel
10971+		}
10972+		stbi__get32le(s); // discard sizeof
10973+		stbi__get32le(s); // discard hres
10974+		stbi__get32le(s); // discard vres
10975+		stbi__get32le(s); // discard colorsused
10976+		stbi__get32le(s); // discard max important
10977+		if (hsz == 40 || hsz == 56) {
10978+			if (hsz == 56) {
10979+				stbi__get32le(s);
10980+				stbi__get32le(s);
10981+				stbi__get32le(s);
10982+				stbi__get32le(s);
10983+			}
10984+			if (info->bpp == 16 || info->bpp == 32) {
10985+				if (compress == 0) {
10986+					stbi__bmp_set_mask_defaults(info, compress);
10987+				} else if (compress == 3) {
10988+					info->mr = stbi__get32le(s);
10989+					info->mg = stbi__get32le(s);
10990+					info->mb = stbi__get32le(s);
10991+					info->extra_read += 12;
10992+					// not documented, but generated by photoshop and handled by
10993+					// mspaint
10994+					if (info->mr == info->mg && info->mg == info->mb) {
10995+						// ?!?!?
10996+						return stbi__errpuc("bad BMP", "bad BMP");
10997+					}
10998+				} else {
10999+					return stbi__errpuc("bad BMP", "bad BMP");
11000+				}
11001+			}
11002+		} else {
11003+			// V4/V5 header
11004+			int i;
11005+			if (hsz != 108 && hsz != 124) {
11006+				return stbi__errpuc("bad BMP", "bad BMP");
11007+			}
11008+			info->mr = stbi__get32le(s);
11009+			info->mg = stbi__get32le(s);
11010+			info->mb = stbi__get32le(s);
11011+			info->ma = stbi__get32le(s);
11012+			if (compress != 3) { // override mr/mg/mb unless in BI_BITFIELDS
11013+				                 // mode, as per docs
11014+				stbi__bmp_set_mask_defaults(info, compress);
11015+			}
11016+			stbi__get32le(s); // discard color space
11017+			for (i = 0; i < 12; ++i) {
11018+				stbi__get32le(s); // discard color space parameters
11019+			}
11020+			if (hsz == 124) {
11021+				stbi__get32le(s); // discard rendering intent
11022+				stbi__get32le(s); // discard offset of profile data
11023+				stbi__get32le(s); // discard size of profile data
11024+				stbi__get32le(s); // discard reserved
11025+			}
11026+		}
11027+	}
11028+	return (void *)1;
11029+}
11030+
11031+static void *
11032+stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp,
11033+               stbi__result_info *ri)
11034+{
11035+	stbi_uc *out;
11036+	unsigned int mr = 0, mg = 0, mb = 0, ma = 0, all_a;
11037+	stbi_uc pal[256][4];
11038+	int psize = 0, i, j, width;
11039+	int flip_vertically, pad, target;
11040+	stbi__bmp_data info;
11041+	STBI_NOTUSED(ri);
11042+
11043+	info.all_a = 255;
11044+	if (stbi__bmp_parse_header(s, &info) == NULL) {
11045+		return NULL; // error code already set
11046+	}
11047+
11048+	flip_vertically = ((int)s->img_y) > 0;
11049+	s->img_y = abs((int)s->img_y);
11050+
11051+	if (s->img_y > STBI_MAX_DIMENSIONS) {
11052+		return stbi__errpuc("too large", "Very large image (corrupt?)");
11053+	}
11054+	if (s->img_x > STBI_MAX_DIMENSIONS) {
11055+		return stbi__errpuc("too large", "Very large image (corrupt?)");
11056+	}
11057+
11058+	mr = info.mr;
11059+	mg = info.mg;
11060+	mb = info.mb;
11061+	ma = info.ma;
11062+	all_a = info.all_a;
11063+
11064+	if (info.hsz == 12) {
11065+		if (info.bpp < 24) {
11066+			psize = (info.offset - info.extra_read - 24) / 3;
11067+		}
11068+	} else {
11069+		if (info.bpp < 16) {
11070+			psize = (info.offset - info.extra_read - info.hsz) >> 2;
11071+		}
11072+	}
11073+	if (psize == 0) {
11074+		// accept some number of extra bytes after the header, but if the offset
11075+		// points either to before the header ends or implies a large amount of
11076+		// extra data, reject the file as malformed
11077+		int bytes_read_so_far = s->callback_already_read +
11078+		                        (int)(s->img_buffer - s->img_buffer_original);
11079+		int header_limit =
11080+		    1024; // max we actually read is below 256 bytes currently.
11081+		int extra_data_limit =
11082+		    256 * 4; // what ordinarily goes here is a palette; 256 entries*4
11083+		             // bytes is its max size.
11084+		if (bytes_read_so_far <= 0 || bytes_read_so_far > header_limit) {
11085+			return stbi__errpuc("bad header", "Corrupt BMP");
11086+		}
11087+		// we established that bytes_read_so_far is positive and sensible.
11088+		// the first half of this test rejects offsets that are either too small
11089+		// positives, or negative, and guarantees that info.offset >=
11090+		// bytes_read_so_far > 0. this in turn ensures the number computed in
11091+		// the second half of the test can't overflow.
11092+		if (info.offset < bytes_read_so_far ||
11093+		    info.offset - bytes_read_so_far > extra_data_limit) {
11094+			return stbi__errpuc("bad offset", "Corrupt BMP");
11095+		} else {
11096+			stbi__skip(s, info.offset - bytes_read_so_far);
11097+		}
11098+	}
11099+
11100+	if (info.bpp == 24 && ma == 0xff000000) {
11101+		s->img_n = 3;
11102+	} else {
11103+		s->img_n = ma ? 4 : 3;
11104+	}
11105+	if (req_comp && req_comp >= 3) { // we can directly decode 3 or 4
11106+		target = req_comp;
11107+	} else {
11108+		target = s->img_n; // if they want monochrome, we'll post-convert
11109+	}
11110+
11111+	// sanity-check size
11112+	if (!stbi__mad3sizes_valid(target, s->img_x, s->img_y, 0)) {
11113+		return stbi__errpuc("too large", "Corrupt BMP");
11114+	}
11115+
11116+	out = (stbi_uc *)stbi__malloc_mad3(target, s->img_x, s->img_y, 0);
11117+	if (!out) {
11118+		return stbi__errpuc("outofmem", "Out of memory");
11119+	}
11120+	if (info.bpp < 16) {
11121+		int z = 0;
11122+		if (psize == 0 || psize > 256) {
11123+			STBI_FREE(out);
11124+			return stbi__errpuc("invalid", "Corrupt BMP");
11125+		}
11126+		for (i = 0; i < psize; ++i) {
11127+			pal[i][2] = stbi__get8(s);
11128+			pal[i][1] = stbi__get8(s);
11129+			pal[i][0] = stbi__get8(s);
11130+			if (info.hsz != 12) {
11131+				stbi__get8(s);
11132+			}
11133+			pal[i][3] = 255;
11134+		}
11135+		stbi__skip(s, info.offset - info.extra_read - info.hsz -
11136+		                  psize * (info.hsz == 12 ? 3 : 4));
11137+		if (info.bpp == 1) {
11138+			width = (s->img_x + 7) >> 3;
11139+		} else if (info.bpp == 4) {
11140+			width = (s->img_x + 1) >> 1;
11141+		} else if (info.bpp == 8) {
11142+			width = s->img_x;
11143+		} else {
11144+			STBI_FREE(out);
11145+			return stbi__errpuc("bad bpp", "Corrupt BMP");
11146+		}
11147+		pad = (-width) & 3;
11148+		if (info.bpp == 1) {
11149+			for (j = 0; j < (int)s->img_y; ++j) {
11150+				int bit_offset = 7, v = stbi__get8(s);
11151+				for (i = 0; i < (int)s->img_x; ++i) {
11152+					int color = (v >> bit_offset) & 0x1;
11153+					out[z++] = pal[color][0];
11154+					out[z++] = pal[color][1];
11155+					out[z++] = pal[color][2];
11156+					if (target == 4) {
11157+						out[z++] = 255;
11158+					}
11159+					if (i + 1 == (int)s->img_x) {
11160+						break;
11161+					}
11162+					if ((--bit_offset) < 0) {
11163+						bit_offset = 7;
11164+						v = stbi__get8(s);
11165+					}
11166+				}
11167+				stbi__skip(s, pad);
11168+			}
11169+		} else {
11170+			for (j = 0; j < (int)s->img_y; ++j) {
11171+				for (i = 0; i < (int)s->img_x; i += 2) {
11172+					int v = stbi__get8(s), v2 = 0;
11173+					if (info.bpp == 4) {
11174+						v2 = v & 15;
11175+						v >>= 4;
11176+					}
11177+					out[z++] = pal[v][0];
11178+					out[z++] = pal[v][1];
11179+					out[z++] = pal[v][2];
11180+					if (target == 4) {
11181+						out[z++] = 255;
11182+					}
11183+					if (i + 1 == (int)s->img_x) {
11184+						break;
11185+					}
11186+					v = (info.bpp == 8) ? stbi__get8(s) : v2;
11187+					out[z++] = pal[v][0];
11188+					out[z++] = pal[v][1];
11189+					out[z++] = pal[v][2];
11190+					if (target == 4) {
11191+						out[z++] = 255;
11192+					}
11193+				}
11194+				stbi__skip(s, pad);
11195+			}
11196+		}
11197+	} else {
11198+		int rshift = 0, gshift = 0, bshift = 0, ashift = 0, rcount = 0,
11199+		    gcount = 0, bcount = 0, acount = 0;
11200+		int z = 0;
11201+		int easy = 0;
11202+		stbi__skip(s, info.offset - info.extra_read - info.hsz);
11203+		if (info.bpp == 24) {
11204+			width = 3 * s->img_x;
11205+		} else if (info.bpp == 16) {
11206+			width = 2 * s->img_x;
11207+		} else { /* bpp = 32 and pad = 0 */
11208+			width = 0;
11209+		}
11210+		pad = (-width) & 3;
11211+		if (info.bpp == 24) {
11212+			easy = 1;
11213+		} else if (info.bpp == 32) {
11214+			if (mb == 0xff && mg == 0xff00 && mr == 0x00ff0000 &&
11215+			    ma == 0xff000000) {
11216+				easy = 2;
11217+			}
11218+		}
11219+		if (!easy) {
11220+			if (!mr || !mg || !mb) {
11221+				STBI_FREE(out);
11222+				return stbi__errpuc("bad masks", "Corrupt BMP");
11223+			}
11224+			// right shift amt to put high bit in position #7
11225+			rshift = stbi__high_bit(mr) - 7;
11226+			rcount = stbi__bitcount(mr);
11227+			gshift = stbi__high_bit(mg) - 7;
11228+			gcount = stbi__bitcount(mg);
11229+			bshift = stbi__high_bit(mb) - 7;
11230+			bcount = stbi__bitcount(mb);
11231+			ashift = stbi__high_bit(ma) - 7;
11232+			acount = stbi__bitcount(ma);
11233+			if (rcount > 8 || gcount > 8 || bcount > 8 || acount > 8) {
11234+				STBI_FREE(out);
11235+				return stbi__errpuc("bad masks", "Corrupt BMP");
11236+			}
11237+		}
11238+		for (j = 0; j < (int)s->img_y; ++j) {
11239+			if (easy) {
11240+				for (i = 0; i < (int)s->img_x; ++i) {
11241+					unsigned char a;
11242+					out[z + 2] = stbi__get8(s);
11243+					out[z + 1] = stbi__get8(s);
11244+					out[z + 0] = stbi__get8(s);
11245+					z += 3;
11246+					a = (easy == 2 ? stbi__get8(s) : 255);
11247+					all_a |= a;
11248+					if (target == 4) {
11249+						out[z++] = a;
11250+					}
11251+				}
11252+			} else {
11253+				int bpp = info.bpp;
11254+				for (i = 0; i < (int)s->img_x; ++i) {
11255+					stbi__uint32 v = (bpp == 16 ? (stbi__uint32)stbi__get16le(s)
11256+					                            : stbi__get32le(s));
11257+					unsigned int a;
11258+					out[z++] = STBI__BYTECAST(
11259+					    stbi__shiftsigned(v & mr, rshift, rcount));
11260+					out[z++] = STBI__BYTECAST(
11261+					    stbi__shiftsigned(v & mg, gshift, gcount));
11262+					out[z++] = STBI__BYTECAST(
11263+					    stbi__shiftsigned(v & mb, bshift, bcount));
11264+					a = (ma ? stbi__shiftsigned(v & ma, ashift, acount) : 255);
11265+					all_a |= a;
11266+					if (target == 4) {
11267+						out[z++] = STBI__BYTECAST(a);
11268+					}
11269+				}
11270+			}
11271+			stbi__skip(s, pad);
11272+		}
11273+	}
11274+
11275+	// if alpha channel is all 0s, replace with all 255s
11276+	if (target == 4 && all_a == 0) {
11277+		for (i = 4 * s->img_x * s->img_y - 1; i >= 0; i -= 4) {
11278+			out[i] = 255;
11279+		}
11280+	}
11281+
11282+	if (flip_vertically) {
11283+		stbi_uc t;
11284+		for (j = 0; j < (int)s->img_y >> 1; ++j) {
11285+			stbi_uc *p1 = out + j * s->img_x * target;
11286+			stbi_uc *p2 = out + (s->img_y - 1 - j) * s->img_x * target;
11287+			for (i = 0; i < (int)s->img_x * target; ++i) {
11288+				t = p1[i];
11289+				p1[i] = p2[i];
11290+				p2[i] = t;
11291+			}
11292+		}
11293+	}
11294+
11295+	if (req_comp && req_comp != target) {
11296+		out = stbi__convert_format(out, target, req_comp, s->img_x, s->img_y);
11297+		if (out == NULL) {
11298+			return out; // stbi__convert_format frees input on failure
11299+		}
11300+	}
11301+
11302+	*x = s->img_x;
11303+	*y = s->img_y;
11304+	if (comp) {
11305+		*comp = s->img_n;
11306+	}
11307+	return out;
11308 }
11309 #endif
11310 
11311@@ -5736,592 +7221,690 @@ static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req
11312 // by Jonathan Dummer
11313 #ifndef STBI_NO_TGA
11314 // returns STBI_rgb or whatever, 0 on error
11315-static int stbi__tga_get_comp(int bits_per_pixel, int is_grey, int* is_rgb16)
11316-{
11317-   // only RGB or RGBA (incl. 16bit) or grey allowed
11318-   if (is_rgb16) *is_rgb16 = 0;
11319-   switch(bits_per_pixel) {
11320-      case 8:  return STBI_grey;
11321-      case 16: if(is_grey) return STBI_grey_alpha;
11322-               // fallthrough
11323-      case 15: if(is_rgb16) *is_rgb16 = 1;
11324-               return STBI_rgb;
11325-      case 24: // fallthrough
11326-      case 32: return bits_per_pixel/8;
11327-      default: return 0;
11328-   }
11329-}
11330-
11331-static int stbi__tga_info(stbi__context *s, int *x, int *y, int *comp)
11332-{
11333-    int tga_w, tga_h, tga_comp, tga_image_type, tga_bits_per_pixel, tga_colormap_bpp;
11334-    int sz, tga_colormap_type;
11335-    stbi__get8(s);                   // discard Offset
11336-    tga_colormap_type = stbi__get8(s); // colormap type
11337-    if( tga_colormap_type > 1 ) {
11338-        stbi__rewind(s);
11339-        return 0;      // only RGB or indexed allowed
11340-    }
11341-    tga_image_type = stbi__get8(s); // image type
11342-    if ( tga_colormap_type == 1 ) { // colormapped (paletted) image
11343-        if (tga_image_type != 1 && tga_image_type != 9) {
11344-            stbi__rewind(s);
11345-            return 0;
11346-        }
11347-        stbi__skip(s,4);       // skip index of first colormap entry and number of entries
11348-        sz = stbi__get8(s);    //   check bits per palette color entry
11349-        if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) {
11350-            stbi__rewind(s);
11351-            return 0;
11352-        }
11353-        stbi__skip(s,4);       // skip image x and y origin
11354-        tga_colormap_bpp = sz;
11355-    } else { // "normal" image w/o colormap - only RGB or grey allowed, +/- RLE
11356-        if ( (tga_image_type != 2) && (tga_image_type != 3) && (tga_image_type != 10) && (tga_image_type != 11) ) {
11357-            stbi__rewind(s);
11358-            return 0; // only RGB or grey allowed, +/- RLE
11359-        }
11360-        stbi__skip(s,9); // skip colormap specification and image x/y origin
11361-        tga_colormap_bpp = 0;
11362-    }
11363-    tga_w = stbi__get16le(s);
11364-    if( tga_w < 1 ) {
11365-        stbi__rewind(s);
11366-        return 0;   // test width
11367-    }
11368-    tga_h = stbi__get16le(s);
11369-    if( tga_h < 1 ) {
11370-        stbi__rewind(s);
11371-        return 0;   // test height
11372-    }
11373-    tga_bits_per_pixel = stbi__get8(s); // bits per pixel
11374-    stbi__get8(s); // ignore alpha bits
11375-    if (tga_colormap_bpp != 0) {
11376-        if((tga_bits_per_pixel != 8) && (tga_bits_per_pixel != 16)) {
11377-            // when using a colormap, tga_bits_per_pixel is the size of the indexes
11378-            // I don't think anything but 8 or 16bit indexes makes sense
11379-            stbi__rewind(s);
11380-            return 0;
11381-        }
11382-        tga_comp = stbi__tga_get_comp(tga_colormap_bpp, 0, NULL);
11383-    } else {
11384-        tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3) || (tga_image_type == 11), NULL);
11385-    }
11386-    if(!tga_comp) {
11387-      stbi__rewind(s);
11388-      return 0;
11389-    }
11390-    if (x) *x = tga_w;
11391-    if (y) *y = tga_h;
11392-    if (comp) *comp = tga_comp;
11393-    return 1;                   // seems to have passed everything
11394-}
11395-
11396-static int stbi__tga_test(stbi__context *s)
11397-{
11398-   int res = 0;
11399-   int sz, tga_color_type;
11400-   stbi__get8(s);      //   discard Offset
11401-   tga_color_type = stbi__get8(s);   //   color type
11402-   if ( tga_color_type > 1 ) goto errorEnd;   //   only RGB or indexed allowed
11403-   sz = stbi__get8(s);   //   image type
11404-   if ( tga_color_type == 1 ) { // colormapped (paletted) image
11405-      if (sz != 1 && sz != 9) goto errorEnd; // colortype 1 demands image type 1 or 9
11406-      stbi__skip(s,4);       // skip index of first colormap entry and number of entries
11407-      sz = stbi__get8(s);    //   check bits per palette color entry
11408-      if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) goto errorEnd;
11409-      stbi__skip(s,4);       // skip image x and y origin
11410-   } else { // "normal" image w/o colormap
11411-      if ( (sz != 2) && (sz != 3) && (sz != 10) && (sz != 11) ) goto errorEnd; // only RGB or grey allowed, +/- RLE
11412-      stbi__skip(s,9); // skip colormap specification and image x/y origin
11413-   }
11414-   if ( stbi__get16le(s) < 1 ) goto errorEnd;      //   test width
11415-   if ( stbi__get16le(s) < 1 ) goto errorEnd;      //   test height
11416-   sz = stbi__get8(s);   //   bits per pixel
11417-   if ( (tga_color_type == 1) && (sz != 8) && (sz != 16) ) goto errorEnd; // for colormapped images, bpp is size of an index
11418-   if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) goto errorEnd;
11419-
11420-   res = 1; // if we got this far, everything's good and we can return 1 instead of 0
11421+static int
11422+stbi__tga_get_comp(int bits_per_pixel, int is_grey, int *is_rgb16)
11423+{
11424+	// only RGB or RGBA (incl. 16bit) or grey allowed
11425+	if (is_rgb16) {
11426+		*is_rgb16 = 0;
11427+	}
11428+	switch (bits_per_pixel) {
11429+	case 8:
11430+		return STBI_grey;
11431+	case 16:
11432+		if (is_grey) {
11433+			return STBI_grey_alpha;
11434+		}
11435+		// fallthrough
11436+	case 15:
11437+		if (is_rgb16) {
11438+			*is_rgb16 = 1;
11439+		}
11440+		return STBI_rgb;
11441+	case 24: // fallthrough
11442+	case 32:
11443+		return bits_per_pixel / 8;
11444+	default:
11445+		return 0;
11446+	}
11447+}
11448+
11449+static int
11450+stbi__tga_info(stbi__context *s, int *x, int *y, int *comp)
11451+{
11452+	int tga_w, tga_h, tga_comp, tga_image_type, tga_bits_per_pixel,
11453+	    tga_colormap_bpp;
11454+	int sz, tga_colormap_type;
11455+	stbi__get8(s);                     // discard Offset
11456+	tga_colormap_type = stbi__get8(s); // colormap type
11457+	if (tga_colormap_type > 1) {
11458+		stbi__rewind(s);
11459+		return 0; // only RGB or indexed allowed
11460+	}
11461+	tga_image_type = stbi__get8(s); // image type
11462+	if (tga_colormap_type == 1) {   // colormapped (paletted) image
11463+		if (tga_image_type != 1 && tga_image_type != 9) {
11464+			stbi__rewind(s);
11465+			return 0;
11466+		}
11467+		stbi__skip(
11468+		    s, 4); // skip index of first colormap entry and number of entries
11469+		sz = stbi__get8(s); //   check bits per palette color entry
11470+		if ((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32)) {
11471+			stbi__rewind(s);
11472+			return 0;
11473+		}
11474+		stbi__skip(s, 4); // skip image x and y origin
11475+		tga_colormap_bpp = sz;
11476+	} else { // "normal" image w/o colormap - only RGB or grey allowed, +/- RLE
11477+		if ((tga_image_type != 2) && (tga_image_type != 3) &&
11478+		    (tga_image_type != 10) && (tga_image_type != 11)) {
11479+			stbi__rewind(s);
11480+			return 0; // only RGB or grey allowed, +/- RLE
11481+		}
11482+		stbi__skip(s, 9); // skip colormap specification and image x/y origin
11483+		tga_colormap_bpp = 0;
11484+	}
11485+	tga_w = stbi__get16le(s);
11486+	if (tga_w < 1) {
11487+		stbi__rewind(s);
11488+		return 0; // test width
11489+	}
11490+	tga_h = stbi__get16le(s);
11491+	if (tga_h < 1) {
11492+		stbi__rewind(s);
11493+		return 0; // test height
11494+	}
11495+	tga_bits_per_pixel = stbi__get8(s); // bits per pixel
11496+	stbi__get8(s);                      // ignore alpha bits
11497+	if (tga_colormap_bpp != 0) {
11498+		if ((tga_bits_per_pixel != 8) && (tga_bits_per_pixel != 16)) {
11499+			// when using a colormap, tga_bits_per_pixel is the size of the
11500+			// indexes I don't think anything but 8 or 16bit indexes makes sense
11501+			stbi__rewind(s);
11502+			return 0;
11503+		}
11504+		tga_comp = stbi__tga_get_comp(tga_colormap_bpp, 0, NULL);
11505+	} else {
11506+		tga_comp = stbi__tga_get_comp(
11507+		    tga_bits_per_pixel, (tga_image_type == 3) || (tga_image_type == 11),
11508+		    NULL);
11509+	}
11510+	if (!tga_comp) {
11511+		stbi__rewind(s);
11512+		return 0;
11513+	}
11514+	if (x) {
11515+		*x = tga_w;
11516+	}
11517+	if (y) {
11518+		*y = tga_h;
11519+	}
11520+	if (comp) {
11521+		*comp = tga_comp;
11522+	}
11523+	return 1; // seems to have passed everything
11524+}
11525+
11526+static int
11527+stbi__tga_test(stbi__context *s)
11528+{
11529+	int res = 0;
11530+	int sz, tga_color_type;
11531+	stbi__get8(s);                  //   discard Offset
11532+	tga_color_type = stbi__get8(s); //   color type
11533+	if (tga_color_type > 1) {
11534+		goto errorEnd; //   only RGB or indexed allowed
11535+	}
11536+	sz = stbi__get8(s);        //   image type
11537+	if (tga_color_type == 1) { // colormapped (paletted) image
11538+		if (sz != 1 && sz != 9) {
11539+			goto errorEnd; // colortype 1 demands image type 1 or 9
11540+		}
11541+		stbi__skip(
11542+		    s, 4); // skip index of first colormap entry and number of entries
11543+		sz = stbi__get8(s); //   check bits per palette color entry
11544+		if ((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32)) {
11545+			goto errorEnd;
11546+		}
11547+		stbi__skip(s, 4); // skip image x and y origin
11548+	} else {              // "normal" image w/o colormap
11549+		if ((sz != 2) && (sz != 3) && (sz != 10) && (sz != 11)) {
11550+			goto errorEnd; // only RGB or grey allowed, +/- RLE
11551+		}
11552+		stbi__skip(s, 9); // skip colormap specification and image x/y origin
11553+	}
11554+	if (stbi__get16le(s) < 1) {
11555+		goto errorEnd; //   test width
11556+	}
11557+	if (stbi__get16le(s) < 1) {
11558+		goto errorEnd; //   test height
11559+	}
11560+	sz = stbi__get8(s); //   bits per pixel
11561+	if ((tga_color_type == 1) && (sz != 8) && (sz != 16)) {
11562+		goto errorEnd; // for colormapped images, bpp is size of an index
11563+	}
11564+	if ((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32)) {
11565+		goto errorEnd;
11566+	}
11567+
11568+	res = 1; // if we got this far, everything's good and we can return 1
11569+	         // instead of 0
11570 
11571 errorEnd:
11572-   stbi__rewind(s);
11573-   return res;
11574+	stbi__rewind(s);
11575+	return res;
11576 }
11577 
11578 // read 16bit value and convert to 24bit RGB
11579-static void stbi__tga_read_rgb16(stbi__context *s, stbi_uc* out)
11580-{
11581-   stbi__uint16 px = (stbi__uint16)stbi__get16le(s);
11582-   stbi__uint16 fiveBitMask = 31;
11583-   // we have 3 channels with 5bits each
11584-   int r = (px >> 10) & fiveBitMask;
11585-   int g = (px >> 5) & fiveBitMask;
11586-   int b = px & fiveBitMask;
11587-   // Note that this saves the data in RGB(A) order, so it doesn't need to be swapped later
11588-   out[0] = (stbi_uc)((r * 255)/31);
11589-   out[1] = (stbi_uc)((g * 255)/31);
11590-   out[2] = (stbi_uc)((b * 255)/31);
11591-
11592-   // some people claim that the most significant bit might be used for alpha
11593-   // (possibly if an alpha-bit is set in the "image descriptor byte")
11594-   // but that only made 16bit test images completely translucent..
11595-   // so let's treat all 15 and 16bit TGAs as RGB with no alpha.
11596-}
11597-
11598-static void *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
11599-{
11600-   //   read in the TGA header stuff
11601-   int tga_offset = stbi__get8(s);
11602-   int tga_indexed = stbi__get8(s);
11603-   int tga_image_type = stbi__get8(s);
11604-   int tga_is_RLE = 0;
11605-   int tga_palette_start = stbi__get16le(s);
11606-   int tga_palette_len = stbi__get16le(s);
11607-   int tga_palette_bits = stbi__get8(s);
11608-   int tga_x_origin = stbi__get16le(s);
11609-   int tga_y_origin = stbi__get16le(s);
11610-   int tga_width = stbi__get16le(s);
11611-   int tga_height = stbi__get16le(s);
11612-   int tga_bits_per_pixel = stbi__get8(s);
11613-   int tga_comp, tga_rgb16=0;
11614-   int tga_inverted = stbi__get8(s);
11615-   // int tga_alpha_bits = tga_inverted & 15; // the 4 lowest bits - unused (useless?)
11616-   //   image data
11617-   unsigned char *tga_data;
11618-   unsigned char *tga_palette = NULL;
11619-   int i, j;
11620-   unsigned char raw_data[4] = {0};
11621-   int RLE_count = 0;
11622-   int RLE_repeating = 0;
11623-   int read_next_pixel = 1;
11624-   STBI_NOTUSED(ri);
11625-   STBI_NOTUSED(tga_x_origin); // @TODO
11626-   STBI_NOTUSED(tga_y_origin); // @TODO
11627-
11628-   if (tga_height > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
11629-   if (tga_width > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
11630-
11631-   //   do a tiny bit of precessing
11632-   if ( tga_image_type >= 8 )
11633-   {
11634-      tga_image_type -= 8;
11635-      tga_is_RLE = 1;
11636-   }
11637-   tga_inverted = 1 - ((tga_inverted >> 5) & 1);
11638-
11639-   //   If I'm paletted, then I'll use the number of bits from the palette
11640-   if ( tga_indexed ) tga_comp = stbi__tga_get_comp(tga_palette_bits, 0, &tga_rgb16);
11641-   else tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3), &tga_rgb16);
11642-
11643-   if(!tga_comp) // shouldn't really happen, stbi__tga_test() should have ensured basic consistency
11644-      return stbi__errpuc("bad format", "Can't find out TGA pixelformat");
11645-
11646-   //   tga info
11647-   *x = tga_width;
11648-   *y = tga_height;
11649-   if (comp) *comp = tga_comp;
11650-
11651-   if (!stbi__mad3sizes_valid(tga_width, tga_height, tga_comp, 0))
11652-      return stbi__errpuc("too large", "Corrupt TGA");
11653-
11654-   tga_data = (unsigned char*)stbi__malloc_mad3(tga_width, tga_height, tga_comp, 0);
11655-   if (!tga_data) return stbi__errpuc("outofmem", "Out of memory");
11656-
11657-   // skip to the data's starting position (offset usually = 0)
11658-   stbi__skip(s, tga_offset );
11659-
11660-   if ( !tga_indexed && !tga_is_RLE && !tga_rgb16 ) {
11661-      for (i=0; i < tga_height; ++i) {
11662-         int row = tga_inverted ? tga_height -i - 1 : i;
11663-         stbi_uc *tga_row = tga_data + row*tga_width*tga_comp;
11664-         stbi__getn(s, tga_row, tga_width * tga_comp);
11665-      }
11666-   } else  {
11667-      //   do I need to load a palette?
11668-      if ( tga_indexed)
11669-      {
11670-         if (tga_palette_len == 0) {  /* you have to have at least one entry! */
11671-            STBI_FREE(tga_data);
11672-            return stbi__errpuc("bad palette", "Corrupt TGA");
11673-         }
11674-
11675-         //   any data to skip? (offset usually = 0)
11676-         stbi__skip(s, tga_palette_start );
11677-         //   load the palette
11678-         tga_palette = (unsigned char*)stbi__malloc_mad2(tga_palette_len, tga_comp, 0);
11679-         if (!tga_palette) {
11680-            STBI_FREE(tga_data);
11681-            return stbi__errpuc("outofmem", "Out of memory");
11682-         }
11683-         if (tga_rgb16) {
11684-            stbi_uc *pal_entry = tga_palette;
11685-            STBI_ASSERT(tga_comp == STBI_rgb);
11686-            for (i=0; i < tga_palette_len; ++i) {
11687-               stbi__tga_read_rgb16(s, pal_entry);
11688-               pal_entry += tga_comp;
11689-            }
11690-         } else if (!stbi__getn(s, tga_palette, tga_palette_len * tga_comp)) {
11691-               STBI_FREE(tga_data);
11692-               STBI_FREE(tga_palette);
11693-               return stbi__errpuc("bad palette", "Corrupt TGA");
11694-         }
11695-      }
11696-      //   load the data
11697-      for (i=0; i < tga_width * tga_height; ++i)
11698-      {
11699-         //   if I'm in RLE mode, do I need to get a RLE stbi__pngchunk?
11700-         if ( tga_is_RLE )
11701-         {
11702-            if ( RLE_count == 0 )
11703-            {
11704-               //   yep, get the next byte as a RLE command
11705-               int RLE_cmd = stbi__get8(s);
11706-               RLE_count = 1 + (RLE_cmd & 127);
11707-               RLE_repeating = RLE_cmd >> 7;
11708-               read_next_pixel = 1;
11709-            } else if ( !RLE_repeating )
11710-            {
11711-               read_next_pixel = 1;
11712-            }
11713-         } else
11714-         {
11715-            read_next_pixel = 1;
11716-         }
11717-         //   OK, if I need to read a pixel, do it now
11718-         if ( read_next_pixel )
11719-         {
11720-            //   load however much data we did have
11721-            if ( tga_indexed )
11722-            {
11723-               // read in index, then perform the lookup
11724-               int pal_idx = (tga_bits_per_pixel == 8) ? stbi__get8(s) : stbi__get16le(s);
11725-               if ( pal_idx >= tga_palette_len ) {
11726-                  // invalid index
11727-                  pal_idx = 0;
11728-               }
11729-               pal_idx *= tga_comp;
11730-               for (j = 0; j < tga_comp; ++j) {
11731-                  raw_data[j] = tga_palette[pal_idx+j];
11732-               }
11733-            } else if(tga_rgb16) {
11734-               STBI_ASSERT(tga_comp == STBI_rgb);
11735-               stbi__tga_read_rgb16(s, raw_data);
11736-            } else {
11737-               //   read in the data raw
11738-               for (j = 0; j < tga_comp; ++j) {
11739-                  raw_data[j] = stbi__get8(s);
11740-               }
11741-            }
11742-            //   clear the reading flag for the next pixel
11743-            read_next_pixel = 0;
11744-         } // end of reading a pixel
11745-
11746-         // copy data
11747-         for (j = 0; j < tga_comp; ++j)
11748-           tga_data[i*tga_comp+j] = raw_data[j];
11749-
11750-         //   in case we're in RLE mode, keep counting down
11751-         --RLE_count;
11752-      }
11753-      //   do I need to invert the image?
11754-      if ( tga_inverted )
11755-      {
11756-         for (j = 0; j*2 < tga_height; ++j)
11757-         {
11758-            int index1 = j * tga_width * tga_comp;
11759-            int index2 = (tga_height - 1 - j) * tga_width * tga_comp;
11760-            for (i = tga_width * tga_comp; i > 0; --i)
11761-            {
11762-               unsigned char temp = tga_data[index1];
11763-               tga_data[index1] = tga_data[index2];
11764-               tga_data[index2] = temp;
11765-               ++index1;
11766-               ++index2;
11767-            }
11768-         }
11769-      }
11770-      //   clear my palette, if I had one
11771-      if ( tga_palette != NULL )
11772-      {
11773-         STBI_FREE( tga_palette );
11774-      }
11775-   }
11776-
11777-   // swap RGB - if the source data was RGB16, it already is in the right order
11778-   if (tga_comp >= 3 && !tga_rgb16)
11779-   {
11780-      unsigned char* tga_pixel = tga_data;
11781-      for (i=0; i < tga_width * tga_height; ++i)
11782-      {
11783-         unsigned char temp = tga_pixel[0];
11784-         tga_pixel[0] = tga_pixel[2];
11785-         tga_pixel[2] = temp;
11786-         tga_pixel += tga_comp;
11787-      }
11788-   }
11789-
11790-   // convert to target component count
11791-   if (req_comp && req_comp != tga_comp)
11792-      tga_data = stbi__convert_format(tga_data, tga_comp, req_comp, tga_width, tga_height);
11793-
11794-   //   the things I do to get rid of an error message, and yet keep
11795-   //   Microsoft's C compilers happy... [8^(
11796-   tga_palette_start = tga_palette_len = tga_palette_bits =
11797-         tga_x_origin = tga_y_origin = 0;
11798-   STBI_NOTUSED(tga_palette_start);
11799-   //   OK, done
11800-   return tga_data;
11801+static void
11802+stbi__tga_read_rgb16(stbi__context *s, stbi_uc *out)
11803+{
11804+	stbi__uint16 px = (stbi__uint16)stbi__get16le(s);
11805+	stbi__uint16 fiveBitMask = 31;
11806+	// we have 3 channels with 5bits each
11807+	int r = (px >> 10) & fiveBitMask;
11808+	int g = (px >> 5) & fiveBitMask;
11809+	int b = px & fiveBitMask;
11810+	// Note that this saves the data in RGB(A) order, so it doesn't need to be
11811+	// swapped later
11812+	out[0] = (stbi_uc)((r * 255) / 31);
11813+	out[1] = (stbi_uc)((g * 255) / 31);
11814+	out[2] = (stbi_uc)((b * 255) / 31);
11815+
11816+	// some people claim that the most significant bit might be used for alpha
11817+	// (possibly if an alpha-bit is set in the "image descriptor byte")
11818+	// but that only made 16bit test images completely translucent..
11819+	// so let's treat all 15 and 16bit TGAs as RGB with no alpha.
11820+}
11821+
11822+static void *
11823+stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp,
11824+               stbi__result_info *ri)
11825+{
11826+	//   read in the TGA header stuff
11827+	int tga_offset = stbi__get8(s);
11828+	int tga_indexed = stbi__get8(s);
11829+	int tga_image_type = stbi__get8(s);
11830+	int tga_is_RLE = 0;
11831+	int tga_palette_start = stbi__get16le(s);
11832+	int tga_palette_len = stbi__get16le(s);
11833+	int tga_palette_bits = stbi__get8(s);
11834+	int tga_x_origin = stbi__get16le(s);
11835+	int tga_y_origin = stbi__get16le(s);
11836+	int tga_width = stbi__get16le(s);
11837+	int tga_height = stbi__get16le(s);
11838+	int tga_bits_per_pixel = stbi__get8(s);
11839+	int tga_comp, tga_rgb16 = 0;
11840+	int tga_inverted = stbi__get8(s);
11841+	// int tga_alpha_bits = tga_inverted & 15; // the 4 lowest bits - unused
11842+	// (useless?)
11843+	//   image data
11844+	unsigned char *tga_data;
11845+	unsigned char *tga_palette = NULL;
11846+	int i, j;
11847+	unsigned char raw_data[4] = {0};
11848+	int RLE_count = 0;
11849+	int RLE_repeating = 0;
11850+	int read_next_pixel = 1;
11851+	STBI_NOTUSED(ri);
11852+	STBI_NOTUSED(tga_x_origin); // @TODO
11853+	STBI_NOTUSED(tga_y_origin); // @TODO
11854+
11855+	if (tga_height > STBI_MAX_DIMENSIONS) {
11856+		return stbi__errpuc("too large", "Very large image (corrupt?)");
11857+	}
11858+	if (tga_width > STBI_MAX_DIMENSIONS) {
11859+		return stbi__errpuc("too large", "Very large image (corrupt?)");
11860+	}
11861+
11862+	//   do a tiny bit of precessing
11863+	if (tga_image_type >= 8) {
11864+		tga_image_type -= 8;
11865+		tga_is_RLE = 1;
11866+	}
11867+	tga_inverted = 1 - ((tga_inverted >> 5) & 1);
11868+
11869+	//   If I'm paletted, then I'll use the number of bits from the palette
11870+	if (tga_indexed) {
11871+		tga_comp = stbi__tga_get_comp(tga_palette_bits, 0, &tga_rgb16);
11872+	} else {
11873+		tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3),
11874+		                              &tga_rgb16);
11875+	}
11876+
11877+	if (!tga_comp) { // shouldn't really happen, stbi__tga_test() should have
11878+		             // ensured basic consistency
11879+		return stbi__errpuc("bad format", "Can't find out TGA pixelformat");
11880+	}
11881+
11882+	//   tga info
11883+	*x = tga_width;
11884+	*y = tga_height;
11885+	if (comp) {
11886+		*comp = tga_comp;
11887+	}
11888+
11889+	if (!stbi__mad3sizes_valid(tga_width, tga_height, tga_comp, 0)) {
11890+		return stbi__errpuc("too large", "Corrupt TGA");
11891+	}
11892+
11893+	tga_data =
11894+	    (unsigned char *)stbi__malloc_mad3(tga_width, tga_height, tga_comp, 0);
11895+	if (!tga_data) {
11896+		return stbi__errpuc("outofmem", "Out of memory");
11897+	}
11898+
11899+	// skip to the data's starting position (offset usually = 0)
11900+	stbi__skip(s, tga_offset);
11901+
11902+	if (!tga_indexed && !tga_is_RLE && !tga_rgb16) {
11903+		for (i = 0; i < tga_height; ++i) {
11904+			int row = tga_inverted ? tga_height - i - 1 : i;
11905+			stbi_uc *tga_row = tga_data + row * tga_width * tga_comp;
11906+			stbi__getn(s, tga_row, tga_width * tga_comp);
11907+		}
11908+	} else {
11909+		//   do I need to load a palette?
11910+		if (tga_indexed) {
11911+			if (tga_palette_len ==
11912+			    0) { /* you have to have at least one entry! */
11913+				STBI_FREE(tga_data);
11914+				return stbi__errpuc("bad palette", "Corrupt TGA");
11915+			}
11916+
11917+			//   any data to skip? (offset usually = 0)
11918+			stbi__skip(s, tga_palette_start);
11919+			//   load the palette
11920+			tga_palette = (unsigned char *)stbi__malloc_mad2(tga_palette_len,
11921+			                                                 tga_comp, 0);
11922+			if (!tga_palette) {
11923+				STBI_FREE(tga_data);
11924+				return stbi__errpuc("outofmem", "Out of memory");
11925+			}
11926+			if (tga_rgb16) {
11927+				stbi_uc *pal_entry = tga_palette;
11928+				STBI_ASSERT(tga_comp == STBI_rgb);
11929+				for (i = 0; i < tga_palette_len; ++i) {
11930+					stbi__tga_read_rgb16(s, pal_entry);
11931+					pal_entry += tga_comp;
11932+				}
11933+			} else if (!stbi__getn(s, tga_palette,
11934+			                       tga_palette_len * tga_comp)) {
11935+				STBI_FREE(tga_data);
11936+				STBI_FREE(tga_palette);
11937+				return stbi__errpuc("bad palette", "Corrupt TGA");
11938+			}
11939+		}
11940+		//   load the data
11941+		for (i = 0; i < tga_width * tga_height; ++i) {
11942+			//   if I'm in RLE mode, do I need to get a RLE stbi__pngchunk?
11943+			if (tga_is_RLE) {
11944+				if (RLE_count == 0) {
11945+					//   yep, get the next byte as a RLE command
11946+					int RLE_cmd = stbi__get8(s);
11947+					RLE_count = 1 + (RLE_cmd & 127);
11948+					RLE_repeating = RLE_cmd >> 7;
11949+					read_next_pixel = 1;
11950+				} else if (!RLE_repeating) {
11951+					read_next_pixel = 1;
11952+				}
11953+			} else {
11954+				read_next_pixel = 1;
11955+			}
11956+			//   OK, if I need to read a pixel, do it now
11957+			if (read_next_pixel) {
11958+				//   load however much data we did have
11959+				if (tga_indexed) {
11960+					// read in index, then perform the lookup
11961+					int pal_idx = (tga_bits_per_pixel == 8) ? stbi__get8(s)
11962+					                                        : stbi__get16le(s);
11963+					if (pal_idx >= tga_palette_len) {
11964+						// invalid index
11965+						pal_idx = 0;
11966+					}
11967+					pal_idx *= tga_comp;
11968+					for (j = 0; j < tga_comp; ++j) {
11969+						raw_data[j] = tga_palette[pal_idx + j];
11970+					}
11971+				} else if (tga_rgb16) {
11972+					STBI_ASSERT(tga_comp == STBI_rgb);
11973+					stbi__tga_read_rgb16(s, raw_data);
11974+				} else {
11975+					//   read in the data raw
11976+					for (j = 0; j < tga_comp; ++j) {
11977+						raw_data[j] = stbi__get8(s);
11978+					}
11979+				}
11980+				//   clear the reading flag for the next pixel
11981+				read_next_pixel = 0;
11982+			} // end of reading a pixel
11983+
11984+			// copy data
11985+			for (j = 0; j < tga_comp; ++j) {
11986+				tga_data[i * tga_comp + j] = raw_data[j];
11987+			}
11988+
11989+			//   in case we're in RLE mode, keep counting down
11990+			--RLE_count;
11991+		}
11992+		//   do I need to invert the image?
11993+		if (tga_inverted) {
11994+			for (j = 0; j * 2 < tga_height; ++j) {
11995+				int index1 = j * tga_width * tga_comp;
11996+				int index2 = (tga_height - 1 - j) * tga_width * tga_comp;
11997+				for (i = tga_width * tga_comp; i > 0; --i) {
11998+					unsigned char temp = tga_data[index1];
11999+					tga_data[index1] = tga_data[index2];
12000+					tga_data[index2] = temp;
12001+					++index1;
12002+					++index2;
12003+				}
12004+			}
12005+		}
12006+		//   clear my palette, if I had one
12007+		if (tga_palette != NULL) {
12008+			STBI_FREE(tga_palette);
12009+		}
12010+	}
12011+
12012+	// swap RGB - if the source data was RGB16, it already is in the right order
12013+	if (tga_comp >= 3 && !tga_rgb16) {
12014+		unsigned char *tga_pixel = tga_data;
12015+		for (i = 0; i < tga_width * tga_height; ++i) {
12016+			unsigned char temp = tga_pixel[0];
12017+			tga_pixel[0] = tga_pixel[2];
12018+			tga_pixel[2] = temp;
12019+			tga_pixel += tga_comp;
12020+		}
12021+	}
12022+
12023+	// convert to target component count
12024+	if (req_comp && req_comp != tga_comp) {
12025+		tga_data = stbi__convert_format(tga_data, tga_comp, req_comp, tga_width,
12026+		                                tga_height);
12027+	}
12028+
12029+	//   the things I do to get rid of an error message, and yet keep
12030+	//   Microsoft's C compilers happy... [8^(
12031+	tga_palette_start = tga_palette_len = tga_palette_bits = tga_x_origin =
12032+	    tga_y_origin = 0;
12033+	STBI_NOTUSED(tga_palette_start);
12034+	//   OK, done
12035+	return tga_data;
12036 }
12037 #endif
12038 
12039 // *************************************************************************************************
12040-// Photoshop PSD loader -- PD by Thatcher Ulrich, integration by Nicolas Schulz, tweaked by STB
12041+// Photoshop PSD loader -- PD by Thatcher Ulrich, integration by Nicolas Schulz,
12042+// tweaked by STB
12043 
12044 #ifndef STBI_NO_PSD
12045-static int stbi__psd_test(stbi__context *s)
12046-{
12047-   int r = (stbi__get32be(s) == 0x38425053);
12048-   stbi__rewind(s);
12049-   return r;
12050-}
12051-
12052-static int stbi__psd_decode_rle(stbi__context *s, stbi_uc *p, int pixelCount)
12053-{
12054-   int count, nleft, len;
12055-
12056-   count = 0;
12057-   while ((nleft = pixelCount - count) > 0) {
12058-      len = stbi__get8(s);
12059-      if (len == 128) {
12060-         // No-op.
12061-      } else if (len < 128) {
12062-         // Copy next len+1 bytes literally.
12063-         len++;
12064-         if (len > nleft) return 0; // corrupt data
12065-         count += len;
12066-         while (len) {
12067-            *p = stbi__get8(s);
12068-            p += 4;
12069-            len--;
12070-         }
12071-      } else if (len > 128) {
12072-         stbi_uc   val;
12073-         // Next -len+1 bytes in the dest are replicated from next source byte.
12074-         // (Interpret len as a negative 8-bit int.)
12075-         len = 257 - len;
12076-         if (len > nleft) return 0; // corrupt data
12077-         val = stbi__get8(s);
12078-         count += len;
12079-         while (len) {
12080-            *p = val;
12081-            p += 4;
12082-            len--;
12083-         }
12084-      }
12085-   }
12086-
12087-   return 1;
12088-}
12089-
12090-static void *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc)
12091-{
12092-   int pixelCount;
12093-   int channelCount, compression;
12094-   int channel, i;
12095-   int bitdepth;
12096-   int w,h;
12097-   stbi_uc *out;
12098-   STBI_NOTUSED(ri);
12099-
12100-   // Check identifier
12101-   if (stbi__get32be(s) != 0x38425053)   // "8BPS"
12102-      return stbi__errpuc("not PSD", "Corrupt PSD image");
12103-
12104-   // Check file type version.
12105-   if (stbi__get16be(s) != 1)
12106-      return stbi__errpuc("wrong version", "Unsupported version of PSD image");
12107-
12108-   // Skip 6 reserved bytes.
12109-   stbi__skip(s, 6 );
12110-
12111-   // Read the number of channels (R, G, B, A, etc).
12112-   channelCount = stbi__get16be(s);
12113-   if (channelCount < 0 || channelCount > 16)
12114-      return stbi__errpuc("wrong channel count", "Unsupported number of channels in PSD image");
12115-
12116-   // Read the rows and columns of the image.
12117-   h = stbi__get32be(s);
12118-   w = stbi__get32be(s);
12119-
12120-   if (h > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
12121-   if (w > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
12122-
12123-   // Make sure the depth is 8 bits.
12124-   bitdepth = stbi__get16be(s);
12125-   if (bitdepth != 8 && bitdepth != 16)
12126-      return stbi__errpuc("unsupported bit depth", "PSD bit depth is not 8 or 16 bit");
12127-
12128-   // Make sure the color mode is RGB.
12129-   // Valid options are:
12130-   //   0: Bitmap
12131-   //   1: Grayscale
12132-   //   2: Indexed color
12133-   //   3: RGB color
12134-   //   4: CMYK color
12135-   //   7: Multichannel
12136-   //   8: Duotone
12137-   //   9: Lab color
12138-   if (stbi__get16be(s) != 3)
12139-      return stbi__errpuc("wrong color format", "PSD is not in RGB color format");
12140-
12141-   // Skip the Mode Data.  (It's the palette for indexed color; other info for other modes.)
12142-   stbi__skip(s,stbi__get32be(s) );
12143-
12144-   // Skip the image resources.  (resolution, pen tool paths, etc)
12145-   stbi__skip(s, stbi__get32be(s) );
12146-
12147-   // Skip the reserved data.
12148-   stbi__skip(s, stbi__get32be(s) );
12149-
12150-   // Find out if the data is compressed.
12151-   // Known values:
12152-   //   0: no compression
12153-   //   1: RLE compressed
12154-   compression = stbi__get16be(s);
12155-   if (compression > 1)
12156-      return stbi__errpuc("bad compression", "PSD has an unknown compression format");
12157-
12158-   // Check size
12159-   if (!stbi__mad3sizes_valid(4, w, h, 0))
12160-      return stbi__errpuc("too large", "Corrupt PSD");
12161-
12162-   // Create the destination image.
12163-
12164-   if (!compression && bitdepth == 16 && bpc == 16) {
12165-      out = (stbi_uc *) stbi__malloc_mad3(8, w, h, 0);
12166-      ri->bits_per_channel = 16;
12167-   } else
12168-      out = (stbi_uc *) stbi__malloc(4 * w*h);
12169-
12170-   if (!out) return stbi__errpuc("outofmem", "Out of memory");
12171-   pixelCount = w*h;
12172-
12173-   // Initialize the data to zero.
12174-   //memset( out, 0, pixelCount * 4 );
12175-
12176-   // Finally, the image data.
12177-   if (compression) {
12178-      // RLE as used by .PSD and .TIFF
12179-      // Loop until you get the number of unpacked bytes you are expecting:
12180-      //     Read the next source byte into n.
12181-      //     If n is between 0 and 127 inclusive, copy the next n+1 bytes literally.
12182-      //     Else if n is between -127 and -1 inclusive, copy the next byte -n+1 times.
12183-      //     Else if n is 128, noop.
12184-      // Endloop
12185-
12186-      // The RLE-compressed data is preceded by a 2-byte data count for each row in the data,
12187-      // which we're going to just skip.
12188-      stbi__skip(s, h * channelCount * 2 );
12189-
12190-      // Read the RLE data by channel.
12191-      for (channel = 0; channel < 4; channel++) {
12192-         stbi_uc *p;
12193-
12194-         p = out+channel;
12195-         if (channel >= channelCount) {
12196-            // Fill this channel with default data.
12197-            for (i = 0; i < pixelCount; i++, p += 4)
12198-               *p = (channel == 3 ? 255 : 0);
12199-         } else {
12200-            // Read the RLE data.
12201-            if (!stbi__psd_decode_rle(s, p, pixelCount)) {
12202-               STBI_FREE(out);
12203-               return stbi__errpuc("corrupt", "bad RLE data");
12204-            }
12205-         }
12206-      }
12207-
12208-   } else {
12209-      // We're at the raw image data.  It's each channel in order (Red, Green, Blue, Alpha, ...)
12210-      // where each channel consists of an 8-bit (or 16-bit) value for each pixel in the image.
12211-
12212-      // Read the data by channel.
12213-      for (channel = 0; channel < 4; channel++) {
12214-         if (channel >= channelCount) {
12215-            // Fill this channel with default data.
12216-            if (bitdepth == 16 && bpc == 16) {
12217-               stbi__uint16 *q = ((stbi__uint16 *) out) + channel;
12218-               stbi__uint16 val = channel == 3 ? 65535 : 0;
12219-               for (i = 0; i < pixelCount; i++, q += 4)
12220-                  *q = val;
12221-            } else {
12222-               stbi_uc *p = out+channel;
12223-               stbi_uc val = channel == 3 ? 255 : 0;
12224-               for (i = 0; i < pixelCount; i++, p += 4)
12225-                  *p = val;
12226-            }
12227-         } else {
12228-            if (ri->bits_per_channel == 16) {    // output bpc
12229-               stbi__uint16 *q = ((stbi__uint16 *) out) + channel;
12230-               for (i = 0; i < pixelCount; i++, q += 4)
12231-                  *q = (stbi__uint16) stbi__get16be(s);
12232-            } else {
12233-               stbi_uc *p = out+channel;
12234-               if (bitdepth == 16) {  // input bpc
12235-                  for (i = 0; i < pixelCount; i++, p += 4)
12236-                     *p = (stbi_uc) (stbi__get16be(s) >> 8);
12237-               } else {
12238-                  for (i = 0; i < pixelCount; i++, p += 4)
12239-                     *p = stbi__get8(s);
12240-               }
12241-            }
12242-         }
12243-      }
12244-   }
12245-
12246-   // remove weird white matte from PSD
12247-   if (channelCount >= 4) {
12248-      if (ri->bits_per_channel == 16) {
12249-         for (i=0; i < w*h; ++i) {
12250-            stbi__uint16 *pixel = (stbi__uint16 *) out + 4*i;
12251-            if (pixel[3] != 0 && pixel[3] != 65535) {
12252-               float a = pixel[3] / 65535.0f;
12253-               float ra = 1.0f / a;
12254-               float inv_a = 65535.0f * (1 - ra);
12255-               pixel[0] = (stbi__uint16) (pixel[0]*ra + inv_a);
12256-               pixel[1] = (stbi__uint16) (pixel[1]*ra + inv_a);
12257-               pixel[2] = (stbi__uint16) (pixel[2]*ra + inv_a);
12258-            }
12259-         }
12260-      } else {
12261-         for (i=0; i < w*h; ++i) {
12262-            unsigned char *pixel = out + 4*i;
12263-            if (pixel[3] != 0 && pixel[3] != 255) {
12264-               float a = pixel[3] / 255.0f;
12265-               float ra = 1.0f / a;
12266-               float inv_a = 255.0f * (1 - ra);
12267-               pixel[0] = (unsigned char) (pixel[0]*ra + inv_a);
12268-               pixel[1] = (unsigned char) (pixel[1]*ra + inv_a);
12269-               pixel[2] = (unsigned char) (pixel[2]*ra + inv_a);
12270-            }
12271-         }
12272-      }
12273-   }
12274-
12275-   // convert to desired output format
12276-   if (req_comp && req_comp != 4) {
12277-      if (ri->bits_per_channel == 16)
12278-         out = (stbi_uc *) stbi__convert_format16((stbi__uint16 *) out, 4, req_comp, w, h);
12279-      else
12280-         out = stbi__convert_format(out, 4, req_comp, w, h);
12281-      if (out == NULL) return out; // stbi__convert_format frees input on failure
12282-   }
12283-
12284-   if (comp) *comp = 4;
12285-   *y = h;
12286-   *x = w;
12287-
12288-   return out;
12289+static int
12290+stbi__psd_test(stbi__context *s)
12291+{
12292+	int r = (stbi__get32be(s) == 0x38425053);
12293+	stbi__rewind(s);
12294+	return r;
12295+}
12296+
12297+static int
12298+stbi__psd_decode_rle(stbi__context *s, stbi_uc *p, int pixelCount)
12299+{
12300+	int count, nleft, len;
12301+
12302+	count = 0;
12303+	while ((nleft = pixelCount - count) > 0) {
12304+		len = stbi__get8(s);
12305+		if (len == 128) {
12306+			// No-op.
12307+		} else if (len < 128) {
12308+			// Copy next len+1 bytes literally.
12309+			len++;
12310+			if (len > nleft) {
12311+				return 0; // corrupt data
12312+			}
12313+			count += len;
12314+			while (len) {
12315+				*p = stbi__get8(s);
12316+				p += 4;
12317+				len--;
12318+			}
12319+		} else if (len > 128) {
12320+			stbi_uc val;
12321+			// Next -len+1 bytes in the dest are replicated from next source
12322+			// byte. (Interpret len as a negative 8-bit int.)
12323+			len = 257 - len;
12324+			if (len > nleft) {
12325+				return 0; // corrupt data
12326+			}
12327+			val = stbi__get8(s);
12328+			count += len;
12329+			while (len) {
12330+				*p = val;
12331+				p += 4;
12332+				len--;
12333+			}
12334+		}
12335+	}
12336+
12337+	return 1;
12338+}
12339+
12340+static void *
12341+stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp,
12342+               stbi__result_info *ri, int bpc)
12343+{
12344+	int pixelCount;
12345+	int channelCount, compression;
12346+	int channel, i;
12347+	int bitdepth;
12348+	int w, h;
12349+	stbi_uc *out;
12350+	STBI_NOTUSED(ri);
12351+
12352+	// Check identifier
12353+	if (stbi__get32be(s) != 0x38425053) { // "8BPS"
12354+		return stbi__errpuc("not PSD", "Corrupt PSD image");
12355+	}
12356+
12357+	// Check file type version.
12358+	if (stbi__get16be(s) != 1) {
12359+		return stbi__errpuc("wrong version",
12360+		                    "Unsupported version of PSD image");
12361+	}
12362+
12363+	// Skip 6 reserved bytes.
12364+	stbi__skip(s, 6);
12365+
12366+	// Read the number of channels (R, G, B, A, etc).
12367+	channelCount = stbi__get16be(s);
12368+	if (channelCount < 0 || channelCount > 16) {
12369+		return stbi__errpuc("wrong channel count",
12370+		                    "Unsupported number of channels in PSD image");
12371+	}
12372+
12373+	// Read the rows and columns of the image.
12374+	h = stbi__get32be(s);
12375+	w = stbi__get32be(s);
12376+
12377+	if (h > STBI_MAX_DIMENSIONS) {
12378+		return stbi__errpuc("too large", "Very large image (corrupt?)");
12379+	}
12380+	if (w > STBI_MAX_DIMENSIONS) {
12381+		return stbi__errpuc("too large", "Very large image (corrupt?)");
12382+	}
12383+
12384+	// Make sure the depth is 8 bits.
12385+	bitdepth = stbi__get16be(s);
12386+	if (bitdepth != 8 && bitdepth != 16) {
12387+		return stbi__errpuc("unsupported bit depth",
12388+		                    "PSD bit depth is not 8 or 16 bit");
12389+	}
12390+
12391+	// Make sure the color mode is RGB.
12392+	// Valid options are:
12393+	//   0: Bitmap
12394+	//   1: Grayscale
12395+	//   2: Indexed color
12396+	//   3: RGB color
12397+	//   4: CMYK color
12398+	//   7: Multichannel
12399+	//   8: Duotone
12400+	//   9: Lab color
12401+	if (stbi__get16be(s) != 3) {
12402+		return stbi__errpuc("wrong color format",
12403+		                    "PSD is not in RGB color format");
12404+	}
12405+
12406+	// Skip the Mode Data.  (It's the palette for indexed color; other info for
12407+	// other modes.)
12408+	stbi__skip(s, stbi__get32be(s));
12409+
12410+	// Skip the image resources.  (resolution, pen tool paths, etc)
12411+	stbi__skip(s, stbi__get32be(s));
12412+
12413+	// Skip the reserved data.
12414+	stbi__skip(s, stbi__get32be(s));
12415+
12416+	// Find out if the data is compressed.
12417+	// Known values:
12418+	//   0: no compression
12419+	//   1: RLE compressed
12420+	compression = stbi__get16be(s);
12421+	if (compression > 1) {
12422+		return stbi__errpuc("bad compression",
12423+		                    "PSD has an unknown compression format");
12424+	}
12425+
12426+	// Check size
12427+	if (!stbi__mad3sizes_valid(4, w, h, 0)) {
12428+		return stbi__errpuc("too large", "Corrupt PSD");
12429+	}
12430+
12431+	// Create the destination image.
12432+
12433+	if (!compression && bitdepth == 16 && bpc == 16) {
12434+		out = (stbi_uc *)stbi__malloc_mad3(8, w, h, 0);
12435+		ri->bits_per_channel = 16;
12436+	} else {
12437+		out = (stbi_uc *)stbi__malloc(4 * w * h);
12438+	}
12439+
12440+	if (!out) {
12441+		return stbi__errpuc("outofmem", "Out of memory");
12442+	}
12443+	pixelCount = w * h;
12444+
12445+	// Initialize the data to zero.
12446+	// memset( out, 0, pixelCount * 4 );
12447+
12448+	// Finally, the image data.
12449+	if (compression) {
12450+		// RLE as used by .PSD and .TIFF
12451+		// Loop until you get the number of unpacked bytes you are expecting:
12452+		//     Read the next source byte into n.
12453+		//     If n is between 0 and 127 inclusive, copy the next n+1 bytes
12454+		//     literally. Else if n is between -127 and -1 inclusive, copy the
12455+		//     next byte -n+1 times. Else if n is 128, noop.
12456+		// Endloop
12457+
12458+		// The RLE-compressed data is preceded by a 2-byte data count for each
12459+		// row in the data, which we're going to just skip.
12460+		stbi__skip(s, h * channelCount * 2);
12461+
12462+		// Read the RLE data by channel.
12463+		for (channel = 0; channel < 4; channel++) {
12464+			stbi_uc *p;
12465+
12466+			p = out + channel;
12467+			if (channel >= channelCount) {
12468+				// Fill this channel with default data.
12469+				for (i = 0; i < pixelCount; i++, p += 4) {
12470+					*p = (channel == 3 ? 255 : 0);
12471+				}
12472+			} else {
12473+				// Read the RLE data.
12474+				if (!stbi__psd_decode_rle(s, p, pixelCount)) {
12475+					STBI_FREE(out);
12476+					return stbi__errpuc("corrupt", "bad RLE data");
12477+				}
12478+			}
12479+		}
12480+
12481+	} else {
12482+		// We're at the raw image data.  It's each channel in order (Red, Green,
12483+		// Blue, Alpha, ...) where each channel consists of an 8-bit (or 16-bit)
12484+		// value for each pixel in the image.
12485+
12486+		// Read the data by channel.
12487+		for (channel = 0; channel < 4; channel++) {
12488+			if (channel >= channelCount) {
12489+				// Fill this channel with default data.
12490+				if (bitdepth == 16 && bpc == 16) {
12491+					stbi__uint16 *q = ((stbi__uint16 *)out) + channel;
12492+					stbi__uint16 val = channel == 3 ? 65535 : 0;
12493+					for (i = 0; i < pixelCount; i++, q += 4) {
12494+						*q = val;
12495+					}
12496+				} else {
12497+					stbi_uc *p = out + channel;
12498+					stbi_uc val = channel == 3 ? 255 : 0;
12499+					for (i = 0; i < pixelCount; i++, p += 4) {
12500+						*p = val;
12501+					}
12502+				}
12503+			} else {
12504+				if (ri->bits_per_channel == 16) { // output bpc
12505+					stbi__uint16 *q = ((stbi__uint16 *)out) + channel;
12506+					for (i = 0; i < pixelCount; i++, q += 4) {
12507+						*q = (stbi__uint16)stbi__get16be(s);
12508+					}
12509+				} else {
12510+					stbi_uc *p = out + channel;
12511+					if (bitdepth == 16) { // input bpc
12512+						for (i = 0; i < pixelCount; i++, p += 4) {
12513+							*p = (stbi_uc)(stbi__get16be(s) >> 8);
12514+						}
12515+					} else {
12516+						for (i = 0; i < pixelCount; i++, p += 4) {
12517+							*p = stbi__get8(s);
12518+						}
12519+					}
12520+				}
12521+			}
12522+		}
12523+	}
12524+
12525+	// remove weird white matte from PSD
12526+	if (channelCount >= 4) {
12527+		if (ri->bits_per_channel == 16) {
12528+			for (i = 0; i < w * h; ++i) {
12529+				stbi__uint16 *pixel = (stbi__uint16 *)out + 4 * i;
12530+				if (pixel[3] != 0 && pixel[3] != 65535) {
12531+					float a = pixel[3] / 65535.0f;
12532+					float ra = 1.0f / a;
12533+					float inv_a = 65535.0f * (1 - ra);
12534+					pixel[0] = (stbi__uint16)(pixel[0] * ra + inv_a);
12535+					pixel[1] = (stbi__uint16)(pixel[1] * ra + inv_a);
12536+					pixel[2] = (stbi__uint16)(pixel[2] * ra + inv_a);
12537+				}
12538+			}
12539+		} else {
12540+			for (i = 0; i < w * h; ++i) {
12541+				unsigned char *pixel = out + 4 * i;
12542+				if (pixel[3] != 0 && pixel[3] != 255) {
12543+					float a = pixel[3] / 255.0f;
12544+					float ra = 1.0f / a;
12545+					float inv_a = 255.0f * (1 - ra);
12546+					pixel[0] = (unsigned char)(pixel[0] * ra + inv_a);
12547+					pixel[1] = (unsigned char)(pixel[1] * ra + inv_a);
12548+					pixel[2] = (unsigned char)(pixel[2] * ra + inv_a);
12549+				}
12550+			}
12551+		}
12552+	}
12553+
12554+	// convert to desired output format
12555+	if (req_comp && req_comp != 4) {
12556+		if (ri->bits_per_channel == 16) {
12557+			out = (stbi_uc *)stbi__convert_format16((stbi__uint16 *)out, 4,
12558+			                                        req_comp, w, h);
12559+		} else {
12560+			out = stbi__convert_format(out, 4, req_comp, w, h);
12561+		}
12562+		if (out == NULL) {
12563+			return out; // stbi__convert_format frees input on failure
12564+		}
12565+	}
12566+
12567+	if (comp) {
12568+		*comp = 4;
12569+	}
12570+	*y = h;
12571+	*x = w;
12572+
12573+	return out;
12574 }
12575 #endif
12576 
12577@@ -6333,216 +7916,273 @@ static void *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req
12578 // See http://ozviz.wasp.uwa.edu.au/~pbourke/dataformats/softimagepic/
12579 
12580 #ifndef STBI_NO_PIC
12581-static int stbi__pic_is4(stbi__context *s,const char *str)
12582+static int
12583+stbi__pic_is4(stbi__context *s, const char *str)
12584 {
12585-   int i;
12586-   for (i=0; i<4; ++i)
12587-      if (stbi__get8(s) != (stbi_uc)str[i])
12588-         return 0;
12589+	int i;
12590+	for (i = 0; i < 4; ++i) {
12591+		if (stbi__get8(s) != (stbi_uc)str[i]) {
12592+			return 0;
12593+		}
12594+	}
12595 
12596-   return 1;
12597+	return 1;
12598 }
12599 
12600-static int stbi__pic_test_core(stbi__context *s)
12601+static int
12602+stbi__pic_test_core(stbi__context *s)
12603 {
12604-   int i;
12605+	int i;
12606 
12607-   if (!stbi__pic_is4(s,"\x53\x80\xF6\x34"))
12608-      return 0;
12609+	if (!stbi__pic_is4(s, "\x53\x80\xF6\x34")) {
12610+		return 0;
12611+	}
12612 
12613-   for(i=0;i<84;++i)
12614-      stbi__get8(s);
12615+	for (i = 0; i < 84; ++i) {
12616+		stbi__get8(s);
12617+	}
12618 
12619-   if (!stbi__pic_is4(s,"PICT"))
12620-      return 0;
12621+	if (!stbi__pic_is4(s, "PICT")) {
12622+		return 0;
12623+	}
12624 
12625-   return 1;
12626+	return 1;
12627 }
12628 
12629-typedef struct
12630-{
12631-   stbi_uc size,type,channel;
12632+typedef struct {
12633+	stbi_uc size, type, channel;
12634 } stbi__pic_packet;
12635 
12636-static stbi_uc *stbi__readval(stbi__context *s, int channel, stbi_uc *dest)
12637+static stbi_uc *
12638+stbi__readval(stbi__context *s, int channel, stbi_uc *dest)
12639 {
12640-   int mask=0x80, i;
12641+	int mask = 0x80, i;
12642 
12643-   for (i=0; i<4; ++i, mask>>=1) {
12644-      if (channel & mask) {
12645-         if (stbi__at_eof(s)) return stbi__errpuc("bad file","PIC file too short");
12646-         dest[i]=stbi__get8(s);
12647-      }
12648-   }
12649+	for (i = 0; i < 4; ++i, mask >>= 1) {
12650+		if (channel & mask) {
12651+			if (stbi__at_eof(s)) {
12652+				return stbi__errpuc("bad file", "PIC file too short");
12653+			}
12654+			dest[i] = stbi__get8(s);
12655+		}
12656+	}
12657 
12658-   return dest;
12659+	return dest;
12660 }
12661 
12662-static void stbi__copyval(int channel,stbi_uc *dest,const stbi_uc *src)
12663+static void
12664+stbi__copyval(int channel, stbi_uc *dest, const stbi_uc *src)
12665 {
12666-   int mask=0x80,i;
12667+	int mask = 0x80, i;
12668 
12669-   for (i=0;i<4; ++i, mask>>=1)
12670-      if (channel&mask)
12671-         dest[i]=src[i];
12672+	for (i = 0; i < 4; ++i, mask >>= 1) {
12673+		if (channel & mask) {
12674+			dest[i] = src[i];
12675+		}
12676+	}
12677 }
12678 
12679-static stbi_uc *stbi__pic_load_core(stbi__context *s,int width,int height,int *comp, stbi_uc *result)
12680-{
12681-   int act_comp=0,num_packets=0,y,chained;
12682-   stbi__pic_packet packets[10];
12683-
12684-   // this will (should...) cater for even some bizarre stuff like having data
12685-    // for the same channel in multiple packets.
12686-   do {
12687-      stbi__pic_packet *packet;
12688-
12689-      if (num_packets==sizeof(packets)/sizeof(packets[0]))
12690-         return stbi__errpuc("bad format","too many packets");
12691-
12692-      packet = &packets[num_packets++];
12693-
12694-      chained = stbi__get8(s);
12695-      packet->size    = stbi__get8(s);
12696-      packet->type    = stbi__get8(s);
12697-      packet->channel = stbi__get8(s);
12698-
12699-      act_comp |= packet->channel;
12700-
12701-      if (stbi__at_eof(s))          return stbi__errpuc("bad file","file too short (reading packets)");
12702-      if (packet->size != 8)  return stbi__errpuc("bad format","packet isn't 8bpp");
12703-   } while (chained);
12704-
12705-   *comp = (act_comp & 0x10 ? 4 : 3); // has alpha channel?
12706-
12707-   for(y=0; y<height; ++y) {
12708-      int packet_idx;
12709-
12710-      for(packet_idx=0; packet_idx < num_packets; ++packet_idx) {
12711-         stbi__pic_packet *packet = &packets[packet_idx];
12712-         stbi_uc *dest = result+y*width*4;
12713-
12714-         switch (packet->type) {
12715-            default:
12716-               return stbi__errpuc("bad format","packet has bad compression type");
12717-
12718-            case 0: {//uncompressed
12719-               int x;
12720-
12721-               for(x=0;x<width;++x, dest+=4)
12722-                  if (!stbi__readval(s,packet->channel,dest))
12723-                     return 0;
12724-               break;
12725-            }
12726-
12727-            case 1://Pure RLE
12728-               {
12729-                  int left=width, i;
12730-
12731-                  while (left>0) {
12732-                     stbi_uc count,value[4];
12733-
12734-                     count=stbi__get8(s);
12735-                     if (stbi__at_eof(s))   return stbi__errpuc("bad file","file too short (pure read count)");
12736-
12737-                     if (count > left)
12738-                        count = (stbi_uc) left;
12739-
12740-                     if (!stbi__readval(s,packet->channel,value))  return 0;
12741-
12742-                     for(i=0; i<count; ++i,dest+=4)
12743-                        stbi__copyval(packet->channel,dest,value);
12744-                     left -= count;
12745-                  }
12746-               }
12747-               break;
12748-
12749-            case 2: {//Mixed RLE
12750-               int left=width;
12751-               while (left>0) {
12752-                  int count = stbi__get8(s), i;
12753-                  if (stbi__at_eof(s))  return stbi__errpuc("bad file","file too short (mixed read count)");
12754-
12755-                  if (count >= 128) { // Repeated
12756-                     stbi_uc value[4];
12757-
12758-                     if (count==128)
12759-                        count = stbi__get16be(s);
12760-                     else
12761-                        count -= 127;
12762-                     if (count > left)
12763-                        return stbi__errpuc("bad file","scanline overrun");
12764-
12765-                     if (!stbi__readval(s,packet->channel,value))
12766-                        return 0;
12767-
12768-                     for(i=0;i<count;++i, dest += 4)
12769-                        stbi__copyval(packet->channel,dest,value);
12770-                  } else { // Raw
12771-                     ++count;
12772-                     if (count>left) return stbi__errpuc("bad file","scanline overrun");
12773-
12774-                     for(i=0;i<count;++i, dest+=4)
12775-                        if (!stbi__readval(s,packet->channel,dest))
12776-                           return 0;
12777-                  }
12778-                  left-=count;
12779-               }
12780-               break;
12781-            }
12782-         }
12783-      }
12784-   }
12785-
12786-   return result;
12787-}
12788-
12789-static void *stbi__pic_load(stbi__context *s,int *px,int *py,int *comp,int req_comp, stbi__result_info *ri)
12790-{
12791-   stbi_uc *result;
12792-   int i, x,y, internal_comp;
12793-   STBI_NOTUSED(ri);
12794-
12795-   if (!comp) comp = &internal_comp;
12796-
12797-   for (i=0; i<92; ++i)
12798-      stbi__get8(s);
12799-
12800-   x = stbi__get16be(s);
12801-   y = stbi__get16be(s);
12802-
12803-   if (y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
12804-   if (x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
12805-
12806-   if (stbi__at_eof(s))  return stbi__errpuc("bad file","file too short (pic header)");
12807-   if (!stbi__mad3sizes_valid(x, y, 4, 0)) return stbi__errpuc("too large", "PIC image too large to decode");
12808-
12809-   stbi__get32be(s); //skip `ratio'
12810-   stbi__get16be(s); //skip `fields'
12811-   stbi__get16be(s); //skip `pad'
12812-
12813-   // intermediate buffer is RGBA
12814-   result = (stbi_uc *) stbi__malloc_mad3(x, y, 4, 0);
12815-   if (!result) return stbi__errpuc("outofmem", "Out of memory");
12816-   memset(result, 0xff, x*y*4);
12817-
12818-   if (!stbi__pic_load_core(s,x,y,comp, result)) {
12819-      STBI_FREE(result);
12820-      result=0;
12821-   }
12822-   *px = x;
12823-   *py = y;
12824-   if (req_comp == 0) req_comp = *comp;
12825-   result=stbi__convert_format(result,4,req_comp,x,y);
12826-
12827-   return result;
12828-}
12829-
12830-static int stbi__pic_test(stbi__context *s)
12831-{
12832-   int r = stbi__pic_test_core(s);
12833-   stbi__rewind(s);
12834-   return r;
12835+static stbi_uc *
12836+stbi__pic_load_core(stbi__context *s, int width, int height, int *comp,
12837+                    stbi_uc *result)
12838+{
12839+	int act_comp = 0, num_packets = 0, y, chained;
12840+	stbi__pic_packet packets[10];
12841+
12842+	// this will (should...) cater for even some bizarre stuff like having data
12843+	// for the same channel in multiple packets.
12844+	do {
12845+		stbi__pic_packet *packet;
12846+
12847+		if (num_packets == sizeof(packets) / sizeof(packets[0])) {
12848+			return stbi__errpuc("bad format", "too many packets");
12849+		}
12850+
12851+		packet = &packets[num_packets++];
12852+
12853+		chained = stbi__get8(s);
12854+		packet->size = stbi__get8(s);
12855+		packet->type = stbi__get8(s);
12856+		packet->channel = stbi__get8(s);
12857+
12858+		act_comp |= packet->channel;
12859+
12860+		if (stbi__at_eof(s)) {
12861+			return stbi__errpuc("bad file", "file too short (reading packets)");
12862+		}
12863+		if (packet->size != 8) {
12864+			return stbi__errpuc("bad format", "packet isn't 8bpp");
12865+		}
12866+	} while (chained);
12867+
12868+	*comp = (act_comp & 0x10 ? 4 : 3); // has alpha channel?
12869+
12870+	for (y = 0; y < height; ++y) {
12871+		int packet_idx;
12872+
12873+		for (packet_idx = 0; packet_idx < num_packets; ++packet_idx) {
12874+			stbi__pic_packet *packet = &packets[packet_idx];
12875+			stbi_uc *dest = result + y * width * 4;
12876+
12877+			switch (packet->type) {
12878+			default:
12879+				return stbi__errpuc("bad format",
12880+				                    "packet has bad compression type");
12881+
12882+			case 0: { // uncompressed
12883+				int x;
12884+
12885+				for (x = 0; x < width; ++x, dest += 4) {
12886+					if (!stbi__readval(s, packet->channel, dest)) {
12887+						return 0;
12888+					}
12889+				}
12890+				break;
12891+			}
12892+
12893+			case 1: // Pure RLE
12894+			{
12895+				int left = width, i;
12896+
12897+				while (left > 0) {
12898+					stbi_uc count, value[4];
12899+
12900+					count = stbi__get8(s);
12901+					if (stbi__at_eof(s)) {
12902+						return stbi__errpuc("bad file",
12903+						                    "file too short (pure read count)");
12904+					}
12905+
12906+					if (count > left) {
12907+						count = (stbi_uc)left;
12908+					}
12909+
12910+					if (!stbi__readval(s, packet->channel, value)) {
12911+						return 0;
12912+					}
12913+
12914+					for (i = 0; i < count; ++i, dest += 4) {
12915+						stbi__copyval(packet->channel, dest, value);
12916+					}
12917+					left -= count;
12918+				}
12919+			} break;
12920+
12921+			case 2: { // Mixed RLE
12922+				int left = width;
12923+				while (left > 0) {
12924+					int count = stbi__get8(s), i;
12925+					if (stbi__at_eof(s)) {
12926+						return stbi__errpuc(
12927+						    "bad file", "file too short (mixed read count)");
12928+					}
12929+
12930+					if (count >= 128) { // Repeated
12931+						stbi_uc value[4];
12932+
12933+						if (count == 128) {
12934+							count = stbi__get16be(s);
12935+						} else {
12936+							count -= 127;
12937+						}
12938+						if (count > left) {
12939+							return stbi__errpuc("bad file", "scanline overrun");
12940+						}
12941+
12942+						if (!stbi__readval(s, packet->channel, value)) {
12943+							return 0;
12944+						}
12945+
12946+						for (i = 0; i < count; ++i, dest += 4) {
12947+							stbi__copyval(packet->channel, dest, value);
12948+						}
12949+					} else { // Raw
12950+						++count;
12951+						if (count > left) {
12952+							return stbi__errpuc("bad file", "scanline overrun");
12953+						}
12954+
12955+						for (i = 0; i < count; ++i, dest += 4) {
12956+							if (!stbi__readval(s, packet->channel, dest)) {
12957+								return 0;
12958+							}
12959+						}
12960+					}
12961+					left -= count;
12962+				}
12963+				break;
12964+			}
12965+			}
12966+		}
12967+	}
12968+
12969+	return result;
12970+}
12971+
12972+static void *
12973+stbi__pic_load(stbi__context *s, int *px, int *py, int *comp, int req_comp,
12974+               stbi__result_info *ri)
12975+{
12976+	stbi_uc *result;
12977+	int i, x, y, internal_comp;
12978+	STBI_NOTUSED(ri);
12979+
12980+	if (!comp) {
12981+		comp = &internal_comp;
12982+	}
12983+
12984+	for (i = 0; i < 92; ++i) {
12985+		stbi__get8(s);
12986+	}
12987+
12988+	x = stbi__get16be(s);
12989+	y = stbi__get16be(s);
12990+
12991+	if (y > STBI_MAX_DIMENSIONS) {
12992+		return stbi__errpuc("too large", "Very large image (corrupt?)");
12993+	}
12994+	if (x > STBI_MAX_DIMENSIONS) {
12995+		return stbi__errpuc("too large", "Very large image (corrupt?)");
12996+	}
12997+
12998+	if (stbi__at_eof(s)) {
12999+		return stbi__errpuc("bad file", "file too short (pic header)");
13000+	}
13001+	if (!stbi__mad3sizes_valid(x, y, 4, 0)) {
13002+		return stbi__errpuc("too large", "PIC image too large to decode");
13003+	}
13004+
13005+	stbi__get32be(s); // skip `ratio'
13006+	stbi__get16be(s); // skip `fields'
13007+	stbi__get16be(s); // skip `pad'
13008+
13009+	// intermediate buffer is RGBA
13010+	result = (stbi_uc *)stbi__malloc_mad3(x, y, 4, 0);
13011+	if (!result) {
13012+		return stbi__errpuc("outofmem", "Out of memory");
13013+	}
13014+	memset(result, 0xff, x * y * 4);
13015+
13016+	if (!stbi__pic_load_core(s, x, y, comp, result)) {
13017+		STBI_FREE(result);
13018+		result = 0;
13019+	}
13020+	*px = x;
13021+	*py = y;
13022+	if (req_comp == 0) {
13023+		req_comp = *comp;
13024+	}
13025+	result = stbi__convert_format(result, 4, req_comp, x, y);
13026+
13027+	return result;
13028+}
13029+
13030+static int
13031+stbi__pic_test(stbi__context *s)
13032+{
13033+	int r = stbi__pic_test_core(s);
13034+	stbi__rewind(s);
13035+	return r;
13036 }
13037 #endif
13038 
13039@@ -6550,533 +8190,630 @@ static int stbi__pic_test(stbi__context *s)
13040 // GIF loader -- public domain by Jean-Marc Lienher -- simplified/shrunk by stb
13041 
13042 #ifndef STBI_NO_GIF
13043-typedef struct
13044-{
13045-   stbi__int16 prefix;
13046-   stbi_uc first;
13047-   stbi_uc suffix;
13048+typedef struct {
13049+	stbi__int16 prefix;
13050+	stbi_uc first;
13051+	stbi_uc suffix;
13052 } stbi__gif_lzw;
13053 
13054-typedef struct
13055-{
13056-   int w,h;
13057-   stbi_uc *out;                 // output buffer (always 4 components)
13058-   stbi_uc *background;          // The current "background" as far as a gif is concerned
13059-   stbi_uc *history;
13060-   int flags, bgindex, ratio, transparent, eflags;
13061-   stbi_uc  pal[256][4];
13062-   stbi_uc lpal[256][4];
13063-   stbi__gif_lzw codes[8192];
13064-   stbi_uc *color_table;
13065-   int parse, step;
13066-   int lflags;
13067-   int start_x, start_y;
13068-   int max_x, max_y;
13069-   int cur_x, cur_y;
13070-   int line_size;
13071-   int delay;
13072+typedef struct {
13073+	int w, h;
13074+	stbi_uc *out; // output buffer (always 4 components)
13075+	stbi_uc
13076+	    *background; // The current "background" as far as a gif is concerned
13077+	stbi_uc *history;
13078+	int flags, bgindex, ratio, transparent, eflags;
13079+	stbi_uc pal[256][4];
13080+	stbi_uc lpal[256][4];
13081+	stbi__gif_lzw codes[8192];
13082+	stbi_uc *color_table;
13083+	int parse, step;
13084+	int lflags;
13085+	int start_x, start_y;
13086+	int max_x, max_y;
13087+	int cur_x, cur_y;
13088+	int line_size;
13089+	int delay;
13090 } stbi__gif;
13091 
13092-static int stbi__gif_test_raw(stbi__context *s)
13093-{
13094-   int sz;
13095-   if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8') return 0;
13096-   sz = stbi__get8(s);
13097-   if (sz != '9' && sz != '7') return 0;
13098-   if (stbi__get8(s) != 'a') return 0;
13099-   return 1;
13100-}
13101-
13102-static int stbi__gif_test(stbi__context *s)
13103-{
13104-   int r = stbi__gif_test_raw(s);
13105-   stbi__rewind(s);
13106-   return r;
13107-}
13108-
13109-static void stbi__gif_parse_colortable(stbi__context *s, stbi_uc pal[256][4], int num_entries, int transp)
13110-{
13111-   int i;
13112-   for (i=0; i < num_entries; ++i) {
13113-      pal[i][2] = stbi__get8(s);
13114-      pal[i][1] = stbi__get8(s);
13115-      pal[i][0] = stbi__get8(s);
13116-      pal[i][3] = transp == i ? 0 : 255;
13117-   }
13118-}
13119-
13120-static int stbi__gif_header(stbi__context *s, stbi__gif *g, int *comp, int is_info)
13121-{
13122-   stbi_uc version;
13123-   if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8')
13124-      return stbi__err("not GIF", "Corrupt GIF");
13125-
13126-   version = stbi__get8(s);
13127-   if (version != '7' && version != '9')    return stbi__err("not GIF", "Corrupt GIF");
13128-   if (stbi__get8(s) != 'a')                return stbi__err("not GIF", "Corrupt GIF");
13129-
13130-   stbi__g_failure_reason = "";
13131-   g->w = stbi__get16le(s);
13132-   g->h = stbi__get16le(s);
13133-   g->flags = stbi__get8(s);
13134-   g->bgindex = stbi__get8(s);
13135-   g->ratio = stbi__get8(s);
13136-   g->transparent = -1;
13137-
13138-   if (g->w > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
13139-   if (g->h > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
13140-
13141-   if (comp != 0) *comp = 4;  // can't actually tell whether it's 3 or 4 until we parse the comments
13142-
13143-   if (is_info) return 1;
13144-
13145-   if (g->flags & 0x80)
13146-      stbi__gif_parse_colortable(s,g->pal, 2 << (g->flags & 7), -1);
13147-
13148-   return 1;
13149-}
13150-
13151-static int stbi__gif_info_raw(stbi__context *s, int *x, int *y, int *comp)
13152-{
13153-   stbi__gif* g = (stbi__gif*) stbi__malloc(sizeof(stbi__gif));
13154-   if (!g) return stbi__err("outofmem", "Out of memory");
13155-   if (!stbi__gif_header(s, g, comp, 1)) {
13156-      STBI_FREE(g);
13157-      stbi__rewind( s );
13158-      return 0;
13159-   }
13160-   if (x) *x = g->w;
13161-   if (y) *y = g->h;
13162-   STBI_FREE(g);
13163-   return 1;
13164-}
13165-
13166-static void stbi__out_gif_code(stbi__gif *g, stbi__uint16 code)
13167-{
13168-   stbi_uc *p, *c;
13169-   int idx;
13170-
13171-   // recurse to decode the prefixes, since the linked-list is backwards,
13172-   // and working backwards through an interleaved image would be nasty
13173-   if (g->codes[code].prefix >= 0)
13174-      stbi__out_gif_code(g, g->codes[code].prefix);
13175-
13176-   if (g->cur_y >= g->max_y) return;
13177-
13178-   idx = g->cur_x + g->cur_y;
13179-   p = &g->out[idx];
13180-   g->history[idx / 4] = 1;
13181-
13182-   c = &g->color_table[g->codes[code].suffix * 4];
13183-   if (c[3] > 128) { // don't render transparent pixels;
13184-      p[0] = c[2];
13185-      p[1] = c[1];
13186-      p[2] = c[0];
13187-      p[3] = c[3];
13188-   }
13189-   g->cur_x += 4;
13190-
13191-   if (g->cur_x >= g->max_x) {
13192-      g->cur_x = g->start_x;
13193-      g->cur_y += g->step;
13194-
13195-      while (g->cur_y >= g->max_y && g->parse > 0) {
13196-         g->step = (1 << g->parse) * g->line_size;
13197-         g->cur_y = g->start_y + (g->step >> 1);
13198-         --g->parse;
13199-      }
13200-   }
13201-}
13202-
13203-static stbi_uc *stbi__process_gif_raster(stbi__context *s, stbi__gif *g)
13204-{
13205-   stbi_uc lzw_cs;
13206-   stbi__int32 len, init_code;
13207-   stbi__uint32 first;
13208-   stbi__int32 codesize, codemask, avail, oldcode, bits, valid_bits, clear;
13209-   stbi__gif_lzw *p;
13210-
13211-   lzw_cs = stbi__get8(s);
13212-   if (lzw_cs > 12) return NULL;
13213-   clear = 1 << lzw_cs;
13214-   first = 1;
13215-   codesize = lzw_cs + 1;
13216-   codemask = (1 << codesize) - 1;
13217-   bits = 0;
13218-   valid_bits = 0;
13219-   for (init_code = 0; init_code < clear; init_code++) {
13220-      g->codes[init_code].prefix = -1;
13221-      g->codes[init_code].first = (stbi_uc) init_code;
13222-      g->codes[init_code].suffix = (stbi_uc) init_code;
13223-   }
13224-
13225-   // support no starting clear code
13226-   avail = clear+2;
13227-   oldcode = -1;
13228-
13229-   len = 0;
13230-   for(;;) {
13231-      if (valid_bits < codesize) {
13232-         if (len == 0) {
13233-            len = stbi__get8(s); // start new block
13234-            if (len == 0)
13235-               return g->out;
13236-         }
13237-         --len;
13238-         bits |= (stbi__int32) stbi__get8(s) << valid_bits;
13239-         valid_bits += 8;
13240-      } else {
13241-         stbi__int32 code = bits & codemask;
13242-         bits >>= codesize;
13243-         valid_bits -= codesize;
13244-         // @OPTIMIZE: is there some way we can accelerate the non-clear path?
13245-         if (code == clear) {  // clear code
13246-            codesize = lzw_cs + 1;
13247-            codemask = (1 << codesize) - 1;
13248-            avail = clear + 2;
13249-            oldcode = -1;
13250-            first = 0;
13251-         } else if (code == clear + 1) { // end of stream code
13252-            stbi__skip(s, len);
13253-            while ((len = stbi__get8(s)) > 0)
13254-               stbi__skip(s,len);
13255-            return g->out;
13256-         } else if (code <= avail) {
13257-            if (first) {
13258-               return stbi__errpuc("no clear code", "Corrupt GIF");
13259-            }
13260-
13261-            if (oldcode >= 0) {
13262-               p = &g->codes[avail++];
13263-               if (avail > 8192) {
13264-                  return stbi__errpuc("too many codes", "Corrupt GIF");
13265-               }
13266-
13267-               p->prefix = (stbi__int16) oldcode;
13268-               p->first = g->codes[oldcode].first;
13269-               p->suffix = (code == avail) ? p->first : g->codes[code].first;
13270-            } else if (code == avail)
13271-               return stbi__errpuc("illegal code in raster", "Corrupt GIF");
13272-
13273-            stbi__out_gif_code(g, (stbi__uint16) code);
13274-
13275-            if ((avail & codemask) == 0 && avail <= 0x0FFF) {
13276-               codesize++;
13277-               codemask = (1 << codesize) - 1;
13278-            }
13279-
13280-            oldcode = code;
13281-         } else {
13282-            return stbi__errpuc("illegal code in raster", "Corrupt GIF");
13283-         }
13284-      }
13285-   }
13286-}
13287-
13288-// this function is designed to support animated gifs, although stb_image doesn't support it
13289-// two back is the image from two frames ago, used for a very specific disposal format
13290-static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, int req_comp, stbi_uc *two_back)
13291-{
13292-   int dispose;
13293-   int first_frame;
13294-   int pi;
13295-   int pcount;
13296-   STBI_NOTUSED(req_comp);
13297-
13298-   // on first frame, any non-written pixels get the background colour (non-transparent)
13299-   first_frame = 0;
13300-   if (g->out == 0) {
13301-      if (!stbi__gif_header(s, g, comp,0)) return 0; // stbi__g_failure_reason set by stbi__gif_header
13302-      if (!stbi__mad3sizes_valid(4, g->w, g->h, 0))
13303-         return stbi__errpuc("too large", "GIF image is too large");
13304-      pcount = g->w * g->h;
13305-      g->out = (stbi_uc *) stbi__malloc(4 * pcount);
13306-      g->background = (stbi_uc *) stbi__malloc(4 * pcount);
13307-      g->history = (stbi_uc *) stbi__malloc(pcount);
13308-      if (!g->out || !g->background || !g->history)
13309-         return stbi__errpuc("outofmem", "Out of memory");
13310-
13311-      // image is treated as "transparent" at the start - ie, nothing overwrites the current background;
13312-      // background colour is only used for pixels that are not rendered first frame, after that "background"
13313-      // color refers to the color that was there the previous frame.
13314-      memset(g->out, 0x00, 4 * pcount);
13315-      memset(g->background, 0x00, 4 * pcount); // state of the background (starts transparent)
13316-      memset(g->history, 0x00, pcount);        // pixels that were affected previous frame
13317-      first_frame = 1;
13318-   } else {
13319-      // second frame - how do we dispose of the previous one?
13320-      dispose = (g->eflags & 0x1C) >> 2;
13321-      pcount = g->w * g->h;
13322-
13323-      if ((dispose == 3) && (two_back == 0)) {
13324-         dispose = 2; // if I don't have an image to revert back to, default to the old background
13325-      }
13326-
13327-      if (dispose == 3) { // use previous graphic
13328-         for (pi = 0; pi < pcount; ++pi) {
13329-            if (g->history[pi]) {
13330-               memcpy( &g->out[pi * 4], &two_back[pi * 4], 4 );
13331-            }
13332-         }
13333-      } else if (dispose == 2) {
13334-         // restore what was changed last frame to background before that frame;
13335-         for (pi = 0; pi < pcount; ++pi) {
13336-            if (g->history[pi]) {
13337-               memcpy( &g->out[pi * 4], &g->background[pi * 4], 4 );
13338-            }
13339-         }
13340-      } else {
13341-         // This is a non-disposal case eithe way, so just
13342-         // leave the pixels as is, and they will become the new background
13343-         // 1: do not dispose
13344-         // 0:  not specified.
13345-      }
13346-
13347-      // background is what out is after the undoing of the previou frame;
13348-      memcpy( g->background, g->out, 4 * g->w * g->h );
13349-   }
13350-
13351-   // clear my history;
13352-   memset( g->history, 0x00, g->w * g->h );        // pixels that were affected previous frame
13353-
13354-   for (;;) {
13355-      int tag = stbi__get8(s);
13356-      switch (tag) {
13357-         case 0x2C: /* Image Descriptor */
13358-         {
13359-            stbi__int32 x, y, w, h;
13360-            stbi_uc *o;
13361-
13362-            x = stbi__get16le(s);
13363-            y = stbi__get16le(s);
13364-            w = stbi__get16le(s);
13365-            h = stbi__get16le(s);
13366-            if (((x + w) > (g->w)) || ((y + h) > (g->h)))
13367-               return stbi__errpuc("bad Image Descriptor", "Corrupt GIF");
13368-
13369-            g->line_size = g->w * 4;
13370-            g->start_x = x * 4;
13371-            g->start_y = y * g->line_size;
13372-            g->max_x   = g->start_x + w * 4;
13373-            g->max_y   = g->start_y + h * g->line_size;
13374-            g->cur_x   = g->start_x;
13375-            g->cur_y   = g->start_y;
13376-
13377-            // if the width of the specified rectangle is 0, that means
13378-            // we may not see *any* pixels or the image is malformed;
13379-            // to make sure this is caught, move the current y down to
13380-            // max_y (which is what out_gif_code checks).
13381-            if (w == 0)
13382-               g->cur_y = g->max_y;
13383-
13384-            g->lflags = stbi__get8(s);
13385-
13386-            if (g->lflags & 0x40) {
13387-               g->step = 8 * g->line_size; // first interlaced spacing
13388-               g->parse = 3;
13389-            } else {
13390-               g->step = g->line_size;
13391-               g->parse = 0;
13392-            }
13393-
13394-            if (g->lflags & 0x80) {
13395-               stbi__gif_parse_colortable(s,g->lpal, 2 << (g->lflags & 7), g->eflags & 0x01 ? g->transparent : -1);
13396-               g->color_table = (stbi_uc *) g->lpal;
13397-            } else if (g->flags & 0x80) {
13398-               g->color_table = (stbi_uc *) g->pal;
13399-            } else
13400-               return stbi__errpuc("missing color table", "Corrupt GIF");
13401-
13402-            o = stbi__process_gif_raster(s, g);
13403-            if (!o) return NULL;
13404-
13405-            // if this was the first frame,
13406-            pcount = g->w * g->h;
13407-            if (first_frame && (g->bgindex > 0)) {
13408-               // if first frame, any pixel not drawn to gets the background color
13409-               for (pi = 0; pi < pcount; ++pi) {
13410-                  if (g->history[pi] == 0) {
13411-                     g->pal[g->bgindex][3] = 255; // just in case it was made transparent, undo that; It will be reset next frame if need be;
13412-                     memcpy( &g->out[pi * 4], &g->pal[g->bgindex], 4 );
13413-                  }
13414-               }
13415-            }
13416-
13417-            return o;
13418-         }
13419-
13420-         case 0x21: // Comment Extension.
13421-         {
13422-            int len;
13423-            int ext = stbi__get8(s);
13424-            if (ext == 0xF9) { // Graphic Control Extension.
13425-               len = stbi__get8(s);
13426-               if (len == 4) {
13427-                  g->eflags = stbi__get8(s);
13428-                  g->delay = 10 * stbi__get16le(s); // delay - 1/100th of a second, saving as 1/1000ths.
13429-
13430-                  // unset old transparent
13431-                  if (g->transparent >= 0) {
13432-                     g->pal[g->transparent][3] = 255;
13433-                  }
13434-                  if (g->eflags & 0x01) {
13435-                     g->transparent = stbi__get8(s);
13436-                     if (g->transparent >= 0) {
13437-                        g->pal[g->transparent][3] = 0;
13438-                     }
13439-                  } else {
13440-                     // don't need transparent
13441-                     stbi__skip(s, 1);
13442-                     g->transparent = -1;
13443-                  }
13444-               } else {
13445-                  stbi__skip(s, len);
13446-                  break;
13447-               }
13448-            }
13449-            while ((len = stbi__get8(s)) != 0) {
13450-               stbi__skip(s, len);
13451-            }
13452-            break;
13453-         }
13454-
13455-         case 0x3B: // gif stream termination code
13456-            return (stbi_uc *) s; // using '1' causes warning on some compilers
13457-
13458-         default:
13459-            return stbi__errpuc("unknown code", "Corrupt GIF");
13460-      }
13461-   }
13462-}
13463-
13464-static void *stbi__load_gif_main_outofmem(stbi__gif *g, stbi_uc *out, int **delays)
13465-{
13466-   STBI_FREE(g->out);
13467-   STBI_FREE(g->history);
13468-   STBI_FREE(g->background);
13469-
13470-   if (out) STBI_FREE(out);
13471-   if (delays && *delays) STBI_FREE(*delays);
13472-   return stbi__errpuc("outofmem", "Out of memory");
13473-}
13474-
13475-static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp)
13476-{
13477-   if (stbi__gif_test(s)) {
13478-      int layers = 0;
13479-      stbi_uc *u = 0;
13480-      stbi_uc *out = 0;
13481-      stbi_uc *two_back = 0;
13482-      stbi__gif g;
13483-      int stride;
13484-      int out_size = 0;
13485-      int delays_size = 0;
13486-
13487-      STBI_NOTUSED(out_size);
13488-      STBI_NOTUSED(delays_size);
13489-
13490-      memset(&g, 0, sizeof(g));
13491-      if (delays) {
13492-         *delays = 0;
13493-      }
13494-
13495-      do {
13496-         u = stbi__gif_load_next(s, &g, comp, req_comp, two_back);
13497-         if (u == (stbi_uc *) s) u = 0;  // end of animated gif marker
13498-
13499-         if (u) {
13500-            *x = g.w;
13501-            *y = g.h;
13502-            ++layers;
13503-            stride = g.w * g.h * 4;
13504-
13505-            if (out) {
13506-               void *tmp = (stbi_uc*) STBI_REALLOC_SIZED( out, out_size, layers * stride );
13507-               if (!tmp)
13508-                  return stbi__load_gif_main_outofmem(&g, out, delays);
13509-               else {
13510-                   out = (stbi_uc*) tmp;
13511-                   out_size = layers * stride;
13512-               }
13513-
13514-               if (delays) {
13515-                  int *new_delays = (int*) STBI_REALLOC_SIZED( *delays, delays_size, sizeof(int) * layers );
13516-                  if (!new_delays)
13517-                     return stbi__load_gif_main_outofmem(&g, out, delays);
13518-                  *delays = new_delays;
13519-                  delays_size = layers * sizeof(int);
13520-               }
13521-            } else {
13522-               out = (stbi_uc*)stbi__malloc( layers * stride );
13523-               if (!out)
13524-                  return stbi__load_gif_main_outofmem(&g, out, delays);
13525-               out_size = layers * stride;
13526-               if (delays) {
13527-                  *delays = (int*) stbi__malloc( layers * sizeof(int) );
13528-                  if (!*delays)
13529-                     return stbi__load_gif_main_outofmem(&g, out, delays);
13530-                  delays_size = layers * sizeof(int);
13531-               }
13532-            }
13533-            memcpy( out + ((layers - 1) * stride), u, stride );
13534-            if (layers >= 2) {
13535-               two_back = out - 2 * stride;
13536-            }
13537-
13538-            if (delays) {
13539-               (*delays)[layers - 1U] = g.delay;
13540-            }
13541-         }
13542-      } while (u != 0);
13543-
13544-      // free temp buffer;
13545-      STBI_FREE(g.out);
13546-      STBI_FREE(g.history);
13547-      STBI_FREE(g.background);
13548-
13549-      // do the final conversion after loading everything;
13550-      if (req_comp && req_comp != 4)
13551-         out = stbi__convert_format(out, 4, req_comp, layers * g.w, g.h);
13552-
13553-      *z = layers;
13554-      return out;
13555-   } else {
13556-      return stbi__errpuc("not GIF", "Image was not as a gif type.");
13557-   }
13558-}
13559-
13560-static void *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
13561-{
13562-   stbi_uc *u = 0;
13563-   stbi__gif g;
13564-   memset(&g, 0, sizeof(g));
13565-   STBI_NOTUSED(ri);
13566-
13567-   u = stbi__gif_load_next(s, &g, comp, req_comp, 0);
13568-   if (u == (stbi_uc *) s) u = 0;  // end of animated gif marker
13569-   if (u) {
13570-      *x = g.w;
13571-      *y = g.h;
13572-
13573-      // moved conversion to after successful load so that the same
13574-      // can be done for multiple frames.
13575-      if (req_comp && req_comp != 4)
13576-         u = stbi__convert_format(u, 4, req_comp, g.w, g.h);
13577-   } else if (g.out) {
13578-      // if there was an error and we allocated an image buffer, free it!
13579-      STBI_FREE(g.out);
13580-   }
13581-
13582-   // free buffers needed for multiple frame loading;
13583-   STBI_FREE(g.history);
13584-   STBI_FREE(g.background);
13585-
13586-   return u;
13587-}
13588-
13589-static int stbi__gif_info(stbi__context *s, int *x, int *y, int *comp)
13590-{
13591-   return stbi__gif_info_raw(s,x,y,comp);
13592+static int
13593+stbi__gif_test_raw(stbi__context *s)
13594+{
13595+	int sz;
13596+	if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' ||
13597+	    stbi__get8(s) != '8') {
13598+		return 0;
13599+	}
13600+	sz = stbi__get8(s);
13601+	if (sz != '9' && sz != '7') {
13602+		return 0;
13603+	}
13604+	if (stbi__get8(s) != 'a') {
13605+		return 0;
13606+	}
13607+	return 1;
13608+}
13609+
13610+static int
13611+stbi__gif_test(stbi__context *s)
13612+{
13613+	int r = stbi__gif_test_raw(s);
13614+	stbi__rewind(s);
13615+	return r;
13616+}
13617+
13618+static void
13619+stbi__gif_parse_colortable(stbi__context *s, stbi_uc pal[256][4],
13620+                           int num_entries, int transp)
13621+{
13622+	int i;
13623+	for (i = 0; i < num_entries; ++i) {
13624+		pal[i][2] = stbi__get8(s);
13625+		pal[i][1] = stbi__get8(s);
13626+		pal[i][0] = stbi__get8(s);
13627+		pal[i][3] = transp == i ? 0 : 255;
13628+	}
13629+}
13630+
13631+static int
13632+stbi__gif_header(stbi__context *s, stbi__gif *g, int *comp, int is_info)
13633+{
13634+	stbi_uc version;
13635+	if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' ||
13636+	    stbi__get8(s) != '8') {
13637+		return stbi__err("not GIF", "Corrupt GIF");
13638+	}
13639+
13640+	version = stbi__get8(s);
13641+	if (version != '7' && version != '9') {
13642+		return stbi__err("not GIF", "Corrupt GIF");
13643+	}
13644+	if (stbi__get8(s) != 'a') {
13645+		return stbi__err("not GIF", "Corrupt GIF");
13646+	}
13647+
13648+	stbi__g_failure_reason = "";
13649+	g->w = stbi__get16le(s);
13650+	g->h = stbi__get16le(s);
13651+	g->flags = stbi__get8(s);
13652+	g->bgindex = stbi__get8(s);
13653+	g->ratio = stbi__get8(s);
13654+	g->transparent = -1;
13655+
13656+	if (g->w > STBI_MAX_DIMENSIONS) {
13657+		return stbi__err("too large", "Very large image (corrupt?)");
13658+	}
13659+	if (g->h > STBI_MAX_DIMENSIONS) {
13660+		return stbi__err("too large", "Very large image (corrupt?)");
13661+	}
13662+
13663+	if (comp != 0) {
13664+		*comp = 4; // can't actually tell whether it's 3 or 4 until we parse the
13665+		           // comments
13666+	}
13667+
13668+	if (is_info) {
13669+		return 1;
13670+	}
13671+
13672+	if (g->flags & 0x80) {
13673+		stbi__gif_parse_colortable(s, g->pal, 2 << (g->flags & 7), -1);
13674+	}
13675+
13676+	return 1;
13677+}
13678+
13679+static int
13680+stbi__gif_info_raw(stbi__context *s, int *x, int *y, int *comp)
13681+{
13682+	stbi__gif *g = (stbi__gif *)stbi__malloc(sizeof(stbi__gif));
13683+	if (!g) {
13684+		return stbi__err("outofmem", "Out of memory");
13685+	}
13686+	if (!stbi__gif_header(s, g, comp, 1)) {
13687+		STBI_FREE(g);
13688+		stbi__rewind(s);
13689+		return 0;
13690+	}
13691+	if (x) {
13692+		*x = g->w;
13693+	}
13694+	if (y) {
13695+		*y = g->h;
13696+	}
13697+	STBI_FREE(g);
13698+	return 1;
13699+}
13700+
13701+static void
13702+stbi__out_gif_code(stbi__gif *g, stbi__uint16 code)
13703+{
13704+	stbi_uc *p, *c;
13705+	int idx;
13706+
13707+	// recurse to decode the prefixes, since the linked-list is backwards,
13708+	// and working backwards through an interleaved image would be nasty
13709+	if (g->codes[code].prefix >= 0) {
13710+		stbi__out_gif_code(g, g->codes[code].prefix);
13711+	}
13712+
13713+	if (g->cur_y >= g->max_y) {
13714+		return;
13715+	}
13716+
13717+	idx = g->cur_x + g->cur_y;
13718+	p = &g->out[idx];
13719+	g->history[idx / 4] = 1;
13720+
13721+	c = &g->color_table[g->codes[code].suffix * 4];
13722+	if (c[3] > 128) { // don't render transparent pixels;
13723+		p[0] = c[2];
13724+		p[1] = c[1];
13725+		p[2] = c[0];
13726+		p[3] = c[3];
13727+	}
13728+	g->cur_x += 4;
13729+
13730+	if (g->cur_x >= g->max_x) {
13731+		g->cur_x = g->start_x;
13732+		g->cur_y += g->step;
13733+
13734+		while (g->cur_y >= g->max_y && g->parse > 0) {
13735+			g->step = (1 << g->parse) * g->line_size;
13736+			g->cur_y = g->start_y + (g->step >> 1);
13737+			--g->parse;
13738+		}
13739+	}
13740+}
13741+
13742+static stbi_uc *
13743+stbi__process_gif_raster(stbi__context *s, stbi__gif *g)
13744+{
13745+	stbi_uc lzw_cs;
13746+	stbi__int32 len, init_code;
13747+	stbi__uint32 first;
13748+	stbi__int32 codesize, codemask, avail, oldcode, bits, valid_bits, clear;
13749+	stbi__gif_lzw *p;
13750+
13751+	lzw_cs = stbi__get8(s);
13752+	if (lzw_cs > 12) {
13753+		return NULL;
13754+	}
13755+	clear = 1 << lzw_cs;
13756+	first = 1;
13757+	codesize = lzw_cs + 1;
13758+	codemask = (1 << codesize) - 1;
13759+	bits = 0;
13760+	valid_bits = 0;
13761+	for (init_code = 0; init_code < clear; init_code++) {
13762+		g->codes[init_code].prefix = -1;
13763+		g->codes[init_code].first = (stbi_uc)init_code;
13764+		g->codes[init_code].suffix = (stbi_uc)init_code;
13765+	}
13766+
13767+	// support no starting clear code
13768+	avail = clear + 2;
13769+	oldcode = -1;
13770+
13771+	len = 0;
13772+	for (;;) {
13773+		if (valid_bits < codesize) {
13774+			if (len == 0) {
13775+				len = stbi__get8(s); // start new block
13776+				if (len == 0) {
13777+					return g->out;
13778+				}
13779+			}
13780+			--len;
13781+			bits |= (stbi__int32)stbi__get8(s) << valid_bits;
13782+			valid_bits += 8;
13783+		} else {
13784+			stbi__int32 code = bits & codemask;
13785+			bits >>= codesize;
13786+			valid_bits -= codesize;
13787+			// @OPTIMIZE: is there some way we can accelerate the non-clear
13788+			// path?
13789+			if (code == clear) { // clear code
13790+				codesize = lzw_cs + 1;
13791+				codemask = (1 << codesize) - 1;
13792+				avail = clear + 2;
13793+				oldcode = -1;
13794+				first = 0;
13795+			} else if (code == clear + 1) { // end of stream code
13796+				stbi__skip(s, len);
13797+				while ((len = stbi__get8(s)) > 0) {
13798+					stbi__skip(s, len);
13799+				}
13800+				return g->out;
13801+			} else if (code <= avail) {
13802+				if (first) {
13803+					return stbi__errpuc("no clear code", "Corrupt GIF");
13804+				}
13805+
13806+				if (oldcode >= 0) {
13807+					p = &g->codes[avail++];
13808+					if (avail > 8192) {
13809+						return stbi__errpuc("too many codes", "Corrupt GIF");
13810+					}
13811+
13812+					p->prefix = (stbi__int16)oldcode;
13813+					p->first = g->codes[oldcode].first;
13814+					p->suffix =
13815+					    (code == avail) ? p->first : g->codes[code].first;
13816+				} else if (code == avail) {
13817+					return stbi__errpuc("illegal code in raster",
13818+					                    "Corrupt GIF");
13819+				}
13820+
13821+				stbi__out_gif_code(g, (stbi__uint16)code);
13822+
13823+				if ((avail & codemask) == 0 && avail <= 0x0FFF) {
13824+					codesize++;
13825+					codemask = (1 << codesize) - 1;
13826+				}
13827+
13828+				oldcode = code;
13829+			} else {
13830+				return stbi__errpuc("illegal code in raster", "Corrupt GIF");
13831+			}
13832+		}
13833+	}
13834+}
13835+
13836+// this function is designed to support animated gifs, although stb_image
13837+// doesn't support it two back is the image from two frames ago, used for a very
13838+// specific disposal format
13839+static stbi_uc *
13840+stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, int req_comp,
13841+                    stbi_uc *two_back)
13842+{
13843+	int dispose;
13844+	int first_frame;
13845+	int pi;
13846+	int pcount;
13847+	STBI_NOTUSED(req_comp);
13848+
13849+	// on first frame, any non-written pixels get the background colour
13850+	// (non-transparent)
13851+	first_frame = 0;
13852+	if (g->out == 0) {
13853+		if (!stbi__gif_header(s, g, comp, 0)) {
13854+			return 0; // stbi__g_failure_reason set by stbi__gif_header
13855+		}
13856+		if (!stbi__mad3sizes_valid(4, g->w, g->h, 0)) {
13857+			return stbi__errpuc("too large", "GIF image is too large");
13858+		}
13859+		pcount = g->w * g->h;
13860+		g->out = (stbi_uc *)stbi__malloc(4 * pcount);
13861+		g->background = (stbi_uc *)stbi__malloc(4 * pcount);
13862+		g->history = (stbi_uc *)stbi__malloc(pcount);
13863+		if (!g->out || !g->background || !g->history) {
13864+			return stbi__errpuc("outofmem", "Out of memory");
13865+		}
13866+
13867+		// image is treated as "transparent" at the start - ie, nothing
13868+		// overwrites the current background; background colour is only used for
13869+		// pixels that are not rendered first frame, after that "background"
13870+		// color refers to the color that was there the previous frame.
13871+		memset(g->out, 0x00, 4 * pcount);
13872+		memset(g->background, 0x00,
13873+		       4 * pcount); // state of the background (starts transparent)
13874+		memset(g->history, 0x00,
13875+		       pcount); // pixels that were affected previous frame
13876+		first_frame = 1;
13877+	} else {
13878+		// second frame - how do we dispose of the previous one?
13879+		dispose = (g->eflags & 0x1C) >> 2;
13880+		pcount = g->w * g->h;
13881+
13882+		if ((dispose == 3) && (two_back == 0)) {
13883+			dispose = 2; // if I don't have an image to revert back to, default
13884+			             // to the old background
13885+		}
13886+
13887+		if (dispose == 3) { // use previous graphic
13888+			for (pi = 0; pi < pcount; ++pi) {
13889+				if (g->history[pi]) {
13890+					memcpy(&g->out[pi * 4], &two_back[pi * 4], 4);
13891+				}
13892+			}
13893+		} else if (dispose == 2) {
13894+			// restore what was changed last frame to background before that
13895+			// frame;
13896+			for (pi = 0; pi < pcount; ++pi) {
13897+				if (g->history[pi]) {
13898+					memcpy(&g->out[pi * 4], &g->background[pi * 4], 4);
13899+				}
13900+			}
13901+		} else {
13902+			// This is a non-disposal case eithe way, so just
13903+			// leave the pixels as is, and they will become the new background
13904+			// 1: do not dispose
13905+			// 0:  not specified.
13906+		}
13907+
13908+		// background is what out is after the undoing of the previou frame;
13909+		memcpy(g->background, g->out, 4 * g->w * g->h);
13910+	}
13911+
13912+	// clear my history;
13913+	memset(g->history, 0x00,
13914+	       g->w * g->h); // pixels that were affected previous frame
13915+
13916+	for (;;) {
13917+		int tag = stbi__get8(s);
13918+		switch (tag) {
13919+		case 0x2C: /* Image Descriptor */
13920+		{
13921+			stbi__int32 x, y, w, h;
13922+			stbi_uc *o;
13923+
13924+			x = stbi__get16le(s);
13925+			y = stbi__get16le(s);
13926+			w = stbi__get16le(s);
13927+			h = stbi__get16le(s);
13928+			if (((x + w) > (g->w)) || ((y + h) > (g->h))) {
13929+				return stbi__errpuc("bad Image Descriptor", "Corrupt GIF");
13930+			}
13931+
13932+			g->line_size = g->w * 4;
13933+			g->start_x = x * 4;
13934+			g->start_y = y * g->line_size;
13935+			g->max_x = g->start_x + w * 4;
13936+			g->max_y = g->start_y + h * g->line_size;
13937+			g->cur_x = g->start_x;
13938+			g->cur_y = g->start_y;
13939+
13940+			// if the width of the specified rectangle is 0, that means
13941+			// we may not see *any* pixels or the image is malformed;
13942+			// to make sure this is caught, move the current y down to
13943+			// max_y (which is what out_gif_code checks).
13944+			if (w == 0) {
13945+				g->cur_y = g->max_y;
13946+			}
13947+
13948+			g->lflags = stbi__get8(s);
13949+
13950+			if (g->lflags & 0x40) {
13951+				g->step = 8 * g->line_size; // first interlaced spacing
13952+				g->parse = 3;
13953+			} else {
13954+				g->step = g->line_size;
13955+				g->parse = 0;
13956+			}
13957+
13958+			if (g->lflags & 0x80) {
13959+				stbi__gif_parse_colortable(s, g->lpal, 2 << (g->lflags & 7),
13960+				                           g->eflags & 0x01 ? g->transparent
13961+				                                            : -1);
13962+				g->color_table = (stbi_uc *)g->lpal;
13963+			} else if (g->flags & 0x80) {
13964+				g->color_table = (stbi_uc *)g->pal;
13965+			} else {
13966+				return stbi__errpuc("missing color table", "Corrupt GIF");
13967+			}
13968+
13969+			o = stbi__process_gif_raster(s, g);
13970+			if (!o) {
13971+				return NULL;
13972+			}
13973+
13974+			// if this was the first frame,
13975+			pcount = g->w * g->h;
13976+			if (first_frame && (g->bgindex > 0)) {
13977+				// if first frame, any pixel not drawn to gets the background
13978+				// color
13979+				for (pi = 0; pi < pcount; ++pi) {
13980+					if (g->history[pi] == 0) {
13981+						g->pal[g->bgindex][3] =
13982+						    255; // just in case it was made transparent, undo
13983+						         // that; It will be reset next frame if need
13984+						         // be;
13985+						memcpy(&g->out[pi * 4], &g->pal[g->bgindex], 4);
13986+					}
13987+				}
13988+			}
13989+
13990+			return o;
13991+		}
13992+
13993+		case 0x21: // Comment Extension.
13994+		{
13995+			int len;
13996+			int ext = stbi__get8(s);
13997+			if (ext == 0xF9) { // Graphic Control Extension.
13998+				len = stbi__get8(s);
13999+				if (len == 4) {
14000+					g->eflags = stbi__get8(s);
14001+					g->delay =
14002+					    10 * stbi__get16le(s); // delay - 1/100th of a second,
14003+					                           // saving as 1/1000ths.
14004+
14005+					// unset old transparent
14006+					if (g->transparent >= 0) {
14007+						g->pal[g->transparent][3] = 255;
14008+					}
14009+					if (g->eflags & 0x01) {
14010+						g->transparent = stbi__get8(s);
14011+						if (g->transparent >= 0) {
14012+							g->pal[g->transparent][3] = 0;
14013+						}
14014+					} else {
14015+						// don't need transparent
14016+						stbi__skip(s, 1);
14017+						g->transparent = -1;
14018+					}
14019+				} else {
14020+					stbi__skip(s, len);
14021+					break;
14022+				}
14023+			}
14024+			while ((len = stbi__get8(s)) != 0) {
14025+				stbi__skip(s, len);
14026+			}
14027+			break;
14028+		}
14029+
14030+		case 0x3B:               // gif stream termination code
14031+			return (stbi_uc *)s; // using '1' causes warning on some compilers
14032+
14033+		default:
14034+			return stbi__errpuc("unknown code", "Corrupt GIF");
14035+		}
14036+	}
14037+}
14038+
14039+static void *
14040+stbi__load_gif_main_outofmem(stbi__gif *g, stbi_uc *out, int **delays)
14041+{
14042+	STBI_FREE(g->out);
14043+	STBI_FREE(g->history);
14044+	STBI_FREE(g->background);
14045+
14046+	if (out) {
14047+		STBI_FREE(out);
14048+	}
14049+	if (delays && *delays) {
14050+		STBI_FREE(*delays);
14051+	}
14052+	return stbi__errpuc("outofmem", "Out of memory");
14053+}
14054+
14055+static void *
14056+stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z,
14057+                    int *comp, int req_comp)
14058+{
14059+	if (stbi__gif_test(s)) {
14060+		int layers = 0;
14061+		stbi_uc *u = 0;
14062+		stbi_uc *out = 0;
14063+		stbi_uc *two_back = 0;
14064+		stbi__gif g;
14065+		int stride;
14066+		int out_size = 0;
14067+		int delays_size = 0;
14068+
14069+		STBI_NOTUSED(out_size);
14070+		STBI_NOTUSED(delays_size);
14071+
14072+		memset(&g, 0, sizeof(g));
14073+		if (delays) {
14074+			*delays = 0;
14075+		}
14076+
14077+		do {
14078+			u = stbi__gif_load_next(s, &g, comp, req_comp, two_back);
14079+			if (u == (stbi_uc *)s) {
14080+				u = 0; // end of animated gif marker
14081+			}
14082+
14083+			if (u) {
14084+				*x = g.w;
14085+				*y = g.h;
14086+				++layers;
14087+				stride = g.w * g.h * 4;
14088+
14089+				if (out) {
14090+					void *tmp = (stbi_uc *)STBI_REALLOC_SIZED(out, out_size,
14091+					                                          layers * stride);
14092+					if (!tmp) {
14093+						return stbi__load_gif_main_outofmem(&g, out, delays);
14094+					} else {
14095+						out = (stbi_uc *)tmp;
14096+						out_size = layers * stride;
14097+					}
14098+
14099+					if (delays) {
14100+						int *new_delays = (int *)STBI_REALLOC_SIZED(
14101+						    *delays, delays_size, sizeof(int) * layers);
14102+						if (!new_delays) {
14103+							return stbi__load_gif_main_outofmem(&g, out,
14104+							                                    delays);
14105+						}
14106+						*delays = new_delays;
14107+						delays_size = layers * sizeof(int);
14108+					}
14109+				} else {
14110+					out = (stbi_uc *)stbi__malloc(layers * stride);
14111+					if (!out) {
14112+						return stbi__load_gif_main_outofmem(&g, out, delays);
14113+					}
14114+					out_size = layers * stride;
14115+					if (delays) {
14116+						*delays = (int *)stbi__malloc(layers * sizeof(int));
14117+						if (!*delays) {
14118+							return stbi__load_gif_main_outofmem(&g, out,
14119+							                                    delays);
14120+						}
14121+						delays_size = layers * sizeof(int);
14122+					}
14123+				}
14124+				memcpy(out + ((layers - 1) * stride), u, stride);
14125+				if (layers >= 2) {
14126+					two_back = out - 2 * stride;
14127+				}
14128+
14129+				if (delays) {
14130+					(*delays)[layers - 1U] = g.delay;
14131+				}
14132+			}
14133+		} while (u != 0);
14134+
14135+		// free temp buffer;
14136+		STBI_FREE(g.out);
14137+		STBI_FREE(g.history);
14138+		STBI_FREE(g.background);
14139+
14140+		// do the final conversion after loading everything;
14141+		if (req_comp && req_comp != 4) {
14142+			out = stbi__convert_format(out, 4, req_comp, layers * g.w, g.h);
14143+		}
14144+
14145+		*z = layers;
14146+		return out;
14147+	} else {
14148+		return stbi__errpuc("not GIF", "Image was not as a gif type.");
14149+	}
14150+}
14151+
14152+static void *
14153+stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp,
14154+               stbi__result_info *ri)
14155+{
14156+	stbi_uc *u = 0;
14157+	stbi__gif g;
14158+	memset(&g, 0, sizeof(g));
14159+	STBI_NOTUSED(ri);
14160+
14161+	u = stbi__gif_load_next(s, &g, comp, req_comp, 0);
14162+	if (u == (stbi_uc *)s) {
14163+		u = 0; // end of animated gif marker
14164+	}
14165+	if (u) {
14166+		*x = g.w;
14167+		*y = g.h;
14168+
14169+		// moved conversion to after successful load so that the same
14170+		// can be done for multiple frames.
14171+		if (req_comp && req_comp != 4) {
14172+			u = stbi__convert_format(u, 4, req_comp, g.w, g.h);
14173+		}
14174+	} else if (g.out) {
14175+		// if there was an error and we allocated an image buffer, free it!
14176+		STBI_FREE(g.out);
14177+	}
14178+
14179+	// free buffers needed for multiple frame loading;
14180+	STBI_FREE(g.history);
14181+	STBI_FREE(g.background);
14182+
14183+	return u;
14184+}
14185+
14186+static int
14187+stbi__gif_info(stbi__context *s, int *x, int *y, int *comp)
14188+{
14189+	return stbi__gif_info_raw(s, x, y, comp);
14190 }
14191 #endif
14192 
14193@@ -7084,397 +8821,496 @@ static int stbi__gif_info(stbi__context *s, int *x, int *y, int *comp)
14194 // Radiance RGBE HDR loader
14195 // originally by Nicolas Schulz
14196 #ifndef STBI_NO_HDR
14197-static int stbi__hdr_test_core(stbi__context *s, const char *signature)
14198-{
14199-   int i;
14200-   for (i=0; signature[i]; ++i)
14201-      if (stbi__get8(s) != signature[i])
14202-          return 0;
14203-   stbi__rewind(s);
14204-   return 1;
14205-}
14206-
14207-static int stbi__hdr_test(stbi__context* s)
14208-{
14209-   int r = stbi__hdr_test_core(s, "#?RADIANCE\n");
14210-   stbi__rewind(s);
14211-   if(!r) {
14212-       r = stbi__hdr_test_core(s, "#?RGBE\n");
14213-       stbi__rewind(s);
14214-   }
14215-   return r;
14216-}
14217-
14218-#define STBI__HDR_BUFLEN  1024
14219-static char *stbi__hdr_gettoken(stbi__context *z, char *buffer)
14220-{
14221-   int len=0;
14222-   char c = '\0';
14223-
14224-   c = (char) stbi__get8(z);
14225-
14226-   while (!stbi__at_eof(z) && c != '\n') {
14227-      buffer[len++] = c;
14228-      if (len == STBI__HDR_BUFLEN-1) {
14229-         // flush to end of line
14230-         while (!stbi__at_eof(z) && stbi__get8(z) != '\n')
14231-            ;
14232-         break;
14233-      }
14234-      c = (char) stbi__get8(z);
14235-   }
14236-
14237-   buffer[len] = 0;
14238-   return buffer;
14239-}
14240-
14241-static void stbi__hdr_convert(float *output, stbi_uc *input, int req_comp)
14242-{
14243-   if ( input[3] != 0 ) {
14244-      float f1;
14245-      // Exponent
14246-      f1 = (float) ldexp(1.0f, input[3] - (int)(128 + 8));
14247-      if (req_comp <= 2)
14248-         output[0] = (input[0] + input[1] + input[2]) * f1 / 3;
14249-      else {
14250-         output[0] = input[0] * f1;
14251-         output[1] = input[1] * f1;
14252-         output[2] = input[2] * f1;
14253-      }
14254-      if (req_comp == 2) output[1] = 1;
14255-      if (req_comp == 4) output[3] = 1;
14256-   } else {
14257-      switch (req_comp) {
14258-         case 4: output[3] = 1; /* fallthrough */
14259-         case 3: output[0] = output[1] = output[2] = 0;
14260-                 break;
14261-         case 2: output[1] = 1; /* fallthrough */
14262-         case 1: output[0] = 0;
14263-                 break;
14264-      }
14265-   }
14266-}
14267-
14268-static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
14269-{
14270-   char buffer[STBI__HDR_BUFLEN];
14271-   char *token;
14272-   int valid = 0;
14273-   int width, height;
14274-   stbi_uc *scanline;
14275-   float *hdr_data;
14276-   int len;
14277-   unsigned char count, value;
14278-   int i, j, k, c1,c2, z;
14279-   const char *headerToken;
14280-   STBI_NOTUSED(ri);
14281-
14282-   // Check identifier
14283-   headerToken = stbi__hdr_gettoken(s,buffer);
14284-   if (strcmp(headerToken, "#?RADIANCE") != 0 && strcmp(headerToken, "#?RGBE") != 0)
14285-      return stbi__errpf("not HDR", "Corrupt HDR image");
14286-
14287-   // Parse header
14288-   for(;;) {
14289-      token = stbi__hdr_gettoken(s,buffer);
14290-      if (token[0] == 0) break;
14291-      if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) valid = 1;
14292-   }
14293-
14294-   if (!valid)    return stbi__errpf("unsupported format", "Unsupported HDR format");
14295-
14296-   // Parse width and height
14297-   // can't use sscanf() if we're not using stdio!
14298-   token = stbi__hdr_gettoken(s,buffer);
14299-   if (strncmp(token, "-Y ", 3))  return stbi__errpf("unsupported data layout", "Unsupported HDR format");
14300-   token += 3;
14301-   height = (int) strtol(token, &token, 10);
14302-   while (*token == ' ') ++token;
14303-   if (strncmp(token, "+X ", 3))  return stbi__errpf("unsupported data layout", "Unsupported HDR format");
14304-   token += 3;
14305-   width = (int) strtol(token, NULL, 10);
14306-
14307-   if (height > STBI_MAX_DIMENSIONS) return stbi__errpf("too large","Very large image (corrupt?)");
14308-   if (width > STBI_MAX_DIMENSIONS) return stbi__errpf("too large","Very large image (corrupt?)");
14309-
14310-   *x = width;
14311-   *y = height;
14312-
14313-   if (comp) *comp = 3;
14314-   if (req_comp == 0) req_comp = 3;
14315-
14316-   if (!stbi__mad4sizes_valid(width, height, req_comp, sizeof(float), 0))
14317-      return stbi__errpf("too large", "HDR image is too large");
14318-
14319-   // Read data
14320-   hdr_data = (float *) stbi__malloc_mad4(width, height, req_comp, sizeof(float), 0);
14321-   if (!hdr_data)
14322-      return stbi__errpf("outofmem", "Out of memory");
14323-
14324-   // Load image data
14325-   // image data is stored as some number of sca
14326-   if ( width < 8 || width >= 32768) {
14327-      // Read flat data
14328-      for (j=0; j < height; ++j) {
14329-         for (i=0; i < width; ++i) {
14330-            stbi_uc rgbe[4];
14331-           main_decode_loop:
14332-            stbi__getn(s, rgbe, 4);
14333-            stbi__hdr_convert(hdr_data + j * width * req_comp + i * req_comp, rgbe, req_comp);
14334-         }
14335-      }
14336-   } else {
14337-      // Read RLE-encoded data
14338-      scanline = NULL;
14339-
14340-      for (j = 0; j < height; ++j) {
14341-         c1 = stbi__get8(s);
14342-         c2 = stbi__get8(s);
14343-         len = stbi__get8(s);
14344-         if (c1 != 2 || c2 != 2 || (len & 0x80)) {
14345-            // not run-length encoded, so we have to actually use THIS data as a decoded
14346-            // pixel (note this can't be a valid pixel--one of RGB must be >= 128)
14347-            stbi_uc rgbe[4];
14348-            rgbe[0] = (stbi_uc) c1;
14349-            rgbe[1] = (stbi_uc) c2;
14350-            rgbe[2] = (stbi_uc) len;
14351-            rgbe[3] = (stbi_uc) stbi__get8(s);
14352-            stbi__hdr_convert(hdr_data, rgbe, req_comp);
14353-            i = 1;
14354-            j = 0;
14355-            STBI_FREE(scanline);
14356-            goto main_decode_loop; // yes, this makes no sense
14357-         }
14358-         len <<= 8;
14359-         len |= stbi__get8(s);
14360-         if (len != width) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("invalid decoded scanline length", "corrupt HDR"); }
14361-         if (scanline == NULL) {
14362-            scanline = (stbi_uc *) stbi__malloc_mad2(width, 4, 0);
14363-            if (!scanline) {
14364-               STBI_FREE(hdr_data);
14365-               return stbi__errpf("outofmem", "Out of memory");
14366-            }
14367-         }
14368-
14369-         for (k = 0; k < 4; ++k) {
14370-            int nleft;
14371-            i = 0;
14372-            while ((nleft = width - i) > 0) {
14373-               count = stbi__get8(s);
14374-               if (count > 128) {
14375-                  // Run
14376-                  value = stbi__get8(s);
14377-                  count -= 128;
14378-                  if ((count == 0) || (count > nleft)) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
14379-                  for (z = 0; z < count; ++z)
14380-                     scanline[i++ * 4 + k] = value;
14381-               } else {
14382-                  // Dump
14383-                  if ((count == 0) || (count > nleft)) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
14384-                  for (z = 0; z < count; ++z)
14385-                     scanline[i++ * 4 + k] = stbi__get8(s);
14386-               }
14387-            }
14388-         }
14389-         for (i=0; i < width; ++i)
14390-            stbi__hdr_convert(hdr_data+(j*width + i)*req_comp, scanline + i*4, req_comp);
14391-      }
14392-      if (scanline)
14393-         STBI_FREE(scanline);
14394-   }
14395-
14396-   return hdr_data;
14397-}
14398-
14399-static int stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp)
14400-{
14401-   char buffer[STBI__HDR_BUFLEN];
14402-   char *token;
14403-   int valid = 0;
14404-   int dummy;
14405-
14406-   if (!x) x = &dummy;
14407-   if (!y) y = &dummy;
14408-   if (!comp) comp = &dummy;
14409-
14410-   if (stbi__hdr_test(s) == 0) {
14411-       stbi__rewind( s );
14412-       return 0;
14413-   }
14414-
14415-   for(;;) {
14416-      token = stbi__hdr_gettoken(s,buffer);
14417-      if (token[0] == 0) break;
14418-      if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) valid = 1;
14419-   }
14420-
14421-   if (!valid) {
14422-       stbi__rewind( s );
14423-       return 0;
14424-   }
14425-   token = stbi__hdr_gettoken(s,buffer);
14426-   if (strncmp(token, "-Y ", 3)) {
14427-       stbi__rewind( s );
14428-       return 0;
14429-   }
14430-   token += 3;
14431-   *y = (int) strtol(token, &token, 10);
14432-   while (*token == ' ') ++token;
14433-   if (strncmp(token, "+X ", 3)) {
14434-       stbi__rewind( s );
14435-       return 0;
14436-   }
14437-   token += 3;
14438-   *x = (int) strtol(token, NULL, 10);
14439-   *comp = 3;
14440-   return 1;
14441+static int
14442+stbi__hdr_test_core(stbi__context *s, const char *signature)
14443+{
14444+	int i;
14445+	for (i = 0; signature[i]; ++i) {
14446+		if (stbi__get8(s) != signature[i]) {
14447+			return 0;
14448+		}
14449+	}
14450+	stbi__rewind(s);
14451+	return 1;
14452+}
14453+
14454+static int
14455+stbi__hdr_test(stbi__context *s)
14456+{
14457+	int r = stbi__hdr_test_core(s, "#?RADIANCE\n");
14458+	stbi__rewind(s);
14459+	if (!r) {
14460+		r = stbi__hdr_test_core(s, "#?RGBE\n");
14461+		stbi__rewind(s);
14462+	}
14463+	return r;
14464+}
14465+
14466+#define STBI__HDR_BUFLEN 1024
14467+static char *
14468+stbi__hdr_gettoken(stbi__context *z, char *buffer)
14469+{
14470+	int len = 0;
14471+	char c = '\0';
14472+
14473+	c = (char)stbi__get8(z);
14474+
14475+	while (!stbi__at_eof(z) && c != '\n') {
14476+		buffer[len++] = c;
14477+		if (len == STBI__HDR_BUFLEN - 1) {
14478+			// flush to end of line
14479+			while (!stbi__at_eof(z) && stbi__get8(z) != '\n')
14480+				;
14481+			break;
14482+		}
14483+		c = (char)stbi__get8(z);
14484+	}
14485+
14486+	buffer[len] = 0;
14487+	return buffer;
14488+}
14489+
14490+static void
14491+stbi__hdr_convert(float *output, stbi_uc *input, int req_comp)
14492+{
14493+	if (input[3] != 0) {
14494+		float f1;
14495+		// Exponent
14496+		f1 = (float)ldexp(1.0f, input[3] - (int)(128 + 8));
14497+		if (req_comp <= 2) {
14498+			output[0] = (input[0] + input[1] + input[2]) * f1 / 3;
14499+		} else {
14500+			output[0] = input[0] * f1;
14501+			output[1] = input[1] * f1;
14502+			output[2] = input[2] * f1;
14503+		}
14504+		if (req_comp == 2) {
14505+			output[1] = 1;
14506+		}
14507+		if (req_comp == 4) {
14508+			output[3] = 1;
14509+		}
14510+	} else {
14511+		switch (req_comp) {
14512+		case 4:
14513+			output[3] = 1; /* fallthrough */
14514+		case 3:
14515+			output[0] = output[1] = output[2] = 0;
14516+			break;
14517+		case 2:
14518+			output[1] = 1; /* fallthrough */
14519+		case 1:
14520+			output[0] = 0;
14521+			break;
14522+		}
14523+	}
14524+}
14525+
14526+static float *
14527+stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp,
14528+               stbi__result_info *ri)
14529+{
14530+	char buffer[STBI__HDR_BUFLEN];
14531+	char *token;
14532+	int valid = 0;
14533+	int width, height;
14534+	stbi_uc *scanline;
14535+	float *hdr_data;
14536+	int len;
14537+	unsigned char count, value;
14538+	int i, j, k, c1, c2, z;
14539+	const char *headerToken;
14540+	STBI_NOTUSED(ri);
14541+
14542+	// Check identifier
14543+	headerToken = stbi__hdr_gettoken(s, buffer);
14544+	if (strcmp(headerToken, "#?RADIANCE") != 0 &&
14545+	    strcmp(headerToken, "#?RGBE") != 0) {
14546+		return stbi__errpf("not HDR", "Corrupt HDR image");
14547+	}
14548+
14549+	// Parse header
14550+	for (;;) {
14551+		token = stbi__hdr_gettoken(s, buffer);
14552+		if (token[0] == 0) {
14553+			break;
14554+		}
14555+		if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) {
14556+			valid = 1;
14557+		}
14558+	}
14559+
14560+	if (!valid) {
14561+		return stbi__errpf("unsupported format", "Unsupported HDR format");
14562+	}
14563+
14564+	// Parse width and height
14565+	// can't use sscanf() if we're not using stdio!
14566+	token = stbi__hdr_gettoken(s, buffer);
14567+	if (strncmp(token, "-Y ", 3)) {
14568+		return stbi__errpf("unsupported data layout", "Unsupported HDR format");
14569+	}
14570+	token += 3;
14571+	height = (int)strtol(token, &token, 10);
14572+	while (*token == ' ') {
14573+		++token;
14574+	}
14575+	if (strncmp(token, "+X ", 3)) {
14576+		return stbi__errpf("unsupported data layout", "Unsupported HDR format");
14577+	}
14578+	token += 3;
14579+	width = (int)strtol(token, NULL, 10);
14580+
14581+	if (height > STBI_MAX_DIMENSIONS) {
14582+		return stbi__errpf("too large", "Very large image (corrupt?)");
14583+	}
14584+	if (width > STBI_MAX_DIMENSIONS) {
14585+		return stbi__errpf("too large", "Very large image (corrupt?)");
14586+	}
14587+
14588+	*x = width;
14589+	*y = height;
14590+
14591+	if (comp) {
14592+		*comp = 3;
14593+	}
14594+	if (req_comp == 0) {
14595+		req_comp = 3;
14596+	}
14597+
14598+	if (!stbi__mad4sizes_valid(width, height, req_comp, sizeof(float), 0)) {
14599+		return stbi__errpf("too large", "HDR image is too large");
14600+	}
14601+
14602+	// Read data
14603+	hdr_data =
14604+	    (float *)stbi__malloc_mad4(width, height, req_comp, sizeof(float), 0);
14605+	if (!hdr_data) {
14606+		return stbi__errpf("outofmem", "Out of memory");
14607+	}
14608+
14609+	// Load image data
14610+	// image data is stored as some number of sca
14611+	if (width < 8 || width >= 32768) {
14612+		// Read flat data
14613+		for (j = 0; j < height; ++j) {
14614+			for (i = 0; i < width; ++i) {
14615+				stbi_uc rgbe[4];
14616+			main_decode_loop:
14617+				stbi__getn(s, rgbe, 4);
14618+				stbi__hdr_convert(hdr_data + j * width * req_comp +
14619+				                      i * req_comp,
14620+				                  rgbe, req_comp);
14621+			}
14622+		}
14623+	} else {
14624+		// Read RLE-encoded data
14625+		scanline = NULL;
14626+
14627+		for (j = 0; j < height; ++j) {
14628+			c1 = stbi__get8(s);
14629+			c2 = stbi__get8(s);
14630+			len = stbi__get8(s);
14631+			if (c1 != 2 || c2 != 2 || (len & 0x80)) {
14632+				// not run-length encoded, so we have to actually use THIS data
14633+				// as a decoded pixel (note this can't be a valid pixel--one of
14634+				// RGB must be >= 128)
14635+				stbi_uc rgbe[4];
14636+				rgbe[0] = (stbi_uc)c1;
14637+				rgbe[1] = (stbi_uc)c2;
14638+				rgbe[2] = (stbi_uc)len;
14639+				rgbe[3] = (stbi_uc)stbi__get8(s);
14640+				stbi__hdr_convert(hdr_data, rgbe, req_comp);
14641+				i = 1;
14642+				j = 0;
14643+				STBI_FREE(scanline);
14644+				goto main_decode_loop; // yes, this makes no sense
14645+			}
14646+			len <<= 8;
14647+			len |= stbi__get8(s);
14648+			if (len != width) {
14649+				STBI_FREE(hdr_data);
14650+				STBI_FREE(scanline);
14651+				return stbi__errpf("invalid decoded scanline length",
14652+				                   "corrupt HDR");
14653+			}
14654+			if (scanline == NULL) {
14655+				scanline = (stbi_uc *)stbi__malloc_mad2(width, 4, 0);
14656+				if (!scanline) {
14657+					STBI_FREE(hdr_data);
14658+					return stbi__errpf("outofmem", "Out of memory");
14659+				}
14660+			}
14661+
14662+			for (k = 0; k < 4; ++k) {
14663+				int nleft;
14664+				i = 0;
14665+				while ((nleft = width - i) > 0) {
14666+					count = stbi__get8(s);
14667+					if (count > 128) {
14668+						// Run
14669+						value = stbi__get8(s);
14670+						count -= 128;
14671+						if ((count == 0) || (count > nleft)) {
14672+							STBI_FREE(hdr_data);
14673+							STBI_FREE(scanline);
14674+							return stbi__errpf("corrupt",
14675+							                   "bad RLE data in HDR");
14676+						}
14677+						for (z = 0; z < count; ++z) {
14678+							scanline[i++ * 4 + k] = value;
14679+						}
14680+					} else {
14681+						// Dump
14682+						if ((count == 0) || (count > nleft)) {
14683+							STBI_FREE(hdr_data);
14684+							STBI_FREE(scanline);
14685+							return stbi__errpf("corrupt",
14686+							                   "bad RLE data in HDR");
14687+						}
14688+						for (z = 0; z < count; ++z) {
14689+							scanline[i++ * 4 + k] = stbi__get8(s);
14690+						}
14691+					}
14692+				}
14693+			}
14694+			for (i = 0; i < width; ++i) {
14695+				stbi__hdr_convert(hdr_data + (j * width + i) * req_comp,
14696+				                  scanline + i * 4, req_comp);
14697+			}
14698+		}
14699+		if (scanline) {
14700+			STBI_FREE(scanline);
14701+		}
14702+	}
14703+
14704+	return hdr_data;
14705+}
14706+
14707+static int
14708+stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp)
14709+{
14710+	char buffer[STBI__HDR_BUFLEN];
14711+	char *token;
14712+	int valid = 0;
14713+	int dummy;
14714+
14715+	if (!x) {
14716+		x = &dummy;
14717+	}
14718+	if (!y) {
14719+		y = &dummy;
14720+	}
14721+	if (!comp) {
14722+		comp = &dummy;
14723+	}
14724+
14725+	if (stbi__hdr_test(s) == 0) {
14726+		stbi__rewind(s);
14727+		return 0;
14728+	}
14729+
14730+	for (;;) {
14731+		token = stbi__hdr_gettoken(s, buffer);
14732+		if (token[0] == 0) {
14733+			break;
14734+		}
14735+		if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) {
14736+			valid = 1;
14737+		}
14738+	}
14739+
14740+	if (!valid) {
14741+		stbi__rewind(s);
14742+		return 0;
14743+	}
14744+	token = stbi__hdr_gettoken(s, buffer);
14745+	if (strncmp(token, "-Y ", 3)) {
14746+		stbi__rewind(s);
14747+		return 0;
14748+	}
14749+	token += 3;
14750+	*y = (int)strtol(token, &token, 10);
14751+	while (*token == ' ') {
14752+		++token;
14753+	}
14754+	if (strncmp(token, "+X ", 3)) {
14755+		stbi__rewind(s);
14756+		return 0;
14757+	}
14758+	token += 3;
14759+	*x = (int)strtol(token, NULL, 10);
14760+	*comp = 3;
14761+	return 1;
14762 }
14763 #endif // STBI_NO_HDR
14764 
14765 #ifndef STBI_NO_BMP
14766-static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp)
14767-{
14768-   void *p;
14769-   stbi__bmp_data info;
14770-
14771-   info.all_a = 255;
14772-   p = stbi__bmp_parse_header(s, &info);
14773-   if (p == NULL) {
14774-      stbi__rewind( s );
14775-      return 0;
14776-   }
14777-   if (x) *x = s->img_x;
14778-   if (y) *y = s->img_y;
14779-   if (comp) {
14780-      if (info.bpp == 24 && info.ma == 0xff000000)
14781-         *comp = 3;
14782-      else
14783-         *comp = info.ma ? 4 : 3;
14784-   }
14785-   return 1;
14786+static int
14787+stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp)
14788+{
14789+	void *p;
14790+	stbi__bmp_data info;
14791+
14792+	info.all_a = 255;
14793+	p = stbi__bmp_parse_header(s, &info);
14794+	if (p == NULL) {
14795+		stbi__rewind(s);
14796+		return 0;
14797+	}
14798+	if (x) {
14799+		*x = s->img_x;
14800+	}
14801+	if (y) {
14802+		*y = s->img_y;
14803+	}
14804+	if (comp) {
14805+		if (info.bpp == 24 && info.ma == 0xff000000) {
14806+			*comp = 3;
14807+		} else {
14808+			*comp = info.ma ? 4 : 3;
14809+		}
14810+	}
14811+	return 1;
14812 }
14813 #endif
14814 
14815 #ifndef STBI_NO_PSD
14816-static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp)
14817-{
14818-   int channelCount, dummy, depth;
14819-   if (!x) x = &dummy;
14820-   if (!y) y = &dummy;
14821-   if (!comp) comp = &dummy;
14822-   if (stbi__get32be(s) != 0x38425053) {
14823-       stbi__rewind( s );
14824-       return 0;
14825-   }
14826-   if (stbi__get16be(s) != 1) {
14827-       stbi__rewind( s );
14828-       return 0;
14829-   }
14830-   stbi__skip(s, 6);
14831-   channelCount = stbi__get16be(s);
14832-   if (channelCount < 0 || channelCount > 16) {
14833-       stbi__rewind( s );
14834-       return 0;
14835-   }
14836-   *y = stbi__get32be(s);
14837-   *x = stbi__get32be(s);
14838-   depth = stbi__get16be(s);
14839-   if (depth != 8 && depth != 16) {
14840-       stbi__rewind( s );
14841-       return 0;
14842-   }
14843-   if (stbi__get16be(s) != 3) {
14844-       stbi__rewind( s );
14845-       return 0;
14846-   }
14847-   *comp = 4;
14848-   return 1;
14849-}
14850-
14851-static int stbi__psd_is16(stbi__context *s)
14852-{
14853-   int channelCount, depth;
14854-   if (stbi__get32be(s) != 0x38425053) {
14855-       stbi__rewind( s );
14856-       return 0;
14857-   }
14858-   if (stbi__get16be(s) != 1) {
14859-       stbi__rewind( s );
14860-       return 0;
14861-   }
14862-   stbi__skip(s, 6);
14863-   channelCount = stbi__get16be(s);
14864-   if (channelCount < 0 || channelCount > 16) {
14865-       stbi__rewind( s );
14866-       return 0;
14867-   }
14868-   STBI_NOTUSED(stbi__get32be(s));
14869-   STBI_NOTUSED(stbi__get32be(s));
14870-   depth = stbi__get16be(s);
14871-   if (depth != 16) {
14872-       stbi__rewind( s );
14873-       return 0;
14874-   }
14875-   return 1;
14876+static int
14877+stbi__psd_info(stbi__context *s, int *x, int *y, int *comp)
14878+{
14879+	int channelCount, dummy, depth;
14880+	if (!x) {
14881+		x = &dummy;
14882+	}
14883+	if (!y) {
14884+		y = &dummy;
14885+	}
14886+	if (!comp) {
14887+		comp = &dummy;
14888+	}
14889+	if (stbi__get32be(s) != 0x38425053) {
14890+		stbi__rewind(s);
14891+		return 0;
14892+	}
14893+	if (stbi__get16be(s) != 1) {
14894+		stbi__rewind(s);
14895+		return 0;
14896+	}
14897+	stbi__skip(s, 6);
14898+	channelCount = stbi__get16be(s);
14899+	if (channelCount < 0 || channelCount > 16) {
14900+		stbi__rewind(s);
14901+		return 0;
14902+	}
14903+	*y = stbi__get32be(s);
14904+	*x = stbi__get32be(s);
14905+	depth = stbi__get16be(s);
14906+	if (depth != 8 && depth != 16) {
14907+		stbi__rewind(s);
14908+		return 0;
14909+	}
14910+	if (stbi__get16be(s) != 3) {
14911+		stbi__rewind(s);
14912+		return 0;
14913+	}
14914+	*comp = 4;
14915+	return 1;
14916+}
14917+
14918+static int
14919+stbi__psd_is16(stbi__context *s)
14920+{
14921+	int channelCount, depth;
14922+	if (stbi__get32be(s) != 0x38425053) {
14923+		stbi__rewind(s);
14924+		return 0;
14925+	}
14926+	if (stbi__get16be(s) != 1) {
14927+		stbi__rewind(s);
14928+		return 0;
14929+	}
14930+	stbi__skip(s, 6);
14931+	channelCount = stbi__get16be(s);
14932+	if (channelCount < 0 || channelCount > 16) {
14933+		stbi__rewind(s);
14934+		return 0;
14935+	}
14936+	STBI_NOTUSED(stbi__get32be(s));
14937+	STBI_NOTUSED(stbi__get32be(s));
14938+	depth = stbi__get16be(s);
14939+	if (depth != 16) {
14940+		stbi__rewind(s);
14941+		return 0;
14942+	}
14943+	return 1;
14944 }
14945 #endif
14946 
14947 #ifndef STBI_NO_PIC
14948-static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp)
14949-{
14950-   int act_comp=0,num_packets=0,chained,dummy;
14951-   stbi__pic_packet packets[10];
14952-
14953-   if (!x) x = &dummy;
14954-   if (!y) y = &dummy;
14955-   if (!comp) comp = &dummy;
14956-
14957-   if (!stbi__pic_is4(s,"\x53\x80\xF6\x34")) {
14958-      stbi__rewind(s);
14959-      return 0;
14960-   }
14961-
14962-   stbi__skip(s, 88);
14963-
14964-   *x = stbi__get16be(s);
14965-   *y = stbi__get16be(s);
14966-   if (stbi__at_eof(s)) {
14967-      stbi__rewind( s);
14968-      return 0;
14969-   }
14970-   if ( (*x) != 0 && (1 << 28) / (*x) < (*y)) {
14971-      stbi__rewind( s );
14972-      return 0;
14973-   }
14974-
14975-   stbi__skip(s, 8);
14976-
14977-   do {
14978-      stbi__pic_packet *packet;
14979-
14980-      if (num_packets==sizeof(packets)/sizeof(packets[0]))
14981-         return 0;
14982-
14983-      packet = &packets[num_packets++];
14984-      chained = stbi__get8(s);
14985-      packet->size    = stbi__get8(s);
14986-      packet->type    = stbi__get8(s);
14987-      packet->channel = stbi__get8(s);
14988-      act_comp |= packet->channel;
14989-
14990-      if (stbi__at_eof(s)) {
14991-          stbi__rewind( s );
14992-          return 0;
14993-      }
14994-      if (packet->size != 8) {
14995-          stbi__rewind( s );
14996-          return 0;
14997-      }
14998-   } while (chained);
14999-
15000-   *comp = (act_comp & 0x10 ? 4 : 3);
15001-
15002-   return 1;
15003+static int
15004+stbi__pic_info(stbi__context *s, int *x, int *y, int *comp)
15005+{
15006+	int act_comp = 0, num_packets = 0, chained, dummy;
15007+	stbi__pic_packet packets[10];
15008+
15009+	if (!x) {
15010+		x = &dummy;
15011+	}
15012+	if (!y) {
15013+		y = &dummy;
15014+	}
15015+	if (!comp) {
15016+		comp = &dummy;
15017+	}
15018+
15019+	if (!stbi__pic_is4(s, "\x53\x80\xF6\x34")) {
15020+		stbi__rewind(s);
15021+		return 0;
15022+	}
15023+
15024+	stbi__skip(s, 88);
15025+
15026+	*x = stbi__get16be(s);
15027+	*y = stbi__get16be(s);
15028+	if (stbi__at_eof(s)) {
15029+		stbi__rewind(s);
15030+		return 0;
15031+	}
15032+	if ((*x) != 0 && (1 << 28) / (*x) < (*y)) {
15033+		stbi__rewind(s);
15034+		return 0;
15035+	}
15036+
15037+	stbi__skip(s, 8);
15038+
15039+	do {
15040+		stbi__pic_packet *packet;
15041+
15042+		if (num_packets == sizeof(packets) / sizeof(packets[0])) {
15043+			return 0;
15044+		}
15045+
15046+		packet = &packets[num_packets++];
15047+		chained = stbi__get8(s);
15048+		packet->size = stbi__get8(s);
15049+		packet->type = stbi__get8(s);
15050+		packet->channel = stbi__get8(s);
15051+		act_comp |= packet->channel;
15052+
15053+		if (stbi__at_eof(s)) {
15054+			stbi__rewind(s);
15055+			return 0;
15056+		}
15057+		if (packet->size != 8) {
15058+			stbi__rewind(s);
15059+			return 0;
15060+		}
15061+	} while (chained);
15062+
15063+	*comp = (act_comp & 0x10 ? 4 : 3);
15064+
15065+	return 1;
15066 }
15067 #endif
15068 
15069@@ -7491,282 +9327,369 @@ static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp)
15070 
15071 #ifndef STBI_NO_PNM
15072 
15073-static int      stbi__pnm_test(stbi__context *s)
15074-{
15075-   char p, t;
15076-   p = (char) stbi__get8(s);
15077-   t = (char) stbi__get8(s);
15078-   if (p != 'P' || (t != '5' && t != '6')) {
15079-       stbi__rewind( s );
15080-       return 0;
15081-   }
15082-   return 1;
15083-}
15084-
15085-static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
15086-{
15087-   stbi_uc *out;
15088-   STBI_NOTUSED(ri);
15089-
15090-   ri->bits_per_channel = stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y, (int *)&s->img_n);
15091-   if (ri->bits_per_channel == 0)
15092-      return 0;
15093-
15094-   if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
15095-   if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
15096-
15097-   *x = s->img_x;
15098-   *y = s->img_y;
15099-   if (comp) *comp = s->img_n;
15100-
15101-   if (!stbi__mad4sizes_valid(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0))
15102-      return stbi__errpuc("too large", "PNM too large");
15103-
15104-   out = (stbi_uc *) stbi__malloc_mad4(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0);
15105-   if (!out) return stbi__errpuc("outofmem", "Out of memory");
15106-   if (!stbi__getn(s, out, s->img_n * s->img_x * s->img_y * (ri->bits_per_channel / 8))) {
15107-      STBI_FREE(out);
15108-      return stbi__errpuc("bad PNM", "PNM file truncated");
15109-   }
15110-
15111-   if (req_comp && req_comp != s->img_n) {
15112-      if (ri->bits_per_channel == 16) {
15113-         out = (stbi_uc *) stbi__convert_format16((stbi__uint16 *) out, s->img_n, req_comp, s->img_x, s->img_y);
15114-      } else {
15115-         out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y);
15116-      }
15117-      if (out == NULL) return out; // stbi__convert_format frees input on failure
15118-   }
15119-   return out;
15120-}
15121-
15122-static int      stbi__pnm_isspace(char c)
15123-{
15124-   return c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r';
15125-}
15126-
15127-static void     stbi__pnm_skip_whitespace(stbi__context *s, char *c)
15128-{
15129-   for (;;) {
15130-      while (!stbi__at_eof(s) && stbi__pnm_isspace(*c))
15131-         *c = (char) stbi__get8(s);
15132-
15133-      if (stbi__at_eof(s) || *c != '#')
15134-         break;
15135-
15136-      while (!stbi__at_eof(s) && *c != '\n' && *c != '\r' )
15137-         *c = (char) stbi__get8(s);
15138-   }
15139-}
15140-
15141-static int      stbi__pnm_isdigit(char c)
15142-{
15143-   return c >= '0' && c <= '9';
15144-}
15145-
15146-static int      stbi__pnm_getinteger(stbi__context *s, char *c)
15147-{
15148-   int value = 0;
15149-
15150-   while (!stbi__at_eof(s) && stbi__pnm_isdigit(*c)) {
15151-      value = value*10 + (*c - '0');
15152-      *c = (char) stbi__get8(s);
15153-      if((value > 214748364) || (value == 214748364 && *c > '7'))
15154-          return stbi__err("integer parse overflow", "Parsing an integer in the PPM header overflowed a 32-bit int");
15155-   }
15156-
15157-   return value;
15158-}
15159-
15160-static int      stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp)
15161-{
15162-   int maxv, dummy;
15163-   char c, p, t;
15164-
15165-   if (!x) x = &dummy;
15166-   if (!y) y = &dummy;
15167-   if (!comp) comp = &dummy;
15168+static int
15169+stbi__pnm_test(stbi__context *s)
15170+{
15171+	char p, t;
15172+	p = (char)stbi__get8(s);
15173+	t = (char)stbi__get8(s);
15174+	if (p != 'P' || (t != '5' && t != '6')) {
15175+		stbi__rewind(s);
15176+		return 0;
15177+	}
15178+	return 1;
15179+}
15180+
15181+static void *
15182+stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp,
15183+               stbi__result_info *ri)
15184+{
15185+	stbi_uc *out;
15186+	STBI_NOTUSED(ri);
15187+
15188+	ri->bits_per_channel =
15189+	    stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y, (int *)&s->img_n);
15190+	if (ri->bits_per_channel == 0) {
15191+		return 0;
15192+	}
15193 
15194-   stbi__rewind(s);
15195+	if (s->img_y > STBI_MAX_DIMENSIONS) {
15196+		return stbi__errpuc("too large", "Very large image (corrupt?)");
15197+	}
15198+	if (s->img_x > STBI_MAX_DIMENSIONS) {
15199+		return stbi__errpuc("too large", "Very large image (corrupt?)");
15200+	}
15201+
15202+	*x = s->img_x;
15203+	*y = s->img_y;
15204+	if (comp) {
15205+		*comp = s->img_n;
15206+	}
15207+
15208+	if (!stbi__mad4sizes_valid(s->img_n, s->img_x, s->img_y,
15209+	                           ri->bits_per_channel / 8, 0)) {
15210+		return stbi__errpuc("too large", "PNM too large");
15211+	}
15212+
15213+	out = (stbi_uc *)stbi__malloc_mad4(s->img_n, s->img_x, s->img_y,
15214+	                                   ri->bits_per_channel / 8, 0);
15215+	if (!out) {
15216+		return stbi__errpuc("outofmem", "Out of memory");
15217+	}
15218+	if (!stbi__getn(s, out,
15219+	                s->img_n * s->img_x * s->img_y *
15220+	                    (ri->bits_per_channel / 8))) {
15221+		STBI_FREE(out);
15222+		return stbi__errpuc("bad PNM", "PNM file truncated");
15223+	}
15224+
15225+	if (req_comp && req_comp != s->img_n) {
15226+		if (ri->bits_per_channel == 16) {
15227+			out = (stbi_uc *)stbi__convert_format16(
15228+			    (stbi__uint16 *)out, s->img_n, req_comp, s->img_x, s->img_y);
15229+		} else {
15230+			out = stbi__convert_format(out, s->img_n, req_comp, s->img_x,
15231+			                           s->img_y);
15232+		}
15233+		if (out == NULL) {
15234+			return out; // stbi__convert_format frees input on failure
15235+		}
15236+	}
15237+	return out;
15238+}
15239+
15240+static int
15241+stbi__pnm_isspace(char c)
15242+{
15243+	return c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' ||
15244+	       c == '\r';
15245+}
15246+
15247+static void
15248+stbi__pnm_skip_whitespace(stbi__context *s, char *c)
15249+{
15250+	for (;;) {
15251+		while (!stbi__at_eof(s) && stbi__pnm_isspace(*c)) {
15252+			*c = (char)stbi__get8(s);
15253+		}
15254+
15255+		if (stbi__at_eof(s) || *c != '#') {
15256+			break;
15257+		}
15258+
15259+		while (!stbi__at_eof(s) && *c != '\n' && *c != '\r') {
15260+			*c = (char)stbi__get8(s);
15261+		}
15262+	}
15263+}
15264+
15265+static int
15266+stbi__pnm_isdigit(char c)
15267+{
15268+	return c >= '0' && c <= '9';
15269+}
15270+
15271+static int
15272+stbi__pnm_getinteger(stbi__context *s, char *c)
15273+{
15274+	int value = 0;
15275+
15276+	while (!stbi__at_eof(s) && stbi__pnm_isdigit(*c)) {
15277+		value = value * 10 + (*c - '0');
15278+		*c = (char)stbi__get8(s);
15279+		if ((value > 214748364) || (value == 214748364 && *c > '7')) {
15280+			return stbi__err(
15281+			    "integer parse overflow",
15282+			    "Parsing an integer in the PPM header overflowed a 32-bit int");
15283+		}
15284+	}
15285+
15286+	return value;
15287+}
15288+
15289+static int
15290+stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp)
15291+{
15292+	int maxv, dummy;
15293+	char c, p, t;
15294+
15295+	if (!x) {
15296+		x = &dummy;
15297+	}
15298+	if (!y) {
15299+		y = &dummy;
15300+	}
15301+	if (!comp) {
15302+		comp = &dummy;
15303+	}
15304+
15305+	stbi__rewind(s);
15306+
15307+	// Get identifier
15308+	p = (char)stbi__get8(s);
15309+	t = (char)stbi__get8(s);
15310+	if (p != 'P' || (t != '5' && t != '6')) {
15311+		stbi__rewind(s);
15312+		return 0;
15313+	}
15314+
15315+	*comp =
15316+	    (t == '6') ? 3 : 1; // '5' is 1-component .pgm; '6' is 3-component .ppm
15317+
15318+	c = (char)stbi__get8(s);
15319+	stbi__pnm_skip_whitespace(s, &c);
15320+
15321+	*x = stbi__pnm_getinteger(s, &c); // read width
15322+	if (*x == 0) {
15323+		return stbi__err("invalid width",
15324+		                 "PPM image header had zero or overflowing width");
15325+	}
15326+	stbi__pnm_skip_whitespace(s, &c);
15327 
15328-   // Get identifier
15329-   p = (char) stbi__get8(s);
15330-   t = (char) stbi__get8(s);
15331-   if (p != 'P' || (t != '5' && t != '6')) {
15332-       stbi__rewind(s);
15333-       return 0;
15334-   }
15335+	*y = stbi__pnm_getinteger(s, &c); // read height
15336+	if (*y == 0) {
15337+		return stbi__err("invalid width",
15338+		                 "PPM image header had zero or overflowing width");
15339+	}
15340+	stbi__pnm_skip_whitespace(s, &c);
15341 
15342-   *comp = (t == '6') ? 3 : 1;  // '5' is 1-component .pgm; '6' is 3-component .ppm
15343-
15344-   c = (char) stbi__get8(s);
15345-   stbi__pnm_skip_whitespace(s, &c);
15346-
15347-   *x = stbi__pnm_getinteger(s, &c); // read width
15348-   if(*x == 0)
15349-       return stbi__err("invalid width", "PPM image header had zero or overflowing width");
15350-   stbi__pnm_skip_whitespace(s, &c);
15351-
15352-   *y = stbi__pnm_getinteger(s, &c); // read height
15353-   if (*y == 0)
15354-       return stbi__err("invalid width", "PPM image header had zero or overflowing width");
15355-   stbi__pnm_skip_whitespace(s, &c);
15356-
15357-   maxv = stbi__pnm_getinteger(s, &c);  // read max value
15358-   if (maxv > 65535)
15359-      return stbi__err("max value > 65535", "PPM image supports only 8-bit and 16-bit images");
15360-   else if (maxv > 255)
15361-      return 16;
15362-   else
15363-      return 8;
15364+	maxv = stbi__pnm_getinteger(s, &c); // read max value
15365+	if (maxv > 65535) {
15366+		return stbi__err("max value > 65535",
15367+		                 "PPM image supports only 8-bit and 16-bit images");
15368+	} else if (maxv > 255) {
15369+		return 16;
15370+	} else {
15371+		return 8;
15372+	}
15373 }
15374 
15375-static int stbi__pnm_is16(stbi__context *s)
15376+static int
15377+stbi__pnm_is16(stbi__context *s)
15378 {
15379-   if (stbi__pnm_info(s, NULL, NULL, NULL) == 16)
15380-	   return 1;
15381-   return 0;
15382+	if (stbi__pnm_info(s, NULL, NULL, NULL) == 16) {
15383+		return 1;
15384+	}
15385+	return 0;
15386 }
15387 #endif
15388 
15389-static int stbi__info_main(stbi__context *s, int *x, int *y, int *comp)
15390+static int
15391+stbi__info_main(stbi__context *s, int *x, int *y, int *comp)
15392 {
15393-   #ifndef STBI_NO_JPEG
15394-   if (stbi__jpeg_info(s, x, y, comp)) return 1;
15395-   #endif
15396+#ifndef STBI_NO_JPEG
15397+	if (stbi__jpeg_info(s, x, y, comp)) {
15398+		return 1;
15399+	}
15400+#endif
15401 
15402-   #ifndef STBI_NO_PNG
15403-   if (stbi__png_info(s, x, y, comp))  return 1;
15404-   #endif
15405+#ifndef STBI_NO_PNG
15406+	if (stbi__png_info(s, x, y, comp)) {
15407+		return 1;
15408+	}
15409+#endif
15410 
15411-   #ifndef STBI_NO_GIF
15412-   if (stbi__gif_info(s, x, y, comp))  return 1;
15413-   #endif
15414+#ifndef STBI_NO_GIF
15415+	if (stbi__gif_info(s, x, y, comp)) {
15416+		return 1;
15417+	}
15418+#endif
15419 
15420-   #ifndef STBI_NO_BMP
15421-   if (stbi__bmp_info(s, x, y, comp))  return 1;
15422-   #endif
15423+#ifndef STBI_NO_BMP
15424+	if (stbi__bmp_info(s, x, y, comp)) {
15425+		return 1;
15426+	}
15427+#endif
15428 
15429-   #ifndef STBI_NO_PSD
15430-   if (stbi__psd_info(s, x, y, comp))  return 1;
15431-   #endif
15432+#ifndef STBI_NO_PSD
15433+	if (stbi__psd_info(s, x, y, comp)) {
15434+		return 1;
15435+	}
15436+#endif
15437 
15438-   #ifndef STBI_NO_PIC
15439-   if (stbi__pic_info(s, x, y, comp))  return 1;
15440-   #endif
15441+#ifndef STBI_NO_PIC
15442+	if (stbi__pic_info(s, x, y, comp)) {
15443+		return 1;
15444+	}
15445+#endif
15446 
15447-   #ifndef STBI_NO_PNM
15448-   if (stbi__pnm_info(s, x, y, comp))  return 1;
15449-   #endif
15450+#ifndef STBI_NO_PNM
15451+	if (stbi__pnm_info(s, x, y, comp)) {
15452+		return 1;
15453+	}
15454+#endif
15455 
15456-   #ifndef STBI_NO_HDR
15457-   if (stbi__hdr_info(s, x, y, comp))  return 1;
15458-   #endif
15459+#ifndef STBI_NO_HDR
15460+	if (stbi__hdr_info(s, x, y, comp)) {
15461+		return 1;
15462+	}
15463+#endif
15464 
15465-   // test tga last because it's a crappy test!
15466-   #ifndef STBI_NO_TGA
15467-   if (stbi__tga_info(s, x, y, comp))
15468-       return 1;
15469-   #endif
15470-   return stbi__err("unknown image type", "Image not of any known type, or corrupt");
15471+// test tga last because it's a crappy test!
15472+#ifndef STBI_NO_TGA
15473+	if (stbi__tga_info(s, x, y, comp)) {
15474+		return 1;
15475+	}
15476+#endif
15477+	return stbi__err("unknown image type",
15478+	                 "Image not of any known type, or corrupt");
15479 }
15480 
15481-static int stbi__is_16_main(stbi__context *s)
15482+static int
15483+stbi__is_16_main(stbi__context *s)
15484 {
15485-   #ifndef STBI_NO_PNG
15486-   if (stbi__png_is16(s))  return 1;
15487-   #endif
15488+#ifndef STBI_NO_PNG
15489+	if (stbi__png_is16(s)) {
15490+		return 1;
15491+	}
15492+#endif
15493 
15494-   #ifndef STBI_NO_PSD
15495-   if (stbi__psd_is16(s))  return 1;
15496-   #endif
15497+#ifndef STBI_NO_PSD
15498+	if (stbi__psd_is16(s)) {
15499+		return 1;
15500+	}
15501+#endif
15502 
15503-   #ifndef STBI_NO_PNM
15504-   if (stbi__pnm_is16(s))  return 1;
15505-   #endif
15506-   return 0;
15507+#ifndef STBI_NO_PNM
15508+	if (stbi__pnm_is16(s)) {
15509+		return 1;
15510+	}
15511+#endif
15512+	return 0;
15513 }
15514 
15515 #ifndef STBI_NO_STDIO
15516-STBIDEF int stbi_info(char const *filename, int *x, int *y, int *comp)
15517-{
15518-    FILE *f = stbi__fopen(filename, "rb");
15519-    int result;
15520-    if (!f) return stbi__err("can't fopen", "Unable to open file");
15521-    result = stbi_info_from_file(f, x, y, comp);
15522-    fclose(f);
15523-    return result;
15524-}
15525-
15526-STBIDEF int stbi_info_from_file(FILE *f, int *x, int *y, int *comp)
15527-{
15528-   int r;
15529-   stbi__context s;
15530-   long pos = ftell(f);
15531-   stbi__start_file(&s, f);
15532-   r = stbi__info_main(&s,x,y,comp);
15533-   fseek(f,pos,SEEK_SET);
15534-   return r;
15535-}
15536-
15537-STBIDEF int stbi_is_16_bit(char const *filename)
15538-{
15539-    FILE *f = stbi__fopen(filename, "rb");
15540-    int result;
15541-    if (!f) return stbi__err("can't fopen", "Unable to open file");
15542-    result = stbi_is_16_bit_from_file(f);
15543-    fclose(f);
15544-    return result;
15545-}
15546-
15547-STBIDEF int stbi_is_16_bit_from_file(FILE *f)
15548-{
15549-   int r;
15550-   stbi__context s;
15551-   long pos = ftell(f);
15552-   stbi__start_file(&s, f);
15553-   r = stbi__is_16_main(&s);
15554-   fseek(f,pos,SEEK_SET);
15555-   return r;
15556+STBIDEF int
15557+stbi_info(char const *filename, int *x, int *y, int *comp)
15558+{
15559+	FILE *f = stbi__fopen(filename, "rb");
15560+	int result;
15561+	if (!f) {
15562+		return stbi__err("can't fopen", "Unable to open file");
15563+	}
15564+	result = stbi_info_from_file(f, x, y, comp);
15565+	fclose(f);
15566+	return result;
15567+}
15568+
15569+STBIDEF int
15570+stbi_info_from_file(FILE *f, int *x, int *y, int *comp)
15571+{
15572+	int r;
15573+	stbi__context s;
15574+	long pos = ftell(f);
15575+	stbi__start_file(&s, f);
15576+	r = stbi__info_main(&s, x, y, comp);
15577+	fseek(f, pos, SEEK_SET);
15578+	return r;
15579+}
15580+
15581+STBIDEF int
15582+stbi_is_16_bit(char const *filename)
15583+{
15584+	FILE *f = stbi__fopen(filename, "rb");
15585+	int result;
15586+	if (!f) {
15587+		return stbi__err("can't fopen", "Unable to open file");
15588+	}
15589+	result = stbi_is_16_bit_from_file(f);
15590+	fclose(f);
15591+	return result;
15592+}
15593+
15594+STBIDEF int
15595+stbi_is_16_bit_from_file(FILE *f)
15596+{
15597+	int r;
15598+	stbi__context s;
15599+	long pos = ftell(f);
15600+	stbi__start_file(&s, f);
15601+	r = stbi__is_16_main(&s);
15602+	fseek(f, pos, SEEK_SET);
15603+	return r;
15604 }
15605 #endif // !STBI_NO_STDIO
15606 
15607-STBIDEF int stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp)
15608+STBIDEF int
15609+stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp)
15610 {
15611-   stbi__context s;
15612-   stbi__start_mem(&s,buffer,len);
15613-   return stbi__info_main(&s,x,y,comp);
15614+	stbi__context s;
15615+	stbi__start_mem(&s, buffer, len);
15616+	return stbi__info_main(&s, x, y, comp);
15617 }
15618 
15619-STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const *c, void *user, int *x, int *y, int *comp)
15620+STBIDEF int
15621+stbi_info_from_callbacks(stbi_io_callbacks const *c, void *user, int *x, int *y,
15622+                         int *comp)
15623 {
15624-   stbi__context s;
15625-   stbi__start_callbacks(&s, (stbi_io_callbacks *) c, user);
15626-   return stbi__info_main(&s,x,y,comp);
15627+	stbi__context s;
15628+	stbi__start_callbacks(&s, (stbi_io_callbacks *)c, user);
15629+	return stbi__info_main(&s, x, y, comp);
15630 }
15631 
15632-STBIDEF int stbi_is_16_bit_from_memory(stbi_uc const *buffer, int len)
15633+STBIDEF int
15634+stbi_is_16_bit_from_memory(stbi_uc const *buffer, int len)
15635 {
15636-   stbi__context s;
15637-   stbi__start_mem(&s,buffer,len);
15638-   return stbi__is_16_main(&s);
15639+	stbi__context s;
15640+	stbi__start_mem(&s, buffer, len);
15641+	return stbi__is_16_main(&s);
15642 }
15643 
15644-STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *c, void *user)
15645+STBIDEF int
15646+stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *c, void *user)
15647 {
15648-   stbi__context s;
15649-   stbi__start_callbacks(&s, (stbi_io_callbacks *) c, user);
15650-   return stbi__is_16_main(&s);
15651+	stbi__context s;
15652+	stbi__start_callbacks(&s, (stbi_io_callbacks *)c, user);
15653+	return stbi__is_16_main(&s);
15654 }
15655 
15656 #endif // STB_IMAGE_IMPLEMENTATION
15657 
15658 /*
15659    revision history:
15660-      2.20  (2019-02-07) support utf8 filenames in Windows; fix warnings and platform ifdefs
15661-      2.19  (2018-02-11) fix warning
15662-      2.18  (2018-01-30) fix warnings
15663-      2.17  (2018-01-29) change sbti__shiftsigned to avoid clang -O2 bug
15664+      2.20  (2019-02-07) support utf8 filenames in Windows; fix warnings and
15665+   platform ifdefs 2.19  (2018-02-11) fix warning 2.18  (2018-01-30) fix
15666+   warnings 2.17  (2018-01-29) change sbti__shiftsigned to avoid clang -O2 bug
15667                          1-bit BMP
15668                          *_is_16_bit api
15669                          avoid warnings
15670@@ -7781,13 +9704,11 @@ STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *c, void *user
15671                          warning fixes; disable run-time SSE detection on gcc;
15672                          uniform handling of optional "return" values;
15673                          thread-safe initialization of zlib tables
15674-      2.14  (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet JPGs
15675-      2.13  (2016-11-29) add 16-bit API, only supported for PNG right now
15676-      2.12  (2016-04-02) fix typo in 2.11 PSD fix that caused crashes
15677-      2.11  (2016-04-02) allocate large structures on the stack
15678-                         remove white matting for transparent PSD
15679-                         fix reported channel count for PNG & BMP
15680-                         re-enable SSE2 in non-gcc 64-bit
15681+      2.14  (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet
15682+   JPGs 2.13  (2016-11-29) add 16-bit API, only supported for PNG right now 2.12
15683+   (2016-04-02) fix typo in 2.11 PSD fix that caused crashes 2.11  (2016-04-02)
15684+   allocate large structures on the stack remove white matting for transparent
15685+   PSD fix reported channel count for PNG & BMP re-enable SSE2 in non-gcc 64-bit
15686                          support RGB-formatted JPEG
15687                          read 16-bit PNGs (only as 8-bit)
15688       2.10  (2016-01-22) avoid warning introduced in 2.09 by STBI_REALLOC_SIZED
15689@@ -7795,11 +9716,9 @@ STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *c, void *user
15690                          16-bit-per-pixel TGA (not bit-per-component)
15691                          info() for TGA could break due to .hdr handling
15692                          info() for BMP to shares code instead of sloppy parse
15693-                         can use STBI_REALLOC_SIZED if allocator doesn't support realloc
15694-                         code cleanup
15695-      2.08  (2015-09-13) fix to 2.07 cleanup, reading RGB PSD as RGBA
15696-      2.07  (2015-09-13) fix compiler warnings
15697-                         partial animated GIF support
15698+                         can use STBI_REALLOC_SIZED if allocator doesn't support
15699+   realloc code cleanup 2.08  (2015-09-13) fix to 2.07 cleanup, reading RGB PSD
15700+   as RGBA 2.07  (2015-09-13) fix compiler warnings partial animated GIF support
15701                          limited 16-bpc PSD support
15702                          #ifdef unused functions
15703                          bug with < 92 byte PIC,PNM,HDR,TGA
15704@@ -7810,23 +9729,18 @@ STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *c, void *user
15705                          stbi_set_flip_vertically_on_load (nguillemot)
15706                          fix NEON support; fix mingw support
15707       2.02  (2015-01-19) fix incorrect assert, fix warning
15708-      2.01  (2015-01-17) fix various warnings; suppress SIMD on gcc 32-bit without -msse2
15709-      2.00b (2014-12-25) fix STBI_MALLOC in progressive JPEG
15710-      2.00  (2014-12-25) optimize JPG, including x86 SSE2 & NEON SIMD (ryg)
15711-                         progressive JPEG (stb)
15712-                         PGM/PPM support (Ken Miller)
15713-                         STBI_MALLOC,STBI_REALLOC,STBI_FREE
15714+      2.01  (2015-01-17) fix various warnings; suppress SIMD on gcc 32-bit
15715+   without -msse2 2.00b (2014-12-25) fix STBI_MALLOC in progressive JPEG 2.00
15716+   (2014-12-25) optimize JPG, including x86 SSE2 & NEON SIMD (ryg) progressive
15717+   JPEG (stb) PGM/PPM support (Ken Miller) STBI_MALLOC,STBI_REALLOC,STBI_FREE
15718                          GIF bugfix -- seemingly never worked
15719                          STBI_NO_*, STBI_ONLY_*
15720       1.48  (2014-12-14) fix incorrectly-named assert()
15721-      1.47  (2014-12-14) 1/2/4-bit PNG support, both direct and paletted (Omar Cornut & stb)
15722-                         optimize PNG (ryg)
15723-                         fix bug in interlaced PNG with user-specified channel count (stb)
15724-      1.46  (2014-08-26)
15725-              fix broken tRNS chunk (colorkey-style transparency) in non-paletted PNG
15726-      1.45  (2014-08-16)
15727-              fix MSVC-ARM internal compiler error by wrapping malloc
15728-      1.44  (2014-08-07)
15729+      1.47  (2014-12-14) 1/2/4-bit PNG support, both direct and paletted (Omar
15730+   Cornut & stb) optimize PNG (ryg) fix bug in interlaced PNG with
15731+   user-specified channel count (stb) 1.46  (2014-08-26) fix broken tRNS chunk
15732+   (colorkey-style transparency) in non-paletted PNG 1.45  (2014-08-16) fix
15733+   MSVC-ARM internal compiler error by wrapping malloc 1.44  (2014-08-07)
15734               various warning fixes from Ronny Chevalier
15735       1.43  (2014-07-15)
15736               fix MSVC-only compiler problem in code changed in 1.42
15737@@ -7835,73 +9749,48 @@ STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *c, void *user
15738               fixes to stbi__cleanup_jpeg path
15739               added STBI_ASSERT to avoid requiring assert.h
15740       1.41  (2014-06-25)
15741-              fix search&replace from 1.36 that messed up comments/error messages
15742-      1.40  (2014-06-22)
15743-              fix gcc struct-initialization warning
15744-      1.39  (2014-06-15)
15745-              fix to TGA optimization when req_comp != number of components in TGA;
15746-              fix to GIF loading because BMP wasn't rewinding (whoops, no GIFs in my test suite)
15747-              add support for BMP version 5 (more ignored fields)
15748-      1.38  (2014-06-06)
15749-              suppress MSVC warnings on integer casts truncating values
15750-              fix accidental rename of 'skip' field of I/O
15751-      1.37  (2014-06-04)
15752-              remove duplicate typedef
15753-      1.36  (2014-06-03)
15754-              convert to header file single-file library
15755-              if de-iphone isn't set, load iphone images color-swapped instead of returning NULL
15756-      1.35  (2014-05-27)
15757-              various warnings
15758-              fix broken STBI_SIMD path
15759-              fix bug where stbi_load_from_file no longer left file pointer in correct place
15760-              fix broken non-easy path for 32-bit BMP (possibly never used)
15761-              TGA optimization by Arseny Kapoulkine
15762-      1.34  (unknown)
15763-              use STBI_NOTUSED in stbi__resample_row_generic(), fix one more leak in tga failure case
15764-      1.33  (2011-07-14)
15765-              make stbi_is_hdr work in STBI_NO_HDR (as specified), minor compiler-friendly improvements
15766-      1.32  (2011-07-13)
15767-              support for "info" function for all supported filetypes (SpartanJ)
15768-      1.31  (2011-06-20)
15769-              a few more leak fixes, bug in PNG handling (SpartanJ)
15770-      1.30  (2011-06-11)
15771-              added ability to load files via callbacks to accomidate custom input streams (Ben Wenger)
15772+              fix search&replace from 1.36 that messed up comments/error
15773+   messages 1.40  (2014-06-22) fix gcc struct-initialization warning 1.39
15774+   (2014-06-15) fix to TGA optimization when req_comp != number of components in
15775+   TGA; fix to GIF loading because BMP wasn't rewinding (whoops, no GIFs in my
15776+   test suite) add support for BMP version 5 (more ignored fields) 1.38
15777+   (2014-06-06) suppress MSVC warnings on integer casts truncating values fix
15778+   accidental rename of 'skip' field of I/O 1.37  (2014-06-04) remove duplicate
15779+   typedef 1.36  (2014-06-03) convert to header file single-file library if
15780+   de-iphone isn't set, load iphone images color-swapped instead of returning
15781+   NULL 1.35  (2014-05-27) various warnings fix broken STBI_SIMD path fix bug
15782+   where stbi_load_from_file no longer left file pointer in correct place fix
15783+   broken non-easy path for 32-bit BMP (possibly never used) TGA optimization by
15784+   Arseny Kapoulkine 1.34  (unknown) use STBI_NOTUSED in
15785+   stbi__resample_row_generic(), fix one more leak in tga failure case 1.33
15786+   (2011-07-14) make stbi_is_hdr work in STBI_NO_HDR (as specified), minor
15787+   compiler-friendly improvements 1.32  (2011-07-13) support for "info" function
15788+   for all supported filetypes (SpartanJ) 1.31  (2011-06-20) a few more leak
15789+   fixes, bug in PNG handling (SpartanJ) 1.30  (2011-06-11) added ability to
15790+   load files via callbacks to accomidate custom input streams (Ben Wenger)
15791               removed deprecated format-specific test/load functions
15792-              removed support for installable file formats (stbi_loader) -- would have been broken for IO callbacks anyway
15793-              error cases in bmp and tga give messages and don't leak (Raymond Barbiero, grisha)
15794-              fix inefficiency in decoding 32-bit BMP (David Woo)
15795-      1.29  (2010-08-16)
15796-              various warning fixes from Aurelien Pocheville
15797-      1.28  (2010-08-01)
15798-              fix bug in GIF palette transparency (SpartanJ)
15799-      1.27  (2010-08-01)
15800-              cast-to-stbi_uc to fix warnings
15801-      1.26  (2010-07-24)
15802-              fix bug in file buffering for PNG reported by SpartanJ
15803-      1.25  (2010-07-17)
15804-              refix trans_data warning (Won Chun)
15805-      1.24  (2010-07-12)
15806-              perf improvements reading from files on platforms with lock-heavy fgetc()
15807-              minor perf improvements for jpeg
15808-              deprecated type-specific functions so we'll get feedback if they're needed
15809-              attempt to fix trans_data warning (Won Chun)
15810-      1.23    fixed bug in iPhone support
15811-      1.22  (2010-07-10)
15812-              removed image *writing* support
15813-              stbi_info support from Jetro Lauha
15814-              GIF support from Jean-Marc Lienher
15815+              removed support for installable file formats (stbi_loader) --
15816+   would have been broken for IO callbacks anyway error cases in bmp and tga
15817+   give messages and don't leak (Raymond Barbiero, grisha) fix inefficiency in
15818+   decoding 32-bit BMP (David Woo) 1.29  (2010-08-16) various warning fixes from
15819+   Aurelien Pocheville 1.28  (2010-08-01) fix bug in GIF palette transparency
15820+   (SpartanJ) 1.27  (2010-08-01) cast-to-stbi_uc to fix warnings 1.26
15821+   (2010-07-24) fix bug in file buffering for PNG reported by SpartanJ 1.25
15822+   (2010-07-17) refix trans_data warning (Won Chun) 1.24  (2010-07-12) perf
15823+   improvements reading from files on platforms with lock-heavy fgetc() minor
15824+   perf improvements for jpeg deprecated type-specific functions so we'll get
15825+   feedback if they're needed attempt to fix trans_data warning (Won Chun) 1.23
15826+   fixed bug in iPhone support 1.22  (2010-07-10) removed image *writing*
15827+   support stbi_info support from Jetro Lauha GIF support from Jean-Marc Lienher
15828               iPhone PNG-extensions from James Brown
15829-              warning-fixes from Nicolas Schulz and Janez Zemva (i.stbi__err. Janez (U+017D)emva)
15830-      1.21    fix use of 'stbi_uc' in header (reported by jon blow)
15831-      1.20    added support for Softimage PIC, by Tom Seddon
15832-      1.19    bug in interlaced PNG corruption check (found by ryg)
15833-      1.18  (2008-08-02)
15834-              fix a threading bug (local mutable static)
15835-      1.17    support interlaced PNG
15836-      1.16    major bugfix - stbi__convert_format converted one too many pixels
15837-      1.15    initialize some fields for thread safety
15838-      1.14    fix threadsafe conversion bug
15839-              header-file-only version (#define STBI_HEADER_FILE_ONLY before including)
15840+              warning-fixes from Nicolas Schulz and Janez Zemva (i.stbi__err.
15841+   Janez (U+017D)emva) 1.21    fix use of 'stbi_uc' in header (reported by jon
15842+   blow) 1.20    added support for Softimage PIC, by Tom Seddon 1.19    bug in
15843+   interlaced PNG corruption check (found by ryg) 1.18  (2008-08-02) fix a
15844+   threading bug (local mutable static) 1.17    support interlaced PNG 1.16
15845+   major bugfix - stbi__convert_format converted one too many pixels 1.15
15846+   initialize some fields for thread safety 1.14    fix threadsafe conversion
15847+   bug header-file-only version (#define STBI_HEADER_FILE_ONLY before including)
15848       1.13    threadsafe
15849       1.12    const qualifiers in the API
15850       1.11    Support installable IDCT, colorspace conversion routines
15851@@ -7911,15 +9800,14 @@ STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *c, void *user
15852       1.08    Thatcher Ulrich's PSD code integrated by Nicolas Schulz
15853       1.07    attempt to fix C++ warning/errors again
15854       1.06    attempt to fix C++ warning/errors again
15855-      1.05    fix TGA loading to return correct *comp and use good luminance calc
15856-      1.04    default float alpha is 1, not 255; use 'void *' for stbi_image_free
15857-      1.03    bugfixes to STBI_NO_STDIO, STBI_NO_HDR
15858-      1.02    support for (subset of) HDR files, float interface for preferred access to them
15859-      1.01    fix bug: possible bug in handling right-side up bmps... not sure
15860-              fix bug: the stbi__bmp_load() and stbi__tga_load() functions didn't work at all
15861-      1.00    interface to zlib that skips zlib header
15862-      0.99    correct handling of alpha in palette
15863-      0.98    TGA loader by lonesock; dynamically add loaders (untested)
15864+      1.05    fix TGA loading to return correct *comp and use good luminance
15865+   calc 1.04    default float alpha is 1, not 255; use 'void *' for
15866+   stbi_image_free 1.03    bugfixes to STBI_NO_STDIO, STBI_NO_HDR 1.02 support
15867+   for (subset of) HDR files, float interface for preferred access to them 1.01
15868+   fix bug: possible bug in handling right-side up bmps... not sure fix bug: the
15869+   stbi__bmp_load() and stbi__tga_load() functions didn't work at all 1.00
15870+   interface to zlib that skips zlib header 0.99    correct handling of alpha in
15871+   palette 0.98    TGA loader by lonesock; dynamically add loaders (untested)
15872       0.97    jpeg errors on too large a file; also catch another malloc failure
15873       0.96    fix detection of invalid v value - particleman@mollyrocket forum
15874       0.95    during header scan, seek to markers in case of padding
15875@@ -7932,8 +9820,8 @@ STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *c, void *user
15876       0.60    fix compiling as c++
15877       0.59    fix warnings: merge Dave Moore's -Wall fixes
15878       0.58    fix bug: zlib uncompressed mode len/nlen was wrong endian
15879-      0.57    fix bug: jpg last huffman symbol before marker was >9 bits but less than 16 available
15880-      0.56    fix bug: zlib uncompressed mode len vs. nlen
15881+      0.57    fix bug: jpg last huffman symbol before marker was >9 bits but
15882+   less than 16 available 0.56    fix bug: zlib uncompressed mode len vs. nlen
15883       0.55    fix bug: restart_interval not initialized to 0
15884       0.54    allow NULL for 'int *comp'
15885       0.53    fix bug in png 3->4; speedup png decoding
15886@@ -7944,7 +9832,6 @@ STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *c, void *user
15887               first released version
15888 */
15889 
15890-
15891 /*
15892 ------------------------------------------------------------------------------
15893 This software is available under 2 licenses -- choose whichever you prefer.
+11837, -9229
    1@@ -3,8 +3,8 @@
    2    by Jeff Roberts (v2) and Jorge L Rodriguez
    3    http://github.com/nothings/stb
    4 
    5-   Can be threaded with the extended API. SSE2, AVX, Neon and WASM SIMD support. Only
    6-   scaling and translation is supported, no rotations or shears.
    7+   Can be threaded with the extended API. SSE2, AVX, Neon and WASM SIMD support.
    8+   Only scaling and translation is supported, no rotations or shears.
    9 
   10    COMPILING & LINKING
   11       In one C/C++ file that #includes this file, do this:
   12@@ -12,34 +12,37 @@
   13       before the #include. That will create the implementation in that file.
   14 
   15    EASY API CALLS:
   16-     Easy API downsamples w/Mitchell filter, upsamples w/cubic interpolation, clamps to edge.
   17+     Easy API downsamples w/Mitchell filter, upsamples w/cubic interpolation,
   18+   clamps to edge.
   19 
   20-     stbir_resize_uint8_srgb( input_pixels,  input_w,  input_h,  input_stride_in_bytes,
   21-                              output_pixels, output_w, output_h, output_stride_in_bytes,
   22-                              pixel_layout_enum )
   23+     stbir_resize_uint8_srgb( input_pixels,  input_w,  input_h,
   24+   input_stride_in_bytes, output_pixels, output_w, output_h,
   25+   output_stride_in_bytes, pixel_layout_enum )
   26 
   27-     stbir_resize_uint8_linear( input_pixels,  input_w,  input_h,  input_stride_in_bytes,
   28-                                output_pixels, output_w, output_h, output_stride_in_bytes,
   29-                                pixel_layout_enum )
   30+     stbir_resize_uint8_linear( input_pixels,  input_w,  input_h,
   31+   input_stride_in_bytes, output_pixels, output_w, output_h,
   32+   output_stride_in_bytes, pixel_layout_enum )
   33 
   34-     stbir_resize_float_linear( input_pixels,  input_w,  input_h,  input_stride_in_bytes,
   35-                                output_pixels, output_w, output_h, output_stride_in_bytes,
   36-                                pixel_layout_enum )
   37+     stbir_resize_float_linear( input_pixels,  input_w,  input_h,
   38+   input_stride_in_bytes, output_pixels, output_w, output_h,
   39+   output_stride_in_bytes, pixel_layout_enum )
   40 
   41-     If you pass NULL or zero for the output_pixels, we will allocate the output buffer
   42-     for you and return it from the function (free with free() or STBIR_FREE).
   43-     As a special case, XX_stride_in_bytes of 0 means packed continuously in memory.
   44+     If you pass NULL or zero for the output_pixels, we will allocate the output
   45+   buffer for you and return it from the function (free with free() or
   46+   STBIR_FREE). As a special case, XX_stride_in_bytes of 0 means packed
   47+   continuously in memory.
   48 
   49    API LEVELS
   50-      There are three levels of API - easy-to-use, medium-complexity and extended-complexity.
   51+      There are three levels of API - easy-to-use, medium-complexity and
   52+   extended-complexity.
   53 
   54       See the "header file" section of the source for API documentation.
   55 
   56    ADDITIONAL DOCUMENTATION
   57 
   58       MEMORY ALLOCATION
   59-         By default, we use malloc and free for memory allocation.  To override the
   60-         memory allocation, before the implementation #include, add a:
   61+         By default, we use malloc and free for memory allocation.  To override
   62+   the memory allocation, before the implementation #include, add a:
   63 
   64             #define STBIR_MALLOC(size,user_data) ...
   65             #define STBIR_FREE(ptr,user_data)   ...
   66@@ -51,79 +54,81 @@
   67       PERFORMANCE
   68          This library was written with an emphasis on performance. When testing
   69          stb_image_resize with RGBA, the fastest mode is STBIR_4CHANNEL with
   70-         STBIR_TYPE_UINT8 pixels and CLAMPed edges (which is what many other resize
   71-         libs do by default). Also, make sure SIMD is turned on of course (default
   72-         for 64-bit targets). Avoid WRAP edge mode if you want the fastest speed.
   73+         STBIR_TYPE_UINT8 pixels and CLAMPed edges (which is what many other
   74+   resize libs do by default). Also, make sure SIMD is turned on of course
   75+   (default for 64-bit targets). Avoid WRAP edge mode if you want the fastest
   76+   speed.
   77 
   78-         This library also comes with profiling built-in. If you define STBIR_PROFILE,
   79-         you can use the advanced API and get low-level profiling information by
   80-         calling stbir_resize_extended_profile_info() or stbir_resize_split_profile_info()
   81-         after a resize.
   82+         This library also comes with profiling built-in. If you define
   83+   STBIR_PROFILE, you can use the advanced API and get low-level profiling
   84+   information by calling stbir_resize_extended_profile_info() or
   85+   stbir_resize_split_profile_info() after a resize.
   86 
   87       SIMD
   88          Most of the routines have optimized SSE2, AVX, NEON and WASM versions.
   89 
   90-         On Microsoft compilers, we automatically turn on SIMD for 64-bit x64 and
   91-         ARM; for 32-bit x86 and ARM, you select SIMD mode by defining STBIR_SSE2 or
   92-         STBIR_NEON. For AVX and AVX2, we auto-select it by detecting the /arch:AVX
   93-         or /arch:AVX2 switches. You can also always manually turn SSE2, AVX or AVX2
   94-         support on by defining STBIR_SSE2, STBIR_AVX or STBIR_AVX2.
   95-
   96-         On Linux, SSE2 and Neon is on by default for 64-bit x64 or ARM64. For 32-bit,
   97-         we select x86 SIMD mode by whether you have -msse2, -mavx or -mavx2 enabled
   98-         on the command line. For 32-bit ARM, you must pass -mfpu=neon-vfpv4 for both
   99-         clang and GCC, but GCC also requires an additional -mfp16-format=ieee to
  100-         automatically enable NEON.
  101-
  102-         On x86 platforms, you can also define STBIR_FP16C to turn on FP16C instructions
  103-         for converting back and forth to half-floats. This is autoselected when we
  104-         are using AVX2. Clang and GCC also require the -mf16c switch. ARM always uses
  105-         the built-in half float hardware NEON instructions.
  106-
  107-         You can also tell us to use multiply-add instructions with STBIR_USE_FMA.
  108-         Because x86 doesn't always have fma, we turn it off by default to maintain
  109-         determinism across all platforms. If you don't care about non-FMA determinism
  110-         and are willing to restrict yourself to more recent x86 CPUs (around the AVX
  111-         timeframe), then fma will give you around a 15% speedup.
  112-
  113-         You can force off SIMD in all cases by defining STBIR_NO_SIMD. You can turn
  114-         off AVX or AVX2 specifically with STBIR_NO_AVX or STBIR_NO_AVX2. AVX is 10%
  115-         to 40% faster, and AVX2 is generally another 12%.
  116+         On Microsoft compilers, we automatically turn on SIMD for 64-bit x64
  117+   and ARM; for 32-bit x86 and ARM, you select SIMD mode by defining STBIR_SSE2
  118+   or STBIR_NEON. For AVX and AVX2, we auto-select it by detecting the /arch:AVX
  119+         or /arch:AVX2 switches. You can also always manually turn SSE2, AVX or
  120+   AVX2 support on by defining STBIR_SSE2, STBIR_AVX or STBIR_AVX2.
  121+
  122+         On Linux, SSE2 and Neon is on by default for 64-bit x64 or ARM64. For
  123+   32-bit, we select x86 SIMD mode by whether you have -msse2, -mavx or -mavx2
  124+   enabled on the command line. For 32-bit ARM, you must pass -mfpu=neon-vfpv4
  125+   for both clang and GCC, but GCC also requires an additional
  126+   -mfp16-format=ieee to automatically enable NEON.
  127+
  128+         On x86 platforms, you can also define STBIR_FP16C to turn on FP16C
  129+   instructions for converting back and forth to half-floats. This is
  130+   autoselected when we are using AVX2. Clang and GCC also require the -mf16c
  131+   switch. ARM always uses the built-in half float hardware NEON instructions.
  132+
  133+         You can also tell us to use multiply-add instructions with
  134+   STBIR_USE_FMA. Because x86 doesn't always have fma, we turn it off by default
  135+   to maintain determinism across all platforms. If you don't care about non-FMA
  136+   determinism and are willing to restrict yourself to more recent x86 CPUs
  137+   (around the AVX timeframe), then fma will give you around a 15% speedup.
  138+
  139+         You can force off SIMD in all cases by defining STBIR_NO_SIMD. You can
  140+   turn off AVX or AVX2 specifically with STBIR_NO_AVX or STBIR_NO_AVX2. AVX is
  141+   10% to 40% faster, and AVX2 is generally another 12%.
  142 
  143       ALPHA CHANNEL
  144-         Most of the resizing functions provide the ability to control how the alpha
  145-         channel of an image is processed.
  146+         Most of the resizing functions provide the ability to control how the
  147+   alpha channel of an image is processed.
  148 
  149          When alpha represents transparency, it is important that when combining
  150          colors with filtering, the pixels should not be treated equally; they
  151          should use a weighted average based on their alpha values. For example,
  152          if a pixel is 1% opaque bright green and another pixel is 99% opaque
  153          black and you average them, the average will be 50% opaque, but the
  154-         unweighted average and will be a middling green color, while the weighted
  155-         average will be nearly black. This means the unweighted version introduced
  156-         green energy that didn't exist in the source image.
  157+         unweighted average will be a middling green color, while the
  158+   weighted average will be nearly black. This means the unweighted version
  159+   introduced green energy that didn't exist in the source image.
  160 
  161-         (If you want to know why this makes sense, you can work out the math for
  162-         the following: consider what happens if you alpha composite a source image
  163-         over a fixed color and then average the output, vs. if you average the
  164+         (If you want to know why this makes sense, you can work out the math
  165+   for the following: consider what happens if you alpha composite a source
  166+   image over a fixed color and then average the output, vs. if you average the
  167          source image pixels and then composite that over the same fixed color.
  168          Only the weighted average produces the same result as the ground truth
  169          composite-then-average result.)
  170 
  171-         Therefore, it is in general best to "alpha weight" the pixels when applying
  172-         filters to them. This essentially means multiplying the colors by the alpha
  173-         values before combining them, and then dividing by the alpha value at the
  174-         end.
  175-
  176-         The computer graphics industry introduced a technique called "premultiplied
  177-         alpha" or "associated alpha" in which image colors are stored in image files
  178-         already multiplied by their alpha. This saves some math when compositing,
  179-         and also avoids the need to divide by the alpha at the end (which is quite
  180-         inefficient). However, while premultiplied alpha is common in the movie CGI
  181-         industry, it is not commonplace in other industries like videogames, and most
  182-         consumer file formats are generally expected to contain not-premultiplied
  183-         colors. For example, Photoshop saves PNG files "unpremultiplied", and web
  184-         browsers like Chrome and Firefox expect PNG images to be unpremultiplied.
  185+         Therefore, it is in general best to "alpha weight" the pixels when
  186+   applying filters to them. This essentially means multiplying the colors by
  187+   the alpha values before combining them, and then dividing by the alpha value
  188+   at the end.
  189+
  190+         The computer graphics industry introduced a technique called
  191+   "premultiplied alpha" or "associated alpha" in which image colors are stored
  192+   in image files already multiplied by their alpha. This saves some math when
  193+   compositing, and also avoids the need to divide by the alpha at the end
  194+   (which is quite inefficient). However, while premultiplied alpha is common in
  195+   the movie CGI industry, it is not commonplace in other industries like
  196+   videogames, and most consumer file formats are generally expected to contain
  197+   not-premultiplied colors. For example, Photoshop saves PNG files
  198+   "unpremultiplied", and web browsers like Chrome and Firefox expect PNG images
  199+   to be unpremultiplied.
  200 
  201          Note that there are three possibilities that might describe your image
  202          and resize expectation:
  203@@ -132,100 +137,101 @@
  204              2. images are not premultiplied, alpha weighting is not desired
  205              3. images are premultiplied
  206 
  207-         Both case #2 and case #3 require the exact same math: no alpha weighting
  208-         should be applied or removed. Only case 1 requires extra math operations;
  209-         the other two cases can be handled identically.
  210+         Both case #2 and case #3 require the exact same math: no alpha
  211+   weighting should be applied or removed. Only case 1 requires extra math
  212+   operations; the other two cases can be handled identically.
  213 
  214-         stb_image_resize expects case #1 by default, applying alpha weighting to
  215-         images, expecting the input images to be unpremultiplied. This is what the
  216+         stb_image_resize expects case #1 by default, applying alpha weighting
  217+   to images, expecting the input images to be unpremultiplied. This is what the
  218          COLOR+ALPHA buffer types tell the resizer to do.
  219 
  220          When you use the pixel layouts STBIR_RGBA, STBIR_BGRA, STBIR_ARGB,
  221-         STBIR_ABGR, STBIR_RX, or STBIR_XR you are telling us that the pixels are
  222-         non-premultiplied. In these cases, the resizer will alpha weight the colors
  223-         (effectively creating the premultiplied image), do the filtering, and then
  224-         convert back to non-premult on exit.
  225-
  226-         When you use the pixel layouts STBIR_RGBA_PM, STBIR_RGBA_PM, STBIR_RGBA_PM,
  227-         STBIR_RGBA_PM, STBIR_RX_PM or STBIR_XR_PM, you are telling that the pixels
  228-         ARE premultiplied. In this case, the resizer doesn't have to do the
  229-         premultipling - it can filter directly on the input. This about twice as
  230-         fast as the non-premultiplied case, so it's the right option if your data is
  231-         already setup correctly.
  232+         STBIR_ABGR, STBIR_RX, or STBIR_XR you are telling us that the pixels
  233+   are non-premultiplied. In these cases, the resizer will alpha weight the
  234+   colors (effectively creating the premultiplied image), do the filtering, and
  235+   then convert back to non-premult on exit.
  236+
  237+         When you use the pixel layouts STBIR_RGBA_PM, STBIR_RGBA_PM,
  238+   STBIR_RGBA_PM, STBIR_RGBA_PM, STBIR_RX_PM or STBIR_XR_PM, you are telling
  239+   that the pixels ARE premultiplied. In this case, the resizer doesn't have to
  240+   do the premultiplying - it can filter directly on the input. This is about twice
  241+   as fast as the non-premultiplied case, so it's the right option if your data
  242+   is already setup correctly.
  243 
  244          When you use the pixel layout STBIR_4CHANNEL or STBIR_2CHANNEL, you are
  245-         telling us that there is no channel that represents transparency; it may be
  246-         RGB and some unrelated fourth channel that has been stored in the alpha
  247-         channel, but it is actually not alpha. No special processing will be
  248+         telling us that there is no channel that represents transparency; it
  249+   may be RGB and some unrelated fourth channel that has been stored in the
  250+   alpha channel, but it is actually not alpha. No special processing will be
  251          performed.
  252 
  253          The difference between the generic 4 or 2 channel layouts, and the
  254-         specialized _PM versions is with the _PM versions you are telling us that
  255-         the data *is* alpha, just don't premultiply it. That's important when
  256+         specialized _PM versions is with the _PM versions you are telling us
  257+   that the data *is* alpha, just don't premultiply it. That's important when
  258          using SRGB pixel formats, we need to know where the alpha is, because
  259          it is converted linearly (rather than with the SRGB converters).
  260 
  261          Because alpha weighting produces the same effect as premultiplying, you
  262          even have the option with non-premultiplied inputs to let the resizer
  263-         produce a premultiplied output. Because the intially computed alpha-weighted
  264-         output image is effectively premultiplied, this is actually more performant
  265-         than the normal path which un-premultiplies the output image as a final step.
  266+         produce a premultiplied output. Because the initially computed
  267+   alpha-weighted output image is effectively premultiplied, this is actually
  268+   more performant than the normal path which un-premultiplies the output image
  269+   as a final step.
  270 
  271-         Finally, when converting both in and out of non-premulitplied space (for
  272-         example, when using STBIR_RGBA), we go to somewhat heroic measures to
  273+         Finally, when converting both in and out of non-premultiplied space
  274+   (for example, when using STBIR_RGBA), we go to somewhat heroic measures to
  275          ensure that areas with zero alpha value pixels get something reasonable
  276          in the RGB values. If you don't care about the RGB values of zero alpha
  277          pixels, you can call the stbir_set_non_pm_alpha_speed_over_quality()
  278-         function - this runs a premultiplied resize about 25% faster. That said,
  279-         when you really care about speed, using premultiplied pixels for both in
  280-         and out (STBIR_RGBA_PM, etc) much faster than both of these premultiplied
  281+         function - this runs a premultiplied resize about 25% faster. That
  282+   said, when you really care about speed, using premultiplied pixels for both
  283+   in and out (STBIR_RGBA_PM, etc) is much faster than both of these premultiplied
  284          options.
  285 
  286       PIXEL LAYOUT CONVERSION
  287-         The resizer can convert from some pixel layouts to others. When using the
  288-         stbir_set_pixel_layouts(), you can, for example, specify STBIR_RGBA
  289-         on input, and STBIR_ARGB on output, and it will re-organize the channels
  290-         during the resize. Currently, you can only convert between two pixel
  291-         layouts with the same number of channels.
  292+         The resizer can convert from some pixel layouts to others. When using
  293+   the stbir_set_pixel_layouts(), you can, for example, specify STBIR_RGBA on
  294+   input, and STBIR_ARGB on output, and it will re-organize the channels during
  295+   the resize. Currently, you can only convert between two pixel layouts with
  296+   the same number of channels.
  297 
  298       DETERMINISM
  299-         We commit to being deterministic (from x64 to ARM to scalar to SIMD, etc).
  300-         This requires compiling with fast-math off (using at least /fp:precise).
  301-         Also, you must turn off fp-contracting (which turns mult+adds into fmas)!
  302-         We attempt to do this with pragmas, but with Clang, you usually want to add
  303-         -ffp-contract=off to the command line as well.
  304-
  305-         For 32-bit x86, you must use SSE and SSE2 codegen for determinism. That is,
  306-         if the scalar x87 unit gets used at all, we immediately lose determinism.
  307-         On Microsoft Visual Studio 2008 and earlier, from what we can tell there is
  308-         no way to be deterministic in 32-bit x86 (some x87 always leaks in, even
  309-         with fp:strict). On 32-bit x86 GCC, determinism requires both -msse2 and
  310+         We commit to being deterministic (from x64 to ARM to scalar to SIMD,
  311+   etc). This requires compiling with fast-math off (using at least
  312+   /fp:precise). Also, you must turn off fp-contracting (which turns mult+adds
  313+   into fmas)! We attempt to do this with pragmas, but with Clang, you usually
  314+   want to add -ffp-contract=off to the command line as well.
  315+
  316+         For 32-bit x86, you must use SSE and SSE2 codegen for determinism. That
  317+   is, if the scalar x87 unit gets used at all, we immediately lose determinism.
  318+         On Microsoft Visual Studio 2008 and earlier, from what we can tell
  319+   there is no way to be deterministic in 32-bit x86 (some x87 always leaks in,
  320+   even with fp:strict). On 32-bit x86 GCC, determinism requires both -msse2 and
  321          -fpmath=sse.
  322 
  323-         Note that we will not be deterministic with float data containing NaNs -
  324-         the NaNs will propagate differently on different SIMD and platforms.
  325+         Note that we will not be deterministic with float data containing NaNs
  326+   - the NaNs will propagate differently on different SIMD and platforms.
  327 
  328          If you turn on STBIR_USE_FMA, then we will be deterministic with other
  329-         fma targets, but we will differ from non-fma targets (this is unavoidable,
  330-         because a fma isn't simply an add with a mult - it also introduces a
  331-         rounding difference compared to non-fma instruction sequences.
  332+         fma targets, but we will differ from non-fma targets (this is
  333+   unavoidable, because a fma isn't simply an add with a mult - it also
  334+   introduces a rounding difference compared to non-fma instruction sequences).
  335 
  336       FLOAT PIXEL FORMAT RANGE
  337-         Any range of values can be used for the non-alpha float data that you pass
  338-         in (0 to 1, -1 to 1, whatever). However, if you are inputting float values
  339-         but *outputting* bytes or shorts, you must use a range of 0 to 1 so that we
  340-         scale back properly. The alpha channel must also be 0 to 1 for any format
  341-         that does premultiplication prior to resizing.
  342+         Any range of values can be used for the non-alpha float data that you
  343+   pass in (0 to 1, -1 to 1, whatever). However, if you are inputting float
  344+   values but *outputting* bytes or shorts, you must use a range of 0 to 1 so
  345+   that we scale back properly. The alpha channel must also be 0 to 1 for any
  346+   format that does premultiplication prior to resizing.
  347 
  348-         Note also that with float output, using filters with negative lobes, the
  349-         output filtered values might go slightly out of range. You can define
  350-         STBIR_FLOAT_LOW_CLAMP and/or STBIR_FLOAT_HIGH_CLAMP to specify the range
  351-         to clamp to on output, if that's important.
  352+         Note also that with float output, using filters with negative lobes,
  353+   the output filtered values might go slightly out of range. You can define
  354+         STBIR_FLOAT_LOW_CLAMP and/or STBIR_FLOAT_HIGH_CLAMP to specify the
  355+   range to clamp to on output, if that's important.
  356 
  357       MAX/MIN SCALE FACTORS
  358-         The input pixel resolutions are in integers, and we do the internal pointer
  359-         resolution in size_t sized integers. However, the scale ratio from input
  360-         resolution to output resolution is calculated in float form. This means
  361+         The input pixel resolutions are in integers, and we do the internal
  362+   pointer resolution in size_t sized integers. However, the scale ratio from
  363+   input resolution to output resolution is calculated in float form. This means
  364          the effective possible scale ratio is limited to 24 bits (or 16 million
  365          to 1). As you get close to the size of the float resolution (again, 16
  366          million pixels wide or high), you might start seeing float inaccuracy
  367@@ -234,10 +240,10 @@
  368          buffers).
  369 
  370       FLIPPED IMAGES
  371-         Stride is just the delta from one scanline to the next. This means you can
  372-         use a negative stride to handle inverted images (point to the final
  373-         scanline and use a negative stride). You can invert the input or output,
  374-         using negative strides.
  375+         Stride is just the delta from one scanline to the next. This means you
  376+   can use a negative stride to handle inverted images (point to the final
  377+         scanline and use a negative stride). You can invert the input or
  378+   output, using negative strides.
  379 
  380       DEFAULT FILTERS
  381          For functions which don't provide explicit control over what filters to
  382@@ -254,37 +260,41 @@
  383          using the stbir_set_filter_callbacks function.
  384 
  385       PROGRESS
  386-         For interactive use with slow resize operations, you can use the 
  387-         scanline callbacks in the extended API. It would have to be a *very* large
  388-         image resample to need progress though - we're very fast.
  389+         For interactive use with slow resize operations, you can use the
  390+         scanline callbacks in the extended API. It would have to be a *very*
  391+   large image resample to need progress though - we're very fast.
  392 
  393       CEIL and FLOOR
  394-         In scalar mode, the only functions we use from math.h are ceilf and floorf,
  395-         but if you have your own versions, you can define the STBIR_CEILF(v) and
  396-         STBIR_FLOORF(v) macros and we'll use them instead. In SIMD, we just use
  397+         In scalar mode, the only functions we use from math.h are ceilf and
  398+   floorf, but if you have your own versions, you can define the STBIR_CEILF(v)
  399+   and STBIR_FLOORF(v) macros and we'll use them instead. In SIMD, we just use
  400          our own versions.
  401 
  402       ASSERT
  403          Define STBIR_ASSERT(boolval) to override assert() and not use assert.h
  404 
  405      PORTING FROM VERSION 1
  406-        The API has changed. You can continue to use the old version of stb_image_resize.h,
  407-        which is available in the "deprecated/" directory.
  408+        The API has changed. You can continue to use the old version of
  409+   stb_image_resize.h, which is available in the "deprecated/" directory.
  410 
  411         If you're using the old simple-to-use API, porting is straightforward.
  412         (For more advanced APIs, read the documentation.)
  413 
  414           stbir_resize_uint8():
  415-            - call `stbir_resize_uint8_linear`, cast channel count to `stbir_pixel_layout`
  416+            - call `stbir_resize_uint8_linear`, cast channel count to
  417+   `stbir_pixel_layout`
  418 
  419           stbir_resize_float():
  420-            - call `stbir_resize_float_linear`, cast channel count to `stbir_pixel_layout`
  421+            - call `stbir_resize_float_linear`, cast channel count to
  422+   `stbir_pixel_layout`
  423 
  424           stbir_resize_uint8_srgb():
  425             - function name is unchanged
  426             - cast channel count to `stbir_pixel_layout`
  427-            - above is sufficient unless your image has alpha and it's not RGBA/BGRA
  428-              - in that case, follow the below instructions for stbir_resize_uint8_srgb_edgemode
  429+            - above is sufficient unless your image has alpha and it's not
  430+   RGBA/BGRA
  431+              - in that case, follow the below instructions for
  432+   stbir_resize_uint8_srgb_edgemode
  433 
  434           stbir_resize_uint8_srgb_edgemode()
  435             - switch to the "medium complexity" API
  436@@ -293,7 +303,8 @@
  437               - data_type:    STBIR_TYPE_UINT8_SRGB
  438               - edge:         unchanged (STBIR_EDGE_WRAP, etc.)
  439               - filter:       STBIR_FILTER_DEFAULT
  440-            - which channel is alpha is specified in stbir_pixel_layout, see enum for details
  441+            - which channel is alpha is specified in stbir_pixel_layout, see
  442+   enum for details
  443 
  444       FUTURE TODOS
  445         *  For polyphase integral filters, we just memcpy the coeffs to dupe
  446@@ -302,20 +313,20 @@
  447            (maybe, 1->3/4, 3->4, 4->1, 3->1).
  448          * For SIMD encode and decode scanline routines, do any pre-aligning
  449            for bad input/output buffer alignments and pitch?
  450-         * For very wide scanlines, we should we do vertical strips to stay within
  451-           L2 cache. Maybe do chunks of 1K pixels at a time. There would be
  452-           some pixel reconversion, but probably dwarfed by things falling out
  453-           of cache. Probably also something possible with alternating between
  454-           scattering and gathering at high resize scales?
  455+         * For very wide scanlines, we should do vertical strips to stay
  456+   within L2 cache. Maybe do chunks of 1K pixels at a time. There would be some
  457+   pixel reconversion, but probably dwarfed by things falling out of cache.
  458+   Probably also something possible with alternating between scattering and
  459+   gathering at high resize scales?
  460          * Should we have a multiple MIPs at the same time function (could keep
  461            more memory in cache during multiple resizes)?
  462          * Rewrite the coefficient generator to do many at once.
  463          * AVX-512 vertical kernels - worried about downclocking here.
  464          * Convert the reincludes to macros when we know they aren't changing.
  465          * Experiment with pivoting the horizontal and always using the
  466-           vertical filters (which are faster, but perhaps not enough to overcome
  467-           the pivot cost and the extra memory touches). Need to buffer the whole
  468-           image so have to balance memory use.
  469+           vertical filters (which are faster, but perhaps not enough to
  470+   overcome the pivot cost and the extra memory touches). Need to buffer the
  471+   whole image so have to balance memory use.
  472          * Most of our code is internally function pointers, should we compile
  473            all the SIMD stuff always and dynamically dispatch?
  474 
  475@@ -330,69 +341,63 @@
  476 
  477    REVISIONS
  478       2.17 (2025-10-25) silly format bug in easy-to-use APIs.
  479-      2.16 (2025-10-21) fixed the easy-to-use APIs to allow inverted bitmaps (negative
  480-                          strides), fix vertical filter kernel callback, fix threaded
  481-                          gather buffer priming (and assert).
  482-                          (thanks adipose, TainZerL, and Harrison Green)
  483-      2.15 (2025-07-17) fixed an assert in debug mode when using floats with input
  484-                          callbacks, work around GCC warning when adding to null ptr
  485-                          (thanks Johannes Spohr and Pyry Kovanen).
  486-      2.14 (2025-05-09) fixed a bug using downsampling gather horizontal first, and 
  487-                          scatter with vertical first.
  488-      2.13 (2025-02-27) fixed a bug when using input callbacks, turned off simd for 
  489-                          tiny-c, fixed some variables that should have been static,
  490-                          fixes a bug when calculating temp memory with resizes that
  491-                          exceed 2GB of temp memory (very large resizes).
  492-      2.12 (2024-10-18) fix incorrect use of user_data with STBIR_FREE
  493-      2.11 (2024-09-08) fix harmless asan warnings in 2-channel and 3-channel mode
  494-                          with AVX-2, fix some weird scaling edge conditions with
  495-                          point sample mode.
  496-      2.10 (2024-07-27) fix the defines GCC and mingw for loop unroll control,
  497-                          fix MSVC 32-bit arm half float routines.
  498-      2.09 (2024-06-19) fix the defines for 32-bit ARM GCC builds (was selecting
  499+      2.16 (2025-10-21) fixed the easy-to-use APIs to allow inverted bitmaps
  500+   (negative strides), fix vertical filter kernel callback, fix threaded gather
  501+   buffer priming (and assert). (thanks adipose, TainZerL, and Harrison Green)
  502+      2.15 (2025-07-17) fixed an assert in debug mode when using floats with
  503+   input callbacks, work around GCC warning when adding to null ptr (thanks
  504+   Johannes Spohr and Pyry Kovanen). 2.14 (2025-05-09) fixed a bug using
  505+   downsampling gather horizontal first, and scatter with vertical first. 2.13
  506+   (2025-02-27) fixed a bug when using input callbacks, turned off simd for
  507+                          tiny-c, fixed some variables that should have been
  508+   static, fixes a bug when calculating temp memory with resizes that exceed 2GB
  509+   of temp memory (very large resizes). 2.12 (2024-10-18) fix incorrect use of
  510+   user_data with STBIR_FREE 2.11 (2024-09-08) fix harmless asan warnings in
  511+   2-channel and 3-channel mode with AVX-2, fix some weird scaling edge
  512+   conditions with point sample mode. 2.10 (2024-07-27) fix the defines GCC and
  513+   mingw for loop unroll control, fix MSVC 32-bit arm half float routines. 2.09
  514+   (2024-06-19) fix the defines for 32-bit ARM GCC builds (was selecting
  515                           hardware half floats).
  516-      2.08 (2024-06-10) fix for RGB->BGR three channel flips and add SIMD (thanks
  517-                          to Ryan Salsbury), fix for sub-rect resizes, use the
  518-                          pragmas to control unrolling when they are available.
  519-      2.07 (2024-05-24) fix for slow final split during threaded conversions of very 
  520-                          wide scanlines when downsampling (caused by extra input 
  521-                          converting), fix for wide scanline resamples with many 
  522-                          splits (int overflow), fix GCC warning.
  523-      2.06 (2024-02-10) fix for identical width/height 3x or more down-scaling 
  524-                          undersampling a single row on rare resize ratios (about 1%).
  525-      2.05 (2024-02-07) fix for 2 pixel to 1 pixel resizes with wrap (thanks Aras),
  526-                        fix for output callback (thanks Julien Koenen).
  527-      2.04 (2023-11-17) fix for rare AVX bug, shadowed symbol (thanks Nikola Smiljanic).
  528-      2.03 (2023-11-01) ASAN and TSAN warnings fixed, minor tweaks.
  529-      2.00 (2023-10-10) mostly new source: new api, optimizations, simd, vertical-first, etc
  530-                          2x-5x faster without simd, 4x-12x faster with simd,
  531-                          in some cases, 20x to 40x faster esp resizing large to very small.
  532-      0.96 (2019-03-04) fixed warnings
  533-      0.95 (2017-07-23) fixed warnings
  534-      0.94 (2017-03-18) fixed warnings
  535-      0.93 (2017-03-03) fixed bug with certain combinations of heights
  536-      0.92 (2017-01-02) fix integer overflow on large (>2GB) images
  537-      0.91 (2016-04-02) fix warnings; fix handling of subpixel regions
  538-      0.90 (2014-09-17) first released version
  539+      2.08 (2024-06-10) fix for RGB->BGR three channel flips and add SIMD
  540+   (thanks to Ryan Salsbury), fix for sub-rect resizes, use the pragmas to
  541+   control unrolling when they are available. 2.07 (2024-05-24) fix for slow
  542+   final split during threaded conversions of very wide scanlines when
  543+   downsampling (caused by extra input converting), fix for wide scanline
  544+   resamples with many splits (int overflow), fix GCC warning. 2.06 (2024-02-10)
  545+   fix for identical width/height 3x or more down-scaling undersampling a single
  546+   row on rare resize ratios (about 1%). 2.05 (2024-02-07) fix for 2 pixel to 1
  547+   pixel resizes with wrap (thanks Aras), fix for output callback (thanks Julien
  548+   Koenen). 2.04 (2023-11-17) fix for rare AVX bug, shadowed symbol (thanks
  549+   Nikola Smiljanic). 2.03 (2023-11-01) ASAN and TSAN warnings fixed, minor
  550+   tweaks. 2.00 (2023-10-10) mostly new source: new api, optimizations, simd,
  551+   vertical-first, etc 2x-5x faster without simd, 4x-12x faster with simd, in
  552+   some cases, 20x to 40x faster esp resizing large to very small. 0.96
  553+   (2019-03-04) fixed warnings 0.95 (2017-07-23) fixed warnings 0.94
  554+   (2017-03-18) fixed warnings 0.93 (2017-03-03) fixed bug with certain
  555+   combinations of heights 0.92 (2017-01-02) fix integer overflow on large
  556+   (>2GB) images 0.91 (2016-04-02) fix warnings; fix handling of subpixel
  557+   regions 0.90 (2014-09-17) first released version
  558 
  559    LICENSE
  560      See end of file for license information.
  561 */
  562 
  563-#if !defined(STB_IMAGE_RESIZE_DO_HORIZONTALS) && !defined(STB_IMAGE_RESIZE_DO_VERTICALS) && !defined(STB_IMAGE_RESIZE_DO_CODERS)   // for internal re-includes
  564+#if !defined(STB_IMAGE_RESIZE_DO_HORIZONTALS) &&                               \
  565+    !defined(STB_IMAGE_RESIZE_DO_VERTICALS) &&                                 \
  566+    !defined(STB_IMAGE_RESIZE_DO_CODERS) // for internal re-includes
  567 
  568 #ifndef STBIR_INCLUDE_STB_IMAGE_RESIZE2_H
  569 #define STBIR_INCLUDE_STB_IMAGE_RESIZE2_H
  570 
  571 #include <stddef.h>
  572 #ifdef _MSC_VER
  573-typedef unsigned char    stbir_uint8;
  574-typedef unsigned short   stbir_uint16;
  575-typedef unsigned int     stbir_uint32;
  576+typedef unsigned char stbir_uint8;
  577+typedef unsigned short stbir_uint16;
  578+typedef unsigned int stbir_uint32;
  579 typedef unsigned __int64 stbir_uint64;
  580 #else
  581 #include <stdint.h>
  582-typedef uint8_t  stbir_uint8;
  583+typedef uint8_t stbir_uint8;
  584 typedef uint16_t stbir_uint16;
  585 typedef uint32_t stbir_uint32;
  586 typedef uint64_t stbir_uint64;
  587@@ -422,60 +427,74 @@ typedef uint64_t stbir_uint64;
  588 //     * Uses edge mode clamped
  589 //     * returned result is 1 for success or 0 in case of an error.
  590 
  591-
  592 // stbir_pixel_layout specifies:
  593 //   number of channels
  594 //   order of channels
  595 //   whether color is premultiplied by alpha
  596-// for back compatibility, you can cast the old channel count to an stbir_pixel_layout
  597-typedef enum
  598-{
  599-  STBIR_1CHANNEL = 1,
  600-  STBIR_2CHANNEL = 2,
  601-  STBIR_RGB      = 3,               // 3-chan, with order specified (for channel flipping)
  602-  STBIR_BGR      = 0,               // 3-chan, with order specified (for channel flipping)
  603-  STBIR_4CHANNEL = 5,
  604-
  605-  STBIR_RGBA = 4,                   // alpha formats, where alpha is NOT premultiplied into color channels
  606-  STBIR_BGRA = 6,
  607-  STBIR_ARGB = 7,
  608-  STBIR_ABGR = 8,
  609-  STBIR_RA   = 9,
  610-  STBIR_AR   = 10,
  611-
  612-  STBIR_RGBA_PM = 11,               // alpha formats, where alpha is premultiplied into color channels
  613-  STBIR_BGRA_PM = 12,
  614-  STBIR_ARGB_PM = 13,
  615-  STBIR_ABGR_PM = 14,
  616-  STBIR_RA_PM   = 15,
  617-  STBIR_AR_PM   = 16,
  618-
  619-  STBIR_RGBA_NO_AW = 11,            // alpha formats, where NO alpha weighting is applied at all!
  620-  STBIR_BGRA_NO_AW = 12,            //   these are just synonyms for the _PM flags (which also do
  621-  STBIR_ARGB_NO_AW = 13,            //   no alpha weighting). These names just make it more clear
  622-  STBIR_ABGR_NO_AW = 14,            //   for some folks).
  623-  STBIR_RA_NO_AW   = 15,
  624-  STBIR_AR_NO_AW   = 16,
  625+// for back compatibility, you can cast the old channel count to an
  626+// stbir_pixel_layout
  627+typedef enum {
  628+	STBIR_1CHANNEL = 1,
  629+	STBIR_2CHANNEL = 2,
  630+	STBIR_RGB = 3, // 3-chan, with order specified (for channel flipping)
  631+	STBIR_BGR = 0, // 3-chan, with order specified (for channel flipping)
  632+	STBIR_4CHANNEL = 5,
  633+
  634+	STBIR_RGBA = 4, // alpha formats, where alpha is NOT premultiplied into
  635+	                // color channels
  636+	STBIR_BGRA = 6,
  637+	STBIR_ARGB = 7,
  638+	STBIR_ABGR = 8,
  639+	STBIR_RA = 9,
  640+	STBIR_AR = 10,
  641+
  642+	STBIR_RGBA_PM =
  643+	    11, // alpha formats, where alpha is premultiplied into color channels
  644+	STBIR_BGRA_PM = 12,
  645+	STBIR_ARGB_PM = 13,
  646+	STBIR_ABGR_PM = 14,
  647+	STBIR_RA_PM = 15,
  648+	STBIR_AR_PM = 16,
  649+
  650+	STBIR_RGBA_NO_AW =
  651+	    11, // alpha formats, where NO alpha weighting is applied at all!
  652+	STBIR_BGRA_NO_AW =
  653+	    12, //   these are just synonyms for the _PM flags (which also do
  654+	STBIR_ARGB_NO_AW =
  655+	    13, //   no alpha weighting). These names just make it more clear
  656+	STBIR_ABGR_NO_AW = 14, //   for some folks).
  657+	STBIR_RA_NO_AW = 15,
  658+	STBIR_AR_NO_AW = 16,
  659 
  660 } stbir_pixel_layout;
  661 
  662 //===============================================================
  663 //  Simple-complexity API
  664 //
  665-//    If output_pixels is NULL (0), then we will allocate the buffer and return it to you.
  666+//    If output_pixels is NULL (0), then we will allocate the buffer and return
  667+//    it to you.
  668 //--------------------------------
  669 
  670-STBIRDEF unsigned char * stbir_resize_uint8_srgb( const unsigned char *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
  671-                                                        unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
  672-                                                        stbir_pixel_layout pixel_type );
  673-
  674-STBIRDEF unsigned char * stbir_resize_uint8_linear( const unsigned char *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
  675-                                                          unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
  676-                                                          stbir_pixel_layout pixel_type );
  677-
  678-STBIRDEF float * stbir_resize_float_linear( const float *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
  679-                                                  float *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
  680-                                                  stbir_pixel_layout pixel_type );
  681+STBIRDEF unsigned char *
  682+stbir_resize_uint8_srgb(const unsigned char *input_pixels, int input_w,
  683+                        int input_h, int input_stride_in_bytes,
  684+                        unsigned char *output_pixels, int output_w,
  685+                        int output_h, int output_stride_in_bytes,
  686+                        stbir_pixel_layout pixel_type);
  687+
  688+STBIRDEF unsigned char *
  689+stbir_resize_uint8_linear(const unsigned char *input_pixels, int input_w,
  690+                          int input_h, int input_stride_in_bytes,
  691+                          unsigned char *output_pixels, int output_w,
  692+                          int output_h, int output_stride_in_bytes,
  693+                          stbir_pixel_layout pixel_type);
  694+
  695+STBIRDEF float *
  696+stbir_resize_float_linear(const float *input_pixels, int input_w, int input_h,
  697+                          int input_stride_in_bytes, float *output_pixels,
  698+                          int output_w, int output_h,
  699+                          int output_stride_in_bytes,
  700+                          stbir_pixel_layout pixel_type);
  701 //===============================================================
  702 
  703 //===============================================================
  704@@ -488,45 +507,48 @@ STBIRDEF float * stbir_resize_float_linear( const float *input_pixels , int inpu
  705 //     * Filter can be selected explicitly
  706 //--------------------------------
  707 
  708-typedef enum
  709-{
  710-  STBIR_EDGE_CLAMP   = 0,
  711-  STBIR_EDGE_REFLECT = 1,
  712-  STBIR_EDGE_WRAP    = 2,  // this edge mode is slower and uses more memory
  713-  STBIR_EDGE_ZERO    = 3,
  714+typedef enum {
  715+	STBIR_EDGE_CLAMP = 0,
  716+	STBIR_EDGE_REFLECT = 1,
  717+	STBIR_EDGE_WRAP = 2, // this edge mode is slower and uses more memory
  718+	STBIR_EDGE_ZERO = 3,
  719 } stbir_edge;
  720 
  721-typedef enum
  722-{
  723-  STBIR_FILTER_DEFAULT      = 0,  // use same filter type that easy-to-use API chooses
  724-  STBIR_FILTER_BOX          = 1,  // A trapezoid w/1-pixel wide ramps, same result as box for integer scale ratios
  725-  STBIR_FILTER_TRIANGLE     = 2,  // On upsampling, produces same results as bilinear texture filtering
  726-  STBIR_FILTER_CUBICBSPLINE = 3,  // The cubic b-spline (aka Mitchell-Netrevalli with B=1,C=0), gaussian-esque
  727-  STBIR_FILTER_CATMULLROM   = 4,  // An interpolating cubic spline
  728-  STBIR_FILTER_MITCHELL     = 5,  // Mitchell-Netrevalli filter with B=1/3, C=1/3
  729-  STBIR_FILTER_POINT_SAMPLE = 6,  // Simple point sampling
  730-  STBIR_FILTER_OTHER        = 7,  // User callback specified
  731+typedef enum {
  732+	STBIR_FILTER_DEFAULT =
  733+	    0,                // use same filter type that easy-to-use API chooses
  734+	STBIR_FILTER_BOX = 1, // A trapezoid w/1-pixel wide ramps, same result as
  735+	                      // box for integer scale ratios
  736+	STBIR_FILTER_TRIANGLE =
  737+	    2, // On upsampling, produces same results as bilinear texture filtering
  738+	STBIR_FILTER_CUBICBSPLINE =
  739+	    3, // The cubic b-spline (aka Mitchell-Netrevalli with B=1,C=0),
  740+	       // gaussian-esque
  741+	STBIR_FILTER_CATMULLROM = 4, // An interpolating cubic spline
  742+	STBIR_FILTER_MITCHELL = 5,   // Mitchell-Netrevalli filter with B=1/3, C=1/3
  743+	STBIR_FILTER_POINT_SAMPLE = 6, // Simple point sampling
  744+	STBIR_FILTER_OTHER = 7,        // User callback specified
  745 } stbir_filter;
  746 
  747-typedef enum
  748-{
  749-  STBIR_TYPE_UINT8            = 0,
  750-  STBIR_TYPE_UINT8_SRGB       = 1,
  751-  STBIR_TYPE_UINT8_SRGB_ALPHA = 2,  // alpha channel, when present, should also be SRGB (this is very unusual)
  752-  STBIR_TYPE_UINT16           = 3,
  753-  STBIR_TYPE_FLOAT            = 4,
  754-  STBIR_TYPE_HALF_FLOAT       = 5
  755+typedef enum {
  756+	STBIR_TYPE_UINT8 = 0,
  757+	STBIR_TYPE_UINT8_SRGB = 1,
  758+	STBIR_TYPE_UINT8_SRGB_ALPHA = 2, // alpha channel, when present, should also
  759+	                                 // be SRGB (this is very unusual)
  760+	STBIR_TYPE_UINT16 = 3,
  761+	STBIR_TYPE_FLOAT = 4,
  762+	STBIR_TYPE_HALF_FLOAT = 5
  763 } stbir_datatype;
  764 
  765 // medium api
  766-STBIRDEF void *  stbir_resize( const void *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
  767-                                     void *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
  768-                               stbir_pixel_layout pixel_layout, stbir_datatype data_type,
  769-                               stbir_edge edge, stbir_filter filter );
  770+STBIRDEF void *
  771+stbir_resize(const void *input_pixels, int input_w, int input_h,
  772+             int input_stride_in_bytes, void *output_pixels, int output_w,
  773+             int output_h, int output_stride_in_bytes,
  774+             stbir_pixel_layout pixel_layout, stbir_datatype data_type,
  775+             stbir_edge edge, stbir_filter filter);
  776 //===============================================================
  777 
  778-
  779-
  780 //===============================================================
  781 // Extended-complexity API
  782 //
  783@@ -540,122 +562,175 @@ STBIRDEF void *  stbir_resize( const void *input_pixels , int input_w , int inpu
  784 //     * Can specify a memory callback
  785 //     * Can specify a callback data type for pixel input and output
  786 //     * Can be threaded for a single resize
  787-//     * Can be used to resize many frames without recalculating the sampler info
  788+//     * Can be used to resize many frames without recalculating the sampler
  789+//     info
  790 //
  791 //  Use this API as follows:
  792 //     1) Call the stbir_resize_init function on a local STBIR_RESIZE structure
  793 //     2) Call any of the stbir_set functions
  794-//     3) Optionally call stbir_build_samplers() if you are going to resample multiple times
  795+//     3) Optionally call stbir_build_samplers() if you are going to resample
  796+//     multiple times
  797 //        with the same input and output dimensions (like resizing video frames)
  798 //     4) Resample by calling stbir_resize_extended().
  799 //     5) Call stbir_free_samplers() if you called stbir_build_samplers()
  800 //--------------------------------
  801 
  802-
  803 // Types:
  804 
  805 // INPUT CALLBACK: this callback is used for input scanlines
  806-typedef void const * stbir_input_callback( void * optional_output, void const * input_ptr, int num_pixels, int x, int y, void * context );
  807+typedef void const *
  808+stbir_input_callback(void *optional_output, void const *input_ptr,
  809+                     int num_pixels, int x, int y, void *context);
  810 
  811 // OUTPUT CALLBACK: this callback is used for output scanlines
  812-typedef void stbir_output_callback( void const * output_ptr, int num_pixels, int y, void * context );
  813+typedef void
  814+stbir_output_callback(void const *output_ptr, int num_pixels, int y,
  815+                      void *context);
  816 
  817 // callbacks for user installed filters
  818-typedef float stbir__kernel_callback( float x, float scale, void * user_data ); // centered at zero
  819-typedef float stbir__support_callback( float scale, void * user_data );
  820+typedef float
  821+stbir__kernel_callback(float x, float scale,
  822+                       void *user_data); // centered at zero
  823+typedef float
  824+stbir__support_callback(float scale, void *user_data);
  825 
  826 // internal structure with precomputed scaling
  827 typedef struct stbir__info stbir__info;
  828 
  829-typedef struct STBIR_RESIZE  // use the stbir_resize_init and stbir_override functions to set these values for future compatibility
  830-{
  831-  void * user_data;
  832-  void const * input_pixels;
  833-  int input_w, input_h;
  834-  double input_s0, input_t0, input_s1, input_t1;
  835-  stbir_input_callback * input_cb;
  836-  void * output_pixels;
  837-  int output_w, output_h;
  838-  int output_subx, output_suby, output_subw, output_subh;
  839-  stbir_output_callback * output_cb;
  840-  int input_stride_in_bytes;
  841-  int output_stride_in_bytes;
  842-  int splits;
  843-  int fast_alpha;
  844-  int needs_rebuild;
  845-  int called_alloc;
  846-  stbir_pixel_layout input_pixel_layout_public;
  847-  stbir_pixel_layout output_pixel_layout_public;
  848-  stbir_datatype input_data_type;
  849-  stbir_datatype output_data_type;
  850-  stbir_filter horizontal_filter, vertical_filter;
  851-  stbir_edge horizontal_edge, vertical_edge;
  852-  stbir__kernel_callback * horizontal_filter_kernel; stbir__support_callback * horizontal_filter_support;
  853-  stbir__kernel_callback * vertical_filter_kernel; stbir__support_callback * vertical_filter_support;
  854-  stbir__info * samplers;
  855+typedef struct STBIR_RESIZE // use the stbir_resize_init and stbir_override
  856+                            // functions to set these values for future
  857+                            // compatibility
  858+{
  859+	void *user_data;
  860+	void const *input_pixels;
  861+	int input_w, input_h;
  862+	double input_s0, input_t0, input_s1, input_t1;
  863+	stbir_input_callback *input_cb;
  864+	void *output_pixels;
  865+	int output_w, output_h;
  866+	int output_subx, output_suby, output_subw, output_subh;
  867+	stbir_output_callback *output_cb;
  868+	int input_stride_in_bytes;
  869+	int output_stride_in_bytes;
  870+	int splits;
  871+	int fast_alpha;
  872+	int needs_rebuild;
  873+	int called_alloc;
  874+	stbir_pixel_layout input_pixel_layout_public;
  875+	stbir_pixel_layout output_pixel_layout_public;
  876+	stbir_datatype input_data_type;
  877+	stbir_datatype output_data_type;
  878+	stbir_filter horizontal_filter, vertical_filter;
  879+	stbir_edge horizontal_edge, vertical_edge;
  880+	stbir__kernel_callback *horizontal_filter_kernel;
  881+	stbir__support_callback *horizontal_filter_support;
  882+	stbir__kernel_callback *vertical_filter_kernel;
  883+	stbir__support_callback *vertical_filter_support;
  884+	stbir__info *samplers;
  885 } STBIR_RESIZE;
  886 
  887 // extended complexity api
  888 
  889-
  890-// First off, you must ALWAYS call stbir_resize_init on your resize structure before any of the other calls!
  891-STBIRDEF void stbir_resize_init( STBIR_RESIZE * resize,
  892-                                 const void *input_pixels,  int input_w,  int input_h, int input_stride_in_bytes, // stride can be zero
  893-                                       void *output_pixels, int output_w, int output_h, int output_stride_in_bytes, // stride can be zero
  894-                                 stbir_pixel_layout pixel_layout, stbir_datatype data_type );
  895+// First off, you must ALWAYS call stbir_resize_init on your resize structure
  896+// before any of the other calls!
  897+STBIRDEF void
  898+stbir_resize_init(STBIR_RESIZE *resize, const void *input_pixels, int input_w,
  899+                  int input_h, int input_stride_in_bytes, // stride can be zero
  900+                  void *output_pixels, int output_w, int output_h,
  901+                  int output_stride_in_bytes, // stride can be zero
  902+                  stbir_pixel_layout pixel_layout, stbir_datatype data_type);
  903 
  904 //===============================================================
  905-// You can update these parameters any time after resize_init and there is no cost
  906+// You can update these parameters any time after resize_init and there is no
  907+// cost
  908 //--------------------------------
  909 
  910-STBIRDEF void stbir_set_datatypes( STBIR_RESIZE * resize, stbir_datatype input_type, stbir_datatype output_type );
  911-STBIRDEF void stbir_set_pixel_callbacks( STBIR_RESIZE * resize, stbir_input_callback * input_cb, stbir_output_callback * output_cb );   // no callbacks by default
  912-STBIRDEF void stbir_set_user_data( STBIR_RESIZE * resize, void * user_data );                                               // pass back STBIR_RESIZE* by default
  913-STBIRDEF void stbir_set_buffer_ptrs( STBIR_RESIZE * resize, const void * input_pixels, int input_stride_in_bytes, void * output_pixels, int output_stride_in_bytes );
  914+STBIRDEF void
  915+stbir_set_datatypes(STBIR_RESIZE *resize, stbir_datatype input_type,
  916+                    stbir_datatype output_type);
  917+STBIRDEF void
  918+stbir_set_pixel_callbacks(
  919+    STBIR_RESIZE *resize, stbir_input_callback *input_cb,
  920+    stbir_output_callback *output_cb); // no callbacks by default
  921+STBIRDEF void
  922+stbir_set_user_data(STBIR_RESIZE *resize,
  923+                    void *user_data); // pass back STBIR_RESIZE* by default
  924+STBIRDEF void
  925+stbir_set_buffer_ptrs(STBIR_RESIZE *resize, const void *input_pixels,
  926+                      int input_stride_in_bytes, void *output_pixels,
  927+                      int output_stride_in_bytes);
  928 
  929 //===============================================================
  930 
  931-
  932 //===============================================================
  933 // If you call any of these functions, you will trigger a sampler rebuild!
  934 //--------------------------------
  935 
  936-STBIRDEF int stbir_set_pixel_layouts( STBIR_RESIZE * resize, stbir_pixel_layout input_pixel_layout, stbir_pixel_layout output_pixel_layout );  // sets new buffer layouts
  937-STBIRDEF int stbir_set_edgemodes( STBIR_RESIZE * resize, stbir_edge horizontal_edge, stbir_edge vertical_edge );       // CLAMP by default
  938-
  939-STBIRDEF int stbir_set_filters( STBIR_RESIZE * resize, stbir_filter horizontal_filter, stbir_filter vertical_filter ); // STBIR_DEFAULT_FILTER_UPSAMPLE/DOWNSAMPLE by default
  940-STBIRDEF int stbir_set_filter_callbacks( STBIR_RESIZE * resize, stbir__kernel_callback * horizontal_filter, stbir__support_callback * horizontal_support, stbir__kernel_callback * vertical_filter, stbir__support_callback * vertical_support );
  941-
  942-STBIRDEF int stbir_set_pixel_subrect( STBIR_RESIZE * resize, int subx, int suby, int subw, int subh );        // sets both sub-regions (full regions by default)
  943-STBIRDEF int stbir_set_input_subrect( STBIR_RESIZE * resize, double s0, double t0, double s1, double t1 );    // sets input sub-region (full region by default)
  944-STBIRDEF int stbir_set_output_pixel_subrect( STBIR_RESIZE * resize, int subx, int suby, int subw, int subh ); // sets output sub-region (full region by default)
  945-
  946-// when inputting AND outputting non-premultiplied alpha pixels, we use a slower but higher quality technique
  947-//   that fills the zero alpha pixel's RGB values with something plausible.  If you don't care about areas of
  948-//   zero alpha, you can call this function to get about a 25% speed improvement for STBIR_RGBA to STBIR_RGBA
  949-//   types of resizes.
  950-STBIRDEF int stbir_set_non_pm_alpha_speed_over_quality( STBIR_RESIZE * resize, int non_pma_alpha_speed_over_quality );
  951+STBIRDEF int
  952+stbir_set_pixel_layouts(
  953+    STBIR_RESIZE *resize, stbir_pixel_layout input_pixel_layout,
  954+    stbir_pixel_layout output_pixel_layout); // sets new buffer layouts
  955+STBIRDEF int
  956+stbir_set_edgemodes(STBIR_RESIZE *resize, stbir_edge horizontal_edge,
  957+                    stbir_edge vertical_edge); // CLAMP by default
  958+
  959+STBIRDEF int
  960+stbir_set_filters(STBIR_RESIZE *resize, stbir_filter horizontal_filter,
  961+                  stbir_filter vertical_filter); // STBIR_DEFAULT_FILTER_UPSAMPLE/DOWNSAMPLE
  962+                                                 // by default
  963+STBIRDEF int
  964+stbir_set_filter_callbacks(STBIR_RESIZE *resize,
  965+                           stbir__kernel_callback *horizontal_filter,
  966+                           stbir__support_callback *horizontal_support,
  967+                           stbir__kernel_callback *vertical_filter,
  968+                           stbir__support_callback *vertical_support);
  969+
  970+STBIRDEF int
  971+stbir_set_pixel_subrect(
  972+    STBIR_RESIZE *resize, int subx, int suby, int subw,
  973+    int subh); // sets both sub-regions (full regions by default)
  974+STBIRDEF int
  975+stbir_set_input_subrect(
  976+    STBIR_RESIZE *resize, double s0, double t0, double s1,
  977+    double t1); // sets input sub-region (full region by default)
  978+STBIRDEF int
  979+stbir_set_output_pixel_subrect(
  980+    STBIR_RESIZE *resize, int subx, int suby, int subw,
  981+    int subh); // sets output sub-region (full region by default)
  982+
  983+// when inputting AND outputting non-premultiplied alpha pixels, we use a slower
  984+// but higher quality technique
  985+//   that fills the zero alpha pixel's RGB values with something plausible.  If
  986+//   you don't care about areas of zero alpha, you can call this function to get
  987+//   about a 25% speed improvement for STBIR_RGBA to STBIR_RGBA types of
  988+//   resizes.
  989+STBIRDEF int
  990+stbir_set_non_pm_alpha_speed_over_quality(STBIR_RESIZE *resize,
  991+                                          int non_pma_alpha_speed_over_quality);
  992 //===============================================================
  993 
  994-
  995 //===============================================================
  996-// You can call build_samplers to prebuild all the internal data we need to resample.
  997-//   Then, if you call resize_extended many times with the same resize, you only pay the
  998-//   cost once.
  999+// You can call build_samplers to prebuild all the internal data we need to
 1000+// resample.
 1001+//   Then, if you call resize_extended many times with the same resize, you only
 1002+//   pay the cost once.
 1003 // If you do call build_samplers, you MUST call free_samplers eventually.
 1004 //--------------------------------
 1005 
 1006 // This builds the samplers and does one allocation
 1007-STBIRDEF int stbir_build_samplers( STBIR_RESIZE * resize );
 1008+STBIRDEF int
 1009+stbir_build_samplers(STBIR_RESIZE *resize);
 1010 
 1011-// You MUST call this, if you call stbir_build_samplers or stbir_build_samplers_with_splits
 1012-STBIRDEF void stbir_free_samplers( STBIR_RESIZE * resize );
 1013+// You MUST call this, if you call stbir_build_samplers or
 1014+// stbir_build_samplers_with_splits
 1015+STBIRDEF void
 1016+stbir_free_samplers(STBIR_RESIZE *resize);
 1017 //===============================================================
 1018 
 1019-
 1020-// And this is the main function to perform the resize synchronously on one thread.
 1021-STBIRDEF int stbir_resize_extended( STBIR_RESIZE * resize );
 1022-
 1023+// And this is the main function to perform the resize synchronously on one
 1024+// thread.
 1025+STBIRDEF int
 1026+stbir_resize_extended(STBIR_RESIZE *resize);
 1027 
 1028 //===============================================================
 1029 // Use these functions for multithreading.
 1030@@ -669,23 +744,30 @@ STBIRDEF int stbir_resize_extended( STBIR_RESIZE * resize );
 1031 //   It returns the number of splits (threads) that you can call it with.
 1032 ///  It might be less if the image resize can't be split up that many ways.
 1033 
 1034-STBIRDEF int stbir_build_samplers_with_splits( STBIR_RESIZE * resize, int try_splits );
 1035+STBIRDEF int
 1036+stbir_build_samplers_with_splits(STBIR_RESIZE *resize, int try_splits);
 1037 
  1038 // This function does a split of the resizing (you call this function for each
 1039-// split, on multiple threads). A split is a piece of the output resize pixel space.
 1040+// split, on multiple threads). A split is a piece of the output resize pixel
 1041+// space.
 1042 
 1043-// Note that you MUST call stbir_build_samplers_with_splits before stbir_resize_extended_split!
 1044+// Note that you MUST call stbir_build_samplers_with_splits before
 1045+// stbir_resize_extended_split!
 1046 
 1047-// Usually, you will always call stbir_resize_split with split_start as the thread_index
 1048+// Usually, you will always call stbir_resize_split with split_start as the
 1049+// thread_index
 1050 //   and "1" for the split_count.
 1051-// But, if you have a weird situation where you MIGHT want 8 threads, but sometimes
 1052-//   only 4 threads, you can use 0,2,4,6 for the split_start's and use "2" for the
 1053-//   split_count each time to turn in into a 4 thread resize. (This is unusual).
 1054-
 1055-STBIRDEF int stbir_resize_extended_split( STBIR_RESIZE * resize, int split_start, int split_count );
 1056+// But, if you have a weird situation where you MIGHT want 8 threads, but
 1057+// sometimes
 1058+//   only 4 threads, you can use 0,2,4,6 for the split_start's and use "2" for
  1059+//   the split_count each time to turn it into a 4 thread resize. (This is
 1060+//   unusual).
 1061+
 1062+STBIRDEF int
 1063+stbir_resize_extended_split(STBIR_RESIZE *resize, int split_start,
 1064+                            int split_count);
 1065 //===============================================================
 1066 
 1067-
 1068 //===============================================================
 1069 // Pixel Callbacks info:
 1070 //--------------------------------
 1071@@ -700,83 +782,98 @@ STBIRDEF int stbir_resize_extended_split( STBIR_RESIZE * resize, int split_start
 1072 //   calculate your own input_ptr based on the size of each non-supported pixel.
 1073 //   (Something like the third example below.)
 1074 //
 1075-//   You can also install just an input or just an output callback by setting the
 1076-//   callback that you don't want to zero.
 1077+//   You can also install just an input or just an output callback by setting
 1078+//   the callback that you don't want to zero.
 1079 //
 1080-//     First example, progress: (getting a callback that you can monitor the progress):
 1081-//        void const * my_callback( void * optional_output, void const * input_ptr, int num_pixels, int x, int y, void * context )
 1082+//     First example, progress: (getting a callback that you can monitor the
 1083+//     progress):
 1084+//        void const * my_callback( void * optional_output, void const *
 1085+//        input_ptr, int num_pixels, int x, int y, void * context )
 1086 //        {
 1087 //           percentage_done = y / input_height;
 1088 //           return input_ptr;  // use buffer from call
 1089 //        }
 1090 //
 1091 //     Next example, copying: (copy from some other buffer or stream):
 1092-//        void const * my_callback( void * optional_output, void const * input_ptr, int num_pixels, int x, int y, void * context )
 1093+//        void const * my_callback( void * optional_output, void const *
 1094+//        input_ptr, int num_pixels, int x, int y, void * context )
 1095 //        {
 1096-//           CopyOrStreamData( optional_output, other_data_src, num_pixels * pixel_width_in_bytes );
 1097-//           return optional_output;  // return the optional buffer that we filled
 1098+//           CopyOrStreamData( optional_output, other_data_src, num_pixels *
 1099+//           pixel_width_in_bytes ); return optional_output;  // return the
 1100+//           optional buffer that we filled
 1101 //        }
 1102 //
 1103-//     Third example, input another buffer without copying: (zero-copy from other buffer):
 1104-//        void const * my_callback( void * optional_output, void const * input_ptr, int num_pixels, int x, int y, void * context )
 1105+//     Third example, input another buffer without copying: (zero-copy from
 1106+//     other buffer):
 1107+//        void const * my_callback( void * optional_output, void const *
 1108+//        input_ptr, int num_pixels, int x, int y, void * context )
 1109 //        {
 1110-//           void * pixels = ( (char*) other_image_base ) + ( y * other_image_stride ) + ( x * other_pixel_width_in_bytes );
 1111-//           return pixels;       // return pointer to your data without copying
 1112+//           void * pixels = ( (char*) other_image_base ) + ( y *
 1113+//           other_image_stride ) + ( x * other_pixel_width_in_bytes ); return
 1114+//           pixels;       // return pointer to your data without copying
 1115 //        }
 1116 //
 1117 //
 1118-//   The output callback is considerably simpler - it just calls you so that you can dump
 1119-//   out each scanline. You could even directly copy out to disk if you have a simple format
 1120-//   like TGA or BMP. You can also convert to other output types here if you want.
 1121+//   The output callback is considerably simpler - it just calls you so that you
 1122+//   can dump out each scanline. You could even directly copy out to disk if you
 1123+//   have a simple format like TGA or BMP. You can also convert to other output
 1124+//   types here if you want.
 1125 //
 1126 //   Simple example:
 1127-//        void const * my_output( void * output_ptr, int num_pixels, int y, void * context )
 1128+//        void const * my_output( void * output_ptr, int num_pixels, int y, void
 1129+//        * context )
 1130 //        {
 1131 //           percentage_done = y / output_height;
 1132-//           fwrite( output_ptr, pixel_width_in_bytes, num_pixels, output_file );
 1133+//           fwrite( output_ptr, pixel_width_in_bytes, num_pixels, output_file
 1134+//           );
 1135 //        }
 1136 //===============================================================
 1137 
 1138-
 1139-
 1140-
 1141 //===============================================================
 1142 // optional built-in profiling API
 1143 //--------------------------------
 1144 
 1145 #ifdef STBIR_PROFILE
 1146 
 1147-typedef struct STBIR_PROFILE_INFO
 1148-{
 1149-  stbir_uint64 total_clocks;
 1150+typedef struct STBIR_PROFILE_INFO {
 1151+	stbir_uint64 total_clocks;
 1152 
 1153-  // how many clocks spent (of total_clocks) in the various resize routines, along with a string description
 1154-  //    there are "resize_count" number of zones
 1155-  stbir_uint64 clocks[ 8 ];
 1156-  char const ** descriptions;
 1157+	// how many clocks spent (of total_clocks) in the various resize routines,
 1158+	// along with a string description
 1159+	//    there are "resize_count" number of zones
 1160+	stbir_uint64 clocks[8];
 1161+	char const **descriptions;
 1162 
 1163-  // count of clocks and descriptions
 1164-  stbir_uint32 count;
 1165+	// count of clocks and descriptions
 1166+	stbir_uint32 count;
 1167 } STBIR_PROFILE_INFO;
 1168 
 1169-// use after calling stbir_resize_extended (or stbir_build_samplers or stbir_build_samplers_with_splits)
 1170-STBIRDEF void stbir_resize_build_profile_info( STBIR_PROFILE_INFO * out_info, STBIR_RESIZE const * resize );
 1171+// use after calling stbir_resize_extended (or stbir_build_samplers or
 1172+// stbir_build_samplers_with_splits)
 1173+STBIRDEF void
 1174+stbir_resize_build_profile_info(STBIR_PROFILE_INFO *out_info,
 1175+                                STBIR_RESIZE const *resize);
 1176 
 1177 // use after calling stbir_resize_extended
 1178-STBIRDEF void stbir_resize_extended_profile_info( STBIR_PROFILE_INFO * out_info, STBIR_RESIZE const * resize );
 1179+STBIRDEF void
 1180+stbir_resize_extended_profile_info(STBIR_PROFILE_INFO *out_info,
 1181+                                   STBIR_RESIZE const *resize);
 1182 
 1183 // use after calling stbir_resize_extended_split
 1184-STBIRDEF void stbir_resize_split_profile_info( STBIR_PROFILE_INFO * out_info, STBIR_RESIZE const * resize, int split_start, int split_num );
 1185+STBIRDEF void
 1186+stbir_resize_split_profile_info(STBIR_PROFILE_INFO *out_info,
 1187+                                STBIR_RESIZE const *resize, int split_start,
 1188+                                int split_num);
 1189 
 1190 //===============================================================
 1191 
 1192 #endif
 1193 
 1194-
 1195 ////   end header file   /////////////////////////////////////////////////////
 1196 #endif // STBIR_INCLUDE_STB_IMAGE_RESIZE2_H
 1197 
 1198-#if defined(STB_IMAGE_RESIZE_IMPLEMENTATION) || defined(STB_IMAGE_RESIZE2_IMPLEMENTATION)
 1199+#if defined(STB_IMAGE_RESIZE_IMPLEMENTATION) ||                                \
 1200+    defined(STB_IMAGE_RESIZE2_IMPLEMENTATION)
 1201 
 1202 #ifndef STBIR_ASSERT
 1203 #include <assert.h>
 1204@@ -785,9 +882,10 @@ STBIRDEF void stbir_resize_split_profile_info( STBIR_PROFILE_INFO * out_info, ST
 1205 
 1206 #ifndef STBIR_MALLOC
 1207 #include <stdlib.h>
 1208-#define STBIR_MALLOC(size,user_data) ((void)(user_data), malloc(size))
 1209-#define STBIR_FREE(ptr,user_data)    ((void)(user_data), free(ptr))
 1210-// (we used the comma operator to evaluate user_data, to avoid "unused parameter" warnings)
 1211+#define STBIR_MALLOC(size, user_data) ((void)(user_data), malloc(size))
 1212+#define STBIR_FREE(ptr, user_data) ((void)(user_data), free(ptr))
 1213+// (we used the comma operator to evaluate user_data, to avoid "unused
 1214+// parameter" warnings)
 1215 #endif
 1216 
 1217 #ifdef _MSC_VER
 1218@@ -800,30 +898,31 @@ STBIRDEF void stbir_resize_split_profile_info( STBIR_PROFILE_INFO * out_info, ST
 1219 
 1220 // Clang address sanitizer
 1221 #if defined(__has_feature)
 1222-  #if __has_feature(address_sanitizer) || __has_feature(memory_sanitizer)
 1223-    #ifndef STBIR__SEPARATE_ALLOCATIONS
 1224-      #define STBIR__SEPARATE_ALLOCATIONS
 1225-    #endif
 1226-  #endif
 1227+#if __has_feature(address_sanitizer) || __has_feature(memory_sanitizer)
 1228+#ifndef STBIR__SEPARATE_ALLOCATIONS
 1229+#define STBIR__SEPARATE_ALLOCATIONS
 1230+#endif
 1231+#endif
 1232 #endif
 1233 
 1234 #endif
 1235 
 1236 // GCC and MSVC
 1237 #if defined(__SANITIZE_ADDRESS__)
 1238-  #ifndef STBIR__SEPARATE_ALLOCATIONS
 1239-    #define STBIR__SEPARATE_ALLOCATIONS
 1240-  #endif
 1241+#ifndef STBIR__SEPARATE_ALLOCATIONS
 1242+#define STBIR__SEPARATE_ALLOCATIONS
 1243+#endif
 1244 #endif
 1245 
 1246 // Always turn off automatic FMA use - use STBIR_USE_FMA if you want.
 1247 // Otherwise, this is a determinism disaster.
 1248-#ifndef STBIR_DONT_CHANGE_FP_CONTRACT  // override in case you don't want this behavior
 1249+#ifndef STBIR_DONT_CHANGE_FP_CONTRACT // override in case you don't want this
 1250+                                      // behavior
 1251 #if defined(_MSC_VER) && !defined(__clang__)
 1252 #if _MSC_VER > 1200
 1253 #pragma fp_contract(off)
 1254 #endif
 1255-#elif defined(__GNUC__) &&  !defined(__clang__)
 1256+#elif defined(__GNUC__) && !defined(__clang__)
 1257 #pragma GCC optimize("fp-contract=off")
 1258 #else
 1259 #pragma STDC FP_CONTRACT OFF
 1260@@ -831,53 +930,53 @@ STBIRDEF void stbir_resize_split_profile_info( STBIR_PROFILE_INFO * out_info, ST
 1261 #endif
 1262 
 1263 #ifdef _MSC_VER
 1264-#define STBIR__UNUSED(v)  (void)(v)
 1265+#define STBIR__UNUSED(v) (void)(v)
 1266 #else
 1267-#define STBIR__UNUSED(v)  (void)sizeof(v)
 1268+#define STBIR__UNUSED(v) (void)sizeof(v)
 1269 #endif
 1270 
 1271-#define STBIR__ARRAY_SIZE(a) (sizeof((a))/sizeof((a)[0]))
 1272-
 1273+#define STBIR__ARRAY_SIZE(a) (sizeof((a)) / sizeof((a)[0]))
 1274 
 1275 #ifndef STBIR_DEFAULT_FILTER_UPSAMPLE
 1276-#define STBIR_DEFAULT_FILTER_UPSAMPLE    STBIR_FILTER_CATMULLROM
 1277+#define STBIR_DEFAULT_FILTER_UPSAMPLE STBIR_FILTER_CATMULLROM
 1278 #endif
 1279 
 1280 #ifndef STBIR_DEFAULT_FILTER_DOWNSAMPLE
 1281-#define STBIR_DEFAULT_FILTER_DOWNSAMPLE  STBIR_FILTER_MITCHELL
 1282+#define STBIR_DEFAULT_FILTER_DOWNSAMPLE STBIR_FILTER_MITCHELL
 1283 #endif
 1284 
 1285-
 1286 #ifndef STBIR__HEADER_FILENAME
 1287 #define STBIR__HEADER_FILENAME "stb_image_resize2.h"
 1288 #endif
 1289 
 1290-// the internal pixel layout enums are in a different order, so we can easily do range comparisons of types
 1291-//   the public pixel layout is ordered in a way that if you cast num_channels (1-4) to the enum, you get something sensible
 1292-typedef enum
 1293-{
 1294-  STBIRI_1CHANNEL = 0,
 1295-  STBIRI_2CHANNEL = 1,
 1296-  STBIRI_RGB      = 2,
 1297-  STBIRI_BGR      = 3,
 1298-  STBIRI_4CHANNEL = 4,
 1299-
 1300-  STBIRI_RGBA = 5,
 1301-  STBIRI_BGRA = 6,
 1302-  STBIRI_ARGB = 7,
 1303-  STBIRI_ABGR = 8,
 1304-  STBIRI_RA   = 9,
 1305-  STBIRI_AR   = 10,
 1306-
 1307-  STBIRI_RGBA_PM = 11,
 1308-  STBIRI_BGRA_PM = 12,
 1309-  STBIRI_ARGB_PM = 13,
 1310-  STBIRI_ABGR_PM = 14,
 1311-  STBIRI_RA_PM   = 15,
 1312-  STBIRI_AR_PM   = 16,
 1313+// the internal pixel layout enums are in a different order, so we can easily do
 1314+// range comparisons of types
 1315+//   the public pixel layout is ordered in a way that if you cast num_channels
 1316+//   (1-4) to the enum, you get something sensible
 1317+typedef enum {
 1318+	STBIRI_1CHANNEL = 0,
 1319+	STBIRI_2CHANNEL = 1,
 1320+	STBIRI_RGB = 2,
 1321+	STBIRI_BGR = 3,
 1322+	STBIRI_4CHANNEL = 4,
 1323+
 1324+	STBIRI_RGBA = 5,
 1325+	STBIRI_BGRA = 6,
 1326+	STBIRI_ARGB = 7,
 1327+	STBIRI_ABGR = 8,
 1328+	STBIRI_RA = 9,
 1329+	STBIRI_AR = 10,
 1330+
 1331+	STBIRI_RGBA_PM = 11,
 1332+	STBIRI_BGRA_PM = 12,
 1333+	STBIRI_ARGB_PM = 13,
 1334+	STBIRI_ABGR_PM = 14,
 1335+	STBIRI_RA_PM = 15,
 1336+	STBIRI_AR_PM = 16,
 1337 } stbir_internal_pixel_layout;
 1338 
 1339-// define the public pixel layouts to not compile inside the implementation (to avoid accidental use)
 1340+// define the public pixel layouts to not compile inside the implementation (to
 1341+// avoid accidental use)
 1342 #define STBIR_BGR bad_dont_use_in_implementation
 1343 #define STBIR_1CHANNEL STBIR_BGR
 1344 #define STBIR_2CHANNEL STBIR_BGR
 1345@@ -898,277 +997,324 @@ typedef enum
 1346 
 1347 // must match stbir_datatype
 1348 static unsigned char stbir__type_size[] = {
 1349-  1,1,1,2,4,2 // STBIR_TYPE_UINT8,STBIR_TYPE_UINT8_SRGB,STBIR_TYPE_UINT8_SRGB_ALPHA,STBIR_TYPE_UINT16,STBIR_TYPE_FLOAT,STBIR_TYPE_HALF_FLOAT
 1350+    1, 1, 1, 2,
 1351+    4, 2 // STBIR_TYPE_UINT8,STBIR_TYPE_UINT8_SRGB,STBIR_TYPE_UINT8_SRGB_ALPHA,STBIR_TYPE_UINT16,STBIR_TYPE_FLOAT,STBIR_TYPE_HALF_FLOAT
 1352 };
 1353 
 1354 // When gathering, the contributors are which source pixels contribute.
 1355-// When scattering, the contributors are which destination pixels are contributed to.
 1356-typedef struct
 1357-{
 1358-  int n0; // First contributing pixel
 1359-  int n1; // Last contributing pixel
 1360+// When scattering, the contributors are which destination pixels are
 1361+// contributed to.
 1362+typedef struct {
 1363+	int n0; // First contributing pixel
 1364+	int n1; // Last contributing pixel
 1365 } stbir__contributors;
 1366 
 1367-typedef struct
 1368-{
 1369-  int lowest;    // First sample index for whole filter
 1370-  int highest;   // Last sample index for whole filter
 1371-  int widest;    // widest single set of samples for an output
 1372+typedef struct {
 1373+	int lowest;  // First sample index for whole filter
 1374+	int highest; // Last sample index for whole filter
 1375+	int widest;  // widest single set of samples for an output
 1376 } stbir__filter_extent_info;
 1377 
 1378-typedef struct
 1379-{
 1380-  int n0; // First pixel of decode buffer to write to
 1381-  int n1; // Last pixel of decode that will be written to
 1382-  int pixel_offset_for_input;  // Pixel offset into input_scanline
 1383+typedef struct {
 1384+	int n0;                     // First pixel of decode buffer to write to
 1385+	int n1;                     // Last pixel of decode that will be written to
 1386+	int pixel_offset_for_input; // Pixel offset into input_scanline
 1387 } stbir__span;
 1388 
 1389-typedef struct stbir__scale_info
 1390-{
 1391-  int input_full_size;
 1392-  int output_sub_size;
 1393-  float scale;
 1394-  float inv_scale;
 1395-  float pixel_shift; // starting shift in output pixel space (in pixels)
 1396-  int scale_is_rational;
 1397-  stbir_uint32 scale_numerator, scale_denominator;
 1398+typedef struct stbir__scale_info {
 1399+	int input_full_size;
 1400+	int output_sub_size;
 1401+	float scale;
 1402+	float inv_scale;
 1403+	float pixel_shift; // starting shift in output pixel space (in pixels)
 1404+	int scale_is_rational;
 1405+	stbir_uint32 scale_numerator, scale_denominator;
 1406 } stbir__scale_info;
 1407 
 1408-typedef struct
 1409-{
 1410-  stbir__contributors * contributors;
 1411-  float* coefficients;
 1412-  stbir__contributors * gather_prescatter_contributors;
 1413-  float * gather_prescatter_coefficients;
 1414-  stbir__scale_info scale_info;
 1415-  float support;
 1416-  stbir_filter filter_enum;
 1417-  stbir__kernel_callback * filter_kernel;
 1418-  stbir__support_callback * filter_support;
 1419-  stbir_edge edge;
 1420-  int coefficient_width;
 1421-  int filter_pixel_width;
 1422-  int filter_pixel_margin;
 1423-  int num_contributors;
 1424-  int contributors_size;
 1425-  int coefficients_size;
 1426-  stbir__filter_extent_info extent_info;
 1427-  int is_gather;  // 0 = scatter, 1 = gather with scale >= 1, 2 = gather with scale < 1
 1428-  int gather_prescatter_num_contributors;
 1429-  int gather_prescatter_coefficient_width;
 1430-  int gather_prescatter_contributors_size;
 1431-  int gather_prescatter_coefficients_size;
 1432+typedef struct {
 1433+	stbir__contributors *contributors;
 1434+	float *coefficients;
 1435+	stbir__contributors *gather_prescatter_contributors;
 1436+	float *gather_prescatter_coefficients;
 1437+	stbir__scale_info scale_info;
 1438+	float support;
 1439+	stbir_filter filter_enum;
 1440+	stbir__kernel_callback *filter_kernel;
 1441+	stbir__support_callback *filter_support;
 1442+	stbir_edge edge;
 1443+	int coefficient_width;
 1444+	int filter_pixel_width;
 1445+	int filter_pixel_margin;
 1446+	int num_contributors;
 1447+	int contributors_size;
 1448+	int coefficients_size;
 1449+	stbir__filter_extent_info extent_info;
 1450+	int is_gather; // 0 = scatter, 1 = gather with scale >= 1, 2 = gather with
 1451+	               // scale < 1
 1452+	int gather_prescatter_num_contributors;
 1453+	int gather_prescatter_coefficient_width;
 1454+	int gather_prescatter_contributors_size;
 1455+	int gather_prescatter_coefficients_size;
 1456 } stbir__sampler;
 1457 
 1458-typedef struct
 1459-{
 1460-  stbir__contributors conservative;
 1461-  int edge_sizes[2];    // this can be less than filter_pixel_margin, if the filter and scaling falls off
 1462-  stbir__span spans[2]; // can be two spans, if doing input subrect with clamp mode WRAP
 1463+typedef struct {
 1464+	stbir__contributors conservative;
 1465+	int edge_sizes[2];    // this can be less than filter_pixel_margin, if the
 1466+	                      // filter and scaling falls off
 1467+	stbir__span spans[2]; // can be two spans, if doing input subrect with clamp
 1468+	                      // mode WRAP
 1469 } stbir__extents;
 1470 
 1471-typedef struct
 1472-{
 1473+typedef struct {
 1474 #ifdef STBIR_PROFILE
 1475-  union
 1476-  {
 1477-    struct { stbir_uint64 total, looping, vertical, horizontal, decode, encode, alpha, unalpha; } named;
 1478-    stbir_uint64 array[8];
 1479-  } profile;
 1480-  stbir_uint64 * current_zone_excluded_ptr;
 1481-#endif
 1482-  float* decode_buffer;
 1483-
 1484-  int ring_buffer_first_scanline;
 1485-  int ring_buffer_last_scanline;
 1486-  int ring_buffer_begin_index;    // first_scanline is at this index in the ring buffer
 1487-  int start_output_y, end_output_y;
 1488-  int start_input_y, end_input_y;  // used in scatter only
 1489-
 1490-  #ifdef STBIR__SEPARATE_ALLOCATIONS
 1491-    float** ring_buffers; // one pointer for each ring buffer
 1492-  #else
 1493-    float* ring_buffer;  // one big buffer that we index into
 1494-  #endif
 1495-
 1496-  float* vertical_buffer;
 1497-
 1498-  char no_cache_straddle[64];
 1499-} stbir__per_split_info;
 1500+	union {
 1501+		struct {
 1502+			stbir_uint64 total, looping, vertical, horizontal, decode, encode,
 1503+			    alpha, unalpha;
 1504+		} named;
 1505+		stbir_uint64 array[8];
 1506+	} profile;
 1507+	stbir_uint64 *current_zone_excluded_ptr;
 1508+#endif
 1509+	float *decode_buffer;
 1510 
 1511-typedef float * stbir__decode_pixels_func( float * decode, int width_times_channels, void const * input );
 1512-typedef void stbir__alpha_weight_func( float * decode_buffer, int width_times_channels );
 1513-typedef void stbir__horizontal_gather_channels_func( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer,
 1514-  stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width );
 1515-typedef void stbir__alpha_unweight_func(float * encode_buffer, int width_times_channels );
 1516-typedef void stbir__encode_pixels_func( void * output, int width_times_channels, float const * encode );
 1517+	int ring_buffer_first_scanline;
 1518+	int ring_buffer_last_scanline;
 1519+	int ring_buffer_begin_index; // first_scanline is at this index in the ring
 1520+	                             // buffer
 1521+	int start_output_y, end_output_y;
 1522+	int start_input_y, end_input_y; // used in scatter only
 1523 
 1524-struct stbir__info
 1525-{
 1526+#ifdef STBIR__SEPARATE_ALLOCATIONS
 1527+	float **ring_buffers; // one pointer for each ring buffer
 1528+#else
 1529+	float *ring_buffer; // one big buffer that we index into
 1530+#endif
 1531+
 1532+	float *vertical_buffer;
 1533+
 1534+	char no_cache_straddle[64];
 1535+} stbir__per_split_info;
 1536+
 1537+typedef float *
 1538+stbir__decode_pixels_func(float *decode, int width_times_channels,
 1539+                          void const *input);
 1540+typedef void
 1541+stbir__alpha_weight_func(float *decode_buffer, int width_times_channels);
 1542+typedef void
 1543+stbir__horizontal_gather_channels_func(
 1544+    float *output_buffer, unsigned int output_sub_size,
 1545+    float const *decode_buffer,
 1546+    stbir__contributors const *horizontal_contributors,
 1547+    float const *horizontal_coefficients, int coefficient_width);
 1548+typedef void
 1549+stbir__alpha_unweight_func(float *encode_buffer, int width_times_channels);
 1550+typedef void
 1551+stbir__encode_pixels_func(void *output, int width_times_channels,
 1552+                          float const *encode);
 1553+
 1554+struct stbir__info {
 1555 #ifdef STBIR_PROFILE
 1556-  union
 1557-  {
 1558-    struct { stbir_uint64 total, build, alloc, horizontal, vertical, cleanup, pivot; } named;
 1559-    stbir_uint64 array[7];
 1560-  } profile;
 1561-  stbir_uint64 * current_zone_excluded_ptr;
 1562-#endif
 1563-  stbir__sampler horizontal;
 1564-  stbir__sampler vertical;
 1565-
 1566-  void const * input_data;
 1567-  void * output_data;
 1568-
 1569-  int input_stride_bytes;
 1570-  int output_stride_bytes;
 1571-  int ring_buffer_length_bytes;   // The length of an individual entry in the ring buffer. The total number of ring buffers is stbir__get_filter_pixel_width(filter)
 1572-  int ring_buffer_num_entries;    // Total number of entries in the ring buffer.
 1573-
 1574-  stbir_datatype input_type;
 1575-  stbir_datatype output_type;
 1576-
 1577-  stbir_input_callback * in_pixels_cb;
 1578-  void * user_data;
 1579-  stbir_output_callback * out_pixels_cb;
 1580-
 1581-  stbir__extents scanline_extents;
 1582-
 1583-  void * alloced_mem;
 1584-  stbir__per_split_info * split_info;  // by default 1, but there will be N of these allocated based on the thread init you did
 1585-
 1586-  stbir__decode_pixels_func * decode_pixels;
 1587-  stbir__alpha_weight_func * alpha_weight;
 1588-  stbir__horizontal_gather_channels_func * horizontal_gather_channels;
 1589-  stbir__alpha_unweight_func * alpha_unweight;
 1590-  stbir__encode_pixels_func * encode_pixels;
 1591-
 1592-  int alloc_ring_buffer_num_entries;    // Number of entries in the ring buffer that will be allocated
 1593-  int splits; // count of splits
 1594-
 1595-  stbir_internal_pixel_layout input_pixel_layout_internal;
 1596-  stbir_internal_pixel_layout output_pixel_layout_internal;
 1597-
 1598-  int input_color_and_type;
 1599-  int offset_x, offset_y; // offset within output_data
 1600-  int vertical_first;
 1601-  int channels;
 1602-  int effective_channels; // same as channels, except on RGBA/ARGB (7), or XA/AX (3)
 1603-  size_t alloced_total;
 1604+	union {
 1605+		struct {
 1606+			stbir_uint64 total, build, alloc, horizontal, vertical, cleanup,
 1607+			    pivot;
 1608+		} named;
 1609+		stbir_uint64 array[7];
 1610+	} profile;
 1611+	stbir_uint64 *current_zone_excluded_ptr;
 1612+#endif
 1613+	stbir__sampler horizontal;
 1614+	stbir__sampler vertical;
 1615+
 1616+	void const *input_data;
 1617+	void *output_data;
 1618+
 1619+	int input_stride_bytes;
 1620+	int output_stride_bytes;
 1621+	int ring_buffer_length_bytes; // The length of an individual entry in the
 1622+	                              // ring buffer. The total number of ring
 1623+	                              // buffers is
 1624+	                              // stbir__get_filter_pixel_width(filter)
 1625+	int ring_buffer_num_entries;  // Total number of entries in the ring buffer.
 1626+
 1627+	stbir_datatype input_type;
 1628+	stbir_datatype output_type;
 1629+
 1630+	stbir_input_callback *in_pixels_cb;
 1631+	void *user_data;
 1632+	stbir_output_callback *out_pixels_cb;
 1633+
 1634+	stbir__extents scanline_extents;
 1635+
 1636+	void *alloced_mem;
 1637+	stbir__per_split_info
 1638+	    *split_info; // by default 1, but there will be N of these allocated
 1639+	                 // based on the thread init you did
 1640+
 1641+	stbir__decode_pixels_func *decode_pixels;
 1642+	stbir__alpha_weight_func *alpha_weight;
 1643+	stbir__horizontal_gather_channels_func *horizontal_gather_channels;
 1644+	stbir__alpha_unweight_func *alpha_unweight;
 1645+	stbir__encode_pixels_func *encode_pixels;
 1646+
 1647+	int alloc_ring_buffer_num_entries; // Number of entries in the ring buffer
 1648+	                                   // that will be allocated
 1649+	int splits;                        // count of splits
 1650+
 1651+	stbir_internal_pixel_layout input_pixel_layout_internal;
 1652+	stbir_internal_pixel_layout output_pixel_layout_internal;
 1653+
 1654+	int input_color_and_type;
 1655+	int offset_x, offset_y; // offset within output_data
 1656+	int vertical_first;
 1657+	int channels;
 1658+	int effective_channels; // same as channels, except on RGBA/ARGB (7), or
 1659+	                        // XA/AX (3)
 1660+	size_t alloced_total;
 1661 };
 1662 
 1663-
 1664-#define stbir__max_uint8_as_float             255.0f
 1665-#define stbir__max_uint16_as_float            65535.0f
 1666-#define stbir__max_uint8_as_float_inverted    3.9215689e-03f     // (1.0f/255.0f)
 1667-#define stbir__max_uint16_as_float_inverted   1.5259022e-05f     // (1.0f/65535.0f)
 1668-#define stbir__small_float ((float)1 / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20))
 1669+#define stbir__max_uint8_as_float 255.0f
 1670+#define stbir__max_uint16_as_float 65535.0f
 1671+#define stbir__max_uint8_as_float_inverted 3.9215689e-03f  // (1.0f/255.0f)
 1672+#define stbir__max_uint16_as_float_inverted 1.5259022e-05f // (1.0f/65535.0f)
 1673+#define stbir__small_float                                                     \
 1674+	((float)1 / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20) /    \
 1675+	 (1 << 20))
 1676 
 1677 // min/max friendly
 1678-#define STBIR_CLAMP(x, xmin, xmax) for(;;) { \
 1679-  if ( (x) < (xmin) ) (x) = (xmin);     \
 1680-  if ( (x) > (xmax) ) (x) = (xmax);     \
 1681-  break;                                \
 1682-}
 1683+#define STBIR_CLAMP(x, xmin, xmax)                                             \
 1684+	for (;;) {                                                                 \
 1685+		if ((x) < (xmin))                                                      \
 1686+			(x) = (xmin);                                                      \
 1687+		if ((x) > (xmax))                                                      \
 1688+			(x) = (xmax);                                                      \
 1689+		break;                                                                 \
 1690+	}
 1691 
 1692-static stbir__inline int stbir__min(int a, int b)
 1693+static stbir__inline int
 1694+stbir__min(int a, int b)
 1695 {
 1696-  return a < b ? a : b;
 1697+	return a < b ? a : b;
 1698 }
 1699 
 1700-static stbir__inline int stbir__max(int a, int b)
 1701+static stbir__inline int
 1702+stbir__max(int a, int b)
 1703 {
 1704-  return a > b ? a : b;
 1705+	return a > b ? a : b;
 1706 }
 1707 
 1708 static float stbir__srgb_uchar_to_linear_float[256] = {
 1709-  0.000000f, 0.000304f, 0.000607f, 0.000911f, 0.001214f, 0.001518f, 0.001821f, 0.002125f, 0.002428f, 0.002732f, 0.003035f,
 1710-  0.003347f, 0.003677f, 0.004025f, 0.004391f, 0.004777f, 0.005182f, 0.005605f, 0.006049f, 0.006512f, 0.006995f, 0.007499f,
 1711-  0.008023f, 0.008568f, 0.009134f, 0.009721f, 0.010330f, 0.010960f, 0.011612f, 0.012286f, 0.012983f, 0.013702f, 0.014444f,
 1712-  0.015209f, 0.015996f, 0.016807f, 0.017642f, 0.018500f, 0.019382f, 0.020289f, 0.021219f, 0.022174f, 0.023153f, 0.024158f,
 1713-  0.025187f, 0.026241f, 0.027321f, 0.028426f, 0.029557f, 0.030713f, 0.031896f, 0.033105f, 0.034340f, 0.035601f, 0.036889f,
 1714-  0.038204f, 0.039546f, 0.040915f, 0.042311f, 0.043735f, 0.045186f, 0.046665f, 0.048172f, 0.049707f, 0.051269f, 0.052861f,
 1715-  0.054480f, 0.056128f, 0.057805f, 0.059511f, 0.061246f, 0.063010f, 0.064803f, 0.066626f, 0.068478f, 0.070360f, 0.072272f,
 1716-  0.074214f, 0.076185f, 0.078187f, 0.080220f, 0.082283f, 0.084376f, 0.086500f, 0.088656f, 0.090842f, 0.093059f, 0.095307f,
 1717-  0.097587f, 0.099899f, 0.102242f, 0.104616f, 0.107023f, 0.109462f, 0.111932f, 0.114435f, 0.116971f, 0.119538f, 0.122139f,
 1718-  0.124772f, 0.127438f, 0.130136f, 0.132868f, 0.135633f, 0.138432f, 0.141263f, 0.144128f, 0.147027f, 0.149960f, 0.152926f,
 1719-  0.155926f, 0.158961f, 0.162029f, 0.165132f, 0.168269f, 0.171441f, 0.174647f, 0.177888f, 0.181164f, 0.184475f, 0.187821f,
 1720-  0.191202f, 0.194618f, 0.198069f, 0.201556f, 0.205079f, 0.208637f, 0.212231f, 0.215861f, 0.219526f, 0.223228f, 0.226966f,
 1721-  0.230740f, 0.234551f, 0.238398f, 0.242281f, 0.246201f, 0.250158f, 0.254152f, 0.258183f, 0.262251f, 0.266356f, 0.270498f,
 1722-  0.274677f, 0.278894f, 0.283149f, 0.287441f, 0.291771f, 0.296138f, 0.300544f, 0.304987f, 0.309469f, 0.313989f, 0.318547f,
 1723-  0.323143f, 0.327778f, 0.332452f, 0.337164f, 0.341914f, 0.346704f, 0.351533f, 0.356400f, 0.361307f, 0.366253f, 0.371238f,
 1724-  0.376262f, 0.381326f, 0.386430f, 0.391573f, 0.396755f, 0.401978f, 0.407240f, 0.412543f, 0.417885f, 0.423268f, 0.428691f,
 1725-  0.434154f, 0.439657f, 0.445201f, 0.450786f, 0.456411f, 0.462077f, 0.467784f, 0.473532f, 0.479320f, 0.485150f, 0.491021f,
 1726-  0.496933f, 0.502887f, 0.508881f, 0.514918f, 0.520996f, 0.527115f, 0.533276f, 0.539480f, 0.545725f, 0.552011f, 0.558340f,
 1727-  0.564712f, 0.571125f, 0.577581f, 0.584078f, 0.590619f, 0.597202f, 0.603827f, 0.610496f, 0.617207f, 0.623960f, 0.630757f,
 1728-  0.637597f, 0.644480f, 0.651406f, 0.658375f, 0.665387f, 0.672443f, 0.679543f, 0.686685f, 0.693872f, 0.701102f, 0.708376f,
 1729-  0.715694f, 0.723055f, 0.730461f, 0.737911f, 0.745404f, 0.752942f, 0.760525f, 0.768151f, 0.775822f, 0.783538f, 0.791298f,
 1730-  0.799103f, 0.806952f, 0.814847f, 0.822786f, 0.830770f, 0.838799f, 0.846873f, 0.854993f, 0.863157f, 0.871367f, 0.879622f,
 1731-  0.887923f, 0.896269f, 0.904661f, 0.913099f, 0.921582f, 0.930111f, 0.938686f, 0.947307f, 0.955974f, 0.964686f, 0.973445f,
 1732-  0.982251f, 0.991102f, 1.0f
 1733-};
 1734-
 1735-typedef union
 1736-{
 1737-  unsigned int u;
 1738-  float f;
 1739+    0.000000f, 0.000304f, 0.000607f, 0.000911f, 0.001214f, 0.001518f, 0.001821f,
 1740+    0.002125f, 0.002428f, 0.002732f, 0.003035f, 0.003347f, 0.003677f, 0.004025f,
 1741+    0.004391f, 0.004777f, 0.005182f, 0.005605f, 0.006049f, 0.006512f, 0.006995f,
 1742+    0.007499f, 0.008023f, 0.008568f, 0.009134f, 0.009721f, 0.010330f, 0.010960f,
 1743+    0.011612f, 0.012286f, 0.012983f, 0.013702f, 0.014444f, 0.015209f, 0.015996f,
 1744+    0.016807f, 0.017642f, 0.018500f, 0.019382f, 0.020289f, 0.021219f, 0.022174f,
 1745+    0.023153f, 0.024158f, 0.025187f, 0.026241f, 0.027321f, 0.028426f, 0.029557f,
 1746+    0.030713f, 0.031896f, 0.033105f, 0.034340f, 0.035601f, 0.036889f, 0.038204f,
 1747+    0.039546f, 0.040915f, 0.042311f, 0.043735f, 0.045186f, 0.046665f, 0.048172f,
 1748+    0.049707f, 0.051269f, 0.052861f, 0.054480f, 0.056128f, 0.057805f, 0.059511f,
 1749+    0.061246f, 0.063010f, 0.064803f, 0.066626f, 0.068478f, 0.070360f, 0.072272f,
 1750+    0.074214f, 0.076185f, 0.078187f, 0.080220f, 0.082283f, 0.084376f, 0.086500f,
 1751+    0.088656f, 0.090842f, 0.093059f, 0.095307f, 0.097587f, 0.099899f, 0.102242f,
 1752+    0.104616f, 0.107023f, 0.109462f, 0.111932f, 0.114435f, 0.116971f, 0.119538f,
 1753+    0.122139f, 0.124772f, 0.127438f, 0.130136f, 0.132868f, 0.135633f, 0.138432f,
 1754+    0.141263f, 0.144128f, 0.147027f, 0.149960f, 0.152926f, 0.155926f, 0.158961f,
 1755+    0.162029f, 0.165132f, 0.168269f, 0.171441f, 0.174647f, 0.177888f, 0.181164f,
 1756+    0.184475f, 0.187821f, 0.191202f, 0.194618f, 0.198069f, 0.201556f, 0.205079f,
 1757+    0.208637f, 0.212231f, 0.215861f, 0.219526f, 0.223228f, 0.226966f, 0.230740f,
 1758+    0.234551f, 0.238398f, 0.242281f, 0.246201f, 0.250158f, 0.254152f, 0.258183f,
 1759+    0.262251f, 0.266356f, 0.270498f, 0.274677f, 0.278894f, 0.283149f, 0.287441f,
 1760+    0.291771f, 0.296138f, 0.300544f, 0.304987f, 0.309469f, 0.313989f, 0.318547f,
 1761+    0.323143f, 0.327778f, 0.332452f, 0.337164f, 0.341914f, 0.346704f, 0.351533f,
 1762+    0.356400f, 0.361307f, 0.366253f, 0.371238f, 0.376262f, 0.381326f, 0.386430f,
 1763+    0.391573f, 0.396755f, 0.401978f, 0.407240f, 0.412543f, 0.417885f, 0.423268f,
 1764+    0.428691f, 0.434154f, 0.439657f, 0.445201f, 0.450786f, 0.456411f, 0.462077f,
 1765+    0.467784f, 0.473532f, 0.479320f, 0.485150f, 0.491021f, 0.496933f, 0.502887f,
 1766+    0.508881f, 0.514918f, 0.520996f, 0.527115f, 0.533276f, 0.539480f, 0.545725f,
 1767+    0.552011f, 0.558340f, 0.564712f, 0.571125f, 0.577581f, 0.584078f, 0.590619f,
 1768+    0.597202f, 0.603827f, 0.610496f, 0.617207f, 0.623960f, 0.630757f, 0.637597f,
 1769+    0.644480f, 0.651406f, 0.658375f, 0.665387f, 0.672443f, 0.679543f, 0.686685f,
 1770+    0.693872f, 0.701102f, 0.708376f, 0.715694f, 0.723055f, 0.730461f, 0.737911f,
 1771+    0.745404f, 0.752942f, 0.760525f, 0.768151f, 0.775822f, 0.783538f, 0.791298f,
 1772+    0.799103f, 0.806952f, 0.814847f, 0.822786f, 0.830770f, 0.838799f, 0.846873f,
 1773+    0.854993f, 0.863157f, 0.871367f, 0.879622f, 0.887923f, 0.896269f, 0.904661f,
 1774+    0.913099f, 0.921582f, 0.930111f, 0.938686f, 0.947307f, 0.955974f, 0.964686f,
 1775+    0.973445f, 0.982251f, 0.991102f, 1.0f};
 1776+
 1777+typedef union {
 1778+	unsigned int u;
 1779+	float f;
 1780 } stbir__FP32;
 1781 
 1782 // From https://gist.github.com/rygorous/2203834
 1783 
 1784 static const stbir_uint32 fp32_to_srgb8_tab4[104] = {
 1785-  0x0073000d, 0x007a000d, 0x0080000d, 0x0087000d, 0x008d000d, 0x0094000d, 0x009a000d, 0x00a1000d,
 1786-  0x00a7001a, 0x00b4001a, 0x00c1001a, 0x00ce001a, 0x00da001a, 0x00e7001a, 0x00f4001a, 0x0101001a,
 1787-  0x010e0033, 0x01280033, 0x01410033, 0x015b0033, 0x01750033, 0x018f0033, 0x01a80033, 0x01c20033,
 1788-  0x01dc0067, 0x020f0067, 0x02430067, 0x02760067, 0x02aa0067, 0x02dd0067, 0x03110067, 0x03440067,
 1789-  0x037800ce, 0x03df00ce, 0x044600ce, 0x04ad00ce, 0x051400ce, 0x057b00c5, 0x05dd00bc, 0x063b00b5,
 1790-  0x06970158, 0x07420142, 0x07e30130, 0x087b0120, 0x090b0112, 0x09940106, 0x0a1700fc, 0x0a9500f2,
 1791-  0x0b0f01cb, 0x0bf401ae, 0x0ccb0195, 0x0d950180, 0x0e56016e, 0x0f0d015e, 0x0fbc0150, 0x10630143,
 1792-  0x11070264, 0x1238023e, 0x1357021d, 0x14660201, 0x156601e9, 0x165a01d3, 0x174401c0, 0x182401af,
 1793-  0x18fe0331, 0x1a9602fe, 0x1c1502d2, 0x1d7e02ad, 0x1ed4028d, 0x201a0270, 0x21520256, 0x227d0240,
 1794-  0x239f0443, 0x25c003fe, 0x27bf03c4, 0x29a10392, 0x2b6a0367, 0x2d1d0341, 0x2ebe031f, 0x304d0300,
 1795-  0x31d105b0, 0x34a80555, 0x37520507, 0x39d504c5, 0x3c37048b, 0x3e7c0458, 0x40a8042a, 0x42bd0401,
 1796-  0x44c20798, 0x488e071e, 0x4c1c06b6, 0x4f76065d, 0x52a50610, 0x55ac05cc, 0x5892058f, 0x5b590559,
 1797-  0x5e0c0a23, 0x631c0980, 0x67db08f6, 0x6c55087f, 0x70940818, 0x74a007bd, 0x787d076c, 0x7c330723,
 1798+    0x0073000d, 0x007a000d, 0x0080000d, 0x0087000d, 0x008d000d, 0x0094000d,
 1799+    0x009a000d, 0x00a1000d, 0x00a7001a, 0x00b4001a, 0x00c1001a, 0x00ce001a,
 1800+    0x00da001a, 0x00e7001a, 0x00f4001a, 0x0101001a, 0x010e0033, 0x01280033,
 1801+    0x01410033, 0x015b0033, 0x01750033, 0x018f0033, 0x01a80033, 0x01c20033,
 1802+    0x01dc0067, 0x020f0067, 0x02430067, 0x02760067, 0x02aa0067, 0x02dd0067,
 1803+    0x03110067, 0x03440067, 0x037800ce, 0x03df00ce, 0x044600ce, 0x04ad00ce,
 1804+    0x051400ce, 0x057b00c5, 0x05dd00bc, 0x063b00b5, 0x06970158, 0x07420142,
 1805+    0x07e30130, 0x087b0120, 0x090b0112, 0x09940106, 0x0a1700fc, 0x0a9500f2,
 1806+    0x0b0f01cb, 0x0bf401ae, 0x0ccb0195, 0x0d950180, 0x0e56016e, 0x0f0d015e,
 1807+    0x0fbc0150, 0x10630143, 0x11070264, 0x1238023e, 0x1357021d, 0x14660201,
 1808+    0x156601e9, 0x165a01d3, 0x174401c0, 0x182401af, 0x18fe0331, 0x1a9602fe,
 1809+    0x1c1502d2, 0x1d7e02ad, 0x1ed4028d, 0x201a0270, 0x21520256, 0x227d0240,
 1810+    0x239f0443, 0x25c003fe, 0x27bf03c4, 0x29a10392, 0x2b6a0367, 0x2d1d0341,
 1811+    0x2ebe031f, 0x304d0300, 0x31d105b0, 0x34a80555, 0x37520507, 0x39d504c5,
 1812+    0x3c37048b, 0x3e7c0458, 0x40a8042a, 0x42bd0401, 0x44c20798, 0x488e071e,
 1813+    0x4c1c06b6, 0x4f76065d, 0x52a50610, 0x55ac05cc, 0x5892058f, 0x5b590559,
 1814+    0x5e0c0a23, 0x631c0980, 0x67db08f6, 0x6c55087f, 0x70940818, 0x74a007bd,
 1815+    0x787d076c, 0x7c330723,
 1816 };
 1817 
 1818-static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
 1819+static stbir__inline stbir_uint8
 1820+stbir__linear_to_srgb_uchar(float in)
 1821 {
 1822-  static const stbir__FP32 almostone = { 0x3f7fffff }; // 1-eps
 1823-  static const stbir__FP32 minval = { (127-13) << 23 };
 1824-  stbir_uint32 tab,bias,scale,t;
 1825-  stbir__FP32 f;
 1826+	static const stbir__FP32 almostone = {0x3f7fffff}; // 1-eps
 1827+	static const stbir__FP32 minval = {(127 - 13) << 23};
 1828+	stbir_uint32 tab, bias, scale, t;
 1829+	stbir__FP32 f;
 1830 
 1831-  // Clamp to [2^(-13), 1-eps]; these two values map to 0 and 1, respectively.
 1832-  // The tests are carefully written so that NaNs map to 0, same as in the reference
 1833-  // implementation.
 1834-  if (!(in > minval.f)) // written this way to catch NaNs
 1835-      return 0;
 1836-  if (in > almostone.f)
 1837-      return 255;
 1838+	// Clamp to [2^(-13), 1-eps]; these two values map to 0 and 1, respectively.
 1839+	// The tests are carefully written so that NaNs map to 0, same as in the
 1840+	// reference implementation.
 1841+	if (!(in > minval.f)) { // written this way to catch NaNs
 1842+		return 0;
 1843+	}
 1844+	if (in > almostone.f) {
 1845+		return 255;
 1846+	}
 1847 
 1848-  // Do the table lookup and unpack bias, scale
 1849-  f.f = in;
 1850-  tab = fp32_to_srgb8_tab4[(f.u - minval.u) >> 20];
 1851-  bias = (tab >> 16) << 9;
 1852-  scale = tab & 0xffff;
 1853+	// Do the table lookup and unpack bias, scale
 1854+	f.f = in;
 1855+	tab = fp32_to_srgb8_tab4[(f.u - minval.u) >> 20];
 1856+	bias = (tab >> 16) << 9;
 1857+	scale = tab & 0xffff;
 1858 
 1859-  // Grab next-highest mantissa bits and perform linear interpolation
 1860-  t = (f.u >> 12) & 0xff;
 1861-  return (unsigned char) ((bias + scale*t) >> 16);
 1862+	// Grab next-highest mantissa bits and perform linear interpolation
 1863+	t = (f.u >> 12) & 0xff;
 1864+	return (unsigned char)((bias + scale * t) >> 16);
 1865 }
 1866 
 1867 #ifndef STBIR_FORCE_GATHER_FILTER_SCANLINES_AMOUNT
 1868-#define STBIR_FORCE_GATHER_FILTER_SCANLINES_AMOUNT 32 // when downsampling and <= 32 scanlines of buffering, use gather. gather used down to 1/8th scaling for 25% win.
 1869+#define STBIR_FORCE_GATHER_FILTER_SCANLINES_AMOUNT                             \
 1870+	32 // when downsampling and <= 32 scanlines of buffering, use gather. gather
 1871+	   // used down to 1/8th scaling for 25% win.
 1872 #endif
 1873 
 1874 #ifndef STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS
 1875-#define STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS 4 // when threading, what is the minimum number of scanlines for a split?
 1876+#define STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS                               \
 1877+	4 // when threading, what is the minimum number of scanlines for a split?
 1878 #endif
 1879 
 1880 #define STBIR_INPUT_CALLBACK_PADDING 3
 1881 
 1882 #ifdef _M_IX86_FP
 1883-#if ( _M_IX86_FP >= 1 )
 1884+#if (_M_IX86_FP >= 1)
 1885 #ifndef STBIR_SSE
 1886 #define STBIR_SSE
 1887 #endif
 1888@@ -1176,41 +1322,47 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
 1889 #endif
 1890 
 1891 #ifdef __TINYC__
 1892-  // tiny c has no intrinsics yet - this can become a version check if they add them
 1893-  #define STBIR_NO_SIMD
 1894-#endif
 1895-
 1896-#if defined(_x86_64) || defined( __x86_64__ ) || defined( _M_X64 ) || defined(__x86_64) || defined(_M_AMD64) || defined(__SSE2__) || defined(STBIR_SSE) || defined(STBIR_SSE2)
 1897-  #ifndef STBIR_SSE2
 1898-    #define STBIR_SSE2
 1899-  #endif
 1900-  #if defined(__AVX__) || defined(STBIR_AVX2)
 1901-    #ifndef STBIR_AVX
 1902-      #ifndef STBIR_NO_AVX
 1903-        #define STBIR_AVX
 1904-      #endif
 1905-    #endif
 1906-  #endif
 1907-  #if defined(__AVX2__) || defined(STBIR_AVX2)
 1908-    #ifndef STBIR_NO_AVX2
 1909-      #ifndef STBIR_AVX2
 1910-        #define STBIR_AVX2
 1911-      #endif
 1912-      #if defined( _MSC_VER ) && !defined(__clang__)
 1913-        #ifndef STBIR_FP16C  // FP16C instructions are on all AVX2 cpus, so we can autoselect it here on microsoft - clang needs -m16c
 1914-          #define STBIR_FP16C
 1915-        #endif
 1916-      #endif
 1917-    #endif
 1918-  #endif
 1919-  #ifdef __F16C__
 1920-    #ifndef STBIR_FP16C  // turn on FP16C instructions if the define is set (for clang and gcc)
 1921-      #define STBIR_FP16C
 1922-    #endif
 1923-  #endif
 1924-#endif
 1925-
 1926-#if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) || ((__ARM_NEON_FP & 4) != 0) || defined(__ARM_NEON__)
 1927+// tiny c has no intrinsics yet - this can become a version check if they add
 1928+// them
 1929+#define STBIR_NO_SIMD
 1930+#endif
 1931+
 1932+#if defined(_x86_64) || defined(__x86_64__) || defined(_M_X64) ||              \
 1933+    defined(__x86_64) || defined(_M_AMD64) || defined(__SSE2__) ||             \
 1934+    defined(STBIR_SSE) || defined(STBIR_SSE2)
 1935+#ifndef STBIR_SSE2
 1936+#define STBIR_SSE2
 1937+#endif
 1938+#if defined(__AVX__) || defined(STBIR_AVX2)
 1939+#ifndef STBIR_AVX
 1940+#ifndef STBIR_NO_AVX
 1941+#define STBIR_AVX
 1942+#endif
 1943+#endif
 1944+#endif
 1945+#if defined(__AVX2__) || defined(STBIR_AVX2)
 1946+#ifndef STBIR_NO_AVX2
 1947+#ifndef STBIR_AVX2
 1948+#define STBIR_AVX2
 1949+#endif
 1950+#if defined(_MSC_VER) && !defined(__clang__)
 1951+#ifndef STBIR_FP16C // FP16C instructions are on all AVX2 cpus, so we can
 1952+                    // autoselect it here on microsoft - clang needs -m16c
 1953+#define STBIR_FP16C
 1954+#endif
 1955+#endif
 1956+#endif
 1957+#endif
 1958+#ifdef __F16C__
 1959+#ifndef STBIR_FP16C // turn on FP16C instructions if the define is set (for
 1960+                    // clang and gcc)
 1961+#define STBIR_FP16C
 1962+#endif
 1963+#endif
 1964+#endif
 1965+
 1966+#if defined(_M_ARM64) || defined(__aarch64__) || defined(__arm64__) ||         \
 1967+    ((__ARM_NEON_FP & 4) != 0) || defined(__ARM_NEON__)
 1968 #ifndef STBIR_NEON
 1969 #define STBIR_NEON
 1970 #endif
 1971@@ -1229,35 +1381,39 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
 1972 #endif
 1973 
 1974 // restrict pointers for the output pointers, other loop and unroll control
 1975-#if defined( _MSC_VER ) && !defined(__clang__)
 1976-  #define STBIR_STREAMOUT_PTR( star ) star __restrict
 1977-  #define STBIR_NO_UNROLL( ptr ) __assume(ptr) // this oddly keeps msvc from unrolling a loop
 1978-  #if _MSC_VER >= 1900
 1979-    #define STBIR_NO_UNROLL_LOOP_START __pragma(loop( no_vector )) 
 1980-  #else
 1981-    #define STBIR_NO_UNROLL_LOOP_START 
 1982-  #endif
 1983-#elif defined( __clang__ )
 1984-  #define STBIR_STREAMOUT_PTR( star ) star __restrict__
 1985-  #define STBIR_NO_UNROLL( ptr ) __asm__ (""::"r"(ptr)) 
 1986-  #if ( __clang_major__ >= 4 ) || ( ( __clang_major__ >= 3 ) && ( __clang_minor__ >= 5 ) )
 1987-    #define STBIR_NO_UNROLL_LOOP_START _Pragma("clang loop unroll(disable)") _Pragma("clang loop vectorize(disable)")
 1988-  #else
 1989-    #define STBIR_NO_UNROLL_LOOP_START
 1990-  #endif 
 1991-#elif defined( __GNUC__ )
 1992-  #define STBIR_STREAMOUT_PTR( star ) star __restrict__
 1993-  #define STBIR_NO_UNROLL( ptr ) __asm__ (""::"r"(ptr))
 1994-  #if __GNUC__ >= 14
 1995-    #define STBIR_NO_UNROLL_LOOP_START _Pragma("GCC unroll 0") _Pragma("GCC novector")
 1996-  #else
 1997-    #define STBIR_NO_UNROLL_LOOP_START
 1998-  #endif
 1999-  #define STBIR_NO_UNROLL_LOOP_START_INF_FOR
 2000+#if defined(_MSC_VER) && !defined(__clang__)
 2001+#define STBIR_STREAMOUT_PTR(star) star __restrict
 2002+#define STBIR_NO_UNROLL(ptr)                                                   \
 2003+	__assume(ptr) // this oddly keeps msvc from unrolling a loop
 2004+#if _MSC_VER >= 1900
 2005+#define STBIR_NO_UNROLL_LOOP_START __pragma(loop(no_vector))
 2006+#else
 2007+#define STBIR_NO_UNROLL_LOOP_START
 2008+#endif
 2009+#elif defined(__clang__)
 2010+#define STBIR_STREAMOUT_PTR(star) star __restrict__
 2011+#define STBIR_NO_UNROLL(ptr) __asm__("" ::"r"(ptr))
 2012+#if (__clang_major__ >= 4) || ((__clang_major__ >= 3) && (__clang_minor__ >= 5))
 2013+#define STBIR_NO_UNROLL_LOOP_START                                             \
 2014+	_Pragma("clang loop unroll(disable)")                                      \
 2015+	    _Pragma("clang loop vectorize(disable)")
 2016+#else
 2017+#define STBIR_NO_UNROLL_LOOP_START
 2018+#endif
 2019+#elif defined(__GNUC__)
 2020+#define STBIR_STREAMOUT_PTR(star) star __restrict__
 2021+#define STBIR_NO_UNROLL(ptr) __asm__("" ::"r"(ptr))
 2022+#if __GNUC__ >= 14
 2023+#define STBIR_NO_UNROLL_LOOP_START                                             \
 2024+	_Pragma("GCC unroll 0") _Pragma("GCC novector")
 2025 #else
 2026-  #define STBIR_STREAMOUT_PTR( star ) star
 2027-  #define STBIR_NO_UNROLL( ptr )
 2028-  #define STBIR_NO_UNROLL_LOOP_START
 2029+#define STBIR_NO_UNROLL_LOOP_START
 2030+#endif
 2031+#define STBIR_NO_UNROLL_LOOP_START_INF_FOR
 2032+#else
 2033+#define STBIR_STREAMOUT_PTR(star) star
 2034+#define STBIR_NO_UNROLL(ptr)
 2035+#define STBIR_NO_UNROLL_LOOP_START
 2036 #endif
 2037 
 2038 #ifndef STBIR_NO_UNROLL_LOOP_START_INF_FOR
 2039@@ -1299,1463 +1455,1914 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
 2040 #else // STBIR_SIMD
 2041 
 2042 #ifdef STBIR_SSE2
 2043-  #include <emmintrin.h>
 2044-
 2045-  #define stbir__simdf __m128
 2046-  #define stbir__simdi __m128i
 2047-
 2048-  #define stbir_simdi_castf( reg ) _mm_castps_si128(reg)
 2049-  #define stbir_simdf_casti( reg ) _mm_castsi128_ps(reg)
 2050-
 2051-  #define stbir__simdf_load( reg, ptr ) (reg) = _mm_loadu_ps( (float const*)(ptr) )
 2052-  #define stbir__simdi_load( reg, ptr ) (reg) = _mm_loadu_si128 ( (stbir__simdi const*)(ptr) )
 2053-  #define stbir__simdf_load1( out, ptr ) (out) = _mm_load_ss( (float const*)(ptr) )  // top values can be random (not denormal or nan for perf)
 2054-  #define stbir__simdi_load1( out, ptr ) (out) = _mm_castps_si128( _mm_load_ss( (float const*)(ptr) ))
 2055-  #define stbir__simdf_load1z( out, ptr ) (out) = _mm_load_ss( (float const*)(ptr) )  // top values must be zero
 2056-  #define stbir__simdf_frep4( fvar ) _mm_set_ps1( fvar )
 2057-  #define stbir__simdf_load1frep4( out, fvar ) (out) = _mm_set_ps1( fvar )
 2058-  #define stbir__simdf_load2( out, ptr ) (out) = _mm_castsi128_ps( _mm_loadl_epi64( (__m128i*)(ptr)) ) // top values can be random (not denormal or nan for perf)
 2059-  #define stbir__simdf_load2z( out, ptr ) (out) = _mm_castsi128_ps( _mm_loadl_epi64( (__m128i*)(ptr)) ) // top values must be zero
 2060-  #define stbir__simdf_load2hmerge( out, reg, ptr ) (out) = _mm_castpd_ps(_mm_loadh_pd( _mm_castps_pd(reg), (double*)(ptr) ))
 2061-
 2062-  #define stbir__simdf_zeroP() _mm_setzero_ps()
 2063-  #define stbir__simdf_zero( reg ) (reg) = _mm_setzero_ps()
 2064-
 2065-  #define stbir__simdf_store( ptr, reg )  _mm_storeu_ps( (float*)(ptr), reg )
 2066-  #define stbir__simdf_store1( ptr, reg ) _mm_store_ss( (float*)(ptr), reg )
 2067-  #define stbir__simdf_store2( ptr, reg ) _mm_storel_epi64( (__m128i*)(ptr), _mm_castps_si128(reg) )
 2068-  #define stbir__simdf_store2h( ptr, reg ) _mm_storeh_pd( (double*)(ptr), _mm_castps_pd(reg) )
 2069-
 2070-  #define stbir__simdi_store( ptr, reg )  _mm_storeu_si128( (__m128i*)(ptr), reg )
 2071-  #define stbir__simdi_store1( ptr, reg ) _mm_store_ss( (float*)(ptr), _mm_castsi128_ps(reg) )
 2072-  #define stbir__simdi_store2( ptr, reg ) _mm_storel_epi64( (__m128i*)(ptr), (reg) )
 2073-
 2074-  #define stbir__prefetch( ptr ) _mm_prefetch((char*)(ptr), _MM_HINT_T0 )
 2075-
 2076-  #define stbir__simdi_expand_u8_to_u32(out0,out1,out2,out3,ireg) \
 2077-  { \
 2078-    stbir__simdi zero = _mm_setzero_si128(); \
 2079-    out2 = _mm_unpacklo_epi8( ireg, zero ); \
 2080-    out3 = _mm_unpackhi_epi8( ireg, zero ); \
 2081-    out0 = _mm_unpacklo_epi16( out2, zero ); \
 2082-    out1 = _mm_unpackhi_epi16( out2, zero ); \
 2083-    out2 = _mm_unpacklo_epi16( out3, zero ); \
 2084-    out3 = _mm_unpackhi_epi16( out3, zero ); \
 2085-  }
 2086-
 2087-#define stbir__simdi_expand_u8_to_1u32(out,ireg) \
 2088-  { \
 2089-    stbir__simdi zero = _mm_setzero_si128(); \
 2090-    out = _mm_unpacklo_epi8( ireg, zero ); \
 2091-    out = _mm_unpacklo_epi16( out, zero ); \
 2092-  }
 2093-
 2094-  #define stbir__simdi_expand_u16_to_u32(out0,out1,ireg) \
 2095-  { \
 2096-    stbir__simdi zero = _mm_setzero_si128(); \
 2097-    out0 = _mm_unpacklo_epi16( ireg, zero ); \
 2098-    out1 = _mm_unpackhi_epi16( ireg, zero ); \
 2099-  }
 2100-
 2101-  #define stbir__simdf_convert_float_to_i32( i, f ) (i) = _mm_cvttps_epi32(f)
 2102-  #define stbir__simdf_convert_float_to_int( f ) _mm_cvtt_ss2si(f)
 2103-  #define stbir__simdf_convert_float_to_uint8( f ) ((unsigned char)_mm_cvtsi128_si32(_mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(f,STBIR__CONSTF(STBIR_max_uint8_as_float)),_mm_setzero_ps()))))
 2104-  #define stbir__simdf_convert_float_to_short( f ) ((unsigned short)_mm_cvtsi128_si32(_mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(f,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps()))))
 2105-
 2106-  #define stbir__simdi_to_int( i ) _mm_cvtsi128_si32(i)
 2107-  #define stbir__simdi_convert_i32_to_float(out, ireg) (out) = _mm_cvtepi32_ps( ireg )
 2108-  #define stbir__simdf_add( out, reg0, reg1 ) (out) = _mm_add_ps( reg0, reg1 )
 2109-  #define stbir__simdf_mult( out, reg0, reg1 ) (out) = _mm_mul_ps( reg0, reg1 )
 2110-  #define stbir__simdf_mult_mem( out, reg, ptr ) (out) = _mm_mul_ps( reg, _mm_loadu_ps( (float const*)(ptr) ) )
 2111-  #define stbir__simdf_mult1_mem( out, reg, ptr ) (out) = _mm_mul_ss( reg, _mm_load_ss( (float const*)(ptr) ) )
 2112-  #define stbir__simdf_add_mem( out, reg, ptr ) (out) = _mm_add_ps( reg, _mm_loadu_ps( (float const*)(ptr) ) )
 2113-  #define stbir__simdf_add1_mem( out, reg, ptr ) (out) = _mm_add_ss( reg, _mm_load_ss( (float const*)(ptr) ) )
 2114-
 2115-  #ifdef STBIR_USE_FMA           // not on by default to maintain bit identical simd to non-simd
 2116-  #include <immintrin.h>
 2117-  #define stbir__simdf_madd( out, add, mul1, mul2 ) (out) = _mm_fmadd_ps( mul1, mul2, add )
 2118-  #define stbir__simdf_madd1( out, add, mul1, mul2 ) (out) = _mm_fmadd_ss( mul1, mul2, add )
 2119-  #define stbir__simdf_madd_mem( out, add, mul, ptr ) (out) = _mm_fmadd_ps( mul, _mm_loadu_ps( (float const*)(ptr) ), add )
 2120-  #define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = _mm_fmadd_ss( mul, _mm_load_ss( (float const*)(ptr) ), add )
 2121-  #else
 2122-  #define stbir__simdf_madd( out, add, mul1, mul2 ) (out) = _mm_add_ps( add, _mm_mul_ps( mul1, mul2 ) )
 2123-  #define stbir__simdf_madd1( out, add, mul1, mul2 ) (out) = _mm_add_ss( add, _mm_mul_ss( mul1, mul2 ) )
 2124-  #define stbir__simdf_madd_mem( out, add, mul, ptr ) (out) = _mm_add_ps( add, _mm_mul_ps( mul, _mm_loadu_ps( (float const*)(ptr) ) ) )
 2125-  #define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = _mm_add_ss( add, _mm_mul_ss( mul, _mm_load_ss( (float const*)(ptr) ) ) )
 2126-  #endif
 2127-
 2128-  #define stbir__simdf_add1( out, reg0, reg1 ) (out) = _mm_add_ss( reg0, reg1 )
 2129-  #define stbir__simdf_mult1( out, reg0, reg1 ) (out) = _mm_mul_ss( reg0, reg1 )
 2130-
 2131-  #define stbir__simdf_and( out, reg0, reg1 ) (out) = _mm_and_ps( reg0, reg1 )
 2132-  #define stbir__simdf_or( out, reg0, reg1 ) (out) = _mm_or_ps( reg0, reg1 )
 2133-
 2134-  #define stbir__simdf_min( out, reg0, reg1 ) (out) = _mm_min_ps( reg0, reg1 )
 2135-  #define stbir__simdf_max( out, reg0, reg1 ) (out) = _mm_max_ps( reg0, reg1 )
 2136-  #define stbir__simdf_min1( out, reg0, reg1 ) (out) = _mm_min_ss( reg0, reg1 )
 2137-  #define stbir__simdf_max1( out, reg0, reg1 ) (out) = _mm_max_ss( reg0, reg1 )
 2138-
 2139-  #define stbir__simdf_0123ABCDto3ABx( out, reg0, reg1 ) (out)=_mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( _mm_shuffle_ps( reg1,reg0, (0<<0) + (1<<2) + (2<<4) + (3<<6) )), (3<<0) + (0<<2) + (1<<4) + (2<<6) ) )
 2140-  #define stbir__simdf_0123ABCDto23Ax( out, reg0, reg1 ) (out)=_mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( _mm_shuffle_ps( reg1,reg0, (0<<0) + (1<<2) + (2<<4) + (3<<6) )), (2<<0) + (3<<2) + (0<<4) + (1<<6) ) )
 2141-
 2142-  static const stbir__simdf STBIR_zeroones = { 0.0f,1.0f,0.0f,1.0f };
 2143-  static const stbir__simdf STBIR_onezeros = { 1.0f,0.0f,1.0f,0.0f };
 2144-  #define stbir__simdf_aaa1( out, alp, ones ) (out)=_mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( _mm_movehl_ps( ones, alp ) ), (1<<0) + (1<<2) + (1<<4) + (2<<6) ) )
 2145-  #define stbir__simdf_1aaa( out, alp, ones ) (out)=_mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( _mm_movelh_ps( ones, alp ) ), (0<<0) + (2<<2) + (2<<4) + (2<<6) ) )
 2146-  #define stbir__simdf_a1a1( out, alp, ones) (out) = _mm_or_ps( _mm_castsi128_ps( _mm_srli_epi64( _mm_castps_si128(alp), 32 ) ), STBIR_zeroones )
 2147-  #define stbir__simdf_1a1a( out, alp, ones) (out) = _mm_or_ps( _mm_castsi128_ps( _mm_slli_epi64( _mm_castps_si128(alp), 32 ) ), STBIR_onezeros )
 2148-
 2149-  #define stbir__simdf_swiz( reg, one, two, three, four ) _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( reg ), (one<<0) + (two<<2) + (three<<4) + (four<<6) ) )
 2150-
 2151-  #define stbir__simdi_and( out, reg0, reg1 ) (out) = _mm_and_si128( reg0, reg1 )
 2152-  #define stbir__simdi_or( out, reg0, reg1 ) (out) = _mm_or_si128( reg0, reg1 )
 2153-  #define stbir__simdi_16madd( out, reg0, reg1 ) (out) = _mm_madd_epi16( reg0, reg1 )
 2154-
 2155-  #define stbir__simdf_pack_to_8bytes(out,aa,bb) \
 2156-  { \
 2157-    stbir__simdf af,bf; \
 2158-    stbir__simdi a,b; \
 2159-    af = _mm_min_ps( aa, STBIR_max_uint8_as_float ); \
 2160-    bf = _mm_min_ps( bb, STBIR_max_uint8_as_float ); \
 2161-    af = _mm_max_ps( af, _mm_setzero_ps() ); \
 2162-    bf = _mm_max_ps( bf, _mm_setzero_ps() ); \
 2163-    a = _mm_cvttps_epi32( af ); \
 2164-    b = _mm_cvttps_epi32( bf ); \
 2165-    a = _mm_packs_epi32( a, b ); \
 2166-    out = _mm_packus_epi16( a, a ); \
 2167-  }
 2168-
 2169-  #define stbir__simdf_load4_transposed( o0, o1, o2, o3, ptr ) \
 2170-      stbir__simdf_load( o0, (ptr) );    \
 2171-      stbir__simdf_load( o1, (ptr)+4 );  \
 2172-      stbir__simdf_load( o2, (ptr)+8 );  \
 2173-      stbir__simdf_load( o3, (ptr)+12 ); \
 2174-      {                                  \
 2175-        __m128 tmp0, tmp1, tmp2, tmp3;   \
 2176-        tmp0 = _mm_unpacklo_ps(o0, o1);  \
 2177-        tmp2 = _mm_unpacklo_ps(o2, o3);  \
 2178-        tmp1 = _mm_unpackhi_ps(o0, o1);  \
 2179-        tmp3 = _mm_unpackhi_ps(o2, o3);  \
 2180-        o0 = _mm_movelh_ps(tmp0, tmp2);  \
 2181-        o1 = _mm_movehl_ps(tmp2, tmp0);  \
 2182-        o2 = _mm_movelh_ps(tmp1, tmp3);  \
 2183-        o3 = _mm_movehl_ps(tmp3, tmp1);  \
 2184-      }
 2185-
 2186-  #define stbir__interleave_pack_and_store_16_u8( ptr, r0, r1, r2, r3 ) \
 2187-      r0 = _mm_packs_epi32( r0, r1 ); \
 2188-      r2 = _mm_packs_epi32( r2, r3 ); \
 2189-      r1 = _mm_unpacklo_epi16( r0, r2 ); \
 2190-      r3 = _mm_unpackhi_epi16( r0, r2 ); \
 2191-      r0 = _mm_unpacklo_epi16( r1, r3 ); \
 2192-      r2 = _mm_unpackhi_epi16( r1, r3 ); \
 2193-      r0 = _mm_packus_epi16( r0, r2 ); \
 2194-      stbir__simdi_store( ptr, r0 ); \
 2195-
 2196-  #define stbir__simdi_32shr( out, reg, imm ) out = _mm_srli_epi32( reg, imm )
 2197-
 2198-  #if defined(_MSC_VER) && !defined(__clang__)
 2199-    // msvc inits with 8 bytes
 2200-    #define STBIR__CONST_32_TO_8( v ) (char)(unsigned char)((v)&255),(char)(unsigned char)(((v)>>8)&255),(char)(unsigned char)(((v)>>16)&255),(char)(unsigned char)(((v)>>24)&255)
 2201-    #define STBIR__CONST_4_32i( v ) STBIR__CONST_32_TO_8( v ), STBIR__CONST_32_TO_8( v ), STBIR__CONST_32_TO_8( v ), STBIR__CONST_32_TO_8( v )
 2202-    #define STBIR__CONST_4d_32i( v0, v1, v2, v3 ) STBIR__CONST_32_TO_8( v0 ), STBIR__CONST_32_TO_8( v1 ), STBIR__CONST_32_TO_8( v2 ), STBIR__CONST_32_TO_8( v3 )
 2203-  #else
 2204-    // everything else inits with long long's
 2205-    #define STBIR__CONST_4_32i( v ) (long long)((((stbir_uint64)(stbir_uint32)(v))<<32)|((stbir_uint64)(stbir_uint32)(v))),(long long)((((stbir_uint64)(stbir_uint32)(v))<<32)|((stbir_uint64)(stbir_uint32)(v)))
 2206-    #define STBIR__CONST_4d_32i( v0, v1, v2, v3 ) (long long)((((stbir_uint64)(stbir_uint32)(v1))<<32)|((stbir_uint64)(stbir_uint32)(v0))),(long long)((((stbir_uint64)(stbir_uint32)(v3))<<32)|((stbir_uint64)(stbir_uint32)(v2)))
 2207-  #endif
 2208-
 2209-  #define STBIR__SIMDF_CONST(var, x) stbir__simdf var = { x, x, x, x }
 2210-  #define STBIR__SIMDI_CONST(var, x) stbir__simdi var = { STBIR__CONST_4_32i(x) }
 2211-  #define STBIR__CONSTF(var) (var)
 2212-  #define STBIR__CONSTI(var) (var)
 2213-
 2214-  #if defined(STBIR_AVX) || defined(__SSE4_1__)
 2215-    #include <smmintrin.h>
 2216-    #define stbir__simdf_pack_to_8words(out,reg0,reg1) out = _mm_packus_epi32(_mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg0,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())), _mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg1,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())))
 2217-  #else
 2218-    static STBIR__SIMDI_CONST(stbir__s32_32768, 32768);
 2219-    static STBIR__SIMDI_CONST(stbir__s16_32768, ((32768<<16)|32768));
 2220-
 2221-    #define stbir__simdf_pack_to_8words(out,reg0,reg1) \
 2222-      { \
 2223-        stbir__simdi tmp0,tmp1; \
 2224-        tmp0 = _mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg0,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())); \
 2225-        tmp1 = _mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg1,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())); \
 2226-        tmp0 = _mm_sub_epi32( tmp0, stbir__s32_32768 ); \
 2227-        tmp1 = _mm_sub_epi32( tmp1, stbir__s32_32768 ); \
 2228-        out = _mm_packs_epi32( tmp0, tmp1 ); \
 2229-        out = _mm_sub_epi16( out, stbir__s16_32768 ); \
 2230-      }
 2231-
 2232-  #endif
 2233-
 2234-  #define STBIR_SIMD
 2235-
 2236-  // if we detect AVX, set the simd8 defines
 2237-  #ifdef STBIR_AVX
 2238-    #include <immintrin.h>
 2239-    #define STBIR_SIMD8
 2240-    #define stbir__simdf8 __m256
 2241-    #define stbir__simdi8 __m256i
 2242-    #define stbir__simdf8_load( out, ptr ) (out) = _mm256_loadu_ps( (float const *)(ptr) )
 2243-    #define stbir__simdi8_load( out, ptr ) (out) = _mm256_loadu_si256( (__m256i const *)(ptr) )
 2244-    #define stbir__simdf8_mult( out, a, b ) (out) = _mm256_mul_ps( (a), (b) )
 2245-    #define stbir__simdf8_store( ptr, out ) _mm256_storeu_ps( (float*)(ptr), out )
 2246-    #define stbir__simdi8_store( ptr, reg )  _mm256_storeu_si256( (__m256i*)(ptr), reg )
 2247-    #define stbir__simdf8_frep8( fval ) _mm256_set1_ps( fval )
 2248-
 2249-    #define stbir__simdf8_min( out, reg0, reg1 ) (out) = _mm256_min_ps( reg0, reg1 )
 2250-    #define stbir__simdf8_max( out, reg0, reg1 ) (out) = _mm256_max_ps( reg0, reg1 )
 2251-
 2252-    #define stbir__simdf8_add4halves( out, bot4, top8 ) (out) = _mm_add_ps( bot4, _mm256_extractf128_ps( top8, 1 ) )
 2253-    #define stbir__simdf8_mult_mem( out, reg, ptr ) (out) = _mm256_mul_ps( reg, _mm256_loadu_ps( (float const*)(ptr) ) )
 2254-    #define stbir__simdf8_add_mem( out, reg, ptr ) (out) = _mm256_add_ps( reg, _mm256_loadu_ps( (float const*)(ptr) ) )
 2255-    #define stbir__simdf8_add( out, a, b ) (out) = _mm256_add_ps( a, b )
 2256-    #define stbir__simdf8_load1b( out, ptr ) (out) = _mm256_broadcast_ss( ptr )
 2257-    #define stbir__simdf_load1rep4( out, ptr ) (out) = _mm_broadcast_ss( ptr )  // avx load instruction
 2258-
 2259-    #define stbir__simdi8_convert_i32_to_float(out, ireg) (out) = _mm256_cvtepi32_ps( ireg )
 2260-    #define stbir__simdf8_convert_float_to_i32( i, f ) (i) = _mm256_cvttps_epi32(f)
 2261-
 2262-    #define stbir__simdf8_bot4s( out, a, b ) (out) = _mm256_permute2f128_ps(a,b, (0<<0)+(2<<4) )
 2263-    #define stbir__simdf8_top4s( out, a, b ) (out) = _mm256_permute2f128_ps(a,b, (1<<0)+(3<<4) )
 2264-
 2265-    #define stbir__simdf8_gettop4( reg ) _mm256_extractf128_ps(reg,1)
 2266-
 2267-    #ifdef STBIR_AVX2
 2268-
 2269-    #define stbir__simdi8_expand_u8_to_u32(out0,out1,ireg) \
 2270-    { \
 2271-      stbir__simdi8 a, zero  =_mm256_setzero_si256();\
 2272-      a = _mm256_permute4x64_epi64( _mm256_unpacklo_epi8( _mm256_permute4x64_epi64(_mm256_castsi128_si256(ireg),(0<<0)+(2<<2)+(1<<4)+(3<<6)), zero ),(0<<0)+(2<<2)+(1<<4)+(3<<6)); \
 2273-      out0 = _mm256_unpacklo_epi16( a, zero ); \
 2274-      out1 = _mm256_unpackhi_epi16( a, zero ); \
 2275-    }
 2276-
 2277-    #define stbir__simdf8_pack_to_16bytes(out,aa,bb) \
 2278-    { \
 2279-      stbir__simdi8 t; \
 2280-      stbir__simdf8 af,bf; \
 2281-      stbir__simdi8 a,b; \
 2282-      af = _mm256_min_ps( aa, STBIR_max_uint8_as_floatX ); \
 2283-      bf = _mm256_min_ps( bb, STBIR_max_uint8_as_floatX ); \
 2284-      af = _mm256_max_ps( af, _mm256_setzero_ps() ); \
 2285-      bf = _mm256_max_ps( bf, _mm256_setzero_ps() ); \
 2286-      a = _mm256_cvttps_epi32( af ); \
 2287-      b = _mm256_cvttps_epi32( bf ); \
 2288-      t = _mm256_permute4x64_epi64( _mm256_packs_epi32( a, b ), (0<<0)+(2<<2)+(1<<4)+(3<<6) ); \
 2289-      out = _mm256_castsi256_si128( _mm256_permute4x64_epi64( _mm256_packus_epi16( t, t ), (0<<0)+(2<<2)+(1<<4)+(3<<6) ) ); \
 2290-    }
 2291-
 2292-    #define stbir__simdi8_expand_u16_to_u32(out,ireg) out = _mm256_unpacklo_epi16( _mm256_permute4x64_epi64(_mm256_castsi128_si256(ireg),(0<<0)+(2<<2)+(1<<4)+(3<<6)), _mm256_setzero_si256() );
 2293-
 2294-    #define stbir__simdf8_pack_to_16words(out,aa,bb) \
 2295-      { \
 2296-        stbir__simdf8 af,bf; \
 2297-        stbir__simdi8 a,b; \
 2298-        af = _mm256_min_ps( aa, STBIR_max_uint16_as_floatX ); \
 2299-        bf = _mm256_min_ps( bb, STBIR_max_uint16_as_floatX ); \
 2300-        af = _mm256_max_ps( af, _mm256_setzero_ps() ); \
 2301-        bf = _mm256_max_ps( bf, _mm256_setzero_ps() ); \
 2302-        a = _mm256_cvttps_epi32( af ); \
 2303-        b = _mm256_cvttps_epi32( bf ); \
 2304-        (out) = _mm256_permute4x64_epi64( _mm256_packus_epi32(a, b), (0<<0)+(2<<2)+(1<<4)+(3<<6) ); \
 2305-      }
 2306-
 2307-    #else
 2308-
 2309-    #define stbir__simdi8_expand_u8_to_u32(out0,out1,ireg) \
 2310-    { \
 2311-      stbir__simdi a,zero = _mm_setzero_si128(); \
 2312-      a = _mm_unpacklo_epi8( ireg, zero ); \
 2313-      out0 = _mm256_setr_m128i( _mm_unpacklo_epi16( a, zero ), _mm_unpackhi_epi16( a, zero ) ); \
 2314-      a = _mm_unpackhi_epi8( ireg, zero ); \
 2315-      out1 = _mm256_setr_m128i( _mm_unpacklo_epi16( a, zero ), _mm_unpackhi_epi16( a, zero ) ); \
 2316-    }
 2317-
 2318-    #define stbir__simdf8_pack_to_16bytes(out,aa,bb) \
 2319-    { \
 2320-      stbir__simdi t; \
 2321-      stbir__simdf8 af,bf; \
 2322-      stbir__simdi8 a,b; \
 2323-      af = _mm256_min_ps( aa, STBIR_max_uint8_as_floatX ); \
 2324-      bf = _mm256_min_ps( bb, STBIR_max_uint8_as_floatX ); \
 2325-      af = _mm256_max_ps( af, _mm256_setzero_ps() ); \
 2326-      bf = _mm256_max_ps( bf, _mm256_setzero_ps() ); \
 2327-      a = _mm256_cvttps_epi32( af ); \
 2328-      b = _mm256_cvttps_epi32( bf ); \
 2329-      out = _mm_packs_epi32( _mm256_castsi256_si128(a), _mm256_extractf128_si256( a, 1 ) ); \
 2330-      out = _mm_packus_epi16( out, out ); \
 2331-      t = _mm_packs_epi32( _mm256_castsi256_si128(b), _mm256_extractf128_si256( b, 1 ) ); \
 2332-      t = _mm_packus_epi16( t, t ); \
 2333-      out = _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps(out), _mm_castsi128_ps(t), (0<<0)+(1<<2)+(0<<4)+(1<<6) ) ); \
 2334-    }
 2335-
 2336-    #define stbir__simdi8_expand_u16_to_u32(out,ireg) \
 2337-    { \
 2338-      stbir__simdi a,b,zero = _mm_setzero_si128(); \
 2339-      a = _mm_unpacklo_epi16( ireg, zero ); \
 2340-      b = _mm_unpackhi_epi16( ireg, zero ); \
 2341-      out = _mm256_insertf128_si256( _mm256_castsi128_si256( a ), b, 1 ); \
 2342-    }
 2343-
 2344-    #define stbir__simdf8_pack_to_16words(out,aa,bb) \
 2345-      { \
 2346-        stbir__simdi t0,t1; \
 2347-        stbir__simdf8 af,bf; \
 2348-        stbir__simdi8 a,b; \
 2349-        af = _mm256_min_ps( aa, STBIR_max_uint16_as_floatX ); \
 2350-        bf = _mm256_min_ps( bb, STBIR_max_uint16_as_floatX ); \
 2351-        af = _mm256_max_ps( af, _mm256_setzero_ps() ); \
 2352-        bf = _mm256_max_ps( bf, _mm256_setzero_ps() ); \
 2353-        a = _mm256_cvttps_epi32( af ); \
 2354-        b = _mm256_cvttps_epi32( bf ); \
 2355-        t0 = _mm_packus_epi32( _mm256_castsi256_si128(a), _mm256_extractf128_si256( a, 1 ) ); \
 2356-        t1 = _mm_packus_epi32( _mm256_castsi256_si128(b), _mm256_extractf128_si256( b, 1 ) ); \
 2357-        out = _mm256_setr_m128i( t0, t1 ); \
 2358-      }
 2359-
 2360-    #endif
 2361-
 2362-    static __m256i stbir_00001111 = { STBIR__CONST_4d_32i( 0, 0, 0, 0 ), STBIR__CONST_4d_32i( 1, 1, 1, 1 ) };
 2363-    #define stbir__simdf8_0123to00001111( out, in ) (out) = _mm256_permutevar_ps ( in, stbir_00001111 )
 2364-
 2365-    static __m256i stbir_22223333 = { STBIR__CONST_4d_32i( 2, 2, 2, 2 ), STBIR__CONST_4d_32i( 3, 3, 3, 3 ) };
 2366-    #define stbir__simdf8_0123to22223333( out, in ) (out) = _mm256_permutevar_ps ( in, stbir_22223333 )
 2367-
 2368-    #define stbir__simdf8_0123to2222( out, in ) (out) = stbir__simdf_swiz(_mm256_castps256_ps128(in), 2,2,2,2 )
 2369-
 2370-    #define stbir__simdf8_load4b( out, ptr ) (out) = _mm256_broadcast_ps( (__m128 const *)(ptr) )
 2371-
 2372-    static __m256i stbir_00112233 = { STBIR__CONST_4d_32i( 0, 0, 1, 1 ), STBIR__CONST_4d_32i( 2, 2, 3, 3 ) };
 2373-    #define stbir__simdf8_0123to00112233( out, in ) (out) = _mm256_permutevar_ps ( in, stbir_00112233 )
 2374-    #define stbir__simdf8_add4( out, a8, b ) (out) = _mm256_add_ps( a8,  _mm256_castps128_ps256( b ) )
 2375-
 2376-    static __m256i stbir_load6 = { STBIR__CONST_4_32i( 0x80000000 ), STBIR__CONST_4d_32i(  0x80000000,  0x80000000, 0, 0 ) };
 2377-    #define stbir__simdf8_load6z( out, ptr ) (out) = _mm256_maskload_ps( ptr, stbir_load6 )
 2378-
 2379-    #define stbir__simdf8_0123to00000000( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (0<<0)+(0<<2)+(0<<4)+(0<<6) )
 2380-    #define stbir__simdf8_0123to11111111( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (1<<0)+(1<<2)+(1<<4)+(1<<6) )
 2381-    #define stbir__simdf8_0123to22222222( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (2<<0)+(2<<2)+(2<<4)+(2<<6) )
 2382-    #define stbir__simdf8_0123to33333333( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (3<<0)+(3<<2)+(3<<4)+(3<<6) )
 2383-    #define stbir__simdf8_0123to21032103( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (2<<0)+(1<<2)+(0<<4)+(3<<6) )
 2384-    #define stbir__simdf8_0123to32103210( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (3<<0)+(2<<2)+(1<<4)+(0<<6) )
 2385-    #define stbir__simdf8_0123to12301230( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (1<<0)+(2<<2)+(3<<4)+(0<<6) )
 2386-    #define stbir__simdf8_0123to10321032( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (1<<0)+(0<<2)+(3<<4)+(2<<6) )
 2387-    #define stbir__simdf8_0123to30123012( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (3<<0)+(0<<2)+(1<<4)+(2<<6) )
 2388-
 2389-    #define stbir__simdf8_0123to11331133( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (1<<0)+(1<<2)+(3<<4)+(3<<6) )
 2390-    #define stbir__simdf8_0123to00220022( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (0<<0)+(0<<2)+(2<<4)+(2<<6) )
 2391-
 2392-    #define stbir__simdf8_aaa1( out, alp, ones ) (out) = _mm256_blend_ps( alp, ones, (1<<0)+(1<<1)+(1<<2)+(0<<3)+(1<<4)+(1<<5)+(1<<6)+(0<<7)); (out)=_mm256_shuffle_ps( out,out, (3<<0) + (3<<2) + (3<<4) + (0<<6) )
 2393-    #define stbir__simdf8_1aaa( out, alp, ones ) (out) = _mm256_blend_ps( alp, ones, (0<<0)+(1<<1)+(1<<2)+(1<<3)+(0<<4)+(1<<5)+(1<<6)+(1<<7)); (out)=_mm256_shuffle_ps( out,out, (1<<0) + (0<<2) + (0<<4) + (0<<6) )
 2394-    #define stbir__simdf8_a1a1( out, alp, ones) (out) = _mm256_blend_ps( alp, ones, (1<<0)+(0<<1)+(1<<2)+(0<<3)+(1<<4)+(0<<5)+(1<<6)+(0<<7)); (out)=_mm256_shuffle_ps( out,out, (1<<0) + (0<<2) + (3<<4) + (2<<6) )
 2395-    #define stbir__simdf8_1a1a( out, alp, ones) (out) = _mm256_blend_ps( alp, ones, (0<<0)+(1<<1)+(0<<2)+(1<<3)+(0<<4)+(1<<5)+(0<<6)+(1<<7)); (out)=_mm256_shuffle_ps( out,out, (1<<0) + (0<<2) + (3<<4) + (2<<6) )
 2396-
 2397-    #define stbir__simdf8_zero( reg ) (reg) = _mm256_setzero_ps()
 2398-
 2399-    #ifdef STBIR_USE_FMA           // not on by default to maintain bit identical simd to non-simd
 2400-    #define stbir__simdf8_madd( out, add, mul1, mul2 ) (out) = _mm256_fmadd_ps( mul1, mul2, add )
 2401-    #define stbir__simdf8_madd_mem( out, add, mul, ptr ) (out) = _mm256_fmadd_ps( mul, _mm256_loadu_ps( (float const*)(ptr) ), add )
 2402-    #define stbir__simdf8_madd_mem4( out, add, mul, ptr )(out) = _mm256_fmadd_ps( _mm256_setr_m128( mul, _mm_setzero_ps() ), _mm256_setr_m128( _mm_loadu_ps( (float const*)(ptr) ), _mm_setzero_ps() ), add )
 2403-    #else
 2404-    #define stbir__simdf8_madd( out, add, mul1, mul2 ) (out) = _mm256_add_ps( add, _mm256_mul_ps( mul1, mul2 ) )
 2405-    #define stbir__simdf8_madd_mem( out, add, mul, ptr ) (out) = _mm256_add_ps( add, _mm256_mul_ps( mul, _mm256_loadu_ps( (float const*)(ptr) ) ) )
 2406-    #define stbir__simdf8_madd_mem4( out, add, mul, ptr )  (out) = _mm256_add_ps( add, _mm256_setr_m128( _mm_mul_ps( mul, _mm_loadu_ps( (float const*)(ptr) ) ), _mm_setzero_ps() ) )
 2407-    #endif
 2408-    #define stbir__if_simdf8_cast_to_simdf4( val ) _mm256_castps256_ps128( val )
 2409-
 2410-  #endif
 2411-
 2412-  #ifdef STBIR_FLOORF
 2413-  #undef STBIR_FLOORF
 2414-  #endif
 2415-  #define STBIR_FLOORF stbir_simd_floorf
 2416-  static stbir__inline float stbir_simd_floorf(float x)  // martins floorf
 2417-  {
 2418-    #if defined(STBIR_AVX) || defined(__SSE4_1__) || defined(STBIR_SSE41)
 2419-    __m128 t = _mm_set_ss(x);
 2420-    return _mm_cvtss_f32( _mm_floor_ss(t, t) );
 2421-    #else
 2422-    __m128 f = _mm_set_ss(x);
 2423-    __m128 t = _mm_cvtepi32_ps(_mm_cvttps_epi32(f));
 2424-    __m128 r = _mm_add_ss(t, _mm_and_ps(_mm_cmplt_ss(f, t), _mm_set_ss(-1.0f)));
 2425-    return _mm_cvtss_f32(r);
 2426-    #endif
 2427-  }
 2428-
 2429-  #ifdef STBIR_CEILF
 2430-  #undef STBIR_CEILF
 2431-  #endif
 2432-  #define STBIR_CEILF stbir_simd_ceilf
 2433-  static stbir__inline float stbir_simd_ceilf(float x)  // martins ceilf
 2434-  {
 2435-    #if defined(STBIR_AVX) || defined(__SSE4_1__) || defined(STBIR_SSE41)
 2436-    __m128 t = _mm_set_ss(x);
 2437-    return _mm_cvtss_f32( _mm_ceil_ss(t, t) );
 2438-    #else
 2439-    __m128 f = _mm_set_ss(x);
 2440-    __m128 t = _mm_cvtepi32_ps(_mm_cvttps_epi32(f));
 2441-    __m128 r = _mm_add_ss(t, _mm_and_ps(_mm_cmplt_ss(t, f), _mm_set_ss(1.0f)));
 2442-    return _mm_cvtss_f32(r);
 2443-    #endif
 2444-  }
 2445+#include <emmintrin.h>
 2446+
 2447+#define stbir__simdf __m128
 2448+#define stbir__simdi __m128i
 2449+
 2450+#define stbir_simdi_castf(reg) _mm_castps_si128(reg)
 2451+#define stbir_simdf_casti(reg) _mm_castsi128_ps(reg)
 2452+
 2453+#define stbir__simdf_load(reg, ptr) (reg) = _mm_loadu_ps((float const *)(ptr))
 2454+#define stbir__simdi_load(reg, ptr)                                            \
 2455+	(reg) = _mm_loadu_si128((stbir__simdi const *)(ptr))
 2456+#define stbir__simdf_load1(out, ptr)                                           \
 2457+	(out) = _mm_load_ss((float const *)(ptr)) // top values can be random (not
 2458+	                                          // denormal or nan for perf)
 2459+#define stbir__simdi_load1(out, ptr)                                           \
 2460+	(out) = _mm_castps_si128(_mm_load_ss((float const *)(ptr)))
 2461+#define stbir__simdf_load1z(out, ptr)                                          \
 2462+	(out) = _mm_load_ss((float const *)(ptr)) // top values must be zero
 2463+#define stbir__simdf_frep4(fvar) _mm_set_ps1(fvar)
 2464+#define stbir__simdf_load1frep4(out, fvar) (out) = _mm_set_ps1(fvar)
 2465+#define stbir__simdf_load2(out, ptr)                                           \
 2466+	(out) = _mm_castsi128_ps(                                                  \
 2467+	    _mm_loadl_epi64((__m128i *)(ptr))) // top values can be random (not
 2468+	                                       // denormal or nan for perf)
 2469+#define stbir__simdf_load2z(out, ptr)                                          \
 2470+	(out) = _mm_castsi128_ps(                                                  \
 2471+	    _mm_loadl_epi64((__m128i *)(ptr))) // top values must be zero
 2472+#define stbir__simdf_load2hmerge(out, reg, ptr)                                \
 2473+	(out) = _mm_castpd_ps(_mm_loadh_pd(_mm_castps_pd(reg), (double *)(ptr)))
 2474+
 2475+#define stbir__simdf_zeroP() _mm_setzero_ps()
 2476+#define stbir__simdf_zero(reg) (reg) = _mm_setzero_ps()
 2477+
 2478+#define stbir__simdf_store(ptr, reg) _mm_storeu_ps((float *)(ptr), reg)
 2479+#define stbir__simdf_store1(ptr, reg) _mm_store_ss((float *)(ptr), reg)
 2480+#define stbir__simdf_store2(ptr, reg)                                          \
 2481+	_mm_storel_epi64((__m128i *)(ptr), _mm_castps_si128(reg))
 2482+#define stbir__simdf_store2h(ptr, reg)                                         \
 2483+	_mm_storeh_pd((double *)(ptr), _mm_castps_pd(reg))
 2484+
 2485+#define stbir__simdi_store(ptr, reg) _mm_storeu_si128((__m128i *)(ptr), reg)
 2486+#define stbir__simdi_store1(ptr, reg)                                          \
 2487+	_mm_store_ss((float *)(ptr), _mm_castsi128_ps(reg))
 2488+#define stbir__simdi_store2(ptr, reg) _mm_storel_epi64((__m128i *)(ptr), (reg))
 2489+
 2490+#define stbir__prefetch(ptr) _mm_prefetch((char *)(ptr), _MM_HINT_T0)
 2491+
 2492+#define stbir__simdi_expand_u8_to_u32(out0, out1, out2, out3, ireg)            \
 2493+	{                                                                          \
 2494+		stbir__simdi zero = _mm_setzero_si128();                               \
 2495+		out2 = _mm_unpacklo_epi8(ireg, zero);                                  \
 2496+		out3 = _mm_unpackhi_epi8(ireg, zero);                                  \
 2497+		out0 = _mm_unpacklo_epi16(out2, zero);                                 \
 2498+		out1 = _mm_unpackhi_epi16(out2, zero);                                 \
 2499+		out2 = _mm_unpacklo_epi16(out3, zero);                                 \
 2500+		out3 = _mm_unpackhi_epi16(out3, zero);                                 \
 2501+	}
 2502+
 2503+#define stbir__simdi_expand_u8_to_1u32(out, ireg)                              \
 2504+	{                                                                          \
 2505+		stbir__simdi zero = _mm_setzero_si128();                               \
 2506+		out = _mm_unpacklo_epi8(ireg, zero);                                   \
 2507+		out = _mm_unpacklo_epi16(out, zero);                                   \
 2508+	}
 2509+
 2510+#define stbir__simdi_expand_u16_to_u32(out0, out1, ireg)                       \
 2511+	{                                                                          \
 2512+		stbir__simdi zero = _mm_setzero_si128();                               \
 2513+		out0 = _mm_unpacklo_epi16(ireg, zero);                                 \
 2514+		out1 = _mm_unpackhi_epi16(ireg, zero);                                 \
 2515+	}
 2516+
 2517+#define stbir__simdf_convert_float_to_i32(i, f) (i) = _mm_cvttps_epi32(f)
 2518+#define stbir__simdf_convert_float_to_int(f) _mm_cvtt_ss2si(f)
 2519+#define stbir__simdf_convert_float_to_uint8(f)                                 \
 2520+	((unsigned char)_mm_cvtsi128_si32(_mm_cvttps_epi32(                        \
 2521+	    _mm_max_ps(_mm_min_ps(f, STBIR__CONSTF(STBIR_max_uint8_as_float)),     \
 2522+	               _mm_setzero_ps()))))
 2523+#define stbir__simdf_convert_float_to_short(f)                                 \
 2524+	((unsigned short)_mm_cvtsi128_si32(_mm_cvttps_epi32(                       \
 2525+	    _mm_max_ps(_mm_min_ps(f, STBIR__CONSTF(STBIR_max_uint16_as_float)),    \
 2526+	               _mm_setzero_ps()))))
 2527+
 2528+#define stbir__simdi_to_int(i) _mm_cvtsi128_si32(i)
 2529+#define stbir__simdi_convert_i32_to_float(out, ireg)                           \
 2530+	(out) = _mm_cvtepi32_ps(ireg)
 2531+#define stbir__simdf_add(out, reg0, reg1) (out) = _mm_add_ps(reg0, reg1)
 2532+#define stbir__simdf_mult(out, reg0, reg1) (out) = _mm_mul_ps(reg0, reg1)
 2533+#define stbir__simdf_mult_mem(out, reg, ptr)                                   \
 2534+	(out) = _mm_mul_ps(reg, _mm_loadu_ps((float const *)(ptr)))
 2535+#define stbir__simdf_mult1_mem(out, reg, ptr)                                  \
 2536+	(out) = _mm_mul_ss(reg, _mm_load_ss((float const *)(ptr)))
 2537+#define stbir__simdf_add_mem(out, reg, ptr)                                    \
 2538+	(out) = _mm_add_ps(reg, _mm_loadu_ps((float const *)(ptr)))
 2539+#define stbir__simdf_add1_mem(out, reg, ptr)                                   \
 2540+	(out) = _mm_add_ss(reg, _mm_load_ss((float const *)(ptr)))
 2541+
 2542+#ifdef STBIR_USE_FMA // not on by default to maintain bit identical simd to
 2543+                     // non-simd
 2544+#include <immintrin.h>
 2545+#define stbir__simdf_madd(out, add, mul1, mul2)                                \
 2546+	(out) = _mm_fmadd_ps(mul1, mul2, add)
 2547+#define stbir__simdf_madd1(out, add, mul1, mul2)                               \
 2548+	(out) = _mm_fmadd_ss(mul1, mul2, add)
 2549+#define stbir__simdf_madd_mem(out, add, mul, ptr)                              \
 2550+	(out) = _mm_fmadd_ps(mul, _mm_loadu_ps((float const *)(ptr)), add)
 2551+#define stbir__simdf_madd1_mem(out, add, mul, ptr)                             \
 2552+	(out) = _mm_fmadd_ss(mul, _mm_load_ss((float const *)(ptr)), add)
 2553+#else
 2554+#define stbir__simdf_madd(out, add, mul1, mul2)                                \
 2555+	(out) = _mm_add_ps(add, _mm_mul_ps(mul1, mul2))
 2556+#define stbir__simdf_madd1(out, add, mul1, mul2)                               \
 2557+	(out) = _mm_add_ss(add, _mm_mul_ss(mul1, mul2))
 2558+#define stbir__simdf_madd_mem(out, add, mul, ptr)                              \
 2559+	(out) = _mm_add_ps(add, _mm_mul_ps(mul, _mm_loadu_ps((float const *)(ptr))))
 2560+#define stbir__simdf_madd1_mem(out, add, mul, ptr)                             \
 2561+	(out) = _mm_add_ss(add, _mm_mul_ss(mul, _mm_load_ss((float const *)(ptr))))
 2562+#endif
 2563+
 2564+#define stbir__simdf_add1(out, reg0, reg1) (out) = _mm_add_ss(reg0, reg1)
 2565+#define stbir__simdf_mult1(out, reg0, reg1) (out) = _mm_mul_ss(reg0, reg1)
 2566+
 2567+#define stbir__simdf_and(out, reg0, reg1) (out) = _mm_and_ps(reg0, reg1)
 2568+#define stbir__simdf_or(out, reg0, reg1) (out) = _mm_or_ps(reg0, reg1)
 2569+
 2570+#define stbir__simdf_min(out, reg0, reg1) (out) = _mm_min_ps(reg0, reg1)
 2571+#define stbir__simdf_max(out, reg0, reg1) (out) = _mm_max_ps(reg0, reg1)
 2572+#define stbir__simdf_min1(out, reg0, reg1) (out) = _mm_min_ss(reg0, reg1)
 2573+#define stbir__simdf_max1(out, reg0, reg1) (out) = _mm_max_ss(reg0, reg1)
 2574+
 2575+#define stbir__simdf_0123ABCDto3ABx(out, reg0, reg1)                           \
 2576+	(out) = _mm_castsi128_ps(_mm_shuffle_epi32(                                \
 2577+	    _mm_castps_si128(_mm_shuffle_ps(                                       \
 2578+	        reg1, reg0, (0 << 0) + (1 << 2) + (2 << 4) + (3 << 6))),           \
 2579+	    (3 << 0) + (0 << 2) + (1 << 4) + (2 << 6)))
 2580+#define stbir__simdf_0123ABCDto23Ax(out, reg0, reg1)                           \
 2581+	(out) = _mm_castsi128_ps(_mm_shuffle_epi32(                                \
 2582+	    _mm_castps_si128(_mm_shuffle_ps(                                       \
 2583+	        reg1, reg0, (0 << 0) + (1 << 2) + (2 << 4) + (3 << 6))),           \
 2584+	    (2 << 0) + (3 << 2) + (0 << 4) + (1 << 6)))
 2585+
 2586+static const stbir__simdf STBIR_zeroones = {0.0f, 1.0f, 0.0f, 1.0f};
 2587+static const stbir__simdf STBIR_onezeros = {1.0f, 0.0f, 1.0f, 0.0f};
 2588+#define stbir__simdf_aaa1(out, alp, ones)                                      \
 2589+	(out) = _mm_castsi128_ps(                                                  \
 2590+	    _mm_shuffle_epi32(_mm_castps_si128(_mm_movehl_ps(ones, alp)),          \
 2591+	                      (1 << 0) + (1 << 2) + (1 << 4) + (2 << 6)))
 2592+#define stbir__simdf_1aaa(out, alp, ones)                                      \
 2593+	(out) = _mm_castsi128_ps(                                                  \
 2594+	    _mm_shuffle_epi32(_mm_castps_si128(_mm_movelh_ps(ones, alp)),          \
 2595+	                      (0 << 0) + (2 << 2) + (2 << 4) + (2 << 6)))
 2596+#define stbir__simdf_a1a1(out, alp, ones)                                      \
 2597+	(out) =                                                                    \
 2598+	    _mm_or_ps(_mm_castsi128_ps(_mm_srli_epi64(_mm_castps_si128(alp), 32)), \
 2599+	              STBIR_zeroones)
 2600+#define stbir__simdf_1a1a(out, alp, ones)                                      \
 2601+	(out) =                                                                    \
 2602+	    _mm_or_ps(_mm_castsi128_ps(_mm_slli_epi64(_mm_castps_si128(alp), 32)), \
 2603+	              STBIR_onezeros)
 2604+
 2605+#define stbir__simdf_swiz(reg, one, two, three, four)                          \
 2606+	_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(reg),                  \
 2607+	                                   (one << 0) + (two << 2) +               \
 2608+	                                       (three << 4) + (four << 6)))
 2609+
 2610+#define stbir__simdi_and(out, reg0, reg1) (out) = _mm_and_si128(reg0, reg1)
 2611+#define stbir__simdi_or(out, reg0, reg1) (out) = _mm_or_si128(reg0, reg1)
 2612+#define stbir__simdi_16madd(out, reg0, reg1) (out) = _mm_madd_epi16(reg0, reg1)
 2613+
 2614+#define stbir__simdf_pack_to_8bytes(out, aa, bb)                               \
 2615+	{                                                                          \
 2616+		stbir__simdf af, bf;                                                   \
 2617+		stbir__simdi a, b;                                                     \
 2618+		af = _mm_min_ps(aa, STBIR_max_uint8_as_float);                         \
 2619+		bf = _mm_min_ps(bb, STBIR_max_uint8_as_float);                         \
 2620+		af = _mm_max_ps(af, _mm_setzero_ps());                                 \
 2621+		bf = _mm_max_ps(bf, _mm_setzero_ps());                                 \
 2622+		a = _mm_cvttps_epi32(af);                                              \
 2623+		b = _mm_cvttps_epi32(bf);                                              \
 2624+		a = _mm_packs_epi32(a, b);                                             \
 2625+		out = _mm_packus_epi16(a, a);                                          \
 2626+	}
 2627+
 2628+#define stbir__simdf_load4_transposed(o0, o1, o2, o3, ptr)                     \
 2629+	stbir__simdf_load(o0, (ptr));                                              \
 2630+	stbir__simdf_load(o1, (ptr) + 4);                                          \
 2631+	stbir__simdf_load(o2, (ptr) + 8);                                          \
 2632+	stbir__simdf_load(o3, (ptr) + 12);                                         \
 2633+	{                                                                          \
 2634+		__m128 tmp0, tmp1, tmp2, tmp3;                                         \
 2635+		tmp0 = _mm_unpacklo_ps(o0, o1);                                        \
 2636+		tmp2 = _mm_unpacklo_ps(o2, o3);                                        \
 2637+		tmp1 = _mm_unpackhi_ps(o0, o1);                                        \
 2638+		tmp3 = _mm_unpackhi_ps(o2, o3);                                        \
 2639+		o0 = _mm_movelh_ps(tmp0, tmp2);                                        \
 2640+		o1 = _mm_movehl_ps(tmp2, tmp0);                                        \
 2641+		o2 = _mm_movelh_ps(tmp1, tmp3);                                        \
 2642+		o3 = _mm_movehl_ps(tmp3, tmp1);                                        \
 2643+	}
 2644+
 2645+#define stbir__interleave_pack_and_store_16_u8(ptr, r0, r1, r2, r3)            \
 2646+	r0 = _mm_packs_epi32(r0, r1);                                              \
 2647+	r2 = _mm_packs_epi32(r2, r3);                                              \
 2648+	r1 = _mm_unpacklo_epi16(r0, r2);                                           \
 2649+	r3 = _mm_unpackhi_epi16(r0, r2);                                           \
 2650+	r0 = _mm_unpacklo_epi16(r1, r3);                                           \
 2651+	r2 = _mm_unpackhi_epi16(r1, r3);                                           \
 2652+	r0 = _mm_packus_epi16(r0, r2);                                             \
 2653+	stbir__simdi_store(ptr, r0);
 2654+
 2655+#define stbir__simdi_32shr(out, reg, imm) out = _mm_srli_epi32(reg, imm)
 2656+
 2657+#if defined(_MSC_VER) && !defined(__clang__)
 2658+// msvc inits with 8 bytes
 2659+#define STBIR__CONST_32_TO_8(v)                                                \
 2660+	(char)(unsigned char)((v) & 255), (char)(unsigned char)(((v) >> 8) & 255), \
 2661+	    (char)(unsigned char)(((v) >> 16) & 255),                              \
 2662+	    (char)(unsigned char)(((v) >> 24) & 255)
 2663+#define STBIR__CONST_4_32i(v)                                                  \
 2664+	STBIR__CONST_32_TO_8(v), STBIR__CONST_32_TO_8(v), STBIR__CONST_32_TO_8(v), \
 2665+	    STBIR__CONST_32_TO_8(v)
 2666+#define STBIR__CONST_4d_32i(v0, v1, v2, v3)                                    \
 2667+	STBIR__CONST_32_TO_8(v0), STBIR__CONST_32_TO_8(v1),                        \
 2668+	    STBIR__CONST_32_TO_8(v2), STBIR__CONST_32_TO_8(v3)
 2669+#else
 2670+// everything else inits with long long's
 2671+#define STBIR__CONST_4_32i(v)                                                  \
 2672+	(long long)((((stbir_uint64)(stbir_uint32)(v)) << 32) |                    \
 2673+	            ((stbir_uint64)(stbir_uint32)(v))),                            \
 2674+	    (long long)((((stbir_uint64)(stbir_uint32)(v)) << 32) |                \
 2675+	                ((stbir_uint64)(stbir_uint32)(v)))
 2676+#define STBIR__CONST_4d_32i(v0, v1, v2, v3)                                    \
 2677+	(long long)((((stbir_uint64)(stbir_uint32)(v1)) << 32) |                   \
 2678+	            ((stbir_uint64)(stbir_uint32)(v0))),                           \
 2679+	    (long long)((((stbir_uint64)(stbir_uint32)(v3)) << 32) |               \
 2680+	                ((stbir_uint64)(stbir_uint32)(v2)))
 2681+#endif
 2682+
 2683+#define STBIR__SIMDF_CONST(var, x) stbir__simdf var = {x, x, x, x}
 2684+#define STBIR__SIMDI_CONST(var, x) stbir__simdi var = {STBIR__CONST_4_32i(x)}
 2685+#define STBIR__CONSTF(var) (var)
 2686+#define STBIR__CONSTI(var) (var)
 2687+
 2688+#if defined(STBIR_AVX) || defined(__SSE4_1__)
 2689+#include <smmintrin.h>
 2690+#define stbir__simdf_pack_to_8words(out, reg0, reg1)                           \
 2691+	out = _mm_packus_epi32(                                                    \
 2692+	    _mm_cvttps_epi32(_mm_max_ps(                                           \
 2693+	        _mm_min_ps(reg0, STBIR__CONSTF(STBIR_max_uint16_as_float)),        \
 2694+	        _mm_setzero_ps())),                                                \
 2695+	    _mm_cvttps_epi32(_mm_max_ps(                                           \
 2696+	        _mm_min_ps(reg1, STBIR__CONSTF(STBIR_max_uint16_as_float)),        \
 2697+	        _mm_setzero_ps())))
 2698+#else
 2699+static STBIR__SIMDI_CONST(stbir__s32_32768, 32768);
 2700+static STBIR__SIMDI_CONST(stbir__s16_32768, ((32768 << 16) | 32768));
 2701+
 2702+#define stbir__simdf_pack_to_8words(out, reg0, reg1)                           \
 2703+	{                                                                          \
 2704+		stbir__simdi tmp0, tmp1;                                               \
 2705+		tmp0 = _mm_cvttps_epi32(_mm_max_ps(                                    \
 2706+		    _mm_min_ps(reg0, STBIR__CONSTF(STBIR_max_uint16_as_float)),        \
 2707+		    _mm_setzero_ps()));                                                \
 2708+		tmp1 = _mm_cvttps_epi32(_mm_max_ps(                                    \
 2709+		    _mm_min_ps(reg1, STBIR__CONSTF(STBIR_max_uint16_as_float)),        \
 2710+		    _mm_setzero_ps()));                                                \
 2711+		tmp0 = _mm_sub_epi32(tmp0, stbir__s32_32768);                          \
 2712+		tmp1 = _mm_sub_epi32(tmp1, stbir__s32_32768);                          \
 2713+		out = _mm_packs_epi32(tmp0, tmp1);                                     \
 2714+		out = _mm_sub_epi16(out, stbir__s16_32768);                            \
 2715+	}
 2716+
 2717+#endif
 2718+
 2719+#define STBIR_SIMD
 2720+
 2721+// if we detect AVX, set the simd8 defines
 2722+#ifdef STBIR_AVX
 2723+#include <immintrin.h>
 2724+#define STBIR_SIMD8
 2725+#define stbir__simdf8 __m256
 2726+#define stbir__simdi8 __m256i
 2727+#define stbir__simdf8_load(out, ptr)                                           \
 2728+	(out) = _mm256_loadu_ps((float const *)(ptr))
 2729+#define stbir__simdi8_load(out, ptr)                                           \
 2730+	(out) = _mm256_loadu_si256((__m256i const *)(ptr))
 2731+#define stbir__simdf8_mult(out, a, b) (out) = _mm256_mul_ps((a), (b))
 2732+#define stbir__simdf8_store(ptr, out) _mm256_storeu_ps((float *)(ptr), out)
 2733+#define stbir__simdi8_store(ptr, reg) _mm256_storeu_si256((__m256i *)(ptr), reg)
 2734+#define stbir__simdf8_frep8(fval) _mm256_set1_ps(fval)
 2735+
 2736+#define stbir__simdf8_min(out, reg0, reg1) (out) = _mm256_min_ps(reg0, reg1)
 2737+#define stbir__simdf8_max(out, reg0, reg1) (out) = _mm256_max_ps(reg0, reg1)
 2738+
 2739+#define stbir__simdf8_add4halves(out, bot4, top8)                              \
 2740+	(out) = _mm_add_ps(bot4, _mm256_extractf128_ps(top8, 1))
 2741+#define stbir__simdf8_mult_mem(out, reg, ptr)                                  \
 2742+	(out) = _mm256_mul_ps(reg, _mm256_loadu_ps((float const *)(ptr)))
 2743+#define stbir__simdf8_add_mem(out, reg, ptr)                                   \
 2744+	(out) = _mm256_add_ps(reg, _mm256_loadu_ps((float const *)(ptr)))
 2745+#define stbir__simdf8_add(out, a, b) (out) = _mm256_add_ps(a, b)
 2746+#define stbir__simdf8_load1b(out, ptr) (out) = _mm256_broadcast_ss(ptr)
 2747+#define stbir__simdf_load1rep4(out, ptr)                                       \
 2748+	(out) = _mm_broadcast_ss(ptr) // avx load instruction
 2749+
 2750+#define stbir__simdi8_convert_i32_to_float(out, ireg)                          \
 2751+	(out) = _mm256_cvtepi32_ps(ireg)
 2752+#define stbir__simdf8_convert_float_to_i32(i, f) (i) = _mm256_cvttps_epi32(f)
 2753+
 2754+#define stbir__simdf8_bot4s(out, a, b)                                         \
 2755+	(out) = _mm256_permute2f128_ps(a, b, (0 << 0) + (2 << 4))
 2756+#define stbir__simdf8_top4s(out, a, b)                                         \
 2757+	(out) = _mm256_permute2f128_ps(a, b, (1 << 0) + (3 << 4))
 2758+
 2759+#define stbir__simdf8_gettop4(reg) _mm256_extractf128_ps(reg, 1)
 2760+
 2761+#ifdef STBIR_AVX2
 2762+
 2763+#define stbir__simdi8_expand_u8_to_u32(out0, out1, ireg)                       \
 2764+	{                                                                          \
 2765+		stbir__simdi8 a, zero = _mm256_setzero_si256();                        \
 2766+		a = _mm256_permute4x64_epi64(                                          \
 2767+		    _mm256_unpacklo_epi8(                                              \
 2768+		        _mm256_permute4x64_epi64(_mm256_castsi128_si256(ireg),         \
 2769+		                                 (0 << 0) + (2 << 2) + (1 << 4) +      \
 2770+		                                     (3 << 6)),                        \
 2771+		        zero),                                                         \
 2772+		    (0 << 0) + (2 << 2) + (1 << 4) + (3 << 6));                        \
 2773+		out0 = _mm256_unpacklo_epi16(a, zero);                                 \
 2774+		out1 = _mm256_unpackhi_epi16(a, zero);                                 \
 2775+	}
 2776+
 2777+#define stbir__simdf8_pack_to_16bytes(out, aa, bb)                             \
 2778+	{                                                                          \
 2779+		stbir__simdi8 t;                                                       \
 2780+		stbir__simdf8 af, bf;                                                  \
 2781+		stbir__simdi8 a, b;                                                    \
 2782+		af = _mm256_min_ps(aa, STBIR_max_uint8_as_floatX);                     \
 2783+		bf = _mm256_min_ps(bb, STBIR_max_uint8_as_floatX);                     \
 2784+		af = _mm256_max_ps(af, _mm256_setzero_ps());                           \
 2785+		bf = _mm256_max_ps(bf, _mm256_setzero_ps());                           \
 2786+		a = _mm256_cvttps_epi32(af);                                           \
 2787+		b = _mm256_cvttps_epi32(bf);                                           \
 2788+		t = _mm256_permute4x64_epi64(_mm256_packs_epi32(a, b),                 \
 2789+		                             (0 << 0) + (2 << 2) + (1 << 4) +          \
 2790+		                                 (3 << 6));                            \
 2791+		out = _mm256_castsi256_si128(_mm256_permute4x64_epi64(                 \
 2792+		    _mm256_packus_epi16(t, t),                                         \
 2793+		    (0 << 0) + (2 << 2) + (1 << 4) + (3 << 6)));                       \
 2794+	}
 2795+
 2796+#define stbir__simdi8_expand_u16_to_u32(out, ireg)                             \
 2797+	out = _mm256_unpacklo_epi16(                                               \
 2798+	    _mm256_permute4x64_epi64(_mm256_castsi128_si256(ireg),                 \
 2799+	                             (0 << 0) + (2 << 2) + (1 << 4) + (3 << 6)),   \
 2800+	    _mm256_setzero_si256());
 2801+
 2802+#define stbir__simdf8_pack_to_16words(out, aa, bb)                             \
 2803+	{                                                                          \
 2804+		stbir__simdf8 af, bf;                                                  \
 2805+		stbir__simdi8 a, b;                                                    \
 2806+		af = _mm256_min_ps(aa, STBIR_max_uint16_as_floatX);                    \
 2807+		bf = _mm256_min_ps(bb, STBIR_max_uint16_as_floatX);                    \
 2808+		af = _mm256_max_ps(af, _mm256_setzero_ps());                           \
 2809+		bf = _mm256_max_ps(bf, _mm256_setzero_ps());                           \
 2810+		a = _mm256_cvttps_epi32(af);                                           \
 2811+		b = _mm256_cvttps_epi32(bf);                                           \
 2812+		(out) = _mm256_permute4x64_epi64(_mm256_packus_epi32(a, b),            \
 2813+		                                 (0 << 0) + (2 << 2) + (1 << 4) +      \
 2814+		                                     (3 << 6));                        \
 2815+	}
 2816+
 2817+#else
 2818+
 // Fallback variant (the matching #if lies outside this excerpt; presumably
 // the path without 256-bit integer instructions - confirm).
 // Zero-extends the 16 bytes of the 128-bit `ireg` to 32-bit lanes using
 // 128-bit unpacks against zero: `out0` receives bytes 0-7 and `out1` bytes
 // 8-15, each assembled from two 128-bit halves with _mm256_setr_m128i.
 2819+#define stbir__simdi8_expand_u8_to_u32(out0, out1, ireg)                       \
 2820+	{                                                                          \
 2821+		stbir__simdi a, zero = _mm_setzero_si128();                            \
 2822+		a = _mm_unpacklo_epi8(ireg, zero);                                     \
 2823+		out0 = _mm256_setr_m128i(_mm_unpacklo_epi16(a, zero),                  \
 2824+		                         _mm_unpackhi_epi16(a, zero));                 \
 2825+		a = _mm_unpackhi_epi8(ireg, zero);                                     \
 2826+		out1 = _mm256_setr_m128i(_mm_unpacklo_epi16(a, zero),                  \
 2827+		                         _mm_unpackhi_epi16(a, zero));                 \
 2828+	}
 2829+
 // Converts two 8-float vectors (`aa`, `bb`) into sixteen unsigned bytes in
 // the 128-bit `out`. Each input is clamped to [0, STBIR_max_uint8_as_floatX]
 // and converted to int32 with truncation; the two 128-bit halves of each
 // result are then narrowed to 16 bits and to 8 bits with 128-bit packs.
 // Packing a register with itself leaves the 8 useful bytes in the low 64
 // bits, so the final shuffle takes the low 64 bits of `out` (from `aa`)
 // followed by the low 64 bits of `t` (from `bb`).
 2830+#define stbir__simdf8_pack_to_16bytes(out, aa, bb)                             \
 2831+	{                                                                          \
 2832+		stbir__simdi t;                                                        \
 2833+		stbir__simdf8 af, bf;                                                  \
 2834+		stbir__simdi8 a, b;                                                    \
 2835+		af = _mm256_min_ps(aa, STBIR_max_uint8_as_floatX);                     \
 2836+		bf = _mm256_min_ps(bb, STBIR_max_uint8_as_floatX);                     \
 2837+		af = _mm256_max_ps(af, _mm256_setzero_ps());                           \
 2838+		bf = _mm256_max_ps(bf, _mm256_setzero_ps());                           \
 2839+		a = _mm256_cvttps_epi32(af);                                           \
 2840+		b = _mm256_cvttps_epi32(bf);                                           \
 2841+		out = _mm_packs_epi32(_mm256_castsi256_si128(a),                       \
 2842+		                      _mm256_extractf128_si256(a, 1));                 \
 2843+		out = _mm_packus_epi16(out, out);                                      \
 2844+		t = _mm_packs_epi32(_mm256_castsi256_si128(b),                         \
 2845+		                    _mm256_extractf128_si256(b, 1));                   \
 2846+		t = _mm_packus_epi16(t, t);                                            \
 2847+		out = _mm_castps_si128(                                                \
 2848+		    _mm_shuffle_ps(_mm_castsi128_ps(out), _mm_castsi128_ps(t),         \
 2849+		                   (0 << 0) + (1 << 2) + (0 << 4) + (1 << 6)));        \
 2850+	}
 2851+
 // Fallback variant: zero-extends the eight 16-bit lanes of the 128-bit
 // `ireg` to 32 bits with 128-bit unpacks against zero, then places the low
 // four results in the lower half of `out` and the high four in the upper
 // half.
 2852+#define stbir__simdi8_expand_u16_to_u32(out, ireg)                             \
 2853+	{                                                                          \
 2854+		stbir__simdi a, b, zero = _mm_setzero_si128();                         \
 2855+		a = _mm_unpacklo_epi16(ireg, zero);                                    \
 2856+		b = _mm_unpackhi_epi16(ireg, zero);                                    \
 2857+		out = _mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 1);        \
 2858+	}
 2859+
 // Fallback variant: converts two 8-float vectors (`aa`, `bb`) into sixteen
 // unsigned 16-bit values in the 256-bit `out`. Inputs are clamped to
 // [0, STBIR_max_uint16_as_floatX] and converted to int32 with truncation;
 // each result is split into its 128-bit halves and packed with unsigned
 // saturation using the 128-bit pack. `t0` (from `aa`) becomes the lower half
 // of `out` and `t1` (from `bb`) the upper half.
 2860+#define stbir__simdf8_pack_to_16words(out, aa, bb)                             \
 2861+	{                                                                          \
 2862+		stbir__simdi t0, t1;                                                   \
 2863+		stbir__simdf8 af, bf;                                                  \
 2864+		stbir__simdi8 a, b;                                                    \
 2865+		af = _mm256_min_ps(aa, STBIR_max_uint16_as_floatX);                    \
 2866+		bf = _mm256_min_ps(bb, STBIR_max_uint16_as_floatX);                    \
 2867+		af = _mm256_max_ps(af, _mm256_setzero_ps());                           \
 2868+		bf = _mm256_max_ps(bf, _mm256_setzero_ps());                           \
 2869+		a = _mm256_cvttps_epi32(af);                                           \
 2870+		b = _mm256_cvttps_epi32(bf);                                           \
 2871+		t0 = _mm_packus_epi32(_mm256_castsi256_si128(a),                       \
 2872+		                      _mm256_extractf128_si256(a, 1));                 \
 2873+		t1 = _mm_packus_epi32(_mm256_castsi256_si128(b),                       \
 2874+		                      _mm256_extractf128_si256(b, 1));                 \
 2875+		out = _mm256_setr_m128i(t0, t1);                                       \
 2876+	}
 2877+
 2878+#endif
 2879+
 // Index vectors for _mm256_permutevar_ps, which selects floats within each
 // 128-bit half independently. The macro names read "0123 to <pattern>";
 // presumably `in` is expected to hold the same four floats in both halves
 // (as stbir__simdf8_load4b produces) - confirm against callers.
 // STBIR__CONST_4d_32i is defined outside this excerpt; it is assumed to
 // emit its four arguments as consecutive 32-bit lanes.
 2880+static __m256i stbir_00001111 = {STBIR__CONST_4d_32i(0, 0, 0, 0),
 2881+                                 STBIR__CONST_4d_32i(1, 1, 1, 1)};
 2882+#define stbir__simdf8_0123to00001111(out, in)                                  \
 2883+	(out) = _mm256_permutevar_ps(in, stbir_00001111)
 2884+
 2885+static __m256i stbir_22223333 = {STBIR__CONST_4d_32i(2, 2, 2, 2),
 2886+                                 STBIR__CONST_4d_32i(3, 3, 3, 3)};
 2887+#define stbir__simdf8_0123to22223333(out, in)                                  \
 2888+	(out) = _mm256_permutevar_ps(in, stbir_22223333)
 2889+
 // Operates on the lower 128 bits of `in` only and hands them to the 4-wide
 // swizzle helper stbir__simdf_swiz (defined outside this excerpt).
 2890+#define stbir__simdf8_0123to2222(out, in)                                      \
 2891+	(out) = stbir__simdf_swiz(_mm256_castps256_ps128(in), 2, 2, 2, 2)
 2892+
 // Loads 128 bits from `ptr` and repeats them in both halves of `out`.
 2893+#define stbir__simdf8_load4b(out, ptr)                                         \
 2894+	(out) = _mm256_broadcast_ps((__m128 const *)(ptr))
 2895+
 2896+static __m256i stbir_00112233 = {STBIR__CONST_4d_32i(0, 0, 1, 1),
 2897+                                 STBIR__CONST_4d_32i(2, 2, 3, 3)};
 2898+#define stbir__simdf8_0123to00112233(out, in)                                  \
 2899+	(out) = _mm256_permutevar_ps(in, stbir_00112233)
 // Adds the 128-bit `b` to `a8`. `b` is widened with a cast, which leaves the
 // upper half unspecified, so only the lower four floats of `out` are
 // meaningful.
 2900+#define stbir__simdf8_add4(out, a8, b)                                         \
 2901+	(out) = _mm256_add_ps(a8, _mm256_castps128_ps256(b))
 2902+
 // Mask for _mm256_maskload_ps: the sign bit is set for the first six lanes
 // and clear for the last two (assuming the STBIR__CONST_* macros lay lanes
 // out in argument order), so load6z reads six floats and zeroes the rest.
 2903+static __m256i stbir_load6 = {
 2904+    STBIR__CONST_4_32i(0x80000000),
 2905+    STBIR__CONST_4d_32i(0x80000000, 0x80000000, 0, 0)};
 2906+#define stbir__simdf8_load6z(out, ptr)                                         \
 2907+	(out) = _mm256_maskload_ps(ptr, stbir_load6)
 2908+
 // Fixed-pattern swizzles built on _mm256_shuffle_ps(in, in, imm). The same
 // 4-element selection is applied inside each 128-bit half. In the immediate,
 // the value shifted by 0/2/4/6 is the source index for output position
 // 0/1/2/3, which is what the digits in each macro name spell out (the
 // pattern is written twice, once per half).
 2909+#define stbir__simdf8_0123to00000000(out, in)                                  \
 2910+	(out) = _mm256_shuffle_ps(in, in, (0 << 0) + (0 << 2) + (0 << 4) + (0 << 6))
 2911+#define stbir__simdf8_0123to11111111(out, in)                                  \
 2912+	(out) = _mm256_shuffle_ps(in, in, (1 << 0) + (1 << 2) + (1 << 4) + (1 << 6))
 2913+#define stbir__simdf8_0123to22222222(out, in)                                  \
 2914+	(out) = _mm256_shuffle_ps(in, in, (2 << 0) + (2 << 2) + (2 << 4) + (2 << 6))
 2915+#define stbir__simdf8_0123to33333333(out, in)                                  \
 2916+	(out) = _mm256_shuffle_ps(in, in, (3 << 0) + (3 << 2) + (3 << 4) + (3 << 6))
 2917+#define stbir__simdf8_0123to21032103(out, in)                                  \
 2918+	(out) = _mm256_shuffle_ps(in, in, (2 << 0) + (1 << 2) + (0 << 4) + (3 << 6))
 2919+#define stbir__simdf8_0123to32103210(out, in)                                  \
 2920+	(out) = _mm256_shuffle_ps(in, in, (3 << 0) + (2 << 2) + (1 << 4) + (0 << 6))
 2921+#define stbir__simdf8_0123to12301230(out, in)                                  \
 2922+	(out) = _mm256_shuffle_ps(in, in, (1 << 0) + (2 << 2) + (3 << 4) + (0 << 6))
 2923+#define stbir__simdf8_0123to10321032(out, in)                                  \
 2924+	(out) = _mm256_shuffle_ps(in, in, (1 << 0) + (0 << 2) + (3 << 4) + (2 << 6))
 2925+#define stbir__simdf8_0123to30123012(out, in)                                  \
 2926+	(out) = _mm256_shuffle_ps(in, in, (3 << 0) + (0 << 2) + (1 << 4) + (2 << 6))
 2927+
 2928+#define stbir__simdf8_0123to11331133(out, in)                                  \
 2929+	(out) = _mm256_shuffle_ps(in, in, (1 << 0) + (1 << 2) + (3 << 4) + (3 << 6))
 2930+#define stbir__simdf8_0123to00220022(out, in)                                  \
 2931+	(out) = _mm256_shuffle_ps(in, in, (0 << 0) + (0 << 2) + (2 << 4) + (2 << 6))
 2932+
 // Build per-pixel multiplier patterns from `alp` and `ones` in two steps:
 // a blend that takes `ones` wherever the mask bit is 1 (and `alp` elsewhere),
 // then an in-half shuffle that replicates the surviving `alp` lane. Per
 // 128-bit half the results are:
 //   aaa1: [a3, a3, a3, 1]     1aaa: [1, a0, a0, a0]
 //   a1a1: [a1, 1, a3, 1]      1a1a: [1, a0, 1, a2]
 // (aN = lane N of `alp` in that half; the 1 entries come from `ones`).
 // NOTE(review): each macro expands to two statements without enclosing
 // braces, and `out` is evaluated several times; an unbraced `if`/`else`
 // around a use would guard only the first statement.
 2933+#define stbir__simdf8_aaa1(out, alp, ones)                                     \
 2934+	(out) = _mm256_blend_ps(alp, ones,                                         \
 2935+	                        (1 << 0) + (1 << 1) + (1 << 2) + (0 << 3) +        \
 2936+	                            (1 << 4) + (1 << 5) + (1 << 6) + (0 << 7));    \
 2937+	(out) =                                                                    \
 2938+	    _mm256_shuffle_ps(out, out, (3 << 0) + (3 << 2) + (3 << 4) + (0 << 6))
 2939+#define stbir__simdf8_1aaa(out, alp, ones)                                     \
 2940+	(out) = _mm256_blend_ps(alp, ones,                                         \
 2941+	                        (0 << 0) + (1 << 1) + (1 << 2) + (1 << 3) +        \
 2942+	                            (0 << 4) + (1 << 5) + (1 << 6) + (1 << 7));    \
 2943+	(out) =                                                                    \
 2944+	    _mm256_shuffle_ps(out, out, (1 << 0) + (0 << 2) + (0 << 4) + (0 << 6))
 2945+#define stbir__simdf8_a1a1(out, alp, ones)                                     \
 2946+	(out) = _mm256_blend_ps(alp, ones,                                         \
 2947+	                        (1 << 0) + (0 << 1) + (1 << 2) + (0 << 3) +        \
 2948+	                            (1 << 4) + (0 << 5) + (1 << 6) + (0 << 7));    \
 2949+	(out) =                                                                    \
 2950+	    _mm256_shuffle_ps(out, out, (1 << 0) + (0 << 2) + (3 << 4) + (2 << 6))
 2951+#define stbir__simdf8_1a1a(out, alp, ones)                                     \
 2952+	(out) = _mm256_blend_ps(alp, ones,                                         \
 2953+	                        (0 << 0) + (1 << 1) + (0 << 2) + (1 << 3) +        \
 2954+	                            (0 << 4) + (1 << 5) + (0 << 6) + (1 << 7));    \
 2955+	(out) =                                                                    \
 2956+	    _mm256_shuffle_ps(out, out, (1 << 0) + (0 << 2) + (3 << 4) + (2 << 6))
 2957+
 // Sets the 8-float register `reg` to all zeros.
 2958+#define stbir__simdf8_zero(reg) (reg) = _mm256_setzero_ps()
 2959+
 // Multiply-add family: out = add + mul1 * mul2.
 //   madd      - both factors are registers.
 //   madd_mem  - second factor is 8 floats loaded (unaligned) from `ptr`.
 //   madd_mem4 - `mul` is a 4-float register and only 4 floats are loaded
 //               from `ptr`; the product occupies the lower half and the
 //               upper half contributes zero, so the upper half of `add`
 //               passes through.
 // With STBIR_USE_FMA the fused instruction is used (single rounding);
 // otherwise a separate multiply and add are issued.
 2960+#ifdef STBIR_USE_FMA // not on by default to maintain bit identical simd to
 2961+                     // non-simd
 2962+#define stbir__simdf8_madd(out, add, mul1, mul2)                               \
 2963+	(out) = _mm256_fmadd_ps(mul1, mul2, add)
 2964+#define stbir__simdf8_madd_mem(out, add, mul, ptr)                             \
 2965+	(out) = _mm256_fmadd_ps(mul, _mm256_loadu_ps((float const *)(ptr)), add)
 2966+#define stbir__simdf8_madd_mem4(out, add, mul, ptr)                            \
 2967+	(out) =                                                                    \
 2968+	    _mm256_fmadd_ps(_mm256_setr_m128(mul, _mm_setzero_ps()),               \
 2969+	                    _mm256_setr_m128(_mm_loadu_ps((float const *)(ptr)),   \
 2970+	                                     _mm_setzero_ps()),                    \
 2971+	                    add)
 2972+#else
 2973+#define stbir__simdf8_madd(out, add, mul1, mul2)                               \
 2974+	(out) = _mm256_add_ps(add, _mm256_mul_ps(mul1, mul2))
 2975+#define stbir__simdf8_madd_mem(out, add, mul, ptr)                             \
 2976+	(out) = _mm256_add_ps(                                                     \
 2977+	    add, _mm256_mul_ps(mul, _mm256_loadu_ps((float const *)(ptr))))
 2978+#define stbir__simdf8_madd_mem4(out, add, mul, ptr)                            \
 2979+	(out) = _mm256_add_ps(                                                     \
 2980+	    add,                                                                   \
 2981+	    _mm256_setr_m128(_mm_mul_ps(mul, _mm_loadu_ps((float const *)(ptr))),  \
 2982+	                     _mm_setzero_ps()))
 2983+#endif
 // Yields the lower four floats of an 8-float register as a 4-float value.
 2984+#define stbir__if_simdf8_cast_to_simdf4(val) _mm256_castps256_ps128(val)
 2985+
 2986+#endif
 2987+
 // Redirects STBIR_FLOORF (dropping any earlier definition) to the SSE-based
 // scalar floor below.
 2988+#ifdef STBIR_FLOORF
 2989+#undef STBIR_FLOORF
 2990+#endif
 2991+#define STBIR_FLOORF stbir_simd_floorf
 // Returns floor(x) computed with SSE scalar instructions.
 // NOTE(review): the fallback path round-trips through a 32-bit integer
 // conversion, so it presumably relies on callers passing values within
 // int32 range (and never NaN) - confirm.
 2992+static stbir__inline float
 2993+stbir_simd_floorf(float x) // martins floorf
 2994+{
 2995+#if defined(STBIR_AVX) || defined(__SSE4_1__) || defined(STBIR_SSE41)
 // Direct rounding instruction when SSE4.1 (or AVX) is available.
 2996+	__m128 t = _mm_set_ss(x);
 2997+	return _mm_cvtss_f32(_mm_floor_ss(t, t));
 2998+#else
 // Truncate toward zero, then add -1 when that overshot (x < truncated
 // value, i.e. negative non-integers): the compare yields an all-ones mask
 // that selects the -1.0f constant, otherwise zero is added.
 2999+	__m128 f = _mm_set_ss(x);
 3000+	__m128 t = _mm_cvtepi32_ps(_mm_cvttps_epi32(f));
 3001+	__m128 r = _mm_add_ss(t, _mm_and_ps(_mm_cmplt_ss(f, t), _mm_set_ss(-1.0f)));
 3002+	return _mm_cvtss_f32(r);
 3003+#endif
 3004+}
 3005+
 // Redirects STBIR_CEILF (dropping any earlier definition) to the SSE-based
 // scalar ceiling below.
 3006+#ifdef STBIR_CEILF
 3007+#undef STBIR_CEILF
 3008+#endif
 3009+#define STBIR_CEILF stbir_simd_ceilf
 // Returns ceil(x) computed with SSE scalar instructions.
 // NOTE(review): the fallback path round-trips through a 32-bit integer
 // conversion, so it presumably relies on callers passing values within
 // int32 range (and never NaN) - confirm.
 3010+static stbir__inline float
 3011+stbir_simd_ceilf(float x) // martins ceilf
 3012+{
 3013+#if defined(STBIR_AVX) || defined(__SSE4_1__) || defined(STBIR_SSE41)
 // Direct rounding instruction when SSE4.1 (or AVX) is available.
 3014+	__m128 t = _mm_set_ss(x);
 3015+	return _mm_cvtss_f32(_mm_ceil_ss(t, t));
 3016+#else
 // Truncate toward zero, then add +1 when that undershot (truncated value
 // < x, i.e. positive non-integers): the compare yields an all-ones mask
 // that selects the 1.0f constant, otherwise zero is added.
 3017+	__m128 f = _mm_set_ss(x);
 3018+	__m128 t = _mm_cvtepi32_ps(_mm_cvttps_epi32(f));
 3019+	__m128 r = _mm_add_ss(t, _mm_and_ps(_mm_cmplt_ss(t, f), _mm_set_ss(1.0f)));
 3020+	return _mm_cvtss_f32(r);
 3021+#endif
 3022+}
 3023 
 3024 #elif defined(STBIR_NEON)
 3025 
 3026-  #include <arm_neon.h>
 3027-
 3028-  #define stbir__simdf float32x4_t
 3029-  #define stbir__simdi uint32x4_t
 3030-
 3031-  #define stbir_simdi_castf( reg ) vreinterpretq_u32_f32(reg)
 3032-  #define stbir_simdf_casti( reg ) vreinterpretq_f32_u32(reg)
 3033-
 3034-  #define stbir__simdf_load( reg, ptr ) (reg) = vld1q_f32( (float const*)(ptr) )
 3035-  #define stbir__simdi_load( reg, ptr ) (reg) = vld1q_u32( (uint32_t const*)(ptr) )
 3036-  #define stbir__simdf_load1( out, ptr ) (out) = vld1q_dup_f32( (float const*)(ptr) ) // top values can be random (not denormal or nan for perf)
 3037-  #define stbir__simdi_load1( out, ptr ) (out) = vld1q_dup_u32( (uint32_t const*)(ptr) )
 3038-  #define stbir__simdf_load1z( out, ptr ) (out) = vld1q_lane_f32( (float const*)(ptr), vdupq_n_f32(0), 0 ) // top values must be zero
 3039-  #define stbir__simdf_frep4( fvar ) vdupq_n_f32( fvar )
 3040-  #define stbir__simdf_load1frep4( out, fvar ) (out) = vdupq_n_f32( fvar )
 3041-  #define stbir__simdf_load2( out, ptr ) (out) = vcombine_f32( vld1_f32( (float const*)(ptr) ), vcreate_f32(0) ) // top values can be random (not denormal or nan for perf)
 3042-  #define stbir__simdf_load2z( out, ptr ) (out) = vcombine_f32( vld1_f32( (float const*)(ptr) ), vcreate_f32(0) )  // top values must be zero
 3043-  #define stbir__simdf_load2hmerge( out, reg, ptr ) (out) = vcombine_f32( vget_low_f32(reg), vld1_f32( (float const*)(ptr) ) )
 3044-
 3045-  #define stbir__simdf_zeroP() vdupq_n_f32(0)
 3046-  #define stbir__simdf_zero( reg ) (reg) = vdupq_n_f32(0)
 3047-
 3048-  #define stbir__simdf_store( ptr, reg )  vst1q_f32( (float*)(ptr), reg )
 3049-  #define stbir__simdf_store1( ptr, reg ) vst1q_lane_f32( (float*)(ptr), reg, 0)
 3050-  #define stbir__simdf_store2( ptr, reg ) vst1_f32( (float*)(ptr), vget_low_f32(reg) )
 3051-  #define stbir__simdf_store2h( ptr, reg ) vst1_f32( (float*)(ptr), vget_high_f32(reg) )
 3052-
 3053-  #define stbir__simdi_store( ptr, reg )  vst1q_u32( (uint32_t*)(ptr), reg )
 3054-  #define stbir__simdi_store1( ptr, reg ) vst1q_lane_u32( (uint32_t*)(ptr), reg, 0 )
 3055-  #define stbir__simdi_store2( ptr, reg ) vst1_u32( (uint32_t*)(ptr), vget_low_u32(reg) )
 3056-
 3057-  #define stbir__prefetch( ptr )
 3058-
 3059-  #define stbir__simdi_expand_u8_to_u32(out0,out1,out2,out3,ireg) \
 3060-  { \
 3061-    uint16x8_t l = vmovl_u8( vget_low_u8 ( vreinterpretq_u8_u32(ireg) ) ); \
 3062-    uint16x8_t h = vmovl_u8( vget_high_u8( vreinterpretq_u8_u32(ireg) ) ); \
 3063-    out0 = vmovl_u16( vget_low_u16 ( l ) ); \
 3064-    out1 = vmovl_u16( vget_high_u16( l ) ); \
 3065-    out2 = vmovl_u16( vget_low_u16 ( h ) ); \
 3066-    out3 = vmovl_u16( vget_high_u16( h ) ); \
 3067-  }
 3068-
 3069-  #define stbir__simdi_expand_u8_to_1u32(out,ireg) \
 3070-  { \
 3071-    uint16x8_t tmp = vmovl_u8( vget_low_u8( vreinterpretq_u8_u32(ireg) ) ); \
 3072-    out = vmovl_u16( vget_low_u16( tmp ) ); \
 3073-  }
 3074-
 3075-  #define stbir__simdi_expand_u16_to_u32(out0,out1,ireg) \
 3076-  { \
 3077-    uint16x8_t tmp = vreinterpretq_u16_u32(ireg); \
 3078-    out0 = vmovl_u16( vget_low_u16 ( tmp ) ); \
 3079-    out1 = vmovl_u16( vget_high_u16( tmp ) ); \
 3080-  }
 3081-
 3082-  #define stbir__simdf_convert_float_to_i32( i, f ) (i) = vreinterpretq_u32_s32( vcvtq_s32_f32(f) )
 3083-  #define stbir__simdf_convert_float_to_int( f ) vgetq_lane_s32(vcvtq_s32_f32(f), 0)
 3084-  #define stbir__simdi_to_int( i ) (int)vgetq_lane_u32(i, 0)
 3085-  #define stbir__simdf_convert_float_to_uint8( f ) ((unsigned char)vgetq_lane_s32(vcvtq_s32_f32(vmaxq_f32(vminq_f32(f,STBIR__CONSTF(STBIR_max_uint8_as_float)),vdupq_n_f32(0))), 0))
 3086-  #define stbir__simdf_convert_float_to_short( f ) ((unsigned short)vgetq_lane_s32(vcvtq_s32_f32(vmaxq_f32(vminq_f32(f,STBIR__CONSTF(STBIR_max_uint16_as_float)),vdupq_n_f32(0))), 0))
 3087-  #define stbir__simdi_convert_i32_to_float(out, ireg) (out) = vcvtq_f32_s32( vreinterpretq_s32_u32(ireg) )
 3088-  #define stbir__simdf_add( out, reg0, reg1 ) (out) = vaddq_f32( reg0, reg1 )
 3089-  #define stbir__simdf_mult( out, reg0, reg1 ) (out) = vmulq_f32( reg0, reg1 )
 3090-  #define stbir__simdf_mult_mem( out, reg, ptr ) (out) = vmulq_f32( reg, vld1q_f32( (float const*)(ptr) ) )
 3091-  #define stbir__simdf_mult1_mem( out, reg, ptr ) (out) = vmulq_f32( reg, vld1q_dup_f32( (float const*)(ptr) ) )
 3092-  #define stbir__simdf_add_mem( out, reg, ptr ) (out) = vaddq_f32( reg, vld1q_f32( (float const*)(ptr) ) )
 3093-  #define stbir__simdf_add1_mem( out, reg, ptr ) (out) = vaddq_f32( reg, vld1q_dup_f32( (float const*)(ptr) ) )
 3094-
 3095-  #ifdef STBIR_USE_FMA           // not on by default to maintain bit identical simd to non-simd (and also x64 no madd to arm madd)
 3096-  #define stbir__simdf_madd( out, add, mul1, mul2 ) (out) = vfmaq_f32( add, mul1, mul2 )
 3097-  #define stbir__simdf_madd1( out, add, mul1, mul2 ) (out) = vfmaq_f32( add, mul1, mul2 )
 3098-  #define stbir__simdf_madd_mem( out, add, mul, ptr ) (out) = vfmaq_f32( add, mul, vld1q_f32( (float const*)(ptr) ) )
 3099-  #define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = vfmaq_f32( add, mul, vld1q_dup_f32( (float const*)(ptr) ) )
 3100-  #else
 3101-  #define stbir__simdf_madd( out, add, mul1, mul2 ) (out) = vaddq_f32( add, vmulq_f32( mul1, mul2 ) )
 3102-  #define stbir__simdf_madd1( out, add, mul1, mul2 ) (out) = vaddq_f32( add, vmulq_f32( mul1, mul2 ) )
 3103-  #define stbir__simdf_madd_mem( out, add, mul, ptr ) (out) = vaddq_f32( add, vmulq_f32( mul, vld1q_f32( (float const*)(ptr) ) ) )
 3104-  #define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = vaddq_f32( add, vmulq_f32( mul, vld1q_dup_f32( (float const*)(ptr) ) ) )
 3105-  #endif
 3106-
 3107-  #define stbir__simdf_add1( out, reg0, reg1 ) (out) = vaddq_f32( reg0, reg1 )
 3108-  #define stbir__simdf_mult1( out, reg0, reg1 ) (out) = vmulq_f32( reg0, reg1 )
 3109-
 3110-  #define stbir__simdf_and( out, reg0, reg1 ) (out) = vreinterpretq_f32_u32( vandq_u32( vreinterpretq_u32_f32(reg0), vreinterpretq_u32_f32(reg1) ) )
 3111-  #define stbir__simdf_or( out, reg0, reg1 ) (out) = vreinterpretq_f32_u32( vorrq_u32( vreinterpretq_u32_f32(reg0), vreinterpretq_u32_f32(reg1) ) )
 3112-
 3113-  #define stbir__simdf_min( out, reg0, reg1 ) (out) = vminq_f32( reg0, reg1 )
 3114-  #define stbir__simdf_max( out, reg0, reg1 ) (out) = vmaxq_f32( reg0, reg1 )
 3115-  #define stbir__simdf_min1( out, reg0, reg1 ) (out) = vminq_f32( reg0, reg1 )
 3116-  #define stbir__simdf_max1( out, reg0, reg1 ) (out) = vmaxq_f32( reg0, reg1 )
 3117-
 3118-  #define stbir__simdf_0123ABCDto3ABx( out, reg0, reg1 ) (out) = vextq_f32( reg0, reg1, 3 )
 3119-  #define stbir__simdf_0123ABCDto23Ax( out, reg0, reg1 ) (out) = vextq_f32( reg0, reg1, 2 )
 3120-
 3121-  #define stbir__simdf_a1a1( out, alp, ones ) (out) = vzipq_f32(vuzpq_f32(alp, alp).val[1], ones).val[0]
 3122-  #define stbir__simdf_1a1a( out, alp, ones ) (out) = vzipq_f32(ones, vuzpq_f32(alp, alp).val[0]).val[0]
 3123-
 3124-  #if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ )
 3125-
 3126-    #define stbir__simdf_aaa1( out, alp, ones ) (out) = vcopyq_laneq_f32(vdupq_n_f32(vgetq_lane_f32(alp, 3)), 3, ones, 3)
 3127-    #define stbir__simdf_1aaa( out, alp, ones ) (out) = vcopyq_laneq_f32(vdupq_n_f32(vgetq_lane_f32(alp, 0)), 0, ones, 0)
 3128-
 3129-    #if defined( _MSC_VER ) && !defined(__clang__)
 3130-      #define stbir_make16(a,b,c,d) vcombine_u8( \
 3131-        vcreate_u8( (4*a+0) | ((4*a+1)<<8) | ((4*a+2)<<16) | ((4*a+3)<<24) | \
 3132-          ((stbir_uint64)(4*b+0)<<32) | ((stbir_uint64)(4*b+1)<<40) | ((stbir_uint64)(4*b+2)<<48) | ((stbir_uint64)(4*b+3)<<56)), \
 3133-        vcreate_u8( (4*c+0) | ((4*c+1)<<8) | ((4*c+2)<<16) | ((4*c+3)<<24) | \
 3134-          ((stbir_uint64)(4*d+0)<<32) | ((stbir_uint64)(4*d+1)<<40) | ((stbir_uint64)(4*d+2)<<48) | ((stbir_uint64)(4*d+3)<<56) ) )
 3135-
 3136-      static stbir__inline uint8x16x2_t stbir_make16x2(float32x4_t rega,float32x4_t regb)
 3137-      {
 3138-        uint8x16x2_t r = { vreinterpretq_u8_f32(rega), vreinterpretq_u8_f32(regb) };
 3139-        return r;
 3140-      }
 3141-    #else
 3142-      #define stbir_make16(a,b,c,d) (uint8x16_t){4*a+0,4*a+1,4*a+2,4*a+3,4*b+0,4*b+1,4*b+2,4*b+3,4*c+0,4*c+1,4*c+2,4*c+3,4*d+0,4*d+1,4*d+2,4*d+3}
 3143-      #define stbir_make16x2(a,b) (uint8x16x2_t){{vreinterpretq_u8_f32(a),vreinterpretq_u8_f32(b)}}
 3144-    #endif
 3145-
 3146-    #define stbir__simdf_swiz( reg, one, two, three, four ) vreinterpretq_f32_u8( vqtbl1q_u8( vreinterpretq_u8_f32(reg), stbir_make16(one, two, three, four) ) )
 3147-    #define stbir__simdf_swiz2( rega, regb, one, two, three, four ) vreinterpretq_f32_u8( vqtbl2q_u8( stbir_make16x2(rega,regb), stbir_make16(one, two, three, four) ) )
 3148-
 3149-    #define stbir__simdi_16madd( out, reg0, reg1 ) \
 3150-    { \
 3151-      int16x8_t r0 = vreinterpretq_s16_u32(reg0); \
 3152-      int16x8_t r1 = vreinterpretq_s16_u32(reg1); \
 3153-      int32x4_t tmp0 = vmull_s16( vget_low_s16(r0), vget_low_s16(r1) ); \
 3154-      int32x4_t tmp1 = vmull_s16( vget_high_s16(r0), vget_high_s16(r1) ); \
 3155-      (out) = vreinterpretq_u32_s32( vpaddq_s32(tmp0, tmp1) ); \
 3156-    }
 3157-
 3158-  #else
 3159-
 3160-    #define stbir__simdf_aaa1( out, alp, ones ) (out) = vsetq_lane_f32(1.0f, vdupq_n_f32(vgetq_lane_f32(alp, 3)), 3)
 3161-    #define stbir__simdf_1aaa( out, alp, ones ) (out) = vsetq_lane_f32(1.0f, vdupq_n_f32(vgetq_lane_f32(alp, 0)), 0)
 3162-
 3163-    #if defined( _MSC_VER ) && !defined(__clang__)
 3164-      static stbir__inline uint8x8x2_t stbir_make8x2(float32x4_t reg)
 3165-      {
 3166-        uint8x8x2_t r = { { vget_low_u8(vreinterpretq_u8_f32(reg)), vget_high_u8(vreinterpretq_u8_f32(reg)) } };
 3167-        return r;
 3168-      }
 3169-      #define stbir_make8(a,b) vcreate_u8( \
 3170-        (4*a+0) | ((4*a+1)<<8) | ((4*a+2)<<16) | ((4*a+3)<<24) | \
 3171-        ((stbir_uint64)(4*b+0)<<32) | ((stbir_uint64)(4*b+1)<<40) | ((stbir_uint64)(4*b+2)<<48) | ((stbir_uint64)(4*b+3)<<56) )
 3172-    #else
 3173-      #define stbir_make8x2(reg) (uint8x8x2_t){ { vget_low_u8(vreinterpretq_u8_f32(reg)), vget_high_u8(vreinterpretq_u8_f32(reg)) } }
 3174-      #define stbir_make8(a,b) (uint8x8_t){4*a+0,4*a+1,4*a+2,4*a+3,4*b+0,4*b+1,4*b+2,4*b+3}
 3175-    #endif
 3176-
 3177-    #define stbir__simdf_swiz( reg, one, two, three, four ) vreinterpretq_f32_u8( vcombine_u8( \
 3178-        vtbl2_u8( stbir_make8x2( reg ), stbir_make8( one, two ) ), \
 3179-        vtbl2_u8( stbir_make8x2( reg ), stbir_make8( three, four ) ) ) )
 3180-
 3181-    #define stbir__simdi_16madd( out, reg0, reg1 ) \
 3182-    { \
 3183-      int16x8_t r0 = vreinterpretq_s16_u32(reg0); \
 3184-      int16x8_t r1 = vreinterpretq_s16_u32(reg1); \
 3185-      int32x4_t tmp0 = vmull_s16( vget_low_s16(r0), vget_low_s16(r1) ); \
 3186-      int32x4_t tmp1 = vmull_s16( vget_high_s16(r0), vget_high_s16(r1) ); \
 3187-      int32x2_t out0 = vpadd_s32( vget_low_s32(tmp0), vget_high_s32(tmp0) ); \
 3188-      int32x2_t out1 = vpadd_s32( vget_low_s32(tmp1), vget_high_s32(tmp1) ); \
 3189-      (out) = vreinterpretq_u32_s32( vcombine_s32(out0, out1) ); \
 3190-    }
 3191-
 3192-  #endif
 3193-
 3194-  #define stbir__simdi_and( out, reg0, reg1 ) (out) = vandq_u32( reg0, reg1 )
 3195-  #define stbir__simdi_or( out, reg0, reg1 ) (out) = vorrq_u32( reg0, reg1 )
 3196-
 3197-  #define stbir__simdf_pack_to_8bytes(out,aa,bb) \
 3198-  { \
 3199-    float32x4_t af = vmaxq_f32( vminq_f32(aa,STBIR__CONSTF(STBIR_max_uint8_as_float) ), vdupq_n_f32(0) ); \
 3200-    float32x4_t bf = vmaxq_f32( vminq_f32(bb,STBIR__CONSTF(STBIR_max_uint8_as_float) ), vdupq_n_f32(0) ); \
 3201-    int16x4_t ai = vqmovn_s32( vcvtq_s32_f32( af ) ); \
 3202-    int16x4_t bi = vqmovn_s32( vcvtq_s32_f32( bf ) ); \
 3203-    uint8x8_t out8 = vqmovun_s16( vcombine_s16(ai, bi) ); \
 3204-    out = vreinterpretq_u32_u8( vcombine_u8(out8, out8) ); \
 3205-  }
 3206-
 3207-  #define stbir__simdf_pack_to_8words(out,aa,bb) \
 3208-  { \
 3209-    float32x4_t af = vmaxq_f32( vminq_f32(aa,STBIR__CONSTF(STBIR_max_uint16_as_float) ), vdupq_n_f32(0) ); \
 3210-    float32x4_t bf = vmaxq_f32( vminq_f32(bb,STBIR__CONSTF(STBIR_max_uint16_as_float) ), vdupq_n_f32(0) ); \
 3211-    int32x4_t ai = vcvtq_s32_f32( af ); \
 3212-    int32x4_t bi = vcvtq_s32_f32( bf ); \
 3213-    out = vreinterpretq_u32_u16( vcombine_u16(vqmovun_s32(ai), vqmovun_s32(bi)) ); \
 3214-  }
 3215-
 3216-  #define stbir__interleave_pack_and_store_16_u8( ptr, r0, r1, r2, r3 ) \
 3217-  { \
 3218-    int16x4x2_t tmp0 = vzip_s16( vqmovn_s32(vreinterpretq_s32_u32(r0)), vqmovn_s32(vreinterpretq_s32_u32(r2)) ); \
 3219-    int16x4x2_t tmp1 = vzip_s16( vqmovn_s32(vreinterpretq_s32_u32(r1)), vqmovn_s32(vreinterpretq_s32_u32(r3)) ); \
 3220-    uint8x8x2_t out = \
 3221-    { { \
 3222-      vqmovun_s16( vcombine_s16(tmp0.val[0], tmp0.val[1]) ), \
 3223-      vqmovun_s16( vcombine_s16(tmp1.val[0], tmp1.val[1]) ), \
 3224-    } }; \
 3225-    vst2_u8(ptr, out); \
 3226-  }
 3227-
 3228-  #define stbir__simdf_load4_transposed( o0, o1, o2, o3, ptr ) \
 3229-  { \
 3230-    float32x4x4_t tmp = vld4q_f32(ptr); \
 3231-    o0 = tmp.val[0]; \
 3232-    o1 = tmp.val[1]; \
 3233-    o2 = tmp.val[2]; \
 3234-    o3 = tmp.val[3]; \
 3235-  }
 3236-
 3237-  #define stbir__simdi_32shr( out, reg, imm ) out = vshrq_n_u32( reg, imm )
 3238-
 3239-  #if defined( _MSC_VER ) && !defined(__clang__)
 3240-    #define STBIR__SIMDF_CONST(var, x) __declspec(align(8)) float var[] = { x, x, x, x }
 3241-    #define STBIR__SIMDI_CONST(var, x) __declspec(align(8)) uint32_t var[] = { x, x, x, x }
 3242-    #define STBIR__CONSTF(var) (*(const float32x4_t*)var)
 3243-    #define STBIR__CONSTI(var) (*(const uint32x4_t*)var)
 3244-  #else
 3245-    #define STBIR__SIMDF_CONST(var, x) stbir__simdf var = { x, x, x, x }
 3246-    #define STBIR__SIMDI_CONST(var, x) stbir__simdi var = { x, x, x, x }
 3247-    #define STBIR__CONSTF(var) (var)
 3248-    #define STBIR__CONSTI(var) (var)
 3249-  #endif
 3250-
 3251-  #ifdef STBIR_FLOORF
 3252-  #undef STBIR_FLOORF
 3253-  #endif
 3254-  #define STBIR_FLOORF stbir_simd_floorf
 3255-  static stbir__inline float stbir_simd_floorf(float x)
 3256-  {
 3257-    #if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ )
 3258-    return vget_lane_f32( vrndm_f32( vdup_n_f32(x) ), 0);
 3259-    #else
 3260-    float32x2_t f = vdup_n_f32(x);
 3261-    float32x2_t t = vcvt_f32_s32(vcvt_s32_f32(f));
 3262-    uint32x2_t a = vclt_f32(f, t);
 3263-    uint32x2_t b = vreinterpret_u32_f32(vdup_n_f32(-1.0f));
 3264-    float32x2_t r = vadd_f32(t, vreinterpret_f32_u32(vand_u32(a, b)));
 3265-    return vget_lane_f32(r, 0);
 3266-    #endif
 3267-  }
 3268-
 3269-  #ifdef STBIR_CEILF
 3270-  #undef STBIR_CEILF
 3271-  #endif
 3272-  #define STBIR_CEILF stbir_simd_ceilf
 3273-  static stbir__inline float stbir_simd_ceilf(float x)
 3274-  {
 3275-    #if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ )
 3276-    return vget_lane_f32( vrndp_f32( vdup_n_f32(x) ), 0);
 3277-    #else
 3278-    float32x2_t f = vdup_n_f32(x);
 3279-    float32x2_t t = vcvt_f32_s32(vcvt_s32_f32(f));
 3280-    uint32x2_t a = vclt_f32(t, f);
 3281-    uint32x2_t b = vreinterpret_u32_f32(vdup_n_f32(1.0f));
 3282-    float32x2_t r = vadd_f32(t, vreinterpret_f32_u32(vand_u32(a, b)));
 3283-    return vget_lane_f32(r, 0);
 3284-    #endif
 3285-  }
 3286-
 3287-  #define STBIR_SIMD
 3288+#include <arm_neon.h>
 3289+
 // NEON backend: the 4-wide float and integer register types used by the
 // stbir__simd* macros.
 3290+#define stbir__simdf float32x4_t
 3291+#define stbir__simdi uint32x4_t
 3292+
 // Bit-pattern reinterpretation between the float and integer register
 // types (no value conversion).
 3293+#define stbir_simdi_castf(reg) vreinterpretq_u32_f32(reg)
 3294+#define stbir_simdf_casti(reg) vreinterpretq_f32_u32(reg)
 3295+
 // Loads. load/load1/load2 read 4/1/2 elements from `ptr`:
 //   load1  - replicates the single element into every lane.
 //   load1z - places the element in lane 0 of an all-zero register.
 //   load2 / load2z - two floats in the low half, zeros in the high half
 //            (both macros currently have the same expansion).
 //   load2hmerge - keeps the low half of `reg` and loads two floats from
 //            `ptr` into the high half.
 //   frep4 / load1frep4 - broadcast a float variable to all four lanes.
 3296+#define stbir__simdf_load(reg, ptr) (reg) = vld1q_f32((float const *)(ptr))
 3297+#define stbir__simdi_load(reg, ptr) (reg) = vld1q_u32((uint32_t const *)(ptr))
 3298+#define stbir__simdf_load1(out, ptr)                                           \
 3299+	(out) = vld1q_dup_f32((float const *)(ptr)) // top values can be random (not
 3300+	                                            // denormal or nan for perf)
 3301+#define stbir__simdi_load1(out, ptr)                                           \
 3302+	(out) = vld1q_dup_u32((uint32_t const *)(ptr))
 3303+#define stbir__simdf_load1z(out, ptr)                                          \
 3304+	(out) = vld1q_lane_f32((float const *)(ptr), vdupq_n_f32(0),               \
 3305+	                       0) // top values must be zero
 3306+#define stbir__simdf_frep4(fvar) vdupq_n_f32(fvar)
 3307+#define stbir__simdf_load1frep4(out, fvar) (out) = vdupq_n_f32(fvar)
 3308+#define stbir__simdf_load2(out, ptr)                                           \
 3309+	(out) = vcombine_f32(                                                      \
 3310+	    vld1_f32((float const *)(ptr)),                                        \
 3311+	    vcreate_f32(                                                           \
 3312+	        0)) // top values can be random (not denormal or nan for perf)
 3313+#define stbir__simdf_load2z(out, ptr)                                          \
 3314+	(out) = vcombine_f32(vld1_f32((float const *)(ptr)),                       \
 3315+	                     vcreate_f32(0)) // top values must be zero
 3316+#define stbir__simdf_load2hmerge(out, reg, ptr)                                \
 3317+	(out) = vcombine_f32(vget_low_f32(reg), vld1_f32((float const *)(ptr)))
 3318+
 // All-zero float register: zeroP is an expression, zero assigns to `reg`.
 3319+#define stbir__simdf_zeroP() vdupq_n_f32(0)
 3320+#define stbir__simdf_zero(reg) (reg) = vdupq_n_f32(0)
 3321+
 // Float stores to `ptr`: all four lanes, lane 0 only, the low two lanes
 // (store2), or the high two lanes (store2h).
 3322+#define stbir__simdf_store(ptr, reg) vst1q_f32((float *)(ptr), reg)
 3323+#define stbir__simdf_store1(ptr, reg) vst1q_lane_f32((float *)(ptr), reg, 0)
 3324+#define stbir__simdf_store2(ptr, reg)                                          \
 3325+	vst1_f32((float *)(ptr), vget_low_f32(reg))
 3326+#define stbir__simdf_store2h(ptr, reg)                                         \
 3327+	vst1_f32((float *)(ptr), vget_high_f32(reg))
 3328+
 // Integer stores to `ptr`: all four lanes, lane 0 only, or the low two
 // lanes.
 3329+#define stbir__simdi_store(ptr, reg) vst1q_u32((uint32_t *)(ptr), reg)
 3330+#define stbir__simdi_store1(ptr, reg) vst1q_lane_u32((uint32_t *)(ptr), reg, 0)
 3331+#define stbir__simdi_store2(ptr, reg)                                          \
 3332+	vst1_u32((uint32_t *)(ptr), vget_low_u32(reg))
 3333+
 // Prefetch hint; expands to nothing in this backend.
 3334+#define stbir__prefetch(ptr)
 3335+
 // Zero-extends the sixteen bytes held in `ireg` to 32-bit lanes: bytes are
 // first widened to 16 bits (low and high eight separately), then each group
 // of four is widened to 32 bits. out0..out3 receive bytes 0-3, 4-7, 8-11 and
 // 12-15 respectively.
 3336+#define stbir__simdi_expand_u8_to_u32(out0, out1, out2, out3, ireg)            \
 3337+	{                                                                          \
 3338+		uint16x8_t l = vmovl_u8(vget_low_u8(vreinterpretq_u8_u32(ireg)));      \
 3339+		uint16x8_t h = vmovl_u8(vget_high_u8(vreinterpretq_u8_u32(ireg)));     \
 3340+		out0 = vmovl_u16(vget_low_u16(l));                                     \
 3341+		out1 = vmovl_u16(vget_high_u16(l));                                    \
 3342+		out2 = vmovl_u16(vget_low_u16(h));                                     \
 3343+		out3 = vmovl_u16(vget_high_u16(h));                                    \
 3344+	}
 3345+
 // Zero-extends only the first four bytes of `ireg` to four 32-bit lanes.
 3346+#define stbir__simdi_expand_u8_to_1u32(out, ireg)                              \
 3347+	{                                                                          \
 3348+		uint16x8_t tmp = vmovl_u8(vget_low_u8(vreinterpretq_u8_u32(ireg)));    \
 3349+		out = vmovl_u16(vget_low_u16(tmp));                                    \
 3350+	}
 3351+
 // Zero-extends the eight 16-bit values in `ireg` to 32 bits: the low four
 // go to `out0`, the high four to `out1`.
 3352+#define stbir__simdi_expand_u16_to_u32(out0, out1, ireg)                       \
 3353+	{                                                                          \
 3354+		uint16x8_t tmp = vreinterpretq_u16_u32(ireg);                          \
 3355+		out0 = vmovl_u16(vget_low_u16(tmp));                                   \
 3356+		out1 = vmovl_u16(vget_high_u16(tmp));                                  \
 3357+	}
 3358+
 3359+#define stbir__simdf_convert_float_to_i32(i, f)                                \
 3360+	(i) = vreinterpretq_u32_s32(vcvtq_s32_f32(f))
 3361+#define stbir__simdf_convert_float_to_int(f) vgetq_lane_s32(vcvtq_s32_f32(f), 0)
 3362+#define stbir__simdi_to_int(i) (int)vgetq_lane_u32(i, 0)
 3363+#define stbir__simdf_convert_float_to_uint8(f)                                 \
 3364+	((unsigned char)vgetq_lane_s32(                                            \
 3365+	    vcvtq_s32_f32(                                                         \
 3366+	        vmaxq_f32(vminq_f32(f, STBIR__CONSTF(STBIR_max_uint8_as_float)),   \
 3367+	                  vdupq_n_f32(0))),                                        \
 3368+	    0))
 3369+#define stbir__simdf_convert_float_to_short(f)                                 \
 3370+	((unsigned short)vgetq_lane_s32(                                           \
 3371+	    vcvtq_s32_f32(                                                         \
 3372+	        vmaxq_f32(vminq_f32(f, STBIR__CONSTF(STBIR_max_uint16_as_float)),  \
 3373+	                  vdupq_n_f32(0))),                                        \
 3374+	    0))
 3375+#define stbir__simdi_convert_i32_to_float(out, ireg)                           \
 3376+	(out) = vcvtq_f32_s32(vreinterpretq_s32_u32(ireg))
 3377+#define stbir__simdf_add(out, reg0, reg1) (out) = vaddq_f32(reg0, reg1)
 3378+#define stbir__simdf_mult(out, reg0, reg1) (out) = vmulq_f32(reg0, reg1)
 3379+#define stbir__simdf_mult_mem(out, reg, ptr)                                   \
 3380+	(out) = vmulq_f32(reg, vld1q_f32((float const *)(ptr)))
 3381+#define stbir__simdf_mult1_mem(out, reg, ptr)                                  \
 3382+	(out) = vmulq_f32(reg, vld1q_dup_f32((float const *)(ptr)))
 3383+#define stbir__simdf_add_mem(out, reg, ptr)                                    \
 3384+	(out) = vaddq_f32(reg, vld1q_f32((float const *)(ptr)))
 3385+#define stbir__simdf_add1_mem(out, reg, ptr)                                   \
 3386+	(out) = vaddq_f32(reg, vld1q_dup_f32((float const *)(ptr)))
 3387+
 3388+#ifdef STBIR_USE_FMA // not on by default to maintain bit identical simd to
 3389+                     // non-simd (and also x64 no madd to arm madd)
 3390+#define stbir__simdf_madd(out, add, mul1, mul2)                                \
 3391+	(out) = vfmaq_f32(add, mul1, mul2)
 3392+#define stbir__simdf_madd1(out, add, mul1, mul2)                               \
 3393+	(out) = vfmaq_f32(add, mul1, mul2)
 3394+#define stbir__simdf_madd_mem(out, add, mul, ptr)                              \
 3395+	(out) = vfmaq_f32(add, mul, vld1q_f32((float const *)(ptr)))
 3396+#define stbir__simdf_madd1_mem(out, add, mul, ptr)                             \
 3397+	(out) = vfmaq_f32(add, mul, vld1q_dup_f32((float const *)(ptr)))
 3398+#else
 3399+#define stbir__simdf_madd(out, add, mul1, mul2)                                \
 3400+	(out) = vaddq_f32(add, vmulq_f32(mul1, mul2))
 3401+#define stbir__simdf_madd1(out, add, mul1, mul2)                               \
 3402+	(out) = vaddq_f32(add, vmulq_f32(mul1, mul2))
 3403+#define stbir__simdf_madd_mem(out, add, mul, ptr)                              \
 3404+	(out) = vaddq_f32(add, vmulq_f32(mul, vld1q_f32((float const *)(ptr))))
 3405+#define stbir__simdf_madd1_mem(out, add, mul, ptr)                             \
 3406+	(out) = vaddq_f32(add, vmulq_f32(mul, vld1q_dup_f32((float const *)(ptr))))
 3407+#endif
 3408+
 3409+#define stbir__simdf_add1(out, reg0, reg1) (out) = vaddq_f32(reg0, reg1)
 3410+#define stbir__simdf_mult1(out, reg0, reg1) (out) = vmulq_f32(reg0, reg1)
 3411+
 3412+#define stbir__simdf_and(out, reg0, reg1)                                      \
 3413+	(out) = vreinterpretq_f32_u32(                                             \
 3414+	    vandq_u32(vreinterpretq_u32_f32(reg0), vreinterpretq_u32_f32(reg1)))
 3415+#define stbir__simdf_or(out, reg0, reg1)                                       \
 3416+	(out) = vreinterpretq_f32_u32(                                             \
 3417+	    vorrq_u32(vreinterpretq_u32_f32(reg0), vreinterpretq_u32_f32(reg1)))
 3418+
 3419+#define stbir__simdf_min(out, reg0, reg1) (out) = vminq_f32(reg0, reg1)
 3420+#define stbir__simdf_max(out, reg0, reg1) (out) = vmaxq_f32(reg0, reg1)
 3421+#define stbir__simdf_min1(out, reg0, reg1) (out) = vminq_f32(reg0, reg1)
 3422+#define stbir__simdf_max1(out, reg0, reg1) (out) = vmaxq_f32(reg0, reg1)
 3423+
 3424+#define stbir__simdf_0123ABCDto3ABx(out, reg0, reg1)                           \
 3425+	(out) = vextq_f32(reg0, reg1, 3)
 3426+#define stbir__simdf_0123ABCDto23Ax(out, reg0, reg1)                           \
 3427+	(out) = vextq_f32(reg0, reg1, 2)
 3428+
 3429+#define stbir__simdf_a1a1(out, alp, ones)                                      \
 3430+	(out) = vzipq_f32(vuzpq_f32(alp, alp).val[1], ones).val[0]
 3431+#define stbir__simdf_1a1a(out, alp, ones)                                      \
 3432+	(out) = vzipq_f32(ones, vuzpq_f32(alp, alp).val[0]).val[0]
 3433+
 3434+#if defined(_M_ARM64) || defined(__aarch64__) || defined(__arm64__)
 3435+
 3436+#define stbir__simdf_aaa1(out, alp, ones)                                      \
 3437+	(out) = vcopyq_laneq_f32(vdupq_n_f32(vgetq_lane_f32(alp, 3)), 3, ones, 3)
 3438+#define stbir__simdf_1aaa(out, alp, ones)                                      \
 3439+	(out) = vcopyq_laneq_f32(vdupq_n_f32(vgetq_lane_f32(alp, 0)), 0, ones, 0)
 3440+
 3441+#if defined(_MSC_VER) && !defined(__clang__)
 3442+#define stbir_make16(a, b, c, d)                                               \
 3443+	vcombine_u8(                                                               \
 3444+	    vcreate_u8((4 * a + 0) | ((4 * a + 1) << 8) | ((4 * a + 2) << 16) |    \
 3445+	               ((4 * a + 3) << 24) | ((stbir_uint64)(4 * b + 0) << 32) |   \
 3446+	               ((stbir_uint64)(4 * b + 1) << 40) |                         \
 3447+	               ((stbir_uint64)(4 * b + 2) << 48) |                         \
 3448+	               ((stbir_uint64)(4 * b + 3) << 56)),                         \
 3449+	    vcreate_u8((4 * c + 0) | ((4 * c + 1) << 8) | ((4 * c + 2) << 16) |    \
 3450+	               ((4 * c + 3) << 24) | ((stbir_uint64)(4 * d + 0) << 32) |   \
 3451+	               ((stbir_uint64)(4 * d + 1) << 40) |                         \
 3452+	               ((stbir_uint64)(4 * d + 2) << 48) |                         \
 3453+	               ((stbir_uint64)(4 * d + 3) << 56)))
 3454+
 3455+static stbir__inline uint8x16x2_t
 3456+stbir_make16x2(float32x4_t rega, float32x4_t regb)
 3457+{
 3458+	uint8x16x2_t r = {vreinterpretq_u8_f32(rega), vreinterpretq_u8_f32(regb)};
 3459+	return r;
 3460+}
 3461+#else
 3462+#define stbir_make16(a, b, c, d)                                               \
 3463+	(uint8x16_t){4 * a + 0, 4 * a + 1, 4 * a + 2, 4 * a + 3,                   \
 3464+	             4 * b + 0, 4 * b + 1, 4 * b + 2, 4 * b + 3,                   \
 3465+	             4 * c + 0, 4 * c + 1, 4 * c + 2, 4 * c + 3,                   \
 3466+	             4 * d + 0, 4 * d + 1, 4 * d + 2, 4 * d + 3}
 3467+#define stbir_make16x2(a, b)                                                   \
 3468+	(uint8x16x2_t)                                                             \
 3469+	{                                                                          \
 3470+		{                                                                      \
 3471+			vreinterpretq_u8_f32(a), vreinterpretq_u8_f32(b)                   \
 3472+		}                                                                      \
 3473+	}
 3474+#endif
 3475+
 3476+#define stbir__simdf_swiz(reg, one, two, three, four)                          \
 3477+	vreinterpretq_f32_u8(vqtbl1q_u8(vreinterpretq_u8_f32(reg),                 \
 3478+	                                stbir_make16(one, two, three, four)))
 3479+#define stbir__simdf_swiz2(rega, regb, one, two, three, four)                  \
 3480+	vreinterpretq_f32_u8(vqtbl2q_u8(stbir_make16x2(rega, regb),                \
 3481+	                                stbir_make16(one, two, three, four)))
 3482+
 3483+#define stbir__simdi_16madd(out, reg0, reg1)                                   \
 3484+	{                                                                          \
 3485+		int16x8_t r0 = vreinterpretq_s16_u32(reg0);                            \
 3486+		int16x8_t r1 = vreinterpretq_s16_u32(reg1);                            \
 3487+		int32x4_t tmp0 = vmull_s16(vget_low_s16(r0), vget_low_s16(r1));        \
 3488+		int32x4_t tmp1 = vmull_s16(vget_high_s16(r0), vget_high_s16(r1));      \
 3489+		(out) = vreinterpretq_u32_s32(vpaddq_s32(tmp0, tmp1));                 \
 3490+	}
 3491+
 3492+#else
 3493+
 3494+#define stbir__simdf_aaa1(out, alp, ones)                                      \
 3495+	(out) = vsetq_lane_f32(1.0f, vdupq_n_f32(vgetq_lane_f32(alp, 3)), 3)
 3496+#define stbir__simdf_1aaa(out, alp, ones)                                      \
 3497+	(out) = vsetq_lane_f32(1.0f, vdupq_n_f32(vgetq_lane_f32(alp, 0)), 0)
 3498+
 3499+#if defined(_MSC_VER) && !defined(__clang__)
 3500+static stbir__inline uint8x8x2_t
 3501+stbir_make8x2(float32x4_t reg)
 3502+{
 3503+	uint8x8x2_t r = {{vget_low_u8(vreinterpretq_u8_f32(reg)),
 3504+	                  vget_high_u8(vreinterpretq_u8_f32(reg))}};
 3505+	return r;
 3506+}
 3507+#define stbir_make8(a, b)                                                      \
 3508+	vcreate_u8((4 * a + 0) | ((4 * a + 1) << 8) | ((4 * a + 2) << 16) |        \
 3509+	           ((4 * a + 3) << 24) | ((stbir_uint64)(4 * b + 0) << 32) |       \
 3510+	           ((stbir_uint64)(4 * b + 1) << 40) |                             \
 3511+	           ((stbir_uint64)(4 * b + 2) << 48) |                             \
 3512+	           ((stbir_uint64)(4 * b + 3) << 56))
 3513+#else
 3514+#define stbir_make8x2(reg)                                                     \
 3515+	(uint8x8x2_t)                                                              \
 3516+	{                                                                          \
 3517+		{                                                                      \
 3518+			vget_low_u8(vreinterpretq_u8_f32(reg)),                            \
 3519+			    vget_high_u8(vreinterpretq_u8_f32(reg))                        \
 3520+		}                                                                      \
 3521+	}
 3522+#define stbir_make8(a, b)                                                      \
 3523+	(uint8x8_t){4 * a + 0, 4 * a + 1, 4 * a + 2, 4 * a + 3,                    \
 3524+	            4 * b + 0, 4 * b + 1, 4 * b + 2, 4 * b + 3}
 3525+#endif
 3526+
 3527+#define stbir__simdf_swiz(reg, one, two, three, four)                          \
 3528+	vreinterpretq_f32_u8(                                                      \
 3529+	    vcombine_u8(vtbl2_u8(stbir_make8x2(reg), stbir_make8(one, two)),       \
 3530+	                vtbl2_u8(stbir_make8x2(reg), stbir_make8(three, four))))
 3531+
 3532+#define stbir__simdi_16madd(out, reg0, reg1)                                   \
 3533+	{                                                                          \
 3534+		int16x8_t r0 = vreinterpretq_s16_u32(reg0);                            \
 3535+		int16x8_t r1 = vreinterpretq_s16_u32(reg1);                            \
 3536+		int32x4_t tmp0 = vmull_s16(vget_low_s16(r0), vget_low_s16(r1));        \
 3537+		int32x4_t tmp1 = vmull_s16(vget_high_s16(r0), vget_high_s16(r1));      \
 3538+		int32x2_t out0 = vpadd_s32(vget_low_s32(tmp0), vget_high_s32(tmp0));   \
 3539+		int32x2_t out1 = vpadd_s32(vget_low_s32(tmp1), vget_high_s32(tmp1));   \
 3540+		(out) = vreinterpretq_u32_s32(vcombine_s32(out0, out1));               \
 3541+	}
 3542+
 3543+#endif
 3544+
 3545+#define stbir__simdi_and(out, reg0, reg1) (out) = vandq_u32(reg0, reg1)
 3546+#define stbir__simdi_or(out, reg0, reg1) (out) = vorrq_u32(reg0, reg1)
 3547+
 3548+#define stbir__simdf_pack_to_8bytes(out, aa, bb)                               \
 3549+	{                                                                          \
 3550+		float32x4_t af =                                                       \
 3551+		    vmaxq_f32(vminq_f32(aa, STBIR__CONSTF(STBIR_max_uint8_as_float)),  \
 3552+		              vdupq_n_f32(0));                                         \
 3553+		float32x4_t bf =                                                       \
 3554+		    vmaxq_f32(vminq_f32(bb, STBIR__CONSTF(STBIR_max_uint8_as_float)),  \
 3555+		              vdupq_n_f32(0));                                         \
 3556+		int16x4_t ai = vqmovn_s32(vcvtq_s32_f32(af));                          \
 3557+		int16x4_t bi = vqmovn_s32(vcvtq_s32_f32(bf));                          \
 3558+		uint8x8_t out8 = vqmovun_s16(vcombine_s16(ai, bi));                    \
 3559+		out = vreinterpretq_u32_u8(vcombine_u8(out8, out8));                   \
 3560+	}
 3561+
 3562+#define stbir__simdf_pack_to_8words(out, aa, bb)                               \
 3563+	{                                                                          \
 3564+		float32x4_t af =                                                       \
 3565+		    vmaxq_f32(vminq_f32(aa, STBIR__CONSTF(STBIR_max_uint16_as_float)), \
 3566+		              vdupq_n_f32(0));                                         \
 3567+		float32x4_t bf =                                                       \
 3568+		    vmaxq_f32(vminq_f32(bb, STBIR__CONSTF(STBIR_max_uint16_as_float)), \
 3569+		              vdupq_n_f32(0));                                         \
 3570+		int32x4_t ai = vcvtq_s32_f32(af);                                      \
 3571+		int32x4_t bi = vcvtq_s32_f32(bf);                                      \
 3572+		out = vreinterpretq_u32_u16(                                           \
 3573+		    vcombine_u16(vqmovun_s32(ai), vqmovun_s32(bi)));                   \
 3574+	}
 3575+
 3576+#define stbir__interleave_pack_and_store_16_u8(ptr, r0, r1, r2, r3)            \
 3577+	{                                                                          \
 3578+		int16x4x2_t tmp0 = vzip_s16(vqmovn_s32(vreinterpretq_s32_u32(r0)),     \
 3579+		                            vqmovn_s32(vreinterpretq_s32_u32(r2)));    \
 3580+		int16x4x2_t tmp1 = vzip_s16(vqmovn_s32(vreinterpretq_s32_u32(r1)),     \
 3581+		                            vqmovn_s32(vreinterpretq_s32_u32(r3)));    \
 3582+		uint8x8x2_t out = {{                                                   \
 3583+		    vqmovun_s16(vcombine_s16(tmp0.val[0], tmp0.val[1])),               \
 3584+		    vqmovun_s16(vcombine_s16(tmp1.val[0], tmp1.val[1])),               \
 3585+		}};                                                                    \
 3586+		vst2_u8(ptr, out);                                                     \
 3587+	}
 3588+
 3589+#define stbir__simdf_load4_transposed(o0, o1, o2, o3, ptr)                     \
 3590+	{                                                                          \
 3591+		float32x4x4_t tmp = vld4q_f32(ptr);                                    \
 3592+		o0 = tmp.val[0];                                                       \
 3593+		o1 = tmp.val[1];                                                       \
 3594+		o2 = tmp.val[2];                                                       \
 3595+		o3 = tmp.val[3];                                                       \
 3596+	}
 3597+
 3598+#define stbir__simdi_32shr(out, reg, imm) out = vshrq_n_u32(reg, imm)
 3599+
 3600+#if defined(_MSC_VER) && !defined(__clang__)
 3601+#define STBIR__SIMDF_CONST(var, x)                                             \
 3602+	__declspec(align(8)) float var[] = {x, x, x, x}
 3603+#define STBIR__SIMDI_CONST(var, x)                                             \
 3604+	__declspec(align(8)) uint32_t var[] = {x, x, x, x}
 3605+#define STBIR__CONSTF(var) (*(const float32x4_t *)var)
 3606+#define STBIR__CONSTI(var) (*(const uint32x4_t *)var)
 3607+#else
 3608+#define STBIR__SIMDF_CONST(var, x) stbir__simdf var = {x, x, x, x}
 3609+#define STBIR__SIMDI_CONST(var, x) stbir__simdi var = {x, x, x, x}
 3610+#define STBIR__CONSTF(var) (var)
 3611+#define STBIR__CONSTI(var) (var)
 3612+#endif
 3613+
 3614+#ifdef STBIR_FLOORF
 3615+#undef STBIR_FLOORF
 3616+#endif
 3617+#define STBIR_FLOORF stbir_simd_floorf
 3618+static stbir__inline float
 3619+stbir_simd_floorf(float x)
 3620+{
 3621+#if defined(_M_ARM64) || defined(__aarch64__) || defined(__arm64__)
 3622+	return vget_lane_f32(vrndm_f32(vdup_n_f32(x)), 0);
 3623+#else
 3624+	float32x2_t f = vdup_n_f32(x);
 3625+	float32x2_t t = vcvt_f32_s32(vcvt_s32_f32(f));
 3626+	uint32x2_t a = vclt_f32(f, t);
 3627+	uint32x2_t b = vreinterpret_u32_f32(vdup_n_f32(-1.0f));
 3628+	float32x2_t r = vadd_f32(t, vreinterpret_f32_u32(vand_u32(a, b)));
 3629+	return vget_lane_f32(r, 0);
 3630+#endif
 3631+}
 3632+
 3633+#ifdef STBIR_CEILF
 3634+#undef STBIR_CEILF
 3635+#endif
 3636+#define STBIR_CEILF stbir_simd_ceilf
 3637+static stbir__inline float
 3638+stbir_simd_ceilf(float x)
 3639+{
 3640+#if defined(_M_ARM64) || defined(__aarch64__) || defined(__arm64__)
 3641+	return vget_lane_f32(vrndp_f32(vdup_n_f32(x)), 0);
 3642+#else
 3643+	float32x2_t f = vdup_n_f32(x);
 3644+	float32x2_t t = vcvt_f32_s32(vcvt_s32_f32(f));
 3645+	uint32x2_t a = vclt_f32(t, f);
 3646+	uint32x2_t b = vreinterpret_u32_f32(vdup_n_f32(1.0f));
 3647+	float32x2_t r = vadd_f32(t, vreinterpret_f32_u32(vand_u32(a, b)));
 3648+	return vget_lane_f32(r, 0);
 3649+#endif
 3650+}
 3651+
 3652+#define STBIR_SIMD
 3653 
 3654 #elif defined(STBIR_WASM)
 3655 
 3656-  #include <wasm_simd128.h>
 3657-
 3658-  #define stbir__simdf v128_t
 3659-  #define stbir__simdi v128_t
 3660-
 3661-  #define stbir_simdi_castf( reg ) (reg)
 3662-  #define stbir_simdf_casti( reg ) (reg)
 3663-
 3664-  #define stbir__simdf_load( reg, ptr )             (reg) = wasm_v128_load( (void const*)(ptr) )
 3665-  #define stbir__simdi_load( reg, ptr )             (reg) = wasm_v128_load( (void const*)(ptr) )
 3666-  #define stbir__simdf_load1( out, ptr )            (out) = wasm_v128_load32_splat( (void const*)(ptr) ) // top values can be random (not denormal or nan for perf)
 3667-  #define stbir__simdi_load1( out, ptr )            (out) = wasm_v128_load32_splat( (void const*)(ptr) )
 3668-  #define stbir__simdf_load1z( out, ptr )           (out) = wasm_v128_load32_zero( (void const*)(ptr) ) // top values must be zero
 3669-  #define stbir__simdf_frep4( fvar )                wasm_f32x4_splat( fvar )
 3670-  #define stbir__simdf_load1frep4( out, fvar )      (out) = wasm_f32x4_splat( fvar )
 3671-  #define stbir__simdf_load2( out, ptr )            (out) = wasm_v128_load64_splat( (void const*)(ptr) ) // top values can be random (not denormal or nan for perf)
 3672-  #define stbir__simdf_load2z( out, ptr )           (out) = wasm_v128_load64_zero( (void const*)(ptr) ) // top values must be zero
 3673-  #define stbir__simdf_load2hmerge( out, reg, ptr ) (out) = wasm_v128_load64_lane( (void const*)(ptr), reg, 1 )
 3674-
 3675-  #define stbir__simdf_zeroP() wasm_f32x4_const_splat(0)
 3676-  #define stbir__simdf_zero( reg ) (reg) = wasm_f32x4_const_splat(0)
 3677-
 3678-  #define stbir__simdf_store( ptr, reg )   wasm_v128_store( (void*)(ptr), reg )
 3679-  #define stbir__simdf_store1( ptr, reg )  wasm_v128_store32_lane( (void*)(ptr), reg, 0 )
 3680-  #define stbir__simdf_store2( ptr, reg )  wasm_v128_store64_lane( (void*)(ptr), reg, 0 )
 3681-  #define stbir__simdf_store2h( ptr, reg ) wasm_v128_store64_lane( (void*)(ptr), reg, 1 )
 3682-
 3683-  #define stbir__simdi_store( ptr, reg )  wasm_v128_store( (void*)(ptr), reg )
 3684-  #define stbir__simdi_store1( ptr, reg ) wasm_v128_store32_lane( (void*)(ptr), reg, 0 )
 3685-  #define stbir__simdi_store2( ptr, reg ) wasm_v128_store64_lane( (void*)(ptr), reg, 0 )
 3686-
 3687-  #define stbir__prefetch( ptr )
 3688-
 3689-  #define stbir__simdi_expand_u8_to_u32(out0,out1,out2,out3,ireg) \
 3690-  { \
 3691-    v128_t l = wasm_u16x8_extend_low_u8x16 ( ireg ); \
 3692-    v128_t h = wasm_u16x8_extend_high_u8x16( ireg ); \
 3693-    out0 = wasm_u32x4_extend_low_u16x8 ( l ); \
 3694-    out1 = wasm_u32x4_extend_high_u16x8( l ); \
 3695-    out2 = wasm_u32x4_extend_low_u16x8 ( h ); \
 3696-    out3 = wasm_u32x4_extend_high_u16x8( h ); \
 3697-  }
 3698-
 3699-  #define stbir__simdi_expand_u8_to_1u32(out,ireg) \
 3700-  { \
 3701-    v128_t tmp = wasm_u16x8_extend_low_u8x16(ireg); \
 3702-    out = wasm_u32x4_extend_low_u16x8(tmp); \
 3703-  }
 3704-
 3705-  #define stbir__simdi_expand_u16_to_u32(out0,out1,ireg) \
 3706-  { \
 3707-    out0 = wasm_u32x4_extend_low_u16x8 ( ireg ); \
 3708-    out1 = wasm_u32x4_extend_high_u16x8( ireg ); \
 3709-  }
 3710-
 3711-  #define stbir__simdf_convert_float_to_i32( i, f )    (i) = wasm_i32x4_trunc_sat_f32x4(f)
 3712-  #define stbir__simdf_convert_float_to_int( f )       wasm_i32x4_extract_lane(wasm_i32x4_trunc_sat_f32x4(f), 0)
 3713-  #define stbir__simdi_to_int( i )                     wasm_i32x4_extract_lane(i, 0)
 3714-  #define stbir__simdf_convert_float_to_uint8( f )     ((unsigned char)wasm_i32x4_extract_lane(wasm_i32x4_trunc_sat_f32x4(wasm_f32x4_max(wasm_f32x4_min(f,STBIR_max_uint8_as_float),wasm_f32x4_const_splat(0))), 0))
 3715-  #define stbir__simdf_convert_float_to_short( f )     ((unsigned short)wasm_i32x4_extract_lane(wasm_i32x4_trunc_sat_f32x4(wasm_f32x4_max(wasm_f32x4_min(f,STBIR_max_uint16_as_float),wasm_f32x4_const_splat(0))), 0))
 3716-  #define stbir__simdi_convert_i32_to_float(out, ireg) (out) = wasm_f32x4_convert_i32x4(ireg)
 3717-  #define stbir__simdf_add( out, reg0, reg1 )          (out) = wasm_f32x4_add( reg0, reg1 )
 3718-  #define stbir__simdf_mult( out, reg0, reg1 )         (out) = wasm_f32x4_mul( reg0, reg1 )
 3719-  #define stbir__simdf_mult_mem( out, reg, ptr )       (out) = wasm_f32x4_mul( reg, wasm_v128_load( (void const*)(ptr) ) )
 3720-  #define stbir__simdf_mult1_mem( out, reg, ptr )      (out) = wasm_f32x4_mul( reg, wasm_v128_load32_splat( (void const*)(ptr) ) )
 3721-  #define stbir__simdf_add_mem( out, reg, ptr )        (out) = wasm_f32x4_add( reg, wasm_v128_load( (void const*)(ptr) ) )
 3722-  #define stbir__simdf_add1_mem( out, reg, ptr )       (out) = wasm_f32x4_add( reg, wasm_v128_load32_splat( (void const*)(ptr) ) )
 3723-
 3724-  #define stbir__simdf_madd( out, add, mul1, mul2 )    (out) = wasm_f32x4_add( add, wasm_f32x4_mul( mul1, mul2 ) )
 3725-  #define stbir__simdf_madd1( out, add, mul1, mul2 )   (out) = wasm_f32x4_add( add, wasm_f32x4_mul( mul1, mul2 ) )
 3726-  #define stbir__simdf_madd_mem( out, add, mul, ptr )  (out) = wasm_f32x4_add( add, wasm_f32x4_mul( mul, wasm_v128_load( (void const*)(ptr) ) ) )
 3727-  #define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = wasm_f32x4_add( add, wasm_f32x4_mul( mul, wasm_v128_load32_splat( (void const*)(ptr) ) ) )
 3728-
 3729-  #define stbir__simdf_add1( out, reg0, reg1 )  (out) = wasm_f32x4_add( reg0, reg1 )
 3730-  #define stbir__simdf_mult1( out, reg0, reg1 ) (out) = wasm_f32x4_mul( reg0, reg1 )
 3731-
 3732-  #define stbir__simdf_and( out, reg0, reg1 ) (out) = wasm_v128_and( reg0, reg1 )
 3733-  #define stbir__simdf_or( out, reg0, reg1 )  (out) = wasm_v128_or( reg0, reg1 )
 3734-
 3735-  #define stbir__simdf_min( out, reg0, reg1 ) (out) = wasm_f32x4_min( reg0, reg1 )
 3736-  #define stbir__simdf_max( out, reg0, reg1 ) (out) = wasm_f32x4_max( reg0, reg1 )
 3737-  #define stbir__simdf_min1( out, reg0, reg1 ) (out) = wasm_f32x4_min( reg0, reg1 )
 3738-  #define stbir__simdf_max1( out, reg0, reg1 ) (out) = wasm_f32x4_max( reg0, reg1 )
 3739-
 3740-  #define stbir__simdf_0123ABCDto3ABx( out, reg0, reg1 ) (out) = wasm_i32x4_shuffle( reg0, reg1, 3, 4, 5, -1 )
 3741-  #define stbir__simdf_0123ABCDto23Ax( out, reg0, reg1 ) (out) = wasm_i32x4_shuffle( reg0, reg1, 2, 3, 4, -1 )
 3742-
 3743-  #define stbir__simdf_aaa1(out,alp,ones) (out) = wasm_i32x4_shuffle(alp, ones, 3, 3, 3, 4)
 3744-  #define stbir__simdf_1aaa(out,alp,ones) (out) = wasm_i32x4_shuffle(alp, ones, 4, 0, 0, 0)
 3745-  #define stbir__simdf_a1a1(out,alp,ones) (out) = wasm_i32x4_shuffle(alp, ones, 1, 4, 3, 4)
 3746-  #define stbir__simdf_1a1a(out,alp,ones) (out) = wasm_i32x4_shuffle(alp, ones, 4, 0, 4, 2)
 3747-
 3748-  #define stbir__simdf_swiz( reg, one, two, three, four ) wasm_i32x4_shuffle(reg, reg, one, two, three, four)
 3749-
 3750-  #define stbir__simdi_and( out, reg0, reg1 )    (out) = wasm_v128_and( reg0, reg1 )
 3751-  #define stbir__simdi_or( out, reg0, reg1 )     (out) = wasm_v128_or( reg0, reg1 )
 3752-  #define stbir__simdi_16madd( out, reg0, reg1 ) (out) = wasm_i32x4_dot_i16x8( reg0, reg1 )
 3753-
 3754-  #define stbir__simdf_pack_to_8bytes(out,aa,bb) \
 3755-  { \
 3756-    v128_t af = wasm_f32x4_max( wasm_f32x4_min(aa, STBIR_max_uint8_as_float), wasm_f32x4_const_splat(0) ); \
 3757-    v128_t bf = wasm_f32x4_max( wasm_f32x4_min(bb, STBIR_max_uint8_as_float), wasm_f32x4_const_splat(0) ); \
 3758-    v128_t ai = wasm_i32x4_trunc_sat_f32x4( af ); \
 3759-    v128_t bi = wasm_i32x4_trunc_sat_f32x4( bf ); \
 3760-    v128_t out16 = wasm_i16x8_narrow_i32x4( ai, bi ); \
 3761-    out = wasm_u8x16_narrow_i16x8( out16, out16 ); \
 3762-  }
 3763-
 3764-  #define stbir__simdf_pack_to_8words(out,aa,bb) \
 3765-  { \
 3766-    v128_t af = wasm_f32x4_max( wasm_f32x4_min(aa, STBIR_max_uint16_as_float), wasm_f32x4_const_splat(0)); \
 3767-    v128_t bf = wasm_f32x4_max( wasm_f32x4_min(bb, STBIR_max_uint16_as_float), wasm_f32x4_const_splat(0)); \
 3768-    v128_t ai = wasm_i32x4_trunc_sat_f32x4( af ); \
 3769-    v128_t bi = wasm_i32x4_trunc_sat_f32x4( bf ); \
 3770-    out = wasm_u16x8_narrow_i32x4( ai, bi ); \
 3771-  }
 3772-
 3773-  #define stbir__interleave_pack_and_store_16_u8( ptr, r0, r1, r2, r3 ) \
 3774-  { \
 3775-    v128_t tmp0 = wasm_i16x8_narrow_i32x4(r0, r1); \
 3776-    v128_t tmp1 = wasm_i16x8_narrow_i32x4(r2, r3); \
 3777-    v128_t tmp = wasm_u8x16_narrow_i16x8(tmp0, tmp1); \
 3778-    tmp = wasm_i8x16_shuffle(tmp, tmp, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); \
 3779-    wasm_v128_store( (void*)(ptr), tmp); \
 3780-  }
 3781-
 3782-  #define stbir__simdf_load4_transposed( o0, o1, o2, o3, ptr ) \
 3783-  { \
 3784-    v128_t t0 = wasm_v128_load( ptr    ); \
 3785-    v128_t t1 = wasm_v128_load( ptr+4  ); \
 3786-    v128_t t2 = wasm_v128_load( ptr+8  ); \
 3787-    v128_t t3 = wasm_v128_load( ptr+12 ); \
 3788-    v128_t s0 = wasm_i32x4_shuffle(t0, t1, 0, 4, 2, 6); \
 3789-    v128_t s1 = wasm_i32x4_shuffle(t0, t1, 1, 5, 3, 7); \
 3790-    v128_t s2 = wasm_i32x4_shuffle(t2, t3, 0, 4, 2, 6); \
 3791-    v128_t s3 = wasm_i32x4_shuffle(t2, t3, 1, 5, 3, 7); \
 3792-    o0 = wasm_i32x4_shuffle(s0, s2, 0, 1, 4, 5); \
 3793-    o1 = wasm_i32x4_shuffle(s1, s3, 0, 1, 4, 5); \
 3794-    o2 = wasm_i32x4_shuffle(s0, s2, 2, 3, 6, 7); \
 3795-    o3 = wasm_i32x4_shuffle(s1, s3, 2, 3, 6, 7); \
 3796-  }
 3797-
 3798-  #define stbir__simdi_32shr( out, reg, imm ) out = wasm_u32x4_shr( reg, imm )
 3799-
 3800-  typedef float stbir__f32x4 __attribute__((__vector_size__(16), __aligned__(16)));
 3801-  #define STBIR__SIMDF_CONST(var, x) stbir__simdf var = (v128_t)(stbir__f32x4){ x, x, x, x }
 3802-  #define STBIR__SIMDI_CONST(var, x) stbir__simdi var = { x, x, x, x }
 3803-  #define STBIR__CONSTF(var) (var)
 3804-  #define STBIR__CONSTI(var) (var)
 3805-
 3806-  #ifdef STBIR_FLOORF
 3807-  #undef STBIR_FLOORF
 3808-  #endif
 3809-  #define STBIR_FLOORF stbir_simd_floorf
 3810-  static stbir__inline float stbir_simd_floorf(float x)
 3811-  {
 3812-    return wasm_f32x4_extract_lane( wasm_f32x4_floor( wasm_f32x4_splat(x) ), 0);
 3813-  }
 3814-
 3815-  #ifdef STBIR_CEILF
 3816-  #undef STBIR_CEILF
 3817-  #endif
 3818-  #define STBIR_CEILF stbir_simd_ceilf
 3819-  static stbir__inline float stbir_simd_ceilf(float x)
 3820-  {
 3821-    return wasm_f32x4_extract_lane( wasm_f32x4_ceil( wasm_f32x4_splat(x) ), 0);
 3822-  }
 3823-
 3824-  #define STBIR_SIMD
 3825-
 3826-#endif  // SSE2/NEON/WASM
 3827+#include <wasm_simd128.h>
 3828+
 3829+#define stbir__simdf v128_t
 3830+#define stbir__simdi v128_t
 3831+
 3832+#define stbir_simdi_castf(reg) (reg)
 3833+#define stbir_simdf_casti(reg) (reg)
 3834+
 3835+#define stbir__simdf_load(reg, ptr) (reg) = wasm_v128_load((void const *)(ptr))
 3836+#define stbir__simdi_load(reg, ptr) (reg) = wasm_v128_load((void const *)(ptr))
 3837+#define stbir__simdf_load1(out, ptr)                                           \
 3838+	(out) = wasm_v128_load32_splat(                                            \
 3839+	    (void const *)(ptr)) // top values can be random (not denormal or nan
 3840+	                         // for perf)
 3841+#define stbir__simdi_load1(out, ptr)                                           \
 3842+	(out) = wasm_v128_load32_splat((void const *)(ptr))
 3843+#define stbir__simdf_load1z(out, ptr)                                          \
 3844+	(out) =                                                                    \
 3845+	    wasm_v128_load32_zero((void const *)(ptr)) // top values must be zero
 3846+#define stbir__simdf_frep4(fvar) wasm_f32x4_splat(fvar)
 3847+#define stbir__simdf_load1frep4(out, fvar) (out) = wasm_f32x4_splat(fvar)
 3848+#define stbir__simdf_load2(out, ptr)                                           \
 3849+	(out) = wasm_v128_load64_splat(                                            \
 3850+	    (void const *)(ptr)) // top values can be random (not denormal or nan
 3851+	                         // for perf)
 3852+#define stbir__simdf_load2z(out, ptr)                                          \
 3853+	(out) =                                                                    \
 3854+	    wasm_v128_load64_zero((void const *)(ptr)) // top values must be zero
 3855+#define stbir__simdf_load2hmerge(out, reg, ptr)                                \
 3856+	(out) = wasm_v128_load64_lane((void const *)(ptr), reg, 1)
 3857+
 3858+#define stbir__simdf_zeroP() wasm_f32x4_const_splat(0)
 3859+#define stbir__simdf_zero(reg) (reg) = wasm_f32x4_const_splat(0)
 3860+
 3861+#define stbir__simdf_store(ptr, reg) wasm_v128_store((void *)(ptr), reg)
 3862+#define stbir__simdf_store1(ptr, reg)                                          \
 3863+	wasm_v128_store32_lane((void *)(ptr), reg, 0)
 3864+#define stbir__simdf_store2(ptr, reg)                                          \
 3865+	wasm_v128_store64_lane((void *)(ptr), reg, 0)
 3866+#define stbir__simdf_store2h(ptr, reg)                                         \
 3867+	wasm_v128_store64_lane((void *)(ptr), reg, 1)
 3868+
 3869+#define stbir__simdi_store(ptr, reg) wasm_v128_store((void *)(ptr), reg)
 3870+#define stbir__simdi_store1(ptr, reg)                                          \
 3871+	wasm_v128_store32_lane((void *)(ptr), reg, 0)
 3872+#define stbir__simdi_store2(ptr, reg)                                          \
 3873+	wasm_v128_store64_lane((void *)(ptr), reg, 0)
 3874+
 3875+#define stbir__prefetch(ptr)
 3876+
 3877+#define stbir__simdi_expand_u8_to_u32(out0, out1, out2, out3, ireg)            \
 3878+	{                                                                          \
 3879+		v128_t l = wasm_u16x8_extend_low_u8x16(ireg);                          \
 3880+		v128_t h = wasm_u16x8_extend_high_u8x16(ireg);                         \
 3881+		out0 = wasm_u32x4_extend_low_u16x8(l);                                 \
 3882+		out1 = wasm_u32x4_extend_high_u16x8(l);                                \
 3883+		out2 = wasm_u32x4_extend_low_u16x8(h);                                 \
 3884+		out3 = wasm_u32x4_extend_high_u16x8(h);                                \
 3885+	}
 3886+
 3887+#define stbir__simdi_expand_u8_to_1u32(out, ireg)                              \
 3888+	{                                                                          \
 3889+		v128_t tmp = wasm_u16x8_extend_low_u8x16(ireg);                        \
 3890+		out = wasm_u32x4_extend_low_u16x8(tmp);                                \
 3891+	}
 3892+
 3893+#define stbir__simdi_expand_u16_to_u32(out0, out1, ireg)                       \
 3894+	{                                                                          \
 3895+		out0 = wasm_u32x4_extend_low_u16x8(ireg);                              \
 3896+		out1 = wasm_u32x4_extend_high_u16x8(ireg);                             \
 3897+	}
 3898+
 3899+#define stbir__simdf_convert_float_to_i32(i, f)                                \
 3900+	(i) = wasm_i32x4_trunc_sat_f32x4(f)
 3901+#define stbir__simdf_convert_float_to_int(f)                                   \
 3902+	wasm_i32x4_extract_lane(wasm_i32x4_trunc_sat_f32x4(f), 0)
 3903+#define stbir__simdi_to_int(i) wasm_i32x4_extract_lane(i, 0)
 3904+#define stbir__simdf_convert_float_to_uint8(f)                                 \
 3905+	((unsigned char)wasm_i32x4_extract_lane(                                   \
 3906+	    wasm_i32x4_trunc_sat_f32x4(                                            \
 3907+	        wasm_f32x4_max(wasm_f32x4_min(f, STBIR_max_uint8_as_float),        \
 3908+	                       wasm_f32x4_const_splat(0))),                        \
 3909+	    0))
 3910+#define stbir__simdf_convert_float_to_short(f)                                 \
 3911+	((unsigned short)wasm_i32x4_extract_lane(                                  \
 3912+	    wasm_i32x4_trunc_sat_f32x4(                                            \
 3913+	        wasm_f32x4_max(wasm_f32x4_min(f, STBIR_max_uint16_as_float),       \
 3914+	                       wasm_f32x4_const_splat(0))),                        \
 3915+	    0))
 3916+#define stbir__simdi_convert_i32_to_float(out, ireg)                           \
 3917+	(out) = wasm_f32x4_convert_i32x4(ireg)
 3918+#define stbir__simdf_add(out, reg0, reg1) (out) = wasm_f32x4_add(reg0, reg1)
 3919+#define stbir__simdf_mult(out, reg0, reg1) (out) = wasm_f32x4_mul(reg0, reg1)
 3920+#define stbir__simdf_mult_mem(out, reg, ptr)                                   \
 3921+	(out) = wasm_f32x4_mul(reg, wasm_v128_load((void const *)(ptr)))
 3922+#define stbir__simdf_mult1_mem(out, reg, ptr)                                  \
 3923+	(out) = wasm_f32x4_mul(reg, wasm_v128_load32_splat((void const *)(ptr)))
 3924+#define stbir__simdf_add_mem(out, reg, ptr)                                    \
 3925+	(out) = wasm_f32x4_add(reg, wasm_v128_load((void const *)(ptr)))
 3926+#define stbir__simdf_add1_mem(out, reg, ptr)                                   \
 3927+	(out) = wasm_f32x4_add(reg, wasm_v128_load32_splat((void const *)(ptr)))
 3928+
 3929+#define stbir__simdf_madd(out, add, mul1, mul2)                                \
 3930+	(out) = wasm_f32x4_add(add, wasm_f32x4_mul(mul1, mul2))
 3931+#define stbir__simdf_madd1(out, add, mul1, mul2)                               \
 3932+	(out) = wasm_f32x4_add(add, wasm_f32x4_mul(mul1, mul2))
 3933+#define stbir__simdf_madd_mem(out, add, mul, ptr)                              \
 3934+	(out) = wasm_f32x4_add(                                                    \
 3935+	    add, wasm_f32x4_mul(mul, wasm_v128_load((void const *)(ptr))))
 3936+#define stbir__simdf_madd1_mem(out, add, mul, ptr)                             \
 3937+	(out) = wasm_f32x4_add(                                                    \
 3938+	    add, wasm_f32x4_mul(mul, wasm_v128_load32_splat((void const *)(ptr))))
 3939+
 3940+#define stbir__simdf_add1(out, reg0, reg1) (out) = wasm_f32x4_add(reg0, reg1)
 3941+#define stbir__simdf_mult1(out, reg0, reg1) (out) = wasm_f32x4_mul(reg0, reg1)
 3942+
 3943+#define stbir__simdf_and(out, reg0, reg1) (out) = wasm_v128_and(reg0, reg1)
 3944+#define stbir__simdf_or(out, reg0, reg1) (out) = wasm_v128_or(reg0, reg1)
 3945+
 3946+#define stbir__simdf_min(out, reg0, reg1) (out) = wasm_f32x4_min(reg0, reg1)
 3947+#define stbir__simdf_max(out, reg0, reg1) (out) = wasm_f32x4_max(reg0, reg1)
 3948+#define stbir__simdf_min1(out, reg0, reg1) (out) = wasm_f32x4_min(reg0, reg1)
 3949+#define stbir__simdf_max1(out, reg0, reg1) (out) = wasm_f32x4_max(reg0, reg1)
 3950+
 3951+#define stbir__simdf_0123ABCDto3ABx(out, reg0, reg1)                           \
 3952+	(out) = wasm_i32x4_shuffle(reg0, reg1, 3, 4, 5, -1)
 3953+#define stbir__simdf_0123ABCDto23Ax(out, reg0, reg1)                           \
 3954+	(out) = wasm_i32x4_shuffle(reg0, reg1, 2, 3, 4, -1)
 3955+
 3956+#define stbir__simdf_aaa1(out, alp, ones)                                      \
 3957+	(out) = wasm_i32x4_shuffle(alp, ones, 3, 3, 3, 4)
 3958+#define stbir__simdf_1aaa(out, alp, ones)                                      \
 3959+	(out) = wasm_i32x4_shuffle(alp, ones, 4, 0, 0, 0)
 3960+#define stbir__simdf_a1a1(out, alp, ones)                                      \
 3961+	(out) = wasm_i32x4_shuffle(alp, ones, 1, 4, 3, 4)
 3962+#define stbir__simdf_1a1a(out, alp, ones)                                      \
 3963+	(out) = wasm_i32x4_shuffle(alp, ones, 4, 0, 4, 2)
 3964+
 3965+#define stbir__simdf_swiz(reg, one, two, three, four)                          \
 3966+	wasm_i32x4_shuffle(reg, reg, one, two, three, four)
 3967+
 3968+#define stbir__simdi_and(out, reg0, reg1) (out) = wasm_v128_and(reg0, reg1)
 3969+#define stbir__simdi_or(out, reg0, reg1) (out) = wasm_v128_or(reg0, reg1)
 3970+#define stbir__simdi_16madd(out, reg0, reg1)                                   \
 3971+	(out) = wasm_i32x4_dot_i16x8(reg0, reg1)
 3972+
 3973+#define stbir__simdf_pack_to_8bytes(out, aa, bb)                               \
 3974+	{                                                                          \
 3975+		v128_t af =                                                            \
 3976+		    wasm_f32x4_max(wasm_f32x4_min(aa, STBIR_max_uint8_as_float),       \
 3977+		                   wasm_f32x4_const_splat(0));                         \
 3978+		v128_t bf =                                                            \
 3979+		    wasm_f32x4_max(wasm_f32x4_min(bb, STBIR_max_uint8_as_float),       \
 3980+		                   wasm_f32x4_const_splat(0));                         \
 3981+		v128_t ai = wasm_i32x4_trunc_sat_f32x4(af);                            \
 3982+		v128_t bi = wasm_i32x4_trunc_sat_f32x4(bf);                            \
 3983+		v128_t out16 = wasm_i16x8_narrow_i32x4(ai, bi);                        \
 3984+		out = wasm_u8x16_narrow_i16x8(out16, out16);                           \
 3985+	}
 3986+
 3987+#define stbir__simdf_pack_to_8words(out, aa, bb)                               \
 3988+	{                                                                          \
 3989+		v128_t af =                                                            \
 3990+		    wasm_f32x4_max(wasm_f32x4_min(aa, STBIR_max_uint16_as_float),      \
 3991+		                   wasm_f32x4_const_splat(0));                         \
 3992+		v128_t bf =                                                            \
 3993+		    wasm_f32x4_max(wasm_f32x4_min(bb, STBIR_max_uint16_as_float),      \
 3994+		                   wasm_f32x4_const_splat(0));                         \
 3995+		v128_t ai = wasm_i32x4_trunc_sat_f32x4(af);                            \
 3996+		v128_t bi = wasm_i32x4_trunc_sat_f32x4(bf);                            \
 3997+		out = wasm_u16x8_narrow_i32x4(ai, bi);                                 \
 3998+	}
 3999+
 4000+#define stbir__interleave_pack_and_store_16_u8(ptr, r0, r1, r2, r3)            \
 4001+	{                                                                          \
 4002+		v128_t tmp0 = wasm_i16x8_narrow_i32x4(r0, r1);                         \
 4003+		v128_t tmp1 = wasm_i16x8_narrow_i32x4(r2, r3);                         \
 4004+		v128_t tmp = wasm_u8x16_narrow_i16x8(tmp0, tmp1);                      \
 4005+		tmp = wasm_i8x16_shuffle(tmp, tmp, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, \
 4006+		                         14, 3, 7, 11, 15);                            \
 4007+		wasm_v128_store((void *)(ptr), tmp);                                   \
 4008+	}
 4009+
 4010+#define stbir__simdf_load4_transposed(o0, o1, o2, o3, ptr)                     \
 4011+	{                                                                          \
 4012+		v128_t t0 = wasm_v128_load(ptr);                                       \
 4013+		v128_t t1 = wasm_v128_load(ptr + 4);                                   \
 4014+		v128_t t2 = wasm_v128_load(ptr + 8);                                   \
 4015+		v128_t t3 = wasm_v128_load(ptr + 12);                                  \
 4016+		v128_t s0 = wasm_i32x4_shuffle(t0, t1, 0, 4, 2, 6);                    \
 4017+		v128_t s1 = wasm_i32x4_shuffle(t0, t1, 1, 5, 3, 7);                    \
 4018+		v128_t s2 = wasm_i32x4_shuffle(t2, t3, 0, 4, 2, 6);                    \
 4019+		v128_t s3 = wasm_i32x4_shuffle(t2, t3, 1, 5, 3, 7);                    \
 4020+		o0 = wasm_i32x4_shuffle(s0, s2, 0, 1, 4, 5);                           \
 4021+		o1 = wasm_i32x4_shuffle(s1, s3, 0, 1, 4, 5);                           \
 4022+		o2 = wasm_i32x4_shuffle(s0, s2, 2, 3, 6, 7);                           \
 4023+		o3 = wasm_i32x4_shuffle(s1, s3, 2, 3, 6, 7);                           \
 4024+	}
 4025+
 4026+#define stbir__simdi_32shr(out, reg, imm) out = wasm_u32x4_shr(reg, imm)
 4027+
 4028+typedef float stbir__f32x4
 4029+    __attribute__((__vector_size__(16), __aligned__(16)));
 4030+#define STBIR__SIMDF_CONST(var, x)                                             \
 4031+	stbir__simdf var = (v128_t)(stbir__f32x4) { x, x, x, x }
 4032+#define STBIR__SIMDI_CONST(var, x) stbir__simdi var = {x, x, x, x}
 4033+#define STBIR__CONSTF(var) (var)
 4034+#define STBIR__CONSTI(var) (var)
 4035+
 4036+#ifdef STBIR_FLOORF
 4037+#undef STBIR_FLOORF
 4038+#endif
 4039+#define STBIR_FLOORF stbir_simd_floorf
 4040+static stbir__inline float
 4041+stbir_simd_floorf(float x)
 4042+{
 4043+	return wasm_f32x4_extract_lane(wasm_f32x4_floor(wasm_f32x4_splat(x)), 0);
 4044+}
 4045+
 4046+#ifdef STBIR_CEILF
 4047+#undef STBIR_CEILF
 4048+#endif
 4049+#define STBIR_CEILF stbir_simd_ceilf
 4050+static stbir__inline float
 4051+stbir_simd_ceilf(float x)
 4052+{
 4053+	return wasm_f32x4_extract_lane(wasm_f32x4_ceil(wasm_f32x4_splat(x)), 0);
 4054+}
 4055+
 4056+#define STBIR_SIMD
 4057+
 4058+#endif // SSE2/NEON/WASM
 4059 
 4060 #endif // NO SIMD
 4061 
 4062 #ifdef STBIR_SIMD8
 4063-  #define stbir__simdfX stbir__simdf8
 4064-  #define stbir__simdiX stbir__simdi8
 4065-  #define stbir__simdfX_load stbir__simdf8_load
 4066-  #define stbir__simdiX_load stbir__simdi8_load
 4067-  #define stbir__simdfX_mult stbir__simdf8_mult
 4068-  #define stbir__simdfX_add_mem stbir__simdf8_add_mem
 4069-  #define stbir__simdfX_madd_mem stbir__simdf8_madd_mem
 4070-  #define stbir__simdfX_store stbir__simdf8_store
 4071-  #define stbir__simdiX_store stbir__simdi8_store
 4072-  #define stbir__simdf_frepX  stbir__simdf8_frep8
 4073-  #define stbir__simdfX_madd stbir__simdf8_madd
 4074-  #define stbir__simdfX_min stbir__simdf8_min
 4075-  #define stbir__simdfX_max stbir__simdf8_max
 4076-  #define stbir__simdfX_aaa1 stbir__simdf8_aaa1
 4077-  #define stbir__simdfX_1aaa stbir__simdf8_1aaa
 4078-  #define stbir__simdfX_a1a1 stbir__simdf8_a1a1
 4079-  #define stbir__simdfX_1a1a stbir__simdf8_1a1a
 4080-  #define stbir__simdfX_convert_float_to_i32 stbir__simdf8_convert_float_to_i32
 4081-  #define stbir__simdfX_pack_to_words stbir__simdf8_pack_to_16words
 4082-  #define stbir__simdfX_zero stbir__simdf8_zero
 4083-  #define STBIR_onesX STBIR_ones8
 4084-  #define STBIR_max_uint8_as_floatX STBIR_max_uint8_as_float8
 4085-  #define STBIR_max_uint16_as_floatX STBIR_max_uint16_as_float8
 4086-  #define STBIR_simd_point5X STBIR_simd_point58
 4087-  #define stbir__simdfX_float_count 8
 4088-  #define stbir__simdfX_0123to1230 stbir__simdf8_0123to12301230
 4089-  #define stbir__simdfX_0123to2103 stbir__simdf8_0123to21032103
 4090-  static const stbir__simdf8 STBIR_max_uint16_as_float_inverted8 = { stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted };
 4091-  static const stbir__simdf8 STBIR_max_uint8_as_float_inverted8 = { stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted };
 4092-  static const stbir__simdf8 STBIR_ones8 = { 1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 };
 4093-  static const stbir__simdf8 STBIR_simd_point58 = { 0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5 };
 4094-  static const stbir__simdf8 STBIR_max_uint8_as_float8 = { stbir__max_uint8_as_float,stbir__max_uint8_as_float,stbir__max_uint8_as_float,stbir__max_uint8_as_float, stbir__max_uint8_as_float,stbir__max_uint8_as_float,stbir__max_uint8_as_float,stbir__max_uint8_as_float };
 4095-  static const stbir__simdf8 STBIR_max_uint16_as_float8 = { stbir__max_uint16_as_float,stbir__max_uint16_as_float,stbir__max_uint16_as_float,stbir__max_uint16_as_float, stbir__max_uint16_as_float,stbir__max_uint16_as_float,stbir__max_uint16_as_float,stbir__max_uint16_as_float };
 4096+#define stbir__simdfX stbir__simdf8
 4097+#define stbir__simdiX stbir__simdi8
 4098+#define stbir__simdfX_load stbir__simdf8_load
 4099+#define stbir__simdiX_load stbir__simdi8_load
 4100+#define stbir__simdfX_mult stbir__simdf8_mult
 4101+#define stbir__simdfX_add_mem stbir__simdf8_add_mem
 4102+#define stbir__simdfX_madd_mem stbir__simdf8_madd_mem
 4103+#define stbir__simdfX_store stbir__simdf8_store
 4104+#define stbir__simdiX_store stbir__simdi8_store
 4105+#define stbir__simdf_frepX stbir__simdf8_frep8
 4106+#define stbir__simdfX_madd stbir__simdf8_madd
 4107+#define stbir__simdfX_min stbir__simdf8_min
 4108+#define stbir__simdfX_max stbir__simdf8_max
 4109+#define stbir__simdfX_aaa1 stbir__simdf8_aaa1
 4110+#define stbir__simdfX_1aaa stbir__simdf8_1aaa
 4111+#define stbir__simdfX_a1a1 stbir__simdf8_a1a1
 4112+#define stbir__simdfX_1a1a stbir__simdf8_1a1a
 4113+#define stbir__simdfX_convert_float_to_i32 stbir__simdf8_convert_float_to_i32
 4114+#define stbir__simdfX_pack_to_words stbir__simdf8_pack_to_16words
 4115+#define stbir__simdfX_zero stbir__simdf8_zero
 4116+#define STBIR_onesX STBIR_ones8
 4117+#define STBIR_max_uint8_as_floatX STBIR_max_uint8_as_float8
 4118+#define STBIR_max_uint16_as_floatX STBIR_max_uint16_as_float8
 4119+#define STBIR_simd_point5X STBIR_simd_point58
 4120+#define stbir__simdfX_float_count 8
 4121+#define stbir__simdfX_0123to1230 stbir__simdf8_0123to12301230
 4122+#define stbir__simdfX_0123to2103 stbir__simdf8_0123to21032103
 4123+static const stbir__simdf8 STBIR_max_uint16_as_float_inverted8 = {
 4124+    stbir__max_uint16_as_float_inverted, stbir__max_uint16_as_float_inverted,
 4125+    stbir__max_uint16_as_float_inverted, stbir__max_uint16_as_float_inverted,
 4126+    stbir__max_uint16_as_float_inverted, stbir__max_uint16_as_float_inverted,
 4127+    stbir__max_uint16_as_float_inverted, stbir__max_uint16_as_float_inverted};
 4128+static const stbir__simdf8 STBIR_max_uint8_as_float_inverted8 = {
 4129+    stbir__max_uint8_as_float_inverted, stbir__max_uint8_as_float_inverted,
 4130+    stbir__max_uint8_as_float_inverted, stbir__max_uint8_as_float_inverted,
 4131+    stbir__max_uint8_as_float_inverted, stbir__max_uint8_as_float_inverted,
 4132+    stbir__max_uint8_as_float_inverted, stbir__max_uint8_as_float_inverted};
 4133+static const stbir__simdf8 STBIR_ones8 = {1.0, 1.0, 1.0, 1.0,
 4134+                                          1.0, 1.0, 1.0, 1.0};
 4135+static const stbir__simdf8 STBIR_simd_point58 = {0.5, 0.5, 0.5, 0.5,
 4136+                                                 0.5, 0.5, 0.5, 0.5};
 4137+static const stbir__simdf8 STBIR_max_uint8_as_float8 = {
 4138+    stbir__max_uint8_as_float, stbir__max_uint8_as_float,
 4139+    stbir__max_uint8_as_float, stbir__max_uint8_as_float,
 4140+    stbir__max_uint8_as_float, stbir__max_uint8_as_float,
 4141+    stbir__max_uint8_as_float, stbir__max_uint8_as_float};
 4142+static const stbir__simdf8 STBIR_max_uint16_as_float8 = {
 4143+    stbir__max_uint16_as_float, stbir__max_uint16_as_float,
 4144+    stbir__max_uint16_as_float, stbir__max_uint16_as_float,
 4145+    stbir__max_uint16_as_float, stbir__max_uint16_as_float,
 4146+    stbir__max_uint16_as_float, stbir__max_uint16_as_float};
 4147 #else
 4148-  #define stbir__simdfX stbir__simdf
 4149-  #define stbir__simdiX stbir__simdi
 4150-  #define stbir__simdfX_load stbir__simdf_load
 4151-  #define stbir__simdiX_load stbir__simdi_load
 4152-  #define stbir__simdfX_mult stbir__simdf_mult
 4153-  #define stbir__simdfX_add_mem stbir__simdf_add_mem
 4154-  #define stbir__simdfX_madd_mem stbir__simdf_madd_mem
 4155-  #define stbir__simdfX_store stbir__simdf_store
 4156-  #define stbir__simdiX_store stbir__simdi_store
 4157-  #define stbir__simdf_frepX  stbir__simdf_frep4
 4158-  #define stbir__simdfX_madd stbir__simdf_madd
 4159-  #define stbir__simdfX_min stbir__simdf_min
 4160-  #define stbir__simdfX_max stbir__simdf_max
 4161-  #define stbir__simdfX_aaa1 stbir__simdf_aaa1
 4162-  #define stbir__simdfX_1aaa stbir__simdf_1aaa
 4163-  #define stbir__simdfX_a1a1 stbir__simdf_a1a1
 4164-  #define stbir__simdfX_1a1a stbir__simdf_1a1a
 4165-  #define stbir__simdfX_convert_float_to_i32 stbir__simdf_convert_float_to_i32
 4166-  #define stbir__simdfX_pack_to_words stbir__simdf_pack_to_8words
 4167-  #define stbir__simdfX_zero stbir__simdf_zero
 4168-  #define STBIR_onesX STBIR__CONSTF(STBIR_ones)
 4169-  #define STBIR_simd_point5X STBIR__CONSTF(STBIR_simd_point5)
 4170-  #define STBIR_max_uint8_as_floatX STBIR__CONSTF(STBIR_max_uint8_as_float)
 4171-  #define STBIR_max_uint16_as_floatX STBIR__CONSTF(STBIR_max_uint16_as_float)
 4172-  #define stbir__simdfX_float_count 4
 4173-  #define stbir__if_simdf8_cast_to_simdf4( val ) ( val )
 4174-  #define stbir__simdfX_0123to1230 stbir__simdf_0123to1230
 4175-  #define stbir__simdfX_0123to2103 stbir__simdf_0123to2103
 4176+#define stbir__simdfX stbir__simdf
 4177+#define stbir__simdiX stbir__simdi
 4178+#define stbir__simdfX_load stbir__simdf_load
 4179+#define stbir__simdiX_load stbir__simdi_load
 4180+#define stbir__simdfX_mult stbir__simdf_mult
 4181+#define stbir__simdfX_add_mem stbir__simdf_add_mem
 4182+#define stbir__simdfX_madd_mem stbir__simdf_madd_mem
 4183+#define stbir__simdfX_store stbir__simdf_store
 4184+#define stbir__simdiX_store stbir__simdi_store
 4185+#define stbir__simdf_frepX stbir__simdf_frep4
 4186+#define stbir__simdfX_madd stbir__simdf_madd
 4187+#define stbir__simdfX_min stbir__simdf_min
 4188+#define stbir__simdfX_max stbir__simdf_max
 4189+#define stbir__simdfX_aaa1 stbir__simdf_aaa1
 4190+#define stbir__simdfX_1aaa stbir__simdf_1aaa
 4191+#define stbir__simdfX_a1a1 stbir__simdf_a1a1
 4192+#define stbir__simdfX_1a1a stbir__simdf_1a1a
 4193+#define stbir__simdfX_convert_float_to_i32 stbir__simdf_convert_float_to_i32
 4194+#define stbir__simdfX_pack_to_words stbir__simdf_pack_to_8words
 4195+#define stbir__simdfX_zero stbir__simdf_zero
 4196+#define STBIR_onesX STBIR__CONSTF(STBIR_ones)
 4197+#define STBIR_simd_point5X STBIR__CONSTF(STBIR_simd_point5)
 4198+#define STBIR_max_uint8_as_floatX STBIR__CONSTF(STBIR_max_uint8_as_float)
 4199+#define STBIR_max_uint16_as_floatX STBIR__CONSTF(STBIR_max_uint16_as_float)
 4200+#define stbir__simdfX_float_count 4
 4201+#define stbir__if_simdf8_cast_to_simdf4(val) (val)
 4202+#define stbir__simdfX_0123to1230 stbir__simdf_0123to1230
 4203+#define stbir__simdfX_0123to2103 stbir__simdf_0123to2103
 4204 #endif
 4205 
 4206-
 4207 #if defined(STBIR_NEON) && !defined(_M_ARM) && !defined(__arm__)
 4208 
 4209-  #if defined( _MSC_VER ) && !defined(__clang__)
 4210-  typedef __int16 stbir__FP16;
 4211-  #else
 4212-  typedef float16_t stbir__FP16;
 4213-  #endif
 4214+#if defined(_MSC_VER) && !defined(__clang__)
 4215+typedef __int16 stbir__FP16;
 4216+#else
 4217+typedef float16_t stbir__FP16;
 4218+#endif
 4219 
 4220 #else // no NEON, or 32-bit ARM for MSVC
 4221 
 4222-  typedef union stbir__FP16
 4223-  {
 4224-    unsigned short u;
 4225-  } stbir__FP16;
 4226-
 4227-#endif
 4228-
 4229-#if (!defined(STBIR_NEON) && !defined(STBIR_FP16C)) || (defined(STBIR_NEON) && defined(_M_ARM)) || (defined(STBIR_NEON) && defined(__arm__))
 4230-
 4231-  // Fabian's half float routines, see: https://gist.github.com/rygorous/2156668
 4232-
 4233-  static stbir__inline float stbir__half_to_float( stbir__FP16 h )
 4234-  {
 4235-    static const stbir__FP32 magic = { (254 - 15) << 23 };
 4236-    static const stbir__FP32 was_infnan = { (127 + 16) << 23 };
 4237-    stbir__FP32 o;
 4238-
 4239-    o.u = (h.u & 0x7fff) << 13;     // exponent/mantissa bits
 4240-    o.f *= magic.f;                 // exponent adjust
 4241-    if (o.f >= was_infnan.f)        // make sure Inf/NaN survive
 4242-      o.u |= 255 << 23;
 4243-    o.u |= (h.u & 0x8000) << 16;    // sign bit
 4244-    return o.f;
 4245-  }
 4246-
 4247-  static stbir__inline stbir__FP16 stbir__float_to_half(float val)
 4248-  {
 4249-    stbir__FP32 f32infty = { 255 << 23 };
 4250-    stbir__FP32 f16max   = { (127 + 16) << 23 };
 4251-    stbir__FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
 4252-    unsigned int sign_mask = 0x80000000u;
 4253-    stbir__FP16 o = { 0 };
 4254-    stbir__FP32 f;
 4255-    unsigned int sign;
 4256-
 4257-    f.f = val;
 4258-    sign = f.u & sign_mask;
 4259-    f.u ^= sign;
 4260-
 4261-    if (f.u >= f16max.u) // result is Inf or NaN (all exponent bits set)
 4262-      o.u = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
 4263-    else // (De)normalized number or zero
 4264-    {
 4265-      if (f.u < (113 << 23)) // resulting FP16 is subnormal or zero
 4266-      {
 4267-        // use a magic value to align our 10 mantissa bits at the bottom of
 4268-        // the float. as long as FP addition is round-to-nearest-even this
 4269-        // just works.
 4270-        f.f += denorm_magic.f;
 4271-        // and one integer subtract of the bias later, we have our final float!
 4272-        o.u = (unsigned short) ( f.u - denorm_magic.u );
 4273-      }
 4274-      else
 4275-      {
 4276-        unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd
 4277-        // update exponent, rounding bias part 1
 4278-        f.u = f.u + ((15u - 127) << 23) + 0xfff;
 4279-        // rounding bias part 2
 4280-        f.u += mant_odd;
 4281-        // take the bits!
 4282-        o.u = (unsigned short) ( f.u >> 13 );
 4283-      }
 4284-    }
 4285-
 4286-    o.u |= sign >> 16;
 4287-    return o;
 4288-  }
 4289+typedef union stbir__FP16 {
 4290+	unsigned short u;
 4291+} stbir__FP16;
 4292 
 4293 #endif
 4294 
 4295+#if (!defined(STBIR_NEON) && !defined(STBIR_FP16C)) ||                         \
 4296+    (defined(STBIR_NEON) && defined(_M_ARM)) ||                                \
 4297+    (defined(STBIR_NEON) && defined(__arm__))
 4298+
 4299+// Fabian's half float routines, see: https://gist.github.com/rygorous/2156668
 4300+
 4301+static stbir__inline float
 4302+stbir__half_to_float(stbir__FP16 h)
 4303+{
 4304+	static const stbir__FP32 magic = {(254 - 15) << 23};
 4305+	static const stbir__FP32 was_infnan = {(127 + 16) << 23};
 4306+	stbir__FP32 o;
 4307+
 4308+	o.u = (h.u & 0x7fff) << 13; // exponent/mantissa bits
 4309+	o.f *= magic.f;             // exponent adjust
 4310+	if (o.f >= was_infnan.f) {  // make sure Inf/NaN survive
 4311+		o.u |= 255 << 23;
 4312+	}
 4313+	o.u |= (h.u & 0x8000) << 16; // sign bit
 4314+	return o.f;
 4315+}
 4316+
 4317+static stbir__inline stbir__FP16
 4318+stbir__float_to_half(float val)
 4319+{
 4320+	stbir__FP32 f32infty = {255 << 23};
 4321+	stbir__FP32 f16max = {(127 + 16) << 23};
 4322+	stbir__FP32 denorm_magic = {((127 - 15) + (23 - 10) + 1) << 23};
 4323+	unsigned int sign_mask = 0x80000000u;
 4324+	stbir__FP16 o = {0};
 4325+	stbir__FP32 f;
 4326+	unsigned int sign;
 4327+
 4328+	f.f = val;
 4329+	sign = f.u & sign_mask;
 4330+	f.u ^= sign;
 4331+
 4332+	if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set)
 4333+		o.u = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
 4334+	} else // (De)normalized number or zero
 4335+	{
 4336+		if (f.u < (113 << 23)) // resulting FP16 is subnormal or zero
 4337+		{
 4338+			// use a magic value to align our 10 mantissa bits at the bottom of
 4339+			// the float. as long as FP addition is round-to-nearest-even this
 4340+			// just works.
 4341+			f.f += denorm_magic.f;
 4342+			// and one integer subtract of the bias later, we have our final
 4343+			// float!
 4344+			o.u = (unsigned short)(f.u - denorm_magic.u);
 4345+		} else {
 4346+			unsigned int mant_odd =
 4347+			    (f.u >> 13) & 1; // resulting mantissa is odd
 4348+			// update exponent, rounding bias part 1
 4349+			f.u = f.u + ((15u - 127) << 23) + 0xfff;
 4350+			// rounding bias part 2
 4351+			f.u += mant_odd;
 4352+			// take the bits!
 4353+			o.u = (unsigned short)(f.u >> 13);
 4354+		}
 4355+	}
 4356+
 4357+	o.u |= sign >> 16;
 4358+	return o;
 4359+}
 4360+
 4361+#endif
 4362 
 4363 #if defined(STBIR_FP16C)
 4364 
 4365-  #include <immintrin.h>
 4366+#include <immintrin.h>
 4367 
 4368-  static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input)
 4369-  {
 4370-    _mm256_storeu_ps( (float*)output, _mm256_cvtph_ps( _mm_loadu_si128( (__m128i const* )input ) ) );
 4371-  }
 4372+static stbir__inline void
 4373+stbir__half_to_float_SIMD(float *output, stbir__FP16 const *input)
 4374+{
 4375+	_mm256_storeu_ps((float *)output,
 4376+	                 _mm256_cvtph_ps(_mm_loadu_si128((__m128i const *)input)));
 4377+}
 4378 
 4379-  static stbir__inline void stbir__float_to_half_SIMD(stbir__FP16 * output, float const * input)
 4380-  {
 4381-    _mm_storeu_si128( (__m128i*)output, _mm256_cvtps_ph( _mm256_loadu_ps( input ), 0 ) );
 4382-  }
 4383+static stbir__inline void
 4384+stbir__float_to_half_SIMD(stbir__FP16 *output, float const *input)
 4385+{
 4386+	_mm_storeu_si128((__m128i *)output,
 4387+	                 _mm256_cvtps_ph(_mm256_loadu_ps(input), 0));
 4388+}
 4389 
 4390-  static stbir__inline float stbir__half_to_float( stbir__FP16 h )
 4391-  {
 4392-    return _mm_cvtss_f32( _mm_cvtph_ps( _mm_cvtsi32_si128( (int)h.u ) ) );
 4393-  }
 4394+static stbir__inline float
 4395+stbir__half_to_float(stbir__FP16 h)
 4396+{
 4397+	return _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128((int)h.u)));
 4398+}
 4399 
 4400-  static stbir__inline stbir__FP16 stbir__float_to_half( float f )
 4401-  {
 4402-    stbir__FP16 h;
 4403-    h.u = (unsigned short) _mm_cvtsi128_si32( _mm_cvtps_ph( _mm_set_ss( f ), 0 ) );
 4404-    return h;
 4405-  }
 4406+static stbir__inline stbir__FP16
 4407+stbir__float_to_half(float f)
 4408+{
 4409+	stbir__FP16 h;
 4410+	h.u = (unsigned short)_mm_cvtsi128_si32(_mm_cvtps_ph(_mm_set_ss(f), 0));
 4411+	return h;
 4412+}
 4413 
 4414 #elif defined(STBIR_SSE2)
 4415 
 4416-  // Fabian's half float routines, see: https://gist.github.com/rygorous/2156668
 4417-  stbir__inline static void stbir__half_to_float_SIMD(float * output, void const * input)
 4418-  {
 4419-    static const STBIR__SIMDI_CONST(mask_nosign,      0x7fff);
 4420-    static const STBIR__SIMDI_CONST(smallest_normal,  0x0400);
 4421-    static const STBIR__SIMDI_CONST(infinity,         0x7c00);
 4422-    static const STBIR__SIMDI_CONST(expadjust_normal, (127 - 15) << 23);
 4423-    static const STBIR__SIMDI_CONST(magic_denorm,     113 << 23);
 4424-
 4425-    __m128i i = _mm_loadu_si128 ( (__m128i const*)(input) );
 4426-    __m128i h = _mm_unpacklo_epi16 ( i, _mm_setzero_si128() );
 4427-    __m128i mnosign     = STBIR__CONSTI(mask_nosign);
 4428-    __m128i eadjust     = STBIR__CONSTI(expadjust_normal);
 4429-    __m128i smallest    = STBIR__CONSTI(smallest_normal);
 4430-    __m128i infty       = STBIR__CONSTI(infinity);
 4431-    __m128i expmant     = _mm_and_si128(mnosign, h);
 4432-    __m128i justsign    = _mm_xor_si128(h, expmant);
 4433-    __m128i b_notinfnan = _mm_cmpgt_epi32(infty, expmant);
 4434-    __m128i b_isdenorm  = _mm_cmpgt_epi32(smallest, expmant);
 4435-    __m128i shifted     = _mm_slli_epi32(expmant, 13);
 4436-    __m128i adj_infnan  = _mm_andnot_si128(b_notinfnan, eadjust);
 4437-    __m128i adjusted    = _mm_add_epi32(eadjust, shifted);
 4438-    __m128i den1        = _mm_add_epi32(shifted, STBIR__CONSTI(magic_denorm));
 4439-    __m128i adjusted2   = _mm_add_epi32(adjusted, adj_infnan);
 4440-    __m128  den2        = _mm_sub_ps(_mm_castsi128_ps(den1), *(const __m128 *)&magic_denorm);
 4441-    __m128  adjusted3   = _mm_and_ps(den2, _mm_castsi128_ps(b_isdenorm));
 4442-    __m128  adjusted4   = _mm_andnot_ps(_mm_castsi128_ps(b_isdenorm), _mm_castsi128_ps(adjusted2));
 4443-    __m128  adjusted5   = _mm_or_ps(adjusted3, adjusted4);
 4444-    __m128i sign        = _mm_slli_epi32(justsign, 16);
 4445-    __m128  final       = _mm_or_ps(adjusted5, _mm_castsi128_ps(sign));
 4446-    stbir__simdf_store( output + 0,  final );
 4447-
 4448-    h = _mm_unpackhi_epi16 ( i, _mm_setzero_si128() );
 4449-    expmant     = _mm_and_si128(mnosign, h);
 4450-    justsign    = _mm_xor_si128(h, expmant);
 4451-    b_notinfnan = _mm_cmpgt_epi32(infty, expmant);
 4452-    b_isdenorm  = _mm_cmpgt_epi32(smallest, expmant);
 4453-    shifted     = _mm_slli_epi32(expmant, 13);
 4454-    adj_infnan  = _mm_andnot_si128(b_notinfnan, eadjust);
 4455-    adjusted    = _mm_add_epi32(eadjust, shifted);
 4456-    den1        = _mm_add_epi32(shifted, STBIR__CONSTI(magic_denorm));
 4457-    adjusted2   = _mm_add_epi32(adjusted, adj_infnan);
 4458-    den2        = _mm_sub_ps(_mm_castsi128_ps(den1), *(const __m128 *)&magic_denorm);
 4459-    adjusted3   = _mm_and_ps(den2, _mm_castsi128_ps(b_isdenorm));
 4460-    adjusted4   = _mm_andnot_ps(_mm_castsi128_ps(b_isdenorm), _mm_castsi128_ps(adjusted2));
 4461-    adjusted5   = _mm_or_ps(adjusted3, adjusted4);
 4462-    sign        = _mm_slli_epi32(justsign, 16);
 4463-    final       = _mm_or_ps(adjusted5, _mm_castsi128_ps(sign));
 4464-    stbir__simdf_store( output + 4,  final );
 4465-
 4466-    // ~38 SSE2 ops for 8 values
 4467-  }
 4468-
 4469-  // Fabian's round-to-nearest-even float to half
 4470-  // ~48 SSE2 ops for 8 output
 4471-  stbir__inline static void stbir__float_to_half_SIMD(void * output, float const * input)
 4472-  {
 4473-    static const STBIR__SIMDI_CONST(mask_sign,      0x80000000u);
 4474-    static const STBIR__SIMDI_CONST(c_f16max,       (127 + 16) << 23); // all FP32 values >=this round to +inf
 4475-    static const STBIR__SIMDI_CONST(c_nanbit,        0x200);
 4476-    static const STBIR__SIMDI_CONST(c_infty_as_fp16, 0x7c00);
 4477-    static const STBIR__SIMDI_CONST(c_min_normal,    (127 - 14) << 23); // smallest FP32 that yields a normalized FP16
 4478-    static const STBIR__SIMDI_CONST(c_subnorm_magic, ((127 - 15) + (23 - 10) + 1) << 23);
 4479-    static const STBIR__SIMDI_CONST(c_normal_bias,    0xfff - ((127 - 15) << 23)); // adjust exponent and add mantissa rounding
 4480-
 4481-    __m128  f           =  _mm_loadu_ps(input);
 4482-    __m128  msign       = _mm_castsi128_ps(STBIR__CONSTI(mask_sign));
 4483-    __m128  justsign    = _mm_and_ps(msign, f);
 4484-    __m128  absf        = _mm_xor_ps(f, justsign);
 4485-    __m128i absf_int    = _mm_castps_si128(absf); // the cast is "free" (extra bypass latency, but no thruput hit)
 4486-    __m128i f16max      = STBIR__CONSTI(c_f16max);
 4487-    __m128  b_isnan     = _mm_cmpunord_ps(absf, absf); // is this a NaN?
 4488-    __m128i b_isregular = _mm_cmpgt_epi32(f16max, absf_int); // (sub)normalized or special?
 4489-    __m128i nanbit      = _mm_and_si128(_mm_castps_si128(b_isnan), STBIR__CONSTI(c_nanbit));
 4490-    __m128i inf_or_nan  = _mm_or_si128(nanbit, STBIR__CONSTI(c_infty_as_fp16)); // output for specials
 4491-
 4492-    __m128i min_normal  = STBIR__CONSTI(c_min_normal);
 4493-    __m128i b_issub     = _mm_cmpgt_epi32(min_normal, absf_int);
 4494-
 4495-    // "result is subnormal" path
 4496-    __m128  subnorm1    = _mm_add_ps(absf, _mm_castsi128_ps(STBIR__CONSTI(c_subnorm_magic))); // magic value to round output mantissa
 4497-    __m128i subnorm2    = _mm_sub_epi32(_mm_castps_si128(subnorm1), STBIR__CONSTI(c_subnorm_magic)); // subtract out bias
 4498-
 4499-    // "result is normal" path
 4500-    __m128i mantoddbit  = _mm_slli_epi32(absf_int, 31 - 13); // shift bit 13 (mantissa LSB) to sign
 4501-    __m128i mantodd     = _mm_srai_epi32(mantoddbit, 31); // -1 if FP16 mantissa odd, else 0
 4502-
 4503-    __m128i round1      = _mm_add_epi32(absf_int, STBIR__CONSTI(c_normal_bias));
 4504-    __m128i round2      = _mm_sub_epi32(round1, mantodd); // if mantissa LSB odd, bias towards rounding up (RTNE)
 4505-    __m128i normal      = _mm_srli_epi32(round2, 13); // rounded result
 4506-
 4507-    // combine the two non-specials
 4508-    __m128i nonspecial  = _mm_or_si128(_mm_and_si128(subnorm2, b_issub), _mm_andnot_si128(b_issub, normal));
 4509-
 4510-    // merge in specials as well
 4511-    __m128i joined      = _mm_or_si128(_mm_and_si128(nonspecial, b_isregular), _mm_andnot_si128(b_isregular, inf_or_nan));
 4512-
 4513-    __m128i sign_shift  = _mm_srai_epi32(_mm_castps_si128(justsign), 16);
 4514-    __m128i final2, final= _mm_or_si128(joined, sign_shift);
 4515-
 4516-    f           =  _mm_loadu_ps(input+4);
 4517-    justsign    = _mm_and_ps(msign, f);
 4518-    absf        = _mm_xor_ps(f, justsign);
 4519-    absf_int    = _mm_castps_si128(absf); // the cast is "free" (extra bypass latency, but no thruput hit)
 4520-    b_isnan     = _mm_cmpunord_ps(absf, absf); // is this a NaN?
 4521-    b_isregular = _mm_cmpgt_epi32(f16max, absf_int); // (sub)normalized or special?
 4522-    nanbit      = _mm_and_si128(_mm_castps_si128(b_isnan), c_nanbit);
 4523-    inf_or_nan  = _mm_or_si128(nanbit, STBIR__CONSTI(c_infty_as_fp16)); // output for specials
 4524-
 4525-    b_issub     = _mm_cmpgt_epi32(min_normal, absf_int);
 4526-
 4527-    // "result is subnormal" path
 4528-    subnorm1    = _mm_add_ps(absf, _mm_castsi128_ps(STBIR__CONSTI(c_subnorm_magic))); // magic value to round output mantissa
 4529-    subnorm2    = _mm_sub_epi32(_mm_castps_si128(subnorm1), STBIR__CONSTI(c_subnorm_magic)); // subtract out bias
 4530-
 4531-    // "result is normal" path
 4532-    mantoddbit  = _mm_slli_epi32(absf_int, 31 - 13); // shift bit 13 (mantissa LSB) to sign
 4533-    mantodd     = _mm_srai_epi32(mantoddbit, 31); // -1 if FP16 mantissa odd, else 0
 4534-
 4535-    round1      = _mm_add_epi32(absf_int, STBIR__CONSTI(c_normal_bias));
 4536-    round2      = _mm_sub_epi32(round1, mantodd); // if mantissa LSB odd, bias towards rounding up (RTNE)
 4537-    normal      = _mm_srli_epi32(round2, 13); // rounded result
 4538-
 4539-    // combine the two non-specials
 4540-    nonspecial  = _mm_or_si128(_mm_and_si128(subnorm2, b_issub), _mm_andnot_si128(b_issub, normal));
 4541-
 4542-    // merge in specials as well
 4543-    joined      = _mm_or_si128(_mm_and_si128(nonspecial, b_isregular), _mm_andnot_si128(b_isregular, inf_or_nan));
 4544-
 4545-    sign_shift  = _mm_srai_epi32(_mm_castps_si128(justsign), 16);
 4546-    final2      = _mm_or_si128(joined, sign_shift);
 4547-    final       = _mm_packs_epi32(final, final2);
 4548-    stbir__simdi_store( output,final );
 4549-  }
 4550-
 4551-#elif defined(STBIR_NEON) && defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) // 64-bit ARM on MSVC (not clang)
 4552-
 4553-  static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input)
 4554-  {
 4555-    float16x4_t in0 = vld1_f16(input + 0);
 4556-    float16x4_t in1 = vld1_f16(input + 4);
 4557-    vst1q_f32(output + 0, vcvt_f32_f16(in0));
 4558-    vst1q_f32(output + 4, vcvt_f32_f16(in1));
 4559-  }
 4560-
 4561-  static stbir__inline void stbir__float_to_half_SIMD(stbir__FP16 * output, float const * input)
 4562-  {
 4563-    float16x4_t out0 = vcvt_f16_f32(vld1q_f32(input + 0));
 4564-    float16x4_t out1 = vcvt_f16_f32(vld1q_f32(input + 4));
 4565-    vst1_f16(output+0, out0);
 4566-    vst1_f16(output+4, out1);
 4567-  }
 4568-
 4569-  static stbir__inline float stbir__half_to_float( stbir__FP16 h )
 4570-  {
 4571-    return vgetq_lane_f32(vcvt_f32_f16(vld1_dup_f16(&h)), 0);
 4572-  }
 4573-
 4574-  static stbir__inline stbir__FP16 stbir__float_to_half( float f )
 4575-  {
 4576-    return vget_lane_f16(vcvt_f16_f32(vdupq_n_f32(f)), 0).n16_u16[0];
 4577-  }
 4578-
 4579-#elif defined(STBIR_NEON) && ( defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) ) // 64-bit ARM
 4580-
 4581-  static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input)
 4582-  {
 4583-    float16x8_t in = vld1q_f16(input);
 4584-    vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(in)));
 4585-    vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(in)));
 4586-  }
 4587-
 4588-  static stbir__inline void stbir__float_to_half_SIMD(stbir__FP16 * output, float const * input)
 4589-  {
 4590-    float16x4_t out0 = vcvt_f16_f32(vld1q_f32(input + 0));
 4591-    float16x4_t out1 = vcvt_f16_f32(vld1q_f32(input + 4));
 4592-    vst1q_f16(output, vcombine_f16(out0, out1));
 4593-  }
 4594-
 4595-  static stbir__inline float stbir__half_to_float( stbir__FP16 h )
 4596-  {
 4597-    return vgetq_lane_f32(vcvt_f32_f16(vdup_n_f16(h)), 0);
 4598-  }
 4599-
 4600-  static stbir__inline stbir__FP16 stbir__float_to_half( float f )
 4601-  {
 4602-    return vget_lane_f16(vcvt_f16_f32(vdupq_n_f32(f)), 0);
 4603-  }
 4604-
 4605-#elif defined(STBIR_WASM) || (defined(STBIR_NEON) && (defined(_MSC_VER) || defined(_M_ARM) || defined(__arm__))) // WASM or 32-bit ARM on MSVC/clang
 4606-
 4607-  static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input)
 4608-  {
 4609-    for (int i=0; i<8; i++)
 4610-    {
 4611-      output[i] = stbir__half_to_float(input[i]);
 4612-    }
 4613-  }
 4614-  static stbir__inline void stbir__float_to_half_SIMD(stbir__FP16 * output, float const * input)
 4615-  {
 4616-    for (int i=0; i<8; i++)
 4617-    {
 4618-      output[i] = stbir__float_to_half(input[i]);
 4619-    }
 4620-  }
 4621+// Fabian's half float routines, see: https://gist.github.com/rygorous/2156668
 4622+stbir__inline static void
 4623+stbir__half_to_float_SIMD(float *output, void const *input)
 4624+{
 4625+	static const STBIR__SIMDI_CONST(mask_nosign, 0x7fff);
 4626+	static const STBIR__SIMDI_CONST(smallest_normal, 0x0400);
 4627+	static const STBIR__SIMDI_CONST(infinity, 0x7c00);
 4628+	static const STBIR__SIMDI_CONST(expadjust_normal, (127 - 15) << 23);
 4629+	static const STBIR__SIMDI_CONST(magic_denorm, 113 << 23);
 4630+
 4631+	__m128i i = _mm_loadu_si128((__m128i const *)(input));
 4632+	__m128i h = _mm_unpacklo_epi16(i, _mm_setzero_si128());
 4633+	__m128i mnosign = STBIR__CONSTI(mask_nosign);
 4634+	__m128i eadjust = STBIR__CONSTI(expadjust_normal);
 4635+	__m128i smallest = STBIR__CONSTI(smallest_normal);
 4636+	__m128i infty = STBIR__CONSTI(infinity);
 4637+	__m128i expmant = _mm_and_si128(mnosign, h);
 4638+	__m128i justsign = _mm_xor_si128(h, expmant);
 4639+	__m128i b_notinfnan = _mm_cmpgt_epi32(infty, expmant);
 4640+	__m128i b_isdenorm = _mm_cmpgt_epi32(smallest, expmant);
 4641+	__m128i shifted = _mm_slli_epi32(expmant, 13);
 4642+	__m128i adj_infnan = _mm_andnot_si128(b_notinfnan, eadjust);
 4643+	__m128i adjusted = _mm_add_epi32(eadjust, shifted);
 4644+	__m128i den1 = _mm_add_epi32(shifted, STBIR__CONSTI(magic_denorm));
 4645+	__m128i adjusted2 = _mm_add_epi32(adjusted, adj_infnan);
 4646+	__m128 den2 =
 4647+	    _mm_sub_ps(_mm_castsi128_ps(den1), *(const __m128 *)&magic_denorm);
 4648+	__m128 adjusted3 = _mm_and_ps(den2, _mm_castsi128_ps(b_isdenorm));
 4649+	__m128 adjusted4 = _mm_andnot_ps(_mm_castsi128_ps(b_isdenorm),
 4650+	                                 _mm_castsi128_ps(adjusted2));
 4651+	__m128 adjusted5 = _mm_or_ps(adjusted3, adjusted4);
 4652+	__m128i sign = _mm_slli_epi32(justsign, 16);
 4653+	__m128 final = _mm_or_ps(adjusted5, _mm_castsi128_ps(sign));
 4654+	stbir__simdf_store(output + 0, final);
 4655+
 4656+	h = _mm_unpackhi_epi16(i, _mm_setzero_si128());
 4657+	expmant = _mm_and_si128(mnosign, h);
 4658+	justsign = _mm_xor_si128(h, expmant);
 4659+	b_notinfnan = _mm_cmpgt_epi32(infty, expmant);
 4660+	b_isdenorm = _mm_cmpgt_epi32(smallest, expmant);
 4661+	shifted = _mm_slli_epi32(expmant, 13);
 4662+	adj_infnan = _mm_andnot_si128(b_notinfnan, eadjust);
 4663+	adjusted = _mm_add_epi32(eadjust, shifted);
 4664+	den1 = _mm_add_epi32(shifted, STBIR__CONSTI(magic_denorm));
 4665+	adjusted2 = _mm_add_epi32(adjusted, adj_infnan);
 4666+	den2 = _mm_sub_ps(_mm_castsi128_ps(den1), *(const __m128 *)&magic_denorm);
 4667+	adjusted3 = _mm_and_ps(den2, _mm_castsi128_ps(b_isdenorm));
 4668+	adjusted4 = _mm_andnot_ps(_mm_castsi128_ps(b_isdenorm),
 4669+	                          _mm_castsi128_ps(adjusted2));
 4670+	adjusted5 = _mm_or_ps(adjusted3, adjusted4);
 4671+	sign = _mm_slli_epi32(justsign, 16);
 4672+	final = _mm_or_ps(adjusted5, _mm_castsi128_ps(sign));
 4673+	stbir__simdf_store(output + 4, final);
 4674+
 4675+	// ~38 SSE2 ops for 8 values
 4676+}
 4677+
 4678+// Fabian's round-to-nearest-even float to half
 4679+// ~48 SSE2 ops for 8 output
 4680+stbir__inline static void
 4681+stbir__float_to_half_SIMD(void *output, float const *input)
 4682+{
 4683+	static const STBIR__SIMDI_CONST(mask_sign, 0x80000000u);
 4684+	static const STBIR__SIMDI_CONST(
 4685+	    c_f16max, (127 + 16) << 23); // all FP32 values >=this round to +inf
 4686+	static const STBIR__SIMDI_CONST(c_nanbit, 0x200);
 4687+	static const STBIR__SIMDI_CONST(c_infty_as_fp16, 0x7c00);
 4688+	static const STBIR__SIMDI_CONST(
 4689+	    c_min_normal, (127 - 14)
 4690+	                      << 23); // smallest FP32 that yields a normalized FP16
 4691+	static const STBIR__SIMDI_CONST(c_subnorm_magic,
 4692+	                                ((127 - 15) + (23 - 10) + 1) << 23);
 4693+	static const STBIR__SIMDI_CONST(
 4694+	    c_normal_bias,
 4695+	    0xfff -
 4696+	        ((127 - 15) << 23)); // adjust exponent and add mantissa rounding
 4697+
 4698+	__m128 f = _mm_loadu_ps(input);
 4699+	__m128 msign = _mm_castsi128_ps(STBIR__CONSTI(mask_sign));
 4700+	__m128 justsign = _mm_and_ps(msign, f);
 4701+	__m128 absf = _mm_xor_ps(f, justsign);
 4702+	__m128i absf_int = _mm_castps_si128(
 4703+	    absf); // the cast is "free" (extra bypass latency, but no thruput hit)
 4704+	__m128i f16max = STBIR__CONSTI(c_f16max);
 4705+	__m128 b_isnan = _mm_cmpunord_ps(absf, absf); // is this a NaN?
 4706+	__m128i b_isregular =
 4707+	    _mm_cmpgt_epi32(f16max, absf_int); // (sub)normalized or special?
 4708+	__m128i nanbit =
 4709+	    _mm_and_si128(_mm_castps_si128(b_isnan), STBIR__CONSTI(c_nanbit));
 4710+	__m128i inf_or_nan = _mm_or_si128(
 4711+	    nanbit, STBIR__CONSTI(c_infty_as_fp16)); // output for specials
 4712+
 4713+	__m128i min_normal = STBIR__CONSTI(c_min_normal);
 4714+	__m128i b_issub = _mm_cmpgt_epi32(min_normal, absf_int);
 4715+
 4716+	// "result is subnormal" path
 4717+	__m128 subnorm1 = _mm_add_ps(
 4718+	    absf, _mm_castsi128_ps(STBIR__CONSTI(
 4719+	              c_subnorm_magic))); // magic value to round output mantissa
 4720+	__m128i subnorm2 =
 4721+	    _mm_sub_epi32(_mm_castps_si128(subnorm1),
 4722+	                  STBIR__CONSTI(c_subnorm_magic)); // subtract out bias
 4723+
 4724+	// "result is normal" path
 4725+	__m128i mantoddbit = _mm_slli_epi32(
 4726+	    absf_int, 31 - 13); // shift bit 13 (mantissa LSB) to sign
 4727+	__m128i mantodd =
 4728+	    _mm_srai_epi32(mantoddbit, 31); // -1 if FP16 mantissa odd, else 0
 4729+
 4730+	__m128i round1 = _mm_add_epi32(absf_int, STBIR__CONSTI(c_normal_bias));
 4731+	__m128i round2 = _mm_sub_epi32(
 4732+	    round1,
 4733+	    mantodd); // if mantissa LSB odd, bias towards rounding up (RTNE)
 4734+	__m128i normal = _mm_srli_epi32(round2, 13); // rounded result
 4735+
 4736+	// combine the two non-specials
 4737+	__m128i nonspecial = _mm_or_si128(_mm_and_si128(subnorm2, b_issub),
 4738+	                                  _mm_andnot_si128(b_issub, normal));
 4739+
 4740+	// merge in specials as well
 4741+	__m128i joined = _mm_or_si128(_mm_and_si128(nonspecial, b_isregular),
 4742+	                              _mm_andnot_si128(b_isregular, inf_or_nan));
 4743+
 4744+	__m128i sign_shift = _mm_srai_epi32(_mm_castps_si128(justsign), 16);
 4745+	__m128i final2, final = _mm_or_si128(joined, sign_shift);
 4746+
 4747+	f = _mm_loadu_ps(input + 4);
 4748+	justsign = _mm_and_ps(msign, f);
 4749+	absf = _mm_xor_ps(f, justsign);
 4750+	absf_int = _mm_castps_si128(
 4751+	    absf); // the cast is "free" (extra bypass latency, but no thruput hit)
 4752+	b_isnan = _mm_cmpunord_ps(absf, absf); // is this a NaN?
 4753+	b_isregular =
 4754+	    _mm_cmpgt_epi32(f16max, absf_int); // (sub)normalized or special?
 4755+	nanbit = _mm_and_si128(_mm_castps_si128(b_isnan), c_nanbit);
 4756+	inf_or_nan = _mm_or_si128(
 4757+	    nanbit, STBIR__CONSTI(c_infty_as_fp16)); // output for specials
 4758+
 4759+	b_issub = _mm_cmpgt_epi32(min_normal, absf_int);
 4760+
 4761+	// "result is subnormal" path
 4762+	subnorm1 = _mm_add_ps(
 4763+	    absf, _mm_castsi128_ps(STBIR__CONSTI(
 4764+	              c_subnorm_magic))); // magic value to round output mantissa
 4765+	subnorm2 =
 4766+	    _mm_sub_epi32(_mm_castps_si128(subnorm1),
 4767+	                  STBIR__CONSTI(c_subnorm_magic)); // subtract out bias
 4768 
 4769-#endif
 4770+	// "result is normal" path
 4771+	mantoddbit = _mm_slli_epi32(absf_int,
 4772+	                            31 - 13); // shift bit 13 (mantissa LSB) to sign
 4773+	mantodd = _mm_srai_epi32(mantoddbit, 31); // -1 if FP16 mantissa odd, else 0
 4774 
 4775+	round1 = _mm_add_epi32(absf_int, STBIR__CONSTI(c_normal_bias));
 4776+	round2 = _mm_sub_epi32(
 4777+	    round1,
 4778+	    mantodd); // if mantissa LSB odd, bias towards rounding up (RTNE)
 4779+	normal = _mm_srli_epi32(round2, 13); // rounded result
 4780 
 4781-#ifdef STBIR_SIMD
 4782+	// combine the two non-specials
 4783+	nonspecial = _mm_or_si128(_mm_and_si128(subnorm2, b_issub),
 4784+	                          _mm_andnot_si128(b_issub, normal));
 4785 
 4786-#define stbir__simdf_0123to3333( out, reg ) (out) = stbir__simdf_swiz( reg, 3,3,3,3 )
 4787-#define stbir__simdf_0123to2222( out, reg ) (out) = stbir__simdf_swiz( reg, 2,2,2,2 )
 4788-#define stbir__simdf_0123to1111( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,1,1 )
 4789-#define stbir__simdf_0123to0000( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,0,0 )
 4790-#define stbir__simdf_0123to0003( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,0,3 )
 4791-#define stbir__simdf_0123to0001( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,0,1 )
 4792-#define stbir__simdf_0123to1122( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,2,2 )
 4793-#define stbir__simdf_0123to2333( out, reg ) (out) = stbir__simdf_swiz( reg, 2,3,3,3 )
 4794-#define stbir__simdf_0123to0023( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,2,3 )
 4795-#define stbir__simdf_0123to1230( out, reg ) (out) = stbir__simdf_swiz( reg, 1,2,3,0 )
 4796-#define stbir__simdf_0123to2103( out, reg ) (out) = stbir__simdf_swiz( reg, 2,1,0,3 )
 4797-#define stbir__simdf_0123to3210( out, reg ) (out) = stbir__simdf_swiz( reg, 3,2,1,0 )
 4798-#define stbir__simdf_0123to2301( out, reg ) (out) = stbir__simdf_swiz( reg, 2,3,0,1 )
 4799-#define stbir__simdf_0123to3012( out, reg ) (out) = stbir__simdf_swiz( reg, 3,0,1,2 )
 4800-#define stbir__simdf_0123to0011( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,1,1 )
 4801-#define stbir__simdf_0123to1100( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,0,0 )
 4802-#define stbir__simdf_0123to2233( out, reg ) (out) = stbir__simdf_swiz( reg, 2,2,3,3 )
 4803-#define stbir__simdf_0123to1133( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,3,3 )
 4804-#define stbir__simdf_0123to0022( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,2,2 )
 4805-#define stbir__simdf_0123to1032( out, reg ) (out) = stbir__simdf_swiz( reg, 1,0,3,2 )
 4806-
 4807-typedef union stbir__simdi_u32
 4808-{
 4809-  stbir_uint32 m128i_u32[4];
 4810-  int m128i_i32[4];
 4811-  stbir__simdi m128i_i128;
 4812-} stbir__simdi_u32;
 4813+	// merge in specials as well
 4814+	joined = _mm_or_si128(_mm_and_si128(nonspecial, b_isregular),
 4815+	                      _mm_andnot_si128(b_isregular, inf_or_nan));
 4816+
 4817+	sign_shift = _mm_srai_epi32(_mm_castps_si128(justsign), 16);
 4818+	final2 = _mm_or_si128(joined, sign_shift);
 4819+	final = _mm_packs_epi32(final, final2);
 4820+	stbir__simdi_store(output, final);
 4821+}
 4822+
 4823+#elif defined(STBIR_NEON) && defined(_MSC_VER) && defined(_M_ARM64) &&         \
 4824+    !defined(__clang__) // 64-bit ARM on MSVC (not clang)
 4825+
 4826+static stbir__inline void
 4827+stbir__half_to_float_SIMD(float *output, stbir__FP16 const *input)
 4828+{
 4829+	float16x4_t in0 = vld1_f16(input + 0);
 4830+	float16x4_t in1 = vld1_f16(input + 4);
 4831+	vst1q_f32(output + 0, vcvt_f32_f16(in0));
 4832+	vst1q_f32(output + 4, vcvt_f32_f16(in1));
 4833+}
 4834+
 4835+static stbir__inline void
 4836+stbir__float_to_half_SIMD(stbir__FP16 *output, float const *input)
 4837+{
 4838+	float16x4_t out0 = vcvt_f16_f32(vld1q_f32(input + 0));
 4839+	float16x4_t out1 = vcvt_f16_f32(vld1q_f32(input + 4));
 4840+	vst1_f16(output + 0, out0);
 4841+	vst1_f16(output + 4, out1);
 4842+}
 4843 
 4844-static const int STBIR_mask[9] = { 0,0,0,-1,-1,-1,0,0,0 };
 4845+static stbir__inline float
 4846+stbir__half_to_float(stbir__FP16 h)
 4847+{
 4848+	return vgetq_lane_f32(vcvt_f32_f16(vld1_dup_f16(&h)), 0);
 4849+}
 4850+
 4851+static stbir__inline stbir__FP16
 4852+stbir__float_to_half(float f)
 4853+{
 4854+	return vget_lane_f16(vcvt_f16_f32(vdupq_n_f32(f)), 0).n16_u16[0];
 4855+}
 4856+
 4857+#elif defined(STBIR_NEON) && (defined(_M_ARM64) || defined(__aarch64__) ||     \
 4858+                              defined(__arm64__)) // 64-bit ARM
 4859+
 4860+static stbir__inline void
 4861+stbir__half_to_float_SIMD(float *output, stbir__FP16 const *input)
 4862+{
 4863+	float16x8_t in = vld1q_f16(input);
 4864+	vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(in)));
 4865+	vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(in)));
 4866+}
 4867+
 4868+static stbir__inline void
 4869+stbir__float_to_half_SIMD(stbir__FP16 *output, float const *input)
 4870+{
 4871+	float16x4_t out0 = vcvt_f16_f32(vld1q_f32(input + 0));
 4872+	float16x4_t out1 = vcvt_f16_f32(vld1q_f32(input + 4));
 4873+	vst1q_f16(output, vcombine_f16(out0, out1));
 4874+}
 4875+
 4876+static stbir__inline float
 4877+stbir__half_to_float(stbir__FP16 h)
 4878+{
 4879+	return vgetq_lane_f32(vcvt_f32_f16(vdup_n_f16(h)), 0);
 4880+}
 4881+
 4882+static stbir__inline stbir__FP16
 4883+stbir__float_to_half(float f)
 4884+{
 4885+	return vget_lane_f16(vcvt_f16_f32(vdupq_n_f32(f)), 0);
 4886+}
 4887+
 4888+#elif defined(STBIR_WASM) ||                                                   \
 4889+    (defined(STBIR_NEON) &&                                                    \
 4890+     (defined(_MSC_VER) || defined(_M_ARM) ||                                  \
 4891+      defined(__arm__))) // WASM or 32-bit ARM on MSVC/clang
 4892+
 4893+static stbir__inline void
 4894+stbir__half_to_float_SIMD(float *output, stbir__FP16 const *input)
 4895+{
 4896+	for (int i = 0; i < 8; i++) {
 4897+		output[i] = stbir__half_to_float(input[i]);
 4898+	}
 4899+}
 4900+static stbir__inline void
 4901+stbir__float_to_half_SIMD(stbir__FP16 *output, float const *input)
 4902+{
 4903+	for (int i = 0; i < 8; i++) {
 4904+		output[i] = stbir__float_to_half(input[i]);
 4905+	}
 4906+}
 4907 
 4908-static const STBIR__SIMDF_CONST(STBIR_max_uint8_as_float,           stbir__max_uint8_as_float);
 4909-static const STBIR__SIMDF_CONST(STBIR_max_uint16_as_float,          stbir__max_uint16_as_float);
 4910-static const STBIR__SIMDF_CONST(STBIR_max_uint8_as_float_inverted,  stbir__max_uint8_as_float_inverted);
 4911-static const STBIR__SIMDF_CONST(STBIR_max_uint16_as_float_inverted, stbir__max_uint16_as_float_inverted);
 4912+#endif
 4913+
 4914+#ifdef STBIR_SIMD
 4915+
 4916+#define stbir__simdf_0123to3333(out, reg)                                      \
 4917+	(out) = stbir__simdf_swiz(reg, 3, 3, 3, 3)
 4918+#define stbir__simdf_0123to2222(out, reg)                                      \
 4919+	(out) = stbir__simdf_swiz(reg, 2, 2, 2, 2)
 4920+#define stbir__simdf_0123to1111(out, reg)                                      \
 4921+	(out) = stbir__simdf_swiz(reg, 1, 1, 1, 1)
 4922+#define stbir__simdf_0123to0000(out, reg)                                      \
 4923+	(out) = stbir__simdf_swiz(reg, 0, 0, 0, 0)
 4924+#define stbir__simdf_0123to0003(out, reg)                                      \
 4925+	(out) = stbir__simdf_swiz(reg, 0, 0, 0, 3)
 4926+#define stbir__simdf_0123to0001(out, reg)                                      \
 4927+	(out) = stbir__simdf_swiz(reg, 0, 0, 0, 1)
 4928+#define stbir__simdf_0123to1122(out, reg)                                      \
 4929+	(out) = stbir__simdf_swiz(reg, 1, 1, 2, 2)
 4930+#define stbir__simdf_0123to2333(out, reg)                                      \
 4931+	(out) = stbir__simdf_swiz(reg, 2, 3, 3, 3)
 4932+#define stbir__simdf_0123to0023(out, reg)                                      \
 4933+	(out) = stbir__simdf_swiz(reg, 0, 0, 2, 3)
 4934+#define stbir__simdf_0123to1230(out, reg)                                      \
 4935+	(out) = stbir__simdf_swiz(reg, 1, 2, 3, 0)
 4936+#define stbir__simdf_0123to2103(out, reg)                                      \
 4937+	(out) = stbir__simdf_swiz(reg, 2, 1, 0, 3)
 4938+#define stbir__simdf_0123to3210(out, reg)                                      \
 4939+	(out) = stbir__simdf_swiz(reg, 3, 2, 1, 0)
 4940+#define stbir__simdf_0123to2301(out, reg)                                      \
 4941+	(out) = stbir__simdf_swiz(reg, 2, 3, 0, 1)
 4942+#define stbir__simdf_0123to3012(out, reg)                                      \
 4943+	(out) = stbir__simdf_swiz(reg, 3, 0, 1, 2)
 4944+#define stbir__simdf_0123to0011(out, reg)                                      \
 4945+	(out) = stbir__simdf_swiz(reg, 0, 0, 1, 1)
 4946+#define stbir__simdf_0123to1100(out, reg)                                      \
 4947+	(out) = stbir__simdf_swiz(reg, 1, 1, 0, 0)
 4948+#define stbir__simdf_0123to2233(out, reg)                                      \
 4949+	(out) = stbir__simdf_swiz(reg, 2, 2, 3, 3)
 4950+#define stbir__simdf_0123to1133(out, reg)                                      \
 4951+	(out) = stbir__simdf_swiz(reg, 1, 1, 3, 3)
 4952+#define stbir__simdf_0123to0022(out, reg)                                      \
 4953+	(out) = stbir__simdf_swiz(reg, 0, 0, 2, 2)
 4954+#define stbir__simdf_0123to1032(out, reg)                                      \
 4955+	(out) = stbir__simdf_swiz(reg, 1, 0, 3, 2)
 4956+
 4957+typedef union stbir__simdi_u32 {
 4958+	stbir_uint32 m128i_u32[4];
 4959+	int m128i_i32[4];
 4960+	stbir__simdi m128i_i128;
 4961+} stbir__simdi_u32;
 4962 
 4963-static const STBIR__SIMDF_CONST(STBIR_simd_point5,   0.5f);
 4964-static const STBIR__SIMDF_CONST(STBIR_ones,          1.0f);
 4965-static const STBIR__SIMDI_CONST(STBIR_almost_zero,   (127 - 13) << 23);
 4966-static const STBIR__SIMDI_CONST(STBIR_almost_one,    0x3f7fffff);
 4967+static const int STBIR_mask[9] = {0, 0, 0, -1, -1, -1, 0, 0, 0};
 4968+
 4969+static const STBIR__SIMDF_CONST(STBIR_max_uint8_as_float,
 4970+                                stbir__max_uint8_as_float);
 4971+static const STBIR__SIMDF_CONST(STBIR_max_uint16_as_float,
 4972+                                stbir__max_uint16_as_float);
 4973+static const STBIR__SIMDF_CONST(STBIR_max_uint8_as_float_inverted,
 4974+                                stbir__max_uint8_as_float_inverted);
 4975+static const STBIR__SIMDF_CONST(STBIR_max_uint16_as_float_inverted,
 4976+                                stbir__max_uint16_as_float_inverted);
 4977+
 4978+static const STBIR__SIMDF_CONST(STBIR_simd_point5, 0.5f);
 4979+static const STBIR__SIMDF_CONST(STBIR_ones, 1.0f);
 4980+static const STBIR__SIMDI_CONST(STBIR_almost_zero, (127 - 13) << 23);
 4981+static const STBIR__SIMDI_CONST(STBIR_almost_one, 0x3f7fffff);
 4982 static const STBIR__SIMDI_CONST(STBIR_mastissa_mask, 0xff);
 4983-static const STBIR__SIMDI_CONST(STBIR_topscale,      0x02000000);
 4984+static const STBIR__SIMDI_CONST(STBIR_topscale, 0x02000000);
 4985 
 4986 //   Basically, in simd mode, we unroll the proper amount, and we don't want
 4987 //   the non-simd remnant loops to be unroll because they only run a few times
 4988 //   Adding this switch saves about 5K on clang which is Captain Unroll the 3rd.
 4989-#define STBIR_SIMD_STREAMOUT_PTR( star )  STBIR_STREAMOUT_PTR( star )
 4990+#define STBIR_SIMD_STREAMOUT_PTR(star) STBIR_STREAMOUT_PTR(star)
 4991 #define STBIR_SIMD_NO_UNROLL(ptr) STBIR_NO_UNROLL(ptr)
 4992 #define STBIR_SIMD_NO_UNROLL_LOOP_START STBIR_NO_UNROLL_LOOP_START
 4993-#define STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR STBIR_NO_UNROLL_LOOP_START_INF_FOR
 4994+#define STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR                                \
 4995+	STBIR_NO_UNROLL_LOOP_START_INF_FOR
 4996 
 4997 #ifdef STBIR_MEMCPY
 4998 #undef STBIR_MEMCPY
 4999 #endif
 5000 #define STBIR_MEMCPY stbir_simd_memcpy
 5001 
 5002-// override normal use of memcpy with much simpler copy (faster and smaller with our sized copies)
 5003-static void stbir_simd_memcpy( void * dest, void const * src, size_t bytes )
 5004-{
 5005-  char STBIR_SIMD_STREAMOUT_PTR (*) d = (char*) dest;
 5006-  char STBIR_SIMD_STREAMOUT_PTR( * ) d_end = ((char*) dest) + bytes;
 5007-  ptrdiff_t ofs_to_src = (char*)src - (char*)dest;
 5008-
 5009-  // check overlaps
 5010-  STBIR_ASSERT( ( ( d >= ( (char*)src) + bytes ) ) || ( ( d + bytes ) <= (char*)src ) );
 5011-
 5012-  if ( bytes < (16*stbir__simdfX_float_count) )
 5013-  {
 5014-    if ( bytes < 16 )
 5015-    {
 5016-      if ( bytes )
 5017-      {
 5018-        STBIR_SIMD_NO_UNROLL_LOOP_START
 5019-        do
 5020-        {
 5021-          STBIR_SIMD_NO_UNROLL(d);
 5022-          d[ 0 ] = d[ ofs_to_src ];
 5023-          ++d;
 5024-        } while ( d < d_end );
 5025-      }
 5026-    }
 5027-    else
 5028-    {
 5029-      stbir__simdf x;
 5030-      // do one unaligned to get us aligned for the stream out below
 5031-      stbir__simdf_load( x, ( d + ofs_to_src ) );
 5032-      stbir__simdf_store( d, x );
 5033-      d = (char*)( ( ( (size_t)d ) + 16 ) & ~15 );
 5034-
 5035-      STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
 5036-      for(;;)
 5037-      {
 5038-        STBIR_SIMD_NO_UNROLL(d);
 5039-
 5040-        if ( d > ( d_end - 16 ) )
 5041-        {
 5042-          if ( d == d_end )
 5043-            return;
 5044-          d = d_end - 16;
 5045-        }
 5046-
 5047-        stbir__simdf_load( x, ( d + ofs_to_src ) );
 5048-        stbir__simdf_store( d, x );
 5049-        d += 16;
 5050-      }
 5051-    }
 5052-  }
 5053-  else
 5054-  {
 5055-    stbir__simdfX x0,x1,x2,x3;
 5056-
 5057-    // do one unaligned to get us aligned for the stream out below
 5058-    stbir__simdfX_load( x0, ( d + ofs_to_src ) +  0*stbir__simdfX_float_count );
 5059-    stbir__simdfX_load( x1, ( d + ofs_to_src ) +  4*stbir__simdfX_float_count );
 5060-    stbir__simdfX_load( x2, ( d + ofs_to_src ) +  8*stbir__simdfX_float_count );
 5061-    stbir__simdfX_load( x3, ( d + ofs_to_src ) + 12*stbir__simdfX_float_count );
 5062-    stbir__simdfX_store( d +  0*stbir__simdfX_float_count, x0 );
 5063-    stbir__simdfX_store( d +  4*stbir__simdfX_float_count, x1 );
 5064-    stbir__simdfX_store( d +  8*stbir__simdfX_float_count, x2 );
 5065-    stbir__simdfX_store( d + 12*stbir__simdfX_float_count, x3 );
 5066-    d = (char*)( ( ( (size_t)d ) + (16*stbir__simdfX_float_count) ) & ~((16*stbir__simdfX_float_count)-1) );
 5067-
 5068-    STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
 5069-    for(;;)
 5070-    {
 5071-      STBIR_SIMD_NO_UNROLL(d);
 5072-
 5073-      if ( d > ( d_end - (16*stbir__simdfX_float_count) ) )
 5074-      {
 5075-        if ( d == d_end )
 5076-          return;
 5077-        d = d_end - (16*stbir__simdfX_float_count);
 5078-      }
 5079-
 5080-      stbir__simdfX_load( x0, ( d + ofs_to_src ) +  0*stbir__simdfX_float_count );
 5081-      stbir__simdfX_load( x1, ( d + ofs_to_src ) +  4*stbir__simdfX_float_count );
 5082-      stbir__simdfX_load( x2, ( d + ofs_to_src ) +  8*stbir__simdfX_float_count );
 5083-      stbir__simdfX_load( x3, ( d + ofs_to_src ) + 12*stbir__simdfX_float_count );
 5084-      stbir__simdfX_store( d +  0*stbir__simdfX_float_count, x0 );
 5085-      stbir__simdfX_store( d +  4*stbir__simdfX_float_count, x1 );
 5086-      stbir__simdfX_store( d +  8*stbir__simdfX_float_count, x2 );
 5087-      stbir__simdfX_store( d + 12*stbir__simdfX_float_count, x3 );
 5088-      d += (16*stbir__simdfX_float_count);
 5089-    }
 5090-  }
 5091-}
 5092-
 5093-// memcpy that is specically intentionally overlapping (src is smaller then dest, so can be
 5094-//   a normal forward copy, bytes is divisible by 4 and bytes is greater than or equal to
 5095-//   the diff between dest and src)
 5096-static void stbir_overlapping_memcpy( void * dest, void const * src, size_t bytes )
 5097-{
 5098-  char STBIR_SIMD_STREAMOUT_PTR (*) sd = (char*) src;
 5099-  char STBIR_SIMD_STREAMOUT_PTR( * ) s_end = ((char*) src) + bytes;
 5100-  ptrdiff_t ofs_to_dest = (char*)dest - (char*)src;
 5101-
 5102-  if ( ofs_to_dest >= 16 ) // is the overlap more than 16 away?
 5103-  {
 5104-    char STBIR_SIMD_STREAMOUT_PTR( * ) s_end16 = ((char*) src) + (bytes&~15);
 5105-    STBIR_SIMD_NO_UNROLL_LOOP_START
 5106-    do
 5107-    {
 5108-      stbir__simdf x;
 5109-      STBIR_SIMD_NO_UNROLL(sd);
 5110-      stbir__simdf_load( x, sd );
 5111-      stbir__simdf_store(  ( sd + ofs_to_dest ), x );
 5112-      sd += 16;
 5113-    } while ( sd < s_end16 );
 5114-
 5115-    if ( sd == s_end )
 5116-      return;
 5117-  }
 5118-
 5119-  do
 5120-  {
 5121-    STBIR_SIMD_NO_UNROLL(sd);
 5122-    *(int*)( sd + ofs_to_dest ) = *(int*) sd;
 5123-    sd += 4;
 5124-  } while ( sd < s_end );
 5125+// override normal use of memcpy with much simpler copy (faster and smaller with
 5126+// our sized copies)
 5127+static void
 5128+stbir_simd_memcpy(void *dest, void const *src, size_t bytes)
 5129+{
 5130+	char STBIR_SIMD_STREAMOUT_PTR(*) d = (char *)dest;
 5131+	char STBIR_SIMD_STREAMOUT_PTR(*) d_end = ((char *)dest) + bytes;
 5132+	ptrdiff_t ofs_to_src = (char *)src - (char *)dest;
 5133+
 5134+	// check overlaps
 5135+	STBIR_ASSERT(((d >= ((char *)src) + bytes)) ||
 5136+	             ((d + bytes) <= (char *)src));
 5137+
 5138+	if (bytes < (16 * stbir__simdfX_float_count)) {
 5139+		if (bytes < 16) {
 5140+			if (bytes) {
 5141+				STBIR_SIMD_NO_UNROLL_LOOP_START
 5142+				do {
 5143+					STBIR_SIMD_NO_UNROLL(d);
 5144+					d[0] = d[ofs_to_src];
 5145+					++d;
 5146+				} while (d < d_end);
 5147+			}
 5148+		} else {
 5149+			stbir__simdf x;
 5150+			// do one unaligned to get us aligned for the stream out below
 5151+			stbir__simdf_load(x, (d + ofs_to_src));
 5152+			stbir__simdf_store(d, x);
 5153+			d = (char *)((((size_t)d) + 16) & ~15);
 5154+
 5155+			STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
 5156+			for (;;) {
 5157+				STBIR_SIMD_NO_UNROLL(d);
 5158+
 5159+				if (d > (d_end - 16)) {
 5160+					if (d == d_end) {
 5161+						return;
 5162+					}
 5163+					d = d_end - 16;
 5164+				}
 5165+
 5166+				stbir__simdf_load(x, (d + ofs_to_src));
 5167+				stbir__simdf_store(d, x);
 5168+				d += 16;
 5169+			}
 5170+		}
 5171+	} else {
 5172+		stbir__simdfX x0, x1, x2, x3;
 5173+
 5174+		// do one unaligned to get us aligned for the stream out below
 5175+		stbir__simdfX_load(x0,
 5176+		                   (d + ofs_to_src) + 0 * stbir__simdfX_float_count);
 5177+		stbir__simdfX_load(x1,
 5178+		                   (d + ofs_to_src) + 4 * stbir__simdfX_float_count);
 5179+		stbir__simdfX_load(x2,
 5180+		                   (d + ofs_to_src) + 8 * stbir__simdfX_float_count);
 5181+		stbir__simdfX_load(x3,
 5182+		                   (d + ofs_to_src) + 12 * stbir__simdfX_float_count);
 5183+		stbir__simdfX_store(d + 0 * stbir__simdfX_float_count, x0);
 5184+		stbir__simdfX_store(d + 4 * stbir__simdfX_float_count, x1);
 5185+		stbir__simdfX_store(d + 8 * stbir__simdfX_float_count, x2);
 5186+		stbir__simdfX_store(d + 12 * stbir__simdfX_float_count, x3);
 5187+		d = (char *)((((size_t)d) + (16 * stbir__simdfX_float_count)) &
 5188+		             ~((16 * stbir__simdfX_float_count) - 1));
 5189+
 5190+		STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
 5191+		for (;;) {
 5192+			STBIR_SIMD_NO_UNROLL(d);
 5193+
 5194+			if (d > (d_end - (16 * stbir__simdfX_float_count))) {
 5195+				if (d == d_end) {
 5196+					return;
 5197+				}
 5198+				d = d_end - (16 * stbir__simdfX_float_count);
 5199+			}
 5200+
 5201+			stbir__simdfX_load(x0, (d + ofs_to_src) +
 5202+			                           0 * stbir__simdfX_float_count);
 5203+			stbir__simdfX_load(x1, (d + ofs_to_src) +
 5204+			                           4 * stbir__simdfX_float_count);
 5205+			stbir__simdfX_load(x2, (d + ofs_to_src) +
 5206+			                           8 * stbir__simdfX_float_count);
 5207+			stbir__simdfX_load(x3, (d + ofs_to_src) +
 5208+			                           12 * stbir__simdfX_float_count);
 5209+			stbir__simdfX_store(d + 0 * stbir__simdfX_float_count, x0);
 5210+			stbir__simdfX_store(d + 4 * stbir__simdfX_float_count, x1);
 5211+			stbir__simdfX_store(d + 8 * stbir__simdfX_float_count, x2);
 5212+			stbir__simdfX_store(d + 12 * stbir__simdfX_float_count, x3);
 5213+			d += (16 * stbir__simdfX_float_count);
 5214+		}
 5215+	}
 5216+}
 5217+
 5218+// memcpy that is specically intentionally overlapping (src is smaller then
 5219+// dest, so can be
 5220+//   a normal forward copy, bytes is divisible by 4 and bytes is greater than or
 5221+//   equal to the diff between dest and src)
 5222+static void
 5223+stbir_overlapping_memcpy(void *dest, void const *src, size_t bytes)
 5224+{
 5225+	char STBIR_SIMD_STREAMOUT_PTR(*) sd = (char *)src;
 5226+	char STBIR_SIMD_STREAMOUT_PTR(*) s_end = ((char *)src) + bytes;
 5227+	ptrdiff_t ofs_to_dest = (char *)dest - (char *)src;
 5228+
 5229+	if (ofs_to_dest >= 16) // is the overlap more than 16 away?
 5230+	{
 5231+		char STBIR_SIMD_STREAMOUT_PTR(*) s_end16 =
 5232+		    ((char *)src) + (bytes & ~15);
 5233+		STBIR_SIMD_NO_UNROLL_LOOP_START
 5234+		do {
 5235+			stbir__simdf x;
 5236+			STBIR_SIMD_NO_UNROLL(sd);
 5237+			stbir__simdf_load(x, sd);
 5238+			stbir__simdf_store((sd + ofs_to_dest), x);
 5239+			sd += 16;
 5240+		} while (sd < s_end16);
 5241+
 5242+		if (sd == s_end) {
 5243+			return;
 5244+		}
 5245+	}
 5246+
 5247+	do {
 5248+		STBIR_SIMD_NO_UNROLL(sd);
 5249+		*(int *)(sd + ofs_to_dest) = *(int *)sd;
 5250+		sd += 4;
 5251+	} while (sd < s_end);
 5252 }
 5253 
 5254 #else // no SSE2
 5255 
 5256-// when in scalar mode, we let unrolling happen, so this macro just does the __restrict
 5257-#define STBIR_SIMD_STREAMOUT_PTR( star ) STBIR_STREAMOUT_PTR( star )
 5258+// when in scalar mode, we let unrolling happen, so this macro just does the
 5259+// __restrict
 5260+#define STBIR_SIMD_STREAMOUT_PTR(star) STBIR_STREAMOUT_PTR(star)
 5261 #define STBIR_SIMD_NO_UNROLL(ptr)
 5262 #define STBIR_SIMD_NO_UNROLL_LOOP_START
 5263 #define STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
 5264 
 5265 #endif // SSE2
 5266 
 5267-
 5268 #ifdef STBIR_PROFILE
 5269 
 5270 #ifndef STBIR_PROFILE_FUNC
 5271 
 5272-#if defined(_x86_64) || defined( __x86_64__ ) || defined( _M_X64 ) || defined(__x86_64) || defined(__SSE2__) || defined(STBIR_SSE) || defined( _M_IX86_FP ) || defined(__i386) || defined( __i386__ ) || defined( _M_IX86 ) || defined( _X86_ )
 5273+#if defined(_x86_64) || defined(__x86_64__) || defined(_M_X64) ||              \
 5274+    defined(__x86_64) || defined(__SSE2__) || defined(STBIR_SSE) ||            \
 5275+    defined(_M_IX86_FP) || defined(__i386) || defined(__i386__) ||             \
 5276+    defined(_M_IX86) || defined(_X86_)
 5277 
 5278 #ifdef _MSC_VER
 5279 
 5280-  STBIRDEF stbir_uint64 __rdtsc();
 5281-  #define STBIR_PROFILE_FUNC() __rdtsc()
 5282+STBIRDEF stbir_uint64
 5283+__rdtsc();
 5284+#define STBIR_PROFILE_FUNC() __rdtsc()
 5285 
 5286 #else // non msvc
 5287 
 5288-  static stbir__inline stbir_uint64 STBIR_PROFILE_FUNC()
 5289-  {
 5290-    stbir_uint32 lo, hi;
 5291-    asm volatile ("rdtsc" : "=a" (lo), "=d" (hi) );
 5292-    return ( ( (stbir_uint64) hi ) << 32 ) | ( (stbir_uint64) lo );
 5293-  }
 5294+static stbir__inline stbir_uint64
 5295+STBIR_PROFILE_FUNC()
 5296+{
 5297+	stbir_uint32 lo, hi;
 5298+	asm volatile("rdtsc" : "=a"(lo), "=d"(hi));
 5299+	return (((stbir_uint64)hi) << 32) | ((stbir_uint64)lo);
 5300+}
 5301 
 5302-#endif  // msvc
 5303+#endif // msvc
 5304 
 5305-#elif defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) || defined(__ARM_NEON__)
 5306+#elif defined(_M_ARM64) || defined(__aarch64__) || defined(__arm64__) ||       \
 5307+    defined(__ARM_NEON__)
 5308 
 5309-#if defined( _MSC_VER ) && !defined(__clang__)
 5310+#if defined(_MSC_VER) && !defined(__clang__)
 5311 
 5312-  #define STBIR_PROFILE_FUNC() _ReadStatusReg(ARM64_CNTVCT)
 5313+#define STBIR_PROFILE_FUNC() _ReadStatusReg(ARM64_CNTVCT)
 5314 
 5315 #else
 5316 
 5317-  static stbir__inline stbir_uint64 STBIR_PROFILE_FUNC()
 5318-  {
 5319-    stbir_uint64 tsc;
 5320-    asm volatile("mrs %0, cntvct_el0" : "=r" (tsc));
 5321-    return tsc;
 5322-  }
 5323+static stbir__inline stbir_uint64
 5324+STBIR_PROFILE_FUNC()
 5325+{
 5326+	stbir_uint64 tsc;
 5327+	asm volatile("mrs %0, cntvct_el0" : "=r"(tsc));
 5328+	return tsc;
 5329+}
 5330 
 5331 #endif
 5332 
 5333@@ -2763,35 +3370,69 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte
 5334 
 5335 #error Unknown platform for profiling.
 5336 
 5337-#endif  // x64, arm
 5338+#endif // x64, arm
 5339 
 5340 #endif // STBIR_PROFILE_FUNC
 5341 
 5342-#define STBIR_ONLY_PROFILE_GET_SPLIT_INFO ,stbir__per_split_info * split_info
 5343-#define STBIR_ONLY_PROFILE_SET_SPLIT_INFO ,split_info
 5344+#define STBIR_ONLY_PROFILE_GET_SPLIT_INFO , stbir__per_split_info *split_info
 5345+#define STBIR_ONLY_PROFILE_SET_SPLIT_INFO , split_info
 5346 
 5347-#define STBIR_ONLY_PROFILE_BUILD_GET_INFO ,stbir__info * profile_info
 5348-#define STBIR_ONLY_PROFILE_BUILD_SET_INFO ,profile_info
 5349+#define STBIR_ONLY_PROFILE_BUILD_GET_INFO , stbir__info *profile_info
 5350+#define STBIR_ONLY_PROFILE_BUILD_SET_INFO , profile_info
 5351 
 5352 // super light-weight micro profiler
 5353-#define STBIR_PROFILE_START_ll( info, wh ) { stbir_uint64 wh##thiszonetime = STBIR_PROFILE_FUNC(); stbir_uint64 * wh##save_parent_excluded_ptr = info->current_zone_excluded_ptr; stbir_uint64 wh##current_zone_excluded = 0; info->current_zone_excluded_ptr = &wh##current_zone_excluded;
 5354-#define STBIR_PROFILE_END_ll( info, wh ) wh##thiszonetime = STBIR_PROFILE_FUNC() - wh##thiszonetime; info->profile.named.wh += wh##thiszonetime - wh##current_zone_excluded; *wh##save_parent_excluded_ptr += wh##thiszonetime; info->current_zone_excluded_ptr = wh##save_parent_excluded_ptr; }
 5355-#define STBIR_PROFILE_FIRST_START_ll( info, wh ) { int i; info->current_zone_excluded_ptr = &info->profile.named.total; for(i=0;i<STBIR__ARRAY_SIZE(info->profile.array);i++) info->profile.array[i]=0; } STBIR_PROFILE_START_ll( info, wh );
 5356-#define STBIR_PROFILE_CLEAR_EXTRAS_ll( info, num ) { int extra; for(extra=1;extra<(num);extra++) { int i; for(i=0;i<STBIR__ARRAY_SIZE((info)->profile.array);i++) (info)[extra].profile.array[i]=0; } }
 5357+#define STBIR_PROFILE_START_ll(info, wh)                                       \
 5358+	{                                                                          \
 5359+		stbir_uint64 wh##thiszonetime = STBIR_PROFILE_FUNC();                  \
 5360+		stbir_uint64 *wh##save_parent_excluded_ptr =                           \
 5361+		    info->current_zone_excluded_ptr;                                   \
 5362+		stbir_uint64 wh##current_zone_excluded = 0;                            \
 5363+		info->current_zone_excluded_ptr = &wh##current_zone_excluded;
 5364+#define STBIR_PROFILE_END_ll(info, wh)                                         \
 5365+	wh##thiszonetime = STBIR_PROFILE_FUNC() - wh##thiszonetime;                \
 5366+	info->profile.named.wh += wh##thiszonetime - wh##current_zone_excluded;    \
 5367+	*wh##save_parent_excluded_ptr += wh##thiszonetime;                         \
 5368+	info->current_zone_excluded_ptr = wh##save_parent_excluded_ptr;            \
 5369+	}
 5370+#define STBIR_PROFILE_FIRST_START_ll(info, wh)                                 \
 5371+	{                                                                          \
 5372+		int i;                                                                 \
 5373+		info->current_zone_excluded_ptr = &info->profile.named.total;          \
 5374+		for (i = 0; i < STBIR__ARRAY_SIZE(info->profile.array); i++)           \
 5375+			info->profile.array[i] = 0;                                        \
 5376+	}                                                                          \
 5377+	STBIR_PROFILE_START_ll(info, wh);
 5378+#define STBIR_PROFILE_CLEAR_EXTRAS_ll(info, num)                               \
 5379+	{                                                                          \
 5380+		int extra;                                                             \
 5381+		for (extra = 1; extra < (num); extra++) {                              \
 5382+			int i;                                                             \
 5383+			for (i = 0; i < STBIR__ARRAY_SIZE((info)->profile.array); i++)     \
 5384+				(info)[extra].profile.array[i] = 0;                            \
 5385+		}                                                                      \
 5386+	}
 5387 
 5388 // for thread data
 5389-#define STBIR_PROFILE_START( wh ) STBIR_PROFILE_START_ll( split_info, wh )
 5390-#define STBIR_PROFILE_END( wh ) STBIR_PROFILE_END_ll( split_info, wh )
 5391-#define STBIR_PROFILE_FIRST_START( wh ) STBIR_PROFILE_FIRST_START_ll( split_info, wh )
 5392-#define STBIR_PROFILE_CLEAR_EXTRAS() STBIR_PROFILE_CLEAR_EXTRAS_ll( split_info, split_count )
 5393+#define STBIR_PROFILE_START(wh) STBIR_PROFILE_START_ll(split_info, wh)
 5394+#define STBIR_PROFILE_END(wh) STBIR_PROFILE_END_ll(split_info, wh)
 5395+#define STBIR_PROFILE_FIRST_START(wh)                                          \
 5396+	STBIR_PROFILE_FIRST_START_ll(split_info, wh)
 5397+#define STBIR_PROFILE_CLEAR_EXTRAS()                                           \
 5398+	STBIR_PROFILE_CLEAR_EXTRAS_ll(split_info, split_count)
 5399 
 5400 // for build data
 5401-#define STBIR_PROFILE_BUILD_START( wh ) STBIR_PROFILE_START_ll( profile_info, wh )
 5402-#define STBIR_PROFILE_BUILD_END( wh ) STBIR_PROFILE_END_ll( profile_info, wh )
 5403-#define STBIR_PROFILE_BUILD_FIRST_START( wh ) STBIR_PROFILE_FIRST_START_ll( profile_info, wh )
 5404-#define STBIR_PROFILE_BUILD_CLEAR( info ) { int i; for(i=0;i<STBIR__ARRAY_SIZE(info->profile.array);i++) info->profile.array[i]=0; }
 5405-
 5406-#else  // no profile
 5407+#define STBIR_PROFILE_BUILD_START(wh) STBIR_PROFILE_START_ll(profile_info, wh)
 5408+#define STBIR_PROFILE_BUILD_END(wh) STBIR_PROFILE_END_ll(profile_info, wh)
 5409+#define STBIR_PROFILE_BUILD_FIRST_START(wh)                                    \
 5410+	STBIR_PROFILE_FIRST_START_ll(profile_info, wh)
 5411+#define STBIR_PROFILE_BUILD_CLEAR(info)                                        \
 5412+	{                                                                          \
 5413+		int i;                                                                 \
 5414+		for (i = 0; i < STBIR__ARRAY_SIZE(info->profile.array); i++)           \
 5415+			info->profile.array[i] = 0;                                        \
 5416+	}
 5417+
 5418+#else // no profile
 5419 
 5420 #define STBIR_ONLY_PROFILE_GET_SPLIT_INFO
 5421 #define STBIR_ONLY_PROFILE_SET_SPLIT_INFO
 5422@@ -2799,17 +3440,17 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte
 5423 #define STBIR_ONLY_PROFILE_BUILD_GET_INFO
 5424 #define STBIR_ONLY_PROFILE_BUILD_SET_INFO
 5425 
 5426-#define STBIR_PROFILE_START( wh )
 5427-#define STBIR_PROFILE_END( wh )
 5428-#define STBIR_PROFILE_FIRST_START( wh )
 5429-#define STBIR_PROFILE_CLEAR_EXTRAS( )
 5430+#define STBIR_PROFILE_START(wh)
 5431+#define STBIR_PROFILE_END(wh)
 5432+#define STBIR_PROFILE_FIRST_START(wh)
 5433+#define STBIR_PROFILE_CLEAR_EXTRAS()
 5434 
 5435-#define STBIR_PROFILE_BUILD_START( wh )
 5436-#define STBIR_PROFILE_BUILD_END( wh )
 5437-#define STBIR_PROFILE_BUILD_FIRST_START( wh )
 5438-#define STBIR_PROFILE_BUILD_CLEAR( info )
 5439+#define STBIR_PROFILE_BUILD_START(wh)
 5440+#define STBIR_PROFILE_BUILD_END(wh)
 5441+#define STBIR_PROFILE_BUILD_FIRST_START(wh)
 5442+#define STBIR_PROFILE_BUILD_CLEAR(info)
 5443 
 5444-#endif  // stbir_profile
 5445+#endif // stbir_profile
 5446 
 5447 #ifndef STBIR_CEILF
 5448 #include <math.h>
 5449@@ -2825,665 +3466,763 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte
 5450 #ifndef STBIR_MEMCPY
 5451 // For memcpy
 5452 #include <string.h>
 5453-#define STBIR_MEMCPY( dest, src, len ) memcpy( dest, src, len )
 5454+#define STBIR_MEMCPY(dest, src, len) memcpy(dest, src, len)
 5455 #endif
 5456 
 5457 #ifndef STBIR_SIMD
 5458 
 5459-// memcpy that is specifically intentionally overlapping (src is smaller then dest, so can be
 5460-//   a normal forward copy, bytes is divisible by 4 and bytes is greater than or equal to
 5461-//   the diff between dest and src)
 5462-static void stbir_overlapping_memcpy( void * dest, void const * src, size_t bytes )
 5463-{
 5464-  char STBIR_SIMD_STREAMOUT_PTR (*) sd = (char*) src;
 5465-  char STBIR_SIMD_STREAMOUT_PTR( * ) s_end = ((char*) src) + bytes;
 5466-  ptrdiff_t ofs_to_dest = (char*)dest - (char*)src;
 5467-
 5468-  if ( ofs_to_dest >= 8 ) // is the overlap more than 8 away?
 5469-  {
 5470-    char STBIR_SIMD_STREAMOUT_PTR( * ) s_end8 = ((char*) src) + (bytes&~7);
 5471-    STBIR_NO_UNROLL_LOOP_START
 5472-    do
 5473-    {
 5474-      STBIR_NO_UNROLL(sd);
 5475-      *(stbir_uint64*)( sd + ofs_to_dest ) = *(stbir_uint64*) sd;
 5476-      sd += 8;
 5477-    } while ( sd < s_end8 );
 5478-
 5479-    if ( sd == s_end )
 5480-      return;
 5481-  }
 5482-
 5483-  STBIR_NO_UNROLL_LOOP_START
 5484-  do
 5485-  {
 5486-    STBIR_NO_UNROLL(sd);
 5487-    *(int*)( sd + ofs_to_dest ) = *(int*) sd;
 5488-    sd += 4;
 5489-  } while ( sd < s_end );
 5490+// memcpy that is specifically intentionally overlapping (src is smaller then
 5491+// dest, so can be
 5492+//   a normal forward copy, bytes is divisible by 4 and bytes is greater than or
 5493+//   equal to the diff between dest and src)
 5494+static void
 5495+stbir_overlapping_memcpy(void *dest, void const *src, size_t bytes)
 5496+{
 5497+	char STBIR_SIMD_STREAMOUT_PTR(*) sd = (char *)src;
 5498+	char STBIR_SIMD_STREAMOUT_PTR(*) s_end = ((char *)src) + bytes;
 5499+	ptrdiff_t ofs_to_dest = (char *)dest - (char *)src;
 5500+
 5501+	if (ofs_to_dest >= 8) // is the overlap more than 8 away?
 5502+	{
 5503+		char STBIR_SIMD_STREAMOUT_PTR(*) s_end8 = ((char *)src) + (bytes & ~7);
 5504+		STBIR_NO_UNROLL_LOOP_START
 5505+		do {
 5506+			STBIR_NO_UNROLL(sd);
 5507+			*(stbir_uint64 *)(sd + ofs_to_dest) = *(stbir_uint64 *)sd;
 5508+			sd += 8;
 5509+		} while (sd < s_end8);
 5510+
 5511+		if (sd == s_end) {
 5512+			return;
 5513+		}
 5514+	}
 5515+
 5516+	STBIR_NO_UNROLL_LOOP_START
 5517+	do {
 5518+		STBIR_NO_UNROLL(sd);
 5519+		*(int *)(sd + ofs_to_dest) = *(int *)sd;
 5520+		sd += 4;
 5521+	} while (sd < s_end);
 5522 }
 5523 
 5524 #endif
 5525 
 5526-static float stbir__filter_trapezoid(float x, float scale, void * user_data)
 5527+static float
 5528+stbir__filter_trapezoid(float x, float scale, void *user_data)
 5529 {
 5530-  float halfscale = scale / 2;
 5531-  float t = 0.5f + halfscale;
 5532-  STBIR_ASSERT(scale <= 1);
 5533-  STBIR__UNUSED(user_data);
 5534+	float halfscale = scale / 2;
 5535+	float t = 0.5f + halfscale;
 5536+	STBIR_ASSERT(scale <= 1);
 5537+	STBIR__UNUSED(user_data);
 5538 
 5539-  if ( x < 0.0f ) x = -x;
 5540+	if (x < 0.0f) {
 5541+		x = -x;
 5542+	}
 5543 
 5544-  if (x >= t)
 5545-    return 0.0f;
 5546-  else
 5547-  {
 5548-    float r = 0.5f - halfscale;
 5549-    if (x <= r)
 5550-      return 1.0f;
 5551-    else
 5552-      return (t - x) / scale;
 5553-  }
 5554+	if (x >= t) {
 5555+		return 0.0f;
 5556+	} else {
 5557+		float r = 0.5f - halfscale;
 5558+		if (x <= r) {
 5559+			return 1.0f;
 5560+		} else {
 5561+			return (t - x) / scale;
 5562+		}
 5563+	}
 5564 }
 5565 
 5566-static float stbir__support_trapezoid(float scale, void * user_data)
 5567+static float
 5568+stbir__support_trapezoid(float scale, void *user_data)
 5569 {
 5570-  STBIR__UNUSED(user_data);
 5571-  return 0.5f + scale / 2.0f;
 5572+	STBIR__UNUSED(user_data);
 5573+	return 0.5f + scale / 2.0f;
 5574 }
 5575 
 5576-static float stbir__filter_triangle(float x, float s, void * user_data)
 5577+static float
 5578+stbir__filter_triangle(float x, float s, void *user_data)
 5579 {
 5580-  STBIR__UNUSED(s);
 5581-  STBIR__UNUSED(user_data);
 5582+	STBIR__UNUSED(s);
 5583+	STBIR__UNUSED(user_data);
 5584 
 5585-  if ( x < 0.0f ) x = -x;
 5586+	if (x < 0.0f) {
 5587+		x = -x;
 5588+	}
 5589 
 5590-  if (x <= 1.0f)
 5591-    return 1.0f - x;
 5592-  else
 5593-    return 0.0f;
 5594+	if (x <= 1.0f) {
 5595+		return 1.0f - x;
 5596+	} else {
 5597+		return 0.0f;
 5598+	}
 5599 }
 5600 
 5601-static float stbir__filter_point(float x, float s, void * user_data)
 5602+static float
 5603+stbir__filter_point(float x, float s, void *user_data)
 5604 {
 5605-  STBIR__UNUSED(x);
 5606-  STBIR__UNUSED(s);
 5607-  STBIR__UNUSED(user_data);
 5608+	STBIR__UNUSED(x);
 5609+	STBIR__UNUSED(s);
 5610+	STBIR__UNUSED(user_data);
 5611 
 5612-  return 1.0f;
 5613+	return 1.0f;
 5614 }
 5615 
 5616-static float stbir__filter_cubic(float x, float s, void * user_data)
 5617+static float
 5618+stbir__filter_cubic(float x, float s, void *user_data)
 5619 {
 5620-  STBIR__UNUSED(s);
 5621-  STBIR__UNUSED(user_data);
 5622+	STBIR__UNUSED(s);
 5623+	STBIR__UNUSED(user_data);
 5624 
 5625-  if ( x < 0.0f ) x = -x;
 5626+	if (x < 0.0f) {
 5627+		x = -x;
 5628+	}
 5629 
 5630-  if (x < 1.0f)
 5631-    return (4.0f + x*x*(3.0f*x - 6.0f))/6.0f;
 5632-  else if (x < 2.0f)
 5633-    return (8.0f + x*(-12.0f + x*(6.0f - x)))/6.0f;
 5634+	if (x < 1.0f) {
 5635+		return (4.0f + x * x * (3.0f * x - 6.0f)) / 6.0f;
 5636+	} else if (x < 2.0f) {
 5637+		return (8.0f + x * (-12.0f + x * (6.0f - x))) / 6.0f;
 5638+	}
 5639 
 5640-  return (0.0f);
 5641+	return (0.0f);
 5642 }
 5643 
 5644-static float stbir__filter_catmullrom(float x, float s, void * user_data)
 5645+static float
 5646+stbir__filter_catmullrom(float x, float s, void *user_data)
 5647 {
 5648-  STBIR__UNUSED(s);
 5649-  STBIR__UNUSED(user_data);
 5650+	STBIR__UNUSED(s);
 5651+	STBIR__UNUSED(user_data);
 5652 
 5653-  if ( x < 0.0f ) x = -x;
 5654+	if (x < 0.0f) {
 5655+		x = -x;
 5656+	}
 5657 
 5658-  if (x < 1.0f)
 5659-    return 1.0f - x*x*(2.5f - 1.5f*x);
 5660-  else if (x < 2.0f)
 5661-    return 2.0f - x*(4.0f + x*(0.5f*x - 2.5f));
 5662+	if (x < 1.0f) {
 5663+		return 1.0f - x * x * (2.5f - 1.5f * x);
 5664+	} else if (x < 2.0f) {
 5665+		return 2.0f - x * (4.0f + x * (0.5f * x - 2.5f));
 5666+	}
 5667 
 5668-  return (0.0f);
 5669+	return (0.0f);
 5670 }
 5671 
 5672-static float stbir__filter_mitchell(float x, float s, void * user_data)
 5673+static float
 5674+stbir__filter_mitchell(float x, float s, void *user_data)
 5675 {
 5676-  STBIR__UNUSED(s);
 5677-  STBIR__UNUSED(user_data);
 5678+	STBIR__UNUSED(s);
 5679+	STBIR__UNUSED(user_data);
 5680 
 5681-  if ( x < 0.0f ) x = -x;
 5682+	if (x < 0.0f) {
 5683+		x = -x;
 5684+	}
 5685 
 5686-  if (x < 1.0f)
 5687-    return (16.0f + x*x*(21.0f * x - 36.0f))/18.0f;
 5688-  else if (x < 2.0f)
 5689-    return (32.0f + x*(-60.0f + x*(36.0f - 7.0f*x)))/18.0f;
 5690+	if (x < 1.0f) {
 5691+		return (16.0f + x * x * (21.0f * x - 36.0f)) / 18.0f;
 5692+	} else if (x < 2.0f) {
 5693+		return (32.0f + x * (-60.0f + x * (36.0f - 7.0f * x))) / 18.0f;
 5694+	}
 5695 
 5696-  return (0.0f);
 5697+	return (0.0f);
 5698 }
 5699 
 5700-static float stbir__support_zeropoint5(float s, void * user_data)
 5701+static float
 5702+stbir__support_zeropoint5(float s, void *user_data)
 5703 {
 5704-  STBIR__UNUSED(s);
 5705-  STBIR__UNUSED(user_data);
 5706-  return 0.5f;
 5707+	STBIR__UNUSED(s);
 5708+	STBIR__UNUSED(user_data);
 5709+	return 0.5f;
 5710 }
 5711 
 5712-static float stbir__support_one(float s, void * user_data)
 5713+static float
 5714+stbir__support_one(float s, void *user_data)
 5715 {
 5716-  STBIR__UNUSED(s);
 5717-  STBIR__UNUSED(user_data);
 5718-  return 1;
 5719+	STBIR__UNUSED(s);
 5720+	STBIR__UNUSED(user_data);
 5721+	return 1;
 5722 }
 5723 
 5724-static float stbir__support_two(float s, void * user_data)
 5725+static float
 5726+stbir__support_two(float s, void *user_data)
 5727 {
 5728-  STBIR__UNUSED(s);
 5729-  STBIR__UNUSED(user_data);
 5730-  return 2;
 5731+	STBIR__UNUSED(s);
 5732+	STBIR__UNUSED(user_data);
 5733+	return 2;
 5734 }
 5735 
 5736 // This is the maximum number of input samples that can affect an output sample
 5737 // with the given filter from the output pixel's perspective
 5738-static int stbir__get_filter_pixel_width(stbir__support_callback * support, float scale, void * user_data)
 5739+static int
 5740+stbir__get_filter_pixel_width(stbir__support_callback *support, float scale,
 5741+                              void *user_data)
 5742 {
 5743-  STBIR_ASSERT(support != 0);
 5744+	STBIR_ASSERT(support != 0);
 5745 
 5746-  if ( scale >= ( 1.0f-stbir__small_float ) ) // upscale
 5747-    return (int)STBIR_CEILF(support(1.0f/scale,user_data) * 2.0f);
 5748-  else
 5749-    return (int)STBIR_CEILF(support(scale,user_data) * 2.0f / scale);
 5750+	if (scale >= (1.0f - stbir__small_float)) { // upscale
 5751+		return (int)STBIR_CEILF(support(1.0f / scale, user_data) * 2.0f);
 5752+	} else {
 5753+		return (int)STBIR_CEILF(support(scale, user_data) * 2.0f / scale);
 5754+	}
 5755 }
 5756 
 5757 // this is how many coefficents per run of the filter (which is different
 5758 //   from the filter_pixel_width depending on if we are scattering or gathering)
 5759-static int stbir__get_coefficient_width(stbir__sampler * samp, int is_gather, void * user_data)
 5760+static int
 5761+stbir__get_coefficient_width(stbir__sampler *samp, int is_gather,
 5762+                             void *user_data)
 5763 {
 5764-  float scale = samp->scale_info.scale;
 5765-  stbir__support_callback * support = samp->filter_support;
 5766+	float scale = samp->scale_info.scale;
 5767+	stbir__support_callback *support = samp->filter_support;
 5768 
 5769-  switch( is_gather )
 5770-  {
 5771-    case 1:
 5772-      return (int)STBIR_CEILF(support(1.0f / scale, user_data) * 2.0f);
 5773-    case 2:
 5774-      return (int)STBIR_CEILF(support(scale, user_data) * 2.0f / scale);
 5775-    case 0:
 5776-      return (int)STBIR_CEILF(support(scale, user_data) * 2.0f);
 5777-    default:
 5778-      STBIR_ASSERT( (is_gather >= 0 ) && (is_gather <= 2 ) );
 5779-      return 0;
 5780-  }
 5781+	switch (is_gather) {
 5782+	case 1:
 5783+		return (int)STBIR_CEILF(support(1.0f / scale, user_data) * 2.0f);
 5784+	case 2:
 5785+		return (int)STBIR_CEILF(support(scale, user_data) * 2.0f / scale);
 5786+	case 0:
 5787+		return (int)STBIR_CEILF(support(scale, user_data) * 2.0f);
 5788+	default:
 5789+		STBIR_ASSERT((is_gather >= 0) && (is_gather <= 2));
 5790+		return 0;
 5791+	}
 5792 }
 5793 
 5794-static int stbir__get_contributors(stbir__sampler * samp, int is_gather)
 5795+static int
 5796+stbir__get_contributors(stbir__sampler *samp, int is_gather)
 5797 {
 5798-  if (is_gather)
 5799-      return samp->scale_info.output_sub_size;
 5800-  else
 5801-      return (samp->scale_info.input_full_size + samp->filter_pixel_margin * 2);
 5802+	if (is_gather) {
 5803+		return samp->scale_info.output_sub_size;
 5804+	} else {
 5805+		return (samp->scale_info.input_full_size +
 5806+		        samp->filter_pixel_margin * 2);
 5807+	}
 5808 }
 5809 
 5810-static int stbir__edge_zero_full( int n, int max )
 5811+static int
 5812+stbir__edge_zero_full(int n, int max)
 5813 {
 5814-  STBIR__UNUSED(n);
 5815-  STBIR__UNUSED(max);
 5816-  return 0; // NOTREACHED
 5817+	STBIR__UNUSED(n);
 5818+	STBIR__UNUSED(max);
 5819+	return 0; // NOTREACHED
 5820 }
 5821 
 5822-static int stbir__edge_clamp_full( int n, int max )
 5823+static int
 5824+stbir__edge_clamp_full(int n, int max)
 5825 {
 5826-  if (n < 0)
 5827-    return 0;
 5828+	if (n < 0) {
 5829+		return 0;
 5830+	}
 5831 
 5832-  if (n >= max)
 5833-    return max - 1;
 5834+	if (n >= max) {
 5835+		return max - 1;
 5836+	}
 5837 
 5838-  return n; // NOTREACHED
 5839+	return n; // NOTREACHED
 5840 }
 5841 
 5842-static int stbir__edge_reflect_full( int n, int max )
 5843+static int
 5844+stbir__edge_reflect_full(int n, int max)
 5845 {
 5846-  if (n < 0)
 5847-  {
 5848-    if (n > -max)
 5849-      return -n;
 5850-    else
 5851-      return max - 1;
 5852-  }
 5853+	if (n < 0) {
 5854+		if (n > -max) {
 5855+			return -n;
 5856+		} else {
 5857+			return max - 1;
 5858+		}
 5859+	}
 5860 
 5861-  if (n >= max)
 5862-  {
 5863-    int max2 = max * 2;
 5864-    if (n >= max2)
 5865-      return 0;
 5866-    else
 5867-      return max2 - n - 1;
 5868-  }
 5869+	if (n >= max) {
 5870+		int max2 = max * 2;
 5871+		if (n >= max2) {
 5872+			return 0;
 5873+		} else {
 5874+			return max2 - n - 1;
 5875+		}
 5876+	}
 5877 
 5878-  return n; // NOTREACHED
 5879+	return n; // NOTREACHED
 5880 }
 5881 
 5882-static int stbir__edge_wrap_full( int n, int max )
 5883+static int
 5884+stbir__edge_wrap_full(int n, int max)
 5885 {
 5886-  if (n >= 0)
 5887-    return (n % max);
 5888-  else
 5889-  {
 5890-    int m = (-n) % max;
 5891+	if (n >= 0) {
 5892+		return (n % max);
 5893+	} else {
 5894+		int m = (-n) % max;
 5895 
 5896-    if (m != 0)
 5897-      m = max - m;
 5898+		if (m != 0) {
 5899+			m = max - m;
 5900+		}
 5901 
 5902-    return (m);
 5903-  }
 5904+		return (m);
 5905+	}
 5906 }
 5907 
 5908-typedef int stbir__edge_wrap_func( int n, int max );
 5909-static stbir__edge_wrap_func * stbir__edge_wrap_slow[] =
 5910-{
 5911-  stbir__edge_clamp_full,    // STBIR_EDGE_CLAMP
 5912-  stbir__edge_reflect_full,  // STBIR_EDGE_REFLECT
 5913-  stbir__edge_wrap_full,     // STBIR_EDGE_WRAP
 5914-  stbir__edge_zero_full,     // STBIR_EDGE_ZERO
 5915+typedef int
 5916+stbir__edge_wrap_func(int n, int max);
 5917+static stbir__edge_wrap_func *stbir__edge_wrap_slow[] = {
 5918+    stbir__edge_clamp_full,   // STBIR_EDGE_CLAMP
 5919+    stbir__edge_reflect_full, // STBIR_EDGE_REFLECT
 5920+    stbir__edge_wrap_full,    // STBIR_EDGE_WRAP
 5921+    stbir__edge_zero_full,    // STBIR_EDGE_ZERO
 5922 };
 5923 
 5924-stbir__inline static int stbir__edge_wrap(stbir_edge edge, int n, int max)
 5925+stbir__inline static int
 5926+stbir__edge_wrap(stbir_edge edge, int n, int max)
 5927 {
 5928-  // avoid per-pixel switch
 5929-  if (n >= 0 && n < max)
 5930-      return n;
 5931-  return stbir__edge_wrap_slow[edge]( n, max );
 5932+	// avoid per-pixel switch
 5933+	if (n >= 0 && n < max) {
 5934+		return n;
 5935+	}
 5936+	return stbir__edge_wrap_slow[edge](n, max);
 5937 }
 5938 
 5939 #define STBIR__MERGE_RUNS_PIXEL_THRESHOLD 16
 5940 
 5941 // get information on the extents of a sampler
 5942-static void stbir__get_extents( stbir__sampler * samp, stbir__extents * scanline_extents )
 5943-{
 5944-  int j, stop;
 5945-  int left_margin, right_margin;
 5946-  int min_n = 0x7fffffff, max_n = -0x7fffffff;
 5947-  int min_left = 0x7fffffff, max_left = -0x7fffffff;
 5948-  int min_right = 0x7fffffff, max_right = -0x7fffffff;
 5949-  stbir_edge edge = samp->edge;
 5950-  stbir__contributors* contributors = samp->contributors;
 5951-  int output_sub_size = samp->scale_info.output_sub_size;
 5952-  int input_full_size = samp->scale_info.input_full_size;
 5953-  int filter_pixel_margin = samp->filter_pixel_margin;
 5954-
 5955-  STBIR_ASSERT( samp->is_gather );
 5956-
 5957-  stop = output_sub_size;
 5958-  for (j = 0; j < stop; j++ )
 5959-  {
 5960-    STBIR_ASSERT( contributors[j].n1 >= contributors[j].n0 );
 5961-    if ( contributors[j].n0 < min_n )
 5962-    {
 5963-      min_n = contributors[j].n0;
 5964-      stop = j + filter_pixel_margin;  // if we find a new min, only scan another filter width
 5965-      if ( stop > output_sub_size ) stop = output_sub_size;
 5966-    }
 5967-  }
 5968-
 5969-  stop = 0;
 5970-  for (j = output_sub_size - 1; j >= stop; j-- )
 5971-  {
 5972-    STBIR_ASSERT( contributors[j].n1 >= contributors[j].n0 );
 5973-    if ( contributors[j].n1 > max_n )
 5974-    {
 5975-      max_n = contributors[j].n1;
 5976-      stop = j - filter_pixel_margin;  // if we find a new max, only scan another filter width
 5977-      if (stop<0) stop = 0;
 5978-    }
 5979-  }
 5980-
 5981-  STBIR_ASSERT( scanline_extents->conservative.n0 <= min_n );
 5982-  STBIR_ASSERT( scanline_extents->conservative.n1 >= max_n );
 5983-
 5984-  // now calculate how much into the margins we really read
 5985-  left_margin = 0;
 5986-  if ( min_n < 0 )
 5987-  {
 5988-    left_margin = -min_n;
 5989-    min_n = 0;
 5990-  }
 5991-
 5992-  right_margin = 0;
 5993-  if ( max_n >= input_full_size )
 5994-  {
 5995-    right_margin = max_n - input_full_size + 1;
 5996-    max_n = input_full_size - 1;
 5997-  }
 5998-
 5999-  // index 1 is margin pixel extents (how many pixels we hang over the edge)
 6000-  scanline_extents->edge_sizes[0] = left_margin;
 6001-  scanline_extents->edge_sizes[1] = right_margin;
 6002-
 6003-  // index 2 is pixels read from the input
 6004-  scanline_extents->spans[0].n0 = min_n;
 6005-  scanline_extents->spans[0].n1 = max_n;
 6006-  scanline_extents->spans[0].pixel_offset_for_input = min_n;
 6007-
 6008-  // default to no other input range
 6009-  scanline_extents->spans[1].n0 = 0;
 6010-  scanline_extents->spans[1].n1 = -1;
 6011-  scanline_extents->spans[1].pixel_offset_for_input = 0;
 6012-
 6013-  // don't have to do edge calc for zero clamp
 6014-  if ( edge == STBIR_EDGE_ZERO )
 6015-    return;
 6016-
 6017-  // convert margin pixels to the pixels within the input (min and max)
 6018-  for( j = -left_margin ; j < 0 ; j++ )
 6019-  {
 6020-      int p = stbir__edge_wrap( edge, j, input_full_size );
 6021-      if ( p < min_left )
 6022-        min_left = p;
 6023-      if ( p > max_left )
 6024-        max_left = p;
 6025-  }
 6026-
 6027-  for( j = input_full_size ; j < (input_full_size + right_margin) ; j++ )
 6028-  {
 6029-      int p = stbir__edge_wrap( edge, j, input_full_size );
 6030-      if ( p < min_right )
 6031-        min_right = p;
 6032-      if ( p > max_right )
 6033-        max_right = p;
 6034-  }
 6035-
 6036-  // merge the left margin pixel region if it connects within 4 pixels of main pixel region
 6037-  if ( min_left != 0x7fffffff )
 6038-  {
 6039-    if ( ( ( min_left <= min_n ) && ( ( max_left  + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= min_n ) ) ||
 6040-         ( ( min_n <= min_left ) && ( ( max_n  + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= max_left ) ) )
 6041-    {
 6042-      scanline_extents->spans[0].n0 = min_n = stbir__min( min_n, min_left );
 6043-      scanline_extents->spans[0].n1 = max_n = stbir__max( max_n, max_left );
 6044-      scanline_extents->spans[0].pixel_offset_for_input = min_n;
 6045-      left_margin = 0;
 6046-    }
 6047-  }
 6048-
 6049-  // merge the right margin pixel region if it connects within 4 pixels of main pixel region
 6050-  if ( min_right != 0x7fffffff )
 6051-  {
 6052-    if ( ( ( min_right <= min_n ) && ( ( max_right  + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= min_n ) ) ||
 6053-         ( ( min_n <= min_right ) && ( ( max_n  + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= max_right ) ) )
 6054-    {
 6055-      scanline_extents->spans[0].n0 = min_n = stbir__min( min_n, min_right );
 6056-      scanline_extents->spans[0].n1 = max_n = stbir__max( max_n, max_right );
 6057-      scanline_extents->spans[0].pixel_offset_for_input = min_n;
 6058-      right_margin = 0;
 6059-    }
 6060-  }
 6061-
 6062-  STBIR_ASSERT( scanline_extents->conservative.n0 <= min_n );
 6063-  STBIR_ASSERT( scanline_extents->conservative.n1 >= max_n );
 6064-
 6065-  // you get two ranges when you have the WRAP edge mode and you are doing just the a piece of the resize
 6066-  //   so you need to get a second run of pixels from the opposite side of the scanline (which you
 6067-  //   wouldn't need except for WRAP)
 6068-
 6069-
 6070-  // if we can't merge the min_left range, add it as a second range
 6071-  if ( ( left_margin ) && ( min_left != 0x7fffffff ) )
 6072-  {
 6073-    stbir__span * newspan = scanline_extents->spans + 1;
 6074-    STBIR_ASSERT( right_margin == 0 );
 6075-    if ( min_left < scanline_extents->spans[0].n0 )
 6076-    {
 6077-      scanline_extents->spans[1].pixel_offset_for_input = scanline_extents->spans[0].n0;
 6078-      scanline_extents->spans[1].n0 = scanline_extents->spans[0].n0;
 6079-      scanline_extents->spans[1].n1 = scanline_extents->spans[0].n1;
 6080-      --newspan;
 6081-    }
 6082-    newspan->pixel_offset_for_input = min_left;
 6083-    newspan->n0 = -left_margin;
 6084-    newspan->n1 = ( max_left - min_left ) - left_margin;
 6085-    scanline_extents->edge_sizes[0] = 0;  // don't need to copy the left margin, since we are directly decoding into the margin
 6086-  }
 6087-  // if we can't merge the min_left range, add it as a second range
 6088-  else  
 6089-  if ( ( right_margin ) && ( min_right != 0x7fffffff ) )
 6090-  {
 6091-    stbir__span * newspan = scanline_extents->spans + 1;
 6092-    if ( min_right < scanline_extents->spans[0].n0 )
 6093-    {
 6094-      scanline_extents->spans[1].pixel_offset_for_input = scanline_extents->spans[0].n0;
 6095-      scanline_extents->spans[1].n0 = scanline_extents->spans[0].n0;
 6096-      scanline_extents->spans[1].n1 = scanline_extents->spans[0].n1;
 6097-      --newspan;
 6098-    }
 6099-    newspan->pixel_offset_for_input = min_right;
 6100-    newspan->n0 = scanline_extents->spans[1].n1 + 1;
 6101-    newspan->n1 = scanline_extents->spans[1].n1 + 1 + ( max_right - min_right );
 6102-    scanline_extents->edge_sizes[1] = 0;  // don't need to copy the right margin, since we are directly decoding into the margin
 6103-  }
 6104-
 6105-  // sort the spans into write output order
 6106-  if ( ( scanline_extents->spans[1].n1 > scanline_extents->spans[1].n0 ) && ( scanline_extents->spans[0].n0 > scanline_extents->spans[1].n0 ) )
 6107-  {
 6108-    stbir__span tspan = scanline_extents->spans[0];
 6109-    scanline_extents->spans[0] = scanline_extents->spans[1];
 6110-    scanline_extents->spans[1] = tspan;
 6111-  }
 6112-}
 6113-
 6114-static void stbir__calculate_in_pixel_range( int * first_pixel, int * last_pixel, float out_pixel_center, float out_filter_radius, float inv_scale, float out_shift, int input_size, stbir_edge edge )
 6115-{
 6116-  int first, last;
 6117-  float out_pixel_influence_lowerbound = out_pixel_center - out_filter_radius;
 6118-  float out_pixel_influence_upperbound = out_pixel_center + out_filter_radius;
 6119-
 6120-  float in_pixel_influence_lowerbound = (out_pixel_influence_lowerbound + out_shift) * inv_scale;
 6121-  float in_pixel_influence_upperbound = (out_pixel_influence_upperbound + out_shift) * inv_scale;
 6122-
 6123-  first = (int)(STBIR_FLOORF(in_pixel_influence_lowerbound + 0.5f));
 6124-  last = (int)(STBIR_FLOORF(in_pixel_influence_upperbound - 0.5f));
 6125-  if ( last < first ) last = first; // point sample mode can span a value *right* at 0.5, and cause these to cross
 6126-
 6127-  if ( edge == STBIR_EDGE_WRAP )
 6128-  {
 6129-    if ( first < -input_size )
 6130-      first = -input_size;
 6131-    if ( last >= (input_size*2))
 6132-      last = (input_size*2) - 1;
 6133-  }
 6134-
 6135-  *first_pixel = first;
 6136-  *last_pixel = last;
 6137-}
 6138-
 6139-static void stbir__calculate_coefficients_for_gather_upsample( float out_filter_radius, stbir__kernel_callback * kernel, stbir__scale_info * scale_info, int num_contributors, stbir__contributors* contributors, float* coefficient_group, int coefficient_width, stbir_edge edge, void * user_data )
 6140-{
 6141-  int n, end;
 6142-  float inv_scale = scale_info->inv_scale;
 6143-  float out_shift = scale_info->pixel_shift;
 6144-  int input_size  = scale_info->input_full_size;
 6145-  int numerator = scale_info->scale_numerator;
 6146-  int polyphase = ( ( scale_info->scale_is_rational ) && ( numerator < num_contributors ) );
 6147-
 6148-  // Looping through out pixels
 6149-  end = num_contributors; if ( polyphase ) end = numerator;
 6150-  for (n = 0; n < end; n++)
 6151-  {
 6152-    int i;
 6153-    int last_non_zero;
 6154-    float out_pixel_center = (float)n + 0.5f;
 6155-    float in_center_of_out = (out_pixel_center + out_shift) * inv_scale;
 6156-
 6157-    int in_first_pixel, in_last_pixel;
 6158-
 6159-    stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, out_pixel_center, out_filter_radius, inv_scale, out_shift, input_size, edge );
 6160-
 6161-    // make sure we never generate a range larger than our precalculated coeff width
 6162-    //   this only happens in point sample mode, but it's a good safe thing to do anyway
 6163-    if ( ( in_last_pixel - in_first_pixel + 1 ) > coefficient_width )
 6164-      in_last_pixel = in_first_pixel + coefficient_width - 1;
 6165-
 6166-    last_non_zero = -1;
 6167-    for (i = 0; i <= in_last_pixel - in_first_pixel; i++)
 6168-    {
 6169-      float in_pixel_center = (float)(i + in_first_pixel) + 0.5f;
 6170-      float coeff = kernel(in_center_of_out - in_pixel_center, inv_scale, user_data);
 6171-
 6172-      // kill denormals
 6173-      if ( ( ( coeff < stbir__small_float ) && ( coeff > -stbir__small_float ) ) )
 6174-      {
 6175-        if ( i == 0 )  // if we're at the front, just eat zero contributors
 6176-        {
 6177-          STBIR_ASSERT ( ( in_last_pixel - in_first_pixel ) != 0 ); // there should be at least one contrib
 6178-          ++in_first_pixel;
 6179-          i--;
 6180-          continue;
 6181-        }
 6182-        coeff = 0;  // make sure is fully zero (should keep denormals away)
 6183-      }
 6184-      else
 6185-        last_non_zero = i;
 6186-
 6187-      coefficient_group[i] = coeff;
 6188-    }
 6189-
 6190-    in_last_pixel = last_non_zero+in_first_pixel; // kills trailing zeros
 6191-    contributors->n0 = in_first_pixel;
 6192-    contributors->n1 = in_last_pixel;
 6193-
 6194-    STBIR_ASSERT(contributors->n1 >= contributors->n0);
 6195-
 6196-    ++contributors;
 6197-    coefficient_group += coefficient_width;
 6198-  }
 6199-}
 6200-
 6201-static void stbir__insert_coeff( stbir__contributors * contribs, float * coeffs, int new_pixel, float new_coeff, int max_width )
 6202-{
 6203-  if ( new_pixel <= contribs->n1 )  // before the end
 6204-  {
 6205-    if ( new_pixel < contribs->n0 ) // before the front?
 6206-    {
 6207-      if ( ( contribs->n1 - new_pixel + 1 ) <= max_width )
 6208-      { 
 6209-        int j, o = contribs->n0 - new_pixel;
 6210-        for ( j = contribs->n1 - contribs->n0 ; j <= 0 ; j-- )
 6211-          coeffs[ j + o ] = coeffs[ j ];
 6212-        for ( j = 1 ; j < o ; j-- )
 6213-          coeffs[ j ] = coeffs[ 0 ];
 6214-        coeffs[ 0 ] = new_coeff;
 6215-        contribs->n0 = new_pixel;
 6216-      }
 6217-    }
 6218-    else
 6219-    {
 6220-      coeffs[ new_pixel - contribs->n0 ] += new_coeff;
 6221-    }
 6222-  }
 6223-  else
 6224-  {
 6225-    if ( ( new_pixel - contribs->n0 + 1 ) <= max_width )
 6226-    {
 6227-      int j, e = new_pixel - contribs->n0;
 6228-      for( j = ( contribs->n1 - contribs->n0 ) + 1 ; j < e ; j++ ) // clear in-betweens coeffs if there are any
 6229-        coeffs[j] = 0;
 6230-
 6231-      coeffs[ e ] = new_coeff;
 6232-      contribs->n1 = new_pixel;
 6233-    }
 6234-  }
 6235-}
 6236-
 6237-static void stbir__calculate_out_pixel_range( int * first_pixel, int * last_pixel, float in_pixel_center, float in_pixels_radius, float scale, float out_shift, int out_size )
 6238-{
 6239-  float in_pixel_influence_lowerbound = in_pixel_center - in_pixels_radius;
 6240-  float in_pixel_influence_upperbound = in_pixel_center + in_pixels_radius;
 6241-  float out_pixel_influence_lowerbound = in_pixel_influence_lowerbound * scale - out_shift;
 6242-  float out_pixel_influence_upperbound = in_pixel_influence_upperbound * scale - out_shift;
 6243-  int out_first_pixel = (int)(STBIR_FLOORF(out_pixel_influence_lowerbound + 0.5f));
 6244-  int out_last_pixel = (int)(STBIR_FLOORF(out_pixel_influence_upperbound - 0.5f));
 6245-
 6246-  if ( out_first_pixel < 0 )
 6247-    out_first_pixel = 0;
 6248-  if ( out_last_pixel >= out_size )
 6249-    out_last_pixel = out_size - 1;
 6250-  *first_pixel = out_first_pixel;
 6251-  *last_pixel = out_last_pixel;
 6252-}
 6253-
 6254-static void stbir__calculate_coefficients_for_gather_downsample( int start, int end, float in_pixels_radius, stbir__kernel_callback * kernel, stbir__scale_info * scale_info, int coefficient_width, int num_contributors, stbir__contributors * contributors, float * coefficient_group, void * user_data )
 6255-{
 6256-  int in_pixel;
 6257-  int i;
 6258-  int first_out_inited = -1;
 6259-  float scale = scale_info->scale;
 6260-  float out_shift = scale_info->pixel_shift;
 6261-  int out_size = scale_info->output_sub_size;
 6262-  int numerator = scale_info->scale_numerator;
 6263-  int polyphase = ( ( scale_info->scale_is_rational ) && ( numerator < out_size ) );
 6264-
 6265-  STBIR__UNUSED(num_contributors);
 6266-
 6267-  // Loop through the input pixels
 6268-  for (in_pixel = start; in_pixel < end; in_pixel++)
 6269-  {
 6270-    float in_pixel_center = (float)in_pixel + 0.5f;
 6271-    float out_center_of_in = in_pixel_center * scale - out_shift;
 6272-    int out_first_pixel, out_last_pixel;
 6273-
 6274-    stbir__calculate_out_pixel_range( &out_first_pixel, &out_last_pixel, in_pixel_center, in_pixels_radius, scale, out_shift, out_size );
 6275-
 6276-    if ( out_first_pixel > out_last_pixel )
 6277-      continue;
 6278-
 6279-    // clamp or exit if we are using polyphase filtering, and the limit is up
 6280-    if ( polyphase )
 6281-    {
 6282-      // when polyphase, you only have to do coeffs up to the numerator count
 6283-      if ( out_first_pixel == numerator )
 6284-        break;
 6285-
 6286-      // don't do any extra work, clamp last pixel at numerator too
 6287-      if ( out_last_pixel >= numerator )
 6288-        out_last_pixel = numerator - 1;
 6289-    }
 6290-
 6291-    for (i = 0; i <= out_last_pixel - out_first_pixel; i++)
 6292-    {
 6293-      float out_pixel_center = (float)(i + out_first_pixel) + 0.5f;
 6294-      float x = out_pixel_center - out_center_of_in;
 6295-      float coeff = kernel(x, scale, user_data) * scale;
 6296-
 6297-      // kill the coeff if it's too small (avoid denormals)
 6298-      if ( ( ( coeff < stbir__small_float ) && ( coeff > -stbir__small_float ) ) )
 6299-        coeff = 0.0f;
 6300-
 6301-      {
 6302-        int out = i + out_first_pixel;
 6303-        float * coeffs = coefficient_group + out * coefficient_width;
 6304-        stbir__contributors * contribs = contributors + out;
 6305-
 6306-        // is this the first time this output pixel has been seen?  Init it.
 6307-        if ( out > first_out_inited )
 6308-        {
 6309-          STBIR_ASSERT( out == ( first_out_inited + 1 ) ); // ensure we have only advanced one at time
 6310-          first_out_inited = out;
 6311-          contribs->n0 = in_pixel;
 6312-          contribs->n1 = in_pixel;
 6313-          coeffs[0]  = coeff;
 6314-        }
 6315-        else
 6316-        {
 6317-          // insert on end (always in order)
 6318-          if ( coeffs[0] == 0.0f )  // if the first coefficent is zero, then zap it for this coeffs
 6319-          {
 6320-            STBIR_ASSERT( ( in_pixel - contribs->n0 ) == 1 ); // ensure that when we zap, we're at the 2nd pos
 6321-            contribs->n0 = in_pixel;
 6322-          }
 6323-          contribs->n1 = in_pixel;
 6324-          STBIR_ASSERT( ( in_pixel - contribs->n0 ) < coefficient_width );
 6325-          coeffs[in_pixel - contribs->n0]  = coeff;
 6326-        }
 6327-      }
 6328-    }
 6329-  }
 6330+static void
 6331+stbir__get_extents(stbir__sampler *samp, stbir__extents *scanline_extents)
 6332+{
 6333+	int j, stop;
 6334+	int left_margin, right_margin;
 6335+	int min_n = 0x7fffffff, max_n = -0x7fffffff;
 6336+	int min_left = 0x7fffffff, max_left = -0x7fffffff;
 6337+	int min_right = 0x7fffffff, max_right = -0x7fffffff;
 6338+	stbir_edge edge = samp->edge;
 6339+	stbir__contributors *contributors = samp->contributors;
 6340+	int output_sub_size = samp->scale_info.output_sub_size;
 6341+	int input_full_size = samp->scale_info.input_full_size;
 6342+	int filter_pixel_margin = samp->filter_pixel_margin;
 6343+
 6344+	STBIR_ASSERT(samp->is_gather);
 6345+
 6346+	stop = output_sub_size;
 6347+	for (j = 0; j < stop; j++) {
 6348+		STBIR_ASSERT(contributors[j].n1 >= contributors[j].n0);
 6349+		if (contributors[j].n0 < min_n) {
 6350+			min_n = contributors[j].n0;
 6351+			stop = j + filter_pixel_margin; // if we find a new min, only scan
 6352+			                                // another filter width
 6353+			if (stop > output_sub_size) {
 6354+				stop = output_sub_size;
 6355+			}
 6356+		}
 6357+	}
 6358+
 6359+	stop = 0;
 6360+	for (j = output_sub_size - 1; j >= stop; j--) {
 6361+		STBIR_ASSERT(contributors[j].n1 >= contributors[j].n0);
 6362+		if (contributors[j].n1 > max_n) {
 6363+			max_n = contributors[j].n1;
 6364+			stop = j - filter_pixel_margin; // if we find a new max, only scan
 6365+			                                // another filter width
 6366+			if (stop < 0) {
 6367+				stop = 0;
 6368+			}
 6369+		}
 6370+	}
 6371+
 6372+	STBIR_ASSERT(scanline_extents->conservative.n0 <= min_n);
 6373+	STBIR_ASSERT(scanline_extents->conservative.n1 >= max_n);
 6374+
 6375+	// now calculate how much into the margins we really read
 6376+	left_margin = 0;
 6377+	if (min_n < 0) {
 6378+		left_margin = -min_n;
 6379+		min_n = 0;
 6380+	}
 6381+
 6382+	right_margin = 0;
 6383+	if (max_n >= input_full_size) {
 6384+		right_margin = max_n - input_full_size + 1;
 6385+		max_n = input_full_size - 1;
 6386+	}
 6387+
 6388+	// index 1 is margin pixel extents (how many pixels we hang over the edge)
 6389+	scanline_extents->edge_sizes[0] = left_margin;
 6390+	scanline_extents->edge_sizes[1] = right_margin;
 6391+
 6392+	// index 2 is pixels read from the input
 6393+	scanline_extents->spans[0].n0 = min_n;
 6394+	scanline_extents->spans[0].n1 = max_n;
 6395+	scanline_extents->spans[0].pixel_offset_for_input = min_n;
 6396+
 6397+	// default to no other input range
 6398+	scanline_extents->spans[1].n0 = 0;
 6399+	scanline_extents->spans[1].n1 = -1;
 6400+	scanline_extents->spans[1].pixel_offset_for_input = 0;
 6401+
 6402+	// don't have to do edge calc for zero clamp
 6403+	if (edge == STBIR_EDGE_ZERO) {
 6404+		return;
 6405+	}
 6406+
 6407+	// convert margin pixels to the pixels within the input (min and max)
 6408+	for (j = -left_margin; j < 0; j++) {
 6409+		int p = stbir__edge_wrap(edge, j, input_full_size);
 6410+		if (p < min_left) {
 6411+			min_left = p;
 6412+		}
 6413+		if (p > max_left) {
 6414+			max_left = p;
 6415+		}
 6416+	}
 6417+
 6418+	for (j = input_full_size; j < (input_full_size + right_margin); j++) {
 6419+		int p = stbir__edge_wrap(edge, j, input_full_size);
 6420+		if (p < min_right) {
 6421+			min_right = p;
 6422+		}
 6423+		if (p > max_right) {
 6424+			max_right = p;
 6425+		}
 6426+	}
 6427+
 6428+	// merge the left margin pixel region if it connects within 4 pixels of main
 6429+	// pixel region
 6430+	if (min_left != 0x7fffffff) {
 6431+		if (((min_left <= min_n) &&
 6432+		     ((max_left + STBIR__MERGE_RUNS_PIXEL_THRESHOLD) >= min_n)) ||
 6433+		    ((min_n <= min_left) &&
 6434+		     ((max_n + STBIR__MERGE_RUNS_PIXEL_THRESHOLD) >= max_left))) {
 6435+			scanline_extents->spans[0].n0 = min_n = stbir__min(min_n, min_left);
 6436+			scanline_extents->spans[0].n1 = max_n = stbir__max(max_n, max_left);
 6437+			scanline_extents->spans[0].pixel_offset_for_input = min_n;
 6438+			left_margin = 0;
 6439+		}
 6440+	}
 6441+
 6442+	// merge the right margin pixel region if it connects within 4 pixels of
 6443+	// main pixel region
 6444+	if (min_right != 0x7fffffff) {
 6445+		if (((min_right <= min_n) &&
 6446+		     ((max_right + STBIR__MERGE_RUNS_PIXEL_THRESHOLD) >= min_n)) ||
 6447+		    ((min_n <= min_right) &&
 6448+		     ((max_n + STBIR__MERGE_RUNS_PIXEL_THRESHOLD) >= max_right))) {
 6449+			scanline_extents->spans[0].n0 = min_n =
 6450+			    stbir__min(min_n, min_right);
 6451+			scanline_extents->spans[0].n1 = max_n =
 6452+			    stbir__max(max_n, max_right);
 6453+			scanline_extents->spans[0].pixel_offset_for_input = min_n;
 6454+			right_margin = 0;
 6455+		}
 6456+	}
 6457+
 6458+	STBIR_ASSERT(scanline_extents->conservative.n0 <= min_n);
 6459+	STBIR_ASSERT(scanline_extents->conservative.n1 >= max_n);
 6460+
 6461+	// you get two ranges when you have the WRAP edge mode and you are doing
 6462+	// just the a piece of the resize
 6463+	//   so you need to get a second run of pixels from the opposite side of the
 6464+	//   scanline (which you wouldn't need except for WRAP)
 6465+
 6466+	// if we can't merge the min_left range, add it as a second range
 6467+	if ((left_margin) && (min_left != 0x7fffffff)) {
 6468+		stbir__span *newspan = scanline_extents->spans + 1;
 6469+		STBIR_ASSERT(right_margin == 0);
 6470+		if (min_left < scanline_extents->spans[0].n0) {
 6471+			scanline_extents->spans[1].pixel_offset_for_input =
 6472+			    scanline_extents->spans[0].n0;
 6473+			scanline_extents->spans[1].n0 = scanline_extents->spans[0].n0;
 6474+			scanline_extents->spans[1].n1 = scanline_extents->spans[0].n1;
 6475+			--newspan;
 6476+		}
 6477+		newspan->pixel_offset_for_input = min_left;
 6478+		newspan->n0 = -left_margin;
 6479+		newspan->n1 = (max_left - min_left) - left_margin;
 6480+		scanline_extents->edge_sizes[0] =
 6481+		    0; // don't need to copy the left margin, since we are directly
 6482+		       // decoding into the margin
 6483+	}
 6484+	// if we can't merge the min_left range, add it as a second range
 6485+	else if ((right_margin) && (min_right != 0x7fffffff)) {
 6486+		stbir__span *newspan = scanline_extents->spans + 1;
 6487+		if (min_right < scanline_extents->spans[0].n0) {
 6488+			scanline_extents->spans[1].pixel_offset_for_input =
 6489+			    scanline_extents->spans[0].n0;
 6490+			scanline_extents->spans[1].n0 = scanline_extents->spans[0].n0;
 6491+			scanline_extents->spans[1].n1 = scanline_extents->spans[0].n1;
 6492+			--newspan;
 6493+		}
 6494+		newspan->pixel_offset_for_input = min_right;
 6495+		newspan->n0 = scanline_extents->spans[1].n1 + 1;
 6496+		newspan->n1 =
 6497+		    scanline_extents->spans[1].n1 + 1 + (max_right - min_right);
 6498+		scanline_extents->edge_sizes[1] =
 6499+		    0; // don't need to copy the right margin, since we are directly
 6500+		       // decoding into the margin
 6501+	}
 6502+
 6503+	// sort the spans into write output order
 6504+	if ((scanline_extents->spans[1].n1 > scanline_extents->spans[1].n0) &&
 6505+	    (scanline_extents->spans[0].n0 > scanline_extents->spans[1].n0)) {
 6506+		stbir__span tspan = scanline_extents->spans[0];
 6507+		scanline_extents->spans[0] = scanline_extents->spans[1];
 6508+		scanline_extents->spans[1] = tspan;
 6509+	}
 6510+}
 6511+
 6512+static void
 6513+stbir__calculate_in_pixel_range(int *first_pixel, int *last_pixel,
 6514+                                float out_pixel_center, float out_filter_radius,
 6515+                                float inv_scale, float out_shift,
 6516+                                int input_size, stbir_edge edge)
 6517+{
 6518+	int first, last;
 6519+	float out_pixel_influence_lowerbound = out_pixel_center - out_filter_radius;
 6520+	float out_pixel_influence_upperbound = out_pixel_center + out_filter_radius;
 6521+
 6522+	float in_pixel_influence_lowerbound =
 6523+	    (out_pixel_influence_lowerbound + out_shift) * inv_scale;
 6524+	float in_pixel_influence_upperbound =
 6525+	    (out_pixel_influence_upperbound + out_shift) * inv_scale;
 6526+
 6527+	first = (int)(STBIR_FLOORF(in_pixel_influence_lowerbound + 0.5f));
 6528+	last = (int)(STBIR_FLOORF(in_pixel_influence_upperbound - 0.5f));
 6529+	if (last < first) {
 6530+		last = first; // point sample mode can span a value *right* at 0.5, and
 6531+		              // cause these to cross
 6532+	}
 6533+
 6534+	if (edge == STBIR_EDGE_WRAP) {
 6535+		if (first < -input_size) {
 6536+			first = -input_size;
 6537+		}
 6538+		if (last >= (input_size * 2)) {
 6539+			last = (input_size * 2) - 1;
 6540+		}
 6541+	}
 6542+
 6543+	*first_pixel = first;
 6544+	*last_pixel = last;
 6545+}
 6546+
 6547+static void
 6548+stbir__calculate_coefficients_for_gather_upsample(
 6549+    float out_filter_radius, stbir__kernel_callback *kernel,
 6550+    stbir__scale_info *scale_info, int num_contributors,
 6551+    stbir__contributors *contributors, float *coefficient_group,
 6552+    int coefficient_width, stbir_edge edge, void *user_data)
 6553+{
 6554+	int n, end;
 6555+	float inv_scale = scale_info->inv_scale;
 6556+	float out_shift = scale_info->pixel_shift;
 6557+	int input_size = scale_info->input_full_size;
 6558+	int numerator = scale_info->scale_numerator;
 6559+	int polyphase =
 6560+	    ((scale_info->scale_is_rational) && (numerator < num_contributors));
 6561+
 6562+	// Looping through out pixels
 6563+	end = num_contributors;
 6564+	if (polyphase) {
 6565+		end = numerator;
 6566+	}
 6567+	for (n = 0; n < end; n++) {
 6568+		int i;
 6569+		int last_non_zero;
 6570+		float out_pixel_center = (float)n + 0.5f;
 6571+		float in_center_of_out = (out_pixel_center + out_shift) * inv_scale;
 6572+
 6573+		int in_first_pixel, in_last_pixel;
 6574+
 6575+		stbir__calculate_in_pixel_range(&in_first_pixel, &in_last_pixel,
 6576+		                                out_pixel_center, out_filter_radius,
 6577+		                                inv_scale, out_shift, input_size, edge);
 6578+
 6579+		// make sure we never generate a range larger than our precalculated
 6580+		// coeff width
 6581+		//   this only happens in point sample mode, but it's a good safe thing
 6582+		//   to do anyway
 6583+		if ((in_last_pixel - in_first_pixel + 1) > coefficient_width) {
 6584+			in_last_pixel = in_first_pixel + coefficient_width - 1;
 6585+		}
 6586+
 6587+		last_non_zero = -1;
 6588+		for (i = 0; i <= in_last_pixel - in_first_pixel; i++) {
 6589+			float in_pixel_center = (float)(i + in_first_pixel) + 0.5f;
 6590+			float coeff = kernel(in_center_of_out - in_pixel_center, inv_scale,
 6591+			                     user_data);
 6592+
 6593+			// kill denormals
 6594+			if (((coeff < stbir__small_float) &&
 6595+			     (coeff > -stbir__small_float))) {
 6596+				if (i == 0) // if we're at the front, just eat zero contributors
 6597+				{
 6598+					STBIR_ASSERT((in_last_pixel - in_first_pixel) !=
 6599+					             0); // there should be at least one contrib
 6600+					++in_first_pixel;
 6601+					i--;
 6602+					continue;
 6603+				}
 6604+				coeff =
 6605+				    0; // make sure is fully zero (should keep denormals away)
 6606+			} else {
 6607+				last_non_zero = i;
 6608+			}
 6609+
 6610+			coefficient_group[i] = coeff;
 6611+		}
 6612+
 6613+		in_last_pixel = last_non_zero + in_first_pixel; // kills trailing zeros
 6614+		contributors->n0 = in_first_pixel;
 6615+		contributors->n1 = in_last_pixel;
 6616+
 6617+		STBIR_ASSERT(contributors->n1 >= contributors->n0);
 6618+
 6619+		++contributors;
 6620+		coefficient_group += coefficient_width;
 6621+	}
 6622+}
 6623+
 6624+static void
 6625+stbir__insert_coeff(stbir__contributors *contribs, float *coeffs, int new_pixel,
 6626+                    float new_coeff, int max_width)
 6627+{
 6628+	if (new_pixel <= contribs->n1) // before the end
 6629+	{
 6630+		if (new_pixel < contribs->n0) // before the front?
 6631+		{
 6632+			if ((contribs->n1 - new_pixel + 1) <= max_width) {
 6633+				int j, o = contribs->n0 - new_pixel;
 6634+				for (j = contribs->n1 - contribs->n0; j <= 0; j--) {
 6635+					coeffs[j + o] = coeffs[j];
 6636+				}
 6637+				for (j = 1; j < o; j--) {
 6638+					coeffs[j] = coeffs[0];
 6639+				}
 6640+				coeffs[0] = new_coeff;
 6641+				contribs->n0 = new_pixel;
 6642+			}
 6643+		} else {
 6644+			coeffs[new_pixel - contribs->n0] += new_coeff;
 6645+		}
 6646+	} else {
 6647+		if ((new_pixel - contribs->n0 + 1) <= max_width) {
 6648+			int j, e = new_pixel - contribs->n0;
 6649+			for (j = (contribs->n1 - contribs->n0) + 1; j < e;
 6650+			     j++) { // clear in-betweens coeffs if there are any
 6651+				coeffs[j] = 0;
 6652+			}
 6653+
 6654+			coeffs[e] = new_coeff;
 6655+			contribs->n1 = new_pixel;
 6656+		}
 6657+	}
 6658+}
 6659+
 6660+static void
 6661+stbir__calculate_out_pixel_range(int *first_pixel, int *last_pixel,
 6662+                                 float in_pixel_center, float in_pixels_radius,
 6663+                                 float scale, float out_shift, int out_size)
 6664+{
 6665+	float in_pixel_influence_lowerbound = in_pixel_center - in_pixels_radius;
 6666+	float in_pixel_influence_upperbound = in_pixel_center + in_pixels_radius;
 6667+	float out_pixel_influence_lowerbound =
 6668+	    in_pixel_influence_lowerbound * scale - out_shift;
 6669+	float out_pixel_influence_upperbound =
 6670+	    in_pixel_influence_upperbound * scale - out_shift;
 6671+	int out_first_pixel =
 6672+	    (int)(STBIR_FLOORF(out_pixel_influence_lowerbound + 0.5f));
 6673+	int out_last_pixel =
 6674+	    (int)(STBIR_FLOORF(out_pixel_influence_upperbound - 0.5f));
 6675+
 6676+	if (out_first_pixel < 0) {
 6677+		out_first_pixel = 0;
 6678+	}
 6679+	if (out_last_pixel >= out_size) {
 6680+		out_last_pixel = out_size - 1;
 6681+	}
 6682+	*first_pixel = out_first_pixel;
 6683+	*last_pixel = out_last_pixel;
 6684+}
 6685+
 6686+static void
 6687+stbir__calculate_coefficients_for_gather_downsample(
 6688+    int start, int end, float in_pixels_radius, stbir__kernel_callback *kernel,
 6689+    stbir__scale_info *scale_info, int coefficient_width, int num_contributors,
 6690+    stbir__contributors *contributors, float *coefficient_group,
 6691+    void *user_data)
 6692+{
 6693+	int in_pixel;
 6694+	int i;
 6695+	int first_out_inited = -1;
 6696+	float scale = scale_info->scale;
 6697+	float out_shift = scale_info->pixel_shift;
 6698+	int out_size = scale_info->output_sub_size;
 6699+	int numerator = scale_info->scale_numerator;
 6700+	int polyphase = ((scale_info->scale_is_rational) && (numerator < out_size));
 6701+
 6702+	STBIR__UNUSED(num_contributors);
 6703+
 6704+	// Loop through the input pixels
 6705+	for (in_pixel = start; in_pixel < end; in_pixel++) {
 6706+		float in_pixel_center = (float)in_pixel + 0.5f;
 6707+		float out_center_of_in = in_pixel_center * scale - out_shift;
 6708+		int out_first_pixel, out_last_pixel;
 6709+
 6710+		stbir__calculate_out_pixel_range(&out_first_pixel, &out_last_pixel,
 6711+		                                 in_pixel_center, in_pixels_radius,
 6712+		                                 scale, out_shift, out_size);
 6713+
 6714+		if (out_first_pixel > out_last_pixel) {
 6715+			continue;
 6716+		}
 6717+
 6718+		// clamp or exit if we are using polyphase filtering, and the limit is
 6719+		// up
 6720+		if (polyphase) {
 6721+			// when polyphase, you only have to do coeffs up to the numerator
 6722+			// count
 6723+			if (out_first_pixel == numerator) {
 6724+				break;
 6725+			}
 6726+
 6727+			// don't do any extra work, clamp last pixel at numerator too
 6728+			if (out_last_pixel >= numerator) {
 6729+				out_last_pixel = numerator - 1;
 6730+			}
 6731+		}
 6732+
 6733+		for (i = 0; i <= out_last_pixel - out_first_pixel; i++) {
 6734+			float out_pixel_center = (float)(i + out_first_pixel) + 0.5f;
 6735+			float x = out_pixel_center - out_center_of_in;
 6736+			float coeff = kernel(x, scale, user_data) * scale;
 6737+
 6738+			// kill the coeff if it's too small (avoid denormals)
 6739+			if (((coeff < stbir__small_float) &&
 6740+			     (coeff > -stbir__small_float))) {
 6741+				coeff = 0.0f;
 6742+			}
 6743+
 6744+			{
 6745+				int out = i + out_first_pixel;
 6746+				float *coeffs = coefficient_group + out * coefficient_width;
 6747+				stbir__contributors *contribs = contributors + out;
 6748+
 6749+				// is this the first time this output pixel has been seen?  Init
 6750+				// it.
 6751+				if (out > first_out_inited) {
 6752+					STBIR_ASSERT(
 6753+					    out == (first_out_inited +
 6754+					            1)); // ensure we have only advanced one at time
 6755+					first_out_inited = out;
 6756+					contribs->n0 = in_pixel;
 6757+					contribs->n1 = in_pixel;
 6758+					coeffs[0] = coeff;
 6759+				} else {
 6760+					// insert on end (always in order)
 6761+					if (coeffs[0] == 0.0f) // if the first coefficent is zero,
 6762+					                       // then zap it for this coeffs
 6763+					{
 6764+						STBIR_ASSERT(
 6765+						    (in_pixel - contribs->n0) ==
 6766+						    1); // ensure that when we zap, we're at the 2nd pos
 6767+						contribs->n0 = in_pixel;
 6768+					}
 6769+					contribs->n1 = in_pixel;
 6770+					STBIR_ASSERT((in_pixel - contribs->n0) < coefficient_width);
 6771+					coeffs[in_pixel - contribs->n0] = coeff;
 6772+				}
 6773+			}
 6774+		}
 6775+	}
 6776 }
 6777 
 6778 #ifdef STBIR_RENORMALIZE_IN_FLOAT
 6779@@ -3492,555 +4231,647 @@ static void stbir__calculate_coefficients_for_gather_downsample( int start, int
 6780 #define STBIR_RENORM_TYPE double
 6781 #endif
 6782 
 6783-static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter_extent_info* filter_info, stbir__scale_info * scale_info, int num_contributors, stbir__contributors* contributors, float * coefficient_group, int coefficient_width )
 6784-{
 6785-  int input_size = scale_info->input_full_size;
 6786-  int input_last_n1 = input_size - 1;
 6787-  int n, end;
 6788-  int lowest = 0x7fffffff;
 6789-  int highest = -0x7fffffff;
 6790-  int widest = -1;
 6791-  int numerator = scale_info->scale_numerator;
 6792-  int denominator = scale_info->scale_denominator;
 6793-  int polyphase = ( ( scale_info->scale_is_rational ) && ( numerator < num_contributors ) );
 6794-  float * coeffs;
 6795-  stbir__contributors * contribs;
 6796-
 6797-  // weight all the coeffs for each sample
 6798-  coeffs = coefficient_group;
 6799-  contribs = contributors;
 6800-  end = num_contributors; if ( polyphase ) end = numerator;
 6801-  for (n = 0; n < end; n++)
 6802-  {
 6803-    int i;
 6804-    STBIR_RENORM_TYPE filter_scale, total_filter = 0;
 6805-    int e;
 6806-
 6807-    // add all contribs
 6808-    e = contribs->n1 - contribs->n0;
 6809-    for( i = 0 ; i <= e ; i++ )
 6810-    {
 6811-      total_filter += (STBIR_RENORM_TYPE) coeffs[i];
 6812-      STBIR_ASSERT( ( coeffs[i] >= -2.0f ) && ( coeffs[i] <= 2.0f )  ); // check for wonky weights
 6813-    }
 6814-
 6815-    // rescale
 6816-    if ( ( total_filter < stbir__small_float ) && ( total_filter > -stbir__small_float ) )
 6817-    {
 6818-      // all coeffs are extremely small, just zero it
 6819-      contribs->n1 = contribs->n0;
 6820-      coeffs[0] = 0.0f;
 6821-    }
 6822-    else
 6823-    {
 6824-      // if the total isn't 1.0, rescale everything
 6825-      if ( ( total_filter < (1.0f-stbir__small_float) ) || ( total_filter > (1.0f+stbir__small_float) ) )
 6826-      {
 6827-        filter_scale = ((STBIR_RENORM_TYPE)1.0) / total_filter;
 6828-
 6829-        // scale them all
 6830-        for (i = 0; i <= e; i++)
 6831-          coeffs[i] = (float) ( coeffs[i] * filter_scale );
 6832-      }
 6833-    }
 6834-    ++contribs;
 6835-    coeffs += coefficient_width;
 6836-  }
 6837-
 6838-  // if we have a rational for the scale, we can exploit the polyphaseness to not calculate
 6839-  //   most of the coefficients, so we copy them here
 6840-  if ( polyphase )
 6841-  {
 6842-    stbir__contributors * prev_contribs = contributors;
 6843-    stbir__contributors * cur_contribs = contributors + numerator;
 6844-
 6845-    for( n = numerator ; n < num_contributors ; n++ )
 6846-    {
 6847-      cur_contribs->n0 = prev_contribs->n0 + denominator;
 6848-      cur_contribs->n1 = prev_contribs->n1 + denominator;
 6849-      ++cur_contribs;
 6850-      ++prev_contribs;
 6851-    }
 6852-    stbir_overlapping_memcpy( coefficient_group + numerator * coefficient_width, coefficient_group, ( num_contributors - numerator ) * coefficient_width * sizeof( coeffs[ 0 ] ) );
 6853-  }
 6854-
 6855-  coeffs = coefficient_group;
 6856-  contribs = contributors;
 6857-
 6858-  for (n = 0; n < num_contributors; n++)
 6859-  {
 6860-    int i;
 6861-
 6862-    // in zero edge mode, just remove out of bounds contribs completely (since their weights are accounted for now)
 6863-    if ( edge == STBIR_EDGE_ZERO )
 6864-    {
 6865-      // shrink the right side if necessary
 6866-      if ( contribs->n1 > input_last_n1 )
 6867-        contribs->n1 = input_last_n1;
 6868-
 6869-      // shrink the left side
 6870-      if ( contribs->n0 < 0 )
 6871-      {
 6872-        int j, left, skips = 0;
 6873-
 6874-        skips = -contribs->n0;
 6875-        contribs->n0 = 0;
 6876-
 6877-        // now move down the weights
 6878-        left = contribs->n1 - contribs->n0 + 1;
 6879-        if ( left > 0 )
 6880-        {
 6881-          for( j = 0 ; j < left ; j++ )
 6882-            coeffs[ j ] = coeffs[ j + skips ];
 6883-        }
 6884-      }
 6885-    }
 6886-    else if ( ( edge == STBIR_EDGE_CLAMP ) || ( edge == STBIR_EDGE_REFLECT ) )
 6887-    {
 6888-      // for clamp and reflect, calculate the true inbounds position (based on edge type) and just add that to the existing weight
 6889-
 6890-      // right hand side first
 6891-      if ( contribs->n1 > input_last_n1 )
 6892-      {
 6893-        int start = contribs->n0;
 6894-        int endi = contribs->n1;
 6895-        contribs->n1 = input_last_n1;
 6896-        for( i = input_size; i <= endi; i++ )
 6897-          stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( i, input_size ), coeffs[i-start], coefficient_width );
 6898-      }
 6899-
 6900-      // now check left hand edge
 6901-      if ( contribs->n0 < 0 )
 6902-      {
 6903-        int save_n0;
 6904-        float save_n0_coeff;
 6905-        float * c = coeffs - ( contribs->n0 + 1 );
 6906-
 6907-        // reinsert the coeffs with it reflected or clamped (insert accumulates, if the coeffs exist)
 6908-        for( i = -1 ; i > contribs->n0 ; i-- )
 6909-          stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( i, input_size ), *c--, coefficient_width );
 6910-        save_n0 = contribs->n0;
 6911-        save_n0_coeff = c[0]; // save it, since we didn't do the final one (i==n0), because there might be too many coeffs to hold (before we resize)!
 6912-
 6913-        // now slide all the coeffs down (since we have accumulated them in the positive contribs) and reset the first contrib
 6914-        contribs->n0 = 0;
 6915-        for(i = 0 ; i <= contribs->n1 ; i++ )
 6916-          coeffs[i] = coeffs[i-save_n0];
 6917-
 6918-        // now that we have shrunk down the contribs, we insert the first one safely
 6919-        stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( save_n0, input_size ), save_n0_coeff, coefficient_width );
 6920-      }
 6921-    }
 6922-
 6923-    if ( contribs->n0 <= contribs->n1 )
 6924-    {
 6925-      int diff = contribs->n1 - contribs->n0 + 1;
 6926-      while ( diff && ( coeffs[ diff-1 ] == 0.0f ) )
 6927-        --diff;
 6928-
 6929-      contribs->n1 = contribs->n0 + diff - 1;
 6930-
 6931-      if ( contribs->n0 <= contribs->n1 )
 6932-      {
 6933-        if ( contribs->n0 < lowest )
 6934-          lowest = contribs->n0;
 6935-        if ( contribs->n1 > highest )
 6936-          highest = contribs->n1;
 6937-        if ( diff > widest )
 6938-          widest = diff;
 6939-      }
 6940-
 6941-      // re-zero out unused coefficients (if any)
 6942-      for( i = diff ; i < coefficient_width ; i++ )
 6943-        coeffs[i] = 0.0f;
 6944-    }
 6945-
 6946-    ++contribs;
 6947-    coeffs += coefficient_width;
 6948-  }
 6949-  filter_info->lowest = lowest;
 6950-  filter_info->highest = highest;
 6951-  filter_info->widest = widest;
 6952-}
 6953-
 6954-#undef STBIR_RENORM_TYPE 
 6955-
 6956-static int stbir__pack_coefficients( int num_contributors, stbir__contributors* contributors, float * coefficents, int coefficient_width, int widest, int row0, int row1 ) 
 6957-{
 6958-  #define STBIR_MOVE_1( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint32*)(dest))[0] = ((stbir_uint32*)(src))[0]; }
 6959-  #define STBIR_MOVE_2( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint64*)(dest))[0] = ((stbir_uint64*)(src))[0]; }
 6960-  #ifdef STBIR_SIMD
 6961-  #define STBIR_MOVE_4( dest, src ) { stbir__simdf t; STBIR_NO_UNROLL(dest); stbir__simdf_load( t, src ); stbir__simdf_store( dest, t ); }
 6962-  #else
 6963-  #define STBIR_MOVE_4( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint64*)(dest))[0] = ((stbir_uint64*)(src))[0]; ((stbir_uint64*)(dest))[1] = ((stbir_uint64*)(src))[1]; }
 6964-  #endif
 6965-
 6966-  int row_end = row1 + 1;
 6967-  STBIR__UNUSED( row0 ); // only used in an assert
 6968-
 6969-  if ( coefficient_width != widest )
 6970-  {
 6971-    float * pc = coefficents;
 6972-    float * coeffs = coefficents;
 6973-    float * pc_end = coefficents + num_contributors * widest;
 6974-    switch( widest )
 6975-    {
 6976-      case 1:
 6977-        STBIR_NO_UNROLL_LOOP_START
 6978-        do {
 6979-          STBIR_MOVE_1( pc, coeffs );
 6980-          ++pc;
 6981-          coeffs += coefficient_width;
 6982-        } while ( pc < pc_end );
 6983-        break;
 6984-      case 2:
 6985-        STBIR_NO_UNROLL_LOOP_START
 6986-        do {
 6987-          STBIR_MOVE_2( pc, coeffs );
 6988-          pc += 2;
 6989-          coeffs += coefficient_width;
 6990-        } while ( pc < pc_end );
 6991-        break;
 6992-      case 3:
 6993-        STBIR_NO_UNROLL_LOOP_START
 6994-        do {
 6995-          STBIR_MOVE_2( pc, coeffs );
 6996-          STBIR_MOVE_1( pc+2, coeffs+2 );
 6997-          pc += 3;
 6998-          coeffs += coefficient_width;
 6999-        } while ( pc < pc_end );
 7000-        break;
 7001-      case 4:
 7002-        STBIR_NO_UNROLL_LOOP_START
 7003-        do {
 7004-          STBIR_MOVE_4( pc, coeffs );
 7005-          pc += 4;
 7006-          coeffs += coefficient_width;
 7007-        } while ( pc < pc_end );
 7008-        break;
 7009-      case 5:
 7010-        STBIR_NO_UNROLL_LOOP_START
 7011-        do {
 7012-          STBIR_MOVE_4( pc, coeffs );
 7013-          STBIR_MOVE_1( pc+4, coeffs+4 );
 7014-          pc += 5;
 7015-          coeffs += coefficient_width;
 7016-        } while ( pc < pc_end );
 7017-        break;
 7018-      case 6:
 7019-        STBIR_NO_UNROLL_LOOP_START
 7020-        do {
 7021-          STBIR_MOVE_4( pc, coeffs );
 7022-          STBIR_MOVE_2( pc+4, coeffs+4 );
 7023-          pc += 6;
 7024-          coeffs += coefficient_width;
 7025-        } while ( pc < pc_end );
 7026-        break;
 7027-      case 7:
 7028-        STBIR_NO_UNROLL_LOOP_START
 7029-        do {
 7030-          STBIR_MOVE_4( pc, coeffs );
 7031-          STBIR_MOVE_2( pc+4, coeffs+4 );
 7032-          STBIR_MOVE_1( pc+6, coeffs+6 );
 7033-          pc += 7;
 7034-          coeffs += coefficient_width;
 7035-        } while ( pc < pc_end );
 7036-        break;
 7037-      case 8:
 7038-        STBIR_NO_UNROLL_LOOP_START
 7039-        do {
 7040-          STBIR_MOVE_4( pc, coeffs );
 7041-          STBIR_MOVE_4( pc+4, coeffs+4 );
 7042-          pc += 8;
 7043-          coeffs += coefficient_width;
 7044-        } while ( pc < pc_end );
 7045-        break;
 7046-      case 9:
 7047-        STBIR_NO_UNROLL_LOOP_START
 7048-        do {
 7049-          STBIR_MOVE_4( pc, coeffs );
 7050-          STBIR_MOVE_4( pc+4, coeffs+4 );
 7051-          STBIR_MOVE_1( pc+8, coeffs+8 );
 7052-          pc += 9;
 7053-          coeffs += coefficient_width;
 7054-        } while ( pc < pc_end );
 7055-        break;
 7056-      case 10:
 7057-        STBIR_NO_UNROLL_LOOP_START
 7058-        do {
 7059-          STBIR_MOVE_4( pc, coeffs );
 7060-          STBIR_MOVE_4( pc+4, coeffs+4 );
 7061-          STBIR_MOVE_2( pc+8, coeffs+8 );
 7062-          pc += 10;
 7063-          coeffs += coefficient_width;
 7064-        } while ( pc < pc_end );
 7065-        break;
 7066-      case 11:
 7067-        STBIR_NO_UNROLL_LOOP_START
 7068-        do {
 7069-          STBIR_MOVE_4( pc, coeffs );
 7070-          STBIR_MOVE_4( pc+4, coeffs+4 );
 7071-          STBIR_MOVE_2( pc+8, coeffs+8 );
 7072-          STBIR_MOVE_1( pc+10, coeffs+10 );
 7073-          pc += 11;
 7074-          coeffs += coefficient_width;
 7075-        } while ( pc < pc_end );
 7076-        break;
 7077-      case 12:
 7078-        STBIR_NO_UNROLL_LOOP_START
 7079-        do {
 7080-          STBIR_MOVE_4( pc, coeffs );
 7081-          STBIR_MOVE_4( pc+4, coeffs+4 );
 7082-          STBIR_MOVE_4( pc+8, coeffs+8 );
 7083-          pc += 12;
 7084-          coeffs += coefficient_width;
 7085-        } while ( pc < pc_end );
 7086-        break;
 7087-      default:
 7088-        STBIR_NO_UNROLL_LOOP_START
 7089-        do {
 7090-          float * copy_end = pc + widest - 4;
 7091-          float * c = coeffs;
 7092-          do {
 7093-            STBIR_NO_UNROLL( pc );
 7094-            STBIR_MOVE_4( pc, c );
 7095-            pc += 4;
 7096-            c += 4;
 7097-          } while ( pc <= copy_end );
 7098-          copy_end += 4;
 7099-          STBIR_NO_UNROLL_LOOP_START
 7100-          while ( pc < copy_end )
 7101-          {
 7102-            STBIR_MOVE_1( pc, c );
 7103-            ++pc; ++c;
 7104-          }
 7105-          coeffs += coefficient_width;
 7106-        } while ( pc < pc_end );
 7107-        break;
 7108-    }
 7109-  }
 7110-
 7111-  // some horizontal routines read one float off the end (which is then masked off), so put in a sentinal so we don't read an snan or denormal
 7112-  coefficents[ widest * num_contributors ] = 8888.0f;
 7113-
 7114-  // the minimum we might read for unrolled filters widths is 12. So, we need to
 7115-  //   make sure we never read outside the decode buffer, by possibly moving
 7116-  //   the sample area back into the scanline, and putting zeros weights first.
 7117-  // we start on the right edge and check until we're well past the possible
 7118-  //   clip area (2*widest).
 7119-  {
 7120-    stbir__contributors * contribs = contributors + num_contributors - 1;
 7121-    float * coeffs = coefficents + widest * ( num_contributors - 1 );
 7122-
 7123-    // go until no chance of clipping (this is usually less than 8 lops)
 7124-    while ( ( contribs >= contributors ) && ( ( contribs->n0 + widest*2 ) >= row_end ) )
 7125-    {
 7126-      // might we clip??
 7127-      if ( ( contribs->n0 + widest ) > row_end )
 7128-      {
 7129-        int stop_range = widest;
 7130-
 7131-        // if range is larger than 12, it will be handled by generic loops that can terminate on the exact length
 7132-        //   of this contrib n1, instead of a fixed widest amount - so calculate this
 7133-        if ( widest > 12 )
 7134-        {
 7135-          int mod;
 7136-
 7137-          // how far will be read in the n_coeff loop (which depends on the widest count mod4);
 7138-          mod = widest & 3;
 7139-          stop_range = ( ( ( contribs->n1 - contribs->n0 + 1 ) - mod + 3 ) & ~3 ) + mod;
 7140-
 7141-          // the n_coeff loops do a minimum amount of coeffs, so factor that in!
 7142-          if ( stop_range < ( 8 + mod ) ) stop_range = 8 + mod;
 7143-        }
 7144-
 7145-        // now see if we still clip with the refined range
 7146-        if ( ( contribs->n0 + stop_range ) > row_end )
 7147-        {
 7148-          int new_n0 = row_end - stop_range;
 7149-          int num = contribs->n1 - contribs->n0 + 1;
 7150-          int backup = contribs->n0 - new_n0;
 7151-          float * from_co = coeffs + num - 1;
 7152-          float * to_co = from_co + backup;
 7153-
 7154-          STBIR_ASSERT( ( new_n0 >= row0 ) && ( new_n0 < contribs->n0 ) );
 7155-
 7156-          // move the coeffs over
 7157-          while( num )
 7158-          {
 7159-            *to_co-- = *from_co--;
 7160-            --num;
 7161-          }
 7162-          // zero new positions
 7163-          while ( to_co >= coeffs )
 7164-            *to_co-- = 0;
 7165-          // set new start point
 7166-          contribs->n0 = new_n0;
 7167-          if ( widest > 12 )
 7168-          {
 7169-            int mod;
 7170-
 7171-            // how far will be read in the n_coeff loop (which depends on the widest count mod4);
 7172-            mod = widest & 3;
 7173-            stop_range = ( ( ( contribs->n1 - contribs->n0 + 1 ) - mod + 3 ) & ~3 ) + mod;
 7174-
 7175-            // the n_coeff loops do a minimum amount of coeffs, so factor that in!
 7176-            if ( stop_range < ( 8 + mod ) ) stop_range = 8 + mod;
 7177-          }
 7178-        }
 7179-      }
 7180-      --contribs;
 7181-      coeffs -= widest;
 7182-    }
 7183-  }
 7184-
 7185-  return widest;
 7186-  #undef STBIR_MOVE_1
 7187-  #undef STBIR_MOVE_2
 7188-  #undef STBIR_MOVE_4
 7189-}
 7190-
 7191-static void stbir__calculate_filters( stbir__sampler * samp, stbir__sampler * other_axis_for_pivot, void * user_data STBIR_ONLY_PROFILE_BUILD_GET_INFO )
 7192-{
 7193-  int n;
 7194-  float scale = samp->scale_info.scale;
 7195-  stbir__kernel_callback * kernel = samp->filter_kernel;
 7196-  stbir__support_callback * support = samp->filter_support;
 7197-  float inv_scale = samp->scale_info.inv_scale;
 7198-  int input_full_size = samp->scale_info.input_full_size;
 7199-  int gather_num_contributors = samp->num_contributors;
 7200-  stbir__contributors* gather_contributors = samp->contributors;
 7201-  float * gather_coeffs = samp->coefficients;
 7202-  int gather_coefficient_width = samp->coefficient_width;
 7203-
 7204-  switch ( samp->is_gather )
 7205-  {
 7206-    case 1: // gather upsample
 7207-    {
 7208-      float out_pixels_radius = support(inv_scale,user_data) * scale;
 7209-
 7210-      stbir__calculate_coefficients_for_gather_upsample( out_pixels_radius, kernel, &samp->scale_info, gather_num_contributors, gather_contributors, gather_coeffs, gather_coefficient_width, samp->edge, user_data );
 7211-
 7212-      STBIR_PROFILE_BUILD_START( cleanup );
 7213-      stbir__cleanup_gathered_coefficients( samp->edge, &samp->extent_info, &samp->scale_info, gather_num_contributors, gather_contributors, gather_coeffs, gather_coefficient_width );
 7214-      STBIR_PROFILE_BUILD_END( cleanup );
 7215-    }
 7216-    break;
 7217-
 7218-    case 0: // scatter downsample (only on vertical)
 7219-    case 2: // gather downsample
 7220-    {
 7221-      float in_pixels_radius = support(scale,user_data) * inv_scale;
 7222-      int filter_pixel_margin = samp->filter_pixel_margin;
 7223-      int input_end = input_full_size + filter_pixel_margin;
 7224-
 7225-      // if this is a scatter, we do a downsample gather to get the coeffs, and then pivot after
 7226-      if ( !samp->is_gather )
 7227-      {
 7228-        // check if we are using the same gather downsample on the horizontal as this vertical,
 7229-        //   if so, then we don't have to generate them, we can just pivot from the horizontal.
 7230-        if ( other_axis_for_pivot )
 7231-        {
 7232-          gather_contributors = other_axis_for_pivot->contributors;
 7233-          gather_coeffs = other_axis_for_pivot->coefficients;
 7234-          gather_coefficient_width = other_axis_for_pivot->coefficient_width;
 7235-          gather_num_contributors = other_axis_for_pivot->num_contributors;
 7236-          samp->extent_info.lowest = other_axis_for_pivot->extent_info.lowest;
 7237-          samp->extent_info.highest = other_axis_for_pivot->extent_info.highest;
 7238-          samp->extent_info.widest = other_axis_for_pivot->extent_info.widest;
 7239-          goto jump_right_to_pivot;
 7240-        }
 7241-
 7242-        gather_contributors = samp->gather_prescatter_contributors;
 7243-        gather_coeffs = samp->gather_prescatter_coefficients;
 7244-        gather_coefficient_width = samp->gather_prescatter_coefficient_width;
 7245-        gather_num_contributors = samp->gather_prescatter_num_contributors;
 7246-      }
 7247-
 7248-      stbir__calculate_coefficients_for_gather_downsample( -filter_pixel_margin, input_end, in_pixels_radius, kernel, &samp->scale_info, gather_coefficient_width, gather_num_contributors, gather_contributors, gather_coeffs, user_data );
 7249-
 7250-      STBIR_PROFILE_BUILD_START( cleanup );
 7251-      stbir__cleanup_gathered_coefficients( samp->edge, &samp->extent_info, &samp->scale_info, gather_num_contributors, gather_contributors, gather_coeffs, gather_coefficient_width );
 7252-      STBIR_PROFILE_BUILD_END( cleanup );
 7253-
 7254-      if ( !samp->is_gather )
 7255-      {
 7256-        // if this is a scatter (vertical only), then we need to pivot the coeffs
 7257-        stbir__contributors * scatter_contributors;
 7258-        int highest_set;
 7259-
 7260-        jump_right_to_pivot:
 7261-
 7262-        STBIR_PROFILE_BUILD_START( pivot );
 7263-
 7264-        highest_set = (-filter_pixel_margin) - 1;
 7265-        for (n = 0; n < gather_num_contributors; n++)
 7266-        {
 7267-          int k;
 7268-          int gn0 = gather_contributors->n0, gn1 = gather_contributors->n1;
 7269-          int scatter_coefficient_width = samp->coefficient_width;
 7270-          float * scatter_coeffs = samp->coefficients + ( gn0 + filter_pixel_margin ) * scatter_coefficient_width;
 7271-          float * g_coeffs = gather_coeffs;
 7272-          scatter_contributors = samp->contributors + ( gn0 + filter_pixel_margin );
 7273-
 7274-          for (k = gn0 ; k <= gn1 ; k++ )
 7275-          {
 7276-            float gc = *g_coeffs++;
 7277-            
 7278-            // skip zero and denormals - must skip zeros to avoid adding coeffs beyond scatter_coefficient_width
 7279-            //   (which happens when pivoting from horizontal, which might have dummy zeros)
 7280-            if ( ( ( gc >= stbir__small_float ) || ( gc <= -stbir__small_float ) ) )
 7281-            {
 7282-              if ( ( k > highest_set ) || ( scatter_contributors->n0 > scatter_contributors->n1 ) )
 7283-              {
 7284-                {
 7285-                  // if we are skipping over several contributors, we need to clear the skipped ones
 7286-                  stbir__contributors * clear_contributors = samp->contributors + ( highest_set + filter_pixel_margin + 1);
 7287-                  while ( clear_contributors < scatter_contributors )
 7288-                  {
 7289-                    clear_contributors->n0 = 0;
 7290-                    clear_contributors->n1 = -1;
 7291-                    ++clear_contributors;
 7292-                  }
 7293-                }
 7294-                scatter_contributors->n0 = n;
 7295-                scatter_contributors->n1 = n;
 7296-                scatter_coeffs[0]  = gc;
 7297-                highest_set = k;
 7298-              }
 7299-              else
 7300-              {
 7301-                stbir__insert_coeff( scatter_contributors, scatter_coeffs, n, gc, scatter_coefficient_width );
 7302-              }
 7303-              STBIR_ASSERT( ( scatter_contributors->n1 - scatter_contributors->n0 + 1 ) <= scatter_coefficient_width );
 7304-            }
 7305-            ++scatter_contributors;
 7306-            scatter_coeffs += scatter_coefficient_width;
 7307-          }
 7308-
 7309-          ++gather_contributors;
 7310-          gather_coeffs += gather_coefficient_width;
 7311-        }
 7312-
 7313-        // now clear any unset contribs
 7314-        {
 7315-          stbir__contributors * clear_contributors = samp->contributors + ( highest_set + filter_pixel_margin + 1);
 7316-          stbir__contributors * end_contributors = samp->contributors + samp->num_contributors;
 7317-          while ( clear_contributors < end_contributors )
 7318-          {
 7319-            clear_contributors->n0 = 0;
 7320-            clear_contributors->n1 = -1;
 7321-            ++clear_contributors;
 7322-          }
 7323-        }
 7324-
 7325-        STBIR_PROFILE_BUILD_END( pivot );
 7326-      }
 7327-    }
 7328-    break;
 7329-  }
 7330-}
 7331+static void
 7332+stbir__cleanup_gathered_coefficients(stbir_edge edge,
 7333+                                     stbir__filter_extent_info *filter_info,
 7334+                                     stbir__scale_info *scale_info,
 7335+                                     int num_contributors,
 7336+                                     stbir__contributors *contributors,
 7337+                                     float *coefficient_group,
 7338+                                     int coefficient_width)
 7339+{
 7340+	int input_size = scale_info->input_full_size;
 7341+	int input_last_n1 = input_size - 1;
 7342+	int n, end;
 7343+	int lowest = 0x7fffffff;
 7344+	int highest = -0x7fffffff;
 7345+	int widest = -1;
 7346+	int numerator = scale_info->scale_numerator;
 7347+	int denominator = scale_info->scale_denominator;
 7348+	int polyphase =
 7349+	    ((scale_info->scale_is_rational) && (numerator < num_contributors));
 7350+	float *coeffs;
 7351+	stbir__contributors *contribs;
 7352+
 7353+	// weight all the coeffs for each sample
 7354+	coeffs = coefficient_group;
 7355+	contribs = contributors;
 7356+	end = num_contributors;
 7357+	if (polyphase) {
 7358+		end = numerator;
 7359+	}
 7360+	for (n = 0; n < end; n++) {
 7361+		int i;
 7362+		STBIR_RENORM_TYPE filter_scale, total_filter = 0;
 7363+		int e;
 7364+
 7365+		// add all contribs
 7366+		e = contribs->n1 - contribs->n0;
 7367+		for (i = 0; i <= e; i++) {
 7368+			total_filter += (STBIR_RENORM_TYPE)coeffs[i];
 7369+			STBIR_ASSERT((coeffs[i] >= -2.0f) &&
 7370+			             (coeffs[i] <= 2.0f)); // check for wonky weights
 7371+		}
 7372+
 7373+		// rescale
 7374+		if ((total_filter < stbir__small_float) &&
 7375+		    (total_filter > -stbir__small_float)) {
 7376+			// all coeffs are extremely small, just zero it
 7377+			contribs->n1 = contribs->n0;
 7378+			coeffs[0] = 0.0f;
 7379+		} else {
 7380+			// if the total isn't 1.0, rescale everything
 7381+			if ((total_filter < (1.0f - stbir__small_float)) ||
 7382+			    (total_filter > (1.0f + stbir__small_float))) {
 7383+				filter_scale = ((STBIR_RENORM_TYPE)1.0) / total_filter;
 7384+
 7385+				// scale them all
 7386+				for (i = 0; i <= e; i++) {
 7387+					coeffs[i] = (float)(coeffs[i] * filter_scale);
 7388+				}
 7389+			}
 7390+		}
 7391+		++contribs;
 7392+		coeffs += coefficient_width;
 7393+	}
 7394+
 7395+	// if we have a rational for the scale, we can exploit the polyphaseness to
 7396+	// not calculate
 7397+	//   most of the coefficients, so we copy them here
 7398+	if (polyphase) {
 7399+		stbir__contributors *prev_contribs = contributors;
 7400+		stbir__contributors *cur_contribs = contributors + numerator;
 7401+
 7402+		for (n = numerator; n < num_contributors; n++) {
 7403+			cur_contribs->n0 = prev_contribs->n0 + denominator;
 7404+			cur_contribs->n1 = prev_contribs->n1 + denominator;
 7405+			++cur_contribs;
 7406+			++prev_contribs;
 7407+		}
 7408+		stbir_overlapping_memcpy(coefficient_group +
 7409+		                             numerator * coefficient_width,
 7410+		                         coefficient_group,
 7411+		                         (num_contributors - numerator) *
 7412+		                             coefficient_width * sizeof(coeffs[0]));
 7413+	}
 7414+
 7415+	coeffs = coefficient_group;
 7416+	contribs = contributors;
 7417+
 7418+	for (n = 0; n < num_contributors; n++) {
 7419+		int i;
 7420+
 7421+		// in zero edge mode, just remove out of bounds contribs completely
 7422+		// (since their weights are accounted for now)
 7423+		if (edge == STBIR_EDGE_ZERO) {
 7424+			// shrink the right side if necessary
 7425+			if (contribs->n1 > input_last_n1) {
 7426+				contribs->n1 = input_last_n1;
 7427+			}
 7428+
 7429+			// shrink the left side
 7430+			if (contribs->n0 < 0) {
 7431+				int j, left, skips = 0;
 7432+
 7433+				skips = -contribs->n0;
 7434+				contribs->n0 = 0;
 7435+
 7436+				// now move down the weights
 7437+				left = contribs->n1 - contribs->n0 + 1;
 7438+				if (left > 0) {
 7439+					for (j = 0; j < left; j++) {
 7440+						coeffs[j] = coeffs[j + skips];
 7441+					}
 7442+				}
 7443+			}
 7444+		} else if ((edge == STBIR_EDGE_CLAMP) || (edge == STBIR_EDGE_REFLECT)) {
 7445+			// for clamp and reflect, calculate the true inbounds position
 7446+			// (based on edge type) and just add that to the existing weight
 7447+
 7448+			// right hand side first
 7449+			if (contribs->n1 > input_last_n1) {
 7450+				int start = contribs->n0;
 7451+				int endi = contribs->n1;
 7452+				contribs->n1 = input_last_n1;
 7453+				for (i = input_size; i <= endi; i++) {
 7454+					stbir__insert_coeff(
 7455+					    contribs, coeffs,
 7456+					    stbir__edge_wrap_slow[edge](i, input_size),
 7457+					    coeffs[i - start], coefficient_width);
 7458+				}
 7459+			}
 7460+
 7461+			// now check left hand edge
 7462+			if (contribs->n0 < 0) {
 7463+				int save_n0;
 7464+				float save_n0_coeff;
 7465+				float *c = coeffs - (contribs->n0 + 1);
 7466+
 7467+				// reinsert the coeffs with it reflected or clamped (insert
 7468+				// accumulates, if the coeffs exist)
 7469+				for (i = -1; i > contribs->n0; i--) {
 7470+					stbir__insert_coeff(
 7471+					    contribs, coeffs,
 7472+					    stbir__edge_wrap_slow[edge](i, input_size), *c--,
 7473+					    coefficient_width);
 7474+				}
 7475+				save_n0 = contribs->n0;
 7476+				save_n0_coeff = c[0]; // save it, since we didn't do the final
 7477+				                      // one (i==n0), because there might be too
 7478+				                      // many coeffs to hold (before we resize)!
 7479+
 7480+				// now slide all the coeffs down (since we have accumulated them
 7481+				// in the positive contribs) and reset the first contrib
 7482+				contribs->n0 = 0;
 7483+				for (i = 0; i <= contribs->n1; i++) {
 7484+					coeffs[i] = coeffs[i - save_n0];
 7485+				}
 7486+
 7487+				// now that we have shrunk down the contribs, we insert the
 7488+				// first one safely
 7489+				stbir__insert_coeff(
 7490+				    contribs, coeffs,
 7491+				    stbir__edge_wrap_slow[edge](save_n0, input_size),
 7492+				    save_n0_coeff, coefficient_width);
 7493+			}
 7494+		}
 7495+
 7496+		if (contribs->n0 <= contribs->n1) {
 7497+			int diff = contribs->n1 - contribs->n0 + 1;
 7498+			while (diff && (coeffs[diff - 1] == 0.0f)) {
 7499+				--diff;
 7500+			}
 7501+
 7502+			contribs->n1 = contribs->n0 + diff - 1;
 7503+
 7504+			if (contribs->n0 <= contribs->n1) {
 7505+				if (contribs->n0 < lowest) {
 7506+					lowest = contribs->n0;
 7507+				}
 7508+				if (contribs->n1 > highest) {
 7509+					highest = contribs->n1;
 7510+				}
 7511+				if (diff > widest) {
 7512+					widest = diff;
 7513+				}
 7514+			}
 7515+
 7516+			// re-zero out unused coefficients (if any)
 7517+			for (i = diff; i < coefficient_width; i++) {
 7518+				coeffs[i] = 0.0f;
 7519+			}
 7520+		}
 7521+
 7522+		++contribs;
 7523+		coeffs += coefficient_width;
 7524+	}
 7525+	filter_info->lowest = lowest;
 7526+	filter_info->highest = highest;
 7527+	filter_info->widest = widest;
 7528+}
 7529+
 7530+#undef STBIR_RENORM_TYPE
 7531+
 7532+static int
 7533+stbir__pack_coefficients(int num_contributors,
 7534+                         stbir__contributors *contributors, float *coefficents,
 7535+                         int coefficient_width, int widest, int row0, int row1)
 7536+{
 7537+#define STBIR_MOVE_1(dest, src)                                                \
 7538+	{                                                                          \
 7539+		STBIR_NO_UNROLL(dest);                                                 \
 7540+		((stbir_uint32 *)(dest))[0] = ((stbir_uint32 *)(src))[0];              \
 7541+	}
 7542+#define STBIR_MOVE_2(dest, src)                                                \
 7543+	{                                                                          \
 7544+		STBIR_NO_UNROLL(dest);                                                 \
 7545+		((stbir_uint64 *)(dest))[0] = ((stbir_uint64 *)(src))[0];              \
 7546+	}
 7547+#ifdef STBIR_SIMD
 7548+#define STBIR_MOVE_4(dest, src)                                                \
 7549+	{                                                                          \
 7550+		stbir__simdf t;                                                        \
 7551+		STBIR_NO_UNROLL(dest);                                                 \
 7552+		stbir__simdf_load(t, src);                                             \
 7553+		stbir__simdf_store(dest, t);                                           \
 7554+	}
 7555+#else
 7556+#define STBIR_MOVE_4(dest, src)                                                \
 7557+	{                                                                          \
 7558+		STBIR_NO_UNROLL(dest);                                                 \
 7559+		((stbir_uint64 *)(dest))[0] = ((stbir_uint64 *)(src))[0];              \
 7560+		((stbir_uint64 *)(dest))[1] = ((stbir_uint64 *)(src))[1];              \
 7561+	}
 7562+#endif
 7563 
 7564+	int row_end = row1 + 1;
 7565+	STBIR__UNUSED(row0); // only used in an assert
 7566+
 7567+	if (coefficient_width != widest) {
 7568+		float *pc = coefficents;
 7569+		float *coeffs = coefficents;
 7570+		float *pc_end = coefficents + num_contributors * widest;
 7571+		switch (widest) {
 7572+		case 1:
 7573+			STBIR_NO_UNROLL_LOOP_START
 7574+			do {
 7575+				STBIR_MOVE_1(pc, coeffs);
 7576+				++pc;
 7577+				coeffs += coefficient_width;
 7578+			} while (pc < pc_end);
 7579+			break;
 7580+		case 2:
 7581+			STBIR_NO_UNROLL_LOOP_START
 7582+			do {
 7583+				STBIR_MOVE_2(pc, coeffs);
 7584+				pc += 2;
 7585+				coeffs += coefficient_width;
 7586+			} while (pc < pc_end);
 7587+			break;
 7588+		case 3:
 7589+			STBIR_NO_UNROLL_LOOP_START
 7590+			do {
 7591+				STBIR_MOVE_2(pc, coeffs);
 7592+				STBIR_MOVE_1(pc + 2, coeffs + 2);
 7593+				pc += 3;
 7594+				coeffs += coefficient_width;
 7595+			} while (pc < pc_end);
 7596+			break;
 7597+		case 4:
 7598+			STBIR_NO_UNROLL_LOOP_START
 7599+			do {
 7600+				STBIR_MOVE_4(pc, coeffs);
 7601+				pc += 4;
 7602+				coeffs += coefficient_width;
 7603+			} while (pc < pc_end);
 7604+			break;
 7605+		case 5:
 7606+			STBIR_NO_UNROLL_LOOP_START
 7607+			do {
 7608+				STBIR_MOVE_4(pc, coeffs);
 7609+				STBIR_MOVE_1(pc + 4, coeffs + 4);
 7610+				pc += 5;
 7611+				coeffs += coefficient_width;
 7612+			} while (pc < pc_end);
 7613+			break;
 7614+		case 6:
 7615+			STBIR_NO_UNROLL_LOOP_START
 7616+			do {
 7617+				STBIR_MOVE_4(pc, coeffs);
 7618+				STBIR_MOVE_2(pc + 4, coeffs + 4);
 7619+				pc += 6;
 7620+				coeffs += coefficient_width;
 7621+			} while (pc < pc_end);
 7622+			break;
 7623+		case 7:
 7624+			STBIR_NO_UNROLL_LOOP_START
 7625+			do {
 7626+				STBIR_MOVE_4(pc, coeffs);
 7627+				STBIR_MOVE_2(pc + 4, coeffs + 4);
 7628+				STBIR_MOVE_1(pc + 6, coeffs + 6);
 7629+				pc += 7;
 7630+				coeffs += coefficient_width;
 7631+			} while (pc < pc_end);
 7632+			break;
 7633+		case 8:
 7634+			STBIR_NO_UNROLL_LOOP_START
 7635+			do {
 7636+				STBIR_MOVE_4(pc, coeffs);
 7637+				STBIR_MOVE_4(pc + 4, coeffs + 4);
 7638+				pc += 8;
 7639+				coeffs += coefficient_width;
 7640+			} while (pc < pc_end);
 7641+			break;
 7642+		case 9:
 7643+			STBIR_NO_UNROLL_LOOP_START
 7644+			do {
 7645+				STBIR_MOVE_4(pc, coeffs);
 7646+				STBIR_MOVE_4(pc + 4, coeffs + 4);
 7647+				STBIR_MOVE_1(pc + 8, coeffs + 8);
 7648+				pc += 9;
 7649+				coeffs += coefficient_width;
 7650+			} while (pc < pc_end);
 7651+			break;
 7652+		case 10:
 7653+			STBIR_NO_UNROLL_LOOP_START
 7654+			do {
 7655+				STBIR_MOVE_4(pc, coeffs);
 7656+				STBIR_MOVE_4(pc + 4, coeffs + 4);
 7657+				STBIR_MOVE_2(pc + 8, coeffs + 8);
 7658+				pc += 10;
 7659+				coeffs += coefficient_width;
 7660+			} while (pc < pc_end);
 7661+			break;
 7662+		case 11:
 7663+			STBIR_NO_UNROLL_LOOP_START
 7664+			do {
 7665+				STBIR_MOVE_4(pc, coeffs);
 7666+				STBIR_MOVE_4(pc + 4, coeffs + 4);
 7667+				STBIR_MOVE_2(pc + 8, coeffs + 8);
 7668+				STBIR_MOVE_1(pc + 10, coeffs + 10);
 7669+				pc += 11;
 7670+				coeffs += coefficient_width;
 7671+			} while (pc < pc_end);
 7672+			break;
 7673+		case 12:
 7674+			STBIR_NO_UNROLL_LOOP_START
 7675+			do {
 7676+				STBIR_MOVE_4(pc, coeffs);
 7677+				STBIR_MOVE_4(pc + 4, coeffs + 4);
 7678+				STBIR_MOVE_4(pc + 8, coeffs + 8);
 7679+				pc += 12;
 7680+				coeffs += coefficient_width;
 7681+			} while (pc < pc_end);
 7682+			break;
 7683+		default:
 7684+			STBIR_NO_UNROLL_LOOP_START
 7685+			do {
 7686+				float *copy_end = pc + widest - 4;
 7687+				float *c = coeffs;
 7688+				do {
 7689+					STBIR_NO_UNROLL(pc);
 7690+					STBIR_MOVE_4(pc, c);
 7691+					pc += 4;
 7692+					c += 4;
 7693+				} while (pc <= copy_end);
 7694+				copy_end += 4;
 7695+				STBIR_NO_UNROLL_LOOP_START
 7696+				while (pc < copy_end) {
 7697+					STBIR_MOVE_1(pc, c);
 7698+					++pc;
 7699+					++c;
 7700+				}
 7701+				coeffs += coefficient_width;
 7702+			} while (pc < pc_end);
 7703+			break;
 7704+		}
 7705+	}
 7706+
 7707+	// some horizontal routines read one float off the end (which is then masked
 7708+	// off), so put in a sentinal so we don't read an snan or denormal
 7709+	coefficents[widest * num_contributors] = 8888.0f;
 7710+
 7711+	// the minimum we might read for unrolled filters widths is 12. So, we need
 7712+	// to
 7713+	//   make sure we never read outside the decode buffer, by possibly moving
 7714+	//   the sample area back into the scanline, and putting zeros weights
 7715+	//   first.
 7716+	// we start on the right edge and check until we're well past the possible
 7717+	//   clip area (2*widest).
 7718+	{
 7719+		stbir__contributors *contribs = contributors + num_contributors - 1;
 7720+		float *coeffs = coefficents + widest * (num_contributors - 1);
 7721+
 7722+		// go until no chance of clipping (this is usually less than 8 lops)
 7723+		while ((contribs >= contributors) &&
 7724+		       ((contribs->n0 + widest * 2) >= row_end)) {
 7725+			// might we clip??
 7726+			if ((contribs->n0 + widest) > row_end) {
 7727+				int stop_range = widest;
 7728+
 7729+				// if range is larger than 12, it will be handled by generic
 7730+				// loops that can terminate on the exact length
 7731+				//   of this contrib n1, instead of a fixed widest amount - so
 7732+				//   calculate this
 7733+				if (widest > 12) {
 7734+					int mod;
 7735+
 7736+					// how far will be read in the n_coeff loop (which depends
 7737+					// on the widest count mod4);
 7738+					mod = widest & 3;
 7739+					stop_range =
 7740+					    (((contribs->n1 - contribs->n0 + 1) - mod + 3) & ~3) +
 7741+					    mod;
 7742+
 7743+					// the n_coeff loops do a minimum amount of coeffs, so
 7744+					// factor that in!
 7745+					if (stop_range < (8 + mod)) {
 7746+						stop_range = 8 + mod;
 7747+					}
 7748+				}
 7749+
 7750+				// now see if we still clip with the refined range
 7751+				if ((contribs->n0 + stop_range) > row_end) {
 7752+					int new_n0 = row_end - stop_range;
 7753+					int num = contribs->n1 - contribs->n0 + 1;
 7754+					int backup = contribs->n0 - new_n0;
 7755+					float *from_co = coeffs + num - 1;
 7756+					float *to_co = from_co + backup;
 7757+
 7758+					STBIR_ASSERT((new_n0 >= row0) && (new_n0 < contribs->n0));
 7759+
 7760+					// move the coeffs over
 7761+					while (num) {
 7762+						*to_co-- = *from_co--;
 7763+						--num;
 7764+					}
 7765+					// zero new positions
 7766+					while (to_co >= coeffs) {
 7767+						*to_co-- = 0;
 7768+					}
 7769+					// set new start point
 7770+					contribs->n0 = new_n0;
 7771+					if (widest > 12) {
 7772+						int mod;
 7773+
 7774+						// how far will be read in the n_coeff loop (which
 7775+						// depends on the widest count mod4);
 7776+						mod = widest & 3;
 7777+						stop_range =
 7778+						    (((contribs->n1 - contribs->n0 + 1) - mod + 3) &
 7779+						     ~3) +
 7780+						    mod;
 7781+
 7782+						// the n_coeff loops do a minimum amount of coeffs, so
 7783+						// factor that in!
 7784+						if (stop_range < (8 + mod)) {
 7785+							stop_range = 8 + mod;
 7786+						}
 7787+					}
 7788+				}
 7789+			}
 7790+			--contribs;
 7791+			coeffs -= widest;
 7792+		}
 7793+	}
 7794+
 7795+	return widest;
 7796+#undef STBIR_MOVE_1
 7797+#undef STBIR_MOVE_2
 7798+#undef STBIR_MOVE_4
 7799+}
 7800+
 7801+static void
 7802+stbir__calculate_filters(stbir__sampler *samp,
 7803+                         stbir__sampler *other_axis_for_pivot,
 7804+                         void *user_data STBIR_ONLY_PROFILE_BUILD_GET_INFO)
 7805+{
 7806+	int n;
 7807+	float scale = samp->scale_info.scale;
 7808+	stbir__kernel_callback *kernel = samp->filter_kernel;
 7809+	stbir__support_callback *support = samp->filter_support;
 7810+	float inv_scale = samp->scale_info.inv_scale;
 7811+	int input_full_size = samp->scale_info.input_full_size;
 7812+	int gather_num_contributors = samp->num_contributors;
 7813+	stbir__contributors *gather_contributors = samp->contributors;
 7814+	float *gather_coeffs = samp->coefficients;
 7815+	int gather_coefficient_width = samp->coefficient_width;
 7816+
 7817+	switch (samp->is_gather) {
 7818+	case 1: // gather upsample
 7819+	{
 7820+		float out_pixels_radius = support(inv_scale, user_data) * scale;
 7821+
 7822+		stbir__calculate_coefficients_for_gather_upsample(
 7823+		    out_pixels_radius, kernel, &samp->scale_info,
 7824+		    gather_num_contributors, gather_contributors, gather_coeffs,
 7825+		    gather_coefficient_width, samp->edge, user_data);
 7826+
 7827+		STBIR_PROFILE_BUILD_START(cleanup);
 7828+		stbir__cleanup_gathered_coefficients(
 7829+		    samp->edge, &samp->extent_info, &samp->scale_info,
 7830+		    gather_num_contributors, gather_contributors, gather_coeffs,
 7831+		    gather_coefficient_width);
 7832+		STBIR_PROFILE_BUILD_END(cleanup);
 7833+	} break;
 7834+
 7835+	case 0: // scatter downsample (only on vertical)
 7836+	case 2: // gather downsample
 7837+	{
 7838+		float in_pixels_radius = support(scale, user_data) * inv_scale;
 7839+		int filter_pixel_margin = samp->filter_pixel_margin;
 7840+		int input_end = input_full_size + filter_pixel_margin;
 7841+
 7842+		// if this is a scatter, we do a downsample gather to get the coeffs,
 7843+		// and then pivot after
 7844+		if (!samp->is_gather) {
 7845+			// check if we are using the same gather downsample on the
 7846+			// horizontal as this vertical,
 7847+			//   if so, then we don't have to generate them, we can just pivot
 7848+			//   from the horizontal.
 7849+			if (other_axis_for_pivot) {
 7850+				gather_contributors = other_axis_for_pivot->contributors;
 7851+				gather_coeffs = other_axis_for_pivot->coefficients;
 7852+				gather_coefficient_width =
 7853+				    other_axis_for_pivot->coefficient_width;
 7854+				gather_num_contributors =
 7855+				    other_axis_for_pivot->num_contributors;
 7856+				samp->extent_info.lowest =
 7857+				    other_axis_for_pivot->extent_info.lowest;
 7858+				samp->extent_info.highest =
 7859+				    other_axis_for_pivot->extent_info.highest;
 7860+				samp->extent_info.widest =
 7861+				    other_axis_for_pivot->extent_info.widest;
 7862+				goto jump_right_to_pivot;
 7863+			}
 7864+
 7865+			gather_contributors = samp->gather_prescatter_contributors;
 7866+			gather_coeffs = samp->gather_prescatter_coefficients;
 7867+			gather_coefficient_width =
 7868+			    samp->gather_prescatter_coefficient_width;
 7869+			gather_num_contributors = samp->gather_prescatter_num_contributors;
 7870+		}
 7871+
 7872+		stbir__calculate_coefficients_for_gather_downsample(
 7873+		    -filter_pixel_margin, input_end, in_pixels_radius, kernel,
 7874+		    &samp->scale_info, gather_coefficient_width,
 7875+		    gather_num_contributors, gather_contributors, gather_coeffs,
 7876+		    user_data);
 7877+
 7878+		STBIR_PROFILE_BUILD_START(cleanup);
 7879+		stbir__cleanup_gathered_coefficients(
 7880+		    samp->edge, &samp->extent_info, &samp->scale_info,
 7881+		    gather_num_contributors, gather_contributors, gather_coeffs,
 7882+		    gather_coefficient_width);
 7883+		STBIR_PROFILE_BUILD_END(cleanup);
 7884+
 7885+		if (!samp->is_gather) {
 7886+			// if this is a scatter (vertical only), then we need to pivot the
 7887+			// coeffs
 7888+			stbir__contributors *scatter_contributors;
 7889+			int highest_set;
 7890+
 7891+		jump_right_to_pivot:
 7892+
 7893+			STBIR_PROFILE_BUILD_START(pivot);
 7894+
 7895+			highest_set = (-filter_pixel_margin) - 1;
 7896+			for (n = 0; n < gather_num_contributors; n++) {
 7897+				int k;
 7898+				int gn0 = gather_contributors->n0,
 7899+				    gn1 = gather_contributors->n1;
 7900+				int scatter_coefficient_width = samp->coefficient_width;
 7901+				float *scatter_coeffs =
 7902+				    samp->coefficients +
 7903+				    (gn0 + filter_pixel_margin) * scatter_coefficient_width;
 7904+				float *g_coeffs = gather_coeffs;
 7905+				scatter_contributors =
 7906+				    samp->contributors + (gn0 + filter_pixel_margin);
 7907+
 7908+				for (k = gn0; k <= gn1; k++) {
 7909+					float gc = *g_coeffs++;
 7910+
 7911+					// skip zero and denormals - must skip zeros to avoid adding
 7912+					// coeffs beyond scatter_coefficient_width
 7913+					//   (which happens when pivoting from horizontal, which
 7914+					//   might have dummy zeros)
 7915+					if (((gc >= stbir__small_float) ||
 7916+					     (gc <= -stbir__small_float))) {
 7917+						if ((k > highest_set) || (scatter_contributors->n0 >
 7918+						                          scatter_contributors->n1)) {
 7919+							{
 7920+								// if we are skipping over several contributors,
 7921+								// we need to clear the skipped ones
 7922+								stbir__contributors *clear_contributors =
 7923+								    samp->contributors +
 7924+								    (highest_set + filter_pixel_margin + 1);
 7925+								while (clear_contributors <
 7926+								       scatter_contributors) {
 7927+									clear_contributors->n0 = 0;
 7928+									clear_contributors->n1 = -1;
 7929+									++clear_contributors;
 7930+								}
 7931+							}
 7932+							scatter_contributors->n0 = n;
 7933+							scatter_contributors->n1 = n;
 7934+							scatter_coeffs[0] = gc;
 7935+							highest_set = k;
 7936+						} else {
 7937+							stbir__insert_coeff(scatter_contributors,
 7938+							                    scatter_coeffs, n, gc,
 7939+							                    scatter_coefficient_width);
 7940+						}
 7941+						STBIR_ASSERT((scatter_contributors->n1 -
 7942+						              scatter_contributors->n0 + 1) <=
 7943+						             scatter_coefficient_width);
 7944+					}
 7945+					++scatter_contributors;
 7946+					scatter_coeffs += scatter_coefficient_width;
 7947+				}
 7948+
 7949+				++gather_contributors;
 7950+				gather_coeffs += gather_coefficient_width;
 7951+			}
 7952+
 7953+			// now clear any unset contribs
 7954+			{
 7955+				stbir__contributors *clear_contributors =
 7956+				    samp->contributors +
 7957+				    (highest_set + filter_pixel_margin + 1);
 7958+				stbir__contributors *end_contributors =
 7959+				    samp->contributors + samp->num_contributors;
 7960+				while (clear_contributors < end_contributors) {
 7961+					clear_contributors->n0 = 0;
 7962+					clear_contributors->n1 = -1;
 7963+					++clear_contributors;
 7964+				}
 7965+			}
 7966+
 7967+			STBIR_PROFILE_BUILD_END(pivot);
 7968+		}
 7969+	} break;
 7970+	}
 7971+}
 7972 
 7973 //========================================================================================================
 7974 // scanline decoders and encoders
 7975@@ -4051,760 +4882,803 @@ static void stbir__calculate_filters( stbir__sampler * samp, stbir__sampler * ot
 7976 
 7977 #define stbir__decode_suffix BGRA
 7978 #define stbir__decode_swizzle
 7979-#define stbir__decode_order0  2
 7980-#define stbir__decode_order1  1
 7981-#define stbir__decode_order2  0
 7982-#define stbir__decode_order3  3
 7983-#define stbir__encode_order0  2
 7984-#define stbir__encode_order1  1
 7985-#define stbir__encode_order2  0
 7986-#define stbir__encode_order3  3
 7987+#define stbir__decode_order0 2
 7988+#define stbir__decode_order1 1
 7989+#define stbir__decode_order2 0
 7990+#define stbir__decode_order3 3
 7991+#define stbir__encode_order0 2
 7992+#define stbir__encode_order1 1
 7993+#define stbir__encode_order2 0
 7994+#define stbir__encode_order3 3
 7995 #define stbir__coder_min_num 4
 7996 #define STB_IMAGE_RESIZE_DO_CODERS
 7997 #include STBIR__HEADER_FILENAME
 7998 
 7999 #define stbir__decode_suffix ARGB
 8000 #define stbir__decode_swizzle
 8001-#define stbir__decode_order0  1
 8002-#define stbir__decode_order1  2
 8003-#define stbir__decode_order2  3
 8004-#define stbir__decode_order3  0
 8005-#define stbir__encode_order0  3
 8006-#define stbir__encode_order1  0
 8007-#define stbir__encode_order2  1
 8008-#define stbir__encode_order3  2
 8009+#define stbir__decode_order0 1
 8010+#define stbir__decode_order1 2
 8011+#define stbir__decode_order2 3
 8012+#define stbir__decode_order3 0
 8013+#define stbir__encode_order0 3
 8014+#define stbir__encode_order1 0
 8015+#define stbir__encode_order2 1
 8016+#define stbir__encode_order3 2
 8017 #define stbir__coder_min_num 4
 8018 #define STB_IMAGE_RESIZE_DO_CODERS
 8019 #include STBIR__HEADER_FILENAME
 8020 
 8021 #define stbir__decode_suffix ABGR
 8022 #define stbir__decode_swizzle
 8023-#define stbir__decode_order0  3
 8024-#define stbir__decode_order1  2
 8025-#define stbir__decode_order2  1
 8026-#define stbir__decode_order3  0
 8027-#define stbir__encode_order0  3
 8028-#define stbir__encode_order1  2
 8029-#define stbir__encode_order2  1
 8030-#define stbir__encode_order3  0
 8031+#define stbir__decode_order0 3
 8032+#define stbir__decode_order1 2
 8033+#define stbir__decode_order2 1
 8034+#define stbir__decode_order3 0
 8035+#define stbir__encode_order0 3
 8036+#define stbir__encode_order1 2
 8037+#define stbir__encode_order2 1
 8038+#define stbir__encode_order3 0
 8039 #define stbir__coder_min_num 4
 8040 #define STB_IMAGE_RESIZE_DO_CODERS
 8041 #include STBIR__HEADER_FILENAME
 8042 
 8043 #define stbir__decode_suffix AR
 8044 #define stbir__decode_swizzle
 8045-#define stbir__decode_order0  1
 8046-#define stbir__decode_order1  0
 8047-#define stbir__decode_order2  3
 8048-#define stbir__decode_order3  2
 8049-#define stbir__encode_order0  1
 8050-#define stbir__encode_order1  0
 8051-#define stbir__encode_order2  3
 8052-#define stbir__encode_order3  2
 8053+#define stbir__decode_order0 1
 8054+#define stbir__decode_order1 0
 8055+#define stbir__decode_order2 3
 8056+#define stbir__decode_order3 2
 8057+#define stbir__encode_order0 1
 8058+#define stbir__encode_order1 0
 8059+#define stbir__encode_order2 3
 8060+#define stbir__encode_order3 2
 8061 #define stbir__coder_min_num 2
 8062 #define STB_IMAGE_RESIZE_DO_CODERS
 8063 #include STBIR__HEADER_FILENAME
 8064 
 8065+// fancy alpha means we expand to keep both premultipied and non-premultiplied
 8066+// color channels
 8067+static void
 8068+stbir__fancy_alpha_weight_4ch(float *out_buffer, int width_times_channels)
 8069+{
 8070+	float STBIR_STREAMOUT_PTR(*) out = out_buffer;
 8071+	float const *end_decode =
 8072+	    out_buffer + (width_times_channels / 4) *
 8073+	                     7; // decode buffer aligned to end of out_buffer
 8074+	float STBIR_STREAMOUT_PTR(*) decode =
 8075+	    (float *)end_decode - width_times_channels;
 8076+
 8077+	// fancy alpha is stored internally as R G B A Rpm Gpm Bpm
 8078 
 8079-// fancy alpha means we expand to keep both premultipied and non-premultiplied color channels
 8080-static void stbir__fancy_alpha_weight_4ch( float * out_buffer, int width_times_channels )
 8081-{
 8082-  float STBIR_STREAMOUT_PTR(*) out = out_buffer;
 8083-  float const * end_decode = out_buffer + ( width_times_channels / 4 ) * 7;  // decode buffer aligned to end of out_buffer
 8084-  float STBIR_STREAMOUT_PTR(*) decode = (float*)end_decode - width_times_channels;
 8085-
 8086-  // fancy alpha is stored internally as R G B A Rpm Gpm Bpm
 8087-
 8088-  #ifdef STBIR_SIMD
 8089-
 8090-  #ifdef STBIR_SIMD8
 8091-  decode += 16;
 8092-  STBIR_NO_UNROLL_LOOP_START
 8093-  while ( decode <= end_decode )
 8094-  {
 8095-    stbir__simdf8 d0,d1,a0,a1,p0,p1;
 8096-    STBIR_NO_UNROLL(decode);
 8097-    stbir__simdf8_load( d0, decode-16 );
 8098-    stbir__simdf8_load( d1, decode-16+8 );
 8099-    stbir__simdf8_0123to33333333( a0, d0 );
 8100-    stbir__simdf8_0123to33333333( a1, d1 );
 8101-    stbir__simdf8_mult( p0, a0, d0 );
 8102-    stbir__simdf8_mult( p1, a1, d1 );
 8103-    stbir__simdf8_bot4s( a0, d0, p0 );
 8104-    stbir__simdf8_bot4s( a1, d1, p1 );
 8105-    stbir__simdf8_top4s( d0, d0, p0 );
 8106-    stbir__simdf8_top4s( d1, d1, p1 );
 8107-    stbir__simdf8_store ( out, a0 );
 8108-    stbir__simdf8_store ( out+7, d0 );
 8109-    stbir__simdf8_store ( out+14, a1 );
 8110-    stbir__simdf8_store ( out+21, d1 );
 8111-    decode += 16;
 8112-    out += 28;
 8113-  }
 8114-  decode -= 16;
 8115-  #else
 8116-  decode += 8;
 8117-  STBIR_NO_UNROLL_LOOP_START
 8118-  while ( decode <= end_decode )
 8119-  {
 8120-    stbir__simdf d0,a0,d1,a1,p0,p1;
 8121-    STBIR_NO_UNROLL(decode);
 8122-    stbir__simdf_load( d0, decode-8 );
 8123-    stbir__simdf_load( d1, decode-8+4 );
 8124-    stbir__simdf_0123to3333( a0, d0 );
 8125-    stbir__simdf_0123to3333( a1, d1 );
 8126-    stbir__simdf_mult( p0, a0, d0 );
 8127-    stbir__simdf_mult( p1, a1, d1 );
 8128-    stbir__simdf_store ( out, d0 );
 8129-    stbir__simdf_store ( out+4, p0 );
 8130-    stbir__simdf_store ( out+7, d1 );
 8131-    stbir__simdf_store ( out+7+4, p1 );
 8132-    decode += 8;
 8133-    out += 14;
 8134-  }
 8135-  decode -= 8;
 8136-  #endif
 8137-
 8138-  // might be one last odd pixel
 8139-  #ifdef STBIR_SIMD8
 8140-  STBIR_NO_UNROLL_LOOP_START
 8141-  while ( decode < end_decode )
 8142-  #else
 8143-  if ( decode < end_decode )
 8144-  #endif
 8145-  {
 8146-    stbir__simdf d,a,p;
 8147-    STBIR_NO_UNROLL(decode);
 8148-    stbir__simdf_load( d, decode );
 8149-    stbir__simdf_0123to3333( a, d );
 8150-    stbir__simdf_mult( p, a, d );
 8151-    stbir__simdf_store ( out, d );
 8152-    stbir__simdf_store ( out+4, p );
 8153-    decode += 4;
 8154-    out += 7;
 8155-  }
 8156-
 8157-  #else
 8158-
 8159-  while( decode < end_decode )
 8160-  {
 8161-    float r = decode[0], g = decode[1], b = decode[2], alpha = decode[3];
 8162-    out[0] = r;
 8163-    out[1] = g;
 8164-    out[2] = b;
 8165-    out[3] = alpha;
 8166-    out[4] = r * alpha;
 8167-    out[5] = g * alpha;
 8168-    out[6] = b * alpha;
 8169-    out += 7;
 8170-    decode += 4;
 8171-  }
 8172-
 8173-  #endif
 8174-}
 8175-
 8176-static void stbir__fancy_alpha_weight_2ch( float * out_buffer, int width_times_channels )
 8177-{
 8178-  float STBIR_STREAMOUT_PTR(*) out = out_buffer;
 8179-  float const * end_decode = out_buffer + ( width_times_channels / 2 ) * 3;
 8180-  float STBIR_STREAMOUT_PTR(*) decode = (float*)end_decode - width_times_channels;
 8181-
 8182-  //  for fancy alpha, turns into: [X A Xpm][X A Xpm],etc
 8183-
 8184-  #ifdef STBIR_SIMD
 8185-
 8186-  decode += 8;
 8187-  if ( decode <= end_decode )
 8188-  {
 8189-    STBIR_NO_UNROLL_LOOP_START
 8190-    do {
 8191-      #ifdef STBIR_SIMD8
 8192-      stbir__simdf8 d0,a0,p0;
 8193-      STBIR_NO_UNROLL(decode);
 8194-      stbir__simdf8_load( d0, decode-8 );
 8195-      stbir__simdf8_0123to11331133( p0, d0 );
 8196-      stbir__simdf8_0123to00220022( a0, d0 );
 8197-      stbir__simdf8_mult( p0, p0, a0 );
 8198-
 8199-      stbir__simdf_store2( out, stbir__if_simdf8_cast_to_simdf4( d0 ) );
 8200-      stbir__simdf_store( out+2, stbir__if_simdf8_cast_to_simdf4( p0 ) );
 8201-      stbir__simdf_store2h( out+3, stbir__if_simdf8_cast_to_simdf4( d0 ) );
 8202-
 8203-      stbir__simdf_store2( out+6, stbir__simdf8_gettop4( d0 ) );
 8204-      stbir__simdf_store( out+8, stbir__simdf8_gettop4( p0 ) );
 8205-      stbir__simdf_store2h( out+9, stbir__simdf8_gettop4( d0 ) );
 8206-      #else
 8207-      stbir__simdf d0,a0,d1,a1,p0,p1;
 8208-      STBIR_NO_UNROLL(decode);
 8209-      stbir__simdf_load( d0, decode-8 );
 8210-      stbir__simdf_load( d1, decode-8+4 );
 8211-      stbir__simdf_0123to1133( p0, d0 );
 8212-      stbir__simdf_0123to1133( p1, d1 );
 8213-      stbir__simdf_0123to0022( a0, d0 );
 8214-      stbir__simdf_0123to0022( a1, d1 );
 8215-      stbir__simdf_mult( p0, p0, a0 );
 8216-      stbir__simdf_mult( p1, p1, a1 );
 8217-
 8218-      stbir__simdf_store2( out, d0 );
 8219-      stbir__simdf_store( out+2, p0 );
 8220-      stbir__simdf_store2h( out+3, d0 );
 8221-
 8222-      stbir__simdf_store2( out+6, d1 );
 8223-      stbir__simdf_store( out+8, p1 );
 8224-      stbir__simdf_store2h( out+9, d1 );
 8225-      #endif
 8226-      decode += 8;
 8227-      out += 12;
 8228-    } while ( decode <= end_decode );
 8229-  }
 8230-  decode -= 8;
 8231-  #endif
 8232-
 8233-  STBIR_SIMD_NO_UNROLL_LOOP_START
 8234-  while( decode < end_decode )
 8235-  {
 8236-    float x = decode[0], y = decode[1];
 8237-    STBIR_SIMD_NO_UNROLL(decode);
 8238-    out[0] = x;
 8239-    out[1] = y;
 8240-    out[2] = x * y;
 8241-    out += 3;
 8242-    decode += 2;
 8243-  }
 8244-}
 8245-
 8246-static void stbir__fancy_alpha_unweight_4ch( float * encode_buffer, int width_times_channels )
 8247-{
 8248-  float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer;
 8249-  float STBIR_SIMD_STREAMOUT_PTR(*) input = encode_buffer;
 8250-  float const * end_output = encode_buffer + width_times_channels;
 8251-
 8252-  // fancy RGBA is stored internally as R G B A Rpm Gpm Bpm
 8253-
 8254-  STBIR_SIMD_NO_UNROLL_LOOP_START
 8255-  do {
 8256-    float alpha = input[3];
 8257 #ifdef STBIR_SIMD
 8258-    stbir__simdf i,ia;
 8259-    STBIR_SIMD_NO_UNROLL(encode);
 8260-    if ( alpha < stbir__small_float )
 8261-    {
 8262-      stbir__simdf_load( i, input );
 8263-      stbir__simdf_store( encode, i );
 8264-    }
 8265-    else
 8266-    {
 8267-      stbir__simdf_load1frep4( ia, 1.0f / alpha );
 8268-      stbir__simdf_load( i, input+4 );
 8269-      stbir__simdf_mult( i, i, ia );
 8270-      stbir__simdf_store( encode, i );
 8271-      encode[3] = alpha;
 8272-    }
 8273+
 8274+#ifdef STBIR_SIMD8
 8275+	decode += 16;
 8276+	STBIR_NO_UNROLL_LOOP_START
 8277+	while (decode <= end_decode) {
 8278+		stbir__simdf8 d0, d1, a0, a1, p0, p1;
 8279+		STBIR_NO_UNROLL(decode);
 8280+		stbir__simdf8_load(d0, decode - 16);
 8281+		stbir__simdf8_load(d1, decode - 16 + 8);
 8282+		stbir__simdf8_0123to33333333(a0, d0);
 8283+		stbir__simdf8_0123to33333333(a1, d1);
 8284+		stbir__simdf8_mult(p0, a0, d0);
 8285+		stbir__simdf8_mult(p1, a1, d1);
 8286+		stbir__simdf8_bot4s(a0, d0, p0);
 8287+		stbir__simdf8_bot4s(a1, d1, p1);
 8288+		stbir__simdf8_top4s(d0, d0, p0);
 8289+		stbir__simdf8_top4s(d1, d1, p1);
 8290+		stbir__simdf8_store(out, a0);
 8291+		stbir__simdf8_store(out + 7, d0);
 8292+		stbir__simdf8_store(out + 14, a1);
 8293+		stbir__simdf8_store(out + 21, d1);
 8294+		decode += 16;
 8295+		out += 28;
 8296+	}
 8297+	decode -= 16;
 8298 #else
 8299-    if ( alpha < stbir__small_float )
 8300-    {
 8301-      encode[0] = input[0];
 8302-      encode[1] = input[1];
 8303-      encode[2] = input[2];
 8304-    }
 8305-    else
 8306-    {
 8307-      float ialpha = 1.0f / alpha;
 8308-      encode[0] = input[4] * ialpha;
 8309-      encode[1] = input[5] * ialpha;
 8310-      encode[2] = input[6] * ialpha;
 8311-    }
 8312-    encode[3] = alpha;
 8313-#endif
 8314-
 8315-    input += 7;
 8316-    encode += 4;
 8317-  } while ( encode < end_output );
 8318-}
 8319+	decode += 8;
 8320+	STBIR_NO_UNROLL_LOOP_START
 8321+	while (decode <= end_decode) {
 8322+		stbir__simdf d0, a0, d1, a1, p0, p1;
 8323+		STBIR_NO_UNROLL(decode);
 8324+		stbir__simdf_load(d0, decode - 8);
 8325+		stbir__simdf_load(d1, decode - 8 + 4);
 8326+		stbir__simdf_0123to3333(a0, d0);
 8327+		stbir__simdf_0123to3333(a1, d1);
 8328+		stbir__simdf_mult(p0, a0, d0);
 8329+		stbir__simdf_mult(p1, a1, d1);
 8330+		stbir__simdf_store(out, d0);
 8331+		stbir__simdf_store(out + 4, p0);
 8332+		stbir__simdf_store(out + 7, d1);
 8333+		stbir__simdf_store(out + 7 + 4, p1);
 8334+		decode += 8;
 8335+		out += 14;
 8336+	}
 8337+	decode -= 8;
 8338+#endif
 8339 
 8340-//  format: [X A Xpm][X A Xpm] etc
 8341-static void stbir__fancy_alpha_unweight_2ch( float * encode_buffer, int width_times_channels )
 8342-{
 8343-  float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer;
 8344-  float STBIR_SIMD_STREAMOUT_PTR(*) input = encode_buffer;
 8345-  float const * end_output = encode_buffer + width_times_channels;
 8346-
 8347-  do {
 8348-    float alpha = input[1];
 8349-    encode[0] = input[0];
 8350-    if ( alpha >= stbir__small_float )
 8351-      encode[0] = input[2] / alpha;
 8352-    encode[1] = alpha;
 8353-
 8354-    input += 3;
 8355-    encode += 2;
 8356-  } while ( encode < end_output );
 8357-}
 8358-
 8359-static void stbir__simple_alpha_weight_4ch( float * decode_buffer, int width_times_channels )
 8360-{
 8361-  float STBIR_STREAMOUT_PTR(*) decode = decode_buffer;
 8362-  float const * end_decode = decode_buffer + width_times_channels;
 8363-
 8364-  #ifdef STBIR_SIMD
 8365-  {
 8366-    decode += 2 * stbir__simdfX_float_count;
 8367-    STBIR_NO_UNROLL_LOOP_START
 8368-    while ( decode <= end_decode )
 8369-    {
 8370-      stbir__simdfX d0,a0,d1,a1;
 8371-      STBIR_NO_UNROLL(decode);
 8372-      stbir__simdfX_load( d0, decode-2*stbir__simdfX_float_count );
 8373-      stbir__simdfX_load( d1, decode-2*stbir__simdfX_float_count+stbir__simdfX_float_count );
 8374-      stbir__simdfX_aaa1( a0, d0, STBIR_onesX );
 8375-      stbir__simdfX_aaa1( a1, d1, STBIR_onesX );
 8376-      stbir__simdfX_mult( d0, d0, a0 );
 8377-      stbir__simdfX_mult( d1, d1, a1 );
 8378-      stbir__simdfX_store ( decode-2*stbir__simdfX_float_count, d0 );
 8379-      stbir__simdfX_store ( decode-2*stbir__simdfX_float_count+stbir__simdfX_float_count, d1 );
 8380-      decode += 2 * stbir__simdfX_float_count;
 8381-    }
 8382-    decode -= 2 * stbir__simdfX_float_count;
 8383-
 8384-    // few last pixels remnants
 8385-    #ifdef STBIR_SIMD8
 8386-    STBIR_NO_UNROLL_LOOP_START
 8387-    while ( decode < end_decode )
 8388-    #else
 8389-    if ( decode < end_decode )
 8390-    #endif
 8391-    {
 8392-      stbir__simdf d,a;
 8393-      stbir__simdf_load( d, decode );
 8394-      stbir__simdf_aaa1( a, d, STBIR__CONSTF(STBIR_ones) );
 8395-      stbir__simdf_mult( d, d, a );
 8396-      stbir__simdf_store ( decode, d );
 8397-      decode += 4;
 8398-    }
 8399-  }
 8400-
 8401-  #else
 8402-
 8403-  while( decode < end_decode )
 8404-  {
 8405-    float alpha = decode[3];
 8406-    decode[0] *= alpha;
 8407-    decode[1] *= alpha;
 8408-    decode[2] *= alpha;
 8409-    decode += 4;
 8410-  }
 8411-
 8412-  #endif
 8413-}
 8414-
 8415-static void stbir__simple_alpha_weight_2ch( float * decode_buffer, int width_times_channels )
 8416-{
 8417-  float STBIR_STREAMOUT_PTR(*) decode = decode_buffer;
 8418-  float const * end_decode = decode_buffer + width_times_channels;
 8419-
 8420-  #ifdef STBIR_SIMD
 8421-  decode += 2 * stbir__simdfX_float_count;
 8422-  STBIR_NO_UNROLL_LOOP_START
 8423-  while ( decode <= end_decode )
 8424-  {
 8425-    stbir__simdfX d0,a0,d1,a1;
 8426-    STBIR_NO_UNROLL(decode);
 8427-    stbir__simdfX_load( d0, decode-2*stbir__simdfX_float_count );
 8428-    stbir__simdfX_load( d1, decode-2*stbir__simdfX_float_count+stbir__simdfX_float_count );
 8429-    stbir__simdfX_a1a1( a0, d0, STBIR_onesX );
 8430-    stbir__simdfX_a1a1( a1, d1, STBIR_onesX );
 8431-    stbir__simdfX_mult( d0, d0, a0 );
 8432-    stbir__simdfX_mult( d1, d1, a1 );
 8433-    stbir__simdfX_store ( decode-2*stbir__simdfX_float_count, d0 );
 8434-    stbir__simdfX_store ( decode-2*stbir__simdfX_float_count+stbir__simdfX_float_count, d1 );
 8435-    decode += 2 * stbir__simdfX_float_count;
 8436-  }
 8437-  decode -= 2 * stbir__simdfX_float_count;
 8438-  #endif
 8439-
 8440-  STBIR_SIMD_NO_UNROLL_LOOP_START
 8441-  while( decode < end_decode )
 8442-  {
 8443-    float alpha = decode[1];
 8444-    STBIR_SIMD_NO_UNROLL(decode);
 8445-    decode[0] *= alpha;
 8446-    decode += 2;
 8447-  }
 8448-}
 8449-
 8450-static void stbir__simple_alpha_unweight_4ch( float * encode_buffer, int width_times_channels )
 8451-{
 8452-  float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer;
 8453-  float const * end_output = encode_buffer + width_times_channels;
 8454-
 8455-  STBIR_SIMD_NO_UNROLL_LOOP_START
 8456-  do {
 8457-    float alpha = encode[3];
 8458+// might be one last odd pixel
 8459+#ifdef STBIR_SIMD8
 8460+	STBIR_NO_UNROLL_LOOP_START
 8461+	while (decode < end_decode)
 8462+#else
 8463+	if (decode < end_decode)
 8464+#endif
 8465+	{
 8466+		stbir__simdf d, a, p;
 8467+		STBIR_NO_UNROLL(decode);
 8468+		stbir__simdf_load(d, decode);
 8469+		stbir__simdf_0123to3333(a, d);
 8470+		stbir__simdf_mult(p, a, d);
 8471+		stbir__simdf_store(out, d);
 8472+		stbir__simdf_store(out + 4, p);
 8473+		decode += 4;
 8474+		out += 7;
 8475+	}
 8476 
 8477-#ifdef STBIR_SIMD
 8478-    stbir__simdf i,ia;
 8479-    STBIR_SIMD_NO_UNROLL(encode);
 8480-    if ( alpha >= stbir__small_float )
 8481-    {
 8482-      stbir__simdf_load1frep4( ia, 1.0f / alpha );
 8483-      stbir__simdf_load( i, encode );
 8484-      stbir__simdf_mult( i, i, ia );
 8485-      stbir__simdf_store( encode, i );
 8486-      encode[3] = alpha;
 8487-    }
 8488 #else
 8489-    if ( alpha >= stbir__small_float )
 8490-    {
 8491-      float ialpha = 1.0f / alpha;
 8492-      encode[0] *= ialpha;
 8493-      encode[1] *= ialpha;
 8494-      encode[2] *= ialpha;
 8495-    }
 8496+
 8497+	while (decode < end_decode) {
 8498+		float r = decode[0], g = decode[1], b = decode[2], alpha = decode[3];
 8499+		out[0] = r;
 8500+		out[1] = g;
 8501+		out[2] = b;
 8502+		out[3] = alpha;
 8503+		out[4] = r * alpha;
 8504+		out[5] = g * alpha;
 8505+		out[6] = b * alpha;
 8506+		out += 7;
 8507+		decode += 4;
 8508+	}
 8509+
 8510 #endif
 8511-    encode += 4;
 8512-  } while ( encode < end_output );
 8513 }
 8514 
 8515-static void stbir__simple_alpha_unweight_2ch( float * encode_buffer, int width_times_channels )
 8516+static void
 8517+stbir__fancy_alpha_weight_2ch(float *out_buffer, int width_times_channels)
 8518 {
 8519-  float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer;
 8520-  float const * end_output = encode_buffer + width_times_channels;
 8521+	float STBIR_STREAMOUT_PTR(*) out = out_buffer;
 8522+	float const *end_decode = out_buffer + (width_times_channels / 2) * 3;
 8523+	float STBIR_STREAMOUT_PTR(*) decode =
 8524+	    (float *)end_decode - width_times_channels;
 8525 
 8526-  do {
 8527-    float alpha = encode[1];
 8528-    if ( alpha >= stbir__small_float )
 8529-      encode[0] /= alpha;
 8530-    encode += 2;
 8531-  } while ( encode < end_output );
 8532-}
 8533+	//  for fancy alpha, turns into: [X A Xpm][X A Xpm],etc
 8534 
 8535+#ifdef STBIR_SIMD
 8536 
 8537-// only used in RGB->BGR or BGR->RGB
 8538-static void stbir__simple_flip_3ch( float * decode_buffer, int width_times_channels )
 8539+	decode += 8;
 8540+	if (decode <= end_decode) {
 8541+		STBIR_NO_UNROLL_LOOP_START
 8542+		do {
 8543+#ifdef STBIR_SIMD8
 8544+			stbir__simdf8 d0, a0, p0;
 8545+			STBIR_NO_UNROLL(decode);
 8546+			stbir__simdf8_load(d0, decode - 8);
 8547+			stbir__simdf8_0123to11331133(p0, d0);
 8548+			stbir__simdf8_0123to00220022(a0, d0);
 8549+			stbir__simdf8_mult(p0, p0, a0);
 8550+
 8551+			stbir__simdf_store2(out, stbir__if_simdf8_cast_to_simdf4(d0));
 8552+			stbir__simdf_store(out + 2, stbir__if_simdf8_cast_to_simdf4(p0));
 8553+			stbir__simdf_store2h(out + 3, stbir__if_simdf8_cast_to_simdf4(d0));
 8554+
 8555+			stbir__simdf_store2(out + 6, stbir__simdf8_gettop4(d0));
 8556+			stbir__simdf_store(out + 8, stbir__simdf8_gettop4(p0));
 8557+			stbir__simdf_store2h(out + 9, stbir__simdf8_gettop4(d0));
 8558+#else
 8559+			stbir__simdf d0, a0, d1, a1, p0, p1;
 8560+			STBIR_NO_UNROLL(decode);
 8561+			stbir__simdf_load(d0, decode - 8);
 8562+			stbir__simdf_load(d1, decode - 8 + 4);
 8563+			stbir__simdf_0123to1133(p0, d0);
 8564+			stbir__simdf_0123to1133(p1, d1);
 8565+			stbir__simdf_0123to0022(a0, d0);
 8566+			stbir__simdf_0123to0022(a1, d1);
 8567+			stbir__simdf_mult(p0, p0, a0);
 8568+			stbir__simdf_mult(p1, p1, a1);
 8569+
 8570+			stbir__simdf_store2(out, d0);
 8571+			stbir__simdf_store(out + 2, p0);
 8572+			stbir__simdf_store2h(out + 3, d0);
 8573+
 8574+			stbir__simdf_store2(out + 6, d1);
 8575+			stbir__simdf_store(out + 8, p1);
 8576+			stbir__simdf_store2h(out + 9, d1);
 8577+#endif
 8578+			decode += 8;
 8579+			out += 12;
 8580+		} while (decode <= end_decode);
 8581+	}
 8582+	decode -= 8;
 8583+#endif
 8584+
 8585+	STBIR_SIMD_NO_UNROLL_LOOP_START
 8586+	while (decode < end_decode) {
 8587+		float x = decode[0], y = decode[1];
 8588+		STBIR_SIMD_NO_UNROLL(decode);
 8589+		out[0] = x;
 8590+		out[1] = y;
 8591+		out[2] = x * y;
 8592+		out += 3;
 8593+		decode += 2;
 8594+	}
 8595+}
 8596+
 8597+static void
 8598+stbir__fancy_alpha_unweight_4ch(float *encode_buffer, int width_times_channels)
 8599 {
 8600-  float STBIR_STREAMOUT_PTR(*) decode = decode_buffer;
 8601-  float const * end_decode = decode_buffer + width_times_channels;
 8602+	float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer;
 8603+	float STBIR_SIMD_STREAMOUT_PTR(*) input = encode_buffer;
 8604+	float const *end_output = encode_buffer + width_times_channels;
 8605+
 8606+	// fancy RGBA is stored internally as R G B A Rpm Gpm Bpm
 8607 
 8608+	STBIR_SIMD_NO_UNROLL_LOOP_START
 8609+	do {
 8610+		float alpha = input[3];
 8611 #ifdef STBIR_SIMD
 8612-    #ifdef stbir__simdf_swiz2 // do we have two argument swizzles?
 8613-      end_decode -= 12; 
 8614-      STBIR_NO_UNROLL_LOOP_START
 8615-      while( decode <= end_decode )
 8616-      {
 8617-        // on arm64 8 instructions, no overlapping stores
 8618-        stbir__simdf a,b,c,na,nb;
 8619-        STBIR_SIMD_NO_UNROLL(decode);
 8620-        stbir__simdf_load( a, decode );
 8621-        stbir__simdf_load( b, decode+4 );
 8622-        stbir__simdf_load( c, decode+8 );
 8623-
 8624-        na = stbir__simdf_swiz2( a, b, 2, 1, 0, 5 );   
 8625-        b  = stbir__simdf_swiz2( a, b, 4, 3, 6, 7 );   
 8626-        nb = stbir__simdf_swiz2( b, c, 0, 1, 4, 3 );   
 8627-        c  = stbir__simdf_swiz2( b, c, 2, 7, 6, 5 );   
 8628-
 8629-        stbir__simdf_store( decode, na );
 8630-        stbir__simdf_store( decode+4, nb ); 
 8631-        stbir__simdf_store( decode+8, c );
 8632-        decode += 12;
 8633-      }
 8634-      end_decode += 12;
 8635-    #else
 8636-      end_decode -= 24;
 8637-      STBIR_NO_UNROLL_LOOP_START
 8638-      while( decode <= end_decode )
 8639-      {
 8640-        // 26 instructions on x64
 8641-        stbir__simdf a,b,c,d,e,f,g;
 8642-        float i21, i23;
 8643-        STBIR_SIMD_NO_UNROLL(decode);
 8644-        stbir__simdf_load( a, decode );
 8645-        stbir__simdf_load( b, decode+3 );
 8646-        stbir__simdf_load( c, decode+6 );
 8647-        stbir__simdf_load( d, decode+9 );
 8648-        stbir__simdf_load( e, decode+12 );
 8649-        stbir__simdf_load( f, decode+15 );
 8650-        stbir__simdf_load( g, decode+18 );
 8651-
 8652-        a = stbir__simdf_swiz( a, 2, 1, 0, 3 );   
 8653-        b = stbir__simdf_swiz( b, 2, 1, 0, 3 );   
 8654-        c = stbir__simdf_swiz( c, 2, 1, 0, 3 );   
 8655-        d = stbir__simdf_swiz( d, 2, 1, 0, 3 );   
 8656-        e = stbir__simdf_swiz( e, 2, 1, 0, 3 );   
 8657-        f = stbir__simdf_swiz( f, 2, 1, 0, 3 );   
 8658-        g = stbir__simdf_swiz( g, 2, 1, 0, 3 );   
 8659-
 8660-        // stores overlap, need to be in order, 
 8661-        stbir__simdf_store( decode,    a );
 8662-        i21 = decode[21];
 8663-        stbir__simdf_store( decode+3,  b ); 
 8664-        i23 = decode[23];
 8665-        stbir__simdf_store( decode+6,  c );
 8666-        stbir__simdf_store( decode+9,  d );
 8667-        stbir__simdf_store( decode+12, e );
 8668-        stbir__simdf_store( decode+15, f );
 8669-        stbir__simdf_store( decode+18, g );
 8670-        decode[21] = i23;
 8671-        decode[23] = i21;
 8672-        decode += 24;
 8673-      }
 8674-      end_decode += 24;
 8675-    #endif
 8676+		stbir__simdf i, ia;
 8677+		STBIR_SIMD_NO_UNROLL(encode);
 8678+		if (alpha < stbir__small_float) {
 8679+			stbir__simdf_load(i, input);
 8680+			stbir__simdf_store(encode, i);
 8681+		} else {
 8682+			stbir__simdf_load1frep4(ia, 1.0f / alpha);
 8683+			stbir__simdf_load(i, input + 4);
 8684+			stbir__simdf_mult(i, i, ia);
 8685+			stbir__simdf_store(encode, i);
 8686+			encode[3] = alpha;
 8687+		}
 8688 #else
 8689-  end_decode -= 12;
 8690-  STBIR_NO_UNROLL_LOOP_START
 8691-  while( decode <= end_decode )
 8692-  {
 8693-    // 16 instructions
 8694-    float t0,t1,t2,t3;
 8695-    STBIR_NO_UNROLL(decode);
 8696-    t0 = decode[0]; t1 = decode[3]; t2 = decode[6]; t3 = decode[9];
 8697-    decode[0] = decode[2]; decode[3] = decode[5]; decode[6] = decode[8]; decode[9] = decode[11];
 8698-    decode[2] = t0; decode[5] = t1; decode[8] = t2; decode[11] = t3;
 8699-    decode += 12;
 8700-  }
 8701-  end_decode += 12;
 8702-#endif
 8703-
 8704-  STBIR_NO_UNROLL_LOOP_START
 8705-  while( decode < end_decode )
 8706-  {
 8707-    float t = decode[0];
 8708-    STBIR_NO_UNROLL(decode);
 8709-    decode[0] = decode[2];
 8710-    decode[2] = t;
 8711-    decode += 3;
 8712-  }
 8713-}
 8714-
 8715-
 8716-
 8717-static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float * output_buffer STBIR_ONLY_PROFILE_GET_SPLIT_INFO )
 8718-{
 8719-  int channels = stbir_info->channels;
 8720-  int effective_channels = stbir_info->effective_channels;
 8721-  int input_sample_in_bytes = stbir__type_size[stbir_info->input_type] * channels;
 8722-  stbir_edge edge_horizontal = stbir_info->horizontal.edge;
 8723-  stbir_edge edge_vertical = stbir_info->vertical.edge;
 8724-  int row = stbir__edge_wrap(edge_vertical, n, stbir_info->vertical.scale_info.input_full_size);
 8725-  const void* input_plane_data = ( (char *) stbir_info->input_data ) + (size_t)row * (size_t) stbir_info->input_stride_bytes;
 8726-  stbir__span const * spans = stbir_info->scanline_extents.spans;
 8727-  float * full_decode_buffer = output_buffer - stbir_info->scanline_extents.conservative.n0 * effective_channels;
 8728-  float * last_decoded = 0;
 8729-
 8730-  // if we are on edge_zero, and we get in here with an out of bounds n, then the calculate filters has failed
 8731-  STBIR_ASSERT( !(edge_vertical == STBIR_EDGE_ZERO && (n < 0 || n >= stbir_info->vertical.scale_info.input_full_size)) );
 8732-
 8733-  do
 8734-  {
 8735-    float * decode_buffer;
 8736-    void const * input_data;
 8737-    float * end_decode;
 8738-    int width_times_channels;
 8739-    int width;
 8740-
 8741-    if ( spans->n1 < spans->n0 )
 8742-      break;
 8743-
 8744-    width = spans->n1 + 1 - spans->n0;
 8745-    decode_buffer = full_decode_buffer + spans->n0 * effective_channels;
 8746-    end_decode = full_decode_buffer + ( spans->n1 + 1 ) * effective_channels;
 8747-    width_times_channels = width * channels;
 8748-
 8749-    // read directly out of input plane by default
 8750-    input_data = ( (char*)input_plane_data ) + spans->pixel_offset_for_input * input_sample_in_bytes;
 8751-
 8752-    // if we have an input callback, call it to get the input data
 8753-    if ( stbir_info->in_pixels_cb )
 8754-    {
 8755-      // call the callback with a temp buffer (that they can choose to use or not).  the temp is just right aligned memory in the decode_buffer itself
 8756-      input_data = stbir_info->in_pixels_cb( ( (char*) end_decode ) - ( width * input_sample_in_bytes ) + ( ( stbir_info->input_type != STBIR_TYPE_FLOAT ) ? ( sizeof(float)*STBIR_INPUT_CALLBACK_PADDING ) : 0 ), input_plane_data, width, spans->pixel_offset_for_input, row, stbir_info->user_data );
 8757-    }
 8758-
 8759-    STBIR_PROFILE_START( decode );
 8760-    // convert the pixels info the float decode_buffer, (we index from end_decode, so that when channels<effective_channels, we are right justified in the buffer)
 8761-    last_decoded = stbir_info->decode_pixels( (float*)end_decode - width_times_channels, width_times_channels, input_data );
 8762-    STBIR_PROFILE_END( decode );
 8763-
 8764-    if (stbir_info->alpha_weight)
 8765-    {
 8766-      STBIR_PROFILE_START( alpha );
 8767-      stbir_info->alpha_weight( decode_buffer, width_times_channels );
 8768-      STBIR_PROFILE_END( alpha );
 8769-    }
 8770-
 8771-    ++spans;
 8772-  } while ( spans <= ( &stbir_info->scanline_extents.spans[1] ) );
 8773-
 8774-  // handle the edge_wrap filter (all other types are handled back out at the calculate_filter stage)
 8775-  // basically the idea here is that if we have the whole scanline in memory, we don't redecode the
 8776-  //   wrapped edge pixels, and instead just memcpy them from the scanline into the edge positions
 8777-  if ( ( edge_horizontal == STBIR_EDGE_WRAP ) && ( stbir_info->scanline_extents.edge_sizes[0] | stbir_info->scanline_extents.edge_sizes[1] ) )
 8778-  {
 8779-    // this code only runs if we're in edge_wrap, and we're doing the entire scanline
 8780-    int e, start_x[2];
 8781-    int input_full_size = stbir_info->horizontal.scale_info.input_full_size;
 8782-
 8783-    start_x[0] = -stbir_info->scanline_extents.edge_sizes[0];  // left edge start x
 8784-    start_x[1] =  input_full_size;                             // right edge
 8785-
 8786-    for( e = 0; e < 2 ; e++ )
 8787-    {
 8788-      // do each margin
 8789-      int margin = stbir_info->scanline_extents.edge_sizes[e];
 8790-      if ( margin )
 8791-      {
 8792-        int x = start_x[e];
 8793-        float * marg = full_decode_buffer + x * effective_channels;
 8794-        float const * src = full_decode_buffer + stbir__edge_wrap(edge_horizontal, x, input_full_size) * effective_channels;
 8795-        STBIR_MEMCPY( marg, src, margin * effective_channels * sizeof(float) );
 8796-        if ( e == 1 ) last_decoded = marg + margin * effective_channels;
 8797-      }
 8798-    }
 8799-  }
 8800-  
 8801-  // some of the horizontal gathers read one float off the edge (which is masked out), but we force a zero here to make sure no NaNs leak in
 8802-  //   (we can't pre-zero it, because the input callback can use that area as padding)
 8803-  last_decoded[0] = 0.0f; 
 8804-
 8805-  // we clear this extra float, because the final output pixel filter kernel might have used one less coeff than the max filter width
 8806-  //   when this happens, we do read that pixel from the input, so it too could be Nan, so just zero an extra one.
 8807-  //   this fits because each scanline is padded by three floats (STBIR_INPUT_CALLBACK_PADDING)
 8808-  last_decoded[1] = 0.0f;
 8809+		if (alpha < stbir__small_float) {
 8810+			encode[0] = input[0];
 8811+			encode[1] = input[1];
 8812+			encode[2] = input[2];
 8813+		} else {
 8814+			float ialpha = 1.0f / alpha;
 8815+			encode[0] = input[4] * ialpha;
 8816+			encode[1] = input[5] * ialpha;
 8817+			encode[2] = input[6] * ialpha;
 8818+		}
 8819+		encode[3] = alpha;
 8820+#endif
 8821+
 8822+		input += 7;
 8823+		encode += 4;
 8824+	} while (encode < end_output);
 8825 }
 8826 
 8827+//  format: [X A Xpm][X A Xpm] etc
 8828+static void
 8829+stbir__fancy_alpha_unweight_2ch(float *encode_buffer, int width_times_channels)
 8830+{
 8831+	float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer;
 8832+	float STBIR_SIMD_STREAMOUT_PTR(*) input = encode_buffer;
 8833+	float const *end_output = encode_buffer + width_times_channels;
 8834 
 8835-//=================
 8836-// Do 1 channel horizontal routines
 8837+	do {
 8838+		float alpha = input[1];
 8839+		encode[0] = input[0];
 8840+		if (alpha >= stbir__small_float) {
 8841+			encode[0] = input[2] / alpha;
 8842+		}
 8843+		encode[1] = alpha;
 8844+
 8845+		input += 3;
 8846+		encode += 2;
 8847+	} while (encode < end_output);
 8848+}
 8849+
 8850+static void
 8851+stbir__simple_alpha_weight_4ch(float *decode_buffer, int width_times_channels)
 8852+{
 8853+	float STBIR_STREAMOUT_PTR(*) decode = decode_buffer;
 8854+	float const *end_decode = decode_buffer + width_times_channels;
 8855+
 8856+#ifdef STBIR_SIMD
 8857+	{
 8858+		decode += 2 * stbir__simdfX_float_count;
 8859+		STBIR_NO_UNROLL_LOOP_START
 8860+		while (decode <= end_decode) {
 8861+			stbir__simdfX d0, a0, d1, a1;
 8862+			STBIR_NO_UNROLL(decode);
 8863+			stbir__simdfX_load(d0, decode - 2 * stbir__simdfX_float_count);
 8864+			stbir__simdfX_load(d1, decode - 2 * stbir__simdfX_float_count +
 8865+			                           stbir__simdfX_float_count);
 8866+			stbir__simdfX_aaa1(a0, d0, STBIR_onesX);
 8867+			stbir__simdfX_aaa1(a1, d1, STBIR_onesX);
 8868+			stbir__simdfX_mult(d0, d0, a0);
 8869+			stbir__simdfX_mult(d1, d1, a1);
 8870+			stbir__simdfX_store(decode - 2 * stbir__simdfX_float_count, d0);
 8871+			stbir__simdfX_store(decode - 2 * stbir__simdfX_float_count +
 8872+			                        stbir__simdfX_float_count,
 8873+			                    d1);
 8874+			decode += 2 * stbir__simdfX_float_count;
 8875+		}
 8876+		decode -= 2 * stbir__simdfX_float_count;
 8877+
 8878+// few last pixels remnants
 8879+#ifdef STBIR_SIMD8
 8880+		STBIR_NO_UNROLL_LOOP_START
 8881+		while (decode < end_decode)
 8882+#else
 8883+		if (decode < end_decode)
 8884+#endif
 8885+		{
 8886+			stbir__simdf d, a;
 8887+			stbir__simdf_load(d, decode);
 8888+			stbir__simdf_aaa1(a, d, STBIR__CONSTF(STBIR_ones));
 8889+			stbir__simdf_mult(d, d, a);
 8890+			stbir__simdf_store(decode, d);
 8891+			decode += 4;
 8892+		}
 8893+	}
 8894+
 8895+#else
 8896+
 8897+	while (decode < end_decode) {
 8898+		float alpha = decode[3];
 8899+		decode[0] *= alpha;
 8900+		decode[1] *= alpha;
 8901+		decode[2] *= alpha;
 8902+		decode += 4;
 8903+	}
 8904+
 8905+#endif
 8906+}
 8907+
 8908+static void
 8909+stbir__simple_alpha_weight_2ch(float *decode_buffer, int width_times_channels)
 8910+{
 8911+	float STBIR_STREAMOUT_PTR(*) decode = decode_buffer;
 8912+	float const *end_decode = decode_buffer + width_times_channels;
 8913 
 8914 #ifdef STBIR_SIMD
 8915+	decode += 2 * stbir__simdfX_float_count;
 8916+	STBIR_NO_UNROLL_LOOP_START
 8917+	while (decode <= end_decode) {
 8918+		stbir__simdfX d0, a0, d1, a1;
 8919+		STBIR_NO_UNROLL(decode);
 8920+		stbir__simdfX_load(d0, decode - 2 * stbir__simdfX_float_count);
 8921+		stbir__simdfX_load(d1, decode - 2 * stbir__simdfX_float_count +
 8922+		                           stbir__simdfX_float_count);
 8923+		stbir__simdfX_a1a1(a0, d0, STBIR_onesX);
 8924+		stbir__simdfX_a1a1(a1, d1, STBIR_onesX);
 8925+		stbir__simdfX_mult(d0, d0, a0);
 8926+		stbir__simdfX_mult(d1, d1, a1);
 8927+		stbir__simdfX_store(decode - 2 * stbir__simdfX_float_count, d0);
 8928+		stbir__simdfX_store(decode - 2 * stbir__simdfX_float_count +
 8929+		                        stbir__simdfX_float_count,
 8930+		                    d1);
 8931+		decode += 2 * stbir__simdfX_float_count;
 8932+	}
 8933+	decode -= 2 * stbir__simdfX_float_count;
 8934+#endif
 8935+
 8936+	STBIR_SIMD_NO_UNROLL_LOOP_START
 8937+	while (decode < end_decode) {
 8938+		float alpha = decode[1];
 8939+		STBIR_SIMD_NO_UNROLL(decode);
 8940+		decode[0] *= alpha;
 8941+		decode += 2;
 8942+	}
 8943+}
 8944 
 8945-#define stbir__1_coeff_only()          \
 8946-    stbir__simdf tot,c;                \
 8947-    STBIR_SIMD_NO_UNROLL(decode);      \
 8948-    stbir__simdf_load1( c, hc );       \
 8949-    stbir__simdf_mult1_mem( tot, c, decode );
 8950-
 8951-#define stbir__2_coeff_only()          \
 8952-    stbir__simdf tot,c,d;              \
 8953-    STBIR_SIMD_NO_UNROLL(decode);      \
 8954-    stbir__simdf_load2z( c, hc );      \
 8955-    stbir__simdf_load2( d, decode );   \
 8956-    stbir__simdf_mult( tot, c, d );    \
 8957-    stbir__simdf_0123to1230( c, tot ); \
 8958-    stbir__simdf_add1( tot, tot, c );
 8959-
 8960-#define stbir__3_coeff_only()                  \
 8961-    stbir__simdf tot,c,t;                      \
 8962-    STBIR_SIMD_NO_UNROLL(decode);              \
 8963-    stbir__simdf_load( c, hc );                \
 8964-    stbir__simdf_mult_mem( tot, c, decode );   \
 8965-    stbir__simdf_0123to1230( c, tot );         \
 8966-    stbir__simdf_0123to2301( t, tot );         \
 8967-    stbir__simdf_add1( tot, tot, c );          \
 8968-    stbir__simdf_add1( tot, tot, t );
 8969-
 8970-#define stbir__store_output_tiny()                \
 8971-    stbir__simdf_store1( output, tot );           \
 8972-    horizontal_coefficients += coefficient_width; \
 8973-    ++horizontal_contributors;                    \
 8974-    output += 1;
 8975-
 8976-#define stbir__4_coeff_start()                 \
 8977-    stbir__simdf tot,c;                        \
 8978-    STBIR_SIMD_NO_UNROLL(decode);              \
 8979-    stbir__simdf_load( c, hc );                \
 8980-    stbir__simdf_mult_mem( tot, c, decode );   \
 8981-
 8982-#define stbir__4_coeff_continue_from_4( ofs )  \
 8983-    STBIR_SIMD_NO_UNROLL(decode);              \
 8984-    stbir__simdf_load( c, hc + (ofs) );        \
 8985-    stbir__simdf_madd_mem( tot, tot, c, decode+(ofs) );
 8986-
 8987-#define stbir__1_coeff_remnant( ofs )          \
 8988-    { stbir__simdf d;                          \
 8989-    stbir__simdf_load1z( c, hc + (ofs) );      \
 8990-    stbir__simdf_load1( d, decode + (ofs) );   \
 8991-    stbir__simdf_madd( tot, tot, d, c ); }
 8992-
 8993-#define stbir__2_coeff_remnant( ofs )          \
 8994-    { stbir__simdf d;                          \
 8995-    stbir__simdf_load2z( c, hc+(ofs) );        \
 8996-    stbir__simdf_load2( d, decode+(ofs) );     \
 8997-    stbir__simdf_madd( tot, tot, d, c ); }
 8998-
 8999-#define stbir__3_coeff_setup()                 \
 9000-    stbir__simdf mask;                         \
 9001-    stbir__simdf_load( mask, STBIR_mask + 3 );
 9002-
 9003-#define stbir__3_coeff_remnant( ofs )                  \
 9004-    stbir__simdf_load( c, hc+(ofs) );                  \
 9005-    stbir__simdf_and( c, c, mask );                    \
 9006-    stbir__simdf_madd_mem( tot, tot, c, decode+(ofs) );
 9007-
 9008-#define stbir__store_output()                     \
 9009-    stbir__simdf_0123to2301( c, tot );            \
 9010-    stbir__simdf_add( tot, tot, c );              \
 9011-    stbir__simdf_0123to1230( c, tot );            \
 9012-    stbir__simdf_add1( tot, tot, c );             \
 9013-    stbir__simdf_store1( output, tot );           \
 9014-    horizontal_coefficients += coefficient_width; \
 9015-    ++horizontal_contributors;                    \
 9016-    output += 1;
 9017+static void
 9018+stbir__simple_alpha_unweight_4ch(float *encode_buffer, int width_times_channels)
 9019+{
 9020+	float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer;
 9021+	float const *end_output = encode_buffer + width_times_channels;
 9022+
 9023+	STBIR_SIMD_NO_UNROLL_LOOP_START
 9024+	do {
 9025+		float alpha = encode[3];
 9026 
 9027+#ifdef STBIR_SIMD
 9028+		stbir__simdf i, ia;
 9029+		STBIR_SIMD_NO_UNROLL(encode);
 9030+		if (alpha >= stbir__small_float) {
 9031+			stbir__simdf_load1frep4(ia, 1.0f / alpha);
 9032+			stbir__simdf_load(i, encode);
 9033+			stbir__simdf_mult(i, i, ia);
 9034+			stbir__simdf_store(encode, i);
 9035+			encode[3] = alpha;
 9036+		}
 9037 #else
 9038+		if (alpha >= stbir__small_float) {
 9039+			float ialpha = 1.0f / alpha;
 9040+			encode[0] *= ialpha;
 9041+			encode[1] *= ialpha;
 9042+			encode[2] *= ialpha;
 9043+		}
 9044+#endif
 9045+		encode += 4;
 9046+	} while (encode < end_output);
 9047+}
 9048 
 9049-#define stbir__1_coeff_only()  \
 9050-    float tot;                 \
 9051-    tot = decode[0]*hc[0];
 9052+static void
 9053+stbir__simple_alpha_unweight_2ch(float *encode_buffer, int width_times_channels)
 9054+{
 9055+	float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer;
 9056+	float const *end_output = encode_buffer + width_times_channels;
 9057 
 9058-#define stbir__2_coeff_only()  \
 9059-    float tot;                 \
 9060-    tot = decode[0] * hc[0];   \
 9061-    tot += decode[1] * hc[1];
 9062+	do {
 9063+		float alpha = encode[1];
 9064+		if (alpha >= stbir__small_float) {
 9065+			encode[0] /= alpha;
 9066+		}
 9067+		encode += 2;
 9068+	} while (encode < end_output);
 9069+}
 9070 
 9071-#define stbir__3_coeff_only()  \
 9072-    float tot;                 \
 9073-    tot = decode[0] * hc[0];   \
 9074-    tot += decode[1] * hc[1];  \
 9075-    tot += decode[2] * hc[2];
 9076+// only used in RGB->BGR or BGR->RGB
 9077+static void
 9078+stbir__simple_flip_3ch(float *decode_buffer, int width_times_channels)
 9079+{
 9080+	float STBIR_STREAMOUT_PTR(*) decode = decode_buffer;
 9081+	float const *end_decode = decode_buffer + width_times_channels;
 9082 
 9083-#define stbir__store_output_tiny()                \
 9084-    output[0] = tot;                              \
 9085-    horizontal_coefficients += coefficient_width; \
 9086-    ++horizontal_contributors;                    \
 9087-    output += 1;
 9088+#ifdef STBIR_SIMD
 9089+#ifdef stbir__simdf_swiz2 // do we have two argument swizzles?
 9090+	end_decode -= 12;
 9091+	STBIR_NO_UNROLL_LOOP_START
 9092+	while (decode <= end_decode) {
 9093+		// on arm64 8 instructions, no overlapping stores
 9094+		stbir__simdf a, b, c, na, nb;
 9095+		STBIR_SIMD_NO_UNROLL(decode);
 9096+		stbir__simdf_load(a, decode);
 9097+		stbir__simdf_load(b, decode + 4);
 9098+		stbir__simdf_load(c, decode + 8);
 9099+
 9100+		na = stbir__simdf_swiz2(a, b, 2, 1, 0, 5);
 9101+		b = stbir__simdf_swiz2(a, b, 4, 3, 6, 7);
 9102+		nb = stbir__simdf_swiz2(b, c, 0, 1, 4, 3);
 9103+		c = stbir__simdf_swiz2(b, c, 2, 7, 6, 5);
 9104+
 9105+		stbir__simdf_store(decode, na);
 9106+		stbir__simdf_store(decode + 4, nb);
 9107+		stbir__simdf_store(decode + 8, c);
 9108+		decode += 12;
 9109+	}
 9110+	end_decode += 12;
 9111+#else
 9112+	end_decode -= 24;
 9113+	STBIR_NO_UNROLL_LOOP_START
 9114+	while (decode <= end_decode) {
 9115+		// 26 instructions on x64
 9116+		stbir__simdf a, b, c, d, e, f, g;
 9117+		float i21, i23;
 9118+		STBIR_SIMD_NO_UNROLL(decode);
 9119+		stbir__simdf_load(a, decode);
 9120+		stbir__simdf_load(b, decode + 3);
 9121+		stbir__simdf_load(c, decode + 6);
 9122+		stbir__simdf_load(d, decode + 9);
 9123+		stbir__simdf_load(e, decode + 12);
 9124+		stbir__simdf_load(f, decode + 15);
 9125+		stbir__simdf_load(g, decode + 18);
 9126+
 9127+		a = stbir__simdf_swiz(a, 2, 1, 0, 3);
 9128+		b = stbir__simdf_swiz(b, 2, 1, 0, 3);
 9129+		c = stbir__simdf_swiz(c, 2, 1, 0, 3);
 9130+		d = stbir__simdf_swiz(d, 2, 1, 0, 3);
 9131+		e = stbir__simdf_swiz(e, 2, 1, 0, 3);
 9132+		f = stbir__simdf_swiz(f, 2, 1, 0, 3);
 9133+		g = stbir__simdf_swiz(g, 2, 1, 0, 3);
 9134+
 9135+		// stores overlap, need to be in order,
 9136+		stbir__simdf_store(decode, a);
 9137+		i21 = decode[21];
 9138+		stbir__simdf_store(decode + 3, b);
 9139+		i23 = decode[23];
 9140+		stbir__simdf_store(decode + 6, c);
 9141+		stbir__simdf_store(decode + 9, d);
 9142+		stbir__simdf_store(decode + 12, e);
 9143+		stbir__simdf_store(decode + 15, f);
 9144+		stbir__simdf_store(decode + 18, g);
 9145+		decode[21] = i23;
 9146+		decode[23] = i21;
 9147+		decode += 24;
 9148+	}
 9149+	end_decode += 24;
 9150+#endif
 9151+#else
 9152+	end_decode -= 12;
 9153+	STBIR_NO_UNROLL_LOOP_START
 9154+	while (decode <= end_decode) {
 9155+		// 16 instructions
 9156+		float t0, t1, t2, t3;
 9157+		STBIR_NO_UNROLL(decode);
 9158+		t0 = decode[0];
 9159+		t1 = decode[3];
 9160+		t2 = decode[6];
 9161+		t3 = decode[9];
 9162+		decode[0] = decode[2];
 9163+		decode[3] = decode[5];
 9164+		decode[6] = decode[8];
 9165+		decode[9] = decode[11];
 9166+		decode[2] = t0;
 9167+		decode[5] = t1;
 9168+		decode[8] = t2;
 9169+		decode[11] = t3;
 9170+		decode += 12;
 9171+	}
 9172+	end_decode += 12;
 9173+#endif
 9174 
 9175-#define stbir__4_coeff_start()  \
 9176-    float tot0,tot1,tot2,tot3;  \
 9177-    tot0 = decode[0] * hc[0];   \
 9178-    tot1 = decode[1] * hc[1];   \
 9179-    tot2 = decode[2] * hc[2];   \
 9180-    tot3 = decode[3] * hc[3];
 9181+	STBIR_NO_UNROLL_LOOP_START
 9182+	while (decode < end_decode) {
 9183+		float t = decode[0];
 9184+		STBIR_NO_UNROLL(decode);
 9185+		decode[0] = decode[2];
 9186+		decode[2] = t;
 9187+		decode += 3;
 9188+	}
 9189+}
 9190+
 9191+static void
 9192+stbir__decode_scanline(stbir__info const *stbir_info, int n,
 9193+                       float *output_buffer STBIR_ONLY_PROFILE_GET_SPLIT_INFO)
 9194+{
 9195+	int channels = stbir_info->channels;
 9196+	int effective_channels = stbir_info->effective_channels;
 9197+	int input_sample_in_bytes =
 9198+	    stbir__type_size[stbir_info->input_type] * channels;
 9199+	stbir_edge edge_horizontal = stbir_info->horizontal.edge;
 9200+	stbir_edge edge_vertical = stbir_info->vertical.edge;
 9201+	int row = stbir__edge_wrap(edge_vertical, n,
 9202+	                           stbir_info->vertical.scale_info.input_full_size);
 9203+	const void *input_plane_data =
 9204+	    ((char *)stbir_info->input_data) +
 9205+	    (size_t)row * (size_t)stbir_info->input_stride_bytes;
 9206+	stbir__span const *spans = stbir_info->scanline_extents.spans;
 9207+	float *full_decode_buffer =
 9208+	    output_buffer -
 9209+	    stbir_info->scanline_extents.conservative.n0 * effective_channels;
 9210+	float *last_decoded = 0;
 9211+
 9212+	// if we are on edge_zero, and we get in here with an out of bounds n, then
 9213+	// the calculate filters has failed
 9214+	STBIR_ASSERT(
 9215+	    !(edge_vertical == STBIR_EDGE_ZERO &&
 9216+	      (n < 0 || n >= stbir_info->vertical.scale_info.input_full_size)));
 9217+
 9218+	do {
 9219+		float *decode_buffer;
 9220+		void const *input_data;
 9221+		float *end_decode;
 9222+		int width_times_channels;
 9223+		int width;
 9224+
 9225+		if (spans->n1 < spans->n0) {
 9226+			break;
 9227+		}
 9228+
 9229+		width = spans->n1 + 1 - spans->n0;
 9230+		decode_buffer = full_decode_buffer + spans->n0 * effective_channels;
 9231+		end_decode = full_decode_buffer + (spans->n1 + 1) * effective_channels;
 9232+		width_times_channels = width * channels;
 9233+
 9234+		// read directly out of input plane by default
 9235+		input_data = ((char *)input_plane_data) +
 9236+		             spans->pixel_offset_for_input * input_sample_in_bytes;
 9237+
 9238+		// if we have an input callback, call it to get the input data
 9239+		if (stbir_info->in_pixels_cb) {
 9240+			// call the callback with a temp buffer (that they can choose to use
 9241+			// or not).  the temp is just right aligned memory in the
 9242+			// decode_buffer itself
 9243+			input_data = stbir_info->in_pixels_cb(
 9244+			    ((char *)end_decode) - (width * input_sample_in_bytes) +
 9245+			        ((stbir_info->input_type != STBIR_TYPE_FLOAT)
 9246+			             ? (sizeof(float) * STBIR_INPUT_CALLBACK_PADDING)
 9247+			             : 0),
 9248+			    input_plane_data, width, spans->pixel_offset_for_input, row,
 9249+			    stbir_info->user_data);
 9250+		}
 9251+
 9252+		STBIR_PROFILE_START(decode);
 9253+		// convert the pixels info the float decode_buffer, (we index from
 9254+		// end_decode, so that when channels<effective_channels, we are right
 9255+		// justified in the buffer)
 9256+		last_decoded = stbir_info->decode_pixels(
 9257+		    (float *)end_decode - width_times_channels, width_times_channels,
 9258+		    input_data);
 9259+		STBIR_PROFILE_END(decode);
 9260+
 9261+		if (stbir_info->alpha_weight) {
 9262+			STBIR_PROFILE_START(alpha);
 9263+			stbir_info->alpha_weight(decode_buffer, width_times_channels);
 9264+			STBIR_PROFILE_END(alpha);
 9265+		}
 9266+
 9267+		++spans;
 9268+	} while (spans <= (&stbir_info->scanline_extents.spans[1]));
 9269+
 9270+	// handle the edge_wrap filter (all other types are handled back out at the
 9271+	// calculate_filter stage) basically the idea here is that if we have the
 9272+	// whole scanline in memory, we don't redecode the
 9273+	//   wrapped edge pixels, and instead just memcpy them from the scanline
 9274+	//   into the edge positions
 9275+	if ((edge_horizontal == STBIR_EDGE_WRAP) &&
 9276+	    (stbir_info->scanline_extents.edge_sizes[0] |
 9277+	     stbir_info->scanline_extents.edge_sizes[1])) {
 9278+		// this code only runs if we're in edge_wrap, and we're doing the entire
 9279+		// scanline
 9280+		int e, start_x[2];
 9281+		int input_full_size = stbir_info->horizontal.scale_info.input_full_size;
 9282+
 9283+		start_x[0] =
 9284+		    -stbir_info->scanline_extents.edge_sizes[0]; // left edge start x
 9285+		start_x[1] = input_full_size;                    // right edge
 9286+
 9287+		for (e = 0; e < 2; e++) {
 9288+			// do each margin
 9289+			int margin = stbir_info->scanline_extents.edge_sizes[e];
 9290+			if (margin) {
 9291+				int x = start_x[e];
 9292+				float *marg = full_decode_buffer + x * effective_channels;
 9293+				float const *src =
 9294+				    full_decode_buffer +
 9295+				    stbir__edge_wrap(edge_horizontal, x, input_full_size) *
 9296+				        effective_channels;
 9297+				STBIR_MEMCPY(marg, src,
 9298+				             margin * effective_channels * sizeof(float));
 9299+				if (e == 1) {
 9300+					last_decoded = marg + margin * effective_channels;
 9301+				}
 9302+			}
 9303+		}
 9304+	}
 9305+
 9306+	// some of the horizontal gathers read one float off the edge (which is
 9307+	// masked out), but we force a zero here to make sure no NaNs leak in
 9308+	//   (we can't pre-zero it, because the input callback can use that area as
 9309+	//   padding)
 9310+	last_decoded[0] = 0.0f;
 9311+
 9312+	// we clear this extra float, because the final output pixel filter kernel
 9313+	// might have used one less coeff than the max filter width
 9314+	//   when this happens, we do read that pixel from the input, so it too
 9315+	//   could be NaN, so just zero an extra one. this fits because each
 9316+	//   scanline is padded by three floats (STBIR_INPUT_CALLBACK_PADDING)
 9317+	last_decoded[1] = 0.0f;
 9318+}
 9319 
 9320-#define stbir__4_coeff_continue_from_4( ofs )  \
 9321-    tot0 += decode[0+(ofs)] * hc[0+(ofs)];     \
 9322-    tot1 += decode[1+(ofs)] * hc[1+(ofs)];     \
 9323-    tot2 += decode[2+(ofs)] * hc[2+(ofs)];     \
 9324-    tot3 += decode[3+(ofs)] * hc[3+(ofs)];
 9325+//=================
 9326+// Do 1 channel horizontal routines
 9327 
 9328-#define stbir__1_coeff_remnant( ofs )        \
 9329-    tot0 += decode[0+(ofs)] * hc[0+(ofs)];
 9330+#ifdef STBIR_SIMD
 9331 
 9332-#define stbir__2_coeff_remnant( ofs )        \
 9333-    tot0 += decode[0+(ofs)] * hc[0+(ofs)];   \
 9334-    tot1 += decode[1+(ofs)] * hc[1+(ofs)];   \
 9335+#define stbir__1_coeff_only()                                                  \
 9336+	stbir__simdf tot, c;                                                       \
 9337+	STBIR_SIMD_NO_UNROLL(decode);                                              \
 9338+	stbir__simdf_load1(c, hc);                                                 \
 9339+	stbir__simdf_mult1_mem(tot, c, decode);
 9340+
 9341+#define stbir__2_coeff_only()                                                  \
 9342+	stbir__simdf tot, c, d;                                                    \
 9343+	STBIR_SIMD_NO_UNROLL(decode);                                              \
 9344+	stbir__simdf_load2z(c, hc);                                                \
 9345+	stbir__simdf_load2(d, decode);                                             \
 9346+	stbir__simdf_mult(tot, c, d);                                              \
 9347+	stbir__simdf_0123to1230(c, tot);                                           \
 9348+	stbir__simdf_add1(tot, tot, c);
 9349+
 9350+#define stbir__3_coeff_only()                                                  \
 9351+	stbir__simdf tot, c, t;                                                    \
 9352+	STBIR_SIMD_NO_UNROLL(decode);                                              \
 9353+	stbir__simdf_load(c, hc);                                                  \
 9354+	stbir__simdf_mult_mem(tot, c, decode);                                     \
 9355+	stbir__simdf_0123to1230(c, tot);                                           \
 9356+	stbir__simdf_0123to2301(t, tot);                                           \
 9357+	stbir__simdf_add1(tot, tot, c);                                            \
 9358+	stbir__simdf_add1(tot, tot, t);
 9359+
 9360+#define stbir__store_output_tiny()                                             \
 9361+	stbir__simdf_store1(output, tot);                                          \
 9362+	horizontal_coefficients += coefficient_width;                              \
 9363+	++horizontal_contributors;                                                 \
 9364+	output += 1;
 9365+
 9366+#define stbir__4_coeff_start()                                                 \
 9367+	stbir__simdf tot, c;                                                       \
 9368+	STBIR_SIMD_NO_UNROLL(decode);                                              \
 9369+	stbir__simdf_load(c, hc);                                                  \
 9370+	stbir__simdf_mult_mem(tot, c, decode);
 9371+
 9372+#define stbir__4_coeff_continue_from_4(ofs)                                    \
 9373+	STBIR_SIMD_NO_UNROLL(decode);                                              \
 9374+	stbir__simdf_load(c, hc + (ofs));                                          \
 9375+	stbir__simdf_madd_mem(tot, tot, c, decode + (ofs));
 9376+
 9377+#define stbir__1_coeff_remnant(ofs)                                            \
 9378+	{                                                                          \
 9379+		stbir__simdf d;                                                        \
 9380+		stbir__simdf_load1z(c, hc + (ofs));                                    \
 9381+		stbir__simdf_load1(d, decode + (ofs));                                 \
 9382+		stbir__simdf_madd(tot, tot, d, c);                                     \
 9383+	}
 9384+
 9385+#define stbir__2_coeff_remnant(ofs)                                            \
 9386+	{                                                                          \
 9387+		stbir__simdf d;                                                        \
 9388+		stbir__simdf_load2z(c, hc + (ofs));                                    \
 9389+		stbir__simdf_load2(d, decode + (ofs));                                 \
 9390+		stbir__simdf_madd(tot, tot, d, c);                                     \
 9391+	}
 9392+
 9393+#define stbir__3_coeff_setup()                                                 \
 9394+	stbir__simdf mask;                                                         \
 9395+	stbir__simdf_load(mask, STBIR_mask + 3);
 9396+
 9397+#define stbir__3_coeff_remnant(ofs)                                            \
 9398+	stbir__simdf_load(c, hc + (ofs));                                          \
 9399+	stbir__simdf_and(c, c, mask);                                              \
 9400+	stbir__simdf_madd_mem(tot, tot, c, decode + (ofs));
 9401+
 9402+#define stbir__store_output()                                                  \
 9403+	stbir__simdf_0123to2301(c, tot);                                           \
 9404+	stbir__simdf_add(tot, tot, c);                                             \
 9405+	stbir__simdf_0123to1230(c, tot);                                           \
 9406+	stbir__simdf_add1(tot, tot, c);                                            \
 9407+	stbir__simdf_store1(output, tot);                                          \
 9408+	horizontal_coefficients += coefficient_width;                              \
 9409+	++horizontal_contributors;                                                 \
 9410+	output += 1;
 9411 
 9412-#define stbir__3_coeff_remnant( ofs )        \
 9413-    tot0 += decode[0+(ofs)] * hc[0+(ofs)];   \
 9414-    tot1 += decode[1+(ofs)] * hc[1+(ofs)];   \
 9415-    tot2 += decode[2+(ofs)] * hc[2+(ofs)];
 9416+#else
 9417 
 9418-#define stbir__store_output()                     \
 9419-    output[0] = (tot0+tot2)+(tot1+tot3);          \
 9420-    horizontal_coefficients += coefficient_width; \
 9421-    ++horizontal_contributors;                    \
 9422-    output += 1;
 9423+#define stbir__1_coeff_only()                                                  \
 9424+	float tot;                                                                 \
 9425+	tot = decode[0] * hc[0];
 9426+
 9427+#define stbir__2_coeff_only()                                                  \
 9428+	float tot;                                                                 \
 9429+	tot = decode[0] * hc[0];                                                   \
 9430+	tot += decode[1] * hc[1];
 9431+
 9432+#define stbir__3_coeff_only()                                                  \
 9433+	float tot;                                                                 \
 9434+	tot = decode[0] * hc[0];                                                   \
 9435+	tot += decode[1] * hc[1];                                                  \
 9436+	tot += decode[2] * hc[2];
 9437+
 9438+#define stbir__store_output_tiny()                                             \
 9439+	output[0] = tot;                                                           \
 9440+	horizontal_coefficients += coefficient_width;                              \
 9441+	++horizontal_contributors;                                                 \
 9442+	output += 1;
 9443+
 9444+#define stbir__4_coeff_start()                                                 \
 9445+	float tot0, tot1, tot2, tot3;                                              \
 9446+	tot0 = decode[0] * hc[0];                                                  \
 9447+	tot1 = decode[1] * hc[1];                                                  \
 9448+	tot2 = decode[2] * hc[2];                                                  \
 9449+	tot3 = decode[3] * hc[3];
 9450+
 9451+#define stbir__4_coeff_continue_from_4(ofs)                                    \
 9452+	tot0 += decode[0 + (ofs)] * hc[0 + (ofs)];                                 \
 9453+	tot1 += decode[1 + (ofs)] * hc[1 + (ofs)];                                 \
 9454+	tot2 += decode[2 + (ofs)] * hc[2 + (ofs)];                                 \
 9455+	tot3 += decode[3 + (ofs)] * hc[3 + (ofs)];
 9456+
 9457+#define stbir__1_coeff_remnant(ofs) tot0 += decode[0 + (ofs)] * hc[0 + (ofs)];
 9458+
 9459+#define stbir__2_coeff_remnant(ofs)                                            \
 9460+	tot0 += decode[0 + (ofs)] * hc[0 + (ofs)];                                 \
 9461+	tot1 += decode[1 + (ofs)] * hc[1 + (ofs)];
 9462+
 9463+#define stbir__3_coeff_remnant(ofs)                                            \
 9464+	tot0 += decode[0 + (ofs)] * hc[0 + (ofs)];                                 \
 9465+	tot1 += decode[1 + (ofs)] * hc[1 + (ofs)];                                 \
 9466+	tot2 += decode[2 + (ofs)] * hc[2 + (ofs)];
 9467+
 9468+#define stbir__store_output()                                                  \
 9469+	output[0] = (tot0 + tot2) + (tot1 + tot3);                                 \
 9470+	horizontal_coefficients += coefficient_width;                              \
 9471+	++horizontal_contributors;                                                 \
 9472+	output += 1;
 9473 
 9474 #endif
 9475 
 9476@@ -4812,239 +5686,251 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
 9477 #define STB_IMAGE_RESIZE_DO_HORIZONTALS
 9478 #include STBIR__HEADER_FILENAME
 9479 
 9480-
 9481 //=================
 9482 // Do 2 channel horizontal routines
 9483 
 9484 #ifdef STBIR_SIMD
 9485 
 9486-#define stbir__1_coeff_only()         \
 9487-    stbir__simdf tot,c,d;             \
 9488-    STBIR_SIMD_NO_UNROLL(decode);     \
 9489-    stbir__simdf_load1z( c, hc );     \
 9490-    stbir__simdf_0123to0011( c, c );  \
 9491-    stbir__simdf_load2( d, decode );  \
 9492-    stbir__simdf_mult( tot, d, c );
 9493-
 9494-#define stbir__2_coeff_only()         \
 9495-    stbir__simdf tot,c;               \
 9496-    STBIR_SIMD_NO_UNROLL(decode);     \
 9497-    stbir__simdf_load2( c, hc );      \
 9498-    stbir__simdf_0123to0011( c, c );  \
 9499-    stbir__simdf_mult_mem( tot, c, decode );
 9500-
 9501-#define stbir__3_coeff_only()                \
 9502-    stbir__simdf tot,c,cs,d;                 \
 9503-    STBIR_SIMD_NO_UNROLL(decode);            \
 9504-    stbir__simdf_load( cs, hc );             \
 9505-    stbir__simdf_0123to0011( c, cs );        \
 9506-    stbir__simdf_mult_mem( tot, c, decode ); \
 9507-    stbir__simdf_0123to2222( c, cs );        \
 9508-    stbir__simdf_load2z( d, decode+4 );      \
 9509-    stbir__simdf_madd( tot, tot, d, c );
 9510-
 9511-#define stbir__store_output_tiny()                \
 9512-    stbir__simdf_0123to2301( c, tot );            \
 9513-    stbir__simdf_add( tot, tot, c );              \
 9514-    stbir__simdf_store2( output, tot );           \
 9515-    horizontal_coefficients += coefficient_width; \
 9516-    ++horizontal_contributors;                    \
 9517-    output += 2;
 9518+#define stbir__1_coeff_only()                                                  \
 9519+	stbir__simdf tot, c, d;                                                    \
 9520+	STBIR_SIMD_NO_UNROLL(decode);                                              \
 9521+	stbir__simdf_load1z(c, hc);                                                \
 9522+	stbir__simdf_0123to0011(c, c);                                             \
 9523+	stbir__simdf_load2(d, decode);                                             \
 9524+	stbir__simdf_mult(tot, d, c);
 9525+
 9526+#define stbir__2_coeff_only()                                                  \
 9527+	stbir__simdf tot, c;                                                       \
 9528+	STBIR_SIMD_NO_UNROLL(decode);                                              \
 9529+	stbir__simdf_load2(c, hc);                                                 \
 9530+	stbir__simdf_0123to0011(c, c);                                             \
 9531+	stbir__simdf_mult_mem(tot, c, decode);
 9532+
 9533+#define stbir__3_coeff_only()                                                  \
 9534+	stbir__simdf tot, c, cs, d;                                                \
 9535+	STBIR_SIMD_NO_UNROLL(decode);                                              \
 9536+	stbir__simdf_load(cs, hc);                                                 \
 9537+	stbir__simdf_0123to0011(c, cs);                                            \
 9538+	stbir__simdf_mult_mem(tot, c, decode);                                     \
 9539+	stbir__simdf_0123to2222(c, cs);                                            \
 9540+	stbir__simdf_load2z(d, decode + 4);                                        \
 9541+	stbir__simdf_madd(tot, tot, d, c);
 9542+
 9543+#define stbir__store_output_tiny()                                             \
 9544+	stbir__simdf_0123to2301(c, tot);                                           \
 9545+	stbir__simdf_add(tot, tot, c);                                             \
 9546+	stbir__simdf_store2(output, tot);                                          \
 9547+	horizontal_coefficients += coefficient_width;                              \
 9548+	++horizontal_contributors;                                                 \
 9549+	output += 2;
 9550 
 9551 #ifdef STBIR_SIMD8
 9552 
 9553-#define stbir__4_coeff_start()                    \
 9554-    stbir__simdf8 tot0,c,cs;                      \
 9555-    STBIR_SIMD_NO_UNROLL(decode);                 \
 9556-    stbir__simdf8_load4b( cs, hc );               \
 9557-    stbir__simdf8_0123to00112233( c, cs );        \
 9558-    stbir__simdf8_mult_mem( tot0, c, decode );
 9559-
 9560-#define stbir__4_coeff_continue_from_4( ofs )        \
 9561-    STBIR_SIMD_NO_UNROLL(decode);                    \
 9562-    stbir__simdf8_load4b( cs, hc + (ofs) );          \
 9563-    stbir__simdf8_0123to00112233( c, cs );           \
 9564-    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*2 );
 9565-
 9566-#define stbir__1_coeff_remnant( ofs )                \
 9567-    { stbir__simdf t,d;                              \
 9568-    stbir__simdf_load1z( t, hc + (ofs) );            \
 9569-    stbir__simdf_load2( d, decode + (ofs) * 2 );     \
 9570-    stbir__simdf_0123to0011( t, t );                 \
 9571-    stbir__simdf_mult( t, t, d );                    \
 9572-    stbir__simdf8_add4( tot0, tot0, t ); }
 9573- 
 9574-#define stbir__2_coeff_remnant( ofs )                \
 9575-    { stbir__simdf t;                                \
 9576-    stbir__simdf_load2( t, hc + (ofs) );             \
 9577-    stbir__simdf_0123to0011( t, t );                 \
 9578-    stbir__simdf_mult_mem( t, t, decode+(ofs)*2 );   \
 9579-    stbir__simdf8_add4( tot0, tot0, t ); }
 9580-
 9581-#define stbir__3_coeff_remnant( ofs )                \
 9582-    { stbir__simdf8 d;                               \
 9583-    stbir__simdf8_load4b( cs, hc + (ofs) );          \
 9584-    stbir__simdf8_0123to00112233( c, cs );           \
 9585-    stbir__simdf8_load6z( d, decode+(ofs)*2 );       \
 9586-    stbir__simdf8_madd( tot0, tot0, c, d ); }
 9587-
 9588-#define stbir__store_output()                     \
 9589-    { stbir__simdf t,d;                           \
 9590-    stbir__simdf8_add4halves( t, stbir__if_simdf8_cast_to_simdf4(tot0), tot0 );    \
 9591-    stbir__simdf_0123to2301( d, t );              \
 9592-    stbir__simdf_add( t, t, d );                  \
 9593-    stbir__simdf_store2( output, t );             \
 9594-    horizontal_coefficients += coefficient_width; \
 9595-    ++horizontal_contributors;                    \
 9596-    output += 2; }
 9597+#define stbir__4_coeff_start()                                                 \
 9598+	stbir__simdf8 tot0, c, cs;                                                 \
 9599+	STBIR_SIMD_NO_UNROLL(decode);                                              \
 9600+	stbir__simdf8_load4b(cs, hc);                                              \
 9601+	stbir__simdf8_0123to00112233(c, cs);                                       \
 9602+	stbir__simdf8_mult_mem(tot0, c, decode);
 9603+
 9604+#define stbir__4_coeff_continue_from_4(ofs)                                    \
 9605+	STBIR_SIMD_NO_UNROLL(decode);                                              \
 9606+	stbir__simdf8_load4b(cs, hc + (ofs));                                      \
 9607+	stbir__simdf8_0123to00112233(c, cs);                                       \
 9608+	stbir__simdf8_madd_mem(tot0, tot0, c, decode + (ofs) * 2);
 9609+
 9610+#define stbir__1_coeff_remnant(ofs)                                            \
 9611+	{                                                                          \
 9612+		stbir__simdf t, d;                                                     \
 9613+		stbir__simdf_load1z(t, hc + (ofs));                                    \
 9614+		stbir__simdf_load2(d, decode + (ofs) * 2);                             \
 9615+		stbir__simdf_0123to0011(t, t);                                         \
 9616+		stbir__simdf_mult(t, t, d);                                            \
 9617+		stbir__simdf8_add4(tot0, tot0, t);                                     \
 9618+	}
 9619+
 9620+#define stbir__2_coeff_remnant(ofs)                                            \
 9621+	{                                                                          \
 9622+		stbir__simdf t;                                                        \
 9623+		stbir__simdf_load2(t, hc + (ofs));                                     \
 9624+		stbir__simdf_0123to0011(t, t);                                         \
 9625+		stbir__simdf_mult_mem(t, t, decode + (ofs) * 2);                       \
 9626+		stbir__simdf8_add4(tot0, tot0, t);                                     \
 9627+	}
 9628+
 9629+#define stbir__3_coeff_remnant(ofs)                                            \
 9630+	{                                                                          \
 9631+		stbir__simdf8 d;                                                       \
 9632+		stbir__simdf8_load4b(cs, hc + (ofs));                                  \
 9633+		stbir__simdf8_0123to00112233(c, cs);                                   \
 9634+		stbir__simdf8_load6z(d, decode + (ofs) * 2);                           \
 9635+		stbir__simdf8_madd(tot0, tot0, c, d);                                  \
 9636+	}
 9637+
 9638+#define stbir__store_output()                                                  \
 9639+	{                                                                          \
 9640+		stbir__simdf t, d;                                                     \
 9641+		stbir__simdf8_add4halves(t, stbir__if_simdf8_cast_to_simdf4(tot0),     \
 9642+		                         tot0);                                        \
 9643+		stbir__simdf_0123to2301(d, t);                                         \
 9644+		stbir__simdf_add(t, t, d);                                             \
 9645+		stbir__simdf_store2(output, t);                                        \
 9646+		horizontal_coefficients += coefficient_width;                          \
 9647+		++horizontal_contributors;                                             \
 9648+		output += 2;                                                           \
 9649+	}
 9650 
 9651 #else
 9652 
 9653-#define stbir__4_coeff_start()                   \
 9654-    stbir__simdf tot0,tot1,c,cs;                 \
 9655-    STBIR_SIMD_NO_UNROLL(decode);                \
 9656-    stbir__simdf_load( cs, hc );                 \
 9657-    stbir__simdf_0123to0011( c, cs );            \
 9658-    stbir__simdf_mult_mem( tot0, c, decode );    \
 9659-    stbir__simdf_0123to2233( c, cs );            \
 9660-    stbir__simdf_mult_mem( tot1, c, decode+4 );
 9661-
 9662-#define stbir__4_coeff_continue_from_4( ofs )                \
 9663-    STBIR_SIMD_NO_UNROLL(decode);                            \
 9664-    stbir__simdf_load( cs, hc + (ofs) );                     \
 9665-    stbir__simdf_0123to0011( c, cs );                        \
 9666-    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*2 );  \
 9667-    stbir__simdf_0123to2233( c, cs );                        \
 9668-    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*2+4 );
 9669-
 9670-#define stbir__1_coeff_remnant( ofs )            \
 9671-    { stbir__simdf d;                            \
 9672-    stbir__simdf_load1z( cs, hc + (ofs) );       \
 9673-    stbir__simdf_0123to0011( c, cs );            \
 9674-    stbir__simdf_load2( d, decode + (ofs) * 2 ); \
 9675-    stbir__simdf_madd( tot0, tot0, d, c ); }
 9676-
 9677-#define stbir__2_coeff_remnant( ofs )                      \
 9678-    stbir__simdf_load2( cs, hc + (ofs) );                  \
 9679-    stbir__simdf_0123to0011( c, cs );                      \
 9680-    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*2 );
 9681-
 9682-#define stbir__3_coeff_remnant( ofs )                       \
 9683-    { stbir__simdf d;                                       \
 9684-    stbir__simdf_load( cs, hc + (ofs) );                    \
 9685-    stbir__simdf_0123to0011( c, cs );                       \
 9686-    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*2 ); \
 9687-    stbir__simdf_0123to2222( c, cs );                       \
 9688-    stbir__simdf_load2z( d, decode + (ofs) * 2 + 4 );       \
 9689-    stbir__simdf_madd( tot1, tot1, d, c ); }
 9690-
 9691-#define stbir__store_output()                     \
 9692-    stbir__simdf_add( tot0, tot0, tot1 );         \
 9693-    stbir__simdf_0123to2301( c, tot0 );           \
 9694-    stbir__simdf_add( tot0, tot0, c );            \
 9695-    stbir__simdf_store2( output, tot0 );          \
 9696-    horizontal_coefficients += coefficient_width; \
 9697-    ++horizontal_contributors;                    \
 9698-    output += 2;
 9699+#define stbir__4_coeff_start()                                                 \
 9700+	stbir__simdf tot0, tot1, c, cs;                                            \
 9701+	STBIR_SIMD_NO_UNROLL(decode);                                              \
 9702+	stbir__simdf_load(cs, hc);                                                 \
 9703+	stbir__simdf_0123to0011(c, cs);                                            \
 9704+	stbir__simdf_mult_mem(tot0, c, decode);                                    \
 9705+	stbir__simdf_0123to2233(c, cs);                                            \
 9706+	stbir__simdf_mult_mem(tot1, c, decode + 4);
 9707+
 9708+#define stbir__4_coeff_continue_from_4(ofs)                                    \
 9709+	STBIR_SIMD_NO_UNROLL(decode);                                              \
 9710+	stbir__simdf_load(cs, hc + (ofs));                                         \
 9711+	stbir__simdf_0123to0011(c, cs);                                            \
 9712+	stbir__simdf_madd_mem(tot0, tot0, c, decode + (ofs) * 2);                  \
 9713+	stbir__simdf_0123to2233(c, cs);                                            \
 9714+	stbir__simdf_madd_mem(tot1, tot1, c, decode + (ofs) * 2 + 4);
 9715+
 9716+#define stbir__1_coeff_remnant(ofs)                                            \
 9717+	{                                                                          \
 9718+		stbir__simdf d;                                                        \
 9719+		stbir__simdf_load1z(cs, hc + (ofs));                                   \
 9720+		stbir__simdf_0123to0011(c, cs);                                        \
 9721+		stbir__simdf_load2(d, decode + (ofs) * 2);                             \
 9722+		stbir__simdf_madd(tot0, tot0, d, c);                                   \
 9723+	}
 9724+
 9725+#define stbir__2_coeff_remnant(ofs)                                            \
 9726+	stbir__simdf_load2(cs, hc + (ofs));                                        \
 9727+	stbir__simdf_0123to0011(c, cs);                                            \
 9728+	stbir__simdf_madd_mem(tot0, tot0, c, decode + (ofs) * 2);
 9729+
 9730+#define stbir__3_coeff_remnant(ofs)                                            \
 9731+	{                                                                          \
 9732+		stbir__simdf d;                                                        \
 9733+		stbir__simdf_load(cs, hc + (ofs));                                     \
 9734+		stbir__simdf_0123to0011(c, cs);                                        \
 9735+		stbir__simdf_madd_mem(tot0, tot0, c, decode + (ofs) * 2);              \
 9736+		stbir__simdf_0123to2222(c, cs);                                        \
 9737+		stbir__simdf_load2z(d, decode + (ofs) * 2 + 4);                        \
 9738+		stbir__simdf_madd(tot1, tot1, d, c);                                   \
 9739+	}
 9740+
 9741+#define stbir__store_output()                                                  \
 9742+	stbir__simdf_add(tot0, tot0, tot1);                                        \
 9743+	stbir__simdf_0123to2301(c, tot0);                                          \
 9744+	stbir__simdf_add(tot0, tot0, c);                                           \
 9745+	stbir__simdf_store2(output, tot0);                                         \
 9746+	horizontal_coefficients += coefficient_width;                              \
 9747+	++horizontal_contributors;                                                 \
 9748+	output += 2;
 9749 
 9750 #endif
 9751 
 9752 #else
 9753 
 9754-#define stbir__1_coeff_only()  \
 9755-    float tota,totb,c;         \
 9756-    c = hc[0];                 \
 9757-    tota = decode[0]*c;        \
 9758-    totb = decode[1]*c;
 9759-
 9760-#define stbir__2_coeff_only()  \
 9761-    float tota,totb,c;         \
 9762-    c = hc[0];                 \
 9763-    tota = decode[0]*c;        \
 9764-    totb = decode[1]*c;        \
 9765-    c = hc[1];                 \
 9766-    tota += decode[2]*c;       \
 9767-    totb += decode[3]*c;
 9768+#define stbir__1_coeff_only()                                                  \
 9769+	float tota, totb, c;                                                       \
 9770+	c = hc[0];                                                                 \
 9771+	tota = decode[0] * c;                                                      \
 9772+	totb = decode[1] * c;
 9773+
 9774+#define stbir__2_coeff_only()                                                  \
 9775+	float tota, totb, c;                                                       \
 9776+	c = hc[0];                                                                 \
 9777+	tota = decode[0] * c;                                                      \
 9778+	totb = decode[1] * c;                                                      \
 9779+	c = hc[1];                                                                 \
 9780+	tota += decode[2] * c;                                                     \
 9781+	totb += decode[3] * c;
 9782 
 9783 // this weird order of add matches the simd
 9784-#define stbir__3_coeff_only()  \
 9785-    float tota,totb,c;         \
 9786-    c = hc[0];                 \
 9787-    tota = decode[0]*c;        \
 9788-    totb = decode[1]*c;        \
 9789-    c = hc[2];                 \
 9790-    tota += decode[4]*c;       \
 9791-    totb += decode[5]*c;       \
 9792-    c = hc[1];                 \
 9793-    tota += decode[2]*c;       \
 9794-    totb += decode[3]*c;
 9795-
 9796-#define stbir__store_output_tiny()                \
 9797-    output[0] = tota;                             \
 9798-    output[1] = totb;                             \
 9799-    horizontal_coefficients += coefficient_width; \
 9800-    ++horizontal_contributors;                    \
 9801-    output += 2;
 9802-
 9803-#define stbir__4_coeff_start()      \
 9804-    float tota0,tota1,tota2,tota3,totb0,totb1,totb2,totb3,c;  \
 9805-    c = hc[0];                      \
 9806-    tota0 = decode[0]*c;            \
 9807-    totb0 = decode[1]*c;            \
 9808-    c = hc[1];                      \
 9809-    tota1 = decode[2]*c;            \
 9810-    totb1 = decode[3]*c;            \
 9811-    c = hc[2];                      \
 9812-    tota2 = decode[4]*c;            \
 9813-    totb2 = decode[5]*c;            \
 9814-    c = hc[3];                      \
 9815-    tota3 = decode[6]*c;            \
 9816-    totb3 = decode[7]*c;
 9817-
 9818-#define stbir__4_coeff_continue_from_4( ofs )  \
 9819-    c = hc[0+(ofs)];                           \
 9820-    tota0 += decode[0+(ofs)*2]*c;              \
 9821-    totb0 += decode[1+(ofs)*2]*c;              \
 9822-    c = hc[1+(ofs)];                           \
 9823-    tota1 += decode[2+(ofs)*2]*c;              \
 9824-    totb1 += decode[3+(ofs)*2]*c;              \
 9825-    c = hc[2+(ofs)];                           \
 9826-    tota2 += decode[4+(ofs)*2]*c;              \
 9827-    totb2 += decode[5+(ofs)*2]*c;              \
 9828-    c = hc[3+(ofs)];                           \
 9829-    tota3 += decode[6+(ofs)*2]*c;              \
 9830-    totb3 += decode[7+(ofs)*2]*c;
 9831-
 9832-#define stbir__1_coeff_remnant( ofs )  \
 9833-    c = hc[0+(ofs)];                   \
 9834-    tota0 += decode[0+(ofs)*2] * c;    \
 9835-    totb0 += decode[1+(ofs)*2] * c;
 9836-
 9837-#define stbir__2_coeff_remnant( ofs )  \
 9838-    c = hc[0+(ofs)];                   \
 9839-    tota0 += decode[0+(ofs)*2] * c;    \
 9840-    totb0 += decode[1+(ofs)*2] * c;    \
 9841-    c = hc[1+(ofs)];                   \
 9842-    tota1 += decode[2+(ofs)*2] * c;    \
 9843-    totb1 += decode[3+(ofs)*2] * c;
 9844-
 9845-#define stbir__3_coeff_remnant( ofs )  \
 9846-    c = hc[0+(ofs)];                   \
 9847-    tota0 += decode[0+(ofs)*2] * c;    \
 9848-    totb0 += decode[1+(ofs)*2] * c;    \
 9849-    c = hc[1+(ofs)];                   \
 9850-    tota1 += decode[2+(ofs)*2] * c;    \
 9851-    totb1 += decode[3+(ofs)*2] * c;    \
 9852-    c = hc[2+(ofs)];                   \
 9853-    tota2 += decode[4+(ofs)*2] * c;    \
 9854-    totb2 += decode[5+(ofs)*2] * c;
 9855-
 9856-#define stbir__store_output()                     \
 9857-    output[0] = (tota0+tota2)+(tota1+tota3);      \
 9858-    output[1] = (totb0+totb2)+(totb1+totb3);      \
 9859-    horizontal_coefficients += coefficient_width; \
 9860-    ++horizontal_contributors;                    \
 9861-    output += 2;
 9862+#define stbir__3_coeff_only()                                                  \
 9863+	float tota, totb, c;                                                       \
 9864+	c = hc[0];                                                                 \
 9865+	tota = decode[0] * c;                                                      \
 9866+	totb = decode[1] * c;                                                      \
 9867+	c = hc[2];                                                                 \
 9868+	tota += decode[4] * c;                                                     \
 9869+	totb += decode[5] * c;                                                     \
 9870+	c = hc[1];                                                                 \
 9871+	tota += decode[2] * c;                                                     \
 9872+	totb += decode[3] * c;
 9873+
 9874+#define stbir__store_output_tiny()                                             \
 9875+	output[0] = tota;                                                          \
 9876+	output[1] = totb;                                                          \
 9877+	horizontal_coefficients += coefficient_width;                              \
 9878+	++horizontal_contributors;                                                 \
 9879+	output += 2;
 9880+
 9881+#define stbir__4_coeff_start()                                                 \
 9882+	float tota0, tota1, tota2, tota3, totb0, totb1, totb2, totb3, c;           \
 9883+	c = hc[0];                                                                 \
 9884+	tota0 = decode[0] * c;                                                     \
 9885+	totb0 = decode[1] * c;                                                     \
 9886+	c = hc[1];                                                                 \
 9887+	tota1 = decode[2] * c;                                                     \
 9888+	totb1 = decode[3] * c;                                                     \
 9889+	c = hc[2];                                                                 \
 9890+	tota2 = decode[4] * c;                                                     \
 9891+	totb2 = decode[5] * c;                                                     \
 9892+	c = hc[3];                                                                 \
 9893+	tota3 = decode[6] * c;                                                     \
 9894+	totb3 = decode[7] * c;
 9895+
 9896+#define stbir__4_coeff_continue_from_4(ofs)                                    \
 9897+	c = hc[0 + (ofs)];                                                         \
 9898+	tota0 += decode[0 + (ofs) * 2] * c;                                        \
 9899+	totb0 += decode[1 + (ofs) * 2] * c;                                        \
 9900+	c = hc[1 + (ofs)];                                                         \
 9901+	tota1 += decode[2 + (ofs) * 2] * c;                                        \
 9902+	totb1 += decode[3 + (ofs) * 2] * c;                                        \
 9903+	c = hc[2 + (ofs)];                                                         \
 9904+	tota2 += decode[4 + (ofs) * 2] * c;                                        \
 9905+	totb2 += decode[5 + (ofs) * 2] * c;                                        \
 9906+	c = hc[3 + (ofs)];                                                         \
 9907+	tota3 += decode[6 + (ofs) * 2] * c;                                        \
 9908+	totb3 += decode[7 + (ofs) * 2] * c;
 9909+
 9910+#define stbir__1_coeff_remnant(ofs)                                            \
 9911+	c = hc[0 + (ofs)];                                                         \
 9912+	tota0 += decode[0 + (ofs) * 2] * c;                                        \
 9913+	totb0 += decode[1 + (ofs) * 2] * c;
 9914+
 9915+#define stbir__2_coeff_remnant(ofs)                                            \
 9916+	c = hc[0 + (ofs)];                                                         \
 9917+	tota0 += decode[0 + (ofs) * 2] * c;                                        \
 9918+	totb0 += decode[1 + (ofs) * 2] * c;                                        \
 9919+	c = hc[1 + (ofs)];                                                         \
 9920+	tota1 += decode[2 + (ofs) * 2] * c;                                        \
 9921+	totb1 += decode[3 + (ofs) * 2] * c;
 9922+
 9923+#define stbir__3_coeff_remnant(ofs)                                            \
 9924+	c = hc[0 + (ofs)];                                                         \
 9925+	tota0 += decode[0 + (ofs) * 2] * c;                                        \
 9926+	totb0 += decode[1 + (ofs) * 2] * c;                                        \
 9927+	c = hc[1 + (ofs)];                                                         \
 9928+	tota1 += decode[2 + (ofs) * 2] * c;                                        \
 9929+	totb1 += decode[3 + (ofs) * 2] * c;                                        \
 9930+	c = hc[2 + (ofs)];                                                         \
 9931+	tota2 += decode[4 + (ofs) * 2] * c;                                        \
 9932+	totb2 += decode[5 + (ofs) * 2] * c;
 9933+
 9934+#define stbir__store_output()                                                  \
 9935+	output[0] = (tota0 + tota2) + (tota1 + tota3);                             \
 9936+	output[1] = (totb0 + totb2) + (totb1 + totb3);                             \
 9937+	horizontal_coefficients += coefficient_width;                              \
 9938+	++horizontal_contributors;                                                 \
 9939+	output += 2;
 9940 
 9941 #endif
 9942 
 9943@@ -5052,300 +5938,306 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
 9944 #define STB_IMAGE_RESIZE_DO_HORIZONTALS
 9945 #include STBIR__HEADER_FILENAME
 9946 
 9947-
 9948 //=================
 9949 // Do 3 channel horizontal routines
 9950 
 9951 #ifdef STBIR_SIMD
 9952 
 9953-#define stbir__1_coeff_only()         \
 9954-    stbir__simdf tot,c,d;             \
 9955-    STBIR_SIMD_NO_UNROLL(decode);     \
 9956-    stbir__simdf_load1z( c, hc );     \
 9957-    stbir__simdf_0123to0001( c, c );  \
 9958-    stbir__simdf_load( d, decode );   \
 9959-    stbir__simdf_mult( tot, d, c );
 9960-
 9961-#define stbir__2_coeff_only()         \
 9962-    stbir__simdf tot,c,cs,d;          \
 9963-    STBIR_SIMD_NO_UNROLL(decode);     \
 9964-    stbir__simdf_load2( cs, hc );     \
 9965-    stbir__simdf_0123to0000( c, cs ); \
 9966-    stbir__simdf_load( d, decode );   \
 9967-    stbir__simdf_mult( tot, d, c );   \
 9968-    stbir__simdf_0123to1111( c, cs ); \
 9969-    stbir__simdf_load( d, decode+3 ); \
 9970-    stbir__simdf_madd( tot, tot, d, c );
 9971-
 9972-#define stbir__3_coeff_only()            \
 9973-    stbir__simdf tot,c,d,cs;             \
 9974-    STBIR_SIMD_NO_UNROLL(decode);        \
 9975-    stbir__simdf_load( cs, hc );         \
 9976-    stbir__simdf_0123to0000( c, cs );    \
 9977-    stbir__simdf_load( d, decode );      \
 9978-    stbir__simdf_mult( tot, d, c );      \
 9979-    stbir__simdf_0123to1111( c, cs );    \
 9980-    stbir__simdf_load( d, decode+3 );    \
 9981-    stbir__simdf_madd( tot, tot, d, c ); \
 9982-    stbir__simdf_0123to2222( c, cs );    \
 9983-    stbir__simdf_load( d, decode+6 );    \
 9984-    stbir__simdf_madd( tot, tot, d, c );
 9985-
 9986-#define stbir__store_output_tiny()                \
 9987-    stbir__simdf_store2( output, tot );           \
 9988-    stbir__simdf_0123to2301( tot, tot );          \
 9989-    stbir__simdf_store1( output+2, tot );         \
 9990-    horizontal_coefficients += coefficient_width; \
 9991-    ++horizontal_contributors;                    \
 9992-    output += 3;
 9993+#define stbir__1_coeff_only()                                                  \
 9994+	stbir__simdf tot, c, d;                                                    \
 9995+	STBIR_SIMD_NO_UNROLL(decode);                                              \
 9996+	stbir__simdf_load1z(c, hc);                                                \
 9997+	stbir__simdf_0123to0001(c, c);                                             \
 9998+	stbir__simdf_load(d, decode);                                              \
 9999+	stbir__simdf_mult(tot, d, c);
10000+
10001+#define stbir__2_coeff_only()                                                  \
10002+	stbir__simdf tot, c, cs, d;                                                \
10003+	STBIR_SIMD_NO_UNROLL(decode);                                              \
10004+	stbir__simdf_load2(cs, hc);                                                \
10005+	stbir__simdf_0123to0000(c, cs);                                            \
10006+	stbir__simdf_load(d, decode);                                              \
10007+	stbir__simdf_mult(tot, d, c);                                              \
10008+	stbir__simdf_0123to1111(c, cs);                                            \
10009+	stbir__simdf_load(d, decode + 3);                                          \
10010+	stbir__simdf_madd(tot, tot, d, c);
10011+
10012+#define stbir__3_coeff_only()                                                  \
10013+	stbir__simdf tot, c, d, cs;                                                \
10014+	STBIR_SIMD_NO_UNROLL(decode);                                              \
10015+	stbir__simdf_load(cs, hc);                                                 \
10016+	stbir__simdf_0123to0000(c, cs);                                            \
10017+	stbir__simdf_load(d, decode);                                              \
10018+	stbir__simdf_mult(tot, d, c);                                              \
10019+	stbir__simdf_0123to1111(c, cs);                                            \
10020+	stbir__simdf_load(d, decode + 3);                                          \
10021+	stbir__simdf_madd(tot, tot, d, c);                                         \
10022+	stbir__simdf_0123to2222(c, cs);                                            \
10023+	stbir__simdf_load(d, decode + 6);                                          \
10024+	stbir__simdf_madd(tot, tot, d, c);
10025+
10026+#define stbir__store_output_tiny()                                             \
10027+	stbir__simdf_store2(output, tot);                                          \
10028+	stbir__simdf_0123to2301(tot, tot);                                         \
10029+	stbir__simdf_store1(output + 2, tot);                                      \
10030+	horizontal_coefficients += coefficient_width;                              \
10031+	++horizontal_contributors;                                                 \
10032+	output += 3;
10033 
10034 #ifdef STBIR_SIMD8
10035 
10036-// we're loading from the XXXYYY decode by -1 to get the XXXYYY into different halves of the AVX reg fyi
10037-#define stbir__4_coeff_start()                     \
10038-    stbir__simdf8 tot0,tot1,c,cs; stbir__simdf t;  \
10039-    STBIR_SIMD_NO_UNROLL(decode);                  \
10040-    stbir__simdf8_load4b( cs, hc );                \
10041-    stbir__simdf8_0123to00001111( c, cs );         \
10042-    stbir__simdf8_mult_mem( tot0, c, decode - 1 ); \
10043-    stbir__simdf8_0123to22223333( c, cs );         \
10044-    stbir__simdf8_mult_mem( tot1, c, decode+6 - 1 );
10045-
10046-#define stbir__4_coeff_continue_from_4( ofs )      \
10047-    STBIR_SIMD_NO_UNROLL(decode);                  \
10048-    stbir__simdf8_load4b( cs, hc + (ofs) );        \
10049-    stbir__simdf8_0123to00001111( c, cs );         \
10050-    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*3 - 1 ); \
10051-    stbir__simdf8_0123to22223333( c, cs );         \
10052-    stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*3 + 6 - 1 );
10053-
10054-#define stbir__1_coeff_remnant( ofs )                          \
10055-    STBIR_SIMD_NO_UNROLL(decode);                              \
10056-    stbir__simdf_load1rep4( t, hc + (ofs) );                   \
10057-    stbir__simdf8_madd_mem4( tot0, tot0, t, decode+(ofs)*3 - 1 );
10058-
10059-#define stbir__2_coeff_remnant( ofs )                          \
10060-    STBIR_SIMD_NO_UNROLL(decode);                              \
10061-    stbir__simdf8_load4b( cs, hc + (ofs) - 2 );                \
10062-    stbir__simdf8_0123to22223333( c, cs );                     \
10063-    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*3 - 1 );
10064-
10065- #define stbir__3_coeff_remnant( ofs )                           \
10066-    STBIR_SIMD_NO_UNROLL(decode);                                \
10067-    stbir__simdf8_load4b( cs, hc + (ofs) );                      \
10068-    stbir__simdf8_0123to00001111( c, cs );                       \
10069-    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*3 - 1 ); \
10070-    stbir__simdf8_0123to2222( t, cs );                           \
10071-    stbir__simdf8_madd_mem4( tot1, tot1, t, decode+(ofs)*3 + 6 - 1 );
10072-
10073-#define stbir__store_output()                       \
10074-    stbir__simdf8_add( tot0, tot0, tot1 );          \
10075-    stbir__simdf_0123to1230( t, stbir__if_simdf8_cast_to_simdf4( tot0 ) ); \
10076-    stbir__simdf8_add4halves( t, t, tot0 );         \
10077-    horizontal_coefficients += coefficient_width;   \
10078-    ++horizontal_contributors;                      \
10079-    output += 3;                                    \
10080-    if ( output < output_end )                      \
10081-    {                                               \
10082-      stbir__simdf_store( output-3, t );            \
10083-      continue;                                     \
10084-    }                                               \
10085-    { stbir__simdf tt; stbir__simdf_0123to2301( tt, t ); \
10086-    stbir__simdf_store2( output-3, t );             \
10087-    stbir__simdf_store1( output+2-3, tt ); }        \
10088-    break;
10089-
10090+// we're loading from the XXXYYY decode by -1 to get the XXXYYY into different
10091+// halves of the AVX reg fyi
10092+#define stbir__4_coeff_start()                                                 \
10093+	stbir__simdf8 tot0, tot1, c, cs;                                           \
10094+	stbir__simdf t;                                                            \
10095+	STBIR_SIMD_NO_UNROLL(decode);                                              \
10096+	stbir__simdf8_load4b(cs, hc);                                              \
10097+	stbir__simdf8_0123to00001111(c, cs);                                       \
10098+	stbir__simdf8_mult_mem(tot0, c, decode - 1);                               \
10099+	stbir__simdf8_0123to22223333(c, cs);                                       \
10100+	stbir__simdf8_mult_mem(tot1, c, decode + 6 - 1);
10101+
10102+#define stbir__4_coeff_continue_from_4(ofs)                                    \
10103+	STBIR_SIMD_NO_UNROLL(decode);                                              \
10104+	stbir__simdf8_load4b(cs, hc + (ofs));                                      \
10105+	stbir__simdf8_0123to00001111(c, cs);                                       \
10106+	stbir__simdf8_madd_mem(tot0, tot0, c, decode + (ofs) * 3 - 1);             \
10107+	stbir__simdf8_0123to22223333(c, cs);                                       \
10108+	stbir__simdf8_madd_mem(tot1, tot1, c, decode + (ofs) * 3 + 6 - 1);
10109+
10110+#define stbir__1_coeff_remnant(ofs)                                            \
10111+	STBIR_SIMD_NO_UNROLL(decode);                                              \
10112+	stbir__simdf_load1rep4(t, hc + (ofs));                                     \
10113+	stbir__simdf8_madd_mem4(tot0, tot0, t, decode + (ofs) * 3 - 1);
10114+
10115+#define stbir__2_coeff_remnant(ofs)                                            \
10116+	STBIR_SIMD_NO_UNROLL(decode);                                              \
10117+	stbir__simdf8_load4b(cs, hc + (ofs) - 2);                                  \
10118+	stbir__simdf8_0123to22223333(c, cs);                                       \
10119+	stbir__simdf8_madd_mem(tot0, tot0, c, decode + (ofs) * 3 - 1);
10120+
10121+#define stbir__3_coeff_remnant(ofs)                                            \
10122+	STBIR_SIMD_NO_UNROLL(decode);                                              \
10123+	stbir__simdf8_load4b(cs, hc + (ofs));                                      \
10124+	stbir__simdf8_0123to00001111(c, cs);                                       \
10125+	stbir__simdf8_madd_mem(tot0, tot0, c, decode + (ofs) * 3 - 1);             \
10126+	stbir__simdf8_0123to2222(t, cs);                                           \
10127+	stbir__simdf8_madd_mem4(tot1, tot1, t, decode + (ofs) * 3 + 6 - 1);
10128+
10129+#define stbir__store_output()                                                  \
10130+	stbir__simdf8_add(tot0, tot0, tot1);                                       \
10131+	stbir__simdf_0123to1230(t, stbir__if_simdf8_cast_to_simdf4(tot0));         \
10132+	stbir__simdf8_add4halves(t, t, tot0);                                      \
10133+	horizontal_coefficients += coefficient_width;                              \
10134+	++horizontal_contributors;                                                 \
10135+	output += 3;                                                               \
10136+	if (output < output_end) {                                                 \
10137+		stbir__simdf_store(output - 3, t);                                     \
10138+		continue;                                                              \
10139+	}                                                                          \
10140+	{                                                                          \
10141+		stbir__simdf tt;                                                       \
10142+		stbir__simdf_0123to2301(tt, t);                                        \
10143+		stbir__simdf_store2(output - 3, t);                                    \
10144+		stbir__simdf_store1(output + 2 - 3, tt);                               \
10145+	}                                                                          \
10146+	break;
10147 
10148 #else
10149 
10150-#define stbir__4_coeff_start()                  \
10151-    stbir__simdf tot0,tot1,tot2,c,cs;           \
10152-    STBIR_SIMD_NO_UNROLL(decode);               \
10153-    stbir__simdf_load( cs, hc );                \
10154-    stbir__simdf_0123to0001( c, cs );           \
10155-    stbir__simdf_mult_mem( tot0, c, decode );   \
10156-    stbir__simdf_0123to1122( c, cs );           \
10157-    stbir__simdf_mult_mem( tot1, c, decode+4 ); \
10158-    stbir__simdf_0123to2333( c, cs );           \
10159-    stbir__simdf_mult_mem( tot2, c, decode+8 );
10160-
10161-#define stbir__4_coeff_continue_from_4( ofs )                 \
10162-    STBIR_SIMD_NO_UNROLL(decode);                             \
10163-    stbir__simdf_load( cs, hc + (ofs) );                      \
10164-    stbir__simdf_0123to0001( c, cs );                         \
10165-    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 );   \
10166-    stbir__simdf_0123to1122( c, cs );                         \
10167-    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*3+4 ); \
10168-    stbir__simdf_0123to2333( c, cs );                         \
10169-    stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*3+8 );
10170-
10171-#define stbir__1_coeff_remnant( ofs )         \
10172-    STBIR_SIMD_NO_UNROLL(decode);             \
10173-    stbir__simdf_load1z( c, hc + (ofs) );     \
10174-    stbir__simdf_0123to0001( c, c );          \
10175-    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 );
10176-
10177-#define stbir__2_coeff_remnant( ofs )                       \
10178-    { stbir__simdf d;                                       \
10179-    STBIR_SIMD_NO_UNROLL(decode);                           \
10180-    stbir__simdf_load2z( cs, hc + (ofs) );                  \
10181-    stbir__simdf_0123to0001( c, cs );                       \
10182-    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 ); \
10183-    stbir__simdf_0123to1122( c, cs );                       \
10184-    stbir__simdf_load2z( d, decode+(ofs)*3+4 );             \
10185-    stbir__simdf_madd( tot1, tot1, c, d ); }
10186-
10187-#define stbir__3_coeff_remnant( ofs )                         \
10188-    { stbir__simdf d;                                         \
10189-    STBIR_SIMD_NO_UNROLL(decode);                             \
10190-    stbir__simdf_load( cs, hc + (ofs) );                      \
10191-    stbir__simdf_0123to0001( c, cs );                         \
10192-    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 );   \
10193-    stbir__simdf_0123to1122( c, cs );                         \
10194-    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*3+4 ); \
10195-    stbir__simdf_0123to2222( c, cs );                         \
10196-    stbir__simdf_load1z( d, decode+(ofs)*3+8 );               \
10197-    stbir__simdf_madd( tot2, tot2, c, d );  }
10198-
10199-#define stbir__store_output()                       \
10200-    stbir__simdf_0123ABCDto3ABx( c, tot0, tot1 );   \
10201-    stbir__simdf_0123ABCDto23Ax( cs, tot1, tot2 );  \
10202-    stbir__simdf_0123to1230( tot2, tot2 );          \
10203-    stbir__simdf_add( tot0, tot0, cs );             \
10204-    stbir__simdf_add( c, c, tot2 );                 \
10205-    stbir__simdf_add( tot0, tot0, c );              \
10206-    horizontal_coefficients += coefficient_width;   \
10207-    ++horizontal_contributors;                      \
10208-    output += 3;                                    \
10209-    if ( output < output_end )                      \
10210-    {                                               \
10211-      stbir__simdf_store( output-3, tot0 );         \
10212-      continue;                                     \
10213-    }                                               \
10214-    stbir__simdf_0123to2301( tot1, tot0 );          \
10215-    stbir__simdf_store2( output-3, tot0 );          \
10216-    stbir__simdf_store1( output+2-3, tot1 );        \
10217-    break;
10218+#define stbir__4_coeff_start()                                                 \
10219+	stbir__simdf tot0, tot1, tot2, c, cs;                                      \
10220+	STBIR_SIMD_NO_UNROLL(decode);                                              \
10221+	stbir__simdf_load(cs, hc);                                                 \
10222+	stbir__simdf_0123to0001(c, cs);                                            \
10223+	stbir__simdf_mult_mem(tot0, c, decode);                                    \
10224+	stbir__simdf_0123to1122(c, cs);                                            \
10225+	stbir__simdf_mult_mem(tot1, c, decode + 4);                                \
10226+	stbir__simdf_0123to2333(c, cs);                                            \
10227+	stbir__simdf_mult_mem(tot2, c, decode + 8);
10228+
10229+#define stbir__4_coeff_continue_from_4(ofs)                                    \
10230+	STBIR_SIMD_NO_UNROLL(decode);                                              \
10231+	stbir__simdf_load(cs, hc + (ofs));                                         \
10232+	stbir__simdf_0123to0001(c, cs);                                            \
10233+	stbir__simdf_madd_mem(tot0, tot0, c, decode + (ofs) * 3);                  \
10234+	stbir__simdf_0123to1122(c, cs);                                            \
10235+	stbir__simdf_madd_mem(tot1, tot1, c, decode + (ofs) * 3 + 4);              \
10236+	stbir__simdf_0123to2333(c, cs);                                            \
10237+	stbir__simdf_madd_mem(tot2, tot2, c, decode + (ofs) * 3 + 8);
10238+
10239+#define stbir__1_coeff_remnant(ofs)                                            \
10240+	STBIR_SIMD_NO_UNROLL(decode);                                              \
10241+	stbir__simdf_load1z(c, hc + (ofs));                                        \
10242+	stbir__simdf_0123to0001(c, c);                                             \
10243+	stbir__simdf_madd_mem(tot0, tot0, c, decode + (ofs) * 3);
10244+
10245+#define stbir__2_coeff_remnant(ofs)                                            \
10246+	{                                                                          \
10247+		stbir__simdf d;                                                        \
10248+		STBIR_SIMD_NO_UNROLL(decode);                                          \
10249+		stbir__simdf_load2z(cs, hc + (ofs));                                   \
10250+		stbir__simdf_0123to0001(c, cs);                                        \
10251+		stbir__simdf_madd_mem(tot0, tot0, c, decode + (ofs) * 3);              \
10252+		stbir__simdf_0123to1122(c, cs);                                        \
10253+		stbir__simdf_load2z(d, decode + (ofs) * 3 + 4);                        \
10254+		stbir__simdf_madd(tot1, tot1, c, d);                                   \
10255+	}
10256+
10257+#define stbir__3_coeff_remnant(ofs)                                            \
10258+	{                                                                          \
10259+		stbir__simdf d;                                                        \
10260+		STBIR_SIMD_NO_UNROLL(decode);                                          \
10261+		stbir__simdf_load(cs, hc + (ofs));                                     \
10262+		stbir__simdf_0123to0001(c, cs);                                        \
10263+		stbir__simdf_madd_mem(tot0, tot0, c, decode + (ofs) * 3);              \
10264+		stbir__simdf_0123to1122(c, cs);                                        \
10265+		stbir__simdf_madd_mem(tot1, tot1, c, decode + (ofs) * 3 + 4);          \
10266+		stbir__simdf_0123to2222(c, cs);                                        \
10267+		stbir__simdf_load1z(d, decode + (ofs) * 3 + 8);                        \
10268+		stbir__simdf_madd(tot2, tot2, c, d);                                   \
10269+	}
10270+
10271+#define stbir__store_output()                                                  \
10272+	stbir__simdf_0123ABCDto3ABx(c, tot0, tot1);                                \
10273+	stbir__simdf_0123ABCDto23Ax(cs, tot1, tot2);                               \
10274+	stbir__simdf_0123to1230(tot2, tot2);                                       \
10275+	stbir__simdf_add(tot0, tot0, cs);                                          \
10276+	stbir__simdf_add(c, c, tot2);                                              \
10277+	stbir__simdf_add(tot0, tot0, c);                                           \
10278+	horizontal_coefficients += coefficient_width;                              \
10279+	++horizontal_contributors;                                                 \
10280+	output += 3;                                                               \
10281+	if (output < output_end) {                                                 \
10282+		stbir__simdf_store(output - 3, tot0);                                  \
10283+		continue;                                                              \
10284+	}                                                                          \
10285+	stbir__simdf_0123to2301(tot1, tot0);                                       \
10286+	stbir__simdf_store2(output - 3, tot0);                                     \
10287+	stbir__simdf_store1(output + 2 - 3, tot1);                                 \
10288+	break;
10289 
10290 #endif
10291 
10292 #else
10293 
10294-#define stbir__1_coeff_only()  \
10295-    float tot0, tot1, tot2, c; \
10296-    c = hc[0];                 \
10297-    tot0 = decode[0]*c;        \
10298-    tot1 = decode[1]*c;        \
10299-    tot2 = decode[2]*c;
10300-
10301-#define stbir__2_coeff_only()  \
10302-    float tot0, tot1, tot2, c; \
10303-    c = hc[0];                 \
10304-    tot0 = decode[0]*c;        \
10305-    tot1 = decode[1]*c;        \
10306-    tot2 = decode[2]*c;        \
10307-    c = hc[1];                 \
10308-    tot0 += decode[3]*c;       \
10309-    tot1 += decode[4]*c;       \
10310-    tot2 += decode[5]*c;
10311-
10312-#define stbir__3_coeff_only()  \
10313-    float tot0, tot1, tot2, c; \
10314-    c = hc[0];                 \
10315-    tot0 = decode[0]*c;        \
10316-    tot1 = decode[1]*c;        \
10317-    tot2 = decode[2]*c;        \
10318-    c = hc[1];                 \
10319-    tot0 += decode[3]*c;       \
10320-    tot1 += decode[4]*c;       \
10321-    tot2 += decode[5]*c;       \
10322-    c = hc[2];                 \
10323-    tot0 += decode[6]*c;       \
10324-    tot1 += decode[7]*c;       \
10325-    tot2 += decode[8]*c;
10326-
10327-#define stbir__store_output_tiny()                \
10328-    output[0] = tot0;                             \
10329-    output[1] = tot1;                             \
10330-    output[2] = tot2;                             \
10331-    horizontal_coefficients += coefficient_width; \
10332-    ++horizontal_contributors;                    \
10333-    output += 3;
10334-
10335-#define stbir__4_coeff_start()      \
10336-    float tota0,tota1,tota2,totb0,totb1,totb2,totc0,totc1,totc2,totd0,totd1,totd2,c;  \
10337-    c = hc[0];                      \
10338-    tota0 = decode[0]*c;            \
10339-    tota1 = decode[1]*c;            \
10340-    tota2 = decode[2]*c;            \
10341-    c = hc[1];                      \
10342-    totb0 = decode[3]*c;            \
10343-    totb1 = decode[4]*c;            \
10344-    totb2 = decode[5]*c;            \
10345-    c = hc[2];                      \
10346-    totc0 = decode[6]*c;            \
10347-    totc1 = decode[7]*c;            \
10348-    totc2 = decode[8]*c;            \
10349-    c = hc[3];                      \
10350-    totd0 = decode[9]*c;            \
10351-    totd1 = decode[10]*c;           \
10352-    totd2 = decode[11]*c;
10353-
10354-#define stbir__4_coeff_continue_from_4( ofs )  \
10355-    c = hc[0+(ofs)];                           \
10356-    tota0 += decode[0+(ofs)*3]*c;              \
10357-    tota1 += decode[1+(ofs)*3]*c;              \
10358-    tota2 += decode[2+(ofs)*3]*c;              \
10359-    c = hc[1+(ofs)];                           \
10360-    totb0 += decode[3+(ofs)*3]*c;              \
10361-    totb1 += decode[4+(ofs)*3]*c;              \
10362-    totb2 += decode[5+(ofs)*3]*c;              \
10363-    c = hc[2+(ofs)];                           \
10364-    totc0 += decode[6+(ofs)*3]*c;              \
10365-    totc1 += decode[7+(ofs)*3]*c;              \
10366-    totc2 += decode[8+(ofs)*3]*c;              \
10367-    c = hc[3+(ofs)];                           \
10368-    totd0 += decode[9+(ofs)*3]*c;              \
10369-    totd1 += decode[10+(ofs)*3]*c;             \
10370-    totd2 += decode[11+(ofs)*3]*c;
10371-
10372-#define stbir__1_coeff_remnant( ofs )  \
10373-    c = hc[0+(ofs)];                   \
10374-    tota0 += decode[0+(ofs)*3]*c;      \
10375-    tota1 += decode[1+(ofs)*3]*c;      \
10376-    tota2 += decode[2+(ofs)*3]*c;
10377-
10378-#define stbir__2_coeff_remnant( ofs )  \
10379-    c = hc[0+(ofs)];                   \
10380-    tota0 += decode[0+(ofs)*3]*c;      \
10381-    tota1 += decode[1+(ofs)*3]*c;      \
10382-    tota2 += decode[2+(ofs)*3]*c;      \
10383-    c = hc[1+(ofs)];                   \
10384-    totb0 += decode[3+(ofs)*3]*c;      \
10385-    totb1 += decode[4+(ofs)*3]*c;      \
10386-    totb2 += decode[5+(ofs)*3]*c;      \
10387-
10388-#define stbir__3_coeff_remnant( ofs )  \
10389-    c = hc[0+(ofs)];                   \
10390-    tota0 += decode[0+(ofs)*3]*c;      \
10391-    tota1 += decode[1+(ofs)*3]*c;      \
10392-    tota2 += decode[2+(ofs)*3]*c;      \
10393-    c = hc[1+(ofs)];                   \
10394-    totb0 += decode[3+(ofs)*3]*c;      \
10395-    totb1 += decode[4+(ofs)*3]*c;      \
10396-    totb2 += decode[5+(ofs)*3]*c;      \
10397-    c = hc[2+(ofs)];                   \
10398-    totc0 += decode[6+(ofs)*3]*c;      \
10399-    totc1 += decode[7+(ofs)*3]*c;      \
10400-    totc2 += decode[8+(ofs)*3]*c;
10401-
10402-#define stbir__store_output()                     \
10403-    output[0] = (tota0+totc0)+(totb0+totd0);      \
10404-    output[1] = (tota1+totc1)+(totb1+totd1);      \
10405-    output[2] = (tota2+totc2)+(totb2+totd2);      \
10406-    horizontal_coefficients += coefficient_width; \
10407-    ++horizontal_contributors;                    \
10408-    output += 3;
10409+#define stbir__1_coeff_only()                                                  \
10410+	float tot0, tot1, tot2, c;                                                 \
10411+	c = hc[0];                                                                 \
10412+	tot0 = decode[0] * c;                                                      \
10413+	tot1 = decode[1] * c;                                                      \
10414+	tot2 = decode[2] * c;
10415+
10416+#define stbir__2_coeff_only()                                                  \
10417+	float tot0, tot1, tot2, c;                                                 \
10418+	c = hc[0];                                                                 \
10419+	tot0 = decode[0] * c;                                                      \
10420+	tot1 = decode[1] * c;                                                      \
10421+	tot2 = decode[2] * c;                                                      \
10422+	c = hc[1];                                                                 \
10423+	tot0 += decode[3] * c;                                                     \
10424+	tot1 += decode[4] * c;                                                     \
10425+	tot2 += decode[5] * c;
10426+
10427+#define stbir__3_coeff_only()                                                  \
10428+	float tot0, tot1, tot2, c;                                                 \
10429+	c = hc[0];                                                                 \
10430+	tot0 = decode[0] * c;                                                      \
10431+	tot1 = decode[1] * c;                                                      \
10432+	tot2 = decode[2] * c;                                                      \
10433+	c = hc[1];                                                                 \
10434+	tot0 += decode[3] * c;                                                     \
10435+	tot1 += decode[4] * c;                                                     \
10436+	tot2 += decode[5] * c;                                                     \
10437+	c = hc[2];                                                                 \
10438+	tot0 += decode[6] * c;                                                     \
10439+	tot1 += decode[7] * c;                                                     \
10440+	tot2 += decode[8] * c;
10441+
10442+#define stbir__store_output_tiny()                                             \
10443+	output[0] = tot0;                                                          \
10444+	output[1] = tot1;                                                          \
10445+	output[2] = tot2;                                                          \
10446+	horizontal_coefficients += coefficient_width;                              \
10447+	++horizontal_contributors;                                                 \
10448+	output += 3;
10449+
10450+#define stbir__4_coeff_start()                                                 \
10451+	float tota0, tota1, tota2, totb0, totb1, totb2, totc0, totc1, totc2,       \
10452+	    totd0, totd1, totd2, c;                                                \
10453+	c = hc[0];                                                                 \
10454+	tota0 = decode[0] * c;                                                     \
10455+	tota1 = decode[1] * c;                                                     \
10456+	tota2 = decode[2] * c;                                                     \
10457+	c = hc[1];                                                                 \
10458+	totb0 = decode[3] * c;                                                     \
10459+	totb1 = decode[4] * c;                                                     \
10460+	totb2 = decode[5] * c;                                                     \
10461+	c = hc[2];                                                                 \
10462+	totc0 = decode[6] * c;                                                     \
10463+	totc1 = decode[7] * c;                                                     \
10464+	totc2 = decode[8] * c;                                                     \
10465+	c = hc[3];                                                                 \
10466+	totd0 = decode[9] * c;                                                     \
10467+	totd1 = decode[10] * c;                                                    \
10468+	totd2 = decode[11] * c;
10469+
10470+#define stbir__4_coeff_continue_from_4(ofs)                                    \
10471+	c = hc[0 + (ofs)];                                                         \
10472+	tota0 += decode[0 + (ofs) * 3] * c;                                        \
10473+	tota1 += decode[1 + (ofs) * 3] * c;                                        \
10474+	tota2 += decode[2 + (ofs) * 3] * c;                                        \
10475+	c = hc[1 + (ofs)];                                                         \
10476+	totb0 += decode[3 + (ofs) * 3] * c;                                        \
10477+	totb1 += decode[4 + (ofs) * 3] * c;                                        \
10478+	totb2 += decode[5 + (ofs) * 3] * c;                                        \
10479+	c = hc[2 + (ofs)];                                                         \
10480+	totc0 += decode[6 + (ofs) * 3] * c;                                        \
10481+	totc1 += decode[7 + (ofs) * 3] * c;                                        \
10482+	totc2 += decode[8 + (ofs) * 3] * c;                                        \
10483+	c = hc[3 + (ofs)];                                                         \
10484+	totd0 += decode[9 + (ofs) * 3] * c;                                        \
10485+	totd1 += decode[10 + (ofs) * 3] * c;                                       \
10486+	totd2 += decode[11 + (ofs) * 3] * c;
10487+
10488+#define stbir__1_coeff_remnant(ofs)                                            \
10489+	c = hc[0 + (ofs)];                                                         \
10490+	tota0 += decode[0 + (ofs) * 3] * c;                                        \
10491+	tota1 += decode[1 + (ofs) * 3] * c;                                        \
10492+	tota2 += decode[2 + (ofs) * 3] * c;
10493+
10494+#define stbir__2_coeff_remnant(ofs)                                            \
10495+	c = hc[0 + (ofs)];                                                         \
10496+	tota0 += decode[0 + (ofs) * 3] * c;                                        \
10497+	tota1 += decode[1 + (ofs) * 3] * c;                                        \
10498+	tota2 += decode[2 + (ofs) * 3] * c;                                        \
10499+	c = hc[1 + (ofs)];                                                         \
10500+	totb0 += decode[3 + (ofs) * 3] * c;                                        \
10501+	totb1 += decode[4 + (ofs) * 3] * c;                                        \
10502+	totb2 += decode[5 + (ofs) * 3] * c;
10503+
10504+#define stbir__3_coeff_remnant(ofs)                                            \
10505+	c = hc[0 + (ofs)];                                                         \
10506+	tota0 += decode[0 + (ofs) * 3] * c;                                        \
10507+	tota1 += decode[1 + (ofs) * 3] * c;                                        \
10508+	tota2 += decode[2 + (ofs) * 3] * c;                                        \
10509+	c = hc[1 + (ofs)];                                                         \
10510+	totb0 += decode[3 + (ofs) * 3] * c;                                        \
10511+	totb1 += decode[4 + (ofs) * 3] * c;                                        \
10512+	totb2 += decode[5 + (ofs) * 3] * c;                                        \
10513+	c = hc[2 + (ofs)];                                                         \
10514+	totc0 += decode[6 + (ofs) * 3] * c;                                        \
10515+	totc1 += decode[7 + (ofs) * 3] * c;                                        \
10516+	totc2 += decode[8 + (ofs) * 3] * c;
10517+
10518+#define stbir__store_output()                                                  \
10519+	output[0] = (tota0 + totc0) + (totb0 + totd0);                             \
10520+	output[1] = (tota1 + totc1) + (totb1 + totd1);                             \
10521+	output[2] = (tota2 + totc2) + (totb2 + totd2);                             \
10522+	horizontal_coefficients += coefficient_width;                              \
10523+	++horizontal_contributors;                                                 \
10524+	output += 3;
10525 
10526 #endif
10527 
10528@@ -5358,291 +6250,292 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
10529 
10530 #ifdef STBIR_SIMD
10531 
10532-#define stbir__1_coeff_only()             \
10533-    stbir__simdf tot,c;                   \
10534-    STBIR_SIMD_NO_UNROLL(decode);         \
10535-    stbir__simdf_load1( c, hc );          \
10536-    stbir__simdf_0123to0000( c, c );      \
10537-    stbir__simdf_mult_mem( tot, c, decode );
10538-
10539-#define stbir__2_coeff_only()                       \
10540-    stbir__simdf tot,c,cs;                          \
10541-    STBIR_SIMD_NO_UNROLL(decode);                   \
10542-    stbir__simdf_load2( cs, hc );                   \
10543-    stbir__simdf_0123to0000( c, cs );               \
10544-    stbir__simdf_mult_mem( tot, c, decode );        \
10545-    stbir__simdf_0123to1111( c, cs );               \
10546-    stbir__simdf_madd_mem( tot, tot, c, decode+4 );
10547-
10548-#define stbir__3_coeff_only()                       \
10549-    stbir__simdf tot,c,cs;                          \
10550-    STBIR_SIMD_NO_UNROLL(decode);                   \
10551-    stbir__simdf_load( cs, hc );                    \
10552-    stbir__simdf_0123to0000( c, cs );               \
10553-    stbir__simdf_mult_mem( tot, c, decode );        \
10554-    stbir__simdf_0123to1111( c, cs );               \
10555-    stbir__simdf_madd_mem( tot, tot, c, decode+4 ); \
10556-    stbir__simdf_0123to2222( c, cs );               \
10557-    stbir__simdf_madd_mem( tot, tot, c, decode+8 );
10558-
10559-#define stbir__store_output_tiny()                \
10560-    stbir__simdf_store( output, tot );            \
10561-    horizontal_coefficients += coefficient_width; \
10562-    ++horizontal_contributors;                    \
10563-    output += 4;
10564+#define stbir__1_coeff_only()                                                  \
10565+	stbir__simdf tot, c;                                                       \
10566+	STBIR_SIMD_NO_UNROLL(decode);                                              \
10567+	stbir__simdf_load1(c, hc);                                                 \
10568+	stbir__simdf_0123to0000(c, c);                                             \
10569+	stbir__simdf_mult_mem(tot, c, decode);
10570+
10571+#define stbir__2_coeff_only()                                                  \
10572+	stbir__simdf tot, c, cs;                                                   \
10573+	STBIR_SIMD_NO_UNROLL(decode);                                              \
10574+	stbir__simdf_load2(cs, hc);                                                \
10575+	stbir__simdf_0123to0000(c, cs);                                            \
10576+	stbir__simdf_mult_mem(tot, c, decode);                                     \
10577+	stbir__simdf_0123to1111(c, cs);                                            \
10578+	stbir__simdf_madd_mem(tot, tot, c, decode + 4);
10579+
10580+#define stbir__3_coeff_only()                                                  \
10581+	stbir__simdf tot, c, cs;                                                   \
10582+	STBIR_SIMD_NO_UNROLL(decode);                                              \
10583+	stbir__simdf_load(cs, hc);                                                 \
10584+	stbir__simdf_0123to0000(c, cs);                                            \
10585+	stbir__simdf_mult_mem(tot, c, decode);                                     \
10586+	stbir__simdf_0123to1111(c, cs);                                            \
10587+	stbir__simdf_madd_mem(tot, tot, c, decode + 4);                            \
10588+	stbir__simdf_0123to2222(c, cs);                                            \
10589+	stbir__simdf_madd_mem(tot, tot, c, decode + 8);
10590+
10591+#define stbir__store_output_tiny()                                             \
10592+	stbir__simdf_store(output, tot);                                           \
10593+	horizontal_coefficients += coefficient_width;                              \
10594+	++horizontal_contributors;                                                 \
10595+	output += 4;
10596 
10597 #ifdef STBIR_SIMD8
10598 
10599-#define stbir__4_coeff_start()                     \
10600-    stbir__simdf8 tot0,c,cs; stbir__simdf t;  \
10601-    STBIR_SIMD_NO_UNROLL(decode);                  \
10602-    stbir__simdf8_load4b( cs, hc );                \
10603-    stbir__simdf8_0123to00001111( c, cs );         \
10604-    stbir__simdf8_mult_mem( tot0, c, decode );     \
10605-    stbir__simdf8_0123to22223333( c, cs );         \
10606-    stbir__simdf8_madd_mem( tot0, tot0, c, decode+8 );
10607-
10608-#define stbir__4_coeff_continue_from_4( ofs )                  \
10609-    STBIR_SIMD_NO_UNROLL(decode);                              \
10610-    stbir__simdf8_load4b( cs, hc + (ofs) );                    \
10611-    stbir__simdf8_0123to00001111( c, cs );                     \
10612-    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4 );   \
10613-    stbir__simdf8_0123to22223333( c, cs );                     \
10614-    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4+8 );
10615-
10616-#define stbir__1_coeff_remnant( ofs )                          \
10617-    STBIR_SIMD_NO_UNROLL(decode);                              \
10618-    stbir__simdf_load1rep4( t, hc + (ofs) );                   \
10619-    stbir__simdf8_madd_mem4( tot0, tot0, t, decode+(ofs)*4 );
10620-
10621-#define stbir__2_coeff_remnant( ofs )                          \
10622-    STBIR_SIMD_NO_UNROLL(decode);                              \
10623-    stbir__simdf8_load4b( cs, hc + (ofs) - 2 );                \
10624-    stbir__simdf8_0123to22223333( c, cs );                     \
10625-    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4 );
10626-
10627- #define stbir__3_coeff_remnant( ofs )                         \
10628-    STBIR_SIMD_NO_UNROLL(decode);                              \
10629-    stbir__simdf8_load4b( cs, hc + (ofs) );                    \
10630-    stbir__simdf8_0123to00001111( c, cs );                     \
10631-    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4 );   \
10632-    stbir__simdf8_0123to2222( t, cs );                         \
10633-    stbir__simdf8_madd_mem4( tot0, tot0, t, decode+(ofs)*4+8 );
10634-
10635-#define stbir__store_output()                      \
10636-    stbir__simdf8_add4halves( t, stbir__if_simdf8_cast_to_simdf4(tot0), tot0 );     \
10637-    stbir__simdf_store( output, t );               \
10638-    horizontal_coefficients += coefficient_width;  \
10639-    ++horizontal_contributors;                     \
10640-    output += 4;
10641+#define stbir__4_coeff_start()                                                 \
10642+	stbir__simdf8 tot0, c, cs;                                                 \
10643+	stbir__simdf t;                                                            \
10644+	STBIR_SIMD_NO_UNROLL(decode);                                              \
10645+	stbir__simdf8_load4b(cs, hc);                                              \
10646+	stbir__simdf8_0123to00001111(c, cs);                                       \
10647+	stbir__simdf8_mult_mem(tot0, c, decode);                                   \
10648+	stbir__simdf8_0123to22223333(c, cs);                                       \
10649+	stbir__simdf8_madd_mem(tot0, tot0, c, decode + 8);
10650+
10651+#define stbir__4_coeff_continue_from_4(ofs)                                    \
10652+	STBIR_SIMD_NO_UNROLL(decode);                                              \
10653+	stbir__simdf8_load4b(cs, hc + (ofs));                                      \
10654+	stbir__simdf8_0123to00001111(c, cs);                                       \
10655+	stbir__simdf8_madd_mem(tot0, tot0, c, decode + (ofs) * 4);                 \
10656+	stbir__simdf8_0123to22223333(c, cs);                                       \
10657+	stbir__simdf8_madd_mem(tot0, tot0, c, decode + (ofs) * 4 + 8);
10658+
10659+#define stbir__1_coeff_remnant(ofs)                                            \
10660+	STBIR_SIMD_NO_UNROLL(decode);                                              \
10661+	stbir__simdf_load1rep4(t, hc + (ofs));                                     \
10662+	stbir__simdf8_madd_mem4(tot0, tot0, t, decode + (ofs) * 4);
10663+
10664+#define stbir__2_coeff_remnant(ofs)                                            \
10665+	STBIR_SIMD_NO_UNROLL(decode);                                              \
10666+	stbir__simdf8_load4b(cs, hc + (ofs) - 2);                                  \
10667+	stbir__simdf8_0123to22223333(c, cs);                                       \
10668+	stbir__simdf8_madd_mem(tot0, tot0, c, decode + (ofs) * 4);
10669+
10670+#define stbir__3_coeff_remnant(ofs)                                            \
10671+	STBIR_SIMD_NO_UNROLL(decode);                                              \
10672+	stbir__simdf8_load4b(cs, hc + (ofs));                                      \
10673+	stbir__simdf8_0123to00001111(c, cs);                                       \
10674+	stbir__simdf8_madd_mem(tot0, tot0, c, decode + (ofs) * 4);                 \
10675+	stbir__simdf8_0123to2222(t, cs);                                           \
10676+	stbir__simdf8_madd_mem4(tot0, tot0, t, decode + (ofs) * 4 + 8);
10677+
10678+#define stbir__store_output()                                                  \
10679+	stbir__simdf8_add4halves(t, stbir__if_simdf8_cast_to_simdf4(tot0), tot0);  \
10680+	stbir__simdf_store(output, t);                                             \
10681+	horizontal_coefficients += coefficient_width;                              \
10682+	++horizontal_contributors;                                                 \
10683+	output += 4;
10684 
10685 #else
10686 
10687-#define stbir__4_coeff_start()                        \
10688-    stbir__simdf tot0,tot1,c,cs;                      \
10689-    STBIR_SIMD_NO_UNROLL(decode);                     \
10690-    stbir__simdf_load( cs, hc );                      \
10691-    stbir__simdf_0123to0000( c, cs );                 \
10692-    stbir__simdf_mult_mem( tot0, c, decode );         \
10693-    stbir__simdf_0123to1111( c, cs );                 \
10694-    stbir__simdf_mult_mem( tot1, c, decode+4 );       \
10695-    stbir__simdf_0123to2222( c, cs );                 \
10696-    stbir__simdf_madd_mem( tot0, tot0, c, decode+8 ); \
10697-    stbir__simdf_0123to3333( c, cs );                 \
10698-    stbir__simdf_madd_mem( tot1, tot1, c, decode+12 );
10699-
10700-#define stbir__4_coeff_continue_from_4( ofs )                  \
10701-    STBIR_SIMD_NO_UNROLL(decode);                              \
10702-    stbir__simdf_load( cs, hc + (ofs) );                       \
10703-    stbir__simdf_0123to0000( c, cs );                          \
10704-    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 );    \
10705-    stbir__simdf_0123to1111( c, cs );                          \
10706-    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+4 );  \
10707-    stbir__simdf_0123to2222( c, cs );                          \
10708-    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4+8 );  \
10709-    stbir__simdf_0123to3333( c, cs );                          \
10710-    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+12 );
10711-
10712-#define stbir__1_coeff_remnant( ofs )                       \
10713-    STBIR_SIMD_NO_UNROLL(decode);                           \
10714-    stbir__simdf_load1( c, hc + (ofs) );                    \
10715-    stbir__simdf_0123to0000( c, c );                        \
10716-    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 );
10717-
10718-#define stbir__2_coeff_remnant( ofs )                         \
10719-    STBIR_SIMD_NO_UNROLL(decode);                             \
10720-    stbir__simdf_load2( cs, hc + (ofs) );                     \
10721-    stbir__simdf_0123to0000( c, cs );                         \
10722-    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 );   \
10723-    stbir__simdf_0123to1111( c, cs );                         \
10724-    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+4 );
10725-
10726-#define stbir__3_coeff_remnant( ofs )                          \
10727-    STBIR_SIMD_NO_UNROLL(decode);                              \
10728-    stbir__simdf_load( cs, hc + (ofs) );                       \
10729-    stbir__simdf_0123to0000( c, cs );                          \
10730-    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 );    \
10731-    stbir__simdf_0123to1111( c, cs );                          \
10732-    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+4 );  \
10733-    stbir__simdf_0123to2222( c, cs );                          \
10734-    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4+8 );
10735-
10736-#define stbir__store_output()                     \
10737-    stbir__simdf_add( tot0, tot0, tot1 );         \
10738-    stbir__simdf_store( output, tot0 );           \
10739-    horizontal_coefficients += coefficient_width; \
10740-    ++horizontal_contributors;                    \
10741-    output += 4;
10742+#define stbir__4_coeff_start()                                                 \
10743+	stbir__simdf tot0, tot1, c, cs;                                            \
10744+	STBIR_SIMD_NO_UNROLL(decode);                                              \
10745+	stbir__simdf_load(cs, hc);                                                 \
10746+	stbir__simdf_0123to0000(c, cs);                                            \
10747+	stbir__simdf_mult_mem(tot0, c, decode);                                    \
10748+	stbir__simdf_0123to1111(c, cs);                                            \
10749+	stbir__simdf_mult_mem(tot1, c, decode + 4);                                \
10750+	stbir__simdf_0123to2222(c, cs);                                            \
10751+	stbir__simdf_madd_mem(tot0, tot0, c, decode + 8);                          \
10752+	stbir__simdf_0123to3333(c, cs);                                            \
10753+	stbir__simdf_madd_mem(tot1, tot1, c, decode + 12);
10754+
10755+#define stbir__4_coeff_continue_from_4(ofs)                                    \
10756+	STBIR_SIMD_NO_UNROLL(decode);                                              \
10757+	stbir__simdf_load(cs, hc + (ofs));                                         \
10758+	stbir__simdf_0123to0000(c, cs);                                            \
10759+	stbir__simdf_madd_mem(tot0, tot0, c, decode + (ofs) * 4);                  \
10760+	stbir__simdf_0123to1111(c, cs);                                            \
10761+	stbir__simdf_madd_mem(tot1, tot1, c, decode + (ofs) * 4 + 4);              \
10762+	stbir__simdf_0123to2222(c, cs);                                            \
10763+	stbir__simdf_madd_mem(tot0, tot0, c, decode + (ofs) * 4 + 8);              \
10764+	stbir__simdf_0123to3333(c, cs);                                            \
10765+	stbir__simdf_madd_mem(tot1, tot1, c, decode + (ofs) * 4 + 12);
10766+
10767+#define stbir__1_coeff_remnant(ofs)                                            \
10768+	STBIR_SIMD_NO_UNROLL(decode);                                              \
10769+	stbir__simdf_load1(c, hc + (ofs));                                         \
10770+	stbir__simdf_0123to0000(c, c);                                             \
10771+	stbir__simdf_madd_mem(tot0, tot0, c, decode + (ofs) * 4);
10772+
10773+#define stbir__2_coeff_remnant(ofs)                                            \
10774+	STBIR_SIMD_NO_UNROLL(decode);                                              \
10775+	stbir__simdf_load2(cs, hc + (ofs));                                        \
10776+	stbir__simdf_0123to0000(c, cs);                                            \
10777+	stbir__simdf_madd_mem(tot0, tot0, c, decode + (ofs) * 4);                  \
10778+	stbir__simdf_0123to1111(c, cs);                                            \
10779+	stbir__simdf_madd_mem(tot1, tot1, c, decode + (ofs) * 4 + 4);
10780+
10781+#define stbir__3_coeff_remnant(ofs)                                            \
10782+	STBIR_SIMD_NO_UNROLL(decode);                                              \
10783+	stbir__simdf_load(cs, hc + (ofs));                                         \
10784+	stbir__simdf_0123to0000(c, cs);                                            \
10785+	stbir__simdf_madd_mem(tot0, tot0, c, decode + (ofs) * 4);                  \
10786+	stbir__simdf_0123to1111(c, cs);                                            \
10787+	stbir__simdf_madd_mem(tot1, tot1, c, decode + (ofs) * 4 + 4);              \
10788+	stbir__simdf_0123to2222(c, cs);                                            \
10789+	stbir__simdf_madd_mem(tot0, tot0, c, decode + (ofs) * 4 + 8);
10790+
10791+#define stbir__store_output()                                                  \
10792+	stbir__simdf_add(tot0, tot0, tot1);                                        \
10793+	stbir__simdf_store(output, tot0);                                          \
10794+	horizontal_coefficients += coefficient_width;                              \
10795+	++horizontal_contributors;                                                 \
10796+	output += 4;
10797 
10798 #endif
10799 
10800 #else
10801 
10802-#define stbir__1_coeff_only()         \
10803-    float p0,p1,p2,p3,c;              \
10804-    STBIR_SIMD_NO_UNROLL(decode);     \
10805-    c = hc[0];                        \
10806-    p0 = decode[0] * c;               \
10807-    p1 = decode[1] * c;               \
10808-    p2 = decode[2] * c;               \
10809-    p3 = decode[3] * c;
10810-
10811-#define stbir__2_coeff_only()         \
10812-    float p0,p1,p2,p3,c;              \
10813-    STBIR_SIMD_NO_UNROLL(decode);     \
10814-    c = hc[0];                        \
10815-    p0 = decode[0] * c;               \
10816-    p1 = decode[1] * c;               \
10817-    p2 = decode[2] * c;               \
10818-    p3 = decode[3] * c;               \
10819-    c = hc[1];                        \
10820-    p0 += decode[4] * c;              \
10821-    p1 += decode[5] * c;              \
10822-    p2 += decode[6] * c;              \
10823-    p3 += decode[7] * c;
10824-
10825-#define stbir__3_coeff_only()         \
10826-    float p0,p1,p2,p3,c;              \
10827-    STBIR_SIMD_NO_UNROLL(decode);     \
10828-    c = hc[0];                        \
10829-    p0 = decode[0] * c;               \
10830-    p1 = decode[1] * c;               \
10831-    p2 = decode[2] * c;               \
10832-    p3 = decode[3] * c;               \
10833-    c = hc[1];                        \
10834-    p0 += decode[4] * c;              \
10835-    p1 += decode[5] * c;              \
10836-    p2 += decode[6] * c;              \
10837-    p3 += decode[7] * c;              \
10838-    c = hc[2];                        \
10839-    p0 += decode[8] * c;              \
10840-    p1 += decode[9] * c;              \
10841-    p2 += decode[10] * c;             \
10842-    p3 += decode[11] * c;
10843-
10844-#define stbir__store_output_tiny()                \
10845-    output[0] = p0;                               \
10846-    output[1] = p1;                               \
10847-    output[2] = p2;                               \
10848-    output[3] = p3;                               \
10849-    horizontal_coefficients += coefficient_width; \
10850-    ++horizontal_contributors;                    \
10851-    output += 4;
10852-
10853-#define stbir__4_coeff_start()        \
10854-    float x0,x1,x2,x3,y0,y1,y2,y3,c;  \
10855-    STBIR_SIMD_NO_UNROLL(decode);     \
10856-    c = hc[0];                        \
10857-    x0 = decode[0] * c;               \
10858-    x1 = decode[1] * c;               \
10859-    x2 = decode[2] * c;               \
10860-    x3 = decode[3] * c;               \
10861-    c = hc[1];                        \
10862-    y0 = decode[4] * c;               \
10863-    y1 = decode[5] * c;               \
10864-    y2 = decode[6] * c;               \
10865-    y3 = decode[7] * c;               \
10866-    c = hc[2];                        \
10867-    x0 += decode[8] * c;              \
10868-    x1 += decode[9] * c;              \
10869-    x2 += decode[10] * c;             \
10870-    x3 += decode[11] * c;             \
10871-    c = hc[3];                        \
10872-    y0 += decode[12] * c;             \
10873-    y1 += decode[13] * c;             \
10874-    y2 += decode[14] * c;             \
10875-    y3 += decode[15] * c;
10876-
10877-#define stbir__4_coeff_continue_from_4( ofs ) \
10878-    STBIR_SIMD_NO_UNROLL(decode);     \
10879-    c = hc[0+(ofs)];                  \
10880-    x0 += decode[0+(ofs)*4] * c;      \
10881-    x1 += decode[1+(ofs)*4] * c;      \
10882-    x2 += decode[2+(ofs)*4] * c;      \
10883-    x3 += decode[3+(ofs)*4] * c;      \
10884-    c = hc[1+(ofs)];                  \
10885-    y0 += decode[4+(ofs)*4] * c;      \
10886-    y1 += decode[5+(ofs)*4] * c;      \
10887-    y2 += decode[6+(ofs)*4] * c;      \
10888-    y3 += decode[7+(ofs)*4] * c;      \
10889-    c = hc[2+(ofs)];                  \
10890-    x0 += decode[8+(ofs)*4] * c;      \
10891-    x1 += decode[9+(ofs)*4] * c;      \
10892-    x2 += decode[10+(ofs)*4] * c;     \
10893-    x3 += decode[11+(ofs)*4] * c;     \
10894-    c = hc[3+(ofs)];                  \
10895-    y0 += decode[12+(ofs)*4] * c;     \
10896-    y1 += decode[13+(ofs)*4] * c;     \
10897-    y2 += decode[14+(ofs)*4] * c;     \
10898-    y3 += decode[15+(ofs)*4] * c;
10899-
10900-#define stbir__1_coeff_remnant( ofs ) \
10901-    STBIR_SIMD_NO_UNROLL(decode);     \
10902-    c = hc[0+(ofs)];                  \
10903-    x0 += decode[0+(ofs)*4] * c;      \
10904-    x1 += decode[1+(ofs)*4] * c;      \
10905-    x2 += decode[2+(ofs)*4] * c;      \
10906-    x3 += decode[3+(ofs)*4] * c;
10907-
10908-#define stbir__2_coeff_remnant( ofs ) \
10909-    STBIR_SIMD_NO_UNROLL(decode);     \
10910-    c = hc[0+(ofs)];                  \
10911-    x0 += decode[0+(ofs)*4] * c;      \
10912-    x1 += decode[1+(ofs)*4] * c;      \
10913-    x2 += decode[2+(ofs)*4] * c;      \
10914-    x3 += decode[3+(ofs)*4] * c;      \
10915-    c = hc[1+(ofs)];                  \
10916-    y0 += decode[4+(ofs)*4] * c;      \
10917-    y1 += decode[5+(ofs)*4] * c;      \
10918-    y2 += decode[6+(ofs)*4] * c;      \
10919-    y3 += decode[7+(ofs)*4] * c;
10920-
10921-#define stbir__3_coeff_remnant( ofs ) \
10922-    STBIR_SIMD_NO_UNROLL(decode);     \
10923-    c = hc[0+(ofs)];                  \
10924-    x0 += decode[0+(ofs)*4] * c;      \
10925-    x1 += decode[1+(ofs)*4] * c;      \
10926-    x2 += decode[2+(ofs)*4] * c;      \
10927-    x3 += decode[3+(ofs)*4] * c;      \
10928-    c = hc[1+(ofs)];                  \
10929-    y0 += decode[4+(ofs)*4] * c;      \
10930-    y1 += decode[5+(ofs)*4] * c;      \
10931-    y2 += decode[6+(ofs)*4] * c;      \
10932-    y3 += decode[7+(ofs)*4] * c;      \
10933-    c = hc[2+(ofs)];                  \
10934-    x0 += decode[8+(ofs)*4] * c;      \
10935-    x1 += decode[9+(ofs)*4] * c;      \
10936-    x2 += decode[10+(ofs)*4] * c;     \
10937-    x3 += decode[11+(ofs)*4] * c;
10938-
10939-#define stbir__store_output()                     \
10940-    output[0] = x0 + y0;                          \
10941-    output[1] = x1 + y1;                          \
10942-    output[2] = x2 + y2;                          \
10943-    output[3] = x3 + y3;                          \
10944-    horizontal_coefficients += coefficient_width; \
10945-    ++horizontal_contributors;                    \
10946-    output += 4;
10947+#define stbir__1_coeff_only()                                                  \
10948+	float p0, p1, p2, p3, c;                                                   \
10949+	STBIR_SIMD_NO_UNROLL(decode);                                              \
10950+	c = hc[0];                                                                 \
10951+	p0 = decode[0] * c;                                                        \
10952+	p1 = decode[1] * c;                                                        \
10953+	p2 = decode[2] * c;                                                        \
10954+	p3 = decode[3] * c;
10955+
10956+#define stbir__2_coeff_only()                                                  \
10957+	float p0, p1, p2, p3, c;                                                   \
10958+	STBIR_SIMD_NO_UNROLL(decode);                                              \
10959+	c = hc[0];                                                                 \
10960+	p0 = decode[0] * c;                                                        \
10961+	p1 = decode[1] * c;                                                        \
10962+	p2 = decode[2] * c;                                                        \
10963+	p3 = decode[3] * c;                                                        \
10964+	c = hc[1];                                                                 \
10965+	p0 += decode[4] * c;                                                       \
10966+	p1 += decode[5] * c;                                                       \
10967+	p2 += decode[6] * c;                                                       \
10968+	p3 += decode[7] * c;
10969+
10970+#define stbir__3_coeff_only()                                                  \
10971+	float p0, p1, p2, p3, c;                                                   \
10972+	STBIR_SIMD_NO_UNROLL(decode);                                              \
10973+	c = hc[0];                                                                 \
10974+	p0 = decode[0] * c;                                                        \
10975+	p1 = decode[1] * c;                                                        \
10976+	p2 = decode[2] * c;                                                        \
10977+	p3 = decode[3] * c;                                                        \
10978+	c = hc[1];                                                                 \
10979+	p0 += decode[4] * c;                                                       \
10980+	p1 += decode[5] * c;                                                       \
10981+	p2 += decode[6] * c;                                                       \
10982+	p3 += decode[7] * c;                                                       \
10983+	c = hc[2];                                                                 \
10984+	p0 += decode[8] * c;                                                       \
10985+	p1 += decode[9] * c;                                                       \
10986+	p2 += decode[10] * c;                                                      \
10987+	p3 += decode[11] * c;
10988+
10989+#define stbir__store_output_tiny()                                             \
10990+	output[0] = p0;                                                            \
10991+	output[1] = p1;                                                            \
10992+	output[2] = p2;                                                            \
10993+	output[3] = p3;                                                            \
10994+	horizontal_coefficients += coefficient_width;                              \
10995+	++horizontal_contributors;                                                 \
10996+	output += 4;
10997+
10998+#define stbir__4_coeff_start()                                                 \
10999+	float x0, x1, x2, x3, y0, y1, y2, y3, c;                                   \
11000+	STBIR_SIMD_NO_UNROLL(decode);                                              \
11001+	c = hc[0];                                                                 \
11002+	x0 = decode[0] * c;                                                        \
11003+	x1 = decode[1] * c;                                                        \
11004+	x2 = decode[2] * c;                                                        \
11005+	x3 = decode[3] * c;                                                        \
11006+	c = hc[1];                                                                 \
11007+	y0 = decode[4] * c;                                                        \
11008+	y1 = decode[5] * c;                                                        \
11009+	y2 = decode[6] * c;                                                        \
11010+	y3 = decode[7] * c;                                                        \
11011+	c = hc[2];                                                                 \
11012+	x0 += decode[8] * c;                                                       \
11013+	x1 += decode[9] * c;                                                       \
11014+	x2 += decode[10] * c;                                                      \
11015+	x3 += decode[11] * c;                                                      \
11016+	c = hc[3];                                                                 \
11017+	y0 += decode[12] * c;                                                      \
11018+	y1 += decode[13] * c;                                                      \
11019+	y2 += decode[14] * c;                                                      \
11020+	y3 += decode[15] * c;
11021+
11022+#define stbir__4_coeff_continue_from_4(ofs)                                    \
11023+	STBIR_SIMD_NO_UNROLL(decode);                                              \
11024+	c = hc[0 + (ofs)];                                                         \
11025+	x0 += decode[0 + (ofs) * 4] * c;                                           \
11026+	x1 += decode[1 + (ofs) * 4] * c;                                           \
11027+	x2 += decode[2 + (ofs) * 4] * c;                                           \
11028+	x3 += decode[3 + (ofs) * 4] * c;                                           \
11029+	c = hc[1 + (ofs)];                                                         \
11030+	y0 += decode[4 + (ofs) * 4] * c;                                           \
11031+	y1 += decode[5 + (ofs) * 4] * c;                                           \
11032+	y2 += decode[6 + (ofs) * 4] * c;                                           \
11033+	y3 += decode[7 + (ofs) * 4] * c;                                           \
11034+	c = hc[2 + (ofs)];                                                         \
11035+	x0 += decode[8 + (ofs) * 4] * c;                                           \
11036+	x1 += decode[9 + (ofs) * 4] * c;                                           \
11037+	x2 += decode[10 + (ofs) * 4] * c;                                          \
11038+	x3 += decode[11 + (ofs) * 4] * c;                                          \
11039+	c = hc[3 + (ofs)];                                                         \
11040+	y0 += decode[12 + (ofs) * 4] * c;                                          \
11041+	y1 += decode[13 + (ofs) * 4] * c;                                          \
11042+	y2 += decode[14 + (ofs) * 4] * c;                                          \
11043+	y3 += decode[15 + (ofs) * 4] * c;
11044+
11045+#define stbir__1_coeff_remnant(ofs)                                            \
11046+	STBIR_SIMD_NO_UNROLL(decode);                                              \
11047+	c = hc[0 + (ofs)];                                                         \
11048+	x0 += decode[0 + (ofs) * 4] * c;                                           \
11049+	x1 += decode[1 + (ofs) * 4] * c;                                           \
11050+	x2 += decode[2 + (ofs) * 4] * c;                                           \
11051+	x3 += decode[3 + (ofs) * 4] * c;
11052+
11053+#define stbir__2_coeff_remnant(ofs)                                            \
11054+	STBIR_SIMD_NO_UNROLL(decode);                                              \
11055+	c = hc[0 + (ofs)];                                                         \
11056+	x0 += decode[0 + (ofs) * 4] * c;                                           \
11057+	x1 += decode[1 + (ofs) * 4] * c;                                           \
11058+	x2 += decode[2 + (ofs) * 4] * c;                                           \
11059+	x3 += decode[3 + (ofs) * 4] * c;                                           \
11060+	c = hc[1 + (ofs)];                                                         \
11061+	y0 += decode[4 + (ofs) * 4] * c;                                           \
11062+	y1 += decode[5 + (ofs) * 4] * c;                                           \
11063+	y2 += decode[6 + (ofs) * 4] * c;                                           \
11064+	y3 += decode[7 + (ofs) * 4] * c;
11065+
11066+#define stbir__3_coeff_remnant(ofs)                                            \
11067+	STBIR_SIMD_NO_UNROLL(decode);                                              \
11068+	c = hc[0 + (ofs)];                                                         \
11069+	x0 += decode[0 + (ofs) * 4] * c;                                           \
11070+	x1 += decode[1 + (ofs) * 4] * c;                                           \
11071+	x2 += decode[2 + (ofs) * 4] * c;                                           \
11072+	x3 += decode[3 + (ofs) * 4] * c;                                           \
11073+	c = hc[1 + (ofs)];                                                         \
11074+	y0 += decode[4 + (ofs) * 4] * c;                                           \
11075+	y1 += decode[5 + (ofs) * 4] * c;                                           \
11076+	y2 += decode[6 + (ofs) * 4] * c;                                           \
11077+	y3 += decode[7 + (ofs) * 4] * c;                                           \
11078+	c = hc[2 + (ofs)];                                                         \
11079+	x0 += decode[8 + (ofs) * 4] * c;                                           \
11080+	x1 += decode[9 + (ofs) * 4] * c;                                           \
11081+	x2 += decode[10 + (ofs) * 4] * c;                                          \
11082+	x3 += decode[11 + (ofs) * 4] * c;
11083+
11084+#define stbir__store_output()                                                  \
11085+	output[0] = x0 + y0;                                                       \
11086+	output[1] = x1 + y1;                                                       \
11087+	output[2] = x2 + y2;                                                       \
11088+	output[3] = x3 + y3;                                                       \
11089+	horizontal_coefficients += coefficient_width;                              \
11090+	++horizontal_contributors;                                                 \
11091+	output += 4;
11092 
11093 #endif
11094 
11095@@ -5650,402 +6543,401 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
11096 #define STB_IMAGE_RESIZE_DO_HORIZONTALS
11097 #include STBIR__HEADER_FILENAME
11098 
11099-
11100-
11101 //=================
11102 // Do 7 channel horizontal routines
11103 
11104 #ifdef STBIR_SIMD
11105 
11106-#define stbir__1_coeff_only()                   \
11107-    stbir__simdf tot0,tot1,c;                   \
11108-    STBIR_SIMD_NO_UNROLL(decode);               \
11109-    stbir__simdf_load1( c, hc );                \
11110-    stbir__simdf_0123to0000( c, c );            \
11111-    stbir__simdf_mult_mem( tot0, c, decode );   \
11112-    stbir__simdf_mult_mem( tot1, c, decode+3 );
11113-
11114-#define stbir__2_coeff_only()                         \
11115-    stbir__simdf tot0,tot1,c,cs;                      \
11116-    STBIR_SIMD_NO_UNROLL(decode);                     \
11117-    stbir__simdf_load2( cs, hc );                     \
11118-    stbir__simdf_0123to0000( c, cs );                 \
11119-    stbir__simdf_mult_mem( tot0, c, decode );         \
11120-    stbir__simdf_mult_mem( tot1, c, decode+3 );       \
11121-    stbir__simdf_0123to1111( c, cs );                 \
11122-    stbir__simdf_madd_mem( tot0, tot0, c, decode+7 ); \
11123-    stbir__simdf_madd_mem( tot1, tot1, c,decode+10 );
11124-
11125-#define stbir__3_coeff_only()                           \
11126-    stbir__simdf tot0,tot1,c,cs;                        \
11127-    STBIR_SIMD_NO_UNROLL(decode);                       \
11128-    stbir__simdf_load( cs, hc );                        \
11129-    stbir__simdf_0123to0000( c, cs );                   \
11130-    stbir__simdf_mult_mem( tot0, c, decode );           \
11131-    stbir__simdf_mult_mem( tot1, c, decode+3 );         \
11132-    stbir__simdf_0123to1111( c, cs );                   \
11133-    stbir__simdf_madd_mem( tot0, tot0, c, decode+7 );   \
11134-    stbir__simdf_madd_mem( tot1, tot1, c, decode+10 );  \
11135-    stbir__simdf_0123to2222( c, cs );                   \
11136-    stbir__simdf_madd_mem( tot0, tot0, c, decode+14 );  \
11137-    stbir__simdf_madd_mem( tot1, tot1, c, decode+17 );
11138-
11139-#define stbir__store_output_tiny()                \
11140-    stbir__simdf_store( output+3, tot1 );         \
11141-    stbir__simdf_store( output, tot0 );           \
11142-    horizontal_coefficients += coefficient_width; \
11143-    ++horizontal_contributors;                    \
11144-    output += 7;
11145+#define stbir__1_coeff_only()                                                  \
11146+	stbir__simdf tot0, tot1, c;                                                \
11147+	STBIR_SIMD_NO_UNROLL(decode);                                              \
11148+	stbir__simdf_load1(c, hc);                                                 \
11149+	stbir__simdf_0123to0000(c, c);                                             \
11150+	stbir__simdf_mult_mem(tot0, c, decode);                                    \
11151+	stbir__simdf_mult_mem(tot1, c, decode + 3);
11152+
11153+#define stbir__2_coeff_only()                                                  \
11154+	stbir__simdf tot0, tot1, c, cs;                                            \
11155+	STBIR_SIMD_NO_UNROLL(decode);                                              \
11156+	stbir__simdf_load2(cs, hc);                                                \
11157+	stbir__simdf_0123to0000(c, cs);                                            \
11158+	stbir__simdf_mult_mem(tot0, c, decode);                                    \
11159+	stbir__simdf_mult_mem(tot1, c, decode + 3);                                \
11160+	stbir__simdf_0123to1111(c, cs);                                            \
11161+	stbir__simdf_madd_mem(tot0, tot0, c, decode + 7);                          \
11162+	stbir__simdf_madd_mem(tot1, tot1, c, decode + 10);
11163+
11164+#define stbir__3_coeff_only()                                                  \
11165+	stbir__simdf tot0, tot1, c, cs;                                            \
11166+	STBIR_SIMD_NO_UNROLL(decode);                                              \
11167+	stbir__simdf_load(cs, hc);                                                 \
11168+	stbir__simdf_0123to0000(c, cs);                                            \
11169+	stbir__simdf_mult_mem(tot0, c, decode);                                    \
11170+	stbir__simdf_mult_mem(tot1, c, decode + 3);                                \
11171+	stbir__simdf_0123to1111(c, cs);                                            \
11172+	stbir__simdf_madd_mem(tot0, tot0, c, decode + 7);                          \
11173+	stbir__simdf_madd_mem(tot1, tot1, c, decode + 10);                         \
11174+	stbir__simdf_0123to2222(c, cs);                                            \
11175+	stbir__simdf_madd_mem(tot0, tot0, c, decode + 14);                         \
11176+	stbir__simdf_madd_mem(tot1, tot1, c, decode + 17);
11177+
11178+#define stbir__store_output_tiny()                                             \
11179+	stbir__simdf_store(output + 3, tot1);                                      \
11180+	stbir__simdf_store(output, tot0);                                          \
11181+	horizontal_coefficients += coefficient_width;                              \
11182+	++horizontal_contributors;                                                 \
11183+	output += 7;
11184 
11185 #ifdef STBIR_SIMD8
11186 
11187-#define stbir__4_coeff_start()                     \
11188-    stbir__simdf8 tot0,tot1,c,cs;                  \
11189-    STBIR_SIMD_NO_UNROLL(decode);                  \
11190-    stbir__simdf8_load4b( cs, hc );                \
11191-    stbir__simdf8_0123to00000000( c, cs );         \
11192-    stbir__simdf8_mult_mem( tot0, c, decode );     \
11193-    stbir__simdf8_0123to11111111( c, cs );         \
11194-    stbir__simdf8_mult_mem( tot1, c, decode+7 );   \
11195-    stbir__simdf8_0123to22222222( c, cs );         \
11196-    stbir__simdf8_madd_mem( tot0, tot0, c, decode+14 );  \
11197-    stbir__simdf8_0123to33333333( c, cs );         \
11198-    stbir__simdf8_madd_mem( tot1, tot1, c, decode+21 );
11199-
11200-#define stbir__4_coeff_continue_from_4( ofs )                   \
11201-    STBIR_SIMD_NO_UNROLL(decode);                               \
11202-    stbir__simdf8_load4b( cs, hc + (ofs) );                     \
11203-    stbir__simdf8_0123to00000000( c, cs );                      \
11204-    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 );    \
11205-    stbir__simdf8_0123to11111111( c, cs );                      \
11206-    stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+7 );  \
11207-    stbir__simdf8_0123to22222222( c, cs );                      \
11208-    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 ); \
11209-    stbir__simdf8_0123to33333333( c, cs );                      \
11210-    stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+21 );
11211-
11212-#define stbir__1_coeff_remnant( ofs )                           \
11213-    STBIR_SIMD_NO_UNROLL(decode);                               \
11214-    stbir__simdf8_load1b( c, hc + (ofs) );                      \
11215-    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 );
11216-
11217-#define stbir__2_coeff_remnant( ofs )                           \
11218-    STBIR_SIMD_NO_UNROLL(decode);                               \
11219-    stbir__simdf8_load1b( c, hc + (ofs) );                      \
11220-    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 );    \
11221-    stbir__simdf8_load1b( c, hc + (ofs)+1 );                    \
11222-    stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+7 );
11223-
11224-#define stbir__3_coeff_remnant( ofs )                           \
11225-    STBIR_SIMD_NO_UNROLL(decode);                               \
11226-    stbir__simdf8_load4b( cs, hc + (ofs) );                     \
11227-    stbir__simdf8_0123to00000000( c, cs );                      \
11228-    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 );    \
11229-    stbir__simdf8_0123to11111111( c, cs );                      \
11230-    stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+7 );  \
11231-    stbir__simdf8_0123to22222222( c, cs );                      \
11232-    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 );
11233-
11234-#define stbir__store_output()                     \
11235-    stbir__simdf8_add( tot0, tot0, tot1 );        \
11236-    horizontal_coefficients += coefficient_width; \
11237-    ++horizontal_contributors;                    \
11238-    output += 7;                                  \
11239-    if ( output < output_end )                    \
11240-    {                                             \
11241-      stbir__simdf8_store( output-7, tot0 );      \
11242-      continue;                                   \
11243-    }                                             \
11244-    stbir__simdf_store( output-7+3, stbir__simdf_swiz(stbir__simdf8_gettop4(tot0),0,0,1,2) ); \
11245-    stbir__simdf_store( output-7, stbir__if_simdf8_cast_to_simdf4(tot0) );           \
11246-    break;
11247+#define stbir__4_coeff_start()                                                 \
11248+	stbir__simdf8 tot0, tot1, c, cs;                                           \
11249+	STBIR_SIMD_NO_UNROLL(decode);                                              \
11250+	stbir__simdf8_load4b(cs, hc);                                              \
11251+	stbir__simdf8_0123to00000000(c, cs);                                       \
11252+	stbir__simdf8_mult_mem(tot0, c, decode);                                   \
11253+	stbir__simdf8_0123to11111111(c, cs);                                       \
11254+	stbir__simdf8_mult_mem(tot1, c, decode + 7);                               \
11255+	stbir__simdf8_0123to22222222(c, cs);                                       \
11256+	stbir__simdf8_madd_mem(tot0, tot0, c, decode + 14);                        \
11257+	stbir__simdf8_0123to33333333(c, cs);                                       \
11258+	stbir__simdf8_madd_mem(tot1, tot1, c, decode + 21);
11259+
11260+#define stbir__4_coeff_continue_from_4(ofs)                                    \
11261+	STBIR_SIMD_NO_UNROLL(decode);                                              \
11262+	stbir__simdf8_load4b(cs, hc + (ofs));                                      \
11263+	stbir__simdf8_0123to00000000(c, cs);                                       \
11264+	stbir__simdf8_madd_mem(tot0, tot0, c, decode + (ofs) * 7);                 \
11265+	stbir__simdf8_0123to11111111(c, cs);                                       \
11266+	stbir__simdf8_madd_mem(tot1, tot1, c, decode + (ofs) * 7 + 7);             \
11267+	stbir__simdf8_0123to22222222(c, cs);                                       \
11268+	stbir__simdf8_madd_mem(tot0, tot0, c, decode + (ofs) * 7 + 14);            \
11269+	stbir__simdf8_0123to33333333(c, cs);                                       \
11270+	stbir__simdf8_madd_mem(tot1, tot1, c, decode + (ofs) * 7 + 21);
11271+
11272+#define stbir__1_coeff_remnant(ofs)                                            \
11273+	STBIR_SIMD_NO_UNROLL(decode);                                              \
11274+	stbir__simdf8_load1b(c, hc + (ofs));                                       \
11275+	stbir__simdf8_madd_mem(tot0, tot0, c, decode + (ofs) * 7);
11276+
11277+#define stbir__2_coeff_remnant(ofs)                                            \
11278+	STBIR_SIMD_NO_UNROLL(decode);                                              \
11279+	stbir__simdf8_load1b(c, hc + (ofs));                                       \
11280+	stbir__simdf8_madd_mem(tot0, tot0, c, decode + (ofs) * 7);                 \
11281+	stbir__simdf8_load1b(c, hc + (ofs) + 1);                                   \
11282+	stbir__simdf8_madd_mem(tot1, tot1, c, decode + (ofs) * 7 + 7);
11283+
11284+#define stbir__3_coeff_remnant(ofs)                                            \
11285+	STBIR_SIMD_NO_UNROLL(decode);                                              \
11286+	stbir__simdf8_load4b(cs, hc + (ofs));                                      \
11287+	stbir__simdf8_0123to00000000(c, cs);                                       \
11288+	stbir__simdf8_madd_mem(tot0, tot0, c, decode + (ofs) * 7);                 \
11289+	stbir__simdf8_0123to11111111(c, cs);                                       \
11290+	stbir__simdf8_madd_mem(tot1, tot1, c, decode + (ofs) * 7 + 7);             \
11291+	stbir__simdf8_0123to22222222(c, cs);                                       \
11292+	stbir__simdf8_madd_mem(tot0, tot0, c, decode + (ofs) * 7 + 14);
11293+
11294+#define stbir__store_output()                                                  \
11295+	stbir__simdf8_add(tot0, tot0, tot1);                                       \
11296+	horizontal_coefficients += coefficient_width;                              \
11297+	++horizontal_contributors;                                                 \
11298+	output += 7;                                                               \
11299+	if (output < output_end) {                                                 \
11300+		stbir__simdf8_store(output - 7, tot0);                                 \
11301+		continue;                                                              \
11302+	}                                                                          \
11303+	stbir__simdf_store(                                                        \
11304+	    output - 7 + 3,                                                        \
11305+	    stbir__simdf_swiz(stbir__simdf8_gettop4(tot0), 0, 0, 1, 2));           \
11306+	stbir__simdf_store(output - 7, stbir__if_simdf8_cast_to_simdf4(tot0));     \
11307+	break;
11308 
11309 #else
11310 
11311-#define stbir__4_coeff_start()                    \
11312-    stbir__simdf tot0,tot1,tot2,tot3,c,cs;        \
11313-    STBIR_SIMD_NO_UNROLL(decode);                 \
11314-    stbir__simdf_load( cs, hc );                  \
11315-    stbir__simdf_0123to0000( c, cs );             \
11316-    stbir__simdf_mult_mem( tot0, c, decode );     \
11317-    stbir__simdf_mult_mem( tot1, c, decode+3 );   \
11318-    stbir__simdf_0123to1111( c, cs );             \
11319-    stbir__simdf_mult_mem( tot2, c, decode+7 );   \
11320-    stbir__simdf_mult_mem( tot3, c, decode+10 );  \
11321-    stbir__simdf_0123to2222( c, cs );             \
11322-    stbir__simdf_madd_mem( tot0, tot0, c, decode+14 );  \
11323-    stbir__simdf_madd_mem( tot1, tot1, c, decode+17 );  \
11324-    stbir__simdf_0123to3333( c, cs );                   \
11325-    stbir__simdf_madd_mem( tot2, tot2, c, decode+21 );  \
11326-    stbir__simdf_madd_mem( tot3, tot3, c, decode+24 );
11327-
11328-#define stbir__4_coeff_continue_from_4( ofs )                   \
11329-    STBIR_SIMD_NO_UNROLL(decode);                               \
11330-    stbir__simdf_load( cs, hc + (ofs) );                        \
11331-    stbir__simdf_0123to0000( c, cs );                           \
11332-    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7 );     \
11333-    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+3 );   \
11334-    stbir__simdf_0123to1111( c, cs );                           \
11335-    stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*7+7 );   \
11336-    stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+10 );  \
11337-    stbir__simdf_0123to2222( c, cs );                           \
11338-    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 );  \
11339-    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+17 );  \
11340-    stbir__simdf_0123to3333( c, cs );                           \
11341-    stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*7+21 );  \
11342-    stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+24 );
11343-
11344-#define stbir__1_coeff_remnant( ofs )                           \
11345-    STBIR_SIMD_NO_UNROLL(decode);                               \
11346-    stbir__simdf_load1( c, hc + (ofs) );                        \
11347-    stbir__simdf_0123to0000( c, c );                            \
11348-    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7 );     \
11349-    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+3 );   \
11350-
11351-#define stbir__2_coeff_remnant( ofs )                           \
11352-    STBIR_SIMD_NO_UNROLL(decode);                               \
11353-    stbir__simdf_load2( cs, hc + (ofs) );                       \
11354-    stbir__simdf_0123to0000( c, cs );                           \
11355-    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7 );     \
11356-    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+3 );   \
11357-    stbir__simdf_0123to1111( c, cs );                           \
11358-    stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*7+7 );   \
11359-    stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+10 );
11360-
11361-#define stbir__3_coeff_remnant( ofs )                           \
11362-    STBIR_SIMD_NO_UNROLL(decode);                               \
11363-    stbir__simdf_load( cs, hc + (ofs) );                        \
11364-    stbir__simdf_0123to0000( c, cs );                           \
11365-    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7 );     \
11366-    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+3 );   \
11367-    stbir__simdf_0123to1111( c, cs );                           \
11368-    stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*7+7 );   \
11369-    stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+10 );  \
11370-    stbir__simdf_0123to2222( c, cs );                           \
11371-    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 );  \
11372-    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+17 );
11373-
11374-#define stbir__store_output()                     \
11375-    stbir__simdf_add( tot0, tot0, tot2 );         \
11376-    stbir__simdf_add( tot1, tot1, tot3 );         \
11377-    stbir__simdf_store( output+3, tot1 );         \
11378-    stbir__simdf_store( output, tot0 );           \
11379-    horizontal_coefficients += coefficient_width; \
11380-    ++horizontal_contributors;                    \
11381-    output += 7;
11382+#define stbir__4_coeff_start()                                                 \
11383+	stbir__simdf tot0, tot1, tot2, tot3, c, cs;                                \
11384+	STBIR_SIMD_NO_UNROLL(decode);                                              \
11385+	stbir__simdf_load(cs, hc);                                                 \
11386+	stbir__simdf_0123to0000(c, cs);                                            \
11387+	stbir__simdf_mult_mem(tot0, c, decode);                                    \
11388+	stbir__simdf_mult_mem(tot1, c, decode + 3);                                \
11389+	stbir__simdf_0123to1111(c, cs);                                            \
11390+	stbir__simdf_mult_mem(tot2, c, decode + 7);                                \
11391+	stbir__simdf_mult_mem(tot3, c, decode + 10);                               \
11392+	stbir__simdf_0123to2222(c, cs);                                            \
11393+	stbir__simdf_madd_mem(tot0, tot0, c, decode + 14);                         \
11394+	stbir__simdf_madd_mem(tot1, tot1, c, decode + 17);                         \
11395+	stbir__simdf_0123to3333(c, cs);                                            \
11396+	stbir__simdf_madd_mem(tot2, tot2, c, decode + 21);                         \
11397+	stbir__simdf_madd_mem(tot3, tot3, c, decode + 24);
11398+
11399+#define stbir__4_coeff_continue_from_4(ofs)                                    \
11400+	STBIR_SIMD_NO_UNROLL(decode);                                              \
11401+	stbir__simdf_load(cs, hc + (ofs));                                         \
11402+	stbir__simdf_0123to0000(c, cs);                                            \
11403+	stbir__simdf_madd_mem(tot0, tot0, c, decode + (ofs) * 7);                  \
11404+	stbir__simdf_madd_mem(tot1, tot1, c, decode + (ofs) * 7 + 3);              \
11405+	stbir__simdf_0123to1111(c, cs);                                            \
11406+	stbir__simdf_madd_mem(tot2, tot2, c, decode + (ofs) * 7 + 7);              \
11407+	stbir__simdf_madd_mem(tot3, tot3, c, decode + (ofs) * 7 + 10);             \
11408+	stbir__simdf_0123to2222(c, cs);                                            \
11409+	stbir__simdf_madd_mem(tot0, tot0, c, decode + (ofs) * 7 + 14);             \
11410+	stbir__simdf_madd_mem(tot1, tot1, c, decode + (ofs) * 7 + 17);             \
11411+	stbir__simdf_0123to3333(c, cs);                                            \
11412+	stbir__simdf_madd_mem(tot2, tot2, c, decode + (ofs) * 7 + 21);             \
11413+	stbir__simdf_madd_mem(tot3, tot3, c, decode + (ofs) * 7 + 24);
11414+
11415+#define stbir__1_coeff_remnant(ofs)                                            \
11416+	STBIR_SIMD_NO_UNROLL(decode);                                              \
11417+	stbir__simdf_load1(c, hc + (ofs));                                         \
11418+	stbir__simdf_0123to0000(c, c);                                             \
11419+	stbir__simdf_madd_mem(tot0, tot0, c, decode + (ofs) * 7);                  \
11420+	stbir__simdf_madd_mem(tot1, tot1, c, decode + (ofs) * 7 + 3);
11421+
11422+#define stbir__2_coeff_remnant(ofs)                                            \
11423+	STBIR_SIMD_NO_UNROLL(decode);                                              \
11424+	stbir__simdf_load2(cs, hc + (ofs));                                        \
11425+	stbir__simdf_0123to0000(c, cs);                                            \
11426+	stbir__simdf_madd_mem(tot0, tot0, c, decode + (ofs) * 7);                  \
11427+	stbir__simdf_madd_mem(tot1, tot1, c, decode + (ofs) * 7 + 3);              \
11428+	stbir__simdf_0123to1111(c, cs);                                            \
11429+	stbir__simdf_madd_mem(tot2, tot2, c, decode + (ofs) * 7 + 7);              \
11430+	stbir__simdf_madd_mem(tot3, tot3, c, decode + (ofs) * 7 + 10);
11431+
11432+#define stbir__3_coeff_remnant(ofs)                                            \
11433+	STBIR_SIMD_NO_UNROLL(decode);                                              \
11434+	stbir__simdf_load(cs, hc + (ofs));                                         \
11435+	stbir__simdf_0123to0000(c, cs);                                            \
11436+	stbir__simdf_madd_mem(tot0, tot0, c, decode + (ofs) * 7);                  \
11437+	stbir__simdf_madd_mem(tot1, tot1, c, decode + (ofs) * 7 + 3);              \
11438+	stbir__simdf_0123to1111(c, cs);                                            \
11439+	stbir__simdf_madd_mem(tot2, tot2, c, decode + (ofs) * 7 + 7);              \
11440+	stbir__simdf_madd_mem(tot3, tot3, c, decode + (ofs) * 7 + 10);             \
11441+	stbir__simdf_0123to2222(c, cs);                                            \
11442+	stbir__simdf_madd_mem(tot0, tot0, c, decode + (ofs) * 7 + 14);             \
11443+	stbir__simdf_madd_mem(tot1, tot1, c, decode + (ofs) * 7 + 17);
11444+
11445+#define stbir__store_output()                                                  \
11446+	stbir__simdf_add(tot0, tot0, tot2);                                        \
11447+	stbir__simdf_add(tot1, tot1, tot3);                                        \
11448+	stbir__simdf_store(output + 3, tot1);                                      \
11449+	stbir__simdf_store(output, tot0);                                          \
11450+	horizontal_coefficients += coefficient_width;                              \
11451+	++horizontal_contributors;                                                 \
11452+	output += 7;
11453 
11454 #endif
11455 
11456 #else
11457 
11458-#define stbir__1_coeff_only()        \
11459-    float tot0, tot1, tot2, tot3, tot4, tot5, tot6, c; \
11460-    c = hc[0];                       \
11461-    tot0 = decode[0]*c;              \
11462-    tot1 = decode[1]*c;              \
11463-    tot2 = decode[2]*c;              \
11464-    tot3 = decode[3]*c;              \
11465-    tot4 = decode[4]*c;              \
11466-    tot5 = decode[5]*c;              \
11467-    tot6 = decode[6]*c;
11468-
11469-#define stbir__2_coeff_only()        \
11470-    float tot0, tot1, tot2, tot3, tot4, tot5, tot6, c; \
11471-    c = hc[0];                       \
11472-    tot0 = decode[0]*c;              \
11473-    tot1 = decode[1]*c;              \
11474-    tot2 = decode[2]*c;              \
11475-    tot3 = decode[3]*c;              \
11476-    tot4 = decode[4]*c;              \
11477-    tot5 = decode[5]*c;              \
11478-    tot6 = decode[6]*c;              \
11479-    c = hc[1];                       \
11480-    tot0 += decode[7]*c;             \
11481-    tot1 += decode[8]*c;             \
11482-    tot2 += decode[9]*c;             \
11483-    tot3 += decode[10]*c;            \
11484-    tot4 += decode[11]*c;            \
11485-    tot5 += decode[12]*c;            \
11486-    tot6 += decode[13]*c;            \
11487-
11488-#define stbir__3_coeff_only()        \
11489-    float tot0, tot1, tot2, tot3, tot4, tot5, tot6, c; \
11490-    c = hc[0];                       \
11491-    tot0 = decode[0]*c;              \
11492-    tot1 = decode[1]*c;              \
11493-    tot2 = decode[2]*c;              \
11494-    tot3 = decode[3]*c;              \
11495-    tot4 = decode[4]*c;              \
11496-    tot5 = decode[5]*c;              \
11497-    tot6 = decode[6]*c;              \
11498-    c = hc[1];                       \
11499-    tot0 += decode[7]*c;             \
11500-    tot1 += decode[8]*c;             \
11501-    tot2 += decode[9]*c;             \
11502-    tot3 += decode[10]*c;            \
11503-    tot4 += decode[11]*c;            \
11504-    tot5 += decode[12]*c;            \
11505-    tot6 += decode[13]*c;            \
11506-    c = hc[2];                       \
11507-    tot0 += decode[14]*c;            \
11508-    tot1 += decode[15]*c;            \
11509-    tot2 += decode[16]*c;            \
11510-    tot3 += decode[17]*c;            \
11511-    tot4 += decode[18]*c;            \
11512-    tot5 += decode[19]*c;            \
11513-    tot6 += decode[20]*c;            \
11514-
11515-#define stbir__store_output_tiny()                \
11516-    output[0] = tot0;                             \
11517-    output[1] = tot1;                             \
11518-    output[2] = tot2;                             \
11519-    output[3] = tot3;                             \
11520-    output[4] = tot4;                             \
11521-    output[5] = tot5;                             \
11522-    output[6] = tot6;                             \
11523-    horizontal_coefficients += coefficient_width; \
11524-    ++horizontal_contributors;                    \
11525-    output += 7;
11526-
11527-#define stbir__4_coeff_start()    \
11528-    float x0,x1,x2,x3,x4,x5,x6,y0,y1,y2,y3,y4,y5,y6,c; \
11529-    STBIR_SIMD_NO_UNROLL(decode); \
11530-    c = hc[0];                    \
11531-    x0 = decode[0] * c;           \
11532-    x1 = decode[1] * c;           \
11533-    x2 = decode[2] * c;           \
11534-    x3 = decode[3] * c;           \
11535-    x4 = decode[4] * c;           \
11536-    x5 = decode[5] * c;           \
11537-    x6 = decode[6] * c;           \
11538-    c = hc[1];                    \
11539-    y0 = decode[7] * c;           \
11540-    y1 = decode[8] * c;           \
11541-    y2 = decode[9] * c;           \
11542-    y3 = decode[10] * c;          \
11543-    y4 = decode[11] * c;          \
11544-    y5 = decode[12] * c;          \
11545-    y6 = decode[13] * c;          \
11546-    c = hc[2];                    \
11547-    x0 += decode[14] * c;         \
11548-    x1 += decode[15] * c;         \
11549-    x2 += decode[16] * c;         \
11550-    x3 += decode[17] * c;         \
11551-    x4 += decode[18] * c;         \
11552-    x5 += decode[19] * c;         \
11553-    x6 += decode[20] * c;         \
11554-    c = hc[3];                    \
11555-    y0 += decode[21] * c;         \
11556-    y1 += decode[22] * c;         \
11557-    y2 += decode[23] * c;         \
11558-    y3 += decode[24] * c;         \
11559-    y4 += decode[25] * c;         \
11560-    y5 += decode[26] * c;         \
11561-    y6 += decode[27] * c;
11562-
11563-#define stbir__4_coeff_continue_from_4( ofs ) \
11564-    STBIR_SIMD_NO_UNROLL(decode);  \
11565-    c = hc[0+(ofs)];               \
11566-    x0 += decode[0+(ofs)*7] * c;   \
11567-    x1 += decode[1+(ofs)*7] * c;   \
11568-    x2 += decode[2+(ofs)*7] * c;   \
11569-    x3 += decode[3+(ofs)*7] * c;   \
11570-    x4 += decode[4+(ofs)*7] * c;   \
11571-    x5 += decode[5+(ofs)*7] * c;   \
11572-    x6 += decode[6+(ofs)*7] * c;   \
11573-    c = hc[1+(ofs)];               \
11574-    y0 += decode[7+(ofs)*7] * c;   \
11575-    y1 += decode[8+(ofs)*7] * c;   \
11576-    y2 += decode[9+(ofs)*7] * c;   \
11577-    y3 += decode[10+(ofs)*7] * c;  \
11578-    y4 += decode[11+(ofs)*7] * c;  \
11579-    y5 += decode[12+(ofs)*7] * c;  \
11580-    y6 += decode[13+(ofs)*7] * c;  \
11581-    c = hc[2+(ofs)];               \
11582-    x0 += decode[14+(ofs)*7] * c;  \
11583-    x1 += decode[15+(ofs)*7] * c;  \
11584-    x2 += decode[16+(ofs)*7] * c;  \
11585-    x3 += decode[17+(ofs)*7] * c;  \
11586-    x4 += decode[18+(ofs)*7] * c;  \
11587-    x5 += decode[19+(ofs)*7] * c;  \
11588-    x6 += decode[20+(ofs)*7] * c;  \
11589-    c = hc[3+(ofs)];               \
11590-    y0 += decode[21+(ofs)*7] * c;  \
11591-    y1 += decode[22+(ofs)*7] * c;  \
11592-    y2 += decode[23+(ofs)*7] * c;  \
11593-    y3 += decode[24+(ofs)*7] * c;  \
11594-    y4 += decode[25+(ofs)*7] * c;  \
11595-    y5 += decode[26+(ofs)*7] * c;  \
11596-    y6 += decode[27+(ofs)*7] * c;
11597-
11598-#define stbir__1_coeff_remnant( ofs ) \
11599-    STBIR_SIMD_NO_UNROLL(decode);  \
11600-    c = hc[0+(ofs)];               \
11601-    x0 += decode[0+(ofs)*7] * c;   \
11602-    x1 += decode[1+(ofs)*7] * c;   \
11603-    x2 += decode[2+(ofs)*7] * c;   \
11604-    x3 += decode[3+(ofs)*7] * c;   \
11605-    x4 += decode[4+(ofs)*7] * c;   \
11606-    x5 += decode[5+(ofs)*7] * c;   \
11607-    x6 += decode[6+(ofs)*7] * c;   \
11608-
11609-#define stbir__2_coeff_remnant( ofs ) \
11610-    STBIR_SIMD_NO_UNROLL(decode);  \
11611-    c = hc[0+(ofs)];               \
11612-    x0 += decode[0+(ofs)*7] * c;   \
11613-    x1 += decode[1+(ofs)*7] * c;   \
11614-    x2 += decode[2+(ofs)*7] * c;   \
11615-    x3 += decode[3+(ofs)*7] * c;   \
11616-    x4 += decode[4+(ofs)*7] * c;   \
11617-    x5 += decode[5+(ofs)*7] * c;   \
11618-    x6 += decode[6+(ofs)*7] * c;   \
11619-    c = hc[1+(ofs)];               \
11620-    y0 += decode[7+(ofs)*7] * c;   \
11621-    y1 += decode[8+(ofs)*7] * c;   \
11622-    y2 += decode[9+(ofs)*7] * c;   \
11623-    y3 += decode[10+(ofs)*7] * c;  \
11624-    y4 += decode[11+(ofs)*7] * c;  \
11625-    y5 += decode[12+(ofs)*7] * c;  \
11626-    y6 += decode[13+(ofs)*7] * c;  \
11627-
11628-#define stbir__3_coeff_remnant( ofs ) \
11629-    STBIR_SIMD_NO_UNROLL(decode);  \
11630-    c = hc[0+(ofs)];               \
11631-    x0 += decode[0+(ofs)*7] * c;   \
11632-    x1 += decode[1+(ofs)*7] * c;   \
11633-    x2 += decode[2+(ofs)*7] * c;   \
11634-    x3 += decode[3+(ofs)*7] * c;   \
11635-    x4 += decode[4+(ofs)*7] * c;   \
11636-    x5 += decode[5+(ofs)*7] * c;   \
11637-    x6 += decode[6+(ofs)*7] * c;   \
11638-    c = hc[1+(ofs)];               \
11639-    y0 += decode[7+(ofs)*7] * c;   \
11640-    y1 += decode[8+(ofs)*7] * c;   \
11641-    y2 += decode[9+(ofs)*7] * c;   \
11642-    y3 += decode[10+(ofs)*7] * c;  \
11643-    y4 += decode[11+(ofs)*7] * c;  \
11644-    y5 += decode[12+(ofs)*7] * c;  \
11645-    y6 += decode[13+(ofs)*7] * c;  \
11646-    c = hc[2+(ofs)];               \
11647-    x0 += decode[14+(ofs)*7] * c;  \
11648-    x1 += decode[15+(ofs)*7] * c;  \
11649-    x2 += decode[16+(ofs)*7] * c;  \
11650-    x3 += decode[17+(ofs)*7] * c;  \
11651-    x4 += decode[18+(ofs)*7] * c;  \
11652-    x5 += decode[19+(ofs)*7] * c;  \
11653-    x6 += decode[20+(ofs)*7] * c;  \
11654-
11655-#define stbir__store_output()                     \
11656-    output[0] = x0 + y0;                          \
11657-    output[1] = x1 + y1;                          \
11658-    output[2] = x2 + y2;                          \
11659-    output[3] = x3 + y3;                          \
11660-    output[4] = x4 + y4;                          \
11661-    output[5] = x5 + y5;                          \
11662-    output[6] = x6 + y6;                          \
11663-    horizontal_coefficients += coefficient_width; \
11664-    ++horizontal_contributors;                    \
11665-    output += 7;
11666+#define stbir__1_coeff_only()                                                  \
11667+	float tot0, tot1, tot2, tot3, tot4, tot5, tot6, c;                         \
11668+	c = hc[0];                                                                 \
11669+	tot0 = decode[0] * c;                                                      \
11670+	tot1 = decode[1] * c;                                                      \
11671+	tot2 = decode[2] * c;                                                      \
11672+	tot3 = decode[3] * c;                                                      \
11673+	tot4 = decode[4] * c;                                                      \
11674+	tot5 = decode[5] * c;                                                      \
11675+	tot6 = decode[6] * c;
11676+
11677+#define stbir__2_coeff_only()                                                  \
11678+	float tot0, tot1, tot2, tot3, tot4, tot5, tot6, c;                         \
11679+	c = hc[0];                                                                 \
11680+	tot0 = decode[0] * c;                                                      \
11681+	tot1 = decode[1] * c;                                                      \
11682+	tot2 = decode[2] * c;                                                      \
11683+	tot3 = decode[3] * c;                                                      \
11684+	tot4 = decode[4] * c;                                                      \
11685+	tot5 = decode[5] * c;                                                      \
11686+	tot6 = decode[6] * c;                                                      \
11687+	c = hc[1];                                                                 \
11688+	tot0 += decode[7] * c;                                                     \
11689+	tot1 += decode[8] * c;                                                     \
11690+	tot2 += decode[9] * c;                                                     \
11691+	tot3 += decode[10] * c;                                                    \
11692+	tot4 += decode[11] * c;                                                    \
11693+	tot5 += decode[12] * c;                                                    \
11694+	tot6 += decode[13] * c;
11695+
11696+#define stbir__3_coeff_only()                                                  \
11697+	float tot0, tot1, tot2, tot3, tot4, tot5, tot6, c;                         \
11698+	c = hc[0];                                                                 \
11699+	tot0 = decode[0] * c;                                                      \
11700+	tot1 = decode[1] * c;                                                      \
11701+	tot2 = decode[2] * c;                                                      \
11702+	tot3 = decode[3] * c;                                                      \
11703+	tot4 = decode[4] * c;                                                      \
11704+	tot5 = decode[5] * c;                                                      \
11705+	tot6 = decode[6] * c;                                                      \
11706+	c = hc[1];                                                                 \
11707+	tot0 += decode[7] * c;                                                     \
11708+	tot1 += decode[8] * c;                                                     \
11709+	tot2 += decode[9] * c;                                                     \
11710+	tot3 += decode[10] * c;                                                    \
11711+	tot4 += decode[11] * c;                                                    \
11712+	tot5 += decode[12] * c;                                                    \
11713+	tot6 += decode[13] * c;                                                    \
11714+	c = hc[2];                                                                 \
11715+	tot0 += decode[14] * c;                                                    \
11716+	tot1 += decode[15] * c;                                                    \
11717+	tot2 += decode[16] * c;                                                    \
11718+	tot3 += decode[17] * c;                                                    \
11719+	tot4 += decode[18] * c;                                                    \
11720+	tot5 += decode[19] * c;                                                    \
11721+	tot6 += decode[20] * c;
11722+
11723+#define stbir__store_output_tiny()                                             \
11724+	output[0] = tot0;                                                          \
11725+	output[1] = tot1;                                                          \
11726+	output[2] = tot2;                                                          \
11727+	output[3] = tot3;                                                          \
11728+	output[4] = tot4;                                                          \
11729+	output[5] = tot5;                                                          \
11730+	output[6] = tot6;                                                          \
11731+	horizontal_coefficients += coefficient_width;                              \
11732+	++horizontal_contributors;                                                 \
11733+	output += 7;
11734+
11735+#define stbir__4_coeff_start()                                                 \
11736+	float x0, x1, x2, x3, x4, x5, x6, y0, y1, y2, y3, y4, y5, y6, c;           \
11737+	STBIR_SIMD_NO_UNROLL(decode);                                              \
11738+	c = hc[0];                                                                 \
11739+	x0 = decode[0] * c;                                                        \
11740+	x1 = decode[1] * c;                                                        \
11741+	x2 = decode[2] * c;                                                        \
11742+	x3 = decode[3] * c;                                                        \
11743+	x4 = decode[4] * c;                                                        \
11744+	x5 = decode[5] * c;                                                        \
11745+	x6 = decode[6] * c;                                                        \
11746+	c = hc[1];                                                                 \
11747+	y0 = decode[7] * c;                                                        \
11748+	y1 = decode[8] * c;                                                        \
11749+	y2 = decode[9] * c;                                                        \
11750+	y3 = decode[10] * c;                                                       \
11751+	y4 = decode[11] * c;                                                       \
11752+	y5 = decode[12] * c;                                                       \
11753+	y6 = decode[13] * c;                                                       \
11754+	c = hc[2];                                                                 \
11755+	x0 += decode[14] * c;                                                      \
11756+	x1 += decode[15] * c;                                                      \
11757+	x2 += decode[16] * c;                                                      \
11758+	x3 += decode[17] * c;                                                      \
11759+	x4 += decode[18] * c;                                                      \
11760+	x5 += decode[19] * c;                                                      \
11761+	x6 += decode[20] * c;                                                      \
11762+	c = hc[3];                                                                 \
11763+	y0 += decode[21] * c;                                                      \
11764+	y1 += decode[22] * c;                                                      \
11765+	y2 += decode[23] * c;                                                      \
11766+	y3 += decode[24] * c;                                                      \
11767+	y4 += decode[25] * c;                                                      \
11768+	y5 += decode[26] * c;                                                      \
11769+	y6 += decode[27] * c;
11770+
11771+#define stbir__4_coeff_continue_from_4(ofs)                                    \
11772+	STBIR_SIMD_NO_UNROLL(decode);                                              \
11773+	c = hc[0 + (ofs)];                                                         \
11774+	x0 += decode[0 + (ofs) * 7] * c;                                           \
11775+	x1 += decode[1 + (ofs) * 7] * c;                                           \
11776+	x2 += decode[2 + (ofs) * 7] * c;                                           \
11777+	x3 += decode[3 + (ofs) * 7] * c;                                           \
11778+	x4 += decode[4 + (ofs) * 7] * c;                                           \
11779+	x5 += decode[5 + (ofs) * 7] * c;                                           \
11780+	x6 += decode[6 + (ofs) * 7] * c;                                           \
11781+	c = hc[1 + (ofs)];                                                         \
11782+	y0 += decode[7 + (ofs) * 7] * c;                                           \
11783+	y1 += decode[8 + (ofs) * 7] * c;                                           \
11784+	y2 += decode[9 + (ofs) * 7] * c;                                           \
11785+	y3 += decode[10 + (ofs) * 7] * c;                                          \
11786+	y4 += decode[11 + (ofs) * 7] * c;                                          \
11787+	y5 += decode[12 + (ofs) * 7] * c;                                          \
11788+	y6 += decode[13 + (ofs) * 7] * c;                                          \
11789+	c = hc[2 + (ofs)];                                                         \
11790+	x0 += decode[14 + (ofs) * 7] * c;                                          \
11791+	x1 += decode[15 + (ofs) * 7] * c;                                          \
11792+	x2 += decode[16 + (ofs) * 7] * c;                                          \
11793+	x3 += decode[17 + (ofs) * 7] * c;                                          \
11794+	x4 += decode[18 + (ofs) * 7] * c;                                          \
11795+	x5 += decode[19 + (ofs) * 7] * c;                                          \
11796+	x6 += decode[20 + (ofs) * 7] * c;                                          \
11797+	c = hc[3 + (ofs)];                                                         \
11798+	y0 += decode[21 + (ofs) * 7] * c;                                          \
11799+	y1 += decode[22 + (ofs) * 7] * c;                                          \
11800+	y2 += decode[23 + (ofs) * 7] * c;                                          \
11801+	y3 += decode[24 + (ofs) * 7] * c;                                          \
11802+	y4 += decode[25 + (ofs) * 7] * c;                                          \
11803+	y5 += decode[26 + (ofs) * 7] * c;                                          \
11804+	y6 += decode[27 + (ofs) * 7] * c;
11805+
11806+#define stbir__1_coeff_remnant(ofs)                                            \
11807+	STBIR_SIMD_NO_UNROLL(decode);                                              \
11808+	c = hc[0 + (ofs)];                                                         \
11809+	x0 += decode[0 + (ofs) * 7] * c;                                           \
11810+	x1 += decode[1 + (ofs) * 7] * c;                                           \
11811+	x2 += decode[2 + (ofs) * 7] * c;                                           \
11812+	x3 += decode[3 + (ofs) * 7] * c;                                           \
11813+	x4 += decode[4 + (ofs) * 7] * c;                                           \
11814+	x5 += decode[5 + (ofs) * 7] * c;                                           \
11815+	x6 += decode[6 + (ofs) * 7] * c;
11816+
11817+#define stbir__2_coeff_remnant(ofs)                                            \
11818+	STBIR_SIMD_NO_UNROLL(decode);                                              \
11819+	c = hc[0 + (ofs)];                                                         \
11820+	x0 += decode[0 + (ofs) * 7] * c;                                           \
11821+	x1 += decode[1 + (ofs) * 7] * c;                                           \
11822+	x2 += decode[2 + (ofs) * 7] * c;                                           \
11823+	x3 += decode[3 + (ofs) * 7] * c;                                           \
11824+	x4 += decode[4 + (ofs) * 7] * c;                                           \
11825+	x5 += decode[5 + (ofs) * 7] * c;                                           \
11826+	x6 += decode[6 + (ofs) * 7] * c;                                           \
11827+	c = hc[1 + (ofs)];                                                         \
11828+	y0 += decode[7 + (ofs) * 7] * c;                                           \
11829+	y1 += decode[8 + (ofs) * 7] * c;                                           \
11830+	y2 += decode[9 + (ofs) * 7] * c;                                           \
11831+	y3 += decode[10 + (ofs) * 7] * c;                                          \
11832+	y4 += decode[11 + (ofs) * 7] * c;                                          \
11833+	y5 += decode[12 + (ofs) * 7] * c;                                          \
11834+	y6 += decode[13 + (ofs) * 7] * c;
11835+
11836+#define stbir__3_coeff_remnant(ofs)                                            \
11837+	STBIR_SIMD_NO_UNROLL(decode);                                              \
11838+	c = hc[0 + (ofs)];                                                         \
11839+	x0 += decode[0 + (ofs) * 7] * c;                                           \
11840+	x1 += decode[1 + (ofs) * 7] * c;                                           \
11841+	x2 += decode[2 + (ofs) * 7] * c;                                           \
11842+	x3 += decode[3 + (ofs) * 7] * c;                                           \
11843+	x4 += decode[4 + (ofs) * 7] * c;                                           \
11844+	x5 += decode[5 + (ofs) * 7] * c;                                           \
11845+	x6 += decode[6 + (ofs) * 7] * c;                                           \
11846+	c = hc[1 + (ofs)];                                                         \
11847+	y0 += decode[7 + (ofs) * 7] * c;                                           \
11848+	y1 += decode[8 + (ofs) * 7] * c;                                           \
11849+	y2 += decode[9 + (ofs) * 7] * c;                                           \
11850+	y3 += decode[10 + (ofs) * 7] * c;                                          \
11851+	y4 += decode[11 + (ofs) * 7] * c;                                          \
11852+	y5 += decode[12 + (ofs) * 7] * c;                                          \
11853+	y6 += decode[13 + (ofs) * 7] * c;                                          \
11854+	c = hc[2 + (ofs)];                                                         \
11855+	x0 += decode[14 + (ofs) * 7] * c;                                          \
11856+	x1 += decode[15 + (ofs) * 7] * c;                                          \
11857+	x2 += decode[16 + (ofs) * 7] * c;                                          \
11858+	x3 += decode[17 + (ofs) * 7] * c;                                          \
11859+	x4 += decode[18 + (ofs) * 7] * c;                                          \
11860+	x5 += decode[19 + (ofs) * 7] * c;                                          \
11861+	x6 += decode[20 + (ofs) * 7] * c;
11862+
11863+#define stbir__store_output()                                                  \
11864+	output[0] = x0 + y0;                                                       \
11865+	output[1] = x1 + y1;                                                       \
11866+	output[2] = x2 + y2;                                                       \
11867+	output[3] = x3 + y3;                                                       \
11868+	output[4] = x4 + y4;                                                       \
11869+	output[5] = x5 + y5;                                                       \
11870+	output[6] = x6 + y6;                                                       \
11871+	horizontal_coefficients += coefficient_width;                              \
11872+	++horizontal_contributors;                                                 \
11873+	output += 7;
11874 
11875 #endif
11876 
11877@@ -6053,7 +6945,6 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
11878 #define STB_IMAGE_RESIZE_DO_HORIZONTALS
11879 #include STBIR__HEADER_FILENAME
11880 
11881-
11882 // include all of the vertical resamplers (both scatter and gather versions)
11883 
11884 #define STBIR__vertical_channels 1
11885@@ -6128,801 +7019,1081 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
11886 #define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
11887 #include STBIR__HEADER_FILENAME
11888 
11889-typedef void STBIR_VERTICAL_GATHERFUNC( float * output, float const * coeffs, float const ** inputs, float const * input0_end );
11890-
11891-static STBIR_VERTICAL_GATHERFUNC * stbir__vertical_gathers[ 8 ] =
11892-{
11893-  stbir__vertical_gather_with_1_coeffs,stbir__vertical_gather_with_2_coeffs,stbir__vertical_gather_with_3_coeffs,stbir__vertical_gather_with_4_coeffs,stbir__vertical_gather_with_5_coeffs,stbir__vertical_gather_with_6_coeffs,stbir__vertical_gather_with_7_coeffs,stbir__vertical_gather_with_8_coeffs
11894-};
11895-
11896-static STBIR_VERTICAL_GATHERFUNC * stbir__vertical_gathers_continues[ 8 ] =
11897-{
11898-  stbir__vertical_gather_with_1_coeffs_cont,stbir__vertical_gather_with_2_coeffs_cont,stbir__vertical_gather_with_3_coeffs_cont,stbir__vertical_gather_with_4_coeffs_cont,stbir__vertical_gather_with_5_coeffs_cont,stbir__vertical_gather_with_6_coeffs_cont,stbir__vertical_gather_with_7_coeffs_cont,stbir__vertical_gather_with_8_coeffs_cont
11899-};
11900-
11901-typedef void STBIR_VERTICAL_SCATTERFUNC( float ** outputs, float const * coeffs, float const * input, float const * input_end );
11902-
11903-static STBIR_VERTICAL_SCATTERFUNC * stbir__vertical_scatter_sets[ 8 ] =
11904-{
11905-  stbir__vertical_scatter_with_1_coeffs,stbir__vertical_scatter_with_2_coeffs,stbir__vertical_scatter_with_3_coeffs,stbir__vertical_scatter_with_4_coeffs,stbir__vertical_scatter_with_5_coeffs,stbir__vertical_scatter_with_6_coeffs,stbir__vertical_scatter_with_7_coeffs,stbir__vertical_scatter_with_8_coeffs
11906-};
11907-
11908-static STBIR_VERTICAL_SCATTERFUNC * stbir__vertical_scatter_blends[ 8 ] =
11909-{
11910-  stbir__vertical_scatter_with_1_coeffs_cont,stbir__vertical_scatter_with_2_coeffs_cont,stbir__vertical_scatter_with_3_coeffs_cont,stbir__vertical_scatter_with_4_coeffs_cont,stbir__vertical_scatter_with_5_coeffs_cont,stbir__vertical_scatter_with_6_coeffs_cont,stbir__vertical_scatter_with_7_coeffs_cont,stbir__vertical_scatter_with_8_coeffs_cont
11911-};
11912-
11913-
11914-static void stbir__encode_scanline( stbir__info const * stbir_info, void *output_buffer_data, float * encode_buffer, int row  STBIR_ONLY_PROFILE_GET_SPLIT_INFO )
11915-{
11916-  int num_pixels = stbir_info->horizontal.scale_info.output_sub_size;
11917-  int channels = stbir_info->channels;
11918-  int width_times_channels = num_pixels * channels;
11919-  void * output_buffer;
11920-
11921-  // un-alpha weight if we need to
11922-  if ( stbir_info->alpha_unweight )
11923-  {
11924-    STBIR_PROFILE_START( unalpha );
11925-    stbir_info->alpha_unweight( encode_buffer, width_times_channels );
11926-    STBIR_PROFILE_END( unalpha );
11927-  }
11928-
11929-  // write directly into output by default
11930-  output_buffer = output_buffer_data;
11931-
11932-  // if we have an output callback, we first convert the decode buffer in place (and then hand that to the callback)
11933-  if ( stbir_info->out_pixels_cb )
11934-    output_buffer = encode_buffer;
11935-
11936-  STBIR_PROFILE_START( encode );
11937-  // convert into the output buffer
11938-  stbir_info->encode_pixels( output_buffer, width_times_channels, encode_buffer );
11939-  STBIR_PROFILE_END( encode );
11940-
11941-  // if we have an output callback, call it to send the data
11942-  if ( stbir_info->out_pixels_cb )
11943-    stbir_info->out_pixels_cb( output_buffer, num_pixels, row, stbir_info->user_data );
11944+typedef void
11945+STBIR_VERTICAL_GATHERFUNC(float *output, float const *coeffs,
11946+                          float const **inputs, float const *input0_end);
11947+
11948+static STBIR_VERTICAL_GATHERFUNC *stbir__vertical_gathers[8] = {
11949+    stbir__vertical_gather_with_1_coeffs, stbir__vertical_gather_with_2_coeffs,
11950+    stbir__vertical_gather_with_3_coeffs, stbir__vertical_gather_with_4_coeffs,
11951+    stbir__vertical_gather_with_5_coeffs, stbir__vertical_gather_with_6_coeffs,
11952+    stbir__vertical_gather_with_7_coeffs, stbir__vertical_gather_with_8_coeffs};
11953+
11954+static STBIR_VERTICAL_GATHERFUNC *stbir__vertical_gathers_continues[8] = {
11955+    stbir__vertical_gather_with_1_coeffs_cont,
11956+    stbir__vertical_gather_with_2_coeffs_cont,
11957+    stbir__vertical_gather_with_3_coeffs_cont,
11958+    stbir__vertical_gather_with_4_coeffs_cont,
11959+    stbir__vertical_gather_with_5_coeffs_cont,
11960+    stbir__vertical_gather_with_6_coeffs_cont,
11961+    stbir__vertical_gather_with_7_coeffs_cont,
11962+    stbir__vertical_gather_with_8_coeffs_cont};
11963+
11964+typedef void
11965+STBIR_VERTICAL_SCATTERFUNC(float **outputs, float const *coeffs,
11966+                           float const *input, float const *input_end);
11967+
11968+static STBIR_VERTICAL_SCATTERFUNC *stbir__vertical_scatter_sets[8] = {
11969+    stbir__vertical_scatter_with_1_coeffs,
11970+    stbir__vertical_scatter_with_2_coeffs,
11971+    stbir__vertical_scatter_with_3_coeffs,
11972+    stbir__vertical_scatter_with_4_coeffs,
11973+    stbir__vertical_scatter_with_5_coeffs,
11974+    stbir__vertical_scatter_with_6_coeffs,
11975+    stbir__vertical_scatter_with_7_coeffs,
11976+    stbir__vertical_scatter_with_8_coeffs};
11977+
11978+static STBIR_VERTICAL_SCATTERFUNC *stbir__vertical_scatter_blends[8] = {
11979+    stbir__vertical_scatter_with_1_coeffs_cont,
11980+    stbir__vertical_scatter_with_2_coeffs_cont,
11981+    stbir__vertical_scatter_with_3_coeffs_cont,
11982+    stbir__vertical_scatter_with_4_coeffs_cont,
11983+    stbir__vertical_scatter_with_5_coeffs_cont,
11984+    stbir__vertical_scatter_with_6_coeffs_cont,
11985+    stbir__vertical_scatter_with_7_coeffs_cont,
11986+    stbir__vertical_scatter_with_8_coeffs_cont};
11987+
11988+static void
11989+stbir__encode_scanline(stbir__info const *stbir_info, void *output_buffer_data,
11990+                       float *encode_buffer,
11991+                       int row STBIR_ONLY_PROFILE_GET_SPLIT_INFO)
11992+{
11993+	int num_pixels = stbir_info->horizontal.scale_info.output_sub_size;
11994+	int channels = stbir_info->channels;
11995+	int width_times_channels = num_pixels * channels;
11996+	void *output_buffer;
11997+
11998+	// un-alpha weight if we need to
11999+	if (stbir_info->alpha_unweight) {
12000+		STBIR_PROFILE_START(unalpha);
12001+		stbir_info->alpha_unweight(encode_buffer, width_times_channels);
12002+		STBIR_PROFILE_END(unalpha);
12003+	}
12004+
12005+	// write directly into output by default
12006+	output_buffer = output_buffer_data;
12007+
12008+	// if we have an output callback, we first convert the decode buffer in
12009+	// place (and then hand that to the callback)
12010+	if (stbir_info->out_pixels_cb) {
12011+		output_buffer = encode_buffer;
12012+	}
12013+
12014+	STBIR_PROFILE_START(encode);
12015+	// convert into the output buffer
12016+	stbir_info->encode_pixels(output_buffer, width_times_channels,
12017+	                          encode_buffer);
12018+	STBIR_PROFILE_END(encode);
12019+
12020+	// if we have an output callback, call it to send the data
12021+	if (stbir_info->out_pixels_cb) {
12022+		stbir_info->out_pixels_cb(output_buffer, num_pixels, row,
12023+		                          stbir_info->user_data);
12024+	}
12025 }
12026 
12027-
12028 // Get the ring buffer pointer for an index
12029-static float* stbir__get_ring_buffer_entry(stbir__info const * stbir_info, stbir__per_split_info const * split_info, int index )
12030+static float *
12031+stbir__get_ring_buffer_entry(stbir__info const *stbir_info,
12032+                             stbir__per_split_info const *split_info, int index)
12033 {
12034-  STBIR_ASSERT( index < stbir_info->ring_buffer_num_entries );
12035+	STBIR_ASSERT(index < stbir_info->ring_buffer_num_entries);
12036 
12037-  #ifdef STBIR__SEPARATE_ALLOCATIONS
12038-    return split_info->ring_buffers[ index ];
12039-  #else
12040-    return (float*) ( ( (char*) split_info->ring_buffer ) + ( index * stbir_info->ring_buffer_length_bytes ) );
12041-  #endif
12042+#ifdef STBIR__SEPARATE_ALLOCATIONS
12043+	return split_info->ring_buffers[index];
12044+#else
12045+	return (float *)(((char *)split_info->ring_buffer) +
12046+	                 (index * stbir_info->ring_buffer_length_bytes));
12047+#endif
12048 }
12049 
12050 // Get the specified scan line from the ring buffer
12051-static float* stbir__get_ring_buffer_scanline(stbir__info const * stbir_info, stbir__per_split_info const * split_info, int get_scanline)
12052-{
12053-  int ring_buffer_index = (split_info->ring_buffer_begin_index + (get_scanline - split_info->ring_buffer_first_scanline)) % stbir_info->ring_buffer_num_entries;
12054-  return stbir__get_ring_buffer_entry( stbir_info, split_info, ring_buffer_index );
12055-}
12056-
12057-static void stbir__resample_horizontal_gather(stbir__info const * stbir_info, float* output_buffer, float const * input_buffer STBIR_ONLY_PROFILE_GET_SPLIT_INFO )
12058-{
12059-  float const * decode_buffer = input_buffer - ( stbir_info->scanline_extents.conservative.n0 * stbir_info->effective_channels );
12060-
12061-  STBIR_PROFILE_START( horizontal );
12062-  if ( ( stbir_info->horizontal.filter_enum == STBIR_FILTER_POINT_SAMPLE ) && ( stbir_info->horizontal.scale_info.scale == 1.0f ) )
12063-    STBIR_MEMCPY( output_buffer, input_buffer, stbir_info->horizontal.scale_info.output_sub_size * sizeof( float ) * stbir_info->effective_channels );
12064-  else
12065-    stbir_info->horizontal_gather_channels( output_buffer, stbir_info->horizontal.scale_info.output_sub_size, decode_buffer, stbir_info->horizontal.contributors, stbir_info->horizontal.coefficients, stbir_info->horizontal.coefficient_width );
12066-  STBIR_PROFILE_END( horizontal );
12067-}
12068-
12069-static void stbir__resample_vertical_gather(stbir__info const * stbir_info, stbir__per_split_info* split_info, int n, int contrib_n0, int contrib_n1, float const * vertical_coefficients )
12070-{
12071-  float* encode_buffer = split_info->vertical_buffer;
12072-  float* decode_buffer = split_info->decode_buffer;
12073-  int vertical_first = stbir_info->vertical_first;
12074-  int width = (vertical_first) ? ( stbir_info->scanline_extents.conservative.n1-stbir_info->scanline_extents.conservative.n0+1 ) : stbir_info->horizontal.scale_info.output_sub_size;
12075-  int width_times_channels = stbir_info->effective_channels * width;
12076-
12077-  STBIR_ASSERT( stbir_info->vertical.is_gather );
12078-
12079-  // loop over the contributing scanlines and scale into the buffer
12080-  STBIR_PROFILE_START( vertical );
12081-  {
12082-    int k = 0, total = contrib_n1 - contrib_n0 + 1;
12083-    STBIR_ASSERT( total > 0 );
12084-    do {
12085-      float const * inputs[8];
12086-      int i, cnt = total; if ( cnt > 8 ) cnt = 8;
12087-      for( i = 0 ; i < cnt ; i++ )
12088-        inputs[ i ] = stbir__get_ring_buffer_scanline(stbir_info, split_info, k+i+contrib_n0 );
12089-
12090-      // call the N scanlines at a time function (up to 8 scanlines of blending at once)
12091-      ((k==0)?stbir__vertical_gathers:stbir__vertical_gathers_continues)[cnt-1]( (vertical_first) ? decode_buffer : encode_buffer, vertical_coefficients + k, inputs, inputs[0] + width_times_channels );
12092-      k += cnt;
12093-      total -= cnt;
12094-    } while ( total );
12095-  }
12096-  STBIR_PROFILE_END( vertical );
12097-
12098-  if ( vertical_first )
12099-  {
12100-    // Now resample the gathered vertical data in the horizontal axis into the encode buffer
12101-    decode_buffer[ width_times_channels ] = 0.0f; // clear two over for horizontals with a remnant of 3
12102-    decode_buffer[ width_times_channels+1 ] = 0.0f; 
12103-    stbir__resample_horizontal_gather(stbir_info, encode_buffer, decode_buffer  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
12104-  }
12105-
12106-  stbir__encode_scanline( stbir_info, ( (char *) stbir_info->output_data ) + ((size_t)n * (size_t)stbir_info->output_stride_bytes),
12107-                          encode_buffer, n  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
12108-}
12109-
12110-static void stbir__decode_and_resample_for_vertical_gather_loop(stbir__info const * stbir_info, stbir__per_split_info* split_info, int n)
12111-{
12112-  int ring_buffer_index;
12113-  float* ring_buffer;
12114-
12115-  // Decode the nth scanline from the source image into the decode buffer.
12116-  stbir__decode_scanline( stbir_info, n, split_info->decode_buffer  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
12117-
12118-  // update new end scanline
12119-  split_info->ring_buffer_last_scanline = n;
12120-
12121-  // get ring buffer
12122-  ring_buffer_index = (split_info->ring_buffer_begin_index + (split_info->ring_buffer_last_scanline - split_info->ring_buffer_first_scanline)) % stbir_info->ring_buffer_num_entries;
12123-  ring_buffer = stbir__get_ring_buffer_entry(stbir_info, split_info, ring_buffer_index);
12124-
12125-  // Now resample it into the ring buffer.
12126-  stbir__resample_horizontal_gather( stbir_info, ring_buffer, split_info->decode_buffer  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
12127-
12128-  // Now it's sitting in the ring buffer ready to be used as source for the vertical sampling.
12129-}
12130-
12131-static void stbir__vertical_gather_loop( stbir__info const * stbir_info, stbir__per_split_info* split_info, int split_count )
12132-{
12133-  int y, start_output_y, end_output_y;
12134-  stbir__contributors* vertical_contributors = stbir_info->vertical.contributors;
12135-  float const * vertical_coefficients = stbir_info->vertical.coefficients;
12136-
12137-  STBIR_ASSERT( stbir_info->vertical.is_gather );
12138-
12139-  start_output_y = split_info->start_output_y;
12140-  end_output_y = split_info[split_count-1].end_output_y;
12141-
12142-  vertical_contributors += start_output_y;
12143-  vertical_coefficients += start_output_y * stbir_info->vertical.coefficient_width;
12144-
12145-  // initialize the ring buffer for gathering
12146-  split_info->ring_buffer_begin_index = 0;
12147-  split_info->ring_buffer_first_scanline = vertical_contributors->n0;
12148-  split_info->ring_buffer_last_scanline = split_info->ring_buffer_first_scanline - 1; // means "empty"
12149-
12150-  for (y = start_output_y; y < end_output_y; y++)
12151-  {
12152-    int in_first_scanline, in_last_scanline;
12153-
12154-    in_first_scanline = vertical_contributors->n0;
12155-    in_last_scanline = vertical_contributors->n1;
12156-
12157-    // make sure the indexing hasn't broken
12158-    STBIR_ASSERT( in_first_scanline >= split_info->ring_buffer_first_scanline );
12159-
12160-    // Load in new scanlines
12161-    while (in_last_scanline > split_info->ring_buffer_last_scanline)
12162-    {
12163-      STBIR_ASSERT( ( split_info->ring_buffer_last_scanline - split_info->ring_buffer_first_scanline + 1 ) <= stbir_info->ring_buffer_num_entries );
12164-
12165-      // make sure there was room in the ring buffer when we add new scanlines
12166-      if ( ( split_info->ring_buffer_last_scanline - split_info->ring_buffer_first_scanline + 1 ) == stbir_info->ring_buffer_num_entries )
12167-      {
12168-        split_info->ring_buffer_first_scanline++;
12169-        split_info->ring_buffer_begin_index++;
12170-      }
12171-
12172-      if ( stbir_info->vertical_first )
12173-      {
12174-        float * ring_buffer = stbir__get_ring_buffer_scanline( stbir_info, split_info, ++split_info->ring_buffer_last_scanline );
12175-        // Decode the nth scanline from the source image into the decode buffer.
12176-        stbir__decode_scanline( stbir_info, split_info->ring_buffer_last_scanline, ring_buffer  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
12177-      }
12178-      else
12179-      {
12180-        stbir__decode_and_resample_for_vertical_gather_loop(stbir_info, split_info, split_info->ring_buffer_last_scanline + 1);
12181-      }
12182-    }
12183-
12184-    // Now all buffers should be ready to write a row of vertical sampling, so do it.
12185-    stbir__resample_vertical_gather(stbir_info, split_info, y, in_first_scanline, in_last_scanline, vertical_coefficients );
12186-
12187-    ++vertical_contributors;
12188-    vertical_coefficients += stbir_info->vertical.coefficient_width;
12189-  }
12190+static float *
12191+stbir__get_ring_buffer_scanline(stbir__info const *stbir_info,
12192+                                stbir__per_split_info const *split_info,
12193+                                int get_scanline)
12194+{
12195+	int ring_buffer_index =
12196+	    (split_info->ring_buffer_begin_index +
12197+	     (get_scanline - split_info->ring_buffer_first_scanline)) %
12198+	    stbir_info->ring_buffer_num_entries;
12199+	return stbir__get_ring_buffer_entry(stbir_info, split_info,
12200+	                                    ring_buffer_index);
12201+}
12202+
12203+static void
12204+stbir__resample_horizontal_gather(
12205+    stbir__info const *stbir_info, float *output_buffer,
12206+    float const *input_buffer STBIR_ONLY_PROFILE_GET_SPLIT_INFO)
12207+{
12208+	float const *decode_buffer =
12209+	    input_buffer - (stbir_info->scanline_extents.conservative.n0 *
12210+	                    stbir_info->effective_channels);
12211+
12212+	STBIR_PROFILE_START(horizontal);
12213+	if ((stbir_info->horizontal.filter_enum == STBIR_FILTER_POINT_SAMPLE) &&
12214+	    (stbir_info->horizontal.scale_info.scale == 1.0f)) {
12215+		STBIR_MEMCPY(output_buffer, input_buffer,
12216+		             stbir_info->horizontal.scale_info.output_sub_size *
12217+		                 sizeof(float) * stbir_info->effective_channels);
12218+	} else {
12219+		stbir_info->horizontal_gather_channels(
12220+		    output_buffer, stbir_info->horizontal.scale_info.output_sub_size,
12221+		    decode_buffer, stbir_info->horizontal.contributors,
12222+		    stbir_info->horizontal.coefficients,
12223+		    stbir_info->horizontal.coefficient_width);
12224+	}
12225+	STBIR_PROFILE_END(horizontal);
12226+}
12227+
12228+static void
12229+stbir__resample_vertical_gather(stbir__info const *stbir_info,
12230+                                stbir__per_split_info *split_info, int n,
12231+                                int contrib_n0, int contrib_n1,
12232+                                float const *vertical_coefficients)
12233+{
12234+	float *encode_buffer = split_info->vertical_buffer;
12235+	float *decode_buffer = split_info->decode_buffer;
12236+	int vertical_first = stbir_info->vertical_first;
12237+	int width = (vertical_first)
12238+	                ? (stbir_info->scanline_extents.conservative.n1 -
12239+	                   stbir_info->scanline_extents.conservative.n0 + 1)
12240+	                : stbir_info->horizontal.scale_info.output_sub_size;
12241+	int width_times_channels = stbir_info->effective_channels * width;
12242+
12243+	STBIR_ASSERT(stbir_info->vertical.is_gather);
12244+
12245+	// loop over the contributing scanlines and scale into the buffer
12246+	STBIR_PROFILE_START(vertical);
12247+	{
12248+		int k = 0, total = contrib_n1 - contrib_n0 + 1;
12249+		STBIR_ASSERT(total > 0);
12250+		do {
12251+			float const *inputs[8];
12252+			int i, cnt = total;
12253+			if (cnt > 8) {
12254+				cnt = 8;
12255+			}
12256+			for (i = 0; i < cnt; i++) {
12257+				inputs[i] = stbir__get_ring_buffer_scanline(
12258+				    stbir_info, split_info, k + i + contrib_n0);
12259+			}
12260+
12261+			// call the N scanlines at a time function (up to 8 scanlines of
12262+			// blending at once)
12263+			((k == 0) ? stbir__vertical_gathers
12264+			          : stbir__vertical_gathers_continues)[cnt - 1](
12265+			    (vertical_first) ? decode_buffer : encode_buffer,
12266+			    vertical_coefficients + k, inputs,
12267+			    inputs[0] + width_times_channels);
12268+			k += cnt;
12269+			total -= cnt;
12270+		} while (total);
12271+	}
12272+	STBIR_PROFILE_END(vertical);
12273+
12274+	if (vertical_first) {
12275+		// Now resample the gathered vertical data in the horizontal axis into
12276+		// the encode buffer
12277+		decode_buffer[width_times_channels] =
12278+		    0.0f; // clear two over for horizontals with a remnant of 3
12279+		decode_buffer[width_times_channels + 1] = 0.0f;
12280+		stbir__resample_horizontal_gather(
12281+		    stbir_info, encode_buffer,
12282+		    decode_buffer STBIR_ONLY_PROFILE_SET_SPLIT_INFO);
12283+	}
12284+
12285+	stbir__encode_scanline(
12286+	    stbir_info,
12287+	    ((char *)stbir_info->output_data) +
12288+	        ((size_t)n * (size_t)stbir_info->output_stride_bytes),
12289+	    encode_buffer, n STBIR_ONLY_PROFILE_SET_SPLIT_INFO);
12290+}
12291+
12292+static void
12293+stbir__decode_and_resample_for_vertical_gather_loop(
12294+    stbir__info const *stbir_info, stbir__per_split_info *split_info, int n)
12295+{
12296+	int ring_buffer_index;
12297+	float *ring_buffer;
12298+
12299+	// Decode the nth scanline from the source image into the decode buffer.
12300+	stbir__decode_scanline(
12301+	    stbir_info, n,
12302+	    split_info->decode_buffer STBIR_ONLY_PROFILE_SET_SPLIT_INFO);
12303+
12304+	// update new end scanline
12305+	split_info->ring_buffer_last_scanline = n;
12306+
12307+	// get ring buffer
12308+	ring_buffer_index = (split_info->ring_buffer_begin_index +
12309+	                     (split_info->ring_buffer_last_scanline -
12310+	                      split_info->ring_buffer_first_scanline)) %
12311+	                    stbir_info->ring_buffer_num_entries;
12312+	ring_buffer =
12313+	    stbir__get_ring_buffer_entry(stbir_info, split_info, ring_buffer_index);
12314+
12315+	// Now resample it into the ring buffer.
12316+	stbir__resample_horizontal_gather(
12317+	    stbir_info, ring_buffer,
12318+	    split_info->decode_buffer STBIR_ONLY_PROFILE_SET_SPLIT_INFO);
12319+
12320+	// Now it's sitting in the ring buffer ready to be used as source for the
12321+	// vertical sampling.
12322+}
12323+
12324+static void
12325+stbir__vertical_gather_loop(stbir__info const *stbir_info,
12326+                            stbir__per_split_info *split_info, int split_count)
12327+{
12328+	int y, start_output_y, end_output_y;
12329+	stbir__contributors *vertical_contributors =
12330+	    stbir_info->vertical.contributors;
12331+	float const *vertical_coefficients = stbir_info->vertical.coefficients;
12332+
12333+	STBIR_ASSERT(stbir_info->vertical.is_gather);
12334+
12335+	start_output_y = split_info->start_output_y;
12336+	end_output_y = split_info[split_count - 1].end_output_y;
12337+
12338+	vertical_contributors += start_output_y;
12339+	vertical_coefficients +=
12340+	    start_output_y * stbir_info->vertical.coefficient_width;
12341+
12342+	// initialize the ring buffer for gathering
12343+	split_info->ring_buffer_begin_index = 0;
12344+	split_info->ring_buffer_first_scanline = vertical_contributors->n0;
12345+	split_info->ring_buffer_last_scanline =
12346+	    split_info->ring_buffer_first_scanline - 1; // means "empty"
12347+
12348+	for (y = start_output_y; y < end_output_y; y++) {
12349+		int in_first_scanline, in_last_scanline;
12350+
12351+		in_first_scanline = vertical_contributors->n0;
12352+		in_last_scanline = vertical_contributors->n1;
12353+
12354+		// make sure the indexing hasn't broken
12355+		STBIR_ASSERT(in_first_scanline >=
12356+		             split_info->ring_buffer_first_scanline);
12357+
12358+		// Load in new scanlines
12359+		while (in_last_scanline > split_info->ring_buffer_last_scanline) {
12360+			STBIR_ASSERT((split_info->ring_buffer_last_scanline -
12361+			              split_info->ring_buffer_first_scanline + 1) <=
12362+			             stbir_info->ring_buffer_num_entries);
12363+
12364+			// make sure there was room in the ring buffer when we add new
12365+			// scanlines
12366+			if ((split_info->ring_buffer_last_scanline -
12367+			     split_info->ring_buffer_first_scanline + 1) ==
12368+			    stbir_info->ring_buffer_num_entries) {
12369+				split_info->ring_buffer_first_scanline++;
12370+				split_info->ring_buffer_begin_index++;
12371+			}
12372+
12373+			if (stbir_info->vertical_first) {
12374+				float *ring_buffer = stbir__get_ring_buffer_scanline(
12375+				    stbir_info, split_info,
12376+				    ++split_info->ring_buffer_last_scanline);
12377+				// Decode the nth scanline from the source image into the decode
12378+				// buffer.
12379+				stbir__decode_scanline(
12380+				    stbir_info, split_info->ring_buffer_last_scanline,
12381+				    ring_buffer STBIR_ONLY_PROFILE_SET_SPLIT_INFO);
12382+			} else {
12383+				stbir__decode_and_resample_for_vertical_gather_loop(
12384+				    stbir_info, split_info,
12385+				    split_info->ring_buffer_last_scanline + 1);
12386+			}
12387+		}
12388+
12389+		// Now all buffers should be ready to write a row of vertical sampling,
12390+		// so do it.
12391+		stbir__resample_vertical_gather(stbir_info, split_info, y,
12392+		                                in_first_scanline, in_last_scanline,
12393+		                                vertical_coefficients);
12394+
12395+		++vertical_contributors;
12396+		vertical_coefficients += stbir_info->vertical.coefficient_width;
12397+	}
12398 }
12399 
12400 #define STBIR__FLOAT_EMPTY_MARKER 3.0e+38F
12401-#define STBIR__FLOAT_BUFFER_IS_EMPTY(ptr) ((ptr)[0]==STBIR__FLOAT_EMPTY_MARKER)
12402-
12403-static void stbir__encode_first_scanline_from_scatter(stbir__info const * stbir_info, stbir__per_split_info* split_info)
12404-{
12405-  // evict a scanline out into the output buffer
12406-  float* ring_buffer_entry = stbir__get_ring_buffer_entry(stbir_info, split_info, split_info->ring_buffer_begin_index );
12407-
12408-  // dump the scanline out
12409-  stbir__encode_scanline( stbir_info, ( (char *)stbir_info->output_data ) + ( (size_t)split_info->ring_buffer_first_scanline * (size_t)stbir_info->output_stride_bytes ), ring_buffer_entry, split_info->ring_buffer_first_scanline  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
12410-
12411-  // mark it as empty
12412-  ring_buffer_entry[ 0 ] = STBIR__FLOAT_EMPTY_MARKER;
12413-
12414-  // advance the first scanline
12415-  split_info->ring_buffer_first_scanline++;
12416-  if ( ++split_info->ring_buffer_begin_index == stbir_info->ring_buffer_num_entries )
12417-    split_info->ring_buffer_begin_index = 0;
12418-}
12419-
12420-static void stbir__horizontal_resample_and_encode_first_scanline_from_scatter(stbir__info const * stbir_info, stbir__per_split_info* split_info)
12421-{
12422-  // evict a scanline out into the output buffer
12423-
12424-  float* ring_buffer_entry = stbir__get_ring_buffer_entry(stbir_info, split_info, split_info->ring_buffer_begin_index );
12425-
12426-  // Now resample it into the buffer.
12427-  stbir__resample_horizontal_gather( stbir_info, split_info->vertical_buffer, ring_buffer_entry  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
12428-
12429-  // dump the scanline out
12430-  stbir__encode_scanline( stbir_info, ( (char *)stbir_info->output_data ) + ( (size_t)split_info->ring_buffer_first_scanline * (size_t)stbir_info->output_stride_bytes ), split_info->vertical_buffer, split_info->ring_buffer_first_scanline  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
12431-
12432-  // mark it as empty
12433-  ring_buffer_entry[ 0 ] = STBIR__FLOAT_EMPTY_MARKER;
12434-
12435-  // advance the first scanline
12436-  split_info->ring_buffer_first_scanline++;
12437-  if ( ++split_info->ring_buffer_begin_index == stbir_info->ring_buffer_num_entries )
12438-    split_info->ring_buffer_begin_index = 0;
12439-}
12440-
12441-static void stbir__resample_vertical_scatter(stbir__info const * stbir_info, stbir__per_split_info* split_info, int n0, int n1, float const * vertical_coefficients, float const * vertical_buffer, float const * vertical_buffer_end )
12442-{
12443-  STBIR_ASSERT( !stbir_info->vertical.is_gather );
12444-
12445-  STBIR_PROFILE_START( vertical );
12446-  {
12447-    int k = 0, total = n1 - n0 + 1;
12448-    STBIR_ASSERT( total > 0 );
12449-    do {
12450-      float * outputs[8];
12451-      int i, n = total; if ( n > 8 ) n = 8;
12452-      for( i = 0 ; i < n ; i++ )
12453-      {
12454-        outputs[ i ] = stbir__get_ring_buffer_scanline(stbir_info, split_info, k+i+n0 );
12455-        if ( ( i ) && ( STBIR__FLOAT_BUFFER_IS_EMPTY( outputs[i] ) != STBIR__FLOAT_BUFFER_IS_EMPTY( outputs[0] ) ) ) // make sure runs are of the same type
12456-        {
12457-          n = i;
12458-          break;
12459-        }
12460-      }
12461-      // call the scatter to N scanlines at a time function (up to 8 scanlines of scattering at once)
12462-      ((STBIR__FLOAT_BUFFER_IS_EMPTY( outputs[0] ))?stbir__vertical_scatter_sets:stbir__vertical_scatter_blends)[n-1]( outputs, vertical_coefficients + k, vertical_buffer, vertical_buffer_end );
12463-      k += n;
12464-      total -= n;
12465-    } while ( total );
12466-  }
12467-
12468-  STBIR_PROFILE_END( vertical );
12469-}
12470-
12471-typedef void stbir__handle_scanline_for_scatter_func(stbir__info const * stbir_info, stbir__per_split_info* split_info);
12472-
12473-static void stbir__vertical_scatter_loop( stbir__info const * stbir_info, stbir__per_split_info* split_info, int split_count )
12474-{
12475-  int y, start_output_y, end_output_y, start_input_y, end_input_y;
12476-  stbir__contributors* vertical_contributors = stbir_info->vertical.contributors;
12477-  float const * vertical_coefficients = stbir_info->vertical.coefficients;
12478-  stbir__handle_scanline_for_scatter_func * handle_scanline_for_scatter;
12479-  void * scanline_scatter_buffer;
12480-  void * scanline_scatter_buffer_end;
12481-  int on_first_input_y, last_input_y;
12482-  int width = (stbir_info->vertical_first) ? ( stbir_info->scanline_extents.conservative.n1-stbir_info->scanline_extents.conservative.n0+1 ) : stbir_info->horizontal.scale_info.output_sub_size;
12483-  int width_times_channels = stbir_info->effective_channels * width;
12484-
12485-  STBIR_ASSERT( !stbir_info->vertical.is_gather );
12486-
12487-  start_output_y = split_info->start_output_y;
12488-  end_output_y = split_info[split_count-1].end_output_y;  // may do multiple split counts
12489-
12490-  start_input_y = split_info->start_input_y;
12491-  end_input_y = split_info[split_count-1].end_input_y;
12492-
12493-  // adjust for starting offset start_input_y
12494-  y = start_input_y + stbir_info->vertical.filter_pixel_margin;
12495-  vertical_contributors += y ;
12496-  vertical_coefficients += stbir_info->vertical.coefficient_width * y;
12497-
12498-  if ( stbir_info->vertical_first )
12499-  {
12500-    handle_scanline_for_scatter = stbir__horizontal_resample_and_encode_first_scanline_from_scatter;
12501-    scanline_scatter_buffer = split_info->decode_buffer;
12502-    scanline_scatter_buffer_end = ( (char*) scanline_scatter_buffer ) + sizeof( float ) * stbir_info->effective_channels * (stbir_info->scanline_extents.conservative.n1-stbir_info->scanline_extents.conservative.n0+1);
12503-  }
12504-  else
12505-  {
12506-    handle_scanline_for_scatter = stbir__encode_first_scanline_from_scatter;
12507-    scanline_scatter_buffer = split_info->vertical_buffer;
12508-    scanline_scatter_buffer_end = ( (char*) scanline_scatter_buffer ) + sizeof( float ) * stbir_info->effective_channels * stbir_info->horizontal.scale_info.output_sub_size;
12509-  }
12510-
12511-  // initialize the ring buffer for scattering
12512-  split_info->ring_buffer_first_scanline = start_output_y;
12513-  split_info->ring_buffer_last_scanline = -1;
12514-  split_info->ring_buffer_begin_index = -1;
12515-
12516-  // mark all the buffers as empty to start
12517-  for( y = 0 ; y < stbir_info->ring_buffer_num_entries ; y++ )
12518-  {
12519-    float * decode_buffer = stbir__get_ring_buffer_entry( stbir_info, split_info, y );
12520-    decode_buffer[ width_times_channels ] = 0.0f; // clear two over for horizontals with a remnant of 3
12521-    decode_buffer[ width_times_channels+1 ] = 0.0f; 
12522-    decode_buffer[0] = STBIR__FLOAT_EMPTY_MARKER; // only used on scatter
12523-  }
12524-
12525-  // do the loop in input space
12526-  on_first_input_y = 1; last_input_y = start_input_y;
12527-  for (y = start_input_y ; y < end_input_y; y++)
12528-  {
12529-    int out_first_scanline, out_last_scanline;
12530-
12531-    out_first_scanline = vertical_contributors->n0;
12532-    out_last_scanline = vertical_contributors->n1;
12533-
12534-    STBIR_ASSERT(out_last_scanline - out_first_scanline + 1 <= stbir_info->ring_buffer_num_entries);
12535-
12536-    if ( ( out_last_scanline >= out_first_scanline ) && ( ( ( out_first_scanline >= start_output_y ) && ( out_first_scanline < end_output_y ) ) || ( ( out_last_scanline >= start_output_y ) && ( out_last_scanline < end_output_y ) ) ) )
12537-    {
12538-      float const * vc = vertical_coefficients;
12539-
12540-      // keep track of the range actually seen for the next resize
12541-      last_input_y = y;
12542-      if ( ( on_first_input_y ) && ( y > start_input_y ) )
12543-        split_info->start_input_y = y;
12544-      on_first_input_y = 0;
12545-
12546-      // clip the region
12547-      if ( out_first_scanline < start_output_y )
12548-      {
12549-        vc += start_output_y - out_first_scanline;
12550-        out_first_scanline = start_output_y;
12551-      }
12552-
12553-      if ( out_last_scanline >= end_output_y )
12554-        out_last_scanline = end_output_y - 1;
12555-
12556-      // if very first scanline, init the index
12557-      if (split_info->ring_buffer_begin_index < 0)
12558-        split_info->ring_buffer_begin_index = out_first_scanline - start_output_y;
12559-
12560-      STBIR_ASSERT( split_info->ring_buffer_begin_index <= out_first_scanline );
12561-
12562-      // Decode the nth scanline from the source image into the decode buffer.
12563-      stbir__decode_scanline( stbir_info, y, split_info->decode_buffer  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
12564-
12565-      // When horizontal first, we resample horizontally into the vertical buffer before we scatter it out
12566-      if ( !stbir_info->vertical_first )
12567-        stbir__resample_horizontal_gather( stbir_info, split_info->vertical_buffer, split_info->decode_buffer  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
12568-
12569-      // Now it's sitting in the buffer ready to be distributed into the ring buffers.
12570-
12571-      // evict from the ringbuffer, if we need are full
12572-      if ( ( ( split_info->ring_buffer_last_scanline - split_info->ring_buffer_first_scanline + 1 ) == stbir_info->ring_buffer_num_entries ) &&
12573-           ( out_last_scanline > split_info->ring_buffer_last_scanline ) )
12574-        handle_scanline_for_scatter( stbir_info, split_info );
12575-
12576-      // Now the horizontal buffer is ready to write to all ring buffer rows, so do it.
12577-      stbir__resample_vertical_scatter(stbir_info, split_info, out_first_scanline, out_last_scanline, vc, (float*)scanline_scatter_buffer, (float*)scanline_scatter_buffer_end );
12578-
12579-      // update the end of the buffer
12580-      if ( out_last_scanline > split_info->ring_buffer_last_scanline )
12581-        split_info->ring_buffer_last_scanline = out_last_scanline;
12582-    }
12583-    ++vertical_contributors;
12584-    vertical_coefficients += stbir_info->vertical.coefficient_width;
12585-  }
12586-
12587-  // now evict the scanlines that are left over in the ring buffer
12588-  while ( split_info->ring_buffer_first_scanline < end_output_y )
12589-    handle_scanline_for_scatter(stbir_info, split_info);
12590-
12591-  // update the end_input_y if we do multiple resizes with the same data
12592-  ++last_input_y;
12593-  for( y = 0 ; y < split_count; y++ )
12594-    if ( split_info[y].end_input_y > last_input_y )
12595-      split_info[y].end_input_y = last_input_y;
12596-}
12597-
12598-
12599-static stbir__kernel_callback * stbir__builtin_kernels[] =   { 0, stbir__filter_trapezoid,  stbir__filter_triangle, stbir__filter_cubic, stbir__filter_catmullrom, stbir__filter_mitchell, stbir__filter_point };
12600-static stbir__support_callback * stbir__builtin_supports[] = { 0, stbir__support_trapezoid, stbir__support_one,     stbir__support_two,  stbir__support_two,       stbir__support_two,     stbir__support_zeropoint5 };
12601-
12602-static void stbir__set_sampler(stbir__sampler * samp, stbir_filter filter, stbir__kernel_callback * kernel, stbir__support_callback * support, stbir_edge edge, stbir__scale_info * scale_info, int always_gather, void * user_data )
12603-{
12604-  // set filter
12605-  if (filter == 0)
12606-  {
12607-    filter = STBIR_DEFAULT_FILTER_DOWNSAMPLE; // default to downsample
12608-    if (scale_info->scale >= ( 1.0f - stbir__small_float ) )
12609-    {
12610-      if ( (scale_info->scale <= ( 1.0f + stbir__small_float ) ) && ( STBIR_CEILF(scale_info->pixel_shift) == scale_info->pixel_shift ) )
12611-        filter = STBIR_FILTER_POINT_SAMPLE;
12612-      else
12613-        filter = STBIR_DEFAULT_FILTER_UPSAMPLE;
12614-    }
12615-  }
12616-  samp->filter_enum = filter;
12617-
12618-  STBIR_ASSERT(samp->filter_enum != 0);
12619-  STBIR_ASSERT((unsigned)samp->filter_enum < STBIR_FILTER_OTHER);
12620-  samp->filter_kernel = stbir__builtin_kernels[ filter ];
12621-  samp->filter_support = stbir__builtin_supports[ filter ];
12622-
12623-  if ( kernel && support )
12624-  {
12625-    samp->filter_kernel = kernel;
12626-    samp->filter_support = support;
12627-    samp->filter_enum = STBIR_FILTER_OTHER;
12628-  }
12629-
12630-  samp->edge = edge;
12631-  samp->filter_pixel_width  = stbir__get_filter_pixel_width (samp->filter_support, scale_info->scale, user_data );
12632-  // Gather is always better, but in extreme downsamples, you have to most or all of the data in memory
12633-  //    For horizontal, we always have all the pixels, so we always use gather here (always_gather==1).
12634-  //    For vertical, we use gather if scaling up (which means we will have samp->filter_pixel_width
12635-  //    scanlines in memory at once).
12636-  samp->is_gather = 0;
12637-  if ( scale_info->scale >= ( 1.0f - stbir__small_float ) )
12638-    samp->is_gather = 1;
12639-  else if ( ( always_gather ) || ( samp->filter_pixel_width <= STBIR_FORCE_GATHER_FILTER_SCANLINES_AMOUNT ) )
12640-    samp->is_gather = 2;
12641-
12642-  // pre calculate stuff based on the above
12643-  samp->coefficient_width = stbir__get_coefficient_width(samp, samp->is_gather, user_data);
12644-
12645-  // filter_pixel_width is the conservative size in pixels of input that affect an output pixel.
12646-  //   In rare cases (only with 2 pix to 1 pix with the default filters), it's possible that the 
12647-  //   filter will extend before or after the scanline beyond just one extra entire copy of the 
12648-  //   scanline (we would hit the edge twice). We don't let you do that, so we clamp the total 
12649-  //   width to 3x the total of input pixel (once for the scanline, once for the left side 
12650-  //   overhang, and once for the right side). We only do this for edge mode, since the other 
12651-  //   modes can just re-edge clamp back in again.
12652-  if ( edge == STBIR_EDGE_WRAP )
12653-    if ( samp->filter_pixel_width > ( scale_info->input_full_size * 3 ) )
12654-      samp->filter_pixel_width = scale_info->input_full_size * 3;
12655-
12656-  // This is how much to expand buffers to account for filters seeking outside
12657-  // the image boundaries.
12658-  samp->filter_pixel_margin = samp->filter_pixel_width / 2;
12659-  
12660-  // filter_pixel_margin is the amount that this filter can overhang on just one side of either 
12661-  //   end of the scanline (left or the right). Since we only allow you to overhang 1 scanline's 
12662-  //   worth of pixels, we clamp this one side of overhang to the input scanline size. Again, 
12663-  //   this clamping only happens in rare cases with the default filters (2 pix to 1 pix). 
12664-  if ( edge == STBIR_EDGE_WRAP )
12665-    if ( samp->filter_pixel_margin > scale_info->input_full_size )
12666-      samp->filter_pixel_margin = scale_info->input_full_size;
12667-
12668-  samp->num_contributors = stbir__get_contributors(samp, samp->is_gather);
12669-
12670-  samp->contributors_size = samp->num_contributors * sizeof(stbir__contributors);
12671-  samp->coefficients_size = samp->num_contributors * samp->coefficient_width * sizeof(float) + sizeof(float)*STBIR_INPUT_CALLBACK_PADDING; // extra sizeof(float) is padding
12672-
12673-  samp->gather_prescatter_contributors = 0;
12674-  samp->gather_prescatter_coefficients = 0;
12675-  if ( samp->is_gather == 0 )
12676-  {
12677-    samp->gather_prescatter_coefficient_width = samp->filter_pixel_width;
12678-    samp->gather_prescatter_num_contributors  = stbir__get_contributors(samp, 2);
12679-    samp->gather_prescatter_contributors_size = samp->gather_prescatter_num_contributors * sizeof(stbir__contributors);
12680-    samp->gather_prescatter_coefficients_size = samp->gather_prescatter_num_contributors * samp->gather_prescatter_coefficient_width * sizeof(float);
12681-  }
12682-}
12683-
12684-static void stbir__get_conservative_extents( stbir__sampler * samp, stbir__contributors * range, void * user_data )
12685-{
12686-  float scale = samp->scale_info.scale;
12687-  float out_shift = samp->scale_info.pixel_shift;
12688-  stbir__support_callback * support = samp->filter_support;
12689-  int input_full_size = samp->scale_info.input_full_size;
12690-  stbir_edge edge = samp->edge;
12691-  float inv_scale = samp->scale_info.inv_scale;
12692-
12693-  STBIR_ASSERT( samp->is_gather != 0 );
12694-
12695-  if ( samp->is_gather == 1 )
12696-  {
12697-    int in_first_pixel, in_last_pixel;
12698-    float out_filter_radius = support(inv_scale, user_data) * scale;
12699-
12700-    stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, 0.5, out_filter_radius, inv_scale, out_shift, input_full_size, edge );
12701-    range->n0 = in_first_pixel;
12702-    stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, ( (float)(samp->scale_info.output_sub_size-1) ) + 0.5f, out_filter_radius, inv_scale, out_shift, input_full_size, edge );
12703-    range->n1 = in_last_pixel;
12704-  }
12705-  else if ( samp->is_gather == 2 ) // downsample gather, refine
12706-  {
12707-    float in_pixels_radius = support(scale, user_data) * inv_scale;
12708-    int filter_pixel_margin = samp->filter_pixel_margin;
12709-    int output_sub_size = samp->scale_info.output_sub_size;
12710-    int input_end;
12711-    int n;
12712-    int in_first_pixel, in_last_pixel;
12713-
12714-    // get a conservative area of the input range
12715-    stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, 0, 0, inv_scale, out_shift, input_full_size, edge );
12716-    range->n0 = in_first_pixel;
12717-    stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, (float)output_sub_size, 0, inv_scale, out_shift, input_full_size, edge );
12718-    range->n1 = in_last_pixel;
12719-
12720-    // now go through the margin to the start of area to find bottom
12721-    n = range->n0 + 1;
12722-    input_end = -filter_pixel_margin;
12723-    while( n >= input_end )
12724-    {
12725-      int out_first_pixel, out_last_pixel;
12726-      stbir__calculate_out_pixel_range( &out_first_pixel, &out_last_pixel, ((float)n)+0.5f, in_pixels_radius, scale, out_shift, output_sub_size );
12727-      if ( out_first_pixel > out_last_pixel )
12728-        break;
12729-
12730-      if ( ( out_first_pixel < output_sub_size ) || ( out_last_pixel >= 0 ) )
12731-        range->n0 = n;
12732-      --n;
12733-    }
12734-
12735-    // now go through the end of the area through the margin to find top
12736-    n = range->n1 - 1;
12737-    input_end = n + 1 + filter_pixel_margin;
12738-    while( n <= input_end )
12739-    {
12740-      int out_first_pixel, out_last_pixel;
12741-      stbir__calculate_out_pixel_range( &out_first_pixel, &out_last_pixel, ((float)n)+0.5f, in_pixels_radius, scale, out_shift, output_sub_size );
12742-      if ( out_first_pixel > out_last_pixel )
12743-        break;
12744-      if ( ( out_first_pixel < output_sub_size ) || ( out_last_pixel >= 0 ) )
12745-        range->n1 = n;
12746-      ++n;
12747-    }
12748-  }
12749-
12750-  if ( samp->edge == STBIR_EDGE_WRAP )
12751-  {
12752-    // if we are wrapping, and we are very close to the image size (so the edges might merge), just use the scanline up to the edge
12753-    if ( ( range->n0 > 0 ) && ( range->n1 >= input_full_size ) )
12754-    {
12755-      int marg = range->n1 - input_full_size + 1;
12756-      if ( ( marg + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= range->n0 )
12757-        range->n0 = 0;
12758-    }
12759-    if ( ( range->n0 < 0 ) && ( range->n1 < (input_full_size-1) ) )
12760-    {
12761-      int marg = -range->n0;
12762-      if ( ( input_full_size - marg - STBIR__MERGE_RUNS_PIXEL_THRESHOLD - 1 ) <= range->n1 )
12763-        range->n1 = input_full_size - 1;
12764-    }
12765-  }
12766-  else
12767-  {
12768-    // for non-edge-wrap modes, we never read over the edge, so clamp
12769-    if ( range->n0 < 0 )
12770-      range->n0 = 0;
12771-    if ( range->n1 >= input_full_size )
12772-      range->n1 = input_full_size - 1;
12773-  }
12774-}
12775-
12776-static void stbir__get_split_info( stbir__per_split_info* split_info, int splits, int output_height, int vertical_pixel_margin, int input_full_height, int is_gather, stbir__contributors * contribs )
12777-{
12778-  int i, cur;
12779-  int left = output_height;
12780-
12781-  cur = 0;
12782-  for( i = 0 ; i < splits ; i++ )
12783-  {
12784-    int each;
12785-
12786-    split_info[i].start_output_y = cur;
12787-    each = left / ( splits - i );
12788-    split_info[i].end_output_y = cur + each;
12789-
12790-    // ok, when we are gathering, we need to make sure we are starting on a y offset that doesn't have
12791-    //   a "special" set of coefficients. Basically, with exactly the right filter at exactly the right
12792-    //   resize at exactly the right phase, some of the coefficents can be zero. When they are zero, we
12793-    //   don't process them at all.  But this leads to a tricky thing with the thread splits, where we
12794-    //   might have a set of two coeffs like this for example: (4,4) and (3,6).  The 4,4 means there was
12795-    //   just one single coeff because things worked out perfectly (normally, they all have 4 coeffs
12796-    //   like the range 3,6.  The problem is that if we start right on the (4,4) on a brand new thread,
12797-    //   then when we get to (3,6), we don't have the "3" sample in memory (because we didn't load
12798-    //   it on the initial (4,4) range because it didn't have a 3 (we only add new samples that are 
12799-    //   larger than our existing samples - it's just how the eviction works). So, our solution here
12800-    //   is pretty simple, if we start right on a range that has samples that start earlier, then we 
12801-    //   simply bump up our previous thread split range to include it, and then start this threads
12802-    //   range with the smaller sample. It just moves one scanline from one thread split to another,
12803-    //   so that we end with the unusual one, instead of start with it. To do this, we check 2-4 
12804-    //   sample at each thread split start and then occassionally move them.
12805-    
12806-    if ( ( is_gather ) && ( i ) )
12807-    {
12808-      stbir__contributors * small_contribs;
12809-      int j, smallest, stop, start_n0;
12810-      stbir__contributors * split_contribs = contribs + cur;
12811-
12812-      // scan for a max of 3x the filter width or until the next thread split
12813-      stop = vertical_pixel_margin * 3;
12814-      if ( each < stop )
12815-        stop = each;
12816-
12817-      // loops a few times before early out
12818-      smallest = 0;
12819-      small_contribs = split_contribs;
12820-      start_n0 = small_contribs->n0;
12821-      for( j = 1 ; j <= stop ; j++ )
12822-      {
12823-        ++split_contribs;
12824-        if ( split_contribs->n0 > start_n0 )
12825-          break;
12826-        if ( split_contribs->n0 < small_contribs->n0 )
12827-        {
12828-          small_contribs = split_contribs;
12829-          smallest = j;
12830-        }
12831-      }
12832-
12833-      split_info[i-1].end_output_y += smallest;
12834-      split_info[i].start_output_y += smallest;
12835-    }
12836-
12837-    cur += each;
12838-    left -= each;
12839-
12840-    // scatter range (updated to minimum as you run it)
12841-    split_info[i].start_input_y = -vertical_pixel_margin;
12842-    split_info[i].end_input_y = input_full_height + vertical_pixel_margin;
12843-  }
12844-}
12845-
12846-static void stbir__free_internal_mem( stbir__info *info )
12847-{
12848-  #define STBIR__FREE_AND_CLEAR( ptr ) { if ( ptr ) { void * p = (ptr); (ptr) = 0; STBIR_FREE( p, info->user_data); } }
12849-
12850-  if ( info )
12851-  {
12852-  #ifndef STBIR__SEPARATE_ALLOCATIONS
12853-    STBIR__FREE_AND_CLEAR( info->alloced_mem );
12854-  #else
12855-    int i,j;
12856-
12857-    if ( ( info->vertical.gather_prescatter_contributors ) && ( (void*)info->vertical.gather_prescatter_contributors != (void*)info->split_info[0].decode_buffer ) )
12858-    {
12859-      STBIR__FREE_AND_CLEAR( info->vertical.gather_prescatter_coefficients );
12860-      STBIR__FREE_AND_CLEAR( info->vertical.gather_prescatter_contributors );
12861-    }
12862-    for( i = 0 ; i < info->splits ; i++ )
12863-    {
12864-      for( j = 0 ; j < info->alloc_ring_buffer_num_entries ; j++ )
12865-      {
12866-        #ifdef STBIR_SIMD8
12867-        if ( info->effective_channels == 3 )
12868-          --info->split_info[i].ring_buffers[j]; // avx in 3 channel mode needs one float at the start of the buffer
12869-        #endif
12870-        STBIR__FREE_AND_CLEAR( info->split_info[i].ring_buffers[j] );
12871-      }
12872-
12873-      #ifdef STBIR_SIMD8
12874-      if ( info->effective_channels == 3 )
12875-        --info->split_info[i].decode_buffer; // avx in 3 channel mode needs one float at the start of the buffer
12876-      #endif
12877-      STBIR__FREE_AND_CLEAR( info->split_info[i].decode_buffer );
12878-      STBIR__FREE_AND_CLEAR( info->split_info[i].ring_buffers );
12879-      STBIR__FREE_AND_CLEAR( info->split_info[i].vertical_buffer );
12880-    }
12881-    STBIR__FREE_AND_CLEAR( info->split_info );
12882-    if ( info->vertical.coefficients != info->horizontal.coefficients )
12883-    {
12884-      STBIR__FREE_AND_CLEAR( info->vertical.coefficients );
12885-      STBIR__FREE_AND_CLEAR( info->vertical.contributors );
12886-    }
12887-    STBIR__FREE_AND_CLEAR( info->horizontal.coefficients );
12888-    STBIR__FREE_AND_CLEAR( info->horizontal.contributors );
12889-    STBIR__FREE_AND_CLEAR( info->alloced_mem );
12890-    STBIR_FREE( info, info->user_data );
12891-  #endif
12892-  }
12893-
12894-  #undef STBIR__FREE_AND_CLEAR
12895-}
12896-
// Returns the largest share any one split receives when `height` rows are
// handed out over `splits` splits, each split taking
// (rows remaining) / (splits remaining). Returns 0 when splits <= 0.
static int stbir__get_max_split( int splits, int height )
{
  int largest = 0;
  int splits_left;

  for( splits_left = splits ; splits_left > 0 ; splits_left-- )
  {
    int share = height / splits_left;
    if ( largest < share )
      largest = share;
    height -= share;
  }
  return largest;
}
12911-
12912-static stbir__horizontal_gather_channels_func ** stbir__horizontal_gather_n_coeffs_funcs[8] =
12913-{
12914-  0, stbir__horizontal_gather_1_channels_with_n_coeffs_funcs, stbir__horizontal_gather_2_channels_with_n_coeffs_funcs, stbir__horizontal_gather_3_channels_with_n_coeffs_funcs, stbir__horizontal_gather_4_channels_with_n_coeffs_funcs, 0,0, stbir__horizontal_gather_7_channels_with_n_coeffs_funcs
12915-};
12916-
12917-static stbir__horizontal_gather_channels_func ** stbir__horizontal_gather_channels_funcs[8] =
12918-{
12919-  0, stbir__horizontal_gather_1_channels_funcs, stbir__horizontal_gather_2_channels_funcs, stbir__horizontal_gather_3_channels_funcs, stbir__horizontal_gather_4_channels_funcs, 0,0, stbir__horizontal_gather_7_channels_funcs
12920-};
// Non-zero when the first float of `ptr` holds STBIR__FLOAT_EMPTY_MARKER, the
// value the scatter code stores in slot 0 of a ring buffer row to flag it empty.
#define STBIR__FLOAT_BUFFER_IS_EMPTY(ptr) ((ptr)[0] == STBIR__FLOAT_EMPTY_MARKER)
12923+
12924+static void
12925+stbir__encode_first_scanline_from_scatter(stbir__info const *stbir_info,
12926+                                          stbir__per_split_info *split_info)
12927+{
12928+	// evict a scanline out into the output buffer
12929+	float *ring_buffer_entry = stbir__get_ring_buffer_entry(
12930+	    stbir_info, split_info, split_info->ring_buffer_begin_index);
12931+
12932+	// dump the scanline out
12933+	stbir__encode_scanline(stbir_info,
12934+	                       ((char *)stbir_info->output_data) +
12935+	                           ((size_t)split_info->ring_buffer_first_scanline *
12936+	                            (size_t)stbir_info->output_stride_bytes),
12937+	                       ring_buffer_entry,
12938+	                       split_info->ring_buffer_first_scanline
12939+	                           STBIR_ONLY_PROFILE_SET_SPLIT_INFO);
12940+
12941+	// mark it as empty
12942+	ring_buffer_entry[0] = STBIR__FLOAT_EMPTY_MARKER;
12943+
12944+	// advance the first scanline
12945+	split_info->ring_buffer_first_scanline++;
12946+	if (++split_info->ring_buffer_begin_index ==
12947+	    stbir_info->ring_buffer_num_entries) {
12948+		split_info->ring_buffer_begin_index = 0;
12949+	}
12950+}
12951+
12952+static void
12953+stbir__horizontal_resample_and_encode_first_scanline_from_scatter(
12954+    stbir__info const *stbir_info, stbir__per_split_info *split_info)
12955+{
12956+	// evict a scanline out into the output buffer
12957+
12958+	float *ring_buffer_entry = stbir__get_ring_buffer_entry(
12959+	    stbir_info, split_info, split_info->ring_buffer_begin_index);
12960+
12961+	// Now resample it into the buffer.
12962+	stbir__resample_horizontal_gather(
12963+	    stbir_info, split_info->vertical_buffer,
12964+	    ring_buffer_entry STBIR_ONLY_PROFILE_SET_SPLIT_INFO);
12965+
12966+	// dump the scanline out
12967+	stbir__encode_scanline(stbir_info,
12968+	                       ((char *)stbir_info->output_data) +
12969+	                           ((size_t)split_info->ring_buffer_first_scanline *
12970+	                            (size_t)stbir_info->output_stride_bytes),
12971+	                       split_info->vertical_buffer,
12972+	                       split_info->ring_buffer_first_scanline
12973+	                           STBIR_ONLY_PROFILE_SET_SPLIT_INFO);
12974+
12975+	// mark it as empty
12976+	ring_buffer_entry[0] = STBIR__FLOAT_EMPTY_MARKER;
12977+
12978+	// advance the first scanline
12979+	split_info->ring_buffer_first_scanline++;
12980+	if (++split_info->ring_buffer_begin_index ==
12981+	    stbir_info->ring_buffer_num_entries) {
12982+		split_info->ring_buffer_begin_index = 0;
12983+	}
12984+}
12985+
12986+static void
12987+stbir__resample_vertical_scatter(stbir__info const *stbir_info,
12988+                                 stbir__per_split_info *split_info, int n0,
12989+                                 int n1, float const *vertical_coefficients,
12990+                                 float const *vertical_buffer,
12991+                                 float const *vertical_buffer_end)
12992+{
12993+	STBIR_ASSERT(!stbir_info->vertical.is_gather);
12994+
12995+	STBIR_PROFILE_START(vertical);
12996+	{
12997+		int k = 0, total = n1 - n0 + 1;
12998+		STBIR_ASSERT(total > 0);
12999+		do {
13000+			float *outputs[8];
13001+			int i, n = total;
13002+			if (n > 8) {
13003+				n = 8;
13004+			}
13005+			for (i = 0; i < n; i++) {
13006+				outputs[i] = stbir__get_ring_buffer_scanline(
13007+				    stbir_info, split_info, k + i + n0);
13008+				if ((i) &&
13009+				    (STBIR__FLOAT_BUFFER_IS_EMPTY(outputs[i]) !=
13010+				     STBIR__FLOAT_BUFFER_IS_EMPTY(
13011+				         outputs[0]))) // make sure runs are of the same type
13012+				{
13013+					n = i;
13014+					break;
13015+				}
13016+			}
13017+			// call the scatter to N scanlines at a time function (up to 8
13018+			// scanlines of scattering at once)
13019+			((STBIR__FLOAT_BUFFER_IS_EMPTY(outputs[0]))
13020+			     ? stbir__vertical_scatter_sets
13021+			     : stbir__vertical_scatter_blends)[n - 1](
13022+			    outputs, vertical_coefficients + k, vertical_buffer,
13023+			    vertical_buffer_end);
13024+			k += n;
13025+			total -= n;
13026+		} while (total);
13027+	}
13028+
13029+	STBIR_PROFILE_END(vertical);
13030+}
13031+
13032+typedef void
13033+stbir__handle_scanline_for_scatter_func(stbir__info const *stbir_info,
13034+                                        stbir__per_split_info *split_info);
13035+
13036+static void
13037+stbir__vertical_scatter_loop(stbir__info const *stbir_info,
13038+                             stbir__per_split_info *split_info, int split_count)
13039+{
13040+	int y, start_output_y, end_output_y, start_input_y, end_input_y;
13041+	stbir__contributors *vertical_contributors =
13042+	    stbir_info->vertical.contributors;
13043+	float const *vertical_coefficients = stbir_info->vertical.coefficients;
13044+	stbir__handle_scanline_for_scatter_func *handle_scanline_for_scatter;
13045+	void *scanline_scatter_buffer;
13046+	void *scanline_scatter_buffer_end;
13047+	int on_first_input_y, last_input_y;
13048+	int width = (stbir_info->vertical_first)
13049+	                ? (stbir_info->scanline_extents.conservative.n1 -
13050+	                   stbir_info->scanline_extents.conservative.n0 + 1)
13051+	                : stbir_info->horizontal.scale_info.output_sub_size;
13052+	int width_times_channels = stbir_info->effective_channels * width;
13053+
13054+	STBIR_ASSERT(!stbir_info->vertical.is_gather);
13055+
13056+	start_output_y = split_info->start_output_y;
13057+	end_output_y = split_info[split_count - 1]
13058+	                   .end_output_y; // may do multiple split counts
13059+
13060+	start_input_y = split_info->start_input_y;
13061+	end_input_y = split_info[split_count - 1].end_input_y;
13062+
13063+	// adjust for starting offset start_input_y
13064+	y = start_input_y + stbir_info->vertical.filter_pixel_margin;
13065+	vertical_contributors += y;
13066+	vertical_coefficients += stbir_info->vertical.coefficient_width * y;
13067+
13068+	if (stbir_info->vertical_first) {
13069+		handle_scanline_for_scatter =
13070+		    stbir__horizontal_resample_and_encode_first_scanline_from_scatter;
13071+		scanline_scatter_buffer = split_info->decode_buffer;
13072+		scanline_scatter_buffer_end =
13073+		    ((char *)scanline_scatter_buffer) +
13074+		    sizeof(float) * stbir_info->effective_channels *
13075+		        (stbir_info->scanline_extents.conservative.n1 -
13076+		         stbir_info->scanline_extents.conservative.n0 + 1);
13077+	} else {
13078+		handle_scanline_for_scatter = stbir__encode_first_scanline_from_scatter;
13079+		scanline_scatter_buffer = split_info->vertical_buffer;
13080+		scanline_scatter_buffer_end =
13081+		    ((char *)scanline_scatter_buffer) +
13082+		    sizeof(float) * stbir_info->effective_channels *
13083+		        stbir_info->horizontal.scale_info.output_sub_size;
13084+	}
13085+
13086+	// initialize the ring buffer for scattering
13087+	split_info->ring_buffer_first_scanline = start_output_y;
13088+	split_info->ring_buffer_last_scanline = -1;
13089+	split_info->ring_buffer_begin_index = -1;
13090+
13091+	// mark all the buffers as empty to start
13092+	for (y = 0; y < stbir_info->ring_buffer_num_entries; y++) {
13093+		float *decode_buffer =
13094+		    stbir__get_ring_buffer_entry(stbir_info, split_info, y);
13095+		decode_buffer[width_times_channels] =
13096+		    0.0f; // clear two over for horizontals with a remnant of 3
13097+		decode_buffer[width_times_channels + 1] = 0.0f;
13098+		decode_buffer[0] = STBIR__FLOAT_EMPTY_MARKER; // only used on scatter
13099+	}
13100+
13101+	// do the loop in input space
13102+	on_first_input_y = 1;
13103+	last_input_y = start_input_y;
13104+	for (y = start_input_y; y < end_input_y; y++) {
13105+		int out_first_scanline, out_last_scanline;
13106+
13107+		out_first_scanline = vertical_contributors->n0;
13108+		out_last_scanline = vertical_contributors->n1;
13109+
13110+		STBIR_ASSERT(out_last_scanline - out_first_scanline + 1 <=
13111+		             stbir_info->ring_buffer_num_entries);
13112+
13113+		if ((out_last_scanline >= out_first_scanline) &&
13114+		    (((out_first_scanline >= start_output_y) &&
13115+		      (out_first_scanline < end_output_y)) ||
13116+		     ((out_last_scanline >= start_output_y) &&
13117+		      (out_last_scanline < end_output_y)))) {
13118+			float const *vc = vertical_coefficients;
13119+
13120+			// keep track of the range actually seen for the next resize
13121+			last_input_y = y;
13122+			if ((on_first_input_y) && (y > start_input_y)) {
13123+				split_info->start_input_y = y;
13124+			}
13125+			on_first_input_y = 0;
13126+
13127+			// clip the region
13128+			if (out_first_scanline < start_output_y) {
13129+				vc += start_output_y - out_first_scanline;
13130+				out_first_scanline = start_output_y;
13131+			}
13132+
13133+			if (out_last_scanline >= end_output_y) {
13134+				out_last_scanline = end_output_y - 1;
13135+			}
13136+
13137+			// if very first scanline, init the index
13138+			if (split_info->ring_buffer_begin_index < 0) {
13139+				split_info->ring_buffer_begin_index =
13140+				    out_first_scanline - start_output_y;
13141+			}
13142+
13143+			STBIR_ASSERT(split_info->ring_buffer_begin_index <=
13144+			             out_first_scanline);
13145+
13146+			// Decode the nth scanline from the source image into the decode
13147+			// buffer.
13148+			stbir__decode_scanline(
13149+			    stbir_info, y,
13150+			    split_info->decode_buffer STBIR_ONLY_PROFILE_SET_SPLIT_INFO);
13151+
13152+			// When horizontal first, we resample horizontally into the vertical
13153+			// buffer before we scatter it out
13154+			if (!stbir_info->vertical_first) {
13155+				stbir__resample_horizontal_gather(
13156+				    stbir_info, split_info->vertical_buffer,
13157+				    split_info
13158+				        ->decode_buffer STBIR_ONLY_PROFILE_SET_SPLIT_INFO);
13159+			}
13160+
13161+			// Now it's sitting in the buffer ready to be distributed into the
13162+			// ring buffers.
13163+
13164+			// evict from the ringbuffer, if we need are full
13165+			if (((split_info->ring_buffer_last_scanline -
13166+			      split_info->ring_buffer_first_scanline + 1) ==
13167+			     stbir_info->ring_buffer_num_entries) &&
13168+			    (out_last_scanline > split_info->ring_buffer_last_scanline)) {
13169+				handle_scanline_for_scatter(stbir_info, split_info);
13170+			}
13171+
13172+			// Now the horizontal buffer is ready to write to all ring buffer
13173+			// rows, so do it.
13174+			stbir__resample_vertical_scatter(
13175+			    stbir_info, split_info, out_first_scanline, out_last_scanline,
13176+			    vc, (float *)scanline_scatter_buffer,
13177+			    (float *)scanline_scatter_buffer_end);
13178+
13179+			// update the end of the buffer
13180+			if (out_last_scanline > split_info->ring_buffer_last_scanline) {
13181+				split_info->ring_buffer_last_scanline = out_last_scanline;
13182+			}
13183+		}
13184+		++vertical_contributors;
13185+		vertical_coefficients += stbir_info->vertical.coefficient_width;
13186+	}
13187+
13188+	// now evict the scanlines that are left over in the ring buffer
13189+	while (split_info->ring_buffer_first_scanline < end_output_y) {
13190+		handle_scanline_for_scatter(stbir_info, split_info);
13191+	}
13192+
13193+	// update the end_input_y if we do multiple resizes with the same data
13194+	++last_input_y;
13195+	for (y = 0; y < split_count; y++) {
13196+		if (split_info[y].end_input_y > last_input_y) {
13197+			split_info[y].end_input_y = last_input_y;
13198+		}
13199+	}
13200+}
13201+
13202+static stbir__kernel_callback *stbir__builtin_kernels[] = {
13203+    0,
13204+    stbir__filter_trapezoid,
13205+    stbir__filter_triangle,
13206+    stbir__filter_cubic,
13207+    stbir__filter_catmullrom,
13208+    stbir__filter_mitchell,
13209+    stbir__filter_point};
13210+static stbir__support_callback *stbir__builtin_supports[] = {
13211+    0,
13212+    stbir__support_trapezoid,
13213+    stbir__support_one,
13214+    stbir__support_two,
13215+    stbir__support_two,
13216+    stbir__support_two,
13217+    stbir__support_zeropoint5};
13218+
13219+static void
13220+stbir__set_sampler(stbir__sampler *samp, stbir_filter filter,
13221+                   stbir__kernel_callback *kernel,
13222+                   stbir__support_callback *support, stbir_edge edge,
13223+                   stbir__scale_info *scale_info, int always_gather,
13224+                   void *user_data)
13225+{
13226+	// set filter
13227+	if (filter == 0) {
13228+		filter = STBIR_DEFAULT_FILTER_DOWNSAMPLE; // default to downsample
13229+		if (scale_info->scale >= (1.0f - stbir__small_float)) {
13230+			if ((scale_info->scale <= (1.0f + stbir__small_float)) &&
13231+			    (STBIR_CEILF(scale_info->pixel_shift) ==
13232+			     scale_info->pixel_shift)) {
13233+				filter = STBIR_FILTER_POINT_SAMPLE;
13234+			} else {
13235+				filter = STBIR_DEFAULT_FILTER_UPSAMPLE;
13236+			}
13237+		}
13238+	}
13239+	samp->filter_enum = filter;
13240+
13241+	STBIR_ASSERT(samp->filter_enum != 0);
13242+	STBIR_ASSERT((unsigned)samp->filter_enum < STBIR_FILTER_OTHER);
13243+	samp->filter_kernel = stbir__builtin_kernels[filter];
13244+	samp->filter_support = stbir__builtin_supports[filter];
13245+
13246+	if (kernel && support) {
13247+		samp->filter_kernel = kernel;
13248+		samp->filter_support = support;
13249+		samp->filter_enum = STBIR_FILTER_OTHER;
13250+	}
13251+
13252+	samp->edge = edge;
13253+	samp->filter_pixel_width = stbir__get_filter_pixel_width(
13254+	    samp->filter_support, scale_info->scale, user_data);
13255+	// Gather is always better, but in extreme downsamples, you have to most or
13256+	// all of the data in memory
13257+	//    For horizontal, we always have all the pixels, so we always use gather
13258+	//    here (always_gather==1). For vertical, we use gather if scaling up
13259+	//    (which means we will have samp->filter_pixel_width scanlines in memory
13260+	//    at once).
13261+	samp->is_gather = 0;
13262+	if (scale_info->scale >= (1.0f - stbir__small_float)) {
13263+		samp->is_gather = 1;
13264+	} else if ((always_gather) ||
13265+	           (samp->filter_pixel_width <=
13266+	            STBIR_FORCE_GATHER_FILTER_SCANLINES_AMOUNT)) {
13267+		samp->is_gather = 2;
13268+	}
13269+
13270+	// pre calculate stuff based on the above
13271+	samp->coefficient_width =
13272+	    stbir__get_coefficient_width(samp, samp->is_gather, user_data);
13273+
13274+	// filter_pixel_width is the conservative size in pixels of input that
13275+	// affect an output pixel.
13276+	//   In rare cases (only with 2 pix to 1 pix with the default filters), it's
13277+	//   possible that the filter will extend before or after the scanline
13278+	//   beyond just one extra entire copy of the scanline (we would hit the
13279+	//   edge twice). We don't let you do that, so we clamp the total width to
13280+	//   3x the total of input pixel (once for the scanline, once for the left
13281+	//   side overhang, and once for the right side). We only do this for edge
13282+	//   mode, since the other modes can just re-edge clamp back in again.
13283+	if (edge == STBIR_EDGE_WRAP) {
13284+		if (samp->filter_pixel_width > (scale_info->input_full_size * 3)) {
13285+			samp->filter_pixel_width = scale_info->input_full_size * 3;
13286+		}
13287+	}
13288+
13289+	// This is how much to expand buffers to account for filters seeking outside
13290+	// the image boundaries.
13291+	samp->filter_pixel_margin = samp->filter_pixel_width / 2;
13292+
13293+	// filter_pixel_margin is the amount that this filter can overhang on just
13294+	// one side of either
13295+	//   end of the scanline (left or the right). Since we only allow you to
13296+	//   overhang 1 scanline's worth of pixels, we clamp this one side of
13297+	//   overhang to the input scanline size. Again, this clamping only happens
13298+	//   in rare cases with the default filters (2 pix to 1 pix).
13299+	if (edge == STBIR_EDGE_WRAP) {
13300+		if (samp->filter_pixel_margin > scale_info->input_full_size) {
13301+			samp->filter_pixel_margin = scale_info->input_full_size;
13302+		}
13303+	}
13304+
13305+	samp->num_contributors = stbir__get_contributors(samp, samp->is_gather);
13306+
13307+	samp->contributors_size =
13308+	    samp->num_contributors * sizeof(stbir__contributors);
13309+	samp->coefficients_size =
13310+	    samp->num_contributors * samp->coefficient_width * sizeof(float) +
13311+	    sizeof(float) *
13312+	        STBIR_INPUT_CALLBACK_PADDING; // extra sizeof(float) is padding
13313+
13314+	samp->gather_prescatter_contributors = 0;
13315+	samp->gather_prescatter_coefficients = 0;
13316+	if (samp->is_gather == 0) {
13317+		samp->gather_prescatter_coefficient_width = samp->filter_pixel_width;
13318+		samp->gather_prescatter_num_contributors =
13319+		    stbir__get_contributors(samp, 2);
13320+		samp->gather_prescatter_contributors_size =
13321+		    samp->gather_prescatter_num_contributors *
13322+		    sizeof(stbir__contributors);
13323+		samp->gather_prescatter_coefficients_size =
13324+		    samp->gather_prescatter_num_contributors *
13325+		    samp->gather_prescatter_coefficient_width * sizeof(float);
13326+	}
13327+}
13328+
13329+static void
13330+stbir__get_conservative_extents(stbir__sampler *samp,
13331+                                stbir__contributors *range, void *user_data)
13332+{
13333+	float scale = samp->scale_info.scale;
13334+	float out_shift = samp->scale_info.pixel_shift;
13335+	stbir__support_callback *support = samp->filter_support;
13336+	int input_full_size = samp->scale_info.input_full_size;
13337+	stbir_edge edge = samp->edge;
13338+	float inv_scale = samp->scale_info.inv_scale;
13339+
13340+	STBIR_ASSERT(samp->is_gather != 0);
13341+
13342+	if (samp->is_gather == 1) {
13343+		int in_first_pixel, in_last_pixel;
13344+		float out_filter_radius = support(inv_scale, user_data) * scale;
13345+
13346+		stbir__calculate_in_pixel_range(&in_first_pixel, &in_last_pixel, 0.5,
13347+		                                out_filter_radius, inv_scale, out_shift,
13348+		                                input_full_size, edge);
13349+		range->n0 = in_first_pixel;
13350+		stbir__calculate_in_pixel_range(
13351+		    &in_first_pixel, &in_last_pixel,
13352+		    ((float)(samp->scale_info.output_sub_size - 1)) + 0.5f,
13353+		    out_filter_radius, inv_scale, out_shift, input_full_size, edge);
13354+		range->n1 = in_last_pixel;
13355+	} else if (samp->is_gather == 2) // downsample gather, refine
13356+	{
13357+		float in_pixels_radius = support(scale, user_data) * inv_scale;
13358+		int filter_pixel_margin = samp->filter_pixel_margin;
13359+		int output_sub_size = samp->scale_info.output_sub_size;
13360+		int input_end;
13361+		int n;
13362+		int in_first_pixel, in_last_pixel;
13363+
13364+		// get a conservative area of the input range
13365+		stbir__calculate_in_pixel_range(&in_first_pixel, &in_last_pixel, 0, 0,
13366+		                                inv_scale, out_shift, input_full_size,
13367+		                                edge);
13368+		range->n0 = in_first_pixel;
13369+		stbir__calculate_in_pixel_range(&in_first_pixel, &in_last_pixel,
13370+		                                (float)output_sub_size, 0, inv_scale,
13371+		                                out_shift, input_full_size, edge);
13372+		range->n1 = in_last_pixel;
13373+
13374+		// now go through the margin to the start of area to find bottom
13375+		n = range->n0 + 1;
13376+		input_end = -filter_pixel_margin;
13377+		while (n >= input_end) {
13378+			int out_first_pixel, out_last_pixel;
13379+			stbir__calculate_out_pixel_range(
13380+			    &out_first_pixel, &out_last_pixel, ((float)n) + 0.5f,
13381+			    in_pixels_radius, scale, out_shift, output_sub_size);
13382+			if (out_first_pixel > out_last_pixel) {
13383+				break;
13384+			}
13385+
13386+			if ((out_first_pixel < output_sub_size) || (out_last_pixel >= 0)) {
13387+				range->n0 = n;
13388+			}
13389+			--n;
13390+		}
13391+
13392+		// now go through the end of the area through the margin to find top
13393+		n = range->n1 - 1;
13394+		input_end = n + 1 + filter_pixel_margin;
13395+		while (n <= input_end) {
13396+			int out_first_pixel, out_last_pixel;
13397+			stbir__calculate_out_pixel_range(
13398+			    &out_first_pixel, &out_last_pixel, ((float)n) + 0.5f,
13399+			    in_pixels_radius, scale, out_shift, output_sub_size);
13400+			if (out_first_pixel > out_last_pixel) {
13401+				break;
13402+			}
13403+			if ((out_first_pixel < output_sub_size) || (out_last_pixel >= 0)) {
13404+				range->n1 = n;
13405+			}
13406+			++n;
13407+		}
13408+	}
13409+
13410+	if (samp->edge == STBIR_EDGE_WRAP) {
13411+		// if we are wrapping, and we are very close to the image size (so the
13412+		// edges might merge), just use the scanline up to the edge
13413+		if ((range->n0 > 0) && (range->n1 >= input_full_size)) {
13414+			int marg = range->n1 - input_full_size + 1;
13415+			if ((marg + STBIR__MERGE_RUNS_PIXEL_THRESHOLD) >= range->n0) {
13416+				range->n0 = 0;
13417+			}
13418+		}
13419+		if ((range->n0 < 0) && (range->n1 < (input_full_size - 1))) {
13420+			int marg = -range->n0;
13421+			if ((input_full_size - marg - STBIR__MERGE_RUNS_PIXEL_THRESHOLD -
13422+			     1) <= range->n1) {
13423+				range->n1 = input_full_size - 1;
13424+			}
13425+		}
13426+	} else {
13427+		// for non-edge-wrap modes, we never read over the edge, so clamp
13428+		if (range->n0 < 0) {
13429+			range->n0 = 0;
13430+		}
13431+		if (range->n1 >= input_full_size) {
13432+			range->n1 = input_full_size - 1;
13433+		}
13434+	}
13435+}
13436+
13437+static void
13438+stbir__get_split_info(stbir__per_split_info *split_info, int splits,
13439+                      int output_height, int vertical_pixel_margin,
13440+                      int input_full_height, int is_gather,
13441+                      stbir__contributors *contribs)
13442+{
13443+	int i, cur;
13444+	int left = output_height;
13445+
13446+	cur = 0;
13447+	for (i = 0; i < splits; i++) {
13448+		int each;
13449+
13450+		split_info[i].start_output_y = cur;
13451+		each = left / (splits - i);
13452+		split_info[i].end_output_y = cur + each;
13453+
13454+		// ok, when we are gathering, we need to make sure we are starting on a
13455+		// y offset that doesn't have
13456+		//   a "special" set of coefficients. Basically, with exactly the right
13457+		//   filter at exactly the right resize at exactly the right phase, some
13458+		//   of the coefficients can be zero. When they are zero, we don't
13459+		//   process them at all.  But this leads to a tricky thing with the
13460+		//   thread splits, where we might have a set of two coeffs like this
13461+		//   for example: (4,4) and (3,6).  The 4,4 means there was just one
13462+		//   single coeff because things worked out perfectly (normally, they
13463+		//   all have 4 coeffs like the range 3,6).  The problem is that if we
13464+		//   start right on the (4,4) on a brand new thread, then when we get to
13465+		//   (3,6), we don't have the "3" sample in memory (because we didn't
13466+		//   load it on the initial (4,4) range because it didn't have a 3; we
13467+		//   only add new samples that are larger than our existing samples -
13468+		//   it's just how the eviction works). So, our solution here is pretty
13469+		//   simple, if we start right on a range that has samples that start
13470+		//   earlier, then we simply bump up our previous thread split range to
13471+		//   include it, and then start this threads range with the smaller
13472+		//   sample. It just moves one scanline from one thread split to
13473+		//   another, so that we end with the unusual one, instead of start with
13474+		//   it. To do this, we check 2-4 samples at each thread split start and
13475+		//   then occasionally move them.
13476+
13477+		if ((is_gather) && (i)) {
13478+			stbir__contributors *small_contribs;
13479+			int j, smallest, stop, start_n0;
13480+			stbir__contributors *split_contribs = contribs + cur;
13481+
13482+			// scan for a max of 3x the filter width or until the next thread
13483+			// split
13484+			stop = vertical_pixel_margin * 3;
13485+			if (each < stop) {
13486+				stop = each;
13487+			}
13488+
13489+			// loops a few times before early out
13490+			smallest = 0;
13491+			small_contribs = split_contribs;
13492+			start_n0 = small_contribs->n0;
13493+			for (j = 1; j <= stop; j++) {
13494+				++split_contribs;
13495+				if (split_contribs->n0 > start_n0) {
13496+					break;
13497+				}
13498+				if (split_contribs->n0 < small_contribs->n0) {
13499+					small_contribs = split_contribs;
13500+					smallest = j;
13501+				}
13502+			}
13503+
13504+			split_info[i - 1].end_output_y += smallest;
13505+			split_info[i].start_output_y += smallest;
13506+		}
13507+
13508+		cur += each;
13509+		left -= each;
13510+
13511+		// scatter range (updated to minimum as you run it)
13512+		split_info[i].start_input_y = -vertical_pixel_margin;
13513+		split_info[i].end_input_y = input_full_height + vertical_pixel_margin;
13514+	}
13515+}
13516+
13517+static void
13518+stbir__free_internal_mem(stbir__info *info)
13519+{
13520+#define STBIR__FREE_AND_CLEAR(ptr)                                             \
13521+	{                                                                          \
13522+		if (ptr) {                                                             \
13523+			void *p = (ptr);                                                   \
13524+			(ptr) = 0;                                                         \
13525+			STBIR_FREE(p, info->user_data);                                    \
13526+		}                                                                      \
13527+	}
13528+
13529+	if (info) {
13530+#ifndef STBIR__SEPARATE_ALLOCATIONS
13531+		STBIR__FREE_AND_CLEAR(info->alloced_mem);
13532+#else
13533+		int i, j;
13534+
13535+		if ((info->vertical.gather_prescatter_contributors) &&
13536+		    ((void *)info->vertical.gather_prescatter_contributors !=
13537+		     (void *)info->split_info[0].decode_buffer)) {
13538+			STBIR__FREE_AND_CLEAR(
13539+			    info->vertical.gather_prescatter_coefficients);
13540+			STBIR__FREE_AND_CLEAR(
13541+			    info->vertical.gather_prescatter_contributors);
13542+		}
13543+		for (i = 0; i < info->splits; i++) {
13544+			for (j = 0; j < info->alloc_ring_buffer_num_entries; j++) {
13545+#ifdef STBIR_SIMD8
13546+				if (info->effective_channels == 3) {
13547+					--info->split_info[i]
13548+					      .ring_buffers[j]; // avx in 3 channel mode needs one
13549+					                        // float at the start of the buffer
13550+				}
13551+#endif
13552+				STBIR__FREE_AND_CLEAR(info->split_info[i].ring_buffers[j]);
13553+			}
13554 
13555-// there are six resize classifications: 0 == vertical scatter, 1 == vertical gather < 1x scale, 2 == vertical gather 1x-2x scale, 4 == vertical gather < 3x scale, 4 == vertical gather > 3x scale, 5 == <=4 pixel height, 6 == <=4 pixel wide column
13556+#ifdef STBIR_SIMD8
13557+			if (info->effective_channels == 3) {
13558+				--info->split_info[i]
13559+				      .decode_buffer; // avx in 3 channel mode needs one float
13560+				                      // at the start of the buffer
13561+			}
13562+#endif
13563+			STBIR__FREE_AND_CLEAR(info->split_info[i].decode_buffer);
13564+			STBIR__FREE_AND_CLEAR(info->split_info[i].ring_buffers);
13565+			STBIR__FREE_AND_CLEAR(info->split_info[i].vertical_buffer);
13566+		}
13567+		STBIR__FREE_AND_CLEAR(info->split_info);
13568+		if (info->vertical.coefficients != info->horizontal.coefficients) {
13569+			STBIR__FREE_AND_CLEAR(info->vertical.coefficients);
13570+			STBIR__FREE_AND_CLEAR(info->vertical.contributors);
13571+		}
13572+		STBIR__FREE_AND_CLEAR(info->horizontal.coefficients);
13573+		STBIR__FREE_AND_CLEAR(info->horizontal.contributors);
13574+		STBIR__FREE_AND_CLEAR(info->alloced_mem);
13575+		STBIR_FREE(info, info->user_data);
13576+#endif
13577+	}
13578+
13579+#undef STBIR__FREE_AND_CLEAR
13580+}
13581+
13582+static int
13583+stbir__get_max_split(int splits, int height)
13584+{
13585+	int i;
13586+	int max = 0;
13587+
13588+	for (i = 0; i < splits; i++) {
13589+		int each = height / (splits - i);
13590+		if (each > max) {
13591+			max = each;
13592+		}
13593+		height -= each;
13594+	}
13595+	return max;
13596+}
13597+
13598+static stbir__horizontal_gather_channels_func *
13599+    *stbir__horizontal_gather_n_coeffs_funcs[8] = {
13600+        0,
13601+        stbir__horizontal_gather_1_channels_with_n_coeffs_funcs,
13602+        stbir__horizontal_gather_2_channels_with_n_coeffs_funcs,
13603+        stbir__horizontal_gather_3_channels_with_n_coeffs_funcs,
13604+        stbir__horizontal_gather_4_channels_with_n_coeffs_funcs,
13605+        0,
13606+        0,
13607+        stbir__horizontal_gather_7_channels_with_n_coeffs_funcs};
13608+
13609+static stbir__horizontal_gather_channels_func *
13610+    *stbir__horizontal_gather_channels_funcs[8] = {
13611+        0,
13612+        stbir__horizontal_gather_1_channels_funcs,
13613+        stbir__horizontal_gather_2_channels_funcs,
13614+        stbir__horizontal_gather_3_channels_funcs,
13615+        stbir__horizontal_gather_4_channels_funcs,
13616+        0,
13617+        0,
13618+        stbir__horizontal_gather_7_channels_funcs};
13619+
13620+// resize classifications (see stbir__should_do_vertical_first): 0 == vertical
13621+// scatter, 1 == vertical gather <= 1x scale, 2 == <= 2x scale, 3 == <= 3x
13622+// scale, 5 == <= 4x scale, 6 == > 4x scale; when either output dimension is
13623+// <= 4 pixels: 6 if the output height is less than its width, otherwise 7
13624 #define STBIR_RESIZE_CLASSIFICATIONS 8
13625 
13626-static float stbir__compute_weights[5][STBIR_RESIZE_CLASSIFICATIONS][4]=  // 5 = 0=1chan, 1=2chan, 2=3chan, 3=4chan, 4=7chan
13627-{
13628-  {
13629-    { 1.00000f, 1.00000f, 0.31250f, 1.00000f },
13630-    { 0.56250f, 0.59375f, 0.00000f, 0.96875f },
13631-    { 1.00000f, 0.06250f, 0.00000f, 1.00000f },
13632-    { 0.00000f, 0.09375f, 1.00000f, 1.00000f },
13633-    { 1.00000f, 1.00000f, 1.00000f, 1.00000f },
13634-    { 0.03125f, 0.12500f, 1.00000f, 1.00000f },
13635-    { 0.06250f, 0.12500f, 0.00000f, 1.00000f },
13636-    { 0.00000f, 1.00000f, 0.00000f, 0.03125f },
13637-  }, {
13638-    { 0.00000f, 0.84375f, 0.00000f, 0.03125f },
13639-    { 0.09375f, 0.93750f, 0.00000f, 0.78125f },
13640-    { 0.87500f, 0.21875f, 0.00000f, 0.96875f },
13641-    { 0.09375f, 0.09375f, 1.00000f, 1.00000f },
13642-    { 1.00000f, 1.00000f, 1.00000f, 1.00000f },
13643-    { 0.03125f, 0.12500f, 1.00000f, 1.00000f },
13644-    { 0.06250f, 0.12500f, 0.00000f, 1.00000f },
13645-    { 0.00000f, 1.00000f, 0.00000f, 0.53125f },
13646-  }, {
13647-    { 0.00000f, 0.53125f, 0.00000f, 0.03125f },
13648-    { 0.06250f, 0.96875f, 0.00000f, 0.53125f },
13649-    { 0.87500f, 0.18750f, 0.00000f, 0.93750f },
13650-    { 0.00000f, 0.09375f, 1.00000f, 1.00000f },
13651-    { 1.00000f, 1.00000f, 1.00000f, 1.00000f },
13652-    { 0.03125f, 0.12500f, 1.00000f, 1.00000f },
13653-    { 0.06250f, 0.12500f, 0.00000f, 1.00000f },
13654-    { 0.00000f, 1.00000f, 0.00000f, 0.56250f },
13655-  }, {
13656-    { 0.00000f, 0.50000f, 0.00000f, 0.71875f },
13657-    { 0.06250f, 0.84375f, 0.00000f, 0.87500f },
13658-    { 1.00000f, 0.50000f, 0.50000f, 0.96875f },
13659-    { 1.00000f, 0.09375f, 0.31250f, 0.50000f },
13660-    { 1.00000f, 1.00000f, 1.00000f, 1.00000f },
13661-    { 1.00000f, 0.03125f, 0.03125f, 0.53125f },
13662-    { 0.18750f, 0.12500f, 0.00000f, 1.00000f },
13663-    { 0.00000f, 1.00000f, 0.03125f, 0.18750f },
13664-  }, {
13665-    { 0.00000f, 0.59375f, 0.00000f, 0.96875f },
13666-    { 0.06250f, 0.81250f, 0.06250f, 0.59375f },
13667-    { 0.75000f, 0.43750f, 0.12500f, 0.96875f },
13668-    { 0.87500f, 0.06250f, 0.18750f, 0.43750f },
13669-    { 1.00000f, 1.00000f, 1.00000f, 1.00000f },
13670-    { 0.15625f, 0.12500f, 1.00000f, 1.00000f },
13671-    { 0.06250f, 0.12500f, 0.00000f, 1.00000f },
13672-    { 0.00000f, 1.00000f, 0.03125f, 0.34375f },
13673-  }
13674-};
13675+static float stbir__compute_weights[5][STBIR_RESIZE_CLASSIFICATIONS]
13676+                                   [4] = // 5 = 0=1chan, 1=2chan, 2=3chan,
13677+                                         // 3=4chan, 4=7chan
13678+    {{
13679+         {1.00000f, 1.00000f, 0.31250f, 1.00000f},
13680+         {0.56250f, 0.59375f, 0.00000f, 0.96875f},
13681+         {1.00000f, 0.06250f, 0.00000f, 1.00000f},
13682+         {0.00000f, 0.09375f, 1.00000f, 1.00000f},
13683+         {1.00000f, 1.00000f, 1.00000f, 1.00000f},
13684+         {0.03125f, 0.12500f, 1.00000f, 1.00000f},
13685+         {0.06250f, 0.12500f, 0.00000f, 1.00000f},
13686+         {0.00000f, 1.00000f, 0.00000f, 0.03125f},
13687+     },
13688+     {
13689+         {0.00000f, 0.84375f, 0.00000f, 0.03125f},
13690+         {0.09375f, 0.93750f, 0.00000f, 0.78125f},
13691+         {0.87500f, 0.21875f, 0.00000f, 0.96875f},
13692+         {0.09375f, 0.09375f, 1.00000f, 1.00000f},
13693+         {1.00000f, 1.00000f, 1.00000f, 1.00000f},
13694+         {0.03125f, 0.12500f, 1.00000f, 1.00000f},
13695+         {0.06250f, 0.12500f, 0.00000f, 1.00000f},
13696+         {0.00000f, 1.00000f, 0.00000f, 0.53125f},
13697+     },
13698+     {
13699+         {0.00000f, 0.53125f, 0.00000f, 0.03125f},
13700+         {0.06250f, 0.96875f, 0.00000f, 0.53125f},
13701+         {0.87500f, 0.18750f, 0.00000f, 0.93750f},
13702+         {0.00000f, 0.09375f, 1.00000f, 1.00000f},
13703+         {1.00000f, 1.00000f, 1.00000f, 1.00000f},
13704+         {0.03125f, 0.12500f, 1.00000f, 1.00000f},
13705+         {0.06250f, 0.12500f, 0.00000f, 1.00000f},
13706+         {0.00000f, 1.00000f, 0.00000f, 0.56250f},
13707+     },
13708+     {
13709+         {0.00000f, 0.50000f, 0.00000f, 0.71875f},
13710+         {0.06250f, 0.84375f, 0.00000f, 0.87500f},
13711+         {1.00000f, 0.50000f, 0.50000f, 0.96875f},
13712+         {1.00000f, 0.09375f, 0.31250f, 0.50000f},
13713+         {1.00000f, 1.00000f, 1.00000f, 1.00000f},
13714+         {1.00000f, 0.03125f, 0.03125f, 0.53125f},
13715+         {0.18750f, 0.12500f, 0.00000f, 1.00000f},
13716+         {0.00000f, 1.00000f, 0.03125f, 0.18750f},
13717+     },
13718+     {
13719+         {0.00000f, 0.59375f, 0.00000f, 0.96875f},
13720+         {0.06250f, 0.81250f, 0.06250f, 0.59375f},
13721+         {0.75000f, 0.43750f, 0.12500f, 0.96875f},
13722+         {0.87500f, 0.06250f, 0.18750f, 0.43750f},
13723+         {1.00000f, 1.00000f, 1.00000f, 1.00000f},
13724+         {0.15625f, 0.12500f, 1.00000f, 1.00000f},
13725+         {0.06250f, 0.12500f, 0.00000f, 1.00000f},
13726+         {0.00000f, 1.00000f, 0.03125f, 0.34375f},
13727+     }};
13728 
13729 // structure that allow us to query and override info for training the costs
13730-typedef struct STBIR__V_FIRST_INFO
13731-{
13732-  double v_cost, h_cost;
13733-  int control_v_first; // 0 = no control, 1 = force hori, 2 = force vert
13734-  int v_first;
13735-  int v_resize_classification;
13736-  int is_gather;
13737+typedef struct STBIR__V_FIRST_INFO {
13738+	double v_cost, h_cost;
13739+	int control_v_first; // 0 = no control, 1 = force hori, 2 = force vert
13740+	int v_first;
13741+	int v_resize_classification;
13742+	int is_gather;
13743 } STBIR__V_FIRST_INFO;
13744 
13745 #ifdef STBIR__V_FIRST_INFO_BUFFER
13746@@ -6954,1243 +8125,1685 @@ static STBIR__V_FIRST_INFO STBIR__V_FIRST_INFO_BUFFER = {0};
13747 //     app that solves for the best weights (and shows how well it
13748 //     does currently).
13749 
13750-static int stbir__should_do_vertical_first( float weights_table[STBIR_RESIZE_CLASSIFICATIONS][4], int horizontal_filter_pixel_width, float horizontal_scale, int horizontal_output_size, int vertical_filter_pixel_width, float vertical_scale, int vertical_output_size, int is_gather, STBIR__V_FIRST_INFO * info )
13751-{
13752-  double v_cost, h_cost;
13753-  float * weights;
13754-  int vertical_first;
13755-  int v_classification;
13756-
13757-  // categorize the resize into buckets
13758-  if ( ( vertical_output_size <= 4 ) || ( horizontal_output_size <= 4 ) )
13759-    v_classification = ( vertical_output_size < horizontal_output_size ) ? 6 : 7;
13760-  else if ( vertical_scale <= 1.0f )
13761-    v_classification = ( is_gather ) ? 1 : 0;
13762-  else if ( vertical_scale <= 2.0f)
13763-    v_classification = 2;
13764-  else if ( vertical_scale <= 3.0f)
13765-    v_classification = 3;
13766-  else if ( vertical_scale <= 4.0f)
13767-    v_classification = 5;
13768-  else
13769-    v_classification = 6;
13770-
13771-  // use the right weights
13772-  weights = weights_table[ v_classification ];
13773-
13774-  // this is the costs when you don't take into account modern CPUs with high ipc and simd and caches - wish we had a better estimate
13775-  h_cost = (float)horizontal_filter_pixel_width * weights[0] + horizontal_scale * (float)vertical_filter_pixel_width * weights[1];
13776-  v_cost = (float)vertical_filter_pixel_width  * weights[2] + vertical_scale * (float)horizontal_filter_pixel_width * weights[3];
13777-
13778-  // use computation estimate to decide vertical first or not
13779-  vertical_first = ( v_cost <= h_cost ) ? 1 : 0;
13780-
13781-  // save these, if requested
13782-  if ( info )
13783-  {
13784-    info->h_cost = h_cost;
13785-    info->v_cost = v_cost;
13786-    info->v_resize_classification = v_classification;
13787-    info->v_first = vertical_first;
13788-    info->is_gather = is_gather;
13789-  }
13790-
13791-  // and this allows us to override everything for testing (see dotiming.c)
13792-  if ( ( info ) && ( info->control_v_first ) )
13793-    vertical_first = ( info->control_v_first == 2 ) ? 1 : 0;
13794-
13795-  return vertical_first;
13796+static int
13797+stbir__should_do_vertical_first(
13798+    float weights_table[STBIR_RESIZE_CLASSIFICATIONS][4],
13799+    int horizontal_filter_pixel_width, float horizontal_scale,
13800+    int horizontal_output_size, int vertical_filter_pixel_width,
13801+    float vertical_scale, int vertical_output_size, int is_gather,
13802+    STBIR__V_FIRST_INFO *info)
13803+{
13804+	double v_cost, h_cost;
13805+	float *weights;
13806+	int vertical_first;
13807+	int v_classification;
13808+
13809+	// categorize the resize into buckets
13810+	if ((vertical_output_size <= 4) || (horizontal_output_size <= 4)) {
13811+		v_classification =
13812+		    (vertical_output_size < horizontal_output_size) ? 6 : 7;
13813+	} else if (vertical_scale <= 1.0f) {
13814+		v_classification = (is_gather) ? 1 : 0;
13815+	} else if (vertical_scale <= 2.0f) {
13816+		v_classification = 2;
13817+	} else if (vertical_scale <= 3.0f) {
13818+		v_classification = 3;
13819+	} else if (vertical_scale <= 4.0f) {
13820+		v_classification = 5;
13821+	} else {
13822+		v_classification = 6;
13823+	}
13824+
13825+	// use the right weights
13826+	weights = weights_table[v_classification];
13827+
13828+	// these are the costs when you don't take into account modern CPUs with
13829+	// high ipc and simd and caches - wish we had a better estimate
13830+	h_cost = (float)horizontal_filter_pixel_width * weights[0] +
13831+	         horizontal_scale * (float)vertical_filter_pixel_width * weights[1];
13832+	v_cost = (float)vertical_filter_pixel_width * weights[2] +
13833+	         vertical_scale * (float)horizontal_filter_pixel_width * weights[3];
13834+
13835+	// use computation estimate to decide vertical first or not
13836+	vertical_first = (v_cost <= h_cost) ? 1 : 0;
13837+
13838+	// save these, if requested
13839+	if (info) {
13840+		info->h_cost = h_cost;
13841+		info->v_cost = v_cost;
13842+		info->v_resize_classification = v_classification;
13843+		info->v_first = vertical_first;
13844+		info->is_gather = is_gather;
13845+	}
13846+
13847+	// and this allows us to override everything for testing (see dotiming.c)
13848+	if ((info) && (info->control_v_first)) {
13849+		vertical_first = (info->control_v_first == 2) ? 1 : 0;
13850+	}
13851+
13852+	return vertical_first;
13853 }
13854 
13855 // layout lookups - must match stbir_internal_pixel_layout
13856 static unsigned char stbir__pixel_channels[] = {
13857-  1,2,3,3,4,   // 1ch, 2ch, rgb, bgr, 4ch
13858-  4,4,4,4,2,2, // RGBA,BGRA,ARGB,ABGR,RA,AR
13859-  4,4,4,4,2,2, // RGBA_PM,BGRA_PM,ARGB_PM,ABGR_PM,RA_PM,AR_PM
13860+    1, 2, 3, 3, 4,    // 1ch, 2ch, rgb, bgr, 4ch
13861+    4, 4, 4, 4, 2, 2, // RGBA,BGRA,ARGB,ABGR,RA,AR
13862+    4, 4, 4, 4, 2, 2, // RGBA_PM,BGRA_PM,ARGB_PM,ABGR_PM,RA_PM,AR_PM
13863 };
13864 
13865-// the internal pixel layout enums are in a different order, so we can easily do range comparisons of types
13866-//   the public pixel layout is ordered in a way that if you cast num_channels (1-4) to the enum, you get something sensible
13867-static stbir_internal_pixel_layout stbir__pixel_layout_convert_public_to_internal[] = {
13868-  STBIRI_BGR, STBIRI_1CHANNEL, STBIRI_2CHANNEL, STBIRI_RGB, STBIRI_RGBA,
13869-  STBIRI_4CHANNEL, STBIRI_BGRA, STBIRI_ARGB, STBIRI_ABGR, STBIRI_RA, STBIRI_AR,
13870-  STBIRI_RGBA_PM, STBIRI_BGRA_PM, STBIRI_ARGB_PM, STBIRI_ABGR_PM, STBIRI_RA_PM, STBIRI_AR_PM,
13871+// the internal pixel layout enums are in a different order, so we can easily do
13872+// range comparisons of types
13873+//   the public pixel layout is ordered in a way that if you cast num_channels
13874+//   (1-4) to the enum, you get something sensible
13875+static stbir_internal_pixel_layout
13876+    stbir__pixel_layout_convert_public_to_internal[] = {
13877+        STBIRI_BGR,     STBIRI_1CHANNEL, STBIRI_2CHANNEL, STBIRI_RGB,
13878+        STBIRI_RGBA,    STBIRI_4CHANNEL, STBIRI_BGRA,     STBIRI_ARGB,
13879+        STBIRI_ABGR,    STBIRI_RA,       STBIRI_AR,       STBIRI_RGBA_PM,
13880+        STBIRI_BGRA_PM, STBIRI_ARGB_PM,  STBIRI_ABGR_PM,  STBIRI_RA_PM,
13881+        STBIRI_AR_PM,
13882 };
13883 
13884-static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sampler * horizontal, stbir__sampler * vertical, stbir__contributors * conservative, stbir_pixel_layout input_pixel_layout_public, stbir_pixel_layout output_pixel_layout_public, int splits, int new_x, int new_y, int fast_alpha, void * user_data STBIR_ONLY_PROFILE_BUILD_GET_INFO )
13885-{
13886-  static char stbir_channel_count_index[8]={ 9,0,1,2, 3,9,9,4 };
13887-
13888-  stbir__info * info = 0;
13889-  void * alloced = 0;
13890-  size_t alloced_total = 0;
13891-  int vertical_first;
13892-  size_t decode_buffer_size, ring_buffer_length_bytes, ring_buffer_size, vertical_buffer_size;
13893-  int alloc_ring_buffer_num_entries;
13894-
13895-  int alpha_weighting_type = 0; // 0=none, 1=simple, 2=fancy
13896-  int conservative_split_output_size = stbir__get_max_split( splits, vertical->scale_info.output_sub_size );
13897-  stbir_internal_pixel_layout input_pixel_layout = stbir__pixel_layout_convert_public_to_internal[ input_pixel_layout_public ];
13898-  stbir_internal_pixel_layout output_pixel_layout = stbir__pixel_layout_convert_public_to_internal[ output_pixel_layout_public ];
13899-  int channels = stbir__pixel_channels[ input_pixel_layout ];
13900-  int effective_channels = channels;
13901-
13902-  // first figure out what type of alpha weighting to use (if any)
13903-  if ( ( horizontal->filter_enum != STBIR_FILTER_POINT_SAMPLE ) || ( vertical->filter_enum != STBIR_FILTER_POINT_SAMPLE ) ) // no alpha weighting on point sampling
13904-  {
13905-    if ( ( input_pixel_layout >= STBIRI_RGBA ) && ( input_pixel_layout <= STBIRI_AR ) && ( output_pixel_layout >= STBIRI_RGBA ) && ( output_pixel_layout <= STBIRI_AR ) )
13906-    {
13907-      if ( fast_alpha )
13908-      {
13909-        alpha_weighting_type = 4;
13910-      }
13911-      else
13912-      {
13913-        static int fancy_alpha_effective_cnts[6] = { 7, 7, 7, 7, 3, 3 };
13914-        alpha_weighting_type = 2;
13915-        effective_channels = fancy_alpha_effective_cnts[ input_pixel_layout - STBIRI_RGBA ];
13916-      }
13917-    }
13918-    else if ( ( input_pixel_layout >= STBIRI_RGBA_PM ) && ( input_pixel_layout <= STBIRI_AR_PM ) && ( output_pixel_layout >= STBIRI_RGBA ) && ( output_pixel_layout <= STBIRI_AR ) )
13919-    {
13920-      // input premult, output non-premult
13921-      alpha_weighting_type = 3;
13922-    }
13923-    else if ( ( input_pixel_layout >= STBIRI_RGBA ) && ( input_pixel_layout <= STBIRI_AR ) && ( output_pixel_layout >= STBIRI_RGBA_PM ) && ( output_pixel_layout <= STBIRI_AR_PM ) )
13924-    {
13925-      // input non-premult, output premult
13926-      alpha_weighting_type = 1;
13927-    }
13928-  }
13929-
13930-  // channel in and out count must match currently
13931-  if ( channels != stbir__pixel_channels[ output_pixel_layout ] )
13932-    return 0;
13933-
13934-  // get vertical first
13935-  vertical_first = stbir__should_do_vertical_first( stbir__compute_weights[ (int)stbir_channel_count_index[ effective_channels ] ], horizontal->filter_pixel_width, horizontal->scale_info.scale, horizontal->scale_info.output_sub_size, vertical->filter_pixel_width, vertical->scale_info.scale, vertical->scale_info.output_sub_size, vertical->is_gather, STBIR__V_FIRST_INFO_POINTER );
13936-
13937-  // sometimes read one float off in some of the unrolled loops (with a weight of zero coeff, so it doesn't have an effect)
13938-  //   we use a few extra floats instead of just 1, so that input callback buffer can overlap with the decode buffer without
13939-  //   the conversion routines overwriting the callback input data.
13940-  decode_buffer_size = ( conservative->n1 - conservative->n0 + 1 ) * effective_channels * sizeof(float) + sizeof(float)*STBIR_INPUT_CALLBACK_PADDING; // extra floats for input callback stagger
13941-
13942-#if defined( STBIR__SEPARATE_ALLOCATIONS ) && defined(STBIR_SIMD8)
13943-  if ( effective_channels == 3 )
13944-    decode_buffer_size += sizeof(float); // avx in 3 channel mode needs one float at the start of the buffer (only with separate allocations)
13945-#endif
13946-
13947-  ring_buffer_length_bytes = (size_t)horizontal->scale_info.output_sub_size * (size_t)effective_channels * sizeof(float) + sizeof(float)*STBIR_INPUT_CALLBACK_PADDING; // extra floats for padding
13948-
13949-  // if we do vertical first, the ring buffer holds a whole decoded line
13950-  if ( vertical_first )
13951-    ring_buffer_length_bytes = ( decode_buffer_size + 15 ) & ~15;
13952-
13953-  if ( ( ring_buffer_length_bytes & 4095 ) == 0 ) ring_buffer_length_bytes += 64*3; // avoid 4k alias
13954-
13955-  // One extra entry because floating point precision problems sometimes cause an extra to be necessary.
13956-  alloc_ring_buffer_num_entries = vertical->filter_pixel_width + 1;
13957-
13958-  // we never need more ring buffer entries than the scanlines we're outputting when in scatter mode
13959-  if ( ( !vertical->is_gather ) && ( alloc_ring_buffer_num_entries > conservative_split_output_size ) )
13960-    alloc_ring_buffer_num_entries = conservative_split_output_size;
13961-
13962-  ring_buffer_size = (size_t)alloc_ring_buffer_num_entries * (size_t)ring_buffer_length_bytes;
13963-
13964-  // The vertical buffer is used differently, depending on whether we are scattering
13965-  //   the vertical scanlines, or gathering them.
13966-  //   If scattering, it's used at the temp buffer to accumulate each output.
13967-  //   If gathering, it's just the output buffer.
13968-  vertical_buffer_size = (size_t)horizontal->scale_info.output_sub_size * (size_t)effective_channels * sizeof(float) + sizeof(float);  // extra float for padding
13969-
13970-  // we make two passes through this loop, 1st to add everything up, 2nd to allocate and init
13971-  for(;;)
13972-  {
13973-    int i;
13974-    void * advance_mem = alloced;
13975-    int copy_horizontal = 0;
13976-    stbir__sampler * possibly_use_horizontal_for_pivot = 0;
13977+static stbir__info *
13978+stbir__alloc_internal_mem_and_build_samplers(
13979+    stbir__sampler *horizontal, stbir__sampler *vertical,
13980+    stbir__contributors *conservative,
13981+    stbir_pixel_layout input_pixel_layout_public,
13982+    stbir_pixel_layout output_pixel_layout_public, int splits, int new_x,
13983+    int new_y, int fast_alpha,
13984+    void *user_data STBIR_ONLY_PROFILE_BUILD_GET_INFO)
13985+{
13986+	static char stbir_channel_count_index[8] = {9, 0, 1, 2, 3, 9, 9, 4};
13987+
13988+	stbir__info *info = 0;
13989+	void *alloced = 0;
13990+	size_t alloced_total = 0;
13991+	int vertical_first;
13992+	size_t decode_buffer_size, ring_buffer_length_bytes, ring_buffer_size,
13993+	    vertical_buffer_size;
13994+	int alloc_ring_buffer_num_entries;
13995+
13996+	int alpha_weighting_type = 0; // 0=none, 1=simple, 2=fancy
13997+	int conservative_split_output_size =
13998+	    stbir__get_max_split(splits, vertical->scale_info.output_sub_size);
13999+	stbir_internal_pixel_layout input_pixel_layout =
14000+	    stbir__pixel_layout_convert_public_to_internal
14001+	        [input_pixel_layout_public];
14002+	stbir_internal_pixel_layout output_pixel_layout =
14003+	    stbir__pixel_layout_convert_public_to_internal
14004+	        [output_pixel_layout_public];
14005+	int channels = stbir__pixel_channels[input_pixel_layout];
14006+	int effective_channels = channels;
14007+
14008+	// first figure out what type of alpha weighting to use (if any)
14009+	if ((horizontal->filter_enum != STBIR_FILTER_POINT_SAMPLE) ||
14010+	    (vertical->filter_enum !=
14011+	     STBIR_FILTER_POINT_SAMPLE)) // no alpha weighting on point sampling
14012+	{
14013+		if ((input_pixel_layout >= STBIRI_RGBA) &&
14014+		    (input_pixel_layout <= STBIRI_AR) &&
14015+		    (output_pixel_layout >= STBIRI_RGBA) &&
14016+		    (output_pixel_layout <= STBIRI_AR)) {
14017+			if (fast_alpha) {
14018+				alpha_weighting_type = 4;
14019+			} else {
14020+				static int fancy_alpha_effective_cnts[6] = {7, 7, 7, 7, 3, 3};
14021+				alpha_weighting_type = 2;
14022+				effective_channels =
14023+				    fancy_alpha_effective_cnts[input_pixel_layout -
14024+				                               STBIRI_RGBA];
14025+			}
14026+		} else if ((input_pixel_layout >= STBIRI_RGBA_PM) &&
14027+		           (input_pixel_layout <= STBIRI_AR_PM) &&
14028+		           (output_pixel_layout >= STBIRI_RGBA) &&
14029+		           (output_pixel_layout <= STBIRI_AR)) {
14030+			// input premult, output non-premult
14031+			alpha_weighting_type = 3;
14032+		} else if ((input_pixel_layout >= STBIRI_RGBA) &&
14033+		           (input_pixel_layout <= STBIRI_AR) &&
14034+		           (output_pixel_layout >= STBIRI_RGBA_PM) &&
14035+		           (output_pixel_layout <= STBIRI_AR_PM)) {
14036+			// input non-premult, output premult
14037+			alpha_weighting_type = 1;
14038+		}
14039+	}
14040+
14041+	// channel in and out count must match currently
14042+	if (channels != stbir__pixel_channels[output_pixel_layout]) {
14043+		return 0;
14044+	}
14045+
14046+	// get vertical first
14047+	vertical_first = stbir__should_do_vertical_first(
14048+	    stbir__compute_weights[(
14049+	        int)stbir_channel_count_index[effective_channels]],
14050+	    horizontal->filter_pixel_width, horizontal->scale_info.scale,
14051+	    horizontal->scale_info.output_sub_size, vertical->filter_pixel_width,
14052+	    vertical->scale_info.scale, vertical->scale_info.output_sub_size,
14053+	    vertical->is_gather, STBIR__V_FIRST_INFO_POINTER);
14054+
14055+	// sometimes read one float off in some of the unrolled loops (with a weight
14056+	// of zero coeff, so it doesn't have an effect)
14057+	//   we use a few extra floats instead of just 1, so that input callback
14058+	//   buffer can overlap with the decode buffer without the conversion
14059+	//   routines overwriting the callback input data.
14060+	decode_buffer_size =
14061+	    (conservative->n1 - conservative->n0 + 1) * effective_channels *
14062+	        sizeof(float) +
14063+	    sizeof(float) * STBIR_INPUT_CALLBACK_PADDING; // extra floats for input
14064+	                                                  // callback stagger
14065+
14066+#if defined(STBIR__SEPARATE_ALLOCATIONS) && defined(STBIR_SIMD8)
14067+	if (effective_channels == 3) {
14068+		decode_buffer_size +=
14069+		    sizeof(float); // avx in 3 channel mode needs one float at the start
14070+		                   // of the buffer (only with separate allocations)
14071+	}
14072+#endif
14073+
14074+	ring_buffer_length_bytes =
14075+	    (size_t)horizontal->scale_info.output_sub_size *
14076+	        (size_t)effective_channels * sizeof(float) +
14077+	    sizeof(float) *
14078+	        STBIR_INPUT_CALLBACK_PADDING; // extra floats for padding
14079+
14080+	// if we do vertical first, the ring buffer holds a whole decoded line
14081+	if (vertical_first) {
14082+		ring_buffer_length_bytes = (decode_buffer_size + 15) & ~15;
14083+	}
14084+
14085+	if ((ring_buffer_length_bytes & 4095) == 0) {
14086+		ring_buffer_length_bytes += 64 * 3; // avoid 4k alias
14087+	}
14088+
14089+	// One extra entry because floating point precision problems sometimes cause
14090+	// an extra to be necessary.
14091+	alloc_ring_buffer_num_entries = vertical->filter_pixel_width + 1;
14092+
14093+	// we never need more ring buffer entries than the scanlines we're
14094+	// outputting when in scatter mode
14095+	if ((!vertical->is_gather) &&
14096+	    (alloc_ring_buffer_num_entries > conservative_split_output_size)) {
14097+		alloc_ring_buffer_num_entries = conservative_split_output_size;
14098+	}
14099+
14100+	ring_buffer_size = (size_t)alloc_ring_buffer_num_entries *
14101+	                   (size_t)ring_buffer_length_bytes;
14102+
14103+	// The vertical buffer is used differently, depending on whether we are
14104+	// scattering
14105+	//   the vertical scanlines, or gathering them.
14106+	//   If scattering, it's used at the temp buffer to accumulate each output.
14107+	//   If gathering, it's just the output buffer.
14108+	vertical_buffer_size = (size_t)horizontal->scale_info.output_sub_size *
14109+	                           (size_t)effective_channels * sizeof(float) +
14110+	                       sizeof(float); // extra float for padding
14111+
14112+	// we make two passes through this loop, 1st to add everything up, 2nd to
14113+	// allocate and init
14114+	for (;;) {
14115+		int i;
14116+		void *advance_mem = alloced;
14117+		int copy_horizontal = 0;
14118+		stbir__sampler *possibly_use_horizontal_for_pivot = 0;
14119 
14120 #ifdef STBIR__SEPARATE_ALLOCATIONS
14121-    #define STBIR__NEXT_PTR( ptr, size, ntype ) if ( alloced ) { void * p = STBIR_MALLOC( size, user_data); if ( p == 0 ) { stbir__free_internal_mem( info ); return 0; } (ptr) = (ntype*)p; }
14122+#define STBIR__NEXT_PTR(ptr, size, ntype)                                      \
14123+	if (alloced) {                                                             \
14124+		void *p = STBIR_MALLOC(size, user_data);                               \
14125+		if (p == 0) {                                                          \
14126+			stbir__free_internal_mem(info);                                    \
14127+			return 0;                                                          \
14128+		}                                                                      \
14129+		(ptr) = (ntype *)p;                                                    \
14130+	}
14131 #else
14132-    #define STBIR__NEXT_PTR( ptr, size, ntype ) advance_mem = (void*) ( ( ((size_t)advance_mem) + 15 ) & ~15 ); if ( alloced ) ptr = (ntype*)advance_mem; advance_mem = (char*)(((size_t)advance_mem) + (size));
14133-#endif
14134-
14135-    STBIR__NEXT_PTR( info, sizeof( stbir__info ), stbir__info );
14136-
14137-    STBIR__NEXT_PTR( info->split_info, sizeof( stbir__per_split_info ) * splits, stbir__per_split_info );
14138-
14139-    if ( info )
14140-    {
14141-      static stbir__alpha_weight_func * fancy_alpha_weights[6]  =    { stbir__fancy_alpha_weight_4ch,   stbir__fancy_alpha_weight_4ch,   stbir__fancy_alpha_weight_4ch,   stbir__fancy_alpha_weight_4ch,   stbir__fancy_alpha_weight_2ch,   stbir__fancy_alpha_weight_2ch };
14142-      static stbir__alpha_unweight_func * fancy_alpha_unweights[6] = { stbir__fancy_alpha_unweight_4ch, stbir__fancy_alpha_unweight_4ch, stbir__fancy_alpha_unweight_4ch, stbir__fancy_alpha_unweight_4ch, stbir__fancy_alpha_unweight_2ch, stbir__fancy_alpha_unweight_2ch };
14143-      static stbir__alpha_weight_func * simple_alpha_weights[6] = { stbir__simple_alpha_weight_4ch, stbir__simple_alpha_weight_4ch, stbir__simple_alpha_weight_4ch, stbir__simple_alpha_weight_4ch, stbir__simple_alpha_weight_2ch, stbir__simple_alpha_weight_2ch };
14144-      static stbir__alpha_unweight_func * simple_alpha_unweights[6] = { stbir__simple_alpha_unweight_4ch, stbir__simple_alpha_unweight_4ch, stbir__simple_alpha_unweight_4ch, stbir__simple_alpha_unweight_4ch, stbir__simple_alpha_unweight_2ch, stbir__simple_alpha_unweight_2ch };
14145-
14146-      // initialize info fields
14147-      info->alloced_mem = alloced;
14148-      info->alloced_total = alloced_total;
14149-
14150-      info->channels = channels;
14151-      info->effective_channels = effective_channels;
14152-
14153-      info->offset_x = new_x;
14154-      info->offset_y = new_y;
14155-      info->alloc_ring_buffer_num_entries = (int)alloc_ring_buffer_num_entries;
14156-      info->ring_buffer_num_entries = 0;
14157-      info->ring_buffer_length_bytes = (int)ring_buffer_length_bytes;
14158-      info->splits = splits;
14159-      info->vertical_first = vertical_first;
14160-
14161-      info->input_pixel_layout_internal = input_pixel_layout;
14162-      info->output_pixel_layout_internal = output_pixel_layout;
14163-
14164-      // setup alpha weight functions
14165-      info->alpha_weight = 0;
14166-      info->alpha_unweight = 0;
14167-
14168-      // handle alpha weighting functions and overrides
14169-      if ( alpha_weighting_type == 2 )
14170-      {
14171-        // high quality alpha multiplying on the way in, dividing on the way out
14172-        info->alpha_weight = fancy_alpha_weights[ input_pixel_layout - STBIRI_RGBA ];
14173-        info->alpha_unweight = fancy_alpha_unweights[ output_pixel_layout - STBIRI_RGBA ];
14174-      }
14175-      else if ( alpha_weighting_type == 4 )
14176-      {
14177-        // fast alpha multiplying on the way in, dividing on the way out
14178-        info->alpha_weight = simple_alpha_weights[ input_pixel_layout - STBIRI_RGBA ];
14179-        info->alpha_unweight = simple_alpha_unweights[ output_pixel_layout - STBIRI_RGBA ];
14180-      }
14181-      else if ( alpha_weighting_type == 1 )
14182-      {
14183-        // fast alpha on the way in, leave in premultiplied form on way out
14184-        info->alpha_weight = simple_alpha_weights[ input_pixel_layout - STBIRI_RGBA ];
14185-      }
14186-      else if ( alpha_weighting_type == 3 )
14187-      {
14188-        // incoming is premultiplied, fast alpha dividing on the way out - non-premultiplied output
14189-        info->alpha_unweight = simple_alpha_unweights[ output_pixel_layout - STBIRI_RGBA ];
14190-      }
14191-
14192-      // handle 3-chan color flipping, using the alpha weight path
14193-      if ( ( ( input_pixel_layout == STBIRI_RGB ) && ( output_pixel_layout == STBIRI_BGR ) ) ||
14194-           ( ( input_pixel_layout == STBIRI_BGR ) && ( output_pixel_layout == STBIRI_RGB ) ) )
14195-      {
14196-        // do the flipping on the smaller of the two ends
14197-        if ( horizontal->scale_info.scale < 1.0f )
14198-          info->alpha_unweight = stbir__simple_flip_3ch;
14199-        else
14200-          info->alpha_weight = stbir__simple_flip_3ch;
14201-      }
14202-
14203-    }
14204-
14205-    // get all the per-split buffers
14206-    for( i = 0 ; i < splits ; i++ )
14207-    {
14208-      STBIR__NEXT_PTR( info->split_info[i].decode_buffer, decode_buffer_size, float );
14209+#define STBIR__NEXT_PTR(ptr, size, ntype)                                      \
14210+	advance_mem = (void *)((((size_t)advance_mem) + 15) & ~15);                \
14211+	if (alloced)                                                               \
14212+		ptr = (ntype *)advance_mem;                                            \
14213+	advance_mem = (char *)(((size_t)advance_mem) + (size));
14214+#endif
14215+
14216+		STBIR__NEXT_PTR(info, sizeof(stbir__info), stbir__info);
14217+
14218+		STBIR__NEXT_PTR(info->split_info,
14219+		                sizeof(stbir__per_split_info) * splits,
14220+		                stbir__per_split_info);
14221+
14222+		if (info) {
14223+			static stbir__alpha_weight_func *fancy_alpha_weights[6] = {
14224+			    stbir__fancy_alpha_weight_4ch, stbir__fancy_alpha_weight_4ch,
14225+			    stbir__fancy_alpha_weight_4ch, stbir__fancy_alpha_weight_4ch,
14226+			    stbir__fancy_alpha_weight_2ch, stbir__fancy_alpha_weight_2ch};
14227+			static stbir__alpha_unweight_func *fancy_alpha_unweights[6] = {
14228+			    stbir__fancy_alpha_unweight_4ch,
14229+			    stbir__fancy_alpha_unweight_4ch,
14230+			    stbir__fancy_alpha_unweight_4ch,
14231+			    stbir__fancy_alpha_unweight_4ch,
14232+			    stbir__fancy_alpha_unweight_2ch,
14233+			    stbir__fancy_alpha_unweight_2ch};
14234+			static stbir__alpha_weight_func *simple_alpha_weights[6] = {
14235+			    stbir__simple_alpha_weight_4ch, stbir__simple_alpha_weight_4ch,
14236+			    stbir__simple_alpha_weight_4ch, stbir__simple_alpha_weight_4ch,
14237+			    stbir__simple_alpha_weight_2ch, stbir__simple_alpha_weight_2ch};
14238+			static stbir__alpha_unweight_func *simple_alpha_unweights[6] = {
14239+			    stbir__simple_alpha_unweight_4ch,
14240+			    stbir__simple_alpha_unweight_4ch,
14241+			    stbir__simple_alpha_unweight_4ch,
14242+			    stbir__simple_alpha_unweight_4ch,
14243+			    stbir__simple_alpha_unweight_2ch,
14244+			    stbir__simple_alpha_unweight_2ch};
14245+
14246+			// initialize info fields
14247+			info->alloced_mem = alloced;
14248+			info->alloced_total = alloced_total;
14249+
14250+			info->channels = channels;
14251+			info->effective_channels = effective_channels;
14252+
14253+			info->offset_x = new_x;
14254+			info->offset_y = new_y;
14255+			info->alloc_ring_buffer_num_entries =
14256+			    (int)alloc_ring_buffer_num_entries;
14257+			info->ring_buffer_num_entries = 0;
14258+			info->ring_buffer_length_bytes = (int)ring_buffer_length_bytes;
14259+			info->splits = splits;
14260+			info->vertical_first = vertical_first;
14261+
14262+			info->input_pixel_layout_internal = input_pixel_layout;
14263+			info->output_pixel_layout_internal = output_pixel_layout;
14264+
14265+			// setup alpha weight functions
14266+			info->alpha_weight = 0;
14267+			info->alpha_unweight = 0;
14268+
14269+			// handle alpha weighting functions and overrides
14270+			if (alpha_weighting_type == 2) {
14271+				// high quality alpha multiplying on the way in, dividing on the
14272+				// way out
14273+				info->alpha_weight =
14274+				    fancy_alpha_weights[input_pixel_layout - STBIRI_RGBA];
14275+				info->alpha_unweight =
14276+				    fancy_alpha_unweights[output_pixel_layout - STBIRI_RGBA];
14277+			} else if (alpha_weighting_type == 4) {
14278+				// fast alpha multiplying on the way in, dividing on the way out
14279+				info->alpha_weight =
14280+				    simple_alpha_weights[input_pixel_layout - STBIRI_RGBA];
14281+				info->alpha_unweight =
14282+				    simple_alpha_unweights[output_pixel_layout - STBIRI_RGBA];
14283+			} else if (alpha_weighting_type == 1) {
14284+				// fast alpha on the way in, leave in premultiplied form on way
14285+				// out
14286+				info->alpha_weight =
14287+				    simple_alpha_weights[input_pixel_layout - STBIRI_RGBA];
14288+			} else if (alpha_weighting_type == 3) {
14289+				// incoming is premultiplied, fast alpha dividing on the way out
14290+				// - non-premultiplied output
14291+				info->alpha_unweight =
14292+				    simple_alpha_unweights[output_pixel_layout - STBIRI_RGBA];
14293+			}
14294+
14295+			// handle 3-chan color flipping, using the alpha weight path
14296+			if (((input_pixel_layout == STBIRI_RGB) &&
14297+			     (output_pixel_layout == STBIRI_BGR)) ||
14298+			    ((input_pixel_layout == STBIRI_BGR) &&
14299+			     (output_pixel_layout == STBIRI_RGB))) {
14300+				// do the flipping on the smaller of the two ends
14301+				if (horizontal->scale_info.scale < 1.0f) {
14302+					info->alpha_unweight = stbir__simple_flip_3ch;
14303+				} else {
14304+					info->alpha_weight = stbir__simple_flip_3ch;
14305+				}
14306+			}
14307+		}
14308+
14309+		// get all the per-split buffers
14310+		for (i = 0; i < splits; i++) {
14311+			STBIR__NEXT_PTR(info->split_info[i].decode_buffer,
14312+			                decode_buffer_size, float);
14313 
14314 #ifdef STBIR__SEPARATE_ALLOCATIONS
14315 
14316-      #ifdef STBIR_SIMD8
14317-      if ( ( info ) && ( effective_channels == 3 ) )
14318-        ++info->split_info[i].decode_buffer; // avx in 3 channel mode needs one float at the start of the buffer
14319-      #endif
14320-
14321-      STBIR__NEXT_PTR( info->split_info[i].ring_buffers, alloc_ring_buffer_num_entries * sizeof(float*), float* );
14322-      {
14323-        int j;
14324-        for( j = 0 ; j < alloc_ring_buffer_num_entries ; j++ )
14325-        {
14326-          STBIR__NEXT_PTR( info->split_info[i].ring_buffers[j], ring_buffer_length_bytes, float );
14327-          #ifdef STBIR_SIMD8
14328-          if ( ( info ) && ( effective_channels == 3 ) )
14329-            ++info->split_info[i].ring_buffers[j]; // avx in 3 channel mode needs one float at the start of the buffer
14330-          #endif
14331-        }
14332-      }
14333+#ifdef STBIR_SIMD8
14334+			if ((info) && (effective_channels == 3)) {
14335+				++info->split_info[i]
14336+				      .decode_buffer; // avx in 3 channel mode needs one float
14337+				                      // at the start of the buffer
14338+			}
14339+#endif
14340+
14341+			STBIR__NEXT_PTR(info->split_info[i].ring_buffers,
14342+			                alloc_ring_buffer_num_entries * sizeof(float *),
14343+			                float *);
14344+			{
14345+				int j;
14346+				for (j = 0; j < alloc_ring_buffer_num_entries; j++) {
14347+					STBIR__NEXT_PTR(info->split_info[i].ring_buffers[j],
14348+					                ring_buffer_length_bytes, float);
14349+#ifdef STBIR_SIMD8
14350+					if ((info) && (effective_channels == 3)) {
14351+						++info->split_info[i]
14352+						      .ring_buffers[j]; // avx in 3 channel mode needs
14353+						                        // one float at the start of the
14354+						                        // buffer
14355+					}
14356+#endif
14357+				}
14358+			}
14359 #else
14360-      STBIR__NEXT_PTR( info->split_info[i].ring_buffer, ring_buffer_size, float );
14361+			STBIR__NEXT_PTR(info->split_info[i].ring_buffer, ring_buffer_size,
14362+			                float);
14363 #endif
14364-      STBIR__NEXT_PTR( info->split_info[i].vertical_buffer, vertical_buffer_size, float );
14365-    }
14366+			STBIR__NEXT_PTR(info->split_info[i].vertical_buffer,
14367+			                vertical_buffer_size, float);
14368+		}
14369 
14370-    // alloc memory for to-be-pivoted coeffs (if necessary)
14371-    if ( vertical->is_gather == 0 )
14372-    {
14373-      size_t both;
14374-      size_t temp_mem_amt;
14375+		// alloc memory for to-be-pivoted coeffs (if necessary)
14376+		if (vertical->is_gather == 0) {
14377+			size_t both;
14378+			size_t temp_mem_amt;
14379 
14380-      // when in vertical scatter mode, we first build the coefficients in gather mode, and then pivot after,
14381-      //   that means we need two buffers, so we try to use the decode buffer and ring buffer for this. if that
14382-      //   is too small, we just allocate extra memory to use as this temp.
14383+			// when in vertical scatter mode, we first build the coefficients in
14384+			// gather mode, and then pivot after,
14385+			//   that means we need two buffers, so we try to use the decode
14386+			//   buffer and ring buffer for this. if that is too small, we just
14387+			//   allocate extra memory to use as this temp.
14388 
14389-      both = (size_t)vertical->gather_prescatter_contributors_size + (size_t)vertical->gather_prescatter_coefficients_size;
14390+			both = (size_t)vertical->gather_prescatter_contributors_size +
14391+			       (size_t)vertical->gather_prescatter_coefficients_size;
14392 
14393 #ifdef STBIR__SEPARATE_ALLOCATIONS
14394-      temp_mem_amt = decode_buffer_size;
14395+			temp_mem_amt = decode_buffer_size;
14396 
14397-      #ifdef STBIR_SIMD8
14398-      if ( effective_channels == 3 )
14399-        --temp_mem_amt; // avx in 3 channel mode needs one float at the start of the buffer
14400-      #endif
14401+#ifdef STBIR_SIMD8
14402+			if (effective_channels == 3) {
14403+				--temp_mem_amt; // avx in 3 channel mode needs one float at the
14404+				                // start of the buffer
14405+			}
14406+#endif
14407 #else
14408-      temp_mem_amt = (size_t)( decode_buffer_size + ring_buffer_size + vertical_buffer_size ) * (size_t)splits;
14409-#endif
14410-      if ( temp_mem_amt >= both )
14411-      {
14412-        if ( info )
14413-        {
14414-          vertical->gather_prescatter_contributors = (stbir__contributors*)info->split_info[0].decode_buffer;
14415-          vertical->gather_prescatter_coefficients = (float*) ( ( (char*)info->split_info[0].decode_buffer ) + vertical->gather_prescatter_contributors_size );
14416-        }
14417-      }
14418-      else
14419-      {
14420-        // ring+decode memory is too small, so allocate temp memory
14421-        STBIR__NEXT_PTR( vertical->gather_prescatter_contributors, vertical->gather_prescatter_contributors_size, stbir__contributors );
14422-        STBIR__NEXT_PTR( vertical->gather_prescatter_coefficients, vertical->gather_prescatter_coefficients_size, float );
14423-      }
14424-    }
14425-
14426-    STBIR__NEXT_PTR( horizontal->contributors, horizontal->contributors_size, stbir__contributors );
14427-    STBIR__NEXT_PTR( horizontal->coefficients, horizontal->coefficients_size, float );
14428-
14429-    // are the two filters identical?? (happens a lot with mipmap generation)
14430-    if ( ( horizontal->filter_kernel == vertical->filter_kernel ) && ( horizontal->filter_support == vertical->filter_support ) && ( horizontal->edge == vertical->edge ) && ( horizontal->scale_info.output_sub_size == vertical->scale_info.output_sub_size ) )
14431-    {
14432-      float diff_scale = horizontal->scale_info.scale - vertical->scale_info.scale;
14433-      float diff_shift = horizontal->scale_info.pixel_shift - vertical->scale_info.pixel_shift;
14434-      if ( diff_scale < 0.0f ) diff_scale = -diff_scale;
14435-      if ( diff_shift < 0.0f ) diff_shift = -diff_shift;
14436-      if ( ( diff_scale <= stbir__small_float ) && ( diff_shift <= stbir__small_float ) )
14437-      {
14438-        if ( horizontal->is_gather == vertical->is_gather )
14439-        {
14440-          copy_horizontal = 1;
14441-          goto no_vert_alloc;
14442-        }
14443-        // everything matches, but vertical is scatter, horizontal is gather, use horizontal coeffs for vertical pivot coeffs
14444-        possibly_use_horizontal_for_pivot = horizontal;
14445-      }
14446-    }
14447-
14448-    STBIR__NEXT_PTR( vertical->contributors, vertical->contributors_size, stbir__contributors );
14449-    STBIR__NEXT_PTR( vertical->coefficients, vertical->coefficients_size, float );
14450-
14451-   no_vert_alloc:
14452-
14453-    if ( info )
14454-    {
14455-      STBIR_PROFILE_BUILD_START( horizontal );
14456-
14457-      stbir__calculate_filters( horizontal, 0, user_data STBIR_ONLY_PROFILE_BUILD_SET_INFO );
14458-
14459-      // setup the horizontal gather functions
14460-      // start with defaulting to the n_coeffs functions (specialized on channels and remnant leftover)
14461-      info->horizontal_gather_channels = stbir__horizontal_gather_n_coeffs_funcs[ effective_channels ][ horizontal->extent_info.widest & 3 ];
14462-      // but if the number of coeffs <= 12, use another set of special cases. <=12 coeffs is any enlarging resize, or shrinking resize down to about 1/3 size
14463-      if ( horizontal->extent_info.widest <= 12 )
14464-        info->horizontal_gather_channels = stbir__horizontal_gather_channels_funcs[ effective_channels ][ horizontal->extent_info.widest - 1 ];
14465-
14466-      info->scanline_extents.conservative.n0 = conservative->n0;
14467-      info->scanline_extents.conservative.n1 = conservative->n1;
14468-
14469-      // get exact extents
14470-      stbir__get_extents( horizontal, &info->scanline_extents );
14471-
14472-      // pack the horizontal coeffs
14473-      horizontal->coefficient_width = stbir__pack_coefficients(horizontal->num_contributors, horizontal->contributors, horizontal->coefficients, horizontal->coefficient_width, horizontal->extent_info.widest, info->scanline_extents.conservative.n0, info->scanline_extents.conservative.n1 );
14474-
14475-      STBIR_MEMCPY( &info->horizontal, horizontal, sizeof( stbir__sampler ) );
14476-
14477-      STBIR_PROFILE_BUILD_END( horizontal );
14478-
14479-      if ( copy_horizontal )
14480-      {
14481-        STBIR_MEMCPY( &info->vertical, horizontal, sizeof( stbir__sampler ) );
14482-      }
14483-      else
14484-      {
14485-        STBIR_PROFILE_BUILD_START( vertical );
14486-
14487-        stbir__calculate_filters( vertical, possibly_use_horizontal_for_pivot, user_data STBIR_ONLY_PROFILE_BUILD_SET_INFO );
14488-        STBIR_MEMCPY( &info->vertical, vertical, sizeof( stbir__sampler ) );
14489-
14490-        STBIR_PROFILE_BUILD_END( vertical );
14491-      }
14492-
14493-      // setup the vertical split ranges
14494-      stbir__get_split_info( info->split_info, info->splits, info->vertical.scale_info.output_sub_size, info->vertical.filter_pixel_margin, info->vertical.scale_info.input_full_size, info->vertical.is_gather, info->vertical.contributors );
14495-
14496-      // now we know precisely how many entries we need
14497-      info->ring_buffer_num_entries = info->vertical.extent_info.widest;
14498-
14499-      // we never need more ring buffer entries than the scanlines we're outputting
14500-      if ( ( !info->vertical.is_gather ) && ( info->ring_buffer_num_entries > conservative_split_output_size ) )
14501-        info->ring_buffer_num_entries = conservative_split_output_size;
14502-      STBIR_ASSERT( info->ring_buffer_num_entries <= info->alloc_ring_buffer_num_entries );
14503-    }
14504-    #undef STBIR__NEXT_PTR
14505-
14506-
14507-    // is this the first time through loop?
14508-    if ( info == 0 )
14509-    {
14510-      alloced_total = ( 15 + (size_t)advance_mem );
14511-      alloced = STBIR_MALLOC( alloced_total, user_data );
14512-      if ( alloced == 0 )
14513-        return 0;
14514-    }
14515-    else
14516-      return info;  // success
14517-  }
14518-}
14519-
14520-static int stbir__perform_resize( stbir__info const * info, int split_start, int split_count )
14521-{
14522-  stbir__per_split_info * split_info = info->split_info + split_start;
14523-
14524-  STBIR_PROFILE_CLEAR_EXTRAS();
14525-
14526-  STBIR_PROFILE_FIRST_START( looping );
14527-  if (info->vertical.is_gather)
14528-    stbir__vertical_gather_loop( info, split_info, split_count );
14529-  else
14530-    stbir__vertical_scatter_loop( info, split_info, split_count );
14531-  STBIR_PROFILE_END( looping );
14532-
14533-  return 1;
14534-}
14535-
14536-static void stbir__update_info_from_resize( stbir__info * info, STBIR_RESIZE * resize )
14537-{
14538-  static stbir__decode_pixels_func * decode_simple[STBIR_TYPE_HALF_FLOAT-STBIR_TYPE_UINT8_SRGB+1]=
14539-  {
14540-    /* 1ch-4ch */ stbir__decode_uint8_srgb, stbir__decode_uint8_srgb, 0, stbir__decode_float_linear, stbir__decode_half_float_linear,
14541-  };
14542-
14543-  static stbir__decode_pixels_func * decode_alphas[STBIRI_AR-STBIRI_RGBA+1][STBIR_TYPE_HALF_FLOAT-STBIR_TYPE_UINT8_SRGB+1]=
14544-  {
14545-    { /* RGBA */ stbir__decode_uint8_srgb4_linearalpha,      stbir__decode_uint8_srgb,      0, stbir__decode_float_linear,      stbir__decode_half_float_linear },
14546-    { /* BGRA */ stbir__decode_uint8_srgb4_linearalpha_BGRA, stbir__decode_uint8_srgb_BGRA, 0, stbir__decode_float_linear_BGRA, stbir__decode_half_float_linear_BGRA },
14547-    { /* ARGB */ stbir__decode_uint8_srgb4_linearalpha_ARGB, stbir__decode_uint8_srgb_ARGB, 0, stbir__decode_float_linear_ARGB, stbir__decode_half_float_linear_ARGB },
14548-    { /* ABGR */ stbir__decode_uint8_srgb4_linearalpha_ABGR, stbir__decode_uint8_srgb_ABGR, 0, stbir__decode_float_linear_ABGR, stbir__decode_half_float_linear_ABGR },
14549-    { /* RA   */ stbir__decode_uint8_srgb2_linearalpha,      stbir__decode_uint8_srgb,      0, stbir__decode_float_linear,      stbir__decode_half_float_linear },
14550-    { /* AR   */ stbir__decode_uint8_srgb2_linearalpha_AR,   stbir__decode_uint8_srgb_AR,   0, stbir__decode_float_linear_AR,   stbir__decode_half_float_linear_AR },
14551-  };
14552-
14553-  static stbir__decode_pixels_func * decode_simple_scaled_or_not[2][2]=
14554-  {
14555-    { stbir__decode_uint8_linear_scaled,  stbir__decode_uint8_linear }, { stbir__decode_uint16_linear_scaled, stbir__decode_uint16_linear },
14556-  };
14557-
14558-  static stbir__decode_pixels_func * decode_alphas_scaled_or_not[STBIRI_AR-STBIRI_RGBA+1][2][2]=
14559-  {
14560-    { /* RGBA */ { stbir__decode_uint8_linear_scaled,       stbir__decode_uint8_linear },      { stbir__decode_uint16_linear_scaled,      stbir__decode_uint16_linear } },
14561-    { /* BGRA */ { stbir__decode_uint8_linear_scaled_BGRA,  stbir__decode_uint8_linear_BGRA }, { stbir__decode_uint16_linear_scaled_BGRA, stbir__decode_uint16_linear_BGRA } },
14562-    { /* ARGB */ { stbir__decode_uint8_linear_scaled_ARGB,  stbir__decode_uint8_linear_ARGB }, { stbir__decode_uint16_linear_scaled_ARGB, stbir__decode_uint16_linear_ARGB } },
14563-    { /* ABGR */ { stbir__decode_uint8_linear_scaled_ABGR,  stbir__decode_uint8_linear_ABGR }, { stbir__decode_uint16_linear_scaled_ABGR, stbir__decode_uint16_linear_ABGR } },
14564-    { /* RA   */ { stbir__decode_uint8_linear_scaled,       stbir__decode_uint8_linear },      { stbir__decode_uint16_linear_scaled,      stbir__decode_uint16_linear } },
14565-    { /* AR   */ { stbir__decode_uint8_linear_scaled_AR,    stbir__decode_uint8_linear_AR },   { stbir__decode_uint16_linear_scaled_AR,   stbir__decode_uint16_linear_AR } }
14566-  };
14567-
14568-  static stbir__encode_pixels_func * encode_simple[STBIR_TYPE_HALF_FLOAT-STBIR_TYPE_UINT8_SRGB+1]=
14569-  {
14570-    /* 1ch-4ch */ stbir__encode_uint8_srgb, stbir__encode_uint8_srgb, 0, stbir__encode_float_linear, stbir__encode_half_float_linear,
14571-  };
14572-
14573-  static stbir__encode_pixels_func * encode_alphas[STBIRI_AR-STBIRI_RGBA+1][STBIR_TYPE_HALF_FLOAT-STBIR_TYPE_UINT8_SRGB+1]=
14574-  {
14575-    { /* RGBA */ stbir__encode_uint8_srgb4_linearalpha,      stbir__encode_uint8_srgb,      0, stbir__encode_float_linear,      stbir__encode_half_float_linear },
14576-    { /* BGRA */ stbir__encode_uint8_srgb4_linearalpha_BGRA, stbir__encode_uint8_srgb_BGRA, 0, stbir__encode_float_linear_BGRA, stbir__encode_half_float_linear_BGRA },
14577-    { /* ARGB */ stbir__encode_uint8_srgb4_linearalpha_ARGB, stbir__encode_uint8_srgb_ARGB, 0, stbir__encode_float_linear_ARGB, stbir__encode_half_float_linear_ARGB },
14578-    { /* ABGR */ stbir__encode_uint8_srgb4_linearalpha_ABGR, stbir__encode_uint8_srgb_ABGR, 0, stbir__encode_float_linear_ABGR, stbir__encode_half_float_linear_ABGR },
14579-    { /* RA   */ stbir__encode_uint8_srgb2_linearalpha,      stbir__encode_uint8_srgb,      0, stbir__encode_float_linear,      stbir__encode_half_float_linear },
14580-    { /* AR   */ stbir__encode_uint8_srgb2_linearalpha_AR,   stbir__encode_uint8_srgb_AR,   0, stbir__encode_float_linear_AR,   stbir__encode_half_float_linear_AR }
14581-  };
14582-
14583-  static stbir__encode_pixels_func * encode_simple_scaled_or_not[2][2]=
14584-  {
14585-    { stbir__encode_uint8_linear_scaled,  stbir__encode_uint8_linear }, { stbir__encode_uint16_linear_scaled, stbir__encode_uint16_linear },
14586-  };
14587-
14588-  static stbir__encode_pixels_func * encode_alphas_scaled_or_not[STBIRI_AR-STBIRI_RGBA+1][2][2]=
14589-  {
14590-    { /* RGBA */ { stbir__encode_uint8_linear_scaled,       stbir__encode_uint8_linear },       { stbir__encode_uint16_linear_scaled,      stbir__encode_uint16_linear } },
14591-    { /* BGRA */ { stbir__encode_uint8_linear_scaled_BGRA,  stbir__encode_uint8_linear_BGRA },  { stbir__encode_uint16_linear_scaled_BGRA, stbir__encode_uint16_linear_BGRA } },
14592-    { /* ARGB */ { stbir__encode_uint8_linear_scaled_ARGB,  stbir__encode_uint8_linear_ARGB },  { stbir__encode_uint16_linear_scaled_ARGB, stbir__encode_uint16_linear_ARGB } },
14593-    { /* ABGR */ { stbir__encode_uint8_linear_scaled_ABGR,  stbir__encode_uint8_linear_ABGR },  { stbir__encode_uint16_linear_scaled_ABGR, stbir__encode_uint16_linear_ABGR } },
14594-    { /* RA   */ { stbir__encode_uint8_linear_scaled,       stbir__encode_uint8_linear },       { stbir__encode_uint16_linear_scaled,      stbir__encode_uint16_linear } },
14595-    { /* AR   */ { stbir__encode_uint8_linear_scaled_AR,    stbir__encode_uint8_linear_AR },    { stbir__encode_uint16_linear_scaled_AR,   stbir__encode_uint16_linear_AR } }
14596-  };
14597-
14598-  stbir__decode_pixels_func * decode_pixels = 0;
14599-  stbir__encode_pixels_func * encode_pixels = 0;
14600-  stbir_datatype input_type, output_type;
14601-
14602-  input_type = resize->input_data_type;
14603-  output_type = resize->output_data_type;
14604-  info->input_data = resize->input_pixels;
14605-  info->input_stride_bytes = resize->input_stride_in_bytes;
14606-  info->output_stride_bytes = resize->output_stride_in_bytes;
14607-
14608-  // if we're completely point sampling, then we can turn off SRGB
14609-  if ( ( info->horizontal.filter_enum == STBIR_FILTER_POINT_SAMPLE ) && ( info->vertical.filter_enum == STBIR_FILTER_POINT_SAMPLE ) )
14610-  {
14611-    if ( ( ( input_type  == STBIR_TYPE_UINT8_SRGB ) || ( input_type  == STBIR_TYPE_UINT8_SRGB_ALPHA ) ) &&
14612-         ( ( output_type == STBIR_TYPE_UINT8_SRGB ) || ( output_type == STBIR_TYPE_UINT8_SRGB_ALPHA ) ) )
14613-    {
14614-      input_type = STBIR_TYPE_UINT8;
14615-      output_type = STBIR_TYPE_UINT8;
14616-    }
14617-  }
14618-
14619-  // recalc the output and input strides
14620-  if ( info->input_stride_bytes == 0 )
14621-    info->input_stride_bytes = info->channels * info->horizontal.scale_info.input_full_size * stbir__type_size[input_type];
14622-
14623-  if ( info->output_stride_bytes == 0 )
14624-    info->output_stride_bytes = info->channels * info->horizontal.scale_info.output_sub_size * stbir__type_size[output_type];
14625-
14626-  // calc offset
14627-  info->output_data = ( (char*) resize->output_pixels ) + ( (size_t) info->offset_y * (size_t) resize->output_stride_in_bytes ) + ( info->offset_x * info->channels * stbir__type_size[output_type] );
14628-
14629-  info->in_pixels_cb = resize->input_cb;
14630-  info->user_data = resize->user_data;
14631-  info->out_pixels_cb = resize->output_cb;
14632-
14633-  // setup the input format converters
14634-  if ( ( input_type == STBIR_TYPE_UINT8 ) || ( input_type == STBIR_TYPE_UINT16 ) )
14635-  {
14636-    int non_scaled = 0;
14637-
14638-    // check if we can run unscaled - 0-255.0/0-65535.0 instead of 0-1.0 (which is a tiny bit faster when doing linear 8->8 or 16->16)
14639-    if ( ( !info->alpha_weight ) && ( !info->alpha_unweight )  ) // don't short circuit when alpha weighting (get everything to 0-1.0 as usual)
14640-      if ( ( ( input_type == STBIR_TYPE_UINT8 ) && ( output_type == STBIR_TYPE_UINT8 ) ) || ( ( input_type == STBIR_TYPE_UINT16 ) && ( output_type == STBIR_TYPE_UINT16 ) ) )
14641-        non_scaled = 1;
14642-
14643-    if ( info->input_pixel_layout_internal <= STBIRI_4CHANNEL )
14644-      decode_pixels = decode_simple_scaled_or_not[ input_type == STBIR_TYPE_UINT16 ][ non_scaled ];
14645-    else
14646-      decode_pixels = decode_alphas_scaled_or_not[ ( info->input_pixel_layout_internal - STBIRI_RGBA ) % ( STBIRI_AR-STBIRI_RGBA+1 ) ][ input_type == STBIR_TYPE_UINT16 ][ non_scaled ];
14647-  }
14648-  else
14649-  {
14650-    if ( info->input_pixel_layout_internal <= STBIRI_4CHANNEL )
14651-      decode_pixels = decode_simple[ input_type - STBIR_TYPE_UINT8_SRGB ];
14652-    else
14653-      decode_pixels = decode_alphas[ ( info->input_pixel_layout_internal - STBIRI_RGBA ) % ( STBIRI_AR-STBIRI_RGBA+1 ) ][ input_type - STBIR_TYPE_UINT8_SRGB ];
14654-  }
14655-
14656-  // setup the output format converters
14657-  if ( ( output_type == STBIR_TYPE_UINT8 ) || ( output_type == STBIR_TYPE_UINT16 ) )
14658-  {
14659-    int non_scaled = 0;
14660-
14661-    // check if we can run unscaled - 0-255.0/0-65535.0 instead of 0-1.0 (which is a tiny bit faster when doing linear 8->8 or 16->16)
14662-    if ( ( !info->alpha_weight ) && ( !info->alpha_unweight ) ) // don't short circuit when alpha weighting (get everything to 0-1.0 as usual)
14663-      if ( ( ( input_type == STBIR_TYPE_UINT8 ) && ( output_type == STBIR_TYPE_UINT8 ) ) || ( ( input_type == STBIR_TYPE_UINT16 ) && ( output_type == STBIR_TYPE_UINT16 ) ) )
14664-        non_scaled = 1;
14665-
14666-    if ( info->output_pixel_layout_internal <= STBIRI_4CHANNEL )
14667-      encode_pixels = encode_simple_scaled_or_not[ output_type == STBIR_TYPE_UINT16 ][ non_scaled ];
14668-    else
14669-      encode_pixels = encode_alphas_scaled_or_not[ ( info->output_pixel_layout_internal - STBIRI_RGBA ) % ( STBIRI_AR-STBIRI_RGBA+1 ) ][ output_type == STBIR_TYPE_UINT16 ][ non_scaled ];
14670-  }
14671-  else
14672-  {
14673-    if ( info->output_pixel_layout_internal <= STBIRI_4CHANNEL )
14674-      encode_pixels = encode_simple[ output_type - STBIR_TYPE_UINT8_SRGB ];
14675-    else
14676-      encode_pixels = encode_alphas[ ( info->output_pixel_layout_internal - STBIRI_RGBA ) % ( STBIRI_AR-STBIRI_RGBA+1 ) ][ output_type - STBIR_TYPE_UINT8_SRGB ];
14677-  }
14678-
14679-  info->input_type = input_type;
14680-  info->output_type = output_type;
14681-  info->decode_pixels = decode_pixels;
14682-  info->encode_pixels = encode_pixels;
14683-}
14684-
14685-static void stbir__clip( int * outx, int * outsubw, int outw, double * u0, double * u1 )
14686-{
14687-  double per, adj;
14688-  int over;
14689-
14690-  // do left/top edge
14691-  if ( *outx < 0 )
14692-  {
14693-    per = ( (double)*outx ) / ( (double)*outsubw ); // is negative
14694-    adj = per * ( *u1 - *u0 );
14695-    *u0 -= adj; // increases u0
14696-    *outx = 0;
14697-  }
14698-
14699-  // do right/bot edge
14700-  over = outw - ( *outx + *outsubw );
14701-  if ( over < 0 )
14702-  {
14703-    per = ( (double)over ) / ( (double)*outsubw ); // is negative
14704-    adj = per * ( *u1 - *u0 );
14705-    *u1 += adj; // decrease u1
14706-    *outsubw = outw - *outx;
14707-  }
14708-}
14709-
14710-// converts a double to a rational that has less than one float bit of error (returns 0 if unable to do so)
14711-static int stbir__double_to_rational(double f, stbir_uint32 limit, stbir_uint32 *numer, stbir_uint32 *denom, int limit_denom ) // limit_denom (1) or limit numer (0)
14712-{
14713-  double err;
14714-  stbir_uint64 top, bot;
14715-  stbir_uint64 numer_last = 0;
14716-  stbir_uint64 denom_last = 1;
14717-  stbir_uint64 numer_estimate = 1;
14718-  stbir_uint64 denom_estimate = 0;
14719-
14720-  // scale to past float error range
14721-  top = (stbir_uint64)( f * (double)(1 << 25) );
14722-  bot = 1 << 25;
14723-
14724-  // keep refining, but usually stops in a few loops - usually 5 for bad cases
14725-  for(;;)
14726-  {
14727-    stbir_uint64 est, temp;
14728-
14729-    // hit limit, break out and do best full range estimate
14730-    if ( ( ( limit_denom ) ? denom_estimate : numer_estimate ) >= limit )
14731-      break;
14732-
14733-    // is the current error less than 1 bit of a float? if so, we're done
14734-    if ( denom_estimate )
14735-    {
14736-      err = ( (double)numer_estimate / (double)denom_estimate ) - f;
14737-      if ( err < 0.0 ) err = -err;
14738-      if ( err < ( 1.0 / (double)(1<<24) ) )
14739-      {
14740-        // yup, found it
14741-        *numer = (stbir_uint32) numer_estimate;
14742-        *denom = (stbir_uint32) denom_estimate;
14743-        return 1;
14744-      }
14745-    }
14746-
14747-    // no more refinement bits left? break out and do full range estimate
14748-    if ( bot == 0 )
14749-      break;
14750-
14751-    // gcd the estimate bits
14752-    est = top / bot;
14753-    temp = top % bot;
14754-    top = bot;
14755-    bot = temp;
14756-
14757-    // move remainders
14758-    temp = est * denom_estimate + denom_last;
14759-    denom_last = denom_estimate;
14760-    denom_estimate = temp;
14761-
14762-    // move remainders
14763-    temp = est * numer_estimate + numer_last;
14764-    numer_last = numer_estimate;
14765-    numer_estimate = temp;
14766-  }
14767-
14768-  // we didn't fine anything good enough for float, use a full range estimate
14769-  if ( limit_denom )
14770-  {
14771-    numer_estimate= (stbir_uint64)( f * (double)limit + 0.5 );
14772-    denom_estimate = limit;
14773-  }
14774-  else
14775-  {
14776-    numer_estimate = limit;
14777-    denom_estimate = (stbir_uint64)( ( (double)limit / f ) + 0.5 );
14778-  }
14779-
14780-  *numer = (stbir_uint32) numer_estimate;
14781-  *denom = (stbir_uint32) denom_estimate;
14782-
14783-  err = ( denom_estimate ) ? ( ( (double)(stbir_uint32)numer_estimate / (double)(stbir_uint32)denom_estimate ) - f ) : 1.0;
14784-  if ( err < 0.0 ) err = -err;
14785-  return ( err < ( 1.0 / (double)(1<<24) ) ) ? 1 : 0;
14786-}
14787-
14788-static int stbir__calculate_region_transform( stbir__scale_info * scale_info, int output_full_range, int * output_offset, int output_sub_range, int input_full_range, double input_s0, double input_s1 )
14789-{
14790-  double output_range, input_range, output_s, input_s, ratio, scale;
14791-
14792-  input_s = input_s1 - input_s0;
14793-
14794-  // null area
14795-  if ( ( output_full_range == 0 ) || ( input_full_range == 0 ) ||
14796-       ( output_sub_range == 0 ) || ( input_s <= stbir__small_float ) )
14797-    return 0;
14798-
14799-  // are either of the ranges completely out of bounds?
14800-  if ( ( *output_offset >= output_full_range ) || ( ( *output_offset + output_sub_range ) <= 0 ) || ( input_s0 >= (1.0f-stbir__small_float) ) || ( input_s1 <= stbir__small_float ) )
14801-    return 0;
14802-
14803-  output_range = (double)output_full_range;
14804-  input_range = (double)input_full_range;
14805-
14806-  output_s = ( (double)output_sub_range) / output_range;
14807-
14808-  // figure out the scaling to use
14809-  ratio = output_s / input_s;
14810-
14811-  // save scale before clipping
14812-  scale = ( output_range / input_range ) * ratio;
14813-  scale_info->scale = (float)scale;
14814-  scale_info->inv_scale = (float)( 1.0 / scale );
14815-
14816-  // clip output area to left/right output edges (and adjust input area)
14817-  stbir__clip( output_offset, &output_sub_range, output_full_range, &input_s0, &input_s1 );
14818-
14819-  // recalc input area
14820-  input_s = input_s1 - input_s0;
14821-
14822-  // after clipping do we have zero input area?
14823-  if ( input_s <= stbir__small_float )
14824-    return 0;
14825-
14826-  // calculate and store the starting source offsets in output pixel space
14827-  scale_info->pixel_shift = (float) ( input_s0 * ratio * output_range );
14828-
14829-  scale_info->scale_is_rational = stbir__double_to_rational( scale, ( scale <= 1.0 ) ? output_full_range : input_full_range, &scale_info->scale_numerator, &scale_info->scale_denominator, ( scale >= 1.0 ) );
14830-
14831-  scale_info->input_full_size = input_full_range;
14832-  scale_info->output_sub_size = output_sub_range;
14833-
14834-  return 1;
14835-}
14836-
14837-
14838-static void stbir__init_and_set_layout( STBIR_RESIZE * resize, stbir_pixel_layout pixel_layout, stbir_datatype data_type )
14839-{
14840-  resize->input_cb = 0;
14841-  resize->output_cb = 0;
14842-  resize->user_data = resize;
14843-  resize->samplers = 0;
14844-  resize->called_alloc = 0;
14845-  resize->horizontal_filter = STBIR_FILTER_DEFAULT;
14846-  resize->horizontal_filter_kernel = 0; resize->horizontal_filter_support = 0;
14847-  resize->vertical_filter = STBIR_FILTER_DEFAULT;
14848-  resize->vertical_filter_kernel = 0; resize->vertical_filter_support = 0;
14849-  resize->horizontal_edge = STBIR_EDGE_CLAMP;
14850-  resize->vertical_edge = STBIR_EDGE_CLAMP;
14851-  resize->input_s0 = 0; resize->input_t0 = 0; resize->input_s1 = 1; resize->input_t1 = 1;
14852-  resize->output_subx = 0; resize->output_suby = 0; resize->output_subw = resize->output_w; resize->output_subh = resize->output_h;
14853-  resize->input_data_type = data_type;
14854-  resize->output_data_type = data_type;
14855-  resize->input_pixel_layout_public = pixel_layout;
14856-  resize->output_pixel_layout_public = pixel_layout;
14857-  resize->needs_rebuild = 1;
14858-}
14859-
14860-STBIRDEF void stbir_resize_init( STBIR_RESIZE * resize,
14861-                                 const void *input_pixels,  int input_w,  int input_h, int input_stride_in_bytes, // stride can be zero
14862-                                       void *output_pixels, int output_w, int output_h, int output_stride_in_bytes, // stride can be zero
14863-                                 stbir_pixel_layout pixel_layout, stbir_datatype data_type )
14864-{
14865-  resize->input_pixels = input_pixels;
14866-  resize->input_w = input_w;
14867-  resize->input_h = input_h;
14868-  resize->input_stride_in_bytes = input_stride_in_bytes;
14869-  resize->output_pixels = output_pixels;
14870-  resize->output_w = output_w;
14871-  resize->output_h = output_h;
14872-  resize->output_stride_in_bytes = output_stride_in_bytes;
14873-  resize->fast_alpha = 0;
14874-
14875-  stbir__init_and_set_layout( resize, pixel_layout, data_type );
14876+			temp_mem_amt = (size_t)(decode_buffer_size + ring_buffer_size +
14877+			                        vertical_buffer_size) *
14878+			               (size_t)splits;
14879+#endif
14880+			if (temp_mem_amt >= both) {
14881+				if (info) {
14882+					vertical->gather_prescatter_contributors =
14883+					    (stbir__contributors *)info->split_info[0]
14884+					        .decode_buffer;
14885+					vertical->gather_prescatter_coefficients =
14886+					    (float *)(((char *)info->split_info[0].decode_buffer) +
14887+					              vertical
14888+					                  ->gather_prescatter_contributors_size);
14889+				}
14890+			} else {
14891+				// ring+decode memory is too small, so allocate temp memory
14892+				STBIR__NEXT_PTR(vertical->gather_prescatter_contributors,
14893+				                vertical->gather_prescatter_contributors_size,
14894+				                stbir__contributors);
14895+				STBIR__NEXT_PTR(vertical->gather_prescatter_coefficients,
14896+				                vertical->gather_prescatter_coefficients_size,
14897+				                float);
14898+			}
14899+		}
14900+
14901+		STBIR__NEXT_PTR(horizontal->contributors, horizontal->contributors_size,
14902+		                stbir__contributors);
14903+		STBIR__NEXT_PTR(horizontal->coefficients, horizontal->coefficients_size,
14904+		                float);
14905+
14906+		// are the two filters identical?? (happens a lot with mipmap
14907+		// generation)
14908+		if ((horizontal->filter_kernel == vertical->filter_kernel) &&
14909+		    (horizontal->filter_support == vertical->filter_support) &&
14910+		    (horizontal->edge == vertical->edge) &&
14911+		    (horizontal->scale_info.output_sub_size ==
14912+		     vertical->scale_info.output_sub_size)) {
14913+			float diff_scale =
14914+			    horizontal->scale_info.scale - vertical->scale_info.scale;
14915+			float diff_shift = horizontal->scale_info.pixel_shift -
14916+			                   vertical->scale_info.pixel_shift;
14917+			if (diff_scale < 0.0f) {
14918+				diff_scale = -diff_scale;
14919+			}
14920+			if (diff_shift < 0.0f) {
14921+				diff_shift = -diff_shift;
14922+			}
14923+			if ((diff_scale <= stbir__small_float) &&
14924+			    (diff_shift <= stbir__small_float)) {
14925+				if (horizontal->is_gather == vertical->is_gather) {
14926+					copy_horizontal = 1;
14927+					goto no_vert_alloc;
14928+				}
14929+				// everything matches, but vertical is scatter, horizontal is
14930+				// gather, use horizontal coeffs for vertical pivot coeffs
14931+				possibly_use_horizontal_for_pivot = horizontal;
14932+			}
14933+		}
14934+
14935+		STBIR__NEXT_PTR(vertical->contributors, vertical->contributors_size,
14936+		                stbir__contributors);
14937+		STBIR__NEXT_PTR(vertical->coefficients, vertical->coefficients_size,
14938+		                float);
14939+
14940+	no_vert_alloc:
14941+
14942+		if (info) {
14943+			STBIR_PROFILE_BUILD_START(horizontal);
14944+
14945+			stbir__calculate_filters(
14946+			    horizontal, 0, user_data STBIR_ONLY_PROFILE_BUILD_SET_INFO);
14947+
14948+			// setup the horizontal gather functions
14949+			// start with defaulting to the n_coeffs functions (specialized on
14950+			// channels and remnant leftover)
14951+			info->horizontal_gather_channels =
14952+			    stbir__horizontal_gather_n_coeffs_funcs
14953+			        [effective_channels][horizontal->extent_info.widest & 3];
14954+			// but if the number of coeffs <= 12, use another set of special
14955+			// cases. <=12 coeffs is any enlarging resize, or shrinking resize
14956+			// down to about 1/3 size
14957+			if (horizontal->extent_info.widest <= 12) {
14958+				info->horizontal_gather_channels =
14959+				    stbir__horizontal_gather_channels_funcs
14960+				        [effective_channels]
14961+				        [horizontal->extent_info.widest - 1];
14962+			}
14963+
14964+			info->scanline_extents.conservative.n0 = conservative->n0;
14965+			info->scanline_extents.conservative.n1 = conservative->n1;
14966+
14967+			// get exact extents
14968+			stbir__get_extents(horizontal, &info->scanline_extents);
14969+
14970+			// pack the horizontal coeffs
14971+			horizontal->coefficient_width = stbir__pack_coefficients(
14972+			    horizontal->num_contributors, horizontal->contributors,
14973+			    horizontal->coefficients, horizontal->coefficient_width,
14974+			    horizontal->extent_info.widest,
14975+			    info->scanline_extents.conservative.n0,
14976+			    info->scanline_extents.conservative.n1);
14977+
14978+			STBIR_MEMCPY(&info->horizontal, horizontal, sizeof(stbir__sampler));
14979+
14980+			STBIR_PROFILE_BUILD_END(horizontal);
14981+
14982+			if (copy_horizontal) {
14983+				STBIR_MEMCPY(&info->vertical, horizontal,
14984+				             sizeof(stbir__sampler));
14985+			} else {
14986+				STBIR_PROFILE_BUILD_START(vertical);
14987+
14988+				stbir__calculate_filters(
14989+				    vertical, possibly_use_horizontal_for_pivot,
14990+				    user_data STBIR_ONLY_PROFILE_BUILD_SET_INFO);
14991+				STBIR_MEMCPY(&info->vertical, vertical, sizeof(stbir__sampler));
14992+
14993+				STBIR_PROFILE_BUILD_END(vertical);
14994+			}
14995+
14996+			// setup the vertical split ranges
14997+			stbir__get_split_info(info->split_info, info->splits,
14998+			                      info->vertical.scale_info.output_sub_size,
14999+			                      info->vertical.filter_pixel_margin,
15000+			                      info->vertical.scale_info.input_full_size,
15001+			                      info->vertical.is_gather,
15002+			                      info->vertical.contributors);
15003+
15004+			// now we know precisely how many entries we need
15005+			info->ring_buffer_num_entries = info->vertical.extent_info.widest;
15006+
15007+			// we never need more ring buffer entries than the scanlines we're
15008+			// outputting
15009+			if ((!info->vertical.is_gather) &&
15010+			    (info->ring_buffer_num_entries >
15011+			     conservative_split_output_size)) {
15012+				info->ring_buffer_num_entries = conservative_split_output_size;
15013+			}
15014+			STBIR_ASSERT(info->ring_buffer_num_entries <=
15015+			             info->alloc_ring_buffer_num_entries);
15016+		}
15017+#undef STBIR__NEXT_PTR
15018+
15019+		// is this the first time through loop?
15020+		if (info == 0) {
15021+			alloced_total = (15 + (size_t)advance_mem);
15022+			alloced = STBIR_MALLOC(alloced_total, user_data);
15023+			if (alloced == 0) {
15024+				return 0;
15025+			}
15026+		} else {
15027+			return info; // success
15028+		}
15029+	}
15030+}
15031+
15032+static int
15033+stbir__perform_resize(stbir__info const *info, int split_start, int split_count)
15034+{
15035+	stbir__per_split_info *split_info = info->split_info + split_start;
15036+
15037+	STBIR_PROFILE_CLEAR_EXTRAS();
15038+
15039+	STBIR_PROFILE_FIRST_START(looping);
15040+	if (info->vertical.is_gather) {
15041+		stbir__vertical_gather_loop(info, split_info, split_count);
15042+	} else {
15043+		stbir__vertical_scatter_loop(info, split_info, split_count);
15044+	}
15045+	STBIR_PROFILE_END(looping);
15046+
15047+	return 1;
15048+}
15049+
15050+static void
15051+stbir__update_info_from_resize(stbir__info *info, STBIR_RESIZE *resize)
15052+{
15053+	static stbir__decode_pixels_func
15054+	    *decode_simple[STBIR_TYPE_HALF_FLOAT - STBIR_TYPE_UINT8_SRGB + 1] = {
15055+	        /* 1ch-4ch */ stbir__decode_uint8_srgb,
15056+	        stbir__decode_uint8_srgb,
15057+	        0,
15058+	        stbir__decode_float_linear,
15059+	        stbir__decode_half_float_linear,
15060+	    };
15061+
15062+	static stbir__decode_pixels_func
15063+	    *decode_alphas[STBIRI_AR - STBIRI_RGBA +
15064+	                   1][STBIR_TYPE_HALF_FLOAT - STBIR_TYPE_UINT8_SRGB + 1] = {
15065+	        {/* RGBA */ stbir__decode_uint8_srgb4_linearalpha,
15066+	         stbir__decode_uint8_srgb, 0, stbir__decode_float_linear,
15067+	         stbir__decode_half_float_linear},
15068+	        {/* BGRA */ stbir__decode_uint8_srgb4_linearalpha_BGRA,
15069+	         stbir__decode_uint8_srgb_BGRA, 0, stbir__decode_float_linear_BGRA,
15070+	         stbir__decode_half_float_linear_BGRA},
15071+	        {/* ARGB */ stbir__decode_uint8_srgb4_linearalpha_ARGB,
15072+	         stbir__decode_uint8_srgb_ARGB, 0, stbir__decode_float_linear_ARGB,
15073+	         stbir__decode_half_float_linear_ARGB},
15074+	        {/* ABGR */ stbir__decode_uint8_srgb4_linearalpha_ABGR,
15075+	         stbir__decode_uint8_srgb_ABGR, 0, stbir__decode_float_linear_ABGR,
15076+	         stbir__decode_half_float_linear_ABGR},
15077+	        {/* RA   */ stbir__decode_uint8_srgb2_linearalpha,
15078+	         stbir__decode_uint8_srgb, 0, stbir__decode_float_linear,
15079+	         stbir__decode_half_float_linear},
15080+	        {/* AR   */ stbir__decode_uint8_srgb2_linearalpha_AR,
15081+	         stbir__decode_uint8_srgb_AR, 0, stbir__decode_float_linear_AR,
15082+	         stbir__decode_half_float_linear_AR},
15083+	    };
15084+
15085+	static stbir__decode_pixels_func *decode_simple_scaled_or_not[2][2] = {
15086+	    {stbir__decode_uint8_linear_scaled, stbir__decode_uint8_linear},
15087+	    {stbir__decode_uint16_linear_scaled, stbir__decode_uint16_linear},
15088+	};
15089+
15090+	static stbir__decode_pixels_func
15091+	    *decode_alphas_scaled_or_not[STBIRI_AR - STBIRI_RGBA + 1][2][2] = {
15092+	        {/* RGBA */ {stbir__decode_uint8_linear_scaled,
15093+	                     stbir__decode_uint8_linear},
15094+	         {stbir__decode_uint16_linear_scaled, stbir__decode_uint16_linear}},
15095+	        {/* BGRA */ {stbir__decode_uint8_linear_scaled_BGRA,
15096+	                     stbir__decode_uint8_linear_BGRA},
15097+	         {stbir__decode_uint16_linear_scaled_BGRA,
15098+	          stbir__decode_uint16_linear_BGRA}},
15099+	        {/* ARGB */ {stbir__decode_uint8_linear_scaled_ARGB,
15100+	                     stbir__decode_uint8_linear_ARGB},
15101+	         {stbir__decode_uint16_linear_scaled_ARGB,
15102+	          stbir__decode_uint16_linear_ARGB}},
15103+	        {/* ABGR */ {stbir__decode_uint8_linear_scaled_ABGR,
15104+	                     stbir__decode_uint8_linear_ABGR},
15105+	         {stbir__decode_uint16_linear_scaled_ABGR,
15106+	          stbir__decode_uint16_linear_ABGR}},
15107+	        {/* RA   */ {stbir__decode_uint8_linear_scaled,
15108+	                     stbir__decode_uint8_linear},
15109+	         {stbir__decode_uint16_linear_scaled, stbir__decode_uint16_linear}},
15110+	        {/* AR   */ {stbir__decode_uint8_linear_scaled_AR,
15111+	                     stbir__decode_uint8_linear_AR},
15112+	         {stbir__decode_uint16_linear_scaled_AR,
15113+	          stbir__decode_uint16_linear_AR}}};
15114+
15115+	static stbir__encode_pixels_func
15116+	    *encode_simple[STBIR_TYPE_HALF_FLOAT - STBIR_TYPE_UINT8_SRGB + 1] = {
15117+	        /* 1ch-4ch */ stbir__encode_uint8_srgb,
15118+	        stbir__encode_uint8_srgb,
15119+	        0,
15120+	        stbir__encode_float_linear,
15121+	        stbir__encode_half_float_linear,
15122+	    };
15123+
15124+	static stbir__encode_pixels_func
15125+	    *encode_alphas[STBIRI_AR - STBIRI_RGBA +
15126+	                   1][STBIR_TYPE_HALF_FLOAT - STBIR_TYPE_UINT8_SRGB + 1] = {
15127+	        {/* RGBA */ stbir__encode_uint8_srgb4_linearalpha,
15128+	         stbir__encode_uint8_srgb, 0, stbir__encode_float_linear,
15129+	         stbir__encode_half_float_linear},
15130+	        {/* BGRA */ stbir__encode_uint8_srgb4_linearalpha_BGRA,
15131+	         stbir__encode_uint8_srgb_BGRA, 0, stbir__encode_float_linear_BGRA,
15132+	         stbir__encode_half_float_linear_BGRA},
15133+	        {/* ARGB */ stbir__encode_uint8_srgb4_linearalpha_ARGB,
15134+	         stbir__encode_uint8_srgb_ARGB, 0, stbir__encode_float_linear_ARGB,
15135+	         stbir__encode_half_float_linear_ARGB},
15136+	        {/* ABGR */ stbir__encode_uint8_srgb4_linearalpha_ABGR,
15137+	         stbir__encode_uint8_srgb_ABGR, 0, stbir__encode_float_linear_ABGR,
15138+	         stbir__encode_half_float_linear_ABGR},
15139+	        {/* RA   */ stbir__encode_uint8_srgb2_linearalpha,
15140+	         stbir__encode_uint8_srgb, 0, stbir__encode_float_linear,
15141+	         stbir__encode_half_float_linear},
15142+	        {/* AR   */ stbir__encode_uint8_srgb2_linearalpha_AR,
15143+	         stbir__encode_uint8_srgb_AR, 0, stbir__encode_float_linear_AR,
15144+	         stbir__encode_half_float_linear_AR}};
15145+
15146+	static stbir__encode_pixels_func *encode_simple_scaled_or_not[2][2] = {
15147+	    {stbir__encode_uint8_linear_scaled, stbir__encode_uint8_linear},
15148+	    {stbir__encode_uint16_linear_scaled, stbir__encode_uint16_linear},
15149+	};
15150+
15151+	static stbir__encode_pixels_func
15152+	    *encode_alphas_scaled_or_not[STBIRI_AR - STBIRI_RGBA + 1][2][2] = {
15153+	        {/* RGBA */ {stbir__encode_uint8_linear_scaled,
15154+	                     stbir__encode_uint8_linear},
15155+	         {stbir__encode_uint16_linear_scaled, stbir__encode_uint16_linear}},
15156+	        {/* BGRA */ {stbir__encode_uint8_linear_scaled_BGRA,
15157+	                     stbir__encode_uint8_linear_BGRA},
15158+	         {stbir__encode_uint16_linear_scaled_BGRA,
15159+	          stbir__encode_uint16_linear_BGRA}},
15160+	        {/* ARGB */ {stbir__encode_uint8_linear_scaled_ARGB,
15161+	                     stbir__encode_uint8_linear_ARGB},
15162+	         {stbir__encode_uint16_linear_scaled_ARGB,
15163+	          stbir__encode_uint16_linear_ARGB}},
15164+	        {/* ABGR */ {stbir__encode_uint8_linear_scaled_ABGR,
15165+	                     stbir__encode_uint8_linear_ABGR},
15166+	         {stbir__encode_uint16_linear_scaled_ABGR,
15167+	          stbir__encode_uint16_linear_ABGR}},
15168+	        {/* RA   */ {stbir__encode_uint8_linear_scaled,
15169+	                     stbir__encode_uint8_linear},
15170+	         {stbir__encode_uint16_linear_scaled, stbir__encode_uint16_linear}},
15171+	        {/* AR   */ {stbir__encode_uint8_linear_scaled_AR,
15172+	                     stbir__encode_uint8_linear_AR},
15173+	         {stbir__encode_uint16_linear_scaled_AR,
15174+	          stbir__encode_uint16_linear_AR}}};
15175+
15176+	stbir__decode_pixels_func *decode_pixels = 0;
15177+	stbir__encode_pixels_func *encode_pixels = 0;
15178+	stbir_datatype input_type, output_type;
15179+
15180+	input_type = resize->input_data_type;
15181+	output_type = resize->output_data_type;
15182+	info->input_data = resize->input_pixels;
15183+	info->input_stride_bytes = resize->input_stride_in_bytes;
15184+	info->output_stride_bytes = resize->output_stride_in_bytes;
15185+
15186+	// if we're completely point sampling, then we can turn off SRGB
15187+	if ((info->horizontal.filter_enum == STBIR_FILTER_POINT_SAMPLE) &&
15188+	    (info->vertical.filter_enum == STBIR_FILTER_POINT_SAMPLE)) {
15189+		if (((input_type == STBIR_TYPE_UINT8_SRGB) ||
15190+		     (input_type == STBIR_TYPE_UINT8_SRGB_ALPHA)) &&
15191+		    ((output_type == STBIR_TYPE_UINT8_SRGB) ||
15192+		     (output_type == STBIR_TYPE_UINT8_SRGB_ALPHA))) {
15193+			input_type = STBIR_TYPE_UINT8;
15194+			output_type = STBIR_TYPE_UINT8;
15195+		}
15196+	}
15197+
15198+	// recalc the output and input strides
15199+	if (info->input_stride_bytes == 0) {
15200+		info->input_stride_bytes = info->channels *
15201+		                           info->horizontal.scale_info.input_full_size *
15202+		                           stbir__type_size[input_type];
15203+	}
15204+
15205+	if (info->output_stride_bytes == 0) {
15206+		info->output_stride_bytes =
15207+		    info->channels * info->horizontal.scale_info.output_sub_size *
15208+		    stbir__type_size[output_type];
15209+	}
15210+
15211+	// calc offset
15212+	info->output_data =
15213+	    ((char *)resize->output_pixels) +
15214+	    ((size_t)info->offset_y * (size_t)resize->output_stride_in_bytes) +
15215+	    (info->offset_x * info->channels * stbir__type_size[output_type]);
15216+
15217+	info->in_pixels_cb = resize->input_cb;
15218+	info->user_data = resize->user_data;
15219+	info->out_pixels_cb = resize->output_cb;
15220+
15221+	// setup the input format converters
15222+	if ((input_type == STBIR_TYPE_UINT8) || (input_type == STBIR_TYPE_UINT16)) {
15223+		int non_scaled = 0;
15224+
15225+		// check if we can run unscaled - 0-255.0/0-65535.0 instead of 0-1.0
15226+		// (which is a tiny bit faster when doing linear 8->8 or 16->16)
15227+		if ((!info->alpha_weight) &&
15228+		    (!info->alpha_unweight)) { // don't short circuit when alpha
15229+			                           // weighting (get everything to 0-1.0 as
15230+			                           // usual)
15231+			if (((input_type == STBIR_TYPE_UINT8) &&
15232+			     (output_type == STBIR_TYPE_UINT8)) ||
15233+			    ((input_type == STBIR_TYPE_UINT16) &&
15234+			     (output_type == STBIR_TYPE_UINT16))) {
15235+				non_scaled = 1;
15236+			}
15237+		}
15238+
15239+		if (info->input_pixel_layout_internal <= STBIRI_4CHANNEL) {
15240+			decode_pixels =
15241+			    decode_simple_scaled_or_not[input_type == STBIR_TYPE_UINT16]
15242+			                               [non_scaled];
15243+		} else {
15244+			decode_pixels =
15245+			    decode_alphas_scaled_or_not[(info->input_pixel_layout_internal -
15246+			                                 STBIRI_RGBA) %
15247+			                                (STBIRI_AR - STBIRI_RGBA + 1)]
15248+			                               [input_type == STBIR_TYPE_UINT16]
15249+			                               [non_scaled];
15250+		}
15251+	} else {
15252+		if (info->input_pixel_layout_internal <= STBIRI_4CHANNEL) {
15253+			decode_pixels = decode_simple[input_type - STBIR_TYPE_UINT8_SRGB];
15254+		} else {
15255+			decode_pixels = decode_alphas[(info->input_pixel_layout_internal -
15256+			                               STBIRI_RGBA) %
15257+			                              (STBIRI_AR - STBIRI_RGBA + 1)]
15258+			                             [input_type - STBIR_TYPE_UINT8_SRGB];
15259+		}
15260+	}
15261+
15262+	// setup the output format converters
15263+	if ((output_type == STBIR_TYPE_UINT8) ||
15264+	    (output_type == STBIR_TYPE_UINT16)) {
15265+		int non_scaled = 0;
15266+
15267+		// check if we can run unscaled - 0-255.0/0-65535.0 instead of 0-1.0
15268+		// (which is a tiny bit faster when doing linear 8->8 or 16->16)
15269+		if ((!info->alpha_weight) &&
15270+		    (!info->alpha_unweight)) { // don't short circuit when alpha
15271+			                           // weighting (get everything to 0-1.0 as
15272+			                           // usual)
15273+			if (((input_type == STBIR_TYPE_UINT8) &&
15274+			     (output_type == STBIR_TYPE_UINT8)) ||
15275+			    ((input_type == STBIR_TYPE_UINT16) &&
15276+			     (output_type == STBIR_TYPE_UINT16))) {
15277+				non_scaled = 1;
15278+			}
15279+		}
15280+
15281+		if (info->output_pixel_layout_internal <= STBIRI_4CHANNEL) {
15282+			encode_pixels =
15283+			    encode_simple_scaled_or_not[output_type == STBIR_TYPE_UINT16]
15284+			                               [non_scaled];
15285+		} else {
15286+			encode_pixels = encode_alphas_scaled_or_not
15287+			    [(info->output_pixel_layout_internal - STBIRI_RGBA) %
15288+			     (STBIRI_AR - STBIRI_RGBA + 1)]
15289+			    [output_type == STBIR_TYPE_UINT16][non_scaled];
15290+		}
15291+	} else {
15292+		if (info->output_pixel_layout_internal <= STBIRI_4CHANNEL) {
15293+			encode_pixels = encode_simple[output_type - STBIR_TYPE_UINT8_SRGB];
15294+		} else {
15295+			encode_pixels = encode_alphas[(info->output_pixel_layout_internal -
15296+			                               STBIRI_RGBA) %
15297+			                              (STBIRI_AR - STBIRI_RGBA + 1)]
15298+			                             [output_type - STBIR_TYPE_UINT8_SRGB];
15299+		}
15300+	}
15301+
15302+	info->input_type = input_type;
15303+	info->output_type = output_type;
15304+	info->decode_pixels = decode_pixels;
15305+	info->encode_pixels = encode_pixels;
15306+}
15307+
15308+static void
15309+stbir__clip(int *outx, int *outsubw, int outw, double *u0, double *u1)
15310+{
15311+	double per, adj;
15312+	int over;
15313+
15314+	// do left/top edge
15315+	if (*outx < 0) {
15316+		per = ((double)*outx) / ((double)*outsubw); // is negative
15317+		adj = per * (*u1 - *u0);
15318+		*u0 -= adj; // increases u0
15319+		*outx = 0;
15320+	}
15321+
15322+	// do right/bot edge
15323+	over = outw - (*outx + *outsubw);
15324+	if (over < 0) {
15325+		per = ((double)over) / ((double)*outsubw); // is negative
15326+		adj = per * (*u1 - *u0);
15327+		*u1 += adj; // decreases u1
15328+		*outsubw = outw - *outx;
15329+	}
15330+}
15331+
15332+// converts a double to a rational that has less than one float bit of error
15333+// (returns 0 if unable to do so)
15334+static int
15335+stbir__double_to_rational(double f, stbir_uint32 limit, stbir_uint32 *numer,
15336+                          stbir_uint32 *denom,
15337+                          int limit_denom) // limit_denom (1) or limit numer (0)
15338+{
15339+	double err;
15340+	stbir_uint64 top, bot;
15341+	stbir_uint64 numer_last = 0;
15342+	stbir_uint64 denom_last = 1;
15343+	stbir_uint64 numer_estimate = 1;
15344+	stbir_uint64 denom_estimate = 0;
15345+
15346+	// scale to past float error range
15347+	top = (stbir_uint64)(f * (double)(1 << 25));
15348+	bot = 1 << 25;
15349+
15350+	// keep refining, but usually stops in a few loops - usually 5 for bad cases
15351+	for (;;) {
15352+		stbir_uint64 est, temp;
15353+
15354+		// hit limit, break out and do best full range estimate
15355+		if (((limit_denom) ? denom_estimate : numer_estimate) >= limit) {
15356+			break;
15357+		}
15358+
15359+		// is the current error less than 1 bit of a float? if so, we're done
15360+		if (denom_estimate) {
15361+			err = ((double)numer_estimate / (double)denom_estimate) - f;
15362+			if (err < 0.0) {
15363+				err = -err;
15364+			}
15365+			if (err < (1.0 / (double)(1 << 24))) {
15366+				// yup, found it
15367+				*numer = (stbir_uint32)numer_estimate;
15368+				*denom = (stbir_uint32)denom_estimate;
15369+				return 1;
15370+			}
15371+		}
15372+
15373+		// no more refinement bits left? break out and do full range estimate
15374+		if (bot == 0) {
15375+			break;
15376+		}
15377+
15378+		// gcd the estimate bits
15379+		est = top / bot;
15380+		temp = top % bot;
15381+		top = bot;
15382+		bot = temp;
15383+
15384+		// move remainders
15385+		temp = est * denom_estimate + denom_last;
15386+		denom_last = denom_estimate;
15387+		denom_estimate = temp;
15388+
15389+		// move remainders
15390+		temp = est * numer_estimate + numer_last;
15391+		numer_last = numer_estimate;
15392+		numer_estimate = temp;
15393+	}
15394+
15395+	// we didn't find anything good enough for float, use a full range estimate
15396+	if (limit_denom) {
15397+		numer_estimate = (stbir_uint64)(f * (double)limit + 0.5);
15398+		denom_estimate = limit;
15399+	} else {
15400+		numer_estimate = limit;
15401+		denom_estimate = (stbir_uint64)(((double)limit / f) + 0.5);
15402+	}
15403+
15404+	*numer = (stbir_uint32)numer_estimate;
15405+	*denom = (stbir_uint32)denom_estimate;
15406+
15407+	err = (denom_estimate) ? (((double)(stbir_uint32)numer_estimate /
15408+	                           (double)(stbir_uint32)denom_estimate) -
15409+	                          f)
15410+	                       : 1.0;
15411+	if (err < 0.0) {
15412+		err = -err;
15413+	}
15414+	return (err < (1.0 / (double)(1 << 24))) ? 1 : 0;
15415+}
15416+
15417+static int
15418+stbir__calculate_region_transform(stbir__scale_info *scale_info,
15419+                                  int output_full_range, int *output_offset,
15420+                                  int output_sub_range, int input_full_range,
15421+                                  double input_s0, double input_s1)
15422+{
15423+	double output_range, input_range, output_s, input_s, ratio, scale;
15424+
15425+	input_s = input_s1 - input_s0;
15426+
15427+	// null area
15428+	if ((output_full_range == 0) || (input_full_range == 0) ||
15429+	    (output_sub_range == 0) || (input_s <= stbir__small_float)) {
15430+		return 0;
15431+	}
15432+
15433+	// are either of the ranges completely out of bounds?
15434+	if ((*output_offset >= output_full_range) ||
15435+	    ((*output_offset + output_sub_range) <= 0) ||
15436+	    (input_s0 >= (1.0f - stbir__small_float)) ||
15437+	    (input_s1 <= stbir__small_float)) {
15438+		return 0;
15439+	}
15440+
15441+	output_range = (double)output_full_range;
15442+	input_range = (double)input_full_range;
15443+
15444+	output_s = ((double)output_sub_range) / output_range;
15445+
15446+	// figure out the scaling to use
15447+	ratio = output_s / input_s;
15448+
15449+	// save scale before clipping
15450+	scale = (output_range / input_range) * ratio;
15451+	scale_info->scale = (float)scale;
15452+	scale_info->inv_scale = (float)(1.0 / scale);
15453+
15454+	// clip output area to left/right output edges (and adjust input area)
15455+	stbir__clip(output_offset, &output_sub_range, output_full_range, &input_s0,
15456+	            &input_s1);
15457+
15458+	// recalc input area
15459+	input_s = input_s1 - input_s0;
15460+
15461+	// after clipping do we have zero input area?
15462+	if (input_s <= stbir__small_float) {
15463+		return 0;
15464+	}
15465+
15466+	// calculate and store the starting source offsets in output pixel space
15467+	scale_info->pixel_shift = (float)(input_s0 * ratio * output_range);
15468+
15469+	scale_info->scale_is_rational = stbir__double_to_rational(
15470+	    scale, (scale <= 1.0) ? output_full_range : input_full_range,
15471+	    &scale_info->scale_numerator, &scale_info->scale_denominator,
15472+	    (scale >= 1.0));
15473+
15474+	scale_info->input_full_size = input_full_range;
15475+	scale_info->output_sub_size = output_sub_range;
15476+
15477+	return 1;
15478+}
15479+
15480+static void
15481+stbir__init_and_set_layout(STBIR_RESIZE *resize,
15482+                           stbir_pixel_layout pixel_layout,
15483+                           stbir_datatype data_type)
15484+{
15485+	resize->input_cb = 0;
15486+	resize->output_cb = 0;
15487+	resize->user_data = resize;
15488+	resize->samplers = 0;
15489+	resize->called_alloc = 0;
15490+	resize->horizontal_filter = STBIR_FILTER_DEFAULT;
15491+	resize->horizontal_filter_kernel = 0;
15492+	resize->horizontal_filter_support = 0;
15493+	resize->vertical_filter = STBIR_FILTER_DEFAULT;
15494+	resize->vertical_filter_kernel = 0;
15495+	resize->vertical_filter_support = 0;
15496+	resize->horizontal_edge = STBIR_EDGE_CLAMP;
15497+	resize->vertical_edge = STBIR_EDGE_CLAMP;
15498+	resize->input_s0 = 0;
15499+	resize->input_t0 = 0;
15500+	resize->input_s1 = 1;
15501+	resize->input_t1 = 1;
15502+	resize->output_subx = 0;
15503+	resize->output_suby = 0;
15504+	resize->output_subw = resize->output_w;
15505+	resize->output_subh = resize->output_h;
15506+	resize->input_data_type = data_type;
15507+	resize->output_data_type = data_type;
15508+	resize->input_pixel_layout_public = pixel_layout;
15509+	resize->output_pixel_layout_public = pixel_layout;
15510+	resize->needs_rebuild = 1;
15511+}
15512+
15513+STBIRDEF void
15514+stbir_resize_init(STBIR_RESIZE *resize, const void *input_pixels, int input_w,
15515+                  int input_h, int input_stride_in_bytes, // stride can be zero
15516+                  void *output_pixels, int output_w, int output_h,
15517+                  int output_stride_in_bytes, // stride can be zero
15518+                  stbir_pixel_layout pixel_layout, stbir_datatype data_type)
15519+{
15520+	resize->input_pixels = input_pixels;
15521+	resize->input_w = input_w;
15522+	resize->input_h = input_h;
15523+	resize->input_stride_in_bytes = input_stride_in_bytes;
15524+	resize->output_pixels = output_pixels;
15525+	resize->output_w = output_w;
15526+	resize->output_h = output_h;
15527+	resize->output_stride_in_bytes = output_stride_in_bytes;
15528+	resize->fast_alpha = 0;
15529+
15530+	stbir__init_and_set_layout(resize, pixel_layout, data_type);
15531 }
15532 
15533 // You can update parameters any time after resize_init
15534-STBIRDEF void stbir_set_datatypes( STBIR_RESIZE * resize, stbir_datatype input_type, stbir_datatype output_type )  // by default, datatype from resize_init
15535+STBIRDEF void
15536+stbir_set_datatypes(
15537+    STBIR_RESIZE *resize, stbir_datatype input_type,
15538+    stbir_datatype output_type) // by default, datatype from resize_init
15539 {
15540-  resize->input_data_type = input_type;
15541-  resize->output_data_type = output_type;
15542-  if ( ( resize->samplers ) && ( !resize->needs_rebuild ) )
15543-    stbir__update_info_from_resize( resize->samplers, resize );
15544+	resize->input_data_type = input_type;
15545+	resize->output_data_type = output_type;
15546+	if ((resize->samplers) && (!resize->needs_rebuild)) {
15547+		stbir__update_info_from_resize(resize->samplers, resize);
15548+	}
15549 }
15550 
15551-STBIRDEF void stbir_set_pixel_callbacks( STBIR_RESIZE * resize, stbir_input_callback * input_cb, stbir_output_callback * output_cb )   // no callbacks by default
15552+STBIRDEF void
15553+stbir_set_pixel_callbacks(
15554+    STBIR_RESIZE *resize, stbir_input_callback *input_cb,
15555+    stbir_output_callback *output_cb) // no callbacks by default
15556 {
15557-  resize->input_cb = input_cb;
15558-  resize->output_cb = output_cb;
15559+	resize->input_cb = input_cb;
15560+	resize->output_cb = output_cb;
15561 
15562-  if ( ( resize->samplers ) && ( !resize->needs_rebuild ) )
15563-  {
15564-    resize->samplers->in_pixels_cb = input_cb;
15565-    resize->samplers->out_pixels_cb = output_cb;
15566-  }
15567+	if ((resize->samplers) && (!resize->needs_rebuild)) {
15568+		resize->samplers->in_pixels_cb = input_cb;
15569+		resize->samplers->out_pixels_cb = output_cb;
15570+	}
15571 }
15572 
15573-STBIRDEF void stbir_set_user_data( STBIR_RESIZE * resize, void * user_data )                                     // pass back STBIR_RESIZE* by default
15574+STBIRDEF void
15575+stbir_set_user_data(STBIR_RESIZE *resize,
15576+                    void *user_data) // pass back STBIR_RESIZE* by default
15577 {
15578-  resize->user_data = user_data;
15579-  if ( ( resize->samplers ) && ( !resize->needs_rebuild ) )
15580-    resize->samplers->user_data = user_data;
15581+	resize->user_data = user_data;
15582+	if ((resize->samplers) && (!resize->needs_rebuild)) {
15583+		resize->samplers->user_data = user_data;
15584+	}
15585 }
15586 
15587-STBIRDEF void stbir_set_buffer_ptrs( STBIR_RESIZE * resize, const void * input_pixels, int input_stride_in_bytes, void * output_pixels, int output_stride_in_bytes )
15588+STBIRDEF void
15589+stbir_set_buffer_ptrs(STBIR_RESIZE *resize, const void *input_pixels,
15590+                      int input_stride_in_bytes, void *output_pixels,
15591+                      int output_stride_in_bytes)
15592 {
15593-  resize->input_pixels = input_pixels;
15594-  resize->input_stride_in_bytes = input_stride_in_bytes;
15595-  resize->output_pixels = output_pixels;
15596-  resize->output_stride_in_bytes = output_stride_in_bytes;
15597-  if ( ( resize->samplers ) && ( !resize->needs_rebuild ) )
15598-    stbir__update_info_from_resize( resize->samplers, resize );
15599+	resize->input_pixels = input_pixels;
15600+	resize->input_stride_in_bytes = input_stride_in_bytes;
15601+	resize->output_pixels = output_pixels;
15602+	resize->output_stride_in_bytes = output_stride_in_bytes;
15603+	if ((resize->samplers) && (!resize->needs_rebuild)) {
15604+		stbir__update_info_from_resize(resize->samplers, resize);
15605+	}
15606 }
15607 
15608-
15609-STBIRDEF int stbir_set_edgemodes( STBIR_RESIZE * resize, stbir_edge horizontal_edge, stbir_edge vertical_edge )       // CLAMP by default
15610-{
15611-  resize->horizontal_edge = horizontal_edge;
15612-  resize->vertical_edge = vertical_edge;
15613-  resize->needs_rebuild = 1;
15614-  return 1;
15615-}
15616-
15617-STBIRDEF int stbir_set_filters( STBIR_RESIZE * resize, stbir_filter horizontal_filter, stbir_filter vertical_filter ) // STBIR_DEFAULT_FILTER_UPSAMPLE/DOWNSAMPLE by default
15618+STBIRDEF int
15619+stbir_set_edgemodes(STBIR_RESIZE *resize, stbir_edge horizontal_edge,
15620+                    stbir_edge vertical_edge) // CLAMP by default
15621 {
15622-  resize->horizontal_filter = horizontal_filter;
15623-  resize->vertical_filter = vertical_filter;
15624-  resize->needs_rebuild = 1;
15625-  return 1;
15626+	resize->horizontal_edge = horizontal_edge;
15627+	resize->vertical_edge = vertical_edge;
15628+	resize->needs_rebuild = 1;
15629+	return 1;
15630 }
15631 
15632-STBIRDEF int stbir_set_filter_callbacks( STBIR_RESIZE * resize, stbir__kernel_callback * horizontal_filter, stbir__support_callback * horizontal_support, stbir__kernel_callback * vertical_filter, stbir__support_callback * vertical_support )
15633+STBIRDEF int
15634+stbir_set_filters(STBIR_RESIZE *resize, stbir_filter horizontal_filter,
15635+                  stbir_filter vertical_filter) // STBIR_DEFAULT_FILTER_UPSAMPLE/DOWNSAMPLE
15636+                                                // by default
15637 {
15638-  resize->horizontal_filter_kernel = horizontal_filter; resize->horizontal_filter_support = horizontal_support;
15639-  resize->vertical_filter_kernel = vertical_filter; resize->vertical_filter_support = vertical_support;
15640-  resize->needs_rebuild = 1;
15641-  return 1;
15642+	resize->horizontal_filter = horizontal_filter;
15643+	resize->vertical_filter = vertical_filter;
15644+	resize->needs_rebuild = 1;
15645+	return 1;
15646 }
15647 
15648-STBIRDEF int stbir_set_pixel_layouts( STBIR_RESIZE * resize, stbir_pixel_layout input_pixel_layout, stbir_pixel_layout output_pixel_layout )   // sets new pixel layouts
15649-{
15650-  resize->input_pixel_layout_public = input_pixel_layout;
15651-  resize->output_pixel_layout_public = output_pixel_layout;
15652-  resize->needs_rebuild = 1;
15653-  return 1;
15654+STBIRDEF int
15655+stbir_set_filter_callbacks(STBIR_RESIZE *resize,
15656+                           stbir__kernel_callback *horizontal_filter,
15657+                           stbir__support_callback *horizontal_support,
15658+                           stbir__kernel_callback *vertical_filter,
15659+                           stbir__support_callback *vertical_support)
15660+{
15661+	resize->horizontal_filter_kernel = horizontal_filter;
15662+	resize->horizontal_filter_support = horizontal_support;
15663+	resize->vertical_filter_kernel = vertical_filter;
15664+	resize->vertical_filter_support = vertical_support;
15665+	resize->needs_rebuild = 1;
15666+	return 1;
15667 }
15668 
15669-
15670-STBIRDEF int stbir_set_non_pm_alpha_speed_over_quality( STBIR_RESIZE * resize, int non_pma_alpha_speed_over_quality )   // sets alpha speed
15671+STBIRDEF int
15672+stbir_set_pixel_layouts(
15673+    STBIR_RESIZE *resize, stbir_pixel_layout input_pixel_layout,
15674+    stbir_pixel_layout output_pixel_layout) // sets new pixel layouts
15675 {
15676-  resize->fast_alpha = non_pma_alpha_speed_over_quality;
15677-  resize->needs_rebuild = 1;
15678-  return 1;
15679+	resize->input_pixel_layout_public = input_pixel_layout;
15680+	resize->output_pixel_layout_public = output_pixel_layout;
15681+	resize->needs_rebuild = 1;
15682+	return 1;
15683 }
15684 
15685-STBIRDEF int stbir_set_input_subrect( STBIR_RESIZE * resize, double s0, double t0, double s1, double t1 )                 // sets input region (full region by default)
15686+STBIRDEF int
15687+stbir_set_non_pm_alpha_speed_over_quality(
15688+    STBIR_RESIZE *resize,
15689+    int non_pma_alpha_speed_over_quality) // sets alpha speed
15690 {
15691-  resize->input_s0 = s0;
15692-  resize->input_t0 = t0;
15693-  resize->input_s1 = s1;
15694-  resize->input_t1 = t1;
15695-  resize->needs_rebuild = 1;
15696-
15697-  // are we inbounds?
15698-  if ( ( s1 < stbir__small_float ) || ( (s1-s0) < stbir__small_float ) ||
15699-       ( t1 < stbir__small_float ) || ( (t1-t0) < stbir__small_float ) ||
15700-       ( s0 > (1.0f-stbir__small_float) ) ||
15701-       ( t0 > (1.0f-stbir__small_float) ) )
15702-    return 0;
15703-
15704-  return 1;
15705+	resize->fast_alpha = non_pma_alpha_speed_over_quality;
15706+	resize->needs_rebuild = 1;
15707+	return 1;
15708 }
15709 
15710-STBIRDEF int stbir_set_output_pixel_subrect( STBIR_RESIZE * resize, int subx, int suby, int subw, int subh )          // sets input region (full region by default)
15711+STBIRDEF int
15712+stbir_set_input_subrect(STBIR_RESIZE *resize, double s0, double t0, double s1,
15713+                        double t1) // sets input region (full region by default)
15714 {
15715-  resize->output_subx = subx;
15716-  resize->output_suby = suby;
15717-  resize->output_subw = subw;
15718-  resize->output_subh = subh;
15719-  resize->needs_rebuild = 1;
15720+	resize->input_s0 = s0;
15721+	resize->input_t0 = t0;
15722+	resize->input_s1 = s1;
15723+	resize->input_t1 = t1;
15724+	resize->needs_rebuild = 1;
15725 
15726-  // are we inbounds?
15727-  if ( ( subx >= resize->output_w ) || ( ( subx + subw ) <= 0 ) || ( suby >= resize->output_h ) || ( ( suby + subh ) <= 0 ) || ( subw == 0 ) || ( subh == 0 ) )
15728-    return 0;
15729+	// are we inbounds?
15730+	if ((s1 < stbir__small_float) || ((s1 - s0) < stbir__small_float) ||
15731+	    (t1 < stbir__small_float) || ((t1 - t0) < stbir__small_float) ||
15732+	    (s0 > (1.0f - stbir__small_float)) ||
15733+	    (t0 > (1.0f - stbir__small_float))) {
15734+		return 0;
15735+	}
15736 
15737-  return 1;
15738-}
15739-
15740-STBIRDEF int stbir_set_pixel_subrect( STBIR_RESIZE * resize, int subx, int suby, int subw, int subh )                 // sets both regions (full regions by default)
15741+	return 1;
15742+}
15743+
15744+STBIRDEF int
15745+stbir_set_output_pixel_subrect(
15746+    STBIR_RESIZE *resize, int subx, int suby, int subw,
15747+    int subh) // sets output region (full region by default)
15748 {
15749-  double s0, t0, s1, t1;
15750-
15751-  s0 = ( (double)subx ) / ( (double)resize->output_w );
15752-  t0 = ( (double)suby ) / ( (double)resize->output_h );
15753-  s1 = ( (double)(subx+subw) ) / ( (double)resize->output_w );
15754-  t1 = ( (double)(suby+subh) ) / ( (double)resize->output_h );
15755-
15756-  resize->input_s0 = s0;
15757-  resize->input_t0 = t0;
15758-  resize->input_s1 = s1;
15759-  resize->input_t1 = t1;
15760-  resize->output_subx = subx;
15761-  resize->output_suby = suby;
15762-  resize->output_subw = subw;
15763-  resize->output_subh = subh;
15764-  resize->needs_rebuild = 1;
15765+	resize->output_subx = subx;
15766+	resize->output_suby = suby;
15767+	resize->output_subw = subw;
15768+	resize->output_subh = subh;
15769+	resize->needs_rebuild = 1;
15770+
15771+	// are we inbounds?
15772+	if ((subx >= resize->output_w) || ((subx + subw) <= 0) ||
15773+	    (suby >= resize->output_h) || ((suby + subh) <= 0) || (subw == 0) ||
15774+	    (subh == 0)) {
15775+		return 0;
15776+	}
15777 
15778-  // are we inbounds?
15779-  if ( ( subx >= resize->output_w ) || ( ( subx + subw ) <= 0 ) || ( suby >= resize->output_h ) || ( ( suby + subh ) <= 0 ) || ( subw == 0 ) || ( subh == 0 ) )
15780-    return 0;
15781-
15782-  return 1;
15783+	return 1;
15784 }
15785 
15786-static int stbir__perform_build( STBIR_RESIZE * resize, int splits )
15787+STBIRDEF int
15788+stbir_set_pixel_subrect(STBIR_RESIZE *resize, int subx, int suby, int subw,
15789+                        int subh) // sets both regions (full regions by default)
15790 {
15791-  stbir__contributors conservative = { 0, 0 };
15792-  stbir__sampler horizontal, vertical;
15793-  int new_output_subx, new_output_suby;
15794-  stbir__info * out_info;
15795-  #ifdef STBIR_PROFILE
15796-  stbir__info profile_infod;  // used to contain building profile info before everything is allocated
15797-  stbir__info * profile_info = &profile_infod;
15798-  #endif
15799-
15800-  // have we already built the samplers?
15801-  if ( resize->samplers )
15802-    return 0;
15803-
15804-  #define STBIR_RETURN_ERROR_AND_ASSERT( exp )  STBIR_ASSERT( !(exp) ); if (exp) return 0;
15805-  STBIR_RETURN_ERROR_AND_ASSERT( (unsigned)resize->horizontal_filter >= STBIR_FILTER_OTHER)
15806-  STBIR_RETURN_ERROR_AND_ASSERT( (unsigned)resize->vertical_filter >= STBIR_FILTER_OTHER)
15807-  #undef STBIR_RETURN_ERROR_AND_ASSERT
15808-
15809-  if ( splits <= 0 )
15810-    return 0;
15811+	double s0, t0, s1, t1;
15812+
15813+	s0 = ((double)subx) / ((double)resize->output_w);
15814+	t0 = ((double)suby) / ((double)resize->output_h);
15815+	s1 = ((double)(subx + subw)) / ((double)resize->output_w);
15816+	t1 = ((double)(suby + subh)) / ((double)resize->output_h);
15817+
15818+	resize->input_s0 = s0;
15819+	resize->input_t0 = t0;
15820+	resize->input_s1 = s1;
15821+	resize->input_t1 = t1;
15822+	resize->output_subx = subx;
15823+	resize->output_suby = suby;
15824+	resize->output_subw = subw;
15825+	resize->output_subh = subh;
15826+	resize->needs_rebuild = 1;
15827 
15828-  STBIR_PROFILE_BUILD_FIRST_START( build );
15829+	// are we inbounds?
15830+	if ((subx >= resize->output_w) || ((subx + subw) <= 0) ||
15831+	    (suby >= resize->output_h) || ((suby + subh) <= 0) || (subw == 0) ||
15832+	    (subh == 0)) {
15833+		return 0;
15834+	}
15835 
15836-  new_output_subx = resize->output_subx;
15837-  new_output_suby = resize->output_suby;
15838-
15839-  // do horizontal clip and scale calcs
15840-  if ( !stbir__calculate_region_transform( &horizontal.scale_info, resize->output_w, &new_output_subx, resize->output_subw, resize->input_w, resize->input_s0, resize->input_s1 ) )
15841-    return 0;
15842-
15843-  // do vertical clip and scale calcs
15844-  if ( !stbir__calculate_region_transform( &vertical.scale_info, resize->output_h, &new_output_suby, resize->output_subh, resize->input_h, resize->input_t0, resize->input_t1 ) )
15845-    return 0;
15846-
15847-  // if nothing to do, just return
15848-  if ( ( horizontal.scale_info.output_sub_size == 0 ) || ( vertical.scale_info.output_sub_size == 0 ) )
15849-    return 0;
15850-
15851-  stbir__set_sampler(&horizontal, resize->horizontal_filter, resize->horizontal_filter_kernel, resize->horizontal_filter_support, resize->horizontal_edge, &horizontal.scale_info, 1, resize->user_data );
15852-  stbir__get_conservative_extents( &horizontal, &conservative, resize->user_data );
15853-  stbir__set_sampler(&vertical, resize->vertical_filter, resize->vertical_filter_kernel, resize->vertical_filter_support, resize->vertical_edge, &vertical.scale_info, 0, resize->user_data );
15854-
15855-  if ( ( vertical.scale_info.output_sub_size / splits ) < STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS ) // each split should be a minimum of 4 scanlines (handwavey choice)
15856-  {
15857-    splits = vertical.scale_info.output_sub_size / STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS;
15858-    if ( splits == 0 ) splits = 1;
15859-  }
15860-
15861-  STBIR_PROFILE_BUILD_START( alloc );
15862-  out_info = stbir__alloc_internal_mem_and_build_samplers( &horizontal, &vertical, &conservative, resize->input_pixel_layout_public, resize->output_pixel_layout_public, splits, new_output_subx, new_output_suby, resize->fast_alpha, resize->user_data STBIR_ONLY_PROFILE_BUILD_SET_INFO );
15863-  STBIR_PROFILE_BUILD_END( alloc );
15864-  STBIR_PROFILE_BUILD_END( build );
15865-
15866-  if ( out_info )
15867-  {
15868-    resize->splits = splits;
15869-    resize->samplers = out_info;
15870-    resize->needs_rebuild = 0;
15871-    #ifdef STBIR_PROFILE
15872-      STBIR_MEMCPY( &out_info->profile, &profile_infod.profile, sizeof( out_info->profile ) );
15873-    #endif
15874-
15875-    // update anything that can be changed without recalcing samplers
15876-    stbir__update_info_from_resize( out_info, resize );
15877-
15878-    return splits;
15879-  }
15880-
15881-  return 0;
15882-}
15883-
15884-void stbir_free_samplers( STBIR_RESIZE * resize )
15885+	return 1;
15886+}
15887+
15888+static int
15889+stbir__perform_build(STBIR_RESIZE *resize, int splits)
15890 {
15891-  if ( resize->samplers )
15892-  {
15893-    stbir__free_internal_mem( resize->samplers );
15894-    resize->samplers = 0;
15895-    resize->called_alloc = 0;
15896-  }
15897-}
15898+	stbir__contributors conservative = {0, 0};
15899+	stbir__sampler horizontal, vertical;
15900+	int new_output_subx, new_output_suby;
15901+	stbir__info *out_info;
15902+#ifdef STBIR_PROFILE
15903+	stbir__info profile_infod; // used to contain building profile info before
15904+	                           // everything is allocated
15905+	stbir__info *profile_info = &profile_infod;
15906+#endif
15907 
15908-STBIRDEF int stbir_build_samplers_with_splits( STBIR_RESIZE * resize, int splits )
15909-{
15910-  if ( ( resize->samplers == 0 ) || ( resize->needs_rebuild ) )
15911-  {
15912-    if ( resize->samplers )
15913-      stbir_free_samplers( resize );
15914+	// have we already built the samplers?
15915+	if (resize->samplers) {
15916+		return 0;
15917+	}
15918+
15919+#define STBIR_RETURN_ERROR_AND_ASSERT(exp)                                     \
15920+	STBIR_ASSERT(!(exp));                                                      \
15921+	if (exp)                                                                   \
15922+		return 0;
15923+	STBIR_RETURN_ERROR_AND_ASSERT((unsigned)resize->horizontal_filter >=
15924+	                              STBIR_FILTER_OTHER)
15925+	STBIR_RETURN_ERROR_AND_ASSERT((unsigned)resize->vertical_filter >=
15926+	                              STBIR_FILTER_OTHER)
15927+#undef STBIR_RETURN_ERROR_AND_ASSERT
15928+
15929+	if (splits <= 0) {
15930+		return 0;
15931+	}
15932+
15933+	STBIR_PROFILE_BUILD_FIRST_START(build);
15934+
15935+	new_output_subx = resize->output_subx;
15936+	new_output_suby = resize->output_suby;
15937+
15938+	// do horizontal clip and scale calcs
15939+	if (!stbir__calculate_region_transform(
15940+	        &horizontal.scale_info, resize->output_w, &new_output_subx,
15941+	        resize->output_subw, resize->input_w, resize->input_s0,
15942+	        resize->input_s1)) {
15943+		return 0;
15944+	}
15945+
15946+	// do vertical clip and scale calcs
15947+	if (!stbir__calculate_region_transform(
15948+	        &vertical.scale_info, resize->output_h, &new_output_suby,
15949+	        resize->output_subh, resize->input_h, resize->input_t0,
15950+	        resize->input_t1)) {
15951+		return 0;
15952+	}
15953+
15954+	// if nothing to do, just return
15955+	if ((horizontal.scale_info.output_sub_size == 0) ||
15956+	    (vertical.scale_info.output_sub_size == 0)) {
15957+		return 0;
15958+	}
15959+
15960+	stbir__set_sampler(
15961+	    &horizontal, resize->horizontal_filter,
15962+	    resize->horizontal_filter_kernel, resize->horizontal_filter_support,
15963+	    resize->horizontal_edge, &horizontal.scale_info, 1, resize->user_data);
15964+	stbir__get_conservative_extents(&horizontal, &conservative,
15965+	                                resize->user_data);
15966+	stbir__set_sampler(&vertical, resize->vertical_filter,
15967+	                   resize->vertical_filter_kernel,
15968+	                   resize->vertical_filter_support, resize->vertical_edge,
15969+	                   &vertical.scale_info, 0, resize->user_data);
15970+
15971+	if ((vertical.scale_info.output_sub_size / splits) <
15972+	    STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS) // each split should be a
15973+	                                              // minimum of 4 scanlines
15974+	                                              // (handwavey choice)
15975+	{
15976+		splits = vertical.scale_info.output_sub_size /
15977+		         STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS;
15978+		if (splits == 0) {
15979+			splits = 1;
15980+		}
15981+	}
15982+
15983+	STBIR_PROFILE_BUILD_START(alloc);
15984+	out_info = stbir__alloc_internal_mem_and_build_samplers(
15985+	    &horizontal, &vertical, &conservative,
15986+	    resize->input_pixel_layout_public, resize->output_pixel_layout_public,
15987+	    splits, new_output_subx, new_output_suby, resize->fast_alpha,
15988+	    resize->user_data STBIR_ONLY_PROFILE_BUILD_SET_INFO);
15989+	STBIR_PROFILE_BUILD_END(alloc);
15990+	STBIR_PROFILE_BUILD_END(build);
15991+
15992+	if (out_info) {
15993+		resize->splits = splits;
15994+		resize->samplers = out_info;
15995+		resize->needs_rebuild = 0;
15996+#ifdef STBIR_PROFILE
15997+		STBIR_MEMCPY(&out_info->profile, &profile_infod.profile,
15998+		             sizeof(out_info->profile));
15999+#endif
16000 
16001-    resize->called_alloc = 1;
16002-    return stbir__perform_build( resize, splits );
16003-  }
16004+		// update anything that can be changed without recalcing samplers
16005+		stbir__update_info_from_resize(out_info, resize);
16006 
16007-  STBIR_PROFILE_BUILD_CLEAR( resize->samplers );
16008+		return splits;
16009+	}
16010 
16011-  return 1;
16012+	return 0;
16013 }
16014 
16015-STBIRDEF int stbir_build_samplers( STBIR_RESIZE * resize )
16016+void
16017+stbir_free_samplers(STBIR_RESIZE *resize)
16018 {
16019-  return stbir_build_samplers_with_splits( resize, 1 );
16020+	if (resize->samplers) {
16021+		stbir__free_internal_mem(resize->samplers);
16022+		resize->samplers = 0;
16023+		resize->called_alloc = 0;
16024+	}
16025 }
16026 
16027-STBIRDEF int stbir_resize_extended( STBIR_RESIZE * resize )
16028+STBIRDEF int
16029+stbir_build_samplers_with_splits(STBIR_RESIZE *resize, int splits)
16030 {
16031-  int result;
16032-
16033-  if ( ( resize->samplers == 0 ) || ( resize->needs_rebuild ) )
16034-  {
16035-    int alloc_state = resize->called_alloc;  // remember allocated state
16036-
16037-    if ( resize->samplers )
16038-    {
16039-      stbir__free_internal_mem( resize->samplers );
16040-      resize->samplers = 0;
16041-    }
16042+	if ((resize->samplers == 0) || (resize->needs_rebuild)) {
16043+		if (resize->samplers) {
16044+			stbir_free_samplers(resize);
16045+		}
16046 
16047-    if ( !stbir_build_samplers( resize ) )
16048-      return 0;
16049+		resize->called_alloc = 1;
16050+		return stbir__perform_build(resize, splits);
16051+	}
16052 
16053-    resize->called_alloc = alloc_state;
16054+	STBIR_PROFILE_BUILD_CLEAR(resize->samplers);
16055 
16056-    // if build_samplers succeeded (above), but there are no samplers set, then
16057-    //   the area to stretch into was zero pixels, so don't do anything and return
16058-    //   success
16059-    if ( resize->samplers == 0 )
16060-      return 1;
16061-  }
16062-  else
16063-  {
16064-    // didn't build anything - clear it
16065-    STBIR_PROFILE_BUILD_CLEAR( resize->samplers );
16066-  }
16067-
16068-  // do resize
16069-  result = stbir__perform_resize( resize->samplers, 0, resize->splits );
16070-
16071-  // if we alloced, then free
16072-  if ( !resize->called_alloc )
16073-  {
16074-    stbir_free_samplers( resize );
16075-    resize->samplers = 0;
16076-  }
16077-
16078-  return result;
16079+	return 1;
16080 }
16081 
16082-STBIRDEF int stbir_resize_extended_split( STBIR_RESIZE * resize, int split_start, int split_count )
16083+STBIRDEF int
16084+stbir_build_samplers(STBIR_RESIZE *resize)
16085 {
16086-  STBIR_ASSERT( resize->samplers );
16087-
16088-  // if we're just doing the whole thing, call full
16089-  if ( ( split_start == -1 ) || ( ( split_start == 0 ) && ( split_count == resize->splits ) ) )
16090-    return stbir_resize_extended( resize );
16091-
16092-  // you **must** build samplers first when using split resize
16093-  if ( ( resize->samplers == 0 ) || ( resize->needs_rebuild ) )
16094-    return 0;
16095-
16096-  if ( ( split_start >= resize->splits ) || ( split_start < 0 ) || ( ( split_start + split_count ) > resize->splits ) || ( split_count <= 0 ) )
16097-    return 0;
16098-
16099-  // do resize
16100-  return stbir__perform_resize( resize->samplers, split_start, split_count );
16101+	return stbir_build_samplers_with_splits(resize, 1);
16102 }
16103 
16104-
16105-static void * stbir_quick_resize_helper( const void *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
16106-                                               void *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
16107-                                               stbir_pixel_layout pixel_layout, stbir_datatype data_type, stbir_edge edge, stbir_filter filter )
16108+STBIRDEF int
16109+stbir_resize_extended(STBIR_RESIZE *resize)
16110 {
16111-  STBIR_RESIZE resize;
16112-  int scanline_output_in_bytes;
16113-  int positive_output_stride_in_bytes;
16114-  void * start_ptr;
16115-  void * free_ptr;
16116-
16117-  scanline_output_in_bytes = output_w * stbir__type_size[ data_type ] * stbir__pixel_channels[ stbir__pixel_layout_convert_public_to_internal[ pixel_layout ] ];
16118-  if ( scanline_output_in_bytes == 0 )
16119-    return 0;
16120-
16121-  // if zero stride, use scanline output
16122-  if ( output_stride_in_bytes == 0 )
16123-    output_stride_in_bytes = scanline_output_in_bytes;
16124-
16125-  // abs value for inverted images (negative pitches)
16126-  positive_output_stride_in_bytes = output_stride_in_bytes;
16127-  if ( positive_output_stride_in_bytes < 0 )
16128-    positive_output_stride_in_bytes = -positive_output_stride_in_bytes;
16129-
16130-  // is the requested stride smaller than the scanline output? if so, just fail
16131-  if ( positive_output_stride_in_bytes < scanline_output_in_bytes )
16132-    return 0;
16133+	int result;
16134 
16135-  start_ptr = output_pixels;
16136-  free_ptr = 0;  // no free pointer, since they passed buffer to use
16137+	if ((resize->samplers == 0) || (resize->needs_rebuild)) {
16138+		int alloc_state = resize->called_alloc; // remember allocated state
16139 
16140-  // did they pass a zero for the dest? if so, allocate the buffer
16141-  if ( output_pixels == 0 )
16142-  {
16143-    size_t size;
16144-    char * ptr;
16145-  
16146-    size = (size_t)positive_output_stride_in_bytes * (size_t)output_h;
16147-    if ( size == 0 )
16148-      return 0;
16149+		if (resize->samplers) {
16150+			stbir__free_internal_mem(resize->samplers);
16151+			resize->samplers = 0;
16152+		}
16153 
16154-    ptr = (char*) STBIR_MALLOC( size, 0 );
16155-    if ( ptr == 0 )
16156-      return 0;
16157+		if (!stbir_build_samplers(resize)) {
16158+			return 0;
16159+		}
16160 
16161-    free_ptr = ptr;
16162+		resize->called_alloc = alloc_state;
16163 
16164-    // point at the last scanline, if they requested a flipped image
16165-    if ( output_stride_in_bytes < 0 )
16166-      start_ptr = ptr + ( (size_t)positive_output_stride_in_bytes * (size_t)( output_h - 1 ) );
16167-    else
16168-      start_ptr = ptr;
16169-  }
16170+		// if build_samplers succeeded (above), but there are no samplers set,
16171+		// then
16172+		//   the area to stretch into was zero pixels, so don't do anything and
16173+		//   return success
16174+		if (resize->samplers == 0) {
16175+			return 1;
16176+		}
16177+	} else {
16178+		// didn't build anything - clear it
16179+		STBIR_PROFILE_BUILD_CLEAR(resize->samplers);
16180+	}
16181 
16182-  // ok, now do the resize
16183-  stbir_resize_init( &resize,
16184-                     input_pixels,  input_w,  input_h,  input_stride_in_bytes,
16185-                     start_ptr, output_w, output_h, output_stride_in_bytes,
16186-                     pixel_layout, data_type );
16187+	// do resize
16188+	result = stbir__perform_resize(resize->samplers, 0, resize->splits);
16189 
16190-  resize.horizontal_edge = edge;
16191-  resize.vertical_edge = edge;
16192-  resize.horizontal_filter = filter;
16193-  resize.vertical_filter = filter;
16194+	// if we alloced, then free
16195+	if (!resize->called_alloc) {
16196+		stbir_free_samplers(resize);
16197+		resize->samplers = 0;
16198+	}
16199 
16200-  if ( !stbir_resize_extended( &resize ) )
16201-  {
16202-    if ( free_ptr )
16203-      STBIR_FREE( free_ptr, 0 );
16204-    return 0;
16205-  }
16206-
16207-  return (free_ptr) ? free_ptr : start_ptr;
16208-}
16209-
16210-
16211-
16212-STBIRDEF unsigned char * stbir_resize_uint8_linear( const unsigned char *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
16213-                                                          unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
16214-                                                          stbir_pixel_layout pixel_layout )
16215-{
16216-  return (unsigned char *) stbir_quick_resize_helper( input_pixels , input_w , input_h, input_stride_in_bytes, 
16217-                                                      output_pixels, output_w, output_h, output_stride_in_bytes, 
16218-                                                      pixel_layout, STBIR_TYPE_UINT8, STBIR_EDGE_CLAMP, STBIR_FILTER_DEFAULT );
16219+	return result;
16220 }
16221 
16222-STBIRDEF unsigned char * stbir_resize_uint8_srgb( const unsigned char *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
16223-                                                        unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
16224-                                                        stbir_pixel_layout pixel_layout )
16225+STBIRDEF int
16226+stbir_resize_extended_split(STBIR_RESIZE *resize, int split_start,
16227+                            int split_count)
16228 {
16229-  return (unsigned char *) stbir_quick_resize_helper( input_pixels , input_w , input_h, input_stride_in_bytes, 
16230-                                                      output_pixels, output_w, output_h, output_stride_in_bytes, 
16231-                                                      pixel_layout, STBIR_TYPE_UINT8_SRGB, STBIR_EDGE_CLAMP, STBIR_FILTER_DEFAULT );
16232-}
16233+	STBIR_ASSERT(resize->samplers);
16234 
16235+	// if we're just doing the whole thing, call full
16236+	if ((split_start == -1) ||
16237+	    ((split_start == 0) && (split_count == resize->splits))) {
16238+		return stbir_resize_extended(resize);
16239+	}
16240 
16241-STBIRDEF float * stbir_resize_float_linear( const float *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
16242-                                                  float *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
16243-                                                  stbir_pixel_layout pixel_layout )
16244-{
16245-  return (float *) stbir_quick_resize_helper( input_pixels , input_w , input_h, input_stride_in_bytes, 
16246-                                              output_pixels, output_w, output_h, output_stride_in_bytes, 
16247-                                              pixel_layout, STBIR_TYPE_FLOAT, STBIR_EDGE_CLAMP, STBIR_FILTER_DEFAULT  );
16248-}
16249+	// you **must** build samplers first when using split resize
16250+	if ((resize->samplers == 0) || (resize->needs_rebuild)) {
16251+		return 0;
16252+	}
16253 
16254+	if ((split_start >= resize->splits) || (split_start < 0) ||
16255+	    ((split_start + split_count) > resize->splits) || (split_count <= 0)) {
16256+		return 0;
16257+	}
16258 
16259-STBIRDEF void * stbir_resize( const void *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
16260-                                    void *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
16261-                                    stbir_pixel_layout pixel_layout, stbir_datatype data_type,
16262-                                    stbir_edge edge, stbir_filter filter )
16263-{
16264-  return (void *) stbir_quick_resize_helper( input_pixels , input_w , input_h, input_stride_in_bytes, 
16265-                                             output_pixels, output_w, output_h, output_stride_in_bytes, 
16266-                                             pixel_layout, data_type, edge, filter  );
16267+	// do resize
16268+	return stbir__perform_resize(resize->samplers, split_start, split_count);
16269+}
16270+
16271+static void *
16272+stbir_quick_resize_helper(const void *input_pixels, int input_w, int input_h,
16273+                          int input_stride_in_bytes, void *output_pixels,
16274+                          int output_w, int output_h,
16275+                          int output_stride_in_bytes,
16276+                          stbir_pixel_layout pixel_layout,
16277+                          stbir_datatype data_type, stbir_edge edge,
16278+                          stbir_filter filter)
16279+{
16280+	STBIR_RESIZE resize;
16281+	int scanline_output_in_bytes;
16282+	int positive_output_stride_in_bytes;
16283+	void *start_ptr;
16284+	void *free_ptr;
16285+
16286+	scanline_output_in_bytes =
16287+	    output_w * stbir__type_size[data_type] *
16288+	    stbir__pixel_channels
16289+	        [stbir__pixel_layout_convert_public_to_internal[pixel_layout]];
16290+	if (scanline_output_in_bytes == 0) {
16291+		return 0;
16292+	}
16293+
16294+	// if zero stride, use scanline output
16295+	if (output_stride_in_bytes == 0) {
16296+		output_stride_in_bytes = scanline_output_in_bytes;
16297+	}
16298+
16299+	// abs value for inverted images (negative pitches)
16300+	positive_output_stride_in_bytes = output_stride_in_bytes;
16301+	if (positive_output_stride_in_bytes < 0) {
16302+		positive_output_stride_in_bytes = -positive_output_stride_in_bytes;
16303+	}
16304+
16305+	// is the requested stride smaller than the scanline output? if so, just
16306+	// fail
16307+	if (positive_output_stride_in_bytes < scanline_output_in_bytes) {
16308+		return 0;
16309+	}
16310+
16311+	start_ptr = output_pixels;
16312+	free_ptr = 0; // no free pointer, since they passed buffer to use
16313+
16314+	// did they pass a zero for the dest? if so, allocate the buffer
16315+	if (output_pixels == 0) {
16316+		size_t size;
16317+		char *ptr;
16318+
16319+		size = (size_t)positive_output_stride_in_bytes * (size_t)output_h;
16320+		if (size == 0) {
16321+			return 0;
16322+		}
16323+
16324+		ptr = (char *)STBIR_MALLOC(size, 0);
16325+		if (ptr == 0) {
16326+			return 0;
16327+		}
16328+
16329+		free_ptr = ptr;
16330+
16331+		// point at the last scanline, if they requested a flipped image
16332+		if (output_stride_in_bytes < 0) {
16333+			start_ptr = ptr + ((size_t)positive_output_stride_in_bytes *
16334+			                   (size_t)(output_h - 1));
16335+		} else {
16336+			start_ptr = ptr;
16337+		}
16338+	}
16339+
16340+	// ok, now do the resize
16341+	stbir_resize_init(&resize, input_pixels, input_w, input_h,
16342+	                  input_stride_in_bytes, start_ptr, output_w, output_h,
16343+	                  output_stride_in_bytes, pixel_layout, data_type);
16344+
16345+	resize.horizontal_edge = edge;
16346+	resize.vertical_edge = edge;
16347+	resize.horizontal_filter = filter;
16348+	resize.vertical_filter = filter;
16349+
16350+	if (!stbir_resize_extended(&resize)) {
16351+		if (free_ptr) {
16352+			STBIR_FREE(free_ptr, 0);
16353+		}
16354+		return 0;
16355+	}
16356+
16357+	return (free_ptr) ? free_ptr : start_ptr;
16358+}
16359+
16360+STBIRDEF unsigned char *
16361+stbir_resize_uint8_linear(const unsigned char *input_pixels, int input_w,
16362+                          int input_h, int input_stride_in_bytes,
16363+                          unsigned char *output_pixels, int output_w,
16364+                          int output_h, int output_stride_in_bytes,
16365+                          stbir_pixel_layout pixel_layout)
16366+{
16367+	return (unsigned char *)stbir_quick_resize_helper(
16368+	    input_pixels, input_w, input_h, input_stride_in_bytes, output_pixels,
16369+	    output_w, output_h, output_stride_in_bytes, pixel_layout,
16370+	    STBIR_TYPE_UINT8, STBIR_EDGE_CLAMP, STBIR_FILTER_DEFAULT);
16371+}
16372+
16373+STBIRDEF unsigned char *
16374+stbir_resize_uint8_srgb(const unsigned char *input_pixels, int input_w,
16375+                        int input_h, int input_stride_in_bytes,
16376+                        unsigned char *output_pixels, int output_w,
16377+                        int output_h, int output_stride_in_bytes,
16378+                        stbir_pixel_layout pixel_layout)
16379+{
16380+	return (unsigned char *)stbir_quick_resize_helper(
16381+	    input_pixels, input_w, input_h, input_stride_in_bytes, output_pixels,
16382+	    output_w, output_h, output_stride_in_bytes, pixel_layout,
16383+	    STBIR_TYPE_UINT8_SRGB, STBIR_EDGE_CLAMP, STBIR_FILTER_DEFAULT);
16384+}
16385+
16386+STBIRDEF float *
16387+stbir_resize_float_linear(const float *input_pixels, int input_w, int input_h,
16388+                          int input_stride_in_bytes, float *output_pixels,
16389+                          int output_w, int output_h,
16390+                          int output_stride_in_bytes,
16391+                          stbir_pixel_layout pixel_layout)
16392+{
16393+	return (float *)stbir_quick_resize_helper(
16394+	    input_pixels, input_w, input_h, input_stride_in_bytes, output_pixels,
16395+	    output_w, output_h, output_stride_in_bytes, pixel_layout,
16396+	    STBIR_TYPE_FLOAT, STBIR_EDGE_CLAMP, STBIR_FILTER_DEFAULT);
16397+}
16398+
16399+STBIRDEF void *
16400+stbir_resize(const void *input_pixels, int input_w, int input_h,
16401+             int input_stride_in_bytes, void *output_pixels, int output_w,
16402+             int output_h, int output_stride_in_bytes,
16403+             stbir_pixel_layout pixel_layout, stbir_datatype data_type,
16404+             stbir_edge edge, stbir_filter filter)
16405+{
16406+	return (void *)stbir_quick_resize_helper(
16407+	    input_pixels, input_w, input_h, input_stride_in_bytes, output_pixels,
16408+	    output_w, output_h, output_stride_in_bytes, pixel_layout, data_type,
16409+	    edge, filter);
16410 }
16411 
16412 #ifdef STBIR_PROFILE
16413 
16414-STBIRDEF void stbir_resize_build_profile_info( STBIR_PROFILE_INFO * info, STBIR_RESIZE const * resize )
16415-{
16416-  static char const * bdescriptions[6] = { "Building", "Allocating", "Horizontal sampler", "Vertical sampler", "Coefficient cleanup", "Coefficient piovot" } ;
16417-  stbir__info* samp = resize->samplers;
16418-  int i;
16419-
16420-  typedef int testa[ (STBIR__ARRAY_SIZE( bdescriptions ) == (STBIR__ARRAY_SIZE( samp->profile.array )-1) )?1:-1];
16421-  typedef int testb[ (sizeof( samp->profile.array ) == (sizeof(samp->profile.named)) )?1:-1];
16422-  typedef int testc[ (sizeof( info->clocks ) >= (sizeof(samp->profile.named)) )?1:-1];
16423-
16424-  for( i = 0 ; i < STBIR__ARRAY_SIZE( bdescriptions ) ; i++)
16425-    info->clocks[i] = samp->profile.array[i+1];
16426-
16427-  info->total_clocks = samp->profile.named.total;
16428-  info->descriptions = bdescriptions;
16429-  info->count = STBIR__ARRAY_SIZE( bdescriptions );
16430-}
16431-
16432-STBIRDEF void stbir_resize_split_profile_info( STBIR_PROFILE_INFO * info, STBIR_RESIZE const * resize, int split_start, int split_count )
16433-{
16434-  static char const * descriptions[7] = { "Looping", "Vertical sampling", "Horizontal sampling", "Scanline input", "Scanline output", "Alpha weighting", "Alpha unweighting" };
16435-  stbir__per_split_info * split_info;
16436-  int s, i;
16437-
16438-  typedef int testa[ (STBIR__ARRAY_SIZE( descriptions ) == (STBIR__ARRAY_SIZE( split_info->profile.array )-1) )?1:-1];
16439-  typedef int testb[ (sizeof( split_info->profile.array ) == (sizeof(split_info->profile.named)) )?1:-1];
16440-  typedef int testc[ (sizeof( info->clocks ) >= (sizeof(split_info->profile.named)) )?1:-1];
16441-
16442-  if ( split_start == -1 )
16443-  {
16444-    split_start = 0;
16445-    split_count = resize->samplers->splits;
16446-  }
16447-
16448-  if ( ( split_start >= resize->splits ) || ( split_start < 0 ) || ( ( split_start + split_count ) > resize->splits ) || ( split_count <= 0 ) )
16449-  {
16450-    info->total_clocks = 0;
16451-    info->descriptions = 0;
16452-    info->count = 0;
16453-    return;
16454-  }
16455-
16456-  split_info = resize->samplers->split_info + split_start;
16457-
16458-  // sum up the profile from all the splits
16459-  for( i = 0 ; i < STBIR__ARRAY_SIZE( descriptions ) ; i++ )
16460-  {
16461-    stbir_uint64 sum = 0;
16462-    for( s = 0 ; s < split_count ; s++ )
16463-      sum += split_info[s].profile.array[i+1];
16464-    info->clocks[i] = sum;
16465-  }
16466-
16467-  info->total_clocks = split_info->profile.named.total;
16468-  info->descriptions = descriptions;
16469-  info->count = STBIR__ARRAY_SIZE( descriptions );
16470-}
16471-
16472-STBIRDEF void stbir_resize_extended_profile_info( STBIR_PROFILE_INFO * info, STBIR_RESIZE const * resize )
16473-{
16474-  stbir_resize_split_profile_info( info, resize, -1, 0 );
16475+STBIRDEF void
16476+stbir_resize_build_profile_info(STBIR_PROFILE_INFO *info,
16477+                                STBIR_RESIZE const *resize)
16478+{
16479+	static char const *bdescriptions[6] = {
16480+	    "Building",         "Allocating",          "Horizontal sampler",
16481+	    "Vertical sampler", "Coefficient cleanup", "Coefficient piovot"};
16482+	stbir__info *samp = resize->samplers;
16483+	int i;
16484+
16485+	typedef int testa[(STBIR__ARRAY_SIZE(bdescriptions) ==
16486+	                   (STBIR__ARRAY_SIZE(samp->profile.array) - 1))
16487+	                      ? 1
16488+	                      : -1];
16489+	typedef int
16490+	    testb[(sizeof(samp->profile.array) == (sizeof(samp->profile.named)))
16491+	              ? 1
16492+	              : -1];
16493+	typedef int
16494+	    testc[(sizeof(info->clocks) >= (sizeof(samp->profile.named))) ? 1 : -1];
16495+
16496+	for (i = 0; i < STBIR__ARRAY_SIZE(bdescriptions); i++) {
16497+		info->clocks[i] = samp->profile.array[i + 1];
16498+	}
16499+
16500+	info->total_clocks = samp->profile.named.total;
16501+	info->descriptions = bdescriptions;
16502+	info->count = STBIR__ARRAY_SIZE(bdescriptions);
16503+}
16504+
16505+STBIRDEF void
16506+stbir_resize_split_profile_info(STBIR_PROFILE_INFO *info,
16507+                                STBIR_RESIZE const *resize, int split_start,
16508+                                int split_count)
16509+{
16510+	static char const *descriptions[7] = {
16511+	    "Looping",          "Vertical sampling", "Horizontal sampling",
16512+	    "Scanline input",   "Scanline output",   "Alpha weighting",
16513+	    "Alpha unweighting"};
16514+	stbir__per_split_info *split_info;
16515+	int s, i;
16516+
16517+	typedef int testa[(STBIR__ARRAY_SIZE(descriptions) ==
16518+	                   (STBIR__ARRAY_SIZE(split_info->profile.array) - 1))
16519+	                      ? 1
16520+	                      : -1];
16521+	typedef int testb[(sizeof(split_info->profile.array) ==
16522+	                   (sizeof(split_info->profile.named)))
16523+	                      ? 1
16524+	                      : -1];
16525+	typedef int
16526+	    testc[(sizeof(info->clocks) >= (sizeof(split_info->profile.named)))
16527+	              ? 1
16528+	              : -1];
16529+
16530+	if (split_start == -1) {
16531+		split_start = 0;
16532+		split_count = resize->samplers->splits;
16533+	}
16534+
16535+	if ((split_start >= resize->splits) || (split_start < 0) ||
16536+	    ((split_start + split_count) > resize->splits) || (split_count <= 0)) {
16537+		info->total_clocks = 0;
16538+		info->descriptions = 0;
16539+		info->count = 0;
16540+		return;
16541+	}
16542+
16543+	split_info = resize->samplers->split_info + split_start;
16544+
16545+	// sum up the profile from all the splits
16546+	for (i = 0; i < STBIR__ARRAY_SIZE(descriptions); i++) {
16547+		stbir_uint64 sum = 0;
16548+		for (s = 0; s < split_count; s++) {
16549+			sum += split_info[s].profile.array[i + 1];
16550+		}
16551+		info->clocks[i] = sum;
16552+	}
16553+
16554+	info->total_clocks = split_info->profile.named.total;
16555+	info->descriptions = descriptions;
16556+	info->count = STBIR__ARRAY_SIZE(descriptions);
16557+}
16558+
16559+STBIRDEF void
16560+stbir_resize_extended_profile_info(STBIR_PROFILE_INFO *info,
16561+                                   STBIR_RESIZE const *resize)
16562+{
16563+	stbir_resize_split_profile_info(info, resize, -1, 0);
16564 }
16565 
16566 #endif // STBIR_PROFILE
16567@@ -8215,32 +9828,58 @@ STBIRDEF void stbir_resize_extended_profile_info( STBIR_PROFILE_INFO * info, STB
16568 
16569 #endif // STB_IMAGE_RESIZE_IMPLEMENTATION
16570 
16571-#else  // STB_IMAGE_RESIZE_HORIZONTALS&STB_IMAGE_RESIZE_DO_VERTICALS
16572+#else // STB_IMAGE_RESIZE_HORIZONTALS&STB_IMAGE_RESIZE_DO_VERTICALS
16573 
16574 // we reinclude the header file to define all the horizontal functions
16575-//   specializing each function for the number of coeffs is 20-40% faster *OVERALL*
16576+//   specializing each function for the number of coeffs is 20-40% faster
16577+//   *OVERALL*
16578 
16579 // by including the header file again this way, we can still debug the functions
16580 
16581-#define STBIR_strs_join2( start, mid, end ) start##mid##end
16582-#define STBIR_strs_join1( start, mid, end ) STBIR_strs_join2( start, mid, end )
16583+#define STBIR_strs_join2(start, mid, end) start##mid##end
16584+#define STBIR_strs_join1(start, mid, end) STBIR_strs_join2(start, mid, end)
16585 
16586-#define STBIR_strs_join24( start, mid1, mid2, end ) start##mid1##mid2##end
16587-#define STBIR_strs_join14( start, mid1, mid2, end ) STBIR_strs_join24( start, mid1, mid2, end )
16588+#define STBIR_strs_join24(start, mid1, mid2, end) start##mid1##mid2##end
16589+#define STBIR_strs_join14(start, mid1, mid2, end)                              \
16590+	STBIR_strs_join24(start, mid1, mid2, end)
16591 
16592 #ifdef STB_IMAGE_RESIZE_DO_CODERS
16593 
16594 #ifdef stbir__decode_suffix
16595-#define STBIR__CODER_NAME( name ) STBIR_strs_join1( name, _, stbir__decode_suffix )
16596+#define STBIR__CODER_NAME(name) STBIR_strs_join1(name, _, stbir__decode_suffix)
16597 #else
16598-#define STBIR__CODER_NAME( name ) name
16599+#define STBIR__CODER_NAME(name) name
16600 #endif
16601 
16602 #ifdef stbir__decode_swizzle
16603-#define stbir__decode_simdf8_flip(reg) STBIR_strs_join1( STBIR_strs_join1( STBIR_strs_join1( STBIR_strs_join1( stbir__simdf8_0123to,stbir__decode_order0,stbir__decode_order1),stbir__decode_order2,stbir__decode_order3),stbir__decode_order0,stbir__decode_order1),stbir__decode_order2,stbir__decode_order3)(reg, reg)
16604-#define stbir__decode_simdf4_flip(reg) STBIR_strs_join1( STBIR_strs_join1( stbir__simdf_0123to,stbir__decode_order0,stbir__decode_order1),stbir__decode_order2,stbir__decode_order3)(reg, reg)
16605-#define stbir__encode_simdf8_unflip(reg) STBIR_strs_join1( STBIR_strs_join1( STBIR_strs_join1( STBIR_strs_join1( stbir__simdf8_0123to,stbir__encode_order0,stbir__encode_order1),stbir__encode_order2,stbir__encode_order3),stbir__encode_order0,stbir__encode_order1),stbir__encode_order2,stbir__encode_order3)(reg, reg)
16606-#define stbir__encode_simdf4_unflip(reg) STBIR_strs_join1( STBIR_strs_join1( stbir__simdf_0123to,stbir__encode_order0,stbir__encode_order1),stbir__encode_order2,stbir__encode_order3)(reg, reg)
16607+#define stbir__decode_simdf8_flip(reg)                                         \
16608+	STBIR_strs_join1(                                                          \
16609+	    STBIR_strs_join1(                                                      \
16610+	        STBIR_strs_join1(STBIR_strs_join1(stbir__simdf8_0123to,            \
16611+	                                          stbir__decode_order0,            \
16612+	                                          stbir__decode_order1),           \
16613+	                         stbir__decode_order2, stbir__decode_order3),      \
16614+	        stbir__decode_order0, stbir__decode_order1),                       \
16615+	    stbir__decode_order2, stbir__decode_order3)(reg, reg)
16616+#define stbir__decode_simdf4_flip(reg)                                         \
16617+	STBIR_strs_join1(STBIR_strs_join1(stbir__simdf_0123to,                     \
16618+	                                  stbir__decode_order0,                    \
16619+	                                  stbir__decode_order1),                   \
16620+	                 stbir__decode_order2, stbir__decode_order3)(reg, reg)
16621+#define stbir__encode_simdf8_unflip(reg)                                       \
16622+	STBIR_strs_join1(                                                          \
16623+	    STBIR_strs_join1(                                                      \
16624+	        STBIR_strs_join1(STBIR_strs_join1(stbir__simdf8_0123to,            \
16625+	                                          stbir__encode_order0,            \
16626+	                                          stbir__encode_order1),           \
16627+	                         stbir__encode_order2, stbir__encode_order3),      \
16628+	        stbir__encode_order0, stbir__encode_order1),                       \
16629+	    stbir__encode_order2, stbir__encode_order3)(reg, reg)
16630+#define stbir__encode_simdf4_unflip(reg)                                       \
16631+	STBIR_strs_join1(STBIR_strs_join1(stbir__simdf_0123to,                     \
16632+	                                  stbir__encode_order0,                    \
16633+	                                  stbir__encode_order1),                   \
16634+	                 stbir__encode_order2, stbir__encode_order3)(reg, reg)
16635 #else
16636 #define stbir__decode_order0 0
16637 #define stbir__decode_order1 1
16638@@ -8257,1589 +9896,1817 @@ STBIRDEF void stbir_resize_extended_profile_info( STBIR_PROFILE_INFO * info, STB
16639 #endif
16640 
16641 #ifdef STBIR_SIMD8
16642-#define stbir__encode_simdfX_unflip  stbir__encode_simdf8_unflip
16643+#define stbir__encode_simdfX_unflip stbir__encode_simdf8_unflip
16644 #else
16645-#define stbir__encode_simdfX_unflip  stbir__encode_simdf4_unflip
16646-#endif
16647-
16648-static float * STBIR__CODER_NAME( stbir__decode_uint8_linear_scaled )( float * decodep, int width_times_channels, void const * inputp )
16649-{
16650-  float STBIR_STREAMOUT_PTR( * ) decode = decodep;
16651-  float * decode_end = (float*) decode + width_times_channels;
16652-  unsigned char const * input = (unsigned char const*)inputp;
16653-
16654-  #ifdef STBIR_SIMD
16655-  unsigned char const * end_input_m16 = input + width_times_channels - 16;
16656-  if ( width_times_channels >= 16 )
16657-  {
16658-    decode_end -= 16;
16659-    STBIR_NO_UNROLL_LOOP_START_INF_FOR
16660-    for(;;)
16661-    {
16662-      #ifdef STBIR_SIMD8
16663-      stbir__simdi i; stbir__simdi8 o0,o1;
16664-      stbir__simdf8 of0, of1;
16665-      STBIR_NO_UNROLL(decode);
16666-      stbir__simdi_load( i, input );
16667-      stbir__simdi8_expand_u8_to_u32( o0, o1, i );
16668-      stbir__simdi8_convert_i32_to_float( of0, o0 );
16669-      stbir__simdi8_convert_i32_to_float( of1, o1 );
16670-      stbir__simdf8_mult( of0, of0, STBIR_max_uint8_as_float_inverted8);
16671-      stbir__simdf8_mult( of1, of1, STBIR_max_uint8_as_float_inverted8);
16672-      stbir__decode_simdf8_flip( of0 );
16673-      stbir__decode_simdf8_flip( of1 );
16674-      stbir__simdf8_store( decode + 0, of0 );
16675-      stbir__simdf8_store( decode + 8, of1 );
16676-      #else
16677-      stbir__simdi i, o0, o1, o2, o3;
16678-      stbir__simdf of0, of1, of2, of3;
16679-      STBIR_NO_UNROLL(decode);
16680-      stbir__simdi_load( i, input );
16681-      stbir__simdi_expand_u8_to_u32( o0,o1,o2,o3,i);
16682-      stbir__simdi_convert_i32_to_float( of0, o0 );
16683-      stbir__simdi_convert_i32_to_float( of1, o1 );
16684-      stbir__simdi_convert_i32_to_float( of2, o2 );
16685-      stbir__simdi_convert_i32_to_float( of3, o3 );
16686-      stbir__simdf_mult( of0, of0, STBIR__CONSTF(STBIR_max_uint8_as_float_inverted) );
16687-      stbir__simdf_mult( of1, of1, STBIR__CONSTF(STBIR_max_uint8_as_float_inverted) );
16688-      stbir__simdf_mult( of2, of2, STBIR__CONSTF(STBIR_max_uint8_as_float_inverted) );
16689-      stbir__simdf_mult( of3, of3, STBIR__CONSTF(STBIR_max_uint8_as_float_inverted) );
16690-      stbir__decode_simdf4_flip( of0 );
16691-      stbir__decode_simdf4_flip( of1 );
16692-      stbir__decode_simdf4_flip( of2 );
16693-      stbir__decode_simdf4_flip( of3 );
16694-      stbir__simdf_store( decode + 0,  of0 );
16695-      stbir__simdf_store( decode + 4,  of1 );
16696-      stbir__simdf_store( decode + 8,  of2 );
16697-      stbir__simdf_store( decode + 12, of3 );
16698-      #endif
16699-      decode += 16;
16700-      input += 16;
16701-      if ( decode <= decode_end )
16702-        continue;
16703-      if ( decode == ( decode_end + 16 ) )
16704-        break;
16705-      decode = decode_end; // backup and do last couple
16706-      input = end_input_m16;
16707-    }
16708-    return decode_end + 16;
16709-  }
16710-  #endif
16711-
16712-  // try to do blocks of 4 when you can
16713-  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
16714-  decode += 4;
16715-  STBIR_SIMD_NO_UNROLL_LOOP_START
16716-  while( decode <= decode_end )
16717-  {
16718-    STBIR_SIMD_NO_UNROLL(decode);
16719-    decode[0-4] = ((float)(input[stbir__decode_order0])) * stbir__max_uint8_as_float_inverted;
16720-    decode[1-4] = ((float)(input[stbir__decode_order1])) * stbir__max_uint8_as_float_inverted;
16721-    decode[2-4] = ((float)(input[stbir__decode_order2])) * stbir__max_uint8_as_float_inverted;
16722-    decode[3-4] = ((float)(input[stbir__decode_order3])) * stbir__max_uint8_as_float_inverted;
16723-    decode += 4;
16724-    input += 4;
16725-  }
16726-  decode -= 4;
16727-  #endif
16728-
16729-  // do the remnants
16730-  #if stbir__coder_min_num < 4
16731-  STBIR_NO_UNROLL_LOOP_START
16732-  while( decode < decode_end )
16733-  {
16734-    STBIR_NO_UNROLL(decode);
16735-    decode[0] = ((float)(input[stbir__decode_order0])) * stbir__max_uint8_as_float_inverted;
16736-    #if stbir__coder_min_num >= 2
16737-    decode[1] = ((float)(input[stbir__decode_order1])) * stbir__max_uint8_as_float_inverted;
16738-    #endif
16739-    #if stbir__coder_min_num >= 3
16740-    decode[2] = ((float)(input[stbir__decode_order2])) * stbir__max_uint8_as_float_inverted;
16741-    #endif
16742-    decode += stbir__coder_min_num;
16743-    input += stbir__coder_min_num;
16744-  }
16745-  #endif
16746-
16747-  return decode_end;
16748-}
16749-
16750-static void STBIR__CODER_NAME( stbir__encode_uint8_linear_scaled )( void * outputp, int width_times_channels, float const * encode )
16751-{
16752-  unsigned char STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned char *) outputp;
16753-  unsigned char * end_output = ( (unsigned char *) output ) + width_times_channels;
16754-
16755-  #ifdef STBIR_SIMD
16756-  if ( width_times_channels >= stbir__simdfX_float_count*2 )
16757-  {
16758-    float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2;
16759-    end_output -= stbir__simdfX_float_count*2;
16760-    STBIR_NO_UNROLL_LOOP_START_INF_FOR
16761-    for(;;)
16762-    {
16763-      stbir__simdfX e0, e1;
16764-      stbir__simdi i;
16765-      STBIR_SIMD_NO_UNROLL(encode);
16766-      stbir__simdfX_madd_mem( e0, STBIR_simd_point5X, STBIR_max_uint8_as_floatX, encode );
16767-      stbir__simdfX_madd_mem( e1, STBIR_simd_point5X, STBIR_max_uint8_as_floatX, encode+stbir__simdfX_float_count );
16768-      stbir__encode_simdfX_unflip( e0 );
16769-      stbir__encode_simdfX_unflip( e1 );
16770-      #ifdef STBIR_SIMD8
16771-      stbir__simdf8_pack_to_16bytes( i, e0, e1 );
16772-      stbir__simdi_store( output, i );
16773-      #else
16774-      stbir__simdf_pack_to_8bytes( i, e0, e1 );
16775-      stbir__simdi_store2( output, i );
16776-      #endif
16777-      encode += stbir__simdfX_float_count*2;
16778-      output += stbir__simdfX_float_count*2;
16779-      if ( output <= end_output )
16780-        continue;
16781-      if ( output == ( end_output + stbir__simdfX_float_count*2 ) )
16782-        break;
16783-      output = end_output; // backup and do last couple
16784-      encode = end_encode_m8;
16785-    }
16786-    return;
16787-  }
16788-
16789-  // try to do blocks of 4 when you can
16790-  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
16791-  output += 4;
16792-  STBIR_NO_UNROLL_LOOP_START
16793-  while( output <= end_output )
16794-  {
16795-    stbir__simdf e0;
16796-    stbir__simdi i0;
16797-    STBIR_NO_UNROLL(encode);
16798-    stbir__simdf_load( e0, encode );
16799-    stbir__simdf_madd( e0, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint8_as_float), e0 );
16800-    stbir__encode_simdf4_unflip( e0 );
16801-    stbir__simdf_pack_to_8bytes( i0, e0, e0 );  // only use first 4
16802-    *(int*)(output-4) = stbir__simdi_to_int( i0 );
16803-    output += 4;
16804-    encode += 4;
16805-  }
16806-  output -= 4;
16807-  #endif
16808-
16809-  // do the remnants
16810-  #if stbir__coder_min_num < 4
16811-  STBIR_NO_UNROLL_LOOP_START
16812-  while( output < end_output )
16813-  {
16814-    stbir__simdf e0;
16815-    STBIR_NO_UNROLL(encode);
16816-    stbir__simdf_madd1_mem( e0, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint8_as_float), encode+stbir__encode_order0 ); output[0] = stbir__simdf_convert_float_to_uint8( e0 );
16817-    #if stbir__coder_min_num >= 2
16818-    stbir__simdf_madd1_mem( e0, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint8_as_float), encode+stbir__encode_order1 ); output[1] = stbir__simdf_convert_float_to_uint8( e0 );
16819-    #endif
16820-    #if stbir__coder_min_num >= 3
16821-    stbir__simdf_madd1_mem( e0, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint8_as_float), encode+stbir__encode_order2 ); output[2] = stbir__simdf_convert_float_to_uint8( e0 );
16822-    #endif
16823-    output += stbir__coder_min_num;
16824-    encode += stbir__coder_min_num;
16825-  }
16826-  #endif
16827-
16828-  #else
16829-
16830-  // try to do blocks of 4 when you can
16831-  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
16832-  output += 4;
16833-  while( output <= end_output )
16834-  {
16835-    float f;
16836-    f = encode[stbir__encode_order0] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[0-4] = (unsigned char)f;
16837-    f = encode[stbir__encode_order1] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[1-4] = (unsigned char)f;
16838-    f = encode[stbir__encode_order2] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[2-4] = (unsigned char)f;
16839-    f = encode[stbir__encode_order3] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[3-4] = (unsigned char)f;
16840-    output += 4;
16841-    encode += 4;
16842-  }
16843-  output -= 4;
16844-  #endif
16845-
16846-  // do the remnants
16847-  #if stbir__coder_min_num < 4
16848-  STBIR_NO_UNROLL_LOOP_START
16849-  while( output < end_output )
16850-  {
16851-    float f;
16852-    STBIR_NO_UNROLL(encode);
16853-    f = encode[stbir__encode_order0] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[0] = (unsigned char)f;
16854-    #if stbir__coder_min_num >= 2
16855-    f = encode[stbir__encode_order1] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[1] = (unsigned char)f;
16856-    #endif
16857-    #if stbir__coder_min_num >= 3
16858-    f = encode[stbir__encode_order2] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[2] = (unsigned char)f;
16859-    #endif
16860-    output += stbir__coder_min_num;
16861-    encode += stbir__coder_min_num;
16862-  }
16863-  #endif
16864-  #endif
16865-}
16866-
16867-static float * STBIR__CODER_NAME(stbir__decode_uint8_linear)( float * decodep, int width_times_channels, void const * inputp )
16868-{
16869-  float STBIR_STREAMOUT_PTR( * ) decode = decodep;
16870-  float * decode_end = (float*) decode + width_times_channels;
16871-  unsigned char const * input = (unsigned char const*)inputp;
16872-
16873-  #ifdef STBIR_SIMD
16874-  unsigned char const * end_input_m16 = input + width_times_channels - 16;
16875-  if ( width_times_channels >= 16 )
16876-  {
16877-    decode_end -= 16;
16878-    STBIR_NO_UNROLL_LOOP_START_INF_FOR
16879-    for(;;)
16880-    {
16881-      #ifdef STBIR_SIMD8
16882-      stbir__simdi i; stbir__simdi8 o0,o1;
16883-      stbir__simdf8 of0, of1;
16884-      STBIR_NO_UNROLL(decode);
16885-      stbir__simdi_load( i, input );
16886-      stbir__simdi8_expand_u8_to_u32( o0, o1, i );
16887-      stbir__simdi8_convert_i32_to_float( of0, o0 );
16888-      stbir__simdi8_convert_i32_to_float( of1, o1 );
16889-      stbir__decode_simdf8_flip( of0 );
16890-      stbir__decode_simdf8_flip( of1 );
16891-      stbir__simdf8_store( decode + 0, of0 );
16892-      stbir__simdf8_store( decode + 8, of1 );
16893-      #else
16894-      stbir__simdi i, o0, o1, o2, o3;
16895-      stbir__simdf of0, of1, of2, of3;
16896-      STBIR_NO_UNROLL(decode);
16897-      stbir__simdi_load( i, input );
16898-      stbir__simdi_expand_u8_to_u32( o0,o1,o2,o3,i);
16899-      stbir__simdi_convert_i32_to_float( of0, o0 );
16900-      stbir__simdi_convert_i32_to_float( of1, o1 );
16901-      stbir__simdi_convert_i32_to_float( of2, o2 );
16902-      stbir__simdi_convert_i32_to_float( of3, o3 );
16903-      stbir__decode_simdf4_flip( of0 );
16904-      stbir__decode_simdf4_flip( of1 );
16905-      stbir__decode_simdf4_flip( of2 );
16906-      stbir__decode_simdf4_flip( of3 );
16907-      stbir__simdf_store( decode + 0,  of0 );
16908-      stbir__simdf_store( decode + 4,  of1 );
16909-      stbir__simdf_store( decode + 8,  of2 );
16910-      stbir__simdf_store( decode + 12, of3 );
16911-#endif
16912-      decode += 16;
16913-      input += 16;
16914-      if ( decode <= decode_end )
16915-        continue;
16916-      if ( decode == ( decode_end + 16 ) )
16917-        break;
16918-      decode = decode_end; // backup and do last couple
16919-      input = end_input_m16;
16920-    }
16921-    return decode_end + 16;
16922-  }
16923-  #endif
16924-
16925-  // try to do blocks of 4 when you can
16926-  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
16927-  decode += 4;
16928-  STBIR_SIMD_NO_UNROLL_LOOP_START
16929-  while( decode <= decode_end )
16930-  {
16931-    STBIR_SIMD_NO_UNROLL(decode);
16932-    decode[0-4] = ((float)(input[stbir__decode_order0]));
16933-    decode[1-4] = ((float)(input[stbir__decode_order1]));
16934-    decode[2-4] = ((float)(input[stbir__decode_order2]));
16935-    decode[3-4] = ((float)(input[stbir__decode_order3]));
16936-    decode += 4;
16937-    input += 4;
16938-  }
16939-  decode -= 4;
16940-  #endif
16941-
16942-  // do the remnants
16943-  #if stbir__coder_min_num < 4
16944-  STBIR_NO_UNROLL_LOOP_START
16945-  while( decode < decode_end )
16946-  {
16947-    STBIR_NO_UNROLL(decode);
16948-    decode[0] = ((float)(input[stbir__decode_order0]));
16949-    #if stbir__coder_min_num >= 2
16950-    decode[1] = ((float)(input[stbir__decode_order1]));
16951-    #endif
16952-    #if stbir__coder_min_num >= 3
16953-    decode[2] = ((float)(input[stbir__decode_order2]));
16954-    #endif
16955-    decode += stbir__coder_min_num;
16956-    input += stbir__coder_min_num;
16957-  }
16958-  #endif
16959-  return decode_end;
16960-}
16961-
16962-static void STBIR__CODER_NAME( stbir__encode_uint8_linear )( void * outputp, int width_times_channels, float const * encode )
16963-{
16964-  unsigned char STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned char *) outputp;
16965-  unsigned char * end_output = ( (unsigned char *) output ) + width_times_channels;
16966-
16967-  #ifdef STBIR_SIMD
16968-  if ( width_times_channels >= stbir__simdfX_float_count*2 )
16969-  {
16970-    float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2;
16971-    end_output -= stbir__simdfX_float_count*2;
16972-    STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
16973-    for(;;)
16974-    {
16975-      stbir__simdfX e0, e1;
16976-      stbir__simdi i;
16977-      STBIR_SIMD_NO_UNROLL(encode);
16978-      stbir__simdfX_add_mem( e0, STBIR_simd_point5X, encode );
16979-      stbir__simdfX_add_mem( e1, STBIR_simd_point5X, encode+stbir__simdfX_float_count );
16980-      stbir__encode_simdfX_unflip( e0 );
16981-      stbir__encode_simdfX_unflip( e1 );
16982-      #ifdef STBIR_SIMD8
16983-      stbir__simdf8_pack_to_16bytes( i, e0, e1 );
16984-      stbir__simdi_store( output, i );
16985-      #else
16986-      stbir__simdf_pack_to_8bytes( i, e0, e1 );
16987-      stbir__simdi_store2( output, i );
16988-      #endif
16989-      encode += stbir__simdfX_float_count*2;
16990-      output += stbir__simdfX_float_count*2;
16991-      if ( output <= end_output )
16992-        continue;
16993-      if ( output == ( end_output + stbir__simdfX_float_count*2 ) )
16994-        break;
16995-      output = end_output; // backup and do last couple
16996-      encode = end_encode_m8;
16997-    }
16998-    return;
16999-  }
17000-
17001-  // try to do blocks of 4 when you can
17002-  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
17003-  output += 4;
17004-  STBIR_NO_UNROLL_LOOP_START
17005-  while( output <= end_output )
17006-  {
17007-    stbir__simdf e0;
17008-    stbir__simdi i0;
17009-    STBIR_NO_UNROLL(encode);
17010-    stbir__simdf_load( e0, encode );
17011-    stbir__simdf_add( e0, STBIR__CONSTF(STBIR_simd_point5), e0 );
17012-    stbir__encode_simdf4_unflip( e0 );
17013-    stbir__simdf_pack_to_8bytes( i0, e0, e0 );  // only use first 4
17014-    *(int*)(output-4) = stbir__simdi_to_int( i0 );
17015-    output += 4;
17016-    encode += 4;
17017-  }
17018-  output -= 4;
17019-  #endif
17020-
17021-  #else
17022-
17023-  // try to do blocks of 4 when you can
17024-  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
17025-  output += 4;
17026-  while( output <= end_output )
17027-  {
17028-    float f;
17029-    f = encode[stbir__encode_order0] + 0.5f; STBIR_CLAMP(f, 0, 255); output[0-4] = (unsigned char)f;
17030-    f = encode[stbir__encode_order1] + 0.5f; STBIR_CLAMP(f, 0, 255); output[1-4] = (unsigned char)f;
17031-    f = encode[stbir__encode_order2] + 0.5f; STBIR_CLAMP(f, 0, 255); output[2-4] = (unsigned char)f;
17032-    f = encode[stbir__encode_order3] + 0.5f; STBIR_CLAMP(f, 0, 255); output[3-4] = (unsigned char)f;
17033-    output += 4;
17034-    encode += 4;
17035-  }
17036-  output -= 4;
17037-  #endif
17038-
17039-  #endif
17040-
17041-  // do the remnants
17042-  #if stbir__coder_min_num < 4
17043-  STBIR_NO_UNROLL_LOOP_START
17044-  while( output < end_output )
17045-  {
17046-    float f;
17047-    STBIR_NO_UNROLL(encode);
17048-    f = encode[stbir__encode_order0] + 0.5f; STBIR_CLAMP(f, 0, 255); output[0] = (unsigned char)f;
17049-    #if stbir__coder_min_num >= 2
17050-    f = encode[stbir__encode_order1] + 0.5f; STBIR_CLAMP(f, 0, 255); output[1] = (unsigned char)f;
17051-    #endif
17052-    #if stbir__coder_min_num >= 3
17053-    f = encode[stbir__encode_order2] + 0.5f; STBIR_CLAMP(f, 0, 255); output[2] = (unsigned char)f;
17054-    #endif
17055-    output += stbir__coder_min_num;
17056-    encode += stbir__coder_min_num;
17057-  }
17058-  #endif
17059-}
17060-
17061-static float * STBIR__CODER_NAME(stbir__decode_uint8_srgb)( float * decodep, int width_times_channels, void const * inputp )
17062-{
17063-  float STBIR_STREAMOUT_PTR( * ) decode = decodep;
17064-  float * decode_end = (float*) decode + width_times_channels;
17065-  unsigned char const * input = (unsigned char const *)inputp;
17066-
17067-  // try to do blocks of 4 when you can
17068-  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
17069-  decode += 4;
17070-  while( decode <= decode_end )
17071-  {
17072-    decode[0-4] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order0 ] ];
17073-    decode[1-4] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order1 ] ];
17074-    decode[2-4] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order2 ] ];
17075-    decode[3-4] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order3 ] ];
17076-    decode += 4;
17077-    input += 4;
17078-  }
17079-  decode -= 4;
17080-  #endif
17081-
17082-  // do the remnants
17083-  #if stbir__coder_min_num < 4
17084-  STBIR_NO_UNROLL_LOOP_START
17085-  while( decode < decode_end )
17086-  {
17087-    STBIR_NO_UNROLL(decode);
17088-    decode[0] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order0 ] ];
17089-    #if stbir__coder_min_num >= 2
17090-    decode[1] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order1 ] ];
17091-    #endif
17092-    #if stbir__coder_min_num >= 3
17093-    decode[2] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order2 ] ];
17094-    #endif
17095-    decode += stbir__coder_min_num;
17096-    input += stbir__coder_min_num;
17097-  }
17098-  #endif
17099-  return decode_end;
17100-}
17101-
17102-#define stbir__min_max_shift20( i, f ) \
17103-    stbir__simdf_max( f, f, stbir_simdf_casti(STBIR__CONSTI( STBIR_almost_zero )) ); \
17104-    stbir__simdf_min( f, f, stbir_simdf_casti(STBIR__CONSTI( STBIR_almost_one  )) ); \
17105-    stbir__simdi_32shr( i, stbir_simdi_castf( f ), 20 );
17106-
17107-#define stbir__scale_and_convert( i, f ) \
17108-    stbir__simdf_madd( f, STBIR__CONSTF( STBIR_simd_point5 ), STBIR__CONSTF( STBIR_max_uint8_as_float ), f ); \
17109-    stbir__simdf_max( f, f, stbir__simdf_zeroP() ); \
17110-    stbir__simdf_min( f, f, STBIR__CONSTF( STBIR_max_uint8_as_float ) ); \
17111-    stbir__simdf_convert_float_to_i32( i, f );
17112-
17113-#define stbir__linear_to_srgb_finish( i, f ) \
17114-{ \
17115-    stbir__simdi temp;  \
17116-    stbir__simdi_32shr( temp, stbir_simdi_castf( f ), 12 ) ; \
17117-    stbir__simdi_and( temp, temp, STBIR__CONSTI(STBIR_mastissa_mask) ); \
17118-    stbir__simdi_or( temp, temp, STBIR__CONSTI(STBIR_topscale) ); \
17119-    stbir__simdi_16madd( i, i, temp ); \
17120-    stbir__simdi_32shr( i, i, 16 ); \
17121-}
17122-
17123-#define stbir__simdi_table_lookup2( v0,v1, table ) \
17124-{ \
17125-  stbir__simdi_u32 temp0,temp1; \
17126-  temp0.m128i_i128 = v0; \
17127-  temp1.m128i_i128 = v1; \
17128-  temp0.m128i_u32[0] = table[temp0.m128i_i32[0]]; temp0.m128i_u32[1] = table[temp0.m128i_i32[1]]; temp0.m128i_u32[2] = table[temp0.m128i_i32[2]]; temp0.m128i_u32[3] = table[temp0.m128i_i32[3]]; \
17129-  temp1.m128i_u32[0] = table[temp1.m128i_i32[0]]; temp1.m128i_u32[1] = table[temp1.m128i_i32[1]]; temp1.m128i_u32[2] = table[temp1.m128i_i32[2]]; temp1.m128i_u32[3] = table[temp1.m128i_i32[3]]; \
17130-  v0 = temp0.m128i_i128; \
17131-  v1 = temp1.m128i_i128; \
17132-}
17133-
17134-#define stbir__simdi_table_lookup3( v0,v1,v2, table ) \
17135-{ \
17136-  stbir__simdi_u32 temp0,temp1,temp2; \
17137-  temp0.m128i_i128 = v0; \
17138-  temp1.m128i_i128 = v1; \
17139-  temp2.m128i_i128 = v2; \
17140-  temp0.m128i_u32[0] = table[temp0.m128i_i32[0]]; temp0.m128i_u32[1] = table[temp0.m128i_i32[1]]; temp0.m128i_u32[2] = table[temp0.m128i_i32[2]]; temp0.m128i_u32[3] = table[temp0.m128i_i32[3]]; \
17141-  temp1.m128i_u32[0] = table[temp1.m128i_i32[0]]; temp1.m128i_u32[1] = table[temp1.m128i_i32[1]]; temp1.m128i_u32[2] = table[temp1.m128i_i32[2]]; temp1.m128i_u32[3] = table[temp1.m128i_i32[3]]; \
17142-  temp2.m128i_u32[0] = table[temp2.m128i_i32[0]]; temp2.m128i_u32[1] = table[temp2.m128i_i32[1]]; temp2.m128i_u32[2] = table[temp2.m128i_i32[2]]; temp2.m128i_u32[3] = table[temp2.m128i_i32[3]]; \
17143-  v0 = temp0.m128i_i128; \
17144-  v1 = temp1.m128i_i128; \
17145-  v2 = temp2.m128i_i128; \
17146-}
17147-
17148-#define stbir__simdi_table_lookup4( v0,v1,v2,v3, table ) \
17149-{ \
17150-  stbir__simdi_u32 temp0,temp1,temp2,temp3; \
17151-  temp0.m128i_i128 = v0; \
17152-  temp1.m128i_i128 = v1; \
17153-  temp2.m128i_i128 = v2; \
17154-  temp3.m128i_i128 = v3; \
17155-  temp0.m128i_u32[0] = table[temp0.m128i_i32[0]]; temp0.m128i_u32[1] = table[temp0.m128i_i32[1]]; temp0.m128i_u32[2] = table[temp0.m128i_i32[2]]; temp0.m128i_u32[3] = table[temp0.m128i_i32[3]]; \
17156-  temp1.m128i_u32[0] = table[temp1.m128i_i32[0]]; temp1.m128i_u32[1] = table[temp1.m128i_i32[1]]; temp1.m128i_u32[2] = table[temp1.m128i_i32[2]]; temp1.m128i_u32[3] = table[temp1.m128i_i32[3]]; \
17157-  temp2.m128i_u32[0] = table[temp2.m128i_i32[0]]; temp2.m128i_u32[1] = table[temp2.m128i_i32[1]]; temp2.m128i_u32[2] = table[temp2.m128i_i32[2]]; temp2.m128i_u32[3] = table[temp2.m128i_i32[3]]; \
17158-  temp3.m128i_u32[0] = table[temp3.m128i_i32[0]]; temp3.m128i_u32[1] = table[temp3.m128i_i32[1]]; temp3.m128i_u32[2] = table[temp3.m128i_i32[2]]; temp3.m128i_u32[3] = table[temp3.m128i_i32[3]]; \
17159-  v0 = temp0.m128i_i128; \
17160-  v1 = temp1.m128i_i128; \
17161-  v2 = temp2.m128i_i128; \
17162-  v3 = temp3.m128i_i128; \
17163-}
17164-
17165-static void STBIR__CODER_NAME( stbir__encode_uint8_srgb )( void * outputp, int width_times_channels, float const * encode )
17166-{
17167-  unsigned char STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned char*) outputp;
17168-  unsigned char * end_output = ( (unsigned char*) output ) + width_times_channels;
17169-
17170-  #ifdef STBIR_SIMD
17171-
17172-  if ( width_times_channels >= 16 )
17173-  {
17174-    float const * end_encode_m16 = encode + width_times_channels - 16;
17175-    end_output -= 16;
17176-    STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
17177-    for(;;)
17178-    {
17179-      stbir__simdf f0, f1, f2, f3;
17180-      stbir__simdi i0, i1, i2, i3;
17181-      STBIR_SIMD_NO_UNROLL(encode);
17182-
17183-      stbir__simdf_load4_transposed( f0, f1, f2, f3, encode );
17184-
17185-      stbir__min_max_shift20( i0, f0 );
17186-      stbir__min_max_shift20( i1, f1 );
17187-      stbir__min_max_shift20( i2, f2 );
17188-      stbir__min_max_shift20( i3, f3 );
17189-
17190-      stbir__simdi_table_lookup4( i0, i1, i2, i3, ( fp32_to_srgb8_tab4 - (127-13)*8 ) );
17191-
17192-      stbir__linear_to_srgb_finish( i0, f0 );
17193-      stbir__linear_to_srgb_finish( i1, f1 );
17194-      stbir__linear_to_srgb_finish( i2, f2 );
17195-      stbir__linear_to_srgb_finish( i3, f3 );
17196-
17197-      stbir__interleave_pack_and_store_16_u8( output,  STBIR_strs_join1(i, ,stbir__encode_order0), STBIR_strs_join1(i, ,stbir__encode_order1), STBIR_strs_join1(i, ,stbir__encode_order2), STBIR_strs_join1(i, ,stbir__encode_order3) );
17198-
17199-      encode += 16;
17200-      output += 16;
17201-      if ( output <= end_output )
17202-        continue;
17203-      if ( output == ( end_output + 16 ) )
17204-        break;
17205-      output = end_output; // backup and do last couple
17206-      encode = end_encode_m16;
17207-    }
17208-    return;
17209-  }
17210-  #endif
17211-
17212-  // try to do blocks of 4 when you can
17213-  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
17214-  output += 4;
17215-  STBIR_SIMD_NO_UNROLL_LOOP_START
17216-  while ( output <= end_output )
17217-  {
17218-    STBIR_SIMD_NO_UNROLL(encode);
17219-
17220-    output[0-4] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order0] );
17221-    output[1-4] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order1] );
17222-    output[2-4] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order2] );
17223-    output[3-4] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order3] );
17224-
17225-    output += 4;
17226-    encode += 4;
17227-  }
17228-  output -= 4;
17229-  #endif
17230-
17231-  // do the remnants
17232-  #if stbir__coder_min_num < 4
17233-  STBIR_NO_UNROLL_LOOP_START
17234-  while( output < end_output )
17235-  {
17236-    STBIR_NO_UNROLL(encode);
17237-    output[0] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order0] );
17238-    #if stbir__coder_min_num >= 2
17239-    output[1] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order1] );
17240-    #endif
17241-    #if stbir__coder_min_num >= 3
17242-    output[2] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order2] );
17243-    #endif
17244-    output += stbir__coder_min_num;
17245-    encode += stbir__coder_min_num;
17246-  }
17247-  #endif
17248-}
17249-
17250-#if ( stbir__coder_min_num == 4 ) || ( ( stbir__coder_min_num == 1 ) && ( !defined(stbir__decode_swizzle) ) )
17251-
17252-static float * STBIR__CODER_NAME(stbir__decode_uint8_srgb4_linearalpha)( float * decodep, int width_times_channels, void const * inputp )
17253-{
17254-  float STBIR_STREAMOUT_PTR( * ) decode = decodep;
17255-  float * decode_end = (float*) decode + width_times_channels;
17256-  unsigned char const * input = (unsigned char const *)inputp;
17257-
17258-  do {
17259-    decode[0] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order0] ];
17260-    decode[1] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order1] ];
17261-    decode[2] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order2] ];
17262-    decode[3] = ( (float) input[stbir__decode_order3] ) * stbir__max_uint8_as_float_inverted;
17263-    input += 4;
17264-    decode += 4;
17265-  } while( decode < decode_end );
17266-  return decode_end;
17267-}
17268-
17269-
17270-static void STBIR__CODER_NAME( stbir__encode_uint8_srgb4_linearalpha )( void * outputp, int width_times_channels, float const * encode )
17271-{
17272-  unsigned char STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned char*) outputp;
17273-  unsigned char * end_output = ( (unsigned char*) output ) + width_times_channels;
17274-
17275-  #ifdef STBIR_SIMD
17276-
17277-  if ( width_times_channels >= 16 )
17278-  {
17279-    float const * end_encode_m16 = encode + width_times_channels - 16;
17280-    end_output -= 16;
17281-    STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
17282-    for(;;)
17283-    {
17284-      stbir__simdf f0, f1, f2, f3;
17285-      stbir__simdi i0, i1, i2, i3;
17286-
17287-      STBIR_SIMD_NO_UNROLL(encode);
17288-      stbir__simdf_load4_transposed( f0, f1, f2, f3, encode );
17289-
17290-      stbir__min_max_shift20( i0, f0 );
17291-      stbir__min_max_shift20( i1, f1 );
17292-      stbir__min_max_shift20( i2, f2 );
17293-      stbir__scale_and_convert( i3, f3 );
17294-
17295-      stbir__simdi_table_lookup3( i0, i1, i2, ( fp32_to_srgb8_tab4 - (127-13)*8 ) );
17296-
17297-      stbir__linear_to_srgb_finish( i0, f0 );
17298-      stbir__linear_to_srgb_finish( i1, f1 );
17299-      stbir__linear_to_srgb_finish( i2, f2 );
17300-
17301-      stbir__interleave_pack_and_store_16_u8( output,  STBIR_strs_join1(i, ,stbir__encode_order0), STBIR_strs_join1(i, ,stbir__encode_order1), STBIR_strs_join1(i, ,stbir__encode_order2), STBIR_strs_join1(i, ,stbir__encode_order3) );
17302-
17303-      output += 16;
17304-      encode += 16;
17305-
17306-      if ( output <= end_output )
17307-        continue;
17308-      if ( output == ( end_output + 16 ) )
17309-        break;
17310-      output = end_output; // backup and do last couple
17311-      encode = end_encode_m16;
17312-    }
17313-    return;
17314-  }
17315-  #endif
17316-
17317-  STBIR_SIMD_NO_UNROLL_LOOP_START
17318-  do {
17319-    float f;
17320-    STBIR_SIMD_NO_UNROLL(encode);
17321-
17322-    output[stbir__decode_order0] = stbir__linear_to_srgb_uchar( encode[0] );
17323-    output[stbir__decode_order1] = stbir__linear_to_srgb_uchar( encode[1] );
17324-    output[stbir__decode_order2] = stbir__linear_to_srgb_uchar( encode[2] );
17325-
17326-    f = encode[3] * stbir__max_uint8_as_float + 0.5f;
17327-    STBIR_CLAMP(f, 0, 255);
17328-    output[stbir__decode_order3] = (unsigned char) f;
17329+#define stbir__encode_simdfX_unflip stbir__encode_simdf4_unflip
17330+#endif
17331 
17332-    output += 4;
17333-    encode += 4;
17334-  } while( output < end_output );
17335-}
17336+static float *
17337+STBIR__CODER_NAME(stbir__decode_uint8_linear_scaled)(float *decodep,
17338+                                                     int width_times_channels,
17339+                                                     void const *inputp)
17340+{
17341+	float STBIR_STREAMOUT_PTR(*) decode = decodep;
17342+	float *decode_end = (float *)decode + width_times_channels;
17343+	unsigned char const *input = (unsigned char const *)inputp;
17344+
17345+#ifdef STBIR_SIMD
17346+	unsigned char const *end_input_m16 = input + width_times_channels - 16;
17347+	if (width_times_channels >= 16) {
17348+		decode_end -= 16;
17349+		STBIR_NO_UNROLL_LOOP_START_INF_FOR
17350+		for (;;) {
17351+#ifdef STBIR_SIMD8
17352+			stbir__simdi i;
17353+			stbir__simdi8 o0, o1;
17354+			stbir__simdf8 of0, of1;
17355+			STBIR_NO_UNROLL(decode);
17356+			stbir__simdi_load(i, input);
17357+			stbir__simdi8_expand_u8_to_u32(o0, o1, i);
17358+			stbir__simdi8_convert_i32_to_float(of0, o0);
17359+			stbir__simdi8_convert_i32_to_float(of1, o1);
17360+			stbir__simdf8_mult(of0, of0, STBIR_max_uint8_as_float_inverted8);
17361+			stbir__simdf8_mult(of1, of1, STBIR_max_uint8_as_float_inverted8);
17362+			stbir__decode_simdf8_flip(of0);
17363+			stbir__decode_simdf8_flip(of1);
17364+			stbir__simdf8_store(decode + 0, of0);
17365+			stbir__simdf8_store(decode + 8, of1);
17366+#else
17367+			stbir__simdi i, o0, o1, o2, o3;
17368+			stbir__simdf of0, of1, of2, of3;
17369+			STBIR_NO_UNROLL(decode);
17370+			stbir__simdi_load(i, input);
17371+			stbir__simdi_expand_u8_to_u32(o0, o1, o2, o3, i);
17372+			stbir__simdi_convert_i32_to_float(of0, o0);
17373+			stbir__simdi_convert_i32_to_float(of1, o1);
17374+			stbir__simdi_convert_i32_to_float(of2, o2);
17375+			stbir__simdi_convert_i32_to_float(of3, o3);
17376+			stbir__simdf_mult(of0, of0,
17377+			                  STBIR__CONSTF(STBIR_max_uint8_as_float_inverted));
17378+			stbir__simdf_mult(of1, of1,
17379+			                  STBIR__CONSTF(STBIR_max_uint8_as_float_inverted));
17380+			stbir__simdf_mult(of2, of2,
17381+			                  STBIR__CONSTF(STBIR_max_uint8_as_float_inverted));
17382+			stbir__simdf_mult(of3, of3,
17383+			                  STBIR__CONSTF(STBIR_max_uint8_as_float_inverted));
17384+			stbir__decode_simdf4_flip(of0);
17385+			stbir__decode_simdf4_flip(of1);
17386+			stbir__decode_simdf4_flip(of2);
17387+			stbir__decode_simdf4_flip(of3);
17388+			stbir__simdf_store(decode + 0, of0);
17389+			stbir__simdf_store(decode + 4, of1);
17390+			stbir__simdf_store(decode + 8, of2);
17391+			stbir__simdf_store(decode + 12, of3);
17392+#endif
17393+			decode += 16;
17394+			input += 16;
17395+			if (decode <= decode_end) {
17396+				continue;
17397+			}
17398+			if (decode == (decode_end + 16)) {
17399+				break;
17400+			}
17401+			decode = decode_end; // backup and do last couple
17402+			input = end_input_m16;
17403+		}
17404+		return decode_end + 16;
17405+	}
17406+#endif
17407+
17408+// try to do blocks of 4 when you can
17409+#if stbir__coder_min_num != 3 // doesn't divide cleanly by four
17410+	decode += 4;
17411+	STBIR_SIMD_NO_UNROLL_LOOP_START
17412+	while (decode <= decode_end) {
17413+		STBIR_SIMD_NO_UNROLL(decode);
17414+		decode[0 - 4] = ((float)(input[stbir__decode_order0])) *
17415+		                stbir__max_uint8_as_float_inverted;
17416+		decode[1 - 4] = ((float)(input[stbir__decode_order1])) *
17417+		                stbir__max_uint8_as_float_inverted;
17418+		decode[2 - 4] = ((float)(input[stbir__decode_order2])) *
17419+		                stbir__max_uint8_as_float_inverted;
17420+		decode[3 - 4] = ((float)(input[stbir__decode_order3])) *
17421+		                stbir__max_uint8_as_float_inverted;
17422+		decode += 4;
17423+		input += 4;
17424+	}
17425+	decode -= 4;
17426+#endif
17427 
17428+// do the remnants
17429+#if stbir__coder_min_num < 4
17430+	STBIR_NO_UNROLL_LOOP_START
17431+	while (decode < decode_end) {
17432+		STBIR_NO_UNROLL(decode);
17433+		decode[0] = ((float)(input[stbir__decode_order0])) *
17434+		            stbir__max_uint8_as_float_inverted;
17435+#if stbir__coder_min_num >= 2
17436+		decode[1] = ((float)(input[stbir__decode_order1])) *
17437+		            stbir__max_uint8_as_float_inverted;
17438+#endif
17439+#if stbir__coder_min_num >= 3
17440+		decode[2] = ((float)(input[stbir__decode_order2])) *
17441+		            stbir__max_uint8_as_float_inverted;
17442+#endif
17443+		decode += stbir__coder_min_num;
17444+		input += stbir__coder_min_num;
17445+	}
17446 #endif
17447 
17448-#if ( stbir__coder_min_num == 2 ) || ( ( stbir__coder_min_num == 1 ) && ( !defined(stbir__decode_swizzle) ) )
17449+	return decode_end;
17450+}
17451 
// Old-side ('-' lines) function stbir__decode_uint8_srgb2_linearalpha:
// decodes interleaved uint8 pairs into floats. The stbir__decode_order0
// byte of each pair is looked up in stbir__srgb_uchar_to_linear_float, the
// stbir__decode_order1 byte is multiplied by stbir__max_uint8_as_float_inverted.
//   decodep              - destination floats (width_times_channels of them)
//   width_times_channels - number of values to decode
//   inputp               - source bytes
// Returns decodep + width_times_channels.
// The '+' lines interleaved below belong to a different (new-side) function
// and are not described here.
17452-static float * STBIR__CODER_NAME(stbir__decode_uint8_srgb2_linearalpha)( float * decodep, int width_times_channels, void const * inputp )
17453+static void
17454+STBIR__CODER_NAME(stbir__encode_uint8_linear_scaled)(void *outputp,
17455+                                                     int width_times_channels,
17456+                                                     float const *encode)
17457 {
17458-  float STBIR_STREAMOUT_PTR( * ) decode = decodep;
17459-  float * decode_end = (float*) decode + width_times_channels;
17460-  unsigned char const * input = (unsigned char const *)inputp;
17461+	unsigned char STBIR_SIMD_STREAMOUT_PTR(*) output = (unsigned char *)outputp;
17462+	unsigned char *end_output =
17463+	    ((unsigned char *)output) + width_times_channels;
17464 
// Main loop: four values (two pairs) per iteration. decode is pre-advanced by
// 4 and written through negative offsets so the loop test is a plain <=.
17465-  decode += 4;
17466-  while( decode <= decode_end )
17467-  {
17468-    decode[0-4] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order0] ];
17469-    decode[1-4] = ( (float) input[stbir__decode_order1] ) * stbir__max_uint8_as_float_inverted;
17470-    decode[2-4] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order0+2] ];
17471-    decode[3-4] = ( (float) input[stbir__decode_order1+2] ) * stbir__max_uint8_as_float_inverted;
17472-    input += 4;
17473-    decode += 4;
17474-  }
17475-  decode -= 4;
// Tail: at most one pair is left after the loop above.
17476-  if( decode < decode_end )
17477-  {
// NOTE(review): the table is indexed by the constant stbir__decode_order0
// here, while the loop above indexes it by input[stbir__decode_order0].
// Looks like a missing input[...] (the last pair would ignore the source
// byte) -- confirm.
17478-    decode[0] = stbir__srgb_uchar_to_linear_float[ stbir__decode_order0 ];
17479-    decode[1] = ( (float) input[stbir__decode_order1] ) * stbir__max_uint8_as_float_inverted;
17480-  }
17481-  return decode_end;
17482-}
17483-
// Encodes interleaved float pairs to uint8. In the scalar path the first
// float of each pair is converted with stbir__linear_to_srgb_uchar and the
// second is multiplied by stbir__max_uint8_as_float, offset by 0.5, clamped
// to [0,255] and truncated to a byte.
//   outputp              - destination bytes (width_times_channels of them)
//   width_times_channels - number of values to encode
//   encode               - source floats
17484-static void STBIR__CODER_NAME( stbir__encode_uint8_srgb2_linearalpha )( void * outputp, int width_times_channels, float const * encode )
17485-{
17486-  unsigned char STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned char*) outputp;
17487-  unsigned char * end_output = ( (unsigned char*) output ) + width_times_channels;
17488-
17489-  #ifdef STBIR_SIMD
17490-
// SIMD path: 16 values per iteration, taken only when at least 16 exist.
// end_output is lowered by 16 so "output <= end_output" means a full block
// still fits.
17491-  if ( width_times_channels >= 16 )
17492-  {
17493-    float const * end_encode_m16 = encode + width_times_channels - 16;
17494-    end_output -= 16;
17495-    STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
17496-    for(;;)
17497-    {
17498-      stbir__simdf f0, f1, f2, f3;
17499-      stbir__simdi i0, i1, i2, i3;
17500-
17501-      STBIR_SIMD_NO_UNROLL(encode);
17502-      stbir__simdf_load4_transposed( f0, f1, f2, f3, encode );
17503-
// f0/f2 go through the sRGB macros (min_max_shift20, table lookup, finish);
// f1/f3 go through stbir__scale_and_convert -- presumably the sRGB and the
// linear lanes respectively, matching the scalar loop below.
17504-      stbir__min_max_shift20( i0, f0 );
17505-      stbir__scale_and_convert( i1, f1 );
17506-      stbir__min_max_shift20( i2, f2 );
17507-      stbir__scale_and_convert( i3, f3 );
17508-
17509-      stbir__simdi_table_lookup2( i0, i2, ( fp32_to_srgb8_tab4 - (127-13)*8 ) );
17510-
17511-      stbir__linear_to_srgb_finish( i0, f0 );
17512-      stbir__linear_to_srgb_finish( i2, f2 );
17513-
17514-      stbir__interleave_pack_and_store_16_u8( output,  STBIR_strs_join1(i, ,stbir__encode_order0), STBIR_strs_join1(i, ,stbir__encode_order1), STBIR_strs_join1(i, ,stbir__encode_order2), STBIR_strs_join1(i, ,stbir__encode_order3) );
17515-
17516-      output += 16;
17517-      encode += 16;
// Another full block fits -> keep going. Landed exactly on the real end ->
// done. Otherwise rewind both pointers so one final block covers the last 16
// values (re-encoding some that were already written).
17518-      if ( output <= end_output )
17519-        continue;
17520-      if ( output == ( end_output + 16 ) )
17521-        break;
17522-      output = end_output; // backup and do last couple
17523-      encode = end_encode_m16;
17524-    }
17525-    return;
17526-  }
17527-  #endif
17528-
// Scalar path: one pair per iteration.
// NOTE(review): do/while, so one pair is written even when
// width_times_channels is 0 -- presumably callers never pass 0; verify.
// NOTE(review): output is indexed with the stbir__decode_order* macros rather
// than stbir__encode_order* -- confirm this is intentional for two channels.
17529-  STBIR_SIMD_NO_UNROLL_LOOP_START
17530-  do {
17531-    float f;
17532-    STBIR_SIMD_NO_UNROLL(encode);
17533-
17534-    output[stbir__decode_order0] = stbir__linear_to_srgb_uchar( encode[0] );
17535-
17536-    f = encode[1] * stbir__max_uint8_as_float + 0.5f;
17537-    STBIR_CLAMP(f, 0, 255);
17538-    output[stbir__decode_order1] = (unsigned char) f;
17539-
17540-    output += 2;
17541-    encode += 2;
17542-  } while( output < end_output );
17543-}
17544-
17545-#endif
17546-
// Decodes uint16 input to floats, multiplying each value by
// stbir__max_uint16_as_float_inverted and reordering channels through the
// stbir__decode_order* indices (scalar paths) or the decode flip macros
// (SIMD path).
//   decodep              - destination floats (width_times_channels of them)
//   width_times_channels - number of values to decode
//   inputp               - source unsigned shorts
// Returns decodep + width_times_channels on every path.
17547-static float * STBIR__CODER_NAME(stbir__decode_uint16_linear_scaled)( float * decodep, int width_times_channels, void const * inputp )
17548-{
17549-  float STBIR_STREAMOUT_PTR( * ) decode = decodep;
17550-  float * decode_end = (float*) decode + width_times_channels;
17551-  unsigned short const * input = (unsigned short const *)inputp;
17552-
17553-  #ifdef STBIR_SIMD
// NOTE(review): computed before the width check below; with fewer than 8
// values it points before `input` and is never used -- consider moving it
// inside the if.
17554-  unsigned short const * end_input_m8 = input + width_times_channels - 8;
// SIMD path: 8 values per iteration, only when at least 8 exist.
17555-  if ( width_times_channels >= 8 )
17556-  {
17557-    decode_end -= 8;
17558-    STBIR_NO_UNROLL_LOOP_START_INF_FOR
17559-    for(;;)
17560-    {
17561-      #ifdef STBIR_SIMD8
17562-      stbir__simdi i; stbir__simdi8 o;
17563-      stbir__simdf8 of;
17564-      STBIR_NO_UNROLL(decode);
17565-      stbir__simdi_load( i, input );
17566-      stbir__simdi8_expand_u16_to_u32( o, i );
17567-      stbir__simdi8_convert_i32_to_float( of, o );
17568-      stbir__simdf8_mult( of, of, STBIR_max_uint16_as_float_inverted8);
17569-      stbir__decode_simdf8_flip( of );
17570-      stbir__simdf8_store( decode + 0, of );
17571-      #else
17572-      stbir__simdi i, o0, o1;
17573-      stbir__simdf of0, of1;
17574-      STBIR_NO_UNROLL(decode);
17575-      stbir__simdi_load( i, input );
17576-      stbir__simdi_expand_u16_to_u32( o0,o1,i );
17577-      stbir__simdi_convert_i32_to_float( of0, o0 );
17578-      stbir__simdi_convert_i32_to_float( of1, o1 );
17579-      stbir__simdf_mult( of0, of0, STBIR__CONSTF(STBIR_max_uint16_as_float_inverted) );
17580-      stbir__simdf_mult( of1, of1, STBIR__CONSTF(STBIR_max_uint16_as_float_inverted));
17581-      stbir__decode_simdf4_flip( of0 );
17582-      stbir__decode_simdf4_flip( of1 );
17583-      stbir__simdf_store( decode + 0,  of0 );
17584-      stbir__simdf_store( decode + 4,  of1 );
17585-      #endif
17586-      decode += 8;
17587-      input += 8;
// Full block still fits -> continue; exactly at the real end -> done;
// otherwise rewind so the final block covers the last 8 values.
17588-      if ( decode <= decode_end )
17589-        continue;
17590-      if ( decode == ( decode_end + 8 ) )
17591-        break;
17592-      decode = decode_end; // backup and do last couple
17593-      input = end_input_m8;
17594-    }
// decode_end was lowered by 8 above, so this is the original end pointer.
17595-    return decode_end + 8;
17596-  }
17597-  #endif
17598-
17599-  // try to do blocks of 4 when you can
17600-  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
// decode is pre-advanced by 4 and written through negative offsets so the
// loop condition is a plain <= against decode_end.
17601-  decode += 4;
17602-  STBIR_SIMD_NO_UNROLL_LOOP_START
17603-  while( decode <= decode_end )
17604-  {
17605-    STBIR_SIMD_NO_UNROLL(decode);
17606-    decode[0-4] = ((float)(input[stbir__decode_order0])) * stbir__max_uint16_as_float_inverted;
17607-    decode[1-4] = ((float)(input[stbir__decode_order1])) * stbir__max_uint16_as_float_inverted;
17608-    decode[2-4] = ((float)(input[stbir__decode_order2])) * stbir__max_uint16_as_float_inverted;
17609-    decode[3-4] = ((float)(input[stbir__decode_order3])) * stbir__max_uint16_as_float_inverted;
17610-    decode += 4;
17611-    input += 4;
17612-  }
17613-  decode -= 4;
17614-  #endif
17615-
17616-  // do the remnants
17617-  #if stbir__coder_min_num < 4
// Steps by stbir__coder_min_num values; compiled out when that is 4 or more.
17618-  STBIR_NO_UNROLL_LOOP_START
17619-  while( decode < decode_end )
17620-  {
17621-    STBIR_NO_UNROLL(decode);
17622-    decode[0] = ((float)(input[stbir__decode_order0])) * stbir__max_uint16_as_float_inverted;
17623-    #if stbir__coder_min_num >= 2
17624-    decode[1] = ((float)(input[stbir__decode_order1])) * stbir__max_uint16_as_float_inverted;
17625-    #endif
17626-    #if stbir__coder_min_num >= 3
17627-    decode[2] = ((float)(input[stbir__decode_order2])) * stbir__max_uint16_as_float_inverted;
17628-    #endif
17629-    decode += stbir__coder_min_num;
17630-    input += stbir__coder_min_num;
17631-  }
17632-  #endif
17633-  return decode_end;
17634-}
17635-
17636-
// Encodes floats to uint16 after scaling by the maximum uint16 value. The
// non-SIMD path computes encode[k] * stbir__max_uint16_as_float + 0.5, clamps
// to [0,65535] and truncates; the SIMD path uses the madd/pack macros with the
// corresponding constants (presumably the same arithmetic -- the macros are
// defined outside this block).
//   outputp              - destination unsigned shorts
//   width_times_channels - number of values to encode
//   encode               - source floats
17637-static void STBIR__CODER_NAME(stbir__encode_uint16_linear_scaled)( void * outputp, int width_times_channels, float const * encode )
17638-{
17639-  unsigned short STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned short*) outputp;
17640-  unsigned short * end_output = ( (unsigned short*) output ) + width_times_channels;
17641-
17642-  #ifdef STBIR_SIMD
17643-  {
// Wide path: two SIMD vectors (stbir__simdfX_float_count*2 values) per
// iteration, only when at least that many values exist. The "_m8" suffix is
// nominal; the actual offset is stbir__simdfX_float_count*2.
17644-    if ( width_times_channels >= stbir__simdfX_float_count*2 )
17645-    {
17646-      float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2;
17647-      end_output -= stbir__simdfX_float_count*2;
17648-      STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
17649-      for(;;)
17650-      {
17651-        stbir__simdfX e0, e1;
17652-        stbir__simdiX i;
17653-        STBIR_SIMD_NO_UNROLL(encode);
17654-        stbir__simdfX_madd_mem( e0, STBIR_simd_point5X, STBIR_max_uint16_as_floatX, encode );
17655-        stbir__simdfX_madd_mem( e1, STBIR_simd_point5X, STBIR_max_uint16_as_floatX, encode+stbir__simdfX_float_count );
17656-        stbir__encode_simdfX_unflip( e0 );
17657-        stbir__encode_simdfX_unflip( e1 );
17658-        stbir__simdfX_pack_to_words( i, e0, e1 );
17659-        stbir__simdiX_store( output, i );
17660-        encode += stbir__simdfX_float_count*2;
17661-        output += stbir__simdfX_float_count*2;
// Full block still fits -> continue; exactly at the real end -> done;
// otherwise rewind so the final block covers the last values.
17662-        if ( output <= end_output )
17663-          continue;
17664-        if ( output == ( end_output + stbir__simdfX_float_count*2 ) )
17665-          break;
17666-        output = end_output;     // backup and do last couple
17667-        encode = end_encode_m8;
17668-      }
17669-      return;
17670-    }
17671-  }
17672-
17673-  // try to do blocks of 4 when you can
17674-  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
// output is pre-advanced by 4; each iteration stores to output-4.
17675-  output += 4;
17676-  STBIR_NO_UNROLL_LOOP_START
17677-  while( output <= end_output )
17678-  {
17679-    stbir__simdf e;
17680-    stbir__simdi i;
17681-    STBIR_NO_UNROLL(encode);
17682-    stbir__simdf_load( e, encode );
17683-    stbir__simdf_madd( e, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint16_as_float), e );
17684-    stbir__encode_simdf4_unflip( e );
17685-    stbir__simdf_pack_to_8words( i, e, e );  // only use first 4
17686-    stbir__simdi_store2( output-4, i );
17687-    output += 4;
17688-    encode += 4;
17689-  }
17690-  output -= 4;
17691-  #endif
17692-
17693-  // do the remnants
17694-  #if stbir__coder_min_num < 4
// One value at a time through the single-lane SIMD helpers; the channel
// reorder is applied on the read side (encode + stbir__encode_orderN).
17695-  STBIR_NO_UNROLL_LOOP_START
17696-  while( output < end_output )
17697-  {
17698-    stbir__simdf e;
17699-    STBIR_NO_UNROLL(encode);
17700-    stbir__simdf_madd1_mem( e, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint16_as_float), encode+stbir__encode_order0 ); output[0] = stbir__simdf_convert_float_to_short( e );
17701-    #if stbir__coder_min_num >= 2
17702-    stbir__simdf_madd1_mem( e, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint16_as_float), encode+stbir__encode_order1 ); output[1] = stbir__simdf_convert_float_to_short( e );
17703-    #endif
17704-    #if stbir__coder_min_num >= 3
17705-    stbir__simdf_madd1_mem( e, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint16_as_float), encode+stbir__encode_order2 ); output[2] = stbir__simdf_convert_float_to_short( e );
17706-    #endif
17707-    output += stbir__coder_min_num;
17708-    encode += stbir__coder_min_num;
17709-  }
17710-  #endif
17711-
17712-  #else
17713-
// Non-SIMD build: same two-stage structure (blocks of 4, then remnants) in
// plain scalar code with an explicit clamp.
17714-  // try to do blocks of 4 when you can
17715-  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
17716-  output += 4;
17717-  STBIR_SIMD_NO_UNROLL_LOOP_START
17718-  while( output <= end_output )
17719-  {
17720-    float f;
17721-    STBIR_SIMD_NO_UNROLL(encode);
17722-    f = encode[stbir__encode_order0] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[0-4] = (unsigned short)f;
17723-    f = encode[stbir__encode_order1] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[1-4] = (unsigned short)f;
17724-    f = encode[stbir__encode_order2] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[2-4] = (unsigned short)f;
17725-    f = encode[stbir__encode_order3] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[3-4] = (unsigned short)f;
17726-    output += 4;
17727-    encode += 4;
17728-  }
17729-  output -= 4;
17730-  #endif
17731-
17732-  // do the remnants
17733-  #if stbir__coder_min_num < 4
17734-  STBIR_NO_UNROLL_LOOP_START
17735-  while( output < end_output )
17736-  {
17737-    float f;
17738-    STBIR_NO_UNROLL(encode);
17739-    f = encode[stbir__encode_order0] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[0] = (unsigned short)f;
17740-    #if stbir__coder_min_num >= 2
17741-    f = encode[stbir__encode_order1] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[1] = (unsigned short)f;
17742-    #endif
17743-    #if stbir__coder_min_num >= 3
17744-    f = encode[stbir__encode_order2] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[2] = (unsigned short)f;
17745-    #endif
17746-    output += stbir__coder_min_num;
17747-    encode += stbir__coder_min_num;
17748-  }
17749-  #endif
17750-  #endif
17751-}
17752-
// Decodes uint16 input to floats without scaling: each value is converted
// to float as-is, with channels reordered through the stbir__decode_order*
// indices (scalar paths) or the decode flip macros (SIMD path).
//   decodep              - destination floats (width_times_channels of them)
//   width_times_channels - number of values to decode
//   inputp               - source unsigned shorts
// Returns decodep + width_times_channels on every path.
17753-static float * STBIR__CODER_NAME(stbir__decode_uint16_linear)( float * decodep, int width_times_channels, void const * inputp )
17754-{
17755-  float STBIR_STREAMOUT_PTR( * ) decode = decodep;
17756-  float * decode_end = (float*) decode + width_times_channels;
17757-  unsigned short const * input = (unsigned short const *)inputp;
17758-
17759-  #ifdef STBIR_SIMD
// NOTE(review): computed before the width check below; with fewer than 8
// values it points before `input` and is never used -- consider moving it
// inside the if.
17760-  unsigned short const * end_input_m8 = input + width_times_channels - 8;
// SIMD path: 8 values per iteration, only when at least 8 exist.
17761-  if ( width_times_channels >= 8 )
17762-  {
17763-    decode_end -= 8;
17764-    STBIR_NO_UNROLL_LOOP_START_INF_FOR
17765-    for(;;)
17766-    {
17767-      #ifdef STBIR_SIMD8
17768-      stbir__simdi i; stbir__simdi8 o;
17769-      stbir__simdf8 of;
17770-      STBIR_NO_UNROLL(decode);
17771-      stbir__simdi_load( i, input );
17772-      stbir__simdi8_expand_u16_to_u32( o, i );
17773-      stbir__simdi8_convert_i32_to_float( of, o );
17774-      stbir__decode_simdf8_flip( of );
17775-      stbir__simdf8_store( decode + 0, of );
17776-      #else
17777-      stbir__simdi i, o0, o1;
17778-      stbir__simdf of0, of1;
17779-      STBIR_NO_UNROLL(decode);
17780-      stbir__simdi_load( i, input );
17781-      stbir__simdi_expand_u16_to_u32( o0, o1, i );
17782-      stbir__simdi_convert_i32_to_float( of0, o0 );
17783-      stbir__simdi_convert_i32_to_float( of1, o1 );
17784-      stbir__decode_simdf4_flip( of0 );
17785-      stbir__decode_simdf4_flip( of1 );
17786-      stbir__simdf_store( decode + 0,  of0 );
17787-      stbir__simdf_store( decode + 4,  of1 );
17788-      #endif
17789-      decode += 8;
17790-      input += 8;
// Full block still fits -> continue; exactly at the real end -> done;
// otherwise rewind so the final block covers the last 8 values.
17791-      if ( decode <= decode_end )
17792-        continue;
17793-      if ( decode == ( decode_end + 8 ) )
17794-        break;
17795-      decode = decode_end; // backup and do last couple
17796-      input = end_input_m8;
17797-    }
// decode_end was lowered by 8 above, so this is the original end pointer.
17798-    return decode_end + 8;
17799-  }
17800-  #endif
17801-
17802-  // try to do blocks of 4 when you can
17803-  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
// decode is pre-advanced by 4 and written through negative offsets so the
// loop condition is a plain <= against decode_end.
17804-  decode += 4;
17805-  STBIR_SIMD_NO_UNROLL_LOOP_START
17806-  while( decode <= decode_end )
17807-  {
17808-    STBIR_SIMD_NO_UNROLL(decode);
17809-    decode[0-4] = ((float)(input[stbir__decode_order0]));
17810-    decode[1-4] = ((float)(input[stbir__decode_order1]));
17811-    decode[2-4] = ((float)(input[stbir__decode_order2]));
17812-    decode[3-4] = ((float)(input[stbir__decode_order3]));
17813-    decode += 4;
17814-    input += 4;
17815-  }
17816-  decode -= 4;
17817-  #endif
17818-
17819-  // do the remnants
17820-  #if stbir__coder_min_num < 4
// Steps by stbir__coder_min_num values; compiled out when that is 4 or more.
17821-  STBIR_NO_UNROLL_LOOP_START
17822-  while( decode < decode_end )
17823-  {
17824-    STBIR_NO_UNROLL(decode);
17825-    decode[0] = ((float)(input[stbir__decode_order0]));
17826-    #if stbir__coder_min_num >= 2
17827-    decode[1] = ((float)(input[stbir__decode_order1]));
17828-    #endif
17829-    #if stbir__coder_min_num >= 3
17830-    decode[2] = ((float)(input[stbir__decode_order2]));
17831-    #endif
17832-    decode += stbir__coder_min_num;
17833-    input += stbir__coder_min_num;
17834-  }
17835-  #endif
17836-  return decode_end;
17837-}
17838-
// Encodes floats to uint16 without scaling. The scalar code adds 0.5, clamps
// to [0,65535] and truncates to unsigned short; the SIMD code uses the
// add/pack macros with the 0.5 constant (presumably the same arithmetic --
// the macros are defined outside this block).
//   outputp              - destination unsigned shorts
//   width_times_channels - number of values to encode
//   encode               - source floats
17839-static void STBIR__CODER_NAME(stbir__encode_uint16_linear)( void * outputp, int width_times_channels, float const * encode )
17840-{
17841-  unsigned short STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned short*) outputp;
17842-  unsigned short * end_output = ( (unsigned short*) output ) + width_times_channels;
17843-
17844-  #ifdef STBIR_SIMD
17845-  {
// Wide path: two SIMD vectors (stbir__simdfX_float_count*2 values) per
// iteration, only when at least that many values exist.
17846-    if ( width_times_channels >= stbir__simdfX_float_count*2 )
17847-    {
17848-      float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2;
17849-      end_output -= stbir__simdfX_float_count*2;
17850-      STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
17851-      for(;;)
17852-      {
17853-        stbir__simdfX e0, e1;
17854-        stbir__simdiX i;
17855-        STBIR_SIMD_NO_UNROLL(encode);
17856-        stbir__simdfX_add_mem( e0, STBIR_simd_point5X, encode );
17857-        stbir__simdfX_add_mem( e1, STBIR_simd_point5X, encode+stbir__simdfX_float_count );
17858-        stbir__encode_simdfX_unflip( e0 );
17859-        stbir__encode_simdfX_unflip( e1 );
17860-        stbir__simdfX_pack_to_words( i, e0, e1 );
17861-        stbir__simdiX_store( output, i );
17862-        encode += stbir__simdfX_float_count*2;
17863-        output += stbir__simdfX_float_count*2;
// Full block still fits -> continue; exactly at the real end -> done;
// otherwise rewind so the final block covers the last values.
17864-        if ( output <= end_output )
17865-          continue;
17866-        if ( output == ( end_output + stbir__simdfX_float_count*2 ) )
17867-          break;
17868-        output = end_output; // backup and do last couple
17869-        encode = end_encode_m8;
17870-      }
17871-      return;
17872-    }
17873-  }
17874-
17875-  // try to do blocks of 4 when you can
17876-  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
// output is pre-advanced by 4; each iteration stores to output-4.
17877-  output += 4;
17878-  STBIR_NO_UNROLL_LOOP_START
17879-  while( output <= end_output )
17880-  {
17881-    stbir__simdf e;
17882-    stbir__simdi i;
17883-    STBIR_NO_UNROLL(encode);
17884-    stbir__simdf_load( e, encode );
17885-    stbir__simdf_add( e, STBIR__CONSTF(STBIR_simd_point5), e );
17886-    stbir__encode_simdf4_unflip( e );
17887-    stbir__simdf_pack_to_8words( i, e, e );  // only use first 4
17888-    stbir__simdi_store2( output-4, i );
17889-    output += 4;
17890-    encode += 4;
17891-  }
17892-  output -= 4;
17893-  #endif
17894-
17895-  #else
17896-
17897-  // try to do blocks of 4 when you can
17898-  #if  stbir__coder_min_num != 3 // doesn't divide cleanly by four
17899-  output += 4;
17900-  STBIR_SIMD_NO_UNROLL_LOOP_START
17901-  while( output <= end_output )
17902-  {
17903-    float f;
17904-    STBIR_SIMD_NO_UNROLL(encode);
17905-    f = encode[stbir__encode_order0] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[0-4] = (unsigned short)f;
17906-    f = encode[stbir__encode_order1] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[1-4] = (unsigned short)f;
17907-    f = encode[stbir__encode_order2] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[2-4] = (unsigned short)f;
17908-    f = encode[stbir__encode_order3] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[3-4] = (unsigned short)f;
17909-    output += 4;
17910-    encode += 4;
17911-  }
17912-  output -= 4;
17913-  #endif
17914-
17915-  #endif
17916-
// The STBIR_SIMD conditional is closed above, so this scalar remnant loop is
// compiled for both SIMD and non-SIMD builds.
17917-  // do the remnants
17918-  #if stbir__coder_min_num < 4
17919-  STBIR_NO_UNROLL_LOOP_START
17920-  while( output < end_output )
17921-  {
17922-    float f;
17923-    STBIR_NO_UNROLL(encode);
17924-    f = encode[stbir__encode_order0] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[0] = (unsigned short)f;
17925-    #if stbir__coder_min_num >= 2
17926-    f = encode[stbir__encode_order1] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[1] = (unsigned short)f;
17927-    #endif
17928-    #if stbir__coder_min_num >= 3
17929-    f = encode[stbir__encode_order2] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[2] = (unsigned short)f;
17930-    #endif
17931-    output += stbir__coder_min_num;
17932-    encode += stbir__coder_min_num;
17933-  }
17934-  #endif
17935-}
17936-
// Decodes stbir__FP16 input to floats via stbir__half_to_float (scalar) or
// stbir__half_to_float_SIMD (SIMD), with channels reordered through the
// stbir__decode_order* indices or the decode flip macros.
//   decodep              - destination floats (width_times_channels of them)
//   width_times_channels - number of values to decode
//   inputp               - source stbir__FP16 values
// Returns decodep + width_times_channels on every path.
17937-static float * STBIR__CODER_NAME(stbir__decode_half_float_linear)( float * decodep, int width_times_channels, void const * inputp )
17938-{
17939-  float STBIR_STREAMOUT_PTR( * ) decode = decodep;
17940-  float * decode_end = (float*) decode + width_times_channels;
17941-  stbir__FP16 const * input = (stbir__FP16 const *)inputp;
17942-
17943-  #ifdef STBIR_SIMD
// SIMD path: the pointers advance 8 values per iteration, only when at least
// 8 exist.
17944-  if ( width_times_channels >= 8 )
17945-  {
17946-    stbir__FP16 const * end_input_m8 = input + width_times_channels - 8;
17947-    decode_end -= 8;
17948-    STBIR_NO_UNROLL_LOOP_START_INF_FOR
17949-    for(;;)
17950-    {
17951-      STBIR_NO_UNROLL(decode);
17952-
17953-      stbir__half_to_float_SIMD( decode, input );
// With a swizzle defined, the freshly written floats are loaded back,
// flipped, and stored again in place.
17954-      #ifdef stbir__decode_swizzle
17955-      #ifdef STBIR_SIMD8
17956-      {
17957-        stbir__simdf8 of;
17958-        stbir__simdf8_load( of, decode );
17959-        stbir__decode_simdf8_flip( of );
17960-        stbir__simdf8_store( decode, of );
17961-      }
17962-      #else
17963-      {
17964-        stbir__simdf of0,of1;
17965-        stbir__simdf_load( of0, decode );
17966-        stbir__simdf_load( of1, decode+4 );
17967-        stbir__decode_simdf4_flip( of0 );
17968-        stbir__decode_simdf4_flip( of1 );
17969-        stbir__simdf_store( decode, of0 );
17970-        stbir__simdf_store( decode+4, of1 );
17971-      }
17972-      #endif
17973-      #endif
17974-      decode += 8;
17975-      input += 8;
// Full block still fits -> continue; exactly at the real end -> done;
// otherwise rewind so the final block covers the last 8 values.
17976-      if ( decode <= decode_end )
17977-        continue;
17978-      if ( decode == ( decode_end + 8 ) )
17979-        break;
17980-      decode = decode_end; // backup and do last couple
17981-      input = end_input_m8;
17982-    }
// decode_end was lowered by 8 above, so this is the original end pointer.
17983-    return decode_end + 8;
17984-  }
17985-  #endif
17986-
17987-  // try to do blocks of 4 when you can
17988-  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
// decode is pre-advanced by 4 and written through negative offsets so the
// loop condition is a plain <= against decode_end.
17989-  decode += 4;
17990-  STBIR_SIMD_NO_UNROLL_LOOP_START
17991-  while( decode <= decode_end )
17992-  {
17993-    STBIR_SIMD_NO_UNROLL(decode);
17994-    decode[0-4] = stbir__half_to_float(input[stbir__decode_order0]);
17995-    decode[1-4] = stbir__half_to_float(input[stbir__decode_order1]);
17996-    decode[2-4] = stbir__half_to_float(input[stbir__decode_order2]);
17997-    decode[3-4] = stbir__half_to_float(input[stbir__decode_order3]);
17998-    decode += 4;
17999-    input += 4;
18000-  }
18001-  decode -= 4;
18002-  #endif
18003-
18004-  // do the remnants
18005-  #if stbir__coder_min_num < 4
// Steps by stbir__coder_min_num values; compiled out when that is 4 or more.
18006-  STBIR_NO_UNROLL_LOOP_START
18007-  while( decode < decode_end )
18008-  {
18009-    STBIR_NO_UNROLL(decode);
18010-    decode[0] = stbir__half_to_float(input[stbir__decode_order0]);
18011-    #if stbir__coder_min_num >= 2
18012-    decode[1] = stbir__half_to_float(input[stbir__decode_order1]);
18013-    #endif
18014-    #if stbir__coder_min_num >= 3
18015-    decode[2] = stbir__half_to_float(input[stbir__decode_order2]);
18016-    #endif
18017-    decode += stbir__coder_min_num;
18018-    input += stbir__coder_min_num;
18019-  }
18020-  #endif
18021-  return decode_end;
18022-}
18023-
// Encodes floats to stbir__FP16 via stbir__float_to_half (scalar) or
// stbir__float_to_half_SIMD (SIMD), with channels reordered through the
// stbir__encode_order* indices or the encode unflip macros.
//   outputp              - destination stbir__FP16 values
//   width_times_channels - number of values to encode
//   encode               - source floats
18024-static void STBIR__CODER_NAME( stbir__encode_half_float_linear )( void * outputp, int width_times_channels, float const * encode )
18025-{
18026-  stbir__FP16 STBIR_SIMD_STREAMOUT_PTR( * ) output = (stbir__FP16*) outputp;
18027-  stbir__FP16 * end_output = ( (stbir__FP16*) output ) + width_times_channels;
18028-
18029-  #ifdef STBIR_SIMD
// SIMD path: the pointers advance 8 values per iteration, only when at least
// 8 exist.
18030-  if ( width_times_channels >= 8 )
18031-  {
18032-    float const * end_encode_m8 = encode + width_times_channels - 8;
18033-    end_output -= 8;
18034-    STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
18035-    for(;;)
18036-    {
18037-      STBIR_SIMD_NO_UNROLL(encode);
// With a swizzle defined (the test is on stbir__decode_swizzle), the source
// floats are loaded into locals, unflipped, and the converter reads them
// through a float* view of those locals; otherwise it reads encode directly.
18038-      #ifdef stbir__decode_swizzle
18039-      #ifdef STBIR_SIMD8
18040-      {
18041-        stbir__simdf8 of;
18042-        stbir__simdf8_load( of, encode );
18043-        stbir__encode_simdf8_unflip( of );
18044-        stbir__float_to_half_SIMD( output, (float*)&of );
18045-      }
18046-      #else
18047-      {
18048-        stbir__simdf of[2];
18049-        stbir__simdf_load( of[0], encode );
18050-        stbir__simdf_load( of[1], encode+4 );
18051-        stbir__encode_simdf4_unflip( of[0] );
18052-        stbir__encode_simdf4_unflip( of[1] );
18053-        stbir__float_to_half_SIMD( output, (float*)of );
18054-      }
18055-      #endif
18056-      #else
18057-      stbir__float_to_half_SIMD( output, encode );
18058-      #endif
18059-      encode += 8;
18060-      output += 8;
// Full block still fits -> continue; exactly at the real end -> done;
// otherwise rewind so the final block covers the last 8 values.
18061-      if ( output <= end_output )
18062-        continue;
18063-      if ( output == ( end_output + 8 ) )
18064-        break;
18065-      output = end_output; // backup and do last couple
18066-      encode = end_encode_m8;
18067-    }
18068-    return;
18069-  }
18070-  #endif
18071-
18072-  // try to do blocks of 4 when you can
18073-  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
// output is pre-advanced by 4 and written through negative offsets so the
// loop condition is a plain <= against end_output.
18074-  output += 4;
18075-  STBIR_SIMD_NO_UNROLL_LOOP_START
18076-  while( output <= end_output )
18077-  {
18078-    STBIR_SIMD_NO_UNROLL(output);
18079-    output[0-4] = stbir__float_to_half(encode[stbir__encode_order0]);
18080-    output[1-4] = stbir__float_to_half(encode[stbir__encode_order1]);
18081-    output[2-4] = stbir__float_to_half(encode[stbir__encode_order2]);
18082-    output[3-4] = stbir__float_to_half(encode[stbir__encode_order3]);
18083-    output += 4;
18084-    encode += 4;
18085-  }
18086-  output -= 4;
18087-  #endif
18088-
18089-  // do the remnants
18090-  #if stbir__coder_min_num < 4
// Steps by stbir__coder_min_num values; compiled out when that is 4 or more.
18091-  STBIR_NO_UNROLL_LOOP_START
18092-  while( output < end_output )
18093-  {
18094-    STBIR_NO_UNROLL(output);
18095-    output[0] = stbir__float_to_half(encode[stbir__encode_order0]);
18096-    #if stbir__coder_min_num >= 2
18097-    output[1] = stbir__float_to_half(encode[stbir__encode_order1]);
18098-    #endif
18099-    #if stbir__coder_min_num >= 3
18100-    output[2] = stbir__float_to_half(encode[stbir__encode_order2]);
18101-    #endif
18102-    output += stbir__coder_min_num;
18103-    encode += stbir__coder_min_num;
18104-  }
18105-  #endif
18106-}
18107-
// Decodes float input to float output. When stbir__decode_swizzle is defined
// the values are copied with channels reordered (stbir__decode_order* indices
// or the decode flip macros); otherwise the data is copied with STBIR_MEMCPY,
// and the copy is skipped when source and destination are the same pointer.
//   decodep              - destination floats (width_times_channels of them)
//   width_times_channels - number of values to decode
//   inputp               - source floats
// Returns decodep + width_times_channels on every path.
18108-static float * STBIR__CODER_NAME(stbir__decode_float_linear)( float * decodep, int width_times_channels, void const * inputp )
18109-{
18110-  #ifdef stbir__decode_swizzle
18111-  float STBIR_STREAMOUT_PTR( * ) decode = decodep;
18112-  float * decode_end = (float*) decode + width_times_channels;
18113-  float const * input = (float const *)inputp;
18114-
18115-  #ifdef STBIR_SIMD
// SIMD path: 16 values per iteration, only when at least 16 exist.
18116-  if ( width_times_channels >= 16 )
18117-  {
18118-    float const * end_input_m16 = input + width_times_channels - 16;
18119-    decode_end -= 16;
18120-    STBIR_NO_UNROLL_LOOP_START_INF_FOR
18121-    for(;;)
18122-    {
18123-      STBIR_NO_UNROLL(decode);
// NOTE(review): this inner #ifdef repeats the test that already encloses the
// whole branch, so it is always true here.
18124-      #ifdef stbir__decode_swizzle
18125-      #ifdef STBIR_SIMD8
18126-      {
18127-        stbir__simdf8 of0,of1;
18128-        stbir__simdf8_load( of0, input );
18129-        stbir__simdf8_load( of1, input+8 );
18130-        stbir__decode_simdf8_flip( of0 );
18131-        stbir__decode_simdf8_flip( of1 );
18132-        stbir__simdf8_store( decode, of0 );
18133-        stbir__simdf8_store( decode+8, of1 );
18134-      }
18135-      #else
18136-      {
18137-        stbir__simdf of0,of1,of2,of3;
18138-        stbir__simdf_load( of0, input );
18139-        stbir__simdf_load( of1, input+4 );
18140-        stbir__simdf_load( of2, input+8 );
18141-        stbir__simdf_load( of3, input+12 );
18142-        stbir__decode_simdf4_flip( of0 );
18143-        stbir__decode_simdf4_flip( of1 );
18144-        stbir__decode_simdf4_flip( of2 );
18145-        stbir__decode_simdf4_flip( of3 );
18146-        stbir__simdf_store( decode, of0 );
18147-        stbir__simdf_store( decode+4, of1 );
18148-        stbir__simdf_store( decode+8, of2 );
18149-        stbir__simdf_store( decode+12, of3 );
18150-      }
18151-      #endif
18152-      #endif
18153-      decode += 16;
18154-      input += 16;
// Full block still fits -> continue; exactly at the real end -> done;
// otherwise rewind so the final block covers the last 16 values.
18155-      if ( decode <= decode_end )
18156-        continue;
18157-      if ( decode == ( decode_end + 16 ) )
18158-        break;
18159-      decode = decode_end; // backup and do last couple
18160-      input = end_input_m16;
18161-    }
// decode_end was lowered by 16 above, so this is the original end pointer.
18162-    return decode_end + 16;
18163-  }
18164-  #endif
18165-
18166-  // try to do blocks of 4 when you can
18167-  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
// decode is pre-advanced by 4 and written through negative offsets so the
// loop condition is a plain <= against decode_end.
18168-  decode += 4;
18169-  STBIR_SIMD_NO_UNROLL_LOOP_START
18170-  while( decode <= decode_end )
18171-  {
18172-    STBIR_SIMD_NO_UNROLL(decode);
18173-    decode[0-4] = input[stbir__decode_order0];
18174-    decode[1-4] = input[stbir__decode_order1];
18175-    decode[2-4] = input[stbir__decode_order2];
18176-    decode[3-4] = input[stbir__decode_order3];
18177-    decode += 4;
18178-    input += 4;
18179-  }
18180-  decode -= 4;
18181-  #endif
18182-
18183-  // do the remnants
18184-  #if stbir__coder_min_num < 4
// Steps by stbir__coder_min_num values; compiled out when that is 4 or more.
18185-  STBIR_NO_UNROLL_LOOP_START
18186-  while( decode < decode_end )
18187-  {
18188-    STBIR_NO_UNROLL(decode);
18189-    decode[0] = input[stbir__decode_order0];
18190-    #if stbir__coder_min_num >= 2
18191-    decode[1] = input[stbir__decode_order1];
18192-    #endif
18193-    #if stbir__coder_min_num >= 3
18194-    decode[2] = input[stbir__decode_order2];
18195-    #endif
18196-    decode += stbir__coder_min_num;
18197-    input += stbir__coder_min_num;
18198-  }
18199-  #endif
18200-  return decode_end;
18201-
18202-  #else
18203-
// No swizzle: a straight copy, skipped when decoding in place.
18204-  if ( (void*)decodep != inputp )
18205-    STBIR_MEMCPY( decodep, inputp, width_times_channels * sizeof( float ) );
18206-
18207-  return decodep + width_times_channels;
18208-
18209-  #endif
18210-}
18211-
18212-static void STBIR__CODER_NAME( stbir__encode_float_linear )( void * outputp, int width_times_channels, float const * encode )
18213-{
18214-  #if !defined( STBIR_FLOAT_HIGH_CLAMP ) && !defined(STBIR_FLOAT_LO_CLAMP) && !defined(stbir__decode_swizzle)
18215-
18216-  if ( (void*)outputp != (void*) encode )
18217-    STBIR_MEMCPY( outputp, encode, width_times_channels * sizeof( float ) );
18218-
18219-  #else
18220-
18221-  float STBIR_SIMD_STREAMOUT_PTR( * ) output = (float*) outputp;
18222-  float * end_output = ( (float*) output ) + width_times_channels;
18223-
18224-  #ifdef STBIR_FLOAT_HIGH_CLAMP
18225-  #define stbir_scalar_hi_clamp( v ) if ( v > STBIR_FLOAT_HIGH_CLAMP ) v = STBIR_FLOAT_HIGH_CLAMP;
18226-  #else
18227-  #define stbir_scalar_hi_clamp( v )
18228-  #endif
18229-  #ifdef STBIR_FLOAT_LOW_CLAMP
18230-  #define stbir_scalar_lo_clamp( v ) if ( v < STBIR_FLOAT_LOW_CLAMP ) v = STBIR_FLOAT_LOW_CLAMP;
18231-  #else
18232-  #define stbir_scalar_lo_clamp( v )
18233-  #endif
18234-
18235-  #ifdef STBIR_SIMD
18236-
18237-  #ifdef STBIR_FLOAT_HIGH_CLAMP
18238-  const stbir__simdfX high_clamp = stbir__simdf_frepX(STBIR_FLOAT_HIGH_CLAMP);
18239-  #endif
18240-  #ifdef STBIR_FLOAT_LOW_CLAMP
18241-  const stbir__simdfX low_clamp = stbir__simdf_frepX(STBIR_FLOAT_LOW_CLAMP);
18242-  #endif
18243-
18244-  if ( width_times_channels >= ( stbir__simdfX_float_count * 2 ) )
18245-  {
18246-    float const * end_encode_m8 = encode + width_times_channels - ( stbir__simdfX_float_count * 2 );
18247-    end_output -= ( stbir__simdfX_float_count * 2 );
18248-    STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
18249-    for(;;)
18250-    {
18251-      stbir__simdfX e0, e1;
18252-      STBIR_SIMD_NO_UNROLL(encode);
18253-      stbir__simdfX_load( e0, encode );
18254-      stbir__simdfX_load( e1, encode+stbir__simdfX_float_count );
18255-#ifdef STBIR_FLOAT_HIGH_CLAMP
18256-      stbir__simdfX_min( e0, e0, high_clamp );
18257-      stbir__simdfX_min( e1, e1, high_clamp );
18258+#ifdef STBIR_SIMD
18259+	if (width_times_channels >= stbir__simdfX_float_count * 2) {
18260+		float const *end_encode_m8 =
18261+		    encode + width_times_channels - stbir__simdfX_float_count * 2;
18262+		end_output -= stbir__simdfX_float_count * 2;
18263+		STBIR_NO_UNROLL_LOOP_START_INF_FOR
18264+		for (;;) {
18265+			stbir__simdfX e0, e1;
18266+			stbir__simdi i;
18267+			STBIR_SIMD_NO_UNROLL(encode);
18268+			stbir__simdfX_madd_mem(e0, STBIR_simd_point5X,
18269+			                       STBIR_max_uint8_as_floatX, encode);
18270+			stbir__simdfX_madd_mem(e1, STBIR_simd_point5X,
18271+			                       STBIR_max_uint8_as_floatX,
18272+			                       encode + stbir__simdfX_float_count);
18273+			stbir__encode_simdfX_unflip(e0);
18274+			stbir__encode_simdfX_unflip(e1);
18275+#ifdef STBIR_SIMD8
18276+			stbir__simdf8_pack_to_16bytes(i, e0, e1);
18277+			stbir__simdi_store(output, i);
18278+#else
18279+			stbir__simdf_pack_to_8bytes(i, e0, e1);
18280+			stbir__simdi_store2(output, i);
18281 #endif
18282-#ifdef STBIR_FLOAT_LOW_CLAMP
18283-      stbir__simdfX_max( e0, e0, low_clamp );
18284-      stbir__simdfX_max( e1, e1, low_clamp );
18285-#endif
18286-      stbir__encode_simdfX_unflip( e0 );
18287-      stbir__encode_simdfX_unflip( e1 );
18288-      stbir__simdfX_store( output, e0 );
18289-      stbir__simdfX_store( output+stbir__simdfX_float_count, e1 );
18290-      encode += stbir__simdfX_float_count * 2;
18291-      output += stbir__simdfX_float_count * 2;
18292-      if ( output < end_output )
18293-        continue;
18294-      if ( output == ( end_output + ( stbir__simdfX_float_count * 2 ) ) )
18295-        break;
18296-      output = end_output; // backup and do last couple
18297-      encode = end_encode_m8;
18298-    }
18299-    return;
18300-  }
18301-
18302-  // try to do blocks of 4 when you can
18303-  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
18304-  output += 4;
18305-  STBIR_NO_UNROLL_LOOP_START
18306-  while( output <= end_output )
18307-  {
18308-    stbir__simdf e0;
18309-    STBIR_NO_UNROLL(encode);
18310-    stbir__simdf_load( e0, encode );
18311-#ifdef STBIR_FLOAT_HIGH_CLAMP
18312-    stbir__simdf_min( e0, e0, high_clamp );
18313+			encode += stbir__simdfX_float_count * 2;
18314+			output += stbir__simdfX_float_count * 2;
18315+			if (output <= end_output) {
18316+				continue;
18317+			}
18318+			if (output == (end_output + stbir__simdfX_float_count * 2)) {
18319+				break;
18320+			}
18321+			output = end_output; // backup and do last couple
18322+			encode = end_encode_m8;
18323+		}
18324+		return;
18325+	}
18326+
18327+// try to do blocks of 4 when you can
18328+#if stbir__coder_min_num != 3 // doesn't divide cleanly by four
18329+	output += 4;
18330+	STBIR_NO_UNROLL_LOOP_START
18331+	while (output <= end_output) {
18332+		stbir__simdf e0;
18333+		stbir__simdi i0;
18334+		STBIR_NO_UNROLL(encode);
18335+		stbir__simdf_load(e0, encode);
18336+		stbir__simdf_madd(e0, STBIR__CONSTF(STBIR_simd_point5),
18337+		                  STBIR__CONSTF(STBIR_max_uint8_as_float), e0);
18338+		stbir__encode_simdf4_unflip(e0);
18339+		stbir__simdf_pack_to_8bytes(i0, e0, e0); // only use first 4
18340+		*(int *)(output - 4) = stbir__simdi_to_int(i0);
18341+		output += 4;
18342+		encode += 4;
18343+	}
18344+	output -= 4;
18345 #endif
18346-#ifdef STBIR_FLOAT_LOW_CLAMP
18347-    stbir__simdf_max( e0, e0, low_clamp );
18348-#endif
18349-    stbir__encode_simdf4_unflip( e0 );
18350-    stbir__simdf_store( output-4, e0 );
18351-    output += 4;
18352-    encode += 4;
18353-  }
18354-  output -= 4;
18355-  #endif
18356-
18357-  #else
18358-
18359-  // try to do blocks of 4 when you can
18360-  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
18361-  output += 4;
18362-  STBIR_SIMD_NO_UNROLL_LOOP_START
18363-  while( output <= end_output )
18364-  {
18365-    float e;
18366-    STBIR_SIMD_NO_UNROLL(encode);
18367-    e = encode[ stbir__encode_order0 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[0-4] = e;
18368-    e = encode[ stbir__encode_order1 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[1-4] = e;
18369-    e = encode[ stbir__encode_order2 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[2-4] = e;
18370-    e = encode[ stbir__encode_order3 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[3-4] = e;
18371-    output += 4;
18372-    encode += 4;
18373-  }
18374-  output -= 4;
18375-
18376-  #endif
18377-
18378-  #endif
18379-
18380-  // do the remnants
18381-  #if stbir__coder_min_num < 4
18382-  STBIR_NO_UNROLL_LOOP_START
18383-  while( output < end_output )
18384-  {
18385-    float e;
18386-    STBIR_NO_UNROLL(encode);
18387-    e = encode[ stbir__encode_order0 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[0] = e;
18388-    #if stbir__coder_min_num >= 2
18389-    e = encode[ stbir__encode_order1 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[1] = e;
18390-    #endif
18391-    #if stbir__coder_min_num >= 3
18392-    e = encode[ stbir__encode_order2 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[2] = e;
18393-    #endif
18394-    output += stbir__coder_min_num;
18395-    encode += stbir__coder_min_num;
18396-  }
18397-  #endif
18398-
18399-  #endif
18400-}
18401+
18402+// do the remnants
18403+#if stbir__coder_min_num < 4
18404+	STBIR_NO_UNROLL_LOOP_START
18405+	while (output < end_output) {
18406+		stbir__simdf e0;
18407+		STBIR_NO_UNROLL(encode);
18408+		stbir__simdf_madd1_mem(e0, STBIR__CONSTF(STBIR_simd_point5),
18409+		                       STBIR__CONSTF(STBIR_max_uint8_as_float),
18410+		                       encode + stbir__encode_order0);
18411+		output[0] = stbir__simdf_convert_float_to_uint8(e0);
18412+#if stbir__coder_min_num >= 2
18413+		stbir__simdf_madd1_mem(e0, STBIR__CONSTF(STBIR_simd_point5),
18414+		                       STBIR__CONSTF(STBIR_max_uint8_as_float),
18415+		                       encode + stbir__encode_order1);
18416+		output[1] = stbir__simdf_convert_float_to_uint8(e0);
18417+#endif
18418+#if stbir__coder_min_num >= 3
18419+		stbir__simdf_madd1_mem(e0, STBIR__CONSTF(STBIR_simd_point5),
18420+		                       STBIR__CONSTF(STBIR_max_uint8_as_float),
18421+		                       encode + stbir__encode_order2);
18422+		output[2] = stbir__simdf_convert_float_to_uint8(e0);
18423+#endif
18424+		output += stbir__coder_min_num;
18425+		encode += stbir__coder_min_num;
18426+	}
18427+#endif
18428+
18429+#else
18430+
18431+// try to do blocks of 4 when you can
18432+#if stbir__coder_min_num != 3 // doesn't divide cleanly by four
18433+	output += 4;
18434+	while (output <= end_output) {
18435+		float f;
18436+		f = encode[stbir__encode_order0] * stbir__max_uint8_as_float + 0.5f;
18437+		STBIR_CLAMP(f, 0, 255);
18438+		output[0 - 4] = (unsigned char)f;
18439+		f = encode[stbir__encode_order1] * stbir__max_uint8_as_float + 0.5f;
18440+		STBIR_CLAMP(f, 0, 255);
18441+		output[1 - 4] = (unsigned char)f;
18442+		f = encode[stbir__encode_order2] * stbir__max_uint8_as_float + 0.5f;
18443+		STBIR_CLAMP(f, 0, 255);
18444+		output[2 - 4] = (unsigned char)f;
18445+		f = encode[stbir__encode_order3] * stbir__max_uint8_as_float + 0.5f;
18446+		STBIR_CLAMP(f, 0, 255);
18447+		output[3 - 4] = (unsigned char)f;
18448+		output += 4;
18449+		encode += 4;
18450+	}
18451+	output -= 4;
18452+#endif
18453+
18454+// do the remnants
18455+#if stbir__coder_min_num < 4
18456+	STBIR_NO_UNROLL_LOOP_START
18457+	while (output < end_output) {
18458+		float f;
18459+		STBIR_NO_UNROLL(encode);
18460+		f = encode[stbir__encode_order0] * stbir__max_uint8_as_float + 0.5f;
18461+		STBIR_CLAMP(f, 0, 255);
18462+		output[0] = (unsigned char)f;
18463+#if stbir__coder_min_num >= 2
18464+		f = encode[stbir__encode_order1] * stbir__max_uint8_as_float + 0.5f;
18465+		STBIR_CLAMP(f, 0, 255);
18466+		output[1] = (unsigned char)f;
18467+#endif
18468+#if stbir__coder_min_num >= 3
18469+		f = encode[stbir__encode_order2] * stbir__max_uint8_as_float + 0.5f;
18470+		STBIR_CLAMP(f, 0, 255);
18471+		output[2] = (unsigned char)f;
18472+#endif
18473+		output += stbir__coder_min_num;
18474+		encode += stbir__coder_min_num;
18475+	}
18476+#endif
18477+#endif
18478+}
18479+
18480+static float *
18481+STBIR__CODER_NAME(stbir__decode_uint8_linear)(float *decodep,
18482+                                              int width_times_channels,
18483+                                              void const *inputp)
18484+{
18485+	float STBIR_STREAMOUT_PTR(*) decode = decodep;
18486+	float *decode_end = (float *)decode + width_times_channels;
18487+	unsigned char const *input = (unsigned char const *)inputp;
18488+
18489+#ifdef STBIR_SIMD
18490+	unsigned char const *end_input_m16 = input + width_times_channels - 16;
18491+	if (width_times_channels >= 16) {
18492+		decode_end -= 16;
18493+		STBIR_NO_UNROLL_LOOP_START_INF_FOR
18494+		for (;;) {
18495+#ifdef STBIR_SIMD8
18496+			stbir__simdi i;
18497+			stbir__simdi8 o0, o1;
18498+			stbir__simdf8 of0, of1;
18499+			STBIR_NO_UNROLL(decode);
18500+			stbir__simdi_load(i, input);
18501+			stbir__simdi8_expand_u8_to_u32(o0, o1, i);
18502+			stbir__simdi8_convert_i32_to_float(of0, o0);
18503+			stbir__simdi8_convert_i32_to_float(of1, o1);
18504+			stbir__decode_simdf8_flip(of0);
18505+			stbir__decode_simdf8_flip(of1);
18506+			stbir__simdf8_store(decode + 0, of0);
18507+			stbir__simdf8_store(decode + 8, of1);
18508+#else
18509+			stbir__simdi i, o0, o1, o2, o3;
18510+			stbir__simdf of0, of1, of2, of3;
18511+			STBIR_NO_UNROLL(decode);
18512+			stbir__simdi_load(i, input);
18513+			stbir__simdi_expand_u8_to_u32(o0, o1, o2, o3, i);
18514+			stbir__simdi_convert_i32_to_float(of0, o0);
18515+			stbir__simdi_convert_i32_to_float(of1, o1);
18516+			stbir__simdi_convert_i32_to_float(of2, o2);
18517+			stbir__simdi_convert_i32_to_float(of3, o3);
18518+			stbir__decode_simdf4_flip(of0);
18519+			stbir__decode_simdf4_flip(of1);
18520+			stbir__decode_simdf4_flip(of2);
18521+			stbir__decode_simdf4_flip(of3);
18522+			stbir__simdf_store(decode + 0, of0);
18523+			stbir__simdf_store(decode + 4, of1);
18524+			stbir__simdf_store(decode + 8, of2);
18525+			stbir__simdf_store(decode + 12, of3);
18526+#endif
18527+			decode += 16;
18528+			input += 16;
18529+			if (decode <= decode_end) {
18530+				continue;
18531+			}
18532+			if (decode == (decode_end + 16)) {
18533+				break;
18534+			}
18535+			decode = decode_end; // backup and do last couple
18536+			input = end_input_m16;
18537+		}
18538+		return decode_end + 16;
18539+	}
18540+#endif
18541+
18542+// try to do blocks of 4 when you can
18543+#if stbir__coder_min_num != 3 // doesn't divide cleanly by four
18544+	decode += 4;
18545+	STBIR_SIMD_NO_UNROLL_LOOP_START
18546+	while (decode <= decode_end) {
18547+		STBIR_SIMD_NO_UNROLL(decode);
18548+		decode[0 - 4] = ((float)(input[stbir__decode_order0]));
18549+		decode[1 - 4] = ((float)(input[stbir__decode_order1]));
18550+		decode[2 - 4] = ((float)(input[stbir__decode_order2]));
18551+		decode[3 - 4] = ((float)(input[stbir__decode_order3]));
18552+		decode += 4;
18553+		input += 4;
18554+	}
18555+	decode -= 4;
18556+#endif
18557+
18558+// do the remnants
18559+#if stbir__coder_min_num < 4
18560+	STBIR_NO_UNROLL_LOOP_START
18561+	while (decode < decode_end) {
18562+		STBIR_NO_UNROLL(decode);
18563+		decode[0] = ((float)(input[stbir__decode_order0]));
18564+#if stbir__coder_min_num >= 2
18565+		decode[1] = ((float)(input[stbir__decode_order1]));
18566+#endif
18567+#if stbir__coder_min_num >= 3
18568+		decode[2] = ((float)(input[stbir__decode_order2]));
18569+#endif
18570+		decode += stbir__coder_min_num;
18571+		input += stbir__coder_min_num;
18572+	}
18573+#endif
18574+	return decode_end;
18575+}
18576+
18577+static void
18578+STBIR__CODER_NAME(stbir__encode_uint8_linear)(void *outputp,
18579+                                              int width_times_channels,
18580+                                              float const *encode)
18581+{
18582+	unsigned char STBIR_SIMD_STREAMOUT_PTR(*) output = (unsigned char *)outputp;
18583+	unsigned char *end_output =
18584+	    ((unsigned char *)output) + width_times_channels;
18585+
18586+#ifdef STBIR_SIMD
18587+	if (width_times_channels >= stbir__simdfX_float_count * 2) {
18588+		float const *end_encode_m8 =
18589+		    encode + width_times_channels - stbir__simdfX_float_count * 2;
18590+		end_output -= stbir__simdfX_float_count * 2;
18591+		STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
18592+		for (;;) {
18593+			stbir__simdfX e0, e1;
18594+			stbir__simdi i;
18595+			STBIR_SIMD_NO_UNROLL(encode);
18596+			stbir__simdfX_add_mem(e0, STBIR_simd_point5X, encode);
18597+			stbir__simdfX_add_mem(e1, STBIR_simd_point5X,
18598+			                      encode + stbir__simdfX_float_count);
18599+			stbir__encode_simdfX_unflip(e0);
18600+			stbir__encode_simdfX_unflip(e1);
18601+#ifdef STBIR_SIMD8
18602+			stbir__simdf8_pack_to_16bytes(i, e0, e1);
18603+			stbir__simdi_store(output, i);
18604+#else
18605+			stbir__simdf_pack_to_8bytes(i, e0, e1);
18606+			stbir__simdi_store2(output, i);
18607+#endif
18608+			encode += stbir__simdfX_float_count * 2;
18609+			output += stbir__simdfX_float_count * 2;
18610+			if (output <= end_output) {
18611+				continue;
18612+			}
18613+			if (output == (end_output + stbir__simdfX_float_count * 2)) {
18614+				break;
18615+			}
18616+			output = end_output; // backup and do last couple
18617+			encode = end_encode_m8;
18618+		}
18619+		return;
18620+	}
18621+
18622+// try to do blocks of 4 when you can
18623+#if stbir__coder_min_num != 3 // doesn't divide cleanly by four
18624+	output += 4;
18625+	STBIR_NO_UNROLL_LOOP_START
18626+	while (output <= end_output) {
18627+		stbir__simdf e0;
18628+		stbir__simdi i0;
18629+		STBIR_NO_UNROLL(encode);
18630+		stbir__simdf_load(e0, encode);
18631+		stbir__simdf_add(e0, STBIR__CONSTF(STBIR_simd_point5), e0);
18632+		stbir__encode_simdf4_unflip(e0);
18633+		stbir__simdf_pack_to_8bytes(i0, e0, e0); // only use first 4
18634+		*(int *)(output - 4) = stbir__simdi_to_int(i0);
18635+		output += 4;
18636+		encode += 4;
18637+	}
18638+	output -= 4;
18639+#endif
18640+
18641+#else
18642+
18643+// try to do blocks of 4 when you can
18644+#if stbir__coder_min_num != 3 // doesn't divide cleanly by four
18645+	output += 4;
18646+	while (output <= end_output) {
18647+		float f;
18648+		f = encode[stbir__encode_order0] + 0.5f;
18649+		STBIR_CLAMP(f, 0, 255);
18650+		output[0 - 4] = (unsigned char)f;
18651+		f = encode[stbir__encode_order1] + 0.5f;
18652+		STBIR_CLAMP(f, 0, 255);
18653+		output[1 - 4] = (unsigned char)f;
18654+		f = encode[stbir__encode_order2] + 0.5f;
18655+		STBIR_CLAMP(f, 0, 255);
18656+		output[2 - 4] = (unsigned char)f;
18657+		f = encode[stbir__encode_order3] + 0.5f;
18658+		STBIR_CLAMP(f, 0, 255);
18659+		output[3 - 4] = (unsigned char)f;
18660+		output += 4;
18661+		encode += 4;
18662+	}
18663+	output -= 4;
18664+#endif
18665+
18666+#endif
18667+
18668+// do the remnants
18669+#if stbir__coder_min_num < 4
18670+	STBIR_NO_UNROLL_LOOP_START
18671+	while (output < end_output) {
18672+		float f;
18673+		STBIR_NO_UNROLL(encode);
18674+		f = encode[stbir__encode_order0] + 0.5f;
18675+		STBIR_CLAMP(f, 0, 255);
18676+		output[0] = (unsigned char)f;
18677+#if stbir__coder_min_num >= 2
18678+		f = encode[stbir__encode_order1] + 0.5f;
18679+		STBIR_CLAMP(f, 0, 255);
18680+		output[1] = (unsigned char)f;
18681+#endif
18682+#if stbir__coder_min_num >= 3
18683+		f = encode[stbir__encode_order2] + 0.5f;
18684+		STBIR_CLAMP(f, 0, 255);
18685+		output[2] = (unsigned char)f;
18686+#endif
18687+		output += stbir__coder_min_num;
18688+		encode += stbir__coder_min_num;
18689+	}
18690+#endif
18691+}
18692+
18693+static float *
18694+STBIR__CODER_NAME(stbir__decode_uint8_srgb)(float *decodep,
18695+                                            int width_times_channels,
18696+                                            void const *inputp)
18697+{
18698+	float STBIR_STREAMOUT_PTR(*) decode = decodep;
18699+	float *decode_end = (float *)decode + width_times_channels;
18700+	unsigned char const *input = (unsigned char const *)inputp;
18701+
18702+// try to do blocks of 4 when you can
18703+#if stbir__coder_min_num != 3 // doesn't divide cleanly by four
18704+	decode += 4;
18705+	while (decode <= decode_end) {
18706+		decode[0 - 4] =
18707+		    stbir__srgb_uchar_to_linear_float[input[stbir__decode_order0]];
18708+		decode[1 - 4] =
18709+		    stbir__srgb_uchar_to_linear_float[input[stbir__decode_order1]];
18710+		decode[2 - 4] =
18711+		    stbir__srgb_uchar_to_linear_float[input[stbir__decode_order2]];
18712+		decode[3 - 4] =
18713+		    stbir__srgb_uchar_to_linear_float[input[stbir__decode_order3]];
18714+		decode += 4;
18715+		input += 4;
18716+	}
18717+	decode -= 4;
18718+#endif
18719+
18720+// do the remnants
18721+#if stbir__coder_min_num < 4
18722+	STBIR_NO_UNROLL_LOOP_START
18723+	while (decode < decode_end) {
18724+		STBIR_NO_UNROLL(decode);
18725+		decode[0] =
18726+		    stbir__srgb_uchar_to_linear_float[input[stbir__decode_order0]];
18727+#if stbir__coder_min_num >= 2
18728+		decode[1] =
18729+		    stbir__srgb_uchar_to_linear_float[input[stbir__decode_order1]];
18730+#endif
18731+#if stbir__coder_min_num >= 3
18732+		decode[2] =
18733+		    stbir__srgb_uchar_to_linear_float[input[stbir__decode_order2]];
18734+#endif
18735+		decode += stbir__coder_min_num;
18736+		input += stbir__coder_min_num;
18737+	}
18738+#endif
18739+	return decode_end;
18740+}
18741+
18742+#define stbir__min_max_shift20(i, f)                                           \
18743+	stbir__simdf_max(f, f,                                                     \
18744+	                 stbir_simdf_casti(STBIR__CONSTI(STBIR_almost_zero)));     \
18745+	stbir__simdf_min(f, f,                                                     \
18746+	                 stbir_simdf_casti(STBIR__CONSTI(STBIR_almost_one)));      \
18747+	stbir__simdi_32shr(i, stbir_simdi_castf(f), 20);
18748+
18749+#define stbir__scale_and_convert(i, f)                                         \
18750+	stbir__simdf_madd(f, STBIR__CONSTF(STBIR_simd_point5),                     \
18751+	                  STBIR__CONSTF(STBIR_max_uint8_as_float), f);             \
18752+	stbir__simdf_max(f, f, stbir__simdf_zeroP());                              \
18753+	stbir__simdf_min(f, f, STBIR__CONSTF(STBIR_max_uint8_as_float));           \
18754+	stbir__simdf_convert_float_to_i32(i, f);
18755+
18756+#define stbir__linear_to_srgb_finish(i, f)                                     \
18757+	{                                                                          \
18758+		stbir__simdi temp;                                                     \
18759+		stbir__simdi_32shr(temp, stbir_simdi_castf(f), 12);                    \
18760+		stbir__simdi_and(temp, temp, STBIR__CONSTI(STBIR_mastissa_mask));      \
18761+		stbir__simdi_or(temp, temp, STBIR__CONSTI(STBIR_topscale));            \
18762+		stbir__simdi_16madd(i, i, temp);                                       \
18763+		stbir__simdi_32shr(i, i, 16);                                          \
18764+	}
18765+
18766+#define stbir__simdi_table_lookup2(v0, v1, table)                              \
18767+	{                                                                          \
18768+		stbir__simdi_u32 temp0, temp1;                                         \
18769+		temp0.m128i_i128 = v0;                                                 \
18770+		temp1.m128i_i128 = v1;                                                 \
18771+		temp0.m128i_u32[0] = table[temp0.m128i_i32[0]];                        \
18772+		temp0.m128i_u32[1] = table[temp0.m128i_i32[1]];                        \
18773+		temp0.m128i_u32[2] = table[temp0.m128i_i32[2]];                        \
18774+		temp0.m128i_u32[3] = table[temp0.m128i_i32[3]];                        \
18775+		temp1.m128i_u32[0] = table[temp1.m128i_i32[0]];                        \
18776+		temp1.m128i_u32[1] = table[temp1.m128i_i32[1]];                        \
18777+		temp1.m128i_u32[2] = table[temp1.m128i_i32[2]];                        \
18778+		temp1.m128i_u32[3] = table[temp1.m128i_i32[3]];                        \
18779+		v0 = temp0.m128i_i128;                                                 \
18780+		v1 = temp1.m128i_i128;                                                 \
18781+	}
18782+
18783+#define stbir__simdi_table_lookup3(v0, v1, v2, table)                          \
18784+	{                                                                          \
18785+		stbir__simdi_u32 temp0, temp1, temp2;                                  \
18786+		temp0.m128i_i128 = v0;                                                 \
18787+		temp1.m128i_i128 = v1;                                                 \
18788+		temp2.m128i_i128 = v2;                                                 \
18789+		temp0.m128i_u32[0] = table[temp0.m128i_i32[0]];                        \
18790+		temp0.m128i_u32[1] = table[temp0.m128i_i32[1]];                        \
18791+		temp0.m128i_u32[2] = table[temp0.m128i_i32[2]];                        \
18792+		temp0.m128i_u32[3] = table[temp0.m128i_i32[3]];                        \
18793+		temp1.m128i_u32[0] = table[temp1.m128i_i32[0]];                        \
18794+		temp1.m128i_u32[1] = table[temp1.m128i_i32[1]];                        \
18795+		temp1.m128i_u32[2] = table[temp1.m128i_i32[2]];                        \
18796+		temp1.m128i_u32[3] = table[temp1.m128i_i32[3]];                        \
18797+		temp2.m128i_u32[0] = table[temp2.m128i_i32[0]];                        \
18798+		temp2.m128i_u32[1] = table[temp2.m128i_i32[1]];                        \
18799+		temp2.m128i_u32[2] = table[temp2.m128i_i32[2]];                        \
18800+		temp2.m128i_u32[3] = table[temp2.m128i_i32[3]];                        \
18801+		v0 = temp0.m128i_i128;                                                 \
18802+		v1 = temp1.m128i_i128;                                                 \
18803+		v2 = temp2.m128i_i128;                                                 \
18804+	}
18805+
18806+#define stbir__simdi_table_lookup4(v0, v1, v2, v3, table)                      \
18807+	{                                                                          \
18808+		stbir__simdi_u32 temp0, temp1, temp2, temp3;                           \
18809+		temp0.m128i_i128 = v0;                                                 \
18810+		temp1.m128i_i128 = v1;                                                 \
18811+		temp2.m128i_i128 = v2;                                                 \
18812+		temp3.m128i_i128 = v3;                                                 \
18813+		temp0.m128i_u32[0] = table[temp0.m128i_i32[0]];                        \
18814+		temp0.m128i_u32[1] = table[temp0.m128i_i32[1]];                        \
18815+		temp0.m128i_u32[2] = table[temp0.m128i_i32[2]];                        \
18816+		temp0.m128i_u32[3] = table[temp0.m128i_i32[3]];                        \
18817+		temp1.m128i_u32[0] = table[temp1.m128i_i32[0]];                        \
18818+		temp1.m128i_u32[1] = table[temp1.m128i_i32[1]];                        \
18819+		temp1.m128i_u32[2] = table[temp1.m128i_i32[2]];                        \
18820+		temp1.m128i_u32[3] = table[temp1.m128i_i32[3]];                        \
18821+		temp2.m128i_u32[0] = table[temp2.m128i_i32[0]];                        \
18822+		temp2.m128i_u32[1] = table[temp2.m128i_i32[1]];                        \
18823+		temp2.m128i_u32[2] = table[temp2.m128i_i32[2]];                        \
18824+		temp2.m128i_u32[3] = table[temp2.m128i_i32[3]];                        \
18825+		temp3.m128i_u32[0] = table[temp3.m128i_i32[0]];                        \
18826+		temp3.m128i_u32[1] = table[temp3.m128i_i32[1]];                        \
18827+		temp3.m128i_u32[2] = table[temp3.m128i_i32[2]];                        \
18828+		temp3.m128i_u32[3] = table[temp3.m128i_i32[3]];                        \
18829+		v0 = temp0.m128i_i128;                                                 \
18830+		v1 = temp1.m128i_i128;                                                 \
18831+		v2 = temp2.m128i_i128;                                                 \
18832+		v3 = temp3.m128i_i128;                                                 \
18833+	}
18834+
18835+static void
18836+STBIR__CODER_NAME(stbir__encode_uint8_srgb)(void *outputp,
18837+                                            int width_times_channels,
18838+                                            float const *encode)
18839+{
18840+	unsigned char STBIR_SIMD_STREAMOUT_PTR(*) output = (unsigned char *)outputp;
18841+	unsigned char *end_output =
18842+	    ((unsigned char *)output) + width_times_channels;
18843+
18844+#ifdef STBIR_SIMD
18845+
18846+	if (width_times_channels >= 16) {
18847+		float const *end_encode_m16 = encode + width_times_channels - 16;
18848+		end_output -= 16;
18849+		STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
18850+		for (;;) {
18851+			stbir__simdf f0, f1, f2, f3;
18852+			stbir__simdi i0, i1, i2, i3;
18853+			STBIR_SIMD_NO_UNROLL(encode);
18854+
18855+			stbir__simdf_load4_transposed(f0, f1, f2, f3, encode);
18856+
18857+			stbir__min_max_shift20(i0, f0);
18858+			stbir__min_max_shift20(i1, f1);
18859+			stbir__min_max_shift20(i2, f2);
18860+			stbir__min_max_shift20(i3, f3);
18861+
18862+			stbir__simdi_table_lookup4(i0, i1, i2, i3,
18863+			                           (fp32_to_srgb8_tab4 - (127 - 13) * 8));
18864+
18865+			stbir__linear_to_srgb_finish(i0, f0);
18866+			stbir__linear_to_srgb_finish(i1, f1);
18867+			stbir__linear_to_srgb_finish(i2, f2);
18868+			stbir__linear_to_srgb_finish(i3, f3);
18869+
18870+			stbir__interleave_pack_and_store_16_u8(
18871+			    output, STBIR_strs_join1(i, , stbir__encode_order0),
18872+			    STBIR_strs_join1(i, , stbir__encode_order1),
18873+			    STBIR_strs_join1(i, , stbir__encode_order2),
18874+			    STBIR_strs_join1(i, , stbir__encode_order3));
18875+
18876+			encode += 16;
18877+			output += 16;
18878+			if (output <= end_output) {
18879+				continue;
18880+			}
18881+			if (output == (end_output + 16)) {
18882+				break;
18883+			}
18884+			output = end_output; // backup and do last couple
18885+			encode = end_encode_m16;
18886+		}
18887+		return;
18888+	}
18889+#endif
18890+
18891+// try to do blocks of 4 when you can
18892+#if stbir__coder_min_num != 3 // doesn't divide cleanly by four
18893+	output += 4;
18894+	STBIR_SIMD_NO_UNROLL_LOOP_START
18895+	while (output <= end_output) {
18896+		STBIR_SIMD_NO_UNROLL(encode);
18897+
18898+		output[0 - 4] =
18899+		    stbir__linear_to_srgb_uchar(encode[stbir__encode_order0]);
18900+		output[1 - 4] =
18901+		    stbir__linear_to_srgb_uchar(encode[stbir__encode_order1]);
18902+		output[2 - 4] =
18903+		    stbir__linear_to_srgb_uchar(encode[stbir__encode_order2]);
18904+		output[3 - 4] =
18905+		    stbir__linear_to_srgb_uchar(encode[stbir__encode_order3]);
18906+
18907+		output += 4;
18908+		encode += 4;
18909+	}
18910+	output -= 4;
18911+#endif
18912+
18913+// do the remnants
18914+#if stbir__coder_min_num < 4
18915+	STBIR_NO_UNROLL_LOOP_START
18916+	while (output < end_output) {
18917+		STBIR_NO_UNROLL(encode);
18918+		output[0] = stbir__linear_to_srgb_uchar(encode[stbir__encode_order0]);
18919+#if stbir__coder_min_num >= 2
18920+		output[1] = stbir__linear_to_srgb_uchar(encode[stbir__encode_order1]);
18921+#endif
18922+#if stbir__coder_min_num >= 3
18923+		output[2] = stbir__linear_to_srgb_uchar(encode[stbir__encode_order2]);
18924+#endif
18925+		output += stbir__coder_min_num;
18926+		encode += stbir__coder_min_num;
18927+	}
18928+#endif
18929+}
18930+
18931+#if (stbir__coder_min_num == 4) ||                                             \
18932+    ((stbir__coder_min_num == 1) && (!defined(stbir__decode_swizzle)))
18933+
18934+static float *
18935+STBIR__CODER_NAME(stbir__decode_uint8_srgb4_linearalpha)(
18936+    float *decodep, int width_times_channels, void const *inputp)
18937+{
18938+	float STBIR_STREAMOUT_PTR(*) decode = decodep;
18939+	float *decode_end = (float *)decode + width_times_channels;
18940+	unsigned char const *input = (unsigned char const *)inputp;
18941+
18942+	do {
18943+		decode[0] =
18944+		    stbir__srgb_uchar_to_linear_float[input[stbir__decode_order0]];
18945+		decode[1] =
18946+		    stbir__srgb_uchar_to_linear_float[input[stbir__decode_order1]];
18947+		decode[2] =
18948+		    stbir__srgb_uchar_to_linear_float[input[stbir__decode_order2]];
18949+		decode[3] = ((float)input[stbir__decode_order3]) *
18950+		            stbir__max_uint8_as_float_inverted;
18951+		input += 4;
18952+		decode += 4;
18953+	} while (decode < decode_end);
18954+	return decode_end;
18955+}
18956+
18957+static void
18958+STBIR__CODER_NAME(stbir__encode_uint8_srgb4_linearalpha)(
18959+    void *outputp, int width_times_channels, float const *encode)
18960+{
18961+	unsigned char STBIR_SIMD_STREAMOUT_PTR(*) output = (unsigned char *)outputp;
18962+	unsigned char *end_output =
18963+	    ((unsigned char *)output) + width_times_channels;
18964+
18965+#ifdef STBIR_SIMD
18966+
18967+	if (width_times_channels >= 16) {
18968+		float const *end_encode_m16 = encode + width_times_channels - 16;
18969+		end_output -= 16;
18970+		STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
18971+		for (;;) {
18972+			stbir__simdf f0, f1, f2, f3;
18973+			stbir__simdi i0, i1, i2, i3;
18974+
18975+			STBIR_SIMD_NO_UNROLL(encode);
18976+			stbir__simdf_load4_transposed(f0, f1, f2, f3, encode);
18977+
18978+			stbir__min_max_shift20(i0, f0);
18979+			stbir__min_max_shift20(i1, f1);
18980+			stbir__min_max_shift20(i2, f2);
18981+			stbir__scale_and_convert(i3, f3);
18982+
18983+			stbir__simdi_table_lookup3(i0, i1, i2,
18984+			                           (fp32_to_srgb8_tab4 - (127 - 13) * 8));
18985+
18986+			stbir__linear_to_srgb_finish(i0, f0);
18987+			stbir__linear_to_srgb_finish(i1, f1);
18988+			stbir__linear_to_srgb_finish(i2, f2);
18989+
18990+			stbir__interleave_pack_and_store_16_u8(
18991+			    output, STBIR_strs_join1(i, , stbir__encode_order0),
18992+			    STBIR_strs_join1(i, , stbir__encode_order1),
18993+			    STBIR_strs_join1(i, , stbir__encode_order2),
18994+			    STBIR_strs_join1(i, , stbir__encode_order3));
18995+
18996+			output += 16;
18997+			encode += 16;
18998+
18999+			if (output <= end_output) {
19000+				continue;
19001+			}
19002+			if (output == (end_output + 16)) {
19003+				break;
19004+			}
19005+			output = end_output; // backup and do last couple
19006+			encode = end_encode_m16;
19007+		}
19008+		return;
19009+	}
19010+#endif
19011+
19012+	STBIR_SIMD_NO_UNROLL_LOOP_START
19013+	do {
19014+		float f;
19015+		STBIR_SIMD_NO_UNROLL(encode);
19016+
19017+		output[stbir__decode_order0] = stbir__linear_to_srgb_uchar(encode[0]);
19018+		output[stbir__decode_order1] = stbir__linear_to_srgb_uchar(encode[1]);
19019+		output[stbir__decode_order2] = stbir__linear_to_srgb_uchar(encode[2]);
19020+
19021+		f = encode[3] * stbir__max_uint8_as_float + 0.5f;
19022+		STBIR_CLAMP(f, 0, 255);
19023+		output[stbir__decode_order3] = (unsigned char)f;
19024+
19025+		output += 4;
19026+		encode += 4;
19027+	} while (output < end_output);
19028+}
19029+
19030+#endif
19031+
19032+#if (stbir__coder_min_num == 2) ||                                             \
19033+    ((stbir__coder_min_num == 1) && (!defined(stbir__decode_swizzle)))
19034+
19035+static float *
19036+STBIR__CODER_NAME(stbir__decode_uint8_srgb2_linearalpha)(
19037+    float *decodep, int width_times_channels, void const *inputp)
19038+{
19039+	float STBIR_STREAMOUT_PTR(*) decode = decodep;
19040+	float *decode_end = (float *)decode + width_times_channels;
19041+	unsigned char const *input = (unsigned char const *)inputp;
19042+
19043+	decode += 4;
19044+	while (decode <= decode_end) {
19045+		decode[0 - 4] =
19046+		    stbir__srgb_uchar_to_linear_float[input[stbir__decode_order0]];
19047+		decode[1 - 4] = ((float)input[stbir__decode_order1]) *
19048+		                stbir__max_uint8_as_float_inverted;
19049+		decode[2 - 4] =
19050+		    stbir__srgb_uchar_to_linear_float[input[stbir__decode_order0 + 2]];
19051+		decode[3 - 4] = ((float)input[stbir__decode_order1 + 2]) *
19052+		                stbir__max_uint8_as_float_inverted;
19053+		input += 4;
19054+		decode += 4;
19055+	}
19056+	decode -= 4;
19057+	if (decode < decode_end) {
19058+		decode[0] = stbir__srgb_uchar_to_linear_float[stbir__decode_order0];
19059+		decode[1] = ((float)input[stbir__decode_order1]) *
19060+		            stbir__max_uint8_as_float_inverted;
19061+	}
19062+	return decode_end;
19063+}
19064+
19065+static void
19066+STBIR__CODER_NAME(stbir__encode_uint8_srgb2_linearalpha)(
19067+    void *outputp, int width_times_channels, float const *encode)
19068+{
19069+	unsigned char STBIR_SIMD_STREAMOUT_PTR(*) output = (unsigned char *)outputp;
19070+	unsigned char *end_output =
19071+	    ((unsigned char *)output) + width_times_channels;
19072+
19073+#ifdef STBIR_SIMD
19074+
19075+	if (width_times_channels >= 16) {
19076+		float const *end_encode_m16 = encode + width_times_channels - 16;
19077+		end_output -= 16;
19078+		STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
19079+		for (;;) {
19080+			stbir__simdf f0, f1, f2, f3;
19081+			stbir__simdi i0, i1, i2, i3;
19082+
19083+			STBIR_SIMD_NO_UNROLL(encode);
19084+			stbir__simdf_load4_transposed(f0, f1, f2, f3, encode);
19085+
19086+			stbir__min_max_shift20(i0, f0);
19087+			stbir__scale_and_convert(i1, f1);
19088+			stbir__min_max_shift20(i2, f2);
19089+			stbir__scale_and_convert(i3, f3);
19090+
19091+			stbir__simdi_table_lookup2(i0, i2,
19092+			                           (fp32_to_srgb8_tab4 - (127 - 13) * 8));
19093+
19094+			stbir__linear_to_srgb_finish(i0, f0);
19095+			stbir__linear_to_srgb_finish(i2, f2);
19096+
19097+			stbir__interleave_pack_and_store_16_u8(
19098+			    output, STBIR_strs_join1(i, , stbir__encode_order0),
19099+			    STBIR_strs_join1(i, , stbir__encode_order1),
19100+			    STBIR_strs_join1(i, , stbir__encode_order2),
19101+			    STBIR_strs_join1(i, , stbir__encode_order3));
19102+
19103+			output += 16;
19104+			encode += 16;
19105+			if (output <= end_output) {
19106+				continue;
19107+			}
19108+			if (output == (end_output + 16)) {
19109+				break;
19110+			}
19111+			output = end_output; // backup and do last couple
19112+			encode = end_encode_m16;
19113+		}
19114+		return;
19115+	}
19116+#endif
19117+
19118+	STBIR_SIMD_NO_UNROLL_LOOP_START
19119+	do {
19120+		float f;
19121+		STBIR_SIMD_NO_UNROLL(encode);
19122+
19123+		output[stbir__decode_order0] = stbir__linear_to_srgb_uchar(encode[0]);
19124+
19125+		f = encode[1] * stbir__max_uint8_as_float + 0.5f;
19126+		STBIR_CLAMP(f, 0, 255);
19127+		output[stbir__decode_order1] = (unsigned char)f;
19128+
19129+		output += 2;
19130+		encode += 2;
19131+	} while (output < end_output);
19132+}
19133+
19134+#endif
19135+
19136+static float *
19137+STBIR__CODER_NAME(stbir__decode_uint16_linear_scaled)(float *decodep,
19138+                                                      int width_times_channels,
19139+                                                      void const *inputp)
19140+{
19141+	float STBIR_STREAMOUT_PTR(*) decode = decodep;
19142+	float *decode_end = (float *)decode + width_times_channels;
19143+	unsigned short const *input = (unsigned short const *)inputp;
19144+
19145+#ifdef STBIR_SIMD
19146+	unsigned short const *end_input_m8 = input + width_times_channels - 8;
19147+	if (width_times_channels >= 8) {
19148+		decode_end -= 8;
19149+		STBIR_NO_UNROLL_LOOP_START_INF_FOR
19150+		for (;;) {
19151+#ifdef STBIR_SIMD8
19152+			stbir__simdi i;
19153+			stbir__simdi8 o;
19154+			stbir__simdf8 of;
19155+			STBIR_NO_UNROLL(decode);
19156+			stbir__simdi_load(i, input);
19157+			stbir__simdi8_expand_u16_to_u32(o, i);
19158+			stbir__simdi8_convert_i32_to_float(of, o);
19159+			stbir__simdf8_mult(of, of, STBIR_max_uint16_as_float_inverted8);
19160+			stbir__decode_simdf8_flip(of);
19161+			stbir__simdf8_store(decode + 0, of);
19162+#else
19163+			stbir__simdi i, o0, o1;
19164+			stbir__simdf of0, of1;
19165+			STBIR_NO_UNROLL(decode);
19166+			stbir__simdi_load(i, input);
19167+			stbir__simdi_expand_u16_to_u32(o0, o1, i);
19168+			stbir__simdi_convert_i32_to_float(of0, o0);
19169+			stbir__simdi_convert_i32_to_float(of1, o1);
19170+			stbir__simdf_mult(
19171+			    of0, of0, STBIR__CONSTF(STBIR_max_uint16_as_float_inverted));
19172+			stbir__simdf_mult(
19173+			    of1, of1, STBIR__CONSTF(STBIR_max_uint16_as_float_inverted));
19174+			stbir__decode_simdf4_flip(of0);
19175+			stbir__decode_simdf4_flip(of1);
19176+			stbir__simdf_store(decode + 0, of0);
19177+			stbir__simdf_store(decode + 4, of1);
19178+#endif
19179+			decode += 8;
19180+			input += 8;
19181+			if (decode <= decode_end) {
19182+				continue;
19183+			}
19184+			if (decode == (decode_end + 8)) {
19185+				break;
19186+			}
19187+			decode = decode_end; // backup and do last couple
19188+			input = end_input_m8;
19189+		}
19190+		return decode_end + 8;
19191+	}
19192+#endif
19193+
19194+// try to do blocks of 4 when you can
19195+#if stbir__coder_min_num != 3 // doesn't divide cleanly by four
19196+	decode += 4;
19197+	STBIR_SIMD_NO_UNROLL_LOOP_START
19198+	while (decode <= decode_end) {
19199+		STBIR_SIMD_NO_UNROLL(decode);
19200+		decode[0 - 4] = ((float)(input[stbir__decode_order0])) *
19201+		                stbir__max_uint16_as_float_inverted;
19202+		decode[1 - 4] = ((float)(input[stbir__decode_order1])) *
19203+		                stbir__max_uint16_as_float_inverted;
19204+		decode[2 - 4] = ((float)(input[stbir__decode_order2])) *
19205+		                stbir__max_uint16_as_float_inverted;
19206+		decode[3 - 4] = ((float)(input[stbir__decode_order3])) *
19207+		                stbir__max_uint16_as_float_inverted;
19208+		decode += 4;
19209+		input += 4;
19210+	}
19211+	decode -= 4;
19212+#endif
19213+
19214+// do the remnants
19215+#if stbir__coder_min_num < 4
19216+	STBIR_NO_UNROLL_LOOP_START
19217+	while (decode < decode_end) {
19218+		STBIR_NO_UNROLL(decode);
19219+		decode[0] = ((float)(input[stbir__decode_order0])) *
19220+		            stbir__max_uint16_as_float_inverted;
19221+#if stbir__coder_min_num >= 2
19222+		decode[1] = ((float)(input[stbir__decode_order1])) *
19223+		            stbir__max_uint16_as_float_inverted;
19224+#endif
19225+#if stbir__coder_min_num >= 3
19226+		decode[2] = ((float)(input[stbir__decode_order2])) *
19227+		            stbir__max_uint16_as_float_inverted;
19228+#endif
19229+		decode += stbir__coder_min_num;
19230+		input += stbir__coder_min_num;
19231+	}
19232+#endif
19233+	return decode_end;
19234+}
19235+
19236+static void
19237+STBIR__CODER_NAME(stbir__encode_uint16_linear_scaled)(void *outputp,
19238+                                                      int width_times_channels,
19239+                                                      float const *encode)
19240+{
19241+	unsigned short STBIR_SIMD_STREAMOUT_PTR(*) output =
19242+	    (unsigned short *)outputp;
19243+	unsigned short *end_output =
19244+	    ((unsigned short *)output) + width_times_channels;
19245+
19246+#ifdef STBIR_SIMD
19247+	{
19248+		if (width_times_channels >= stbir__simdfX_float_count * 2) {
19249+			float const *end_encode_m8 =
19250+			    encode + width_times_channels - stbir__simdfX_float_count * 2;
19251+			end_output -= stbir__simdfX_float_count * 2;
19252+			STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
19253+			for (;;) {
19254+				stbir__simdfX e0, e1;
19255+				stbir__simdiX i;
19256+				STBIR_SIMD_NO_UNROLL(encode);
19257+				stbir__simdfX_madd_mem(e0, STBIR_simd_point5X,
19258+				                       STBIR_max_uint16_as_floatX, encode);
19259+				stbir__simdfX_madd_mem(e1, STBIR_simd_point5X,
19260+				                       STBIR_max_uint16_as_floatX,
19261+				                       encode + stbir__simdfX_float_count);
19262+				stbir__encode_simdfX_unflip(e0);
19263+				stbir__encode_simdfX_unflip(e1);
19264+				stbir__simdfX_pack_to_words(i, e0, e1);
19265+				stbir__simdiX_store(output, i);
19266+				encode += stbir__simdfX_float_count * 2;
19267+				output += stbir__simdfX_float_count * 2;
19268+				if (output <= end_output) {
19269+					continue;
19270+				}
19271+				if (output == (end_output + stbir__simdfX_float_count * 2)) {
19272+					break;
19273+				}
19274+				output = end_output; // backup and do last couple
19275+				encode = end_encode_m8;
19276+			}
19277+			return;
19278+		}
19279+	}
19280+
19281+// try to do blocks of 4 when you can
19282+#if stbir__coder_min_num != 3 // doesn't divide cleanly by four
19283+	output += 4;
19284+	STBIR_NO_UNROLL_LOOP_START
19285+	while (output <= end_output) {
19286+		stbir__simdf e;
19287+		stbir__simdi i;
19288+		STBIR_NO_UNROLL(encode);
19289+		stbir__simdf_load(e, encode);
19290+		stbir__simdf_madd(e, STBIR__CONSTF(STBIR_simd_point5),
19291+		                  STBIR__CONSTF(STBIR_max_uint16_as_float), e);
19292+		stbir__encode_simdf4_unflip(e);
19293+		stbir__simdf_pack_to_8words(i, e, e); // only use first 4
19294+		stbir__simdi_store2(output - 4, i);
19295+		output += 4;
19296+		encode += 4;
19297+	}
19298+	output -= 4;
19299+#endif
19300+
19301+// do the remnants
19302+#if stbir__coder_min_num < 4
19303+	STBIR_NO_UNROLL_LOOP_START
19304+	while (output < end_output) {
19305+		stbir__simdf e;
19306+		STBIR_NO_UNROLL(encode);
19307+		stbir__simdf_madd1_mem(e, STBIR__CONSTF(STBIR_simd_point5),
19308+		                       STBIR__CONSTF(STBIR_max_uint16_as_float),
19309+		                       encode + stbir__encode_order0);
19310+		output[0] = stbir__simdf_convert_float_to_short(e);
19311+#if stbir__coder_min_num >= 2
19312+		stbir__simdf_madd1_mem(e, STBIR__CONSTF(STBIR_simd_point5),
19313+		                       STBIR__CONSTF(STBIR_max_uint16_as_float),
19314+		                       encode + stbir__encode_order1);
19315+		output[1] = stbir__simdf_convert_float_to_short(e);
19316+#endif
19317+#if stbir__coder_min_num >= 3
19318+		stbir__simdf_madd1_mem(e, STBIR__CONSTF(STBIR_simd_point5),
19319+		                       STBIR__CONSTF(STBIR_max_uint16_as_float),
19320+		                       encode + stbir__encode_order2);
19321+		output[2] = stbir__simdf_convert_float_to_short(e);
19322+#endif
19323+		output += stbir__coder_min_num;
19324+		encode += stbir__coder_min_num;
19325+	}
19326+#endif
19327+
19328+#else
19329+
19330+// try to do blocks of 4 when you can
19331+#if stbir__coder_min_num != 3 // doesn't divide cleanly by four
19332+	output += 4;
19333+	STBIR_SIMD_NO_UNROLL_LOOP_START
19334+	while (output <= end_output) {
19335+		float f;
19336+		STBIR_SIMD_NO_UNROLL(encode);
19337+		f = encode[stbir__encode_order0] * stbir__max_uint16_as_float + 0.5f;
19338+		STBIR_CLAMP(f, 0, 65535);
19339+		output[0 - 4] = (unsigned short)f;
19340+		f = encode[stbir__encode_order1] * stbir__max_uint16_as_float + 0.5f;
19341+		STBIR_CLAMP(f, 0, 65535);
19342+		output[1 - 4] = (unsigned short)f;
19343+		f = encode[stbir__encode_order2] * stbir__max_uint16_as_float + 0.5f;
19344+		STBIR_CLAMP(f, 0, 65535);
19345+		output[2 - 4] = (unsigned short)f;
19346+		f = encode[stbir__encode_order3] * stbir__max_uint16_as_float + 0.5f;
19347+		STBIR_CLAMP(f, 0, 65535);
19348+		output[3 - 4] = (unsigned short)f;
19349+		output += 4;
19350+		encode += 4;
19351+	}
19352+	output -= 4;
19353+#endif
19354+
19355+// do the remnants
19356+#if stbir__coder_min_num < 4
19357+	STBIR_NO_UNROLL_LOOP_START
19358+	while (output < end_output) {
19359+		float f;
19360+		STBIR_NO_UNROLL(encode);
19361+		f = encode[stbir__encode_order0] * stbir__max_uint16_as_float + 0.5f;
19362+		STBIR_CLAMP(f, 0, 65535);
19363+		output[0] = (unsigned short)f;
19364+#if stbir__coder_min_num >= 2
19365+		f = encode[stbir__encode_order1] * stbir__max_uint16_as_float + 0.5f;
19366+		STBIR_CLAMP(f, 0, 65535);
19367+		output[1] = (unsigned short)f;
19368+#endif
19369+#if stbir__coder_min_num >= 3
19370+		f = encode[stbir__encode_order2] * stbir__max_uint16_as_float + 0.5f;
19371+		STBIR_CLAMP(f, 0, 65535);
19372+		output[2] = (unsigned short)f;
19373+#endif
19374+		output += stbir__coder_min_num;
19375+		encode += stbir__coder_min_num;
19376+	}
19377+#endif
19378+#endif
19379+}
19380+
19381+static float *
19382+STBIR__CODER_NAME(stbir__decode_uint16_linear)(float *decodep,
19383+                                               int width_times_channels,
19384+                                               void const *inputp)
19385+{
19386+	float STBIR_STREAMOUT_PTR(*) decode = decodep;
19387+	float *decode_end = (float *)decode + width_times_channels;
19388+	unsigned short const *input = (unsigned short const *)inputp;
19389+
19390+#ifdef STBIR_SIMD
19391+	unsigned short const *end_input_m8 = input + width_times_channels - 8;
19392+	if (width_times_channels >= 8) {
19393+		decode_end -= 8;
19394+		STBIR_NO_UNROLL_LOOP_START_INF_FOR
19395+		for (;;) {
19396+#ifdef STBIR_SIMD8
19397+			stbir__simdi i;
19398+			stbir__simdi8 o;
19399+			stbir__simdf8 of;
19400+			STBIR_NO_UNROLL(decode);
19401+			stbir__simdi_load(i, input);
19402+			stbir__simdi8_expand_u16_to_u32(o, i);
19403+			stbir__simdi8_convert_i32_to_float(of, o);
19404+			stbir__decode_simdf8_flip(of);
19405+			stbir__simdf8_store(decode + 0, of);
19406+#else
19407+			stbir__simdi i, o0, o1;
19408+			stbir__simdf of0, of1;
19409+			STBIR_NO_UNROLL(decode);
19410+			stbir__simdi_load(i, input);
19411+			stbir__simdi_expand_u16_to_u32(o0, o1, i);
19412+			stbir__simdi_convert_i32_to_float(of0, o0);
19413+			stbir__simdi_convert_i32_to_float(of1, o1);
19414+			stbir__decode_simdf4_flip(of0);
19415+			stbir__decode_simdf4_flip(of1);
19416+			stbir__simdf_store(decode + 0, of0);
19417+			stbir__simdf_store(decode + 4, of1);
19418+#endif
19419+			decode += 8;
19420+			input += 8;
19421+			if (decode <= decode_end) {
19422+				continue;
19423+			}
19424+			if (decode == (decode_end + 8)) {
19425+				break;
19426+			}
19427+			decode = decode_end; // backup and do last couple
19428+			input = end_input_m8;
19429+		}
19430+		return decode_end + 8;
19431+	}
19432+#endif
19433+
19434+// try to do blocks of 4 when you can
19435+#if stbir__coder_min_num != 3 // doesn't divide cleanly by four
19436+	decode += 4;
19437+	STBIR_SIMD_NO_UNROLL_LOOP_START
19438+	while (decode <= decode_end) {
19439+		STBIR_SIMD_NO_UNROLL(decode);
19440+		decode[0 - 4] = ((float)(input[stbir__decode_order0]));
19441+		decode[1 - 4] = ((float)(input[stbir__decode_order1]));
19442+		decode[2 - 4] = ((float)(input[stbir__decode_order2]));
19443+		decode[3 - 4] = ((float)(input[stbir__decode_order3]));
19444+		decode += 4;
19445+		input += 4;
19446+	}
19447+	decode -= 4;
19448+#endif
19449+
19450+// do the remnants
19451+#if stbir__coder_min_num < 4
19452+	STBIR_NO_UNROLL_LOOP_START
19453+	while (decode < decode_end) {
19454+		STBIR_NO_UNROLL(decode);
19455+		decode[0] = ((float)(input[stbir__decode_order0]));
19456+#if stbir__coder_min_num >= 2
19457+		decode[1] = ((float)(input[stbir__decode_order1]));
19458+#endif
19459+#if stbir__coder_min_num >= 3
19460+		decode[2] = ((float)(input[stbir__decode_order2]));
19461+#endif
19462+		decode += stbir__coder_min_num;
19463+		input += stbir__coder_min_num;
19464+	}
19465+#endif
19466+	return decode_end;
19467+}
19468+
19469+static void
19470+STBIR__CODER_NAME(stbir__encode_uint16_linear)(void *outputp,
19471+                                               int width_times_channels,
19472+                                               float const *encode)
19473+{
19474+	unsigned short STBIR_SIMD_STREAMOUT_PTR(*) output =
19475+	    (unsigned short *)outputp;
19476+	unsigned short *end_output =
19477+	    ((unsigned short *)output) + width_times_channels;
19478+
19479+#ifdef STBIR_SIMD
19480+	{
19481+		if (width_times_channels >= stbir__simdfX_float_count * 2) {
19482+			float const *end_encode_m8 =
19483+			    encode + width_times_channels - stbir__simdfX_float_count * 2;
19484+			end_output -= stbir__simdfX_float_count * 2;
19485+			STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
19486+			for (;;) {
19487+				stbir__simdfX e0, e1;
19488+				stbir__simdiX i;
19489+				STBIR_SIMD_NO_UNROLL(encode);
19490+				stbir__simdfX_add_mem(e0, STBIR_simd_point5X, encode);
19491+				stbir__simdfX_add_mem(e1, STBIR_simd_point5X,
19492+				                      encode + stbir__simdfX_float_count);
19493+				stbir__encode_simdfX_unflip(e0);
19494+				stbir__encode_simdfX_unflip(e1);
19495+				stbir__simdfX_pack_to_words(i, e0, e1);
19496+				stbir__simdiX_store(output, i);
19497+				encode += stbir__simdfX_float_count * 2;
19498+				output += stbir__simdfX_float_count * 2;
19499+				if (output <= end_output) {
19500+					continue;
19501+				}
19502+				if (output == (end_output + stbir__simdfX_float_count * 2)) {
19503+					break;
19504+				}
19505+				output = end_output; // backup and do last couple
19506+				encode = end_encode_m8;
19507+			}
19508+			return;
19509+		}
19510+	}
19511+
19512+// try to do blocks of 4 when you can
19513+#if stbir__coder_min_num != 3 // doesn't divide cleanly by four
19514+	output += 4;
19515+	STBIR_NO_UNROLL_LOOP_START
19516+	while (output <= end_output) {
19517+		stbir__simdf e;
19518+		stbir__simdi i;
19519+		STBIR_NO_UNROLL(encode);
19520+		stbir__simdf_load(e, encode);
19521+		stbir__simdf_add(e, STBIR__CONSTF(STBIR_simd_point5), e);
19522+		stbir__encode_simdf4_unflip(e);
19523+		stbir__simdf_pack_to_8words(i, e, e); // only use first 4
19524+		stbir__simdi_store2(output - 4, i);
19525+		output += 4;
19526+		encode += 4;
19527+	}
19528+	output -= 4;
19529+#endif
19530+
19531+#else
19532+
19533+// try to do blocks of 4 when you can
19534+#if stbir__coder_min_num != 3 // doesn't divide cleanly by four
19535+	output += 4;
19536+	STBIR_SIMD_NO_UNROLL_LOOP_START
19537+	while (output <= end_output) {
19538+		float f;
19539+		STBIR_SIMD_NO_UNROLL(encode);
19540+		f = encode[stbir__encode_order0] + 0.5f;
19541+		STBIR_CLAMP(f, 0, 65535);
19542+		output[0 - 4] = (unsigned short)f;
19543+		f = encode[stbir__encode_order1] + 0.5f;
19544+		STBIR_CLAMP(f, 0, 65535);
19545+		output[1 - 4] = (unsigned short)f;
19546+		f = encode[stbir__encode_order2] + 0.5f;
19547+		STBIR_CLAMP(f, 0, 65535);
19548+		output[2 - 4] = (unsigned short)f;
19549+		f = encode[stbir__encode_order3] + 0.5f;
19550+		STBIR_CLAMP(f, 0, 65535);
19551+		output[3 - 4] = (unsigned short)f;
19552+		output += 4;
19553+		encode += 4;
19554+	}
19555+	output -= 4;
19556+#endif
19557+
19558+#endif
19559+
19560+// do the remnants
19561+#if stbir__coder_min_num < 4
19562+	STBIR_NO_UNROLL_LOOP_START
19563+	while (output < end_output) {
19564+		float f;
19565+		STBIR_NO_UNROLL(encode);
19566+		f = encode[stbir__encode_order0] + 0.5f;
19567+		STBIR_CLAMP(f, 0, 65535);
19568+		output[0] = (unsigned short)f;
19569+#if stbir__coder_min_num >= 2
19570+		f = encode[stbir__encode_order1] + 0.5f;
19571+		STBIR_CLAMP(f, 0, 65535);
19572+		output[1] = (unsigned short)f;
19573+#endif
19574+#if stbir__coder_min_num >= 3
19575+		f = encode[stbir__encode_order2] + 0.5f;
19576+		STBIR_CLAMP(f, 0, 65535);
19577+		output[2] = (unsigned short)f;
19578+#endif
19579+		output += stbir__coder_min_num;
19580+		encode += stbir__coder_min_num;
19581+	}
19582+#endif
19583+}
19584+
19585+static float *
19586+STBIR__CODER_NAME(stbir__decode_half_float_linear)(float *decodep,
19587+                                                   int width_times_channels,
19588+                                                   void const *inputp)
19589+{
19590+	float STBIR_STREAMOUT_PTR(*) decode = decodep;
19591+	float *decode_end = (float *)decode + width_times_channels;
19592+	stbir__FP16 const *input = (stbir__FP16 const *)inputp;
19593+
19594+#ifdef STBIR_SIMD
19595+	if (width_times_channels >= 8) {
19596+		stbir__FP16 const *end_input_m8 = input + width_times_channels - 8;
19597+		decode_end -= 8;
19598+		STBIR_NO_UNROLL_LOOP_START_INF_FOR
19599+		for (;;) {
19600+			STBIR_NO_UNROLL(decode);
19601+
19602+			stbir__half_to_float_SIMD(decode, input);
19603+#ifdef stbir__decode_swizzle
19604+#ifdef STBIR_SIMD8
19605+			{
19606+				stbir__simdf8 of;
19607+				stbir__simdf8_load(of, decode);
19608+				stbir__decode_simdf8_flip(of);
19609+				stbir__simdf8_store(decode, of);
19610+			}
19611+#else
19612+			{
19613+				stbir__simdf of0, of1;
19614+				stbir__simdf_load(of0, decode);
19615+				stbir__simdf_load(of1, decode + 4);
19616+				stbir__decode_simdf4_flip(of0);
19617+				stbir__decode_simdf4_flip(of1);
19618+				stbir__simdf_store(decode, of0);
19619+				stbir__simdf_store(decode + 4, of1);
19620+			}
19621+#endif
19622+#endif
19623+			decode += 8;
19624+			input += 8;
19625+			if (decode <= decode_end) {
19626+				continue;
19627+			}
19628+			if (decode == (decode_end + 8)) {
19629+				break;
19630+			}
19631+			decode = decode_end; // backup and do last couple
19632+			input = end_input_m8;
19633+		}
19634+		return decode_end + 8;
19635+	}
19636+#endif
19637+
19638+// try to do blocks of 4 when you can
19639+#if stbir__coder_min_num != 3 // doesn't divide cleanly by four
19640+	decode += 4;
19641+	STBIR_SIMD_NO_UNROLL_LOOP_START
19642+	while (decode <= decode_end) {
19643+		STBIR_SIMD_NO_UNROLL(decode);
19644+		decode[0 - 4] = stbir__half_to_float(input[stbir__decode_order0]);
19645+		decode[1 - 4] = stbir__half_to_float(input[stbir__decode_order1]);
19646+		decode[2 - 4] = stbir__half_to_float(input[stbir__decode_order2]);
19647+		decode[3 - 4] = stbir__half_to_float(input[stbir__decode_order3]);
19648+		decode += 4;
19649+		input += 4;
19650+	}
19651+	decode -= 4;
19652+#endif
19653+
19654+// do the remnants
19655+#if stbir__coder_min_num < 4
19656+	STBIR_NO_UNROLL_LOOP_START
19657+	while (decode < decode_end) {
19658+		STBIR_NO_UNROLL(decode);
19659+		decode[0] = stbir__half_to_float(input[stbir__decode_order0]);
19660+#if stbir__coder_min_num >= 2
19661+		decode[1] = stbir__half_to_float(input[stbir__decode_order1]);
19662+#endif
19663+#if stbir__coder_min_num >= 3
19664+		decode[2] = stbir__half_to_float(input[stbir__decode_order2]);
19665+#endif
19666+		decode += stbir__coder_min_num;
19667+		input += stbir__coder_min_num;
19668+	}
19669+#endif
19670+	return decode_end;
19671+}
19672+
19673+static void
19674+STBIR__CODER_NAME(stbir__encode_half_float_linear)(void *outputp,
19675+                                                   int width_times_channels,
19676+                                                   float const *encode)
19677+{
19678+	stbir__FP16 STBIR_SIMD_STREAMOUT_PTR(*) output = (stbir__FP16 *)outputp;
19679+	stbir__FP16 *end_output = ((stbir__FP16 *)output) + width_times_channels;
19680+
19681+#ifdef STBIR_SIMD
19682+	if (width_times_channels >= 8) {
19683+		float const *end_encode_m8 = encode + width_times_channels - 8;
19684+		end_output -= 8;
19685+		STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
19686+		for (;;) {
19687+			STBIR_SIMD_NO_UNROLL(encode);
19688+#ifdef stbir__decode_swizzle
19689+#ifdef STBIR_SIMD8
19690+			{
19691+				stbir__simdf8 of;
19692+				stbir__simdf8_load(of, encode);
19693+				stbir__encode_simdf8_unflip(of);
19694+				stbir__float_to_half_SIMD(output, (float *)&of);
19695+			}
19696+#else
19697+			{
19698+				stbir__simdf of[2];
19699+				stbir__simdf_load(of[0], encode);
19700+				stbir__simdf_load(of[1], encode + 4);
19701+				stbir__encode_simdf4_unflip(of[0]);
19702+				stbir__encode_simdf4_unflip(of[1]);
19703+				stbir__float_to_half_SIMD(output, (float *)of);
19704+			}
19705+#endif
19706+#else
19707+			stbir__float_to_half_SIMD(output, encode);
19708+#endif
19709+			encode += 8;
19710+			output += 8;
19711+			if (output <= end_output) {
19712+				continue;
19713+			}
19714+			if (output == (end_output + 8)) {
19715+				break;
19716+			}
19717+			output = end_output; // backup and do last couple
19718+			encode = end_encode_m8;
19719+		}
19720+		return;
19721+	}
19722+#endif
19723+
19724+// try to do blocks of 4 when you can
19725+#if stbir__coder_min_num != 3 // doesn't divide cleanly by four
19726+	output += 4;
19727+	STBIR_SIMD_NO_UNROLL_LOOP_START
19728+	while (output <= end_output) {
19729+		STBIR_SIMD_NO_UNROLL(output);
19730+		output[0 - 4] = stbir__float_to_half(encode[stbir__encode_order0]);
19731+		output[1 - 4] = stbir__float_to_half(encode[stbir__encode_order1]);
19732+		output[2 - 4] = stbir__float_to_half(encode[stbir__encode_order2]);
19733+		output[3 - 4] = stbir__float_to_half(encode[stbir__encode_order3]);
19734+		output += 4;
19735+		encode += 4;
19736+	}
19737+	output -= 4;
19738+#endif
19739+
19740+// do the remnants
19741+#if stbir__coder_min_num < 4
19742+	STBIR_NO_UNROLL_LOOP_START
19743+	while (output < end_output) {
19744+		STBIR_NO_UNROLL(output);
19745+		output[0] = stbir__float_to_half(encode[stbir__encode_order0]);
19746+#if stbir__coder_min_num >= 2
19747+		output[1] = stbir__float_to_half(encode[stbir__encode_order1]);
19748+#endif
19749+#if stbir__coder_min_num >= 3
19750+		output[2] = stbir__float_to_half(encode[stbir__encode_order2]);
19751+#endif
19752+		output += stbir__coder_min_num;
19753+		encode += stbir__coder_min_num;
19754+	}
19755+#endif
19756+}
19757+
19758+static float *
19759+STBIR__CODER_NAME(stbir__decode_float_linear)(float *decodep,
19760+                                              int width_times_channels,
19761+                                              void const *inputp)
19762+{
19763+#ifdef stbir__decode_swizzle
19764+	float STBIR_STREAMOUT_PTR(*) decode = decodep;
19765+	float *decode_end = (float *)decode + width_times_channels;
19766+	float const *input = (float const *)inputp;
19767+
19768+#ifdef STBIR_SIMD
19769+	if (width_times_channels >= 16) {
19770+		float const *end_input_m16 = input + width_times_channels - 16;
19771+		decode_end -= 16;
19772+		STBIR_NO_UNROLL_LOOP_START_INF_FOR
19773+		for (;;) {
19774+			STBIR_NO_UNROLL(decode);
19775+#ifdef stbir__decode_swizzle
19776+#ifdef STBIR_SIMD8
19777+			{
19778+				stbir__simdf8 of0, of1;
19779+				stbir__simdf8_load(of0, input);
19780+				stbir__simdf8_load(of1, input + 8);
19781+				stbir__decode_simdf8_flip(of0);
19782+				stbir__decode_simdf8_flip(of1);
19783+				stbir__simdf8_store(decode, of0);
19784+				stbir__simdf8_store(decode + 8, of1);
19785+			}
19786+#else
19787+			{
19788+				stbir__simdf of0, of1, of2, of3;
19789+				stbir__simdf_load(of0, input);
19790+				stbir__simdf_load(of1, input + 4);
19791+				stbir__simdf_load(of2, input + 8);
19792+				stbir__simdf_load(of3, input + 12);
19793+				stbir__decode_simdf4_flip(of0);
19794+				stbir__decode_simdf4_flip(of1);
19795+				stbir__decode_simdf4_flip(of2);
19796+				stbir__decode_simdf4_flip(of3);
19797+				stbir__simdf_store(decode, of0);
19798+				stbir__simdf_store(decode + 4, of1);
19799+				stbir__simdf_store(decode + 8, of2);
19800+				stbir__simdf_store(decode + 12, of3);
19801+			}
19802+#endif
19803+#endif
19804+			decode += 16;
19805+			input += 16;
19806+			if (decode <= decode_end) {
19807+				continue;
19808+			}
19809+			if (decode == (decode_end + 16)) {
19810+				break;
19811+			}
19812+			decode = decode_end; // backup and do last couple
19813+			input = end_input_m16;
19814+		}
19815+		return decode_end + 16;
19816+	}
19817+#endif
19818+
19819+// try to do blocks of 4 when you can
19820+#if stbir__coder_min_num != 3 // doesn't divide cleanly by four
19821+	decode += 4;
19822+	STBIR_SIMD_NO_UNROLL_LOOP_START
19823+	while (decode <= decode_end) {
19824+		STBIR_SIMD_NO_UNROLL(decode);
19825+		decode[0 - 4] = input[stbir__decode_order0];
19826+		decode[1 - 4] = input[stbir__decode_order1];
19827+		decode[2 - 4] = input[stbir__decode_order2];
19828+		decode[3 - 4] = input[stbir__decode_order3];
19829+		decode += 4;
19830+		input += 4;
19831+	}
19832+	decode -= 4;
19833+#endif
19834+
19835+// do the remnants
19836+#if stbir__coder_min_num < 4
19837+	STBIR_NO_UNROLL_LOOP_START
19838+	while (decode < decode_end) {
19839+		STBIR_NO_UNROLL(decode);
19840+		decode[0] = input[stbir__decode_order0];
19841+#if stbir__coder_min_num >= 2
19842+		decode[1] = input[stbir__decode_order1];
19843+#endif
19844+#if stbir__coder_min_num >= 3
19845+		decode[2] = input[stbir__decode_order2];
19846+#endif
19847+		decode += stbir__coder_min_num;
19848+		input += stbir__coder_min_num;
19849+	}
19850+#endif
19851+	return decode_end;
19852+
19853+#else
19854+
19855+	if ((void *)decodep != inputp) {
19856+		STBIR_MEMCPY(decodep, inputp, width_times_channels * sizeof(float));
19857+	}
19858+
19859+	return decodep + width_times_channels;
19860+
19861+#endif
19862+}
19863+
19864+static void
19865+STBIR__CODER_NAME(stbir__encode_float_linear)(void *outputp,
19866+                                              int width_times_channels,
19867+                                              float const *encode)
19868+{
19869+#if !defined(STBIR_FLOAT_HIGH_CLAMP) && !defined(STBIR_FLOAT_LO_CLAMP) &&      \
19870+    !defined(stbir__decode_swizzle)
19871+
19872+	if ((void *)outputp != (void *)encode) {
19873+		STBIR_MEMCPY(outputp, encode, width_times_channels * sizeof(float));
19874+	}
19875+
19876+#else
19877+
19878+	float STBIR_SIMD_STREAMOUT_PTR(*) output = (float *)outputp;
19879+	float *end_output = ((float *)output) + width_times_channels;
19880+
19881+#ifdef STBIR_FLOAT_HIGH_CLAMP
19882+#define stbir_scalar_hi_clamp(v)                                               \
19883+	if (v > STBIR_FLOAT_HIGH_CLAMP)                                            \
19884+		v = STBIR_FLOAT_HIGH_CLAMP;
19885+#else
19886+#define stbir_scalar_hi_clamp(v)
19887+#endif
19888+#ifdef STBIR_FLOAT_LOW_CLAMP
19889+#define stbir_scalar_lo_clamp(v)                                               \
19890+	if (v < STBIR_FLOAT_LOW_CLAMP)                                             \
19891+		v = STBIR_FLOAT_LOW_CLAMP;
19892+#else
19893+#define stbir_scalar_lo_clamp(v)
19894+#endif
19895+
19896+#ifdef STBIR_SIMD
19897+
19898+#ifdef STBIR_FLOAT_HIGH_CLAMP
19899+	const stbir__simdfX high_clamp = stbir__simdf_frepX(STBIR_FLOAT_HIGH_CLAMP);
19900+#endif
19901+#ifdef STBIR_FLOAT_LOW_CLAMP
19902+	const stbir__simdfX low_clamp = stbir__simdf_frepX(STBIR_FLOAT_LOW_CLAMP);
19903+#endif
19904+
19905+	if (width_times_channels >= (stbir__simdfX_float_count * 2)) {
19906+		float const *end_encode_m8 =
19907+		    encode + width_times_channels - (stbir__simdfX_float_count * 2);
19908+		end_output -= (stbir__simdfX_float_count * 2);
19909+		STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
19910+		for (;;) {
19911+			stbir__simdfX e0, e1;
19912+			STBIR_SIMD_NO_UNROLL(encode);
19913+			stbir__simdfX_load(e0, encode);
19914+			stbir__simdfX_load(e1, encode + stbir__simdfX_float_count);
19915+#ifdef STBIR_FLOAT_HIGH_CLAMP
19916+			stbir__simdfX_min(e0, e0, high_clamp);
19917+			stbir__simdfX_min(e1, e1, high_clamp);
19918+#endif
19919+#ifdef STBIR_FLOAT_LOW_CLAMP
19920+			stbir__simdfX_max(e0, e0, low_clamp);
19921+			stbir__simdfX_max(e1, e1, low_clamp);
19922+#endif
19923+			stbir__encode_simdfX_unflip(e0);
19924+			stbir__encode_simdfX_unflip(e1);
19925+			stbir__simdfX_store(output, e0);
19926+			stbir__simdfX_store(output + stbir__simdfX_float_count, e1);
19927+			encode += stbir__simdfX_float_count * 2;
19928+			output += stbir__simdfX_float_count * 2;
19929+			if (output < end_output) {
19930+				continue;
19931+			}
19932+			if (output == (end_output + (stbir__simdfX_float_count * 2))) {
19933+				break;
19934+			}
19935+			output = end_output; // backup and do last couple
19936+			encode = end_encode_m8;
19937+		}
19938+		return;
19939+	}
19940+
19941+// try to do blocks of 4 when you can
19942+#if stbir__coder_min_num != 3 // doesn't divide cleanly by four
19943+	output += 4;
19944+	STBIR_NO_UNROLL_LOOP_START
19945+	while (output <= end_output) {
19946+		stbir__simdf e0;
19947+		STBIR_NO_UNROLL(encode);
19948+		stbir__simdf_load(e0, encode);
19949+#ifdef STBIR_FLOAT_HIGH_CLAMP
19950+		stbir__simdf_min(e0, e0, high_clamp);
19951+#endif
19952+#ifdef STBIR_FLOAT_LOW_CLAMP
19953+		stbir__simdf_max(e0, e0, low_clamp);
19954+#endif
19955+		stbir__encode_simdf4_unflip(e0);
19956+		stbir__simdf_store(output - 4, e0);
19957+		output += 4;
19958+		encode += 4;
19959+	}
19960+	output -= 4;
19961+#endif
19962+
19963+#else
19964+
19965+// try to do blocks of 4 when you can
19966+#if stbir__coder_min_num != 3 // doesn't divide cleanly by four
19967+	output += 4;
19968+	STBIR_SIMD_NO_UNROLL_LOOP_START
19969+	while (output <= end_output) {
19970+		float e;
19971+		STBIR_SIMD_NO_UNROLL(encode);
19972+		e = encode[stbir__encode_order0];
19973+		stbir_scalar_hi_clamp(e);
19974+		stbir_scalar_lo_clamp(e);
19975+		output[0 - 4] = e;
19976+		e = encode[stbir__encode_order1];
19977+		stbir_scalar_hi_clamp(e);
19978+		stbir_scalar_lo_clamp(e);
19979+		output[1 - 4] = e;
19980+		e = encode[stbir__encode_order2];
19981+		stbir_scalar_hi_clamp(e);
19982+		stbir_scalar_lo_clamp(e);
19983+		output[2 - 4] = e;
19984+		e = encode[stbir__encode_order3];
19985+		stbir_scalar_hi_clamp(e);
19986+		stbir_scalar_lo_clamp(e);
19987+		output[3 - 4] = e;
19988+		output += 4;
19989+		encode += 4;
19990+	}
19991+	output -= 4;
19992+
19993+#endif
19994+
19995+#endif
19996+
19997+// do the remnants
19998+#if stbir__coder_min_num < 4
19999+	STBIR_NO_UNROLL_LOOP_START
20000+	while (output < end_output) {
20001+		float e;
20002+		STBIR_NO_UNROLL(encode);
20003+		e = encode[stbir__encode_order0];
20004+		stbir_scalar_hi_clamp(e);
20005+		stbir_scalar_lo_clamp(e);
20006+		output[0] = e;
20007+#if stbir__coder_min_num >= 2
20008+		e = encode[stbir__encode_order1];
20009+		stbir_scalar_hi_clamp(e);
20010+		stbir_scalar_lo_clamp(e);
20011+		output[1] = e;
20012+#endif
20013+#if stbir__coder_min_num >= 3
20014+		e = encode[stbir__encode_order2];
20015+		stbir_scalar_hi_clamp(e);
20016+		stbir_scalar_lo_clamp(e);
20017+		output[2] = e;
20018+#endif
20019+		output += stbir__coder_min_num;
20020+		encode += stbir__coder_min_num;
20021+	}
20022+#endif
20023+
20024+#endif
20025+}
20026 
20027 #undef stbir__decode_suffix
20028 #undef stbir__decode_simdf8_flip
20029@@ -9862,373 +11729,989 @@ static void STBIR__CODER_NAME( stbir__encode_float_linear )( void * outputp, int
20030 #undef stbir_scalar_lo_clamp
20031 #undef STB_IMAGE_RESIZE_DO_CODERS
20032 
20033-#elif defined( STB_IMAGE_RESIZE_DO_VERTICALS)
20034+#elif defined(STB_IMAGE_RESIZE_DO_VERTICALS)
20035 
20036 #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
20037-#define STBIR_chans( start, end ) STBIR_strs_join14(start,STBIR__vertical_channels,end,_cont)
20038+#define STBIR_chans(start, end)                                                \
20039+	STBIR_strs_join14(start, STBIR__vertical_channels, end, _cont)
20040 #else
20041-#define STBIR_chans( start, end ) STBIR_strs_join1(start,STBIR__vertical_channels,end)
20042+#define STBIR_chans(start, end)                                                \
20043+	STBIR_strs_join1(start, STBIR__vertical_channels, end)
20044 #endif
20045 
20046 #if STBIR__vertical_channels >= 1
20047-#define stbIF0( code ) code
20048+#define stbIF0(code) code
20049 #else
20050-#define stbIF0( code )
20051+#define stbIF0(code)
20052 #endif
20053 #if STBIR__vertical_channels >= 2
20054-#define stbIF1( code ) code
20055+#define stbIF1(code) code
20056 #else
20057-#define stbIF1( code )
20058+#define stbIF1(code)
20059 #endif
20060 #if STBIR__vertical_channels >= 3
20061-#define stbIF2( code ) code
20062+#define stbIF2(code) code
20063 #else
20064-#define stbIF2( code )
20065+#define stbIF2(code)
20066 #endif
20067 #if STBIR__vertical_channels >= 4
20068-#define stbIF3( code ) code
20069+#define stbIF3(code) code
20070 #else
20071-#define stbIF3( code )
20072+#define stbIF3(code)
20073 #endif
20074 #if STBIR__vertical_channels >= 5
20075-#define stbIF4( code ) code
20076+#define stbIF4(code) code
20077 #else
20078-#define stbIF4( code )
20079+#define stbIF4(code)
20080 #endif
20081 #if STBIR__vertical_channels >= 6
20082-#define stbIF5( code ) code
20083+#define stbIF5(code) code
20084 #else
20085-#define stbIF5( code )
20086+#define stbIF5(code)
20087 #endif
20088 #if STBIR__vertical_channels >= 7
20089-#define stbIF6( code ) code
20090+#define stbIF6(code) code
20091 #else
20092-#define stbIF6( code )
20093+#define stbIF6(code)
20094 #endif
20095 #if STBIR__vertical_channels >= 8
20096-#define stbIF7( code ) code
20097+#define stbIF7(code) code
20098+#else
20099+#define stbIF7(code)
20100+#endif
20101+
20102+static void
20103+STBIR_chans(stbir__vertical_scatter_with_,
20104+            _coeffs)(float **outputs,
20105+                     float const *vertical_coefficients,
20106+                     float const *input,
20107+                     float const *input_end)
20108+{
20109+	stbIF0(float STBIR_SIMD_STREAMOUT_PTR(*) output0 = outputs[0];
20110+	       float c0s = vertical_coefficients[0];)
20111+	    stbIF1(float STBIR_SIMD_STREAMOUT_PTR(*) output1 = outputs[1];
20112+	           float c1s = vertical_coefficients[1];)
20113+	        stbIF2(float STBIR_SIMD_STREAMOUT_PTR(*) output2 = outputs[2];
20114+	               float c2s = vertical_coefficients[2];)
20115+	            stbIF3(float STBIR_SIMD_STREAMOUT_PTR(*) output3 = outputs[3];
20116+	                   float c3s = vertical_coefficients[3];)
20117+	                stbIF4(float STBIR_SIMD_STREAMOUT_PTR(*) output4 =
20118+	                           outputs[4];
20119+	                       float c4s = vertical_coefficients[4];)
20120+	                    stbIF5(float STBIR_SIMD_STREAMOUT_PTR(*) output5 =
20121+	                               outputs[5];
20122+	                           float c5s = vertical_coefficients[5];)
20123+	                        stbIF6(float STBIR_SIMD_STREAMOUT_PTR(*) output6 =
20124+	                                   outputs[6];
20125+	                               float c6s = vertical_coefficients[6];)
20126+	                            stbIF7(float STBIR_SIMD_STREAMOUT_PTR(*)
20127+	                                       output7 = outputs[7];
20128+	                                   float c7s = vertical_coefficients[7];)
20129+
20130+#ifdef STBIR_SIMD
20131+	{
20132+		stbIF0(stbir__simdfX c0 = stbir__simdf_frepX(c0s);)
20133+		    stbIF1(stbir__simdfX c1 = stbir__simdf_frepX(c1s);)
20134+		        stbIF2(stbir__simdfX c2 = stbir__simdf_frepX(c2s);) stbIF3(
20135+		            stbir__simdfX c3 = stbir__simdf_frepX(c3s);)
20136+		            stbIF4(stbir__simdfX c4 = stbir__simdf_frepX(c4s);) stbIF5(
20137+		                stbir__simdfX c5 = stbir__simdf_frepX(c5s);)
20138+		                stbIF6(stbir__simdfX c6 = stbir__simdf_frepX(c6s);)
20139+		                    stbIF7(stbir__simdfX c7 = stbir__simdf_frepX(c7s);)
20140+		                        STBIR_SIMD_NO_UNROLL_LOOP_START while (
20141+		                            ((char *)input_end - (char *)input) >=
20142+		                            (16 * stbir__simdfX_float_count))
20143+		{
20144+			stbir__simdfX o0, o1, o2, o3, r0, r1, r2, r3;
20145+			STBIR_SIMD_NO_UNROLL(output0);
20146+
20147+			stbir__simdfX_load(r0, input);
20148+			stbir__simdfX_load(r1, input + stbir__simdfX_float_count);
20149+			stbir__simdfX_load(r2, input + (2 * stbir__simdfX_float_count));
20150+			stbir__simdfX_load(r3, input + (3 * stbir__simdfX_float_count));
20151+
20152+#ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
20153+			stbIF0(
20154+			    stbir__simdfX_load(o0, output0);
20155+			    stbir__simdfX_load(o1, output0 + stbir__simdfX_float_count);
20156+			    stbir__simdfX_load(o2,
20157+			                       output0 + (2 * stbir__simdfX_float_count));
20158+			    stbir__simdfX_load(o3,
20159+			                       output0 + (3 * stbir__simdfX_float_count));
20160+			    stbir__simdfX_madd(o0, o0, r0, c0);
20161+			    stbir__simdfX_madd(o1, o1, r1, c0);
20162+			    stbir__simdfX_madd(o2, o2, r2, c0);
20163+			    stbir__simdfX_madd(o3, o3, r3, c0);
20164+			    stbir__simdfX_store(output0, o0);
20165+			    stbir__simdfX_store(output0 + stbir__simdfX_float_count, o1);
20166+			    stbir__simdfX_store(output0 + (2 * stbir__simdfX_float_count),
20167+			                        o2);
20168+			    stbir__simdfX_store(
20169+			        output0 + (3 * stbir__simdfX_float_count),
20170+			        o3);) stbIF1(stbir__simdfX_load(o0, output1);
20171+			                     stbir__simdfX_load(
20172+			                         o1, output1 + stbir__simdfX_float_count);
20173+			                     stbir__simdfX_load(
20174+			                         o2,
20175+			                         output1 + (2 * stbir__simdfX_float_count));
20176+			                     stbir__simdfX_load(
20177+			                         o3,
20178+			                         output1 + (3 * stbir__simdfX_float_count));
20179+			                     stbir__simdfX_madd(o0, o0, r0, c1);
20180+			                     stbir__simdfX_madd(o1, o1, r1, c1);
20181+			                     stbir__simdfX_madd(o2, o2, r2, c1);
20182+			                     stbir__simdfX_madd(o3, o3, r3, c1);
20183+			                     stbir__simdfX_store(output1, o0);
20184+			                     stbir__simdfX_store(
20185+			                         output1 + stbir__simdfX_float_count, o1);
20186+			                     stbir__simdfX_store(
20187+			                         output1 + (2 * stbir__simdfX_float_count),
20188+			                         o2);
20189+			                     stbir__simdfX_store(
20190+			                         output1 + (3 * stbir__simdfX_float_count),
20191+			                         o3);)
20192+			    stbIF2(
20193+			        stbir__simdfX_load(o0, output2);
20194+			        stbir__simdfX_load(o1, output2 + stbir__simdfX_float_count);
20195+			        stbir__simdfX_load(
20196+			            o2, output2 + (2 * stbir__simdfX_float_count));
20197+			        stbir__simdfX_load(
20198+			            o3, output2 + (3 * stbir__simdfX_float_count));
20199+			        stbir__simdfX_madd(o0, o0, r0, c2);
20200+			        stbir__simdfX_madd(o1, o1, r1, c2);
20201+			        stbir__simdfX_madd(o2, o2, r2, c2);
20202+			        stbir__simdfX_madd(o3, o3, r3, c2);
20203+			        stbir__simdfX_store(output2, o0);
20204+			        stbir__simdfX_store(output2 + stbir__simdfX_float_count,
20205+			                            o1);
20206+			        stbir__simdfX_store(
20207+			            output2 + (2 * stbir__simdfX_float_count), o2);
20208+			        stbir__simdfX_store(
20209+			            output2 + (3 * stbir__simdfX_float_count),
20210+			            o3);) stbIF3(stbir__simdfX_load(o0, output3);
20211+			                         stbir__simdfX_load(
20212+			                             o1,
20213+			                             output3 + stbir__simdfX_float_count);
20214+			                         stbir__simdfX_load(
20215+			                             o2,
20216+			                             output3 +
20217+			                                 (2 * stbir__simdfX_float_count));
20218+			                         stbir__simdfX_load(
20219+			                             o3,
20220+			                             output3 +
20221+			                                 (3 * stbir__simdfX_float_count));
20222+			                         stbir__simdfX_madd(o0, o0, r0, c3);
20223+			                         stbir__simdfX_madd(o1, o1, r1, c3);
20224+			                         stbir__simdfX_madd(o2, o2, r2, c3);
20225+			                         stbir__simdfX_madd(o3, o3, r3, c3);
20226+			                         stbir__simdfX_store(output3, o0);
20227+			                         stbir__simdfX_store(
20228+			                             output3 + stbir__simdfX_float_count,
20229+			                             o1);
20230+			                         stbir__simdfX_store(
20231+			                             output3 +
20232+			                                 (2 * stbir__simdfX_float_count),
20233+			                             o2);
20234+			                         stbir__simdfX_store(
20235+			                             output3 +
20236+			                                 (3 * stbir__simdfX_float_count),
20237+			                             o3);)
20238+			        stbIF4(stbir__simdfX_load(o0, output4); stbir__simdfX_load(
20239+			                   o1, output4 + stbir__simdfX_float_count);
20240+			               stbir__simdfX_load(
20241+			                   o2, output4 + (2 * stbir__simdfX_float_count));
20242+			               stbir__simdfX_load(
20243+			                   o3, output4 + (3 * stbir__simdfX_float_count));
20244+			               stbir__simdfX_madd(o0, o0, r0, c4);
20245+			               stbir__simdfX_madd(o1, o1, r1, c4);
20246+			               stbir__simdfX_madd(o2, o2, r2, c4);
20247+			               stbir__simdfX_madd(o3, o3, r3, c4);
20248+			               stbir__simdfX_store(output4, o0);
20249+			               stbir__simdfX_store(
20250+			                   output4 + stbir__simdfX_float_count, o1);
20251+			               stbir__simdfX_store(
20252+			                   output4 + (2 * stbir__simdfX_float_count), o2);
20253+			               stbir__simdfX_store(
20254+			                   output4 + (3 * stbir__simdfX_float_count), o3);)
20255+			            stbIF5(
20256+			                stbir__simdfX_load(o0, output5); stbir__simdfX_load(
20257+			                    o1, output5 + stbir__simdfX_float_count);
20258+			                stbir__simdfX_load(
20259+			                    o2, output5 + (2 * stbir__simdfX_float_count));
20260+			                stbir__simdfX_load(
20261+			                    o3, output5 + (3 * stbir__simdfX_float_count));
20262+			                stbir__simdfX_madd(o0, o0, r0, c5);
20263+			                stbir__simdfX_madd(o1, o1, r1, c5);
20264+			                stbir__simdfX_madd(o2, o2, r2, c5);
20265+			                stbir__simdfX_madd(o3, o3, r3, c5);
20266+			                stbir__simdfX_store(output5, o0);
20267+			                stbir__simdfX_store(
20268+			                    output5 + stbir__simdfX_float_count, o1);
20269+			                stbir__simdfX_store(
20270+			                    output5 + (2 * stbir__simdfX_float_count), o2);
20271+			                stbir__simdfX_store(
20272+			                    output5 + (3 * stbir__simdfX_float_count), o3);)
20273+			                stbIF6(
20274+			                    stbir__simdfX_load(o0, output6);
20275+			                    stbir__simdfX_load(
20276+			                        o1, output6 + stbir__simdfX_float_count);
20277+			                    stbir__simdfX_load(
20278+			                        o2,
20279+			                        output6 + (2 * stbir__simdfX_float_count));
20280+			                    stbir__simdfX_load(
20281+			                        o3,
20282+			                        output6 + (3 * stbir__simdfX_float_count));
20283+			                    stbir__simdfX_madd(o0, o0, r0, c6);
20284+			                    stbir__simdfX_madd(o1, o1, r1, c6);
20285+			                    stbir__simdfX_madd(o2, o2, r2, c6);
20286+			                    stbir__simdfX_madd(o3, o3, r3, c6);
20287+			                    stbir__simdfX_store(output6, o0);
20288+			                    stbir__simdfX_store(
20289+			                        output6 + stbir__simdfX_float_count, o1);
20290+			                    stbir__simdfX_store(
20291+			                        output6 + (2 * stbir__simdfX_float_count),
20292+			                        o2);
20293+			                    stbir__simdfX_store(
20294+			                        output6 + (3 * stbir__simdfX_float_count),
20295+			                        o3);)
20296+			                    stbIF7(stbir__simdfX_load(o0, output7);
20297+			                           stbir__simdfX_load(
20298+			                               o1,
20299+			                               output7 + stbir__simdfX_float_count);
20300+			                           stbir__simdfX_load(
20301+			                               o2,
20302+			                               output7 +
20303+			                                   (2 * stbir__simdfX_float_count));
20304+			                           stbir__simdfX_load(
20305+			                               o3,
20306+			                               output7 +
20307+			                                   (3 * stbir__simdfX_float_count));
20308+			                           stbir__simdfX_madd(o0, o0, r0, c7);
20309+			                           stbir__simdfX_madd(o1, o1, r1, c7);
20310+			                           stbir__simdfX_madd(o2, o2, r2, c7);
20311+			                           stbir__simdfX_madd(o3, o3, r3, c7);
20312+			                           stbir__simdfX_store(output7, o0);
20313+			                           stbir__simdfX_store(
20314+			                               output7 + stbir__simdfX_float_count,
20315+			                               o1);
20316+			                           stbir__simdfX_store(
20317+			                               output7 +
20318+			                                   (2 * stbir__simdfX_float_count),
20319+			                               o2);
20320+			                           stbir__simdfX_store(
20321+			                               output7 +
20322+			                                   (3 * stbir__simdfX_float_count),
20323+			                               o3);)
20324+#else
20325+			stbIF0(
20326+			    stbir__simdfX_mult(o0, r0, c0); stbir__simdfX_mult(o1, r1, c0);
20327+			    stbir__simdfX_mult(o2, r2, c0);
20328+			    stbir__simdfX_mult(o3, r3, c0);
20329+			    stbir__simdfX_store(output0, o0);
20330+			    stbir__simdfX_store(output0 + stbir__simdfX_float_count, o1);
20331+			    stbir__simdfX_store(output0 + (2 * stbir__simdfX_float_count),
20332+			                        o2);
20333+			    stbir__simdfX_store(
20334+			        output0 + (3 * stbir__simdfX_float_count),
20335+			        o3);) stbIF1(stbir__simdfX_mult(o0, r0, c1);
20336+			                     stbir__simdfX_mult(o1, r1, c1);
20337+			                     stbir__simdfX_mult(o2, r2, c1);
20338+			                     stbir__simdfX_mult(o3, r3, c1);
20339+			                     stbir__simdfX_store(output1, o0);
20340+			                     stbir__simdfX_store(
20341+			                         output1 + stbir__simdfX_float_count, o1);
20342+			                     stbir__simdfX_store(
20343+			                         output1 + (2 * stbir__simdfX_float_count),
20344+			                         o2);
20345+			                     stbir__simdfX_store(
20346+			                         output1 + (3 * stbir__simdfX_float_count),
20347+			                         o3);)
20348+			    stbIF2(stbir__simdfX_mult(o0, r0, c2);
20349+			           stbir__simdfX_mult(o1, r1, c2);
20350+			           stbir__simdfX_mult(o2, r2, c2);
20351+			           stbir__simdfX_mult(o3, r3, c2);
20352+			           stbir__simdfX_store(output2, o0);
20353+			           stbir__simdfX_store(output2 + stbir__simdfX_float_count,
20354+			                               o1);
20355+			           stbir__simdfX_store(
20356+			               output2 + (2 * stbir__simdfX_float_count), o2);
20357+			           stbir__simdfX_store(
20358+			               output2 + (3 * stbir__simdfX_float_count),
20359+			               o3);) stbIF3(stbir__simdfX_mult(o0, r0, c3);
20360+			                            stbir__simdfX_mult(o1, r1, c3);
20361+			                            stbir__simdfX_mult(o2, r2, c3);
20362+			                            stbir__simdfX_mult(o3, r3, c3);
20363+			                            stbir__simdfX_store(output3, o0);
20364+			                            stbir__simdfX_store(
20365+			                                output3 + stbir__simdfX_float_count,
20366+			                                o1);
20367+			                            stbir__simdfX_store(
20368+			                                output3 +
20369+			                                    (2 * stbir__simdfX_float_count),
20370+			                                o2);
20371+			                            stbir__simdfX_store(
20372+			                                output3 +
20373+			                                    (3 * stbir__simdfX_float_count),
20374+			                                o3);)
20375+			        stbIF4(stbir__simdfX_mult(o0, r0, c4);
20376+			               stbir__simdfX_mult(o1, r1, c4);
20377+			               stbir__simdfX_mult(o2, r2, c4);
20378+			               stbir__simdfX_mult(o3, r3, c4);
20379+			               stbir__simdfX_store(output4, o0);
20380+			               stbir__simdfX_store(
20381+			                   output4 + stbir__simdfX_float_count, o1);
20382+			               stbir__simdfX_store(
20383+			                   output4 + (2 * stbir__simdfX_float_count), o2);
20384+			               stbir__simdfX_store(
20385+			                   output4 + (3 * stbir__simdfX_float_count), o3);)
20386+			            stbIF5(
20387+			                stbir__simdfX_mult(o0, r0, c5);
20388+			                stbir__simdfX_mult(o1, r1, c5);
20389+			                stbir__simdfX_mult(o2, r2, c5);
20390+			                stbir__simdfX_mult(o3, r3, c5);
20391+			                stbir__simdfX_store(output5, o0);
20392+			                stbir__simdfX_store(
20393+			                    output5 + stbir__simdfX_float_count, o1);
20394+			                stbir__simdfX_store(
20395+			                    output5 + (2 * stbir__simdfX_float_count), o2);
20396+			                stbir__simdfX_store(
20397+			                    output5 + (3 * stbir__simdfX_float_count), o3);)
20398+			                stbIF6(
20399+			                    stbir__simdfX_mult(o0, r0, c6);
20400+			                    stbir__simdfX_mult(o1, r1, c6);
20401+			                    stbir__simdfX_mult(o2, r2, c6);
20402+			                    stbir__simdfX_mult(o3, r3, c6);
20403+			                    stbir__simdfX_store(output6, o0);
20404+			                    stbir__simdfX_store(
20405+			                        output6 + stbir__simdfX_float_count, o1);
20406+			                    stbir__simdfX_store(
20407+			                        output6 + (2 * stbir__simdfX_float_count),
20408+			                        o2);
20409+			                    stbir__simdfX_store(
20410+			                        output6 + (3 * stbir__simdfX_float_count),
20411+			                        o3);)
20412+			                    stbIF7(stbir__simdfX_mult(o0, r0, c7);
20413+			                           stbir__simdfX_mult(o1, r1, c7);
20414+			                           stbir__simdfX_mult(o2, r2, c7);
20415+			                           stbir__simdfX_mult(o3, r3, c7);
20416+			                           stbir__simdfX_store(output7, o0);
20417+			                           stbir__simdfX_store(
20418+			                               output7 + stbir__simdfX_float_count,
20419+			                               o1);
20420+			                           stbir__simdfX_store(
20421+			                               output7 +
20422+			                                   (2 * stbir__simdfX_float_count),
20423+			                               o2);
20424+			                           stbir__simdfX_store(
20425+			                               output7 +
20426+			                                   (3 * stbir__simdfX_float_count),
20427+			                               o3);)
20428+#endif
20429+
20430+			                        input += (4 * stbir__simdfX_float_count);
20431+			stbIF0(output0 += (4 * stbir__simdfX_float_count);) stbIF1(
20432+			    output1 += (4 * stbir__simdfX_float_count);)
20433+			    stbIF2(output2 += (4 * stbir__simdfX_float_count);) stbIF3(
20434+			        output3 += (4 * stbir__simdfX_float_count);)
20435+			        stbIF4(output4 += (4 * stbir__simdfX_float_count);) stbIF5(
20436+			            output5 += (4 * stbir__simdfX_float_count);)
20437+			            stbIF6(output6 += (4 * stbir__simdfX_float_count);)
20438+			                stbIF7(output7 += (4 * stbir__simdfX_float_count);)
20439+		}
20440+		STBIR_SIMD_NO_UNROLL_LOOP_START
20441+		while (((char *)input_end - (char *)input) >= 16) {
20442+			stbir__simdf o0, r0;
20443+			STBIR_SIMD_NO_UNROLL(output0);
20444+
20445+			stbir__simdf_load(r0, input);
20446+
20447+#ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
20448+			stbIF0(stbir__simdf_load(o0, output0); stbir__simdf_madd(
20449+			           o0, o0, r0, stbir__if_simdf8_cast_to_simdf4(c0));
20450+			       stbir__simdf_store(
20451+			           output0,
20452+			           o0);) stbIF1(stbir__simdf_load(o0, output1);
20453+			                        stbir__simdf_madd(
20454+			                            o0,
20455+			                            o0,
20456+			                            r0,
20457+			                            stbir__if_simdf8_cast_to_simdf4(c1));
20458+			                        stbir__simdf_store(output1, o0);)
20459+			    stbIF2(stbir__simdf_load(o0, output2); stbir__simdf_madd(
20460+			               o0, o0, r0, stbir__if_simdf8_cast_to_simdf4(c2));
20461+			           stbir__simdf_store(output2, o0);)
20462+			        stbIF3(stbir__simdf_load(o0, output3); stbir__simdf_madd(
20463+			                   o0, o0, r0, stbir__if_simdf8_cast_to_simdf4(c3));
20464+			               stbir__simdf_store(output3, o0);)
20465+			            stbIF4(stbir__simdf_load(o0, output4);
20466+			                   stbir__simdf_madd(
20467+			                       o0,
20468+			                       o0,
20469+			                       r0,
20470+			                       stbir__if_simdf8_cast_to_simdf4(c4));
20471+			                   stbir__simdf_store(output4, o0);)
20472+			                stbIF5(stbir__simdf_load(o0, output5);
20473+			                       stbir__simdf_madd(
20474+			                           o0,
20475+			                           o0,
20476+			                           r0,
20477+			                           stbir__if_simdf8_cast_to_simdf4(c5));
20478+			                       stbir__simdf_store(output5, o0);)
20479+			                    stbIF6(stbir__simdf_load(o0, output6);
20480+			                           stbir__simdf_madd(
20481+			                               o0,
20482+			                               o0,
20483+			                               r0,
20484+			                               stbir__if_simdf8_cast_to_simdf4(c6));
20485+			                           stbir__simdf_store(output6, o0);)
20486+			                        stbIF7(stbir__simdf_load(o0, output7);
20487+			                               stbir__simdf_madd(
20488+			                                   o0,
20489+			                                   o0,
20490+			                                   r0,
20491+			                                   stbir__if_simdf8_cast_to_simdf4(
20492+			                                       c7));
20493+			                               stbir__simdf_store(output7, o0);)
20494+#else
20495+			stbIF0(
20496+			    stbir__simdf_mult(o0, r0, stbir__if_simdf8_cast_to_simdf4(c0));
20497+			    stbir__simdf_store(output0, o0);)
20498+			    stbIF1(stbir__simdf_mult(
20499+			               o0, r0, stbir__if_simdf8_cast_to_simdf4(c1));
20500+			           stbir__simdf_store(output1, o0);)
20501+			        stbIF2(stbir__simdf_mult(
20502+			                   o0, r0, stbir__if_simdf8_cast_to_simdf4(c2));
20503+			               stbir__simdf_store(output2, o0);)
20504+			            stbIF3(stbir__simdf_mult(
20505+			                       o0, r0, stbir__if_simdf8_cast_to_simdf4(c3));
20506+			                   stbir__simdf_store(output3, o0);)
20507+			                stbIF4(stbir__simdf_mult(
20508+			                           o0,
20509+			                           r0,
20510+			                           stbir__if_simdf8_cast_to_simdf4(c4));
20511+			                       stbir__simdf_store(output4, o0);)
20512+			                    stbIF5(stbir__simdf_mult(
20513+			                               o0,
20514+			                               r0,
20515+			                               stbir__if_simdf8_cast_to_simdf4(c5));
20516+			                           stbir__simdf_store(output5, o0);)
20517+			                        stbIF6(stbir__simdf_mult(
20518+			                                   o0,
20519+			                                   r0,
20520+			                                   stbir__if_simdf8_cast_to_simdf4(
20521+			                                       c6));
20522+			                               stbir__simdf_store(output6, o0);)
20523+			                            stbIF7(
20524+			                                stbir__simdf_mult(
20525+			                                    o0,
20526+			                                    r0,
20527+			                                    stbir__if_simdf8_cast_to_simdf4(
20528+			                                        c7));
20529+			                                stbir__simdf_store(output7, o0);)
20530+#endif
20531+
20532+			                            input += 4;
20533+			stbIF0(output0 += 4;) stbIF1(output1 += 4;) stbIF2(output2 += 4;)
20534+			    stbIF3(output3 += 4;) stbIF4(output4 += 4;)
20535+			        stbIF5(output5 += 4;) stbIF6(output6 += 4;)
20536+			            stbIF7(output7 += 4;)
20537+		}
20538+	}
20539+#else
20540+	                                STBIR_NO_UNROLL_LOOP_START while (
20541+	                                    ((char *)input_end - (char *)input) >=
20542+	                                    16)
20543+	{
20544+		float r0, r1, r2, r3;
20545+		STBIR_NO_UNROLL(input);
20546+
20547+		r0 = input[0], r1 = input[1], r2 = input[2], r3 = input[3];
20548+
20549+#ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
20550+		stbIF0(output0[0] += (r0 * c0s); output0[1] += (r1 * c0s);
20551+		       output0[2] += (r2 * c0s); output0[3] += (r3 * c0s);)
20552+		    stbIF1(output1[0] += (r0 * c1s); output1[1] += (r1 * c1s);
20553+		           output1[2] += (r2 * c1s); output1[3] += (r3 * c1s);)
20554+		        stbIF2(output2[0] += (r0 * c2s); output2[1] += (r1 * c2s);
20555+		               output2[2] += (r2 * c2s); output2[3] += (r3 * c2s);)
20556+		            stbIF3(output3[0] += (r0 * c3s); output3[1] += (r1 * c3s);
20557+		                   output3[2] += (r2 * c3s); output3[3] += (r3 * c3s);)
20558+		                stbIF4(
20559+		                    output4[0] += (r0 * c4s); output4[1] += (r1 * c4s);
20560+		                    output4[2] += (r2 * c4s); output4[3] += (r3 * c4s);)
20561+		                    stbIF5(output5[0] += (r0 * c5s);
20562+		                           output5[1] += (r1 * c5s);
20563+		                           output5[2] += (r2 * c5s);
20564+		                           output5[3] += (r3 * c5s);)
20565+		                        stbIF6(output6[0] += (r0 * c6s);
20566+		                               output6[1] += (r1 * c6s);
20567+		                               output6[2] += (r2 * c6s);
20568+		                               output6[3] += (r3 * c6s);)
20569+		                            stbIF7(output7[0] += (r0 * c7s);
20570+		                                   output7[1] += (r1 * c7s);
20571+		                                   output7[2] += (r2 * c7s);
20572+		                                   output7[3] += (r3 * c7s);)
20573+#else
20574+		stbIF0(output0[0] = (r0 * c0s); output0[1] = (r1 * c0s);
20575+		       output0[2] = (r2 * c0s); output0[3] = (r3 * c0s);)
20576+		    stbIF1(output1[0] = (r0 * c1s); output1[1] = (r1 * c1s);
20577+		           output1[2] = (r2 * c1s); output1[3] = (r3 * c1s);)
20578+		        stbIF2(output2[0] = (r0 * c2s); output2[1] = (r1 * c2s);
20579+		               output2[2] = (r2 * c2s); output2[3] = (r3 * c2s);)
20580+		            stbIF3(output3[0] = (r0 * c3s); output3[1] = (r1 * c3s);
20581+		                   output3[2] = (r2 * c3s); output3[3] = (r3 * c3s);)
20582+		                stbIF4(output4[0] = (r0 * c4s); output4[1] = (r1 * c4s);
20583+		                       output4[2] = (r2 * c4s);
20584+		                       output4[3] = (r3 * c4s);)
20585+		                    stbIF5(output5[0] = (r0 * c5s);
20586+		                           output5[1] = (r1 * c5s);
20587+		                           output5[2] = (r2 * c5s);
20588+		                           output5[3] = (r3 * c5s);)
20589+		                        stbIF6(output6[0] = (r0 * c6s);
20590+		                               output6[1] = (r1 * c6s);
20591+		                               output6[2] = (r2 * c6s);
20592+		                               output6[3] = (r3 * c6s);)
20593+		                            stbIF7(output7[0] = (r0 * c7s);
20594+		                                   output7[1] = (r1 * c7s);
20595+		                                   output7[2] = (r2 * c7s);
20596+		                                   output7[3] = (r3 * c7s);)
20597+#endif
20598+
20599+		                                input += 4;
20600+		stbIF0(output0 += 4;) stbIF1(output1 += 4;) stbIF2(output2 += 4;)
20601+		    stbIF3(output3 += 4;) stbIF4(output4 += 4;) stbIF5(output5 += 4;)
20602+		        stbIF6(output6 += 4;) stbIF7(output7 += 4;)
20603+	}
20604+#endif
20605+	STBIR_NO_UNROLL_LOOP_START
20606+	while (input < input_end) {
20607+		float r = input[0];
20608+		STBIR_NO_UNROLL(output0);
20609+
20610+#ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
20611+		stbIF0(output0[0] += (r * c0s);) stbIF1(output1[0] += (r * c1s);)
20612+		    stbIF2(output2[0] += (r * c2s);) stbIF3(output3[0] += (r * c3s);)
20613+		        stbIF4(output4[0] += (r * c4s);)
20614+		            stbIF5(output5[0] += (r * c5s);)
20615+		                stbIF6(output6[0] += (r * c6s);)
20616+		                    stbIF7(output7[0] += (r * c7s);)
20617 #else
20618-#define stbIF7( code )
20619-#endif
20620-
20621-static void STBIR_chans( stbir__vertical_scatter_with_,_coeffs)( float ** outputs, float const * vertical_coefficients, float const * input, float const * input_end )
20622-{
20623-  stbIF0( float STBIR_SIMD_STREAMOUT_PTR( * ) output0 = outputs[0]; float c0s = vertical_coefficients[0]; )
20624-  stbIF1( float STBIR_SIMD_STREAMOUT_PTR( * ) output1 = outputs[1]; float c1s = vertical_coefficients[1]; )
20625-  stbIF2( float STBIR_SIMD_STREAMOUT_PTR( * ) output2 = outputs[2]; float c2s = vertical_coefficients[2]; )
20626-  stbIF3( float STBIR_SIMD_STREAMOUT_PTR( * ) output3 = outputs[3]; float c3s = vertical_coefficients[3]; )
20627-  stbIF4( float STBIR_SIMD_STREAMOUT_PTR( * ) output4 = outputs[4]; float c4s = vertical_coefficients[4]; )
20628-  stbIF5( float STBIR_SIMD_STREAMOUT_PTR( * ) output5 = outputs[5]; float c5s = vertical_coefficients[5]; )
20629-  stbIF6( float STBIR_SIMD_STREAMOUT_PTR( * ) output6 = outputs[6]; float c6s = vertical_coefficients[6]; )
20630-  stbIF7( float STBIR_SIMD_STREAMOUT_PTR( * ) output7 = outputs[7]; float c7s = vertical_coefficients[7]; )
20631-
20632-  #ifdef STBIR_SIMD
20633-  {
20634-    stbIF0(stbir__simdfX c0 = stbir__simdf_frepX( c0s ); )
20635-    stbIF1(stbir__simdfX c1 = stbir__simdf_frepX( c1s ); )
20636-    stbIF2(stbir__simdfX c2 = stbir__simdf_frepX( c2s ); )
20637-    stbIF3(stbir__simdfX c3 = stbir__simdf_frepX( c3s ); )
20638-    stbIF4(stbir__simdfX c4 = stbir__simdf_frepX( c4s ); )
20639-    stbIF5(stbir__simdfX c5 = stbir__simdf_frepX( c5s ); )
20640-    stbIF6(stbir__simdfX c6 = stbir__simdf_frepX( c6s ); )
20641-    stbIF7(stbir__simdfX c7 = stbir__simdf_frepX( c7s ); )
20642-    STBIR_SIMD_NO_UNROLL_LOOP_START
20643-    while ( ( (char*)input_end - (char*) input ) >= (16*stbir__simdfX_float_count) )
20644-    {
20645-      stbir__simdfX o0, o1, o2, o3, r0, r1, r2, r3;
20646-      STBIR_SIMD_NO_UNROLL(output0);
20647-
20648-      stbir__simdfX_load( r0, input );               stbir__simdfX_load( r1, input+stbir__simdfX_float_count );     stbir__simdfX_load( r2, input+(2*stbir__simdfX_float_count) );      stbir__simdfX_load( r3, input+(3*stbir__simdfX_float_count) );
20649-
20650-      #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
20651-      stbIF0( stbir__simdfX_load( o0, output0 );     stbir__simdfX_load( o1, output0+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output0+(2*stbir__simdfX_float_count) );    stbir__simdfX_load( o3, output0+(3*stbir__simdfX_float_count) );
20652-              stbir__simdfX_madd( o0, o0, r0, c0 );  stbir__simdfX_madd( o1, o1, r1, c0 );  stbir__simdfX_madd( o2, o2, r2, c0 );   stbir__simdfX_madd( o3, o3, r3, c0 );
20653-              stbir__simdfX_store( output0, o0 );    stbir__simdfX_store( output0+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output0+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output0+(3*stbir__simdfX_float_count), o3 ); )
20654-      stbIF1( stbir__simdfX_load( o0, output1 );     stbir__simdfX_load( o1, output1+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output1+(2*stbir__simdfX_float_count) );    stbir__simdfX_load( o3, output1+(3*stbir__simdfX_float_count) );
20655-              stbir__simdfX_madd( o0, o0, r0, c1 );  stbir__simdfX_madd( o1, o1, r1, c1 );  stbir__simdfX_madd( o2, o2, r2, c1 );   stbir__simdfX_madd( o3, o3, r3, c1 );
20656-              stbir__simdfX_store( output1, o0 );    stbir__simdfX_store( output1+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output1+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output1+(3*stbir__simdfX_float_count), o3 ); )
20657-      stbIF2( stbir__simdfX_load( o0, output2 );     stbir__simdfX_load( o1, output2+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output2+(2*stbir__simdfX_float_count) );    stbir__simdfX_load( o3, output2+(3*stbir__simdfX_float_count) );
20658-              stbir__simdfX_madd( o0, o0, r0, c2 );  stbir__simdfX_madd( o1, o1, r1, c2 );  stbir__simdfX_madd( o2, o2, r2, c2 );   stbir__simdfX_madd( o3, o3, r3, c2 );
20659-              stbir__simdfX_store( output2, o0 );    stbir__simdfX_store( output2+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output2+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output2+(3*stbir__simdfX_float_count), o3 ); )
20660-      stbIF3( stbir__simdfX_load( o0, output3 );     stbir__simdfX_load( o1, output3+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output3+(2*stbir__simdfX_float_count) );    stbir__simdfX_load( o3, output3+(3*stbir__simdfX_float_count) );
20661-              stbir__simdfX_madd( o0, o0, r0, c3 );  stbir__simdfX_madd( o1, o1, r1, c3 );  stbir__simdfX_madd( o2, o2, r2, c3 );   stbir__simdfX_madd( o3, o3, r3, c3 );
20662-              stbir__simdfX_store( output3, o0 );    stbir__simdfX_store( output3+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output3+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output3+(3*stbir__simdfX_float_count), o3 ); )
20663-      stbIF4( stbir__simdfX_load( o0, output4 );     stbir__simdfX_load( o1, output4+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output4+(2*stbir__simdfX_float_count) );    stbir__simdfX_load( o3, output4+(3*stbir__simdfX_float_count) );
20664-              stbir__simdfX_madd( o0, o0, r0, c4 );  stbir__simdfX_madd( o1, o1, r1, c4 );  stbir__simdfX_madd( o2, o2, r2, c4 );   stbir__simdfX_madd( o3, o3, r3, c4 );
20665-              stbir__simdfX_store( output4, o0 );    stbir__simdfX_store( output4+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output4+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output4+(3*stbir__simdfX_float_count), o3 ); )
20666-      stbIF5( stbir__simdfX_load( o0, output5 );     stbir__simdfX_load( o1, output5+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output5+(2*stbir__simdfX_float_count));    stbir__simdfX_load( o3, output5+(3*stbir__simdfX_float_count) );
20667-              stbir__simdfX_madd( o0, o0, r0, c5 );  stbir__simdfX_madd( o1, o1, r1, c5 );  stbir__simdfX_madd( o2, o2, r2, c5 );   stbir__simdfX_madd( o3, o3, r3, c5 );
20668-              stbir__simdfX_store( output5, o0 );    stbir__simdfX_store( output5+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output5+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output5+(3*stbir__simdfX_float_count), o3 ); )
20669-      stbIF6( stbir__simdfX_load( o0, output6 );     stbir__simdfX_load( o1, output6+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output6+(2*stbir__simdfX_float_count) );    stbir__simdfX_load( o3, output6+(3*stbir__simdfX_float_count) );
20670-              stbir__simdfX_madd( o0, o0, r0, c6 );  stbir__simdfX_madd( o1, o1, r1, c6 );  stbir__simdfX_madd( o2, o2, r2, c6 );   stbir__simdfX_madd( o3, o3, r3, c6 );
20671-              stbir__simdfX_store( output6, o0 );    stbir__simdfX_store( output6+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output6+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output6+(3*stbir__simdfX_float_count), o3 ); )
20672-      stbIF7( stbir__simdfX_load( o0, output7 );     stbir__simdfX_load( o1, output7+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output7+(2*stbir__simdfX_float_count) );    stbir__simdfX_load( o3, output7+(3*stbir__simdfX_float_count) );
20673-              stbir__simdfX_madd( o0, o0, r0, c7 );  stbir__simdfX_madd( o1, o1, r1, c7 );  stbir__simdfX_madd( o2, o2, r2, c7 );   stbir__simdfX_madd( o3, o3, r3, c7 );
20674-              stbir__simdfX_store( output7, o0 );    stbir__simdfX_store( output7+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output7+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output7+(3*stbir__simdfX_float_count), o3 ); )
20675-      #else
20676-      stbIF0( stbir__simdfX_mult( o0, r0, c0 );      stbir__simdfX_mult( o1, r1, c0 );      stbir__simdfX_mult( o2, r2, c0 );       stbir__simdfX_mult( o3, r3, c0 );
20677-              stbir__simdfX_store( output0, o0 );    stbir__simdfX_store( output0+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output0+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output0+(3*stbir__simdfX_float_count), o3 ); )
20678-      stbIF1( stbir__simdfX_mult( o0, r0, c1 );      stbir__simdfX_mult( o1, r1, c1 );      stbir__simdfX_mult( o2, r2, c1 );       stbir__simdfX_mult( o3, r3, c1 );
20679-              stbir__simdfX_store( output1, o0 );    stbir__simdfX_store( output1+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output1+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output1+(3*stbir__simdfX_float_count), o3 ); )
20680-      stbIF2( stbir__simdfX_mult( o0, r0, c2 );      stbir__simdfX_mult( o1, r1, c2 );      stbir__simdfX_mult( o2, r2, c2 );       stbir__simdfX_mult( o3, r3, c2 );
20681-              stbir__simdfX_store( output2, o0 );    stbir__simdfX_store( output2+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output2+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output2+(3*stbir__simdfX_float_count), o3 ); )
20682-      stbIF3( stbir__simdfX_mult( o0, r0, c3 );      stbir__simdfX_mult( o1, r1, c3 );      stbir__simdfX_mult( o2, r2, c3 );       stbir__simdfX_mult( o3, r3, c3 );
20683-              stbir__simdfX_store( output3, o0 );    stbir__simdfX_store( output3+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output3+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output3+(3*stbir__simdfX_float_count), o3 ); )
20684-      stbIF4( stbir__simdfX_mult( o0, r0, c4 );      stbir__simdfX_mult( o1, r1, c4 );      stbir__simdfX_mult( o2, r2, c4 );       stbir__simdfX_mult( o3, r3, c4 );
20685-              stbir__simdfX_store( output4, o0 );    stbir__simdfX_store( output4+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output4+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output4+(3*stbir__simdfX_float_count), o3 ); )
20686-      stbIF5( stbir__simdfX_mult( o0, r0, c5 );      stbir__simdfX_mult( o1, r1, c5 );      stbir__simdfX_mult( o2, r2, c5 );       stbir__simdfX_mult( o3, r3, c5 );
20687-              stbir__simdfX_store( output5, o0 );    stbir__simdfX_store( output5+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output5+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output5+(3*stbir__simdfX_float_count), o3 ); )
20688-      stbIF6( stbir__simdfX_mult( o0, r0, c6 );      stbir__simdfX_mult( o1, r1, c6 );      stbir__simdfX_mult( o2, r2, c6 );       stbir__simdfX_mult( o3, r3, c6 );
20689-              stbir__simdfX_store( output6, o0 );    stbir__simdfX_store( output6+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output6+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output6+(3*stbir__simdfX_float_count), o3 ); )
20690-      stbIF7( stbir__simdfX_mult( o0, r0, c7 );      stbir__simdfX_mult( o1, r1, c7 );      stbir__simdfX_mult( o2, r2, c7 );       stbir__simdfX_mult( o3, r3, c7 );
20691-              stbir__simdfX_store( output7, o0 );    stbir__simdfX_store( output7+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output7+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output7+(3*stbir__simdfX_float_count), o3 ); )
20692-      #endif
20693-
20694-      input += (4*stbir__simdfX_float_count);
20695-      stbIF0( output0 += (4*stbir__simdfX_float_count); ) stbIF1( output1 += (4*stbir__simdfX_float_count); ) stbIF2( output2 += (4*stbir__simdfX_float_count); ) stbIF3( output3 += (4*stbir__simdfX_float_count); ) stbIF4( output4 += (4*stbir__simdfX_float_count); ) stbIF5( output5 += (4*stbir__simdfX_float_count); ) stbIF6( output6 += (4*stbir__simdfX_float_count); ) stbIF7( output7 += (4*stbir__simdfX_float_count); )
20696-    }
20697-    STBIR_SIMD_NO_UNROLL_LOOP_START
20698-    while ( ( (char*)input_end - (char*) input ) >= 16 )
20699-    {
20700-      stbir__simdf o0, r0;
20701-      STBIR_SIMD_NO_UNROLL(output0);
20702-
20703-      stbir__simdf_load( r0, input );
20704-
20705-      #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
20706-      stbIF0( stbir__simdf_load( o0, output0 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c0 ) );  stbir__simdf_store( output0, o0 ); )
20707-      stbIF1( stbir__simdf_load( o0, output1 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c1 ) );  stbir__simdf_store( output1, o0 ); )
20708-      stbIF2( stbir__simdf_load( o0, output2 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c2 ) );  stbir__simdf_store( output2, o0 ); )
20709-      stbIF3( stbir__simdf_load( o0, output3 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c3 ) );  stbir__simdf_store( output3, o0 ); )
20710-      stbIF4( stbir__simdf_load( o0, output4 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c4 ) );  stbir__simdf_store( output4, o0 ); )
20711-      stbIF5( stbir__simdf_load( o0, output5 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c5 ) );  stbir__simdf_store( output5, o0 ); )
20712-      stbIF6( stbir__simdf_load( o0, output6 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c6 ) );  stbir__simdf_store( output6, o0 ); )
20713-      stbIF7( stbir__simdf_load( o0, output7 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c7 ) );  stbir__simdf_store( output7, o0 ); )
20714-      #else
20715-      stbIF0( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c0 ) );   stbir__simdf_store( output0, o0 ); )
20716-      stbIF1( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c1 ) );   stbir__simdf_store( output1, o0 ); )
20717-      stbIF2( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c2 ) );   stbir__simdf_store( output2, o0 ); )
20718-      stbIF3( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c3 ) );   stbir__simdf_store( output3, o0 ); )
20719-      stbIF4( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c4 ) );   stbir__simdf_store( output4, o0 ); )
20720-      stbIF5( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c5 ) );   stbir__simdf_store( output5, o0 ); )
20721-      stbIF6( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c6 ) );   stbir__simdf_store( output6, o0 ); )
20722-      stbIF7( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c7 ) );   stbir__simdf_store( output7, o0 ); )
20723-      #endif
20724-
20725-      input += 4;
20726-      stbIF0( output0 += 4; ) stbIF1( output1 += 4; ) stbIF2( output2 += 4; ) stbIF3( output3 += 4; ) stbIF4( output4 += 4; ) stbIF5( output5 += 4; ) stbIF6( output6 += 4; ) stbIF7( output7 += 4; )
20727-    }
20728-  }
20729-  #else
20730-  STBIR_NO_UNROLL_LOOP_START
20731-  while ( ( (char*)input_end - (char*) input ) >= 16 )
20732-  {
20733-    float r0, r1, r2, r3;
20734-    STBIR_NO_UNROLL(input);
20735-
20736-    r0 = input[0], r1 = input[1], r2 = input[2], r3 = input[3];
20737-
20738-    #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
20739-    stbIF0( output0[0] += ( r0 * c0s ); output0[1] += ( r1 * c0s ); output0[2] += ( r2 * c0s ); output0[3] += ( r3 * c0s ); )
20740-    stbIF1( output1[0] += ( r0 * c1s ); output1[1] += ( r1 * c1s ); output1[2] += ( r2 * c1s ); output1[3] += ( r3 * c1s ); )
20741-    stbIF2( output2[0] += ( r0 * c2s ); output2[1] += ( r1 * c2s ); output2[2] += ( r2 * c2s ); output2[3] += ( r3 * c2s ); )
20742-    stbIF3( output3[0] += ( r0 * c3s ); output3[1] += ( r1 * c3s ); output3[2] += ( r2 * c3s ); output3[3] += ( r3 * c3s ); )
20743-    stbIF4( output4[0] += ( r0 * c4s ); output4[1] += ( r1 * c4s ); output4[2] += ( r2 * c4s ); output4[3] += ( r3 * c4s ); )
20744-    stbIF5( output5[0] += ( r0 * c5s ); output5[1] += ( r1 * c5s ); output5[2] += ( r2 * c5s ); output5[3] += ( r3 * c5s ); )
20745-    stbIF6( output6[0] += ( r0 * c6s ); output6[1] += ( r1 * c6s ); output6[2] += ( r2 * c6s ); output6[3] += ( r3 * c6s ); )
20746-    stbIF7( output7[0] += ( r0 * c7s ); output7[1] += ( r1 * c7s ); output7[2] += ( r2 * c7s ); output7[3] += ( r3 * c7s ); )
20747-    #else
20748-    stbIF0( output0[0]  = ( r0 * c0s ); output0[1]  = ( r1 * c0s ); output0[2]  = ( r2 * c0s ); output0[3]  = ( r3 * c0s ); )
20749-    stbIF1( output1[0]  = ( r0 * c1s ); output1[1]  = ( r1 * c1s ); output1[2]  = ( r2 * c1s ); output1[3]  = ( r3 * c1s ); )
20750-    stbIF2( output2[0]  = ( r0 * c2s ); output2[1]  = ( r1 * c2s ); output2[2]  = ( r2 * c2s ); output2[3]  = ( r3 * c2s ); )
20751-    stbIF3( output3[0]  = ( r0 * c3s ); output3[1]  = ( r1 * c3s ); output3[2]  = ( r2 * c3s ); output3[3]  = ( r3 * c3s ); )
20752-    stbIF4( output4[0]  = ( r0 * c4s ); output4[1]  = ( r1 * c4s ); output4[2]  = ( r2 * c4s ); output4[3]  = ( r3 * c4s ); )
20753-    stbIF5( output5[0]  = ( r0 * c5s ); output5[1]  = ( r1 * c5s ); output5[2]  = ( r2 * c5s ); output5[3]  = ( r3 * c5s ); )
20754-    stbIF6( output6[0]  = ( r0 * c6s ); output6[1]  = ( r1 * c6s ); output6[2]  = ( r2 * c6s ); output6[3]  = ( r3 * c6s ); )
20755-    stbIF7( output7[0]  = ( r0 * c7s ); output7[1]  = ( r1 * c7s ); output7[2]  = ( r2 * c7s ); output7[3]  = ( r3 * c7s ); )
20756-    #endif
20757-
20758-    input += 4;
20759-    stbIF0( output0 += 4; ) stbIF1( output1 += 4; ) stbIF2( output2 += 4; ) stbIF3( output3 += 4; ) stbIF4( output4 += 4; ) stbIF5( output5 += 4; ) stbIF6( output6 += 4; ) stbIF7( output7 += 4; )
20760-  }
20761-  #endif
20762-  STBIR_NO_UNROLL_LOOP_START
20763-  while ( input < input_end )
20764-  {
20765-    float r = input[0];
20766-    STBIR_NO_UNROLL(output0);
20767-
20768-    #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
20769-    stbIF0( output0[0] += ( r * c0s ); )
20770-    stbIF1( output1[0] += ( r * c1s ); )
20771-    stbIF2( output2[0] += ( r * c2s ); )
20772-    stbIF3( output3[0] += ( r * c3s ); )
20773-    stbIF4( output4[0] += ( r * c4s ); )
20774-    stbIF5( output5[0] += ( r * c5s ); )
20775-    stbIF6( output6[0] += ( r * c6s ); )
20776-    stbIF7( output7[0] += ( r * c7s ); )
20777-    #else
20778-    stbIF0( output0[0]  = ( r * c0s ); )
20779-    stbIF1( output1[0]  = ( r * c1s ); )
20780-    stbIF2( output2[0]  = ( r * c2s ); )
20781-    stbIF3( output3[0]  = ( r * c3s ); )
20782-    stbIF4( output4[0]  = ( r * c4s ); )
20783-    stbIF5( output5[0]  = ( r * c5s ); )
20784-    stbIF6( output6[0]  = ( r * c6s ); )
20785-    stbIF7( output7[0]  = ( r * c7s ); )
20786-    #endif
20787-
20788-    ++input;
20789-    stbIF0( ++output0; ) stbIF1( ++output1; ) stbIF2( ++output2; ) stbIF3( ++output3; ) stbIF4( ++output4; ) stbIF5( ++output5; ) stbIF6( ++output6; ) stbIF7( ++output7; )
20790-  }
20791-}
20792-
20793-static void STBIR_chans( stbir__vertical_gather_with_,_coeffs)( float * outputp, float const * vertical_coefficients, float const ** inputs, float const * input0_end )
20794-{
20795-  float STBIR_SIMD_STREAMOUT_PTR( * ) output = outputp;
20796-
20797-  stbIF0( float const * input0 = inputs[0]; float c0s = vertical_coefficients[0]; )
20798-  stbIF1( float const * input1 = inputs[1]; float c1s = vertical_coefficients[1]; )
20799-  stbIF2( float const * input2 = inputs[2]; float c2s = vertical_coefficients[2]; )
20800-  stbIF3( float const * input3 = inputs[3]; float c3s = vertical_coefficients[3]; )
20801-  stbIF4( float const * input4 = inputs[4]; float c4s = vertical_coefficients[4]; )
20802-  stbIF5( float const * input5 = inputs[5]; float c5s = vertical_coefficients[5]; )
20803-  stbIF6( float const * input6 = inputs[6]; float c6s = vertical_coefficients[6]; )
20804-  stbIF7( float const * input7 = inputs[7]; float c7s = vertical_coefficients[7]; )
20805-
20806-#if ( STBIR__vertical_channels == 1 ) && !defined(STB_IMAGE_RESIZE_VERTICAL_CONTINUE)
20807-  // check single channel one weight
20808-  if ( ( c0s >= (1.0f-0.000001f) ) && ( c0s <= (1.0f+0.000001f) ) )
20809-  {
20810-    STBIR_MEMCPY( output, input0, (char*)input0_end - (char*)input0 );
20811-    return;
20812-  }
20813-#endif
20814-
20815-  #ifdef STBIR_SIMD
20816-  {
20817-    stbIF0(stbir__simdfX c0 = stbir__simdf_frepX( c0s ); )
20818-    stbIF1(stbir__simdfX c1 = stbir__simdf_frepX( c1s ); )
20819-    stbIF2(stbir__simdfX c2 = stbir__simdf_frepX( c2s ); )
20820-    stbIF3(stbir__simdfX c3 = stbir__simdf_frepX( c3s ); )
20821-    stbIF4(stbir__simdfX c4 = stbir__simdf_frepX( c4s ); )
20822-    stbIF5(stbir__simdfX c5 = stbir__simdf_frepX( c5s ); )
20823-    stbIF6(stbir__simdfX c6 = stbir__simdf_frepX( c6s ); )
20824-    stbIF7(stbir__simdfX c7 = stbir__simdf_frepX( c7s ); )
20825-
20826-    STBIR_SIMD_NO_UNROLL_LOOP_START
20827-    while ( ( (char*)input0_end - (char*) input0 ) >= (16*stbir__simdfX_float_count) )
20828-    {
20829-      stbir__simdfX o0, o1, o2, o3, r0, r1, r2, r3;
20830-      STBIR_SIMD_NO_UNROLL(output);
20831-
20832-      // prefetch four loop iterations ahead (doesn't affect much for small resizes, but helps with big ones)
20833-      stbIF0( stbir__prefetch( input0 + (16*stbir__simdfX_float_count) ); )
20834-      stbIF1( stbir__prefetch( input1 + (16*stbir__simdfX_float_count) ); )
20835-      stbIF2( stbir__prefetch( input2 + (16*stbir__simdfX_float_count) ); )
20836-      stbIF3( stbir__prefetch( input3 + (16*stbir__simdfX_float_count) ); )
20837-      stbIF4( stbir__prefetch( input4 + (16*stbir__simdfX_float_count) ); )
20838-      stbIF5( stbir__prefetch( input5 + (16*stbir__simdfX_float_count) ); )
20839-      stbIF6( stbir__prefetch( input6 + (16*stbir__simdfX_float_count) ); )
20840-      stbIF7( stbir__prefetch( input7 + (16*stbir__simdfX_float_count) ); )
20841-
20842-      #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
20843-      stbIF0( stbir__simdfX_load( o0, output );      stbir__simdfX_load( o1, output+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output+(2*stbir__simdfX_float_count) );   stbir__simdfX_load( o3, output+(3*stbir__simdfX_float_count) );
20844-              stbir__simdfX_load( r0, input0 );      stbir__simdfX_load( r1, input0+stbir__simdfX_float_count );   stbir__simdfX_load( r2, input0+(2*stbir__simdfX_float_count) );   stbir__simdfX_load( r3, input0+(3*stbir__simdfX_float_count) );
20845-              stbir__simdfX_madd( o0, o0, r0, c0 );  stbir__simdfX_madd( o1, o1, r1, c0 );                         stbir__simdfX_madd( o2, o2, r2, c0 );                             stbir__simdfX_madd( o3, o3, r3, c0 ); )
20846-      #else
20847-      stbIF0( stbir__simdfX_load( r0, input0 );      stbir__simdfX_load( r1, input0+stbir__simdfX_float_count );   stbir__simdfX_load( r2, input0+(2*stbir__simdfX_float_count) );   stbir__simdfX_load( r3, input0+(3*stbir__simdfX_float_count) );
20848-              stbir__simdfX_mult( o0, r0, c0 );      stbir__simdfX_mult( o1, r1, c0 );                             stbir__simdfX_mult( o2, r2, c0 );                                 stbir__simdfX_mult( o3, r3, c0 );  )
20849-      #endif
20850-
20851-      stbIF1( stbir__simdfX_load( r0, input1 );      stbir__simdfX_load( r1, input1+stbir__simdfX_float_count );   stbir__simdfX_load( r2, input1+(2*stbir__simdfX_float_count) );   stbir__simdfX_load( r3, input1+(3*stbir__simdfX_float_count) );
20852-              stbir__simdfX_madd( o0, o0, r0, c1 );  stbir__simdfX_madd( o1, o1, r1, c1 );                         stbir__simdfX_madd( o2, o2, r2, c1 );                             stbir__simdfX_madd( o3, o3, r3, c1 ); )
20853-      stbIF2( stbir__simdfX_load( r0, input2 );      stbir__simdfX_load( r1, input2+stbir__simdfX_float_count );   stbir__simdfX_load( r2, input2+(2*stbir__simdfX_float_count) );   stbir__simdfX_load( r3, input2+(3*stbir__simdfX_float_count) );
20854-              stbir__simdfX_madd( o0, o0, r0, c2 );  stbir__simdfX_madd( o1, o1, r1, c2 );                         stbir__simdfX_madd( o2, o2, r2, c2 );                             stbir__simdfX_madd( o3, o3, r3, c2 ); )
20855-      stbIF3( stbir__simdfX_load( r0, input3 );      stbir__simdfX_load( r1, input3+stbir__simdfX_float_count );   stbir__simdfX_load( r2, input3+(2*stbir__simdfX_float_count) );   stbir__simdfX_load( r3, input3+(3*stbir__simdfX_float_count) );
20856-              stbir__simdfX_madd( o0, o0, r0, c3 );  stbir__simdfX_madd( o1, o1, r1, c3 );                         stbir__simdfX_madd( o2, o2, r2, c3 );                             stbir__simdfX_madd( o3, o3, r3, c3 ); )
20857-      stbIF4( stbir__simdfX_load( r0, input4 );      stbir__simdfX_load( r1, input4+stbir__simdfX_float_count );   stbir__simdfX_load( r2, input4+(2*stbir__simdfX_float_count) );   stbir__simdfX_load( r3, input4+(3*stbir__simdfX_float_count) );
20858-              stbir__simdfX_madd( o0, o0, r0, c4 );  stbir__simdfX_madd( o1, o1, r1, c4 );                         stbir__simdfX_madd( o2, o2, r2, c4 );                             stbir__simdfX_madd( o3, o3, r3, c4 ); )
20859-      stbIF5( stbir__simdfX_load( r0, input5 );      stbir__simdfX_load( r1, input5+stbir__simdfX_float_count );   stbir__simdfX_load( r2, input5+(2*stbir__simdfX_float_count) );   stbir__simdfX_load( r3, input5+(3*stbir__simdfX_float_count) );
20860-              stbir__simdfX_madd( o0, o0, r0, c5 );  stbir__simdfX_madd( o1, o1, r1, c5 );                         stbir__simdfX_madd( o2, o2, r2, c5 );                             stbir__simdfX_madd( o3, o3, r3, c5 ); )
20861-      stbIF6( stbir__simdfX_load( r0, input6 );      stbir__simdfX_load( r1, input6+stbir__simdfX_float_count );   stbir__simdfX_load( r2, input6+(2*stbir__simdfX_float_count) );   stbir__simdfX_load( r3, input6+(3*stbir__simdfX_float_count) );
20862-              stbir__simdfX_madd( o0, o0, r0, c6 );  stbir__simdfX_madd( o1, o1, r1, c6 );                         stbir__simdfX_madd( o2, o2, r2, c6 );                             stbir__simdfX_madd( o3, o3, r3, c6 ); )
20863-      stbIF7( stbir__simdfX_load( r0, input7 );      stbir__simdfX_load( r1, input7+stbir__simdfX_float_count );   stbir__simdfX_load( r2, input7+(2*stbir__simdfX_float_count) );   stbir__simdfX_load( r3, input7+(3*stbir__simdfX_float_count) );
20864-              stbir__simdfX_madd( o0, o0, r0, c7 );  stbir__simdfX_madd( o1, o1, r1, c7 );                         stbir__simdfX_madd( o2, o2, r2, c7 );                             stbir__simdfX_madd( o3, o3, r3, c7 ); )
20865-
20866-      stbir__simdfX_store( output, o0 );             stbir__simdfX_store( output+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output+(2*stbir__simdfX_float_count), o2 );  stbir__simdfX_store( output+(3*stbir__simdfX_float_count), o3 );
20867-      output += (4*stbir__simdfX_float_count);
20868-      stbIF0( input0 += (4*stbir__simdfX_float_count); ) stbIF1( input1 += (4*stbir__simdfX_float_count); ) stbIF2( input2 += (4*stbir__simdfX_float_count); ) stbIF3( input3 += (4*stbir__simdfX_float_count); ) stbIF4( input4 += (4*stbir__simdfX_float_count); ) stbIF5( input5 += (4*stbir__simdfX_float_count); ) stbIF6( input6 += (4*stbir__simdfX_float_count); ) stbIF7( input7 += (4*stbir__simdfX_float_count); )
20869-    }
20870-
20871-    STBIR_SIMD_NO_UNROLL_LOOP_START
20872-    while ( ( (char*)input0_end - (char*) input0 ) >= 16 )
20873-    {
20874-      stbir__simdf o0, r0;
20875-      STBIR_SIMD_NO_UNROLL(output);
20876-
20877-      #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
20878-      stbIF0( stbir__simdf_load( o0, output );   stbir__simdf_load( r0, input0 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c0 ) ); )
20879-      #else
20880-      stbIF0( stbir__simdf_load( r0, input0 );  stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c0 ) ); )
20881-      #endif
20882-      stbIF1( stbir__simdf_load( r0, input1 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c1 ) ); )
20883-      stbIF2( stbir__simdf_load( r0, input2 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c2 ) ); )
20884-      stbIF3( stbir__simdf_load( r0, input3 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c3 ) ); )
20885-      stbIF4( stbir__simdf_load( r0, input4 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c4 ) ); )
20886-      stbIF5( stbir__simdf_load( r0, input5 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c5 ) ); )
20887-      stbIF6( stbir__simdf_load( r0, input6 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c6 ) ); )
20888-      stbIF7( stbir__simdf_load( r0, input7 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c7 ) ); )
20889-
20890-      stbir__simdf_store( output, o0 );
20891-      output += 4;
20892-      stbIF0( input0 += 4; ) stbIF1( input1 += 4; ) stbIF2( input2 += 4; ) stbIF3( input3 += 4; ) stbIF4( input4 += 4; ) stbIF5( input5 += 4; ) stbIF6( input6 += 4; ) stbIF7( input7 += 4; )
20893-    }
20894-  }
20895-  #else
20896-  STBIR_NO_UNROLL_LOOP_START
20897-  while ( ( (char*)input0_end - (char*) input0 ) >= 16 )
20898-  {
20899-    float o0, o1, o2, o3;
20900-    STBIR_NO_UNROLL(output);
20901-    #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
20902-    stbIF0( o0 = output[0] + input0[0] * c0s; o1 = output[1] + input0[1] * c0s; o2 = output[2] + input0[2] * c0s; o3 = output[3] + input0[3] * c0s; )
20903-    #else
20904-    stbIF0( o0  = input0[0] * c0s; o1  = input0[1] * c0s; o2  = input0[2] * c0s; o3  = input0[3] * c0s; )
20905-    #endif
20906-    stbIF1( o0 += input1[0] * c1s; o1 += input1[1] * c1s; o2 += input1[2] * c1s; o3 += input1[3] * c1s; )
20907-    stbIF2( o0 += input2[0] * c2s; o1 += input2[1] * c2s; o2 += input2[2] * c2s; o3 += input2[3] * c2s; )
20908-    stbIF3( o0 += input3[0] * c3s; o1 += input3[1] * c3s; o2 += input3[2] * c3s; o3 += input3[3] * c3s; )
20909-    stbIF4( o0 += input4[0] * c4s; o1 += input4[1] * c4s; o2 += input4[2] * c4s; o3 += input4[3] * c4s; )
20910-    stbIF5( o0 += input5[0] * c5s; o1 += input5[1] * c5s; o2 += input5[2] * c5s; o3 += input5[3] * c5s; )
20911-    stbIF6( o0 += input6[0] * c6s; o1 += input6[1] * c6s; o2 += input6[2] * c6s; o3 += input6[3] * c6s; )
20912-    stbIF7( o0 += input7[0] * c7s; o1 += input7[1] * c7s; o2 += input7[2] * c7s; o3 += input7[3] * c7s; )
20913-    output[0] = o0; output[1] = o1; output[2] = o2; output[3] = o3;
20914-    output += 4;
20915-    stbIF0( input0 += 4; ) stbIF1( input1 += 4; ) stbIF2( input2 += 4; ) stbIF3( input3 += 4; ) stbIF4( input4 += 4; ) stbIF5( input5 += 4; ) stbIF6( input6 += 4; ) stbIF7( input7 += 4; )
20916-  }
20917-  #endif
20918-  STBIR_NO_UNROLL_LOOP_START
20919-  while ( input0 < input0_end )
20920-  {
20921-    float o0;
20922-    STBIR_NO_UNROLL(output);
20923-    #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
20924-    stbIF0( o0 = output[0] + input0[0] * c0s; )
20925-    #else
20926-    stbIF0( o0  = input0[0] * c0s; )
20927-    #endif
20928-    stbIF1( o0 += input1[0] * c1s; )
20929-    stbIF2( o0 += input2[0] * c2s; )
20930-    stbIF3( o0 += input3[0] * c3s; )
20931-    stbIF4( o0 += input4[0] * c4s; )
20932-    stbIF5( o0 += input5[0] * c5s; )
20933-    stbIF6( o0 += input6[0] * c6s; )
20934-    stbIF7( o0 += input7[0] * c7s; )
20935-    output[0] = o0;
20936-    ++output;
20937-    stbIF0( ++input0; ) stbIF1( ++input1; ) stbIF2( ++input2; ) stbIF3( ++input3; ) stbIF4( ++input4; ) stbIF5( ++input5; ) stbIF6( ++input6; ) stbIF7( ++input7; )
20938-  }
20939+		stbIF0(output0[0] = (r * c0s);) stbIF1(output1[0] = (r * c1s);)
20940+		    stbIF2(output2[0] = (r * c2s);) stbIF3(output3[0] = (r * c3s);)
20941+		        stbIF4(output4[0] = (r * c4s);) stbIF5(output5[0] = (r * c5s);)
20942+		            stbIF6(output6[0] = (r * c6s);)
20943+		                stbIF7(output7[0] = (r * c7s);)
20944+#endif
20945+
20946+		                        ++ input;
20947+		stbIF0(++output0;) stbIF1(++output1;) stbIF2(++output2;)
20948+		    stbIF3(++output3;) stbIF4(++output4;) stbIF5(++output5;)
20949+		        stbIF6(++output6;) stbIF7(++output7;)
20950+	}
20951+}
20952+
20953+static void
20954+STBIR_chans(stbir__vertical_gather_with_,
20955+            _coeffs)(float *outputp,
20956+                     float const *vertical_coefficients,
20957+                     float const **inputs,
20958+                     float const *input0_end)
20959+{
20960+	float STBIR_SIMD_STREAMOUT_PTR(*) output = outputp;
20961+
20962+	stbIF0(float const *input0 = inputs[0];
20963+	       float c0s = vertical_coefficients[0];)
20964+	    stbIF1(float const *input1 = inputs[1];
20965+	           float c1s = vertical_coefficients[1];)
20966+	        stbIF2(float const *input2 = inputs[2];
20967+	               float c2s = vertical_coefficients[2];)
20968+	            stbIF3(float const *input3 = inputs[3];
20969+	                   float c3s = vertical_coefficients[3];)
20970+	                stbIF4(float const *input4 = inputs[4];
20971+	                       float c4s = vertical_coefficients[4];)
20972+	                    stbIF5(float const *input5 = inputs[5];
20973+	                           float c5s = vertical_coefficients[5];)
20974+	                        stbIF6(float const *input6 = inputs[6];
20975+	                               float c6s = vertical_coefficients[6];)
20976+	                            stbIF7(float const *input7 = inputs[7];
20977+	                                   float c7s = vertical_coefficients[7];)
20978+
20979+#if (STBIR__vertical_channels == 1) &&                                         \
20980+    !defined(STB_IMAGE_RESIZE_VERTICAL_CONTINUE)
20981+	    // check single channel one weight
20982+	    if ((c0s >= (1.0f - 0.000001f)) && (c0s <= (1.0f + 0.000001f)))
20983+	{
20984+		STBIR_MEMCPY(output, input0, (char *)input0_end - (char *)input0);
20985+		return;
20986+	}
20987+#endif
20988+
20989+#ifdef STBIR_SIMD
20990+	{
20991+		stbIF0(stbir__simdfX c0 = stbir__simdf_frepX(c0s);)
20992+		    stbIF1(stbir__simdfX c1 = stbir__simdf_frepX(c1s);)
20993+		        stbIF2(stbir__simdfX c2 = stbir__simdf_frepX(c2s);) stbIF3(
20994+		            stbir__simdfX c3 = stbir__simdf_frepX(c3s);)
20995+		            stbIF4(stbir__simdfX c4 = stbir__simdf_frepX(c4s);) stbIF5(
20996+		                stbir__simdfX c5 = stbir__simdf_frepX(c5s);)
20997+		                stbIF6(stbir__simdfX c6 = stbir__simdf_frepX(c6s);)
20998+		                    stbIF7(stbir__simdfX c7 = stbir__simdf_frepX(c7s);)
20999+
21000+		                        STBIR_SIMD_NO_UNROLL_LOOP_START while (
21001+		                            ((char *)input0_end - (char *)input0) >=
21002+		                            (16 * stbir__simdfX_float_count))
21003+		{
21004+			stbir__simdfX o0, o1, o2, o3, r0, r1, r2, r3;
21005+			STBIR_SIMD_NO_UNROLL(output);
21006+
21007+			// prefetch four loop iterations ahead (doesn't affect much for
21008+			// small resizes, but helps with big ones)
21009+			stbIF0(stbir__prefetch(input0 + (16 * stbir__simdfX_float_count));) stbIF1(
21010+			    stbir__prefetch(input1 + (16 * stbir__simdfX_float_count));)
21011+			    stbIF2(stbir__prefetch(input2 + (16 * stbir__simdfX_float_count));) stbIF3(
21012+			        stbir__prefetch(input3 + (16 * stbir__simdfX_float_count));)
21013+			        stbIF4(stbir__prefetch(input4 + (16 * stbir__simdfX_float_count));) stbIF5(
21014+			            stbir__prefetch(input5 +
21015+			                            (16 * stbir__simdfX_float_count));)
21016+			            stbIF6(stbir__prefetch(input6 + (16 * stbir__simdfX_float_count));) stbIF7(
21017+			                stbir__prefetch(input7 +
21018+			                                (16 * stbir__simdfX_float_count));)
21019+
21020+#ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
21021+			                stbIF0(
21022+			                    stbir__simdfX_load(o0, output);
21023+			                    stbir__simdfX_load(
21024+			                        o1, output + stbir__simdfX_float_count);
21025+			                    stbir__simdfX_load(
21026+			                        o2,
21027+			                        output + (2 * stbir__simdfX_float_count));
21028+			                    stbir__simdfX_load(
21029+			                        o3,
21030+			                        output + (3 * stbir__simdfX_float_count));
21031+			                    stbir__simdfX_load(r0, input0);
21032+			                    stbir__simdfX_load(
21033+			                        r1, input0 + stbir__simdfX_float_count);
21034+			                    stbir__simdfX_load(
21035+			                        r2,
21036+			                        input0 + (2 * stbir__simdfX_float_count));
21037+			                    stbir__simdfX_load(
21038+			                        r3,
21039+			                        input0 + (3 * stbir__simdfX_float_count));
21040+			                    stbir__simdfX_madd(o0, o0, r0, c0);
21041+			                    stbir__simdfX_madd(o1, o1, r1, c0);
21042+			                    stbir__simdfX_madd(o2, o2, r2, c0);
21043+			                    stbir__simdfX_madd(o3, o3, r3, c0);)
21044+#else
21045+			                stbIF0(
21046+			                    stbir__simdfX_load(r0, input0);
21047+			                    stbir__simdfX_load(
21048+			                        r1, input0 + stbir__simdfX_float_count);
21049+			                    stbir__simdfX_load(
21050+			                        r2,
21051+			                        input0 + (2 * stbir__simdfX_float_count));
21052+			                    stbir__simdfX_load(
21053+			                        r3,
21054+			                        input0 + (3 * stbir__simdfX_float_count));
21055+			                    stbir__simdfX_mult(o0, r0, c0);
21056+			                    stbir__simdfX_mult(o1, r1, c0);
21057+			                    stbir__simdfX_mult(o2, r2, c0);
21058+			                    stbir__simdfX_mult(o3, r3, c0);)
21059+#endif
21060+
21061+			                    stbIF1(
21062+			                        stbir__simdfX_load(r0, input1);
21063+			                        stbir__simdfX_load(
21064+			                            r1, input1 + stbir__simdfX_float_count);
21065+			                        stbir__simdfX_load(
21066+			                            r2,
21067+			                            input1 +
21068+			                                (2 * stbir__simdfX_float_count));
21069+			                        stbir__simdfX_load(
21070+			                            r3,
21071+			                            input1 +
21072+			                                (3 * stbir__simdfX_float_count));
21073+			                        stbir__simdfX_madd(o0, o0, r0, c1);
21074+			                        stbir__simdfX_madd(o1, o1, r1, c1);
21075+			                        stbir__simdfX_madd(o2, o2, r2, c1);
21076+			                        stbir__simdfX_madd(
21077+			                            o3,
21078+			                            o3,
21079+			                            r3,
21080+			                            c1);) stbIF2(stbir__simdfX_load(r0,
21081+			                                                            input2);
21082+			                                         stbir__simdfX_load(
21083+			                                             r1,
21084+			                                             input2 +
21085+			                                                 stbir__simdfX_float_count);
21086+			                                         stbir__simdfX_load(
21087+			                                             r2,
21088+			                                             input2 +
21089+			                                                 (2 *
21090+			                                                  stbir__simdfX_float_count));
21091+			                                         stbir__simdfX_load(
21092+			                                             r3,
21093+			                                             input2 +
21094+			                                                 (3 *
21095+			                                                  stbir__simdfX_float_count));
21096+			                                         stbir__simdfX_madd(
21097+			                                             o0, o0, r0, c2);
21098+			                                         stbir__simdfX_madd(
21099+			                                             o1, o1, r1, c2);
21100+			                                         stbir__simdfX_madd(
21101+			                                             o2, o2, r2, c2);
21102+			                                         stbir__simdfX_madd(
21103+			                                             o3, o3, r3, c2);)
21104+			                        stbIF3(
21105+			                            stbir__simdfX_load(r0, input3);
21106+			                            stbir__simdfX_load(
21107+			                                r1,
21108+			                                input3 + stbir__simdfX_float_count);
21109+			                            stbir__simdfX_load(
21110+			                                r2,
21111+			                                input3 +
21112+			                                    (2 *
21113+			                                     stbir__simdfX_float_count));
21114+			                            stbir__simdfX_load(
21115+			                                r3,
21116+			                                input3 +
21117+			                                    (3 *
21118+			                                     stbir__simdfX_float_count));
21119+			                            stbir__simdfX_madd(o0, o0, r0, c3);
21120+			                            stbir__simdfX_madd(o1, o1, r1, c3);
21121+			                            stbir__simdfX_madd(o2, o2, r2, c3);
21122+			                            stbir__simdfX_madd(o3, o3, r3, c3);)
21123+			                            stbIF4(
21124+			                                stbir__simdfX_load(r0, input4);
21125+			                                stbir__simdfX_load(
21126+			                                    r1,
21127+			                                    input4 +
21128+			                                        stbir__simdfX_float_count);
21129+			                                stbir__simdfX_load(
21130+			                                    r2,
21131+			                                    input4 +
21132+			                                        (2 *
21133+			                                         stbir__simdfX_float_count));
21134+			                                stbir__simdfX_load(
21135+			                                    r3,
21136+			                                    input4 +
21137+			                                        (3 *
21138+			                                         stbir__simdfX_float_count));
21139+			                                stbir__simdfX_madd(o0, o0, r0, c4);
21140+			                                stbir__simdfX_madd(o1, o1, r1, c4);
21141+			                                stbir__simdfX_madd(o2, o2, r2, c4);
21142+			                                stbir__simdfX_madd(o3, o3, r3, c4);)
21143+			                                stbIF5(
21144+			                                    stbir__simdfX_load(r0, input5);
21145+			                                    stbir__simdfX_load(
21146+			                                        r1,
21147+			                                        input5 +
21148+			                                            stbir__simdfX_float_count);
21149+			                                    stbir__simdfX_load(
21150+			                                        r2,
21151+			                                        input5 +
21152+			                                            (2 *
21153+			                                             stbir__simdfX_float_count));
21154+			                                    stbir__simdfX_load(
21155+			                                        r3,
21156+			                                        input5 +
21157+			                                            (3 *
21158+			                                             stbir__simdfX_float_count));
21159+			                                    stbir__simdfX_madd(
21160+			                                        o0, o0, r0, c5);
21161+			                                    stbir__simdfX_madd(
21162+			                                        o1, o1, r1, c5);
21163+			                                    stbir__simdfX_madd(
21164+			                                        o2, o2, r2, c5);
21165+			                                    stbir__simdfX_madd(
21166+			                                        o3, o3, r3, c5);)
21167+			                                    stbIF6(
21168+			                                        stbir__simdfX_load(r0,
21169+			                                                           input6);
21170+			                                        stbir__simdfX_load(
21171+			                                            r1,
21172+			                                            input6 +
21173+			                                                stbir__simdfX_float_count);
21174+			                                        stbir__simdfX_load(
21175+			                                            r2,
21176+			                                            input6 +
21177+			                                                (2 *
21178+			                                                 stbir__simdfX_float_count));
21179+			                                        stbir__simdfX_load(
21180+			                                            r3,
21181+			                                            input6 +
21182+			                                                (3 *
21183+			                                                 stbir__simdfX_float_count));
21184+			                                        stbir__simdfX_madd(
21185+			                                            o0, o0, r0, c6);
21186+			                                        stbir__simdfX_madd(
21187+			                                            o1, o1, r1, c6);
21188+			                                        stbir__simdfX_madd(
21189+			                                            o2, o2, r2, c6);
21190+			                                        stbir__simdfX_madd(
21191+			                                            o3, o3, r3, c6);)
21192+			                                        stbIF7(
21193+			                                            stbir__simdfX_load(
21194+			                                                r0, input7);
21195+			                                            stbir__simdfX_load(
21196+			                                                r1,
21197+			                                                input7 +
21198+			                                                    stbir__simdfX_float_count);
21199+			                                            stbir__simdfX_load(
21200+			                                                r2,
21201+			                                                input7 +
21202+			                                                    (2 *
21203+			                                                     stbir__simdfX_float_count));
21204+			                                            stbir__simdfX_load(
21205+			                                                r3,
21206+			                                                input7 +
21207+			                                                    (3 *
21208+			                                                     stbir__simdfX_float_count));
21209+			                                            stbir__simdfX_madd(
21210+			                                                o0, o0, r0, c7);
21211+			                                            stbir__simdfX_madd(
21212+			                                                o1, o1, r1, c7);
21213+			                                            stbir__simdfX_madd(
21214+			                                                o2, o2, r2, c7);
21215+			                                            stbir__simdfX_madd(
21216+			                                                o3, o3, r3, c7);)
21217+
21218+			                                            stbir__simdfX_store(
21219+			                                                output, o0);
21220+			stbir__simdfX_store(output + stbir__simdfX_float_count, o1);
21221+			stbir__simdfX_store(output + (2 * stbir__simdfX_float_count), o2);
21222+			stbir__simdfX_store(output + (3 * stbir__simdfX_float_count), o3);
21223+			output += (4 * stbir__simdfX_float_count);
21224+			stbIF0(input0 += (4 * stbir__simdfX_float_count);) stbIF1(
21225+			    input1 += (4 * stbir__simdfX_float_count);)
21226+			    stbIF2(input2 += (4 * stbir__simdfX_float_count);) stbIF3(
21227+			        input3 += (4 * stbir__simdfX_float_count);)
21228+			        stbIF4(input4 += (4 * stbir__simdfX_float_count);) stbIF5(
21229+			            input5 += (4 * stbir__simdfX_float_count);)
21230+			            stbIF6(input6 += (4 * stbir__simdfX_float_count);)
21231+			                stbIF7(input7 += (4 * stbir__simdfX_float_count);)
21232+		}
21233+
21234+		STBIR_SIMD_NO_UNROLL_LOOP_START
21235+		while (((char *)input0_end - (char *)input0) >= 16) {
21236+			stbir__simdf o0, r0;
21237+			STBIR_SIMD_NO_UNROLL(output);
21238+
21239+#ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
21240+			stbIF0(stbir__simdf_load(o0, output); stbir__simdf_load(r0, input0);
21241+			       stbir__simdf_madd(
21242+			           o0, o0, r0, stbir__if_simdf8_cast_to_simdf4(c0));)
21243+#else
21244+			stbIF0(stbir__simdf_load(r0, input0); stbir__simdf_mult(
21245+			           o0, r0, stbir__if_simdf8_cast_to_simdf4(c0));)
21246+#endif
21247+			    stbIF1(stbir__simdf_load(r0, input1); stbir__simdf_madd(
21248+			               o0, o0, r0, stbir__if_simdf8_cast_to_simdf4(c1));)
21249+			        stbIF2(
21250+			            stbir__simdf_load(r0, input2); stbir__simdf_madd(
21251+			                o0, o0, r0, stbir__if_simdf8_cast_to_simdf4(c2));)
21252+			            stbIF3(stbir__simdf_load(r0, input3); stbir__simdf_madd(
21253+			                       o0,
21254+			                       o0,
21255+			                       r0,
21256+			                       stbir__if_simdf8_cast_to_simdf4(c3));)
21257+			                stbIF4(stbir__simdf_load(r0, input4);
21258+			                       stbir__simdf_madd(
21259+			                           o0,
21260+			                           o0,
21261+			                           r0,
21262+			                           stbir__if_simdf8_cast_to_simdf4(c4));)
21263+			                    stbIF5(
21264+			                        stbir__simdf_load(r0, input5);
21265+			                        stbir__simdf_madd(
21266+			                            o0,
21267+			                            o0,
21268+			                            r0,
21269+			                            stbir__if_simdf8_cast_to_simdf4(c5));)
21270+			                        stbIF6(stbir__simdf_load(r0, input6);
21271+			                               stbir__simdf_madd(
21272+			                                   o0,
21273+			                                   o0,
21274+			                                   r0,
21275+			                                   stbir__if_simdf8_cast_to_simdf4(
21276+			                                       c6));)
21277+			                            stbIF7(
21278+			                                stbir__simdf_load(r0, input7);
21279+			                                stbir__simdf_madd(
21280+			                                    o0,
21281+			                                    o0,
21282+			                                    r0,
21283+			                                    stbir__if_simdf8_cast_to_simdf4(
21284+			                                        c7));)
21285+
21286+			                                stbir__simdf_store(output, o0);
21287+			output += 4;
21288+			stbIF0(input0 += 4;) stbIF1(input1 += 4;) stbIF2(input2 += 4;)
21289+			    stbIF3(input3 += 4;) stbIF4(input4 += 4;) stbIF5(input5 += 4;)
21290+			        stbIF6(input6 += 4;) stbIF7(input7 += 4;)
21291+		}
21292+	}
21293+#else
21294+	                                STBIR_NO_UNROLL_LOOP_START while (
21295+	                                    ((char *)input0_end - (char *)input0) >=
21296+	                                    16)
21297+	{
21298+		float o0, o1, o2, o3;
21299+		STBIR_NO_UNROLL(output);
21300+#ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
21301+		stbIF0(
21302+		    o0 = output[0] + input0[0] * c0s; o1 = output[1] + input0[1] * c0s;
21303+		    o2 = output[2] + input0[2] * c0s; o3 = output[3] + input0[3] * c0s;)
21304+#else
21305+		stbIF0(o0 = input0[0] * c0s; o1 = input0[1] * c0s; o2 = input0[2] * c0s;
21306+		       o3 = input0[3] * c0s;)
21307+#endif
21308+		    stbIF1(o0 += input1[0] * c1s; o1 += input1[1] * c1s;
21309+		           o2 += input1[2] * c1s; o3 += input1[3] * c1s;)
21310+		        stbIF2(o0 += input2[0] * c2s; o1 += input2[1] * c2s;
21311+		               o2 += input2[2] * c2s;
21312+		               o3 += input2[3] * c2s;) stbIF3(o0 += input3[0] * c3s;
21313+		                                              o1 += input3[1] * c3s;
21314+		                                              o2 += input3[2] * c3s;
21315+		                                              o3 += input3[3] * c3s;)
21316+		            stbIF4(o0 += input4[0] * c4s; o1 += input4[1] * c4s;
21317+		                   o2 += input4[2] * c4s; o3 += input4[3] * c4s;)
21318+		                stbIF5(o0 += input5[0] * c5s; o1 += input5[1] * c5s;
21319+		                       o2 += input5[2] * c5s; o3 += input5[3] * c5s;)
21320+		                    stbIF6(o0 += input6[0] * c6s; o1 += input6[1] * c6s;
21321+		                           o2 += input6[2] * c6s;
21322+		                           o3 += input6[3] * c6s;)
21323+		                        stbIF7(o0 += input7[0] * c7s;
21324+		                               o1 += input7[1] * c7s;
21325+		                               o2 += input7[2] * c7s;
21326+		                               o3 += input7[3] * c7s;) output[0] = o0;
21327+		output[1] = o1;
21328+		output[2] = o2;
21329+		output[3] = o3;
21330+		output += 4;
21331+		stbIF0(input0 += 4;) stbIF1(input1 += 4;) stbIF2(input2 += 4;)
21332+		    stbIF3(input3 += 4;) stbIF4(input4 += 4;) stbIF5(input5 += 4;)
21333+		        stbIF6(input6 += 4;) stbIF7(input7 += 4;)
21334+	}
21335+#endif
21336+	STBIR_NO_UNROLL_LOOP_START
21337+	while (input0 < input0_end) {
21338+		float o0;
21339+		STBIR_NO_UNROLL(output);
21340+#ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
21341+		stbIF0(o0 = output[0] + input0[0] * c0s;)
21342+#else
21343+		stbIF0(o0 = input0[0] * c0s;)
21344+#endif
21345+		    stbIF1(o0 += input1[0] * c1s;) stbIF2(o0 += input2[0] * c2s;)
21346+		        stbIF3(o0 += input3[0] * c3s;) stbIF4(o0 += input4[0] * c4s;)
21347+		            stbIF5(o0 += input5[0] * c5s;)
21348+		                stbIF6(o0 += input6[0] * c6s;)
21349+		                    stbIF7(o0 += input7[0] * c7s;) output[0] = o0;
21350+		++output;
21351+		stbIF0(++input0;) stbIF1(++input1;) stbIF2(++input2;) stbIF3(++input3;)
21352+		    stbIF4(++input4;) stbIF5(++input5;) stbIF6(++input6;)
21353+		        stbIF7(++input7;)
21354+	}
21355 }
21356 
21357 #undef stbIF0
21358@@ -10251,30 +12734,31 @@ static void STBIR_chans( stbir__vertical_gather_with_,_coeffs)( float * outputp,
21359 
21360 #else // !STB_IMAGE_RESIZE_DO_VERTICALS
21361 
21362-#define STBIR_chans( start, end ) STBIR_strs_join1(start,STBIR__horizontal_channels,end)
21363+#define STBIR_chans(start, end)                                                \
21364+	STBIR_strs_join1(start, STBIR__horizontal_channels, end)
21365 
21366 #ifndef stbir__2_coeff_only
21367-#define stbir__2_coeff_only()             \
21368-    stbir__1_coeff_only();                \
21369-    stbir__1_coeff_remnant(1);
21370+#define stbir__2_coeff_only()                                                  \
21371+	stbir__1_coeff_only();                                                     \
21372+	stbir__1_coeff_remnant(1);
21373 #endif
21374 
21375 #ifndef stbir__2_coeff_remnant
21376-#define stbir__2_coeff_remnant( ofs )     \
21377-    stbir__1_coeff_remnant(ofs);          \
21378-    stbir__1_coeff_remnant((ofs)+1);
21379+#define stbir__2_coeff_remnant(ofs)                                            \
21380+	stbir__1_coeff_remnant(ofs);                                               \
21381+	stbir__1_coeff_remnant((ofs) + 1);
21382 #endif
21383 
21384 #ifndef stbir__3_coeff_only
21385-#define stbir__3_coeff_only()             \
21386-    stbir__2_coeff_only();                \
21387-    stbir__1_coeff_remnant(2);
21388+#define stbir__3_coeff_only()                                                  \
21389+	stbir__2_coeff_only();                                                     \
21390+	stbir__1_coeff_remnant(2);
21391 #endif
21392 
21393 #ifndef stbir__3_coeff_remnant
21394-#define stbir__3_coeff_remnant( ofs )     \
21395-    stbir__2_coeff_remnant(ofs);          \
21396-    stbir__1_coeff_remnant((ofs)+2);
21397+#define stbir__3_coeff_remnant(ofs)                                            \
21398+	stbir__2_coeff_remnant(ofs);                                               \
21399+	stbir__1_coeff_remnant((ofs) + 2);
21400 #endif
21401 
21402 #ifndef stbir__3_coeff_setup
21403@@ -10282,308 +12766,432 @@ static void STBIR_chans( stbir__vertical_gather_with_,_coeffs)( float * outputp,
21404 #endif
21405 
21406 #ifndef stbir__4_coeff_start
21407-#define stbir__4_coeff_start()            \
21408-    stbir__2_coeff_only();                \
21409-    stbir__2_coeff_remnant(2);
21410+#define stbir__4_coeff_start()                                                 \
21411+	stbir__2_coeff_only();                                                     \
21412+	stbir__2_coeff_remnant(2);
21413 #endif
21414 
21415 #ifndef stbir__4_coeff_continue_from_4
21416-#define stbir__4_coeff_continue_from_4( ofs )     \
21417-    stbir__2_coeff_remnant(ofs);                  \
21418-    stbir__2_coeff_remnant((ofs)+2);
21419+#define stbir__4_coeff_continue_from_4(ofs)                                    \
21420+	stbir__2_coeff_remnant(ofs);                                               \
21421+	stbir__2_coeff_remnant((ofs) + 2);
21422 #endif
21423 
21424 #ifndef stbir__store_output_tiny
21425 #define stbir__store_output_tiny stbir__store_output
21426 #endif
21427 
21428-static void STBIR_chans( stbir__horizontal_gather_,_channels_with_1_coeff)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
21429-{
21430-  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
21431-  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
21432-  STBIR_SIMD_NO_UNROLL_LOOP_START
21433-  do {
21434-    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
21435-    float const * hc = horizontal_coefficients;
21436-    stbir__1_coeff_only();
21437-    stbir__store_output_tiny();
21438-  } while ( output < output_end );
21439-}
21440-
21441-static void STBIR_chans( stbir__horizontal_gather_,_channels_with_2_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
21442-{
21443-  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
21444-  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
21445-  STBIR_SIMD_NO_UNROLL_LOOP_START
21446-  do {
21447-    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
21448-    float const * hc = horizontal_coefficients;
21449-    stbir__2_coeff_only();
21450-    stbir__store_output_tiny();
21451-  } while ( output < output_end );
21452-}
21453-
21454-static void STBIR_chans( stbir__horizontal_gather_,_channels_with_3_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
21455-{
21456-  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
21457-  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
21458-  STBIR_SIMD_NO_UNROLL_LOOP_START
21459-  do {
21460-    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
21461-    float const * hc = horizontal_coefficients;
21462-    stbir__3_coeff_only();
21463-    stbir__store_output_tiny();
21464-  } while ( output < output_end );
21465-}
21466-
21467-static void STBIR_chans( stbir__horizontal_gather_,_channels_with_4_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
21468-{
21469-  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
21470-  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
21471-  STBIR_SIMD_NO_UNROLL_LOOP_START
21472-  do {
21473-    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
21474-    float const * hc = horizontal_coefficients;
21475-    stbir__4_coeff_start();
21476-    stbir__store_output();
21477-  } while ( output < output_end );
21478-}
21479-
21480-static void STBIR_chans( stbir__horizontal_gather_,_channels_with_5_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
21481-{
21482-  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
21483-  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
21484-  STBIR_SIMD_NO_UNROLL_LOOP_START
21485-  do {
21486-    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
21487-    float const * hc = horizontal_coefficients;
21488-    stbir__4_coeff_start();
21489-    stbir__1_coeff_remnant(4);
21490-    stbir__store_output();
21491-  } while ( output < output_end );
21492-}
21493-
21494-static void STBIR_chans( stbir__horizontal_gather_,_channels_with_6_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
21495-{
21496-  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
21497-  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
21498-  STBIR_SIMD_NO_UNROLL_LOOP_START
21499-  do {
21500-    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
21501-    float const * hc = horizontal_coefficients;
21502-    stbir__4_coeff_start();
21503-    stbir__2_coeff_remnant(4);
21504-    stbir__store_output();
21505-  } while ( output < output_end );
21506-}
21507-
21508-static void STBIR_chans( stbir__horizontal_gather_,_channels_with_7_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
21509-{
21510-  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
21511-  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
21512-  stbir__3_coeff_setup();
21513-  STBIR_SIMD_NO_UNROLL_LOOP_START
21514-  do {
21515-    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
21516-    float const * hc = horizontal_coefficients;
21517-
21518-    stbir__4_coeff_start();
21519-    stbir__3_coeff_remnant(4);
21520-    stbir__store_output();
21521-  } while ( output < output_end );
21522-}
21523-
21524-static void STBIR_chans( stbir__horizontal_gather_,_channels_with_8_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
21525-{
21526-  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
21527-  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
21528-  STBIR_SIMD_NO_UNROLL_LOOP_START
21529-  do {
21530-    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
21531-    float const * hc = horizontal_coefficients;
21532-    stbir__4_coeff_start();
21533-    stbir__4_coeff_continue_from_4(4);
21534-    stbir__store_output();
21535-  } while ( output < output_end );
21536-}
21537-
21538-static void STBIR_chans( stbir__horizontal_gather_,_channels_with_9_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
21539-{
21540-  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
21541-  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
21542-  STBIR_SIMD_NO_UNROLL_LOOP_START
21543-  do {
21544-    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
21545-    float const * hc = horizontal_coefficients;
21546-    stbir__4_coeff_start();
21547-    stbir__4_coeff_continue_from_4(4);
21548-    stbir__1_coeff_remnant(8);
21549-    stbir__store_output();
21550-  } while ( output < output_end );
21551-}
21552-
21553-static void STBIR_chans( stbir__horizontal_gather_,_channels_with_10_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
21554-{
21555-  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
21556-  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
21557-  STBIR_SIMD_NO_UNROLL_LOOP_START
21558-  do {
21559-    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
21560-    float const * hc = horizontal_coefficients;
21561-    stbir__4_coeff_start();
21562-    stbir__4_coeff_continue_from_4(4);
21563-    stbir__2_coeff_remnant(8);
21564-    stbir__store_output();
21565-  } while ( output < output_end );
21566-}
21567-
21568-static void STBIR_chans( stbir__horizontal_gather_,_channels_with_11_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
21569-{
21570-  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
21571-  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
21572-  stbir__3_coeff_setup();
21573-  STBIR_SIMD_NO_UNROLL_LOOP_START
21574-  do {
21575-    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
21576-    float const * hc = horizontal_coefficients;
21577-    stbir__4_coeff_start();
21578-    stbir__4_coeff_continue_from_4(4);
21579-    stbir__3_coeff_remnant(8);
21580-    stbir__store_output();
21581-  } while ( output < output_end );
21582-}
21583-
21584-static void STBIR_chans( stbir__horizontal_gather_,_channels_with_12_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
21585-{
21586-  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
21587-  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
21588-  STBIR_SIMD_NO_UNROLL_LOOP_START
21589-  do {
21590-    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
21591-    float const * hc = horizontal_coefficients;
21592-    stbir__4_coeff_start();
21593-    stbir__4_coeff_continue_from_4(4);
21594-    stbir__4_coeff_continue_from_4(8);
21595-    stbir__store_output();
21596-  } while ( output < output_end );
21597-}
21598-
21599-static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod0 )( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
21600-{
21601-  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
21602-  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
21603-  STBIR_SIMD_NO_UNROLL_LOOP_START
21604-  do {
21605-    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
21606-    int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 4 + 3 ) >> 2;
21607-    float const * hc = horizontal_coefficients;
21608-
21609-    stbir__4_coeff_start();
21610-    STBIR_SIMD_NO_UNROLL_LOOP_START
21611-    do {
21612-      hc += 4;
21613-      decode += STBIR__horizontal_channels * 4;
21614-      stbir__4_coeff_continue_from_4( 0 );
21615-      --n;
21616-    } while ( n > 0 );
21617-    stbir__store_output();
21618-  } while ( output < output_end );
21619-}
21620-
21621-static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod1 )( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
21622-{
21623-  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
21624-  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
21625-  STBIR_SIMD_NO_UNROLL_LOOP_START
21626-  do {
21627-    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
21628-    int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 5 + 3 ) >> 2;
21629-    float const * hc = horizontal_coefficients;
21630-
21631-    stbir__4_coeff_start();
21632-    STBIR_SIMD_NO_UNROLL_LOOP_START
21633-    do {
21634-      hc += 4;
21635-      decode += STBIR__horizontal_channels * 4;
21636-      stbir__4_coeff_continue_from_4( 0 );
21637-      --n;
21638-    } while ( n > 0 );
21639-    stbir__1_coeff_remnant( 4 );
21640-    stbir__store_output();
21641-  } while ( output < output_end );
21642-}
21643-
21644-static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod2 )( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
21645-{
21646-  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
21647-  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
21648-  STBIR_SIMD_NO_UNROLL_LOOP_START
21649-  do {
21650-    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
21651-    int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 6 + 3 ) >> 2;
21652-    float const * hc = horizontal_coefficients;
21653-
21654-    stbir__4_coeff_start();
21655-    STBIR_SIMD_NO_UNROLL_LOOP_START
21656-    do {
21657-      hc += 4;
21658-      decode += STBIR__horizontal_channels * 4;
21659-      stbir__4_coeff_continue_from_4( 0 );
21660-      --n;
21661-    } while ( n > 0 );
21662-    stbir__2_coeff_remnant( 4 );
21663-
21664-    stbir__store_output();
21665-  } while ( output < output_end );
21666-}
21667-
21668-static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod3 )( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
21669-{
21670-  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
21671-  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
21672-  stbir__3_coeff_setup();
21673-  STBIR_SIMD_NO_UNROLL_LOOP_START
21674-  do {
21675-    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
21676-    int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 7 + 3 ) >> 2;
21677-    float const * hc = horizontal_coefficients;
21678-
21679-    stbir__4_coeff_start();
21680-    STBIR_SIMD_NO_UNROLL_LOOP_START
21681-    do {
21682-      hc += 4;
21683-      decode += STBIR__horizontal_channels * 4;
21684-      stbir__4_coeff_continue_from_4( 0 );
21685-      --n;
21686-    } while ( n > 0 );
21687-    stbir__3_coeff_remnant( 4 );
21688-
21689-    stbir__store_output();
21690-  } while ( output < output_end );
21691-}
21692-
21693-static stbir__horizontal_gather_channels_func * STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_funcs)[4]=
21694-{
21695-  STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod0),
21696-  STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod1),
21697-  STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod2),
21698-  STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod3),
21699+static void
21700+STBIR_chans(stbir__horizontal_gather_, _channels_with_1_coeff)(
21701+    float *output_buffer, unsigned int output_sub_size,
21702+    float const *decode_buffer,
21703+    stbir__contributors const *horizontal_contributors,
21704+    float const *horizontal_coefficients, int coefficient_width)
21705+{
21706+	float const *output_end =
21707+	    output_buffer + output_sub_size * STBIR__horizontal_channels;
21708+	float STBIR_SIMD_STREAMOUT_PTR(*) output = output_buffer;
21709+	STBIR_SIMD_NO_UNROLL_LOOP_START
21710+	do {
21711+		float const *decode = decode_buffer + horizontal_contributors->n0 *
21712+		                                          STBIR__horizontal_channels;
21713+		float const *hc = horizontal_coefficients;
21714+		stbir__1_coeff_only();
21715+		stbir__store_output_tiny();
21716+	} while (output < output_end);
21717+}
21718+
21719+static void
21720+STBIR_chans(stbir__horizontal_gather_, _channels_with_2_coeffs)(
21721+    float *output_buffer, unsigned int output_sub_size,
21722+    float const *decode_buffer,
21723+    stbir__contributors const *horizontal_contributors,
21724+    float const *horizontal_coefficients, int coefficient_width)
21725+{
21726+	float const *output_end =
21727+	    output_buffer + output_sub_size * STBIR__horizontal_channels;
21728+	float STBIR_SIMD_STREAMOUT_PTR(*) output = output_buffer;
21729+	STBIR_SIMD_NO_UNROLL_LOOP_START
21730+	do {
21731+		float const *decode = decode_buffer + horizontal_contributors->n0 *
21732+		                                          STBIR__horizontal_channels;
21733+		float const *hc = horizontal_coefficients;
21734+		stbir__2_coeff_only();
21735+		stbir__store_output_tiny();
21736+	} while (output < output_end);
21737+}
21738+
21739+static void
21740+STBIR_chans(stbir__horizontal_gather_, _channels_with_3_coeffs)(
21741+    float *output_buffer, unsigned int output_sub_size,
21742+    float const *decode_buffer,
21743+    stbir__contributors const *horizontal_contributors,
21744+    float const *horizontal_coefficients, int coefficient_width)
21745+{
21746+	float const *output_end =
21747+	    output_buffer + output_sub_size * STBIR__horizontal_channels;
21748+	float STBIR_SIMD_STREAMOUT_PTR(*) output = output_buffer;
21749+	STBIR_SIMD_NO_UNROLL_LOOP_START
21750+	do {
21751+		float const *decode = decode_buffer + horizontal_contributors->n0 *
21752+		                                          STBIR__horizontal_channels;
21753+		float const *hc = horizontal_coefficients;
21754+		stbir__3_coeff_only();
21755+		stbir__store_output_tiny();
21756+	} while (output < output_end);
21757+}
21758+
21759+static void
21760+STBIR_chans(stbir__horizontal_gather_, _channels_with_4_coeffs)(
21761+    float *output_buffer, unsigned int output_sub_size,
21762+    float const *decode_buffer,
21763+    stbir__contributors const *horizontal_contributors,
21764+    float const *horizontal_coefficients, int coefficient_width)
21765+{
21766+	float const *output_end =
21767+	    output_buffer + output_sub_size * STBIR__horizontal_channels;
21768+	float STBIR_SIMD_STREAMOUT_PTR(*) output = output_buffer;
21769+	STBIR_SIMD_NO_UNROLL_LOOP_START
21770+	do {
21771+		float const *decode = decode_buffer + horizontal_contributors->n0 *
21772+		                                          STBIR__horizontal_channels;
21773+		float const *hc = horizontal_coefficients;
21774+		stbir__4_coeff_start();
21775+		stbir__store_output();
21776+	} while (output < output_end);
21777+}
21778+
21779+static void
21780+STBIR_chans(stbir__horizontal_gather_, _channels_with_5_coeffs)(
21781+    float *output_buffer, unsigned int output_sub_size,
21782+    float const *decode_buffer,
21783+    stbir__contributors const *horizontal_contributors,
21784+    float const *horizontal_coefficients, int coefficient_width)
21785+{
21786+	float const *output_end =
21787+	    output_buffer + output_sub_size * STBIR__horizontal_channels;
21788+	float STBIR_SIMD_STREAMOUT_PTR(*) output = output_buffer;
21789+	STBIR_SIMD_NO_UNROLL_LOOP_START
21790+	do {
21791+		float const *decode = decode_buffer + horizontal_contributors->n0 *
21792+		                                          STBIR__horizontal_channels;
21793+		float const *hc = horizontal_coefficients;
21794+		stbir__4_coeff_start();
21795+		stbir__1_coeff_remnant(4);
21796+		stbir__store_output();
21797+	} while (output < output_end);
21798+}
21799+
21800+static void
21801+STBIR_chans(stbir__horizontal_gather_, _channels_with_6_coeffs)(
21802+    float *output_buffer, unsigned int output_sub_size,
21803+    float const *decode_buffer,
21804+    stbir__contributors const *horizontal_contributors,
21805+    float const *horizontal_coefficients, int coefficient_width)
21806+{
21807+	float const *output_end =
21808+	    output_buffer + output_sub_size * STBIR__horizontal_channels;
21809+	float STBIR_SIMD_STREAMOUT_PTR(*) output = output_buffer;
21810+	STBIR_SIMD_NO_UNROLL_LOOP_START
21811+	do {
21812+		float const *decode = decode_buffer + horizontal_contributors->n0 *
21813+		                                          STBIR__horizontal_channels;
21814+		float const *hc = horizontal_coefficients;
21815+		stbir__4_coeff_start();
21816+		stbir__2_coeff_remnant(4);
21817+		stbir__store_output();
21818+	} while (output < output_end);
21819+}
21820+
21821+static void
21822+STBIR_chans(stbir__horizontal_gather_, _channels_with_7_coeffs)(
21823+    float *output_buffer, unsigned int output_sub_size,
21824+    float const *decode_buffer,
21825+    stbir__contributors const *horizontal_contributors,
21826+    float const *horizontal_coefficients, int coefficient_width)
21827+{
21828+	float const *output_end =
21829+	    output_buffer + output_sub_size * STBIR__horizontal_channels;
21830+	float STBIR_SIMD_STREAMOUT_PTR(*) output = output_buffer;
21831+	stbir__3_coeff_setup();
21832+	STBIR_SIMD_NO_UNROLL_LOOP_START
21833+	do {
21834+		float const *decode = decode_buffer + horizontal_contributors->n0 *
21835+		                                          STBIR__horizontal_channels;
21836+		float const *hc = horizontal_coefficients;
21837+
21838+		stbir__4_coeff_start();
21839+		stbir__3_coeff_remnant(4);
21840+		stbir__store_output();
21841+	} while (output < output_end);
21842+}
21843+
21844+static void
21845+STBIR_chans(stbir__horizontal_gather_, _channels_with_8_coeffs)(
21846+    float *output_buffer, unsigned int output_sub_size,
21847+    float const *decode_buffer,
21848+    stbir__contributors const *horizontal_contributors,
21849+    float const *horizontal_coefficients, int coefficient_width)
21850+{
21851+	float const *output_end =
21852+	    output_buffer + output_sub_size * STBIR__horizontal_channels;
21853+	float STBIR_SIMD_STREAMOUT_PTR(*) output = output_buffer;
21854+	STBIR_SIMD_NO_UNROLL_LOOP_START
21855+	do {
21856+		float const *decode = decode_buffer + horizontal_contributors->n0 *
21857+		                                          STBIR__horizontal_channels;
21858+		float const *hc = horizontal_coefficients;
21859+		stbir__4_coeff_start();
21860+		stbir__4_coeff_continue_from_4(4);
21861+		stbir__store_output();
21862+	} while (output < output_end);
21863+}
21864+
21865+static void
21866+STBIR_chans(stbir__horizontal_gather_, _channels_with_9_coeffs)(
21867+    float *output_buffer, unsigned int output_sub_size,
21868+    float const *decode_buffer,
21869+    stbir__contributors const *horizontal_contributors,
21870+    float const *horizontal_coefficients, int coefficient_width)
21871+{
21872+	float const *output_end =
21873+	    output_buffer + output_sub_size * STBIR__horizontal_channels;
21874+	float STBIR_SIMD_STREAMOUT_PTR(*) output = output_buffer;
21875+	STBIR_SIMD_NO_UNROLL_LOOP_START
21876+	do {
21877+		float const *decode = decode_buffer + horizontal_contributors->n0 *
21878+		                                          STBIR__horizontal_channels;
21879+		float const *hc = horizontal_coefficients;
21880+		stbir__4_coeff_start();
21881+		stbir__4_coeff_continue_from_4(4);
21882+		stbir__1_coeff_remnant(8);
21883+		stbir__store_output();
21884+	} while (output < output_end);
21885+}
21886+
21887+static void
21888+STBIR_chans(stbir__horizontal_gather_, _channels_with_10_coeffs)(
21889+    float *output_buffer, unsigned int output_sub_size,
21890+    float const *decode_buffer,
21891+    stbir__contributors const *horizontal_contributors,
21892+    float const *horizontal_coefficients, int coefficient_width)
21893+{
21894+	float const *output_end =
21895+	    output_buffer + output_sub_size * STBIR__horizontal_channels;
21896+	float STBIR_SIMD_STREAMOUT_PTR(*) output = output_buffer;
21897+	STBIR_SIMD_NO_UNROLL_LOOP_START
21898+	do {
21899+		float const *decode = decode_buffer + horizontal_contributors->n0 *
21900+		                                          STBIR__horizontal_channels;
21901+		float const *hc = horizontal_coefficients;
21902+		stbir__4_coeff_start();
21903+		stbir__4_coeff_continue_from_4(4);
21904+		stbir__2_coeff_remnant(8);
21905+		stbir__store_output();
21906+	} while (output < output_end);
21907+}
21908+
21909+static void
21910+STBIR_chans(stbir__horizontal_gather_, _channels_with_11_coeffs)(
21911+    float *output_buffer, unsigned int output_sub_size,
21912+    float const *decode_buffer,
21913+    stbir__contributors const *horizontal_contributors,
21914+    float const *horizontal_coefficients, int coefficient_width)
21915+{
21916+	float const *output_end =
21917+	    output_buffer + output_sub_size * STBIR__horizontal_channels;
21918+	float STBIR_SIMD_STREAMOUT_PTR(*) output = output_buffer;
21919+	stbir__3_coeff_setup();
21920+	STBIR_SIMD_NO_UNROLL_LOOP_START
21921+	do {
21922+		float const *decode = decode_buffer + horizontal_contributors->n0 *
21923+		                                          STBIR__horizontal_channels;
21924+		float const *hc = horizontal_coefficients;
21925+		stbir__4_coeff_start();
21926+		stbir__4_coeff_continue_from_4(4);
21927+		stbir__3_coeff_remnant(8);
21928+		stbir__store_output();
21929+	} while (output < output_end);
21930+}
21931+
21932+static void
21933+STBIR_chans(stbir__horizontal_gather_, _channels_with_12_coeffs)(
21934+    float *output_buffer, unsigned int output_sub_size,
21935+    float const *decode_buffer,
21936+    stbir__contributors const *horizontal_contributors,
21937+    float const *horizontal_coefficients, int coefficient_width)
21938+{
21939+	float const *output_end =
21940+	    output_buffer + output_sub_size * STBIR__horizontal_channels;
21941+	float STBIR_SIMD_STREAMOUT_PTR(*) output = output_buffer;
21942+	STBIR_SIMD_NO_UNROLL_LOOP_START
21943+	do {
21944+		float const *decode = decode_buffer + horizontal_contributors->n0 *
21945+		                                          STBIR__horizontal_channels;
21946+		float const *hc = horizontal_coefficients;
21947+		stbir__4_coeff_start();
21948+		stbir__4_coeff_continue_from_4(4);
21949+		stbir__4_coeff_continue_from_4(8);
21950+		stbir__store_output();
21951+	} while (output < output_end);
21952+}
21953+
21954+static void
21955+STBIR_chans(stbir__horizontal_gather_, _channels_with_n_coeffs_mod0)(
21956+    float *output_buffer, unsigned int output_sub_size,
21957+    float const *decode_buffer,
21958+    stbir__contributors const *horizontal_contributors,
21959+    float const *horizontal_coefficients, int coefficient_width)
21960+{
21961+	float const *output_end =
21962+	    output_buffer + output_sub_size * STBIR__horizontal_channels;
21963+	float STBIR_SIMD_STREAMOUT_PTR(*) output = output_buffer;
21964+	STBIR_SIMD_NO_UNROLL_LOOP_START
21965+	do {
21966+		float const *decode = decode_buffer + horizontal_contributors->n0 *
21967+		                                          STBIR__horizontal_channels;
21968+		int n =
21969+		    ((horizontal_contributors->n1 - horizontal_contributors->n0 + 1) -
21970+		     4 + 3) >>
21971+		    2;
21972+		float const *hc = horizontal_coefficients;
21973+
21974+		stbir__4_coeff_start();
21975+		STBIR_SIMD_NO_UNROLL_LOOP_START
21976+		do {
21977+			hc += 4;
21978+			decode += STBIR__horizontal_channels * 4;
21979+			stbir__4_coeff_continue_from_4(0);
21980+			--n;
21981+		} while (n > 0);
21982+		stbir__store_output();
21983+	} while (output < output_end);
21984+}
21985+
21986+static void
21987+STBIR_chans(stbir__horizontal_gather_, _channels_with_n_coeffs_mod1)(
21988+    float *output_buffer, unsigned int output_sub_size,
21989+    float const *decode_buffer,
21990+    stbir__contributors const *horizontal_contributors,
21991+    float const *horizontal_coefficients, int coefficient_width)
21992+{
21993+	float const *output_end =
21994+	    output_buffer + output_sub_size * STBIR__horizontal_channels;
21995+	float STBIR_SIMD_STREAMOUT_PTR(*) output = output_buffer;
21996+	STBIR_SIMD_NO_UNROLL_LOOP_START
21997+	do {
21998+		float const *decode = decode_buffer + horizontal_contributors->n0 *
21999+		                                          STBIR__horizontal_channels;
22000+		int n =
22001+		    ((horizontal_contributors->n1 - horizontal_contributors->n0 + 1) -
22002+		     5 + 3) >>
22003+		    2;
22004+		float const *hc = horizontal_coefficients;
22005+
22006+		stbir__4_coeff_start();
22007+		STBIR_SIMD_NO_UNROLL_LOOP_START
22008+		do {
22009+			hc += 4;
22010+			decode += STBIR__horizontal_channels * 4;
22011+			stbir__4_coeff_continue_from_4(0);
22012+			--n;
22013+		} while (n > 0);
22014+		stbir__1_coeff_remnant(4);
22015+		stbir__store_output();
22016+	} while (output < output_end);
22017+}
22018+
22019+static void
22020+STBIR_chans(stbir__horizontal_gather_, _channels_with_n_coeffs_mod2)(
22021+    float *output_buffer, unsigned int output_sub_size,
22022+    float const *decode_buffer,
22023+    stbir__contributors const *horizontal_contributors,
22024+    float const *horizontal_coefficients, int coefficient_width)
22025+{
22026+	float const *output_end =
22027+	    output_buffer + output_sub_size * STBIR__horizontal_channels;
22028+	float STBIR_SIMD_STREAMOUT_PTR(*) output = output_buffer;
22029+	STBIR_SIMD_NO_UNROLL_LOOP_START
22030+	do {
22031+		float const *decode = decode_buffer + horizontal_contributors->n0 *
22032+		                                          STBIR__horizontal_channels;
22033+		int n =
22034+		    ((horizontal_contributors->n1 - horizontal_contributors->n0 + 1) -
22035+		     6 + 3) >>
22036+		    2;
22037+		float const *hc = horizontal_coefficients;
22038+
22039+		stbir__4_coeff_start();
22040+		STBIR_SIMD_NO_UNROLL_LOOP_START
22041+		do {
22042+			hc += 4;
22043+			decode += STBIR__horizontal_channels * 4;
22044+			stbir__4_coeff_continue_from_4(0);
22045+			--n;
22046+		} while (n > 0);
22047+		stbir__2_coeff_remnant(4);
22048+
22049+		stbir__store_output();
22050+	} while (output < output_end);
22051+}
22052+
22053+static void
22054+STBIR_chans(stbir__horizontal_gather_, _channels_with_n_coeffs_mod3)(
22055+    float *output_buffer, unsigned int output_sub_size,
22056+    float const *decode_buffer,
22057+    stbir__contributors const *horizontal_contributors,
22058+    float const *horizontal_coefficients, int coefficient_width)
22059+{
22060+	float const *output_end =
22061+	    output_buffer + output_sub_size * STBIR__horizontal_channels;
22062+	float STBIR_SIMD_STREAMOUT_PTR(*) output = output_buffer;
22063+	stbir__3_coeff_setup();
22064+	STBIR_SIMD_NO_UNROLL_LOOP_START
22065+	do {
22066+		float const *decode = decode_buffer + horizontal_contributors->n0 *
22067+		                                          STBIR__horizontal_channels;
22068+		int n =
22069+		    ((horizontal_contributors->n1 - horizontal_contributors->n0 + 1) -
22070+		     7 + 3) >>
22071+		    2;
22072+		float const *hc = horizontal_coefficients;
22073+
22074+		stbir__4_coeff_start();
22075+		STBIR_SIMD_NO_UNROLL_LOOP_START
22076+		do {
22077+			hc += 4;
22078+			decode += STBIR__horizontal_channels * 4;
22079+			stbir__4_coeff_continue_from_4(0);
22080+			--n;
22081+		} while (n > 0);
22082+		stbir__3_coeff_remnant(4);
22083+
22084+		stbir__store_output();
22085+	} while (output < output_end);
22086+}
22087+
22088+static stbir__horizontal_gather_channels_func *
22089+    STBIR_chans(stbir__horizontal_gather_, _channels_with_n_coeffs_funcs)[4] = {
22090+        STBIR_chans(stbir__horizontal_gather_, _channels_with_n_coeffs_mod0),
22091+        STBIR_chans(stbir__horizontal_gather_, _channels_with_n_coeffs_mod1),
22092+        STBIR_chans(stbir__horizontal_gather_, _channels_with_n_coeffs_mod2),
22093+        STBIR_chans(stbir__horizontal_gather_, _channels_with_n_coeffs_mod3),
22094 };
22095 
22096-static stbir__horizontal_gather_channels_func * STBIR_chans(stbir__horizontal_gather_,_channels_funcs)[12]=
22097-{
22098-  STBIR_chans(stbir__horizontal_gather_,_channels_with_1_coeff),
22099-  STBIR_chans(stbir__horizontal_gather_,_channels_with_2_coeffs),
22100-  STBIR_chans(stbir__horizontal_gather_,_channels_with_3_coeffs),
22101-  STBIR_chans(stbir__horizontal_gather_,_channels_with_4_coeffs),
22102-  STBIR_chans(stbir__horizontal_gather_,_channels_with_5_coeffs),
22103-  STBIR_chans(stbir__horizontal_gather_,_channels_with_6_coeffs),
22104-  STBIR_chans(stbir__horizontal_gather_,_channels_with_7_coeffs),
22105-  STBIR_chans(stbir__horizontal_gather_,_channels_with_8_coeffs),
22106-  STBIR_chans(stbir__horizontal_gather_,_channels_with_9_coeffs),
22107-  STBIR_chans(stbir__horizontal_gather_,_channels_with_10_coeffs),
22108-  STBIR_chans(stbir__horizontal_gather_,_channels_with_11_coeffs),
22109-  STBIR_chans(stbir__horizontal_gather_,_channels_with_12_coeffs),
22110+static stbir__horizontal_gather_channels_func *
22111+    STBIR_chans(stbir__horizontal_gather_, _channels_funcs)[12] = {
22112+        STBIR_chans(stbir__horizontal_gather_, _channels_with_1_coeff),
22113+        STBIR_chans(stbir__horizontal_gather_, _channels_with_2_coeffs),
22114+        STBIR_chans(stbir__horizontal_gather_, _channels_with_3_coeffs),
22115+        STBIR_chans(stbir__horizontal_gather_, _channels_with_4_coeffs),
22116+        STBIR_chans(stbir__horizontal_gather_, _channels_with_5_coeffs),
22117+        STBIR_chans(stbir__horizontal_gather_, _channels_with_6_coeffs),
22118+        STBIR_chans(stbir__horizontal_gather_, _channels_with_7_coeffs),
22119+        STBIR_chans(stbir__horizontal_gather_, _channels_with_8_coeffs),
22120+        STBIR_chans(stbir__horizontal_gather_, _channels_with_9_coeffs),
22121+        STBIR_chans(stbir__horizontal_gather_, _channels_with_10_coeffs),
22122+        STBIR_chans(stbir__horizontal_gather_, _channels_with_11_coeffs),
22123+        STBIR_chans(stbir__horizontal_gather_, _channels_with_12_coeffs),
22124 };
22125 
22126 #undef STBIR__horizontal_channels
22127@@ -10601,7 +13209,7 @@ static stbir__horizontal_gather_channels_func * STBIR_chans(stbir__horizontal_ga
22128 #undef stbir__store_output_tiny
22129 #undef STBIR_chans
22130 
22131-#endif  // HORIZONALS
22132+#endif // HORIZONALS
22133 
22134 #undef STBIR_strs_join2
22135 #undef STBIR_strs_join1