commit 2f6ca91
shrub
·
2025-12-20 20:40:26 +0000 UTC
parent 7c35d5c
this adds wallpaper scaling to the initial wallpaper implementation, so the wallpaper scales to screen size.
2 files changed,
+10681,
-1
+30,
-1
1@@ -5,11 +5,15 @@
2 #define STBI_NO_HDR
3 #include "../stb/stb_image.h"
4
5+#define STB_IMAGE_RESIZE_IMPLEMENTATION
6+#include "../stb/stb_image_resize2.h"
7+
8 #include "swc.h"
9 #include "internal.h"
10 #include "drm.h"
11 #include "util.h"
12 #include "shm.h"
13+#include "screen.h"
14
15 unsigned char *wallpaper = NULL;
16 struct wld_buffer *wallbuf = NULL;
17@@ -20,8 +24,33 @@ EXPORT void
18 swc_wallpaper_init(char* path)
19 {
20 int width, height, chan;
21+ unsigned char *loaded;
22+ struct screen *screen;
23+ int target_width = 0, target_height = 0;
24+
25+ loaded = stbi_load(path, &width, &height, &chan, 4);
26+ if (!loaded)
27+ return;
28
29- wallpaper = stbi_load(path, &width, &height, &chan, 4);
30+ /* get screen dimensions */
31+ wl_list_for_each(screen, &swc.screens, link) {
32+ target_width = screen->base.geometry.width;
33+ target_height = screen->base.geometry.height;
34+ break;
35+ }
36+
37+ /* If we have a screen and the image dimensions differ from it, scale the image */
38+ if (target_width > 0 && target_height > 0 &&
39+ (width != target_width || height != target_height)) {
40+ wallpaper = stbir_resize_uint8_srgb(loaded, width, height, 0,
41+ NULL, target_width, target_height, 0,
42+ STBIR_RGBA);
43+ stbi_image_free(loaded);
44+ width = target_width;
45+ height = target_height;
46+ } else {
47+ wallpaper = loaded;
48+ }
49
50 /* swap color channels to be compatible */
51 for(int i = 0; i < width * height; i++) {
+10651,
-0
1@@ -0,0 +1,10651 @@
2+/* stb_image_resize2 - v2.17 - public domain image resizing
3+
4+ by Jeff Roberts (v2) and Jorge L Rodriguez
5+ http://github.com/nothings/stb
6+
7+ Can be threaded with the extended API. SSE2, AVX, Neon and WASM SIMD support. Only
8+ scaling and translation is supported, no rotations or shears.
9+
10+ COMPILING & LINKING
11+ In one C/C++ file that #includes this file, do this:
12+ #define STB_IMAGE_RESIZE_IMPLEMENTATION
13+ before the #include. That will create the implementation in that file.
14+
15+ EASY API CALLS:
16+ Easy API downsamples w/Mitchell filter, upsamples w/cubic interpolation, clamps to edge.
17+
18+ stbir_resize_uint8_srgb( input_pixels, input_w, input_h, input_stride_in_bytes,
19+ output_pixels, output_w, output_h, output_stride_in_bytes,
20+ pixel_layout_enum )
21+
22+ stbir_resize_uint8_linear( input_pixels, input_w, input_h, input_stride_in_bytes,
23+ output_pixels, output_w, output_h, output_stride_in_bytes,
24+ pixel_layout_enum )
25+
26+ stbir_resize_float_linear( input_pixels, input_w, input_h, input_stride_in_bytes,
27+ output_pixels, output_w, output_h, output_stride_in_bytes,
28+ pixel_layout_enum )
29+
30+ If you pass NULL or zero for the output_pixels, we will allocate the output buffer
31+ for you and return it from the function (free with free() or STBIR_FREE).
32+ As a special case, XX_stride_in_bytes of 0 means packed continuously in memory.
33+
34+ API LEVELS
35+ There are three levels of API - easy-to-use, medium-complexity and extended-complexity.
36+
37+ See the "header file" section of the source for API documentation.
38+
39+ ADDITIONAL DOCUMENTATION
40+
41+ MEMORY ALLOCATION
42+ By default, we use malloc and free for memory allocation. To override the
43+ memory allocation, before the implementation #include, add a:
44+
45+ #define STBIR_MALLOC(size,user_data) ...
46+ #define STBIR_FREE(ptr,user_data) ...
47+
48+ Each resize makes exactly one call to malloc/free (unless you use the
49+ extended API where you can do one allocation for many resizes). Under
50+ address sanitizer, we do separate allocations to find overread/writes.
51+
52+ PERFORMANCE
53+ This library was written with an emphasis on performance. When testing
54+ stb_image_resize with RGBA, the fastest mode is STBIR_4CHANNEL with
55+ STBIR_TYPE_UINT8 pixels and CLAMPed edges (which is what many other resize
56+ libs do by default). Also, make sure SIMD is turned on of course (default
57+ for 64-bit targets). Avoid WRAP edge mode if you want the fastest speed.
58+
59+ This library also comes with profiling built-in. If you define STBIR_PROFILE,
60+ you can use the advanced API and get low-level profiling information by
61+ calling stbir_resize_extended_profile_info() or stbir_resize_split_profile_info()
62+ after a resize.
63+
64+ SIMD
65+ Most of the routines have optimized SSE2, AVX, NEON and WASM versions.
66+
67+ On Microsoft compilers, we automatically turn on SIMD for 64-bit x64 and
68+ ARM; for 32-bit x86 and ARM, you select SIMD mode by defining STBIR_SSE2 or
69+ STBIR_NEON. For AVX and AVX2, we auto-select it by detecting the /arch:AVX
70+ or /arch:AVX2 switches. You can also always manually turn SSE2, AVX or AVX2
71+ support on by defining STBIR_SSE2, STBIR_AVX or STBIR_AVX2.
72+
73+ On Linux, SSE2 and Neon are on by default for 64-bit x64 or ARM64. For 32-bit,
74+ we select x86 SIMD mode by whether you have -msse2, -mavx or -mavx2 enabled
75+ on the command line. For 32-bit ARM, you must pass -mfpu=neon-vfpv4 for both
76+ clang and GCC, but GCC also requires an additional -mfp16-format=ieee to
77+ automatically enable NEON.
78+
79+ On x86 platforms, you can also define STBIR_FP16C to turn on FP16C instructions
80+ for converting back and forth to half-floats. This is autoselected when we
81+ are using AVX2. Clang and GCC also require the -mf16c switch. ARM always uses
82+ the built-in half float hardware NEON instructions.
83+
84+ You can also tell us to use multiply-add instructions with STBIR_USE_FMA.
85+ Because x86 doesn't always have fma, we turn it off by default to maintain
86+ determinism across all platforms. If you don't care about non-FMA determinism
87+ and are willing to restrict yourself to more recent x86 CPUs (around the AVX
88+ timeframe), then fma will give you around a 15% speedup.
89+
90+ You can force off SIMD in all cases by defining STBIR_NO_SIMD. You can turn
91+ off AVX or AVX2 specifically with STBIR_NO_AVX or STBIR_NO_AVX2. AVX is 10%
92+ to 40% faster, and AVX2 is generally another 12%.
93+
94+ ALPHA CHANNEL
95+ Most of the resizing functions provide the ability to control how the alpha
96+ channel of an image is processed.
97+
98+ When alpha represents transparency, it is important that when combining
99+ colors with filtering, the pixels should not be treated equally; they
100+ should use a weighted average based on their alpha values. For example,
101+ if a pixel is 1% opaque bright green and another pixel is 99% opaque
102+ black and you average them, the average will be 50% opaque, but the
103+ unweighted average will be a middling green color, while the weighted
104+ average will be nearly black. This means the unweighted version introduced
105+ green energy that didn't exist in the source image.
106+
107+ (If you want to know why this makes sense, you can work out the math for
108+ the following: consider what happens if you alpha composite a source image
109+ over a fixed color and then average the output, vs. if you average the
110+ source image pixels and then composite that over the same fixed color.
111+ Only the weighted average produces the same result as the ground truth
112+ composite-then-average result.)
113+
114+ Therefore, it is in general best to "alpha weight" the pixels when applying
115+ filters to them. This essentially means multiplying the colors by the alpha
116+ values before combining them, and then dividing by the alpha value at the
117+ end.
118+
119+ The computer graphics industry introduced a technique called "premultiplied
120+ alpha" or "associated alpha" in which image colors are stored in image files
121+ already multiplied by their alpha. This saves some math when compositing,
122+ and also avoids the need to divide by the alpha at the end (which is quite
123+ inefficient). However, while premultiplied alpha is common in the movie CGI
124+ industry, it is not commonplace in other industries like videogames, and most
125+ consumer file formats are generally expected to contain not-premultiplied
126+ colors. For example, Photoshop saves PNG files "unpremultiplied", and web
127+ browsers like Chrome and Firefox expect PNG images to be unpremultiplied.
128+
129+ Note that there are three possibilities that might describe your image
130+ and resize expectation:
131+
132+ 1. images are not premultiplied, alpha weighting is desired
133+ 2. images are not premultiplied, alpha weighting is not desired
134+ 3. images are premultiplied
135+
136+ Both case #2 and case #3 require the exact same math: no alpha weighting
137+ should be applied or removed. Only case 1 requires extra math operations;
138+ the other two cases can be handled identically.
139+
140+ stb_image_resize expects case #1 by default, applying alpha weighting to
141+ images, expecting the input images to be unpremultiplied. This is what the
142+ COLOR+ALPHA buffer types tell the resizer to do.
143+
144+ When you use the pixel layouts STBIR_RGBA, STBIR_BGRA, STBIR_ARGB,
145+ STBIR_ABGR, STBIR_RA, or STBIR_AR you are telling us that the pixels are
146+ non-premultiplied. In these cases, the resizer will alpha weight the colors
147+ (effectively creating the premultiplied image), do the filtering, and then
148+ convert back to non-premult on exit.
149+
150+ When you use the pixel layouts STBIR_RGBA_PM, STBIR_BGRA_PM, STBIR_ARGB_PM,
151+ STBIR_ABGR_PM, STBIR_RA_PM or STBIR_AR_PM, you are telling us that the pixels
152+ ARE premultiplied. In this case, the resizer doesn't have to do the
153+ premultiplying - it can filter directly on the input. This is about twice as
154+ fast as the non-premultiplied case, so it's the right option if your data is
155+ already setup correctly.
156+
157+ When you use the pixel layout STBIR_4CHANNEL or STBIR_2CHANNEL, you are
158+ telling us that there is no channel that represents transparency; it may be
159+ RGB and some unrelated fourth channel that has been stored in the alpha
160+ channel, but it is actually not alpha. No special processing will be
161+ performed.
162+
163+ The difference between the generic 4 or 2 channel layouts, and the
164+ specialized _PM versions is with the _PM versions you are telling us that
165+ the data *is* alpha, just don't premultiply it. That's important when
166+ using SRGB pixel formats, we need to know where the alpha is, because
167+ it is converted linearly (rather than with the SRGB converters).
168+
169+ Because alpha weighting produces the same effect as premultiplying, you
170+ even have the option with non-premultiplied inputs to let the resizer
171+ produce a premultiplied output. Because the initially computed alpha-weighted
172+ output image is effectively premultiplied, this is actually more performant
173+ than the normal path which un-premultiplies the output image as a final step.
174+
175+ Finally, when converting both in and out of non-premultiplied space (for
176+ example, when using STBIR_RGBA), we go to somewhat heroic measures to
177+ ensure that areas with zero alpha value pixels get something reasonable
178+ in the RGB values. If you don't care about the RGB values of zero alpha
179+ pixels, you can call the stbir_set_non_pm_alpha_speed_over_quality()
180+ function - this runs a premultiplied resize about 25% faster. That said,
181+ when you really care about speed, using premultiplied pixels for both in
182+ and out (STBIR_RGBA_PM, etc) is much faster than both of these premultiplied
183+ options.
184+
185+ PIXEL LAYOUT CONVERSION
186+ The resizer can convert from some pixel layouts to others. When using the
187+ stbir_set_pixel_layouts(), you can, for example, specify STBIR_RGBA
188+ on input, and STBIR_ARGB on output, and it will re-organize the channels
189+ during the resize. Currently, you can only convert between two pixel
190+ layouts with the same number of channels.
191+
192+ DETERMINISM
193+ We commit to being deterministic (from x64 to ARM to scalar to SIMD, etc).
194+ This requires compiling with fast-math off (using at least /fp:precise).
195+ Also, you must turn off fp-contracting (which turns mult+adds into fmas)!
196+ We attempt to do this with pragmas, but with Clang, you usually want to add
197+ -ffp-contract=off to the command line as well.
198+
199+ For 32-bit x86, you must use SSE and SSE2 codegen for determinism. That is,
200+ if the scalar x87 unit gets used at all, we immediately lose determinism.
201+ On Microsoft Visual Studio 2008 and earlier, from what we can tell there is
202+ no way to be deterministic in 32-bit x86 (some x87 always leaks in, even
203+ with fp:strict). On 32-bit x86 GCC, determinism requires both -msse2 and
204+ -mfpmath=sse.
205+
206+ Note that we will not be deterministic with float data containing NaNs -
207+ the NaNs will propagate differently on different SIMD and platforms.
208+
209+ If you turn on STBIR_USE_FMA, then we will be deterministic with other
210+ fma targets, but we will differ from non-fma targets (this is unavoidable,
211+ because a fma isn't simply an add with a mult - it also introduces a
212+ rounding difference compared to non-fma instruction sequences).
213+
214+ FLOAT PIXEL FORMAT RANGE
215+ Any range of values can be used for the non-alpha float data that you pass
216+ in (0 to 1, -1 to 1, whatever). However, if you are inputting float values
217+ but *outputting* bytes or shorts, you must use a range of 0 to 1 so that we
218+ scale back properly. The alpha channel must also be 0 to 1 for any format
219+ that does premultiplication prior to resizing.
220+
221+ Note also that with float output, using filters with negative lobes, the
222+ output filtered values might go slightly out of range. You can define
223+ STBIR_FLOAT_LOW_CLAMP and/or STBIR_FLOAT_HIGH_CLAMP to specify the range
224+ to clamp to on output, if that's important.
225+
226+ MAX/MIN SCALE FACTORS
227+ The input pixel resolutions are in integers, and we do the internal pointer
228+ resolution in size_t sized integers. However, the scale ratio from input
229+ resolution to output resolution is calculated in float form. This means
230+ the effective possible scale ratio is limited to 24 bits (or 16 million
231+ to 1). As you get close to the size of the float resolution (again, 16
232+ million pixels wide or high), you might start seeing float inaccuracy
233+ issues in general in the pipeline. If you have to do extreme resizes,
234+ you can usually do this in multiple stages (using float intermediate
235+ buffers).
236+
237+ FLIPPED IMAGES
238+ Stride is just the delta from one scanline to the next. This means you can
239+ use a negative stride to handle inverted images (point to the final
240+ scanline and use a negative stride). You can invert the input or output,
241+ using negative strides.
242+
243+ DEFAULT FILTERS
244+ For functions which don't provide explicit control over what filters to
245+ use, you can change the compile-time defaults with:
246+
247+ #define STBIR_DEFAULT_FILTER_UPSAMPLE STBIR_FILTER_something
248+ #define STBIR_DEFAULT_FILTER_DOWNSAMPLE STBIR_FILTER_something
249+
250+ See stbir_filter in the header-file section for the list of filters.
251+
252+ NEW FILTERS
253+ A number of 1D filter kernels are supplied. For a list of supported
254+ filters, see the stbir_filter enum. You can install your own filters by
255+ using the stbir_set_filter_callbacks function.
256+
257+ PROGRESS
258+ For interactive use with slow resize operations, you can use the
259+ scanline callbacks in the extended API. It would have to be a *very* large
260+ image resample to need progress though - we're very fast.
261+
262+ CEIL and FLOOR
263+ In scalar mode, the only functions we use from math.h are ceilf and floorf,
264+ but if you have your own versions, you can define the STBIR_CEILF(v) and
265+ STBIR_FLOORF(v) macros and we'll use them instead. In SIMD, we just use
266+ our own versions.
267+
268+ ASSERT
269+ Define STBIR_ASSERT(boolval) to override assert() and not use assert.h
270+
271+ PORTING FROM VERSION 1
272+ The API has changed. You can continue to use the old version of stb_image_resize.h,
273+ which is available in the "deprecated/" directory.
274+
275+ If you're using the old simple-to-use API, porting is straightforward.
276+ (For more advanced APIs, read the documentation.)
277+
278+ stbir_resize_uint8():
279+ - call `stbir_resize_uint8_linear`, cast channel count to `stbir_pixel_layout`
280+
281+ stbir_resize_float():
282+ - call `stbir_resize_float_linear`, cast channel count to `stbir_pixel_layout`
283+
284+ stbir_resize_uint8_srgb():
285+ - function name is unchanged
286+ - cast channel count to `stbir_pixel_layout`
287+ - above is sufficient unless your image has alpha and it's not RGBA/BGRA
288+ - in that case, follow the below instructions for stbir_resize_uint8_srgb_edgemode
289+
290+ stbir_resize_uint8_srgb_edgemode()
291+ - switch to the "medium complexity" API
292+ - stbir_resize(), very similar API but a few more parameters:
293+ - pixel_layout: cast channel count to `stbir_pixel_layout`
294+ - data_type: STBIR_TYPE_UINT8_SRGB
295+ - edge: unchanged (STBIR_EDGE_WRAP, etc.)
296+ - filter: STBIR_FILTER_DEFAULT
297+ - which channel is alpha is specified in stbir_pixel_layout, see enum for details
298+
299+ FUTURE TODOS
300+ * For polyphase integral filters, we just memcpy the coeffs to dupe
301+ them, but we should indirect and use the same coeff memory.
302+ * Add pixel layout conversions for sensible different channel counts
303+ (maybe, 1->3/4, 3->4, 4->1, 3->1).
304+ * For SIMD encode and decode scanline routines, do any pre-aligning
305+ for bad input/output buffer alignments and pitch?
306+ * For very wide scanlines, we should do vertical strips to stay within
307+ L2 cache. Maybe do chunks of 1K pixels at a time. There would be
308+ some pixel reconversion, but probably dwarfed by things falling out
309+ of cache. Probably also something possible with alternating between
310+ scattering and gathering at high resize scales?
311+ * Should we have a multiple MIPs at the same time function (could keep
312+ more memory in cache during multiple resizes)?
313+ * Rewrite the coefficient generator to do many at once.
314+ * AVX-512 vertical kernels - worried about downclocking here.
315+ * Convert the reincludes to macros when we know they aren't changing.
316+ * Experiment with pivoting the horizontal and always using the
317+ vertical filters (which are faster, but perhaps not enough to overcome
318+ the pivot cost and the extra memory touches). Need to buffer the whole
319+ image so have to balance memory use.
320+ * Most of our code is internally function pointers, should we compile
321+ all the SIMD stuff always and dynamically dispatch?
322+
323+ CONTRIBUTORS
324+ Jeff Roberts: 2.0 implementation, optimizations, SIMD
325+ Martins Mozeiko: NEON simd, WASM simd, clang and GCC whisperer
326+ Fabian Giesen: half float and srgb converters
327+ Sean Barrett: API design, optimizations
328+ Jorge L Rodriguez: Original 1.0 implementation
329+ Aras Pranckevicius: bugfixes
330+ Nathan Reed: warning fixes for 1.0
331+
332+ REVISIONS
333+ 2.17 (2025-10-25) silly format bug in easy-to-use APIs.
334+ 2.16 (2025-10-21) fixed the easy-to-use APIs to allow inverted bitmaps (negative
335+ strides), fix vertical filter kernel callback, fix threaded
336+ gather buffer priming (and assert).
337+ (thanks adipose, TainZerL, and Harrison Green)
338+ 2.15 (2025-07-17) fixed an assert in debug mode when using floats with input
339+ callbacks, work around GCC warning when adding to null ptr
340+ (thanks Johannes Spohr and Pyry Kovanen).
341+ 2.14 (2025-05-09) fixed a bug using downsampling gather horizontal first, and
342+ scatter with vertical first.
343+ 2.13 (2025-02-27) fixed a bug when using input callbacks, turned off simd for
344+ tiny-c, fixed some variables that should have been static,
345+ fixes a bug when calculating temp memory with resizes that
346+ exceed 2GB of temp memory (very large resizes).
347+ 2.12 (2024-10-18) fix incorrect use of user_data with STBIR_FREE
348+ 2.11 (2024-09-08) fix harmless asan warnings in 2-channel and 3-channel mode
349+ with AVX-2, fix some weird scaling edge conditions with
350+ point sample mode.
351+ 2.10 (2024-07-27) fix the defines GCC and mingw for loop unroll control,
352+ fix MSVC 32-bit arm half float routines.
353+ 2.09 (2024-06-19) fix the defines for 32-bit ARM GCC builds (was selecting
354+ hardware half floats).
355+ 2.08 (2024-06-10) fix for RGB->BGR three channel flips and add SIMD (thanks
356+ to Ryan Salsbury), fix for sub-rect resizes, use the
357+ pragmas to control unrolling when they are available.
358+ 2.07 (2024-05-24) fix for slow final split during threaded conversions of very
359+ wide scanlines when downsampling (caused by extra input
360+ converting), fix for wide scanline resamples with many
361+ splits (int overflow), fix GCC warning.
362+ 2.06 (2024-02-10) fix for identical width/height 3x or more down-scaling
363+ undersampling a single row on rare resize ratios (about 1%).
364+ 2.05 (2024-02-07) fix for 2 pixel to 1 pixel resizes with wrap (thanks Aras),
365+ fix for output callback (thanks Julien Koenen).
366+ 2.04 (2023-11-17) fix for rare AVX bug, shadowed symbol (thanks Nikola Smiljanic).
367+ 2.03 (2023-11-01) ASAN and TSAN warnings fixed, minor tweaks.
368+ 2.00 (2023-10-10) mostly new source: new api, optimizations, simd, vertical-first, etc
369+ 2x-5x faster without simd, 4x-12x faster with simd,
370+ in some cases, 20x to 40x faster esp resizing large to very small.
371+ 0.96 (2019-03-04) fixed warnings
372+ 0.95 (2017-07-23) fixed warnings
373+ 0.94 (2017-03-18) fixed warnings
374+ 0.93 (2017-03-03) fixed bug with certain combinations of heights
375+ 0.92 (2017-01-02) fix integer overflow on large (>2GB) images
376+ 0.91 (2016-04-02) fix warnings; fix handling of subpixel regions
377+ 0.90 (2014-09-17) first released version
378+
379+ LICENSE
380+ See end of file for license information.
381+*/
382+
383+#if !defined(STB_IMAGE_RESIZE_DO_HORIZONTALS) && !defined(STB_IMAGE_RESIZE_DO_VERTICALS) && !defined(STB_IMAGE_RESIZE_DO_CODERS) // for internal re-includes
384+
385+#ifndef STBIR_INCLUDE_STB_IMAGE_RESIZE2_H
386+#define STBIR_INCLUDE_STB_IMAGE_RESIZE2_H
387+
388+#include <stddef.h>
389+#ifdef _MSC_VER
390+typedef unsigned char stbir_uint8;
391+typedef unsigned short stbir_uint16;
392+typedef unsigned int stbir_uint32;
393+typedef unsigned __int64 stbir_uint64;
394+#else
395+#include <stdint.h>
396+typedef uint8_t stbir_uint8;
397+typedef uint16_t stbir_uint16;
398+typedef uint32_t stbir_uint32;
399+typedef uint64_t stbir_uint64;
400+#endif
401+
402+#ifndef STBIRDEF
403+#ifdef STB_IMAGE_RESIZE_STATIC
404+#define STBIRDEF static
405+#else
406+#ifdef __cplusplus
407+#define STBIRDEF extern "C"
408+#else
409+#define STBIRDEF extern
410+#endif
411+#endif
412+#endif
413+
414+//////////////////////////////////////////////////////////////////////////////
415+//// start "header file" ///////////////////////////////////////////////////
416+//
417+// Easy-to-use API:
418+//
419+// * stride is the offset between successive rows of image data
420+// in memory, in bytes. specify 0 for packed continuously in memory
421+// * colorspace is linear or sRGB as specified by function name
422+// * Uses the default filters
423+// * Uses edge mode clamped
424+// * returned result is the output pixel pointer on success, or 0 (NULL) in case of an error.
425+
426+
427+// stbir_pixel_layout specifies:
428+// number of channels
429+// order of channels
430+// whether color is premultiplied by alpha
431+// for back compatibility, you can cast the old channel count to an stbir_pixel_layout
432+typedef enum
433+{
434+ STBIR_1CHANNEL = 1,
435+ STBIR_2CHANNEL = 2,
436+ STBIR_RGB = 3, // 3-chan, with order specified (for channel flipping)
437+ STBIR_BGR = 0, // 3-chan, with order specified (for channel flipping)
438+ STBIR_4CHANNEL = 5,
439+
440+ STBIR_RGBA = 4, // alpha formats, where alpha is NOT premultiplied into color channels
441+ STBIR_BGRA = 6,
442+ STBIR_ARGB = 7,
443+ STBIR_ABGR = 8,
444+ STBIR_RA = 9,
445+ STBIR_AR = 10,
446+
447+ STBIR_RGBA_PM = 11, // alpha formats, where alpha is premultiplied into color channels
448+ STBIR_BGRA_PM = 12,
449+ STBIR_ARGB_PM = 13,
450+ STBIR_ABGR_PM = 14,
451+ STBIR_RA_PM = 15,
452+ STBIR_AR_PM = 16,
453+
454+ STBIR_RGBA_NO_AW = 11, // alpha formats, where NO alpha weighting is applied at all!
455+ STBIR_BGRA_NO_AW = 12, // these are just synonyms for the _PM flags (which also do
456+ STBIR_ARGB_NO_AW = 13, // no alpha weighting). These names just make it more clear
457+ STBIR_ABGR_NO_AW = 14, // for some folks).
458+ STBIR_RA_NO_AW = 15,
459+ STBIR_AR_NO_AW = 16,
460+
461+} stbir_pixel_layout;
462+
463+//===============================================================
464+// Simple-complexity API
465+//
466+// If output_pixels is NULL (0), then we will allocate the buffer and return it to you.
467+//--------------------------------
468+
469+STBIRDEF unsigned char * stbir_resize_uint8_srgb( const unsigned char *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
470+ unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
471+ stbir_pixel_layout pixel_type );
472+
473+STBIRDEF unsigned char * stbir_resize_uint8_linear( const unsigned char *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
474+ unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
475+ stbir_pixel_layout pixel_type );
476+
477+STBIRDEF float * stbir_resize_float_linear( const float *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
478+ float *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
479+ stbir_pixel_layout pixel_type );
480+//===============================================================
481+
482+//===============================================================
483+// Medium-complexity API
484+//
485+// This extends the easy-to-use API as follows:
486+//
487+// * Can specify the datatype - U8, U8_SRGB, U16, FLOAT, HALF_FLOAT
488+// * Edge wrap can be selected explicitly
489+// * Filter can be selected explicitly
490+//--------------------------------
491+
492+typedef enum
493+{
494+ STBIR_EDGE_CLAMP = 0,
495+ STBIR_EDGE_REFLECT = 1,
496+ STBIR_EDGE_WRAP = 2, // this edge mode is slower and uses more memory
497+ STBIR_EDGE_ZERO = 3,
498+} stbir_edge;
499+
500+typedef enum
501+{
502+ STBIR_FILTER_DEFAULT = 0, // use same filter type that easy-to-use API chooses
503+ STBIR_FILTER_BOX = 1, // A trapezoid w/1-pixel wide ramps, same result as box for integer scale ratios
504+ STBIR_FILTER_TRIANGLE = 2, // On upsampling, produces same results as bilinear texture filtering
505+ STBIR_FILTER_CUBICBSPLINE = 3, // The cubic b-spline (aka Mitchell-Netrevalli with B=1,C=0), gaussian-esque
506+ STBIR_FILTER_CATMULLROM = 4, // An interpolating cubic spline
507+ STBIR_FILTER_MITCHELL = 5, // Mitchell-Netrevalli filter with B=1/3, C=1/3
508+ STBIR_FILTER_POINT_SAMPLE = 6, // Simple point sampling
509+ STBIR_FILTER_OTHER = 7, // User callback specified
510+} stbir_filter;
511+
512+typedef enum
513+{
514+ STBIR_TYPE_UINT8 = 0,
515+ STBIR_TYPE_UINT8_SRGB = 1,
516+ STBIR_TYPE_UINT8_SRGB_ALPHA = 2, // alpha channel, when present, should also be SRGB (this is very unusual)
517+ STBIR_TYPE_UINT16 = 3,
518+ STBIR_TYPE_FLOAT = 4,
519+ STBIR_TYPE_HALF_FLOAT = 5
520+} stbir_datatype;
521+
522+// medium api
523+STBIRDEF void * stbir_resize( const void *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
524+ void *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
525+ stbir_pixel_layout pixel_layout, stbir_datatype data_type,
526+ stbir_edge edge, stbir_filter filter );
527+//===============================================================
528+
529+
530+
531+//===============================================================
532+// Extended-complexity API
533+//
534+// This API exposes all resize functionality.
535+//
536+// * Separate filter types for each axis
537+// * Separate edge modes for each axis
538+// * Separate input and output data types
539+// * Can specify regions with subpixel correctness
540+// * Can specify alpha flags
541+// * Can specify a memory callback
542+// * Can specify a callback data type for pixel input and output
543+// * Can be threaded for a single resize
544+// * Can be used to resize many frames without recalculating the sampler info
545+//
546+// Use this API as follows:
547+// 1) Call the stbir_resize_init function on a local STBIR_RESIZE structure
548+// 2) Call any of the stbir_set functions
549+// 3) Optionally call stbir_build_samplers() if you are going to resample multiple times
550+// with the same input and output dimensions (like resizing video frames)
551+// 4) Resample by calling stbir_resize_extended().
552+// 5) Call stbir_free_samplers() if you called stbir_build_samplers()
553+//--------------------------------
554+
555+
556+// Types:
557+
558+// INPUT CALLBACK: this callback is used for input scanlines
559+typedef void const * stbir_input_callback( void * optional_output, void const * input_ptr, int num_pixels, int x, int y, void * context );
560+
561+// OUTPUT CALLBACK: this callback is used for output scanlines
562+typedef void stbir_output_callback( void const * output_ptr, int num_pixels, int y, void * context );
563+
564+// callbacks for user installed filters
565+typedef float stbir__kernel_callback( float x, float scale, void * user_data ); // centered at zero
566+typedef float stbir__support_callback( float scale, void * user_data );
567+
568+// internal structure with precomputed scaling
569+typedef struct stbir__info stbir__info;
570+
571+typedef struct STBIR_RESIZE // use the stbir_resize_init and stbir_override functions to set these values for future compatibility
572+{
573+ void * user_data;
574+ void const * input_pixels;
575+ int input_w, input_h;
576+ double input_s0, input_t0, input_s1, input_t1;
577+ stbir_input_callback * input_cb;
578+ void * output_pixels;
579+ int output_w, output_h;
580+ int output_subx, output_suby, output_subw, output_subh;
581+ stbir_output_callback * output_cb;
582+ int input_stride_in_bytes;
583+ int output_stride_in_bytes;
584+ int splits;
585+ int fast_alpha;
586+ int needs_rebuild;
587+ int called_alloc;
588+ stbir_pixel_layout input_pixel_layout_public;
589+ stbir_pixel_layout output_pixel_layout_public;
590+ stbir_datatype input_data_type;
591+ stbir_datatype output_data_type;
592+ stbir_filter horizontal_filter, vertical_filter;
593+ stbir_edge horizontal_edge, vertical_edge;
594+ stbir__kernel_callback * horizontal_filter_kernel; stbir__support_callback * horizontal_filter_support;
595+ stbir__kernel_callback * vertical_filter_kernel; stbir__support_callback * vertical_filter_support;
596+ stbir__info * samplers;
597+} STBIR_RESIZE;
598+
599+// extended complexity api
600+
601+
602+// First off, you must ALWAYS call stbir_resize_init on your resize structure before any of the other calls!
603+STBIRDEF void stbir_resize_init( STBIR_RESIZE * resize,
604+ const void *input_pixels, int input_w, int input_h, int input_stride_in_bytes, // stride can be zero
605+ void *output_pixels, int output_w, int output_h, int output_stride_in_bytes, // stride can be zero
606+ stbir_pixel_layout pixel_layout, stbir_datatype data_type );
607+
608+//===============================================================
609+// You can update these parameters any time after resize_init and there is no cost
610+//--------------------------------
611+
612+STBIRDEF void stbir_set_datatypes( STBIR_RESIZE * resize, stbir_datatype input_type, stbir_datatype output_type );
613+STBIRDEF void stbir_set_pixel_callbacks( STBIR_RESIZE * resize, stbir_input_callback * input_cb, stbir_output_callback * output_cb ); // no callbacks by default
614+STBIRDEF void stbir_set_user_data( STBIR_RESIZE * resize, void * user_data ); // pass back STBIR_RESIZE* by default
615+STBIRDEF void stbir_set_buffer_ptrs( STBIR_RESIZE * resize, const void * input_pixels, int input_stride_in_bytes, void * output_pixels, int output_stride_in_bytes );
616+
617+//===============================================================
618+
619+
620+//===============================================================
621+// If you call any of these functions, you will trigger a sampler rebuild!
622+//--------------------------------
623+
624+STBIRDEF int stbir_set_pixel_layouts( STBIR_RESIZE * resize, stbir_pixel_layout input_pixel_layout, stbir_pixel_layout output_pixel_layout ); // sets new buffer layouts
625+STBIRDEF int stbir_set_edgemodes( STBIR_RESIZE * resize, stbir_edge horizontal_edge, stbir_edge vertical_edge ); // CLAMP by default
626+
627+STBIRDEF int stbir_set_filters( STBIR_RESIZE * resize, stbir_filter horizontal_filter, stbir_filter vertical_filter ); // STBIR_DEFAULT_FILTER_UPSAMPLE/DOWNSAMPLE by default
628+STBIRDEF int stbir_set_filter_callbacks( STBIR_RESIZE * resize, stbir__kernel_callback * horizontal_filter, stbir__support_callback * horizontal_support, stbir__kernel_callback * vertical_filter, stbir__support_callback * vertical_support );
629+
630+STBIRDEF int stbir_set_pixel_subrect( STBIR_RESIZE * resize, int subx, int suby, int subw, int subh ); // sets both sub-regions (full regions by default)
631+STBIRDEF int stbir_set_input_subrect( STBIR_RESIZE * resize, double s0, double t0, double s1, double t1 ); // sets input sub-region (full region by default)
632+STBIRDEF int stbir_set_output_pixel_subrect( STBIR_RESIZE * resize, int subx, int suby, int subw, int subh ); // sets output sub-region (full region by default)
633+
634+// when inputting AND outputting non-premultiplied alpha pixels, we use a slower but higher quality technique
635+// that fills the zero alpha pixel's RGB values with something plausible. If you don't care about areas of
636+// zero alpha, you can call this function to get about a 25% speed improvement for STBIR_RGBA to STBIR_RGBA
637+// types of resizes.
638+STBIRDEF int stbir_set_non_pm_alpha_speed_over_quality( STBIR_RESIZE * resize, int non_pma_alpha_speed_over_quality );
639+//===============================================================
640+
641+
642+//===============================================================
643+// You can call build_samplers to prebuild all the internal data we need to resample.
644+// Then, if you call resize_extended many times with the same resize, you only pay the
645+// cost once.
646+// If you do call build_samplers, you MUST call free_samplers eventually.
647+//--------------------------------
648+
649+// This builds the samplers and does one allocation
650+STBIRDEF int stbir_build_samplers( STBIR_RESIZE * resize );
651+
652+// You MUST call this, if you call stbir_build_samplers or stbir_build_samplers_with_splits
653+STBIRDEF void stbir_free_samplers( STBIR_RESIZE * resize );
654+//===============================================================
655+
656+
657+// And this is the main function to perform the resize synchronously on one thread.
658+STBIRDEF int stbir_resize_extended( STBIR_RESIZE * resize );
659+
660+
661+//===============================================================
662+// Use these functions for multithreading.
663+// 1) You call stbir_build_samplers_with_splits first on the main thread
664+ // 2) Then stbir_resize_extended_split on each thread
665+// 3) stbir_free_samplers when done on the main thread
666+//--------------------------------
667+
668+// This will build samplers for threading.
669+// You can pass in the number of threads you'd like to use (try_splits).
670+// It returns the number of splits (threads) that you can call it with.
671+ // It might be less if the image resize can't be split up that many ways.
672+
673+STBIRDEF int stbir_build_samplers_with_splits( STBIR_RESIZE * resize, int try_splits );
674+
675+ // This function does a split of the resizing (you call this function for each
676+ // split, on multiple threads). A split is a piece of the output resize pixel space.
677+
678+// Note that you MUST call stbir_build_samplers_with_splits before stbir_resize_extended_split!
679+
680+ // Usually, you will call stbir_resize_extended_split with split_start as the thread_index
681+ // and "1" for the split_count.
682+ // But, if you have a weird situation where you MIGHT want 8 threads, but sometimes
683+ // only 4 threads, you can use 0,2,4,6 for the split_start's and use "2" for the
684+ // split_count each time to turn it into a 4 thread resize. (This is unusual).
685+
686+STBIRDEF int stbir_resize_extended_split( STBIR_RESIZE * resize, int split_start, int split_count );
687+//===============================================================
688+
689+
690+//===============================================================
691+// Pixel Callbacks info:
692+//--------------------------------
693+
694+// The input callback is super flexible - it calls you with the input address
695+// (based on the stride and base pointer), it gives you an optional_output
696+// pointer that you can fill, or you can just return your own pointer into
697+// your own data.
698+//
699+// You can also do conversion from non-supported data types if necessary - in
700+// this case, you ignore the input_ptr and just use the x and y parameters to
701+// calculate your own input_ptr based on the size of each non-supported pixel.
702+// (Something like the third example below.)
703+//
704+// You can also install just an input or just an output callback by setting the
705+// callback that you don't want to zero.
706+//
707+// First example, progress: (getting a callback that you can monitor the progress):
708+// void const * my_callback( void * optional_output, void const * input_ptr, int num_pixels, int x, int y, void * context )
709+// {
710+// percentage_done = y / input_height;
711+// return input_ptr; // use buffer from call
712+// }
713+//
714+// Next example, copying: (copy from some other buffer or stream):
715+// void const * my_callback( void * optional_output, void const * input_ptr, int num_pixels, int x, int y, void * context )
716+// {
717+// CopyOrStreamData( optional_output, other_data_src, num_pixels * pixel_width_in_bytes );
718+// return optional_output; // return the optional buffer that we filled
719+// }
720+//
721+// Third example, input another buffer without copying: (zero-copy from other buffer):
722+// void const * my_callback( void * optional_output, void const * input_ptr, int num_pixels, int x, int y, void * context )
723+// {
724+// void * pixels = ( (char*) other_image_base ) + ( y * other_image_stride ) + ( x * other_pixel_width_in_bytes );
725+// return pixels; // return pointer to your data without copying
726+// }
727+//
728+//
729+// The output callback is considerably simpler - it just calls you so that you can dump
730+// out each scanline. You could even directly copy out to disk if you have a simple format
731+// like TGA or BMP. You can also convert to other output types here if you want.
732+//
733+// Simple example:
734+// void const * my_output( void * output_ptr, int num_pixels, int y, void * context )
735+// {
736+// percentage_done = y / output_height;
737+// fwrite( output_ptr, pixel_width_in_bytes, num_pixels, output_file );
738+// }
739+//===============================================================
740+
741+
742+
743+
744+//===============================================================
745+// optional built-in profiling API
746+//--------------------------------
747+
748+#ifdef STBIR_PROFILE
749+
750+ typedef struct STBIR_PROFILE_INFO // output record of the stbir_resize_*_profile_info functions declared below
751+ {
752+ stbir_uint64 total_clocks;
753+
754+ // how many clocks spent (of total_clocks) in the various resize routines, along with a string description
755+ // there are "count" zones (see the count field below); clocks[] has room for at most 8 of them
756+ stbir_uint64 clocks[ 8 ];
757+ char const ** descriptions;
758+
759+ // count of clocks and descriptions
760+ stbir_uint32 count;
761+ } STBIR_PROFILE_INFO;
762+
763+// use after calling stbir_resize_extended (or stbir_build_samplers or stbir_build_samplers_with_splits)
764+STBIRDEF void stbir_resize_build_profile_info( STBIR_PROFILE_INFO * out_info, STBIR_RESIZE const * resize );
765+
766+// use after calling stbir_resize_extended
767+STBIRDEF void stbir_resize_extended_profile_info( STBIR_PROFILE_INFO * out_info, STBIR_RESIZE const * resize );
768+
769+// use after calling stbir_resize_extended_split
770+STBIRDEF void stbir_resize_split_profile_info( STBIR_PROFILE_INFO * out_info, STBIR_RESIZE const * resize, int split_start, int split_num );
771+
772+//===============================================================
773+
774+#endif
775+
776+
777+//// end header file /////////////////////////////////////////////////////
778+#endif // STBIR_INCLUDE_STB_IMAGE_RESIZE2_H
779+
780+#if defined(STB_IMAGE_RESIZE_IMPLEMENTATION) || defined(STB_IMAGE_RESIZE2_IMPLEMENTATION)
781+
782+#ifndef STBIR_ASSERT
783+#include <assert.h>
784+#define STBIR_ASSERT(x) assert(x)
785+#endif
786+
787+#ifndef STBIR_MALLOC
788+#include <stdlib.h>
789+#define STBIR_MALLOC(size,user_data) ((void)(user_data), malloc(size))
790+#define STBIR_FREE(ptr,user_data) ((void)(user_data), free(ptr))
791+// (we used the comma operator to evaluate user_data, to avoid "unused parameter" warnings)
792+#endif
793+
794+#ifdef _MSC_VER
795+
796+#define stbir__inline __forceinline
797+
798+#else
799+
800+#define stbir__inline __inline__
801+
802+// Clang address sanitizer
803+#if defined(__has_feature)
804+ #if __has_feature(address_sanitizer) || __has_feature(memory_sanitizer)
805+ #ifndef STBIR__SEPARATE_ALLOCATIONS
806+ #define STBIR__SEPARATE_ALLOCATIONS
807+ #endif
808+ #endif
809+#endif
810+
811+#endif
812+
813+// GCC and MSVC
814+#if defined(__SANITIZE_ADDRESS__)
815+ #ifndef STBIR__SEPARATE_ALLOCATIONS
816+ #define STBIR__SEPARATE_ALLOCATIONS
817+ #endif
818+#endif
819+
820+// Always turn off automatic FMA use - use STBIR_USE_FMA if you want.
821+// Otherwise, this is a determinism disaster.
822+#ifndef STBIR_DONT_CHANGE_FP_CONTRACT // override in case you don't want this behavior
823+#if defined(_MSC_VER) && !defined(__clang__)
824+#if _MSC_VER > 1200
825+#pragma fp_contract(off)
826+#endif
827+#elif defined(__GNUC__) && !defined(__clang__)
828+#pragma GCC optimize("fp-contract=off")
829+#else
830+#pragma STDC FP_CONTRACT OFF
831+#endif
832+#endif
833+
834+#ifdef _MSC_VER
835+#define STBIR__UNUSED(v) (void)(v)
836+#else
837+#define STBIR__UNUSED(v) (void)sizeof(v)
838+#endif
839+
840+#define STBIR__ARRAY_SIZE(a) (sizeof((a))/sizeof((a)[0]))
841+
842+
843+#ifndef STBIR_DEFAULT_FILTER_UPSAMPLE
844+#define STBIR_DEFAULT_FILTER_UPSAMPLE STBIR_FILTER_CATMULLROM
845+#endif
846+
847+#ifndef STBIR_DEFAULT_FILTER_DOWNSAMPLE
848+#define STBIR_DEFAULT_FILTER_DOWNSAMPLE STBIR_FILTER_MITCHELL
849+#endif
850+
851+
852+#ifndef STBIR__HEADER_FILENAME
853+#define STBIR__HEADER_FILENAME "stb_image_resize2.h"
854+#endif
855+
856+// the internal pixel layout enums are in a different order, so we can easily do range comparisons of types
857+// the public pixel layout is ordered in a way that if you cast num_channels (1-4) to the enum, you get something sensible
858+ typedef enum // internal layout ids, numbered so that related layouts form consecutive ranges
859+ {
860+ STBIRI_1CHANNEL = 0, // 0-4: layouts whose names carry no alpha marker
861+ STBIRI_2CHANNEL = 1,
862+ STBIRI_RGB = 2,
863+ STBIRI_BGR = 3,
864+ STBIRI_4CHANNEL = 4,
865+
866+ STBIRI_RGBA = 5, // 5-10: layouts with an A component in the name
867+ STBIRI_BGRA = 6,
868+ STBIRI_ARGB = 7,
869+ STBIRI_ABGR = 8,
870+ STBIRI_RA = 9,
871+ STBIRI_AR = 10,
872+
873+ STBIRI_RGBA_PM = 11, // 11-16: _PM twin of each 5-10 entry, exactly 6 higher (PM presumably = premultiplied alpha - confirm)
874+ STBIRI_BGRA_PM = 12,
875+ STBIRI_ARGB_PM = 13,
876+ STBIRI_ABGR_PM = 14,
877+ STBIRI_RA_PM = 15,
878+ STBIRI_AR_PM = 16,
879+ } stbir_internal_pixel_layout;
880+
881+// define the public pixel layouts to not compile inside the implementation (to avoid accidental use)
882+#define STBIR_BGR bad_dont_use_in_implementation
883+#define STBIR_1CHANNEL STBIR_BGR
884+#define STBIR_2CHANNEL STBIR_BGR
885+#define STBIR_RGB STBIR_BGR
886+#define STBIR_RGBA STBIR_BGR
887+#define STBIR_4CHANNEL STBIR_BGR
888+#define STBIR_BGRA STBIR_BGR
889+#define STBIR_ARGB STBIR_BGR
890+#define STBIR_ABGR STBIR_BGR
891+#define STBIR_RA STBIR_BGR
892+#define STBIR_AR STBIR_BGR
893+#define STBIR_RGBA_PM STBIR_BGR
894+#define STBIR_BGRA_PM STBIR_BGR
895+#define STBIR_ARGB_PM STBIR_BGR
896+#define STBIR_ABGR_PM STBIR_BGR
897+#define STBIR_RA_PM STBIR_BGR
898+#define STBIR_AR_PM STBIR_BGR
899+
900+ // byte size for each stbir_datatype (presumably per channel - confirm), listed in enum order; must match stbir_datatype
901+ static unsigned char stbir__type_size[] = {
902+ 1,1,1,2,4,2 // STBIR_TYPE_UINT8,STBIR_TYPE_UINT8_SRGB,STBIR_TYPE_UINT8_SRGB_ALPHA,STBIR_TYPE_UINT16,STBIR_TYPE_FLOAT,STBIR_TYPE_HALF_FLOAT
903+ };
904+
905+// When gathering, the contributors are which source pixels contribute.
906+// When scattering, the contributors are which destination pixels are contributed to.
907+ typedef struct // pixel index range [n0, n1]; both ends are contributing pixels
908+ {
909+ int n0; // First contributing pixel
910+ int n1; // Last contributing pixel
911+ } stbir__contributors;
912+
913+ typedef struct // sample index extents summarised over a whole filter (stored as stbir__sampler.extent_info)
914+ {
915+ int lowest; // First sample index for whole filter
916+ int highest; // Last sample index for whole filter
917+ int widest; // widest single set of samples for an output
918+ } stbir__filter_extent_info;
919+
920+ typedef struct // one run of decode-buffer pixels plus where it starts in the input scanline
921+ {
922+ int n0; // First pixel of decode buffer to write to
923+ int n1; // Last pixel of decode that will be written to
924+ int pixel_offset_for_input; // Pixel offset into input_scanline
925+ } stbir__span;
926+
927+ typedef struct stbir__scale_info // scaling parameters for one axis (embedded in stbir__sampler as scale_info)
928+ {
929+ int input_full_size;
930+ int output_sub_size;
931+ float scale; // presumably the output/input size ratio - confirm against the code that fills it
932+ float inv_scale; // presumably 1/scale - confirm
933+ float pixel_shift; // starting shift in output pixel space (in pixels)
934+ int scale_is_rational; // presumably nonzero when scale_numerator/scale_denominator describe the scale - confirm
935+ stbir_uint32 scale_numerator, scale_denominator;
936+ } stbir__scale_info;
937+
938+ typedef struct // resampling state for one axis; stbir__info holds one for horizontal and one for vertical
939+ {
940+ stbir__contributors * contributors;
941+ float* coefficients;
942+ stbir__contributors * gather_prescatter_contributors;
943+ float * gather_prescatter_coefficients;
944+ stbir__scale_info scale_info;
945+ float support;
946+ stbir_filter filter_enum;
947+ stbir__kernel_callback * filter_kernel;
948+ stbir__support_callback * filter_support;
949+ stbir_edge edge;
950+ int coefficient_width;
951+ int filter_pixel_width;
952+ int filter_pixel_margin;
953+ int num_contributors;
954+ int contributors_size; // NOTE(review): unit (bytes vs. elements) is not visible here - confirm
955+ int coefficients_size; // NOTE(review): unit (bytes vs. elements) is not visible here - confirm
956+ stbir__filter_extent_info extent_info;
957+ int is_gather; // 0 = scatter, 1 = gather with scale >= 1, 2 = gather with scale < 1
958+ int gather_prescatter_num_contributors; // the gather_prescatter_* fields mirror the plain fields above for the gather_prescatter_* arrays
959+ int gather_prescatter_coefficient_width;
960+ int gather_prescatter_contributors_size;
961+ int gather_prescatter_coefficients_size;
962+ } stbir__sampler;
963+
964+ typedef struct // type of stbir__info.scanline_extents
965+ {
966+ stbir__contributors conservative; // NOTE(review): presumably a safe outer bound on the pixel range - confirm
967+ int edge_sizes[2]; // this can be less than filter_pixel_margin, if the filter and scaling falls off
968+ stbir__span spans[2]; // can be two spans, if doing input subrect with clamp mode WRAP
969+ } stbir__extents;
970+
971+ typedef struct // working state for one split; stbir__info.split_info points at these
972+ {
973+ #ifdef STBIR_PROFILE
974+ union // the same eight counters, addressable by name or by index
975+ {
976+ struct { stbir_uint64 total, looping, vertical, horizontal, decode, encode, alpha, unalpha; } named;
977+ stbir_uint64 array[8];
978+ } profile;
979+ stbir_uint64 * current_zone_excluded_ptr;
980+ #endif
981+ float* decode_buffer;
982+
983+ int ring_buffer_first_scanline;
984+ int ring_buffer_last_scanline;
985+ int ring_buffer_begin_index; // first_scanline is at this index in the ring buffer
986+ int start_output_y, end_output_y;
987+ int start_input_y, end_input_y; // used in scatter only
988+
989+ #ifdef STBIR__SEPARATE_ALLOCATIONS
990+ float** ring_buffers; // one pointer for each ring buffer
991+ #else
992+ float* ring_buffer; // one big buffer that we index into
993+ #endif
994+
995+ float* vertical_buffer;
996+
997+ char no_cache_straddle[64]; // presumably padding so neighbouring splits do not share a cache line - confirm
998+ } stbir__per_split_info;
999+
1000+ typedef float * stbir__decode_pixels_func( float * decode, int width_times_channels, void const * input ); // type of stbir__info.decode_pixels
1001+ typedef void stbir__alpha_weight_func( float * decode_buffer, int width_times_channels ); // type of stbir__info.alpha_weight
1002+ typedef void stbir__horizontal_gather_channels_func( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer,
1003+ stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width ); // type of stbir__info.horizontal_gather_channels
1004+ typedef void stbir__alpha_unweight_func(float * encode_buffer, int width_times_channels ); // type of stbir__info.alpha_unweight
1005+ typedef void stbir__encode_pixels_func( void * output, int width_times_channels, float const * encode ); // type of stbir__info.encode_pixels
1006+
1007+ struct stbir__info // internal resize state; STBIR_RESIZE.samplers is a pointer to one of these
1008+ {
1009+ #ifdef STBIR_PROFILE
1010+ union // the same seven counters, addressable by name or by index
1011+ {
1012+ struct { stbir_uint64 total, build, alloc, horizontal, vertical, cleanup, pivot; } named;
1013+ stbir_uint64 array[7];
1014+ } profile;
1015+ stbir_uint64 * current_zone_excluded_ptr;
1016+ #endif
1017+ stbir__sampler horizontal; // one sampler per axis
1018+ stbir__sampler vertical;
1019+
1020+ void const * input_data;
1021+ void * output_data;
1022+
1023+ int input_stride_bytes;
1024+ int output_stride_bytes;
1025+ int ring_buffer_length_bytes; // The length of an individual entry in the ring buffer. The total number of ring buffers is stbir__get_filter_pixel_width(filter)
1026+ int ring_buffer_num_entries; // Total number of entries in the ring buffer.
1027+
1028+ stbir_datatype input_type;
1029+ stbir_datatype output_type;
1030+
1031+ stbir_input_callback * in_pixels_cb;
1032+ void * user_data;
1033+ stbir_output_callback * out_pixels_cb;
1034+
1035+ stbir__extents scanline_extents;
1036+
1037+ void * alloced_mem; // presumably the base of the allocation made when the samplers are built - confirm
1038+ stbir__per_split_info * split_info; // by default 1, but there will be N of these allocated based on the thread init you did
1039+
1040+ stbir__decode_pixels_func * decode_pixels; // per-stage function pointers; their types are the typedefs just above this struct
1041+ stbir__alpha_weight_func * alpha_weight;
1042+ stbir__horizontal_gather_channels_func * horizontal_gather_channels;
1043+ stbir__alpha_unweight_func * alpha_unweight;
1044+ stbir__encode_pixels_func * encode_pixels;
1045+
1046+ int alloc_ring_buffer_num_entries; // Number of entries in the ring buffer that will be allocated
1047+ int splits; // count of splits
1048+
1049+ stbir_internal_pixel_layout input_pixel_layout_internal;
1050+ stbir_internal_pixel_layout output_pixel_layout_internal;
1051+
1052+ int input_color_and_type;
1053+ int offset_x, offset_y; // offset within output_data
1054+ int vertical_first; // NOTE(review): presumably nonzero when the vertical pass runs before the horizontal one - confirm
1055+ int channels;
1056+ int effective_channels; // same as channels, except on RGBA/ARGB (7), or XA/AX (3)
1057+ size_t alloced_total; // presumably the total number of bytes allocated - confirm
1058+ };
1059+
1060+
1061+ #define stbir__max_uint8_as_float 255.0f // largest 8-bit unsigned value, as a float
1062+ #define stbir__max_uint16_as_float 65535.0f // largest 16-bit unsigned value, as a float
1063+ #define stbir__max_uint8_as_float_inverted 3.9215689e-03f // (1.0f/255.0f)
1064+ #define stbir__max_uint16_as_float_inverted 1.5259022e-05f // (1.0f/65535.0f)
1065+ #define stbir__small_float ((float)1 / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20)) // six divisions by 2^20, i.e. 2^-120
1066+
1067+ // min/max friendly: clamps lvalue x in place to [xmin, xmax]; the for(;;){...break;} wrapper runs its body once. Arguments are evaluated more than once.
1068+ #define STBIR_CLAMP(x, xmin, xmax) for(;;) { \
1069+ if ( (x) < (xmin) ) (x) = (xmin); \
1070+ if ( (x) > (xmax) ) (x) = (xmax); \
1071+ break; \
1072+ }
1073+ // Returns the smaller of the two ints (b when they compare equal).
1074+ static stbir__inline int stbir__min(int a, int b)
1075+ {
1076+ return (b <= a) ? b : a;
1077+ }
1078+ // Returns the larger of the two ints (b when they compare equal).
1079+ static stbir__inline int stbir__max(int a, int b)
1080+ {
1081+ return (b >= a) ? b : a;
1082+ }
1083+
1084+ static float stbir__srgb_uchar_to_linear_float[256] = { // lookup indexed by an 8-bit value; per the name, entry i is the linear float for sRGB byte i
1085+ 0.000000f, 0.000304f, 0.000607f, 0.000911f, 0.001214f, 0.001518f, 0.001821f, 0.002125f, 0.002428f, 0.002732f, 0.003035f,
1086+ 0.003347f, 0.003677f, 0.004025f, 0.004391f, 0.004777f, 0.005182f, 0.005605f, 0.006049f, 0.006512f, 0.006995f, 0.007499f,
1087+ 0.008023f, 0.008568f, 0.009134f, 0.009721f, 0.010330f, 0.010960f, 0.011612f, 0.012286f, 0.012983f, 0.013702f, 0.014444f,
1088+ 0.015209f, 0.015996f, 0.016807f, 0.017642f, 0.018500f, 0.019382f, 0.020289f, 0.021219f, 0.022174f, 0.023153f, 0.024158f,
1089+ 0.025187f, 0.026241f, 0.027321f, 0.028426f, 0.029557f, 0.030713f, 0.031896f, 0.033105f, 0.034340f, 0.035601f, 0.036889f,
1090+ 0.038204f, 0.039546f, 0.040915f, 0.042311f, 0.043735f, 0.045186f, 0.046665f, 0.048172f, 0.049707f, 0.051269f, 0.052861f,
1091+ 0.054480f, 0.056128f, 0.057805f, 0.059511f, 0.061246f, 0.063010f, 0.064803f, 0.066626f, 0.068478f, 0.070360f, 0.072272f,
1092+ 0.074214f, 0.076185f, 0.078187f, 0.080220f, 0.082283f, 0.084376f, 0.086500f, 0.088656f, 0.090842f, 0.093059f, 0.095307f,
1093+ 0.097587f, 0.099899f, 0.102242f, 0.104616f, 0.107023f, 0.109462f, 0.111932f, 0.114435f, 0.116971f, 0.119538f, 0.122139f,
1094+ 0.124772f, 0.127438f, 0.130136f, 0.132868f, 0.135633f, 0.138432f, 0.141263f, 0.144128f, 0.147027f, 0.149960f, 0.152926f,
1095+ 0.155926f, 0.158961f, 0.162029f, 0.165132f, 0.168269f, 0.171441f, 0.174647f, 0.177888f, 0.181164f, 0.184475f, 0.187821f,
1096+ 0.191202f, 0.194618f, 0.198069f, 0.201556f, 0.205079f, 0.208637f, 0.212231f, 0.215861f, 0.219526f, 0.223228f, 0.226966f,
1097+ 0.230740f, 0.234551f, 0.238398f, 0.242281f, 0.246201f, 0.250158f, 0.254152f, 0.258183f, 0.262251f, 0.266356f, 0.270498f,
1098+ 0.274677f, 0.278894f, 0.283149f, 0.287441f, 0.291771f, 0.296138f, 0.300544f, 0.304987f, 0.309469f, 0.313989f, 0.318547f,
1099+ 0.323143f, 0.327778f, 0.332452f, 0.337164f, 0.341914f, 0.346704f, 0.351533f, 0.356400f, 0.361307f, 0.366253f, 0.371238f,
1100+ 0.376262f, 0.381326f, 0.386430f, 0.391573f, 0.396755f, 0.401978f, 0.407240f, 0.412543f, 0.417885f, 0.423268f, 0.428691f,
1101+ 0.434154f, 0.439657f, 0.445201f, 0.450786f, 0.456411f, 0.462077f, 0.467784f, 0.473532f, 0.479320f, 0.485150f, 0.491021f,
1102+ 0.496933f, 0.502887f, 0.508881f, 0.514918f, 0.520996f, 0.527115f, 0.533276f, 0.539480f, 0.545725f, 0.552011f, 0.558340f,
1103+ 0.564712f, 0.571125f, 0.577581f, 0.584078f, 0.590619f, 0.597202f, 0.603827f, 0.610496f, 0.617207f, 0.623960f, 0.630757f,
1104+ 0.637597f, 0.644480f, 0.651406f, 0.658375f, 0.665387f, 0.672443f, 0.679543f, 0.686685f, 0.693872f, 0.701102f, 0.708376f,
1105+ 0.715694f, 0.723055f, 0.730461f, 0.737911f, 0.745404f, 0.752942f, 0.760525f, 0.768151f, 0.775822f, 0.783538f, 0.791298f,
1106+ 0.799103f, 0.806952f, 0.814847f, 0.822786f, 0.830770f, 0.838799f, 0.846873f, 0.854993f, 0.863157f, 0.871367f, 0.879622f,
1107+ 0.887923f, 0.896269f, 0.904661f, 0.913099f, 0.921582f, 0.930111f, 0.938686f, 0.947307f, 0.955974f, 0.964686f, 0.973445f,
1108+ 0.982251f, 0.991102f, 1.0f
1109+ };
1110+
1111+ typedef union // overlays an unsigned int and a float; stbir__linear_to_srgb_uchar uses it to read a float's bit pattern
1112+ {
1113+ unsigned int u;
1114+ float f;
1115+ } stbir__FP32;
1116+
1117+// From https://gist.github.com/rygorous/2203834
1118+
1119+ static const stbir_uint32 fp32_to_srgb8_tab4[104] = { // each entry: high 16 bits feed the bias, low 16 bits are the scale (unpacked in stbir__linear_to_srgb_uchar)
1120+ 0x0073000d, 0x007a000d, 0x0080000d, 0x0087000d, 0x008d000d, 0x0094000d, 0x009a000d, 0x00a1000d,
1121+ 0x00a7001a, 0x00b4001a, 0x00c1001a, 0x00ce001a, 0x00da001a, 0x00e7001a, 0x00f4001a, 0x0101001a,
1122+ 0x010e0033, 0x01280033, 0x01410033, 0x015b0033, 0x01750033, 0x018f0033, 0x01a80033, 0x01c20033,
1123+ 0x01dc0067, 0x020f0067, 0x02430067, 0x02760067, 0x02aa0067, 0x02dd0067, 0x03110067, 0x03440067,
1124+ 0x037800ce, 0x03df00ce, 0x044600ce, 0x04ad00ce, 0x051400ce, 0x057b00c5, 0x05dd00bc, 0x063b00b5,
1125+ 0x06970158, 0x07420142, 0x07e30130, 0x087b0120, 0x090b0112, 0x09940106, 0x0a1700fc, 0x0a9500f2,
1126+ 0x0b0f01cb, 0x0bf401ae, 0x0ccb0195, 0x0d950180, 0x0e56016e, 0x0f0d015e, 0x0fbc0150, 0x10630143,
1127+ 0x11070264, 0x1238023e, 0x1357021d, 0x14660201, 0x156601e9, 0x165a01d3, 0x174401c0, 0x182401af,
1128+ 0x18fe0331, 0x1a9602fe, 0x1c1502d2, 0x1d7e02ad, 0x1ed4028d, 0x201a0270, 0x21520256, 0x227d0240,
1129+ 0x239f0443, 0x25c003fe, 0x27bf03c4, 0x29a10392, 0x2b6a0367, 0x2d1d0341, 0x2ebe031f, 0x304d0300,
1130+ 0x31d105b0, 0x34a80555, 0x37520507, 0x39d504c5, 0x3c37048b, 0x3e7c0458, 0x40a8042a, 0x42bd0401,
1131+ 0x44c20798, 0x488e071e, 0x4c1c06b6, 0x4f76065d, 0x52a50610, 0x55ac05cc, 0x5892058f, 0x5b590559,
1132+ 0x5e0c0a23, 0x631c0980, 0x67db08f6, 0x6c55087f, 0x70940818, 0x74a007bd, 0x787d076c, 0x7c330723,
1133+ };
1134+ // Encodes a linear float as an 8-bit sRGB value via fp32_to_srgb8_tab4 (method from the gist cited above the table).
1135+ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
1136+ {
1137+ static const stbir__FP32 lower_limit = { (127-13) << 23 }; // bit pattern of 2^(-13)
1138+ static const stbir__FP32 upper_limit = { 0x3f7fffff }; // bit pattern of 1-eps
1139+ stbir__FP32 bits;
1140+ stbir_uint32 entry, frac;
1141+
1142+ // Anything above 1-eps saturates to 255, and anything that is not strictly
1143+ // greater than 2^(-13) becomes 0. The second test is kept in its negated form
1144+ // because a NaN fails every comparison and so must end up returning 0 too.
1145+ if (in > upper_limit.f)
1146+ return 255;
1147+ if (!(in > lower_limit.f))
1148+ return 0;
1149+
1150+ // The float's bit pattern, rebased to lower_limit, selects one table entry
1151+ bits.f = in;
1152+ entry = fp32_to_srgb8_tab4[(bits.u - lower_limit.u) >> 20];
1153+
1154+ // Interpolate within the entry using the next eight mantissa bits:
1155+ // bias = (high 16 bits) << 9, scale = low 16 bits, result = (bias + scale*frac) >> 16
1156+ frac = (bits.u >> 12) & 0xff;
1157+ entry = ((entry >> 16) << 9) + (entry & 0xffff)*frac;
1158+ return (unsigned char) (entry >> 16);
1159+ }
1160+
1161+#ifndef STBIR_FORCE_GATHER_FILTER_SCANLINES_AMOUNT
1162+#define STBIR_FORCE_GATHER_FILTER_SCANLINES_AMOUNT 32 // when downsampling and <= 32 scanlines of buffering, use gather. gather used down to 1/8th scaling for 25% win.
1163+#endif
1164+
1165+#ifndef STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS
1166+#define STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS 4 // when threading, what is the minimum number of scanlines for a split?
1167+#endif
1168+
1169+#define STBIR_INPUT_CALLBACK_PADDING 3
1170+
1171+#ifdef _M_IX86_FP
1172+#if ( _M_IX86_FP >= 1 )
1173+#ifndef STBIR_SSE
1174+#define STBIR_SSE
1175+#endif
1176+#endif
1177+#endif
1178+
1179+#ifdef __TINYC__
1180+ // tiny c has no intrinsics yet - this can become a version check if they add them
1181+ #define STBIR_NO_SIMD
1182+#endif
1183+
1184+ #if defined(_x86_64) || defined( __x86_64__ ) || defined( _M_X64 ) || defined(__x86_64) || defined(_M_AMD64) || defined(__SSE2__) || defined(STBIR_SSE) || defined(STBIR_SSE2)
1185+ #ifndef STBIR_SSE2
1186+ #define STBIR_SSE2
1187+ #endif
1188+ #if defined(__AVX__) || defined(STBIR_AVX2)
1189+ #ifndef STBIR_AVX
1190+ #ifndef STBIR_NO_AVX
1191+ #define STBIR_AVX
1192+ #endif
1193+ #endif
1194+ #endif
1195+ #if defined(__AVX2__) || defined(STBIR_AVX2)
1196+ #ifndef STBIR_NO_AVX2
1197+ #ifndef STBIR_AVX2
1198+ #define STBIR_AVX2
1199+ #endif
1200+ #if defined( _MSC_VER ) && !defined(__clang__)
1201+ #ifndef STBIR_FP16C // FP16C instructions are on all AVX2 cpus, so we can autoselect it here on microsoft - clang needs -mf16c
1202+ #define STBIR_FP16C
1203+ #endif
1204+ #endif
1205+ #endif
1206+ #endif
1207+ #ifdef __F16C__
1208+ #ifndef STBIR_FP16C // turn on FP16C instructions if the define is set (for clang and gcc)
1209+ #define STBIR_FP16C
1210+ #endif
1211+ #endif
1212+ #endif
1213+
1214+#if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) || ((__ARM_NEON_FP & 4) != 0) || defined(__ARM_NEON__)
1215+#ifndef STBIR_NEON
1216+#define STBIR_NEON
1217+#endif
1218+#endif
1219+
1220+#if defined(_M_ARM) || defined(__arm__)
1221+#ifdef STBIR_USE_FMA
1222+#undef STBIR_USE_FMA // no FMA for 32-bit arm on MSVC
1223+#endif
1224+#endif
1225+
1226+#if defined(__wasm__) && defined(__wasm_simd128__)
1227+#ifndef STBIR_WASM
1228+#define STBIR_WASM
1229+#endif
1230+#endif
1231+
1232+// restrict pointers for the output pointers, other loop and unroll control
1233+#if defined( _MSC_VER ) && !defined(__clang__)
1234+ #define STBIR_STREAMOUT_PTR( star ) star __restrict
1235+ #define STBIR_NO_UNROLL( ptr ) __assume(ptr) // this oddly keeps msvc from unrolling a loop
1236+ #if _MSC_VER >= 1900
1237+ #define STBIR_NO_UNROLL_LOOP_START __pragma(loop( no_vector ))
1238+ #else
1239+ #define STBIR_NO_UNROLL_LOOP_START
1240+ #endif
1241+#elif defined( __clang__ )
1242+ #define STBIR_STREAMOUT_PTR( star ) star __restrict__
1243+ #define STBIR_NO_UNROLL( ptr ) __asm__ (""::"r"(ptr))
1244+ #if ( __clang_major__ >= 4 ) || ( ( __clang_major__ >= 3 ) && ( __clang_minor__ >= 5 ) )
1245+ #define STBIR_NO_UNROLL_LOOP_START _Pragma("clang loop unroll(disable)") _Pragma("clang loop vectorize(disable)")
1246+ #else
1247+ #define STBIR_NO_UNROLL_LOOP_START
1248+ #endif
1249+#elif defined( __GNUC__ )
1250+ #define STBIR_STREAMOUT_PTR( star ) star __restrict__
1251+ #define STBIR_NO_UNROLL( ptr ) __asm__ (""::"r"(ptr))
1252+ #if __GNUC__ >= 14
1253+ #define STBIR_NO_UNROLL_LOOP_START _Pragma("GCC unroll 0") _Pragma("GCC novector")
1254+ #else
1255+ #define STBIR_NO_UNROLL_LOOP_START
1256+ #endif
1257+ #define STBIR_NO_UNROLL_LOOP_START_INF_FOR
1258+#else
1259+ #define STBIR_STREAMOUT_PTR( star ) star
1260+ #define STBIR_NO_UNROLL( ptr )
1261+ #define STBIR_NO_UNROLL_LOOP_START
1262+#endif
1263+
1264+#ifndef STBIR_NO_UNROLL_LOOP_START_INF_FOR
1265+#define STBIR_NO_UNROLL_LOOP_START_INF_FOR STBIR_NO_UNROLL_LOOP_START
1266+#endif
1267+
1268+#ifdef STBIR_NO_SIMD // force simd off for whatever reason
1269+
1270+// force simd off overrides everything else, so clear it all
1271+
1272+#ifdef STBIR_SSE2
1273+#undef STBIR_SSE2
1274+#endif
1275+
1276+#ifdef STBIR_AVX
1277+#undef STBIR_AVX
1278+#endif
1279+
1280+#ifdef STBIR_NEON
1281+#undef STBIR_NEON
1282+#endif
1283+
1284+#ifdef STBIR_AVX2
1285+#undef STBIR_AVX2
1286+#endif
1287+
1288+#ifdef STBIR_FP16C
1289+#undef STBIR_FP16C
1290+#endif
1291+
1292+#ifdef STBIR_WASM
1293+#undef STBIR_WASM
1294+#endif
1295+
1296+#ifdef STBIR_SIMD
1297+#undef STBIR_SIMD
1298+#endif
1299+
1300+#else // STBIR_SIMD
1301+
1302+#ifdef STBIR_SSE2
1303+ #include <emmintrin.h>
1304+
1305+ #define stbir__simdf __m128
1306+ #define stbir__simdi __m128i
1307+
1308+ #define stbir_simdi_castf( reg ) _mm_castps_si128(reg)
1309+ #define stbir_simdf_casti( reg ) _mm_castsi128_ps(reg)
1310+
1311+ #define stbir__simdf_load( reg, ptr ) (reg) = _mm_loadu_ps( (float const*)(ptr) )
1312+ #define stbir__simdi_load( reg, ptr ) (reg) = _mm_loadu_si128 ( (stbir__simdi const*)(ptr) )
1313+ #define stbir__simdf_load1( out, ptr ) (out) = _mm_load_ss( (float const*)(ptr) ) // top values can be random (not denormal or nan for perf)
1314+ #define stbir__simdi_load1( out, ptr ) (out) = _mm_castps_si128( _mm_load_ss( (float const*)(ptr) ))
1315+ #define stbir__simdf_load1z( out, ptr ) (out) = _mm_load_ss( (float const*)(ptr) ) // top values must be zero
1316+ #define stbir__simdf_frep4( fvar ) _mm_set_ps1( fvar )
1317+ #define stbir__simdf_load1frep4( out, fvar ) (out) = _mm_set_ps1( fvar )
1318+ #define stbir__simdf_load2( out, ptr ) (out) = _mm_castsi128_ps( _mm_loadl_epi64( (__m128i*)(ptr)) ) // top values can be random (not denormal or nan for perf)
1319+ #define stbir__simdf_load2z( out, ptr ) (out) = _mm_castsi128_ps( _mm_loadl_epi64( (__m128i*)(ptr)) ) // top values must be zero
1320+ #define stbir__simdf_load2hmerge( out, reg, ptr ) (out) = _mm_castpd_ps(_mm_loadh_pd( _mm_castps_pd(reg), (double*)(ptr) ))
1321+
1322+ #define stbir__simdf_zeroP() _mm_setzero_ps()
1323+ #define stbir__simdf_zero( reg ) (reg) = _mm_setzero_ps()
1324+
1325+ #define stbir__simdf_store( ptr, reg ) _mm_storeu_ps( (float*)(ptr), reg )
1326+ #define stbir__simdf_store1( ptr, reg ) _mm_store_ss( (float*)(ptr), reg )
1327+ #define stbir__simdf_store2( ptr, reg ) _mm_storel_epi64( (__m128i*)(ptr), _mm_castps_si128(reg) )
1328+ #define stbir__simdf_store2h( ptr, reg ) _mm_storeh_pd( (double*)(ptr), _mm_castps_pd(reg) )
1329+
1330+ #define stbir__simdi_store( ptr, reg ) _mm_storeu_si128( (__m128i*)(ptr), reg )
1331+ #define stbir__simdi_store1( ptr, reg ) _mm_store_ss( (float*)(ptr), _mm_castsi128_ps(reg) )
1332+ #define stbir__simdi_store2( ptr, reg ) _mm_storel_epi64( (__m128i*)(ptr), (reg) )
1333+
1334+ #define stbir__prefetch( ptr ) _mm_prefetch((char*)(ptr), _MM_HINT_T0 )
1335+
1336+ #define stbir__simdi_expand_u8_to_u32(out0,out1,out2,out3,ireg) \
1337+ { \
1338+ stbir__simdi zero = _mm_setzero_si128(); \
1339+ out2 = _mm_unpacklo_epi8( ireg, zero ); \
1340+ out3 = _mm_unpackhi_epi8( ireg, zero ); \
1341+ out0 = _mm_unpacklo_epi16( out2, zero ); \
1342+ out1 = _mm_unpackhi_epi16( out2, zero ); \
1343+ out2 = _mm_unpacklo_epi16( out3, zero ); \
1344+ out3 = _mm_unpackhi_epi16( out3, zero ); \
1345+ }
1346+
// Zero-extend only the low four bytes of `ireg` into the four 32-bit lanes of `out`
// (byte -> 16-bit -> 32-bit by interleaving with zero twice).
1347+#define stbir__simdi_expand_u8_to_1u32(out,ireg) \
1348+ { \
1349+ stbir__simdi zero = _mm_setzero_si128(); \
1350+ out = _mm_unpacklo_epi8( ireg, zero ); \
1351+ out = _mm_unpacklo_epi16( out, zero ); \
1352+ }
1353+
// Zero-extend the eight unsigned 16-bit values of `ireg` to 32 bits:
// values 0-3 -> out0, values 4-7 -> out1.
1354+ #define stbir__simdi_expand_u16_to_u32(out0,out1,ireg) \
1355+ { \
1356+ stbir__simdi zero = _mm_setzero_si128(); \
1357+ out0 = _mm_unpacklo_epi16( ireg, zero ); \
1358+ out1 = _mm_unpackhi_epi16( ireg, zero ); \
1359+ }
1360+
// Float -> integer conversions. All use the truncating (cvtt) forms.
// _to_i32 converts four lanes; _to_int converts lane 0 and yields a scalar int.
1361+ #define stbir__simdf_convert_float_to_i32( i, f ) (i) = _mm_cvttps_epi32(f)
1362+ #define stbir__simdf_convert_float_to_int( f ) _mm_cvtt_ss2si(f)
// Clamp to [0, max] first (min against the max constant, then max against zero),
// truncate, and return lane 0 narrowed to unsigned char / unsigned short.
1363+ #define stbir__simdf_convert_float_to_uint8( f ) ((unsigned char)_mm_cvtsi128_si32(_mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(f,STBIR__CONSTF(STBIR_max_uint8_as_float)),_mm_setzero_ps()))))
1364+ #define stbir__simdf_convert_float_to_short( f ) ((unsigned short)_mm_cvtsi128_si32(_mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(f,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps()))))
1365+
// Basic arithmetic. The *_mem forms take the second operand from memory through an
// unaligned load; the "1" forms use the scalar (_ss) intrinsics, so only lane 0 is
// computed and the upper lanes are carried over from `reg`.
1366+ #define stbir__simdi_to_int( i ) _mm_cvtsi128_si32(i)
1367+ #define stbir__simdi_convert_i32_to_float(out, ireg) (out) = _mm_cvtepi32_ps( ireg )
1368+ #define stbir__simdf_add( out, reg0, reg1 ) (out) = _mm_add_ps( reg0, reg1 )
1369+ #define stbir__simdf_mult( out, reg0, reg1 ) (out) = _mm_mul_ps( reg0, reg1 )
1370+ #define stbir__simdf_mult_mem( out, reg, ptr ) (out) = _mm_mul_ps( reg, _mm_loadu_ps( (float const*)(ptr) ) )
1371+ #define stbir__simdf_mult1_mem( out, reg, ptr ) (out) = _mm_mul_ss( reg, _mm_load_ss( (float const*)(ptr) ) )
1372+ #define stbir__simdf_add_mem( out, reg, ptr ) (out) = _mm_add_ps( reg, _mm_loadu_ps( (float const*)(ptr) ) )
1373+ #define stbir__simdf_add1_mem( out, reg, ptr ) (out) = _mm_add_ss( reg, _mm_load_ss( (float const*)(ptr) ) )
1374+
// Multiply-add: out = add + mul1 * mul2 (the *_mem forms read the second factor from
// `ptr` with an unaligned / scalar load). With STBIR_USE_FMA the fused intrinsics are
// used; otherwise a separate multiply and add.
// For the scalar "1" forms only lane 0 is comparable between the two variants: the
// upper lanes come from the first intrinsic operand, which is `mul1`/`mul` in the FMA
// form and `add` in the non-FMA form.
1375+ #ifdef STBIR_USE_FMA // not on by default to maintain bit identical simd to non-simd
1376+ #include <immintrin.h>
1377+ #define stbir__simdf_madd( out, add, mul1, mul2 ) (out) = _mm_fmadd_ps( mul1, mul2, add )
1378+ #define stbir__simdf_madd1( out, add, mul1, mul2 ) (out) = _mm_fmadd_ss( mul1, mul2, add )
1379+ #define stbir__simdf_madd_mem( out, add, mul, ptr ) (out) = _mm_fmadd_ps( mul, _mm_loadu_ps( (float const*)(ptr) ), add )
1380+ #define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = _mm_fmadd_ss( mul, _mm_load_ss( (float const*)(ptr) ), add )
1381+ #else
1382+ #define stbir__simdf_madd( out, add, mul1, mul2 ) (out) = _mm_add_ps( add, _mm_mul_ps( mul1, mul2 ) )
1383+ #define stbir__simdf_madd1( out, add, mul1, mul2 ) (out) = _mm_add_ss( add, _mm_mul_ss( mul1, mul2 ) )
1384+ #define stbir__simdf_madd_mem( out, add, mul, ptr ) (out) = _mm_add_ps( add, _mm_mul_ps( mul, _mm_loadu_ps( (float const*)(ptr) ) ) )
1385+ #define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = _mm_add_ss( add, _mm_mul_ss( mul, _mm_load_ss( (float const*)(ptr) ) ) )
1386+ #endif
1387+
// Scalar (lane 0 only) add and multiply; upper lanes are copied from reg0.
1388+ #define stbir__simdf_add1( out, reg0, reg1 ) (out) = _mm_add_ss( reg0, reg1 )
1389+ #define stbir__simdf_mult1( out, reg0, reg1 ) (out) = _mm_mul_ss( reg0, reg1 )
1390+
// Bitwise AND / OR on the float registers.
1391+ #define stbir__simdf_and( out, reg0, reg1 ) (out) = _mm_and_ps( reg0, reg1 )
1392+ #define stbir__simdf_or( out, reg0, reg1 ) (out) = _mm_or_ps( reg0, reg1 )
1393+
// Per-lane min / max; the "1" forms only compute lane 0.
1394+ #define stbir__simdf_min( out, reg0, reg1 ) (out) = _mm_min_ps( reg0, reg1 )
1395+ #define stbir__simdf_max( out, reg0, reg1 ) (out) = _mm_max_ps( reg0, reg1 )
1396+ #define stbir__simdf_min1( out, reg0, reg1 ) (out) = _mm_min_ss( reg0, reg1 )
1397+ #define stbir__simdf_max1( out, reg0, reg1 ) (out) = _mm_max_ss( reg0, reg1 )
1398+
// Cross-register shuffles. Naming: reg0 holds lanes "0123", reg1 holds lanes "ABCD",
// and the suffix lists the resulting lanes ("x" = unspecified filler).
// Both first build [A,B,2,3] with _mm_shuffle_ps and then rotate it:
// 3ABx yields [3,A,B,2]; 23Ax yields [2,3,A,B].
1399+ #define stbir__simdf_0123ABCDto3ABx( out, reg0, reg1 ) (out)=_mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( _mm_shuffle_ps( reg1,reg0, (0<<0) + (1<<2) + (2<<4) + (3<<6) )), (3<<0) + (0<<2) + (1<<4) + (2<<6) ) )
1400+ #define stbir__simdf_0123ABCDto23Ax( out, reg0, reg1 ) (out)=_mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( _mm_shuffle_ps( reg1,reg0, (0<<0) + (1<<2) + (2<<4) + (3<<6) )), (2<<0) + (3<<2) + (0<<4) + (1<<6) ) )
1401+
// Alpha-pattern builders. The macro name spells the resulting lanes: "a" is an alpha
// value taken from `alp`, "1" is a one.
//   aaa1: [alp3, alp3, alp3, ones2]   (alpha read from lane 3)
//   1aaa: [ones0, alp0, alp0, alp0]   (alpha read from lane 0)
//   a1a1: [alp1, 1, alp3, 1]          (64-bit right shift, then OR with STBIR_zeroones)
//   1a1a: [1, alp0, 1, alp2]          (64-bit left shift, then OR with STBIR_onezeros)
// The a1a1 / 1a1a forms ignore their `ones` argument and OR in the constants below.
1402+ static const stbir__simdf STBIR_zeroones = { 0.0f,1.0f,0.0f,1.0f };
1403+ static const stbir__simdf STBIR_onezeros = { 1.0f,0.0f,1.0f,0.0f };
1404+ #define stbir__simdf_aaa1( out, alp, ones ) (out)=_mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( _mm_movehl_ps( ones, alp ) ), (1<<0) + (1<<2) + (1<<4) + (2<<6) ) )
1405+ #define stbir__simdf_1aaa( out, alp, ones ) (out)=_mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( _mm_movelh_ps( ones, alp ) ), (0<<0) + (2<<2) + (2<<4) + (2<<6) ) )
1406+ #define stbir__simdf_a1a1( out, alp, ones) (out) = _mm_or_ps( _mm_castsi128_ps( _mm_srli_epi64( _mm_castps_si128(alp), 32 ) ), STBIR_zeroones )
1407+ #define stbir__simdf_1a1a( out, alp, ones) (out) = _mm_or_ps( _mm_castsi128_ps( _mm_slli_epi64( _mm_castps_si128(alp), 32 ) ), STBIR_onezeros )
1408+
// General lane permute (expression form): result lane k takes the source lane given by
// the k-th index argument. The indices are pasted unparenthesized into the shuffle
// immediate, so they must be plain compile-time constants in 0..3.
1409+ #define stbir__simdf_swiz( reg, one, two, three, four ) _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( reg ), (one<<0) + (two<<2) + (three<<4) + (four<<6) ) )
1410+
// Integer-register bitwise AND / OR, and 16madd: multiply signed 16-bit elements
// pairwise and add each adjacent pair of products into a 32-bit lane (_mm_madd_epi16).
1411+ #define stbir__simdi_and( out, reg0, reg1 ) (out) = _mm_and_si128( reg0, reg1 )
1412+ #define stbir__simdi_or( out, reg0, reg1 ) (out) = _mm_or_si128( reg0, reg1 )
1413+ #define stbir__simdi_16madd( out, reg0, reg1 ) (out) = _mm_madd_epi16( reg0, reg1 )
1414+
// Convert the eight floats in `aa` (first four) and `bb` (last four) to unsigned bytes:
// clamp to [0, STBIR_max_uint8_as_float], truncate to int32, then narrow 32 -> 16 -> 8
// bits. packus is applied to `a` with itself, so the eight result bytes sit in the low
// 64 bits of `out` and are repeated in the high 64 bits.
// Expands to a braced block with locals af, bf, a, b.
1415+ #define stbir__simdf_pack_to_8bytes(out,aa,bb) \
1416+ { \
1417+ stbir__simdf af,bf; \
1418+ stbir__simdi a,b; \
1419+ af = _mm_min_ps( aa, STBIR_max_uint8_as_float ); \
1420+ bf = _mm_min_ps( bb, STBIR_max_uint8_as_float ); \
1421+ af = _mm_max_ps( af, _mm_setzero_ps() ); \
1422+ bf = _mm_max_ps( bf, _mm_setzero_ps() ); \
1423+ a = _mm_cvttps_epi32( af ); \
1424+ b = _mm_cvttps_epi32( bf ); \
1425+ a = _mm_packs_epi32( a, b ); \
1426+ out = _mm_packus_epi16( a, a ); \
1427+ }
1428+
// Load 16 floats starting at `ptr` as four rows of four and transpose them in registers:
// o0 = elements {0,4,8,12}, o1 = {1,5,9,13}, o2 = {2,6,10,14}, o3 = {3,7,11,15}.
// The row offsets are written as (ptr)+4 etc., so `ptr` is presumably a float pointer
// (verify against callers). The expansion is four load statements followed by a braced
// block, i.e. it is not a single statement.
1429+ #define stbir__simdf_load4_transposed( o0, o1, o2, o3, ptr ) \
1430+ stbir__simdf_load( o0, (ptr) ); \
1431+ stbir__simdf_load( o1, (ptr)+4 ); \
1432+ stbir__simdf_load( o2, (ptr)+8 ); \
1433+ stbir__simdf_load( o3, (ptr)+12 ); \
1434+ { \
1435+ __m128 tmp0, tmp1, tmp2, tmp3; \
1436+ tmp0 = _mm_unpacklo_ps(o0, o1); \
1437+ tmp2 = _mm_unpacklo_ps(o2, o3); \
1438+ tmp1 = _mm_unpackhi_ps(o0, o1); \
1439+ tmp3 = _mm_unpackhi_ps(o2, o3); \
1440+ o0 = _mm_movelh_ps(tmp0, tmp2); \
1441+ o1 = _mm_movehl_ps(tmp2, tmp0); \
1442+ o2 = _mm_movelh_ps(tmp1, tmp3); \
1443+ o3 = _mm_movehl_ps(tmp3, tmp1); \
1444+ }
1445+
// Narrow four vectors of 32-bit integers to bytes (signed saturation to 16 bits, then
// unsigned saturation to 8 bits) and store 16 bytes at `ptr`, interleaved as
// r0[0],r1[0],r2[0],r3[0], r0[1],r1[1],r2[1],r3[1], ...
// r0..r3 are used as scratch and are overwritten.
// NOTE: the final line of the macro ends with a line-continuation backslash, so the
// line that follows it must stay blank.
1446+ #define stbir__interleave_pack_and_store_16_u8( ptr, r0, r1, r2, r3 ) \
1447+ r0 = _mm_packs_epi32( r0, r1 ); \
1448+ r2 = _mm_packs_epi32( r2, r3 ); \
1449+ r1 = _mm_unpacklo_epi16( r0, r2 ); \
1450+ r3 = _mm_unpackhi_epi16( r0, r2 ); \
1451+ r0 = _mm_unpacklo_epi16( r1, r3 ); \
1452+ r2 = _mm_unpackhi_epi16( r1, r3 ); \
1453+ r0 = _mm_packus_epi16( r0, r2 ); \
1454+ stbir__simdi_store( ptr, r0 ); \
1455+
// Logical right shift of every 32-bit lane by `imm`.
1456+ #define stbir__simdi_32shr( out, reg, imm ) out = _mm_srli_epi32( reg, imm )
1457+
// Helpers that spell out brace-initializers for static integer vectors.
// STBIR__CONST_4_32i(v) repeats one 32-bit value in four lanes; STBIR__CONST_4d_32i
// places four distinct values (v0 in the lowest lane). The initializer is emitted as
// sixteen chars on MSVC and as two long longs (low 32 bits = even lane) elsewhere.
1458+ #if defined(_MSC_VER) && !defined(__clang__)
1459+ // msvc inits with 8 bytes
1460+ #define STBIR__CONST_32_TO_8( v ) (char)(unsigned char)((v)&255),(char)(unsigned char)(((v)>>8)&255),(char)(unsigned char)(((v)>>16)&255),(char)(unsigned char)(((v)>>24)&255)
1461+ #define STBIR__CONST_4_32i( v ) STBIR__CONST_32_TO_8( v ), STBIR__CONST_32_TO_8( v ), STBIR__CONST_32_TO_8( v ), STBIR__CONST_32_TO_8( v )
1462+ #define STBIR__CONST_4d_32i( v0, v1, v2, v3 ) STBIR__CONST_32_TO_8( v0 ), STBIR__CONST_32_TO_8( v1 ), STBIR__CONST_32_TO_8( v2 ), STBIR__CONST_32_TO_8( v3 )
1463+ #else
1464+ // everything else inits with long long's
1465+ #define STBIR__CONST_4_32i( v ) (long long)((((stbir_uint64)(stbir_uint32)(v))<<32)|((stbir_uint64)(stbir_uint32)(v))),(long long)((((stbir_uint64)(stbir_uint32)(v))<<32)|((stbir_uint64)(stbir_uint32)(v)))
1466+ #define STBIR__CONST_4d_32i( v0, v1, v2, v3 ) (long long)((((stbir_uint64)(stbir_uint32)(v1))<<32)|((stbir_uint64)(stbir_uint32)(v0))),(long long)((((stbir_uint64)(stbir_uint32)(v3))<<32)|((stbir_uint64)(stbir_uint32)(v2)))
1467+ #endif
1468+
// Declare a vector constant with the same value in every lane. In this branch the
// STBIR__CONSTF / STBIR__CONSTI accessors are the identity.
1469+ #define STBIR__SIMDF_CONST(var, x) stbir__simdf var = { x, x, x, x }
1470+ #define STBIR__SIMDI_CONST(var, x) stbir__simdi var = { STBIR__CONST_4_32i(x) }
1471+ #define STBIR__CONSTF(var) (var)
1472+ #define STBIR__CONSTI(var) (var)
1473+
// stbir__simdf_pack_to_8words: convert the eight floats in reg0 (first four) and reg1
// (last four) to unsigned 16-bit values in `out`. Inputs are clamped to
// [0, STBIR_max_uint16_as_float] and truncated to int32 before narrowing.
// With SSE4.1/AVX the unsigned-saturating 32->16 pack is used directly.
1474+ #if defined(STBIR_AVX) || defined(__SSE4_1__)
1475+ #include <smmintrin.h>
1476+ #define stbir__simdf_pack_to_8words(out,reg0,reg1) out = _mm_packus_epi32(_mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg0,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())), _mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg1,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())))
1477+ #else
// Plain SSE2 only has the signed-saturating 32->16 pack, so the values are biased by
// -32768 into signed range, packed, and the bias is undone per 16-bit lane afterwards
// (subtracting 0x8000 is the same as adding it modulo 2^16).
// The 16-bit bias pattern is built from unsigned literals: 32768<<16 on a signed int
// would shift into the sign bit, which is undefined behaviour in C.
1478+ static STBIR__SIMDI_CONST(stbir__s32_32768, 32768);
1479+ static STBIR__SIMDI_CONST(stbir__s16_32768, ((32768u<<16)|32768u));
1480+
1481+ #define stbir__simdf_pack_to_8words(out,reg0,reg1) \
1482+ { \
1483+ stbir__simdi tmp0,tmp1; \
1484+ tmp0 = _mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg0,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())); \
1485+ tmp1 = _mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg1,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())); \
1486+ tmp0 = _mm_sub_epi32( tmp0, stbir__s32_32768 ); \
1487+ tmp1 = _mm_sub_epi32( tmp1, stbir__s32_32768 ); \
1488+ out = _mm_packs_epi32( tmp0, tmp1 ); \
1489+ out = _mm_sub_epi16( out, stbir__s16_32768 ); \
1490+ }
1491+
1492+ #endif
1493+
// Flag macro set at the end of the 4-wide macro definitions above; presumably tested
// elsewhere in the file to select the SIMD code paths (usage is outside this section).
1494+ #define STBIR_SIMD
1495+
1496+ // if we detect AVX, set the simd8 defines
1497+ #ifdef STBIR_AVX
1498+ #include <immintrin.h>
// 8-wide (256-bit) AVX layer: register types and basic ops, parallel to the 4-wide
// macros. Loads and stores use the unaligned intrinsics.
1499+ #define STBIR_SIMD8
1500+ #define stbir__simdf8 __m256
1501+ #define stbir__simdi8 __m256i
1502+ #define stbir__simdf8_load( out, ptr ) (out) = _mm256_loadu_ps( (float const *)(ptr) )
1503+ #define stbir__simdi8_load( out, ptr ) (out) = _mm256_loadu_si256( (__m256i const *)(ptr) )
1504+ #define stbir__simdf8_mult( out, a, b ) (out) = _mm256_mul_ps( (a), (b) )
1505+ #define stbir__simdf8_store( ptr, out ) _mm256_storeu_ps( (float*)(ptr), out )
1506+ #define stbir__simdi8_store( ptr, reg ) _mm256_storeu_si256( (__m256i*)(ptr), reg )
1507+ #define stbir__simdf8_frep8( fval ) _mm256_set1_ps( fval )
1508+
1509+ #define stbir__simdf8_min( out, reg0, reg1 ) (out) = _mm256_min_ps( reg0, reg1 )
1510+ #define stbir__simdf8_max( out, reg0, reg1 ) (out) = _mm256_max_ps( reg0, reg1 )
1511+
// add4halves: 4-wide result = bot4 + the upper 128 bits of top8.
// load1b / load1rep4: broadcast the float at `ptr` to 8 / 4 lanes; `ptr` is passed to
// the intrinsic without a cast.
1512+ #define stbir__simdf8_add4halves( out, bot4, top8 ) (out) = _mm_add_ps( bot4, _mm256_extractf128_ps( top8, 1 ) )
1513+ #define stbir__simdf8_mult_mem( out, reg, ptr ) (out) = _mm256_mul_ps( reg, _mm256_loadu_ps( (float const*)(ptr) ) )
1514+ #define stbir__simdf8_add_mem( out, reg, ptr ) (out) = _mm256_add_ps( reg, _mm256_loadu_ps( (float const*)(ptr) ) )
1515+ #define stbir__simdf8_add( out, a, b ) (out) = _mm256_add_ps( a, b )
1516+ #define stbir__simdf8_load1b( out, ptr ) (out) = _mm256_broadcast_ss( ptr )
1517+ #define stbir__simdf_load1rep4( out, ptr ) (out) = _mm_broadcast_ss( ptr ) // avx load instruction
1518+
// int32 <-> float conversions on eight lanes (float -> int truncates).
1519+ #define stbir__simdi8_convert_i32_to_float(out, ireg) (out) = _mm256_cvtepi32_ps( ireg )
1520+ #define stbir__simdf8_convert_float_to_i32( i, f ) (i) = _mm256_cvttps_epi32(f)
1521+
// bot4s: [low half of a | low half of b]; top4s: [high half of a | high half of b].
1522+ #define stbir__simdf8_bot4s( out, a, b ) (out) = _mm256_permute2f128_ps(a,b, (0<<0)+(2<<4) )
1523+ #define stbir__simdf8_top4s( out, a, b ) (out) = _mm256_permute2f128_ps(a,b, (1<<0)+(3<<4) )
1524+
// Upper 128 bits of an 8-wide register as a 4-wide value.
1525+ #define stbir__simdf8_gettop4( reg ) _mm256_extractf128_ps(reg,1)
1526+
1527+ #ifdef STBIR_AVX2
1528+
// AVX2 variants of the widen / pack helpers (256-bit integer instructions).
// The 64-bit permutes with pattern (0,2,1,3) compensate for the 256-bit unpack and
// pack instructions working on each 128-bit half separately.

// Zero-extend the 16 bytes of the 128-bit `ireg` to 32 bits:
// bytes 0-7 -> out0, bytes 8-15 -> out1.
1529+ #define stbir__simdi8_expand_u8_to_u32(out0,out1,ireg) \
1530+ { \
1531+ stbir__simdi8 a, zero =_mm256_setzero_si256();\
1532+ a = _mm256_permute4x64_epi64( _mm256_unpacklo_epi8( _mm256_permute4x64_epi64(_mm256_castsi128_si256(ireg),(0<<0)+(2<<2)+(1<<4)+(3<<6)), zero ),(0<<0)+(2<<2)+(1<<4)+(3<<6)); \
1533+ out0 = _mm256_unpacklo_epi16( a, zero ); \
1534+ out1 = _mm256_unpackhi_epi16( a, zero ); \
1535+ }
1536+
// Clamp the sixteen floats in aa, bb to [0, STBIR_max_uint8_as_floatX], truncate, and
// narrow to sixteen bytes returned in the 128-bit `out`.
1537+ #define stbir__simdf8_pack_to_16bytes(out,aa,bb) \
1538+ { \
1539+ stbir__simdi8 t; \
1540+ stbir__simdf8 af,bf; \
1541+ stbir__simdi8 a,b; \
1542+ af = _mm256_min_ps( aa, STBIR_max_uint8_as_floatX ); \
1543+ bf = _mm256_min_ps( bb, STBIR_max_uint8_as_floatX ); \
1544+ af = _mm256_max_ps( af, _mm256_setzero_ps() ); \
1545+ bf = _mm256_max_ps( bf, _mm256_setzero_ps() ); \
1546+ a = _mm256_cvttps_epi32( af ); \
1547+ b = _mm256_cvttps_epi32( bf ); \
1548+ t = _mm256_permute4x64_epi64( _mm256_packs_epi32( a, b ), (0<<0)+(2<<2)+(1<<4)+(3<<6) ); \
1549+ out = _mm256_castsi256_si128( _mm256_permute4x64_epi64( _mm256_packus_epi16( t, t ), (0<<0)+(2<<2)+(1<<4)+(3<<6) ) ); \
1550+ }
1551+
// Zero-extend the eight 16-bit values of the 128-bit `ireg` to eight 32-bit lanes.
// Note the definition itself already ends with a semicolon.
1552+ #define stbir__simdi8_expand_u16_to_u32(out,ireg) out = _mm256_unpacklo_epi16( _mm256_permute4x64_epi64(_mm256_castsi128_si256(ireg),(0<<0)+(2<<2)+(1<<4)+(3<<6)), _mm256_setzero_si256() );
1553+
// Clamp the sixteen floats in aa, bb to [0, STBIR_max_uint16_as_floatX], truncate, and
// narrow with unsigned saturation to sixteen 16-bit values in the 256-bit `out`.
1554+ #define stbir__simdf8_pack_to_16words(out,aa,bb) \
1555+ { \
1556+ stbir__simdf8 af,bf; \
1557+ stbir__simdi8 a,b; \
1558+ af = _mm256_min_ps( aa, STBIR_max_uint16_as_floatX ); \
1559+ bf = _mm256_min_ps( bb, STBIR_max_uint16_as_floatX ); \
1560+ af = _mm256_max_ps( af, _mm256_setzero_ps() ); \
1561+ bf = _mm256_max_ps( bf, _mm256_setzero_ps() ); \
1562+ a = _mm256_cvttps_epi32( af ); \
1563+ b = _mm256_cvttps_epi32( bf ); \
1564+ (out) = _mm256_permute4x64_epi64( _mm256_packus_epi32(a, b), (0<<0)+(2<<2)+(1<<4)+(3<<6) ); \
1565+ }
1566+
1567+ #else
1568+
// Non-AVX2 variants of the same four helpers: the integer unpack / pack work is done
// with 128-bit intrinsics on each half, and the halves are then combined into (or
// split out of) 256-bit registers.

// Zero-extend the 16 bytes of `ireg`: bytes 0-7 -> out0, bytes 8-15 -> out1.
1569+ #define stbir__simdi8_expand_u8_to_u32(out0,out1,ireg) \
1570+ { \
1571+ stbir__simdi a,zero = _mm_setzero_si128(); \
1572+ a = _mm_unpacklo_epi8( ireg, zero ); \
1573+ out0 = _mm256_setr_m128i( _mm_unpacklo_epi16( a, zero ), _mm_unpackhi_epi16( a, zero ) ); \
1574+ a = _mm_unpackhi_epi8( ireg, zero ); \
1575+ out1 = _mm256_setr_m128i( _mm_unpacklo_epi16( a, zero ), _mm_unpackhi_epi16( a, zero ) ); \
1576+ }
1577+
// Clamp, truncate and narrow sixteen floats (aa then bb) to sixteen bytes in the
// 128-bit `out`. Each input is packed to eight bytes separately; the final shuffle
// joins the low 64 bits of both results.
1578+ #define stbir__simdf8_pack_to_16bytes(out,aa,bb) \
1579+ { \
1580+ stbir__simdi t; \
1581+ stbir__simdf8 af,bf; \
1582+ stbir__simdi8 a,b; \
1583+ af = _mm256_min_ps( aa, STBIR_max_uint8_as_floatX ); \
1584+ bf = _mm256_min_ps( bb, STBIR_max_uint8_as_floatX ); \
1585+ af = _mm256_max_ps( af, _mm256_setzero_ps() ); \
1586+ bf = _mm256_max_ps( bf, _mm256_setzero_ps() ); \
1587+ a = _mm256_cvttps_epi32( af ); \
1588+ b = _mm256_cvttps_epi32( bf ); \
1589+ out = _mm_packs_epi32( _mm256_castsi256_si128(a), _mm256_extractf128_si256( a, 1 ) ); \
1590+ out = _mm_packus_epi16( out, out ); \
1591+ t = _mm_packs_epi32( _mm256_castsi256_si128(b), _mm256_extractf128_si256( b, 1 ) ); \
1592+ t = _mm_packus_epi16( t, t ); \
1593+ out = _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps(out), _mm_castsi128_ps(t), (0<<0)+(1<<2)+(0<<4)+(1<<6) ) ); \
1594+ }
1595+
// Zero-extend eight 16-bit values to eight 32-bit lanes (low four, then high four).
1596+ #define stbir__simdi8_expand_u16_to_u32(out,ireg) \
1597+ { \
1598+ stbir__simdi a,b,zero = _mm_setzero_si128(); \
1599+ a = _mm_unpacklo_epi16( ireg, zero ); \
1600+ b = _mm_unpackhi_epi16( ireg, zero ); \
1601+ out = _mm256_insertf128_si256( _mm256_castsi128_si256( a ), b, 1 ); \
1602+ }
1603+
// Clamp, truncate and narrow sixteen floats to sixteen unsigned 16-bit values:
// aa fills the low 128 bits of `out`, bb the high 128 bits.
1604+ #define stbir__simdf8_pack_to_16words(out,aa,bb) \
1605+ { \
1606+ stbir__simdi t0,t1; \
1607+ stbir__simdf8 af,bf; \
1608+ stbir__simdi8 a,b; \
1609+ af = _mm256_min_ps( aa, STBIR_max_uint16_as_floatX ); \
1610+ bf = _mm256_min_ps( bb, STBIR_max_uint16_as_floatX ); \
1611+ af = _mm256_max_ps( af, _mm256_setzero_ps() ); \
1612+ bf = _mm256_max_ps( bf, _mm256_setzero_ps() ); \
1613+ a = _mm256_cvttps_epi32( af ); \
1614+ b = _mm256_cvttps_epi32( bf ); \
1615+ t0 = _mm_packus_epi32( _mm256_castsi256_si128(a), _mm256_extractf128_si256( a, 1 ) ); \
1616+ t1 = _mm_packus_epi32( _mm256_castsi256_si128(b), _mm256_extractf128_si256( b, 1 ) ); \
1617+ out = _mm256_setr_m128i( t0, t1 ); \
1618+ }
1619+
1620+ #endif
1621+
// Index tables for _mm256_permutevar_ps, which picks lanes independently inside each
// 128-bit half. The "0123toXXXXXXXX" names therefore describe the result when both
// halves of `in` hold the same four values 0123 — presumably as produced by
// stbir__simdf8_load4b; confirm against callers.
1622+ static __m256i stbir_00001111 = { STBIR__CONST_4d_32i( 0, 0, 0, 0 ), STBIR__CONST_4d_32i( 1, 1, 1, 1 ) };
1623+ #define stbir__simdf8_0123to00001111( out, in ) (out) = _mm256_permutevar_ps ( in, stbir_00001111 )
1624+
1625+ static __m256i stbir_22223333 = { STBIR__CONST_4d_32i( 2, 2, 2, 2 ), STBIR__CONST_4d_32i( 3, 3, 3, 3 ) };
1626+ #define stbir__simdf8_0123to22223333( out, in ) (out) = _mm256_permutevar_ps ( in, stbir_22223333 )
1627+
// 4-wide result: lane 2 of the low half of `in`, replicated.
1628+ #define stbir__simdf8_0123to2222( out, in ) (out) = stbir__simdf_swiz(_mm256_castps256_ps128(in), 2,2,2,2 )
1629+
// Load four floats from `ptr` and replicate them into both 128-bit halves.
1630+ #define stbir__simdf8_load4b( out, ptr ) (out) = _mm256_broadcast_ps( (__m128 const *)(ptr) )
1631+
1632+ static __m256i stbir_00112233 = { STBIR__CONST_4d_32i( 0, 0, 1, 1 ), STBIR__CONST_4d_32i( 2, 2, 3, 3 ) };
1633+ #define stbir__simdf8_0123to00112233( out, in ) (out) = _mm256_permutevar_ps ( in, stbir_00112233 )
// Add the 4-wide `b` to a8 through a 128->256 cast; only the low four lanes of the
// result are meaningful because the cast does not define the upper half.
1634+ #define stbir__simdf8_add4( out, a8, b ) (out) = _mm256_add_ps( a8, _mm256_castps128_ps256( b ) )
1635+
// Masked load of six floats: the mask has the sign bit set in elements 0-5 and clear
// in elements 6-7, so the last two lanes come back as zero.
1636+ static __m256i stbir_load6 = { STBIR__CONST_4_32i( 0x80000000 ), STBIR__CONST_4d_32i( 0x80000000, 0x80000000, 0, 0 ) };
1637+ #define stbir__simdf8_load6z( out, ptr ) (out) = _mm256_maskload_ps( ptr, stbir_load6 )
1638+
// In-half permutes: _mm256_shuffle_ps with the same register for both sources applies
// one 4-lane permutation to each 128-bit half. The eight digits in each name are the
// source lane (within its own half) of each result lane.
1639+ #define stbir__simdf8_0123to00000000( out, in ) (out) = _mm256_shuffle_ps ( in, in, (0<<0)+(0<<2)+(0<<4)+(0<<6) )
1640+ #define stbir__simdf8_0123to11111111( out, in ) (out) = _mm256_shuffle_ps ( in, in, (1<<0)+(1<<2)+(1<<4)+(1<<6) )
1641+ #define stbir__simdf8_0123to22222222( out, in ) (out) = _mm256_shuffle_ps ( in, in, (2<<0)+(2<<2)+(2<<4)+(2<<6) )
1642+ #define stbir__simdf8_0123to33333333( out, in ) (out) = _mm256_shuffle_ps ( in, in, (3<<0)+(3<<2)+(3<<4)+(3<<6) )
1643+ #define stbir__simdf8_0123to21032103( out, in ) (out) = _mm256_shuffle_ps ( in, in, (2<<0)+(1<<2)+(0<<4)+(3<<6) )
1644+ #define stbir__simdf8_0123to32103210( out, in ) (out) = _mm256_shuffle_ps ( in, in, (3<<0)+(2<<2)+(1<<4)+(0<<6) )
1645+ #define stbir__simdf8_0123to12301230( out, in ) (out) = _mm256_shuffle_ps ( in, in, (1<<0)+(2<<2)+(3<<4)+(0<<6) )
1646+ #define stbir__simdf8_0123to10321032( out, in ) (out) = _mm256_shuffle_ps ( in, in, (1<<0)+(0<<2)+(3<<4)+(2<<6) )
1647+ #define stbir__simdf8_0123to30123012( out, in ) (out) = _mm256_shuffle_ps ( in, in, (3<<0)+(0<<2)+(1<<4)+(2<<6) )
1648+
1649+ #define stbir__simdf8_0123to11331133( out, in ) (out) = _mm256_shuffle_ps ( in, in, (1<<0)+(1<<2)+(3<<4)+(3<<6) )
1650+ #define stbir__simdf8_0123to00220022( out, in ) (out) = _mm256_shuffle_ps ( in, in, (0<<0)+(0<<2)+(2<<4)+(2<<6) )
1651+
// 8-wide alpha-pattern builders (same naming as the 4-wide ones, applied per half):
// first blend `ones` into the non-alpha positions, then shuffle the alpha into place.
// Each macro expands to TWO statements and uses `out` as its intermediate, so it is
// not safe as the body of an unbraced if/else.
1652+ #define stbir__simdf8_aaa1( out, alp, ones ) (out) = _mm256_blend_ps( alp, ones, (1<<0)+(1<<1)+(1<<2)+(0<<3)+(1<<4)+(1<<5)+(1<<6)+(0<<7)); (out)=_mm256_shuffle_ps( out,out, (3<<0) + (3<<2) + (3<<4) + (0<<6) )
1653+ #define stbir__simdf8_1aaa( out, alp, ones ) (out) = _mm256_blend_ps( alp, ones, (0<<0)+(1<<1)+(1<<2)+(1<<3)+(0<<4)+(1<<5)+(1<<6)+(1<<7)); (out)=_mm256_shuffle_ps( out,out, (1<<0) + (0<<2) + (0<<4) + (0<<6) )
1654+ #define stbir__simdf8_a1a1( out, alp, ones) (out) = _mm256_blend_ps( alp, ones, (1<<0)+(0<<1)+(1<<2)+(0<<3)+(1<<4)+(0<<5)+(1<<6)+(0<<7)); (out)=_mm256_shuffle_ps( out,out, (1<<0) + (0<<2) + (3<<4) + (2<<6) )
1655+ #define stbir__simdf8_1a1a( out, alp, ones) (out) = _mm256_blend_ps( alp, ones, (0<<0)+(1<<1)+(0<<2)+(1<<3)+(0<<4)+(1<<5)+(0<<6)+(1<<7)); (out)=_mm256_shuffle_ps( out,out, (1<<0) + (0<<2) + (3<<4) + (2<<6) )
1656+
// Set an 8-wide register to all zeros.
1657+ #define stbir__simdf8_zero( reg ) (reg) = _mm256_setzero_ps()
1658+
// 8-wide multiply-add: out = add + mul1 * mul2, fused when STBIR_USE_FMA is defined.
// madd_mem reads eight floats from `ptr`. madd_mem4 takes a 4-wide `mul` and four
// floats from `ptr`: the product is widened with a zero upper half, so only the low
// four lanes of `add` are changed.
1659+ #ifdef STBIR_USE_FMA // not on by default to maintain bit identical simd to non-simd
1660+ #define stbir__simdf8_madd( out, add, mul1, mul2 ) (out) = _mm256_fmadd_ps( mul1, mul2, add )
1661+ #define stbir__simdf8_madd_mem( out, add, mul, ptr ) (out) = _mm256_fmadd_ps( mul, _mm256_loadu_ps( (float const*)(ptr) ), add )
1662+ #define stbir__simdf8_madd_mem4( out, add, mul, ptr )(out) = _mm256_fmadd_ps( _mm256_setr_m128( mul, _mm_setzero_ps() ), _mm256_setr_m128( _mm_loadu_ps( (float const*)(ptr) ), _mm_setzero_ps() ), add )
1663+ #else
1664+ #define stbir__simdf8_madd( out, add, mul1, mul2 ) (out) = _mm256_add_ps( add, _mm256_mul_ps( mul1, mul2 ) )
1665+ #define stbir__simdf8_madd_mem( out, add, mul, ptr ) (out) = _mm256_add_ps( add, _mm256_mul_ps( mul, _mm256_loadu_ps( (float const*)(ptr) ) ) )
1666+ #define stbir__simdf8_madd_mem4( out, add, mul, ptr ) (out) = _mm256_add_ps( add, _mm256_setr_m128( _mm_mul_ps( mul, _mm_loadu_ps( (float const*)(ptr) ) ), _mm_setzero_ps() ) )
1667+ #endif
// Take the low 128 bits of an 8-wide value as a 4-wide value.
1668+ #define stbir__if_simdf8_cast_to_simdf4( val ) _mm256_castps256_ps128( val )
1669+
1670+ #endif
1671+
// Replace any earlier STBIR_FLOORF definition with the SSE implementation below.
1672+ #ifdef STBIR_FLOORF
1673+ #undef STBIR_FLOORF
1674+ #endif
1675+ #define STBIR_FLOORF stbir_simd_floorf
// Scalar floor computed in an SSE register.
//  - SSE4.1 / AVX builds: uses the dedicated floor instruction.
//  - Otherwise: truncates toward zero via an int32 round-trip, then adds -1 when the
//    input is below the truncated value (i.e. negative with a fractional part).
// NOTE(review): the fallback goes through _mm_cvttps_epi32, so inputs whose magnitude
// does not fit in int32 will not round-trip — presumably callers only pass in-range
// coordinates; confirm.
// NOTE(review): the first branch is also selected by STBIR_SSE41 alone — confirm the
// header declaring _mm_floor_ss is included in that configuration.
1676+ static stbir__inline float stbir_simd_floorf(float x) // martins floorf
1677+ {
1678+ #if defined(STBIR_AVX) || defined(__SSE4_1__) || defined(STBIR_SSE41)
1679+ __m128 t = _mm_set_ss(x);
1680+ return _mm_cvtss_f32( _mm_floor_ss(t, t) );
1681+ #else
1682+ __m128 f = _mm_set_ss(x);
1683+ __m128 t = _mm_cvtepi32_ps(_mm_cvttps_epi32(f));
1684+ __m128 r = _mm_add_ss(t, _mm_and_ps(_mm_cmplt_ss(f, t), _mm_set_ss(-1.0f)));
1685+ return _mm_cvtss_f32(r);
1686+ #endif
1687+ }
1688+
// Replace any earlier STBIR_CEILF definition with the SSE implementation below.
1689+ #ifdef STBIR_CEILF
1690+ #undef STBIR_CEILF
1691+ #endif
1692+ #define STBIR_CEILF stbir_simd_ceilf
// Scalar ceil computed in an SSE register.
//  - SSE4.1 / AVX builds: uses the dedicated ceil instruction.
//  - Otherwise: truncates toward zero via an int32 round-trip, then adds +1 when the
//    truncated value is below the input (i.e. positive with a fractional part).
// NOTE(review): the fallback goes through _mm_cvttps_epi32, so inputs whose magnitude
// does not fit in int32 will not round-trip — presumably callers only pass in-range
// coordinates; confirm.
// NOTE(review): the first branch is also selected by STBIR_SSE41 alone — confirm the
// header declaring _mm_ceil_ss is included in that configuration.
1693+ static stbir__inline float stbir_simd_ceilf(float x) // martins ceilf
1694+ {
1695+ #if defined(STBIR_AVX) || defined(__SSE4_1__) || defined(STBIR_SSE41)
1696+ __m128 t = _mm_set_ss(x);
1697+ return _mm_cvtss_f32( _mm_ceil_ss(t, t) );
1698+ #else
1699+ __m128 f = _mm_set_ss(x);
1700+ __m128 t = _mm_cvtepi32_ps(_mm_cvttps_epi32(f));
1701+ __m128 r = _mm_add_ss(t, _mm_and_ps(_mm_cmplt_ss(t, f), _mm_set_ss(1.0f)));
1702+ return _mm_cvtss_f32(r);
1703+ #endif
1704+ }
1705+
1706+#elif defined(STBIR_NEON)
1707+
1708+ #include <arm_neon.h>
1709+
// NEON implementation of the same 4-wide abstraction. The float register is
// float32x4_t and the integer register is uint32x4_t; the casti/castf macros
// reinterpret bits between the two without converting values.
1710+ #define stbir__simdf float32x4_t
1711+ #define stbir__simdi uint32x4_t
1712+
1713+ #define stbir_simdi_castf( reg ) vreinterpretq_u32_f32(reg)
1714+ #define stbir_simdf_casti( reg ) vreinterpretq_f32_u32(reg)
1715+
// Loads. load1 / simdi_load1 replicate the loaded element into all four lanes;
// load1z puts it in lane 0 of a zero vector. load2 and load2z are identical here:
// both fill the high two lanes with zero. load2hmerge keeps the low half of `reg`
// and loads two floats into the high half.
1716+ #define stbir__simdf_load( reg, ptr ) (reg) = vld1q_f32( (float const*)(ptr) )
1717+ #define stbir__simdi_load( reg, ptr ) (reg) = vld1q_u32( (uint32_t const*)(ptr) )
1718+ #define stbir__simdf_load1( out, ptr ) (out) = vld1q_dup_f32( (float const*)(ptr) ) // top values can be random (not denormal or nan for perf)
1719+ #define stbir__simdi_load1( out, ptr ) (out) = vld1q_dup_u32( (uint32_t const*)(ptr) )
1720+ #define stbir__simdf_load1z( out, ptr ) (out) = vld1q_lane_f32( (float const*)(ptr), vdupq_n_f32(0), 0 ) // top values must be zero
1721+ #define stbir__simdf_frep4( fvar ) vdupq_n_f32( fvar )
1722+ #define stbir__simdf_load1frep4( out, fvar ) (out) = vdupq_n_f32( fvar )
1723+ #define stbir__simdf_load2( out, ptr ) (out) = vcombine_f32( vld1_f32( (float const*)(ptr) ), vcreate_f32(0) ) // top values can be random (not denormal or nan for perf)
1724+ #define stbir__simdf_load2z( out, ptr ) (out) = vcombine_f32( vld1_f32( (float const*)(ptr) ), vcreate_f32(0) ) // top values must be zero
1725+ #define stbir__simdf_load2hmerge( out, reg, ptr ) (out) = vcombine_f32( vget_low_f32(reg), vld1_f32( (float const*)(ptr) ) )
1726+
// All-zero float vector: expression form and assigning form.
1727+ #define stbir__simdf_zeroP() vdupq_n_f32(0)
1728+ #define stbir__simdf_zero( reg ) (reg) = vdupq_n_f32(0)
1729+
// Float stores: full vector, lane 0, low two lanes, high two lanes.
1730+ #define stbir__simdf_store( ptr, reg ) vst1q_f32( (float*)(ptr), reg )
1731+ #define stbir__simdf_store1( ptr, reg ) vst1q_lane_f32( (float*)(ptr), reg, 0)
1732+ #define stbir__simdf_store2( ptr, reg ) vst1_f32( (float*)(ptr), vget_low_f32(reg) )
1733+ #define stbir__simdf_store2h( ptr, reg ) vst1_f32( (float*)(ptr), vget_high_f32(reg) )
1734+
// Integer stores: full vector, lane 0 (32 bits), low two lanes (64 bits).
1735+ #define stbir__simdi_store( ptr, reg ) vst1q_u32( (uint32_t*)(ptr), reg )
1736+ #define stbir__simdi_store1( ptr, reg ) vst1q_lane_u32( (uint32_t*)(ptr), reg, 0 )
1737+ #define stbir__simdi_store2( ptr, reg ) vst1_u32( (uint32_t*)(ptr), vget_low_u32(reg) )
1738+
// Prefetch expands to nothing in the NEON branch.
1739+ #define stbir__prefetch( ptr )
1740+
// Zero-extend the 16 bytes of `ireg` to 32-bit lanes using widening moves:
// bytes 0-3 -> out0, 4-7 -> out1, 8-11 -> out2, 12-15 -> out3.
1741+ #define stbir__simdi_expand_u8_to_u32(out0,out1,out2,out3,ireg) \
1742+ { \
1743+ uint16x8_t l = vmovl_u8( vget_low_u8 ( vreinterpretq_u8_u32(ireg) ) ); \
1744+ uint16x8_t h = vmovl_u8( vget_high_u8( vreinterpretq_u8_u32(ireg) ) ); \
1745+ out0 = vmovl_u16( vget_low_u16 ( l ) ); \
1746+ out1 = vmovl_u16( vget_high_u16( l ) ); \
1747+ out2 = vmovl_u16( vget_low_u16 ( h ) ); \
1748+ out3 = vmovl_u16( vget_high_u16( h ) ); \
1749+ }
1750+
// Zero-extend only the low four bytes of `ireg` into the four 32-bit lanes of `out`.
1751+ #define stbir__simdi_expand_u8_to_1u32(out,ireg) \
1752+ { \
1753+ uint16x8_t tmp = vmovl_u8( vget_low_u8( vreinterpretq_u8_u32(ireg) ) ); \
1754+ out = vmovl_u16( vget_low_u16( tmp ) ); \
1755+ }
1756+
// Zero-extend eight 16-bit values: values 0-3 -> out0, values 4-7 -> out1.
1757+ #define stbir__simdi_expand_u16_to_u32(out0,out1,ireg) \
1758+ { \
1759+ uint16x8_t tmp = vreinterpretq_u16_u32(ireg); \
1760+ out0 = vmovl_u16( vget_low_u16 ( tmp ) ); \
1761+ out1 = vmovl_u16( vget_high_u16( tmp ) ); \
1762+ }
1763+
// Float -> integer conversions via vcvtq_s32_f32. _to_i32 converts all four lanes and
// reinterprets the signed result as the unsigned register type; _to_int returns lane 0.
// The uint8 / short forms clamp to [0, max] first and return lane 0 narrowed by a cast.
1764+ #define stbir__simdf_convert_float_to_i32( i, f ) (i) = vreinterpretq_u32_s32( vcvtq_s32_f32(f) )
1765+ #define stbir__simdf_convert_float_to_int( f ) vgetq_lane_s32(vcvtq_s32_f32(f), 0)
1766+ #define stbir__simdi_to_int( i ) (int)vgetq_lane_u32(i, 0)
1767+ #define stbir__simdf_convert_float_to_uint8( f ) ((unsigned char)vgetq_lane_s32(vcvtq_s32_f32(vmaxq_f32(vminq_f32(f,STBIR__CONSTF(STBIR_max_uint8_as_float)),vdupq_n_f32(0))), 0))
1768+ #define stbir__simdf_convert_float_to_short( f ) ((unsigned short)vgetq_lane_s32(vcvtq_s32_f32(vmaxq_f32(vminq_f32(f,STBIR__CONSTF(STBIR_max_uint16_as_float)),vdupq_n_f32(0))), 0))
// Basic arithmetic. The "1" *_mem forms replicate the loaded scalar and operate on all
// four lanes here, so (unlike the SSE branch) the upper lanes are also modified.
1769+ #define stbir__simdi_convert_i32_to_float(out, ireg) (out) = vcvtq_f32_s32( vreinterpretq_s32_u32(ireg) )
1770+ #define stbir__simdf_add( out, reg0, reg1 ) (out) = vaddq_f32( reg0, reg1 )
1771+ #define stbir__simdf_mult( out, reg0, reg1 ) (out) = vmulq_f32( reg0, reg1 )
1772+ #define stbir__simdf_mult_mem( out, reg, ptr ) (out) = vmulq_f32( reg, vld1q_f32( (float const*)(ptr) ) )
1773+ #define stbir__simdf_mult1_mem( out, reg, ptr ) (out) = vmulq_f32( reg, vld1q_dup_f32( (float const*)(ptr) ) )
1774+ #define stbir__simdf_add_mem( out, reg, ptr ) (out) = vaddq_f32( reg, vld1q_f32( (float const*)(ptr) ) )
1775+ #define stbir__simdf_add1_mem( out, reg, ptr ) (out) = vaddq_f32( reg, vld1q_dup_f32( (float const*)(ptr) ) )
1776+
// Multiply-add: out = add + mul1 * mul2, fused (vfmaq_f32) when STBIR_USE_FMA is
// defined, otherwise a separate multiply and add. The "1" forms are full-width
// operations here; madd1 is textually identical to madd, and madd1_mem replicates the
// loaded scalar across all lanes.
1777+ #ifdef STBIR_USE_FMA // not on by default to maintain bit identical simd to non-simd (and also x64 no madd to arm madd)
1778+ #define stbir__simdf_madd( out, add, mul1, mul2 ) (out) = vfmaq_f32( add, mul1, mul2 )
1779+ #define stbir__simdf_madd1( out, add, mul1, mul2 ) (out) = vfmaq_f32( add, mul1, mul2 )
1780+ #define stbir__simdf_madd_mem( out, add, mul, ptr ) (out) = vfmaq_f32( add, mul, vld1q_f32( (float const*)(ptr) ) )
1781+ #define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = vfmaq_f32( add, mul, vld1q_dup_f32( (float const*)(ptr) ) )
1782+ #else
1783+ #define stbir__simdf_madd( out, add, mul1, mul2 ) (out) = vaddq_f32( add, vmulq_f32( mul1, mul2 ) )
1784+ #define stbir__simdf_madd1( out, add, mul1, mul2 ) (out) = vaddq_f32( add, vmulq_f32( mul1, mul2 ) )
1785+ #define stbir__simdf_madd_mem( out, add, mul, ptr ) (out) = vaddq_f32( add, vmulq_f32( mul, vld1q_f32( (float const*)(ptr) ) ) )
1786+ #define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = vaddq_f32( add, vmulq_f32( mul, vld1q_dup_f32( (float const*)(ptr) ) ) )
1787+ #endif
1788+
// "1" add / multiply: implemented as full-width operations in the NEON branch.
1789+ #define stbir__simdf_add1( out, reg0, reg1 ) (out) = vaddq_f32( reg0, reg1 )
1790+ #define stbir__simdf_mult1( out, reg0, reg1 ) (out) = vmulq_f32( reg0, reg1 )
1791+
// Bitwise AND / OR on float registers, done on the reinterpreted integer bits.
1792+ #define stbir__simdf_and( out, reg0, reg1 ) (out) = vreinterpretq_f32_u32( vandq_u32( vreinterpretq_u32_f32(reg0), vreinterpretq_u32_f32(reg1) ) )
1793+ #define stbir__simdf_or( out, reg0, reg1 ) (out) = vreinterpretq_f32_u32( vorrq_u32( vreinterpretq_u32_f32(reg0), vreinterpretq_u32_f32(reg1) ) )
1794+
// Per-lane min / max; the "1" forms are the same full-width operations.
1795+ #define stbir__simdf_min( out, reg0, reg1 ) (out) = vminq_f32( reg0, reg1 )
1796+ #define stbir__simdf_max( out, reg0, reg1 ) (out) = vmaxq_f32( reg0, reg1 )
1797+ #define stbir__simdf_min1( out, reg0, reg1 ) (out) = vminq_f32( reg0, reg1 )
1798+ #define stbir__simdf_max1( out, reg0, reg1 ) (out) = vmaxq_f32( reg0, reg1 )
1799+
// Cross-register extracts (reg0 = lanes 0123, reg1 = lanes ABCD):
// 3ABx yields [3,A,B,C]; 23Ax yields [2,3,A,B].
1800+ #define stbir__simdf_0123ABCDto3ABx( out, reg0, reg1 ) (out) = vextq_f32( reg0, reg1, 3 )
1801+ #define stbir__simdf_0123ABCDto23Ax( out, reg0, reg1 ) (out) = vextq_f32( reg0, reg1, 2 )
1802+
// Alpha patterns: a1a1 = [alp1, ones0, alp3, ones1]; 1a1a = [ones0, alp0, ones1, alp2].
// The odd / even lanes of `alp` are gathered with vuzpq and interleaved with `ones`.
1803+ #define stbir__simdf_a1a1( out, alp, ones ) (out) = vzipq_f32(vuzpq_f32(alp, alp).val[1], ones).val[0]
1804+ #define stbir__simdf_1a1a( out, alp, ones ) (out) = vzipq_f32(ones, vuzpq_f32(alp, alp).val[0]).val[0]
1805+
1806+ #if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ )
1807+
// 64-bit ARM variants.
// aaa1: replicate alp lane 3, then copy lane 3 of `ones` into lane 3.
// 1aaa: replicate alp lane 0, then copy lane 0 of `ones` into lane 0.
1808+ #define stbir__simdf_aaa1( out, alp, ones ) (out) = vcopyq_laneq_f32(vdupq_n_f32(vgetq_lane_f32(alp, 3)), 3, ones, 3)
1809+ #define stbir__simdf_1aaa( out, alp, ones ) (out) = vcopyq_laneq_f32(vdupq_n_f32(vgetq_lane_f32(alp, 0)), 0, ones, 0)
1810+
// stbir_make16(a,b,c,d) builds a 16-byte table-lookup index vector in which each
// 32-bit lane index n expands to byte indices 4n..4n+3. stbir_make16x2 bundles two
// float registers as a 32-byte lookup table. MSVC (non-clang) builds them with
// vcreate / an inline function; other compilers use compound literals.
// The lane indices are pasted unparenthesized, so they must be plain constants.
1811+ #if defined( _MSC_VER ) && !defined(__clang__)
1812+ #define stbir_make16(a,b,c,d) vcombine_u8( \
1813+ vcreate_u8( (4*a+0) | ((4*a+1)<<8) | ((4*a+2)<<16) | ((4*a+3)<<24) | \
1814+ ((stbir_uint64)(4*b+0)<<32) | ((stbir_uint64)(4*b+1)<<40) | ((stbir_uint64)(4*b+2)<<48) | ((stbir_uint64)(4*b+3)<<56)), \
1815+ vcreate_u8( (4*c+0) | ((4*c+1)<<8) | ((4*c+2)<<16) | ((4*c+3)<<24) | \
1816+ ((stbir_uint64)(4*d+0)<<32) | ((stbir_uint64)(4*d+1)<<40) | ((stbir_uint64)(4*d+2)<<48) | ((stbir_uint64)(4*d+3)<<56) ) )
1817+
1818+ static stbir__inline uint8x16x2_t stbir_make16x2(float32x4_t rega,float32x4_t regb)
1819+ {
1820+ uint8x16x2_t r = { vreinterpretq_u8_f32(rega), vreinterpretq_u8_f32(regb) };
1821+ return r;
1822+ }
1823+ #else
1824+ #define stbir_make16(a,b,c,d) (uint8x16_t){4*a+0,4*a+1,4*a+2,4*a+3,4*b+0,4*b+1,4*b+2,4*b+3,4*c+0,4*c+1,4*c+2,4*c+3,4*d+0,4*d+1,4*d+2,4*d+3}
1825+ #define stbir_make16x2(a,b) (uint8x16x2_t){{vreinterpretq_u8_f32(a),vreinterpretq_u8_f32(b)}}
1826+ #endif
1827+
// swiz: permute the lanes of one register by byte table lookup.
// swiz2: same, selecting from two registers (indices 0-3 -> rega, 4-7 -> regb).
1828+ #define stbir__simdf_swiz( reg, one, two, three, four ) vreinterpretq_f32_u8( vqtbl1q_u8( vreinterpretq_u8_f32(reg), stbir_make16(one, two, three, four) ) )
1829+ #define stbir__simdf_swiz2( rega, regb, one, two, three, four ) vreinterpretq_f32_u8( vqtbl2q_u8( stbir_make16x2(rega,regb), stbir_make16(one, two, three, four) ) )
1830+
// 64-bit ARM 16madd: multiply signed 16-bit elements of reg0 and reg1 into 32-bit
// products (low and high halves separately), then pairwise-add adjacent products so
// each output lane holds the sum of one pair.
1831+ #define stbir__simdi_16madd( out, reg0, reg1 ) \
1832+ { \
1833+ int16x8_t r0 = vreinterpretq_s16_u32(reg0); \
1834+ int16x8_t r1 = vreinterpretq_s16_u32(reg1); \
1835+ int32x4_t tmp0 = vmull_s16( vget_low_s16(r0), vget_low_s16(r1) ); \
1836+ int32x4_t tmp1 = vmull_s16( vget_high_s16(r0), vget_high_s16(r1) ); \
1837+ (out) = vreinterpretq_u32_s32( vpaddq_s32(tmp0, tmp1) ); \
1838+ }
1839+
1840+ #else
1841+
// 32-bit ARM variants.
// aaa1 / 1aaa replicate the alpha lane and then set lane 3 / lane 0 to the literal
// 1.0f; the `ones` argument is not used in this branch.
1842+ #define stbir__simdf_aaa1( out, alp, ones ) (out) = vsetq_lane_f32(1.0f, vdupq_n_f32(vgetq_lane_f32(alp, 3)), 3)
1843+ #define stbir__simdf_1aaa( out, alp, ones ) (out) = vsetq_lane_f32(1.0f, vdupq_n_f32(vgetq_lane_f32(alp, 0)), 0)
1844+
// stbir_make8x2 splits a float register into its low / high 8-byte halves as a 16-byte
// lookup table; stbir_make8(a,b) builds the 8 byte indices for two 32-bit lanes
// (lane n -> bytes 4n..4n+3). MSVC (non-clang) uses an inline function and vcreate;
// other compilers use compound literals.
1845+ #if defined( _MSC_VER ) && !defined(__clang__)
1846+ static stbir__inline uint8x8x2_t stbir_make8x2(float32x4_t reg)
1847+ {
1848+ uint8x8x2_t r = { { vget_low_u8(vreinterpretq_u8_f32(reg)), vget_high_u8(vreinterpretq_u8_f32(reg)) } };
1849+ return r;
1850+ }
1851+ #define stbir_make8(a,b) vcreate_u8( \
1852+ (4*a+0) | ((4*a+1)<<8) | ((4*a+2)<<16) | ((4*a+3)<<24) | \
1853+ ((stbir_uint64)(4*b+0)<<32) | ((stbir_uint64)(4*b+1)<<40) | ((stbir_uint64)(4*b+2)<<48) | ((stbir_uint64)(4*b+3)<<56) )
1854+ #else
1855+ #define stbir_make8x2(reg) (uint8x8x2_t){ { vget_low_u8(vreinterpretq_u8_f32(reg)), vget_high_u8(vreinterpretq_u8_f32(reg)) } }
1856+ #define stbir_make8(a,b) (uint8x8_t){4*a+0,4*a+1,4*a+2,4*a+3,4*b+0,4*b+1,4*b+2,4*b+3}
1857+ #endif
1858+
// swiz: permute the lanes of `reg` with two 8-byte table lookups (result lanes 0-1 and
// lanes 2-3), then join the halves. `reg` is expanded twice.
1859+ #define stbir__simdf_swiz( reg, one, two, three, four ) vreinterpretq_f32_u8( vcombine_u8( \
1860+ vtbl2_u8( stbir_make8x2( reg ), stbir_make8( one, two ) ), \
1861+ vtbl2_u8( stbir_make8x2( reg ), stbir_make8( three, four ) ) ) )
1862+
// 32-bit ARM 16madd: same result as the 64-bit version, but the pairwise add is done
// on 64-bit halves (vpadd_s32) and the two halves are then recombined.
1863+ #define stbir__simdi_16madd( out, reg0, reg1 ) \
1864+ { \
1865+ int16x8_t r0 = vreinterpretq_s16_u32(reg0); \
1866+ int16x8_t r1 = vreinterpretq_s16_u32(reg1); \
1867+ int32x4_t tmp0 = vmull_s16( vget_low_s16(r0), vget_low_s16(r1) ); \
1868+ int32x4_t tmp1 = vmull_s16( vget_high_s16(r0), vget_high_s16(r1) ); \
1869+ int32x2_t out0 = vpadd_s32( vget_low_s32(tmp0), vget_high_s32(tmp0) ); \
1870+ int32x2_t out1 = vpadd_s32( vget_low_s32(tmp1), vget_high_s32(tmp1) ); \
1871+ (out) = vreinterpretq_u32_s32( vcombine_s32(out0, out1) ); \
1872+ }
1873+
1874+ #endif
1875+
// Bitwise AND / OR of two 128-bit integer registers, treated as four 32-bit lanes.
1876+ #define stbir__simdi_and( out, reg0, reg1 ) (out) = vandq_u32( reg0, reg1 )
1877+ #define stbir__simdi_or( out, reg0, reg1 ) (out) = vorrq_u32( reg0, reg1 )
1878+
// Packs eight floats (four in aa, four in bb) into eight unsigned bytes.
// Each input is clamped to [0, STBIR_max_uint8_as_float], converted to int32, then narrowed
// with saturation to int16 and finally to uint8. The resulting 8 bytes (aa first, then bb)
// are duplicated into both halves of `out`.
// The expansion declares af, bf, ai, bi and out8 inside a braced block.
1879+ #define stbir__simdf_pack_to_8bytes(out,aa,bb) \
1880+ { \
1881+ float32x4_t af = vmaxq_f32( vminq_f32(aa,STBIR__CONSTF(STBIR_max_uint8_as_float) ), vdupq_n_f32(0) ); \
1882+ float32x4_t bf = vmaxq_f32( vminq_f32(bb,STBIR__CONSTF(STBIR_max_uint8_as_float) ), vdupq_n_f32(0) ); \
1883+ int16x4_t ai = vqmovn_s32( vcvtq_s32_f32( af ) ); \
1884+ int16x4_t bi = vqmovn_s32( vcvtq_s32_f32( bf ) ); \
1885+ uint8x8_t out8 = vqmovun_s16( vcombine_s16(ai, bi) ); \
1886+ out = vreinterpretq_u32_u8( vcombine_u8(out8, out8) ); \
1887+ }
1888+
// Packs eight floats (four in aa, four in bb) into eight unsigned 16-bit values.
// Each input is clamped to [0, STBIR_max_uint16_as_float], converted to int32, and narrowed
// to uint16 with unsigned saturation; aa fills the low four elements of `out`, bb the high four.
// The expansion declares af, bf, ai and bi inside a braced block.
1889+ #define stbir__simdf_pack_to_8words(out,aa,bb) \
1890+ { \
1891+ float32x4_t af = vmaxq_f32( vminq_f32(aa,STBIR__CONSTF(STBIR_max_uint16_as_float) ), vdupq_n_f32(0) ); \
1892+ float32x4_t bf = vmaxq_f32( vminq_f32(bb,STBIR__CONSTF(STBIR_max_uint16_as_float) ), vdupq_n_f32(0) ); \
1893+ int32x4_t ai = vcvtq_s32_f32( af ); \
1894+ int32x4_t bi = vcvtq_s32_f32( bf ); \
1895+ out = vreinterpretq_u32_u16( vcombine_u16(vqmovun_s32(ai), vqmovun_s32(bi)) ); \
1896+ }
1897+
// Narrows four registers of 32-bit integers to bytes (with saturation) and stores 16 bytes
// at `ptr` in interleaved order: r0[0], r1[0], r2[0], r3[0], r0[1], r1[1], r2[1], r3[1], ...
// This is achieved by zipping r0 with r2 and r1 with r3 at 16-bit width, narrowing each
// zipped pair to 8 bytes, and letting vst2_u8 interleave the two 8-byte vectors on store.
// The expansion declares tmp0, tmp1 and out inside a braced block.
1898+ #define stbir__interleave_pack_and_store_16_u8( ptr, r0, r1, r2, r3 ) \
1899+ { \
1900+ int16x4x2_t tmp0 = vzip_s16( vqmovn_s32(vreinterpretq_s32_u32(r0)), vqmovn_s32(vreinterpretq_s32_u32(r2)) ); \
1901+ int16x4x2_t tmp1 = vzip_s16( vqmovn_s32(vreinterpretq_s32_u32(r1)), vqmovn_s32(vreinterpretq_s32_u32(r3)) ); \
1902+ uint8x8x2_t out = \
1903+ { { \
1904+ vqmovun_s16( vcombine_s16(tmp0.val[0], tmp0.val[1]) ), \
1905+ vqmovun_s16( vcombine_s16(tmp1.val[0], tmp1.val[1]) ), \
1906+ } }; \
1907+ vst2_u8(ptr, out); \
1908+ }
1909+
// Loads 16 consecutive floats from `ptr` with a de-interleaving load (vld4q_f32), which
// yields the 4x4 transpose: o0 receives elements 0,4,8,12; o1 receives 1,5,9,13;
// o2 receives 2,6,10,14; o3 receives 3,7,11,15.
1910+ #define stbir__simdf_load4_transposed( o0, o1, o2, o3, ptr ) \
1911+ { \
1912+ float32x4x4_t tmp = vld4q_f32(ptr); \
1913+ o0 = tmp.val[0]; \
1914+ o1 = tmp.val[1]; \
1915+ o2 = tmp.val[2]; \
1916+ o3 = tmp.val[3]; \
1917+ }
1918+
// Unsigned (logical) right shift of each 32-bit lane by the compile-time immediate `imm`.
1919+ #define stbir__simdi_32shr( out, reg, imm ) out = vshrq_n_u32( reg, imm )
1920+
// Declaration and access macros for 4-lane SIMD constants (every lane set to x).
// MSVC (non-clang): the constant is declared as a 4-element scalar array with
//   __declspec(align(8)), and STBIR__CONSTF / STBIR__CONSTI read it back by casting the
//   array to a vector pointer and dereferencing.
// Other compilers: the constant is declared directly as a vector-typed variable with a
//   brace initializer, and the accessors are the identity.
// Code that reads such a constant should go through STBIR__CONSTF / STBIR__CONSTI so it
// works with both representations.
1921+ #if defined( _MSC_VER ) && !defined(__clang__)
1922+ #define STBIR__SIMDF_CONST(var, x) __declspec(align(8)) float var[] = { x, x, x, x }
1923+ #define STBIR__SIMDI_CONST(var, x) __declspec(align(8)) uint32_t var[] = { x, x, x, x }
1924+ #define STBIR__CONSTF(var) (*(const float32x4_t*)var)
1925+ #define STBIR__CONSTI(var) (*(const uint32x4_t*)var)
1926+ #else
1927+ #define STBIR__SIMDF_CONST(var, x) stbir__simdf var = { x, x, x, x }
1928+ #define STBIR__SIMDI_CONST(var, x) stbir__simdi var = { x, x, x, x }
1929+ #define STBIR__CONSTF(var) (var)
1930+ #define STBIR__CONSTI(var) (var)
1931+ #endif
1932+
// Replaces any previous STBIR_FLOORF with a NEON-based floor of a single float.
// 64-bit ARM: uses vrndm_f32 directly and returns lane 0.
// Otherwise: truncates x by converting to int32 and back (t), then adds -1.0f when x < t.
//   The compare yields an all-ones/all-zeros mask; ANDing it with the bit pattern of -1.0f
//   gives either -1.0f or 0.0f as the correction term.
// NOTE(review): the fallback passes through a 32-bit integer conversion, so inputs outside
// the int32 range presumably do not round-trip - confirm callers stay within range.
1933+ #ifdef STBIR_FLOORF
1934+ #undef STBIR_FLOORF
1935+ #endif
1936+ #define STBIR_FLOORF stbir_simd_floorf
1937+ static stbir__inline float stbir_simd_floorf(float x)
1938+ {
1939+ #if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ )
1940+ return vget_lane_f32( vrndm_f32( vdup_n_f32(x) ), 0);
1941+ #else
1942+ float32x2_t f = vdup_n_f32(x);
1943+ float32x2_t t = vcvt_f32_s32(vcvt_s32_f32(f));
1944+ uint32x2_t a = vclt_f32(f, t);
1945+ uint32x2_t b = vreinterpret_u32_f32(vdup_n_f32(-1.0f));
1946+ float32x2_t r = vadd_f32(t, vreinterpret_f32_u32(vand_u32(a, b)));
1947+ return vget_lane_f32(r, 0);
1948+ #endif
1949+ }
1950+
// Replaces any previous STBIR_CEILF with a NEON-based ceiling of a single float.
// 64-bit ARM: uses vrndp_f32 directly and returns lane 0.
// Otherwise: truncates x by converting to int32 and back (t), then adds +1.0f when t < x.
//   The compare yields an all-ones/all-zeros mask; ANDing it with the bit pattern of 1.0f
//   gives either 1.0f or 0.0f as the correction term.
// NOTE(review): the fallback passes through a 32-bit integer conversion, so inputs outside
// the int32 range presumably do not round-trip - confirm callers stay within range.
1951+ #ifdef STBIR_CEILF
1952+ #undef STBIR_CEILF
1953+ #endif
1954+ #define STBIR_CEILF stbir_simd_ceilf
1955+ static stbir__inline float stbir_simd_ceilf(float x)
1956+ {
1957+ #if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ )
1958+ return vget_lane_f32( vrndp_f32( vdup_n_f32(x) ), 0);
1959+ #else
1960+ float32x2_t f = vdup_n_f32(x);
1961+ float32x2_t t = vcvt_f32_s32(vcvt_s32_f32(f));
1962+ uint32x2_t a = vclt_f32(t, f);
1963+ uint32x2_t b = vreinterpret_u32_f32(vdup_n_f32(1.0f));
1964+ float32x2_t r = vadd_f32(t, vreinterpret_f32_u32(vand_u32(a, b)));
1965+ return vget_lane_f32(r, 0);
1966+ #endif
1967+ }
1968+
1969+ #define STBIR_SIMD
1970+
1971+#elif defined(STBIR_WASM)
1972+
1973+ #include <wasm_simd128.h>
1974+
// WASM SIMD128 backend: register types plus load/store primitives.
// Both the float and the integer register type are v128_t, so the casts between them are
// the identity.
// Loads: full 128-bit, 32-bit splat (load1), 32-bit with zeroed upper lanes (load1z),
//   64-bit splat (load2), 64-bit with zeroed upper half (load2z), and a 64-bit load into
//   the upper half of an existing register (load2hmerge, lane index 1).
// Stores: full 128-bit, the low 32 bits (store1), the low 64 bits (store2), and the high
//   64 bits (store2h).
// stbir__prefetch expands to nothing on this backend.
1975+ #define stbir__simdf v128_t
1976+ #define stbir__simdi v128_t
1977+
1978+ #define stbir_simdi_castf( reg ) (reg)
1979+ #define stbir_simdf_casti( reg ) (reg)
1980+
1981+ #define stbir__simdf_load( reg, ptr ) (reg) = wasm_v128_load( (void const*)(ptr) )
1982+ #define stbir__simdi_load( reg, ptr ) (reg) = wasm_v128_load( (void const*)(ptr) )
1983+ #define stbir__simdf_load1( out, ptr ) (out) = wasm_v128_load32_splat( (void const*)(ptr) ) // top values can be random (not denormal or nan for perf)
1984+ #define stbir__simdi_load1( out, ptr ) (out) = wasm_v128_load32_splat( (void const*)(ptr) )
1985+ #define stbir__simdf_load1z( out, ptr ) (out) = wasm_v128_load32_zero( (void const*)(ptr) ) // top values must be zero
1986+ #define stbir__simdf_frep4( fvar ) wasm_f32x4_splat( fvar )
1987+ #define stbir__simdf_load1frep4( out, fvar ) (out) = wasm_f32x4_splat( fvar )
1988+ #define stbir__simdf_load2( out, ptr ) (out) = wasm_v128_load64_splat( (void const*)(ptr) ) // top values can be random (not denormal or nan for perf)
1989+ #define stbir__simdf_load2z( out, ptr ) (out) = wasm_v128_load64_zero( (void const*)(ptr) ) // top values must be zero
1990+ #define stbir__simdf_load2hmerge( out, reg, ptr ) (out) = wasm_v128_load64_lane( (void const*)(ptr), reg, 1 )
1991+
1992+ #define stbir__simdf_zeroP() wasm_f32x4_const_splat(0)
1993+ #define stbir__simdf_zero( reg ) (reg) = wasm_f32x4_const_splat(0)
1994+
1995+ #define stbir__simdf_store( ptr, reg ) wasm_v128_store( (void*)(ptr), reg )
1996+ #define stbir__simdf_store1( ptr, reg ) wasm_v128_store32_lane( (void*)(ptr), reg, 0 )
1997+ #define stbir__simdf_store2( ptr, reg ) wasm_v128_store64_lane( (void*)(ptr), reg, 0 )
1998+ #define stbir__simdf_store2h( ptr, reg ) wasm_v128_store64_lane( (void*)(ptr), reg, 1 )
1999+
2000+ #define stbir__simdi_store( ptr, reg ) wasm_v128_store( (void*)(ptr), reg )
2001+ #define stbir__simdi_store1( ptr, reg ) wasm_v128_store32_lane( (void*)(ptr), reg, 0 )
2002+ #define stbir__simdi_store2( ptr, reg ) wasm_v128_store64_lane( (void*)(ptr), reg, 0 )
2003+
2004+ #define stbir__prefetch( ptr )
2005+
// WASM integer widening helpers (zero-extending).
// expand_u8_to_u32: widens all 16 bytes of ireg to 32 bits in two steps (u8->u16->u32);
//   out0..out3 receive bytes 0-3, 4-7, 8-11 and 12-15 respectively.
// expand_u8_to_1u32: widens only the lowest four bytes of ireg.
// expand_u16_to_u32: widens the eight 16-bit elements; out0 gets elements 0-3, out1 gets 4-7.
// The first two expansions declare locals (l, h / tmp) inside a braced block.
2006+ #define stbir__simdi_expand_u8_to_u32(out0,out1,out2,out3,ireg) \
2007+ { \
2008+ v128_t l = wasm_u16x8_extend_low_u8x16 ( ireg ); \
2009+ v128_t h = wasm_u16x8_extend_high_u8x16( ireg ); \
2010+ out0 = wasm_u32x4_extend_low_u16x8 ( l ); \
2011+ out1 = wasm_u32x4_extend_high_u16x8( l ); \
2012+ out2 = wasm_u32x4_extend_low_u16x8 ( h ); \
2013+ out3 = wasm_u32x4_extend_high_u16x8( h ); \
2014+ }
2015+
2016+ #define stbir__simdi_expand_u8_to_1u32(out,ireg) \
2017+ { \
2018+ v128_t tmp = wasm_u16x8_extend_low_u8x16(ireg); \
2019+ out = wasm_u32x4_extend_low_u16x8(tmp); \
2020+ }
2021+
2022+ #define stbir__simdi_expand_u16_to_u32(out0,out1,ireg) \
2023+ { \
2024+ out0 = wasm_u32x4_extend_low_u16x8 ( ireg ); \
2025+ out1 = wasm_u32x4_extend_high_u16x8( ireg ); \
2026+ }
2027+
// WASM conversion, arithmetic, logic and shuffle primitives.
// - Float->int conversions use the saturating truncation wasm_i32x4_trunc_sat_f32x4; the
//   uint8/short variants first clamp to [0, max] and return lane 0 only.
// - The "1" variants (add1, mult1, madd1, min1, max1) expand to the same full-width
//   operation as their 4-lane counterparts on this backend.
// - The *_mem variants load their second operand from memory: a full 128-bit load for the
//   plain forms and a 32-bit splat for the "1_mem" forms.
// - madd is expressed as a separate multiply followed by an add.
// - In the two-register shuffles, indices 0-3 select from the first operand and 4-7 from
//   the second; e.g. aaa1 yields { alp[3], alp[3], alp[3], ones[0] }.
// NOTE(review): the 0123ABCDto3ABx / 0123ABCDto23Ax shuffles pass -1 for the don't-care
// lane; confirm the toolchain's wasm_i32x4_shuffle accepts that index.
2028+ #define stbir__simdf_convert_float_to_i32( i, f ) (i) = wasm_i32x4_trunc_sat_f32x4(f)
2029+ #define stbir__simdf_convert_float_to_int( f ) wasm_i32x4_extract_lane(wasm_i32x4_trunc_sat_f32x4(f), 0)
2030+ #define stbir__simdi_to_int( i ) wasm_i32x4_extract_lane(i, 0)
2031+ #define stbir__simdf_convert_float_to_uint8( f ) ((unsigned char)wasm_i32x4_extract_lane(wasm_i32x4_trunc_sat_f32x4(wasm_f32x4_max(wasm_f32x4_min(f,STBIR_max_uint8_as_float),wasm_f32x4_const_splat(0))), 0))
2032+ #define stbir__simdf_convert_float_to_short( f ) ((unsigned short)wasm_i32x4_extract_lane(wasm_i32x4_trunc_sat_f32x4(wasm_f32x4_max(wasm_f32x4_min(f,STBIR_max_uint16_as_float),wasm_f32x4_const_splat(0))), 0))
2033+ #define stbir__simdi_convert_i32_to_float(out, ireg) (out) = wasm_f32x4_convert_i32x4(ireg)
2034+ #define stbir__simdf_add( out, reg0, reg1 ) (out) = wasm_f32x4_add( reg0, reg1 )
2035+ #define stbir__simdf_mult( out, reg0, reg1 ) (out) = wasm_f32x4_mul( reg0, reg1 )
2036+ #define stbir__simdf_mult_mem( out, reg, ptr ) (out) = wasm_f32x4_mul( reg, wasm_v128_load( (void const*)(ptr) ) )
2037+ #define stbir__simdf_mult1_mem( out, reg, ptr ) (out) = wasm_f32x4_mul( reg, wasm_v128_load32_splat( (void const*)(ptr) ) )
2038+ #define stbir__simdf_add_mem( out, reg, ptr ) (out) = wasm_f32x4_add( reg, wasm_v128_load( (void const*)(ptr) ) )
2039+ #define stbir__simdf_add1_mem( out, reg, ptr ) (out) = wasm_f32x4_add( reg, wasm_v128_load32_splat( (void const*)(ptr) ) )
2040+
2041+ #define stbir__simdf_madd( out, add, mul1, mul2 ) (out) = wasm_f32x4_add( add, wasm_f32x4_mul( mul1, mul2 ) )
2042+ #define stbir__simdf_madd1( out, add, mul1, mul2 ) (out) = wasm_f32x4_add( add, wasm_f32x4_mul( mul1, mul2 ) )
2043+ #define stbir__simdf_madd_mem( out, add, mul, ptr ) (out) = wasm_f32x4_add( add, wasm_f32x4_mul( mul, wasm_v128_load( (void const*)(ptr) ) ) )
2044+ #define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = wasm_f32x4_add( add, wasm_f32x4_mul( mul, wasm_v128_load32_splat( (void const*)(ptr) ) ) )
2045+
2046+ #define stbir__simdf_add1( out, reg0, reg1 ) (out) = wasm_f32x4_add( reg0, reg1 )
2047+ #define stbir__simdf_mult1( out, reg0, reg1 ) (out) = wasm_f32x4_mul( reg0, reg1 )
2048+
2049+ #define stbir__simdf_and( out, reg0, reg1 ) (out) = wasm_v128_and( reg0, reg1 )
2050+ #define stbir__simdf_or( out, reg0, reg1 ) (out) = wasm_v128_or( reg0, reg1 )
2051+
2052+ #define stbir__simdf_min( out, reg0, reg1 ) (out) = wasm_f32x4_min( reg0, reg1 )
2053+ #define stbir__simdf_max( out, reg0, reg1 ) (out) = wasm_f32x4_max( reg0, reg1 )
2054+ #define stbir__simdf_min1( out, reg0, reg1 ) (out) = wasm_f32x4_min( reg0, reg1 )
2055+ #define stbir__simdf_max1( out, reg0, reg1 ) (out) = wasm_f32x4_max( reg0, reg1 )
2056+
2057+ #define stbir__simdf_0123ABCDto3ABx( out, reg0, reg1 ) (out) = wasm_i32x4_shuffle( reg0, reg1, 3, 4, 5, -1 )
2058+ #define stbir__simdf_0123ABCDto23Ax( out, reg0, reg1 ) (out) = wasm_i32x4_shuffle( reg0, reg1, 2, 3, 4, -1 )
2059+
2060+ #define stbir__simdf_aaa1(out,alp,ones) (out) = wasm_i32x4_shuffle(alp, ones, 3, 3, 3, 4)
2061+ #define stbir__simdf_1aaa(out,alp,ones) (out) = wasm_i32x4_shuffle(alp, ones, 4, 0, 0, 0)
2062+ #define stbir__simdf_a1a1(out,alp,ones) (out) = wasm_i32x4_shuffle(alp, ones, 1, 4, 3, 4)
2063+ #define stbir__simdf_1a1a(out,alp,ones) (out) = wasm_i32x4_shuffle(alp, ones, 4, 0, 4, 2)
2064+
2065+ #define stbir__simdf_swiz( reg, one, two, three, four ) wasm_i32x4_shuffle(reg, reg, one, two, three, four)
2066+
2067+ #define stbir__simdi_and( out, reg0, reg1 ) (out) = wasm_v128_and( reg0, reg1 )
2068+ #define stbir__simdi_or( out, reg0, reg1 ) (out) = wasm_v128_or( reg0, reg1 )
2069+ #define stbir__simdi_16madd( out, reg0, reg1 ) (out) = wasm_i32x4_dot_i16x8( reg0, reg1 )
2070+
// WASM float->integer pack helpers.
// pack_to_8bytes: clamps the eight floats in aa/bb to [0, STBIR_max_uint8_as_float],
//   truncates to int32, narrows to int16 and then to uint8. Because the final narrow takes
//   out16 for both operands, the same 8 bytes appear in both halves of `out`.
// pack_to_8words: clamps to [0, STBIR_max_uint16_as_float], truncates to int32 and narrows
//   to eight uint16 values (aa in the low four, bb in the high four).
// Both expansions declare locals (af, bf, ai, bi, out16) inside a braced block.
2071+ #define stbir__simdf_pack_to_8bytes(out,aa,bb) \
2072+ { \
2073+ v128_t af = wasm_f32x4_max( wasm_f32x4_min(aa, STBIR_max_uint8_as_float), wasm_f32x4_const_splat(0) ); \
2074+ v128_t bf = wasm_f32x4_max( wasm_f32x4_min(bb, STBIR_max_uint8_as_float), wasm_f32x4_const_splat(0) ); \
2075+ v128_t ai = wasm_i32x4_trunc_sat_f32x4( af ); \
2076+ v128_t bi = wasm_i32x4_trunc_sat_f32x4( bf ); \
2077+ v128_t out16 = wasm_i16x8_narrow_i32x4( ai, bi ); \
2078+ out = wasm_u8x16_narrow_i16x8( out16, out16 ); \
2079+ }
2080+
2081+ #define stbir__simdf_pack_to_8words(out,aa,bb) \
2082+ { \
2083+ v128_t af = wasm_f32x4_max( wasm_f32x4_min(aa, STBIR_max_uint16_as_float), wasm_f32x4_const_splat(0)); \
2084+ v128_t bf = wasm_f32x4_max( wasm_f32x4_min(bb, STBIR_max_uint16_as_float), wasm_f32x4_const_splat(0)); \
2085+ v128_t ai = wasm_i32x4_trunc_sat_f32x4( af ); \
2086+ v128_t bi = wasm_i32x4_trunc_sat_f32x4( bf ); \
2087+ out = wasm_u16x8_narrow_i32x4( ai, bi ); \
2088+ }
2089+
// Narrows four registers of 32-bit integers to bytes (with saturation) and stores 16 bytes
// at `ptr` in interleaved order: r0[0], r1[0], r2[0], r3[0], r0[1], r1[1], r2[1], r3[1], ...
// After the two narrowing steps the bytes are laid out as r0[0..3], r1[0..3], r2[0..3],
// r3[0..3]; the byte shuffle (0,4,8,12, 1,5,9,13, ...) performs the 4x4 transpose.
// The expansion declares tmp0, tmp1 and tmp inside a braced block.
2090+ #define stbir__interleave_pack_and_store_16_u8( ptr, r0, r1, r2, r3 ) \
2091+ { \
2092+ v128_t tmp0 = wasm_i16x8_narrow_i32x4(r0, r1); \
2093+ v128_t tmp1 = wasm_i16x8_narrow_i32x4(r2, r3); \
2094+ v128_t tmp = wasm_u8x16_narrow_i16x8(tmp0, tmp1); \
2095+ tmp = wasm_i8x16_shuffle(tmp, tmp, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); \
2096+ wasm_v128_store( (void*)(ptr), tmp); \
2097+ }
2098+
// Loads 16 consecutive 32-bit elements from `ptr` as four rows and transposes them with two
// rounds of shuffles: o0 receives elements 0,4,8,12; o1 receives 1,5,9,13; o2 receives
// 2,6,10,14; o3 receives 3,7,11,15.
// `ptr` is used unparenthesized in `ptr+4` etc., so the offsets are in units of the
// pointer's element type and the argument should be a simple pointer expression.
// The expansion declares t0-t3 and s0-s3 inside a braced block.
2099+ #define stbir__simdf_load4_transposed( o0, o1, o2, o3, ptr ) \
2100+ { \
2101+ v128_t t0 = wasm_v128_load( ptr ); \
2102+ v128_t t1 = wasm_v128_load( ptr+4 ); \
2103+ v128_t t2 = wasm_v128_load( ptr+8 ); \
2104+ v128_t t3 = wasm_v128_load( ptr+12 ); \
2105+ v128_t s0 = wasm_i32x4_shuffle(t0, t1, 0, 4, 2, 6); \
2106+ v128_t s1 = wasm_i32x4_shuffle(t0, t1, 1, 5, 3, 7); \
2107+ v128_t s2 = wasm_i32x4_shuffle(t2, t3, 0, 4, 2, 6); \
2108+ v128_t s3 = wasm_i32x4_shuffle(t2, t3, 1, 5, 3, 7); \
2109+ o0 = wasm_i32x4_shuffle(s0, s2, 0, 1, 4, 5); \
2110+ o1 = wasm_i32x4_shuffle(s1, s3, 0, 1, 4, 5); \
2111+ o2 = wasm_i32x4_shuffle(s0, s2, 2, 3, 6, 7); \
2112+ o3 = wasm_i32x4_shuffle(s1, s3, 2, 3, 6, 7); \
2113+ }
2114+
// WASM shift and constant-declaration macros.
// stbir__simdi_32shr: unsigned right shift of each 32-bit lane by `imm`.
// STBIR__SIMDF_CONST: declares a v128_t whose four float lanes all equal x, built through
//   the 16-byte-aligned vector typedef stbir__f32x4 and cast to v128_t.
// STBIR__SIMDI_CONST: declares a v128_t with a four-element brace initializer.
// STBIR__CONSTF / STBIR__CONSTI are the identity on this backend.
2115+ #define stbir__simdi_32shr( out, reg, imm ) out = wasm_u32x4_shr( reg, imm )
2116+
2117+ typedef float stbir__f32x4 __attribute__((__vector_size__(16), __aligned__(16)));
2118+ #define STBIR__SIMDF_CONST(var, x) stbir__simdf var = (v128_t)(stbir__f32x4){ x, x, x, x }
2119+ #define STBIR__SIMDI_CONST(var, x) stbir__simdi var = { x, x, x, x }
2120+ #define STBIR__CONSTF(var) (var)
2121+ #define STBIR__CONSTI(var) (var)
2122+
// Replaces any previous STBIR_FLOORF with a WASM SIMD floor of a single float:
// x is splatted to all lanes, floored with wasm_f32x4_floor, and lane 0 is returned.
2123+ #ifdef STBIR_FLOORF
2124+ #undef STBIR_FLOORF
2125+ #endif
2126+ #define STBIR_FLOORF stbir_simd_floorf
2127+ static stbir__inline float stbir_simd_floorf(float x)
2128+ {
2129+ return wasm_f32x4_extract_lane( wasm_f32x4_floor( wasm_f32x4_splat(x) ), 0);
2130+ }
2131+
// Replaces any previous STBIR_CEILF with a WASM SIMD ceiling of a single float:
// x is splatted to all lanes, rounded up with wasm_f32x4_ceil, and lane 0 is returned.
2132+ #ifdef STBIR_CEILF
2133+ #undef STBIR_CEILF
2134+ #endif
2135+ #define STBIR_CEILF stbir_simd_ceilf
2136+ static stbir__inline float stbir_simd_ceilf(float x)
2137+ {
2138+ return wasm_f32x4_extract_lane( wasm_f32x4_ceil( wasm_f32x4_splat(x) ), 0);
2139+ }
2140+
2141+ #define STBIR_SIMD
2142+
2143+#endif // SSE2/NEON/WASM
2144+
2145+#endif // NO SIMD
2146+
// Width-agnostic "X" aliases: code written against stbir__simdfX_* / STBIR_*X picks up the
// 8-float implementation when STBIR_SIMD8 is defined and the 4-float one otherwise.
// stbir__simdfX_float_count gives the number of floats per register (8 or 4).
// The STBIR_SIMD8 branch also defines the 8-lane constant tables (ones, 0.5, max uint8/uint16
// as float, and their inverted forms) as static const stbir__simdf8 values.
2147+ #ifdef STBIR_SIMD8
2148+ #define stbir__simdfX stbir__simdf8
2149+ #define stbir__simdiX stbir__simdi8
2150+ #define stbir__simdfX_load stbir__simdf8_load
2151+ #define stbir__simdiX_load stbir__simdi8_load
2152+ #define stbir__simdfX_mult stbir__simdf8_mult
2153+ #define stbir__simdfX_add_mem stbir__simdf8_add_mem
2154+ #define stbir__simdfX_madd_mem stbir__simdf8_madd_mem
2155+ #define stbir__simdfX_store stbir__simdf8_store
2156+ #define stbir__simdiX_store stbir__simdi8_store
2157+ #define stbir__simdf_frepX stbir__simdf8_frep8
2158+ #define stbir__simdfX_madd stbir__simdf8_madd
2159+ #define stbir__simdfX_min stbir__simdf8_min
2160+ #define stbir__simdfX_max stbir__simdf8_max
2161+ #define stbir__simdfX_aaa1 stbir__simdf8_aaa1
2162+ #define stbir__simdfX_1aaa stbir__simdf8_1aaa
2163+ #define stbir__simdfX_a1a1 stbir__simdf8_a1a1
2164+ #define stbir__simdfX_1a1a stbir__simdf8_1a1a
2165+ #define stbir__simdfX_convert_float_to_i32 stbir__simdf8_convert_float_to_i32
2166+ #define stbir__simdfX_pack_to_words stbir__simdf8_pack_to_16words
2167+ #define stbir__simdfX_zero stbir__simdf8_zero
2168+ #define STBIR_onesX STBIR_ones8
2169+ #define STBIR_max_uint8_as_floatX STBIR_max_uint8_as_float8
2170+ #define STBIR_max_uint16_as_floatX STBIR_max_uint16_as_float8
2171+ #define STBIR_simd_point5X STBIR_simd_point58
2172+ #define stbir__simdfX_float_count 8
2173+ #define stbir__simdfX_0123to1230 stbir__simdf8_0123to12301230
2174+ #define stbir__simdfX_0123to2103 stbir__simdf8_0123to21032103
2175+ static const stbir__simdf8 STBIR_max_uint16_as_float_inverted8 = { stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted };
2176+ static const stbir__simdf8 STBIR_max_uint8_as_float_inverted8 = { stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted };
2177+ static const stbir__simdf8 STBIR_ones8 = { 1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 };
2178+ static const stbir__simdf8 STBIR_simd_point58 = { 0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5 };
2179+ static const stbir__simdf8 STBIR_max_uint8_as_float8 = { stbir__max_uint8_as_float,stbir__max_uint8_as_float,stbir__max_uint8_as_float,stbir__max_uint8_as_float, stbir__max_uint8_as_float,stbir__max_uint8_as_float,stbir__max_uint8_as_float,stbir__max_uint8_as_float };
2180+ static const stbir__simdf8 STBIR_max_uint16_as_float8 = { stbir__max_uint16_as_float,stbir__max_uint16_as_float,stbir__max_uint16_as_float,stbir__max_uint16_as_float, stbir__max_uint16_as_float,stbir__max_uint16_as_float,stbir__max_uint16_as_float,stbir__max_uint16_as_float };
// 4-float fallback: the X names map onto the plain stbir__simdf_* primitives, the constants
// are read through STBIR__CONSTF, and stbir__if_simdf8_cast_to_simdf4 is the identity.
2181+ #else
2182+ #define stbir__simdfX stbir__simdf
2183+ #define stbir__simdiX stbir__simdi
2184+ #define stbir__simdfX_load stbir__simdf_load
2185+ #define stbir__simdiX_load stbir__simdi_load
2186+ #define stbir__simdfX_mult stbir__simdf_mult
2187+ #define stbir__simdfX_add_mem stbir__simdf_add_mem
2188+ #define stbir__simdfX_madd_mem stbir__simdf_madd_mem
2189+ #define stbir__simdfX_store stbir__simdf_store
2190+ #define stbir__simdiX_store stbir__simdi_store
2191+ #define stbir__simdf_frepX stbir__simdf_frep4
2192+ #define stbir__simdfX_madd stbir__simdf_madd
2193+ #define stbir__simdfX_min stbir__simdf_min
2194+ #define stbir__simdfX_max stbir__simdf_max
2195+ #define stbir__simdfX_aaa1 stbir__simdf_aaa1
2196+ #define stbir__simdfX_1aaa stbir__simdf_1aaa
2197+ #define stbir__simdfX_a1a1 stbir__simdf_a1a1
2198+ #define stbir__simdfX_1a1a stbir__simdf_1a1a
2199+ #define stbir__simdfX_convert_float_to_i32 stbir__simdf_convert_float_to_i32
2200+ #define stbir__simdfX_pack_to_words stbir__simdf_pack_to_8words
2201+ #define stbir__simdfX_zero stbir__simdf_zero
2202+ #define STBIR_onesX STBIR__CONSTF(STBIR_ones)
2203+ #define STBIR_simd_point5X STBIR__CONSTF(STBIR_simd_point5)
2204+ #define STBIR_max_uint8_as_floatX STBIR__CONSTF(STBIR_max_uint8_as_float)
2205+ #define STBIR_max_uint16_as_floatX STBIR__CONSTF(STBIR_max_uint16_as_float)
2206+ #define stbir__simdfX_float_count 4
2207+ #define stbir__if_simdf8_cast_to_simdf4( val ) ( val )
2208+ #define stbir__simdfX_0123to1230 stbir__simdf_0123to1230
2209+ #define stbir__simdfX_0123to2103 stbir__simdf_0123to2103
2210+ #endif
2211+
2212+
// Storage type for one half-precision value.
// With NEON on a target that is not 32-bit ARM, a native 16-bit type is used: __int16 on
// MSVC (non-clang), float16_t elsewhere.
// In every other configuration stbir__FP16 is a union wrapping the raw bits in an
// `unsigned short u`, which is what the software conversion routines below operate on.
2213+ #if defined(STBIR_NEON) && !defined(_M_ARM) && !defined(__arm__)
2214+
2215+ #if defined( _MSC_VER ) && !defined(__clang__)
2216+ typedef __int16 stbir__FP16;
2217+ #else
2218+ typedef float16_t stbir__FP16;
2219+ #endif
2220+
2221+ #else // no NEON, or 32-bit ARM for MSVC
2222+
2223+ typedef union stbir__FP16
2224+ {
2225+ unsigned short u;
2226+ } stbir__FP16;
2227+
2228+ #endif
2229+
2230+#if (!defined(STBIR_NEON) && !defined(STBIR_FP16C)) || (defined(STBIR_NEON) && defined(_M_ARM)) || (defined(STBIR_NEON) && defined(__arm__))
2231+
2232+ // Fabian's half float routines, see: https://gist.github.com/rygorous/2156668
2233+
// Software conversion of one half-precision value (raw bits in h.u) to float.
// The 15 exponent/mantissa bits are shifted into float position and the exponent is
// re-biased by multiplying with the float whose bit pattern is (254 - 15) << 23; values
// that land at or above the (127 + 16) << 23 threshold get the float exponent forced to
// all ones so Inf/NaN survive. The sign bit is merged in last.
// The sign mask is widened to unsigned int before shifting: h.u promotes to (signed) int,
// and shifting 0x8000 left by 16 would move a one into the sign bit of an int, which is
// undefined behaviour for a signed left shift.
2234+ static stbir__inline float stbir__half_to_float( stbir__FP16 h )
2235+ {
2236+ static const stbir__FP32 magic = { (254 - 15) << 23 };
2237+ static const stbir__FP32 was_infnan = { (127 + 16) << 23 };
2238+ stbir__FP32 o;
2239+
2240+ o.u = (h.u & 0x7fff) << 13; // exponent/mantissa bits
2241+ o.f *= magic.f; // exponent adjust
2242+ if (o.f >= was_infnan.f) // make sure Inf/NaN survive
2243+ o.u |= 255 << 23;
2244+ o.u |= ((unsigned int)(h.u & 0x8000)) << 16; // sign bit
2245+ return o.f;
2246+ }
2247+
// Software conversion of one float to half precision (raw bits returned in .u).
// The sign is stripped first and re-attached at the end (sign >> 16). The magnitude then
// takes one of three paths:
//   - at or above (127 + 16) << 23: result is Inf (0x7c00), or quiet NaN (0x7e00) when the
//     input bits exceed the float infinity pattern;
//   - below 113 << 23: result is a half subnormal or zero, produced by adding a magic float
//     and subtracting its bit pattern so the FP adder performs the rounding;
//   - otherwise: normal number; the exponent is re-biased and 0xfff plus the mantissa's
//     odd bit are added before the 13-bit shift (the odd-bit term handles ties).
2248+ static stbir__inline stbir__FP16 stbir__float_to_half(float val)
2249+ {
2250+ stbir__FP32 f32infty = { 255 << 23 };
2251+ stbir__FP32 f16max = { (127 + 16) << 23 };
2252+ stbir__FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
2253+ unsigned int sign_mask = 0x80000000u;
2254+ stbir__FP16 o = { 0 };
2255+ stbir__FP32 f;
2256+ unsigned int sign;
2257+
2258+ f.f = val;
2259+ sign = f.u & sign_mask;
2260+ f.u ^= sign;
2261+
2262+ if (f.u >= f16max.u) // result is Inf or NaN (all exponent bits set)
2263+ o.u = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
2264+ else // (De)normalized number or zero
2265+ {
2266+ if (f.u < (113 << 23)) // resulting FP16 is subnormal or zero
2267+ {
2268+ // use a magic value to align our 10 mantissa bits at the bottom of
2269+ // the float. as long as FP addition is round-to-nearest-even this
2270+ // just works.
2271+ f.f += denorm_magic.f;
2272+ // and one integer subtract of the bias later, we have our final float!
2273+ o.u = (unsigned short) ( f.u - denorm_magic.u );
2274+ }
2275+ else
2276+ {
2277+ unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd
2278+ // update exponent, rounding bias part 1
2279+ f.u = f.u + ((15u - 127) << 23) + 0xfff;
2280+ // rounding bias part 2
2281+ f.u += mant_odd;
2282+ // take the bits!
2283+ o.u = (unsigned short) ( f.u >> 13 );
2284+ }
2285+ }
2286+
2287+ o.u |= sign >> 16;
2288+ return o;
2289+ }
2290+
2291+#endif
2292+
2293+
2294+#if defined(STBIR_FP16C)
2295+
2296+ #include <immintrin.h>
2297+
// F16C path: converts 8 half-precision values to 8 floats.
// Reads 16 bytes from `input` with an unaligned load, converts with _mm256_cvtph_ps, and
// writes 8 floats to `output` with an unaligned store.
2298+ static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input)
2299+ {
2300+ _mm256_storeu_ps( (float*)output, _mm256_cvtph_ps( _mm_loadu_si128( (__m128i const* )input ) ) );
2301+ }
2302+
// F16C path: converts 8 floats to 8 half-precision values.
// Reads 8 floats from `input` with an unaligned load, converts with _mm256_cvtps_ph using
// rounding-control immediate 0, and writes 16 bytes to `output` with an unaligned store.
2303+ static stbir__inline void stbir__float_to_half_SIMD(stbir__FP16 * output, float const * input)
2304+ {
2305+ _mm_storeu_si128( (__m128i*)output, _mm256_cvtps_ph( _mm256_loadu_ps( input ), 0 ) );
2306+ }
2307+
// F16C path: converts a single half (raw bits in h.u) to float by placing the bits in the
// low lane of an integer register, converting with _mm_cvtph_ps, and returning lane 0.
2308+ static stbir__inline float stbir__half_to_float( stbir__FP16 h )
2309+ {
2310+ return _mm_cvtss_f32( _mm_cvtph_ps( _mm_cvtsi32_si128( (int)h.u ) ) );
2311+ }
2312+
// F16C path: converts a single float to half by loading it into the low lane (_mm_set_ss),
// converting with _mm_cvtps_ph using rounding-control immediate 0, and keeping the low
// 16 bits of the result as the raw half bits.
2313+ static stbir__inline stbir__FP16 stbir__float_to_half( float f )
2314+ {
2315+ stbir__FP16 h;
2316+ h.u = (unsigned short) _mm_cvtsi128_si32( _mm_cvtps_ph( _mm_set_ss( f ), 0 ) );
2317+ return h;
2318+ }
2319+
2320+#elif defined(STBIR_SSE2)
2321+
2322+ // Fabian's half float routines, see: https://gist.github.com/rygorous/2156668
// SSE2 path: converts 8 half-precision values at `input` to 8 floats at `output`.
// The 16 input bytes are loaded unaligned; the low four halves are zero-extended to 32 bits
// and converted, stored to output+0, then the same sequence is repeated for the high four
// and stored to output+4 (both stores go through stbir__simdf_store).
// Per lane: the sign is separated from exponent/mantissa; the exponent/mantissa bits are
// shifted left by 13 and re-biased by expadjust_normal, with the adjustment added a second
// time for Inf/NaN inputs; inputs below smallest_normal (denormals) instead take a path that
// adds magic_denorm as an integer and subtracts it as a float. The two results are blended
// by the b_isdenorm mask and the sign is ORed back in.
2323+ stbir__inline static void stbir__half_to_float_SIMD(float * output, void const * input)
2324+ {
2325+ static const STBIR__SIMDI_CONST(mask_nosign, 0x7fff);
2326+ static const STBIR__SIMDI_CONST(smallest_normal, 0x0400);
2327+ static const STBIR__SIMDI_CONST(infinity, 0x7c00);
2328+ static const STBIR__SIMDI_CONST(expadjust_normal, (127 - 15) << 23);
2329+ static const STBIR__SIMDI_CONST(magic_denorm, 113 << 23);
2330+
2331+ __m128i i = _mm_loadu_si128 ( (__m128i const*)(input) );
2332+ __m128i h = _mm_unpacklo_epi16 ( i, _mm_setzero_si128() );
2333+ __m128i mnosign = STBIR__CONSTI(mask_nosign);
2334+ __m128i eadjust = STBIR__CONSTI(expadjust_normal);
2335+ __m128i smallest = STBIR__CONSTI(smallest_normal);
2336+ __m128i infty = STBIR__CONSTI(infinity);
2337+ __m128i expmant = _mm_and_si128(mnosign, h);
2338+ __m128i justsign = _mm_xor_si128(h, expmant);
2339+ __m128i b_notinfnan = _mm_cmpgt_epi32(infty, expmant);
2340+ __m128i b_isdenorm = _mm_cmpgt_epi32(smallest, expmant);
2341+ __m128i shifted = _mm_slli_epi32(expmant, 13);
2342+ __m128i adj_infnan = _mm_andnot_si128(b_notinfnan, eadjust);
2343+ __m128i adjusted = _mm_add_epi32(eadjust, shifted);
2344+ __m128i den1 = _mm_add_epi32(shifted, STBIR__CONSTI(magic_denorm));
2345+ __m128i adjusted2 = _mm_add_epi32(adjusted, adj_infnan);
2346+ __m128 den2 = _mm_sub_ps(_mm_castsi128_ps(den1), *(const __m128 *)&magic_denorm);
2347+ __m128 adjusted3 = _mm_and_ps(den2, _mm_castsi128_ps(b_isdenorm));
2348+ __m128 adjusted4 = _mm_andnot_ps(_mm_castsi128_ps(b_isdenorm), _mm_castsi128_ps(adjusted2));
2349+ __m128 adjusted5 = _mm_or_ps(adjusted3, adjusted4);
2350+ __m128i sign = _mm_slli_epi32(justsign, 16);
2351+ __m128 final = _mm_or_ps(adjusted5, _mm_castsi128_ps(sign));
2352+ stbir__simdf_store( output + 0, final );
2353+
2354+ h = _mm_unpackhi_epi16 ( i, _mm_setzero_si128() );
2355+ expmant = _mm_and_si128(mnosign, h);
2356+ justsign = _mm_xor_si128(h, expmant);
2357+ b_notinfnan = _mm_cmpgt_epi32(infty, expmant);
2358+ b_isdenorm = _mm_cmpgt_epi32(smallest, expmant);
2359+ shifted = _mm_slli_epi32(expmant, 13);
2360+ adj_infnan = _mm_andnot_si128(b_notinfnan, eadjust);
2361+ adjusted = _mm_add_epi32(eadjust, shifted);
2362+ den1 = _mm_add_epi32(shifted, STBIR__CONSTI(magic_denorm));
2363+ adjusted2 = _mm_add_epi32(adjusted, adj_infnan);
2364+ den2 = _mm_sub_ps(_mm_castsi128_ps(den1), *(const __m128 *)&magic_denorm);
2365+ adjusted3 = _mm_and_ps(den2, _mm_castsi128_ps(b_isdenorm));
2366+ adjusted4 = _mm_andnot_ps(_mm_castsi128_ps(b_isdenorm), _mm_castsi128_ps(adjusted2));
2367+ adjusted5 = _mm_or_ps(adjusted3, adjusted4);
2368+ sign = _mm_slli_epi32(justsign, 16);
2369+ final = _mm_or_ps(adjusted5, _mm_castsi128_ps(sign));
2370+ stbir__simdf_store( output + 4, final );
2371+
2372+ // ~38 SSE2 ops for 8 values
2373+ }
2374+
2375+ // Fabian's round-to-nearest-even float to half
2376+ // ~48 SSE2 ops for 8 output
// SSE2 path: converts 8 floats at `input` to 8 half-precision values at `output`.
// The first four floats are loaded unaligned from input and the next four from input+4;
// each group runs the same sequence, and the two 32-bit results are packed to eight 16-bit
// values and written with a single stbir__simdi_store.
// Per lane: the sign is split off; NaN inputs are detected with an unordered compare and
// get the NaN bit ORed into the half infinity pattern; magnitudes below c_f16max are
// "regular" and take either the subnormal path (add/subtract c_subnorm_magic so the FP
// adder rounds) or the normal path (add c_normal_bias plus the mantissa odd bit, then shift
// right by 13). Masks select between the paths.
// The sign is moved down with an arithmetic shift, so negative lanes become sign-extended
// 32-bit values that the signed-saturating _mm_packs_epi32 passes through unchanged.
// Every constant is read through STBIR__CONSTI, including c_nanbit in the second half,
// so both halves access the constants the same way.
2377+ stbir__inline static void stbir__float_to_half_SIMD(void * output, float const * input)
2378+ {
2379+ static const STBIR__SIMDI_CONST(mask_sign, 0x80000000u);
2380+ static const STBIR__SIMDI_CONST(c_f16max, (127 + 16) << 23); // all FP32 values >=this round to +inf
2381+ static const STBIR__SIMDI_CONST(c_nanbit, 0x200);
2382+ static const STBIR__SIMDI_CONST(c_infty_as_fp16, 0x7c00);
2383+ static const STBIR__SIMDI_CONST(c_min_normal, (127 - 14) << 23); // smallest FP32 that yields a normalized FP16
2384+ static const STBIR__SIMDI_CONST(c_subnorm_magic, ((127 - 15) + (23 - 10) + 1) << 23);
2385+ static const STBIR__SIMDI_CONST(c_normal_bias, 0xfff - ((127 - 15) << 23)); // adjust exponent and add mantissa rounding
2386+
2387+ __m128 f = _mm_loadu_ps(input);
2388+ __m128 msign = _mm_castsi128_ps(STBIR__CONSTI(mask_sign));
2389+ __m128 justsign = _mm_and_ps(msign, f);
2390+ __m128 absf = _mm_xor_ps(f, justsign);
2391+ __m128i absf_int = _mm_castps_si128(absf); // the cast is "free" (extra bypass latency, but no thruput hit)
2392+ __m128i f16max = STBIR__CONSTI(c_f16max);
2393+ __m128 b_isnan = _mm_cmpunord_ps(absf, absf); // is this a NaN?
2394+ __m128i b_isregular = _mm_cmpgt_epi32(f16max, absf_int); // (sub)normalized or special?
2395+ __m128i nanbit = _mm_and_si128(_mm_castps_si128(b_isnan), STBIR__CONSTI(c_nanbit));
2396+ __m128i inf_or_nan = _mm_or_si128(nanbit, STBIR__CONSTI(c_infty_as_fp16)); // output for specials
2397+
2398+ __m128i min_normal = STBIR__CONSTI(c_min_normal);
2399+ __m128i b_issub = _mm_cmpgt_epi32(min_normal, absf_int);
2400+
2401+ // "result is subnormal" path
2402+ __m128 subnorm1 = _mm_add_ps(absf, _mm_castsi128_ps(STBIR__CONSTI(c_subnorm_magic))); // magic value to round output mantissa
2403+ __m128i subnorm2 = _mm_sub_epi32(_mm_castps_si128(subnorm1), STBIR__CONSTI(c_subnorm_magic)); // subtract out bias
2404+
2405+ // "result is normal" path
2406+ __m128i mantoddbit = _mm_slli_epi32(absf_int, 31 - 13); // shift bit 13 (mantissa LSB) to sign
2407+ __m128i mantodd = _mm_srai_epi32(mantoddbit, 31); // -1 if FP16 mantissa odd, else 0
2408+
2409+ __m128i round1 = _mm_add_epi32(absf_int, STBIR__CONSTI(c_normal_bias));
2410+ __m128i round2 = _mm_sub_epi32(round1, mantodd); // if mantissa LSB odd, bias towards rounding up (RTNE)
2411+ __m128i normal = _mm_srli_epi32(round2, 13); // rounded result
2412+
2413+ // combine the two non-specials
2414+ __m128i nonspecial = _mm_or_si128(_mm_and_si128(subnorm2, b_issub), _mm_andnot_si128(b_issub, normal));
2415+
2416+ // merge in specials as well
2417+ __m128i joined = _mm_or_si128(_mm_and_si128(nonspecial, b_isregular), _mm_andnot_si128(b_isregular, inf_or_nan));
2418+
2419+ __m128i sign_shift = _mm_srai_epi32(_mm_castps_si128(justsign), 16);
2420+ __m128i final2, final= _mm_or_si128(joined, sign_shift);
2421+
2422+ f = _mm_loadu_ps(input+4);
2423+ justsign = _mm_and_ps(msign, f);
2424+ absf = _mm_xor_ps(f, justsign);
2425+ absf_int = _mm_castps_si128(absf); // the cast is "free" (extra bypass latency, but no thruput hit)
2426+ b_isnan = _mm_cmpunord_ps(absf, absf); // is this a NaN?
2427+ b_isregular = _mm_cmpgt_epi32(f16max, absf_int); // (sub)normalized or special?
2428+ nanbit = _mm_and_si128(_mm_castps_si128(b_isnan), STBIR__CONSTI(c_nanbit));
2429+ inf_or_nan = _mm_or_si128(nanbit, STBIR__CONSTI(c_infty_as_fp16)); // output for specials
2430+
2431+ b_issub = _mm_cmpgt_epi32(min_normal, absf_int);
2432+
2433+ // "result is subnormal" path
2434+ subnorm1 = _mm_add_ps(absf, _mm_castsi128_ps(STBIR__CONSTI(c_subnorm_magic))); // magic value to round output mantissa
2435+ subnorm2 = _mm_sub_epi32(_mm_castps_si128(subnorm1), STBIR__CONSTI(c_subnorm_magic)); // subtract out bias
2436+
2437+ // "result is normal" path
2438+ mantoddbit = _mm_slli_epi32(absf_int, 31 - 13); // shift bit 13 (mantissa LSB) to sign
2439+ mantodd = _mm_srai_epi32(mantoddbit, 31); // -1 if FP16 mantissa odd, else 0
2440+
2441+ round1 = _mm_add_epi32(absf_int, STBIR__CONSTI(c_normal_bias));
2442+ round2 = _mm_sub_epi32(round1, mantodd); // if mantissa LSB odd, bias towards rounding up (RTNE)
2443+ normal = _mm_srli_epi32(round2, 13); // rounded result
2444+
2445+ // combine the two non-specials
2446+ nonspecial = _mm_or_si128(_mm_and_si128(subnorm2, b_issub), _mm_andnot_si128(b_issub, normal));
2447+
2448+ // merge in specials as well
2449+ joined = _mm_or_si128(_mm_and_si128(nonspecial, b_isregular), _mm_andnot_si128(b_isregular, inf_or_nan));
2450+
2451+ sign_shift = _mm_srai_epi32(_mm_castps_si128(justsign), 16);
2452+ final2 = _mm_or_si128(joined, sign_shift);
2453+ final = _mm_packs_epi32(final, final2);
2454+ stbir__simdi_store( output,final );
2455+ }
2456+
2457+#elif defined(STBIR_NEON) && defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) // 64-bit ARM on MSVC (not clang)
2458+
// MSVC ARM64 path: converts 8 halves to 8 floats as two groups of four
// (vld1_f16 -> vcvt_f32_f16 -> vst1q_f32), reading input+0 / input+4 and writing
// output+0 / output+4.
2459+ static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input)
2460+ {
2461+ float16x4_t in0 = vld1_f16(input + 0);
2462+ float16x4_t in1 = vld1_f16(input + 4);
2463+ vst1q_f32(output + 0, vcvt_f32_f16(in0));
2464+ vst1q_f32(output + 4, vcvt_f32_f16(in1));
2465+ }
2466+
// MSVC ARM64 path: converts 8 floats to 8 halves as two groups of four
// (vld1q_f32 -> vcvt_f16_f32 -> vst1_f16), reading input+0 / input+4 and writing
// output+0 / output+4.
2467+ static stbir__inline void stbir__float_to_half_SIMD(stbir__FP16 * output, float const * input)
2468+ {
2469+ float16x4_t out0 = vcvt_f16_f32(vld1q_f32(input + 0));
2470+ float16x4_t out1 = vcvt_f16_f32(vld1q_f32(input + 4));
2471+ vst1_f16(output+0, out0);
2472+ vst1_f16(output+4, out1);
2473+ }
2474+
// MSVC ARM64 path: converts a single half to float by loading it replicated into a
// 4-lane half vector (vld1_dup_f16), widening with vcvt_f32_f16, and returning lane 0.
2475+ static stbir__inline float stbir__half_to_float( stbir__FP16 h )
2476+ {
2477+ return vgetq_lane_f32(vcvt_f32_f16(vld1_dup_f16(&h)), 0);
2478+ }
2479+
// MSVC ARM64 path: converts a single float to half by splatting it, narrowing with
// vcvt_f16_f32, and extracting lane 0.
// NOTE(review): the result is read through the MSVC-specific .n16_u16[0] member to obtain
// the raw 16-bit value - confirm against the MSVC arm64_neon.h type definitions.
2480+ static stbir__inline stbir__FP16 stbir__float_to_half( float f )
2481+ {
2482+ return vget_lane_f16(vcvt_f16_f32(vdupq_n_f32(f)), 0).n16_u16[0];
2483+ }
2484+
2485+#elif defined(STBIR_NEON) && ( defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) ) // 64-bit ARM
2486+
// 64-bit ARM NEON path: loads 8 halves with one vld1q_f16, widens the low and high four
// with vcvt_f32_f16, and stores them to output+0 and output+4.
2487+ static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input)
2488+ {
2489+ float16x8_t in = vld1q_f16(input);
2490+ vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(in)));
2491+ vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(in)));
2492+ }
2493+
// 64-bit ARM NEON path: narrows two groups of four floats (input+0, input+4) with
// vcvt_f16_f32, combines them, and stores all 8 halves with one vst1q_f16.
2494+ static stbir__inline void stbir__float_to_half_SIMD(stbir__FP16 * output, float const * input)
2495+ {
2496+ float16x4_t out0 = vcvt_f16_f32(vld1q_f32(input + 0));
2497+ float16x4_t out1 = vcvt_f16_f32(vld1q_f32(input + 4));
2498+ vst1q_f16(output, vcombine_f16(out0, out1));
2499+ }
2500+
// 64-bit ARM NEON path: converts a single half to float by splatting it (vdup_n_f16),
// widening with vcvt_f32_f16, and returning lane 0.
2501+ static stbir__inline float stbir__half_to_float( stbir__FP16 h )
2502+ {
2503+ return vgetq_lane_f32(vcvt_f32_f16(vdup_n_f16(h)), 0);
2504+ }
2505+
// 64-bit ARM NEON path: converts a single float to half by splatting it (vdupq_n_f32),
// narrowing with vcvt_f16_f32, and returning lane 0.
2506+ static stbir__inline stbir__FP16 stbir__float_to_half( float f )
2507+ {
2508+ return vget_lane_f16(vcvt_f16_f32(vdupq_n_f32(f)), 0);
2509+ }
2510+
2511+#elif defined(STBIR_WASM) || (defined(STBIR_NEON) && (defined(_MSC_VER) || defined(_M_ARM) || defined(__arm__))) // WASM or 32-bit ARM on MSVC/clang
2512+
// Fallback for targets without a vector half conversion in this file: converts exactly
// 8 elements by calling the scalar stbir__half_to_float once per element.
2513+ static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input)
2514+ {
2515+ for (int i=0; i<8; i++)
2516+ {
2517+ output[i] = stbir__half_to_float(input[i]);
2518+ }
2519+ }
2520+ static stbir__inline void stbir__float_to_half_SIMD(stbir__FP16 * output, float const * input)
2521+ {
2522+ for (int i=0; i<8; i++)
2523+ {
2524+ output[i] = stbir__float_to_half(input[i]);
2525+ }
2526+ }
2527+
2528+#endif
2529+
2530+
2531+#ifdef STBIR_SIMD
2532+
// Named lane permutations. stbir__simdf_0123toABCD( out, reg ) assigns to `out`
// the result of stbir__simdf_swiz( reg, A,B,C,D ).
// NOTE(review): presumably output lane k receives the input lane named by the
// k-th digit - confirm against each backend's stbir__simdf_swiz definition.
#define stbir__simdf_0123to3333( out, reg ) (out) = stbir__simdf_swiz( reg, 3,3,3,3 )
#define stbir__simdf_0123to2222( out, reg ) (out) = stbir__simdf_swiz( reg, 2,2,2,2 )
#define stbir__simdf_0123to1111( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,1,1 )
#define stbir__simdf_0123to0000( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,0,0 )
#define stbir__simdf_0123to0003( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,0,3 )
#define stbir__simdf_0123to0001( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,0,1 )
#define stbir__simdf_0123to1122( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,2,2 )
#define stbir__simdf_0123to2333( out, reg ) (out) = stbir__simdf_swiz( reg, 2,3,3,3 )
#define stbir__simdf_0123to0023( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,2,3 )
#define stbir__simdf_0123to1230( out, reg ) (out) = stbir__simdf_swiz( reg, 1,2,3,0 )
#define stbir__simdf_0123to2103( out, reg ) (out) = stbir__simdf_swiz( reg, 2,1,0,3 )
#define stbir__simdf_0123to3210( out, reg ) (out) = stbir__simdf_swiz( reg, 3,2,1,0 )
#define stbir__simdf_0123to2301( out, reg ) (out) = stbir__simdf_swiz( reg, 2,3,0,1 )
#define stbir__simdf_0123to3012( out, reg ) (out) = stbir__simdf_swiz( reg, 3,0,1,2 )
#define stbir__simdf_0123to0011( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,1,1 )
#define stbir__simdf_0123to1100( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,0,0 )
#define stbir__simdf_0123to2233( out, reg ) (out) = stbir__simdf_swiz( reg, 2,2,3,3 )
#define stbir__simdf_0123to1133( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,3,3 )
#define stbir__simdf_0123to0022( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,2,2 )
#define stbir__simdf_0123to1032( out, reg ) (out) = stbir__simdf_swiz( reg, 1,0,3,2 )
2553+
// Overlays one integer SIMD register with four 32-bit lanes so the same storage
// can be accessed as unsigned lanes, signed lanes, or the native vector type.
typedef union stbir__simdi_u32
{
  stbir_uint32 m128i_u32[4]; // lanes viewed as unsigned 32-bit
  int m128i_i32[4];          // lanes viewed as signed int
  stbir__simdi m128i_i128;   // the native integer vector
} stbir__simdi_u32;
2560+
// Three zeros, three all-bits-set entries, three zeros.
// NOTE(review): presumably read at a varying offset to obtain a lane mask -
// confirm at the use sites.
static const int STBIR_mask[9] = { 0,0,0,-1,-1,-1,0,0,0 };

// Vector constants built with STBIR__SIMDF_CONST / STBIR__SIMDI_CONST (both
// defined elsewhere) from the scalar range limits and their inverses.
static const STBIR__SIMDF_CONST(STBIR_max_uint8_as_float, stbir__max_uint8_as_float);
static const STBIR__SIMDF_CONST(STBIR_max_uint16_as_float, stbir__max_uint16_as_float);
static const STBIR__SIMDF_CONST(STBIR_max_uint8_as_float_inverted, stbir__max_uint8_as_float_inverted);
static const STBIR__SIMDF_CONST(STBIR_max_uint16_as_float_inverted, stbir__max_uint16_as_float_inverted);

static const STBIR__SIMDF_CONST(STBIR_simd_point5, 0.5f);
static const STBIR__SIMDF_CONST(STBIR_ones, 1.0f);
// Read as an IEEE-754 single-precision bit pattern, (127 - 13) << 23 is 2^-13.
static const STBIR__SIMDI_CONST(STBIR_almost_zero, (127 - 13) << 23);
// Read as an IEEE-754 single-precision bit pattern, 0x3f7fffff is the largest
// value below 1.0f.
static const STBIR__SIMDI_CONST(STBIR_almost_one, 0x3f7fffff);
// (identifier spelling "mastissa" is part of the name used elsewhere; do not rename here)
static const STBIR__SIMDI_CONST(STBIR_mastissa_mask, 0xff);
static const STBIR__SIMDI_CONST(STBIR_topscale, 0x02000000);
2574+
// In SIMD mode the main loops are already unrolled by the proper amount, and we
// don't want the non-SIMD remnant loops to be unrolled as well, because they only
// run a few times. Adding this switch saves about 5K on clang, which is
// Captain Unroll the 3rd.
// In this branch the STBIR_SIMD_* names forward to the generic STBIR_* macros.
#define STBIR_SIMD_STREAMOUT_PTR( star ) STBIR_STREAMOUT_PTR( star )
#define STBIR_SIMD_NO_UNROLL(ptr) STBIR_NO_UNROLL(ptr)
#define STBIR_SIMD_NO_UNROLL_LOOP_START STBIR_NO_UNROLL_LOOP_START
#define STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR STBIR_NO_UNROLL_LOOP_START_INF_FOR

// Replace any previously configured STBIR_MEMCPY with the SIMD copy routine
// defined just below.
#ifdef STBIR_MEMCPY
#undef STBIR_MEMCPY
#endif
#define STBIR_MEMCPY stbir_simd_memcpy
2587+
// Override the normal use of memcpy with a much simpler copy (faster and smaller
// with our sized copies).
//
// Copies `bytes` bytes from `src` to `dest`. The regions must not overlap (this
// is asserted). Three strategies are used depending on size:
//   * fewer than 16 bytes: byte-by-byte loop;
//   * fewer than 16*stbir__simdfX_float_count bytes: 16-byte vector copies;
//   * otherwise: four wide registers per iteration
//     (16*stbir__simdfX_float_count bytes per iteration).
// In both vector paths the first chunk is copied unaligned, `d` is then rounded
// up to the next chunk-size boundary (so a few bytes may be copied twice), and
// the final chunk is pulled back so that it ends exactly at `d_end`, again
// re-copying bytes that were already written rather than running a scalar tail.
static void stbir_simd_memcpy( void * dest, void const * src, size_t bytes )
{
  char STBIR_SIMD_STREAMOUT_PTR (*) d = (char*) dest;
  char STBIR_SIMD_STREAMOUT_PTR( * ) d_end = ((char*) dest) + bytes;
  // byte distance from a destination address to the matching source address,
  // so only the destination pointer needs to be advanced
  ptrdiff_t ofs_to_src = (char*)src - (char*)dest;

  // check overlaps
  STBIR_ASSERT( ( ( d >= ( (char*)src) + bytes ) ) || ( ( d + bytes ) <= (char*)src ) );

  if ( bytes < (16*stbir__simdfX_float_count) )
  {
    if ( bytes < 16 )
    {
      // too small for even one vector: plain byte loop (skipped when bytes == 0)
      if ( bytes )
      {
        STBIR_SIMD_NO_UNROLL_LOOP_START
        do
        {
          STBIR_SIMD_NO_UNROLL(d);
          d[ 0 ] = d[ ofs_to_src ];
          ++d;
        } while ( d < d_end );
      }
    }
    else
    {
      stbir__simdf x;
      // do one unaligned to get us aligned for the stream out below
      stbir__simdf_load( x, ( d + ofs_to_src ) );
      stbir__simdf_store( d, x );
      // advance to the next 16-byte boundary strictly after the original d
      d = (char*)( ( ( (size_t)d ) + 16 ) & ~15 );

      STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
      for(;;)
      {
        STBIR_SIMD_NO_UNROLL(d);

        if ( d > ( d_end - 16 ) )
        {
          // less than a full vector remains: either we are done, or we back up
          // so the last vector ends exactly at d_end
          if ( d == d_end )
            return;
          d = d_end - 16;
        }

        stbir__simdf_load( x, ( d + ofs_to_src ) );
        stbir__simdf_store( d, x );
        d += 16;
      }
    }
  }
  else
  {
    stbir__simdfX x0,x1,x2,x3;

    // do one unaligned to get us aligned for the stream out below
    // (d is a char pointer, so the N*stbir__simdfX_float_count offsets are in bytes)
    stbir__simdfX_load( x0, ( d + ofs_to_src ) + 0*stbir__simdfX_float_count );
    stbir__simdfX_load( x1, ( d + ofs_to_src ) + 4*stbir__simdfX_float_count );
    stbir__simdfX_load( x2, ( d + ofs_to_src ) + 8*stbir__simdfX_float_count );
    stbir__simdfX_load( x3, ( d + ofs_to_src ) + 12*stbir__simdfX_float_count );
    stbir__simdfX_store( d + 0*stbir__simdfX_float_count, x0 );
    stbir__simdfX_store( d + 4*stbir__simdfX_float_count, x1 );
    stbir__simdfX_store( d + 8*stbir__simdfX_float_count, x2 );
    stbir__simdfX_store( d + 12*stbir__simdfX_float_count, x3 );
    // advance to the next (16*stbir__simdfX_float_count)-byte boundary after the original d
    d = (char*)( ( ( (size_t)d ) + (16*stbir__simdfX_float_count) ) & ~((16*stbir__simdfX_float_count)-1) );

    STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
    for(;;)
    {
      STBIR_SIMD_NO_UNROLL(d);

      if ( d > ( d_end - (16*stbir__simdfX_float_count) ) )
      {
        // less than a full group remains: finish, or back up so the last group
        // ends exactly at d_end
        if ( d == d_end )
          return;
        d = d_end - (16*stbir__simdfX_float_count);
      }

      stbir__simdfX_load( x0, ( d + ofs_to_src ) + 0*stbir__simdfX_float_count );
      stbir__simdfX_load( x1, ( d + ofs_to_src ) + 4*stbir__simdfX_float_count );
      stbir__simdfX_load( x2, ( d + ofs_to_src ) + 8*stbir__simdfX_float_count );
      stbir__simdfX_load( x3, ( d + ofs_to_src ) + 12*stbir__simdfX_float_count );
      stbir__simdfX_store( d + 0*stbir__simdfX_float_count, x0 );
      stbir__simdfX_store( d + 4*stbir__simdfX_float_count, x1 );
      stbir__simdfX_store( d + 8*stbir__simdfX_float_count, x2 );
      stbir__simdfX_store( d + 12*stbir__simdfX_float_count, x3 );
      d += (16*stbir__simdfX_float_count);
    }
  }
}
2678+
// memcpy that is specifically, intentionally overlapping (src is smaller than dest, so it
// can be a normal forward copy, bytes is divisible by 4 and bytes is greater than or equal
// to the diff between dest and src).
//
// When dest is at least 16 bytes past src, whole 16-byte vectors are copied first
// and any remainder is finished 4 bytes at a time; otherwise the whole copy is done
// 4 bytes at a time. Both loops are do/while, so they always execute at least once:
// the preconditions above are what keep that from writing past `bytes`.
static void stbir_overlapping_memcpy( void * dest, void const * src, size_t bytes )
{
  char STBIR_SIMD_STREAMOUT_PTR (*) sd = (char*) src;
  char STBIR_SIMD_STREAMOUT_PTR( * ) s_end = ((char*) src) + bytes;
  // byte distance from a source address to the matching destination address
  ptrdiff_t ofs_to_dest = (char*)dest - (char*)src;

  if ( ofs_to_dest >= 16 ) // is the overlap more than 16 away?
  {
    // end of the part of the source that is a whole number of 16-byte vectors
    char STBIR_SIMD_STREAMOUT_PTR( * ) s_end16 = ((char*) src) + (bytes&~15);
    STBIR_SIMD_NO_UNROLL_LOOP_START
    do
    {
      stbir__simdf x;
      STBIR_SIMD_NO_UNROLL(sd);
      stbir__simdf_load( x, sd );
      stbir__simdf_store( ( sd + ofs_to_dest ), x );
      sd += 16;
    } while ( sd < s_end16 );

    // bytes was a multiple of 16: nothing left for the 4-byte tail
    if ( sd == s_end )
      return;
  }

  // 4-byte tail (or the whole copy when dest is closer than 16 bytes to src)
  do
  {
    STBIR_SIMD_NO_UNROLL(sd);
    *(int*)( sd + ofs_to_dest ) = *(int*) sd;
    sd += 4;
  } while ( sd < s_end );
}
2712+
2713+#else // no SSE2
2714+
// When in scalar mode, we let unrolling happen, so STBIR_SIMD_STREAMOUT_PTR just
// forwards to STBIR_STREAMOUT_PTR (the __restrict qualifier) and the no-unroll
// markers expand to nothing.
#define STBIR_SIMD_STREAMOUT_PTR( star ) STBIR_STREAMOUT_PTR( star )
#define STBIR_SIMD_NO_UNROLL(ptr)
#define STBIR_SIMD_NO_UNROLL_LOOP_START
#define STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
2720+
2721+#endif // SSE2
2722+
2723+
2724+#ifdef STBIR_PROFILE
2725+
2726+#ifndef STBIR_PROFILE_FUNC
2727+
2728+#if defined(_x86_64) || defined( __x86_64__ ) || defined( _M_X64 ) || defined(__x86_64) || defined(__SSE2__) || defined(STBIR_SSE) || defined( _M_IX86_FP ) || defined(__i386) || defined( __i386__ ) || defined( _M_IX86 ) || defined( _X86_ )
2729+
2730+#ifdef _MSC_VER
2731+
2732+ STBIRDEF stbir_uint64 __rdtsc();
2733+ #define STBIR_PROFILE_FUNC() __rdtsc()
2734+
2735+#else // non msvc
2736+
// Default x86 profiling clock for non-MSVC compilers: executes `rdtsc` and
// combines the low half (EAX) and high half (EDX) into one 64-bit value.
static stbir__inline stbir_uint64 STBIR_PROFILE_FUNC()
{
  stbir_uint32 lo, hi;
  asm volatile ("rdtsc" : "=a" (lo), "=d" (hi) );
  return ( ( (stbir_uint64) hi ) << 32 ) | ( (stbir_uint64) lo );
}
2743+
2744+#endif // msvc
2745+
2746+#elif defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) || defined(__ARM_NEON__)
2747+
2748+#if defined( _MSC_VER ) && !defined(__clang__)
2749+
2750+ #define STBIR_PROFILE_FUNC() _ReadStatusReg(ARM64_CNTVCT)
2751+
2752+#else
2753+
// Default ARM profiling clock for non-MSVC compilers: reads the cntvct_el0
// system register with `mrs` and returns its 64-bit value.
static stbir__inline stbir_uint64 STBIR_PROFILE_FUNC()
{
  stbir_uint64 tsc;
  asm volatile("mrs %0, cntvct_el0" : "=r" (tsc));
  return tsc;
}
2760+
2761+#endif
2762+
2763+#else // x64, arm
2764+
2765+#error Unknown platform for profiling.
2766+
2767+#endif // x64, arm
2768+
2769+#endif // STBIR_PROFILE_FUNC
2770+
// Extra trailing parameter / argument that exists only in profiling builds.
// The GET form goes in a parameter list, the SET form in the matching call.
#define STBIR_ONLY_PROFILE_GET_SPLIT_INFO ,stbir__per_split_info * split_info
#define STBIR_ONLY_PROFILE_SET_SPLIT_INFO ,split_info

#define STBIR_ONLY_PROFILE_BUILD_GET_INFO ,stbir__info * profile_info
#define STBIR_ONLY_PROFILE_BUILD_SET_INFO ,profile_info

// super light-weight micro profiler
//
// START_ll opens a `{` scope that END_ll closes, so the two must be paired in the
// same block with the same `wh` name. START records the clock and redirects
// info->current_zone_excluded_ptr to a local accumulator; END adds the zone's
// elapsed time minus that accumulator to info->profile.named.<wh>, adds the full
// elapsed time to the parent's accumulator, and restores the parent pointer.
// FIRST_START additionally zeroes every profile counter and points the excluded
// accumulator at profile.named.total before starting the zone.
// CLEAR_EXTRAS zeroes the counters of entries 1..num-1 of an info array.
#define STBIR_PROFILE_START_ll( info, wh ) { stbir_uint64 wh##thiszonetime = STBIR_PROFILE_FUNC(); stbir_uint64 * wh##save_parent_excluded_ptr = info->current_zone_excluded_ptr; stbir_uint64 wh##current_zone_excluded = 0; info->current_zone_excluded_ptr = &wh##current_zone_excluded;
#define STBIR_PROFILE_END_ll( info, wh ) wh##thiszonetime = STBIR_PROFILE_FUNC() - wh##thiszonetime; info->profile.named.wh += wh##thiszonetime - wh##current_zone_excluded; *wh##save_parent_excluded_ptr += wh##thiszonetime; info->current_zone_excluded_ptr = wh##save_parent_excluded_ptr; }
#define STBIR_PROFILE_FIRST_START_ll( info, wh ) { int i; info->current_zone_excluded_ptr = &info->profile.named.total; for(i=0;i<STBIR__ARRAY_SIZE(info->profile.array);i++) info->profile.array[i]=0; } STBIR_PROFILE_START_ll( info, wh );
#define STBIR_PROFILE_CLEAR_EXTRAS_ll( info, num ) { int extra; for(extra=1;extra<(num);extra++) { int i; for(i=0;i<STBIR__ARRAY_SIZE((info)->profile.array);i++) (info)[extra].profile.array[i]=0; } }

// for thread data (these expect `split_info` and `split_count` to be in scope)
#define STBIR_PROFILE_START( wh ) STBIR_PROFILE_START_ll( split_info, wh )
#define STBIR_PROFILE_END( wh ) STBIR_PROFILE_END_ll( split_info, wh )
#define STBIR_PROFILE_FIRST_START( wh ) STBIR_PROFILE_FIRST_START_ll( split_info, wh )
#define STBIR_PROFILE_CLEAR_EXTRAS() STBIR_PROFILE_CLEAR_EXTRAS_ll( split_info, split_count )

// for build data (these expect `profile_info` to be in scope)
#define STBIR_PROFILE_BUILD_START( wh ) STBIR_PROFILE_START_ll( profile_info, wh )
#define STBIR_PROFILE_BUILD_END( wh ) STBIR_PROFILE_END_ll( profile_info, wh )
#define STBIR_PROFILE_BUILD_FIRST_START( wh ) STBIR_PROFILE_FIRST_START_ll( profile_info, wh )
#define STBIR_PROFILE_BUILD_CLEAR( info ) { int i; for(i=0;i<STBIR__ARRAY_SIZE(info->profile.array);i++) info->profile.array[i]=0; }
2794+
2795+#else // no profile
2796+
// Profiling disabled: every profiling hook expands to nothing, so the extra
// parameters vanish from signatures and the zone markers generate no code.
#define STBIR_ONLY_PROFILE_GET_SPLIT_INFO
#define STBIR_ONLY_PROFILE_SET_SPLIT_INFO

#define STBIR_ONLY_PROFILE_BUILD_GET_INFO
#define STBIR_ONLY_PROFILE_BUILD_SET_INFO

#define STBIR_PROFILE_START( wh )
#define STBIR_PROFILE_END( wh )
#define STBIR_PROFILE_FIRST_START( wh )
#define STBIR_PROFILE_CLEAR_EXTRAS( )

#define STBIR_PROFILE_BUILD_START( wh )
#define STBIR_PROFILE_BUILD_END( wh )
#define STBIR_PROFILE_BUILD_FIRST_START( wh )
#define STBIR_PROFILE_BUILD_CLEAR( info )
2812+
2813+#endif // stbir_profile
2814+
// Default ceil/floor helpers; define STBIR_CEILF (and STBIR_FLOORF) before
// including this file to supply your own.
// NOTE(review): an identifier that is not defined evaluates to 0 inside #if, so
// on compilers that do not define _MSC_VER the `_MSC_VER <= 1200` test is true
// and the double-precision ceil()/floor() fallback below is the path taken,
// not ceilf()/floorf().
#ifndef STBIR_CEILF
#include <math.h>
#if _MSC_VER <= 1200 // support VC6 for Sean
#define STBIR_CEILF(x) ((float)ceil((float)(x)))
#define STBIR_FLOORF(x) ((float)floor((float)(x)))
#else
#define STBIR_CEILF(x) ceilf(x)
#define STBIR_FLOORF(x) floorf(x)
#endif
#endif
2825+
2826+#ifndef STBIR_MEMCPY
2827+// For memcpy
2828+#include <string.h>
2829+#define STBIR_MEMCPY( dest, src, len ) memcpy( dest, src, len )
2830+#endif
2831+
2832+#ifndef STBIR_SIMD
2833+
// memcpy that is specifically, intentionally overlapping (src is smaller than dest, so it
// can be a normal forward copy, bytes is divisible by 4 and bytes is greater than or equal
// to the diff between dest and src).
//
// Scalar version: when dest is at least 8 bytes past src, whole 64-bit words are
// copied first and any remainder is finished 4 bytes at a time; otherwise the whole
// copy is done 4 bytes at a time. Both loops are do/while, so they always execute
// at least once: the preconditions above are what keep that from writing past `bytes`.
static void stbir_overlapping_memcpy( void * dest, void const * src, size_t bytes )
{
  char STBIR_SIMD_STREAMOUT_PTR (*) sd = (char*) src;
  char STBIR_SIMD_STREAMOUT_PTR( * ) s_end = ((char*) src) + bytes;
  // byte distance from a source address to the matching destination address
  ptrdiff_t ofs_to_dest = (char*)dest - (char*)src;

  if ( ofs_to_dest >= 8 ) // is the overlap more than 8 away?
  {
    // end of the part of the source that is a whole number of 8-byte words
    char STBIR_SIMD_STREAMOUT_PTR( * ) s_end8 = ((char*) src) + (bytes&~7);
    STBIR_NO_UNROLL_LOOP_START
    do
    {
      STBIR_NO_UNROLL(sd);
      *(stbir_uint64*)( sd + ofs_to_dest ) = *(stbir_uint64*) sd;
      sd += 8;
    } while ( sd < s_end8 );

    // bytes was a multiple of 8: nothing left for the 4-byte tail
    if ( sd == s_end )
      return;
  }

  // 4-byte tail (or the whole copy when dest is closer than 8 bytes to src)
  STBIR_NO_UNROLL_LOOP_START
  do
  {
    STBIR_NO_UNROLL(sd);
    *(int*)( sd + ofs_to_dest ) = *(int*) sd;
    sd += 4;
  } while ( sd < s_end );
}
2866+
2867+#endif
2868+
// Trapezoid kernel: weight 1 on the flat top (|x| <= 0.5 - scale/2), weight 0
// outside the support (|x| >= 0.5 + scale/2), and a linear ramp in between.
// `scale` must be <= 1 (asserted); `user_data` is ignored.
static float stbir__filter_trapezoid(float x, float scale, void * user_data)
{
  float half_width = scale / 2;
  float outer_edge = 0.5f + half_width;
  float inner_edge;

  STBIR_ASSERT(scale <= 1);
  STBIR__UNUSED(user_data);

  // the kernel is symmetric, so work with the distance from the center
  if ( x < 0.0f )
    x = -x;

  if ( x >= outer_edge )
    return 0.0f;

  inner_edge = 0.5f - half_width;
  if ( x <= inner_edge )
    return 1.0f;

  return (outer_edge - x) / scale;
}
2889+
// Support (half-width) of the trapezoid kernel for a given scale: 0.5 + scale/2.
// `user_data` is ignored.
static float stbir__support_trapezoid(float scale, void * user_data)
{
  float support = 0.5f + scale / 2.0f;
  STBIR__UNUSED(user_data);
  return support;
}
2895+
// Triangle (tent) kernel: 1 - |x| for |x| <= 1, otherwise 0.
// `s` and `user_data` are ignored.
static float stbir__filter_triangle(float x, float s, void * user_data)
{
  STBIR__UNUSED(user_data);
  STBIR__UNUSED(s);

  // symmetric kernel: fold negative offsets onto the positive side
  if ( x < 0.0f )
    x = -x;

  return ( x <= 1.0f ) ? ( 1.0f - x ) : 0.0f;
}
2908+
// Point-sample kernel: every sample gets weight 1, whatever the arguments.
static float stbir__filter_point(float x, float s, void * user_data)
{
  STBIR__UNUSED(user_data);
  STBIR__UNUSED(s);
  STBIR__UNUSED(x);

  return 1.0f;
}
2917+
// Piecewise-cubic kernel with support 2: one polynomial for |x| < 1, another
// for 1 <= |x| < 2, and 0 beyond. `s` and `user_data` are ignored.
static float stbir__filter_cubic(float x, float s, void * user_data)
{
  STBIR__UNUSED(user_data);
  STBIR__UNUSED(s);

  // symmetric kernel: fold negative offsets onto the positive side
  if ( x < 0.0f )
    x = -x;

  // outside the support (also where a NaN ends up, as no comparison holds)
  if ( !( x < 2.0f ) )
    return (0.0f);

  if ( x < 1.0f )
    return (4.0f + x*x*(3.0f*x - 6.0f))/6.0f;

  return (8.0f + x*(-12.0f + x*(6.0f - x)))/6.0f;
}
2932+
// Catmull-Rom kernel with support 2: one polynomial for |x| < 1, another for
// 1 <= |x| < 2, and 0 beyond. `s` and `user_data` are ignored.
static float stbir__filter_catmullrom(float x, float s, void * user_data)
{
  STBIR__UNUSED(user_data);
  STBIR__UNUSED(s);

  // symmetric kernel: fold negative offsets onto the positive side
  if ( x < 0.0f )
    x = -x;

  // outside the support (also where a NaN ends up, as no comparison holds)
  if ( !( x < 2.0f ) )
    return (0.0f);

  if ( x < 1.0f )
    return 1.0f - x*x*(2.5f - 1.5f*x);

  return 2.0f - x*(4.0f + x*(0.5f*x - 2.5f));
}
2947+
// Mitchell kernel with support 2: one polynomial for |x| < 1, another for
// 1 <= |x| < 2, and 0 beyond. `s` and `user_data` are ignored.
static float stbir__filter_mitchell(float x, float s, void * user_data)
{
  STBIR__UNUSED(user_data);
  STBIR__UNUSED(s);

  // symmetric kernel: fold negative offsets onto the positive side
  if ( x < 0.0f )
    x = -x;

  // outside the support (also where a NaN ends up, as no comparison holds)
  if ( !( x < 2.0f ) )
    return (0.0f);

  if ( x < 1.0f )
    return (16.0f + x*x*(21.0f * x - 36.0f))/18.0f;

  return (32.0f + x*(-60.0f + x*(36.0f - 7.0f*x)))/18.0f;
}
2962+
// Support callback that always reports a half-width of 0.5; both arguments are
// ignored.
static float stbir__support_zeropoint5(float s, void * user_data)
{
  STBIR__UNUSED(user_data);
  STBIR__UNUSED(s);
  return 0.5f;
}
2969+
// Support callback that always reports a half-width of 1; both arguments are
// ignored.
static float stbir__support_one(float s, void * user_data)
{
  STBIR__UNUSED(user_data);
  STBIR__UNUSED(s);
  return 1;
}
2976+
// Support callback that always reports a half-width of 2; both arguments are
// ignored.
static float stbir__support_two(float s, void * user_data)
{
  STBIR__UNUSED(user_data);
  STBIR__UNUSED(s);
  return 2;
}
2983+
2984+// This is the maximum number of input samples that can affect an output sample
2985+// with the given filter from the output pixel's perspective
2986+static int stbir__get_filter_pixel_width(stbir__support_callback * support, float scale, void * user_data)
2987+{
2988+ STBIR_ASSERT(support != 0);
2989+
2990+ if ( scale >= ( 1.0f-stbir__small_float ) ) // upscale
2991+ return (int)STBIR_CEILF(support(1.0f/scale,user_data) * 2.0f);
2992+ else
2993+ return (int)STBIR_CEILF(support(scale,user_data) * 2.0f / scale);
2994+}
2995+
2996+// this is how many coefficents per run of the filter (which is different
2997+// from the filter_pixel_width depending on if we are scattering or gathering)
2998+static int stbir__get_coefficient_width(stbir__sampler * samp, int is_gather, void * user_data)
2999+{
3000+ float scale = samp->scale_info.scale;
3001+ stbir__support_callback * support = samp->filter_support;
3002+
3003+ switch( is_gather )
3004+ {
3005+ case 1:
3006+ return (int)STBIR_CEILF(support(1.0f / scale, user_data) * 2.0f);
3007+ case 2:
3008+ return (int)STBIR_CEILF(support(scale, user_data) * 2.0f / scale);
3009+ case 0:
3010+ return (int)STBIR_CEILF(support(scale, user_data) * 2.0f);
3011+ default:
3012+ STBIR_ASSERT( (is_gather >= 0 ) && (is_gather <= 2 ) );
3013+ return 0;
3014+ }
3015+}
3016+
3017+static int stbir__get_contributors(stbir__sampler * samp, int is_gather)
3018+{
3019+ if (is_gather)
3020+ return samp->scale_info.output_sub_size;
3021+ else
3022+ return (samp->scale_info.input_full_size + samp->filter_pixel_margin * 2);
3023+}
3024+
// Table entry for the zero edge mode; ignores its arguments and returns 0.
// The original comment marks the return as not expected to be reached.
static int stbir__edge_zero_full( int n, int max )
{
  STBIR__UNUSED(max);
  STBIR__UNUSED(n);
  return 0; // NOTREACHED
}
3031+
// Clamp edge mode: indices below 0 map to 0, indices at or beyond `max` map to
// max - 1, everything else is returned unchanged. The lower bound is tested first.
static int stbir__edge_clamp_full( int n, int max )
{
  return ( n < 0 ) ? 0
       : ( n >= max ) ? ( max - 1 )
       : n; // NOTREACHED
}
3042+
// Reflect edge mode.
//   n in [0, max):        returned unchanged
//   n in (-max, 0):       -n
//   n <= -max:            max - 1
//   n in [max, 2*max):    2*max - n - 1
//   n >= 2*max:           0
static int stbir__edge_reflect_full( int n, int max )
{
  int twice_max;

  if ( n >= 0 && n < max )
    return n; // NOTREACHED

  if ( n < 0 )
    return ( n > -max ) ? -n : ( max - 1 );

  // here n >= max
  twice_max = max * 2;
  return ( n >= twice_max ) ? 0 : ( twice_max - n - 1 );
}
3064+
// Wrap edge mode: maps any index onto [0, max) by wrapping around, so that
// negative indices count back from the end (-1 -> max - 1).
static int stbir__edge_wrap_full( int n, int max )
{
  int rem;

  if ( n >= 0 )
    return (n % max);

  // remainder of the magnitude, then mirrored unless it lands exactly on 0
  rem = (-n) % max;
  return ( rem != 0 ) ? ( max - rem ) : rem;
}
3079+
// Signature shared by the out-of-range index remappers above.
typedef int stbir__edge_wrap_func( int n, int max );
// Dispatch table indexed by a stbir_edge value (see stbir__edge_wrap); the entry
// order must therefore match the enum order named in the trailing comments.
static stbir__edge_wrap_func * stbir__edge_wrap_slow[] =
{
  stbir__edge_clamp_full,    // STBIR_EDGE_CLAMP
  stbir__edge_reflect_full,  // STBIR_EDGE_REFLECT
  stbir__edge_wrap_full,     // STBIR_EDGE_WRAP
  stbir__edge_zero_full,     // STBIR_EDGE_ZERO
};
3088+
3089+stbir__inline static int stbir__edge_wrap(stbir_edge edge, int n, int max)
3090+{
3091+ // avoid per-pixel switch
3092+ if (n >= 0 && n < max)
3093+ return n;
3094+ return stbir__edge_wrap_slow[edge]( n, max );
3095+}
3096+
// Margin runs whose pixels lie within this many pixels of the main run are merged
// into it instead of being kept as a separate span.
#define STBIR__MERGE_RUNS_PIXEL_THRESHOLD 16

// get information on the extents of a sampler
//
// Scans the gather contributors of `samp` to find the lowest and highest input
// pixel referenced, then fills `scanline_extents` with:
//   edge_sizes[0], edge_sizes[1] - how many pixels hang over the left / right edge
//   spans[0]                     - the run of real input pixels to read
//   spans[1]                     - an optional second run (empty by default, n1 < n0)
// For edge modes other than STBIR_EDGE_ZERO the overhanging pixels are mapped
// back into the input through stbir__edge_wrap; the resulting run is either
// merged into spans[0] or stored as a second span.
static void stbir__get_extents( stbir__sampler * samp, stbir__extents * scanline_extents )
{
  int j, stop;
  int left_margin, right_margin;
  // 0x7fffffff / -0x7fffffff act as "nothing seen yet" sentinels
  int min_n = 0x7fffffff, max_n = -0x7fffffff;
  int min_left = 0x7fffffff, max_left = -0x7fffffff;
  int min_right = 0x7fffffff, max_right = -0x7fffffff;
  stbir_edge edge = samp->edge;
  stbir__contributors* contributors = samp->contributors;
  int output_sub_size = samp->scale_info.output_sub_size;
  int input_full_size = samp->scale_info.input_full_size;
  int filter_pixel_margin = samp->filter_pixel_margin;

  STBIR_ASSERT( samp->is_gather );

  // find the smallest first-contributor index, scanning forward from the start
  stop = output_sub_size;
  for (j = 0; j < stop; j++ )
  {
    STBIR_ASSERT( contributors[j].n1 >= contributors[j].n0 );
    if ( contributors[j].n0 < min_n )
    {
      min_n = contributors[j].n0;
      stop = j + filter_pixel_margin; // if we find a new min, only scan another filter width
      if ( stop > output_sub_size ) stop = output_sub_size;
    }
  }

  // find the largest last-contributor index, scanning backward from the end
  stop = 0;
  for (j = output_sub_size - 1; j >= stop; j-- )
  {
    STBIR_ASSERT( contributors[j].n1 >= contributors[j].n0 );
    if ( contributors[j].n1 > max_n )
    {
      max_n = contributors[j].n1;
      stop = j - filter_pixel_margin; // if we find a new max, only scan another filter width
      if (stop<0) stop = 0;
    }
  }

  STBIR_ASSERT( scanline_extents->conservative.n0 <= min_n );
  STBIR_ASSERT( scanline_extents->conservative.n1 >= max_n );

  // now calculate how much into the margins we really read
  left_margin = 0;
  if ( min_n < 0 )
  {
    left_margin = -min_n;
    min_n = 0;
  }

  right_margin = 0;
  if ( max_n >= input_full_size )
  {
    right_margin = max_n - input_full_size + 1;
    max_n = input_full_size - 1;
  }

  // margin pixel extents (how many pixels we hang over each edge)
  scanline_extents->edge_sizes[0] = left_margin;
  scanline_extents->edge_sizes[1] = right_margin;

  // pixels read from the input
  scanline_extents->spans[0].n0 = min_n;
  scanline_extents->spans[0].n1 = max_n;
  scanline_extents->spans[0].pixel_offset_for_input = min_n;

  // default to no other input range
  scanline_extents->spans[1].n0 = 0;
  scanline_extents->spans[1].n1 = -1;
  scanline_extents->spans[1].pixel_offset_for_input = 0;

  // don't have to do edge calc for zero clamp
  if ( edge == STBIR_EDGE_ZERO )
    return;

  // convert margin pixels to the pixels within the input (min and max)
  for( j = -left_margin ; j < 0 ; j++ )
  {
    int p = stbir__edge_wrap( edge, j, input_full_size );
    if ( p < min_left )
      min_left = p;
    if ( p > max_left )
      max_left = p;
  }

  for( j = input_full_size ; j < (input_full_size + right_margin) ; j++ )
  {
    int p = stbir__edge_wrap( edge, j, input_full_size );
    if ( p < min_right )
      min_right = p;
    if ( p > max_right )
      max_right = p;
  }

  // merge the left margin pixel region if it connects within
  // STBIR__MERGE_RUNS_PIXEL_THRESHOLD pixels of the main pixel region
  if ( min_left != 0x7fffffff )
  {
    if ( ( ( min_left <= min_n ) && ( ( max_left + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= min_n ) ) ||
         ( ( min_n <= min_left ) && ( ( max_n + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= max_left ) ) )
    {
      scanline_extents->spans[0].n0 = min_n = stbir__min( min_n, min_left );
      scanline_extents->spans[0].n1 = max_n = stbir__max( max_n, max_left );
      scanline_extents->spans[0].pixel_offset_for_input = min_n;
      left_margin = 0; // marks the left run as merged
    }
  }

  // merge the right margin pixel region if it connects within
  // STBIR__MERGE_RUNS_PIXEL_THRESHOLD pixels of the main pixel region
  if ( min_right != 0x7fffffff )
  {
    if ( ( ( min_right <= min_n ) && ( ( max_right + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= min_n ) ) ||
         ( ( min_n <= min_right ) && ( ( max_n + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= max_right ) ) )
    {
      scanline_extents->spans[0].n0 = min_n = stbir__min( min_n, min_right );
      scanline_extents->spans[0].n1 = max_n = stbir__max( max_n, max_right );
      scanline_extents->spans[0].pixel_offset_for_input = min_n;
      right_margin = 0; // marks the right run as merged
    }
  }

  STBIR_ASSERT( scanline_extents->conservative.n0 <= min_n );
  STBIR_ASSERT( scanline_extents->conservative.n1 >= max_n );

  // you get two ranges when you have the WRAP edge mode and you are doing just a piece of the resize
  //   so you need to get a second run of pixels from the opposite side of the scanline (which you
  //   wouldn't need except for WRAP)


  // if we can't merge the min_left range, add it as a second range
  if ( ( left_margin ) && ( min_left != 0x7fffffff ) )
  {
    stbir__span * newspan = scanline_extents->spans + 1;
    STBIR_ASSERT( right_margin == 0 );
    if ( min_left < scanline_extents->spans[0].n0 )
    {
      // the new run reads earlier input pixels than the main run: move the main
      // run to slot 1 and put the new run in slot 0
      scanline_extents->spans[1].pixel_offset_for_input = scanline_extents->spans[0].n0;
      scanline_extents->spans[1].n0 = scanline_extents->spans[0].n0;
      scanline_extents->spans[1].n1 = scanline_extents->spans[0].n1;
      --newspan;
    }
    newspan->pixel_offset_for_input = min_left;
    newspan->n0 = -left_margin;
    newspan->n1 = ( max_left - min_left ) - left_margin;
    scanline_extents->edge_sizes[0] = 0;  // don't need to copy the left margin, since we are directly decoding into the margin
  }
  // if we can't merge the min_right range, add it as a second range
  else
  if ( ( right_margin ) && ( min_right != 0x7fffffff ) )
  {
    stbir__span * newspan = scanline_extents->spans + 1;
    if ( min_right < scanline_extents->spans[0].n0 )
    {
      // the new run reads earlier input pixels than the main run: move the main
      // run to slot 1 and put the new run in slot 0
      scanline_extents->spans[1].pixel_offset_for_input = scanline_extents->spans[0].n0;
      scanline_extents->spans[1].n0 = scanline_extents->spans[0].n0;
      scanline_extents->spans[1].n1 = scanline_extents->spans[0].n1;
      --newspan;
    }
    newspan->pixel_offset_for_input = min_right;
    // note: both bounds are derived from spans[1].n1, whichever slot newspan points at
    newspan->n0 = scanline_extents->spans[1].n1 + 1;
    newspan->n1 = scanline_extents->spans[1].n1 + 1 + ( max_right - min_right );
    scanline_extents->edge_sizes[1] = 0;  // don't need to copy the right margin, since we are directly decoding into the margin
  }

  // sort the spans into write output order
  if ( ( scanline_extents->spans[1].n1 > scanline_extents->spans[1].n0 ) && ( scanline_extents->spans[0].n0 > scanline_extents->spans[1].n0 ) )
  {
    stbir__span tspan = scanline_extents->spans[0];
    scanline_extents->spans[0] = scanline_extents->spans[1];
    scanline_extents->spans[1] = tspan;
  }
}
3271+
3272+static void stbir__calculate_in_pixel_range( int * first_pixel, int * last_pixel, float out_pixel_center, float out_filter_radius, float inv_scale, float out_shift, int input_size, stbir_edge edge )
3273+{
3274+ int first, last;
3275+ float out_pixel_influence_lowerbound = out_pixel_center - out_filter_radius;
3276+ float out_pixel_influence_upperbound = out_pixel_center + out_filter_radius;
3277+
3278+ float in_pixel_influence_lowerbound = (out_pixel_influence_lowerbound + out_shift) * inv_scale;
3279+ float in_pixel_influence_upperbound = (out_pixel_influence_upperbound + out_shift) * inv_scale;
3280+
3281+ first = (int)(STBIR_FLOORF(in_pixel_influence_lowerbound + 0.5f));
3282+ last = (int)(STBIR_FLOORF(in_pixel_influence_upperbound - 0.5f));
3283+ if ( last < first ) last = first; // point sample mode can span a value *right* at 0.5, and cause these to cross
3284+
3285+ if ( edge == STBIR_EDGE_WRAP )
3286+ {
3287+ if ( first < -input_size )
3288+ first = -input_size;
3289+ if ( last >= (input_size*2))
3290+ last = (input_size*2) - 1;
3291+ }
3292+
3293+ *first_pixel = first;
3294+ *last_pixel = last;
3295+}
3296+
// Builds the gather contributor ranges and filter coefficients for the upsample
// case. For each output pixel n it finds the input pixel range under the filter,
// evaluates `kernel` at each input pixel center, and writes the weights into the
// next `coefficient_width`-sized slot of `coefficient_group`, recording the
// inclusive input range in the matching `contributors` entry. Leading and
// trailing near-zero weights are trimmed from the recorded range.
// When the scale is rational and its numerator is smaller than num_contributors,
// only the first `numerator` output pixels are processed here.
static void stbir__calculate_coefficients_for_gather_upsample( float out_filter_radius, stbir__kernel_callback * kernel, stbir__scale_info * scale_info, int num_contributors, stbir__contributors* contributors, float* coefficient_group, int coefficient_width, stbir_edge edge, void * user_data )
{
  int n, end;
  float inv_scale = scale_info->inv_scale;
  float out_shift = scale_info->pixel_shift;
  int input_size = scale_info->input_full_size;
  int numerator = scale_info->scale_numerator;
  int polyphase = ( ( scale_info->scale_is_rational ) && ( numerator < num_contributors ) );

  // Looping through out pixels
  end = num_contributors; if ( polyphase ) end = numerator;
  for (n = 0; n < end; n++)
  {
    int i;
    int last_non_zero;
    float out_pixel_center = (float)n + 0.5f;
    float in_center_of_out = (out_pixel_center + out_shift) * inv_scale;

    int in_first_pixel, in_last_pixel;

    stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, out_pixel_center, out_filter_radius, inv_scale, out_shift, input_size, edge );

    // make sure we never generate a range larger than our precalculated coeff width
    //   this only happens in point sample mode, but it's a good safe thing to do anyway
    if ( ( in_last_pixel - in_first_pixel + 1 ) > coefficient_width )
      in_last_pixel = in_first_pixel + coefficient_width - 1;

    last_non_zero = -1;
    for (i = 0; i <= in_last_pixel - in_first_pixel; i++)
    {
      float in_pixel_center = (float)(i + in_first_pixel) + 0.5f;
      float coeff = kernel(in_center_of_out - in_pixel_center, inv_scale, user_data);

      // kill denormals
      if ( ( ( coeff < stbir__small_float ) && ( coeff > -stbir__small_float ) ) )
      {
        if ( i == 0 )  // if we're at the front, just eat zero contributors
        {
          STBIR_ASSERT ( ( in_last_pixel - in_first_pixel ) != 0 ); // there should be at least one contrib
          // drop the leading pixel: advance the range start and redo slot 0
          // (i-- cancels the loop's i++, and the loop bound shrinks with in_first_pixel)
          ++in_first_pixel;
          i--;
          continue;
        }
        coeff = 0;  // make sure is fully zero (should keep denormals away)
      }
      else
        last_non_zero = i;

      coefficient_group[i] = coeff;
    }

    in_last_pixel = last_non_zero+in_first_pixel; // kills trailing zeros
    contributors->n0 = in_first_pixel;
    contributors->n1 = in_last_pixel;

    STBIR_ASSERT(contributors->n1 >= contributors->n0);

    // advance to the next output pixel's contributor entry and coefficient slot
    ++contributors;
    coefficient_group += coefficient_width;
  }
}
3358+
3359+static void stbir__insert_coeff( stbir__contributors * contribs, float * coeffs, int new_pixel, float new_coeff, int max_width )
3360+{
3361+ if ( new_pixel <= contribs->n1 ) // before the end
3362+ {
3363+ if ( new_pixel < contribs->n0 ) // before the front?
3364+ {
3365+ if ( ( contribs->n1 - new_pixel + 1 ) <= max_width )
3366+ {
3367+ int j, o = contribs->n0 - new_pixel;
3368+ for ( j = contribs->n1 - contribs->n0 ; j <= 0 ; j-- )
3369+ coeffs[ j + o ] = coeffs[ j ];
3370+ for ( j = 1 ; j < o ; j-- )
3371+ coeffs[ j ] = coeffs[ 0 ];
3372+ coeffs[ 0 ] = new_coeff;
3373+ contribs->n0 = new_pixel;
3374+ }
3375+ }
3376+ else
3377+ {
3378+ coeffs[ new_pixel - contribs->n0 ] += new_coeff;
3379+ }
3380+ }
3381+ else
3382+ {
3383+ if ( ( new_pixel - contribs->n0 + 1 ) <= max_width )
3384+ {
3385+ int j, e = new_pixel - contribs->n0;
3386+ for( j = ( contribs->n1 - contribs->n0 ) + 1 ; j < e ; j++ ) // clear in-betweens coeffs if there are any
3387+ coeffs[j] = 0;
3388+
3389+ coeffs[ e ] = new_coeff;
3390+ contribs->n1 = new_pixel;
3391+ }
3392+ }
3393+}
3394+
// Computes the inclusive range of output pixels [*first_pixel, *last_pixel] whose
// centers fall inside the footprint of one input pixel.
// The footprint (center +/- radius) is multiplied by `scale` and shifted by
// out_shift to move it into output space, rounded to pixel indices, and clamped
// to [0, out_size - 1]. The range may come back empty (first > last).
static void stbir__calculate_out_pixel_range( int * first_pixel, int * last_pixel, float in_pixel_center, float in_pixels_radius, float scale, float out_shift, int out_size )
{
  // footprint in input space
  float in_lo = in_pixel_center - in_pixels_radius;
  float in_hi = in_pixel_center + in_pixels_radius;

  // the same footprint in output space
  float out_lo = in_lo * scale - out_shift;
  float out_hi = in_hi * scale - out_shift;

  int lo = (int)(STBIR_FLOORF(out_lo + 0.5f));
  int hi = (int)(STBIR_FLOORF(out_hi - 0.5f));

  lo = ( lo < 0 ) ? 0 : lo;
  hi = ( hi >= out_size ) ? ( out_size - 1 ) : hi;

  *first_pixel = lo;
  *last_pixel = hi;
}
3411+
3412+static void stbir__calculate_coefficients_for_gather_downsample( int start, int end, float in_pixels_radius, stbir__kernel_callback * kernel, stbir__scale_info * scale_info, int coefficient_width, int num_contributors, stbir__contributors * contributors, float * coefficient_group, void * user_data ) // walks input pixels start..end-1 and records, per output pixel, its contributor range (n0..n1) and the kernel weights in its coefficient_width-wide row of coefficient_group
3413+{
3414+ int in_pixel;
3415+ int i;
3416+ int first_out_inited = -1; // highest output pixel index whose contributor entry has been initialized so far
3417+ float scale = scale_info->scale;
3418+ float out_shift = scale_info->pixel_shift;
3419+ int out_size = scale_info->output_sub_size;
3420+ int numerator = scale_info->scale_numerator;
3421+ int polyphase = ( ( scale_info->scale_is_rational ) && ( numerator < out_size ) ); // only the first `numerator` output pixels are computed in this mode
3422+
3423+ STBIR__UNUSED(num_contributors);
3424+
3425+ // Loop through the input pixels
3426+ for (in_pixel = start; in_pixel < end; in_pixel++)
3427+ {
3428+ float in_pixel_center = (float)in_pixel + 0.5f;
3429+ float out_center_of_in = in_pixel_center * scale - out_shift; // this input pixel's center expressed in output space
3430+ int out_first_pixel, out_last_pixel;
3431+
3432+ stbir__calculate_out_pixel_range( &out_first_pixel, &out_last_pixel, in_pixel_center, in_pixels_radius, scale, out_shift, out_size );
3433+
3434+ if ( out_first_pixel > out_last_pixel ) // empty range: this input pixel touches no output pixel
3435+ continue;
3436+
3437+ // clamp or exit if we are using polyphase filtering, and the limit is up
3438+ if ( polyphase )
3439+ {
3440+ // when polyphase, you only have to do coeffs up to the numerator count
3441+ if ( out_first_pixel == numerator )
3442+ break;
3443+
3444+ // don't do any extra work, clamp last pixel at numerator too
3445+ if ( out_last_pixel >= numerator )
3446+ out_last_pixel = numerator - 1;
3447+ }
3448+
3449+ for (i = 0; i <= out_last_pixel - out_first_pixel; i++)
3450+ {
3451+ float out_pixel_center = (float)(i + out_first_pixel) + 0.5f;
3452+ float x = out_pixel_center - out_center_of_in; // distance from the output pixel center to the input pixel center, in output space
3453+ float coeff = kernel(x, scale, user_data) * scale;
3454+
3455+ // kill the coeff if it's too small (avoid denormals)
3456+ if ( ( ( coeff < stbir__small_float ) && ( coeff > -stbir__small_float ) ) )
3457+ coeff = 0.0f;
3458+
3459+ {
3460+ int out = i + out_first_pixel;
3461+ float * coeffs = coefficient_group + out * coefficient_width; // row of coefficients belonging to this output pixel
3462+ stbir__contributors * contribs = contributors + out;
3463+
3464+ // is this the first time this output pixel has been seen? Init it.
3465+ if ( out > first_out_inited )
3466+ {
3467+ STBIR_ASSERT( out == ( first_out_inited + 1 ) ); // ensure we have only advanced one at time
3468+ first_out_inited = out;
3469+ contribs->n0 = in_pixel;
3470+ contribs->n1 = in_pixel;
3471+ coeffs[0] = coeff;
3472+ }
3473+ else
3474+ {
3475+ // insert on end (always in order)
3476+ if ( coeffs[0] == 0.0f ) // if the first coefficient is zero, then zap it for this coeffs
3477+ {
3478+ STBIR_ASSERT( ( in_pixel - contribs->n0 ) == 1 ); // ensure that when we zap, we're at the 2nd pos
3479+ contribs->n0 = in_pixel;
3480+ }
3481+ contribs->n1 = in_pixel;
3482+ STBIR_ASSERT( ( in_pixel - contribs->n0 ) < coefficient_width );
3483+ coeffs[in_pixel - contribs->n0] = coeff;
3484+ }
3485+ }
3486+ }
3487+ }
3488+}
3489+
3490+#ifdef STBIR_RENORMALIZE_IN_FLOAT // selects the accumulator type used when summing and rescaling filter weights below
3491+#define STBIR_RENORM_TYPE float
3492+#else
3493+#define STBIR_RENORM_TYPE double
3494+#endif
3495+
3496+static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter_extent_info* filter_info, stbir__scale_info * scale_info, int num_contributors, stbir__contributors* contributors, float * coefficient_group, int coefficient_width ) // normalizes each contributor's weights to sum to 1, replicates polyphase rows, folds or drops out-of-bounds taps according to edge, trims trailing zeros, and records lowest/highest/widest in filter_info
3497+{
3498+ int input_size = scale_info->input_full_size;
3499+ int input_last_n1 = input_size - 1; // last valid input pixel index
3500+ int n, end;
3501+ int lowest = 0x7fffffff; // running min of n0 over non-empty contributors
3502+ int highest = -0x7fffffff; // running max of n1 over non-empty contributors
3503+ int widest = -1; // running max of coefficient count over non-empty contributors
3504+ int numerator = scale_info->scale_numerator;
3505+ int denominator = scale_info->scale_denominator;
3506+ int polyphase = ( ( scale_info->scale_is_rational ) && ( numerator < num_contributors ) );
3507+ float * coeffs;
3508+ stbir__contributors * contribs;
3509+
3510+ // weight all the coeffs for each sample
3511+ coeffs = coefficient_group;
3512+ contribs = contributors;
3513+ end = num_contributors; if ( polyphase ) end = numerator; // in polyphase mode only the first `numerator` rows are normalized here; the rest are copied below
3514+ for (n = 0; n < end; n++)
3515+ {
3516+ int i;
3517+ STBIR_RENORM_TYPE filter_scale, total_filter = 0;
3518+ int e;
3519+
3520+ // add all contribs
3521+ e = contribs->n1 - contribs->n0;
3522+ for( i = 0 ; i <= e ; i++ )
3523+ {
3524+ total_filter += (STBIR_RENORM_TYPE) coeffs[i];
3525+ STBIR_ASSERT( ( coeffs[i] >= -2.0f ) && ( coeffs[i] <= 2.0f ) ); // check for wonky weights
3526+ }
3527+
3528+ // rescale
3529+ if ( ( total_filter < stbir__small_float ) && ( total_filter > -stbir__small_float ) )
3530+ {
3531+ // all coeffs are extremely small, just zero it
3532+ contribs->n1 = contribs->n0;
3533+ coeffs[0] = 0.0f;
3534+ }
3535+ else
3536+ {
3537+ // if the total isn't 1.0, rescale everything
3538+ if ( ( total_filter < (1.0f-stbir__small_float) ) || ( total_filter > (1.0f+stbir__small_float) ) )
3539+ {
3540+ filter_scale = ((STBIR_RENORM_TYPE)1.0) / total_filter;
3541+
3542+ // scale them all
3543+ for (i = 0; i <= e; i++)
3544+ coeffs[i] = (float) ( coeffs[i] * filter_scale );
3545+ }
3546+ }
3547+ ++contribs;
3548+ coeffs += coefficient_width;
3549+ }
3550+
3551+ // if we have a rational for the scale, we can exploit the polyphaseness to not calculate
3552+ // most of the coefficients, so we copy them here
3553+ if ( polyphase )
3554+ {
3555+ stbir__contributors * prev_contribs = contributors;
3556+ stbir__contributors * cur_contribs = contributors + numerator;
3557+
3558+ for( n = numerator ; n < num_contributors ; n++ )
3559+ {
3560+ cur_contribs->n0 = prev_contribs->n0 + denominator; // row n reuses row n-numerator, shifted by `denominator` input pixels
3561+ cur_contribs->n1 = prev_contribs->n1 + denominator;
3562+ ++cur_contribs;
3563+ ++prev_contribs;
3564+ }
3565+ stbir_overlapping_memcpy( coefficient_group + numerator * coefficient_width, coefficient_group, ( num_contributors - numerator ) * coefficient_width * sizeof( coeffs[ 0 ] ) );
3566+ }
3567+
3568+ coeffs = coefficient_group;
3569+ contribs = contributors;
3570+
3571+ for (n = 0; n < num_contributors; n++)
3572+ {
3573+ int i;
3574+
3575+ // in zero edge mode, just remove out of bounds contribs completely (since their weights are accounted for now)
3576+ if ( edge == STBIR_EDGE_ZERO )
3577+ {
3578+ // shrink the right side if necessary
3579+ if ( contribs->n1 > input_last_n1 )
3580+ contribs->n1 = input_last_n1;
3581+
3582+ // shrink the left side
3583+ if ( contribs->n0 < 0 )
3584+ {
3585+ int j, left, skips = 0;
3586+
3587+ skips = -contribs->n0; // number of leading out-of-bounds taps to discard
3588+ contribs->n0 = 0;
3589+
3590+ // now move down the weights
3591+ left = contribs->n1 - contribs->n0 + 1;
3592+ if ( left > 0 )
3593+ {
3594+ for( j = 0 ; j < left ; j++ )
3595+ coeffs[ j ] = coeffs[ j + skips ];
3596+ }
3597+ }
3598+ }
3599+ else if ( ( edge == STBIR_EDGE_CLAMP ) || ( edge == STBIR_EDGE_REFLECT ) )
3600+ {
3601+ // for clamp and reflect, calculate the true inbounds position (based on edge type) and just add that to the existing weight
3602+
3603+ // right hand side first
3604+ if ( contribs->n1 > input_last_n1 )
3605+ {
3606+ int start = contribs->n0;
3607+ int endi = contribs->n1;
3608+ contribs->n1 = input_last_n1;
3609+ for( i = input_size; i <= endi; i++ )
3610+ stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( i, input_size ), coeffs[i-start], coefficient_width );
3611+ }
3612+
3613+ // now check left hand edge
3614+ if ( contribs->n0 < 0 )
3615+ {
3616+ int save_n0;
3617+ float save_n0_coeff;
3618+ float * c = coeffs - ( contribs->n0 + 1 ); // n0 is negative here, so this points at the coeff for input pixel -1
3619+
3620+ // reinsert the coeffs with it reflected or clamped (insert accumulates, if the coeffs exist)
3621+ for( i = -1 ; i > contribs->n0 ; i-- )
3622+ stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( i, input_size ), *c--, coefficient_width );
3623+ save_n0 = contribs->n0;
3624+ save_n0_coeff = c[0]; // save it, since we didn't do the final one (i==n0), because there might be too many coeffs to hold (before we resize)!
3625+
3626+ // now slide all the coeffs down (since we have accumulated them in the positive contribs) and reset the first contrib
3627+ contribs->n0 = 0;
3628+ for(i = 0 ; i <= contribs->n1 ; i++ )
3629+ coeffs[i] = coeffs[i-save_n0];
3630+
3631+ // now that we have shrunk down the contribs, we insert the first one safely
3632+ stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( save_n0, input_size ), save_n0_coeff, coefficient_width );
3633+ }
3634+ }
3635+
3636+ if ( contribs->n0 <= contribs->n1 )
3637+ {
3638+ int diff = contribs->n1 - contribs->n0 + 1;
3639+ while ( diff && ( coeffs[ diff-1 ] == 0.0f ) ) // drop trailing zero weights
3640+ --diff;
3641+
3642+ contribs->n1 = contribs->n0 + diff - 1;
3643+
3644+ if ( contribs->n0 <= contribs->n1 )
3645+ {
3646+ if ( contribs->n0 < lowest )
3647+ lowest = contribs->n0;
3648+ if ( contribs->n1 > highest )
3649+ highest = contribs->n1;
3650+ if ( diff > widest )
3651+ widest = diff;
3652+ }
3653+
3654+ // re-zero out unused coefficients (if any)
3655+ for( i = diff ; i < coefficient_width ; i++ )
3656+ coeffs[i] = 0.0f;
3657+ }
3658+
3659+ ++contribs;
3660+ coeffs += coefficient_width;
3661+ }
3662+ filter_info->lowest = lowest;
3663+ filter_info->highest = highest;
3664+ filter_info->widest = widest;
3665+}
3666+
3667+#undef STBIR_RENORM_TYPE
3668+
3669+static int stbir__pack_coefficients( int num_contributors, stbir__contributors* contributors, float * coefficents, int coefficient_width, int widest, int row0, int row1 ) // repacks the coefficient rows in place from a stride of coefficient_width to a stride of widest, writes a sentinel after the last row, then moves contributors near row1 back so fixed-width reads stay inside the row; returns widest
3670+{
3671+ #define STBIR_MOVE_1( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint32*)(dest))[0] = ((stbir_uint32*)(src))[0]; }
3672+ #define STBIR_MOVE_2( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint64*)(dest))[0] = ((stbir_uint64*)(src))[0]; }
3673+ #ifdef STBIR_SIMD
3674+ #define STBIR_MOVE_4( dest, src ) { stbir__simdf t; STBIR_NO_UNROLL(dest); stbir__simdf_load( t, src ); stbir__simdf_store( dest, t ); }
3675+ #else
3676+ #define STBIR_MOVE_4( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint64*)(dest))[0] = ((stbir_uint64*)(src))[0]; ((stbir_uint64*)(dest))[1] = ((stbir_uint64*)(src))[1]; }
3677+ #endif
3678+
3679+ int row_end = row1 + 1; // one past the last readable input position
3680+ STBIR__UNUSED( row0 ); // only used in an assert
3681+
3682+ if ( coefficient_width != widest ) // rows need compacting; each case below copies `widest` floats per row using 1/2/4-float moves
3683+ {
3684+ float * pc = coefficents; // write cursor (packed layout)
3685+ float * coeffs = coefficents; // read cursor (original layout, stride coefficient_width)
3686+ float * pc_end = coefficents + num_contributors * widest;
3687+ switch( widest )
3688+ {
3689+ case 1:
3690+ STBIR_NO_UNROLL_LOOP_START
3691+ do {
3692+ STBIR_MOVE_1( pc, coeffs );
3693+ ++pc;
3694+ coeffs += coefficient_width;
3695+ } while ( pc < pc_end );
3696+ break;
3697+ case 2:
3698+ STBIR_NO_UNROLL_LOOP_START
3699+ do {
3700+ STBIR_MOVE_2( pc, coeffs );
3701+ pc += 2;
3702+ coeffs += coefficient_width;
3703+ } while ( pc < pc_end );
3704+ break;
3705+ case 3:
3706+ STBIR_NO_UNROLL_LOOP_START
3707+ do {
3708+ STBIR_MOVE_2( pc, coeffs );
3709+ STBIR_MOVE_1( pc+2, coeffs+2 );
3710+ pc += 3;
3711+ coeffs += coefficient_width;
3712+ } while ( pc < pc_end );
3713+ break;
3714+ case 4:
3715+ STBIR_NO_UNROLL_LOOP_START
3716+ do {
3717+ STBIR_MOVE_4( pc, coeffs );
3718+ pc += 4;
3719+ coeffs += coefficient_width;
3720+ } while ( pc < pc_end );
3721+ break;
3722+ case 5:
3723+ STBIR_NO_UNROLL_LOOP_START
3724+ do {
3725+ STBIR_MOVE_4( pc, coeffs );
3726+ STBIR_MOVE_1( pc+4, coeffs+4 );
3727+ pc += 5;
3728+ coeffs += coefficient_width;
3729+ } while ( pc < pc_end );
3730+ break;
3731+ case 6:
3732+ STBIR_NO_UNROLL_LOOP_START
3733+ do {
3734+ STBIR_MOVE_4( pc, coeffs );
3735+ STBIR_MOVE_2( pc+4, coeffs+4 );
3736+ pc += 6;
3737+ coeffs += coefficient_width;
3738+ } while ( pc < pc_end );
3739+ break;
3740+ case 7:
3741+ STBIR_NO_UNROLL_LOOP_START
3742+ do {
3743+ STBIR_MOVE_4( pc, coeffs );
3744+ STBIR_MOVE_2( pc+4, coeffs+4 );
3745+ STBIR_MOVE_1( pc+6, coeffs+6 );
3746+ pc += 7;
3747+ coeffs += coefficient_width;
3748+ } while ( pc < pc_end );
3749+ break;
3750+ case 8:
3751+ STBIR_NO_UNROLL_LOOP_START
3752+ do {
3753+ STBIR_MOVE_4( pc, coeffs );
3754+ STBIR_MOVE_4( pc+4, coeffs+4 );
3755+ pc += 8;
3756+ coeffs += coefficient_width;
3757+ } while ( pc < pc_end );
3758+ break;
3759+ case 9:
3760+ STBIR_NO_UNROLL_LOOP_START
3761+ do {
3762+ STBIR_MOVE_4( pc, coeffs );
3763+ STBIR_MOVE_4( pc+4, coeffs+4 );
3764+ STBIR_MOVE_1( pc+8, coeffs+8 );
3765+ pc += 9;
3766+ coeffs += coefficient_width;
3767+ } while ( pc < pc_end );
3768+ break;
3769+ case 10:
3770+ STBIR_NO_UNROLL_LOOP_START
3771+ do {
3772+ STBIR_MOVE_4( pc, coeffs );
3773+ STBIR_MOVE_4( pc+4, coeffs+4 );
3774+ STBIR_MOVE_2( pc+8, coeffs+8 );
3775+ pc += 10;
3776+ coeffs += coefficient_width;
3777+ } while ( pc < pc_end );
3778+ break;
3779+ case 11:
3780+ STBIR_NO_UNROLL_LOOP_START
3781+ do {
3782+ STBIR_MOVE_4( pc, coeffs );
3783+ STBIR_MOVE_4( pc+4, coeffs+4 );
3784+ STBIR_MOVE_2( pc+8, coeffs+8 );
3785+ STBIR_MOVE_1( pc+10, coeffs+10 );
3786+ pc += 11;
3787+ coeffs += coefficient_width;
3788+ } while ( pc < pc_end );
3789+ break;
3790+ case 12:
3791+ STBIR_NO_UNROLL_LOOP_START
3792+ do {
3793+ STBIR_MOVE_4( pc, coeffs );
3794+ STBIR_MOVE_4( pc+4, coeffs+4 );
3795+ STBIR_MOVE_4( pc+8, coeffs+8 );
3796+ pc += 12;
3797+ coeffs += coefficient_width;
3798+ } while ( pc < pc_end );
3799+ break;
3800+ default: // generic width: copy in groups of 4 floats, then finish the remainder one float at a time
3801+ STBIR_NO_UNROLL_LOOP_START
3802+ do {
3803+ float * copy_end = pc + widest - 4;
3804+ float * c = coeffs;
3805+ do {
3806+ STBIR_NO_UNROLL( pc );
3807+ STBIR_MOVE_4( pc, c );
3808+ pc += 4;
3809+ c += 4;
3810+ } while ( pc <= copy_end );
3811+ copy_end += 4;
3812+ STBIR_NO_UNROLL_LOOP_START
3813+ while ( pc < copy_end )
3814+ {
3815+ STBIR_MOVE_1( pc, c );
3816+ ++pc; ++c;
3817+ }
3818+ coeffs += coefficient_width;
3819+ } while ( pc < pc_end );
3820+ break;
3821+ }
3822+ }
3823+
3824+ // some horizontal routines read one float off the end (which is then masked off), so put in a sentinel so we don't read an snan or denormal
3825+ coefficents[ widest * num_contributors ] = 8888.0f;
3826+
3827+ // the minimum we might read for unrolled filters widths is 12. So, we need to
3828+ // make sure we never read outside the decode buffer, by possibly moving
3829+ // the sample area back into the scanline, and putting zeros weights first.
3830+ // we start on the right edge and check until we're well past the possible
3831+ // clip area (2*widest).
3832+ {
3833+ stbir__contributors * contribs = contributors + num_contributors - 1;
3834+ float * coeffs = coefficents + widest * ( num_contributors - 1 );
3835+
3836+ // go until no chance of clipping (this is usually less than 8 loops)
3837+ while ( ( contribs >= contributors ) && ( ( contribs->n0 + widest*2 ) >= row_end ) )
3838+ {
3839+ // might we clip??
3840+ if ( ( contribs->n0 + widest ) > row_end )
3841+ {
3842+ int stop_range = widest;
3843+
3844+ // if range is larger than 12, it will be handled by generic loops that can terminate on the exact length
3845+ // of this contrib n1, instead of a fixed widest amount - so calculate this
3846+ if ( widest > 12 )
3847+ {
3848+ int mod;
3849+
3850+ // how far will be read in the n_coeff loop (which depends on the widest count mod4);
3851+ mod = widest & 3;
3852+ stop_range = ( ( ( contribs->n1 - contribs->n0 + 1 ) - mod + 3 ) & ~3 ) + mod;
3853+
3854+ // the n_coeff loops do a minimum amount of coeffs, so factor that in!
3855+ if ( stop_range < ( 8 + mod ) ) stop_range = 8 + mod;
3856+ }
3857+
3858+ // now see if we still clip with the refined range
3859+ if ( ( contribs->n0 + stop_range ) > row_end )
3860+ {
3861+ int new_n0 = row_end - stop_range;
3862+ int num = contribs->n1 - contribs->n0 + 1;
3863+ int backup = contribs->n0 - new_n0; // how many positions the start is moved back
3864+ float * from_co = coeffs + num - 1;
3865+ float * to_co = from_co + backup;
3866+
3867+ STBIR_ASSERT( ( new_n0 >= row0 ) && ( new_n0 < contribs->n0 ) );
3868+
3869+ // move the coeffs over
3870+ while( num )
3871+ {
3872+ *to_co-- = *from_co--;
3873+ --num;
3874+ }
3875+ // zero new positions
3876+ while ( to_co >= coeffs )
3877+ *to_co-- = 0;
3878+ // set new start point
3879+ contribs->n0 = new_n0;
3880+ if ( widest > 12 ) // NOTE(review): stop_range is recomputed here but not read again before it goes out of scope
3881+ {
3882+ int mod;
3883+
3884+ // how far will be read in the n_coeff loop (which depends on the widest count mod4);
3885+ mod = widest & 3;
3886+ stop_range = ( ( ( contribs->n1 - contribs->n0 + 1 ) - mod + 3 ) & ~3 ) + mod;
3887+
3888+ // the n_coeff loops do a minimum amount of coeffs, so factor that in!
3889+ if ( stop_range < ( 8 + mod ) ) stop_range = 8 + mod;
3890+ }
3891+ }
3892+ }
3893+ --contribs;
3894+ coeffs -= widest;
3895+ }
3896+ }
3897+
3898+ return widest;
3899+ #undef STBIR_MOVE_1
3900+ #undef STBIR_MOVE_2
3901+ #undef STBIR_MOVE_4
3902+}
3903+
3904+static void stbir__calculate_filters( stbir__sampler * samp, stbir__sampler * other_axis_for_pivot, void * user_data STBIR_ONLY_PROFILE_BUILD_GET_INFO ) // fills samp's contributors/coefficients according to samp->is_gather (1 = gather upsample, 2 = gather downsample, 0 = scatter downsample built by pivoting a gather downsample)
3905+{
3906+ int n;
3907+ float scale = samp->scale_info.scale;
3908+ stbir__kernel_callback * kernel = samp->filter_kernel;
3909+ stbir__support_callback * support = samp->filter_support;
3910+ float inv_scale = samp->scale_info.inv_scale;
3911+ int input_full_size = samp->scale_info.input_full_size;
3912+ int gather_num_contributors = samp->num_contributors; // the gather_* locals default to samp's own arrays and are redirected below for the scatter case
3913+ stbir__contributors* gather_contributors = samp->contributors;
3914+ float * gather_coeffs = samp->coefficients;
3915+ int gather_coefficient_width = samp->coefficient_width;
3916+
3917+ switch ( samp->is_gather )
3918+ {
3919+ case 1: // gather upsample
3920+ {
3921+ float out_pixels_radius = support(inv_scale,user_data) * scale;
3922+
3923+ stbir__calculate_coefficients_for_gather_upsample( out_pixels_radius, kernel, &samp->scale_info, gather_num_contributors, gather_contributors, gather_coeffs, gather_coefficient_width, samp->edge, user_data );
3924+
3925+ STBIR_PROFILE_BUILD_START( cleanup );
3926+ stbir__cleanup_gathered_coefficients( samp->edge, &samp->extent_info, &samp->scale_info, gather_num_contributors, gather_contributors, gather_coeffs, gather_coefficient_width );
3927+ STBIR_PROFILE_BUILD_END( cleanup );
3928+ }
3929+ break;
3930+
3931+ case 0: // scatter downsample (only on vertical)
3932+ case 2: // gather downsample
3933+ {
3934+ float in_pixels_radius = support(scale,user_data) * inv_scale;
3935+ int filter_pixel_margin = samp->filter_pixel_margin;
3936+ int input_end = input_full_size + filter_pixel_margin;
3937+
3938+ // if this is a scatter, we do a downsample gather to get the coeffs, and then pivot after
3939+ if ( !samp->is_gather )
3940+ {
3941+ // check if we are using the same gather downsample on the horizontal as this vertical,
3942+ // if so, then we don't have to generate them, we can just pivot from the horizontal.
3943+ if ( other_axis_for_pivot )
3944+ {
3945+ gather_contributors = other_axis_for_pivot->contributors;
3946+ gather_coeffs = other_axis_for_pivot->coefficients;
3947+ gather_coefficient_width = other_axis_for_pivot->coefficient_width;
3948+ gather_num_contributors = other_axis_for_pivot->num_contributors;
3949+ samp->extent_info.lowest = other_axis_for_pivot->extent_info.lowest;
3950+ samp->extent_info.highest = other_axis_for_pivot->extent_info.highest;
3951+ samp->extent_info.widest = other_axis_for_pivot->extent_info.widest;
3952+ goto jump_right_to_pivot; // skips coefficient generation and cleanup; the other axis's gathered data is pivoted as-is
3953+ }
3954+
3955+ gather_contributors = samp->gather_prescatter_contributors;
3956+ gather_coeffs = samp->gather_prescatter_coefficients;
3957+ gather_coefficient_width = samp->gather_prescatter_coefficient_width;
3958+ gather_num_contributors = samp->gather_prescatter_num_contributors;
3959+ }
3960+
3961+ stbir__calculate_coefficients_for_gather_downsample( -filter_pixel_margin, input_end, in_pixels_radius, kernel, &samp->scale_info, gather_coefficient_width, gather_num_contributors, gather_contributors, gather_coeffs, user_data );
3962+
3963+ STBIR_PROFILE_BUILD_START( cleanup );
3964+ stbir__cleanup_gathered_coefficients( samp->edge, &samp->extent_info, &samp->scale_info, gather_num_contributors, gather_contributors, gather_coeffs, gather_coefficient_width );
3965+ STBIR_PROFILE_BUILD_END( cleanup );
3966+
3967+ if ( !samp->is_gather )
3968+ {
3969+ // if this is a scatter (vertical only), then we need to pivot the coeffs
3970+ stbir__contributors * scatter_contributors;
3971+ int highest_set; // highest input pixel index whose scatter entry has been initialized
3972+
3973+ jump_right_to_pivot:
3974+
3975+ STBIR_PROFILE_BUILD_START( pivot );
3976+
3977+ highest_set = (-filter_pixel_margin) - 1;
3978+ for (n = 0; n < gather_num_contributors; n++) // n indexes the gather (output-pixel) rows
3979+ {
3980+ int k;
3981+ int gn0 = gather_contributors->n0, gn1 = gather_contributors->n1;
3982+ int scatter_coefficient_width = samp->coefficient_width;
3983+ float * scatter_coeffs = samp->coefficients + ( gn0 + filter_pixel_margin ) * scatter_coefficient_width; // scatter rows are indexed by input pixel, offset by the margin
3984+ float * g_coeffs = gather_coeffs;
3985+ scatter_contributors = samp->contributors + ( gn0 + filter_pixel_margin );
3986+
3987+ for (k = gn0 ; k <= gn1 ; k++ )
3988+ {
3989+ float gc = *g_coeffs++;
3990+
3991+ // skip zero and denormals - must skip zeros to avoid adding coeffs beyond scatter_coefficient_width
3992+ // (which happens when pivoting from horizontal, which might have dummy zeros)
3993+ if ( ( ( gc >= stbir__small_float ) || ( gc <= -stbir__small_float ) ) )
3994+ {
3995+ if ( ( k > highest_set ) || ( scatter_contributors->n0 > scatter_contributors->n1 ) )
3996+ {
3997+ {
3998+ // if we are skipping over several contributors, we need to clear the skipped ones
3999+ stbir__contributors * clear_contributors = samp->contributors + ( highest_set + filter_pixel_margin + 1);
4000+ while ( clear_contributors < scatter_contributors )
4001+ {
4002+ clear_contributors->n0 = 0; // n0 > n1 marks an empty contributor
4003+ clear_contributors->n1 = -1;
4004+ ++clear_contributors;
4005+ }
4006+ }
4007+ scatter_contributors->n0 = n;
4008+ scatter_contributors->n1 = n;
4009+ scatter_coeffs[0] = gc;
4010+ highest_set = k;
4011+ }
4012+ else
4013+ {
4014+ stbir__insert_coeff( scatter_contributors, scatter_coeffs, n, gc, scatter_coefficient_width );
4015+ }
4016+ STBIR_ASSERT( ( scatter_contributors->n1 - scatter_contributors->n0 + 1 ) <= scatter_coefficient_width );
4017+ }
4018+ ++scatter_contributors;
4019+ scatter_coeffs += scatter_coefficient_width;
4020+ }
4021+
4022+ ++gather_contributors;
4023+ gather_coeffs += gather_coefficient_width;
4024+ }
4025+
4026+ // now clear any unset contribs
4027+ {
4028+ stbir__contributors * clear_contributors = samp->contributors + ( highest_set + filter_pixel_margin + 1);
4029+ stbir__contributors * end_contributors = samp->contributors + samp->num_contributors;
4030+ while ( clear_contributors < end_contributors )
4031+ {
4032+ clear_contributors->n0 = 0;
4033+ clear_contributors->n1 = -1;
4034+ ++clear_contributors;
4035+ }
4036+ }
4037+
4038+ STBIR_PROFILE_BUILD_END( pivot );
4039+ }
4040+ }
4041+ break;
4042+ }
4043+}
4044+
4045+
4046+//========================================================================================================
4047+// scanline decoders and encoders
4048+
4049+#define stbir__coder_min_num 1 // first pass: no suffix and no swizzle macros are defined
4050+#define STB_IMAGE_RESIZE_DO_CODERS
4051+#include STBIR__HEADER_FILENAME // re-includes the header with STB_IMAGE_RESIZE_DO_CODERS set; presumably this expands the scanline coder templates for the macros above - confirm in the coder section
4052+
4053+#define stbir__decode_suffix BGRA // channel order tables for the BGRA variant
4054+#define stbir__decode_swizzle
4055+#define stbir__decode_order0 2
4056+#define stbir__decode_order1 1
4057+#define stbir__decode_order2 0
4058+#define stbir__decode_order3 3
4059+#define stbir__encode_order0 2
4060+#define stbir__encode_order1 1
4061+#define stbir__encode_order2 0
4062+#define stbir__encode_order3 3
4063+#define stbir__coder_min_num 4
4064+#define STB_IMAGE_RESIZE_DO_CODERS
4065+#include STBIR__HEADER_FILENAME
4066+
4067+#define stbir__decode_suffix ARGB // channel order tables for the ARGB variant
4068+#define stbir__decode_swizzle
4069+#define stbir__decode_order0 1
4070+#define stbir__decode_order1 2
4071+#define stbir__decode_order2 3
4072+#define stbir__decode_order3 0
4073+#define stbir__encode_order0 3
4074+#define stbir__encode_order1 0
4075+#define stbir__encode_order2 1
4076+#define stbir__encode_order3 2
4077+#define stbir__coder_min_num 4
4078+#define STB_IMAGE_RESIZE_DO_CODERS
4079+#include STBIR__HEADER_FILENAME
4080+
4081+#define stbir__decode_suffix ABGR // channel order tables for the ABGR variant
4082+#define stbir__decode_swizzle
4083+#define stbir__decode_order0 3
4084+#define stbir__decode_order1 2
4085+#define stbir__decode_order2 1
4086+#define stbir__decode_order3 0
4087+#define stbir__encode_order0 3
4088+#define stbir__encode_order1 2
4089+#define stbir__encode_order2 1
4090+#define stbir__encode_order3 0
4091+#define stbir__coder_min_num 4
4092+#define STB_IMAGE_RESIZE_DO_CODERS
4093+#include STBIR__HEADER_FILENAME
4094+
4095+#define stbir__decode_suffix AR // channel order tables for the two-channel AR variant (pairs swapped: 1,0 and 3,2)
4096+#define stbir__decode_swizzle
4097+#define stbir__decode_order0 1
4098+#define stbir__decode_order1 0
4099+#define stbir__decode_order2 3
4100+#define stbir__decode_order3 2
4101+#define stbir__encode_order0 1
4102+#define stbir__encode_order1 0
4103+#define stbir__encode_order2 3
4104+#define stbir__encode_order3 2
4105+#define stbir__coder_min_num 2
4106+#define STB_IMAGE_RESIZE_DO_CODERS
4107+#include STBIR__HEADER_FILENAME
4108+
4109+
4110+// fancy alpha means we expand to keep both premultiplied and non-premultiplied color channels
4111+static void stbir__fancy_alpha_weight_4ch( float * out_buffer, int width_times_channels ) // expands 4-float pixels into 7-float pixels in place; the 4-float input is read from the tail of out_buffer and the 7-float output is written from its start
4112+{
4113+ float STBIR_STREAMOUT_PTR(*) out = out_buffer;
4114+ float const * end_decode = out_buffer + ( width_times_channels / 4 ) * 7; // decode buffer aligned to end of out_buffer
4115+ float STBIR_STREAMOUT_PTR(*) decode = (float*)end_decode - width_times_channels; // start of the packed 4-float input
4116+
4117+ // fancy alpha is stored internally as R G B A Rpm Gpm Bpm
4118+
4119+ #ifdef STBIR_SIMD
4120+
4121+ #ifdef STBIR_SIMD8
4122+ decode += 16; // pre-advance so the loop test checks that a whole 16-float (4 pixel) group remains
4123+ STBIR_NO_UNROLL_LOOP_START
4124+ while ( decode <= end_decode )
4125+ {
4126+ stbir__simdf8 d0,d1,a0,a1,p0,p1;
4127+ STBIR_NO_UNROLL(decode);
4128+ stbir__simdf8_load( d0, decode-16 );
4129+ stbir__simdf8_load( d1, decode-16+8 );
4130+ stbir__simdf8_0123to33333333( a0, d0 );
4131+ stbir__simdf8_0123to33333333( a1, d1 );
4132+ stbir__simdf8_mult( p0, a0, d0 );
4133+ stbir__simdf8_mult( p1, a1, d1 );
4134+ stbir__simdf8_bot4s( a0, d0, p0 );
4135+ stbir__simdf8_bot4s( a1, d1, p1 );
4136+ stbir__simdf8_top4s( d0, d0, p0 );
4137+ stbir__simdf8_top4s( d1, d1, p1 );
4138+ stbir__simdf8_store ( out, a0 );
4139+ stbir__simdf8_store ( out+7, d0 ); // stores are 8 wide but pixels are 7 apart, so each store's last lane is overwritten by the next one
4140+ stbir__simdf8_store ( out+14, a1 );
4141+ stbir__simdf8_store ( out+21, d1 );
4142+ decode += 16;
4143+ out += 28;
4144+ }
4145+ decode -= 16; // undo the pre-advance before handling leftovers
4146+ #else
4147+ decode += 8; // pre-advance so the loop test checks that a whole 8-float (2 pixel) group remains
4148+ STBIR_NO_UNROLL_LOOP_START
4149+ while ( decode <= end_decode )
4150+ {
4151+ stbir__simdf d0,a0,d1,a1,p0,p1;
4152+ STBIR_NO_UNROLL(decode);
4153+ stbir__simdf_load( d0, decode-8 );
4154+ stbir__simdf_load( d1, decode-8+4 );
4155+ stbir__simdf_0123to3333( a0, d0 );
4156+ stbir__simdf_0123to3333( a1, d1 );
4157+ stbir__simdf_mult( p0, a0, d0 );
4158+ stbir__simdf_mult( p1, a1, d1 );
4159+ stbir__simdf_store ( out, d0 );
4160+ stbir__simdf_store ( out+4, p0 ); // writes out[4..7]; out[7] is then overwritten by the next pixel's store
4161+ stbir__simdf_store ( out+7, d1 );
4162+ stbir__simdf_store ( out+7+4, p1 );
4163+ decode += 8;
4164+ out += 14;
4165+ }
4166+ decode -= 8; // undo the pre-advance before handling leftovers
4167+ #endif
4168+
4169+ // might be one last odd pixel
4170+ #ifdef STBIR_SIMD8
4171+ STBIR_NO_UNROLL_LOOP_START
4172+ while ( decode < end_decode )
4173+ #else
4174+ if ( decode < end_decode )
4175+ #endif
4176+ {
4177+ stbir__simdf d,a,p;
4178+ STBIR_NO_UNROLL(decode);
4179+ stbir__simdf_load( d, decode );
4180+ stbir__simdf_0123to3333( a, d );
4181+ stbir__simdf_mult( p, a, d );
4182+ stbir__simdf_store ( out, d );
4183+ stbir__simdf_store ( out+4, p );
4184+ decode += 4;
4185+ out += 7;
4186+ }
4187+
4188+ #else
4189+
4190+ while( decode < end_decode ) // scalar path: one pixel per iteration
4191+ {
4192+ float r = decode[0], g = decode[1], b = decode[2], alpha = decode[3];
4193+ out[0] = r;
4194+ out[1] = g;
4195+ out[2] = b;
4196+ out[3] = alpha;
4197+ out[4] = r * alpha;
4198+ out[5] = g * alpha;
4199+ out[6] = b * alpha;
4200+ out += 7;
4201+ decode += 4;
4202+ }
4203+
4204+ #endif
4205+}
4206+
4207+static void stbir__fancy_alpha_weight_2ch( float * out_buffer, int width_times_channels ) // expands 2-float pixels (X, A) into 3-float pixels (X, A, X*A) in place; the input is read from the tail of out_buffer and the output is written from its start
4208+{
4209+ float STBIR_STREAMOUT_PTR(*) out = out_buffer;
4210+ float const * end_decode = out_buffer + ( width_times_channels / 2 ) * 3; // end of the expanded 3-float-per-pixel data
4211+ float STBIR_STREAMOUT_PTR(*) decode = (float*)end_decode - width_times_channels; // start of the packed 2-float input
4212+
4213+ // for fancy alpha, turns into: [X A Xpm][X A Xpm],etc
4214+
4215+ #ifdef STBIR_SIMD
4216+
4217+ decode += 8; // pre-advance so the loop test checks that a whole 8-float (4 pixel) group remains
4218+ if ( decode <= end_decode )
4219+ {
4220+ STBIR_NO_UNROLL_LOOP_START
4221+ do {
4222+ #ifdef STBIR_SIMD8
4223+ stbir__simdf8 d0,a0,p0;
4224+ STBIR_NO_UNROLL(decode);
4225+ stbir__simdf8_load( d0, decode-8 );
4226+ stbir__simdf8_0123to11331133( p0, d0 );
4227+ stbir__simdf8_0123to00220022( a0, d0 );
4228+ stbir__simdf8_mult( p0, p0, a0 );
4229+
4230+ stbir__simdf_store2( out, stbir__if_simdf8_cast_to_simdf4( d0 ) );
4231+ stbir__simdf_store( out+2, stbir__if_simdf8_cast_to_simdf4( p0 ) );
4232+ stbir__simdf_store2h( out+3, stbir__if_simdf8_cast_to_simdf4( d0 ) );
4233+
4234+ stbir__simdf_store2( out+6, stbir__simdf8_gettop4( d0 ) );
4235+ stbir__simdf_store( out+8, stbir__simdf8_gettop4( p0 ) );
4236+ stbir__simdf_store2h( out+9, stbir__simdf8_gettop4( d0 ) );
4237+ #else
4238+ stbir__simdf d0,a0,d1,a1,p0,p1;
4239+ STBIR_NO_UNROLL(decode);
4240+ stbir__simdf_load( d0, decode-8 );
4241+ stbir__simdf_load( d1, decode-8+4 );
4242+ stbir__simdf_0123to1133( p0, d0 );
4243+ stbir__simdf_0123to1133( p1, d1 );
4244+ stbir__simdf_0123to0022( a0, d0 );
4245+ stbir__simdf_0123to0022( a1, d1 );
4246+ stbir__simdf_mult( p0, p0, a0 );
4247+ stbir__simdf_mult( p1, p1, a1 );
4248+
4249+ stbir__simdf_store2( out, d0 );
4250+ stbir__simdf_store( out+2, p0 );
4251+ stbir__simdf_store2h( out+3, d0 );
4252+
4253+ stbir__simdf_store2( out+6, d1 );
4254+ stbir__simdf_store( out+8, p1 );
4255+ stbir__simdf_store2h( out+9, d1 );
4256+ #endif
4257+ decode += 8;
4258+ out += 12;
4259+ } while ( decode <= end_decode );
4260+ }
4261+ decode -= 8; // undo the pre-advance before the scalar tail
4262+ #endif
4263+
4264+ STBIR_SIMD_NO_UNROLL_LOOP_START
4265+ while( decode < end_decode ) // scalar loop: handles everything without SIMD, or the leftover pixels with it
4266+ {
4267+ float x = decode[0], y = decode[1]; // x = value channel, y = alpha
4268+ STBIR_SIMD_NO_UNROLL(decode);
4269+ out[0] = x;
4270+ out[1] = y;
4271+ out[2] = x * y;
4272+ out += 3;
4273+ decode += 2;
4274+ }
4275+}
4276+
4277+static void stbir__fancy_alpha_unweight_4ch( float * encode_buffer, int width_times_channels ) // compacts 7-float fancy-alpha pixels back to 4-float RGBA in place, dividing the premultiplied channels by alpha unless alpha is below stbir__small_float
4278+{
4279+ float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer; // write cursor, 4 floats per pixel
4280+ float STBIR_SIMD_STREAMOUT_PTR(*) input = encode_buffer; // read cursor, 7 floats per pixel; starts at the same address as the write cursor
4281+ float const * end_output = encode_buffer + width_times_channels;
4282+
4283+ // fancy RGBA is stored internally as R G B A Rpm Gpm Bpm
4284+
4285+ STBIR_SIMD_NO_UNROLL_LOOP_START
4286+ do { // do/while: the body runs at least once, so the buffer must hold at least one pixel
4287+ float alpha = input[3];
4288+#ifdef STBIR_SIMD
4289+ stbir__simdf i,ia;
4290+ STBIR_SIMD_NO_UNROLL(encode);
4291+ if ( alpha < stbir__small_float )
4292+ {
4293+ stbir__simdf_load( i, input ); // tiny alpha: pass the non-premultiplied R G B A through unchanged
4294+ stbir__simdf_store( encode, i );
4295+ }
4296+ else
4297+ {
4298+ stbir__simdf_load1frep4( ia, 1.0f / alpha );
4299+ stbir__simdf_load( i, input+4 );
4300+ stbir__simdf_mult( i, i, ia );
4301+ stbir__simdf_store( encode, i );
4302+ encode[3] = alpha; // the 4-wide store above also wrote lane 3, so put alpha back
4303+ }
4304+#else
4305+ if ( alpha < stbir__small_float )
4306+ {
4307+ encode[0] = input[0]; // tiny alpha: use the non-premultiplied colors
4308+ encode[1] = input[1];
4309+ encode[2] = input[2];
4310+ }
4311+ else
4312+ {
4313+ float ialpha = 1.0f / alpha;
4314+ encode[0] = input[4] * ialpha;
4315+ encode[1] = input[5] * ialpha;
4316+ encode[2] = input[6] * ialpha;
4317+ }
4318+ encode[3] = alpha;
4319+#endif
4320+
4321+ input += 7;
4322+ encode += 4;
4323+ } while ( encode < end_output );
4324+}
4325+
4326+// format: [X A Xpm][X A Xpm] etc
4327+static void stbir__fancy_alpha_unweight_2ch( float * encode_buffer, int width_times_channels )
4328+{
4329+ float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer;
4330+ float STBIR_SIMD_STREAMOUT_PTR(*) input = encode_buffer;
4331+ float const * end_output = encode_buffer + width_times_channels;
4332+
4333+ do {
4334+ float alpha = input[1];
4335+ encode[0] = input[0];
4336+ if ( alpha >= stbir__small_float )
4337+ encode[0] = input[2] / alpha;
4338+ encode[1] = alpha;
4339+
4340+ input += 3;
4341+ encode += 2;
4342+ } while ( encode < end_output );
4343+}
4344+
// Alpha-weights 4-float pixels in place: the first three floats of each pixel are
// multiplied by the pixel's fourth float (see the scalar branch for the exact operation).
//   decode_buffer        - pixels to modify, 4 floats each.
//   width_times_channels - number of floats to process, starting at decode_buffer.
static void stbir__simple_alpha_weight_4ch( float * decode_buffer, int width_times_channels )
{
  float STBIR_STREAMOUT_PTR(*) decode = decode_buffer;
  float const * end_decode = decode_buffer + width_times_channels;

  #ifdef STBIR_SIMD
  {
    // decode is biased one full step (two vectors) ahead, so "decode <= end_decode" means a
    // whole step still fits; the loads and stores below index backwards from it
    decode += 2 * stbir__simdfX_float_count;
    STBIR_NO_UNROLL_LOOP_START
    while ( decode <= end_decode )
    {
      stbir__simdfX d0,a0,d1,a1;
      STBIR_NO_UNROLL(decode);
      stbir__simdfX_load( d0, decode-2*stbir__simdfX_float_count );
      stbir__simdfX_load( d1, decode-2*stbir__simdfX_float_count+stbir__simdfX_float_count );
      // NOTE(review): presumably aaa1 builds (a,a,a,1) per pixel so alpha stays unscaled,
      // matching the scalar branch - confirm against the macro definition
      stbir__simdfX_aaa1( a0, d0, STBIR_onesX );
      stbir__simdfX_aaa1( a1, d1, STBIR_onesX );
      stbir__simdfX_mult( d0, d0, a0 );
      stbir__simdfX_mult( d1, d1, a1 );
      stbir__simdfX_store ( decode-2*stbir__simdfX_float_count, d0 );
      stbir__simdfX_store ( decode-2*stbir__simdfX_float_count+stbir__simdfX_float_count, d1 );
      decode += 2 * stbir__simdfX_float_count;
    }
    // remove the bias: decode now points at the first unprocessed float
    decode -= 2 * stbir__simdfX_float_count;

    // few last pixels remnants
    // (looped under STBIR_SIMD8, otherwise run at most once - presumably because the narrower
    // step can leave at most one pixel over; confirm against stbir__simdfX_float_count)
    #ifdef STBIR_SIMD8
    STBIR_NO_UNROLL_LOOP_START
    while ( decode < end_decode )
    #else
    if ( decode < end_decode )
    #endif
    {
      // one pixel (4 floats) at a time
      stbir__simdf d,a;
      stbir__simdf_load( d, decode );
      stbir__simdf_aaa1( a, d, STBIR__CONSTF(STBIR_ones) );
      stbir__simdf_mult( d, d, a );
      stbir__simdf_store ( decode, d );
      decode += 4;
    }
  }

  #else

  // scalar version: decode[3] (alpha) is read and left untouched
  while( decode < end_decode )
  {
    float alpha = decode[3];
    decode[0] *= alpha;
    decode[1] *= alpha;
    decode[2] *= alpha;
    decode += 4;
  }

  #endif
}
4400+
// Alpha-weights 2-float pixels in place: the first float of each pixel is multiplied by
// the second (alpha), which is left unchanged.
//   decode_buffer        - pixels to modify, 2 floats each.
//   width_times_channels - number of floats to process, starting at decode_buffer.
static void stbir__simple_alpha_weight_2ch( float * decode_buffer, int width_times_channels )
{
  float STBIR_STREAMOUT_PTR(*) decode = decode_buffer;
  float const * end_decode = decode_buffer + width_times_channels;

  #ifdef STBIR_SIMD
  // decode is biased one full step (two vectors) ahead, so "decode <= end_decode" means a
  // whole step still fits; the loads and stores below index backwards from it
  decode += 2 * stbir__simdfX_float_count;
  STBIR_NO_UNROLL_LOOP_START
  while ( decode <= end_decode )
  {
    stbir__simdfX d0,a0,d1,a1;
    STBIR_NO_UNROLL(decode);
    stbir__simdfX_load( d0, decode-2*stbir__simdfX_float_count );
    stbir__simdfX_load( d1, decode-2*stbir__simdfX_float_count+stbir__simdfX_float_count );
    // NOTE(review): presumably a1a1 builds (a,1,a,1,...) so only the value lanes are scaled,
    // matching the scalar loop below - confirm against the macro definition
    stbir__simdfX_a1a1( a0, d0, STBIR_onesX );
    stbir__simdfX_a1a1( a1, d1, STBIR_onesX );
    stbir__simdfX_mult( d0, d0, a0 );
    stbir__simdfX_mult( d1, d1, a1 );
    stbir__simdfX_store ( decode-2*stbir__simdfX_float_count, d0 );
    stbir__simdfX_store ( decode-2*stbir__simdfX_float_count+stbir__simdfX_float_count, d1 );
    decode += 2 * stbir__simdfX_float_count;
  }
  // remove the bias: decode now points at the first unprocessed float
  decode -= 2 * stbir__simdfX_float_count;
  #endif

  // scalar loop: the whole job without SIMD, or the leftover pixels after the SIMD loop
  STBIR_SIMD_NO_UNROLL_LOOP_START
  while( decode < end_decode )
  {
    float alpha = decode[1];
    STBIR_SIMD_NO_UNROLL(decode);
    decode[0] *= alpha;
    decode += 2;
  }
}
4435+
// Reverses simple alpha weighting on 4-float pixels, in place: when a pixel's alpha
// (fourth float) is at least stbir__small_float, its first three floats are multiplied by
// 1/alpha; otherwise the pixel is left untouched. Alpha is never changed.
//   encode_buffer        - pixels to modify, 4 floats each.
//   width_times_channels - number of floats to process, starting at encode_buffer.
// The loop is a do/while, so one pixel is processed even if width_times_channels is 0.
static void stbir__simple_alpha_unweight_4ch( float * encode_buffer, int width_times_channels )
{
  float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer;
  float const * end_output = encode_buffer + width_times_channels;

  STBIR_SIMD_NO_UNROLL_LOOP_START
  do {
    float alpha = encode[3];

#ifdef STBIR_SIMD
    stbir__simdf i,ia;
    STBIR_SIMD_NO_UNROLL(encode);
    if ( alpha >= stbir__small_float )
    {
      // the whole pixel is scaled by 1/alpha, then slot 3 is restored to the saved alpha
      stbir__simdf_load1frep4( ia, 1.0f / alpha );
      stbir__simdf_load( i, encode );
      stbir__simdf_mult( i, i, ia );
      stbir__simdf_store( encode, i );
      encode[3] = alpha;
    }
#else
    if ( alpha >= stbir__small_float )
    {
      // one reciprocal, three multiplies; encode[3] is not written
      float ialpha = 1.0f / alpha;
      encode[0] *= ialpha;
      encode[1] *= ialpha;
      encode[2] *= ialpha;
    }
#endif
    encode += 4;
  } while ( encode < end_output );
}
4468+
4469+static void stbir__simple_alpha_unweight_2ch( float * encode_buffer, int width_times_channels )
4470+{
4471+ float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer;
4472+ float const * end_output = encode_buffer + width_times_channels;
4473+
4474+ do {
4475+ float alpha = encode[1];
4476+ if ( alpha >= stbir__small_float )
4477+ encode[0] /= alpha;
4478+ encode += 2;
4479+ } while ( encode < end_output );
4480+}
4481+
4482+
// only used in RGB->BGR or BGR->RGB
// Swaps the first and third float of every 3-float pixel, in place.
//   decode_buffer        - pixels to modify, 3 floats each.
//   width_times_channels - number of floats to process, starting at decode_buffer.
// Each build variant has a bulk loop (4 or 8 pixels per pass) followed by a shared
// one-pixel-at-a-time loop that finishes whatever is left.
static void stbir__simple_flip_3ch( float * decode_buffer, int width_times_channels )
{
  float STBIR_STREAMOUT_PTR(*) decode = decode_buffer;
  float const * end_decode = decode_buffer + width_times_channels;

#ifdef STBIR_SIMD
  #ifdef stbir__simdf_swiz2 // do we have two argument swizzles?
  // pull the end back by one pass (12 floats = 4 pixels) so "decode <= end_decode" means a full pass fits
  end_decode -= 12;
  STBIR_NO_UNROLL_LOOP_START
  while( decode <= end_decode )
  {
    // on arm64 8 instructions, no overlapping stores
    stbir__simdf a,b,c,na,nb;
    STBIR_SIMD_NO_UNROLL(decode);
    stbir__simdf_load( a, decode );
    stbir__simdf_load( b, decode+4 );
    stbir__simdf_load( c, decode+8 );

    // b and c are overwritten with intermediates here; the later swizzles depend on that order
    na = stbir__simdf_swiz2( a, b, 2, 1, 0, 5 );
    b = stbir__simdf_swiz2( a, b, 4, 3, 6, 7 );
    nb = stbir__simdf_swiz2( b, c, 0, 1, 4, 3 );
    c = stbir__simdf_swiz2( b, c, 2, 7, 6, 5 );

    stbir__simdf_store( decode, na );
    stbir__simdf_store( decode+4, nb );
    stbir__simdf_store( decode+8, c );
    decode += 12;
  }
  end_decode += 12;
  #else
  // pull the end back by one pass (24 floats = 8 pixels)
  end_decode -= 24;
  STBIR_NO_UNROLL_LOOP_START
  while( decode <= end_decode )
  {
    // 26 instructions on x64
    stbir__simdf a,b,c,d,e,f,g;
    float i21, i23;
    STBIR_SIMD_NO_UNROLL(decode);
    // seven loads, one per pixel start (stride 3), all done before any store
    stbir__simdf_load( a, decode );
    stbir__simdf_load( b, decode+3 );
    stbir__simdf_load( c, decode+6 );
    stbir__simdf_load( d, decode+9 );
    stbir__simdf_load( e, decode+12 );
    stbir__simdf_load( f, decode+15 );
    stbir__simdf_load( g, decode+18 );

    // swap lanes 0 and 2 of each register; lane 3 keeps its value
    a = stbir__simdf_swiz( a, 2, 1, 0, 3 );
    b = stbir__simdf_swiz( b, 2, 1, 0, 3 );
    c = stbir__simdf_swiz( c, 2, 1, 0, 3 );
    d = stbir__simdf_swiz( d, 2, 1, 0, 3 );
    e = stbir__simdf_swiz( e, 2, 1, 0, 3 );
    f = stbir__simdf_swiz( f, 2, 1, 0, 3 );
    g = stbir__simdf_swiz( g, 2, 1, 0, 3 );

    // stores overlap, need to be in order,
    // (each store's last lane is rewritten by the next store's first lane)
    stbir__simdf_store( decode, a );
    i21 = decode[21];
    stbir__simdf_store( decode+3, b );
    i23 = decode[23];
    stbir__simdf_store( decode+6, c );
    stbir__simdf_store( decode+9, d );
    stbir__simdf_store( decode+12, e );
    stbir__simdf_store( decode+15, f );
    stbir__simdf_store( decode+18, g );
    // the eighth pixel (floats 21..23) is flipped with the two scalars saved above
    decode[21] = i23;
    decode[23] = i21;
    decode += 24;
  }
  end_decode += 24;
  #endif
#else
  // scalar bulk loop: four pixels (12 floats) per pass
  end_decode -= 12;
  STBIR_NO_UNROLL_LOOP_START
  while( decode <= end_decode )
  {
    // 16 instructions
    float t0,t1,t2,t3;
    STBIR_NO_UNROLL(decode);
    t0 = decode[0]; t1 = decode[3]; t2 = decode[6]; t3 = decode[9];
    decode[0] = decode[2]; decode[3] = decode[5]; decode[6] = decode[8]; decode[9] = decode[11];
    decode[2] = t0; decode[5] = t1; decode[8] = t2; decode[11] = t3;
    decode += 12;
  }
  end_decode += 12;
#endif

  // leftover pixels, one at a time (end_decode has been restored to the true end above)
  STBIR_NO_UNROLL_LOOP_START
  while( decode < end_decode )
  {
    float t = decode[0];
    STBIR_NO_UNROLL(decode);
    decode[0] = decode[2];
    decode[2] = t;
    decode += 3;
  }
}
4580+
4581+
4582+
// Decodes one input row into the float scanline buffer used by the resampler.
//   stbir_info    - resize state; supplies channel counts, input pointer and stride, edge modes,
//                   the scanline extents/spans, and the decode / alpha-weight / input-callback
//                   function pointers used below.
//   n             - input row index; run through stbir__edge_wrap with the vertical edge mode
//                   to pick the row that is actually read.
//   output_buffer - destination; treated as starting at x == scanline_extents.conservative.n0.
// Up to two spans are decoded, wrap margins are filled by copying, and finally the two floats
// at last_decoded are zeroed.
static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float * output_buffer STBIR_ONLY_PROFILE_GET_SPLIT_INFO )
{
  int channels = stbir_info->channels;
  int effective_channels = stbir_info->effective_channels;
  int input_sample_in_bytes = stbir__type_size[stbir_info->input_type] * channels;
  stbir_edge edge_horizontal = stbir_info->horizontal.edge;
  stbir_edge edge_vertical = stbir_info->vertical.edge;
  int row = stbir__edge_wrap(edge_vertical, n, stbir_info->vertical.scale_info.input_full_size);
  const void* input_plane_data = ( (char *) stbir_info->input_data ) + (size_t)row * (size_t) stbir_info->input_stride_bytes;
  stbir__span const * spans = stbir_info->scanline_extents.spans;
  // rebase so that full_decode_buffer can be indexed directly by x * effective_channels
  float * full_decode_buffer = output_buffer - stbir_info->scanline_extents.conservative.n0 * effective_channels;
  // NOTE(review): stays null if the first span is empty and the wrap branch below does not run;
  // the two stores at the end of the function would then write through a null pointer.
  // Presumably callers always provide a non-empty first span - confirm.
  float * last_decoded = 0;

  // if we are on edge_zero, and we get in here with an out of bounds n, then the calculate filters has failed
  STBIR_ASSERT( !(edge_vertical == STBIR_EDGE_ZERO && (n < 0 || n >= stbir_info->vertical.scale_info.input_full_size)) );

  // visits spans[0] and then spans[1]; stops early at the first empty span
  do
  {
    float * decode_buffer;
    void const * input_data;
    float * end_decode;
    int width_times_channels;
    int width;

    // an empty span (n1 < n0) ends the walk
    if ( spans->n1 < spans->n0 )
      break;

    width = spans->n1 + 1 - spans->n0;
    decode_buffer = full_decode_buffer + spans->n0 * effective_channels;
    end_decode = full_decode_buffer + ( spans->n1 + 1 ) * effective_channels;
    // counted in input channels, not effective_channels
    width_times_channels = width * channels;

    // read directly out of input plane by default
    input_data = ( (char*)input_plane_data ) + spans->pixel_offset_for_input * input_sample_in_bytes;

    // if we have an input callback, call it to get the input data
    if ( stbir_info->in_pixels_cb )
    {
      // call the callback with a temp buffer (that they can choose to use or not). the temp is just right aligned memory in the decode_buffer itself
      input_data = stbir_info->in_pixels_cb( ( (char*) end_decode ) - ( width * input_sample_in_bytes ) + ( ( stbir_info->input_type != STBIR_TYPE_FLOAT ) ? ( sizeof(float)*STBIR_INPUT_CALLBACK_PADDING ) : 0 ), input_plane_data, width, spans->pixel_offset_for_input, row, stbir_info->user_data );
    }

    STBIR_PROFILE_START( decode );
    // convert the pixels into the float decode_buffer, (we index from end_decode, so that when channels<effective_channels, we are right justified in the buffer)
    last_decoded = stbir_info->decode_pixels( (float*)end_decode - width_times_channels, width_times_channels, input_data );
    STBIR_PROFILE_END( decode );

    // optional alpha weighting; note it is handed decode_buffer (the span start), not the
    // right-justified pointer that decode_pixels was given
    if (stbir_info->alpha_weight)
    {
      STBIR_PROFILE_START( alpha );
      stbir_info->alpha_weight( decode_buffer, width_times_channels );
      STBIR_PROFILE_END( alpha );
    }

    ++spans;
  } while ( spans <= ( &stbir_info->scanline_extents.spans[1] ) );

  // handle the edge_wrap filter (all other types are handled back out at the calculate_filter stage)
  // basically the idea here is that if we have the whole scanline in memory, we don't redecode the
  // wrapped edge pixels, and instead just memcpy them from the scanline into the edge positions
  if ( ( edge_horizontal == STBIR_EDGE_WRAP ) && ( stbir_info->scanline_extents.edge_sizes[0] | stbir_info->scanline_extents.edge_sizes[1] ) )
  {
    // this code only runs if we're in edge_wrap, and we're doing the entire scanline
    int e, start_x[2];
    int input_full_size = stbir_info->horizontal.scale_info.input_full_size;

    start_x[0] = -stbir_info->scanline_extents.edge_sizes[0]; // left edge start x
    start_x[1] = input_full_size; // right edge

    for( e = 0; e < 2 ; e++ )
    {
      // do each margin
      int margin = stbir_info->scanline_extents.edge_sizes[e];
      if ( margin )
      {
        int x = start_x[e];
        float * marg = full_decode_buffer + x * effective_channels;
        // source is the in-range pixel that x wraps to
        float const * src = full_decode_buffer + stbir__edge_wrap(edge_horizontal, x, input_full_size) * effective_channels;
        STBIR_MEMCPY( marg, src, margin * effective_channels * sizeof(float) );
        // a right margin extends the decoded data, so the zeroing below has to follow it
        if ( e == 1 ) last_decoded = marg + margin * effective_channels;
      }
    }
  }

  // some of the horizontal gathers read one float off the edge (which is masked out), but we force a zero here to make sure no NaNs leak in
  // (we can't pre-zero it, because the input callback can use that area as padding)
  last_decoded[0] = 0.0f;

  // we clear this extra float, because the final output pixel filter kernel might have used one less coeff than the max filter width
  // when this happens, we do read that pixel from the input, so it too could be NaN, so just zero an extra one.
  // this fits because each scanline is padded by three floats (STBIR_INPUT_CALLBACK_PADDING)
  last_decoded[1] = 0.0f;
}
4676+
4677+
4678+//=================
4679+// Do 1 channel horizontal routines
4680+
4681+#ifdef STBIR_SIMD
4682+
4683+#define stbir__1_coeff_only() \
4684+ stbir__simdf tot,c; \
4685+ STBIR_SIMD_NO_UNROLL(decode); \
4686+ stbir__simdf_load1( c, hc ); \
4687+ stbir__simdf_mult1_mem( tot, c, decode );
4688+
4689+#define stbir__2_coeff_only() \
4690+ stbir__simdf tot,c,d; \
4691+ STBIR_SIMD_NO_UNROLL(decode); \
4692+ stbir__simdf_load2z( c, hc ); \
4693+ stbir__simdf_load2( d, decode ); \
4694+ stbir__simdf_mult( tot, c, d ); \
4695+ stbir__simdf_0123to1230( c, tot ); \
4696+ stbir__simdf_add1( tot, tot, c );
4697+
4698+#define stbir__3_coeff_only() \
4699+ stbir__simdf tot,c,t; \
4700+ STBIR_SIMD_NO_UNROLL(decode); \
4701+ stbir__simdf_load( c, hc ); \
4702+ stbir__simdf_mult_mem( tot, c, decode ); \
4703+ stbir__simdf_0123to1230( c, tot ); \
4704+ stbir__simdf_0123to2301( t, tot ); \
4705+ stbir__simdf_add1( tot, tot, c ); \
4706+ stbir__simdf_add1( tot, tot, t );
4707+
4708+#define stbir__store_output_tiny() \
4709+ stbir__simdf_store1( output, tot ); \
4710+ horizontal_coefficients += coefficient_width; \
4711+ ++horizontal_contributors; \
4712+ output += 1;
4713+
4714+#define stbir__4_coeff_start() \
4715+ stbir__simdf tot,c; \
4716+ STBIR_SIMD_NO_UNROLL(decode); \
4717+ stbir__simdf_load( c, hc ); \
4718+ stbir__simdf_mult_mem( tot, c, decode ); \
4719+
4720+#define stbir__4_coeff_continue_from_4( ofs ) \
4721+ STBIR_SIMD_NO_UNROLL(decode); \
4722+ stbir__simdf_load( c, hc + (ofs) ); \
4723+ stbir__simdf_madd_mem( tot, tot, c, decode+(ofs) );
4724+
4725+#define stbir__1_coeff_remnant( ofs ) \
4726+ { stbir__simdf d; \
4727+ stbir__simdf_load1z( c, hc + (ofs) ); \
4728+ stbir__simdf_load1( d, decode + (ofs) ); \
4729+ stbir__simdf_madd( tot, tot, d, c ); }
4730+
4731+#define stbir__2_coeff_remnant( ofs ) \
4732+ { stbir__simdf d; \
4733+ stbir__simdf_load2z( c, hc+(ofs) ); \
4734+ stbir__simdf_load2( d, decode+(ofs) ); \
4735+ stbir__simdf_madd( tot, tot, d, c ); }
4736+
4737+#define stbir__3_coeff_setup() \
4738+ stbir__simdf mask; \
4739+ stbir__simdf_load( mask, STBIR_mask + 3 );
4740+
4741+#define stbir__3_coeff_remnant( ofs ) \
4742+ stbir__simdf_load( c, hc+(ofs) ); \
4743+ stbir__simdf_and( c, c, mask ); \
4744+ stbir__simdf_madd_mem( tot, tot, c, decode+(ofs) );
4745+
4746+#define stbir__store_output() \
4747+ stbir__simdf_0123to2301( c, tot ); \
4748+ stbir__simdf_add( tot, tot, c ); \
4749+ stbir__simdf_0123to1230( c, tot ); \
4750+ stbir__simdf_add1( tot, tot, c ); \
4751+ stbir__simdf_store1( output, tot ); \
4752+ horizontal_coefficients += coefficient_width; \
4753+ ++horizontal_contributors; \
4754+ output += 1;
4755+
4756+#else
4757+
4758+#define stbir__1_coeff_only() \
4759+ float tot; \
4760+ tot = decode[0]*hc[0];
4761+
4762+#define stbir__2_coeff_only() \
4763+ float tot; \
4764+ tot = decode[0] * hc[0]; \
4765+ tot += decode[1] * hc[1];
4766+
4767+#define stbir__3_coeff_only() \
4768+ float tot; \
4769+ tot = decode[0] * hc[0]; \
4770+ tot += decode[1] * hc[1]; \
4771+ tot += decode[2] * hc[2];
4772+
4773+#define stbir__store_output_tiny() \
4774+ output[0] = tot; \
4775+ horizontal_coefficients += coefficient_width; \
4776+ ++horizontal_contributors; \
4777+ output += 1;
4778+
4779+#define stbir__4_coeff_start() \
4780+ float tot0,tot1,tot2,tot3; \
4781+ tot0 = decode[0] * hc[0]; \
4782+ tot1 = decode[1] * hc[1]; \
4783+ tot2 = decode[2] * hc[2]; \
4784+ tot3 = decode[3] * hc[3];
4785+
4786+#define stbir__4_coeff_continue_from_4( ofs ) \
4787+ tot0 += decode[0+(ofs)] * hc[0+(ofs)]; \
4788+ tot1 += decode[1+(ofs)] * hc[1+(ofs)]; \
4789+ tot2 += decode[2+(ofs)] * hc[2+(ofs)]; \
4790+ tot3 += decode[3+(ofs)] * hc[3+(ofs)];
4791+
4792+#define stbir__1_coeff_remnant( ofs ) \
4793+ tot0 += decode[0+(ofs)] * hc[0+(ofs)];
4794+
4795+#define stbir__2_coeff_remnant( ofs ) \
4796+ tot0 += decode[0+(ofs)] * hc[0+(ofs)]; \
4797+ tot1 += decode[1+(ofs)] * hc[1+(ofs)]; \
4798+
4799+#define stbir__3_coeff_remnant( ofs ) \
4800+ tot0 += decode[0+(ofs)] * hc[0+(ofs)]; \
4801+ tot1 += decode[1+(ofs)] * hc[1+(ofs)]; \
4802+ tot2 += decode[2+(ofs)] * hc[2+(ofs)];
4803+
4804+#define stbir__store_output() \
4805+ output[0] = (tot0+tot2)+(tot1+tot3); \
4806+ horizontal_coefficients += coefficient_width; \
4807+ ++horizontal_contributors; \
4808+ output += 1;
4809+
4810+#endif
4811+
4812+#define STBIR__horizontal_channels 1
4813+#define STB_IMAGE_RESIZE_DO_HORIZONTALS
4814+#include STBIR__HEADER_FILENAME
4815+
4816+
4817+//=================
4818+// Do 2 channel horizontal routines
4819+
4820+#ifdef STBIR_SIMD
4821+
4822+#define stbir__1_coeff_only() \
4823+ stbir__simdf tot,c,d; \
4824+ STBIR_SIMD_NO_UNROLL(decode); \
4825+ stbir__simdf_load1z( c, hc ); \
4826+ stbir__simdf_0123to0011( c, c ); \
4827+ stbir__simdf_load2( d, decode ); \
4828+ stbir__simdf_mult( tot, d, c );
4829+
4830+#define stbir__2_coeff_only() \
4831+ stbir__simdf tot,c; \
4832+ STBIR_SIMD_NO_UNROLL(decode); \
4833+ stbir__simdf_load2( c, hc ); \
4834+ stbir__simdf_0123to0011( c, c ); \
4835+ stbir__simdf_mult_mem( tot, c, decode );
4836+
4837+#define stbir__3_coeff_only() \
4838+ stbir__simdf tot,c,cs,d; \
4839+ STBIR_SIMD_NO_UNROLL(decode); \
4840+ stbir__simdf_load( cs, hc ); \
4841+ stbir__simdf_0123to0011( c, cs ); \
4842+ stbir__simdf_mult_mem( tot, c, decode ); \
4843+ stbir__simdf_0123to2222( c, cs ); \
4844+ stbir__simdf_load2z( d, decode+4 ); \
4845+ stbir__simdf_madd( tot, tot, d, c );
4846+
4847+#define stbir__store_output_tiny() \
4848+ stbir__simdf_0123to2301( c, tot ); \
4849+ stbir__simdf_add( tot, tot, c ); \
4850+ stbir__simdf_store2( output, tot ); \
4851+ horizontal_coefficients += coefficient_width; \
4852+ ++horizontal_contributors; \
4853+ output += 2;
4854+
4855+#ifdef STBIR_SIMD8
4856+
4857+#define stbir__4_coeff_start() \
4858+ stbir__simdf8 tot0,c,cs; \
4859+ STBIR_SIMD_NO_UNROLL(decode); \
4860+ stbir__simdf8_load4b( cs, hc ); \
4861+ stbir__simdf8_0123to00112233( c, cs ); \
4862+ stbir__simdf8_mult_mem( tot0, c, decode );
4863+
4864+#define stbir__4_coeff_continue_from_4( ofs ) \
4865+ STBIR_SIMD_NO_UNROLL(decode); \
4866+ stbir__simdf8_load4b( cs, hc + (ofs) ); \
4867+ stbir__simdf8_0123to00112233( c, cs ); \
4868+ stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*2 );
4869+
4870+#define stbir__1_coeff_remnant( ofs ) \
4871+ { stbir__simdf t,d; \
4872+ stbir__simdf_load1z( t, hc + (ofs) ); \
4873+ stbir__simdf_load2( d, decode + (ofs) * 2 ); \
4874+ stbir__simdf_0123to0011( t, t ); \
4875+ stbir__simdf_mult( t, t, d ); \
4876+ stbir__simdf8_add4( tot0, tot0, t ); }
4877+
4878+#define stbir__2_coeff_remnant( ofs ) \
4879+ { stbir__simdf t; \
4880+ stbir__simdf_load2( t, hc + (ofs) ); \
4881+ stbir__simdf_0123to0011( t, t ); \
4882+ stbir__simdf_mult_mem( t, t, decode+(ofs)*2 ); \
4883+ stbir__simdf8_add4( tot0, tot0, t ); }
4884+
4885+#define stbir__3_coeff_remnant( ofs ) \
4886+ { stbir__simdf8 d; \
4887+ stbir__simdf8_load4b( cs, hc + (ofs) ); \
4888+ stbir__simdf8_0123to00112233( c, cs ); \
4889+ stbir__simdf8_load6z( d, decode+(ofs)*2 ); \
4890+ stbir__simdf8_madd( tot0, tot0, c, d ); }
4891+
4892+#define stbir__store_output() \
4893+ { stbir__simdf t,d; \
4894+ stbir__simdf8_add4halves( t, stbir__if_simdf8_cast_to_simdf4(tot0), tot0 ); \
4895+ stbir__simdf_0123to2301( d, t ); \
4896+ stbir__simdf_add( t, t, d ); \
4897+ stbir__simdf_store2( output, t ); \
4898+ horizontal_coefficients += coefficient_width; \
4899+ ++horizontal_contributors; \
4900+ output += 2; }
4901+
4902+#else
4903+
4904+#define stbir__4_coeff_start() \
4905+ stbir__simdf tot0,tot1,c,cs; \
4906+ STBIR_SIMD_NO_UNROLL(decode); \
4907+ stbir__simdf_load( cs, hc ); \
4908+ stbir__simdf_0123to0011( c, cs ); \
4909+ stbir__simdf_mult_mem( tot0, c, decode ); \
4910+ stbir__simdf_0123to2233( c, cs ); \
4911+ stbir__simdf_mult_mem( tot1, c, decode+4 );
4912+
4913+#define stbir__4_coeff_continue_from_4( ofs ) \
4914+ STBIR_SIMD_NO_UNROLL(decode); \
4915+ stbir__simdf_load( cs, hc + (ofs) ); \
4916+ stbir__simdf_0123to0011( c, cs ); \
4917+ stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*2 ); \
4918+ stbir__simdf_0123to2233( c, cs ); \
4919+ stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*2+4 );
4920+
4921+#define stbir__1_coeff_remnant( ofs ) \
4922+ { stbir__simdf d; \
4923+ stbir__simdf_load1z( cs, hc + (ofs) ); \
4924+ stbir__simdf_0123to0011( c, cs ); \
4925+ stbir__simdf_load2( d, decode + (ofs) * 2 ); \
4926+ stbir__simdf_madd( tot0, tot0, d, c ); }
4927+
4928+#define stbir__2_coeff_remnant( ofs ) \
4929+ stbir__simdf_load2( cs, hc + (ofs) ); \
4930+ stbir__simdf_0123to0011( c, cs ); \
4931+ stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*2 );
4932+
4933+#define stbir__3_coeff_remnant( ofs ) \
4934+ { stbir__simdf d; \
4935+ stbir__simdf_load( cs, hc + (ofs) ); \
4936+ stbir__simdf_0123to0011( c, cs ); \
4937+ stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*2 ); \
4938+ stbir__simdf_0123to2222( c, cs ); \
4939+ stbir__simdf_load2z( d, decode + (ofs) * 2 + 4 ); \
4940+ stbir__simdf_madd( tot1, tot1, d, c ); }
4941+
4942+#define stbir__store_output() \
4943+ stbir__simdf_add( tot0, tot0, tot1 ); \
4944+ stbir__simdf_0123to2301( c, tot0 ); \
4945+ stbir__simdf_add( tot0, tot0, c ); \
4946+ stbir__simdf_store2( output, tot0 ); \
4947+ horizontal_coefficients += coefficient_width; \
4948+ ++horizontal_contributors; \
4949+ output += 2;
4950+
4951+#endif
4952+
4953+#else
4954+
4955+#define stbir__1_coeff_only() \
4956+ float tota,totb,c; \
4957+ c = hc[0]; \
4958+ tota = decode[0]*c; \
4959+ totb = decode[1]*c;
4960+
4961+#define stbir__2_coeff_only() \
4962+ float tota,totb,c; \
4963+ c = hc[0]; \
4964+ tota = decode[0]*c; \
4965+ totb = decode[1]*c; \
4966+ c = hc[1]; \
4967+ tota += decode[2]*c; \
4968+ totb += decode[3]*c;
4969+
4970+// this weird order of add matches the simd
4971+#define stbir__3_coeff_only() \
4972+ float tota,totb,c; \
4973+ c = hc[0]; \
4974+ tota = decode[0]*c; \
4975+ totb = decode[1]*c; \
4976+ c = hc[2]; \
4977+ tota += decode[4]*c; \
4978+ totb += decode[5]*c; \
4979+ c = hc[1]; \
4980+ tota += decode[2]*c; \
4981+ totb += decode[3]*c;
4982+
4983+#define stbir__store_output_tiny() \
4984+ output[0] = tota; \
4985+ output[1] = totb; \
4986+ horizontal_coefficients += coefficient_width; \
4987+ ++horizontal_contributors; \
4988+ output += 2;
4989+
4990+#define stbir__4_coeff_start() \
4991+ float tota0,tota1,tota2,tota3,totb0,totb1,totb2,totb3,c; \
4992+ c = hc[0]; \
4993+ tota0 = decode[0]*c; \
4994+ totb0 = decode[1]*c; \
4995+ c = hc[1]; \
4996+ tota1 = decode[2]*c; \
4997+ totb1 = decode[3]*c; \
4998+ c = hc[2]; \
4999+ tota2 = decode[4]*c; \
5000+ totb2 = decode[5]*c; \
5001+ c = hc[3]; \
5002+ tota3 = decode[6]*c; \
5003+ totb3 = decode[7]*c;
5004+
5005+#define stbir__4_coeff_continue_from_4( ofs ) \
5006+ c = hc[0+(ofs)]; \
5007+ tota0 += decode[0+(ofs)*2]*c; \
5008+ totb0 += decode[1+(ofs)*2]*c; \
5009+ c = hc[1+(ofs)]; \
5010+ tota1 += decode[2+(ofs)*2]*c; \
5011+ totb1 += decode[3+(ofs)*2]*c; \
5012+ c = hc[2+(ofs)]; \
5013+ tota2 += decode[4+(ofs)*2]*c; \
5014+ totb2 += decode[5+(ofs)*2]*c; \
5015+ c = hc[3+(ofs)]; \
5016+ tota3 += decode[6+(ofs)*2]*c; \
5017+ totb3 += decode[7+(ofs)*2]*c;
5018+
5019+#define stbir__1_coeff_remnant( ofs ) \
5020+ c = hc[0+(ofs)]; \
5021+ tota0 += decode[0+(ofs)*2] * c; \
5022+ totb0 += decode[1+(ofs)*2] * c;
5023+
5024+#define stbir__2_coeff_remnant( ofs ) \
5025+ c = hc[0+(ofs)]; \
5026+ tota0 += decode[0+(ofs)*2] * c; \
5027+ totb0 += decode[1+(ofs)*2] * c; \
5028+ c = hc[1+(ofs)]; \
5029+ tota1 += decode[2+(ofs)*2] * c; \
5030+ totb1 += decode[3+(ofs)*2] * c;
5031+
5032+#define stbir__3_coeff_remnant( ofs ) \
5033+ c = hc[0+(ofs)]; \
5034+ tota0 += decode[0+(ofs)*2] * c; \
5035+ totb0 += decode[1+(ofs)*2] * c; \
5036+ c = hc[1+(ofs)]; \
5037+ tota1 += decode[2+(ofs)*2] * c; \
5038+ totb1 += decode[3+(ofs)*2] * c; \
5039+ c = hc[2+(ofs)]; \
5040+ tota2 += decode[4+(ofs)*2] * c; \
5041+ totb2 += decode[5+(ofs)*2] * c;
5042+
5043+#define stbir__store_output() \
5044+ output[0] = (tota0+tota2)+(tota1+tota3); \
5045+ output[1] = (totb0+totb2)+(totb1+totb3); \
5046+ horizontal_coefficients += coefficient_width; \
5047+ ++horizontal_contributors; \
5048+ output += 2;
5049+
5050+#endif
5051+
5052+#define STBIR__horizontal_channels 2
5053+#define STB_IMAGE_RESIZE_DO_HORIZONTALS
5054+#include STBIR__HEADER_FILENAME
5055+
5056+
5057+//=================
5058+// Do 3 channel horizontal routines
5059+
5060+#ifdef STBIR_SIMD
5061+
// SIMD variants of the 3-channel horizontal macros (3 floats per pixel, output advances by 3).
// They expand to statements that rely on decode, hc, output, horizontal_coefficients,
// coefficient_width and horizontal_contributors being in scope at the expansion site;
// stbir__store_output() additionally uses output_end and contains "continue" and "break",
// so it can only be expanded directly inside a loop body.

#define stbir__1_coeff_only() \
  stbir__simdf tot,c,d; \
  STBIR_SIMD_NO_UNROLL(decode); \
  stbir__simdf_load1z( c, hc ); \
  stbir__simdf_0123to0001( c, c ); \
  stbir__simdf_load( d, decode ); \
  stbir__simdf_mult( tot, d, c );

#define stbir__2_coeff_only() \
  stbir__simdf tot,c,cs,d; \
  STBIR_SIMD_NO_UNROLL(decode); \
  stbir__simdf_load2( cs, hc ); \
  stbir__simdf_0123to0000( c, cs ); \
  stbir__simdf_load( d, decode ); \
  stbir__simdf_mult( tot, d, c ); \
  stbir__simdf_0123to1111( c, cs ); \
  stbir__simdf_load( d, decode+3 ); \
  stbir__simdf_madd( tot, tot, d, c );

#define stbir__3_coeff_only() \
  stbir__simdf tot,c,d,cs; \
  STBIR_SIMD_NO_UNROLL(decode); \
  stbir__simdf_load( cs, hc ); \
  stbir__simdf_0123to0000( c, cs ); \
  stbir__simdf_load( d, decode ); \
  stbir__simdf_mult( tot, d, c ); \
  stbir__simdf_0123to1111( c, cs ); \
  stbir__simdf_load( d, decode+3 ); \
  stbir__simdf_madd( tot, tot, d, c ); \
  stbir__simdf_0123to2222( c, cs ); \
  stbir__simdf_load( d, decode+6 ); \
  stbir__simdf_madd( tot, tot, d, c );

// writes exactly three floats: two with store2, then the third after rotating tot
#define stbir__store_output_tiny() \
  stbir__simdf_store2( output, tot ); \
  stbir__simdf_0123to2301( tot, tot ); \
  stbir__simdf_store1( output+2, tot ); \
  horizontal_coefficients += coefficient_width; \
  ++horizontal_contributors; \
  output += 3;

#ifdef STBIR_SIMD8

// we're loading from the XXXYYY decode by -1 to get the XXXYYY into different halves of the AVX reg fyi
// (so every memory operand in this variant starts one float before the pixel it is working on)
#define stbir__4_coeff_start() \
  stbir__simdf8 tot0,tot1,c,cs; stbir__simdf t; \
  STBIR_SIMD_NO_UNROLL(decode); \
  stbir__simdf8_load4b( cs, hc ); \
  stbir__simdf8_0123to00001111( c, cs ); \
  stbir__simdf8_mult_mem( tot0, c, decode - 1 ); \
  stbir__simdf8_0123to22223333( c, cs ); \
  stbir__simdf8_mult_mem( tot1, c, decode+6 - 1 );

#define stbir__4_coeff_continue_from_4( ofs ) \
  STBIR_SIMD_NO_UNROLL(decode); \
  stbir__simdf8_load4b( cs, hc + (ofs) ); \
  stbir__simdf8_0123to00001111( c, cs ); \
  stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*3 - 1 ); \
  stbir__simdf8_0123to22223333( c, cs ); \
  stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*3 + 6 - 1 );

#define stbir__1_coeff_remnant( ofs ) \
  STBIR_SIMD_NO_UNROLL(decode); \
  stbir__simdf_load1rep4( t, hc + (ofs) ); \
  stbir__simdf8_madd_mem4( tot0, tot0, t, decode+(ofs)*3 - 1 );

// the coefficient load starts two taps early (hc + ofs - 2) and then uses lanes 2 and 3 of it
#define stbir__2_coeff_remnant( ofs ) \
  STBIR_SIMD_NO_UNROLL(decode); \
  stbir__simdf8_load4b( cs, hc + (ofs) - 2 ); \
  stbir__simdf8_0123to22223333( c, cs ); \
  stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*3 - 1 );

#define stbir__3_coeff_remnant( ofs ) \
  STBIR_SIMD_NO_UNROLL(decode); \
  stbir__simdf8_load4b( cs, hc + (ofs) ); \
  stbir__simdf8_0123to00001111( c, cs ); \
  stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*3 - 1 ); \
  stbir__simdf8_0123to2222( t, cs ); \
  stbir__simdf8_madd_mem4( tot1, tot1, t, decode+(ofs)*3 + 6 - 1 );

// output is advanced first; when more output follows, a full vector is stored at the previous
// position and the loop continues, otherwise only three floats are written and the loop is left.
// NOTE(review): presumably the full store writes 4 floats, i.e. one float into the next
// pixel's slot, which that pixel's own store later replaces - confirm.
#define stbir__store_output() \
  stbir__simdf8_add( tot0, tot0, tot1 ); \
  stbir__simdf_0123to1230( t, stbir__if_simdf8_cast_to_simdf4( tot0 ) ); \
  stbir__simdf8_add4halves( t, t, tot0 ); \
  horizontal_coefficients += coefficient_width; \
  ++horizontal_contributors; \
  output += 3; \
  if ( output < output_end ) \
  { \
  stbir__simdf_store( output-3, t ); \
  continue; \
  } \
  { stbir__simdf tt; stbir__simdf_0123to2301( tt, t ); \
  stbir__simdf_store2( output-3, t ); \
  stbir__simdf_store1( output+2-3, tt ); } \
  break;


#else

// 4-wide variant: three accumulators, each covering 4 consecutive floats of the 12 floats
// that four taps span (decode+0, decode+4, decode+8)

#define stbir__4_coeff_start() \
  stbir__simdf tot0,tot1,tot2,c,cs; \
  STBIR_SIMD_NO_UNROLL(decode); \
  stbir__simdf_load( cs, hc ); \
  stbir__simdf_0123to0001( c, cs ); \
  stbir__simdf_mult_mem( tot0, c, decode ); \
  stbir__simdf_0123to1122( c, cs ); \
  stbir__simdf_mult_mem( tot1, c, decode+4 ); \
  stbir__simdf_0123to2333( c, cs ); \
  stbir__simdf_mult_mem( tot2, c, decode+8 );

#define stbir__4_coeff_continue_from_4( ofs ) \
  STBIR_SIMD_NO_UNROLL(decode); \
  stbir__simdf_load( cs, hc + (ofs) ); \
  stbir__simdf_0123to0001( c, cs ); \
  stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 ); \
  stbir__simdf_0123to1122( c, cs ); \
  stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*3+4 ); \
  stbir__simdf_0123to2333( c, cs ); \
  stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*3+8 );

#define stbir__1_coeff_remnant( ofs ) \
  STBIR_SIMD_NO_UNROLL(decode); \
  stbir__simdf_load1z( c, hc + (ofs) ); \
  stbir__simdf_0123to0001( c, c ); \
  stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 );

#define stbir__2_coeff_remnant( ofs ) \
  { stbir__simdf d; \
  STBIR_SIMD_NO_UNROLL(decode); \
  stbir__simdf_load2z( cs, hc + (ofs) ); \
  stbir__simdf_0123to0001( c, cs ); \
  stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 ); \
  stbir__simdf_0123to1122( c, cs ); \
  stbir__simdf_load2z( d, decode+(ofs)*3+4 ); \
  stbir__simdf_madd( tot1, tot1, c, d ); }

#define stbir__3_coeff_remnant( ofs ) \
  { stbir__simdf d; \
  STBIR_SIMD_NO_UNROLL(decode); \
  stbir__simdf_load( cs, hc + (ofs) ); \
  stbir__simdf_0123to0001( c, cs ); \
  stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 ); \
  stbir__simdf_0123to1122( c, cs ); \
  stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*3+4 ); \
  stbir__simdf_0123to2222( c, cs ); \
  stbir__simdf_load1z( d, decode+(ofs)*3+8 ); \
  stbir__simdf_madd( tot2, tot2, c, d ); }

// combines the three accumulators, then follows the same store / continue / break pattern as
// the STBIR_SIMD8 variant: full vector store when more output follows, three floats on the last
#define stbir__store_output() \
  stbir__simdf_0123ABCDto3ABx( c, tot0, tot1 ); \
  stbir__simdf_0123ABCDto23Ax( cs, tot1, tot2 ); \
  stbir__simdf_0123to1230( tot2, tot2 ); \
  stbir__simdf_add( tot0, tot0, cs ); \
  stbir__simdf_add( c, c, tot2 ); \
  stbir__simdf_add( tot0, tot0, c ); \
  horizontal_coefficients += coefficient_width; \
  ++horizontal_contributors; \
  output += 3; \
  if ( output < output_end ) \
  { \
  stbir__simdf_store( output-3, tot0 ); \
  continue; \
  } \
  stbir__simdf_0123to2301( tot1, tot0 ); \
  stbir__simdf_store2( output-3, tot0 ); \
  stbir__simdf_store1( output+2-3, tot1 ); \
  break;

#endif
5232+
5233+#else
5234+
5235+#define stbir__1_coeff_only() /* 3-channel scalar: declare tot0..tot2 and set them to decode[0..2] * hc[0] */ \
5236+ float tot0, tot1, tot2, c; \
5237+ c = hc[0]; \
5238+ tot0 = decode[0]*c; \
5239+ tot1 = decode[1]*c; \
5240+ tot2 = decode[2]*c;
5241+
5242+#define stbir__2_coeff_only() /* 3-channel scalar: declare tot0..tot2 as the weighted sum of two input pixels (hc[0], hc[1]) */ \
5243+ float tot0, tot1, tot2, c; \
5244+ c = hc[0]; \
5245+ tot0 = decode[0]*c; \
5246+ tot1 = decode[1]*c; \
5247+ tot2 = decode[2]*c; \
5248+ c = hc[1]; \
5249+ tot0 += decode[3]*c; \
5250+ tot1 += decode[4]*c; \
5251+ tot2 += decode[5]*c;
5252+
5253+#define stbir__3_coeff_only() /* 3-channel scalar: declare tot0..tot2 as the weighted sum of three input pixels (hc[0..2]) */ \
5254+ float tot0, tot1, tot2, c; \
5255+ c = hc[0]; \
5256+ tot0 = decode[0]*c; \
5257+ tot1 = decode[1]*c; \
5258+ tot2 = decode[2]*c; \
5259+ c = hc[1]; \
5260+ tot0 += decode[3]*c; \
5261+ tot1 += decode[4]*c; \
5262+ tot2 += decode[5]*c; \
5263+ c = hc[2]; \
5264+ tot0 += decode[6]*c; \
5265+ tot1 += decode[7]*c; \
5266+ tot2 += decode[8]*c;
5267+
5268+#define stbir__store_output_tiny() /* 3-channel scalar: write tot0..tot2 (from a stbir__N_coeff_only macro) and advance to the next output pixel */ \
5269+ output[0] = tot0; \
5270+ output[1] = tot1; \
5271+ output[2] = tot2; \
5272+ horizontal_coefficients += coefficient_width; \
5273+ ++horizontal_contributors; \
5274+ output += 3;
5275+
5276+#define stbir__4_coeff_start() /* 3-channel scalar: declare four independent accumulator sets (tota..totd), one per coefficient hc[0..3] */ \
5277+ float tota0,tota1,tota2,totb0,totb1,totb2,totc0,totc1,totc2,totd0,totd1,totd2,c; \
5278+ c = hc[0]; \
5279+ tota0 = decode[0]*c; \
5280+ tota1 = decode[1]*c; \
5281+ tota2 = decode[2]*c; \
5282+ c = hc[1]; \
5283+ totb0 = decode[3]*c; \
5284+ totb1 = decode[4]*c; \
5285+ totb2 = decode[5]*c; \
5286+ c = hc[2]; \
5287+ totc0 = decode[6]*c; \
5288+ totc1 = decode[7]*c; \
5289+ totc2 = decode[8]*c; \
5290+ c = hc[3]; \
5291+ totd0 = decode[9]*c; \
5292+ totd1 = decode[10]*c; \
5293+ totd2 = decode[11]*c;
5294+
5295+#define stbir__4_coeff_continue_from_4( ofs ) /* 3-channel scalar: add coefficients hc[ofs..ofs+3] into tota..totd (declared by stbir__4_coeff_start) */ \
5296+ c = hc[0+(ofs)]; \
5297+ tota0 += decode[0+(ofs)*3]*c; \
5298+ tota1 += decode[1+(ofs)*3]*c; \
5299+ tota2 += decode[2+(ofs)*3]*c; \
5300+ c = hc[1+(ofs)]; \
5301+ totb0 += decode[3+(ofs)*3]*c; \
5302+ totb1 += decode[4+(ofs)*3]*c; \
5303+ totb2 += decode[5+(ofs)*3]*c; \
5304+ c = hc[2+(ofs)]; \
5305+ totc0 += decode[6+(ofs)*3]*c; \
5306+ totc1 += decode[7+(ofs)*3]*c; \
5307+ totc2 += decode[8+(ofs)*3]*c; \
5308+ c = hc[3+(ofs)]; \
5309+ totd0 += decode[9+(ofs)*3]*c; \
5310+ totd1 += decode[10+(ofs)*3]*c; \
5311+ totd2 += decode[11+(ofs)*3]*c;
5312+
5313+#define stbir__1_coeff_remnant( ofs ) /* 3-channel scalar: add one trailing coefficient hc[ofs] into tota */ \
5314+ c = hc[0+(ofs)]; \
5315+ tota0 += decode[0+(ofs)*3]*c; \
5316+ tota1 += decode[1+(ofs)*3]*c; \
5317+ tota2 += decode[2+(ofs)*3]*c;
5318+
5319+#define stbir__2_coeff_remnant( ofs ) /* 3-channel scalar: add two trailing coefficients into tota and totb */ \
5320+ c = hc[0+(ofs)]; \
5321+ tota0 += decode[0+(ofs)*3]*c; \
5322+ tota1 += decode[1+(ofs)*3]*c; \
5323+ tota2 += decode[2+(ofs)*3]*c; \
5324+ c = hc[1+(ofs)]; \
5325+ totb0 += decode[3+(ofs)*3]*c; \
5326+ totb1 += decode[4+(ofs)*3]*c; \
5327+ totb2 += decode[5+(ofs)*3]*c; /* NOTE: the trailing line continuation splices the following blank line into this macro; keep that blank line */ \
5328+
5329+#define stbir__3_coeff_remnant( ofs ) /* 3-channel scalar: add three trailing coefficients into tota, totb and totc */ \
5330+ c = hc[0+(ofs)]; \
5331+ tota0 += decode[0+(ofs)*3]*c; \
5332+ tota1 += decode[1+(ofs)*3]*c; \
5333+ tota2 += decode[2+(ofs)*3]*c; \
5334+ c = hc[1+(ofs)]; \
5335+ totb0 += decode[3+(ofs)*3]*c; \
5336+ totb1 += decode[4+(ofs)*3]*c; \
5337+ totb2 += decode[5+(ofs)*3]*c; \
5338+ c = hc[2+(ofs)]; \
5339+ totc0 += decode[6+(ofs)*3]*c; \
5340+ totc1 += decode[7+(ofs)*3]*c; \
5341+ totc2 += decode[8+(ofs)*3]*c;
5342+
5343+#define stbir__store_output() /* 3-channel scalar: combine the four accumulator sets as (a+c)+(b+d), write the pixel, advance to the next one */ \
5344+ output[0] = (tota0+totc0)+(totb0+totd0); \
5345+ output[1] = (tota1+totc1)+(totb1+totd1); \
5346+ output[2] = (tota2+totc2)+(totb2+totd2); \
5347+ horizontal_coefficients += coefficient_width; \
5348+ ++horizontal_contributors; \
5349+ output += 3;
5350+
5351+#endif
5352+
5353+#define STBIR__horizontal_channels 3 // channel count for this instantiation
5354+#define STB_IMAGE_RESIZE_DO_HORIZONTALS // NOTE(review): presumably selects the horizontal-routine section of the included file - confirm
5355+#include STBIR__HEADER_FILENAME // computed include; presumably expands the stbir__*_coeff_* / stbir__store_output* macros defined above
5356+
5357+//=================
5358+// Do 4 channel horizontal routines
5359+
5360+#ifdef STBIR_SIMD
5361+
5362+#define stbir__1_coeff_only() /* 4-channel SIMD: declare tot as the pixel at decode scaled by the single coefficient hc[0] */ \
5363+ stbir__simdf tot,c; \
5364+ STBIR_SIMD_NO_UNROLL(decode); \
5365+ stbir__simdf_load1( c, hc ); \
5366+ stbir__simdf_0123to0000( c, c ); /* replicate the coefficient across the lanes (per the shuffle name) */ \
5367+ stbir__simdf_mult_mem( tot, c, decode );
5368+
5369+#define stbir__2_coeff_only() /* 4-channel SIMD: declare tot as the weighted sum of the pixels at decode and decode+4 */ \
5370+ stbir__simdf tot,c,cs; \
5371+ STBIR_SIMD_NO_UNROLL(decode); \
5372+ stbir__simdf_load2( cs, hc ); \
5373+ stbir__simdf_0123to0000( c, cs ); \
5374+ stbir__simdf_mult_mem( tot, c, decode ); \
5375+ stbir__simdf_0123to1111( c, cs ); \
5376+ stbir__simdf_madd_mem( tot, tot, c, decode+4 );
5377+
5378+#define stbir__3_coeff_only() /* 4-channel SIMD: declare tot as the weighted sum of the pixels at decode, decode+4 and decode+8 */ \
5379+ stbir__simdf tot,c,cs; \
5380+ STBIR_SIMD_NO_UNROLL(decode); \
5381+ stbir__simdf_load( cs, hc ); /* full-vector load although only coefficient lanes 0..2 are used below */ \
5382+ stbir__simdf_0123to0000( c, cs ); \
5383+ stbir__simdf_mult_mem( tot, c, decode ); \
5384+ stbir__simdf_0123to1111( c, cs ); \
5385+ stbir__simdf_madd_mem( tot, tot, c, decode+4 ); \
5386+ stbir__simdf_0123to2222( c, cs ); \
5387+ stbir__simdf_madd_mem( tot, tot, c, decode+8 );
5388+
5389+#define stbir__store_output_tiny() /* 4-channel SIMD: store tot (from a stbir__N_coeff_only macro) and advance to the next output pixel */ \
5390+ stbir__simdf_store( output, tot ); \
5391+ horizontal_coefficients += coefficient_width; \
5392+ ++horizontal_contributors; \
5393+ output += 4;
5394+
5395+#ifdef STBIR_SIMD8
5396+
5397+#define stbir__4_coeff_start() /* 4-channel SIMD8: declare tot0 holding two partial pixel sums (coefficients 0,2 in one half and 1,3 in the other, per the shuffle names) */ \
5398+ stbir__simdf8 tot0,c,cs; stbir__simdf t; \
5399+ STBIR_SIMD_NO_UNROLL(decode); \
5400+ stbir__simdf8_load4b( cs, hc ); \
5401+ stbir__simdf8_0123to00001111( c, cs ); \
5402+ stbir__simdf8_mult_mem( tot0, c, decode ); \
5403+ stbir__simdf8_0123to22223333( c, cs ); \
5404+ stbir__simdf8_madd_mem( tot0, tot0, c, decode+8 );
5405+
5406+#define stbir__4_coeff_continue_from_4( ofs ) /* 4-channel SIMD8: accumulate the 4 coefficients at hc+(ofs) against decode+(ofs)*4 into tot0 */ \
5407+ STBIR_SIMD_NO_UNROLL(decode); \
5408+ stbir__simdf8_load4b( cs, hc + (ofs) ); \
5409+ stbir__simdf8_0123to00001111( c, cs ); \
5410+ stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4 ); \
5411+ stbir__simdf8_0123to22223333( c, cs ); \
5412+ stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4+8 );
5413+
5414+#define stbir__1_coeff_remnant( ofs ) /* 4-channel SIMD8: accumulate one trailing coefficient hc[ofs] using the 4-wide temporary t */ \
5415+ STBIR_SIMD_NO_UNROLL(decode); \
5416+ stbir__simdf_load1rep4( t, hc + (ofs) ); \
5417+ stbir__simdf8_madd_mem4( tot0, tot0, t, decode+(ofs)*4 );
5418+
5419+#define stbir__2_coeff_remnant( ofs ) /* 4-channel SIMD8: accumulate two trailing coefficients hc[ofs], hc[ofs+1] into tot0 */ \
5420+ STBIR_SIMD_NO_UNROLL(decode); \
5421+ stbir__simdf8_load4b( cs, hc + (ofs) - 2 ); /* load starts two coefficients early so the wanted pair sits in lanes 2,3; NOTE(review): presumably avoids reading past the coefficient list */ \
5422+ stbir__simdf8_0123to22223333( c, cs ); \
5423+ stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4 );
5424+
5425+ #define stbir__3_coeff_remnant( ofs ) /* 4-channel SIMD8: accumulate three trailing coefficients; the third goes through the 4-wide temporary t */ \
5426+ STBIR_SIMD_NO_UNROLL(decode); \
5427+ stbir__simdf8_load4b( cs, hc + (ofs) ); \
5428+ stbir__simdf8_0123to00001111( c, cs ); \
5429+ stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4 ); \
5430+ stbir__simdf8_0123to2222( t, cs ); \
5431+ stbir__simdf8_madd_mem4( tot0, tot0, t, decode+(ofs)*4+8 );
5432+
5433+#define stbir__store_output() /* 4-channel SIMD8: add the two halves of tot0 into t, store the pixel, advance to the next one */ \
5434+ stbir__simdf8_add4halves( t, stbir__if_simdf8_cast_to_simdf4(tot0), tot0 ); \
5435+ stbir__simdf_store( output, t ); \
5436+ horizontal_coefficients += coefficient_width; \
5437+ ++horizontal_contributors; \
5438+ output += 4;
5439+
5440+#else
5441+
5442+#define stbir__4_coeff_start() /* 4-channel SIMD: declare two accumulators; tot0 takes coefficients 0 and 2, tot1 takes coefficients 1 and 3 */ \
5443+ stbir__simdf tot0,tot1,c,cs; \
5444+ STBIR_SIMD_NO_UNROLL(decode); \
5445+ stbir__simdf_load( cs, hc ); \
5446+ stbir__simdf_0123to0000( c, cs ); \
5447+ stbir__simdf_mult_mem( tot0, c, decode ); \
5448+ stbir__simdf_0123to1111( c, cs ); \
5449+ stbir__simdf_mult_mem( tot1, c, decode+4 ); \
5450+ stbir__simdf_0123to2222( c, cs ); \
5451+ stbir__simdf_madd_mem( tot0, tot0, c, decode+8 ); \
5452+ stbir__simdf_0123to3333( c, cs ); \
5453+ stbir__simdf_madd_mem( tot1, tot1, c, decode+12 );
5454+
5455+#define stbir__4_coeff_continue_from_4( ofs ) /* 4-channel SIMD: accumulate the 4 coefficients at hc+(ofs), alternating between tot0 and tot1 */ \
5456+ STBIR_SIMD_NO_UNROLL(decode); \
5457+ stbir__simdf_load( cs, hc + (ofs) ); \
5458+ stbir__simdf_0123to0000( c, cs ); \
5459+ stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 ); \
5460+ stbir__simdf_0123to1111( c, cs ); \
5461+ stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+4 ); \
5462+ stbir__simdf_0123to2222( c, cs ); \
5463+ stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4+8 ); \
5464+ stbir__simdf_0123to3333( c, cs ); \
5465+ stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+12 );
5466+
5467+#define stbir__1_coeff_remnant( ofs ) /* 4-channel SIMD: accumulate one trailing coefficient hc[ofs] into tot0 */ \
5468+ STBIR_SIMD_NO_UNROLL(decode); \
5469+ stbir__simdf_load1( c, hc + (ofs) ); \
5470+ stbir__simdf_0123to0000( c, c ); \
5471+ stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 );
5472+
5473+#define stbir__2_coeff_remnant( ofs ) /* 4-channel SIMD: accumulate two trailing coefficients into tot0 and tot1 */ \
5474+ STBIR_SIMD_NO_UNROLL(decode); \
5475+ stbir__simdf_load2( cs, hc + (ofs) ); \
5476+ stbir__simdf_0123to0000( c, cs ); \
5477+ stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 ); \
5478+ stbir__simdf_0123to1111( c, cs ); \
5479+ stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+4 );
5480+
5481+#define stbir__3_coeff_remnant( ofs ) /* 4-channel SIMD: accumulate three trailing coefficients (tot0, tot1, tot0) */ \
5482+ STBIR_SIMD_NO_UNROLL(decode); \
5483+ stbir__simdf_load( cs, hc + (ofs) ); /* full-vector load although only coefficient lanes 0..2 are used below */ \
5484+ stbir__simdf_0123to0000( c, cs ); \
5485+ stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 ); \
5486+ stbir__simdf_0123to1111( c, cs ); \
5487+ stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+4 ); \
5488+ stbir__simdf_0123to2222( c, cs ); \
5489+ stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4+8 );
5490+
5491+#define stbir__store_output() /* 4-channel SIMD: sum the two accumulators, store the pixel, advance to the next one */ \
5492+ stbir__simdf_add( tot0, tot0, tot1 ); \
5493+ stbir__simdf_store( output, tot0 ); \
5494+ horizontal_coefficients += coefficient_width; \
5495+ ++horizontal_contributors; \
5496+ output += 4;
5497+
5498+#endif
5499+
5500+#else
5501+
5502+#define stbir__1_coeff_only() /* 4-channel scalar: declare p0..p3 and set them to decode[0..3] * hc[0] */ \
5503+ float p0,p1,p2,p3,c; \
5504+ STBIR_SIMD_NO_UNROLL(decode); \
5505+ c = hc[0]; \
5506+ p0 = decode[0] * c; \
5507+ p1 = decode[1] * c; \
5508+ p2 = decode[2] * c; \
5509+ p3 = decode[3] * c;
5510+
5511+#define stbir__2_coeff_only() /* 4-channel scalar: declare p0..p3 as the weighted sum of two input pixels (hc[0], hc[1]) */ \
5512+ float p0,p1,p2,p3,c; \
5513+ STBIR_SIMD_NO_UNROLL(decode); \
5514+ c = hc[0]; \
5515+ p0 = decode[0] * c; \
5516+ p1 = decode[1] * c; \
5517+ p2 = decode[2] * c; \
5518+ p3 = decode[3] * c; \
5519+ c = hc[1]; \
5520+ p0 += decode[4] * c; \
5521+ p1 += decode[5] * c; \
5522+ p2 += decode[6] * c; \
5523+ p3 += decode[7] * c;
5524+
5525+#define stbir__3_coeff_only() /* 4-channel scalar: declare p0..p3 as the weighted sum of three input pixels (hc[0..2]) */ \
5526+ float p0,p1,p2,p3,c; \
5527+ STBIR_SIMD_NO_UNROLL(decode); \
5528+ c = hc[0]; \
5529+ p0 = decode[0] * c; \
5530+ p1 = decode[1] * c; \
5531+ p2 = decode[2] * c; \
5532+ p3 = decode[3] * c; \
5533+ c = hc[1]; \
5534+ p0 += decode[4] * c; \
5535+ p1 += decode[5] * c; \
5536+ p2 += decode[6] * c; \
5537+ p3 += decode[7] * c; \
5538+ c = hc[2]; \
5539+ p0 += decode[8] * c; \
5540+ p1 += decode[9] * c; \
5541+ p2 += decode[10] * c; \
5542+ p3 += decode[11] * c;
5543+
5544+#define stbir__store_output_tiny() /* 4-channel scalar: write p0..p3 (from a stbir__N_coeff_only macro) and advance to the next output pixel */ \
5545+ output[0] = p0; \
5546+ output[1] = p1; \
5547+ output[2] = p2; \
5548+ output[3] = p3; \
5549+ horizontal_coefficients += coefficient_width; \
5550+ ++horizontal_contributors; \
5551+ output += 4;
5552+
5553+#define stbir__4_coeff_start() /* 4-channel scalar: declare two accumulator sets; x takes coefficients 0 and 2, y takes coefficients 1 and 3 */ \
5554+ float x0,x1,x2,x3,y0,y1,y2,y3,c; \
5555+ STBIR_SIMD_NO_UNROLL(decode); \
5556+ c = hc[0]; \
5557+ x0 = decode[0] * c; \
5558+ x1 = decode[1] * c; \
5559+ x2 = decode[2] * c; \
5560+ x3 = decode[3] * c; \
5561+ c = hc[1]; \
5562+ y0 = decode[4] * c; \
5563+ y1 = decode[5] * c; \
5564+ y2 = decode[6] * c; \
5565+ y3 = decode[7] * c; \
5566+ c = hc[2]; \
5567+ x0 += decode[8] * c; \
5568+ x1 += decode[9] * c; \
5569+ x2 += decode[10] * c; \
5570+ x3 += decode[11] * c; \
5571+ c = hc[3]; \
5572+ y0 += decode[12] * c; \
5573+ y1 += decode[13] * c; \
5574+ y2 += decode[14] * c; \
5575+ y3 += decode[15] * c;
5576+
5577+#define stbir__4_coeff_continue_from_4( ofs ) /* 4-channel scalar: add coefficients hc[ofs..ofs+3], alternating between the x and y accumulators */ \
5578+ STBIR_SIMD_NO_UNROLL(decode); \
5579+ c = hc[0+(ofs)]; \
5580+ x0 += decode[0+(ofs)*4] * c; \
5581+ x1 += decode[1+(ofs)*4] * c; \
5582+ x2 += decode[2+(ofs)*4] * c; \
5583+ x3 += decode[3+(ofs)*4] * c; \
5584+ c = hc[1+(ofs)]; \
5585+ y0 += decode[4+(ofs)*4] * c; \
5586+ y1 += decode[5+(ofs)*4] * c; \
5587+ y2 += decode[6+(ofs)*4] * c; \
5588+ y3 += decode[7+(ofs)*4] * c; \
5589+ c = hc[2+(ofs)]; \
5590+ x0 += decode[8+(ofs)*4] * c; \
5591+ x1 += decode[9+(ofs)*4] * c; \
5592+ x2 += decode[10+(ofs)*4] * c; \
5593+ x3 += decode[11+(ofs)*4] * c; \
5594+ c = hc[3+(ofs)]; \
5595+ y0 += decode[12+(ofs)*4] * c; \
5596+ y1 += decode[13+(ofs)*4] * c; \
5597+ y2 += decode[14+(ofs)*4] * c; \
5598+ y3 += decode[15+(ofs)*4] * c;
5599+
5600+#define stbir__1_coeff_remnant( ofs ) /* 4-channel scalar: add one trailing coefficient hc[ofs] into x */ \
5601+ STBIR_SIMD_NO_UNROLL(decode); \
5602+ c = hc[0+(ofs)]; \
5603+ x0 += decode[0+(ofs)*4] * c; \
5604+ x1 += decode[1+(ofs)*4] * c; \
5605+ x2 += decode[2+(ofs)*4] * c; \
5606+ x3 += decode[3+(ofs)*4] * c;
5607+
5608+#define stbir__2_coeff_remnant( ofs ) /* 4-channel scalar: add two trailing coefficients into x and y */ \
5609+ STBIR_SIMD_NO_UNROLL(decode); \
5610+ c = hc[0+(ofs)]; \
5611+ x0 += decode[0+(ofs)*4] * c; \
5612+ x1 += decode[1+(ofs)*4] * c; \
5613+ x2 += decode[2+(ofs)*4] * c; \
5614+ x3 += decode[3+(ofs)*4] * c; \
5615+ c = hc[1+(ofs)]; \
5616+ y0 += decode[4+(ofs)*4] * c; \
5617+ y1 += decode[5+(ofs)*4] * c; \
5618+ y2 += decode[6+(ofs)*4] * c; \
5619+ y3 += decode[7+(ofs)*4] * c;
5620+
5621+#define stbir__3_coeff_remnant( ofs ) /* 4-channel scalar: add three trailing coefficients (x, y, x) */ \
5622+ STBIR_SIMD_NO_UNROLL(decode); \
5623+ c = hc[0+(ofs)]; \
5624+ x0 += decode[0+(ofs)*4] * c; \
5625+ x1 += decode[1+(ofs)*4] * c; \
5626+ x2 += decode[2+(ofs)*4] * c; \
5627+ x3 += decode[3+(ofs)*4] * c; \
5628+ c = hc[1+(ofs)]; \
5629+ y0 += decode[4+(ofs)*4] * c; \
5630+ y1 += decode[5+(ofs)*4] * c; \
5631+ y2 += decode[6+(ofs)*4] * c; \
5632+ y3 += decode[7+(ofs)*4] * c; \
5633+ c = hc[2+(ofs)]; \
5634+ x0 += decode[8+(ofs)*4] * c; \
5635+ x1 += decode[9+(ofs)*4] * c; \
5636+ x2 += decode[10+(ofs)*4] * c; \
5637+ x3 += decode[11+(ofs)*4] * c;
5638+
5639+#define stbir__store_output() /* 4-channel scalar: sum the x and y accumulators, write the pixel, advance to the next one */ \
5640+ output[0] = x0 + y0; \
5641+ output[1] = x1 + y1; \
5642+ output[2] = x2 + y2; \
5643+ output[3] = x3 + y3; \
5644+ horizontal_coefficients += coefficient_width; \
5645+ ++horizontal_contributors; \
5646+ output += 4;
5647+
5648+#endif
5649+
5650+#define STBIR__horizontal_channels 4 // channel count for this instantiation
5651+#define STB_IMAGE_RESIZE_DO_HORIZONTALS // NOTE(review): presumably selects the horizontal-routine section of the included file - confirm
5652+#include STBIR__HEADER_FILENAME // computed include; presumably expands the stbir__*_coeff_* / stbir__store_output* macros defined above
5653+
5654+
5655+
5656+//=================
5657+// Do 7 channel horizontal routines
5658+
5659+#ifdef STBIR_SIMD
5660+
5661+#define stbir__1_coeff_only() /* 7-channel SIMD: declare tot0 (from decode) and tot1 (from decode+3), both scaled by hc[0] */ \
5662+ stbir__simdf tot0,tot1,c; \
5663+ STBIR_SIMD_NO_UNROLL(decode); \
5664+ stbir__simdf_load1( c, hc ); \
5665+ stbir__simdf_0123to0000( c, c ); \
5666+ stbir__simdf_mult_mem( tot0, c, decode ); \
5667+ stbir__simdf_mult_mem( tot1, c, decode+3 ); /* starts at +3, not +4: assuming 4-float vectors the two halves overlap on channel 3 to cover 7 channels */
5668+
5669+#define stbir__2_coeff_only() /* 7-channel SIMD: declare tot0/tot1 as the weighted sum of the pixels at decode and decode+7 */ \
5670+ stbir__simdf tot0,tot1,c,cs; \
5671+ STBIR_SIMD_NO_UNROLL(decode); \
5672+ stbir__simdf_load2( cs, hc ); \
5673+ stbir__simdf_0123to0000( c, cs ); \
5674+ stbir__simdf_mult_mem( tot0, c, decode ); \
5675+ stbir__simdf_mult_mem( tot1, c, decode+3 ); \
5676+ stbir__simdf_0123to1111( c, cs ); \
5677+ stbir__simdf_madd_mem( tot0, tot0, c, decode+7 ); \
5678+ stbir__simdf_madd_mem( tot1, tot1, c,decode+10 );
5679+
5680+#define stbir__3_coeff_only() /* 7-channel SIMD: declare tot0/tot1 as the weighted sum of the pixels at decode, decode+7 and decode+14 */ \
5681+ stbir__simdf tot0,tot1,c,cs; \
5682+ STBIR_SIMD_NO_UNROLL(decode); \
5683+ stbir__simdf_load( cs, hc ); \
5684+ stbir__simdf_0123to0000( c, cs ); \
5685+ stbir__simdf_mult_mem( tot0, c, decode ); \
5686+ stbir__simdf_mult_mem( tot1, c, decode+3 ); \
5687+ stbir__simdf_0123to1111( c, cs ); \
5688+ stbir__simdf_madd_mem( tot0, tot0, c, decode+7 ); \
5689+ stbir__simdf_madd_mem( tot1, tot1, c, decode+10 ); \
5690+ stbir__simdf_0123to2222( c, cs ); \
5691+ stbir__simdf_madd_mem( tot0, tot0, c, decode+14 ); \
5692+ stbir__simdf_madd_mem( tot1, tot1, c, decode+17 );
5693+
5694+#define stbir__store_output_tiny() /* 7-channel SIMD: store tot1 at output+3 and tot0 at output, then advance to the next output pixel */ \
5695+ stbir__simdf_store( output+3, tot1 ); /* upper part first; the tot0 store below rewrites output[3] (assuming 4-float vectors) */ \
5696+ stbir__simdf_store( output, tot0 ); \
5697+ horizontal_coefficients += coefficient_width; \
5698+ ++horizontal_contributors; \
5699+ output += 7;
5700+
5701+#ifdef STBIR_SIMD8
5702+
5703+#define stbir__4_coeff_start() /* 7-channel SIMD8: declare two 8-wide accumulators; tot0 takes coefficients 0 and 2, tot1 takes coefficients 1 and 3 */ \
5704+ stbir__simdf8 tot0,tot1,c,cs; \
5705+ STBIR_SIMD_NO_UNROLL(decode); \
5706+ stbir__simdf8_load4b( cs, hc ); \
5707+ stbir__simdf8_0123to00000000( c, cs ); \
5708+ stbir__simdf8_mult_mem( tot0, c, decode ); \
5709+ stbir__simdf8_0123to11111111( c, cs ); \
5710+ stbir__simdf8_mult_mem( tot1, c, decode+7 ); \
5711+ stbir__simdf8_0123to22222222( c, cs ); \
5712+ stbir__simdf8_madd_mem( tot0, tot0, c, decode+14 ); \
5713+ stbir__simdf8_0123to33333333( c, cs ); \
5714+ stbir__simdf8_madd_mem( tot1, tot1, c, decode+21 );
5715+
5716+#define stbir__4_coeff_continue_from_4( ofs ) /* 7-channel SIMD8: accumulate the 4 coefficients at hc+(ofs), alternating between tot0 and tot1 */ \
5717+ STBIR_SIMD_NO_UNROLL(decode); \
5718+ stbir__simdf8_load4b( cs, hc + (ofs) ); \
5719+ stbir__simdf8_0123to00000000( c, cs ); \
5720+ stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 ); \
5721+ stbir__simdf8_0123to11111111( c, cs ); \
5722+ stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+7 ); \
5723+ stbir__simdf8_0123to22222222( c, cs ); \
5724+ stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 ); \
5725+ stbir__simdf8_0123to33333333( c, cs ); \
5726+ stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+21 );
5727+
5728+#define stbir__1_coeff_remnant( ofs ) /* 7-channel SIMD8: accumulate one trailing coefficient hc[ofs] into tot0 */ \
5729+ STBIR_SIMD_NO_UNROLL(decode); \
5730+ stbir__simdf8_load1b( c, hc + (ofs) ); \
5731+ stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 );
5732+
5733+#define stbir__2_coeff_remnant( ofs ) /* 7-channel SIMD8: accumulate two trailing coefficients, each loaded separately, into tot0 and tot1 */ \
5734+ STBIR_SIMD_NO_UNROLL(decode); \
5735+ stbir__simdf8_load1b( c, hc + (ofs) ); \
5736+ stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 ); \
5737+ stbir__simdf8_load1b( c, hc + (ofs)+1 ); \
5738+ stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+7 );
5739+
5740+#define stbir__3_coeff_remnant( ofs ) /* 7-channel SIMD8: accumulate three trailing coefficients (tot0, tot1, tot0) */ \
5741+ STBIR_SIMD_NO_UNROLL(decode); \
5742+ stbir__simdf8_load4b( cs, hc + (ofs) ); \
5743+ stbir__simdf8_0123to00000000( c, cs ); \
5744+ stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 ); \
5745+ stbir__simdf8_0123to11111111( c, cs ); \
5746+ stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+7 ); \
5747+ stbir__simdf8_0123to22222222( c, cs ); \
5748+ stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 );
5749+
5750+#define stbir__store_output() /* 7-channel SIMD8: sum the accumulators, advance to the next pixel, and store; contains continue/break, so it must expand inside a loop */ \
5751+ stbir__simdf8_add( tot0, tot0, tot1 ); \
5752+ horizontal_coefficients += coefficient_width; \
5753+ ++horizontal_contributors; \
5754+ output += 7; /* output is advanced before storing, hence the output-7 addresses below */ \
5755+ if ( output < output_end ) \
5756+ { \
5757+ stbir__simdf8_store( output-7, tot0 ); /* not the last pixel: single 8-wide store, then next loop iteration */ \
5758+ continue; \
5759+ } \
5760+ stbir__simdf_store( output-7+3, stbir__simdf_swiz(stbir__simdf8_gettop4(tot0),0,0,1,2) ); /* last pixel: two 4-wide stores at +3 and +0 instead of the 8-wide store */ \
5761+ stbir__simdf_store( output-7, stbir__if_simdf8_cast_to_simdf4(tot0) ); \
5762+ break;
5763+
5764+#else
5765+
5766+#define stbir__4_coeff_start() /* 7-channel SIMD: declare four accumulators; tot0/tot1 take coefficients 0 and 2, tot2/tot3 take coefficients 1 and 3 */ \
5767+ stbir__simdf tot0,tot1,tot2,tot3,c,cs; \
5768+ STBIR_SIMD_NO_UNROLL(decode); \
5769+ stbir__simdf_load( cs, hc ); \
5770+ stbir__simdf_0123to0000( c, cs ); \
5771+ stbir__simdf_mult_mem( tot0, c, decode ); \
5772+ stbir__simdf_mult_mem( tot1, c, decode+3 ); /* each pixel is read as two vectors at +0 and +3 */ \
5773+ stbir__simdf_0123to1111( c, cs ); \
5774+ stbir__simdf_mult_mem( tot2, c, decode+7 ); \
5775+ stbir__simdf_mult_mem( tot3, c, decode+10 ); \
5776+ stbir__simdf_0123to2222( c, cs ); \
5777+ stbir__simdf_madd_mem( tot0, tot0, c, decode+14 ); \
5778+ stbir__simdf_madd_mem( tot1, tot1, c, decode+17 ); \
5779+ stbir__simdf_0123to3333( c, cs ); \
5780+ stbir__simdf_madd_mem( tot2, tot2, c, decode+21 ); \
5781+ stbir__simdf_madd_mem( tot3, tot3, c, decode+24 );
5782+
5783+#define stbir__4_coeff_continue_from_4( ofs ) /* 7-channel SIMD: accumulate the 4 coefficients at hc+(ofs), alternating between tot0/tot1 and tot2/tot3 */ \
5784+ STBIR_SIMD_NO_UNROLL(decode); \
5785+ stbir__simdf_load( cs, hc + (ofs) ); \
5786+ stbir__simdf_0123to0000( c, cs ); \
5787+ stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7 ); \
5788+ stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+3 ); \
5789+ stbir__simdf_0123to1111( c, cs ); \
5790+ stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*7+7 ); \
5791+ stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+10 ); \
5792+ stbir__simdf_0123to2222( c, cs ); \
5793+ stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 ); \
5794+ stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+17 ); \
5795+ stbir__simdf_0123to3333( c, cs ); \
5796+ stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*7+21 ); \
5797+ stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+24 );
5798+
5799+#define stbir__1_coeff_remnant( ofs ) /* 7-channel SIMD: accumulate one trailing coefficient hc[ofs] into tot0/tot1 */ \
5800+ STBIR_SIMD_NO_UNROLL(decode); \
5801+ stbir__simdf_load1( c, hc + (ofs) ); \
5802+ stbir__simdf_0123to0000( c, c ); \
5803+ stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7 ); \
5804+ stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+3 ); /* NOTE: the trailing line continuation splices the following blank line into this macro; keep that blank line */ \
5805+
5806+#define stbir__2_coeff_remnant( ofs ) /* 7-channel SIMD: accumulate two trailing coefficients into tot0/tot1 and tot2/tot3 */ \
5807+ STBIR_SIMD_NO_UNROLL(decode); \
5808+ stbir__simdf_load2( cs, hc + (ofs) ); \
5809+ stbir__simdf_0123to0000( c, cs ); \
5810+ stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7 ); \
5811+ stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+3 ); \
5812+ stbir__simdf_0123to1111( c, cs ); \
5813+ stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*7+7 ); \
5814+ stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+10 );
5815+
5816+#define stbir__3_coeff_remnant( ofs ) /* 7-channel SIMD: accumulate three trailing coefficients (tot0/tot1, tot2/tot3, tot0/tot1) */ \
5817+ STBIR_SIMD_NO_UNROLL(decode); \
5818+ stbir__simdf_load( cs, hc + (ofs) ); \
5819+ stbir__simdf_0123to0000( c, cs ); \
5820+ stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7 ); \
5821+ stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+3 ); \
5822+ stbir__simdf_0123to1111( c, cs ); \
5823+ stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*7+7 ); \
5824+ stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+10 ); \
5825+ stbir__simdf_0123to2222( c, cs ); \
5826+ stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 ); \
5827+ stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+17 );
5828+
5829+#define stbir__store_output() /* 7-channel SIMD: fold tot2/tot3 into tot0/tot1, store the pixel, advance to the next one */ \
5830+ stbir__simdf_add( tot0, tot0, tot2 ); \
5831+ stbir__simdf_add( tot1, tot1, tot3 ); \
5832+ stbir__simdf_store( output+3, tot1 ); /* upper part first; the tot0 store below rewrites output[3] (assuming 4-float vectors) */ \
5833+ stbir__simdf_store( output, tot0 ); \
5834+ horizontal_coefficients += coefficient_width; \
5835+ ++horizontal_contributors; \
5836+ output += 7;
5837+
5838+#endif
5839+
5840+#else
5841+
5842+#define stbir__1_coeff_only() /* 7-channel scalar: declare tot0..tot6 and set them to decode[0..6] * hc[0] */ \
5843+ float tot0, tot1, tot2, tot3, tot4, tot5, tot6, c; \
5844+ c = hc[0]; \
5845+ tot0 = decode[0]*c; \
5846+ tot1 = decode[1]*c; \
5847+ tot2 = decode[2]*c; \
5848+ tot3 = decode[3]*c; \
5849+ tot4 = decode[4]*c; \
5850+ tot5 = decode[5]*c; \
5851+ tot6 = decode[6]*c;
5852+
5853+#define stbir__2_coeff_only() /* 7-channel scalar: declare tot0..tot6 as the weighted sum of two input pixels (hc[0], hc[1]) */ \
5854+ float tot0, tot1, tot2, tot3, tot4, tot5, tot6, c; \
5855+ c = hc[0]; \
5856+ tot0 = decode[0]*c; \
5857+ tot1 = decode[1]*c; \
5858+ tot2 = decode[2]*c; \
5859+ tot3 = decode[3]*c; \
5860+ tot4 = decode[4]*c; \
5861+ tot5 = decode[5]*c; \
5862+ tot6 = decode[6]*c; \
5863+ c = hc[1]; \
5864+ tot0 += decode[7]*c; \
5865+ tot1 += decode[8]*c; \
5866+ tot2 += decode[9]*c; \
5867+ tot3 += decode[10]*c; \
5868+ tot4 += decode[11]*c; \
5869+ tot5 += decode[12]*c; \
5870+ tot6 += decode[13]*c; /* NOTE: the trailing line continuation splices the following blank line into this macro; keep that blank line */ \
5871+
5872+#define stbir__3_coeff_only() /* 7-channel scalar: declare tot0..tot6 as the weighted sum of three input pixels (hc[0..2]) */ \
5873+ float tot0, tot1, tot2, tot3, tot4, tot5, tot6, c; \
5874+ c = hc[0]; \
5875+ tot0 = decode[0]*c; \
5876+ tot1 = decode[1]*c; \
5877+ tot2 = decode[2]*c; \
5878+ tot3 = decode[3]*c; \
5879+ tot4 = decode[4]*c; \
5880+ tot5 = decode[5]*c; \
5881+ tot6 = decode[6]*c; \
5882+ c = hc[1]; \
5883+ tot0 += decode[7]*c; \
5884+ tot1 += decode[8]*c; \
5885+ tot2 += decode[9]*c; \
5886+ tot3 += decode[10]*c; \
5887+ tot4 += decode[11]*c; \
5888+ tot5 += decode[12]*c; \
5889+ tot6 += decode[13]*c; \
5890+ c = hc[2]; \
5891+ tot0 += decode[14]*c; \
5892+ tot1 += decode[15]*c; \
5893+ tot2 += decode[16]*c; \
5894+ tot3 += decode[17]*c; \
5895+ tot4 += decode[18]*c; \
5896+ tot5 += decode[19]*c; \
5897+ tot6 += decode[20]*c; /* NOTE: the trailing line continuation splices the following blank line into this macro; keep that blank line */ \
5898+
5899+#define stbir__store_output_tiny() /* 7-channel scalar: write tot0..tot6 (from a stbir__N_coeff_only macro) and advance to the next output pixel */ \
5900+ output[0] = tot0; \
5901+ output[1] = tot1; \
5902+ output[2] = tot2; \
5903+ output[3] = tot3; \
5904+ output[4] = tot4; \
5905+ output[5] = tot5; \
5906+ output[6] = tot6; \
5907+ horizontal_coefficients += coefficient_width; \
5908+ ++horizontal_contributors; \
5909+ output += 7;
5910+
5911+#define stbir__4_coeff_start() /* 7-channel scalar: declare two accumulator sets; x takes coefficients 0 and 2, y takes coefficients 1 and 3 */ \
5912+ float x0,x1,x2,x3,x4,x5,x6,y0,y1,y2,y3,y4,y5,y6,c; \
5913+ STBIR_SIMD_NO_UNROLL(decode); \
5914+ c = hc[0]; \
5915+ x0 = decode[0] * c; \
5916+ x1 = decode[1] * c; \
5917+ x2 = decode[2] * c; \
5918+ x3 = decode[3] * c; \
5919+ x4 = decode[4] * c; \
5920+ x5 = decode[5] * c; \
5921+ x6 = decode[6] * c; \
5922+ c = hc[1]; \
5923+ y0 = decode[7] * c; \
5924+ y1 = decode[8] * c; \
5925+ y2 = decode[9] * c; \
5926+ y3 = decode[10] * c; \
5927+ y4 = decode[11] * c; \
5928+ y5 = decode[12] * c; \
5929+ y6 = decode[13] * c; \
5930+ c = hc[2]; \
5931+ x0 += decode[14] * c; \
5932+ x1 += decode[15] * c; \
5933+ x2 += decode[16] * c; \
5934+ x3 += decode[17] * c; \
5935+ x4 += decode[18] * c; \
5936+ x5 += decode[19] * c; \
5937+ x6 += decode[20] * c; \
5938+ c = hc[3]; \
5939+ y0 += decode[21] * c; \
5940+ y1 += decode[22] * c; \
5941+ y2 += decode[23] * c; \
5942+ y3 += decode[24] * c; \
5943+ y4 += decode[25] * c; \
5944+ y5 += decode[26] * c; \
5945+ y6 += decode[27] * c;
5946+
5947+#define stbir__4_coeff_continue_from_4( ofs ) /* 7-channel scalar: add coefficients hc[ofs..ofs+3], alternating between the x and y accumulators */ \
5948+ STBIR_SIMD_NO_UNROLL(decode); \
5949+ c = hc[0+(ofs)]; \
5950+ x0 += decode[0+(ofs)*7] * c; \
5951+ x1 += decode[1+(ofs)*7] * c; \
5952+ x2 += decode[2+(ofs)*7] * c; \
5953+ x3 += decode[3+(ofs)*7] * c; \
5954+ x4 += decode[4+(ofs)*7] * c; \
5955+ x5 += decode[5+(ofs)*7] * c; \
5956+ x6 += decode[6+(ofs)*7] * c; \
5957+ c = hc[1+(ofs)]; \
5958+ y0 += decode[7+(ofs)*7] * c; \
5959+ y1 += decode[8+(ofs)*7] * c; \
5960+ y2 += decode[9+(ofs)*7] * c; \
5961+ y3 += decode[10+(ofs)*7] * c; \
5962+ y4 += decode[11+(ofs)*7] * c; \
5963+ y5 += decode[12+(ofs)*7] * c; \
5964+ y6 += decode[13+(ofs)*7] * c; \
5965+ c = hc[2+(ofs)]; \
5966+ x0 += decode[14+(ofs)*7] * c; \
5967+ x1 += decode[15+(ofs)*7] * c; \
5968+ x2 += decode[16+(ofs)*7] * c; \
5969+ x3 += decode[17+(ofs)*7] * c; \
5970+ x4 += decode[18+(ofs)*7] * c; \
5971+ x5 += decode[19+(ofs)*7] * c; \
5972+ x6 += decode[20+(ofs)*7] * c; \
5973+ c = hc[3+(ofs)]; \
5974+ y0 += decode[21+(ofs)*7] * c; \
5975+ y1 += decode[22+(ofs)*7] * c; \
5976+ y2 += decode[23+(ofs)*7] * c; \
5977+ y3 += decode[24+(ofs)*7] * c; \
5978+ y4 += decode[25+(ofs)*7] * c; \
5979+ y5 += decode[26+(ofs)*7] * c; \
5980+ y6 += decode[27+(ofs)*7] * c;
5981+
5982+#define stbir__1_coeff_remnant( ofs ) /* 7-channel scalar: add one trailing coefficient hc[ofs] into x */ \
5983+ STBIR_SIMD_NO_UNROLL(decode); \
5984+ c = hc[0+(ofs)]; \
5985+ x0 += decode[0+(ofs)*7] * c; \
5986+ x1 += decode[1+(ofs)*7] * c; \
5987+ x2 += decode[2+(ofs)*7] * c; \
5988+ x3 += decode[3+(ofs)*7] * c; \
5989+ x4 += decode[4+(ofs)*7] * c; \
5990+ x5 += decode[5+(ofs)*7] * c; \
5991+ x6 += decode[6+(ofs)*7] * c; /* NOTE: the trailing line continuation splices the following blank line into this macro; keep that blank line */ \
5992+
5993+#define stbir__2_coeff_remnant( ofs ) /* 7-channel scalar: add two trailing coefficients into x and y */ \
5994+ STBIR_SIMD_NO_UNROLL(decode); \
5995+ c = hc[0+(ofs)]; \
5996+ x0 += decode[0+(ofs)*7] * c; \
5997+ x1 += decode[1+(ofs)*7] * c; \
5998+ x2 += decode[2+(ofs)*7] * c; \
5999+ x3 += decode[3+(ofs)*7] * c; \
6000+ x4 += decode[4+(ofs)*7] * c; \
6001+ x5 += decode[5+(ofs)*7] * c; \
6002+ x6 += decode[6+(ofs)*7] * c; \
6003+ c = hc[1+(ofs)]; \
6004+ y0 += decode[7+(ofs)*7] * c; \
6005+ y1 += decode[8+(ofs)*7] * c; \
6006+ y2 += decode[9+(ofs)*7] * c; \
6007+ y3 += decode[10+(ofs)*7] * c; \
6008+ y4 += decode[11+(ofs)*7] * c; \
6009+ y5 += decode[12+(ofs)*7] * c; \
6010+ y6 += decode[13+(ofs)*7] * c; /* NOTE: the trailing line continuation splices the following blank line into this macro; keep that blank line */ \
6011+
6012+#define stbir__3_coeff_remnant( ofs ) /* 7-channel scalar: add three trailing coefficients (x, y, x) */ \
6013+ STBIR_SIMD_NO_UNROLL(decode); \
6014+ c = hc[0+(ofs)]; \
6015+ x0 += decode[0+(ofs)*7] * c; \
6016+ x1 += decode[1+(ofs)*7] * c; \
6017+ x2 += decode[2+(ofs)*7] * c; \
6018+ x3 += decode[3+(ofs)*7] * c; \
6019+ x4 += decode[4+(ofs)*7] * c; \
6020+ x5 += decode[5+(ofs)*7] * c; \
6021+ x6 += decode[6+(ofs)*7] * c; \
6022+ c = hc[1+(ofs)]; \
6023+ y0 += decode[7+(ofs)*7] * c; \
6024+ y1 += decode[8+(ofs)*7] * c; \
6025+ y2 += decode[9+(ofs)*7] * c; \
6026+ y3 += decode[10+(ofs)*7] * c; \
6027+ y4 += decode[11+(ofs)*7] * c; \
6028+ y5 += decode[12+(ofs)*7] * c; \
6029+ y6 += decode[13+(ofs)*7] * c; \
6030+ c = hc[2+(ofs)]; \
6031+ x0 += decode[14+(ofs)*7] * c; \
6032+ x1 += decode[15+(ofs)*7] * c; \
6033+ x2 += decode[16+(ofs)*7] * c; \
6034+ x3 += decode[17+(ofs)*7] * c; \
6035+ x4 += decode[18+(ofs)*7] * c; \
6036+ x5 += decode[19+(ofs)*7] * c; \
6037+ x6 += decode[20+(ofs)*7] * c; /* NOTE: the trailing line continuation splices the following blank line into this macro; keep that blank line */ \
6038+
6039+#define stbir__store_output() /* 7-channel scalar: sum the x and y accumulators, write the pixel, advance to the next one */ \
6040+ output[0] = x0 + y0; \
6041+ output[1] = x1 + y1; \
6042+ output[2] = x2 + y2; \
6043+ output[3] = x3 + y3; \
6044+ output[4] = x4 + y4; \
6045+ output[5] = x5 + y5; \
6046+ output[6] = x6 + y6; \
6047+ horizontal_coefficients += coefficient_width; \
6048+ ++horizontal_contributors; \
6049+ output += 7;
6050+
6051+#endif
6052+
6053+#define STBIR__horizontal_channels 7 // channel count for this instantiation
6054+#define STB_IMAGE_RESIZE_DO_HORIZONTALS // NOTE(review): presumably selects the horizontal-routine section of the included file - confirm
6055+#include STBIR__HEADER_FILENAME // computed include; presumably expands the stbir__*_coeff_* / stbir__store_output* macros defined above
6056+
6057+
6058+// include all of the vertical resamplers (both scatter and gather versions): STBIR__HEADER_FILENAME is re-included for each value 1..8, first plain and then with STB_IMAGE_RESIZE_VERTICAL_CONTINUE defined
6059+// NOTE(review): the dispatch tables after this section name the results ..._with_N_coeffs and ..._with_N_coeffs_cont, so the value presumably selects the coefficient count and the CONTINUE define the _cont variant; the included section presumably #undefs these macros, since they are redefined here without an #undef - confirm
6060+#define STBIR__vertical_channels 1
6061+#define STB_IMAGE_RESIZE_DO_VERTICALS
6062+#include STBIR__HEADER_FILENAME
6063+
6064+#define STBIR__vertical_channels 1
6065+#define STB_IMAGE_RESIZE_DO_VERTICALS
6066+#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
6067+#include STBIR__HEADER_FILENAME
6068+
6069+#define STBIR__vertical_channels 2
6070+#define STB_IMAGE_RESIZE_DO_VERTICALS
6071+#include STBIR__HEADER_FILENAME
6072+
6073+#define STBIR__vertical_channels 2
6074+#define STB_IMAGE_RESIZE_DO_VERTICALS
6075+#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
6076+#include STBIR__HEADER_FILENAME
6077+
6078+#define STBIR__vertical_channels 3
6079+#define STB_IMAGE_RESIZE_DO_VERTICALS
6080+#include STBIR__HEADER_FILENAME
6081+
6082+#define STBIR__vertical_channels 3
6083+#define STB_IMAGE_RESIZE_DO_VERTICALS
6084+#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
6085+#include STBIR__HEADER_FILENAME
6086+
6087+#define STBIR__vertical_channels 4
6088+#define STB_IMAGE_RESIZE_DO_VERTICALS
6089+#include STBIR__HEADER_FILENAME
6090+
6091+#define STBIR__vertical_channels 4
6092+#define STB_IMAGE_RESIZE_DO_VERTICALS
6093+#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
6094+#include STBIR__HEADER_FILENAME
6095+
6096+#define STBIR__vertical_channels 5
6097+#define STB_IMAGE_RESIZE_DO_VERTICALS
6098+#include STBIR__HEADER_FILENAME
6099+
6100+#define STBIR__vertical_channels 5
6101+#define STB_IMAGE_RESIZE_DO_VERTICALS
6102+#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
6103+#include STBIR__HEADER_FILENAME
6104+
6105+#define STBIR__vertical_channels 6
6106+#define STB_IMAGE_RESIZE_DO_VERTICALS
6107+#include STBIR__HEADER_FILENAME
6108+
6109+#define STBIR__vertical_channels 6
6110+#define STB_IMAGE_RESIZE_DO_VERTICALS
6111+#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
6112+#include STBIR__HEADER_FILENAME
6113+
6114+#define STBIR__vertical_channels 7
6115+#define STB_IMAGE_RESIZE_DO_VERTICALS
6116+#include STBIR__HEADER_FILENAME
6117+
6118+#define STBIR__vertical_channels 7
6119+#define STB_IMAGE_RESIZE_DO_VERTICALS
6120+#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
6121+#include STBIR__HEADER_FILENAME
6122+
6123+#define STBIR__vertical_channels 8
6124+#define STB_IMAGE_RESIZE_DO_VERTICALS
6125+#include STBIR__HEADER_FILENAME
6126+
6127+#define STBIR__vertical_channels 8
6128+#define STB_IMAGE_RESIZE_DO_VERTICALS
6129+#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
6130+#include STBIR__HEADER_FILENAME
6131+
6132+typedef void STBIR_VERTICAL_GATHERFUNC( float * output, float const * coeffs, float const ** inputs, float const * input0_end ); // function type of the vertical gather routines: one output row, a coefficient list, and an array of input row pointers
6133+
6134+static STBIR_VERTICAL_GATHERFUNC * stbir__vertical_gathers[ 8 ] = // entry i is the gather routine "with (i+1) coeffs"; NOTE(review): presumably indexed by coefficient count minus one - confirm at the call site
6135+{
6136+ stbir__vertical_gather_with_1_coeffs,stbir__vertical_gather_with_2_coeffs,stbir__vertical_gather_with_3_coeffs,stbir__vertical_gather_with_4_coeffs,stbir__vertical_gather_with_5_coeffs,stbir__vertical_gather_with_6_coeffs,stbir__vertical_gather_with_7_coeffs,stbir__vertical_gather_with_8_coeffs
6137+};
6138+
6139+static STBIR_VERTICAL_GATHERFUNC * stbir__vertical_gathers_continues[ 8 ] =
6140+{
6141+ stbir__vertical_gather_with_1_coeffs_cont,stbir__vertical_gather_with_2_coeffs_cont,stbir__vertical_gather_with_3_coeffs_cont,stbir__vertical_gather_with_4_coeffs_cont,stbir__vertical_gather_with_5_coeffs_cont,stbir__vertical_gather_with_6_coeffs_cont,stbir__vertical_gather_with_7_coeffs_cont,stbir__vertical_gather_with_8_coeffs_cont
6142+};
6143+
6144+typedef void STBIR_VERTICAL_SCATTERFUNC( float ** outputs, float const * coeffs, float const * input, float const * input_end );
6145+
6146+static STBIR_VERTICAL_SCATTERFUNC * stbir__vertical_scatter_sets[ 8 ] =
6147+{
6148+ stbir__vertical_scatter_with_1_coeffs,stbir__vertical_scatter_with_2_coeffs,stbir__vertical_scatter_with_3_coeffs,stbir__vertical_scatter_with_4_coeffs,stbir__vertical_scatter_with_5_coeffs,stbir__vertical_scatter_with_6_coeffs,stbir__vertical_scatter_with_7_coeffs,stbir__vertical_scatter_with_8_coeffs
6149+};
6150+
6151+static STBIR_VERTICAL_SCATTERFUNC * stbir__vertical_scatter_blends[ 8 ] =
6152+{
6153+ stbir__vertical_scatter_with_1_coeffs_cont,stbir__vertical_scatter_with_2_coeffs_cont,stbir__vertical_scatter_with_3_coeffs_cont,stbir__vertical_scatter_with_4_coeffs_cont,stbir__vertical_scatter_with_5_coeffs_cont,stbir__vertical_scatter_with_6_coeffs_cont,stbir__vertical_scatter_with_7_coeffs_cont,stbir__vertical_scatter_with_8_coeffs_cont
6154+};
6155+
6156+
6157+static void stbir__encode_scanline( stbir__info const * stbir_info, void *output_buffer_data, float * encode_buffer, int row STBIR_ONLY_PROFILE_GET_SPLIT_INFO )
6158+{
6159+ int num_pixels = stbir_info->horizontal.scale_info.output_sub_size;
6160+ int channels = stbir_info->channels;
6161+ int width_times_channels = num_pixels * channels;
6162+ void * output_buffer;
6163+
6164+ // un-alpha weight if we need to
6165+ if ( stbir_info->alpha_unweight )
6166+ {
6167+ STBIR_PROFILE_START( unalpha );
6168+ stbir_info->alpha_unweight( encode_buffer, width_times_channels );
6169+ STBIR_PROFILE_END( unalpha );
6170+ }
6171+
6172+ // write directly into output by default
6173+ output_buffer = output_buffer_data;
6174+
6175+ // if we have an output callback, we first convert the decode buffer in place (and then hand that to the callback)
6176+ if ( stbir_info->out_pixels_cb )
6177+ output_buffer = encode_buffer;
6178+
6179+ STBIR_PROFILE_START( encode );
6180+ // convert into the output buffer
6181+ stbir_info->encode_pixels( output_buffer, width_times_channels, encode_buffer );
6182+ STBIR_PROFILE_END( encode );
6183+
6184+ // if we have an output callback, call it to send the data
6185+ if ( stbir_info->out_pixels_cb )
6186+ stbir_info->out_pixels_cb( output_buffer, num_pixels, row, stbir_info->user_data );
6187+}
6188+
6189+
6190+// Get the ring buffer pointer for an index
6191+static float* stbir__get_ring_buffer_entry(stbir__info const * stbir_info, stbir__per_split_info const * split_info, int index )
6192+{
6193+ STBIR_ASSERT( index < stbir_info->ring_buffer_num_entries );
6194+
6195+ #ifdef STBIR__SEPARATE_ALLOCATIONS
6196+ return split_info->ring_buffers[ index ];
6197+ #else
6198+ return (float*) ( ( (char*) split_info->ring_buffer ) + ( index * stbir_info->ring_buffer_length_bytes ) );
6199+ #endif
6200+}
6201+
6202+// Get the specified scan line from the ring buffer
6203+static float* stbir__get_ring_buffer_scanline(stbir__info const * stbir_info, stbir__per_split_info const * split_info, int get_scanline)
6204+{
6205+ int ring_buffer_index = (split_info->ring_buffer_begin_index + (get_scanline - split_info->ring_buffer_first_scanline)) % stbir_info->ring_buffer_num_entries;
6206+ return stbir__get_ring_buffer_entry( stbir_info, split_info, ring_buffer_index );
6207+}
6208+
6209+static void stbir__resample_horizontal_gather(stbir__info const * stbir_info, float* output_buffer, float const * input_buffer STBIR_ONLY_PROFILE_GET_SPLIT_INFO )
6210+{
6211+ float const * decode_buffer = input_buffer - ( stbir_info->scanline_extents.conservative.n0 * stbir_info->effective_channels );
6212+
6213+ STBIR_PROFILE_START( horizontal );
6214+ if ( ( stbir_info->horizontal.filter_enum == STBIR_FILTER_POINT_SAMPLE ) && ( stbir_info->horizontal.scale_info.scale == 1.0f ) )
6215+ STBIR_MEMCPY( output_buffer, input_buffer, stbir_info->horizontal.scale_info.output_sub_size * sizeof( float ) * stbir_info->effective_channels );
6216+ else
6217+ stbir_info->horizontal_gather_channels( output_buffer, stbir_info->horizontal.scale_info.output_sub_size, decode_buffer, stbir_info->horizontal.contributors, stbir_info->horizontal.coefficients, stbir_info->horizontal.coefficient_width );
6218+ STBIR_PROFILE_END( horizontal );
6219+}
6220+
6221+static void stbir__resample_vertical_gather(stbir__info const * stbir_info, stbir__per_split_info* split_info, int n, int contrib_n0, int contrib_n1, float const * vertical_coefficients )
6222+{
6223+ float* encode_buffer = split_info->vertical_buffer;
6224+ float* decode_buffer = split_info->decode_buffer;
6225+ int vertical_first = stbir_info->vertical_first;
6226+ int width = (vertical_first) ? ( stbir_info->scanline_extents.conservative.n1-stbir_info->scanline_extents.conservative.n0+1 ) : stbir_info->horizontal.scale_info.output_sub_size;
6227+ int width_times_channels = stbir_info->effective_channels * width;
6228+
6229+ STBIR_ASSERT( stbir_info->vertical.is_gather );
6230+
6231+ // loop over the contributing scanlines and scale into the buffer
6232+ STBIR_PROFILE_START( vertical );
6233+ {
6234+ int k = 0, total = contrib_n1 - contrib_n0 + 1;
6235+ STBIR_ASSERT( total > 0 );
6236+ do {
6237+ float const * inputs[8];
6238+ int i, cnt = total; if ( cnt > 8 ) cnt = 8;
6239+ for( i = 0 ; i < cnt ; i++ )
6240+ inputs[ i ] = stbir__get_ring_buffer_scanline(stbir_info, split_info, k+i+contrib_n0 );
6241+
6242+ // call the N scanlines at a time function (up to 8 scanlines of blending at once)
6243+ ((k==0)?stbir__vertical_gathers:stbir__vertical_gathers_continues)[cnt-1]( (vertical_first) ? decode_buffer : encode_buffer, vertical_coefficients + k, inputs, inputs[0] + width_times_channels );
6244+ k += cnt;
6245+ total -= cnt;
6246+ } while ( total );
6247+ }
6248+ STBIR_PROFILE_END( vertical );
6249+
6250+ if ( vertical_first )
6251+ {
6252+ // Now resample the gathered vertical data in the horizontal axis into the encode buffer
6253+ decode_buffer[ width_times_channels ] = 0.0f; // clear two over for horizontals with a remnant of 3
6254+ decode_buffer[ width_times_channels+1 ] = 0.0f;
6255+ stbir__resample_horizontal_gather(stbir_info, encode_buffer, decode_buffer STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
6256+ }
6257+
6258+ stbir__encode_scanline( stbir_info, ( (char *) stbir_info->output_data ) + ((size_t)n * (size_t)stbir_info->output_stride_bytes),
6259+ encode_buffer, n STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
6260+}
6261+
6262+static void stbir__decode_and_resample_for_vertical_gather_loop(stbir__info const * stbir_info, stbir__per_split_info* split_info, int n)
6263+{
6264+ int ring_buffer_index;
6265+ float* ring_buffer;
6266+
6267+ // Decode the nth scanline from the source image into the decode buffer.
6268+ stbir__decode_scanline( stbir_info, n, split_info->decode_buffer STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
6269+
6270+ // update new end scanline
6271+ split_info->ring_buffer_last_scanline = n;
6272+
6273+ // get ring buffer
6274+ ring_buffer_index = (split_info->ring_buffer_begin_index + (split_info->ring_buffer_last_scanline - split_info->ring_buffer_first_scanline)) % stbir_info->ring_buffer_num_entries;
6275+ ring_buffer = stbir__get_ring_buffer_entry(stbir_info, split_info, ring_buffer_index);
6276+
6277+ // Now resample it into the ring buffer.
6278+ stbir__resample_horizontal_gather( stbir_info, ring_buffer, split_info->decode_buffer STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
6279+
6280+ // Now it's sitting in the ring buffer ready to be used as source for the vertical sampling.
6281+}
6282+
6283+static void stbir__vertical_gather_loop( stbir__info const * stbir_info, stbir__per_split_info* split_info, int split_count )
6284+{
6285+ int y, start_output_y, end_output_y;
6286+ stbir__contributors* vertical_contributors = stbir_info->vertical.contributors;
6287+ float const * vertical_coefficients = stbir_info->vertical.coefficients;
6288+
6289+ STBIR_ASSERT( stbir_info->vertical.is_gather );
6290+
6291+ start_output_y = split_info->start_output_y;
6292+ end_output_y = split_info[split_count-1].end_output_y;
6293+
6294+ vertical_contributors += start_output_y;
6295+ vertical_coefficients += start_output_y * stbir_info->vertical.coefficient_width;
6296+
6297+ // initialize the ring buffer for gathering
6298+ split_info->ring_buffer_begin_index = 0;
6299+ split_info->ring_buffer_first_scanline = vertical_contributors->n0;
6300+ split_info->ring_buffer_last_scanline = split_info->ring_buffer_first_scanline - 1; // means "empty"
6301+
6302+ for (y = start_output_y; y < end_output_y; y++)
6303+ {
6304+ int in_first_scanline, in_last_scanline;
6305+
6306+ in_first_scanline = vertical_contributors->n0;
6307+ in_last_scanline = vertical_contributors->n1;
6308+
6309+ // make sure the indexing hasn't broken
6310+ STBIR_ASSERT( in_first_scanline >= split_info->ring_buffer_first_scanline );
6311+
6312+ // Load in new scanlines
6313+ while (in_last_scanline > split_info->ring_buffer_last_scanline)
6314+ {
6315+ STBIR_ASSERT( ( split_info->ring_buffer_last_scanline - split_info->ring_buffer_first_scanline + 1 ) <= stbir_info->ring_buffer_num_entries );
6316+
6317+ // make sure there was room in the ring buffer when we add new scanlines
6318+ if ( ( split_info->ring_buffer_last_scanline - split_info->ring_buffer_first_scanline + 1 ) == stbir_info->ring_buffer_num_entries )
6319+ {
6320+ split_info->ring_buffer_first_scanline++;
6321+ split_info->ring_buffer_begin_index++;
6322+ }
6323+
6324+ if ( stbir_info->vertical_first )
6325+ {
6326+ float * ring_buffer = stbir__get_ring_buffer_scanline( stbir_info, split_info, ++split_info->ring_buffer_last_scanline );
6327+ // Decode the nth scanline from the source image into the decode buffer.
6328+ stbir__decode_scanline( stbir_info, split_info->ring_buffer_last_scanline, ring_buffer STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
6329+ }
6330+ else
6331+ {
6332+ stbir__decode_and_resample_for_vertical_gather_loop(stbir_info, split_info, split_info->ring_buffer_last_scanline + 1);
6333+ }
6334+ }
6335+
6336+ // Now all buffers should be ready to write a row of vertical sampling, so do it.
6337+ stbir__resample_vertical_gather(stbir_info, split_info, y, in_first_scanline, in_last_scanline, vertical_coefficients );
6338+
6339+ ++vertical_contributors;
6340+ vertical_coefficients += stbir_info->vertical.coefficient_width;
6341+ }
6342+}
6343+
6344+#define STBIR__FLOAT_EMPTY_MARKER 3.0e+38F
6345+#define STBIR__FLOAT_BUFFER_IS_EMPTY(ptr) ((ptr)[0]==STBIR__FLOAT_EMPTY_MARKER)
6346+
6347+static void stbir__encode_first_scanline_from_scatter(stbir__info const * stbir_info, stbir__per_split_info* split_info)
6348+{
6349+ // evict a scanline out into the output buffer
6350+ float* ring_buffer_entry = stbir__get_ring_buffer_entry(stbir_info, split_info, split_info->ring_buffer_begin_index );
6351+
6352+ // dump the scanline out
6353+ stbir__encode_scanline( stbir_info, ( (char *)stbir_info->output_data ) + ( (size_t)split_info->ring_buffer_first_scanline * (size_t)stbir_info->output_stride_bytes ), ring_buffer_entry, split_info->ring_buffer_first_scanline STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
6354+
6355+ // mark it as empty
6356+ ring_buffer_entry[ 0 ] = STBIR__FLOAT_EMPTY_MARKER;
6357+
6358+ // advance the first scanline
6359+ split_info->ring_buffer_first_scanline++;
6360+ if ( ++split_info->ring_buffer_begin_index == stbir_info->ring_buffer_num_entries )
6361+ split_info->ring_buffer_begin_index = 0;
6362+}
6363+
6364+static void stbir__horizontal_resample_and_encode_first_scanline_from_scatter(stbir__info const * stbir_info, stbir__per_split_info* split_info)
6365+{
6366+ // evict a scanline out into the output buffer
6367+
6368+ float* ring_buffer_entry = stbir__get_ring_buffer_entry(stbir_info, split_info, split_info->ring_buffer_begin_index );
6369+
6370+ // Now resample it into the buffer.
6371+ stbir__resample_horizontal_gather( stbir_info, split_info->vertical_buffer, ring_buffer_entry STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
6372+
6373+ // dump the scanline out
6374+ stbir__encode_scanline( stbir_info, ( (char *)stbir_info->output_data ) + ( (size_t)split_info->ring_buffer_first_scanline * (size_t)stbir_info->output_stride_bytes ), split_info->vertical_buffer, split_info->ring_buffer_first_scanline STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
6375+
6376+ // mark it as empty
6377+ ring_buffer_entry[ 0 ] = STBIR__FLOAT_EMPTY_MARKER;
6378+
6379+ // advance the first scanline
6380+ split_info->ring_buffer_first_scanline++;
6381+ if ( ++split_info->ring_buffer_begin_index == stbir_info->ring_buffer_num_entries )
6382+ split_info->ring_buffer_begin_index = 0;
6383+}
6384+
6385+static void stbir__resample_vertical_scatter(stbir__info const * stbir_info, stbir__per_split_info* split_info, int n0, int n1, float const * vertical_coefficients, float const * vertical_buffer, float const * vertical_buffer_end )
6386+{
6387+ STBIR_ASSERT( !stbir_info->vertical.is_gather );
6388+
6389+ STBIR_PROFILE_START( vertical );
6390+ {
6391+ int k = 0, total = n1 - n0 + 1;
6392+ STBIR_ASSERT( total > 0 );
6393+ do {
6394+ float * outputs[8];
6395+ int i, n = total; if ( n > 8 ) n = 8;
6396+ for( i = 0 ; i < n ; i++ )
6397+ {
6398+ outputs[ i ] = stbir__get_ring_buffer_scanline(stbir_info, split_info, k+i+n0 );
6399+ if ( ( i ) && ( STBIR__FLOAT_BUFFER_IS_EMPTY( outputs[i] ) != STBIR__FLOAT_BUFFER_IS_EMPTY( outputs[0] ) ) ) // make sure runs are of the same type
6400+ {
6401+ n = i;
6402+ break;
6403+ }
6404+ }
6405+ // call the scatter to N scanlines at a time function (up to 8 scanlines of scattering at once)
6406+ ((STBIR__FLOAT_BUFFER_IS_EMPTY( outputs[0] ))?stbir__vertical_scatter_sets:stbir__vertical_scatter_blends)[n-1]( outputs, vertical_coefficients + k, vertical_buffer, vertical_buffer_end );
6407+ k += n;
6408+ total -= n;
6409+ } while ( total );
6410+ }
6411+
6412+ STBIR_PROFILE_END( vertical );
6413+}
6414+
6415+typedef void stbir__handle_scanline_for_scatter_func(stbir__info const * stbir_info, stbir__per_split_info* split_info);
6416+
6417+static void stbir__vertical_scatter_loop( stbir__info const * stbir_info, stbir__per_split_info* split_info, int split_count )
6418+{
6419+ int y, start_output_y, end_output_y, start_input_y, end_input_y;
6420+ stbir__contributors* vertical_contributors = stbir_info->vertical.contributors;
6421+ float const * vertical_coefficients = stbir_info->vertical.coefficients;
6422+ stbir__handle_scanline_for_scatter_func * handle_scanline_for_scatter;
6423+ void * scanline_scatter_buffer;
6424+ void * scanline_scatter_buffer_end;
6425+ int on_first_input_y, last_input_y;
6426+ int width = (stbir_info->vertical_first) ? ( stbir_info->scanline_extents.conservative.n1-stbir_info->scanline_extents.conservative.n0+1 ) : stbir_info->horizontal.scale_info.output_sub_size;
6427+ int width_times_channels = stbir_info->effective_channels * width;
6428+
6429+ STBIR_ASSERT( !stbir_info->vertical.is_gather );
6430+
6431+ start_output_y = split_info->start_output_y;
6432+ end_output_y = split_info[split_count-1].end_output_y; // may do multiple split counts
6433+
6434+ start_input_y = split_info->start_input_y;
6435+ end_input_y = split_info[split_count-1].end_input_y;
6436+
6437+ // adjust for starting offset start_input_y
6438+ y = start_input_y + stbir_info->vertical.filter_pixel_margin;
6439+ vertical_contributors += y ;
6440+ vertical_coefficients += stbir_info->vertical.coefficient_width * y;
6441+
6442+ if ( stbir_info->vertical_first )
6443+ {
6444+ handle_scanline_for_scatter = stbir__horizontal_resample_and_encode_first_scanline_from_scatter;
6445+ scanline_scatter_buffer = split_info->decode_buffer;
6446+ scanline_scatter_buffer_end = ( (char*) scanline_scatter_buffer ) + sizeof( float ) * stbir_info->effective_channels * (stbir_info->scanline_extents.conservative.n1-stbir_info->scanline_extents.conservative.n0+1);
6447+ }
6448+ else
6449+ {
6450+ handle_scanline_for_scatter = stbir__encode_first_scanline_from_scatter;
6451+ scanline_scatter_buffer = split_info->vertical_buffer;
6452+ scanline_scatter_buffer_end = ( (char*) scanline_scatter_buffer ) + sizeof( float ) * stbir_info->effective_channels * stbir_info->horizontal.scale_info.output_sub_size;
6453+ }
6454+
6455+ // initialize the ring buffer for scattering
6456+ split_info->ring_buffer_first_scanline = start_output_y;
6457+ split_info->ring_buffer_last_scanline = -1;
6458+ split_info->ring_buffer_begin_index = -1;
6459+
6460+ // mark all the buffers as empty to start
6461+ for( y = 0 ; y < stbir_info->ring_buffer_num_entries ; y++ )
6462+ {
6463+ float * decode_buffer = stbir__get_ring_buffer_entry( stbir_info, split_info, y );
6464+ decode_buffer[ width_times_channels ] = 0.0f; // clear two over for horizontals with a remnant of 3
6465+ decode_buffer[ width_times_channels+1 ] = 0.0f;
6466+ decode_buffer[0] = STBIR__FLOAT_EMPTY_MARKER; // only used on scatter
6467+ }
6468+
6469+ // do the loop in input space
6470+ on_first_input_y = 1; last_input_y = start_input_y;
6471+ for (y = start_input_y ; y < end_input_y; y++)
6472+ {
6473+ int out_first_scanline, out_last_scanline;
6474+
6475+ out_first_scanline = vertical_contributors->n0;
6476+ out_last_scanline = vertical_contributors->n1;
6477+
6478+ STBIR_ASSERT(out_last_scanline - out_first_scanline + 1 <= stbir_info->ring_buffer_num_entries);
6479+
6480+ if ( ( out_last_scanline >= out_first_scanline ) && ( ( ( out_first_scanline >= start_output_y ) && ( out_first_scanline < end_output_y ) ) || ( ( out_last_scanline >= start_output_y ) && ( out_last_scanline < end_output_y ) ) ) )
6481+ {
6482+ float const * vc = vertical_coefficients;
6483+
6484+ // keep track of the range actually seen for the next resize
6485+ last_input_y = y;
6486+ if ( ( on_first_input_y ) && ( y > start_input_y ) )
6487+ split_info->start_input_y = y;
6488+ on_first_input_y = 0;
6489+
6490+ // clip the region
6491+ if ( out_first_scanline < start_output_y )
6492+ {
6493+ vc += start_output_y - out_first_scanline;
6494+ out_first_scanline = start_output_y;
6495+ }
6496+
6497+ if ( out_last_scanline >= end_output_y )
6498+ out_last_scanline = end_output_y - 1;
6499+
6500+ // if very first scanline, init the index
6501+ if (split_info->ring_buffer_begin_index < 0)
6502+ split_info->ring_buffer_begin_index = out_first_scanline - start_output_y;
6503+
6504+ STBIR_ASSERT( split_info->ring_buffer_begin_index <= out_first_scanline );
6505+
6506+ // Decode the nth scanline from the source image into the decode buffer.
6507+ stbir__decode_scanline( stbir_info, y, split_info->decode_buffer STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
6508+
6509+ // When horizontal first, we resample horizontally into the vertical buffer before we scatter it out
6510+ if ( !stbir_info->vertical_first )
6511+ stbir__resample_horizontal_gather( stbir_info, split_info->vertical_buffer, split_info->decode_buffer STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
6512+
6513+ // Now it's sitting in the buffer ready to be distributed into the ring buffers.
6514+
6515+ // evict from the ringbuffer, if we need are full
6516+ if ( ( ( split_info->ring_buffer_last_scanline - split_info->ring_buffer_first_scanline + 1 ) == stbir_info->ring_buffer_num_entries ) &&
6517+ ( out_last_scanline > split_info->ring_buffer_last_scanline ) )
6518+ handle_scanline_for_scatter( stbir_info, split_info );
6519+
6520+ // Now the horizontal buffer is ready to write to all ring buffer rows, so do it.
6521+ stbir__resample_vertical_scatter(stbir_info, split_info, out_first_scanline, out_last_scanline, vc, (float*)scanline_scatter_buffer, (float*)scanline_scatter_buffer_end );
6522+
6523+ // update the end of the buffer
6524+ if ( out_last_scanline > split_info->ring_buffer_last_scanline )
6525+ split_info->ring_buffer_last_scanline = out_last_scanline;
6526+ }
6527+ ++vertical_contributors;
6528+ vertical_coefficients += stbir_info->vertical.coefficient_width;
6529+ }
6530+
6531+ // now evict the scanlines that are left over in the ring buffer
6532+ while ( split_info->ring_buffer_first_scanline < end_output_y )
6533+ handle_scanline_for_scatter(stbir_info, split_info);
6534+
6535+ // update the end_input_y if we do multiple resizes with the same data
6536+ ++last_input_y;
6537+ for( y = 0 ; y < split_count; y++ )
6538+ if ( split_info[y].end_input_y > last_input_y )
6539+ split_info[y].end_input_y = last_input_y;
6540+}
6541+
6542+
6543+static stbir__kernel_callback * stbir__builtin_kernels[] = { 0, stbir__filter_trapezoid, stbir__filter_triangle, stbir__filter_cubic, stbir__filter_catmullrom, stbir__filter_mitchell, stbir__filter_point };
6544+static stbir__support_callback * stbir__builtin_supports[] = { 0, stbir__support_trapezoid, stbir__support_one, stbir__support_two, stbir__support_two, stbir__support_two, stbir__support_zeropoint5 };
6545+
6546+static void stbir__set_sampler(stbir__sampler * samp, stbir_filter filter, stbir__kernel_callback * kernel, stbir__support_callback * support, stbir_edge edge, stbir__scale_info * scale_info, int always_gather, void * user_data )
6547+{
6548+ // set filter
6549+ if (filter == 0)
6550+ {
6551+ filter = STBIR_DEFAULT_FILTER_DOWNSAMPLE; // default to downsample
6552+ if (scale_info->scale >= ( 1.0f - stbir__small_float ) )
6553+ {
6554+ if ( (scale_info->scale <= ( 1.0f + stbir__small_float ) ) && ( STBIR_CEILF(scale_info->pixel_shift) == scale_info->pixel_shift ) )
6555+ filter = STBIR_FILTER_POINT_SAMPLE;
6556+ else
6557+ filter = STBIR_DEFAULT_FILTER_UPSAMPLE;
6558+ }
6559+ }
6560+ samp->filter_enum = filter;
6561+
6562+ STBIR_ASSERT(samp->filter_enum != 0);
6563+ STBIR_ASSERT((unsigned)samp->filter_enum < STBIR_FILTER_OTHER);
6564+ samp->filter_kernel = stbir__builtin_kernels[ filter ];
6565+ samp->filter_support = stbir__builtin_supports[ filter ];
6566+
6567+ if ( kernel && support )
6568+ {
6569+ samp->filter_kernel = kernel;
6570+ samp->filter_support = support;
6571+ samp->filter_enum = STBIR_FILTER_OTHER;
6572+ }
6573+
6574+ samp->edge = edge;
6575+ samp->filter_pixel_width = stbir__get_filter_pixel_width (samp->filter_support, scale_info->scale, user_data );
6576+ // Gather is always better, but in extreme downsamples, you have to most or all of the data in memory
6577+ // For horizontal, we always have all the pixels, so we always use gather here (always_gather==1).
6578+ // For vertical, we use gather if scaling up (which means we will have samp->filter_pixel_width
6579+ // scanlines in memory at once).
6580+ samp->is_gather = 0;
6581+ if ( scale_info->scale >= ( 1.0f - stbir__small_float ) )
6582+ samp->is_gather = 1;
6583+ else if ( ( always_gather ) || ( samp->filter_pixel_width <= STBIR_FORCE_GATHER_FILTER_SCANLINES_AMOUNT ) )
6584+ samp->is_gather = 2;
6585+
6586+ // pre calculate stuff based on the above
6587+ samp->coefficient_width = stbir__get_coefficient_width(samp, samp->is_gather, user_data);
6588+
6589+ // filter_pixel_width is the conservative size in pixels of input that affect an output pixel.
6590+ // In rare cases (only with 2 pix to 1 pix with the default filters), it's possible that the
6591+ // filter will extend before or after the scanline beyond just one extra entire copy of the
6592+ // scanline (we would hit the edge twice). We don't let you do that, so we clamp the total
6593+ // width to 3x the total of input pixel (once for the scanline, once for the left side
6594+ // overhang, and once for the right side). We only do this for edge mode, since the other
6595+ // modes can just re-edge clamp back in again.
6596+ if ( edge == STBIR_EDGE_WRAP )
6597+ if ( samp->filter_pixel_width > ( scale_info->input_full_size * 3 ) )
6598+ samp->filter_pixel_width = scale_info->input_full_size * 3;
6599+
6600+ // This is how much to expand buffers to account for filters seeking outside
6601+ // the image boundaries.
6602+ samp->filter_pixel_margin = samp->filter_pixel_width / 2;
6603+
6604+ // filter_pixel_margin is the amount that this filter can overhang on just one side of either
6605+ // end of the scanline (left or the right). Since we only allow you to overhang 1 scanline's
6606+ // worth of pixels, we clamp this one side of overhang to the input scanline size. Again,
6607+ // this clamping only happens in rare cases with the default filters (2 pix to 1 pix).
6608+ if ( edge == STBIR_EDGE_WRAP )
6609+ if ( samp->filter_pixel_margin > scale_info->input_full_size )
6610+ samp->filter_pixel_margin = scale_info->input_full_size;
6611+
6612+ samp->num_contributors = stbir__get_contributors(samp, samp->is_gather);
6613+
6614+ samp->contributors_size = samp->num_contributors * sizeof(stbir__contributors);
6615+ samp->coefficients_size = samp->num_contributors * samp->coefficient_width * sizeof(float) + sizeof(float)*STBIR_INPUT_CALLBACK_PADDING; // extra sizeof(float) is padding
6616+
6617+ samp->gather_prescatter_contributors = 0;
6618+ samp->gather_prescatter_coefficients = 0;
6619+ if ( samp->is_gather == 0 )
6620+ {
6621+ samp->gather_prescatter_coefficient_width = samp->filter_pixel_width;
6622+ samp->gather_prescatter_num_contributors = stbir__get_contributors(samp, 2);
6623+ samp->gather_prescatter_contributors_size = samp->gather_prescatter_num_contributors * sizeof(stbir__contributors);
6624+ samp->gather_prescatter_coefficients_size = samp->gather_prescatter_num_contributors * samp->gather_prescatter_coefficient_width * sizeof(float);
6625+ }
6626+}
6627+
6628+static void stbir__get_conservative_extents( stbir__sampler * samp, stbir__contributors * range, void * user_data )
6629+{
6630+ float scale = samp->scale_info.scale;
6631+ float out_shift = samp->scale_info.pixel_shift;
6632+ stbir__support_callback * support = samp->filter_support;
6633+ int input_full_size = samp->scale_info.input_full_size;
6634+ stbir_edge edge = samp->edge;
6635+ float inv_scale = samp->scale_info.inv_scale;
6636+
6637+ STBIR_ASSERT( samp->is_gather != 0 );
6638+
6639+ if ( samp->is_gather == 1 )
6640+ {
6641+ int in_first_pixel, in_last_pixel;
6642+ float out_filter_radius = support(inv_scale, user_data) * scale;
6643+
6644+ stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, 0.5, out_filter_radius, inv_scale, out_shift, input_full_size, edge );
6645+ range->n0 = in_first_pixel;
6646+ stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, ( (float)(samp->scale_info.output_sub_size-1) ) + 0.5f, out_filter_radius, inv_scale, out_shift, input_full_size, edge );
6647+ range->n1 = in_last_pixel;
6648+ }
6649+ else if ( samp->is_gather == 2 ) // downsample gather, refine
6650+ {
6651+ float in_pixels_radius = support(scale, user_data) * inv_scale;
6652+ int filter_pixel_margin = samp->filter_pixel_margin;
6653+ int output_sub_size = samp->scale_info.output_sub_size;
6654+ int input_end;
6655+ int n;
6656+ int in_first_pixel, in_last_pixel;
6657+
6658+ // get a conservative area of the input range
6659+ stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, 0, 0, inv_scale, out_shift, input_full_size, edge );
6660+ range->n0 = in_first_pixel;
6661+ stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, (float)output_sub_size, 0, inv_scale, out_shift, input_full_size, edge );
6662+ range->n1 = in_last_pixel;
6663+
6664+ // now go through the margin to the start of area to find bottom
6665+ n = range->n0 + 1;
6666+ input_end = -filter_pixel_margin;
6667+ while( n >= input_end )
6668+ {
6669+ int out_first_pixel, out_last_pixel;
6670+ stbir__calculate_out_pixel_range( &out_first_pixel, &out_last_pixel, ((float)n)+0.5f, in_pixels_radius, scale, out_shift, output_sub_size );
6671+ if ( out_first_pixel > out_last_pixel )
6672+ break;
6673+
6674+ if ( ( out_first_pixel < output_sub_size ) || ( out_last_pixel >= 0 ) )
6675+ range->n0 = n;
6676+ --n;
6677+ }
6678+
6679+ // now go through the end of the area through the margin to find top
6680+ n = range->n1 - 1;
6681+ input_end = n + 1 + filter_pixel_margin;
6682+ while( n <= input_end )
6683+ {
6684+ int out_first_pixel, out_last_pixel;
6685+ stbir__calculate_out_pixel_range( &out_first_pixel, &out_last_pixel, ((float)n)+0.5f, in_pixels_radius, scale, out_shift, output_sub_size );
6686+ if ( out_first_pixel > out_last_pixel )
6687+ break;
6688+ if ( ( out_first_pixel < output_sub_size ) || ( out_last_pixel >= 0 ) )
6689+ range->n1 = n;
6690+ ++n;
6691+ }
6692+ }
6693+
6694+ if ( samp->edge == STBIR_EDGE_WRAP )
6695+ {
6696+ // if we are wrapping, and we are very close to the image size (so the edges might merge), just use the scanline up to the edge
6697+ if ( ( range->n0 > 0 ) && ( range->n1 >= input_full_size ) )
6698+ {
6699+ int marg = range->n1 - input_full_size + 1;
6700+ if ( ( marg + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= range->n0 )
6701+ range->n0 = 0;
6702+ }
6703+ if ( ( range->n0 < 0 ) && ( range->n1 < (input_full_size-1) ) )
6704+ {
6705+ int marg = -range->n0;
6706+ if ( ( input_full_size - marg - STBIR__MERGE_RUNS_PIXEL_THRESHOLD - 1 ) <= range->n1 )
6707+ range->n1 = input_full_size - 1;
6708+ }
6709+ }
6710+ else
6711+ {
6712+ // for non-edge-wrap modes, we never read over the edge, so clamp
6713+ if ( range->n0 < 0 )
6714+ range->n0 = 0;
6715+ if ( range->n1 >= input_full_size )
6716+ range->n1 = input_full_size - 1;
6717+ }
6718+}
6719+
6720+static void stbir__get_split_info( stbir__per_split_info* split_info, int splits, int output_height, int vertical_pixel_margin, int input_full_height, int is_gather, stbir__contributors * contribs )
6721+{
6722+ int i, cur;
6723+ int left = output_height;
6724+
6725+ cur = 0;
6726+ for( i = 0 ; i < splits ; i++ )
6727+ {
6728+ int each;
6729+
6730+ split_info[i].start_output_y = cur;
6731+ each = left / ( splits - i );
6732+ split_info[i].end_output_y = cur + each;
6733+
6734+ // ok, when we are gathering, we need to make sure we are starting on a y offset that doesn't have
6735+ // a "special" set of coefficients. Basically, with exactly the right filter at exactly the right
6736+ // resize at exactly the right phase, some of the coefficents can be zero. When they are zero, we
6737+ // don't process them at all. But this leads to a tricky thing with the thread splits, where we
6738+ // might have a set of two coeffs like this for example: (4,4) and (3,6). The 4,4 means there was
6739+ // just one single coeff because things worked out perfectly (normally, they all have 4 coeffs
6740+ // like the range 3,6. The problem is that if we start right on the (4,4) on a brand new thread,
6741+ // then when we get to (3,6), we don't have the "3" sample in memory (because we didn't load
6742+ // it on the initial (4,4) range because it didn't have a 3 (we only add new samples that are
6743+ // larger than our existing samples - it's just how the eviction works). So, our solution here
6744+ // is pretty simple, if we start right on a range that has samples that start earlier, then we
6745+ // simply bump up our previous thread split range to include it, and then start this threads
6746+ // range with the smaller sample. It just moves one scanline from one thread split to another,
6747+ // so that we end with the unusual one, instead of start with it. To do this, we check 2-4
6748+ // sample at each thread split start and then occassionally move them.
6749+
6750+ if ( ( is_gather ) && ( i ) )
6751+ {
6752+ stbir__contributors * small_contribs;
6753+ int j, smallest, stop, start_n0;
6754+ stbir__contributors * split_contribs = contribs + cur;
6755+
6756+ // scan for a max of 3x the filter width or until the next thread split
6757+ stop = vertical_pixel_margin * 3;
6758+ if ( each < stop )
6759+ stop = each;
6760+
6761+ // loops a few times before early out
6762+ smallest = 0;
6763+ small_contribs = split_contribs;
6764+ start_n0 = small_contribs->n0;
6765+ for( j = 1 ; j <= stop ; j++ )
6766+ {
6767+ ++split_contribs;
6768+ if ( split_contribs->n0 > start_n0 )
6769+ break;
6770+ if ( split_contribs->n0 < small_contribs->n0 )
6771+ {
6772+ small_contribs = split_contribs;
6773+ smallest = j;
6774+ }
6775+ }
6776+
6777+ split_info[i-1].end_output_y += smallest;
6778+ split_info[i].start_output_y += smallest;
6779+ }
6780+
6781+ cur += each;
6782+ left -= each;
6783+
6784+ // scatter range (updated to minimum as you run it)
6785+ split_info[i].start_input_y = -vertical_pixel_margin;
6786+ split_info[i].end_input_y = input_full_height + vertical_pixel_margin;
6787+ }
6788+}
6789+
6790+static void stbir__free_internal_mem( stbir__info *info )
6791+{
6792+ #define STBIR__FREE_AND_CLEAR( ptr ) { if ( ptr ) { void * p = (ptr); (ptr) = 0; STBIR_FREE( p, info->user_data); } }
6793+
6794+ if ( info )
6795+ {
6796+ #ifndef STBIR__SEPARATE_ALLOCATIONS
6797+ STBIR__FREE_AND_CLEAR( info->alloced_mem );
6798+ #else
6799+ int i,j;
6800+
6801+ if ( ( info->vertical.gather_prescatter_contributors ) && ( (void*)info->vertical.gather_prescatter_contributors != (void*)info->split_info[0].decode_buffer ) )
6802+ {
6803+ STBIR__FREE_AND_CLEAR( info->vertical.gather_prescatter_coefficients );
6804+ STBIR__FREE_AND_CLEAR( info->vertical.gather_prescatter_contributors );
6805+ }
6806+ for( i = 0 ; i < info->splits ; i++ )
6807+ {
6808+ for( j = 0 ; j < info->alloc_ring_buffer_num_entries ; j++ )
6809+ {
6810+ #ifdef STBIR_SIMD8
6811+ if ( info->effective_channels == 3 )
6812+ --info->split_info[i].ring_buffers[j]; // avx in 3 channel mode needs one float at the start of the buffer
6813+ #endif
6814+ STBIR__FREE_AND_CLEAR( info->split_info[i].ring_buffers[j] );
6815+ }
6816+
6817+ #ifdef STBIR_SIMD8
6818+ if ( info->effective_channels == 3 )
6819+ --info->split_info[i].decode_buffer; // avx in 3 channel mode needs one float at the start of the buffer
6820+ #endif
6821+ STBIR__FREE_AND_CLEAR( info->split_info[i].decode_buffer );
6822+ STBIR__FREE_AND_CLEAR( info->split_info[i].ring_buffers );
6823+ STBIR__FREE_AND_CLEAR( info->split_info[i].vertical_buffer );
6824+ }
6825+ STBIR__FREE_AND_CLEAR( info->split_info );
6826+ if ( info->vertical.coefficients != info->horizontal.coefficients )
6827+ {
6828+ STBIR__FREE_AND_CLEAR( info->vertical.coefficients );
6829+ STBIR__FREE_AND_CLEAR( info->vertical.contributors );
6830+ }
6831+ STBIR__FREE_AND_CLEAR( info->horizontal.coefficients );
6832+ STBIR__FREE_AND_CLEAR( info->horizontal.contributors );
6833+ STBIR__FREE_AND_CLEAR( info->alloced_mem );
6834+ STBIR_FREE( info, info->user_data );
6835+ #endif
6836+ }
6837+
6838+ #undef STBIR__FREE_AND_CLEAR
6839+}
6840+
// Returns the largest number of rows any single split receives when `height`
// rows are handed out to `splits` splits: each split in turn takes
// (rows still unassigned) / (splits still to fill), rounded down.
static int stbir__get_max_split( int splits, int height )
{
  int largest = 0;
  int splits_left;

  for( splits_left = splits ; splits_left > 0 ; splits_left-- )
  {
    int portion = height / splits_left;
    if ( largest < portion )
      largest = portion;
    height -= portion;
  }

  return largest;
}
6855+
6856+static stbir__horizontal_gather_channels_func ** stbir__horizontal_gather_n_coeffs_funcs[8] =
6857+{
6858+ 0, stbir__horizontal_gather_1_channels_with_n_coeffs_funcs, stbir__horizontal_gather_2_channels_with_n_coeffs_funcs, stbir__horizontal_gather_3_channels_with_n_coeffs_funcs, stbir__horizontal_gather_4_channels_with_n_coeffs_funcs, 0,0, stbir__horizontal_gather_7_channels_with_n_coeffs_funcs
6859+};
6860+
6861+static stbir__horizontal_gather_channels_func ** stbir__horizontal_gather_channels_funcs[8] =
6862+{
6863+ 0, stbir__horizontal_gather_1_channels_funcs, stbir__horizontal_gather_2_channels_funcs, stbir__horizontal_gather_3_channels_funcs, stbir__horizontal_gather_4_channels_funcs, 0,0, stbir__horizontal_gather_7_channels_funcs
6864+};
6865+
// Resize classifications, as assigned by stbir__should_do_vertical_first. Each one selects a row of weights:
//   0 == vertical scale <= 1x, scatter
//   1 == vertical scale <= 1x, gather
//   2 == vertical scale <= 2x
//   3 == vertical scale <= 3x
//   4 == not assigned by stbir__should_do_vertical_first
//   5 == vertical scale <= 4x
//   6 == vertical scale > 4x, or an output dimension <= 4 with output height < output width
//   7 == an output dimension <= 4 otherwise
#define STBIR_RESIZE_CLASSIFICATIONS 8

// Cost weights, indexed [channel group][classification][weight].
// Channel groups: 0=1chan, 1=2chan, 2=3chan, 3=4chan, 4=7chan.
// Weights 0 and 1 scale the two terms of the horizontal-first cost, weights 2 and 3 the two terms of the vertical-first cost.
static float stbir__compute_weights[5][STBIR_RESIZE_CLASSIFICATIONS][4]=
{
  // 1 channel
  {
    { 1.00000f, 1.00000f, 0.31250f, 1.00000f },  // 0
    { 0.56250f, 0.59375f, 0.00000f, 0.96875f },  // 1
    { 1.00000f, 0.06250f, 0.00000f, 1.00000f },  // 2
    { 0.00000f, 0.09375f, 1.00000f, 1.00000f },  // 3
    { 1.00000f, 1.00000f, 1.00000f, 1.00000f },  // 4
    { 0.03125f, 0.12500f, 1.00000f, 1.00000f },  // 5
    { 0.06250f, 0.12500f, 0.00000f, 1.00000f },  // 6
    { 0.00000f, 1.00000f, 0.00000f, 0.03125f },  // 7
  },
  // 2 channels
  {
    { 0.00000f, 0.84375f, 0.00000f, 0.03125f },  // 0
    { 0.09375f, 0.93750f, 0.00000f, 0.78125f },  // 1
    { 0.87500f, 0.21875f, 0.00000f, 0.96875f },  // 2
    { 0.09375f, 0.09375f, 1.00000f, 1.00000f },  // 3
    { 1.00000f, 1.00000f, 1.00000f, 1.00000f },  // 4
    { 0.03125f, 0.12500f, 1.00000f, 1.00000f },  // 5
    { 0.06250f, 0.12500f, 0.00000f, 1.00000f },  // 6
    { 0.00000f, 1.00000f, 0.00000f, 0.53125f },  // 7
  },
  // 3 channels
  {
    { 0.00000f, 0.53125f, 0.00000f, 0.03125f },  // 0
    { 0.06250f, 0.96875f, 0.00000f, 0.53125f },  // 1
    { 0.87500f, 0.18750f, 0.00000f, 0.93750f },  // 2
    { 0.00000f, 0.09375f, 1.00000f, 1.00000f },  // 3
    { 1.00000f, 1.00000f, 1.00000f, 1.00000f },  // 4
    { 0.03125f, 0.12500f, 1.00000f, 1.00000f },  // 5
    { 0.06250f, 0.12500f, 0.00000f, 1.00000f },  // 6
    { 0.00000f, 1.00000f, 0.00000f, 0.56250f },  // 7
  },
  // 4 channels
  {
    { 0.00000f, 0.50000f, 0.00000f, 0.71875f },  // 0
    { 0.06250f, 0.84375f, 0.00000f, 0.87500f },  // 1
    { 1.00000f, 0.50000f, 0.50000f, 0.96875f },  // 2
    { 1.00000f, 0.09375f, 0.31250f, 0.50000f },  // 3
    { 1.00000f, 1.00000f, 1.00000f, 1.00000f },  // 4
    { 1.00000f, 0.03125f, 0.03125f, 0.53125f },  // 5
    { 0.18750f, 0.12500f, 0.00000f, 1.00000f },  // 6
    { 0.00000f, 1.00000f, 0.03125f, 0.18750f },  // 7
  },
  // 7 channels
  {
    { 0.00000f, 0.59375f, 0.00000f, 0.96875f },  // 0
    { 0.06250f, 0.81250f, 0.06250f, 0.59375f },  // 1
    { 0.75000f, 0.43750f, 0.12500f, 0.96875f },  // 2
    { 0.87500f, 0.06250f, 0.18750f, 0.43750f },  // 3
    { 1.00000f, 1.00000f, 1.00000f, 1.00000f },  // 4
    { 0.15625f, 0.12500f, 1.00000f, 1.00000f },  // 5
    { 0.06250f, 0.12500f, 0.00000f, 1.00000f },  // 6
    { 0.00000f, 1.00000f, 0.03125f, 0.34375f },  // 7
  }
};
6918+
// Record used to inspect and override the vertical-first decision while training the cost weights.
typedef struct STBIR__V_FIRST_INFO
{
  double v_cost;                // estimated cost of running the vertical pass first
  double h_cost;                // estimated cost of running the horizontal pass first
  int control_v_first;          // 0 = no control, 1 = force hori, 2 = force vert
  int v_first;                  // decision made from the two costs
  int v_resize_classification;  // classification that selected the weights
  int is_gather;                // is_gather value the decision was made with
} STBIR__V_FIRST_INFO;

// STBIR__V_FIRST_INFO_POINTER is null unless the build names a buffer with STBIR__V_FIRST_INFO_BUFFER.
#ifndef STBIR__V_FIRST_INFO_BUFFER
#define STBIR__V_FIRST_INFO_POINTER 0
#else
static STBIR__V_FIRST_INFO STBIR__V_FIRST_INFO_BUFFER = {0};
#define STBIR__V_FIRST_INFO_POINTER &STBIR__V_FIRST_INFO_BUFFER
#endif
6935+
6936+// Figure out whether to scale along the horizontal or vertical first.
6937+// This only *super* important when you are scaling by a massively
6938+// different amount in the vertical vs the horizontal (for example, if
6939+// you are scaling by 2x in the width, and 0.5x in the height, then you
6940+// want to do the vertical scale first, because it's around 3x faster
6941+// in that order.
6942+//
6943+// In more normal circumstances, this makes a 20-40% differences, so
6944+// it's good to get right, but not critical. The normal way that you
6945+// decide which direction goes first is just figuring out which
6946+// direction does more multiplies. But with modern CPUs with their
6947+// fancy caches and SIMD and high IPC abilities, so there's just a lot
6948+// more that goes into it.
6949+//
6950+// My handwavy sort of solution is to have an app that does a whole
6951+// bunch of timing for both vertical and horizontal first modes,
6952+// and then another app that can read lots of these timing files
6953+// and try to search for the best weights to use. Dotimings.c
6954+// is the app that does a bunch of timings, and vf_train.c is the
6955+// app that solves for the best weights (and shows how well it
6956+// does currently).
6957+
6958+static int stbir__should_do_vertical_first( float weights_table[STBIR_RESIZE_CLASSIFICATIONS][4], int horizontal_filter_pixel_width, float horizontal_scale, int horizontal_output_size, int vertical_filter_pixel_width, float vertical_scale, int vertical_output_size, int is_gather, STBIR__V_FIRST_INFO * info )
6959+{
6960+ double v_cost, h_cost;
6961+ float * weights;
6962+ int vertical_first;
6963+ int v_classification;
6964+
6965+ // categorize the resize into buckets
6966+ if ( ( vertical_output_size <= 4 ) || ( horizontal_output_size <= 4 ) )
6967+ v_classification = ( vertical_output_size < horizontal_output_size ) ? 6 : 7;
6968+ else if ( vertical_scale <= 1.0f )
6969+ v_classification = ( is_gather ) ? 1 : 0;
6970+ else if ( vertical_scale <= 2.0f)
6971+ v_classification = 2;
6972+ else if ( vertical_scale <= 3.0f)
6973+ v_classification = 3;
6974+ else if ( vertical_scale <= 4.0f)
6975+ v_classification = 5;
6976+ else
6977+ v_classification = 6;
6978+
6979+ // use the right weights
6980+ weights = weights_table[ v_classification ];
6981+
6982+ // this is the costs when you don't take into account modern CPUs with high ipc and simd and caches - wish we had a better estimate
6983+ h_cost = (float)horizontal_filter_pixel_width * weights[0] + horizontal_scale * (float)vertical_filter_pixel_width * weights[1];
6984+ v_cost = (float)vertical_filter_pixel_width * weights[2] + vertical_scale * (float)horizontal_filter_pixel_width * weights[3];
6985+
6986+ // use computation estimate to decide vertical first or not
6987+ vertical_first = ( v_cost <= h_cost ) ? 1 : 0;
6988+
6989+ // save these, if requested
6990+ if ( info )
6991+ {
6992+ info->h_cost = h_cost;
6993+ info->v_cost = v_cost;
6994+ info->v_resize_classification = v_classification;
6995+ info->v_first = vertical_first;
6996+ info->is_gather = is_gather;
6997+ }
6998+
6999+ // and this allows us to override everything for testing (see dotiming.c)
7000+ if ( ( info ) && ( info->control_v_first ) )
7001+ vertical_first = ( info->control_v_first == 2 ) ? 1 : 0;
7002+
7003+ return vertical_first;
7004+}
7005+
// Channel count for each internal pixel layout.
// The entry order must match stbir_internal_pixel_layout.
static unsigned char stbir__pixel_channels[] = {
  1,  // 1ch
  2,  // 2ch
  3,  // rgb
  3,  // bgr
  4,  // 4ch

  4,  // RGBA
  4,  // BGRA
  4,  // ARGB
  4,  // ABGR
  2,  // RA
  2,  // AR

  4,  // RGBA_PM
  4,  // BGRA_PM
  4,  // ARGB_PM
  4,  // ABGR_PM
  2,  // RA_PM
  2,  // AR_PM
};
7012+
7013+// the internal pixel layout enums are in a different order, so we can easily do range comparisons of types
7014+// the public pixel layout is ordered in a way that if you cast num_channels (1-4) to the enum, you get something sensible
7015+static stbir_internal_pixel_layout stbir__pixel_layout_convert_public_to_internal[] = {
7016+ STBIRI_BGR, STBIRI_1CHANNEL, STBIRI_2CHANNEL, STBIRI_RGB, STBIRI_RGBA,
7017+ STBIRI_4CHANNEL, STBIRI_BGRA, STBIRI_ARGB, STBIRI_ABGR, STBIRI_RA, STBIRI_AR,
7018+ STBIRI_RGBA_PM, STBIRI_BGRA_PM, STBIRI_ARGB_PM, STBIRI_ABGR_PM, STBIRI_RA_PM, STBIRI_AR_PM,
7019+};
7020+
7021+static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sampler * horizontal, stbir__sampler * vertical, stbir__contributors * conservative, stbir_pixel_layout input_pixel_layout_public, stbir_pixel_layout output_pixel_layout_public, int splits, int new_x, int new_y, int fast_alpha, void * user_data STBIR_ONLY_PROFILE_BUILD_GET_INFO )
7022+{
7023+ static char stbir_channel_count_index[8]={ 9,0,1,2, 3,9,9,4 };
7024+
7025+ stbir__info * info = 0;
7026+ void * alloced = 0;
7027+ size_t alloced_total = 0;
7028+ int vertical_first;
7029+ size_t decode_buffer_size, ring_buffer_length_bytes, ring_buffer_size, vertical_buffer_size;
7030+ int alloc_ring_buffer_num_entries;
7031+
7032+ int alpha_weighting_type = 0; // 0=none, 1=simple, 2=fancy
7033+ int conservative_split_output_size = stbir__get_max_split( splits, vertical->scale_info.output_sub_size );
7034+ stbir_internal_pixel_layout input_pixel_layout = stbir__pixel_layout_convert_public_to_internal[ input_pixel_layout_public ];
7035+ stbir_internal_pixel_layout output_pixel_layout = stbir__pixel_layout_convert_public_to_internal[ output_pixel_layout_public ];
7036+ int channels = stbir__pixel_channels[ input_pixel_layout ];
7037+ int effective_channels = channels;
7038+
7039+ // first figure out what type of alpha weighting to use (if any)
7040+ if ( ( horizontal->filter_enum != STBIR_FILTER_POINT_SAMPLE ) || ( vertical->filter_enum != STBIR_FILTER_POINT_SAMPLE ) ) // no alpha weighting on point sampling
7041+ {
7042+ if ( ( input_pixel_layout >= STBIRI_RGBA ) && ( input_pixel_layout <= STBIRI_AR ) && ( output_pixel_layout >= STBIRI_RGBA ) && ( output_pixel_layout <= STBIRI_AR ) )
7043+ {
7044+ if ( fast_alpha )
7045+ {
7046+ alpha_weighting_type = 4;
7047+ }
7048+ else
7049+ {
7050+ static int fancy_alpha_effective_cnts[6] = { 7, 7, 7, 7, 3, 3 };
7051+ alpha_weighting_type = 2;
7052+ effective_channels = fancy_alpha_effective_cnts[ input_pixel_layout - STBIRI_RGBA ];
7053+ }
7054+ }
7055+ else if ( ( input_pixel_layout >= STBIRI_RGBA_PM ) && ( input_pixel_layout <= STBIRI_AR_PM ) && ( output_pixel_layout >= STBIRI_RGBA ) && ( output_pixel_layout <= STBIRI_AR ) )
7056+ {
7057+ // input premult, output non-premult
7058+ alpha_weighting_type = 3;
7059+ }
7060+ else if ( ( input_pixel_layout >= STBIRI_RGBA ) && ( input_pixel_layout <= STBIRI_AR ) && ( output_pixel_layout >= STBIRI_RGBA_PM ) && ( output_pixel_layout <= STBIRI_AR_PM ) )
7061+ {
7062+ // input non-premult, output premult
7063+ alpha_weighting_type = 1;
7064+ }
7065+ }
7066+
7067+ // channel in and out count must match currently
7068+ if ( channels != stbir__pixel_channels[ output_pixel_layout ] )
7069+ return 0;
7070+
7071+ // get vertical first
7072+ vertical_first = stbir__should_do_vertical_first( stbir__compute_weights[ (int)stbir_channel_count_index[ effective_channels ] ], horizontal->filter_pixel_width, horizontal->scale_info.scale, horizontal->scale_info.output_sub_size, vertical->filter_pixel_width, vertical->scale_info.scale, vertical->scale_info.output_sub_size, vertical->is_gather, STBIR__V_FIRST_INFO_POINTER );
7073+
7074+ // sometimes read one float off in some of the unrolled loops (with a weight of zero coeff, so it doesn't have an effect)
7075+ // we use a few extra floats instead of just 1, so that input callback buffer can overlap with the decode buffer without
7076+ // the conversion routines overwriting the callback input data.
7077+ decode_buffer_size = ( conservative->n1 - conservative->n0 + 1 ) * effective_channels * sizeof(float) + sizeof(float)*STBIR_INPUT_CALLBACK_PADDING; // extra floats for input callback stagger
7078+
7079+#if defined( STBIR__SEPARATE_ALLOCATIONS ) && defined(STBIR_SIMD8)
7080+ if ( effective_channels == 3 )
7081+ decode_buffer_size += sizeof(float); // avx in 3 channel mode needs one float at the start of the buffer (only with separate allocations)
7082+#endif
7083+
7084+ ring_buffer_length_bytes = (size_t)horizontal->scale_info.output_sub_size * (size_t)effective_channels * sizeof(float) + sizeof(float)*STBIR_INPUT_CALLBACK_PADDING; // extra floats for padding
7085+
7086+ // if we do vertical first, the ring buffer holds a whole decoded line
7087+ if ( vertical_first )
7088+ ring_buffer_length_bytes = ( decode_buffer_size + 15 ) & ~15;
7089+
7090+ if ( ( ring_buffer_length_bytes & 4095 ) == 0 ) ring_buffer_length_bytes += 64*3; // avoid 4k alias
7091+
7092+ // One extra entry because floating point precision problems sometimes cause an extra to be necessary.
7093+ alloc_ring_buffer_num_entries = vertical->filter_pixel_width + 1;
7094+
7095+ // we never need more ring buffer entries than the scanlines we're outputting when in scatter mode
7096+ if ( ( !vertical->is_gather ) && ( alloc_ring_buffer_num_entries > conservative_split_output_size ) )
7097+ alloc_ring_buffer_num_entries = conservative_split_output_size;
7098+
7099+ ring_buffer_size = (size_t)alloc_ring_buffer_num_entries * (size_t)ring_buffer_length_bytes;
7100+
7101+ // The vertical buffer is used differently, depending on whether we are scattering
7102+ // the vertical scanlines, or gathering them.
7103+ // If scattering, it's used at the temp buffer to accumulate each output.
7104+ // If gathering, it's just the output buffer.
7105+ vertical_buffer_size = (size_t)horizontal->scale_info.output_sub_size * (size_t)effective_channels * sizeof(float) + sizeof(float); // extra float for padding
7106+
7107+ // we make two passes through this loop, 1st to add everything up, 2nd to allocate and init
7108+ for(;;)
7109+ {
7110+ int i;
7111+ void * advance_mem = alloced;
7112+ int copy_horizontal = 0;
7113+ stbir__sampler * possibly_use_horizontal_for_pivot = 0;
7114+
7115+#ifdef STBIR__SEPARATE_ALLOCATIONS
7116+ #define STBIR__NEXT_PTR( ptr, size, ntype ) if ( alloced ) { void * p = STBIR_MALLOC( size, user_data); if ( p == 0 ) { stbir__free_internal_mem( info ); return 0; } (ptr) = (ntype*)p; }
7117+#else
7118+ #define STBIR__NEXT_PTR( ptr, size, ntype ) advance_mem = (void*) ( ( ((size_t)advance_mem) + 15 ) & ~15 ); if ( alloced ) ptr = (ntype*)advance_mem; advance_mem = (char*)(((size_t)advance_mem) + (size));
7119+#endif
7120+
7121+ STBIR__NEXT_PTR( info, sizeof( stbir__info ), stbir__info );
7122+
7123+ STBIR__NEXT_PTR( info->split_info, sizeof( stbir__per_split_info ) * splits, stbir__per_split_info );
7124+
7125+ if ( info )
7126+ {
7127+ static stbir__alpha_weight_func * fancy_alpha_weights[6] = { stbir__fancy_alpha_weight_4ch, stbir__fancy_alpha_weight_4ch, stbir__fancy_alpha_weight_4ch, stbir__fancy_alpha_weight_4ch, stbir__fancy_alpha_weight_2ch, stbir__fancy_alpha_weight_2ch };
7128+ static stbir__alpha_unweight_func * fancy_alpha_unweights[6] = { stbir__fancy_alpha_unweight_4ch, stbir__fancy_alpha_unweight_4ch, stbir__fancy_alpha_unweight_4ch, stbir__fancy_alpha_unweight_4ch, stbir__fancy_alpha_unweight_2ch, stbir__fancy_alpha_unweight_2ch };
7129+ static stbir__alpha_weight_func * simple_alpha_weights[6] = { stbir__simple_alpha_weight_4ch, stbir__simple_alpha_weight_4ch, stbir__simple_alpha_weight_4ch, stbir__simple_alpha_weight_4ch, stbir__simple_alpha_weight_2ch, stbir__simple_alpha_weight_2ch };
7130+ static stbir__alpha_unweight_func * simple_alpha_unweights[6] = { stbir__simple_alpha_unweight_4ch, stbir__simple_alpha_unweight_4ch, stbir__simple_alpha_unweight_4ch, stbir__simple_alpha_unweight_4ch, stbir__simple_alpha_unweight_2ch, stbir__simple_alpha_unweight_2ch };
7131+
7132+ // initialize info fields
7133+ info->alloced_mem = alloced;
7134+ info->alloced_total = alloced_total;
7135+
7136+ info->channels = channels;
7137+ info->effective_channels = effective_channels;
7138+
7139+ info->offset_x = new_x;
7140+ info->offset_y = new_y;
7141+ info->alloc_ring_buffer_num_entries = (int)alloc_ring_buffer_num_entries;
7142+ info->ring_buffer_num_entries = 0;
7143+ info->ring_buffer_length_bytes = (int)ring_buffer_length_bytes;
7144+ info->splits = splits;
7145+ info->vertical_first = vertical_first;
7146+
7147+ info->input_pixel_layout_internal = input_pixel_layout;
7148+ info->output_pixel_layout_internal = output_pixel_layout;
7149+
7150+ // setup alpha weight functions
7151+ info->alpha_weight = 0;
7152+ info->alpha_unweight = 0;
7153+
7154+ // handle alpha weighting functions and overrides
7155+ if ( alpha_weighting_type == 2 )
7156+ {
7157+ // high quality alpha multiplying on the way in, dividing on the way out
7158+ info->alpha_weight = fancy_alpha_weights[ input_pixel_layout - STBIRI_RGBA ];
7159+ info->alpha_unweight = fancy_alpha_unweights[ output_pixel_layout - STBIRI_RGBA ];
7160+ }
7161+ else if ( alpha_weighting_type == 4 )
7162+ {
7163+ // fast alpha multiplying on the way in, dividing on the way out
7164+ info->alpha_weight = simple_alpha_weights[ input_pixel_layout - STBIRI_RGBA ];
7165+ info->alpha_unweight = simple_alpha_unweights[ output_pixel_layout - STBIRI_RGBA ];
7166+ }
7167+ else if ( alpha_weighting_type == 1 )
7168+ {
7169+ // fast alpha on the way in, leave in premultiplied form on way out
7170+ info->alpha_weight = simple_alpha_weights[ input_pixel_layout - STBIRI_RGBA ];
7171+ }
7172+ else if ( alpha_weighting_type == 3 )
7173+ {
7174+ // incoming is premultiplied, fast alpha dividing on the way out - non-premultiplied output
7175+ info->alpha_unweight = simple_alpha_unweights[ output_pixel_layout - STBIRI_RGBA ];
7176+ }
7177+
7178+ // handle 3-chan color flipping, using the alpha weight path
7179+ if ( ( ( input_pixel_layout == STBIRI_RGB ) && ( output_pixel_layout == STBIRI_BGR ) ) ||
7180+ ( ( input_pixel_layout == STBIRI_BGR ) && ( output_pixel_layout == STBIRI_RGB ) ) )
7181+ {
7182+ // do the flipping on the smaller of the two ends
7183+ if ( horizontal->scale_info.scale < 1.0f )
7184+ info->alpha_unweight = stbir__simple_flip_3ch;
7185+ else
7186+ info->alpha_weight = stbir__simple_flip_3ch;
7187+ }
7188+
7189+ }
7190+
7191+ // get all the per-split buffers
7192+ for( i = 0 ; i < splits ; i++ )
7193+ {
7194+ STBIR__NEXT_PTR( info->split_info[i].decode_buffer, decode_buffer_size, float );
7195+
7196+#ifdef STBIR__SEPARATE_ALLOCATIONS
7197+
7198+ #ifdef STBIR_SIMD8
7199+ if ( ( info ) && ( effective_channels == 3 ) )
7200+ ++info->split_info[i].decode_buffer; // avx in 3 channel mode needs one float at the start of the buffer
7201+ #endif
7202+
7203+ STBIR__NEXT_PTR( info->split_info[i].ring_buffers, alloc_ring_buffer_num_entries * sizeof(float*), float* );
7204+ {
7205+ int j;
7206+ for( j = 0 ; j < alloc_ring_buffer_num_entries ; j++ )
7207+ {
7208+ STBIR__NEXT_PTR( info->split_info[i].ring_buffers[j], ring_buffer_length_bytes, float );
7209+ #ifdef STBIR_SIMD8
7210+ if ( ( info ) && ( effective_channels == 3 ) )
7211+ ++info->split_info[i].ring_buffers[j]; // avx in 3 channel mode needs one float at the start of the buffer
7212+ #endif
7213+ }
7214+ }
7215+#else
7216+ STBIR__NEXT_PTR( info->split_info[i].ring_buffer, ring_buffer_size, float );
7217+#endif
7218+ STBIR__NEXT_PTR( info->split_info[i].vertical_buffer, vertical_buffer_size, float );
7219+ }
7220+
7221+ // alloc memory for to-be-pivoted coeffs (if necessary)
7222+ if ( vertical->is_gather == 0 )
7223+ {
7224+ size_t both;
7225+ size_t temp_mem_amt;
7226+
7227+ // when in vertical scatter mode, we first build the coefficients in gather mode, and then pivot after,
7228+ // that means we need two buffers, so we try to use the decode buffer and ring buffer for this. if that
7229+ // is too small, we just allocate extra memory to use as this temp.
7230+
7231+ both = (size_t)vertical->gather_prescatter_contributors_size + (size_t)vertical->gather_prescatter_coefficients_size;
7232+
7233+#ifdef STBIR__SEPARATE_ALLOCATIONS
7234+ temp_mem_amt = decode_buffer_size;
7235+
7236+ #ifdef STBIR_SIMD8
7237+ if ( effective_channels == 3 )
7238+ --temp_mem_amt; // avx in 3 channel mode needs one float at the start of the buffer
7239+ #endif
7240+#else
7241+ temp_mem_amt = (size_t)( decode_buffer_size + ring_buffer_size + vertical_buffer_size ) * (size_t)splits;
7242+#endif
7243+ if ( temp_mem_amt >= both )
7244+ {
7245+ if ( info )
7246+ {
7247+ vertical->gather_prescatter_contributors = (stbir__contributors*)info->split_info[0].decode_buffer;
7248+ vertical->gather_prescatter_coefficients = (float*) ( ( (char*)info->split_info[0].decode_buffer ) + vertical->gather_prescatter_contributors_size );
7249+ }
7250+ }
7251+ else
7252+ {
7253+ // ring+decode memory is too small, so allocate temp memory
7254+ STBIR__NEXT_PTR( vertical->gather_prescatter_contributors, vertical->gather_prescatter_contributors_size, stbir__contributors );
7255+ STBIR__NEXT_PTR( vertical->gather_prescatter_coefficients, vertical->gather_prescatter_coefficients_size, float );
7256+ }
7257+ }
7258+
7259+ STBIR__NEXT_PTR( horizontal->contributors, horizontal->contributors_size, stbir__contributors );
7260+ STBIR__NEXT_PTR( horizontal->coefficients, horizontal->coefficients_size, float );
7261+
7262+ // are the two filters identical?? (happens a lot with mipmap generation)
7263+ if ( ( horizontal->filter_kernel == vertical->filter_kernel ) && ( horizontal->filter_support == vertical->filter_support ) && ( horizontal->edge == vertical->edge ) && ( horizontal->scale_info.output_sub_size == vertical->scale_info.output_sub_size ) )
7264+ {
7265+ float diff_scale = horizontal->scale_info.scale - vertical->scale_info.scale;
7266+ float diff_shift = horizontal->scale_info.pixel_shift - vertical->scale_info.pixel_shift;
7267+ if ( diff_scale < 0.0f ) diff_scale = -diff_scale;
7268+ if ( diff_shift < 0.0f ) diff_shift = -diff_shift;
7269+ if ( ( diff_scale <= stbir__small_float ) && ( diff_shift <= stbir__small_float ) )
7270+ {
7271+ if ( horizontal->is_gather == vertical->is_gather )
7272+ {
7273+ copy_horizontal = 1;
7274+ goto no_vert_alloc;
7275+ }
7276+ // everything matches, but vertical is scatter, horizontal is gather, use horizontal coeffs for vertical pivot coeffs
7277+ possibly_use_horizontal_for_pivot = horizontal;
7278+ }
7279+ }
7280+
7281+ STBIR__NEXT_PTR( vertical->contributors, vertical->contributors_size, stbir__contributors );
7282+ STBIR__NEXT_PTR( vertical->coefficients, vertical->coefficients_size, float );
7283+
7284+ no_vert_alloc:
7285+
7286+ if ( info )
7287+ {
7288+ STBIR_PROFILE_BUILD_START( horizontal );
7289+
7290+ stbir__calculate_filters( horizontal, 0, user_data STBIR_ONLY_PROFILE_BUILD_SET_INFO );
7291+
7292+ // setup the horizontal gather functions
7293+ // start with defaulting to the n_coeffs functions (specialized on channels and remnant leftover)
7294+ info->horizontal_gather_channels = stbir__horizontal_gather_n_coeffs_funcs[ effective_channels ][ horizontal->extent_info.widest & 3 ];
7295+ // but if the number of coeffs <= 12, use another set of special cases. <=12 coeffs is any enlarging resize, or shrinking resize down to about 1/3 size
7296+ if ( horizontal->extent_info.widest <= 12 )
7297+ info->horizontal_gather_channels = stbir__horizontal_gather_channels_funcs[ effective_channels ][ horizontal->extent_info.widest - 1 ];
7298+
7299+ info->scanline_extents.conservative.n0 = conservative->n0;
7300+ info->scanline_extents.conservative.n1 = conservative->n1;
7301+
7302+ // get exact extents
7303+ stbir__get_extents( horizontal, &info->scanline_extents );
7304+
7305+ // pack the horizontal coeffs
7306+ horizontal->coefficient_width = stbir__pack_coefficients(horizontal->num_contributors, horizontal->contributors, horizontal->coefficients, horizontal->coefficient_width, horizontal->extent_info.widest, info->scanline_extents.conservative.n0, info->scanline_extents.conservative.n1 );
7307+
7308+ STBIR_MEMCPY( &info->horizontal, horizontal, sizeof( stbir__sampler ) );
7309+
7310+ STBIR_PROFILE_BUILD_END( horizontal );
7311+
7312+ if ( copy_horizontal )
7313+ {
7314+ STBIR_MEMCPY( &info->vertical, horizontal, sizeof( stbir__sampler ) );
7315+ }
7316+ else
7317+ {
7318+ STBIR_PROFILE_BUILD_START( vertical );
7319+
7320+ stbir__calculate_filters( vertical, possibly_use_horizontal_for_pivot, user_data STBIR_ONLY_PROFILE_BUILD_SET_INFO );
7321+ STBIR_MEMCPY( &info->vertical, vertical, sizeof( stbir__sampler ) );
7322+
7323+ STBIR_PROFILE_BUILD_END( vertical );
7324+ }
7325+
7326+ // setup the vertical split ranges
7327+ stbir__get_split_info( info->split_info, info->splits, info->vertical.scale_info.output_sub_size, info->vertical.filter_pixel_margin, info->vertical.scale_info.input_full_size, info->vertical.is_gather, info->vertical.contributors );
7328+
7329+ // now we know precisely how many entries we need
7330+ info->ring_buffer_num_entries = info->vertical.extent_info.widest;
7331+
7332+ // we never need more ring buffer entries than the scanlines we're outputting
7333+ if ( ( !info->vertical.is_gather ) && ( info->ring_buffer_num_entries > conservative_split_output_size ) )
7334+ info->ring_buffer_num_entries = conservative_split_output_size;
7335+ STBIR_ASSERT( info->ring_buffer_num_entries <= info->alloc_ring_buffer_num_entries );
7336+ }
7337+ #undef STBIR__NEXT_PTR
7338+
7339+
7340+ // is this the first time through loop?
7341+ if ( info == 0 )
7342+ {
7343+ alloced_total = ( 15 + (size_t)advance_mem );
7344+ alloced = STBIR_MALLOC( alloced_total, user_data );
7345+ if ( alloced == 0 )
7346+ return 0;
7347+ }
7348+ else
7349+ return info; // success
7350+ }
7351+}
7352+
7353+static int stbir__perform_resize( stbir__info const * info, int split_start, int split_count )
7354+{
7355+ stbir__per_split_info * split_info = info->split_info + split_start;
7356+
7357+ STBIR_PROFILE_CLEAR_EXTRAS();
7358+
7359+ STBIR_PROFILE_FIRST_START( looping );
7360+ if (info->vertical.is_gather)
7361+ stbir__vertical_gather_loop( info, split_info, split_count );
7362+ else
7363+ stbir__vertical_scatter_loop( info, split_info, split_count );
7364+ STBIR_PROFILE_END( looping );
7365+
7366+ return 1;
7367+}
7368+
7369+static void stbir__update_info_from_resize( stbir__info * info, STBIR_RESIZE * resize )
7370+{
7371+ static stbir__decode_pixels_func * decode_simple[STBIR_TYPE_HALF_FLOAT-STBIR_TYPE_UINT8_SRGB+1]=
7372+ {
7373+ /* 1ch-4ch */ stbir__decode_uint8_srgb, stbir__decode_uint8_srgb, 0, stbir__decode_float_linear, stbir__decode_half_float_linear,
7374+ };
7375+
7376+ static stbir__decode_pixels_func * decode_alphas[STBIRI_AR-STBIRI_RGBA+1][STBIR_TYPE_HALF_FLOAT-STBIR_TYPE_UINT8_SRGB+1]=
7377+ {
7378+ { /* RGBA */ stbir__decode_uint8_srgb4_linearalpha, stbir__decode_uint8_srgb, 0, stbir__decode_float_linear, stbir__decode_half_float_linear },
7379+ { /* BGRA */ stbir__decode_uint8_srgb4_linearalpha_BGRA, stbir__decode_uint8_srgb_BGRA, 0, stbir__decode_float_linear_BGRA, stbir__decode_half_float_linear_BGRA },
7380+ { /* ARGB */ stbir__decode_uint8_srgb4_linearalpha_ARGB, stbir__decode_uint8_srgb_ARGB, 0, stbir__decode_float_linear_ARGB, stbir__decode_half_float_linear_ARGB },
7381+ { /* ABGR */ stbir__decode_uint8_srgb4_linearalpha_ABGR, stbir__decode_uint8_srgb_ABGR, 0, stbir__decode_float_linear_ABGR, stbir__decode_half_float_linear_ABGR },
7382+ { /* RA */ stbir__decode_uint8_srgb2_linearalpha, stbir__decode_uint8_srgb, 0, stbir__decode_float_linear, stbir__decode_half_float_linear },
7383+ { /* AR */ stbir__decode_uint8_srgb2_linearalpha_AR, stbir__decode_uint8_srgb_AR, 0, stbir__decode_float_linear_AR, stbir__decode_half_float_linear_AR },
7384+ };
7385+
7386+ static stbir__decode_pixels_func * decode_simple_scaled_or_not[2][2]=
7387+ {
7388+ { stbir__decode_uint8_linear_scaled, stbir__decode_uint8_linear }, { stbir__decode_uint16_linear_scaled, stbir__decode_uint16_linear },
7389+ };
7390+
7391+ static stbir__decode_pixels_func * decode_alphas_scaled_or_not[STBIRI_AR-STBIRI_RGBA+1][2][2]=
7392+ {
7393+ { /* RGBA */ { stbir__decode_uint8_linear_scaled, stbir__decode_uint8_linear }, { stbir__decode_uint16_linear_scaled, stbir__decode_uint16_linear } },
7394+ { /* BGRA */ { stbir__decode_uint8_linear_scaled_BGRA, stbir__decode_uint8_linear_BGRA }, { stbir__decode_uint16_linear_scaled_BGRA, stbir__decode_uint16_linear_BGRA } },
7395+ { /* ARGB */ { stbir__decode_uint8_linear_scaled_ARGB, stbir__decode_uint8_linear_ARGB }, { stbir__decode_uint16_linear_scaled_ARGB, stbir__decode_uint16_linear_ARGB } },
7396+ { /* ABGR */ { stbir__decode_uint8_linear_scaled_ABGR, stbir__decode_uint8_linear_ABGR }, { stbir__decode_uint16_linear_scaled_ABGR, stbir__decode_uint16_linear_ABGR } },
7397+ { /* RA */ { stbir__decode_uint8_linear_scaled, stbir__decode_uint8_linear }, { stbir__decode_uint16_linear_scaled, stbir__decode_uint16_linear } },
7398+ { /* AR */ { stbir__decode_uint8_linear_scaled_AR, stbir__decode_uint8_linear_AR }, { stbir__decode_uint16_linear_scaled_AR, stbir__decode_uint16_linear_AR } }
7399+ };
7400+
7401+ static stbir__encode_pixels_func * encode_simple[STBIR_TYPE_HALF_FLOAT-STBIR_TYPE_UINT8_SRGB+1]=
7402+ {
7403+ /* 1ch-4ch */ stbir__encode_uint8_srgb, stbir__encode_uint8_srgb, 0, stbir__encode_float_linear, stbir__encode_half_float_linear,
7404+ };
7405+
7406+ static stbir__encode_pixels_func * encode_alphas[STBIRI_AR-STBIRI_RGBA+1][STBIR_TYPE_HALF_FLOAT-STBIR_TYPE_UINT8_SRGB+1]=
7407+ {
7408+ { /* RGBA */ stbir__encode_uint8_srgb4_linearalpha, stbir__encode_uint8_srgb, 0, stbir__encode_float_linear, stbir__encode_half_float_linear },
7409+ { /* BGRA */ stbir__encode_uint8_srgb4_linearalpha_BGRA, stbir__encode_uint8_srgb_BGRA, 0, stbir__encode_float_linear_BGRA, stbir__encode_half_float_linear_BGRA },
7410+ { /* ARGB */ stbir__encode_uint8_srgb4_linearalpha_ARGB, stbir__encode_uint8_srgb_ARGB, 0, stbir__encode_float_linear_ARGB, stbir__encode_half_float_linear_ARGB },
7411+ { /* ABGR */ stbir__encode_uint8_srgb4_linearalpha_ABGR, stbir__encode_uint8_srgb_ABGR, 0, stbir__encode_float_linear_ABGR, stbir__encode_half_float_linear_ABGR },
7412+ { /* RA */ stbir__encode_uint8_srgb2_linearalpha, stbir__encode_uint8_srgb, 0, stbir__encode_float_linear, stbir__encode_half_float_linear },
7413+ { /* AR */ stbir__encode_uint8_srgb2_linearalpha_AR, stbir__encode_uint8_srgb_AR, 0, stbir__encode_float_linear_AR, stbir__encode_half_float_linear_AR }
7414+ };
7415+
7416+ static stbir__encode_pixels_func * encode_simple_scaled_or_not[2][2]=
7417+ {
7418+ { stbir__encode_uint8_linear_scaled, stbir__encode_uint8_linear }, { stbir__encode_uint16_linear_scaled, stbir__encode_uint16_linear },
7419+ };
7420+
7421+ static stbir__encode_pixels_func * encode_alphas_scaled_or_not[STBIRI_AR-STBIRI_RGBA+1][2][2]=
7422+ {
7423+ { /* RGBA */ { stbir__encode_uint8_linear_scaled, stbir__encode_uint8_linear }, { stbir__encode_uint16_linear_scaled, stbir__encode_uint16_linear } },
7424+ { /* BGRA */ { stbir__encode_uint8_linear_scaled_BGRA, stbir__encode_uint8_linear_BGRA }, { stbir__encode_uint16_linear_scaled_BGRA, stbir__encode_uint16_linear_BGRA } },
7425+ { /* ARGB */ { stbir__encode_uint8_linear_scaled_ARGB, stbir__encode_uint8_linear_ARGB }, { stbir__encode_uint16_linear_scaled_ARGB, stbir__encode_uint16_linear_ARGB } },
7426+ { /* ABGR */ { stbir__encode_uint8_linear_scaled_ABGR, stbir__encode_uint8_linear_ABGR }, { stbir__encode_uint16_linear_scaled_ABGR, stbir__encode_uint16_linear_ABGR } },
7427+ { /* RA */ { stbir__encode_uint8_linear_scaled, stbir__encode_uint8_linear }, { stbir__encode_uint16_linear_scaled, stbir__encode_uint16_linear } },
7428+ { /* AR */ { stbir__encode_uint8_linear_scaled_AR, stbir__encode_uint8_linear_AR }, { stbir__encode_uint16_linear_scaled_AR, stbir__encode_uint16_linear_AR } }
7429+ };
7430+
7431+ stbir__decode_pixels_func * decode_pixels = 0;
7432+ stbir__encode_pixels_func * encode_pixels = 0;
7433+ stbir_datatype input_type, output_type;
7434+
7435+ input_type = resize->input_data_type;
7436+ output_type = resize->output_data_type;
7437+ info->input_data = resize->input_pixels;
7438+ info->input_stride_bytes = resize->input_stride_in_bytes;
7439+ info->output_stride_bytes = resize->output_stride_in_bytes;
7440+
7441+ // if we're completely point sampling, then we can turn off SRGB
7442+ if ( ( info->horizontal.filter_enum == STBIR_FILTER_POINT_SAMPLE ) && ( info->vertical.filter_enum == STBIR_FILTER_POINT_SAMPLE ) )
7443+ {
7444+ if ( ( ( input_type == STBIR_TYPE_UINT8_SRGB ) || ( input_type == STBIR_TYPE_UINT8_SRGB_ALPHA ) ) &&
7445+ ( ( output_type == STBIR_TYPE_UINT8_SRGB ) || ( output_type == STBIR_TYPE_UINT8_SRGB_ALPHA ) ) )
7446+ {
7447+ input_type = STBIR_TYPE_UINT8;
7448+ output_type = STBIR_TYPE_UINT8;
7449+ }
7450+ }
7451+
7452+ // recalc the output and input strides
7453+ if ( info->input_stride_bytes == 0 )
7454+ info->input_stride_bytes = info->channels * info->horizontal.scale_info.input_full_size * stbir__type_size[input_type];
7455+
7456+ if ( info->output_stride_bytes == 0 )
7457+ info->output_stride_bytes = info->channels * info->horizontal.scale_info.output_sub_size * stbir__type_size[output_type];
7458+
7459+ // calc offset
7460+ info->output_data = ( (char*) resize->output_pixels ) + ( (size_t) info->offset_y * (size_t) resize->output_stride_in_bytes ) + ( info->offset_x * info->channels * stbir__type_size[output_type] );
7461+
7462+ info->in_pixels_cb = resize->input_cb;
7463+ info->user_data = resize->user_data;
7464+ info->out_pixels_cb = resize->output_cb;
7465+
7466+ // setup the input format converters
7467+ if ( ( input_type == STBIR_TYPE_UINT8 ) || ( input_type == STBIR_TYPE_UINT16 ) )
7468+ {
7469+ int non_scaled = 0;
7470+
7471+ // check if we can run unscaled - 0-255.0/0-65535.0 instead of 0-1.0 (which is a tiny bit faster when doing linear 8->8 or 16->16)
7472+ if ( ( !info->alpha_weight ) && ( !info->alpha_unweight ) ) // don't short circuit when alpha weighting (get everything to 0-1.0 as usual)
7473+ if ( ( ( input_type == STBIR_TYPE_UINT8 ) && ( output_type == STBIR_TYPE_UINT8 ) ) || ( ( input_type == STBIR_TYPE_UINT16 ) && ( output_type == STBIR_TYPE_UINT16 ) ) )
7474+ non_scaled = 1;
7475+
7476+ if ( info->input_pixel_layout_internal <= STBIRI_4CHANNEL )
7477+ decode_pixels = decode_simple_scaled_or_not[ input_type == STBIR_TYPE_UINT16 ][ non_scaled ];
7478+ else
7479+ decode_pixels = decode_alphas_scaled_or_not[ ( info->input_pixel_layout_internal - STBIRI_RGBA ) % ( STBIRI_AR-STBIRI_RGBA+1 ) ][ input_type == STBIR_TYPE_UINT16 ][ non_scaled ];
7480+ }
7481+ else
7482+ {
7483+ if ( info->input_pixel_layout_internal <= STBIRI_4CHANNEL )
7484+ decode_pixels = decode_simple[ input_type - STBIR_TYPE_UINT8_SRGB ];
7485+ else
7486+ decode_pixels = decode_alphas[ ( info->input_pixel_layout_internal - STBIRI_RGBA ) % ( STBIRI_AR-STBIRI_RGBA+1 ) ][ input_type - STBIR_TYPE_UINT8_SRGB ];
7487+ }
7488+
7489+ // setup the output format converters
7490+ if ( ( output_type == STBIR_TYPE_UINT8 ) || ( output_type == STBIR_TYPE_UINT16 ) )
7491+ {
7492+ int non_scaled = 0;
7493+
7494+ // check if we can run unscaled - 0-255.0/0-65535.0 instead of 0-1.0 (which is a tiny bit faster when doing linear 8->8 or 16->16)
7495+ if ( ( !info->alpha_weight ) && ( !info->alpha_unweight ) ) // don't short circuit when alpha weighting (get everything to 0-1.0 as usual)
7496+ if ( ( ( input_type == STBIR_TYPE_UINT8 ) && ( output_type == STBIR_TYPE_UINT8 ) ) || ( ( input_type == STBIR_TYPE_UINT16 ) && ( output_type == STBIR_TYPE_UINT16 ) ) )
7497+ non_scaled = 1;
7498+
7499+ if ( info->output_pixel_layout_internal <= STBIRI_4CHANNEL )
7500+ encode_pixels = encode_simple_scaled_or_not[ output_type == STBIR_TYPE_UINT16 ][ non_scaled ];
7501+ else
7502+ encode_pixels = encode_alphas_scaled_or_not[ ( info->output_pixel_layout_internal - STBIRI_RGBA ) % ( STBIRI_AR-STBIRI_RGBA+1 ) ][ output_type == STBIR_TYPE_UINT16 ][ non_scaled ];
7503+ }
7504+ else
7505+ {
7506+ if ( info->output_pixel_layout_internal <= STBIRI_4CHANNEL )
7507+ encode_pixels = encode_simple[ output_type - STBIR_TYPE_UINT8_SRGB ];
7508+ else
7509+ encode_pixels = encode_alphas[ ( info->output_pixel_layout_internal - STBIRI_RGBA ) % ( STBIRI_AR-STBIRI_RGBA+1 ) ][ output_type - STBIR_TYPE_UINT8_SRGB ];
7510+ }
7511+
7512+ info->input_type = input_type;
7513+ info->output_type = output_type;
7514+ info->decode_pixels = decode_pixels;
7515+ info->encode_pixels = encode_pixels;
7516+}
7517+
7518+static void stbir__clip( int * outx, int * outsubw, int outw, double * u0, double * u1 ) // clips the output span (*outx, *outsubw) against 0..outw and adjusts the input range (*u0, *u1) by the same fraction; all four pointers are updated in place
7519+{
7520+ double per, adj;
7521+ int over;
7522+
7523+ // do left/top edge
7524+ if ( *outx < 0 )
7525+ {
7526+ per = ( (double)*outx ) / ( (double)*outsubw ); // is negative - fraction of the span that starts before 0
7527+ adj = per * ( *u1 - *u0 );
7528+ *u0 -= adj; // increases u0
7529+ *outx = 0; // note: *outsubw is not changed by the left/top step
7530+ }
7531+
7532+ // do right/bot edge
7533+ over = outw - ( *outx + *outsubw ); // negative when the span runs past outw (uses *outx as possibly reset above)
7534+ if ( over < 0 )
7535+ {
7536+ per = ( (double)over ) / ( (double)*outsubw ); // is negative
7537+ adj = per * ( *u1 - *u0 ); // uses *u0 as possibly advanced by the left/top step
7538+ *u1 += adj; // decrease u1
7539+ *outsubw = outw - *outx; // span now ends exactly at outw
7540+ }
7541+}
7542+
7543+// converts a double to a rational that has less than one float bit of error (returns 0 if unable to do so); *numer and *denom are written on both the success and the failure path
7544+static int stbir__double_to_rational(double f, stbir_uint32 limit, stbir_uint32 *numer, stbir_uint32 *denom, int limit_denom ) // limit_denom (1) or limit numer (0) - selects which of the two estimates is compared against limit
7545+{
7546+ double err;
7547+ stbir_uint64 top, bot;
7548+ stbir_uint64 numer_last = 0; // previous and current estimates start as 0/1 and 1/0, the usual seeds for the recurrence used below
7549+ stbir_uint64 denom_last = 1;
7550+ stbir_uint64 numer_estimate = 1;
7551+ stbir_uint64 denom_estimate = 0;
7552+
7553+ // scale to past float error range
7554+ top = (stbir_uint64)( f * (double)(1 << 25) ); // f as a fixed point fraction top/bot with 25 fractional bits
7555+ bot = 1 << 25;
7556+
7557+ // keep refining, but usually stops in a few loops - usually 5 for bad cases
7558+ for(;;)
7559+ {
7560+ stbir_uint64 est, temp;
7561+
7562+ // hit limit, break out and do best full range estimate
7563+ if ( ( ( limit_denom ) ? denom_estimate : numer_estimate ) >= limit )
7564+ break;
7565+
7566+ // is the current error less than 1 bit of a float? if so, we're done
7567+ if ( denom_estimate ) // skipped while the denominator is still the initial 0
7568+ {
7569+ err = ( (double)numer_estimate / (double)denom_estimate ) - f;
7570+ if ( err < 0.0 ) err = -err;
7571+ if ( err < ( 1.0 / (double)(1<<24) ) ) // tolerance is 2^-24
7572+ {
7573+ // yup, found it
7574+ *numer = (stbir_uint32) numer_estimate;
7575+ *denom = (stbir_uint32) denom_estimate;
7576+ return 1;
7577+ }
7578+ }
7579+
7580+ // no more refinement bits left? break out and do full range estimate
7581+ if ( bot == 0 )
7582+ break;
7583+
7584+ // gcd the estimate bits (one Euclidean division step on top/bot)
7585+ est = top / bot;
7586+ temp = top % bot;
7587+ top = bot;
7588+ bot = temp;
7589+
7590+ // move remainders (denominator: new = est * current + previous)
7591+ temp = est * denom_estimate + denom_last;
7592+ denom_last = denom_estimate;
7593+ denom_estimate = temp;
7594+
7595+ // move remainders (numerator: same recurrence as the denominator)
7596+ temp = est * numer_estimate + numer_last;
7597+ numer_last = numer_estimate;
7598+ numer_estimate = temp;
7599+ }
7600+
7601+ // we didn't find anything good enough for float, use a full range estimate
7602+ if ( limit_denom )
7603+ {
7604+ numer_estimate= (stbir_uint64)( f * (double)limit + 0.5 ); // denominator pinned to limit, numerator rounded to nearest
7605+ denom_estimate = limit;
7606+ }
7607+ else
7608+ {
7609+ numer_estimate = limit; // numerator pinned to limit, denominator rounded to nearest
7610+ denom_estimate = (stbir_uint64)( ( (double)limit / f ) + 0.5 );
7611+ }
7612+
7613+ *numer = (stbir_uint32) numer_estimate;
7614+ *denom = (stbir_uint32) denom_estimate;
7615+
7616+ err = ( denom_estimate ) ? ( ( (double)(stbir_uint32)numer_estimate / (double)(stbir_uint32)denom_estimate ) - f ) : 1.0; // error is measured on the values as truncated to 32 bits; a zero denominator counts as a failure
7617+ if ( err < 0.0 ) err = -err;
7618+ return ( err < ( 1.0 / (double)(1<<24) ) ) ? 1 : 0;
7619+}
7620+
7621+static int stbir__calculate_region_transform( stbir__scale_info * scale_info, int output_full_range, int * output_offset, int output_sub_range, int input_full_range, double input_s0, double input_s1 ) // fills scale_info for one axis; returns 0 when there is nothing to resample, 1 otherwise; *output_offset may be moved by the clip step
7622+{
7623+ double output_range, input_range, output_s, input_s, ratio, scale;
7624+
7625+ input_s = input_s1 - input_s0;
7626+
7627+ // null area
7628+ if ( ( output_full_range == 0 ) || ( input_full_range == 0 ) ||
7629+ ( output_sub_range == 0 ) || ( input_s <= stbir__small_float ) )
7630+ return 0;
7631+
7632+ // are either of the ranges completely out of bounds?
7633+ if ( ( *output_offset >= output_full_range ) || ( ( *output_offset + output_sub_range ) <= 0 ) || ( input_s0 >= (1.0f-stbir__small_float) ) || ( input_s1 <= stbir__small_float ) )
7634+ return 0;
7635+
7636+ output_range = (double)output_full_range;
7637+ input_range = (double)input_full_range;
7638+
7639+ output_s = ( (double)output_sub_range) / output_range; // fraction of the full output covered by the sub range
7640+
7641+ // figure out the scaling to use
7642+ ratio = output_s / input_s;
7643+
7644+ // save scale before clipping
7645+ scale = ( output_range / input_range ) * ratio;
7646+ scale_info->scale = (float)scale;
7647+ scale_info->inv_scale = (float)( 1.0 / scale );
7648+
7649+ // clip output area to left/right output edges (and adjust input area)
7650+ stbir__clip( output_offset, &output_sub_range, output_full_range, &input_s0, &input_s1 ); // output_sub_range, input_s0 and input_s1 are local copies; only *output_offset is visible to the caller
7651+
7652+ // recalc input area
7653+ input_s = input_s1 - input_s0;
7654+
7655+ // after clipping do we have zero input area?
7656+ if ( input_s <= stbir__small_float )
7657+ return 0; // note: scale and inv_scale have already been written at this point
7658+
7659+ // calculate and store the starting source offsets in output pixel space
7660+ scale_info->pixel_shift = (float) ( input_s0 * ratio * output_range );
7661+
7662+ scale_info->scale_is_rational = stbir__double_to_rational( scale, ( scale <= 1.0 ) ? output_full_range : input_full_range, &scale_info->scale_numerator, &scale_info->scale_denominator, ( scale >= 1.0 ) ); // limit is the output size when scale <= 1, else the input size; the denominator is the limited term when scale >= 1
7663+
7664+ scale_info->input_full_size = input_full_range;
7665+ scale_info->output_sub_size = output_sub_range; // the clipped sub range
7666+
7667+ return 1;
7668+}
7669+
7670+
7671+static void stbir__init_and_set_layout( STBIR_RESIZE * resize, stbir_pixel_layout pixel_layout, stbir_datatype data_type ) // resets every option of resize to its default and applies one layout/datatype to both input and output
7672+{
7673+ resize->input_cb = 0;
7674+ resize->output_cb = 0;
7675+ resize->user_data = resize; // default user data is the STBIR_RESIZE itself
7676+ resize->samplers = 0;
7677+ resize->called_alloc = 0;
7678+ resize->horizontal_filter = STBIR_FILTER_DEFAULT;
7679+ resize->horizontal_filter_kernel = 0; resize->horizontal_filter_support = 0;
7680+ resize->vertical_filter = STBIR_FILTER_DEFAULT;
7681+ resize->vertical_filter_kernel = 0; resize->vertical_filter_support = 0;
7682+ resize->horizontal_edge = STBIR_EDGE_CLAMP;
7683+ resize->vertical_edge = STBIR_EDGE_CLAMP;
7684+ resize->input_s0 = 0; resize->input_t0 = 0; resize->input_s1 = 1; resize->input_t1 = 1; // full input region
7685+ resize->output_subx = 0; resize->output_suby = 0; resize->output_subw = resize->output_w; resize->output_subh = resize->output_h; // full output region - reads output_w/output_h, so those must already be set
7686+ resize->input_data_type = data_type;
7687+ resize->output_data_type = data_type;
7688+ resize->input_pixel_layout_public = pixel_layout;
7689+ resize->output_pixel_layout_public = pixel_layout;
7690+ resize->needs_rebuild = 1;
7691+}
7692+
7693+STBIRDEF void stbir_resize_init( STBIR_RESIZE * resize, // stores the buffers and dimensions in resize, then resets every other option to its default
7694+ const void *input_pixels, int input_w, int input_h, int input_stride_in_bytes, // stride can be zero
7695+ void *output_pixels, int output_w, int output_h, int output_stride_in_bytes, // stride can be zero
7696+ stbir_pixel_layout pixel_layout, stbir_datatype data_type )
7697+{
7698+ resize->input_pixels = input_pixels;
7699+ resize->input_w = input_w;
7700+ resize->input_h = input_h;
7701+ resize->input_stride_in_bytes = input_stride_in_bytes;
7702+ resize->output_pixels = output_pixels;
7703+ resize->output_w = output_w;
7704+ resize->output_h = output_h;
7705+ resize->output_stride_in_bytes = output_stride_in_bytes;
7706+ resize->fast_alpha = 0;
7707+
7708+ stbir__init_and_set_layout( resize, pixel_layout, data_type ); // called last: it reads output_w/output_h for the default output sub-rect
7709+}
7710+
7711+// You can update parameters any time after resize_init
7712+STBIRDEF void stbir_set_datatypes( STBIR_RESIZE * resize, stbir_datatype input_type, stbir_datatype output_type ) // by default, datatype from resize_init
7713+{
7714+ resize->input_data_type = input_type;
7715+ resize->output_data_type = output_type;
7716+ if ( ( resize->samplers ) && ( !resize->needs_rebuild ) ) // samplers exist and are current: refresh them in place (this setter does not set needs_rebuild)
7717+ stbir__update_info_from_resize( resize->samplers, resize );
7718+}
7719+
7720+STBIRDEF void stbir_set_pixel_callbacks( STBIR_RESIZE * resize, stbir_input_callback * input_cb, stbir_output_callback * output_cb ) // no callbacks by default
7721+{
7722+ resize->input_cb = input_cb;
7723+ resize->output_cb = output_cb;
7724+
7725+ if ( ( resize->samplers ) && ( !resize->needs_rebuild ) ) // samplers exist and are current: mirror the callbacks into them directly
7726+ {
7727+ resize->samplers->in_pixels_cb = input_cb;
7728+ resize->samplers->out_pixels_cb = output_cb;
7729+ }
7730+}
7731+
7732+STBIRDEF void stbir_set_user_data( STBIR_RESIZE * resize, void * user_data ) // pass back STBIR_RESIZE* by default
7733+{
7734+ resize->user_data = user_data;
7735+ if ( ( resize->samplers ) && ( !resize->needs_rebuild ) ) // samplers exist and are current: mirror the pointer into them directly
7736+ resize->samplers->user_data = user_data;
7737+}
7738+
7739+STBIRDEF void stbir_set_buffer_ptrs( STBIR_RESIZE * resize, const void * input_pixels, int input_stride_in_bytes, void * output_pixels, int output_stride_in_bytes ) // replaces the input/output buffers and strides without forcing a sampler rebuild
7740+{
7741+ resize->input_pixels = input_pixels;
7742+ resize->input_stride_in_bytes = input_stride_in_bytes;
7743+ resize->output_pixels = output_pixels;
7744+ resize->output_stride_in_bytes = output_stride_in_bytes;
7745+ if ( ( resize->samplers ) && ( !resize->needs_rebuild ) ) // samplers exist and are current: refresh them in place
7746+ stbir__update_info_from_resize( resize->samplers, resize );
7747+}
7748+
7749+
7750+STBIRDEF int stbir_set_edgemodes( STBIR_RESIZE * resize, stbir_edge horizontal_edge, stbir_edge vertical_edge ) // CLAMP by default
7751+{
7752+ resize->horizontal_edge = horizontal_edge;
7753+ resize->vertical_edge = vertical_edge;
7754+ resize->needs_rebuild = 1; // samplers must be rebuilt before the next resize
7755+ return 1; // always 1 - the arguments are not validated here
7756+}
7757+
7758+STBIRDEF int stbir_set_filters( STBIR_RESIZE * resize, stbir_filter horizontal_filter, stbir_filter vertical_filter ) // STBIR_DEFAULT_FILTER_UPSAMPLE/DOWNSAMPLE by default
7759+{
7760+ resize->horizontal_filter = horizontal_filter;
7761+ resize->vertical_filter = vertical_filter;
7762+ resize->needs_rebuild = 1; // samplers must be rebuilt before the next resize
7763+ return 1; // always 1 - the arguments are not validated here
7764+}
7765+
7766+STBIRDEF int stbir_set_filter_callbacks( STBIR_RESIZE * resize, stbir__kernel_callback * horizontal_filter, stbir__support_callback * horizontal_support, stbir__kernel_callback * vertical_filter, stbir__support_callback * vertical_support ) // stores user kernel/support callbacks for each axis
7767+{
7768+ resize->horizontal_filter_kernel = horizontal_filter; resize->horizontal_filter_support = horizontal_support;
7769+ resize->vertical_filter_kernel = vertical_filter; resize->vertical_filter_support = vertical_support;
7770+ resize->needs_rebuild = 1; // samplers must be rebuilt before the next resize
7771+ return 1; // always 1 - the pointers are not validated here
7772+}
7773+
7774+STBIRDEF int stbir_set_pixel_layouts( STBIR_RESIZE * resize, stbir_pixel_layout input_pixel_layout, stbir_pixel_layout output_pixel_layout ) // sets new pixel layouts
7775+{
7776+ resize->input_pixel_layout_public = input_pixel_layout;
7777+ resize->output_pixel_layout_public = output_pixel_layout;
7778+ resize->needs_rebuild = 1; // samplers must be rebuilt before the next resize
7779+ return 1; // always 1 - the layouts are not validated here
7780+}
7781+
7782+
7783+STBIRDEF int stbir_set_non_pm_alpha_speed_over_quality( STBIR_RESIZE * resize, int non_pma_alpha_speed_over_quality ) // sets alpha speed (stored in resize->fast_alpha)
7784+{
7785+ resize->fast_alpha = non_pma_alpha_speed_over_quality;
7786+ resize->needs_rebuild = 1; // samplers must be rebuilt before the next resize
7787+ return 1; // always 1
7788+}
7789+
7790+STBIRDEF int stbir_set_input_subrect( STBIR_RESIZE * resize, double s0, double t0, double s1, double t1 ) // sets input region (full region by default); returns 0 if the region is empty or entirely out of bounds, 1 otherwise
7791+{
7792+ resize->input_s0 = s0; // the values are stored even when 0 is returned below
7793+ resize->input_t0 = t0;
7794+ resize->input_s1 = s1;
7795+ resize->input_t1 = t1;
7796+ resize->needs_rebuild = 1;
7797+
7798+ // are we inbounds?
7799+ if ( ( s1 < stbir__small_float ) || ( (s1-s0) < stbir__small_float ) ||
7800+ ( t1 < stbir__small_float ) || ( (t1-t0) < stbir__small_float ) ||
7801+ ( s0 > (1.0f-stbir__small_float) ) ||
7802+ ( t0 > (1.0f-stbir__small_float) ) )
7803+ return 0;
7804+
7805+ return 1;
7806+}
7807+
7808+STBIRDEF int stbir_set_output_pixel_subrect( STBIR_RESIZE * resize, int subx, int suby, int subw, int subh ) // sets output region in pixels (full region by default); returns 0 if the region is empty or entirely outside the output, 1 otherwise
7809+{
7810+ resize->output_subx = subx; // the values are stored even when 0 is returned below
7811+ resize->output_suby = suby;
7812+ resize->output_subw = subw;
7813+ resize->output_subh = subh;
7814+ resize->needs_rebuild = 1;
7815+
7816+ // are we inbounds?
7817+ if ( ( subx >= resize->output_w ) || ( ( subx + subw ) <= 0 ) || ( suby >= resize->output_h ) || ( ( suby + subh ) <= 0 ) || ( subw == 0 ) || ( subh == 0 ) )
7818+ return 0;
7819+
7820+ return 1;
7821+}
7822+
7823+STBIRDEF int stbir_set_pixel_subrect( STBIR_RESIZE * resize, int subx, int suby, int subw, int subh ) // sets both regions (full regions by default); returns 0 if the rect is empty or entirely outside the output, 1 otherwise
7824+{
7825+ double s0, t0, s1, t1;
7826+
7827+ s0 = ( (double)subx ) / ( (double)resize->output_w ); // the input region is the same rect expressed as a fraction of the output dimensions
7828+ t0 = ( (double)suby ) / ( (double)resize->output_h );
7829+ s1 = ( (double)(subx+subw) ) / ( (double)resize->output_w );
7830+ t1 = ( (double)(suby+subh) ) / ( (double)resize->output_h );
7831+
7832+ resize->input_s0 = s0; // the values are stored even when 0 is returned below
7833+ resize->input_t0 = t0;
7834+ resize->input_s1 = s1;
7835+ resize->input_t1 = t1;
7836+ resize->output_subx = subx;
7837+ resize->output_suby = suby;
7838+ resize->output_subw = subw;
7839+ resize->output_subh = subh;
7840+ resize->needs_rebuild = 1;
7841+
7842+ // are we inbounds?
7843+ if ( ( subx >= resize->output_w ) || ( ( subx + subw ) <= 0 ) || ( suby >= resize->output_h ) || ( ( suby + subh ) <= 0 ) || ( subw == 0 ) || ( subh == 0 ) )
7844+ return 0;
7845+
7846+ return 1;
7847+}
7848+
7849+static int stbir__perform_build( STBIR_RESIZE * resize, int splits ) // builds resize->samplers from the current settings; returns the number of splits actually used, or 0 on any failure (including samplers already being present)
7850+{
7851+ stbir__contributors conservative = { 0, 0 };
7852+ stbir__sampler horizontal, vertical;
7853+ int new_output_subx, new_output_suby;
7854+ stbir__info * out_info;
7855+ #ifdef STBIR_PROFILE
7856+ stbir__info profile_infod; // used to contain building profile info before everything is allocated
7857+ stbir__info * profile_info = &profile_infod;
7858+ #endif
7859+
7860+ // have we already built the samplers?
7861+ if ( resize->samplers )
7862+ return 0;
7863+
7864+ #define STBIR_RETURN_ERROR_AND_ASSERT( exp ) STBIR_ASSERT( !(exp) ); if (exp) return 0;
7865+ STBIR_RETURN_ERROR_AND_ASSERT( (unsigned)resize->horizontal_filter >= STBIR_FILTER_OTHER) // asserts, then returns 0, when the filter enum is out of range
7866+ STBIR_RETURN_ERROR_AND_ASSERT( (unsigned)resize->vertical_filter >= STBIR_FILTER_OTHER)
7867+ #undef STBIR_RETURN_ERROR_AND_ASSERT
7868+
7869+ if ( splits <= 0 )
7870+ return 0;
7871+
7872+ STBIR_PROFILE_BUILD_FIRST_START( build );
7873+
7874+ new_output_subx = resize->output_subx; // working copies: the region transform may move these while clipping
7875+ new_output_suby = resize->output_suby;
7876+
7877+ // do horizontal clip and scale calcs
7878+ if ( !stbir__calculate_region_transform( &horizontal.scale_info, resize->output_w, &new_output_subx, resize->output_subw, resize->input_w, resize->input_s0, resize->input_s1 ) )
7879+ return 0;
7880+
7881+ // do vertical clip and scale calcs
7882+ if ( !stbir__calculate_region_transform( &vertical.scale_info, resize->output_h, &new_output_suby, resize->output_subh, resize->input_h, resize->input_t0, resize->input_t1 ) )
7883+ return 0;
7884+
7885+ // if nothing to do, just return
7886+ if ( ( horizontal.scale_info.output_sub_size == 0 ) || ( vertical.scale_info.output_sub_size == 0 ) )
7887+ return 0;
7888+
7889+ stbir__set_sampler(&horizontal, resize->horizontal_filter, resize->horizontal_filter_kernel, resize->horizontal_filter_support, resize->horizontal_edge, &horizontal.scale_info, 1, resize->user_data );
7890+ stbir__get_conservative_extents( &horizontal, &conservative, resize->user_data );
7891+ stbir__set_sampler(&vertical, resize->vertical_filter, resize->vertical_filter_kernel, resize->vertical_filter_support, resize->vertical_edge, &vertical.scale_info, 0, resize->user_data );
7892+
7893+ if ( ( vertical.scale_info.output_sub_size / splits ) < STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS ) // each split should be a minimum of 4 scanlines (handwavey choice)
7894+ {
7895+ splits = vertical.scale_info.output_sub_size / STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS; // reduce the split count so every split meets the minimum
7896+ if ( splits == 0 ) splits = 1; // but always keep at least one split
7897+ }
7898+
7899+ STBIR_PROFILE_BUILD_START( alloc );
7900+ out_info = stbir__alloc_internal_mem_and_build_samplers( &horizontal, &vertical, &conservative, resize->input_pixel_layout_public, resize->output_pixel_layout_public, splits, new_output_subx, new_output_suby, resize->fast_alpha, resize->user_data STBIR_ONLY_PROFILE_BUILD_SET_INFO );
7901+ STBIR_PROFILE_BUILD_END( alloc );
7902+ STBIR_PROFILE_BUILD_END( build );
7903+
7904+ if ( out_info )
7905+ {
7906+ resize->splits = splits;
7907+ resize->samplers = out_info;
7908+ resize->needs_rebuild = 0;
7909+ #ifdef STBIR_PROFILE
7910+ STBIR_MEMCPY( &out_info->profile, &profile_infod.profile, sizeof( out_info->profile ) );
7911+ #endif
7912+
7913+ // update anything that can be changed without recalcing samplers
7914+ stbir__update_info_from_resize( out_info, resize );
7915+
7916+ return splits;
7917+ }
7918+
7919+ return 0; // out_info was null: resize->samplers and needs_rebuild are left unchanged
7920+}
7921+
7922+void stbir_free_samplers( STBIR_RESIZE * resize ) // frees resize->samplers if present and clears samplers and called_alloc; does nothing otherwise. NOTE(review): unlike the neighbouring public functions this definition has no STBIRDEF prefix - confirm that is intentional
7923+{
7924+ if ( resize->samplers )
7925+ {
7926+ stbir__free_internal_mem( resize->samplers );
7927+ resize->samplers = 0;
7928+ resize->called_alloc = 0;
7929+ }
7930+}
7931+
7932+STBIRDEF int stbir_build_samplers_with_splits( STBIR_RESIZE * resize, int splits ) // (re)builds the samplers when missing or stale and returns the build result (split count, or 0 on failure); returns 1 without building when they are already current
7933+{
7934+ if ( ( resize->samplers == 0 ) || ( resize->needs_rebuild ) )
7935+ {
7936+ if ( resize->samplers )
7937+ stbir_free_samplers( resize ); // stale samplers: drop them first (this also clears called_alloc)
7938+
7939+ resize->called_alloc = 1; // set before the build, so it stays 1 even if the build below returns 0
7940+ return stbir__perform_build( resize, splits );
7941+ }
7942+
7943+ STBIR_PROFILE_BUILD_CLEAR( resize->samplers );
7944+
7945+ return 1;
7946+}
7947+
7948+STBIRDEF int stbir_build_samplers( STBIR_RESIZE * resize ) // same as stbir_build_samplers_with_splits with a single split
7949+{
7950+ return stbir_build_samplers_with_splits( resize, 1 );
7951+}
7952+
7953+STBIRDEF int stbir_resize_extended( STBIR_RESIZE * resize ) // builds samplers if needed, runs the resize over all splits, and frees the samplers again unless called_alloc is set; returns 0 if the build fails, otherwise the stbir__perform_resize result
7954+{
7955+ int result;
7956+
7957+ if ( ( resize->samplers == 0 ) || ( resize->needs_rebuild ) )
7958+ {
7959+ int alloc_state = resize->called_alloc; // remember allocated state
7960+
7961+ if ( resize->samplers )
7962+ {
7963+ stbir__free_internal_mem( resize->samplers ); // freed directly rather than via stbir_free_samplers, which would also clear called_alloc
7964+ resize->samplers = 0;
7965+ }
7966+
7967+ if ( !stbir_build_samplers( resize ) )
7968+ return 0; // NOTE(review): on this return called_alloc is left at the 1 written by stbir_build_samplers_with_splits, not restored to alloc_state - confirm this is intended
7969+
7970+ resize->called_alloc = alloc_state; // undo the called_alloc = 1 done by the build call above
7971+
7972+ // if build_samplers succeeded (above), but there are no samplers set, then
7973+ // the area to stretch into was zero pixels, so don't do anything and return
7974+ // success
7975+ if ( resize->samplers == 0 )
7976+ return 1;
7977+ }
7978+ else
7979+ {
7980+ // didn't build anything - clear it
7981+ STBIR_PROFILE_BUILD_CLEAR( resize->samplers );
7982+ }
7983+
7984+ // do resize
7985+ result = stbir__perform_resize( resize->samplers, 0, resize->splits );
7986+
7987+ // if we alloced, then free (called_alloc == 0 here means the flag was not set before this call)
7988+ if ( !resize->called_alloc )
7989+ {
7990+ stbir_free_samplers( resize );
7991+ resize->samplers = 0; // already cleared by stbir_free_samplers when samplers were present
7992+ }
7993+
7994+ return result;
7995+}
7996+
7997+STBIRDEF int stbir_resize_extended_split( STBIR_RESIZE * resize, int split_start, int split_count ) // resizes only the splits split_start .. split_start+split_count-1; returns 0 on an invalid range or when samplers are missing/stale
7998+{
7999+ STBIR_ASSERT( resize->samplers );
8000+
8001+ // if we're just doing the whole thing, call full (split_start == -1 also selects the full resize)
8002+ if ( ( split_start == -1 ) || ( ( split_start == 0 ) && ( split_count == resize->splits ) ) )
8003+ return stbir_resize_extended( resize );
8004+
8005+ // you **must** build samplers first when using split resize
8006+ if ( ( resize->samplers == 0 ) || ( resize->needs_rebuild ) )
8007+ return 0;
8008+
8009+ if ( ( split_start >= resize->splits ) || ( split_start < 0 ) || ( ( split_start + split_count ) > resize->splits ) || ( split_count <= 0 ) ) // range must lie inside 0..resize->splits and be non-empty
8010+ return 0;
8011+
8012+ // do resize
8013+ return stbir__perform_resize( resize->samplers, split_start, split_count );
8014+}
8015+
8016+
8017+static void * stbir_quick_resize_helper( const void *input_pixels , int input_w , int input_h, int input_stride_in_bytes, // shared body of the simple resize entry points; returns the output buffer (allocated here when output_pixels is 0) or 0 on failure
8018+ void *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
8019+ stbir_pixel_layout pixel_layout, stbir_datatype data_type, stbir_edge edge, stbir_filter filter )
8020+{
8021+ STBIR_RESIZE resize;
8022+ int scanline_output_in_bytes;
8023+ int positive_output_stride_in_bytes;
8024+ void * start_ptr;
8025+ void * free_ptr;
8026+
8027+ scanline_output_in_bytes = output_w * stbir__type_size[ data_type ] * stbir__pixel_channels[ stbir__pixel_layout_convert_public_to_internal[ pixel_layout ] ]; // bytes in one tightly packed output row; data_type and pixel_layout index the tables unchecked
8028+ if ( scanline_output_in_bytes == 0 )
8029+ return 0;
8030+
8031+ // if zero stride, use scanline output
8032+ if ( output_stride_in_bytes == 0 )
8033+ output_stride_in_bytes = scanline_output_in_bytes;
8034+
8035+ // abs value for inverted images (negative pitches)
8036+ positive_output_stride_in_bytes = output_stride_in_bytes;
8037+ if ( positive_output_stride_in_bytes < 0 )
8038+ positive_output_stride_in_bytes = -positive_output_stride_in_bytes;
8039+
8040+ // is the requested stride smaller than the scanline output? if so, just fail
8041+ if ( positive_output_stride_in_bytes < scanline_output_in_bytes )
8042+ return 0;
8043+
8044+ start_ptr = output_pixels;
8045+ free_ptr = 0; // no free pointer, since they passed buffer to use
8046+
8047+ // did they pass a zero for the dest? if so, allocate the buffer
8048+ if ( output_pixels == 0 )
8049+ {
8050+ size_t size;
8051+ char * ptr;
8052+
8053+ size = (size_t)positive_output_stride_in_bytes * (size_t)output_h;
8054+ if ( size == 0 )
8055+ return 0;
8056+
8057+ ptr = (char*) STBIR_MALLOC( size, 0 );
8058+ if ( ptr == 0 )
8059+ return 0;
8060+
8061+ free_ptr = ptr; // remembered so the buffer can be freed on failure and returned on success
8062+
8063+ // point at the last scanline, if they requested a flipped image
8064+ if ( output_stride_in_bytes < 0 )
8065+ start_ptr = ptr + ( (size_t)positive_output_stride_in_bytes * (size_t)( output_h - 1 ) );
8066+ else
8067+ start_ptr = ptr;
8068+ }
8069+
8070+ // ok, now do the resize
8071+ stbir_resize_init( &resize,
8072+ input_pixels, input_w, input_h, input_stride_in_bytes,
8073+ start_ptr, output_w, output_h, output_stride_in_bytes,
8074+ pixel_layout, data_type );
8075+
8076+ resize.horizontal_edge = edge; // the same edge mode and filter are applied to both axes
8077+ resize.vertical_edge = edge;
8078+ resize.horizontal_filter = filter;
8079+ resize.vertical_filter = filter;
8080+
8081+ if ( !stbir_resize_extended( &resize ) )
8082+ {
8083+ if ( free_ptr )
8084+ STBIR_FREE( free_ptr, 0 ); // only a buffer allocated here is released; a caller-supplied buffer is left alone
8085+ return 0;
8086+ }
8087+
8088+ return (free_ptr) ? free_ptr : start_ptr; // for a buffer allocated here this is the allocation base, even when start_ptr points at the last scanline
8089+}
8090+
8091+
8092+
8093+STBIRDEF unsigned char * stbir_resize_uint8_linear( const unsigned char *input_pixels , int input_w , int input_h, int input_stride_in_bytes, // simple resize for STBIR_TYPE_UINT8 data with clamped edges and the default filter
8094+ unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
8095+ stbir_pixel_layout pixel_layout )
8096+{
8097+ return (unsigned char *) stbir_quick_resize_helper( input_pixels , input_w , input_h, input_stride_in_bytes, // returns the helper result: the output buffer, or 0 on failure
8098+ output_pixels, output_w, output_h, output_stride_in_bytes,
8099+ pixel_layout, STBIR_TYPE_UINT8, STBIR_EDGE_CLAMP, STBIR_FILTER_DEFAULT );
8100+}
8101+
8102+STBIRDEF unsigned char * stbir_resize_uint8_srgb( const unsigned char *input_pixels , int input_w , int input_h, int input_stride_in_bytes, // simple resize for STBIR_TYPE_UINT8_SRGB data with clamped edges and the default filter
8103+ unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
8104+ stbir_pixel_layout pixel_layout )
8105+{
8106+ return (unsigned char *) stbir_quick_resize_helper( input_pixels , input_w , input_h, input_stride_in_bytes, // returns the helper result: the output buffer, or 0 on failure
8107+ output_pixels, output_w, output_h, output_stride_in_bytes,
8108+ pixel_layout, STBIR_TYPE_UINT8_SRGB, STBIR_EDGE_CLAMP, STBIR_FILTER_DEFAULT );
8109+}
8110+
8111+
8112+STBIRDEF float * stbir_resize_float_linear( const float *input_pixels , int input_w , int input_h, int input_stride_in_bytes, // simple resize for STBIR_TYPE_FLOAT data with clamped edges and the default filter
8113+ float *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
8114+ stbir_pixel_layout pixel_layout )
8115+{
8116+ return (float *) stbir_quick_resize_helper( input_pixels , input_w , input_h, input_stride_in_bytes, // returns the helper result: the output buffer, or 0 on failure
8117+ output_pixels, output_w, output_h, output_stride_in_bytes,
8118+ pixel_layout, STBIR_TYPE_FLOAT, STBIR_EDGE_CLAMP, STBIR_FILTER_DEFAULT );
8119+}
8120+
8121+
8122+STBIRDEF void * stbir_resize( const void *input_pixels , int input_w , int input_h, int input_stride_in_bytes, // simple resize where the caller chooses the datatype, edge mode and filter (one setting for both axes)
8123+ void *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
8124+ stbir_pixel_layout pixel_layout, stbir_datatype data_type,
8125+ stbir_edge edge, stbir_filter filter )
8126+{
8127+ return (void *) stbir_quick_resize_helper( input_pixels , input_w , input_h, input_stride_in_bytes, // returns the helper result: the output buffer, or 0 on failure
8128+ output_pixels, output_w, output_h, output_stride_in_bytes,
8129+ pixel_layout, data_type, edge, filter );
8130+}
8131+
8132+#ifdef STBIR_PROFILE
8133+
8134+STBIRDEF void stbir_resize_build_profile_info( STBIR_PROFILE_INFO * info, STBIR_RESIZE const * resize ) // copies the build-phase profile counters of resize->samplers into info; resize->samplers is dereferenced without a null check
8135+{
8136+ static char const * bdescriptions[6] = { "Building", "Allocating", "Horizontal sampler", "Vertical sampler", "Coefficient cleanup", "Coefficient piovot" } ; // NOTE(review): piovot looks like a typo for pivot, but it is a runtime string so it is left as is
8137+ stbir__info* samp = resize->samplers;
8138+ int i;
8139+
8140+ typedef int testa[ (STBIR__ARRAY_SIZE( bdescriptions ) == (STBIR__ARRAY_SIZE( samp->profile.array )-1) )?1:-1]; // compile-time checks: a false condition gives a negative array size
8141+ typedef int testb[ (sizeof( samp->profile.array ) == (sizeof(samp->profile.named)) )?1:-1];
8142+ typedef int testc[ (sizeof( info->clocks ) >= (sizeof(samp->profile.named)) )?1:-1];
8143+
8144+ for( i = 0 ; i < STBIR__ARRAY_SIZE( bdescriptions ) ; i++)
8145+ info->clocks[i] = samp->profile.array[i+1]; // array[0] is skipped - presumably the slot that aliases named.total; confirm against the profile union
8146+
8147+ info->total_clocks = samp->profile.named.total;
8148+ info->descriptions = bdescriptions;
8149+ info->count = STBIR__ARRAY_SIZE( bdescriptions );
8150+}
8151+
// Fills *info with per-split profile counters, summed over a range of splits.
//   split_start == -1 selects every split (split_count is then overwritten).
//   Otherwise [split_start, split_start + split_count) must lie inside
//   [0, resize->splits); if it does not, total_clocks, descriptions and count are
//   zeroed and the function returns without touching info->clocks.
// On success clocks[i] is the sum of profile.array[i+1] over the selected splits,
// descriptions points at the static label table and count is 7.
8152+STBIRDEF void stbir_resize_split_profile_info( STBIR_PROFILE_INFO * info, STBIR_RESIZE const * resize, int split_start, int split_count )
8153+{
8154+ static char const * descriptions[7] = { "Looping", "Vertical sampling", "Horizontal sampling", "Scanline input", "Scanline output", "Alpha weighting", "Alpha unweighting" };
8155+ stbir__per_split_info * split_info;
8156+ int s, i;
8157+
// Compile-time checks (array size -1 fails to compile when a condition is false):
// one label per array slot after slot 0, array and named views the same size,
// and info->clocks big enough for the named profile.
8158+ typedef int testa[ (STBIR__ARRAY_SIZE( descriptions ) == (STBIR__ARRAY_SIZE( split_info->profile.array )-1) )?1:-1];
8159+ typedef int testb[ (sizeof( split_info->profile.array ) == (sizeof(split_info->profile.named)) )?1:-1];
8160+ typedef int testc[ (sizeof( info->clocks ) >= (sizeof(split_info->profile.named)) )?1:-1];
8161+
// NOTE(review): the "all splits" count is read from resize->samplers->splits while
// the range check below uses resize->splits -- presumably equal; confirm.
8162+ if ( split_start == -1 )
8163+ {
8164+ split_start = 0;
8165+ split_count = resize->samplers->splits;
8166+ }
8167+
8168+ if ( ( split_start >= resize->splits ) || ( split_start < 0 ) || ( ( split_start + split_count ) > resize->splits ) || ( split_count <= 0 ) )
8169+ {
8170+ info->total_clocks = 0;
8171+ info->descriptions = 0;
8172+ info->count = 0;
8173+ return;
8174+ }
8175+
8176+ split_info = resize->samplers->split_info + split_start;
8177+
8178+ // sum up the profile from all the splits
8179+ for( i = 0 ; i < STBIR__ARRAY_SIZE( descriptions ) ; i++ )
8180+ {
8181+ stbir_uint64 sum = 0;
8182+ for( s = 0 ; s < split_count ; s++ )
8183+ sum += split_info[s].profile.array[i+1];
8184+ info->clocks[i] = sum;
8185+ }
8186+
// NOTE(review): total_clocks is taken from the first selected split only, whereas
// the per-phase clocks above are summed across splits -- confirm this is intended.
8187+ info->total_clocks = split_info->profile.named.total;
8188+ info->descriptions = descriptions;
8189+ info->count = STBIR__ARRAY_SIZE( descriptions );
8190+}
8191+
// Reports the per-split profile summed over every split: calls
// stbir_resize_split_profile_info with split_start = -1 (its "all splits"
// selector); the split_count argument of 0 is overwritten by the callee.
8192+STBIRDEF void stbir_resize_extended_profile_info( STBIR_PROFILE_INFO * info, STBIR_RESIZE const * resize )
8193+{
8194+ stbir_resize_split_profile_info( info, resize, -1, 0 );
8195+}
8196+
8197+#endif // STBIR_PROFILE
8198+
// Remove any macro definitions of the pixel-layout names before the
// implementation section ends. The matching #define lines are outside this
// excerpt; #undef of a name that is not a macro is a no-op.
8199+#undef STBIR_BGR
8200+#undef STBIR_1CHANNEL
8201+#undef STBIR_2CHANNEL
8202+#undef STBIR_RGB
8203+#undef STBIR_RGBA
8204+#undef STBIR_4CHANNEL
8205+#undef STBIR_BGRA
8206+#undef STBIR_ARGB
8207+#undef STBIR_ABGR
8208+#undef STBIR_RA
8209+#undef STBIR_AR
8210+#undef STBIR_RGBA_PM
8211+#undef STBIR_BGRA_PM
8212+#undef STBIR_ARGB_PM
8213+#undef STBIR_ABGR_PM
8214+#undef STBIR_RA_PM
8215+#undef STBIR_AR_PM
8216+
8217+#endif // STB_IMAGE_RESIZE_IMPLEMENTATION
8218+
8219+#else // STB_IMAGE_RESIZE_HORIZONTALS&STB_IMAGE_RESIZE_DO_VERTICALS
8220+
8221+// we reinclude the header file to define all the horizontal functions
8222+// specializing each function for the number of coeffs is 20-40% faster *OVERALL*
8223+
8224+// by including the header file again this way, we can still debug the functions
8225+
// Token-pasting helpers. The *_join2 / *_join24 forms paste their arguments
// directly; the *_join1 / *_join14 forms add one level of indirection so that
// arguments which are themselves macros are expanded before being pasted.
8226+#define STBIR_strs_join2( start, mid, end ) start##mid##end
8227+#define STBIR_strs_join1( start, mid, end ) STBIR_strs_join2( start, mid, end )
8228+
8229+#define STBIR_strs_join24( start, mid1, mid2, end ) start##mid1##mid2##end
8230+#define STBIR_strs_join14( start, mid1, mid2, end ) STBIR_strs_join24( start, mid1, mid2, end )
8231+
// Everything from here on is the pixel coder template, compiled once per
// inclusion with STB_IMAGE_RESIZE_DO_CODERS defined.
8232+#ifdef STB_IMAGE_RESIZE_DO_CODERS
8233+
// STBIR__CODER_NAME(name) yields name_<stbir__decode_suffix> when a suffix is
// defined for this inclusion, or plain name otherwise, so each inclusion emits
// distinctly named functions.
8234+#ifdef stbir__decode_suffix
8235+#define STBIR__CODER_NAME( name ) STBIR_strs_join1( name, _, stbir__decode_suffix )
8236+#else
8237+#define STBIR__CODER_NAME( name ) name
8238+#endif
8239+
// With a swizzle configured, the flip/unflip macros build the name of a SIMD
// shuffle by pasting the four order digits onto stbir__simdf_0123to (the 8-wide
// form pastes the four digits twice onto stbir__simdf8_0123to) and invoke it as
// (reg, reg). Without a swizzle the channel orders default to 0,1,2,3 and the
// flip/unflip macros expand to nothing.
8240+#ifdef stbir__decode_swizzle
8241+#define stbir__decode_simdf8_flip(reg) STBIR_strs_join1( STBIR_strs_join1( STBIR_strs_join1( STBIR_strs_join1( stbir__simdf8_0123to,stbir__decode_order0,stbir__decode_order1),stbir__decode_order2,stbir__decode_order3),stbir__decode_order0,stbir__decode_order1),stbir__decode_order2,stbir__decode_order3)(reg, reg)
8242+#define stbir__decode_simdf4_flip(reg) STBIR_strs_join1( STBIR_strs_join1( stbir__simdf_0123to,stbir__decode_order0,stbir__decode_order1),stbir__decode_order2,stbir__decode_order3)(reg, reg)
8243+#define stbir__encode_simdf8_unflip(reg) STBIR_strs_join1( STBIR_strs_join1( STBIR_strs_join1( STBIR_strs_join1( stbir__simdf8_0123to,stbir__encode_order0,stbir__encode_order1),stbir__encode_order2,stbir__encode_order3),stbir__encode_order0,stbir__encode_order1),stbir__encode_order2,stbir__encode_order3)(reg, reg)
8244+#define stbir__encode_simdf4_unflip(reg) STBIR_strs_join1( STBIR_strs_join1( stbir__simdf_0123to,stbir__encode_order0,stbir__encode_order1),stbir__encode_order2,stbir__encode_order3)(reg, reg)
8245+#else
8246+#define stbir__decode_order0 0
8247+#define stbir__decode_order1 1
8248+#define stbir__decode_order2 2
8249+#define stbir__decode_order3 3
8250+#define stbir__encode_order0 0
8251+#define stbir__encode_order1 1
8252+#define stbir__encode_order2 2
8253+#define stbir__encode_order3 3
8254+#define stbir__decode_simdf8_flip(reg)
8255+#define stbir__decode_simdf4_flip(reg)
8256+#define stbir__encode_simdf8_unflip(reg)
8257+#define stbir__encode_simdf4_unflip(reg)
8258+#endif
8259+
// Width-agnostic alias: picks the 8-wide unflip when STBIR_SIMD8 is defined,
// otherwise the 4-wide one.
8260+#ifdef STBIR_SIMD8
8261+#define stbir__encode_simdfX_unflip stbir__encode_simdf8_unflip
8262+#else
8263+#define stbir__encode_simdfX_unflip stbir__encode_simdf4_unflip
8264+#endif
8265+
// Decodes width_times_channels 8-bit values from inputp into floats at decodep,
// multiplying each by stbir__max_uint8_as_float_inverted and reading each pixel's
// channels in stbir__decode_order0..3 order.
// Returns a pointer one past the last float written
// (decodep + width_times_channels on both the SIMD and scalar paths).
8266+static float * STBIR__CODER_NAME( stbir__decode_uint8_linear_scaled )( float * decodep, int width_times_channels, void const * inputp )
8267+{
8268+ float STBIR_STREAMOUT_PTR( * ) decode = decodep;
8269+ float * decode_end = (float*) decode + width_times_channels;
8270+ unsigned char const * input = (unsigned char const*)inputp;
8271+
8272+ #ifdef STBIR_SIMD
// NOTE(review): end_input_m16 is formed before the >= 16 check, so for shorter
// rows it points before inputp; it is only used inside the guarded branch.
8273+ unsigned char const * end_input_m16 = input + width_times_channels - 16;
8274+ if ( width_times_channels >= 16 )
8275+ {
// decode_end now marks the start of the last full 16-value chunk.
8276+ decode_end -= 16;
8277+ STBIR_NO_UNROLL_LOOP_START_INF_FOR
8278+ for(;;)
8279+ {
8280+ #ifdef STBIR_SIMD8
8281+ stbir__simdi i; stbir__simdi8 o0,o1;
8282+ stbir__simdf8 of0, of1;
8283+ STBIR_NO_UNROLL(decode);
8284+ stbir__simdi_load( i, input );
8285+ stbir__simdi8_expand_u8_to_u32( o0, o1, i );
8286+ stbir__simdi8_convert_i32_to_float( of0, o0 );
8287+ stbir__simdi8_convert_i32_to_float( of1, o1 );
8288+ stbir__simdf8_mult( of0, of0, STBIR_max_uint8_as_float_inverted8);
8289+ stbir__simdf8_mult( of1, of1, STBIR_max_uint8_as_float_inverted8);
8290+ stbir__decode_simdf8_flip( of0 );
8291+ stbir__decode_simdf8_flip( of1 );
8292+ stbir__simdf8_store( decode + 0, of0 );
8293+ stbir__simdf8_store( decode + 8, of1 );
8294+ #else
8295+ stbir__simdi i, o0, o1, o2, o3;
8296+ stbir__simdf of0, of1, of2, of3;
8297+ STBIR_NO_UNROLL(decode);
8298+ stbir__simdi_load( i, input );
8299+ stbir__simdi_expand_u8_to_u32( o0,o1,o2,o3,i);
8300+ stbir__simdi_convert_i32_to_float( of0, o0 );
8301+ stbir__simdi_convert_i32_to_float( of1, o1 );
8302+ stbir__simdi_convert_i32_to_float( of2, o2 );
8303+ stbir__simdi_convert_i32_to_float( of3, o3 );
8304+ stbir__simdf_mult( of0, of0, STBIR__CONSTF(STBIR_max_uint8_as_float_inverted) );
8305+ stbir__simdf_mult( of1, of1, STBIR__CONSTF(STBIR_max_uint8_as_float_inverted) );
8306+ stbir__simdf_mult( of2, of2, STBIR__CONSTF(STBIR_max_uint8_as_float_inverted) );
8307+ stbir__simdf_mult( of3, of3, STBIR__CONSTF(STBIR_max_uint8_as_float_inverted) );
8308+ stbir__decode_simdf4_flip( of0 );
8309+ stbir__decode_simdf4_flip( of1 );
8310+ stbir__decode_simdf4_flip( of2 );
8311+ stbir__decode_simdf4_flip( of3 );
8312+ stbir__simdf_store( decode + 0, of0 );
8313+ stbir__simdf_store( decode + 4, of1 );
8314+ stbir__simdf_store( decode + 8, of2 );
8315+ stbir__simdf_store( decode + 12, of3 );
8316+ #endif
// Advance one chunk. Keep going while another full chunk fits; stop when the
// row ended exactly on a chunk boundary; otherwise rewind to the final 16
// values and redo them (overlapping values already written).
8317+ decode += 16;
8318+ input += 16;
8319+ if ( decode <= decode_end )
8320+ continue;
8321+ if ( decode == ( decode_end + 16 ) )
8322+ break;
8323+ decode = decode_end; // backup and do last couple
8324+ input = end_input_m16;
8325+ }
// Undo the earlier -16 so the true end of the row is returned.
8326+ return decode_end + 16;
8327+ }
8328+ #endif
8329+
8330+ // try to do blocks of 4 when you can
8331+ #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
// decode is kept 4 ahead so "decode <= decode_end" means a whole group of four
// still fits; the [k-4] indices address that group.
8332+ decode += 4;
8333+ STBIR_SIMD_NO_UNROLL_LOOP_START
8334+ while( decode <= decode_end )
8335+ {
8336+ STBIR_SIMD_NO_UNROLL(decode);
8337+ decode[0-4] = ((float)(input[stbir__decode_order0])) * stbir__max_uint8_as_float_inverted;
8338+ decode[1-4] = ((float)(input[stbir__decode_order1])) * stbir__max_uint8_as_float_inverted;
8339+ decode[2-4] = ((float)(input[stbir__decode_order2])) * stbir__max_uint8_as_float_inverted;
8340+ decode[3-4] = ((float)(input[stbir__decode_order3])) * stbir__max_uint8_as_float_inverted;
8341+ decode += 4;
8342+ input += 4;
8343+ }
8344+ decode -= 4;
8345+ #endif
8346+
8347+ // do the remnants
8348+ #if stbir__coder_min_num < 4
// Leftover values, one pixel of stbir__coder_min_num channels per iteration.
8349+ STBIR_NO_UNROLL_LOOP_START
8350+ while( decode < decode_end )
8351+ {
8352+ STBIR_NO_UNROLL(decode);
8353+ decode[0] = ((float)(input[stbir__decode_order0])) * stbir__max_uint8_as_float_inverted;
8354+ #if stbir__coder_min_num >= 2
8355+ decode[1] = ((float)(input[stbir__decode_order1])) * stbir__max_uint8_as_float_inverted;
8356+ #endif
8357+ #if stbir__coder_min_num >= 3
8358+ decode[2] = ((float)(input[stbir__decode_order2])) * stbir__max_uint8_as_float_inverted;
8359+ #endif
8360+ decode += stbir__coder_min_num;
8361+ input += stbir__coder_min_num;
8362+ }
8363+ #endif
8364+
8365+ return decode_end;
8366+}
8367+
// Encodes width_times_channels floats from encode into 8-bit values at outputp.
// Scalar path: value * stbir__max_uint8_as_float + 0.5, clamped to [0, 255] and
// truncated to unsigned char, with source channels picked in
// stbir__encode_order0..3 order. The SIMD path uses the corresponding vector
// helpers (presumably the same arithmetic -- their definitions are not in this
// excerpt).
8368+static void STBIR__CODER_NAME( stbir__encode_uint8_linear_scaled )( void * outputp, int width_times_channels, float const * encode )
8369+{
8370+ unsigned char STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned char *) outputp;
8371+ unsigned char * end_output = ( (unsigned char *) output ) + width_times_channels;
8372+
8373+ #ifdef STBIR_SIMD
// Wide path: two SIMD registers (stbir__simdfX_float_count*2 values) per step.
8374+ if ( width_times_channels >= stbir__simdfX_float_count*2 )
8375+ {
8376+ float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2;
8377+ end_output -= stbir__simdfX_float_count*2;
8378+ STBIR_NO_UNROLL_LOOP_START_INF_FOR
8379+ for(;;)
8380+ {
8381+ stbir__simdfX e0, e1;
8382+ stbir__simdi i;
8383+ STBIR_SIMD_NO_UNROLL(encode);
8384+ stbir__simdfX_madd_mem( e0, STBIR_simd_point5X, STBIR_max_uint8_as_floatX, encode );
8385+ stbir__simdfX_madd_mem( e1, STBIR_simd_point5X, STBIR_max_uint8_as_floatX, encode+stbir__simdfX_float_count );
8386+ stbir__encode_simdfX_unflip( e0 );
8387+ stbir__encode_simdfX_unflip( e1 );
8388+ #ifdef STBIR_SIMD8
8389+ stbir__simdf8_pack_to_16bytes( i, e0, e1 );
8390+ stbir__simdi_store( output, i );
8391+ #else
8392+ stbir__simdf_pack_to_8bytes( i, e0, e1 );
8393+ stbir__simdi_store2( output, i );
8394+ #endif
// Same tail handling as the decoders: continue while a full chunk fits, stop on
// an exact fit, otherwise rewind to the last full chunk and redo it once.
8395+ encode += stbir__simdfX_float_count*2;
8396+ output += stbir__simdfX_float_count*2;
8397+ if ( output <= end_output )
8398+ continue;
8399+ if ( output == ( end_output + stbir__simdfX_float_count*2 ) )
8400+ break;
8401+ output = end_output; // backup and do last couple
8402+ encode = end_encode_m8;
8403+ }
8404+ return;
8405+ }
8406+
8407+ // try to do blocks of 4 when you can
8408+ #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
8409+ output += 4;
8410+ STBIR_NO_UNROLL_LOOP_START
8411+ while( output <= end_output )
8412+ {
8413+ stbir__simdf e0;
8414+ stbir__simdi i0;
8415+ STBIR_NO_UNROLL(encode);
8416+ stbir__simdf_load( e0, encode );
8417+ stbir__simdf_madd( e0, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint8_as_float), e0 );
8418+ stbir__encode_simdf4_unflip( e0 );
8419+ stbir__simdf_pack_to_8bytes( i0, e0, e0 ); // only use first 4
// Stores the four packed bytes with a single int-sized write through a cast
// pointer (output-4 is the group start because output runs 4 ahead).
8420+ *(int*)(output-4) = stbir__simdi_to_int( i0 );
8421+ output += 4;
8422+ encode += 4;
8423+ }
8424+ output -= 4;
8425+ #endif
8426+
8427+ // do the remnants
8428+ #if stbir__coder_min_num < 4
8429+ STBIR_NO_UNROLL_LOOP_START
8430+ while( output < end_output )
8431+ {
8432+ stbir__simdf e0;
8433+ STBIR_NO_UNROLL(encode);
8434+ stbir__simdf_madd1_mem( e0, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint8_as_float), encode+stbir__encode_order0 ); output[0] = stbir__simdf_convert_float_to_uint8( e0 );
8435+ #if stbir__coder_min_num >= 2
8436+ stbir__simdf_madd1_mem( e0, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint8_as_float), encode+stbir__encode_order1 ); output[1] = stbir__simdf_convert_float_to_uint8( e0 );
8437+ #endif
8438+ #if stbir__coder_min_num >= 3
8439+ stbir__simdf_madd1_mem( e0, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint8_as_float), encode+stbir__encode_order2 ); output[2] = stbir__simdf_convert_float_to_uint8( e0 );
8440+ #endif
8441+ output += stbir__coder_min_num;
8442+ encode += stbir__coder_min_num;
8443+ }
8444+ #endif
8445+
8446+ #else
8447+
// Non-SIMD build: same two-stage structure in plain C.
8448+ // try to do blocks of 4 when you can
8449+ #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
8450+ output += 4;
8451+ while( output <= end_output )
8452+ {
8453+ float f;
8454+ f = encode[stbir__encode_order0] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[0-4] = (unsigned char)f;
8455+ f = encode[stbir__encode_order1] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[1-4] = (unsigned char)f;
8456+ f = encode[stbir__encode_order2] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[2-4] = (unsigned char)f;
8457+ f = encode[stbir__encode_order3] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[3-4] = (unsigned char)f;
8458+ output += 4;
8459+ encode += 4;
8460+ }
8461+ output -= 4;
8462+ #endif
8463+
8464+ // do the remnants
8465+ #if stbir__coder_min_num < 4
8466+ STBIR_NO_UNROLL_LOOP_START
8467+ while( output < end_output )
8468+ {
8469+ float f;
8470+ STBIR_NO_UNROLL(encode);
8471+ f = encode[stbir__encode_order0] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[0] = (unsigned char)f;
8472+ #if stbir__coder_min_num >= 2
8473+ f = encode[stbir__encode_order1] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[1] = (unsigned char)f;
8474+ #endif
8475+ #if stbir__coder_min_num >= 3
8476+ f = encode[stbir__encode_order2] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[2] = (unsigned char)f;
8477+ #endif
8478+ output += stbir__coder_min_num;
8479+ encode += stbir__coder_min_num;
8480+ }
8481+ #endif
8482+ #endif
8483+}
8484+
// Decodes width_times_channels 8-bit values from inputp into floats at decodep
// without rescaling (each byte becomes its float value), reading each pixel's
// channels in stbir__decode_order0..3 order.
// Returns a pointer one past the last float written.
8485+static float * STBIR__CODER_NAME(stbir__decode_uint8_linear)( float * decodep, int width_times_channels, void const * inputp )
8486+{
8487+ float STBIR_STREAMOUT_PTR( * ) decode = decodep;
8488+ float * decode_end = (float*) decode + width_times_channels;
8489+ unsigned char const * input = (unsigned char const*)inputp;
8490+
8491+ #ifdef STBIR_SIMD
// NOTE(review): end_input_m16 is formed before the >= 16 check, so for shorter
// rows it points before inputp; it is only used inside the guarded branch.
8492+ unsigned char const * end_input_m16 = input + width_times_channels - 16;
8493+ if ( width_times_channels >= 16 )
8494+ {
// decode_end now marks the start of the last full 16-value chunk.
8495+ decode_end -= 16;
8496+ STBIR_NO_UNROLL_LOOP_START_INF_FOR
8497+ for(;;)
8498+ {
8499+ #ifdef STBIR_SIMD8
8500+ stbir__simdi i; stbir__simdi8 o0,o1;
8501+ stbir__simdf8 of0, of1;
8502+ STBIR_NO_UNROLL(decode);
8503+ stbir__simdi_load( i, input );
8504+ stbir__simdi8_expand_u8_to_u32( o0, o1, i );
8505+ stbir__simdi8_convert_i32_to_float( of0, o0 );
8506+ stbir__simdi8_convert_i32_to_float( of1, o1 );
8507+ stbir__decode_simdf8_flip( of0 );
8508+ stbir__decode_simdf8_flip( of1 );
8509+ stbir__simdf8_store( decode + 0, of0 );
8510+ stbir__simdf8_store( decode + 8, of1 );
8511+ #else
8512+ stbir__simdi i, o0, o1, o2, o3;
8513+ stbir__simdf of0, of1, of2, of3;
8514+ STBIR_NO_UNROLL(decode);
8515+ stbir__simdi_load( i, input );
8516+ stbir__simdi_expand_u8_to_u32( o0,o1,o2,o3,i);
8517+ stbir__simdi_convert_i32_to_float( of0, o0 );
8518+ stbir__simdi_convert_i32_to_float( of1, o1 );
8519+ stbir__simdi_convert_i32_to_float( of2, o2 );
8520+ stbir__simdi_convert_i32_to_float( of3, o3 );
8521+ stbir__decode_simdf4_flip( of0 );
8522+ stbir__decode_simdf4_flip( of1 );
8523+ stbir__decode_simdf4_flip( of2 );
8524+ stbir__decode_simdf4_flip( of3 );
8525+ stbir__simdf_store( decode + 0, of0 );
8526+ stbir__simdf_store( decode + 4, of1 );
8527+ stbir__simdf_store( decode + 8, of2 );
8528+ stbir__simdf_store( decode + 12, of3 );
8529+#endif
// Continue while a full chunk fits; stop on an exact fit; otherwise rewind to
// the final 16 values and redo them once (overlapping earlier output).
8530+ decode += 16;
8531+ input += 16;
8532+ if ( decode <= decode_end )
8533+ continue;
8534+ if ( decode == ( decode_end + 16 ) )
8535+ break;
8536+ decode = decode_end; // backup and do last couple
8537+ input = end_input_m16;
8538+ }
// Undo the earlier -16 so the true end of the row is returned.
8539+ return decode_end + 16;
8540+ }
8541+ #endif
8542+
8543+ // try to do blocks of 4 when you can
8544+ #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
// decode runs 4 ahead so the loop test means "a whole group of four still fits".
8545+ decode += 4;
8546+ STBIR_SIMD_NO_UNROLL_LOOP_START
8547+ while( decode <= decode_end )
8548+ {
8549+ STBIR_SIMD_NO_UNROLL(decode);
8550+ decode[0-4] = ((float)(input[stbir__decode_order0]));
8551+ decode[1-4] = ((float)(input[stbir__decode_order1]));
8552+ decode[2-4] = ((float)(input[stbir__decode_order2]));
8553+ decode[3-4] = ((float)(input[stbir__decode_order3]));
8554+ decode += 4;
8555+ input += 4;
8556+ }
8557+ decode -= 4;
8558+ #endif
8559+
8560+ // do the remnants
8561+ #if stbir__coder_min_num < 4
8562+ STBIR_NO_UNROLL_LOOP_START
8563+ while( decode < decode_end )
8564+ {
8565+ STBIR_NO_UNROLL(decode);
8566+ decode[0] = ((float)(input[stbir__decode_order0]));
8567+ #if stbir__coder_min_num >= 2
8568+ decode[1] = ((float)(input[stbir__decode_order1]));
8569+ #endif
8570+ #if stbir__coder_min_num >= 3
8571+ decode[2] = ((float)(input[stbir__decode_order2]));
8572+ #endif
8573+ decode += stbir__coder_min_num;
8574+ input += stbir__coder_min_num;
8575+ }
8576+ #endif
8577+ return decode_end;
8578+}
8579+
// Encodes width_times_channels floats from encode into 8-bit values at outputp
// without rescaling. Scalar path: value + 0.5, clamped to [0, 255] and truncated
// to unsigned char, with source channels picked in stbir__encode_order0..3
// order. Unlike the *_scaled encoder, the scalar "remnants" loop at the bottom is
// shared by the SIMD and non-SIMD builds.
8580+static void STBIR__CODER_NAME( stbir__encode_uint8_linear )( void * outputp, int width_times_channels, float const * encode )
8581+{
8582+ unsigned char STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned char *) outputp;
8583+ unsigned char * end_output = ( (unsigned char *) output ) + width_times_channels;
8584+
8585+ #ifdef STBIR_SIMD
// Wide path: two SIMD registers (stbir__simdfX_float_count*2 values) per step.
8586+ if ( width_times_channels >= stbir__simdfX_float_count*2 )
8587+ {
8588+ float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2;
8589+ end_output -= stbir__simdfX_float_count*2;
8590+ STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
8591+ for(;;)
8592+ {
8593+ stbir__simdfX e0, e1;
8594+ stbir__simdi i;
8595+ STBIR_SIMD_NO_UNROLL(encode);
8596+ stbir__simdfX_add_mem( e0, STBIR_simd_point5X, encode );
8597+ stbir__simdfX_add_mem( e1, STBIR_simd_point5X, encode+stbir__simdfX_float_count );
8598+ stbir__encode_simdfX_unflip( e0 );
8599+ stbir__encode_simdfX_unflip( e1 );
8600+ #ifdef STBIR_SIMD8
8601+ stbir__simdf8_pack_to_16bytes( i, e0, e1 );
8602+ stbir__simdi_store( output, i );
8603+ #else
8604+ stbir__simdf_pack_to_8bytes( i, e0, e1 );
8605+ stbir__simdi_store2( output, i );
8606+ #endif
// Continue while a full chunk fits; stop on an exact fit; otherwise rewind to
// the last full chunk and redo it once.
8607+ encode += stbir__simdfX_float_count*2;
8608+ output += stbir__simdfX_float_count*2;
8609+ if ( output <= end_output )
8610+ continue;
8611+ if ( output == ( end_output + stbir__simdfX_float_count*2 ) )
8612+ break;
8613+ output = end_output; // backup and do last couple
8614+ encode = end_encode_m8;
8615+ }
8616+ return;
8617+ }
8618+
8619+ // try to do blocks of 4 when you can
8620+ #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
8621+ output += 4;
8622+ STBIR_NO_UNROLL_LOOP_START
8623+ while( output <= end_output )
8624+ {
8625+ stbir__simdf e0;
8626+ stbir__simdi i0;
8627+ STBIR_NO_UNROLL(encode);
8628+ stbir__simdf_load( e0, encode );
8629+ stbir__simdf_add( e0, STBIR__CONSTF(STBIR_simd_point5), e0 );
8630+ stbir__encode_simdf4_unflip( e0 );
8631+ stbir__simdf_pack_to_8bytes( i0, e0, e0 ); // only use first 4
// Four packed bytes are stored with one int-sized write through a cast pointer.
8632+ *(int*)(output-4) = stbir__simdi_to_int( i0 );
8633+ output += 4;
8634+ encode += 4;
8635+ }
8636+ output -= 4;
8637+ #endif
8638+
8639+ #else
8640+
8641+ // try to do blocks of 4 when you can
8642+ #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
8643+ output += 4;
8644+ while( output <= end_output )
8645+ {
8646+ float f;
8647+ f = encode[stbir__encode_order0] + 0.5f; STBIR_CLAMP(f, 0, 255); output[0-4] = (unsigned char)f;
8648+ f = encode[stbir__encode_order1] + 0.5f; STBIR_CLAMP(f, 0, 255); output[1-4] = (unsigned char)f;
8649+ f = encode[stbir__encode_order2] + 0.5f; STBIR_CLAMP(f, 0, 255); output[2-4] = (unsigned char)f;
8650+ f = encode[stbir__encode_order3] + 0.5f; STBIR_CLAMP(f, 0, 255); output[3-4] = (unsigned char)f;
8651+ output += 4;
8652+ encode += 4;
8653+ }
8654+ output -= 4;
8655+ #endif
8656+
8657+ #endif
8658+
8659+ // do the remnants
8660+ #if stbir__coder_min_num < 4
8661+ STBIR_NO_UNROLL_LOOP_START
8662+ while( output < end_output )
8663+ {
8664+ float f;
8665+ STBIR_NO_UNROLL(encode);
8666+ f = encode[stbir__encode_order0] + 0.5f; STBIR_CLAMP(f, 0, 255); output[0] = (unsigned char)f;
8667+ #if stbir__coder_min_num >= 2
8668+ f = encode[stbir__encode_order1] + 0.5f; STBIR_CLAMP(f, 0, 255); output[1] = (unsigned char)f;
8669+ #endif
8670+ #if stbir__coder_min_num >= 3
8671+ f = encode[stbir__encode_order2] + 0.5f; STBIR_CLAMP(f, 0, 255); output[2] = (unsigned char)f;
8672+ #endif
8673+ output += stbir__coder_min_num;
8674+ encode += stbir__coder_min_num;
8675+ }
8676+ #endif
8677+}
8678+
// Decodes width_times_channels 8-bit values from inputp into floats at decodep
// by looking every byte up in stbir__srgb_uchar_to_linear_float. All channels,
// including the fourth, go through the table here (the *_linearalpha variants
// treat the alpha channel differently). Channels are read in
// stbir__decode_order0..3 order. There is no SIMD path.
// Returns a pointer one past the last float written.
8679+static float * STBIR__CODER_NAME(stbir__decode_uint8_srgb)( float * decodep, int width_times_channels, void const * inputp )
8680+{
8681+ float STBIR_STREAMOUT_PTR( * ) decode = decodep;
8682+ float * decode_end = (float*) decode + width_times_channels;
8683+ unsigned char const * input = (unsigned char const *)inputp;
8684+
8685+ // try to do blocks of 4 when you can
8686+ #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
// decode runs 4 ahead so the loop test means "a whole group of four still fits".
8687+ decode += 4;
8688+ while( decode <= decode_end )
8689+ {
8690+ decode[0-4] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order0 ] ];
8691+ decode[1-4] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order1 ] ];
8692+ decode[2-4] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order2 ] ];
8693+ decode[3-4] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order3 ] ];
8694+ decode += 4;
8695+ input += 4;
8696+ }
8697+ decode -= 4;
8698+ #endif
8699+
8700+ // do the remnants
8701+ #if stbir__coder_min_num < 4
8702+ STBIR_NO_UNROLL_LOOP_START
8703+ while( decode < decode_end )
8704+ {
8705+ STBIR_NO_UNROLL(decode);
8706+ decode[0] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order0 ] ];
8707+ #if stbir__coder_min_num >= 2
8708+ decode[1] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order1 ] ];
8709+ #endif
8710+ #if stbir__coder_min_num >= 3
8711+ decode[2] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order2 ] ];
8712+ #endif
8713+ decode += stbir__coder_min_num;
8714+ input += stbir__coder_min_num;
8715+ }
8716+ #endif
8717+ return decode_end;
8718+}
8719+
// SIMD building blocks used by the sRGB encoders below. The primitive helpers
// they call are defined outside this excerpt, so the descriptions follow the
// helper names and argument order -- confirm against their definitions.

// stbir__min_max_shift20( i, f ): clamps f between the STBIR_almost_zero and
// STBIR_almost_one constants (in place), then sets i to the bit pattern of f
// shifted right by 20. The result is used as a table index by the callers.
8720+#define stbir__min_max_shift20( i, f ) \
8721+ stbir__simdf_max( f, f, stbir_simdf_casti(STBIR__CONSTI( STBIR_almost_zero )) ); \
8722+ stbir__simdf_min( f, f, stbir_simdf_casti(STBIR__CONSTI( STBIR_almost_one )) ); \
8723+ stbir__simdi_32shr( i, stbir_simdi_castf( f ), 20 );
8724+
// stbir__scale_and_convert( i, f ): multiply-add of f with the max-uint8 and 0.5
// constants, clamp to [0, STBIR_max_uint8_as_float], then convert to 32-bit ints
// in i. Used for the linear alpha channel.
8725+#define stbir__scale_and_convert( i, f ) \
8726+ stbir__simdf_madd( f, STBIR__CONSTF( STBIR_simd_point5 ), STBIR__CONSTF( STBIR_max_uint8_as_float ), f ); \
8727+ stbir__simdf_max( f, f, stbir__simdf_zeroP() ); \
8728+ stbir__simdf_min( f, f, STBIR__CONSTF( STBIR_max_uint8_as_float ) ); \
8729+ stbir__simdf_convert_float_to_i32( i, f );
8730+
// stbir__linear_to_srgb_finish( i, f ): combines the table entry already in i
// with bits taken from f (bit pattern >> 12, masked with STBIR_mastissa_mask and
// OR'd with STBIR_topscale) via a 16-bit multiply-add, then shifts the result
// right by 16.
8731+#define stbir__linear_to_srgb_finish( i, f ) \
8732+{ \
8733+ stbir__simdi temp; \
8734+ stbir__simdi_32shr( temp, stbir_simdi_castf( f ), 12 ) ; \
8735+ stbir__simdi_and( temp, temp, STBIR__CONSTI(STBIR_mastissa_mask) ); \
8736+ stbir__simdi_or( temp, temp, STBIR__CONSTI(STBIR_topscale) ); \
8737+ stbir__simdi_16madd( i, i, temp ); \
8738+ stbir__simdi_32shr( i, i, 16 ); \
8739+}
8740+
// stbir__simdi_table_lookup2/3/4: element-wise table gather done in scalar code.
// Each vector is copied into a stbir__simdi_u32 temporary, every one of its four
// 32-bit elements is replaced by table[element], and the result is copied back
// into the same vector argument. The 2/3/4 suffix is the number of vectors.
8741+#define stbir__simdi_table_lookup2( v0,v1, table ) \
8742+{ \
8743+ stbir__simdi_u32 temp0,temp1; \
8744+ temp0.m128i_i128 = v0; \
8745+ temp1.m128i_i128 = v1; \
8746+ temp0.m128i_u32[0] = table[temp0.m128i_i32[0]]; temp0.m128i_u32[1] = table[temp0.m128i_i32[1]]; temp0.m128i_u32[2] = table[temp0.m128i_i32[2]]; temp0.m128i_u32[3] = table[temp0.m128i_i32[3]]; \
8747+ temp1.m128i_u32[0] = table[temp1.m128i_i32[0]]; temp1.m128i_u32[1] = table[temp1.m128i_i32[1]]; temp1.m128i_u32[2] = table[temp1.m128i_i32[2]]; temp1.m128i_u32[3] = table[temp1.m128i_i32[3]]; \
8748+ v0 = temp0.m128i_i128; \
8749+ v1 = temp1.m128i_i128; \
8750+}
8751+
8752+#define stbir__simdi_table_lookup3( v0,v1,v2, table ) \
8753+{ \
8754+ stbir__simdi_u32 temp0,temp1,temp2; \
8755+ temp0.m128i_i128 = v0; \
8756+ temp1.m128i_i128 = v1; \
8757+ temp2.m128i_i128 = v2; \
8758+ temp0.m128i_u32[0] = table[temp0.m128i_i32[0]]; temp0.m128i_u32[1] = table[temp0.m128i_i32[1]]; temp0.m128i_u32[2] = table[temp0.m128i_i32[2]]; temp0.m128i_u32[3] = table[temp0.m128i_i32[3]]; \
8759+ temp1.m128i_u32[0] = table[temp1.m128i_i32[0]]; temp1.m128i_u32[1] = table[temp1.m128i_i32[1]]; temp1.m128i_u32[2] = table[temp1.m128i_i32[2]]; temp1.m128i_u32[3] = table[temp1.m128i_i32[3]]; \
8760+ temp2.m128i_u32[0] = table[temp2.m128i_i32[0]]; temp2.m128i_u32[1] = table[temp2.m128i_i32[1]]; temp2.m128i_u32[2] = table[temp2.m128i_i32[2]]; temp2.m128i_u32[3] = table[temp2.m128i_i32[3]]; \
8761+ v0 = temp0.m128i_i128; \
8762+ v1 = temp1.m128i_i128; \
8763+ v2 = temp2.m128i_i128; \
8764+}
8765+
8766+#define stbir__simdi_table_lookup4( v0,v1,v2,v3, table ) \
8767+{ \
8768+ stbir__simdi_u32 temp0,temp1,temp2,temp3; \
8769+ temp0.m128i_i128 = v0; \
8770+ temp1.m128i_i128 = v1; \
8771+ temp2.m128i_i128 = v2; \
8772+ temp3.m128i_i128 = v3; \
8773+ temp0.m128i_u32[0] = table[temp0.m128i_i32[0]]; temp0.m128i_u32[1] = table[temp0.m128i_i32[1]]; temp0.m128i_u32[2] = table[temp0.m128i_i32[2]]; temp0.m128i_u32[3] = table[temp0.m128i_i32[3]]; \
8774+ temp1.m128i_u32[0] = table[temp1.m128i_i32[0]]; temp1.m128i_u32[1] = table[temp1.m128i_i32[1]]; temp1.m128i_u32[2] = table[temp1.m128i_i32[2]]; temp1.m128i_u32[3] = table[temp1.m128i_i32[3]]; \
8775+ temp2.m128i_u32[0] = table[temp2.m128i_i32[0]]; temp2.m128i_u32[1] = table[temp2.m128i_i32[1]]; temp2.m128i_u32[2] = table[temp2.m128i_i32[2]]; temp2.m128i_u32[3] = table[temp2.m128i_i32[3]]; \
8776+ temp3.m128i_u32[0] = table[temp3.m128i_i32[0]]; temp3.m128i_u32[1] = table[temp3.m128i_i32[1]]; temp3.m128i_u32[2] = table[temp3.m128i_i32[2]]; temp3.m128i_u32[3] = table[temp3.m128i_i32[3]]; \
8777+ v0 = temp0.m128i_i128; \
8778+ v1 = temp1.m128i_i128; \
8779+ v2 = temp2.m128i_i128; \
8780+ v3 = temp3.m128i_i128; \
8781+}
8782+
// Encodes width_times_channels linear floats from encode into 8-bit sRGB values
// at outputp. Every channel is converted with the sRGB mapping: the scalar path
// calls stbir__linear_to_srgb_uchar per value, the SIMD path uses the
// fp32_to_srgb8_tab4 table through the helper macros above. Source channels are
// picked in stbir__encode_order0..3 order.
8783+static void STBIR__CODER_NAME( stbir__encode_uint8_srgb )( void * outputp, int width_times_channels, float const * encode )
8784+{
8785+ unsigned char STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned char*) outputp;
8786+ unsigned char * end_output = ( (unsigned char*) output ) + width_times_channels;
8787+
8788+ #ifdef STBIR_SIMD
8789+
// Wide path: 16 values per step, taken only when the row has at least 16.
8790+ if ( width_times_channels >= 16 )
8791+ {
8792+ float const * end_encode_m16 = encode + width_times_channels - 16;
8793+ end_output -= 16;
8794+ STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
8795+ for(;;)
8796+ {
8797+ stbir__simdf f0, f1, f2, f3;
8798+ stbir__simdi i0, i1, i2, i3;
8799+ STBIR_SIMD_NO_UNROLL(encode);
8800+
8801+ stbir__simdf_load4_transposed( f0, f1, f2, f3, encode );
8802+
8803+ stbir__min_max_shift20( i0, f0 );
8804+ stbir__min_max_shift20( i1, f1 );
8805+ stbir__min_max_shift20( i2, f2 );
8806+ stbir__min_max_shift20( i3, f3 );
8807+
// The table pointer is pre-biased by (127-13)*8 entries to match the indices
// produced by stbir__min_max_shift20.
8808+ stbir__simdi_table_lookup4( i0, i1, i2, i3, ( fp32_to_srgb8_tab4 - (127-13)*8 ) );
8809+
8810+ stbir__linear_to_srgb_finish( i0, f0 );
8811+ stbir__linear_to_srgb_finish( i1, f1 );
8812+ stbir__linear_to_srgb_finish( i2, f2 );
8813+ stbir__linear_to_srgb_finish( i3, f3 );
8814+
// STBIR_strs_join1(i, ,orderN) pastes to i0..i3, so the encode order selects
// which vector is written to each output channel.
8815+ stbir__interleave_pack_and_store_16_u8( output, STBIR_strs_join1(i, ,stbir__encode_order0), STBIR_strs_join1(i, ,stbir__encode_order1), STBIR_strs_join1(i, ,stbir__encode_order2), STBIR_strs_join1(i, ,stbir__encode_order3) );
8816+
// Continue while a full chunk fits; stop on an exact fit; otherwise rewind to
// the final 16 values and redo them once.
8817+ encode += 16;
8818+ output += 16;
8819+ if ( output <= end_output )
8820+ continue;
8821+ if ( output == ( end_output + 16 ) )
8822+ break;
8823+ output = end_output; // backup and do last couple
8824+ encode = end_encode_m16;
8825+ }
8826+ return;
8827+ }
8828+ #endif
8829+
8830+ // try to do blocks of 4 when you can
8831+ #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
// output runs 4 ahead so the loop test means "a whole group of four still fits".
8832+ output += 4;
8833+ STBIR_SIMD_NO_UNROLL_LOOP_START
8834+ while ( output <= end_output )
8835+ {
8836+ STBIR_SIMD_NO_UNROLL(encode);
8837+
8838+ output[0-4] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order0] );
8839+ output[1-4] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order1] );
8840+ output[2-4] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order2] );
8841+ output[3-4] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order3] );
8842+
8843+ output += 4;
8844+ encode += 4;
8845+ }
8846+ output -= 4;
8847+ #endif
8848+
8849+ // do the remnants
8850+ #if stbir__coder_min_num < 4
8851+ STBIR_NO_UNROLL_LOOP_START
8852+ while( output < end_output )
8853+ {
8854+ STBIR_NO_UNROLL(encode);
8855+ output[0] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order0] );
8856+ #if stbir__coder_min_num >= 2
8857+ output[1] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order1] );
8858+ #endif
8859+ #if stbir__coder_min_num >= 3
8860+ output[2] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order2] );
8861+ #endif
8862+ output += stbir__coder_min_num;
8863+ encode += stbir__coder_min_num;
8864+ }
8865+ #endif
8866+}
8867+
8868+#if ( stbir__coder_min_num == 4 ) || ( ( stbir__coder_min_num == 1 ) && ( !defined(stbir__decode_swizzle) ) )
8869+
// Decodes 4-channel 8-bit pixels into floats: the channels selected by
// stbir__decode_order0..2 go through the stbir__srgb_uchar_to_linear_float table,
// the channel selected by stbir__decode_order3 is treated as linear and simply
// multiplied by stbir__max_uint8_as_float_inverted.
// Returns a pointer one past the last float written.
// NOTE(review): the do/while body runs at least once, so one pixel is read and
// written even if width_times_channels is 0 -- callers presumably never pass 0.
8870+static float * STBIR__CODER_NAME(stbir__decode_uint8_srgb4_linearalpha)( float * decodep, int width_times_channels, void const * inputp )
8871+{
8872+ float STBIR_STREAMOUT_PTR( * ) decode = decodep;
8873+ float * decode_end = (float*) decode + width_times_channels;
8874+ unsigned char const * input = (unsigned char const *)inputp;
8875+
8876+ do {
8877+ decode[0] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order0] ];
8878+ decode[1] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order1] ];
8879+ decode[2] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order2] ];
8880+ decode[3] = ( (float) input[stbir__decode_order3] ) * stbir__max_uint8_as_float_inverted;
8881+ input += 4;
8882+ decode += 4;
8883+ } while( decode < decode_end );
8884+ return decode_end;
8885+}
8886+
8887+
// Encodes 4-channel float pixels into 8-bit output: three channels are converted
// with the sRGB mapping, the fourth (alpha) is kept linear -- scaled by
// stbir__max_uint8_as_float, rounded with +0.5 and clamped to [0, 255].
// NOTE(review): the scalar do/while body runs at least once, so one pixel is
// written even if width_times_channels is 0.
8888+static void STBIR__CODER_NAME( stbir__encode_uint8_srgb4_linearalpha )( void * outputp, int width_times_channels, float const * encode )
8889+{
8890+ unsigned char STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned char*) outputp;
8891+ unsigned char * end_output = ( (unsigned char*) output ) + width_times_channels;
8892+
8893+ #ifdef STBIR_SIMD
8894+
// Wide path: 16 values (four pixels) per step.
8895+ if ( width_times_channels >= 16 )
8896+ {
8897+ float const * end_encode_m16 = encode + width_times_channels - 16;
8898+ end_output -= 16;
8899+ STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
8900+ for(;;)
8901+ {
8902+ stbir__simdf f0, f1, f2, f3;
8903+ stbir__simdi i0, i1, i2, i3;
8904+
8905+ STBIR_SIMD_NO_UNROLL(encode);
8906+ stbir__simdf_load4_transposed( f0, f1, f2, f3, encode );
8907+
// f0..f2 take the sRGB table route; f3 (alpha) is only scaled and converted.
8908+ stbir__min_max_shift20( i0, f0 );
8909+ stbir__min_max_shift20( i1, f1 );
8910+ stbir__min_max_shift20( i2, f2 );
8911+ stbir__scale_and_convert( i3, f3 );
8912+
8913+ stbir__simdi_table_lookup3( i0, i1, i2, ( fp32_to_srgb8_tab4 - (127-13)*8 ) );
8914+
8915+ stbir__linear_to_srgb_finish( i0, f0 );
8916+ stbir__linear_to_srgb_finish( i1, f1 );
8917+ stbir__linear_to_srgb_finish( i2, f2 );
8918+
// STBIR_strs_join1(i, ,orderN) pastes to i0..i3, selecting the vector written
// to each output channel.
8919+ stbir__interleave_pack_and_store_16_u8( output, STBIR_strs_join1(i, ,stbir__encode_order0), STBIR_strs_join1(i, ,stbir__encode_order1), STBIR_strs_join1(i, ,stbir__encode_order2), STBIR_strs_join1(i, ,stbir__encode_order3) );
8920+
8921+ output += 16;
8922+ encode += 16;
8923+
// Continue while a full chunk fits; stop on an exact fit; otherwise rewind to
// the final 16 values and redo them once.
8924+ if ( output <= end_output )
8925+ continue;
8926+ if ( output == ( end_output + 16 ) )
8927+ break;
8928+ output = end_output; // backup and do last couple
8929+ encode = end_encode_m16;
8930+ }
8931+ return;
8932+ }
8933+ #endif
8934+
// Scalar path: one pixel per iteration. Here the swizzle is applied on the
// output index using the stbir__decode_order* values, while encode[] is read in
// plain 0..3 order.
8935+ STBIR_SIMD_NO_UNROLL_LOOP_START
8936+ do {
8937+ float f;
8938+ STBIR_SIMD_NO_UNROLL(encode);
8939+
8940+ output[stbir__decode_order0] = stbir__linear_to_srgb_uchar( encode[0] );
8941+ output[stbir__decode_order1] = stbir__linear_to_srgb_uchar( encode[1] );
8942+ output[stbir__decode_order2] = stbir__linear_to_srgb_uchar( encode[2] );
8943+
8944+ f = encode[3] * stbir__max_uint8_as_float + 0.5f;
8945+ STBIR_CLAMP(f, 0, 255);
8946+ output[stbir__decode_order3] = (unsigned char) f;
8947+
8948+ output += 4;
8949+ encode += 4;
8950+ } while( output < end_output );
8951+}
8952+
8953+#endif
8954+
8955+#if ( stbir__coder_min_num == 2 ) || ( ( stbir__coder_min_num == 1 ) && ( !defined(stbir__decode_swizzle) ) )
8956+
/* Decode a row of 2-channel (color + alpha) 8-bit pixels to floats.
   The color byte is converted through the stbir__srgb_uchar_to_linear_float lookup
   table; the alpha byte is multiplied by stbir__max_uint8_as_float_inverted.
     decodep              - destination floats
     width_times_channels - number of floats to produce (pixels * 2)
     inputp               - source bytes
   Returns decodep + width_times_channels. */
8957+static float * STBIR__CODER_NAME(stbir__decode_uint8_srgb2_linearalpha)( float * decodep, int width_times_channels, void const * inputp )
8958+{
8959+ float STBIR_STREAMOUT_PTR( * ) decode = decodep;
8960+ float * decode_end = (float*) decode + width_times_channels;
8961+ unsigned char const * input = (unsigned char const *)inputp;
8962+
/* Main loop: two pixels (four floats) per iteration. decode runs four ahead so the
   loop condition can test that a whole group still fits. */
8963+ decode += 4;
8964+ while( decode <= decode_end )
8965+ {
8966+ decode[0-4] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order0] ];
8967+ decode[1-4] = ( (float) input[stbir__decode_order1] ) * stbir__max_uint8_as_float_inverted;
8968+ decode[2-4] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order0+2] ];
8969+ decode[3-4] = ( (float) input[stbir__decode_order1+2] ) * stbir__max_uint8_as_float_inverted;
8970+ input += 4;
8971+ decode += 4;
8972+ }
8973+ decode -= 4;
/* Tail: one pixel left over when the pixel count is odd. */
8974+ if( decode < decode_end )
8975+ {
/* The table must be indexed by the pixel's color byte, exactly as in the main loop;
   indexing it by the bare channel-order constant would ignore the image data. */
8976+ decode[0] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order0] ];
8977+ decode[1] = ( (float) input[stbir__decode_order1] ) * stbir__max_uint8_as_float_inverted;
8978+ }
8979+ return decode_end;
8980+}
8981+
/* Encode a row of 2-channel (color + alpha) float pixels to 8-bit output.
   The color channel goes through the linear->sRGB encoder; alpha is scaled linearly
   (times stbir__max_uint8_as_float, plus 0.5, clamped to 0..255).
     outputp              - destination bytes
     width_times_channels - number of bytes to write (pixels * 2)
     encode               - source floats, two per pixel */
8982+static void STBIR__CODER_NAME( stbir__encode_uint8_srgb2_linearalpha )( void * outputp, int width_times_channels, float const * encode )
8983+{
8984+ unsigned char STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned char*) outputp;
8985+ unsigned char * end_output = ( (unsigned char*) output ) + width_times_channels;
8986+
/* SIMD path: 16 output bytes (eight pixels) per iteration, used only when at least
   16 bytes are requested. */
8987+ #ifdef STBIR_SIMD
8988+
8989+ if ( width_times_channels >= 16 )
8990+ {
8991+ float const * end_encode_m16 = encode + width_times_channels - 16;
/* end_output now marks the last position at which a full 16-byte store still fits. */
8992+ end_output -= 16;
8993+ STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
8994+ for(;;)
8995+ {
8996+ stbir__simdf f0, f1, f2, f3;
8997+ stbir__simdi i0, i1, i2, i3;
8998+
8999+ STBIR_SIMD_NO_UNROLL(encode);
9000+ stbir__simdf_load4_transposed( f0, f1, f2, f3, encode );
9001+
/* After the transposed load, lanes 0 and 2 take the sRGB route (color) and lanes 1 and 3
   are only scaled and converted (alpha). */
9002+ stbir__min_max_shift20( i0, f0 );
9003+ stbir__scale_and_convert( i1, f1 );
9004+ stbir__min_max_shift20( i2, f2 );
9005+ stbir__scale_and_convert( i3, f3 );
9006+
9007+ stbir__simdi_table_lookup2( i0, i2, ( fp32_to_srgb8_tab4 - (127-13)*8 ) );
9008+
9009+ stbir__linear_to_srgb_finish( i0, f0 );
9010+ stbir__linear_to_srgb_finish( i2, f2 );
9011+
/* NOTE(review): STBIR_strs_join1(i, ,stbir__encode_orderN) appears to paste the stem "i"
   with the channel-order digit to pick one of i0..i3, so those local names must be kept -
   confirm against the macro definition. */
9012+ stbir__interleave_pack_and_store_16_u8( output, STBIR_strs_join1(i, ,stbir__encode_order0), STBIR_strs_join1(i, ,stbir__encode_order1), STBIR_strs_join1(i, ,stbir__encode_order2), STBIR_strs_join1(i, ,stbir__encode_order3) );
9013+
9014+ output += 16;
9015+ encode += 16;
/* Continue while a full store fits; stop on an exact fit; otherwise rewind so the last
   store ends exactly at the end of the row. */
9016+ if ( output <= end_output )
9017+ continue;
9018+ if ( output == ( end_output + 16 ) )
9019+ break;
9020+ output = end_output; // backup and do last couple
9021+ encode = end_encode_m16;
9022+ }
9023+ return;
9024+ }
9025+ #endif
9026+
/* Scalar path: one pixel (two bytes) per iteration. The do/while body runs at least
   once, so this assumes width_times_channels > 0 - TODO confirm with callers. */
9027+ STBIR_SIMD_NO_UNROLL_LOOP_START
9028+ do {
9029+ float f;
9030+ STBIR_SIMD_NO_UNROLL(encode);
9031+
9032+ output[stbir__decode_order0] = stbir__linear_to_srgb_uchar( encode[0] );
9033+
/* Alpha stays linear: scale, round by adding 0.5, clamp to the byte range. */
9034+ f = encode[1] * stbir__max_uint8_as_float + 0.5f;
9035+ STBIR_CLAMP(f, 0, 255);
9036+ output[stbir__decode_order1] = (unsigned char) f;
9037+
9038+ output += 2;
9039+ encode += 2;
9040+ } while( output < end_output );
9041+}
9042+
9043+#endif
9044+
/* Decode a row of 16-bit unsigned samples to floats, multiplying each sample by
   stbir__max_uint16_as_float_inverted and applying the configured channel order.
     decodep              - destination floats
     width_times_channels - number of samples to convert
     inputp               - source unsigned shorts
   Returns decodep + width_times_channels. */
9045+static float * STBIR__CODER_NAME(stbir__decode_uint16_linear_scaled)( float * decodep, int width_times_channels, void const * inputp )
9046+{
9047+ float STBIR_STREAMOUT_PTR( * ) decode = decodep;
9048+ float * decode_end = (float*) decode + width_times_channels;
9049+ unsigned short const * input = (unsigned short const *)inputp;
9050+
/* SIMD path: eight samples per iteration when at least eight are requested.
   NOTE(review): end_input_m8 is computed before the size check but only used inside it. */
9051+ #ifdef STBIR_SIMD
9052+ unsigned short const * end_input_m8 = input + width_times_channels - 8;
9053+ if ( width_times_channels >= 8 )
9054+ {
/* decode_end now marks the last position at which a full 8-float store still fits. */
9055+ decode_end -= 8;
9056+ STBIR_NO_UNROLL_LOOP_START_INF_FOR
9057+ for(;;)
9058+ {
9059+ #ifdef STBIR_SIMD8
9060+ stbir__simdi i; stbir__simdi8 o;
9061+ stbir__simdf8 of;
9062+ STBIR_NO_UNROLL(decode);
9063+ stbir__simdi_load( i, input );
9064+ stbir__simdi8_expand_u16_to_u32( o, i );
9065+ stbir__simdi8_convert_i32_to_float( of, o );
9066+ stbir__simdf8_mult( of, of, STBIR_max_uint16_as_float_inverted8);
9067+ stbir__decode_simdf8_flip( of );
9068+ stbir__simdf8_store( decode + 0, of );
9069+ #else
9070+ stbir__simdi i, o0, o1;
9071+ stbir__simdf of0, of1;
9072+ STBIR_NO_UNROLL(decode);
9073+ stbir__simdi_load( i, input );
9074+ stbir__simdi_expand_u16_to_u32( o0,o1,i );
9075+ stbir__simdi_convert_i32_to_float( of0, o0 );
9076+ stbir__simdi_convert_i32_to_float( of1, o1 );
9077+ stbir__simdf_mult( of0, of0, STBIR__CONSTF(STBIR_max_uint16_as_float_inverted) );
9078+ stbir__simdf_mult( of1, of1, STBIR__CONSTF(STBIR_max_uint16_as_float_inverted));
9079+ stbir__decode_simdf4_flip( of0 );
9080+ stbir__decode_simdf4_flip( of1 );
9081+ stbir__simdf_store( decode + 0, of0 );
9082+ stbir__simdf_store( decode + 4, of1 );
9083+ #endif
9084+ decode += 8;
9085+ input += 8;
/* Continue while a full group fits; stop on an exact fit; otherwise rewind so the last
   group ends exactly at the end of the row. */
9086+ if ( decode <= decode_end )
9087+ continue;
9088+ if ( decode == ( decode_end + 8 ) )
9089+ break;
9090+ decode = decode_end; // backup and do last couple
9091+ input = end_input_m8;
9092+ }
/* Undo the earlier "decode_end -= 8" so the true end of the row is returned. */
9093+ return decode_end + 8;
9094+ }
9095+ #endif
9096+
/* Scalar path, groups of four samples; decode runs four ahead so the loop condition can
   test that a whole group still fits. Compiled out for 3-channel coders. */
9097+ // try to do blocks of 4 when you can
9098+ #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
9099+ decode += 4;
9100+ STBIR_SIMD_NO_UNROLL_LOOP_START
9101+ while( decode <= decode_end )
9102+ {
9103+ STBIR_SIMD_NO_UNROLL(decode);
9104+ decode[0-4] = ((float)(input[stbir__decode_order0])) * stbir__max_uint16_as_float_inverted;
9105+ decode[1-4] = ((float)(input[stbir__decode_order1])) * stbir__max_uint16_as_float_inverted;
9106+ decode[2-4] = ((float)(input[stbir__decode_order2])) * stbir__max_uint16_as_float_inverted;
9107+ decode[3-4] = ((float)(input[stbir__decode_order3])) * stbir__max_uint16_as_float_inverted;
9108+ decode += 4;
9109+ input += 4;
9110+ }
9111+ decode -= 4;
9112+ #endif
9113+
/* Leftovers: advance by stbir__coder_min_num samples per iteration (only needed when
   that count is below four). */
9114+ // do the remnants
9115+ #if stbir__coder_min_num < 4
9116+ STBIR_NO_UNROLL_LOOP_START
9117+ while( decode < decode_end )
9118+ {
9119+ STBIR_NO_UNROLL(decode);
9120+ decode[0] = ((float)(input[stbir__decode_order0])) * stbir__max_uint16_as_float_inverted;
9121+ #if stbir__coder_min_num >= 2
9122+ decode[1] = ((float)(input[stbir__decode_order1])) * stbir__max_uint16_as_float_inverted;
9123+ #endif
9124+ #if stbir__coder_min_num >= 3
9125+ decode[2] = ((float)(input[stbir__decode_order2])) * stbir__max_uint16_as_float_inverted;
9126+ #endif
9127+ decode += stbir__coder_min_num;
9128+ input += stbir__coder_min_num;
9129+ }
9130+ #endif
9131+ return decode_end;
9132+}
9133+
9134+
/* Encode a row of floats to 16-bit unsigned samples: each value is multiplied by
   stbir__max_uint16_as_float, rounded by adding 0.5 and converted to unsigned short
   (the scalar path clamps to 0..65535 explicitly), applying the configured channel order.
     outputp              - destination unsigned shorts
     width_times_channels - number of samples to write
     encode               - source floats */
9135+static void STBIR__CODER_NAME(stbir__encode_uint16_linear_scaled)( void * outputp, int width_times_channels, float const * encode )
9136+{
9137+ unsigned short STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned short*) outputp;
9138+ unsigned short * end_output = ( (unsigned short*) output ) + width_times_channels;
9139+
/* SIMD build: wide loop first (two vectors of stbir__simdfX_float_count floats per
   iteration), then 4-wide and per-pixel fallbacks for rows too short for the wide loop. */
9140+ #ifdef STBIR_SIMD
9141+ {
9142+ if ( width_times_channels >= stbir__simdfX_float_count*2 )
9143+ {
9144+ float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2;
/* end_output now marks the last position at which a full wide store still fits. */
9145+ end_output -= stbir__simdfX_float_count*2;
9146+ STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
9147+ for(;;)
9148+ {
9149+ stbir__simdfX e0, e1;
9150+ stbir__simdiX i;
9151+ STBIR_SIMD_NO_UNROLL(encode);
9152+ stbir__simdfX_madd_mem( e0, STBIR_simd_point5X, STBIR_max_uint16_as_floatX, encode );
9153+ stbir__simdfX_madd_mem( e1, STBIR_simd_point5X, STBIR_max_uint16_as_floatX, encode+stbir__simdfX_float_count );
9154+ stbir__encode_simdfX_unflip( e0 );
9155+ stbir__encode_simdfX_unflip( e1 );
9156+ stbir__simdfX_pack_to_words( i, e0, e1 );
9157+ stbir__simdiX_store( output, i );
9158+ encode += stbir__simdfX_float_count*2;
9159+ output += stbir__simdfX_float_count*2;
/* Continue while a full store fits; stop on an exact fit; otherwise rewind so the last
   store ends exactly at the end of the row. */
9160+ if ( output <= end_output )
9161+ continue;
9162+ if ( output == ( end_output + stbir__simdfX_float_count*2 ) )
9163+ break;
9164+ output = end_output; // backup and do last couple
9165+ encode = end_encode_m8;
9166+ }
9167+ return;
9168+ }
9169+ }
9170+
/* Four samples per iteration using 4-wide vectors; output runs four ahead so the loop
   condition can test that a whole group still fits. */
9171+ // try to do blocks of 4 when you can
9172+ #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
9173+ output += 4;
9174+ STBIR_NO_UNROLL_LOOP_START
9175+ while( output <= end_output )
9176+ {
9177+ stbir__simdf e;
9178+ stbir__simdi i;
9179+ STBIR_NO_UNROLL(encode);
9180+ stbir__simdf_load( e, encode );
9181+ stbir__simdf_madd( e, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint16_as_float), e );
9182+ stbir__encode_simdf4_unflip( e );
9183+ stbir__simdf_pack_to_8words( i, e, e ); // only use first 4
9184+ stbir__simdi_store2( output-4, i );
9185+ output += 4;
9186+ encode += 4;
9187+ }
9188+ output -= 4;
9189+ #endif
9190+
/* Leftovers: one pixel (stbir__coder_min_num samples) per iteration. */
9191+ // do the remnants
9192+ #if stbir__coder_min_num < 4
9193+ STBIR_NO_UNROLL_LOOP_START
9194+ while( output < end_output )
9195+ {
9196+ stbir__simdf e;
9197+ STBIR_NO_UNROLL(encode);
9198+ stbir__simdf_madd1_mem( e, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint16_as_float), encode+stbir__encode_order0 ); output[0] = stbir__simdf_convert_float_to_short( e );
9199+ #if stbir__coder_min_num >= 2
9200+ stbir__simdf_madd1_mem( e, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint16_as_float), encode+stbir__encode_order1 ); output[1] = stbir__simdf_convert_float_to_short( e );
9201+ #endif
9202+ #if stbir__coder_min_num >= 3
9203+ stbir__simdf_madd1_mem( e, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint16_as_float), encode+stbir__encode_order2 ); output[2] = stbir__simdf_convert_float_to_short( e );
9204+ #endif
9205+ output += stbir__coder_min_num;
9206+ encode += stbir__coder_min_num;
9207+ }
9208+ #endif
9209+
/* Non-SIMD build: same structure in plain C, with an explicit clamp to 0..65535. */
9210+ #else
9211+
9212+ // try to do blocks of 4 when you can
9213+ #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
9214+ output += 4;
9215+ STBIR_SIMD_NO_UNROLL_LOOP_START
9216+ while( output <= end_output )
9217+ {
9218+ float f;
9219+ STBIR_SIMD_NO_UNROLL(encode);
9220+ f = encode[stbir__encode_order0] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[0-4] = (unsigned short)f;
9221+ f = encode[stbir__encode_order1] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[1-4] = (unsigned short)f;
9222+ f = encode[stbir__encode_order2] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[2-4] = (unsigned short)f;
9223+ f = encode[stbir__encode_order3] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[3-4] = (unsigned short)f;
9224+ output += 4;
9225+ encode += 4;
9226+ }
9227+ output -= 4;
9228+ #endif
9229+
9230+ // do the remnants
9231+ #if stbir__coder_min_num < 4
9232+ STBIR_NO_UNROLL_LOOP_START
9233+ while( output < end_output )
9234+ {
9235+ float f;
9236+ STBIR_NO_UNROLL(encode);
9237+ f = encode[stbir__encode_order0] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[0] = (unsigned short)f;
9238+ #if stbir__coder_min_num >= 2
9239+ f = encode[stbir__encode_order1] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[1] = (unsigned short)f;
9240+ #endif
9241+ #if stbir__coder_min_num >= 3
9242+ f = encode[stbir__encode_order2] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[2] = (unsigned short)f;
9243+ #endif
9244+ output += stbir__coder_min_num;
9245+ encode += stbir__coder_min_num;
9246+ }
9247+ #endif
9248+ #endif
9249+}
9250+
/* Decode a row of 16-bit unsigned samples to floats without rescaling: each sample is
   converted to float as-is (so values stay in the integer range of the input), applying
   the configured channel order.
     decodep              - destination floats
     width_times_channels - number of samples to convert
     inputp               - source unsigned shorts
   Returns decodep + width_times_channels. */
9251+static float * STBIR__CODER_NAME(stbir__decode_uint16_linear)( float * decodep, int width_times_channels, void const * inputp )
9252+{
9253+ float STBIR_STREAMOUT_PTR( * ) decode = decodep;
9254+ float * decode_end = (float*) decode + width_times_channels;
9255+ unsigned short const * input = (unsigned short const *)inputp;
9256+
/* SIMD path: eight samples per iteration when at least eight are requested.
   NOTE(review): end_input_m8 is computed before the size check but only used inside it. */
9257+ #ifdef STBIR_SIMD
9258+ unsigned short const * end_input_m8 = input + width_times_channels - 8;
9259+ if ( width_times_channels >= 8 )
9260+ {
/* decode_end now marks the last position at which a full 8-float store still fits. */
9261+ decode_end -= 8;
9262+ STBIR_NO_UNROLL_LOOP_START_INF_FOR
9263+ for(;;)
9264+ {
9265+ #ifdef STBIR_SIMD8
9266+ stbir__simdi i; stbir__simdi8 o;
9267+ stbir__simdf8 of;
9268+ STBIR_NO_UNROLL(decode);
9269+ stbir__simdi_load( i, input );
9270+ stbir__simdi8_expand_u16_to_u32( o, i );
9271+ stbir__simdi8_convert_i32_to_float( of, o );
9272+ stbir__decode_simdf8_flip( of );
9273+ stbir__simdf8_store( decode + 0, of );
9274+ #else
9275+ stbir__simdi i, o0, o1;
9276+ stbir__simdf of0, of1;
9277+ STBIR_NO_UNROLL(decode);
9278+ stbir__simdi_load( i, input );
9279+ stbir__simdi_expand_u16_to_u32( o0, o1, i );
9280+ stbir__simdi_convert_i32_to_float( of0, o0 );
9281+ stbir__simdi_convert_i32_to_float( of1, o1 );
9282+ stbir__decode_simdf4_flip( of0 );
9283+ stbir__decode_simdf4_flip( of1 );
9284+ stbir__simdf_store( decode + 0, of0 );
9285+ stbir__simdf_store( decode + 4, of1 );
9286+ #endif
9287+ decode += 8;
9288+ input += 8;
/* Continue while a full group fits; stop on an exact fit; otherwise rewind so the last
   group ends exactly at the end of the row. */
9289+ if ( decode <= decode_end )
9290+ continue;
9291+ if ( decode == ( decode_end + 8 ) )
9292+ break;
9293+ decode = decode_end; // backup and do last couple
9294+ input = end_input_m8;
9295+ }
/* Undo the earlier "decode_end -= 8" so the true end of the row is returned. */
9296+ return decode_end + 8;
9297+ }
9298+ #endif
9299+
/* Scalar path, groups of four samples; decode runs four ahead so the loop condition can
   test that a whole group still fits. Compiled out for 3-channel coders. */
9300+ // try to do blocks of 4 when you can
9301+ #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
9302+ decode += 4;
9303+ STBIR_SIMD_NO_UNROLL_LOOP_START
9304+ while( decode <= decode_end )
9305+ {
9306+ STBIR_SIMD_NO_UNROLL(decode);
9307+ decode[0-4] = ((float)(input[stbir__decode_order0]));
9308+ decode[1-4] = ((float)(input[stbir__decode_order1]));
9309+ decode[2-4] = ((float)(input[stbir__decode_order2]));
9310+ decode[3-4] = ((float)(input[stbir__decode_order3]));
9311+ decode += 4;
9312+ input += 4;
9313+ }
9314+ decode -= 4;
9315+ #endif
9316+
/* Leftovers: advance by stbir__coder_min_num samples per iteration. */
9317+ // do the remnants
9318+ #if stbir__coder_min_num < 4
9319+ STBIR_NO_UNROLL_LOOP_START
9320+ while( decode < decode_end )
9321+ {
9322+ STBIR_NO_UNROLL(decode);
9323+ decode[0] = ((float)(input[stbir__decode_order0]));
9324+ #if stbir__coder_min_num >= 2
9325+ decode[1] = ((float)(input[stbir__decode_order1]));
9326+ #endif
9327+ #if stbir__coder_min_num >= 3
9328+ decode[2] = ((float)(input[stbir__decode_order2]));
9329+ #endif
9330+ decode += stbir__coder_min_num;
9331+ input += stbir__coder_min_num;
9332+ }
9333+ #endif
9334+ return decode_end;
9335+}
9336+
/* Encode a row of floats to 16-bit unsigned samples without rescaling: each value is
   rounded by adding 0.5 and converted to unsigned short (the scalar code clamps to
   0..65535 explicitly), applying the configured channel order.
     outputp              - destination unsigned shorts
     width_times_channels - number of samples to write
     encode               - source floats */
9337+static void STBIR__CODER_NAME(stbir__encode_uint16_linear)( void * outputp, int width_times_channels, float const * encode )
9338+{
9339+ unsigned short STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned short*) outputp;
9340+ unsigned short * end_output = ( (unsigned short*) output ) + width_times_channels;
9341+
/* SIMD build: wide loop first (two vectors of stbir__simdfX_float_count floats per
   iteration), then a 4-wide fallback for rows too short for the wide loop. */
9342+ #ifdef STBIR_SIMD
9343+ {
9344+ if ( width_times_channels >= stbir__simdfX_float_count*2 )
9345+ {
9346+ float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2;
/* end_output now marks the last position at which a full wide store still fits. */
9347+ end_output -= stbir__simdfX_float_count*2;
9348+ STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
9349+ for(;;)
9350+ {
9351+ stbir__simdfX e0, e1;
9352+ stbir__simdiX i;
9353+ STBIR_SIMD_NO_UNROLL(encode);
9354+ stbir__simdfX_add_mem( e0, STBIR_simd_point5X, encode );
9355+ stbir__simdfX_add_mem( e1, STBIR_simd_point5X, encode+stbir__simdfX_float_count );
9356+ stbir__encode_simdfX_unflip( e0 );
9357+ stbir__encode_simdfX_unflip( e1 );
9358+ stbir__simdfX_pack_to_words( i, e0, e1 );
9359+ stbir__simdiX_store( output, i );
9360+ encode += stbir__simdfX_float_count*2;
9361+ output += stbir__simdfX_float_count*2;
/* Continue while a full store fits; stop on an exact fit; otherwise rewind so the last
   store ends exactly at the end of the row. */
9362+ if ( output <= end_output )
9363+ continue;
9364+ if ( output == ( end_output + stbir__simdfX_float_count*2 ) )
9365+ break;
9366+ output = end_output; // backup and do last couple
9367+ encode = end_encode_m8;
9368+ }
9369+ return;
9370+ }
9371+ }
9372+
/* Four samples per iteration using 4-wide vectors; output runs four ahead so the loop
   condition can test that a whole group still fits. */
9373+ // try to do blocks of 4 when you can
9374+ #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
9375+ output += 4;
9376+ STBIR_NO_UNROLL_LOOP_START
9377+ while( output <= end_output )
9378+ {
9379+ stbir__simdf e;
9380+ stbir__simdi i;
9381+ STBIR_NO_UNROLL(encode);
9382+ stbir__simdf_load( e, encode );
9383+ stbir__simdf_add( e, STBIR__CONSTF(STBIR_simd_point5), e );
9384+ stbir__encode_simdf4_unflip( e );
9385+ stbir__simdf_pack_to_8words( i, e, e ); // only use first 4
9386+ stbir__simdi_store2( output-4, i );
9387+ output += 4;
9388+ encode += 4;
9389+ }
9390+ output -= 4;
9391+ #endif
9392+
/* Non-SIMD build: plain C groups of four with an explicit clamp to 0..65535. */
9393+ #else
9394+
9395+ // try to do blocks of 4 when you can
9396+ #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
9397+ output += 4;
9398+ STBIR_SIMD_NO_UNROLL_LOOP_START
9399+ while( output <= end_output )
9400+ {
9401+ float f;
9402+ STBIR_SIMD_NO_UNROLL(encode);
9403+ f = encode[stbir__encode_order0] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[0-4] = (unsigned short)f;
9404+ f = encode[stbir__encode_order1] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[1-4] = (unsigned short)f;
9405+ f = encode[stbir__encode_order2] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[2-4] = (unsigned short)f;
9406+ f = encode[stbir__encode_order3] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[3-4] = (unsigned short)f;
9407+ output += 4;
9408+ encode += 4;
9409+ }
9410+ output -= 4;
9411+ #endif
9412+
9413+ #endif
9414+
/* Leftovers, shared by the SIMD and non-SIMD builds: one pixel (stbir__coder_min_num
   samples) per iteration in plain C. */
9415+ // do the remnants
9416+ #if stbir__coder_min_num < 4
9417+ STBIR_NO_UNROLL_LOOP_START
9418+ while( output < end_output )
9419+ {
9420+ float f;
9421+ STBIR_NO_UNROLL(encode);
9422+ f = encode[stbir__encode_order0] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[0] = (unsigned short)f;
9423+ #if stbir__coder_min_num >= 2
9424+ f = encode[stbir__encode_order1] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[1] = (unsigned short)f;
9425+ #endif
9426+ #if stbir__coder_min_num >= 3
9427+ f = encode[stbir__encode_order2] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[2] = (unsigned short)f;
9428+ #endif
9429+ output += stbir__coder_min_num;
9430+ encode += stbir__coder_min_num;
9431+ }
9432+ #endif
9433+}
9434+
/* Decode a row of half-precision (stbir__FP16) samples to floats, applying the
   configured channel order.
     decodep              - destination floats
     width_times_channels - number of samples to convert
     inputp               - source stbir__FP16 values
   Returns decodep + width_times_channels. */
9435+static float * STBIR__CODER_NAME(stbir__decode_half_float_linear)( float * decodep, int width_times_channels, void const * inputp )
9436+{
9437+ float STBIR_STREAMOUT_PTR( * ) decode = decodep;
9438+ float * decode_end = (float*) decode + width_times_channels;
9439+ stbir__FP16 const * input = (stbir__FP16 const *)inputp;
9440+
/* SIMD path: eight samples per iteration when at least eight are requested. */
9441+ #ifdef STBIR_SIMD
9442+ if ( width_times_channels >= 8 )
9443+ {
9444+ stbir__FP16 const * end_input_m8 = input + width_times_channels - 8;
/* decode_end now marks the last position at which a full 8-float store still fits. */
9445+ decode_end -= 8;
9446+ STBIR_NO_UNROLL_LOOP_START_INF_FOR
9447+ for(;;)
9448+ {
9449+ STBIR_NO_UNROLL(decode);
9450+
/* Convert straight into the destination, then (only for swizzled coders) reload the
   floats, reorder the channels and store them back. */
9451+ stbir__half_to_float_SIMD( decode, input );
9452+ #ifdef stbir__decode_swizzle
9453+ #ifdef STBIR_SIMD8
9454+ {
9455+ stbir__simdf8 of;
9456+ stbir__simdf8_load( of, decode );
9457+ stbir__decode_simdf8_flip( of );
9458+ stbir__simdf8_store( decode, of );
9459+ }
9460+ #else
9461+ {
9462+ stbir__simdf of0,of1;
9463+ stbir__simdf_load( of0, decode );
9464+ stbir__simdf_load( of1, decode+4 );
9465+ stbir__decode_simdf4_flip( of0 );
9466+ stbir__decode_simdf4_flip( of1 );
9467+ stbir__simdf_store( decode, of0 );
9468+ stbir__simdf_store( decode+4, of1 );
9469+ }
9470+ #endif
9471+ #endif
9472+ decode += 8;
9473+ input += 8;
/* Continue while a full group fits; stop on an exact fit; otherwise rewind so the last
   group ends exactly at the end of the row. */
9474+ if ( decode <= decode_end )
9475+ continue;
9476+ if ( decode == ( decode_end + 8 ) )
9477+ break;
9478+ decode = decode_end; // backup and do last couple
9479+ input = end_input_m8;
9480+ }
/* Undo the earlier "decode_end -= 8" so the true end of the row is returned. */
9481+ return decode_end + 8;
9482+ }
9483+ #endif
9484+
/* Scalar path, groups of four samples; decode runs four ahead so the loop condition can
   test that a whole group still fits. Compiled out for 3-channel coders. */
9485+ // try to do blocks of 4 when you can
9486+ #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
9487+ decode += 4;
9488+ STBIR_SIMD_NO_UNROLL_LOOP_START
9489+ while( decode <= decode_end )
9490+ {
9491+ STBIR_SIMD_NO_UNROLL(decode);
9492+ decode[0-4] = stbir__half_to_float(input[stbir__decode_order0]);
9493+ decode[1-4] = stbir__half_to_float(input[stbir__decode_order1]);
9494+ decode[2-4] = stbir__half_to_float(input[stbir__decode_order2]);
9495+ decode[3-4] = stbir__half_to_float(input[stbir__decode_order3]);
9496+ decode += 4;
9497+ input += 4;
9498+ }
9499+ decode -= 4;
9500+ #endif
9501+
/* Leftovers: advance by stbir__coder_min_num samples per iteration. */
9502+ // do the remnants
9503+ #if stbir__coder_min_num < 4
9504+ STBIR_NO_UNROLL_LOOP_START
9505+ while( decode < decode_end )
9506+ {
9507+ STBIR_NO_UNROLL(decode);
9508+ decode[0] = stbir__half_to_float(input[stbir__decode_order0]);
9509+ #if stbir__coder_min_num >= 2
9510+ decode[1] = stbir__half_to_float(input[stbir__decode_order1]);
9511+ #endif
9512+ #if stbir__coder_min_num >= 3
9513+ decode[2] = stbir__half_to_float(input[stbir__decode_order2]);
9514+ #endif
9515+ decode += stbir__coder_min_num;
9516+ input += stbir__coder_min_num;
9517+ }
9518+ #endif
9519+ return decode_end;
9520+}
9521+
/* Encode a row of floats to half-precision (stbir__FP16) samples, applying the
   configured channel order.
     outputp              - destination stbir__FP16 values
     width_times_channels - number of samples to write
     encode               - source floats */
9522+static void STBIR__CODER_NAME( stbir__encode_half_float_linear )( void * outputp, int width_times_channels, float const * encode )
9523+{
9524+ stbir__FP16 STBIR_SIMD_STREAMOUT_PTR( * ) output = (stbir__FP16*) outputp;
9525+ stbir__FP16 * end_output = ( (stbir__FP16*) output ) + width_times_channels;
9526+
/* SIMD path: eight samples per iteration when at least eight are requested. */
9527+ #ifdef STBIR_SIMD
9528+ if ( width_times_channels >= 8 )
9529+ {
9530+ float const * end_encode_m8 = encode + width_times_channels - 8;
/* end_output now marks the last position at which a full 8-sample store still fits. */
9531+ end_output -= 8;
9532+ STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
9533+ for(;;)
9534+ {
9535+ STBIR_SIMD_NO_UNROLL(encode);
/* Swizzled coders reorder the channels in a local vector first and convert from that
   copy; unswizzled coders convert directly from the source floats. */
9536+ #ifdef stbir__decode_swizzle
9537+ #ifdef STBIR_SIMD8
9538+ {
9539+ stbir__simdf8 of;
9540+ stbir__simdf8_load( of, encode );
9541+ stbir__encode_simdf8_unflip( of );
9542+ stbir__float_to_half_SIMD( output, (float*)&of );
9543+ }
9544+ #else
9545+ {
9546+ stbir__simdf of[2];
9547+ stbir__simdf_load( of[0], encode );
9548+ stbir__simdf_load( of[1], encode+4 );
9549+ stbir__encode_simdf4_unflip( of[0] );
9550+ stbir__encode_simdf4_unflip( of[1] );
9551+ stbir__float_to_half_SIMD( output, (float*)of );
9552+ }
9553+ #endif
9554+ #else
9555+ stbir__float_to_half_SIMD( output, encode );
9556+ #endif
9557+ encode += 8;
9558+ output += 8;
/* Continue while a full store fits; stop on an exact fit; otherwise rewind so the last
   store ends exactly at the end of the row. */
9559+ if ( output <= end_output )
9560+ continue;
9561+ if ( output == ( end_output + 8 ) )
9562+ break;
9563+ output = end_output; // backup and do last couple
9564+ encode = end_encode_m8;
9565+ }
9566+ return;
9567+ }
9568+ #endif
9569+
/* Scalar path, groups of four samples; output runs four ahead so the loop condition can
   test that a whole group still fits. Compiled out for 3-channel coders. */
9570+ // try to do blocks of 4 when you can
9571+ #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
9572+ output += 4;
9573+ STBIR_SIMD_NO_UNROLL_LOOP_START
9574+ while( output <= end_output )
9575+ {
9576+ STBIR_SIMD_NO_UNROLL(output);
9577+ output[0-4] = stbir__float_to_half(encode[stbir__encode_order0]);
9578+ output[1-4] = stbir__float_to_half(encode[stbir__encode_order1]);
9579+ output[2-4] = stbir__float_to_half(encode[stbir__encode_order2]);
9580+ output[3-4] = stbir__float_to_half(encode[stbir__encode_order3]);
9581+ output += 4;
9582+ encode += 4;
9583+ }
9584+ output -= 4;
9585+ #endif
9586+
/* Leftovers: advance by stbir__coder_min_num samples per iteration. */
9587+ // do the remnants
9588+ #if stbir__coder_min_num < 4
9589+ STBIR_NO_UNROLL_LOOP_START
9590+ while( output < end_output )
9591+ {
9592+ STBIR_NO_UNROLL(output);
9593+ output[0] = stbir__float_to_half(encode[stbir__encode_order0]);
9594+ #if stbir__coder_min_num >= 2
9595+ output[1] = stbir__float_to_half(encode[stbir__encode_order1]);
9596+ #endif
9597+ #if stbir__coder_min_num >= 3
9598+ output[2] = stbir__float_to_half(encode[stbir__encode_order2]);
9599+ #endif
9600+ output += stbir__coder_min_num;
9601+ encode += stbir__coder_min_num;
9602+ }
9603+ #endif
9604+}
9605+
/* "Decode" a row of float samples to floats.
   With stbir__decode_swizzle defined the samples are copied with the configured channel
   reordering; without it the row is copied with STBIR_MEMCPY, or left untouched when
   source and destination are the same pointer.
     decodep              - destination floats
     width_times_channels - number of samples
     inputp               - source floats
   Returns decodep + width_times_channels. */
9606+static float * STBIR__CODER_NAME(stbir__decode_float_linear)( float * decodep, int width_times_channels, void const * inputp )
9607+{
9608+ #ifdef stbir__decode_swizzle
9609+ float STBIR_STREAMOUT_PTR( * ) decode = decodep;
9610+ float * decode_end = (float*) decode + width_times_channels;
9611+ float const * input = (float const *)inputp;
9612+
/* SIMD path: sixteen samples per iteration when at least sixteen are requested. */
9613+ #ifdef STBIR_SIMD
9614+ if ( width_times_channels >= 16 )
9615+ {
9616+ float const * end_input_m16 = input + width_times_channels - 16;
/* decode_end now marks the last position at which a full 16-float store still fits. */
9617+ decode_end -= 16;
9618+ STBIR_NO_UNROLL_LOOP_START_INF_FOR
9619+ for(;;)
9620+ {
9621+ STBIR_NO_UNROLL(decode);
/* This inner check is always true here: the whole branch is already inside
   "#ifdef stbir__decode_swizzle". */
9622+ #ifdef stbir__decode_swizzle
9623+ #ifdef STBIR_SIMD8
9624+ {
9625+ stbir__simdf8 of0,of1;
9626+ stbir__simdf8_load( of0, input );
9627+ stbir__simdf8_load( of1, input+8 );
9628+ stbir__decode_simdf8_flip( of0 );
9629+ stbir__decode_simdf8_flip( of1 );
9630+ stbir__simdf8_store( decode, of0 );
9631+ stbir__simdf8_store( decode+8, of1 );
9632+ }
9633+ #else
9634+ {
9635+ stbir__simdf of0,of1,of2,of3;
9636+ stbir__simdf_load( of0, input );
9637+ stbir__simdf_load( of1, input+4 );
9638+ stbir__simdf_load( of2, input+8 );
9639+ stbir__simdf_load( of3, input+12 );
9640+ stbir__decode_simdf4_flip( of0 );
9641+ stbir__decode_simdf4_flip( of1 );
9642+ stbir__decode_simdf4_flip( of2 );
9643+ stbir__decode_simdf4_flip( of3 );
9644+ stbir__simdf_store( decode, of0 );
9645+ stbir__simdf_store( decode+4, of1 );
9646+ stbir__simdf_store( decode+8, of2 );
9647+ stbir__simdf_store( decode+12, of3 );
9648+ }
9649+ #endif
9650+ #endif
9651+ decode += 16;
9652+ input += 16;
/* Continue while a full group fits; stop on an exact fit; otherwise rewind so the last
   group ends exactly at the end of the row. */
9653+ if ( decode <= decode_end )
9654+ continue;
9655+ if ( decode == ( decode_end + 16 ) )
9656+ break;
9657+ decode = decode_end; // backup and do last couple
9658+ input = end_input_m16;
9659+ }
/* Undo the earlier "decode_end -= 16" so the true end of the row is returned. */
9660+ return decode_end + 16;
9661+ }
9662+ #endif
9663+
/* Scalar path, groups of four samples; decode runs four ahead so the loop condition can
   test that a whole group still fits. Compiled out for 3-channel coders. */
9664+ // try to do blocks of 4 when you can
9665+ #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
9666+ decode += 4;
9667+ STBIR_SIMD_NO_UNROLL_LOOP_START
9668+ while( decode <= decode_end )
9669+ {
9670+ STBIR_SIMD_NO_UNROLL(decode);
9671+ decode[0-4] = input[stbir__decode_order0];
9672+ decode[1-4] = input[stbir__decode_order1];
9673+ decode[2-4] = input[stbir__decode_order2];
9674+ decode[3-4] = input[stbir__decode_order3];
9675+ decode += 4;
9676+ input += 4;
9677+ }
9678+ decode -= 4;
9679+ #endif
9680+
/* Leftovers: advance by stbir__coder_min_num samples per iteration. */
9681+ // do the remnants
9682+ #if stbir__coder_min_num < 4
9683+ STBIR_NO_UNROLL_LOOP_START
9684+ while( decode < decode_end )
9685+ {
9686+ STBIR_NO_UNROLL(decode);
9687+ decode[0] = input[stbir__decode_order0];
9688+ #if stbir__coder_min_num >= 2
9689+ decode[1] = input[stbir__decode_order1];
9690+ #endif
9691+ #if stbir__coder_min_num >= 3
9692+ decode[2] = input[stbir__decode_order2];
9693+ #endif
9694+ decode += stbir__coder_min_num;
9695+ input += stbir__coder_min_num;
9696+ }
9697+ #endif
9698+ return decode_end;
9699+
/* No swizzle: a plain copy, skipped entirely when decoding in place. */
9700+ #else
9701+
9702+ if ( (void*)decodep != inputp )
9703+ STBIR_MEMCPY( decodep, inputp, width_times_channels * sizeof( float ) );
9704+
9705+ return decodep + width_times_channels;
9706+
9707+ #endif
9708+}
9709+
/* "Encode" a row of floats to float output.
   When no clamp (STBIR_FLOAT_HIGH_CLAMP / STBIR_FLOAT_LOW_CLAMP) and no channel swizzle
   is configured, the row is copied with STBIR_MEMCPY (or left alone when encoding in
   place). Otherwise every sample is clamped as configured and written in the configured
   channel order.
     outputp              - destination floats
     width_times_channels - number of samples to write
     encode               - source floats */
9710+static void STBIR__CODER_NAME( stbir__encode_float_linear )( void * outputp, int width_times_channels, float const * encode )
9711+{
/* The guard must name the same low-clamp macro that the clamping code below tests
   (STBIR_FLOAT_LOW_CLAMP); otherwise a build that defines only the low clamp would take
   the memcpy fast path and never clamp. */
9712+ #if !defined( STBIR_FLOAT_HIGH_CLAMP ) && !defined(STBIR_FLOAT_LOW_CLAMP) && !defined(stbir__decode_swizzle)
9713+
9714+ if ( (void*)outputp != (void*) encode )
9715+ STBIR_MEMCPY( outputp, encode, width_times_channels * sizeof( float ) );
9716+
9717+ #else
9718+
9719+ float STBIR_SIMD_STREAMOUT_PTR( * ) output = (float*) outputp;
9720+ float * end_output = ( (float*) output ) + width_times_channels;
9721+
/* Scalar clamp helpers: each expands to an if-statement when its limit is configured and
   to nothing otherwise. They are #undef'd after the coder functions. */
9722+ #ifdef STBIR_FLOAT_HIGH_CLAMP
9723+ #define stbir_scalar_hi_clamp( v ) if ( v > STBIR_FLOAT_HIGH_CLAMP ) v = STBIR_FLOAT_HIGH_CLAMP;
9724+ #else
9725+ #define stbir_scalar_hi_clamp( v )
9726+ #endif
9727+ #ifdef STBIR_FLOAT_LOW_CLAMP
9728+ #define stbir_scalar_lo_clamp( v ) if ( v < STBIR_FLOAT_LOW_CLAMP ) v = STBIR_FLOAT_LOW_CLAMP;
9729+ #else
9730+ #define stbir_scalar_lo_clamp( v )
9731+ #endif
9732+
9733+ #ifdef STBIR_SIMD
9734+
9735+ #ifdef STBIR_FLOAT_HIGH_CLAMP
9736+ const stbir__simdfX high_clamp = stbir__simdf_frepX(STBIR_FLOAT_HIGH_CLAMP);
9737+ #endif
9738+ #ifdef STBIR_FLOAT_LOW_CLAMP
9739+ const stbir__simdfX low_clamp = stbir__simdf_frepX(STBIR_FLOAT_LOW_CLAMP);
9740+ #endif
9741+
/* Wide SIMD loop: two vectors of stbir__simdfX_float_count floats per iteration. */
9742+ if ( width_times_channels >= ( stbir__simdfX_float_count * 2 ) )
9743+ {
9744+ float const * end_encode_m8 = encode + width_times_channels - ( stbir__simdfX_float_count * 2 );
/* end_output now marks the last position at which a full wide store still fits. */
9745+ end_output -= ( stbir__simdfX_float_count * 2 );
9746+ STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
9747+ for(;;)
9748+ {
9749+ stbir__simdfX e0, e1;
9750+ STBIR_SIMD_NO_UNROLL(encode);
9751+ stbir__simdfX_load( e0, encode );
9752+ stbir__simdfX_load( e1, encode+stbir__simdfX_float_count );
9753+#ifdef STBIR_FLOAT_HIGH_CLAMP
9754+ stbir__simdfX_min( e0, e0, high_clamp );
9755+ stbir__simdfX_min( e1, e1, high_clamp );
9756+#endif
9757+#ifdef STBIR_FLOAT_LOW_CLAMP
9758+ stbir__simdfX_max( e0, e0, low_clamp );
9759+ stbir__simdfX_max( e1, e1, low_clamp );
9760+#endif
9761+ stbir__encode_simdfX_unflip( e0 );
9762+ stbir__encode_simdfX_unflip( e1 );
9763+ stbir__simdfX_store( output, e0 );
9764+ stbir__simdfX_store( output+stbir__simdfX_float_count, e1 );
9765+ encode += stbir__simdfX_float_count * 2;
9766+ output += stbir__simdfX_float_count * 2;
/* Continue while more than one full store remains; stop on an exact fit; otherwise move
   to the final position so the last store ends exactly at the end of the row. When
   output already equals end_output the reassignment below is a no-op, so the strict
   "<" behaves the same as the "<=" used by the other coders. */
9767+ if ( output < end_output )
9768+ continue;
9769+ if ( output == ( end_output + ( stbir__simdfX_float_count * 2 ) ) )
9770+ break;
9771+ output = end_output; // backup and do last couple
9772+ encode = end_encode_m8;
9773+ }
9774+ return;
9775+ }
9776+
/* Four samples per iteration with 4-wide vectors.
   NOTE(review): high_clamp/low_clamp are declared as stbir__simdfX but used here with
   the 4-wide stbir__simdf_min/max - confirm this is valid when the X width is 8. */
9777+ // try to do blocks of 4 when you can
9778+ #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
9779+ output += 4;
9780+ STBIR_NO_UNROLL_LOOP_START
9781+ while( output <= end_output )
9782+ {
9783+ stbir__simdf e0;
9784+ STBIR_NO_UNROLL(encode);
9785+ stbir__simdf_load( e0, encode );
9786+#ifdef STBIR_FLOAT_HIGH_CLAMP
9787+ stbir__simdf_min( e0, e0, high_clamp );
9788+#endif
9789+#ifdef STBIR_FLOAT_LOW_CLAMP
9790+ stbir__simdf_max( e0, e0, low_clamp );
9791+#endif
9792+ stbir__encode_simdf4_unflip( e0 );
9793+ stbir__simdf_store( output-4, e0 );
9794+ output += 4;
9795+ encode += 4;
9796+ }
9797+ output -= 4;
9798+ #endif
9799+
/* Non-SIMD build: plain C groups of four using the scalar clamp helpers. */
9800+ #else
9801+
9802+ // try to do blocks of 4 when you can
9803+ #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
9804+ output += 4;
9805+ STBIR_SIMD_NO_UNROLL_LOOP_START
9806+ while( output <= end_output )
9807+ {
9808+ float e;
9809+ STBIR_SIMD_NO_UNROLL(encode);
9810+ e = encode[ stbir__encode_order0 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[0-4] = e;
9811+ e = encode[ stbir__encode_order1 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[1-4] = e;
9812+ e = encode[ stbir__encode_order2 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[2-4] = e;
9813+ e = encode[ stbir__encode_order3 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[3-4] = e;
9814+ output += 4;
9815+ encode += 4;
9816+ }
9817+ output -= 4;
9818+
9819+ #endif
9820+
9821+ #endif
9822+
/* Leftovers, shared by the SIMD and non-SIMD builds: one pixel (stbir__coder_min_num
   samples) per iteration. */
9823+ // do the remnants
9824+ #if stbir__coder_min_num < 4
9825+ STBIR_NO_UNROLL_LOOP_START
9826+ while( output < end_output )
9827+ {
9828+ float e;
9829+ STBIR_NO_UNROLL(encode);
9830+ e = encode[ stbir__encode_order0 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[0] = e;
9831+ #if stbir__coder_min_num >= 2
9832+ e = encode[ stbir__encode_order1 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[1] = e;
9833+ #endif
9834+ #if stbir__coder_min_num >= 3
9835+ e = encode[ stbir__encode_order2 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[2] = e;
9836+ #endif
9837+ output += stbir__coder_min_num;
9838+ encode += stbir__coder_min_num;
9839+ }
9840+ #endif
9841+
9842+ #endif
9843+}
9844+
/* End of the coder section: remove every per-configuration macro the decode/encode
   functions above rely on (channel orders, swizzle helpers, name mangling, clamp
   helpers) together with the STB_IMAGE_RESIZE_DO_CODERS selector itself.
   NOTE(review): presumably this lets the section be instantiated again with a different
   channel order or name suffix - confirm against the code that sets these macros. */
9845+#undef stbir__decode_suffix
9846+#undef stbir__decode_simdf8_flip
9847+#undef stbir__decode_simdf4_flip
9848+#undef stbir__decode_order0
9849+#undef stbir__decode_order1
9850+#undef stbir__decode_order2
9851+#undef stbir__decode_order3
9852+#undef stbir__encode_order0
9853+#undef stbir__encode_order1
9854+#undef stbir__encode_order2
9855+#undef stbir__encode_order3
9856+#undef stbir__encode_simdf8_unflip
9857+#undef stbir__encode_simdf4_unflip
9858+#undef stbir__encode_simdfX_unflip
9859+#undef STBIR__CODER_NAME
9860+#undef stbir__coder_min_num
9861+#undef stbir__decode_swizzle
9862+#undef stbir_scalar_hi_clamp
9863+#undef stbir_scalar_lo_clamp
9864+#undef STB_IMAGE_RESIZE_DO_CODERS
9865+
9866+#elif defined( STB_IMAGE_RESIZE_DO_VERTICALS)
// Vertical section. The functions below are templates: their names and bodies
// depend on STBIR__vertical_channels (expected to be 1..8) and on whether
// STB_IMAGE_RESIZE_VERTICAL_CONTINUE is defined.
9867+
// STBIR_chans( start, end ) builds the function name by pasting the channel
// count between `start` and `end`; the CONTINUE variant additionally appends
// the `_cont` suffix.
9868+#ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
9869+#define STBIR_chans( start, end ) STBIR_strs_join14(start,STBIR__vertical_channels,end,_cont)
9870+#else
9871+#define STBIR_chans( start, end ) STBIR_strs_join1(start,STBIR__vertical_channels,end)
9872+#endif
9873+
// stbIFk( code ) expands to `code` when STBIR__vertical_channels >= k+1 and to
// nothing otherwise, so each function body can list all eight lanes and have
// the unused ones compiled out.
9874+#if STBIR__vertical_channels >= 1
9875+#define stbIF0( code ) code
9876+#else
9877+#define stbIF0( code )
9878+#endif
9879+#if STBIR__vertical_channels >= 2
9880+#define stbIF1( code ) code
9881+#else
9882+#define stbIF1( code )
9883+#endif
9884+#if STBIR__vertical_channels >= 3
9885+#define stbIF2( code ) code
9886+#else
9887+#define stbIF2( code )
9888+#endif
9889+#if STBIR__vertical_channels >= 4
9890+#define stbIF3( code ) code
9891+#else
9892+#define stbIF3( code )
9893+#endif
9894+#if STBIR__vertical_channels >= 5
9895+#define stbIF4( code ) code
9896+#else
9897+#define stbIF4( code )
9898+#endif
9899+#if STBIR__vertical_channels >= 6
9900+#define stbIF5( code ) code
9901+#else
9902+#define stbIF5( code )
9903+#endif
9904+#if STBIR__vertical_channels >= 7
9905+#define stbIF6( code ) code
9906+#else
9907+#define stbIF6( code )
9908+#endif
9909+#if STBIR__vertical_channels >= 8
9910+#define stbIF7( code ) code
9911+#else
9912+#define stbIF7( code )
9913+#endif
9914+
// Vertical scatter kernel (template; the real name is produced by STBIR_chans).
//
// Reads one row of floats, [input, input_end), and for every enabled lane k
// (stbIF0..stbIF7, i.e. k < STBIR__vertical_channels) writes
//     outputs[k][i]  = input[i] * vertical_coefficients[k]    (plain variant)
//     outputs[k][i] += input[i] * vertical_coefficients[k]    (when STB_IMAGE_RESIZE_VERTICAL_CONTINUE is defined)
//
// Note that, in this function, STBIR__vertical_channels selects how many
// output rows / coefficients are handled per call, one per lane.
//
//   outputs                - array of output row pointers, one per enabled lane
//   vertical_coefficients  - one scalar weight per enabled lane
//   input, input_end       - half-open range of source floats
//
// Work is done in three stages: a wide block loop, a 4-float loop, and a
// one-float tail, so any row length is handled.
9915+static void STBIR_chans( stbir__vertical_scatter_with_,_coeffs)( float ** outputs, float const * vertical_coefficients, float const * input, float const * input_end )
9916+{
9917+ stbIF0( float STBIR_SIMD_STREAMOUT_PTR( * ) output0 = outputs[0]; float c0s = vertical_coefficients[0]; )
9918+ stbIF1( float STBIR_SIMD_STREAMOUT_PTR( * ) output1 = outputs[1]; float c1s = vertical_coefficients[1]; )
9919+ stbIF2( float STBIR_SIMD_STREAMOUT_PTR( * ) output2 = outputs[2]; float c2s = vertical_coefficients[2]; )
9920+ stbIF3( float STBIR_SIMD_STREAMOUT_PTR( * ) output3 = outputs[3]; float c3s = vertical_coefficients[3]; )
9921+ stbIF4( float STBIR_SIMD_STREAMOUT_PTR( * ) output4 = outputs[4]; float c4s = vertical_coefficients[4]; )
9922+ stbIF5( float STBIR_SIMD_STREAMOUT_PTR( * ) output5 = outputs[5]; float c5s = vertical_coefficients[5]; )
9923+ stbIF6( float STBIR_SIMD_STREAMOUT_PTR( * ) output6 = outputs[6]; float c6s = vertical_coefficients[6]; )
9924+ stbIF7( float STBIR_SIMD_STREAMOUT_PTR( * ) output7 = outputs[7]; float c7s = vertical_coefficients[7]; )
9925+
9926+ #ifdef STBIR_SIMD
9927+ {
 // SIMD path. Each scalar weight is turned into a vector value once, up front.
 // NOTE(review): stbir__simdf_frepX presumably replicates the float into every vector element - confirm.
9928+ stbIF0(stbir__simdfX c0 = stbir__simdf_frepX( c0s ); )
9929+ stbIF1(stbir__simdfX c1 = stbir__simdf_frepX( c1s ); )
9930+ stbIF2(stbir__simdfX c2 = stbir__simdf_frepX( c2s ); )
9931+ stbIF3(stbir__simdfX c3 = stbir__simdf_frepX( c3s ); )
9932+ stbIF4(stbir__simdfX c4 = stbir__simdf_frepX( c4s ); )
9933+ stbIF5(stbir__simdfX c5 = stbir__simdf_frepX( c5s ); )
9934+ stbIF6(stbir__simdfX c6 = stbir__simdf_frepX( c6s ); )
9935+ stbIF7(stbir__simdfX c7 = stbir__simdf_frepX( c7s ); )
 // Wide block loop: four vectors (4*stbir__simdfX_float_count floats) of input are
 // loaded once per iteration and reused for every enabled output row. The loop runs
 // while at least 16*stbir__simdfX_float_count bytes of input remain.
9936+ STBIR_SIMD_NO_UNROLL_LOOP_START
9937+ while ( ( (char*)input_end - (char*) input ) >= (16*stbir__simdfX_float_count) )
9938+ {
9939+ stbir__simdfX o0, o1, o2, o3, r0, r1, r2, r3;
9940+ STBIR_SIMD_NO_UNROLL(output0);
9941+
9942+ stbir__simdfX_load( r0, input ); stbir__simdfX_load( r1, input+stbir__simdfX_float_count ); stbir__simdfX_load( r2, input+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( r3, input+(3*stbir__simdfX_float_count) );
9943+
 // CONTINUE variant: load the existing output, multiply-add, store back.
 // Plain variant (the #else below): multiply and store, overwriting the output.
9944+ #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
9945+ stbIF0( stbir__simdfX_load( o0, output0 ); stbir__simdfX_load( o1, output0+stbir__simdfX_float_count ); stbir__simdfX_load( o2, output0+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( o3, output0+(3*stbir__simdfX_float_count) );
9946+ stbir__simdfX_madd( o0, o0, r0, c0 ); stbir__simdfX_madd( o1, o1, r1, c0 ); stbir__simdfX_madd( o2, o2, r2, c0 ); stbir__simdfX_madd( o3, o3, r3, c0 );
9947+ stbir__simdfX_store( output0, o0 ); stbir__simdfX_store( output0+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output0+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output0+(3*stbir__simdfX_float_count), o3 ); )
9948+ stbIF1( stbir__simdfX_load( o0, output1 ); stbir__simdfX_load( o1, output1+stbir__simdfX_float_count ); stbir__simdfX_load( o2, output1+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( o3, output1+(3*stbir__simdfX_float_count) );
9949+ stbir__simdfX_madd( o0, o0, r0, c1 ); stbir__simdfX_madd( o1, o1, r1, c1 ); stbir__simdfX_madd( o2, o2, r2, c1 ); stbir__simdfX_madd( o3, o3, r3, c1 );
9950+ stbir__simdfX_store( output1, o0 ); stbir__simdfX_store( output1+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output1+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output1+(3*stbir__simdfX_float_count), o3 ); )
9951+ stbIF2( stbir__simdfX_load( o0, output2 ); stbir__simdfX_load( o1, output2+stbir__simdfX_float_count ); stbir__simdfX_load( o2, output2+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( o3, output2+(3*stbir__simdfX_float_count) );
9952+ stbir__simdfX_madd( o0, o0, r0, c2 ); stbir__simdfX_madd( o1, o1, r1, c2 ); stbir__simdfX_madd( o2, o2, r2, c2 ); stbir__simdfX_madd( o3, o3, r3, c2 );
9953+ stbir__simdfX_store( output2, o0 ); stbir__simdfX_store( output2+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output2+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output2+(3*stbir__simdfX_float_count), o3 ); )
9954+ stbIF3( stbir__simdfX_load( o0, output3 ); stbir__simdfX_load( o1, output3+stbir__simdfX_float_count ); stbir__simdfX_load( o2, output3+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( o3, output3+(3*stbir__simdfX_float_count) );
9955+ stbir__simdfX_madd( o0, o0, r0, c3 ); stbir__simdfX_madd( o1, o1, r1, c3 ); stbir__simdfX_madd( o2, o2, r2, c3 ); stbir__simdfX_madd( o3, o3, r3, c3 );
9956+ stbir__simdfX_store( output3, o0 ); stbir__simdfX_store( output3+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output3+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output3+(3*stbir__simdfX_float_count), o3 ); )
9957+ stbIF4( stbir__simdfX_load( o0, output4 ); stbir__simdfX_load( o1, output4+stbir__simdfX_float_count ); stbir__simdfX_load( o2, output4+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( o3, output4+(3*stbir__simdfX_float_count) );
9958+ stbir__simdfX_madd( o0, o0, r0, c4 ); stbir__simdfX_madd( o1, o1, r1, c4 ); stbir__simdfX_madd( o2, o2, r2, c4 ); stbir__simdfX_madd( o3, o3, r3, c4 );
9959+ stbir__simdfX_store( output4, o0 ); stbir__simdfX_store( output4+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output4+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output4+(3*stbir__simdfX_float_count), o3 ); )
9960+ stbIF5( stbir__simdfX_load( o0, output5 ); stbir__simdfX_load( o1, output5+stbir__simdfX_float_count ); stbir__simdfX_load( o2, output5+(2*stbir__simdfX_float_count)); stbir__simdfX_load( o3, output5+(3*stbir__simdfX_float_count) );
9961+ stbir__simdfX_madd( o0, o0, r0, c5 ); stbir__simdfX_madd( o1, o1, r1, c5 ); stbir__simdfX_madd( o2, o2, r2, c5 ); stbir__simdfX_madd( o3, o3, r3, c5 );
9962+ stbir__simdfX_store( output5, o0 ); stbir__simdfX_store( output5+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output5+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output5+(3*stbir__simdfX_float_count), o3 ); )
9963+ stbIF6( stbir__simdfX_load( o0, output6 ); stbir__simdfX_load( o1, output6+stbir__simdfX_float_count ); stbir__simdfX_load( o2, output6+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( o3, output6+(3*stbir__simdfX_float_count) );
9964+ stbir__simdfX_madd( o0, o0, r0, c6 ); stbir__simdfX_madd( o1, o1, r1, c6 ); stbir__simdfX_madd( o2, o2, r2, c6 ); stbir__simdfX_madd( o3, o3, r3, c6 );
9965+ stbir__simdfX_store( output6, o0 ); stbir__simdfX_store( output6+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output6+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output6+(3*stbir__simdfX_float_count), o3 ); )
9966+ stbIF7( stbir__simdfX_load( o0, output7 ); stbir__simdfX_load( o1, output7+stbir__simdfX_float_count ); stbir__simdfX_load( o2, output7+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( o3, output7+(3*stbir__simdfX_float_count) );
9967+ stbir__simdfX_madd( o0, o0, r0, c7 ); stbir__simdfX_madd( o1, o1, r1, c7 ); stbir__simdfX_madd( o2, o2, r2, c7 ); stbir__simdfX_madd( o3, o3, r3, c7 );
9968+ stbir__simdfX_store( output7, o0 ); stbir__simdfX_store( output7+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output7+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output7+(3*stbir__simdfX_float_count), o3 ); )
9969+ #else
9970+ stbIF0( stbir__simdfX_mult( o0, r0, c0 ); stbir__simdfX_mult( o1, r1, c0 ); stbir__simdfX_mult( o2, r2, c0 ); stbir__simdfX_mult( o3, r3, c0 );
9971+ stbir__simdfX_store( output0, o0 ); stbir__simdfX_store( output0+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output0+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output0+(3*stbir__simdfX_float_count), o3 ); )
9972+ stbIF1( stbir__simdfX_mult( o0, r0, c1 ); stbir__simdfX_mult( o1, r1, c1 ); stbir__simdfX_mult( o2, r2, c1 ); stbir__simdfX_mult( o3, r3, c1 );
9973+ stbir__simdfX_store( output1, o0 ); stbir__simdfX_store( output1+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output1+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output1+(3*stbir__simdfX_float_count), o3 ); )
9974+ stbIF2( stbir__simdfX_mult( o0, r0, c2 ); stbir__simdfX_mult( o1, r1, c2 ); stbir__simdfX_mult( o2, r2, c2 ); stbir__simdfX_mult( o3, r3, c2 );
9975+ stbir__simdfX_store( output2, o0 ); stbir__simdfX_store( output2+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output2+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output2+(3*stbir__simdfX_float_count), o3 ); )
9976+ stbIF3( stbir__simdfX_mult( o0, r0, c3 ); stbir__simdfX_mult( o1, r1, c3 ); stbir__simdfX_mult( o2, r2, c3 ); stbir__simdfX_mult( o3, r3, c3 );
9977+ stbir__simdfX_store( output3, o0 ); stbir__simdfX_store( output3+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output3+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output3+(3*stbir__simdfX_float_count), o3 ); )
9978+ stbIF4( stbir__simdfX_mult( o0, r0, c4 ); stbir__simdfX_mult( o1, r1, c4 ); stbir__simdfX_mult( o2, r2, c4 ); stbir__simdfX_mult( o3, r3, c4 );
9979+ stbir__simdfX_store( output4, o0 ); stbir__simdfX_store( output4+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output4+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output4+(3*stbir__simdfX_float_count), o3 ); )
9980+ stbIF5( stbir__simdfX_mult( o0, r0, c5 ); stbir__simdfX_mult( o1, r1, c5 ); stbir__simdfX_mult( o2, r2, c5 ); stbir__simdfX_mult( o3, r3, c5 );
9981+ stbir__simdfX_store( output5, o0 ); stbir__simdfX_store( output5+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output5+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output5+(3*stbir__simdfX_float_count), o3 ); )
9982+ stbIF6( stbir__simdfX_mult( o0, r0, c6 ); stbir__simdfX_mult( o1, r1, c6 ); stbir__simdfX_mult( o2, r2, c6 ); stbir__simdfX_mult( o3, r3, c6 );
9983+ stbir__simdfX_store( output6, o0 ); stbir__simdfX_store( output6+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output6+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output6+(3*stbir__simdfX_float_count), o3 ); )
9984+ stbIF7( stbir__simdfX_mult( o0, r0, c7 ); stbir__simdfX_mult( o1, r1, c7 ); stbir__simdfX_mult( o2, r2, c7 ); stbir__simdfX_mult( o3, r3, c7 );
9985+ stbir__simdfX_store( output7, o0 ); stbir__simdfX_store( output7+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output7+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output7+(3*stbir__simdfX_float_count), o3 ); )
9986+ #endif
9987+
9988+ input += (4*stbir__simdfX_float_count);
9989+ stbIF0( output0 += (4*stbir__simdfX_float_count); ) stbIF1( output1 += (4*stbir__simdfX_float_count); ) stbIF2( output2 += (4*stbir__simdfX_float_count); ) stbIF3( output3 += (4*stbir__simdfX_float_count); ) stbIF4( output4 += (4*stbir__simdfX_float_count); ) stbIF5( output5 += (4*stbir__simdfX_float_count); ) stbIF6( output6 += (4*stbir__simdfX_float_count); ) stbIF7( output7 += (4*stbir__simdfX_float_count); )
9990+ }
 // Narrow loop: whatever the wide loop left over is consumed 4 floats (16 bytes) at a time.
 // NOTE(review): stbir__if_simdf8_cast_to_simdf4 presumably narrows the coefficient vector
 // to the 4-float type when stbir__simdfX is the 8-float type - confirm at its definition.
9991+ STBIR_SIMD_NO_UNROLL_LOOP_START
9992+ while ( ( (char*)input_end - (char*) input ) >= 16 )
9993+ {
9994+ stbir__simdf o0, r0;
9995+ STBIR_SIMD_NO_UNROLL(output0);
9996+
9997+ stbir__simdf_load( r0, input );
9998+
9999+ #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
10000+ stbIF0( stbir__simdf_load( o0, output0 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c0 ) ); stbir__simdf_store( output0, o0 ); )
10001+ stbIF1( stbir__simdf_load( o0, output1 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c1 ) ); stbir__simdf_store( output1, o0 ); )
10002+ stbIF2( stbir__simdf_load( o0, output2 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c2 ) ); stbir__simdf_store( output2, o0 ); )
10003+ stbIF3( stbir__simdf_load( o0, output3 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c3 ) ); stbir__simdf_store( output3, o0 ); )
10004+ stbIF4( stbir__simdf_load( o0, output4 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c4 ) ); stbir__simdf_store( output4, o0 ); )
10005+ stbIF5( stbir__simdf_load( o0, output5 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c5 ) ); stbir__simdf_store( output5, o0 ); )
10006+ stbIF6( stbir__simdf_load( o0, output6 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c6 ) ); stbir__simdf_store( output6, o0 ); )
10007+ stbIF7( stbir__simdf_load( o0, output7 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c7 ) ); stbir__simdf_store( output7, o0 ); )
10008+ #else
10009+ stbIF0( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c0 ) ); stbir__simdf_store( output0, o0 ); )
10010+ stbIF1( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c1 ) ); stbir__simdf_store( output1, o0 ); )
10011+ stbIF2( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c2 ) ); stbir__simdf_store( output2, o0 ); )
10012+ stbIF3( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c3 ) ); stbir__simdf_store( output3, o0 ); )
10013+ stbIF4( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c4 ) ); stbir__simdf_store( output4, o0 ); )
10014+ stbIF5( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c5 ) ); stbir__simdf_store( output5, o0 ); )
10015+ stbIF6( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c6 ) ); stbir__simdf_store( output6, o0 ); )
10016+ stbIF7( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c7 ) ); stbir__simdf_store( output7, o0 ); )
10017+ #endif
10018+
10019+ input += 4;
10020+ stbIF0( output0 += 4; ) stbIF1( output1 += 4; ) stbIF2( output2 += 4; ) stbIF3( output3 += 4; ) stbIF4( output4 += 4; ) stbIF5( output5 += 4; ) stbIF6( output6 += 4; ) stbIF7( output7 += 4; )
10021+ }
10022+ }
10023+ #else
 // Scalar fallback (no STBIR_SIMD): same arithmetic, 4 floats per iteration
 // while at least 16 bytes of input remain.
10024+ STBIR_NO_UNROLL_LOOP_START
10025+ while ( ( (char*)input_end - (char*) input ) >= 16 )
10026+ {
10027+ float r0, r1, r2, r3;
10028+ STBIR_NO_UNROLL(input);
10029+
10030+ r0 = input[0], r1 = input[1], r2 = input[2], r3 = input[3];
10031+
10032+ #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
10033+ stbIF0( output0[0] += ( r0 * c0s ); output0[1] += ( r1 * c0s ); output0[2] += ( r2 * c0s ); output0[3] += ( r3 * c0s ); )
10034+ stbIF1( output1[0] += ( r0 * c1s ); output1[1] += ( r1 * c1s ); output1[2] += ( r2 * c1s ); output1[3] += ( r3 * c1s ); )
10035+ stbIF2( output2[0] += ( r0 * c2s ); output2[1] += ( r1 * c2s ); output2[2] += ( r2 * c2s ); output2[3] += ( r3 * c2s ); )
10036+ stbIF3( output3[0] += ( r0 * c3s ); output3[1] += ( r1 * c3s ); output3[2] += ( r2 * c3s ); output3[3] += ( r3 * c3s ); )
10037+ stbIF4( output4[0] += ( r0 * c4s ); output4[1] += ( r1 * c4s ); output4[2] += ( r2 * c4s ); output4[3] += ( r3 * c4s ); )
10038+ stbIF5( output5[0] += ( r0 * c5s ); output5[1] += ( r1 * c5s ); output5[2] += ( r2 * c5s ); output5[3] += ( r3 * c5s ); )
10039+ stbIF6( output6[0] += ( r0 * c6s ); output6[1] += ( r1 * c6s ); output6[2] += ( r2 * c6s ); output6[3] += ( r3 * c6s ); )
10040+ stbIF7( output7[0] += ( r0 * c7s ); output7[1] += ( r1 * c7s ); output7[2] += ( r2 * c7s ); output7[3] += ( r3 * c7s ); )
10041+ #else
10042+ stbIF0( output0[0] = ( r0 * c0s ); output0[1] = ( r1 * c0s ); output0[2] = ( r2 * c0s ); output0[3] = ( r3 * c0s ); )
10043+ stbIF1( output1[0] = ( r0 * c1s ); output1[1] = ( r1 * c1s ); output1[2] = ( r2 * c1s ); output1[3] = ( r3 * c1s ); )
10044+ stbIF2( output2[0] = ( r0 * c2s ); output2[1] = ( r1 * c2s ); output2[2] = ( r2 * c2s ); output2[3] = ( r3 * c2s ); )
10045+ stbIF3( output3[0] = ( r0 * c3s ); output3[1] = ( r1 * c3s ); output3[2] = ( r2 * c3s ); output3[3] = ( r3 * c3s ); )
10046+ stbIF4( output4[0] = ( r0 * c4s ); output4[1] = ( r1 * c4s ); output4[2] = ( r2 * c4s ); output4[3] = ( r3 * c4s ); )
10047+ stbIF5( output5[0] = ( r0 * c5s ); output5[1] = ( r1 * c5s ); output5[2] = ( r2 * c5s ); output5[3] = ( r3 * c5s ); )
10048+ stbIF6( output6[0] = ( r0 * c6s ); output6[1] = ( r1 * c6s ); output6[2] = ( r2 * c6s ); output6[3] = ( r3 * c6s ); )
10049+ stbIF7( output7[0] = ( r0 * c7s ); output7[1] = ( r1 * c7s ); output7[2] = ( r2 * c7s ); output7[3] = ( r3 * c7s ); )
10050+ #endif
10051+
10052+ input += 4;
10053+ stbIF0( output0 += 4; ) stbIF1( output1 += 4; ) stbIF2( output2 += 4; ) stbIF3( output3 += 4; ) stbIF4( output4 += 4; ) stbIF5( output5 += 4; ) stbIF6( output6 += 4; ) stbIF7( output7 += 4; )
10054+ }
10055+ #endif
 // Tail shared by both paths: the floats left over from the 4-wide loops are done one at a time.
10056+ STBIR_NO_UNROLL_LOOP_START
10057+ while ( input < input_end )
10058+ {
10059+ float r = input[0];
10060+ STBIR_NO_UNROLL(output0);
10061+
10062+ #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
10063+ stbIF0( output0[0] += ( r * c0s ); )
10064+ stbIF1( output1[0] += ( r * c1s ); )
10065+ stbIF2( output2[0] += ( r * c2s ); )
10066+ stbIF3( output3[0] += ( r * c3s ); )
10067+ stbIF4( output4[0] += ( r * c4s ); )
10068+ stbIF5( output5[0] += ( r * c5s ); )
10069+ stbIF6( output6[0] += ( r * c6s ); )
10070+ stbIF7( output7[0] += ( r * c7s ); )
10071+ #else
10072+ stbIF0( output0[0] = ( r * c0s ); )
10073+ stbIF1( output1[0] = ( r * c1s ); )
10074+ stbIF2( output2[0] = ( r * c2s ); )
10075+ stbIF3( output3[0] = ( r * c3s ); )
10076+ stbIF4( output4[0] = ( r * c4s ); )
10077+ stbIF5( output5[0] = ( r * c5s ); )
10078+ stbIF6( output6[0] = ( r * c6s ); )
10079+ stbIF7( output7[0] = ( r * c7s ); )
10080+ #endif
10081+
10082+ ++input;
10083+ stbIF0( ++output0; ) stbIF1( ++output1; ) stbIF2( ++output2; ) stbIF3( ++output3; ) stbIF4( ++output4; ) stbIF5( ++output5; ) stbIF6( ++output6; ) stbIF7( ++output7; )
10084+ }
10085+}
10086+
// Vertical gather kernel (template; the real name is produced by STBIR_chans).
//
// Combines up to eight input rows into one output row. For every position i:
//     output[i]  = sum over enabled k of inputs[k][i] * vertical_coefficients[k]   (plain variant)
//     output[i] += the same sum                                                    (when STB_IMAGE_RESIZE_VERTICAL_CONTINUE is defined)
// where lane k is enabled when k < STBIR__vertical_channels (stbIF0..stbIF7).
//
//   outputp                - destination row
//   vertical_coefficients  - one scalar weight per enabled lane
//   inputs                 - array of input row pointers, one per enabled lane
//   input0_end             - end of inputs[0]; every loop bound is derived from it,
//                            so all input rows and the output advance in lock-step
//
// Work is done in three stages: a wide block loop, a 4-float loop, and a
// one-float tail, so any row length is handled.
10087+static void STBIR_chans( stbir__vertical_gather_with_,_coeffs)( float * outputp, float const * vertical_coefficients, float const ** inputs, float const * input0_end )
10088+{
10089+ float STBIR_SIMD_STREAMOUT_PTR( * ) output = outputp;
10090+
10091+ stbIF0( float const * input0 = inputs[0]; float c0s = vertical_coefficients[0]; )
10092+ stbIF1( float const * input1 = inputs[1]; float c1s = vertical_coefficients[1]; )
10093+ stbIF2( float const * input2 = inputs[2]; float c2s = vertical_coefficients[2]; )
10094+ stbIF3( float const * input3 = inputs[3]; float c3s = vertical_coefficients[3]; )
10095+ stbIF4( float const * input4 = inputs[4]; float c4s = vertical_coefficients[4]; )
10096+ stbIF5( float const * input5 = inputs[5]; float c5s = vertical_coefficients[5]; )
10097+ stbIF6( float const * input6 = inputs[6]; float c6s = vertical_coefficients[6]; )
10098+ stbIF7( float const * input7 = inputs[7]; float c7s = vertical_coefficients[7]; )
10099+
 // Fast path, compiled only for the single-lane non-CONTINUE instantiation: when the
 // one weight is within 1e-6 of 1.0 the row is copied verbatim instead of multiplied.
10100+#if ( STBIR__vertical_channels == 1 ) && !defined(STB_IMAGE_RESIZE_VERTICAL_CONTINUE)
10101+ // check single channel one weight
10102+ if ( ( c0s >= (1.0f-0.000001f) ) && ( c0s <= (1.0f+0.000001f) ) )
10103+ {
10104+ STBIR_MEMCPY( output, input0, (char*)input0_end - (char*)input0 );
10105+ return;
10106+ }
10107+#endif
10108+
10109+ #ifdef STBIR_SIMD
10110+ {
 // SIMD path. Each scalar weight is turned into a vector value once, up front.
 // NOTE(review): stbir__simdf_frepX presumably replicates the float into every vector element - confirm.
10111+ stbIF0(stbir__simdfX c0 = stbir__simdf_frepX( c0s ); )
10112+ stbIF1(stbir__simdfX c1 = stbir__simdf_frepX( c1s ); )
10113+ stbIF2(stbir__simdfX c2 = stbir__simdf_frepX( c2s ); )
10114+ stbIF3(stbir__simdfX c3 = stbir__simdf_frepX( c3s ); )
10115+ stbIF4(stbir__simdfX c4 = stbir__simdf_frepX( c4s ); )
10116+ stbIF5(stbir__simdfX c5 = stbir__simdf_frepX( c5s ); )
10117+ stbIF6(stbir__simdfX c6 = stbir__simdf_frepX( c6s ); )
10118+ stbIF7(stbir__simdfX c7 = stbir__simdf_frepX( c7s ); )
10119+
 // Wide block loop: four accumulators o0..o3 cover 4*stbir__simdfX_float_count floats per
 // iteration. Lane 0 initialises them (multiply, or load-and-multiply-add in the
 // CONTINUE variant); lanes 1..7 multiply-add on top; the result is stored once at the end.
10120+ STBIR_SIMD_NO_UNROLL_LOOP_START
10121+ while ( ( (char*)input0_end - (char*) input0 ) >= (16*stbir__simdfX_float_count) )
10122+ {
10123+ stbir__simdfX o0, o1, o2, o3, r0, r1, r2, r3;
10124+ STBIR_SIMD_NO_UNROLL(output);
10125+
10126+ // prefetch four loop iterations ahead (doesn't affect much for small resizes, but helps with big ones)
10127+ stbIF0( stbir__prefetch( input0 + (16*stbir__simdfX_float_count) ); )
10128+ stbIF1( stbir__prefetch( input1 + (16*stbir__simdfX_float_count) ); )
10129+ stbIF2( stbir__prefetch( input2 + (16*stbir__simdfX_float_count) ); )
10130+ stbIF3( stbir__prefetch( input3 + (16*stbir__simdfX_float_count) ); )
10131+ stbIF4( stbir__prefetch( input4 + (16*stbir__simdfX_float_count) ); )
10132+ stbIF5( stbir__prefetch( input5 + (16*stbir__simdfX_float_count) ); )
10133+ stbIF6( stbir__prefetch( input6 + (16*stbir__simdfX_float_count) ); )
10134+ stbIF7( stbir__prefetch( input7 + (16*stbir__simdfX_float_count) ); )
10135+
10136+ #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
10137+ stbIF0( stbir__simdfX_load( o0, output ); stbir__simdfX_load( o1, output+stbir__simdfX_float_count ); stbir__simdfX_load( o2, output+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( o3, output+(3*stbir__simdfX_float_count) );
10138+ stbir__simdfX_load( r0, input0 ); stbir__simdfX_load( r1, input0+stbir__simdfX_float_count ); stbir__simdfX_load( r2, input0+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( r3, input0+(3*stbir__simdfX_float_count) );
10139+ stbir__simdfX_madd( o0, o0, r0, c0 ); stbir__simdfX_madd( o1, o1, r1, c0 ); stbir__simdfX_madd( o2, o2, r2, c0 ); stbir__simdfX_madd( o3, o3, r3, c0 ); )
10140+ #else
10141+ stbIF0( stbir__simdfX_load( r0, input0 ); stbir__simdfX_load( r1, input0+stbir__simdfX_float_count ); stbir__simdfX_load( r2, input0+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( r3, input0+(3*stbir__simdfX_float_count) );
10142+ stbir__simdfX_mult( o0, r0, c0 ); stbir__simdfX_mult( o1, r1, c0 ); stbir__simdfX_mult( o2, r2, c0 ); stbir__simdfX_mult( o3, r3, c0 ); )
10143+ #endif
10144+
10145+ stbIF1( stbir__simdfX_load( r0, input1 ); stbir__simdfX_load( r1, input1+stbir__simdfX_float_count ); stbir__simdfX_load( r2, input1+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( r3, input1+(3*stbir__simdfX_float_count) );
10146+ stbir__simdfX_madd( o0, o0, r0, c1 ); stbir__simdfX_madd( o1, o1, r1, c1 ); stbir__simdfX_madd( o2, o2, r2, c1 ); stbir__simdfX_madd( o3, o3, r3, c1 ); )
10147+ stbIF2( stbir__simdfX_load( r0, input2 ); stbir__simdfX_load( r1, input2+stbir__simdfX_float_count ); stbir__simdfX_load( r2, input2+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( r3, input2+(3*stbir__simdfX_float_count) );
10148+ stbir__simdfX_madd( o0, o0, r0, c2 ); stbir__simdfX_madd( o1, o1, r1, c2 ); stbir__simdfX_madd( o2, o2, r2, c2 ); stbir__simdfX_madd( o3, o3, r3, c2 ); )
10149+ stbIF3( stbir__simdfX_load( r0, input3 ); stbir__simdfX_load( r1, input3+stbir__simdfX_float_count ); stbir__simdfX_load( r2, input3+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( r3, input3+(3*stbir__simdfX_float_count) );
10150+ stbir__simdfX_madd( o0, o0, r0, c3 ); stbir__simdfX_madd( o1, o1, r1, c3 ); stbir__simdfX_madd( o2, o2, r2, c3 ); stbir__simdfX_madd( o3, o3, r3, c3 ); )
10151+ stbIF4( stbir__simdfX_load( r0, input4 ); stbir__simdfX_load( r1, input4+stbir__simdfX_float_count ); stbir__simdfX_load( r2, input4+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( r3, input4+(3*stbir__simdfX_float_count) );
10152+ stbir__simdfX_madd( o0, o0, r0, c4 ); stbir__simdfX_madd( o1, o1, r1, c4 ); stbir__simdfX_madd( o2, o2, r2, c4 ); stbir__simdfX_madd( o3, o3, r3, c4 ); )
10153+ stbIF5( stbir__simdfX_load( r0, input5 ); stbir__simdfX_load( r1, input5+stbir__simdfX_float_count ); stbir__simdfX_load( r2, input5+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( r3, input5+(3*stbir__simdfX_float_count) );
10154+ stbir__simdfX_madd( o0, o0, r0, c5 ); stbir__simdfX_madd( o1, o1, r1, c5 ); stbir__simdfX_madd( o2, o2, r2, c5 ); stbir__simdfX_madd( o3, o3, r3, c5 ); )
10155+ stbIF6( stbir__simdfX_load( r0, input6 ); stbir__simdfX_load( r1, input6+stbir__simdfX_float_count ); stbir__simdfX_load( r2, input6+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( r3, input6+(3*stbir__simdfX_float_count) );
10156+ stbir__simdfX_madd( o0, o0, r0, c6 ); stbir__simdfX_madd( o1, o1, r1, c6 ); stbir__simdfX_madd( o2, o2, r2, c6 ); stbir__simdfX_madd( o3, o3, r3, c6 ); )
10157+ stbIF7( stbir__simdfX_load( r0, input7 ); stbir__simdfX_load( r1, input7+stbir__simdfX_float_count ); stbir__simdfX_load( r2, input7+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( r3, input7+(3*stbir__simdfX_float_count) );
10158+ stbir__simdfX_madd( o0, o0, r0, c7 ); stbir__simdfX_madd( o1, o1, r1, c7 ); stbir__simdfX_madd( o2, o2, r2, c7 ); stbir__simdfX_madd( o3, o3, r3, c7 ); )
10159+
10160+ stbir__simdfX_store( output, o0 ); stbir__simdfX_store( output+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output+(3*stbir__simdfX_float_count), o3 );
10161+ output += (4*stbir__simdfX_float_count);
10162+ stbIF0( input0 += (4*stbir__simdfX_float_count); ) stbIF1( input1 += (4*stbir__simdfX_float_count); ) stbIF2( input2 += (4*stbir__simdfX_float_count); ) stbIF3( input3 += (4*stbir__simdfX_float_count); ) stbIF4( input4 += (4*stbir__simdfX_float_count); ) stbIF5( input5 += (4*stbir__simdfX_float_count); ) stbIF6( input6 += (4*stbir__simdfX_float_count); ) stbIF7( input7 += (4*stbir__simdfX_float_count); )
10163+ }
10164+
 // Narrow loop: whatever the wide loop left over is consumed 4 floats (16 bytes) at a time.
 // NOTE(review): stbir__if_simdf8_cast_to_simdf4 presumably narrows the coefficient vector
 // to the 4-float type when stbir__simdfX is the 8-float type - confirm at its definition.
10165+ STBIR_SIMD_NO_UNROLL_LOOP_START
10166+ while ( ( (char*)input0_end - (char*) input0 ) >= 16 )
10167+ {
10168+ stbir__simdf o0, r0;
10169+ STBIR_SIMD_NO_UNROLL(output);
10170+
10171+ #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
10172+ stbIF0( stbir__simdf_load( o0, output ); stbir__simdf_load( r0, input0 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c0 ) ); )
10173+ #else
10174+ stbIF0( stbir__simdf_load( r0, input0 ); stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c0 ) ); )
10175+ #endif
10176+ stbIF1( stbir__simdf_load( r0, input1 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c1 ) ); )
10177+ stbIF2( stbir__simdf_load( r0, input2 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c2 ) ); )
10178+ stbIF3( stbir__simdf_load( r0, input3 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c3 ) ); )
10179+ stbIF4( stbir__simdf_load( r0, input4 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c4 ) ); )
10180+ stbIF5( stbir__simdf_load( r0, input5 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c5 ) ); )
10181+ stbIF6( stbir__simdf_load( r0, input6 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c6 ) ); )
10182+ stbIF7( stbir__simdf_load( r0, input7 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c7 ) ); )
10183+
10184+ stbir__simdf_store( output, o0 );
10185+ output += 4;
10186+ stbIF0( input0 += 4; ) stbIF1( input1 += 4; ) stbIF2( input2 += 4; ) stbIF3( input3 += 4; ) stbIF4( input4 += 4; ) stbIF5( input5 += 4; ) stbIF6( input6 += 4; ) stbIF7( input7 += 4; )
10187+ }
10188+ }
10189+ #else
 // Scalar fallback (no STBIR_SIMD): four independent accumulators per iteration
 // while at least 16 bytes of inputs[0] remain.
10190+ STBIR_NO_UNROLL_LOOP_START
10191+ while ( ( (char*)input0_end - (char*) input0 ) >= 16 )
10192+ {
10193+ float o0, o1, o2, o3;
10194+ STBIR_NO_UNROLL(output);
10195+ #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
10196+ stbIF0( o0 = output[0] + input0[0] * c0s; o1 = output[1] + input0[1] * c0s; o2 = output[2] + input0[2] * c0s; o3 = output[3] + input0[3] * c0s; )
10197+ #else
10198+ stbIF0( o0 = input0[0] * c0s; o1 = input0[1] * c0s; o2 = input0[2] * c0s; o3 = input0[3] * c0s; )
10199+ #endif
10200+ stbIF1( o0 += input1[0] * c1s; o1 += input1[1] * c1s; o2 += input1[2] * c1s; o3 += input1[3] * c1s; )
10201+ stbIF2( o0 += input2[0] * c2s; o1 += input2[1] * c2s; o2 += input2[2] * c2s; o3 += input2[3] * c2s; )
10202+ stbIF3( o0 += input3[0] * c3s; o1 += input3[1] * c3s; o2 += input3[2] * c3s; o3 += input3[3] * c3s; )
10203+ stbIF4( o0 += input4[0] * c4s; o1 += input4[1] * c4s; o2 += input4[2] * c4s; o3 += input4[3] * c4s; )
10204+ stbIF5( o0 += input5[0] * c5s; o1 += input5[1] * c5s; o2 += input5[2] * c5s; o3 += input5[3] * c5s; )
10205+ stbIF6( o0 += input6[0] * c6s; o1 += input6[1] * c6s; o2 += input6[2] * c6s; o3 += input6[3] * c6s; )
10206+ stbIF7( o0 += input7[0] * c7s; o1 += input7[1] * c7s; o2 += input7[2] * c7s; o3 += input7[3] * c7s; )
10207+ output[0] = o0; output[1] = o1; output[2] = o2; output[3] = o3;
10208+ output += 4;
10209+ stbIF0( input0 += 4; ) stbIF1( input1 += 4; ) stbIF2( input2 += 4; ) stbIF3( input3 += 4; ) stbIF4( input4 += 4; ) stbIF5( input5 += 4; ) stbIF6( input6 += 4; ) stbIF7( input7 += 4; )
10210+ }
10211+ #endif
 // Tail shared by both paths: the floats left over from the 4-wide loops are done one at a time.
10212+ STBIR_NO_UNROLL_LOOP_START
10213+ while ( input0 < input0_end )
10214+ {
10215+ float o0;
10216+ STBIR_NO_UNROLL(output);
10217+ #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
10218+ stbIF0( o0 = output[0] + input0[0] * c0s; )
10219+ #else
10220+ stbIF0( o0 = input0[0] * c0s; )
10221+ #endif
10222+ stbIF1( o0 += input1[0] * c1s; )
10223+ stbIF2( o0 += input2[0] * c2s; )
10224+ stbIF3( o0 += input3[0] * c3s; )
10225+ stbIF4( o0 += input4[0] * c4s; )
10226+ stbIF5( o0 += input5[0] * c5s; )
10227+ stbIF6( o0 += input6[0] * c6s; )
10228+ stbIF7( o0 += input7[0] * c7s; )
10229+ output[0] = o0;
10230+ ++output;
10231+ stbIF0( ++input0; ) stbIF1( ++input1; ) stbIF2( ++input2; ) stbIF3( ++input3; ) stbIF4( ++input4; ) stbIF5( ++input5; ) stbIF6( ++input6; ) stbIF7( ++input7; )
10232+ }
10233+}
10234+
// Teardown for the vertical section: removes the lane-gating macros, the
// name-building macro, the channel count and the section selectors, and clears
// STB_IMAGE_RESIZE_VERTICAL_CONTINUE if it was set.
// NOTE(review): presumably this lets the section be instantiated again with a
// different channel count or variant - confirm against the including code.
10235+#undef stbIF0
10236+#undef stbIF1
10237+#undef stbIF2
10238+#undef stbIF3
10239+#undef stbIF4
10240+#undef stbIF5
10241+#undef stbIF6
10242+#undef stbIF7
10243+#undef STB_IMAGE_RESIZE_DO_VERTICALS
10244+#undef STBIR__vertical_channels
10245+#undef STB_IMAGE_RESIZE_DO_HORIZONTALS
10246+#undef STBIR_strs_join24
10247+#undef STBIR_strs_join14
10248+#undef STBIR_chans
10249+#ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
10250+#undef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
10251+#endif
10252+
10253+#else // !STB_IMAGE_RESIZE_DO_VERTICALS
// Horizontal section. Function names are built by pasting
// STBIR__horizontal_channels between the two halves given to STBIR_chans.
10254+
10255+#define STBIR_chans( start, end ) STBIR_strs_join1(start,STBIR__horizontal_channels,end)
10256+
// Default building blocks. Each macro below is defined only if the includer has
// not already supplied its own version, and is composed from smaller pieces:
// stbir__1_coeff_only() and stbir__1_coeff_remnant( ofs ) are not defined here
// and must already exist.
// Each default expands to more than one statement (note the trailing semicolons),
// so it is only safe where a statement sequence is allowed, e.g. inside braces.

// 2 coefficients: the 1-coefficient start plus one remnant at offset 1.
10257+#ifndef stbir__2_coeff_only
10258+#define stbir__2_coeff_only() \
10259+ stbir__1_coeff_only(); \
10260+ stbir__1_coeff_remnant(1);
10261+#endif
10262+
// 2-coefficient remnant at `ofs`: two single remnants at ofs and ofs+1.
10263+#ifndef stbir__2_coeff_remnant
10264+#define stbir__2_coeff_remnant( ofs ) \
10265+ stbir__1_coeff_remnant(ofs); \
10266+ stbir__1_coeff_remnant((ofs)+1);
10267+#endif
10268+
// 3 coefficients: the 2-coefficient start plus one remnant at offset 2.
10269+#ifndef stbir__3_coeff_only
10270+#define stbir__3_coeff_only() \
10271+ stbir__2_coeff_only(); \
10272+ stbir__1_coeff_remnant(2);
10273+#endif
10274+
// 3-coefficient remnant at `ofs`: a 2-remnant at ofs plus a single remnant at ofs+2.
10275+#ifndef stbir__3_coeff_remnant
10276+#define stbir__3_coeff_remnant( ofs ) \
10277+ stbir__2_coeff_remnant(ofs); \
10278+ stbir__1_coeff_remnant((ofs)+2);
10279+#endif
10280+
// Optional setup hook for the 3-coefficient case; empty by default.
10281+#ifndef stbir__3_coeff_setup
10282+#define stbir__3_coeff_setup()
10283+#endif
10284+
// First 4 coefficients: the 2-coefficient start plus a 2-remnant at offset 2.
10285+#ifndef stbir__4_coeff_start
10286+#define stbir__4_coeff_start() \
10287+ stbir__2_coeff_only(); \
10288+ stbir__2_coeff_remnant(2);
10289+#endif
10290+
// Next 4 coefficients starting at `ofs`: two 2-remnants at ofs and ofs+2.
10291+#ifndef stbir__4_coeff_continue_from_4
10292+#define stbir__4_coeff_continue_from_4( ofs ) \
10293+ stbir__2_coeff_remnant(ofs); \
10294+ stbir__2_coeff_remnant((ofs)+2);
10295+#endif
10296+
// The "tiny" store falls back to the regular stbir__store_output unless overridden.
10297+#ifndef stbir__store_output_tiny
10298+#define stbir__store_output_tiny stbir__store_output
10299+#endif
10300+
// Horizontal gather kernel, 1-coefficient case (per the name): each loop pass
// handles one output pixel of STBIR__horizontal_channels floats using
// stbir__1_coeff_only() followed by stbir__store_output_tiny().
// `decode` points at the pixel's first contributor (n0) inside decode_buffer
// and `hc` at the current coefficients.
// NOTE(review): nothing written here advances output, horizontal_contributors
// or horizontal_coefficients, and coefficient_width is otherwise unused -
// presumably the store macro performs that stepping; confirm in its
// definition. The do/while tests the bound last, so the body runs once even
// if output_sub_size is 0.
10301+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_1_coeff)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
10302+{
10303+ float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
10304+ float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
10305+ STBIR_SIMD_NO_UNROLL_LOOP_START
10306+ do {
10307+ float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
10308+ float const * hc = horizontal_coefficients;
10309+ stbir__1_coeff_only();
10310+ stbir__store_output_tiny();
10311+ } while ( output < output_end );
10312+}
10313+
// Horizontal gather kernel, 2-coefficient case (per the name): each loop pass
// handles one output pixel of STBIR__horizontal_channels floats using
// stbir__2_coeff_only() followed by stbir__store_output_tiny().
// `decode` points at the pixel's first contributor (n0) inside decode_buffer
// and `hc` at the current coefficients.
// NOTE(review): nothing written here advances output, horizontal_contributors
// or horizontal_coefficients, and coefficient_width is otherwise unused -
// presumably the store macro performs that stepping; confirm in its
// definition. The do/while tests the bound last, so the body runs once even
// if output_sub_size is 0.
10314+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_2_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
10315+{
10316+ float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
10317+ float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
10318+ STBIR_SIMD_NO_UNROLL_LOOP_START
10319+ do {
10320+ float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
10321+ float const * hc = horizontal_coefficients;
10322+ stbir__2_coeff_only();
10323+ stbir__store_output_tiny();
10324+ } while ( output < output_end );
10325+}
10326+
// Horizontal gather kernel, 3-coefficient case (per the name): each loop pass
// handles one output pixel of STBIR__horizontal_channels floats using
// stbir__3_coeff_only() followed by stbir__store_output_tiny().
// `decode` points at the pixel's first contributor (n0) inside decode_buffer
// and `hc` at the current coefficients.
// NOTE(review): nothing written here advances output, horizontal_contributors
// or horizontal_coefficients, and coefficient_width is otherwise unused -
// presumably the store macro performs that stepping; confirm in its
// definition. The do/while tests the bound last, so the body runs once even
// if output_sub_size is 0.
10327+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_3_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
10328+{
10329+ float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
10330+ float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
10331+ STBIR_SIMD_NO_UNROLL_LOOP_START
10332+ do {
10333+ float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
10334+ float const * hc = horizontal_coefficients;
10335+ stbir__3_coeff_only();
10336+ stbir__store_output_tiny();
10337+ } while ( output < output_end );
10338+}
10339+
// Horizontal gather kernel, 4-coefficient case (per the name): each loop pass
// handles one output pixel of STBIR__horizontal_channels floats using
// stbir__4_coeff_start() followed by stbir__store_output().
// `decode` points at the pixel's first contributor (n0) inside decode_buffer
// and `hc` at the current coefficients.
// NOTE(review): nothing written here advances output, horizontal_contributors
// or horizontal_coefficients, and coefficient_width is otherwise unused -
// presumably stbir__store_output() performs that stepping; confirm in its
// definition. The do/while tests the bound last, so the body runs once even
// if output_sub_size is 0.
10340+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_4_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
10341+{
10342+ float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
10343+ float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
10344+ STBIR_SIMD_NO_UNROLL_LOOP_START
10345+ do {
10346+ float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
10347+ float const * hc = horizontal_coefficients;
10348+ stbir__4_coeff_start();
10349+ stbir__store_output();
10350+ } while ( output < output_end );
10351+}
10352+
// Horizontal gather kernel, 5-coefficient case (per the name): each loop pass
// handles one output pixel of STBIR__horizontal_channels floats using
// stbir__4_coeff_start(), a 1-coefficient remnant at offset 4, then
// stbir__store_output().
// `decode` points at the pixel's first contributor (n0) inside decode_buffer
// and `hc` at the current coefficients.
// NOTE(review): nothing written here advances output, horizontal_contributors
// or horizontal_coefficients, and coefficient_width is otherwise unused -
// presumably stbir__store_output() performs that stepping; confirm in its
// definition. The do/while tests the bound last, so the body runs once even
// if output_sub_size is 0.
10353+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_5_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
10354+{
10355+ float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
10356+ float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
10357+ STBIR_SIMD_NO_UNROLL_LOOP_START
10358+ do {
10359+ float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
10360+ float const * hc = horizontal_coefficients;
10361+ stbir__4_coeff_start();
10362+ stbir__1_coeff_remnant(4);
10363+ stbir__store_output();
10364+ } while ( output < output_end );
10365+}
10366+
// Horizontal gather kernel, 6-coefficient case (per the name): each loop pass
// handles one output pixel of STBIR__horizontal_channels floats using
// stbir__4_coeff_start(), a 2-coefficient remnant at offset 4, then
// stbir__store_output().
// `decode` points at the pixel's first contributor (n0) inside decode_buffer
// and `hc` at the current coefficients.
// NOTE(review): nothing written here advances output, horizontal_contributors
// or horizontal_coefficients, and coefficient_width is otherwise unused -
// presumably stbir__store_output() performs that stepping; confirm in its
// definition. The do/while tests the bound last, so the body runs once even
// if output_sub_size is 0.
10367+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_6_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
10368+{
10369+ float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
10370+ float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
10371+ STBIR_SIMD_NO_UNROLL_LOOP_START
10372+ do {
10373+ float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
10374+ float const * hc = horizontal_coefficients;
10375+ stbir__4_coeff_start();
10376+ stbir__2_coeff_remnant(4);
10377+ stbir__store_output();
10378+ } while ( output < output_end );
10379+}
10380+
// Horizontal gather kernel, 7-coefficient case (per the name): each loop pass
// handles one output pixel of STBIR__horizontal_channels floats using
// stbir__4_coeff_start(), a 3-coefficient remnant at offset 4, then
// stbir__store_output(). stbir__3_coeff_setup() is invoked once before the
// loop; its fallback definition in this file is empty.
// `decode` points at the pixel's first contributor (n0) inside decode_buffer
// and `hc` at the current coefficients.
// NOTE(review): nothing written here advances output, horizontal_contributors
// or horizontal_coefficients, and coefficient_width is otherwise unused -
// presumably stbir__store_output() performs that stepping; confirm in its
// definition. The do/while tests the bound last, so the body runs once even
// if output_sub_size is 0.
10381+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_7_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
10382+{
10383+ float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
10384+ float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
10385+ stbir__3_coeff_setup();
10386+ STBIR_SIMD_NO_UNROLL_LOOP_START
10387+ do {
10388+ float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
10389+ float const * hc = horizontal_coefficients;
10390+
10391+ stbir__4_coeff_start();
10392+ stbir__3_coeff_remnant(4);
10393+ stbir__store_output();
10394+ } while ( output < output_end );
10395+}
10396+
// Horizontal gather kernel, 8-coefficient case (per the name): each loop pass
// handles one output pixel of STBIR__horizontal_channels floats using
// stbir__4_coeff_start(), a 4-coefficient continuation at offset 4, then
// stbir__store_output().
// `decode` points at the pixel's first contributor (n0) inside decode_buffer
// and `hc` at the current coefficients.
// NOTE(review): nothing written here advances output, horizontal_contributors
// or horizontal_coefficients, and coefficient_width is otherwise unused -
// presumably stbir__store_output() performs that stepping; confirm in its
// definition. The do/while tests the bound last, so the body runs once even
// if output_sub_size is 0.
10397+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_8_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
10398+{
10399+ float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
10400+ float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
10401+ STBIR_SIMD_NO_UNROLL_LOOP_START
10402+ do {
10403+ float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
10404+ float const * hc = horizontal_coefficients;
10405+ stbir__4_coeff_start();
10406+ stbir__4_coeff_continue_from_4(4);
10407+ stbir__store_output();
10408+ } while ( output < output_end );
10409+}
10410+
// Horizontal gather kernel, 9-coefficient case (per the name): each loop pass
// handles one output pixel of STBIR__horizontal_channels floats using
// stbir__4_coeff_start(), a 4-coefficient continuation at offset 4, a
// 1-coefficient remnant at offset 8, then stbir__store_output().
// `decode` points at the pixel's first contributor (n0) inside decode_buffer
// and `hc` at the current coefficients.
// NOTE(review): nothing written here advances output, horizontal_contributors
// or horizontal_coefficients, and coefficient_width is otherwise unused -
// presumably stbir__store_output() performs that stepping; confirm in its
// definition. The do/while tests the bound last, so the body runs once even
// if output_sub_size is 0.
10411+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_9_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
10412+{
10413+ float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
10414+ float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
10415+ STBIR_SIMD_NO_UNROLL_LOOP_START
10416+ do {
10417+ float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
10418+ float const * hc = horizontal_coefficients;
10419+ stbir__4_coeff_start();
10420+ stbir__4_coeff_continue_from_4(4);
10421+ stbir__1_coeff_remnant(8);
10422+ stbir__store_output();
10423+ } while ( output < output_end );
10424+}
10425+
// Horizontal gather kernel, 10-coefficient case (per the name): each loop pass
// handles one output pixel of STBIR__horizontal_channels floats using
// stbir__4_coeff_start(), a 4-coefficient continuation at offset 4, a
// 2-coefficient remnant at offset 8, then stbir__store_output().
// `decode` points at the pixel's first contributor (n0) inside decode_buffer
// and `hc` at the current coefficients.
// NOTE(review): nothing written here advances output, horizontal_contributors
// or horizontal_coefficients, and coefficient_width is otherwise unused -
// presumably stbir__store_output() performs that stepping; confirm in its
// definition. The do/while tests the bound last, so the body runs once even
// if output_sub_size is 0.
10426+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_10_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
10427+{
10428+ float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
10429+ float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
10430+ STBIR_SIMD_NO_UNROLL_LOOP_START
10431+ do {
10432+ float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
10433+ float const * hc = horizontal_coefficients;
10434+ stbir__4_coeff_start();
10435+ stbir__4_coeff_continue_from_4(4);
10436+ stbir__2_coeff_remnant(8);
10437+ stbir__store_output();
10438+ } while ( output < output_end );
10439+}
10440+
// Horizontal gather kernel, 11-coefficient case (per the name): each loop pass
// handles one output pixel of STBIR__horizontal_channels floats using
// stbir__4_coeff_start(), a 4-coefficient continuation at offset 4, a
// 3-coefficient remnant at offset 8, then stbir__store_output().
// stbir__3_coeff_setup() is invoked once before the loop; its fallback
// definition in this file is empty.
// `decode` points at the pixel's first contributor (n0) inside decode_buffer
// and `hc` at the current coefficients.
// NOTE(review): nothing written here advances output, horizontal_contributors
// or horizontal_coefficients, and coefficient_width is otherwise unused -
// presumably stbir__store_output() performs that stepping; confirm in its
// definition. The do/while tests the bound last, so the body runs once even
// if output_sub_size is 0.
10441+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_11_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
10442+{
10443+ float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
10444+ float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
10445+ stbir__3_coeff_setup();
10446+ STBIR_SIMD_NO_UNROLL_LOOP_START
10447+ do {
10448+ float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
10449+ float const * hc = horizontal_coefficients;
10450+ stbir__4_coeff_start();
10451+ stbir__4_coeff_continue_from_4(4);
10452+ stbir__3_coeff_remnant(8);
10453+ stbir__store_output();
10454+ } while ( output < output_end );
10455+}
10456+
// Horizontal gather kernel, 12-coefficient case (per the name): each loop pass
// handles one output pixel of STBIR__horizontal_channels floats using
// stbir__4_coeff_start(), 4-coefficient continuations at offsets 4 and 8,
// then stbir__store_output().
// `decode` points at the pixel's first contributor (n0) inside decode_buffer
// and `hc` at the current coefficients.
// NOTE(review): nothing written here advances output, horizontal_contributors
// or horizontal_coefficients, and coefficient_width is otherwise unused -
// presumably stbir__store_output() performs that stepping; confirm in its
// definition. The do/while tests the bound last, so the body runs once even
// if output_sub_size is 0.
10457+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_12_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
10458+{
10459+ float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
10460+ float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
10461+ STBIR_SIMD_NO_UNROLL_LOOP_START
10462+ do {
10463+ float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
10464+ float const * hc = horizontal_coefficients;
10465+ stbir__4_coeff_start();
10466+ stbir__4_coeff_continue_from_4(4);
10467+ stbir__4_coeff_continue_from_4(8);
10468+ stbir__store_output();
10469+ } while ( output < output_end );
10470+}
10471+
// Horizontal gather kernel for a variable number of coefficients per pixel,
// "mod0" variant: no remnant step after the groups of four.
// Per output pixel it runs stbir__4_coeff_start(), then n continuation groups
// of 4, where n = ceil(((n1 - n0 + 1) - 4) / 4) is derived from the current
// contributor range. Before each group `hc` moves on by 4 floats and `decode`
// by 4 * STBIR__horizontal_channels floats, so the group is always addressed
// at offset 0.
// The inner do/while decrements n after the body, so at least one
// continuation group runs for every pixel, whatever n evaluates to.
// NOTE(review): nothing written here advances output, horizontal_contributors
// or horizontal_coefficients, and coefficient_width is otherwise unused -
// presumably stbir__store_output() performs that stepping; confirm in its
// definition. The outer do/while also runs once even if output_sub_size is 0.
10472+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod0 )( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
10473+{
10474+ float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
10475+ float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
10476+ STBIR_SIMD_NO_UNROLL_LOOP_START
10477+ do {
10478+ float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
10479+ int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 4 + 3 ) >> 2;
10480+ float const * hc = horizontal_coefficients;
10481+
10482+ stbir__4_coeff_start();
10483+ STBIR_SIMD_NO_UNROLL_LOOP_START
10484+ do {
10485+ hc += 4;
10486+ decode += STBIR__horizontal_channels * 4;
10487+ stbir__4_coeff_continue_from_4( 0 );
10488+ --n;
10489+ } while ( n > 0 );
10490+ stbir__store_output();
10491+ } while ( output < output_end );
10492+}
10493+
// Horizontal gather kernel for a variable number of coefficients per pixel,
// "mod1" variant: groups of four followed by a 1-coefficient remnant.
// Per output pixel it runs stbir__4_coeff_start(), then n continuation groups
// of 4, where n = ceil(((n1 - n0 + 1) - 5) / 4) is derived from the current
// contributor range. Before each group `hc` moves on by 4 floats and `decode`
// by 4 * STBIR__horizontal_channels floats, so the group is always addressed
// at offset 0. The remnant uses offset 4 because `hc`/`decode` still point at
// the start of the last group when the inner loop exits.
// The inner do/while decrements n after the body, so at least one
// continuation group runs for every pixel, whatever n evaluates to.
// NOTE(review): nothing written here advances output, horizontal_contributors
// or horizontal_coefficients, and coefficient_width is otherwise unused -
// presumably stbir__store_output() performs that stepping; confirm in its
// definition. The outer do/while also runs once even if output_sub_size is 0.
10494+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod1 )( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
10495+{
10496+ float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
10497+ float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
10498+ STBIR_SIMD_NO_UNROLL_LOOP_START
10499+ do {
10500+ float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
10501+ int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 5 + 3 ) >> 2;
10502+ float const * hc = horizontal_coefficients;
10503+
10504+ stbir__4_coeff_start();
10505+ STBIR_SIMD_NO_UNROLL_LOOP_START
10506+ do {
10507+ hc += 4;
10508+ decode += STBIR__horizontal_channels * 4;
10509+ stbir__4_coeff_continue_from_4( 0 );
10510+ --n;
10511+ } while ( n > 0 );
10512+ stbir__1_coeff_remnant( 4 );
10513+ stbir__store_output();
10514+ } while ( output < output_end );
10515+}
10516+
// Horizontal gather kernel for a variable number of coefficients per pixel,
// "mod2" variant: groups of four followed by a 2-coefficient remnant.
// Per output pixel it runs stbir__4_coeff_start(), then n continuation groups
// of 4, where n = ceil(((n1 - n0 + 1) - 6) / 4) is derived from the current
// contributor range. Before each group `hc` moves on by 4 floats and `decode`
// by 4 * STBIR__horizontal_channels floats, so the group is always addressed
// at offset 0. The remnant uses offset 4 because `hc`/`decode` still point at
// the start of the last group when the inner loop exits.
// The inner do/while decrements n after the body, so at least one
// continuation group runs for every pixel, whatever n evaluates to.
// NOTE(review): nothing written here advances output, horizontal_contributors
// or horizontal_coefficients, and coefficient_width is otherwise unused -
// presumably stbir__store_output() performs that stepping; confirm in its
// definition. The outer do/while also runs once even if output_sub_size is 0.
10517+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod2 )( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
10518+{
10519+ float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
10520+ float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
10521+ STBIR_SIMD_NO_UNROLL_LOOP_START
10522+ do {
10523+ float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
10524+ int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 6 + 3 ) >> 2;
10525+ float const * hc = horizontal_coefficients;
10526+
10527+ stbir__4_coeff_start();
10528+ STBIR_SIMD_NO_UNROLL_LOOP_START
10529+ do {
10530+ hc += 4;
10531+ decode += STBIR__horizontal_channels * 4;
10532+ stbir__4_coeff_continue_from_4( 0 );
10533+ --n;
10534+ } while ( n > 0 );
10535+ stbir__2_coeff_remnant( 4 );
10536+
10537+ stbir__store_output();
10538+ } while ( output < output_end );
10539+}
10540+
// Horizontal gather kernel for a variable number of coefficients per pixel,
// "mod3" variant: groups of four followed by a 3-coefficient remnant.
// stbir__3_coeff_setup() is invoked once before the pixel loop; its fallback
// definition in this file is empty.
// Per output pixel it runs stbir__4_coeff_start(), then n continuation groups
// of 4, where n = ceil(((n1 - n0 + 1) - 7) / 4) is derived from the current
// contributor range. Before each group `hc` moves on by 4 floats and `decode`
// by 4 * STBIR__horizontal_channels floats, so the group is always addressed
// at offset 0. The remnant uses offset 4 because `hc`/`decode` still point at
// the start of the last group when the inner loop exits.
// The inner do/while decrements n after the body, so at least one
// continuation group runs for every pixel, whatever n evaluates to.
// NOTE(review): nothing written here advances output, horizontal_contributors
// or horizontal_coefficients, and coefficient_width is otherwise unused -
// presumably stbir__store_output() performs that stepping; confirm in its
// definition. The outer do/while also runs once even if output_sub_size is 0.
10541+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod3 )( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
10542+{
10543+ float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
10544+ float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
10545+ stbir__3_coeff_setup();
10546+ STBIR_SIMD_NO_UNROLL_LOOP_START
10547+ do {
10548+ float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
10549+ int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 7 + 3 ) >> 2;
10550+ float const * hc = horizontal_coefficients;
10551+
10552+ stbir__4_coeff_start();
10553+ STBIR_SIMD_NO_UNROLL_LOOP_START
10554+ do {
10555+ hc += 4;
10556+ decode += STBIR__horizontal_channels * 4;
10557+ stbir__4_coeff_continue_from_4( 0 );
10558+ --n;
10559+ } while ( n > 0 );
10560+ stbir__3_coeff_remnant( 4 );
10561+
10562+ stbir__store_output();
10563+ } while ( output < output_end );
10564+}
10565+
// Table of the four variable-length gather kernels for this channel count,
// stored in mod0, mod1, mod2, mod3 order.
// NOTE(review): presumably indexed by (coefficient count & 3); the lookup
// code is outside this section - confirm there.
10566+static stbir__horizontal_gather_channels_func * STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_funcs)[4]=
10567+{
10568+ STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod0),
10569+ STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod1),
10570+ STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod2),
10571+ STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod3),
10572+};
10573+
// Table of the fixed-length gather kernels for this channel count: entry i
// holds the kernel named for i + 1 coefficients (1 through 12).
// NOTE(review): presumably indexed by (coefficient count - 1); the lookup
// code is outside this section - confirm there.
10574+static stbir__horizontal_gather_channels_func * STBIR_chans(stbir__horizontal_gather_,_channels_funcs)[12]=
10575+{
10576+ STBIR_chans(stbir__horizontal_gather_,_channels_with_1_coeff),
10577+ STBIR_chans(stbir__horizontal_gather_,_channels_with_2_coeffs),
10578+ STBIR_chans(stbir__horizontal_gather_,_channels_with_3_coeffs),
10579+ STBIR_chans(stbir__horizontal_gather_,_channels_with_4_coeffs),
10580+ STBIR_chans(stbir__horizontal_gather_,_channels_with_5_coeffs),
10581+ STBIR_chans(stbir__horizontal_gather_,_channels_with_6_coeffs),
10582+ STBIR_chans(stbir__horizontal_gather_,_channels_with_7_coeffs),
10583+ STBIR_chans(stbir__horizontal_gather_,_channels_with_8_coeffs),
10584+ STBIR_chans(stbir__horizontal_gather_,_channels_with_9_coeffs),
10585+ STBIR_chans(stbir__horizontal_gather_,_channels_with_10_coeffs),
10586+ STBIR_chans(stbir__horizontal_gather_,_channels_with_11_coeffs),
10587+ STBIR_chans(stbir__horizontal_gather_,_channels_with_12_coeffs),
10588+};
10589+
// Tear down the channel-count parameter, the coefficient/store helper macros
// and the STBIR_chans name builder used by the horizontal kernels above.
// NOTE(review): presumably this allows the section to be expanded again for a
// different STBIR__horizontal_channels value - confirm against the places
// that include this code.
10590+#undef STBIR__horizontal_channels
10591+#undef STB_IMAGE_RESIZE_DO_HORIZONTALS
10592+#undef stbir__1_coeff_only
10593+#undef stbir__1_coeff_remnant
10594+#undef stbir__2_coeff_only
10595+#undef stbir__2_coeff_remnant
10596+#undef stbir__3_coeff_only
10597+#undef stbir__3_coeff_remnant
10598+#undef stbir__3_coeff_setup
10599+#undef stbir__4_coeff_start
10600+#undef stbir__4_coeff_continue_from_4
10601+#undef stbir__store_output
10602+#undef stbir__store_output_tiny
10603+#undef STBIR_chans
10604+
10605+#endif // HORIZONTALS
10606+
// Drop the identifier-joining helpers; STBIR_chans above is built on
// STBIR_strs_join1.
10607+#undef STBIR_strs_join2
10608+#undef STBIR_strs_join1
10609+
10610+#endif // STB_IMAGE_RESIZE_DO_HORIZONTALS/VERTICALS/CODERS
10611+
10612+/*
10613+------------------------------------------------------------------------------
10614+This software is available under 2 licenses -- choose whichever you prefer.
10615+------------------------------------------------------------------------------
10616+ALTERNATIVE A - MIT License
10617+Copyright (c) 2017 Sean Barrett
10618+Permission is hereby granted, free of charge, to any person obtaining a copy of
10619+this software and associated documentation files (the "Software"), to deal in
10620+the Software without restriction, including without limitation the rights to
10621+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
10622+of the Software, and to permit persons to whom the Software is furnished to do
10623+so, subject to the following conditions:
10624+The above copyright notice and this permission notice shall be included in all
10625+copies or substantial portions of the Software.
10626+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
10627+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
10628+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
10629+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
10630+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
10631+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
10632+SOFTWARE.
10633+------------------------------------------------------------------------------
10634+ALTERNATIVE B - Public Domain (www.unlicense.org)
10635+This is free and unencumbered software released into the public domain.
10636+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
10637+software, either in source code form or as a compiled binary, for any purpose,
10638+commercial or non-commercial, and by any means.
10639+In jurisdictions that recognize copyright laws, the author or authors of this
10640+software dedicate any and all copyright interest in the software to the public
10641+domain. We make this dedication for the benefit of the public at large and to
10642+the detriment of our heirs and successors. We intend this dedication to be an
10643+overt act of relinquishment in perpetuity of all present and future rights to
10644+this software under copyright law.
10645+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
10646+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
10647+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
10648+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
10649+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
10650+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
10651+------------------------------------------------------------------------------
10652+*/