[Scummvm-git-logs] scummvm master -> 742f1f296dd052db30f2eabf2c3eb5e50e3ef308

sev- noreply at scummvm.org
Mon Aug 28 19:17:05 UTC 2023


This automated email contains information about 45 new commits which have been
pushed to the 'scummvm' repo located at https://github.com/scummvm/scummvm .

Summary:
9b77e3bf03 AGS: Preliminary bitmap benchmark code
bf1f3dbb8b AGS: Refined the temporary bitmap benchmark
1e4a03313c AGS: First blending optimizations
aa9d13f84a AGS: Preliminary optimizations of blending funcs
b7505b93c5 AGS: Optimized the kRgbToRgbBlend 32 bit code path
1e47d7fc56 AGS: Accounted for hflip in drawing optimizations
dc6f6070b2 AGS: Finished first full version of optimizations
6c3c94b7ff AGS: Cleaning up and micro optimizations on bitmap
499dfd7d7a AGS: Added 2Bpp specific code path optimization
d5b2cd4aea AGS: Added SIMD optimizations for stretchedDraw
07107b19bd AGS: Fixed ARGB blending and finished benchmark
4ad7a30b38 AGS: Just fixed kTintBlenderMode
a22396163f AGS: Created test code for blending modes
673cb4d659 AGS: Optimizations turn off if SSE is not found.
b177d382b6 AGS: Moved arm neon bitmap code to new file.
90df6233f8 AGS: Put comments in NEON blitting/blending code.
0d29563122 AGS: Started on SSE version
45f093f4be AGS: Fixed SSE2 detector and unoptimized draw.
acc818a0cb AGS: Intel/AMD's SIMD path goes to the normal one.
87656d66df AGS: Cleaned up blending/blitting pull request.
ace1a346cc AGS: Finished SSE2 blending optimizations
def889099e AGS: Added check for MSVC specific macros in simd
b3681c5cb3 AGS: Making MSVC and iOS compilers happy
9c11912da9 AGS: Not using Arm NEON on iOS sim. or Arm Windows
33cb39c2e8 AGS: Appeasing iOS compiler again
ff72736c49 AGS: PowerPC Altivec: initial support
bdbceeb674 AGS: Fixed PowerPC code not compiling under GCC 6.59.21
f53e39bac4 AGS: Fixed last commit
0f6da5b299 AGS: Still fixing PowerPC blitting issues
f5908486b9 AGS: Still trying to get PPC blitting to compile
01fa027296 AGS: Still trying to get PPC blitting to compile
89fef524b8 AGS: Fixed more compiling issues for blitting PPC
ef265e68dd AGS: Still trying to get PPC to compile
f88d3633b4 AGS: Still trying to get PPC to compile
29a0903e5b AGS: Holding off PowerPC blitting optimizations
6c353ba72b AGS: Use main SIMD detection features
a1858e31f0 AGS: JANITORIAL: Cleaned up old blitting files
cf358fbc4e BUILD: AGS removed PPC blending files
153afb1081 AGS: Cleaned up blending funcs argument passing
bc20c0185d AGS: GRAPHICS: Changed blending function templates
1dfbaa35c5 AGS: GRAPHICS: Moved duplicate code to DrawInnerArgs
1cf3c7832a AGS: GRAPHICS: SIMD blending refactoring
e416492a06 AGS: Now engine can detect AVX2
b22e073e35 AGS: Added AVX2 support for blending functions
742f1f296d AGS: Fixed code formatting


Commit: 9b77e3bf0386e5e87df1e8db05325e5ddb242c1c
    https://github.com/scummvm/scummvm/commit/9b77e3bf0386e5e87df1e8db05325e5ddb242c1c
Author: wyatt-radkiewicz (wyattwradkiewicz at gmail.com)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: Preliminary bitmap benchmark code

Added basic benchmarking code in engines/ags/engine/main/engine.cpp by
placing it in the seemingly unused allegro_bitmap_test_init function.
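
For reference, here is a minimal standalone sketch of the same timing
approach, assuming plain <chrono> rather than the AGS chrono shim;
drawOnce() is a hypothetical stand-in for the engine's render call, not
an actual engine function:

#include <chrono>
#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for the call being measured.
static void drawOnce() { /* draw one frame / one blit */ }

int main() {
	const uint64_t benchRuns[] = {10, 100, 1000, 10000, 100000};
	for (uint64_t runs : benchRuns) {
		auto start = std::chrono::high_resolution_clock::now();
		for (uint64_t i = 0; i < runs; ++i)
			drawOnce();
		auto end = std::chrono::high_resolution_clock::now();
		auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
		std::printf("%llu iterations: %lld ms\n", (unsigned long long)runs, (long long)ms);
	}
	return 0;
}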

Changed paths:
  A benchgfx1.bmp
    engines/ags/engine/main/engine.cpp


diff --git a/benchgfx1.bmp b/benchgfx1.bmp
new file mode 100644
index 00000000000..5f190fe075f
Binary files /dev/null and b/benchgfx1.bmp differ
diff --git a/engines/ags/engine/main/engine.cpp b/engines/ags/engine/main/engine.cpp
index fae7dedcd91..c1e8bccdb5a 100644
--- a/engines/ags/engine/main/engine.cpp
+++ b/engines/ags/engine/main/engine.cpp
@@ -23,6 +23,7 @@
 // Engine initialization
 //
 
+#include "ags/lib/std/chrono.h"
 #include "ags/shared/core/platform.h"
 #include "ags/lib/allegro.h" // allegro_install and _exit
 #include "ags/engine/ac/asset_helper.h"
@@ -71,6 +72,7 @@
 #include "ags/engine/platform/base/ags_platform_driver.h"
 #include "ags/shared/util/directory.h"
 #include "ags/shared/util/error.h"
+#include "ags/shared/util/file.h"
 #include "ags/shared/util/path.h"
 #include "ags/shared/util/string_utils.h"
 #include "ags/ags.h"
@@ -808,6 +810,42 @@ void allegro_bitmap_test_init() {
 	test_allegro_bitmap = nullptr;
 	// Switched the test off for now
 	//test_allegro_bitmap = AllegroBitmap::CreateBitmap(320,200,32);
+
+	const uint64_t bench_runs[] = {10000, 10, 100, 1000, 10000, 100000};
+
+	Bitmap *benchgfx1 = BitmapHelper::CreateRawBitmapOwner(load_bmp("benchgfx1.bmp", nullptr));
+	if (benchgfx1 != nullptr) {
+		Debug::Printf(kDbgMsg_Info, "Benchmark ver 1");
+		if (_G(gfxDriver)->UsesMemoryBackBuffer()) 
+			_G(gfxDriver)->GetMemoryBackBuffer()->Clear();
+
+		const Rect &view = _GP(play).GetMainViewport();
+		Bitmap *tsc = BitmapHelper::CreateBitmapCopy(benchgfx1, _GP(game).GetColorDepth());
+		IDriverDependantBitmap *ddb = _G(gfxDriver)->CreateDDBFromBitmap(tsc, false, true);
+
+		for (long unsigned int i = 0; i < sizeof(bench_runs)/sizeof(uint64_t); i++) {
+			_G(gfxDriver)->ClearDrawLists();
+			_G(gfxDriver)->BeginSpriteBatch(view);
+			for (uint64_t j = 0; j < bench_runs[i]; j++) {
+				_G(gfxDriver)->DrawSprite(0, 0, ddb);
+			}
+			_G(gfxDriver)->EndSpriteBatch();
+
+			Debug::Printf(kDbgMsg_Info, "Starting Allegro Bitmap Test Bench 1");
+			uint32_t start = std::chrono::high_resolution_clock::now();
+			render_to_screen();
+			uint32_t end = std::chrono::high_resolution_clock::now();
+			Debug::Printf(kDbgMsg_Info, "Done! Results (%llu iterations):", bench_runs[i]);
+			Debug::Printf(kDbgMsg_Info, "exec time (mills): %u", end - start);
+		}
+		
+		_G(gfxDriver)->DestroyDDB(ddb);
+		delete benchgfx1;
+		delete tsc;
+		_G(platform)->Delay(1000);
+	} else {
+		warning("Couldn't load the test bench graphics!");
+	}
 }
 
 // Define location of the game data either using direct settings or searching


Commit: bf1f3dbb8b5367e11fdbcb02e385c3b53f420949
    https://github.com/scummvm/scummvm/commit/bf1f3dbb8b5367e11fdbcb02e385c3b53f420949
Author: wyatt-radkiewicz (wyattwradkiewicz at gmail.com)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: Refined the temporary bitmap benchmark

The old benchmark measured the render_to_screen function, which was not
specific enough to what actually needs to be benchmarked (BITMAP::draw
and the blending functions), so it was changed to call Allegro's bitmap
drawing functions directly. Also, judging from other games, AGS games
seem to use truecolor graphics more often than not, so the benchmark
test graphic was changed to truecolor.

Changed paths:
    benchgfx1.bmp
    engines/ags/engine/main/engine.cpp


diff --git a/benchgfx1.bmp b/benchgfx1.bmp
index 5f190fe075f..99f058dcbe1 100644
Binary files a/benchgfx1.bmp and b/benchgfx1.bmp differ
diff --git a/engines/ags/engine/main/engine.cpp b/engines/ags/engine/main/engine.cpp
index c1e8bccdb5a..5d23c5221e7 100644
--- a/engines/ags/engine/main/engine.cpp
+++ b/engines/ags/engine/main/engine.cpp
@@ -810,39 +810,24 @@ void allegro_bitmap_test_init() {
 	test_allegro_bitmap = nullptr;
 	// Switched the test off for now
 	//test_allegro_bitmap = AllegroBitmap::CreateBitmap(320,200,32);
-
-	const uint64_t bench_runs[] = {10000, 10, 100, 1000, 10000, 100000};
-
+	
 	Bitmap *benchgfx1 = BitmapHelper::CreateRawBitmapOwner(load_bmp("benchgfx1.bmp", nullptr));
+	Bitmap *dest = BitmapHelper::CreateBitmap(100, 100, benchgfx1->GetColorDepth());
+	uint64_t bench_runs[] = {1000, 10000, 100000};
 	if (benchgfx1 != nullptr) {
-		Debug::Printf(kDbgMsg_Info, "Benchmark ver 1");
-		if (_G(gfxDriver)->UsesMemoryBackBuffer()) 
-			_G(gfxDriver)->GetMemoryBackBuffer()->Clear();
-
-		const Rect &view = _GP(play).GetMainViewport();
-		Bitmap *tsc = BitmapHelper::CreateBitmapCopy(benchgfx1, _GP(game).GetColorDepth());
-		IDriverDependantBitmap *ddb = _G(gfxDriver)->CreateDDBFromBitmap(tsc, false, true);
-
 		for (long unsigned int i = 0; i < sizeof(bench_runs)/sizeof(uint64_t); i++) {
-			_G(gfxDriver)->ClearDrawLists();
-			_G(gfxDriver)->BeginSpriteBatch(view);
+			Debug::Printf(kDbgMsg_Info, "Starting Allegro Bitmap Test Bench 2 (%d bpp)", benchgfx1->GetColorDepth());
+			uint32_t start = std::chrono::high_resolution_clock::now();
 			for (uint64_t j = 0; j < bench_runs[i]; j++) {
-				_G(gfxDriver)->DrawSprite(0, 0, ddb);
+				dest->Blit(benchgfx1, 0, 0, kBitmap_Transparency);
 			}
-			_G(gfxDriver)->EndSpriteBatch();
-
-			Debug::Printf(kDbgMsg_Info, "Starting Allegro Bitmap Test Bench 1");
-			uint32_t start = std::chrono::high_resolution_clock::now();
-			render_to_screen();
 			uint32_t end = std::chrono::high_resolution_clock::now();
 			Debug::Printf(kDbgMsg_Info, "Done! Results (%llu iterations):", bench_runs[i]);
 			Debug::Printf(kDbgMsg_Info, "exec time (mills): %u", end - start);
 		}
 		
-		_G(gfxDriver)->DestroyDDB(ddb);
 		delete benchgfx1;
-		delete tsc;
-		_G(platform)->Delay(1000);
+		delete dest;
 	} else {
 		warning("Couldn't load the test bench graphics!");
 	}


Commit: 1e4a03313c04b8b360c5712aea58d73e3551b4ee
    https://github.com/scummvm/scummvm/commit/1e4a03313c04b8b360c5712aea58d73e3551b4ee
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: First blending optimizations

Moved the loop of BITMAP::draw into a templated function so that
certain checks can be resolved at compile time instead of per pixel.
When the src and dst formats are both 4 bytes per pixel, it skips
colorToARGB and uses a quicker method. This is still a very early work
in progress.
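
A minimal sketch of the templating idea in isolation, assuming a
simplified PixelFormat placeholder rather than the engine's real
struct: the fast-path decision becomes a template parameter, so the
untaken branch disappears from the compiled per-pixel loop.

#include <cstdint>

struct PixelFormat {	// simplified placeholder, not the engine's type
	uint8_t aShift, rShift, gShift, bShift;
	void colorToARGB(uint32_t c, uint8_t &a, uint8_t &r, uint8_t &g, uint8_t &b) const {
		a = (c >> aShift) & 0xff; r = (c >> rShift) & 0xff;
		g = (c >> gShift) & 0xff; b = (c >> bShift) & 0xff;
	}
};

template<bool SameArgb32>
void unpackRow(const uint32_t *src, uint8_t *out, int count, const PixelFormat &fmt) {
	for (int i = 0; i < count; ++i) {
		uint8_t a, r, g, b;
		if (SameArgb32) {
			// Both surfaces are known 4 bpp ARGB8888: shift directly.
			// The else branch below is dead code for this instantiation.
			a = src[i] >> 24; r = (src[i] >> 16) & 0xff;
			g = (src[i] >> 8) & 0xff; b = src[i] & 0xff;
		} else {
			fmt.colorToARGB(src[i], a, r, g, b);	// generic, slower path
		}
		out[i * 4 + 0] = a; out[i * 4 + 1] = r;
		out[i * 4 + 2] = g; out[i * 4 + 3] = b;
	}
}

// Dispatch once per blit, e.g.:
//   sameFormat && bpp == 4 ? unpackRow<true>(...) : unpackRow<false>(...)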

Changed paths:
    benchgfx1.bmp
    engines/ags/lib/allegro/surface.cpp
    engines/ags/lib/allegro/surface.h
    engines/ags/lib/std/functional.h


diff --git a/benchgfx1.bmp b/benchgfx1.bmp
index 99f058dcbe1..9a09349533a 100644
Binary files a/benchgfx1.bmp and b/benchgfx1.bmp differ
diff --git a/engines/ags/lib/allegro/surface.cpp b/engines/ags/lib/allegro/surface.cpp
index caa34668d1e..70f9ce488e4 100644
--- a/engines/ags/lib/allegro/surface.cpp
+++ b/engines/ags/lib/allegro/surface.cpp
@@ -140,13 +140,9 @@ void BITMAP::draw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
 	Graphics::Surface destArea = dest.getSubArea(destRect);
 
 	// Define scaling and other stuff used by the drawing loops
-	const int xDir = horizFlip ? -1 : 1;
 	bool useTint = (tintRed >= 0 && tintGreen >= 0 && tintBlue >= 0);
 	bool sameFormat = (src.format == format);
 
-	byte rSrc, gSrc, bSrc, aSrc;
-	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
-
 	PALETTE palette;
 	if (src.format.bytesPerPixel == 1 && format.bytesPerPixel != 1) {
 		for (int i = 0; i < PAL_SIZE; ++i) {
@@ -166,81 +162,10 @@ void BITMAP::draw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
 	int xStart = (dstRect.left < destRect.left) ? dstRect.left - destRect.left : 0;
 	int yStart = (dstRect.top < destRect.top) ? dstRect.top - destRect.top : 0;
 
-	for (int destY = yStart, yCtr = 0; yCtr < dstRect.height(); ++destY, ++yCtr) {
-		if (destY < 0 || destY >= destArea.h)
-			continue;
-		byte *destP = (byte *)destArea.getBasePtr(0, destY);
-		const byte *srcP = (const byte *)src.getBasePtr(
-		                       horizFlip ? srcArea.right - 1 : srcArea.left,
-		                       vertFlip ? srcArea.bottom - 1 - yCtr :
-		                       srcArea.top + yCtr);
-
-		// Loop through the pixels of the row
-		for (int destX = xStart, xCtr = 0, xCtrBpp = 0; xCtr < dstRect.width(); ++destX, ++xCtr, xCtrBpp += src.format.bytesPerPixel) {
-			if (destX < 0 || destX >= destArea.w)
-				continue;
-
-			const byte *srcVal = srcP + xDir * xCtrBpp;
-			uint32 srcCol = getColor(srcVal, src.format.bytesPerPixel);
-
-			// Check if this is a transparent color we should skip
-			if (skipTrans && ((srcCol & alphaMask) == transColor))
-				continue;
-
-			byte *destVal = (byte *)&destP[destX * format.bytesPerPixel];
-
-			// When blitting to the same format we can just copy the color
-			if (format.bytesPerPixel == 1) {
-				*destVal = srcCol;
-				continue;
-			} else if (sameFormat && srcAlpha == -1) {
-				if (format.bytesPerPixel == 4)
-					*(uint32 *)destVal = srcCol;
-				else
-					*(uint16 *)destVal = srcCol;
-				continue;
-			}
-
-			// We need the rgb values to do blending and/or convert between formats
-			if (src.format.bytesPerPixel == 1) {
-				const RGB &rgb = palette[srcCol];
-				aSrc = 0xff;
-				rSrc = rgb.r;
-				gSrc = rgb.g;
-				bSrc = rgb.b;
-			} else
-				src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
-
-			if (srcAlpha == -1) {
-				// This means we don't use blending.
-				aDest = aSrc;
-				rDest = rSrc;
-				gDest = gSrc;
-				bDest = bSrc;
-			} else {
-				if (useTint) {
-					rDest = rSrc;
-					gDest = gSrc;
-					bDest = bSrc;
-					aDest = aSrc;
-					rSrc = tintRed;
-					gSrc = tintGreen;
-					bSrc = tintBlue;
-					aSrc = srcAlpha;
-				} else {
-					// TODO: move this to blendPixel to only do it when needed?
-					format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
-				}
-				blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, srcAlpha);
-			}
-
-			uint32 pixel = format.ARGBToColor(aDest, rDest, gDest, bDest);
-			if (format.bytesPerPixel == 4)
-				*(uint32 *)destVal = pixel;
-			else
-				*(uint16 *)destVal = pixel;
-		}
-	}
+#define DRAWINNER(formattype) drawInner<formattype>(yStart, xStart, transColor, alphaMask, palette, useTint, sameFormat, src, destArea, horizFlip, vertFlip, skipTrans, srcAlpha, tintRed, tintGreen, tintBlue, dstRect, srcArea)
+	if (sameFormat && format.bytesPerPixel == 4) DRAWINNER(1);
+	else DRAWINNER(0);
+#undef DRAWINNER
 }
 
 void BITMAP::stretchDraw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
diff --git a/engines/ags/lib/allegro/surface.h b/engines/ags/lib/allegro/surface.h
index 88405dc9681..159bfcb9395 100644
--- a/engines/ags/lib/allegro/surface.h
+++ b/engines/ags/lib/allegro/surface.h
@@ -24,6 +24,7 @@
 
 #include "graphics/managed_surface.h"
 #include "ags/lib/allegro/base.h"
+#include "ags/lib/allegro/color.h"
 #include "common/array.h"
 
 namespace AGS3 {
@@ -265,6 +266,96 @@ public:
 	// kTintBlenderMode and kTintLightBlenderMode
 	void blendTintSprite(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &aDest, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha, bool light) const;
 
+	template<int FormatType>
+	void drawInner(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea) {
+		const int xDir = horizFlip ? -1 : 1;
+		byte rSrc, gSrc, bSrc, aSrc;
+		byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
+		
+		for (int destY = yStart, yCtr = 0; yCtr < dstRect.height(); ++destY, ++yCtr) {
+			if (destY < 0 || destY >= destArea.h)
+				continue;
+			byte *destP = (byte *)destArea.getBasePtr(0, destY);
+			const byte *srcP = (const byte *)src.getBasePtr(
+			                       horizFlip ? srcArea.right - 1 : srcArea.left,
+			                       vertFlip ? srcArea.bottom - 1 - yCtr :
+			                       srcArea.top + yCtr);
+
+			// Loop through the pixels of the row
+			for (int destX = xStart, xCtr = 0, xCtrBpp = 0; xCtr < dstRect.width(); ++destX, ++xCtr, xCtrBpp += src.format.bytesPerPixel) {
+				if (destX < 0 || destX >= destArea.w)
+					continue;
+
+				const byte *srcVal = srcP + xDir * xCtrBpp;
+				uint32 srcCol = getColor(srcVal, src.format.bytesPerPixel);
+
+				// Check if this is a transparent color we should skip
+				if (skipTrans && ((srcCol & alphaMask) == transColor))
+					continue;
+
+				byte *destVal = (byte *)&destP[destX * format.bytesPerPixel];
+
+				// When blitting to the same format we can just copy the color
+				if (format.bytesPerPixel == 1) {
+					*destVal = srcCol;
+					continue;
+				} else if (sameFormat && srcAlpha == -1) {
+					if (format.bytesPerPixel == 4)
+						*(uint32 *)destVal = srcCol;
+					else
+						*(uint16 *)destVal = srcCol;
+					continue;
+				}
+
+				// We need the rgb values to do blending and/or convert between formats
+				if (src.format.bytesPerPixel == 1) {
+					const RGB &rgb = palette[srcCol];
+					aSrc = 0xff;
+					rSrc = rgb.r;
+					gSrc = rgb.g;
+					bSrc = rgb.b;
+				} else {
+					if (FormatType == 1) {
+						aSrc = srcCol >> src.format.aShift & 0xff;
+						rSrc = srcCol >> src.format.rShift & 0xff;
+						gSrc = srcCol >> src.format.gShift & 0xff;
+						bSrc = srcCol >> src.format.bShift & 0xff;
+					} else {
+						src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
+					}
+				}
+
+				if (srcAlpha == -1) {
+					// This means we don't use blending.
+					aDest = aSrc;
+					rDest = rSrc;
+					gDest = gSrc;
+					bDest = bSrc;
+				} else {
+					if (useTint) {
+						rDest = rSrc;
+						gDest = gSrc;
+						bDest = bSrc;
+						aDest = aSrc;
+						rSrc = tintRed;
+						gSrc = tintGreen;
+						bSrc = tintBlue;
+						aSrc = srcAlpha;
+					} else {
+						// TODO: move this to blendPixel to only do it when needed?
+						format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
+					}
+					blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, srcAlpha);
+				}
+
+				uint32 pixel = format.ARGBToColor(aDest, rDest, gDest, bDest);
+				if (format.bytesPerPixel == 4)
+					*(uint32 *)destVal = pixel;
+				else
+					*(uint16 *)destVal = pixel;
+			}
+		}
+	}
 
 	inline uint32 getColor(const byte *data, byte bpp) const {
 		switch (bpp) {
diff --git a/engines/ags/lib/std/functional.h b/engines/ags/lib/std/functional.h
index ece633814bc..1d1e3b16e61 100644
--- a/engines/ags/lib/std/functional.h
+++ b/engines/ags/lib/std/functional.h
@@ -49,7 +49,7 @@ struct function {
 		return *_fn;
 	}
 
-	operator bool() const {
+	operator bool() {
 		return _fn != nullptr;
 	}
 };


Commit: aa9d13f84ab75cc925ce067bf1309d42c0cc2d92
    https://github.com/scummvm/scummvm/commit/aa9d13f84ab75cc925ce067bf1309d42c0cc2d92
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: Preliminary optimizations of blending funcs

Just committing my first attempts at optimizing BITMAP::draw and the
blendPixel function. Here's an overview of the changes (some are
temporary):

- Put the loop of BITMAP::draw into its own function, drawInner.
 I templated it so that I could put different paths into the loop that
 can be optimized out at compile time if a certain blending function
 doesn't need them, etc.

- Added Apple/ARM NEON (SIMD) intrinsics to the drawInner function
 (a minimal blend sketch follows this list). It hasn't been ported to
 SSE yet, but there is a small library that maps NEON intrinsics to
 SSE ones.

- Removed a few ifs from the inner x loop and moved them into the y loop.
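
A minimal sketch of the NEON idea described above, assuming AArch64 and
32 bpp pixels: four destination pixels are interpolated toward four
source pixels per iteration, using a simple per-byte lerp rather than
the engine's exact blender math.

#include <arm_neon.h>
#include <stdint.h>

// dst = (src * alpha + dst * (255 - alpha)) >> 8, applied per byte lane.
void blendRows(uint32_t *dst, const uint32_t *src, int count, uint8_t alpha) {
	const uint16x8_t a16  = vdupq_n_u16(alpha);
	const uint16x8_t ia16 = vdupq_n_u16(255 - alpha);
	int i = 0;
	for (; i + 4 <= count; i += 4) {
		uint8x16_t s = vld1q_u8((const uint8_t *)(src + i));
		uint8x16_t d = vld1q_u8((const uint8_t *)(dst + i));
		// Widen each half to 16 bits so the multiplies cannot overflow.
		uint16x8_t lo = vaddq_u16(vmulq_u16(vmovl_u8(vget_low_u8(s)), a16),
		                          vmulq_u16(vmovl_u8(vget_low_u8(d)), ia16));
		uint16x8_t hi = vaddq_u16(vmulq_u16(vmovl_u8(vget_high_u8(s)), a16),
		                          vmulq_u16(vmovl_u8(vget_high_u8(d)), ia16));
		// Divide by 255 approximated as >> 8, then narrow back to bytes.
		vst1q_u8((uint8_t *)(dst + i), vcombine_u8(vshrn_n_u16(lo, 8), vshrn_n_u16(hi, 8)));
	}
	for (; i < count; ++i) {	// scalar tail for the last few pixels
		uint8_t *d = (uint8_t *)(dst + i);
		const uint8_t *s = (const uint8_t *)(src + i);
		for (int c = 0; c < 4; ++c)
			d[c] = (uint8_t)((s[c] * alpha + d[c] * (255 - alpha)) >> 8);
	}
}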

Changed paths:
    benchgfx1.bmp
    engines/ags/engine/main/engine.cpp
    engines/ags/lib/allegro/surface.cpp
    engines/ags/lib/allegro/surface.h
    graphics/surface.h


diff --git a/benchgfx1.bmp b/benchgfx1.bmp
index 9a09349533a..8167853bb60 100644
Binary files a/benchgfx1.bmp and b/benchgfx1.bmp differ
diff --git a/engines/ags/engine/main/engine.cpp b/engines/ags/engine/main/engine.cpp
index 5d23c5221e7..c2bac729e7f 100644
--- a/engines/ags/engine/main/engine.cpp
+++ b/engines/ags/engine/main/engine.cpp
@@ -815,6 +815,7 @@ void allegro_bitmap_test_init() {
 	Bitmap *dest = BitmapHelper::CreateBitmap(100, 100, benchgfx1->GetColorDepth());
 	uint64_t bench_runs[] = {1000, 10000, 100000};
 	if (benchgfx1 != nullptr) {
+		_G(_blender_mode) = kRgbToRgbBlender; // Using normal blender mode
 		for (long unsigned int i = 0; i < sizeof(bench_runs)/sizeof(uint64_t); i++) {
 			Debug::Printf(kDbgMsg_Info, "Starting Allegro Bitmap Test Bench 2 (%d bpp)", benchgfx1->GetColorDepth());
 			uint32_t start = std::chrono::high_resolution_clock::now();
diff --git a/engines/ags/lib/allegro/surface.cpp b/engines/ags/lib/allegro/surface.cpp
index 70f9ce488e4..db8b7f2e566 100644
--- a/engines/ags/lib/allegro/surface.cpp
+++ b/engines/ags/lib/allegro/surface.cpp
@@ -163,8 +163,13 @@ void BITMAP::draw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
 	int yStart = (dstRect.top < destRect.top) ? dstRect.top - destRect.top : 0;
 
 #define DRAWINNER(formattype) drawInner<formattype>(yStart, xStart, transColor, alphaMask, palette, useTint, sameFormat, src, destArea, horizFlip, vertFlip, skipTrans, srcAlpha, tintRed, tintGreen, tintBlue, dstRect, srcArea)
-	if (sameFormat && format.bytesPerPixel == 4) DRAWINNER(1);
-	else DRAWINNER(0);
+	if (sameFormat && format.bytesPerPixel == 4 && _G(_blender_mode) == kRgbToRgbBlender) {
+		if (format.bShift == 0 && format.gShift == 8 && format.rShift == 16) DRAWINNER(1);
+		else DRAWINNER(0);
+	}
+	else {
+		DRAWINNER(0);
+	}
 #undef DRAWINNER
 }
 
@@ -271,8 +276,8 @@ void BITMAP::stretchDraw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
 				bDest = bSrc;
 			} else {
 				// TODO: move this to blendPixel to only do it when needed?
-				format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
-				blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, srcAlpha);
+				// format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
+				blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, srcAlpha, false, destVal);
 			}
 
 			uint32 pixel = format.ARGBToColor(aDest, rDest, gDest, bDest);
@@ -284,30 +289,37 @@ void BITMAP::stretchDraw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
 	}
 }
 
-void BITMAP::blendPixel(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &aDest, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha) const {
+void BITMAP::blendPixel(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &aDest, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha, bool useTint, byte *destVal) const {
 	switch (_G(_blender_mode)) {
 	case kSourceAlphaBlender:
+		if (!useTint && alpha != 255) format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
 		blendSourceAlpha(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, alpha);
 		break;
 	case kArgbToArgbBlender:
+		if (!useTint && alpha != 255) format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
 		blendArgbToArgb(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, alpha);
 		break;
 	case kArgbToRgbBlender:
+		if (!useTint && alpha != 255) format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
 		blendArgbToRgb(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, alpha);
 		break;
 	case kRgbToArgbBlender:
+		if (!useTint && alpha != 255) format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
 		blendRgbToArgb(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, alpha);
 		break;
 	case kRgbToRgbBlender:
+		if (!useTint && alpha != 255) format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
 		blendRgbToRgb(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, alpha);
 		break;
 	case kAlphaPreservedBlenderMode:
+		if (!useTint && alpha != 255) format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
 		blendPreserveAlpha(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, alpha);
 		break;
 	case kOpaqueBlenderMode:
 		blendOpaque(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, alpha);
 		break;
 	case kAdditiveBlenderMode:
+		if (!useTint && alpha != 255) format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
 		blendAdditiveAlpha(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, alpha);
 		break;
 	case kTintBlenderMode:
diff --git a/engines/ags/lib/allegro/surface.h b/engines/ags/lib/allegro/surface.h
index 159bfcb9395..3642b9799ad 100644
--- a/engines/ags/lib/allegro/surface.h
+++ b/engines/ags/lib/allegro/surface.h
@@ -27,6 +27,11 @@
 #include "ags/lib/allegro/color.h"
 #include "common/array.h"
 
+#if defined(__aarch64__)
+// M1/M2 SIMD intrensics
+#include "arm_neon.h"
+#endif
+
 namespace AGS3 {
 
 class BITMAP {
@@ -131,7 +136,7 @@ public:
 	// unsigned int blender_func(unsigned long x, unsigned long y, unsigned long n)
 	// when x is the sprite color, y the destination color, and n an alpha value
 
-	void blendPixel(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &aDest, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha) const;
+	void blendPixel(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &aDest, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha, bool useTint, byte *destVal) const;
 
 
 	inline void rgbBlend(uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha) const {
@@ -280,12 +285,19 @@ public:
 			                       horizFlip ? srcArea.right - 1 : srcArea.left,
 			                       vertFlip ? srcArea.bottom - 1 - yCtr :
 			                       srcArea.top + yCtr);
+			int destX = xStart, xCtr = 0, xCtrBpp = 0, xCtrWidth = dstRect.width();
+			if (xStart < 0) {
+				xCtr = -xStart;
+				xCtrBpp = xCtr * src.format.bytesPerPixel;
+				destX = 0;
+			}
+			if (xStart + xCtrWidth > destArea.w) {
+				xCtrWidth = destArea.w - xStart;
+			}
 
+			if (FormatType == 0) {
 			// Loop through the pixels of the row
-			for (int destX = xStart, xCtr = 0, xCtrBpp = 0; xCtr < dstRect.width(); ++destX, ++xCtr, xCtrBpp += src.format.bytesPerPixel) {
-				if (destX < 0 || destX >= destArea.w)
-					continue;
-
+			for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += src.format.bytesPerPixel) {
 				const byte *srcVal = srcP + xDir * xCtrBpp;
 				uint32 srcCol = getColor(srcVal, src.format.bytesPerPixel);
 
@@ -315,14 +327,14 @@ public:
 					gSrc = rgb.g;
 					bSrc = rgb.b;
 				} else {
-					if (FormatType == 1) {
-						aSrc = srcCol >> src.format.aShift & 0xff;
-						rSrc = srcCol >> src.format.rShift & 0xff;
-						gSrc = srcCol >> src.format.gShift & 0xff;
-						bSrc = srcCol >> src.format.bShift & 0xff;
-					} else {
+					// if (FormatType == 1) {
+					// 	aSrc = srcCol >> src.format.aShift & 0xff;
+					// 	rSrc = srcCol >> src.format.rShift & 0xff;
+					// 	gSrc = srcCol >> src.format.gShift & 0xff;
+					// 	bSrc = srcCol >> src.format.bShift & 0xff;
+					// } else {
 						src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
-					}
+					// }
 				}
 
 				if (srcAlpha == -1) {
@@ -343,9 +355,9 @@ public:
 						aSrc = srcAlpha;
 					} else {
 						// TODO: move this to blendPixel to only do it when needed?
-						format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
+						// format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
 					}
-					blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, srcAlpha);
+					blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, srcAlpha, useTint, destVal);
 				}
 
 				uint32 pixel = format.ARGBToColor(aDest, rDest, gDest, bDest);
@@ -353,7 +365,53 @@ public:
 					*(uint32 *)destVal = pixel;
 				else
 					*(uint16 *)destVal = pixel;
+			} // FormatType == 0
+			} else { // FormatType == 1
+			uint32x4_t maskedAlphas = vld1q_dup_u32(&alphaMask);
+			uint32x4_t transColors = vld1q_dup_u32(&transColor);
+			uint32 alpha = srcAlpha ? srcAlpha + 1 : srcAlpha;
+			uint8x16_t srcCols;
+			for (; xCtr + 4 < dstRect.width(); destX += 4, xCtr += 4, xCtrBpp += src.format.bytesPerPixel*4) {
+				uint32 *destPtr = (uint32 *)&destP[destX * format.bytesPerPixel];
+				if (srcAlpha != -1) {
+					uint8x16_t srcColsRaw = vld1q_u8(srcP + xDir * xCtrBpp);
+					uint8x16_t destColsRaw = vld1q_u8((uint8 *)destPtr);
+					uint8x16_t diff = vqsubq_u32(srcColsRaw, destColsRaw);
+					diff = vmulq_u8(diff, vmovq_n_u8(alpha));
+					diff = vshrq_n_u8(diff, 8);
+					diff = vaddq_u8(diff, destColsRaw);
+					srcCols = vld1q_u32((const uint32 *)&diff);
+				} else {
+					srcCols = vld1q_u32((const uint32 *)(srcP + xDir * xCtrBpp));
+				}
+				uint32x4_t anded = vandq_u32(srcCols, maskedAlphas);
+				uint32x4_t mask1 = skipTrans ? vceqq_u32(anded, transColors) : vmovq_n_u32(0);
+				if (srcAlpha != -1) mask1 = vorrq_u32(mask1, vmovq_n_u32(0xff000000));
+				uint32x4_t mask2 = vmvnq_u32(mask1);
+				uint32x4_t destCols2 = vandq_u32(vld1q_u32(destPtr), mask1);
+				uint32x4_t srcCols2 = vandq_u32(srcCols, mask2);
+				uint32x4_t final = vorrq_u32(destCols2, srcCols2);
+				vst1q_u32(destPtr, final);
+			}
+			// Get the last x values
+			for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += src.format.bytesPerPixel) {
+				const uint32 *srcCol = (const uint32 *)(srcP + xDir * xCtrBpp);
+				// Check if this is a transparent color we should skip
+				if (skipTrans && ((*srcCol & alphaMask) == transColor))
+					continue;
+
+				byte *destVal = (byte *)&destP[destX * format.bytesPerPixel];
+				uint32 destCol = srcAlpha == -1 ? *srcCol : *(uint32 *)destVal;
+				if (srcAlpha != -1) {
+					//uint8 aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest;
+					format.colorToARGB(destCol, aDest, rDest, gDest, bDest);
+					src.format.colorToARGB(*srcCol, aSrc, rSrc, gSrc, bSrc);
+					rgbBlend(rSrc, gSrc, bSrc, rDest, gDest, bDest, srcAlpha);
+					destCol = format.ARGBToColor(aDest, rDest, gDest, bDest);
+				}
+				*(uint32 *)destVal = destCol;
 			}
+			} // FormatType == 1
 		}
 	}
 
diff --git a/graphics/surface.h b/graphics/surface.h
index 8d56d2f8316..242dca71657 100644
--- a/graphics/surface.h
+++ b/graphics/surface.h
@@ -25,6 +25,7 @@
 #include "common/scummsys.h"
 #include "common/endian.h"
 #include "common/list.h"
+#include "common/textconsole.h"
 
 namespace Common {
 struct Rect;
@@ -124,7 +125,7 @@ public:
 	 *
 	 * @param newPixels The new pixel data.
 	 */
-	void setPixels(void *newPixels) { pixels = newPixels; }
+	void setPixels(void *newPixels) { if ((unsigned long long)newPixels & 0xf) warning("unaligned pixels!"); pixels = newPixels; }
 
 	/**
 	 * Return a pointer to the pixel at the specified point.


Commit: b7505b93c515bd7287a74a10a015129f1243a41f
    https://github.com/scummvm/scummvm/commit/b7505b93c515bd7287a74a10a015129f1243a41f
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: Optimized the kRgbToRgbBlend 32 bit code path

Fixed the blending code; it should now be 1:1 with the original Allegro
source code, just using SIMD intrinsics.
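
For reference, the scalar arithmetic this SIMD path mirrors is the
classic Allegro packed-channel trick: red and blue are blended together
in the 0x00ff00ff lanes and green separately in the 0x0000ff00 lane.
The sketch below is paraphrased from memory of Allegro's trans blender,
so treat the details as approximate:

#include <stdint.h>

// Blend two XRGB8888 pixels with alpha in 0..255 (255 = fully source).
static uint32_t rgbToRgbBlend(uint32_t src, uint32_t dst, uint32_t alpha) {
	if (alpha) alpha++;	// map 1..255 to 2..256, as Allegro does
	uint32_t rb = ((((src & 0x00ff00ff) - (dst & 0x00ff00ff)) * alpha >> 8)
	               + (dst & 0x00ff00ff)) & 0x00ff00ff;
	uint32_t g  = ((((src & 0x0000ff00) - (dst & 0x0000ff00)) * alpha >> 8)
	               + (dst & 0x0000ff00)) & 0x0000ff00;
	return rb | g;
}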

Changed paths:
    engines/ags/lib/allegro/surface.h


diff --git a/engines/ags/lib/allegro/surface.h b/engines/ags/lib/allegro/surface.h
index 3642b9799ad..a436f843561 100644
--- a/engines/ags/lib/allegro/surface.h
+++ b/engines/ags/lib/allegro/surface.h
@@ -296,121 +296,133 @@ public:
 			}
 
 			if (FormatType == 0) {
-			// Loop through the pixels of the row
-			for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += src.format.bytesPerPixel) {
-				const byte *srcVal = srcP + xDir * xCtrBpp;
-				uint32 srcCol = getColor(srcVal, src.format.bytesPerPixel);
-
-				// Check if this is a transparent color we should skip
-				if (skipTrans && ((srcCol & alphaMask) == transColor))
-					continue;
-
-				byte *destVal = (byte *)&destP[destX * format.bytesPerPixel];
-
-				// When blitting to the same format we can just copy the color
-				if (format.bytesPerPixel == 1) {
-					*destVal = srcCol;
-					continue;
-				} else if (sameFormat && srcAlpha == -1) {
-					if (format.bytesPerPixel == 4)
-						*(uint32 *)destVal = srcCol;
-					else
-						*(uint16 *)destVal = srcCol;
-					continue;
-				}
+				// Loop through the pixels of the row
+				for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += src.format.bytesPerPixel) {
+					const byte *srcVal = srcP + xDir * xCtrBpp;
+					uint32 srcCol = getColor(srcVal, src.format.bytesPerPixel);
+
+					// Check if this is a transparent color we should skip
+					if (skipTrans && ((srcCol & alphaMask) == transColor))
+						continue;
+
+					byte *destVal = (byte *)&destP[destX * format.bytesPerPixel];
+
+					// When blitting to the same format we can just copy the color
+					if (format.bytesPerPixel == 1) {
+						*destVal = srcCol;
+						continue;
+					} else if (sameFormat && srcAlpha == -1) {
+						if (format.bytesPerPixel == 4)
+							*(uint32 *)destVal = srcCol;
+						else
+							*(uint16 *)destVal = srcCol;
+						continue;
+					}
 
-				// We need the rgb values to do blending and/or convert between formats
-				if (src.format.bytesPerPixel == 1) {
-					const RGB &rgb = palette[srcCol];
-					aSrc = 0xff;
-					rSrc = rgb.r;
-					gSrc = rgb.g;
-					bSrc = rgb.b;
-				} else {
-					// if (FormatType == 1) {
-					// 	aSrc = srcCol >> src.format.aShift & 0xff;
-					// 	rSrc = srcCol >> src.format.rShift & 0xff;
-					// 	gSrc = srcCol >> src.format.gShift & 0xff;
-					// 	bSrc = srcCol >> src.format.bShift & 0xff;
-					// } else {
-						src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
-					// }
-				}
+					// We need the rgb values to do blending and/or convert between formats
+					if (src.format.bytesPerPixel == 1) {
+						const RGB &rgb = palette[srcCol];
+						aSrc = 0xff;
+						rSrc = rgb.r;
+						gSrc = rgb.g;
+						bSrc = rgb.b;
+					} else {
+						// if (FormatType == 1) {
+						// 	aSrc = srcCol >> src.format.aShift & 0xff;
+						// 	rSrc = srcCol >> src.format.rShift & 0xff;
+						// 	gSrc = srcCol >> src.format.gShift & 0xff;
+						// 	bSrc = srcCol >> src.format.bShift & 0xff;
+						// } else {
+							src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
+						// }
+					}
 
-				if (srcAlpha == -1) {
-					// This means we don't use blending.
-					aDest = aSrc;
-					rDest = rSrc;
-					gDest = gSrc;
-					bDest = bSrc;
-				} else {
-					if (useTint) {
+					if (srcAlpha == -1) {
+						// This means we don't use blending.
+						aDest = aSrc;
 						rDest = rSrc;
 						gDest = gSrc;
 						bDest = bSrc;
-						aDest = aSrc;
-						rSrc = tintRed;
-						gSrc = tintGreen;
-						bSrc = tintBlue;
-						aSrc = srcAlpha;
 					} else {
-						// TODO: move this to blendPixel to only do it when needed?
-						// format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
+						if (useTint) {
+							rDest = rSrc;
+							gDest = gSrc;
+							bDest = bSrc;
+							aDest = aSrc;
+							rSrc = tintRed;
+							gSrc = tintGreen;
+							bSrc = tintBlue;
+							aSrc = srcAlpha;
+						} else {
+							// TODO: move this to blendPixel to only do it when needed?
+							// format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
+						}
+						blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, srcAlpha, useTint, destVal);
 					}
-					blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, srcAlpha, useTint, destVal);
-				}
 
-				uint32 pixel = format.ARGBToColor(aDest, rDest, gDest, bDest);
-				if (format.bytesPerPixel == 4)
-					*(uint32 *)destVal = pixel;
-				else
-					*(uint16 *)destVal = pixel;
-			} // FormatType == 0
+					uint32 pixel = format.ARGBToColor(aDest, rDest, gDest, bDest);
+					if (format.bytesPerPixel == 4)
+						*(uint32 *)destVal = pixel;
+					else
+						*(uint16 *)destVal = pixel;
+				} // FormatType == 0
 			} else { // FormatType == 1
-			uint32x4_t maskedAlphas = vld1q_dup_u32(&alphaMask);
-			uint32x4_t transColors = vld1q_dup_u32(&transColor);
-			uint32 alpha = srcAlpha ? srcAlpha + 1 : srcAlpha;
-			uint8x16_t srcCols;
-			for (; xCtr + 4 < dstRect.width(); destX += 4, xCtr += 4, xCtrBpp += src.format.bytesPerPixel*4) {
-				uint32 *destPtr = (uint32 *)&destP[destX * format.bytesPerPixel];
-				if (srcAlpha != -1) {
-					uint8x16_t srcColsRaw = vld1q_u8(srcP + xDir * xCtrBpp);
-					uint8x16_t destColsRaw = vld1q_u8((uint8 *)destPtr);
-					uint8x16_t diff = vqsubq_u32(srcColsRaw, destColsRaw);
-					diff = vmulq_u8(diff, vmovq_n_u8(alpha));
-					diff = vshrq_n_u8(diff, 8);
-					diff = vaddq_u8(diff, destColsRaw);
-					srcCols = vld1q_u32((const uint32 *)&diff);
-				} else {
-					srcCols = vld1q_u32((const uint32 *)(srcP + xDir * xCtrBpp));
+				uint32x4_t maskedAlphas = vld1q_dup_u32(&alphaMask);
+				uint32x4_t transColors = vld1q_dup_u32(&transColor);
+				uint32 alpha = srcAlpha ? srcAlpha + 1 : srcAlpha;
+				uint32x4_t alphas = vld1q_dup_u32(&alpha);
+				for (; xCtr + 4 < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += src.format.bytesPerPixel*4) {
+					uint32 *destPtr = (uint32 *)&destP[destX * format.bytesPerPixel];
+					uint32x4_t srcColsO = vld1q_u32((const uint32 *)(srcP + xDir * xCtrBpp));
+					uint32x4_t srcCols = srcColsO;
+					if (srcAlpha != -1) {
+						uint32x4_t destCols = vld1q_u32(destPtr);
+						destCols = vandq_u32(destCols, vmovq_n_u32(0x00ffffff));
+						uint32x4_t srcColsCopy = srcCols;
+						srcColsCopy = vandq_u32(srcColsCopy, vmovq_n_u32(0xff00ff));
+						uint32x4_t destColsCopy = destCols;
+						destColsCopy = vandq_u32(destColsCopy, vmovq_n_u32(0xff00ff));
+						srcColsCopy = vsubq_u32(srcColsCopy, destColsCopy);
+						srcColsCopy = vmulq_u32(srcColsCopy, alphas);
+						srcColsCopy = vshrq_n_u32(srcColsCopy, 8);
+						srcColsCopy = vaddq_u32(srcColsCopy, destCols);
+
+						srcCols = vandq_u32(srcCols, vmovq_n_u32(0xff00));
+						destCols = vandq_u32(destCols, vmovq_n_u32(0xff00));
+						srcCols = vsubq_u32(srcCols, destCols);
+						srcCols = vmulq_u32(srcCols, alphas);
+						srcCols = vshrq_n_u32(srcCols, 8);
+						srcCols = vaddq_u32(srcCols, destCols);
+						srcColsCopy = vandq_u32(srcColsCopy, vmovq_n_u32(0xff00ff));
+						srcCols = vandq_u32(srcCols, vmovq_n_u32(0xff00));
+						srcCols = vorrq_u32(srcCols, srcColsCopy);
+					}
+					uint32x4_t anded = vandq_u32(srcColsO, maskedAlphas);
+					uint32x4_t mask1 = skipTrans ? vceqq_u32(anded, transColors) : vmovq_n_u32(0);
+					uint32x4_t mask2 = vmvnq_u32(mask1);
+					uint32x4_t destCols2 = vandq_u32(vld1q_u32(destPtr), mask1);
+					uint32x4_t srcCols2 = vandq_u32(srcCols, mask2);
+					uint32x4_t final = vorrq_u32(destCols2, srcCols2);
+					vst1q_u32(destPtr, final);
 				}
-				uint32x4_t anded = vandq_u32(srcCols, maskedAlphas);
-				uint32x4_t mask1 = skipTrans ? vceqq_u32(anded, transColors) : vmovq_n_u32(0);
-				if (srcAlpha != -1) mask1 = vorrq_u32(mask1, vmovq_n_u32(0xff000000));
-				uint32x4_t mask2 = vmvnq_u32(mask1);
-				uint32x4_t destCols2 = vandq_u32(vld1q_u32(destPtr), mask1);
-				uint32x4_t srcCols2 = vandq_u32(srcCols, mask2);
-				uint32x4_t final = vorrq_u32(destCols2, srcCols2);
-				vst1q_u32(destPtr, final);
-			}
-			// Get the last x values
-			for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += src.format.bytesPerPixel) {
-				const uint32 *srcCol = (const uint32 *)(srcP + xDir * xCtrBpp);
-				// Check if this is a transparent color we should skip
-				if (skipTrans && ((*srcCol & alphaMask) == transColor))
-					continue;
-
-				byte *destVal = (byte *)&destP[destX * format.bytesPerPixel];
-				uint32 destCol = srcAlpha == -1 ? *srcCol : *(uint32 *)destVal;
-				if (srcAlpha != -1) {
-					//uint8 aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest;
-					format.colorToARGB(destCol, aDest, rDest, gDest, bDest);
-					src.format.colorToARGB(*srcCol, aSrc, rSrc, gSrc, bSrc);
-					rgbBlend(rSrc, gSrc, bSrc, rDest, gDest, bDest, srcAlpha);
-					destCol = format.ARGBToColor(aDest, rDest, gDest, bDest);
+				// Get the last x values
+				for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += src.format.bytesPerPixel) {
+					const uint32 *srcCol = (const uint32 *)(srcP + xDir * xCtrBpp);
+					// Check if this is a transparent color we should skip
+					if (skipTrans && ((*srcCol & alphaMask) == transColor))
+						continue;
+
+					byte *destVal = (byte *)&destP[destX * format.bytesPerPixel];
+					uint32 destCol = srcAlpha == -1 ? *srcCol : *(uint32 *)destVal;
+					if (srcAlpha != -1) {
+						//uint8 aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest;
+						format.colorToARGB(destCol, aDest, rDest, gDest, bDest);
+						src.format.colorToARGB(*srcCol, aSrc, rSrc, gSrc, bSrc);
+						rgbBlend(rSrc, gSrc, bSrc, rDest, gDest, bDest, srcAlpha);
+						destCol = format.ARGBToColor(aDest, rDest, gDest, bDest);
+					}
+					*(uint32 *)destVal = destCol;
 				}
-				*(uint32 *)destVal = destCol;
-			}
 			} // FormatType == 1
 		}
 	}


Commit: 1e47d7fc56f2d83b28ade4c087d07764f72d2437
    https://github.com/scummvm/scummvm/commit/1e47d7fc56f2d83b28ade4c087d07764f72d2437
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: Accounted for hflip in drawing optimizations

Added code so that the SIMD BITMAP::draw path also works when art is
horizontally flipped.
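
The lane-reversal trick used in the diff below, shown in isolation:
when the sprite is horizontally flipped, the four pixels processed per
iteration have to be mirrored before the store. A small sketch assuming
AArch64 NEON; copyRowFlipped() is only an illustration, not an engine
function.

#include <arm_neon.h>
#include <stdint.h>

// Reverse four 32-bit pixels in a vector: {a, b, c, d} -> {d, c, b, a}.
static uint32x4_t reversePixels(uint32x4_t v) {
	v = vrev64q_u32(v);	// swap within each 64-bit half: {b, a, d, c}
	return vcombine_u32(vget_high_u32(v), vget_low_u32(v));
}

// Copy a row mirrored, four pixels at a time, so dst stays left-to-right.
void copyRowFlipped(uint32_t *dst, const uint32_t *src, int width) {
	int x = 0;
	for (; x + 4 <= width; x += 4) {
		uint32x4_t block = vld1q_u32(src + (width - 4 - x));
		vst1q_u32(dst + x, reversePixels(block));
	}
	for (; x < width; ++x)	// scalar tail
		dst[x] = src[width - 1 - x];
}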

Changed paths:
    engines/ags/lib/allegro/surface.h


diff --git a/engines/ags/lib/allegro/surface.h b/engines/ags/lib/allegro/surface.h
index a436f843561..024adebc2b7 100644
--- a/engines/ags/lib/allegro/surface.h
+++ b/engines/ags/lib/allegro/surface.h
@@ -367,13 +367,15 @@ public:
 						*(uint16 *)destVal = pixel;
 				} // FormatType == 0
 			} else { // FormatType == 1
+				const byte *srcP2 = srcP;
+				if (horizFlip && xCtr + 4 < xCtrWidth) srcP2 -= src.format.bytesPerPixel * 3;
 				uint32x4_t maskedAlphas = vld1q_dup_u32(&alphaMask);
 				uint32x4_t transColors = vld1q_dup_u32(&transColor);
 				uint32 alpha = srcAlpha ? srcAlpha + 1 : srcAlpha;
 				uint32x4_t alphas = vld1q_dup_u32(&alpha);
 				for (; xCtr + 4 < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += src.format.bytesPerPixel*4) {
 					uint32 *destPtr = (uint32 *)&destP[destX * format.bytesPerPixel];
-					uint32x4_t srcColsO = vld1q_u32((const uint32 *)(srcP + xDir * xCtrBpp));
+					uint32x4_t srcColsO = vld1q_u32((const uint32 *)(srcP2 + xDir * xCtrBpp));
 					uint32x4_t srcCols = srcColsO;
 					if (srcAlpha != -1) {
 						uint32x4_t destCols = vld1q_u32(destPtr);
@@ -403,6 +405,10 @@ public:
 					uint32x4_t destCols2 = vandq_u32(vld1q_u32(destPtr), mask1);
 					uint32x4_t srcCols2 = vandq_u32(srcCols, mask2);
 					uint32x4_t final = vorrq_u32(destCols2, srcCols2);
+					if (horizFlip) {
+						final = vrev64q_u32(final);
+						final = vcombine_u32(vget_high_u32(final), vget_low_u32(final));
+					}
 					vst1q_u32(destPtr, final);
 				}
 				// Get the last x values


Commit: dc6f6070b26d07053a0249b5e8188bfe140c2ceb
    https://github.com/scummvm/scummvm/commit/dc6f6070b26d07053a0249b5e8188bfe140c2ceb
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: Finished first full version of optimizations

Optimized most if not all code paths in BITMAP::draw. All blending
modes have been optimized with ARM NEON intrinsics, and multiple source
and destination formats are optimized (for bytes per pixel, the
following combinations have been optimized: 1 and 1, 2 and 2, 4 and 4,
and 2 and 4). After this, I am going to clean up this code and apply
more optimizations where I can, then write the SSE versions of the
functions and try to optimize the slow path as much as I can. Then I
will see what I can do with BITMAP::stretchDraw.
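
As a point of reference for the cross-format (2 and 4 bytes per pixel)
paths, expanding one RGB565 pixel to ARGB8888 in scalar code looks
roughly like this; the engine itself goes through its PixelFormat
helpers and the SIMD equivalents, so this is only an illustrative
sketch assuming those two layouts:

#include <stdint.h>

// Expand RGB565 to ARGB8888, replicating top bits into the low bits so
// that 0x1f and 0x3f both map to 0xff.
static uint32_t rgb565ToArgb8888(uint16_t c) {
	uint32_t r = (c >> 11) & 0x1f;
	uint32_t g = (c >> 5)  & 0x3f;
	uint32_t b =  c        & 0x1f;
	r = (r << 3) | (r >> 2);
	g = (g << 2) | (g >> 4);
	b = (b << 3) | (b >> 2);
	return 0xff000000u | (r << 16) | (g << 8) | b;
}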

Changed paths:
    engines/ags/lib/allegro/surface.cpp
    engines/ags/lib/allegro/surface.h


diff --git a/engines/ags/lib/allegro/surface.cpp b/engines/ags/lib/allegro/surface.cpp
index db8b7f2e566..ecc995cc094 100644
--- a/engines/ags/lib/allegro/surface.cpp
+++ b/engines/ags/lib/allegro/surface.cpp
@@ -162,13 +162,19 @@ void BITMAP::draw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
 	int xStart = (dstRect.left < destRect.left) ? dstRect.left - destRect.left : 0;
 	int yStart = (dstRect.top < destRect.top) ? dstRect.top - destRect.top : 0;
 
-#define DRAWINNER(formattype) drawInner<formattype>(yStart, xStart, transColor, alphaMask, palette, useTint, sameFormat, src, destArea, horizFlip, vertFlip, skipTrans, srcAlpha, tintRed, tintGreen, tintBlue, dstRect, srcArea)
-	if (sameFormat && format.bytesPerPixel == 4 && _G(_blender_mode) == kRgbToRgbBlender) {
-		if (format.bShift == 0 && format.gShift == 8 && format.rShift == 16) DRAWINNER(1);
-		else DRAWINNER(0);
-	}
-	else {
-		DRAWINNER(0);
+#define DRAWINNER(destBPP, srcBPP) drawInner<destBPP, srcBPP>(yStart, xStart, transColor, alphaMask, palette, useTint, sameFormat, src, destArea, horizFlip, vertFlip, skipTrans, srcAlpha, tintRed, tintGreen, tintBlue, dstRect, srcArea, _G(_blender_mode))
+	if (sameFormat) {
+		switch (format.bytesPerPixel) {
+		case 1: DRAWINNER(1, 1); break;
+		case 2: DRAWINNER(2, 2); break;
+		case 4: DRAWINNER(4, 4); break;
+		}
+	} else if (format.bytesPerPixel == 4 && src.format.bytesPerPixel == 2) { 
+		DRAWINNER(4, 2);
+	} else if (format.bytesPerPixel == 2 && src.format.bytesPerPixel == 4) {
+		DRAWINNER(2, 4);
+	} else { // Older more generic implementation (doesn't use SIMD)
+		DRAWINNER(0, 0);
 	}
 #undef DRAWINNER
 }
@@ -331,6 +337,57 @@ void BITMAP::blendPixel(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &a
 	}
 }
 
+uint32x4_t BITMAP::blendPixelSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas) const {
+	uint32x4_t srcAlphas, difAlphas, mask, ch1, ch2;
+	auto setupArgbAlphas = [&]() {
+		srcAlphas = vshrq_n_u32(srcCols, 24);
+		difAlphas = vaddq_u32(vandq_u32(alphas, vmovq_n_u32(0xff)), vmovq_n_u32(1));
+		difAlphas = vshrq_n_u32(vmulq_u32(srcAlphas, difAlphas), 8);
+		difAlphas = vshlq_n_u32(difAlphas, 24);
+		srcAlphas = vshlq_n_u32(srcAlphas, 24);
+		mask = vceqq_u32(alphas, vmovq_n_u32(0));
+		srcAlphas = vandq_u32(srcAlphas, mask);
+		difAlphas = vandq_u32(srcAlphas, vmvnq_u32(mask));
+		srcCols = vorrq_u32(srcCols, vorrq_u32(srcAlphas, difAlphas));
+	};
+	switch (_G(_blender_mode)) {
+	case kSourceAlphaBlender:
+		alphas = vshrq_n_u32(srcCols, 24);
+		return rgbBlendSIMD(srcCols, destCols, alphas, false);
+	case kArgbToArgbBlender:
+		setupArgbAlphas();
+		return argbBlendSIMD(srcCols, destCols);
+	case kArgbToRgbBlender:
+		setupArgbAlphas();
+		srcCols = argbBlendSIMD(srcCols, destCols);
+		return vandq_u32(srcCols, vmovq_n_u32(0x00ffffff));
+	case kRgbToArgbBlender:
+		ch2 = vandq_u32(srcCols, vmovq_n_u32(0x00ffffff));
+		ch2 = vorrq_u32(ch2, vshlq_n_u32(alphas, 24));
+		ch2 = argbBlendSIMD(ch2, destCols);
+		ch1 = vorrq_u32(srcCols, vmovq_n_u32(0xff000000));
+		mask = vorrq_u32(vceqq_u32(alphas, vmovq_n_u32(0)), vceqq_u32(alphas, vmovq_n_u32(0xff)));
+		ch1 = vandq_u32(ch1, mask);
+		ch2 = vandq_u32(ch2, vmvnq_u32(mask));
+		return vorrq_u32(ch1, ch2);
+	case kRgbToRgbBlender:
+		return rgbBlendSIMD(srcCols, destCols, alphas, false);
+	case kAlphaPreservedBlenderMode:
+		return rgbBlendSIMD(srcCols, destCols, alphas, true);
+	case kOpaqueBlenderMode:
+		return vorrq_u32(srcCols, vmovq_n_u32(0xff000000));
+	case kAdditiveBlenderMode:
+		srcAlphas = vaddq_u32(vshrq_n_u32(srcCols, 24), vshrq_n_u32(destCols, 24));
+		srcAlphas = vminq_u32(srcAlphas, vmovq_n_u32(0xff));
+		srcCols = vandq_u32(srcCols, vmovq_n_u32(0x00ffffff));
+		return vorrq_u32(srcCols, srcAlphas);
+	case kTintBlenderMode:
+		return blendTintSpriteSIMD(srcCols, destCols, alphas, false);
+	case kTintLightBlenderMode:
+		return blendTintSpriteSIMD(srcCols, destCols, alphas, true);
+	}
+}
+
 void BITMAP::blendTintSprite(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &aDest, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha, bool light) const {
 	// Used from draw_lit_sprite after set_blender_mode(kTintBlenderMode or kTintLightBlenderMode)
 	// Original blender function: _myblender_color32 and _myblender_color32_light
@@ -352,6 +409,92 @@ void BITMAP::blendTintSprite(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uin
 	// Preserve value in aDest
 }
 
+uint32x4_t BITMAP::blendTintSpriteSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas, bool light) const {
+	// This function is NOT 1 to 1 with the original... It just approximates it
+	// It gets the value of the dest color
+	// Then it gets the h and s of the srcCols
+
+	// srcCols[0] = A | R | G | B
+	// srcCols[1] = A | R | G | B
+	// srcCols[2] = A | R | G | B
+	// srcCols[3] = A | R | G | B
+	//  ->
+	// dda = { A[0], A[1], A[2], A[3] }
+	// ddr = { R[0], R[1], R[2], R[3] }
+	// ddg = { G[0], G[1], G[2], G[3] }
+	// ddb = { B[0], B[1], B[2], B[3] }
+	
+	float32x4_t ddr, ddg, ddb;
+	ddr = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(vshrq_n_u32(destCols, 16), vmovq_n_u32(0xff))), 1.0 / 255.0);
+	ddg = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(vshrq_n_u32(destCols, 8), vmovq_n_u32(0xff))), 1.0 / 255.0);
+	ddb = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(destCols, vmovq_n_u32(0xff))), 1.0 / 255.0);
+	float32x4_t ssr, ssg, ssb;
+	ssr = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(vshrq_n_u32(srcCols, 16), vmovq_n_u32(0xff))), 1.0 / 255.0);
+	ssg = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(vshrq_n_u32(srcCols, 8), vmovq_n_u32(0xff))), 1.0 / 255.0);
+	ssb = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(srcCols, vmovq_n_u32(0xff))), 1.0 / 255.0);
+	float32x4_t dmaxes = vmaxq_f32(ddr, vmaxq_f32(ddg, ddb));
+	float32x4_t smaxes = vmaxq_f32(ssr, vmaxq_f32(ssg, ssb));
+	//float32x4_t dmins = vminq_f32(ddr, vminq_f32(ddg, ddb));
+	float32x4_t smins = vminq_f32(ssr, vminq_f32(ssg, ssb));
+	//float32x4_t ddelta = vsubq_f32(dmaxes, dmins);
+	float32x4_t sdelta = vsubq_f32(smaxes, smins);
+
+	float32x4_t quotient, product, hr, hg, hb, hue, sat;
+	hr = vdivq_f32(vsubq_f32(ssg, ssb), sdelta);
+	quotient = vdivq_f32(hr, vmovq_n_f32(6.0));
+	product = vmulq_n_f32(quotient, 6.0);
+	hr = vmulq_n_f32(vsubq_f32(hr, product), 60.0);
+	hg = vaddq_f32(vdivq_f32(vsubq_f32(ssb, ssr), sdelta), vmovq_n_f32(2.0));
+	hb = vaddq_f32(vdivq_f32(vsubq_f32(ssr, ssg), sdelta), vmovq_n_f32(4.0));
+	float32x4_t hrfactors = vcvtnq_u32_f32(vandq_u32(vceqq_u32(vreinterpretq_u32_f32(ssr), vreinterpretq_u32_f32(smaxes)), vmovq_n_u32(1)));
+	float32x4_t hgfactors = vcvtnq_u32_f32(vandq_u32(vceqq_u32(vreinterpretq_u32_f32(ssg), vreinterpretq_u32_f32(smaxes)), vmovq_n_u32(1)));
+	float32x4_t hbfactors = vcvtnq_u32_f32(vandq_u32(vceqq_u32(vreinterpretq_u32_f32(ssb), vreinterpretq_u32_f32(smaxes)), vmovq_n_u32(1)));
+	hue = vmulq_f32(hr, hrfactors);
+	hue = vaddq_f32(hue, vmulq_f32(hg, hgfactors));
+	hue = vaddq_f32(hue, vmulq_f32(hb, hbfactors));
+	float32x4_t satfactors = vcvtnq_u32_f32(vandq_u32(vceqq_u32(vreinterpretq_u32_f32(smaxes), vmovq_n_f32(0.0)), vmovq_n_u32(1)));
+	sat = vmulq_f32(satfactors, vdivq_f32(sdelta, smaxes));
+
+	// Mess with the light
+	float32x4_t val = dmaxes;
+	if (light) {
+		val = vsubq_f32(val, vsubq_f32(vmovq_n_f32(1.0), vmulq_n_f32(vcvtq_f32_u32(alphas), 1.0 / 250.0)));
+		val = vmaxq_f32(val, vmovq_n_f32(0.0));
+	}
+		
+	// then it stiches them back together
+	float32x4_t hp = vmulq_n_f32(hue, 1.0 / 60.0);
+	uint32x4_t hpi = vcvtq_u32_f32(hp);
+	val = vmulq_n_f32(val, 255.0);
+	uint32x4_t x = vcvtq_u32_f32(vmulq_f32(val, sat));
+	uint32x4_t y = vcvtq_u32_f32(vmulq_f32(x, vsubq_f32(hue, vrndq_f32(hue))));
+	val = vaddq_f32(val, vmovq_n_f32(0.5));
+	uint32x4_t z = vcvtq_u32_f32(vsubq_f32(val, x));
+	uint32x4_t v = vcvtq_u32_f32(val);
+	
+	uint32x4_t c0 = vorrq_u32(z, vorrq_u32(vshlq_n_u32(v, 16), vshlq_n_u32(vaddq_u32(z, y), 8)));
+	uint32x4_t m0 = vceqq_u32(hpi, vmovq_n_u32(0));
+	uint32x4_t c1 = vorrq_u32(z, vorrq_u32(vshlq_n_u32(v, 8), vshlq_n_u32(vsubq_u32(v, y), 16)));
+	uint32x4_t m1 = vceqq_u32(hpi, vmovq_n_u32(1));
+	uint32x4_t c2 = vorrq_u32(vshlq_n_u32(z, 16), vorrq_u32(vshlq_n_u32(v, 8), vaddq_u32(z, y)));
+	uint32x4_t m2 = vceqq_u32(hpi, vmovq_n_u32(2));
+	uint32x4_t c3 = vorrq_u32(v, vorrq_u32(vshlq_n_u32(z, 16), vshlq_n_u32(vsubq_u32(v, y), 8)));
+	uint32x4_t m3 = vceqq_u32(hpi, vmovq_n_u32(3));
+	uint32x4_t c4 = vorrq_u32(v, vorrq_u32(vshlq_n_u32(z, 8), vshlq_n_u32(vaddq_u32(z, y), 16)));
+	uint32x4_t m4 = vceqq_u32(hpi, vmovq_n_u32(4));
+	uint32x4_t c5 = vorrq_u32(vshlq_n_u32(v, 16), vorrq_u32(vshlq_n_u32(z, 8), vsubq_u32(v, y)));
+	uint32x4_t m5 = vceqq_u32(hpi, vmovq_n_u32(5));
+
+	uint32x4_t final = vandq_u32(c0, m0);
+	final = vorrq_u32(final, vandq_u32(c1, m1));
+	final = vorrq_u32(final, vandq_u32(c2, m2));
+	final = vorrq_u32(final, vandq_u32(c3, m3));
+	final = vorrq_u32(final, vandq_u32(c4, m4));
+	final = vorrq_u32(final, vandq_u32(c5, m5));
+	final = vorrq_u32(final, vandq_u32(destCols, vmovq_n_u32(0xff000000)));
+	return final;
+}
+
 /*-------------------------------------------------------------------*/
 
 /**
diff --git a/engines/ags/lib/allegro/surface.h b/engines/ags/lib/allegro/surface.h
index 024adebc2b7..5f38d4b20c1 100644
--- a/engines/ags/lib/allegro/surface.h
+++ b/engines/ags/lib/allegro/surface.h
@@ -137,6 +137,7 @@ public:
 	// when x is the sprite color, y the destination color, and n an alpha value
 
 	void blendPixel(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &aDest, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha, bool useTint, byte *destVal) const;
+	uint32x4_t blendPixelSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas) const;
 
 
 	inline void rgbBlend(uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha) const {
@@ -159,6 +160,33 @@ public:
 		bDest = res & 0xff;
 	}
 
+	inline uint32x4_t rgbBlendSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas, bool preserveAlpha) const {
+		uint32x4_t alpha = vandq_u32(destCols, vmovq_n_u32(0xff000000));
+		uint32x4_t srcColsCopy = srcCols;
+		srcColsCopy = vandq_u32(srcColsCopy, vmovq_n_u32(0xff00ff));
+		uint32x4_t destColsCopy = destCols;
+		destColsCopy = vandq_u32(destColsCopy, vmovq_n_u32(0xff00ff));
+		srcColsCopy = vsubq_u32(srcColsCopy, destColsCopy);
+		srcColsCopy = vmulq_u32(srcColsCopy, alphas);
+		srcColsCopy = vshrq_n_u32(srcColsCopy, 8);
+		srcColsCopy = vaddq_u32(srcColsCopy, destCols);
+
+		srcCols = vandq_u32(srcCols, vmovq_n_u32(0xff00));
+		destCols = vandq_u32(destCols, vmovq_n_u32(0xff00));
+		srcCols = vsubq_u32(srcCols, destCols);
+		srcCols = vmulq_u32(srcCols, alphas);
+		srcCols = vshrq_n_u32(srcCols, 8);
+		srcCols = vaddq_u32(srcCols, destCols);
+		srcColsCopy = vandq_u32(srcColsCopy, vmovq_n_u32(0xff00ff));
+		srcCols = vandq_u32(srcCols, vmovq_n_u32(0xff00));
+		srcCols = vorrq_u32(srcCols, srcColsCopy);
+		if (preserveAlpha) {
+			srcCols = vandq_u32(srcCols, vmovq_n_u32(0x00ffffff));
+			srcCols = vorrq_u32(srcCols, alpha);
+		}
+		return srcCols;
+	}
+
 	inline void argbBlend(uint32 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &aDest, uint8 &rDest, uint8 &gDest, uint8 &bDest) const {
 		// Original logic has uint32 src and dst colors as ARGB8888
 		// ++src_alpha;
@@ -183,6 +211,36 @@ public:
 		aDest = static_cast<uint8>(255. * (sAlpha + dAlpha));
 	}
 
+	inline uint32x4_t argbBlendSIMD(uint32x4_t srcCols, uint32x4_t destCols) const {
+		float16x4_t sAlphas = vcvt_f16_f32(vcvtq_f32_u32(vshrq_n_u32(srcCols, 24)));
+		sAlphas = vdiv_f16(sAlphas, vmov_n_f16(255.0));
+		float16x8_t sAlphas1 = vcombine_f16(vmov_n_f16(vduph_lane_f16(sAlphas, 0)), vmov_n_f16(vduph_lane_f16(sAlphas, 1)));
+		float16x8_t sAlphas2 = vcombine_f16(vmov_n_f16(vduph_lane_f16(sAlphas, 2)), vmov_n_f16(vduph_lane_f16(sAlphas, 3)));
+		float16x4_t dAlphas = vcvt_f16_f32(vcvtq_f32_u32(vshrq_n_u32(srcCols, 24)));
+		dAlphas = vdiv_f16(dAlphas, vmov_n_f16(255.0));
+		dAlphas = vmul_f16(dAlphas, vsub_f16(vmov_n_f16(1.0), sAlphas));
+		float16x8_t dAlphas1 = vcombine_f16(vmov_n_f16(vduph_lane_f16(dAlphas, 0)), vmov_n_f16(vduph_lane_f16(dAlphas, 1)));
+		float16x8_t dAlphas2 = vcombine_f16(vmov_n_f16(vduph_lane_f16(dAlphas, 2)), vmov_n_f16(vduph_lane_f16(dAlphas, 3)));
+		float16x8_t srcRgb1 = vcvtq_f16_u16(vmovl_u8(vreinterpret_u8_u32(vget_low_u32(srcCols))));
+		float16x8_t destRgb1 = vcvtq_f16_u16(vmovl_u8(vreinterpret_u8_u32(vget_low_u32(destCols))));
+		float16x8_t srcRgb2 = vcvtq_f16_u16(vmovl_u8(vreinterpret_u8_u32(vget_high_u32(srcCols))));
+		float16x8_t destRgb2 = vcvtq_f16_u16(vmovl_u8(vreinterpret_u8_u32(vget_high_u32(destCols))));
+		srcRgb1 = vmulq_f16(srcRgb1, sAlphas1);
+		destRgb1 = vmulq_f16(destRgb1, dAlphas1);
+		srcRgb1 = vaddq_f16(srcRgb1, destRgb1);
+		srcRgb1 = vdivq_f16(srcRgb1, vaddq_f16(sAlphas1, dAlphas1));
+		srcRgb2 = vmulq_f16(srcRgb2, sAlphas2);
+		destRgb2 = vmulq_f16(destRgb2, dAlphas2);
+		srcRgb2 = vaddq_f16(srcRgb2, destRgb2);
+		srcRgb2 = vdivq_f16(srcRgb2, vaddq_f16(sAlphas2, dAlphas2));
+		uint16x4_t alphas = vcvta_u16_f16(vmul_n_f16(vadd_f16(sAlphas, dAlphas), 255.0));
+		srcRgb1 = vcopyq_lane_u16(srcRgb1, 0, alphas, 0);
+		srcRgb1 = vcopyq_lane_u16(srcRgb1, 4, alphas, 1);
+		srcRgb2 = vcopyq_lane_u16(srcRgb2, 0, alphas, 2);
+		srcRgb2 = vcopyq_lane_u16(srcRgb2, 4, alphas, 3);
+		return vcombine_u32(vreinterpret_u32_u8(vmovn_u16(srcRgb1)), vreinterpret_u32_u8(vmovn_u16(srcRgb2)));
+	}
+
 	// kRgbToRgbBlender
 	inline void blendRgbToRgb(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &aDest, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha) const {
 		// Default mode for set_trans_blender
@@ -271,11 +329,20 @@ public:
 	// kTintBlenderMode and kTintLightBlenderMode
 	void blendTintSprite(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &aDest, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha, bool light) const;
 
-	template<int FormatType>
-	void drawInner(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea) {
+	// kTintBlenderMode and kTintLightBlenderMode for SIMD
+	uint32x4_t blendTintSpriteSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas, bool light) const;
+
+	// This template handles 2bpp and 4bpp, the other specializations handle 1bpp and format conversion blits
+	template<int DestBytesPerPixel, int SrcBytesPerPixel>
+	void drawInner(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode) {
 		const int xDir = horizFlip ? -1 : 1;
 		byte rSrc, gSrc, bSrc, aSrc;
 		byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
+		const bool isRgbToRgbBlender = blenderMode == kRgbToRgbBlender;
+		uint32x4_t tint = vshlq_n_u32(vdupq_n_u32(srcAlpha), 24);
+		tint = vorrq_u32(tint, vshlq_n_u32(vdupq_n_u32(tintRed), 16));
+		tint = vorrq_u32(tint, vshlq_n_u32(vdupq_n_u32(tintGreen), 8));
+		tint = vorrq_u32(tint, vdupq_n_u32(tintBlue));
 		
 		for (int destY = yStart, yCtr = 0; yCtr < dstRect.height(); ++destY, ++yCtr) {
 			if (destY < 0 || destY >= destArea.h)
@@ -288,148 +355,247 @@ public:
 			int destX = xStart, xCtr = 0, xCtrBpp = 0, xCtrWidth = dstRect.width();
 			if (xStart < 0) {
 				xCtr = -xStart;
-				xCtrBpp = xCtr * src.format.bytesPerPixel;
+				xCtrBpp = xCtr * SrcBytesPerPixel;
 				destX = 0;
 			}
 			if (xStart + xCtrWidth > destArea.w) {
 				xCtrWidth = destArea.w - xStart;
 			}
 
-			if (FormatType == 0) {
-				// Loop through the pixels of the row
-				for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += src.format.bytesPerPixel) {
-					const byte *srcVal = srcP + xDir * xCtrBpp;
-					uint32 srcCol = getColor(srcVal, src.format.bytesPerPixel);
-
-					// Check if this is a transparent color we should skip
-					if (skipTrans && ((srcCol & alphaMask) == transColor))
-						continue;
-
-					byte *destVal = (byte *)&destP[destX * format.bytesPerPixel];
-
-					// When blitting to the same format we can just copy the color
-					if (format.bytesPerPixel == 1) {
-						*destVal = srcCol;
-						continue;
-					} else if (sameFormat && srcAlpha == -1) {
-						if (format.bytesPerPixel == 4)
-							*(uint32 *)destVal = srcCol;
-						else
-							*(uint16 *)destVal = srcCol;
-						continue;
-					}
 
-					// We need the rgb values to do blending and/or convert between formats
-					if (src.format.bytesPerPixel == 1) {
-						const RGB &rgb = palette[srcCol];
-						aSrc = 0xff;
-						rSrc = rgb.r;
-						gSrc = rgb.g;
-						bSrc = rgb.b;
+			const byte *srcP2 = srcP;
+			if (horizFlip && xCtr + 4 < xCtrWidth) srcP2 -= SrcBytesPerPixel * 3;
+			uint32x4_t maskedAlphas = vld1q_dup_u32(&alphaMask);
+			uint32x4_t transColors = vld1q_dup_u32(&transColor);
+			uint32 alpha = srcAlpha && isRgbToRgbBlender ? srcAlpha + 1 : srcAlpha;
+			uint32x4_t alphas = vld1q_dup_u32(&alpha);
+			for (; xCtr + 4 < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
+				byte *destPtr = destPtr = &destP[destX * DestBytesPerPixel];
+				uint32x4_t srcColsO, destCol;
+				if (SrcBytesPerPixel == 4) {
+					destCol = vld1q_u32((uint32 *)destPtr);
+					srcColsO = vld1q_u32((const uint32 *)(srcP2 + xDir * xCtrBpp));
+				} else {
+					// RGB565 -> ARGB8888
+					uint32x4_t rawDest = vmovl_u16(vld1_u16((uint16 *)destPtr));
+					uint32x4_t rawSrc = vmovl_u16(vld1_u16((const uint16 *)(srcP2 + xDir * xCtrBpp)));
+					uint32x4_t colors = vshrq_n_u32(vandq_u32(rawDest, vmovq_n_u32(0xf800)), 11);
+					uint32x4_t red = vshlq_n_u32(vorrq_u32(vshlq_n_u32(colors, 3), vshrq_n_u32(colors, 2)), 16);
+					colors = vshrq_n_u32(vandq_u32(rawDest, vmovq_n_u32(0x07e0)), 5);
+					uint32x4_t green = vshlq_n_u32(vorrq_u32(vshlq_n_u32(colors, 2), vshrq_n_u32(colors, 4)), 8);
+					colors = vandq_u32(rawDest, vmovq_n_u32(0x001f));
+					uint32x4_t blue = vorrq_u32(vshlq_n_u32(colors, 3), vshrq_n_u32(colors, 2));
+					destCol = vorrq_u32(vorrq_u32(red, green), blue);
+					
+					colors = vshrq_n_u32(vandq_u32(rawSrc, vmovq_n_u32(0xf800)), 11);
+					red = vshlq_n_u32(vorrq_u32(vshlq_n_u32(colors, 3), vshrq_n_u32(colors, 2)), 16);
+					colors = vshrq_n_u32(vandq_u32(rawSrc, vmovq_n_u32(0x07e0)), 5);
+					green = vshlq_n_u32(vorrq_u32(vshlq_n_u32(colors, 2), vshrq_n_u32(colors, 4)), 8);
+					colors = vandq_u32(rawSrc, vmovq_n_u32(0x001f));
+					blue = vorrq_u32(vshlq_n_u32(colors, 3), vshrq_n_u32(colors, 2));
+					srcColsO = vorrq_u32(vorrq_u32(vorrq_u32(red, green), blue), vmovq_n_u32(0xff000000));
+				}
+				uint32x4_t srcCols = srcColsO;
+				if (srcAlpha != -1) {
+					// take into account for useTint
+					if (useTint) {
+						srcCols = blendPixelSIMD(tint, srcCols, alphas);
 					} else {
-						// if (FormatType == 1) {
-						// 	aSrc = srcCol >> src.format.aShift & 0xff;
-						// 	rSrc = srcCol >> src.format.rShift & 0xff;
-						// 	gSrc = srcCol >> src.format.gShift & 0xff;
-						// 	bSrc = srcCol >> src.format.bShift & 0xff;
-						// } else {
-							src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
-						// }
+						srcCols = blendPixelSIMD(srcCols, destCol, alphas);
 					}
-
-					if (srcAlpha == -1) {
-						// This means we don't use blending.
-						aDest = aSrc;
+				}
+				uint32x4_t anded = vandq_u32(srcColsO, maskedAlphas);
+				uint32x4_t mask1 = skipTrans ? vceqq_u32(anded, transColors) : vmovq_n_u32(0);
+				uint32x4_t destCols2 = vandq_u32(destCol, mask1);
+				uint32x4_t srcCols2 = vandq_u32(srcCols, vmvnq_u32(mask1));
+				uint32x4_t final = vorrq_u32(destCols2, srcCols2);
+				if (horizFlip) {
+					final = vrev64q_u32(final);
+					final = vcombine_u32(vget_high_u32(final), vget_low_u32(final));
+				}
+				if (DestBytesPerPixel == 4) {
+					vst1q_u32((uint32 *)destPtr, final);
+				} else {
+					uint32x4_t final16 = vshrq_n_u32(vandq_u32(final, vmovq_n_u32(0x000000ff)), 3);
+					final16 = vorrq_u32(final16, vshlq_n_u32(vshrq_n_u32(vandq_u32(final, vmovq_n_u32(0x0000ff00)), 8+3), 5));
+					final16 = vorrq_u32(final16, vshlq_n_u32(vshrq_n_u32(vandq_u32(final, vmovq_n_u32(0x00ff0000)), 16+3), 11));
+					vst1_u16((uint16 *)destPtr, vmovn_u32(final16));
+				}
+			}
+			// Get the last x values
+			for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += SrcBytesPerPixel) {
+				const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
+				byte *destVal = (byte *)&destP[destX * DestBytesPerPixel];
+				uint32 srcCol = getColor(srcColPtr, SrcBytesPerPixel);
+				
+				// Check if this is a transparent color we should skip
+				if (skipTrans && ((srcCol & alphaMask) == transColor))
+					continue;
+
+				src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
+				if (srcAlpha != -1) {
+					if (useTint) {
 						rDest = rSrc;
 						gDest = gSrc;
 						bDest = bSrc;
+						aDest = aSrc;
+						rSrc = tintRed;
+						gSrc = tintGreen;
+						bSrc = tintBlue;
+						aSrc = srcAlpha;
 					} else {
-						if (useTint) {
-							rDest = rSrc;
-							gDest = gSrc;
-							bDest = bSrc;
-							aDest = aSrc;
-							rSrc = tintRed;
-							gSrc = tintGreen;
-							bSrc = tintBlue;
-							aSrc = srcAlpha;
-						} else {
-							// TODO: move this to blendPixel to only do it when needed?
-							// format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
-						}
-						blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, srcAlpha, useTint, destVal);
+						format.colorToARGB(getColor(destVal, DestBytesPerPixel), aDest, rDest, gDest, bDest);
 					}
+					blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, srcAlpha, useTint, destVal);
+					srcCol = format.ARGBToColor(aDest, rDest, gDest, bDest);
+				} else {
+					srcCol = format.ARGBToColor(aSrc, rSrc, gSrc, bSrc);
+				}
+				if (DestBytesPerPixel == 4)
+					*(uint32 *)destVal = srcCol;
+				else
+					*(uint16 *)destVal = srcCol;
+			}
+		}
+	}
 
-					uint32 pixel = format.ARGBToColor(aDest, rDest, gDest, bDest);
+	// Call drawInner with BytesPerPixel=0 if both formats aren't the same.
+	template<>
+	void drawInner<0, 0>(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode) {
+		const int xDir = horizFlip ? -1 : 1;
+		byte rSrc, gSrc, bSrc, aSrc;
+		byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
+		
+		for (int destY = yStart, yCtr = 0; yCtr < dstRect.height(); ++destY, ++yCtr) {
+			if (destY < 0 || destY >= destArea.h)
+				continue;
+			byte *destP = (byte *)destArea.getBasePtr(0, destY);
+			const byte *srcP = (const byte *)src.getBasePtr(
+			                       horizFlip ? srcArea.right - 1 : srcArea.left,
+			                       vertFlip ? srcArea.bottom - 1 - yCtr :
+			                       srcArea.top + yCtr);
+			int destX = xStart, xCtr = 0, xCtrBpp = 0, xCtrWidth = dstRect.width();
+			if (xStart < 0) {
+				xCtr = -xStart;
+				xCtrBpp = xCtr * src.format.bytesPerPixel;
+				destX = 0;
+			}
+			if (xStart + xCtrWidth > destArea.w) {
+				xCtrWidth = destArea.w - xStart;
+			}
+
+			// Loop through the pixels of the row
+			for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += src.format.bytesPerPixel) {
+				const byte *srcVal = srcP + xDir * xCtrBpp;
+				uint32 srcCol = getColor(srcVal, src.format.bytesPerPixel);
+
+				// Check if this is a transparent color we should skip
+				if (skipTrans && ((srcCol & alphaMask) == transColor))
+					continue;
+
+				byte *destVal = (byte *)&destP[destX * format.bytesPerPixel];
+
+				// When blitting to the same format we can just copy the color
+				if (format.bytesPerPixel == 1) {
+					*destVal = srcCol;
+					continue;
+				} else if (sameFormat && srcAlpha == -1) {
 					if (format.bytesPerPixel == 4)
-						*(uint32 *)destVal = pixel;
+						*(uint32 *)destVal = srcCol;
 					else
-						*(uint16 *)destVal = pixel;
-				} // FormatType == 0
-			} else { // FormatType == 1
-				const byte *srcP2 = srcP;
-				if (horizFlip && xCtr + 4 < xCtrWidth) srcP2 -= src.format.bytesPerPixel * 3;
-				uint32x4_t maskedAlphas = vld1q_dup_u32(&alphaMask);
-				uint32x4_t transColors = vld1q_dup_u32(&transColor);
-				uint32 alpha = srcAlpha ? srcAlpha + 1 : srcAlpha;
-				uint32x4_t alphas = vld1q_dup_u32(&alpha);
-				for (; xCtr + 4 < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += src.format.bytesPerPixel*4) {
-					uint32 *destPtr = (uint32 *)&destP[destX * format.bytesPerPixel];
-					uint32x4_t srcColsO = vld1q_u32((const uint32 *)(srcP2 + xDir * xCtrBpp));
-					uint32x4_t srcCols = srcColsO;
-					if (srcAlpha != -1) {
-						uint32x4_t destCols = vld1q_u32(destPtr);
-						destCols = vandq_u32(destCols, vmovq_n_u32(0x00ffffff));
-						uint32x4_t srcColsCopy = srcCols;
-						srcColsCopy = vandq_u32(srcColsCopy, vmovq_n_u32(0xff00ff));
-						uint32x4_t destColsCopy = destCols;
-						destColsCopy = vandq_u32(destColsCopy, vmovq_n_u32(0xff00ff));
-						srcColsCopy = vsubq_u32(srcColsCopy, destColsCopy);
-						srcColsCopy = vmulq_u32(srcColsCopy, alphas);
-						srcColsCopy = vshrq_n_u32(srcColsCopy, 8);
-						srcColsCopy = vaddq_u32(srcColsCopy, destCols);
-
-						srcCols = vandq_u32(srcCols, vmovq_n_u32(0xff00));
-						destCols = vandq_u32(destCols, vmovq_n_u32(0xff00));
-						srcCols = vsubq_u32(srcCols, destCols);
-						srcCols = vmulq_u32(srcCols, alphas);
-						srcCols = vshrq_n_u32(srcCols, 8);
-						srcCols = vaddq_u32(srcCols, destCols);
-						srcColsCopy = vandq_u32(srcColsCopy, vmovq_n_u32(0xff00ff));
-						srcCols = vandq_u32(srcCols, vmovq_n_u32(0xff00));
-						srcCols = vorrq_u32(srcCols, srcColsCopy);
-					}
-					uint32x4_t anded = vandq_u32(srcColsO, maskedAlphas);
-					uint32x4_t mask1 = skipTrans ? vceqq_u32(anded, transColors) : vmovq_n_u32(0);
-					uint32x4_t mask2 = vmvnq_u32(mask1);
-					uint32x4_t destCols2 = vandq_u32(vld1q_u32(destPtr), mask1);
-					uint32x4_t srcCols2 = vandq_u32(srcCols, mask2);
-					uint32x4_t final = vorrq_u32(destCols2, srcCols2);
-					if (horizFlip) {
-						final = vrev64q_u32(final);
-						final = vcombine_u32(vget_high_u32(final), vget_low_u32(final));
-					}
-					vst1q_u32(destPtr, final);
+						*(uint16 *)destVal = srcCol;
+					continue;
+				}
+
+				// We need the rgb values to do blending and/or convert between formats
+				if (src.format.bytesPerPixel == 1) {
+					const RGB &rgb = palette[srcCol];
+					aSrc = 0xff;
+					rSrc = rgb.r;
+					gSrc = rgb.g;
+					bSrc = rgb.b;
+				} else {
+					src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
 				}
-				// Get the last x values
-				for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += src.format.bytesPerPixel) {
-					const uint32 *srcCol = (const uint32 *)(srcP + xDir * xCtrBpp);
-					// Check if this is a transparent color we should skip
-					if (skipTrans && ((*srcCol & alphaMask) == transColor))
-						continue;
-
-					byte *destVal = (byte *)&destP[destX * format.bytesPerPixel];
-					uint32 destCol = srcAlpha == -1 ? *srcCol : *(uint32 *)destVal;
-					if (srcAlpha != -1) {
-						//uint8 aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest;
-						format.colorToARGB(destCol, aDest, rDest, gDest, bDest);
-						src.format.colorToARGB(*srcCol, aSrc, rSrc, gSrc, bSrc);
-						rgbBlend(rSrc, gSrc, bSrc, rDest, gDest, bDest, srcAlpha);
-						destCol = format.ARGBToColor(aDest, rDest, gDest, bDest);
+
+				if (srcAlpha == -1) {
+					// This means we don't use blending.
+					aDest = aSrc;
+					rDest = rSrc;
+					gDest = gSrc;
+					bDest = bSrc;
+				} else {
+					if (useTint) {
+						rDest = rSrc;
+						gDest = gSrc;
+						bDest = bSrc;
+						aDest = aSrc;
+						rSrc = tintRed;
+						gSrc = tintGreen;
+						bSrc = tintBlue;
+						aSrc = srcAlpha;
 					}
-					*(uint32 *)destVal = destCol;
+					blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, srcAlpha, useTint, destVal);
+				}
+
+				uint32 pixel = format.ARGBToColor(aDest, rDest, gDest, bDest);
+				if (format.bytesPerPixel == 4)
+					*(uint32 *)destVal = pixel;
+				else
+					*(uint16 *)destVal = pixel;
+			}
+		}
+	}
+	
+	template<>
+	void drawInner<1, 1>(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode) {
+		const int xDir = horizFlip ? -1 : 1;
+		// byte rSrc, gSrc, bSrc, aSrc;
+		// byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
+		
+		for (int destY = yStart, yCtr = 0; yCtr < dstRect.height(); ++destY, ++yCtr) {
+			if (destY < 0 || destY >= destArea.h)
+				continue;
+			byte *destP = (byte *)destArea.getBasePtr(0, destY);
+			const byte *srcP = (const byte *)src.getBasePtr(
+			                       horizFlip ? srcArea.right - 1 : srcArea.left,
+			                       vertFlip ? srcArea.bottom - 1 - yCtr :
+			                       srcArea.top + yCtr);
+			int destX = xStart, xCtr = 0, xCtrBpp = 0, xCtrWidth = dstRect.width();
+			if (xStart < 0) {
+				xCtr = -xStart;
+				xCtrBpp = xCtr * src.format.bytesPerPixel;
+				destX = 0;
+			}
+			if (xStart + xCtrWidth > destArea.w) {
+				xCtrWidth = destArea.w - xStart;
+			}
+
+			const byte *srcP2 = srcP;
+			if (horizFlip && xCtr + 16 < xCtrWidth) srcP2 -= 15;
+			uint8x16_t transColors = vld1q_dup_u8(&transColor);
+			for (; xCtr + 16 < xCtrWidth; destX += 16, xCtr += 16, xCtrBpp += 16) {
+				byte *destPtr = &destP[destX];
+				uint8x16_t destCols = vld1q_u8(destPtr);
+				uint8x16_t srcCols = vld1q_u8(srcP2 + xDir * xCtrBpp);
+				uint8x16_t mask1 = skipTrans ? vceqq_u8(srcCols, transColors) : vmovq_n_u8(0);
+				uint8x16_t final = vorrq_u8(vandq_u8(srcCols, vmvnq_u8(mask1)), vandq_u8(destCols, mask1));
+				if (horizFlip) {
+					final = vrev64q_u8(final);
+					final = vcombine_u8(vget_high_u8(final), vget_low_u8(final));
 				}
-			} // FormatType == 1
+				vst1q_u8(destPtr, final);
+			}
+			// Get the last x values
+			for (; xCtr < xCtrWidth; ++destX, ++xCtr, ++xCtrBpp) {
+				const byte *srcCol = (const byte *)(srcP + xDir * xCtrBpp);
+				// Check if this is a transparent color we should skip
+				if (skipTrans && *srcCol == transColor)
+					continue;
+
+				byte *destVal = (byte *)&destP[destX];
+				*destVal = *srcCol;
+			}
 		}
 	}
 


Commit: 6c3c94b7ffcf81ee79734c8604fa22e78a3a3549
    https://github.com/scummvm/scummvm/commit/6c3c94b7ffcf81ee79734c8604fa22e78a3a3549
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: Cleaning up and micro optimizations on bitmap

Added micro optimizations to BITMAP::draw, covering all blending modes and
combinations of source and destination pixel formats.

Changed paths:
    engines/ags/lib/allegro/surface.h
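
For context, here is a minimal sketch of the select-by-mask pattern the reworked drawInner below relies on, which lets the vector loop also cover a partial group of pixels at the end of a row. It is standalone C++ with NEON intrinsics; the helper name copyRowMasked and the assumption that row buffers are padded to a multiple of 4 pixels are illustrative, not taken from the commit.

#include <arm_neon.h>
#include <stdint.h>

// Keep the destination where skipMask is set (lanes past the row end), take the source elsewhere.
// Assumes both rows are allocated/padded to a multiple of 4 pixels so the last load/store is in bounds.
static inline void copyRowMasked(uint32_t *dst, const uint32_t *src, int width) {
	uint32x4_t laneIdx = {0, 1, 2, 3};
	uint32x4_t widthVec = vdupq_n_u32((uint32_t)width);
	for (int x = 0; x < width; x += 4) {
		// skipMask lane is all-ones when x + lane >= width
		uint32x4_t skipMask = vcgeq_u32(vaddq_u32(vdupq_n_u32((uint32_t)x), laneIdx), widthVec);
		uint32x4_t destCols = vld1q_u32(dst + x);
		uint32x4_t srcCols = vld1q_u32(src + x);
		uint32x4_t final = vorrq_u32(vandq_u32(destCols, skipMask),
		                             vandq_u32(srcCols, vmvnq_u32(skipMask)));
		vst1q_u32(dst + x, final);
	}
}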


diff --git a/engines/ags/lib/allegro/surface.h b/engines/ags/lib/allegro/surface.h
index 5f38d4b20c1..9c967a717f8 100644
--- a/engines/ags/lib/allegro/surface.h
+++ b/engines/ags/lib/allegro/surface.h
@@ -213,11 +213,11 @@ public:
 
 	inline uint32x4_t argbBlendSIMD(uint32x4_t srcCols, uint32x4_t destCols) const {
 		float16x4_t sAlphas = vcvt_f16_f32(vcvtq_f32_u32(vshrq_n_u32(srcCols, 24)));
-		sAlphas = vdiv_f16(sAlphas, vmov_n_f16(255.0));
+		sAlphas = vmul_n_f16(sAlphas, 1.0 / 255.0);
 		float16x8_t sAlphas1 = vcombine_f16(vmov_n_f16(vduph_lane_f16(sAlphas, 0)), vmov_n_f16(vduph_lane_f16(sAlphas, 1)));
 		float16x8_t sAlphas2 = vcombine_f16(vmov_n_f16(vduph_lane_f16(sAlphas, 2)), vmov_n_f16(vduph_lane_f16(sAlphas, 3)));
 		float16x4_t dAlphas = vcvt_f16_f32(vcvtq_f32_u32(vshrq_n_u32(srcCols, 24)));
-		dAlphas = vdiv_f16(dAlphas, vmov_n_f16(255.0));
+		dAlphas = vmul_n_f16(dAlphas, 1.0 / 255.0);
 		dAlphas = vmul_f16(dAlphas, vsub_f16(vmov_n_f16(1.0), sAlphas));
 		float16x8_t dAlphas1 = vcombine_f16(vmov_n_f16(vduph_lane_f16(dAlphas, 0)), vmov_n_f16(vduph_lane_f16(dAlphas, 1)));
 		float16x8_t dAlphas2 = vcombine_f16(vmov_n_f16(vduph_lane_f16(dAlphas, 2)), vmov_n_f16(vduph_lane_f16(dAlphas, 3)));
@@ -228,11 +228,13 @@ public:
 		srcRgb1 = vmulq_f16(srcRgb1, sAlphas1);
 		destRgb1 = vmulq_f16(destRgb1, dAlphas1);
 		srcRgb1 = vaddq_f16(srcRgb1, destRgb1);
-		srcRgb1 = vdivq_f16(srcRgb1, vaddq_f16(sAlphas1, dAlphas1));
+		float16x8_t alphasRec = vrecpeq_f16(vaddq_f16(sAlphas1, dAlphas1));
+		srcRgb1 = vmulq_f16(srcRgb1, alphasRec);
 		srcRgb2 = vmulq_f16(srcRgb2, sAlphas2);
 		destRgb2 = vmulq_f16(destRgb2, dAlphas2);
 		srcRgb2 = vaddq_f16(srcRgb2, destRgb2);
-		srcRgb2 = vdivq_f16(srcRgb2, vaddq_f16(sAlphas2, dAlphas2));
+		alphasRec = vrecpeq_f16(vaddq_f16(sAlphas2, dAlphas2));
+		srcRgb2 = vmulq_f16(srcRgb2, alphasRec);
 		uint16x4_t alphas = vcvta_u16_f16(vmul_n_f16(vadd_f16(sAlphas, dAlphas), 255.0));
 		srcRgb1 = vcopyq_lane_u16(srcRgb1, 0, alphas, 0);
 		srcRgb1 = vcopyq_lane_u16(srcRgb1, 4, alphas, 1);
@@ -332,130 +334,156 @@ public:
 	// kTintBlenderMode and kTintLightBlenderMode for SIMD
 	uint32x4_t blendTintSpriteSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas, bool light) const;
 
+	template<int DestBytesPerPixel, int SrcBytesPerPixel>
+	inline void drawPixelSIMD(byte *destPtr, const byte *srcP2, uint32x4_t tint, uint32x4_t alphas, uint32x4_t maskedAlphas, uint32x4_t transColors, int xDir, int xCtrBpp, int srcAlpha, int skipTrans, bool horizFlip, bool useTint, uint32x4_t skipMask) {
+		uint32x4_t srcCols, destCol;
+		if (SrcBytesPerPixel == 4) {
+			destCol = vld1q_u32((uint32 *)destPtr);
+			srcCols = vld1q_u32((const uint32 *)(srcP2 + xDir * xCtrBpp));
+		} else {
+			// RGB565 -> ARGB8888
+			uint32x4_t rawDest = vmovl_u16(vld1_u16((uint16 *)destPtr));
+			uint32x4_t rawSrc = vmovl_u16(vld1_u16((const uint16 *)(srcP2 + xDir * xCtrBpp)));
+			uint32x4_t colors = vshrq_n_u32(vandq_u32(rawDest, vmovq_n_u32(0xf800)), 11);
+			uint32x4_t red = vshlq_n_u32(vorrq_u32(vshlq_n_u32(colors, 3), vshrq_n_u32(colors, 2)), 16);
+			colors = vshrq_n_u32(vandq_u32(rawDest, vmovq_n_u32(0x07e0)), 5);
+			uint32x4_t green = vshlq_n_u32(vorrq_u32(vshlq_n_u32(colors, 2), vshrq_n_u32(colors, 4)), 8);
+			colors = vandq_u32(rawDest, vmovq_n_u32(0x001f));
+			uint32x4_t blue = vorrq_u32(vshlq_n_u32(colors, 3), vshrq_n_u32(colors, 2));
+			destCol = vorrq_u32(vorrq_u32(red, green), blue);
+			
+			colors = vshrq_n_u32(vandq_u32(rawSrc, vmovq_n_u32(0xf800)), 11);
+			red = vshlq_n_u32(vorrq_u32(vshlq_n_u32(colors, 3), vshrq_n_u32(colors, 2)), 16);
+			colors = vshrq_n_u32(vandq_u32(rawSrc, vmovq_n_u32(0x07e0)), 5);
+			green = vshlq_n_u32(vorrq_u32(vshlq_n_u32(colors, 2), vshrq_n_u32(colors, 4)), 8);
+			colors = vandq_u32(rawSrc, vmovq_n_u32(0x001f));
+			blue = vorrq_u32(vshlq_n_u32(colors, 3), vshrq_n_u32(colors, 2));
+			srcCols = vorrq_u32(vorrq_u32(vorrq_u32(red, green), blue), vmovq_n_u32(0xff000000));
+		}
+		uint32x4_t anded = vandq_u32(srcCols, maskedAlphas);
+		if (srcAlpha != -1) {
+			// take into account for useTint
+			if (useTint) {
+				srcCols = blendPixelSIMD(tint, srcCols, alphas);
+			} else {
+				srcCols = blendPixelSIMD(srcCols, destCol, alphas);
+			}
+		}
+		uint32x4_t mask1 = skipTrans ? vceqq_u32(anded, transColors) : vmovq_n_u32(0);
+		mask1 = vorrq_u32(mask1, skipMask);
+		uint32x4_t destCols2 = vandq_u32(destCol, mask1);
+		uint32x4_t srcCols2 = vandq_u32(srcCols, vmvnq_u32(mask1));
+		uint32x4_t final = vorrq_u32(destCols2, srcCols2);
+		if (horizFlip) {
+			final = vrev64q_u32(final);
+			final = vcombine_u32(vget_high_u32(final), vget_low_u32(final));
+		}
+		if (DestBytesPerPixel == 4) {
+			vst1q_u32((uint32 *)destPtr, final);
+		} else {
+			uint32x4_t final16 = vshrq_n_u32(vandq_u32(final, vmovq_n_u32(0x000000ff)), 3);
+			final16 = vorrq_u32(final16, vshlq_n_u32(vshrq_n_u32(vandq_u32(final, vmovq_n_u32(0x0000ff00)), 8+3), 5));
+			final16 = vorrq_u32(final16, vshlq_n_u32(vshrq_n_u32(vandq_u32(final, vmovq_n_u32(0x00ff0000)), 16+3), 11));
+			vst1_u16((uint16 *)destPtr, vmovn_u32(final16));
+		}
+	}
+
 	// This template handles 2bpp and 4bpp, the other specializations handle 1bpp and format conversion blits
 	template<int DestBytesPerPixel, int SrcBytesPerPixel>
 	void drawInner(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode) {
 		const int xDir = horizFlip ? -1 : 1;
 		byte rSrc, gSrc, bSrc, aSrc;
 		byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
-		const bool isRgbToRgbBlender = blenderMode == kRgbToRgbBlender;
 		uint32x4_t tint = vshlq_n_u32(vdupq_n_u32(srcAlpha), 24);
 		tint = vorrq_u32(tint, vshlq_n_u32(vdupq_n_u32(tintRed), 16));
 		tint = vorrq_u32(tint, vshlq_n_u32(vdupq_n_u32(tintGreen), 8));
 		tint = vorrq_u32(tint, vdupq_n_u32(tintBlue));
+		uint32x4_t maskedAlphas = vld1q_dup_u32(&alphaMask);
+		uint32x4_t transColors = vld1q_dup_u32(&transColor);
+		int rgbCorrectedAlpha = srcAlpha;
+		if (blenderMode != kRgbToArgbBlender && blenderMode != kTintBlenderMode &&
+			blenderMode != kTintLightBlenderMode && blenderMode != kOpaqueBlenderMode &&
+			blenderMode != kArgbToRgbBlender) {
+			rgbCorrectedAlpha += !!srcAlpha;
+		}
+		uint32x4_t alphas = vld1q_dup_u32(&rgbCorrectedAlpha);
+		uint32x4_t addIndexes = {0, 1, 2, 3};
+		if (horizFlip) addIndexes = {3, 2, 1, 0};
 		
-		for (int destY = yStart, yCtr = 0; yCtr < dstRect.height(); ++destY, ++yCtr) {
-			if (destY < 0 || destY >= destArea.h)
-				continue;
-			byte *destP = (byte *)destArea.getBasePtr(0, destY);
-			const byte *srcP = (const byte *)src.getBasePtr(
-			                       horizFlip ? srcArea.right - 1 : srcArea.left,
-			                       vertFlip ? srcArea.bottom - 1 - yCtr :
-			                       srcArea.top + yCtr);
-			int destX = xStart, xCtr = 0, xCtrBpp = 0, xCtrWidth = dstRect.width();
-			if (xStart < 0) {
-				xCtr = -xStart;
-				xCtrBpp = xCtr * SrcBytesPerPixel;
-				destX = 0;
-			}
-			if (xStart + xCtrWidth > destArea.w) {
-				xCtrWidth = destArea.w - xStart;
+		int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = dstRect.width();
+		if (xStart + xCtrWidth > destArea.w) {
+			xCtrWidth = destArea.w - xStart;
+		}
+		if (xStart < 0) {
+			xCtrStart = -xStart;
+			xCtrBppStart = xCtrStart * SrcBytesPerPixel;
+			xStart = 0;
+		}
+		int destY = yStart, yCtr = 0, yCtrHeight = (xCtrWidth % 4 == 0) ? dstRect.height() : (dstRect.height() - 1);
+		if (yStart < 0) {
+			yCtr = -yStart;
+			destY = 0;
+		}
+		if (yStart + yCtrHeight > destArea.h) {
+			yCtrHeight = destArea.h - yStart;
+		}
+		
+		byte *destP = (byte *)destArea.getBasePtr(0, destY);
+		const byte *srcP = (const byte *)src.getBasePtr(
+		                       horizFlip ? srcArea.right - 4 : srcArea.left,
+		                       vertFlip ? srcArea.bottom - 1 - yCtr : srcArea.top + yCtr);
+		for (; yCtr < yCtrHeight; ++destY, ++yCtr) {
+			uint32x4_t xCtrWidthSIMD = vmovq_n_u32(xCtrWidth);
+			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
+				byte *destPtr = &destP[destX * DestBytesPerPixel];
+				uint32x4_t skipMask = vcgeq_u32(vaddq_u32(vdupq_n_u32(xCtr), addIndexes), xCtrWidthSIMD);
+				drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
 			}
 
+			destP += destArea.pitch;
+			srcP += vertFlip ? -src.pitch : src.pitch;
+		}
 
-			const byte *srcP2 = srcP;
-			if (horizFlip && xCtr + 4 < xCtrWidth) srcP2 -= SrcBytesPerPixel * 3;
-			uint32x4_t maskedAlphas = vld1q_dup_u32(&alphaMask);
-			uint32x4_t transColors = vld1q_dup_u32(&transColor);
-			uint32 alpha = srcAlpha && isRgbToRgbBlender ? srcAlpha + 1 : srcAlpha;
-			uint32x4_t alphas = vld1q_dup_u32(&alpha);
-			for (; xCtr + 4 < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
-				byte *destPtr = destPtr = &destP[destX * DestBytesPerPixel];
-				uint32x4_t srcColsO, destCol;
-				if (SrcBytesPerPixel == 4) {
-					destCol = vld1q_u32((uint32 *)destPtr);
-					srcColsO = vld1q_u32((const uint32 *)(srcP2 + xDir * xCtrBpp));
-				} else {
-					// RGB565 -> ARGB8888
-					uint32x4_t rawDest = vmovl_u16(vld1_u16((uint16 *)destPtr));
-					uint32x4_t rawSrc = vmovl_u16(vld1_u16((const uint16 *)(srcP2 + xDir * xCtrBpp)));
-					uint32x4_t colors = vshrq_n_u32(vandq_u32(rawDest, vmovq_n_u32(0xf800)), 11);
-					uint32x4_t red = vshlq_n_u32(vorrq_u32(vshlq_n_u32(colors, 3), vshrq_n_u32(colors, 2)), 16);
-					colors = vshrq_n_u32(vandq_u32(rawDest, vmovq_n_u32(0x07e0)), 5);
-					uint32x4_t green = vshlq_n_u32(vorrq_u32(vshlq_n_u32(colors, 2), vshrq_n_u32(colors, 4)), 8);
-					colors = vandq_u32(rawDest, vmovq_n_u32(0x001f));
-					uint32x4_t blue = vorrq_u32(vshlq_n_u32(colors, 3), vshrq_n_u32(colors, 2));
-					destCol = vorrq_u32(vorrq_u32(red, green), blue);
-					
-					colors = vshrq_n_u32(vandq_u32(rawSrc, vmovq_n_u32(0xf800)), 11);
-					red = vshlq_n_u32(vorrq_u32(vshlq_n_u32(colors, 3), vshrq_n_u32(colors, 2)), 16);
-					colors = vshrq_n_u32(vandq_u32(rawSrc, vmovq_n_u32(0x07e0)), 5);
-					green = vshlq_n_u32(vorrq_u32(vshlq_n_u32(colors, 2), vshrq_n_u32(colors, 4)), 8);
-					colors = vandq_u32(rawSrc, vmovq_n_u32(0x001f));
-					blue = vorrq_u32(vshlq_n_u32(colors, 3), vshrq_n_u32(colors, 2));
-					srcColsO = vorrq_u32(vorrq_u32(vorrq_u32(red, green), blue), vmovq_n_u32(0xff000000));
-				}
-				uint32x4_t srcCols = srcColsO;
-				if (srcAlpha != -1) {
-					// take into account for useTint
-					if (useTint) {
-						srcCols = blendPixelSIMD(tint, srcCols, alphas);
-					} else {
-						srcCols = blendPixelSIMD(srcCols, destCol, alphas);
-					}
-				}
-				uint32x4_t anded = vandq_u32(srcColsO, maskedAlphas);
-				uint32x4_t mask1 = skipTrans ? vceqq_u32(anded, transColors) : vmovq_n_u32(0);
-				uint32x4_t destCols2 = vandq_u32(destCol, mask1);
-				uint32x4_t srcCols2 = vandq_u32(srcCols, vmvnq_u32(mask1));
-				uint32x4_t final = vorrq_u32(destCols2, srcCols2);
-				if (horizFlip) {
-					final = vrev64q_u32(final);
-					final = vcombine_u32(vget_high_u32(final), vget_low_u32(final));
-				}
-				if (DestBytesPerPixel == 4) {
-					vst1q_u32((uint32 *)destPtr, final);
-				} else {
-					uint32x4_t final16 = vshrq_n_u32(vandq_u32(final, vmovq_n_u32(0x000000ff)), 3);
-					final16 = vorrq_u32(final16, vshlq_n_u32(vshrq_n_u32(vandq_u32(final, vmovq_n_u32(0x0000ff00)), 8+3), 5));
-					final16 = vorrq_u32(final16, vshlq_n_u32(vshrq_n_u32(vandq_u32(final, vmovq_n_u32(0x00ff0000)), 16+3), 11));
-					vst1_u16((uint16 *)destPtr, vmovn_u32(final16));
-				}
-			}
-			// Get the last x values
-			for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += SrcBytesPerPixel) {
-				const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
-				byte *destVal = (byte *)&destP[destX * DestBytesPerPixel];
-				uint32 srcCol = getColor(srcColPtr, SrcBytesPerPixel);
-				
-				// Check if this is a transparent color we should skip
-				if (skipTrans && ((srcCol & alphaMask) == transColor))
-					continue;
+		// Get the last x values of the last row
+		if (xCtrWidth % 4 == 0) return;
+		int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart;
+		for (; xCtr + 4 < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
+			byte *destPtr = &destP[destX * DestBytesPerPixel];
+			drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, vmovq_n_u32(0));
+		}
+		if (horizFlip) srcP += SrcBytesPerPixel * 3;
+		for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += SrcBytesPerPixel) {
+			const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
+			byte *destVal = (byte *)&destP[destX * DestBytesPerPixel];
+			uint32 srcCol = getColor(srcColPtr, SrcBytesPerPixel);
+			
+			// Check if this is a transparent color we should skip
+			if (skipTrans && ((srcCol & alphaMask) == transColor))
+				continue;
 
-				src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
-				if (srcAlpha != -1) {
-					if (useTint) {
-						rDest = rSrc;
-						gDest = gSrc;
-						bDest = bSrc;
-						aDest = aSrc;
-						rSrc = tintRed;
-						gSrc = tintGreen;
-						bSrc = tintBlue;
-						aSrc = srcAlpha;
-					} else {
-						format.colorToARGB(getColor(destVal, DestBytesPerPixel), aDest, rDest, gDest, bDest);
-					}
-					blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, srcAlpha, useTint, destVal);
-					srcCol = format.ARGBToColor(aDest, rDest, gDest, bDest);
+			src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
+			if (srcAlpha != -1) {
+				if (useTint) {
+					rDest = rSrc;
+					gDest = gSrc;
+					bDest = bSrc;
+					aDest = aSrc;
+					rSrc = tintRed;
+					gSrc = tintGreen;
+					bSrc = tintBlue;
+					aSrc = srcAlpha;
 				} else {
-					srcCol = format.ARGBToColor(aSrc, rSrc, gSrc, bSrc);
+					format.colorToARGB(getColor(destVal, DestBytesPerPixel), aDest, rDest, gDest, bDest);
 				}
-				if (DestBytesPerPixel == 4)
-					*(uint32 *)destVal = srcCol;
-				else
-					*(uint16 *)destVal = srcCol;
+				blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, srcAlpha, useTint, destVal);
+				srcCol = format.ARGBToColor(aDest, rDest, gDest, bDest);
+			} else {
+				srcCol = format.ARGBToColor(aSrc, rSrc, gSrc, bSrc);
 			}
+			if (DestBytesPerPixel == 4)
+				*(uint32 *)destVal = srcCol;
+			else
+				*(uint16 *)destVal = srcCol;
 		}
 	}
 
@@ -466,26 +494,32 @@ public:
 		byte rSrc, gSrc, bSrc, aSrc;
 		byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
 		
-		for (int destY = yStart, yCtr = 0; yCtr < dstRect.height(); ++destY, ++yCtr) {
-			if (destY < 0 || destY >= destArea.h)
-				continue;
-			byte *destP = (byte *)destArea.getBasePtr(0, destY);
-			const byte *srcP = (const byte *)src.getBasePtr(
-			                       horizFlip ? srcArea.right - 1 : srcArea.left,
-			                       vertFlip ? srcArea.bottom - 1 - yCtr :
-			                       srcArea.top + yCtr);
-			int destX = xStart, xCtr = 0, xCtrBpp = 0, xCtrWidth = dstRect.width();
-			if (xStart < 0) {
-				xCtr = -xStart;
-				xCtrBpp = xCtr * src.format.bytesPerPixel;
-				destX = 0;
-			}
-			if (xStart + xCtrWidth > destArea.w) {
-				xCtrWidth = destArea.w - xStart;
-			}
+		int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = dstRect.width();
+		if (xStart + xCtrWidth > destArea.w) {
+			xCtrWidth = destArea.w - xStart;
+		}
+		if (xStart < 0) {
+			xCtrStart = -xStart;
+			xCtrBppStart = xCtrStart * src.format.bytesPerPixel;
+			xStart = 0;
+		}
+		int destY = yStart, yCtr = 0, yCtrHeight = dstRect.height();
+		if (yStart < 0) {
+			yCtr = -yStart;
+			destY = 0;
+		}
+		if (yStart + yCtrHeight > destArea.h) {
+			yCtrHeight = destArea.h - yStart;
+		}
 
+		byte *destP = (byte *)destArea.getBasePtr(0, destY);
+		const byte *srcP = (const byte *)src.getBasePtr(
+		                       horizFlip ? srcArea.right - 1 : srcArea.left,
+		                       vertFlip ? srcArea.bottom - 1 - yCtr :
+		                       srcArea.top + yCtr);
+		for (; yCtr < dstRect.height(); ++destY, ++yCtr) {
 			// Loop through the pixels of the row
-			for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += src.format.bytesPerPixel) {
+			for (int destX = xStart, xCtr = xCtrStart, xCtrBpp = xCtrBppStart; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += src.format.bytesPerPixel) {
 				const byte *srcVal = srcP + xDir * xCtrBpp;
 				uint32 srcCol = getColor(srcVal, src.format.bytesPerPixel);
 
@@ -544,40 +578,44 @@ public:
 				else
 					*(uint16 *)destVal = pixel;
 			}
+
+			destP += destArea.pitch;
+			srcP += vertFlip ? -src.pitch : src.pitch;
 		}
 	}
 	
 	template<>
 	void drawInner<1, 1>(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode) {
 		const int xDir = horizFlip ? -1 : 1;
-		// byte rSrc, gSrc, bSrc, aSrc;
-		// byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
+		uint8x16_t transColors = vld1q_dup_u8(&transColor);
 		
-		for (int destY = yStart, yCtr = 0; yCtr < dstRect.height(); ++destY, ++yCtr) {
-			if (destY < 0 || destY >= destArea.h)
-				continue;
-			byte *destP = (byte *)destArea.getBasePtr(0, destY);
-			const byte *srcP = (const byte *)src.getBasePtr(
-			                       horizFlip ? srcArea.right - 1 : srcArea.left,
-			                       vertFlip ? srcArea.bottom - 1 - yCtr :
-			                       srcArea.top + yCtr);
-			int destX = xStart, xCtr = 0, xCtrBpp = 0, xCtrWidth = dstRect.width();
-			if (xStart < 0) {
-				xCtr = -xStart;
-				xCtrBpp = xCtr * src.format.bytesPerPixel;
-				destX = 0;
-			}
-			if (xStart + xCtrWidth > destArea.w) {
-				xCtrWidth = destArea.w - xStart;
-			}
-
-			const byte *srcP2 = srcP;
-			if (horizFlip && xCtr + 16 < xCtrWidth) srcP2 -= 15;
-			uint8x16_t transColors = vld1q_dup_u8(&transColor);
-			for (; xCtr + 16 < xCtrWidth; destX += 16, xCtr += 16, xCtrBpp += 16) {
+		int xCtrStart = 0, xCtrWidth = dstRect.width();
+		if (xStart + xCtrWidth > destArea.w) {
+			xCtrWidth = destArea.w - xStart;
+		}
+		if (xStart < 0) {
+			xCtrStart = -xStart;
+			xStart = 0;
+		}
+		int destY = yStart, yCtr = 0, yCtrHeight = dstRect.height();
+		if (yStart < 0) {
+			yCtr = -yStart;
+			destY = 0;
+		}
+		if (yStart + yCtrHeight > destArea.h) {
+			yCtrHeight = destArea.h - yStart;
+		}
+		
+		byte *destP = (byte *)destArea.getBasePtr(0, destY);
+		const byte *srcP = (const byte *)src.getBasePtr(
+		                       horizFlip ? srcArea.right - 16 : srcArea.left,
+		                       vertFlip ? srcArea.bottom - 1 - yCtr : srcArea.top + yCtr);
+		for (; yCtr < yCtrHeight; ++destY, ++yCtr) {
+			int xCtr = xCtrStart, destX = xStart;
+			for (; xCtr + 16 < xCtrWidth; destX += 16, xCtr += 16) {
 				byte *destPtr = &destP[destX];
 				uint8x16_t destCols = vld1q_u8(destPtr);
-				uint8x16_t srcCols = vld1q_u8(srcP2 + xDir * xCtrBpp);
+				uint8x16_t srcCols = vld1q_u8(srcP + xDir * xCtr);
 				uint8x16_t mask1 = skipTrans ? vceqq_u8(srcCols, transColors) : vmovq_n_u8(0);
 				uint8x16_t final = vorrq_u8(vandq_u8(srcCols, vmvnq_u8(mask1)), vandq_u8(destCols, mask1));
 				if (horizFlip) {
@@ -587,8 +625,9 @@ public:
 				vst1q_u8(destPtr, final);
 			}
 			// Get the last x values
-			for (; xCtr < xCtrWidth; ++destX, ++xCtr, ++xCtrBpp) {
-				const byte *srcCol = (const byte *)(srcP + xDir * xCtrBpp);
+			if (horizFlip) srcP += 15;
+			for (; xCtr < xCtrWidth; ++destX, ++xCtr) {
+				const byte *srcCol = (const byte *)(srcP + xDir * xCtr);
 				// Check if this is a transparent color we should skip
 				if (skipTrans && *srcCol == transColor)
 					continue;
@@ -596,6 +635,9 @@ public:
 				byte *destVal = (byte *)&destP[destX];
 				*destVal = *srcCol;
 			}
+			if (horizFlip) srcP -= 15;
+			destP += destArea.pitch;
+			srcP += vertFlip ? -src.pitch : src.pitch;
 		}
 	}
 


Commit: 499dfd7d7a6737516cf1db9c02acad5db8587ae3
    https://github.com/scummvm/scummvm/commit/499dfd7d7a6737516cf1db9c02acad5db8587ae3
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: Added 2Bpp specific code path optimization

Added a template specialization for 2bpp to 2bpp blits in
BITMAP::drawInner, which makes 2bpp to 2bpp blits around twice as fast as
normal 4bpp to 4bpp blitting.

Changed paths:
    engines/ags/engine/main/engine.cpp
    engines/ags/lib/allegro/surface.cpp
    engines/ags/lib/allegro/surface.h
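
For context, here is a minimal scalar sketch of the per-pixel math that the new simd2BppTo4Bpp helper below vectorizes: each RGB565 component is widened to 8 bits by replicating its top bits, and full alpha is tagged on. The helper name rgb565ToArgb8888 is illustrative, not from the commit.

#include <cstdint>

static inline uint32_t rgb565ToArgb8888(uint16_t c) {
	uint32_t r5 = (c >> 11) & 0x1f, g6 = (c >> 5) & 0x3f, b5 = c & 0x1f;
	uint32_t r = (r5 << 3) | (r5 >> 2); // 5 -> 8 bits
	uint32_t g = (g6 << 2) | (g6 >> 4); // 6 -> 8 bits
	uint32_t b = (b5 << 3) | (b5 >> 2); // 5 -> 8 bits
	return 0xff000000u | (r << 16) | (g << 8) | b;
}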


diff --git a/engines/ags/engine/main/engine.cpp b/engines/ags/engine/main/engine.cpp
index c2bac729e7f..eedf6b3299c 100644
--- a/engines/ags/engine/main/engine.cpp
+++ b/engines/ags/engine/main/engine.cpp
@@ -813,6 +813,8 @@ void allegro_bitmap_test_init() {
 	
 	Bitmap *benchgfx1 = BitmapHelper::CreateRawBitmapOwner(load_bmp("benchgfx1.bmp", nullptr));
 	Bitmap *dest = BitmapHelper::CreateBitmap(100, 100, benchgfx1->GetColorDepth());
+	Bitmap *dest2 = BitmapHelper::CreateBitmapCopy(dest, 16);
+	Bitmap *benchgfx2 = BitmapHelper::CreateBitmapCopy(benchgfx1, 16);
 	uint64_t bench_runs[] = {1000, 10000, 100000};
 	if (benchgfx1 != nullptr) {
 		_G(_blender_mode) = kRgbToRgbBlender; // Using normal blender mode
@@ -825,10 +827,21 @@ void allegro_bitmap_test_init() {
 			uint32_t end = std::chrono::high_resolution_clock::now();
 			Debug::Printf(kDbgMsg_Info, "Done! Results (%llu iterations):", bench_runs[i]);
 			Debug::Printf(kDbgMsg_Info, "exec time (mills): %u", end - start);
+			
+			Debug::Printf(kDbgMsg_Info, "Starting Allegro Bitmap Test Bench 2 (16 bpp)");
+			start = std::chrono::high_resolution_clock::now();
+			for (uint64_t j = 0; j < bench_runs[i]; j++) {
+				dest2->Blit(benchgfx2, 0, 0, kBitmap_Transparency);
+			}
+			end = std::chrono::high_resolution_clock::now();
+			Debug::Printf(kDbgMsg_Info, "Done! Results (%llu iterations):", bench_runs[i]);
+			Debug::Printf(kDbgMsg_Info, "exec time (mills): %u", end - start);
 		}
 		
 		delete benchgfx1;
 		delete dest;
+		delete benchgfx2;
+		delete dest2;
 	} else {
 		warning("Couldn't load the test bench graphics!");
 	}
diff --git a/engines/ags/lib/allegro/surface.cpp b/engines/ags/lib/allegro/surface.cpp
index ecc995cc094..e3920da462e 100644
--- a/engines/ags/lib/allegro/surface.cpp
+++ b/engines/ags/lib/allegro/surface.cpp
@@ -388,6 +388,32 @@ uint32x4_t BITMAP::blendPixelSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint3
 	}
 }
 
+uint16x8_t BITMAP::blendPixelSIMD2Bpp(uint16x8_t srcCols, uint16x8_t destCols, uint16x8_t alphas) const {
+	switch (_G(_blender_mode)) {
+	case kSourceAlphaBlender:
+	case kAdditiveBlenderMode:
+	case kOpaqueBlenderMode:
+		return rgbBlendSIMD2Bpp(srcCols, destCols, vmovq_n_u16(0xff));
+	case kArgbToArgbBlender:
+	case kArgbToRgbBlender:
+	case kRgbToArgbBlender:
+	case kRgbToRgbBlender:
+	case kAlphaPreservedBlenderMode:
+		return rgbBlendSIMD2Bpp(srcCols, destCols, alphas);
+	case kTintBlenderMode:
+	case kTintLightBlenderMode:
+		uint32x4_t srcColsLo = simd2BppTo4Bpp(vget_low_u16(srcCols));
+		uint32x4_t srcColsHi = simd2BppTo4Bpp(vget_high_u16(srcCols));
+		uint32x4_t destColsLo = simd2BppTo4Bpp(vget_low_u16(destCols));
+		uint32x4_t destColsHi = simd2BppTo4Bpp(vget_high_u16(destCols));
+		uint32x4_t alphasLo = simd2BppTo4Bpp(vget_low_u16(alphas));
+		uint32x4_t alphasHi = simd2BppTo4Bpp(vget_high_u16(alphas));
+		uint16x4_t lo = simd4BppTo2Bpp(blendTintSpriteSIMD(srcColsLo, destColsLo, alphasLo, _G(_blender_mode) == kTintLightBlenderMode));
+		uint16x4_t hi = simd4BppTo2Bpp(blendTintSpriteSIMD(srcColsHi, destColsHi, alphasHi, _G(_blender_mode) == kTintLightBlenderMode));
+		return vcombine_u16(lo, hi);
+	}
+}
+
 void BITMAP::blendTintSprite(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &aDest, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha, bool light) const {
 	// Used from draw_lit_sprite after set_blender_mode(kTintBlenderMode or kTintLightBlenderMode)
 	// Original blender function: _myblender_color32 and _myblender_color32_light
diff --git a/engines/ags/lib/allegro/surface.h b/engines/ags/lib/allegro/surface.h
index 9c967a717f8..71367a74bb4 100644
--- a/engines/ags/lib/allegro/surface.h
+++ b/engines/ags/lib/allegro/surface.h
@@ -138,6 +138,7 @@ public:
 
 	void blendPixel(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &aDest, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha, bool useTint, byte *destVal) const;
 	uint32x4_t blendPixelSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas) const;
+	uint16x8_t blendPixelSIMD2Bpp(uint16x8_t srcCols, uint16x8_t destCols, uint16x8_t alphas) const;
 
 
 	inline void rgbBlend(uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha) const {
@@ -160,6 +161,31 @@ public:
 		bDest = res & 0xff;
 	}
 
+	inline uint16x8_t rgbBlendSIMD2Bpp(uint16x8_t srcCols, uint16x8_t destCols, uint16x8_t alphas) const {
+		uint16x8_t srcComps[] = {
+			vandq_u16(srcCols, vmovq_n_u16(0x1f)),
+			vandq_u16(vshrq_n_u16(srcCols, 5), vmovq_n_u16(0x3f)),
+			vshrq_n_u16(srcCols, 11),
+		}, destComps[] = {
+			vandq_u16(destCols, vmovq_n_u16(0x1f)),
+			vandq_u16(vshrq_n_u16(destCols, 5), vmovq_n_u16(0x3f)),
+			vshrq_n_u16(destCols, 11),
+		};
+		uint16x8_t diffs[] = {
+			vsubq_u16(srcComps[0], destComps[0]), // B
+			vsubq_u16(srcComps[1], destComps[1]), // G
+			vsubq_u16(srcComps[2], destComps[2]), // R
+		};
+		alphas = vshrq_n_u16(alphas, 2);
+		diffs[1] = vshrq_n_u16(vmulq_u16(diffs[1], alphas), 6);
+		alphas = vshrq_n_u16(alphas, 1);
+		diffs[0] = vshrq_n_u16(vmulq_u16(diffs[0], alphas), 5);
+		diffs[2] = vshrq_n_u16(vmulq_u16(diffs[2], alphas), 5);
+		diffs[0] = vorrq_u16(diffs[0], vshlq_n_u16(diffs[1], 5));
+		diffs[0] = vorrq_u16(diffs[0], vshlq_n_u16(diffs[2], 11));
+		return vaddq_u16(diffs[0], destCols);
+	}
+
 	inline uint32x4_t rgbBlendSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas, bool preserveAlpha) const {
 		uint32x4_t alpha = vandq_u32(destCols, vmovq_n_u32(0xff000000));
 		uint32x4_t srcColsCopy = srcCols;
@@ -334,6 +360,24 @@ public:
 	// kTintBlenderMode and kTintLightBlenderMode for SIMD
 	uint32x4_t blendTintSpriteSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas, bool light) const;
 
+	inline uint32x4_t simd2BppTo4Bpp(uint16x4_t pixels) const {
+		uint32x4_t x = vmovl_u16(pixels);
+		uint32x4_t c = vshrq_n_u32(vandq_u32(x, vmovq_n_u32(0xf800)), 11);
+		uint32x4_t r = vshlq_n_u32(vorrq_u32(vshlq_n_u32(c, 3), vshrq_n_u32(c, 2)), 16);
+		c = vshrq_n_u32(vandq_u32(x, vmovq_n_u32(0x07e0)), 5);
+		uint32x4_t g = vshlq_n_u32(vorrq_u32(vshlq_n_u32(c, 2), vshrq_n_u32(c, 4)), 8);
+		c = vandq_u32(x, vmovq_n_u32(0x001f));
+		uint32x4_t b = vorrq_u32(vshlq_n_u32(c, 3), vshrq_n_u32(c, 2));
+		return vorrq_u32(vorrq_u32(vorrq_u32(r, g), b), vmovq_n_u32(0xff000000));
+	}
+
+	inline uint16x4_t simd4BppTo2Bpp(uint32x4_t pixels) const {
+		uint32x4_t x = vshrq_n_u32(vandq_u32(pixels, vmovq_n_u32(0x000000ff)), 3);
+		x = vorrq_u32(x, vshlq_n_u32(vshrq_n_u32(vandq_u32(pixels, vmovq_n_u32(0x0000ff00)), 8+3), 5));
+		x = vorrq_u32(x, vshlq_n_u32(vshrq_n_u32(vandq_u32(pixels, vmovq_n_u32(0x00ff0000)), 16+3), 11));
+		return vmovn_u32(x);
+	}
+
 	template<int DestBytesPerPixel, int SrcBytesPerPixel>
 	inline void drawPixelSIMD(byte *destPtr, const byte *srcP2, uint32x4_t tint, uint32x4_t alphas, uint32x4_t maskedAlphas, uint32x4_t transColors, int xDir, int xCtrBpp, int srcAlpha, int skipTrans, bool horizFlip, bool useTint, uint32x4_t skipMask) {
 		uint32x4_t srcCols, destCol;
@@ -341,24 +385,8 @@ public:
 			destCol = vld1q_u32((uint32 *)destPtr);
 			srcCols = vld1q_u32((const uint32 *)(srcP2 + xDir * xCtrBpp));
 		} else {
-			// RGB565 -> ARGB8888
-			uint32x4_t rawDest = vmovl_u16(vld1_u16((uint16 *)destPtr));
-			uint32x4_t rawSrc = vmovl_u16(vld1_u16((const uint16 *)(srcP2 + xDir * xCtrBpp)));
-			uint32x4_t colors = vshrq_n_u32(vandq_u32(rawDest, vmovq_n_u32(0xf800)), 11);
-			uint32x4_t red = vshlq_n_u32(vorrq_u32(vshlq_n_u32(colors, 3), vshrq_n_u32(colors, 2)), 16);
-			colors = vshrq_n_u32(vandq_u32(rawDest, vmovq_n_u32(0x07e0)), 5);
-			uint32x4_t green = vshlq_n_u32(vorrq_u32(vshlq_n_u32(colors, 2), vshrq_n_u32(colors, 4)), 8);
-			colors = vandq_u32(rawDest, vmovq_n_u32(0x001f));
-			uint32x4_t blue = vorrq_u32(vshlq_n_u32(colors, 3), vshrq_n_u32(colors, 2));
-			destCol = vorrq_u32(vorrq_u32(red, green), blue);
-			
-			colors = vshrq_n_u32(vandq_u32(rawSrc, vmovq_n_u32(0xf800)), 11);
-			red = vshlq_n_u32(vorrq_u32(vshlq_n_u32(colors, 3), vshrq_n_u32(colors, 2)), 16);
-			colors = vshrq_n_u32(vandq_u32(rawSrc, vmovq_n_u32(0x07e0)), 5);
-			green = vshlq_n_u32(vorrq_u32(vshlq_n_u32(colors, 2), vshrq_n_u32(colors, 4)), 8);
-			colors = vandq_u32(rawSrc, vmovq_n_u32(0x001f));
-			blue = vorrq_u32(vshlq_n_u32(colors, 3), vshrq_n_u32(colors, 2));
-			srcCols = vorrq_u32(vorrq_u32(vorrq_u32(red, green), blue), vmovq_n_u32(0xff000000));
+			destCol = simd2BppTo4Bpp(vld1_u16((uint16 *)destPtr));
+			srcCols = simd2BppTo4Bpp(vld1_u16((const uint16 *)(srcP2 + xDir * xCtrBpp)));
 		}
 		uint32x4_t anded = vandq_u32(srcCols, maskedAlphas);
 		if (srcAlpha != -1) {
@@ -381,13 +409,33 @@ public:
 		if (DestBytesPerPixel == 4) {
 			vst1q_u32((uint32 *)destPtr, final);
 		} else {
-			uint32x4_t final16 = vshrq_n_u32(vandq_u32(final, vmovq_n_u32(0x000000ff)), 3);
-			final16 = vorrq_u32(final16, vshlq_n_u32(vshrq_n_u32(vandq_u32(final, vmovq_n_u32(0x0000ff00)), 8+3), 5));
-			final16 = vorrq_u32(final16, vshlq_n_u32(vshrq_n_u32(vandq_u32(final, vmovq_n_u32(0x00ff0000)), 16+3), 11));
-			vst1_u16((uint16 *)destPtr, vmovn_u32(final16));
+			vst1_u16((uint16 *)destPtr, simd4BppTo2Bpp(final));
 		}
 	}
 
+	inline void drawPixelSIMD2Bpp(byte *destPtr, const byte *srcP2, uint16x8_t tint, uint16x8_t alphas, uint16x8_t transColors, int xDir, int xCtrBpp, int srcAlpha, int skipTrans, bool horizFlip, bool useTint, uint16x8_t skipMask) {
+		uint16x8_t destCol = vld1q_u16((uint16 *)destPtr);
+		uint16x8_t srcCols = vld1q_u16((const uint16 *)(srcP2 + xDir * xCtrBpp));
+		if (srcAlpha != -1) {
+			// take into account for useTint
+			if (useTint) {
+				srcCols = blendPixelSIMD2Bpp(tint, srcCols, alphas);
+			} else {
+				srcCols = blendPixelSIMD2Bpp(srcCols, destCol, alphas);
+			}
+		}
+		uint16x8_t mask1 = skipTrans ? vceqq_u16(srcCols, transColors) : vmovq_n_u16(0);
+		mask1 = vorrq_u16(mask1, skipMask);
+		uint16x8_t destCols2 = vandq_u16(destCol, mask1);
+		uint16x8_t srcCols2 = vandq_u16(srcCols, vmvnq_u16(mask1));
+		uint16x8_t final = vorrq_u16(destCols2, srcCols2);
+		if (horizFlip) {
+			final = vrev64q_u16(final);
+			final = vcombine_u16(vget_high_u16(final), vget_low_u16(final));
+		}
+		vst1q_u16((uint16 *)destPtr, final);
+	}
+
 	// This template handles 2bpp and 4bpp, the other specializations handle 1bpp and format conversion blits
 	template<int DestBytesPerPixel, int SrcBytesPerPixel>
 	void drawInner(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode) {
@@ -487,6 +535,97 @@ public:
 		}
 	}
 
+#if 1
+	template<>
+	void drawInner<2, 2>(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode) {
+		const int xDir = horizFlip ? -1 : 1;
+		byte rSrc, gSrc, bSrc, aSrc;
+		byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
+		uint16x8_t tint = vdupq_n_u16(src.format.ARGBToColor(srcAlpha, tintRed, tintGreen, tintBlue));
+		uint16x8_t transColors = vdupq_n_u16(transColor);
+		int rgbCorrectedAlpha = srcAlpha;
+		if (blenderMode != kTintBlenderMode && blenderMode != kTintLightBlenderMode) {
+			rgbCorrectedAlpha += !!srcAlpha;
+		}
+		uint16x8_t alphas = vdupq_n_u16(rgbCorrectedAlpha);
+		uint16x8_t addIndexes = {0, 1, 2, 3, 4, 5, 6, 7};
+		if (horizFlip) addIndexes = {7, 6, 5, 4, 3, 2, 1, 0};
+		
+		int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = dstRect.width();
+		if (xStart + xCtrWidth > destArea.w) {
+			xCtrWidth = destArea.w - xStart;
+		}
+		if (xStart < 0) {
+			xCtrStart = -xStart;
+			xCtrBppStart = xCtrStart * 2;
+			xStart = 0;
+		}
+		int destY = yStart, yCtr = 0, yCtrHeight = (xCtrWidth % 8 == 0) ? dstRect.height() : (dstRect.height() - 1);
+		if (yStart < 0) {
+			yCtr = -yStart;
+			destY = 0;
+		}
+		if (yStart + yCtrHeight > destArea.h) {
+			yCtrHeight = destArea.h - yStart;
+		}
+		
+		byte *destP = (byte *)destArea.getBasePtr(0, destY);
+		const byte *srcP = (const byte *)src.getBasePtr(
+		                       horizFlip ? srcArea.right - 8 : srcArea.left,
+		                       vertFlip ? srcArea.bottom - 1 - yCtr : srcArea.top + yCtr);
+		for (; yCtr < yCtrHeight; ++destY, ++yCtr) {
+			uint16x8_t xCtrWidthSIMD = vmovq_n_u16(xCtrWidth);
+			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
+				byte *destPtr = &destP[destX * 2];
+				uint16x8_t skipMask = vcgeq_u16(vaddq_u16(vdupq_n_u16(xCtr), addIndexes), xCtrWidthSIMD);
+				drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
+			}
+
+			destP += destArea.pitch;
+			srcP += vertFlip ? -src.pitch : src.pitch;
+		}
+
+		// Get the last x values of the last row
+		if (xCtrWidth % 8 == 0) return;
+		int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart;
+		for (; xCtr + 8 < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
+			byte *destPtr = &destP[destX * 2];
+			drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, vmovq_n_u16(0));
+		}
+		if (horizFlip) srcP += 2*3;
+		for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += 2) {
+			const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
+			byte *destVal = (byte *)&destP[destX * 2];
+			uint32 srcCol = (uint32)(*(const uint16 *)srcColPtr);
+			
+			// Check if this is a transparent color we should skip
+			if (skipTrans && srcCol == transColor)
+				continue;
+
+			src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
+			if (srcAlpha != -1) {
+				if (useTint) {
+					rDest = rSrc;
+					gDest = gSrc;
+					bDest = bSrc;
+					aDest = aSrc;
+					rSrc = tintRed;
+					gSrc = tintGreen;
+					bSrc = tintBlue;
+					aSrc = srcAlpha;
+				} else {
+					format.colorToARGB((uint32)(*(uint16 *)destVal), aDest, rDest, gDest, bDest);
+				}
+				blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, srcAlpha, useTint, destVal);
+				srcCol = format.ARGBToColor(aDest, rDest, gDest, bDest);
+			} else {
+				srcCol = format.ARGBToColor(aSrc, rSrc, gSrc, bSrc);
+			}
+			*(uint16 *)destVal = srcCol;
+		}
+	}
+#endif
+
 	// Call drawInner with BytesPerPixel=0 if both formats aren't the same.
 	template<>
 	void drawInner<0, 0>(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode) {


Commit: d5b2cd4aeac1f92fef2b8488895d97d01c809459
    https://github.com/scummvm/scummvm/commit/d5b2cd4aeac1f92fef2b8488895d97d01c809459
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: Added SIMD optimizations for stretchedDraw

Changed paths:
    benchgfx1.bmp
    engines/ags/engine/main/engine.cpp
    engines/ags/lib/allegro/surface.cpp
    engines/ags/lib/allegro/surface.h
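
For context, here is a minimal scalar sketch of the fixed-point source stepping that stretchDraw's row loop builds on (the pre-existing scalar version is visible in the removed lines of the diff below). The SCALE_THRESHOLD value and the helper stretchRowNearest are assumptions for illustration, not taken from the commit.

#include <cstdint>

static const int SCALE_THRESHOLD = 0x100; // assumed fixed-point base; the real constant is defined elsewhere in the engine

// Each destination pixel advances a fixed-point counter; dividing it back down
// picks the nearest source pixel, much like the per-row loop in stretchDraw.
static void stretchRowNearest(uint32_t *dst, int dstW, const uint32_t *src, int srcW) {
	const int scaleX = SCALE_THRESHOLD * srcW / dstW;
	for (int destX = 0, scaleXCtr = 0; destX < dstW; ++destX, scaleXCtr += scaleX)
		dst[destX] = src[scaleXCtr / SCALE_THRESHOLD];
}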


diff --git a/benchgfx1.bmp b/benchgfx1.bmp
index 8167853bb60..488896208a8 100644
Binary files a/benchgfx1.bmp and b/benchgfx1.bmp differ
diff --git a/engines/ags/engine/main/engine.cpp b/engines/ags/engine/main/engine.cpp
index eedf6b3299c..11f6182f3a7 100644
--- a/engines/ags/engine/main/engine.cpp
+++ b/engines/ags/engine/main/engine.cpp
@@ -811,10 +811,9 @@ void allegro_bitmap_test_init() {
 	// Switched the test off for now
 	//test_allegro_bitmap = AllegroBitmap::CreateBitmap(320,200,32);
 	
-	Bitmap *benchgfx1 = BitmapHelper::CreateRawBitmapOwner(load_bmp("benchgfx1.bmp", nullptr));
-	Bitmap *dest = BitmapHelper::CreateBitmap(100, 100, benchgfx1->GetColorDepth());
-	Bitmap *dest2 = BitmapHelper::CreateBitmapCopy(dest, 16);
-	Bitmap *benchgfx2 = BitmapHelper::CreateBitmapCopy(benchgfx1, 16);
+	Bitmap *benchgfx132 = BitmapHelper::CreateRawBitmapOwner(load_bmp("benchgfx1.bmp", nullptr));
+	Bitmap *benchgfx1 = BitmapHelper::CreateBitmapCopy(benchgfx132, 16);
+	Bitmap *dest = BitmapHelper::CreateBitmap(100, 100, 16);
 	uint64_t bench_runs[] = {1000, 10000, 100000};
 	if (benchgfx1 != nullptr) {
 		_G(_blender_mode) = kRgbToRgbBlender; // Using normal blender mode
@@ -822,26 +821,20 @@ void allegro_bitmap_test_init() {
 			Debug::Printf(kDbgMsg_Info, "Starting Allegro Bitmap Test Bench 2 (%d bpp)", benchgfx1->GetColorDepth());
 			uint32_t start = std::chrono::high_resolution_clock::now();
 			for (uint64_t j = 0; j < bench_runs[i]; j++) {
-				dest->Blit(benchgfx1, 0, 0, kBitmap_Transparency);
+				dest->StretchBlt(benchgfx1, Rect(0, 0, 90, 90), kBitmap_Transparency);
+				//dest->Blit(benchgfx1, 0, 0, kBitmap_Transparency);
 			}
 			uint32_t end = std::chrono::high_resolution_clock::now();
 			Debug::Printf(kDbgMsg_Info, "Done! Results (%llu iterations):", bench_runs[i]);
 			Debug::Printf(kDbgMsg_Info, "exec time (mills): %u", end - start);
-			
-			Debug::Printf(kDbgMsg_Info, "Starting Allegro Bitmap Test Bench 2 (16 bpp)");
-			start = std::chrono::high_resolution_clock::now();
-			for (uint64_t j = 0; j < bench_runs[i]; j++) {
-				dest2->Blit(benchgfx2, 0, 0, kBitmap_Transparency);
-			}
-			end = std::chrono::high_resolution_clock::now();
-			Debug::Printf(kDbgMsg_Info, "Done! Results (%llu iterations):", bench_runs[i]);
-			Debug::Printf(kDbgMsg_Info, "exec time (mills): %u", end - start);
 		}
+
+		dest->Clear();
+		dest->StretchBlt(benchgfx1, Rect(0, 0, 19, 19), kBitmap_Transparency);
+		dest->SaveToFile("benchgfx1result1.bmp", NULL);
 		
 		delete benchgfx1;
 		delete dest;
-		delete benchgfx2;
-		delete dest2;
 	} else {
 		warning("Couldn't load the test bench graphics!");
 	}
diff --git a/engines/ags/lib/allegro/surface.cpp b/engines/ags/lib/allegro/surface.cpp
index e3920da462e..c7285370931 100644
--- a/engines/ags/lib/allegro/surface.cpp
+++ b/engines/ags/lib/allegro/surface.cpp
@@ -162,19 +162,19 @@ void BITMAP::draw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
 	int xStart = (dstRect.left < destRect.left) ? dstRect.left - destRect.left : 0;
 	int yStart = (dstRect.top < destRect.top) ? dstRect.top - destRect.top : 0;
 
-#define DRAWINNER(destBPP, srcBPP) drawInner<destBPP, srcBPP>(yStart, xStart, transColor, alphaMask, palette, useTint, sameFormat, src, destArea, horizFlip, vertFlip, skipTrans, srcAlpha, tintRed, tintGreen, tintBlue, dstRect, srcArea, _G(_blender_mode))
+#define DRAWINNER(func) func(yStart, xStart, transColor, alphaMask, palette, useTint, sameFormat, src, destArea, horizFlip, vertFlip, skipTrans, srcAlpha, tintRed, tintGreen, tintBlue, dstRect, srcArea, _G(_blender_mode), 0, 0)
 	if (sameFormat) {
 		switch (format.bytesPerPixel) {
-		case 1: DRAWINNER(1, 1); break;
-		case 2: DRAWINNER(2, 2); break;
-		case 4: DRAWINNER(4, 4); break;
+		case 1: DRAWINNER(drawInner1Bpp<0>); break;
+		case 2: DRAWINNER(drawInner2Bpp<0>); break;
+		case 4: DRAWINNER((drawInner4BppWithConv<4, 4, 0>)); break;
 		}
 	} else if (format.bytesPerPixel == 4 && src.format.bytesPerPixel == 2) { 
-		DRAWINNER(4, 2);
+		DRAWINNER((drawInner4BppWithConv<4, 2, 0>));
 	} else if (format.bytesPerPixel == 2 && src.format.bytesPerPixel == 4) {
-		DRAWINNER(2, 4);
-	} else { // Older more generic implementation (doesn't use SIMD)
-		DRAWINNER(0, 0);
+		DRAWINNER((drawInner4BppWithConv<2, 4, 0>));
+	} else {
+		DRAWINNER(drawInnerGeneric<0>);
 	}
 #undef DRAWINNER
 }
@@ -207,9 +207,6 @@ void BITMAP::stretchDraw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
 	const int scaleY = SCALE_THRESHOLD * srcRect.height() / dstRect.height();
 	bool sameFormat = (src.format == format);
 
-	byte rSrc, gSrc, bSrc, aSrc;
-	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
-
 	PALETTE palette;
 	if (src.format.bytesPerPixel == 1 && format.bytesPerPixel != 1) {
 		for (int i = 0; i < PAL_SIZE; ++i) {
@@ -229,70 +226,21 @@ void BITMAP::stretchDraw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
 	int xStart = (dstRect.left < destRect.left) ? dstRect.left - destRect.left : 0;
 	int yStart = (dstRect.top < destRect.top) ? dstRect.top - destRect.top : 0;
 
-	for (int destY = yStart, yCtr = 0, scaleYCtr = 0; yCtr < dstRect.height();
-	        ++destY, ++yCtr, scaleYCtr += scaleY) {
-		if (destY < 0 || destY >= destArea.h)
-			continue;
-		byte *destP = (byte *)destArea.getBasePtr(0, destY);
-		const byte *srcP = (const byte *)src.getBasePtr(
-		                       srcRect.left, srcRect.top + scaleYCtr / SCALE_THRESHOLD);
-
-		// Loop through the pixels of the row
-		for (int destX = xStart, xCtr = 0, scaleXCtr = 0; xCtr < dstRect.width();
-		        ++destX, ++xCtr, scaleXCtr += scaleX) {
-			if (destX < 0 || destX >= destArea.w)
-				continue;
-
-			const byte *srcVal = srcP + scaleXCtr / SCALE_THRESHOLD * src.format.bytesPerPixel;
-			uint32 srcCol = getColor(srcVal, src.format.bytesPerPixel);
-
-			// Check if this is a transparent color we should skip
-			if (skipTrans && ((srcCol & alphaMask) == transColor))
-				continue;
-
-			byte *destVal = (byte *)&destP[destX * format.bytesPerPixel];
-
-			// When blitting to the same format we can just copy the color
-			if (format.bytesPerPixel == 1) {
-				*destVal = srcCol;
-				continue;
-			} else if (sameFormat && srcAlpha == -1) {
-				if (format.bytesPerPixel == 4)
-					*(uint32 *)destVal = srcCol;
-				else
-					*(uint16 *)destVal = srcCol;
-				continue;
-			}
-
-			// We need the rgb values to do blending and/or convert between formats
-			if (src.format.bytesPerPixel == 1) {
-				const RGB &rgb = palette[srcCol];
-				aSrc = 0xff;
-				rSrc = rgb.r;
-				gSrc = rgb.g;
-				bSrc = rgb.b;
-			} else
-				src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
-
-			if (srcAlpha == -1) {
-				// This means we don't use blending.
-				aDest = aSrc;
-				rDest = rSrc;
-				gDest = gSrc;
-				bDest = bSrc;
-			} else {
-				// TODO: move this to blendPixel to only do it when needed?
-				// format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
-				blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, srcAlpha, false, destVal);
-			}
-
-			uint32 pixel = format.ARGBToColor(aDest, rDest, gDest, bDest);
-			if (format.bytesPerPixel == 4)
-				*(uint32 *)destVal = pixel;
-			else
-				*(uint16 *)destVal = pixel;
+#define DRAWINNER(func) func(yStart, xStart, transColor, alphaMask, palette, 0, sameFormat, src, destArea, false, false, skipTrans, srcAlpha, 0, 0, 0, dstRect, srcRect, _G(_blender_mode), scaleX, scaleY)
+	if (sameFormat) {
+		switch (format.bytesPerPixel) {
+		case 1: DRAWINNER(drawInner1Bpp<SCALE_THRESHOLD>); break;
+		case 2: DRAWINNER(drawInner2Bpp<SCALE_THRESHOLD>); break;
+		case 4: DRAWINNER((drawInner4BppWithConv<4, 4, SCALE_THRESHOLD>)); break;
 		}
+	} else if (format.bytesPerPixel == 4 && src.format.bytesPerPixel == 2) { 
+		DRAWINNER((drawInner4BppWithConv<4, 2, SCALE_THRESHOLD>));
+	} else if (format.bytesPerPixel == 2 && src.format.bytesPerPixel == 4) {
+		DRAWINNER((drawInner4BppWithConv<2, 4, SCALE_THRESHOLD>));
+	} else {
+		DRAWINNER(drawInnerGeneric<SCALE_THRESHOLD>);
 	}
+#undef DRAWINNER
 }
 
 void BITMAP::blendPixel(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &aDest, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha, bool useTint, byte *destVal) const {
diff --git a/engines/ags/lib/allegro/surface.h b/engines/ags/lib/allegro/surface.h
index 71367a74bb4..323f7737fca 100644
--- a/engines/ags/lib/allegro/surface.h
+++ b/engines/ags/lib/allegro/surface.h
@@ -437,8 +437,8 @@ public:
 	}
 
 	// This template handles 2bpp and 4bpp, the other specializations handle 1bpp and format conversion blits
-	template<int DestBytesPerPixel, int SrcBytesPerPixel>
-	void drawInner(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode) {
+	template<int DestBytesPerPixel, int SrcBytesPerPixel, int ScaleThreshold>
+	void drawInner4BppWithConv(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
 		const int xDir = horizFlip ? -1 : 1;
 		byte rSrc, gSrc, bSrc, aSrc;
 		byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
@@ -457,6 +457,7 @@ public:
 		uint32x4_t alphas = vld1q_dup_u32(&rgbCorrectedAlpha);
 		uint32x4_t addIndexes = {0, 1, 2, 3};
 		if (horizFlip) addIndexes = {3, 2, 1, 0};
+		uint32x4_t scaleAdds = {0, (uint32)scaleX, (uint32)scaleX*2, (uint32)scaleX*3};
 		
 		int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = dstRect.width();
 		if (xStart + xCtrWidth > destArea.w) {
@@ -467,10 +468,15 @@ public:
 			xCtrBppStart = xCtrStart * SrcBytesPerPixel;
 			xStart = 0;
 		}
-		int destY = yStart, yCtr = 0, yCtrHeight = (xCtrWidth % 4 == 0) ? dstRect.height() : (dstRect.height() - 1);
+		int destY = yStart, srcYCtr = 0, yCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 4 == 0) ? dstRect.height() : (dstRect.height() - 1);
+		if (ScaleThreshold != 0) yCtrHeight = dstRect.height();
 		if (yStart < 0) {
 			yCtr = -yStart;
 			destY = 0;
+			if (ScaleThreshold != 0) {
+				scaleYCtr = yCtr * scaleY;
+				srcYCtr = scaleYCtr / ScaleThreshold;
+			}
 		}
 		if (yStart + yCtrHeight > destArea.h) {
 			yCtrHeight = destArea.h - yStart;
@@ -480,28 +486,65 @@ public:
 		const byte *srcP = (const byte *)src.getBasePtr(
 		                       horizFlip ? srcArea.right - 4 : srcArea.left,
 		                       vertFlip ? srcArea.bottom - 1 - yCtr : srcArea.top + yCtr);
-		for (; yCtr < yCtrHeight; ++destY, ++yCtr) {
-			uint32x4_t xCtrWidthSIMD = vmovq_n_u32(xCtrWidth);
-			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
-				byte *destPtr = &destP[destX * DestBytesPerPixel];
-				uint32x4_t skipMask = vcgeq_u32(vaddq_u32(vdupq_n_u32(xCtr), addIndexes), xCtrWidthSIMD);
-				drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
+		for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += scaleY) {
+			uint32x4_t xCtrWidthSIMD = vdupq_n_u32(xCtrWidth);
+
+			if (ScaleThreshold == 0) {
+				for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
+					byte *destPtr = &destP[destX * DestBytesPerPixel];
+					uint32x4_t skipMask = vcgeq_u32(vaddq_u32(vdupq_n_u32(xCtr), addIndexes), xCtrWidthSIMD);
+					drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
+				}
+				destP += destArea.pitch;
+				srcP += vertFlip ? -src.pitch : src.pitch;
+			} else {
+				int newSrcYCtr = scaleYCtr / ScaleThreshold;
+				if (srcYCtr != newSrcYCtr) {
+					int diffSrcYCtr = newSrcYCtr - srcYCtr;
+					srcP += src.pitch * diffSrcYCtr;
+					srcYCtr = newSrcYCtr;
+				}
+				byte srcBuffer[4*4];
+				for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart, scaleXCtr = xCtrStart * scaleX; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
+					if (yCtr + 1 == yCtrHeight && xCtr + 4 > xCtrWidth) break;
+					uint32x4_t indexes = vdupq_n_u32(scaleXCtr);
+#if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
+					indexes = vmulq_n_u32(vshrq_n_u32(vaddq_u32(indexes, scaleAdds), 8), SrcBytesPerPixel);
+#else
+#error Change code to allow different scale threshold!
+#endif
+					memcpy(&srcBuffer[0*SrcBytesPerPixel], srcP + vgetq_lane_u32(indexes, 0), SrcBytesPerPixel);
+					memcpy(&srcBuffer[1*SrcBytesPerPixel], srcP + vgetq_lane_u32(indexes, 1), SrcBytesPerPixel);
+					memcpy(&srcBuffer[2*SrcBytesPerPixel], srcP + vgetq_lane_u32(indexes, 2), SrcBytesPerPixel);
+					memcpy(&srcBuffer[3*SrcBytesPerPixel], srcP + vgetq_lane_u32(indexes, 3), SrcBytesPerPixel);
+					scaleXCtr += scaleX*4;
+					byte *destPtr = &destP[destX * DestBytesPerPixel];
+					uint32x4_t skipMask = vcgeq_u32(vaddq_u32(vdupq_n_u32(xCtr), addIndexes), xCtrWidthSIMD);
+					drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, (const byte *)srcBuffer, tint, alphas, maskedAlphas, transColors, 1, 0, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
+				}
+				if (yCtr + 1 != yCtrHeight) destP += destArea.pitch;
 			}
-
-			destP += destArea.pitch;
-			srcP += vertFlip ? -src.pitch : src.pitch;
 		}
 
 		// Get the last x values of the last row
-		if (xCtrWidth % 4 == 0) return;
 		int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart;
-		for (; xCtr + 4 < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
-			byte *destPtr = &destP[destX * DestBytesPerPixel];
-			drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, vmovq_n_u32(0));
+		if (xCtrWidth % 4 == 0) return;
+		if (ScaleThreshold == 0) {
+			for (; xCtr + 4 < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
+				byte *destPtr = &destP[destX * DestBytesPerPixel];
+				drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, vmovq_n_u32(0));
+			}
+			if (horizFlip) srcP += SrcBytesPerPixel * 3;
+		} else {
+			xCtr = xCtrWidth - xCtrWidth % 4;
+			xCtrBpp = xCtr * SrcBytesPerPixel;
+			destX = xStart+xCtr;
 		}
-		if (horizFlip) srcP += SrcBytesPerPixel * 3;
 		for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += SrcBytesPerPixel) {
 			const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
+			if (ScaleThreshold != 0) {
+				srcColPtr = (const byte *)(srcP + (xCtr * scaleX) / ScaleThreshold * SrcBytesPerPixel);
+			}
 			byte *destVal = (byte *)&destP[destX * DestBytesPerPixel];
 			uint32 srcCol = getColor(srcColPtr, SrcBytesPerPixel);
 			
@@ -535,9 +578,8 @@ public:
 		}
 	}
 
-#if 1
-	template<>
-	void drawInner<2, 2>(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode) {
+	template<int ScaleThreshold>
+	void drawInner2Bpp(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
 		const int xDir = horizFlip ? -1 : 1;
 		byte rSrc, gSrc, bSrc, aSrc;
 		byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
@@ -550,6 +592,8 @@ public:
 		uint16x8_t alphas = vdupq_n_u16(rgbCorrectedAlpha);
 		uint16x8_t addIndexes = {0, 1, 2, 3, 4, 5, 6, 7};
 		if (horizFlip) addIndexes = {7, 6, 5, 4, 3, 2, 1, 0};
+		uint32x4_t scaleAdds = {0, (uint32)scaleX, (uint32)scaleX*2, (uint32)scaleX*3};
+		uint32x4_t scaleAdds2 = {(uint32)scaleX*4, (uint32)scaleX*5, (uint32)scaleX*6, (uint32)scaleX*7};
 		
 		int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = dstRect.width();
 		if (xStart + xCtrWidth > destArea.w) {
@@ -560,10 +604,15 @@ public:
 			xCtrBppStart = xCtrStart * 2;
 			xStart = 0;
 		}
-		int destY = yStart, yCtr = 0, yCtrHeight = (xCtrWidth % 8 == 0) ? dstRect.height() : (dstRect.height() - 1);
+		int destY = yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 8 == 0) ? dstRect.height() : (dstRect.height() - 1);
+		if (ScaleThreshold != 0) yCtrHeight = dstRect.height();
 		if (yStart < 0) {
 			yCtr = -yStart;
 			destY = 0;
+			if (ScaleThreshold != 0) {
+				scaleYCtr = yCtr * scaleY;
+				srcYCtr = scaleYCtr / ScaleThreshold;
+			}
 		}
 		if (yStart + yCtrHeight > destArea.h) {
 			yCtrHeight = destArea.h - yStart;
@@ -573,28 +622,69 @@ public:
 		const byte *srcP = (const byte *)src.getBasePtr(
 		                       horizFlip ? srcArea.right - 8 : srcArea.left,
 		                       vertFlip ? srcArea.bottom - 1 - yCtr : srcArea.top + yCtr);
-		for (; yCtr < yCtrHeight; ++destY, ++yCtr) {
+		for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += scaleY) {
 			uint16x8_t xCtrWidthSIMD = vmovq_n_u16(xCtrWidth);
-			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
-				byte *destPtr = &destP[destX * 2];
-				uint16x8_t skipMask = vcgeq_u16(vaddq_u16(vdupq_n_u16(xCtr), addIndexes), xCtrWidthSIMD);
-				drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
+			if (ScaleThreshold == 0) {
+				for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
+					byte *destPtr = &destP[destX * 2];
+					uint16x8_t skipMask = vcgeq_u16(vaddq_u16(vdupq_n_u16(xCtr), addIndexes), xCtrWidthSIMD);
+					drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
+				}
+				destP += destArea.pitch;
+				srcP += vertFlip ? -src.pitch : src.pitch;
+			} else {
+				int newSrcYCtr = scaleYCtr / ScaleThreshold;
+				if (srcYCtr != newSrcYCtr) {
+					int diffSrcYCtr = newSrcYCtr - srcYCtr;
+					srcP += src.pitch * diffSrcYCtr;
+					srcYCtr = newSrcYCtr;
+				}
+				uint16 srcBuffer[8];
+				for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart, scaleXCtr = xCtrStart * scaleX; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
+					if (yCtr + 1 == yCtrHeight && xCtr + 8 > xCtrWidth) break;
+					uint32x4_t indexes = vdupq_n_u32(scaleXCtr), indexes2 = vdupq_n_u32(scaleXCtr);
+#if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
+					indexes = vmulq_n_u32(vshrq_n_u32(vaddq_u32(indexes, scaleAdds), 8), 2);
+					indexes2 = vmulq_n_u32(vshrq_n_u32(vaddq_u32(indexes2, scaleAdds2), 8), 2);
+#else
+#error Change code to allow different scale threshold!
+#endif
+					srcBuffer[0] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes, 0));
+					srcBuffer[1] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes, 1));
+					srcBuffer[2] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes, 2));
+					srcBuffer[3] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes, 3));
+					srcBuffer[4] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes2, 0));
+					srcBuffer[5] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes2, 1));
+					srcBuffer[6] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes2, 2));
+					srcBuffer[7] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes2, 3));
+					scaleXCtr += scaleX*8;
+					byte *destPtr = &destP[destX * 2];
+					uint16x8_t skipMask = vcgeq_u16(vaddq_u16(vdupq_n_u16(xCtr), addIndexes), xCtrWidthSIMD);
+					drawPixelSIMD2Bpp(destPtr, (const byte *)srcBuffer, tint, alphas, transColors, 1, 0, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
+				}
+				if (yCtr + 1 != yCtrHeight) destP += destArea.pitch;
 			}
-
-			destP += destArea.pitch;
-			srcP += vertFlip ? -src.pitch : src.pitch;
 		}
 
 		// Get the last x values of the last row
 		if (xCtrWidth % 8 == 0) return;
 		int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart;
-		for (; xCtr + 8 < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
-			byte *destPtr = &destP[destX * 2];
-			drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, vmovq_n_u16(0));
+		if (ScaleThreshold == 0) {
+			for (; xCtr + 8 < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
+				byte *destPtr = &destP[destX * 2];
+				drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, vmovq_n_u16(0));
+			}
+			if (horizFlip) srcP += 2*3;
+		} else {
+			xCtr = xCtrWidth - xCtrWidth % 8;
+			xCtrBpp = xCtr * 2;
+			destX = xStart+xCtr;
 		}
-		if (horizFlip) srcP += 2*3;
 		for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += 2) {
 			const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
+			if (ScaleThreshold != 0) {
+				srcColPtr = (const byte *)(srcP + (xCtr * scaleX) / ScaleThreshold * 2);
+			}
 			byte *destVal = (byte *)&destP[destX * 2];
 			uint32 srcCol = (uint32)(*(const uint16 *)srcColPtr);
 			
@@ -624,11 +714,10 @@ public:
 			*(uint16 *)destVal = srcCol;
 		}
 	}
-#endif
 
 	// Call drawInner with BytesPerPixel=0 if both formats aren't the same.
-	template<>
-	void drawInner<0, 0>(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode) {
+	template<int ScaleThreshold>
+	void drawInnerGeneric(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
 		const int xDir = horizFlip ? -1 : 1;
 		byte rSrc, gSrc, bSrc, aSrc;
 		byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
@@ -642,10 +731,14 @@ public:
 			xCtrBppStart = xCtrStart * src.format.bytesPerPixel;
 			xStart = 0;
 		}
-		int destY = yStart, yCtr = 0, yCtrHeight = dstRect.height();
+		int destY = yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = dstRect.height();
 		if (yStart < 0) {
 			yCtr = -yStart;
 			destY = 0;
+			if (ScaleThreshold != 0) {
+				scaleYCtr = yCtr * scaleY;
+				srcYCtr = scaleYCtr / ScaleThreshold;
+			}
 		}
 		if (yStart + yCtrHeight > destArea.h) {
 			yCtrHeight = destArea.h - yStart;
@@ -656,10 +749,21 @@ public:
 		                       horizFlip ? srcArea.right - 1 : srcArea.left,
 		                       vertFlip ? srcArea.bottom - 1 - yCtr :
 		                       srcArea.top + yCtr);
-		for (; yCtr < dstRect.height(); ++destY, ++yCtr) {
+		for (; yCtr < dstRect.height(); ++destY, ++yCtr, scaleYCtr += scaleY) {
+			if (ScaleThreshold != 0) {
+				int newSrcYCtr = scaleYCtr / ScaleThreshold;
+				if (srcYCtr != newSrcYCtr) {
+					int diffSrcYCtr = newSrcYCtr - srcYCtr;
+					srcP += src.pitch * diffSrcYCtr;
+					srcYCtr = newSrcYCtr;
+				}
+			}
 			// Loop through the pixels of the row
-			for (int destX = xStart, xCtr = xCtrStart, xCtrBpp = xCtrBppStart; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += src.format.bytesPerPixel) {
+			for (int destX = xStart, xCtr = xCtrStart, xCtrBpp = xCtrBppStart, scaleXCtr = xCtr * scaleX; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += src.format.bytesPerPixel, scaleXCtr += scaleX) {
 				const byte *srcVal = srcP + xDir * xCtrBpp;
+				if (ScaleThreshold != 0) {
+					srcVal = srcP + (scaleXCtr / ScaleThreshold) * src.format.bytesPerPixel;
+				}
 				uint32 srcCol = getColor(srcVal, src.format.bytesPerPixel);
 
 				// Check if this is a transparent color we should skip
@@ -719,14 +823,18 @@ public:
 			}
 
 			destP += destArea.pitch;
-			srcP += vertFlip ? -src.pitch : src.pitch;
+			if (ScaleThreshold == 0) srcP += vertFlip ? -src.pitch : src.pitch;
 		}
 	}
 	
-	template<>
-	void drawInner<1, 1>(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode) {
+	template<int ScaleThreshold>
+	void drawInner1Bpp(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
 		const int xDir = horizFlip ? -1 : 1;
 		uint8x16_t transColors = vld1q_dup_u8(&transColor);
+		uint32x4_t scaleAdds1 = {0, (uint32)scaleX, (uint32)scaleX*2, (uint32)scaleX*3};
+		uint32x4_t scaleAdds2 = {(uint32)scaleX*4, (uint32)scaleX*5, (uint32)scaleX*6, (uint32)scaleX*7};
+		uint32x4_t scaleAdds3 = {(uint32)scaleX*8, (uint32)scaleX*9, (uint32)scaleX*10, (uint32)scaleX*11};
+		uint32x4_t scaleAdds4 = {(uint32)scaleX*12, (uint32)scaleX*13, (uint32)scaleX*14, (uint32)scaleX*15};
 		
 		int xCtrStart = 0, xCtrWidth = dstRect.width();
 		if (xStart + xCtrWidth > destArea.w) {
@@ -736,10 +844,15 @@ public:
 			xCtrStart = -xStart;
 			xStart = 0;
 		}
-		int destY = yStart, yCtr = 0, yCtrHeight = dstRect.height();
+		int destY = yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = dstRect.height();
+		if (ScaleThreshold != 0) yCtrHeight = dstRect.height();
 		if (yStart < 0) {
 			yCtr = -yStart;
 			destY = 0;
+			if (ScaleThreshold != 0) {
+				scaleYCtr = yCtr * scaleY;
+				srcYCtr = scaleYCtr / ScaleThreshold;
+			}
 		}
 		if (yStart + yCtrHeight > destArea.h) {
 			yCtrHeight = destArea.h - yStart;
@@ -749,12 +862,49 @@ public:
 		const byte *srcP = (const byte *)src.getBasePtr(
 		                       horizFlip ? srcArea.right - 16 : srcArea.left,
 		                       vertFlip ? srcArea.bottom - 1 - yCtr : srcArea.top + yCtr);
-		for (; yCtr < yCtrHeight; ++destY, ++yCtr) {
-			int xCtr = xCtrStart, destX = xStart;
+		for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += scaleY) {
+			if (ScaleThreshold != 0) {
+				int newSrcYCtr = scaleYCtr / ScaleThreshold;
+				if (srcYCtr != newSrcYCtr) {
+					int diffSrcYCtr = newSrcYCtr - srcYCtr;
+					srcP += src.pitch * diffSrcYCtr;
+					srcYCtr = newSrcYCtr;
+				}
+			}
+			int xCtr = xCtrStart, destX = xStart, scaleXCtr = xCtrStart * scaleX;
 			for (; xCtr + 16 < xCtrWidth; destX += 16, xCtr += 16) {
 				byte *destPtr = &destP[destX];
 				uint8x16_t destCols = vld1q_u8(destPtr);
 				uint8x16_t srcCols = vld1q_u8(srcP + xDir * xCtr);
+				if (ScaleThreshold != 0) {
+					uint32x4_t indexes1 = vdupq_n_u32(scaleXCtr), indexes2 = vdupq_n_u32(scaleXCtr);
+					uint32x4_t indexes3 = vdupq_n_u32(scaleXCtr), indexes4 = vdupq_n_u32(scaleXCtr);
+#if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
+					indexes1 = vshrq_n_u32(vaddq_u32(indexes1, scaleAdds1), 8);
+					indexes2 = vshrq_n_u32(vaddq_u32(indexes2, scaleAdds2), 8);
+					indexes3 = vshrq_n_u32(vaddq_u32(indexes3, scaleAdds3), 8);
+					indexes4 = vshrq_n_u32(vaddq_u32(indexes4, scaleAdds4), 8);
+#else
+#error Change code to allow different scale threshold!
+#endif
+					srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes1, 0)], srcCols, 0);
+					srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes1, 1)], srcCols, 1);
+					srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes1, 2)], srcCols, 2);
+					srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes1, 3)], srcCols, 3);
+					srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes2, 0)], srcCols, 4);
+					srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes2, 1)], srcCols, 5);
+					srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes2, 2)], srcCols, 6);
+					srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes2, 3)], srcCols, 7);
+					srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes3, 0)], srcCols, 8);
+					srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes3, 1)], srcCols, 9);
+					srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes3, 2)], srcCols, 10);
+					srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes3, 3)], srcCols, 11);
+					srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes4, 0)], srcCols, 12);
+					srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes4, 1)], srcCols, 13);
+					srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes4, 2)], srcCols, 14);
+					srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes4, 3)], srcCols, 15);
+					scaleXCtr += scaleX*16;
+				}
 				uint8x16_t mask1 = skipTrans ? vceqq_u8(srcCols, transColors) : vmovq_n_u8(0);
 				uint8x16_t final = vorrq_u8(vandq_u8(srcCols, vmvnq_u8(mask1)), vandq_u8(destCols, mask1));
 				if (horizFlip) {
@@ -765,8 +915,11 @@ public:
 			}
 			// Get the last x values
 			if (horizFlip) srcP += 15;
-			for (; xCtr < xCtrWidth; ++destX, ++xCtr) {
+			for (; xCtr < xCtrWidth; ++destX, ++xCtr, scaleXCtr += scaleX) {
 				const byte *srcCol = (const byte *)(srcP + xDir * xCtr);
+				if (ScaleThreshold != 0) {
+					srcCol = (const byte *)(srcP + scaleXCtr / ScaleThreshold);
+				}
 				// Check if this is a transparent color we should skip
 				if (skipTrans && *srcCol == transColor)
 					continue;
@@ -776,7 +929,7 @@ public:
 			}
 			if (horizFlip) srcP -= 15;
 			destP += destArea.pitch;
-			srcP += vertFlip ? -src.pitch : src.pitch;
+			if (ScaleThreshold == 0) srcP += vertFlip ? -src.pitch : src.pitch;
 		}
 	}
 


Commit: 07107b19bd05b15399938f3587d696d8c4a51660
    https://github.com/scummvm/scummvm/commit/07107b19bd05b15399938f3587d696d8c4a51660
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: Fixed ARGB blending and finished benchmark

Changed paths:
  A bench_output.txt
  A bench_output_fast.txt
  A benchgfx32.bmp
  A benchgfx8.bmp
  R benchgfx1.bmp
    engines/ags/engine/main/engine.cpp
    engines/ags/lib/allegro/surface.cpp
    engines/ags/lib/allegro/surface.h


diff --git a/bench_output.txt b/bench_output.txt
new file mode 100644
index 00000000000..8dee5f04919
--- /dev/null
+++ b/bench_output.txt
@@ -0,0 +1,251 @@
+121User picked target 'kq2agdi' (engine ID 'ags', game ID 'kq2agdi')...
+   Looking for a plugin supporting this target... Adventure Game Studio
+Running King's Quest II: Romancing the Stones Remake (English)
+kq2vga.exe: 40cfb7563df7dacf6530b19289a4745b, 12574643 bytes.
+Initializing backend libs
+Initializing game data
+Opened game data file: game28.dta
+Game data version: 42
+Compiled with: 3.2.0
+Startup directory: ./
+Data directory: ./
+Setting up game configuration
+Voice pack found: speech.vox
+audio.vox found and initialized.
+Initializing TTF renderer
+Initializing mouse: number of buttons reported is 3
+Install timer
+Initialize legacy path finder library
+Game title: 'King's Quest II'
+Game uid (old format): `1025889151`
+Game guid: '{b85ea0b0-35c5-4e53-bfc7-2281bf481001}'
+Game GUI version: 115
+Lipsync data found and loaded
+Checking for disk space
+Game native resolution: 320 x 200 (32 bit)
+Graphic settings: driver: Software, windowed: no, screen size: 0 x 0, game scale: proportional
+Graphic settings: refresh rate (optional): 0, vsync: 0
+Requested graphics driver 'Software' not found, will try existing drivers instead
+Graphics mode set: 320 x 200 (32-bit) fullscreen desktop
+Graphics mode set: refresh rate (optional): 0, vsync: 0
+Mouse speed control: enabled, unit: 1.000000, user value: 1.000000
+Multitasking mode set: 0
+Setting up window
+Multitasking mode set: 0
+Initialize sprites
+34135008 34135024 34135040 34135056 34135072 34135088
+Dest: 32 bpp, Gfx: 32 bpp, Blender: RGB to RGB, Stretched: false, Iters: 100000
+exec time (mills): 640
+
+Dest: 32 bpp, Gfx: 32 bpp, Blender: RGB to RGB, Stretched: true, Iters: 100000
+exec time (mills): 1552
+
+Dest: 32 bpp, Gfx: 32 bpp, Blender: Source Alpha, Stretched: false, Iters: 100000
+exec time (mills): 643
+
+Dest: 32 bpp, Gfx: 32 bpp, Blender: Source Alpha, Stretched: true, Iters: 100000
+exec time (mills): 1555
+
+Dest: 32 bpp, Gfx: 32 bpp, Blender: ARGB to ARGB, Stretched: false, Iters: 100000
+exec time (mills): 643
+
+Dest: 32 bpp, Gfx: 32 bpp, Blender: ARGB to ARGB, Stretched: true, Iters: 100000
+exec time (mills): 1556
+
+Dest: 32 bpp, Gfx: 32 bpp, Blender: Opaque, Stretched: false, Iters: 100000
+exec time (mills): 642
+
+Dest: 32 bpp, Gfx: 32 bpp, Blender: Opaque, Stretched: true, Iters: 100000
+exec time (mills): 1556
+
+Dest: 32 bpp, Gfx: 32 bpp, Blender: Tint with Light, Stretched: false, Iters: 100000
+exec time (mills): 644
+
+Dest: 32 bpp, Gfx: 32 bpp, Blender: Tint with Light, Stretched: true, Iters: 100000
+exec time (mills): 1554
+
+Dest: 32 bpp, Gfx: 16 bpp, Blender: RGB to RGB, Stretched: false, Iters: 100000
+exec time (mills): 2226
+
+Dest: 32 bpp, Gfx: 16 bpp, Blender: RGB to RGB, Stretched: true, Iters: 100000
+exec time (mills): 5666
+
+Dest: 32 bpp, Gfx: 16 bpp, Blender: Source Alpha, Stretched: false, Iters: 100000
+exec time (mills): 2227
+
+Dest: 32 bpp, Gfx: 16 bpp, Blender: Source Alpha, Stretched: true, Iters: 100000
+exec time (mills): 5675
+
+Dest: 32 bpp, Gfx: 16 bpp, Blender: ARGB to ARGB, Stretched: false, Iters: 100000
+exec time (mills): 2227
+
+Dest: 32 bpp, Gfx: 16 bpp, Blender: ARGB to ARGB, Stretched: true, Iters: 100000
+exec time (mills): 5665
+
+Dest: 32 bpp, Gfx: 16 bpp, Blender: Opaque, Stretched: false, Iters: 100000
+exec time (mills): 2225
+
+Dest: 32 bpp, Gfx: 16 bpp, Blender: Opaque, Stretched: true, Iters: 100000
+exec time (mills): 5666
+
+Dest: 32 bpp, Gfx: 16 bpp, Blender: Tint with Light, Stretched: false, Iters: 100000
+exec time (mills): 2227
+
+Dest: 32 bpp, Gfx: 16 bpp, Blender: Tint with Light, Stretched: true, Iters: 100000
+exec time (mills): 5664
+
+Dest: 32 bpp, Gfx: 8 bpp, Blender: RGB to RGB, Stretched: false, Iters: 100000
+exec time (mills): 1526
+
+Dest: 32 bpp, Gfx: 8 bpp, Blender: RGB to RGB, Stretched: true, Iters: 100000
+exec time (mills): 3850
+
+Dest: 32 bpp, Gfx: 8 bpp, Blender: Source Alpha, Stretched: false, Iters: 100000
+exec time (mills): 1526
+
+Dest: 32 bpp, Gfx: 8 bpp, Blender: Source Alpha, Stretched: true, Iters: 100000
+exec time (mills): 3819
+
+Dest: 32 bpp, Gfx: 8 bpp, Blender: ARGB to ARGB, Stretched: false, Iters: 100000
+exec time (mills): 1528
+
+Dest: 32 bpp, Gfx: 8 bpp, Blender: ARGB to ARGB, Stretched: true, Iters: 100000
+exec time (mills): 3817
+
+Dest: 32 bpp, Gfx: 8 bpp, Blender: Opaque, Stretched: false, Iters: 100000
+exec time (mills): 1528
+
+Dest: 32 bpp, Gfx: 8 bpp, Blender: Opaque, Stretched: true, Iters: 100000
+exec time (mills): 3819
+
+Dest: 32 bpp, Gfx: 8 bpp, Blender: Tint with Light, Stretched: false, Iters: 100000
+exec time (mills): 1528
+
+Dest: 32 bpp, Gfx: 8 bpp, Blender: Tint with Light, Stretched: true, Iters: 100000
+exec time (mills): 3818
+
+Dest: 16 bpp, Gfx: 32 bpp, Blender: RGB to RGB, Stretched: false, Iters: 100000
+exec time (mills): 2586
+
+Dest: 16 bpp, Gfx: 32 bpp, Blender: RGB to RGB, Stretched: true, Iters: 100000
+exec time (mills): 6620
+
+Dest: 16 bpp, Gfx: 32 bpp, Blender: Source Alpha, Stretched: false, Iters: 100000
+exec time (mills): 2585
+
+Dest: 16 bpp, Gfx: 32 bpp, Blender: Source Alpha, Stretched: true, Iters: 100000
+exec time (mills): 6619
+
+Dest: 16 bpp, Gfx: 32 bpp, Blender: ARGB to ARGB, Stretched: false, Iters: 100000
+exec time (mills): 2585
+
+Dest: 16 bpp, Gfx: 32 bpp, Blender: ARGB to ARGB, Stretched: true, Iters: 100000
+exec time (mills): 6622
+
+Dest: 16 bpp, Gfx: 32 bpp, Blender: Opaque, Stretched: false, Iters: 100000
+exec time (mills): 2586
+
+Dest: 16 bpp, Gfx: 32 bpp, Blender: Opaque, Stretched: true, Iters: 100000
+exec time (mills): 6623
+
+Dest: 16 bpp, Gfx: 32 bpp, Blender: Tint with Light, Stretched: false, Iters: 100000
+exec time (mills): 2584
+
+Dest: 16 bpp, Gfx: 32 bpp, Blender: Tint with Light, Stretched: true, Iters: 100000
+exec time (mills): 6622
+
+Dest: 16 bpp, Gfx: 16 bpp, Blender: RGB to RGB, Stretched: false, Iters: 100000
+exec time (mills): 699
+
+Dest: 16 bpp, Gfx: 16 bpp, Blender: RGB to RGB, Stretched: true, Iters: 100000
+exec time (mills): 1710
+
+Dest: 16 bpp, Gfx: 16 bpp, Blender: Source Alpha, Stretched: false, Iters: 100000
+exec time (mills): 698
+
+Dest: 16 bpp, Gfx: 16 bpp, Blender: Source Alpha, Stretched: true, Iters: 100000
+exec time (mills): 1707
+
+Dest: 16 bpp, Gfx: 16 bpp, Blender: ARGB to ARGB, Stretched: false, Iters: 100000
+exec time (mills): 699
+
+Dest: 16 bpp, Gfx: 16 bpp, Blender: ARGB to ARGB, Stretched: true, Iters: 100000
+exec time (mills): 1708
+
+Dest: 16 bpp, Gfx: 16 bpp, Blender: Opaque, Stretched: false, Iters: 100000
+exec time (mills): 698
+
+Dest: 16 bpp, Gfx: 16 bpp, Blender: Opaque, Stretched: true, Iters: 100000
+exec time (mills): 1714
+
+Dest: 16 bpp, Gfx: 16 bpp, Blender: Tint with Light, Stretched: false, Iters: 100000
+exec time (mills): 698
+
+Dest: 16 bpp, Gfx: 16 bpp, Blender: Tint with Light, Stretched: true, Iters: 100000
+exec time (mills): 1706
+
+Dest: 16 bpp, Gfx: 8 bpp, Blender: RGB to RGB, Stretched: false, Iters: 100000
+exec time (mills): 1527
+
+Dest: 16 bpp, Gfx: 8 bpp, Blender: RGB to RGB, Stretched: true, Iters: 100000
+exec time (mills): 3706
+
+Dest: 16 bpp, Gfx: 8 bpp, Blender: Source Alpha, Stretched: false, Iters: 100000
+exec time (mills): 1525
+
+Dest: 16 bpp, Gfx: 8 bpp, Blender: Source Alpha, Stretched: true, Iters: 100000
+exec time (mills): 3709
+
+Dest: 16 bpp, Gfx: 8 bpp, Blender: ARGB to ARGB, Stretched: false, Iters: 100000
+exec time (mills): 1523
+
+Dest: 16 bpp, Gfx: 8 bpp, Blender: ARGB to ARGB, Stretched: true, Iters: 100000
+exec time (mills): 3709
+
+Dest: 16 bpp, Gfx: 8 bpp, Blender: Opaque, Stretched: false, Iters: 100000
+exec time (mills): 1526
+
+Dest: 16 bpp, Gfx: 8 bpp, Blender: Opaque, Stretched: true, Iters: 100000
+exec time (mills): 3705
+
+Dest: 16 bpp, Gfx: 8 bpp, Blender: Tint with Light, Stretched: false, Iters: 100000
+exec time (mills): 1524
+
+Dest: 16 bpp, Gfx: 8 bpp, Blender: Tint with Light, Stretched: true, Iters: 100000
+exec time (mills): 3706
+
+Dest: 8 bpp, Gfx: 8 bpp, Blender: RGB to RGB, Stretched: false, Iters: 100000
+exec time (mills): 629
+
+Dest: 8 bpp, Gfx: 8 bpp, Blender: RGB to RGB, Stretched: true, Iters: 100000
+exec time (mills): 1546
+
+Dest: 8 bpp, Gfx: 8 bpp, Blender: Source Alpha, Stretched: false, Iters: 100000
+exec time (mills): 629
+
+Dest: 8 bpp, Gfx: 8 bpp, Blender: Source Alpha, Stretched: true, Iters: 100000
+exec time (mills): 1546
+
+Dest: 8 bpp, Gfx: 8 bpp, Blender: ARGB to ARGB, Stretched: false, Iters: 100000
+exec time (mills): 628
+
+Dest: 8 bpp, Gfx: 8 bpp, Blender: ARGB to ARGB, Stretched: true, Iters: 100000
+exec time (mills): 1547
+
+Dest: 8 bpp, Gfx: 8 bpp, Blender: Opaque, Stretched: false, Iters: 100000
+exec time (mills): 628
+
+Dest: 8 bpp, Gfx: 8 bpp, Blender: Opaque, Stretched: true, Iters: 100000
+exec time (mills): 1547
+
+Dest: 8 bpp, Gfx: 8 bpp, Blender: Tint with Light, Stretched: false, Iters: 100000
+exec time (mills): 629
+
+Dest: 8 bpp, Gfx: 8 bpp, Blender: Tint with Light, Stretched: true, Iters: 100000
+exec time (mills): 1547
+
+Engine initialization complete
+Starting game
+WARNING: channel 2 - same clip assigned
+Quitting the game...
+***** ENGINE HAS SHUTDOWN
diff --git a/bench_output_fast.txt b/bench_output_fast.txt
new file mode 100644
index 00000000000..6449b1a93c2
--- /dev/null
+++ b/bench_output_fast.txt
@@ -0,0 +1,251 @@
+User picked target 'kq2agdi' (engine ID 'ags', game ID 'kq2agdi')...
+   Looking for a plugin supporting this target... Adventure Game Studio
+Running King's Quest II: Romancing the Stones Remake (English)
+kq2vga.exe: 40cfb7563df7dacf6530b19289a4745b, 12574643 bytes.
+Initializing backend libs
+Initializing game data
+Opened game data file: game28.dta
+Game data version: 42
+Compiled with: 3.2.0
+Startup directory: ./
+Data directory: ./
+Setting up game configuration
+Voice pack found: speech.vox
+audio.vox found and initialized.
+Initializing TTF renderer
+Initializing mouse: number of buttons reported is 3
+Install timer
+Initialize legacy path finder library
+Game title: 'King's Quest II'
+Game uid (old format): `1025889151`
+Game guid: '{b85ea0b0-35c5-4e53-bfc7-2281bf481001}'
+Game GUI version: 115
+Lipsync data found and loaded
+Checking for disk space
+Game native resolution: 320 x 200 (32 bit)
+Graphic settings: driver: Software, windowed: no, screen size: 0 x 0, game scale: proportional
+Graphic settings: refresh rate (optional): 0, vsync: 0
+Requested graphics driver 'Software' not found, will try existing drivers instead
+Graphics mode set: 320 x 200 (32-bit) fullscreen desktop
+Graphics mode set: refresh rate (optional): 0, vsync: 0
+Mouse speed control: enabled, unit: 1.000000, user value: 1.000000
+Multitasking mode set: 0
+Setting up window
+Multitasking mode set: 0
+Initialize sprites
+55839744 55839760 55839776 55839792 55839808 55839824
+Dest: 32 bpp, Gfx: 32 bpp, Blender: RGB to RGB, Stretched: false, Iters: 100000
+exec time (mills): 622
+
+Dest: 32 bpp, Gfx: 32 bpp, Blender: RGB to RGB, Stretched: true, Iters: 100000
+exec time (mills): 1546
+
+Dest: 32 bpp, Gfx: 32 bpp, Blender: Source Alpha, Stretched: false, Iters: 100000
+exec time (mills): 640
+
+Dest: 32 bpp, Gfx: 32 bpp, Blender: Source Alpha, Stretched: true, Iters: 100000
+exec time (mills): 1545
+
+Dest: 32 bpp, Gfx: 32 bpp, Blender: ARGB to ARGB, Stretched: false, Iters: 100000
+exec time (mills): 639
+
+Dest: 32 bpp, Gfx: 32 bpp, Blender: ARGB to ARGB, Stretched: true, Iters: 100000
+exec time (mills): 1546
+
+Dest: 32 bpp, Gfx: 32 bpp, Blender: Opaque, Stretched: false, Iters: 100000
+exec time (mills): 640
+
+Dest: 32 bpp, Gfx: 32 bpp, Blender: Opaque, Stretched: true, Iters: 100000
+exec time (mills): 1545
+
+Dest: 32 bpp, Gfx: 32 bpp, Blender: Tint with Light, Stretched: false, Iters: 100000
+exec time (mills): 640
+
+Dest: 32 bpp, Gfx: 32 bpp, Blender: Tint with Light, Stretched: true, Iters: 100000
+exec time (mills): 1545
+
+Dest: 32 bpp, Gfx: 16 bpp, Blender: RGB to RGB, Stretched: false, Iters: 100000
+exec time (mills): 2213
+
+Dest: 32 bpp, Gfx: 16 bpp, Blender: RGB to RGB, Stretched: true, Iters: 100000
+exec time (mills): 5618
+
+Dest: 32 bpp, Gfx: 16 bpp, Blender: Source Alpha, Stretched: false, Iters: 100000
+exec time (mills): 2212
+
+Dest: 32 bpp, Gfx: 16 bpp, Blender: Source Alpha, Stretched: true, Iters: 100000
+exec time (mills): 5621
+
+Dest: 32 bpp, Gfx: 16 bpp, Blender: ARGB to ARGB, Stretched: false, Iters: 100000
+exec time (mills): 2214
+
+Dest: 32 bpp, Gfx: 16 bpp, Blender: ARGB to ARGB, Stretched: true, Iters: 100000
+exec time (mills): 5619
+
+Dest: 32 bpp, Gfx: 16 bpp, Blender: Opaque, Stretched: false, Iters: 100000
+exec time (mills): 2213
+
+Dest: 32 bpp, Gfx: 16 bpp, Blender: Opaque, Stretched: true, Iters: 100000
+exec time (mills): 5619
+
+Dest: 32 bpp, Gfx: 16 bpp, Blender: Tint with Light, Stretched: false, Iters: 100000
+exec time (mills): 2212
+
+Dest: 32 bpp, Gfx: 16 bpp, Blender: Tint with Light, Stretched: true, Iters: 100000
+exec time (mills): 5618
+
+Dest: 32 bpp, Gfx: 8 bpp, Blender: RGB to RGB, Stretched: false, Iters: 100000
+exec time (mills): 1526
+
+Dest: 32 bpp, Gfx: 8 bpp, Blender: RGB to RGB, Stretched: true, Iters: 100000
+exec time (mills): 3688
+
+Dest: 32 bpp, Gfx: 8 bpp, Blender: Source Alpha, Stretched: false, Iters: 100000
+exec time (mills): 1524
+
+Dest: 32 bpp, Gfx: 8 bpp, Blender: Source Alpha, Stretched: true, Iters: 100000
+exec time (mills): 3687
+
+Dest: 32 bpp, Gfx: 8 bpp, Blender: ARGB to ARGB, Stretched: false, Iters: 100000
+exec time (mills): 1528
+
+Dest: 32 bpp, Gfx: 8 bpp, Blender: ARGB to ARGB, Stretched: true, Iters: 100000
+exec time (mills): 3687
+
+Dest: 32 bpp, Gfx: 8 bpp, Blender: Opaque, Stretched: false, Iters: 100000
+exec time (mills): 1526
+
+Dest: 32 bpp, Gfx: 8 bpp, Blender: Opaque, Stretched: true, Iters: 100000
+exec time (mills): 3686
+
+Dest: 32 bpp, Gfx: 8 bpp, Blender: Tint with Light, Stretched: false, Iters: 100000
+exec time (mills): 1525
+
+Dest: 32 bpp, Gfx: 8 bpp, Blender: Tint with Light, Stretched: true, Iters: 100000
+exec time (mills): 3688
+
+Dest: 16 bpp, Gfx: 32 bpp, Blender: RGB to RGB, Stretched: false, Iters: 100000
+exec time (mills): 2571
+
+Dest: 16 bpp, Gfx: 32 bpp, Blender: RGB to RGB, Stretched: true, Iters: 100000
+exec time (mills): 6575
+
+Dest: 16 bpp, Gfx: 32 bpp, Blender: Source Alpha, Stretched: false, Iters: 100000
+exec time (mills): 2571
+
+Dest: 16 bpp, Gfx: 32 bpp, Blender: Source Alpha, Stretched: true, Iters: 100000
+exec time (mills): 6574
+
+Dest: 16 bpp, Gfx: 32 bpp, Blender: ARGB to ARGB, Stretched: false, Iters: 100000
+exec time (mills): 2570
+
+Dest: 16 bpp, Gfx: 32 bpp, Blender: ARGB to ARGB, Stretched: true, Iters: 100000
+exec time (mills): 6574
+
+Dest: 16 bpp, Gfx: 32 bpp, Blender: Opaque, Stretched: false, Iters: 100000
+exec time (mills): 2570
+
+Dest: 16 bpp, Gfx: 32 bpp, Blender: Opaque, Stretched: true, Iters: 100000
+exec time (mills): 6575
+
+Dest: 16 bpp, Gfx: 32 bpp, Blender: Tint with Light, Stretched: false, Iters: 100000
+exec time (mills): 2570
+
+Dest: 16 bpp, Gfx: 32 bpp, Blender: Tint with Light, Stretched: true, Iters: 100000
+exec time (mills): 6574
+
+Dest: 16 bpp, Gfx: 16 bpp, Blender: RGB to RGB, Stretched: false, Iters: 100000
+exec time (mills): 693
+
+Dest: 16 bpp, Gfx: 16 bpp, Blender: RGB to RGB, Stretched: true, Iters: 100000
+exec time (mills): 1695
+
+Dest: 16 bpp, Gfx: 16 bpp, Blender: Source Alpha, Stretched: false, Iters: 100000
+exec time (mills): 692
+
+Dest: 16 bpp, Gfx: 16 bpp, Blender: Source Alpha, Stretched: true, Iters: 100000
+exec time (mills): 1695
+
+Dest: 16 bpp, Gfx: 16 bpp, Blender: ARGB to ARGB, Stretched: false, Iters: 100000
+exec time (mills): 693
+
+Dest: 16 bpp, Gfx: 16 bpp, Blender: ARGB to ARGB, Stretched: true, Iters: 100000
+exec time (mills): 1694
+
+Dest: 16 bpp, Gfx: 16 bpp, Blender: Opaque, Stretched: false, Iters: 100000
+exec time (mills): 691
+
+Dest: 16 bpp, Gfx: 16 bpp, Blender: Opaque, Stretched: true, Iters: 100000
+exec time (mills): 1695
+
+Dest: 16 bpp, Gfx: 16 bpp, Blender: Tint with Light, Stretched: false, Iters: 100000
+exec time (mills): 692
+
+Dest: 16 bpp, Gfx: 16 bpp, Blender: Tint with Light, Stretched: true, Iters: 100000
+exec time (mills): 1695
+
+Dest: 16 bpp, Gfx: 8 bpp, Blender: RGB to RGB, Stretched: false, Iters: 100000
+exec time (mills): 1514
+
+Dest: 16 bpp, Gfx: 8 bpp, Blender: RGB to RGB, Stretched: true, Iters: 100000
+exec time (mills): 3688
+
+Dest: 16 bpp, Gfx: 8 bpp, Blender: Source Alpha, Stretched: false, Iters: 100000
+exec time (mills): 1513
+
+Dest: 16 bpp, Gfx: 8 bpp, Blender: Source Alpha, Stretched: true, Iters: 100000
+exec time (mills): 3688
+
+Dest: 16 bpp, Gfx: 8 bpp, Blender: ARGB to ARGB, Stretched: false, Iters: 100000
+exec time (mills): 1517
+
+Dest: 16 bpp, Gfx: 8 bpp, Blender: ARGB to ARGB, Stretched: true, Iters: 100000
+exec time (mills): 3687
+
+Dest: 16 bpp, Gfx: 8 bpp, Blender: Opaque, Stretched: false, Iters: 100000
+exec time (mills): 1519
+
+Dest: 16 bpp, Gfx: 8 bpp, Blender: Opaque, Stretched: true, Iters: 100000
+exec time (mills): 3688
+
+Dest: 16 bpp, Gfx: 8 bpp, Blender: Tint with Light, Stretched: false, Iters: 100000
+exec time (mills): 1512
+
+Dest: 16 bpp, Gfx: 8 bpp, Blender: Tint with Light, Stretched: true, Iters: 100000
+exec time (mills): 3688
+
+Dest: 8 bpp, Gfx: 8 bpp, Blender: RGB to RGB, Stretched: false, Iters: 100000
+exec time (mills): 625
+
+Dest: 8 bpp, Gfx: 8 bpp, Blender: RGB to RGB, Stretched: true, Iters: 100000
+exec time (mills): 1540
+
+Dest: 8 bpp, Gfx: 8 bpp, Blender: Source Alpha, Stretched: false, Iters: 100000
+exec time (mills): 626
+
+Dest: 8 bpp, Gfx: 8 bpp, Blender: Source Alpha, Stretched: true, Iters: 100000
+exec time (mills): 1540
+
+Dest: 8 bpp, Gfx: 8 bpp, Blender: ARGB to ARGB, Stretched: false, Iters: 100000
+exec time (mills): 625
+
+Dest: 8 bpp, Gfx: 8 bpp, Blender: ARGB to ARGB, Stretched: true, Iters: 100000
+exec time (mills): 1540
+
+Dest: 8 bpp, Gfx: 8 bpp, Blender: Opaque, Stretched: false, Iters: 100000
+exec time (mills): 625
+
+Dest: 8 bpp, Gfx: 8 bpp, Blender: Opaque, Stretched: true, Iters: 100000
+exec time (mills): 1539
+
+Dest: 8 bpp, Gfx: 8 bpp, Blender: Tint with Light, Stretched: false, Iters: 100000
+exec time (mills): 625
+
+Dest: 8 bpp, Gfx: 8 bpp, Blender: Tint with Light, Stretched: true, Iters: 100000
+exec time (mills): 1540
+
+Engine initialization complete
+Starting game
+WARNING: channel 2 - same clip assigned
+Quitting the game...
+***** ENGINE HAS SHUTDOWN
diff --git a/benchgfx1.bmp b/benchgfx32.bmp
similarity index 100%
rename from benchgfx1.bmp
rename to benchgfx32.bmp
diff --git a/benchgfx8.bmp b/benchgfx8.bmp
new file mode 100644
index 00000000000..aea80181e41
Binary files /dev/null and b/benchgfx8.bmp differ
diff --git a/engines/ags/engine/main/engine.cpp b/engines/ags/engine/main/engine.cpp
index 11f6182f3a7..5e09e8eb843 100644
--- a/engines/ags/engine/main/engine.cpp
+++ b/engines/ags/engine/main/engine.cpp
@@ -810,34 +810,53 @@ void allegro_bitmap_test_init() {
 	test_allegro_bitmap = nullptr;
 	// Switched the test off for now
 	//test_allegro_bitmap = AllegroBitmap::CreateBitmap(320,200,32);
-	
-	Bitmap *benchgfx132 = BitmapHelper::CreateRawBitmapOwner(load_bmp("benchgfx1.bmp", nullptr));
-	Bitmap *benchgfx1 = BitmapHelper::CreateBitmapCopy(benchgfx132, 16);
-	Bitmap *dest = BitmapHelper::CreateBitmap(100, 100, 16);
-	uint64_t bench_runs[] = {1000, 10000, 100000};
-	if (benchgfx1 != nullptr) {
-		_G(_blender_mode) = kRgbToRgbBlender; // Using normal blender mode
-		for (long unsigned int i = 0; i < sizeof(bench_runs)/sizeof(uint64_t); i++) {
-			Debug::Printf(kDbgMsg_Info, "Starting Allegro Bitmap Test Bench 2 (%d bpp)", benchgfx1->GetColorDepth());
-			uint32_t start = std::chrono::high_resolution_clock::now();
-			for (uint64_t j = 0; j < bench_runs[i]; j++) {
-				dest->StretchBlt(benchgfx1, Rect(0, 0, 90, 90), kBitmap_Transparency);
-				//dest->Blit(benchgfx1, 0, 0, kBitmap_Transparency);
+
+	return;
+
+	PALETTE gfx8pal;
+	Bitmap *benchgfx32 = BitmapHelper::CreateRawBitmapOwner(load_bmp("benchgfx32.bmp", nullptr));
+	Bitmap *benchgfx16 = BitmapHelper::CreateBitmapCopy(benchgfx32, 16);
+	Bitmap *benchgfx8 = BitmapHelper::CreateRawBitmapOwner(load_bmp("benchgfx8.bmp", gfx8pal));
+	Bitmap *dest32 = BitmapHelper::CreateBitmap(100, 100, 32);
+	Bitmap *dest16 = BitmapHelper::CreateBitmap(100, 100, 16);
+	Bitmap *dest8 = BitmapHelper::CreateBitmap(100, 100, 8);
+	Debug::Printf(kDbgMsg_Info, "%d %d %d %d %d %d", benchgfx32, benchgfx16, benchgfx8, dest32, dest16, dest8);
+	int benchRuns[] = {1000, 10000, 100000};
+	int blenderModes[] = {kRgbToRgbBlender, kSourceAlphaBlender, kArgbToArgbBlender, kOpaqueBlenderMode, kTintLightBlenderMode};
+	const char *modeNames[] = {"RGB to RGB", "Source Alpha", "ARGB to ARGB", "Opaque", "Tint with Light"};
+	Bitmap *destinations[] = {dest32, dest16, dest8};
+	Bitmap *graphics[] = {benchgfx32, benchgfx16, benchgfx8};
+	int bpps[] = {32, 16, 8};
+	for (int dest = 0; dest < 3; dest++) {
+		for (int gfx = 0; gfx < 3; gfx++) {
+			if (dest == 2 && gfx != 2) continue;
+			for (int mode = 0; mode < sizeof(blenderModes) / sizeof(int); mode++) {
+				for (int runs = 0; runs < sizeof(benchRuns)/sizeof(int); runs++) {
+					uint32 start, end;
+					_G(_blender_mode) = (AGS3::BlenderMode)blenderModes[mode];
+					if (runs == 2) Debug::Printf(kDbgMsg_Info, "Dest: %d bpp, Gfx: %d bpp, Blender: %s, Stretched: false, Iters: %d", bpps[dest], bpps[gfx], modeNames[mode], benchRuns[runs]);
+					start = std::chrono::high_resolution_clock::now();
+					for (int i = 0; i < benchRuns[runs]; i++)
+						destinations[dest]->Blit(graphics[gfx], 0, 0, kBitmap_Transparency);
+					end = std::chrono::high_resolution_clock::now();
+					if (runs == 2) Debug::Printf(kDbgMsg_Info, "exec time (mills): %u\n", end - start);
+					if (runs == 2) Debug::Printf(kDbgMsg_Info, "Dest: %d bpp, Gfx: %d bpp, Blender: %s, Stretched: true, Iters: %d", bpps[dest], bpps[gfx], modeNames[mode], benchRuns[runs]);
+					start = std::chrono::high_resolution_clock::now();
+					for (int i = 0; i < benchRuns[runs]; i++)
+						destinations[dest]->StretchBlt(graphics[gfx], Rect(0, 0, 99, 99), kBitmap_Transparency);
+					end = std::chrono::high_resolution_clock::now();
+					if (runs == 2) Debug::Printf(kDbgMsg_Info, "exec time (mills): %u\n", end - start);
+				}
 			}
-			uint32_t end = std::chrono::high_resolution_clock::now();
-			Debug::Printf(kDbgMsg_Info, "Done! Results (%llu iterations):", bench_runs[i]);
-			Debug::Printf(kDbgMsg_Info, "exec time (mills): %u", end - start);
 		}
-
-		dest->Clear();
-		dest->StretchBlt(benchgfx1, Rect(0, 0, 19, 19), kBitmap_Transparency);
-		dest->SaveToFile("benchgfx1result1.bmp", NULL);
-		
-		delete benchgfx1;
-		delete dest;
-	} else {
-		warning("Couldn't load the test bench graphics!");
 	}
+	
+	delete benchgfx32;
+	delete benchgfx16;
+	delete benchgfx8;
+	delete dest32;
+	delete dest16;
+	delete dest8;
 }
 
 // Define location of the game data either using direct settings or searching
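
The timing pattern in the benchmark harness above is a start/stop pair around the blit calls. As a self-contained sketch of the same idea (illustrative helper name, not engine code), measuring an arbitrary callable in whole milliseconds with <chrono> could look like this:

    #include <chrono>
    #include <cstdint>

    // Run fn() `iterations` times and return elapsed wall-clock time in
    // milliseconds. Purely illustrative; the engine reports its numbers
    // through Debug::Printf instead.
    template<typename Fn>
    static int64_t timeMillis(Fn &&fn, int iterations) {
        const auto start = std::chrono::high_resolution_clock::now();
        for (int i = 0; i < iterations; ++i)
            fn();
        const auto end = std::chrono::high_resolution_clock::now();
        return std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
    }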
diff --git a/engines/ags/lib/allegro/surface.cpp b/engines/ags/lib/allegro/surface.cpp
index c7285370931..2d83ddea510 100644
--- a/engines/ags/lib/allegro/surface.cpp
+++ b/engines/ags/lib/allegro/surface.cpp
@@ -27,6 +27,8 @@
 #include "common/textconsole.h"
 #include "graphics/screen.h"
 
+//#define WYATTOPT
+
 namespace AGS3 {
 
 BITMAP::BITMAP(Graphics::ManagedSurface *owner) : _owner(owner),
@@ -162,6 +164,7 @@ void BITMAP::draw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
 	int xStart = (dstRect.left < destRect.left) ? dstRect.left - destRect.left : 0;
 	int yStart = (dstRect.top < destRect.top) ? dstRect.top - destRect.top : 0;
 
+#ifdef WYATTOPT
 #define DRAWINNER(func) func(yStart, xStart, transColor, alphaMask, palette, useTint, sameFormat, src, destArea, horizFlip, vertFlip, skipTrans, srcAlpha, tintRed, tintGreen, tintBlue, dstRect, srcArea, _G(_blender_mode), 0, 0)
 	if (sameFormat) {
 		switch (format.bytesPerPixel) {
@@ -177,6 +180,86 @@ void BITMAP::draw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
 		DRAWINNER(drawInnerGeneric<0>);
 	}
 #undef DRAWINNER
+#else
+	byte rSrc, gSrc, bSrc, aSrc;
+	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
+	const int xDir = horizFlip ? -1 : 1;
+	for (int destY = yStart, yCtr = 0; yCtr < dstRect.height(); ++destY, ++yCtr) {
+		if (destY < 0 || destY >= destArea.h)
+			continue;
+		byte *destP = (byte *)destArea.getBasePtr(0, destY);
+		const byte *srcP = (const byte *)src.getBasePtr(
+		                       horizFlip ? srcArea.right - 1 : srcArea.left,
+		                       vertFlip ? srcArea.bottom - 1 - yCtr :
+		                       srcArea.top + yCtr);
+
+		// Loop through the pixels of the row
+		for (int destX = xStart, xCtr = 0, xCtrBpp = 0; xCtr < dstRect.width(); ++destX, ++xCtr, xCtrBpp += src.format.bytesPerPixel) {
+			if (destX < 0 || destX >= destArea.w)
+				continue;
+
+			const byte *srcVal = srcP + xDir * xCtrBpp;
+			uint32 srcCol = getColor(srcVal, src.format.bytesPerPixel);
+
+			// Check if this is a transparent color we should skip
+			if (skipTrans && ((srcCol & alphaMask) == transColor))
+				continue;
+
+			byte *destVal = (byte *)&destP[destX * format.bytesPerPixel];
+
+			// When blitting to the same format we can just copy the color
+			if (format.bytesPerPixel == 1) {
+				*destVal = srcCol;
+				continue;
+			} else if (sameFormat && srcAlpha == -1) {
+				if (format.bytesPerPixel == 4)
+					*(uint32 *)destVal = srcCol;
+				else
+					*(uint16 *)destVal = srcCol;
+				continue;
+			}
+
+			// We need the rgb values to do blending and/or convert between formats
+			if (src.format.bytesPerPixel == 1) {
+				const RGB &rgb = palette[srcCol];
+				aSrc = 0xff;
+				rSrc = rgb.r;
+				gSrc = rgb.g;
+				bSrc = rgb.b;
+			} else
+				src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
+
+			if (srcAlpha == -1) {
+				// This means we don't use blending.
+				aDest = aSrc;
+				rDest = rSrc;
+				gDest = gSrc;
+				bDest = bSrc;
+			} else {
+				if (useTint) {
+					rDest = rSrc;
+					gDest = gSrc;
+					bDest = bSrc;
+					aDest = aSrc;
+					rSrc = tintRed;
+					gSrc = tintGreen;
+					bSrc = tintBlue;
+					aSrc = srcAlpha;
+				} else {
+					// TODO: move this to blendPixel to only do it when needed?
+					format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
+				}
+				blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, srcAlpha);
+			}
+
+			uint32 pixel = format.ARGBToColor(aDest, rDest, gDest, bDest);
+			if (format.bytesPerPixel == 4)
+				*(uint32 *)destVal = pixel;
+			else
+				*(uint16 *)destVal = pixel;
+		}
+	}
+#endif
 }
 
 void BITMAP::stretchDraw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
@@ -226,6 +309,7 @@ void BITMAP::stretchDraw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
 	int xStart = (dstRect.left < destRect.left) ? dstRect.left - destRect.left : 0;
 	int yStart = (dstRect.top < destRect.top) ? dstRect.top - destRect.top : 0;
 
+#ifdef WYATTOPT
 #define DRAWINNER(func) func(yStart, xStart, transColor, alphaMask, palette, 0, sameFormat, src, destArea, false, false, skipTrans, srcAlpha, 0, 0, 0, dstRect, srcRect, _G(_blender_mode), scaleX, scaleY)
 	if (sameFormat) {
 		switch (format.bytesPerPixel) {
@@ -241,6 +325,74 @@ void BITMAP::stretchDraw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
 		DRAWINNER(drawInnerGeneric<SCALE_THRESHOLD>);
 	}
 #undef DRAWINNER
+#else
+	byte rSrc, gSrc, bSrc, aSrc;
+	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
+	for (int destY = yStart, yCtr = 0, scaleYCtr = 0; yCtr < dstRect.height();
+	        ++destY, ++yCtr, scaleYCtr += scaleY) {
+		if (destY < 0 || destY >= destArea.h)
+			continue;
+		byte *destP = (byte *)destArea.getBasePtr(0, destY);
+		const byte *srcP = (const byte *)src.getBasePtr(
+		                       srcRect.left, srcRect.top + scaleYCtr / SCALE_THRESHOLD);
+
+		// Loop through the pixels of the row
+		for (int destX = xStart, xCtr = 0, scaleXCtr = 0; xCtr < dstRect.width();
+		        ++destX, ++xCtr, scaleXCtr += scaleX) {
+			if (destX < 0 || destX >= destArea.w)
+				continue;
+
+			const byte *srcVal = srcP + scaleXCtr / SCALE_THRESHOLD * src.format.bytesPerPixel;
+			uint32 srcCol = getColor(srcVal, src.format.bytesPerPixel);
+
+			// Check if this is a transparent color we should skip
+			if (skipTrans && ((srcCol & alphaMask) == transColor))
+				continue;
+
+			byte *destVal = (byte *)&destP[destX * format.bytesPerPixel];
+
+			// When blitting to the same format we can just copy the color
+			if (format.bytesPerPixel == 1) {
+				*destVal = srcCol;
+				continue;
+			} else if (sameFormat && srcAlpha == -1) {
+				if (format.bytesPerPixel == 4)
+					*(uint32 *)destVal = srcCol;
+				else
+					*(uint16 *)destVal = srcCol;
+				continue;
+			}
+
+			// We need the rgb values to do blending and/or convert between formats
+			if (src.format.bytesPerPixel == 1) {
+				const RGB &rgb = palette[srcCol];
+				aSrc = 0xff;
+				rSrc = rgb.r;
+				gSrc = rgb.g;
+				bSrc = rgb.b;
+			} else
+				src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
+
+			if (srcAlpha == -1) {
+				// This means we don't use blending.
+				aDest = aSrc;
+				rDest = rSrc;
+				gDest = gSrc;
+				bDest = bSrc;
+			} else {
+				// TODO: move this to blendPixel to only do it when needed?
+				format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
+				blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, srcAlpha);
+			}
+
+			uint32 pixel = format.ARGBToColor(aDest, rDest, gDest, bDest);
+			if (format.bytesPerPixel == 4)
+				*(uint32 *)destVal = pixel;
+			else
+				*(uint16 *)destVal = pixel;
+		}
+	}
+#endif
 }
 
 void BITMAP::blendPixel(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &aDest, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha, bool useTint, byte *destVal) const {
@@ -284,6 +436,42 @@ void BITMAP::blendPixel(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &a
 		break;
 	}
 }
+#ifndef WYATTOPT
+void BITMAP::blendPixel(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &aDest, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha) const {
+	switch (_G(_blender_mode)) {
+	case kSourceAlphaBlender:
+		blendSourceAlpha(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, alpha);
+		break;
+	case kArgbToArgbBlender:
+		blendArgbToArgb(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, alpha);
+		break;
+	case kArgbToRgbBlender:
+		blendArgbToRgb(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, alpha);
+		break;
+	case kRgbToArgbBlender:
+		blendRgbToArgb(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, alpha);
+		break;
+	case kRgbToRgbBlender:
+		blendRgbToRgb(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, alpha);
+		break;
+	case kAlphaPreservedBlenderMode:
+		blendPreserveAlpha(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, alpha);
+		break;
+	case kOpaqueBlenderMode:
+		blendOpaque(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, alpha);
+		break;
+	case kAdditiveBlenderMode:
+		blendAdditiveAlpha(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, alpha);
+		break;
+	case kTintBlenderMode:
+		blendTintSprite(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, alpha, false);
+		break;
+	case kTintLightBlenderMode:
+		blendTintSprite(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, alpha, true);
+		break;
+	}
+}
+#endif
 
 uint32x4_t BITMAP::blendPixelSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas) const {
 	uint32x4_t srcAlphas, difAlphas, mask, ch1, ch2;
diff --git a/engines/ags/lib/allegro/surface.h b/engines/ags/lib/allegro/surface.h
index 323f7737fca..31e67300e00 100644
--- a/engines/ags/lib/allegro/surface.h
+++ b/engines/ags/lib/allegro/surface.h
@@ -31,6 +31,7 @@
 // M1/M2 SIMD intrinsics
 #include "arm_neon.h"
 #endif
+//#define WYATTOPT
 
 namespace AGS3 {
 
@@ -139,7 +140,9 @@ public:
 	void blendPixel(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &aDest, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha, bool useTint, byte *destVal) const;
 	uint32x4_t blendPixelSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas) const;
 	uint16x8_t blendPixelSIMD2Bpp(uint16x8_t srcCols, uint16x8_t destCols, uint16x8_t alphas) const;
-
+#ifndef WYATTOPT
+	void blendPixel(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &aDest, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha) const;
+#endif
 
 	inline void rgbBlend(uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha) const {
 		// Note: the original's handling varies slightly for R & B vs G.
@@ -162,6 +165,7 @@ public:
 	}
 
 	inline uint16x8_t rgbBlendSIMD2Bpp(uint16x8_t srcCols, uint16x8_t destCols, uint16x8_t alphas) const {
+		alphas = vaddq_u16(alphas, vandq_u16(vceqq_u16(alphas, vmovq_n_u16(0)), vmovq_n_u16(1)));
 		uint16x8_t srcComps[] = {
 			vandq_u16(srcCols, vmovq_n_u16(0x1f)),
 			vandq_u16(vshrq_n_u16(srcCols, 5), vmovq_n_u16(0x3f)),
@@ -187,6 +191,7 @@ public:
 	}
 
 	inline uint32x4_t rgbBlendSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas, bool preserveAlpha) const {
+		alphas = vaddq_u32(alphas, vandq_u32(vceqq_u32(alphas, vmovq_n_u32(0)), vmovq_n_u32(1)));
 		uint32x4_t alpha = vandq_u32(destCols, vmovq_n_u32(0xff000000));
 		uint32x4_t srcColsCopy = srcCols;
 		srcColsCopy = vandq_u32(srcColsCopy, vmovq_n_u32(0xff00ff));
@@ -242,7 +247,7 @@ public:
 		sAlphas = vmul_n_f16(sAlphas, 1.0 / 255.0);
 		float16x8_t sAlphas1 = vcombine_f16(vmov_n_f16(vduph_lane_f16(sAlphas, 0)), vmov_n_f16(vduph_lane_f16(sAlphas, 1)));
 		float16x8_t sAlphas2 = vcombine_f16(vmov_n_f16(vduph_lane_f16(sAlphas, 2)), vmov_n_f16(vduph_lane_f16(sAlphas, 3)));
-		float16x4_t dAlphas = vcvt_f16_f32(vcvtq_f32_u32(vshrq_n_u32(srcCols, 24)));
+		float16x4_t dAlphas = vcvt_f16_f32(vcvtq_f32_u32(vshrq_n_u32(destCols, 24)));
 		dAlphas = vmul_n_f16(dAlphas, 1.0 / 255.0);
 		dAlphas = vmul_f16(dAlphas, vsub_f16(vmov_n_f16(1.0), sAlphas));
 		float16x8_t dAlphas1 = vcombine_f16(vmov_n_f16(vduph_lane_f16(dAlphas, 0)), vmov_n_f16(vduph_lane_f16(dAlphas, 1)));
@@ -262,11 +267,11 @@ public:
 		alphasRec = vrecpeq_f16(vaddq_f16(sAlphas2, dAlphas2));
 		srcRgb2 = vmulq_f16(srcRgb2, alphasRec);
 		uint16x4_t alphas = vcvta_u16_f16(vmul_n_f16(vadd_f16(sAlphas, dAlphas), 255.0));
-		srcRgb1 = vcopyq_lane_u16(srcRgb1, 0, alphas, 0);
-		srcRgb1 = vcopyq_lane_u16(srcRgb1, 4, alphas, 1);
-		srcRgb2 = vcopyq_lane_u16(srcRgb2, 0, alphas, 2);
-		srcRgb2 = vcopyq_lane_u16(srcRgb2, 4, alphas, 3);
-		return vcombine_u32(vreinterpret_u32_u8(vmovn_u16(srcRgb1)), vreinterpret_u32_u8(vmovn_u16(srcRgb2)));
+		srcRgb1 = vcopyq_lane_u16(srcRgb1, 3, alphas, 0);
+		srcRgb1 = vcopyq_lane_u16(srcRgb1, 7, alphas, 1);
+		srcRgb2 = vcopyq_lane_u16(srcRgb2, 3, alphas, 2);
+		srcRgb2 = vcopyq_lane_u16(srcRgb2, 7, alphas, 3);
+		return vcombine_u32(vreinterpret_u32_u8(vmovn_u16(vcvtq_u16_f16(srcRgb1))), vreinterpret_u32_u8(vmovn_u16(vcvtq_u16_f16(srcRgb2))));
 	}
 
 	// kRgbToRgbBlender
@@ -448,13 +453,7 @@ public:
 		tint = vorrq_u32(tint, vdupq_n_u32(tintBlue));
 		uint32x4_t maskedAlphas = vld1q_dup_u32(&alphaMask);
 		uint32x4_t transColors = vld1q_dup_u32(&transColor);
-		int rgbCorrectedAlpha = srcAlpha;
-		if (blenderMode != kRgbToArgbBlender && blenderMode != kTintBlenderMode &&
-			blenderMode != kTintLightBlenderMode && blenderMode != kOpaqueBlenderMode &&
-			blenderMode != kArgbToRgbBlender) {
-			rgbCorrectedAlpha += !!srcAlpha;
-		}
-		uint32x4_t alphas = vld1q_dup_u32(&rgbCorrectedAlpha);
+		uint32x4_t alphas = vld1q_dup_u32(&srcAlpha);
 		uint32x4_t addIndexes = {0, 1, 2, 3};
 		if (horizFlip) addIndexes = {3, 2, 1, 0};
 		uint32x4_t scaleAdds = {0, (uint32)scaleX, (uint32)scaleX*2, (uint32)scaleX*3};
@@ -585,11 +584,7 @@ public:
 		byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
 		uint16x8_t tint = vdupq_n_u16(src.format.ARGBToColor(srcAlpha, tintRed, tintGreen, tintBlue));
 		uint16x8_t transColors = vdupq_n_u16(transColor);
-		int rgbCorrectedAlpha = srcAlpha;
-		if (blenderMode != kTintBlenderMode && blenderMode != kTintLightBlenderMode) {
-			rgbCorrectedAlpha += !!srcAlpha;
-		}
-		uint16x8_t alphas = vdupq_n_u16(rgbCorrectedAlpha);
+		uint16x8_t alphas = vdupq_n_u16(srcAlpha);
 		uint16x8_t addIndexes = {0, 1, 2, 3, 4, 5, 6, 7};
 		if (horizFlip) addIndexes = {7, 6, 5, 4, 3, 2, 1, 0};
 		uint32x4_t scaleAdds = {0, (uint32)scaleX, (uint32)scaleX*2, (uint32)scaleX*3};


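The surface.h hunks above drop the per-call `rgbCorrectedAlpha` bookkeeping and instead handle the zero-alpha case inside `rgbBlendSIMD`/`rgbBlendSIMD2Bpp`: a compare-and-add bumps any lane whose alpha is exactly 0 up to 1 before blending, and the float16 ARGB blend helper now takes the destination alpha from `destCols` and writes the combined alpha into the alpha lane of each pixel. A minimal scalar sketch of the zero-alpha adjustment (illustration only; the engine's NEON version works on four or eight lanes per vector):

// Scalar equivalent of
//   alphas = vaddq_u32(alphas, vandq_u32(vceqq_u32(alphas, vmovq_n_u32(0)), vmovq_n_u32(1)));
// A true comparison contributes 1, so an alpha of 0 becomes 1 and the blend
// never multiplies the source by a flat zero. Not engine code.
#include <cstdint>
#include <cstdio>

static inline uint32_t correctZeroAlpha(uint32_t alpha) {
	return alpha + (uint32_t)(alpha == 0);
}

int main() {
	const uint32_t samples[] = {0, 1, 128, 255};
	for (uint32_t a : samples)
		std::printf("%u -> %u\n", a, correctZeroAlpha(a));
	return 0;
}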
Commit: 4ad7a30b38f928d29c8a1785bab9f317ad1af651
    https://github.com/scummvm/scummvm/commit/4ad7a30b38f928d29c8a1785bab9f317ad1af651
Author: Wyatt Radkiewicz (wyattradkiewicz at wyatts-air.boisestate.edu)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: Just fixed kTintBlenderMode

Changed paths:
    engines/ags/engine/main/engine.cpp
    engines/ags/lib/allegro/surface.cpp
    engines/ags/lib/allegro/surface.h


diff --git a/engines/ags/engine/main/engine.cpp b/engines/ags/engine/main/engine.cpp
index 5e09e8eb843..d741e741190 100644
--- a/engines/ags/engine/main/engine.cpp
+++ b/engines/ags/engine/main/engine.cpp
@@ -811,7 +811,13 @@ void allegro_bitmap_test_init() {
 	// Switched the test off for now
 	//test_allegro_bitmap = AllegroBitmap::CreateBitmap(320,200,32);
 
-	return;
+	//Bitmap *testgfx = BitmapHelper::CreateRawBitmapOwner(load_bmp("benchgfx32.bmp", nullptr));
+	//Bitmap *destgfx = BitmapHelper::CreateBitmap(testgfx->GetWidth(), testgfx->GetHeight(), testgfx->GetBPP()*8);
+	//set_blender_mode(kTintBlenderMode, 0, 0, 255, 255);
+	//destgfx->LitBlendBlt(testgfx, 0, 0, 100);
+	//destgfx->SaveToFile("tint_result.bmp", nullptr);
+
+	return; // Normal benchmark below
 
 	PALETTE gfx8pal;
 	Bitmap *benchgfx32 = BitmapHelper::CreateRawBitmapOwner(load_bmp("benchgfx32.bmp", nullptr));
diff --git a/engines/ags/lib/allegro/surface.cpp b/engines/ags/lib/allegro/surface.cpp
index 2d83ddea510..87b6fc7ae70 100644
--- a/engines/ags/lib/allegro/surface.cpp
+++ b/engines/ags/lib/allegro/surface.cpp
@@ -27,7 +27,7 @@
 #include "common/textconsole.h"
 #include "graphics/screen.h"
 
-//#define WYATTOPT
+#define WYATTOPT
 
 namespace AGS3 {
 
@@ -585,7 +585,7 @@ uint32x4_t BITMAP::blendTintSpriteSIMD(uint32x4_t srcCols, uint32x4_t destCols,
 	// ddr = { R[0], R[1], R[2], R[3] }
 	// ddg = { G[0], G[1], G[2], G[3] }
 	// ddb = { B[0], B[1], B[2], B[3] }
-	
+
 	float32x4_t ddr, ddg, ddb;
 	ddr = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(vshrq_n_u32(destCols, 16), vmovq_n_u32(0xff))), 1.0 / 255.0);
 	ddg = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(vshrq_n_u32(destCols, 8), vmovq_n_u32(0xff))), 1.0 / 255.0);
@@ -599,23 +599,19 @@ uint32x4_t BITMAP::blendTintSpriteSIMD(uint32x4_t srcCols, uint32x4_t destCols,
 	//float32x4_t dmins = vminq_f32(ddr, vminq_f32(ddg, ddb));
 	float32x4_t smins = vminq_f32(ssr, vminq_f32(ssg, ssb));
 	//float32x4_t ddelta = vsubq_f32(dmaxes, dmins);
-	float32x4_t sdelta = vsubq_f32(smaxes, smins);
-
-	float32x4_t quotient, product, hr, hg, hb, hue, sat;
-	hr = vdivq_f32(vsubq_f32(ssg, ssb), sdelta);
-	quotient = vdivq_f32(hr, vmovq_n_f32(6.0));
-	product = vmulq_n_f32(quotient, 6.0);
-	hr = vmulq_n_f32(vsubq_f32(hr, product), 60.0);
-	hg = vaddq_f32(vdivq_f32(vsubq_f32(ssb, ssr), sdelta), vmovq_n_f32(2.0));
-	hb = vaddq_f32(vdivq_f32(vsubq_f32(ssr, ssg), sdelta), vmovq_n_f32(4.0));
-	float32x4_t hrfactors = vcvtnq_u32_f32(vandq_u32(vceqq_u32(vreinterpretq_u32_f32(ssr), vreinterpretq_u32_f32(smaxes)), vmovq_n_u32(1)));
-	float32x4_t hgfactors = vcvtnq_u32_f32(vandq_u32(vceqq_u32(vreinterpretq_u32_f32(ssg), vreinterpretq_u32_f32(smaxes)), vmovq_n_u32(1)));
-	float32x4_t hbfactors = vcvtnq_u32_f32(vandq_u32(vceqq_u32(vreinterpretq_u32_f32(ssb), vreinterpretq_u32_f32(smaxes)), vmovq_n_u32(1)));
+	float32x4_t chroma = vsubq_f32(smaxes, smins);
+
+	float32x4_t hr, hg, hb, hue;
+	hr = vdivq_f32(vsubq_f32(ssg, ssb), chroma);
+	hr = vsubq_f32(hr, vmulq_n_f32(vrndmq_f32(vmulq_n_f32(hr, 1.0 / 6.0)), 6.0));
+	hg = vaddq_f32(vdivq_f32(vsubq_f32(ssb, ssr), chroma), vmovq_n_f32(2.0));
+	hb = vaddq_f32(vdivq_f32(vsubq_f32(ssr, ssg), chroma), vmovq_n_f32(4.0));
+	float32x4_t hrfactors = vcvtq_f32_u32(vandq_u32(vceqq_f32(ssr, smaxes), vmovq_n_u32(1)));
+	float32x4_t hgfactors = vcvtq_f32_u32(vandq_u32(vceqq_f32(ssg, smaxes), vmovq_n_u32(1)));
+	float32x4_t hbfactors = vcvtq_f32_u32(vandq_u32(vceqq_f32(ssb, smaxes), vmovq_n_u32(1)));
 	hue = vmulq_f32(hr, hrfactors);
 	hue = vaddq_f32(hue, vmulq_f32(hg, hgfactors));
 	hue = vaddq_f32(hue, vmulq_f32(hb, hbfactors));
-	float32x4_t satfactors = vcvtnq_u32_f32(vandq_u32(vceqq_u32(vreinterpretq_u32_f32(smaxes), vmovq_n_f32(0.0)), vmovq_n_u32(1)));
-	sat = vmulq_f32(satfactors, vdivq_f32(sdelta, smaxes));
 
 	// Mess with the light
 	float32x4_t val = dmaxes;
@@ -625,35 +621,32 @@ uint32x4_t BITMAP::blendTintSpriteSIMD(uint32x4_t srcCols, uint32x4_t destCols,
 	}
 		
 	// then it stiches them back together
-	float32x4_t hp = vmulq_n_f32(hue, 1.0 / 60.0);
-	uint32x4_t hpi = vcvtq_u32_f32(hp);
-	val = vmulq_n_f32(val, 255.0);
-	uint32x4_t x = vcvtq_u32_f32(vmulq_f32(val, sat));
-	uint32x4_t y = vcvtq_u32_f32(vmulq_f32(x, vsubq_f32(hue, vrndq_f32(hue))));
-	val = vaddq_f32(val, vmovq_n_f32(0.5));
-	uint32x4_t z = vcvtq_u32_f32(vsubq_f32(val, x));
-	uint32x4_t v = vcvtq_u32_f32(val);
-	
-	uint32x4_t c0 = vorrq_u32(z, vorrq_u32(vshlq_n_u32(v, 16), vshlq_n_u32(vaddq_u32(z, y), 8)));
-	uint32x4_t m0 = vceqq_u32(hpi, vmovq_n_u32(0));
-	uint32x4_t c1 = vorrq_u32(z, vorrq_u32(vshlq_n_u32(v, 8), vshlq_n_u32(vsubq_u32(v, y), 16)));
-	uint32x4_t m1 = vceqq_u32(hpi, vmovq_n_u32(1));
-	uint32x4_t c2 = vorrq_u32(vshlq_n_u32(z, 16), vorrq_u32(vshlq_n_u32(v, 8), vaddq_u32(z, y)));
-	uint32x4_t m2 = vceqq_u32(hpi, vmovq_n_u32(2));
-	uint32x4_t c3 = vorrq_u32(v, vorrq_u32(vshlq_n_u32(z, 16), vshlq_n_u32(vsubq_u32(v, y), 8)));
-	uint32x4_t m3 = vceqq_u32(hpi, vmovq_n_u32(3));
-	uint32x4_t c4 = vorrq_u32(v, vorrq_u32(vshlq_n_u32(z, 8), vshlq_n_u32(vaddq_u32(z, y), 16)));
-	uint32x4_t m4 = vceqq_u32(hpi, vmovq_n_u32(4));
-	uint32x4_t c5 = vorrq_u32(vshlq_n_u32(v, 16), vorrq_u32(vshlq_n_u32(z, 8), vsubq_u32(v, y)));
-	uint32x4_t m5 = vceqq_u32(hpi, vmovq_n_u32(5));
-
-	uint32x4_t final = vandq_u32(c0, m0);
-	final = vorrq_u32(final, vandq_u32(c1, m1));
-	final = vorrq_u32(final, vandq_u32(c2, m2));
-	final = vorrq_u32(final, vandq_u32(c3, m3));
-	final = vorrq_u32(final, vandq_u32(c4, m4));
-	final = vorrq_u32(final, vandq_u32(c5, m5));
-	final = vorrq_u32(final, vandq_u32(destCols, vmovq_n_u32(0xff000000)));
+	//AGS3::Shared::Debug::Printf(AGS3::Shared::kDbgMsg_Info, "hues: %f", vgetq_lane_f32(hue, 0));
+	chroma = vsubq_f32(val, smins);
+	float32x4_t hprime_mod2 = vmulq_n_f32(hue, 1.0 / 2.0);
+	hprime_mod2 = vmulq_n_f32(vsubq_f32(hprime_mod2, vrndmq_f32(hprime_mod2)), 2.0);
+	float32x4_t x = vmulq_f32(chroma, vsubq_f32(vmovq_n_f32(1.0), vabsq_f32(vsubq_f32(hprime_mod2, vmovq_n_f32(1.0)))));
+	uint32x4_t hprime_rounded = vcvtq_u32_f32(hue);
+	uint32x4_t x_int = vcvtq_u32_f32(vmulq_n_f32(x, 255.0));
+	uint32x4_t c_int = vcvtq_u32_f32(vmulq_n_f32(chroma, 255.0));
+
+	uint32x4_t val0 = vorrq_u32(vshlq_n_u32(x_int, 8), vshlq_n_u32(c_int, 16));
+	val0 = vandq_u32(val0, vceqq_u32(hprime_rounded, vmovq_n_u32(0)));
+	uint32x4_t val1 = vorrq_u32(vshlq_n_u32(c_int, 8), vshlq_n_u32(x_int, 16));
+	val1 = vandq_u32(val1, vceqq_u32(hprime_rounded, vmovq_n_u32(1)));
+	uint32x4_t val2 = vorrq_u32(vshlq_n_u32(c_int, 8), x_int);
+	val2 = vandq_u32(val2, vceqq_u32(hprime_rounded, vmovq_n_u32(2)));
+	uint32x4_t val3 = vorrq_u32(vshlq_n_u32(x_int, 8), c_int);
+	val3 = vandq_u32(val3, vceqq_u32(hprime_rounded, vmovq_n_u32(3)));
+	uint32x4_t val4 = vorrq_u32(vshlq_n_u32(x_int, 16), c_int);
+	val4 = vandq_u32(val4, vceqq_u32(hprime_rounded, vmovq_n_u32(4)));
+	uint32x4_t val5 = vorrq_u32(vshlq_n_u32(c_int, 16), x_int);
+	val5 = vandq_u32(val5, vceqq_u32(hprime_rounded, vmovq_n_u32(5)));
+
+	uint32x4_t final = vorrq_u32(val0, vorrq_u32(val1, vorrq_u32(val2, vorrq_u32(val3, vorrq_u32(val4, val5)))));
+	uint32x4_t val_add = vcvtq_u32_f32(vmulq_n_f32(vsubq_f32(val, chroma), 255.0));
+	val_add = vorrq_u32(val_add, vorrq_u32(vshlq_n_u32(val_add, 8), vorrq_u32(vshlq_n_u32(val_add, 16), vandq_u32(destCols, vmovq_n_u32(0xff000000)))));
+	final = vaddq_u32(final, val_add);
 	return final;
 }
 
diff --git a/engines/ags/lib/allegro/surface.h b/engines/ags/lib/allegro/surface.h
index 31e67300e00..481b728506f 100644
--- a/engines/ags/lib/allegro/surface.h
+++ b/engines/ags/lib/allegro/surface.h
@@ -31,7 +31,7 @@
 // M1/M2 SIMD intrinsics
 #include "arm_neon.h"
 #endif
-//#define WYATTOPT
+#define WYATTOPT
 
 namespace AGS3 {
 


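The blendTintSpriteSIMD rewrite above computes the hue of each source pixel from its max/min channels, keeps the brightness from the destination pixel, and rebuilds RGB from the hue sector, the `val - smin` chroma and the `val - chroma` offset, instead of the earlier saturation-based path. A scalar sketch of that hue-preserving tint for a single 8-bit-per-channel pixel (illustration only; the NEON version processes four pixels at once and also folds in the destination alpha and the optional light adjustment):

// Take hue from the source colour, brightness from the destination, and
// rebuild RGB from the hue sector, mirroring the NEON code above.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

static uint32_t tintPixel(uint8_t sr, uint8_t sg, uint8_t sb, uint8_t dr, uint8_t dg, uint8_t db) {
	float r = sr / 255.0f, g = sg / 255.0f, b = sb / 255.0f;
	float maxc = std::max({r, g, b}), minc = std::min({r, g, b});
	float srcChroma = maxc - minc;
	float hue = 0.0f; // hue' in sectors of 60 degrees, i.e. in [0, 6)
	if (srcChroma > 0.0f) {
		if (maxc == r)      hue = std::fmod((g - b) / srcChroma + 6.0f, 6.0f);
		else if (maxc == g) hue = (b - r) / srcChroma + 2.0f;
		else                hue = (r - g) / srcChroma + 4.0f;
	}
	float val = std::max({dr, dg, db}) / 255.0f; // brightness taken from the destination
	float c = val - minc;                        // chroma rescaled to the new value, as above
	float x = c * (1.0f - std::fabs(std::fmod(hue, 2.0f) - 1.0f));
	float m = val - c;                           // added to every channel at the end
	float rr = m, gg = m, bb = m;
	switch ((int)hue) {
	case 0: rr += c; gg += x; break;
	case 1: rr += x; gg += c; break;
	case 2: gg += c; bb += x; break;
	case 3: gg += x; bb += c; break;
	case 4: rr += x; bb += c; break;
	default: rr += c; bb += x; break;
	}
	return ((uint32_t)(rr * 255) << 16) | ((uint32_t)(gg * 255) << 8) | (uint32_t)(bb * 255);
}

int main() {
	std::printf("0x%06x\n", tintPixel(0, 0, 255, 100, 100, 100)); // blue tint on grey
	return 0;
}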
Commit: a22396163f0e3b6fddf3e3843bbc5c7d8507c844
    https://github.com/scummvm/scummvm/commit/a22396163f0e3b6fddf3e3843bbc5c7d8507c844
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: Created test code for blending modes

Changed paths:
  A engines/ags/lib/NEON_2_SSE.h
    engines/ags/ags.h
    engines/ags/engine/main/engine.cpp
    engines/ags/globals.h
    engines/ags/lib/allegro/surface.cpp
    engines/ags/lib/allegro/surface.h
    engines/ags/tests/test_all.cpp
    engines/ags/tests/test_file.cpp
    engines/ags/tests/test_gfx.cpp
    engines/ags/tests/test_inifile.cpp
    engines/ags/tests/test_math.cpp
    engines/ags/tests/test_memory.cpp
    engines/ags/tests/test_sprintf.cpp
    engines/ags/tests/test_string.cpp
    engines/ags/tests/test_version.cpp


diff --git a/engines/ags/ags.h b/engines/ags/ags.h
index bd786b958a7..155eb087a0d 100644
--- a/engines/ags/ags.h
+++ b/engines/ags/ags.h
@@ -37,6 +37,9 @@
 #include "ags/shared/gfx/bitmap.h"
 #include "ags/lib/allegro/system.h"
 
+// DEBUG: @eklipsed TAKE OUT!!!
+//#define ENABLE_AGS_TESTS 1
+
 namespace AGS3 {
 class Globals;
 }
diff --git a/engines/ags/engine/main/engine.cpp b/engines/ags/engine/main/engine.cpp
index d741e741190..11c996fdcf5 100644
--- a/engines/ags/engine/main/engine.cpp
+++ b/engines/ags/engine/main/engine.cpp
@@ -23,6 +23,7 @@
 // Engine initialization
 //
 
+#include "ags/lib/allegro/color.h"
 #include "ags/lib/std/chrono.h"
 #include "ags/shared/core/platform.h"
 #include "ags/lib/allegro.h" // allegro_install and _exit
@@ -803,68 +804,6 @@ void engine_prepare_to_start_game() {
 	}
 }
 
-// TODO: move to test unit
-Bitmap *test_allegro_bitmap;
-IDriverDependantBitmap *test_allegro_ddb;
-void allegro_bitmap_test_init() {
-	test_allegro_bitmap = nullptr;
-	// Switched the test off for now
-	//test_allegro_bitmap = AllegroBitmap::CreateBitmap(320,200,32);
-
-	//Bitmap *testgfx = BitmapHelper::CreateRawBitmapOwner(load_bmp("benchgfx32.bmp", nullptr));
-	//Bitmap *destgfx = BitmapHelper::CreateBitmap(testgfx->GetWidth(), testgfx->GetHeight(), testgfx->GetBPP()*8);
-	//set_blender_mode(kTintBlenderMode, 0, 0, 255, 255);
-	//destgfx->LitBlendBlt(testgfx, 0, 0, 100);
-	//destgfx->SaveToFile("tint_result.bmp", nullptr);
-
-	return; // Normal benchmark below
-
-	PALETTE gfx8pal;
-	Bitmap *benchgfx32 = BitmapHelper::CreateRawBitmapOwner(load_bmp("benchgfx32.bmp", nullptr));
-	Bitmap *benchgfx16 = BitmapHelper::CreateBitmapCopy(benchgfx32, 16);
-	Bitmap *benchgfx8 = BitmapHelper::CreateRawBitmapOwner(load_bmp("benchgfx8.bmp", gfx8pal));
-	Bitmap *dest32 = BitmapHelper::CreateBitmap(100, 100, 32);
-	Bitmap *dest16 = BitmapHelper::CreateBitmap(100, 100, 16);
-	Bitmap *dest8 = BitmapHelper::CreateBitmap(100, 100, 8);
-	Debug::Printf(kDbgMsg_Info, "%d %d %d %d %d %d", benchgfx32, benchgfx16, benchgfx8, dest32, dest16, dest8);
-	int benchRuns[] = {1000, 10000, 100000};
-	int blenderModes[] = {kRgbToRgbBlender, kSourceAlphaBlender, kArgbToArgbBlender, kOpaqueBlenderMode, kTintLightBlenderMode};
-	const char *modeNames[] = {"RGB to RGB", "Source Alpha", "ARGB to ARGB", "Opaque", "Tint with Light"};
-	Bitmap *destinations[] = {dest32, dest16, dest8};
-	Bitmap *graphics[] = {benchgfx32, benchgfx16, benchgfx8};
-	int bpps[] = {32, 16, 8};
-	for (int dest = 0; dest < 3; dest++) {
-		for (int gfx = 0; gfx < 3; gfx++) {
-			if (dest == 2 && gfx != 2) continue;
-			for (int mode = 0; mode < sizeof(blenderModes) / sizeof(int); mode++) {
-				for (int runs = 0; runs < sizeof(benchRuns)/sizeof(int); runs++) {
-					uint32 start, end;
-					_G(_blender_mode) = (AGS3::BlenderMode)blenderModes[mode];
-					if (runs == 2) Debug::Printf(kDbgMsg_Info, "Dest: %d bpp, Gfx: %d bpp, Blender: %s, Stretched: false, Iters: %d", bpps[dest], bpps[gfx], modeNames[mode], benchRuns[runs]);
-					start = std::chrono::high_resolution_clock::now();
-					for (int i = 0; i < benchRuns[runs]; i++)
-						destinations[dest]->Blit(graphics[gfx], 0, 0, kBitmap_Transparency);
-					end = std::chrono::high_resolution_clock::now();
-					if (runs == 2) Debug::Printf(kDbgMsg_Info, "exec time (mills): %u\n", end - start);
-					if (runs == 2) Debug::Printf(kDbgMsg_Info, "Dest: %d bpp, Gfx: %d bpp, Blender: %s, Stretched: true, Iters: %d", bpps[dest], bpps[gfx], modeNames[mode], benchRuns[runs]);
-					start = std::chrono::high_resolution_clock::now();
-					for (int i = 0; i < benchRuns[runs]; i++)
-						destinations[dest]->StretchBlt(graphics[gfx], Rect(0, 0, 99, 99), kBitmap_Transparency);
-					end = std::chrono::high_resolution_clock::now();
-					if (runs == 2) Debug::Printf(kDbgMsg_Info, "exec time (mills): %u\n", end - start);
-				}
-			}
-		}
-	}
-	
-	delete benchgfx32;
-	delete benchgfx16;
-	delete benchgfx8;
-	delete dest32;
-	delete dest16;
-	delete dest8;
-}
-
 // Define location of the game data either using direct settings or searching
 // for the available resource packs in common locations.
 // Returns two paths:
@@ -1241,8 +1180,6 @@ int initialize_engine(const ConfigTree &startup_opts) {
 
 	engine_prepare_to_start_game();
 
-	allegro_bitmap_test_init();
-
 	initialize_start_and_play_game(_G(override_start_room), _G(loadSaveGameOnStartup));
 
 	return EXIT_NORMAL;
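The benchmark removed from allegro_bitmap_test_init above simply times a large number of Blit/StretchBlt calls per blender mode through the engine's chrono wrapper; the commit message and the new engines/ags/tests/test_gfx.cpp entry under "Changed paths" suggest those measurements now live in the test code. A standalone sketch of the same timing pattern, using standard <chrono> (the engine goes through its ags/lib/std/chrono.h shim) and a placeholder drawOnce() in place of the blit call:

// Run a drawing routine N times and report elapsed milliseconds.
// drawOnce() is a hypothetical stand-in for Blit()/StretchBlt().
#include <chrono>
#include <cstdio>

static void drawOnce() {
	volatile int sink = 0;       // placeholder work so the loop isn't optimized away
	for (int i = 0; i < 1000; ++i)
		sink += i;
}

int main() {
	const int iters = 100000;
	auto start = std::chrono::high_resolution_clock::now();
	for (int i = 0; i < iters; ++i)
		drawOnce();
	auto end = std::chrono::high_resolution_clock::now();
	auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
	std::printf("exec time (mills): %lld\n", (long long)ms);
	return 0;
}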
diff --git a/engines/ags/globals.h b/engines/ags/globals.h
index c0b9979f03e..8d9c53bb4d8 100644
--- a/engines/ags/globals.h
+++ b/engines/ags/globals.h
@@ -221,6 +221,7 @@ public:
 	int _trans_blend_green = 0;
 	int _trans_blend_blue = 0;
 	BlenderMode __blender_mode = kRgbToRgbBlender;
+	bool __bitmap_simd_optimizations = true;
 	/* current format information and worker routines */
 	int _utype = U_UTF8;
 
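The new NEON_2_SSE.h added below is Intel's drop-in header that expresses ARM NEON intrinsics through SSE/SSSE3/SSE4 equivalents, presumably so the NEON-style blending code in surface.cpp can also be compiled for x86. A small sketch of the kind of mapping it provides, using a hypothetical my_vaddq_u32 so as not to collide with the header's own definition:

// One of the simplest mappings: a 128-bit NEON vector add becomes an SSE2
// lane-wise 32-bit add. The real header covers the full arm_neon.h surface,
// including the awkward 64-bit "d register" types.
#include <emmintrin.h> // SSE2
#include <cstdint>
#include <cstdio>

typedef __m128i uint32x4_t; // as in the header: Q registers map to __m128i

static inline uint32x4_t my_vaddq_u32(uint32x4_t a, uint32x4_t b) {
	return _mm_add_epi32(a, b); // same result as VADD.I32 q0,q0,q0
}

int main() {
	uint32_t va[4] = {1, 2, 3, 4}, vb[4] = {10, 20, 30, 40}, out[4];
	uint32x4_t r = my_vaddq_u32(_mm_loadu_si128((const __m128i *)va),
	                            _mm_loadu_si128((const __m128i *)vb));
	_mm_storeu_si128((__m128i *)out, r);
	std::printf("%u %u %u %u\n", (unsigned)out[0], (unsigned)out[1], (unsigned)out[2], (unsigned)out[3]);
	return 0;
}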
diff --git a/engines/ags/lib/NEON_2_SSE.h b/engines/ags/lib/NEON_2_SSE.h
new file mode 100644
index 00000000000..d1a789046c4
--- /dev/null
+++ b/engines/ags/lib/NEON_2_SSE.h
@@ -0,0 +1,16872 @@
+//created by Victoria Zhislina, the Senior Application Engineer, Intel Corporation,  victoria.zhislina at intel.com
+
+//*** Copyright (C) 2012-2022 Intel Corporation.  All rights reserved.
+
+//IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+
+//By downloading, copying, installing or using the software you agree to this license.
+//If you do not agree to this license, do not download, install, copy or use the software.
+
+//                              License Agreement
+//Redistribution and use in source and binary forms, with or without modification,
+//are permitted provided that the following conditions are met:
+
+//  * Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimer.
+
+//  * The name of the copyright holders may not be used to endorse or promote products
+//    derived from this software without specific prior written permission.
+
+//This software is provided by the copyright holders and contributors "as is" and
+//any express or implied warranties, including, but not limited to, the implied
+//warranties of merchantability and fitness for a particular purpose are disclaimed.
+//In no event shall the Intel Corporation or contributors be liable for any direct,
+//indirect, incidental, special, exemplary, or consequential damages
+//(including, but not limited to, procurement of substitute goods or services;
+//loss of use, data, or profits; or business interruption) however caused
+//and on any theory of liability, whether in contract, strict liability,
+//or tort (including negligence or otherwise) arising in any way out of
+//the use of this software, even if advised of the possibility of such damage.
+
+//*****************************************************************************************
+// This file is intended to simplify ARM->IA32 porting
+// It makes the correspondence between ARM NEON intrinsics (as defined in "arm_neon.h")
+// and x86 SSE(up to SSE4.2) intrinsic functions as defined in headers files below
+//MMX instruction set is not used due to non-availability on x64 systems,
+//performance overhead and the necessity to use the EMMS instruction (_mm_empty()) for mmx-x87 floating point switching
+//*****************************************************************************************
+
+//!!!!!!!!!!!!!!  To use this file just include it in your project that uses ARM NEON intrinsics instead of "arm_neon.h" and compile it as usual
+//!!!!!!!!!!!!!!  but please pay attention to #define USE_SSE4 below - you might need to define it manually for the newest Intel Atom or any Intel Core platforms for greater performance.
+
+#ifndef NEON2SSE_H
+#define NEON2SSE_H
+
+/*********************************************************************************************************************/
+//!!!!!!!!!!!!!!
+//if USE_SSE4 is defined, some functions use SSE4 instructions instead of earlier SSE versions, when undefined - SIMD up to SSSE3 are used
+//For older devices without SSE4 support it should be undefined; for newer devices it should be defined, probably manually if your compiler doesn't set the __SSE4_2__ predefine
+#ifndef USE_SSE4
+#   if defined(__SSE4_2__)
+#       define USE_SSE4
+#   endif
+#endif
+/*********************************************************************************************************************/
+
+#include <xmmintrin.h>     //SSE
+#include <emmintrin.h>     //SSE2
+#include <pmmintrin.h>     //SSE3
+#include <tmmintrin.h>     //SSSE3
+#ifdef USE_SSE4
+#   include <smmintrin.h> //SSE4.1
+#   include <nmmintrin.h> //SSE4.2
+#endif
+
+#include <math.h>
+
+//***************  functions and data attributes, compiler dependent  *********************************
+//***********************************************************************************
+#ifdef __GNUC__
+#   define _GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+#   define _NEON2SSESTORAGE static
+#   define _NEON2SSE_ALIGN_16  __attribute__((aligned(16)))
+#   ifdef __clang__
+#       define _NEON2SSE_INLINE _NEON2SSESTORAGE inline __attribute__((__gnu_inline__, __always_inline__))
+#   else
+#       define _NEON2SSE_INLINE _NEON2SSESTORAGE inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+#   endif
+#   ifndef NEON2SSE_DISABLE_PERFORMANCE_WARNING
+#       if _GCC_VERSION <  40500
+#           define _NEON2SSE_PERFORMANCE_WARNING(function, explanation)   __attribute__((deprecated)) function
+#       else
+#           define _NEON2SSE_PERFORMANCE_WARNING(function, explanation)   __attribute__((deprecated(explanation))) function
+#       endif
+#   else
+#       define _NEON2SSE_PERFORMANCE_WARNING(function, explanation)  function
+#   endif
+#   if defined(__x86_64__)
+#       define _NEON2SSE_64BIT  __x86_64__
+#   endif
+#else
+#   define _NEON2SSESTORAGE static
+#   define _NEON2SSE_ALIGN_16  __declspec(align(16))
+#   define _NEON2SSE_INLINE _NEON2SSESTORAGE __inline
+#   if (defined(_MSC_VER) || defined (__INTEL_COMPILER)) && !defined(NEON2SSE_DISABLE_PERFORMANCE_WARNING)
+#       define _NEON2SSE_PERFORMANCE_WARNING(function, EXPLANATION) __declspec(deprecated(EXPLANATION)) function
+#       if defined(_M_X64)
+#           define _NEON2SSE_64BIT  _M_X64
+#       endif
+#   else
+#       define _NEON2SSE_PERFORMANCE_WARNING(function, explanation)  function
+#   endif
+#endif
+
+/* Used to mark the intrinsics that are declared as functions, but implemented as macros */
+#define _NEON2SSE_GLOBAL
+
+#if defined  (_NEON2SSE_64BIT) && defined (USE_SSE4)
+#   define _NEON2SSE_64BIT_SSE4
+#endif
+
+#ifndef UNREFERENCED_PARAMETER
+#   define UNREFERENCED_PARAMETER(P) ((void)(P))
+#endif
+
+/*********************************************************************************************************************/
+//    data types conversion
+/*********************************************************************************************************************/
+#if defined(_MSC_VER) && (_MSC_VER < 1300)
+    typedef signed char int8_t;
+    typedef unsigned char uint8_t;
+    typedef signed short int16_t;
+    typedef unsigned short uint16_t;
+    typedef signed int int32_t;
+    typedef unsigned int uint32_t;
+    typedef signed long long int64_t;
+    typedef unsigned long long uint64_t;
+#elif defined(_MSC_VER)
+    typedef signed __int8 int8_t;
+    typedef unsigned __int8 uint8_t;
+    typedef signed __int16 int16_t;
+    typedef unsigned __int16 uint16_t;
+    typedef signed __int32 int32_t;
+    typedef unsigned __int32 uint32_t;
+
+    typedef signed long long int64_t;
+    typedef unsigned long long uint64_t;
+#else
+#   include <stdint.h>
+#   include <limits.h>
+#endif
+
+
+typedef   float float32_t;
+#if !defined(__clang__)
+typedef   float __fp16;
+#endif
+
+typedef   double float64_t;
+
+typedef union   __m64_128 {
+    uint64_t m64_u64[1];
+    int64_t m64_i64[1];
+    float64_t m64_d64[1];
+    uint32_t m64_u32[2];
+    int32_t m64_i32[2];
+    float32_t m64_f32[2];
+    int16_t m64_i16[4];
+    uint16_t m64_u16[4];
+    int8_t m64_i8[8];
+    uint8_t m64_u8[8];
+} __m64_128;
+
+typedef __m64_128 int8x8_t;
+typedef __m64_128 uint8x8_t;
+typedef __m64_128 int16x4_t;
+typedef __m64_128 uint16x4_t;
+typedef __m64_128 int32x2_t;
+typedef __m64_128 uint32x2_t;
+typedef __m64_128 int64x1_t;
+typedef __m64_128 uint64x1_t;
+typedef __m64_128 poly8x8_t;
+typedef __m64_128 poly16x4_t;
+
+typedef __m64_128 float32x2_t;
+typedef __m128 float32x4_t;
+
+typedef __m128 float16x4_t; //not supported by IA, for compatibility
+typedef __m128 float16x8_t; //not supported by IA, for compatibility
+
+typedef __m64_128 float64x1_t;
+typedef __m128d float64x2_t;
+
+typedef __m128i int8x16_t;
+typedef __m128i int16x8_t;
+typedef __m128i int32x4_t;
+typedef __m128i int64x2_t;
+typedef __m128i uint8x16_t;
+typedef __m128i uint16x8_t;
+typedef __m128i uint32x4_t;
+typedef __m128i uint64x2_t;
+typedef __m128i poly8x16_t;
+typedef __m128i poly16x8_t;
+
+#if defined(_MSC_VER)
+#   define SINT_MIN     (-2147483647 - 1) /* min signed int value */
+#   define SINT_MAX       2147483647 /* max signed int value */
+#else
+#   define SINT_MIN     INT_MIN /* min signed int value */
+#   define SINT_MAX     INT_MAX /* max signed int value */
+#endif
+
+typedef  uint8_t poly8_t;
+typedef  uint16_t poly16_t;
+
+
+//MSVC compilers (tested up to the VS 2012 version) don't allow using structures or arrays of __m128x type as function arguments, resulting in
+//error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned.  To avoid it we need the special trick for functions that use these types
+struct int8x16x2_t {
+    int8x16_t val[2];
+};
+struct int16x8x2_t {
+    int16x8_t val[2];
+};
+struct int32x4x2_t {
+    int32x4_t val[2];
+};
+struct int64x2x2_t {
+    int64x2_t val[2];
+};
+//Unfortunately we are unable to merge two 64-bit values into one 128-bit register because the user should be able to access val[n] members explicitly!!!
+struct int8x8x2_t {
+    int8x8_t val[2];
+};
+struct int16x4x2_t {
+    int16x4_t val[2];
+};
+struct int32x2x2_t {
+    int32x2_t val[2];
+};
+struct int64x1x2_t {
+    int64x1_t val[2];
+};
+
+typedef struct int8x16x2_t int8x16x2_t; //for C compilers to make them happy
+typedef struct int16x8x2_t int16x8x2_t; //for C compilers to make them happy
+typedef struct int32x4x2_t int32x4x2_t; //for C compilers to make them happy
+typedef struct int64x2x2_t int64x2x2_t; //for C compilers to make them happy
+
+typedef struct int8x8x2_t int8x8x2_t; //for C compilers to make them happy
+typedef struct int16x4x2_t int16x4x2_t; //for C compilers to make them happy
+typedef struct int32x2x2_t int32x2x2_t; //for C compilers to make them happy
+typedef struct int64x1x2_t int64x1x2_t; //for C compilers to make them happy
+
+/* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers structures above */
+typedef struct int8x16x2_t uint8x16x2_t;
+typedef struct int16x8x2_t uint16x8x2_t;
+typedef struct int32x4x2_t uint32x4x2_t;
+typedef struct int64x2x2_t uint64x2x2_t;
+typedef struct int8x16x2_t poly8x16x2_t;
+typedef struct int16x8x2_t poly16x8x2_t;
+
+typedef struct int8x8x2_t uint8x8x2_t;
+typedef struct int16x4x2_t uint16x4x2_t;
+typedef struct int32x2x2_t uint32x2x2_t;
+typedef struct int64x1x2_t uint64x1x2_t;
+typedef struct int8x8x2_t poly8x8x2_t;
+typedef struct int16x4x2_t poly16x4x2_t;
+
+//float
+struct float32x4x2_t {
+    float32x4_t val[2];
+};
+struct float16x8x2_t {
+    float16x8_t val[2];
+};
+struct float32x2x2_t {
+    float32x2_t val[2];
+};
+
+typedef struct float32x4x2_t float32x4x2_t; //for C compilers to make them happy
+typedef struct float16x8x2_t float16x8x2_t; //for C compilers to make them happy
+typedef struct  float32x2x2_t float32x2x2_t; //for C compilers to make them happy
+typedef  float16x8x2_t float16x4x2_t;
+
+//4
+struct int8x16x4_t {
+    int8x16_t val[4];
+};
+struct int16x8x4_t {
+    int16x8_t val[4];
+};
+struct int32x4x4_t {
+    int32x4_t val[4];
+};
+struct int64x2x4_t {
+    int64x2_t val[4];
+};
+
+struct int8x8x4_t {
+    int8x8_t val[4];
+};
+struct int16x4x4_t {
+    int16x4_t val[4];
+};
+struct int32x2x4_t {
+    int32x2_t val[4];
+};
+struct int64x1x4_t {
+    int64x1_t val[4];
+};
+
+typedef struct int8x16x4_t int8x16x4_t; //for C compilers to make them happy
+typedef struct int16x8x4_t int16x8x4_t; //for C compilers to make them happy
+typedef struct int32x4x4_t int32x4x4_t; //for C compilers to make them happy
+typedef struct int64x2x4_t int64x2x4_t; //for C compilers to make them happy
+
+typedef struct int8x8x4_t int8x8x4_t; //for C compilers to make them happy
+typedef struct int16x4x4_t int16x4x4_t; //for C compilers to make them happy
+typedef struct int32x2x4_t int32x2x4_t; //for C compilers to make them happy
+typedef struct int64x1x4_t int64x1x4_t; //for C compilers to make them happy
+
+/* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers dealing structures above:*/
+typedef struct int8x8x4_t uint8x8x4_t;
+typedef struct int16x4x4_t uint16x4x4_t;
+typedef struct int32x2x4_t uint32x2x4_t;
+typedef struct int64x1x4_t uint64x1x4_t;
+typedef struct int8x8x4_t poly8x8x4_t;
+typedef struct int16x4x4_t poly16x4x4_t;
+
+typedef struct int8x16x4_t uint8x16x4_t;
+typedef struct int16x8x4_t uint16x8x4_t;
+typedef struct int32x4x4_t uint32x4x4_t;
+typedef struct int64x2x4_t uint64x2x4_t;
+typedef struct int8x16x4_t poly8x16x4_t;
+typedef struct int16x8x4_t poly16x8x4_t;
+
+struct float32x4x4_t {
+    float32x4_t val[4];
+};
+struct float16x8x4_t {
+    float16x8_t val[4];
+};
+struct float32x2x4_t {
+    float32x2_t val[4];
+};
+
+typedef struct float32x4x4_t float32x4x4_t; //for C compilers to make them happy
+typedef struct float16x8x4_t float16x8x4_t; //for C compilers to make them happy
+typedef struct  float32x2x4_t float32x2x4_t; //for C compilers to make them happy
+typedef  float16x8x4_t float16x4x4_t;
+
+//3
+struct int16x8x3_t {
+    int16x8_t val[3];
+};
+struct int32x4x3_t {
+    int32x4_t val[3];
+};
+struct int64x2x3_t {
+    int64x2_t val[3];
+};
+struct int8x16x3_t {
+    int8x16_t val[3];
+};
+
+struct int16x4x3_t {
+    int16x4_t val[3];
+};
+struct int32x2x3_t {
+    int32x2_t val[3];
+};
+struct int64x1x3_t {
+    int64x1_t val[3];
+};
+struct int8x8x3_t {
+    int8x8_t val[3];
+};
+typedef struct int16x8x3_t int16x8x3_t; //for C compilers to make them happy
+typedef struct int32x4x3_t int32x4x3_t; //for C compilers to make them happy
+typedef struct int64x2x3_t int64x2x3_t; //for C compilers to make them happy
+typedef struct int8x16x3_t int8x16x3_t; //for C compilers to make them happy
+
+typedef struct int8x8x3_t int8x8x3_t; //for C compilers to make them happy
+typedef struct int16x4x3_t int16x4x3_t; //for C compilers to make them happy
+typedef struct int32x2x3_t int32x2x3_t; //for C compilers to make them happy
+typedef struct int64x1x3_t int64x1x3_t; //for C compilers to make them happy
+
+
+/* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers dealing structures above:*/
+typedef struct int8x16x3_t uint8x16x3_t;
+typedef struct int16x8x3_t uint16x8x3_t;
+typedef struct int32x4x3_t uint32x4x3_t;
+typedef struct int64x2x3_t uint64x2x3_t;
+typedef struct int8x16x3_t poly8x16x3_t;
+typedef struct int16x8x3_t poly16x8x3_t;
+typedef struct  int8x8x3_t uint8x8x3_t;
+typedef struct  int16x4x3_t uint16x4x3_t;
+typedef struct  int32x2x3_t uint32x2x3_t;
+typedef struct  int64x1x3_t uint64x1x3_t;
+typedef struct  int8x8x3_t poly8x8x3_t;
+typedef struct  int16x4x3_t poly16x4x3_t;
+
+//float
+struct float32x4x3_t {
+    float32x4_t val[3];
+};
+struct float32x2x3_t {
+    float32x2_t val[3];
+};
+struct float16x8x3_t {
+    float16x8_t val[3];
+};
+
+typedef struct float32x4x3_t float32x4x3_t; //for C compilers to make them happy
+typedef struct float16x8x3_t float16x8x3_t; //for C compilers to make them happy
+typedef struct float32x2x3_t float32x2x3_t; //for C compilers to make them happy
+typedef  float16x8x3_t float16x4x3_t;
+
+
+//****************************************************************************
+//****** Porting auxiliary macros ********************************************
+
+//** floating point related macros **
+#define _M128i(a) _mm_castps_si128(a)
+#define _M128(a) _mm_castsi128_ps(a)
+//here the most performance effective implementation is compiler and 32/64 bits build dependent
+#if defined (_NEON2SSE_64BIT) || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER  >= 1500) )
+#   define _pM128i(a) _mm_cvtsi64_si128(*(int64_t*)(&(a)))
+#   define _M64(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (inp);
+#   define _M64f(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (_M128i(inp));
+#else
+   //for 32bit gcc and Microsoft compilers builds
+#   define _pM128i(a) _mm_loadl_epi64((__m128i*)&(a))
+#   define _M64(out, inp)  _mm_storel_epi64 ((__m128i*)&(out), inp)
+#   define _M64f(out, inp)  _mm_storel_epi64 ((__m128i*)&(out), _M128i(inp))
+#endif
+#define _pM128(a) _mm_castsi128_ps(_pM128i(a))
+
+#define return64(a)  _M64(res64,a); return res64;
+#define return64f(a)  _M64f(res64,a); return res64;
+
+#define _Ui64(a) (*(uint64_t*)&(a))
+#define _UNSIGNED_T(a) u ## a
+
+#define _SIGNBIT64 ((uint64_t)1 << 63)
+#define _SWAP_HI_LOW32  (2 | (3 << 2) | (0 << 4) | (1 << 6))
+#define _INSERTPS_NDX(srcField, dstField) (((srcField) << 6) | ((dstField) << 4) )
+
+#define  _NEON2SSE_REASON_SLOW_SERIAL "The function may be very slow due to the serial implementation, please try to avoid it"
+#define  _NEON2SSE_REASON_SLOW_UNEFFECTIVE "The function may be slow due to inefficient x86 SIMD implementation, please try to avoid it"
+
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#define __constrange(min,max)  const
+#define __transfersize(size)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+//&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& mask constants used in porting &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
+_NEON2SSE_ALIGN_16 static const int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7,  9, 11, 13, 15 };
+_NEON2SSE_ALIGN_16 static const int8_t mask8_32_even_odd[16] = { 0, 1, 4, 5, 8,  9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 };
+//&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
+
+//*************************************************************************
+//*************************************************************************
+//*********  Functions declarations as declared in original arm_neon.h *****
+//*************************************************************************
+//Vector add: vadd -> Vr[i]:=Va[i]+Vb[i], Vr, Va, Vb have equal lane sizes.
+_NEON2SSESTORAGE int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0
+_NEON2SSESTORAGE int64x1_t vadd_s64(int64x1_t a, int64x1_t b); // VADD.I64 d0,d0,d0
+_NEON2SSESTORAGE float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0
+_NEON2SSE_GLOBAL uint8x8_t vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0
+_NEON2SSE_GLOBAL uint16x4_t vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0
+_NEON2SSE_GLOBAL uint32x2_t vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0
+_NEON2SSESTORAGE uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b); // VADD.I64 d0,d0,d0
+_NEON2SSE_GLOBAL int8x16_t vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0
+_NEON2SSE_GLOBAL int16x8_t vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0
+_NEON2SSE_GLOBAL int32x4_t vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0
+_NEON2SSE_GLOBAL int64x2_t vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0
+_NEON2SSE_GLOBAL float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0
+_NEON2SSE_GLOBAL uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0
+_NEON2SSE_GLOBAL uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0
+_NEON2SSE_GLOBAL uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0
+_NEON2SSE_GLOBAL uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0
+//Vector long add: vaddl -> Vr[i]:=Va[i]+Vb[i], Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
+_NEON2SSESTORAGE int16x8_t vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0
+_NEON2SSESTORAGE int32x4_t vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0
+_NEON2SSESTORAGE int64x2_t vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0
+_NEON2SSESTORAGE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0
+_NEON2SSESTORAGE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.U16 q0,d0,d0
+_NEON2SSESTORAGE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0
+//Vector wide addw: vadd -> Vr[i]:=Va[i]+Vb[i]
+_NEON2SSESTORAGE int16x8_t vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0
+_NEON2SSESTORAGE int32x4_t vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0
+_NEON2SSESTORAGE int64x2_t vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0
+_NEON2SSESTORAGE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0
+_NEON2SSESTORAGE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.U16 q0,q0,d0
+_NEON2SSESTORAGE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0
+//Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1
+_NEON2SSESTORAGE int8x8_t vhadd_s8(int8x8_t a, int8x8_t b); // VHADD.S8 d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vhadd_s16(int16x4_t a, int16x4_t b); // VHADD.S16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vhadd_s32(int32x2_t a, int32x2_t b); // VHADD.S32 d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b); // VHADD.U8 d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b); // VHADD.U16 d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b); // VHADD.U32 d0,d0,d0
+_NEON2SSESTORAGE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0
+_NEON2SSESTORAGE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S16 q0,q0,q0
+_NEON2SSESTORAGE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0
+_NEON2SSESTORAGE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0
+_NEON2SSESTORAGE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.U16 q0,q0,q0
+_NEON2SSESTORAGE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0
+//Vector rounding halving add: vrhadd -> Vr[i]:=(Va[i]+Vb[i]+1)>>1
+_NEON2SSESTORAGE int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b); // VRHADD.S8 d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b); // VRHADD.S16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b); // VRHADD.S32 d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.U16 d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b); // VRHADD.U32 d0,d0,d0
+_NEON2SSESTORAGE int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0
+_NEON2SSESTORAGE int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0
+_NEON2SSESTORAGE int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0
+_NEON2SSE_GLOBAL uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0
+_NEON2SSE_GLOBAL uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.U16 q0,q0,q0
+_NEON2SSESTORAGE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0
+//Vector saturating add: vqadd -> Vr[i]:=sat<size>(Va[i]+Vb[i])
+_NEON2SSESTORAGE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vqadd_s32(int32x2_t a, int32x2_t b); // VQADD.S32 d0,d0,d0
+_NEON2SSESTORAGE int64x1_t vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.U16 d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b); // VQADD.U32 d0,d0,d0
+_NEON2SSESTORAGE uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0
+_NEON2SSE_GLOBAL int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0
+_NEON2SSE_GLOBAL int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0
+_NEON2SSESTORAGE int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0
+_NEON2SSESTORAGE int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0
+_NEON2SSE_GLOBAL uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0
+_NEON2SSE_GLOBAL uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.U16 q0,q0,q0
+_NEON2SSESTORAGE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0
+_NEON2SSESTORAGE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0
+//Vector add high half: vaddhn-> Vr[i]:=Va[i]+Vb[i]
+_NEON2SSESTORAGE int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0
+_NEON2SSESTORAGE int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0
+_NEON2SSESTORAGE int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0
+_NEON2SSESTORAGE uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0
+_NEON2SSESTORAGE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0
+_NEON2SSE_GLOBAL uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0
+//Vector rounding add high half: vraddhn
+_NEON2SSESTORAGE int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0
+_NEON2SSESTORAGE int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0
+_NEON2SSESTORAGE int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0
+_NEON2SSESTORAGE uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0
+_NEON2SSESTORAGE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0
+_NEON2SSE_GLOBAL uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0
+//Multiplication
+//Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i]
+_NEON2SSESTORAGE int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0
+_NEON2SSE_GLOBAL int16x4_t vmul_s16(int16x4_t a, int16x4_t b); // VMUL.I16 d0,d0,d0
+_NEON2SSE_GLOBAL int32x2_t vmul_s32(int32x2_t a, int32x2_t b); // VMUL.I32 d0,d0,d0
+_NEON2SSESTORAGE float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0
+_NEON2SSESTORAGE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0
+_NEON2SSESTORAGE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0
+_NEON2SSE_GLOBAL int16x8_t vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0
+_NEON2SSE_GLOBAL int32x4_t vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0
+_NEON2SSE_GLOBAL float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0
+_NEON2SSESTORAGE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0
+_NEON2SSE_GLOBAL uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0
+_NEON2SSE_GLOBAL uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0
+_NEON2SSESTORAGE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0
+//multiply lane
+_NEON2SSESTORAGE int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c);
+_NEON2SSESTORAGE int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c);
+_NEON2SSESTORAGE float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c);
+_NEON2SSE_GLOBAL uint16x4_t vmul_lane_u16 (uint16x4_t a, uint16x4_t b, __constrange(0,3) int c);
+_NEON2SSE_GLOBAL uint32x2_t vmul_lane_u32 (uint32x2_t a, uint32x2_t b, __constrange(0,1) int c);
+_NEON2SSESTORAGE int16x8_t vmulq_lane_s16 (int16x8_t a, int16x4_t b, __constrange(0,3) int c);
+_NEON2SSESTORAGE int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c);
+_NEON2SSESTORAGE float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c);
+_NEON2SSE_GLOBAL uint16x8_t vmulq_lane_u16 (uint16x8_t a, uint16x4_t b, __constrange(0,3) int c);
+_NEON2SSE_GLOBAL uint32x4_t vmulq_lane_u32 (uint32x4_t a, uint32x2_t b, __constrange(0,1) int c);
+//Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i]
+_NEON2SSESTORAGE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0
+_NEON2SSESTORAGE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0
+_NEON2SSE_GLOBAL uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0
+_NEON2SSE_GLOBAL uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0
+_NEON2SSESTORAGE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0
+_NEON2SSESTORAGE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0
+_NEON2SSESTORAGE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0
+_NEON2SSESTORAGE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0
+_NEON2SSESTORAGE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0
+_NEON2SSE_GLOBAL uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0
+_NEON2SSE_GLOBAL uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0
+//Vector multiply accumulate long: vmlal -> Vr[i] := Va[i] + Vb[i] * Vc[i]
+_NEON2SSESTORAGE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0
+_NEON2SSESTORAGE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0
+_NEON2SSESTORAGE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0
+_NEON2SSESTORAGE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0
+_NEON2SSESTORAGE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.U16 q0,d0,d0
+_NEON2SSESTORAGE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0
+//Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i]
+_NEON2SSESTORAGE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0
+_NEON2SSESTORAGE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0
+_NEON2SSE_GLOBAL uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0
+_NEON2SSE_GLOBAL uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0
+_NEON2SSESTORAGE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0
+_NEON2SSESTORAGE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0
+_NEON2SSESTORAGE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0
+_NEON2SSESTORAGE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0
+_NEON2SSESTORAGE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0
+_NEON2SSE_GLOBAL uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0
+_NEON2SSE_GLOBAL uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0
+//Vector multiply subtract long
+_NEON2SSESTORAGE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0
+_NEON2SSESTORAGE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0
+_NEON2SSESTORAGE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0
+_NEON2SSESTORAGE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0
+_NEON2SSESTORAGE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.U16 q0,d0,d0
+_NEON2SSESTORAGE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0
+//Vector saturating doubling multiply high
+_NEON2SSESTORAGE int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b); // VQDMULH.S16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0
+_NEON2SSESTORAGE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0
+_NEON2SSESTORAGE int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0
+//Vector saturating rounding doubling multiply high
+_NEON2SSESTORAGE int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b); // VQRDMULH.S16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0
+_NEON2SSESTORAGE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0
+_NEON2SSESTORAGE int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0
+//Vector saturating doubling multiply accumulate long
+_NEON2SSESTORAGE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0
+_NEON2SSESTORAGE int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0
+//Vector saturating doubling multiply subtract long
+_NEON2SSESTORAGE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0
+_NEON2SSESTORAGE int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0
+//Vector long multiply
+_NEON2SSESTORAGE int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0
+_NEON2SSESTORAGE int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0
+_NEON2SSESTORAGE int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0
+_NEON2SSESTORAGE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0
+_NEON2SSESTORAGE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.U16 q0,d0,d0
+_NEON2SSESTORAGE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0
+_NEON2SSESTORAGE poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0
+//Vector saturating doubling long multiply
+_NEON2SSESTORAGE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0
+_NEON2SSESTORAGE int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0
+//Subtraction
+//Vector subtract
+_NEON2SSESTORAGE int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0
+_NEON2SSESTORAGE int64x1_t vsub_s64(int64x1_t a, int64x1_t b); // VSUB.I64 d0,d0,d0
+_NEON2SSESTORAGE float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0
+_NEON2SSE_GLOBAL uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0
+_NEON2SSE_GLOBAL uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0
+_NEON2SSE_GLOBAL uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0
+_NEON2SSESTORAGE uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b); // VSUB.I64 d0,d0,d0
+_NEON2SSE_GLOBAL int8x16_t vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0
+_NEON2SSE_GLOBAL int16x8_t vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0
+_NEON2SSE_GLOBAL int32x4_t vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0
+_NEON2SSE_GLOBAL int64x2_t vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0
+_NEON2SSE_GLOBAL float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0
+_NEON2SSE_GLOBAL uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0
+_NEON2SSE_GLOBAL uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0
+_NEON2SSE_GLOBAL uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0
+_NEON2SSE_GLOBAL uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0
+//Vector long subtract: vsubl -> Vr[i]:=Va[i]-Vb[i]
+_NEON2SSESTORAGE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0
+_NEON2SSESTORAGE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0
+_NEON2SSESTORAGE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0
+_NEON2SSESTORAGE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0
+_NEON2SSESTORAGE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.U16 q0,d0,d0
+_NEON2SSESTORAGE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0
+//Vector wide subtract: vsubw -> Vr[i]:=Va[i]-Vb[i]
+_NEON2SSESTORAGE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b); // VSUBW.S8 q0,q0,d0
+_NEON2SSESTORAGE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0
+_NEON2SSESTORAGE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0
+_NEON2SSESTORAGE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0
+_NEON2SSESTORAGE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.U16 q0,q0,d0
+_NEON2SSESTORAGE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0
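
To illustrate the widening subtract forms just listed, here is a minimal hypothetical sketch (the helper name row_delta and the includes are mine, not part of this patch): vsubl_u8 widens both operands to 16 bits before subtracting, so a byte-sized pixel difference is kept without 8-bit wrap-around.

    #include <arm_neon.h>   // or the NEON-to-SSE header being added here when targeting x86
    #include <stdint.h>

    static inline uint16x8_t row_delta(const uint8_t *a, const uint8_t *b) {
        uint8x8_t va = vld1_u8(a);   // 8 source pixels
        uint8x8_t vb = vld1_u8(b);   // 8 destination pixels
        return vsubl_u8(va, vb);     // widened to 16 bits, so no 8-bit wrap-around
    }
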
+//Vector saturating subtract
+_NEON2SSESTORAGE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vqsub_s32(int32x2_t a, int32x2_t b); // VQSUB.S32 d0,d0,d0
+_NEON2SSESTORAGE int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.U16 d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b); // VQSUB.U32 d0,d0,d0
+_NEON2SSESTORAGE uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0
+_NEON2SSE_GLOBAL int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0
+_NEON2SSE_GLOBAL int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0
+_NEON2SSESTORAGE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0
+_NEON2SSESTORAGE int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0
+_NEON2SSE_GLOBAL uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0
+_NEON2SSE_GLOBAL uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.U16 q0,q0,q0
+_NEON2SSESTORAGE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0
+_NEON2SSESTORAGE uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b); // VQSUB.U64 q0,q0,q0
+//Vector halving subtract
+_NEON2SSESTORAGE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vhsub_s16(int16x4_t a, int16x4_t b); // VHSUB.S16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vhsub_s32(int32x2_t a, int32x2_t b); // VHSUB.S32 d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b); // VHSUB.U8 d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b); // VHSUB.U16 d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b); // VHSUB.U32 d0,d0,d0
+_NEON2SSESTORAGE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0
+_NEON2SSESTORAGE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0
+_NEON2SSESTORAGE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0
+_NEON2SSESTORAGE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0
+_NEON2SSESTORAGE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.U16 q0,q0,q0
+_NEON2SSESTORAGE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0
+//Vector subtract high half
+_NEON2SSESTORAGE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0
+_NEON2SSESTORAGE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0
+_NEON2SSESTORAGE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0
+_NEON2SSESTORAGE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0
+_NEON2SSESTORAGE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0
+_NEON2SSE_GLOBAL uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0
+//Vector rounding subtract high half
+_NEON2SSESTORAGE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0
+_NEON2SSESTORAGE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0
+_NEON2SSESTORAGE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0
+_NEON2SSESTORAGE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0
+_NEON2SSESTORAGE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0
+_NEON2SSE_GLOBAL uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0
+//Comparison
+//Vector compare equal
+_NEON2SSESTORAGE uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0
+_NEON2SSESTORAGE uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0
+_NEON2SSESTORAGE uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0
+_NEON2SSESTORAGE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0
+_NEON2SSESTORAGE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0
+_NEON2SSESTORAGE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0
+_NEON2SSESTORAGE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0
+_NEON2SSE_GLOBAL uint8x8_t vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0
+_NEON2SSE_GLOBAL uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0
+_NEON2SSE_GLOBAL uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0
+_NEON2SSE_GLOBAL uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0
+_NEON2SSESTORAGE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0
+_NEON2SSE_GLOBAL uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0
+_NEON2SSE_GLOBAL uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0
+_NEON2SSE_GLOBAL uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0
+_NEON2SSE_GLOBAL uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0
+//Vector compare greater-than or equal
+_NEON2SSESTORAGE uint8x8_t vcge_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
+_NEON2SSESTORAGE uint16x4_t vcge_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
+_NEON2SSESTORAGE uint32x2_t vcge_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
+_NEON2SSESTORAGE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
+_NEON2SSESTORAGE uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
+_NEON2SSESTORAGE uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0
+_NEON2SSESTORAGE uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
+_NEON2SSESTORAGE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
+_NEON2SSESTORAGE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
+_NEON2SSESTORAGE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
+_NEON2SSESTORAGE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
+_NEON2SSESTORAGE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
+_NEON2SSESTORAGE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0
+_NEON2SSESTORAGE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
+//Vector compare less-than or equal
+_NEON2SSESTORAGE uint8x8_t vcle_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
+_NEON2SSESTORAGE uint16x4_t vcle_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
+_NEON2SSESTORAGE uint32x2_t vcle_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
+_NEON2SSESTORAGE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
+_NEON2SSE_GLOBAL uint8x8_t vcle_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
+_NEON2SSE_GLOBAL uint16x4_t vcle_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0
+_NEON2SSE_GLOBAL uint32x2_t vcle_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
+_NEON2SSESTORAGE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
+_NEON2SSESTORAGE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
+_NEON2SSESTORAGE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
+_NEON2SSESTORAGE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
+_NEON2SSESTORAGE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
+_NEON2SSESTORAGE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0
+_NEON2SSESTORAGE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
+//Vector compare greater-than
+_NEON2SSESTORAGE uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
+_NEON2SSESTORAGE uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
+_NEON2SSESTORAGE uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
+_NEON2SSESTORAGE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
+_NEON2SSESTORAGE uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
+_NEON2SSESTORAGE uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0
+_NEON2SSESTORAGE uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
+_NEON2SSE_GLOBAL uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
+_NEON2SSE_GLOBAL uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
+_NEON2SSE_GLOBAL uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
+_NEON2SSESTORAGE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
+_NEON2SSESTORAGE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
+_NEON2SSESTORAGE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0
+_NEON2SSESTORAGE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
+//Vector compare less-than
+_NEON2SSE_GLOBAL uint8x8_t vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
+_NEON2SSE_GLOBAL uint16x4_t vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
+_NEON2SSE_GLOBAL uint32x2_t vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
+_NEON2SSE_GLOBAL uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
+_NEON2SSE_GLOBAL uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
+_NEON2SSE_GLOBAL uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0
+_NEON2SSE_GLOBAL uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
+_NEON2SSE_GLOBAL uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
+_NEON2SSE_GLOBAL uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
+_NEON2SSE_GLOBAL uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
+_NEON2SSE_GLOBAL uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
+_NEON2SSE_GLOBAL uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
+_NEON2SSE_GLOBAL uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0
+_NEON2SSE_GLOBAL uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
+//Vector compare absolute greater-than or equal
+_NEON2SSESTORAGE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
+_NEON2SSESTORAGE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
+//Vector compare absolute less-than or equal
+_NEON2SSESTORAGE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
+_NEON2SSESTORAGE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
+//Vector compare absolute greater-than
+_NEON2SSESTORAGE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
+_NEON2SSESTORAGE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
+//Vector compare absolute less-than
+_NEON2SSESTORAGE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
+_NEON2SSESTORAGE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
+//Vector test bits
+_NEON2SSESTORAGE uint8x8_t vtst_s8(int8x8_t a, int8x8_t b); // VTST.8 d0, d0, d0
+_NEON2SSESTORAGE uint16x4_t vtst_s16(int16x4_t a, int16x4_t b); // VTST.16 d0, d0, d0
+_NEON2SSESTORAGE uint32x2_t vtst_s32(int32x2_t a, int32x2_t b); // VTST.32 d0, d0, d0
+_NEON2SSE_GLOBAL uint8x8_t vtst_u8(uint8x8_t a, uint8x8_t b); // VTST.8 d0, d0, d0
+_NEON2SSE_GLOBAL uint16x4_t vtst_u16(uint16x4_t a, uint16x4_t b); // VTST.16 d0, d0, d0
+_NEON2SSE_GLOBAL uint32x2_t vtst_u32(uint32x2_t a, uint32x2_t b); // VTST.32 d0, d0, d0
+_NEON2SSE_GLOBAL uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0
+_NEON2SSESTORAGE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0
+_NEON2SSESTORAGE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0
+_NEON2SSESTORAGE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0
+_NEON2SSE_GLOBAL uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0
+_NEON2SSE_GLOBAL uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0
+_NEON2SSE_GLOBAL uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0
+_NEON2SSE_GLOBAL uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0
+//Absolute difference
+//Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |
+_NEON2SSESTORAGE int8x8_t vabd_s8(int8x8_t a, int8x8_t b); // VABD.S8 d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vabd_s16(int16x4_t a, int16x4_t b); // VABD.S16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vabd_s32(int32x2_t a, int32x2_t b); // VABD.S32 d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b); // VABD.U8 d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b); // VABD.U16 d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b); // VABD.U32 d0,d0,d0
+_NEON2SSESTORAGE float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0
+_NEON2SSESTORAGE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0
+_NEON2SSESTORAGE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0
+_NEON2SSESTORAGE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0
+_NEON2SSESTORAGE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0
+_NEON2SSESTORAGE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.U16 q0,q0,q0
+_NEON2SSESTORAGE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0
+_NEON2SSESTORAGE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0
+//Absolute difference - long
+_NEON2SSESTORAGE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0
+_NEON2SSESTORAGE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0
+_NEON2SSESTORAGE int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0
+_NEON2SSESTORAGE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0
+_NEON2SSESTORAGE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.U16 q0,d0,d0
+_NEON2SSESTORAGE uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0
+//Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] |
+_NEON2SSESTORAGE int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VABA.U16 d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0
+_NEON2SSESTORAGE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0
+_NEON2SSESTORAGE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0
+_NEON2SSESTORAGE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0
+_NEON2SSESTORAGE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0
+_NEON2SSESTORAGE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.U16 q0,q0,q0
+_NEON2SSESTORAGE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0
+//Absolute difference and accumulate - long
+_NEON2SSESTORAGE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0
+_NEON2SSESTORAGE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0
+_NEON2SSESTORAGE int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0
+_NEON2SSESTORAGE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0
+_NEON2SSESTORAGE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.U16 q0,d0,d0
+_NEON2SSESTORAGE uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0
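
The accumulate-long variants above lend themselves to a running sum of absolute differences; another hypothetical sketch (same includes as the earlier one; sad_row8 is a made-up helper):

    static inline uint16x8_t sad_row8(uint16x8_t acc, const uint8_t *a, const uint8_t *b) {
        // acc[i] += |a[i] - b[i]|, widened to 16 bits by vabal_u8
        return vabal_u8(acc, vld1_u8(a), vld1_u8(b));
    }
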
+//Max/Min
+//vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i]
+_NEON2SSESTORAGE int8x8_t vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.U16 d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0
+_NEON2SSESTORAGE float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0
+_NEON2SSE_GLOBAL int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0
+_NEON2SSE_GLOBAL int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0
+_NEON2SSE_GLOBAL int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0
+_NEON2SSE_GLOBAL uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0
+_NEON2SSE_GLOBAL uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.U16 q0,q0,q0
+_NEON2SSE_GLOBAL uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0
+_NEON2SSE_GLOBAL float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0
+
+_NEON2SSE_GLOBAL float64x2_t vmaxq_f64(float64x2_t a, float64x2_t b); // VMAX.F64 q0,q0,q0
+
+//vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i]
+_NEON2SSESTORAGE int8x8_t vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.U16 d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0
+_NEON2SSESTORAGE float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0
+_NEON2SSE_GLOBAL int8x16_t vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0
+_NEON2SSE_GLOBAL int16x8_t vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0
+_NEON2SSE_GLOBAL int32x4_t vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0
+_NEON2SSE_GLOBAL uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0
+_NEON2SSE_GLOBAL uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.U16 q0,q0,q0
+_NEON2SSE_GLOBAL uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0
+_NEON2SSE_GLOBAL float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0
+
+_NEON2SSE_GLOBAL float64x2_t vminq_f64(float64x2_t a, float64x2_t b); // VMIN.F64 q0,q0,q0
+
+//Pairwise addition
+//Pairwise add
+_NEON2SSESTORAGE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0
+_NEON2SSESTORAGE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b); // VPADD.F32 d0,d0,d0
+//Long pairwise add
+_NEON2SSESTORAGE int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0
+_NEON2SSESTORAGE int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0
+_NEON2SSESTORAGE int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0
+_NEON2SSESTORAGE uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0
+_NEON2SSESTORAGE uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.U16 d0,d0
+_NEON2SSESTORAGE uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0
+_NEON2SSESTORAGE int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0
+_NEON2SSESTORAGE int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0
+_NEON2SSESTORAGE int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0
+_NEON2SSESTORAGE uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0
+_NEON2SSESTORAGE uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.U16 q0,q0
+_NEON2SSESTORAGE uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
+//Long pairwise add and accumulate
+_NEON2SSESTORAGE int16x4_t vpadal_s8(int16x4_t a, int8x8_t b); // VPADAL.S8 d0,d0
+_NEON2SSESTORAGE int32x2_t vpadal_s16(int32x2_t a, int16x4_t b); // VPADAL.S16 d0,d0
+_NEON2SSESTORAGE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0
+_NEON2SSESTORAGE uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b); // VPADAL.U8 d0,d0
+_NEON2SSESTORAGE uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b); // VPADAL.U16 d0,d0
+_NEON2SSESTORAGE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0
+_NEON2SSESTORAGE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0
+_NEON2SSESTORAGE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0
+_NEON2SSESTORAGE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0
+_NEON2SSESTORAGE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0
+_NEON2SSESTORAGE uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.U16 q0,q0
+_NEON2SSESTORAGE uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0
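
The long pairwise adds chain into a horizontal byte sum without overflow; a hypothetical sketch (row_sum16 is made up; vgetq_lane_u64 is a standard NEON intrinsic):

    static inline uint32_t row_sum16(const uint8_t *p) {
        uint16x8_t s16 = vpaddlq_u8(vld1q_u8(p));   // 16 x u8 -> 8 x u16 pair sums
        uint32x4_t s32 = vpaddlq_u16(s16);          // 8 x u16 -> 4 x u32
        uint64x2_t s64 = vpaddlq_u32(s32);          // 4 x u32 -> 2 x u64
        return (uint32_t)(vgetq_lane_u64(s64, 0) + vgetq_lane_u64(s64, 1));
    }
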
+//Folding maximum vpmax -> takes maximum of adjacent pairs
+_NEON2SSESTORAGE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.U16 d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0
+_NEON2SSESTORAGE float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0
+//Folding minimum vpmin -> takes minimum of adjacent pairs
+_NEON2SSESTORAGE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.U16 d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0
+_NEON2SSESTORAGE float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0
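
Folding max/min also gives a cheap horizontal reduction: applying vpmax_u8 to a vector and itself halves the number of candidates each round. A hypothetical sketch (row_max8 is made up; vget_lane_u8 is a standard NEON intrinsic):

    static inline uint8_t row_max8(uint8x8_t v) {
        v = vpmax_u8(v, v);        // 8 candidates -> 4
        v = vpmax_u8(v, v);        // 4 -> 2
        v = vpmax_u8(v, v);        // 2 -> 1; lane 0 now holds the maximum
        return vget_lane_u8(v, 0);
    }
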
+//Reciprocal/Sqrt
+_NEON2SSESTORAGE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0
+_NEON2SSESTORAGE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0
+_NEON2SSESTORAGE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0
+_NEON2SSESTORAGE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0
+//Shifts by signed variable
+//Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right)
+_NEON2SSESTORAGE int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0
+_NEON2SSESTORAGE int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.U16 d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0
+_NEON2SSESTORAGE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0
+_NEON2SSESTORAGE int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0
+_NEON2SSESTORAGE int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0
+_NEON2SSESTORAGE int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0
+_NEON2SSESTORAGE int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0
+_NEON2SSESTORAGE uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0
+_NEON2SSESTORAGE uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.U16 q0,q0,q0
+_NEON2SSESTORAGE uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0
+_NEON2SSESTORAGE uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0
+//Vector saturating shift left: (negative values shift right)
+_NEON2SSESTORAGE int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0
+_NEON2SSESTORAGE int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.U16 d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0
+_NEON2SSESTORAGE uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0
+_NEON2SSESTORAGE int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0
+_NEON2SSESTORAGE int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0
+_NEON2SSESTORAGE int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0
+_NEON2SSESTORAGE int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0
+_NEON2SSESTORAGE uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0
+_NEON2SSESTORAGE uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.U16 q0,q0,q0
+_NEON2SSESTORAGE uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0
+_NEON2SSESTORAGE uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0
+//Vector rounding shift left: (negative values shift right)
+_NEON2SSESTORAGE int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0
+_NEON2SSESTORAGE int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.U16 d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0
+_NEON2SSESTORAGE uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0
+_NEON2SSESTORAGE int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0
+_NEON2SSESTORAGE int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0
+_NEON2SSESTORAGE int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0
+_NEON2SSESTORAGE int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0
+_NEON2SSESTORAGE uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0
+_NEON2SSESTORAGE uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.U16 q0,q0,q0
+_NEON2SSESTORAGE uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0
+_NEON2SSESTORAGE uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0
+//Vector saturating rounding shift left: (negative values shift right)
+_NEON2SSESTORAGE int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0
+_NEON2SSESTORAGE int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.U16 d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0
+_NEON2SSESTORAGE uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0
+_NEON2SSESTORAGE int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0
+_NEON2SSESTORAGE int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0
+_NEON2SSESTORAGE int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0
+_NEON2SSESTORAGE int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0
+_NEON2SSESTORAGE uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0
+_NEON2SSESTORAGE uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.U16 q0,q0,q0
+_NEON2SSESTORAGE uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0
+_NEON2SSESTORAGE uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0
+//Shifts by a constant
+//Vector shift right by constant
+_NEON2SSESTORAGE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8
+_NEON2SSESTORAGE int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VSHR.S16 d0,d0,#16
+_NEON2SSESTORAGE int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VSHR.S32 d0,d0,#32
+_NEON2SSESTORAGE int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64
+_NEON2SSESTORAGE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8
+_NEON2SSESTORAGE uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VSHR.U16 d0,d0,#16
+_NEON2SSESTORAGE uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VSHR.U32 d0,d0,#32
+_NEON2SSESTORAGE uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VSHR.U64 d0,d0,#64
+_NEON2SSESTORAGE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8
+_NEON2SSE_GLOBAL int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16
+_NEON2SSE_GLOBAL int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32
+_NEON2SSESTORAGE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64
+_NEON2SSESTORAGE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8
+_NEON2SSE_GLOBAL uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.U16 q0,q0,#16
+_NEON2SSE_GLOBAL uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32
+_NEON2SSE_GLOBAL uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64
+//Vector shift left by constant
+_NEON2SSESTORAGE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
+_NEON2SSESTORAGE int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
+_NEON2SSESTORAGE int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
+_NEON2SSESTORAGE int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
+_NEON2SSESTORAGE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
+_NEON2SSE_GLOBAL uint16x4_t vshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
+_NEON2SSE_GLOBAL uint32x2_t vshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
+_NEON2SSE_GLOBAL uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
+_NEON2SSE_GLOBAL int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
+_NEON2SSE_GLOBAL int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
+_NEON2SSE_GLOBAL int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
+_NEON2SSE_GLOBAL int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
+_NEON2SSESTORAGE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
+_NEON2SSE_GLOBAL uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
+_NEON2SSE_GLOBAL uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
+_NEON2SSE_GLOBAL uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
+//Vector rounding shift right by constant
+_NEON2SSESTORAGE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8
+_NEON2SSESTORAGE int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16
+_NEON2SSESTORAGE int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32
+_NEON2SSESTORAGE int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64
+_NEON2SSESTORAGE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8
+_NEON2SSESTORAGE uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VRSHR.U16 d0,d0,#16
+_NEON2SSESTORAGE uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32
+_NEON2SSESTORAGE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64
+_NEON2SSESTORAGE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8
+_NEON2SSESTORAGE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16
+_NEON2SSESTORAGE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32
+_NEON2SSESTORAGE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64
+_NEON2SSESTORAGE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8
+_NEON2SSESTORAGE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.U16 q0,q0,#16
+_NEON2SSESTORAGE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32
+_NEON2SSESTORAGE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64
+//Vector shift right by constant and accumulate
+_NEON2SSESTORAGE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8
+_NEON2SSESTORAGE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16
+_NEON2SSESTORAGE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32
+_NEON2SSESTORAGE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64
+_NEON2SSESTORAGE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8
+_NEON2SSESTORAGE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.U16 d0,d0,#16
+_NEON2SSESTORAGE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRA.U32 d0,d0,#32
+_NEON2SSESTORAGE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64
+_NEON2SSESTORAGE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8
+_NEON2SSESTORAGE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16
+_NEON2SSESTORAGE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32
+_NEON2SSESTORAGE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64
+_NEON2SSESTORAGE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8
+_NEON2SSESTORAGE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.U16 q0,q0,#16
+_NEON2SSESTORAGE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32
+_NEON2SSESTORAGE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64
+//Vector rounding shift right by constant and accumulate
+_NEON2SSESTORAGE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8
+_NEON2SSESTORAGE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16
+_NEON2SSESTORAGE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32
+_NEON2SSESTORAGE int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64
+_NEON2SSESTORAGE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8
+_NEON2SSESTORAGE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.U16 d0,d0,#16
+_NEON2SSESTORAGE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32
+_NEON2SSESTORAGE uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64
+_NEON2SSESTORAGE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8
+_NEON2SSESTORAGE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16
+_NEON2SSESTORAGE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32
+_NEON2SSESTORAGE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64
+_NEON2SSESTORAGE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8
+_NEON2SSESTORAGE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.U16 q0,q0,#16
+_NEON2SSESTORAGE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32
+_NEON2SSESTORAGE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64
+//Vector saturating shift left by constant
+_NEON2SSESTORAGE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0
+_NEON2SSESTORAGE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0
+_NEON2SSESTORAGE int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0
+_NEON2SSESTORAGE int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0
+_NEON2SSESTORAGE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0
+_NEON2SSESTORAGE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.U16 d0,d0,#0
+_NEON2SSESTORAGE uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0
+_NEON2SSESTORAGE uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0
+_NEON2SSESTORAGE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0
+_NEON2SSESTORAGE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0
+_NEON2SSESTORAGE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0
+_NEON2SSESTORAGE int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0
+_NEON2SSESTORAGE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0
+_NEON2SSESTORAGE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.U16 q0,q0,#0
+_NEON2SSESTORAGE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0
+_NEON2SSESTORAGE uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0
+//Vector signed->unsigned saturating shift left by constant
+_NEON2SSESTORAGE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0
+_NEON2SSESTORAGE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0
+_NEON2SSESTORAGE uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0
+_NEON2SSESTORAGE uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0
+_NEON2SSESTORAGE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0
+_NEON2SSESTORAGE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0
+_NEON2SSESTORAGE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0
+_NEON2SSESTORAGE uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0
+//Vector narrowing shift right by constant
+_NEON2SSESTORAGE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
+_NEON2SSESTORAGE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
+_NEON2SSESTORAGE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
+_NEON2SSESTORAGE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
+_NEON2SSESTORAGE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
+_NEON2SSESTORAGE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
+//Vector signed->unsigned narrowing saturating shift right by constant
+_NEON2SSESTORAGE uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8
+_NEON2SSESTORAGE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16
+_NEON2SSESTORAGE uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32
+//Vector signed->unsigned rounding narrowing saturating shift right by constant
+_NEON2SSESTORAGE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8
+_NEON2SSESTORAGE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16
+_NEON2SSESTORAGE uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32
+//Vector narrowing saturating shift right by constant
+_NEON2SSESTORAGE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8
+_NEON2SSESTORAGE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16
+_NEON2SSESTORAGE int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32
+_NEON2SSESTORAGE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.U16 d0,q0,#8
+_NEON2SSESTORAGE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16
+_NEON2SSESTORAGE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32
+//Vector rounding narrowing shift right by constant
+_NEON2SSESTORAGE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
+_NEON2SSESTORAGE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
+_NEON2SSESTORAGE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
+_NEON2SSESTORAGE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
+_NEON2SSESTORAGE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
+_NEON2SSESTORAGE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
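
Together with the widening multiplies declared earlier (vmull_u8, plus the standard vmlal_u8 accumulate form), the rounding narrowing shift is the usual way to fold a blend back down to 8 bits. A hypothetical sketch using the common divide-by-256 approximation, not necessarily the exact formula this patch uses (blend8 is a made-up helper):

    static inline uint8x8_t blend8(uint8x8_t src, uint8x8_t dst, uint8x8_t alpha) {
        uint8x8_t  inv = vsub_u8(vdup_n_u8(255), alpha);   // 255 - alpha
        uint16x8_t acc = vmull_u8(src, alpha);             // src * alpha, 16-bit lanes
        acc = vmlal_u8(acc, dst, inv);                     // + dst * (255 - alpha)
        return vrshrn_n_u16(acc, 8);                       // rounded >> 8, back to 8 bits
    }
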
+//Vector rounding narrowing saturating shift right by constant
+_NEON2SSESTORAGE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8
+_NEON2SSESTORAGE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16
+_NEON2SSESTORAGE int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32
+_NEON2SSESTORAGE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.U16 d0,q0,#8
+_NEON2SSESTORAGE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16
+_NEON2SSESTORAGE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32
+//Vector widening shift left by constant
+_NEON2SSESTORAGE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0
+_NEON2SSESTORAGE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0
+_NEON2SSESTORAGE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0
+_NEON2SSESTORAGE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0
+_NEON2SSESTORAGE uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.U16 q0,d0,#0
+_NEON2SSESTORAGE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0
+//Shifts with insert
+//Vector shift right and insert
+_NEON2SSESTORAGE int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
+_NEON2SSESTORAGE int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
+_NEON2SSESTORAGE int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
+_NEON2SSESTORAGE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
+_NEON2SSE_GLOBAL uint8x8_t vsri_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
+_NEON2SSE_GLOBAL uint16x4_t vsri_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
+_NEON2SSE_GLOBAL uint32x2_t vsri_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
+_NEON2SSE_GLOBAL uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
+_NEON2SSE_GLOBAL poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
+_NEON2SSE_GLOBAL poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
+_NEON2SSESTORAGE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
+_NEON2SSESTORAGE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
+_NEON2SSESTORAGE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
+_NEON2SSESTORAGE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
+_NEON2SSE_GLOBAL uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
+_NEON2SSE_GLOBAL uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
+_NEON2SSE_GLOBAL uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
+_NEON2SSE_GLOBAL uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
+_NEON2SSE_GLOBAL poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
+_NEON2SSE_GLOBAL poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
+//Vector shift left and insert
+_NEON2SSESTORAGE int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
+_NEON2SSESTORAGE int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
+_NEON2SSESTORAGE int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
+_NEON2SSESTORAGE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
+_NEON2SSE_GLOBAL uint8x8_t vsli_n_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
+_NEON2SSE_GLOBAL uint16x4_t vsli_n_u16(uint16x4_t a, uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
+_NEON2SSE_GLOBAL uint32x2_t vsli_n_u32(uint32x2_t a, uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
+_NEON2SSE_GLOBAL uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
+_NEON2SSE_GLOBAL poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
+_NEON2SSE_GLOBAL poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
+_NEON2SSESTORAGE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
+_NEON2SSESTORAGE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
+_NEON2SSESTORAGE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
+_NEON2SSESTORAGE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
+_NEON2SSE_GLOBAL uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
+_NEON2SSE_GLOBAL uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
+_NEON2SSE_GLOBAL uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
+_NEON2SSE_GLOBAL uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
+_NEON2SSE_GLOBAL poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
+_NEON2SSE_GLOBAL poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
+//Loads of a single vector or lane. Perform loads and stores of a single vector of some type.
+//Load a single vector from memory
+_NEON2SSE_GLOBAL uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
+_NEON2SSE_GLOBAL uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
+_NEON2SSE_GLOBAL uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
+_NEON2SSE_GLOBAL uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
+_NEON2SSE_GLOBAL int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
+_NEON2SSE_GLOBAL int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
+_NEON2SSE_GLOBAL int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
+_NEON2SSE_GLOBAL int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
+_NEON2SSE_GLOBAL float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0]
+_NEON2SSESTORAGE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
+_NEON2SSE_GLOBAL poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
+_NEON2SSE_GLOBAL poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
+_NEON2SSE_GLOBAL uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0]
+_NEON2SSE_GLOBAL uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0]
+_NEON2SSE_GLOBAL uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0]
+_NEON2SSE_GLOBAL uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
+_NEON2SSE_GLOBAL int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0]
+_NEON2SSE_GLOBAL int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0]
+_NEON2SSE_GLOBAL int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0]
+_NEON2SSE_GLOBAL int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
+_NEON2SSE_GLOBAL float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0]
+_NEON2SSESTORAGE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0]
+_NEON2SSE_GLOBAL poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0]
+_NEON2SSE_GLOBAL poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0]
+
+_NEON2SSESTORAGE float64x2_t vld1q_f64(__transfersize(2) float64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
+
+//Load a single lane from memory
+_NEON2SSE_GLOBAL uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
+_NEON2SSE_GLOBAL uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
+_NEON2SSE_GLOBAL uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
+_NEON2SSE_GLOBAL uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
+_NEON2SSE_GLOBAL int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
+_NEON2SSE_GLOBAL int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0]
+_NEON2SSE_GLOBAL int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); //VLD1.32 {d0[0]}, [r0]
+_NEON2SSE_GLOBAL float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0]
+_NEON2SSESTORAGE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
+_NEON2SSE_GLOBAL int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); //VLD1.64 {d0}, [r0]
+_NEON2SSE_GLOBAL poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
+_NEON2SSE_GLOBAL poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
+_NEON2SSESTORAGE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0]
+_NEON2SSESTORAGE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
+_NEON2SSESTORAGE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0]
+_NEON2SSESTORAGE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0]
+_NEON2SSE_GLOBAL int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8{d0[0]}, [r0]
+_NEON2SSE_GLOBAL int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
+_NEON2SSE_GLOBAL int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0]
+_NEON2SSE_GLOBAL float16x4_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
+_NEON2SSESTORAGE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
+_NEON2SSE_GLOBAL int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0]
+_NEON2SSE_GLOBAL poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0]
+_NEON2SSE_GLOBAL poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
+//Load all lanes of vector with same value from memory
+_NEON2SSE_GLOBAL uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+_NEON2SSE_GLOBAL uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+_NEON2SSE_GLOBAL uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+_NEON2SSESTORAGE uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
+_NEON2SSE_GLOBAL int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+_NEON2SSE_GLOBAL int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+_NEON2SSE_GLOBAL int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+_NEON2SSE_GLOBAL int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
+_NEON2SSE_GLOBAL float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
+_NEON2SSE_GLOBAL float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+_NEON2SSE_GLOBAL poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+_NEON2SSE_GLOBAL poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+_NEON2SSESTORAGE uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+_NEON2SSESTORAGE uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+_NEON2SSESTORAGE uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+_NEON2SSESTORAGE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
+_NEON2SSE_GLOBAL int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+_NEON2SSE_GLOBAL int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+_NEON2SSE_GLOBAL int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+_NEON2SSE_GLOBAL int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
+_NEON2SSE_GLOBAL float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
+_NEON2SSESTORAGE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+_NEON2SSE_GLOBAL poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+_NEON2SSE_GLOBAL poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+//Store a single vector or lane. Stores all lanes or a single lane of a vector.
+//Store a single vector into memory
+_NEON2SSE_GLOBAL void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0]
+_NEON2SSE_GLOBAL void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0]
+_NEON2SSE_GLOBAL void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0]
+_NEON2SSE_GLOBAL void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0]
+_NEON2SSE_GLOBAL void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0]
+_NEON2SSE_GLOBAL void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0]
+_NEON2SSE_GLOBAL void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0]
+_NEON2SSE_GLOBAL void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0]
+_NEON2SSE_GLOBAL void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0]
+_NEON2SSESTORAGE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0]
+_NEON2SSE_GLOBAL void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0]
+_NEON2SSE_GLOBAL void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0]
+_NEON2SSESTORAGE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0]
+_NEON2SSESTORAGE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0]
+_NEON2SSESTORAGE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0]
+_NEON2SSESTORAGE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0]
+_NEON2SSE_GLOBAL void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0]
+_NEON2SSE_GLOBAL void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0]
+_NEON2SSE_GLOBAL void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0]
+_NEON2SSE_GLOBAL void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0]
+_NEON2SSE_GLOBAL void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0]
+_NEON2SSESTORAGE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0]
+_NEON2SSE_GLOBAL void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0]
+_NEON2SSE_GLOBAL void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0]
+//Store a lane of a vector into memory
+//Loads of an N-element structure
+//Load N-element structure from memory
+_NEON2SSESTORAGE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
+_NEON2SSESTORAGE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
+_NEON2SSESTORAGE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
+_NEON2SSE_GLOBAL int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
+_NEON2SSE_GLOBAL int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
+_NEON2SSE_GLOBAL int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
+_NEON2SSE_GLOBAL float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0]
+_NEON2SSESTORAGE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
+_NEON2SSE_GLOBAL poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
+_NEON2SSE_GLOBAL poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
+_NEON2SSESTORAGE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
+_NEON2SSESTORAGE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
+_NEON2SSESTORAGE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
+_NEON2SSESTORAGE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
+_NEON2SSE_GLOBAL int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
+_NEON2SSE_GLOBAL int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
+_NEON2SSE_GLOBAL int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
+_NEON2SSE_GLOBAL int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
+//float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0]
+_NEON2SSESTORAGE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
+_NEON2SSE_GLOBAL poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
+_NEON2SSE_GLOBAL poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
+_NEON2SSESTORAGE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
+_NEON2SSESTORAGE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
+_NEON2SSESTORAGE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
+_NEON2SSE_GLOBAL int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
+_NEON2SSE_GLOBAL int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
+_NEON2SSE_GLOBAL int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
+_NEON2SSE_GLOBAL float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
+_NEON2SSESTORAGE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
+poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
+_NEON2SSE_GLOBAL poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
+_NEON2SSESTORAGE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
+_NEON2SSESTORAGE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
+_NEON2SSESTORAGE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
+_NEON2SSESTORAGE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
+_NEON2SSE_GLOBAL int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
+_NEON2SSE_GLOBAL int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
+_NEON2SSE_GLOBAL int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
+_NEON2SSE_GLOBAL int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
+_NEON2SSE_GLOBAL float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
+_NEON2SSESTORAGE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
+_NEON2SSE_GLOBAL poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
+_NEON2SSE_GLOBAL poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
+_NEON2SSESTORAGE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
+_NEON2SSESTORAGE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
+_NEON2SSESTORAGE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
+_NEON2SSE_GLOBAL int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
+_NEON2SSE_GLOBAL int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
+_NEON2SSE_GLOBAL int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
+_NEON2SSE_GLOBAL float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
+_NEON2SSESTORAGE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
+_NEON2SSE_GLOBAL poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
+_NEON2SSE_GLOBAL poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
+_NEON2SSESTORAGE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
+_NEON2SSESTORAGE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
+_NEON2SSESTORAGE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
+_NEON2SSESTORAGE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_GLOBAL int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_GLOBAL int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_GLOBAL int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_GLOBAL int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_GLOBAL float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
+_NEON2SSESTORAGE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_GLOBAL poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_GLOBAL poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
+//Load all lanes of N-element structure with same value from memory
+_NEON2SSESTORAGE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
+_NEON2SSESTORAGE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
+_NEON2SSESTORAGE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
+_NEON2SSE_GLOBAL uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
+_NEON2SSE_GLOBAL int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
+_NEON2SSE_GLOBAL int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
+_NEON2SSE_GLOBAL int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
+_NEON2SSE_GLOBAL int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
+//float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
+_NEON2SSESTORAGE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
+_NEON2SSE_GLOBAL poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
+_NEON2SSE_GLOBAL poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
+_NEON2SSESTORAGE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
+_NEON2SSESTORAGE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
+_NEON2SSESTORAGE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
+_NEON2SSESTORAGE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
+_NEON2SSE_GLOBAL int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
+_NEON2SSE_GLOBAL int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
+_NEON2SSE_GLOBAL int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
+_NEON2SSE_GLOBAL int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
+_NEON2SSE_GLOBAL float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
+_NEON2SSESTORAGE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
+_NEON2SSE_GLOBAL poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
+_NEON2SSE_GLOBAL poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
+_NEON2SSESTORAGE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
+_NEON2SSESTORAGE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
+_NEON2SSESTORAGE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
+_NEON2SSESTORAGE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_GLOBAL int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
+_NEON2SSE_GLOBAL int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
+_NEON2SSE_GLOBAL int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
+_NEON2SSE_GLOBAL int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_GLOBAL float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
+_NEON2SSESTORAGE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
+_NEON2SSE_GLOBAL poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
+_NEON2SSE_GLOBAL poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
+//Load a single lane of N-element structure from memory
+//the functions below are modified to deal with the error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned
+_NEON2SSESTORAGE uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
+_NEON2SSESTORAGE uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
+_NEON2SSESTORAGE int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
+_NEON2SSESTORAGE int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
+_NEON2SSE_GLOBAL float16x8x2_t vld2q_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
+_NEON2SSESTORAGE float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
+_NEON2SSE_GLOBAL poly16x8x2_t vld2q_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
+_NEON2SSESTORAGE uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
+_NEON2SSESTORAGE uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
+_NEON2SSESTORAGE uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
+_NEON2SSE_GLOBAL int8x8x2_t vld2_lane_s8(__transfersize(2) int8_t const * ptr, int8x8x2_t src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
+_NEON2SSE_GLOBAL int16x4x2_t vld2_lane_s16(__transfersize(2) int16_t const * ptr, int16x4x2_t src, __constrange(0,3) int lane); //VLD2.16 {d0[0], d1[0]}, [r0]
+_NEON2SSE_GLOBAL int32x2x2_t vld2_lane_s32(__transfersize(2) int32_t const * ptr, int32x2x2_t src, __constrange(0,1) int lane); //VLD2.32 {d0[0], d1[0]}, [r0]
+//float16x4x2_t vld2_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
+_NEON2SSESTORAGE float32x2x2_t vld2_lane_f32(__transfersize(2) float32_t const * ptr, float32x2x2_t  src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
+_NEON2SSE_GLOBAL poly8x8x2_t vld2_lane_p8(__transfersize(2) poly8_t const * ptr, poly8x8x2_t  src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
+_NEON2SSE_GLOBAL poly16x4x2_t vld2_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x4x2_t  src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
+_NEON2SSESTORAGE uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSESTORAGE uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSESTORAGE int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSESTORAGE int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSE_GLOBAL float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSESTORAGE float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSE_GLOBAL poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSESTORAGE uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSESTORAGE uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x4x3_t src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSESTORAGE uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSE_GLOBAL int8x8x3_t vld3_lane_s8(__transfersize(3) int8_t const * ptr, int8x8x3_t src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSE_GLOBAL int16x4x3_t vld3_lane_s16(__transfersize(3) int16_t const * ptr, int16x4x3_t src, __constrange(0,3) int lane); //VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSE_GLOBAL int32x2x3_t vld3_lane_s32(__transfersize(3) int32_t const * ptr, int32x2x3_t src, __constrange(0,1) int lane); //VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSE_GLOBAL float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSESTORAGE float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSE_GLOBAL poly8x8x3_t vld3_lane_p8(__transfersize(3) poly8_t const * ptr, poly8x8x3_t src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSE_GLOBAL poly16x4x3_t vld3_lane_p16(__transfersize(3) poly16_t const * ptr, poly16x4x3_t src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSESTORAGE uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSESTORAGE uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSE_GLOBAL int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSE_GLOBAL int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSE_GLOBAL float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSESTORAGE float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSE_GLOBAL poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSESTORAGE uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSESTORAGE uint16x4x4_t vld4_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x4x4_t src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSESTORAGE uint32x2x4_t vld4_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x2x4_t src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSE_GLOBAL int8x8x4_t vld4_lane_s8(__transfersize(4) int8_t const * ptr, int8x8x4_t src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSE_GLOBAL int16x4x4_t vld4_lane_s16(__transfersize(4) int16_t const * ptr, int16x4x4_t src, __constrange(0,3) int lane); //VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSE_GLOBAL int32x2x4_t vld4_lane_s32(__transfersize(4) int32_t const * ptr, int32x2x4_t src, __constrange(0,1) int lane); //VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSE_GLOBAL float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSESTORAGE float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSE_GLOBAL poly8x8x4_t vld4_lane_p8(__transfersize(4) poly8_t const * ptr, poly8x8x4_t src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSE_GLOBAL poly16x4x4_t vld4_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x4x4_t src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+//Store N-element structure to memory
+_NEON2SSESTORAGE void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t const * val); // VST2.8 {d0, d2}, [r0]
+_NEON2SSESTORAGE void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t const * val); // VST2.16 {d0, d2}, [r0]
+_NEON2SSESTORAGE void vst2q_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x4x2_t const * val); // VST2.32 {d0, d2}, [r0]
+_NEON2SSE_GLOBAL void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t const * val); // VST2.8 {d0, d2}, [r0]
+_NEON2SSE_GLOBAL void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t const * val); // VST2.16 {d0, d2}, [r0]
+_NEON2SSE_GLOBAL void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t const * val); // VST2.32 {d0, d2}, [r0]
+_NEON2SSE_GLOBAL void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t const * val); // VST2.16 {d0, d2}, [r0]
+_NEON2SSESTORAGE void vst2q_f32_ptr(__transfersize(8) float32_t * ptr, float32x4x2_t const * val); // VST2.32 {d0, d2}, [r0]
+_NEON2SSE_GLOBAL void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t const * val); // VST2.8 {d0, d2}, [r0]
+_NEON2SSE_GLOBAL void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t const * val); // VST2.16 {d0, d2}, [r0]
+_NEON2SSESTORAGE void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val); // VST2.8 {d0, d1}, [r0]
+_NEON2SSESTORAGE void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val); // VST2.16 {d0, d1}, [r0]
+_NEON2SSESTORAGE void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val); // VST2.32 {d0, d1}, [r0]
+_NEON2SSESTORAGE void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val); // VST1.64 {d0, d1}, [r0]
+_NEON2SSE_GLOBAL void vst2_s8(__transfersize(16) int8_t * ptr, int8x8x2_t val); // VST2.8 {d0, d1}, [r0]
+_NEON2SSE_GLOBAL void vst2_s16(__transfersize(8) int16_t * ptr, int16x4x2_t val); // VST2.16 {d0, d1}, [r0]
+_NEON2SSE_GLOBAL void vst2_s32(__transfersize(4) int32_t * ptr, int32x2x2_t val); // VST2.32 {d0, d1}, [r0]
+_NEON2SSE_GLOBAL void vst2_s64(__transfersize(2) int64_t * ptr, int64x1x2_t val); // VST1.64 {d0, d1}, [r0]
+//void vst2_f16_ptr(__transfersize(8) __fp16 * ptr, float16x4x2_t const * val); // VST2.16 {d0, d1}, [r0]
+_NEON2SSE_GLOBAL void vst2_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x2_t const * val); // VST2.32 {d0, d1}, [r0]
+_NEON2SSE_GLOBAL void vst2_p8(__transfersize(16) poly8_t * ptr, poly8x8x2_t val); // VST2.8 {d0, d1}, [r0]
+_NEON2SSE_GLOBAL void vst2_p16(__transfersize(8) poly16_t * ptr, poly16x4x2_t val); // VST2.16 {d0, d1}, [r0]
+_NEON2SSESTORAGE void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t const * val); // VST3.8 {d0, d2, d4}, [r0]
+_NEON2SSESTORAGE void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t const * val); // VST3.16 {d0, d2, d4}, [r0]
+_NEON2SSESTORAGE void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t const * val); // VST3.32 {d0, d2, d4}, [r0]
+_NEON2SSE_GLOBAL void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t const * val); // VST3.8 {d0, d2, d4}, [r0]
+_NEON2SSE_GLOBAL void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t const * val); // VST3.16 {d0, d2, d4}, [r0]
+_NEON2SSE_GLOBAL void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t const * val); // VST3.32 {d0, d2, d4}, [r0]
+_NEON2SSE_GLOBAL void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t const * val); // VST3.16 {d0, d2, d4}, [r0]
+_NEON2SSESTORAGE void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t const * val); // VST3.32 {d0, d2, d4}, [r0]
+_NEON2SSE_GLOBAL void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t const * val); // VST3.8 {d0, d2, d4}, [r0]
+_NEON2SSE_GLOBAL void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t const * val); // VST3.16 {d0, d2, d4}, [r0]
+_NEON2SSESTORAGE void vst3_u8(__transfersize(24) uint8_t * ptr, uint8x8x3_t val); // VST3.8 {d0, d1, d2}, [r0]
+_NEON2SSESTORAGE void vst3_u16(__transfersize(12) uint16_t * ptr, uint16x4x3_t val); // VST3.16 {d0, d1, d2}, [r0]
+_NEON2SSESTORAGE void vst3_u32(__transfersize(6) uint32_t * ptr, uint32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
+_NEON2SSESTORAGE void vst3_u64(__transfersize(3) uint64_t * ptr, uint64x1x3_t val); // VST1.64 {d0, d1, d2}, [r0]
+_NEON2SSE_GLOBAL void vst3_s8(__transfersize(24) int8_t * ptr, int8x8x3_t val); // VST3.8 {d0, d1, d2}, [r0]
+_NEON2SSE_GLOBAL void vst3_s16(__transfersize(12) int16_t * ptr, int16x4x3_t val); // VST3.16 {d0, d1, d2}, [r0]
+_NEON2SSE_GLOBAL void vst3_s32(__transfersize(6) int32_t * ptr, int32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
+_NEON2SSE_GLOBAL void vst3_s64(__transfersize(3) int64_t * ptr, int64x1x3_t val); // VST1.64 {d0, d1, d2}, [r0]
+_NEON2SSE_GLOBAL void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t const * val); // VST3.16 {d0, d1, d2}, [r0]
+_NEON2SSESTORAGE void vst3_f32(__transfersize(6) float32_t * ptr, float32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
+_NEON2SSE_GLOBAL void vst3_p8(__transfersize(24) poly8_t * ptr, poly8x8x3_t val); // VST3.8 {d0, d1, d2}, [r0]
+_NEON2SSE_GLOBAL void vst3_p16(__transfersize(12) poly16_t * ptr, poly16x4x3_t val); // VST3.16 {d0, d1, d2}, [r0]
+_NEON2SSESTORAGE void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t const * val); // VST4.8 {d0, d2, d4, d6}, [r0]
+_NEON2SSESTORAGE void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t const * val); // VST4.16 {d0, d2, d4, d6}, [r0]
+_NEON2SSESTORAGE void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t const * val); // VST4.32 {d0, d2, d4, d6}, [r0]
+_NEON2SSE_GLOBAL void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t const * val); // VST4.8 {d0, d2, d4, d6}, [r0]
+_NEON2SSE_GLOBAL void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t const * val); // VST4.16 {d0, d2, d4, d6}, [r0]
+_NEON2SSE_GLOBAL void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t const * val); // VST4.32 {d0, d2, d4, d6}, [r0]
+_NEON2SSE_GLOBAL void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t const * val); // VST4.16 {d0, d2, d4, d6}, [r0]
+_NEON2SSESTORAGE void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t const * val); // VST4.32 {d0, d2, d4, d6}, [r0]
+_NEON2SSE_GLOBAL void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t const * val); // VST4.8 {d0, d2, d4, d6}, [r0]
+_NEON2SSE_GLOBAL void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t const * val); // VST4.16 {d0, d2, d4, d6}, [r0]
+_NEON2SSESTORAGE void vst4_u8(__transfersize(32) uint8_t * ptr, uint8x8x4_t val); // VST4.8 {d0, d1, d2, d3}, [r0]
+_NEON2SSESTORAGE void vst4_u16(__transfersize(16) uint16_t * ptr, uint16x4x4_t val); // VST4.16 {d0, d1, d2, d3}, [r0]
+_NEON2SSESTORAGE void vst4_u32(__transfersize(8) uint32_t * ptr, uint32x2x4_t val); // VST4.32 {d0, d1, d2, d3}, [r0]
+_NEON2SSESTORAGE void vst4_u64(__transfersize(4) uint64_t * ptr, uint64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_GLOBAL void vst4_s8(__transfersize(32) int8_t * ptr, int8x8x4_t val); // VST4.8 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_GLOBAL void vst4_s16(__transfersize(16) int16_t * ptr, int16x4x4_t val); // VST4.16 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_GLOBAL void vst4_s32(__transfersize(8) int32_t * ptr, int32x2x4_t val); // VST4.32 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_GLOBAL void vst4_s64(__transfersize(4) int64_t * ptr, int64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_GLOBAL void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t const * val); // VST4.16 {d0, d1, d2, d3}, [r0]
+_NEON2SSESTORAGE void vst4_f32(__transfersize(8) float32_t * ptr, float32x2x4_t val); // VST4.32 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_GLOBAL void vst4_p8(__transfersize(32) poly8_t * ptr, poly8x8x4_t val); // VST4.8 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_GLOBAL void vst4_p16(__transfersize(16) poly16_t * ptr, poly16x4x4_t val); // VST4.16 {d0, d1, d2, d3}, [r0]
+//Store a single lane of N-element structure to memory
+_NEON2SSESTORAGE void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t const * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
+_NEON2SSESTORAGE void vst2q_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x4x2_t const * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0]
+_NEON2SSE_GLOBAL void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t const * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
+_NEON2SSE_GLOBAL void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t const * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0]
+_NEON2SSE_GLOBAL void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t const * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
+_NEON2SSESTORAGE void vst2q_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x4x2_t const * val, __constrange(0,3) int lane); //VST2.32 {d0[0], d2[0]}, [r0]
+_NEON2SSE_GLOBAL void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t const * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
+_NEON2SSESTORAGE void vst2_lane_u8(__transfersize(2) uint8_t * ptr, uint8x8x2_t val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0]
+_NEON2SSESTORAGE void vst2_lane_u16(__transfersize(2) uint16_t * ptr, uint16x4x2_t val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
+_NEON2SSESTORAGE void vst2_lane_u32(__transfersize(2) uint32_t * ptr, uint32x2x2_t val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
+_NEON2SSE_GLOBAL void vst2_lane_s8(__transfersize(2) int8_t * ptr, int8x8x2_t val, __constrange(0,7) int lane); // VST2.8 {d0[0],d1[0]}, [r0]
+_NEON2SSE_GLOBAL void vst2_lane_s16(__transfersize(2) int16_t * ptr, int16x4x2_t val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
+_NEON2SSE_GLOBAL void vst2_lane_s32(__transfersize(2) int32_t * ptr, int32x2x2_t val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
+_NEON2SSE_GLOBAL void vst2_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x4x2_t const * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
+_NEON2SSESTORAGE void vst2_lane_f32(__transfersize(2) float32_t * ptr, float32x2x2_t val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
+_NEON2SSE_GLOBAL void vst2_lane_p8(__transfersize(2) poly8_t * ptr, poly8x8x2_t val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0]
+_NEON2SSE_GLOBAL void vst2_lane_p16(__transfersize(2) poly16_t * ptr, poly16x4x2_t val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
+_NEON2SSESTORAGE void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t const * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSESTORAGE void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t const * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSE_GLOBAL void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t const * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSE_GLOBAL void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t const * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSE_GLOBAL void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t const * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSESTORAGE void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t const * val, __constrange(0,3) int lane); //VST3.32 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSE_GLOBAL void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t const * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSESTORAGE void vst3_lane_u8(__transfersize(3) uint8_t * ptr, uint8x8x3_t val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSESTORAGE void vst3_lane_u16(__transfersize(3) uint16_t * ptr, uint16x4x3_t val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSESTORAGE void vst3_lane_u32(__transfersize(3) uint32_t * ptr, uint32x2x3_t val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSE_GLOBAL void vst3_lane_s8(__transfersize(3) int8_t * ptr, int8x8x3_t val, __constrange(0,7) int lane); // VST3.8 {d0[0],d1[0], d2[0]}, [r0]
+_NEON2SSE_GLOBAL void vst3_lane_s16(__transfersize(3) int16_t * ptr, int16x4x3_t val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSE_GLOBAL void vst3_lane_s32(__transfersize(3) int32_t * ptr, int32x2x3_t val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSE_GLOBAL void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t const * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSESTORAGE void vst3_lane_f32(__transfersize(3) float32_t * ptr, float32x2x3_t val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSE_GLOBAL void vst3_lane_p8(__transfersize(3) poly8_t * ptr, poly8x8x3_t val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSE_GLOBAL void vst3_lane_p16(__transfersize(3) poly16_t * ptr, poly16x4x3_t val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSESTORAGE void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t const * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSESTORAGE void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t const * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSE_GLOBAL void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t const * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSE_GLOBAL void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t const * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSE_GLOBAL void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t const * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSESTORAGE void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t const * val, __constrange(0,3) int lane); //VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSE_GLOBAL void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t const * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSESTORAGE void vst4_lane_u8(__transfersize(4) uint8_t * ptr, uint8x8x4_t val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSESTORAGE void vst4_lane_u16(__transfersize(4) uint16_t * ptr, uint16x4x4_t val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSESTORAGE void vst4_lane_u32(__transfersize(4) uint32_t * ptr, uint32x2x4_t val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSE_GLOBAL void vst4_lane_s8(__transfersize(4) int8_t * ptr, int8x8x4_t val, __constrange(0,7) int lane); // VST4.8 {d0[0],d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSE_GLOBAL void vst4_lane_s16(__transfersize(4) int16_t * ptr, int16x4x4_t val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSE_GLOBAL void vst4_lane_s32(__transfersize(4) int32_t * ptr, int32x2x4_t val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSE_GLOBAL void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t const * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSESTORAGE void vst4_lane_f32(__transfersize(4) float32_t * ptr, float32x2x4_t val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSE_GLOBAL void vst4_lane_p8(__transfersize(4) poly8_t * ptr, poly8x8x4_t val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSE_GLOBAL void vst4_lane_p16(__transfersize(4) poly16_t * ptr, poly16x4x4_t val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
+//Extract lanes from a vector and put into a register. These intrinsics extract a single lane (element) from a vector.
+_NEON2SSE_GLOBAL uint8_t vget_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
+_NEON2SSE_GLOBAL uint16_t vget_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0]
+_NEON2SSE_GLOBAL uint32_t vget_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
+_NEON2SSE_GLOBAL int8_t vget_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VMOV.S8 r0, d0[0]
+_NEON2SSE_GLOBAL int16_t vget_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VMOV.S16 r0, d0[0]
+_NEON2SSE_GLOBAL int32_t vget_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
+_NEON2SSE_GLOBAL poly8_t vget_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
+_NEON2SSE_GLOBAL poly16_t vget_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0]
+_NEON2SSE_GLOBAL float32_t vget_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
+_NEON2SSE_GLOBAL uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
+_NEON2SSE_GLOBAL uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0]
+_NEON2SSE_GLOBAL uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
+_NEON2SSE_GLOBAL int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0]
+_NEON2SSE_GLOBAL int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0]
+_NEON2SSE_GLOBAL int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
+_NEON2SSE_GLOBAL poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
+_NEON2SSE_GLOBAL poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0]
+_NEON2SSESTORAGE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
+_NEON2SSE_GLOBAL int64_t vget_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
+_NEON2SSE_GLOBAL uint64_t vget_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
+_NEON2SSE_GLOBAL int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
+_NEON2SSE_GLOBAL uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
+//Load a single lane of a vector from a literal. These intrinsics set a single lane (element) within a vector.
+_NEON2SSESTORAGE uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
+_NEON2SSESTORAGE uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
+_NEON2SSESTORAGE uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
+_NEON2SSESTORAGE int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
+_NEON2SSESTORAGE int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
+_NEON2SSESTORAGE int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
+_NEON2SSE_GLOBAL poly8x8_t vset_lane_p8(poly8_t value, poly8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
+_NEON2SSE_GLOBAL poly16x4_t vset_lane_p16(poly16_t value, poly16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
+_NEON2SSESTORAGE float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
+_NEON2SSESTORAGE uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
+_NEON2SSESTORAGE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
+_NEON2SSESTORAGE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
+_NEON2SSESTORAGE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
+_NEON2SSESTORAGE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
+_NEON2SSESTORAGE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
+_NEON2SSE_GLOBAL poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
+_NEON2SSE_GLOBAL poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
+_NEON2SSESTORAGE float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
+_NEON2SSESTORAGE int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
+_NEON2SSESTORAGE uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
+_NEON2SSESTORAGE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
+_NEON2SSE_GLOBAL uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
+//Initialize a vector from a literal bit pattern.
+_NEON2SSESTORAGE int8x8_t vcreate_s8(uint64_t a); // VMOV d0,r0,r0
+_NEON2SSE_GLOBAL int16x4_t vcreate_s16(uint64_t a); // VMOV d0,r0,r0
+_NEON2SSE_GLOBAL int32x2_t vcreate_s32(uint64_t a); // VMOV d0,r0,r0
+_NEON2SSE_GLOBAL float16x4_t vcreate_f16(uint64_t a); // VMOV d0,r0,r0
+_NEON2SSESTORAGE float32x2_t vcreate_f32(uint64_t a); // VMOV d0,r0,r0
+_NEON2SSE_GLOBAL uint8x8_t vcreate_u8(uint64_t a); // VMOV d0,r0,r0
+_NEON2SSE_GLOBAL uint16x4_t vcreate_u16(uint64_t a); // VMOV d0,r0,r0
+_NEON2SSE_GLOBAL uint32x2_t vcreate_u32(uint64_t a); // VMOV d0,r0,r0
+_NEON2SSE_GLOBAL uint64x1_t vcreate_u64(uint64_t a); // VMOV d0,r0,r0
+_NEON2SSE_GLOBAL poly8x8_t vcreate_p8(uint64_t a); // VMOV d0,r0,r0
+_NEON2SSE_GLOBAL poly16x4_t vcreate_p16(uint64_t a); // VMOV d0,r0,r0
+_NEON2SSE_GLOBAL int64x1_t vcreate_s64(uint64_t a); // VMOV d0,r0,r0
+//Set all lanes to same value
+//Load all lanes of vector to the same literal value
+_NEON2SSESTORAGE uint8x8_t vdup_n_u8(uint8_t value); // VDUP.8 d0,r0
+_NEON2SSESTORAGE uint16x4_t vdup_n_u16(uint16_t value); // VDUP.16 d0,r0
+_NEON2SSESTORAGE uint32x2_t vdup_n_u32(uint32_t value); // VDUP.32 d0,r0
+_NEON2SSESTORAGE int8x8_t vdup_n_s8(int8_t value); // VDUP.8 d0,r0
+_NEON2SSESTORAGE int16x4_t vdup_n_s16(int16_t value); // VDUP.16 d0,r0
+_NEON2SSESTORAGE int32x2_t vdup_n_s32(int32_t value); // VDUP.32 d0,r0
+_NEON2SSE_GLOBAL poly8x8_t vdup_n_p8(poly8_t value); // VDUP.8 d0,r0
+_NEON2SSE_GLOBAL poly16x4_t vdup_n_p16(poly16_t value); // VDUP.16 d0,r0
+_NEON2SSESTORAGE float32x2_t vdup_n_f32(float32_t value); // VDUP.32 d0,r0
+_NEON2SSE_GLOBAL uint8x16_t vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0
+_NEON2SSE_GLOBAL uint16x8_t vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0
+_NEON2SSE_GLOBAL uint32x4_t vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0
+_NEON2SSE_GLOBAL int8x16_t vdupq_n_s8(int8_t value); // VDUP.8 q0,r0
+_NEON2SSE_GLOBAL int16x8_t vdupq_n_s16(int16_t value); // VDUP.16 q0,r0
+_NEON2SSE_GLOBAL int32x4_t vdupq_n_s32(int32_t value); // VDUP.32 q0,r0
+_NEON2SSE_GLOBAL poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0
+_NEON2SSE_GLOBAL poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0
+_NEON2SSE_GLOBAL float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0
+_NEON2SSESTORAGE int64x1_t vdup_n_s64(int64_t value); // VMOV d0,r0,r0
+_NEON2SSESTORAGE uint64x1_t vdup_n_u64(uint64_t value); // VMOV d0,r0,r0
+_NEON2SSESTORAGE int64x2_t vdupq_n_s64(int64_t value); // VMOV d0,r0,r0
+_NEON2SSESTORAGE uint64x2_t vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0
+_NEON2SSE_GLOBAL uint8x8_t vmov_n_u8(uint8_t value); // VDUP.8 d0,r0
+_NEON2SSE_GLOBAL uint16x4_t vmov_n_u16(uint16_t value); // VDUP.16 d0,r0
+_NEON2SSE_GLOBAL uint32x2_t vmov_n_u32(uint32_t value); // VDUP.32 d0,r0
+_NEON2SSE_GLOBAL int8x8_t vmov_n_s8(int8_t value); // VDUP.8 d0,r0
+_NEON2SSE_GLOBAL int16x4_t vmov_n_s16(int16_t value); // VDUP.16 d0,r0
+_NEON2SSE_GLOBAL int32x2_t vmov_n_s32(int32_t value); // VDUP.32 d0,r0
+_NEON2SSE_GLOBAL poly8x8_t vmov_n_p8(poly8_t value); // VDUP.8 d0,r0
+_NEON2SSE_GLOBAL poly16x4_t vmov_n_p16(poly16_t value); // VDUP.16 d0,r0
+_NEON2SSE_GLOBAL float32x2_t vmov_n_f32(float32_t value); // VDUP.32 d0,r0
+_NEON2SSE_GLOBAL uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0
+_NEON2SSE_GLOBAL uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0
+_NEON2SSE_GLOBAL uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0
+_NEON2SSE_GLOBAL int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0
+_NEON2SSE_GLOBAL int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0
+_NEON2SSE_GLOBAL int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0
+_NEON2SSE_GLOBAL poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0
+_NEON2SSE_GLOBAL poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0
+_NEON2SSE_GLOBAL float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0
+_NEON2SSE_GLOBAL int64x1_t vmov_n_s64(int64_t value); // VMOV d0,r0,r0
+_NEON2SSE_GLOBAL uint64x1_t vmov_n_u64(uint64_t value); // VMOV d0,r0,r0
+_NEON2SSE_GLOBAL int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0
+_NEON2SSE_GLOBAL uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0
+//Load all lanes of the vector to the value of a lane of a vector
+_NEON2SSESTORAGE uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
+_NEON2SSESTORAGE uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
+_NEON2SSESTORAGE uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
+_NEON2SSE_GLOBAL int8x8_t vdup_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
+_NEON2SSE_GLOBAL int16x4_t vdup_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
+_NEON2SSE_GLOBAL int32x2_t vdup_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
+_NEON2SSE_GLOBAL poly8x8_t vdup_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
+_NEON2SSE_GLOBAL poly16x4_t vdup_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
+_NEON2SSESTORAGE float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
+_NEON2SSESTORAGE uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
+_NEON2SSESTORAGE uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
+_NEON2SSESTORAGE uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
+_NEON2SSE_GLOBAL int8x16_t vdupq_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
+_NEON2SSE_GLOBAL int16x8_t vdupq_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
+_NEON2SSE_GLOBAL int32x4_t vdupq_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
+_NEON2SSE_GLOBAL poly8x16_t vdupq_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
+_NEON2SSE_GLOBAL poly16x8_t vdupq_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
+_NEON2SSE_GLOBAL float32x4_t vdupq_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
+_NEON2SSE_GLOBAL int64x1_t vdup_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
+_NEON2SSE_GLOBAL uint64x1_t vdup_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
+_NEON2SSESTORAGE int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
+_NEON2SSE_GLOBAL uint64x2_t vdupq_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
+//Combining vectors. These intrinsics join two 64 bit vectors into a single 128bit vector.
+_NEON2SSESTORAGE int8x16_t vcombine_s8(int8x8_t low, int8x8_t high); // VMOV d0,d0
+_NEON2SSE_GLOBAL int16x8_t vcombine_s16(int16x4_t low, int16x4_t high); // VMOV d0,d0
+_NEON2SSE_GLOBAL int32x4_t vcombine_s32(int32x2_t low, int32x2_t high); // VMOV d0,d0
+_NEON2SSE_GLOBAL int64x2_t vcombine_s64(int64x1_t low, int64x1_t high); // VMOV d0,d0
+_NEON2SSE_GLOBAL float16x8_t vcombine_f16(float16x4_t low, float16x4_t high); // VMOV d0,d0
+_NEON2SSESTORAGE float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
+_NEON2SSE_GLOBAL uint8x16_t vcombine_u8(uint8x8_t low, uint8x8_t high); // VMOV d0,d0
+_NEON2SSE_GLOBAL uint16x8_t vcombine_u16(uint16x4_t low, uint16x4_t high); // VMOV d0,d0
+_NEON2SSE_GLOBAL uint32x4_t vcombine_u32(uint32x2_t low, uint32x2_t high); // VMOV d0,d0
+_NEON2SSE_GLOBAL uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high); // VMOV d0,d0
+_NEON2SSE_GLOBAL poly8x16_t vcombine_p8(poly8x8_t low, poly8x8_t high); // VMOV d0,d0
+_NEON2SSE_GLOBAL poly16x8_t vcombine_p16(poly16x4_t low, poly16x4_t high); // VMOV d0,d0
+//Splitting vectors. These intrinsics split a 128-bit vector into two component 64-bit vectors
+_NEON2SSESTORAGE int8x8_t vget_high_s8(int8x16_t a); // VMOV d0,d0
+_NEON2SSESTORAGE int16x4_t vget_high_s16(int16x8_t a); // VMOV d0,d0
+_NEON2SSESTORAGE int32x2_t vget_high_s32(int32x4_t a); // VMOV d0,d0
+_NEON2SSESTORAGE int64x1_t vget_high_s64(int64x2_t a); // VMOV d0,d0
+_NEON2SSE_GLOBAL float16x4_t vget_high_f16(float16x8_t a); // VMOV d0,d0
+_NEON2SSESTORAGE float32x2_t vget_high_f32(float32x4_t a); // VMOV d0,d0
+_NEON2SSE_GLOBAL uint8x8_t vget_high_u8(uint8x16_t a); // VMOV d0,d0
+_NEON2SSE_GLOBAL uint16x4_t vget_high_u16(uint16x8_t a); // VMOV d0,d0
+_NEON2SSE_GLOBAL uint32x2_t vget_high_u32(uint32x4_t a); // VMOV d0,d0
+_NEON2SSE_GLOBAL uint64x1_t vget_high_u64(uint64x2_t a); // VMOV d0,d0
+_NEON2SSE_GLOBAL poly8x8_t vget_high_p8(poly8x16_t a); // VMOV d0,d0
+_NEON2SSE_GLOBAL poly16x4_t vget_high_p16(poly16x8_t a); // VMOV d0,d0
+_NEON2SSESTORAGE int8x8_t vget_low_s8(int8x16_t a); // VMOV d0,d0
+_NEON2SSESTORAGE int16x4_t vget_low_s16(int16x8_t a); // VMOV d0,d0
+_NEON2SSESTORAGE int32x2_t vget_low_s32(int32x4_t a); // VMOV d0,d0
+_NEON2SSESTORAGE int64x1_t vget_low_s64(int64x2_t a); // VMOV d0,d0
+_NEON2SSE_GLOBAL float16x4_t vget_low_f16(float16x8_t a); // VMOV d0,d0
+_NEON2SSESTORAGE float32x2_t vget_low_f32(float32x4_t a); // VMOV d0,d0
+_NEON2SSE_GLOBAL uint8x8_t vget_low_u8(uint8x16_t a); // VMOV d0,d0
+_NEON2SSE_GLOBAL uint16x4_t vget_low_u16(uint16x8_t a); // VMOV d0,d0
+_NEON2SSE_GLOBAL uint32x2_t vget_low_u32(uint32x4_t a); // VMOV d0,d0
+_NEON2SSE_GLOBAL uint64x1_t vget_low_u64(uint64x2_t a); // VMOV d0,d0
+_NEON2SSE_GLOBAL poly8x8_t vget_low_p8(poly8x16_t a); // VMOV d0,d0
+_NEON2SSE_GLOBAL poly16x4_t vget_low_p16(poly16x8_t a); // VMOV d0,d0
+//Converting vectors. These intrinsics convert between floating-point and integer vectors, and between floating-point precisions.
+//Convert from float
+_NEON2SSESTORAGE int32x2_t vcvt_s32_f32(float32x2_t a); // VCVT.S32.F32 d0, d0
+_NEON2SSESTORAGE uint32x2_t vcvt_u32_f32(float32x2_t a); // VCVT.U32.F32 d0, d0
+_NEON2SSESTORAGE int32x4_t vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0
+_NEON2SSESTORAGE uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0
+_NEON2SSESTORAGE int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F32 d0, d0, #32
+_NEON2SSESTORAGE uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32
+_NEON2SSESTORAGE int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32
+_NEON2SSESTORAGE uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32
+_NEON2SSESTORAGE int32x4_t vcvtnq_s32_f32(float32x4_t a); // VCVTN.S32.F32 q0, q0
+//Convert to float
+_NEON2SSESTORAGE float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0
+_NEON2SSESTORAGE float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0
+_NEON2SSE_GLOBAL float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0
+_NEON2SSESTORAGE float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0
+_NEON2SSESTORAGE float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b); // VCVT.F32.S32 d0, d0, #32
+_NEON2SSESTORAGE float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b); // VCVT.F32.U32 d0, d0, #32
+_NEON2SSESTORAGE float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32
+_NEON2SSESTORAGE float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32
+//Convert between floats
+_NEON2SSE_GLOBAL float16x4_t vcvt_f16_f32(float32x4_t a); // VCVT.F16.F32 d0, q0
+_NEON2SSE_GLOBAL float32x4_t vcvt_f32_f16(float16x4_t a); // VCVT.F32.F16 q0, d0
+//Vector narrow integer
+_NEON2SSESTORAGE int8x8_t vmovn_s16(int16x8_t a); // VMOVN.I16 d0,q0
+_NEON2SSESTORAGE int16x4_t vmovn_s32(int32x4_t a); // VMOVN.I32 d0,q0
+_NEON2SSESTORAGE int32x2_t vmovn_s64(int64x2_t a); // VMOVN.I64 d0,q0
+_NEON2SSE_GLOBAL uint8x8_t vmovn_u16(uint16x8_t a); // VMOVN.I16 d0,q0
+_NEON2SSE_GLOBAL uint16x4_t vmovn_u32(uint32x4_t a); // VMOVN.I32 d0,q0
+_NEON2SSE_GLOBAL uint32x2_t vmovn_u64(uint64x2_t a); // VMOVN.I64 d0,q0
+//Vector long move
+_NEON2SSESTORAGE int16x8_t vmovl_s8(int8x8_t a); // VMOVL.S8 q0,d0
+_NEON2SSESTORAGE int32x4_t vmovl_s16(int16x4_t a); // VMOVL.S16 q0,d0
+_NEON2SSESTORAGE int64x2_t vmovl_s32(int32x2_t a); // VMOVL.S32 q0,d0
+_NEON2SSESTORAGE uint16x8_t vmovl_u8(uint8x8_t a); // VMOVL.U8 q0,d0
+_NEON2SSESTORAGE uint32x4_t vmovl_u16(uint16x4_t a); // VMOVL.U16 q0,d0
+_NEON2SSESTORAGE uint64x2_t vmovl_u32(uint32x2_t a); // VMOVL.U32 q0,d0
+//Vector saturating narrow integer
+_NEON2SSESTORAGE int8x8_t vqmovn_s16(int16x8_t a); // VQMOVN.S16 d0,q0
+_NEON2SSESTORAGE int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0
+_NEON2SSESTORAGE int32x2_t vqmovn_s64(int64x2_t a); // VQMOVN.S64 d0,q0
+_NEON2SSESTORAGE uint8x8_t vqmovn_u16(uint16x8_t a); // VQMOVN.U16 d0,q0
+_NEON2SSESTORAGE uint16x4_t vqmovn_u32(uint32x4_t a); // VQMOVN.U32 d0,q0
+_NEON2SSESTORAGE uint32x2_t vqmovn_u64(uint64x2_t a); // VQMOVN.U64 d0,q0
+//Vector saturating narrow integer signed->unsigned
+_NEON2SSESTORAGE uint8x8_t vqmovun_s16(int16x8_t a); // VQMOVUN.S16 d0,q0
+_NEON2SSESTORAGE uint16x4_t vqmovun_s32(int32x4_t a); // VQMOVUN.S32 d0,q0
+_NEON2SSESTORAGE uint32x2_t vqmovun_s64(int64x2_t a); // VQMOVUN.S64 d0,q0
+//Table look up
+_NEON2SSESTORAGE uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
+_NEON2SSE_GLOBAL int8x8_t vtbl1_s8(int8x8_t a, int8x8_t b); // VTBL.8 d0, {d0}, d0
+_NEON2SSE_GLOBAL poly8x8_t vtbl1_p8(poly8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
+//Extended table look up intrinsics
+_NEON2SSESTORAGE uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
+_NEON2SSE_GLOBAL int8x8_t vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VTBX.8 d0, {d0}, d0
+_NEON2SSE_GLOBAL poly8x8_t vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
+_NEON2SSESTORAGE uint8x8_t vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
+_NEON2SSE_GLOBAL int8x8_t vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c); // VTBX.8 d0, {d0, d1}, d0
+_NEON2SSE_GLOBAL poly8x8_t vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
+_NEON2SSESTORAGE uint8x8_t vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
+_NEON2SSE_GLOBAL int8x8_t vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
+_NEON2SSE_GLOBAL poly8x8_t vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
+_NEON2SSESTORAGE uint8x8_t vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
+_NEON2SSE_GLOBAL int8x8_t vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
+_NEON2SSE_GLOBAL poly8x8_t vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
+//Operations with a scalar value
+//Vector multiply accumulate with scalar
+_NEON2SSESTORAGE int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0]
+_NEON2SSESTORAGE int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0]
+_NEON2SSE_GLOBAL uint16x4_t vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0]
+_NEON2SSE_GLOBAL uint32x2_t vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0]
+_NEON2SSESTORAGE float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 d0,d0, d0[0]
+_NEON2SSESTORAGE int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0,d0[0]
+_NEON2SSESTORAGE int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0,d0[0]
+_NEON2SSE_GLOBAL uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0,q0, d0[0]
+_NEON2SSE_GLOBAL uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0,q0, d0[0]
+_NEON2SSESTORAGE float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0,q0, d0[0]
+//Vector widening multiply accumulate with scalar
+_NEON2SSESTORAGE int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); //VMLAL.S16 q0, d0,d0[0]
+_NEON2SSESTORAGE int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); //VMLAL.S32 q0, d0,d0[0]
+_NEON2SSESTORAGE uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.U16 q0,d0, d0[0]
+_NEON2SSESTORAGE uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0,d0, d0[0]
+//Vector widening saturating doubling multiply accumulate with scalar
+_NEON2SSESTORAGE int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLAL.S16 q0,d0, d0[0]
+_NEON2SSESTORAGE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLAL.S32 q0,d0, d0[0]
+//Vector multiply subtract with scalar
+_NEON2SSESTORAGE int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0]
+_NEON2SSESTORAGE int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0]
+_NEON2SSESTORAGE uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0]
+_NEON2SSESTORAGE uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0]
+_NEON2SSESTORAGE float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 d0,d0, d0[0]
+_NEON2SSESTORAGE int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0,d0[0]
+_NEON2SSESTORAGE int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0,d0[0]
+_NEON2SSESTORAGE uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0,q0, d0[0]
+_NEON2SSESTORAGE uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0,q0, d0[0]
+_NEON2SSESTORAGE float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 q0,q0, d0[0]
+//Vector widening multiply subtract with scalar
+_NEON2SSESTORAGE int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLSL.S16 q0, d0,d0[0]
+_NEON2SSESTORAGE int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLSL.S32 q0, d0,d0[0]
+_NEON2SSESTORAGE uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLSL.U16 q0,d0, d0[0]
+_NEON2SSESTORAGE uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLSL.U32 q0,d0, d0[0]
+//Vector widening saturating doubling multiply subtract with scalar
+_NEON2SSESTORAGE int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLSL.S16 q0,d0, d0[0]
+_NEON2SSESTORAGE int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLSL.S32 q0,d0, d0[0]
+//Vector multiply by scalar
+_NEON2SSESTORAGE int16x4_t vmul_n_s16(int16x4_t a, int16_t b); // VMUL.I16 d0,d0,d0[0]
+_NEON2SSESTORAGE int32x2_t vmul_n_s32(int32x2_t a, int32_t b); // VMUL.I32 d0,d0,d0[0]
+_NEON2SSESTORAGE float32x2_t vmul_n_f32(float32x2_t a, float32_t b); // VMUL.F32 d0,d0,d0[0]
+_NEON2SSESTORAGE uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b); // VMUL.I16 d0,d0,d0[0]
+_NEON2SSESTORAGE uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b); // VMUL.I32 d0,d0,d0[0]
+_NEON2SSESTORAGE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0]
+_NEON2SSESTORAGE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0]
+_NEON2SSESTORAGE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0]
+_NEON2SSESTORAGE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0]
+_NEON2SSESTORAGE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0]
+//Vector long multiply with scalar
+_NEON2SSESTORAGE int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2); // VMULL.S16 q0,d0,d0[0]
+_NEON2SSESTORAGE int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2); // VMULL.S32 q0,d0,d0[0]
+_NEON2SSESTORAGE uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2); // VMULL.U16 q0,d0,d0[0]
+_NEON2SSESTORAGE uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2); // VMULL.U32 q0,d0,d0[0]
+//Vector long multiply by scalar
+_NEON2SSESTORAGE int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VMULL.S16 q0,d0,d0[0]
+_NEON2SSESTORAGE int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VMULL.S32 q0,d0,d0[0]
+_NEON2SSESTORAGE uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3); // VMULL.U16 q0,d0,d0[0]
+_NEON2SSESTORAGE uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3); // VMULL.U32 q0,d0,d0[0]
+//Vector saturating doubling long multiply with scalar
+_NEON2SSESTORAGE int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2); // VQDMULL.S16 q0,d0,d0[0]
+_NEON2SSESTORAGE int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2); // VQDMULL.S32 q0,d0,d0[0]
+//Vector saturating doubling long multiply by scalar
+_NEON2SSESTORAGE int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULL.S16 q0,d0,d0[0]
+_NEON2SSESTORAGE int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULL.S32 q0,d0,d0[0]
+//Vector saturating doubling multiply high with scalar
+_NEON2SSESTORAGE int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQDMULH.S16 d0,d0,d0[0]
+_NEON2SSESTORAGE int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQDMULH.S32 d0,d0,d0[0]
+_NEON2SSESTORAGE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQDMULH.S16 q0,q0,d0[0]
+_NEON2SSESTORAGE int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQDMULH.S32 q0,q0,d0[0]
+//Vector saturating doubling multiply high by scalar
+_NEON2SSESTORAGE int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 d0,d0,d0[0]
+_NEON2SSESTORAGE int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 d0,d0,d0[0]
+_NEON2SSESTORAGE int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 q0,q0,d0[0]
+_NEON2SSESTORAGE int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 q0,q0,d0[0]
+//Vector saturating rounding doubling multiply high with scalar
+_NEON2SSESTORAGE int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQRDMULH.S16 d0,d0,d0[0]
+_NEON2SSESTORAGE int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQRDMULH.S32 d0,d0,d0[0]
+_NEON2SSESTORAGE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0]
+_NEON2SSESTORAGE int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0]
+//Vector rounding saturating doubling multiply high by scalar
+_NEON2SSESTORAGE int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 d0,d0,d0[0]
+_NEON2SSESTORAGE int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 d0,d0,d0[0]
+_NEON2SSESTORAGE int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 q0,q0,d0[0]
+_NEON2SSESTORAGE int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 q0,q0,d0[0]
+//Vector multiply accumulate with scalar
+_NEON2SSESTORAGE int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLA.I16 d0, d0, d0[0]
+_NEON2SSESTORAGE int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLA.I32 d0, d0, d0[0]
+_NEON2SSE_GLOBAL uint16x4_t vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLA.I16 d0, d0, d0[0]
+_NEON2SSE_GLOBAL uint32x2_t vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLA.I32 d0, d0, d0[0]
+_NEON2SSESTORAGE float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLA.F32 d0, d0, d0[0]
+_NEON2SSESTORAGE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0]
+_NEON2SSESTORAGE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0]
+_NEON2SSE_GLOBAL uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0]
+_NEON2SSE_GLOBAL uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0]
+_NEON2SSESTORAGE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0]
+//Vector widening multiply accumulate with scalar
+_NEON2SSESTORAGE int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLAL.S16 q0, d0, d0[0]
+_NEON2SSESTORAGE int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLAL.S32 q0, d0, d0[0]
+_NEON2SSESTORAGE uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLAL.U16 q0, d0, d0[0]
+_NEON2SSESTORAGE uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLAL.U32 q0, d0, d0[0]
+//Vector widening saturating doubling multiply accumulate with scalar
+_NEON2SSESTORAGE int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLAL.S16 q0, d0, d0[0]
+_NEON2SSESTORAGE int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLAL.S32 q0, d0, d0[0]
+//Vector multiply subtract with scalar
+_NEON2SSESTORAGE int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLS.I16 d0, d0, d0[0]
+_NEON2SSESTORAGE int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLS.I32 d0, d0, d0[0]
+_NEON2SSESTORAGE uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLS.I16 d0, d0, d0[0]
+_NEON2SSESTORAGE uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLS.I32 d0, d0, d0[0]
+_NEON2SSESTORAGE float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLS.F32 d0, d0, d0[0]
+_NEON2SSESTORAGE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0]
+_NEON2SSESTORAGE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0]
+_NEON2SSESTORAGE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0]
+_NEON2SSESTORAGE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0]
+_NEON2SSESTORAGE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0]
+//Vector widening multiply subtract with scalar
+_NEON2SSESTORAGE int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLSL.S16 q0, d0, d0[0]
+_NEON2SSESTORAGE int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLSL.S32 q0, d0, d0[0]
+_NEON2SSESTORAGE uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLSL.U16 q0, d0, d0[0]
+_NEON2SSESTORAGE uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLSL.U32 q0, d0, d0[0]
+//Vector widening saturating doubling multiply subtract with scalar
+_NEON2SSESTORAGE int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLSL.S16 q0, d0, d0[0]
+_NEON2SSESTORAGE int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLSL.S32 q0, d0, d0[0]
+//Vector extract
+_NEON2SSESTORAGE int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
+_NEON2SSE_GLOBAL uint8x8_t vext_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
+_NEON2SSE_GLOBAL poly8x8_t vext_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
+_NEON2SSESTORAGE int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
+_NEON2SSE_GLOBAL uint16x4_t vext_u16(uint16x4_t a, uint16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
+_NEON2SSE_GLOBAL poly16x4_t vext_p16(poly16x4_t a, poly16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
+_NEON2SSESTORAGE int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
+_NEON2SSE_GLOBAL uint32x2_t vext_u32(uint32x2_t a, uint32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
+_NEON2SSE_GLOBAL int64x1_t vext_s64(int64x1_t a, int64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
+_NEON2SSE_GLOBAL uint64x1_t vext_u64(uint64x1_t a, uint64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
+_NEON2SSESTORAGE float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
+_NEON2SSE_GLOBAL int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
+_NEON2SSE_GLOBAL uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
+_NEON2SSE_GLOBAL poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
+_NEON2SSE_GLOBAL int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
+_NEON2SSE_GLOBAL uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
+_NEON2SSE_GLOBAL poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
+_NEON2SSE_GLOBAL int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
+_NEON2SSE_GLOBAL uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
+_NEON2SSE_GLOBAL int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
+_NEON2SSE_GLOBAL uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
+_NEON2SSE_GLOBAL float32x4_t vextq_f32(float32x4_t a, float32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
+//Reverse vector elements (swap endianness). VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide.
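+// For example, vrev64_s16 applied to 16-bit lanes {a,b,c,d} within each 64-bit doubleword yields {d,c,b,a}.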
+_NEON2SSESTORAGE int8x8_t vrev64_s8(int8x8_t vec); // VREV64.8 d0,d0
+_NEON2SSESTORAGE int16x4_t vrev64_s16(int16x4_t vec); // VREV64.16 d0,d0
+_NEON2SSESTORAGE int32x2_t vrev64_s32(int32x2_t vec); // VREV64.32 d0,d0
+_NEON2SSE_GLOBAL uint8x8_t vrev64_u8(uint8x8_t vec); // VREV64.8 d0,d0
+_NEON2SSE_GLOBAL uint16x4_t vrev64_u16(uint16x4_t vec); // VREV64.16 d0,d0
+_NEON2SSE_GLOBAL uint32x2_t vrev64_u32(uint32x2_t vec); // VREV64.32 d0,d0
+_NEON2SSE_GLOBAL poly8x8_t vrev64_p8(poly8x8_t vec); // VREV64.8 d0,d0
+_NEON2SSE_GLOBAL poly16x4_t vrev64_p16(poly16x4_t vec); // VREV64.16 d0,d0
+_NEON2SSESTORAGE float32x2_t vrev64_f32(float32x2_t vec); // VREV64.32 d0,d0
+_NEON2SSESTORAGE int8x16_t vrev64q_s8(int8x16_t vec); // VREV64.8 q0,q0
+_NEON2SSESTORAGE int16x8_t vrev64q_s16(int16x8_t vec); // VREV64.16 q0,q0
+_NEON2SSESTORAGE int32x4_t vrev64q_s32(int32x4_t vec); // VREV64.32 q0,q0
+_NEON2SSE_GLOBAL uint8x16_t vrev64q_u8(uint8x16_t vec); // VREV64.8 q0,q0
+_NEON2SSE_GLOBAL uint16x8_t vrev64q_u16(uint16x8_t vec); // VREV64.16 q0,q0
+_NEON2SSE_GLOBAL uint32x4_t vrev64q_u32(uint32x4_t vec); // VREV64.32 q0,q0
+_NEON2SSE_GLOBAL poly8x16_t vrev64q_p8(poly8x16_t vec); // VREV64.8 q0,q0
+_NEON2SSE_GLOBAL poly16x8_t vrev64q_p16(poly16x8_t vec); // VREV64.16 q0,q0
+_NEON2SSE_GLOBAL float32x4_t vrev64q_f32(float32x4_t vec); // VREV64.32 q0,q0
+_NEON2SSESTORAGE int8x8_t vrev32_s8(int8x8_t vec); // VREV32.8 d0,d0
+_NEON2SSESTORAGE int16x4_t vrev32_s16(int16x4_t vec); // VREV32.16 d0,d0
+_NEON2SSE_GLOBAL uint8x8_t vrev32_u8(uint8x8_t vec); // VREV32.8 d0,d0
+_NEON2SSE_GLOBAL uint16x4_t vrev32_u16(uint16x4_t vec); // VREV32.16 d0,d0
+_NEON2SSE_GLOBAL poly8x8_t vrev32_p8(poly8x8_t vec); // VREV32.8 d0,d0
+_NEON2SSE_GLOBAL poly16x4_t vrev32_p16(poly16x4_t vec); // VREV32.16 d0,d0
+_NEON2SSESTORAGE int8x16_t vrev32q_s8(int8x16_t vec); // VREV32.8 q0,q0
+_NEON2SSESTORAGE int16x8_t vrev32q_s16(int16x8_t vec); // VREV32.16 q0,q0
+_NEON2SSE_GLOBAL uint8x16_t vrev32q_u8(uint8x16_t vec); // VREV32.8 q0,q0
+_NEON2SSE_GLOBAL uint16x8_t vrev32q_u16(uint16x8_t vec); // VREV32.16 q0,q0
+_NEON2SSE_GLOBAL poly8x16_t vrev32q_p8(poly8x16_t vec); // VREV32.8 q0,q0
+_NEON2SSE_GLOBAL poly16x8_t vrev32q_p16(poly16x8_t vec); // VREV32.16 q0,q0
+_NEON2SSESTORAGE int8x8_t vrev16_s8(int8x8_t vec); // VREV16.8 d0,d0
+_NEON2SSE_GLOBAL uint8x8_t vrev16_u8(uint8x8_t vec); // VREV16.8 d0,d0
+_NEON2SSE_GLOBAL poly8x8_t vrev16_p8(poly8x8_t vec); // VREV16.8 d0,d0
+_NEON2SSESTORAGE int8x16_t vrev16q_s8(int8x16_t vec); // VREV16.8 q0,q0
+_NEON2SSE_GLOBAL uint8x16_t vrev16q_u8(uint8x16_t vec); // VREV16.8 q0,q0
+_NEON2SSE_GLOBAL poly8x16_t vrev16q_p8(poly8x16_t vec); // VREV16.8 q0,q0
+//Other single operand arithmetic
+//Absolute: Vd[i] = |Va[i]|
+_NEON2SSESTORAGE int8x8_t vabs_s8(int8x8_t a); // VABS.S8 d0,d0
+_NEON2SSESTORAGE int16x4_t vabs_s16(int16x4_t a); // VABS.S16 d0,d0
+_NEON2SSESTORAGE int32x2_t vabs_s32(int32x2_t a); // VABS.S32 d0,d0
+_NEON2SSESTORAGE float32x2_t vabs_f32(float32x2_t a); // VABS.F32 d0,d0
+_NEON2SSE_GLOBAL int8x16_t vabsq_s8(int8x16_t a); // VABS.S8 q0,q0
+_NEON2SSE_GLOBAL int16x8_t vabsq_s16(int16x8_t a); // VABS.S16 q0,q0
+_NEON2SSE_GLOBAL int32x4_t vabsq_s32(int32x4_t a); // VABS.S32 q0,q0
+_NEON2SSESTORAGE float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0
+
+#ifdef _NEON2SSE_64BIT
+_NEON2SSESTORAGE int64x2_t vabsq_s64(int64x2_t a); // VABS.S64 q0,q0
+_NEON2SSESTORAGE float64x2_t vabsq_f64(float64x2_t a); // VABS.F64 q0,q0
+#endif
+
+//Saturating absolute: Vd[i] = sat(|Va[i]|)
+_NEON2SSESTORAGE int8x8_t vqabs_s8(int8x8_t a); // VQABS.S8 d0,d0
+_NEON2SSESTORAGE int16x4_t vqabs_s16(int16x4_t a); // VQABS.S16 d0,d0
+_NEON2SSESTORAGE int32x2_t vqabs_s32(int32x2_t a); // VQABS.S32 d0,d0
+_NEON2SSESTORAGE int8x16_t vqabsq_s8(int8x16_t a); // VQABS.S8 q0,q0
+_NEON2SSESTORAGE int16x8_t vqabsq_s16(int16x8_t a); // VQABS.S16 q0,q0
+_NEON2SSESTORAGE int32x4_t vqabsq_s32(int32x4_t a); // VQABS.S32 q0,q0
+//Negate: Vd[i] = - Va[i]
+_NEON2SSESTORAGE int8x8_t vneg_s8(int8x8_t a); // VNEG.S8 d0,d0
+_NEON2SSESTORAGE int16x4_t vneg_s16(int16x4_t a); // VNEG.S16 d0,d0
+_NEON2SSESTORAGE int32x2_t vneg_s32(int32x2_t a); // VNEG.S32 d0,d0
+_NEON2SSESTORAGE float32x2_t vneg_f32(float32x2_t a); // VNEG.F32 d0,d0
+_NEON2SSESTORAGE int8x16_t vnegq_s8(int8x16_t a); // VNEG.S8 q0,q0
+_NEON2SSESTORAGE int16x8_t vnegq_s16(int16x8_t a); // VNEG.S16 q0,q0
+_NEON2SSESTORAGE int32x4_t vnegq_s32(int32x4_t a); // VNEG.S32 q0,q0
+_NEON2SSESTORAGE float32x4_t vnegq_f32(float32x4_t a); // VNEG.F32 q0,q0
+//Saturating Negate: sat(Vd[i] = - Va[i])
+_NEON2SSESTORAGE int8x8_t vqneg_s8(int8x8_t a); // VQNEG.S8 d0,d0
+_NEON2SSESTORAGE int16x4_t vqneg_s16(int16x4_t a); // VQNEG.S16 d0,d0
+_NEON2SSESTORAGE int32x2_t vqneg_s32(int32x2_t a); // VQNEG.S32 d0,d0
+_NEON2SSESTORAGE int8x16_t vqnegq_s8(int8x16_t a); // VQNEG.S8 q0,q0
+_NEON2SSESTORAGE int16x8_t vqnegq_s16(int16x8_t a); // VQNEG.S16 q0,q0
+_NEON2SSESTORAGE int32x4_t vqnegq_s32(int32x4_t a); // VQNEG.S32 q0,q0
+//Count leading sign bits
+_NEON2SSESTORAGE int8x8_t vcls_s8(int8x8_t a); // VCLS.S8 d0,d0
+_NEON2SSESTORAGE int16x4_t vcls_s16(int16x4_t a); // VCLS.S16 d0,d0
+_NEON2SSESTORAGE int32x2_t vcls_s32(int32x2_t a); // VCLS.S32 d0,d0
+_NEON2SSESTORAGE int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0
+_NEON2SSESTORAGE int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0
+_NEON2SSESTORAGE int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0
+//Count leading zeros
+_NEON2SSESTORAGE int8x8_t vclz_s8(int8x8_t a); // VCLZ.I8 d0,d0
+_NEON2SSESTORAGE int16x4_t vclz_s16(int16x4_t a); // VCLZ.I16 d0,d0
+_NEON2SSESTORAGE int32x2_t vclz_s32(int32x2_t a); // VCLZ.I32 d0,d0
+_NEON2SSE_GLOBAL uint8x8_t vclz_u8(uint8x8_t a); // VCLZ.I8 d0,d0
+_NEON2SSE_GLOBAL uint16x4_t vclz_u16(uint16x4_t a); // VCLZ.I16 d0,d0
+_NEON2SSE_GLOBAL uint32x2_t vclz_u32(uint32x2_t a); // VCLZ.I32 d0,d0
+_NEON2SSESTORAGE int8x16_t vclzq_s8(int8x16_t a); // VCLZ.I8 q0,q0
+_NEON2SSESTORAGE int16x8_t vclzq_s16(int16x8_t a); // VCLZ.I16 q0,q0
+_NEON2SSESTORAGE int32x4_t vclzq_s32(int32x4_t a); // VCLZ.I32 q0,q0
+_NEON2SSE_GLOBAL uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0
+_NEON2SSE_GLOBAL uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0
+_NEON2SSE_GLOBAL uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0
+//Count number of set bits
+_NEON2SSESTORAGE uint8x8_t vcnt_u8(uint8x8_t a); // VCNT.8 d0,d0
+_NEON2SSE_GLOBAL int8x8_t vcnt_s8(int8x8_t a); // VCNT.8 d0,d0
+_NEON2SSE_GLOBAL poly8x8_t vcnt_p8(poly8x8_t a); // VCNT.8 d0,d0
+_NEON2SSESTORAGE uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0
+_NEON2SSE_GLOBAL int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0
+_NEON2SSE_GLOBAL poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0
+//Reciprocal estimate
+_NEON2SSESTORAGE float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0
+_NEON2SSESTORAGE uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0
+_NEON2SSE_GLOBAL float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0
+_NEON2SSESTORAGE uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0
+//Reciprocal square root estimate
+_NEON2SSESTORAGE float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0
+_NEON2SSESTORAGE uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0
+_NEON2SSE_GLOBAL float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0
+_NEON2SSESTORAGE uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0
+//Logical operations
+//Bitwise not
+_NEON2SSESTORAGE int8x8_t vmvn_s8(int8x8_t a); // VMVN d0,d0
+_NEON2SSESTORAGE int16x4_t vmvn_s16(int16x4_t a); // VMVN d0,d0
+_NEON2SSESTORAGE int32x2_t vmvn_s32(int32x2_t a); // VMVN d0,d0
+_NEON2SSE_GLOBAL uint8x8_t vmvn_u8(uint8x8_t a); // VMVN d0,d0
+_NEON2SSE_GLOBAL uint16x4_t vmvn_u16(uint16x4_t a); // VMVN d0,d0
+_NEON2SSE_GLOBAL uint32x2_t vmvn_u32(uint32x2_t a); // VMVN d0,d0
+_NEON2SSE_GLOBAL poly8x8_t vmvn_p8(poly8x8_t a); // VMVN d0,d0
+_NEON2SSESTORAGE int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0
+_NEON2SSESTORAGE int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0
+_NEON2SSESTORAGE int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0
+_NEON2SSE_GLOBAL uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0
+_NEON2SSE_GLOBAL uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0
+_NEON2SSE_GLOBAL uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0
+_NEON2SSE_GLOBAL poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0
+//Bitwise and
+_NEON2SSESTORAGE int8x8_t vand_s8(int8x8_t a, int8x8_t b); // VAND d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vand_s16(int16x4_t a, int16x4_t b); // VAND d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vand_s32(int32x2_t a, int32x2_t b); // VAND d0,d0,d0
+_NEON2SSESTORAGE int64x1_t vand_s64(int64x1_t a, int64x1_t b); // VAND d0,d0,d0
+_NEON2SSE_GLOBAL uint8x8_t vand_u8(uint8x8_t a, uint8x8_t b); // VAND d0,d0,d0
+_NEON2SSE_GLOBAL uint16x4_t vand_u16(uint16x4_t a, uint16x4_t b); // VAND d0,d0,d0
+_NEON2SSE_GLOBAL uint32x2_t vand_u32(uint32x2_t a, uint32x2_t b); // VAND d0,d0,d0
+_NEON2SSE_GLOBAL uint64x1_t vand_u64(uint64x1_t a, uint64x1_t b); // VAND d0,d0,d0
+_NEON2SSE_GLOBAL int8x16_t vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0
+_NEON2SSE_GLOBAL int16x8_t vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0
+_NEON2SSE_GLOBAL int32x4_t vandq_s32(int32x4_t a, int32x4_t b); // VAND q0,q0,q0
+_NEON2SSE_GLOBAL int64x2_t vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0
+_NEON2SSE_GLOBAL uint8x16_t vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0
+_NEON2SSE_GLOBAL uint16x8_t vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0
+_NEON2SSE_GLOBAL uint32x4_t vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0
+_NEON2SSE_GLOBAL uint64x2_t vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0
+//Bitwise or
+_NEON2SSESTORAGE int8x8_t vorr_s8(int8x8_t a, int8x8_t b); // VORR d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vorr_s16(int16x4_t a, int16x4_t b); // VORR d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vorr_s32(int32x2_t a, int32x2_t b); // VORR d0,d0,d0
+_NEON2SSESTORAGE int64x1_t vorr_s64(int64x1_t a, int64x1_t b); // VORR d0,d0,d0
+_NEON2SSE_GLOBAL uint8x8_t vorr_u8(uint8x8_t a, uint8x8_t b); // VORR d0,d0,d0
+_NEON2SSE_GLOBAL uint16x4_t vorr_u16(uint16x4_t a, uint16x4_t b); // VORR d0,d0,d0
+_NEON2SSE_GLOBAL uint32x2_t vorr_u32(uint32x2_t a, uint32x2_t b); // VORR d0,d0,d0
+_NEON2SSE_GLOBAL uint64x1_t vorr_u64(uint64x1_t a, uint64x1_t b); // VORR d0,d0,d0
+_NEON2SSE_GLOBAL int8x16_t vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0
+_NEON2SSE_GLOBAL int16x8_t vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0
+_NEON2SSE_GLOBAL int32x4_t vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0
+_NEON2SSE_GLOBAL int64x2_t vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0
+_NEON2SSE_GLOBAL uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0
+_NEON2SSE_GLOBAL uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0
+_NEON2SSE_GLOBAL uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0
+_NEON2SSE_GLOBAL uint64x2_t vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0
+//Bitwise exclusive or (EOR or XOR)
+_NEON2SSESTORAGE int8x8_t veor_s8(int8x8_t a, int8x8_t b); // VEOR d0,d0,d0
+_NEON2SSE_GLOBAL int16x4_t veor_s16(int16x4_t a, int16x4_t b); // VEOR d0,d0,d0
+_NEON2SSE_GLOBAL int32x2_t veor_s32(int32x2_t a, int32x2_t b); // VEOR d0,d0,d0
+_NEON2SSESTORAGE int64x1_t veor_s64(int64x1_t a, int64x1_t b); // VEOR d0,d0,d0
+_NEON2SSE_GLOBAL uint8x8_t veor_u8(uint8x8_t a, uint8x8_t b); // VEOR d0,d0,d0
+_NEON2SSE_GLOBAL uint16x4_t veor_u16(uint16x4_t a, uint16x4_t b); // VEOR d0,d0,d0
+_NEON2SSE_GLOBAL uint32x2_t veor_u32(uint32x2_t a, uint32x2_t b); // VEOR d0,d0,d0
+_NEON2SSE_GLOBAL uint64x1_t veor_u64(uint64x1_t a, uint64x1_t b); // VEOR d0,d0,d0
+_NEON2SSE_GLOBAL int8x16_t veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0
+_NEON2SSE_GLOBAL int16x8_t veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0
+_NEON2SSE_GLOBAL int32x4_t veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0
+_NEON2SSE_GLOBAL int64x2_t veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0
+_NEON2SSE_GLOBAL uint8x16_t veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0
+_NEON2SSE_GLOBAL uint16x8_t veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0
+_NEON2SSE_GLOBAL uint32x4_t veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0
+_NEON2SSE_GLOBAL uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0
+//Bit Clear
+_NEON2SSESTORAGE int8x8_t vbic_s8(int8x8_t a, int8x8_t b); // VBIC d0,d0,d0
+_NEON2SSE_GLOBAL int16x4_t vbic_s16(int16x4_t a, int16x4_t b); // VBIC d0,d0,d0
+_NEON2SSE_GLOBAL int32x2_t vbic_s32(int32x2_t a, int32x2_t b); // VBIC d0,d0,d0
+_NEON2SSESTORAGE int64x1_t vbic_s64(int64x1_t a, int64x1_t b); // VBIC d0,d0,d0
+_NEON2SSE_GLOBAL uint8x8_t vbic_u8(uint8x8_t a, uint8x8_t b); // VBIC d0,d0,d0
+_NEON2SSE_GLOBAL uint16x4_t vbic_u16(uint16x4_t a, uint16x4_t b); // VBIC d0,d0,d0
+_NEON2SSE_GLOBAL uint32x2_t vbic_u32(uint32x2_t a, uint32x2_t b); // VBIC d0,d0,d0
+_NEON2SSE_GLOBAL uint64x1_t vbic_u64(uint64x1_t a, uint64x1_t b); // VBIC d0,d0,d0
+_NEON2SSE_GLOBAL int8x16_t vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0
+_NEON2SSE_GLOBAL int16x8_t vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0
+_NEON2SSE_GLOBAL int32x4_t vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0
+_NEON2SSE_GLOBAL int64x2_t vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0
+_NEON2SSE_GLOBAL uint8x16_t vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0
+_NEON2SSE_GLOBAL uint16x8_t vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0
+_NEON2SSE_GLOBAL uint32x4_t vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0
+_NEON2SSE_GLOBAL uint64x2_t vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0
+//Bitwise OR complement
+_NEON2SSESTORAGE int8x8_t vorn_s8(int8x8_t a, int8x8_t b); // VORN d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vorn_s16(int16x4_t a, int16x4_t b); // VORN d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vorn_s32(int32x2_t a, int32x2_t b); // VORN d0,d0,d0
+_NEON2SSESTORAGE int64x1_t vorn_s64(int64x1_t a, int64x1_t b); // VORN d0,d0,d0
+_NEON2SSE_GLOBAL uint8x8_t vorn_u8(uint8x8_t a, uint8x8_t b); // VORN d0,d0,d0
+_NEON2SSE_GLOBAL uint16x4_t vorn_u16(uint16x4_t a, uint16x4_t b); // VORN d0,d0,d0
+_NEON2SSE_GLOBAL uint32x2_t vorn_u32(uint32x2_t a, uint32x2_t b); // VORN d0,d0,d0
+_NEON2SSE_GLOBAL uint64x1_t vorn_u64(uint64x1_t a, uint64x1_t b); // VORN d0,d0,d0
+_NEON2SSESTORAGE int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0
+_NEON2SSESTORAGE int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0
+_NEON2SSESTORAGE int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0
+_NEON2SSESTORAGE int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0
+_NEON2SSESTORAGE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0
+_NEON2SSESTORAGE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0
+_NEON2SSESTORAGE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0
+_NEON2SSE_GLOBAL uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0
+//Bitwise Select
+_NEON2SSESTORAGE int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c); // VBSL d0,d0,d0
+_NEON2SSE_GLOBAL int16x4_t vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c); // VBSL d0,d0,d0
+_NEON2SSE_GLOBAL int32x2_t vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c); // VBSL d0,d0,d0
+_NEON2SSESTORAGE int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c); // VBSL d0,d0,d0
+_NEON2SSE_GLOBAL uint8x8_t vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VBSL d0,d0,d0
+_NEON2SSE_GLOBAL uint16x4_t vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VBSL d0,d0,d0
+_NEON2SSE_GLOBAL uint32x2_t vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VBSL d0,d0,d0
+_NEON2SSE_GLOBAL uint64x1_t vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c); // VBSL d0,d0,d0
+_NEON2SSESTORAGE float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c); // VBSL d0,d0,d0
+_NEON2SSE_GLOBAL poly8x8_t vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c); // VBSL d0,d0,d0
+_NEON2SSE_GLOBAL poly16x4_t vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c); // VBSL d0,d0,d0
+_NEON2SSESTORAGE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0
+_NEON2SSE_GLOBAL int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0
+_NEON2SSE_GLOBAL int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0
+_NEON2SSE_GLOBAL int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0
+_NEON2SSE_GLOBAL uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0
+_NEON2SSE_GLOBAL uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0
+_NEON2SSE_GLOBAL uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0
+_NEON2SSE_GLOBAL uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0
+_NEON2SSESTORAGE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0
+_NEON2SSE_GLOBAL poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0
+_NEON2SSE_GLOBAL poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0
+//Transposition operations
+//Transpose elements
+_NEON2SSESTORAGE int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b); // VTRN.8 d0,d0
+_NEON2SSESTORAGE int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b); // VTRN.16 d0,d0
+_NEON2SSESTORAGE int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b); // VTRN.32 d0,d0
+_NEON2SSE_GLOBAL uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b); // VTRN.8 d0,d0
+_NEON2SSE_GLOBAL uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b); // VTRN.16 d0,d0
+_NEON2SSE_GLOBAL uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b); // VTRN.32 d0,d0
+_NEON2SSESTORAGE float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b); // VTRN.32 d0,d0
+_NEON2SSE_GLOBAL poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b); // VTRN.8 d0,d0
+_NEON2SSE_GLOBAL poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b); // VTRN.16 d0,d0
+_NEON2SSESTORAGE int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0
+_NEON2SSESTORAGE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0
+_NEON2SSESTORAGE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0
+_NEON2SSE_GLOBAL uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0
+_NEON2SSE_GLOBAL uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0
+_NEON2SSE_GLOBAL uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0
+_NEON2SSESTORAGE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0
+_NEON2SSE_GLOBAL poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0
+_NEON2SSE_GLOBAL poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0
+//Interleave elements
+_NEON2SSESTORAGE int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b); // VZIP.8 d0,d0
+_NEON2SSESTORAGE int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b); // VZIP.16 d0,d0
+_NEON2SSE_GLOBAL int32x2x2_t vzip_s32(int32x2_t a, int32x2_t b); // VZIP.32 d0,d0
+_NEON2SSE_GLOBAL uint8x8x2_t vzip_u8(uint8x8_t a, uint8x8_t b); // VZIP.8 d0,d0
+_NEON2SSE_GLOBAL uint16x4x2_t vzip_u16(uint16x4_t a, uint16x4_t b); // VZIP.16 d0,d0
+_NEON2SSE_GLOBAL uint32x2x2_t vzip_u32(uint32x2_t a, uint32x2_t b); // VZIP.32 d0,d0
+_NEON2SSE_GLOBAL float32x2x2_t vzip_f32(float32x2_t a, float32x2_t b); // VZIP.32 d0,d0
+_NEON2SSE_GLOBAL poly8x8x2_t vzip_p8(poly8x8_t a, poly8x8_t b); // VZIP.8 d0,d0
+_NEON2SSE_GLOBAL poly16x4x2_t vzip_p16(poly16x4_t a, poly16x4_t b); // VZIP.16 d0,d0
+_NEON2SSESTORAGE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0
+_NEON2SSESTORAGE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0
+_NEON2SSESTORAGE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0
+_NEON2SSE_GLOBAL uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0
+_NEON2SSE_GLOBAL uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0
+_NEON2SSE_GLOBAL uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0
+_NEON2SSESTORAGE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0
+_NEON2SSE_GLOBAL poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0
+_NEON2SSE_GLOBAL poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0
+//De-Interleave elements
+_NEON2SSESTORAGE int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b); // VUZP.8 d0,d0
+_NEON2SSESTORAGE int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b); // VUZP.16 d0,d0
+_NEON2SSESTORAGE int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b); // VUZP.32 d0,d0
+_NEON2SSE_GLOBAL uint8x8x2_t vuzp_u8(uint8x8_t a, uint8x8_t b); // VUZP.8 d0,d0
+_NEON2SSE_GLOBAL uint16x4x2_t vuzp_u16(uint16x4_t a, uint16x4_t b); // VUZP.16 d0,d0
+_NEON2SSE_GLOBAL uint32x2x2_t vuzp_u32(uint32x2_t a, uint32x2_t b); // VUZP.32 d0,d0
+_NEON2SSE_GLOBAL float32x2x2_t vuzp_f32(float32x2_t a, float32x2_t b); // VUZP.32 d0,d0
+_NEON2SSE_GLOBAL poly8x8x2_t vuzp_p8(poly8x8_t a, poly8x8_t b); // VUZP.8 d0,d0
+_NEON2SSE_GLOBAL poly16x4x2_t vuzp_p16(poly16x4_t a, poly16x4_t b); // VUZP.16 d0,d0
+_NEON2SSESTORAGE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0
+_NEON2SSESTORAGE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0
+_NEON2SSESTORAGE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0
+_NEON2SSE_GLOBAL uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0
+_NEON2SSE_GLOBAL uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0
+_NEON2SSE_GLOBAL uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0
+_NEON2SSESTORAGE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0
+_NEON2SSE_GLOBAL poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0
+_NEON2SSE_GLOBAL poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
+
+_NEON2SSESTORAGE float32x4_t vrndnq_f32(float32x4_t a); // VRND.F32 q0,q0
+_NEON2SSESTORAGE float64x2_t vrndnq_f64(float64x2_t a); // VRND.F64 q0,q0
+
+//Sqrt
+_NEON2SSE_GLOBAL float32x4_t vsqrtq_f32(float32x4_t a); // VSQRT.F32 q0,q0
+_NEON2SSE_GLOBAL float64x2_t vsqrtq_f64(float64x2_t a); // VSQRT.F64 q0,q0
+
+
+//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+// The following macros solve the "immediate parameter" requirement of some x86 intrinsics:
+// without them the code would fail to compile with the "Intrinsic parameter must be an immediate value" error.
+//
+#if  ( defined (__INTEL_COMPILER)  && !defined(__llvm__) )
+#   define _MM_ALIGNR_EPI8 _mm_alignr_epi8
+#   define _MM_EXTRACT_EPI16  (int16_t) _mm_extract_epi16
+#   define _MM_INSERT_EPI16 _mm_insert_epi16
+#   ifdef USE_SSE4
+#       define _MM_EXTRACT_EPI8  _mm_extract_epi8
+#       define _MM_EXTRACT_EPI32  _mm_extract_epi32
+#       define _MM_EXTRACT_PS  _mm_extract_ps
+#       define _MM_INSERT_EPI8  _mm_insert_epi8
+#       define _MM_INSERT_EPI32 _mm_insert_epi32
+#       define _MM_INSERT_PS    _mm_insert_ps
+#       ifdef  _NEON2SSE_64BIT
+#           define _MM_INSERT_EPI64 _mm_insert_epi64
+#           define _MM_EXTRACT_EPI64 _mm_extract_epi64
+#       endif
+#   endif //SSE4
+#else
+#   define _NEON2SSE_COMMA ,
+#   define _NEON2SSE_SWITCH16(NAME, a, b, LANE) \
+        switch(LANE)         \
+        {                \
+        case 0:     return NAME(a b, 0); \
+        case 1:     return NAME(a b, 1); \
+        case 2:     return NAME(a b, 2); \
+        case 3:     return NAME(a b, 3); \
+        case 4:     return NAME(a b, 4); \
+        case 5:     return NAME(a b, 5); \
+        case 6:     return NAME(a b, 6); \
+        case 7:     return NAME(a b, 7); \
+        case 8:     return NAME(a b, 8); \
+        case 9:     return NAME(a b, 9); \
+        case 10:    return NAME(a b, 10); \
+        case 11:    return NAME(a b, 11); \
+        case 12:    return NAME(a b, 12); \
+        case 13:    return NAME(a b, 13); \
+        case 14:    return NAME(a b, 14); \
+        case 15:    return NAME(a b, 15); \
+        default:    return NAME(a b, 0); \
+        }
+
+#   define _NEON2SSE_SWITCH8(NAME, vec, LANE, p) \
+        switch(LANE)              \
+        {                          \
+        case 0:  return NAME(vec p,0); \
+        case 1:  return NAME(vec p,1); \
+        case 2:  return NAME(vec p,2); \
+        case 3:  return NAME(vec p,3); \
+        case 4:  return NAME(vec p,4); \
+        case 5:  return NAME(vec p,5); \
+        case 6:  return NAME(vec p,6); \
+        case 7:  return NAME(vec p,7); \
+        default: return NAME(vec p,0); \
+        }
+
+#   define _NEON2SSE_SWITCH4(NAME, case0, case1, case2, case3, vec, LANE, p) \
+        switch(LANE)              \
+        {                          \
+        case case0:  return NAME(vec p,case0); \
+        case case1:  return NAME(vec p,case1); \
+        case case2:  return NAME(vec p,case2); \
+        case case3:  return NAME(vec p,case3); \
+        default:     return NAME(vec p,case0); \
+        }
+
+    _NEON2SSE_INLINE __m128i _MM_ALIGNR_EPI8(__m128i a, __m128i b, int LANE)
+    {
+        _NEON2SSE_SWITCH16(_mm_alignr_epi8, a, _NEON2SSE_COMMA b, LANE)
+    }
+
+    _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI16(__m128i vec, int p, const int LANE)
+    {
+        _NEON2SSE_SWITCH8(_mm_insert_epi16, vec, LANE, _NEON2SSE_COMMA p)
+    }
+
+    _NEON2SSE_INLINE int16_t _MM_EXTRACT_EPI16(__m128i vec, const int LANE)
+    {
+        _NEON2SSE_SWITCH8((int16_t)_mm_extract_epi16, vec, LANE,)
+    }
+
+#ifdef USE_SSE4
+        _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE)
+        {
+            _NEON2SSE_SWITCH4(_mm_extract_epi32, 0,1,2,3, vec, LANE,)
+        }
+
+        _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE)
+        {
+            _NEON2SSE_SWITCH4(_mm_extract_ps, 0,1,2,3, vec, LANE,)
+        }
+
+        _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE)
+        {
+            _NEON2SSE_SWITCH16(_mm_extract_epi8, vec, , LANE)
+        }
+
+        _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI32(__m128i vec, int p, const int LANE)
+        {
+            _NEON2SSE_SWITCH4(_mm_insert_epi32, 0, 1, 2, 3, vec, LANE, _NEON2SSE_COMMA p)
+        }
+
+        _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI8(__m128i vec, int p, const int LANE)
+        {
+            _NEON2SSE_SWITCH16(_mm_insert_epi8, vec, _NEON2SSE_COMMA p, LANE)
+        }
+
+#ifdef  _NEON2SSE_64BIT
+            //the special case of functions available only for SSE4 and 64-bit builds.
+            _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI64(__m128i vec, int64_t p, const int LANE)
+            {
+                switch(LANE) {
+                case 0:
+                    return _mm_insert_epi64(vec,  p, 0);
+                case 1:
+                    return _mm_insert_epi64(vec,  p, 1);
+                default:
+                    return _mm_insert_epi64(vec,  p, 0);
+                }
+            }
+
+            _NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64(__m128i val, const int LANE)
+            {
+                if (LANE ==0) return _mm_extract_epi64(val, 0);
+                else return _mm_extract_epi64(val, 1);
+            }
+#endif
+
+        _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE)
+        {
+            _NEON2SSE_SWITCH4(_mm_insert_ps, 0, 16, 32, 48, vec, LANE, _NEON2SSE_COMMA p)
+        }
+
+#endif //USE_SSE4
+
+#endif     //defined(__INTEL_COMPILER) && !defined(__llvm__)
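+// Usage sketch (illustrative): with the switch-based wrappers above, the lane can be a runtime value, e.g.
+//     int16_t lane_val(__m128i v, int lane) { return _MM_EXTRACT_EPI16(v, lane); }
+// whereas calling _mm_extract_epi16(v, lane) directly requires 'lane' to be a compile-time constant.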
+
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// Below are some helper functions used either to "emulate" SSE4 intrinsics on SSSE3-limited devices
+// or to implement some commonly used operations that SSE lacks
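+// Each _MM_* name below maps to the native SSE4 intrinsic when USE_SSE4 is defined, and to an inline fallback otherwise.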
+#ifdef USE_SSE4
+#   define _MM_CVTEPU8_EPI16  _mm_cvtepu8_epi16
+#   define _MM_CVTEPU16_EPI32 _mm_cvtepu16_epi32
+#   define _MM_CVTEPU32_EPI64  _mm_cvtepu32_epi64
+
+#   define _MM_CVTEPI8_EPI16  _mm_cvtepi8_epi16
+#   define _MM_CVTEPI16_EPI32 _mm_cvtepi16_epi32
+#   define _MM_CVTEPI32_EPI64  _mm_cvtepi32_epi64
+
+#   define _MM_MAX_EPI8  _mm_max_epi8
+#   define _MM_MAX_EPI32 _mm_max_epi32
+#   define _MM_MAX_EPU16 _mm_max_epu16
+#   define _MM_MAX_EPU32 _mm_max_epu32
+
+#   define _MM_MIN_EPI8  _mm_min_epi8
+#   define _MM_MIN_EPI32 _mm_min_epi32
+#   define _MM_MIN_EPU16 _mm_min_epu16
+#   define _MM_MIN_EPU32 _mm_min_epu32
+
+#   define _MM_BLENDV_EPI8 _mm_blendv_epi8
+#   define _MM_PACKUS_EPI32 _mm_packus_epi32
+#   define _MM_PACKUS1_EPI32(a) _mm_packus_epi32(a, a)
+
+#   define _MM_MULLO_EPI32 _mm_mullo_epi32
+#   define _MM_MUL_EPI32  _mm_mul_epi32
+
+#   define _MM_CMPEQ_EPI64 _mm_cmpeq_epi64
+#else     //no SSE4 !!!!!!
+    _NEON2SSE_INLINE __m128i _MM_CVTEPU8_EPI16(__m128i a)
+    {
+        __m128i zero = _mm_setzero_si128();
+        return _mm_unpacklo_epi8(a, zero);
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_CVTEPU16_EPI32(__m128i a)
+    {
+        __m128i zero = _mm_setzero_si128();
+        return _mm_unpacklo_epi16(a, zero);
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_CVTEPU32_EPI64(__m128i a)
+    {
+        __m128i zero = _mm_setzero_si128();
+        return _mm_unpacklo_epi32(a, zero);
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_CVTEPI8_EPI16(__m128i a)
+    {
+        __m128i zero = _mm_setzero_si128();
+        __m128i sign = _mm_cmpgt_epi8(zero, a);
+        return _mm_unpacklo_epi8(a, sign);
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_CVTEPI16_EPI32(__m128i a)
+    {
+        __m128i zero = _mm_setzero_si128();
+        __m128i sign = _mm_cmpgt_epi16(zero, a);
+        return _mm_unpacklo_epi16(a, sign);
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_CVTEPI32_EPI64(__m128i a)
+    {
+        __m128i zero = _mm_setzero_si128();
+        __m128i sign = _mm_cmpgt_epi32(zero, a);
+        return _mm_unpacklo_epi32(a, sign);
+    }
+
+    _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE)
+    {
+        _NEON2SSE_ALIGN_16 int32_t tmp[4];
+        _mm_store_si128((__m128i*)tmp, vec);
+        return tmp[LANE];
+    }
+
+    _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE)
+    {
+        _NEON2SSE_ALIGN_16 int8_t tmp[16];
+        _mm_store_si128((__m128i*)tmp, vec);
+        return (int)tmp[LANE];
+    }
+
+    _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE)
+    {
+        _NEON2SSE_ALIGN_16 int32_t tmp[4];
+        _mm_store_si128((__m128i*)tmp, _M128i(vec));
+        return tmp[LANE];
+    }
+
+    _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI32(__m128i vec, int p, const int LANE)
+    {
+        _NEON2SSE_ALIGN_16 int32_t pvec[4] = {0,0,0,0};
+        _NEON2SSE_ALIGN_16 uint32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff};
+        __m128i vec_masked, p_masked;
+        pvec[LANE] = p;
+        mask[LANE] = 0x0;
+        vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
+        p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
+        return _mm_or_si128(vec_masked, p_masked);
+    }
+
+    _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI8(__m128i vec, int p, const int LANE)
+    {
+        _NEON2SSE_ALIGN_16 int8_t pvec[16] = {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0};
+        _NEON2SSE_ALIGN_16 uint8_t mask[16] = {0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};
+        __m128i vec_masked, p_masked;
+        pvec[LANE] = (int8_t)p;
+        mask[LANE] = 0x0;
+        vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
+        p_masked = _mm_andnot_si128  (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
+        return _mm_or_si128(vec_masked, p_masked);
+    }
+
+    _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE)
+    {
+        _NEON2SSE_ALIGN_16 uint32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff};
+        __m128 tmp, vec_masked, p_masked;
+        mask[LANE >> 4] = 0x0; //LANE here is the insert_ps immediate (lane index << 4), not the actual lane, hence the shift
+        vec_masked = _mm_and_ps (*(__m128*)mask,vec); //ready for p
+        p_masked = _mm_andnot_ps (*(__m128*)mask, p); //ready for vec
+        tmp = _mm_or_ps(vec_masked, p_masked);
+        return tmp;
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_MAX_EPI8(__m128i a, __m128i b)
+    {
+        __m128i cmp, resa, resb;
+        cmp = _mm_cmpgt_epi8 (a, b);
+        resa = _mm_and_si128 (cmp, a);
+        resb = _mm_andnot_si128 (cmp,b);
+        return _mm_or_si128(resa, resb);
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_MAX_EPI32(__m128i a, __m128i b)
+    {
+        __m128i cmp, resa, resb;
+        cmp = _mm_cmpgt_epi32(a, b);
+        resa = _mm_and_si128 (cmp, a);
+        resb = _mm_andnot_si128 (cmp,b);
+        return _mm_or_si128(resa, resb);
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_MAX_EPU16(__m128i a, __m128i b)
+    {
+        __m128i c8000, b_s, a_s, cmp;
+        c8000 = _mm_cmpeq_epi16 (a,a); //0xffff
+        c8000 = _mm_slli_epi16 (c8000, 15); //0x8000
+        b_s = _mm_sub_epi16 (b, c8000);
+        a_s = _mm_sub_epi16 (a, c8000);
+        cmp = _mm_cmpgt_epi16 (a_s, b_s); //no unsigned comparison, need to go to signed
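+        // subtracting 0x8000 from both operands maps the unsigned range onto the signed range, so the signed compare above orders them correctly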
+        a_s = _mm_and_si128 (cmp,a);
+        b_s = _mm_andnot_si128 (cmp,b);
+        return _mm_or_si128(a_s, b_s);
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_MAX_EPU32(__m128i a, __m128i b)
+    {
+        __m128i c80000000, b_s, a_s, cmp;
+        c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff
+        c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000
+        b_s = _mm_sub_epi32 (b, c80000000);
+        a_s = _mm_sub_epi32 (a, c80000000);
+        cmp = _mm_cmpgt_epi32 (a_s, b_s); //no unsigned comparison, need to go to signed
+        a_s = _mm_and_si128 (cmp,a);
+        b_s = _mm_andnot_si128 (cmp,b);
+        return _mm_or_si128(a_s, b_s);
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_MIN_EPI8(__m128i a, __m128i b)
+    {
+        __m128i cmp, resa, resb;
+        cmp = _mm_cmpgt_epi8 (b, a);
+        resa = _mm_and_si128 (cmp, a);
+        resb = _mm_andnot_si128 (cmp,b);
+        return _mm_or_si128(resa, resb);
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_MIN_EPI32(__m128i a, __m128i b)
+    {
+        __m128i cmp, resa, resb;
+        cmp = _mm_cmpgt_epi32(b, a);
+        resa = _mm_and_si128 (cmp, a);
+        resb = _mm_andnot_si128 (cmp,b);
+        return _mm_or_si128(resa, resb);
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_MIN_EPU16(__m128i a, __m128i b)
+    {
+        __m128i c8000, b_s, a_s, cmp;
+        c8000 = _mm_cmpeq_epi16 (a,a); //0xffff
+        c8000 = _mm_slli_epi16 (c8000, 15); //0x8000
+        b_s = _mm_sub_epi16 (b, c8000);
+        a_s = _mm_sub_epi16 (a, c8000);
+        cmp = _mm_cmpgt_epi16 (b_s, a_s); //no unsigned comparison, need to go to signed
+        a_s = _mm_and_si128 (cmp,a);
+        b_s = _mm_andnot_si128 (cmp,b);
+        return _mm_or_si128(a_s, b_s);
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_MIN_EPU32(__m128i a, __m128i b)
+    {
+        __m128i c80000000, b_s, a_s, cmp;
+        c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff
+        c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000
+        b_s = _mm_sub_epi32 (b, c80000000);
+        a_s = _mm_sub_epi32 (a, c80000000);
+        cmp = _mm_cmpgt_epi32 (b_s, a_s); //no unsigned comparison, need to go to signed
+        a_s = _mm_and_si128 (cmp,a);
+        b_s = _mm_andnot_si128 (cmp,b);
+        return _mm_or_si128(a_s, b_s);
+    }
+
+    _NEON2SSE_INLINE __m128i  _MM_BLENDV_EPI8(__m128i a, __m128i b, __m128i mask) //this is NOT an exact implementation of _mm_blendv_epi8 - please see below
+    {
+        //it assumes each mask byte is always either 0xff or 0 (as in all use cases below), while for the original _mm_blendv_epi8 only the MSB of each mask byte matters.
+        __m128i a_masked, b_masked;
+        b_masked = _mm_and_si128 (mask,b); //use b if mask 0xff
+        a_masked = _mm_andnot_si128 (mask,a);
+        return _mm_or_si128(a_masked, b_masked);
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_PACKUS_EPI32(__m128i a, __m128i b)
+    {
+        __m128i a16, b16, res, reshi,cmp, zero;
+        zero = _mm_setzero_si128();
+        a16 = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd);
+        b16 = _mm_shuffle_epi8 (b, *(__m128i*) mask8_32_even_odd);
+        res = _mm_unpacklo_epi64(a16, b16); //result without saturation
+        reshi = _mm_unpackhi_epi64(a16, b16); //hi part of result used for saturation
+        cmp = _mm_cmpgt_epi16(zero, reshi); //if cmp<0 the result should be zero
+        res = _mm_andnot_si128(cmp,res); //if cmp zero - do nothing, otherwise cmp <0  and the result is 0
+        cmp = _mm_cmpgt_epi16(reshi,zero); //if cmp positive
+        return _mm_or_si128(res, cmp); //if cmp positive we are out of 16 bits, need to saturate to 0xffff
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_PACKUS1_EPI32(__m128i a)
+    {
+        __m128i a16, res, reshi,cmp, zero;
+        zero = _mm_setzero_si128();
+        a16 = _mm_shuffle_epi8 (a, *(__m128i*)mask8_32_even_odd);
+        reshi = _mm_unpackhi_epi64(a16, a16); //hi part of result used for saturation
+        cmp = _mm_cmpgt_epi16(zero, reshi); //if cmp<0 the result should be zero
+        res = _mm_andnot_si128(cmp, a16); //if cmp zero - do nothing, otherwise cmp <0  and the result is 0
+        cmp = _mm_cmpgt_epi16(reshi,zero); //if cmp positive
+        return _mm_or_si128(res, cmp); //if cmp positive we are out of 16 bits, need to saturate to 0xffff
+    }
+
+    // method used by GCC with generic vector extensions
+    _NEON2SSE_INLINE __m128i _MM_MULLO_EPI32(__m128i a, __m128i b)
+    {
+        __m128i a_high = _mm_srli_epi64(a, 32);
+        __m128i low = _mm_mul_epu32(a, b);
+        __m128i b_high = _mm_srli_epi64(b, 32);
+        __m128i high = _mm_mul_epu32(a_high, b_high);
+        low = _mm_shuffle_epi32(low, _MM_SHUFFLE(0, 0, 2, 0));
+        high = _mm_shuffle_epi32(high, _MM_SHUFFLE(0, 0, 2, 0));
+        return _mm_unpacklo_epi32(low, high);
+    }
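+
+    /* Illustrative note on _MM_MULLO_EPI32 above: _mm_mul_epu32 multiplies only the even (0th
+       and 2nd) 32-bit lanes, producing 64-bit products. Shifting a and b right by 32 bits moves
+       the odd lanes into those even positions so a second _mm_mul_epu32 covers them; the two
+       shuffles then keep just the low 32 bits of each product and the final unpack interleaves
+       them back into lane order. The low 32 bits of a product are identical for signed and
+       unsigned inputs, which is why the unsigned multiply suffices here. */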
+
+    _NEON2SSE_INLINE __m128i _MM_MUL_EPI32(__m128i a, __m128i b)
+    {
+        __m128i sign, zero,  mul_us, a_neg, b_neg, mul_us_neg;
+        sign = _mm_xor_si128 (a, b);
+        sign =  _mm_srai_epi32 (sign, 31); //propagate the sign bit to all fields: all ones if negative, all zeros if positive
+        sign = _mm_shuffle_epi32(sign, _MM_SHUFFLE(2, 2, 0, 0)); //duplicate the sign of lanes 0 and 2 into lanes 1 and 3
+        zero = _mm_setzero_si128();
+        a_neg = _mm_abs_epi32 (a); //take the absolute value of a
+        b_neg = _mm_abs_epi32 (b); //take the absolute value of b
+        mul_us = _mm_mul_epu32 (a_neg, b_neg); //uses the 0th and 2nd data lanes (abs values), the multiplication gives a 64 bit result
+        mul_us_neg = _mm_sub_epi64(zero, mul_us);
+        mul_us_neg = _mm_and_si128(sign, mul_us_neg);
+        mul_us = _mm_andnot_si128(sign, mul_us);
+        return _mm_or_si128 (mul_us, mul_us_neg);
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_CMPEQ_EPI64(__m128i a, __m128i b)
+    {
+        __m128i res;
+        res = _mm_cmpeq_epi32 (a, b);
+        return _mm_shuffle_epi32 (res, 1 | (1 << 2) | (3 << 4) | (3 << 6)); //copy the information from hi to low part of the 64 bit data
+    }
+#endif     //SSE4
+
+//special case: functions that work using 32-bit operations only, no SSE4
+_NEON2SSE_INLINE __m128i  _MM_INSERT_EPI64_32(__m128i vec, int64_t p, const int LANE)
+{
+    _NEON2SSE_ALIGN_16 uint64_t pvec[2] = {0,0};
+    _NEON2SSE_ALIGN_16 uint64_t mask[2] = {0xffffffffffffffff, 0xffffffffffffffff};
+    __m128i vec_masked, p_masked;
+    pvec[LANE] = p;
+    mask[LANE] = 0x0;
+    vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
+    p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
+    return _mm_or_si128(vec_masked, p_masked);
+}
+
+_NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64_32(__m128i val, const int LANE)
+{
+    _NEON2SSE_ALIGN_16 int64_t tmp[2];
+    _mm_store_si128((__m128i*)tmp, val);
+    return tmp[LANE];
+}
+
+#ifndef _NEON2SSE_64BIT_SSE4
+#   define _MM_INSERT_EPI64 _MM_INSERT_EPI64_32
+#   define _MM_EXTRACT_EPI64 _MM_EXTRACT_EPI64_32
+#endif
+
+_NEON2SSESTORAGE int32x4_t  vqd_s32(int32x4_t a); //Saturating doubling for signed ints
+_NEON2SSE_INLINE int32x4_t  vqd_s32(int32x4_t a)
+{
+    //Overflow happens only if a and sum have the opposite signs
+    __m128i c7fffffff, res, res_sat, res_xor_a;
+    c7fffffff = _mm_set1_epi32(0x7fffffff);
+    res = _mm_slli_epi32 (a, 1); // res = a*2
+    res_sat = _mm_srli_epi32(a, 31);
+    res_sat = _mm_add_epi32(res_sat, c7fffffff);
+    res_xor_a = _mm_xor_si128(res, a);
+    res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones if <0, all zeros otherwise
+    res_sat = _mm_and_si128(res_xor_a, res_sat);
+    res = _mm_andnot_si128(res_xor_a, res);
+    return _mm_or_si128(res, res_sat);
+}
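+
+/* Worked example for vqd_s32 above (illustrative only): overflow of a + a is detected by
+   res_xor_a = res ^ a having its sign bit set, i.e. a and 2*a disagree in sign, and
+   res_sat = (a logically shifted right by 31) + 0x7fffffff is already the correctly signed
+   saturation value:
+       a = 0x7fffffff: res = 0xfffffffe (negative), res ^ a < 0 -> pick res_sat = 0x7fffffff
+       a = 0x80000000: res = 0x00000000,            res ^ a < 0 -> pick res_sat = 0x80000000
+   Lanes that did not overflow keep res, because the res_xor_a mask is all zeros there. */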
+
+
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+//*************************************************************************
+//*************************************************************************
+//*****************  Functions redefinition/implementation starts here *****
+//*************************************************************************
+//*************************************************************************
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+/*If a unified intrinsics solution is necessary, please define your SSE intrinsics wrapper here as in the following sample:
+#ifdef ARM
+#define vector_addq_s32 _mm_add_epi32
+#else //if we have IA
+#define vector_addq_s32 vadd_s32
+#endif
+
+********************************************************************************************
+Functions below are organised in the following way:
+
+Each NEON intrinsic function has one of the following options:
+1.  its full x86 equivalent SSE intrinsic - in this case the x86 version simply follows the NEON one under the corresponding #define statement
+2.  an x86 implementation using more than one x86 intrinsic. In this case it is shaped as an inlined C function with a return statement
+3.  a reference to a NEON function returning the same result and implemented in x86 as above. In this case it is shaped as a matching NEON function definition
+4.  for about 5% of functions, where the corresponding x86 SIMD is unavailable or inefficient in terms of performance,
+a serial implementation is provided along with a corresponding compiler warning. If these functions are on your application's
+critical path, please consider removing them from your code.
+*/
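+
+/* For example, option 1 above corresponds to plain defines such as
+   #define vaddq_s8 _mm_add_epi8 further below, option 2 to inlined helpers such as vhaddq_s8,
+   and option 4 to functions flagged with _NEON2SSE_PERFORMANCE_WARNING. */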
+
+//***********************************************************************
+//************************      Vector add   *****************************
+//***********************************************************************
+_NEON2SSESTORAGE int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vadd_s8(int8x8_t a, int8x8_t b)
+{
+    int8x8_t res64;
+    return64(_mm_add_epi8(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vadd_s16(int16x4_t a, int16x4_t b)
+{
+    int16x4_t res64;
+    return64(_mm_add_epi16(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vadd_s32(int32x2_t a, int32x2_t b)
+{
+    int32x2_t res64;
+    return64(_mm_add_epi32(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int64x1_t  vadd_s64(int64x1_t a,  int64x1_t b); // VADD.I64 d0,d0,d0
+_NEON2SSE_INLINE int64x1_t  vadd_s64(int64x1_t a,  int64x1_t b)
+{
+    int64x1_t res64;
+    res64.m64_i64[0] = a.m64_i64[0] + b.m64_i64[0];
+    return res64;
+}
+
+
+_NEON2SSESTORAGE float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0
+_NEON2SSE_INLINE float32x2_t vadd_f32(float32x2_t a, float32x2_t b)
+{
+    __m128 res;
+    __m64_128 res64;
+    res = _mm_add_ps(_pM128(a),_pM128(b)); //SSE, use only low 64 bits
+    _M64f(res64, res);
+    return res64;
+}
+
+_NEON2SSE_GLOBAL uint8x8_t  vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0
+#define vadd_u8 vadd_s8
+
+_NEON2SSE_GLOBAL uint16x4_t  vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0
+#define vadd_u16 vadd_s16
+
+_NEON2SSE_GLOBAL uint32x2_t  vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0
+#define vadd_u32 vadd_s32
+
+_NEON2SSESTORAGE uint64x1_t vadd_u64(uint64x1_t a,  uint64x1_t b); // VADD.I64 d0,d0,d0
+_NEON2SSE_INLINE uint64x1_t vadd_u64(uint64x1_t a,  uint64x1_t b)
+{
+    uint64x1_t res64;
+    res64.m64_u64[0] = a.m64_u64[0] + b.m64_u64[0];
+    return res64;
+}
+
+
+_NEON2SSE_GLOBAL int8x16_t   vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0
+#define vaddq_s8 _mm_add_epi8
+
+_NEON2SSE_GLOBAL int16x8_t   vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0
+#define vaddq_s16 _mm_add_epi16
+
+_NEON2SSE_GLOBAL int32x4_t   vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0
+#define vaddq_s32 _mm_add_epi32
+
+_NEON2SSE_GLOBAL int64x2_t   vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0
+#define vaddq_s64 _mm_add_epi64
+
+_NEON2SSE_GLOBAL float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0
+#define vaddq_f32 _mm_add_ps
+
+_NEON2SSE_GLOBAL uint8x16_t   vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0
+#define vaddq_u8 _mm_add_epi8
+
+_NEON2SSE_GLOBAL uint16x8_t   vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0
+#define vaddq_u16 _mm_add_epi16
+
+_NEON2SSE_GLOBAL uint32x4_t   vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0
+#define vaddq_u32 _mm_add_epi32
+
+_NEON2SSE_GLOBAL uint64x2_t   vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0
+#define vaddq_u64 _mm_add_epi64
+
+//**************************** Vector long add *****************************:
+//***********************************************************************
+//Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
+_NEON2SSESTORAGE int16x8_t  vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0
+_NEON2SSE_INLINE int16x8_t  vaddl_s8(int8x8_t a, int8x8_t b) // VADDL.S8 q0,d0,d0
+{
+    __m128i a16, b16;
+    a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
+    b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
+    return _mm_add_epi16 (a16, b16);
+}
+
+_NEON2SSESTORAGE int32x4_t  vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0
+_NEON2SSE_INLINE int32x4_t  vaddl_s16(int16x4_t a, int16x4_t b) // VADDL.S16 q0,d0,d0
+{
+    __m128i a32, b32;
+    a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
+    b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1
+    return _mm_add_epi32 (a32, b32);
+}
+
+_NEON2SSESTORAGE int64x2_t  vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0
+_NEON2SSE_INLINE int64x2_t  vaddl_s32(int32x2_t a, int32x2_t b) // VADDL.S32 q0,d0,d0
+{
+    //may not be optimal
+    __m128i a64, b64;
+    a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1
+    b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
+    return _mm_add_epi64 ( a64, b64);
+}
+
+_NEON2SSESTORAGE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0
+_NEON2SSE_INLINE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b) // VADDL.U8 q0,d0,d0
+{
+    __m128i a16, b16;
+    a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1
+    b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1
+    return _mm_add_epi16 (a16, b16);
+}
+
+_NEON2SSESTORAGE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.s16 q0,d0,d0
+_NEON2SSE_INLINE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b) // VADDL.s16 q0,d0,d0
+{
+    __m128i a32, b32;
+    a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1
+    b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1
+    return _mm_add_epi32 (a32, b32);
+}
+
+_NEON2SSESTORAGE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0
+_NEON2SSE_INLINE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b) // VADDL.U32 q0,d0,d0
+{
+    //may not be optimal
+    __m128i a64, b64;
+    a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1
+    b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
+    return _mm_add_epi64 (a64, b64);
+}
+
+//***************   Vector wide add: vaddw_<type>. Vr[i]:=Va[i]+Vb[i] ******************
+//*************** *********************************************************************
+_NEON2SSESTORAGE int16x8_t  vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0
+_NEON2SSE_INLINE int16x8_t  vaddw_s8(int16x8_t a, int8x8_t b) // VADDW.S8 q0,q0,d0
+{
+    __m128i b16;
+    b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
+    return _mm_add_epi16 (a, b16);
+}
+
+_NEON2SSESTORAGE int32x4_t  vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0
+_NEON2SSE_INLINE int32x4_t  vaddw_s16(int32x4_t a, int16x4_t b) // VADDW.S16 q0,q0,d0
+{
+    __m128i b32;
+    b32 =  _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1,
+    return _mm_add_epi32 (a, b32);
+}
+
+_NEON2SSESTORAGE int64x2_t  vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0
+_NEON2SSE_INLINE int64x2_t  vaddw_s32(int64x2_t a, int32x2_t b) // VADDW.S32 q0,q0,d0
+{
+    __m128i b64;
+    b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
+    return _mm_add_epi64 (a, b64);
+}
+
+_NEON2SSESTORAGE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0
+_NEON2SSE_INLINE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b) // VADDW.U8 q0,q0,d0
+{
+    __m128i b16;
+    b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1
+    return _mm_add_epi16 (a, b16);
+}
+
+_NEON2SSESTORAGE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.s16 q0,q0,d0
+_NEON2SSE_INLINE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b) // VADDW.s16 q0,q0,d0
+{
+    __m128i b32;
+    b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1
+    return _mm_add_epi32 (a, b32);
+}
+
+_NEON2SSESTORAGE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0
+_NEON2SSE_INLINE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b) // VADDW.U32 q0,q0,d0
+{
+    __m128i b64;
+    b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
+    return _mm_add_epi64 (a, b64);
+}
+
+//******************************Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1 ,  result truncated *******************************
+//*************************************************************************************************************************
+_NEON2SSESTORAGE int8x8_t vhadd_s8(int8x8_t a,  int8x8_t b); // VHADD.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vhadd_s8(int8x8_t a,  int8x8_t b)
+{
+    int8x8_t res64;
+    return64(vhaddq_s8(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int16x4_t vhadd_s16(int16x4_t a,  int16x4_t b); // VHADD.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vhadd_s16(int16x4_t a,  int16x4_t b)
+{
+    int16x4_t res64;
+    return64( vhaddq_s16(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int32x2_t vhadd_s32(int32x2_t a,  int32x2_t b); // VHADD.S32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vhadd_s32(int32x2_t a,  int32x2_t b)
+{
+    int32x2_t res64;
+    return64( vhaddq_s32(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint8x8_t vhadd_u8(uint8x8_t a,  uint8x8_t b); // VHADD.U8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vhadd_u8(uint8x8_t a,  uint8x8_t b)
+{
+    uint8x8_t res64;
+    return64( vhaddq_u8(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint16x4_t vhadd_u16(uint16x4_t a,  uint16x4_t b); // VHADD.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vhadd_u16(uint16x4_t a,  uint16x4_t b)
+{
+    uint16x4_t res64;
+    return64( vhaddq_u16(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint32x2_t vhadd_u32(uint32x2_t a,  uint32x2_t b); // VHADD.U32 d0,d0,d0
+_NEON2SSE_INLINE uint32x2_t vhadd_u32(uint32x2_t a,  uint32x2_t b)
+{
+    uint32x2_t res64;
+    return64( vhaddq_u32(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b)
+{
+    //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
+    __m128i tmp1, tmp2;
+    tmp1 = _mm_and_si128(a,b);
+    tmp2 = _mm_xor_si128(a,b);
+    tmp2 = vshrq_n_s8(tmp2,1);
+    return _mm_add_epi8(tmp1,tmp2);
+}
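+
+/* Note on the halving-add identity used by the vhaddq_* functions (illustrative):
+   x + y = 2*(x & y) + (x ^ y), because bits common to x and y are counted twice and differing
+   bits once, so (x + y) >> 1 == (x & y) + ((x ^ y) >> 1) without ever forming the overflowing
+   sum. E.g. x = 5 (0101b), y = 3 (0011b): (x & y) = 1, (x ^ y) = 6, 1 + 6/2 = 4 = (5 + 3)/2.
+   The signed variants shift the xor part arithmetically (vshrq_n_s8 / _mm_srai_epi16/32). */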
+
+_NEON2SSESTORAGE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S16 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b)
+{
+    //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
+    __m128i tmp1, tmp2;
+    tmp1 = _mm_and_si128(a,b);
+    tmp2 = _mm_xor_si128(a,b);
+    tmp2 = _mm_srai_epi16(tmp2,1);
+    return _mm_add_epi16(tmp1,tmp2);
+}
+
+_NEON2SSESTORAGE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0
+_NEON2SSE_INLINE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b) // VHADD.S32 q0,q0,q0
+{
+    //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
+    __m128i tmp1, tmp2;
+    tmp1 = _mm_and_si128(a,b);
+    tmp2 = _mm_xor_si128(a,b);
+    tmp2 = _mm_srai_epi32(tmp2,1);
+    return _mm_add_epi32(tmp1,tmp2);
+}
+
+_NEON2SSESTORAGE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0
+_NEON2SSE_INLINE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b) // VHADD.U8 q0,q0,q0
+{
+    __m128i c1, sum, res;
+    c1 = _mm_set1_epi8(1);
+    sum = _mm_avg_epu8(a, b); //result is rounded, need to compensate it
+    res = _mm_xor_si128(a, b); //for rounding compensation
+    res = _mm_and_si128(res,c1); //for rounding compensation
+    return _mm_sub_epi8 (sum, res); //actual rounding compensation
+}
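+
+/* Illustrative note: _mm_avg_epu8 computes the rounded average (a + b + 1) >> 1, while VHADD
+   must truncate. The xor/and pair extracts (a ^ b) & 1, which is 1 exactly when a + b is odd,
+   i.e. exactly when the rounding added one, so subtracting it gives the truncated result.
+   E.g. a = 5, b = 2: avg = (7 + 1) >> 1 = 4, (5 ^ 2) & 1 = 1, 4 - 1 = 3 = 7 >> 1. */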
+
+_NEON2SSESTORAGE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.s16 q0,q0,q0
+_NEON2SSE_INLINE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b) // VHADD.s16 q0,q0,q0
+{
+    __m128i sum, res;
+    sum = _mm_avg_epu16(a, b); //result is rounded, need to compensate it
+    res = _mm_xor_si128(a, b); //for rounding compensation
+    res = _mm_slli_epi16 (res,15); //shift left  then back right to
+    res = _mm_srli_epi16 (res,15); //get 1 or zero
+    return _mm_sub_epi16 (sum, res); //actual rounding compensation
+}
+
+_NEON2SSESTORAGE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0
+_NEON2SSE_INLINE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b) // VHADD.U32 q0,q0,q0
+{
+    //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
+    __m128i tmp1, tmp2;
+    tmp1 = _mm_and_si128(a,b);
+    tmp2 = _mm_xor_si128(a,b);
+    tmp2 = _mm_srli_epi32(tmp2,1);
+    return _mm_add_epi32(tmp1,tmp2);
+}
+
+//************************Vector rounding halving add: vrhadd{q}_<type>. Vr[i]:=(Va[i]+Vb[i]+1)>>1   ***************************
+//*****************************************************************************************************************************
+_NEON2SSESTORAGE int8x8_t vrhadd_s8(int8x8_t a,  int8x8_t b); // VRHADD.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vrhadd_s8(int8x8_t a,  int8x8_t b)
+{
+    int8x8_t res64;
+    return64(vrhaddq_s8(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int16x4_t vrhadd_s16(int16x4_t a,  int16x4_t b); // VRHADD.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vrhadd_s16(int16x4_t a,  int16x4_t b)
+{
+    int16x4_t res64;
+    return64(vrhaddq_s16(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int32x2_t vrhadd_s32(int32x2_t a,  int32x2_t b); // VRHADD.S32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vrhadd_s32(int32x2_t a,  int32x2_t b)
+{
+    int32x2_t res64;
+    return64(vrhaddq_s32(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b)
+{
+    uint8x8_t res64;
+    return64(_mm_avg_epu8(_pM128i(a),_pM128i(b))); //SSE, result rounding!!!
+}
+
+
+_NEON2SSESTORAGE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b)
+{
+    uint16x4_t res64;
+    return64(_mm_avg_epu16(_pM128i(a),_pM128i(b))); //SSE, result rounding!!!
+}
+
+
+_NEON2SSESTORAGE uint32x2_t vrhadd_u32(uint32x2_t a,  uint32x2_t b); // VRHADD.U32 d0,d0,d0
+_NEON2SSE_INLINE uint32x2_t vrhadd_u32(uint32x2_t a,  uint32x2_t b)
+{
+    uint32x2_t res64;
+    return64(vrhaddq_u32(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int8x16_t  vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0
+_NEON2SSE_INLINE int8x16_t  vrhaddq_s8(int8x16_t a, int8x16_t b) // VRHADD.S8 q0,q0,q0
+{
+    //no signed average in x86 SIMD, go to unsigned
+    __m128i c128, au, bu, sum;
+    c128 = _mm_set1_epi8(-128); //(int8_t)0x80
+    au = _mm_sub_epi8(a, c128); //add 128
+    bu = _mm_sub_epi8(b, c128); //add 128
+    sum = _mm_avg_epu8(au, bu);
+    return _mm_add_epi8 (sum, c128); //sub 128
+}
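+
+/* Illustrative note on the signed rounding average above: adding 128 to both operands (done
+   here by subtracting the constant -128) maps the signed range [-128,127] onto the unsigned
+   range [0,255] while preserving order; the unsigned rounded average is then taken and the
+   bias removed again. E.g. a = -2, b = -3: biased values 126 and 125, _mm_avg_epu8 -> 126,
+   minus 128 gives -2, which equals the NEON result (a + b + 1) >> 1 = (-4) >> 1 = -2. */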
+
+_NEON2SSESTORAGE int16x8_t  vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t  vrhaddq_s16(int16x8_t a, int16x8_t b) // VRHADD.S16 q0,q0,q0
+{
+    //no signed average in x86 SIMD, go to unsigned
+    __m128i cx8000, au, bu, sum;
+    cx8000 = _mm_set1_epi16(-32768); //(int16_t)0x8000
+    au = _mm_sub_epi16(a, cx8000); //add 32768
+    bu = _mm_sub_epi16(b, cx8000); //add 32768
+    sum = _mm_avg_epu16(au, bu);
+    return _mm_add_epi16 (sum, cx8000); //sub 32768
+}
+
+_NEON2SSESTORAGE int32x4_t  vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0
+_NEON2SSE_INLINE int32x4_t  vrhaddq_s32(int32x4_t a, int32x4_t b)
+{
+    //need to avoid overflow
+    __m128i a2, b2, res, sum;
+    a2 = _mm_srai_epi32(a,1); //a2=a/2;
+    b2 = _mm_srai_epi32(b,1); // b2=b/2;
+    res = _mm_or_si128(a,b); //for rounding
+    res = _mm_slli_epi32 (res,31); //shift left  then back right to
+    res = _mm_srli_epi32 (res,31); //get 1 or zero
+    sum = _mm_add_epi32(a2,b2);
+    return _mm_add_epi32(sum,res);
+}
+
+_NEON2SSE_GLOBAL uint8x16_t   vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0
+#define vrhaddq_u8 _mm_avg_epu8 //SSE2, results rounded
+
+_NEON2SSE_GLOBAL uint16x8_t   vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.s16 q0,q0,q0
+#define vrhaddq_u16 _mm_avg_epu16 //SSE2, results rounded
+
+
+_NEON2SSESTORAGE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0
+_NEON2SSE_INLINE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b) // VRHADD.U32 q0,q0,q0
+{
+    //need to avoid overflow
+    __m128i a2, b2, res, sum;
+    a2 = _mm_srli_epi32(a,1); //a2=a/2;
+    b2 = _mm_srli_epi32(b,1); // b2=b/2;
+    res = _mm_or_si128(a,b); //for rounding
+    res = _mm_slli_epi32 (res,31); //shift left  then back right to
+    res = _mm_srli_epi32 (res,31); //get 1 or zero
+    sum = _mm_add_epi32(a2,b2);
+    return _mm_add_epi32(sum,res);
+}
+
+//****************** VQADD: Vector saturating add ************************
+//************************************************************************
+_NEON2SSESTORAGE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b)
+{
+    int8x8_t res64;
+    return64(_mm_adds_epi8(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b)
+{
+    int16x4_t res64;
+    return64(_mm_adds_epi16(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int32x2_t vqadd_s32(int32x2_t a,  int32x2_t b); // VQADD.S32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vqadd_s32(int32x2_t a,  int32x2_t b)
+{
+    int32x2_t res64;
+    return64(vqaddq_s32(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int64x1_t  vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqadd_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    int64x1_t res;
+    uint64_t a64, b64;
+    a64 = a.m64_u64[0];
+    b64 = b.m64_u64[0];
+    res.m64_u64[0] = a64 + b64;
+    a64 = (a64 >> 63) + (~_SIGNBIT64);
+    if ((int64_t)((b64 ^ a64) | ~(res.m64_u64[0] ^ b64))>=0) {
+        res.m64_u64[0] = a64;
+    }
+    return res;
+}
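+
+/* Illustrative note on the branchless check above: after the update, a64 holds INT64_MAX when
+   the original a was non-negative and INT64_MIN when it was negative, i.e. the value to
+   saturate to. The condition (int64_t)((b64 ^ a64) | ~(res ^ b64)) >= 0 holds exactly when b
+   has the same sign as the original a while the sum's sign differs from b's - the classic
+   signed-overflow criterion - and only then is the wrapped sum replaced by a64. */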
+
+_NEON2SSESTORAGE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b)
+{
+    uint8x8_t res64;
+    return64(_mm_adds_epu8(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b)
+{
+    uint16x4_t res64;
+    return64(_mm_adds_epu16(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint32x2_t vqadd_u32(uint32x2_t a,  uint32x2_t b); // VQADD.U32 d0,d0,d0
+_NEON2SSE_INLINE uint32x2_t vqadd_u32(uint32x2_t a,  uint32x2_t b)
+{
+    uint32x2_t res64;
+    return64(vqaddq_u32(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    _NEON2SSE_ALIGN_16 uint64_t a64, b64;
+    uint64x1_t res;
+    a64 = a.m64_u64[0];
+    b64 = b.m64_u64[0];
+    res.m64_u64[0] = a64 + b64;
+    if (res.m64_u64[0] < a64) {
+        res.m64_u64[0] = ~(uint64_t)0;
+    }
+    return res;
+}
+
+_NEON2SSE_GLOBAL int8x16_t   vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0
+#define vqaddq_s8 _mm_adds_epi8
+
+_NEON2SSE_GLOBAL int16x8_t   vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0
+#define vqaddq_s16 _mm_adds_epi16
+
+_NEON2SSESTORAGE int32x4_t  vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0
+_NEON2SSE_INLINE int32x4_t  vqaddq_s32(int32x4_t a, int32x4_t b)
+{
+    //no corresponding x86 SIMD solution, special tricks are necessary. Overflow happens only if a and b have the same sign and the sum has the opposite sign
+    __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a_;
+    c7fffffff = _mm_set1_epi32(0x7fffffff);
+    res = _mm_add_epi32(a, b);
+    res_sat = _mm_srli_epi32(a, 31);
+    res_sat = _mm_add_epi32(res_sat, c7fffffff);
+    res_xor_a = _mm_xor_si128(res, a);
+    b_xor_a_ = _mm_xor_si128(b, a);
+    res_xor_a = _mm_andnot_si128(b_xor_a_, res_xor_a);
+    res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones if <0, all zeros otherwise
+    res_sat = _mm_and_si128(res_xor_a, res_sat);
+    res = _mm_andnot_si128(res_xor_a, res);
+    return _mm_or_si128(res, res_sat);
+}
+
+_NEON2SSESTORAGE int64x2_t  vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
+    _mm_store_si128((__m128i*)atmp, a);
+    _mm_store_si128((__m128i*)btmp, b);
+    res[0] = atmp[0] + btmp[0];
+    res[1] = atmp[1] + btmp[1];
+
+    atmp[0] = (atmp[0] >> 63) + (~_SIGNBIT64);
+    atmp[1] = (atmp[1] >> 63) + (~_SIGNBIT64);
+
+    if ((int64_t)((btmp[0] ^ atmp[0]) | ~(res[0] ^ btmp[0]))>=0) {
+        res[0] = atmp[0];
+    }
+    if ((int64_t)((btmp[1] ^ atmp[1]) | ~(res[1] ^ btmp[1]))>=0) {
+        res[1] = atmp[1];
+    }
+    return _mm_load_si128((__m128i*)res);
+}
+
+_NEON2SSE_GLOBAL uint8x16_t   vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0
+#define vqaddq_u8 _mm_adds_epu8
+
+_NEON2SSE_GLOBAL uint16x8_t   vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.s16 q0,q0,q0
+#define vqaddq_u16 _mm_adds_epu16
+
+_NEON2SSESTORAGE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0
+_NEON2SSE_INLINE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b)
+{
+    __m128i c80000000, cmp, subsum, suba, sum;
+    c80000000 = _mm_set1_epi32 (0x80000000);
+    sum = _mm_add_epi32 (a, b);
+    subsum = _mm_sub_epi32 (sum, c80000000);
+    suba = _mm_sub_epi32 (a, c80000000);
+    cmp = _mm_cmpgt_epi32 ( suba, subsum); //no unsigned comparison, need to go to signed
+    return _mm_or_si128 (sum, cmp); //saturation
+}
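+
+/* Illustrative note: for unsigned lanes, wraparound happened exactly when sum < a. SSE2 has no
+   unsigned 32-bit compare, so both values are biased by 0x80000000 first, which turns the
+   unsigned ordering into the signed one; lanes that wrapped get an all-ones cmp mask, and
+   OR-ing it into the sum forces them to the saturated value 0xFFFFFFFF.
+   E.g. a = 0xFFFFFFFF, b = 2: sum = 1, suba = 0x7FFFFFFF > subsum = 0x80000001 (signed),
+   so that lane saturates to 0xFFFFFFFF. */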
+
+_NEON2SSESTORAGE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0
+#ifdef USE_SSE4
+    _NEON2SSE_INLINE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b)
+    {
+        __m128i c80000000, sum, cmp, suba, subsum;
+        c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0);
+        sum = _mm_add_epi64 (a, b);
+        subsum = _mm_sub_epi64 (sum, c80000000);
+        suba = _mm_sub_epi64 (a, c80000000);
+        cmp = _mm_cmpgt_epi64 ( suba, subsum); //no unsigned comparison, need to go to signed, SSE4.2!!!
+        return _mm_or_si128 (sum, cmp); //saturation
+    }
+#else
+    _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+    {
+        _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
+        _mm_store_si128((__m128i*)atmp, a);
+        _mm_store_si128((__m128i*)btmp, b);
+        res[0] = atmp[0] + btmp[0];
+        res[1] = atmp[1] + btmp[1];
+        if (res[0] < atmp[0]) res[0] = ~(uint64_t)0;
+        if (res[1] < atmp[1]) res[1] = ~(uint64_t)0;
+        return _mm_load_si128((__m128i*)(res));
+    }
+#endif
+
+
+//******************* Vector add high half (truncated)  ******************
+//************************************************************************
+_NEON2SSESTORAGE int8x8_t   vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0
+_NEON2SSE_INLINE int8x8_t   vaddhn_s16(int16x8_t a, int16x8_t b) // VADDHN.I16 d0,q0,q0
+{
+    int8x8_t res64;
+    __m128i sum;
+    sum = _mm_add_epi16 (a, b);
+    sum = _mm_srai_epi16 (sum, 8);
+    sum = _mm_packs_epi16 (sum, sum); //use 64 low bits only
+    return64(sum);
+}
+
+_NEON2SSESTORAGE int16x4_t  vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0
+_NEON2SSE_INLINE int16x4_t  vaddhn_s32(int32x4_t a, int32x4_t b) // VADDHN.I32 d0,q0,q0
+{
+    int16x4_t res64;
+    __m128i sum;
+    sum = _mm_add_epi32 (a, b);
+    sum = _mm_srai_epi32(sum, 16);
+    sum = _mm_packs_epi32 (sum, sum); //use 64 low bits only
+    return64(sum);
+}
+
+_NEON2SSESTORAGE int32x2_t  vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0
+_NEON2SSE_INLINE int32x2_t  vaddhn_s64(int64x2_t a, int64x2_t b)
+{
+    int32x2_t res64;
+    __m128i sum;
+    sum = _mm_add_epi64 (a, b);
+    sum = _mm_shuffle_epi32(sum,  1 | (3 << 2) | (0 << 4) | (2 << 6));
+    return64(sum);
+}
+
+_NEON2SSESTORAGE uint8x8_t  vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0
+_NEON2SSE_INLINE uint8x8_t  vaddhn_u16(uint16x8_t a, uint16x8_t b) // VADDHN.I16 d0,q0,q0
+{
+    uint8x8_t res64;
+    __m128i sum;
+    sum = _mm_add_epi16 (a, b);
+    sum = _mm_srli_epi16 (sum, 8);
+    sum = _mm_packus_epi16 (sum,sum); //use 64 low bits only
+    return64(sum);
+}
+
+_NEON2SSESTORAGE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0
+_NEON2SSE_INLINE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b) // VADDHN.I32 d0,q0,q0
+{
+    uint16x4_t res64;
+     __m128i sum;
+    sum = _mm_add_epi32 (a, b);
+    sum = _mm_srli_epi32 (sum, 16);
+#ifdef USE_SSE4
+    sum = _MM_PACKUS1_EPI32 (sum); //use 64 low bits only
+#else
+    sum = _mm_shuffle_epi8 (sum, *(__m128i*) mask8_32_even_odd); //go to 16 bits
+#endif
+    return64(sum);
+}
+
+_NEON2SSE_GLOBAL uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0
+#define vaddhn_u64 vaddhn_s64
+
+//*********** Vector rounding add high half: vraddhn_<type> ******************.
+//***************************************************************************
+_NEON2SSESTORAGE int8x8_t   vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0
+_NEON2SSE_INLINE int8x8_t   vraddhn_s16(int16x8_t a, int16x8_t b) // VRADDHN.I16 d0,q0,q0
+{
+    int8x8_t res64;
+    __m128i sum, mask1;
+    sum = _mm_add_epi16 (a, b);
+    mask1 = _mm_slli_epi16(sum, 8); //shift left then back right to
+    mask1 = _mm_srli_epi16(mask1, 15); //get  7-th bit 1 or zero
+    sum = _mm_srai_epi16 (sum, 8); //get high half
+    sum = _mm_add_epi16 (sum, mask1); //actual rounding
+    sum = _mm_packs_epi16 (sum, sum);
+    return64(sum);
+}
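+
+/* Illustrative note on the rounding narrow above: mask1 isolates bit 7 of each 16-bit sum,
+   the highest bit discarded by the narrowing; adding it to the arithmetically shifted high
+   half is equivalent to computing (sum + 0x80) >> 8, i.e. rounding to nearest before taking
+   the high half. E.g. sum = 0x0180 (384): high half 1, bit 7 set, result 2 = round(384/256). */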
+
+_NEON2SSESTORAGE int16x4_t  vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0
+_NEON2SSE_INLINE int16x4_t  vraddhn_s32(int32x4_t a, int32x4_t b) // VRADDHN.I32 d0,q0,q0
+{
+    //SIMD may not be optimal, a serial version may be faster
+    int16x4_t res64;
+    __m128i sum, mask1;
+    sum = _mm_add_epi32 (a, b);
+    mask1 = _mm_slli_epi32(sum, 16); //shift left then back right to
+    mask1 = _mm_srli_epi32(mask1,31); //get  15-th bit 1 or zero
+    sum = _mm_srai_epi32 (sum, 16); //get high half
+    sum = _mm_add_epi32 (sum, mask1); //actual rounding
+    sum = _mm_packs_epi32 (sum, sum);
+    return64(sum);
+}
+
+_NEON2SSESTORAGE int32x2_t  vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0
+_NEON2SSE_INLINE int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b)
+{
+    //SIMD may not be optimal, a serial version may be faster
+    int32x2_t res64;
+    __m128i sum, mask1;
+    sum = _mm_add_epi64 (a, b);
+    mask1 = _mm_slli_epi64(sum, 32); //shift left then back right to
+    mask1 = _mm_srli_epi64(mask1,31); //get  31-th bit 1 or zero
+    sum = _mm_add_epi32 (sum, mask1); //actual high half rounding
+    sum = _mm_shuffle_epi32(sum,  1 | (3 << 2) | (1 << 4) | (3 << 6));
+    return64(sum);
+}
+
+_NEON2SSESTORAGE uint8x8_t  vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0
+_NEON2SSE_INLINE uint8x8_t  vraddhn_u16(uint16x8_t a, uint16x8_t b) // VRADDHN.I16 d0,q0,q0
+{
+    uint8x8_t res64;
+    __m128i sum, mask1;
+    sum = _mm_add_epi16 (a, b);
+    mask1 = _mm_slli_epi16(sum, 8); //shift left then back right to
+    mask1 = _mm_srli_epi16(mask1, 15); //get  7-th bit 1 or zero
+    sum = _mm_srai_epi16 (sum, 8); //get high half
+    sum = _mm_add_epi16 (sum, mask1); //actual rounding
+    sum = _mm_packus_epi16 (sum, sum);
+    return64(sum);
+}
+
+_NEON2SSESTORAGE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0
+_NEON2SSE_INLINE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b)
+{
+    //SIMD may not be optimal, a serial version may be faster
+    uint16x4_t res64;
+    __m128i sum, mask1;
+    sum = _mm_add_epi32 (a, b);
+    mask1 = _mm_slli_epi32(sum, 16); //shift left then back right to
+    mask1 = _mm_srli_epi32(mask1,31); //get  15-th bit 1 or zero
+    sum = _mm_srai_epi32 (sum, 16); //get high half
+    sum = _mm_add_epi32 (sum, mask1); //actual rounding
+    sum = _MM_PACKUS1_EPI32 (sum);
+    return64(sum);
+}
+
+_NEON2SSE_GLOBAL uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0
+#define vraddhn_u64 vraddhn_s64
+
+//**********************************************************************************
+//*********             Multiplication            *************************************
+//**************************************************************************************
+
+//Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i]
+//As we don't widen the result, these functions are equal to "multiply low" in x86
+_NEON2SSESTORAGE int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vmul_s8(int8x8_t a, int8x8_t b) // VMUL.I8 d0,d0,d0
+{
+    // no 8 bit simd multiply, need to go to 16 bits in SSE
+    int8x8_t res64;
+    __m128i a128, b128, res;
+    a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits
+    b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
+    res = _mm_mullo_epi16 (a128, b128);
+    res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit from 16, use 64 low bits only
+    return64(res);
+}
+
+_NEON2SSE_GLOBAL int16x4_t vmul_s16(int16x4_t a,  int16x4_t b); // VMUL.I16 d0,d0,d0
+#define vmul_s16 vmul_u16
+
+_NEON2SSE_GLOBAL int32x2_t vmul_s32(int32x2_t a,  int32x2_t b); // VMUL.I32 d0,d0,d0
+#define vmul_s32 vmul_u32
+
+_NEON2SSESTORAGE float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0
+_NEON2SSE_INLINE float32x2_t vmul_f32(float32x2_t a, float32x2_t b)
+{
+    float32x4_t tmp;
+    __m64_128 res64;
+    tmp =  _mm_mul_ps(_pM128(a),_pM128(b));
+    _M64f(res64, tmp); //use low 64 bits
+    return res64;
+}
+
+_NEON2SSESTORAGE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b) // VMUL.I8 d0,d0,d0
+{
+    // no 8 bit simd multiply, need to go to 16 bits in SSE
+    uint8x8_t res64;
+    __m128i mask, a128, b128, res;
+    mask = _mm_set1_epi16(0xff);
+    a128 = _MM_CVTEPU8_EPI16 (_pM128i(a));
+    b128 = _MM_CVTEPU8_EPI16 (_pM128i(b));
+    res = _mm_mullo_epi16 (a128, b128);
+    res = _mm_and_si128(res, mask); //to avoid saturation
+    res = _mm_packus_epi16 (res,res); //use only low 64 bits
+    return64(res);
+}
+
+_NEON2SSESTORAGE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b)
+{
+    uint16x4_t res64;
+    return64(_mm_mullo_epi16(_pM128i(a),_pM128i(b)));
+}
+
+_NEON2SSESTORAGE uint32x2_t   vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint32x2_t   vmul_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    uint32x2_t res;
+    res.m64_u32[0] = a.m64_u32[0] * b.m64_u32[0];
+    res.m64_u32[1] = a.m64_u32[1] * b.m64_u32[1];
+    return res;
+}
+
+_NEON2SSESTORAGE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0
+_NEON2SSE_INLINE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b)
+{
+    //may be optimized
+    poly8x8_t res64;
+    __m128i a64, b64, c1, res, tmp, bmasked;
+    int i;
+    a64 = _pM128i(a);
+    b64 = _pM128i(b);
+    c1 = _mm_cmpeq_epi8 (a64,a64); //all ones 0xff....
+    c1 = vshrq_n_u8(c1,7); //0x1
+    bmasked = _mm_and_si128(b64, c1); //0x1
+    res = vmulq_u8(a64, bmasked);
+    for(i = 1; i<8; i++) {
+        c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
+        bmasked = _mm_and_si128(b64, c1); //0x1
+        tmp = vmulq_u8(a64, bmasked);
+        res = _mm_xor_si128(res, tmp);
+    }
+    return64 (res);
+}
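+
+/* Illustrative note on the polynomial multiply above: SSE has no carry-less byte multiply, so
+   b is processed one bit position at a time. bmasked keeps only bit i of every byte of b, so
+   vmulq_u8(a, bmasked) equals a shifted left by i (truncated to 8 bits) wherever that bit is
+   set, since multiplying by a power of two is a shift; the partial products are then combined
+   with XOR rather than addition, which is exactly GF(2) polynomial multiplication. */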
+
+_NEON2SSESTORAGE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b) // VMUL.I8 q0,q0,q0
+{
+    // no 8 bit simd multiply, need to go to 16 bits
+    //solution may not be optimal
+    __m128i a16, b16, r16_1, r16_2;
+    a16 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1
+    b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
+    r16_1 = _mm_mullo_epi16 (a16, b16);
+    //swap hi and low part of a and b to process the remaining data
+    a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
+    b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
+    a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1
+    b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
+
+    r16_2 = _mm_mullo_epi16 (a16, b16);
+    r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*)mask8_16_even_odd); //return to 8 bit
+    r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*)mask8_16_even_odd); //return to 8 bit
+
+    return _mm_unpacklo_epi64(r16_1,  r16_2);
+}
+
+_NEON2SSE_GLOBAL int16x8_t   vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0
+#define vmulq_s16 _mm_mullo_epi16
+
+_NEON2SSE_GLOBAL int32x4_t   vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0
+#define vmulq_s32 _MM_MULLO_EPI32 //SSE4.1
+
+_NEON2SSE_GLOBAL float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0
+#define vmulq_f32 _mm_mul_ps
+
+_NEON2SSESTORAGE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0
+_NEON2SSE_INLINE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b) // VMUL.I8 q0,q0,q0
+{
+    // no 8 bit simd multiply, need to go to 16 bits
+    //solution may not be optimal
+    __m128i maskff, a16, b16, r16_1, r16_2;
+    maskff = _mm_set1_epi16(0xff);
+    a16 = _MM_CVTEPU8_EPI16 (a); // SSE 4.1
+    b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
+    r16_1 = _mm_mullo_epi16 (a16, b16);
+    r16_1 = _mm_and_si128(r16_1, maskff); //to avoid saturation
+    //swap hi and low part of a and b to process the remaining data
+    a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
+    b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
+    a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1
+    b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
+
+    r16_2 = _mm_mullo_epi16 (a16, b16);
+    r16_2 = _mm_and_si128(r16_2, maskff); //to avoid saturation
+    return _mm_packus_epi16 (r16_1,  r16_2);
+}
+
+_NEON2SSE_GLOBAL uint16x8_t   vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0
+#define vmulq_u16 _mm_mullo_epi16
+
+_NEON2SSE_GLOBAL uint32x4_t   vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0
+#define vmulq_u32 _MM_MULLO_EPI32 //SSE4.1
+
+_NEON2SSESTORAGE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0
+_NEON2SSE_INLINE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b)
+{
+    //may be optimized
+    __m128i c1, res, tmp, bmasked;
+    int i;
+    c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff....
+    c1 = vshrq_n_u8(c1,7); //0x1
+    bmasked = _mm_and_si128(b, c1); //0x1
+    res = vmulq_u8(a, bmasked);
+    for(i = 1; i<8; i++) {
+        c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
+        bmasked = _mm_and_si128(b, c1); //0x1
+        tmp = vmulq_u8(a, bmasked);
+        res = _mm_xor_si128(res, tmp);
+    }
+    return res;
+}
+
+//************************* Vector long multiply ***********************************
+//****************************************************************************
+_NEON2SSESTORAGE int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0
+_NEON2SSE_INLINE int16x8_t vmull_s8(int8x8_t a, int8x8_t b) // VMULL.S8 q0,d0,d0
+{
+    //no 8 bit simd multiply, need to go to 16 bits
+    __m128i a16, b16;
+    a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
+    b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1
+    return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit
+}
+
+_NEON2SSESTORAGE int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0
+_NEON2SSE_INLINE int32x4_t vmull_s16(int16x4_t a, int16x4_t b) // VMULL.S16 q0,d0,d0
+{
+#ifdef USE_SSE4
+    __m128i a16, b16;
+    a16 = _MM_CVTEPI16_EPI32 (_pM128i(a)); // SSE 4.1
+    b16 = _MM_CVTEPI16_EPI32 (_pM128i(b)); // SSE 4.1
+    return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1
+#else
+    __m128i low, hi, a128,b128;
+    a128 = _pM128i(a);
+    b128 = _pM128i(b);
+    low =  _mm_mullo_epi16(a128,b128);
+    hi =   _mm_mulhi_epi16(a128,b128);
+    return _mm_unpacklo_epi16(low,hi);
+#endif
+}
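+
+/* Illustrative note on the non-SSE4 path above: _mm_mullo_epi16 yields the low 16 bits and
+   _mm_mulhi_epi16 the high 16 bits of each 16x16 product; _mm_unpacklo_epi16 interleaves the
+   two for the four low lanes, reassembling the full 32-bit products that VMULL.S16 produces. */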
+
+_NEON2SSESTORAGE int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0
+_NEON2SSE_INLINE int64x2_t vmull_s32(int32x2_t a, int32x2_t b) // VMULL.S32 q0,d0,d0
+{
+    __m128i ab, ba, a128, b128;
+    a128 = _pM128i(a);
+    b128 = _pM128i(b);
+    ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1
+    ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1
+    return _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64 bit result
+}
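+
+/* Illustrative note: _MM_MUL_EPI32 multiplies only the even-indexed 32-bit lanes of its two
+   operands. The two unpacks place a0 (in ab) opposite b0 (in ba) at lane 0, and a1 opposite b1
+   at lane 2, so a single widening multiply yields the two 64-bit products a0*b0 and a1*b1
+   that vmull_s32 must return. vmull_u32 below uses the same layout with _mm_mul_epu32. */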
+
+_NEON2SSESTORAGE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0
+_NEON2SSE_INLINE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b) // VMULL.U8 q0,d0,d0
+{
+    //no 8 bit simd multiply, need to go to 16 bits
+    __m128i a16, b16;
+    a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1
+    b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1
+    return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit
+}
+
+_NEON2SSESTORAGE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.s16 q0,d0,d0
+_NEON2SSE_INLINE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b) // VMULL.s16 q0,d0,d0
+{
+#ifdef USE_SSE4
+    __m128i a16, b16;
+    a16 = _MM_CVTEPU16_EPI32 (_pM128i(a)); // SSE 4.1
+    b16 = _MM_CVTEPU16_EPI32 (_pM128i(b)); // SSE 4.1
+    return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1
+#else
+    __m128i a128,b128,low, hi;
+    a128 = _pM128i(a);
+    b128 = _pM128i(b);
+    low =  _mm_mullo_epi16(a128,b128);
+    hi =   _mm_mulhi_epu16(a128,b128);
+    return _mm_unpacklo_epi16(low,hi);
+#endif
+}
+
+_NEON2SSESTORAGE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0
+_NEON2SSE_INLINE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b) // VMULL.U32 q0,d0,d0
+{
+    //may not be optimal compared with a serial implementation
+    __m128i ab, ba, a128, b128;
+    a128 = _pM128i(a);
+    b128 = _pM128i(b);
+    ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1
+    ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1
+    return _mm_mul_epu32 (ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64 bit result
+}
+
+_NEON2SSESTORAGE poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0
+_NEON2SSE_INLINE poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b)
+{
+    //may be optimized
+    __m128i a128,b128, c1, a128_16, bmasked_16, res, tmp, bmasked;
+    int i;
+    a128 = _pM128i(a);
+    b128 = _pM128i(b);
+    c1 = _mm_cmpeq_epi8 (a128,a128); //all ones 0xff....
+    c1 = vshrq_n_u8(c1,7); //0x1
+    bmasked = _mm_and_si128(b128, c1); //0x1
+
+    a128_16 = _MM_CVTEPU8_EPI16 (a128); // SSE 4.1
+    bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1
+    res = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit
+    for(i = 1; i<8; i++) {
+        c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
+        bmasked = _mm_and_si128(b128, c1); //0x1
+        bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1
+        tmp = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit, vmull_u8(a, bmasked);
+        res = _mm_xor_si128(res, tmp);
+    }
+    return res;
+}
+
+//****************Vector saturating doubling long multiply **************************
+//*****************************************************************
+_NEON2SSESTORAGE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0
+_NEON2SSE_INLINE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b)
+{
+    //the serial solution may be faster due to saturation
+    __m128i res;
+    res = vmull_s16(a, b);
+    return vqd_s32(res);
+}
+
+_NEON2SSESTORAGE int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL)
+{
+    //the serial solution may be faster due to saturation
+    __m128i res;
+    res = vmull_s32(a,b);
+    return vqaddq_s64(res,res); //slow serial function!!!!
+}
+
+//********************* Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i]  ************************
+//******************************************************************************************
+_NEON2SSESTORAGE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLA.I8 d0,d0,d0
+{
+    // no 8 bit x86 simd multiply, need to go to 16 bits,  and use the low 64 bits
+    int8x8_t res64;
+    __m128i b128, c128, res;
+    b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
+    c128 = _MM_CVTEPI8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits
+    res = _mm_mullo_epi16 (c128, b128);
+    res  =  _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd);
+    res  = _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits
+    return64(res);
+}
+
+_NEON2SSESTORAGE int16x4_t vmla_s16(int16x4_t a,  int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vmla_s16(int16x4_t a,  int16x4_t b, int16x4_t c)
+{
+    int16x4_t res64;
+    return64(vmlaq_s16(_pM128i(a),_pM128i(b), _pM128i(c)));
+}
+
+
+_NEON2SSESTORAGE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLA.I32 d0,d0,d0
+{
+    int32x2_t res64;
+    __m128i res;
+    res = _MM_MULLO_EPI32 (_pM128i(b), _pM128i(c)); //SSE4.1
+    res = _mm_add_epi32 (res, _pM128i(a)); //use the low 64 bits
+    return64(res);
+}
+
+_NEON2SSESTORAGE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0
+_NEON2SSE_INLINE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c)
+{
+    //fma is coming soon, but right now:
+    __m128 res;
+    __m64_128 res64;
+    res = _mm_mul_ps (_pM128(c), _pM128(b));
+    res = _mm_add_ps (_pM128(a), res);
+    _M64f(res64, res);
+    return res64;
+}
+
+_NEON2SSESTORAGE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) // VMLA.I8 d0,d0,d0
+{
+    // no 8 bit x86 simd multiply, need to go to 16 bits,  and use the low 64 bits
+    uint8x8_t res64;
+    __m128i mask, b128, c128, res;
+    mask = _mm_set1_epi16(0xff);
+    b128 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
+    c128 = _MM_CVTEPU8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits
+    res = _mm_mullo_epi16 (c128, b128);
+    res = _mm_and_si128(res, mask); //to avoid saturation
+    res = _mm_packus_epi16 (res, res);
+    res =  _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits
+    return64(res);
+}
+
+_NEON2SSE_GLOBAL uint16x4_t vmla_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0
+#define vmla_u16 vmla_s16
+
+_NEON2SSE_GLOBAL uint32x2_t vmla_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0
+#define vmla_u32 vmla_s32
+
+_NEON2SSESTORAGE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLA.I8 q0,q0,q0
+{
+    //solution may not be optimal
+    // no 8 bit simd multiply, need to go to 16 bits
+    __m128i b16, c16, r16_1, a_2,r16_2;
+    b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
+    c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1
+    r16_1 = _mm_mullo_epi16 (b16, c16);
+    r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
+    r16_1 = _mm_add_epi8 (r16_1, a);
+    //swap hi and low part of a, b and c to process the remaining data
+    a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
+    b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
+    c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
+    b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
+    c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1
+
+    r16_2 = _mm_mullo_epi16 (b16, c16);
+    r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
+    r16_2 = _mm_add_epi8(r16_2, a_2);
+    return _mm_unpacklo_epi64(r16_1,r16_2);
+}
+
+_NEON2SSESTORAGE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLA.I16 q0,q0,q0
+{
+    __m128i res;
+    res = _mm_mullo_epi16 (c, b);
+    return _mm_add_epi16 (res, a);
+}
+
+_NEON2SSESTORAGE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0
+_NEON2SSE_INLINE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLA.I32 q0,q0,q0
+{
+    __m128i res;
+    res = _MM_MULLO_EPI32 (c,  b); //SSE4.1
+    return _mm_add_epi32 (res, a);
+}
+
+_NEON2SSESTORAGE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0
+_NEON2SSE_INLINE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLA.F32 q0,q0,q0
+{
+    //fma is coming soon, but right now:
+    __m128 res;
+    res = _mm_mul_ps (c, b);
+    return _mm_add_ps (a, res);
+}
+
+_NEON2SSESTORAGE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0
+_NEON2SSE_INLINE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLA.I8 q0,q0,q0
+{
+    //solution may not be optimal
+    // no 8 bit simd multiply, need to go to 16 bits
+    __m128i b16, c16, r16_1, a_2, r16_2;
+    b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
+    c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1
+    r16_1 = _mm_mullo_epi16 (b16, c16);
+    r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
+    r16_1 = _mm_add_epi8 (r16_1, a);
+    //swap hi and low part of a, b and c to process the remaining data
+    a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
+    b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
+    c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
+    b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1
+    c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1
+
+    r16_2 = _mm_mullo_epi16 (b16, c16);
+    r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
+    r16_2 = _mm_add_epi8(r16_2, a_2);
+    return _mm_unpacklo_epi64(r16_1,r16_2);
+}
+
+_NEON2SSE_GLOBAL uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0
+#define vmlaq_u16 vmlaq_s16
+
+_NEON2SSE_GLOBAL uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0
+#define vmlaq_u32 vmlaq_s32
+
+//**********************  Vector widening multiply accumulate (long multiply accumulate):
+//                          vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i]  **************
+//********************************************************************************************
+_NEON2SSESTORAGE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0
+_NEON2SSE_INLINE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLAL.S8 q0,d0,d0
+{
+    int16x8_t res;
+    res = vmull_s8(b, c);
+    return _mm_add_epi16 (res, a);
+}
+
+_NEON2SSESTORAGE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0
+_NEON2SSE_INLINE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLAL.S16 q0,d0,d0
+{
+    //may not be optimal compared with a serial implementation
+    int32x4_t res;
+    res = vmull_s16(b,  c);
+    return _mm_add_epi32 (res, a);
+}
+
+_NEON2SSESTORAGE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0
+_NEON2SSE_INLINE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLAL.S32 q0,d0,d0
+{
+    //may not be optimal compared with a serial implementation
+    int64x2_t res;
+    res = vmull_s32( b, c);
+    return _mm_add_epi64 (res, a);
+}
+
+_NEON2SSESTORAGE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0
+_NEON2SSE_INLINE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLAL.U8 q0,d0,d0
+{
+    uint16x8_t res;
+    res = vmull_u8(b, c);
+    return _mm_add_epi16 (res, a);
+}
+
+_NEON2SSESTORAGE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.s16 q0,d0,d0
+_NEON2SSE_INLINE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLAL.s16 q0,d0,d0
+{
+    //may not be optimal compared with a serial implementation
+    uint32x4_t res;
+    res = vmull_u16(b, c);
+    return _mm_add_epi32 (res, a);
+}
+
+_NEON2SSESTORAGE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0
+_NEON2SSE_INLINE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLAL.U32 q0,d0,d0
+{
+    //may not be optimal compared with a serial implementation
+    int64x2_t res;
+    res = vmull_u32( b,c);
+    return _mm_add_epi64 (res, a);
+}
+
+//******************** Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i] ***************************************
+//********************************************************************************************
+_NEON2SSESTORAGE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLS.I8 d0,d0,d0
+{
+    // no 8 bit simd multiply, need to go to 16 bits -  and use the low 64 bits
+    int8x8_t res64;
+    __m128i res;
+    res64 = vmul_s8(b,c);
+    res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64));
+    return64(res);
+}
+
+_NEON2SSESTORAGE int16x4_t vmls_s16(int16x4_t a,  int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vmls_s16(int16x4_t a,  int16x4_t b, int16x4_t c)
+{
+    int16x4_t res64;
+    return64(vmlsq_s16(_pM128i(a),_pM128i(b), _pM128i(c)));
+}
+
+
+_NEON2SSESTORAGE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLS.I32 d0,d0,d0
+{
+    int32x2_t res64;
+    __m128i res;
+    res = _MM_MULLO_EPI32 (_pM128i(c),_pM128i( b)); //SSE4.1
+    res =  _mm_sub_epi32 (_pM128i(a),res); //use low 64 bits only
+    return64(res);
+}
+
+_NEON2SSESTORAGE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0
+_NEON2SSE_INLINE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c)
+{
+    __m128 res;
+    __m64_128 res64;
+    res = _mm_mul_ps (_pM128(c), _pM128(b));
+    res = _mm_sub_ps (_pM128(a), res);
+    _M64f(res64, res);
+    return res64;
+}
+
+_NEON2SSESTORAGE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c)
+{
+    // no 8 bit simd multiply, need to go to 16 bits -  and use the low 64 bits
+    uint8x8_t res64;
+    __m128i res;
+    res64 = vmul_u8(b,c);
+    res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64));
+    return64(res);
+}
+
+_NEON2SSE_GLOBAL uint16x4_t vmls_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0
+#define vmls_u16 vmls_s16
+
+_NEON2SSE_GLOBAL uint32x2_t vmls_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0
+#define vmls_u32 vmls_s32
+
+
+_NEON2SSESTORAGE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLS.I8 q0,q0,q0
+{
+    //solution may not be optimal
+    // no 8 bit simd multiply, need to go to 16 bits
+    __m128i b16, c16, r16_1, a_2, r16_2;
+    b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
+    c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1
+    r16_1 = _mm_mullo_epi16 (b16, c16);
+    r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd);
+    r16_1 = _mm_sub_epi8 (a, r16_1);
+    //swap hi and low part of a, b, c to process the remaining data
+    a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
+    b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
+    c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
+    b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
+    c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1
+
+    r16_2 = _mm_mullo_epi16 (b16, c16);
+    r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
+    r16_2 = _mm_sub_epi8 (a_2, r16_2);
+    return _mm_unpacklo_epi64(r16_1,r16_2);
+}
+
+_NEON2SSESTORAGE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLS.I16 q0,q0,q0
+{
+    __m128i res;
+    res = _mm_mullo_epi16 (c, b);
+    return _mm_sub_epi16 (a, res);
+}
+
+_NEON2SSESTORAGE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0
+_NEON2SSE_INLINE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLS.I32 q0,q0,q0
+{
+    __m128i res;
+    res = _MM_MULLO_EPI32 (c, b); //SSE4.1
+    return _mm_sub_epi32 (a, res);
+}
+
+_NEON2SSESTORAGE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0
+_NEON2SSE_INLINE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLS.F32 q0,q0,q0
+{
+    __m128 res;
+    res = _mm_mul_ps (c, b);
+    return _mm_sub_ps (a, res);
+}
+
+_NEON2SSESTORAGE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0
+_NEON2SSE_INLINE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLS.I8 q0,q0,q0
+{
+    //solution may not be optimal
+    // no 8 bit simd multiply, need to go to 16 bits
+    __m128i b16, c16, r16_1, a_2, r16_2;
+    b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
+    c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1
+    r16_1 = _mm_mullo_epi16 (b16, c16);
+    r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
+    r16_1 = _mm_sub_epi8 (a, r16_1);
+    //swap hi and low part of a, b and c to process the remaining data
+    a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
+    b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
+    c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
+    b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1
+    c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1
+
+    r16_2 = _mm_mullo_epi16 (b16, c16);
+    r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
+    r16_2 = _mm_sub_epi8(a_2, r16_2);
+    return _mm_unpacklo_epi64(r16_1,r16_2);
+}
+
+_NEON2SSE_GLOBAL uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0
+#define vmlsq_u16 vmlsq_s16
+
+_NEON2SSE_GLOBAL uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0
+#define vmlsq_u32 vmlsq_s32
+
+//******************** Vector multiply subtract long (widening multiply subtract) ************************************
+//*************************************************************************************************************
+_NEON2SSESTORAGE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0
+_NEON2SSE_INLINE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLSL.S8 q0,d0,d0
+{
+    int16x8_t res;
+    res = vmull_s8(b, c);
+    return _mm_sub_epi16 (a, res);
+}
+
+_NEON2SSESTORAGE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0
+_NEON2SSE_INLINE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLSL.S16 q0,d0,d0
+{
+    //may not be optimal compared with a serial implementation
+    int32x4_t res;
+    res = vmull_s16(b,  c);
+    return _mm_sub_epi32 (a, res);
+}
+
+_NEON2SSESTORAGE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0
+_NEON2SSE_INLINE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLSL.S32 q0,d0,d0
+{
+    //may not be optimal compared with a serial implementation
+    int64x2_t res;
+    res = vmull_s32( b,c);
+    return _mm_sub_epi64 (a, res);
+}
+
+_NEON2SSESTORAGE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0
+_NEON2SSE_INLINE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLSL.U8 q0,d0,d0
+{
+    uint16x8_t res;
+    res = vmull_u8(b, c);
+    return _mm_sub_epi16 (a, res);
+}
+
+_NEON2SSESTORAGE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.s16 q0,d0,d0
+_NEON2SSE_INLINE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLSL.s16 q0,d0,d0
+{
+    //may not be optimal compared with a serial implementation
+    uint32x4_t res;
+    res = vmull_u16(b, c);
+    return _mm_sub_epi32 (a, res);
+}
+
+_NEON2SSESTORAGE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0
+_NEON2SSE_INLINE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLSL.U32 q0,d0,d0
+{
+    //may not be optimal compared with a serial implementation
+    int64x2_t res;
+    res = vmull_u32( b,c);
+    return _mm_sub_epi64 (a, res);
+}
+
+//******  Vector saturating doubling multiply high **********************
+//*************************************************************************
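+//vqdmulh -> Vr[i] := sat((2 * Va[i] * Vb[i]) >> lane_bits), i.e. the high half of the doubled product,
+//saturating only when both operands equal the most negative value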
+_NEON2SSESTORAGE int16x4_t vqdmulh_s16(int16x4_t a,  int16x4_t b); // VQDMULH.S16 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqdmulh_s16(int16x4_t a,  int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    int16x4_t res;
+    int32_t a32, b32, i;
+    for (i = 0; i<4; i++) {
+        a32 = (int32_t) a.m64_i16[i];
+        b32 = (int32_t) b.m64_i16[i];
+        a32 = (a32 * b32) >> 15;
+        res.m64_i16[i] = (a32 == 0x8000) ? 0x7fff : (int16_t) a32;
+    }
+    return res;
+}
+
+_NEON2SSESTORAGE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b) // no 32-bit multiply-high SIMD in IA32, so some tricks are needed; a serial solution may be faster
+{
+    //may not be optimal compared with a serial solution
+    int32x2_t res64;
+    __m128i mask;
+    _NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
+    int64x2_t mul;
+    mul = vmull_s32(a,b);
+    mul = _mm_slli_epi64(mul,1); //double the result
+    //at this point start treating 2 64-bit numbers as 4 32-bit
+    mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
+    mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
+    mul = _mm_xor_si128 (mul,  mask); //res saturated for 0x80000000
+    return64(mul);
+}
+
+_NEON2SSESTORAGE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b) // VQDMULH.S16 q0,q0,q0
+{
+    __m128i res, res_lo, mask;
+    _NEON2SSE_ALIGN_16 static const uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000};
+    res = _mm_mulhi_epi16 (a, b);
+    res = _mm_slli_epi16 (res, 1); //double the result, don't care about saturation
+    res_lo = _mm_mullo_epi16 (a, b);
+    res_lo = _mm_srli_epi16(res_lo,15); //take the highest bit
+    res = _mm_add_epi16(res, res_lo); //combine results
+    mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask);
+    return _mm_xor_si128 (res,  mask); //res saturated for 0x8000
+}
+
+_NEON2SSESTORAGE int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
+{
+    // no 32-bit multiply-high SIMD in IA32; may not be optimal compared with a serial solution for the SSSE3 target
+    __m128i ab, ba, mask, mul, mul1;
+    _NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
+    ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1
+    ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1
+    mul = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64 bit result
+    mul = _mm_slli_epi64(mul,1); //double the result
+    ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3
+    ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3
+    mul1 = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64 bit result
+    mul1 = _mm_slli_epi64(mul1,1); //double the result
+    mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
+    mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
+    mul = _mm_unpacklo_epi64(mul, mul1);
+    mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
+    return _mm_xor_si128 (mul,  mask); //res saturated for 0x80000000
+}
+
+//********* Vector saturating rounding doubling multiply high ****************
+//****************************************************************************
+//If the _mm_mulhrs_xx functions are used, the result may differ slightly from the NEON one due to different rounding rules and order
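+//vqrdmulh -> Vr[i] := sat((2 * Va[i] * Vb[i] + (1 << (lane_bits - 1))) >> lane_bits), the rounded high half of the doubled product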
+_NEON2SSESTORAGE int16x4_t vqrdmulh_s16(int16x4_t a,  int16x4_t b); // VQRDMULH.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vqrdmulh_s16(int16x4_t a,  int16x4_t b)
+{
+    int16x4_t res64;
+    return64(vqrdmulhq_s16(_pM128i(a), _pM128i(b)));
+}
+
+_NEON2SSESTORAGE int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
+{
+    //may not be optimal compared with a serial solution
+    int32x2_t res64;
+    _NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
+    __m128i res_sat, mask, mask1;
+    int64x2_t mul;
+    mul = vmull_s32(a,b);
+    res_sat = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered
+    mask1 = _mm_slli_epi64(res_sat, 32); //shift left then back right to
+    mask1 = _mm_srli_epi64(mask1,31); //get the 31st bit, 1 or zero
+    mul = _mm_add_epi32 (res_sat, mask1); //actual rounding
+    //at this point start treating 2 64-bit numbers as 4 32-bit
+    mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
+    mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
+    mul = _mm_xor_si128 (mul,  mask); //res saturated for 0x80000000
+    return64(mul);
+}
+
+_NEON2SSESTORAGE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b) // VQRDMULH.S16 q0,q0,q0
+{
+    __m128i mask, res;
+    _NEON2SSE_ALIGN_16 static const uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000};
+    res = _mm_mulhrs_epi16 (a, b);
+    mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask);
+    return _mm_xor_si128 (res,  mask); //res saturated for 0x8000
+}
+
+_NEON2SSESTORAGE int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
+{
+    // no 32-bit multiply-high SIMD in IA32; may not be optimal compared with a serial solution for the SSSE3 target
+    __m128i ab, ba,  mask, mul, mul1, mask1;
+    _NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
+    ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1
+    ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1
+    mul = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64 bit result
+    mul = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered
+    mask1 = _mm_slli_epi64(mul, 32); //shift left then back right to
+    mask1 = _mm_srli_epi64(mask1,31); //get the 31st bit, 1 or zero
+    mul = _mm_add_epi32 (mul, mask1); //actual rounding
+
+    ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3
+    ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3
+    mul1 = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64 bit result
+    mul1 = _mm_slli_epi64 (mul1, 1); //double the result, saturation not considered
+    mask1 = _mm_slli_epi64(mul1, 32); //shift left then back right to
+    mask1 = _mm_srli_epi64(mask1,31); //get the 31st bit, 1 or zero
+    mul1 = _mm_add_epi32 (mul1, mask1); //actual rounding
+    //at this point start treating 2 64-bit numbers as 4 32-bit
+    mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
+    mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
+    mul = _mm_unpacklo_epi64(mul, mul1);
+    mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
+    return _mm_xor_si128 (mul,  mask); //res saturated for 0x80000000
+}
+
+//*************Vector widening saturating doubling multiply accumulate (long saturating doubling multiply accumulate) *****
+//*************************************************************************************************************************
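+//vqdmlal -> Vr[i] := sat(Va[i] + sat(2 * Vb[i] * Vc[i])); both the doubling and the accumulation saturate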
+_NEON2SSESTORAGE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0
+_NEON2SSE_INLINE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VQDMLAL.S16 q0,d0,d0
+{
+    //not an optimal SIMD solution, serial may be faster
+    __m128i res32;
+    res32 = vmull_s16(b,  c);
+    res32 = vqd_s32(res32); //doubling & saturation; if no saturation were needed we could use _mm_slli_epi32 (res, 1);
+    return vqaddq_s32(res32, a); //saturation
+}
+
+_NEON2SSESTORAGE int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c),_NEON2SSE_REASON_SLOW_SERIAL)
+{
+    __m128i res64;
+    res64 = vmull_s32(b,c);
+    res64 = vqaddq_s64(res64, res64); //doubling & saturation; if no saturation were needed we could use _mm_slli_epi64 (res, 1);
+    return vqaddq_s64(res64, a); //saturation
+}
+
+//************************************************************************************
+//******************  Vector subtract ***********************************************
+//************************************************************************************
+_NEON2SSESTORAGE int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vsub_s8(int8x8_t a, int8x8_t b)
+{
+    int8x8_t res64;
+    return64(_mm_sub_epi8(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vsub_s16(int16x4_t a, int16x4_t b)
+{
+    int16x4_t res64;
+    return64(_mm_sub_epi16(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vsub_s32(int32x2_t a, int32x2_t b)
+{
+    int32x2_t res64;
+    return64(_mm_sub_epi32(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int64x1_t vsub_s64(int64x1_t a,  int64x1_t b); // VSUB.I64 d0,d0,d0
+_NEON2SSE_INLINE int64x1_t vsub_s64(int64x1_t a,  int64x1_t b)
+{
+    int64x1_t res64;
+    res64.m64_i64[0] = a.m64_i64[0] - b.m64_i64[0];
+    return res64;
+}
+
+
+_NEON2SSESTORAGE float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0
+_NEON2SSE_INLINE float32x2_t vsub_f32(float32x2_t a, float32x2_t b)
+{
+    float32x2_t res;
+    res.m64_f32[0] = a.m64_f32[0] - b.m64_f32[0];
+    res.m64_f32[1] = a.m64_f32[1] - b.m64_f32[1];
+    return res;
+}
+
+_NEON2SSE_GLOBAL uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0
+#define vsub_u8 vsub_s8
+
+_NEON2SSE_GLOBAL uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0
+#define vsub_u16 vsub_s16
+
+_NEON2SSE_GLOBAL uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0
+#define vsub_u32 vsub_s32
+
+
+_NEON2SSESTORAGE uint64x1_t vsub_u64(uint64x1_t a,  uint64x1_t b); // VSUB.I64 d0,d0,d0
+_NEON2SSE_INLINE uint64x1_t vsub_u64(uint64x1_t a,  uint64x1_t b)
+{
+    int64x1_t res64;
+    res64.m64_u64[0] = a.m64_u64[0] - b.m64_u64[0];
+    return res64;
+}
+
+
+_NEON2SSE_GLOBAL int8x16_t   vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0
+#define vsubq_s8 _mm_sub_epi8
+
+_NEON2SSE_GLOBAL int16x8_t   vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0
+#define vsubq_s16 _mm_sub_epi16
+
+_NEON2SSE_GLOBAL int32x4_t   vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0
+#define vsubq_s32 _mm_sub_epi32
+
+_NEON2SSE_GLOBAL int64x2_t   vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0
+#define vsubq_s64 _mm_sub_epi64
+
+_NEON2SSE_GLOBAL float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0
+#define vsubq_f32 _mm_sub_ps
+
+_NEON2SSE_GLOBAL uint8x16_t   vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0
+#define vsubq_u8 _mm_sub_epi8
+
+_NEON2SSE_GLOBAL uint16x8_t   vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0
+#define vsubq_u16 _mm_sub_epi16
+
+_NEON2SSE_GLOBAL uint32x4_t   vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0
+#define vsubq_u32 _mm_sub_epi32
+
+_NEON2SSE_GLOBAL uint64x2_t   vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0
+#define vsubq_u64 _mm_sub_epi64
+
+//***************Vector long subtract: vsub -> Vr[i]:=Va[i]-Vb[i] ******************
+//***********************************************************************************
+//Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
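+//e.g. vsubl_s8 widens both 8x8-bit inputs to 16 bits before subtracting, so the difference cannot wrap around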
+_NEON2SSESTORAGE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0
+_NEON2SSE_INLINE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b) // VSUBL.S8 q0,d0,d0
+{
+    __m128i a16, b16;
+    a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
+    b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
+    return _mm_sub_epi16 (a16, b16);
+}
+
+_NEON2SSESTORAGE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0
+_NEON2SSE_INLINE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b) // VSUBL.S16 q0,d0,d0
+{
+    __m128i a32, b32;
+    a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
+    b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
+    return _mm_sub_epi32 (a32, b32);
+}
+
+_NEON2SSESTORAGE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0
+_NEON2SSE_INLINE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b) // VSUBL.S32 q0,d0,d0
+{
+    //may not be optimal
+    __m128i a64, b64;
+    a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1
+    b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1,
+    return _mm_sub_epi64 (a64, b64);
+}
+
+_NEON2SSESTORAGE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0
+_NEON2SSE_INLINE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b) // VSUBL.U8 q0,d0,d0
+{
+    __m128i a16, b16;
+    a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1,
+    b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
+    return _mm_sub_epi16 (a16, b16);
+}
+
+_NEON2SSESTORAGE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.s16 q0,d0,d0
+_NEON2SSE_INLINE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b) // VSUBL.s16 q0,d0,d0
+{
+    __m128i a32, b32;
+    a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1
+    b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1,
+    return _mm_sub_epi32 (a32, b32);
+}
+
+_NEON2SSESTORAGE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0
+_NEON2SSE_INLINE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b) // VSUBL.U32 q0,d0,d0
+{
+    //may not be optimal
+    __m128i a64, b64;
+    a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1
+    b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1,
+    return _mm_sub_epi64 (a64, b64);
+}
+
+//***************** Vector wide subtract: vsub -> Vr[i]:=Va[i]-Vb[i] **********************************
+//*****************************************************************************************************
+_NEON2SSESTORAGE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b); // VSUBW.S8 q0,q0,d0
+_NEON2SSE_INLINE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b) // VSUBW.S8 q0,q0,d0
+{
+    __m128i b16;
+    b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
+    return _mm_sub_epi16 (a, b16);
+}
+
+_NEON2SSESTORAGE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0
+_NEON2SSE_INLINE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b) // VSUBW.S16 q0,q0,d0
+{
+    __m128i b32;
+    b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
+    return _mm_sub_epi32 (a, b32);
+}
+
+_NEON2SSESTORAGE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0
+_NEON2SSE_INLINE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b) // VSUBW.S32 q0,q0,d0
+{
+    __m128i b64;
+    b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
+    return _mm_sub_epi64 (a, b64);
+}
+
+_NEON2SSESTORAGE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0
+_NEON2SSE_INLINE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b) // VSUBW.U8 q0,q0,d0
+{
+    __m128i b16;
+    b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
+    return _mm_sub_epi16 (a, b16);
+}
+
+_NEON2SSESTORAGE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.s16 q0,q0,d0
+_NEON2SSE_INLINE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b) // VSUBW.s16 q0,q0,d0
+{
+    __m128i b32;
+    b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1,
+    return _mm_sub_epi32 (a, b32);
+}
+
+_NEON2SSESTORAGE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0
+_NEON2SSE_INLINE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b) // VSUBW.U32 q0,q0,d0
+{
+    __m128i b64;
+    b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
+    return _mm_sub_epi64 (a, b64);
+}
+
+//************************Vector saturating subtract *********************************
+//*************************************************************************************
+_NEON2SSESTORAGE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b)
+{
+    int8x8_t res64;
+    return64(_mm_subs_epi8(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b)
+{
+    int16x4_t res64;
+    return64(_mm_subs_epi16(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int32x2_t vqsub_s32(int32x2_t a,  int32x2_t b); // VQSUB.S32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vqsub_s32(int32x2_t a,  int32x2_t b)
+{
+    int32x2_t res64;
+    return64(vqsubq_s32(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqsub_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD solution
+{
+    uint64x1_t res;
+    uint64_t a64,b64;
+    a64 = a.m64_u64[0];
+    b64 = b.m64_u64[0];
+    res.m64_u64[0] = a64 - b64;
+
+    a64 =  (a64 >> 63) + (~_SIGNBIT64);
+    if ((int64_t)((a64 ^ b64) & (a64 ^ res.m64_u64[0])) < 0) {
+        res.m64_u64[0] = a64;
+    }
+    return res;
+}
+
+_NEON2SSESTORAGE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b)
+{
+    uint8x8_t res64;
+    return64(_mm_subs_epu8(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b)
+{
+    uint16x4_t res64;
+    return64(_mm_subs_epu16(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint32x2_t vqsub_u32(uint32x2_t a,  uint32x2_t b); // VQSUB.U32 d0,d0,d0
+_NEON2SSE_INLINE uint32x2_t vqsub_u32(uint32x2_t a,  uint32x2_t b)
+{
+    uint32x2_t res64;
+    return64(vqsubq_u32(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    uint64x1_t res;
+    uint64_t a64, b64;
+    a64 = _Ui64(a);
+    b64 = _Ui64(b);
+    if (a64 > b64) {
+        res.m64_u64[0] = a64 - b64;
+    } else {
+        res.m64_u64[0] = 0;
+    }
+    return res;
+}
+
+_NEON2SSE_GLOBAL int8x16_t   vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0
+#define vqsubq_s8 _mm_subs_epi8
+
+_NEON2SSE_GLOBAL int16x8_t   vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0
+#define vqsubq_s16 _mm_subs_epi16
+
+_NEON2SSESTORAGE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0
+_NEON2SSE_INLINE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b)
+{
+    //no corresponding x86 SIMD solution, special tricks are necessary. Overflow is possible only if a and b have opposite signs and the subtraction result has the opposite sign to a
+    __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a;
+    c7fffffff = _mm_set1_epi32(0x7fffffff);
+    res = _mm_sub_epi32(a, b);
+    res_sat = _mm_srli_epi32(a, 31);
+    res_sat = _mm_add_epi32(res_sat, c7fffffff);
+    res_xor_a = _mm_xor_si128(res, a);
+    b_xor_a = _mm_xor_si128(b, a);
+    res_xor_a = _mm_and_si128(b_xor_a, res_xor_a);
+    res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones if < 0, all zeros otherwise
+    res_sat = _mm_and_si128(res_xor_a, res_sat);
+    res = _mm_andnot_si128(res_xor_a, res);
+    return _mm_or_si128(res, res_sat);
+}
+
+_NEON2SSESTORAGE int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD solution
+{
+    _NEON2SSE_ALIGN_16 int64_t atmp[2], btmp[2];
+    _NEON2SSE_ALIGN_16 uint64_t res[2];
+    _mm_store_si128((__m128i*)atmp, a);
+    _mm_store_si128((__m128i*)btmp, b);
+    res[0] = atmp[0] - btmp[0];
+    res[1] = atmp[1] - btmp[1];
+    if (((res[0] ^ atmp[0]) & _SIGNBIT64) && ((atmp[0] ^ btmp[0]) & _SIGNBIT64)) {
+        res[0] = (atmp[0] >> 63) ^ ~_SIGNBIT64;
+    }
+    if (((res[1] ^ atmp[1]) & _SIGNBIT64) && ((atmp[1] ^ btmp[1]) & _SIGNBIT64)) {
+        res[1] = (atmp[1] >> 63) ^ ~_SIGNBIT64;
+    }
+    return _mm_load_si128((__m128i*)res);
+}
+
+_NEON2SSE_GLOBAL uint8x16_t   vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0
+#define vqsubq_u8 _mm_subs_epu8
+
+_NEON2SSE_GLOBAL uint16x8_t   vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.s16 q0,q0,q0
+#define vqsubq_u16 _mm_subs_epu16
+
+_NEON2SSESTORAGE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0
+_NEON2SSE_INLINE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b) // VQSUB.U32 q0,q0,q0
+{
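+    //min(a, b) == b exactly in the lanes where b <= a; lanes that would underflow are masked to 0, the unsigned saturation value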
+    __m128i min, mask, sub;
+    min = _MM_MIN_EPU32(a, b); //SSE4.1
+    mask = _mm_cmpeq_epi32 (min,  b);
+    sub = _mm_sub_epi32 (a, b);
+    return _mm_and_si128 ( sub, mask);
+}
+
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL); // VQSUB.U64 q0,q0,q0
+#ifdef USE_SSE4
+    _NEON2SSE_INLINE uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b)
+    {
+        __m128i c80000000, subb, suba, cmp, sub;
+        c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0);
+        sub  = _mm_sub_epi64 (a, b);
+        suba = _mm_sub_epi64 (a, c80000000);
+        subb = _mm_sub_epi64 (b, c80000000);
+        cmp = _mm_cmpgt_epi64 ( suba, subb); //no unsigned comparison, need to go to signed, SSE4.2!!!
+        return _mm_and_si128 (sub, cmp); //saturation
+    }
+#else
+    _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+    {
+        _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
+        _mm_store_si128((__m128i*)atmp, a);
+        _mm_store_si128((__m128i*)btmp, b);
+        res[0] = (atmp[0] > btmp[0]) ? atmp[0] -  btmp[0] : 0;
+        res[1] = (atmp[1] > btmp[1]) ? atmp[1] -  btmp[1] : 0;
+        return _mm_load_si128((__m128i*)(res));
+    }
+#endif
+
+//**********Vector halving subtract Vr[i]:=(Va[i]-Vb[i])>>1  ******************************************************
+//****************************************************************
+_NEON2SSESTORAGE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b) // VHSUB.S8 d0,d0,d0
+{
+    //no 8 bit shift available, internal overflow is possible, so let's go to 16 bit,
+    int8x8_t res64;
+    __m128i r16;
+    int8x8_t r;
+    r = vsub_s8 (a, b);
+    r16 = _MM_CVTEPI8_EPI16 (_pM128i(r)); //SSE 4.1
+    r16 = _mm_srai_epi16 (r16, 1); //SSE2
+    r16 =  _mm_packs_epi16 (r16,r16); //use low 64 bits
+    return64(r16);
+}
+
+_NEON2SSESTORAGE int16x4_t vhsub_s16(int16x4_t a,  int16x4_t b); // VHSUB.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vhsub_s16(int16x4_t a,  int16x4_t b)
+{
+    int16x4_t res64;
+    return64(vhsubq_s16(_pM128i(a), _pM128i(b)));
+}
+
+
+
+_NEON2SSESTORAGE int32x2_t vhsub_s32(int32x2_t a,  int32x2_t b); // VHSUB.S32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vhsub_s32(int32x2_t a,  int32x2_t b)
+{
+    int32x2_t res64;
+    return64(vhsubq_s32(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint8x8_t vhsub_u8(uint8x8_t a,  uint8x8_t b); // VHSUB.U8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vhsub_u8(uint8x8_t a,  uint8x8_t b)
+{
+    uint8x8_t res64;
+    return64(vhsubq_u8(_pM128i(a), _pM128i(b)));
+}
+
+_NEON2SSESTORAGE uint16x4_t vhsub_u16(uint16x4_t a,  uint16x4_t b); // VHSUB.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vhsub_u16(uint16x4_t a,  uint16x4_t b)
+{
+    uint16x4_t res64;
+    return64(vhsubq_u16(_pM128i(a), _pM128i(b)));
+}
+
+_NEON2SSESTORAGE uint32x2_t vhsub_u32(uint32x2_t a,  uint32x2_t b); // VHSUB.U32 d0,d0,d0
+_NEON2SSE_INLINE uint32x2_t vhsub_u32(uint32x2_t a,  uint32x2_t b)
+{
+    uint32x2_t res64;
+    return64(vhsubq_u32(_pM128i(a), _pM128i(b)));
+}
+
+_NEON2SSESTORAGE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b) // VHSUB.S8 q0,q0,q0
+{
+    //need to deal with the possibility of internal overflow
+    __m128i c128, au,bu;
+    c128 = _mm_set1_epi8(-128); //(int8_t)0x80
+    au = _mm_add_epi8( a, c128);
+    bu = _mm_add_epi8( b, c128);
+    return vhsubq_u8(au,bu);
+}
+
+_NEON2SSESTORAGE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b) // VHSUB.S16 q0,q0,q0
+{
+    //need to deal with the possibility of internal overflow
+    __m128i c8000, au,bu;
+    c8000 = _mm_set1_epi16(-32768); //(int16_t)0x8000
+    au = _mm_add_epi16( a, c8000);
+    bu = _mm_add_epi16( b, c8000);
+    return vhsubq_u16(au,bu);
+}
+
+_NEON2SSESTORAGE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0
+_NEON2SSE_INLINE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b) // VHSUB.S32 q0,q0,q0
+{
+    //need to deal with the possibility of internal overflow
+    __m128i a2, b2,r, b_1;
+    a2 = _mm_srai_epi32 (a,1);
+    b2 = _mm_srai_epi32 (b,1);
+    r = _mm_sub_epi32 (a2, b2);
+    b_1 = _mm_andnot_si128(a, b); //!a and b
+    b_1 = _mm_slli_epi32 (b_1,31);
+    b_1 = _mm_srli_epi32 (b_1,31); //0 or 1, last b bit
+    return _mm_sub_epi32(r,b_1);
+}
+
+_NEON2SSESTORAGE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0
+_NEON2SSE_INLINE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b) // VHSUB.U8 q0,q0,q0
+{
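+    //_mm_avg_epu8 rounds up: avg = (a + b + 1) >> 1, so a - avg equals (a - b) >> 1 in every lane (mod 2^8)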
+    __m128i avg;
+    avg = _mm_avg_epu8 (a, b);
+    return _mm_sub_epi8(a, avg);
+}
+
+_NEON2SSESTORAGE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.s16 q0,q0,q0
+_NEON2SSE_INLINE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b) // VHSUB.s16 q0,q0,q0
+{
+    __m128i avg;
+    avg = _mm_avg_epu16 (a, b);
+    return _mm_sub_epi16(a, avg);
+}
+
+_NEON2SSESTORAGE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0
+_NEON2SSE_INLINE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b) // VHSUB.U32 q0,q0,q0
+{
+    //need to deal with the possibility of internal overflow
+    __m128i a2, b2,r, b_1;
+    a2 = _mm_srli_epi32 (a,1);
+    b2 = _mm_srli_epi32 (b,1);
+    r = _mm_sub_epi32 (a2, b2);
+    b_1 = _mm_andnot_si128(a, b); //!a and b
+    b_1 = _mm_slli_epi32 (b_1,31);
+    b_1 = _mm_srli_epi32 (b_1,31); //0 or 1, last b bit
+    return _mm_sub_epi32(r,b_1);
+}
+
+//******* Vector subtract high half (truncated) ** ************
+//************************************************************
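+//vsubhn -> Vr[i] := (Va[i] - Vb[i]) >> (lane_bits / 2), i.e. only the high half of each wide difference is kept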
+_NEON2SSESTORAGE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0
+_NEON2SSE_INLINE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b) // VSUBHN.I16 d0,q0,q0
+{
+    int8x8_t res64;
+    __m128i sum, sum8;
+    sum = _mm_sub_epi16 (a, b);
+    sum8 = _mm_srai_epi16 (sum, 8);
+    sum8 = _mm_packs_epi16(sum8,sum8);
+    return64(sum8);
+}
+
+_NEON2SSESTORAGE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0
+_NEON2SSE_INLINE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b) // VSUBHN.I32 d0,q0,q0
+{
+    int16x4_t res64;
+    __m128i sum, sum16;
+    sum = _mm_sub_epi32 (a, b);
+    sum16 = _mm_srai_epi32 (sum, 16);
+    sum16 = _mm_packs_epi32(sum16,sum16);
+    return64(sum16);
+}
+
+_NEON2SSESTORAGE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0
+_NEON2SSE_INLINE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b)
+{
+    int32x2_t res64;
+    __m128i sub;
+    sub = _mm_sub_epi64 (a, b);
+    sub = _mm_shuffle_epi32(sub,  1 | (3 << 2) | (0 << 4) | (2 << 6));
+    return64(sub);
+}
+
+_NEON2SSESTORAGE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0
+_NEON2SSE_INLINE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b) // VSUBHN.I16 d0,q0,q0
+{
+    uint8x8_t res64;
+    __m128i sum, sum8;
+    sum = _mm_sub_epi16 (a, b);
+    sum8 = _mm_srli_epi16 (sum, 8);
+    sum8 =  _mm_packus_epi16(sum8,sum8);
+    return64(sum8);
+}
+
+_NEON2SSESTORAGE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0
+_NEON2SSE_INLINE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b) // VSUBHN.I32 d0,q0,q0
+{
+    uint16x4_t res64;
+     __m128i sum, sum16;
+    sum = _mm_sub_epi32 (a, b);
+    sum16 = _mm_srli_epi32 (sum, 16);
+#ifdef USE_SSE4
+    sum16 =  _MM_PACKUS1_EPI32(sum16);
+#else
+    sum16  = _mm_shuffle_epi8 (sum16, *(__m128i*) mask8_32_even_odd); //go to 16 bits
+#endif
+    return64(sum16);
+}
+
+_NEON2SSE_GLOBAL uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0
+#define vsubhn_u64 vsubhn_s64
+
+//************ Vector rounding subtract high half *********************
+//*********************************************************************
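+//vrsubhn -> Vr[i] := (Va[i] - Vb[i] + (1 << (lane_bits / 2 - 1))) >> (lane_bits / 2), the high half rounded to nearest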
+_NEON2SSESTORAGE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0
+_NEON2SSE_INLINE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b) // VRSUBHN.I16 d0,q0,q0
+{
+    int8x8_t res64;
+    __m128i sub, mask1;
+    sub = _mm_sub_epi16 (a, b);
+    mask1 = _mm_slli_epi16(sub, 8); //shift left then back right to
+    mask1 = _mm_srli_epi16(mask1, 15); //get the 7th bit, 1 or zero
+    sub = _mm_srai_epi16 (sub, 8); //get high half
+    sub = _mm_add_epi16 (sub, mask1); //actual rounding
+    sub =  _mm_packs_epi16 (sub, sub);
+    return64(sub);
+}
+
+_NEON2SSESTORAGE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0
+_NEON2SSE_INLINE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b) // VRSUBHN.I32 d0,q0,q0
+{
+    //SIMD may not be optimal, serial may be faster
+    int16x4_t res64;
+    __m128i sub, mask1;
+    sub = _mm_sub_epi32 (a, b);
+    mask1 = _mm_slli_epi32(sub, 16); //shift left then back right to
+    mask1 = _mm_srli_epi32(mask1,31); //get the 15th bit, 1 or zero
+    sub = _mm_srai_epi32 (sub, 16); //get high half
+    sub = _mm_add_epi32 (sub, mask1); //actual rounding
+    sub = _mm_packs_epi32 (sub, sub);
+    return64(sub);
+}
+
+_NEON2SSESTORAGE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0
+_NEON2SSE_INLINE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b)
+{
+    //SIMD may not be optimal, serial may be faster
+    int32x2_t res64;
+    __m128i sub, mask1;
+    sub = _mm_sub_epi64 (a, b);
+    mask1 = _mm_slli_epi64(sub, 32); //shift left then back right to
+    mask1 = _mm_srli_epi64(mask1,31); //get the 31st bit, 1 or zero
+    sub = _mm_add_epi32 (sub, mask1); //actual high half rounding
+    sub = _mm_shuffle_epi32(sub,  1 | (3 << 2) | (0 << 4) | (2 << 6));
+    return64(sub);
+}
+
+_NEON2SSESTORAGE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0
+_NEON2SSE_INLINE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b) // VRSUBHN.I16 d0,q0,q0
+{
+    uint8x8_t res64;
+    __m128i sub, mask1;
+    sub = _mm_sub_epi16 (a, b);
+    mask1 = _mm_slli_epi16(sub, 8); //shift left then back right to
+    mask1 = _mm_srli_epi16(mask1, 15); //get the 7th bit, 1 or zero
+    sub = _mm_srai_epi16 (sub, 8); //get high half
+    sub = _mm_add_epi16 (sub, mask1); //actual rounding
+    sub = _mm_packus_epi16 (sub, sub);
+    return64(sub);
+}
+
+_NEON2SSESTORAGE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0
+_NEON2SSE_INLINE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b) // VRSUBHN.I32 d0,q0,q0
+{
+    //SIMD may not be optimal, serial may be faster
+    uint16x4_t res64;
+    __m128i sub, mask1;
+    sub = _mm_sub_epi32 (a, b);
+    mask1 = _mm_slli_epi32(sub, 16); //shift left then back right to
+    mask1 = _mm_srli_epi32(mask1,31); //get the 15th bit, 1 or zero
+    sub = _mm_srai_epi32 (sub, 16); //get high half
+    sub = _mm_add_epi32 (sub, mask1); //actual rounding
+#ifdef USE_SSE4
+    sub =  _MM_PACKUS1_EPI32 (sub);
+#else
+    sub = _mm_shuffle_epi8 (sub, *(__m128i*) mask8_32_even_odd); //go to 16 bits
+#endif
+    return64(sub);
+}
+
+_NEON2SSE_GLOBAL uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0
+#define vrsubhn_u64 vrsubhn_s64
+
+//*********** Vector saturating doubling multiply subtract long ********************
+//************************************************************************************
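+//vqdmlsl -> Vr[i] := sat(Va[i] - sat(2 * Vb[i] * Vc[i])); both the doubling and the subtraction saturate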
+_NEON2SSESTORAGE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0
+_NEON2SSE_INLINE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c)
+{
+    //not an optimal SIMD solution, serial may be faster
+    __m128i res32, mask;
+    int32x4_t res;
+    _NEON2SSE_ALIGN_16 static const uint32_t cmask[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
+    res = vmull_s16(b,  c);
+    res32 = _mm_slli_epi32 (res, 1); //double the result, saturation not considered
+    mask = _mm_cmpeq_epi32 (res32, *(__m128i*)cmask);
+    res32 = _mm_xor_si128 (res32,  mask); //res32 saturated for 0x80000000
+    return vqsubq_s32(a, res32); //saturation
+}
+
+_NEON2SSESTORAGE int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    __m128i res64, mask;
+    int64x2_t res;
+    _NEON2SSE_ALIGN_16 static const uint64_t cmask[] = {0x8000000000000000, 0x8000000000000000};
+    res = vmull_s32(b,  c);
+    res64 = _mm_slli_epi64 (res, 1); //double the result, saturation not considered
+    mask = _MM_CMPEQ_EPI64 (res64, *(__m128i*)cmask);
+    res64 = _mm_xor_si128 (res64,  mask); //res64 saturated for 0x8000000000000000
+    return vqsubq_s64(a, res64); //saturation
+}
+
+//******************  COMPARISON ***************************************
+//******************* Vector compare equal *************************************
+//****************************************************************************
+_NEON2SSESTORAGE uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0
+_NEON2SSE_INLINE int8x8_t vceq_s8(int8x8_t a, int8x8_t b)
+{
+    int8x8_t res64;
+    return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0
+_NEON2SSE_INLINE int16x4_t vceq_s16(int16x4_t a, int16x4_t b)
+{
+    int16x4_t res64;
+    return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0
+_NEON2SSE_INLINE int32x2_t vceq_s32(int32x2_t a, int32x2_t b)
+{
+    int32x2_t res64;
+    return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b)
+{
+    uint32x2_t res64;
+    __m128 res;
+    res = _mm_cmpeq_ps(_pM128(a), _pM128(b) );
+    return64f(res);
+}
+
+_NEON2SSESTORAGE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0
+_NEON2SSE_INLINE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b)
+{
+    uint8x8_t res64;
+    return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0
+_NEON2SSE_INLINE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b)
+{
+    uint16x4_t res64;
+    return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b)
+{
+    uint32x2_t res64;
+    return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSE_GLOBAL uint8x8_t   vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0
+#define vceq_p8 vceq_u8
+
+
+_NEON2SSE_GLOBAL uint8x16_t   vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0
+#define vceqq_s8 _mm_cmpeq_epi8
+
+_NEON2SSE_GLOBAL uint16x8_t   vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0
+#define vceqq_s16 _mm_cmpeq_epi16
+
+_NEON2SSE_GLOBAL uint32x4_t   vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0
+#define vceqq_s32 _mm_cmpeq_epi32
+
+_NEON2SSESTORAGE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b)
+{
+    __m128 res;
+    res = _mm_cmpeq_ps(a,b);
+    return _M128i(res);
+}
+
+_NEON2SSE_GLOBAL uint8x16_t   vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0
+#define vceqq_u8 _mm_cmpeq_epi8
+
+_NEON2SSE_GLOBAL uint16x8_t   vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0
+#define vceqq_u16 _mm_cmpeq_epi16
+
+_NEON2SSE_GLOBAL uint32x4_t   vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0
+#define vceqq_u32 _mm_cmpeq_epi32
+
+_NEON2SSE_GLOBAL uint8x16_t   vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0
+#define vceqq_p8 _mm_cmpeq_epi8
+
+//******************Vector compare greater-than or equal*************************
+//*******************************************************************************
+//IA SIMD has no greater-than-or-equal comparison for integers,
+//only greater-than is available, so we need the following tricks
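+//signed:   a >= b is built as (a > b) | (a == b)
+//unsigned: a >= b is built as max(a, b) == a (SSE4.1), by biasing both operands into the signed range, or by checking that the saturating subtract b - a is zero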
+
+_NEON2SSESTORAGE uint8x8_t vcge_s8(int8x8_t a,  int8x8_t b); // VCGE.S8 d0, d0, d0
+_NEON2SSE_INLINE int8x8_t vcge_s8(int8x8_t a,  int8x8_t b)
+{
+    int8x8_t res64;
+    return64(vcgeq_s8(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint16x4_t vcge_s16(int16x4_t a,  int16x4_t b); // VCGE.S16 d0, d0, d0
+_NEON2SSE_INLINE int16x4_t vcge_s16(int16x4_t a,  int16x4_t b)
+{
+    int16x4_t res64;
+    return64(vcgeq_s16(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint32x2_t vcge_s32(int32x2_t a,  int32x2_t b); // VCGE.S32 d0, d0, d0
+_NEON2SSE_INLINE int32x2_t vcge_s32(int32x2_t a,  int32x2_t b)
+{
+    int32x2_t res64;
+    return64(vcgeq_s32(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b)
+{
+    uint32x2_t res64;
+    __m128 res;
+    res = _mm_cmpge_ps(_pM128(a),_pM128(b)); //use only 2 first entries
+    return64f(res);
+}
+
+_NEON2SSESTORAGE uint8x8_t vcge_u8(uint8x8_t a,  uint8x8_t b); // VCGE.U8 d0, d0, d0
+_NEON2SSE_INLINE uint8x8_t vcge_u8(uint8x8_t a,  uint8x8_t b)
+{
+    uint8x8_t res64;
+    return64(vcgeq_u8(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint16x4_t vcge_u16(uint16x4_t a,  uint16x4_t b); // VCGE.s16 d0, d0, d0
+_NEON2SSE_INLINE uint16x4_t vcge_u16(uint16x4_t a,  uint16x4_t b)
+{
+    uint16x4_t res64;
+    return64(vcgeq_u16(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint32x2_t vcge_u32(uint32x2_t a,  uint32x2_t b); // VCGE.U32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vcge_u32(uint32x2_t a,  uint32x2_t b)
+{
+    //serial solution looks faster
+    uint32x2_t res64;
+    return64(vcgeq_u32 (_pM128i(a), _pM128i(b)));
+}
+
+
+
+_NEON2SSESTORAGE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
+_NEON2SSE_INLINE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0
+{
+    __m128i m1, m2;
+    m1 = _mm_cmpgt_epi8 ( a, b);
+    m2 = _mm_cmpeq_epi8 ( a, b);
+    return _mm_or_si128  ( m1, m2);
+}
+
+_NEON2SSESTORAGE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
+_NEON2SSE_INLINE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0
+{
+    __m128i m1, m2;
+    m1 = _mm_cmpgt_epi16 ( a, b);
+    m2 = _mm_cmpeq_epi16 ( a, b);
+    return _mm_or_si128   ( m1,m2);
+}
+
+_NEON2SSESTORAGE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0
+{
+    __m128i m1, m2;
+    m1 = _mm_cmpgt_epi32 (a, b);
+    m2 = _mm_cmpeq_epi32 (a, b);
+    return _mm_or_si128   (m1, m2);
+}
+
+_NEON2SSESTORAGE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b)
+{
+    __m128 res;
+    res = _mm_cmpge_ps(a,b); //all 4 entries are compared
+    return *(__m128i*)&res;
+}
+
+_NEON2SSESTORAGE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
+_NEON2SSE_INLINE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0
+{
+    //no unsigned char comparison, only signed is available, so we need the trick
+    __m128i cmp;
+    cmp = _mm_max_epu8(a, b);
+    return _mm_cmpeq_epi8(cmp, a); //a>=b
+}
+
+_NEON2SSESTORAGE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0
+_NEON2SSE_INLINE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0
+{
+    //no unsigned short comparison, only signed is available, so we need the trick
+#ifdef USE_SSE4
+    __m128i cmp;
+    cmp = _mm_max_epu16(a, b);
+    return _mm_cmpeq_epi16(cmp, a); //a>=b
+#else
+   __m128i zero = _mm_setzero_si128();
+   __m128i  as = _mm_subs_epu16(b, a);
+   return _mm_cmpeq_epi16(as, zero);
+#endif
+}
+
+_NEON2SSESTORAGE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0
+{
+    //no unsigned int comparison, only signed is available, so we need the trick
+#ifdef USE_SSE4
+    __m128i cmp;
+    cmp = _mm_max_epu32(a, b);
+    return _mm_cmpeq_epi32(cmp, a); //a>=b
+#else
+    //serial solution may be faster
+    __m128i c80000000, as, bs, m1, m2;
+    c80000000 = _mm_set1_epi32 (0x80000000);
+    as = _mm_sub_epi32(a,c80000000);
+    bs = _mm_sub_epi32(b,c80000000);
+    m1 = _mm_cmpgt_epi32 (as, bs);
+    m2 = _mm_cmpeq_epi32 (as, bs);
+    return _mm_or_si128 ( m1,  m2);
+#endif
+}
+
+//**********************Vector compare less-than or equal******************************
+//***************************************************************************************
+//IA SIMD has no less-than-or-equal comparison for integers, so we need the following tricks
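+//signed:   a <= b is built as NOT(a > b)
+//unsigned: a <= b is built as min(a, b) == a (SSE4.1) or by swapping the arguments of vcge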
+
+_NEON2SSESTORAGE uint8x8_t vcle_s8(int8x8_t a,  int8x8_t b); // VCGE.S8 d0, d0, d0
+_NEON2SSE_INLINE int8x8_t vcle_s8(int8x8_t a,  int8x8_t b)
+{
+    int8x8_t res64;
+    return64(vcleq_s8(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint16x4_t vcle_s16(int16x4_t a,  int16x4_t b); // VCGE.S16 d0, d0, d0
+_NEON2SSE_INLINE int16x4_t vcle_s16(int16x4_t a,  int16x4_t b)
+{
+    int16x4_t res64;
+    return64(vcleq_s16(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint32x2_t vcle_s32(int32x2_t a,  int32x2_t b); // VCGE.S32 d0, d0, d0
+_NEON2SSE_INLINE int32x2_t vcle_s32(int32x2_t a,  int32x2_t b)
+{
+    int32x2_t res64;
+    return64(vcleq_s32(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0?
+_NEON2SSE_INLINE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b)
+{
+    uint32x2_t res64;
+    __m128 res;
+    res = _mm_cmple_ps(_pM128(a),_pM128(b));
+    return64f(res);
+}
+
+_NEON2SSE_GLOBAL uint8x8_t vcle_u8(uint8x8_t a,  uint8x8_t b); // VCGE.U8 d0, d0, d0
+#define vcle_u8(a,b) vcge_u8(b,a)
+
+
+_NEON2SSE_GLOBAL uint16x4_t vcle_u16(uint16x4_t a,  uint16x4_t b); // VCGE.s16 d0, d0, d0
+#define vcle_u16(a,b) vcge_u16(b,a)
+
+
+_NEON2SSE_GLOBAL uint32x2_t vcle_u32(uint32x2_t a,  uint32x2_t b); // VCGE.U32 d0, d0, d0
+#define vcle_u32(a,b) vcge_u32(b,a)
+
+_NEON2SSESTORAGE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
+_NEON2SSE_INLINE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0
+{
+    __m128i c1, res;
+    c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff....
+    res = _mm_cmpgt_epi8 ( a,  b);
+    return _mm_andnot_si128 (res, c1); //invert the cmpgt result to get less-than-or-equal
+}
+
+_NEON2SSESTORAGE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
+_NEON2SSE_INLINE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0
+{
+    __m128i c1, res;
+    c1 = _mm_cmpeq_epi16 (a,a); //all ones 0xff....
+    res = _mm_cmpgt_epi16 ( a,  b);
+    return _mm_andnot_si128 (res, c1);
+}
+
+_NEON2SSESTORAGE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0
+{
+    __m128i c1, res;
+    c1 = _mm_cmpeq_epi32 (a,a); //all ones 0xff....
+    res = _mm_cmpgt_epi32 ( a,  b);
+    return _mm_andnot_si128 (res, c1);
+}
+
+_NEON2SSESTORAGE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b)
+{
+    __m128 res;
+    res = _mm_cmple_ps(a,b);
+    return *(__m128i*)&res;
+}
+
+_NEON2SSESTORAGE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
+#ifdef USE_SSE4
+    _NEON2SSE_INLINE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0
+    {
+        //no unsigned char comparison in SSE, only signed is available, so we need the trick
+        __m128i cmp;
+        cmp = _mm_min_epu8(a, b);
+        return _mm_cmpeq_epi8(cmp, a); //a<=b
+    }
+#else
+    _NEON2SSE_INLINE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0
+    {
+        return vcgeq_u8(b, a);
+    }
+#endif
+
+_NEON2SSESTORAGE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0
+#ifdef USE_SSE4
+    _NEON2SSE_INLINE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0
+    {
+        //no unsigned short comparison in SSE, only signed is available, so we need the trick
+        __m128i cmp;
+        cmp = _mm_min_epu16(a, b);
+        return _mm_cmpeq_epi16(cmp, a); //a<=b
+    }
+#else
+    _NEON2SSE_INLINE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0
+    {
+        return vcgeq_u16(b, a);
+    }
+#endif
+
+_NEON2SSESTORAGE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
+#ifdef USE_SSE4
+    _NEON2SSE_INLINE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0
+    {
+        //no unsigned int comparison in SSE, only signed is available, so we need the trick
+        __m128i cmp;
+        cmp = _mm_min_epu32(a, b);
+        return _mm_cmpeq_epi32(cmp, a); //a<=b
+    }
+#else
+    _NEON2SSE_INLINE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0
+    {
+        return vcgeq_u32(b, a);
+    }
+#endif
+
+
+//****** Vector compare greater-than ******************************************
+//**************************************************************************
+_NEON2SSESTORAGE uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
+_NEON2SSE_INLINE int8x8_t vcgt_s8(int8x8_t a, int8x8_t b)
+{
+    int8x8_t res64;
+    return64(_mm_cmpgt_epi8(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
+_NEON2SSE_INLINE int16x4_t vcgt_s16(int16x4_t a, int16x4_t b)
+{
+    int16x4_t res64;
+    return64(_mm_cmpgt_epi16(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
+_NEON2SSE_INLINE int32x2_t vcgt_s32(int32x2_t a, int32x2_t b)
+{
+    int32x2_t res64;
+    return64(_mm_cmpgt_epi32(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b)
+{
+    uint32x2_t res64;
+    __m128 res;
+    res = _mm_cmpgt_ps(_pM128(a),_pM128(b)); //use only 2 first entries
+    return64f(res);
+}
+
+_NEON2SSESTORAGE uint8x8_t vcgt_u8(uint8x8_t a,  uint8x8_t b); // VCGT.U8 d0, d0, d0
+_NEON2SSE_INLINE uint8x8_t vcgt_u8(uint8x8_t a,  uint8x8_t b)
+{
+    uint8x8_t res64;
+    return64(vcgtq_u8(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint16x4_t vcgt_u16(uint16x4_t a,  uint16x4_t b); // VCGT.s16 d0, d0, d0
+_NEON2SSE_INLINE uint16x4_t vcgt_u16(uint16x4_t a,  uint16x4_t b)
+{
+    uint16x4_t res64;
+    return64(vcgtq_u16(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint32x2_t vcgt_u32(uint32x2_t a,  uint32x2_t b); // VCGT.U32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vcgt_u32(uint32x2_t a,  uint32x2_t b)
+{
+    uint32x2_t res64;
+    return64(vcgtq_u32(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSE_GLOBAL uint8x16_t   vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
+#define vcgtq_s8 _mm_cmpgt_epi8
+
+_NEON2SSE_GLOBAL uint16x8_t   vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
+#define vcgtq_s16 _mm_cmpgt_epi16
+
+_NEON2SSE_GLOBAL uint32x4_t   vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
+#define vcgtq_s32 _mm_cmpgt_epi32
+
+_NEON2SSESTORAGE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b)
+{
+    __m128 res;
+    res = _mm_cmpgt_ps(a,b); //compare all 4 entries
+    return *(__m128i*)&res;
+}
+
+_NEON2SSESTORAGE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
+_NEON2SSE_INLINE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b) // VCGT.U8 q0, q0, q0
+{
+    //no unsigned byte comparison in SSE, only signed is available, so we need a trick
+    __m128i c128, as, bs;
+    c128 = _mm_set1_epi8(-128); //(int8_t)0x80
+    as = _mm_sub_epi8(a, c128);
+    bs = _mm_sub_epi8(b, c128);
+    return _mm_cmpgt_epi8(as, bs);
+}
+
+_NEON2SSESTORAGE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0
+_NEON2SSE_INLINE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b) // VCGT.s16 q0, q0, q0
+{
+    //no unsigned short comparison in SSE, only signed is available, so we need a trick
+    __m128i c8000, as, bs;
+    c8000 = _mm_set1_epi16(-32768); //(int16_t)0x8000
+    as = _mm_sub_epi16(a, c8000);
+    bs = _mm_sub_epi16(b, c8000);
+    return _mm_cmpgt_epi16(as, bs);
+}
+
+_NEON2SSESTORAGE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b) // VCGT.U32 q0, q0, q0
+{
+    //no unsigned int comparison in SSE, only signed is available, so we need a trick
+    __m128i c80000000, as, bs;
+    c80000000 = _mm_set1_epi32 (0x80000000);
+    as = _mm_sub_epi32(a,c80000000);
+    bs = _mm_sub_epi32(b,c80000000);
+    return _mm_cmpgt_epi32 ( as, bs);
+}
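+
+// The three unsigned q-form compares above use the usual range-shift trick: subtracting the
+// sign-bit bias (0x80, 0x8000, 0x80000000) maps unsigned order onto signed order, i.e.
+// a >u b  <=>  (a - bias) >s (b - bias). For example, with 32-bit a = 0xffffffff and b = 1,
+// the biased values are 0x7fffffff (largest signed) and 0x80000001 (near the smallest signed),
+// so the signed compare correctly reports a > b.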
+
+//********************* Vector compare less-than **************************
+//*************************************************************************
+_NEON2SSE_GLOBAL uint8x8_t   vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
+#define vclt_s8(a,b) vcgt_s8(b,a) //swap the arguments!!
+
+
+_NEON2SSE_GLOBAL uint16x4_t   vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
+#define vclt_s16(a,b) vcgt_s16(b,a) //swap the arguments!!
+
+
+_NEON2SSE_GLOBAL uint32x2_t   vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
+#define vclt_s32(a,b)  vcgt_s32(b,a) //swap the arguments!!
+
+
+_NEON2SSE_GLOBAL uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
+#define vclt_f32(a,b) vcgt_f32(b, a) //swap the arguments!!
+
+_NEON2SSE_GLOBAL uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
+#define vclt_u8(a,b) vcgt_u8(b,a) //swap the arguments!!
+
+_NEON2SSE_GLOBAL uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.s16 d0, d0, d0
+#define vclt_u16(a,b) vcgt_u16(b,a) //swap the arguments!!
+
+_NEON2SSE_GLOBAL uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
+#define vclt_u32(a,b) vcgt_u32(b,a) //swap the arguments!!
+
+_NEON2SSE_GLOBAL uint8x16_t   vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
+#define vcltq_s8(a,b) vcgtq_s8(b, a) //swap the arguments!!
+
+_NEON2SSE_GLOBAL uint16x8_t   vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
+#define vcltq_s16(a,b) vcgtq_s16(b, a) //swap the arguments!!
+
+_NEON2SSE_GLOBAL uint32x4_t   vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
+#define vcltq_s32(a,b) vcgtq_s32(b, a) //swap the arguments!!
+
+_NEON2SSE_GLOBAL uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
+#define vcltq_f32(a,b) vcgtq_f32(b, a) //swap the arguments!!
+
+_NEON2SSE_GLOBAL uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
+#define vcltq_u8(a,b) vcgtq_u8(b, a) //swap the arguments!!
+
+_NEON2SSE_GLOBAL uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0
+#define vcltq_u16(a,b) vcgtq_u16(b, a) //swap the arguments!!
+
+_NEON2SSE_GLOBAL uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
+#define vcltq_u32(a,b) vcgtq_u32(b, a) //swap the arguments!!
+
+//*****************Vector compare absolute greater-than or equal ************
+//***************************************************************************
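+// The absolute compares below form |x| by clearing the IEEE-754 sign bit (AND with 0x7fffffff);
+// for example -3.5f (0xc0600000) becomes 3.5f (0x40600000) before the ordinary compare is applied.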
+_NEON2SSESTORAGE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b)
+{
+    uint32x2_t res64;
+    __m128i c7fffffff;
+    __m128 a0, b0;
+    c7fffffff = _mm_set1_epi32 (0x7fffffff);
+    a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
+    b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
+    a0 = _mm_cmpge_ps ( a0, b0);
+    return64f(a0);
+}
+
+_NEON2SSESTORAGE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0
+{
+    __m128i c7fffffff;
+    __m128 a0, b0;
+    c7fffffff = _mm_set1_epi32 (0x7fffffff);
+    a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
+    b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
+    a0 = _mm_cmpge_ps ( a0, b0);
+    return (*(__m128i*)&a0);
+}
+
+//********Vector compare absolute less-than or equal ******************
+//********************************************************************
+_NEON2SSESTORAGE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b)
+{
+    uint32x2_t res64;
+    __m128i c7fffffff;
+    __m128 a0, b0;
+    c7fffffff = _mm_set1_epi32 (0x7fffffff);
+    a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
+    b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
+    a0 = _mm_cmple_ps (a0, b0);
+    return64f(a0);
+}
+
+_NEON2SSESTORAGE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0
+{
+    __m128i c7fffffff;
+    __m128 a0, b0;
+    c7fffffff = _mm_set1_epi32 (0x7fffffff);
+    a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
+    b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
+    a0 = _mm_cmple_ps (a0, b0);
+    return (*(__m128i*)&a0);
+}
+
+//********  Vector compare absolute greater-than    ******************
+//******************************************************************
+_NEON2SSESTORAGE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b)
+{
+    uint32x2_t res64;
+    __m128i c7fffffff;
+    __m128 a0, b0;
+    c7fffffff = _mm_set1_epi32 (0x7fffffff);
+    a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
+    b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
+    a0 = _mm_cmpgt_ps (a0, b0);
+    return64f(a0);
+}
+
+_NEON2SSESTORAGE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0
+{
+    __m128i c7fffffff;
+    __m128 a0, b0;
+    c7fffffff = _mm_set1_epi32 (0x7fffffff);
+    a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
+    b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
+    a0 = _mm_cmpgt_ps (a0, b0);
+    return (*(__m128i*)&a0);
+}
+
+//***************Vector compare absolute less-than  ***********************
+//*************************************************************************
+_NEON2SSESTORAGE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b)
+{
+    uint32x2_t res64;
+    __m128i c7fffffff;
+    __m128 a0, b0;
+    c7fffffff = _mm_set1_epi32 (0x7fffffff);
+    a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
+    b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
+    a0 = _mm_cmplt_ps (a0, b0);
+    return64f(a0);
+}
+
+_NEON2SSESTORAGE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0
+{
+    __m128i c7fffffff;
+    __m128 a0, b0;
+    c7fffffff = _mm_set1_epi32 (0x7fffffff);
+    a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
+    b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
+    a0 = _mm_cmplt_ps (a0, b0);
+    return (*(__m128i*)&a0);
+}
+
+//*************************Vector test bits************************************
+//*****************************************************************************
+/*VTST (Vector Test Bits) takes each element in a vector, and bitwise logical ANDs them
+with the corresponding element of a second vector. If the result is not zero, the
+corresponding element in the destination vector is set to all ones. Otherwise, it is set to
+all zeros. */
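+// For example, with byte lanes a = 0x0f and b = 0xf0 the AND is 0x00, so the result lane is 0x00;
+// with a = 0x0f and b = 0x01 the AND is 0x01 (non-zero), so the result lane is 0xff.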
+
+_NEON2SSESTORAGE uint8x8_t vtst_s8(int8x8_t a,  int8x8_t b); // VTST.8 d0, d0, d0
+_NEON2SSE_INLINE uint8x8_t vtst_s8(int8x8_t a,  int8x8_t b)
+{
+    int8x8_t res64;
+    return64(vtstq_s8(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint16x4_t vtst_s16(int16x4_t a,  int16x4_t b); // VTST.16 d0, d0, d0
+_NEON2SSE_INLINE uint16x4_t vtst_s16(int16x4_t a,  int16x4_t b)
+{
+    int16x4_t res64;
+    return64(vtstq_s16(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint32x2_t vtst_s32(int32x2_t a,  int32x2_t b); // VTST.32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vtst_s32(int32x2_t a,  int32x2_t b)
+{
+    int32x2_t res64;
+    return64(vtstq_s32(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSE_GLOBAL uint8x8_t vtst_u8(uint8x8_t a,  uint8x8_t b); // VTST.8 d0, d0, d0
+#define vtst_u8 vtst_s8
+
+_NEON2SSE_GLOBAL uint16x4_t vtst_u16(uint16x4_t a,  uint16x4_t b); // VTST.16 d0, d0, d0
+#define vtst_u16 vtst_s16
+
+_NEON2SSE_GLOBAL uint32x2_t vtst_u32(uint32x2_t a,  uint32x2_t b); // VTST.32 d0, d0, d0
+#define vtst_u32 vtst_s32
+
+
+_NEON2SSE_GLOBAL uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0
+#define vtst_p8 vtst_u8
+
+_NEON2SSESTORAGE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0
+_NEON2SSE_INLINE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b) // VTST.8 q0, q0, q0
+{
+    __m128i zero, one, res;
+    zero = _mm_setzero_si128 ();
+    one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
+    res = _mm_and_si128 (a, b);
+    res =  _mm_cmpeq_epi8 (res, zero);
+    return _mm_xor_si128(res, one); //invert result
+}
+
+_NEON2SSESTORAGE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0
+_NEON2SSE_INLINE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b) // VTST.16 q0, q0, q0
+{
+    __m128i zero, one, res;
+    zero = _mm_setzero_si128 ();
+    one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
+    res = _mm_and_si128 (a, b);
+    res =  _mm_cmpeq_epi16 (res, zero);
+    return _mm_xor_si128(res, one); //invert result
+}
+
+_NEON2SSESTORAGE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b) // VTST.32 q0, q0, q0
+{
+    __m128i zero, one, res;
+    zero = _mm_setzero_si128 ();
+    one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
+    res = _mm_and_si128 (a, b);
+    res =  _mm_cmpeq_epi32 (res, zero);
+    return _mm_xor_si128(res, one); //invert result
+}
+
+_NEON2SSE_GLOBAL uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0
+#define vtstq_u8 vtstq_s8
+
+_NEON2SSE_GLOBAL uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0
+#define vtstq_u16 vtstq_s16
+
+_NEON2SSE_GLOBAL uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0
+#define vtstq_u32 vtstq_s32
+
+_NEON2SSE_GLOBAL uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0
+#define vtstq_p8 vtstq_u8
+
+//****************** Absolute difference ********************
+//*** Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |*****
+//************************************************************
+_NEON2SSESTORAGE int8x8_t vabd_s8(int8x8_t a,  int8x8_t b); // VABD.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vabd_s8(int8x8_t a,  int8x8_t b)
+{
+    int8x8_t res64;
+    return64(vabdq_s8(_pM128i(a), _pM128i(b)));
+}
+
+_NEON2SSESTORAGE int16x4_t vabd_s16(int16x4_t a,  int16x4_t b); // VABD.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vabd_s16(int16x4_t a,  int16x4_t b)
+{
+    int16x4_t res64;
+    return64(vabdq_s16(_pM128i(a), _pM128i(b)));
+}
+
+_NEON2SSESTORAGE int32x2_t vabd_s32(int32x2_t a,  int32x2_t b); // VABD.S32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vabd_s32(int32x2_t a,  int32x2_t b)
+{//need to deal with an intermediate overflow
+    int32x2_t res;
+    res.m64_i32[0] = (a.m64_i32[0] > b.m64_i32[0]) ? a.m64_i32[0] -  b.m64_i32[0]: b.m64_i32[0] -  a.m64_i32[0];
+    res.m64_i32[1] = (a.m64_i32[1] > b.m64_i32[1]) ? a.m64_i32[1] -  b.m64_i32[1]: b.m64_i32[1] -  a.m64_i32[1];
+    return res;
+}
+
+_NEON2SSESTORAGE uint8x8_t vabd_u8(uint8x8_t a,  uint8x8_t b); // VABD.U8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vabd_u8(uint8x8_t a,  uint8x8_t b)
+{
+    uint8x8_t res64;
+    return64(vabdq_u8(_pM128i(a), _pM128i(b)));
+}
+
+_NEON2SSESTORAGE uint16x4_t vabd_u16(uint16x4_t a,  uint16x4_t b); // VABD.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vabd_u16(uint16x4_t a,  uint16x4_t b)
+{
+    uint16x4_t res64;
+    return64(vabdq_u16(_pM128i(a), _pM128i(b)));
+}
+
+_NEON2SSESTORAGE uint32x2_t vabd_u32(uint32x2_t a,  uint32x2_t b); // VABD.U32 d0,d0,d0
+_NEON2SSE_INLINE uint32x2_t vabd_u32(uint32x2_t a,  uint32x2_t b)
+{
+    uint32x2_t res64;
+    return64(vabdq_u32(_pM128i(a), _pM128i(b)));
+}
+
+_NEON2SSESTORAGE float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0
+_NEON2SSE_INLINE float32x2_t vabd_f32(float32x2_t a, float32x2_t b)
+{
+    float32x4_t res;
+    __m64_128 res64;
+    res = vabdq_f32(_pM128(a), _pM128(b));
+    _M64f(res64, res);
+    return res64;
+}
+
+_NEON2SSESTORAGE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b) // VABD.S8 q0,q0,q0
+{ //need to deal with an intermediate overflow
+   __m128i cmp, difab, difba;
+   cmp = vcgtq_s8(a,b);
+   difab = _mm_sub_epi8(a,b);
+   difba = _mm_sub_epi8(b,a);
+   difab = _mm_and_si128(cmp, difab);
+   difba = _mm_andnot_si128(cmp, difba);
+   return _mm_or_si128(difab, difba);
+}
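+
+// Why the compare/mask selection above instead of a plain abs(a - b): the true difference may not
+// fit in the element. For int8 a = -100, b = 120 the exact |a - b| is 220; a - b wraps to +36, so
+// _mm_abs_epi8 of it would give 36, while selecting the positive-direction difference (here b - a)
+// gives 220 truncated to 0xdc, which matches the wrapped NEON result.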
+
+_NEON2SSESTORAGE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b) // VABD.S16 q0,q0,q0
+{//need to deal with an intermediate overflow
+    __m128i cmp, difab, difba;
+    cmp = vcgtq_s16(a,b);
+    difab = _mm_sub_epi16(a,b);
+    difba = _mm_sub_epi16 (b,a);
+    difab = _mm_and_si128(cmp, difab);
+    difba = _mm_andnot_si128(cmp, difba);
+    return _mm_or_si128(difab, difba);
+}
+
+_NEON2SSESTORAGE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0
+_NEON2SSE_INLINE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b) // VABD.S32 q0,q0,q0
+{//need to deal with an intermediate overflow
+    __m128i cmp, difab, difba;
+    cmp = vcgtq_s32(a,b);
+    difab = _mm_sub_epi32(a,b);
+    difba = _mm_sub_epi32(b,a);
+    difab = _mm_and_si128(cmp, difab);
+    difba = _mm_andnot_si128(cmp, difba);
+    return _mm_or_si128(difab, difba);
+}
+
+_NEON2SSESTORAGE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0
+_NEON2SSE_INLINE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b) //no abs for unsigned
+{
+    __m128i  difab, difba;
+    difab = _mm_subs_epu8(a,b);
+    difba = _mm_subs_epu8 (b,a);
+    return _mm_or_si128(difab, difba);
+}
+
+_NEON2SSESTORAGE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.s16 q0,q0,q0
+_NEON2SSE_INLINE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b)
+{
+    __m128i difab, difba;
+    difab = _mm_subs_epu16(a,b);
+    difba = _mm_subs_epu16 (b,a);
+    return _mm_or_si128(difab, difba);
+}
+
+_NEON2SSESTORAGE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0
+_NEON2SSE_INLINE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b)
+{
+    __m128i cmp, difab, difba;
+    cmp = vcgtq_u32(a,b);
+    difab = _mm_sub_epi32(a,b);
+    difba = _mm_sub_epi32 (b,a);
+    difab = _mm_and_si128(cmp, difab);
+    difba = _mm_andnot_si128(cmp, difba);
+    return _mm_or_si128(difab, difba);
+}
+
+_NEON2SSESTORAGE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0
+_NEON2SSE_INLINE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b) // VABD.F32 q0,q0,q0
+{
+    __m128i c1;
+    __m128 res;
+    c1 =  _mm_set1_epi32(0x7fffffff);
+    res = _mm_sub_ps (a, b);
+    return _mm_and_ps (res, *(__m128*)&c1);
+}
+
+//************  Absolute difference - long **************************
+//********************************************************************
+_NEON2SSESTORAGE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0
+_NEON2SSE_INLINE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b) // VABDL.S8 q0,d0,d0
+{
+    __m128i a16, b16;
+    a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
+    b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
+    return vabdq_s16(a16, b16);
+
+}
+
+_NEON2SSESTORAGE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0
+_NEON2SSE_INLINE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b) // VABDL.S16 q0,d0,d0
+{
+    __m128i a32, b32;
+    a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
+    b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
+    return vabdq_s32(a32, b32);
+}
+
+_NEON2SSESTORAGE int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabdl_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL)
+{
+    //no optimal SIMD solution, serial looks faster
+    _NEON2SSE_ALIGN_16 int64_t res[2];
+    if(a.m64_i32[0] > b.m64_i32[0]) res[0] = ( int64_t) a.m64_i32[0] - ( int64_t) b.m64_i32[0];
+    else res[0] = ( int64_t) b.m64_i32[0] - ( int64_t) a.m64_i32[0];
+    if(a.m64_i32[1] > b.m64_i32[1]) res[1] = ( int64_t) a.m64_i32[1] - ( int64_t) b.m64_i32[1];
+    else res[1] = ( int64_t) b.m64_i32[1] - ( int64_t) a.m64_i32[1];
+    return _mm_load_si128((__m128i*)res);
+}
+
+_NEON2SSESTORAGE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0
+_NEON2SSE_INLINE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b)
+{
+    __m128i res;
+    res = vsubl_u8(a,b);
+    return _mm_abs_epi16(res);
+}
+
+_NEON2SSESTORAGE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.s16 q0,d0,d0
+_NEON2SSE_INLINE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b)
+{
+    __m128i res;
+    res = vsubl_u16(a,b);
+    return _mm_abs_epi32(res);
+}
+
+_NEON2SSESTORAGE uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    _NEON2SSE_ALIGN_16 uint64_t res[2];
+    if(a.m64_u32[0] > b.m64_u32[0]) res[0] = ( uint64_t) a.m64_u32[0] - ( uint64_t) b.m64_u32[0];
+    else res[0] = ( uint64_t) b.m64_u32[0] - ( uint64_t) a.m64_u32[0];
+    if(a.m64_u32[1] > b.m64_u32[1]) res[1] = ( uint64_t) a.m64_u32[1] - ( uint64_t) b.m64_u32[1];
+    else res[1] = ( uint64_t) b.m64_u32[1] - ( uint64_t) a.m64_u32[1];
+    return _mm_load_si128((__m128i*)res);
+}
+
+//**********Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] | *************
+//*********************************************************************************************
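+// For example, with a = 10, b = 3 and c = 8 each result lane is 10 + |3 - 8| = 15.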
+_NEON2SSESTORAGE int8x8_t vaba_s8(int8x8_t a,  int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vaba_s8(int8x8_t a,  int8x8_t b, int8x8_t c)
+{
+    int8x8_t res64;
+    return64(vabaq_s8(_pM128i(a),_pM128i(b), _pM128i(c)));
+}
+
+_NEON2SSESTORAGE int16x4_t vaba_s16(int16x4_t a,  int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vaba_s16(int16x4_t a,  int16x4_t b, int16x4_t c)
+{
+    int16x4_t res64;
+    return64(vabaq_s16(_pM128i(a), _pM128i(b), _pM128i(c)));
+}
+
+_NEON2SSESTORAGE int32x2_t vaba_s32(int32x2_t a,  int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vaba_s32(int32x2_t a,  int32x2_t b, int32x2_t c)
+{
+    int32x2_t res64;
+    return64(vabaq_s32(_pM128i(a), _pM128i(b), _pM128i(c)));
+}
+
+_NEON2SSESTORAGE uint8x8_t vaba_u8(uint8x8_t a,  uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vaba_u8(uint8x8_t a,  uint8x8_t b, uint8x8_t c)
+{
+    int8x8_t res64;
+    return64(vabaq_u8(_pM128i(a),_pM128i(b), _pM128i(c)));
+}
+
+
+_NEON2SSESTORAGE uint16x4_t vaba_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t c); // VABA.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vaba_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t c)
+{
+    int16x4_t res64;
+    return64(vabaq_u16(_pM128i(a), _pM128i(b), _pM128i(c)));
+}
+
+_NEON2SSESTORAGE uint32x2_t vaba_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0
+_NEON2SSE_INLINE uint32x2_t vaba_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t c)
+{
+    uint32x2_t res64;
+    return64(vabaq_u32(_pM128i(a), _pM128i(b), _pM128i(c)));
+}
+
+_NEON2SSESTORAGE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VABA.S8 q0,q0,q0
+{
+    int8x16_t sub;
+    sub = vabdq_s8(b, c);
+    return vaddq_s8( a, sub);
+}
+
+_NEON2SSESTORAGE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VABA.S16 q0,q0,q0
+{
+    int16x8_t sub;
+    sub = vabdq_s16(b, c);
+    return vaddq_s16( a, sub);
+}
+
+_NEON2SSESTORAGE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0
+_NEON2SSE_INLINE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VABA.S32 q0,q0,q0
+{
+    int32x4_t sub;
+    sub = vabdq_s32(b, c);
+    return vaddq_s32( a, sub);
+}
+
+_NEON2SSESTORAGE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0
+_NEON2SSE_INLINE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c)
+{
+    uint8x16_t sub;
+    sub = vabdq_u8(b, c);
+    return vaddq_u8( a, sub);
+}
+
+_NEON2SSESTORAGE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.s16 q0,q0,q0
+_NEON2SSE_INLINE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c)
+{
+    uint16x8_t sub;
+    sub = vabdq_u16(b, c);
+    return vaddq_u16( a, sub);
+}
+
+_NEON2SSESTORAGE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0
+_NEON2SSE_INLINE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c)
+{
+    uint32x4_t sub;
+    sub = vabdq_u32(b, c);
+    return vaddq_u32( a, sub);
+}
+
+//************** Absolute difference and accumulate - long ********************************
+//*************************************************************************************
+_NEON2SSESTORAGE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0
+_NEON2SSE_INLINE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VABAL.S8 q0,d0,d0
+{
+    __m128i b16, c16, res;
+    b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
+    c16 = _MM_CVTEPI8_EPI16 (_pM128i(c)); //SSE4.1,
+    res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) );
+    return _mm_add_epi16 (a, res);
+}
+
+_NEON2SSESTORAGE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0
+_NEON2SSE_INLINE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VABAL.S16 q0,d0,d0
+{
+    __m128i b32, c32, res;
+    b32 = _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1
+    c32 = _MM_CVTEPI16_EPI32(_pM128i(c)); //SSE4.1
+    res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) );
+    return _mm_add_epi32 (a, res);
+}
+
+_NEON2SSESTORAGE int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    __m128i res;
+    res = vabdl_s32(b,c);
+    return _mm_add_epi64(a, res);
+}
+
+_NEON2SSESTORAGE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0
+_NEON2SSE_INLINE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c)
+{
+    __m128i b16, c16, res;
+    b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
+    c16 = _MM_CVTEPU8_EPI16 (_pM128i(c)); //SSE4.1,
+    res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) );
+    return _mm_add_epi16 (a, res);
+}
+
+_NEON2SSESTORAGE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.s16 q0,d0,d0
+_NEON2SSE_INLINE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c)
+{
+    __m128i b32, c32, res;
+    b32 = _MM_CVTEPU16_EPI32(_pM128i(b)); //SSE4.1
+    c32 = _MM_CVTEPU16_EPI32(_pM128i(c)); //SSE4.1
+    res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) );
+    return _mm_add_epi32 (a, res);
+}
+
+_NEON2SSESTORAGE uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    __m128i res;
+    res = vabdl_u32(b,c);
+    return _mm_add_epi64(a, res);
+}
+
+//***********************************************************************************
+//****************  Maximum and minimum operations **********************************
+//***********************************************************************************
+//************* Maximum:  vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i]    *******
+//***********************************************************************************
+_NEON2SSESTORAGE int8x8_t   vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t   vmax_s8(int8x8_t a, int8x8_t b)
+{
+    int8x8_t res64;
+    __m128i res;
+    res = _MM_MAX_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
+    return64(res);
+}
+
+_NEON2SSESTORAGE int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vmax_s16(int16x4_t a, int16x4_t b)
+{
+    int16x4_t res64;
+    return64(_mm_max_epi16(_pM128i(a),_pM128i(b)));
+}
+
+_NEON2SSESTORAGE int32x2_t   vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t   vmax_s32(int32x2_t a, int32x2_t b)
+{
+    int32x2_t res64;
+    __m128i res;
+    res =  _MM_MAX_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
+    return64(res);
+}
+
+_NEON2SSESTORAGE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b)
+{
+    uint8x8_t res64;
+    return64(_mm_max_epu8(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b)
+{
+    uint16x4_t res64;
+    return64(_MM_MAX_EPU16(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint32x2_t   vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0
+_NEON2SSE_INLINE uint32x2_t   vmax_u32(uint32x2_t a, uint32x2_t b)
+{
+    uint32x2_t res64;
+    __m128i res;
+    res = _MM_MAX_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits, may not be efficient compared with serial
+    return64(res);
+}
+
+_NEON2SSESTORAGE float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0
+_NEON2SSE_INLINE float32x2_t vmax_f32(float32x2_t a, float32x2_t b)
+{
+    //serial solution looks faster than  SIMD one
+    float32x2_t res;
+    res.m64_f32[0] = (a.m64_f32[0] > b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0];
+    res.m64_f32[1] = (a.m64_f32[1] > b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1];
+    return res;
+}
+
+_NEON2SSE_GLOBAL int8x16_t   vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0
+#define vmaxq_s8 _MM_MAX_EPI8 //SSE4.1
+
+_NEON2SSE_GLOBAL int16x8_t   vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0
+#define vmaxq_s16 _mm_max_epi16
+
+_NEON2SSE_GLOBAL int32x4_t   vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0
+#define vmaxq_s32 _MM_MAX_EPI32 //SSE4.1
+
+_NEON2SSE_GLOBAL uint8x16_t   vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0
+#define vmaxq_u8 _mm_max_epu8
+
+_NEON2SSE_GLOBAL uint16x8_t   vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.s16 q0,q0,q0
+#define vmaxq_u16 _MM_MAX_EPU16 //SSE4.1
+
+_NEON2SSE_GLOBAL uint32x4_t   vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0
+#define vmaxq_u32 _MM_MAX_EPU32 //SSE4.1
+
+
+_NEON2SSE_GLOBAL float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0
+#define vmaxq_f32 _mm_max_ps
+
+
+_NEON2SSE_GLOBAL float64x2_t vmaxq_f64(float64x2_t a, float64x2_t b); // VMAX.F64 q0,q0,q0
+#define vmaxq_f64 _mm_max_pd
+
+
+//*************** Minimum: vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i] ********************************
+//***********************************************************************************************************
+_NEON2SSESTORAGE int8x8_t   vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t   vmin_s8(int8x8_t a, int8x8_t b)
+{
+    int8x8_t res64;
+    __m128i res;
+    res = _MM_MIN_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
+    return64(res);
+}
+
+_NEON2SSESTORAGE int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vmin_s16(int16x4_t a, int16x4_t b)
+{
+    int16x4_t res64;
+    return64(_mm_min_epi16(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int32x2_t   vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t   vmin_s32(int32x2_t a, int32x2_t b)
+{
+    int32x2_t res64;
+    __m128i res;
+    res = _MM_MIN_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
+    return64(res);
+}
+
+_NEON2SSESTORAGE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b)
+{
+    uint8x8_t res64;
+    return64(_mm_min_epu8(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b)
+{
+    uint16x4_t res64;
+    return64(_MM_MIN_EPU16(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint32x2_t   vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0
+_NEON2SSE_INLINE uint32x2_t   vmin_u32(uint32x2_t a, uint32x2_t b)
+{
+    uint32x2_t res64;
+    __m128i res;
+    res = _MM_MIN_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits, may not be efficient compared with serial
+    return64(res);
+}
+
+_NEON2SSESTORAGE float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0
+_NEON2SSE_INLINE float32x2_t vmin_f32(float32x2_t a, float32x2_t b)
+{
+    //serial solution looks faster than  SIMD one
+    float32x2_t res;
+    res.m64_f32[0] = (a.m64_f32[0] < b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0];
+    res.m64_f32[1] = (a.m64_f32[1] < b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1];
+    return res;
+}
+
+_NEON2SSE_GLOBAL int8x16_t   vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0
+#define vminq_s8 _MM_MIN_EPI8 //SSE4.1
+
+_NEON2SSE_GLOBAL int16x8_t   vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0
+#define vminq_s16 _mm_min_epi16
+
+_NEON2SSE_GLOBAL int32x4_t   vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0
+#define vminq_s32 _MM_MIN_EPI32 //SSE4.1
+
+_NEON2SSE_GLOBAL uint8x16_t   vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0
+#define vminq_u8 _mm_min_epu8
+
+_NEON2SSE_GLOBAL uint16x8_t   vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.s16 q0,q0,q0
+#define vminq_u16 _MM_MIN_EPU16 //SSE4.1
+
+_NEON2SSE_GLOBAL uint32x4_t   vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0
+#define vminq_u32 _MM_MIN_EPU32 //SSE4.1
+
+_NEON2SSE_GLOBAL float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0
+#define vminq_f32 _mm_min_ps
+
+
+_NEON2SSE_GLOBAL float64x2_t vminq_f64(float64x2_t a, float64x2_t b); // VMIN.F64 q0,q0,q0
+#define vminq_f64 _mm_min_pd
+
+
+//*************  Pairwise addition operations. **************************************
+//************************************************************************************
+//Pairwise add - adds adjacent pairs of elements of two vectors, and places the results in the destination vector
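+// For example, vpadd_s16(a, b) with a = {1, 2, 3, 4} and b = {10, 20, 30, 40} produces
+// {1+2, 3+4, 10+20, 30+40} = {3, 7, 30, 70}: the low half holds the pair sums of a,
+// the high half the pair sums of b.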
+_NEON2SSESTORAGE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b) // VPADD.I8 d0,d0,d0
+{
+    //no 8 bit hadd in IA32, need to go to 16 bit and then pack
+    int8x8_t res64;
+    __m128i a16, b16, res;
+    a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
+    b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1
+    res = _mm_hadd_epi16 (a16, b16);
+    res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit, use low 64 bits
+    return64(res);
+}
+
+_NEON2SSESTORAGE int16x4_t   vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t   vpadd_s16(int16x4_t a, int16x4_t b)
+{
+    int16x4_t res64;
+    __m128i hadd128;
+    hadd128 = _mm_hadd_epi16 (_pM128i(a), _pM128i(b));
+    hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
+    return64(hadd128);
+}
+
+
+_NEON2SSESTORAGE int32x2_t   vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t   vpadd_s32(int32x2_t a, int32x2_t b)
+{
+    int32x2_t res64;
+    __m128i hadd128;
+    hadd128 = _mm_hadd_epi32 (_pM128i(a), _pM128i(b));
+    hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
+    return64(hadd128);
+}
+
+
+_NEON2SSESTORAGE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b) // VPADD.I8 d0,d0,d0
+{
+    //  no 8 bit hadd in IA32, need to go to 16 bit and then pack
+    uint8x8_t res64;
+//  no unsigned _mm_hadd_ functions in IA32, but unsigned 8-bit values fit in signed 16-bit lanes, so it works
+    __m128i mask8, a16, b16, res;
+    mask8 = _mm_set1_epi16(0xff);
+    a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1
+    b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1
+    res = _mm_hadd_epi16 (a16, b16);
+    res = _mm_and_si128(res, mask8); //to avoid saturation
+    res = _mm_packus_epi16 (res,res); //use low 64 bits
+    return64(res);
+}
+
+_NEON2SSESTORAGE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b) // VPADD.I16 d0,d0,d0
+{
+    // this solution may not be optimal; serial execution may be faster
+    // no unsigned _mm_hadd_ functions in IA32, need to move from unsigned to signed
+    uint16x4_t res64;
+    __m128i c32767,  cfffe, as, bs, res;
+    c32767 = _mm_set1_epi16 (32767);
+    cfffe = _mm_set1_epi16 (-2); //(int16_t)0xfffe
+    as = _mm_sub_epi16 (_pM128i(a), c32767);
+    bs = _mm_sub_epi16 (_pM128i(b), c32767);
+    res = _mm_hadd_epi16 (as, bs);
+    res = _mm_add_epi16 (res, cfffe);
+    res = _mm_shuffle_epi32 (res, 0 | (2 << 2) | (1 << 4) | (3 << 6));
+    return64(res);
+}
+
+_NEON2SSESTORAGE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0
+_NEON2SSE_INLINE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b) //serial may be faster
+{
+    //hadd doesn't work for unsigned values
+    uint32x2_t res64;
+    __m128i ab, ab_sh, res;
+    ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //a0 a1 b0 b1
+    ab_sh = _mm_shuffle_epi32(ab, 1 | (0 << 2) | (3 << 4) | (2 << 6)); //a1, a0, b1, b0
+    res = _mm_add_epi32(ab, ab_sh);
+    res = _mm_shuffle_epi32(res, 0 | (2 << 2) | (1 << 4) | (3 << 6));
+    return64(res);
+}
+
+_NEON2SSESTORAGE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b); // VPADD.F32 d0,d0,d0
+_NEON2SSE_INLINE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b)
+{
+    __m128 hadd128;
+    __m64_128 res64;
+    hadd128 = _mm_hadd_ps (_pM128(a), _pM128(b));
+    hadd128 = _mm_shuffle_ps (hadd128, hadd128, _MM_SHUFFLE(3,1, 2, 0)); //use low 64 bits
+    _M64f(res64, hadd128);
+    return res64;
+}
+
+
+//**************************  Long pairwise add  **********************************
+//*********************************************************************************
+//Adds adjacent pairs of elements of a vector, sign- or zero-extends the results to twice their original width,
+// and places the final results in the destination vector.
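+// For example, vpaddl_u8 on {250, 10, 1, 2, ...} produces 16-bit lanes {260, 3, ...};
+// widening is what keeps sums such as 250 + 10 from wrapping in the original element size.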
+
+_NEON2SSESTORAGE int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0
+_NEON2SSE_INLINE int16x4_t vpaddl_s8(int8x8_t a) // VPADDL.S8 d0,d0
+{
+    //no 8 bit hadd in IA32, need to go to 16 bit anyway
+    __m128i a16;
+    int16x4_t res64;
+    a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
+    a16 = _mm_hadd_epi16 (a16,  a16); //use low 64 bits
+    return64(a16);
+}
+
+_NEON2SSESTORAGE int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0
+_NEON2SSE_INLINE int32x2_t vpaddl_s16(int16x4_t a) // VPADDL.S16 d0,d0
+{
+    // this solution may not be optimal; serial execution may be faster
+    int32x2_t res64;
+    __m128i r32_1;
+    r32_1 = _MM_CVTEPI16_EPI32 (_pM128i(a));
+    r32_1 = _mm_hadd_epi32(r32_1, r32_1); //use low 64 bits
+    return64(r32_1);
+}
+
+_NEON2SSESTORAGE int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vpaddl_s32(int32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster
+{
+    int64x1_t res;
+    res.m64_i64[0] = (int64_t)a.m64_i32[0] + (int64_t)a.m64_i32[1];
+    return res;
+}
+
+_NEON2SSESTORAGE uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0
+_NEON2SSE_INLINE uint16x4_t vpaddl_u8(uint8x8_t a) // VPADDL.U8 d0,d0
+{
+    //  no 8 bit hadd in IA32, need to go to 16 bit
+//  no unsigned _mm_hadd_ functions in IA32, but unsigned 8-bit values fit in signed 16-bit lanes, so it works
+    uint16x4_t res64;
+    __m128i a16;
+    a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits
+    a16 = _mm_hadd_epi16 (a16, a16); //use low 64 bits
+    return64(a16);
+}
+
+_NEON2SSESTORAGE uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.s16 d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpaddl_u16(uint16x4_t a),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    //serial solution looks faster than a SIMD one
+    uint32x2_t res;
+    res.m64_u32[0] = (uint32_t)a.m64_u16[0] + (uint32_t)a.m64_u16[1];
+    res.m64_u32[1] = (uint32_t)a.m64_u16[2] + (uint32_t)a.m64_u16[3];
+    return res;
+}
+
+_NEON2SSESTORAGE uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vpaddl_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster
+{
+    uint64x1_t res;
+    res.m64_u64[0] = (uint64_t)a.m64_u32[0] + (uint64_t)a.m64_u32[1];
+    return res;
+}
+
+_NEON2SSESTORAGE int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0
+_NEON2SSE_INLINE int16x8_t vpaddlq_s8(int8x16_t a) // VPADDL.S8 q0,q0
+{
+    //no 8 bit hadd in IA32, need to go to 16 bit
+    __m128i r16_1, r16_2;
+    r16_1 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1
+    //swap hi and low part of r to process the remaining data
+    r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
+    r16_2 = _MM_CVTEPI8_EPI16 (r16_2);
+    return _mm_hadd_epi16 (r16_1, r16_2);
+}
+
+_NEON2SSESTORAGE int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0
+_NEON2SSE_INLINE int32x4_t vpaddlq_s16(int16x8_t a) // VPADDL.S16 q0,q0
+{
+    //widen 16 bit to 32 bit, then use the 32-bit hadd
+    __m128i r32_1, r32_2;
+    r32_1 = _MM_CVTEPI16_EPI32(a);
+    //swap hi and low part of r to process the remaining data
+    r32_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
+    r32_2 = _MM_CVTEPI16_EPI32 (r32_2);
+    return _mm_hadd_epi32 (r32_1, r32_2);
+}
+
+_NEON2SSESTORAGE int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0
+_NEON2SSE_INLINE int64x2_t vpaddlq_s32(int32x4_t a)
+{
+    __m128i top, bot;
+    bot = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
+    bot = _MM_CVTEPI32_EPI64(bot);
+    top = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 1));
+    top = _MM_CVTEPI32_EPI64(top);
+    return _mm_add_epi64(top, bot);
+}
+
+_NEON2SSESTORAGE uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0
+_NEON2SSE_INLINE uint16x8_t vpaddlq_u8(uint8x16_t a) // VPADDL.U8 q0,q0
+{
+    const __m128i ff = _mm_set1_epi16(0xFF);
+    __m128i low = _mm_and_si128(a, ff);
+    __m128i high = _mm_srli_epi16(a, 8);
+    return _mm_add_epi16(low, high);
+}
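+
+// In vpaddlq_u8 above, each 16-bit lane already holds one even/odd byte pair, so masking with
+// 0x00ff keeps the even byte, the 8-bit right shift isolates the odd byte, and the 16-bit add is
+// the widened pairwise sum; for example the byte pair {250, 10} becomes 260 in its lane.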
+
+#ifdef USE_SSE4
+_NEON2SSESTORAGE uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.s16 q0,q0
+_NEON2SSE_INLINE uint32x4_t vpaddlq_u16(uint16x8_t a)
+{
+    const __m128i zero = _mm_setzero_si128();
+    __m128i low = _mm_blend_epi16(zero, a, 0x55); // 0b01010101
+    __m128i high = _mm_srli_epi32(a, 16);
+    return _mm_add_epi32(low, high);
+}
+
+_NEON2SSESTORAGE uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
+_NEON2SSE_INLINE uint64x2_t vpaddlq_u32(uint32x4_t a)
+{
+    const __m128i zero = _mm_setzero_si128();
+    __m128i low = _mm_blend_epi16(zero, a, 0x33); // 0b00110011
+    __m128i high = _mm_srli_epi64(a, 32);
+    return _mm_add_epi64(low, high);
+}
+#else
+_NEON2SSESTORAGE uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.s16 q0,q0
+_NEON2SSE_INLINE uint32x4_t vpaddlq_u16(uint16x8_t a)
+{
+    const __m128i ff = _mm_set1_epi32(0xFFFF);
+    __m128i low = _mm_and_si128(a, ff);
+    __m128i high = _mm_srli_epi32(a, 16);
+    return _mm_add_epi32(low, high);
+}
+
+_NEON2SSESTORAGE uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
+_NEON2SSE_INLINE uint64x2_t vpaddlq_u32(uint32x4_t a)
+{
+    const __m128i ff = _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF);
+    __m128i low = _mm_and_si128(a, ff);
+    __m128i high = _mm_srli_epi64(a, 32);
+    return _mm_add_epi64(low, high);
+}
+#endif
+
+//************************  Long pairwise add and accumulate **************************
+//****************************************************************************************
+//VPADAL (Vector Pairwise Add and Accumulate Long) adds adjacent pairs of elements of a vector,
+// and accumulates the results into the elements of the destination (wide) vector
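+// For example, vpadal_s16(a, b) with the 32-bit accumulator a = {100, 200} and the 16-bit
+// vector b = {1, 2, 3, 4} produces {100 + (1+2), 200 + (3+4)} = {103, 207}.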
+_NEON2SSESTORAGE int16x4_t vpadal_s8(int16x4_t a,  int8x8_t b); // VPADAL.S8 d0,d0
+_NEON2SSE_INLINE int16x4_t vpadal_s8(int16x4_t a,  int8x8_t b)
+{
+    int16x4_t res64;
+    return64(vpadalq_s8(_pM128i(a), _pM128i(b)));
+}
+
+_NEON2SSESTORAGE int32x2_t vpadal_s16(int32x2_t a,  int16x4_t b); // VPADAL.S16 d0,d0
+_NEON2SSE_INLINE int32x2_t vpadal_s16(int32x2_t a,  int16x4_t b)
+{
+    int32x2_t res64;
+    return64(vpadalq_s16(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0
+_NEON2SSE_INLINE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b)
+{
+    int64x1_t res;
+    res.m64_i64[0] = (int64_t)b.m64_i32[0] + (int64_t)b.m64_i32[1] + a.m64_i64[0];
+    return res;
+}
+
+_NEON2SSESTORAGE uint16x4_t vpadal_u8(uint16x4_t a,  uint8x8_t b); // VPADAL.U8 d0,d0
+_NEON2SSE_INLINE uint16x4_t vpadal_u8(uint16x4_t a,  uint8x8_t b)
+{
+    uint16x4_t res64;
+    return64(vpadalq_u8(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint32x2_t vpadal_u16(uint32x2_t a,  uint16x4_t b); // VPADAL.s16 d0,d0
+_NEON2SSE_INLINE uint32x2_t vpadal_u16(uint32x2_t a,  uint16x4_t b)
+{
+    uint32x2_t res64;
+    return64(vpadalq_u16(_pM128i(a), _pM128i(b)));
+}
+
+_NEON2SSESTORAGE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0
+_NEON2SSE_INLINE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b)
+{
+    uint64x1_t res;
+    res.m64_u64[0] = (uint64_t)b.m64_u32[0] + (uint64_t)b.m64_u32[1] + a.m64_u64[0];
+    return res;
+}
+
+_NEON2SSESTORAGE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0
+_NEON2SSE_INLINE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b) // VPADAL.S8 q0,q0
+{
+    int16x8_t pad;
+    pad = vpaddlq_s8(b);
+    return _mm_add_epi16 (a, pad);
+}
+
+_NEON2SSESTORAGE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0
+_NEON2SSE_INLINE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b) // VPADAL.S16 q0,q0
+{
+    int32x4_t pad;
+    pad = vpaddlq_s16(b);
+    return _mm_add_epi32(a, pad);
+}
+
+_NEON2SSESTORAGE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0
+_NEON2SSE_INLINE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b)
+{
+    int64x2_t pad;
+    pad = vpaddlq_s32(b);
+    return _mm_add_epi64 (a, pad);
+}
+
+_NEON2SSESTORAGE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0
+_NEON2SSE_INLINE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b) // VPADAL.U8 q0,q0
+{
+    uint16x8_t pad;
+    pad = vpaddlq_u8(b);
+    return _mm_add_epi16 (a, pad);
+}
+
+_NEON2SSESTORAGE uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.s16 q0,q0
+_NEON2SSE_INLINE uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b)
+{
+    uint32x4_t pad;
+    pad = vpaddlq_u16(b);
+    return _mm_add_epi32(a, pad);
+}
+
+_NEON2SSESTORAGE uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0
+_NEON2SSE_INLINE uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b)
+{
+    uint64x2_t pad;
+    pad = vpaddlq_u32(b);
+    return _mm_add_epi64(a, pad);
+}
+
+//**********  Folding maximum   *************************************
+//*******************************************************************
+//VPMAX (Vector Pairwise Maximum) compares adjacent pairs of elements in two vectors,
+//and copies the larger of each pair into the corresponding element in the destination
+//    no corresponding functionality in IA32 SIMD, so we need to do the vertical comparison
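+// For example, vpmax_s8(a, b) yields {max(a0,a1), max(a2,a3), max(a4,a5), max(a6,a7),
+// max(b0,b1), max(b2,b3), max(b4,b5), max(b6,b7)}; the pair-swap shuffle, vertical max and
+// odd-lane extraction below reconstruct exactly this layout.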
+_NEON2SSESTORAGE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b) // VPMAX.S8 d0,d0,d0
+{
+    int8x8_t res64;
+    __m128i ab, ab1, max;
+    _NEON2SSE_ALIGN_16 static const uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5,  4,  7,  6,    9,    8,   11,   10,   13,   12,   15,   14};
+    _NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+    ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
+    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical max finding
+    max = _MM_MAX_EPI8 (ab, ab1); // SSE4.1
+    max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data
+    return64(max); //we need 64 bits only
+}
+
+_NEON2SSESTORAGE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b) // VPMAX.S16 d0,d0,d0
+{
+    //this solution may not be optimal compared with the serial one
+    int16x4_t res64;
+    __m128i ab, ab1, max;
+    _NEON2SSE_ALIGN_16 static const int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is considered to be a 16-bit number
+    ab = _mm_unpacklo_epi64 ( _pM128i(a),  _pM128i(b)); //ab
+    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical max finding, use the 8-bit fn and the corresponding mask
+    max = _mm_max_epi16 (ab, ab1);
+    max =  _mm_shuffle_epi8 (max, *(__m128i*)  mask8_32_even_odd); //remove repetitive data, only the low part of mask is used
+    return64(max);
+}
+
+_NEON2SSESTORAGE int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmax_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    //serial solution looks faster than SIMD one
+    int32x2_t res;
+    res.m64_i32[0] = (a.m64_i32[0] < a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0];
+    res.m64_i32[1] = (b.m64_i32[0] < b.m64_i32[1]) ? b.m64_i32[1] : b.m64_i32[0];
+    return res;
+}
+
+_NEON2SSESTORAGE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b) // VPMAX.U8 d0,d0,d0
+{
+    uint8x8_t res64;
+    __m128i ab, ab1, max;
+    _NEON2SSE_ALIGN_16 static const int8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+    _NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3,  5,  7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+    ab = _mm_unpacklo_epi64 (_pM128i(a), _pM128i(b)); //ab
+    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical max finding
+    max = _mm_max_epu8 (ab, ab1); // SSE4.1
+    max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data
+    return64(max);
+}
+
+_NEON2SSESTORAGE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b) // VPMAX.s16 d0,d0,d0
+{
+    //this solution may not be optimal compared with the serial one
+    uint16x4_t res64;
+    __m128i ab, ab1, max;
+    _NEON2SSE_ALIGN_16 static const uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is considered to be a 16-bit number
+    ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
+    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical max finding, use the 8-bit fn and the corresponding mask
+    max = _MM_MAX_EPU16 (ab, ab1);
+    max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_32_even_odd); //remove repetitive data, only the low part of mask is used
+    return64(max);
+}
+
+_NEON2SSESTORAGE uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    //serial solution looks faster than SIMD one
+    uint32x2_t res;
+    res.m64_u32[0] = (a.m64_u32[0] < a.m64_u32[1]) ? a.m64_u32[1] : a.m64_u32[0];
+    res.m64_u32[1] = (b.m64_u32[0] < b.m64_u32[1]) ? b.m64_u32[1] : b.m64_u32[0];
+    return res;
+}
+
+_NEON2SSESTORAGE float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmax_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    //serial solution looks faster than  SIMD one
+    float32x2_t res;
+    res.m64_f32[0] = (a.m64_f32[0] < a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0];
+    res.m64_f32[1] = (b.m64_f32[0] < b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0];
+    return res;
+}
+
+// ***************** Folding minimum  ****************************
+// **************************************************************
+//vpmin -> takes minimum of adjacent pairs
+_NEON2SSESTORAGE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b) // VPMIN.S8 d0,d0,d0
+{
+    int8x8_t res64;
+    __m128i ab, ab1, min;
+    _NEON2SSE_ALIGN_16 static const uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5,  4,  7,  6,    9,    8,   11,   10,   13,   12,   15,   14};
+    _NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+    ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
+    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical min finding
+    min =  _MM_MIN_EPI8 (ab, ab1); // SSE4.1
+    min =  _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data
+    return64(min);
+}
+
+_NEON2SSESTORAGE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b) // VPMIN.S16 d0,d0,d0
+{
+    //this solution may not be optimal compared with the serial one
+    int16x4_t res64;
+    __m128i ab, ab1, min;
+    _NEON2SSE_ALIGN_16 static const int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is considered to be a 16-bit number
+    ab = _mm_unpacklo_epi64 (  _pM128i(a),  _pM128i(b)); //ab
+    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical min finding, use the 8-bit fn and the corresponding mask
+    min = _mm_min_epi16 (ab, ab1);
+    min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_32_even_odd); //remove repetitive data, only the low part of mask is used
+    return64(min);
+}
+
+_NEON2SSESTORAGE int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmin_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    //serial solution looks faster than SIMD one
+    int32x2_t res;
+    res.m64_i32[0] = (a.m64_i32[0] > a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0];
+    res.m64_i32[1] = (b.m64_i32[0] > b.m64_i32[1]) ? b.m64_i32[1] : b.m64_i32[0];
+    return res;
+}
+
+_NEON2SSESTORAGE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b) // VPMIN.U8 d0,d0,d0
+{
+    uint8x8_t res64;
+    __m128i ab, ab1, min;
+    _NEON2SSE_ALIGN_16 static const uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5,  4,  7,  6,    9,    8,   11,   10,   13,   12,   15,   14};
+    _NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+    ab = _mm_unpacklo_epi64 (  _pM128i(a),  _pM128i(b)); //ab
+    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical min finding
+    min = _mm_min_epu8 (ab, ab1); // SSE4.1
+    min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data
+    return64(min);
+}
+
+_NEON2SSESTORAGE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b) // VPMIN.s16 d0,d0,d0
+{
+    //this solution may not be optimal compared with the serial one
+    uint16x4_t res64;
+    __m128i ab, ab1, min;
+    _NEON2SSE_ALIGN_16 static const uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is considered to be a 16-bit number
+    ab = _mm_unpacklo_epi64 ( _pM128i(a),  _pM128i(b)); //ab
+    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical min finding, use the 8-bit fn and the corresponding mask
+    min = _MM_MIN_EPU16 (ab, ab1);
+    min =    _mm_shuffle_epi8 (min, *(__m128i*) mask8_32_even_odd); //remove repetitive data, only the low part of mask is used
+    return64(min);
+}
+
+_NEON2SSESTORAGE uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    //serial solution looks faster than SIMD one
+    uint32x2_t res;
+    res.m64_u32[0] = (a.m64_u32[0] > a.m64_u32[1]) ? a.m64_u32[1] : a.m64_u32[0];
+    res.m64_u32[1] = (b.m64_u32[0] > b.m64_u32[1]) ? b.m64_u32[1] : b.m64_u32[0];
+    return res;
+}
+
+_NEON2SSESTORAGE float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmin_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    //serial solution looks faster than SIMD one
+    float32x2_t res;
+    res.m64_f32[0] = (a.m64_f32[0] > a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0];
+    res.m64_f32[1] = (b.m64_f32[0] > b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0];
+    return res;
+}
+
+//***************************************************************
+//***********  Reciprocal/Sqrt ************************************
+//***************************************************************
+//****************** Reciprocal estimate *******************************
+//the ARM NEON and x86 SIMD results may be slightly different
+_NEON2SSESTORAGE float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0
+_NEON2SSE_INLINE float32x2_t vrecpe_f32(float32x2_t a) //use low 64 bits
+{
+    float32x4_t res;
+    __m64_128 res64;
+    res = _mm_rcp_ps(_pM128(a));
+    _M64f(res64, res);
+    return res64;
+}
+
+_NEON2SSESTORAGE uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrecpe_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    //Input is a fixed-point number. No integer reciprocal is available in IA32.
+    uint32x2_t res;
+    float resf, r;
+    int i, q, s;
+    for (i =0; i<2; i++){
+        if((a.m64_u32[i] & 0x80000000) == 0) {
+            res.m64_u32[i] = 0xffffffff;
+        }else{
+            resf =  (float) (a.m64_u32[i] * (0.5f / (uint32_t)(1 << 31)));
+            q = (int)(resf * 512.0f); /* a in units of 1/512 rounded down */
+            r = (float)(1.0f / (((float)q + 0.5f) / 512.0f)); /* reciprocal r */
+            s = (int)(256.0f * r + 0.5f); /* r in units of 1/256 rounded to nearest */
+            r =  (float)s / 256.0f;
+            res.m64_u32[i] = (uint32_t)(r * (uint32_t)(1 << 31));
+        }
+    }
+    return res;
+}
+
+_NEON2SSE_GLOBAL float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0
+#define vrecpeq_f32 _mm_rcp_ps
+
+
+_NEON2SSESTORAGE uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrecpeq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    //Input is a fixed point number!!!
+    //We implement the recip_estimate function as described in the ARMv7 reference manual (VRECPE instruction) but use float instead of double
+    _NEON2SSE_ALIGN_16 uint32_t atmp[4];
+    _NEON2SSE_ALIGN_16 uint32_t res[4];
+    _NEON2SSE_ALIGN_16 static const uint32_t c80000000[4] = {0x80000000,0x80000000, 0x80000000,0x80000000};
+    float resf, r;
+    int i, q, s;
+    __m128i res128, mask, zero;
+    _mm_store_si128((__m128i*)atmp, a);
+    zero = _mm_setzero_si128();
+    for (i =0; i<4; i++){
+        resf = (atmp[i] * (0.5f / (uint32_t) (1 << 31)));  //  2.3283064365386963E-10 ~(0.5f / (uint32_t) (1 << 31))
+        q = (int)(resf * 512.0f); /* a in units of 1/512 rounded down */
+        r = 1.0f / (((float)q + 0.5f) / 512.0f); /* reciprocal r */
+        s = (int)(256.0f * r + 0.5f); /* r in units of 1/256 rounded to nearest */
+        r =  (float)s / 256.0f;
+        res[i] = (uint32_t) (r * (((uint32_t)1) << 31) );
+    }
+    res128 = _mm_load_si128((__m128i*)res);
+    mask = _mm_and_si128(a, *(__m128i*)c80000000);
+    mask = _mm_cmpeq_epi32(zero, mask);  //0xffffffff if atmp[i] <= 0x7fffffff
+    return _mm_or_si128(res128, mask);
+}
+
+//**********Reciprocal square root estimate ****************
+//**********************************************************
+//no reciprocal square root for ints is available in IA32, nor a conversion from unsigned int to float4 lanes, so a serial solution looks faster
+//but the particular implementation for vrsqrte_u32 may vary for various ARM compilers
+//the ARM NEON and x86 SIMD results may be slightly different
+_NEON2SSESTORAGE float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0
+_NEON2SSE_INLINE float32x2_t vrsqrte_f32(float32x2_t a) //use low 64 bits
+{
+    float32x4_t res;
+    __m64_128 res64;
+    res = _mm_rsqrt_ps(_pM128(a));
+    _M64f(res64, res);
+    return res64;
+}
+
+_NEON2SSESTORAGE uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrsqrte_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+  // Input is a fixed point number!!!
+  // We implement the recip_sqrt_estimate function as described in the ARMv7
+  // reference manual (VRSQRTE instruction), but results may be slightly different
+  // from the ARM implementation due to _mm_rsqrt_ps precision
+  uint32x2_t res;
+  __m64_128 res64[2];
+  int i;
+  _NEON2SSE_ALIGN_16 float coeff[2];
+  for (i = 0; i < 2; i++) {
+    // Generate double-precision value = operand * 2^(-32). This has zero sign
+    // bit, with:
+    //     exponent = 1022 or 1021 = double-precision representation of 2^(-1) or 2^(-2);
+    //     fraction taken from operand, excluding its most significant
+    //     one or two bits.
+    uint64_t dp_operand;
+    if (a.m64_u32[i] & 0x80000000) {
+      dp_operand =
+          (0x3feLL << 52) | (((uint64_t)a.m64_u32[i] & 0x7FFFFFFF) << 21);
+    } else {
+      dp_operand =
+          (0x3fdLL << 52) | (((uint64_t)a.m64_u32[i] & 0x3FFFFFFF) << 22);
+    }
+    res64[i].m64_u64[0] = dp_operand;
+    coeff[i] = (res64[i].m64_d64[0] < 0.5) ? 512.0f : 256.0f; /* range 0.25 <= resf < 0.5  or range 0.5 <= resf < 1.0*/
+  }
+  __m128 coeff_f = _mm_load_ps(coeff);
+  __m128d q0_d = _mm_mul_pd(_mm_loadu_pd(&res64[0].m64_d64[0]), _mm_cvtps_pd(coeff_f));
+  __m128i q0_i = _mm_cvttpd_epi32(q0_d);
+  __m128 c05_f = _mm_set1_ps(0.5);
+  __m128 r_f = _mm_div_ps(_mm_add_ps(_mm_cvtepi32_ps(q0_i), c05_f), coeff_f);
+  __m128 rsqrt_f = _mm_rsqrt_ps(r_f);
+  __m128 c256_f = _mm_set1_ps(256.0);
+  __m128 s_f = _mm_add_ps(_mm_mul_ps(rsqrt_f, c256_f), c05_f);
+#ifdef USE_SSE4
+  s_f = _mm_floor_ps(s_f);
+#else
+  s_f = _mm_cvtepi32_ps(_mm_cvttps_epi32(s_f));
+#endif
+  s_f = _mm_div_ps(s_f, c256_f);
+  _M64f(res64[0], s_f);
+
+  for (i = 0; i < 2; i++) {
+    if ((a.m64_u32[i] & 0xc0000000) == 0) { // a <=0x3fffffff
+      res.m64_u32[i] = 0xffffffff;
+    } else {
+      res.m64_u32[i] = (uint32_t)(res64[0].m64_f32[i] * (((uint32_t)1) << 31));
+    }
+  }
+  return res;
+}
+
+_NEON2SSE_GLOBAL float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0
+#define vrsqrteq_f32 _mm_rsqrt_ps
+
+_NEON2SSESTORAGE uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrsqrteq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+  // Input is a fixed point number!!!
+  // We implement the recip_sqrt_estimate function as described in the ARMv7
+  // reference manual (VRSQRTE instruction), but results may be slightly different
+  // from the ARM implementation due to _mm_rsqrt_ps precision
+  int i;
+  _NEON2SSE_ALIGN_16 uint32_t atmp[4], res[4];
+  _NEON2SSE_ALIGN_16 float coeff[4], rr[4];
+  char* coeff_f2_c = (char*)&coeff[2];
+  __m64_128 res64[4];
+  _mm_store_si128((__m128i *)atmp, a);
+  for (i = 0; i < 4; i++) {
+    // Generate double-precision value = operand * 2^(-32). This has zero sign
+    // bit, with:
+    //     exponent = 1022 or 1021 = double-precision representation of 2^(-1) or 2^(-2);
+    //     fraction taken from operand, excluding its most significant
+    //     one or two bits.
+    uint64_t dp_operand;
+    if (atmp[i] & 0x80000000) {
+      dp_operand = (0x3feLL << 52) | (((uint64_t)atmp[i] & 0x7FFFFFFF) << 21);
+    } else {
+      dp_operand = (0x3fdLL << 52) | (((uint64_t)atmp[i] & 0x3FFFFFFF) << 22);
+    }
+    res64[i].m64_u64[0] = dp_operand;
+    coeff[i] = (res64[i].m64_d64[0] < 0.5) ? 512.0f : 256.0f; /* range 0.25 <= resf < 0.5  or range 0.5 <= resf < 1.0*/
+  }
+  __m128 c05_f = _mm_set1_ps(0.5);
+  __m128 coeff_f = _mm_load_ps(coeff);
+  __m128d q0_d = _mm_mul_pd(_mm_loadu_pd(&res64[0].m64_d64[0]), _mm_cvtps_pd(coeff_f));
+  __m128i q0_i = _mm_cvttpd_epi32(q0_d);
+
+  __m128 coeff_f2 = _M128(_pM128i(*coeff_f2_c));
+  q0_d = _mm_mul_pd(_mm_loadu_pd(&res64[2].m64_d64[0]), _mm_cvtps_pd(coeff_f2));
+  __m128i q0_i2 = _mm_cvttpd_epi32(q0_d);
+  coeff_f = _M128(_mm_unpacklo_epi64(_M128i(coeff_f), _M128i(coeff_f2)));
+  q0_i = _mm_unpacklo_epi64(q0_i, q0_i2);
+
+  __m128 r_f = _mm_div_ps(_mm_add_ps(_mm_cvtepi32_ps(q0_i), c05_f), coeff_f);
+  __m128 rsqrt_f = _mm_rsqrt_ps(r_f);
+  __m128 c256_f = _mm_set1_ps(256.0);
+  __m128 s_f = _mm_add_ps(_mm_mul_ps(rsqrt_f, c256_f), c05_f);
+#ifdef USE_SSE4
+  s_f = _mm_floor_ps(s_f);
+#else
+  s_f = _mm_cvtepi32_ps(_mm_cvttps_epi32(s_f));
+#endif
+  s_f = _mm_div_ps(s_f, c256_f);
+  _mm_store_ps(rr, s_f);
+
+  for (i = 0; i < 4; i++) {
+    if ((atmp[i] & 0xc0000000) == 0) { // a <=0x3fffffff
+      res[i] = 0xffffffff;
+    } else {
+      res[i] = (uint32_t)(rr[i] * (((uint32_t)1) << 31));
+    }
+  }
+  return _mm_load_si128((__m128i *)res);
+}
+
+//************ Reciprocal estimate/step and 1/sqrt estimate/step ***************************
+//******************************************************************************************
+//******VRECPS (Vector Reciprocal Step) ***************************************************
+//multiplies the elements of one vector by the corresponding elements of another vector,
+//subtracts each of the results from 2, and places the final results into the elements of the destination vector.
+
+_NEON2SSESTORAGE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0
+_NEON2SSE_INLINE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b)
+{
+    float32x4_t res;
+    __m64_128 res64;
+    res = vrecpsq_f32(_pM128(a), _pM128(b));
+    _M64f(res64, res);
+    return res64;
+}
+
+_NEON2SSESTORAGE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0
+_NEON2SSE_INLINE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b) // VRECPS.F32 q0, q0, q0
+{
+    __m128 f2, mul;
+    f2 =  _mm_set1_ps(2.);
+    mul = _mm_mul_ps(a,b);
+    return _mm_sub_ps(f2,mul);
+}
+
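+//Usage sketch (illustrative only, not part of the original intrinsics above): the
+//estimate/step pair is normally combined into a Newton-Raphson refinement of 1/d,
+//x1 = x0 * (2 - d*x0). vmulq_f32 is assumed to be defined earlier in this header as
+//the float32x4_t multiply; the helper name below is hypothetical.
+_NEON2SSE_INLINE float32x4_t example_recip_refine_f32(float32x4_t d)
+{
+    float32x4_t x = vrecpeq_f32(d); //initial estimate of 1/d
+    x = vmulq_f32(x, vrecpsq_f32(d, x)); //one refinement step: x *= (2 - d*x)
+    return x; //closer approximation of 1/d
+}
+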
+//*****************VRSQRTS (Vector Reciprocal Square Root Step) *****************************
+//multiplies the elements of one vector by the corresponding elements of another vector,
+//subtracts each of the results from 3, divides these results by two, and places the final results into the elements of the destination vector.
+
+_NEON2SSESTORAGE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0
+_NEON2SSE_INLINE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b)
+{
+    float32x2_t res;
+    res.m64_f32[0] = (3 - a.m64_f32[0] * b.m64_f32[0]) / 2;
+    res.m64_f32[1] = (3 - a.m64_f32[1] * b.m64_f32[1]) / 2;
+    return res;
+}
+
+_NEON2SSESTORAGE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0
+_NEON2SSE_INLINE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b) // VRSQRTS.F32 q0, q0, q0
+{
+    __m128 f3, f05, mul;
+    f3 =  _mm_set1_ps(3.f);
+    f05 =  _mm_set1_ps(0.5f);
+    mul = _mm_mul_ps(a,b);
+    f3 = _mm_sub_ps(f3,mul);
+    return _mm_mul_ps (f3, f05);
+}
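+
+//Usage sketch (illustrative only, not part of the original intrinsics above): one
+//Newton-Raphson refinement of the reciprocal square root estimate, x1 = x0 * (3 - d*x0*x0) / 2.
+//vmulq_f32 is assumed to be defined earlier in this header; the helper name is hypothetical.
+_NEON2SSE_INLINE float32x4_t example_rsqrt_refine_f32(float32x4_t d)
+{
+    float32x4_t x = vrsqrteq_f32(d); //initial estimate of 1/sqrt(d)
+    x = vmulq_f32(x, vrsqrtsq_f32(vmulq_f32(d, x), x)); //x *= (3 - d*x*x)/2
+    return x; //closer approximation of 1/sqrt(d)
+}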
+//********************************************************************************************
+//***************************** Shifts by signed variable ***********************************
+//********************************************************************************************
+//***** Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right) ***********************
+//********************************************************************************************
+//No such operations exist in IA32 SIMD unfortunately; only constant shifts are available, so the serial solution is needed
+//helper macro. It matches the ARM implementation for big shifts
+#define SERIAL_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \
+        _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; int i, lanesize = sizeof(INTERNAL_TYPE) << 3; \
+        _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
+        for (i = 0; i<LEN; i++) { \
+        if( (btmp[i] >= lanesize)||(btmp[i] <= -lanesize) ) res[i] = 0; \
+        else res[i] = (btmp[i] >=0) ? atmp[i] << btmp[i] : atmp[i] >> (-btmp[i]); } \
+        return _mm_load_si128((__m128i*)res);
+
+#define SERIAL_SHIFT_64(TYPE, SIGN, LEN) \
+        int ## TYPE ## x ## LEN ## _t res;  int i, lanesize = sizeof(int ## TYPE ## _t) << 3; \
+        for (i = 0; i<LEN; i++) { \
+        if( (b.m64_i ## TYPE[i] >= lanesize)||(b.m64_i ## TYPE[i] <= -lanesize) ) res.m64_ ## SIGN ## TYPE[i] = 0; \
+        else res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] >=0) ? a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i] : a.m64_ ## SIGN ## TYPE[i] >> (-b.m64_i ## TYPE[i]); } \
+        return res;
+
+_NEON2SSESTORAGE int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vshl_s8(int8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT_64(8, i, 8)
+}
+
+_NEON2SSESTORAGE int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vshl_s16(int16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT_64(16, i, 4)
+}
+
+_NEON2SSESTORAGE int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vshl_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT_64(32, i, 2)
+}
+
+_NEON2SSESTORAGE int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshl_s64(int64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT_64(64, i, 1)
+}
+
+_NEON2SSESTORAGE uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT_64(8, u, 8)
+}
+
+_NEON2SSESTORAGE uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.s16 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT_64(16, u, 4)
+}
+
+_NEON2SSESTORAGE uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT_64(32, u, 2)
+}
+
+_NEON2SSESTORAGE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0
+_NEON2SSE_INLINE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b) //if we use the SERIAL_SHIFT macro need to have the special processing  for large numbers
+{
+    SERIAL_SHIFT_64(64, u, 1)
+}
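+
+//Usage sketch (illustrative only, not part of the original intrinsics above): with these
+//per-lane variable shifts a negative count shifts right, e.g. the hypothetical helper
+//below returns lanes {4 << 2, 64 >> 3} = {16, 8}.
+_NEON2SSE_INLINE int32x2_t example_vshl_s32_usage(void)
+{
+    int32x2_t v, counts;
+    v.m64_i32[0] = 4;       v.m64_i32[1] = 64;
+    counts.m64_i32[0] = 2;  counts.m64_i32[1] = -3; //negative count means shift right by 3
+    return vshl_s32(v, counts); //lanes become {16, 8}
+}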
+
+_NEON2SSESTORAGE int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vshlq_s8(int8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT(int8_t, int8_t, 16, 16)
+}
+
+_NEON2SSESTORAGE int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vshlq_s16(int16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT(int16_t, int16_t, 8, 8)
+}
+
+_NEON2SSESTORAGE int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vshlq_s32(int32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT(int32_t, int32_t, 4, 4)
+}
+
+_NEON2SSESTORAGE int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vshlq_s64(int64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT(int64_t, int64_t, 2, 2)
+}
+
+_NEON2SSESTORAGE uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT(uint8_t, int8_t, 16, 16)
+}
+
+_NEON2SSESTORAGE uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.s16 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT(uint16_t, int16_t, 8, 8)
+}
+
+_NEON2SSESTORAGE uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT(uint32_t, int32_t, 4, 4)
+}
+
+_NEON2SSESTORAGE uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT(uint64_t, int64_t, 2, 2)
+}
+
+
+//*********** Vector saturating shift left: (negative values shift right) **********************
+//********************************************************************************************
+//No such operations are available in IA32 SIMD yet; only constant shifts are available, so the serial solution is needed
+#define SERIAL_SATURATING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \
+        _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \
+        int lanesize_1 = (sizeof(TYPE) << 3) - 1; \
+        _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
+        for (i = 0; i<LEN; i++) { \
+        if ((atmp[i] ==0)||(btmp[i] ==0)) res[i] = atmp[i]; \
+        else{ \
+            if(btmp[i] <0) res[i] = atmp[i] >> (-btmp[i]); \
+            else{ \
+                if (btmp[i]>lanesize_1) { \
+                    res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
+                }else{ \
+                    limit = (TYPE)1 << (lanesize_1 - btmp[i]); \
+                    if((atmp[i] >= limit)||(atmp[i] <= -limit)) \
+                        res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
+                    else res[i] = atmp[i] << btmp[i]; }}}} \
+        return _mm_load_si128((__m128i*)res);
+
+#define SERIAL_SATURATING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \
+        _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \
+        TYPE lanesize = (sizeof(TYPE) << 3); \
+        _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
+        for (i = 0; i<LEN; i++) { \
+        if ((atmp[i] ==0)||(btmp[i] ==0)) { res[i] = atmp[i]; \
+        }else{ \
+            if(btmp[i] < 0) res[i] = atmp[i] >> (-btmp[i]); \
+            else{ \
+                if (btmp[i]>lanesize) res[i] = (_UNSIGNED_T(TYPE))(~0ll); \
+                else{ \
+                    limit = (TYPE) 1 << (lanesize - btmp[i]); \
+                    res[i] = ( atmp[i] >= limit) ? (_UNSIGNED_T(TYPE))(~0ll) : atmp[i] << btmp[i]; }}}} \
+        return _mm_load_si128((__m128i*)res);
+
+#define SERIAL_SATURATING_SHIFT_SIGNED_64(TYPE, LEN) \
+        int ## TYPE ## x ## LEN ## _t res; int ## TYPE ## _t limit; int i; \
+        int lanesize_1 = (sizeof( int ## TYPE ## _t) << 3) - 1; \
+        for (i = 0; i<LEN; i++) { \
+        if ((a.m64_i ## TYPE[i] == 0) ||(b.m64_i ## TYPE[i] == 0)) res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i]; \
+        else{ \
+            if(b.m64_i ## TYPE[i] <0) res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \
+            else{ \
+                if (b.m64_i ## TYPE[i]>lanesize_1) { \
+                    res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \
+                }else{ \
+                    limit = (int ## TYPE ## _t) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \
+                    if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \
+                        res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \
+                    else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
+        return res;
+
+#define SERIAL_SATURATING_SHIFT_UNSIGNED_64(TYPE, LEN) \
+        int ## TYPE ## x ## LEN ## _t res;  _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \
+        int ## TYPE ## _t lanesize = (sizeof(int ## TYPE ## _t) << 3); \
+        for (i = 0; i<LEN; i++) { \
+        if ((a.m64_u ## TYPE[i] == 0) ||(b.m64_u ## TYPE[i] == 0)) {res.m64_u ## TYPE[i] = a.m64_u ## TYPE[i]; \
+        }else{ \
+            if(b.m64_i ## TYPE[i] < 0) res.m64_u ## TYPE[i] = a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \
+            else{ \
+                if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = (_UNSIGNED_T(int ## TYPE ## _t))(~0ll); \
+                else{ \
+                    limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \
+                    res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? (_UNSIGNED_T(int ## TYPE ## _t))(~0ll) : a.m64_u ## TYPE[i] << b.m64_u ## TYPE[i]; }}}} \
+        return res;
+
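+//Scalar sketch (illustrative only, not part of the original intrinsics above) of the
+//per-lane rule the signed saturating-shift macros implement, for int8_t lanes and
+//non-negative counts: a left shift that overflows the lane clamps to 127 or -128.
+//The helper name is hypothetical.
+_NEON2SSE_INLINE int8_t example_qshl_lane_s8(int8_t v, int count)
+{
+    int16_t wide;
+    if (v == 0) return 0;
+    if (count > 7) return (int8_t)((v < 0) ? -128 : 127); //whole lane shifted out: saturate on sign
+    wide = (int16_t)(v << count); //fits in 16 bits for count <= 7
+    if (wide > 127) return (int8_t)127;
+    if (wide < -128) return (int8_t)(-128);
+    return (int8_t)wide;
+}
+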
+_NEON2SSESTORAGE int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqshl_s8(int8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_SIGNED_64(8,8)
+}
+
+_NEON2SSESTORAGE int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqshl_s16(int16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_SIGNED_64(16,4)
+}
+
+_NEON2SSESTORAGE int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshl_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_SIGNED_64(32,2)
+}
+
+_NEON2SSESTORAGE int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_s64(int64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_SIGNED_64(64,1)
+}
+
+_NEON2SSESTORAGE uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_UNSIGNED_64(8,8)
+}
+
+_NEON2SSESTORAGE uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.s16 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_UNSIGNED_64(16,4)
+}
+
+_NEON2SSESTORAGE uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_UNSIGNED_64(32,2)
+}
+
+_NEON2SSESTORAGE uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_UNSIGNED_64(64,1)
+}
+
+_NEON2SSESTORAGE int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_SIGNED(int8_t, 16, 16)
+}
+
+_NEON2SSESTORAGE int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_SIGNED(int16_t, 8, 8)
+}
+
+_NEON2SSESTORAGE int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_SIGNED(int32_t, 4, 4)
+}
+
+_NEON2SSESTORAGE int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_SIGNED(int64_t, 2, 2)
+}
+
+_NEON2SSESTORAGE uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_UNSIGNED(int8_t, 16, 16)
+}
+
+_NEON2SSESTORAGE uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.s16 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_UNSIGNED(int16_t, 8, 8)
+}
+
+_NEON2SSESTORAGE uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_UNSIGNED(int32_t, 4, 4)
+}
+
+_NEON2SSESTORAGE uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_UNSIGNED(int64_t, 2, 2)
+}
+
+
+//******** Vector rounding shift left: (negative values shift right) **********
+//****************************************************************************
+//No such operations are available in IA32 SIMD yet; only constant shifts are available, so the serial solution is needed
+//rounding makes sense for right shifts only.
+#define SERIAL_ROUNDING_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \
+        _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; INTERNAL_TYPE i, lanesize = sizeof(INTERNAL_TYPE) << 3; \
+        _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
+        for (i = 0; i<LEN; i++) { \
+        if( btmp[i] >= 0) { \
+            if(btmp[i] >= lanesize) res[i] = 0; \
+            else res[i] = (atmp[i] << btmp[i]); \
+        }else{ \
+            res[i] = (btmp[i] < -lanesize) ? 0 : \
+                            (btmp[i] == -lanesize) ? (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) : \
+                            (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) );    }} \
+        return _mm_load_si128((__m128i*)res);
+
+
+#define SERIAL_ROUNDING_SHIFT_64(TYPE, SIGN, LEN) \
+        int ## TYPE ## x ## LEN ## _t res;  int i;  int lanesize = sizeof(int ## TYPE ## _t) << 3; \
+        for (i = 0; i<LEN; i++) { \
+        if( b.m64_i ## TYPE[i] >= 0) { \
+            if(b.m64_i ## TYPE[i] >= lanesize) res.m64_ ## SIGN ## TYPE[i] = 0; \
+            else res.m64_ ## SIGN ## TYPE[i] = (a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i]); \
+        }else{ \
+            res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] < -lanesize) ? 0 : \
+                            (b.m64_i ## TYPE[i] == -lanesize) ? (a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) : \
+                            (a.m64_ ## SIGN ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) );    }} \
+        return res;
+
+
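+//Scalar sketch (illustrative only, not part of the original intrinsics above) of the
+//rounding rule the macros above apply to right shifts: the last bit shifted out is
+//added back, so v >> count is rounded rather than truncated. The helper name is
+//hypothetical; count is assumed to be in 1..15.
+_NEON2SSE_INLINE int16_t example_rshr_lane_s16(int16_t v, int count)
+{
+    return (int16_t)((v >> count) + ((v >> (count - 1)) & 1)); //add the rounding bit
+}
+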
+_NEON2SSESTORAGE int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vrshl_s8(int8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT_64(8,i,8)
+}
+
+_NEON2SSESTORAGE int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vrshl_s16(int16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT_64(16,i,4)
+}
+
+_NEON2SSESTORAGE int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vrshl_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT_64(32,i,2)
+}
+
+_NEON2SSESTORAGE int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshl_s64(int64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT_64(64,i,1)
+}
+
+_NEON2SSESTORAGE uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT_64(8,u,8)
+}
+
+_NEON2SSESTORAGE uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.s16 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT_64(16,u,4)
+}
+
+_NEON2SSESTORAGE uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT_64(32,u,2)
+}
+
+_NEON2SSESTORAGE uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT_64(64,u,1)
+}
+
+_NEON2SSESTORAGE int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT(int8_t, int8_t, 16, 16)
+}
+
+_NEON2SSESTORAGE int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT(int16_t, int16_t, 8, 8)
+}
+
+_NEON2SSESTORAGE int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT(int32_t, int32_t, 4, 4)
+}
+
+_NEON2SSESTORAGE int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT(int64_t, int64_t, 2, 2)
+}
+
+_NEON2SSESTORAGE uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT(uint8_t, int8_t, 16, 16)
+}
+
+_NEON2SSESTORAGE uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.s16 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT(uint16_t, int16_t, 8, 8)
+}
+
+_NEON2SSESTORAGE uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT(uint32_t, int32_t, 4, 4)
+}
+
+_NEON2SSESTORAGE uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT(uint64_t, int64_t, 2, 2)
+}
+
+
+//********** Vector saturating rounding shift left: (negative values shift right) ****************
+//*************************************************************************************************
+//No such operations exist in IA32 SIMD unfortunately; only constant shifts are available, so the serial solution is needed
+//Saturation happens for left shifts only while rounding makes sense for right shifts only.
+#define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \
+        _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \
+        int lanesize_1 = (sizeof(TYPE) << 3) - 1; \
+        _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
+        for (i = 0; i<LEN; i++) { \
+        if (atmp[i] ==0) res[i] = 0; \
+        else{ \
+            if(btmp[i] <0) res[i] = (btmp[i] < (-lanesize_1)) ? 0 : (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \
+            else{ \
+                if (btmp[i]>lanesize_1) { \
+                    res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
+                }else{ \
+                    limit = (TYPE)1 << (lanesize_1 - btmp[i]); \
+                    if((atmp[i] >= limit)||(atmp[i] <= -limit)) \
+                        res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
+                    else res[i] = atmp[i] << btmp[i]; }}}} \
+        return _mm_load_si128((__m128i*)res);
+
+#define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \
+        _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \
+        int lanesize = (sizeof(TYPE) << 3); \
+        _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
+        for (i = 0; i<LEN; i++) { \
+        if (atmp[i] ==0) {res[i] = 0; \
+        }else{ \
+            if(btmp[i] < 0) res[i] = (btmp[i] < (-lanesize)) ? 0 : (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \
+            else{ \
+                if (btmp[i]>lanesize) res[i] = (_UNSIGNED_T(TYPE))(~0ll); \
+                else{ \
+                    limit = (TYPE) 1 << (lanesize - btmp[i]); \
+                    res[i] = ( atmp[i] >= limit) ? (_UNSIGNED_T(TYPE))(~0ll) : atmp[i] << btmp[i]; }}}} \
+        return _mm_load_si128((__m128i*)res);
+
+#define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(TYPE, LEN) \
+        __m64_128 res; int ## TYPE ## _t limit; int i; \
+        int lanesize_1 = (sizeof(int ## TYPE ## _t ) << 3) - 1; \
+        for (i = 0; i<LEN; i++) { \
+        if (a.m64_i ## TYPE[i] ==0) res.m64_i ## TYPE[i] = 0; \
+        else{ \
+            if(b.m64_i ## TYPE[i] <0) res.m64_i ## TYPE[i] = (b.m64_i ## TYPE[i] < (-lanesize_1)) ? 0 : (a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_i ## TYPE[i] & ((int ## TYPE ## _t ) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \
+            else{ \
+                if (b.m64_i ## TYPE[i]>lanesize_1) { \
+                    res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \
+                }else{ \
+                    limit = (int ## TYPE ## _t ) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \
+                    if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \
+                        res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \
+                    else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
+        return res;
+
+#define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(TYPE, LEN) \
+        __m64_128 res; _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \
+        int lanesize = (sizeof(int ## TYPE ## _t) << 3); \
+        for (i = 0; i<LEN; i++) { \
+        if (a.m64_u ## TYPE[i] ==0) {res.m64_u ## TYPE[i] = 0; \
+        }else{ \
+            if(b.m64_i ## TYPE[i] < 0) res.m64_u ## TYPE[i] = (b.m64_i ## TYPE[i] < (-lanesize)) ? 0 : (a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_u ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \
+            else{ \
+                if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = (_UNSIGNED_T(int ## TYPE ## _t))(~0ll); \
+                else{ \
+                    limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \
+                    res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? (_UNSIGNED_T(int ## TYPE ## _t))(~0ll) : a.m64_u ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
+        return res;
+
+_NEON2SSESTORAGE int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(8,8)
+}
+
+_NEON2SSESTORAGE int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(16,4)
+}
+
+_NEON2SSESTORAGE int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(32,2)
+}
+
+_NEON2SSESTORAGE int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(64,1)
+}
+
+_NEON2SSESTORAGE uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(8,8)
+}
+
+_NEON2SSESTORAGE uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.s16 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(16,4)
+}
+
+_NEON2SSESTORAGE uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(32,2)
+}
+
+_NEON2SSESTORAGE uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(64,1)
+}
+
+_NEON2SSESTORAGE int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int8_t, 16, 16)
+}
+
+_NEON2SSESTORAGE int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int16_t, 8, 8)
+}
+
+_NEON2SSESTORAGE int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int32_t, 4, 4)
+}
+
+_NEON2SSESTORAGE int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int64_t, 2, 2)
+}
+
+_NEON2SSESTORAGE uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int8_t, 16, 16)
+}
+
+_NEON2SSESTORAGE uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.s16 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int16_t, 8, 8)
+}
+
+_NEON2SSESTORAGE uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int32_t, 4, 4)
+}
+
+_NEON2SSESTORAGE uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int64_t, 2, 2)
+}
+
+// *********************************************************************************
+// *****************************  Shifts by a constant *****************************
+// *********************************************************************************
+//**************** Vector shift right by constant*************************************
+//************************************************************************************
+_NEON2SSESTORAGE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8
+_NEON2SSE_INLINE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VSHR.S8 d0,d0,#8
+{
+    //no 8 bit shift available, go to 16 bit
+    int8x8_t res64;
+    __m128i r;
+    r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
+    r = _mm_srai_epi16 (r, b); //SSE2
+    r = _mm_packs_epi16 (r,r); //we need 64 bits only
+    return64(r);
+}
+
+_NEON2SSESTORAGE int16x4_t vshr_n_s16(int16x4_t a,  __constrange(1,16) int b); // VSHR.S16 d0,d0,#16
+_NEON2SSE_INLINE int16x4_t vshr_n_s16(int16x4_t a,  __constrange(1,16) int b)
+{
+    int16x4_t res64;
+    return64(_mm_srai_epi16(_pM128i(a), b));
+}
+
+
+_NEON2SSESTORAGE int32x2_t vshr_n_s32(int32x2_t a,  __constrange(1,32) int b); // VSHR.S32 d0,d0,#32
+_NEON2SSE_INLINE int32x2_t vshr_n_s32(int32x2_t a,  __constrange(1,32) int b)
+{
+    int32x2_t res64;
+    return64(_mm_srai_epi32(_pM128i(a), b));
+}
+
+_NEON2SSESTORAGE int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    //no arithmetic shift for 64bit values, serial solution used
+    int64x1_t res;
+    if(b>=64) res.m64_i64[0] = 0;
+    else res.m64_i64[0] = (*(int64_t*)&a) >> b;
+    return res;
+}
+
+_NEON2SSESTORAGE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8
+_NEON2SSE_INLINE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VSHR.U8 d0,d0,#8
+{
+    //no 8 bit shift available, go to 16 bit
+    uint8x8_t res64;
+    __m128i r;
+    r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
+    r = _mm_srli_epi16 (r, b); //for unsigned variables we use the logical shift, not the arithmetic one
+    r = _mm_packus_epi16 (r,r); //we need 64 bits only
+    return64(r);
+}
+
+_NEON2SSESTORAGE uint16x4_t vshr_n_u16(uint16x4_t a,  __constrange(1,16) int b); // VSHR.s16 d0,d0,#16
+_NEON2SSE_INLINE uint16x4_t vshr_n_u16(uint16x4_t a,  __constrange(1,16) int b)
+{
+    uint16x4_t res64;
+    return64(_mm_srli_epi16(_pM128i(a), b));
+}
+
+
+_NEON2SSESTORAGE uint32x2_t vshr_n_u32(uint32x2_t a,  __constrange(1,32) int b); // VSHR.U32 d0,d0,#32
+_NEON2SSE_INLINE uint32x2_t vshr_n_u32(uint32x2_t a,  __constrange(1,32) int b)
+{
+    uint32x2_t res64;
+    return64(_mm_srli_epi32(_pM128i(a), b));
+}
+
+
+_NEON2SSESTORAGE uint64x1_t vshr_n_u64(uint64x1_t a,  __constrange(1,64) int b); // VSHR.U64 d0,d0,#64
+_NEON2SSE_INLINE uint64x1_t vshr_n_u64(uint64x1_t a,  __constrange(1,64) int b)
+{
+    uint64x1_t res64;
+    return64(_mm_srli_epi64(_pM128i(a), b));
+}
+
+
+_NEON2SSESTORAGE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8
+_NEON2SSE_INLINE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VSHR.S8 q0,q0,#8
+{
+    //no 8 bit shift available, go to 16 bit trick
+    __m128i zero, mask0, a_sign, r, a_sign_mask;
+    _NEON2SSE_ALIGN_16 static const int16_t mask0_16[9] = {0x0000, 0x0080, 0x00c0, 0x00e0, 0x00f0,  0x00f8, 0x00fc, 0x00fe, 0x00ff};
+    zero = _mm_setzero_si128();
+    mask0 = _mm_set1_epi16(mask0_16[b]); //to mask the bits to be "spoiled"  by 16 bit shift
+    a_sign =  _mm_cmpgt_epi8 (zero, a); //ff if a<0 or zero if a>0
+    r = _mm_srai_epi16 (a, b);
+    a_sign_mask =  _mm_and_si128 (mask0, a_sign);
+    r =  _mm_andnot_si128 (mask0, r);
+    return _mm_or_si128 (r, a_sign_mask);
+}
+
+_NEON2SSE_GLOBAL int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16
+#define vshrq_n_s16 _mm_srai_epi16
+
+_NEON2SSE_GLOBAL int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32
+#define vshrq_n_s32 _mm_srai_epi32
+
+_NEON2SSESTORAGE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64
+_NEON2SSE_INLINE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b)
+{
+    //SIMD implementation may not be optimal due to the absence of a 64 bit arithmetic shift in x86 SIMD
+    __m128i c1, signmask,a0,  res64;
+    _NEON2SSE_ALIGN_16 static const uint64_t mask[] = {0x8000000000000000, 0x8000000000000000};
+    c1 =  _mm_cmpeq_epi32(a,a); //0xffffffffffffffff
+    signmask  =  _mm_slli_epi64 (c1, (64 - b));
+    a0 = _mm_or_si128(a, *(__m128i*)mask); //get the first bit
+    a0 = _MM_CMPEQ_EPI64 (a, a0);
+    signmask = _mm_and_si128(a0, signmask);
+    res64 = _mm_srli_epi64 (a, b);
+    return _mm_or_si128(res64, signmask);
+}
+
+_NEON2SSESTORAGE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8
+_NEON2SSE_INLINE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VSHR.U8 q0,q0,#8
+{
+    //no 8 bit shift available, need the special trick
+    __m128i mask0, r;
+    _NEON2SSE_ALIGN_16 static const uint16_t mask10_16[9] = {0xffff, 0xff7f, 0xff3f, 0xff1f, 0xff0f,  0xff07, 0xff03, 0xff01, 0xff00};
+    mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled"  by 16 bit shift
+    r = _mm_srli_epi16 ( a, b);
+    return _mm_and_si128 (r,  mask0);
+}
+
+_NEON2SSE_GLOBAL uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.s16 q0,q0,#16
+#define vshrq_n_u16 _mm_srli_epi16
+
+_NEON2SSE_GLOBAL uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32
+#define vshrq_n_u32 _mm_srli_epi32
+
+_NEON2SSE_GLOBAL uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64
+#define vshrq_n_u64 _mm_srli_epi64
+
+//*************************** Vector shift left by constant *************************
+//*********************************************************************************
+_NEON2SSESTORAGE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
+_NEON2SSE_INLINE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VSHL.I8 d0,d0,#0
+{
+    //no 8 bit shift available, go to 16 bit
+    int8x8_t res64;
+    __m128i r;
+    r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
+    r = _mm_slli_epi16 (r, b); //SSE2
+    r = _mm_shuffle_epi8 (r, *(__m128i*) mask8_16_even_odd); //return to 8 bit, we need 64 bits only
+    return64(r);
+}
+
+_NEON2SSESTORAGE int16x4_t vshl_n_s16(int16x4_t a,  __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
+_NEON2SSE_INLINE int16x4_t vshl_n_s16(int16x4_t a,  __constrange(0,15) int b)
+{
+    int16x4_t res64;
+    return64(_mm_slli_epi16(_pM128i(a), b));
+}
+
+
+_NEON2SSESTORAGE int32x2_t vshl_n_s32(int32x2_t a,  __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
+_NEON2SSE_INLINE int32x2_t vshl_n_s32(int32x2_t a,  __constrange(0,31) int b)
+{
+    int32x2_t res64;
+    return64(_mm_slli_epi32(_pM128i(a), b));
+}
+
+
+_NEON2SSESTORAGE int64x1_t vshl_n_s64(int64x1_t a,  __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
+_NEON2SSE_INLINE int64x1_t vshl_n_s64(int64x1_t a,  __constrange(0,63) int b)
+{
+    int64x1_t res64;
+    return64(_mm_slli_epi64(_pM128i(a), b));
+}
+
+
+_NEON2SSESTORAGE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
+_NEON2SSE_INLINE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b)
+{
+    //no 8 bit shift available, go to 16 bit
+    uint8x8_t res64;
+    __m128i mask8;
+    __m128i r;
+    mask8 = _mm_set1_epi16(0xff);
+    r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
+    r = _mm_slli_epi16 (r, b); //SSE2
+    r = _mm_and_si128(r, mask8); //to avoid saturation
+    r = _mm_packus_epi16 (r,r); //we need 64 bits only
+    return64(r);
+}
+
+_NEON2SSE_GLOBAL uint16x4_t vshl_n_u16(uint16x4_t a,  __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
+#define vshl_n_u16 vshl_n_s16
+
+
+_NEON2SSE_GLOBAL uint32x2_t vshl_n_u32(uint32x2_t a,  __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
+#define vshl_n_u32 vshl_n_s32
+
+_NEON2SSE_GLOBAL uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
+#define vshl_n_u64 vshl_n_s64
+
+_NEON2SSE_GLOBAL int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
+#define vshlq_n_s8 vshlq_n_u8
+
+_NEON2SSE_GLOBAL int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
+#define vshlq_n_s16 _mm_slli_epi16
+
+_NEON2SSE_GLOBAL int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
+#define vshlq_n_s32 _mm_slli_epi32
+
+_NEON2SSE_GLOBAL int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
+#define vshlq_n_s64 _mm_slli_epi64
+
+_NEON2SSESTORAGE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
+_NEON2SSE_INLINE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b)
+{
+    //no 8 bit shift available, need the special trick
+    __m128i mask0, r;
+    _NEON2SSE_ALIGN_16 static const uint16_t mask10_16[9] = {0xffff, 0xfeff, 0xfcff, 0xf8ff, 0xf0ff,  0xe0ff, 0xc0ff, 0x80ff, 0xff};
+    mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled"  by 16 bit shift
+    r = _mm_slli_epi16 ( a, b);
+    return _mm_and_si128 (r,  mask0);
+}
+
+_NEON2SSE_GLOBAL uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
+#define vshlq_n_u16 vshlq_n_s16
+
+_NEON2SSE_GLOBAL uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
+#define vshlq_n_u32 vshlq_n_s32
+
+_NEON2SSE_GLOBAL uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
+#define vshlq_n_u64 vshlq_n_s64
+
+//************* Vector rounding shift right by constant ******************
+//*************************************************************************
+//No corresponding x86 intrinsics exist, so some tricks are needed
+_NEON2SSESTORAGE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8
+_NEON2SSE_INLINE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VRSHR.S8 d0,d0,#8
+{
+    //no 8 bit shift available, go to 16 bit
+    int8x8_t res64;
+    __m128i r, maskb;
+    r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
+    maskb =  _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit
+    maskb = _mm_srli_epi16 (maskb, 15); //1 or 0
+    r = _mm_srai_epi16 (r, b);
+    r = _mm_add_epi16 (r, maskb); //actual rounding
+    r = _mm_packs_epi16 (r,r); ////we need 64 bits only
+    return64(r);
+}
+
+_NEON2SSESTORAGE int16x4_t vrshr_n_s16(int16x4_t a,  __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16
+_NEON2SSE_INLINE int16x4_t vrshr_n_s16(int16x4_t a,  __constrange(1,16) int b)
+{
+    int16x4_t res64;
+    return64(vrshrq_n_s16(_pM128i(a), b));
+}
+
+
+_NEON2SSESTORAGE int32x2_t vrshr_n_s32(int32x2_t a,  __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32
+_NEON2SSE_INLINE int32x2_t vrshr_n_s32(int32x2_t a,  __constrange(1,32) int b)
+{
+    int32x2_t res64;
+    return64(vrshrq_n_s32(_pM128i(a), b));
+}
+
+
+_NEON2SSESTORAGE int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    //serial solution is faster
+    int64x1_t res;
+    int64_t a_i64 = *( int64_t*)&a;
+    if(b==64) {
+        res.m64_i64[0] = 0; //for some compilers rounding happens and we need to use(a_i64 & _SIGNBIT64)>>63;
+    } else {
+        int64_t maskb = a_i64 & (( int64_t)1 << (b - 1));
+        res.m64_i64[0] = (a_i64 >> b) + (maskb >> (b - 1));
+    }
+    return res;
+}
+
+_NEON2SSESTORAGE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8
+_NEON2SSE_INLINE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VRSHR.U8 d0,d0,#8
+{
+    //no 8 bit shift available, go to 16 bit; this solution may not be optimal compared with the serial one
+    uint8x8_t res64;
+    __m128i r, maskb;
+    r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
+    maskb =  _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit
+    maskb = _mm_srli_epi16 (maskb, 15); //1 or 0
+    r = _mm_srli_epi16 (r, b);
+    r = _mm_add_epi16 (r, maskb); //actual rounding
+    r =  _mm_packus_epi16 (r,r); ////we need 64 bits only
+    return64(r);
+}
+
+_NEON2SSESTORAGE uint16x4_t vrshr_n_u16(uint16x4_t a,  __constrange(1,16) int b); // VRSHR.s16 d0,d0,#16
+_NEON2SSE_INLINE uint16x4_t vrshr_n_u16(uint16x4_t a,  __constrange(1,16) int b)
+{
+    uint16x4_t res64;
+    return64(vrshrq_n_u16(_pM128i(a), b));
+}
+
+
+_NEON2SSESTORAGE uint32x2_t vrshr_n_u32(uint32x2_t a,  __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32
+_NEON2SSE_INLINE uint32x2_t vrshr_n_u32(uint32x2_t a,  __constrange(1,32) int b)
+{
+    uint32x2_t res64;
+    return64(vrshrq_n_u32(_pM128i(a), b));
+}
+
+
+_NEON2SSESTORAGE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64
+_NEON2SSE_INLINE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b)
+{
+    uint64x1_t res64;
+    return64(vrshrq_n_u64(_pM128i(a), b));
+}
+
+_NEON2SSESTORAGE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8
+_NEON2SSE_INLINE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VRSHR.S8 q0,q0,#8
+{
+    //no 8 bit shift available, go to 16 bit trick
+    __m128i r, mask1, maskb;
+    _NEON2SSE_ALIGN_16 static const uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // 2^b-th bit set to 1
+    r = vshrq_n_s8 (a, b);
+    mask1 = _mm_set1_epi16(mask2b[b]); // 2^b-th bit set to 1 for 16bit, need it for rounding
+    maskb = _mm_and_si128(a, mask1); //get b or 0 for rounding
+    maskb =  _mm_srli_epi16 (maskb, b - 1); // to add 1
+    return _mm_add_epi8(r, maskb); //actual rounding
+}
+
+_NEON2SSESTORAGE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16
+_NEON2SSE_INLINE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16
+{
+    __m128i maskb, r;
+    maskb =  _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit
+    maskb = _mm_srli_epi16(maskb, 15); //1 or 0
+    r = _mm_srai_epi16 (a, b);
+    return _mm_add_epi16 (r, maskb); //actual rounding
+}
+
+_NEON2SSESTORAGE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32
+_NEON2SSE_INLINE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32
+{
+    __m128i maskb,  r;
+    maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit
+    maskb = _mm_srli_epi32 (maskb,31); //1 or 0
+    r = _mm_srai_epi32(a, b);
+    return _mm_add_epi32 (r, maskb); //actual rounding
+}
+
+_NEON2SSESTORAGE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64
+_NEON2SSE_INLINE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b)
+{
+    //this solution may not be optimal compared with a serial one
+    __m128i maskb;
+    int64x2_t r;
+    maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit
+    maskb = _mm_srli_epi64 (maskb,63); //1 or 0
+    r = vshrq_n_s64(a, b);
+    return _mm_add_epi64 (r, maskb); //actual rounding
+}
+
+_NEON2SSESTORAGE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8
+_NEON2SSE_INLINE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VRSHR.U8 q0,q0,#8
+{
+    //no 8 bit shift available, go to 16 bit trick
+    __m128i r, mask1, maskb;
+    _NEON2SSE_ALIGN_16 static const uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // 2^b-th bit set to 1
+    r = vshrq_n_u8 (a, b);
+    mask1 = _mm_set1_epi16(mask2b[b]); // 2^b-th bit set to 1 for 16bit, need it for rounding
+    maskb = _mm_and_si128(a, mask1); //get b or 0 for rounding
+    maskb =  _mm_srli_epi16 (maskb, b - 1); // to add 1
+    return _mm_add_epi8(r, maskb); //actual rounding
+}
+
+_NEON2SSESTORAGE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.s16 q0,q0,#16
+_NEON2SSE_INLINE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16
+{
+    __m128i maskb, r;
+    maskb =  _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit
+    maskb = _mm_srli_epi16(maskb, 15); //1 or 0
+    r = _mm_srli_epi16 (a, b);
+    return _mm_add_epi16 (r, maskb); //actual rounding
+}
+
+_NEON2SSESTORAGE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32
+_NEON2SSE_INLINE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32
+{
+    __m128i maskb,  r;
+    maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit
+    maskb = _mm_srli_epi32 (maskb,31); //1 or 0
+    r = _mm_srli_epi32(a, b);
+    return _mm_add_epi32 (r, maskb); //actual rounding
+}
+
+_NEON2SSESTORAGE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64
+_NEON2SSE_INLINE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b)
+{
+    //solution may be not optimal compared with a serial one
+    __m128i maskb,  r;
+    maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit
+    maskb = _mm_srli_epi64 (maskb,63); //1 or 0
+    r = _mm_srli_epi64(a, b);
+    return _mm_add_epi64 (r, maskb); //actual rounding
+}
+
+//************* Vector shift right by constant and accumulate *********
+//*********************************************************************
+_NEON2SSESTORAGE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8
+_NEON2SSE_INLINE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VSRA.S8 d0,d0,#8
+{
+    int8x8_t shift;
+    shift = vshr_n_s8(b, c);
+    return vadd_s8( a, shift);
+}
+
+_NEON2SSESTORAGE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16
+_NEON2SSE_INLINE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VSRA.S16 d0,d0,#16
+{
+    int16x4_t shift;
+    shift = vshr_n_s16( b, c);
+    return vadd_s16(a, shift);
+}
+
+_NEON2SSESTORAGE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32
+_NEON2SSE_INLINE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VSRA.S32 d0,d0,#32
+{
+    //may be not optimal compared with the serial execution
+    int32x2_t shift;
+    shift = vshr_n_s32(b, c);
+    return vadd_s32( a, shift);
+}
+
+_NEON2SSESTORAGE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64
+_NEON2SSE_INLINE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c)
+{
+    //may be not optimal compared with a serial solution
+    int64x1_t shift;
+    shift = vshr_n_s64(b, c);
+    return vadd_s64( a, shift);
+}
+
+_NEON2SSESTORAGE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8
+_NEON2SSE_INLINE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VSRA.U8 d0,d0,#8
+{
+    uint8x8_t shift;
+    shift = vshr_n_u8(b, c);
+    return vadd_u8(a, shift);
+}
+
+_NEON2SSESTORAGE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.s16 d0,d0,#16
+_NEON2SSE_INLINE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VSRA.s16 d0,d0,#16
+{
+    uint16x4_t shift;
+    shift = vshr_n_u16(b, c);
+    return vadd_u16(a,shift);
+}
+
+_NEON2SSESTORAGE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRA.U32 d0,d0,#32
+_NEON2SSE_INLINE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VSRA.U32 d0,d0,#32
+{
+    //may be not optimal compared with the serial execution
+    uint32x2_t shift;
+    shift = vshr_n_u32(b, c);
+    return vadd_u32( a, shift);
+}
+
+_NEON2SSESTORAGE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64
+_NEON2SSE_INLINE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c) // VSRA.U64 d0,d0,#64
+{
+    //may be not optimal compared with the serial execution
+    uint64x1_t shift;
+    shift = vshr_n_u64(b, c);
+    return vadd_u64(a, shift);
+}
+
+_NEON2SSESTORAGE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8
+_NEON2SSE_INLINE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRA.S8 q0,q0,#8
+{
+    int8x16_t shift;
+    shift = vshrq_n_s8(b, c);
+    return vaddq_s8(a, shift);
+}
+
+_NEON2SSESTORAGE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16
+_NEON2SSE_INLINE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRA.S16 q0,q0,#16
+{
+    int16x8_t shift;
+    shift = vshrq_n_s16(b, c);
+    return vaddq_s16(a, shift);
+}
+
+_NEON2SSESTORAGE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32
+_NEON2SSE_INLINE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRA.S32 q0,q0,#32
+{
+    int32x4_t shift;
+    shift = vshrq_n_s32(b, c);
+    return vaddq_s32(a, shift);
+}
+
+_NEON2SSESTORAGE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64
+_NEON2SSE_INLINE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c) // VSRA.S64 q0,q0,#64
+{
+    int64x2_t shift;
+    shift = vshrq_n_s64(b, c);
+    return vaddq_s64( a, shift);
+}
+
+_NEON2SSESTORAGE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8
+_NEON2SSE_INLINE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VSRA.U8 q0,q0,#8
+{
+    uint8x16_t shift;
+    shift = vshrq_n_u8(b, c);
+    return vaddq_u8(a, shift);
+}
+
+_NEON2SSESTORAGE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.s16 q0,q0,#16
+_NEON2SSE_INLINE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VSRA.s16 q0,q0,#16
+{
+    uint16x8_t shift;
+    shift = vshrq_n_u16(b, c);
+    return vaddq_u16(a,  shift);
+}
+
+_NEON2SSESTORAGE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32
+_NEON2SSE_INLINE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VSRA.U32 q0,q0,#32
+{
+    uint32x4_t shift;
+    shift = vshrq_n_u32(b, c);
+    return vaddq_u32(a, shift);
+}
+
+_NEON2SSESTORAGE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64
+_NEON2SSE_INLINE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c) // VSRA.U64 q0,q0,#64
+{
+    uint64x2_t shift;
+    shift = vshrq_n_u64(b, c);
+    return vaddq_u64(a, shift);
+}
+
+//************* Vector rounding shift right by constant and accumulate ****************************
+//************************************************************************************************
+_NEON2SSESTORAGE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8
+_NEON2SSE_INLINE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VRSRA.S8 d0,d0,#8
+{
+    int8x8_t shift;
+    shift = vrshr_n_s8(b, c);
+    return vadd_s8( a, shift);
+}
+
+_NEON2SSESTORAGE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16
+_NEON2SSE_INLINE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VRSRA.S16 d0,d0,#16
+{
+    int16x4_t shift;
+    shift = vrshr_n_s16( b, c);
+    return vadd_s16(a, shift);
+}
+
+_NEON2SSESTORAGE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32
+_NEON2SSE_INLINE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VRSRA.S32 d0,d0,#32
+{
+    //may be not optimal compared with the serial execution
+    int32x2_t shift;
+    shift = vrshr_n_s32(b, c);
+    return vadd_s32( a, shift);
+}
+
+_NEON2SSESTORAGE int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution
+{
+    int64x1_t shift;
+    shift = vrshr_n_s64(b, c);
+    return vadd_s64( a, shift);
+}
+
+_NEON2SSESTORAGE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8
+_NEON2SSE_INLINE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VRSRA.U8 d0,d0,#8
+{
+    uint8x8_t shift;
+    shift = vrshr_n_u8(b, c);
+    return vadd_u8(a, shift);
+}
+
+_NEON2SSESTORAGE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.s16 d0,d0,#16
+_NEON2SSE_INLINE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VRSRA.s16 d0,d0,#16
+{
+    uint16x4_t shift;
+    shift = vrshr_n_u16(b, c);
+    return vadd_u16(a,shift);
+}
+
+_NEON2SSESTORAGE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32
+_NEON2SSE_INLINE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VRSRA.U32 d0,d0,#32
+{
+    //may be not optimal compared with the serial execution
+    uint32x2_t shift;
+    shift = vrshr_n_u32(b, c);
+    return vadd_u32( a, shift);
+}
+
+_NEON2SSESTORAGE uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution
+{
+    //may be not optimal compared with the serial execution
+    uint64x1_t shift;
+    shift = vrshr_n_u64(b, c);
+    return vadd_u64( a, shift);
+}
+
+_NEON2SSESTORAGE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8
+_NEON2SSE_INLINE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VRSRA.S8 q0,q0,#8
+{
+    int8x16_t shift;
+    shift = vrshrq_n_s8(b, c);
+    return vaddq_s8(a, shift);
+}
+
+_NEON2SSESTORAGE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16
+_NEON2SSE_INLINE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VRSRA.S16 q0,q0,#16
+{
+    int16x8_t shift;
+    shift = vrshrq_n_s16(b, c);
+    return vaddq_s16(a, shift);
+}
+
+_NEON2SSESTORAGE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32
+_NEON2SSE_INLINE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VRSRA.S32 q0,q0,#32
+{
+    int32x4_t shift;
+    shift = vrshrq_n_s32(b, c);
+    return vaddq_s32(a, shift);
+}
+
+_NEON2SSESTORAGE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64
+_NEON2SSE_INLINE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c)
+{
+    int64x2_t shift;
+    shift = vrshrq_n_s64(b, c);
+    return vaddq_s64(a, shift);
+}
+
+_NEON2SSESTORAGE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8
+_NEON2SSE_INLINE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VRSRA.U8 q0,q0,#8
+{
+    uint8x16_t shift;
+    shift = vrshrq_n_u8(b, c);
+    return vaddq_u8(a, shift);
+}
+
+_NEON2SSESTORAGE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.s16 q0,q0,#16
+_NEON2SSE_INLINE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VRSRA.s16 q0,q0,#16
+{
+    uint16x8_t shift;
+    shift = vrshrq_n_u16(b, c);
+    return vaddq_u16(a,  shift);
+}
+
+_NEON2SSESTORAGE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32
+_NEON2SSE_INLINE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VRSRA.U32 q0,q0,#32
+{
+    uint32x4_t shift;
+    shift = vrshrq_n_u32(b, c);
+    return vaddq_u32(a, shift);
+}
+
+_NEON2SSESTORAGE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64
+_NEON2SSE_INLINE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c)
+{
+    uint64x2_t shift;
+    shift = vrshrq_n_u64(b, c);
+    return vaddq_u64(a, shift);
+}
+
+//**********************Vector saturating shift left by constant *****************************
+//********************************************************************************************
+//we don't check const ranges, assuming they are met
+_NEON2SSESTORAGE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0
+_NEON2SSE_INLINE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHL.S8 d0,d0,#0
+{
+    //no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in packs function)
+    int8x8_t res64;
+    __m128i a128, r128;
+    a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
+    r128 = _mm_slli_epi16 (a128, b);
+    r128 = _mm_packs_epi16 (r128,r128); //saturated s8, use 64 low bits only
+    return64(r128);
+}
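+
+// Illustrative scalar model of the widen/shift/pack approach above (a sketch, not part of the
+// implementation): the pack instruction supplies the saturation for free. The helper name is
+// hypothetical; b is assumed to be in [0,7].
+_NEON2SSE_INLINE int8_t _neon2sse_qshl_s8_scalar_sketch(int8_t a, int b)
+{
+    int wide = (int)a * (1 << b);           // exact result of a << b, no overflow for b <= 7
+    if (wide > 127) return (int8_t)127;     // upper clamp, as done by _mm_packs_epi16
+    if (wide < -128) return (int8_t)(-128); // lower clamp
+    return (int8_t)wide;
+}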
+
+_NEON2SSESTORAGE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0
+_NEON2SSE_INLINE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHL.S16 d0,d0,#0
+{
+    // go to 32 bit to get the auto saturation (in packs function)
+    int16x4_t res64;
+    __m128i a128, r128;
+    a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1
+    r128 = _mm_slli_epi32 (a128, b); //shift_res
+    r128 = _mm_packs_epi32 (r128,r128); //saturated s16, use 64 low bits only
+    return64(r128);
+}
+
+_NEON2SSESTORAGE int32x2_t vqshl_n_s32(int32x2_t a,  __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0
+_NEON2SSE_INLINE int32x2_t vqshl_n_s32(int32x2_t a,  __constrange(0,31) int b)
+{
+    //serial execution may be faster
+    int32x2_t res64;
+    return64(vqshlq_n_s32 (_pM128i(a), b));
+}
+
+
+_NEON2SSESTORAGE int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    // no effective SIMD solution here
+    int64x1_t res;
+    int64_t bmask;
+    int64_t a_i64 = *( int64_t*)&a;
+    bmask = ( int64_t)1 << (63 - b); //positive
+    if (a_i64 >= bmask) {
+        res.m64_i64[0] = ~(_SIGNBIT64);
+    } else {
+        res.m64_i64[0]  = (a_i64 <= -bmask) ? (int64_t)_SIGNBIT64 : a_i64 << b;
+    }
+    return res;
+}
+
+
+_NEON2SSESTORAGE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0
+_NEON2SSE_INLINE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b) // VQSHL.U8 d0,d0,#0
+{
+    //no 8 bit shift available in IA32 SIMD, go to 16 bit
+    uint8x8_t res64;
+    __m128i a128, r128;
+    a128 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
+    r128 = _mm_slli_epi16 (a128, b); //shift_res
+    r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only
+    return64(r128);
+}
+
+_NEON2SSESTORAGE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.s16 d0,d0,#0
+_NEON2SSE_INLINE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b) // VQSHL.s16 d0,d0,#0
+{
+    // go to 32 bit to get the auto saturation (in packus function)
+    uint16x4_t res64;
+    __m128i a128, r128;
+    a128 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE 4.1
+    r128 = _mm_slli_epi32 (a128, b); //shift_res
+    r128 = _MM_PACKUS1_EPI32 (r128); //saturated s16
+    return64(r128);
+}
+
+_NEON2SSESTORAGE uint32x2_t vqshl_n_u32(uint32x2_t a,  __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0
+_NEON2SSE_INLINE uint32x2_t vqshl_n_u32(uint32x2_t a,  __constrange(0,31) int b)
+{
+    uint32x2_t res64;
+    return64(vqshlq_n_u32(_pM128i(a), b));
+}
+
+_NEON2SSESTORAGE uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    // no effective SIMD solution here
+    uint64x1_t res;
+    uint64_t bmask;
+    uint64_t a_i64 = *(uint64_t*)&a;
+    bmask = ( uint64_t)1 << (64 - b);
+    res.m64_u64[0] = (a_i64 >= bmask)&&(b>0) ? 0xffffffffffffffff : a_i64 << b; //if b=0 we are fine with any a
+    return res;
+}
+
+_NEON2SSESTORAGE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0
+_NEON2SSE_INLINE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHL.S8 q0,q0,#0
+{
+    // go to 16 bit to get the auto saturation (in packs function)
+    __m128i a128, r128_1, r128_2;
+    a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1
+    r128_1 = _mm_slli_epi16 (a128, b);
+    //swap hi and low part of a128 to process the remaining data
+    a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
+    a128 = _MM_CVTEPI8_EPI16 (a128);
+    r128_2 = _mm_slli_epi16 (a128, b);
+    return _mm_packs_epi16 (r128_1, r128_2); //saturated s8
+}
+
+_NEON2SSESTORAGE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0
+_NEON2SSE_INLINE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHL.S16 q0,q0,#0
+{
+    // manual saturation solution looks LESS optimal than 32 bits conversion one
+    // go to 32 bit to get the auto saturation (in packs function)
+    __m128i a128, r128_1, r128_2;
+    a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1
+    r128_1 = _mm_slli_epi32 (a128, b); //shift_res
+    //swap hi and low part of a128 to process the remaining data
+    a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
+    a128 = _MM_CVTEPI16_EPI32 (a128);
+    r128_2 = _mm_slli_epi32 (a128, b);
+    return _mm_packs_epi32 (r128_1, r128_2); //saturated s16
+}
+
+_NEON2SSESTORAGE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0
+_NEON2SSE_INLINE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHL.S32 q0,q0,#0
+{
+    // no 64 bit saturation option available, special tricks necessary
+    __m128i c1, maskA, saturation_mask, c7ffffff_mask, shift_res, shift_res_mask;
+    c1 = _mm_cmpeq_epi32(a,a); //0xff..ff
+    maskA = _mm_srli_epi32(c1, b + 1); //mask for positive numbers: (b+1) leading zeros and (31-b) ones, i.e. 2^(31-b) - 1
+    saturation_mask = _mm_cmpgt_epi32 (a, maskA); //0xff...ff if we need saturation, 0  otherwise
+    c7ffffff_mask  = _mm_srli_epi32(saturation_mask, 1); //saturated to 0x7f..ff when needed and zeros if not
+    shift_res = _mm_slli_epi32 (a, b);
+    shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res);
+    //result with positive numbers saturated
+    shift_res = _mm_or_si128 (c7ffffff_mask, shift_res_mask);
+    //treat negative numbers
+    maskA = _mm_slli_epi32(c1, 31 - b); //mask for negative numbers: (b+1) leading ones and (31-b) trailing zeros
+    saturation_mask = _mm_cmpgt_epi32 (maskA,a); //0xff...ff if we need saturation, 0  otherwise
+    c7ffffff_mask  = _mm_slli_epi32(saturation_mask, 31); //saturated to 0x80..00 when needed and zeros if not
+    shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res);
+    return _mm_or_si128 (c7ffffff_mask, shift_res_mask);
+}
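+
+// Scalar statement of the rule the mask sequence above implements (sketch only; the helper name
+// is hypothetical and b is assumed to be in [0,31]): compute the exact shift in 64 bits and
+// clamp the result to the signed 32-bit range.
+_NEON2SSE_INLINE int32_t _neon2sse_qshl_s32_scalar_sketch(int32_t a, int b)
+{
+    int64_t wide = (int64_t)a * ((int64_t)1 << b);              // exact value of a << b
+    if (wide > 0x7fffffffLL) return (int32_t)0x7fffffff;        // positive saturation
+    if (wide < -0x80000000LL) return (int32_t)(-0x7fffffff - 1); // negative saturation
+    return (int32_t)wide;
+}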
+
+_NEON2SSESTORAGE int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    // no effective SIMD solution here
+    _NEON2SSE_ALIGN_16 int64_t atmp[2], res[2];
+    int64_t bmask;
+    int i;
+    bmask = ( int64_t)1 << (63 - b); //positive
+    _mm_store_si128((__m128i*)atmp, a);
+    for (i = 0; i<2; i++) {
+        if (atmp[i] >= bmask) {
+            res[i] = ~(_SIGNBIT64);
+        } else {
+            res[i] = (atmp[i] <= -bmask) ? (int64_t)_SIGNBIT64 : atmp[i] << b;
+        }
+    }
+    return _mm_load_si128((__m128i*)res);
+}
+
+_NEON2SSESTORAGE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0
+_NEON2SSE_INLINE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b) // VQSHL.U8 q0,q0,#0
+{
+    // go to 16 bit to get the auto saturation (in packs function)
+    __m128i a128, r128_1, r128_2;
+    a128 = _MM_CVTEPU8_EPI16 (a); //SSE 4.1
+    r128_1 = _mm_slli_epi16 (a128, b);
+    //swap hi and low part of a128 to process the remaining data
+    a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
+    a128 = _MM_CVTEPU8_EPI16 (a128);
+    r128_2 = _mm_slli_epi16 (a128, b);
+    return _mm_packus_epi16 (r128_1, r128_2); //saturated u8
+}
+
+_NEON2SSESTORAGE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.s16 q0,q0,#0
+_NEON2SSE_INLINE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b) // VQSHL.s16 q0,q0,#0
+{
+    // manual saturation solution looks more optimal than 32 bits conversion one
+    __m128i cb, c8000, a_signed, saturation_mask,  shift_res;
+    cb = _mm_set1_epi16((1 << (16 - b)) - 1 - 0x8000 );
+    c8000 = _mm_set1_epi16 (-32768); // (int16_t)0x8000
+//no unsigned shorts comparison in SSE, only signed available, so need the trick
+    a_signed = _mm_sub_epi16(a, c8000); //go to signed
+    saturation_mask = _mm_cmpgt_epi16 (a_signed, cb);
+    shift_res = _mm_slli_epi16 (a, b);
+    return _mm_or_si128 (shift_res, saturation_mask);
+}
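+
+// The subtraction of 0x8000 above is the usual bias trick: SSE2 only has signed compares, and
+// (a - 0x8000) > (b - 0x8000) as signed 16-bit is equivalent to a > b as unsigned 16-bit.
+// A standalone sketch of the trick (the function name is hypothetical and unused elsewhere):
+_NEON2SSE_INLINE __m128i _neon2sse_cmpgt_epu16_sketch(__m128i a, __m128i b)
+{
+    __m128i bias = _mm_set1_epi16((int16_t)0x8000);
+    return _mm_cmpgt_epi16(_mm_sub_epi16(a, bias), _mm_sub_epi16(b, bias));
+}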
+
+_NEON2SSESTORAGE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0
+_NEON2SSE_INLINE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b) // VQSHL.U32 q0,q0,#0
+{
+    // manual saturation solution, no 64 bit saturation option, the serial version may be faster
+    __m128i cb, c80000000, a_signed, saturation_mask,  shift_res;
+    cb = _mm_set1_epi32((1 << (32 - b)) - 1 - 0x80000000 );
+    c80000000 = _mm_set1_epi32 (0x80000000);
+//no unsigned ints comparison in SSE, only signed available, so need the trick
+    a_signed = _mm_sub_epi32(a, c80000000); //go to signed
+    saturation_mask = _mm_cmpgt_epi32 (a_signed, cb);
+    shift_res = _mm_slli_epi32 (a, b);
+    return _mm_or_si128 (shift_res, saturation_mask);
+}
+
+_NEON2SSESTORAGE uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    // no effective SIMD solution here
+    _NEON2SSE_ALIGN_16 uint64_t atmp[2], res[2];
+    uint64_t bmask;
+    int i;
+    bmask = ( uint64_t)1 << (64 - b);
+    _mm_store_si128((__m128i*)atmp, a);
+    for (i = 0; i<2; i++) {
+        res[i] = (atmp[i] >= bmask)&&(b>0) ? 0xffffffffffffffff : atmp[i] << b; //if b=0 we are fine with any a
+    }
+    return _mm_load_si128((__m128i*)res);
+}
+
+//**************Vector signed->unsigned saturating shift left by constant *************
+//*************************************************************************************
+_NEON2SSESTORAGE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0
+_NEON2SSE_INLINE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHLU.S8 d0,d0,#0
+{
+    //no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in packs function)
+    uint8x8_t res64;
+    __m128i a128, r128;
+    a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
+    r128 = _mm_slli_epi16 (a128, b);
+    r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only
+    return64(r128);
+}
+
+_NEON2SSESTORAGE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0
+_NEON2SSE_INLINE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHLU.S16 d0,d0,#0
+{
+    uint16x4_t res64;
+    __m128i a128, r128;
+    a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1
+    r128 = _mm_slli_epi32 (a128, b); //shift_res
+    r128 = _MM_PACKUS1_EPI32 (r128); //saturated s16, use 64 low bits only
+    return64(r128);
+}
+
+_NEON2SSESTORAGE uint32x2_t vqshlu_n_s32(int32x2_t a,  __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0
+_NEON2SSE_INLINE uint32x2_t vqshlu_n_s32(int32x2_t a,  __constrange(0,31) int b)
+{
+    uint32x2_t res64;
+    return64( vqshluq_n_s32(_pM128i(a), b));
+}
+
+_NEON2SSESTORAGE uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) // no effective SIMD solution here, serial execution looks faster
+{
+    uint64x1_t res;
+    uint64_t limit;
+    if (a.m64_i64[0]<=0) {
+        res.m64_u64[0] = 0;
+    } else {
+        limit = (uint64_t) 1 << (64 - b);
+        res.m64_u64[0] = ( ((uint64_t)a.m64_i64[0]) >= limit) ? ~((uint64_t)0) : (uint64_t)a.m64_i64[0] << b;
+    }
+    return res;
+}
+
+_NEON2SSESTORAGE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0
+_NEON2SSE_INLINE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHLU.S8 q0,q0,#0
+{
+    __m128i a128, r128_1, r128_2;
+    a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1
+    r128_1 = _mm_slli_epi16 (a128, b);
+    //swap hi and low part of a128 to process the remaining data
+    a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
+    a128 = _MM_CVTEPI8_EPI16 (a128);
+    r128_2 = _mm_slli_epi16 (a128, b);
+    return _mm_packus_epi16 (r128_1, r128_2); //saturated u8
+}
+
+_NEON2SSESTORAGE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0
+_NEON2SSE_INLINE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHLU.S16 q0,q0,#0
+{
+    // manual saturation solution looks LESS optimal than 32 bits conversion one
+    __m128i a128, r128_1, r128_2;
+    a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1
+    r128_1 = _mm_slli_epi32 (a128, b); //shift_res
+    //swap hi and low part of a128 to process the remaining data
+    a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
+    a128 = _MM_CVTEPI16_EPI32 (a128);
+    r128_2 = _mm_slli_epi32 (a128, b);
+    return _MM_PACKUS_EPI32 (r128_1, r128_2); //saturated s16
+}
+
+_NEON2SSESTORAGE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0
+_NEON2SSE_INLINE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHLU.S32 q0,q0,#0
+{
+    //solution may be  not optimal compared with the serial one
+    __m128i zero, maskA, maskGT0, a0,  a_masked, a_shift;
+    zero = _mm_setzero_si128();
+    maskA = _mm_cmpeq_epi32(a, a);
+    maskA = _mm_slli_epi32(maskA,(32 - b)); // b ones and (32-b)zeros
+    //saturate negative numbers to zero
+    maskGT0   = _mm_cmpgt_epi32 (a, zero); // //0xffffffff if positive number and zero otherwise (negative numbers)
+    a0 = _mm_and_si128 (a,  maskGT0); //negative are zeros now
+    //saturate positive to 0xffffffff
+    a_masked = _mm_and_si128 (a0, maskA);
+    a_masked = _mm_cmpgt_epi32 (a_masked, zero); //0xffffffff if saturation necessary 0 otherwise
+    a_shift = _mm_slli_epi32 (a0, b);
+    return _mm_or_si128 (a_shift, a_masked); //actual saturation
+}
+
+_NEON2SSESTORAGE uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    // no effective SIMD solution here, serial execution looks faster
+    _NEON2SSE_ALIGN_16 int64_t atmp[2];
+    _NEON2SSE_ALIGN_16 uint64_t res[2];
+    uint64_t limit;
+    int i;
+    _mm_store_si128((__m128i*)atmp, a);
+    for (i = 0; i<2; i++) {
+        if (atmp[i]<=0) {
+            res[i] = 0;
+        } else {
+            limit = (uint64_t) 1 << (64 - b);
+            res[i] = ( ((uint64_t)atmp[i]) >= limit) ? ~((uint64_t)0) : (uint64_t)atmp[i] << b;
+        }
+    }
+    return _mm_load_si128((__m128i*)res);
+}
+
+//************** Vector narrowing  shift right by constant **************
+//**********************************************************************
+_NEON2SSESTORAGE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
+_NEON2SSE_INLINE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8
+{
+    int8x8_t res64;
+    __m128i r16;
+    r16  = vshrq_n_s16(a,b);
+    r16  = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
+    return64(r16);
+}
+
+_NEON2SSESTORAGE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
+_NEON2SSE_INLINE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16
+{
+    int16x4_t res64;
+    __m128i r32;
+    r32  = vshrq_n_s32(a,b);
+    r32  =  _mm_shuffle_epi8 (r32, *(__m128i*) mask8_32_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
+    return64(r32);
+}
+
+_NEON2SSESTORAGE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
+_NEON2SSE_INLINE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b)
+{
+    int32x2_t res64;
+    __m128i r64;
+    r64  = vshrq_n_s64(a,b);
+    r64  = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
+    return64(r64);
+}
+
+_NEON2SSESTORAGE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
+_NEON2SSE_INLINE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8
+{
+    uint8x8_t res64;
+    __m128i mask, r16;
+    mask = _mm_set1_epi16(0xff);
+    r16  = vshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
+    r16 = _mm_and_si128(r16, mask); //to avoid saturation
+    r16 = _mm_packus_epi16 (r16,r16); //narrow, use low 64 bits only
+    return64(r16);
+}
+
+_NEON2SSESTORAGE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
+_NEON2SSE_INLINE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16
+{
+    uint16x4_t res64;
+    __m128i mask, r32;
+    mask = _mm_set1_epi32(0xffff);
+    r32  = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
+    r32 = _mm_and_si128(r32, mask); //to avoid saturation
+    r32 =  _MM_PACKUS1_EPI32 (r32); //saturate and  narrow, use low 64 bits only
+    return64(r32);
+}
+
+_NEON2SSESTORAGE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
+_NEON2SSE_INLINE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
+{
+    uint32x2_t res64;
+    __m128i r64;
+    r64  = vshrq_n_u64(a,b);
+    r64  = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
+    return64(r64);
+}
+
+//************** Vector signed->unsigned narrowing saturating shift right by constant ********
+//*********************************************************************************************
+_NEON2SSESTORAGE uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8
+_NEON2SSE_INLINE uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRUN.S16 d0,q0,#8
+{
+    uint8x8_t res64;
+    __m128i r16;
+    r16  = vshrq_n_s16(a,b);
+    r16 = _mm_packus_epi16 (r16,r16); //saturate and  narrow (signed to unsigned), use low 64 bits only
+    return64(r16);
+}
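+
+// Per-lane scalar sketch of the signed-to-unsigned narrowing above (illustrative only; the
+// helper name is hypothetical, b assumed in [1,8]): _mm_packus_epi16 clamps negative values
+// to 0 and values above 255 to 255, which is exactly the VQSHRUN saturation.
+_NEON2SSE_INLINE uint8_t _neon2sse_qshrun_s16_scalar_sketch(int16_t a, int b)
+{
+    int s = (int)a >> b;        // arithmetic shift right
+    if (s < 0) return 0;        // negative -> 0
+    if (s > 255) return 255;    // overflow -> 255
+    return (uint8_t)s;
+}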
+
+_NEON2SSESTORAGE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16
+_NEON2SSE_INLINE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRUN.S32 d0,q0,#16
+{
+    uint16x4_t res64;
+    __m128i r32;
+    r32  = vshrq_n_s32(a,b);
+    r32  = _MM_PACKUS1_EPI32 (r32); //saturate and  narrow(signed to unsigned), use low 64 bits only
+    return64(r32);
+}
+
+_NEON2SSESTORAGE uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster
+{
+    _NEON2SSE_ALIGN_16 int64_t atmp[2];
+    uint32x2_t res;
+    int64_t res64;
+    _mm_store_si128((__m128i*)atmp, a);
+    if (atmp[0] < 0) {
+        res.m64_u32[0] = 0;
+    } else {
+        res64 = (atmp[0] >> b);
+        res.m64_u32[0] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t) res64;
+    }
+    if (atmp[1] < 0) {
+        res.m64_u32[1] = 0;
+    } else {
+        res64 = (atmp[1] >> b);
+        res.m64_u32[1] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t)res64;
+    }
+    return res;
+}
+
+//**** Vector signed->unsigned rounding narrowing saturating shift right by constant *****
+_NEON2SSESTORAGE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8
+_NEON2SSE_INLINE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRUN.S16 d0,q0,#8
+{
+    //solution may be not optimal compared with the serial one
+    __m128i r16;
+    uint8x8_t res64;
+    r16 = vrshrq_n_s16(a,b);
+    r16 =  _mm_packus_epi16 (r16,r16); //saturate and  narrow (signed to unsigned), use low 64 bits only
+    return64(r16);
+}
+
+_NEON2SSESTORAGE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16
+_NEON2SSE_INLINE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRUN.S32 d0,q0,#16
+{
+    //solution may be not optimal compared with the serial one
+    __m128i r32;
+    uint16x4_t res64;
+    r32 = vrshrq_n_s32(a,b);
+    r32 =  _MM_PACKUS1_EPI32 (r32); //saturate and  narrow (signed to unsigned), use low 64 bits only
+    return64(r32);
+}
+
+_NEON2SSESTORAGE uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster
+{
+    _NEON2SSE_ALIGN_16 int64_t atmp[2];
+    uint32x2_t res;
+    int64_t res64;
+    _mm_store_si128((__m128i*)atmp, a);
+    if (atmp[0] < 0) {
+        res.m64_u32[0] = 0;
+    } else {
+        res64 = (atmp[0] >> b) + ( (atmp[0] & ((int64_t)1 << (b - 1))) >> (b - 1)  );
+        res.m64_u32[0] = (uint32_t) ((res64 > (int64_t)0xffffffff ) ? 0xffffffff : res64);
+    }
+    if (atmp[1] < 0) {
+        res.m64_u32[1] = 0;
+    } else {
+        res64 = (atmp[1] >> b) + ( (atmp[1] & ((int64_t)1 << (b - 1))) >> (b - 1)  );
+        res.m64_u32[1] = (uint32_t)((res64 > (int64_t)0xffffffff ) ? 0xffffffff : res64);
+    }
+    return res;
+}
+
+//***** Vector narrowing saturating shift right by constant ******
+//*****************************************************************
+_NEON2SSESTORAGE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8
+_NEON2SSE_INLINE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRN.S16 d0,q0,#8
+{
+    int8x8_t res64;
+    __m128i r16;
+    r16  = vshrq_n_s16(a,b);
+    r16  = _mm_packs_epi16 (r16,r16); //saturate and  narrow, use low 64 bits only
+    return64(r16);
+}
+
+_NEON2SSESTORAGE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16
+_NEON2SSE_INLINE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRN.S32 d0,q0,#16
+{
+    int16x4_t res64;
+    __m128i r32;
+    r32  = vshrq_n_s32(a,b);
+    r32  = _mm_packs_epi32 (r32,r32); //saturate and  narrow, use low 64 bits only
+    return64(r32);
+}
+
+_NEON2SSESTORAGE int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
+{
+    //no optimal SIMD solution found
+    _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2];
+    int32x2_t res;
+    _mm_store_si128((__m128i*)atmp, a);
+    res64[0] = (atmp[0] >> b);
+    res64[1] = (atmp[1] >> b);
+    if(res64[0]>SINT_MAX) res64[0] = SINT_MAX;
+    if(res64[0]<SINT_MIN) res64[0] = SINT_MIN;
+    if(res64[1]>SINT_MAX) res64[1] = SINT_MAX;
+    if(res64[1]<SINT_MIN) res64[1] = SINT_MIN;
+    res.m64_i32[0] = (int32_t)res64[0];
+    res.m64_i32[1] = (int32_t)res64[1];
+    return res;
+}
+
+_NEON2SSESTORAGE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.s16 d0,q0,#8
+_NEON2SSE_INLINE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQSHRN.s16 d0,q0,#8
+{
+    uint8x8_t res64;
+    __m128i r16;
+    r16  = vshrq_n_u16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
+    r16  = _mm_packus_epi16 (r16,r16); //saturate and  narrow, use low 64 bits only
+    return64(r16);
+}
+
+_NEON2SSESTORAGE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16
+_NEON2SSE_INLINE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQSHRN.U32 d0,q0,#16
+{
+    uint16x4_t res64;
+    __m128i r32;
+    r32  = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 8)
+    r32  = _MM_PACKUS1_EPI32 (r32); //saturate and  narrow, use low 64 bits only
+    return64(r32);
+}
+
+_NEON2SSESTORAGE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32
+_NEON2SSE_INLINE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
+{
+    //serial solution may be faster
+    uint32x2_t res64;
+    __m128i r64, res_hi, zero;
+    zero = _mm_setzero_si128();
+    r64  = vshrq_n_u64(a,b);
+    res_hi = _mm_srli_epi64(r64,  32);
+    res_hi = _mm_cmpgt_epi32(res_hi, zero);
+    r64 = _mm_or_si128(r64, res_hi);
+    r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
+    return64(r64);
+}
+
+
+//********* Vector rounding narrowing shift right by constant *************************
+//****************************************************************************************
+_NEON2SSESTORAGE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
+_NEON2SSE_INLINE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8
+{
+    int8x8_t res64;
+    __m128i r16;
+     r16  = vrshrq_n_s16(a,b);
+    r16  = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
+    return64(r16);
+}
+
+_NEON2SSESTORAGE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
+_NEON2SSE_INLINE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16
+{
+    int16x4_t res64;
+    __m128i r32;
+    r32  = vrshrq_n_s32(a,b);
+    r32  =  _mm_shuffle_epi8 (r32, *(__m128i*) mask8_32_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
+    return64(r32);
+}
+
+_NEON2SSESTORAGE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
+_NEON2SSE_INLINE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b)
+{
+    int32x2_t res64;
+    __m128i r64;
+    r64  = vrshrq_n_s64(a,b);
+    r64  = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
+    return64(r64);
+}
+
+_NEON2SSESTORAGE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
+_NEON2SSE_INLINE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8
+{
+    uint8x8_t res64;
+    __m128i mask, r16;
+    mask = _mm_set1_epi16(0xff);
+    r16  = vrshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
+    r16 = _mm_and_si128(r16, mask); //to avoid saturation
+    r16 = _mm_packus_epi16 (r16,r16); //saturate and  narrow, use low 64 bits only
+    return64(r16);
+}
+
+_NEON2SSESTORAGE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
+_NEON2SSE_INLINE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16
+{
+    uint16x4_t res64;
+    __m128i mask, r32;
+    mask = _mm_set1_epi32(0xffff);
+    r32  = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 8)
+    r32 = _mm_and_si128(r32, mask); //to avoid saturation
+    r32 = _MM_PACKUS1_EPI32 (r32); //saturate and  narrow, use low 64 bits only
+    return64(r32);
+}
+
+_NEON2SSESTORAGE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
+_NEON2SSE_INLINE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b) //serial solution may be faster
+{
+    uint32x2_t res64;
+    __m128i r64;
+    r64  = vrshrq_n_u64(a,b);
+    r64  =  _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
+    return64(r64);
+}
+
+//************* Vector rounding narrowing saturating shift right by constant ************
+//****************************************************************************************
+_NEON2SSESTORAGE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8
+_NEON2SSE_INLINE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRN.S16 d0,q0,#8
+{
+    int8x8_t res64;
+    __m128i r16;
+    r16  = vrshrq_n_s16(a,b);
+    r16  =  _mm_packs_epi16 (r16,r16); //saturate and  narrow, use low 64 bits only
+    return64(r16);
+}
+
+_NEON2SSESTORAGE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16
+_NEON2SSE_INLINE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRN.S32 d0,q0,#16
+{
+    int16x4_t res64;
+    __m128i r32;
+    r32  = vrshrq_n_s32(a,b);
+    r32  = _mm_packs_epi32 (r32,r32); //saturate and  narrow, use low 64 bits only
+    return64(r32);
+}
+
+_NEON2SSESTORAGE int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
+{
+    //no optimal SIMD solution found
+    _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2], maskb[2];
+    int32x2_t res;
+    _mm_store_si128((__m128i*)atmp, a);
+    maskb[0] = atmp[0] & (( int64_t)1 << (b - 1));
+    res64[0] = (atmp[0] >> b) + (maskb[0] >> (b - 1)); //rounded result
+    maskb[1] = atmp[1] & (( int64_t)1 << (b - 1));
+    res64[1] = (atmp[1] >> b) + (maskb[1] >> (b - 1)); //rounded result
+    if(res64[0]>SINT_MAX) res64[0] = SINT_MAX;
+    if(res64[0]<SINT_MIN) res64[0] = SINT_MIN;
+    if(res64[1]>SINT_MAX) res64[1] = SINT_MAX;
+    if(res64[1]<SINT_MIN) res64[1] = SINT_MIN;
+    res.m64_i32[0] = (int32_t)res64[0];
+    res.m64_i32[1] = (int32_t)res64[1];
+    return res;
+}
+
+_NEON2SSESTORAGE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.s16 d0,q0,#8
+_NEON2SSE_INLINE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQRSHRN.s16 d0,q0,#8
+{
+    uint8x8_t res64;
+    __m128i r16;
+    r16  = vrshrq_n_u16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
+    r16  = _mm_packus_epi16 (r16,r16); //saturate and  narrow, use low 64 bits only
+    return64(r16);
+}
+
+_NEON2SSESTORAGE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16
+_NEON2SSE_INLINE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQRSHRN.U32 d0,q0,#16
+{
+    uint16x4_t res64;
+    __m128i r32;
+    r32  = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 8)
+    r32  = _MM_PACKUS1_EPI32 (r32); //saturate and  narrow, use low 64 bits only
+    return64(r32);
+}
+
+_NEON2SSESTORAGE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32
+_NEON2SSE_INLINE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
+{
+    //serial solution may be faster
+    uint32x2_t res64;
+    __m128i r64, res_hi, zero;
+    zero = _mm_setzero_si128();
+    r64  = vrshrq_n_u64(a,b);
+    res_hi = _mm_srli_epi64(r64,  32);
+    res_hi = _mm_cmpgt_epi32(res_hi, zero);
+    r64 = _mm_or_si128(r64, res_hi);
+    r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
+    return64(r64);
+}
+
+//************** Vector widening shift left by constant ****************
+//************************************************************************
+_NEON2SSESTORAGE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0
+_NEON2SSE_INLINE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b) // VSHLL.S8 q0,d0,#0
+{
+    __m128i r;
+    r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
+    return _mm_slli_epi16 (r, b);
+}
+
+_NEON2SSESTORAGE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0
+_NEON2SSE_INLINE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b) // VSHLL.S16 q0,d0,#0
+{
+    __m128i r;
+    r =  _MM_CVTEPI16_EPI32(_pM128i(a)); //SSE4.1,
+    return _mm_slli_epi32 (r, b);
+}
+
+_NEON2SSESTORAGE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0
+_NEON2SSE_INLINE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b) // VSHLL.S32 q0,d0,#0
+{
+    __m128i r;
+    r =  _MM_CVTEPI32_EPI64(_pM128i(a)); //SSE4.1,
+    return _mm_slli_epi64 (r, b);
+}
+
+_NEON2SSESTORAGE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0
+_NEON2SSE_INLINE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b) // VSHLL.U8 q0,d0,#0
+{
+    //no uint8 to uint16 conversion available, manual conversion used
+    __m128i zero,  r;
+    zero = _mm_setzero_si128 ();
+    r = _mm_unpacklo_epi8(_pM128i(a), zero);
+    return _mm_slli_epi16 (r, b);
+}
+
+_NEON2SSESTORAGE uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.s16 q0,d0,#0
+_NEON2SSE_INLINE uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b) // VSHLL.s16 q0,d0,#0
+{
+    //no uint16 to uint32 conversion available, manual conversion used
+    __m128i zero,  r;
+    zero = _mm_setzero_si128 ();
+    r = _mm_unpacklo_epi16(_pM128i(a), zero);
+    return _mm_slli_epi32 (r, b);
+}
+
+_NEON2SSESTORAGE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0
+_NEON2SSE_INLINE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b) // VSHLL.U32 q0,d0,#0
+{
+    //no uint32 to uint64 conversion available, manual conversion used
+    __m128i zero,  r;
+    zero = _mm_setzero_si128 ();
+    r = _mm_unpacklo_epi32(_pM128i(a), zero);
+    return _mm_slli_epi64 (r, b);
+}
+
+//************************************************************************************
+//**************************** Shifts with insert ************************************
+//************************************************************************************
+//Takes each element in a vector, shifts it by an immediate value,
+//and inserts the result in the destination vector. Bits shifted out of each element are lost.
+
+//**************** Vector shift right and insert ************************************
+//Actually the "c" left bits from "a" are the only bits remained from "a"  after the shift.
+//All other bits are taken from b shifted.
+_NEON2SSESTORAGE int8x8_t vsri_n_s8(int8x8_t a,  int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
+_NEON2SSE_INLINE int8x8_t vsri_n_s8(int8x8_t a,  int8x8_t b, __constrange(1,8) int c)
+{
+    int8x8_t res64;
+    return64(vsriq_n_s8(_pM128i(a),_pM128i(b), c));
+}
+
+
+_NEON2SSESTORAGE int16x4_t vsri_n_s16(int16x4_t a,  int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
+_NEON2SSE_INLINE int16x4_t vsri_n_s16(int16x4_t a,  int16x4_t b, __constrange(1,16) int c)
+{
+    int16x4_t res64;
+    return64(vsriq_n_s16(_pM128i(a),_pM128i(b), c));
+}
+
+
+_NEON2SSESTORAGE int32x2_t vsri_n_s32(int32x2_t a,  int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
+_NEON2SSE_INLINE int32x2_t vsri_n_s32(int32x2_t a,  int32x2_t b, __constrange(1,32) int c)
+{
+    int32x2_t res64;
+    return64(vsriq_n_s32(_pM128i(a),_pM128i(b), c));
+}
+
+
+_NEON2SSESTORAGE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
+_NEON2SSE_INLINE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c)
+{
+    int64x1_t res;
+    if (c ==64)
+        res = a;
+    else{
+        res.m64_i64[0] = (b.m64_u64[0] >> c) | ((a.m64_i64[0] >> (64 - c)) << (64 - c)); //treat b as unsigned for shift to get leading zeros
+    }
+    return res;
+}
+
+_NEON2SSE_GLOBAL uint8x8_t vsri_n_u8(uint8x8_t a,  uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
+#define vsri_n_u8 vsri_n_s8
+
+_NEON2SSE_GLOBAL uint16x4_t vsri_n_u16(uint16x4_t a,  uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
+#define vsri_n_u16 vsri_n_s16
+
+_NEON2SSE_GLOBAL uint32x2_t vsri_n_u32(uint32x2_t a,  uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
+#define vsri_n_u32 vsri_n_s32
+
+
+_NEON2SSE_GLOBAL uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
+#define vsri_n_u64 vsri_n_s64
+
+_NEON2SSE_GLOBAL poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
+#define vsri_n_p8 vsri_n_u8
+
+_NEON2SSE_GLOBAL poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
+#define vsri_n_p16 vsri_n_u16
+
+_NEON2SSESTORAGE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
+_NEON2SSE_INLINE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRI.8 q0,q0,#8
+{
+    __m128i maskA, a_masked;
+    uint8x16_t b_shift;
+    _NEON2SSE_ALIGN_16 static const uint8_t maskLeft[9] = {0x0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; //"a" bits mask, 0 bit not used
+    maskA = _mm_set1_epi8(maskLeft[c]); // c ones and (8-c)zeros
+    a_masked = _mm_and_si128 (a, maskA);
+    b_shift = vshrq_n_u8( b, c); // c zeros on the left in b due to logical shift
+    return _mm_or_si128 (a_masked, b_shift); //combine (insert b into a)
+}
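+
+// What the mask table above achieves per 8-bit lane, written as a scalar sketch (the helper is
+// hypothetical, c assumed in [1,8]): keep the top c bits of a and fill the rest with b >> c.
+_NEON2SSE_INLINE uint8_t _neon2sse_sri_u8_scalar_sketch(uint8_t a, uint8_t b, int c)
+{
+    uint8_t keep_a = (uint8_t)(0xff << (8 - c)); // the c high bits of a survive
+    return (uint8_t)((a & keep_a) | (uint8_t)(b >> c));
+}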
+
+_NEON2SSESTORAGE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
+_NEON2SSE_INLINE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRI.16 q0,q0,#16
+{
+    //to cut "c" left bits from a we do shift right and then  shift back left providing c right zeros in a
+    uint16x8_t b_shift;
+    uint16x8_t a_c;
+    b_shift = vshrq_n_u16( b, c); // c zeros on the left in b due to logical shift
+    a_c = vshrq_n_u16( a, (16 - c));
+    a_c  = _mm_slli_epi16(a_c, (16 - c)); //logical shift provides right "c" bits zeros in a
+    return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
+}
+
+_NEON2SSESTORAGE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
+_NEON2SSE_INLINE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRI.32 q0,q0,#32
+{
+    //to cut "c" left bits from a we do shift right and then  shift back left providing c right zeros in a
+    uint32x4_t b_shift;
+    uint32x4_t a_c;
+    b_shift = vshrq_n_u32( b, c); // c zeros on the left in b due to logical shift
+    a_c = vshrq_n_u32( a, (32 - c));
+    a_c  = _mm_slli_epi32(a_c, (32 - c)); //logical shift provides right "c" bits zeros in a
+    return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
+}
+
+_NEON2SSESTORAGE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
+_NEON2SSE_INLINE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c)
+{
+    //serial solution may be faster
+    uint64x2_t b_shift;
+    uint64x2_t a_c;
+    b_shift = _mm_srli_epi64(b, c); // c zeros on the left in b due to logical shift
+    a_c = _mm_srli_epi64(a, (64 - c));
+    a_c  = _mm_slli_epi64(a_c, (64 - c)); //logical shift provides right "c" bits zeros in a
+    return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
+}
+
+_NEON2SSE_GLOBAL uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
+#define vsriq_n_u8 vsriq_n_s8
+
+_NEON2SSE_GLOBAL uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
+#define vsriq_n_u16 vsriq_n_s16
+
+_NEON2SSE_GLOBAL uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
+#define vsriq_n_u32 vsriq_n_s32
+
+_NEON2SSE_GLOBAL uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
+#define vsriq_n_u64 vsriq_n_s64
+
+_NEON2SSE_GLOBAL poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
+#define vsriq_n_p8 vsriq_n_u8
+
+_NEON2SSE_GLOBAL poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
+#define vsriq_n_p16 vsriq_n_u16
+
+//***** Vector shift left and insert *********************************************
+//*********************************************************************************
+//Actually the "c" right bits from "a" are the only bits remained from "a"  after the shift.
+//All other bits are taken from b shifted. Ending zeros are inserted in b in the shift proces. We need to combine "a" and "b shifted".
+_NEON2SSESTORAGE int8x8_t vsli_n_s8(int8x8_t a,  int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
+_NEON2SSE_INLINE int8x8_t vsli_n_s8(int8x8_t a,  int8x8_t b, __constrange(0,7) int c)
+{
+    int8x8_t res64;
+    return64(vsliq_n_s8(_pM128i(a),_pM128i(b), c));
+}
+
+
+_NEON2SSESTORAGE int16x4_t vsli_n_s16(int16x4_t a,  int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
+_NEON2SSE_INLINE int16x4_t vsli_n_s16(int16x4_t a,  int16x4_t b, __constrange(0,15) int c)
+{
+    int16x4_t res64;
+    return64(vsliq_n_s16(_pM128i(a),_pM128i(b), c));
+}
+
+
+_NEON2SSESTORAGE int32x2_t vsli_n_s32(int32x2_t a,  int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
+_NEON2SSE_INLINE int32x2_t vsli_n_s32(int32x2_t a,  int32x2_t b, __constrange(0,31) int c)
+{
+    int32x2_t res64;
+    return64(vsliq_n_s32(_pM128i(a),_pM128i(b), c));
+}
+
+_NEON2SSESTORAGE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
+_NEON2SSE_INLINE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c)
+{
+    int64x1_t res;
+    res.m64_i64[0] = (b.m64_i64[0] << c) | ((a.m64_u64[0] << (64 - c)) >> (64 - c)); //need to treat a as unsigned to get leading zeros
+    return res;
+}
+
+
+_NEON2SSE_GLOBAL uint8x8_t vsli_n_u8(uint8x8_t a,  uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
+#define vsli_n_u8 vsli_n_s8
+
+_NEON2SSE_GLOBAL uint16x4_t vsli_n_u16(uint16x4_t a,  uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
+#define vsli_n_u16 vsli_n_s16
+
+_NEON2SSE_GLOBAL uint32x2_t vsli_n_u32(uint32x2_t a,  uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
+#define vsli_n_u32 vsli_n_s32
+
+_NEON2SSE_GLOBAL uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
+#define vsli_n_u64 vsli_n_s64
+
+_NEON2SSE_GLOBAL poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
+#define vsli_n_p8 vsli_n_u8
+
+_NEON2SSE_GLOBAL poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
+#define vsli_n_p16 vsli_n_u16
+
+_NEON2SSESTORAGE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
+_NEON2SSE_INLINE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c) // VSLI.8 q0,q0,#0
+{
+    __m128i maskA, a_masked;
+    int8x16_t b_shift;
+    _NEON2SSE_ALIGN_16 static const uint8_t maskRight[8] = {0x0, 0x1, 0x3, 0x7, 0x0f, 0x1f, 0x3f, 0x7f}; //"a" bits mask
+    maskA = _mm_set1_epi8(maskRight[c]); // (8-c)zeros and c ones
+    b_shift = vshlq_n_s8( b, c);
+    a_masked = _mm_and_si128 (a, maskA);
+    return _mm_or_si128 (b_shift, a_masked); //combine (insert b into a)
+}
+
+_NEON2SSESTORAGE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
+_NEON2SSE_INLINE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c) // VSLI.16 q0,q0,#0
+{
+    //to cut "c" right bits from a we do shift left and then logical shift back right providing (16-c)zeros in a
+    int16x8_t b_shift;
+    int16x8_t a_c;
+    b_shift = vshlq_n_s16( b, c);
+    a_c = vshlq_n_s16( a, (16 - c));
+    a_c  = _mm_srli_epi16(a_c, (16 - c));
+    return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
+}
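+//For example, with c = 3 each 16-bit lane above becomes a_c = (a << 13) >> 13 (logical right shift),
+//i.e. only the low 3 bits of "a" survive before being OR-ed with (b << 3).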
+
+_NEON2SSESTORAGE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
+_NEON2SSE_INLINE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c) // VSLI.32 q0,q0,#0
+{
+    //this solution may not be optimal compared with the serial one
+    //to keep only the low "c" bits of "a" we shift left and then logically shift right again, leaving (32-c) zeros in the top bits of "a"
+    int32x4_t b_shift;
+    int32x4_t a_c;
+    b_shift = vshlq_n_s32( b, c);
+    a_c = vshlq_n_s32( a, (32 - c));
+    a_c  = _mm_srli_epi32(a_c, (32 - c));
+    return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
+}
+
+_NEON2SSESTORAGE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
+_NEON2SSE_INLINE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c) // VSLI.64 q0,q0,#0
+{
+    //this solution may not be optimal compared with the serial one
+    //to keep only the low "c" bits of "a" we shift left and then logically shift right again, leaving (64-c) zeros in the top bits of "a"
+    int64x2_t b_shift;
+    int64x2_t a_c;
+    b_shift = vshlq_n_s64( b, c);
+    a_c = vshlq_n_s64( a, (64 - c));
+    a_c  = _mm_srli_epi64(a_c, (64 - c));
+    return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
+}
+
+_NEON2SSE_GLOBAL uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
+#define vsliq_n_u8 vsliq_n_s8
+
+_NEON2SSE_GLOBAL uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
+#define vsliq_n_u16 vsliq_n_s16
+
+_NEON2SSE_GLOBAL uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
+#define vsliq_n_u32 vsliq_n_s32
+
+_NEON2SSE_GLOBAL uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
+#define vsliq_n_u64 vsliq_n_s64
+
+_NEON2SSE_GLOBAL poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
+#define vsliq_n_p8 vsliq_n_u8
+
+_NEON2SSE_GLOBAL poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
+#define vsliq_n_p16 vsliq_n_u16
+
+// ***********************************************************************************************
+// ****************** Loads and stores of a single vector ***************************************
+// ***********************************************************************************************
+//Performs loads and stores of a single vector of some type.
+//*******************************  Loads ********************************************************
+// ***********************************************************************************************
+//In the general case we assume ptr is NOT aligned and use __m128i _mm_loadu_si128 ((__m128i*) ptr).
+//On SSE3-capable systems, __m128i _mm_lddqu_si128 (__m128i const* p) may be advantageous for unaligned access:
+//it loads a 32-byte block aligned on a 16-byte boundary and extracts the 16 bytes corresponding to the unaligned access.
+//If ptr is aligned, __m128i _mm_load_si128 ((__m128i*) ptr) could be used instead, as the following macro does:
+#define LOAD_SI128(ptr) \
+        ( ((uintptr_t)(ptr) & 15) == 0 ) ? _mm_load_si128((__m128i*)(ptr)) : _mm_loadu_si128((__m128i*)(ptr))
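+//Usage sketch (hypothetical buffer name): given _NEON2SSE_ALIGN_16 uint8_t pixels[16];
+//    __m128i v = LOAD_SI128(pixels); //resolves to the _mm_load_si128 branch at runtime because pixels is 16-byte aligned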
+
+_NEON2SSE_GLOBAL uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
+#define vld1q_u8 LOAD_SI128
+
+_NEON2SSE_GLOBAL uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
+#define vld1q_u16 LOAD_SI128
+
+_NEON2SSE_GLOBAL uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
+#define vld1q_u32 LOAD_SI128
+
+_NEON2SSE_GLOBAL uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
+#define vld1q_u64 LOAD_SI128
+
+_NEON2SSE_GLOBAL int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
+#define vld1q_s8 LOAD_SI128
+
+_NEON2SSE_GLOBAL int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
+#define vld1q_s16 LOAD_SI128
+
+_NEON2SSE_GLOBAL int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
+#define vld1q_s32 LOAD_SI128
+
+_NEON2SSE_GLOBAL int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
+#define vld1q_s64 LOAD_SI128
+
+_NEON2SSE_GLOBAL float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0]
+// IA32 SIMD doesn't work with 16-bit floats currently, so we need to go to 32 bit and then work with two 128-bit registers
+/* _NEON2SSE_INLINE float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr)// VLD1.16 {d0, d1}, [r0]
+{__m128 f1 = _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]);
+__m128 f2;
+f2 = _mm_set_ps (ptr[7], ptr[6], ptr[5], ptr[4]);
+}*/
+
+_NEON2SSESTORAGE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
+_NEON2SSE_INLINE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr)
+{
+    if( (((uintptr_t)(ptr)) & 15 ) == 0 ) //16-byte aligned
+        return _mm_load_ps(ptr);
+    else
+        return _mm_loadu_ps(ptr);
+}
+
+_NEON2SSE_GLOBAL poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
+#define vld1q_p8  LOAD_SI128
+
+_NEON2SSE_GLOBAL poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
+#define vld1q_p16 LOAD_SI128
+
+_NEON2SSE_GLOBAL uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0]
+#define vld1_u8(ptr)  *((__m64_128*)(ptr)) //was _mm_loadl_epi64((__m128i*)(ptr))
+
+_NEON2SSE_GLOBAL uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0]
+#define vld1_u16 vld1_u8
+
+_NEON2SSE_GLOBAL uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0]
+#define vld1_u32 vld1_u8
+
+
+_NEON2SSE_GLOBAL uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
+#define vld1_u64 vld1_u8
+
+_NEON2SSE_GLOBAL int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0]
+#define vld1_s8 vld1_u8
+
+_NEON2SSE_GLOBAL int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0]
+#define vld1_s16 vld1_u16
+
+_NEON2SSE_GLOBAL int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0]
+#define vld1_s32 vld1_u32
+
+_NEON2SSE_GLOBAL int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
+#define vld1_s64 vld1_u64
+
+_NEON2SSE_GLOBAL float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0]
+// IA32 SIMD doesn't work with 16-bit floats currently, so we need to go to 32 bit, e.g. _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]);
+
+_NEON2SSESTORAGE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0]
+_NEON2SSE_INLINE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr)
+{
+    float32x2_t res;
+    res.m64_f32[0] = *(ptr);
+    res.m64_f32[1] = *(ptr + 1);
+    return res;
+}
+
+_NEON2SSE_GLOBAL poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0]
+#define vld1_p8 vld1_u8
+
+_NEON2SSE_GLOBAL poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0]
+#define vld1_p16 vld1_u16
+
+
+_NEON2SSESTORAGE float64x2_t vld1q_f64(__transfersize(4) float64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
+_NEON2SSE_INLINE float64x2_t vld1q_f64(__transfersize(4) float64_t const * ptr)
+{
+    if ((((uintptr_t)(ptr)) & 15) == 0) //16-byte aligned
+        return _mm_load_pd(ptr);
+    else
+        return _mm_loadu_pd(ptr);
+}
+
+
+//***********************************************************************************************************
+//******* Lane load functions - insert the data at  vector's given position (lane) *************************
+//***********************************************************************************************************
+_NEON2SSE_GLOBAL uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
+#define vld1q_lane_u8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
+
+_NEON2SSE_GLOBAL uint16x8_t vld1q_lane_u16(__transfersize(1)    uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
+#define vld1q_lane_u16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
+
+_NEON2SSE_GLOBAL uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
+#define vld1q_lane_u32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane)
+
+_NEON2SSE_GLOBAL uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
+#define vld1q_lane_u64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane)
+
+
+_NEON2SSE_GLOBAL int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
+#define vld1q_lane_s8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
+
+_NEON2SSE_GLOBAL int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
+#define vld1q_lane_s16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
+
+_NEON2SSE_GLOBAL int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
+#define vld1q_lane_s32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane)
+
+_NEON2SSE_GLOBAL float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
+//current IA SIMD doesn't support float16
+
+_NEON2SSESTORAGE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
+_NEON2SSE_INLINE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane)
+{
+    //we need to handle the case where ptr is not 16-byte aligned
+    __m128 p;
+    p = _mm_set1_ps(*(ptr));
+    return _MM_INSERT_PS(vec,  p, _INSERTPS_NDX(0, lane));
+}
+
+_NEON2SSE_GLOBAL int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
+#define vld1q_lane_s64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane)
+
+_NEON2SSE_GLOBAL poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
+#define vld1q_lane_p8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
+
+_NEON2SSE_GLOBAL poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
+#define vld1q_lane_p16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
+
+_NEON2SSESTORAGE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
+_NEON2SSE_INLINE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane)
+{
+    uint8x8_t res;
+    res = vec;
+    res.m64_u8[lane] = *(ptr);
+    return res;
+}
+
+_NEON2SSESTORAGE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
+_NEON2SSE_INLINE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane)
+{
+    uint16x4_t res;
+    res = vec;
+    res.m64_u16[lane] = *(ptr);
+    return res;
+}
+
+_NEON2SSESTORAGE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
+_NEON2SSE_INLINE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane)
+{
+    uint32x2_t res;
+    res = vec;
+    res.m64_u32[lane] = *(ptr);
+    return res;
+}
+
+_NEON2SSESTORAGE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0]
+_NEON2SSE_INLINE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane)
+{
+    uint64x1_t res;
+    UNREFERENCED_PARAMETER(vec);
+    UNREFERENCED_PARAMETER(lane);
+    res.m64_u64[0] = *(ptr);
+    return res;
+}
+
+
+_NEON2SSE_GLOBAL int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
+#define vld1_lane_s8(ptr, vec, lane) vld1_lane_u8((uint8_t*)ptr, vec, lane)
+
+_NEON2SSE_GLOBAL int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
+#define vld1_lane_s16(ptr, vec, lane) vld1_lane_u16((uint16_t*)ptr, vec, lane)
+
+_NEON2SSE_GLOBAL int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
+#define vld1_lane_s32(ptr, vec, lane) vld1_lane_u32((uint32_t*)ptr, vec, lane)
+
+_NEON2SSE_GLOBAL float16x4_t vld1_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
+//current IA SIMD doesn't support float16
+
+_NEON2SSESTORAGE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
+_NEON2SSE_INLINE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane)
+{
+    float32x2_t res;
+    res = vec;
+    res.m64_f32[lane] = *(ptr);
+    return res;
+}
+
+_NEON2SSE_GLOBAL int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0]
+#define vld1_lane_s64(ptr, vec, lane) vld1_lane_u64((uint64_t*)ptr, vec, lane)
+
+_NEON2SSE_GLOBAL poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
+#define vld1_lane_p8 vld1_lane_u8
+
+_NEON2SSE_GLOBAL poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
+#define vld1_lane_p16 vld1_lane_s16
+
+// ****************** Load single value ( set all lanes of vector with same value from memory)**********************
+// ******************************************************************************************************************
+_NEON2SSE_GLOBAL uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+#define vld1q_dup_u8(ptr) _mm_set1_epi8(*(ptr))
+
+_NEON2SSE_GLOBAL uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+#define vld1q_dup_u16(ptr) _mm_set1_epi16(*(ptr))
+
+_NEON2SSE_GLOBAL uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+#define vld1q_dup_u32(ptr) _mm_set1_epi32(*(ptr))
+
+_NEON2SSESTORAGE uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
+_NEON2SSE_INLINE uint64x2_t   vld1q_dup_u64(__transfersize(1) uint64_t const * ptr)
+{
+    _NEON2SSE_ALIGN_16 uint64_t val[2];
+
+    val[0] = *(ptr);
+    val[1] = *(ptr);
+
+    return LOAD_SI128(val);
+}
+
+_NEON2SSE_GLOBAL int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+#define vld1q_dup_s8(ptr) _mm_set1_epi8(*(ptr))
+
+_NEON2SSE_GLOBAL int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+#define vld1q_dup_s16(ptr) _mm_set1_epi16 (*(ptr))
+
+_NEON2SSE_GLOBAL int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+#define vld1q_dup_s32(ptr) _mm_set1_epi32 (*(ptr))
+
+_NEON2SSE_GLOBAL int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
+#define vld1q_dup_s64(ptr) vld1q_dup_u64((uint64_t*)ptr)
+
+_NEON2SSE_GLOBAL float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
+//current IA SIMD doesn't support float16, need to go to 32 bits
+
+_NEON2SSE_GLOBAL float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+#define vld1q_dup_f32(ptr) _mm_set1_ps (*(ptr))
+
+_NEON2SSE_GLOBAL poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+#define vld1q_dup_p8(ptr) _mm_set1_epi8(*(ptr))
+
+_NEON2SSE_GLOBAL poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+#define vld1q_dup_p16(ptr) _mm_set1_epi16 (*(ptr))
+
+_NEON2SSESTORAGE uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    uint8x8_t res;
+    int i;
+    for(i = 0; i<8; i++) {
+        res.m64_u8[i] =  *(ptr);
+    }
+    return res;
+}
+
+_NEON2SSESTORAGE uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    uint16x4_t res;
+    int i;
+    for(i = 0; i<4; i++) {
+        res.m64_u16[i] =  *(ptr);
+    }
+    return res;
+}
+
+_NEON2SSESTORAGE uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    uint32x2_t res;
+    res.m64_u32[0] = *(ptr);
+    res.m64_u32[1] = *(ptr);
+    return res;
+}
+
+_NEON2SSESTORAGE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
+_NEON2SSE_INLINE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr)
+{
+    uint64x1_t res;
+    res.m64_u64[0] = *(ptr);
+    return res;
+}
+
+_NEON2SSE_GLOBAL int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+#define vld1_dup_s8(ptr) vld1_dup_u8((uint8_t*)ptr)
+
+
+_NEON2SSE_GLOBAL int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+#define vld1_dup_s16(ptr) vld1_dup_u16((uint16_t*)ptr)
+
+
+_NEON2SSE_GLOBAL int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+#define vld1_dup_s32(ptr) vld1_dup_u32((uint32_t*)ptr)
+
+
+_NEON2SSE_GLOBAL int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
+#define vld1_dup_s64(ptr) vld1_dup_u64((uint64_t*)ptr)
+
+_NEON2SSE_GLOBAL float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
+//current IA SIMD doesn't support float16
+
+_NEON2SSESTORAGE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+_NEON2SSE_INLINE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr)
+{
+    float32x2_t res;
+    res.m64_f32[0] = *(ptr);
+    res.m64_f32[1] = res.m64_f32[0];
+    return res; // use last 64bits only
+}
+
+_NEON2SSE_GLOBAL poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+#define vld1_dup_p8 vld1_dup_u8
+
+
+_NEON2SSE_GLOBAL poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+#define vld1_dup_p16 vld1_dup_u16
+
+
+//*************************************************************************************
+//********************************* Store **********************************************
+//*************************************************************************************
+// If ptr is 16-byte aligned and you need to store data without cache pollution then use void _mm_stream_si128 ((__m128i*)ptr, val);
+//here we assume that a NOT 16-byte aligned ptr is possible. If it is aligned we could use _mm_store_si128 as shown in the following macro
+#define STORE_SI128(ptr, val) \
+        (((uintptr_t)(ptr) & 15) == 0 ) ? _mm_store_si128 ((__m128i*)(ptr), val) : _mm_storeu_si128 ((__m128i*)(ptr), val);
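+//Usage sketch, mirroring LOAD_SI128 above (hypothetical names):
+//    STORE_SI128(dstPixels, v); //picks _mm_store_si128 when dstPixels is 16-byte aligned, _mm_storeu_si128 otherwise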
+
+_NEON2SSE_GLOBAL void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0]
+#define vst1q_u8 STORE_SI128
+
+_NEON2SSE_GLOBAL void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0]
+#define vst1q_u16 STORE_SI128
+
+_NEON2SSE_GLOBAL void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0]
+#define vst1q_u32 STORE_SI128
+
+_NEON2SSE_GLOBAL void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0]
+#define vst1q_u64 STORE_SI128
+
+_NEON2SSE_GLOBAL void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0]
+#define vst1q_s8 STORE_SI128
+
+_NEON2SSE_GLOBAL void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0]
+#define vst1q_s16 STORE_SI128
+
+_NEON2SSE_GLOBAL void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0]
+#define vst1q_s32 STORE_SI128
+
+_NEON2SSE_GLOBAL void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0]
+#define vst1q_s64 STORE_SI128
+
+_NEON2SSE_GLOBAL void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0]
+// IA32 SIMD doesn't work with 16bit floats currently
+
+_NEON2SSESTORAGE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0]
+_NEON2SSE_INLINE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val)
+{
+    if( ((uintptr_t)(ptr) & 15)  == 0 ) //16-byte aligned
+        _mm_store_ps (ptr, val);
+    else
+        _mm_storeu_ps (ptr, val);
+}
+
+_NEON2SSE_GLOBAL void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0]
+#define vst1q_p8  vst1q_u8
+
+_NEON2SSE_GLOBAL void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0]
+#define vst1q_p16 vst1q_u16
+
+_NEON2SSESTORAGE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0]
+_NEON2SSE_INLINE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val)
+{
+    int i;
+    for (i = 0; i<8; i++) {
+        *(ptr + i) = ((uint8_t*)&val)[i];
+    }
+    //_mm_storel_epi64((__m128i*)ptr, val);
+    return;
+}
+
+_NEON2SSESTORAGE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0]
+_NEON2SSE_INLINE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val)
+{
+    int i;
+    for (i = 0; i<4; i++) {
+        *(ptr + i) = ((uint16_t*)&val)[i];
+    }
+    //_mm_storel_epi64((__m128i*)ptr, val);
+    return;
+}
+
+_NEON2SSESTORAGE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0]
+_NEON2SSE_INLINE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val)
+{
+    int i;
+    for (i = 0; i<2; i++) {
+        *(ptr + i) = ((uint32_t*)&val)[i];
+    }
+    //_mm_storel_epi64((__m128i*)ptr, val);
+    return;
+}
+
+_NEON2SSESTORAGE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0]
+_NEON2SSE_INLINE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val)
+{
+    *(ptr) = *((uint64_t*)&val);
+    //_mm_storel_epi64((__m128i*)ptr, val);
+    return;
+}
+
+_NEON2SSE_GLOBAL void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0]
+#define vst1_s8(ptr,val) vst1_u8((uint8_t*)ptr,val)
+
+_NEON2SSE_GLOBAL void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0]
+#define vst1_s16(ptr,val) vst1_u16((uint16_t*)ptr,val)
+
+_NEON2SSE_GLOBAL void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0]
+#define vst1_s32(ptr,val) vst1_u32((uint32_t*)ptr,val)
+
+_NEON2SSE_GLOBAL void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0]
+#define vst1_s64(ptr,val) vst1_u64((uint64_t*)ptr,val)
+
+_NEON2SSE_GLOBAL void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0]
+//current IA SIMD doesn't support float16
+
+_NEON2SSESTORAGE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0]
+_NEON2SSE_INLINE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val)
+{
+    *(ptr) =   val.m64_f32[0];
+    *(ptr + 1) = val.m64_f32[1];
+    return;
+}
+
+_NEON2SSE_GLOBAL void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0]
+#define vst1_p8 vst1_u8
+
+_NEON2SSE_GLOBAL void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0]
+#define vst1_p16 vst1_u16
+
+//***********Store a lane of a vector into memory (extract given lane) *********************
+//******************************************************************************************
+_NEON2SSE_GLOBAL void vst1q_lane_u8(__transfersize(1) uint8_t * ptr, uint8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
+#define vst1q_lane_u8(ptr, val, lane) *(ptr) = (uint8_t) _MM_EXTRACT_EPI8 (val, lane)
+
+_NEON2SSE_GLOBAL void vst1q_lane_u16(__transfersize(1) uint16_t * ptr, uint16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
+#define vst1q_lane_u16(ptr, val, lane) *(ptr) = (uint16_t) _MM_EXTRACT_EPI16 (val, lane)
+
+_NEON2SSE_GLOBAL void vst1q_lane_u32(__transfersize(1) uint32_t * ptr, uint32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
+#define vst1q_lane_u32(ptr, val, lane) *(ptr) = (uint32_t) _MM_EXTRACT_EPI32 (val, lane)
+
+_NEON2SSE_GLOBAL void vst1q_lane_u64(__transfersize(1) uint64_t * ptr, uint64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0]
+#define vst1q_lane_u64(ptr, val, lane) *(ptr) = (uint64_t) _MM_EXTRACT_EPI64 (val, lane)
+
+_NEON2SSE_GLOBAL void vst1q_lane_s8(__transfersize(1) int8_t * ptr, int8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
+#define vst1q_lane_s8(ptr, val, lane) *(ptr) = (int8_t) _MM_EXTRACT_EPI8 (val, lane)
+
+_NEON2SSE_GLOBAL void vst1q_lane_s16(__transfersize(1) int16_t * ptr, int16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
+#define vst1q_lane_s16(ptr, val, lane) *(ptr) = (int16_t) _MM_EXTRACT_EPI16 (val, lane)
+
+_NEON2SSE_GLOBAL void vst1q_lane_s32(__transfersize(1) int32_t * ptr, int32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
+#define vst1q_lane_s32(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI32 (val, lane)
+
+_NEON2SSE_GLOBAL void vst1q_lane_s64(__transfersize(1) int64_t * ptr, int64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0]
+#define vst1q_lane_s64(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI64 (val, lane)
+
+_NEON2SSE_GLOBAL void vst1q_lane_f16(__transfersize(1) __fp16 * ptr, float16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
+//current IA SIMD doesn't support float16
+
+_NEON2SSESTORAGE void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
+_NEON2SSE_INLINE void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane)
+{
+    *((int32_t*)ptr) = _MM_EXTRACT_PS(val,lane);
+}
+
+_NEON2SSE_GLOBAL void vst1q_lane_p8(__transfersize(1) poly8_t * ptr, poly8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
+#define vst1q_lane_p8   vst1q_lane_u8
+
+_NEON2SSE_GLOBAL void vst1q_lane_p16(__transfersize(1) poly16_t * ptr, poly16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
+#define vst1q_lane_p16   vst1q_lane_s16
+
+_NEON2SSESTORAGE void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
+_NEON2SSE_INLINE void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane)
+{
+    *(ptr) = val.m64_u8[lane];
+}
+
+_NEON2SSESTORAGE void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
+_NEON2SSE_INLINE void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane)
+{
+    *(ptr) = val.m64_u16[lane];
+}
+
+_NEON2SSESTORAGE void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
+_NEON2SSE_INLINE void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane)
+{
+    *(ptr) = val.m64_u32[lane];
+}
+
+_NEON2SSESTORAGE void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0]
+_NEON2SSE_INLINE void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane)
+{
+    UNREFERENCED_PARAMETER(lane);
+    *(ptr) = val.m64_u64[0];
+}
+
+_NEON2SSE_GLOBAL void vst1_lane_s8(__transfersize(1) int8_t * ptr, int8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
+#define  vst1_lane_s8(ptr, val, lane) vst1_lane_u8((uint8_t*)ptr, val, lane)
+
+_NEON2SSE_GLOBAL void vst1_lane_s16(__transfersize(1) int16_t * ptr, int16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
+#define vst1_lane_s16(ptr, val, lane) vst1_lane_u16((uint16_t*)ptr, val, lane)
+
+_NEON2SSE_GLOBAL void vst1_lane_s32(__transfersize(1) int32_t * ptr, int32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
+#define vst1_lane_s32(ptr, val, lane)  vst1_lane_u32((uint32_t*)ptr, val, lane)
+
+
+_NEON2SSE_GLOBAL void vst1_lane_s64(__transfersize(1) int64_t * ptr, int64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0]
+#define vst1_lane_s64(ptr, val, lane) vst1_lane_u64((uint64_t*)ptr, val, lane)
+
+
+_NEON2SSE_GLOBAL void vst1_lane_f16(__transfersize(1) __fp16 * ptr, float16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
+//current IA SIMD doesn't support float16
+
+_NEON2SSESTORAGE void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
+_NEON2SSE_INLINE void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane)
+{
+    *(ptr) = val.m64_f32[lane];
+}
+
+_NEON2SSE_GLOBAL void vst1_lane_p8(__transfersize(1) poly8_t * ptr, poly8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
+#define vst1_lane_p8 vst1_lane_u8
+
+_NEON2SSE_GLOBAL void vst1_lane_p16(__transfersize(1) poly16_t * ptr, poly16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
+#define vst1_lane_p16 vst1_lane_s16
+
+//***********************************************************************************************
+//**************** Loads and stores of an N-element structure **********************************
+//***********************************************************************************************
+//These intrinsics load or store an n-element structure. The array structures are defined in the beginning
+//We assume ptr is NOT aligned in general case, for more details see  "Loads and stores of a single vector functions"
+//****************** 2 elements load  *********************************************
+_NEON2SSESTORAGE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
+_NEON2SSE_INLINE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr) // VLD2.8 {d0, d2}, [r0]
+{
+    uint8x16x2_t v;
+    v.val[0] = vld1q_u8(ptr);
+    v.val[1] = vld1q_u8((ptr + 16));
+    v = vuzpq_s8(v.val[0], v.val[1]);
+    return v;


Commit: 673cb4d659513b38fcfc8afcc3695d317ecf9565
    https://github.com/scummvm/scummvm/commit/673cb4d659513b38fcfc8afcc3695d317ecf9565
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: Optimizations turns off if SSE is not found.

Changed paths:
    engines/ags/globals.cpp
    engines/ags/lib/allegro/surface.cpp
    engines/ags/tests/test_gfx.cpp


diff --git a/engines/ags/globals.cpp b/engines/ags/globals.cpp
index 35388581c0f..3f0e150fc42 100644
--- a/engines/ags/globals.cpp
+++ b/engines/ags/globals.cpp
@@ -103,10 +103,45 @@ namespace AGS3 {
 
 Globals *g_globals;
 
+static bool checkForSIMDExtensions() {
+#if defined(__x86_64__) || defined(__i686__)
+#ifdef __GNUC__
+	int c_ecx, c_edx;
+	asm (".intel_syntax;"
+		 "movq rax,0;"
+		 "cpuid;"
+		 "mov %0,ecx;"
+		 "mov %1,edx;"
+		 ".att_syntax;"
+		 : "=r" (c_ecx), "=r" (c_edx)
+		 : "r"
+		 : "eax", "ebx", "ecx", "edx");
+	return c_edx & (1 << 25); // SSE2 extensions bit
+#elif _MSC_VER
+	int c_ecx, c_edx;
+	__asm
+	{
+		mov rax,0
+		cpuid
+		mov c_ecx,ecx
+		mov c_edx,edx
+	}
+	return c_edx & (1 << 25); // SSE2 extensions bit
+#else
+	return false;
+#endif
+#elif defined(__aarch64__)
+	return true;
+#else
+	return false;
+#endif
+}
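+
+// An alternative sketch (assumption, not what this commit uses): GCC and Clang ship <cpuid.h>,
+// whose __get_cpuid helper avoids hand-written inline assembly:
+//     unsigned int eax, ebx, ecx, edx;
+//     if (__get_cpuid(1, &eax, &ebx, &ecx, &edx))
+//         return (edx & (1 << 26)) != 0; // CPUID leaf 1, EDX bit 26 = SSE2
+//     return false;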
+
 Globals::Globals() {
 	g_globals = this;
 
 	// Allegro globals
+	__bitmap_simd_optimizations = checkForSIMDExtensions();
 	Common::fill((byte *)&_black_palette, (byte *)&_black_palette + PAL_SIZE, 0);
 	Common::fill((byte *)&_current_palette, (byte *)&_current_palette + PAL_SIZE, 0);
 	Common::fill((byte *)&_prev_current_palette, (byte *)&_prev_current_palette + PAL_SIZE, 0);
diff --git a/engines/ags/lib/allegro/surface.cpp b/engines/ags/lib/allegro/surface.cpp
index e2bd162403d..0271adf187f 100644
--- a/engines/ags/lib/allegro/surface.cpp
+++ b/engines/ags/lib/allegro/surface.cpp
@@ -254,34 +254,34 @@ void BITMAP::stretchDraw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
 void BITMAP::blendPixel(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &aDest, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha, bool useTint, byte *destVal) const {
 	switch (_G(_blender_mode)) {
 	case kSourceAlphaBlender:
-		if (!useTint && alpha != 255) format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
+		if (!useTint) format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
 		blendSourceAlpha(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, alpha);
 		break;
 	case kArgbToArgbBlender:
-		if (!useTint && alpha != 255) format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
+		if (!useTint) format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
 		blendArgbToArgb(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, alpha);
 		break;
 	case kArgbToRgbBlender:
-		if (!useTint && alpha != 255) format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
+		if (!useTint) format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
 		blendArgbToRgb(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, alpha);
 		break;
 	case kRgbToArgbBlender:
-		if (!useTint && alpha != 255) format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
+		if (!useTint) format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
 		blendRgbToArgb(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, alpha);
 		break;
 	case kRgbToRgbBlender:
-		if (!useTint && alpha != 255) format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
+		if (!useTint) format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
 		blendRgbToRgb(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, alpha);
 		break;
 	case kAlphaPreservedBlenderMode:
-		if (!useTint && alpha != 255) format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
+		if (!useTint) format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
 		blendPreserveAlpha(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, alpha);
 		break;
 	case kOpaqueBlenderMode:
 		blendOpaque(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, alpha);
 		break;
 	case kAdditiveBlenderMode:
-		if (!useTint && alpha != 255) format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
+		if (!useTint) format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
 		blendAdditiveAlpha(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, alpha);
 		break;
 	case kTintBlenderMode:
diff --git a/engines/ags/tests/test_gfx.cpp b/engines/ags/tests/test_gfx.cpp
index 482d0d2f988..81706f6f17e 100644
--- a/engines/ags/tests/test_gfx.cpp
+++ b/engines/ags/tests/test_gfx.cpp
@@ -46,7 +46,7 @@ using namespace AGS::Shared;
 // Comment this out if you don't want the console to be clogged with info during tests
 #define VERBOSE_TEST_GFX
 
-void Test_GfxSpeed(bool opt) {
+void Test_GfxSpeed(bool opt, int blenderModeStart, int blenderModeEnd) {
 	_G(_bitmap_simd_optimizations) = opt;
 #ifdef VERBOSE_TEST_GFX
 	if (opt) printf("SIMD optimizations: true\n");
@@ -68,7 +68,7 @@ void Test_GfxSpeed(bool opt) {
 	for (int dest = 0; dest < 3; dest++) {
 		for (int gfx = 0; gfx < 3; gfx++) {
 			if (dest == 2 && gfx != 2) continue;
-			for (int mode = 0; mode < sizeof(blenderModes) / sizeof(int); mode++) {
+			for (int mode = blenderModeStart; mode <= blenderModeEnd; mode++) {
 				for (int runs = 0; runs < sizeof(benchRuns)/sizeof(int); runs++) {
 					uint32 start, end;
 					_G(_blender_mode) = (AGS3::BlenderMode)blenderModes[mode];
@@ -240,12 +240,16 @@ void Test_GfxTransparency() {
 }
 
 void Test_Gfx() {
-	//Test_GfxTransparency();
+	Test_GfxTransparency();
 	Test_DrawingLoops();
-	//Test_BlenderModes();
-	// This could take a LONG time depending if you don't have SIMD ISA extentions (NEON or x86 atm)
-	//Test_GfxSpeed(false);
-	//Test_GfxSpeed(true);
+	Test_BlenderModes();
+	// This could take a LONG time
+	bool has_simd = _G(_bitmap_simd_optimizations);
+	Test_GfxSpeed(false, 0, kTintLightBlenderMode);
+	if (has_simd) {
+		Test_GfxSpeed(true, 0, kTintLightBlenderMode);
+	}
+	_G(_bitmap_simd_optimizations) = has_simd;
 }
 
 } // namespace AGS3


Commit: b177d382b69e672d8773ef366732b16525d2a1de
    https://github.com/scummvm/scummvm/commit/b177d382b69e672d8773ef366732b16525d2a1de
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: Moved arm neon bitmap code to new file.

Changed paths:
  A engines/ags/lib/allegro/surface_simd_neon.cpp
  A engines/ags/lib/allegro/surface_simd_neon.h
    engines/ags/globals.cpp
    engines/ags/lib/allegro/surface.cpp
    engines/ags/lib/allegro/surface.h
    engines/ags/module.mk
    engines/ags/tests/test_gfx.cpp


diff --git a/engines/ags/globals.cpp b/engines/ags/globals.cpp
index 3f0e150fc42..7e5ea6047b4 100644
--- a/engines/ags/globals.cpp
+++ b/engines/ags/globals.cpp
@@ -105,7 +105,7 @@ Globals *g_globals;
 
 static bool checkForSIMDExtensions() {
 #if defined(__x86_64__) || defined(__i686__)
-#ifdef __GNUC__
+#  ifdef __GNUC__
 	int c_ecx, c_edx;
 	asm (".intel_syntax;"
 		 "movq rax,0;"
@@ -117,7 +117,7 @@ static bool checkForSIMDExtensions() {
 		 : "r"
 		 : "eax", "ebx", "ecx", "edx");
 	return c_edx & (1 << 25); // SSE2 extensions bit
-#elif _MSC_VER
+#  elif _MSC_VER
 	int c_ecx, c_edx;
 	__asm
 	{
@@ -127,9 +127,9 @@ static bool checkForSIMDExtensions() {
 		mov c_edx,edx
 	}
 	return c_edx & (1 << 25); // SSE2 extensions bit
-#else
+#  else
 	return false;
-#endif
+#  endif
 #elif defined(__aarch64__)
 	return true;
 #else
diff --git a/engines/ags/lib/allegro/surface.cpp b/engines/ags/lib/allegro/surface.cpp
index 0271adf187f..8f0edfdbdd4 100644
--- a/engines/ags/lib/allegro/surface.cpp
+++ b/engines/ags/lib/allegro/surface.cpp
@@ -107,6 +107,116 @@ void BITMAP::floodfill(int x, int y, int color) {
 
 const int SCALE_THRESHOLD = 0x100;
 #define VGA_COLOR_TRANS(x) ((x) * 255 / 63)
+template<int ScaleThreshold>
+void BITMAP::drawInnerGeneric(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
+	const int xDir = horizFlip ? -1 : 1;
+	byte rSrc, gSrc, bSrc, aSrc;
+	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
+	
+	int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = dstRect.width();
+	if (xStart + xCtrWidth > destArea.w) {
+		xCtrWidth = destArea.w - xStart;
+	}
+	if (xStart < 0) {
+		xCtrStart = -xStart;
+		xCtrBppStart = xCtrStart * src.format.bytesPerPixel;
+		xStart = 0;
+	}
+	int destY = yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = dstRect.height();
+	if (yStart < 0) {
+		yCtr = -yStart;
+		destY = 0;
+		if (ScaleThreshold != 0) {
+			scaleYCtr = yCtr * scaleY;
+			srcYCtr = scaleYCtr / ScaleThreshold;
+		}
+	}
+	if (yStart + yCtrHeight > destArea.h) {
+		yCtrHeight = destArea.h - yStart;
+	}
+
+	byte *destP = (byte *)destArea.getBasePtr(0, destY);
+	const byte *srcP = (const byte *)src.getBasePtr(
+	                       horizFlip ? srcArea.right - 1 : srcArea.left,
+	                       vertFlip ? srcArea.bottom - 1 - yCtr :
+	                       srcArea.top + yCtr);
+	for (; yCtr < dstRect.height(); ++destY, ++yCtr, scaleYCtr += scaleY) {
+		if (ScaleThreshold != 0) {
+			int newSrcYCtr = scaleYCtr / ScaleThreshold;
+			if (srcYCtr != newSrcYCtr) {
+				int diffSrcYCtr = newSrcYCtr - srcYCtr;
+				srcP += src.pitch * diffSrcYCtr;
+				srcYCtr = newSrcYCtr;
+			}
+		}
+		// Loop through the pixels of the row
+		for (int destX = xStart, xCtr = xCtrStart, xCtrBpp = xCtrBppStart, scaleXCtr = xCtr * scaleX; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += src.format.bytesPerPixel, scaleXCtr += scaleX) {
+			const byte *srcVal = srcP + xDir * xCtrBpp;
+			if (ScaleThreshold != 0) {
+				srcVal = srcP + (scaleXCtr / ScaleThreshold) * src.format.bytesPerPixel;
+			}
+			uint32 srcCol = getColor(srcVal, src.format.bytesPerPixel);
+
+			// Check if this is a transparent color we should skip
+			if (skipTrans && ((srcCol & alphaMask) == transColor))
+				continue;
+
+			byte *destVal = (byte *)&destP[destX * format.bytesPerPixel];
+
+			// When blitting to the same format we can just copy the color
+			if (format.bytesPerPixel == 1) {
+				*destVal = srcCol;
+				continue;
+			} else if (sameFormat && srcAlpha == -1) {
+				if (format.bytesPerPixel == 4)
+					*(uint32 *)destVal = srcCol;
+				else
+					*(uint16 *)destVal = srcCol;
+				continue;
+			}
+
+			// We need the rgb values to do blending and/or convert between formats
+			if (src.format.bytesPerPixel == 1) {
+				const RGB &rgb = palette[srcCol];
+				aSrc = 0xff;
+				rSrc = rgb.r;
+				gSrc = rgb.g;
+				bSrc = rgb.b;
+			} else {
+				src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
+			}
+
+			if (srcAlpha == -1) {
+				// This means we don't use blending.
+				aDest = aSrc;
+				rDest = rSrc;
+				gDest = gSrc;
+				bDest = bSrc;
+			} else {
+				if (useTint) {
+					rDest = rSrc;
+					gDest = gSrc;
+					bDest = bSrc;
+					aDest = aSrc;
+					rSrc = tintRed;
+					gSrc = tintGreen;
+					bSrc = tintBlue;
+					aSrc = srcAlpha;
+				}
+				blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, srcAlpha, useTint, destVal);
+			}
+
+			uint32 pixel = format.ARGBToColor(aDest, rDest, gDest, bDest);
+			if (format.bytesPerPixel == 4)
+				*(uint32 *)destVal = pixel;
+			else
+				*(uint16 *)destVal = pixel;
+		}
+
+		destP += destArea.pitch;
+		if (ScaleThreshold == 0) srcP += vertFlip ? -src.pitch : src.pitch;
+	}
+}
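+// Scaling note (inferred from SCALE_THRESHOLD = 0x100 above): scaleX/scaleY appear to be 8.8
+// fixed-point step values; each destination pixel advances scaleXCtr by scaleX and samples the
+// source at scaleXCtr / ScaleThreshold. For example, a 2x horizontal stretch presumably uses
+// scaleX = 0x80, so destination pixels 0,1,2,3 read source pixels 0,0,1,1.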
 
 void BITMAP::draw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
                   int dstX, int dstY, bool horizFlip, bool vertFlip,
@@ -293,98 +403,6 @@ void BITMAP::blendPixel(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &a
 	}
 }
 
-uint32x4_t BITMAP::blendPixelSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas) const {
-	uint32x4_t srcAlphas, difAlphas, mask, ch1, ch2;
-	auto setupArgbAlphas = [&]() {
-		srcAlphas = vshrq_n_u32(srcCols, 24);
-		difAlphas = vaddq_u32(vandq_u32(alphas, vmovq_n_u32(0xff)), vmovq_n_u32(1));
-		difAlphas = vshrq_n_u32(vmulq_u32(srcAlphas, difAlphas), 8);
-		difAlphas = vshlq_n_u32(difAlphas, 24);
-		srcAlphas = vshlq_n_u32(srcAlphas, 24);
-		mask = vceqq_u32(alphas, vmovq_n_u32(0));
-		srcAlphas = vandq_u32(srcAlphas, mask);
-		difAlphas = vandq_u32(difAlphas, vmvnq_u32(mask));
-		srcCols = vandq_u32(srcCols, vmovq_n_u32(0x00ffffff));
-		srcCols = vorrq_u32(srcCols, vorrq_u32(srcAlphas, difAlphas));
-	};
-	switch (_G(_blender_mode)) {
-	case kSourceAlphaBlender:
-		alphas = vshrq_n_u32(srcCols, 24);
-		return rgbBlendSIMD(srcCols, destCols, alphas, false);
-	case kArgbToArgbBlender:
-		setupArgbAlphas();
-		mask = vcgtq_u32(vshrq_n_u32(srcCols, 24), vmovq_n_u32(0));
-		ch1 = vandq_u32(argbBlendSIMD(srcCols, destCols), mask);
-		ch2 = vandq_u32(destCols, vmvnq_u32(mask));
-		return vorrq_u32(ch1, ch2);
-	case kArgbToRgbBlender:
-		setupArgbAlphas();
-		return rgbBlendSIMD(srcCols, destCols, vshrq_n_u32(srcCols, 24), false);
-		//mask = vcgtq_u32(vshrq_n_u32(srcCols, 24), vmovq_n_u32(0));
-		//ch1 = vandq_u32(argbBlendSIMD(srcCols, destCols), mask);
-		//ch2 = vandq_u32(destCols, vmvnq_u32(mask));
-		//return vandq_u32(vorrq_u32(ch1, ch2), vmovq_n_u32(0x00ffffff));
-	case kRgbToArgbBlender:
-		ch2 = vandq_u32(srcCols, vmovq_n_u32(0x00ffffff));
-		ch2 = vorrq_u32(ch2, vshlq_n_u32(alphas, 24));
-		ch2 = argbBlendSIMD(ch2, destCols);
-		ch1 = vorrq_u32(srcCols, vmovq_n_u32(0xff000000));
-		mask = vorrq_u32(vceqq_u32(alphas, vmovq_n_u32(0)), vceqq_u32(alphas, vmovq_n_u32(0xff)));
-		ch1 = vandq_u32(ch1, mask);
-		ch2 = vandq_u32(ch2, vmvnq_u32(mask));
-		return vorrq_u32(ch1, ch2);
-	case kRgbToRgbBlender:
-		return rgbBlendSIMD(srcCols, destCols, alphas, false);
-	case kAlphaPreservedBlenderMode:
-		return rgbBlendSIMD(srcCols, destCols, alphas, true);
-	case kOpaqueBlenderMode:
-		return vorrq_u32(srcCols, vmovq_n_u32(0xff000000));
-	case kAdditiveBlenderMode:
-		srcAlphas = vaddq_u32(vshrq_n_u32(srcCols, 24), vshrq_n_u32(destCols, 24));
-		srcAlphas = vminq_u32(srcAlphas, vmovq_n_u32(0xff));
-		srcCols = vandq_u32(srcCols, vmovq_n_u32(0x00ffffff));
-		return vorrq_u32(srcCols, vshlq_n_u32(srcAlphas, 24));
-	case kTintBlenderMode:
-		return blendTintSpriteSIMD(srcCols, destCols, alphas, false);
-	case kTintLightBlenderMode:
-		return blendTintSpriteSIMD(srcCols, destCols, alphas, true);
-	}
-}
-
-uint16x8_t BITMAP::blendPixelSIMD2Bpp(uint16x8_t srcCols, uint16x8_t destCols, uint16x8_t alphas) const {
-	uint16x8_t mask, ch1, ch2;
-	switch (_G(_blender_mode)) {
-	case kSourceAlphaBlender:
-	case kOpaqueBlenderMode:
-	case kAdditiveBlenderMode:
-		return srcCols;
-	case kArgbToArgbBlender:
-	case kArgbToRgbBlender:
-		ch1 = vandq_u16(vmovq_n_u16(0xff), vceqq_u16(alphas, vmovq_n_u16(0)));
-		ch2 = vandq_u16(alphas, vcgtq_u16(alphas, vmovq_n_u16(0)));
-		alphas = vorrq_u16(ch1, ch2);
-	case kRgbToRgbBlender:
-	case kAlphaPreservedBlenderMode:
-		return rgbBlendSIMD2Bpp(srcCols, destCols, alphas);
-	case kRgbToArgbBlender:
-		mask = vorrq_u32(vceqq_u32(alphas, vmovq_n_u32(0)), vceqq_u32(alphas, vmovq_n_u32(255)));
-		ch1 = vandq_u32(srcCols, mask);
-		ch2 = vandq_u32(rgbBlendSIMD2Bpp(srcCols, destCols, alphas), vmvnq_u32(mask));
-		return vorrq_u32(ch1, ch2);
-	case kTintBlenderMode:
-	case kTintLightBlenderMode:
-		uint32x4_t srcColsLo = simd2BppTo4Bpp(vget_low_u16(srcCols));
-		uint32x4_t srcColsHi = simd2BppTo4Bpp(vget_high_u16(srcCols));
-		uint32x4_t destColsLo = simd2BppTo4Bpp(vget_low_u16(destCols));
-		uint32x4_t destColsHi = simd2BppTo4Bpp(vget_high_u16(destCols));
-		uint32x4_t alphasLo = vmovl_u16(vget_low_u16(alphas));
-		uint32x4_t alphasHi = vmovl_u16(vget_high_u16(alphas));
-		uint16x4_t lo = simd4BppTo2Bpp(blendTintSpriteSIMD(srcColsLo, destColsLo, alphasLo, _G(_blender_mode) == kTintLightBlenderMode));
-		uint16x4_t hi = simd4BppTo2Bpp(blendTintSpriteSIMD(srcColsHi, destColsHi, alphasHi, _G(_blender_mode) == kTintLightBlenderMode));
-		return vcombine_u16(lo, hi);
-	}
-}
-
 void BITMAP::blendTintSprite(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &aDest, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha, bool light) const {
 	// Used from draw_lit_sprite after set_blender_mode(kTintBlenderMode or kTintLightBlenderMode)
 	// Original blender function: _myblender_color32 and _myblender_color32_light
@@ -407,88 +425,6 @@ void BITMAP::blendTintSprite(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uin
 	// Preserve value in aDest
 }
 
-uint32x4_t BITMAP::blendTintSpriteSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas, bool light) const {
-	// This function is NOT 1 to 1 with the original... It just approximates it
-	// It gets the value of the dest color
-	// Then it gets the h and s of the srcCols
-
-	// srcCols[0] = A | R | G | B
-	// srcCols[1] = A | R | G | B
-	// srcCols[2] = A | R | G | B
-	// srcCols[3] = A | R | G | B
-	//  ->
-	// dda = { A[0], A[1], A[2], A[3] }
-	// ddr = { R[0], R[1], R[2], R[3] }
-	// ddg = { G[0], G[1], G[2], G[3] }
-	// ddb = { B[0], B[1], B[2], B[3] }
-
-	float32x4_t ddr, ddg, ddb;
-	ddr = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(vshrq_n_u32(destCols, 16), vmovq_n_u32(0xff))), 1.0 / 255.0);
-	ddg = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(vshrq_n_u32(destCols, 8), vmovq_n_u32(0xff))), 1.0 / 255.0);
-	ddb = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(destCols, vmovq_n_u32(0xff))), 1.0 / 255.0);
-	float32x4_t ssr, ssg, ssb;
-	ssr = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(vshrq_n_u32(srcCols, 16), vmovq_n_u32(0xff))), 1.0 / 255.0);
-	ssg = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(vshrq_n_u32(srcCols, 8), vmovq_n_u32(0xff))), 1.0 / 255.0);
-	ssb = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(srcCols, vmovq_n_u32(0xff))), 1.0 / 255.0);
-	float32x4_t dmaxes = vmaxq_f32(ddr, vmaxq_f32(ddg, ddb));
-	float32x4_t smaxes = vmaxq_f32(ssr, vmaxq_f32(ssg, ssb));
-	//float32x4_t dmins = vminq_f32(ddr, vminq_f32(ddg, ddb));
-	float32x4_t smins = vminq_f32(ssr, vminq_f32(ssg, ssb));
-	//float32x4_t ddelta = vsubq_f32(dmaxes, dmins);
-	
-	const float32x4_t eplison0 = vmovq_n_f32(0.0000001);
-	float32x4_t chroma = vmaxq_f32(vsubq_f32(smaxes, smins), eplison0);
-	float32x4_t hr, hg, hb, hue;
-	hr = vdivq_f32(vsubq_f32(ssg, ssb), chroma);
-	hr = vsubq_f32(hr, vmulq_n_f32(vrndmq_f32(vmulq_n_f32(hr, 1.0 / 6.0)), 6.0));
-	hg = vaddq_f32(vdivq_f32(vsubq_f32(ssb, ssr), chroma), vmovq_n_f32(2.0));
-	hb = vaddq_f32(vdivq_f32(vsubq_f32(ssr, ssg), chroma), vmovq_n_f32(4.0));
-	float32x4_t hrfactors = vcvtq_f32_u32(vandq_u32(vandq_u32(vceqq_f32(ssr, smaxes), vmvnq_u32(vceqq_u32(ssr, ssb))), vmovq_n_u32(1)));
-	float32x4_t hgfactors = vcvtq_f32_u32(vandq_u32(vandq_u32(vceqq_f32(ssg, smaxes), vmvnq_u32(vceqq_u32(ssg, ssr))), vmovq_n_u32(1)));
-	float32x4_t hbfactors = vcvtq_f32_u32(vandq_u32(vandq_u32(vceqq_f32(ssb, smaxes), vmvnq_u32(vceqq_u32(ssb, ssg))), vmovq_n_u32(1)));
-	hue = vmulq_f32(hr, hrfactors);
-	hue = vaddq_f32(hue, vmulq_f32(hg, hgfactors));
-	hue = vaddq_f32(hue, vmulq_f32(hb, hbfactors));
-	//float32x4_t hchromaZeroMask = vcvtq_f32_u32(vandq_u32(vcleq_f32(chroma, eplison0), vmovq_n_u32(1)));
-	//hue = vmulq_f32(hue, hchromaZeroMask);
-
-	// Mess with the light
-	float32x4_t val = dmaxes;
-	if (light) {
-		val = vsubq_f32(val, vsubq_f32(vmovq_n_f32(1.0), vmulq_n_f32(vcvtq_f32_u32(alphas), 1.0 / 250.0)));
-		val = vmaxq_f32(val, vmovq_n_f32(0.0));
-	}
-		
-	// then it stiches them back together
-	//AGS3::Shared::Debug::Printf(AGS3::Shared::kDbgMsg_Info, "hues: %f", vgetq_lane_f32(hue, 0));
-	chroma = vmulq_f32(val, vdivq_f32(vsubq_f32(smaxes, smins), vaddq_f32(smaxes, eplison0)));
-	float32x4_t hprime_mod2 = vmulq_n_f32(hue, 1.0 / 2.0);
-	hprime_mod2 = vmulq_n_f32(vsubq_f32(hprime_mod2, vrndmq_f32(hprime_mod2)), 2.0);
-	float32x4_t x = vmulq_f32(chroma, vsubq_f32(vmovq_n_f32(1.0), vabsq_f32(vsubq_f32(hprime_mod2, vmovq_n_f32(1.0)))));
-	uint32x4_t hprime_rounded = vcvtq_u32_f32(hue);
-	uint32x4_t x_int = vcvtq_u32_f32(vmulq_n_f32(x, 255.0));
-	uint32x4_t c_int = vcvtq_u32_f32(vmulq_n_f32(chroma, 255.0));
-
-	uint32x4_t val0 = vorrq_u32(vshlq_n_u32(x_int, 8), vshlq_n_u32(c_int, 16));
-	val0 = vandq_u32(val0, vorrq_u32(vceqq_u32(hprime_rounded, vmovq_n_u32(0)), vceqq_u32(hprime_rounded, vmovq_n_u32(6))));
-	uint32x4_t val1 = vorrq_u32(vshlq_n_u32(c_int, 8), vshlq_n_u32(x_int, 16));
-	val1 = vandq_u32(val1, vceqq_u32(hprime_rounded, vmovq_n_u32(1)));
-	uint32x4_t val2 = vorrq_u32(vshlq_n_u32(c_int, 8), x_int);
-	val2 = vandq_u32(val2, vceqq_u32(hprime_rounded, vmovq_n_u32(2)));
-	uint32x4_t val3 = vorrq_u32(vshlq_n_u32(x_int, 8), c_int);
-	val3 = vandq_u32(val3, vceqq_u32(hprime_rounded, vmovq_n_u32(3)));
-	uint32x4_t val4 = vorrq_u32(vshlq_n_u32(x_int, 16), c_int);
-	val4 = vandq_u32(val4, vceqq_u32(hprime_rounded, vmovq_n_u32(4)));
-	uint32x4_t val5 = vorrq_u32(vshlq_n_u32(c_int, 16), x_int);
-	val5 = vandq_u32(val5, vceqq_u32(hprime_rounded, vmovq_n_u32(5)));
-
-	uint32x4_t final = vorrq_u32(val0, vorrq_u32(val1, vorrq_u32(val2, vorrq_u32(val3, vorrq_u32(val4, val5)))));
-	uint32x4_t val_add = vcvtq_u32_f32(vmulq_n_f32(vsubq_f32(val, chroma), 255.0));
-	val_add = vorrq_u32(val_add, vorrq_u32(vshlq_n_u32(val_add, 8), vorrq_u32(vshlq_n_u32(val_add, 16), vandq_u32(destCols, vmovq_n_u32(0xff000000)))));
-	final = vaddq_u32(final, val_add);
-	return final;
-}
-
 /*-------------------------------------------------------------------*/
 
 /**
diff --git a/engines/ags/lib/allegro/surface.h b/engines/ags/lib/allegro/surface.h
index 53150637946..270c21a1c16 100644
--- a/engines/ags/lib/allegro/surface.h
+++ b/engines/ags/lib/allegro/surface.h
@@ -29,7 +29,7 @@
 
 #if defined(__aarch64__)
 // All 64 bit arm v8 or whatevers come with neon extensions, no need to check
-#include "arm_neon.h"
+#include <arm_neon.h>
 #elif defined(__x86_64__) || defined(__i686__)
 // Most x86 based processors come with sse2, (which is what intels header has here), but it can use sse4
 // SSE2 support is still checked for at runtime
@@ -141,8 +141,8 @@ public:
 	// when x is the sprite color, y the destination color, and n an alpha value
 
 	void blendPixel(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &aDest, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha, bool useTint, byte *destVal) const;
-	uint32x4_t blendPixelSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas) const;
-	uint16x8_t blendPixelSIMD2Bpp(uint16x8_t srcCols, uint16x8_t destCols, uint16x8_t alphas) const;
+	//uint32x4_t blendPixelSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas) const;
+	//uint16x8_t blendPixelSIMD2Bpp(uint16x8_t srcCols, uint16x8_t destCols, uint16x8_t alphas) const;
 
 	inline void rgbBlend(uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha) const {
 		// Note: the original's handling varies slightly for R & B vs G.
@@ -164,74 +164,6 @@ public:
 		bDest = res & 0xff;
 	}
 
-	inline uint16x8_t rgbBlendSIMD2Bpp(uint16x8_t srcCols, uint16x8_t destCols, uint16x8_t alphas) const {
-		alphas = vaddq_u16(alphas, vandq_u16(vceqq_u16(alphas, vmovq_n_u16(0)), vmovq_n_u16(1)));
-		uint16x8_t srcComps[] = {
-			vandq_u16(srcCols, vmovq_n_u16(0x1f)),
-			vandq_u16(vshrq_n_u16(srcCols, 5), vmovq_n_u16(0x3f)),
-			vshrq_n_u16(srcCols, 11),
-		}, destComps[] = {
-			vandq_u16(destCols, vmovq_n_u16(0x1f)),
-			vandq_u16(vshrq_n_u16(destCols, 5), vmovq_n_u16(0x3f)),
-			vshrq_n_u16(destCols, 11),
-		};
-		//srcComps[0] = vorrq_u16(vshlq_n_u16(srcComps[0], 3), vshrq_n_u16(srcComps[0], 2));
-		//srcComps[1] = vorrq_u16(vshlq_n_u16(srcComps[1], 2), vshrq_n_u16(srcComps[1], 4));
-		//srcComps[2] = vorrq_u16(vshlq_n_u16(srcComps[2], 3), vshrq_n_u16(srcComps[2], 2));
-		//destComps[0] = vorrq_u16(vshlq_n_u16(destComps[0], 3), vshrq_n_u16(destComps[0], 2));
-		//destComps[1] = vorrq_u16(vshlq_n_u16(destComps[1], 2), vshrq_n_u16(destComps[1], 4));
-		//destComps[2] = vorrq_u16(vshlq_n_u16(destComps[2], 3), vshrq_n_u16(destComps[2], 2));
-
-		uint16x8_t diffs[] = {
-			vsubq_u16(srcComps[0], destComps[0]), // B
-			vsubq_u16(srcComps[1], destComps[1]), // G
-			vsubq_u16(srcComps[2], destComps[2]), // R
-		};
-		alphas = vshrq_n_u16(alphas, 2);
-		diffs[1] = vshrq_n_u16(vmulq_u16(diffs[1], alphas), 6);
-		alphas = vshrq_n_u16(alphas, 1);
-		diffs[0] = vshrq_n_u16(vmulq_u16(diffs[0], alphas), 5);
-		diffs[2] = vshrq_n_u16(vmulq_u16(diffs[2], alphas), 5);
-
-		//diffs[0] = vandq_u16(vshrq_n_u16(vaddq_u16(diffs[0], destComps[0]), 3), vmovq_n_u16(0x1f));
-		//diffs[1] = vandq_u16(vshrq_n_u16(vaddq_u16(diffs[1], destComps[1]), 2), vmovq_n_u16(0x3f));
-		//diffs[2] = vandq_u16(vshrq_n_u16(vaddq_u16(diffs[2], destComps[2]), 3), vmovq_n_u16(0x1f));
-
-		diffs[0] = vandq_u16(vaddq_u16(diffs[0], destComps[0]), vmovq_n_u16(0x1f));
-		diffs[1] = vandq_u16(vaddq_u16(diffs[1], destComps[1]), vmovq_n_u16(0x3f));
-		diffs[2] = vandq_u16(vaddq_u16(diffs[2], destComps[2]), vmovq_n_u16(0x1f));
-		diffs[0] = vorrq_u16(diffs[0], vshlq_n_u16(diffs[1], 5));
-		return vorrq_u16(diffs[0], vshlq_n_u16(diffs[2], 11));
-	}
-
-	inline uint32x4_t rgbBlendSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas, bool preserveAlpha) const {
-		alphas = vaddq_u32(alphas, vandq_u32(vcgtq_u32(alphas, vmovq_n_u32(0)), vmovq_n_u32(1)));
-		uint32x4_t alpha = vandq_u32(destCols, vmovq_n_u32(0xff000000));
-		uint32x4_t srcColsCopy = srcCols;
-		srcColsCopy = vandq_u32(srcColsCopy, vmovq_n_u32(0xff00ff));
-		uint32x4_t destColsCopy = destCols;
-		destColsCopy = vandq_u32(destColsCopy, vmovq_n_u32(0xff00ff));
-		srcColsCopy = vsubq_u32(srcColsCopy, destColsCopy);
-		srcColsCopy = vmulq_u32(srcColsCopy, alphas);
-		srcColsCopy = vshrq_n_u32(srcColsCopy, 8);
-		srcColsCopy = vaddq_u32(srcColsCopy, destCols);
-
-		srcCols = vandq_u32(srcCols, vmovq_n_u32(0xff00));
-		destCols = vandq_u32(destCols, vmovq_n_u32(0xff00));
-		srcCols = vsubq_u32(srcCols, destCols);
-		srcCols = vmulq_u32(srcCols, alphas);
-		srcCols = vshrq_n_u32(srcCols, 8);
-		srcCols = vaddq_u32(srcCols, destCols);
-		srcColsCopy = vandq_u32(srcColsCopy, vmovq_n_u32(0xff00ff));
-		srcCols = vandq_u32(srcCols, vmovq_n_u32(0xff00));
-		srcCols = vorrq_u32(srcCols, srcColsCopy);
-		if (preserveAlpha) {
-			srcCols = vandq_u32(srcCols, vmovq_n_u32(0x00ffffff));
-			srcCols = vorrq_u32(srcCols, alpha);
-		}
-		return srcCols;
-	}
-
 	inline void argbBlend(uint32 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &aDest, uint8 &rDest, uint8 &gDest, uint8 &bDest) const {
 		// Original logic has uint32 src and dst colors as ARGB8888
 		// ++src_alpha;
@@ -256,39 +188,6 @@ public:
 		aDest = static_cast<uint8>(255. * (sAlpha + dAlpha));
 	}
 
-	inline uint32x4_t argbBlendSIMD(uint32x4_t srcCols, uint32x4_t destCols) const {
-		float16x4_t sAlphas = vcvt_f16_f32(vcvtq_f32_u32(vshrq_n_u32(srcCols, 24)));
-		sAlphas = vmul_n_f16(sAlphas, 1.0 / 255.0);
-		float16x8_t sAlphas1 = vcombine_f16(vmov_n_f16(vduph_lane_f16(sAlphas, 0)), vmov_n_f16(vduph_lane_f16(sAlphas, 1)));
-		float16x8_t sAlphas2 = vcombine_f16(vmov_n_f16(vduph_lane_f16(sAlphas, 2)), vmov_n_f16(vduph_lane_f16(sAlphas, 3)));
-		float16x4_t dAlphas = vcvt_f16_f32(vcvtq_f32_u32(vshrq_n_u32(destCols, 24)));
-		dAlphas = vmul_n_f16(dAlphas, 1.0 / 255.0);
-		dAlphas = vmul_f16(dAlphas, vsub_f16(vmov_n_f16(1.0), sAlphas));
-		float16x8_t dAlphas1 = vcombine_f16(vmov_n_f16(vduph_lane_f16(dAlphas, 0)), vmov_n_f16(vduph_lane_f16(dAlphas, 1)));
-		float16x8_t dAlphas2 = vcombine_f16(vmov_n_f16(vduph_lane_f16(dAlphas, 2)), vmov_n_f16(vduph_lane_f16(dAlphas, 3)));
-		float16x8_t srcRgb1 = vcvtq_f16_u16(vmovl_u8(vreinterpret_u8_u32(vget_low_u32(srcCols))));
-		float16x8_t destRgb1 = vcvtq_f16_u16(vmovl_u8(vreinterpret_u8_u32(vget_low_u32(destCols))));
-		float16x8_t srcRgb2 = vcvtq_f16_u16(vmovl_u8(vreinterpret_u8_u32(vget_high_u32(srcCols))));
-		float16x8_t destRgb2 = vcvtq_f16_u16(vmovl_u8(vreinterpret_u8_u32(vget_high_u32(destCols))));
-		srcRgb1 = vmulq_f16(srcRgb1, sAlphas1);
-		destRgb1 = vmulq_f16(destRgb1, dAlphas1);
-		srcRgb1 = vaddq_f16(srcRgb1, destRgb1);
-		float16x8_t alphasRec = vrecpeq_f16(vaddq_f16(sAlphas1, dAlphas1));
-		srcRgb1 = vmulq_f16(srcRgb1, alphasRec);
-		srcRgb2 = vmulq_f16(srcRgb2, sAlphas2);
-		destRgb2 = vmulq_f16(destRgb2, dAlphas2);
-		srcRgb2 = vaddq_f16(srcRgb2, destRgb2);
-		alphasRec = vrecpeq_f16(vaddq_f16(sAlphas2, dAlphas2));
-		srcRgb2 = vmulq_f16(srcRgb2, alphasRec);
-		uint16x4_t alphas = vcvta_u16_f16(vmul_n_f16(vadd_f16(sAlphas, dAlphas), 255.0));
-		uint16x8_t uintSrcRgb1 = vcvtq_u16_f16(srcRgb1), uintSrcRgb2 = vcvtq_u16_f16(srcRgb2);
-		uintSrcRgb1 = vcopyq_lane_u16(uintSrcRgb1, 3, alphas, 0);
-		uintSrcRgb1 = vcopyq_lane_u16(uintSrcRgb1, 7, alphas, 1);
-		uintSrcRgb2 = vcopyq_lane_u16(uintSrcRgb2, 3, alphas, 2);
-		uintSrcRgb2 = vcopyq_lane_u16(uintSrcRgb2, 7, alphas, 3);
-		return vcombine_u32(vreinterpret_u32_u8(vmovn_u16(uintSrcRgb1)), vreinterpret_u32_u8(vmovn_u16(uintSrcRgb2)));
-	}
-
 	// kRgbToRgbBlender
 	inline void blendRgbToRgb(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &aDest, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha) const {
 		// Default mode for set_trans_blender
@@ -377,572 +276,15 @@ public:
 	// kTintBlenderMode and kTintLightBlenderMode
 	void blendTintSprite(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &aDest, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha, bool light) const;
 
-	// kTintBlenderMode and kTintLightBlenderMode for SIMD
-	uint32x4_t blendTintSpriteSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas, bool light) const;
-
-	inline uint32x4_t simd2BppTo4Bpp(uint16x4_t pixels) const {
-		uint32x4_t x = vmovl_u16(pixels);
-		uint32x4_t c = vshrq_n_u32(x, 11);
-		uint32x4_t r = vshlq_n_u32(vorrq_u32(vshlq_n_u32(c, 3), vshrq_n_u32(c, 2)), 16);
-		c = vshrq_n_u32(vandq_u32(x, vmovq_n_u32(0x07e0)), 5);
-		uint32x4_t g = vshlq_n_u32(vorrq_u32(vshlq_n_u32(c, 2), vshrq_n_u32(c, 4)), 8);
-		c = vandq_u32(x, vmovq_n_u32(0x001f));
-		uint32x4_t b = vorrq_u32(vshlq_n_u32(c, 3), vshrq_n_u32(c, 2));
-		return vorrq_u32(vorrq_u32(vorrq_u32(r, g), b), vmovq_n_u32(0xff000000));
-	}
-
-	inline uint16x4_t simd4BppTo2Bpp(uint32x4_t pixels) const {
-		uint32x4_t x = vshrq_n_u32(vandq_u32(pixels, vmovq_n_u32(0x000000ff)), 3);
-		x = vorrq_u32(x, vshlq_n_u32(vshrq_n_u32(vandq_u32(pixels, vmovq_n_u32(0x0000ff00)), 8+2), 5));
-		x = vorrq_u32(x, vshlq_n_u32(vshrq_n_u32(vandq_u32(pixels, vmovq_n_u32(0x00ff0000)), 16+3), 11));
-		return vmovn_u32(x);
-	}
-
-	template<int DestBytesPerPixel, int SrcBytesPerPixel>
-	inline void drawPixelSIMD(byte *destPtr, const byte *srcP2, uint32x4_t tint, uint32x4_t alphas, uint32x4_t maskedAlphas, uint32x4_t transColors, int xDir, int xCtrBpp, int srcAlpha, int skipTrans, bool horizFlip, bool useTint, uint32x4_t skipMask) {
-		uint32x4_t srcCols, destCol;
-		if (SrcBytesPerPixel == 4) {
-			destCol = vld1q_u32((uint32 *)destPtr);
-			srcCols = vld1q_u32((const uint32 *)(srcP2 + xDir * xCtrBpp));
-		} else {
-			destCol = simd2BppTo4Bpp(vld1_u16((uint16 *)destPtr));
-			srcCols = simd2BppTo4Bpp(vld1_u16((const uint16 *)(srcP2 + xDir * xCtrBpp)));
-		}
-		uint32x4_t anded = vandq_u32(srcCols, maskedAlphas);
-		if (srcAlpha != -1) {
-			// take into account for useTint
-			if (useTint) {
-				srcCols = blendPixelSIMD(tint, srcCols, alphas);
-			} else {
-				srcCols = blendPixelSIMD(srcCols, destCol, alphas);
-			}
-		}
-		uint32x4_t mask1 = skipTrans ? vceqq_u32(anded, transColors) : vmovq_n_u32(0);
-		mask1 = vorrq_u32(mask1, skipMask);
-		uint32x4_t destCols2 = vandq_u32(destCol, mask1);
-		uint32x4_t srcCols2 = vandq_u32(srcCols, vmvnq_u32(mask1));
-		uint32x4_t final = vorrq_u32(destCols2, srcCols2);
-		if (horizFlip) {
-			final = vrev64q_u32(final);
-			final = vcombine_u32(vget_high_u32(final), vget_low_u32(final));
-		}
-		if (DestBytesPerPixel == 4) {
-			vst1q_u32((uint32 *)destPtr, final);
-		} else {
-			vst1_u16((uint16 *)destPtr, simd4BppTo2Bpp(final));
-		}
-	}
-
-	inline void drawPixelSIMD2Bpp(byte *destPtr, const byte *srcP2, uint16x8_t tint, uint16x8_t alphas, uint16x8_t transColors, int xDir, int xCtrBpp, int srcAlpha, int skipTrans, bool horizFlip, bool useTint, uint16x8_t skipMask) {
-		uint16x8_t destCol = vld1q_u16((uint16 *)destPtr);
-		uint16x8_t srcCols = vld1q_u16((const uint16 *)(srcP2 + xDir * xCtrBpp));
-		if (srcAlpha != -1) {
-			// take into account for useTint
-			if (useTint) {
-				srcCols = blendPixelSIMD2Bpp(tint, srcCols, alphas);
-			} else {
-				srcCols = blendPixelSIMD2Bpp(srcCols, destCol, alphas);
-			}
-		}
-		uint16x8_t mask1 = skipTrans ? vceqq_u16(srcCols, transColors) : vmovq_n_u16(0);
-		mask1 = vorrq_u16(mask1, skipMask);
-		uint16x8_t destCols2 = vandq_u16(destCol, mask1);
-		uint16x8_t srcCols2 = vandq_u16(srcCols, vmvnq_u16(mask1));
-		uint16x8_t final = vorrq_u16(destCols2, srcCols2);
-		if (horizFlip) {
-			final = vrev64q_u16(final);
-			final = vcombine_u16(vget_high_u16(final), vget_low_u16(final));
-		}
-		vst1q_u16((uint16 *)destPtr, final);
-	}
-
-	// This template handles 2bpp and 4bpp, the other specializations handle 1bpp and format conversion blits
 	template<int DestBytesPerPixel, int SrcBytesPerPixel, int ScaleThreshold>
-	void drawInner4BppWithConv(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
-		const int xDir = horizFlip ? -1 : 1;
-		byte rSrc, gSrc, bSrc, aSrc;
-		byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
-		uint32x4_t tint = vshlq_n_u32(vdupq_n_u32(srcAlpha), 24);
-		tint = vorrq_u32(tint, vshlq_n_u32(vdupq_n_u32(tintRed), 16));
-		tint = vorrq_u32(tint, vshlq_n_u32(vdupq_n_u32(tintGreen), 8));
-		tint = vorrq_u32(tint, vdupq_n_u32(tintBlue));
-		uint32x4_t maskedAlphas = vld1q_dup_u32(&alphaMask);
-		uint32x4_t transColors = vld1q_dup_u32(&transColor);
-		uint32x4_t alphas = vld1q_dup_u32(&srcAlpha);
-		uint32x4_t addIndexes = {0, 1, 2, 3};
-		if (horizFlip) addIndexes = {3, 2, 1, 0};
-		uint32x4_t scaleAdds = {0, (uint32)scaleX, (uint32)scaleX*2, (uint32)scaleX*3};
-		
-		int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = dstRect.width();
-		if (xStart + xCtrWidth > destArea.w) {
-			xCtrWidth = destArea.w - xStart;
-		}
-		if (xStart < 0) {
-			xCtrStart = -xStart;
-			xCtrBppStart = xCtrStart * SrcBytesPerPixel;
-			xStart = 0;
-		}
-		int destY = yStart, srcYCtr = 0, yCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 4 == 0) ? dstRect.height() : (dstRect.height() - 1);
-		if (ScaleThreshold != 0) yCtrHeight = dstRect.height();
-		if (yStart < 0) {
-			yCtr = -yStart;
-			destY = 0;
-			if (ScaleThreshold != 0) {
-				scaleYCtr = yCtr * scaleY;
-				srcYCtr = scaleYCtr / ScaleThreshold;
-			}
-		}
-		if (yStart + yCtrHeight > destArea.h) {
-			yCtrHeight = destArea.h - yStart;
-		}
-		
-		byte *destP = (byte *)destArea.getBasePtr(0, destY);
-		const byte *srcP = (const byte *)src.getBasePtr(
-		                       horizFlip ? srcArea.right - 4 : srcArea.left,
-		                       vertFlip ? srcArea.bottom - 1 - yCtr : srcArea.top + yCtr);
-		for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += scaleY) {
-			uint32x4_t xCtrWidthSIMD = vdupq_n_u32(xCtrWidth);
-
-			if (ScaleThreshold == 0) {
-				for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
-					byte *destPtr = &destP[destX * DestBytesPerPixel];
-					uint32x4_t skipMask = vcgeq_u32(vaddq_u32(vdupq_n_u32(xCtr), addIndexes), xCtrWidthSIMD);
-					drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
-				}
-				destP += destArea.pitch;
-				srcP += vertFlip ? -src.pitch : src.pitch;
-			} else {
-				int newSrcYCtr = scaleYCtr / ScaleThreshold;
-				if (srcYCtr != newSrcYCtr) {
-					int diffSrcYCtr = newSrcYCtr - srcYCtr;
-					srcP += src.pitch * diffSrcYCtr;
-					srcYCtr = newSrcYCtr;
-				}
-				byte srcBuffer[4*4];
-				for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart, scaleXCtr = xCtrStart * scaleX; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
-					if (yCtr + 1 == yCtrHeight && xCtr + 4 > xCtrWidth) break;
-					uint32x4_t indexes = vdupq_n_u32(scaleXCtr);
-#if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
-					indexes = vmulq_n_u32(vshrq_n_u32(vaddq_u32(indexes, scaleAdds), 8), SrcBytesPerPixel);
-#else
-#error Change code to allow different scale threshold!
-#endif
-					memcpy(&srcBuffer[0*(uintptr_t)SrcBytesPerPixel], srcP + vgetq_lane_u32(indexes, 0), SrcBytesPerPixel);
-					memcpy(&srcBuffer[1*(uintptr_t)SrcBytesPerPixel], srcP + vgetq_lane_u32(indexes, 1), SrcBytesPerPixel);
-					memcpy(&srcBuffer[2*(uintptr_t)SrcBytesPerPixel], srcP + vgetq_lane_u32(indexes, 2), SrcBytesPerPixel);
-					memcpy(&srcBuffer[3*(uintptr_t)SrcBytesPerPixel], srcP + vgetq_lane_u32(indexes, 3), SrcBytesPerPixel);
-					scaleXCtr += scaleX*4;
-					byte *destPtr = &destP[destX * (uintptr_t)DestBytesPerPixel];
-					uint32x4_t skipMask = vcgeq_u32(vaddq_u32(vdupq_n_u32(xCtr), addIndexes), xCtrWidthSIMD);
-					drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, (const byte *)srcBuffer, tint, alphas, maskedAlphas, transColors, 1, 0, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
-				}
-				if (yCtr + 1 != yCtrHeight) destP += destArea.pitch;
-			}
-		}
-
-		// Get the last x values of the last row
-		int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart;
-		if (xCtrWidth % 4 == 0) return;
-		if (ScaleThreshold == 0) {
-			for (; xCtr + 4 < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
-				byte *destPtr = &destP[destX * DestBytesPerPixel];
-				drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, vmovq_n_u32(0));
-			}
-			if (horizFlip) srcP += SrcBytesPerPixel * 3;
-		} else {
-			xCtr = xCtrWidth - xCtrWidth % 4;
-			xCtrBpp = xCtr * SrcBytesPerPixel;
-			destX = xStart+xCtr;
-		}
-		for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += SrcBytesPerPixel) {
-			const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
-			if (ScaleThreshold != 0) {
-				srcColPtr = (const byte *)(srcP + (xCtr * scaleX) / ScaleThreshold * SrcBytesPerPixel);
-			}
-			byte *destVal = (byte *)&destP[destX * DestBytesPerPixel];
-			uint32 srcCol = getColor(srcColPtr, SrcBytesPerPixel);
-			
-			// Check if this is a transparent color we should skip
-			if (skipTrans && ((srcCol & alphaMask) == transColor))
-				continue;
-
-			src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
-			if (srcAlpha != -1) {
-				if (useTint) {
-					rDest = rSrc;
-					gDest = gSrc;
-					bDest = bSrc;
-					aDest = aSrc;
-					rSrc = tintRed;
-					gSrc = tintGreen;
-					bSrc = tintBlue;
-					aSrc = srcAlpha;
-				}/* else {
-					format.colorToARGB(getColor(destVal, DestBytesPerPixel), aDest, rDest, gDest, bDest);
-				}*/
-				blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, srcAlpha, useTint, destVal);
-				srcCol = format.ARGBToColor(aDest, rDest, gDest, bDest);
-			} else {
-				srcCol = format.ARGBToColor(aSrc, rSrc, gSrc, bSrc);
-			}
-			if (DestBytesPerPixel == 4)
-				*(uint32 *)destVal = srcCol;
-			else
-				*(uint16 *)destVal = srcCol;
-		}
-	}
-
+	void drawInner4BppWithConv(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY);
 	template<int ScaleThreshold>
-	void drawInner2Bpp(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
-		const int xDir = horizFlip ? -1 : 1;
-		byte rSrc, gSrc, bSrc, aSrc;
-		byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
-		uint16x8_t tint = vdupq_n_u16(src.format.ARGBToColor(srcAlpha, tintRed, tintGreen, tintBlue));
-		uint16x8_t transColors = vdupq_n_u16(transColor);
-		uint16x8_t alphas = vdupq_n_u16(srcAlpha);
-		uint16x8_t addIndexes = {0, 1, 2, 3, 4, 5, 6, 7};
-		if (horizFlip) addIndexes = {7, 6, 5, 4, 3, 2, 1, 0};
-		uint32x4_t scaleAdds = {0, (uint32)scaleX, (uint32)scaleX*2, (uint32)scaleX*3};
-		uint32x4_t scaleAdds2 = {(uint32)scaleX*4, (uint32)scaleX*5, (uint32)scaleX*6, (uint32)scaleX*7};
-		
-		int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = dstRect.width();
-		if (xStart + xCtrWidth > destArea.w) {
-			xCtrWidth = destArea.w - xStart;
-		}
-		if (xStart < 0) {
-			xCtrStart = -xStart;
-			xCtrBppStart = xCtrStart * 2;
-			xStart = 0;
-		}
-		int destY = yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 8 == 0) ? dstRect.height() : (dstRect.height() - 1);
-		if (ScaleThreshold != 0) yCtrHeight = dstRect.height();
-		if (yStart < 0) {
-			yCtr = -yStart;
-			destY = 0;
-			if (ScaleThreshold != 0) {
-				scaleYCtr = yCtr * scaleY;
-				srcYCtr = scaleYCtr / ScaleThreshold;
-			}
-		}
-		if (yStart + yCtrHeight > destArea.h) {
-			yCtrHeight = destArea.h - yStart;
-		}
-		
-		byte *destP = (byte *)destArea.getBasePtr(0, destY);
-		const byte *srcP = (const byte *)src.getBasePtr(
-		                       horizFlip ? srcArea.right - 8 : srcArea.left,
-		                       vertFlip ? srcArea.bottom - 1 - yCtr : srcArea.top + yCtr);
-		for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += scaleY) {
-			uint16x8_t xCtrWidthSIMD = vmovq_n_u16(xCtrWidth);
-			if (ScaleThreshold == 0) {
-				for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
-					byte *destPtr = &destP[destX * 2];
-					uint16x8_t skipMask = vcgeq_u16(vaddq_u16(vdupq_n_u16(xCtr), addIndexes), xCtrWidthSIMD);
-					drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
-				}
-				destP += destArea.pitch;
-				srcP += vertFlip ? -src.pitch : src.pitch;
-			} else {
-				int newSrcYCtr = scaleYCtr / ScaleThreshold;
-				if (srcYCtr != newSrcYCtr) {
-					int diffSrcYCtr = newSrcYCtr - srcYCtr;
-					srcP += src.pitch * diffSrcYCtr;
-					srcYCtr = newSrcYCtr;
-				}
-				uint16 srcBuffer[8];
-				for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart, scaleXCtr = xCtrStart * scaleX; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
-					if (yCtr + 1 == yCtrHeight && xCtr + 8 > xCtrWidth) break;
-					uint32x4_t indexes = vdupq_n_u32(scaleXCtr), indexes2 = vdupq_n_u32(scaleXCtr);
-#if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
-					indexes = vmulq_n_u32(vshrq_n_u32(vaddq_u32(indexes, scaleAdds), 8), 2);
-					indexes2 = vmulq_n_u32(vshrq_n_u32(vaddq_u32(indexes2, scaleAdds2), 8), 2);
-#else
-#error Change code to allow different scale threshold!
-#endif
-					srcBuffer[0] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes, 0));
-					srcBuffer[1] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes, 1));
-					srcBuffer[2] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes, 2));
-					srcBuffer[3] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes, 3));
-					srcBuffer[4] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes2, 0));
-					srcBuffer[5] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes2, 1));
-					srcBuffer[6] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes2, 2));
-					srcBuffer[7] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes2, 3));
-					scaleXCtr += scaleX*8;
-					byte *destPtr = &destP[destX * 2];
-					uint16x8_t skipMask = vcgeq_u16(vaddq_u16(vdupq_n_u16(xCtr), addIndexes), xCtrWidthSIMD);
-					drawPixelSIMD2Bpp(destPtr, (const byte *)srcBuffer, tint, alphas, transColors, 1, 0, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
-				}
-				if (yCtr + 1 != yCtrHeight) destP += destArea.pitch;
-			}
-		}
-
-		// Get the last x values of the last row
-		if (xCtrWidth % 8 == 0) return;
-		int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart;
-		if (ScaleThreshold == 0) {
-			for (; xCtr + 8 < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
-				byte *destPtr = &destP[destX * 2];
-				drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, vmovq_n_u16(0));
-			}
-			if (horizFlip) srcP += 2*3;
-		} else {
-			xCtr = xCtrWidth - xCtrWidth % 8;
-			xCtrBpp = xCtr * 2;
-			destX = xStart+xCtr;
-		}
-		for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += 2) {
-			const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
-			if (ScaleThreshold != 0) {
-				srcColPtr = (const byte *)(srcP + (xCtr * scaleX) / ScaleThreshold * 2);
-			}
-			byte *destVal = (byte *)&destP[destX * 2];
-			uint32 srcCol = (uint32)(*(const uint16 *)srcColPtr);
-			
-			// Check if this is a transparent color we should skip
-			if (skipTrans && srcCol == transColor)
-				continue;
-
-			src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
-			if (srcAlpha != -1) {
-				if (useTint) {
-					rDest = rSrc;
-					gDest = gSrc;
-					bDest = bSrc;
-					aDest = aSrc;
-					rSrc = tintRed;
-					gSrc = tintGreen;
-					bSrc = tintBlue;
-					aSrc = srcAlpha;
-				}/* else {
-					format.colorToARGB((uint32)(*(uint16 *)destVal), aDest, rDest, gDest, bDest);
-				}*/
-				blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, srcAlpha, useTint, destVal);
-				srcCol = format.ARGBToColor(aDest, rDest, gDest, bDest);
-			} else {
-				srcCol = format.ARGBToColor(aSrc, rSrc, gSrc, bSrc);
-			}
-			*(uint16 *)destVal = srcCol;
-		}
-	}
-
-	// Call drawInner with BytesPerPixel=0 if both formats aren't the same.
+	void drawInner2Bpp(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY);
 	template<int ScaleThreshold>
-	void drawInnerGeneric(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
-		const int xDir = horizFlip ? -1 : 1;
-		byte rSrc, gSrc, bSrc, aSrc;
-		byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
-		
-		int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = dstRect.width();
-		if (xStart + xCtrWidth > destArea.w) {
-			xCtrWidth = destArea.w - xStart;
-		}
-		if (xStart < 0) {
-			xCtrStart = -xStart;
-			xCtrBppStart = xCtrStart * src.format.bytesPerPixel;
-			xStart = 0;
-		}
-		int destY = yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = dstRect.height();
-		if (yStart < 0) {
-			yCtr = -yStart;
-			destY = 0;
-			if (ScaleThreshold != 0) {
-				scaleYCtr = yCtr * scaleY;
-				srcYCtr = scaleYCtr / ScaleThreshold;
-			}
-		}
-		if (yStart + yCtrHeight > destArea.h) {
-			yCtrHeight = destArea.h - yStart;
-		}
-
-		byte *destP = (byte *)destArea.getBasePtr(0, destY);
-		const byte *srcP = (const byte *)src.getBasePtr(
-		                       horizFlip ? srcArea.right - 1 : srcArea.left,
-		                       vertFlip ? srcArea.bottom - 1 - yCtr :
-		                       srcArea.top + yCtr);
-		for (; yCtr < dstRect.height(); ++destY, ++yCtr, scaleYCtr += scaleY) {
-			if (ScaleThreshold != 0) {
-				int newSrcYCtr = scaleYCtr / ScaleThreshold;
-				if (srcYCtr != newSrcYCtr) {
-					int diffSrcYCtr = newSrcYCtr - srcYCtr;
-					srcP += src.pitch * diffSrcYCtr;
-					srcYCtr = newSrcYCtr;
-				}
-			}
-			// Loop through the pixels of the row
-			for (int destX = xStart, xCtr = xCtrStart, xCtrBpp = xCtrBppStart, scaleXCtr = xCtr * scaleX; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += src.format.bytesPerPixel, scaleXCtr += scaleX) {
-				const byte *srcVal = srcP + xDir * xCtrBpp;
-				if (ScaleThreshold != 0) {
-					srcVal = srcP + (scaleXCtr / ScaleThreshold) * src.format.bytesPerPixel;
-				}
-				uint32 srcCol = getColor(srcVal, src.format.bytesPerPixel);
-
-				// Check if this is a transparent color we should skip
-				if (skipTrans && ((srcCol & alphaMask) == transColor))
-					continue;
-
-				byte *destVal = (byte *)&destP[destX * format.bytesPerPixel];
-
-				// When blitting to the same format we can just copy the color
-				if (format.bytesPerPixel == 1) {
-					*destVal = srcCol;
-					continue;
-				} else if (sameFormat && srcAlpha == -1) {
-					if (format.bytesPerPixel == 4)
-						*(uint32 *)destVal = srcCol;
-					else
-						*(uint16 *)destVal = srcCol;
-					continue;
-				}
-
-				// We need the rgb values to do blending and/or convert between formats
-				if (src.format.bytesPerPixel == 1) {
-					const RGB &rgb = palette[srcCol];
-					aSrc = 0xff;
-					rSrc = rgb.r;
-					gSrc = rgb.g;
-					bSrc = rgb.b;
-				} else {
-					src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
-				}
-
-				if (srcAlpha == -1) {
-					// This means we don't use blending.
-					aDest = aSrc;
-					rDest = rSrc;
-					gDest = gSrc;
-					bDest = bSrc;
-				} else {
-					if (useTint) {
-						rDest = rSrc;
-						gDest = gSrc;
-						bDest = bSrc;
-						aDest = aSrc;
-						rSrc = tintRed;
-						gSrc = tintGreen;
-						bSrc = tintBlue;
-						aSrc = srcAlpha;
-					}
-					blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, srcAlpha, useTint, destVal);
-				}
-
-				uint32 pixel = format.ARGBToColor(aDest, rDest, gDest, bDest);
-				if (format.bytesPerPixel == 4)
-					*(uint32 *)destVal = pixel;
-				else
-					*(uint16 *)destVal = pixel;
-			}
-
-			destP += destArea.pitch;
-			if (ScaleThreshold == 0) srcP += vertFlip ? -src.pitch : src.pitch;
-		}
-	}
-	
+	void drawInner1Bpp(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY);
 	template<int ScaleThreshold>
-	void drawInner1Bpp(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
-		const int xDir = horizFlip ? -1 : 1;
-		uint8x16_t transColors = vld1q_dup_u8(&transColor);
-		uint32x4_t scaleAdds1 = {0, (uint32)scaleX, (uint32)scaleX*2, (uint32)scaleX*3};
-		uint32x4_t scaleAdds2 = {(uint32)scaleX*4, (uint32)scaleX*5, (uint32)scaleX*6, (uint32)scaleX*7};
-		uint32x4_t scaleAdds3 = {(uint32)scaleX*8, (uint32)scaleX*9, (uint32)scaleX*10, (uint32)scaleX*11};
-		uint32x4_t scaleAdds4 = {(uint32)scaleX*12, (uint32)scaleX*13, (uint32)scaleX*14, (uint32)scaleX*15};
-		
-		int xCtrStart = 0, xCtrWidth = dstRect.width();
-		if (xStart + xCtrWidth > destArea.w) {
-			xCtrWidth = destArea.w - xStart;
-		}
-		if (xStart < 0) {
-			xCtrStart = -xStart;
-			xStart = 0;
-		}
-		int destY = yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = dstRect.height();
-		if (ScaleThreshold != 0) yCtrHeight = dstRect.height();
-		if (yStart < 0) {
-			yCtr = -yStart;
-			destY = 0;
-			if (ScaleThreshold != 0) {
-				scaleYCtr = yCtr * scaleY;
-				srcYCtr = scaleYCtr / ScaleThreshold;
-			}
-		}
-		if (yStart + yCtrHeight > destArea.h) {
-			yCtrHeight = destArea.h - yStart;
-		}
-		
-		byte *destP = (byte *)destArea.getBasePtr(0, destY);
-		const byte *srcP = (const byte *)src.getBasePtr(
-		                       horizFlip ? srcArea.right - 16 : srcArea.left,
-		                       vertFlip ? srcArea.bottom - 1 - yCtr : srcArea.top + yCtr);
-		for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += scaleY) {
-			if (ScaleThreshold != 0) {
-				int newSrcYCtr = scaleYCtr / ScaleThreshold;
-				if (srcYCtr != newSrcYCtr) {
-					int diffSrcYCtr = newSrcYCtr - srcYCtr;
-					srcP += src.pitch * diffSrcYCtr;
-					srcYCtr = newSrcYCtr;
-				}
-			}
-			int xCtr = xCtrStart, destX = xStart, scaleXCtr = xCtrStart * scaleX;
-			for (; xCtr + 16 < xCtrWidth; destX += 16, xCtr += 16) {
-				byte *destPtr = &destP[destX];
-				uint8x16_t destCols = vld1q_u8(destPtr);
-				uint8x16_t srcCols = vld1q_u8(srcP + xDir * xCtr);
-				if (ScaleThreshold != 0) {
-					uint32x4_t indexes1 = vdupq_n_u32(scaleXCtr), indexes2 = vdupq_n_u32(scaleXCtr);
-					uint32x4_t indexes3 = vdupq_n_u32(scaleXCtr), indexes4 = vdupq_n_u32(scaleXCtr);
-#if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
-					indexes1 = vshrq_n_u32(vaddq_u32(indexes1, scaleAdds1), 8);
-					indexes2 = vshrq_n_u32(vaddq_u32(indexes2, scaleAdds2), 8);
-					indexes3 = vshrq_n_u32(vaddq_u32(indexes3, scaleAdds3), 8);
-					indexes4 = vshrq_n_u32(vaddq_u32(indexes4, scaleAdds4), 8);
-#else
-#error Change code to allow different scale threshold!
-#endif
-					srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes1, 0)], srcCols, 0);
-					srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes1, 1)], srcCols, 1);
-					srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes1, 2)], srcCols, 2);
-					srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes1, 3)], srcCols, 3);
-					srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes2, 0)], srcCols, 4);
-					srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes2, 1)], srcCols, 5);
-					srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes2, 2)], srcCols, 6);
-					srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes2, 3)], srcCols, 7);
-					srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes3, 0)], srcCols, 8);
-					srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes3, 1)], srcCols, 9);
-					srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes3, 2)], srcCols, 10);
-					srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes3, 3)], srcCols, 11);
-					srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes4, 0)], srcCols, 12);
-					srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes4, 1)], srcCols, 13);
-					srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes4, 2)], srcCols, 14);
-					srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes4, 3)], srcCols, 15);
-					scaleXCtr += scaleX*16;
-				}
-				uint8x16_t mask1 = skipTrans ? vceqq_u8(srcCols, transColors) : vmovq_n_u8(0);
-				uint8x16_t final = vorrq_u8(vandq_u8(srcCols, vmvnq_u8(mask1)), vandq_u8(destCols, mask1));
-				if (horizFlip) {
-					final = vrev64q_u8(final);
-					final = vcombine_u8(vget_high_u8(final), vget_low_u8(final));
-				}
-				vst1q_u8(destPtr, final);
-			}
-			// Get the last x values
-			if (horizFlip) srcP += 15;
-			for (; xCtr < xCtrWidth; ++destX, ++xCtr, scaleXCtr += scaleX) {
-				const byte *srcCol = (const byte *)(srcP + xDir * xCtr);
-				if (ScaleThreshold != 0) {
-					srcCol = (const byte *)(srcP + scaleXCtr / ScaleThreshold);
-				}
-				// Check if this is a transparent color we should skip
-				if (skipTrans && *srcCol == transColor)
-					continue;
-
-				byte *destVal = (byte *)&destP[destX];
-				*destVal = *srcCol;
-			}
-			if (horizFlip) srcP -= 15;
-			destP += destArea.pitch;
-			if (ScaleThreshold == 0) srcP += vertFlip ? -src.pitch : src.pitch;
-		}
-	}
-
+	void drawInnerGeneric(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY);
+	
 	inline uint32 getColor(const byte *data, byte bpp) const {
 		switch (bpp) {
 		case 1:
diff --git a/engines/ags/lib/allegro/surface_simd_neon.cpp b/engines/ags/lib/allegro/surface_simd_neon.cpp
new file mode 100644
index 00000000000..d3d769a81c1
--- /dev/null
+++ b/engines/ags/lib/allegro/surface_simd_neon.cpp
@@ -0,0 +1,404 @@
+#ifdef __aarch64__
+
+#include "ags/lib/allegro/gfx.h"
+#include "ags/lib/allegro/color.h"
+#include "ags/lib/allegro/flood.h"
+#include "ags/ags.h"
+#include "ags/globals.h"
+#include "common/textconsole.h"
+#include "graphics/screen.h"
+
+#include "ags/lib/allegro/surface_simd_neon.h"
+
+namespace AGS3 {
+
+// This template handles 2bpp and 4bpp; the other specializations handle 1bpp and format-conversion blits
+template<int DestBytesPerPixel, int SrcBytesPerPixel, int ScaleThreshold>
+void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
+	const int xDir = horizFlip ? -1 : 1;
+	byte rSrc, gSrc, bSrc, aSrc;
+	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
+	uint32x4_t tint = vshlq_n_u32(vdupq_n_u32(srcAlpha), 24);
+	tint = vorrq_u32(tint, vshlq_n_u32(vdupq_n_u32(tintRed), 16));
+	tint = vorrq_u32(tint, vshlq_n_u32(vdupq_n_u32(tintGreen), 8));
+	tint = vorrq_u32(tint, vdupq_n_u32(tintBlue));
+	uint32x4_t maskedAlphas = vld1q_dup_u32(&alphaMask);
+	uint32x4_t transColors = vld1q_dup_u32(&transColor);
+	uint32x4_t alphas = vld1q_dup_u32(&srcAlpha);
+	uint32x4_t addIndexes = {0, 1, 2, 3};
+	if (horizFlip) addIndexes = {3, 2, 1, 0};
+	uint32x4_t scaleAdds = {0, (uint32)scaleX, (uint32)scaleX*2, (uint32)scaleX*3};
+	
+	int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = dstRect.width();
+	if (xStart + xCtrWidth > destArea.w) {
+		xCtrWidth = destArea.w - xStart;
+	}
+	if (xStart < 0) {
+		xCtrStart = -xStart;
+		xCtrBppStart = xCtrStart * SrcBytesPerPixel;
+		xStart = 0;
+	}
+	int destY = yStart, srcYCtr = 0, yCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 4 == 0) ? dstRect.height() : (dstRect.height() - 1);
+	if (ScaleThreshold != 0) yCtrHeight = dstRect.height();
+	if (yStart < 0) {
+		yCtr = -yStart;
+		destY = 0;
+		if (ScaleThreshold != 0) {
+			scaleYCtr = yCtr * scaleY;
+			srcYCtr = scaleYCtr / ScaleThreshold;
+		}
+	}
+	if (yStart + yCtrHeight > destArea.h) {
+		yCtrHeight = destArea.h - yStart;
+	}
+	
+	byte *destP = (byte *)destArea.getBasePtr(0, destY);
+	const byte *srcP = (const byte *)src.getBasePtr(
+	                       horizFlip ? srcArea.right - 4 : srcArea.left,
+	                       vertFlip ? srcArea.bottom - 1 - yCtr : srcArea.top + yCtr);
+	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += scaleY) {
+		uint32x4_t xCtrWidthSIMD = vdupq_n_u32(xCtrWidth);
+
+		if (ScaleThreshold == 0) {
+			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
+				byte *destPtr = &destP[destX * DestBytesPerPixel];
+				uint32x4_t skipMask = vcgeq_u32(vaddq_u32(vdupq_n_u32(xCtr), addIndexes), xCtrWidthSIMD);
+				drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
+			}
+			destP += destArea.pitch;
+			srcP += vertFlip ? -src.pitch : src.pitch;
+		} else {
+			int newSrcYCtr = scaleYCtr / ScaleThreshold;
+			if (srcYCtr != newSrcYCtr) {
+				int diffSrcYCtr = newSrcYCtr - srcYCtr;
+				srcP += src.pitch * diffSrcYCtr;
+				srcYCtr = newSrcYCtr;
+			}
+			byte srcBuffer[4*4];
+			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart, scaleXCtr = xCtrStart * scaleX; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
+				if (yCtr + 1 == yCtrHeight && xCtr + 4 > xCtrWidth) break;
+				uint32x4_t indexes = vdupq_n_u32(scaleXCtr);
+#if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
+				indexes = vmulq_n_u32(vshrq_n_u32(vaddq_u32(indexes, scaleAdds), 8), SrcBytesPerPixel);
+#else
+#error Change code to allow different scale threshold!
+#endif
+				memcpy(&srcBuffer[0*(uintptr_t)SrcBytesPerPixel], srcP + vgetq_lane_u32(indexes, 0), SrcBytesPerPixel);
+				memcpy(&srcBuffer[1*(uintptr_t)SrcBytesPerPixel], srcP + vgetq_lane_u32(indexes, 1), SrcBytesPerPixel);
+				memcpy(&srcBuffer[2*(uintptr_t)SrcBytesPerPixel], srcP + vgetq_lane_u32(indexes, 2), SrcBytesPerPixel);
+				memcpy(&srcBuffer[3*(uintptr_t)SrcBytesPerPixel], srcP + vgetq_lane_u32(indexes, 3), SrcBytesPerPixel);
+				scaleXCtr += scaleX*4;
+				byte *destPtr = &destP[destX * (uintptr_t)DestBytesPerPixel];
+				uint32x4_t skipMask = vcgeq_u32(vaddq_u32(vdupq_n_u32(xCtr), addIndexes), xCtrWidthSIMD);
+				drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, (const byte *)srcBuffer, tint, alphas, maskedAlphas, transColors, 1, 0, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
+			}
+			if (yCtr + 1 != yCtrHeight) destP += destArea.pitch;
+		}
+	}
+
+	// Get the last x values of the last row
+	int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart;
+	if (xCtrWidth % 4 == 0) return;
+	if (ScaleThreshold == 0) {
+		for (; xCtr + 4 < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
+			byte *destPtr = &destP[destX * DestBytesPerPixel];
+			drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, vmovq_n_u32(0));
+		}
+		if (horizFlip) srcP += SrcBytesPerPixel * 3;
+	} else {
+		xCtr = xCtrWidth - xCtrWidth % 4;
+		xCtrBpp = xCtr * SrcBytesPerPixel;
+		destX = xStart+xCtr;
+	}
+	for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += SrcBytesPerPixel) {
+		const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
+		if (ScaleThreshold != 0) {
+			srcColPtr = (const byte *)(srcP + (xCtr * scaleX) / ScaleThreshold * SrcBytesPerPixel);
+		}
+		byte *destVal = (byte *)&destP[destX * DestBytesPerPixel];
+		uint32 srcCol = getColor(srcColPtr, SrcBytesPerPixel);
+		
+		// Check if this is a transparent color we should skip
+		if (skipTrans && ((srcCol & alphaMask) == transColor))
+			continue;
+
+		src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
+		if (srcAlpha != -1) {
+			if (useTint) {
+				rDest = rSrc;
+				gDest = gSrc;
+				bDest = bSrc;
+				aDest = aSrc;
+				rSrc = tintRed;
+				gSrc = tintGreen;
+				bSrc = tintBlue;
+				aSrc = srcAlpha;
+			}/* else {
+				format.colorToARGB(getColor(destVal, DestBytesPerPixel), aDest, rDest, gDest, bDest);
+			}*/
+			blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, srcAlpha, useTint, destVal);
+			srcCol = format.ARGBToColor(aDest, rDest, gDest, bDest);
+		} else {
+			srcCol = format.ARGBToColor(aSrc, rSrc, gSrc, bSrc);
+		}
+		if (DestBytesPerPixel == 4)
+			*(uint32 *)destVal = srcCol;
+		else
+			*(uint16 *)destVal = srcCol;
+	}
+}
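
The loop above works on four 32-bit pixels per iteration; the lane mask built with vdupq_n_u32/vaddq_u32/vcgeq_u32 makes any lane that would run past the clipped row keep its destination pixel (drawPixelSIMD then selects (dest & mask) | (src & ~mask)), and the leftover pixels of the last row fall back to the scalar per-pixel path. A rough scalar sketch of that mask; buildSkipMask is a hypothetical name, not part of this commit:

#include <cstdint>

static inline void buildSkipMask(uint32_t mask[4], int xCtr, int xCtrWidth) {
	// Lane i keeps the destination pixel when xCtr + i is past the end of the row.
	for (int i = 0; i < 4; ++i)
		mask[i] = (uint32_t)(xCtr + i) >= (uint32_t)xCtrWidth ? 0xffffffffu : 0u;
}
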
+
+template<int ScaleThreshold>
+void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
+	const int xDir = horizFlip ? -1 : 1;
+	byte rSrc, gSrc, bSrc, aSrc;
+	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
+	uint16x8_t tint = vdupq_n_u16(src.format.ARGBToColor(srcAlpha, tintRed, tintGreen, tintBlue));
+	uint16x8_t transColors = vdupq_n_u16(transColor);
+	uint16x8_t alphas = vdupq_n_u16(srcAlpha);
+	uint16x8_t addIndexes = {0, 1, 2, 3, 4, 5, 6, 7};
+	if (horizFlip) addIndexes = {7, 6, 5, 4, 3, 2, 1, 0};
+	uint32x4_t scaleAdds = {0, (uint32)scaleX, (uint32)scaleX*2, (uint32)scaleX*3};
+	uint32x4_t scaleAdds2 = {(uint32)scaleX*4, (uint32)scaleX*5, (uint32)scaleX*6, (uint32)scaleX*7};
+	
+	int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = dstRect.width();
+	if (xStart + xCtrWidth > destArea.w) {
+		xCtrWidth = destArea.w - xStart;
+	}
+	if (xStart < 0) {
+		xCtrStart = -xStart;
+		xCtrBppStart = xCtrStart * 2;
+		xStart = 0;
+	}
+	int destY = yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 8 == 0) ? dstRect.height() : (dstRect.height() - 1);
+	if (ScaleThreshold != 0) yCtrHeight = dstRect.height();
+	if (yStart < 0) {
+		yCtr = -yStart;
+		destY = 0;
+		if (ScaleThreshold != 0) {
+			scaleYCtr = yCtr * scaleY;
+			srcYCtr = scaleYCtr / ScaleThreshold;
+		}
+	}
+	if (yStart + yCtrHeight > destArea.h) {
+		yCtrHeight = destArea.h - yStart;
+	}
+	
+	byte *destP = (byte *)destArea.getBasePtr(0, destY);
+	const byte *srcP = (const byte *)src.getBasePtr(
+	                       horizFlip ? srcArea.right - 8 : srcArea.left,
+	                       vertFlip ? srcArea.bottom - 1 - yCtr : srcArea.top + yCtr);
+	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += scaleY) {
+		uint16x8_t xCtrWidthSIMD = vmovq_n_u16(xCtrWidth);
+		if (ScaleThreshold == 0) {
+			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
+				byte *destPtr = &destP[destX * 2];
+				uint16x8_t skipMask = vcgeq_u16(vaddq_u16(vdupq_n_u16(xCtr), addIndexes), xCtrWidthSIMD);
+				drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
+			}
+			destP += destArea.pitch;
+			srcP += vertFlip ? -src.pitch : src.pitch;
+		} else {
+			int newSrcYCtr = scaleYCtr / ScaleThreshold;
+			if (srcYCtr != newSrcYCtr) {
+				int diffSrcYCtr = newSrcYCtr - srcYCtr;
+				srcP += src.pitch * diffSrcYCtr;
+				srcYCtr = newSrcYCtr;
+			}
+			uint16 srcBuffer[8];
+			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart, scaleXCtr = xCtrStart * scaleX; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
+				if (yCtr + 1 == yCtrHeight && xCtr + 8 > xCtrWidth) break;
+				uint32x4_t indexes = vdupq_n_u32(scaleXCtr), indexes2 = vdupq_n_u32(scaleXCtr);
+#if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
+				indexes = vmulq_n_u32(vshrq_n_u32(vaddq_u32(indexes, scaleAdds), 8), 2);
+				indexes2 = vmulq_n_u32(vshrq_n_u32(vaddq_u32(indexes2, scaleAdds2), 8), 2);
+#else
+#error Change code to allow different scale threshold!
+#endif
+				srcBuffer[0] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes, 0));
+				srcBuffer[1] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes, 1));
+				srcBuffer[2] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes, 2));
+				srcBuffer[3] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes, 3));
+				srcBuffer[4] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes2, 0));
+				srcBuffer[5] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes2, 1));
+				srcBuffer[6] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes2, 2));
+				srcBuffer[7] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes2, 3));
+				scaleXCtr += scaleX*8;
+				byte *destPtr = &destP[destX * 2];
+				uint16x8_t skipMask = vcgeq_u16(vaddq_u16(vdupq_n_u16(xCtr), addIndexes), xCtrWidthSIMD);
+				drawPixelSIMD2Bpp(destPtr, (const byte *)srcBuffer, tint, alphas, transColors, 1, 0, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
+			}
+			if (yCtr + 1 != yCtrHeight) destP += destArea.pitch;
+		}
+	}
+
+	// Get the last x values of the last row
+	if (xCtrWidth % 8 == 0) return;
+	int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart;
+	if (ScaleThreshold == 0) {
+		for (; xCtr + 8 < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
+			byte *destPtr = &destP[destX * 2];
+			drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, vmovq_n_u16(0));
+		}
+		if (horizFlip) srcP += 2*3;
+	} else {
+		xCtr = xCtrWidth - xCtrWidth % 8;
+		xCtrBpp = xCtr * 2;
+		destX = xStart+xCtr;
+	}
+	for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += 2) {
+		const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
+		if (ScaleThreshold != 0) {
+			srcColPtr = (const byte *)(srcP + (xCtr * scaleX) / ScaleThreshold * 2);
+		}
+		byte *destVal = (byte *)&destP[destX * 2];
+		uint32 srcCol = (uint32)(*(const uint16 *)srcColPtr);
+		
+		// Check if this is a transparent color we should skip
+		if (skipTrans && srcCol == transColor)
+			continue;
+
+		src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
+		if (srcAlpha != -1) {
+			if (useTint) {
+				rDest = rSrc;
+				gDest = gSrc;
+				bDest = bSrc;
+				aDest = aSrc;
+				rSrc = tintRed;
+				gSrc = tintGreen;
+				bSrc = tintBlue;
+				aSrc = srcAlpha;
+			}/* else {
+				format.colorToARGB((uint32)(*(uint16 *)destVal), aDest, rDest, gDest, bDest);
+			}*/
+			blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, srcAlpha, useTint, destVal);
+			srcCol = format.ARGBToColor(aDest, rDest, gDest, bDest);
+		} else {
+			srcCol = format.ARGBToColor(aSrc, rSrc, gSrc, bSrc);
+		}
+		*(uint16 *)destVal = srcCol;
+	}
+}
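
In the scaling branch above, ScaleThreshold is 0x100, so scaleXCtr/scaleYCtr are 8.8 fixed-point source positions: each destination pixel advances the position by scaleX (or scaleY), and the integer source index is recovered with a right shift by 8 before being multiplied by the pixel size. A minimal scalar sketch assuming ScaleThreshold == 0x100; srcIndexFor is a hypothetical helper, not from this commit:

#include <cstdint>

static inline int32_t srcIndexFor(int32_t destOffset, int32_t scale /* 8.8 fixed point */) {
	// Same as (destOffset * scale) / ScaleThreshold in the non-SIMD tail loop.
	return (destOffset * scale) >> 8;
}

The per-pixel byte offset is then srcIndexFor(...) * bytesPerPixel, which is what the vmulq_n_u32(..., 2) above computes for the 2bpp case.
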
+
+template<int ScaleThreshold>
+void BITMAP::drawInner1Bpp(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
+	const int xDir = horizFlip ? -1 : 1;
+	uint8x16_t transColors = vld1q_dup_u8(&transColor);
+	uint32x4_t scaleAdds1 = {0, (uint32)scaleX, (uint32)scaleX*2, (uint32)scaleX*3};
+	uint32x4_t scaleAdds2 = {(uint32)scaleX*4, (uint32)scaleX*5, (uint32)scaleX*6, (uint32)scaleX*7};
+	uint32x4_t scaleAdds3 = {(uint32)scaleX*8, (uint32)scaleX*9, (uint32)scaleX*10, (uint32)scaleX*11};
+	uint32x4_t scaleAdds4 = {(uint32)scaleX*12, (uint32)scaleX*13, (uint32)scaleX*14, (uint32)scaleX*15};
+	
+	int xCtrStart = 0, xCtrWidth = dstRect.width();
+	if (xStart + xCtrWidth > destArea.w) {
+		xCtrWidth = destArea.w - xStart;
+	}
+	if (xStart < 0) {
+		xCtrStart = -xStart;
+		xStart = 0;
+	}
+	int destY = yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = dstRect.height();
+	if (ScaleThreshold != 0) yCtrHeight = dstRect.height();
+	if (yStart < 0) {
+		yCtr = -yStart;
+		destY = 0;
+		if (ScaleThreshold != 0) {
+			scaleYCtr = yCtr * scaleY;
+			srcYCtr = scaleYCtr / ScaleThreshold;
+		}
+	}
+	if (yStart + yCtrHeight > destArea.h) {
+		yCtrHeight = destArea.h - yStart;
+	}
+	
+	byte *destP = (byte *)destArea.getBasePtr(0, destY);
+	const byte *srcP = (const byte *)src.getBasePtr(
+	                       horizFlip ? srcArea.right - 16 : srcArea.left,
+	                       vertFlip ? srcArea.bottom - 1 - yCtr : srcArea.top + yCtr);
+	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += scaleY) {
+		if (ScaleThreshold != 0) {
+			int newSrcYCtr = scaleYCtr / ScaleThreshold;
+			if (srcYCtr != newSrcYCtr) {
+				int diffSrcYCtr = newSrcYCtr - srcYCtr;
+				srcP += src.pitch * diffSrcYCtr;
+				srcYCtr = newSrcYCtr;
+			}
+		}
+		int xCtr = xCtrStart, destX = xStart, scaleXCtr = xCtrStart * scaleX;
+		for (; xCtr + 16 < xCtrWidth; destX += 16, xCtr += 16) {
+			byte *destPtr = &destP[destX];
+			uint8x16_t destCols = vld1q_u8(destPtr);
+			uint8x16_t srcCols = vld1q_u8(srcP + xDir * xCtr);
+			if (ScaleThreshold != 0) {
+				uint32x4_t indexes1 = vdupq_n_u32(scaleXCtr), indexes2 = vdupq_n_u32(scaleXCtr);
+				uint32x4_t indexes3 = vdupq_n_u32(scaleXCtr), indexes4 = vdupq_n_u32(scaleXCtr);
+#if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
+				indexes1 = vshrq_n_u32(vaddq_u32(indexes1, scaleAdds1), 8);
+				indexes2 = vshrq_n_u32(vaddq_u32(indexes2, scaleAdds2), 8);
+				indexes3 = vshrq_n_u32(vaddq_u32(indexes3, scaleAdds3), 8);
+				indexes4 = vshrq_n_u32(vaddq_u32(indexes4, scaleAdds4), 8);
+#else
+#error Change code to allow different scale threshold!
+#endif
+				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes1, 0)], srcCols, 0);
+				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes1, 1)], srcCols, 1);
+				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes1, 2)], srcCols, 2);
+				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes1, 3)], srcCols, 3);
+				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes2, 0)], srcCols, 4);
+				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes2, 1)], srcCols, 5);
+				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes2, 2)], srcCols, 6);
+				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes2, 3)], srcCols, 7);
+				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes3, 0)], srcCols, 8);
+				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes3, 1)], srcCols, 9);
+				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes3, 2)], srcCols, 10);
+				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes3, 3)], srcCols, 11);
+				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes4, 0)], srcCols, 12);
+				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes4, 1)], srcCols, 13);
+				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes4, 2)], srcCols, 14);
+				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes4, 3)], srcCols, 15);
+				scaleXCtr += scaleX*16;
+			}
+			uint8x16_t mask1 = skipTrans ? vceqq_u8(srcCols, transColors) : vmovq_n_u8(0);
+			uint8x16_t final = vorrq_u8(vandq_u8(srcCols, vmvnq_u8(mask1)), vandq_u8(destCols, mask1));
+			if (horizFlip) {
+				final = vrev64q_u8(final);
+				final = vcombine_u8(vget_high_u8(final), vget_low_u8(final));
+			}
+			vst1q_u8(destPtr, final);
+		}
+		// Get the last x values
+		if (horizFlip) srcP += 15;
+		for (; xCtr < xCtrWidth; ++destX, ++xCtr, scaleXCtr += scaleX) {
+			const byte *srcCol = (const byte *)(srcP + xDir * xCtr);
+			if (ScaleThreshold != 0) {
+				srcCol = (const byte *)(srcP + scaleXCtr / ScaleThreshold);
+			}
+			// Check if this is a transparent color we should skip
+			if (skipTrans && *srcCol == transColor)
+				continue;
+
+			byte *destVal = (byte *)&destP[destX];
+			*destVal = *srcCol;
+		}
+		if (horizFlip) srcP -= 15;
+		destP += destArea.pitch;
+		if (ScaleThreshold == 0) srcP += vertFlip ? -src.pitch : src.pitch;
+	}
+}
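
The 8-bit path above has no blending to do, so each group of 16 bytes is just a masked select: bytes equal to the transparent colour keep the destination, everything else takes the source, and horizontal flips reverse the whole 16-byte group with vrev64q_u8 + vcombine_u8. A scalar rendition of the select for one byte; select1Bpp is a hypothetical name:

#include <cstdint>

static inline uint8_t select1Bpp(uint8_t srcPix, uint8_t dstPix, uint8_t transColor, bool skipTrans) {
	const uint8_t mask = (skipTrans && srcPix == transColor) ? 0xff : 0x00;
	return (uint8_t)((dstPix & mask) | (srcPix & (uint8_t)~mask));
}
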
+
+
+template void BITMAP::drawInner4BppWithConv<4, 4, 0>(int, int, uint32_t, uint32_t, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<4, 4, 0x100>(int, int, uint32_t, uint32_t, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<4, 2, 0>(int, int, uint32_t, uint32_t, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<4, 2, 0x100>(int, int, uint32_t, uint32_t, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<2, 4, 0>(int, int, uint32_t, uint32_t, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<2, 4, 0x100>(int, int, uint32_t, uint32_t, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner2Bpp<0>(int, int, uint32_t, uint32_t, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner2Bpp<0x100>(int, int, uint32_t, uint32_t, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner1Bpp<0>(int, int, uint32_t, uint32_t, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner1Bpp<0x100>(int, int, uint32_t, uint32_t, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
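
Because the template bodies now live in this .cpp file instead of surface.h, every parameter combination the engine actually calls has to be instantiated explicitly, which is what the lines above do. A single-file illustration of the mechanism, with copyBytes as a hypothetical example and not engine code:

template<int Bpp>
void copyBytes(unsigned char *dst, const unsigned char *src, int count) {
	for (int i = 0; i < count * Bpp; ++i)   // Bpp is baked in at compile time
		dst[i] = src[i];
}

// Without explicit instantiations like these, code that only sees the declaration
// would fail at link time with an undefined reference to copyBytes<2>/copyBytes<4>.
template void copyBytes<2>(unsigned char *, const unsigned char *, int);
template void copyBytes<4>(unsigned char *, const unsigned char *, int);
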
+
+} // namespace AGS3
+
+#endif
diff --git a/engines/ags/lib/allegro/surface_simd_neon.h b/engines/ags/lib/allegro/surface_simd_neon.h
new file mode 100644
index 00000000000..7619a87684e
--- /dev/null
+++ b/engines/ags/lib/allegro/surface_simd_neon.h
@@ -0,0 +1,361 @@
+#ifndef AGS_LIB_ALLEGRO_SURFACE_SIMD_NEON_H
+#define AGS_LIB_ALLEGRO_SURFACE_SIMD_NEON_H
+
+#include "ags/lib/allegro/surface.h"
+
+namespace AGS3 {
+
+inline uint32x4_t simd2BppTo4Bpp(uint16x4_t pixels) {
+	uint32x4_t x = vmovl_u16(pixels);
+	uint32x4_t c = vshrq_n_u32(x, 11);
+	uint32x4_t r = vshlq_n_u32(vorrq_u32(vshlq_n_u32(c, 3), vshrq_n_u32(c, 2)), 16);
+	c = vshrq_n_u32(vandq_u32(x, vmovq_n_u32(0x07e0)), 5);
+	uint32x4_t g = vshlq_n_u32(vorrq_u32(vshlq_n_u32(c, 2), vshrq_n_u32(c, 4)), 8);
+	c = vandq_u32(x, vmovq_n_u32(0x001f));
+	uint32x4_t b = vorrq_u32(vshlq_n_u32(c, 3), vshrq_n_u32(c, 2));
+	return vorrq_u32(vorrq_u32(vorrq_u32(r, g), b), vmovq_n_u32(0xff000000));
+}
+
+inline uint16x4_t simd4BppTo2Bpp(uint32x4_t pixels) {
+	uint32x4_t x = vshrq_n_u32(vandq_u32(pixels, vmovq_n_u32(0x000000ff)), 3);
+	x = vorrq_u32(x, vshlq_n_u32(vshrq_n_u32(vandq_u32(pixels, vmovq_n_u32(0x0000ff00)), 8+2), 5));
+	x = vorrq_u32(x, vshlq_n_u32(vshrq_n_u32(vandq_u32(pixels, vmovq_n_u32(0x00ff0000)), 16+3), 11));
+	return vmovn_u32(x);
+}
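
The two helpers above convert between RGB565 and ARGB8888 four pixels at a time; expansion replicates the high bits of each 5/6-bit channel into the low bits and forces alpha to 0xff. A hedged scalar equivalent, with hypothetical function names:

#include <cstdint>

static inline uint32_t rgb565ToArgb8888(uint16_t p) {
	uint32_t r = (p >> 11) & 0x1f, g = (p >> 5) & 0x3f, b = p & 0x1f;
	r = (r << 3) | (r >> 2);   // replicate the top bits so 0x1f maps to 0xff
	g = (g << 2) | (g >> 4);
	b = (b << 3) | (b >> 2);
	return 0xff000000u | (r << 16) | (g << 8) | b;
}

static inline uint16_t argb8888ToRgb565(uint32_t p) {
	return (uint16_t)(((((p >> 16) & 0xffu) >> 3) << 11) |
	                  ((((p >>  8) & 0xffu) >> 2) <<  5) |
	                   (((p      ) & 0xffu) >> 3));
}
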
+
+inline uint16x8_t rgbBlendSIMD2Bpp(uint16x8_t srcCols, uint16x8_t destCols, uint16x8_t alphas) {
+	alphas = vaddq_u16(alphas, vandq_u16(vceqq_u16(alphas, vmovq_n_u16(0)), vmovq_n_u16(1)));
+	uint16x8_t srcComps[] = {
+		vandq_u16(srcCols, vmovq_n_u16(0x1f)),
+		vandq_u16(vshrq_n_u16(srcCols, 5), vmovq_n_u16(0x3f)),
+		vshrq_n_u16(srcCols, 11),
+	}, destComps[] = {
+		vandq_u16(destCols, vmovq_n_u16(0x1f)),
+		vandq_u16(vshrq_n_u16(destCols, 5), vmovq_n_u16(0x3f)),
+		vshrq_n_u16(destCols, 11),
+	};
+	//srcComps[0] = vorrq_u16(vshlq_n_u16(srcComps[0], 3), vshrq_n_u16(srcComps[0], 2));
+	//srcComps[1] = vorrq_u16(vshlq_n_u16(srcComps[1], 2), vshrq_n_u16(srcComps[1], 4));
+	//srcComps[2] = vorrq_u16(vshlq_n_u16(srcComps[2], 3), vshrq_n_u16(srcComps[2], 2));
+	//destComps[0] = vorrq_u16(vshlq_n_u16(destComps[0], 3), vshrq_n_u16(destComps[0], 2));
+	//destComps[1] = vorrq_u16(vshlq_n_u16(destComps[1], 2), vshrq_n_u16(destComps[1], 4));
+	//destComps[2] = vorrq_u16(vshlq_n_u16(destComps[2], 3), vshrq_n_u16(destComps[2], 2));
+
+	uint16x8_t diffs[] = {
+		vsubq_u16(srcComps[0], destComps[0]), // B
+		vsubq_u16(srcComps[1], destComps[1]), // G
+		vsubq_u16(srcComps[2], destComps[2]), // R
+	};
+	alphas = vshrq_n_u16(alphas, 2);
+	diffs[1] = vshrq_n_u16(vmulq_u16(diffs[1], alphas), 6);
+	alphas = vshrq_n_u16(alphas, 1);
+	diffs[0] = vshrq_n_u16(vmulq_u16(diffs[0], alphas), 5);
+	diffs[2] = vshrq_n_u16(vmulq_u16(diffs[2], alphas), 5);
+
+	//diffs[0] = vandq_u16(vshrq_n_u16(vaddq_u16(diffs[0], destComps[0]), 3), vmovq_n_u16(0x1f));
+	//diffs[1] = vandq_u16(vshrq_n_u16(vaddq_u16(diffs[1], destComps[1]), 2), vmovq_n_u16(0x3f));
+	//diffs[2] = vandq_u16(vshrq_n_u16(vaddq_u16(diffs[2], destComps[2]), 3), vmovq_n_u16(0x1f));
+
+	diffs[0] = vandq_u16(vaddq_u16(diffs[0], destComps[0]), vmovq_n_u16(0x1f));
+	diffs[1] = vandq_u16(vaddq_u16(diffs[1], destComps[1]), vmovq_n_u16(0x3f));
+	diffs[2] = vandq_u16(vaddq_u16(diffs[2], destComps[2]), vmovq_n_u16(0x1f));
+	diffs[0] = vorrq_u16(diffs[0], vshlq_n_u16(diffs[1], 5));
+	return vorrq_u16(diffs[0], vshlq_n_u16(diffs[2], 11));
+}
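
rgbBlendSIMD2Bpp above interpolates each RGB565 channel towards the source separately: the 8-bit alpha is rescaled to the channel width (6 bits for green, 5 for red/blue) and each result is dest + ((src - dest) * alpha) >> channelBits. A scalar sketch of the intended math; rgbBlend565 is a hypothetical name, and the vector code performs the same steps in 16-bit modular lane arithmetic:

#include <cstdint>

static inline uint16_t rgbBlend565(uint16_t src, uint16_t dst, uint32_t alpha /* 0..255 */) {
	if (alpha == 0)
		alpha = 1;                        // mirrors the vceqq/vandq/vaddq fixup above
	const int a6 = (int)(alpha >> 2);     // alpha rescaled for the 6-bit green channel
	const int a5 = (int)(alpha >> 3);     // and for the 5-bit red/blue channels
	const int sb = src & 0x1f,        db = dst & 0x1f;
	const int sg = (src >> 5) & 0x3f, dg = (dst >> 5) & 0x3f;
	const int sr = src >> 11,         dr = dst >> 11;
	const int b = (db + (((sb - db) * a5) >> 5)) & 0x1f;
	const int g = (dg + (((sg - dg) * a6) >> 6)) & 0x3f;
	const int r = (dr + (((sr - dr) * a5) >> 5)) & 0x1f;
	return (uint16_t)((r << 11) | (g << 5) | b);
}
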
+
+inline uint32x4_t rgbBlendSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas, bool preserveAlpha) {
+	alphas = vaddq_u32(alphas, vandq_u32(vcgtq_u32(alphas, vmovq_n_u32(0)), vmovq_n_u32(1)));
+	uint32x4_t alpha = vandq_u32(destCols, vmovq_n_u32(0xff000000));
+	uint32x4_t srcColsCopy = srcCols;
+	srcColsCopy = vandq_u32(srcColsCopy, vmovq_n_u32(0xff00ff));
+	uint32x4_t destColsCopy = destCols;
+	destColsCopy = vandq_u32(destColsCopy, vmovq_n_u32(0xff00ff));
+	srcColsCopy = vsubq_u32(srcColsCopy, destColsCopy);
+	srcColsCopy = vmulq_u32(srcColsCopy, alphas);
+	srcColsCopy = vshrq_n_u32(srcColsCopy, 8);
+	srcColsCopy = vaddq_u32(srcColsCopy, destCols);
+
+	srcCols = vandq_u32(srcCols, vmovq_n_u32(0xff00));
+	destCols = vandq_u32(destCols, vmovq_n_u32(0xff00));
+	srcCols = vsubq_u32(srcCols, destCols);
+	srcCols = vmulq_u32(srcCols, alphas);
+	srcCols = vshrq_n_u32(srcCols, 8);
+	srcCols = vaddq_u32(srcCols, destCols);
+	srcColsCopy = vandq_u32(srcColsCopy, vmovq_n_u32(0xff00ff));
+	srcCols = vandq_u32(srcCols, vmovq_n_u32(0xff00));
+	srcCols = vorrq_u32(srcCols, srcColsCopy);
+	if (preserveAlpha) {
+		srcCols = vandq_u32(srcCols, vmovq_n_u32(0x00ffffff));
+		srcCols = vorrq_u32(srcCols, alpha);
+	}
+	return srcCols;
+}
+
+inline uint32x4_t argbBlendSIMD(uint32x4_t srcCols, uint32x4_t destCols) {
+	float16x4_t sAlphas = vcvt_f16_f32(vcvtq_f32_u32(vshrq_n_u32(srcCols, 24)));
+	sAlphas = vmul_n_f16(sAlphas, 1.0 / 255.0);
+	float16x8_t sAlphas1 = vcombine_f16(vmov_n_f16(vduph_lane_f16(sAlphas, 0)), vmov_n_f16(vduph_lane_f16(sAlphas, 1)));
+	float16x8_t sAlphas2 = vcombine_f16(vmov_n_f16(vduph_lane_f16(sAlphas, 2)), vmov_n_f16(vduph_lane_f16(sAlphas, 3)));
+	float16x4_t dAlphas = vcvt_f16_f32(vcvtq_f32_u32(vshrq_n_u32(destCols, 24)));
+	dAlphas = vmul_n_f16(dAlphas, 1.0 / 255.0);
+	dAlphas = vmul_f16(dAlphas, vsub_f16(vmov_n_f16(1.0), sAlphas));
+	float16x8_t dAlphas1 = vcombine_f16(vmov_n_f16(vduph_lane_f16(dAlphas, 0)), vmov_n_f16(vduph_lane_f16(dAlphas, 1)));
+	float16x8_t dAlphas2 = vcombine_f16(vmov_n_f16(vduph_lane_f16(dAlphas, 2)), vmov_n_f16(vduph_lane_f16(dAlphas, 3)));
+	float16x8_t srcRgb1 = vcvtq_f16_u16(vmovl_u8(vreinterpret_u8_u32(vget_low_u32(srcCols))));
+	float16x8_t destRgb1 = vcvtq_f16_u16(vmovl_u8(vreinterpret_u8_u32(vget_low_u32(destCols))));
+	float16x8_t srcRgb2 = vcvtq_f16_u16(vmovl_u8(vreinterpret_u8_u32(vget_high_u32(srcCols))));
+	float16x8_t destRgb2 = vcvtq_f16_u16(vmovl_u8(vreinterpret_u8_u32(vget_high_u32(destCols))));
+	srcRgb1 = vmulq_f16(srcRgb1, sAlphas1);
+	destRgb1 = vmulq_f16(destRgb1, dAlphas1);
+	srcRgb1 = vaddq_f16(srcRgb1, destRgb1);
+	float16x8_t alphasRec = vrecpeq_f16(vaddq_f16(sAlphas1, dAlphas1));
+	srcRgb1 = vmulq_f16(srcRgb1, alphasRec);
+	srcRgb2 = vmulq_f16(srcRgb2, sAlphas2);
+	destRgb2 = vmulq_f16(destRgb2, dAlphas2);
+	srcRgb2 = vaddq_f16(srcRgb2, destRgb2);
+	alphasRec = vrecpeq_f16(vaddq_f16(sAlphas2, dAlphas2));
+	srcRgb2 = vmulq_f16(srcRgb2, alphasRec);
+	uint16x4_t alphas = vcvta_u16_f16(vmul_n_f16(vadd_f16(sAlphas, dAlphas), 255.0));
+	uint16x8_t uintSrcRgb1 = vcvtq_u16_f16(srcRgb1), uintSrcRgb2 = vcvtq_u16_f16(srcRgb2);
+	uintSrcRgb1 = vcopyq_lane_u16(uintSrcRgb1, 3, alphas, 0);
+	uintSrcRgb1 = vcopyq_lane_u16(uintSrcRgb1, 7, alphas, 1);
+	uintSrcRgb2 = vcopyq_lane_u16(uintSrcRgb2, 3, alphas, 2);
+	uintSrcRgb2 = vcopyq_lane_u16(uintSrcRgb2, 7, alphas, 3);
+	return vcombine_u32(vreinterpret_u32_u8(vmovn_u16(uintSrcRgb1)), vreinterpret_u32_u8(vmovn_u16(uintSrcRgb2)));
+}
+
+inline uint32x4_t blendTintSpriteSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas, bool light) {
+	// This function is NOT 1 to 1 with the original... It just approximates it
+	// It gets the value of the dest color
+	// Then it gets the h and s of the srcCols
+
+	// srcCols[0] = A | R | G | B
+	// srcCols[1] = A | R | G | B
+	// srcCols[2] = A | R | G | B
+	// srcCols[3] = A | R | G | B
+	//  ->
+	// dda = { A[0], A[1], A[2], A[3] }
+	// ddr = { R[0], R[1], R[2], R[3] }
+	// ddg = { G[0], G[1], G[2], G[3] }
+	// ddb = { B[0], B[1], B[2], B[3] }
+
+	float32x4_t ddr, ddg, ddb;
+	ddr = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(vshrq_n_u32(destCols, 16), vmovq_n_u32(0xff))), 1.0 / 255.0);
+	ddg = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(vshrq_n_u32(destCols, 8), vmovq_n_u32(0xff))), 1.0 / 255.0);
+	ddb = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(destCols, vmovq_n_u32(0xff))), 1.0 / 255.0);
+	float32x4_t ssr, ssg, ssb;
+	ssr = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(vshrq_n_u32(srcCols, 16), vmovq_n_u32(0xff))), 1.0 / 255.0);
+	ssg = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(vshrq_n_u32(srcCols, 8), vmovq_n_u32(0xff))), 1.0 / 255.0);
+	ssb = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(srcCols, vmovq_n_u32(0xff))), 1.0 / 255.0);
+	float32x4_t dmaxes = vmaxq_f32(ddr, vmaxq_f32(ddg, ddb));
+	float32x4_t smaxes = vmaxq_f32(ssr, vmaxq_f32(ssg, ssb));
+	//float32x4_t dmins = vminq_f32(ddr, vminq_f32(ddg, ddb));
+	float32x4_t smins = vminq_f32(ssr, vminq_f32(ssg, ssb));
+	//float32x4_t ddelta = vsubq_f32(dmaxes, dmins);
+	
+	const float32x4_t eplison0 = vmovq_n_f32(0.0000001);
+	float32x4_t chroma = vmaxq_f32(vsubq_f32(smaxes, smins), eplison0);
+	float32x4_t hr, hg, hb, hue;
+	hr = vdivq_f32(vsubq_f32(ssg, ssb), chroma);
+	hr = vsubq_f32(hr, vmulq_n_f32(vrndmq_f32(vmulq_n_f32(hr, 1.0 / 6.0)), 6.0));
+	hg = vaddq_f32(vdivq_f32(vsubq_f32(ssb, ssr), chroma), vmovq_n_f32(2.0));
+	hb = vaddq_f32(vdivq_f32(vsubq_f32(ssr, ssg), chroma), vmovq_n_f32(4.0));
+	float32x4_t hrfactors = vcvtq_f32_u32(vandq_u32(vandq_u32(vceqq_f32(ssr, smaxes), vmvnq_u32(vceqq_u32(ssr, ssb))), vmovq_n_u32(1)));
+	float32x4_t hgfactors = vcvtq_f32_u32(vandq_u32(vandq_u32(vceqq_f32(ssg, smaxes), vmvnq_u32(vceqq_u32(ssg, ssr))), vmovq_n_u32(1)));
+	float32x4_t hbfactors = vcvtq_f32_u32(vandq_u32(vandq_u32(vceqq_f32(ssb, smaxes), vmvnq_u32(vceqq_u32(ssb, ssg))), vmovq_n_u32(1)));
+	hue = vmulq_f32(hr, hrfactors);
+	hue = vaddq_f32(hue, vmulq_f32(hg, hgfactors));
+	hue = vaddq_f32(hue, vmulq_f32(hb, hbfactors));
+	//float32x4_t hchromaZeroMask = vcvtq_f32_u32(vandq_u32(vcleq_f32(chroma, eplison0), vmovq_n_u32(1)));
+	//hue = vmulq_f32(hue, hchromaZeroMask);
+
+	// Mess with the light
+	float32x4_t val = dmaxes;
+	if (light) {
+		val = vsubq_f32(val, vsubq_f32(vmovq_n_f32(1.0), vmulq_n_f32(vcvtq_f32_u32(alphas), 1.0 / 250.0)));
+		val = vmaxq_f32(val, vmovq_n_f32(0.0));
+	}
+		
+	// then it stiches them back together
+	//AGS3::Shared::Debug::Printf(AGS3::Shared::kDbgMsg_Info, "hues: %f", vgetq_lane_f32(hue, 0));
+	chroma = vmulq_f32(val, vdivq_f32(vsubq_f32(smaxes, smins), vaddq_f32(smaxes, eplison0)));
+	float32x4_t hprime_mod2 = vmulq_n_f32(hue, 1.0 / 2.0);
+	hprime_mod2 = vmulq_n_f32(vsubq_f32(hprime_mod2, vrndmq_f32(hprime_mod2)), 2.0);
+	float32x4_t x = vmulq_f32(chroma, vsubq_f32(vmovq_n_f32(1.0), vabsq_f32(vsubq_f32(hprime_mod2, vmovq_n_f32(1.0)))));
+	uint32x4_t hprime_rounded = vcvtq_u32_f32(hue);
+	uint32x4_t x_int = vcvtq_u32_f32(vmulq_n_f32(x, 255.0));
+	uint32x4_t c_int = vcvtq_u32_f32(vmulq_n_f32(chroma, 255.0));
+
+	uint32x4_t val0 = vorrq_u32(vshlq_n_u32(x_int, 8), vshlq_n_u32(c_int, 16));
+	val0 = vandq_u32(val0, vorrq_u32(vceqq_u32(hprime_rounded, vmovq_n_u32(0)), vceqq_u32(hprime_rounded, vmovq_n_u32(6))));
+	uint32x4_t val1 = vorrq_u32(vshlq_n_u32(c_int, 8), vshlq_n_u32(x_int, 16));
+	val1 = vandq_u32(val1, vceqq_u32(hprime_rounded, vmovq_n_u32(1)));
+	uint32x4_t val2 = vorrq_u32(vshlq_n_u32(c_int, 8), x_int);
+	val2 = vandq_u32(val2, vceqq_u32(hprime_rounded, vmovq_n_u32(2)));
+	uint32x4_t val3 = vorrq_u32(vshlq_n_u32(x_int, 8), c_int);
+	val3 = vandq_u32(val3, vceqq_u32(hprime_rounded, vmovq_n_u32(3)));
+	uint32x4_t val4 = vorrq_u32(vshlq_n_u32(x_int, 16), c_int);
+	val4 = vandq_u32(val4, vceqq_u32(hprime_rounded, vmovq_n_u32(4)));
+	uint32x4_t val5 = vorrq_u32(vshlq_n_u32(c_int, 16), x_int);
+	val5 = vandq_u32(val5, vceqq_u32(hprime_rounded, vmovq_n_u32(5)));
+
+	uint32x4_t final = vorrq_u32(val0, vorrq_u32(val1, vorrq_u32(val2, vorrq_u32(val3, vorrq_u32(val4, val5)))));
+	uint32x4_t val_add = vcvtq_u32_f32(vmulq_n_f32(vsubq_f32(val, chroma), 255.0));
+	val_add = vorrq_u32(val_add, vorrq_u32(vshlq_n_u32(val_add, 8), vorrq_u32(vshlq_n_u32(val_add, 16), vandq_u32(destCols, vmovq_n_u32(0xff000000)))));
+	final = vaddq_u32(final, val_add);
+	return final;
+}
+
+inline uint32x4_t blendPixelSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas) {
+	uint32x4_t srcAlphas, difAlphas, mask, ch1, ch2;
+	auto setupArgbAlphas = [&]() {
+		srcAlphas = vshrq_n_u32(srcCols, 24);
+		difAlphas = vaddq_u32(vandq_u32(alphas, vmovq_n_u32(0xff)), vmovq_n_u32(1));
+		difAlphas = vshrq_n_u32(vmulq_u32(srcAlphas, difAlphas), 8);
+		difAlphas = vshlq_n_u32(difAlphas, 24);
+		srcAlphas = vshlq_n_u32(srcAlphas, 24);
+		mask = vceqq_u32(alphas, vmovq_n_u32(0));
+		srcAlphas = vandq_u32(srcAlphas, mask);
+		difAlphas = vandq_u32(difAlphas, vmvnq_u32(mask));
+		srcCols = vandq_u32(srcCols, vmovq_n_u32(0x00ffffff));
+		srcCols = vorrq_u32(srcCols, vorrq_u32(srcAlphas, difAlphas));
+	};
+	switch (_G(_blender_mode)) {
+	case kSourceAlphaBlender:
+		alphas = vshrq_n_u32(srcCols, 24);
+		return rgbBlendSIMD(srcCols, destCols, alphas, false);
+	case kArgbToArgbBlender:
+		setupArgbAlphas();
+		mask = vcgtq_u32(vshrq_n_u32(srcCols, 24), vmovq_n_u32(0));
+		ch1 = vandq_u32(argbBlendSIMD(srcCols, destCols), mask);
+		ch2 = vandq_u32(destCols, vmvnq_u32(mask));
+		return vorrq_u32(ch1, ch2);
+	case kArgbToRgbBlender:
+		setupArgbAlphas();
+		return rgbBlendSIMD(srcCols, destCols, vshrq_n_u32(srcCols, 24), false);
+		//mask = vcgtq_u32(vshrq_n_u32(srcCols, 24), vmovq_n_u32(0));
+		//ch1 = vandq_u32(argbBlendSIMD(srcCols, destCols), mask);
+		//ch2 = vandq_u32(destCols, vmvnq_u32(mask));
+		//return vandq_u32(vorrq_u32(ch1, ch2), vmovq_n_u32(0x00ffffff));
+	case kRgbToArgbBlender:
+		ch2 = vandq_u32(srcCols, vmovq_n_u32(0x00ffffff));
+		ch2 = vorrq_u32(ch2, vshlq_n_u32(alphas, 24));
+		ch2 = argbBlendSIMD(ch2, destCols);
+		ch1 = vorrq_u32(srcCols, vmovq_n_u32(0xff000000));
+		mask = vorrq_u32(vceqq_u32(alphas, vmovq_n_u32(0)), vceqq_u32(alphas, vmovq_n_u32(0xff)));
+		ch1 = vandq_u32(ch1, mask);
+		ch2 = vandq_u32(ch2, vmvnq_u32(mask));
+		return vorrq_u32(ch1, ch2);
+	case kRgbToRgbBlender:
+		return rgbBlendSIMD(srcCols, destCols, alphas, false);
+	case kAlphaPreservedBlenderMode:
+		return rgbBlendSIMD(srcCols, destCols, alphas, true);
+	case kOpaqueBlenderMode:
+		return vorrq_u32(srcCols, vmovq_n_u32(0xff000000));
+	case kAdditiveBlenderMode:
+		srcAlphas = vaddq_u32(vshrq_n_u32(srcCols, 24), vshrq_n_u32(destCols, 24));
+		srcAlphas = vminq_u32(srcAlphas, vmovq_n_u32(0xff));
+		srcCols = vandq_u32(srcCols, vmovq_n_u32(0x00ffffff));
+		return vorrq_u32(srcCols, vshlq_n_u32(srcAlphas, 24));
+	case kTintBlenderMode:
+		return blendTintSpriteSIMD(srcCols, destCols, alphas, false);
+	case kTintLightBlenderMode:
+		return blendTintSpriteSIMD(srcCols, destCols, alphas, true);
+	}
+}
+
+inline uint16x8_t blendPixelSIMD2Bpp(uint16x8_t srcCols, uint16x8_t destCols, uint16x8_t alphas) {
+	uint16x8_t mask, ch1, ch2;
+	switch (_G(_blender_mode)) {
+	case kSourceAlphaBlender:
+	case kOpaqueBlenderMode:
+	case kAdditiveBlenderMode:
+		return srcCols;
+	case kArgbToArgbBlender:
+	case kArgbToRgbBlender:
+		ch1 = vandq_u16(vmovq_n_u16(0xff), vceqq_u16(alphas, vmovq_n_u16(0)));
+		ch2 = vandq_u16(alphas, vcgtq_u16(alphas, vmovq_n_u16(0)));
+		alphas = vorrq_u16(ch1, ch2);
+	case kRgbToRgbBlender:
+	case kAlphaPreservedBlenderMode:
+		return rgbBlendSIMD2Bpp(srcCols, destCols, alphas);
+	case kRgbToArgbBlender:
+		mask = vorrq_u32(vceqq_u32(alphas, vmovq_n_u32(0)), vceqq_u32(alphas, vmovq_n_u32(255)));
+		ch1 = vandq_u32(srcCols, mask);
+		ch2 = vandq_u32(rgbBlendSIMD2Bpp(srcCols, destCols, alphas), vmvnq_u32(mask));
+		return vorrq_u32(ch1, ch2);
+	case kTintBlenderMode:
+	case kTintLightBlenderMode:
+		uint32x4_t srcColsLo = simd2BppTo4Bpp(vget_low_u16(srcCols));
+		uint32x4_t srcColsHi = simd2BppTo4Bpp(vget_high_u16(srcCols));
+		uint32x4_t destColsLo = simd2BppTo4Bpp(vget_low_u16(destCols));
+		uint32x4_t destColsHi = simd2BppTo4Bpp(vget_high_u16(destCols));
+		uint32x4_t alphasLo = vmovl_u16(vget_low_u16(alphas));
+		uint32x4_t alphasHi = vmovl_u16(vget_high_u16(alphas));
+		uint16x4_t lo = simd4BppTo2Bpp(blendTintSpriteSIMD(srcColsLo, destColsLo, alphasLo, _G(_blender_mode) == kTintLightBlenderMode));
+		uint16x4_t hi = simd4BppTo2Bpp(blendTintSpriteSIMD(srcColsHi, destColsHi, alphasHi, _G(_blender_mode) == kTintLightBlenderMode));
+		return vcombine_u16(lo, hi);
+	}
+}
+
+template<int DestBytesPerPixel, int SrcBytesPerPixel>
+inline void drawPixelSIMD(byte *destPtr, const byte *srcP2, uint32x4_t tint, uint32x4_t alphas, uint32x4_t maskedAlphas, uint32x4_t transColors, int xDir, int xCtrBpp, int srcAlpha, int skipTrans, bool horizFlip, bool useTint, uint32x4_t skipMask) {
+	uint32x4_t srcCols, destCol;
+	if (SrcBytesPerPixel == 4) {
+		destCol = vld1q_u32((uint32 *)destPtr);
+		srcCols = vld1q_u32((const uint32 *)(srcP2 + xDir * xCtrBpp));
+	} else {
+		destCol = simd2BppTo4Bpp(vld1_u16((uint16 *)destPtr));
+		srcCols = simd2BppTo4Bpp(vld1_u16((const uint16 *)(srcP2 + xDir * xCtrBpp)));
+	}
+	uint32x4_t anded = vandq_u32(srcCols, maskedAlphas);
+	if (srcAlpha != -1) {
+		// take into account for useTint
+		if (useTint) {
+			srcCols = blendPixelSIMD(tint, srcCols, alphas);
+		} else {
+			srcCols = blendPixelSIMD(srcCols, destCol, alphas);
+		}
+	}
+	uint32x4_t mask1 = skipTrans ? vceqq_u32(anded, transColors) : vmovq_n_u32(0);
+	mask1 = vorrq_u32(mask1, skipMask);
+	uint32x4_t destCols2 = vandq_u32(destCol, mask1);
+	uint32x4_t srcCols2 = vandq_u32(srcCols, vmvnq_u32(mask1));
+	uint32x4_t final = vorrq_u32(destCols2, srcCols2);
+	if (horizFlip) {
+		final = vrev64q_u32(final);
+		final = vcombine_u32(vget_high_u32(final), vget_low_u32(final));
+	}
+	if (DestBytesPerPixel == 4) {
+		vst1q_u32((uint32 *)destPtr, final);
+	} else {
+		vst1_u16((uint16 *)destPtr, simd4BppTo2Bpp(final));
+	}
+}
+
+inline void drawPixelSIMD2Bpp(byte *destPtr, const byte *srcP2, uint16x8_t tint, uint16x8_t alphas, uint16x8_t transColors, int xDir, int xCtrBpp, int srcAlpha, int skipTrans, bool horizFlip, bool useTint, uint16x8_t skipMask) {
+	uint16x8_t destCol = vld1q_u16((uint16 *)destPtr);
+	uint16x8_t srcCols = vld1q_u16((const uint16 *)(srcP2 + xDir * xCtrBpp));
+	if (srcAlpha != -1) {
+		// take into account for useTint
+		if (useTint) {
+			srcCols = blendPixelSIMD2Bpp(tint, srcCols, alphas);
+		} else {
+			srcCols = blendPixelSIMD2Bpp(srcCols, destCol, alphas);
+		}
+	}
+	uint16x8_t mask1 = skipTrans ? vceqq_u16(srcCols, transColors) : vmovq_n_u16(0);
+	mask1 = vorrq_u16(mask1, skipMask);
+	uint16x8_t destCols2 = vandq_u16(destCol, mask1);
+	uint16x8_t srcCols2 = vandq_u16(srcCols, vmvnq_u16(mask1));
+	uint16x8_t final = vorrq_u16(destCols2, srcCols2);
+	if (horizFlip) {
+		final = vrev64q_u16(final);
+		final = vcombine_u16(vget_high_u16(final), vget_low_u16(final));
+	}
+	vst1q_u16((uint16 *)destPtr, final);
+}
+
+} // namespace AGS3
+
+#endif
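For reference, each lane of simd2BppTo4Bpp / simd4BppTo2Bpp above performs the scalar arithmetic below. This is an illustration only, not part of the patch, and the helper names are hypothetical:

#include <cstdint>

// Scalar equivalent of one lane of simd2BppTo4Bpp: RGB565 -> ARGB8888.
static inline uint32_t pixel565To8888(uint16_t p) {
	uint32_t r5 = (p >> 11) & 0x1f, g6 = (p >> 5) & 0x3f, b5 = p & 0x1f;
	// Replicate the top bits into the low bits so 0x1f widens to 0xff rather than 0xf8.
	uint32_t r8 = (r5 << 3) | (r5 >> 2);
	uint32_t g8 = (g6 << 2) | (g6 >> 4);
	uint32_t b8 = (b5 << 3) | (b5 >> 2);
	return 0xff000000u | (r8 << 16) | (g8 << 8) | b8; // alpha is forced to 255
}

// Scalar equivalent of one lane of simd4BppTo2Bpp: ARGB8888 -> RGB565 (alpha dropped).
static inline uint16_t pixel8888To565(uint32_t p) {
	uint32_t r = (p >> 16) & 0xff, g = (p >> 8) & 0xff, b = p & 0xff;
	return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}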
diff --git a/engines/ags/module.mk b/engines/ags/module.mk
index 96089975d39..34161e4a0ca 100644
--- a/engines/ags/module.mk
+++ b/engines/ags/module.mk
@@ -24,6 +24,7 @@ MODULE_OBJS = \
 	lib/allegro/math.o \
 	lib/allegro/rotate.o \
 	lib/allegro/surface.o \
+	lib/allegro/surface_simd_neon.o \
 	lib/allegro/system.o \
 	lib/allegro/unicode.o \
 	lib/std/std.o \
diff --git a/engines/ags/tests/test_gfx.cpp b/engines/ags/tests/test_gfx.cpp
index 81706f6f17e..a78d0d6960a 100644
--- a/engines/ags/tests/test_gfx.cpp
+++ b/engines/ags/tests/test_gfx.cpp
@@ -35,6 +35,10 @@
 #include "graphics/managed_surface.h"
 #include "graphics/pixelformat.h"
 
+#ifdef __aarch64__
+#include "ags/lib/allegro/surface_simd_neon.h"
+#endif
+
 namespace AGS3 {
 
 namespace GfxDef = AGS::Shared::GfxDef;
@@ -149,13 +153,13 @@ void Test_BlenderModes() {
 												uint32x4_t src = vdupq_n_u32(srcB | (srcG << 8) | (srcR << 16) | (srcA << 24));
 												uint32x4_t dest = vdupq_n_u32(destB | (destG << 8) | (destR << 16) | (destA << 24));
 												uint32x4_t alphas = vdupq_n_u32(alpha);
-												simdCol = vgetq_lane_u32(dummy.blendPixelSIMD(src, dest, alphas), 0);
+												simdCol = vgetq_lane_u32(blendPixelSIMD(src, dest, alphas), 0);
 											}
 											{
 												uint16x8_t src = vdupq_n_u16((srcB >> 3) | ((srcG >> 2) << 5) | ((srcR >> 3) << 11));
 												uint16x8_t dest = vdupq_n_u16((destB >> 3) | ((destG >> 2) << 5) | ((destR >> 3) << 11));
 												uint16x8_t alphas = vdupq_n_u16((uint16)alpha);
-												simd2bppCol = vgetq_lane_u16(dummy.blendPixelSIMD2Bpp(src, dest, alphas), 0);
+												simd2bppCol = vgetq_lane_u16(blendPixelSIMD2Bpp(src, dest, alphas), 0);
 											}
 #ifdef VERBOSE_TEST_GFX
 											printf("src argb: %d, %d, %d, %d dest argb: %d, %d, %d, %d a: %d\n", srcA, srcR, srcG, srcB, destA, destR, destG, destB, alpha);


Commit: 90df6233f836df958dd7ee5e409f433ac1fdc3ad
    https://github.com/scummvm/scummvm/commit/90df6233f836df958dd7ee5e409f433ac1fdc3ad
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: Put comments in NEON blitting/blending code.

Changed paths:
    engines/ags/lib/allegro/surface.cpp
    engines/ags/lib/allegro/surface.h
    engines/ags/lib/allegro/surface_simd_neon.cpp
    engines/ags/lib/allegro/surface_simd_neon.h


diff --git a/engines/ags/lib/allegro/surface.cpp b/engines/ags/lib/allegro/surface.cpp
index 8f0edfdbdd4..2922e2d9009 100644
--- a/engines/ags/lib/allegro/surface.cpp
+++ b/engines/ags/lib/allegro/surface.cpp
@@ -112,18 +112,20 @@ void BITMAP::drawInnerGeneric(int yStart, int xStart, uint32_t transColor, uint3
 	const int xDir = horizFlip ? -1 : 1;
 	byte rSrc, gSrc, bSrc, aSrc;
 	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
-	
+
+	// Instead of skipping pixels outside our boundary here, we just clip
+	// our area instead.
 	int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = dstRect.width();
-	if (xStart + xCtrWidth > destArea.w) {
+	if (xStart + xCtrWidth > destArea.w) { // Clip the right
 		xCtrWidth = destArea.w - xStart;
 	}
-	if (xStart < 0) {
+	if (xStart < 0) { // Clip the left
 		xCtrStart = -xStart;
 		xCtrBppStart = xCtrStart * src.format.bytesPerPixel;
 		xStart = 0;
 	}
 	int destY = yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = dstRect.height();
-	if (yStart < 0) {
+	if (yStart < 0) { // Clip the top
 		yCtr = -yStart;
 		destY = 0;
 		if (ScaleThreshold != 0) {
@@ -131,7 +133,7 @@ void BITMAP::drawInnerGeneric(int yStart, int xStart, uint32_t transColor, uint3
 			srcYCtr = scaleYCtr / ScaleThreshold;
 		}
 	}
-	if (yStart + yCtrHeight > destArea.h) {
+	if (yStart + yCtrHeight > destArea.h) { // Clip the bottom
 		yCtrHeight = destArea.h - yStart;
 	}
 
@@ -274,6 +276,7 @@ void BITMAP::draw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
 	int yStart = (dstRect.top < destRect.top) ? dstRect.top - destRect.top : 0;
 
 #define DRAWINNER(func) func(yStart, xStart, transColor, alphaMask, palette, useTint, sameFormat, src, destArea, horizFlip, vertFlip, skipTrans, srcAlpha, tintRed, tintGreen, tintBlue, dstRect, srcArea, _G(_blender_mode), 0, 0)
+	// Calling drawInnerXXXX with a ScaleThreshold of 0 just does normal un-scaled drawing
 	if (!_G(_bitmap_simd_optimizations)) {
 		DRAWINNER(drawInnerGeneric<0>);
 	} else {
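As a scalar illustration of the clipping the new comments describe (the struct and function names below are hypothetical, not engine code): the draw rectangle is clipped once against the destination surface, so the inner loop never has to bounds-check individual pixels.

// Clip one destination row up front instead of testing every pixel.
struct RowClip {
	int xStart;   // first destination column to write
	int srcSkip;  // number of leading source pixels to skip (left clip)
	int width;    // number of source pixels considered (right clip applied)
};

static RowClip clipRow(int xStart, int width, int destWidth) {
	RowClip c = { xStart, 0, width };
	if (xStart + c.width > destWidth)  // clip the right edge
		c.width = destWidth - xStart;
	if (xStart < 0) {                  // clip the left edge
		c.srcSkip = -xStart;
		c.xStart = 0;
	}
	// The drawing loop then runs: for (int i = c.srcSkip; i < c.width; ++i) ...
	return c;
}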
diff --git a/engines/ags/lib/allegro/surface.h b/engines/ags/lib/allegro/surface.h
index 270c21a1c16..96d86370e06 100644
--- a/engines/ags/lib/allegro/surface.h
+++ b/engines/ags/lib/allegro/surface.h
@@ -27,15 +27,6 @@
 #include "ags/lib/allegro/color.h"
 #include "common/array.h"
 
-#if defined(__aarch64__)
-// All 64 bit arm v8 or whatevers come with neon extensions, no need to check
-#include <arm_neon.h>
-#elif defined(__x86_64__) || defined(__i686__)
-// Most x86 based processors come with sse2, (which is what intels header has here), but it can use sse4
-// SSE2 support is still checked for at runtime
-#include "ags/lib/NEON_2_SSE.h"
-#endif
-
 namespace AGS3 {
 
 class BITMAP {
diff --git a/engines/ags/lib/allegro/surface_simd_neon.cpp b/engines/ags/lib/allegro/surface_simd_neon.cpp
index d3d769a81c1..bfc367f61f6 100644
--- a/engines/ags/lib/allegro/surface_simd_neon.cpp
+++ b/engines/ags/lib/allegro/surface_simd_neon.cpp
@@ -25,10 +25,16 @@ void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32_t transColor,
 	uint32x4_t maskedAlphas = vld1q_dup_u32(&alphaMask);
 	uint32x4_t transColors = vld1q_dup_u32(&transColor);
 	uint32x4_t alphas = vld1q_dup_u32(&srcAlpha);
+
+	// This is so that we can calculate what pixels to crop off in a vectorized way
 	uint32x4_t addIndexes = {0, 1, 2, 3};
 	if (horizFlip) addIndexes = {3, 2, 1, 0};
+
+	// This is so that we can calculate in parallel the pixel indexes for scaled drawing
 	uint32x4_t scaleAdds = {0, (uint32)scaleX, (uint32)scaleX*2, (uint32)scaleX*3};
-	
+
+	// Clip the bounds ahead of time (so we don't waste time checking if we are in bounds when
+	// we are in the inner loop)
 	int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = dstRect.width();
 	if (xStart + xCtrWidth > destArea.w) {
 		xCtrWidth = destArea.w - xStart;
@@ -57,59 +63,91 @@ void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32_t transColor,
 	                       horizFlip ? srcArea.right - 4 : srcArea.left,
 	                       vertFlip ? srcArea.bottom - 1 - yCtr : srcArea.top + yCtr);
 	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += scaleY) {
-		uint32x4_t xCtrWidthSIMD = vdupq_n_u32(xCtrWidth);
+		uint32x4_t xCtrWidthSIMD = vdupq_n_u32(xCtrWidth); // This is the width of the row
 
 		if (ScaleThreshold == 0) {
+			// If we are not scaling the image
 			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
 				byte *destPtr = &destP[destX * DestBytesPerPixel];
+				// Skip pixels that are beyond the row
 				uint32x4_t skipMask = vcgeq_u32(vaddq_u32(vdupq_n_u32(xCtr), addIndexes), xCtrWidthSIMD);
 				drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
 			}
+			// Go to the next row in the source and destination images
 			destP += destArea.pitch;
 			srcP += vertFlip ? -src.pitch : src.pitch;
 		} else {
+			// Here we are scaling the image
 			int newSrcYCtr = scaleYCtr / ScaleThreshold;
+			// Since the source yctr might not update every row of the destination, we have
+			// to see if we are on a new row...
 			if (srcYCtr != newSrcYCtr) {
-				int diffSrcYCtr = newSrcYCtr - srcYCtr;
+				int diffSrcYCtr = newSrcYCtr - srcYCtr; // Have we moved yet
 				srcP += src.pitch * diffSrcYCtr;
 				srcYCtr = newSrcYCtr;
 			}
+
+			// Also, since we might skip a pixel or two, or duplicate one, to reach the desired
+			// scaled size, we copy the selected pixels into a small temporary buffer and then
+			// call the drawPixelSIMD function on it
 			byte srcBuffer[4*4];
 			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart, scaleXCtr = xCtrStart * scaleX; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
-				if (yCtr + 1 == yCtrHeight && xCtr + 4 > xCtrWidth) break;
+				if (yCtr + 1 == yCtrHeight && xCtr + 4 > xCtrWidth) break; // Don't go past the last 4 pixels
 				uint32x4_t indexes = vdupq_n_u32(scaleXCtr);
 #if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
+				// Calculate in parallel the indexes of the pixels
 				indexes = vmulq_n_u32(vshrq_n_u32(vaddq_u32(indexes, scaleAdds), 8), SrcBytesPerPixel);
 #else
 #error Change code to allow different scale threshold!
 #endif
+				// Simply memcpy them in. memcpy has no real performance overhead here
 				memcpy(&srcBuffer[0*(uintptr_t)SrcBytesPerPixel], srcP + vgetq_lane_u32(indexes, 0), SrcBytesPerPixel);
 				memcpy(&srcBuffer[1*(uintptr_t)SrcBytesPerPixel], srcP + vgetq_lane_u32(indexes, 1), SrcBytesPerPixel);
 				memcpy(&srcBuffer[2*(uintptr_t)SrcBytesPerPixel], srcP + vgetq_lane_u32(indexes, 2), SrcBytesPerPixel);
 				memcpy(&srcBuffer[3*(uintptr_t)SrcBytesPerPixel], srcP + vgetq_lane_u32(indexes, 3), SrcBytesPerPixel);
 				scaleXCtr += scaleX*4;
+
+				// Now this is pretty much the same as the non-scaled code before, except that we use
+				// our temporary source buffer instead of the actual source bitmap
 				byte *destPtr = &destP[destX * (uintptr_t)DestBytesPerPixel];
 				uint32x4_t skipMask = vcgeq_u32(vaddq_u32(vdupq_n_u32(xCtr), addIndexes), xCtrWidthSIMD);
 				drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, (const byte *)srcBuffer, tint, alphas, maskedAlphas, transColors, 1, 0, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
 			}
+			// We calculate every row here except the last (because then we need to
+			// check if we fall off the edge of the row).
+			// The only exception here is scaled drawing; this is because:
+			// 1) if statements are costly, and the fewer we do the faster this loop is
+			// 2) with this, the only branch in the normal drawing loop is the width check
+			// 3) the scaling code will actually draw until the last 4 pixels of the image
+			//    and do the extra if checks, because the scaling code is already much slower
+			//    than the normal drawing loop, and less duplicate code helps here.
 			if (yCtr + 1 != yCtrHeight) destP += destArea.pitch;
 		}
 	}
 
 	// Get the last x values of the last row
 	int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart;
+	// If the width is a multiple of 4, there are no extra pixels to draw
 	if (xCtrWidth % 4 == 0) return;
+	// Draw the last few non-scaled pixels here.
+	// Same as the loop above, but now we check up front whether we would overflow,
+	// so we don't need to mask out pixels that go past the end of the row.
 	if (ScaleThreshold == 0) {
 		for (; xCtr + 4 < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
-			byte *destPtr = &destP[destX * DestBytesPerPixel];
+			byte *destPtr = &destP[(ptrdiff_t)destX * DestBytesPerPixel];
 			drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, vmovq_n_u32(0));
 		}
+		// Because we move in 4 pixel units, and horizFlip moves in 1, we have to move
+		// 1 pixel past the last pixel we did not blit, meaning going forward 3 pixels.
 		if (horizFlip) srcP += SrcBytesPerPixel * 3;
 	} else {
+		// If we are scaling, set xCtr back to where it was before (i.e. the last 4 or so pixels of the image)
 		xCtr = xCtrWidth - xCtrWidth % 4;
 		xCtrBpp = xCtr * SrcBytesPerPixel;
 		destX = xStart+xCtr;
 	}
+
+	// The remaining pixels are drawn serially, nothing special
 	for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += SrcBytesPerPixel) {
 		const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
 		if (ScaleThreshold != 0) {
@@ -133,9 +171,7 @@ void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32_t transColor,
 				gSrc = tintGreen;
 				bSrc = tintBlue;
 				aSrc = srcAlpha;
-			}/* else {
-				format.colorToARGB(getColor(destVal, DestBytesPerPixel), aDest, rDest, gDest, bDest);
-			}*/
+			}
 			blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, srcAlpha, useTint, destVal);
 			srcCol = format.ARGBToColor(aDest, rDest, gDest, bDest);
 		} else {
@@ -156,11 +192,17 @@ void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32_t transColor, uint32_t
 	uint16x8_t tint = vdupq_n_u16(src.format.ARGBToColor(srcAlpha, tintRed, tintGreen, tintBlue));
 	uint16x8_t transColors = vdupq_n_u16(transColor);
 	uint16x8_t alphas = vdupq_n_u16(srcAlpha);
+
+	// This is so that we can calculate what pixels to crop off in a vectorized way
 	uint16x8_t addIndexes = {0, 1, 2, 3, 4, 5, 6, 7};
+
+	// This is so that we can calculate in parallel the pixel indexes for scaled drawing
 	if (horizFlip) addIndexes = {7, 6, 5, 4, 3, 2, 1, 0};
 	uint32x4_t scaleAdds = {0, (uint32)scaleX, (uint32)scaleX*2, (uint32)scaleX*3};
 	uint32x4_t scaleAdds2 = {(uint32)scaleX*4, (uint32)scaleX*5, (uint32)scaleX*6, (uint32)scaleX*7};
-	
+
+	// Clip the bounds ahead of time (so we don't waste time checking if we are in bounds when
+	// we are in the inner loop)
 	int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = dstRect.width();
 	if (xStart + xCtrWidth > destArea.w) {
 		xCtrWidth = destArea.w - xStart;
@@ -189,32 +231,44 @@ void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32_t transColor, uint32_t
 	                       horizFlip ? srcArea.right - 8 : srcArea.left,
 	                       vertFlip ? srcArea.bottom - 1 - yCtr : srcArea.top + yCtr);
 	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += scaleY) {
-		uint16x8_t xCtrWidthSIMD = vmovq_n_u16(xCtrWidth);
+		uint16x8_t xCtrWidthSIMD = vmovq_n_u16(xCtrWidth); // This is the width of the row
 		if (ScaleThreshold == 0) {
+			// If we are not scaling the image
 			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
 				byte *destPtr = &destP[destX * 2];
+				// Skip pixels that are beyond the row
 				uint16x8_t skipMask = vcgeq_u16(vaddq_u16(vdupq_n_u16(xCtr), addIndexes), xCtrWidthSIMD);
 				drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
 			}
+			// Go to the next row in the source and destination images
 			destP += destArea.pitch;
 			srcP += vertFlip ? -src.pitch : src.pitch;
 		} else {
+			// Here we are scaling the image
 			int newSrcYCtr = scaleYCtr / ScaleThreshold;
+			// Since the source yctr might not update every row of the destination, we have
+			// to see if we are on a new row...
 			if (srcYCtr != newSrcYCtr) {
 				int diffSrcYCtr = newSrcYCtr - srcYCtr;
 				srcP += src.pitch * diffSrcYCtr;
 				srcYCtr = newSrcYCtr;
 			}
+
+			// Also, since we might skip a pixel or two, or duplicate one, to reach the desired
+			// scaled size, we copy the selected pixels into a small temporary buffer and then
+			// call the drawPixelSIMD2Bpp function on it
 			uint16 srcBuffer[8];
 			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart, scaleXCtr = xCtrStart * scaleX; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
 				if (yCtr + 1 == yCtrHeight && xCtr + 8 > xCtrWidth) break;
 				uint32x4_t indexes = vdupq_n_u32(scaleXCtr), indexes2 = vdupq_n_u32(scaleXCtr);
 #if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
+				// Calculate in parallel the indexes of the pixels
 				indexes = vmulq_n_u32(vshrq_n_u32(vaddq_u32(indexes, scaleAdds), 8), 2);
 				indexes2 = vmulq_n_u32(vshrq_n_u32(vaddq_u32(indexes2, scaleAdds2), 8), 2);
 #else
 #error Change code to allow different scale threshold!
 #endif
+				// Simply memcpy them in. memcpy has no real performance overhead here
 				srcBuffer[0] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes, 0));
 				srcBuffer[1] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes, 1));
 				srcBuffer[2] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes, 2));
@@ -224,28 +278,48 @@ void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32_t transColor, uint32_t
 				srcBuffer[6] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes2, 2));
 				srcBuffer[7] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes2, 3));
 				scaleXCtr += scaleX*8;
+
+				// Now this is pretty much the same as the non-scaled code before, except that we use
+				// our temporary source buffer instead of the actual source bitmap
 				byte *destPtr = &destP[destX * 2];
 				uint16x8_t skipMask = vcgeq_u16(vaddq_u16(vdupq_n_u16(xCtr), addIndexes), xCtrWidthSIMD);
 				drawPixelSIMD2Bpp(destPtr, (const byte *)srcBuffer, tint, alphas, transColors, 1, 0, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
 			}
+			// We calculate every row here except the last (because then we need to
+			// check if we fall off the edge of the row).
+			// The only exception here is scaled drawing; this is because:
+			// 1) if statements are costly, and the fewer we do the faster this loop is
+			// 2) with this, the only branch in the normal drawing loop is the width check
+			// 3) the scaling code will actually draw until the last few pixels of the image
+			//    and do the extra if checks, because the scaling code is already much slower
+			//    than the normal drawing loop, and less duplicate code helps here.
 			if (yCtr + 1 != yCtrHeight) destP += destArea.pitch;
 		}
 	}
 
-	// Get the last x values of the last row
+	// If the width is a multiple of 8, there are no extra pixels to draw
 	if (xCtrWidth % 8 == 0) return;
+	// Get the last x values of the last row
 	int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart;
+	// Draw the last few non-scaled pixels here.
+	// Same as the loop above, but now we check up front whether we would overflow,
+	// so we don't need to mask out pixels that go past the end of the row.
 	if (ScaleThreshold == 0) {
 		for (; xCtr + 8 < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
 			byte *destPtr = &destP[destX * 2];
 			drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, vmovq_n_u16(0));
 		}
-		if (horizFlip) srcP += 2*3;
+		// Because we move in 8 pixel units, and horizFlip moves in 1, we have to move
+		// 1 pixel past the last pixel we did not blit, meaning going forward 7 pixels.
+		if (horizFlip) srcP += 2 * 7;
 	} else {
+		// If we are scaling, set xCtr back to where it was before (i.e. the last 8 or so pixels of the image)
 		xCtr = xCtrWidth - xCtrWidth % 8;
 		xCtrBpp = xCtr * 2;
 		destX = xStart+xCtr;
 	}
+
+	// The remaining pixels are drawn serially, nothing special
 	for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += 2) {
 		const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
 		if (ScaleThreshold != 0) {
@@ -285,11 +359,15 @@ template<int ScaleThreshold>
 void BITMAP::drawInner1Bpp(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
 	const int xDir = horizFlip ? -1 : 1;
 	uint8x16_t transColors = vld1q_dup_u8(&transColor);
+
+	// This is so that we can calculate in parallel the pixel indexes for scaled drawing
 	uint32x4_t scaleAdds1 = {0, (uint32)scaleX, (uint32)scaleX*2, (uint32)scaleX*3};
 	uint32x4_t scaleAdds2 = {(uint32)scaleX*4, (uint32)scaleX*5, (uint32)scaleX*6, (uint32)scaleX*7};
 	uint32x4_t scaleAdds3 = {(uint32)scaleX*8, (uint32)scaleX*9, (uint32)scaleX*10, (uint32)scaleX*11};
 	uint32x4_t scaleAdds4 = {(uint32)scaleX*12, (uint32)scaleX*13, (uint32)scaleX*14, (uint32)scaleX*15};
 	
+	// Clip the bounds ahead of time (so we don't waste time checking if we are in bounds when
+	// we are in the inner loop)
 	int xCtrStart = 0, xCtrWidth = dstRect.width();
 	if (xStart + xCtrWidth > destArea.w) {
 		xCtrWidth = destArea.w - xStart;
@@ -318,8 +396,12 @@ void BITMAP::drawInner1Bpp(int yStart, int xStart, uint32_t transColor, uint32_t
 	                       vertFlip ? srcArea.bottom - 1 - yCtr : srcArea.top + yCtr);
 	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += scaleY) {
 		if (ScaleThreshold != 0) {
+			// So here we update the srcYCtr differently, because this is the
+			// scaling path
 			int newSrcYCtr = scaleYCtr / ScaleThreshold;
 			if (srcYCtr != newSrcYCtr) {
+				// Since the source yctr might not update every row of the destination, we have
+				// to see if we are on a new row...
 				int diffSrcYCtr = newSrcYCtr - srcYCtr;
 				srcP += src.pitch * diffSrcYCtr;
 				srcYCtr = newSrcYCtr;
@@ -328,9 +410,13 @@ void BITMAP::drawInner1Bpp(int yStart, int xStart, uint32_t transColor, uint32_t
 		int xCtr = xCtrStart, destX = xStart, scaleXCtr = xCtrStart * scaleX;
 		for (; xCtr + 16 < xCtrWidth; destX += 16, xCtr += 16) {
 			byte *destPtr = &destP[destX];
+
+			// Here we don't use the drawPixelSIMD function because 1bpp bitmaps in allegro
+			// can't have any blending applied to them
 			uint8x16_t destCols = vld1q_u8(destPtr);
 			uint8x16_t srcCols = vld1q_u8(srcP + xDir * xCtr);
 			if (ScaleThreshold != 0) {
+				// If we are scaling, we have to set each pixel individually
 				uint32x4_t indexes1 = vdupq_n_u32(scaleXCtr), indexes2 = vdupq_n_u32(scaleXCtr);
 				uint32x4_t indexes3 = vdupq_n_u32(scaleXCtr), indexes4 = vdupq_n_u32(scaleXCtr);
 #if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
@@ -359,6 +445,8 @@ void BITMAP::drawInner1Bpp(int yStart, int xStart, uint32_t transColor, uint32_t
 				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes4, 3)], srcCols, 15);
 				scaleXCtr += scaleX*16;
 			}
+
+			// Mask out transparent pixels
 			uint8x16_t mask1 = skipTrans ? vceqq_u8(srcCols, transColors) : vmovq_n_u8(0);
 			uint8x16_t final = vorrq_u8(vandq_u8(srcCols, vmvnq_u8(mask1)), vandq_u8(destCols, mask1));
 			if (horizFlip) {
@@ -368,6 +456,9 @@ void BITMAP::drawInner1Bpp(int yStart, int xStart, uint32_t transColor, uint32_t
 			vst1q_u8(destPtr, final);
 		}
 		// Get the last x values
+
+		// Because we move in 16 pixel units, and horizFlip moves in 1, we have to move
+		// 1 pixel past the last pixel we did not blit, meaning going forward 15 pixels.
 		if (horizFlip) srcP += 15;
 		for (; xCtr < xCtrWidth; ++destX, ++xCtr, scaleXCtr += scaleX) {
 			const byte *srcCol = (const byte *)(srcP + xDir * xCtr);
@@ -381,8 +472,9 @@ void BITMAP::drawInner1Bpp(int yStart, int xStart, uint32_t transColor, uint32_t
 			byte *destVal = (byte *)&destP[destX];
 			*destVal = *srcCol;
 		}
-		if (horizFlip) srcP -= 15;
-		destP += destArea.pitch;
+		if (horizFlip) srcP -= 15; // Undo what we did up there
+		destP += destArea.pitch; // Go to the next row
+		// Only advance the src row by 1 every time like this if we don't scale
 		if (ScaleThreshold == 0) srcP += vertFlip ? -src.pitch : src.pitch;
 	}
 }
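NEON has no gather load, so the scaled paths above compute the four source offsets in a vector, extract them, memcpy the pixels into a contiguous buffer, and then load that buffer as one vector. A minimal sketch of that pattern (illustration only: a fixed 8.8 fixed-point scale step and 4 bytes per pixel are assumed, and the helper name is hypothetical):

#include <arm_neon.h>
#include <cstdint>
#include <cstring>

static inline uint32x4_t gather4Scaled(const uint8_t *srcRow, uint32_t scaleXCtr, uint32_t scaleX) {
	const uint32x4_t scaleAdds = {0, scaleX, scaleX * 2, scaleX * 3};
	// Source byte offsets of the next 4 destination pixels: ((scaleXCtr + i*scaleX) >> 8) * 4
	uint32x4_t offsets = vmulq_n_u32(vshrq_n_u32(vaddq_u32(vdupq_n_u32(scaleXCtr), scaleAdds), 8), 4);
	uint32_t buf[4];
	// No gather instruction, so copy the 4 pixels into a small contiguous buffer...
	memcpy(&buf[0], srcRow + vgetq_lane_u32(offsets, 0), 4);
	memcpy(&buf[1], srcRow + vgetq_lane_u32(offsets, 1), 4);
	memcpy(&buf[2], srcRow + vgetq_lane_u32(offsets, 2), 4);
	memcpy(&buf[3], srcRow + vgetq_lane_u32(offsets, 3), 4);
	// ...and load that buffer as a single vector, as drawInner4BppWithConv does with srcBuffer
	return vld1q_u32(buf);
}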
diff --git a/engines/ags/lib/allegro/surface_simd_neon.h b/engines/ags/lib/allegro/surface_simd_neon.h
index 7619a87684e..ce6361235a3 100644
--- a/engines/ags/lib/allegro/surface_simd_neon.h
+++ b/engines/ags/lib/allegro/surface_simd_neon.h
@@ -1,22 +1,52 @@
+/* ScummVM - Graphic Adventure Engine
+ *
+ * ScummVM is the legal property of its developers, whose names
+ * are too numerous to list here. Please refer to the COPYRIGHT
+ * file distributed with this source distribution.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
 #ifndef AGS_LIB_ALLEGRO_SURFACE_SIMD_NEON_H
 #define AGS_LIB_ALLEGRO_SURFACE_SIMD_NEON_H
+#ifdef __aarch64__
 
+#include <arm_neon.h>
 #include "ags/lib/allegro/surface.h"
 
 namespace AGS3 {
 
 inline uint32x4_t simd2BppTo4Bpp(uint16x4_t pixels) {
 	uint32x4_t x = vmovl_u16(pixels);
+
+	// c is the extracted 5/6 bit color from the image
 	uint32x4_t c = vshrq_n_u32(x, 11);
+
+	// We convert it back to 8 bits per channel by shifting it left and then using the 2 most
+	// significant bits of the original color as the least significant bits of the new one
 	uint32x4_t r = vshlq_n_u32(vorrq_u32(vshlq_n_u32(c, 3), vshrq_n_u32(c, 2)), 16);
 	c = vshrq_n_u32(vandq_u32(x, vmovq_n_u32(0x07e0)), 5);
 	uint32x4_t g = vshlq_n_u32(vorrq_u32(vshlq_n_u32(c, 2), vshrq_n_u32(c, 4)), 8);
 	c = vandq_u32(x, vmovq_n_u32(0x001f));
 	uint32x4_t b = vorrq_u32(vshlq_n_u32(c, 3), vshrq_n_u32(c, 2));
+
+	// By default 2bpp to 4bpp makes the alpha channel 255
 	return vorrq_u32(vorrq_u32(vorrq_u32(r, g), b), vmovq_n_u32(0xff000000));
 }
 
 inline uint16x4_t simd4BppTo2Bpp(uint32x4_t pixels) {
+	// x is the final 16 bit rgb pixel
 	uint32x4_t x = vshrq_n_u32(vandq_u32(pixels, vmovq_n_u32(0x000000ff)), 3);
 	x = vorrq_u32(x, vshlq_n_u32(vshrq_n_u32(vandq_u32(pixels, vmovq_n_u32(0x0000ff00)), 8+2), 5));
 	x = vorrq_u32(x, vshlq_n_u32(vshrq_n_u32(vandq_u32(pixels, vmovq_n_u32(0x00ff0000)), 16+3), 11));
@@ -24,16 +54,22 @@ inline uint16x4_t simd4BppTo2Bpp(uint32x4_t pixels) {
 }
 
 inline uint16x8_t rgbBlendSIMD2Bpp(uint16x8_t srcCols, uint16x8_t destCols, uint16x8_t alphas) {
+	// Here we add 1 to alphas if it's 0. This is what the original blender function did
 	alphas = vaddq_u16(alphas, vandq_u16(vceqq_u16(alphas, vmovq_n_u16(0)), vmovq_n_u16(1)));
+
+	// Split the components into rgb
 	uint16x8_t srcComps[] = {
-		vandq_u16(srcCols, vmovq_n_u16(0x1f)),
-		vandq_u16(vshrq_n_u16(srcCols, 5), vmovq_n_u16(0x3f)),
-		vshrq_n_u16(srcCols, 11),
+		vandq_u16(srcCols, vmovq_n_u16(0x1f)),					// B
+		vandq_u16(vshrq_n_u16(srcCols, 5), vmovq_n_u16(0x3f)),	// G
+		vshrq_n_u16(srcCols, 11),								// R
 	}, destComps[] = {
-		vandq_u16(destCols, vmovq_n_u16(0x1f)),
-		vandq_u16(vshrq_n_u16(destCols, 5), vmovq_n_u16(0x3f)),
-		vshrq_n_u16(destCols, 11),
+		vandq_u16(destCols, vmovq_n_u16(0x1f)),					// B
+		vandq_u16(vshrq_n_u16(destCols, 5), vmovq_n_u16(0x3f)), // G
+		vshrq_n_u16(destCols, 11),								// R
 	};
+
+	// At some point I made it so that it would put them into their 8bit depth format
+	// to keep the function as 1-1 with the original, but it didn't seem to help much
 	//srcComps[0] = vorrq_u16(vshlq_n_u16(srcComps[0], 3), vshrq_n_u16(srcComps[0], 2));
 	//srcComps[1] = vorrq_u16(vshlq_n_u16(srcComps[1], 2), vshrq_n_u16(srcComps[1], 4));
 	//srcComps[2] = vorrq_u16(vshlq_n_u16(srcComps[2], 3), vshrq_n_u16(srcComps[2], 2));
@@ -41,49 +77,72 @@ inline uint16x8_t rgbBlendSIMD2Bpp(uint16x8_t srcCols, uint16x8_t destCols, uint
 	//destComps[1] = vorrq_u16(vshlq_n_u16(destComps[1], 2), vshrq_n_u16(destComps[1], 4));
 	//destComps[2] = vorrq_u16(vshlq_n_u16(destComps[2], 3), vshrq_n_u16(destComps[2], 2));
 
+	// Calculate the differences between the colors
 	uint16x8_t diffs[] = {
 		vsubq_u16(srcComps[0], destComps[0]), // B
 		vsubq_u16(srcComps[1], destComps[1]), // G
 		vsubq_u16(srcComps[2], destComps[2]), // R
 	};
+
+	// Multiply by alpha and shift depth bits to the right
+	// pretty much the same as (int)(((float)component / 255.0f) * ((float)alpha / 255.0f) * 255.0f)
 	alphas = vshrq_n_u16(alphas, 2);
 	diffs[1] = vshrq_n_u16(vmulq_u16(diffs[1], alphas), 6);
 	alphas = vshrq_n_u16(alphas, 1);
 	diffs[0] = vshrq_n_u16(vmulq_u16(diffs[0], alphas), 5);
 	diffs[2] = vshrq_n_u16(vmulq_u16(diffs[2], alphas), 5);
 
+	// Originally, I converted it back here from the 8-bit-per-component form, but we don't need to do that anymore
 	//diffs[0] = vandq_u16(vshrq_n_u16(vaddq_u16(diffs[0], destComps[0]), 3), vmovq_n_u16(0x1f));
 	//diffs[1] = vandq_u16(vshrq_n_u16(vaddq_u16(diffs[1], destComps[1]), 2), vmovq_n_u16(0x3f));
 	//diffs[2] = vandq_u16(vshrq_n_u16(vaddq_u16(diffs[2], destComps[2]), 3), vmovq_n_u16(0x1f));
 
+	// Here we add the difference between the 2 colors times alpha onto the destination
 	diffs[0] = vandq_u16(vaddq_u16(diffs[0], destComps[0]), vmovq_n_u16(0x1f));
 	diffs[1] = vandq_u16(vaddq_u16(diffs[1], destComps[1]), vmovq_n_u16(0x3f));
 	diffs[2] = vandq_u16(vaddq_u16(diffs[2], destComps[2]), vmovq_n_u16(0x1f));
+
+	// We combine all the components into diffs[0] as a 16 bit rgb pixel
 	diffs[0] = vorrq_u16(diffs[0], vshlq_n_u16(diffs[1], 5));
 	return vorrq_u16(diffs[0], vshlq_n_u16(diffs[2], 11));
 }
 
+// preserveAlpha:
+//		false => set destCols's alpha to 0
+// 		true => keep destCols's alpha
 inline uint32x4_t rgbBlendSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas, bool preserveAlpha) {
+	// Here we add 1 to alphas if it's not 0. This is what the original blender function did
 	alphas = vaddq_u32(alphas, vandq_u32(vcgtq_u32(alphas, vmovq_n_u32(0)), vmovq_n_u32(1)));
+
+	// Get the alpha from the destination
 	uint32x4_t alpha = vandq_u32(destCols, vmovq_n_u32(0xff000000));
+
+	// Get red and blue components
 	uint32x4_t srcColsCopy = srcCols;
 	srcColsCopy = vandq_u32(srcColsCopy, vmovq_n_u32(0xff00ff));
 	uint32x4_t destColsCopy = destCols;
 	destColsCopy = vandq_u32(destColsCopy, vmovq_n_u32(0xff00ff));
+
+	// compute the difference, then multiply by alpha and divide by 256
 	srcColsCopy = vsubq_u32(srcColsCopy, destColsCopy);
 	srcColsCopy = vmulq_u32(srcColsCopy, alphas);
 	srcColsCopy = vshrq_n_u32(srcColsCopy, 8);
-	srcColsCopy = vaddq_u32(srcColsCopy, destCols);
+	srcColsCopy = vaddq_u32(srcColsCopy, destCols); // Add the new red/blue to the old ones
 
+	// do the same for the green component
 	srcCols = vandq_u32(srcCols, vmovq_n_u32(0xff00));
 	destCols = vandq_u32(destCols, vmovq_n_u32(0xff00));
 	srcCols = vsubq_u32(srcCols, destCols);
 	srcCols = vmulq_u32(srcCols, alphas);
 	srcCols = vshrq_n_u32(srcCols, 8);
-	srcCols = vaddq_u32(srcCols, destCols);
+	srcCols = vaddq_u32(srcCols, destCols); // Add the new green to the old green
+
+	// keep values in 8bit range and glue red/blue and green together
 	srcColsCopy = vandq_u32(srcColsCopy, vmovq_n_u32(0xff00ff));
 	srcCols = vandq_u32(srcCols, vmovq_n_u32(0xff00));
 	srcCols = vorrq_u32(srcCols, srcColsCopy);
+
+	// Remember that alpha is not alphas, but rather the alpha of destCols
 	if (preserveAlpha) {
 		srcCols = vandq_u32(srcCols, vmovq_n_u32(0x00ffffff));
 		srcCols = vorrq_u32(srcCols, alpha);
@@ -91,54 +150,78 @@ inline uint32x4_t rgbBlendSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4
 	return srcCols;
 }
 
+// uses the alpha from srcCols and destCols
 inline uint32x4_t argbBlendSIMD(uint32x4_t srcCols, uint32x4_t destCols) {
 	float16x4_t sAlphas = vcvt_f16_f32(vcvtq_f32_u32(vshrq_n_u32(srcCols, 24)));
 	sAlphas = vmul_n_f16(sAlphas, 1.0 / 255.0);
+
+	// sAlphas1 has the alpha of the first pixel in lanes 0-3 and of the second pixel in lanes 4-7
+	// sAlphas2 is the same, but for the 3rd and 4th pixels
 	float16x8_t sAlphas1 = vcombine_f16(vmov_n_f16(vduph_lane_f16(sAlphas, 0)), vmov_n_f16(vduph_lane_f16(sAlphas, 1)));
 	float16x8_t sAlphas2 = vcombine_f16(vmov_n_f16(vduph_lane_f16(sAlphas, 2)), vmov_n_f16(vduph_lane_f16(sAlphas, 3)));
+
+	// Same thing going on here with dAlphas, except that it gets multiplied by (1 - sAlpha) first
 	float16x4_t dAlphas = vcvt_f16_f32(vcvtq_f32_u32(vshrq_n_u32(destCols, 24)));
 	dAlphas = vmul_n_f16(dAlphas, 1.0 / 255.0);
 	dAlphas = vmul_f16(dAlphas, vsub_f16(vmov_n_f16(1.0), sAlphas));
 	float16x8_t dAlphas1 = vcombine_f16(vmov_n_f16(vduph_lane_f16(dAlphas, 0)), vmov_n_f16(vduph_lane_f16(dAlphas, 1)));
 	float16x8_t dAlphas2 = vcombine_f16(vmov_n_f16(vduph_lane_f16(dAlphas, 2)), vmov_n_f16(vduph_lane_f16(dAlphas, 3)));
+
+	// first 2 pixels
 	float16x8_t srcRgb1 = vcvtq_f16_u16(vmovl_u8(vreinterpret_u8_u32(vget_low_u32(srcCols))));
 	float16x8_t destRgb1 = vcvtq_f16_u16(vmovl_u8(vreinterpret_u8_u32(vget_low_u32(destCols))));
+	// last 2 pixels
 	float16x8_t srcRgb2 = vcvtq_f16_u16(vmovl_u8(vreinterpret_u8_u32(vget_high_u32(srcCols))));
 	float16x8_t destRgb2 = vcvtq_f16_u16(vmovl_u8(vreinterpret_u8_u32(vget_high_u32(destCols))));
+
+	// ((src * sAlpha) + (dest * dAlpha)) / (sAlpha + dAlpha)
 	srcRgb1 = vmulq_f16(srcRgb1, sAlphas1);
 	destRgb1 = vmulq_f16(destRgb1, dAlphas1);
 	srcRgb1 = vaddq_f16(srcRgb1, destRgb1);
-	float16x8_t alphasRec = vrecpeq_f16(vaddq_f16(sAlphas1, dAlphas1));
+	float16x8_t alphasRec = vrecpeq_f16(vaddq_f16(sAlphas1, dAlphas1)); // compute reciprocal
 	srcRgb1 = vmulq_f16(srcRgb1, alphasRec);
 	srcRgb2 = vmulq_f16(srcRgb2, sAlphas2);
 	destRgb2 = vmulq_f16(destRgb2, dAlphas2);
 	srcRgb2 = vaddq_f16(srcRgb2, destRgb2);
 	alphasRec = vrecpeq_f16(vaddq_f16(sAlphas2, dAlphas2));
 	srcRgb2 = vmulq_f16(srcRgb2, alphasRec);
+
+	// alpha channel is computed differently
 	uint16x4_t alphas = vcvta_u16_f16(vmul_n_f16(vadd_f16(sAlphas, dAlphas), 255.0));
+
+	// Final argb components as 16bit values
 	uint16x8_t uintSrcRgb1 = vcvtq_u16_f16(srcRgb1), uintSrcRgb2 = vcvtq_u16_f16(srcRgb2);
+
+	// copy alpha channel over
 	uintSrcRgb1 = vcopyq_lane_u16(uintSrcRgb1, 3, alphas, 0);
 	uintSrcRgb1 = vcopyq_lane_u16(uintSrcRgb1, 7, alphas, 1);
 	uintSrcRgb2 = vcopyq_lane_u16(uintSrcRgb2, 3, alphas, 2);
 	uintSrcRgb2 = vcopyq_lane_u16(uintSrcRgb2, 7, alphas, 3);
+
+	// cast 16bit to 8bit and reinterpret as uint32's
 	return vcombine_u32(vreinterpret_u32_u8(vmovn_u16(uintSrcRgb1)), vreinterpret_u32_u8(vmovn_u16(uintSrcRgb2)));
 }
 
 inline uint32x4_t blendTintSpriteSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas, bool light) {
 	// This function is NOT 1 to 1 with the original... It just approximates it
-	// It gets the value of the dest color
-	// Then it gets the h and s of the srcCols
+	// It takes the V (value) of the dest color's HSV
+	// and the H and S of srcCols' HSV
 
+	// how the values are transformed
+	// from 1 uint32x4_t srcCols with each lane being ARGB uint32
 	// srcCols[0] = A | R | G | B
 	// srcCols[1] = A | R | G | B
 	// srcCols[2] = A | R | G | B
 	// srcCols[3] = A | R | G | B
 	//  ->
+	// to 4 float32x4_t's, each being a separate channel, with each lane
+	// corresponding to its respective srcCols lane
 	// dda = { A[0], A[1], A[2], A[3] }
 	// ddr = { R[0], R[1], R[2], R[3] }
 	// ddg = { G[0], G[1], G[2], G[3] }
 	// ddb = { B[0], B[1], B[2], B[3] }
 
+	// do the transformation (we don't actually need alpha at all)
 	float32x4_t ddr, ddg, ddb;
 	ddr = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(vshrq_n_u32(destCols, 16), vmovq_n_u32(0xff))), 1.0 / 255.0);
 	ddg = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(vshrq_n_u32(destCols, 8), vmovq_n_u32(0xff))), 1.0 / 255.0);
@@ -147,37 +230,42 @@ inline uint32x4_t blendTintSpriteSIMD(uint32x4_t srcCols, uint32x4_t destCols, u
 	ssr = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(vshrq_n_u32(srcCols, 16), vmovq_n_u32(0xff))), 1.0 / 255.0);
 	ssg = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(vshrq_n_u32(srcCols, 8), vmovq_n_u32(0xff))), 1.0 / 255.0);
 	ssb = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(srcCols, vmovq_n_u32(0xff))), 1.0 / 255.0);
+
+	// Get the maxes and mins (needed for HSV->RGB and vice versa)
 	float32x4_t dmaxes = vmaxq_f32(ddr, vmaxq_f32(ddg, ddb));
 	float32x4_t smaxes = vmaxq_f32(ssr, vmaxq_f32(ssg, ssb));
-	//float32x4_t dmins = vminq_f32(ddr, vminq_f32(ddg, ddb));
 	float32x4_t smins = vminq_f32(ssr, vminq_f32(ssg, ssb));
-	//float32x4_t ddelta = vsubq_f32(dmaxes, dmins);
-	
+
+	// This is here to stop us from dividing by 0
 	const float32x4_t eplison0 = vmovq_n_f32(0.0000001);
+
 	float32x4_t chroma = vmaxq_f32(vsubq_f32(smaxes, smins), eplison0);
+
+	// RGB to HSV is a piecewise function, so we compute each part of the function first...
 	float32x4_t hr, hg, hb, hue;
 	hr = vdivq_f32(vsubq_f32(ssg, ssb), chroma);
 	hr = vsubq_f32(hr, vmulq_n_f32(vrndmq_f32(vmulq_n_f32(hr, 1.0 / 6.0)), 6.0));
 	hg = vaddq_f32(vdivq_f32(vsubq_f32(ssb, ssr), chroma), vmovq_n_f32(2.0));
 	hb = vaddq_f32(vdivq_f32(vsubq_f32(ssr, ssg), chroma), vmovq_n_f32(4.0));
+
+	// And then compute which piece will be used based on which channel is the maximum
 	float32x4_t hrfactors = vcvtq_f32_u32(vandq_u32(vandq_u32(vceqq_f32(ssr, smaxes), vmvnq_u32(vceqq_u32(ssr, ssb))), vmovq_n_u32(1)));
 	float32x4_t hgfactors = vcvtq_f32_u32(vandq_u32(vandq_u32(vceqq_f32(ssg, smaxes), vmvnq_u32(vceqq_u32(ssg, ssr))), vmovq_n_u32(1)));
 	float32x4_t hbfactors = vcvtq_f32_u32(vandq_u32(vandq_u32(vceqq_f32(ssb, smaxes), vmvnq_u32(vceqq_u32(ssb, ssg))), vmovq_n_u32(1)));
 	hue = vmulq_f32(hr, hrfactors);
 	hue = vaddq_f32(hue, vmulq_f32(hg, hgfactors));
 	hue = vaddq_f32(hue, vmulq_f32(hb, hbfactors));
-	//float32x4_t hchromaZeroMask = vcvtq_f32_u32(vandq_u32(vcleq_f32(chroma, eplison0), vmovq_n_u32(1)));
-	//hue = vmulq_f32(hue, hchromaZeroMask);
 
-	// Mess with the light
+	// Mess with the light like the original function
 	float32x4_t val = dmaxes;
 	if (light) {
 		val = vsubq_f32(val, vsubq_f32(vmovq_n_f32(1.0), vmulq_n_f32(vcvtq_f32_u32(alphas), 1.0 / 250.0)));
 		val = vmaxq_f32(val, vmovq_n_f32(0.0));
 	}
 		
-	// then it stiches them back together
-	//AGS3::Shared::Debug::Printf(AGS3::Shared::kDbgMsg_Info, "hues: %f", vgetq_lane_f32(hue, 0));
+	// then it stitches the HSV back together
+	// the hue and saturation come from the source (tint) color, and the value comes from
+	// the destination (real source) color
 	chroma = vmulq_f32(val, vdivq_f32(vsubq_f32(smaxes, smins), vaddq_f32(smaxes, eplison0)));
 	float32x4_t hprime_mod2 = vmulq_n_f32(hue, 1.0 / 2.0);
 	hprime_mod2 = vmulq_n_f32(vsubq_f32(hprime_mod2, vrndmq_f32(hprime_mod2)), 2.0);
@@ -186,6 +274,7 @@ inline uint32x4_t blendTintSpriteSIMD(uint32x4_t srcCols, uint32x4_t destCols, u
 	uint32x4_t x_int = vcvtq_u32_f32(vmulq_n_f32(x, 255.0));
 	uint32x4_t c_int = vcvtq_u32_f32(vmulq_n_f32(chroma, 255.0));
 
+	// Again HSV->RGB is also a piecewise function
 	uint32x4_t val0 = vorrq_u32(vshlq_n_u32(x_int, 8), vshlq_n_u32(c_int, 16));
 	val0 = vandq_u32(val0, vorrq_u32(vceqq_u32(hprime_rounded, vmovq_n_u32(0)), vceqq_u32(hprime_rounded, vmovq_n_u32(6))));
 	uint32x4_t val1 = vorrq_u32(vshlq_n_u32(c_int, 8), vshlq_n_u32(x_int, 16));
@@ -199,7 +288,10 @@ inline uint32x4_t blendTintSpriteSIMD(uint32x4_t srcCols, uint32x4_t destCols, u
 	uint32x4_t val5 = vorrq_u32(vshlq_n_u32(c_int, 16), x_int);
 	val5 = vandq_u32(val5, vceqq_u32(hprime_rounded, vmovq_n_u32(5)));
 
+	// or the values together
 	uint32x4_t final = vorrq_u32(val0, vorrq_u32(val1, vorrq_u32(val2, vorrq_u32(val3, vorrq_u32(val4, val5)))));
+
+	// add the minimums back in
 	uint32x4_t val_add = vcvtq_u32_f32(vmulq_n_f32(vsubq_f32(val, chroma), 255.0));
 	val_add = vorrq_u32(val_add, vorrq_u32(vshlq_n_u32(val_add, 8), vorrq_u32(vshlq_n_u32(val_add, 16), vandq_u32(destCols, vmovq_n_u32(0xff000000)))));
 	final = vaddq_u32(final, val_add);
@@ -209,6 +301,12 @@ inline uint32x4_t blendTintSpriteSIMD(uint32x4_t srcCols, uint32x4_t destCols, u
 inline uint32x4_t blendPixelSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas) {
 	uint32x4_t srcAlphas, difAlphas, mask, ch1, ch2;
 	auto setupArgbAlphas = [&]() {
+		// This acts the same as this in the normal blender functions
+		// if (alpha == 0)
+		//     alpha = aSrc;
+		// else
+		//     alpha = aSrc * ((alpha & 0xff) + 1) / 256;
+		// where alpha is the alpha byte of the srcCols
 		srcAlphas = vshrq_n_u32(srcCols, 24);
 		difAlphas = vaddq_u32(vandq_u32(alphas, vmovq_n_u32(0xff)), vmovq_n_u32(1));
 		difAlphas = vshrq_n_u32(vmulq_u32(srcAlphas, difAlphas), 8);
@@ -221,45 +319,45 @@ inline uint32x4_t blendPixelSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32
 		srcCols = vorrq_u32(srcCols, vorrq_u32(srcAlphas, difAlphas));
 	};
 	switch (_G(_blender_mode)) {
-	case kSourceAlphaBlender:
+	case kSourceAlphaBlender: // see BITMAP member function blendSourceAlpha
 		alphas = vshrq_n_u32(srcCols, 24);
 		return rgbBlendSIMD(srcCols, destCols, alphas, false);
-	case kArgbToArgbBlender:
+	case kArgbToArgbBlender: // see BITMAP member function blendArgbToArgb
 		setupArgbAlphas();
+		// only blend if alpha isn't 0, otherwise use destCols
 		mask = vcgtq_u32(vshrq_n_u32(srcCols, 24), vmovq_n_u32(0));
 		ch1 = vandq_u32(argbBlendSIMD(srcCols, destCols), mask);
 		ch2 = vandq_u32(destCols, vmvnq_u32(mask));
 		return vorrq_u32(ch1, ch2);
-	case kArgbToRgbBlender:
+	case kArgbToRgbBlender: // see BITMAP member function blendArgbToRgb
 		setupArgbAlphas();
 		return rgbBlendSIMD(srcCols, destCols, vshrq_n_u32(srcCols, 24), false);
-		//mask = vcgtq_u32(vshrq_n_u32(srcCols, 24), vmovq_n_u32(0));
-		//ch1 = vandq_u32(argbBlendSIMD(srcCols, destCols), mask);
-		//ch2 = vandq_u32(destCols, vmvnq_u32(mask));
-		//return vandq_u32(vorrq_u32(ch1, ch2), vmovq_n_u32(0x00ffffff));
-	case kRgbToArgbBlender:
+	case kRgbToArgbBlender: // see BITMAP member function blendRgbToArgb
+		// if alpha is NOT 0 or 255
 		ch2 = vandq_u32(srcCols, vmovq_n_u32(0x00ffffff));
 		ch2 = vorrq_u32(ch2, vshlq_n_u32(alphas, 24));
 		ch2 = argbBlendSIMD(ch2, destCols);
+		// if alpha is 0 or 255
 		ch1 = vorrq_u32(srcCols, vmovq_n_u32(0xff000000));
+		// mask and or them together
 		mask = vorrq_u32(vceqq_u32(alphas, vmovq_n_u32(0)), vceqq_u32(alphas, vmovq_n_u32(0xff)));
 		ch1 = vandq_u32(ch1, mask);
 		ch2 = vandq_u32(ch2, vmvnq_u32(mask));
 		return vorrq_u32(ch1, ch2);
-	case kRgbToRgbBlender:
+	case kRgbToRgbBlender: // see BITMAP member function blendRgbToRgb
 		return rgbBlendSIMD(srcCols, destCols, alphas, false);
-	case kAlphaPreservedBlenderMode:
+	case kAlphaPreservedBlenderMode: // see BITMAP member function blendPreserveAlpha
 		return rgbBlendSIMD(srcCols, destCols, alphas, true);
-	case kOpaqueBlenderMode:
+	case kOpaqueBlenderMode: // see BITMAP member function blendOpaque
 		return vorrq_u32(srcCols, vmovq_n_u32(0xff000000));
-	case kAdditiveBlenderMode:
+	case kAdditiveBlenderMode: // see BITMAP member function blendAdditiveAlpha
 		srcAlphas = vaddq_u32(vshrq_n_u32(srcCols, 24), vshrq_n_u32(destCols, 24));
 		srcAlphas = vminq_u32(srcAlphas, vmovq_n_u32(0xff));
 		srcCols = vandq_u32(srcCols, vmovq_n_u32(0x00ffffff));
 		return vorrq_u32(srcCols, vshlq_n_u32(srcAlphas, 24));
-	case kTintBlenderMode:
+	case kTintBlenderMode: // see BITMAP member function blendTintSprite
 		return blendTintSpriteSIMD(srcCols, destCols, alphas, false);
-	case kTintLightBlenderMode:
+	case kTintLightBlenderMode: // see BITMAP member function blendTintSprite
 		return blendTintSpriteSIMD(srcCols, destCols, alphas, true);
 	}
 }
@@ -308,7 +406,9 @@ inline void drawPixelSIMD(byte *destPtr, const byte *srcP2, uint32x4_t tint, uin
 		destCol = simd2BppTo4Bpp(vld1_u16((uint16 *)destPtr));
 		srcCols = simd2BppTo4Bpp(vld1_u16((const uint16 *)(srcP2 + xDir * xCtrBpp)));
 	}
-	uint32x4_t anded = vandq_u32(srcCols, maskedAlphas);
+	// we do this here because we need to check if we should skip the pixel before we blend it
+	uint32x4_t mask1 = skipTrans ? vceqq_u32(vandq_u32(srcCols, maskedAlphas), transColors) : vmovq_n_u32(0);
+	mask1 = vorrq_u32(mask1, skipMask);
 	if (srcAlpha != -1) {
 		// take useTint into account
 		if (useTint) {
@@ -317,8 +417,6 @@ inline void drawPixelSIMD(byte *destPtr, const byte *srcP2, uint32x4_t tint, uin
 			srcCols = blendPixelSIMD(srcCols, destCol, alphas);
 		}
 	}
-	uint32x4_t mask1 = skipTrans ? vceqq_u32(anded, transColors) : vmovq_n_u32(0);
-	mask1 = vorrq_u32(mask1, skipMask);
 	uint32x4_t destCols2 = vandq_u32(destCol, mask1);
 	uint32x4_t srcCols2 = vandq_u32(srcCols, vmvnq_u32(mask1));
 	uint32x4_t final = vorrq_u32(destCols2, srcCols2);
@@ -336,6 +434,8 @@ inline void drawPixelSIMD(byte *destPtr, const byte *srcP2, uint32x4_t tint, uin
 inline void drawPixelSIMD2Bpp(byte *destPtr, const byte *srcP2, uint16x8_t tint, uint16x8_t alphas, uint16x8_t transColors, int xDir, int xCtrBpp, int srcAlpha, int skipTrans, bool horizFlip, bool useTint, uint16x8_t skipMask) {
 	uint16x8_t destCol = vld1q_u16((uint16 *)destPtr);
 	uint16x8_t srcCols = vld1q_u16((const uint16 *)(srcP2 + xDir * xCtrBpp));
+	uint16x8_t mask1 = skipTrans ? vceqq_u16(srcCols, transColors) : vmovq_n_u16(0);
+	mask1 = vorrq_u16(mask1, skipMask);
 	if (srcAlpha != -1) {
 		// take useTint into account
 		if (useTint) {
@@ -344,8 +444,6 @@ inline void drawPixelSIMD2Bpp(byte *destPtr, const byte *srcP2, uint16x8_t tint,
 			srcCols = blendPixelSIMD2Bpp(srcCols, destCol, alphas);
 		}
 	}
-	uint16x8_t mask1 = skipTrans ? vceqq_u16(srcCols, transColors) : vmovq_n_u16(0);
-	mask1 = vorrq_u16(mask1, skipMask);
 	uint16x8_t destCols2 = vandq_u16(destCol, mask1);
 	uint16x8_t srcCols2 = vandq_u16(srcCols, vmvnq_u16(mask1));
 	uint16x8_t final = vorrq_u16(destCols2, srcCols2);
@@ -358,4 +456,5 @@ inline void drawPixelSIMD2Bpp(byte *destPtr, const byte *srcP2, uint16x8_t tint,
 
 } // namespace AGS3
 
-#endif
+#endif /* __aarch64__ */
+#endif /* AGS_LIB_ALLEGRO_SURFACE_SIMD_NEON */
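
A minimal scalar sketch of the per-pixel select idiom the hunks above switch to: the transparency/skip mask is now built before blending, and the final color is then chosen lane-by-lane with (dest & mask) | (src & ~mask), so skipped or transparent lanes keep the destination pixel untouched. The helper name selectPixel below is illustrative only, not an engine function.

  #include <cstdint>
  #include <cstdio>

  // mask == 0xffffffff keeps the destination pixel (transparent/skipped lane),
  // mask == 0 takes the (possibly blended) source pixel.
  static uint32_t selectPixel(uint32_t blendedSrc, uint32_t dest, uint32_t mask) {
      return (dest & mask) | (blendedSrc & ~mask);
  }

  int main() {
      const uint32_t transColor = 0xff00ffu;       // pretend magenta is the key color
      uint32_t src[2]  = { 0xff00ffu, 0x123456u }; // one transparent, one opaque pixel
      uint32_t dest[2] = { 0xabcdefu, 0xfedcbau };
      for (int i = 0; i < 2; ++i) {
          uint32_t mask = (src[i] == transColor) ? 0xffffffffu : 0u;
          uint32_t blended = src[i];               // a real blender would run here
          dest[i] = selectPixel(blended, dest[i], mask);
          printf("pixel %d -> %06x\n", i, dest[i]);
      }
      return 0;
  }
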


Commit: 0d29563122887a9826d31ec2e54c382e5a2d705a
    https://github.com/scummvm/scummvm/commit/0d29563122887a9826d31ec2e54c382e5a2d705a
Author: Wyatt Radkiewicz (wyattwradkiewicz at gmail.com)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: Started on SSE version

Changed paths:
  A engines/ags/lib/allegro/surface_simd_sse.cpp
  A engines/ags/lib/allegro/surface_simd_sse.h
    engines/ags/globals.cpp
    engines/ags/lib/allegro/surface.cpp
    engines/ags/lib/allegro/surface.h
    engines/ags/lib/allegro/surface_simd_neon.cpp
    engines/ags/module.mk


diff --git a/engines/ags/globals.cpp b/engines/ags/globals.cpp
index 7e5ea6047b4..07a01ee8824 100644
--- a/engines/ags/globals.cpp
+++ b/engines/ags/globals.cpp
@@ -106,27 +106,27 @@ Globals *g_globals;
 static bool checkForSIMDExtensions() {
 #if defined(__x86_64__) || defined(__i686__)
 #  ifdef __GNUC__
-	int c_ecx, c_edx;
-	asm (".intel_syntax;"
-		 "movq rax,0;"
-		 "cpuid;"
-		 "mov %0,ecx;"
-		 "mov %1,edx;"
-		 ".att_syntax;"
-		 : "=r" (c_ecx), "=r" (c_edx)
-		 : "r"
-		 : "eax", "ebx", "ecx", "edx");
-	return c_edx & (1 << 25); // SSE2 extensions bit
+	//int c_ecx, c_edx;
+	//asm ("mov $0, %%eax\n\t"
+	//	 "cpuid\n\t"
+	//	 "mov %%ecx, %0\n\t"
+	//	 "mov %%edx, %1\n\t"
+	//	 : "=rm" (c_ecx), "=rm" (c_edx)
+	//	 :
+	//	 : "eax", "ebx", "ecx", "edx");
+	//return c_edx & (1 << 25); // SSE2 extensions bit
+	return false;
 #  elif _MSC_VER
-	int c_ecx, c_edx;
-	__asm
-	{
-		mov rax,0
-		cpuid
-		mov c_ecx,ecx
-		mov c_edx,edx
-	}
-	return c_edx & (1 << 25); // SSE2 extensions bit
+	//int c_ecx, c_edx;
+	//__asm
+	//{
+	//	mov rax,0
+	//	cpuid
+	//	mov c_ecx,ecx
+	//	mov c_edx,edx
+	//}
+	//return c_edx & (1 << 25); // SSE2 extensions bit
+	return false;
 #  else
 	return false;
 #  endif
diff --git a/engines/ags/lib/allegro/surface.cpp b/engines/ags/lib/allegro/surface.cpp
index 2922e2d9009..2d1de640be3 100644
--- a/engines/ags/lib/allegro/surface.cpp
+++ b/engines/ags/lib/allegro/surface.cpp
@@ -26,7 +26,6 @@
 #include "ags/globals.h"
 #include "common/textconsole.h"
 #include "graphics/screen.h"
-#include <arm_neon.h>
 
 namespace AGS3 {
 
@@ -105,10 +104,13 @@ void BITMAP::floodfill(int x, int y, int color) {
 	AGS3::floodfill(this, x, y, color);
 }
 
+#include "common/debug.h"
+
 const int SCALE_THRESHOLD = 0x100;
 #define VGA_COLOR_TRANS(x) ((x) * 255 / 63)
 template<int ScaleThreshold>
 void BITMAP::drawInnerGeneric(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
+	//if (ScaleThreshold != 0) return;
 	const int xDir = horizFlip ? -1 : 1;
 	byte rSrc, gSrc, bSrc, aSrc;
 	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
diff --git a/engines/ags/lib/allegro/surface.h b/engines/ags/lib/allegro/surface.h
index 96d86370e06..a87bc5fa896 100644
--- a/engines/ags/lib/allegro/surface.h
+++ b/engines/ags/lib/allegro/surface.h
@@ -132,8 +132,6 @@ public:
 	// when x is the sprite color, y the destination color, and n an alpha value
 
 	void blendPixel(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &aDest, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha, bool useTint, byte *destVal) const;
-	//uint32x4_t blendPixelSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas) const;
-	//uint16x8_t blendPixelSIMD2Bpp(uint16x8_t srcCols, uint16x8_t destCols, uint16x8_t alphas) const;
 
 	inline void rgbBlend(uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha) const {
 		// Note: the original's handling varies slightly for R & B vs G.
@@ -268,13 +266,13 @@ public:
 	void blendTintSprite(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &aDest, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha, bool light) const;
 
 	template<int DestBytesPerPixel, int SrcBytesPerPixel, int ScaleThreshold>
-	void drawInner4BppWithConv(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY);
+	void drawInner4BppWithConv(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY);
 	template<int ScaleThreshold>
-	void drawInner2Bpp(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY);
+	void drawInner2Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY);
 	template<int ScaleThreshold>
-	void drawInner1Bpp(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY);
+	void drawInner1Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY);
 	template<int ScaleThreshold>
-	void drawInnerGeneric(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY);
+	void drawInnerGeneric(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY);
 	
 	inline uint32 getColor(const byte *data, byte bpp) const {
 		switch (bpp) {
diff --git a/engines/ags/lib/allegro/surface_simd_neon.cpp b/engines/ags/lib/allegro/surface_simd_neon.cpp
index bfc367f61f6..59033291de6 100644
--- a/engines/ags/lib/allegro/surface_simd_neon.cpp
+++ b/engines/ags/lib/allegro/surface_simd_neon.cpp
@@ -14,7 +14,7 @@ namespace AGS3 {
 
 // This template handles 2bpp and 4bpp, the other specializations handle 1bpp and format conversion blits
 template<int DestBytesPerPixel, int SrcBytesPerPixel, int ScaleThreshold>
-void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
+void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
 	const int xDir = horizFlip ? -1 : 1;
 	byte rSrc, gSrc, bSrc, aSrc;
 	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
@@ -185,7 +185,7 @@ void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32_t transColor,
 }
 
 template<int ScaleThreshold>
-void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
+void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
 	const int xDir = horizFlip ? -1 : 1;
 	byte rSrc, gSrc, bSrc, aSrc;
 	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
@@ -356,7 +356,7 @@ void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32_t transColor, uint32_t
 }
 
 template<int ScaleThreshold>
-void BITMAP::drawInner1Bpp(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
+void BITMAP::drawInner1Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
 	const int xDir = horizFlip ? -1 : 1;
 	uint8x16_t transColors = vld1q_dup_u8(&transColor);
 
@@ -480,16 +480,16 @@ void BITMAP::drawInner1Bpp(int yStart, int xStart, uint32_t transColor, uint32_t
 }
 
 
-template void BITMAP::drawInner4BppWithConv<4, 4, 0>(int, int, uint32_t, uint32_t, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner4BppWithConv<4, 4, 0x100>(int, int, uint32_t, uint32_t, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner4BppWithConv<4, 2, 0>(int, int, uint32_t, uint32_t, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner4BppWithConv<4, 2, 0x100>(int, int, uint32_t, uint32_t, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner4BppWithConv<2, 4, 0>(int, int, uint32_t, uint32_t, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner4BppWithConv<2, 4, 0x100>(int, int, uint32_t, uint32_t, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner2Bpp<0>(int, int, uint32_t, uint32_t, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner2Bpp<0x100>(int, int, uint32_t, uint32_t, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner1Bpp<0>(int, int, uint32_t, uint32_t, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner1Bpp<0x100>(int, int, uint32_t, uint32_t, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<4, 4, 0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<4, 4, 0x100>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<4, 2, 0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<4, 2, 0x100>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<2, 4, 0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<2, 4, 0x100>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner2Bpp<0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner2Bpp<0x100>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner1Bpp<0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner1Bpp<0x100>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
 
 } // namespace AGS3
 
diff --git a/engines/ags/lib/allegro/surface_simd_sse.cpp b/engines/ags/lib/allegro/surface_simd_sse.cpp
new file mode 100644
index 00000000000..f85b54b937a
--- /dev/null
+++ b/engines/ags/lib/allegro/surface_simd_sse.cpp
@@ -0,0 +1,42 @@
+#if defined(__x86_64__) || defined(__i686__)
+
+#include "ags/lib/allegro/gfx.h"
+#include "ags/lib/allegro/color.h"
+#include "ags/lib/allegro/flood.h"
+#include "ags/ags.h"
+#include "ags/globals.h"
+#include "common/textconsole.h"
+#include "graphics/screen.h"
+
+#include "ags/lib/allegro/surface_simd_neon.h"
+
+namespace AGS3 {
+
+// This template handles 2bpp and 4bpp, the other specializations handle 1bpp and format conversion blits
+template<int DestBytesPerPixel, int SrcBytesPerPixel, int ScaleThreshold>
+void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
+}
+
+template<int ScaleThreshold>
+void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
+}
+
+template<int ScaleThreshold>
+void BITMAP::drawInner1Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
+}
+
+
+template void BITMAP::drawInner4BppWithConv<4, 4, 0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<4, 4, 0x100>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<4, 2, 0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<4, 2, 0x100>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<2, 4, 0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<2, 4, 0x100>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner2Bpp<0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner2Bpp<0x100>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner1Bpp<0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner1Bpp<0x100>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+
+} // namespace AGS3
+
+#endif
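
Like the NEON file it mirrors, the new surface_simd_sse.cpp keeps the drawInner* template definitions out of the header, so every <DestBytesPerPixel, SrcBytesPerPixel, ScaleThreshold> combination the engine calls has to be explicitly instantiated at the bottom of the file; otherwise the linker would find no code for those specializations. A stripped-down sketch of that pattern (scaleRow and the file names are made-up placeholders, not engine code):

  // blit.h -- declaration only; the definition lives in a single .cpp file
  template<int Scale>
  void scaleRow(const int *src, int *dst, int count);

  // blit.cpp -- definition plus explicit instantiations for the values callers use
  #include "blit.h"

  template<int Scale>
  void scaleRow(const int *src, int *dst, int count) {
      for (int i = 0; i < count; ++i)
          dst[i] = src[i] * Scale;   // placeholder body
  }

  // Without these two lines, any other translation unit calling
  // scaleRow<1>() or scaleRow<256>() fails to link.
  template void scaleRow<1>(const int *, int *, int);
  template void scaleRow<256>(const int *, int *, int);
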
diff --git a/engines/ags/lib/allegro/surface_simd_sse.h b/engines/ags/lib/allegro/surface_simd_sse.h
new file mode 100644
index 00000000000..4d7bfd4302d
--- /dev/null
+++ b/engines/ags/lib/allegro/surface_simd_sse.h
@@ -0,0 +1,35 @@
+/* ScummVM - Graphic Adventure Engine
+ *
+ * ScummVM is the legal property of its developers, whose names
+ * are too numerous to list here. Please refer to the COPYRIGHT
+ * file distributed with this source distribution.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#ifndef AGS_LIB_ALLEGRO_SURFACE_SIMD_SSE_H
+#define AGS_LIB_ALLEGRO_SURFACE_SIMD_SSE_H
+#if defined(__x86_64__) || defined(__i686__)
+
+#include <immintrin.h>
+#include "ags/lib/allegro/surface.h"
+
+namespace AGS3 {
+
+
+
+} // namespace AGS3
+
+#endif /* __x86_64__ __i686__ */
+#endif /* AGS_LIB_ALLEGRO_SURFACE_SIMD_SSE */
diff --git a/engines/ags/module.mk b/engines/ags/module.mk
index 34161e4a0ca..38af18ef161 100644
--- a/engines/ags/module.mk
+++ b/engines/ags/module.mk
@@ -25,6 +25,7 @@ MODULE_OBJS = \
 	lib/allegro/rotate.o \
 	lib/allegro/surface.o \
 	lib/allegro/surface_simd_neon.o \
+	lib/allegro/surface_simd_sse.o \
 	lib/allegro/system.o \
 	lib/allegro/unicode.o \
 	lib/std/std.o \


Commit: 45f093f4beb766f33ec311e1e4a5b54ea0940a95
    https://github.com/scummvm/scummvm/commit/45f093f4beb766f33ec311e1e4a5b54ea0940a95
Author: wyatt-radkiewicz (wyattwradkiewicz at gmail.com)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: Fixed SSE2 detector and unoptimized draw.

Changed paths:
  R engines/ags/lib/NEON_2_SSE.h
    engines/ags/globals.cpp
    engines/ags/lib/allegro/surface.cpp


diff --git a/engines/ags/globals.cpp b/engines/ags/globals.cpp
index 07a01ee8824..34dfc60e670 100644
--- a/engines/ags/globals.cpp
+++ b/engines/ags/globals.cpp
@@ -106,27 +106,24 @@ Globals *g_globals;
 static bool checkForSIMDExtensions() {
 #if defined(__x86_64__) || defined(__i686__)
 #  ifdef __GNUC__
-	//int c_ecx, c_edx;
-	//asm ("mov $0, %%eax\n\t"
-	//	 "cpuid\n\t"
-	//	 "mov %%ecx, %0\n\t"
-	//	 "mov %%edx, %1\n\t"
-	//	 : "=rm" (c_ecx), "=rm" (c_edx)
-	//	 :
-	//	 : "eax", "ebx", "ecx", "edx");
-	//return c_edx & (1 << 25); // SSE2 extensions bit
-	return false;
+	uint32 extensions;
+	asm ("mov $1, %%eax\n\t"
+		 "cpuid\n\t"
+		 "mov %%edx, %0\n\t"
+		 : "=rm" (extensions)
+		 :
+		 : "eax", "ebx", "ecx", "edx");
+	debug("extensions_bits: %ux\n", extensions);
+	return extensions & (1 << 26); // SSE2 extensions bit
 #  elif _MSC_VER
-	//int c_ecx, c_edx;
-	//__asm
-	//{
-	//	mov rax,0
-	//	cpuid
-	//	mov c_ecx,ecx
-	//	mov c_edx,edx
-	//}
-	//return c_edx & (1 << 25); // SSE2 extensions bit
-	return false;
+	uint32 extensions;
+	__asm
+	{
+		mov eax,1
+		cpuid
+		mov extensions,edx
+	}
+	return extensions & (1 << 26); // SSE2 extensions bit
 #  else
 	return false;
 #  endif
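
The fixed detector queries CPUID leaf 1 (mov $1, %eax) and tests EDX bit 26, which is the SSE2 flag; the code it replaces asked leaf 0, which returns the vendor string and highest supported leaf rather than feature flags, and tested bit 25, the plain SSE flag. A sketch of the same check written with the GCC/Clang <cpuid.h> helper instead of inline assembly (shown only as an alternative, not what this commit does; MSVC builds would use __cpuid from <intrin.h>):

  #include <cpuid.h>   // GCC/Clang only
  #include <cstdio>

  static bool hasSSE2() {
      unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
      if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))   // leaf 1 = processor feature flags
          return false;
      return (edx & (1u << 26)) != 0;                // EDX bit 26 = SSE2 (bit 25 = SSE)
  }

  int main() {
      printf("SSE2 supported: %s\n", hasSSE2() ? "yes" : "no");
      return 0;
  }
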
diff --git a/engines/ags/lib/NEON_2_SSE.h b/engines/ags/lib/NEON_2_SSE.h
deleted file mode 100644
index d1a789046c4..00000000000
--- a/engines/ags/lib/NEON_2_SSE.h
+++ /dev/null
@@ -1,16872 +0,0 @@
-//created by Victoria Zhislina, the Senior Application Engineer, Intel Corporation,  victoria.zhislina at intel.com
-
-//*** Copyright (C) 2012-2022 Intel Corporation.  All rights reserved.
-
-//IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-
-//By downloading, copying, installing or using the software you agree to this license.
-//If you do not agree to this license, do not download, install, copy or use the software.
-
-//                              License Agreement
-//Redistribution and use in source and binary forms, with or without modification,
-//are permitted provided that the following conditions are met:
-
-//  * Redistributions of source code must retain the above copyright notice,
-//    this list of conditions and the following disclaimer.
-
-//  * The name of the copyright holders may not be used to endorse or promote products
-//    derived from this software without specific prior written permission.
-
-//This software is provided by the copyright holders and contributors "as is" and
-//any express or implied warranties, including, but not limited to, the implied
-//warranties of merchantability and fitness for a particular purpose are disclaimed.
-//In no event shall the Intel Corporation or contributors be liable for any direct,
-//indirect, incidental, special, exemplary, or consequential damages
-//(including, but not limited to, procurement of substitute goods or services;
-//loss of use, data, or profits; or business interruption) however caused
-//and on any theory of liability, whether in contract, strict liability,
-//or tort (including negligence or otherwise) arising in any way out of
-//the use of this software, even if advised of the possibility of such damage.
-
-//*****************************************************************************************
-// This file is intended to simplify ARM->IA32 porting
-// It makes the correspondence between ARM NEON intrinsics (as defined in "arm_neon.h")
-// and x86 SSE(up to SSE4.2) intrinsic functions as defined in headers files below
-//MMX instruction set is not used due to non availability on x64 systems,
-//performance overhead and the necessity to use the EMMS instruction (_mm_empty())for mmx-x87 floating point  switching
-//*****************************************************************************************
-
-//!!!!!!!!!!!!!!  To use this file just include it in your project that uses ARM NEON intrinsics instead of "arm_neon.h" and compile it as usual
-//!!!!!!!!!!!!!!  but please pay attention at #define USE_SSE4 below - you might need to define it manualy for newest Intel Atom or any Intel Core platforms for greater performance.
-
-#ifndef NEON2SSE_H
-#define NEON2SSE_H
-
-/*********************************************************************************************************************/
-//!!!!!!!!!!!!!!
-//if USE_SSE4 is defined, some functions use SSE4 instructions instead of earlier SSE versions, when undefined - SIMD up to SSSE3 are used
-//For older devices without SSE4 support it should be undefined,  for newer devices - defined, probably manualy if your compiler doesn't set __SSE4_2__ predefine
-#ifndef USE_SSE4
-#   if defined(__SSE4_2__)
-#       define USE_SSE4
-#   endif
-#endif
-/*********************************************************************************************************************/
-
-#include <xmmintrin.h>     //SSE
-#include <emmintrin.h>     //SSE2
-#include <pmmintrin.h>     //SSE3
-#include <tmmintrin.h>     //SSSE3
-#ifdef USE_SSE4
-#   include <smmintrin.h> //SSE4.1
-#   include <nmmintrin.h> //SSE4.2
-#endif
-
-#include <math.h>
-
-//***************  functions and data attributes, compiler dependent  *********************************
-//***********************************************************************************
-#ifdef __GNUC__
-#   define _GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
-#   define _NEON2SSESTORAGE static
-#   define _NEON2SSE_ALIGN_16  __attribute__((aligned(16)))
-#   ifdef __clang__
-#       define _NEON2SSE_INLINE _NEON2SSESTORAGE inline __attribute__((__gnu_inline__, __always_inline__))
-#   else
-#       define _NEON2SSE_INLINE _NEON2SSESTORAGE inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-#   endif
-#   ifndef NEON2SSE_DISABLE_PERFORMANCE_WARNING
-#       if _GCC_VERSION <  40500
-#           define _NEON2SSE_PERFORMANCE_WARNING(function, explanation)   __attribute__((deprecated)) function
-#       else
-#           define _NEON2SSE_PERFORMANCE_WARNING(function, explanation)   __attribute__((deprecated(explanation))) function
-#       endif
-#   else
-#       define _NEON2SSE_PERFORMANCE_WARNING(function, explanation)  function
-#   endif
-#   if defined(__x86_64__)
-#       define _NEON2SSE_64BIT  __x86_64__
-#   endif
-#else
-#   define _NEON2SSESTORAGE static
-#   define _NEON2SSE_ALIGN_16  __declspec(align(16))
-#   define _NEON2SSE_INLINE _NEON2SSESTORAGE __inline
-#   if (defined(_MSC_VER) || defined (__INTEL_COMPILER)) && !defined(NEON2SSE_DISABLE_PERFORMANCE_WARNING)
-#       define _NEON2SSE_PERFORMANCE_WARNING(function, EXPLANATION) __declspec(deprecated(EXPLANATION)) function
-#       if defined(_M_X64)
-#           define _NEON2SSE_64BIT  _M_X64
-#       endif
-#   else
-#       define _NEON2SSE_PERFORMANCE_WARNING(function, explanation)  function
-#   endif
-#endif
-
-/* Used to mark the intinsics that are declared as functions, but implemented as macros */
-#define _NEON2SSE_GLOBAL
-
-#if defined  (_NEON2SSE_64BIT) && defined (USE_SSE4)
-#   define _NEON2SSE_64BIT_SSE4
-#endif
-
-#ifndef UNREFERENCED_PARAMETER
-#   define UNREFERENCED_PARAMETER(P) ((void)(P))
-#endif
-
-/*********************************************************************************************************************/
-//    data types conversion
-/*********************************************************************************************************************/
-#if defined(_MSC_VER) && (_MSC_VER < 1300)
-    typedef signed char int8_t;
-    typedef unsigned char uint8_t;
-    typedef signed short int16_t;
-    typedef unsigned short uint16_t;
-    typedef signed int int32_t;
-    typedef unsigned int uint32_t;
-    typedef signed long long int64_t;
-    typedef unsigned long long uint64_t;
-#elif defined(_MSC_VER)
-    typedef signed __int8 int8_t;
-    typedef unsigned __int8 uint8_t;
-    typedef signed __int16 int16_t;
-    typedef unsigned __int16 uint16_t;
-    typedef signed __int32 int32_t;
-    typedef unsigned __int32 uint32_t;
-
-    typedef signed long long int64_t;
-    typedef unsigned long long uint64_t;
-#else
-#   include <stdint.h>
-#   include <limits.h>
-#endif
-
-
-typedef   float float32_t;
-#if !defined(__clang__)
-typedef   float __fp16;
-#endif
-
-typedef   double float64_t;
-
-typedef union   __m64_128 {
-    uint64_t m64_u64[1];
-    int64_t m64_i64[1];
-    float64_t m64_d64[1];
-    uint32_t m64_u32[2];
-    int32_t m64_i32[2];
-    float32_t m64_f32[2];
-    int16_t m64_i16[4];
-    uint16_t m64_u16[4];
-    int8_t m64_i8[8];
-    uint8_t m64_u8[8];
-} __m64_128;
-
-typedef __m64_128 int8x8_t;
-typedef __m64_128 uint8x8_t;
-typedef __m64_128 int16x4_t;
-typedef __m64_128 uint16x4_t;
-typedef __m64_128 int32x2_t;
-typedef __m64_128 uint32x2_t;
-typedef __m64_128 int64x1_t;
-typedef __m64_128 uint64x1_t;
-typedef __m64_128 poly8x8_t;
-typedef __m64_128 poly16x4_t;
-
-typedef __m64_128 float32x2_t;
-typedef __m128 float32x4_t;
-
-typedef __m128 float16x4_t; //not supported by IA, for compartibility
-typedef __m128 float16x8_t; //not supported by IA, for compartibility
-
-typedef __m64_128 float64x1_t;
-typedef __m128d float64x2_t;
-
-typedef __m128i int8x16_t;
-typedef __m128i int16x8_t;
-typedef __m128i int32x4_t;
-typedef __m128i int64x2_t;
-typedef __m128i uint8x16_t;
-typedef __m128i uint16x8_t;
-typedef __m128i uint32x4_t;
-typedef __m128i uint64x2_t;
-typedef __m128i poly8x16_t;
-typedef __m128i poly16x8_t;
-
-#if defined(_MSC_VER)
-#   define SINT_MIN     (-2147483647 - 1) /* min signed int value */
-#   define SINT_MAX       2147483647 /* max signed int value */
-#else
-#   define SINT_MIN     INT_MIN /* min signed int value */
-#   define SINT_MAX     INT_MAX /* max signed int value */
-#endif
-
-typedef  uint8_t poly8_t;
-typedef  uint16_t poly16_t;
-
-
-//MSVC compilers (tested up to 2012 VS version) doesn't allow using structures or arrays of __m128x type  as functions arguments resulting in
-//error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned.  To avoid it we need the special trick for functions that use these types
-struct int8x16x2_t {
-    int8x16_t val[2];
-};
-struct int16x8x2_t {
-    int16x8_t val[2];
-};
-struct int32x4x2_t {
-    int32x4_t val[2];
-};
-struct int64x2x2_t {
-    int64x2_t val[2];
-};
-//Unfortunately we are unable to merge two 64-bits in on 128 bit register because user should be able to access val[n] members explicitly!!!
-struct int8x8x2_t {
-    int8x8_t val[2];
-};
-struct int16x4x2_t {
-    int16x4_t val[2];
-};
-struct int32x2x2_t {
-    int32x2_t val[2];
-};
-struct int64x1x2_t {
-    int64x1_t val[2];
-};
-
-typedef struct int8x16x2_t int8x16x2_t; //for C compilers to make them happy
-typedef struct int16x8x2_t int16x8x2_t; //for C compilers to make them happy
-typedef struct int32x4x2_t int32x4x2_t; //for C compilers to make them happy
-typedef struct int64x2x2_t int64x2x2_t; //for C compilers to make them happy
-
-typedef struct int8x8x2_t int8x8x2_t; //for C compilers to make them happy
-typedef struct int16x4x2_t int16x4x2_t; //for C compilers to make them happy
-typedef struct int32x2x2_t int32x2x2_t; //for C compilers to make them happy
-typedef struct int64x1x2_t int64x1x2_t; //for C compilers to make them happy
-
-/* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers structures above */
-typedef struct int8x16x2_t uint8x16x2_t;
-typedef struct int16x8x2_t uint16x8x2_t;
-typedef struct int32x4x2_t uint32x4x2_t;
-typedef struct int64x2x2_t uint64x2x2_t;
-typedef struct int8x16x2_t poly8x16x2_t;
-typedef struct int16x8x2_t poly16x8x2_t;
-
-typedef struct int8x8x2_t uint8x8x2_t;
-typedef struct int16x4x2_t uint16x4x2_t;
-typedef struct int32x2x2_t uint32x2x2_t;
-typedef struct int64x1x2_t uint64x1x2_t;
-typedef struct int8x8x2_t poly8x8x2_t;
-typedef struct int16x4x2_t poly16x4x2_t;
-
-//float
-struct float32x4x2_t {
-    float32x4_t val[2];
-};
-struct float16x8x2_t {
-    float16x8_t val[2];
-};
-struct float32x2x2_t {
-    float32x2_t val[2];
-};
-
-typedef struct float32x4x2_t float32x4x2_t; //for C compilers to make them happy
-typedef struct float16x8x2_t float16x8x2_t; //for C compilers to make them happy
-typedef struct  float32x2x2_t float32x2x2_t; //for C compilers to make them happy
-typedef  float16x8x2_t float16x4x2_t;
-
-//4
-struct int8x16x4_t {
-    int8x16_t val[4];
-};
-struct int16x8x4_t {
-    int16x8_t val[4];
-};
-struct int32x4x4_t {
-    int32x4_t val[4];
-};
-struct int64x2x4_t {
-    int64x2_t val[4];
-};
-
-struct int8x8x4_t {
-    int8x8_t val[4];
-};
-struct int16x4x4_t {
-    int16x4_t val[4];
-};
-struct int32x2x4_t {
-    int32x2_t val[4];
-};
-struct int64x1x4_t {
-    int64x1_t val[4];
-};
-
-typedef struct int8x16x4_t int8x16x4_t; //for C compilers to make them happy
-typedef struct int16x8x4_t int16x8x4_t; //for C compilers to make them happy
-typedef struct int32x4x4_t int32x4x4_t; //for C compilers to make them happy
-typedef struct int64x2x4_t int64x2x4_t; //for C compilers to make them happy
-
-typedef struct int8x8x4_t int8x8x4_t; //for C compilers to make them happy
-typedef struct int16x4x4_t int16x4x4_t; //for C compilers to make them happy
-typedef struct int32x2x4_t int32x2x4_t; //for C compilers to make them happy
-typedef struct int64x1x4_t int64x1x4_t; //for C compilers to make them happy
-
-/* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers dealing structures above:*/
-typedef struct int8x8x4_t uint8x8x4_t;
-typedef struct int16x4x4_t uint16x4x4_t;
-typedef struct int32x2x4_t uint32x2x4_t;
-typedef struct int64x1x4_t uint64x1x4_t;
-typedef struct int8x8x4_t poly8x8x4_t;
-typedef struct int16x4x4_t poly16x4x4_t;
-
-typedef struct int8x16x4_t uint8x16x4_t;
-typedef struct int16x8x4_t uint16x8x4_t;
-typedef struct int32x4x4_t uint32x4x4_t;
-typedef struct int64x2x4_t uint64x2x4_t;
-typedef struct int8x16x4_t poly8x16x4_t;
-typedef struct int16x8x4_t poly16x8x4_t;
-
-struct float32x4x4_t {
-    float32x4_t val[4];
-};
-struct float16x8x4_t {
-    float16x8_t val[4];
-};
-struct float32x2x4_t {
-    float32x2_t val[4];
-};
-
-typedef struct float32x4x4_t float32x4x4_t; //for C compilers to make them happy
-typedef struct float16x8x4_t float16x8x4_t; //for C compilers to make them happy
-typedef struct  float32x2x4_t float32x2x4_t; //for C compilers to make them happy
-typedef  float16x8x4_t float16x4x4_t;
-
-//3
-struct int16x8x3_t {
-    int16x8_t val[3];
-};
-struct int32x4x3_t {
-    int32x4_t val[3];
-};
-struct int64x2x3_t {
-    int64x2_t val[3];
-};
-struct int8x16x3_t {
-    int8x16_t val[3];
-};
-
-struct int16x4x3_t {
-    int16x4_t val[3];
-};
-struct int32x2x3_t {
-    int32x2_t val[3];
-};
-struct int64x1x3_t {
-    int64x1_t val[3];
-};
-struct int8x8x3_t {
-    int8x8_t val[3];
-};
-typedef struct int16x8x3_t int16x8x3_t; //for C compilers to make them happy
-typedef struct int32x4x3_t int32x4x3_t; //for C compilers to make them happy
-typedef struct int64x2x3_t int64x2x3_t; //for C compilers to make them happy
-typedef struct int8x16x3_t int8x16x3_t; //for C compilers to make them happy
-
-typedef struct int8x8x3_t int8x8x3_t; //for C compilers to make them happy
-typedef struct int16x4x3_t int16x4x3_t; //for C compilers to make them happy
-typedef struct int32x2x3_t int32x2x3_t; //for C compilers to make them happy
-typedef struct int64x1x3_t int64x1x3_t; //for C compilers to make them happy
-
-
-/* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers dealing structures above:*/
-typedef struct int8x16x3_t uint8x16x3_t;
-typedef struct int16x8x3_t uint16x8x3_t;
-typedef struct int32x4x3_t uint32x4x3_t;
-typedef struct int64x2x3_t uint64x2x3_t;
-typedef struct int8x16x3_t poly8x16x3_t;
-typedef struct int16x8x3_t poly16x8x3_t;
-typedef struct  int8x8x3_t uint8x8x3_t;
-typedef struct  int16x4x3_t uint16x4x3_t;
-typedef struct  int32x2x3_t uint32x2x3_t;
-typedef struct  int64x1x3_t uint64x1x3_t;
-typedef struct  int8x8x3_t poly8x8x3_t;
-typedef struct  int16x4x3_t poly16x4x3_t;
-
-//float
-struct float32x4x3_t {
-    float32x4_t val[3];
-};
-struct float32x2x3_t {
-    float32x2_t val[3];
-};
-struct float16x8x3_t {
-    float16x8_t val[3];
-};
-
-typedef struct float32x4x3_t float32x4x3_t; //for C compilers to make them happy
-typedef struct float16x8x3_t float16x8x3_t; //for C compilers to make them happy
-typedef struct float32x2x3_t float32x2x3_t; //for C compilers to make them happy
-typedef  float16x8x3_t float16x4x3_t;
-
-
-//****************************************************************************
-//****** Porting auxiliary macros ********************************************
-
-//** floating point related macros **
-#define _M128i(a) _mm_castps_si128(a)
-#define _M128(a) _mm_castsi128_ps(a)
-//here the most performance effective implementation is compiler and 32/64 bits build dependent
-#if defined (_NEON2SSE_64BIT) || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER  >= 1500) )
-#   define _pM128i(a) _mm_cvtsi64_si128(*(int64_t*)(&(a)))
-#   define _M64(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (inp);
-#   define _M64f(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (_M128i(inp));
-#else
-   //for 32bit gcc and Microsoft compilers builds
-#   define _pM128i(a) _mm_loadl_epi64((__m128i*)&(a))
-#   define _M64(out, inp)  _mm_storel_epi64 ((__m128i*)&(out), inp)
-#   define _M64f(out, inp)  _mm_storel_epi64 ((__m128i*)&(out), _M128i(inp))
-#endif
-#define _pM128(a) _mm_castsi128_ps(_pM128i(a))
-
-#define return64(a)  _M64(res64,a); return res64;
-#define return64f(a)  _M64f(res64,a); return res64;
-
-#define _Ui64(a) (*(uint64_t*)&(a))
-#define _UNSIGNED_T(a) u ## a
-
-#define _SIGNBIT64 ((uint64_t)1 << 63)
-#define _SWAP_HI_LOW32  (2 | (3 << 2) | (0 << 4) | (1 << 6))
-#define _INSERTPS_NDX(srcField, dstField) (((srcField) << 6) | ((dstField) << 4) )
-
-#define  _NEON2SSE_REASON_SLOW_SERIAL "The function may be very slow due to the serial implementation, please try to avoid it"
-#define  _NEON2SSE_REASON_SLOW_UNEFFECTIVE "The function may be slow due to inefficient x86 SIMD implementation, please try to avoid it"
-
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-#define __constrange(min,max)  const
-#define __transfersize(size)
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-//&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& mask constants used in porting &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
-_NEON2SSE_ALIGN_16 static const int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7,  9, 11, 13, 15 };
-_NEON2SSE_ALIGN_16 static const int8_t mask8_32_even_odd[16] = { 0, 1, 4, 5, 8,  9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 };
-//&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
-
-//*************************************************************************
-//*************************************************************************
-//*********  Functions declarations as declared in original arm_neon.h *****
-//*************************************************************************
-//Vector add: vadd -> Vr[i]:=Va[i]+Vb[i], Vr, Va, Vb have equal lane sizes.
-_NEON2SSESTORAGE int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0
-_NEON2SSESTORAGE int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0
-_NEON2SSESTORAGE int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0
-_NEON2SSESTORAGE int64x1_t vadd_s64(int64x1_t a, int64x1_t b); // VADD.I64 d0,d0,d0
-_NEON2SSESTORAGE float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0
-_NEON2SSE_GLOBAL uint8x8_t vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0
-_NEON2SSE_GLOBAL uint16x4_t vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0
-_NEON2SSE_GLOBAL uint32x2_t vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0
-_NEON2SSESTORAGE uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b); // VADD.I64 d0,d0,d0
-_NEON2SSE_GLOBAL int8x16_t vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0
-_NEON2SSE_GLOBAL int16x8_t vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0
-_NEON2SSE_GLOBAL int32x4_t vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0
-_NEON2SSE_GLOBAL int64x2_t vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0
-_NEON2SSE_GLOBAL float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0
-_NEON2SSE_GLOBAL uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0
-_NEON2SSE_GLOBAL uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0
-_NEON2SSE_GLOBAL uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0
-_NEON2SSE_GLOBAL uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0
-//Vector long add: vaddl -> Vr[i]:=Va[i]+Vb[i], Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
-_NEON2SSESTORAGE int16x8_t vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0
-_NEON2SSESTORAGE int32x4_t vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0
-_NEON2SSESTORAGE int64x2_t vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0
-_NEON2SSESTORAGE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0
-_NEON2SSESTORAGE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.U16 q0,d0,d0
-_NEON2SSESTORAGE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0
-//Vector wide addw: vadd -> Vr[i]:=Va[i]+Vb[i]
-_NEON2SSESTORAGE int16x8_t vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0
-_NEON2SSESTORAGE int32x4_t vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0
-_NEON2SSESTORAGE int64x2_t vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0
-_NEON2SSESTORAGE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0
-_NEON2SSESTORAGE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.U16 q0,q0,d0
-_NEON2SSESTORAGE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0
-//Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1
-_NEON2SSESTORAGE int8x8_t vhadd_s8(int8x8_t a, int8x8_t b); // VHADD.S8 d0,d0,d0
-_NEON2SSESTORAGE int16x4_t vhadd_s16(int16x4_t a, int16x4_t b); // VHADD.S16 d0,d0,d0
-_NEON2SSESTORAGE int32x2_t vhadd_s32(int32x2_t a, int32x2_t b); // VHADD.S32 d0,d0,d0
-_NEON2SSESTORAGE uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b); // VHADD.U8 d0,d0,d0
-_NEON2SSESTORAGE uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b); // VHADD.U16 d0,d0,d0
-_NEON2SSESTORAGE uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b); // VHADD.U32 d0,d0,d0
-_NEON2SSESTORAGE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0
-_NEON2SSESTORAGE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S16 q0,q0,q0
-_NEON2SSESTORAGE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0
-_NEON2SSESTORAGE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0
-_NEON2SSESTORAGE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.U16 q0,q0,q0
-_NEON2SSESTORAGE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0
-//Vector rounding halving add: vrhadd -> Vr[i]:=(Va[i]+Vb[i]+1)>>1
-_NEON2SSESTORAGE int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b); // VRHADD.S8 d0,d0,d0
-_NEON2SSESTORAGE int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b); // VRHADD.S16 d0,d0,d0
-_NEON2SSESTORAGE int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b); // VRHADD.S32 d0,d0,d0
-_NEON2SSESTORAGE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0
-_NEON2SSESTORAGE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.U16 d0,d0,d0
-_NEON2SSESTORAGE uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b); // VRHADD.U32 d0,d0,d0
-_NEON2SSESTORAGE int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0
-_NEON2SSESTORAGE int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0
-_NEON2SSESTORAGE int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0
-_NEON2SSE_GLOBAL uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0
-_NEON2SSE_GLOBAL uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.U16 q0,q0,q0
-_NEON2SSESTORAGE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0
-//Vector saturating add: vqadd -> Vr[i]:=sat<size>(Va[i]+Vb[i])
-_NEON2SSESTORAGE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0
-_NEON2SSESTORAGE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0
-_NEON2SSESTORAGE int32x2_t vqadd_s32(int32x2_t a, int32x2_t b); // VQADD.S32 d0,d0,d0
-_NEON2SSESTORAGE int64x1_t vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0
-_NEON2SSESTORAGE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0
-_NEON2SSESTORAGE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.U16 d0,d0,d0
-_NEON2SSESTORAGE uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b); // VQADD.U32 d0,d0,d0
-_NEON2SSESTORAGE uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0
-_NEON2SSE_GLOBAL int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0
-_NEON2SSE_GLOBAL int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0
-_NEON2SSESTORAGE int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0
-_NEON2SSESTORAGE int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0
-_NEON2SSE_GLOBAL uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0
-_NEON2SSE_GLOBAL uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.U16 q0,q0,q0
-_NEON2SSESTORAGE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0
-_NEON2SSESTORAGE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0
-//Vector add high half: vaddhn-> Vr[i]:=Va[i]+Vb[i]
-_NEON2SSESTORAGE int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0
-_NEON2SSESTORAGE int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0
-_NEON2SSESTORAGE int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0
-_NEON2SSESTORAGE uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0
-_NEON2SSESTORAGE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0
-_NEON2SSE_GLOBAL uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0
-//Vector rounding add high half: vraddhn
-_NEON2SSESTORAGE int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0
-_NEON2SSESTORAGE int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0
-_NEON2SSESTORAGE int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0
-_NEON2SSESTORAGE uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0
-_NEON2SSESTORAGE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0
-_NEON2SSE_GLOBAL uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0
-//Multiplication
-//Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i]
-_NEON2SSESTORAGE int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0
-_NEON2SSE_GLOBAL int16x4_t vmul_s16(int16x4_t a, int16x4_t b); // VMUL.I16 d0,d0,d0
-_NEON2SSE_GLOBAL int32x2_t vmul_s32(int32x2_t a, int32x2_t b); // VMUL.I32 d0,d0,d0
-_NEON2SSESTORAGE float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0
-_NEON2SSESTORAGE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0
-_NEON2SSESTORAGE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0
-_NEON2SSESTORAGE uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0
-_NEON2SSESTORAGE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0
-_NEON2SSESTORAGE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0
-_NEON2SSE_GLOBAL int16x8_t vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0
-_NEON2SSE_GLOBAL int32x4_t vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0
-_NEON2SSE_GLOBAL float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0
-_NEON2SSESTORAGE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0
-_NEON2SSE_GLOBAL uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0
-_NEON2SSE_GLOBAL uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0
-_NEON2SSESTORAGE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0
-//multiply lane
-_NEON2SSESTORAGE int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c);
-_NEON2SSESTORAGE int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c);
-_NEON2SSESTORAGE float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c);
-_NEON2SSE_GLOBAL uint16x4_t vmul_lane_u16 (uint16x4_t a, uint16x4_t b, __constrange(0,3) int c);
-_NEON2SSE_GLOBAL uint32x2_t vmul_lane_u32 (uint32x2_t a, uint32x2_t b, __constrange(0,1) int c);
-_NEON2SSESTORAGE int16x8_t vmulq_lane_s16 (int16x8_t a, int16x4_t b, __constrange(0,3) int c);
-_NEON2SSESTORAGE int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c);
-_NEON2SSESTORAGE float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c);
-_NEON2SSE_GLOBAL uint16x8_t vmulq_lane_u16 (uint16x8_t a, uint16x4_t b, __constrange(0,3) int c);
-_NEON2SSE_GLOBAL uint32x4_t vmulq_lane_u32 (uint32x4_t a, uint32x2_t b, __constrange(0,1) int c);
-//Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i]
-_NEON2SSESTORAGE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0
-_NEON2SSESTORAGE int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0
-_NEON2SSESTORAGE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0
-_NEON2SSESTORAGE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0
-_NEON2SSESTORAGE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0
-_NEON2SSE_GLOBAL uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0
-_NEON2SSE_GLOBAL uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0
-_NEON2SSESTORAGE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0
-_NEON2SSESTORAGE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0
-_NEON2SSESTORAGE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0
-_NEON2SSESTORAGE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0
-_NEON2SSESTORAGE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0
-_NEON2SSE_GLOBAL uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0
-_NEON2SSE_GLOBAL uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0
-//Vector multiply accumulate long: vmlal -> Vr[i] := Va[i] + Vb[i] * Vc[i]
-_NEON2SSESTORAGE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0
-_NEON2SSESTORAGE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0
-_NEON2SSESTORAGE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0
-_NEON2SSESTORAGE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0
-_NEON2SSESTORAGE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.U16 q0,d0,d0
-_NEON2SSESTORAGE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0
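/* [Editor's illustration, not part of the deleted header] A minimal sketch of the
 * widening multiply-accumulate pattern declared above: each 8-bit product is
 * accumulated into a 16-bit lane, so eight bytes per call cannot overflow.
 * Assumes <arm_neon.h> on ARM; on x86 this shim provides the same names. The
 * load/store helpers (vld1q_u16, vst1q_u16, vld1_u8) come from the same
 * intrinsic set. */
#include <arm_neon.h>

static void mac_widen_u8(uint16_t acc[8], const uint8_t b[8], const uint8_t c[8]) {
    uint16x8_t va = vld1q_u16(acc);   /* 8 x 16-bit accumulators         */
    uint8x8_t  vb = vld1_u8(b);       /* 8 x 8-bit multiplicands         */
    uint8x8_t  vc = vld1_u8(c);
    va = vmlal_u8(va, vb, vc);        /* acc[i] += b[i] * c[i], widened  */
    vst1q_u16(acc, va);
}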
-//Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i]
-_NEON2SSESTORAGE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0
-_NEON2SSESTORAGE int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0
-_NEON2SSESTORAGE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0
-_NEON2SSESTORAGE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0
-_NEON2SSESTORAGE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0
-_NEON2SSE_GLOBAL uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0
-_NEON2SSE_GLOBAL uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0
-_NEON2SSESTORAGE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0
-_NEON2SSESTORAGE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0
-_NEON2SSESTORAGE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0
-_NEON2SSESTORAGE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0
-_NEON2SSESTORAGE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0
-_NEON2SSE_GLOBAL uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0
-_NEON2SSE_GLOBAL uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0
-//Vector multiply subtract long
-_NEON2SSESTORAGE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0
-_NEON2SSESTORAGE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0
-_NEON2SSESTORAGE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0
-_NEON2SSESTORAGE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0
-_NEON2SSESTORAGE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.U16 q0,d0,d0
-_NEON2SSESTORAGE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0
-//Vector saturating doubling multiply high
-_NEON2SSESTORAGE int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b); // VQDMULH.S16 d0,d0,d0
-_NEON2SSESTORAGE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0
-_NEON2SSESTORAGE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0
-_NEON2SSESTORAGE int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0
-//Vector saturating rounding doubling multiply high
-_NEON2SSESTORAGE int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b); // VQRDMULH.S16 d0,d0,d0
-_NEON2SSESTORAGE int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0
-_NEON2SSESTORAGE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0
-_NEON2SSESTORAGE int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0
-//Vector saturating doubling multiply accumulate long
-_NEON2SSESTORAGE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0
-_NEON2SSESTORAGE int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0
-//Vector saturating doubling multiply subtract long
-_NEON2SSESTORAGE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0
-_NEON2SSESTORAGE int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0
-//Vector long multiply
-_NEON2SSESTORAGE int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0
-_NEON2SSESTORAGE int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0
-_NEON2SSESTORAGE int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0
-_NEON2SSESTORAGE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0
-_NEON2SSESTORAGE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.U16 q0,d0,d0
-_NEON2SSESTORAGE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0
-_NEON2SSESTORAGE poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0
-//Vector saturating doubling long multiply
-_NEON2SSESTORAGE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0
-_NEON2SSESTORAGE int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0
-//Subtraction
-//Vector subtract
-_NEON2SSESTORAGE int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0
-_NEON2SSESTORAGE int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0
-_NEON2SSESTORAGE int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0
-_NEON2SSESTORAGE int64x1_t vsub_s64(int64x1_t a, int64x1_t b); // VSUB.I64 d0,d0,d0
-_NEON2SSESTORAGE float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0
-_NEON2SSE_GLOBAL uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0
-_NEON2SSE_GLOBAL uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0
-_NEON2SSE_GLOBAL uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0
-_NEON2SSESTORAGE uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b); // VSUB.I64 d0,d0,d0
-_NEON2SSE_GLOBAL int8x16_t vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0
-_NEON2SSE_GLOBAL int16x8_t vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0
-_NEON2SSE_GLOBAL int32x4_t vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0
-_NEON2SSE_GLOBAL int64x2_t vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0
-_NEON2SSE_GLOBAL float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0
-_NEON2SSE_GLOBAL uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0
-_NEON2SSE_GLOBAL uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0
-_NEON2SSE_GLOBAL uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0
-_NEON2SSE_GLOBAL uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0
-//Vector long subtract: vsubl -> Vr[i]:=Va[i]-Vb[i]
-_NEON2SSESTORAGE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0
-_NEON2SSESTORAGE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0
-_NEON2SSESTORAGE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0
-_NEON2SSESTORAGE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0
-_NEON2SSESTORAGE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.U16 q0,d0,d0
-_NEON2SSESTORAGE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0
-//Vector wide subtract: vsubw -> Vr[i]:=Va[i]-Vb[i]
-_NEON2SSESTORAGE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b); // VSUBW.S8 q0,q0,d0
-_NEON2SSESTORAGE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0
-_NEON2SSESTORAGE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0
-_NEON2SSESTORAGE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0
-_NEON2SSESTORAGE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.U16 q0,q0,d0
-_NEON2SSESTORAGE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0
-//Vector saturating subtract
-_NEON2SSESTORAGE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0
-_NEON2SSESTORAGE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0
-_NEON2SSESTORAGE int32x2_t vqsub_s32(int32x2_t a, int32x2_t b); // VQSUB.S32 d0,d0,d0
-_NEON2SSESTORAGE int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0
-_NEON2SSESTORAGE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0
-_NEON2SSESTORAGE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.U16 d0,d0,d0
-_NEON2SSESTORAGE uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b); // VQSUB.U32 d0,d0,d0
-_NEON2SSESTORAGE uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0
-_NEON2SSE_GLOBAL int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0
-_NEON2SSE_GLOBAL int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0
-_NEON2SSESTORAGE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0
-_NEON2SSESTORAGE int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0
-_NEON2SSE_GLOBAL uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0
-_NEON2SSE_GLOBAL uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.U16 q0,q0,q0
-_NEON2SSESTORAGE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0
-_NEON2SSESTORAGE uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b); // VQSUB.U64 q0,q0,q0
-//Vector halving subtract
-_NEON2SSESTORAGE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0
-_NEON2SSESTORAGE int16x4_t vhsub_s16(int16x4_t a, int16x4_t b); // VHSUB.S16 d0,d0,d0
-_NEON2SSESTORAGE int32x2_t vhsub_s32(int32x2_t a, int32x2_t b); // VHSUB.S32 d0,d0,d0
-_NEON2SSESTORAGE uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b); // VHSUB.U8 d0,d0,d0
-_NEON2SSESTORAGE uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b); // VHSUB.U16 d0,d0,d0
-_NEON2SSESTORAGE uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b); // VHSUB.U32 d0,d0,d0
-_NEON2SSESTORAGE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0
-_NEON2SSESTORAGE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0
-_NEON2SSESTORAGE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0
-_NEON2SSESTORAGE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0
-_NEON2SSESTORAGE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.U16 q0,q0,q0
-_NEON2SSESTORAGE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0
-//Vector subtract high half
-_NEON2SSESTORAGE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0
-_NEON2SSESTORAGE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0
-_NEON2SSESTORAGE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0
-_NEON2SSESTORAGE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0
-_NEON2SSESTORAGE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0
-_NEON2SSE_GLOBAL uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0
-//Vector rounding subtract high half
-_NEON2SSESTORAGE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0
-_NEON2SSESTORAGE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0
-_NEON2SSESTORAGE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0
-_NEON2SSESTORAGE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0
-_NEON2SSESTORAGE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0
-_NEON2SSE_GLOBAL uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0
-//Comparison
-//Vector compare equal
-_NEON2SSESTORAGE uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0
-_NEON2SSESTORAGE uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0
-_NEON2SSESTORAGE uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0
-_NEON2SSESTORAGE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0
-_NEON2SSESTORAGE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0
-_NEON2SSESTORAGE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0
-_NEON2SSESTORAGE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0
-_NEON2SSE_GLOBAL uint8x8_t vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0
-_NEON2SSE_GLOBAL uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0
-_NEON2SSE_GLOBAL uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0
-_NEON2SSE_GLOBAL uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0
-_NEON2SSESTORAGE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0
-_NEON2SSE_GLOBAL uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0
-_NEON2SSE_GLOBAL uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0
-_NEON2SSE_GLOBAL uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0
-_NEON2SSE_GLOBAL uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0
-//Vector compare greater-than or equal
-_NEON2SSESTORAGE uint8x8_t vcge_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
-_NEON2SSESTORAGE uint16x4_t vcge_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
-_NEON2SSESTORAGE uint32x2_t vcge_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
-_NEON2SSESTORAGE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
-_NEON2SSESTORAGE uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
-_NEON2SSESTORAGE uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0
-_NEON2SSESTORAGE uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
-_NEON2SSESTORAGE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
-_NEON2SSESTORAGE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
-_NEON2SSESTORAGE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
-_NEON2SSESTORAGE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
-_NEON2SSESTORAGE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
-_NEON2SSESTORAGE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0
-_NEON2SSESTORAGE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
-//Vector compare less-than or equal
-_NEON2SSESTORAGE uint8x8_t vcle_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
-_NEON2SSESTORAGE uint16x4_t vcle_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
-_NEON2SSESTORAGE uint32x2_t vcle_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
-_NEON2SSESTORAGE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
-_NEON2SSE_GLOBAL uint8x8_t vcle_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
-_NEON2SSE_GLOBAL uint16x4_t vcle_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0
-_NEON2SSE_GLOBAL uint32x2_t vcle_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
-_NEON2SSESTORAGE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
-_NEON2SSESTORAGE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
-_NEON2SSESTORAGE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
-_NEON2SSESTORAGE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
-_NEON2SSESTORAGE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
-_NEON2SSESTORAGE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0
-_NEON2SSESTORAGE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
-//Vector compare greater-than
-_NEON2SSESTORAGE uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
-_NEON2SSESTORAGE uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
-_NEON2SSESTORAGE uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
-_NEON2SSESTORAGE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
-_NEON2SSESTORAGE uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
-_NEON2SSESTORAGE uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0
-_NEON2SSESTORAGE uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
-_NEON2SSE_GLOBAL uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
-_NEON2SSE_GLOBAL uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
-_NEON2SSE_GLOBAL uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
-_NEON2SSESTORAGE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
-_NEON2SSESTORAGE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
-_NEON2SSESTORAGE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0
-_NEON2SSESTORAGE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
-//Vector compare less-than
-_NEON2SSE_GLOBAL uint8x8_t vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
-_NEON2SSE_GLOBAL uint16x4_t vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
-_NEON2SSE_GLOBAL uint32x2_t vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
-_NEON2SSE_GLOBAL uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
-_NEON2SSE_GLOBAL uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
-_NEON2SSE_GLOBAL uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0
-_NEON2SSE_GLOBAL uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
-_NEON2SSE_GLOBAL uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
-_NEON2SSE_GLOBAL uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
-_NEON2SSE_GLOBAL uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
-_NEON2SSE_GLOBAL uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
-_NEON2SSE_GLOBAL uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
-_NEON2SSE_GLOBAL uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0
-_NEON2SSE_GLOBAL uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
-//Vector compare absolute greater-than or equal
-_NEON2SSESTORAGE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
-_NEON2SSESTORAGE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
-//Vector compare absolute less-than or equal
-_NEON2SSESTORAGE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
-_NEON2SSESTORAGE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
-//Vector compare absolute greater-than
-_NEON2SSESTORAGE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
-_NEON2SSESTORAGE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
-//Vector compare absolute less-than
-_NEON2SSESTORAGE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
-_NEON2SSESTORAGE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
-//Vector test bits
-_NEON2SSESTORAGE uint8x8_t vtst_s8(int8x8_t a, int8x8_t b); // VTST.8 d0, d0, d0
-_NEON2SSESTORAGE uint16x4_t vtst_s16(int16x4_t a, int16x4_t b); // VTST.16 d0, d0, d0
-_NEON2SSESTORAGE uint32x2_t vtst_s32(int32x2_t a, int32x2_t b); // VTST.32 d0, d0, d0
-_NEON2SSE_GLOBAL uint8x8_t vtst_u8(uint8x8_t a, uint8x8_t b); // VTST.8 d0, d0, d0
-_NEON2SSE_GLOBAL uint16x4_t vtst_u16(uint16x4_t a, uint16x4_t b); // VTST.16 d0, d0, d0
-_NEON2SSE_GLOBAL uint32x2_t vtst_u32(uint32x2_t a, uint32x2_t b); // VTST.32 d0, d0, d0
-_NEON2SSE_GLOBAL uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0
-_NEON2SSESTORAGE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0
-_NEON2SSESTORAGE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0
-_NEON2SSESTORAGE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0
-_NEON2SSE_GLOBAL uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0
-_NEON2SSE_GLOBAL uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0
-_NEON2SSE_GLOBAL uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0
-_NEON2SSE_GLOBAL uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0
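/* [Editor's illustration, not part of the deleted header] The comparisons above
 * return all-ones / all-zeros lane masks rather than booleans; the usual
 * follow-up is a bitwise select. vbslq_f32 is assumed from elsewhere in the
 * NEON intrinsic set (it is not declared in the block above). A minimal
 * sketch: */
#include <arm_neon.h>

static float32x4_t max_via_compare(float32x4_t a, float32x4_t b) {
    uint32x4_t ge = vcgeq_f32(a, b);   /* 0xFFFFFFFF where a[i] >= b[i], else 0 */
    return vbslq_f32(ge, a, b);        /* pick a[i] where the mask is set       */
}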
-//Absolute difference
-//Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |
-_NEON2SSESTORAGE int8x8_t vabd_s8(int8x8_t a, int8x8_t b); // VABD.S8 d0,d0,d0
-_NEON2SSESTORAGE int16x4_t vabd_s16(int16x4_t a, int16x4_t b); // VABD.S16 d0,d0,d0
-_NEON2SSESTORAGE int32x2_t vabd_s32(int32x2_t a, int32x2_t b); // VABD.S32 d0,d0,d0
-_NEON2SSESTORAGE uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b); // VABD.U8 d0,d0,d0
-_NEON2SSESTORAGE uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b); // VABD.U16 d0,d0,d0
-_NEON2SSESTORAGE uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b); // VABD.U32 d0,d0,d0
-_NEON2SSESTORAGE float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0
-_NEON2SSESTORAGE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0
-_NEON2SSESTORAGE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0
-_NEON2SSESTORAGE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0
-_NEON2SSESTORAGE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0
-_NEON2SSESTORAGE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.U16 q0,q0,q0
-_NEON2SSESTORAGE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0
-_NEON2SSESTORAGE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0
-//Absolute difference - long
-_NEON2SSESTORAGE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0
-_NEON2SSESTORAGE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0
-_NEON2SSESTORAGE int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0
-_NEON2SSESTORAGE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0
-_NEON2SSESTORAGE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.U16 q0,d0,d0
-_NEON2SSESTORAGE uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0
-//Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] |
-_NEON2SSESTORAGE int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0
-_NEON2SSESTORAGE int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0
-_NEON2SSESTORAGE int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0
-_NEON2SSESTORAGE uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0
-_NEON2SSESTORAGE uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VABA.U16 d0,d0,d0
-_NEON2SSESTORAGE uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0
-_NEON2SSESTORAGE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0
-_NEON2SSESTORAGE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0
-_NEON2SSESTORAGE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0
-_NEON2SSESTORAGE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0
-_NEON2SSESTORAGE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.U16 q0,q0,q0
-_NEON2SSESTORAGE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0
-//Absolute difference and accumulate - long
-_NEON2SSESTORAGE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0
-_NEON2SSESTORAGE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0
-_NEON2SSESTORAGE int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0
-_NEON2SSESTORAGE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0
-_NEON2SSESTORAGE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.U16 q0,d0,d0
-_NEON2SSESTORAGE uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0
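/* [Editor's illustration, not part of the deleted header] A sketch of the
 * absolute-difference-and-accumulate pattern above, e.g. one step of a sum of
 * absolute differences over 8-byte rows. vld1_u8 is assumed from the load
 * section further down. */
#include <arm_neon.h>

static uint16x8_t sad_row_step(uint16x8_t acc, const uint8_t *p, const uint8_t *q) {
    return vabal_u8(acc, vld1_u8(p), vld1_u8(q));   /* acc[i] += |p[i] - q[i]| */
}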
-//Max/Min
-//vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i]
-_NEON2SSESTORAGE int8x8_t vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0
-_NEON2SSESTORAGE int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0
-_NEON2SSESTORAGE int32x2_t vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0
-_NEON2SSESTORAGE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0
-_NEON2SSESTORAGE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.U16 d0,d0,d0
-_NEON2SSESTORAGE uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0
-_NEON2SSESTORAGE float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0
-_NEON2SSE_GLOBAL int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0
-_NEON2SSE_GLOBAL int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0
-_NEON2SSE_GLOBAL int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0
-_NEON2SSE_GLOBAL uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0
-_NEON2SSE_GLOBAL uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.U16 q0,q0,q0
-_NEON2SSE_GLOBAL uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0
-_NEON2SSE_GLOBAL float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0
-
-_NEON2SSE_GLOBAL float64x2_t vmaxq_f64(float64x2_t a, float64x2_t b); // VMAX.F64 q0,q0,q0
-
-//vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i]
-_NEON2SSESTORAGE int8x8_t vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0
-_NEON2SSESTORAGE int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0
-_NEON2SSESTORAGE int32x2_t vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0
-_NEON2SSESTORAGE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0
-_NEON2SSESTORAGE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.U16 d0,d0,d0
-_NEON2SSESTORAGE uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0
-_NEON2SSESTORAGE float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0
-_NEON2SSE_GLOBAL int8x16_t vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0
-_NEON2SSE_GLOBAL int16x8_t vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0
-_NEON2SSE_GLOBAL int32x4_t vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0
-_NEON2SSE_GLOBAL uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0
-_NEON2SSE_GLOBAL uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.U16 q0,q0,q0
-_NEON2SSE_GLOBAL uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0
-_NEON2SSE_GLOBAL float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0
-
-_NEON2SSE_GLOBAL float64x2_t vminq_f64(float64x2_t a, float64x2_t b); // VMIN.F64 q0,q0,q0
-
-//Pairwise addition
-//Pairwise add
-_NEON2SSESTORAGE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0
-_NEON2SSESTORAGE int16x4_t vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0
-_NEON2SSESTORAGE int32x2_t vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0
-_NEON2SSESTORAGE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0
-_NEON2SSESTORAGE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0
-_NEON2SSESTORAGE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0
-_NEON2SSESTORAGE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b); // VPADD.F32 d0,d0,d0
-//Long pairwise add
-_NEON2SSESTORAGE int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0
-_NEON2SSESTORAGE int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0
-_NEON2SSESTORAGE int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0
-_NEON2SSESTORAGE uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0
-_NEON2SSESTORAGE uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.U16 d0,d0
-_NEON2SSESTORAGE uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0
-_NEON2SSESTORAGE int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0
-_NEON2SSESTORAGE int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0
-_NEON2SSESTORAGE int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0
-_NEON2SSESTORAGE uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0
-_NEON2SSESTORAGE uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.U16 q0,q0
-_NEON2SSESTORAGE uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
-//Long pairwise add and accumulate
-_NEON2SSESTORAGE int16x4_t vpadal_s8(int16x4_t a, int8x8_t b); // VPADAL.S8 d0,d0
-_NEON2SSESTORAGE int32x2_t vpadal_s16(int32x2_t a, int16x4_t b); // VPADAL.S16 d0,d0
-_NEON2SSESTORAGE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0
-_NEON2SSESTORAGE uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b); // VPADAL.U8 d0,d0
-_NEON2SSESTORAGE uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b); // VPADAL.U16 d0,d0
-_NEON2SSESTORAGE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0
-_NEON2SSESTORAGE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0
-_NEON2SSESTORAGE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0
-_NEON2SSESTORAGE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0
-_NEON2SSESTORAGE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0
-_NEON2SSESTORAGE uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.U16 q0,q0
-_NEON2SSESTORAGE uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0
-//Folding maximum vpmax -> takes maximum of adjacent pairs
-_NEON2SSESTORAGE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0
-_NEON2SSESTORAGE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0
-_NEON2SSESTORAGE int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0
-_NEON2SSESTORAGE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0
-_NEON2SSESTORAGE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.U16 d0,d0,d0
-_NEON2SSESTORAGE uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0
-_NEON2SSESTORAGE float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0
-//Folding minimum vpmin -> takes minimum of adjacent pairs
-_NEON2SSESTORAGE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0
-_NEON2SSESTORAGE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0
-_NEON2SSESTORAGE int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0
-_NEON2SSESTORAGE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0
-_NEON2SSESTORAGE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.U16 d0,d0,d0
-_NEON2SSESTORAGE uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0
-_NEON2SSESTORAGE float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0
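/* [Editor's illustration, not part of the deleted header] The folding
 * maximum/minimum above work on adjacent pairs, so repeating them folds a whole
 * d-register down to one lane. vget_lane_u8 is assumed from elsewhere in the
 * intrinsic set. */
#include <arm_neon.h>

static uint8_t horizontal_max_u8(uint8x8_t v) {
    v = vpmax_u8(v, v);            /* 8 lanes -> 4 pairwise maxima */
    v = vpmax_u8(v, v);            /* 4 -> 2                       */
    v = vpmax_u8(v, v);            /* 2 -> 1                       */
    return vget_lane_u8(v, 0);     /* the overall maximum          */
}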
-//Reciprocal/Sqrt
-_NEON2SSESTORAGE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0
-_NEON2SSESTORAGE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0
-_NEON2SSESTORAGE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0
-_NEON2SSESTORAGE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0
-//Shifts by signed variable
-//Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right)
-_NEON2SSESTORAGE int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0
-_NEON2SSESTORAGE int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0
-_NEON2SSESTORAGE int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0
-_NEON2SSESTORAGE int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0
-_NEON2SSESTORAGE uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0
-_NEON2SSESTORAGE uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.U16 d0,d0,d0
-_NEON2SSESTORAGE uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0
-_NEON2SSESTORAGE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0
-_NEON2SSESTORAGE int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0
-_NEON2SSESTORAGE int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0
-_NEON2SSESTORAGE int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0
-_NEON2SSESTORAGE int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0
-_NEON2SSESTORAGE uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0
-_NEON2SSESTORAGE uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.U16 q0,q0,q0
-_NEON2SSESTORAGE uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0
-_NEON2SSESTORAGE uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0
-//Vector saturating shift left: (negative values shift right)
-_NEON2SSESTORAGE int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0
-_NEON2SSESTORAGE int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0
-_NEON2SSESTORAGE int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0
-_NEON2SSESTORAGE int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0
-_NEON2SSESTORAGE uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0
-_NEON2SSESTORAGE uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.U16 d0,d0,d0
-_NEON2SSESTORAGE uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0
-_NEON2SSESTORAGE uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0
-_NEON2SSESTORAGE int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0
-_NEON2SSESTORAGE int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0
-_NEON2SSESTORAGE int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0
-_NEON2SSESTORAGE int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0
-_NEON2SSESTORAGE uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0
-_NEON2SSESTORAGE uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.U16 q0,q0,q0
-_NEON2SSESTORAGE uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0
-_NEON2SSESTORAGE uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0
-//Vector rounding shift left: (negative values shift right)
-_NEON2SSESTORAGE int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0
-_NEON2SSESTORAGE int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0
-_NEON2SSESTORAGE int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0
-_NEON2SSESTORAGE int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0
-_NEON2SSESTORAGE uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0
-_NEON2SSESTORAGE uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.U16 d0,d0,d0
-_NEON2SSESTORAGE uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0
-_NEON2SSESTORAGE uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0
-_NEON2SSESTORAGE int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0
-_NEON2SSESTORAGE int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0
-_NEON2SSESTORAGE int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0
-_NEON2SSESTORAGE int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0
-_NEON2SSESTORAGE uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0
-_NEON2SSESTORAGE uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.U16 q0,q0,q0
-_NEON2SSESTORAGE uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0
-_NEON2SSESTORAGE uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0
-//Vector saturating rounding shift left: (negative values shift right)
-_NEON2SSESTORAGE int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0
-_NEON2SSESTORAGE int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0
-_NEON2SSESTORAGE int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0
-_NEON2SSESTORAGE int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0
-_NEON2SSESTORAGE uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0
-_NEON2SSESTORAGE uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.U16 d0,d0,d0
-_NEON2SSESTORAGE uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0
-_NEON2SSESTORAGE uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0
-_NEON2SSESTORAGE int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0
-_NEON2SSESTORAGE int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0
-_NEON2SSESTORAGE int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0
-_NEON2SSESTORAGE int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0
-_NEON2SSESTORAGE uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0
-_NEON2SSESTORAGE uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.U16 q0,q0,q0
-_NEON2SSESTORAGE uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0
-_NEON2SSESTORAGE uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0
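/* [Editor's illustration, not part of the deleted header] As the comments above
 * note, the per-lane shift count is signed and negative counts shift right.
 * vdup_n_s16 (assumed from elsewhere in the intrinsic set) broadcasts the
 * count. */
#include <arm_neon.h>

static int16x4_t shift_right_by_two(int16x4_t a) {
    return vshl_s16(a, vdup_n_s16(-2));   /* each lane: a[i] >> 2 */
}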
-//Shifts by a constant
-//Vector shift right by constant
-_NEON2SSESTORAGE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8
-_NEON2SSESTORAGE int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VSHR.S16 d0,d0,#16
-_NEON2SSESTORAGE int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VSHR.S32 d0,d0,#32
-_NEON2SSESTORAGE int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64
-_NEON2SSESTORAGE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8
-_NEON2SSESTORAGE uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VSHR.U16 d0,d0,#16
-_NEON2SSESTORAGE uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VSHR.U32 d0,d0,#32
-_NEON2SSESTORAGE uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VSHR.U64 d0,d0,#64
-_NEON2SSESTORAGE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8
-_NEON2SSE_GLOBAL int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16
-_NEON2SSE_GLOBAL int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32
-_NEON2SSESTORAGE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64
-_NEON2SSESTORAGE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8
-_NEON2SSE_GLOBAL uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.U16 q0,q0,#16
-_NEON2SSE_GLOBAL uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32
-_NEON2SSE_GLOBAL uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64
-//Vector shift left by constant
-_NEON2SSESTORAGE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
-_NEON2SSESTORAGE int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
-_NEON2SSESTORAGE int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
-_NEON2SSESTORAGE int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
-_NEON2SSESTORAGE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
-_NEON2SSE_GLOBAL uint16x4_t vshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
-_NEON2SSE_GLOBAL uint32x2_t vshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
-_NEON2SSE_GLOBAL uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
-_NEON2SSE_GLOBAL int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
-_NEON2SSE_GLOBAL int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
-_NEON2SSE_GLOBAL int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
-_NEON2SSE_GLOBAL int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
-_NEON2SSESTORAGE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
-_NEON2SSE_GLOBAL uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
-_NEON2SSE_GLOBAL uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
-_NEON2SSE_GLOBAL uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
-//Vector rounding shift right by constant
-_NEON2SSESTORAGE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8
-_NEON2SSESTORAGE int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16
-_NEON2SSESTORAGE int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32
-_NEON2SSESTORAGE int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64
-_NEON2SSESTORAGE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8
-_NEON2SSESTORAGE uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VRSHR.U16 d0,d0,#16
-_NEON2SSESTORAGE uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32
-_NEON2SSESTORAGE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64
-_NEON2SSESTORAGE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8
-_NEON2SSESTORAGE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16
-_NEON2SSESTORAGE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32
-_NEON2SSESTORAGE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64
-_NEON2SSESTORAGE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8
-_NEON2SSESTORAGE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.U16 q0,q0,#16
-_NEON2SSESTORAGE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32
-_NEON2SSESTORAGE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64
-//Vector shift right by constant and accumulate
-_NEON2SSESTORAGE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8
-_NEON2SSESTORAGE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16
-_NEON2SSESTORAGE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32
-_NEON2SSESTORAGE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64
-_NEON2SSESTORAGE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8
-_NEON2SSESTORAGE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.U16 d0,d0,#16
-_NEON2SSESTORAGE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRA.U32 d0,d0,#32
-_NEON2SSESTORAGE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64
-_NEON2SSESTORAGE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8
-_NEON2SSESTORAGE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16
-_NEON2SSESTORAGE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32
-_NEON2SSESTORAGE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64
-_NEON2SSESTORAGE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8
-_NEON2SSESTORAGE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.U16 q0,q0,#16
-_NEON2SSESTORAGE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32
-_NEON2SSESTORAGE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64
-//Vector rounding shift right by constant and accumulate
-_NEON2SSESTORAGE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8
-_NEON2SSESTORAGE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16
-_NEON2SSESTORAGE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32
-_NEON2SSESTORAGE int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64
-_NEON2SSESTORAGE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8
-_NEON2SSESTORAGE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.U16 d0,d0,#16
-_NEON2SSESTORAGE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32
-_NEON2SSESTORAGE uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64
-_NEON2SSESTORAGE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8
-_NEON2SSESTORAGE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16
-_NEON2SSESTORAGE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32
-_NEON2SSESTORAGE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64
-_NEON2SSESTORAGE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8
-_NEON2SSESTORAGE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.U16 q0,q0,#16
-_NEON2SSESTORAGE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32
-_NEON2SSESTORAGE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64
-//Vector saturating shift left by constant
-_NEON2SSESTORAGE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0
-_NEON2SSESTORAGE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0
-_NEON2SSESTORAGE int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0
-_NEON2SSESTORAGE int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0
-_NEON2SSESTORAGE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0
-_NEON2SSESTORAGE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.U16 d0,d0,#0
-_NEON2SSESTORAGE uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0
-_NEON2SSESTORAGE uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0
-_NEON2SSESTORAGE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0
-_NEON2SSESTORAGE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0
-_NEON2SSESTORAGE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0
-_NEON2SSESTORAGE int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0
-_NEON2SSESTORAGE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0
-_NEON2SSESTORAGE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.U16 q0,q0,#0
-_NEON2SSESTORAGE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0
-_NEON2SSESTORAGE uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0
-//Vector signed->unsigned saturating shift left by constant
-_NEON2SSESTORAGE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0
-_NEON2SSESTORAGE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0
-_NEON2SSESTORAGE uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0
-_NEON2SSESTORAGE uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0
-_NEON2SSESTORAGE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0
-_NEON2SSESTORAGE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0
-_NEON2SSESTORAGE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0
-_NEON2SSESTORAGE uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0
-//Vector narrowing shift right by constant
-_NEON2SSESTORAGE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
-_NEON2SSESTORAGE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
-_NEON2SSESTORAGE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
-_NEON2SSESTORAGE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
-_NEON2SSESTORAGE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
-_NEON2SSESTORAGE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
-//Vector signed->unsigned narrowing saturating shift right by constant
-_NEON2SSESTORAGE uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8
-_NEON2SSESTORAGE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16
-_NEON2SSESTORAGE uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32
-//Vector signed->unsigned rounding narrowing saturating shift right by constant
-_NEON2SSESTORAGE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8
-_NEON2SSESTORAGE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16
-_NEON2SSESTORAGE uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32
-//Vector narrowing saturating shift right by constant
-_NEON2SSESTORAGE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8
-_NEON2SSESTORAGE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16
-_NEON2SSESTORAGE int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32
-_NEON2SSESTORAGE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.U16 d0,q0,#8
-_NEON2SSESTORAGE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16
-_NEON2SSESTORAGE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32
-//Vector rounding narrowing shift right by constant
-_NEON2SSESTORAGE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
-_NEON2SSESTORAGE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
-_NEON2SSESTORAGE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
-_NEON2SSESTORAGE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
-_NEON2SSESTORAGE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
-_NEON2SSESTORAGE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
-//Vector rounding narrowing saturating shift right by constant
-_NEON2SSESTORAGE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8
-_NEON2SSESTORAGE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16
-_NEON2SSESTORAGE int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32
-_NEON2SSESTORAGE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.U16 d0,q0,#8
-_NEON2SSESTORAGE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16
-_NEON2SSESTORAGE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32
-//Vector widening shift left by constant
-_NEON2SSESTORAGE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0
-_NEON2SSESTORAGE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0
-_NEON2SSESTORAGE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0
-_NEON2SSESTORAGE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0
-_NEON2SSESTORAGE uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.U16 q0,d0,#0
-_NEON2SSESTORAGE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0
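/* [Editor's illustration, not part of the deleted header] The widening multiply
 * (vmull_u8) and narrowing shift (vshrn_n_u16) declared above are typical
 * building blocks for the kind of 8-bit blending these AGS commits optimize; a
 * hedged sketch of (src*a + dst*(255-a)) >> 8 per byte. vdup_n_u8 and
 * vaddq_u16 are assumed from elsewhere in the intrinsic set. */
#include <arm_neon.h>

static uint8x8_t blend_u8(uint8x8_t src, uint8x8_t dst, uint8x8_t alpha) {
    uint16x8_t s = vmull_u8(src, alpha);                            /* src * a       */
    uint16x8_t d = vmull_u8(dst, vsub_u8(vdup_n_u8(255), alpha));   /* dst * (255-a) */
    return vshrn_n_u16(vaddq_u16(s, d), 8);                         /* >> 8, narrow  */
}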
-//Shifts with insert
-//Vector shift right and insert
-_NEON2SSESTORAGE int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
-_NEON2SSESTORAGE int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
-_NEON2SSESTORAGE int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
-_NEON2SSESTORAGE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
-_NEON2SSE_GLOBAL uint8x8_t vsri_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
-_NEON2SSE_GLOBAL uint16x4_t vsri_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
-_NEON2SSE_GLOBAL uint32x2_t vsri_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
-_NEON2SSE_GLOBAL uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
-_NEON2SSE_GLOBAL poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
-_NEON2SSE_GLOBAL poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
-_NEON2SSESTORAGE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
-_NEON2SSESTORAGE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
-_NEON2SSESTORAGE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
-_NEON2SSESTORAGE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
-_NEON2SSE_GLOBAL uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
-_NEON2SSE_GLOBAL uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
-_NEON2SSE_GLOBAL uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
-_NEON2SSE_GLOBAL uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
-_NEON2SSE_GLOBAL poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
-_NEON2SSE_GLOBAL poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
-//Vector shift left and insert
-_NEON2SSESTORAGE int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
-_NEON2SSESTORAGE int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
-_NEON2SSESTORAGE int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
-_NEON2SSESTORAGE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
-_NEON2SSE_GLOBAL uint8x8_t vsli_n_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
-_NEON2SSE_GLOBAL uint16x4_t vsli_n_u16(uint16x4_t a, uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
-_NEON2SSE_GLOBAL uint32x2_t vsli_n_u32(uint32x2_t a, uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
-_NEON2SSE_GLOBAL uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
-_NEON2SSE_GLOBAL poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
-_NEON2SSE_GLOBAL poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
-_NEON2SSESTORAGE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
-_NEON2SSESTORAGE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
-_NEON2SSESTORAGE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
-_NEON2SSESTORAGE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
-_NEON2SSE_GLOBAL uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
-_NEON2SSE_GLOBAL uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
-_NEON2SSE_GLOBAL uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
-_NEON2SSE_GLOBAL uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
-_NEON2SSE_GLOBAL poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
-_NEON2SSE_GLOBAL poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
-//Loads of a single vector or lane. Perform loads and stores of a single vector of some type.
-//Load a single vector from memory
-_NEON2SSE_GLOBAL uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
-_NEON2SSE_GLOBAL uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
-_NEON2SSE_GLOBAL uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
-_NEON2SSE_GLOBAL uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
-_NEON2SSE_GLOBAL int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
-_NEON2SSE_GLOBAL int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
-_NEON2SSE_GLOBAL int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
-_NEON2SSE_GLOBAL int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
-_NEON2SSE_GLOBAL float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0]
-_NEON2SSESTORAGE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
-_NEON2SSE_GLOBAL poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
-_NEON2SSE_GLOBAL poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
-_NEON2SSE_GLOBAL uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0]
-_NEON2SSE_GLOBAL uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0]
-_NEON2SSE_GLOBAL uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0]
-_NEON2SSE_GLOBAL uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
-_NEON2SSE_GLOBAL int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0]
-_NEON2SSE_GLOBAL int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0]
-_NEON2SSE_GLOBAL int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0]
-_NEON2SSE_GLOBAL int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
-_NEON2SSE_GLOBAL float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0]
-_NEON2SSESTORAGE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0]
-_NEON2SSE_GLOBAL poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0]
-_NEON2SSE_GLOBAL poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0]
-
-_NEON2SSESTORAGE float64x2_t vld1q_f64(__transfersize(4) float64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
-
-//Load a single lane from memory
-_NEON2SSE_GLOBAL uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
-_NEON2SSE_GLOBAL uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
-_NEON2SSE_GLOBAL uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
-_NEON2SSE_GLOBAL uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
-_NEON2SSE_GLOBAL int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
-_NEON2SSE_GLOBAL int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0]
-_NEON2SSE_GLOBAL int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); //VLD1.32 {d0[0]}, [r0]
-_NEON2SSE_GLOBAL float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0]
-_NEON2SSESTORAGE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
-_NEON2SSE_GLOBAL int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); //VLD1.64 {d0}, [r0]
-_NEON2SSE_GLOBAL poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
-_NEON2SSE_GLOBAL poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
-_NEON2SSESTORAGE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0]
-_NEON2SSESTORAGE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
-_NEON2SSESTORAGE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0]
-_NEON2SSESTORAGE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0]
-_NEON2SSE_GLOBAL int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8{d0[0]}, [r0]
-_NEON2SSE_GLOBAL int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
-_NEON2SSE_GLOBAL int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0]
-_NEON2SSE_GLOBAL float16x4_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
-_NEON2SSESTORAGE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
-_NEON2SSE_GLOBAL int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0]
-_NEON2SSE_GLOBAL poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0]
-_NEON2SSE_GLOBAL poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
-//Load all lanes of vector with same value from memory
-_NEON2SSE_GLOBAL uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
-_NEON2SSE_GLOBAL uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
-_NEON2SSE_GLOBAL uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
-_NEON2SSESTORAGE uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
-_NEON2SSE_GLOBAL int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
-_NEON2SSE_GLOBAL int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
-_NEON2SSE_GLOBAL int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
-_NEON2SSE_GLOBAL int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
-_NEON2SSE_GLOBAL float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
-_NEON2SSE_GLOBAL float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
-_NEON2SSE_GLOBAL poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
-_NEON2SSE_GLOBAL poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
-_NEON2SSESTORAGE uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
-_NEON2SSESTORAGE uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
-_NEON2SSESTORAGE uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
-_NEON2SSESTORAGE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
-_NEON2SSE_GLOBAL int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
-_NEON2SSE_GLOBAL int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
-_NEON2SSE_GLOBAL int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
-_NEON2SSE_GLOBAL int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
-_NEON2SSE_GLOBAL float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
-_NEON2SSESTORAGE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
-_NEON2SSE_GLOBAL poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
-_NEON2SSE_GLOBAL poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
-//Store a single vector or lane. Stores all lanes or a single lane of a vector.
-//Store a single vector into memory
-_NEON2SSE_GLOBAL void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0]
-_NEON2SSE_GLOBAL void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0]
-_NEON2SSE_GLOBAL void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0]
-_NEON2SSE_GLOBAL void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0]
-_NEON2SSE_GLOBAL void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0]
-_NEON2SSE_GLOBAL void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0]
-_NEON2SSE_GLOBAL void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0]
-_NEON2SSE_GLOBAL void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0]
-_NEON2SSE_GLOBAL void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0]
-_NEON2SSESTORAGE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0]
-_NEON2SSE_GLOBAL void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0]
-_NEON2SSE_GLOBAL void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0]
-_NEON2SSESTORAGE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0]
-_NEON2SSESTORAGE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0]
-_NEON2SSESTORAGE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0]
-_NEON2SSESTORAGE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0]
-_NEON2SSE_GLOBAL void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0]
-_NEON2SSE_GLOBAL void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0]
-_NEON2SSE_GLOBAL void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0]
-_NEON2SSE_GLOBAL void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0]
-_NEON2SSE_GLOBAL void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0]
-_NEON2SSESTORAGE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0]
-_NEON2SSE_GLOBAL void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0]
-_NEON2SSE_GLOBAL void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0]
-//Store a lane of a vector into memory
-//Loads of an N-element structure
-//Load N-element structure from memory
-_NEON2SSESTORAGE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
-_NEON2SSESTORAGE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
-_NEON2SSESTORAGE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
-_NEON2SSE_GLOBAL int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
-_NEON2SSE_GLOBAL int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
-_NEON2SSE_GLOBAL int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
-_NEON2SSE_GLOBAL float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0]
-_NEON2SSESTORAGE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
-_NEON2SSE_GLOBAL poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
-_NEON2SSE_GLOBAL poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
-_NEON2SSESTORAGE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
-_NEON2SSESTORAGE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
-_NEON2SSESTORAGE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
-_NEON2SSESTORAGE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
-_NEON2SSE_GLOBAL int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
-_NEON2SSE_GLOBAL int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
-_NEON2SSE_GLOBAL int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
-_NEON2SSE_GLOBAL int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
-//float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0]
-_NEON2SSESTORAGE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
-_NEON2SSE_GLOBAL poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
-_NEON2SSE_GLOBAL poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
-_NEON2SSESTORAGE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
-_NEON2SSESTORAGE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
-_NEON2SSESTORAGE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
-_NEON2SSE_GLOBAL int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
-_NEON2SSE_GLOBAL int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
-_NEON2SSE_GLOBAL int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
-_NEON2SSE_GLOBAL float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
-_NEON2SSESTORAGE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
-poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
-_NEON2SSE_GLOBAL poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
-_NEON2SSESTORAGE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
-_NEON2SSESTORAGE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
-_NEON2SSESTORAGE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
-_NEON2SSESTORAGE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
-_NEON2SSE_GLOBAL int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
-_NEON2SSE_GLOBAL int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
-_NEON2SSE_GLOBAL int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
-_NEON2SSE_GLOBAL int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
-_NEON2SSE_GLOBAL float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
-_NEON2SSESTORAGE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
-_NEON2SSE_GLOBAL poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
-_NEON2SSE_GLOBAL poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
-_NEON2SSESTORAGE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
-_NEON2SSESTORAGE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
-_NEON2SSESTORAGE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
-_NEON2SSE_GLOBAL int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
-_NEON2SSE_GLOBAL int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
-_NEON2SSE_GLOBAL int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
-_NEON2SSE_GLOBAL float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
-_NEON2SSESTORAGE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
-_NEON2SSE_GLOBAL poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
-_NEON2SSE_GLOBAL poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
-_NEON2SSESTORAGE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
-_NEON2SSESTORAGE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
-_NEON2SSESTORAGE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
-_NEON2SSESTORAGE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
-_NEON2SSE_GLOBAL int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
-_NEON2SSE_GLOBAL int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
-_NEON2SSE_GLOBAL int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
-_NEON2SSE_GLOBAL int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
-_NEON2SSE_GLOBAL float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
-_NEON2SSESTORAGE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
-_NEON2SSE_GLOBAL poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
-_NEON2SSE_GLOBAL poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
-//Load all lanes of N-element structure with same value from memory
-_NEON2SSESTORAGE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
-_NEON2SSESTORAGE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
-_NEON2SSESTORAGE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
-_NEON2SSE_GLOBAL uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
-_NEON2SSE_GLOBAL int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
-_NEON2SSE_GLOBAL int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
-_NEON2SSE_GLOBAL int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
-_NEON2SSE_GLOBAL int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
-//float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
-_NEON2SSESTORAGE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
-_NEON2SSE_GLOBAL poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
-_NEON2SSE_GLOBAL poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
-_NEON2SSESTORAGE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
-_NEON2SSESTORAGE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
-_NEON2SSESTORAGE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
-_NEON2SSESTORAGE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
-_NEON2SSE_GLOBAL int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
-_NEON2SSE_GLOBAL int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
-_NEON2SSE_GLOBAL int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
-_NEON2SSE_GLOBAL int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
-_NEON2SSE_GLOBAL float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
-_NEON2SSESTORAGE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
-_NEON2SSE_GLOBAL poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
-_NEON2SSE_GLOBAL poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
-_NEON2SSESTORAGE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
-_NEON2SSESTORAGE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
-_NEON2SSESTORAGE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
-_NEON2SSESTORAGE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
-_NEON2SSE_GLOBAL int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
-_NEON2SSE_GLOBAL int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
-_NEON2SSE_GLOBAL int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
-_NEON2SSE_GLOBAL int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
-_NEON2SSE_GLOBAL float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
-_NEON2SSESTORAGE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
-_NEON2SSE_GLOBAL poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
-_NEON2SSE_GLOBAL poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
-//Load a single lane of N-element structure from memory
-//the functions below are modified to deal with the error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned
-_NEON2SSESTORAGE uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
-_NEON2SSESTORAGE uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
-_NEON2SSESTORAGE int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
-_NEON2SSESTORAGE int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
-_NEON2SSE_GLOBAL float16x8x2_t vld2q_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
-_NEON2SSESTORAGE float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
-_NEON2SSE_GLOBAL poly16x8x2_t vld2q_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
-_NEON2SSESTORAGE uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
-_NEON2SSESTORAGE uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
-_NEON2SSESTORAGE uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
-_NEON2SSE_GLOBAL int8x8x2_t vld2_lane_s8(__transfersize(2) int8_t const * ptr, int8x8x2_t src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
-_NEON2SSE_GLOBAL int16x4x2_t vld2_lane_s16(__transfersize(2) int16_t const * ptr, int16x4x2_t src, __constrange(0,3) int lane); //VLD2.16 {d0[0], d1[0]}, [r0]
-_NEON2SSE_GLOBAL int32x2x2_t vld2_lane_s32(__transfersize(2) int32_t const * ptr, int32x2x2_t src, __constrange(0,1) int lane); //VLD2.32 {d0[0], d1[0]}, [r0]
-//float16x4x2_t vld2_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
-_NEON2SSESTORAGE float32x2x2_t vld2_lane_f32(__transfersize(2) float32_t const * ptr, float32x2x2_t  src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
-_NEON2SSE_GLOBAL poly8x8x2_t vld2_lane_p8(__transfersize(2) poly8_t const * ptr, poly8x8x2_t  src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
-_NEON2SSE_GLOBAL poly16x4x2_t vld2_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x4x2_t  src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
-_NEON2SSESTORAGE uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
-_NEON2SSESTORAGE uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
-_NEON2SSESTORAGE int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
-_NEON2SSESTORAGE int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
-_NEON2SSE_GLOBAL float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
-_NEON2SSESTORAGE float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
-_NEON2SSE_GLOBAL poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
-_NEON2SSESTORAGE uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
-_NEON2SSESTORAGE uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x4x3_t src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
-_NEON2SSESTORAGE uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
-_NEON2SSE_GLOBAL int8x8x3_t vld3_lane_s8(__transfersize(3) int8_t const * ptr, int8x8x3_t src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
-_NEON2SSE_GLOBAL int16x4x3_t vld3_lane_s16(__transfersize(3) int16_t const * ptr, int16x4x3_t src, __constrange(0,3) int lane); //VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
-_NEON2SSE_GLOBAL int32x2x3_t vld3_lane_s32(__transfersize(3) int32_t const * ptr, int32x2x3_t src, __constrange(0,1) int lane); //VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
-_NEON2SSE_GLOBAL float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
-_NEON2SSESTORAGE float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
-_NEON2SSE_GLOBAL poly8x8x3_t vld3_lane_p8(__transfersize(3) poly8_t const * ptr, poly8x8x3_t src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
-_NEON2SSE_GLOBAL poly16x4x3_t vld3_lane_p16(__transfersize(3) poly16_t const * ptr, poly16x4x3_t src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
-_NEON2SSESTORAGE uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-_NEON2SSESTORAGE uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-_NEON2SSE_GLOBAL int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-_NEON2SSE_GLOBAL int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-_NEON2SSE_GLOBAL float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-_NEON2SSESTORAGE float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-_NEON2SSE_GLOBAL poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-_NEON2SSESTORAGE uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-_NEON2SSESTORAGE uint16x4x4_t vld4_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x4x4_t src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-_NEON2SSESTORAGE uint32x2x4_t vld4_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x2x4_t src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-_NEON2SSE_GLOBAL int8x8x4_t vld4_lane_s8(__transfersize(4) int8_t const * ptr, int8x8x4_t src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-_NEON2SSE_GLOBAL int16x4x4_t vld4_lane_s16(__transfersize(4) int16_t const * ptr, int16x4x4_t src, __constrange(0,3) int lane); //VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-_NEON2SSE_GLOBAL int32x2x4_t vld4_lane_s32(__transfersize(4) int32_t const * ptr, int32x2x4_t src, __constrange(0,1) int lane); //VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-_NEON2SSE_GLOBAL float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-_NEON2SSESTORAGE float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-_NEON2SSE_GLOBAL poly8x8x4_t vld4_lane_p8(__transfersize(4) poly8_t const * ptr, poly8x8x4_t src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-_NEON2SSE_GLOBAL poly16x4x4_t vld4_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x4x4_t src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-//Store N-element structure to memory
-_NEON2SSESTORAGE void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t const * val); // VST2.8 {d0, d2}, [r0]
-_NEON2SSESTORAGE void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t const * val); // VST2.16 {d0, d2}, [r0]
-_NEON2SSESTORAGE void vst2q_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x4x2_t const * val); // VST2.32 {d0, d2}, [r0]
-_NEON2SSE_GLOBAL void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t const * val); // VST2.8 {d0, d2}, [r0]
-_NEON2SSE_GLOBAL void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t const * val); // VST2.16 {d0, d2}, [r0]
-_NEON2SSE_GLOBAL void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t const * val); // VST2.32 {d0, d2}, [r0]
-_NEON2SSE_GLOBAL void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t const * val); // VST2.16 {d0, d2}, [r0]
-_NEON2SSESTORAGE void vst2q_f32_ptr(__transfersize(8) float32_t * ptr, float32x4x2_t const * val); // VST2.32 {d0, d2}, [r0]
-_NEON2SSE_GLOBAL void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t const * val); // VST2.8 {d0, d2}, [r0]
-_NEON2SSE_GLOBAL void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t const * val); // VST2.16 {d0, d2}, [r0]
-_NEON2SSESTORAGE void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val); // VST2.8 {d0, d1}, [r0]
-_NEON2SSESTORAGE void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val); // VST2.16 {d0, d1}, [r0]
-_NEON2SSESTORAGE void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val); // VST2.32 {d0, d1}, [r0]
-_NEON2SSESTORAGE void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val); // VST1.64 {d0, d1}, [r0]
-_NEON2SSE_GLOBAL void vst2_s8(__transfersize(16) int8_t * ptr, int8x8x2_t val); // VST2.8 {d0, d1}, [r0]
-_NEON2SSE_GLOBAL void vst2_s16(__transfersize(8) int16_t * ptr, int16x4x2_t val); // VST2.16 {d0, d1}, [r0]
-_NEON2SSE_GLOBAL void vst2_s32(__transfersize(4) int32_t * ptr, int32x2x2_t val); // VST2.32 {d0, d1}, [r0]
-_NEON2SSE_GLOBAL void vst2_s64(__transfersize(2) int64_t * ptr, int64x1x2_t val); // VST1.64 {d0, d1}, [r0]
-//void vst2_f16_ptr(__transfersize(8) __fp16 * ptr, float16x4x2_t const * val); // VST2.16 {d0, d1}, [r0]
-_NEON2SSE_GLOBAL void vst2_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x2_t const * val); // VST2.32 {d0, d1}, [r0]
-_NEON2SSE_GLOBAL void vst2_p8(__transfersize(16) poly8_t * ptr, poly8x8x2_t val); // VST2.8 {d0, d1}, [r0]
-_NEON2SSE_GLOBAL void vst2_p16(__transfersize(8) poly16_t * ptr, poly16x4x2_t val); // VST2.16 {d0, d1}, [r0]
-_NEON2SSESTORAGE void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t const * val); // VST3.8 {d0, d2, d4}, [r0]
-_NEON2SSESTORAGE void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t const * val); // VST3.16 {d0, d2, d4}, [r0]
-_NEON2SSESTORAGE void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t const * val); // VST3.32 {d0, d2, d4}, [r0]
-_NEON2SSE_GLOBAL void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t const * val); // VST3.8 {d0, d2, d4}, [r0]
-_NEON2SSE_GLOBAL void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t const * val); // VST3.16 {d0, d2, d4}, [r0]
-_NEON2SSE_GLOBAL void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t const * val); // VST3.32 {d0, d2, d4}, [r0]
-_NEON2SSE_GLOBAL void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t const * val); // VST3.16 {d0, d2, d4}, [r0]
-_NEON2SSESTORAGE void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t const * val); // VST3.32 {d0, d2, d4}, [r0]
-_NEON2SSE_GLOBAL void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t const * val); // VST3.8 {d0, d2, d4}, [r0]
-_NEON2SSE_GLOBAL void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t const * val); // VST3.16 {d0, d2, d4}, [r0]
-_NEON2SSESTORAGE void vst3_u8(__transfersize(24) uint8_t * ptr, uint8x8x3_t val); // VST3.8 {d0, d1, d2}, [r0]
-_NEON2SSESTORAGE void vst3_u16(__transfersize(12) uint16_t * ptr, uint16x4x3_t val); // VST3.16 {d0, d1, d2}, [r0]
-_NEON2SSESTORAGE void vst3_u32(__transfersize(6) uint32_t * ptr, uint32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
-_NEON2SSESTORAGE void vst3_u64(__transfersize(3) uint64_t * ptr, uint64x1x3_t val); // VST1.64 {d0, d1, d2}, [r0]
-_NEON2SSE_GLOBAL void vst3_s8(__transfersize(24) int8_t * ptr, int8x8x3_t val); // VST3.8 {d0, d1, d2}, [r0]
-_NEON2SSE_GLOBAL void vst3_s16(__transfersize(12) int16_t * ptr, int16x4x3_t val); // VST3.16 {d0, d1, d2}, [r0]
-_NEON2SSE_GLOBAL void vst3_s32(__transfersize(6) int32_t * ptr, int32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
-_NEON2SSE_GLOBAL void vst3_s64(__transfersize(3) int64_t * ptr, int64x1x3_t val); // VST1.64 {d0, d1, d2}, [r0]
-_NEON2SSE_GLOBAL void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t const * val); // VST3.16 {d0, d1, d2}, [r0]
-_NEON2SSESTORAGE void vst3_f32(__transfersize(6) float32_t * ptr, float32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
-_NEON2SSE_GLOBAL void vst3_p8(__transfersize(24) poly8_t * ptr, poly8x8x3_t val); // VST3.8 {d0, d1, d2}, [r0]
-_NEON2SSE_GLOBAL void vst3_p16(__transfersize(12) poly16_t * ptr, poly16x4x3_t val); // VST3.16 {d0, d1, d2}, [r0]
-_NEON2SSESTORAGE void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t const * val); // VST4.8 {d0, d2, d4, d6}, [r0]
-_NEON2SSESTORAGE void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t const * val); // VST4.16 {d0, d2, d4, d6}, [r0]
-_NEON2SSESTORAGE void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t const * val); // VST4.32 {d0, d2, d4, d6}, [r0]
-_NEON2SSE_GLOBAL void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t const * val); // VST4.8 {d0, d2, d4, d6}, [r0]
-_NEON2SSE_GLOBAL void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t const * val); // VST4.16 {d0, d2, d4, d6}, [r0]
-_NEON2SSE_GLOBAL void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t const * val); // VST4.32 {d0, d2, d4, d6}, [r0]
-_NEON2SSE_GLOBAL void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t const * val); // VST4.16 {d0, d2, d4, d6}, [r0]
-_NEON2SSESTORAGE void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t const * val); // VST4.32 {d0, d2, d4, d6}, [r0]
-_NEON2SSE_GLOBAL void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t const * val); // VST4.8 {d0, d2, d4, d6}, [r0]
-_NEON2SSE_GLOBAL void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t const * val); // VST4.16 {d0, d2, d4, d6}, [r0]
-_NEON2SSESTORAGE void vst4_u8(__transfersize(32) uint8_t * ptr, uint8x8x4_t val); // VST4.8 {d0, d1, d2, d3}, [r0]
-_NEON2SSESTORAGE void vst4_u16(__transfersize(16) uint16_t * ptr, uint16x4x4_t val); // VST4.16 {d0, d1, d2, d3}, [r0]
-_NEON2SSESTORAGE void vst4_u32(__transfersize(8) uint32_t * ptr, uint32x2x4_t val); // VST4.32 {d0, d1, d2, d3}, [r0]
-_NEON2SSESTORAGE void vst4_u64(__transfersize(4) uint64_t * ptr, uint64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0]
-_NEON2SSE_GLOBAL void vst4_s8(__transfersize(32) int8_t * ptr, int8x8x4_t val); // VST4.8 {d0, d1, d2, d3}, [r0]
-_NEON2SSE_GLOBAL void vst4_s16(__transfersize(16) int16_t * ptr, int16x4x4_t val); // VST4.16 {d0, d1, d2, d3}, [r0]
-_NEON2SSE_GLOBAL void vst4_s32(__transfersize(8) int32_t * ptr, int32x2x4_t val); // VST4.32 {d0, d1, d2, d3}, [r0]
-_NEON2SSE_GLOBAL void vst4_s64(__transfersize(4) int64_t * ptr, int64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0]
-_NEON2SSE_GLOBAL void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t const * val); // VST4.16 {d0, d1, d2, d3}, [r0]
-_NEON2SSESTORAGE void vst4_f32(__transfersize(8) float32_t * ptr, float32x2x4_t val); // VST4.32 {d0, d1, d2, d3}, [r0]
-_NEON2SSE_GLOBAL void vst4_p8(__transfersize(32) poly8_t * ptr, poly8x8x4_t val); // VST4.8 {d0, d1, d2, d3}, [r0]
-_NEON2SSE_GLOBAL void vst4_p16(__transfersize(16) poly16_t * ptr, poly16x4x4_t val); // VST4.16 {d0, d1, d2, d3}, [r0]
-//Store a single lane of N-element structure to memory
-_NEON2SSESTORAGE void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t const * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
-_NEON2SSESTORAGE void vst2q_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x4x2_t const * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0]
-_NEON2SSE_GLOBAL void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t const * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
-_NEON2SSE_GLOBAL void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t const * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0]
-_NEON2SSE_GLOBAL void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t const * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
-_NEON2SSESTORAGE void vst2q_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x4x2_t const * val, __constrange(0,3) int lane); //VST2.32 {d0[0], d2[0]}, [r0]
-_NEON2SSE_GLOBAL void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t const * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
-_NEON2SSESTORAGE void vst2_lane_u8(__transfersize(2) uint8_t * ptr, uint8x8x2_t val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0]
-_NEON2SSESTORAGE void vst2_lane_u16(__transfersize(2) uint16_t * ptr, uint16x4x2_t val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
-_NEON2SSESTORAGE void vst2_lane_u32(__transfersize(2) uint32_t * ptr, uint32x2x2_t val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
-_NEON2SSE_GLOBAL void vst2_lane_s8(__transfersize(2) int8_t * ptr, int8x8x2_t val, __constrange(0,7) int lane); // VST2.8 {d0[0],d1[0]}, [r0]
-_NEON2SSE_GLOBAL void vst2_lane_s16(__transfersize(2) int16_t * ptr, int16x4x2_t val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
-_NEON2SSE_GLOBAL void vst2_lane_s32(__transfersize(2) int32_t * ptr, int32x2x2_t val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
-_NEON2SSE_GLOBAL void vst2_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x4x2_t const * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
-_NEON2SSESTORAGE void vst2_lane_f32(__transfersize(2) float32_t * ptr, float32x2x2_t val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
-_NEON2SSE_GLOBAL void vst2_lane_p8(__transfersize(2) poly8_t * ptr, poly8x8x2_t val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0]
-_NEON2SSE_GLOBAL void vst2_lane_p16(__transfersize(2) poly16_t * ptr, poly16x4x2_t val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
-_NEON2SSESTORAGE void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t const * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
-_NEON2SSESTORAGE void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t const * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0]
-_NEON2SSE_GLOBAL void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t const * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
-_NEON2SSE_GLOBAL void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t const * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0]
-_NEON2SSE_GLOBAL void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t const * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
-_NEON2SSESTORAGE void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t const * val, __constrange(0,3) int lane); //VST3.32 {d0[0], d2[0], d4[0]}, [r0]
-_NEON2SSE_GLOBAL void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t const * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
-_NEON2SSESTORAGE void vst3_lane_u8(__transfersize(3) uint8_t * ptr, uint8x8x3_t val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0]
-_NEON2SSESTORAGE void vst3_lane_u16(__transfersize(3) uint16_t * ptr, uint16x4x3_t val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
-_NEON2SSESTORAGE void vst3_lane_u32(__transfersize(3) uint32_t * ptr, uint32x2x3_t val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
-_NEON2SSE_GLOBAL void vst3_lane_s8(__transfersize(3) int8_t * ptr, int8x8x3_t val, __constrange(0,7) int lane); // VST3.8 {d0[0],d1[0], d2[0]}, [r0]
-_NEON2SSE_GLOBAL void vst3_lane_s16(__transfersize(3) int16_t * ptr, int16x4x3_t val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
-_NEON2SSE_GLOBAL void vst3_lane_s32(__transfersize(3) int32_t * ptr, int32x2x3_t val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
-_NEON2SSE_GLOBAL void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t const * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
-_NEON2SSESTORAGE void vst3_lane_f32(__transfersize(3) float32_t * ptr, float32x2x3_t val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
-_NEON2SSE_GLOBAL void vst3_lane_p8(__transfersize(3) poly8_t * ptr, poly8x8x3_t val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0]
-_NEON2SSE_GLOBAL void vst3_lane_p16(__transfersize(3) poly16_t * ptr, poly16x4x3_t val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
-_NEON2SSESTORAGE void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t const * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
-_NEON2SSESTORAGE void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t const * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0]
-_NEON2SSE_GLOBAL void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t const * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
-_NEON2SSE_GLOBAL void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t const * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0]
-_NEON2SSE_GLOBAL void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t const * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
-_NEON2SSESTORAGE void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t const * val, __constrange(0,3) int lane); //VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-_NEON2SSE_GLOBAL void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t const * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
-_NEON2SSESTORAGE void vst4_lane_u8(__transfersize(4) uint8_t * ptr, uint8x8x4_t val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
-_NEON2SSESTORAGE void vst4_lane_u16(__transfersize(4) uint16_t * ptr, uint16x4x4_t val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
-_NEON2SSESTORAGE void vst4_lane_u32(__transfersize(4) uint32_t * ptr, uint32x2x4_t val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
-_NEON2SSE_GLOBAL void vst4_lane_s8(__transfersize(4) int8_t * ptr, int8x8x4_t val, __constrange(0,7) int lane); // VST4.8 {d0[0],d1[0], d2[0], d3[0]}, [r0]
-_NEON2SSE_GLOBAL void vst4_lane_s16(__transfersize(4) int16_t * ptr, int16x4x4_t val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
-_NEON2SSE_GLOBAL void vst4_lane_s32(__transfersize(4) int32_t * ptr, int32x2x4_t val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
-_NEON2SSE_GLOBAL void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t const * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
-_NEON2SSESTORAGE void vst4_lane_f32(__transfersize(4) float32_t * ptr, float32x2x4_t val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
-_NEON2SSE_GLOBAL void vst4_lane_p8(__transfersize(4) poly8_t * ptr, poly8x8x4_t val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
-_NEON2SSE_GLOBAL void vst4_lane_p16(__transfersize(4) poly16_t * ptr, poly16x4x4_t val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
-//Extract lanes from a vector and put into a register. These intrinsics extract a single lane (element) from a vector.
-_NEON2SSE_GLOBAL uint8_t vget_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
-_NEON2SSE_GLOBAL uint16_t vget_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0]
-_NEON2SSE_GLOBAL uint32_t vget_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
-_NEON2SSE_GLOBAL int8_t vget_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VMOV.S8 r0, d0[0]
-_NEON2SSE_GLOBAL int16_t vget_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VMOV.S16 r0, d0[0]
-_NEON2SSE_GLOBAL int32_t vget_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
-_NEON2SSE_GLOBAL poly8_t vget_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
-_NEON2SSE_GLOBAL poly16_t vget_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0]
-_NEON2SSE_GLOBAL float32_t vget_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
-_NEON2SSE_GLOBAL uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
-_NEON2SSE_GLOBAL uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0]
-_NEON2SSE_GLOBAL uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
-_NEON2SSE_GLOBAL int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0]
-_NEON2SSE_GLOBAL int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0]
-_NEON2SSE_GLOBAL int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
-_NEON2SSE_GLOBAL poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
-_NEON2SSE_GLOBAL poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0]
-_NEON2SSESTORAGE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
-_NEON2SSE_GLOBAL int64_t vget_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
-_NEON2SSE_GLOBAL uint64_t vget_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
-_NEON2SSE_GLOBAL int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
-_NEON2SSE_GLOBAL uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
-//Load a single lane of a vector from a literal. These intrinsics set a single lane (element) within a vector.
-_NEON2SSESTORAGE uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
-_NEON2SSESTORAGE uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
-_NEON2SSESTORAGE uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
-_NEON2SSESTORAGE int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
-_NEON2SSESTORAGE int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
-_NEON2SSESTORAGE int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
-_NEON2SSE_GLOBAL poly8x8_t vset_lane_p8(poly8_t value, poly8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
-_NEON2SSE_GLOBAL poly16x4_t vset_lane_p16(poly16_t value, poly16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
-_NEON2SSESTORAGE float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
-_NEON2SSESTORAGE uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
-_NEON2SSESTORAGE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
-_NEON2SSESTORAGE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
-_NEON2SSESTORAGE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
-_NEON2SSESTORAGE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
-_NEON2SSESTORAGE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
-_NEON2SSE_GLOBAL poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
-_NEON2SSE_GLOBAL poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
-_NEON2SSESTORAGE float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
-_NEON2SSESTORAGE int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
-_NEON2SSESTORAGE uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
-_NEON2SSESTORAGE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
-_NEON2SSE_GLOBAL uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
-//Initialize a vector from a literal bit pattern.
-_NEON2SSESTORAGE int8x8_t vcreate_s8(uint64_t a); // VMOV d0,r0,r0
-_NEON2SSE_GLOBAL int16x4_t vcreate_s16(uint64_t a); // VMOV d0,r0,r0
-_NEON2SSE_GLOBAL int32x2_t vcreate_s32(uint64_t a); // VMOV d0,r0,r0
-_NEON2SSE_GLOBAL float16x4_t vcreate_f16(uint64_t a); // VMOV d0,r0,r0
-_NEON2SSESTORAGE float32x2_t vcreate_f32(uint64_t a); // VMOV d0,r0,r0
-_NEON2SSE_GLOBAL uint8x8_t vcreate_u8(uint64_t a); // VMOV d0,r0,r0
-_NEON2SSE_GLOBAL uint16x4_t vcreate_u16(uint64_t a); // VMOV d0,r0,r0
-_NEON2SSE_GLOBAL uint32x2_t vcreate_u32(uint64_t a); // VMOV d0,r0,r0
-_NEON2SSE_GLOBAL uint64x1_t vcreate_u64(uint64_t a); // VMOV d0,r0,r0
-_NEON2SSE_GLOBAL poly8x8_t vcreate_p8(uint64_t a); // VMOV d0,r0,r0
-_NEON2SSE_GLOBAL poly16x4_t vcreate_p16(uint64_t a); // VMOV d0,r0,r0
-_NEON2SSE_GLOBAL int64x1_t vcreate_s64(uint64_t a); // VMOV d0,r0,r0
-//Set all lanes to same value
-//Load all lanes of vector to the same literal value
-_NEON2SSESTORAGE uint8x8_t vdup_n_u8(uint8_t value); // VDUP.8 d0,r0
-_NEON2SSESTORAGE uint16x4_t vdup_n_u16(uint16_t value); // VDUP.16 d0,r0
-_NEON2SSESTORAGE uint32x2_t vdup_n_u32(uint32_t value); // VDUP.32 d0,r0
-_NEON2SSESTORAGE int8x8_t vdup_n_s8(int8_t value); // VDUP.8 d0,r0
-_NEON2SSESTORAGE int16x4_t vdup_n_s16(int16_t value); // VDUP.16 d0,r0
-_NEON2SSESTORAGE int32x2_t vdup_n_s32(int32_t value); // VDUP.32 d0,r0
-_NEON2SSE_GLOBAL poly8x8_t vdup_n_p8(poly8_t value); // VDUP.8 d0,r0
-_NEON2SSE_GLOBAL poly16x4_t vdup_n_p16(poly16_t value); // VDUP.16 d0,r0
-_NEON2SSESTORAGE float32x2_t vdup_n_f32(float32_t value); // VDUP.32 d0,r0
-_NEON2SSE_GLOBAL uint8x16_t vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0
-_NEON2SSE_GLOBAL uint16x8_t vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0
-_NEON2SSE_GLOBAL uint32x4_t vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0
-_NEON2SSE_GLOBAL int8x16_t vdupq_n_s8(int8_t value); // VDUP.8 q0,r0
-_NEON2SSE_GLOBAL int16x8_t vdupq_n_s16(int16_t value); // VDUP.16 q0,r0
-_NEON2SSE_GLOBAL int32x4_t vdupq_n_s32(int32_t value); // VDUP.32 q0,r0
-_NEON2SSE_GLOBAL poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0
-_NEON2SSE_GLOBAL poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0
-_NEON2SSE_GLOBAL float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0
-_NEON2SSESTORAGE int64x1_t vdup_n_s64(int64_t value); // VMOV d0,r0,r0
-_NEON2SSESTORAGE uint64x1_t vdup_n_u64(uint64_t value); // VMOV d0,r0,r0
-_NEON2SSESTORAGE int64x2_t vdupq_n_s64(int64_t value); // VMOV d0,r0,r0
-_NEON2SSESTORAGE uint64x2_t vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0
-_NEON2SSE_GLOBAL uint8x8_t vmov_n_u8(uint8_t value); // VDUP.8 d0,r0
-_NEON2SSE_GLOBAL uint16x4_t vmov_n_u16(uint16_t value); // VDUP.16 d0,r0
-_NEON2SSE_GLOBAL uint32x2_t vmov_n_u32(uint32_t value); // VDUP.32 d0,r0
-_NEON2SSE_GLOBAL int8x8_t vmov_n_s8(int8_t value); // VDUP.8 d0,r0
-_NEON2SSE_GLOBAL int16x4_t vmov_n_s16(int16_t value); // VDUP.16 d0,r0
-_NEON2SSE_GLOBAL int32x2_t vmov_n_s32(int32_t value); // VDUP.32 d0,r0
-_NEON2SSE_GLOBAL poly8x8_t vmov_n_p8(poly8_t value); // VDUP.8 d0,r0
-_NEON2SSE_GLOBAL poly16x4_t vmov_n_p16(poly16_t value); // VDUP.16 d0,r0
-_NEON2SSE_GLOBAL float32x2_t vmov_n_f32(float32_t value); // VDUP.32 d0,r0
-_NEON2SSE_GLOBAL uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0
-_NEON2SSE_GLOBAL uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0
-_NEON2SSE_GLOBAL uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0
-_NEON2SSE_GLOBAL int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0
-_NEON2SSE_GLOBAL int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0
-_NEON2SSE_GLOBAL int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0
-_NEON2SSE_GLOBAL poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0
-_NEON2SSE_GLOBAL poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0
-_NEON2SSE_GLOBAL float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0
-_NEON2SSE_GLOBAL int64x1_t vmov_n_s64(int64_t value); // VMOV d0,r0,r0
-_NEON2SSE_GLOBAL uint64x1_t vmov_n_u64(uint64_t value); // VMOV d0,r0,r0
-_NEON2SSE_GLOBAL int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0
-_NEON2SSE_GLOBAL uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0
-//Load all lanes of the vector to the value of a lane of a vector
-_NEON2SSESTORAGE uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
-_NEON2SSESTORAGE uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
-_NEON2SSESTORAGE uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
-_NEON2SSE_GLOBAL int8x8_t vdup_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
-_NEON2SSE_GLOBAL int16x4_t vdup_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
-_NEON2SSE_GLOBAL int32x2_t vdup_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
-_NEON2SSE_GLOBAL poly8x8_t vdup_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
-_NEON2SSE_GLOBAL poly16x4_t vdup_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
-_NEON2SSESTORAGE float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
-_NEON2SSESTORAGE uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
-_NEON2SSESTORAGE uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
-_NEON2SSESTORAGE uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
-_NEON2SSE_GLOBAL int8x16_t vdupq_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
-_NEON2SSE_GLOBAL int16x8_t vdupq_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
-_NEON2SSE_GLOBAL int32x4_t vdupq_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
-_NEON2SSE_GLOBAL poly8x16_t vdupq_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
-_NEON2SSE_GLOBAL poly16x8_t vdupq_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
-_NEON2SSE_GLOBAL float32x4_t vdupq_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
-_NEON2SSE_GLOBAL int64x1_t vdup_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
-_NEON2SSE_GLOBAL uint64x1_t vdup_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
-_NEON2SSESTORAGE int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
-_NEON2SSE_GLOBAL uint64x2_t vdupq_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
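For reference, the "set all lanes" family maps almost directly onto SSE broadcasts. A minimal SSE2 sketch (helper names are illustrative, not from this header):

#include <emmintrin.h> /* SSE2 */

/* vdupq_n_s16-style broadcast: every 16-bit lane gets the scalar. */
static inline __m128i broadcast_n_i16(int16_t v) {
    return _mm_set1_epi16(v);
}

/* vdupq_lane_s32-style broadcast of lane 1 of an existing vector. */
static inline __m128i broadcast_lane1_i32(__m128i v) {
    return _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 1, 1, 1));
}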
-//Combining vectors. These intrinsics join two 64-bit vectors into a single 128-bit vector.
-_NEON2SSESTORAGE int8x16_t vcombine_s8(int8x8_t low, int8x8_t high); // VMOV d0,d0
-_NEON2SSE_GLOBAL int16x8_t vcombine_s16(int16x4_t low, int16x4_t high); // VMOV d0,d0
-_NEON2SSE_GLOBAL int32x4_t vcombine_s32(int32x2_t low, int32x2_t high); // VMOV d0,d0
-_NEON2SSE_GLOBAL int64x2_t vcombine_s64(int64x1_t low, int64x1_t high); // VMOV d0,d0
-_NEON2SSE_GLOBAL float16x8_t vcombine_f16(float16x4_t low, float16x4_t high); // VMOV d0,d0
-_NEON2SSESTORAGE float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
-_NEON2SSE_GLOBAL uint8x16_t vcombine_u8(uint8x8_t low, uint8x8_t high); // VMOV d0,d0
-_NEON2SSE_GLOBAL uint16x8_t vcombine_u16(uint16x4_t low, uint16x4_t high); // VMOV d0,d0
-_NEON2SSE_GLOBAL uint32x4_t vcombine_u32(uint32x2_t low, uint32x2_t high); // VMOV d0,d0
-_NEON2SSE_GLOBAL uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high); // VMOV d0,d0
-_NEON2SSE_GLOBAL poly8x16_t vcombine_p8(poly8x8_t low, poly8x8_t high); // VMOV d0,d0
-_NEON2SSE_GLOBAL poly16x8_t vcombine_p16(poly16x4_t low, poly16x4_t high); // VMOV d0,d0
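As a rough illustration of what a combine has to do on x86, assuming each 64-bit half is carried in the low half of an __m128i (an assumption of this sketch, not a statement about the header's internal types):

#include <emmintrin.h> /* SSE2 */

/* vcombine-style join: result = { low64[63:0], high64[63:0] }. */
static inline __m128i combine_halves(__m128i low64, __m128i high64) {
    return _mm_unpacklo_epi64(low64, high64);
}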
-//Splitting vectors. These intrinsics split a 128-bit vector into two component 64-bit vectors
-_NEON2SSESTORAGE int8x8_t vget_high_s8(int8x16_t a); // VMOV d0,d0
-_NEON2SSESTORAGE int16x4_t vget_high_s16(int16x8_t a); // VMOV d0,d0
-_NEON2SSESTORAGE int32x2_t vget_high_s32(int32x4_t a); // VMOV d0,d0
-_NEON2SSESTORAGE int64x1_t vget_high_s64(int64x2_t a); // VMOV d0,d0
-_NEON2SSE_GLOBAL float16x4_t vget_high_f16(float16x8_t a); // VMOV d0,d0
-_NEON2SSESTORAGE float32x2_t vget_high_f32(float32x4_t a); // VMOV d0,d0
-_NEON2SSE_GLOBAL uint8x8_t vget_high_u8(uint8x16_t a); // VMOV d0,d0
-_NEON2SSE_GLOBAL uint16x4_t vget_high_u16(uint16x8_t a); // VMOV d0,d0
-_NEON2SSE_GLOBAL uint32x2_t vget_high_u32(uint32x4_t a); // VMOV d0,d0
-_NEON2SSE_GLOBAL uint64x1_t vget_high_u64(uint64x2_t a); // VMOV d0,d0
-_NEON2SSE_GLOBAL poly8x8_t vget_high_p8(poly8x16_t a); // VMOV d0,d0
-_NEON2SSE_GLOBAL poly16x4_t vget_high_p16(poly16x8_t a); // VMOV d0,d0
-_NEON2SSESTORAGE int8x8_t vget_low_s8(int8x16_t a); // VMOV d0,d0
-_NEON2SSESTORAGE int16x4_t vget_low_s16(int16x8_t a); // VMOV d0,d0
-_NEON2SSESTORAGE int32x2_t vget_low_s32(int32x4_t a); // VMOV d0,d0
-_NEON2SSESTORAGE int64x1_t vget_low_s64(int64x2_t a); // VMOV d0,d0
-_NEON2SSE_GLOBAL float16x4_t vget_low_f16(float16x8_t a); // VMOV d0,d0
-_NEON2SSESTORAGE float32x2_t vget_low_f32(float32x4_t a); // VMOV d0,d0
-_NEON2SSE_GLOBAL uint8x8_t vget_low_u8(uint8x16_t a); // VMOV d0,d0
-_NEON2SSE_GLOBAL uint16x4_t vget_low_u16(uint16x8_t a); // VMOV d0,d0
-_NEON2SSE_GLOBAL uint32x2_t vget_low_u32(uint32x4_t a); // VMOV d0,d0
-_NEON2SSE_GLOBAL uint64x1_t vget_low_u64(uint64x2_t a); // VMOV d0,d0
-_NEON2SSE_GLOBAL poly8x8_t vget_low_p8(poly8x16_t a); // VMOV d0,d0
-_NEON2SSE_GLOBAL poly16x4_t vget_low_p16(poly16x8_t a); // VMOV d0,d0
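The splitting intrinsics are the inverse operation; a minimal SSE2 sketch under the same assumption (64-bit halves carried in the low half of an __m128i):

#include <emmintrin.h> /* SSE2 */

/* vget_low-style: the low 64 bits are already in place. */
static inline __m128i get_low_half(__m128i a)  { return a; }

/* vget_high-style: shift the high 64 bits down into the low half. */
static inline __m128i get_high_half(__m128i a) { return _mm_srli_si128(a, 8); }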
-//Converting vectors. These intrinsics are used to convert vectors.
-//Convert from float
-_NEON2SSESTORAGE int32x2_t vcvt_s32_f32(float32x2_t a); // VCVT.S32.F32 d0, d0
-_NEON2SSESTORAGE uint32x2_t vcvt_u32_f32(float32x2_t a); // VCVT.U32.F32 d0, d0
-_NEON2SSESTORAGE int32x4_t vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0
-_NEON2SSESTORAGE uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0
-_NEON2SSESTORAGE int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F32 d0, d0, #32
-_NEON2SSESTORAGE uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32
-_NEON2SSESTORAGE int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32
-_NEON2SSESTORAGE uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32
-_NEON2SSESTORAGE int32x4_t vcvtnq_s32_f32(float32x4_t a); // VCVTN.S32.F32 q0, q0
-//Convert to float
-_NEON2SSESTORAGE float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0
-_NEON2SSESTORAGE float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0
-_NEON2SSE_GLOBAL float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0
-_NEON2SSESTORAGE float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0
-_NEON2SSESTORAGE float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b); // VCVT.F32.S32 d0, d0, #32
-_NEON2SSESTORAGE float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b); // VCVT.F32.U32 d0, d0, #32
-_NEON2SSESTORAGE float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32
-_NEON2SSESTORAGE float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32
-//Convert between floats
-_NEON2SSE_GLOBAL float16x4_t vcvt_f16_f32(float32x4_t a); // VCVT.F16.F32 d0, q0
-_NEON2SSE_GLOBAL float32x4_t vcvt_f32_f16(float16x4_t a); // VCVT.F32.F16 q0, d0
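The _n_ variants convert to and from fixed-point with a given number of fractional bits; the float-to-fixed direction is essentially a scale by 2^b followed by a truncating convert. A hedged SSE2 sketch (the saturation of out-of-range values that NEON guarantees is omitted here):

#include <emmintrin.h> /* SSE2 */
#include <stdint.h>

/* vcvtq_n_s32_f32-style conversion; frac_bits plays the role of the immediate b. */
static inline __m128i float_to_fixed_s32(__m128 a, int frac_bits) {
    __m128 scale = _mm_set1_ps((float)((uint64_t)1 << frac_bits));
    return _mm_cvttps_epi32(_mm_mul_ps(a, scale)); /* truncates toward zero */
}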
-//Vector narrow integer
-_NEON2SSESTORAGE int8x8_t vmovn_s16(int16x8_t a); // VMOVN.I16 d0,q0
-_NEON2SSESTORAGE int16x4_t vmovn_s32(int32x4_t a); // VMOVN.I32 d0,q0
-_NEON2SSESTORAGE int32x2_t vmovn_s64(int64x2_t a); // VMOVN.I64 d0,q0
-_NEON2SSE_GLOBAL uint8x8_t vmovn_u16(uint16x8_t a); // VMOVN.I16 d0,q0
-_NEON2SSE_GLOBAL uint16x4_t vmovn_u32(uint32x4_t a); // VMOVN.I32 d0,q0
-_NEON2SSE_GLOBAL uint32x2_t vmovn_u64(uint64x2_t a); // VMOVN.I64 d0,q0
-//Vector long move
-_NEON2SSESTORAGE int16x8_t vmovl_s8(int8x8_t a); // VMOVL.S8 q0,d0
-_NEON2SSESTORAGE int32x4_t vmovl_s16(int16x4_t a); // VMOVL.S16 q0,d0
-_NEON2SSESTORAGE int64x2_t vmovl_s32(int32x2_t a); // VMOVL.S32 q0,d0
-_NEON2SSESTORAGE uint16x8_t vmovl_u8(uint8x8_t a); // VMOVL.U8 q0,d0
-_NEON2SSESTORAGE uint32x4_t vmovl_u16(uint16x4_t a); // VMOVL.U16 q0,d0
-_NEON2SSESTORAGE uint64x2_t vmovl_u32(uint32x2_t a); // VMOVL.U32 q0,d0
-//Vector saturating narrow integer
-_NEON2SSESTORAGE int8x8_t vqmovn_s16(int16x8_t a); // VQMOVN.S16 d0,q0
-_NEON2SSESTORAGE int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0
-_NEON2SSESTORAGE int32x2_t vqmovn_s64(int64x2_t a); // VQMOVN.S64 d0,q0
-_NEON2SSESTORAGE uint8x8_t vqmovn_u16(uint16x8_t a); // VQMOVN.U16 d0,q0
-_NEON2SSESTORAGE uint16x4_t vqmovn_u32(uint32x4_t a); // VQMOVN.U32 d0,q0
-_NEON2SSESTORAGE uint32x2_t vqmovn_u64(uint64x2_t a); // VQMOVN.U64 d0,q0
-//Vector saturating narrow integer signed->unsigned
-_NEON2SSESTORAGE uint8x8_t vqmovun_s16(int16x8_t a); // VQMOVUN.S16 d0,q0
-_NEON2SSESTORAGE uint16x4_t vqmovun_s32(int32x4_t a); // VQMOVUN.S32 d0,q0
-_NEON2SSESTORAGE uint32x2_t vqmovun_s64(int64x2_t a); // VQMOVUN.S64 d0,q0
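The saturating narrows map nicely onto the SSE2 pack instructions; only the low 64 bits of the result are meaningful in this sketch (helper names are illustrative):

#include <emmintrin.h> /* SSE2 */

/* vqmovn_s16-style: int16 -> int8 with signed saturation. */
static inline __m128i narrow_sat_s16_s8(__m128i a) { return _mm_packs_epi16(a, a); }

/* vqmovun_s16-style: int16 -> uint8 with unsigned saturation. */
static inline __m128i narrow_sat_s16_u8(__m128i a) { return _mm_packus_epi16(a, a); }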
-//Table look up
-_NEON2SSESTORAGE uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
-_NEON2SSE_GLOBAL int8x8_t vtbl1_s8(int8x8_t a, int8x8_t b); // VTBL.8 d0, {d0}, d0
-_NEON2SSE_GLOBAL poly8x8_t vtbl1_p8(poly8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
-//Extended table look up intrinsics
-_NEON2SSESTORAGE uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
-_NEON2SSE_GLOBAL int8x8_t vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VTBX.8 d0, {d0}, d0
-_NEON2SSE_GLOBAL poly8x8_t vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
-_NEON2SSESTORAGE uint8x8_t vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
-_NEON2SSE_GLOBAL int8x8_t vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c); // VTBX.8 d0, {d0, d1}, d0
-_NEON2SSE_GLOBAL poly8x8_t vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
-_NEON2SSESTORAGE uint8x8_t vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
-_NEON2SSE_GLOBAL int8x8_t vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
-_NEON2SSE_GLOBAL poly8x8_t vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
-_NEON2SSESTORAGE uint8x8_t vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
-_NEON2SSE_GLOBAL int8x8_t vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
-_NEON2SSE_GLOBAL poly8x8_t vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
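The single-table lookup is close to SSSE3 pshufb, except that NEON returns 0 for indices past the end of the table while pshufb keys off the top bit of the index byte. A minimal sketch of that bridging trick (assuming the 8-byte table sits in the low half of 'table'):

#include <tmmintrin.h> /* SSSE3 */

/* vtbl1_u8-style lookup: force the top bit of any index > 7 so pshufb zeroes that lane. */
static inline __m128i tbl1_u8_sketch(__m128i table, __m128i idx) {
    __m128i out_of_range = _mm_cmpgt_epi8(idx, _mm_set1_epi8(7));
    return _mm_shuffle_epi8(table, _mm_or_si128(idx, out_of_range));
}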
-//Operations with a scalar value
-//Vector multiply accumulate with scalar
-_NEON2SSESTORAGE int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0]
-_NEON2SSESTORAGE int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0]
-_NEON2SSE_GLOBAL uint16x4_t vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0]
-_NEON2SSE_GLOBAL uint32x2_t vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0]
-_NEON2SSESTORAGE float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 d0,d0, d0[0]
-_NEON2SSESTORAGE int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0,d0[0]
-_NEON2SSESTORAGE int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0,d0[0]
-_NEON2SSE_GLOBAL uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0,q0, d0[0]
-_NEON2SSE_GLOBAL uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0,q0, d0[0]
-_NEON2SSESTORAGE float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0,q0, d0[0]
-//Vector widening multiply accumulate with scalar
-_NEON2SSESTORAGE int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); //VMLAL.S16 q0, d0,d0[0]
-_NEON2SSESTORAGE int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); //VMLAL.S32 q0, d0,d0[0]
-_NEON2SSESTORAGE uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.U16 q0,d0, d0[0]
-_NEON2SSESTORAGE uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0,d0, d0[0]
-//Vector widening saturating doubling multiply accumulate with scalar
-_NEON2SSESTORAGE int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLAL.S16 q0,d0, d0[0]
-_NEON2SSESTORAGE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLAL.S32 q0,d0, d0[0]
-//Vector multiply subtract with scalar
-_NEON2SSESTORAGE int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0]
-_NEON2SSESTORAGE int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0]
-_NEON2SSESTORAGE uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0]
-_NEON2SSESTORAGE uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0]
-_NEON2SSESTORAGE float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 d0,d0, d0[0]
-_NEON2SSESTORAGE int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0,d0[0]
-_NEON2SSESTORAGE int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0,d0[0]
-_NEON2SSESTORAGE uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0,q0, d0[0]
-_NEON2SSESTORAGE uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0,q0, d0[0]
-_NEON2SSESTORAGE float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 q0,q0, d0[0]
-//Vector widening multiply subtract with scalar
-_NEON2SSESTORAGE int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLSL.S16 q0, d0,d0[0]
-_NEON2SSESTORAGE int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLSL.S32 q0, d0,d0[0]
-_NEON2SSESTORAGE uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLSL.U16 q0,d0, d0[0]
-_NEON2SSESTORAGE uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLSL.U32 q0,d0, d0[0]
-//Vector widening saturating doubling multiply subtract with scalar
-_NEON2SSESTORAGE int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLSL.S16 q0,d0, d0[0]
-_NEON2SSESTORAGE int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLSL.S32 q0,d0, d0[0]
-//Vector multiply by scalar
-_NEON2SSESTORAGE int16x4_t vmul_n_s16(int16x4_t a, int16_t b); // VMUL.I16 d0,d0,d0[0]
-_NEON2SSESTORAGE int32x2_t vmul_n_s32(int32x2_t a, int32_t b); // VMUL.I32 d0,d0,d0[0]
-_NEON2SSESTORAGE float32x2_t vmul_n_f32(float32x2_t a, float32_t b); // VMUL.F32 d0,d0,d0[0]
-_NEON2SSESTORAGE uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b); // VMUL.I16 d0,d0,d0[0]
-_NEON2SSESTORAGE uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b); // VMUL.I32 d0,d0,d0[0]
-_NEON2SSESTORAGE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0]
-_NEON2SSESTORAGE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0]
-_NEON2SSESTORAGE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0]
-_NEON2SSESTORAGE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0]
-_NEON2SSESTORAGE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0]
-//Vector long multiply with scalar
-_NEON2SSESTORAGE int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2); // VMULL.S16 q0,d0,d0[0]
-_NEON2SSESTORAGE int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2); // VMULL.S32 q0,d0,d0[0]
-_NEON2SSESTORAGE uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2); // VMULL.U16 q0,d0,d0[0]
-_NEON2SSESTORAGE uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2); // VMULL.U32 q0,d0,d0[0]
-//Vector long multiply by scalar
-_NEON2SSESTORAGE int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VMULL.S16 q0,d0,d0[0]
-_NEON2SSESTORAGE int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VMULL.S32 q0,d0,d0[0]
-_NEON2SSESTORAGE uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3); // VMULL.U16 q0,d0,d0[0]
-_NEON2SSESTORAGE uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3); // VMULL.U32 q0,d0,d0[0]
-//Vector saturating doubling long multiply with scalar
-_NEON2SSESTORAGE int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2); // VQDMULL.S16 q0,d0,d0[0]
-_NEON2SSESTORAGE int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2); // VQDMULL.S32 q0,d0,d0[0]
-//Vector saturating doubling long multiply by scalar
-_NEON2SSESTORAGE int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULL.S16 q0,d0,d0[0]
-_NEON2SSESTORAGE int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULL.S32 q0,d0,d0[0]
-//Vector saturating doubling multiply high with scalar
-_NEON2SSESTORAGE int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQDMULH.S16 d0,d0,d0[0]
-_NEON2SSESTORAGE int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQDMULH.S32 d0,d0,d0[0]
-_NEON2SSESTORAGE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQDMULH.S16 q0,q0,d0[0]
-_NEON2SSESTORAGE int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQDMULH.S32 q0,q0,d0[0]
-//Vector saturating doubling multiply high by scalar
-_NEON2SSESTORAGE int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 d0,d0,d0[0]
-_NEON2SSESTORAGE int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 d0,d0,d0[0]
-_NEON2SSESTORAGE int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 q0,q0,d0[0]
-_NEON2SSESTORAGE int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 q0,q0,d0[0]
-//Vector saturating rounding doubling multiply high with scalar
-_NEON2SSESTORAGE int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQRDMULH.S16 d0,d0,d0[0]
-_NEON2SSESTORAGE int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQRDMULH.S32 d0,d0,d0[0]
-_NEON2SSESTORAGE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0]
-_NEON2SSESTORAGE int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0]
-//Vector rounding saturating doubling multiply high by scalar
-_NEON2SSESTORAGE int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 d0,d0,d0[0]
-_NEON2SSESTORAGE int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 d0,d0,d0[0]
-_NEON2SSESTORAGE int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 q0,q0,d0[0]
-_NEON2SSESTORAGE int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 q0,q0,d0[0]
-//Vector multiply accumulate with scalar
-_NEON2SSESTORAGE int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLA.I16 d0, d0, d0[0]
-_NEON2SSESTORAGE int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLA.I32 d0, d0, d0[0]
-_NEON2SSE_GLOBAL uint16x4_t vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLA.I16 d0, d0, d0[0]
-_NEON2SSE_GLOBAL uint32x2_t vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLA.I32 d0, d0, d0[0]
-_NEON2SSESTORAGE float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLA.F32 d0, d0, d0[0]
-_NEON2SSESTORAGE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0]
-_NEON2SSESTORAGE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0]
-_NEON2SSE_GLOBAL uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0]
-_NEON2SSE_GLOBAL uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0]
-_NEON2SSESTORAGE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0]
-//Vector widening multiply accumulate with scalar
-_NEON2SSESTORAGE int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLAL.S16 q0, d0, d0[0]
-_NEON2SSESTORAGE int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLAL.S32 q0, d0, d0[0]
-_NEON2SSESTORAGE uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLAL.U16 q0, d0, d0[0]
-_NEON2SSESTORAGE uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLAL.U32 q0, d0, d0[0]
-//Vector widening saturating doubling multiply accumulate with scalar
-_NEON2SSESTORAGE int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLAL.S16 q0, d0, d0[0]
-_NEON2SSESTORAGE int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLAL.S32 q0, d0, d0[0]
-//Vector multiply subtract with scalar
-_NEON2SSESTORAGE int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLS.I16 d0, d0, d0[0]
-_NEON2SSESTORAGE int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLS.I32 d0, d0, d0[0]
-_NEON2SSESTORAGE uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLS.I16 d0, d0, d0[0]
-_NEON2SSESTORAGE uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLS.I32 d0, d0, d0[0]
-_NEON2SSESTORAGE float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLS.F32 d0, d0, d0[0]
-_NEON2SSESTORAGE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0]
-_NEON2SSESTORAGE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0]
-_NEON2SSESTORAGE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0]
-_NEON2SSESTORAGE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0]
-_NEON2SSESTORAGE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0]
-//Vector widening multiply subtract with scalar
-_NEON2SSESTORAGE int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLSL.S16 q0, d0, d0[0]
-_NEON2SSESTORAGE int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLSL.S32 q0, d0, d0[0]
-_NEON2SSESTORAGE uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLSL.U16 q0, d0, d0[0]
-_NEON2SSESTORAGE uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLSL.U32 q0, d0, d0[0]
-//Vector widening saturating doubling multiply subtract with scalar
-_NEON2SSESTORAGE int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLSL.S16 q0, d0, d0[0]
-_NEON2SSESTORAGE int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLSL.S32 q0, d0, d0[0]
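Most of the "with scalar" forms reduce to a broadcast followed by the ordinary vector operation. A minimal SSE2 sketch of the non-saturating multiply-accumulate flavour (wrapping 16-bit arithmetic, like vmlaq_n_s16):

#include <emmintrin.h> /* SSE2 */

/* acc + b * c, with c broadcast to all eight 16-bit lanes. */
static inline __m128i mla_n_s16(__m128i acc, __m128i b, int16_t c) {
    return _mm_add_epi16(acc, _mm_mullo_epi16(b, _mm_set1_epi16(c)));
}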
-//Vector extract
-_NEON2SSESTORAGE int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
-_NEON2SSE_GLOBAL uint8x8_t vext_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
-_NEON2SSE_GLOBAL poly8x8_t vext_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
-_NEON2SSESTORAGE int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
-_NEON2SSE_GLOBAL uint16x4_t vext_u16(uint16x4_t a, uint16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
-_NEON2SSE_GLOBAL poly16x4_t vext_p16(poly16x4_t a, poly16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
-_NEON2SSESTORAGE int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
-_NEON2SSE_GLOBAL uint32x2_t vext_u32(uint32x2_t a, uint32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
-_NEON2SSE_GLOBAL int64x1_t vext_s64(int64x1_t a, int64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
-_NEON2SSE_GLOBAL uint64x1_t vext_u64(uint64x1_t a, uint64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
-_NEON2SSESTORAGE float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
-_NEON2SSE_GLOBAL int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
-_NEON2SSE_GLOBAL uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
-_NEON2SSE_GLOBAL poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
-_NEON2SSE_GLOBAL int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
-_NEON2SSE_GLOBAL uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
-_NEON2SSE_GLOBAL poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
-_NEON2SSE_GLOBAL int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
-_NEON2SSE_GLOBAL uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
-_NEON2SSE_GLOBAL int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
-_NEON2SSE_GLOBAL uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
-_NEON2SSE_GLOBAL float32x4_t vextq_f32(float32x4_t a, float32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
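VEXT has a direct SSSE3 counterpart: the quadword byte form corresponds to palignr with the operands swapped. A sketch (the count must still be a compile-time immediate, which is exactly the problem the dispatch macros further down deal with):

#include <tmmintrin.h> /* SSSE3 */

/* vextq_u8(a, b, N)-style: bytes a[N..15] followed by b[0..N-1]. */
#define EXTQ_U8_SKETCH(a, b, N) _mm_alignr_epi8((b), (a), (N))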
-//Reverse vector elements (swap endianness). VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide.
-_NEON2SSESTORAGE int8x8_t vrev64_s8(int8x8_t vec); // VREV64.8 d0,d0
-_NEON2SSESTORAGE int16x4_t vrev64_s16(int16x4_t vec); // VREV64.16 d0,d0
-_NEON2SSESTORAGE int32x2_t vrev64_s32(int32x2_t vec); // VREV64.32 d0,d0
-_NEON2SSE_GLOBAL uint8x8_t vrev64_u8(uint8x8_t vec); // VREV64.8 d0,d0
-_NEON2SSE_GLOBAL uint16x4_t vrev64_u16(uint16x4_t vec); // VREV64.16 d0,d0
-_NEON2SSE_GLOBAL uint32x2_t vrev64_u32(uint32x2_t vec); // VREV64.32 d0,d0
-_NEON2SSE_GLOBAL poly8x8_t vrev64_p8(poly8x8_t vec); // VREV64.8 d0,d0
-_NEON2SSE_GLOBAL poly16x4_t vrev64_p16(poly16x4_t vec); // VREV64.16 d0,d0
-_NEON2SSESTORAGE float32x2_t vrev64_f32(float32x2_t vec); // VREV64.32 d0,d0
-_NEON2SSESTORAGE int8x16_t vrev64q_s8(int8x16_t vec); // VREV64.8 q0,q0
-_NEON2SSESTORAGE int16x8_t vrev64q_s16(int16x8_t vec); // VREV64.16 q0,q0
-_NEON2SSESTORAGE int32x4_t vrev64q_s32(int32x4_t vec); // VREV64.32 q0,q0
-_NEON2SSE_GLOBAL uint8x16_t vrev64q_u8(uint8x16_t vec); // VREV64.8 q0,q0
-_NEON2SSE_GLOBAL uint16x8_t vrev64q_u16(uint16x8_t vec); // VREV64.16 q0,q0
-_NEON2SSE_GLOBAL uint32x4_t vrev64q_u32(uint32x4_t vec); // VREV64.32 q0,q0
-_NEON2SSE_GLOBAL poly8x16_t vrev64q_p8(poly8x16_t vec); // VREV64.8 q0,q0
-_NEON2SSE_GLOBAL poly16x8_t vrev64q_p16(poly16x8_t vec); // VREV64.16 q0,q0
-_NEON2SSE_GLOBAL float32x4_t vrev64q_f32(float32x4_t vec); // VREV64.32 q0,q0
-_NEON2SSESTORAGE int8x8_t vrev32_s8(int8x8_t vec); // VREV32.8 d0,d0
-_NEON2SSESTORAGE int16x4_t vrev32_s16(int16x4_t vec); // VREV32.16 d0,d0
-_NEON2SSE_GLOBAL uint8x8_t vrev32_u8(uint8x8_t vec); // VREV32.8 d0,d0
-_NEON2SSE_GLOBAL uint16x4_t vrev32_u16(uint16x4_t vec); // VREV32.16 d0,d0
-_NEON2SSE_GLOBAL poly8x8_t vrev32_p8(poly8x8_t vec); // VREV32.8 d0,d0
-_NEON2SSE_GLOBAL poly16x4_t vrev32_p16(poly16x4_t vec); // VREV32.16 d0,d0
-_NEON2SSESTORAGE int8x16_t vrev32q_s8(int8x16_t vec); // VREV32.8 q0,q0
-_NEON2SSESTORAGE int16x8_t vrev32q_s16(int16x8_t vec); // VREV32.16 q0,q0
-_NEON2SSE_GLOBAL uint8x16_t vrev32q_u8(uint8x16_t vec); // VREV32.8 q0,q0
-_NEON2SSE_GLOBAL uint16x8_t vrev32q_u16(uint16x8_t vec); // VREV32.16 q0,q0
-_NEON2SSE_GLOBAL poly8x16_t vrev32q_p8(poly8x16_t vec); // VREV32.8 q0,q0
-_NEON2SSE_GLOBAL poly16x8_t vrev32q_p16(poly16x8_t vec); // VREV32.16 q0,q0
-_NEON2SSESTORAGE int8x8_t vrev16_s8(int8x8_t vec); // VREV16.8 d0,d0
-_NEON2SSE_GLOBAL uint8x8_t vrev16_u8(uint8x8_t vec); // VREV16.8 d0,d0
-_NEON2SSE_GLOBAL poly8x8_t vrev16_p8(poly8x8_t vec); // VREV16.8 d0,d0
-_NEON2SSESTORAGE int8x16_t vrev16q_s8(int8x16_t vec); // VREV16.8 q0,q0
-_NEON2SSE_GLOBAL uint8x16_t vrev16q_u8(uint8x16_t vec); // VREV16.8 q0,q0
-_NEON2SSE_GLOBAL poly8x16_t vrev16q_p8(poly8x16_t vec); // VREV16.8 q0,q0
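The VREV family is pure lane permutation, so SSE shuffles cover it. Two illustrative cases:

#include <tmmintrin.h> /* SSSE3 for the byte shuffle; the first helper needs only SSE2 */

/* vrev64q_s32-style: swap the two 32-bit lanes inside each 64-bit group. */
static inline __m128i rev64q_s32_sketch(__m128i a) {
    return _mm_shuffle_epi32(a, _MM_SHUFFLE(2, 3, 0, 1));
}

/* vrev32q_u8-style: reverse the bytes inside each 32-bit word. */
static inline __m128i rev32q_u8_sketch(__m128i a) {
    const __m128i m = _mm_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
    return _mm_shuffle_epi8(a, m);
}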
-//Other single operand arithmetic
-//Absolute: Vd[i] = |Va[i]|
-_NEON2SSESTORAGE int8x8_t vabs_s8(int8x8_t a); // VABS.S8 d0,d0
-_NEON2SSESTORAGE int16x4_t vabs_s16(int16x4_t a); // VABS.S16 d0,d0
-_NEON2SSESTORAGE int32x2_t vabs_s32(int32x2_t a); // VABS.S32 d0,d0
-_NEON2SSESTORAGE float32x2_t vabs_f32(float32x2_t a); // VABS.F32 d0,d0
-_NEON2SSE_GLOBAL int8x16_t vabsq_s8(int8x16_t a); // VABS.S8 q0,q0
-_NEON2SSE_GLOBAL int16x8_t vabsq_s16(int16x8_t a); // VABS.S16 q0,q0
-_NEON2SSE_GLOBAL int32x4_t vabsq_s32(int32x4_t a); // VABS.S32 q0,q0
-_NEON2SSESTORAGE float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0
-
-#ifdef _NEON2SSE_64BIT
-_NEON2SSESTORAGE int64x2_t vabsq_s64(int64x2_t a); // VABS.S64 q0,q0
-_NEON2SSESTORAGE float64x2_t vabsq_f64(float64x2_t a); // VABS.F64 q0,q0
-#endif
-
-//Saturating absolute: Vd[i] = sat(|Va[i]|)
-_NEON2SSESTORAGE int8x8_t vqabs_s8(int8x8_t a); // VQABS.S8 d0,d0
-_NEON2SSESTORAGE int16x4_t vqabs_s16(int16x4_t a); // VQABS.S16 d0,d0
-_NEON2SSESTORAGE int32x2_t vqabs_s32(int32x2_t a); // VQABS.S32 d0,d0
-_NEON2SSESTORAGE int8x16_t vqabsq_s8(int8x16_t a); // VQABS.S8 q0,q0
-_NEON2SSESTORAGE int16x8_t vqabsq_s16(int16x8_t a); // VQABS.S16 q0,q0
-_NEON2SSESTORAGE int32x4_t vqabsq_s32(int32x4_t a); // VQABS.S32 q0,q0
-//Negate: Vd[i] = - Va[i]
-_NEON2SSESTORAGE int8x8_t vneg_s8(int8x8_t a); // VNEG.S8 d0,d0
-_NEON2SSESTORAGE int16x4_t vneg_s16(int16x4_t a); // VNEG.S16 d0,d0
-_NEON2SSESTORAGE int32x2_t vneg_s32(int32x2_t a); // VNEG.S32 d0,d0
-_NEON2SSESTORAGE float32x2_t vneg_f32(float32x2_t a); // VNEG.F32 d0,d0
-_NEON2SSESTORAGE int8x16_t vnegq_s8(int8x16_t a); // VNEG.S8 q0,q0
-_NEON2SSESTORAGE int16x8_t vnegq_s16(int16x8_t a); // VNEG.S16 q0,q0
-_NEON2SSESTORAGE int32x4_t vnegq_s32(int32x4_t a); // VNEG.S32 q0,q0
-_NEON2SSESTORAGE float32x4_t vnegq_f32(float32x4_t a); // VNEG.F32 q0,q0
-//Saturating Negate: sat(Vd[i] = - Va[i])
-_NEON2SSESTORAGE int8x8_t vqneg_s8(int8x8_t a); // VQNEG.S8 d0,d0
-_NEON2SSESTORAGE int16x4_t vqneg_s16(int16x4_t a); // VQNEG.S16 d0,d0
-_NEON2SSESTORAGE int32x2_t vqneg_s32(int32x2_t a); // VQNEG.S32 d0,d0
-_NEON2SSESTORAGE int8x16_t vqnegq_s8(int8x16_t a); // VQNEG.S8 q0,q0
-_NEON2SSESTORAGE int16x8_t vqnegq_s16(int16x8_t a); // VQNEG.S16 q0,q0
-_NEON2SSESTORAGE int32x4_t vqnegq_s32(int32x4_t a); // VQNEG.S32 q0,q0
-//Count leading sign bits
-_NEON2SSESTORAGE int8x8_t vcls_s8(int8x8_t a); // VCLS.S8 d0,d0
-_NEON2SSESTORAGE int16x4_t vcls_s16(int16x4_t a); // VCLS.S16 d0,d0
-_NEON2SSESTORAGE int32x2_t vcls_s32(int32x2_t a); // VCLS.S32 d0,d0
-_NEON2SSESTORAGE int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0
-_NEON2SSESTORAGE int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0
-_NEON2SSESTORAGE int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0
-//Count leading zeros
-_NEON2SSESTORAGE int8x8_t vclz_s8(int8x8_t a); // VCLZ.I8 d0,d0
-_NEON2SSESTORAGE int16x4_t vclz_s16(int16x4_t a); // VCLZ.I16 d0,d0
-_NEON2SSESTORAGE int32x2_t vclz_s32(int32x2_t a); // VCLZ.I32 d0,d0
-_NEON2SSE_GLOBAL uint8x8_t vclz_u8(uint8x8_t a); // VCLZ.I8 d0,d0
-_NEON2SSE_GLOBAL uint16x4_t vclz_u16(uint16x4_t a); // VCLZ.I16 d0,d0
-_NEON2SSE_GLOBAL uint32x2_t vclz_u32(uint32x2_t a); // VCLZ.I32 d0,d0
-_NEON2SSESTORAGE int8x16_t vclzq_s8(int8x16_t a); // VCLZ.I8 q0,q0
-_NEON2SSESTORAGE int16x8_t vclzq_s16(int16x8_t a); // VCLZ.I16 q0,q0
-_NEON2SSESTORAGE int32x4_t vclzq_s32(int32x4_t a); // VCLZ.I32 q0,q0
-_NEON2SSE_GLOBAL uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0
-_NEON2SSE_GLOBAL uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0
-_NEON2SSE_GLOBAL uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0
-//Count number of set bits
-_NEON2SSESTORAGE uint8x8_t vcnt_u8(uint8x8_t a); // VCNT.8 d0,d0
-_NEON2SSE_GLOBAL int8x8_t vcnt_s8(int8x8_t a); // VCNT.8 d0,d0
-_NEON2SSE_GLOBAL poly8x8_t vcnt_p8(poly8x8_t a); // VCNT.8 d0,d0
-_NEON2SSESTORAGE uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0
-_NEON2SSE_GLOBAL int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0
-_NEON2SSE_GLOBAL poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0
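Plain SSE has no per-byte popcount, so VCNT is usually emulated with the well-known SSSE3 nibble lookup. A sketch:

#include <tmmintrin.h> /* SSSE3 */

/* vcntq_u8-style: population count of every byte. */
static inline __m128i cntq_u8_sketch(__m128i a) {
    const __m128i lut  = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
    const __m128i mask = _mm_set1_epi8(0x0F);
    __m128i lo = _mm_and_si128(a, mask);
    __m128i hi = _mm_and_si128(_mm_srli_epi16(a, 4), mask);
    return _mm_add_epi8(_mm_shuffle_epi8(lut, lo), _mm_shuffle_epi8(lut, hi));
}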
-//Reciprocal estimate
-_NEON2SSESTORAGE float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0
-_NEON2SSESTORAGE uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0
-_NEON2SSE_GLOBAL float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0
-_NEON2SSESTORAGE uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0
-//Reciprocal square root estimate
-_NEON2SSESTORAGE float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0
-_NEON2SSESTORAGE uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0
-_NEON2SSE_GLOBAL float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0
-_NEON2SSESTORAGE uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0
-//Logical operations
-//Bitwise not
-_NEON2SSESTORAGE int8x8_t vmvn_s8(int8x8_t a); // VMVN d0,d0
-_NEON2SSESTORAGE int16x4_t vmvn_s16(int16x4_t a); // VMVN d0,d0
-_NEON2SSESTORAGE int32x2_t vmvn_s32(int32x2_t a); // VMVN d0,d0
-_NEON2SSE_GLOBAL uint8x8_t vmvn_u8(uint8x8_t a); // VMVN d0,d0
-_NEON2SSE_GLOBAL uint16x4_t vmvn_u16(uint16x4_t a); // VMVN d0,d0
-_NEON2SSE_GLOBAL uint32x2_t vmvn_u32(uint32x2_t a); // VMVN d0,d0
-_NEON2SSE_GLOBAL poly8x8_t vmvn_p8(poly8x8_t a); // VMVN d0,d0
-_NEON2SSESTORAGE int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0
-_NEON2SSESTORAGE int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0
-_NEON2SSESTORAGE int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0
-_NEON2SSE_GLOBAL uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0
-_NEON2SSE_GLOBAL uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0
-_NEON2SSE_GLOBAL uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0
-_NEON2SSE_GLOBAL poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0
-//Bitwise and
-_NEON2SSESTORAGE int8x8_t vand_s8(int8x8_t a, int8x8_t b); // VAND d0,d0,d0
-_NEON2SSESTORAGE int16x4_t vand_s16(int16x4_t a, int16x4_t b); // VAND d0,d0,d0
-_NEON2SSESTORAGE int32x2_t vand_s32(int32x2_t a, int32x2_t b); // VAND d0,d0,d0
-_NEON2SSESTORAGE int64x1_t vand_s64(int64x1_t a, int64x1_t b); // VAND d0,d0,d0
-_NEON2SSE_GLOBAL uint8x8_t vand_u8(uint8x8_t a, uint8x8_t b); // VAND d0,d0,d0
-_NEON2SSE_GLOBAL uint16x4_t vand_u16(uint16x4_t a, uint16x4_t b); // VAND d0,d0,d0
-_NEON2SSE_GLOBAL uint32x2_t vand_u32(uint32x2_t a, uint32x2_t b); // VAND d0,d0,d0
-_NEON2SSE_GLOBAL uint64x1_t vand_u64(uint64x1_t a, uint64x1_t b); // VAND d0,d0,d0
-_NEON2SSE_GLOBAL int8x16_t vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0
-_NEON2SSE_GLOBAL int16x8_t vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0
-_NEON2SSE_GLOBAL int32x4_t vandq_s32(int32x4_t a, int32x4_t b); // VAND q0,q0,q0
-_NEON2SSE_GLOBAL int64x2_t vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0
-_NEON2SSE_GLOBAL uint8x16_t vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0
-_NEON2SSE_GLOBAL uint16x8_t vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0
-_NEON2SSE_GLOBAL uint32x4_t vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0
-_NEON2SSE_GLOBAL uint64x2_t vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0
-//Bitwise or
-_NEON2SSESTORAGE int8x8_t vorr_s8(int8x8_t a, int8x8_t b); // VORR d0,d0,d0
-_NEON2SSESTORAGE int16x4_t vorr_s16(int16x4_t a, int16x4_t b); // VORR d0,d0,d0
-_NEON2SSESTORAGE int32x2_t vorr_s32(int32x2_t a, int32x2_t b); // VORR d0,d0,d0
-_NEON2SSESTORAGE int64x1_t vorr_s64(int64x1_t a, int64x1_t b); // VORR d0,d0,d0
-_NEON2SSE_GLOBAL uint8x8_t vorr_u8(uint8x8_t a, uint8x8_t b); // VORR d0,d0,d0
-_NEON2SSE_GLOBAL uint16x4_t vorr_u16(uint16x4_t a, uint16x4_t b); // VORR d0,d0,d0
-_NEON2SSE_GLOBAL uint32x2_t vorr_u32(uint32x2_t a, uint32x2_t b); // VORR d0,d0,d0
-_NEON2SSE_GLOBAL uint64x1_t vorr_u64(uint64x1_t a, uint64x1_t b); // VORR d0,d0,d0
-_NEON2SSE_GLOBAL int8x16_t vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0
-_NEON2SSE_GLOBAL int16x8_t vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0
-_NEON2SSE_GLOBAL int32x4_t vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0
-_NEON2SSE_GLOBAL int64x2_t vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0
-_NEON2SSE_GLOBAL uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0
-_NEON2SSE_GLOBAL uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0
-_NEON2SSE_GLOBAL uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0
-_NEON2SSE_GLOBAL uint64x2_t vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0
-//Bitwise exclusive or (EOR or XOR)
-_NEON2SSESTORAGE int8x8_t veor_s8(int8x8_t a, int8x8_t b); // VEOR d0,d0,d0
-_NEON2SSE_GLOBAL int16x4_t veor_s16(int16x4_t a, int16x4_t b); // VEOR d0,d0,d0
-_NEON2SSE_GLOBAL int32x2_t veor_s32(int32x2_t a, int32x2_t b); // VEOR d0,d0,d0
-_NEON2SSESTORAGE int64x1_t veor_s64(int64x1_t a, int64x1_t b); // VEOR d0,d0,d0
-_NEON2SSE_GLOBAL uint8x8_t veor_u8(uint8x8_t a, uint8x8_t b); // VEOR d0,d0,d0
-_NEON2SSE_GLOBAL uint16x4_t veor_u16(uint16x4_t a, uint16x4_t b); // VEOR d0,d0,d0
-_NEON2SSE_GLOBAL uint32x2_t veor_u32(uint32x2_t a, uint32x2_t b); // VEOR d0,d0,d0
-_NEON2SSE_GLOBAL uint64x1_t veor_u64(uint64x1_t a, uint64x1_t b); // VEOR d0,d0,d0
-_NEON2SSE_GLOBAL int8x16_t veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0
-_NEON2SSE_GLOBAL int16x8_t veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0
-_NEON2SSE_GLOBAL int32x4_t veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0
-_NEON2SSE_GLOBAL int64x2_t veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0
-_NEON2SSE_GLOBAL uint8x16_t veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0
-_NEON2SSE_GLOBAL uint16x8_t veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0
-_NEON2SSE_GLOBAL uint32x4_t veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0
-_NEON2SSE_GLOBAL uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0
-//Bit Clear
-_NEON2SSESTORAGE int8x8_t vbic_s8(int8x8_t a, int8x8_t b); // VBIC d0,d0,d0
-_NEON2SSE_GLOBAL int16x4_t vbic_s16(int16x4_t a, int16x4_t b); // VBIC d0,d0,d0
-_NEON2SSE_GLOBAL int32x2_t vbic_s32(int32x2_t a, int32x2_t b); // VBIC d0,d0,d0
-_NEON2SSESTORAGE int64x1_t vbic_s64(int64x1_t a, int64x1_t b); // VBIC d0,d0,d0
-_NEON2SSE_GLOBAL uint8x8_t vbic_u8(uint8x8_t a, uint8x8_t b); // VBIC d0,d0,d0
-_NEON2SSE_GLOBAL uint16x4_t vbic_u16(uint16x4_t a, uint16x4_t b); // VBIC d0,d0,d0
-_NEON2SSE_GLOBAL uint32x2_t vbic_u32(uint32x2_t a, uint32x2_t b); // VBIC d0,d0,d0
-_NEON2SSE_GLOBAL uint64x1_t vbic_u64(uint64x1_t a, uint64x1_t b); // VBIC d0,d0,d0
-_NEON2SSE_GLOBAL int8x16_t vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0
-_NEON2SSE_GLOBAL int16x8_t vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0
-_NEON2SSE_GLOBAL int32x4_t vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0
-_NEON2SSE_GLOBAL int64x2_t vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0
-_NEON2SSE_GLOBAL uint8x16_t vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0
-_NEON2SSE_GLOBAL uint16x8_t vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0
-_NEON2SSE_GLOBAL uint32x4_t vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0
-_NEON2SSE_GLOBAL uint64x2_t vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0
-//Bitwise OR complement
-_NEON2SSESTORAGE int8x8_t vorn_s8(int8x8_t a, int8x8_t b); // VORN d0,d0,d0
-_NEON2SSESTORAGE int16x4_t vorn_s16(int16x4_t a, int16x4_t b); // VORN d0,d0,d0
-_NEON2SSESTORAGE int32x2_t vorn_s32(int32x2_t a, int32x2_t b); // VORN d0,d0,d0
-_NEON2SSESTORAGE int64x1_t vorn_s64(int64x1_t a, int64x1_t b); // VORN d0,d0,d0
-_NEON2SSE_GLOBAL uint8x8_t vorn_u8(uint8x8_t a, uint8x8_t b); // VORN d0,d0,d0
-_NEON2SSE_GLOBAL uint16x4_t vorn_u16(uint16x4_t a, uint16x4_t b); // VORN d0,d0,d0
-_NEON2SSE_GLOBAL uint32x2_t vorn_u32(uint32x2_t a, uint32x2_t b); // VORN d0,d0,d0
-_NEON2SSE_GLOBAL uint64x1_t vorn_u64(uint64x1_t a, uint64x1_t b); // VORN d0,d0,d0
-_NEON2SSESTORAGE int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0
-_NEON2SSESTORAGE int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0
-_NEON2SSESTORAGE int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0
-_NEON2SSESTORAGE int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0
-_NEON2SSESTORAGE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0
-_NEON2SSESTORAGE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0
-_NEON2SSESTORAGE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0
-_NEON2SSE_GLOBAL uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0
-//Bitwise Select
-_NEON2SSESTORAGE int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c); // VBSL d0,d0,d0
-_NEON2SSE_GLOBAL int16x4_t vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c); // VBSL d0,d0,d0
-_NEON2SSE_GLOBAL int32x2_t vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c); // VBSL d0,d0,d0
-_NEON2SSESTORAGE int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c); // VBSL d0,d0,d0
-_NEON2SSE_GLOBAL uint8x8_t vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VBSL d0,d0,d0
-_NEON2SSE_GLOBAL uint16x4_t vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VBSL d0,d0,d0
-_NEON2SSE_GLOBAL uint32x2_t vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VBSL d0,d0,d0
-_NEON2SSE_GLOBAL uint64x1_t vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c); // VBSL d0,d0,d0
-_NEON2SSESTORAGE float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c); // VBSL d0,d0,d0
-_NEON2SSE_GLOBAL poly8x8_t vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c); // VBSL d0,d0,d0
-_NEON2SSE_GLOBAL poly16x4_t vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c); // VBSL d0,d0,d0
-_NEON2SSESTORAGE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0
-_NEON2SSE_GLOBAL int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0
-_NEON2SSE_GLOBAL int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0
-_NEON2SSE_GLOBAL int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0
-_NEON2SSE_GLOBAL uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0
-_NEON2SSE_GLOBAL uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0
-_NEON2SSE_GLOBAL uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0
-_NEON2SSE_GLOBAL uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0
-_NEON2SSESTORAGE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0
-_NEON2SSE_GLOBAL poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0
-_NEON2SSE_GLOBAL poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0
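VBSL is a plain bit blend: take bits of the second operand where the mask is 1 and bits of the third where it is 0. In SSE2 that is three logical ops:

#include <emmintrin.h> /* SSE2 */

/* vbslq-style select: (mask & b) | (~mask & c). */
static inline __m128i bslq_sketch(__m128i mask, __m128i b, __m128i c) {
    return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, c));
}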
-//Transposition operations
-//Transpose elements
-_NEON2SSESTORAGE int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b); // VTRN.8 d0,d0
-_NEON2SSESTORAGE int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b); // VTRN.16 d0,d0
-_NEON2SSESTORAGE int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b); // VTRN.32 d0,d0
-_NEON2SSE_GLOBAL uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b); // VTRN.8 d0,d0
-_NEON2SSE_GLOBAL uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b); // VTRN.16 d0,d0
-_NEON2SSE_GLOBAL uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b); // VTRN.32 d0,d0
-_NEON2SSESTORAGE float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b); // VTRN.32 d0,d0
-_NEON2SSE_GLOBAL poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b); // VTRN.8 d0,d0
-_NEON2SSE_GLOBAL poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b); // VTRN.16 d0,d0
-_NEON2SSESTORAGE int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0
-_NEON2SSESTORAGE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0
-_NEON2SSESTORAGE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0
-_NEON2SSE_GLOBAL uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0
-_NEON2SSE_GLOBAL uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0
-_NEON2SSE_GLOBAL uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0
-_NEON2SSESTORAGE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0
-_NEON2SSE_GLOBAL poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0
-_NEON2SSE_GLOBAL poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0
-//Interleave elements
-_NEON2SSESTORAGE int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b); // VZIP.8 d0,d0
-_NEON2SSESTORAGE int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b); // VZIP.16 d0,d0
-_NEON2SSE_GLOBAL int32x2x2_t vzip_s32(int32x2_t a, int32x2_t b); // VZIP.32 d0,d0
-_NEON2SSE_GLOBAL uint8x8x2_t vzip_u8(uint8x8_t a, uint8x8_t b); // VZIP.8 d0,d0
-_NEON2SSE_GLOBAL uint16x4x2_t vzip_u16(uint16x4_t a, uint16x4_t b); // VZIP.16 d0,d0
-_NEON2SSE_GLOBAL uint32x2x2_t vzip_u32(uint32x2_t a, uint32x2_t b); // VZIP.32 d0,d0
-_NEON2SSE_GLOBAL float32x2x2_t vzip_f32(float32x2_t a, float32x2_t b); // VZIP.32 d0,d0
-_NEON2SSE_GLOBAL poly8x8x2_t vzip_p8(poly8x8_t a, poly8x8_t b); // VZIP.8 d0,d0
-_NEON2SSE_GLOBAL poly16x4x2_t vzip_p16(poly16x4_t a, poly16x4_t b); // VZIP.16 d0,d0
-_NEON2SSESTORAGE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0
-_NEON2SSESTORAGE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0
-_NEON2SSESTORAGE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0
-_NEON2SSE_GLOBAL uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0
-_NEON2SSE_GLOBAL uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0
-_NEON2SSE_GLOBAL uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0
-_NEON2SSESTORAGE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0
-_NEON2SSE_GLOBAL poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0
-_NEON2SSE_GLOBAL poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0
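The quadword zip is exactly the SSE2 unpack pair. An illustrative sketch with a stand-in for the NEON two-vector result type:

#include <emmintrin.h> /* SSE2 */

typedef struct { __m128i val[2]; } i8x16x2_sketch_t; /* stand-in for int8x16x2_t */

/* vzipq_s8-style interleave: val[0] = a0,b0,a1,b1,...; val[1] = a8,b8,a9,b9,... */
static inline i8x16x2_sketch_t zipq_s8_sketch(__m128i a, __m128i b) {
    i8x16x2_sketch_t r;
    r.val[0] = _mm_unpacklo_epi8(a, b);
    r.val[1] = _mm_unpackhi_epi8(a, b);
    return r;
}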
-//De-Interleave elements
-_NEON2SSESTORAGE int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b); // VUZP.8 d0,d0
-_NEON2SSESTORAGE int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b); // VUZP.16 d0,d0
-_NEON2SSESTORAGE int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b); // VUZP.32 d0,d0
-_NEON2SSE_GLOBAL uint8x8x2_t vuzp_u8(uint8x8_t a, uint8x8_t b); // VUZP.8 d0,d0
-_NEON2SSE_GLOBAL uint16x4x2_t vuzp_u16(uint16x4_t a, uint16x4_t b); // VUZP.16 d0,d0
-_NEON2SSE_GLOBAL uint32x2x2_t vuzp_u32(uint32x2_t a, uint32x2_t b); // VUZP.32 d0,d0
-_NEON2SSE_GLOBAL float32x2x2_t vuzp_f32(float32x2_t a, float32x2_t b); // VUZP.32 d0,d0
-_NEON2SSE_GLOBAL poly8x8x2_t vuzp_p8(poly8x8_t a, poly8x8_t b); // VUZP.8 d0,d0
-_NEON2SSE_GLOBAL poly16x4x2_t vuzp_p16(poly16x4_t a, poly16x4_t b); // VUZP.16 d0,d0
-_NEON2SSESTORAGE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0
-_NEON2SSESTORAGE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0
-_NEON2SSESTORAGE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0
-_NEON2SSE_GLOBAL uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0
-_NEON2SSE_GLOBAL uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0
-_NEON2SSE_GLOBAL uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0
-_NEON2SSESTORAGE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0
-_NEON2SSE_GLOBAL poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0
-_NEON2SSE_GLOBAL poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
-
-_NEON2SSESTORAGE float32x4_t vrndnq_f32(float32x4_t a); // VRND.F32 q0,q0
-_NEON2SSESTORAGE float64x2_t vrndnq_f64(float64x2_t a); // VRND.F64 q0,q0
-
-//Sqrt
-_NEON2SSE_GLOBAL float32x4_t vsqrtq_f32(float32x4_t a); // VSQRT.F32 q0,q0
-_NEON2SSE_GLOBAL float64x2_t vsqrtq_f64(float64x2_t a); // VSQRT.F64 q0,q0
-
-
-//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-// The following macros solve the "immediate parameter required" problem for some x86 intrinsics:
-// without them the code would fail to compile with the "Intrinsic parameter must be an immediate value" error.
-//
-#if  ( defined (__INTEL_COMPILER)  && !defined(__llvm__) )
-#   define _MM_ALIGNR_EPI8 _mm_alignr_epi8
-#   define _MM_EXTRACT_EPI16  (int16_t) _mm_extract_epi16
-#   define _MM_INSERT_EPI16 _mm_insert_epi16
-#   ifdef USE_SSE4
-#       define _MM_EXTRACT_EPI8  _mm_extract_epi8
-#       define _MM_EXTRACT_EPI32  _mm_extract_epi32
-#       define _MM_EXTRACT_PS  _mm_extract_ps
-#       define _MM_INSERT_EPI8  _mm_insert_epi8
-#       define _MM_INSERT_EPI32 _mm_insert_epi32
-#       define _MM_INSERT_PS    _mm_insert_ps
-#       ifdef  _NEON2SSE_64BIT
-#           define _MM_INSERT_EPI64 _mm_insert_epi64
-#           define _MM_EXTRACT_EPI64 _mm_extract_epi64
-#       endif
-#   endif //SSE4
-#else
-#   define _NEON2SSE_COMMA ,
-#   define _NEON2SSE_SWITCH16(NAME, a, b, LANE) \
-        switch(LANE)         \
-        {                \
-        case 0:     return NAME(a b, 0); \
-        case 1:     return NAME(a b, 1); \
-        case 2:     return NAME(a b, 2); \
-        case 3:     return NAME(a b, 3); \
-        case 4:     return NAME(a b, 4); \
-        case 5:     return NAME(a b, 5); \
-        case 6:     return NAME(a b, 6); \
-        case 7:     return NAME(a b, 7); \
-        case 8:     return NAME(a b, 8); \
-        case 9:     return NAME(a b, 9); \
-        case 10:    return NAME(a b, 10); \
-        case 11:    return NAME(a b, 11); \
-        case 12:    return NAME(a b, 12); \
-        case 13:    return NAME(a b, 13); \
-        case 14:    return NAME(a b, 14); \
-        case 15:    return NAME(a b, 15); \
-        default:    return NAME(a b, 0); \
-        }
-
-#   define _NEON2SSE_SWITCH8(NAME, vec, LANE, p) \
-        switch(LANE)              \
-        {                          \
-        case 0:  return NAME(vec p,0); \
-        case 1:  return NAME(vec p,1); \
-        case 2:  return NAME(vec p,2); \
-        case 3:  return NAME(vec p,3); \
-        case 4:  return NAME(vec p,4); \
-        case 5:  return NAME(vec p,5); \
-        case 6:  return NAME(vec p,6); \
-        case 7:  return NAME(vec p,7); \
-        default: return NAME(vec p,0); \
-        }
-
-#   define _NEON2SSE_SWITCH4(NAME, case0, case1, case2, case3, vec, LANE, p) \
-        switch(LANE)              \
-        {                          \
-        case case0:  return NAME(vec p,case0); \
-        case case1:  return NAME(vec p,case1); \
-        case case2:  return NAME(vec p,case2); \
-        case case3:  return NAME(vec p,case3); \
-        default:     return NAME(vec p,case0); \
-        }
-
-    _NEON2SSE_INLINE __m128i _MM_ALIGNR_EPI8(__m128i a, __m128i b, int LANE)
-    {
-        _NEON2SSE_SWITCH16(_mm_alignr_epi8, a, _NEON2SSE_COMMA b, LANE)
-    }
-
-    _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI16(__m128i vec, int p, const int LANE)
-    {
-        _NEON2SSE_SWITCH8(_mm_insert_epi16, vec, LANE, _NEON2SSE_COMMA p)
-    }
-
-    _NEON2SSE_INLINE int16_t _MM_EXTRACT_EPI16(__m128i vec, const int LANE)
-    {
-        _NEON2SSE_SWITCH8((int16_t)_mm_extract_epi16, vec, LANE,)
-    }
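To make the purpose of these wrappers concrete, here is a hedged usage sketch: a lane index that is only known at run time still compiles, because the switch expands into calls whose last argument is a literal immediate.

/* Illustrative only; relies on the _MM_EXTRACT_EPI16 wrapper defined just above. */
static int16_t extract_lane_at_runtime(__m128i v, int lane /* 0..7, not a constant */) {
    return _MM_EXTRACT_EPI16(v, lane); /* the raw _mm_extract_epi16 would typically be
                                          rejected here, since it requires an immediate */
}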
-
-#ifdef USE_SSE4
-        _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE)
-        {
-            _NEON2SSE_SWITCH4(_mm_extract_epi32, 0,1,2,3, vec, LANE,)
-        }
-
-        _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE)
-        {
-            _NEON2SSE_SWITCH4(_mm_extract_ps, 0,1,2,3, vec, LANE,)
-        }
-
-        _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE)
-        {
-            _NEON2SSE_SWITCH16(_mm_extract_epi8, vec, , LANE)
-        }
-
-        _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI32(__m128i vec, int p, const int LANE)
-        {
-            _NEON2SSE_SWITCH4(_mm_insert_epi32, 0, 1, 2, 3, vec, LANE, _NEON2SSE_COMMA p)
-        }
-
-        _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI8(__m128i vec, int p, const int LANE)
-        {
-            _NEON2SSE_SWITCH16(_mm_insert_epi8, vec, _NEON2SSE_COMMA p, LANE)
-        }
-
-#ifdef  _NEON2SSE_64BIT
-            //the special case of functions available only for SSE4 and 64-bit build.
-            _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI64(__m128i vec, int64_t p, const int LANE)
-            {
-                switch(LANE) {
-                case 0:
-                    return _mm_insert_epi64(vec,  p, 0);
-                case 1:
-                    return _mm_insert_epi64(vec,  p, 1);
-                default:
-                    return _mm_insert_epi64(vec,  p, 0);
-                }
-            }
-
-            _NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64(__m128i val, const int LANE)
-            {
-                if (LANE ==0) return _mm_extract_epi64(val, 0);
-                else return _mm_extract_epi64(val, 1);
-            }
-#endif
-
-        _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE)
-        {
-            _NEON2SSE_SWITCH4(_mm_insert_ps, 0, 16, 32, 48, vec, LANE, _NEON2SSE_COMMA p)
-        }
-
-#endif //USE_SSE4
-
-#endif     // defined (__INTEL_COMPILER) && !defined(__llvm__)
-
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-// Below are some helper functions, used either to "emulate" SSE4 intrinsics on SSSE3-only devices
-// or to implement some specific, commonly used operations that are missing from SSE
-#ifdef USE_SSE4
-#   define _MM_CVTEPU8_EPI16  _mm_cvtepu8_epi16
-#   define _MM_CVTEPU16_EPI32 _mm_cvtepu16_epi32
-#   define _MM_CVTEPU32_EPI64  _mm_cvtepu32_epi64
-
-#   define _MM_CVTEPI8_EPI16  _mm_cvtepi8_epi16
-#   define _MM_CVTEPI16_EPI32 _mm_cvtepi16_epi32
-#   define _MM_CVTEPI32_EPI64  _mm_cvtepi32_epi64
-
-#   define _MM_MAX_EPI8  _mm_max_epi8
-#   define _MM_MAX_EPI32 _mm_max_epi32
-#   define _MM_MAX_EPU16 _mm_max_epu16
-#   define _MM_MAX_EPU32 _mm_max_epu32
-
-#   define _MM_MIN_EPI8  _mm_min_epi8
-#   define _MM_MIN_EPI32 _mm_min_epi32
-#   define _MM_MIN_EPU16 _mm_min_epu16
-#   define _MM_MIN_EPU32 _mm_min_epu32
-
-#   define _MM_BLENDV_EPI8 _mm_blendv_epi8
-#   define _MM_PACKUS_EPI32 _mm_packus_epi32
-#   define _MM_PACKUS1_EPI32(a) _mm_packus_epi32(a, a)
-
-#   define _MM_MULLO_EPI32 _mm_mullo_epi32
-#   define _MM_MUL_EPI32  _mm_mul_epi32
-
-#   define _MM_CMPEQ_EPI64 _mm_cmpeq_epi64
-#else     //no SSE4 !!!!!!
-    _NEON2SSE_INLINE __m128i _MM_CVTEPU8_EPI16(__m128i a)
-    {
-        __m128i zero = _mm_setzero_si128();
-        return _mm_unpacklo_epi8(a, zero);
-    }
-
-    _NEON2SSE_INLINE __m128i _MM_CVTEPU16_EPI32(__m128i a)
-    {
-        __m128i zero = _mm_setzero_si128();
-        return _mm_unpacklo_epi16(a, zero);
-    }
-
-    _NEON2SSE_INLINE __m128i _MM_CVTEPU32_EPI64(__m128i a)
-    {
-        __m128i zero = _mm_setzero_si128();
-        return _mm_unpacklo_epi32(a, zero);
-    }
-
-    _NEON2SSE_INLINE __m128i _MM_CVTEPI8_EPI16(__m128i a)
-    {
-        __m128i zero = _mm_setzero_si128();
-        __m128i sign = _mm_cmpgt_epi8(zero, a);
-        return _mm_unpacklo_epi8(a, sign);
-    }
-
-    _NEON2SSE_INLINE __m128i _MM_CVTEPI16_EPI32(__m128i a)
-    {
-        __m128i zero = _mm_setzero_si128();
-        __m128i sign = _mm_cmpgt_epi16(zero, a);
-        return _mm_unpacklo_epi16(a, sign);
-    }
-
-    _NEON2SSE_INLINE __m128i _MM_CVTEPI32_EPI64(__m128i a)
-    {
-        __m128i zero = _mm_setzero_si128();
-        __m128i sign = _mm_cmpgt_epi32(zero, a);
-        return _mm_unpacklo_epi32(a, sign);
-    }
-
-    _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE)
-    {
-        _NEON2SSE_ALIGN_16 int32_t tmp[4];
-        _mm_store_si128((__m128i*)tmp, vec);
-        return tmp[LANE];
-    }
-
-    _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE)
-    {
-        _NEON2SSE_ALIGN_16 int8_t tmp[16];
-        _mm_store_si128((__m128i*)tmp, vec);
-        return (int)tmp[LANE];
-    }
-
-    _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE)
-    {
-        _NEON2SSE_ALIGN_16 int32_t tmp[4];
-        _mm_store_si128((__m128i*)tmp, _M128i(vec));
-        return tmp[LANE];
-    }
-
-    _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI32(__m128i vec, int p, const int LANE)
-    {
-        _NEON2SSE_ALIGN_16 int32_t pvec[4] = {0,0,0,0};
-        _NEON2SSE_ALIGN_16 uint32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff};
-        __m128i vec_masked, p_masked;
-        pvec[LANE] = p;
-        mask[LANE] = 0x0;
-        vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
-        p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
-        return _mm_or_si128(vec_masked, p_masked);
-    }
-
-    _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI8(__m128i vec, int p, const int LANE)
-    {
-        _NEON2SSE_ALIGN_16 int8_t pvec[16] = {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0};
-        _NEON2SSE_ALIGN_16 uint8_t mask[16] = {0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};
-        __m128i vec_masked, p_masked;
-        pvec[LANE] = (int8_t)p;
-        mask[LANE] = 0x0;
-        vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
-        p_masked = _mm_andnot_si128  (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
-        return _mm_or_si128(vec_masked, p_masked);
-    }
-
-    _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE)
-    {
-        _NEON2SSE_ALIGN_16 uint32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff};
-        __m128 tmp, vec_masked, p_masked;
-        mask[LANE >> 4] = 0x0; //here LANE is the _mm_insert_ps immediate (lane << 4), not the actual lane index, hence the >> 4
-        vec_masked = _mm_and_ps (*(__m128*)mask,vec); //ready for p
-        p_masked = _mm_andnot_ps (*(__m128*)mask, p); //ready for vec
-        tmp = _mm_or_ps(vec_masked, p_masked);
-        return tmp;
-    }
-
-    _NEON2SSE_INLINE __m128i _MM_MAX_EPI8(__m128i a, __m128i b)
-    {
-        __m128i cmp, resa, resb;
-        cmp = _mm_cmpgt_epi8 (a, b);
-        resa = _mm_and_si128 (cmp, a);
-        resb = _mm_andnot_si128 (cmp,b);
-        return _mm_or_si128(resa, resb);
-    }
-
-    _NEON2SSE_INLINE __m128i _MM_MAX_EPI32(__m128i a, __m128i b)
-    {
-        __m128i cmp, resa, resb;
-        cmp = _mm_cmpgt_epi32(a, b);
-        resa = _mm_and_si128 (cmp, a);
-        resb = _mm_andnot_si128 (cmp,b);
-        return _mm_or_si128(resa, resb);
-    }
-
-    _NEON2SSE_INLINE __m128i _MM_MAX_EPU16(__m128i a, __m128i b)
-    {
-        __m128i c8000, b_s, a_s, cmp;
-        c8000 = _mm_cmpeq_epi16 (a,a); //0xffff
-        c8000 = _mm_slli_epi16 (c8000, 15); //0x8000
-        b_s = _mm_sub_epi16 (b, c8000);
-        a_s = _mm_sub_epi16 (a, c8000);
-        cmp = _mm_cmpgt_epi16 (a_s, b_s); //no unsigned comparison, need to go to signed
-        a_s = _mm_and_si128 (cmp,a);
-        b_s = _mm_andnot_si128 (cmp,b);
-        return _mm_or_si128(a_s, b_s);
-    }
-
-    _NEON2SSE_INLINE __m128i _MM_MAX_EPU32(__m128i a, __m128i b)
-    {
-        __m128i c80000000, b_s, a_s, cmp;
-        c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff
-        c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000
-        b_s = _mm_sub_epi32 (b, c80000000);
-        a_s = _mm_sub_epi32 (a, c80000000);
-        cmp = _mm_cmpgt_epi32 (a_s, b_s); //no unsigned comparison, need to go to signed
-        a_s = _mm_and_si128 (cmp,a);
-        b_s = _mm_andnot_si128 (cmp,b);
-        return _mm_or_si128(a_s, b_s);
-    }
-
-    _NEON2SSE_INLINE __m128i _MM_MIN_EPI8(__m128i a, __m128i b)
-    {
-        __m128i cmp, resa, resb;
-        cmp = _mm_cmpgt_epi8 (b, a);
-        resa = _mm_and_si128 (cmp, a);
-        resb = _mm_andnot_si128 (cmp,b);
-        return _mm_or_si128(resa, resb);
-    }
-
-    _NEON2SSE_INLINE __m128i _MM_MIN_EPI32(__m128i a, __m128i b)
-    {
-        __m128i cmp, resa, resb;
-        cmp = _mm_cmpgt_epi32(b, a);
-        resa = _mm_and_si128 (cmp, a);
-        resb = _mm_andnot_si128 (cmp,b);
-        return _mm_or_si128(resa, resb);
-    }
-
-    _NEON2SSE_INLINE __m128i _MM_MIN_EPU16(__m128i a, __m128i b)
-    {
-        __m128i c8000, b_s, a_s, cmp;
-        c8000 = _mm_cmpeq_epi16 (a,a); //0xffff
-        c8000 = _mm_slli_epi16 (c8000, 15); //0x8000
-        b_s = _mm_sub_epi16 (b, c8000);
-        a_s = _mm_sub_epi16 (a, c8000);
-        cmp = _mm_cmpgt_epi16 (b_s, a_s); //no unsigned comparison, need to go to signed
-        a_s = _mm_and_si128 (cmp,a);
-        b_s = _mm_andnot_si128 (cmp,b);
-        return _mm_or_si128(a_s, b_s);
-    }
-
-    _NEON2SSE_INLINE __m128i _MM_MIN_EPU32(__m128i a, __m128i b)
-    {
-        __m128i c80000000, b_s, a_s, cmp;
-        c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff
-        c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000
-        b_s = _mm_sub_epi32 (b, c80000000);
-        a_s = _mm_sub_epi32 (a, c80000000);
-        cmp = _mm_cmpgt_epi32 (b_s, a_s); //no unsigned comparison, need to go to signed
-        a_s = _mm_and_si128 (cmp,a);
-        b_s = _mm_andnot_si128 (cmp,b);
-        return _mm_or_si128(a_s, b_s);
-    }
-
-    _NEON2SSE_INLINE __m128i  _MM_BLENDV_EPI8(__m128i a, __m128i b, __m128i mask) //this is NOT an exact implementation of _mm_blendv_epi8 - please see below
-    {
-        //it assumes every mask byte is either 0xff or 0 (as in all use cases below), while the original _mm_blendv_epi8 only looks at the MSB of each mask byte.
-        __m128i a_masked, b_masked;
-        b_masked = _mm_and_si128 (mask,b); //use b if mask 0xff
-        a_masked = _mm_andnot_si128 (mask,a);
-        return _mm_or_si128(a_masked, b_masked);
-    }
-
-    _NEON2SSE_INLINE __m128i _MM_PACKUS_EPI32(__m128i a, __m128i b)
-    {
-        __m128i a16, b16, res, reshi,cmp, zero;
-        zero = _mm_setzero_si128();
-        a16 = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd);
-        b16 = _mm_shuffle_epi8 (b, *(__m128i*) mask8_32_even_odd);
-        res = _mm_unpacklo_epi64(a16, b16); //result without saturation
-        reshi = _mm_unpackhi_epi64(a16, b16); //hi part of result used for saturation
-        cmp = _mm_cmpgt_epi16(zero, reshi); //mask of lanes where reshi < 0: there the result should be zero
-        res = _mm_andnot_si128(cmp,res); //if the mask is zero do nothing, otherwise the lane is negative and the result becomes 0
-        cmp = _mm_cmpgt_epi16(reshi,zero); //mask of lanes where reshi is positive
-        return _mm_or_si128(res, cmp); //a positive reshi means we are out of 16 bits, so saturate to 0xffff
-    }
-
-    _NEON2SSE_INLINE __m128i _MM_PACKUS1_EPI32(__m128i a)
-    {
-        __m128i a16, res, reshi,cmp, zero;
-        zero = _mm_setzero_si128();
-        a16 = _mm_shuffle_epi8 (a, *(__m128i*)mask8_32_even_odd);
-        reshi = _mm_unpackhi_epi64(a16, a16); //hi part of result used for saturation
-        cmp = _mm_cmpgt_epi16(zero, reshi); //mask of lanes where reshi < 0: there the result should be zero
-        res = _mm_andnot_si128(cmp, a16); //if the mask is zero do nothing, otherwise the lane is negative and the result becomes 0
-        cmp = _mm_cmpgt_epi16(reshi,zero); //mask of lanes where reshi is positive
-        return _mm_or_si128(res, cmp); //a positive reshi means we are out of 16 bits, so saturate to 0xffff
-    }
-
-    // method used by GCC with generic vector extensions
-    _NEON2SSE_INLINE __m128i _MM_MULLO_EPI32(__m128i a, __m128i b)
-    {
-        __m128i a_high = _mm_srli_epi64(a, 32);
-        __m128i low = _mm_mul_epu32(a, b);
-        __m128i b_high = _mm_srli_epi64(b, 32);
-        __m128i high = _mm_mul_epu32(a_high, b_high);
-        low = _mm_shuffle_epi32(low, _MM_SHUFFLE(0, 0, 2, 0));
-        high = _mm_shuffle_epi32(high, _MM_SHUFFLE(0, 0, 2, 0));
-        return _mm_unpacklo_epi32(low, high);
-    }
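A minimal lane-wise sketch of what the SSE2 emulation above reassembles, assuming plain C semantics (the helper name is hypothetical):

    #include <stdint.h>

    /* Each output lane keeps only the low 32 bits of the 64-bit product, which is
       exactly what the _mm_mul_epu32 / shuffle / unpack sequence puts back together. */
    static inline void mullo_epi32_ref(const uint32_t a[4], const uint32_t b[4], uint32_t out[4]) {
        for (int i = 0; i < 4; i++)
            out[i] = a[i] * b[i]; /* unsigned wrap-around == low 32 bits of the full product */
    }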
-
-    _NEON2SSE_INLINE __m128i _MM_MUL_EPI32(__m128i a, __m128i b)
-    {
-        __m128i sign, zero,  mul_us, a_neg, b_neg, mul_us_neg;
-        sign = _mm_xor_si128 (a, b);
-        sign =  _mm_srai_epi32 (sign, 31); //propagate the sign bit: each lane becomes all ones if negative and all zeros if positive
-        sign = _mm_shuffle_epi32(sign, _MM_SHUFFLE(2, 2, 0, 0)); //duplicate the signs of lanes 0 and 2 across each 64-bit result lane
-        zero = _mm_setzero_si128();
-        a_neg = _mm_abs_epi32 (a); //take the absolute value of a
-        b_neg = _mm_abs_epi32 (b); //take the absolute value of b
-        mul_us = _mm_mul_epu32 (a_neg, b_neg); //uses the 0th and 2nd data lanes (absolute values); the multiplication gives a 64 bit result
-        mul_us_neg = _mm_sub_epi64(zero, mul_us);
-        mul_us_neg = _mm_and_si128(sign, mul_us_neg);
-        mul_us = _mm_andnot_si128(sign, mul_us);
-        return _mm_or_si128 (mul_us, mul_us_neg);
-    }
-
-    _NEON2SSE_INLINE __m128i _MM_CMPEQ_EPI64(__m128i a, __m128i b)
-    {
-        __m128i res;
-        res = _mm_cmpeq_epi32 (a, b);
-        return _mm_shuffle_epi32 (res, 1 | (1 << 2) | (3 << 4) | (3 << 6)); //copy the information from hi to low part of the 64 bit data
-    }
-#endif     //SSE4
-
-//the special case of functions working only for 32 bits, no SSE4
-_NEON2SSE_INLINE __m128i  _MM_INSERT_EPI64_32(__m128i vec, int64_t p, const int LANE)
-{
-    _NEON2SSE_ALIGN_16 uint64_t pvec[2] = {0,0};
-    _NEON2SSE_ALIGN_16 uint64_t mask[2] = {0xffffffffffffffff, 0xffffffffffffffff};
-    __m128i vec_masked, p_masked;
-    pvec[LANE] = p;
-    mask[LANE] = 0x0;
-    vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
-    p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
-    return _mm_or_si128(vec_masked, p_masked);
-}
-
-_NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64_32(__m128i val, const int LANE)
-{
-    _NEON2SSE_ALIGN_16 int64_t tmp[2];
-    _mm_store_si128((__m128i*)tmp, val);
-    return tmp[LANE];
-}
-
-#ifndef _NEON2SSE_64BIT_SSE4
-#   define _MM_INSERT_EPI64 _MM_INSERT_EPI64_32
-#   define _MM_EXTRACT_EPI64 _MM_EXTRACT_EPI64_32
-#endif
-
-_NEON2SSESTORAGE int32x4_t  vqd_s32(int32x4_t a); //Doubling saturation for signed ints
-_NEON2SSE_INLINE int32x4_t  vqd_s32(int32x4_t a)
-{
-    //Overflow happens only if a and sum have the opposite signs
-    __m128i c7fffffff, res, res_sat, res_xor_a;
-    c7fffffff = _mm_set1_epi32(0x7fffffff);
-    res = _mm_slli_epi32 (a, 1); // res = a*2
-    res_sat = _mm_srli_epi32(a, 31);
-    res_sat = _mm_add_epi32(res_sat, c7fffffff);
-    res_xor_a = _mm_xor_si128(res, a);
-    res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones if < 0, all zeros otherwise
-    res_sat = _mm_and_si128(res_xor_a, res_sat);
-    res = _mm_andnot_si128(res_xor_a, res);
-    return _mm_or_si128(res, res_sat);
-}
-
-
-//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-//*************************************************************************
-//*************************************************************************
-//*****************  Function redefinitions/implementations start here *****
-//*************************************************************************
-//*************************************************************************
-//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-
-/*If a unified intrinsics solution is necessary, please define your SSE intrinsics wrapper here, as in the following sample:
-#ifdef ARM
-#define vector_addq_s32 _mm_add_epi32
-#else //if we have IA
-#define vector_addq_s32 vadd_s32
-#endif
-
-********************************************************************************************
-Functions below are organised in the following way:
-
-Each NEON intrinsic function has one of the following options:
-1.  its full x86 SSE equivalent intrinsic - in this case the x86 version simply follows the NEON one under the corresponding #define statement
-2.  an x86 implementation using more than one x86 intrinsic. In this case it is shaped as an inlined C function with a return statement
-3.  a reference to another NEON function that returns the same result and is implemented in x86 as above. In this case it is shaped as a matching NEON function definition
-4.  for about 5% of functions, where the corresponding x86 SIMD is unavailable or too inefficient, a serial implementation
-is provided along with a corresponding compiler warning. If these functions are on your app's critical path,
-please consider removing them from your code.
-*/
-
-//***********************************************************************
-//************************      Vector add   *****************************
-//***********************************************************************
-_NEON2SSESTORAGE int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0
-_NEON2SSE_INLINE int8x8_t vadd_s8(int8x8_t a, int8x8_t b)
-{
-    int8x8_t res64;
-    return64(_mm_add_epi8(_pM128i(a),_pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0
-_NEON2SSE_INLINE int16x4_t vadd_s16(int16x4_t a, int16x4_t b)
-{
-    int16x4_t res64;
-    return64(_mm_add_epi16(_pM128i(a),_pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0
-_NEON2SSE_INLINE int32x2_t vadd_s32(int32x2_t a, int32x2_t b)
-{
-    int32x2_t res64;
-    return64(_mm_add_epi32(_pM128i(a),_pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE int64x1_t  vadd_s64(int64x1_t a,  int64x1_t b); // VADD.I64 d0,d0,d0
-_NEON2SSE_INLINE int64x1_t  vadd_s64(int64x1_t a,  int64x1_t b)
-{
-    int64x1_t res64;
-    res64.m64_i64[0] = a.m64_i64[0] + b.m64_i64[0];
-    return res64;
-}
-
-
-_NEON2SSESTORAGE float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0
-_NEON2SSE_INLINE float32x2_t vadd_f32(float32x2_t a, float32x2_t b)
-{
-    __m128 res;
-    __m64_128 res64;
-    res = _mm_add_ps(_pM128(a),_pM128(b)); //SSE, use only low 64 bits
-    _M64f(res64, res);
-    return res64;
-}
-
-_NEON2SSE_GLOBAL uint8x8_t  vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0
-#define vadd_u8 vadd_s8
-
-_NEON2SSE_GLOBAL uint16x4_t  vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0
-#define vadd_u16 vadd_s16
-
-_NEON2SSE_GLOBAL uint32x2_t  vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0
-#define vadd_u32 vadd_s32
-
-_NEON2SSESTORAGE uint64x1_t vadd_u64(uint64x1_t a,  uint64x1_t b); // VADD.I64 d0,d0,d0
-_NEON2SSE_INLINE uint64x1_t vadd_u64(uint64x1_t a,  uint64x1_t b)
-{
-    uint64x1_t res64;
-    res64.m64_u64[0] = a.m64_u64[0] + b.m64_u64[0];
-    return res64;
-}
-
-
-_NEON2SSE_GLOBAL int8x16_t   vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0
-#define vaddq_s8 _mm_add_epi8
-
-_NEON2SSE_GLOBAL int16x8_t   vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0
-#define vaddq_s16 _mm_add_epi16
-
-_NEON2SSE_GLOBAL int32x4_t   vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0
-#define vaddq_s32 _mm_add_epi32
-
-_NEON2SSE_GLOBAL int64x2_t   vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0
-#define vaddq_s64 _mm_add_epi64
-
-_NEON2SSE_GLOBAL float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0
-#define vaddq_f32 _mm_add_ps
-
-_NEON2SSE_GLOBAL uint8x16_t   vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0
-#define vaddq_u8 _mm_add_epi8
-
-_NEON2SSE_GLOBAL uint16x8_t   vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0
-#define vaddq_u16 _mm_add_epi16
-
-_NEON2SSE_GLOBAL uint32x4_t   vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0
-#define vaddq_u32 _mm_add_epi32
-
-_NEON2SSE_GLOBAL uint64x2_t   vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0
-#define vaddq_u64 _mm_add_epi64
-
-//**************************** Vector long add *****************************:
-//***********************************************************************
-//Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
-_NEON2SSESTORAGE int16x8_t  vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0
-_NEON2SSE_INLINE int16x8_t  vaddl_s8(int8x8_t a, int8x8_t b) // VADDL.S8 q0,d0,d0
-{
-    __m128i a16, b16;
-    a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
-    b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
-    return _mm_add_epi16 (a16, b16);
-}
-
-_NEON2SSESTORAGE int32x4_t  vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0
-_NEON2SSE_INLINE int32x4_t  vaddl_s16(int16x4_t a, int16x4_t b) // VADDL.S16 q0,d0,d0
-{
-    __m128i a32, b32;
-    a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
-    b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1
-    return _mm_add_epi32 (a32, b32);
-}
-
-_NEON2SSESTORAGE int64x2_t  vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0
-_NEON2SSE_INLINE int64x2_t  vaddl_s32(int32x2_t a, int32x2_t b) // VADDL.S32 q0,d0,d0
-{
-    //may not be optimal
-    __m128i a64, b64;
-    a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1
-    b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
-    return _mm_add_epi64 ( a64, b64);
-}
-
-_NEON2SSESTORAGE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0
-_NEON2SSE_INLINE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b) // VADDL.U8 q0,d0,d0
-{
-    __m128i a16, b16;
-    a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1
-    b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1
-    return _mm_add_epi16 (a16, b16);
-}
-
-_NEON2SSESTORAGE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.s16 q0,d0,d0
-_NEON2SSE_INLINE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b) // VADDL.s16 q0,d0,d0
-{
-    __m128i a32, b32;
-    a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1
-    b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1
-    return _mm_add_epi32 (a32, b32);
-}
-
-_NEON2SSESTORAGE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0
-_NEON2SSE_INLINE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b) // VADDL.U32 q0,d0,d0
-{
-    //may not be optimal
-    __m128i a64, b64;
-    a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1
-    b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
-    return _mm_add_epi64 (a64, b64);
-}
-
-//***************   Vector wide add: vaddw_<type>. Vr[i]:=Va[i]+Vb[i] ******************
-//*************** *********************************************************************
-_NEON2SSESTORAGE int16x8_t  vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0
-_NEON2SSE_INLINE int16x8_t  vaddw_s8(int16x8_t a, int8x8_t b) // VADDW.S8 q0,q0,d0
-{
-    __m128i b16;
-    b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
-    return _mm_add_epi16 (a, b16);
-}
-
-_NEON2SSESTORAGE int32x4_t  vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0
-_NEON2SSE_INLINE int32x4_t  vaddw_s16(int32x4_t a, int16x4_t b) // VADDW.S16 q0,q0,d0
-{
-    __m128i b32;
-    b32 =  _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1,
-    return _mm_add_epi32 (a, b32);
-}
-
-_NEON2SSESTORAGE int64x2_t  vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0
-_NEON2SSE_INLINE int64x2_t  vaddw_s32(int64x2_t a, int32x2_t b) // VADDW.S32 q0,q0,d0
-{
-    __m128i b64;
-    b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
-    return _mm_add_epi64 (a, b64);
-}
-
-_NEON2SSESTORAGE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0
-_NEON2SSE_INLINE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b) // VADDW.U8 q0,q0,d0
-{
-    __m128i b16;
-    b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1
-    return _mm_add_epi16 (a, b16);
-}
-
-_NEON2SSESTORAGE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.s16 q0,q0,d0
-_NEON2SSE_INLINE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b) // VADDW.s16 q0,q0,d0
-{
-    __m128i b32;
-    b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1
-    return _mm_add_epi32 (a, b32);
-}
-
-_NEON2SSESTORAGE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0
-_NEON2SSE_INLINE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b) // VADDW.U32 q0,q0,d0
-{
-    __m128i b64;
-    b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
-    return _mm_add_epi64 (a, b64);
-}
-
-//******************************Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1 ,  result truncated *******************************
-//*************************************************************************************************************************
-_NEON2SSESTORAGE int8x8_t vhadd_s8(int8x8_t a,  int8x8_t b); // VHADD.S8 d0,d0,d0
-_NEON2SSE_INLINE int8x8_t vhadd_s8(int8x8_t a,  int8x8_t b)
-{
-    int8x8_t res64;
-    return64(vhaddq_s8(_pM128i(a), _pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE int16x4_t vhadd_s16(int16x4_t a,  int16x4_t b); // VHADD.S16 d0,d0,d0
-_NEON2SSE_INLINE int16x4_t vhadd_s16(int16x4_t a,  int16x4_t b)
-{
-    int16x4_t res64;
-    return64( vhaddq_s16(_pM128i(a), _pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE int32x2_t vhadd_s32(int32x2_t a,  int32x2_t b); // VHADD.S32 d0,d0,d0
-_NEON2SSE_INLINE int32x2_t vhadd_s32(int32x2_t a,  int32x2_t b)
-{
-    int32x2_t res64;
-    return64( vhaddq_s32(_pM128i(a), _pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE uint8x8_t vhadd_u8(uint8x8_t a,  uint8x8_t b); // VHADD.w d0,d0,d0
-_NEON2SSE_INLINE uint8x8_t vhadd_u8(uint8x8_t a,  uint8x8_t b)
-{
-    uint8x8_t res64;
-    return64( vhaddq_u8(_pM128i(a), _pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE uint16x4_t vhadd_u16(uint16x4_t a,  uint16x4_t b); // VHADD.s16 d0,d0,d0
-_NEON2SSE_INLINE uint16x4_t vhadd_u16(uint16x4_t a,  uint16x4_t b)
-{
-    uint16x4_t res64;
-    return64( vhaddq_u16(_pM128i(a), _pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE uint32x2_t vhadd_u32(uint32x2_t a,  uint32x2_t b); // VHADD.U32 d0,d0,d0
-_NEON2SSE_INLINE uint32x2_t vhadd_u32(uint32x2_t a,  uint32x2_t b)
-{
-    uint32x2_t res64;
-    return64( vhaddq_u32(_pM128i(a), _pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0
-_NEON2SSE_INLINE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b)
-{
-    //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
-    __m128i tmp1, tmp2;
-    tmp1 = _mm_and_si128(a,b);
-    tmp2 = _mm_xor_si128(a,b);
-    tmp2 = vshrq_n_s8(tmp2,1);
-    return _mm_add_epi8(tmp1,tmp2);
-}
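A minimal scalar check of the overflow-free halving-add identity used above, assuming an arithmetic right shift for signed operands (the helper names are hypothetical):

    #include <assert.h>
    #include <stdint.h>

    /* x + y == 2*(x & y) + (x ^ y), and 2*(x & y) is even, so halving splits cleanly:
       (x + y) >> 1 == (x & y) + ((x ^ y) >> 1), with no widening add needed. */
    static inline int8_t vhadd_s8_ref(int8_t x, int8_t y) {
        return (int8_t)((x & y) + ((x ^ y) >> 1));
    }

    static void vhadd_s8_selftest(void) {
        for (int x = -128; x < 128; x++)
            for (int y = -128; y < 128; y++)
                assert(vhadd_s8_ref((int8_t)x, (int8_t)y) == (int8_t)((x + y) >> 1));
    }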
-
-_NEON2SSESTORAGE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S1 6 q0,q0,q0
-_NEON2SSE_INLINE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b)
-{
-    //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
-    __m128i tmp1, tmp2;
-    tmp1 = _mm_and_si128(a,b);
-    tmp2 = _mm_xor_si128(a,b);
-    tmp2 = _mm_srai_epi16(tmp2,1);
-    return _mm_add_epi16(tmp1,tmp2);
-}
-
-_NEON2SSESTORAGE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0
-_NEON2SSE_INLINE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b) // VHADD.S32 q0,q0,q0
-{
-    //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
-    __m128i tmp1, tmp2;
-    tmp1 = _mm_and_si128(a,b);
-    tmp2 = _mm_xor_si128(a,b);
-    tmp2 = _mm_srai_epi32(tmp2,1);
-    return _mm_add_epi32(tmp1,tmp2);
-}
-
-_NEON2SSESTORAGE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0
-_NEON2SSE_INLINE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b) // VHADD.U8 q0,q0,q0
-{
-    __m128i c1, sum, res;
-    c1 = _mm_set1_epi8(1);
-    sum = _mm_avg_epu8(a, b); //result is rounded, need to compensate for it
-    res = _mm_xor_si128(a, b); //for rounding compensation
-    res = _mm_and_si128(res,c1); //for rounding compensation
-    return _mm_sub_epi8 (sum, res); //actual rounding compensation
-}
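_mm_avg_epu8 returns the rounded average (a + b + 1) >> 1 per byte; the subtraction above removes that extra rounding bit whenever the low bits of a and b differ. A scalar sketch of the same compensation (hypothetical helper):

    #include <stdint.h>

    /* Truncated halving add rebuilt from a rounded average:
       ((a + b + 1) >> 1) - ((a ^ b) & 1) == (a + b) >> 1 */
    static inline uint8_t vhadd_u8_ref(uint8_t a, uint8_t b) {
        uint8_t rounded = (uint8_t)(((unsigned)a + b + 1u) >> 1); /* what _mm_avg_epu8 gives per byte */
        return (uint8_t)(rounded - ((a ^ b) & 1u));               /* undo the +1 where it mattered */
    }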
-
-_NEON2SSESTORAGE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.s16 q0,q0,q0
-_NEON2SSE_INLINE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b) // VHADD.s16 q0,q0,q0
-{
-    __m128i sum, res;
-    sum = _mm_avg_epu16(a, b); //result is rounded, need to compensate for it
-    res = _mm_xor_si128(a, b); //for rounding compensation
-    res = _mm_slli_epi16 (res,15); //shift left  then back right to
-    res = _mm_srli_epi16 (res,15); //get 1 or zero
-    return _mm_sub_epi16 (sum, res); //actual rounding compensation
-}
-
-_NEON2SSESTORAGE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0
-_NEON2SSE_INLINE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b) // VHADD.U32 q0,q0,q0
-{
-    //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
-    __m128i tmp1, tmp2;
-    tmp1 = _mm_and_si128(a,b);
-    tmp2 = _mm_xor_si128(a,b);
-    tmp2 = _mm_srli_epi32(tmp2,1);
-    return _mm_add_epi32(tmp1,tmp2);
-}
-
-//************************Vector rounding halving add: vrhadd{q}_<type>. Vr[i]:=(Va[i]+Vb[i]+1)>>1   ***************************
-//*****************************************************************************************************************************
-_NEON2SSESTORAGE int8x8_t vrhadd_s8(int8x8_t a,  int8x8_t b); // VRHADD.S8 d0,d0,d0
-_NEON2SSE_INLINE int8x8_t vrhadd_s8(int8x8_t a,  int8x8_t b)
-{
-    int8x8_t res64;
-    return64(vrhaddq_s8(_pM128i(a), _pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE int16x4_t vrhadd_s16(int16x4_t a,  int16x4_t b); // VRHADD.S16 d0,d0,d0
-_NEON2SSE_INLINE int16x4_t vrhadd_s16(int16x4_t a,  int16x4_t b)
-{
-    int16x4_t res64;
-    return64(vrhaddq_s16(_pM128i(a), _pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE int32x2_t vrhadd_s32(int32x2_t a,  int32x2_t b); // VRHADD.S32 d0,d0,d0
-_NEON2SSE_INLINE int32x2_t vrhadd_s32(int32x2_t a,  int32x2_t b)
-{
-    int32x2_t res64;
-    return64(vrhaddq_s32(_pM128i(a), _pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0
-_NEON2SSE_INLINE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b)
-{
-    uint8x8_t res64;
-    return64(_mm_avg_epu8(_pM128i(a),_pM128i(b))); //SSE, result rounding!!!
-}
-
-
-_NEON2SSESTORAGE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.s16 d0,d0,d0
-_NEON2SSE_INLINE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b)
-{
-    uint16x4_t res64;
-    return64(_mm_avg_epu16(_pM128i(a),_pM128i(b))); //SSE, result rounding!!!
-}
-
-
-_NEON2SSESTORAGE uint32x2_t vrhadd_u32(uint32x2_t a,  uint32x2_t b); // VRHADD.U32 d0,d0,d0
-_NEON2SSE_INLINE uint32x2_t vrhadd_u32(uint32x2_t a,  uint32x2_t b)
-{
-    uint32x2_t res64;
-    return64(vrhaddq_u32(_pM128i(a), _pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE int8x16_t  vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0
-_NEON2SSE_INLINE int8x16_t  vrhaddq_s8(int8x16_t a, int8x16_t b) // VRHADD.S8 q0,q0,q0
-{
-    //no signed average in x86 SIMD, go to unsigned
-    __m128i c128, au, bu, sum;
-    c128 = _mm_set1_epi8(-128); //(int8_t)0x80
-    au = _mm_sub_epi8(a, c128); //add 128
-    bu = _mm_sub_epi8(b, c128); //add 128
-    sum = _mm_avg_epu8(au, bu);
-    return _mm_add_epi8 (sum, c128); //sub 128
-}
-
-_NEON2SSESTORAGE int16x8_t  vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0
-_NEON2SSE_INLINE int16x8_t  vrhaddq_s16(int16x8_t a, int16x8_t b) // VRHADD.S16 q0,q0,q0
-{
-    //no signed average in x86 SIMD, go to unsigned
-    __m128i cx8000, au, bu, sum;
-    cx8000 = _mm_set1_epi16(-32768); //(int16_t)0x8000
-    au = _mm_sub_epi16(a, cx8000); //add 32768
-    bu = _mm_sub_epi16(b, cx8000); //add 32768
-    sum = _mm_avg_epu16(au, bu);
-    return _mm_add_epi16 (sum, cx8000); //sub 32768
-}
-
-_NEON2SSESTORAGE int32x4_t  vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0
-_NEON2SSE_INLINE int32x4_t  vrhaddq_s32(int32x4_t a, int32x4_t b)
-{
-    //need to avoid overflow
-    __m128i a2, b2, res, sum;
-    a2 = _mm_srai_epi32(a,1); //a2=a/2;
-    b2 = _mm_srai_epi32(b,1); // b2=b/2;
-    res = _mm_or_si128(a,b); //for rounding
-    res = _mm_slli_epi32 (res,31); //shift left  then back right to
-    res = _mm_srli_epi32 (res,31); //get 1 or zero
-    sum = _mm_add_epi32(a2,b2);
-    return _mm_add_epi32(sum,res);
-}
-
-_NEON2SSE_GLOBAL uint8x16_t   vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0
-#define vrhaddq_u8 _mm_avg_epu8 //SSE2, results rounded
-
-_NEON2SSE_GLOBAL uint16x8_t   vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.s16 q0,q0,q0
-#define vrhaddq_u16 _mm_avg_epu16 //SSE2, results rounded
-
-
-_NEON2SSESTORAGE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0
-_NEON2SSE_INLINE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b) // VRHADD.U32 q0,q0,q0
-{
-    //need to avoid overflow
-    __m128i a2, b2, res, sum;
-    a2 = _mm_srli_epi32(a,1); //a2=a/2;
-    b2 = _mm_srli_epi32(b,1); // b2=b/2;
-    res = _mm_or_si128(a,b); //for rounding
-    res = _mm_slli_epi32 (res,31); //shift left  then back right to
-    res = _mm_srli_epi32 (res,31); //get 1 or zero
-    sum = _mm_add_epi32(a2,b2);
-    return _mm_add_epi32(sum,res);
-}
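The rounding halving add above first halves each operand to avoid overflow and then adds back a rounding term taken from the low bits, using (a + b + 1) >> 1 == (a >> 1) + (b >> 1) + ((a | b) & 1). A scalar sketch (hypothetical helper):

    #include <stdint.h>

    static inline uint32_t vrhadd_u32_ref(uint32_t a, uint32_t b) {
        /* the halves carry the bulk; the low bits contribute 1 exactly when a or b is odd */
        return (a >> 1) + (b >> 1) + ((a | b) & 1u);
    }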
-
-//****************** VQADD: Vector saturating add ************************
-//************************************************************************
-_NEON2SSESTORAGE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0
-_NEON2SSE_INLINE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b)
-{
-    int8x8_t res64;
-    return64(_mm_adds_epi8(_pM128i(a),_pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0
-_NEON2SSE_INLINE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b)
-{
-    int16x4_t res64;
-    return64(_mm_adds_epi16(_pM128i(a),_pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE int32x2_t vqadd_s32(int32x2_t a,  int32x2_t b); // VQADD.S32 d0,d0,d0
-_NEON2SSE_INLINE int32x2_t vqadd_s32(int32x2_t a,  int32x2_t b)
-{
-    int32x2_t res64;
-    return64(vqaddq_s32(_pM128i(a), _pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE int64x1_t  vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqadd_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    int64x1_t res;
-    uint64_t a64, b64;
-    a64 = a.m64_u64[0];
-    b64 = b.m64_u64[0];
-    res.m64_u64[0] = a64 + b64;
-    a64 = (a64 >> 63) + (~_SIGNBIT64);
-    if ((int64_t)((b64 ^ a64) | ~(res.m64_u64[0] ^ b64))>=0) {
-        res.m64_u64[0] = a64;
-    }
-    return res;
-}
-
-_NEON2SSESTORAGE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0
-_NEON2SSE_INLINE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b)
-{
-    uint8x8_t res64;
-    return64(_mm_adds_epu8(_pM128i(a),_pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.s16 d0,d0,d0
-_NEON2SSE_INLINE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b)
-{
-    uint16x4_t res64;
-    return64(_mm_adds_epu16(_pM128i(a),_pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE uint32x2_t vqadd_u32(uint32x2_t a,  uint32x2_t b); // VQADD.U32 d0,d0,d0
-_NEON2SSE_INLINE uint32x2_t vqadd_u32(uint32x2_t a,  uint32x2_t b)
-{
-    uint32x2_t res64;
-    return64(vqaddq_u32(_pM128i(a), _pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    _NEON2SSE_ALIGN_16 uint64_t a64, b64;
-    uint64x1_t res;
-    a64 = a.m64_u64[0];
-    b64 = b.m64_u64[0];
-    res.m64_u64[0] = a64 + b64;
-    if (res.m64_u64[0] < a64) {
-        res.m64_u64[0] = ~(uint64_t)0;
-    }
-    return res;
-}
-
-_NEON2SSE_GLOBAL int8x16_t   vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0
-#define vqaddq_s8 _mm_adds_epi8
-
-_NEON2SSE_GLOBAL int16x8_t   vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0
-#define vqaddq_s16 _mm_adds_epi16
-
-_NEON2SSESTORAGE int32x4_t  vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0
-_NEON2SSE_INLINE int32x4_t  vqaddq_s32(int32x4_t a, int32x4_t b)
-{
-    //no corresponding x86 SIMD solution, special tricks are necessary. Overflow happens only if a and b have the same sign and the sum has the opposite sign
-    __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a_;
-    c7fffffff = _mm_set1_epi32(0x7fffffff);
-    res = _mm_add_epi32(a, b);
-    res_sat = _mm_srli_epi32(a, 31);
-    res_sat = _mm_add_epi32(res_sat, c7fffffff);
-    res_xor_a = _mm_xor_si128(res, a);
-    b_xor_a_ = _mm_xor_si128(b, a);
-    res_xor_a = _mm_andnot_si128(b_xor_a_, res_xor_a);
-    res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones if < 0, all zeros otherwise
-    res_sat = _mm_and_si128(res_xor_a, res_sat);
-    res = _mm_andnot_si128(res_xor_a, res);
-    return _mm_or_si128(res, res_sat);
-}
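A scalar mirror of the saturation trick above, assuming 32-bit two's complement arithmetic (the helper is hypothetical): the saturation value is derived from the sign of a, and overflow is flagged when a and b share a sign but the wrapped sum does not.

    #include <stdint.h>

    static inline int32_t vqadd_s32_ref(int32_t a, int32_t b) {
        uint32_t ua = (uint32_t)a, ub = (uint32_t)b;
        uint32_t res = ua + ub;                  /* wrapping add, like _mm_add_epi32 */
        uint32_t sat = (ua >> 31) + 0x7fffffffu; /* INT32_MAX if a >= 0, INT32_MIN if a < 0 */
        uint32_t ovf = (res ^ ua) & ~(ub ^ ua);  /* sign bit set only on signed overflow */
        return (int32_t)((ovf >> 31) ? sat : res);
    }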
-
-_NEON2SSESTORAGE int64x2_t  vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
-    _mm_store_si128((__m128i*)atmp, a);
-    _mm_store_si128((__m128i*)btmp, b);
-    res[0] = atmp[0] + btmp[0];
-    res[1] = atmp[1] + btmp[1];
-
-    atmp[0] = (atmp[0] >> 63) + (~_SIGNBIT64);
-    atmp[1] = (atmp[1] >> 63) + (~_SIGNBIT64);
-
-    if ((int64_t)((btmp[0] ^ atmp[0]) | ~(res[0] ^ btmp[0]))>=0) {
-        res[0] = atmp[0];
-    }
-    if ((int64_t)((btmp[1] ^ atmp[1]) | ~(res[1] ^ btmp[1]))>=0) {
-        res[1] = atmp[1];
-    }
-    return _mm_load_si128((__m128i*)res);
-}
-
-_NEON2SSE_GLOBAL uint8x16_t   vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0
-#define vqaddq_u8 _mm_adds_epu8
-
-_NEON2SSE_GLOBAL uint16x8_t   vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.s16 q0,q0,q0
-#define vqaddq_u16 _mm_adds_epu16
-
-_NEON2SSESTORAGE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0
-_NEON2SSE_INLINE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b)
-{
-    __m128i c80000000, cmp, subsum, suba, sum;
-    c80000000 = _mm_set1_epi32 (0x80000000);
-    sum = _mm_add_epi32 (a, b);
-    subsum = _mm_sub_epi32 (sum, c80000000);
-    suba = _mm_sub_epi32 (a, c80000000);
-    cmp = _mm_cmpgt_epi32 ( suba, subsum); //no unsigned comparison, need to go to signed
-    return _mm_or_si128 (sum, cmp); //saturation
-}
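The biased _mm_cmpgt_epi32 above is just an unsigned "sum < a" carry test; a scalar equivalent of the whole saturating add (hypothetical helper):

    #include <stdint.h>

    static inline uint32_t vqadd_u32_ref(uint32_t a, uint32_t b) {
        uint32_t sum = a + b;                 /* wraps on overflow */
        return (sum < a) ? 0xffffffffu : sum; /* carry out => saturate, like OR-ing in the all-ones mask */
    }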
-
-_NEON2SSESTORAGE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0
-#ifdef USE_SSE4
-    _NEON2SSE_INLINE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b)
-    {
-        __m128i c80000000, sum, cmp, suba, subsum;
-        c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0);
-        sum = _mm_add_epi64 (a, b);
-        subsum = _mm_sub_epi64 (sum, c80000000);
-        suba = _mm_sub_epi64 (a, c80000000);
-        cmp = _mm_cmpgt_epi64 ( suba, subsum); //no unsigned comparison, need to go to signed, SSE4.2!!!
-        return _mm_or_si128 (sum, cmp); //saturation
-    }
-#else
-    _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-    {
-        _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
-        _mm_store_si128((__m128i*)atmp, a);
-        _mm_store_si128((__m128i*)btmp, b);
-        res[0] = atmp[0] + btmp[0];
-        res[1] = atmp[1] + btmp[1];
-        if (res[0] < atmp[0]) res[0] = ~(uint64_t)0;
-        if (res[1] < atmp[1]) res[1] = ~(uint64_t)0;
-        return _mm_load_si128((__m128i*)(res));
-    }
-#endif
-
-
-//******************* Vector add high half (truncated)  ******************
-//************************************************************************
-_NEON2SSESTORAGE int8x8_t   vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0
-_NEON2SSE_INLINE int8x8_t   vaddhn_s16(int16x8_t a, int16x8_t b) // VADDHN.I16 d0,q0,q0
-{
-    int8x8_t res64;
-    __m128i sum;
-    sum = _mm_add_epi16 (a, b);
-    sum = _mm_srai_epi16 (sum, 8);
-    sum = _mm_packs_epi16 (sum, sum); //use 64 low bits only
-    return64(sum);
-}
-
-_NEON2SSESTORAGE int16x4_t  vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0
-_NEON2SSE_INLINE int16x4_t  vaddhn_s32(int32x4_t a, int32x4_t b) // VADDHN.I32 d0,q0,q0
-{
-    int16x4_t res64;
-    __m128i sum;
-    sum = _mm_add_epi32 (a, b);
-    sum = _mm_srai_epi32(sum, 16);
-    sum = _mm_packs_epi32 (sum, sum); //use 64 low bits only
-    return64(sum);
-}
-
-_NEON2SSESTORAGE int32x2_t  vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0
-_NEON2SSE_INLINE int32x2_t  vaddhn_s64(int64x2_t a, int64x2_t b)
-{
-    int32x2_t res64;
-    __m128i sum;
-    sum = _mm_add_epi64 (a, b);
-    sum = _mm_shuffle_epi32(sum,  1 | (3 << 2) | (0 << 4) | (2 << 6));
-    return64(sum);
-}
-
-_NEON2SSESTORAGE uint8x8_t  vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0
-_NEON2SSE_INLINE uint8x8_t  vaddhn_u16(uint16x8_t a, uint16x8_t b) // VADDHN.I16 d0,q0,q0
-{
-    uint8x8_t res64;
-    __m128i sum;
-    sum = _mm_add_epi16 (a, b);
-    sum = _mm_srli_epi16 (sum, 8);
-    sum = _mm_packus_epi16 (sum,sum); //use 64 low bits only
-    return64(sum);
-}
-
-_NEON2SSESTORAGE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0
-_NEON2SSE_INLINE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b) // VADDHN.I32 d0,q0,q0
-{
-    uint16x4_t res64;
-     __m128i sum;
-    sum = _mm_add_epi32 (a, b);
-    sum = _mm_srli_epi32 (sum, 16);
-#ifdef USE_SSE4
-    sum = _MM_PACKUS1_EPI32 (sum); //use 64 low bits only
-#else
-    sum = _mm_shuffle_epi8 (sum, *(__m128i*) mask8_32_even_odd); //go to 16 bits
-#endif
-    return64(sum);
-}
-
-_NEON2SSE_GLOBAL uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0
-#define vaddhn_u64 vaddhn_s64
-
-//*********** Vector rounding add high half: vraddhn_<type> ******************.
-//***************************************************************************
-_NEON2SSESTORAGE int8x8_t   vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0
-_NEON2SSE_INLINE int8x8_t   vraddhn_s16(int16x8_t a, int16x8_t b) // VRADDHN.I16 d0,q0,q0
-{
-    int8x8_t res64;
-    __m128i sum, mask1;
-    sum = _mm_add_epi16 (a, b);
-    mask1 = _mm_slli_epi16(sum, 8); //shift left then back right to
-    mask1 = _mm_srli_epi16(mask1, 15); //get  7-th bit 1 or zero
-    sum = _mm_srai_epi16 (sum, 8); //get high half
-    sum = _mm_add_epi16 (sum, mask1); //actual rounding
-    sum = _mm_packs_epi16 (sum, sum);
-    return64(sum);
-}
-
-_NEON2SSESTORAGE int16x4_t  vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0
-_NEON2SSE_INLINE int16x4_t  vraddhn_s32(int32x4_t a, int32x4_t b) // VRADDHN.I32 d0,q0,q0
-{
-    //SIMD may not be optimal, serial may be faster
-    int16x4_t res64;
-    __m128i sum, mask1;
-    sum = _mm_add_epi32 (a, b);
-    mask1 = _mm_slli_epi32(sum, 16); //shift left then back right to
-    mask1 = _mm_srli_epi32(mask1,31); //get  15-th bit 1 or zero
-    sum = _mm_srai_epi32 (sum, 16); //get high half
-    sum = _mm_add_epi32 (sum, mask1); //actual rounding
-    sum = _mm_packs_epi32 (sum, sum);
-    return64(sum);
-}
-
-_NEON2SSESTORAGE int32x2_t  vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0
-_NEON2SSE_INLINE int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b)
-{
-    //SIMD may not be optimal, serial may be faster
-    int32x2_t res64;
-    __m128i sum, mask1;
-    sum = _mm_add_epi64 (a, b);
-    mask1 = _mm_slli_epi64(sum, 32); //shift left then back right to
-    mask1 = _mm_srli_epi64(mask1,31); //get  31-th bit 1 or zero
-    sum = _mm_add_epi32 (sum, mask1); //actual high half rounding
-    sum = _mm_shuffle_epi32(sum,  1 | (3 << 2) | (1 << 4) | (3 << 6));
-    return64(sum);
-}
-
-_NEON2SSESTORAGE uint8x8_t  vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0
-_NEON2SSE_INLINE uint8x8_t  vraddhn_u16(uint16x8_t a, uint16x8_t b) // VRADDHN.I16 d0,q0,q0
-{
-    uint8x8_t res64;
-    __m128i sum, mask1;
-    sum = _mm_add_epi16 (a, b);
-    mask1 = _mm_slli_epi16(sum, 8); //shift left then back right to
-    mask1 = _mm_srli_epi16(mask1, 15); //get  7-th bit 1 or zero
-    sum = _mm_srai_epi16 (sum, 8); //get high half
-    sum = _mm_add_epi16 (sum, mask1); //actual rounding
-    sum = _mm_packus_epi16 (sum, sum);
-    return64(sum);
-}
-
-_NEON2SSESTORAGE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0
-_NEON2SSE_INLINE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b)
-{
-    //SIMD may not be optimal, serial may be faster
-    uint16x4_t res64;
-    __m128i sum, mask1;
-    sum = _mm_add_epi32 (a, b);
-    mask1 = _mm_slli_epi32(sum, 16); //shift left then back right to
-    mask1 = _mm_srli_epi32(mask1,31); //get  15-th bit 1 or zero
-    sum = _mm_srai_epi32 (sum, 16); //get high half
-    sum = _mm_add_epi32 (sum, mask1); //actual rounding
-    sum = _MM_PACKUS1_EPI32 (sum);
-    return64(sum);
-}
-
-_NEON2SSE_GLOBAL uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0
-#define vraddhn_u64 vraddhn_s64
-
-//**********************************************************************************
-//*********             Multiplication            *************************************
-//**************************************************************************************
-
-//Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i]
-//As we don't go to a wider result, these functions are equal to "multiply low" in x86
-_NEON2SSESTORAGE int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0
-_NEON2SSE_INLINE int8x8_t vmul_s8(int8x8_t a, int8x8_t b) // VMUL.I8 d0,d0,d0
-{
-    // no 8 bit simd multiply, need to go to 16 bits in SSE
-    int8x8_t res64;
-    __m128i a128, b128, res;
-    a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits
-    b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
-    res = _mm_mullo_epi16 (a128, b128);
-    res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit from 16, use 64 low bits only
-    return64(res);
-}
-
-_NEON2SSE_GLOBAL int16x4_t vmul_s16(int16x4_t a,  int16x4_t b); // VMUL.I16 d0,d0,d0
-#define vmul_s16 vmul_u16
-
-_NEON2SSE_GLOBAL int32x2_t vmul_s32(int32x2_t a,  int32x2_t b); // VMUL.I32 d0,d0,d0
-#define vmul_s32 vmul_u32
-
-_NEON2SSESTORAGE float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0
-_NEON2SSE_INLINE float32x2_t vmul_f32(float32x2_t a, float32x2_t b)
-{
-    float32x4_t tmp;
-    __m64_128 res64;
-    tmp =  _mm_mul_ps(_pM128(a),_pM128(b));
-    _M64f(res64, tmp); //use low 64 bits
-    return res64;
-}
-
-_NEON2SSESTORAGE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0
-_NEON2SSE_INLINE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b) // VMUL.I8 d0,d0,d0
-{
-    // no 8 bit simd multiply, need to go to 16 bits in SSE
-    uint8x8_t res64;
-    __m128i mask, a128, b128, res;
-    mask = _mm_set1_epi16(0xff);
-    a128 = _MM_CVTEPU8_EPI16 (_pM128i(a));
-    b128 = _MM_CVTEPU8_EPI16 (_pM128i(b));
-    res = _mm_mullo_epi16 (a128, b128);
-    res = _mm_and_si128(res, mask); //to avoid saturation
-    res = _mm_packus_epi16 (res,res); //use only low 64 bits
-    return64(res);
-}
-
-_NEON2SSESTORAGE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0
-_NEON2SSE_INLINE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b)
-{
-    uint16x4_t res64;
-    return64(_mm_mullo_epi16(_pM128i(a),_pM128i(b)));
-}
-
-_NEON2SSESTORAGE uint32x2_t   vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint32x2_t   vmul_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    uint32x2_t res;
-    res.m64_u32[0] = a.m64_u32[0] * b.m64_u32[0];
-    res.m64_u32[1] = a.m64_u32[1] * b.m64_u32[1];
-    return res;
-}
-
-_NEON2SSESTORAGE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0
-_NEON2SSE_INLINE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b)
-{
-    //may be optimized
-    poly8x8_t res64;
-    __m128i a64, b64, c1, res, tmp, bmasked;
-    int i;
-    a64 = _pM128i(a);
-    b64 = _pM128i(b);
-    c1 = _mm_cmpeq_epi8 (a64,a64); //all ones 0xff....
-    c1 = vshrq_n_u8(c1,7); //0x1
-    bmasked = _mm_and_si128(b64, c1); //0x1
-    res = vmulq_u8(a64, bmasked);
-    for(i = 1; i<8; i++) {
-        c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
-        bmasked = _mm_and_si128(b64, c1); //0x1
-        tmp = vmulq_u8(a64, bmasked);
-        res = _mm_xor_si128(res, tmp);
-    }
-    return64 (res);
-}
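vmul_p8 is a polynomial (carry-less) multiply over GF(2): each set bit of b contributes a shifted copy of a, and the partial products are combined with XOR, which is what the mask/shift/XOR loop above does one bit at a time. A scalar reference, truncated to 8 bits like the NEON result (hypothetical helper):

    #include <stdint.h>

    static inline uint8_t vmul_p8_ref(uint8_t a, uint8_t b) {
        uint8_t res = 0;
        for (int i = 0; i < 8; i++)
            if (b & (1u << i))
                res ^= (uint8_t)(a << i); /* XOR instead of add: no carries between bit positions */
        return res;
    }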
-
-_NEON2SSESTORAGE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0
-_NEON2SSE_INLINE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b) // VMUL.I8 q0,q0,q0
-{
-    // no 8 bit simd multiply, need to go to 16 bits
-    //solution may not be optimal
-    __m128i a16, b16, r16_1, r16_2;
-    a16 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1
-    b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
-    r16_1 = _mm_mullo_epi16 (a16, b16);
-    //swap hi and low part of a and b to process the remaining data
-    a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
-    b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
-    a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1
-    b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
-
-    r16_2 = _mm_mullo_epi16 (a16, b16);
-    r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*)mask8_16_even_odd); //return to 8 bit
-    r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*)mask8_16_even_odd); //return to 8 bit
-
-    return _mm_unpacklo_epi64(r16_1,  r16_2);
-}
-
-_NEON2SSE_GLOBAL int16x8_t   vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0
-#define vmulq_s16 _mm_mullo_epi16
-
-_NEON2SSE_GLOBAL int32x4_t   vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0
-#define vmulq_s32 _MM_MULLO_EPI32 //SSE4.1
-
-_NEON2SSE_GLOBAL float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0
-#define vmulq_f32 _mm_mul_ps
-
-_NEON2SSESTORAGE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0
-_NEON2SSE_INLINE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b) // VMUL.I8 q0,q0,q0
-{
-    // no 8 bit simd multiply, need to go to 16 bits
-    //solution may not be optimal
-    __m128i maskff, a16, b16, r16_1, r16_2;
-    maskff = _mm_set1_epi16(0xff);
-    a16 = _MM_CVTEPU8_EPI16 (a); // SSE 4.1
-    b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
-    r16_1 = _mm_mullo_epi16 (a16, b16);
-    r16_1 = _mm_and_si128(r16_1, maskff); //to avoid saturation
-    //swap hi and low part of a and b to process the remaining data
-    a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
-    b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
-    a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1
-    b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
-
-    r16_2 = _mm_mullo_epi16 (a16, b16);
-    r16_2 = _mm_and_si128(r16_2, maskff); //to avoid saturation
-    return _mm_packus_epi16 (r16_1,  r16_2);
-}
-
-_NEON2SSE_GLOBAL uint16x8_t   vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0
-#define vmulq_u16 _mm_mullo_epi16
-
-_NEON2SSE_GLOBAL uint32x4_t   vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0
-#define vmulq_u32 _MM_MULLO_EPI32 //SSE4.1
-
-_NEON2SSESTORAGE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0
-_NEON2SSE_INLINE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b)
-{
-    //may be optimized
-    __m128i c1, res, tmp, bmasked;
-    int i;
-    c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff....
-    c1 = vshrq_n_u8(c1,7); //0x1
-    bmasked = _mm_and_si128(b, c1); //0x1
-    res = vmulq_u8(a, bmasked);
-    for(i = 1; i<8; i++) {
-        c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
-        bmasked = _mm_and_si128(b, c1); //0x1
-        tmp = vmulq_u8(a, bmasked);
-        res = _mm_xor_si128(res, tmp);
-    }
-    return res;
-}
-
-//************************* Vector long multiply ***********************************
-//****************************************************************************
-_NEON2SSESTORAGE int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0
-_NEON2SSE_INLINE int16x8_t vmull_s8(int8x8_t a, int8x8_t b) // VMULL.S8 q0,d0,d0
-{
-    //no 8 bit simd multiply, need to go to 16 bits
-    __m128i a16, b16;
-    a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
-    b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1
-    return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit
-}
-
-_NEON2SSESTORAGE int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0
-_NEON2SSE_INLINE int32x4_t vmull_s16(int16x4_t a, int16x4_t b) // VMULL.S16 q0,d0,d0
-{
-#ifdef USE_SSE4
-    __m128i a16, b16;
-    a16 = _MM_CVTEPI16_EPI32 (_pM128i(a)); // SSE 4.1
-    b16 = _MM_CVTEPI16_EPI32 (_pM128i(b)); // SSE 4.1
-    return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1
-#else
-    __m128i low, hi, a128,b128;
-    a128 = _pM128i(a);
-    b128 = _pM128i(b);
-    low =  _mm_mullo_epi16(a128,b128);
-    hi =   _mm_mulhi_epi16(a128,b128);
-    return _mm_unpacklo_epi16(low,hi);
-#endif
-}
-
-_NEON2SSESTORAGE int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0
-_NEON2SSE_INLINE int64x2_t vmull_s32(int32x2_t a, int32x2_t b) // VMULL.S32 q0,d0,d0
-{
-    __m128i ab, ba, a128, b128;
-    a128 = _pM128i(a);
-    b128 = _pM128i(b);
-    ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1
-    ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1
-    return _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64 bit result
-}
-
-_NEON2SSESTORAGE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0
-_NEON2SSE_INLINE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b) // VMULL.U8 q0,d0,d0
-{
-    //no 8 bit simd multiply, need to go to 16 bits
-    __m128i a16, b16;
-    a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1
-    b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1
-    return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit
-}
-
-_NEON2SSESTORAGE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.s16 q0,d0,d0
-_NEON2SSE_INLINE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b) // VMULL.s16 q0,d0,d0
-{
-#ifdef USE_SSE4
-    __m128i a16, b16;
-    a16 = _MM_CVTEPU16_EPI32 (_pM128i(a)); // SSE 4.1
-    b16 = _MM_CVTEPU16_EPI32 (_pM128i(b)); // SSE 4.1
-    return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1
-#else
-    __m128i a128,b128,low, hi;
-    a128 = _pM128i(a);
-    b128 = _pM128i(b);
-    low =  _mm_mullo_epi16(a128,b128);
-    hi =   _mm_mulhi_epu16(a128,b128);
-    return _mm_unpacklo_epi16(low,hi);
-#endif
-}
-
-_NEON2SSESTORAGE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0
-_NEON2SSE_INLINE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b) // VMULL.U32 q0,d0,d0
-{
-    //may not be optimal compared with a serial implementation
-    __m128i ab, ba, a128, b128;
-    a128 = _pM128i(a);
-    b128 = _pM128i(b);
-    ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1
-    ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1
-    return _mm_mul_epu32 (ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64 bit result
-}
-
-_NEON2SSESTORAGE poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0
-_NEON2SSE_INLINE poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b)
-{
-    //may be optimized
-    __m128i a128,b128, c1, a128_16, bmasked_16, res, tmp, bmasked;
-    int i;
-    a128 = _pM128i(a);
-    b128 = _pM128i(b);
-    c1 = _mm_cmpeq_epi8 (a128,a128); //all ones 0xff....
-    c1 = vshrq_n_u8(c1,7); //0x1
-    bmasked = _mm_and_si128(b128, c1); //0x1
-
-    a128_16 = _MM_CVTEPU8_EPI16 (a128); // SSE 4.1
-    bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1
-    res = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit
-    for(i = 1; i<8; i++) {
-        c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
-        bmasked = _mm_and_si128(b128, c1); //0x1
-        bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1
-        tmp = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit, vmull_u8(a, bmasked);
-        res = _mm_xor_si128(res, tmp);
-    }
-    return res;
-}
-
-//****************Vector saturating doubling long multiply **************************
-//*****************************************************************
-_NEON2SSESTORAGE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0
-_NEON2SSE_INLINE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b)
-{
-    //the serial solution may be faster due to saturation
-    __m128i res;
-    res = vmull_s16(a, b);
-    return vqd_s32(res);
-}
-
-_NEON2SSESTORAGE int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL)
-{
-    //the serial solution may be faster due to saturation
-    __m128i res;
-    res = vmull_s32(a,b);
-    return vqaddq_s64(res,res); //slow serial function!!!!
-}
-
-//********************* Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i]  ************************
-//******************************************************************************************
-_NEON2SSESTORAGE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0
-_NEON2SSE_INLINE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLA.I8 d0,d0,d0
-{
-    // no 8 bit x86 simd multiply, need to go to 16 bits,  and use the low 64 bits
-    int8x8_t res64;
-    __m128i b128, c128, res;
-    b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
-    c128 = _MM_CVTEPI8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits
-    res = _mm_mullo_epi16 (c128, b128);
-    res  =  _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd);
-    res  = _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits
-    return64(res);
-}
-
-_NEON2SSESTORAGE int16x4_t vmla_s16(int16x4_t a,  int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0
-_NEON2SSE_INLINE int16x4_t vmla_s16(int16x4_t a,  int16x4_t b, int16x4_t c)
-{
-    int16x4_t res64;
-    return64(vmlaq_s16(_pM128i(a),_pM128i(b), _pM128i(c)));
-}
-
-
-_NEON2SSESTORAGE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0
-_NEON2SSE_INLINE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLA.I32 d0,d0,d0
-{
-    int32x2_t res64;
-    __m128i res;
-    res = _MM_MULLO_EPI32 (_pM128i(b), _pM128i(c)); //SSE4.1
-    res = _mm_add_epi32 (res, _pM128i(a)); //use the low 64 bits
-    return64(res);
-}
-
-_NEON2SSESTORAGE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0
-_NEON2SSE_INLINE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c)
-{
-    //fma is coming soon, but right now:
-    __m128 res;
-    __m64_128 res64;
-    res = _mm_mul_ps (_pM128(c), _pM128(b));
-    res = _mm_add_ps (_pM128(a), res);
-    _M64f(res64, res);
-    return res64;
-}
-
-_NEON2SSESTORAGE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0
-_NEON2SSE_INLINE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) // VMLA.I8 d0,d0,d0
-{
-    // no 8 bit x86 simd multiply, need to go to 16 bits,  and use the low 64 bits
-    uint8x8_t res64;
-    __m128i mask, b128, c128, res;
-    mask = _mm_set1_epi16(0xff);
-    b128 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
-    c128 = _MM_CVTEPU8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits
-    res = _mm_mullo_epi16 (c128, b128);
-    res = _mm_and_si128(res, mask); //to avoid saturation
-    res = _mm_packus_epi16 (res, res);
-    res =  _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits
-    return64(res);
-}
-
-_NEON2SSE_GLOBAL uint16x4_t vmla_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0
-#define vmla_u16 vmla_s16
-
-_NEON2SSE_GLOBAL uint32x2_t vmla_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0
-#define vmla_u32 vmla_s32
-
-_NEON2SSESTORAGE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0
-_NEON2SSE_INLINE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLA.I8 q0,q0,q0
-{
-    //solution may be not optimal
-    // no 8 bit simd multiply, need to go to 16 bits
-    __m128i b16, c16, r16_1, a_2,r16_2;
-    b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
-    c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1
-    r16_1 = _mm_mullo_epi16 (b16, c16);
-    r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
-    r16_1 = _mm_add_epi8 (r16_1, a);
-    //swap hi and low part of a, b and c to process the remaining data
-    a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
-    b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
-    c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
-    b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
-    c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1
-
-    r16_2 = _mm_mullo_epi16 (b16, c16);
-    r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
-    r16_2 = _mm_add_epi8(r16_2, a_2);
-    return _mm_unpacklo_epi64(r16_1,r16_2);
-}
-
-_NEON2SSESTORAGE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0
-_NEON2SSE_INLINE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLA.I16 q0,q0,q0
-{
-    __m128i res;
-    res = _mm_mullo_epi16 (c, b);
-    return _mm_add_epi16 (res, a);
-}
-
-_NEON2SSESTORAGE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0
-_NEON2SSE_INLINE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLA.I32 q0,q0,q0
-{
-    __m128i res;
-    res = _MM_MULLO_EPI32 (c,  b); //SSE4.1
-    return _mm_add_epi32 (res, a);
-}
-
-_NEON2SSESTORAGE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0
-_NEON2SSE_INLINE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLA.F32 q0,q0,q0
-{
-    //fma is coming soon, but right now:
-    __m128 res;
-    res = _mm_mul_ps (c, b);
-    return _mm_add_ps (a, res);
-}
-
-_NEON2SSESTORAGE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0
-_NEON2SSE_INLINE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLA.I8 q0,q0,q0
-{
-    //solution may be not optimal
-    // no 8 bit simd multiply, need to go to 16 bits
-    __m128i b16, c16, r16_1, a_2, r16_2;
-    b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
-    c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1
-    r16_1 = _mm_mullo_epi16 (b16, c16);
-    r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
-    r16_1 = _mm_add_epi8 (r16_1, a);
-    //swap hi and low part of a, b and c to process the remaining data
-    a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
-    b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
-    c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
-    b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1
-    c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1
-
-    r16_2 = _mm_mullo_epi16 (b16, c16);
-    r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
-    r16_2 = _mm_add_epi8(r16_2, a_2);
-    return _mm_unpacklo_epi64(r16_1,r16_2);
-}
-
-_NEON2SSE_GLOBAL uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0
-#define vmlaq_u16 vmlaq_s16
-
-_NEON2SSE_GLOBAL uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0
-#define vmlaq_u32 vmlaq_s32
-
-//**********************  Vector widening multiply accumulate (long multiply accumulate):
-//                          vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i]  **************
-//********************************************************************************************
-_NEON2SSESTORAGE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0
-_NEON2SSE_INLINE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLAL.S8 q0,d0,d0
-{
-    int16x8_t res;
-    res = vmull_s8(b, c);
-    return _mm_add_epi16 (res, a);
-}
-
-_NEON2SSESTORAGE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0
-_NEON2SSE_INLINE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLAL.S16 q0,d0,d0
-{
-    //may be not optimal compared with serial implementation
-    int32x4_t res;
-    res = vmull_s16(b,  c);
-    return _mm_add_epi32 (res, a);
-}
-
-_NEON2SSESTORAGE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0
-_NEON2SSE_INLINE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLAL.S32 q0,d0,d0
-{
-    //may be not optimal compared with serial implementation
-    int64x2_t res;
-    res = vmull_s32( b, c);
-    return _mm_add_epi64 (res, a);
-}
-
-_NEON2SSESTORAGE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0
-_NEON2SSE_INLINE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLAL.U8 q0,d0,d0
-{
-    uint16x8_t res;
-    res = vmull_u8(b, c);
-    return _mm_add_epi16 (res, a);
-}
-
-_NEON2SSESTORAGE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.s16 q0,d0,d0
-_NEON2SSE_INLINE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLAL.s16 q0,d0,d0
-{
-    //may be not optimal compared with serial implementation
-    uint32x4_t res;
-    res = vmull_u16(b, c);
-    return _mm_add_epi32 (res, a);
-}
-
-_NEON2SSESTORAGE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0
-_NEON2SSE_INLINE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLAL.U32 q0,d0,d0
-{
-    //may be not optimal compared with serial implementation
-    int64x2_t res;
-    res = vmull_u32( b,c);
-    return _mm_add_epi64 (res, a);
-}
-
-//******************** Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i] ***************************************
-//********************************************************************************************
-_NEON2SSESTORAGE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0
-_NEON2SSE_INLINE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLS.I8 d0,d0,d0
-{
-    // no 8 bit simd multiply, need to go to 16 bits -  and use the low 64 bits
-    int8x8_t res64;
-    __m128i res;
-    res64 = vmul_s8(b,c);
-    res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64));
-    return64(res);
-}
-
-_NEON2SSESTORAGE int16x4_t vmls_s16(int16x4_t a,  int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0
-_NEON2SSE_INLINE int16x4_t vmls_s16(int16x4_t a,  int16x4_t b, int16x4_t c)
-{
-    int16x4_t res64;
-    return64(vmlsq_s16(_pM128i(a),_pM128i(b), _pM128i(c)));
-}
-
-
-_NEON2SSESTORAGE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0
-_NEON2SSE_INLINE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLS.I32 d0,d0,d0
-{
-    int32x2_t res64;
-    __m128i res;
-    res = _MM_MULLO_EPI32 (_pM128i(c),_pM128i( b)); //SSE4.1
-    res =  _mm_sub_epi32 (_pM128i(a),res); //use low 64 bits only
-    return64(res);
-}
-
-_NEON2SSESTORAGE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0
-_NEON2SSE_INLINE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c)
-{
-    __m128 res;
-    __m64_128 res64;
-    res = _mm_mul_ps (_pM128(c), _pM128(b));
-    res = _mm_sub_ps (_pM128(a), res);
-    _M64f(res64, res);
-    return res64;
-}
-
-_NEON2SSESTORAGE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0
-_NEON2SSE_INLINE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c)
-{
-    // no 8 bit simd multiply, need to go to 16 bits -  and use the low 64 bits
-    uint8x8_t res64;
-    __m128i res;
-    res64 = vmul_u8(b,c);
-    res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64));
-    return64(res);
-}
-
-_NEON2SSE_GLOBAL uint16x4_t vmls_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0
-#define vmls_u16 vmls_s16
-
-_NEON2SSE_GLOBAL uint32x2_t vmls_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0
-#define vmls_u32 vmls_s32
-
-
-_NEON2SSESTORAGE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0
-_NEON2SSE_INLINE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLS.I8 q0,q0,q0
-{
-    //solution may be not optimal
-    // no 8 bit simd multiply, need to go to 16 bits
-    __m128i b16, c16, r16_1, a_2, r16_2;
-    b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
-    c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1
-    r16_1 = _mm_mullo_epi16 (b16, c16);
-    r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd);
-    r16_1 = _mm_sub_epi8 (a, r16_1);
-    //swap hi and low part of a, b, c to process the remaining data
-    a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
-    b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
-    c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
-    b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
-    c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1
-
-    r16_2 = _mm_mullo_epi16 (b16, c16);
-    r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
-    r16_2 = _mm_sub_epi8 (a_2, r16_2);
-    return _mm_unpacklo_epi64(r16_1,r16_2);
-}
-
-_NEON2SSESTORAGE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0
-_NEON2SSE_INLINE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLS.I16 q0,q0,q0
-{
-    __m128i res;
-    res = _mm_mullo_epi16 (c, b);
-    return _mm_sub_epi16 (a, res);
-}
-
-_NEON2SSESTORAGE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0
-_NEON2SSE_INLINE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLS.I32 q0,q0,q0
-{
-    __m128i res;
-    res = _MM_MULLO_EPI32 (c, b); //SSE4.1
-    return _mm_sub_epi32 (a, res);
-}
-
-_NEON2SSESTORAGE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0
-_NEON2SSE_INLINE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLS.F32 q0,q0,q0
-{
-    __m128 res;
-    res = _mm_mul_ps (c, b);
-    return _mm_sub_ps (a, res);
-}
-
-_NEON2SSESTORAGE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0
-_NEON2SSE_INLINE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLS.I8 q0,q0,q0
-{
-    //solution may be not optimal
-    // no 8 bit simd multiply, need to go to 16 bits
-    __m128i b16, c16, r16_1, a_2, r16_2;
-    b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
-    c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1
-    r16_1 = _mm_mullo_epi16 (b16, c16);
-    r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
-    r16_1 = _mm_sub_epi8 (a, r16_1);
-    //swap hi and low part of a, b and c to process the remaining data
-    a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
-    b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
-    c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
-    b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1
-    c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1
-
-    r16_2 = _mm_mullo_epi16 (b16, c16);
-    r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
-    r16_2 = _mm_sub_epi8(a_2, r16_2);
-    return _mm_unpacklo_epi64(r16_1,r16_2);
-}
-
-_NEON2SSE_GLOBAL uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0
-#define vmlsq_u16 vmlsq_s16
-
-_NEON2SSE_GLOBAL uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0
-#define vmlsq_u32 vmlsq_s32
-
-//******************** Vector multiply subtract long (widening multiply subtract) ************************************
-//*************************************************************************************************************
-_NEON2SSESTORAGE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0
-_NEON2SSE_INLINE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLSL.S8 q0,d0,d0
-{
-    int16x8_t res;
-    res = vmull_s8(b, c);
-    return _mm_sub_epi16 (a, res);
-}
-
-_NEON2SSESTORAGE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0
-_NEON2SSE_INLINE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLSL.S16 q0,d0,d0
-{
-    //may be not optimal compared with serial implementation
-    int32x4_t res;
-    res = vmull_s16(b,  c);
-    return _mm_sub_epi32 (a, res);
-}
-
-_NEON2SSESTORAGE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0
-_NEON2SSE_INLINE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLSL.S32 q0,d0,d0
-{
-    //may be not optimal compared with serial implementation
-    int64x2_t res;
-    res = vmull_s32( b,c);
-    return _mm_sub_epi64 (a, res);
-}
-
-_NEON2SSESTORAGE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0
-_NEON2SSE_INLINE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLSL.U8 q0,d0,d0
-{
-    uint16x8_t res;
-    res = vmull_u8(b, c);
-    return _mm_sub_epi16 (a, res);
-}
-
-_NEON2SSESTORAGE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.s16 q0,d0,d0
-_NEON2SSE_INLINE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLSL.s16 q0,d0,d0
-{
-    //may be not optimal compared with serial implementation
-    uint32x4_t res;
-    res = vmull_u16(b, c);
-    return _mm_sub_epi32 (a, res);
-}
-
-_NEON2SSESTORAGE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0
-_NEON2SSE_INLINE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLSL.U32 q0,d0,d0
-{
-    //may be not optimal compared with serial implementation
-    int64x2_t res;
-    res = vmull_u32( b,c);
-    return _mm_sub_epi64 (a, res);
-}
-
-//******  Vector saturating doubling multiply high **********************
-//*************************************************************************
-_NEON2SSESTORAGE int16x4_t vqdmulh_s16(int16x4_t a,  int16x4_t b); // VQDMULH.S16 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqdmulh_s16(int16x4_t a,  int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    int16x4_t res;
-    int32_t a32, b32, i;
-    for (i = 0; i<4; i++) {
-        a32 = (int32_t) a.m64_i16[i];
-        b32 = (int32_t) b.m64_i16[i];
-        a32 = (a32 * b32) >> 15;
-        res.m64_i16[i] = (a32 == 0x8000) ? 0x7fff : (int16_t) a32;
-    }
-    return res;
-}
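The same semantics in plain scalar form, which is what the serial loop above computes lane by lane; qdmulh_s16 is only an illustrative name:

#include <stdint.h>
#include <stdio.h>

// saturating doubling multiply returning the high half: saturate((2*a*b) >> 16)
static int16_t qdmulh_s16(int16_t a, int16_t b) {
    int32_t p = ((int32_t)a * (int32_t)b) >> 15;   // doubled product, high half
    return (p == 0x8000) ? 0x7fff : (int16_t)p;    // the only overflow case is (-32768)*(-32768)
}

int main(void) {
    printf("%d\n", qdmulh_s16(-32768, -32768)); // 32767 (saturated)
    printf("%d\n", qdmulh_s16(16384, 16384));   // 8192
    return 0;
}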
-
-_NEON2SSESTORAGE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0
-_NEON2SSE_INLINE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b) // no multiply high 32 bit SIMD in IA32, so need to do some tricks, serial solution may be faster
-{
-    //may be not optimal compared with a serial solution
-    int32x2_t res64;
-    __m128i mask;
-    _NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
-    int64x2_t mul;
-    mul = vmull_s32(a,b);
-    mul = _mm_slli_epi64(mul,1); //double the result
-    //at this point start treating 2 64-bit numbers as 4 32-bit
-    mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
-    mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
-    mul = _mm_xor_si128 (mul,  mask); //res saturated for 0x80000000
-    return64(mul);
-}
-
-_NEON2SSESTORAGE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0
-_NEON2SSE_INLINE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b) // VQDMULH.S16 q0,q0,q0
-{
-    __m128i res, res_lo, mask;
-    _NEON2SSE_ALIGN_16 static const uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000};
-    res = _mm_mulhi_epi16 (a, b);
-    res = _mm_slli_epi16 (res, 1); //double the result, don't care about saturation
-    res_lo = _mm_mullo_epi16 (a, b);
-    res_lo = _mm_srli_epi16(res_lo,15); //take the highest bit
-    res = _mm_add_epi16(res, res_lo); //combine results
-    mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask);
-    return _mm_xor_si128 (res,  mask); //res saturated for 0x8000
-}
-
-_NEON2SSESTORAGE int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
-{
-    // no multiply high 32 bit SIMD in IA32, may be not optimal compared with a serial solution for the SSSE3 target
-    __m128i ab, ba, mask, mul, mul1;
-    _NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
-    ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1
-    ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1
-    mul = _MM_MUL_EPI32(ab, ba); //uses 1st and 3rd data lanes, the multiplication gives a 64 bit result
-    mul = _mm_slli_epi64(mul,1); //double the result
-    ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3
-    ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3
-    mul1 = _MM_MUL_EPI32(ab, ba); //uses 1st and 3rd data lanes, the multiplication gives a 64 bit result
-    mul1 = _mm_slli_epi64(mul1,1); //double the result
-    mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
-    mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
-    mul = _mm_unpacklo_epi64(mul, mul1);
-    mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
-    return _mm_xor_si128 (mul,  mask); //res saturated for 0x80000000
-}
-
-//********* Vector saturating rounding doubling multiply high ****************
-//****************************************************************************
-//If the _mm_mulhrs_xx functions are used, the result may differ slightly from the NEON one due to different rounding rules and order
-_NEON2SSESTORAGE int16x4_t vqrdmulh_s16(int16x4_t a,  int16x4_t b); // VQRDMULH.S16 d0,d0,d0
-_NEON2SSE_INLINE int16x4_t vqrdmulh_s16(int16x4_t a,  int16x4_t b)
-{
-    int16x4_t res64;
-    return64(vqrdmulhq_s16(_pM128i(a), _pM128i(b)));
-}
-
-_NEON2SSESTORAGE int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
-{
-    //may be not optimal compared with a serial solution
-    int32x2_t res64;
-    _NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
-    __m128i res_sat, mask, mask1;
-    int64x2_t mul;
-    mul = vmull_s32(a,b);
-    res_sat = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered
-    mask1 = _mm_slli_epi64(res_sat, 32); //shift left then back right to
-    mask1 = _mm_srli_epi64(mask1,31); //get  31-th bit 1 or zero
-    mul = _mm_add_epi32 (res_sat, mask1); //actual rounding
-    //at this point start treating 2 64-bit numbers as 4 32-bit
-    mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
-    mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
-    mul = _mm_xor_si128 (mul,  mask); //res saturated for 0x80000000
-    return64(mul);
-}
-
-_NEON2SSESTORAGE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0
-_NEON2SSE_INLINE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b) // VQRDMULH.S16 q0,q0,q0
-{
-    __m128i mask, res;
-    _NEON2SSE_ALIGN_16 static const uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000};
-    res = _mm_mulhrs_epi16 (a, b);
-    mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask);
-    return _mm_xor_si128 (res,  mask); //res saturated for 0x8000
-}
-
-_NEON2SSESTORAGE int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
-{
-    // no multiply high 32 bit SIMD in IA32, may be not optimal compared with a serial solution for the SSSE3 target
-    __m128i ab, ba,  mask, mul, mul1, mask1;
-    _NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
-    ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1
-    ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1
-    mul = _MM_MUL_EPI32(ab, ba); //uses 1st and 3rd data lanes, the multiplication gives a 64 bit result
-    mul = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered
-    mask1 = _mm_slli_epi64(mul, 32); //shift left then back right to
-    mask1 = _mm_srli_epi64(mask1,31); //get  31-th bit 1 or zero
-    mul = _mm_add_epi32 (mul, mask1); //actual rounding
-
-    ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3
-    ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3
-    mul1 = _MM_MUL_EPI32(ab, ba); //uses 1st and 3rd data lanes, the multiplication gives a 64 bit result
-    mul1 = _mm_slli_epi64 (mul1, 1); //double the result, saturation not considered
-    mask1 = _mm_slli_epi64(mul1, 32); //shift left then back right to
-    mask1 = _mm_srli_epi64(mask1,31); //get  31-th bit 1 or zero
-    mul1 = _mm_add_epi32 (mul1, mask1); //actual rounding
-    //at this point start treating 2 64-bit numbers as 4 32-bit
-    mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
-    mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
-    mul = _mm_unpacklo_epi64(mul, mul1);
-    mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
-    return _mm_xor_si128 (mul,  mask); //res saturated for 0x80000000
-}
-
-//*************Vector widening saturating doubling multiply accumulate (long saturating doubling multiply accumulate) *****
-//*************************************************************************************************************************
-_NEON2SSESTORAGE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0
-_NEON2SSE_INLINE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VQDMLAL.S16 q0,d0,d0
-{
-    //not optimal SIMD solution, serial may be faster
-    __m128i res32;
-    res32 = vmull_s16(b,  c);
-    res32 = vqd_s32(res32); //doubling & saturation ,if no saturation we could use _mm_slli_epi32 (res, 1);
-    return vqaddq_s32(res32, a); //saturation
-}
-
-_NEON2SSESTORAGE int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c),_NEON2SSE_REASON_SLOW_SERIAL)
-{
-    __m128i res64;
-    res64 = vmull_s32(b,c);
-    res64 = vqaddq_s64(res64, res64); //doubling & saturation ,if no saturation we could use _mm_slli_epi64 (res, 1);
-    return vqaddq_s64(res64, a); //saturation
-}
-
-//************************************************************************************
-//******************  Vector subtract ***********************************************
-//************************************************************************************
-_NEON2SSESTORAGE int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0
-_NEON2SSE_INLINE int8x8_t vsub_s8(int8x8_t a, int8x8_t b)
-{
-    int8x8_t res64;
-    return64(_mm_sub_epi8(_pM128i(a),_pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0
-_NEON2SSE_INLINE int16x4_t vsub_s16(int16x4_t a, int16x4_t b)
-{
-    int16x4_t res64;
-    return64(_mm_sub_epi16(_pM128i(a),_pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0
-_NEON2SSE_INLINE int32x2_t vsub_s32(int32x2_t a, int32x2_t b)
-{
-    int32x2_t res64;
-    return64(_mm_sub_epi32(_pM128i(a),_pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE int64x1_t vsub_s64(int64x1_t a,  int64x1_t b); // VSUB.I64 d0,d0,d0
-_NEON2SSE_INLINE int64x1_t vsub_s64(int64x1_t a,  int64x1_t b)
-{
-    int64x1_t res64;
-    res64.m64_i64[0] = a.m64_i64[0] - b.m64_i64[0];
-    return res64;
-}
-
-
-_NEON2SSESTORAGE float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0
-_NEON2SSE_INLINE float32x2_t vsub_f32(float32x2_t a, float32x2_t b)
-{
-    float32x2_t res;
-    res.m64_f32[0] = a.m64_f32[0] - b.m64_f32[0];
-    res.m64_f32[1] = a.m64_f32[1] - b.m64_f32[1];
-    return res;
-}
-
-_NEON2SSE_GLOBAL uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0
-#define vsub_u8 vsub_s8
-
-_NEON2SSE_GLOBAL uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0
-#define vsub_u16 vsub_s16
-
-_NEON2SSE_GLOBAL uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0
-#define vsub_u32 vsub_s32
-
-
-_NEON2SSESTORAGE uint64x1_t vsub_u64(uint64x1_t a,  uint64x1_t b); // VSUB.I64 d0,d0,d0
-_NEON2SSE_INLINE uint64x1_t vsub_u64(uint64x1_t a,  uint64x1_t b)
-{
-    int64x1_t res64;
-    res64.m64_u64[0] = a.m64_u64[0] - b.m64_u64[0];
-    return res64;
-}
-
-
-_NEON2SSE_GLOBAL int8x16_t   vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0
-#define vsubq_s8 _mm_sub_epi8
-
-_NEON2SSE_GLOBAL int16x8_t   vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0
-#define vsubq_s16 _mm_sub_epi16
-
-_NEON2SSE_GLOBAL int32x4_t   vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0
-#define vsubq_s32 _mm_sub_epi32
-
-_NEON2SSE_GLOBAL int64x2_t   vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0
-#define vsubq_s64 _mm_sub_epi64
-
-_NEON2SSE_GLOBAL float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0
-#define vsubq_f32 _mm_sub_ps
-
-_NEON2SSE_GLOBAL uint8x16_t   vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0
-#define vsubq_u8 _mm_sub_epi8
-
-_NEON2SSE_GLOBAL uint16x8_t   vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0
-#define vsubq_u16 _mm_sub_epi16
-
-_NEON2SSE_GLOBAL uint32x4_t   vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0
-#define vsubq_u32 _mm_sub_epi32
-
-_NEON2SSE_GLOBAL uint64x2_t   vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0
-#define vsubq_u64 _mm_sub_epi64
-
-//***************Vector long subtract: vsub -> Vr[i]:=Va[i]-Vb[i] ******************
-//***********************************************************************************
-//Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
-_NEON2SSESTORAGE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0
-_NEON2SSE_INLINE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b) // VSUBL.S8 q0,d0,d0
-{
-    __m128i a16, b16;
-    a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
-    b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
-    return _mm_sub_epi16 (a16, b16);
-}
-
-_NEON2SSESTORAGE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0
-_NEON2SSE_INLINE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b) // VSUBL.S16 q0,d0,d0
-{
-    __m128i a32, b32;
-    a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
-    b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
-    return _mm_sub_epi32 (a32, b32);
-}
-
-_NEON2SSESTORAGE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0
-_NEON2SSE_INLINE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b) // VSUBL.S32 q0,d0,d0
-{
-    //may be not optimal
-    __m128i a64, b64;
-    a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1
-    b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1,
-    return _mm_sub_epi64 (a64, b64);
-}
-
-_NEON2SSESTORAGE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0
-_NEON2SSE_INLINE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b) // VSUBL.U8 q0,d0,d0
-{
-    __m128i a16, b16;
-    a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1,
-    b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
-    return _mm_sub_epi16 (a16, b16);
-}
-
-_NEON2SSESTORAGE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.s16 q0,d0,d0
-_NEON2SSE_INLINE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b) // VSUBL.s16 q0,d0,d0
-{
-    __m128i a32, b32;
-    a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1
-    b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1,
-    return _mm_sub_epi32 (a32, b32);
-}
-
-_NEON2SSESTORAGE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0
-_NEON2SSE_INLINE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b) // VSUBL.U32 q0,d0,d0
-{
-    //may be not optimal
-    __m128i a64, b64;
-    a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1
-    b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1,
-    return _mm_sub_epi64 (a64, b64);
-}
-
-//***************** Vector wide subtract: vsub -> Vr[i]:=Va[i]-Vb[i] **********************************
-//*****************************************************************************************************
-_NEON2SSESTORAGE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b); // VSUBW.S8 q0,q0,d0
-_NEON2SSE_INLINE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b) // VSUBW.S8 q0,q0,d0
-{
-    __m128i b16;
-    b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
-    return _mm_sub_epi16 (a, b16);
-}
-
-_NEON2SSESTORAGE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0
-_NEON2SSE_INLINE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b) // VSUBW.S16 q0,q0,d0
-{
-    __m128i b32;
-    b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
-    return _mm_sub_epi32 (a, b32);
-}
-
-_NEON2SSESTORAGE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0
-_NEON2SSE_INLINE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b) // VSUBW.S32 q0,q0,d0
-{
-    __m128i b64;
-    b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
-    return _mm_sub_epi64 (a, b64);
-}
-
-_NEON2SSESTORAGE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0
-_NEON2SSE_INLINE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b) // VSUBW.U8 q0,q0,d0
-{
-    __m128i b16;
-    b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
-    return _mm_sub_epi16 (a, b16);
-}
-
-_NEON2SSESTORAGE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.s16 q0,q0,d0
-_NEON2SSE_INLINE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b) // VSUBW.s16 q0,q0,d0
-{
-    __m128i b32;
-    b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1,
-    return _mm_sub_epi32 (a, b32);
-}
-
-_NEON2SSESTORAGE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0
-_NEON2SSE_INLINE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b) // VSUBW.U32 q0,q0,d0
-{
-    __m128i b64;
-    b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
-    return _mm_sub_epi64 (a, b64);
-}
-
-//************************Vector saturating subtract *********************************
-//*************************************************************************************
-_NEON2SSESTORAGE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0
-_NEON2SSE_INLINE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b)
-{
-    int8x8_t res64;
-    return64(_mm_subs_epi8(_pM128i(a),_pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0
-_NEON2SSE_INLINE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b)
-{
-    int16x4_t res64;
-    return64(_mm_subs_epi16(_pM128i(a),_pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE int32x2_t vqsub_s32(int32x2_t a,  int32x2_t b); // VQSUB.S32 d0,d0,d0
-_NEON2SSE_INLINE int32x2_t vqsub_s32(int32x2_t a,  int32x2_t b)
-{
-    int32x2_t res64;
-    return64(vqsubq_s32(_pM128i(a), _pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqsub_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD solution
-{
-    uint64x1_t res;
-    uint64_t a64,b64;
-    a64 = a.m64_u64[0];
-    b64 = b.m64_u64[0];
-    res.m64_u64[0] = a64 - b64;
-
-    a64 =  (a64 >> 63) + (~_SIGNBIT64);
-    if ((int64_t)((a64 ^ b64) & (a64 ^ res.m64_u64[0])) < 0) {
-        res.m64_u64[0] = a64;
-    }
-    return res;
-}
-
-_NEON2SSESTORAGE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0
-_NEON2SSE_INLINE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b)
-{
-    uint8x8_t res64;
-    return64(_mm_subs_epu8(_pM128i(a),_pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.s16 d0,d0,d0
-_NEON2SSE_INLINE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b)
-{
-    uint16x4_t res64;
-    return64(_mm_subs_epu16(_pM128i(a),_pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE uint32x2_t vqsub_u32(uint32x2_t a,  uint32x2_t b); // VQSUB.U32 d0,d0,d0
-_NEON2SSE_INLINE uint32x2_t vqsub_u32(uint32x2_t a,  uint32x2_t b)
-{
-    uint32x2_t res64;
-    return64(vqsubq_u32(_pM128i(a), _pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    uint64x1_t res;
-    uint64_t a64, b64;
-    a64 = _Ui64(a);
-    b64 = _Ui64(b);
-    if (a64 > b64) {
-        res.m64_u64[0] = a64 - b64;
-    } else {
-        res.m64_u64[0] = 0;
-    }
-    return res;
-}
-
-_NEON2SSE_GLOBAL int8x16_t   vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0
-#define vqsubq_s8 _mm_subs_epi8
-
-_NEON2SSE_GLOBAL int16x8_t   vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0
-#define vqsubq_s16 _mm_subs_epi16
-
-_NEON2SSESTORAGE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0
-_NEON2SSE_INLINE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b)
-{
-    //no corresponding x86 SIMD solution, special tricks are necessary. Overflow is possible only if a and b have opposite signs and the difference has the opposite sign to a
-    __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a;
-    c7fffffff = _mm_set1_epi32(0x7fffffff);
-    res = _mm_sub_epi32(a, b);
-    res_sat = _mm_srli_epi32(a, 31);
-    res_sat = _mm_add_epi32(res_sat, c7fffffff);
-    res_xor_a = _mm_xor_si128(res, a);
-    b_xor_a = _mm_xor_si128(b, a);
-    res_xor_a = _mm_and_si128(b_xor_a, res_xor_a);
-    res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones if <0, all zeros otherwise
-    res_sat = _mm_and_si128(res_xor_a, res_sat);
-    res = _mm_andnot_si128(res_xor_a, res);
-    return _mm_or_si128(res, res_sat);
-}
-
-_NEON2SSESTORAGE int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD solution
-{
-    _NEON2SSE_ALIGN_16 int64_t atmp[2], btmp[2];
-    _NEON2SSE_ALIGN_16 uint64_t res[2];
-    _mm_store_si128((__m128i*)atmp, a);
-    _mm_store_si128((__m128i*)btmp, b);
-    res[0] = atmp[0] - btmp[0];
-    res[1] = atmp[1] - btmp[1];
-    if (((res[0] ^ atmp[0]) & _SIGNBIT64) && ((atmp[0] ^ btmp[0]) & _SIGNBIT64)) {
-        res[0] = (atmp[0] >> 63) ^ ~_SIGNBIT64;
-    }
-    if (((res[1] ^ atmp[1]) & _SIGNBIT64) && ((atmp[1] ^ btmp[1]) & _SIGNBIT64)) {
-        res[1] = (atmp[1] >> 63) ^ ~_SIGNBIT64;
-    }
-    return _mm_load_si128((__m128i*)res);
-}
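A single-lane scalar sketch of the same saturation test (overflow happens iff a and b have opposite signs and the wrapped difference has the opposite sign to a); qsub_s64 is an illustrative name only:

#include <stdint.h>
#include <stdio.h>

static int64_t qsub_s64(int64_t a, int64_t b) {
    uint64_t res = (uint64_t)a - (uint64_t)b;                   // wrapping difference
    if ((((uint64_t)a ^ (uint64_t)b) & ((uint64_t)a ^ res)) >> 63)
        res = ((uint64_t)a >> 63) + INT64_MAX;                  // INT64_MAX if a >= 0, INT64_MIN if a < 0
    return (int64_t)res;
}

int main(void) {
    printf("%lld\n", (long long)qsub_s64(INT64_MIN, 1));  // -9223372036854775808 (saturated)
    printf("%lld\n", (long long)qsub_s64(INT64_MAX, -1)); //  9223372036854775807 (saturated)
    printf("%lld\n", (long long)qsub_s64(10, 3));         //  7
    return 0;
}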
-
-_NEON2SSE_GLOBAL uint8x16_t   vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0
-#define vqsubq_u8 _mm_subs_epu8
-
-_NEON2SSE_GLOBAL uint16x8_t   vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.s16 q0,q0,q0
-#define vqsubq_u16 _mm_subs_epu16
-
-_NEON2SSESTORAGE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0
-_NEON2SSE_INLINE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b) // VQSUB.U32 q0,q0,q0
-{
-    __m128i min, mask, sub;
-    min = _MM_MIN_EPU32(a, b); //SSE4.1
-    mask = _mm_cmpeq_epi32 (min,  b);
-    sub = _mm_sub_epi32 (a, b);
-    return _mm_and_si128 ( sub, mask);
-}
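A short standalone sketch of the min/cmpeq trick above (unsigned min(a, b) == b exactly when a >= b, so the wrapping difference can be masked to zero on underflow); assumes SSE4.1 for _mm_min_epu32:

#include <smmintrin.h> // SSE4.1
#include <stdint.h>
#include <stdio.h>

int main(void) {
    __m128i a = _mm_setr_epi32(10, 5, 0x7fffffff, 0);
    __m128i b = _mm_setr_epi32(3, 7, 1, 5);
    __m128i mask = _mm_cmpeq_epi32(_mm_min_epu32(a, b), b); // all-ones where a >= b (unsigned)
    __m128i sub  = _mm_sub_epi32(a, b);                     // wrapping difference
    __m128i res  = _mm_and_si128(sub, mask);                // underflowed lanes become 0
    uint32_t out[4];
    _mm_storeu_si128((__m128i *)out, res);
    printf("%u %u %u %u\n", out[0], out[1], out[2], out[3]); // 7 0 2147483646 0
    return 0;
}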
-
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL); // VQSUB.U64 q0,q0,q0
-#ifdef USE_SSE4
-    _NEON2SSE_INLINE uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b)
-    {
-        __m128i c80000000, subb, suba, cmp, sub;
-        c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0);
-        sub  = _mm_sub_epi64 (a, b);
-        suba = _mm_sub_epi64 (a, c80000000);
-        subb = _mm_sub_epi64 (b, c80000000);
-        cmp = _mm_cmpgt_epi64 ( suba, subb); //no unsigned comparison, need to go to signed, SSE4.2!!!
-        return _mm_and_si128 (sub, cmp); //saturation
-    }
-#else
-    _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-    {
-        _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
-        _mm_store_si128((__m128i*)atmp, a);
-        _mm_store_si128((__m128i*)btmp, b);
-        res[0] = (atmp[0] > btmp[0]) ? atmp[0] -  btmp[0] : 0;
-        res[1] = (atmp[1] > btmp[1]) ? atmp[1] -  btmp[1] : 0;
-        return _mm_load_si128((__m128i*)(res));
-    }
-#endif
-
-//**********Vector halving subtract Vr[i]:=(Va[i]-Vb[i])>>1  ******************************************************
-//****************************************************************
-_NEON2SSESTORAGE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0
-_NEON2SSE_INLINE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b) // VHSUB.S8 d0,d0,d0
-{
-    //no 8 bit shift available, internal overflow is possible, so let's go to 16 bit,
-    int8x8_t res64;
-    __m128i r16;
-    int8x8_t r;
-    r = vsub_s8 (a, b);
-    r16 = _MM_CVTEPI8_EPI16 (_pM128i(r)); //SSE 4.1
-    r16 = _mm_srai_epi16 (r16, 1); //SSE2
-    r16 =  _mm_packs_epi16 (r16,r16); //use low 64 bits
-    return64(r16);
-}
-
-_NEON2SSESTORAGE int16x4_t vhsub_s16(int16x4_t a,  int16x4_t b); // VHSUB.S16 d0,d0,d0
-_NEON2SSE_INLINE int16x4_t vhsub_s16(int16x4_t a,  int16x4_t b)
-{
-    int16x4_t res64;
-    return64(vhsubq_s16(_pM128i(a), _pM128i(b)));
-}
-
-
-
-_NEON2SSESTORAGE int32x2_t vhsub_s32(int32x2_t a,  int32x2_t b); // VHSUB.S32 d0,d0,d0
-_NEON2SSE_INLINE int32x2_t vhsub_s32(int32x2_t a,  int32x2_t b)
-{
-    int32x2_t res64;
-    return64(vhsubq_s32(_pM128i(a), _pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE uint8x8_t vhsub_u8(uint8x8_t a,  uint8x8_t b); // VHSUB.U8 d0,d0,d0
-_NEON2SSE_INLINE uint8x8_t vhsub_u8(uint8x8_t a,  uint8x8_t b)
-{
-    uint8x8_t res64;
-    return64(vhsubq_u8(_pM128i(a), _pM128i(b)));
-}
-
-_NEON2SSESTORAGE uint16x4_t vhsub_u16(uint16x4_t a,  uint16x4_t b); // VHSUB.s16 d0,d0,d0
-_NEON2SSE_INLINE uint16x4_t vhsub_u16(uint16x4_t a,  uint16x4_t b)
-{
-    uint16x4_t res64;
-    return64(vhsubq_u16(_pM128i(a), _pM128i(b)));
-}
-
-_NEON2SSESTORAGE uint32x2_t vhsub_u32(uint32x2_t a,  uint32x2_t b); // VHSUB.U32 d0,d0,d0
-_NEON2SSE_INLINE uint32x2_t vhsub_u32(uint32x2_t a,  uint32x2_t b)
-{
-    uint32x2_t res64;
-    return64(vhsubq_u32(_pM128i(a), _pM128i(b)));
-}
-
-_NEON2SSESTORAGE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0
-_NEON2SSE_INLINE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b) // VHSUB.S8 q0,q0,q0
-{
-    //need to deal with the possibility of internal overflow
-    __m128i c128, au,bu;
-    c128 = _mm_set1_epi8(-128); //(int8_t)0x80
-    au = _mm_add_epi8( a, c128);
-    bu = _mm_add_epi8( b, c128);
-    return vhsubq_u8(au,bu);
-}
-
-_NEON2SSESTORAGE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0
-_NEON2SSE_INLINE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b) // VHSUB.S16 q0,q0,q0
-{
-    //need to deal with the possibility of internal overflow
-    __m128i c8000, au,bu;
-    c8000 = _mm_set1_epi16(-32768); //(int16_t)0x8000
-    au = _mm_add_epi16( a, c8000);
-    bu = _mm_add_epi16( b, c8000);
-    return vhsubq_u16(au,bu);
-}
-
-_NEON2SSESTORAGE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0
-_NEON2SSE_INLINE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b) // VHSUB.S32 q0,q0,q0
-{
-    //need to deal with the possibility of internal overflow
-    __m128i a2, b2,r, b_1;
-    a2 = _mm_srai_epi32 (a,1);
-    b2 = _mm_srai_epi32 (b,1);
-    r = _mm_sub_epi32 (a2, b2);
-    b_1 = _mm_andnot_si128(a, b); //!a and b
-    b_1 = _mm_slli_epi32 (b_1,31);
-    b_1 = _mm_srli_epi32 (b_1,31); //0 or 1, last b bit
-    return _mm_sub_epi32(r,b_1);
-}
-
-_NEON2SSESTORAGE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0
-_NEON2SSE_INLINE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b) // VHSUB.U8 q0,q0,q0
-{
-    __m128i avg;
-    avg = _mm_avg_epu8 (a, b);
-    return _mm_sub_epi8(a, avg);
-}
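The identity behind the averaging trick above is a - avg(a, b) == floor((a - b) / 2) modulo 256, because _mm_avg_epu8 computes (a + b + 1) >> 1. A small self-contained check:

#include <emmintrin.h> // SSE2
#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint8_t av[16] = {10, 3, 255, 0};
    uint8_t bv[16] = { 3, 5,   1, 1};
    uint8_t out[16];
    __m128i a = _mm_loadu_si128((const __m128i *)av);
    __m128i b = _mm_loadu_si128((const __m128i *)bv);
    __m128i avg  = _mm_avg_epu8(a, b);     // (a + b + 1) >> 1
    __m128i hsub = _mm_sub_epi8(a, avg);   // == floor((a - b) / 2) modulo 256
    _mm_storeu_si128((__m128i *)out, hsub);
    for (int i = 0; i < 4; i++)
        printf("%u ", out[i]);             // 3 255 127 255
    printf("\n");
    return 0;
}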
-
-_NEON2SSESTORAGE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.s16 q0,q0,q0
-_NEON2SSE_INLINE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b) // VHSUB.s16 q0,q0,q0
-{
-    __m128i avg;
-    avg = _mm_avg_epu16 (a, b);
-    return _mm_sub_epi16(a, avg);
-}
-
-_NEON2SSESTORAGE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0
-_NEON2SSE_INLINE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b) // VHSUB.U32 q0,q0,q0
-{
-    //need to deal with the possibility of internal overflow
-    __m128i a2, b2,r, b_1;
-    a2 = _mm_srli_epi32 (a,1);
-    b2 = _mm_srli_epi32 (b,1);
-    r = _mm_sub_epi32 (a2, b2);
-    b_1 = _mm_andnot_si128(a, b); //!a and b
-    b_1 = _mm_slli_epi32 (b_1,31);
-    b_1 = _mm_srli_epi32 (b_1,31); //0 or 1, last b bit
-    return _mm_sub_epi32(r,b_1);
-}
-
-//******* Vector subtract high half (truncated) ** ************
-//************************************************************
-_NEON2SSESTORAGE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0
-_NEON2SSE_INLINE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b) // VSUBHN.I16 d0,q0,q0
-{
-    int8x8_t res64;
-    __m128i sum, sum8;
-    sum = _mm_sub_epi16 (a, b);
-    sum8 = _mm_srai_epi16 (sum, 8);
-    sum8 = _mm_packs_epi16(sum8,sum8);
-    return64(sum8);
-}
-
-_NEON2SSESTORAGE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0
-_NEON2SSE_INLINE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b) // VSUBHN.I32 d0,q0,q0
-{
-    int16x4_t res64;
-    __m128i sum, sum16;
-    sum = _mm_sub_epi32 (a, b);
-    sum16 = _mm_srai_epi32 (sum, 16);
-    sum16 = _mm_packs_epi32(sum16,sum16);
-    return64(sum16);
-}
-
-_NEON2SSESTORAGE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0
-_NEON2SSE_INLINE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b)
-{
-    int32x2_t res64;
-    __m128i sub;
-    sub = _mm_sub_epi64 (a, b);
-    sub = _mm_shuffle_epi32(sub,  1 | (3 << 2) | (0 << 4) | (2 << 6));
-    return64(sub);
-}
-
-_NEON2SSESTORAGE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0
-_NEON2SSE_INLINE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b) // VSUBHN.I16 d0,q0,q0
-{
-    uint8x8_t res64;
-    __m128i sum, sum8;
-    sum = _mm_sub_epi16 (a, b);
-    sum8 = _mm_srli_epi16 (sum, 8);
-    sum8 =  _mm_packus_epi16(sum8,sum8);
-    return64(sum8);
-}
-
-_NEON2SSESTORAGE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0
-_NEON2SSE_INLINE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b) // VSUBHN.I32 d0,q0,q0
-{
-    uint16x4_t res64;
-     __m128i sum, sum16;
-    sum = _mm_sub_epi32 (a, b);
-    sum16 = _mm_srli_epi32 (sum, 16);
-#ifdef USE_SSE4
-    sum16 =  _MM_PACKUS1_EPI32(sum16);
-#else
-    sum16  = _mm_shuffle_epi8 (sum16, *(__m128i*) mask8_32_even_odd); //go to 16 bits
-#endif
-    return64(sum16);
-}
-
-_NEON2SSE_GLOBAL uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0
-#define vsubhn_u64 vsubhn_s64
-
-//************ Vector rounding subtract high half *********************
-//*********************************************************************
-_NEON2SSESTORAGE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0
-_NEON2SSE_INLINE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b) // VRSUBHN.I16 d0,q0,q0
-{
-    int8x8_t res64;
-    __m128i sub, mask1;
-    sub = _mm_sub_epi16 (a, b);
-    mask1 = _mm_slli_epi16(sub, 8); //shift left then back right to
-    mask1 = _mm_srli_epi16(mask1, 15); //get  7-th bit 1 or zero
-    sub = _mm_srai_epi16 (sub, 8); //get high half
-    sub = _mm_add_epi16 (sub, mask1); //actual rounding
-    sub =  _mm_packs_epi16 (sub, sub);
-    return64(sub);
-}
-
-_NEON2SSESTORAGE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0
-_NEON2SSE_INLINE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b) // VRSUBHN.I32 d0,q0,q0
-{
-    //SIMD may be not optimal, serial may be faster
-    int16x4_t res64;
-    __m128i sub, mask1;
-    sub = _mm_sub_epi32 (a, b);
-    mask1 = _mm_slli_epi32(sub, 16); //shift left then back right to
-    mask1 = _mm_srli_epi32(mask1,31); //get  15-th bit 1 or zero
-    sub = _mm_srai_epi32 (sub, 16); //get high half
-    sub = _mm_add_epi32 (sub, mask1); //actual rounding
-    sub = _mm_packs_epi32 (sub, sub);
-    return64(sub);
-}
-
-_NEON2SSESTORAGE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0
-_NEON2SSE_INLINE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b)
-{
-    //SIMD may be not optimal, serial may be faster
-    int32x2_t res64;
-    __m128i sub, mask1;
-    sub = _mm_sub_epi64 (a, b);
-    mask1 = _mm_slli_epi64(sub, 32); //shift left then back right to
-    mask1 = _mm_srli_epi64(mask1,31); //get  31-th bit 1 or zero
-    sub = _mm_add_epi32 (sub, mask1); //actual high half rounding
-    sub = _mm_shuffle_epi32(sub,  1 | (3 << 2) | (0 << 4) | (2 << 6));
-    return64(sub);
-}
-
-_NEON2SSESTORAGE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0
-_NEON2SSE_INLINE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b) // VRSUBHN.I16 d0,q0,q0
-{
-    uint8x8_t res64;
-    __m128i sub, mask1;
-    sub = _mm_sub_epi16 (a, b);
-    mask1 = _mm_slli_epi16(sub, 8); //shift left then back right to
-    mask1 = _mm_srli_epi16(mask1, 15); //get  7-th bit 1 or zero
-    sub = _mm_srai_epi16 (sub, 8); //get high half
-    sub = _mm_add_epi16 (sub, mask1); //actual rounding
-    sub = _mm_packus_epi16 (sub, sub);
-    return64(sub);
-}
-
-_NEON2SSESTORAGE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0
-_NEON2SSE_INLINE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b) // VRSUBHN.I32 d0,q0,q0
-{
-    //SIMD may be not optimal, serial may be faster
-    uint16x4_t res64;
-    __m128i sub, mask1;
-    sub = _mm_sub_epi32 (a, b);
-    mask1 = _mm_slli_epi32(sub, 16); //shift left then back right to
-    mask1 = _mm_srli_epi32(mask1,31); //get  15-th bit 1 or zero
-    sub = _mm_srai_epi32 (sub, 16); //get high half
-    sub = _mm_add_epi32 (sub, mask1); //actual rounding
-#ifdef USE_SSE4
-    sub =  _MM_PACKUS1_EPI32 (sub);
-#else
-    sub = _mm_shuffle_epi8 (sub, *(__m128i*) mask8_32_even_odd); //go to 16 bits
-#endif
-    return64(sub);
-}
-
-_NEON2SSE_GLOBAL uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0
-#define vrsubhn_u64 vrsubhn_s64
-
-//*********** Vector saturating doubling multiply subtract long ********************
-//************************************************************************************
-_NEON2SSESTORAGE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0
-_NEON2SSE_INLINE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c)
-{
-    //not optimal SIMD solution, serial may be faster
-    __m128i res32, mask;
-    int32x4_t res;
-    _NEON2SSE_ALIGN_16 static const uint32_t cmask[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
-    res = vmull_s16(b,  c);
-    res32 = _mm_slli_epi32 (res, 1); //double the result, saturation not considered
-    mask = _mm_cmpeq_epi32 (res32, *(__m128i*)cmask);
-    res32 = _mm_xor_si128 (res32,  mask); //res32 saturated for 0x80000000
-    return vqsubq_s32(a, res32); //saturation
-}
-
-_NEON2SSESTORAGE int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    __m128i res64, mask;
-    int64x2_t res;
-    _NEON2SSE_ALIGN_16 static const uint64_t cmask[] = {0x8000000000000000, 0x8000000000000000};
-    res = vmull_s32(b,  c);
-    res64 = _mm_slli_epi64 (res, 1); //double the result, saturation not considered
-    mask = _MM_CMPEQ_EPI64 (res64, *(__m128i*)cmask);
-    res64 = _mm_xor_si128 (res64,  mask); //res64 saturated for 0x8000000000000000
-    return vqsubq_s64(a, res64); //saturation
-}
-
-//******************  COMPARISON ***************************************
-//******************* Vector compare equal *************************************
-//****************************************************************************
-_NEON2SSESTORAGE uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0
-_NEON2SSE_INLINE int8x8_t vceq_s8(int8x8_t a, int8x8_t b)
-{
-    int8x8_t res64;
-    return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0
-_NEON2SSE_INLINE int16x4_t vceq_s16(int16x4_t a, int16x4_t b)
-{
-    int16x4_t res64;
-    return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0
-_NEON2SSE_INLINE int32x2_t vceq_s32(int32x2_t a, int32x2_t b)
-{
-    int32x2_t res64;
-    return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0
-_NEON2SSE_INLINE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b)
-{
-    uint32x2_t res64;
-    __m128 res;
-    res = _mm_cmpeq_ps(_pM128(a), _pM128(b) );
-    return64f(res);
-}
-
-_NEON2SSESTORAGE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0
-_NEON2SSE_INLINE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b)
-{
-    uint8x8_t res64;
-    return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0
-_NEON2SSE_INLINE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b)
-{
-    uint16x4_t res64;
-    return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0
-_NEON2SSE_INLINE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b)
-{
-    uint32x2_t res64;
-    return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b)));
-}
-
-
-_NEON2SSE_GLOBAL uint8x8_t   vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0
-#define vceq_p8 vceq_u8
-
-
-_NEON2SSE_GLOBAL uint8x16_t   vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0
-#define vceqq_s8 _mm_cmpeq_epi8
-
-_NEON2SSE_GLOBAL uint16x8_t   vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0
-#define vceqq_s16 _mm_cmpeq_epi16
-
-_NEON2SSE_GLOBAL uint32x4_t   vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0
-#define vceqq_s32 _mm_cmpeq_epi32
-
-_NEON2SSESTORAGE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0
-_NEON2SSE_INLINE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b)
-{
-    __m128 res;
-    res = _mm_cmpeq_ps(a,b);
-    return _M128i(res);
-}
-
-_NEON2SSE_GLOBAL uint8x16_t   vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0
-#define vceqq_u8 _mm_cmpeq_epi8
-
-_NEON2SSE_GLOBAL uint16x8_t   vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0
-#define vceqq_u16 _mm_cmpeq_epi16
-
-_NEON2SSE_GLOBAL uint32x4_t   vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0
-#define vceqq_u32 _mm_cmpeq_epi32
-
-_NEON2SSE_GLOBAL uint8x16_t   vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0
-#define vceqq_p8 _mm_cmpeq_epi8
-
-//******************Vector compare greater-than or equal*************************
-//*******************************************************************************
-//in IA SIMD no greater-than-or-equal comparison for integers,
-// there is greater-than available only, so we need the following tricks
-
-_NEON2SSESTORAGE uint8x8_t vcge_s8(int8x8_t a,  int8x8_t b); // VCGE.S8 d0, d0, d0
-_NEON2SSE_INLINE int8x8_t vcge_s8(int8x8_t a,  int8x8_t b)
-{
-    int8x8_t res64;
-    return64(vcgeq_s8(_pM128i(a), _pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE uint16x4_t vcge_s16(int16x4_t a,  int16x4_t b); // VCGE.S16 d0, d0, d0
-_NEON2SSE_INLINE int16x4_t vcge_s16(int16x4_t a,  int16x4_t b)
-{
-    int16x4_t res64;
-    return64(vcgeq_s16(_pM128i(a), _pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE uint32x2_t vcge_s32(int32x2_t a,  int32x2_t b); // VCGE.S32 d0, d0, d0
-_NEON2SSE_INLINE int32x2_t vcge_s32(int32x2_t a,  int32x2_t b)
-{
-    int32x2_t res64;
-    return64(vcgeq_s32(_pM128i(a), _pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
-_NEON2SSE_INLINE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b)
-{
-    uint32x2_t res64;
-    __m128 res;
-    res = _mm_cmpge_ps(_pM128(a),_pM128(b)); //use only 2 first entries
-    return64f(res);
-}
-
-_NEON2SSESTORAGE uint8x8_t vcge_u8(uint8x8_t a,  uint8x8_t b); // VCGE.U8 d0, d0, d0
-_NEON2SSE_INLINE uint8x8_t vcge_u8(uint8x8_t a,  uint8x8_t b)
-{
-    uint8x8_t res64;
-    return64(vcgeq_u8(_pM128i(a), _pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE uint16x4_t vcge_u16(uint16x4_t a,  uint16x4_t b); // VCGE.s16 d0, d0, d0
-_NEON2SSE_INLINE uint16x4_t vcge_u16(uint16x4_t a,  uint16x4_t b)
-{
-    uint16x4_t res64;
-    return64(vcgeq_u16(_pM128i(a), _pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE uint32x2_t vcge_u32(uint32x2_t a,  uint32x2_t b); // VCGE.U32 d0, d0, d0
-_NEON2SSE_INLINE uint32x2_t vcge_u32(uint32x2_t a,  uint32x2_t b)
-{
-    //serial solution looks faster
-    uint32x2_t res64;
-    return64(vcgeq_u32 (_pM128i(a), _pM128i(b)));
-}
-
-
-
-_NEON2SSESTORAGE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
-_NEON2SSE_INLINE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0
-{
-    __m128i m1, m2;
-    m1 = _mm_cmpgt_epi8 ( a, b);
-    m2 = _mm_cmpeq_epi8 ( a, b);
-    return _mm_or_si128  ( m1, m2);
-}
-
-_NEON2SSESTORAGE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
-_NEON2SSE_INLINE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0
-{
-    __m128i m1, m2;
-    m1 = _mm_cmpgt_epi16 ( a, b);
-    m2 = _mm_cmpeq_epi16 ( a, b);
-    return _mm_or_si128   ( m1,m2);
-}
-
-_NEON2SSESTORAGE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
-_NEON2SSE_INLINE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0
-{
-    __m128i m1, m2;
-    m1 = _mm_cmpgt_epi32 (a, b);
-    m2 = _mm_cmpeq_epi32 (a, b);
-    return _mm_or_si128   (m1, m2);
-}
-
-_NEON2SSESTORAGE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
-_NEON2SSE_INLINE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b)
-{
-    __m128 res;
-    res = _mm_cmpge_ps(a,b); //use only 2 first entries
-    return *(__m128i*)&res;
-}
-
-_NEON2SSESTORAGE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
-_NEON2SSE_INLINE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0
-{
-    //no unsigned chars comparison, only signed available, so we need the trick
-    __m128i cmp;
-    cmp = _mm_max_epu8(a, b);
-    return _mm_cmpeq_epi8(cmp, a); //a>=b
-}
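A brief sketch of the max/cmpeq trick above: max(a, b) equals a exactly when a >= b in the unsigned sense, so the equality mask is the >= mask:

#include <emmintrin.h> // SSE2
#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint8_t av[16] = {200, 1, 128, 0};
    uint8_t bv[16] = {100, 2, 128, 255};
    uint8_t out[16];
    __m128i a = _mm_loadu_si128((const __m128i *)av);
    __m128i b = _mm_loadu_si128((const __m128i *)bv);
    __m128i ge = _mm_cmpeq_epi8(_mm_max_epu8(a, b), a); // 0xff where a >= b (unsigned), else 0
    _mm_storeu_si128((__m128i *)out, ge);
    for (int i = 0; i < 4; i++)
        printf("%u ", out[i]);                          // 255 0 255 0
    printf("\n");
    return 0;
}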
-
-_NEON2SSESTORAGE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0
-_NEON2SSE_INLINE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0
-{
-    //no unsigned shorts comparison, only signed available, so we need the trick
-#ifdef USE_SSE4
-    __m128i cmp;
-    cmp = _mm_max_epu16(a, b);
-    return _mm_cmpeq_epi16(cmp, a); //a>=b
-#else
-   __m128i zero = _mm_setzero_si128();
-   __m128i  as = _mm_subs_epu16(b, a);
-   return _mm_cmpeq_epi16(as, zero);
-#endif
-}
-
-_NEON2SSESTORAGE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
-_NEON2SSE_INLINE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0
-{
-    //no unsigned ints comparison, only signed available, so we need the trick
-#ifdef USE_SSE4
-    __m128i cmp;
-    cmp = _mm_max_epu32(a, b);
-    return _mm_cmpeq_epi32(cmp, a); //a>=b
-#else
-    //serial solution may be faster
-    __m128i c80000000, as, bs, m1, m2;
-    c80000000 = _mm_set1_epi32 (0x80000000);
-    as = _mm_sub_epi32(a,c80000000);
-    bs = _mm_sub_epi32(b,c80000000);
-    m1 = _mm_cmpgt_epi32 (as, bs);
-    m2 = _mm_cmpeq_epi32 (as, bs);
-    return _mm_or_si128 ( m1,  m2);
-#endif
-}
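A sketch of the sign-bias trick in the non-SSE4 path above: subtracting 0x80000000 from both operands maps unsigned order onto signed order, so the signed compare intrinsics can be reused:

#include <emmintrin.h> // SSE2
#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint32_t av[4] = {0x80000000u, 1, 7, 0xffffffffu};
    uint32_t bv[4] = {1, 0x80000000u, 7, 0};
    uint32_t out[4];
    __m128i bias = _mm_set1_epi32(INT32_MIN);                      // 0x80000000 in every lane
    __m128i as = _mm_sub_epi32(_mm_loadu_si128((const __m128i *)av), bias);
    __m128i bs = _mm_sub_epi32(_mm_loadu_si128((const __m128i *)bv), bias);
    __m128i ge = _mm_or_si128(_mm_cmpgt_epi32(as, bs),             // signed a > b
                              _mm_cmpeq_epi32(as, bs));            // or a == b
    _mm_storeu_si128((__m128i *)out, ge);
    printf("%08x %08x %08x %08x\n", out[0], out[1], out[2], out[3]); // ffffffff 00000000 ffffffff ffffffff
    return 0;
}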
-
-//**********************Vector compare less-than or equal******************************
-//***************************************************************************************
-//in IA SIMD no less-than-or-equal comparison for integers present, so we need the tricks
-
-_NEON2SSESTORAGE uint8x8_t vcle_s8(int8x8_t a,  int8x8_t b); // VCGE.S8 d0, d0, d0
-_NEON2SSE_INLINE int8x8_t vcle_s8(int8x8_t a,  int8x8_t b)
-{
-    int8x8_t res64;
-    return64(vcleq_s8(_pM128i(a), _pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE uint16x4_t vcle_s16(int16x4_t a,  int16x4_t b); // VCGE.S16 d0, d0, d0
-_NEON2SSE_INLINE int16x4_t vcle_s16(int16x4_t a,  int16x4_t b)
-{
-    int16x4_t res64;
-    return64(vcleq_s16(_pM128i(a), _pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE uint32x2_t vcle_s32(int32x2_t a,  int32x2_t b); // VCGE.S32 d0, d0, d0
-_NEON2SSE_INLINE int32x2_t vcle_s32(int32x2_t a,  int32x2_t b)
-{
-    int32x2_t res64;
-    return64(vcleq_s32(_pM128i(a), _pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0?
-_NEON2SSE_INLINE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b)
-{
-    uint32x2_t res64;
-    __m128 res;
-    res = _mm_cmple_ps(_pM128(a),_pM128(b));
-    return64f(res);
-}
-
-_NEON2SSE_GLOBAL uint8x8_t vcle_u8(uint8x8_t a,  uint8x8_t b); // VCGE.U8 d0, d0, d0
-#define vcle_u8(a,b) vcge_u8(b,a)
-
-
-_NEON2SSE_GLOBAL uint16x4_t vcle_u16(uint16x4_t a,  uint16x4_t b); // VCGE.s16 d0, d0, d0
-#define vcle_u16(a,b) vcge_u16(b,a)
-
-
-_NEON2SSE_GLOBAL uint32x2_t vcle_u32(uint32x2_t a,  uint32x2_t b); // VCGE.U32 d0, d0, d0
-#define vcle_u32(a,b) vcge_u32(b,a)
-
-_NEON2SSESTORAGE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
-_NEON2SSE_INLINE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0
-{
-    __m128i c1, res;
-    c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff....
-    res = _mm_cmpgt_epi8 ( a,  b);
-    return _mm_andnot_si128 (res, c1); //invert the cmpgt result to get less-than-or-equal
-}
-
-_NEON2SSESTORAGE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
-_NEON2SSE_INLINE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0
-{
-    __m128i c1, res;
-    c1 = _mm_cmpeq_epi16 (a,a); //all ones 0xff....
-    res = _mm_cmpgt_epi16 ( a,  b);
-    return _mm_andnot_si128 (res, c1);
-}
-
-_NEON2SSESTORAGE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
-_NEON2SSE_INLINE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0
-{
-    __m128i c1, res;
-    c1 = _mm_cmpeq_epi32 (a,a); //all ones 0xff....
-    res = _mm_cmpgt_epi32 ( a,  b);
-    return _mm_andnot_si128 (res, c1);
-}
-
-_NEON2SSESTORAGE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
-_NEON2SSE_INLINE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b)
-{
-    __m128 res;
-    res = _mm_cmple_ps(a,b);
-    return *(__m128i*)&res;
-}
-
-_NEON2SSESTORAGE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
-#ifdef USE_SSE4
-    _NEON2SSE_INLINE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0
-    {
-        //no unsigned byte comparison in SSE, only signed is available, so we need a trick
-        __m128i cmp;
-        cmp = _mm_min_epu8(a, b);
-        return _mm_cmpeq_epi8(cmp, a); //a<=b
-    }
-#else
-    _NEON2SSE_INLINE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0
-    {
-        return vcgeq_u8(b, a);
-    }
-#endif
-
-_NEON2SSESTORAGE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0
-#ifdef USE_SSE4
-    _NEON2SSE_INLINE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0
-    {
-        //no unsigned short comparison in SSE, only signed is available, so we need a trick
-        __m128i cmp;
-        cmp = _mm_min_epu16(a, b);
-        return _mm_cmpeq_epi16(cmp, a); //a<=b
-    }
-#else
-    _NEON2SSE_INLINE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0
-    {
-        return vcgeq_u16(b, a);
-    }
-#endif
-
-_NEON2SSESTORAGE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
-#ifdef USE_SSE4
-    _NEON2SSE_INLINE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0
-    {
-        //no unsigned int comparison in SSE, only signed is available, so we need a trick
-        __m128i cmp;
-        cmp = _mm_min_epu32(a, b);
-        return _mm_cmpeq_epi32(cmp, a); //a<=b
-    }
-#else
-    _NEON2SSE_INLINE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0
-    {
-        return vcgeq_u32(b, a);
-    }
-#endif
-
-
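// A minimal scalar sketch of the SSE4.1 variants above (illustration only): for unsigned lanes,
// min(a, b) == a exactly when a <= b (and max(a, b) == a exactly when a >= b), so one min/max
// plus one equality compare produces the mask.

#include <stdint.h>
/* scalar model of one vcleq_u16 lane on the SSE4.1 path (illustrative helper) */
static uint16_t ule16_model(uint16_t a, uint16_t b) {
    uint16_t m = (a < b) ? a : b;      /* _mm_min_epu16 */
    return (m == a) ? 0xFFFF : 0x0000; /* _mm_cmpeq_epi16 */
}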
-//****** Vector compare greater-than ******************************************
-//**************************************************************************
-_NEON2SSESTORAGE uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
-_NEON2SSE_INLINE int8x8_t vcgt_s8(int8x8_t a, int8x8_t b)
-{
-    int8x8_t res64;
-    return64(_mm_cmpgt_epi8(_pM128i(a),_pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
-_NEON2SSE_INLINE int16x4_t vcgt_s16(int16x4_t a, int16x4_t b)
-{
-    int16x4_t res64;
-    return64(_mm_cmpgt_epi16(_pM128i(a),_pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
-_NEON2SSE_INLINE int32x2_t vcgt_s32(int32x2_t a, int32x2_t b)
-{
-    int32x2_t res64;
-    return64(_mm_cmpgt_epi32(_pM128i(a),_pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
-_NEON2SSE_INLINE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b)
-{
-    uint32x2_t res64;
-    __m128 res;
-    res = _mm_cmpgt_ps(_pM128(a),_pM128(b)); //only the first 2 entries are used
-    return64f(res);
-}
-
-_NEON2SSESTORAGE uint8x8_t vcgt_u8(uint8x8_t a,  uint8x8_t b); // VCGT.U8 d0, d0, d0
-_NEON2SSE_INLINE uint8x8_t vcgt_u8(uint8x8_t a,  uint8x8_t b)
-{
-    uint8x8_t res64;
-    return64(vcgtq_u8(_pM128i(a), _pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE uint16x4_t vcgt_u16(uint16x4_t a,  uint16x4_t b); // VCGT.s16 d0, d0, d0
-_NEON2SSE_INLINE uint16x4_t vcgt_u16(uint16x4_t a,  uint16x4_t b)
-{
-    uint16x4_t res64;
-    return64(vcgtq_u16(_pM128i(a), _pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE uint32x2_t vcgt_u32(uint32x2_t a,  uint32x2_t b); // VCGT.U32 d0, d0, d0
-_NEON2SSE_INLINE uint32x2_t vcgt_u32(uint32x2_t a,  uint32x2_t b)
-{
-    uint32x2_t res64;
-    return64(vcgtq_u32(_pM128i(a), _pM128i(b)));
-}
-
-
-_NEON2SSE_GLOBAL uint8x16_t   vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
-#define vcgtq_s8 _mm_cmpgt_epi8
-
-_NEON2SSE_GLOBAL uint16x8_t   vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
-#define vcgtq_s16 _mm_cmpgt_epi16
-
-_NEON2SSE_GLOBAL uint32x4_t   vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
-#define vcgtq_s32 _mm_cmpgt_epi32
-
-_NEON2SSESTORAGE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
-_NEON2SSE_INLINE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b)
-{
-    __m128 res;
-    res = _mm_cmpgt_ps(a,b);
-    return *(__m128i*)&res;
-}
-
-_NEON2SSESTORAGE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
-_NEON2SSE_INLINE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b) // VCGT.U8 q0, q0, q0
-{
-      //no unsigned byte comparison in SSE, only signed is available, so we need a trick
-        __m128i c128, as, bs;
-        c128 = _mm_set1_epi8(-128); //(int8_t)0x80
-        as = _mm_sub_epi8(a, c128);
-        bs = _mm_sub_epi8(b, c128);
-        return _mm_cmpgt_epi8(as, bs);
-}
-
-_NEON2SSESTORAGE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0
-_NEON2SSE_INLINE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b) // VCGT.s16 q0, q0, q0
-{
-    //no unsigned short comparison in SSE, only signed is available, so we need a trick
-    __m128i c8000, as, bs;
-    c8000 = _mm_set1_epi16(-32768); //(int16_t)0x8000
-    as = _mm_sub_epi16(a, c8000);
-    bs = _mm_sub_epi16(b, c8000);
-    return _mm_cmpgt_epi16(as, bs);
-}
-
-_NEON2SSESTORAGE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
-_NEON2SSE_INLINE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b) // VCGT.U32 q0, q0, q0
-{
-    //no unsigned int comparison in SSE, only signed is available, so we need a trick
-    __m128i c80000000, as, bs;
-    c80000000 = _mm_set1_epi32 (0x80000000);
-    as = _mm_sub_epi32(a,c80000000);
-    bs = _mm_sub_epi32(b,c80000000);
-    return _mm_cmpgt_epi32 ( as, bs);
-}
-
-//********************* Vector compare less-than **************************
-//*************************************************************************
-_NEON2SSE_GLOBAL uint8x8_t   vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
-#define vclt_s8(a,b) vcgt_s8(b,a) //swap the arguments!!
-
-
-_NEON2SSE_GLOBAL uint16x4_t   vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
-#define vclt_s16(a,b) vcgt_s16(b,a) //swap the arguments!!
-
-
-_NEON2SSE_GLOBAL uint32x2_t   vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
-#define vclt_s32(a,b)  vcgt_s32(b,a) //swap the arguments!!
-
-
-_NEON2SSE_GLOBAL uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
-#define vclt_f32(a,b) vcgt_f32(b, a) //swap the arguments!!
-
-_NEON2SSE_GLOBAL uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
-#define vclt_u8(a,b) vcgt_u8(b,a) //swap the arguments!!
-
-_NEON2SSE_GLOBAL uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.s16 d0, d0, d0
-#define vclt_u16(a,b) vcgt_u16(b,a) //swap the arguments!!
-
-_NEON2SSE_GLOBAL uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
-#define vclt_u32(a,b) vcgt_u32(b,a) //swap the arguments!!
-
-_NEON2SSE_GLOBAL uint8x16_t   vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
-#define vcltq_s8(a,b) vcgtq_s8(b, a) //swap the arguments!!
-
-_NEON2SSE_GLOBAL uint16x8_t   vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
-#define vcltq_s16(a,b) vcgtq_s16(b, a) //swap the arguments!!
-
-_NEON2SSE_GLOBAL uint32x4_t   vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
-#define vcltq_s32(a,b) vcgtq_s32(b, a) //swap the arguments!!
-
-_NEON2SSE_GLOBAL uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
-#define vcltq_f32(a,b) vcgtq_f32(b, a) //swap the arguments!!
-
-_NEON2SSE_GLOBAL uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
-#define vcltq_u8(a,b) vcgtq_u8(b, a) //swap the arguments!!
-
-_NEON2SSE_GLOBAL uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0
-#define vcltq_u16(a,b) vcgtq_u16(b, a) //swap the arguments!!
-
-_NEON2SSE_GLOBAL uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
-#define vcltq_u32(a,b) vcgtq_u32(b, a) //swap the arguments!!
-
-//*****************Vector compare absolute greater-than or equal ************
-//***************************************************************************
-_NEON2SSESTORAGE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
-_NEON2SSE_INLINE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b)
-{
-    uint32x2_t res64;
-    __m128i c7fffffff;
-    __m128 a0, b0;
-    c7fffffff = _mm_set1_epi32 (0x7fffffff);
-    a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
-    b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
-    a0 = _mm_cmpge_ps ( a0, b0);
-    return64f(a0);
-}
-
-_NEON2SSESTORAGE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
-_NEON2SSE_INLINE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0
-{
-    __m128i c7fffffff;
-    __m128 a0, b0;
-    c7fffffff = _mm_set1_epi32 (0x7fffffff);
-    a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
-    b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
-    a0 = _mm_cmpge_ps ( a0, b0);
-    return (*(__m128i*)&a0);
-}
-
-//********Vector compare absolute less-than or equal ******************
-//********************************************************************
-_NEON2SSESTORAGE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
-_NEON2SSE_INLINE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b)
-{
-    uint32x2_t res64;
-    __m128i c7fffffff;
-    __m128 a0, b0;
-    c7fffffff = _mm_set1_epi32 (0x7fffffff);
-    a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
-    b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
-    a0 = _mm_cmple_ps (a0, b0);
-    return64f(a0);
-}
-
-_NEON2SSESTORAGE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
-_NEON2SSE_INLINE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0
-{
-    __m128i c7fffffff;
-    __m128 a0, b0;
-    c7fffffff = _mm_set1_epi32 (0x7fffffff);
-    a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
-    b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
-    a0 = _mm_cmple_ps (a0, b0);
-    return (*(__m128i*)&a0);
-}
-
-//********  Vector compare absolute greater-than    ******************
-//******************************************************************
-_NEON2SSESTORAGE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
-_NEON2SSE_INLINE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b)
-{
-    uint32x2_t res64;
-    __m128i c7fffffff;
-    __m128 a0, b0;
-    c7fffffff = _mm_set1_epi32 (0x7fffffff);
-    a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
-    b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
-    a0 = _mm_cmpgt_ps (a0, b0);
-    return64f(a0);
-}
-
-_NEON2SSESTORAGE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
-_NEON2SSE_INLINE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0
-{
-    __m128i c7fffffff;
-    __m128 a0, b0;
-    c7fffffff = _mm_set1_epi32 (0x7fffffff);
-    a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
-    b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
-    a0 = _mm_cmpgt_ps (a0, b0);
-    return (*(__m128i*)&a0);
-}
-
-//***************Vector compare absolute less-than  ***********************
-//*************************************************************************
-_NEON2SSESTORAGE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
-_NEON2SSE_INLINE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b)
-{
-    uint32x2_t res64;
-    __m128i c7fffffff;
-    __m128 a0, b0;
-    c7fffffff = _mm_set1_epi32 (0x7fffffff);
-    a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
-    b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
-    a0 = _mm_cmplt_ps (a0, b0);
-    return64f(a0);
-}
-
-_NEON2SSESTORAGE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
-_NEON2SSE_INLINE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0
-{
-    __m128i c7fffffff;
-    __m128 a0, b0;
-    c7fffffff = _mm_set1_epi32 (0x7fffffff);
-    a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
-    b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
-    a0 = _mm_cmplt_ps (a0, b0);
-    return (*(__m128i*)&a0);
-}
-
-//*************************Vector test bits************************************
-//*****************************************************************************
-/*VTST (Vector Test Bits) takes each element in a vector, and bitwise logical ANDs them
-with the corresponding element of a second vector. If the result is not zero, the
-corresponding element in the destination vector is set to all ones. Otherwise, it is set to
-all zeros. */
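// In other words, an output lane is all ones when the two inputs share at least one set bit and
// all zeros otherwise; the implementations below build this from AND, a compare against zero,
// and an XOR that inverts the mask. A scalar sketch of one lane (illustration only):

#include <stdint.h>
/* scalar model of one vtstq_s8 lane (illustrative helper) */
static uint8_t vtst8_model(uint8_t a, uint8_t b) {
    uint8_t eq0 = ((a & b) == 0) ? 0xFF : 0x00; /* _mm_cmpeq_epi8(a & b, zero) */
    return (uint8_t)(eq0 ^ 0xFF);               /* XOR with all ones inverts the result */
}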
-
-_NEON2SSESTORAGE uint8x8_t vtst_s8(int8x8_t a,  int8x8_t b); // VTST.8 d0, d0, d0
-_NEON2SSE_INLINE uint8x8_t vtst_s8(int8x8_t a,  int8x8_t b)
-{
-    int8x8_t res64;
-    return64(vtstq_s8(_pM128i(a), _pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE uint16x4_t vtst_s16(int16x4_t a,  int16x4_t b); // VTST.16 d0, d0, d0
-_NEON2SSE_INLINE uint16x4_t vtst_s16(int16x4_t a,  int16x4_t b)
-{
-    int16x4_t res64;
-    return64(vtstq_s16(_pM128i(a), _pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE uint32x2_t vtst_s32(int32x2_t a,  int32x2_t b); // VTST.32 d0, d0, d0
-_NEON2SSE_INLINE uint32x2_t vtst_s32(int32x2_t a,  int32x2_t b)
-{
-    int32x2_t res64;
-    return64(vtstq_s32(_pM128i(a), _pM128i(b)));
-}
-
-
-_NEON2SSE_GLOBAL uint8x8_t vtst_u8(uint8x8_t a,  uint8x8_t b); // VTST.8 d0, d0, d0
-#define vtst_u8 vtst_s8
-
-_NEON2SSE_GLOBAL uint16x4_t vtst_u16(uint16x4_t a,  uint16x4_t b); // VTST.16 d0, d0, d0
-#define vtst_u16 vtst_s16
-
-_NEON2SSE_GLOBAL uint32x2_t vtst_u32(uint32x2_t a,  uint32x2_t b); // VTST.32 d0, d0, d0
-#define vtst_u32 vtst_s32
-
-
-_NEON2SSE_GLOBAL uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0
-#define vtst_p8 vtst_u8
-
-_NEON2SSESTORAGE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0
-_NEON2SSE_INLINE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b) // VTST.8 q0, q0, q0
-{
-    __m128i zero, one, res;
-    zero = _mm_setzero_si128 ();
-    one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
-    res = _mm_and_si128 (a, b);
-    res =  _mm_cmpeq_epi8 (res, zero);
-    return _mm_xor_si128(res, one); //invert result
-}
-
-_NEON2SSESTORAGE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0
-_NEON2SSE_INLINE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b) // VTST.16 q0, q0, q0
-{
-    __m128i zero, one, res;
-    zero = _mm_setzero_si128 ();
-    one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
-    res = _mm_and_si128 (a, b);
-    res =  _mm_cmpeq_epi16 (res, zero);
-    return _mm_xor_si128(res, one); //invert result
-}
-
-_NEON2SSESTORAGE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0
-_NEON2SSE_INLINE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b) // VTST.32 q0, q0, q0
-{
-    __m128i zero, one, res;
-    zero = _mm_setzero_si128 ();
-    one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
-    res = _mm_and_si128 (a, b);
-    res =  _mm_cmpeq_epi32 (res, zero);
-    return _mm_xor_si128(res, one); //invert result
-}
-
-_NEON2SSE_GLOBAL uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0
-#define vtstq_u8 vtstq_s8
-
-_NEON2SSE_GLOBAL uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0
-#define vtstq_u16 vtstq_s16
-
-_NEON2SSE_GLOBAL uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0
-#define vtstq_u32 vtstq_s32
-
-_NEON2SSE_GLOBAL uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0
-#define vtstq_p8 vtstq_u8
-
-//****************** Absolute difference ********************
-//*** Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |*****
-//************************************************************
-_NEON2SSESTORAGE int8x8_t vabd_s8(int8x8_t a,  int8x8_t b); // VABD.S8 d0,d0,d0
-_NEON2SSE_INLINE int8x8_t vabd_s8(int8x8_t a,  int8x8_t b)
-{
-    int8x8_t res64;
-    return64(vabdq_s8(_pM128i(a), _pM128i(b)));
-}
-
-_NEON2SSESTORAGE int16x4_t vabd_s16(int16x4_t a,  int16x4_t b); // VABD.S16 d0,d0,d0
-_NEON2SSE_INLINE int16x4_t vabd_s16(int16x4_t a,  int16x4_t b)
-{
-    int16x4_t res64;
-    return64(vabdq_s16(_pM128i(a), _pM128i(b)));
-}
-
-_NEON2SSESTORAGE int32x2_t vabd_s32(int32x2_t a,  int32x2_t b); // VABD.S32 d0,d0,d0
-_NEON2SSE_INLINE int32x2_t vabd_s32(int32x2_t a,  int32x2_t b)
-{//need to deal with an intermediate overflow
-    int32x2_t res;
-    res.m64_i32[0] = (a.m64_i32[0] > b.m64_i32[0]) ? a.m64_i32[0] -  b.m64_i32[0]: b.m64_i32[0] -  a.m64_i32[0];
-    res.m64_i32[1] = (a.m64_i32[1] > b.m64_i32[1]) ? a.m64_i32[1] -  b.m64_i32[1]: b.m64_i32[1] -  a.m64_i32[1];
-    return res;
-}
-
-_NEON2SSESTORAGE uint8x8_t vabd_u8(uint8x8_t a,  uint8x8_t b); // VABD.U8 d0,d0,d0
-_NEON2SSE_INLINE uint8x8_t vabd_u8(uint8x8_t a,  uint8x8_t b)
-{
-    uint8x8_t res64;
-    return64(vabdq_u8(_pM128i(a), _pM128i(b)));
-}
-
-_NEON2SSESTORAGE uint16x4_t vabd_u16(uint16x4_t a,  uint16x4_t b); // VABD.s16 d0,d0,d0
-_NEON2SSE_INLINE uint16x4_t vabd_u16(uint16x4_t a,  uint16x4_t b)
-{
-    uint16x4_t res64;
-    return64(vabdq_u16(_pM128i(a), _pM128i(b)));
-}
-
-_NEON2SSESTORAGE uint32x2_t vabd_u32(uint32x2_t a,  uint32x2_t b); // VABD.U32 d0,d0,d0
-_NEON2SSE_INLINE uint32x2_t vabd_u32(uint32x2_t a,  uint32x2_t b)
-{
-    uint32x2_t res64;
-    return64(vabdq_u32(_pM128i(a), _pM128i(b)));
-}
-
-_NEON2SSESTORAGE float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0
-_NEON2SSE_INLINE float32x2_t vabd_f32(float32x2_t a, float32x2_t b)
-{
-    float32x4_t res;
-    __m64_128 res64;
-    res = vabdq_f32(_pM128(a), _pM128(b));
-    _M64f(res64, res);
-    return res64;
-}
-
-_NEON2SSESTORAGE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0
-_NEON2SSE_INLINE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b) // VABD.S8 q0,q0,q0
-{ //need to deal with an intermediate overflow
-   __m128i cmp, difab, difba;
-   cmp = vcgtq_s8(a,b);
-   difab = _mm_sub_epi8(a,b);
-   difba = _mm_sub_epi8(b,a);
-   difab = _mm_and_si128(cmp, difab);
-   difba = _mm_andnot_si128(cmp, difba);
-   return _mm_or_si128(difab, difba);
-}
-
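// The mask-and-select form above is needed because abs() of a wrapped subtraction is wrong when
// the true difference overflows the element type: for int8, a = 100, b = -100 has |a - b| = 200
// (0xC8 after truncation, as on NEON), while abs() of the wrapped difference -56 would give 56.
// Selecting a - b where a > b and b - a elsewhere keeps the correct truncated result.
// A scalar sketch of the selection (illustration only):

#include <stdint.h>
/* scalar model of one vabdq_s8 lane, result truncated to 8 bits as on NEON (illustrative helper) */
static int8_t sabd8_model(int8_t a, int8_t b) {
    uint8_t difab = (uint8_t)((uint8_t)a - (uint8_t)b); /* wrapping a - b, like _mm_sub_epi8 */
    uint8_t difba = (uint8_t)((uint8_t)b - (uint8_t)a); /* wrapping b - a */
    return (int8_t)((a > b) ? difab : difba);           /* cmp mask selects |a - b| mod 256 */
}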
-_NEON2SSESTORAGE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0
-_NEON2SSE_INLINE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b) // VABD.S16 q0,q0,q0
-{//need to deal with an intermediate overflow
-    __m128i cmp, difab, difba;
-    cmp = vcgtq_s16(a,b);
-    difab = _mm_sub_epi16(a,b);
-    difba = _mm_sub_epi16 (b,a);
-    difab = _mm_and_si128(cmp, difab);
-    difba = _mm_andnot_si128(cmp, difba);
-    return _mm_or_si128(difab, difba);
-}
-
-_NEON2SSESTORAGE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0
-_NEON2SSE_INLINE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b) // VABD.S32 q0,q0,q0
-{//need to deal with an intermediate overflow
-    __m128i cmp, difab, difba;
-    cmp = vcgtq_s32(a,b);
-    difab = _mm_sub_epi32(a,b);
-    difba = _mm_sub_epi32(b,a);
-    difab = _mm_and_si128(cmp, difab);
-    difba = _mm_andnot_si128(cmp, difba);
-    return _mm_or_si128(difab, difba);
-}
-
-_NEON2SSESTORAGE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0
-_NEON2SSE_INLINE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b) //no abs for unsigned
-{
-    __m128i  difab, difba;
-    difab = _mm_subs_epu8(a,b);
-    difba = _mm_subs_epu8 (b,a);
-    return _mm_or_si128(difab, difba);
-}
-
-_NEON2SSESTORAGE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.s16 q0,q0,q0
-_NEON2SSE_INLINE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b)
-{
-    __m128i difab, difba;
-    difab = _mm_subs_epu16(a,b);
-    difba = _mm_subs_epu16 (b,a);
-    return _mm_or_si128(difab, difba);
-}
-
-_NEON2SSESTORAGE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0
-_NEON2SSE_INLINE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b)
-{
-    __m128i cmp, difab, difba;
-    cmp = vcgtq_u32(a,b);
-    difab = _mm_sub_epi32(a,b);
-    difba = _mm_sub_epi32 (b,a);
-    difab = _mm_and_si128(cmp, difab);
-    difba = _mm_andnot_si128(cmp, difba);
-    return _mm_or_si128(difab, difba);
-}
-
-_NEON2SSESTORAGE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0
-_NEON2SSE_INLINE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b) // VABD.F32 q0,q0,q0
-{
-    __m128i c1;
-    __m128 res;
-    c1 =  _mm_set1_epi32(0x7fffffff);
-    res = _mm_sub_ps (a, b);
-    return _mm_and_ps (res, *(__m128*)&c1);
-}
-
-//************  Absolute difference - long **************************
-//********************************************************************
-_NEON2SSESTORAGE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0
-_NEON2SSE_INLINE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b) // VABDL.S8 q0,d0,d0
-{
-    __m128i a16, b16;
-    a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
-    b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
-    return vabdq_s16(a16, b16);
-
-}
-
-_NEON2SSESTORAGE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0
-_NEON2SSE_INLINE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b) // VABDL.S16 q0,d0,d0
-{
-    __m128i a32, b32;
-    a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
-    b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
-    return vabdq_s32(a32, b32);
-}
-
-_NEON2SSESTORAGE int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabdl_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL)
-{
-    //no optimal SIMD solution, serial looks faster
-    _NEON2SSE_ALIGN_16 int64_t res[2];
-    if(a.m64_i32[0] > b.m64_i32[0]) res[0] = ( int64_t) a.m64_i32[0] - ( int64_t) b.m64_i32[0];
-    else res[0] = ( int64_t) b.m64_i32[0] - ( int64_t) a.m64_i32[0];
-    if(a.m64_i32[1] > b.m64_i32[1]) res[1] = ( int64_t) a.m64_i32[1] - ( int64_t) b.m64_i32[1];
-    else res[1] = ( int64_t) b.m64_i32[1] - ( int64_t) a.m64_i32[1];
-    return _mm_load_si128((__m128i*)res);
-}
-
-_NEON2SSESTORAGE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0
-_NEON2SSE_INLINE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b)
-{
-    __m128i res;
-    res = vsubl_u8(a,b);
-    return _mm_abs_epi16(res);
-}
-
-_NEON2SSESTORAGE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.s16 q0,d0,d0
-_NEON2SSE_INLINE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b)
-{
-    __m128i res;
-    res = vsubl_u16(a,b);
-    return _mm_abs_epi32(res);
-}
-
-_NEON2SSESTORAGE uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    _NEON2SSE_ALIGN_16 uint64_t res[2];
-    if(a.m64_u32[0] > b.m64_u32[0]) res[0] = ( uint64_t) a.m64_u32[0] - ( uint64_t) b.m64_u32[0];
-    else res[0] = ( uint64_t) b.m64_u32[0] - ( uint64_t) a.m64_u32[0];
-    if(a.m64_u32[1] > b.m64_u32[1]) res[1] = ( uint64_t) a.m64_u32[1] - ( uint64_t) b.m64_u32[1];
-    else res[1] = ( uint64_t) b.m64_u32[1] - ( uint64_t) a.m64_u32[1];
-    return _mm_load_si128((__m128i*)res);
-}
-
-//**********Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] | *************
-//*********************************************************************************************
-_NEON2SSESTORAGE int8x8_t vaba_s8(int8x8_t a,  int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0
-_NEON2SSE_INLINE int8x8_t vaba_s8(int8x8_t a,  int8x8_t b, int8x8_t c)
-{
-    int8x8_t res64;
-    return64(vabaq_s8(_pM128i(a),_pM128i(b), _pM128i(c)));
-}
-
-_NEON2SSESTORAGE int16x4_t vaba_s16(int16x4_t a,  int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0
-_NEON2SSE_INLINE int16x4_t vaba_s16(int16x4_t a,  int16x4_t b, int16x4_t c)
-{
-    int16x4_t res64;
-    return64(vabaq_s16(_pM128i(a), _pM128i(b), _pM128i(c)));
-}
-
-_NEON2SSESTORAGE int32x2_t vaba_s32(int32x2_t a,  int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0
-_NEON2SSE_INLINE int32x2_t vaba_s32(int32x2_t a,  int32x2_t b, int32x2_t c)
-{
-    int32x2_t res64;
-    return64(vabaq_s32(_pM128i(a), _pM128i(b), _pM128i(c)));
-}
-
-_NEON2SSESTORAGE uint8x8_t vaba_u8(uint8x8_t a,  uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0
-_NEON2SSE_INLINE uint8x8_t vaba_u8(uint8x8_t a,  uint8x8_t b, uint8x8_t c)
-{
-    int8x8_t res64;
-    return64(vabaq_u8(_pM128i(a),_pM128i(b), _pM128i(c)));
-}
-
-
-_NEON2SSESTORAGE uint16x4_t vaba_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t c); // VABA.s16 d0,d0,d0
-_NEON2SSE_INLINE uint16x4_t vaba_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t c)
-{
-    int16x4_t res64;
-    return64(vabaq_u16(_pM128i(a), _pM128i(b), _pM128i(c)));
-}
-
-_NEON2SSESTORAGE uint32x2_t vaba_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0
-_NEON2SSE_INLINE uint32x2_t vaba_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t c)
-{
-    uint32x2_t res64;
-    return64(vabaq_u32(_pM128i(a), _pM128i(b), _pM128i(c)));
-}
-
-_NEON2SSESTORAGE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0
-_NEON2SSE_INLINE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VABA.S8 q0,q0,q0
-{
-    int8x16_t sub;
-    sub = vabdq_s8(b, c);
-    return vaddq_s8( a, sub);
-}
-
-_NEON2SSESTORAGE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0
-_NEON2SSE_INLINE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VABA.S16 q0,q0,q0
-{
-    int16x8_t sub;
-    sub = vabdq_s16(b, c);
-    return vaddq_s16( a, sub);
-}
-
-_NEON2SSESTORAGE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0
-_NEON2SSE_INLINE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VABA.S32 q0,q0,q0
-{
-    int32x4_t sub;
-    sub = vabdq_s32(b, c);
-    return vaddq_s32( a, sub);
-}
-
-_NEON2SSESTORAGE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0
-_NEON2SSE_INLINE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c)
-{
-    uint8x16_t sub;
-    sub = vabdq_u8(b, c);
-    return vaddq_u8( a, sub);
-}
-
-_NEON2SSESTORAGE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.s16 q0,q0,q0
-_NEON2SSE_INLINE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c)
-{
-    uint16x8_t sub;
-    sub = vabdq_u16(b, c);
-    return vaddq_u16( a, sub);
-}
-
-_NEON2SSESTORAGE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0
-_NEON2SSE_INLINE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c)
-{
-    uint32x4_t sub;
-    sub = vabdq_u32(b, c);
-    return vaddq_u32( a, sub);
-}
-
-//************** Absolute difference and accumulate - long ********************************
-//*************************************************************************************
-_NEON2SSESTORAGE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0
-_NEON2SSE_INLINE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VABAL.S8 q0,d0,d0
-{
-    __m128i b16, c16, res;
-    b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
-    c16 = _MM_CVTEPI8_EPI16 (_pM128i(c)); //SSE4.1,
-    res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) );
-    return _mm_add_epi16 (a, res);
-}
-
-_NEON2SSESTORAGE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0
-_NEON2SSE_INLINE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VABAL.S16 q0,d0,d0
-{
-    __m128i b32, c32, res;
-    b32 = _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1
-    c32 = _MM_CVTEPI16_EPI32(_pM128i(c)); //SSE4.1
-    res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) );
-    return _mm_add_epi32 (a, res);
-}
-
-_NEON2SSESTORAGE int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    __m128i res;
-    res = vabdl_s32(b,c);
-    return _mm_add_epi64(a, res);
-}
-
-_NEON2SSESTORAGE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0
-_NEON2SSE_INLINE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c)
-{
-    __m128i b16, c16, res;
-    b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
-    c16 = _MM_CVTEPU8_EPI16 (_pM128i(c)); //SSE4.1,
-    res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) );
-    return _mm_add_epi16 (a, res);
-}
-
-_NEON2SSESTORAGE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.s16 q0,d0,d0
-_NEON2SSE_INLINE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c)
-{
-    __m128i b32, c32, res;
-    b32 = _MM_CVTEPU16_EPI32(_pM128i(b)); //SSE4.1
-    c32 = _MM_CVTEPU16_EPI32(_pM128i(c)); //SSE4.1
-    res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) );
-    return _mm_add_epi32 (a, res);
-}
-
-_NEON2SSESTORAGE uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    __m128i res;
-    res = vabdl_u32(b,c);
-    return _mm_add_epi64(a, res);
-}
-
-//***********************************************************************************
-//****************  Maximum and minimum operations **********************************
-//***********************************************************************************
-//************* Maximum:  vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i]    *******
-//***********************************************************************************
-_NEON2SSESTORAGE int8x8_t   vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0
-_NEON2SSE_INLINE int8x8_t   vmax_s8(int8x8_t a, int8x8_t b)
-{
-    int8x8_t res64;
-    __m128i res;
-    res = _MM_MAX_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
-    return64(res);
-}
-
-_NEON2SSESTORAGE int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0
-_NEON2SSE_INLINE int16x4_t vmax_s16(int16x4_t a, int16x4_t b)
-{
-    int16x4_t res64;
-    return64(_mm_max_epi16(_pM128i(a),_pM128i(b)));
-}
-
-_NEON2SSESTORAGE int32x2_t   vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0
-_NEON2SSE_INLINE int32x2_t   vmax_s32(int32x2_t a, int32x2_t b)
-{
-    int32x2_t res64;
-    __m128i res;
-    res =  _MM_MAX_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
-    return64(res);
-}
-
-_NEON2SSESTORAGE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0
-_NEON2SSE_INLINE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b)
-{
-    uint8x8_t res64;
-    return64(_mm_max_epu8(_pM128i(a),_pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.s16 d0,d0,d0
-_NEON2SSE_INLINE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b)
-{
-    uint16x4_t res64;
-    return64(_MM_MAX_EPU16(_pM128i(a),_pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE uint32x2_t   vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0
-_NEON2SSE_INLINE uint32x2_t   vmax_u32(uint32x2_t a, uint32x2_t b)
-{
-    uint32x2_t res64;
-    __m128i res;
-    res = _MM_MAX_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits, may not be as efficient as a serial version
-    return64(res);
-}
-
-_NEON2SSESTORAGE float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0
-_NEON2SSE_INLINE float32x2_t vmax_f32(float32x2_t a, float32x2_t b)
-{
-    //serial solution looks faster than the SIMD one
-    float32x2_t res;
-    res.m64_f32[0] = (a.m64_f32[0] > b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0];
-    res.m64_f32[1] = (a.m64_f32[1] > b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1];
-    return res;
-}
-
-_NEON2SSE_GLOBAL int8x16_t   vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0
-#define vmaxq_s8 _MM_MAX_EPI8 //SSE4.1
-
-_NEON2SSE_GLOBAL int16x8_t   vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0
-#define vmaxq_s16 _mm_max_epi16
-
-_NEON2SSE_GLOBAL int32x4_t   vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0
-#define vmaxq_s32 _MM_MAX_EPI32 //SSE4.1
-
-_NEON2SSE_GLOBAL uint8x16_t   vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0
-#define vmaxq_u8 _mm_max_epu8
-
-_NEON2SSE_GLOBAL uint16x8_t   vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.s16 q0,q0,q0
-#define vmaxq_u16 _MM_MAX_EPU16 //SSE4.1
-
-_NEON2SSE_GLOBAL uint32x4_t   vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0
-#define vmaxq_u32 _MM_MAX_EPU32 //SSE4.1
-
-
-_NEON2SSE_GLOBAL float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0
-#define vmaxq_f32 _mm_max_ps
-
-
-_NEON2SSE_GLOBAL float64x2_t vmaxq_f64(float64x2_t a, float64x2_t b); // VMAX.F64 q0,q0,q0
-#define vmaxq_f64 _mm_max_pd
-
-
-//*************** Minimum: vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i] ********************************
-//***********************************************************************************************************
-_NEON2SSESTORAGE int8x8_t   vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0
-_NEON2SSE_INLINE int8x8_t   vmin_s8(int8x8_t a, int8x8_t b)
-{
-    int8x8_t res64;
-    __m128i res;
-    res = _MM_MIN_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
-    return64(res);
-}
-
-_NEON2SSESTORAGE int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0
-_NEON2SSE_INLINE int16x4_t vmin_s16(int16x4_t a, int16x4_t b)
-{
-    int16x4_t res64;
-    return64(_mm_min_epi16(_pM128i(a),_pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE int32x2_t   vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0
-_NEON2SSE_INLINE int32x2_t   vmin_s32(int32x2_t a, int32x2_t b)
-{
-    int32x2_t res64;
-    __m128i res;
-    res = _MM_MIN_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
-    return64(res);
-}
-
-_NEON2SSESTORAGE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0
-_NEON2SSE_INLINE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b)
-{
-    uint8x8_t res64;
-    return64(_mm_min_epu8(_pM128i(a),_pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.s16 d0,d0,d0
-_NEON2SSE_INLINE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b)
-{
-    uint16x4_t res64;
-    return64(_MM_MIN_EPU16(_pM128i(a),_pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE uint32x2_t   vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0
-_NEON2SSE_INLINE uint32x2_t   vmin_u32(uint32x2_t a, uint32x2_t b)
-{
-    uint32x2_t res64;
-    __m128i res;
-    res = _MM_MIN_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits, may not be as efficient as a serial version
-    return64(res);
-}
-
-_NEON2SSESTORAGE float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0
-_NEON2SSE_INLINE float32x2_t vmin_f32(float32x2_t a, float32x2_t b)
-{
-    //serial solution looks faster than the SIMD one
-    float32x2_t res;
-    res.m64_f32[0] = (a.m64_f32[0] < b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0];
-    res.m64_f32[1] = (a.m64_f32[1] < b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1];
-    return res;
-}
-
-_NEON2SSE_GLOBAL int8x16_t   vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0
-#define vminq_s8 _MM_MIN_EPI8 //SSE4.1
-
-_NEON2SSE_GLOBAL int16x8_t   vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0
-#define vminq_s16 _mm_min_epi16
-
-_NEON2SSE_GLOBAL int32x4_t   vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0
-#define vminq_s32 _MM_MIN_EPI32 //SSE4.1
-
-_NEON2SSE_GLOBAL uint8x16_t   vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0
-#define vminq_u8 _mm_min_epu8
-
-_NEON2SSE_GLOBAL uint16x8_t   vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.s16 q0,q0,q0
-#define vminq_u16 _MM_MIN_EPU16 //SSE4.1
-
-_NEON2SSE_GLOBAL uint32x4_t   vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0
-#define vminq_u32 _MM_MIN_EPU32 //SSE4.1
-
-_NEON2SSE_GLOBAL float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0
-#define vminq_f32 _mm_min_ps
-
-
-_NEON2SSE_GLOBAL float64x2_t vminq_f64(float64x2_t a, float64x2_t b); // VMIN.F64 q0,q0,q0
-#define vminq_f64 _mm_min_pd
-
-
-//*************  Pairwise addition operations. **************************************
-//************************************************************************************
-//Pairwise add - adds adjacent pairs of elements of two vectors, and places the results in the destination vector
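// A scalar sketch of the operation (illustration only): the low half of the result holds the pair
// sums of a and the high half the pair sums of b, which is why the _mm_hadd_* results below still
// need a shuffle to bring b's pair sums down into the low 64 bits.

#include <stdint.h>
/* scalar model of vpadd_s16 (illustrative helper) */
static void vpadd_s16_model(const int16_t a[4], const int16_t b[4], int16_t r[4]) {
    r[0] = (int16_t)(a[0] + a[1]);
    r[1] = (int16_t)(a[2] + a[3]);
    r[2] = (int16_t)(b[0] + b[1]);
    r[3] = (int16_t)(b[2] + b[3]);
}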
-_NEON2SSESTORAGE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0
-_NEON2SSE_INLINE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b) // VPADD.I8 d0,d0,d0
-{
-    //no 8 bit hadd in IA32, need to go to 16 bit and then pack
-    int8x8_t res64;
-    __m128i a16, b16, res;
-    a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
-    b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1
-    res = _mm_hadd_epi16 (a16, b16);
-    res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit, use low 64 bits
-    return64(res);
-}
-
-_NEON2SSESTORAGE int16x4_t   vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0
-_NEON2SSE_INLINE int16x4_t   vpadd_s16(int16x4_t a, int16x4_t b)
-{
-    int16x4_t res64;
-    __m128i hadd128;
-    hadd128 = _mm_hadd_epi16 (_pM128i(a), _pM128i(b));
-    hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
-    return64(hadd128);
-}
-
-
-_NEON2SSESTORAGE int32x2_t   vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0
-_NEON2SSE_INLINE int32x2_t   vpadd_s32(int32x2_t a, int32x2_t b)
-{
-    int32x2_t res64;
-    __m128i hadd128;
-    hadd128 = _mm_hadd_epi32 (_pM128i(a), _pM128i(b));
-    hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
-    return64(hadd128);
-}
-
-
-_NEON2SSESTORAGE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0
-_NEON2SSE_INLINE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b) // VPADD.I8 d0,d0,d0
-{
-    //  no 8 bit hadd in IA32, need to go to 16 bit and then pack
-    uint8x8_t res64;
-//  no unsigned _mm_hadd_ functions in IA32, but unsigned 8-bit values fit in signed 16-bit ones, so it works
-    __m128i mask8, a16, b16, res;
-    mask8 = _mm_set1_epi16(0xff);
-    a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1
-    b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1
-    res = _mm_hadd_epi16 (a16, b16);
-    res = _mm_and_si128(res, mask8); //to avoid saturation
-    res = _mm_packus_epi16 (res,res); //use low 64 bits
-    return64(res);
-}
-
-_NEON2SSESTORAGE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0
-_NEON2SSE_INLINE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b) // VPADD.I16 d0,d0,d0
-{
-    // solution may not be optimal, serial execution may be faster
-    // no unsigned _mm_hadd_ functions in IA32, need to move from unsigned to signed
-    uint16x4_t res64;
-    __m128i c32767,  cfffe, as, bs, res;
-    c32767 = _mm_set1_epi16 (32767);
-    cfffe = _mm_set1_epi16 (-2); //(int16_t)0xfffe
-    as = _mm_sub_epi16 (_pM128i(a), c32767);
-    bs = _mm_sub_epi16 (_pM128i(b), c32767);
-    res = _mm_hadd_epi16 (as, bs);
-    res = _mm_add_epi16 (res, cfffe);
-    res = _mm_shuffle_epi32 (res, 0 | (2 << 2) | (1 << 4) | (3 << 6));
-    return64(res);
-}
-
-_NEON2SSESTORAGE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0
-_NEON2SSE_INLINE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b) //serial may be faster
-{
-    //hadd doesn't work for unsigned values
-    uint32x2_t res64;
-    __m128i ab, ab_sh, res;
-    ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //a0 a1 b0 b1
-    ab_sh = _mm_shuffle_epi32(ab, 1 | (0 << 2) | (3 << 4) | (2 << 6)); //a1, a0, b1, b0
-    res = _mm_add_epi32(ab, ab_sh);
-    res = _mm_shuffle_epi32(res, 0 | (2 << 2) | (1 << 4) | (3 << 6));
-    return64(res);
-}
-
-_NEON2SSESTORAGE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b); // VPADD.F32 d0,d0,d0
-_NEON2SSE_INLINE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b)
-{
-    __m128 hadd128;
-    __m64_128 res64;
-    hadd128 = _mm_hadd_ps (_pM128(a), _pM128(b));
-    hadd128 = _mm_shuffle_ps (hadd128, hadd128, _MM_SHUFFLE(3,1, 2, 0)); //use low 64 bits
-    _M64f(res64, hadd128);
-    return res64;
-}
-
-
-//**************************  Long pairwise add  **********************************
-//*********************************************************************************
-//Adds adjacent pairs of elements of a vector, sign or zero extends the results to twice their original width,
-// and places the final results in the destination vector.
-
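// A scalar sketch of the widening pair sum, shown for the signed 8-to-16-bit case (illustration only):

#include <stdint.h>
/* scalar model of vpaddl_s8: each result lane is the widened sum of two adjacent input lanes */
static void vpaddl_s8_model(const int8_t a[8], int16_t r[4]) {
    for (int i = 0; i < 4; i++)
        r[i] = (int16_t)((int16_t)a[2 * i] + (int16_t)a[2 * i + 1]);
}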
-_NEON2SSESTORAGE int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0
-_NEON2SSE_INLINE int16x4_t vpaddl_s8(int8x8_t a) // VPADDL.S8 d0,d0
-{
-    //no 8 bit hadd in IA32, need to go to 16 bit anyway
-    __m128i a16;
-    int16x4_t res64;
-    a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
-    a16 = _mm_hadd_epi16 (a16,  a16); //use low 64 bits
-    return64(a16);
-}
-
-_NEON2SSESTORAGE int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0
-_NEON2SSE_INLINE int32x2_t vpaddl_s16(int16x4_t a) // VPADDL.S16 d0,d0
-{
-    // solution may not be optimal, serial execution may be faster
-    int32x2_t res64;
-    __m128i r32_1;
-    r32_1 = _MM_CVTEPI16_EPI32 (_pM128i(a));
-    r32_1 = _mm_hadd_epi32(r32_1, r32_1); //use low 64 bits
-    return64(r32_1);
-}
-
-_NEON2SSESTORAGE int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vpaddl_s32(int32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster
-{
-    int64x1_t res;
-    res.m64_i64[0] = (int64_t)a.m64_i32[0] + (int64_t)a.m64_i32[1];
-    return res;
-}
-
-_NEON2SSESTORAGE uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0
-_NEON2SSE_INLINE uint16x4_t vpaddl_u8(uint8x8_t a) // VPADDL.U8 d0,d0
-{
-    //  no 8 bit hadd in IA32, need to go to 16 bit
-//  no unsigned _mm_hadd_ functions in IA32, but unsigned 8-bit values fit in signed 16-bit ones, so it works
-    uint16x4_t res64;
-    __m128i a16;
-    a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits
-    a16 = _mm_hadd_epi16 (a16, a16); //use low 64 bits
-    return64(a16);
-}
-
-_NEON2SSESTORAGE uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.s16 d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpaddl_u16(uint16x4_t a),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    //serial solution looks faster than a SIMD one
-    uint32x2_t res;
-    res.m64_u32[0] = (uint32_t)a.m64_u16[0] + (uint32_t)a.m64_u16[1];
-    res.m64_u32[1] = (uint32_t)a.m64_u16[2] + (uint32_t)a.m64_u16[3];
-    return res;
-}
-
-_NEON2SSESTORAGE uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vpaddl_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster
-{
-    uint64x1_t res;
-    res.m64_u64[0] = (uint64_t)a.m64_u32[0] + (uint64_t)a.m64_u32[1];
-    return res;
-}
-
-_NEON2SSESTORAGE int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0
-_NEON2SSE_INLINE int16x8_t vpaddlq_s8(int8x16_t a) // VPADDL.S8 q0,q0
-{
-    //no 8 bit hadd in IA32, need to go to 16 bit
-    __m128i r16_1, r16_2;
-    r16_1 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1
-    //swap hi and low part of r to process the remaining data
-    r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
-    r16_2 = _MM_CVTEPI8_EPI16 (r16_2);
-    return _mm_hadd_epi16 (r16_1, r16_2);
-}
-
-_NEON2SSESTORAGE int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0
-_NEON2SSE_INLINE int32x4_t vpaddlq_s16(int16x8_t a) // VPADDL.S16 q0,q0
-{
-    //convert both halves to 32 bit, then use the 32 bit hadd
-    __m128i r32_1, r32_2;
-    r32_1 = _MM_CVTEPI16_EPI32(a);
-    //swap hi and low part of r to process the remaining data
-    r32_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
-    r32_2 = _MM_CVTEPI16_EPI32 (r32_2);
-    return _mm_hadd_epi32 (r32_1, r32_2);
-}
-
-_NEON2SSESTORAGE int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0
-_NEON2SSE_INLINE int64x2_t vpaddlq_s32(int32x4_t a)
-{
-    __m128i top, bot;
-    bot = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
-    bot = _MM_CVTEPI32_EPI64(bot);
-    top = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 1));
-    top = _MM_CVTEPI32_EPI64(top);
-    return _mm_add_epi64(top, bot);
-}
-
-_NEON2SSESTORAGE uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0
-_NEON2SSE_INLINE uint16x8_t vpaddlq_u8(uint8x16_t a) // VPADDL.U8 q0,q0
-{
-    const __m128i ff = _mm_set1_epi16(0xFF);
-    __m128i low = _mm_and_si128(a, ff);
-    __m128i high = _mm_srli_epi16(a, 8);
-    return _mm_add_epi16(low, high);
-}
-
-#ifdef USE_SSE4
-_NEON2SSESTORAGE uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.s16 q0,q0
-_NEON2SSE_INLINE uint32x4_t vpaddlq_u16(uint16x8_t a)
-{
-    const __m128i zero = _mm_setzero_si128();
-    __m128i low = _mm_blend_epi16(zero, a, 0x55); // 0b1010101
-    __m128i high = _mm_srli_epi32(a, 16);
-    return _mm_add_epi32(low, high);
-}
-
-_NEON2SSESTORAGE uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
-_NEON2SSE_INLINE uint64x2_t vpaddlq_u32(uint32x4_t a)
-{
-    const __m128i zero = _mm_setzero_si128();
-    __m128i low = _mm_blend_epi16(zero, a, 0x33); // 0b00110011
-    __m128i high = _mm_srli_epi64(a, 32);
-    return _mm_add_epi64(low, high);
-}
-#else
-_NEON2SSESTORAGE uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.s16 q0,q0
-_NEON2SSE_INLINE uint32x4_t vpaddlq_u16(uint16x8_t a)
-{
-    const __m128i ff = _mm_set1_epi32(0xFFFF);
-    __m128i low = _mm_and_si128(a, ff);
-    __m128i high = _mm_srli_epi32(a, 16);
-    return _mm_add_epi32(low, high);
-}
-
-_NEON2SSESTORAGE uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
-_NEON2SSE_INLINE uint64x2_t vpaddlq_u32(uint32x4_t a)
-{
-    const __m128i ff = _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF);
-    __m128i low = _mm_and_si128(a, ff);
-    __m128i high = _mm_srli_epi64(a, 32);
-    return _mm_add_epi64(low, high);
-}
-#endif
-
-//************************  Long pairwise add and accumulate **************************
-//****************************************************************************************
-//VPADAL (Vector Pairwise Add and Accumulate Long) adds adjacent pairs of elements of a vector,
-// and accumulates the values of the results into the elements of the destination (wide) vector
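// Equivalently, each accumulator lane gains the widened sum of one adjacent pair from b; the
// q-register versions below reuse the vpaddlq_* helpers and add. A scalar sketch for the signed
// 8-to-16-bit case (illustration only):

#include <stdint.h>
/* scalar model of vpadal_s8 (illustrative helper) */
static void vpadal_s8_model(int16_t a[4], const int8_t b[8]) {
    for (int i = 0; i < 4; i++)
        a[i] = (int16_t)(a[i] + (int16_t)b[2 * i] + (int16_t)b[2 * i + 1]);
}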
-_NEON2SSESTORAGE int16x4_t vpadal_s8(int16x4_t a,  int8x8_t b); // VPADAL.S8 d0,d0
-_NEON2SSE_INLINE int16x4_t vpadal_s8(int16x4_t a,  int8x8_t b)
-{
-    int16x4_t res64;
-    return64(vpadalq_s8(_pM128i(a), _pM128i(b)));
-}
-
-_NEON2SSESTORAGE int32x2_t vpadal_s16(int32x2_t a,  int16x4_t b); // VPADAL.S16 d0,d0
-_NEON2SSE_INLINE int32x2_t vpadal_s16(int32x2_t a,  int16x4_t b)
-{
-    int32x2_t res64;
-    return64(vpadalq_s16(_pM128i(a), _pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0
-_NEON2SSE_INLINE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b)
-{
-    int64x1_t res;
-    res.m64_i64[0] = (int64_t)b.m64_i32[0] + (int64_t)b.m64_i32[1] + a.m64_i64[0];
-    return res;
-}
-
-_NEON2SSESTORAGE uint16x4_t vpadal_u8(uint16x4_t a,  uint8x8_t b); // VPADAL.U8 d0,d0
-_NEON2SSE_INLINE uint16x4_t vpadal_u8(uint16x4_t a,  uint8x8_t b)
-{
-    uint16x4_t res64;
-    return64(vpadalq_u8(_pM128i(a), _pM128i(b)));
-}
-
-
-_NEON2SSESTORAGE uint32x2_t vpadal_u16(uint32x2_t a,  uint16x4_t b); // VPADAL.s16 d0,d0
-_NEON2SSE_INLINE uint32x2_t vpadal_u16(uint32x2_t a,  uint16x4_t b)
-{
-    uint32x2_t res64;
-    return64(vpadalq_u16(_pM128i(a), _pM128i(b)));
-}
-
-_NEON2SSESTORAGE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0
-_NEON2SSE_INLINE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b)
-{
-    uint64x1_t res;
-    res.m64_u64[0] = (uint64_t)b.m64_u32[0] + (uint64_t)b.m64_u32[1] + a.m64_u64[0];
-    return res;
-}
-
-_NEON2SSESTORAGE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0
-_NEON2SSE_INLINE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b) // VPADAL.S8 q0,q0
-{
-    int16x8_t pad;
-    pad = vpaddlq_s8(b);
-    return _mm_add_epi16 (a, pad);
-}
-
-_NEON2SSESTORAGE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0
-_NEON2SSE_INLINE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b) // VPADAL.S16 q0,q0
-{
-    int32x4_t pad;
-    pad = vpaddlq_s16(b);
-    return _mm_add_epi32(a, pad);
-}
-
-_NEON2SSESTORAGE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0
-_NEON2SSE_INLINE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b)
-{
-    int64x2_t pad;
-    pad = vpaddlq_s32(b);
-    return _mm_add_epi64 (a, pad);
-}
-
-_NEON2SSESTORAGE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0
-_NEON2SSE_INLINE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b) // VPADAL.U8 q0,q0
-{
-    uint16x8_t pad;
-    pad = vpaddlq_u8(b);
-    return _mm_add_epi16 (a, pad);
-}
-
-_NEON2SSESTORAGE uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.s16 q0,q0
-_NEON2SSE_INLINE uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b)
-{
-    uint32x4_t pad;
-    pad = vpaddlq_u16(b);
-    return _mm_add_epi32(a, pad);
-}
-
-_NEON2SSESTORAGE uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0
-_NEON2SSE_INLINE uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b)
-{
-    uint64x2_t pad;
-    pad = vpaddlq_u32(b);
-    return _mm_add_epi64(a, pad);
-}
-
-//**********  Folding maximum   *************************************
-//*******************************************************************
-//VPMAX (Vector Pairwise Maximum) compares adjacent pairs of elements in two vectors,
-//and copies the larger of each pair into the corresponding element in the destination
-//    no corresponding functionality in IA32 SIMD, so we need to do the vertical comparison
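// The emulations below pack a and b into one register, swap the members of each adjacent pair
// with a byte shuffle (the *_sab masks), take an ordinary element-wise max against the swapped
// copy, and then compact the duplicated results. A scalar sketch of the folding itself
// (illustration only):

#include <stdint.h>
/* scalar model of vpmax_s8: low half folds pairs of a, high half folds pairs of b */
static void vpmax_s8_model(const int8_t a[8], const int8_t b[8], int8_t r[8]) {
    for (int i = 0; i < 4; i++) {
        r[i]     = (a[2 * i] > a[2 * i + 1]) ? a[2 * i] : a[2 * i + 1];
        r[i + 4] = (b[2 * i] > b[2 * i + 1]) ? b[2 * i] : b[2 * i + 1];
    }
}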
-_NEON2SSESTORAGE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0
-_NEON2SSE_INLINE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b) // VPMAX.S8 d0,d0,d0
-{
-    int8x8_t res64;
-    __m128i ab, ab1, max;
-    _NEON2SSE_ALIGN_16 static const uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5,  4,  7,  6,    9,    8,   11,   10,   13,   12,   15,   14};
-    _NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
-    ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
-    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical max finding
-    max = _MM_MAX_EPI8 (ab, ab1); // SSE4.1
-    max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data
-    return64(max); //we need 64 bits only
-}
-
-_NEON2SSESTORAGE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0
-_NEON2SSE_INLINE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b) // VPMAX.S16 d0,d0,d0
-{
-    //solution may not be optimal compared with the serial one
-    int16x4_t res64;
-    __m128i ab, ab1, max;
-    _NEON2SSE_ALIGN_16 static const int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is considered to be a 16 bit number
-    ab = _mm_unpacklo_epi64 ( _pM128i(a),  _pM128i(b)); //ab
-    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical max finding, use 8bit fn and the corresponding mask
-    max = _mm_max_epi16 (ab, ab1);
-    max =  _mm_shuffle_epi8 (max, *(__m128i*)  mask8_32_even_odd); //remove repetitive data, only the low part of mask is used
-    return64(max);
-}
-
-_NEON2SSESTORAGE int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmax_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    //serial solution looks faster than the SIMD one
-    int32x2_t res;
-    res.m64_i32[0] = (a.m64_i32[0] < a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0];
-    res.m64_i32[1] = (b.m64_i32[0] < b.m64_i32[1]) ? b.m64_i32[1] : b.m64_i32[0];
-    return res;
-}
-
-_NEON2SSESTORAGE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0
-_NEON2SSE_INLINE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b) // VPMAX.U8 d0,d0,d0
-{
-    uint8x8_t res64;
-    __m128i ab, ab1, max;
-    _NEON2SSE_ALIGN_16 static const int8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
-    _NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3,  5,  7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
-    ab = _mm_unpacklo_epi64 (_pM128i(a), _pM128i(b)); //ab
-    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical max finding
-    max = _mm_max_epu8 (ab, ab1); // SSE4.1
-    max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data
-    return64(max);
-}
-
-_NEON2SSESTORAGE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.s16 d0,d0,d0
-_NEON2SSE_INLINE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b) // VPMAX.s16 d0,d0,d0
-{
-    //solution may not be optimal compared with the serial one
-    uint16x4_t res64;
-    __m128i ab, ab1, max;
-    _NEON2SSE_ALIGN_16 static const uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is considered to be a 16 bit number
-    ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
-    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical max finding, use 8bit fn and the corresponding mask
-    max = _MM_MAX_EPU16 (ab, ab1);
-    max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_32_even_odd); //remove repetitive data, only the low part of mask is used
-    return64(max);
-}
-
-_NEON2SSESTORAGE uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    //serial solution looks faster than the SIMD one
-    uint32x2_t res;
-    res.m64_u32[0] = (a.m64_u32[0] < a.m64_u32[1]) ? a.m64_u32[1] : a.m64_u32[0];
-    res.m64_u32[1] = (b.m64_u32[0] < b.m64_u32[1]) ? b.m64_u32[1] : b.m64_u32[0];
-    return res;
-}
-
-_NEON2SSESTORAGE float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmax_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    //serial solution looks faster than the SIMD one
-    float32x2_t res;
-    res.m64_f32[0] = (a.m64_f32[0] < a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0];
-    res.m64_f32[1] = (b.m64_f32[0] < b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0];
-    return res;
-}
-
-// ***************** Folding minimum  ****************************
-// **************************************************************
-//vpmin -> takes minimum of adjacent pairs
-_NEON2SSESTORAGE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0
-_NEON2SSE_INLINE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b) // VPMIN.S8 d0,d0,d0
-{
-    int8x8_t res64;
-    __m128i ab, ab1, min;
-    _NEON2SSE_ALIGN_16 static const uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5,  4,  7,  6,    9,    8,   11,   10,   13,   12,   15,   14};
-    _NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
-    ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
-    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical min finding
-    min =  _MM_MIN_EPI8 (ab, ab1); // SSE4.1
-    min =  _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data
-    return64(min);
-}
-
-_NEON2SSESTORAGE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0
-_NEON2SSE_INLINE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b) // VPMIN.S16 d0,d0,d0
-{
-    //solution may not be optimal compared with the serial one
-    int16x4_t res64;
-    __m128i ab, ab1, min;
-    _NEON2SSE_ALIGN_16 static const int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is considered to be a 16 bit number
-    ab = _mm_unpacklo_epi64 (  _pM128i(a),  _pM128i(b)); //ab
-    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical min finding, use 8bit fn and the corresponding mask
-    min = _mm_min_epi16 (ab, ab1);
-    min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_32_even_odd); //remove repetitive data, only the low part of mask is used
-    return64(min);
-}
-
-_NEON2SSESTORAGE int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmin_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    //the serial solution looks faster than the SIMD one
-    int32x2_t res;
-    res.m64_i32[0] = (a.m64_i32[0] > a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0];
-    res.m64_i32[1] = (b.m64_i32[0] > b.m64_i32[1]) ? b.m64_i32[1] : b.m64_i32[0];
-    return res;
-}
-
-_NEON2SSESTORAGE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0
-_NEON2SSE_INLINE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b) // VPMIN.U8 d0,d0,d0
-{
-    uint8x8_t res64;
-    __m128i ab, ab1, min;
-    _NEON2SSE_ALIGN_16 static const uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5,  4,  7,  6,    9,    8,   11,   10,   13,   12,   15,   14};
-    _NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
-    ab = _mm_unpacklo_epi64 (  _pM128i(a),  _pM128i(b)); //ab
-    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical min finding
-    min = _mm_min_epu8 (ab, ab1); // SSE4.1
-    min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data
-    return64(min);
-}
-
-_NEON2SSESTORAGE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.s16 d0,d0,d0
-_NEON2SSE_INLINE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b) // VPMIN.s16 d0,d0,d0
-{
-    //solution may not be optimal compared with the serial one
-    uint16x4_t res64;
-    __m128i ab, ab1, min;
-    _NEON2SSE_ALIGN_16 static const uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is considered to be a 16 bit number
-    ab = _mm_unpacklo_epi64 ( _pM128i(a),  _pM128i(b)); //ab
-    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical min finding, use 8bit fn and the corresponding mask
-    min = _MM_MIN_EPU16 (ab, ab1);
-    min =    _mm_shuffle_epi8 (min, *(__m128i*) mask8_32_even_odd); //remove repetitive data, only the low part of mask is used
-    return64(min);
-}
-
-_NEON2SSESTORAGE uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    //the serial solution looks faster than the SIMD one
-    uint32x2_t res;
-    res.m64_u32[0] = (a.m64_u32[0] > a.m64_u32[1]) ? a.m64_u32[1] : a.m64_u32[0];
-    res.m64_u32[1] = (b.m64_u32[0] > b.m64_u32[1]) ? b.m64_u32[1] : b.m64_u32[0];
-    return res;
-}
-
-_NEON2SSESTORAGE float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmin_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    //the serial solution looks faster than the SIMD one
-    float32x2_t res;
-    res.m64_f32[0] = (a.m64_f32[0] > a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0];
-    res.m64_f32[1] = (b.m64_f32[0] > b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0];
-    return res;
-}
-
-//***************************************************************
-//***********  Reciprocal/Sqrt ************************************
-//***************************************************************
-//****************** Reciprocal estimate *******************************
-//the ARM NEON and x86 SIMD results may be slightly different
-_NEON2SSESTORAGE float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0
-_NEON2SSE_INLINE float32x2_t vrecpe_f32(float32x2_t a) //use low 64 bits
-{
-    float32x4_t res;
-    __m64_128 res64;
-    res = _mm_rcp_ps(_pM128(a));
-    _M64f(res64, res);
-    return res64;
-}
-
-_NEON2SSESTORAGE uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrecpe_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    //Input is a fixed-point number. No integer reciprocal is available in IA32
-    uint32x2_t res;
-    float resf, r;
-    int i, q, s;
-    for (i =0; i<2; i++){
-        if((a.m64_u32[i] & 0x80000000) == 0) {
-            res.m64_u32[i] = 0xffffffff;
-        }else{
-            resf =  (float) (a.m64_u32[i] * (0.5f / (uint32_t)(1 << 31)));
-            q = (int)(resf * 512.0f); /* a in units of 1/512 rounded down */
-            r = (float)(1.0f / (((float)q + 0.5f) / 512.0f)); /* reciprocal r */
-            s = (int)(256.0f * r + 0.5f); /* r in units of 1/256 rounded to nearest */
-            r =  (float)s / 256.0f;
-            res.m64_u32[i] = (uint32_t)(r * (uint32_t)(1 << 31));
-        }
-    }
-    return res;
-}
-
-_NEON2SSE_GLOBAL float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0
-#define vrecpeq_f32 _mm_rcp_ps
-
-
-_NEON2SSESTORAGE uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrecpeq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    //Input is a fixed-point number.
-    //We implement the recip_estimate function as described in ARMv7 reference manual (VRECPE instruction) but use float instead of double
-    _NEON2SSE_ALIGN_16 uint32_t atmp[4];
-    _NEON2SSE_ALIGN_16 uint32_t res[4];
-    _NEON2SSE_ALIGN_16 static const uint32_t c80000000[4] = {0x80000000,0x80000000, 0x80000000,0x80000000};
-    float resf, r;
-    int i, q, s;
-    __m128i res128, mask, zero;
-    _mm_store_si128((__m128i*)atmp, a);
-    zero = _mm_setzero_si128();
-    for (i =0; i<4; i++){
-        resf = (atmp[i] * (0.5f / (uint32_t) (1 << 31)));  //  2.3283064365386963E-10 ~(0.5f / (uint32_t) (1 << 31))
-        q = (int)(resf * 512.0f); /* a in units of 1/512 rounded down */
-        r = 1.0f / (((float)q + 0.5f) / 512.0f); /* reciprocal r */
-        s = (int)(256.0f * r + 0.5f); /* r in units of 1/256 rounded to nearest */
-        r =  (float)s / 256.0f;
-        res[i] = (uint32_t) (r * (((uint32_t)1) << 31) );
-    }
-    res128 = _mm_load_si128((__m128i*)res);
-    mask = _mm_and_si128(a, *(__m128i*)c80000000);
-    mask = _mm_cmpeq_epi32(zero, mask);  //0xffffffff if atmp[i] <= 0x7fffffff
-    return _mm_or_si128(res128, mask);
-}
-
-//**********Reciprocal square root estimate ****************
-//**********************************************************
-//no reciprocal square root for ints is available in IA32, nor an unsigned int to float4 lane conversion, so a serial solution looks faster
-//but the particular implementation of vrsqrte_u32 may vary across ARM compilers
-//the ARM NEON and x86 SIMD results may be slightly different
-_NEON2SSESTORAGE float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0
-_NEON2SSE_INLINE float32x2_t vrsqrte_f32(float32x2_t a) //use low 64 bits
-{
-    float32x4_t res;
-    __m64_128 res64;
-    res = _mm_rsqrt_ps(_pM128(a));
-    _M64f(res64, res);
-    return res64;
-}
-
-_NEON2SSESTORAGE uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrsqrte_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
-{
-  // Input is a fixed-point number.
-  // We implement the recip_sqrt_estimate function as described in ARMv7
-  // reference manual (VRSQRTE instruction), but results may be slightly different
-  // from ARM implementation due to _mm_rsqrt_ps precision
-  uint32x2_t res;
-  __m64_128 res64[2];
-  int i;
-  _NEON2SSE_ALIGN_16 float coeff[2];
-  for (i = 0; i < 2; i++) {
-    // Generate double-precision value = operand * 2^(-32). This has zero sign
-    // bit, with:
-    //     exponent = 1022 or 1021 = double-precision representation of 2^(-1)
-    //     or 2^(-2) fraction taken from operand, excluding its most significant
-    //     one or two bits.
-    uint64_t dp_operand;
-    if (a.m64_u32[i] & 0x80000000) {
-      dp_operand =
-          (0x3feLL << 52) | (((uint64_t)a.m64_u32[i] & 0x7FFFFFFF) << 21);
-    } else {
-      dp_operand =
-          (0x3fdLL << 52) | (((uint64_t)a.m64_u32[i] & 0x3FFFFFFF) << 22);
-    }
-    res64[i].m64_u64[0] = dp_operand;
-    coeff[i] = (res64[i].m64_d64[0] < 0.5) ? 512.0f : 256.0f; /* range 0.25 <= resf < 0.5  or range 0.5 <= resf < 1.0*/
-  }
-  __m128 coeff_f = _mm_load_ps(coeff);
-  __m128d q0_d = _mm_mul_pd(_mm_loadu_pd(&res64[0].m64_d64[0]), _mm_cvtps_pd(coeff_f));
-  __m128i q0_i = _mm_cvttpd_epi32(q0_d);
-  __m128 c05_f = _mm_set1_ps(0.5);
-  __m128 r_f = _mm_div_ps(_mm_add_ps(_mm_cvtepi32_ps(q0_i), c05_f), coeff_f);
-  __m128 rsqrt_f = _mm_rsqrt_ps(r_f);
-  __m128 c256_f = _mm_set1_ps(256.0);
-  __m128 s_f = _mm_add_ps(_mm_mul_ps(rsqrt_f, c256_f), c05_f);
-#ifdef USE_SSE4
-  s_f = _mm_floor_ps(s_f);
-#else
-  s_f = _mm_cvtepi32_ps(_mm_cvttps_epi32(s_f));
-#endif
-  s_f = _mm_div_ps(s_f, c256_f);
-  _M64f(res64[0], s_f);
-
-  for (i = 0; i < 2; i++) {
-    if ((a.m64_u32[i] & 0xc0000000) == 0) { // a <=0x3fffffff
-      res.m64_u32[i] = 0xffffffff;
-    } else {
-      res.m64_u32[i] = (uint32_t)(res64[0].m64_f32[i] * (((uint32_t)1) << 31));
-    }
-  }
-  return res;
-}
-
-_NEON2SSE_GLOBAL float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0
-#define vrsqrteq_f32 _mm_rsqrt_ps
-
-_NEON2SSESTORAGE uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrsqrteq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
-{
-  // Input is a fixed-point number.
-  // We implement the recip_sqrt_estimate function as described in ARMv7
-  // reference manual (VRSQRTE instruction), but results may be slightly different
-  // from ARM implementation due to _mm_rsqrt_ps precision
-  int i;
-  _NEON2SSE_ALIGN_16 uint32_t atmp[4], res[4];
-  _NEON2SSE_ALIGN_16 float coeff[4], rr[4];
-  char* coeff_f2_c = (char*)&coeff[2];
-  __m64_128 res64[4];
-  _mm_store_si128((__m128i *)atmp, a);
-  for (i = 0; i < 4; i++) {
-    // Generate double-precision value = operand * 2^(-32). This has zero sign
-    // bit, with:
-    //     exponent = 1022 or 1021 = double-precision representation of 2^(-1)
-    //     or 2^(-2) fraction taken from operand, excluding its most significant
-    //     one or two bits.
-    uint64_t dp_operand;
-    if (atmp[i] & 0x80000000) {
-      dp_operand = (0x3feLL << 52) | (((uint64_t)atmp[i] & 0x7FFFFFFF) << 21);
-    } else {
-      dp_operand = (0x3fdLL << 52) | (((uint64_t)atmp[i] & 0x3FFFFFFF) << 22);
-    }
-    res64[i].m64_u64[0] = dp_operand;
-    coeff[i] = (res64[i].m64_d64[0] < 0.5) ? 512.0f : 256.0f; /* range 0.25 <= resf < 0.5  or range 0.5 <= resf < 1.0*/
-  }
-  __m128 c05_f = _mm_set1_ps(0.5);
-  __m128 coeff_f = _mm_load_ps(coeff);
-  __m128d q0_d = _mm_mul_pd(_mm_loadu_pd(&res64[0].m64_d64[0]), _mm_cvtps_pd(coeff_f));
-  __m128i q0_i = _mm_cvttpd_epi32(q0_d);
-
-  __m128 coeff_f2 = _M128(_pM128i(*coeff_f2_c));
-  q0_d = _mm_mul_pd(_mm_loadu_pd(&res64[2].m64_d64[0]), _mm_cvtps_pd(coeff_f2));
-  __m128i q0_i2 = _mm_cvttpd_epi32(q0_d);
-  coeff_f = _M128(_mm_unpacklo_epi64(_M128i(coeff_f), _M128i(coeff_f2)));
-  q0_i = _mm_unpacklo_epi64(q0_i, q0_i2);
-
-  __m128 r_f = _mm_div_ps(_mm_add_ps(_mm_cvtepi32_ps(q0_i), c05_f), coeff_f);
-  __m128 rsqrt_f = _mm_rsqrt_ps(r_f);
-  __m128 c256_f = _mm_set1_ps(256.0);
-  __m128 s_f = _mm_add_ps(_mm_mul_ps(rsqrt_f, c256_f), c05_f);
-#ifdef USE_SSE4
-  s_f = _mm_floor_ps(s_f);
-#else
-  s_f = _mm_cvtepi32_ps(_mm_cvttps_epi32(s_f));
-#endif
-  s_f = _mm_div_ps(s_f, c256_f);
-  _mm_store_ps(rr, s_f);
-
-  for (i = 0; i < 4; i++) {
-    if ((atmp[i] & 0xc0000000) == 0) { // a <=0x3fffffff
-      res[i] = 0xffffffff;
-    } else {
-      res[i] = (uint32_t)(rr[i] * (((uint32_t)1) << 31));
-    }
-  }
-  return _mm_load_si128((__m128i *)res);
-}
-
-//************ Reciprocal estimate/step and 1/sqrt estimate/step ***************************
-//******************************************************************************************
-//******VRECPS (Vector Reciprocal Step) ***************************************************
-//multiplies the elements of one vector by the corresponding elements of another vector,
-//subtracts each of the results from 2, and places the final results into the elements of the destination vector.
-
-_NEON2SSESTORAGE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0
-_NEON2SSE_INLINE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b)
-{
-    float32x4_t res;
-    __m64_128 res64;
-    res = vrecpsq_f32(_pM128(a), _pM128(b));
-    _M64f(res64, res);
-    return res64;
-}
-
-_NEON2SSESTORAGE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0
-_NEON2SSE_INLINE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b) // VRECPS.F32 q0, q0, q0
-{
-    __m128 f2, mul;
-    f2 =  _mm_set1_ps(2.);
-    mul = _mm_mul_ps(a,b);
-    return _mm_sub_ps(f2,mul);
-}
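The VRECPS step above (2 - a*b) is the Newton-Raphson refinement term for 1/a, and the usual NEON idiom pairs it with the vrecpe estimate. A minimal sketch under standard NEON semantics, hypothetical helper name:

/* Each refinement step roughly doubles the precision of the estimate. */
float32x4_t reciprocal_nr(float32x4_t a) {
    float32x4_t x = vrecpeq_f32(a);          /* coarse estimate of 1/a */
    x = vmulq_f32(x, vrecpsq_f32(a, x));     /* x *= (2 - a*x)         */
    x = vmulq_f32(x, vrecpsq_f32(a, x));     /* second refinement      */
    return x;
}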
-
-//*****************VRSQRTS (Vector Reciprocal Square Root Step) *****************************
-//multiplies the elements of one vector by the corresponding elements of another vector,
-//subtracts each of the results from 3, divides these results by two, and places the final results into the elements of the destination vector.
-
-_NEON2SSESTORAGE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0
-_NEON2SSE_INLINE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b)
-{
-    float32x2_t res;
-    res.m64_f32[0] = (3 - a.m64_f32[0] * b.m64_f32[0]) / 2;
-    res.m64_f32[1] = (3 - a.m64_f32[1] * b.m64_f32[1]) / 2;
-    return res;
-}
-
-_NEON2SSESTORAGE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0
-_NEON2SSE_INLINE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b) // VRSQRTS.F32 q0, q0, q0
-{
-    __m128 f3, f05, mul;
-    f3 =  _mm_set1_ps(3.f);
-    f05 =  _mm_set1_ps(0.5f);
-    mul = _mm_mul_ps(a,b);
-    f3 = _mm_sub_ps(f3,mul);
-    return _mm_mul_ps (f3, f05);
-}
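Analogously, VRSQRTS ((3 - a*b)/2) is the Newton-Raphson step for 1/sqrt(a), normally combined with the vrsqrte estimate. A minimal sketch, hypothetical helper name:

/* Refine the reciprocal square root estimate from vrsqrteq_f32. */
float32x4_t rsqrt_nr(float32x4_t a) {
    float32x4_t x = vrsqrteq_f32(a);                     /* coarse estimate of 1/sqrt(a) */
    x = vmulq_f32(x, vrsqrtsq_f32(vmulq_f32(a, x), x));  /* x *= (3 - (a*x)*x) / 2       */
    x = vmulq_f32(x, vrsqrtsq_f32(vmulq_f32(a, x), x));  /* second refinement            */
    return x;
}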
-//********************************************************************************************
-//***************************** Shifts by signed variable ***********************************
-//********************************************************************************************
-//***** Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right) ***********************
-//********************************************************************************************
-//No such operations exist in IA32 SIMD, unfortunately; only constant shifts are available, so a serial solution is needed
-//helper macro; it matches the ARM implementation for big shifts
-#define SERIAL_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \
-        _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; int i, lanesize = sizeof(INTERNAL_TYPE) << 3; \
-        _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
-        for (i = 0; i<LEN; i++) { \
-        if( (btmp[i] >= lanesize)||(btmp[i] <= -lanesize) ) res[i] = 0; \
-        else res[i] = (btmp[i] >=0) ? atmp[i] << btmp[i] : atmp[i] >> (-btmp[i]); } \
-        return _mm_load_si128((__m128i*)res);
-
-#define SERIAL_SHIFT_64(TYPE, SIGN, LEN) \
-        int ## TYPE ## x ## LEN ## _t res;  int i, lanesize = sizeof(int ## TYPE ## _t) << 3; \
-        for (i = 0; i<LEN; i++) { \
-        if( (b.m64_i ## TYPE[i] >= lanesize)||(b.m64_i ## TYPE[i] <= -lanesize) ) res.m64_ ## SIGN ## TYPE[i] = 0; \
-        else res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] >=0) ? a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i] : a.m64_ ## SIGN ## TYPE[i] >> (-b.m64_i ## TYPE[i]); } \
-        return res;
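Since the "negative count shifts right" convention is easy to misread, here is a small worked example of the semantics the macros above implement (hypothetical helper name, illustrative values only):

/* Positive lane counts shift left, negative counts shift right,
   and |count| >= lane size (16 here) yields 0, per the ARM definition. */
int16x4_t variable_shift_demo(void) {
    const int16_t av[4] = {8, 8, 8, 8};
    const int16_t bv[4] = {1, -2, 0, 16};
    int16x4_t a = vld1_s16(av);
    int16x4_t b = vld1_s16(bv);
    return vshl_s16(a, b); /* == {16, 2, 8, 0} */
}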
-
-_NEON2SSESTORAGE int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vshl_s8(int8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SHIFT_64(8, i, 8)
-}
-
-_NEON2SSESTORAGE int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vshl_s16(int16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SHIFT_64(16, i, 4)
-}
-
-_NEON2SSESTORAGE int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vshl_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SHIFT_64(32, i, 2)
-}
-
-_NEON2SSESTORAGE int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshl_s64(int64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SHIFT_64(64, i, 1)
-}
-
-_NEON2SSESTORAGE uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SHIFT_64(8, u, 8)
-}
-
-_NEON2SSESTORAGE uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.s16 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SHIFT_64(16, u, 4)
-}
-
-_NEON2SSESTORAGE uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SHIFT_64(32, u, 2)
-}
-
-_NEON2SSESTORAGE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0
-_NEON2SSE_INLINE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b) //if we used the SERIAL_SHIFT macro we would need special processing for large numbers
-{
-    SERIAL_SHIFT_64(64, u, 1)
-}
-
-_NEON2SSESTORAGE int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vshlq_s8(int8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SHIFT(int8_t, int8_t, 16, 16)
-}
-
-_NEON2SSESTORAGE int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vshlq_s16(int16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SHIFT(int16_t, int16_t, 8, 8)
-}
-
-_NEON2SSESTORAGE int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vshlq_s32(int32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SHIFT(int32_t, int32_t, 4, 4)
-}
-
-_NEON2SSESTORAGE int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vshlq_s64(int64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SHIFT(int64_t, int64_t, 2, 2)
-}
-
-_NEON2SSESTORAGE uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SHIFT(uint8_t, int8_t, 16, 16)
-}
-
-_NEON2SSESTORAGE uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.s16 q0,q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SHIFT(uint16_t, int16_t, 8, 8)
-}
-
-_NEON2SSESTORAGE uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SHIFT(uint32_t, int32_t, 4, 4)
-}
-
-_NEON2SSESTORAGE uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SHIFT(uint64_t, int64_t, 2, 2)
-}
-
-
-//*********** Vector saturating shift left: (negative values shift right) **********************
-//********************************************************************************************
-//No such operations are available in IA32 SIMD yet; only constant shifts are available, so a serial solution is needed
-#define SERIAL_SATURATING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \
-        _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \
-        int lanesize_1 = (sizeof(TYPE) << 3) - 1; \
-        _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
-        for (i = 0; i<LEN; i++) { \
-        if ((atmp[i] ==0)||(btmp[i] ==0)) res[i] = atmp[i]; \
-        else{ \
-            if(btmp[i] <0) res[i] = atmp[i] >> (-btmp[i]); \
-            else{ \
-                if (btmp[i]>lanesize_1) { \
-                    res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
-                }else{ \
-                    limit = (TYPE)1 << (lanesize_1 - btmp[i]); \
-                    if((atmp[i] >= limit)||(atmp[i] <= -limit)) \
-                        res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
-                    else res[i] = atmp[i] << btmp[i]; }}}} \
-        return _mm_load_si128((__m128i*)res);
-
-#define SERIAL_SATURATING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \
-        _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \
-        TYPE lanesize = (sizeof(TYPE) << 3); \
-        _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
-        for (i = 0; i<LEN; i++) { \
-        if ((atmp[i] ==0)||(btmp[i] ==0)) { res[i] = atmp[i]; \
-        }else{ \
-            if(btmp[i] < 0) res[i] = atmp[i] >> (-btmp[i]); \
-            else{ \
-                if (btmp[i]>lanesize) res[i] = (_UNSIGNED_T(TYPE))(~0ll); \
-                else{ \
-                    limit = (TYPE) 1 << (lanesize - btmp[i]); \
-                    res[i] = ( atmp[i] >= limit) ? (_UNSIGNED_T(TYPE))(~0ll) : atmp[i] << btmp[i]; }}}} \
-        return _mm_load_si128((__m128i*)res);
-
-#define SERIAL_SATURATING_SHIFT_SIGNED_64(TYPE, LEN) \
-        int ## TYPE ## x ## LEN ## _t res; int ## TYPE ## _t limit; int i; \
-        int lanesize_1 = (sizeof( int ## TYPE ## _t) << 3) - 1; \
-        for (i = 0; i<LEN; i++) { \
-        if ((a.m64_i ## TYPE[i] == 0) ||(b.m64_i ## TYPE[i] == 0)) res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i]; \
-        else{ \
-            if(b.m64_i ## TYPE[i] <0) res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \
-            else{ \
-                if (b.m64_i ## TYPE[i]>lanesize_1) { \
-                    res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \
-                }else{ \
-                    limit = (int ## TYPE ## _t) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \
-                    if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \
-                        res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \
-                    else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
-        return res;
-
-#define SERIAL_SATURATING_SHIFT_UNSIGNED_64(TYPE, LEN) \
-        int ## TYPE ## x ## LEN ## _t res;  _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \
-        int ## TYPE ## _t lanesize = (sizeof(int ## TYPE ## _t) << 3); \
-        for (i = 0; i<LEN; i++) { \
-        if ((a.m64_u ## TYPE[i] == 0) ||(b.m64_u ## TYPE[i] == 0)) {res.m64_u ## TYPE[i] = a.m64_u ## TYPE[i]; \
-        }else{ \
-            if(b.m64_i ## TYPE[i] < 0) res.m64_u ## TYPE[i] = a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \
-            else{ \
-                if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = (_UNSIGNED_T(int ## TYPE ## _t))(~0ll); \
-                else{ \
-                    limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \
-                    res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? (_UNSIGNED_T(int ## TYPE ## _t))(~0ll) : a.m64_u ## TYPE[i] << b.m64_u ## TYPE[i]; }}}} \
-        return res;
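The saturating variants clamp to the lane's range instead of wrapping on overflow; a worked example of what the signed macro above computes (hypothetical helper name, illustrative values only):

/* 100 << 2 and 1 << 7 overflow int8_t and saturate to 127;
   -100 << 2 saturates to -128; 3 << 4 = 48 fits and is exact. */
int8x8_t saturating_shift_demo(void) {
    const int8_t av[8] = {100, -100, 3, 1, 0, 0, 0, 0};
    const int8_t bv[8] = {  2,    2, 4, 7, 0, 0, 0, 0};
    int8x8_t a = vld1_s8(av);
    int8x8_t b = vld1_s8(bv);
    return vqshl_s8(a, b); /* == {127, -128, 48, 127, 0, 0, 0, 0} */
}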
-
-_NEON2SSESTORAGE int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqshl_s8(int8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SATURATING_SHIFT_SIGNED_64(8,8)
-}
-
-_NEON2SSESTORAGE int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqshl_s16(int16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SATURATING_SHIFT_SIGNED_64(16,4)
-}
-
-_NEON2SSESTORAGE int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshl_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SATURATING_SHIFT_SIGNED_64(32,2)
-}
-
-_NEON2SSESTORAGE int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_s64(int64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SATURATING_SHIFT_SIGNED_64(64,1)
-}
-
-_NEON2SSESTORAGE uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SATURATING_SHIFT_UNSIGNED_64(8,8)
-}
-
-_NEON2SSESTORAGE uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.s16 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SATURATING_SHIFT_UNSIGNED_64(16,4)
-}
-
-_NEON2SSESTORAGE uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SATURATING_SHIFT_UNSIGNED_64(32,2)
-}
-
-_NEON2SSESTORAGE uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SATURATING_SHIFT_UNSIGNED_64(64,1)
-}
-
-_NEON2SSESTORAGE int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SATURATING_SHIFT_SIGNED(int8_t, 16, 16)
-}
-
-_NEON2SSESTORAGE int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SATURATING_SHIFT_SIGNED(int16_t, 8, 8)
-}
-
-_NEON2SSESTORAGE int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SATURATING_SHIFT_SIGNED(int32_t, 4, 4)
-}
-
-_NEON2SSESTORAGE int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SATURATING_SHIFT_SIGNED(int64_t, 2, 2)
-}
-
-_NEON2SSESTORAGE uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SATURATING_SHIFT_UNSIGNED(int8_t, 16, 16)
-}
-
-_NEON2SSESTORAGE uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.s16 q0,q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SATURATING_SHIFT_UNSIGNED(int16_t, 8, 8)
-}
-
-_NEON2SSESTORAGE uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SATURATING_SHIFT_UNSIGNED(int32_t, 4, 4)
-}
-
-_NEON2SSESTORAGE uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SATURATING_SHIFT_UNSIGNED(int64_t, 2, 2)
-}
-
-
-//******** Vector rounding shift left: (negative values shift right) **********
-//****************************************************************************
-//No such operations are available in IA32 SIMD yet; only constant shifts are available, so a serial solution is needed
-//rounding makes sense for right shifts only.
-#define SERIAL_ROUNDING_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \
-        _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; INTERNAL_TYPE i, lanesize = sizeof(INTERNAL_TYPE) << 3; \
-        _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
-        for (i = 0; i<LEN; i++) { \
-        if( btmp[i] >= 0) { \
-            if(btmp[i] >= lanesize) res[i] = 0; \
-            else res[i] = (atmp[i] << btmp[i]); \
-        }else{ \
-            res[i] = (btmp[i] < -lanesize) ? 0 : \
-                            (btmp[i] == -lanesize) ? (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) : \
-                            (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) );    }} \
-        return _mm_load_si128((__m128i*)res);
-
-
-#define SERIAL_ROUNDING_SHIFT_64(TYPE, SIGN, LEN) \
-        int ## TYPE ## x ## LEN ## _t res;  int i;  int lanesize = sizeof(int ## TYPE ## _t) << 3; \
-        for (i = 0; i<LEN; i++) { \
-        if( b.m64_i ## TYPE[i] >= 0) { \
-            if(b.m64_i ## TYPE[i] >= lanesize) res.m64_ ## SIGN ## TYPE[i] = 0; \
-            else res.m64_ ## SIGN ## TYPE[i] = (a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i]); \
-        }else{ \
-            res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] < -lanesize) ? 0 : \
-                            (b.m64_i ## TYPE[i] == -lanesize) ? (a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) : \
-                            (a.m64_ ## SIGN ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) );    }} \
-        return res;
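Rounding only affects the right (negative count) shifts: the most significant discarded bit is added back, which is what the (1 << (-b - 1)) term in the macros above does. A worked example (hypothetical helper name, illustrative values only):

/* 5 >> 1 rounds to 3, -5 >> 1 rounds to -2, 7 >> 2 rounds to 2; left shifts are unchanged. */
int16x4_t rounding_shift_demo(void) {
    const int16_t av[4] = {5, -5, 7, 4};
    const int16_t bv[4] = {-1, -1, -2, 1};
    int16x4_t a = vld1_s16(av);
    int16x4_t b = vld1_s16(bv);
    return vrshl_s16(a, b); /* == {3, -2, 2, 8} */
}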
-
-
-_NEON2SSESTORAGE int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vrshl_s8(int8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_ROUNDING_SHIFT_64(8,i,8)
-}
-
-_NEON2SSESTORAGE int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vrshl_s16(int16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_ROUNDING_SHIFT_64(16,i,4)
-}
-
-_NEON2SSESTORAGE int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vrshl_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_ROUNDING_SHIFT_64(32,i,2)
-}
-
-_NEON2SSESTORAGE int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshl_s64(int64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_ROUNDING_SHIFT_64(64,i,1)
-}
-
-_NEON2SSESTORAGE uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_ROUNDING_SHIFT_64(8,u,8)
-}
-
-_NEON2SSESTORAGE uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.s16 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_ROUNDING_SHIFT_64(16,u,4)
-}
-
-_NEON2SSESTORAGE uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_ROUNDING_SHIFT_64(32,u,2)
-}
-
-_NEON2SSESTORAGE uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_ROUNDING_SHIFT_64(64,u,1)
-}
-
-_NEON2SSESTORAGE int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_ROUNDING_SHIFT(int8_t, int8_t, 16, 16)
-}
-
-_NEON2SSESTORAGE int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_ROUNDING_SHIFT(int16_t, int16_t, 8, 8)
-}
-
-_NEON2SSESTORAGE int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_ROUNDING_SHIFT(int32_t, int32_t, 4, 4)
-}
-
-_NEON2SSESTORAGE int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_ROUNDING_SHIFT(int64_t, int64_t, 2, 2)
-}
-
-_NEON2SSESTORAGE uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_ROUNDING_SHIFT(uint8_t, int8_t, 16, 16)
-}
-
-_NEON2SSESTORAGE uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.s16 q0,q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_ROUNDING_SHIFT(uint16_t, int16_t, 8, 8)
-}
-
-_NEON2SSESTORAGE uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_ROUNDING_SHIFT(uint32_t, int32_t, 4, 4)
-}
-
-_NEON2SSESTORAGE uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_ROUNDING_SHIFT(uint64_t, int64_t, 2, 2)
-}
-
-
-//********** Vector saturating rounding shift left: (negative values shift right) ****************
-//*************************************************************************************************
-//No such operations exist in IA32 SIMD, unfortunately; only constant shifts are available, so a serial solution is needed
-//Saturation happens for left shifts only, while rounding makes sense for right shifts only.
-#define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \
-        _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \
-        int lanesize_1 = (sizeof(TYPE) << 3) - 1; \
-        _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
-        for (i = 0; i<LEN; i++) { \
-        if (atmp[i] ==0) res[i] = 0; \
-        else{ \
-            if(btmp[i] <0) res[i] = (btmp[i] < (-lanesize_1)) ? 0 : (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \
-            else{ \
-                if (btmp[i]>lanesize_1) { \
-                    res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
-                }else{ \
-                    limit = (TYPE)1 << (lanesize_1 - btmp[i]); \
-                    if((atmp[i] >= limit)||(atmp[i] <= -limit)) \
-                        res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
-                    else res[i] = atmp[i] << btmp[i]; }}}} \
-        return _mm_load_si128((__m128i*)res);
-
-#define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \
-        _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \
-        int lanesize = (sizeof(TYPE) << 3); \
-        _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
-        for (i = 0; i<LEN; i++) { \
-        if (atmp[i] ==0) {res[i] = 0; \
-        }else{ \
-            if(btmp[i] < 0) res[i] = (btmp[i] < (-lanesize)) ? 0 : (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \
-            else{ \
-                if (btmp[i]>lanesize) res[i] = (_UNSIGNED_T(TYPE))(~0ll); \
-                else{ \
-                    limit = (TYPE) 1 << (lanesize - btmp[i]); \
-                    res[i] = ( atmp[i] >= limit) ? (_UNSIGNED_T(TYPE))(~0ll) : atmp[i] << btmp[i]; }}}} \
-        return _mm_load_si128((__m128i*)res);
-
-#define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(TYPE, LEN) \
-        __m64_128 res; int ## TYPE ## _t limit; int i; \
-        int lanesize_1 = (sizeof(int ## TYPE ## _t ) << 3) - 1; \
-        for (i = 0; i<LEN; i++) { \
-        if (a.m64_i ## TYPE[i] ==0) res.m64_i ## TYPE[i] = 0; \
-        else{ \
-            if(b.m64_i ## TYPE[i] <0) res.m64_i ## TYPE[i] = (b.m64_i ## TYPE[i] < (-lanesize_1)) ? 0 : (a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_i ## TYPE[i] & ((int ## TYPE ## _t ) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \
-            else{ \
-                if (b.m64_i ## TYPE[i]>lanesize_1) { \
-                    res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \
-                }else{ \
-                    limit = (int ## TYPE ## _t ) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \
-                    if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \
-                        res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \
-                    else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
-        return res;
-
-#define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(TYPE, LEN) \
-        __m64_128 res; _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \
-        int lanesize = (sizeof(int ## TYPE ## _t) << 3); \
-        for (i = 0; i<LEN; i++) { \
-        if (a.m64_u ## TYPE[i] ==0) {res.m64_u ## TYPE[i] = 0; \
-        }else{ \
-            if(b.m64_i ## TYPE[i] < 0) res.m64_u ## TYPE[i] = (b.m64_i ## TYPE[i] < (-lanesize)) ? 0 : (a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_u ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \
-            else{ \
-                if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = (_UNSIGNED_T(int ## TYPE ## _t))(~0ll); \
-                else{ \
-                    limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \
-                    res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? (_UNSIGNED_T(int ## TYPE ## _t))(~0ll) : a.m64_u ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
-        return res;
-
-_NEON2SSESTORAGE int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(8,8)
-}
-
-_NEON2SSESTORAGE int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(16,4)
-}
-
-_NEON2SSESTORAGE int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(32,2)
-}
-
-_NEON2SSESTORAGE int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(64,1)
-}
-
-_NEON2SSESTORAGE uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(8,8)
-}
-
-_NEON2SSESTORAGE uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.s16 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(16,4)
-}
-
-_NEON2SSESTORAGE uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(32,2)
-}
-
-_NEON2SSESTORAGE uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(64,1)
-}
-
-_NEON2SSESTORAGE int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int8_t, 16, 16)
-}
-
-_NEON2SSESTORAGE int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int16_t, 8, 8)
-}
-
-_NEON2SSESTORAGE int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int32_t, 4, 4)
-}
-
-_NEON2SSESTORAGE int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int64_t, 2, 2)
-}
-
-_NEON2SSESTORAGE uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int8_t, 16, 16)
-}
-
-_NEON2SSESTORAGE uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.s16 q0,q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int16_t, 8, 8)
-}
-
-_NEON2SSESTORAGE uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int32_t, 4, 4)
-}
-
-_NEON2SSESTORAGE uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int64_t, 2, 2)
-}
-
-// *********************************************************************************
-// *****************************  Shifts by a constant *****************************
-// *********************************************************************************
-//**************** Vector shift right by constant*************************************
-//************************************************************************************
-_NEON2SSESTORAGE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8
-_NEON2SSE_INLINE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VSHR.S8 d0,d0,#8
-{
-    //no 8 bit shift available, go to 16 bit
-    int8x8_t res64;
-    __m128i r;
-    r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
-    r = _mm_srai_epi16 (r, b); //SSE2
-    r = _mm_packs_epi16 (r,r); //we need 64 bits only
-    return64(r);
-}
-
-_NEON2SSESTORAGE int16x4_t vshr_n_s16(int16x4_t a,  __constrange(1,16) int b); // VSHR.S16 d0,d0,#16
-_NEON2SSE_INLINE int16x4_t vshr_n_s16(int16x4_t a,  __constrange(1,16) int b)
-{
-    int16x4_t res64;
-    return64(_mm_srai_epi16(_pM128i(a), b));
-}
-
-
-_NEON2SSESTORAGE int32x2_t vshr_n_s32(int32x2_t a,  __constrange(1,32) int b); // VSHR.S32 d0,d0,#32
-_NEON2SSE_INLINE int32x2_t vshr_n_s32(int32x2_t a,  __constrange(1,32) int b)
-{
-    int32x2_t res64;
-    return64(_mm_srai_epi32(_pM128i(a), b));
-}
-
-_NEON2SSESTORAGE int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    //no arithmetic shift for 64bit values, serial solution used
-    int64x1_t res;
-    if(b>=64) res.m64_i64[0] = 0;
-    else res.m64_i64[0] = (*(int64_t*)&a) >> b;
-    return res;
-}
-
-_NEON2SSESTORAGE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8
-_NEON2SSE_INLINE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VSHR.U8 d0,d0,#8
-{
-    //no 8 bit shift available, go to 16 bit
-    uint8x8_t res64;
-    __m128i r;
-    r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
-    r = _mm_srli_epi16 (r, b); //for unsigned variables we use the logical shift, not the arithmetic one
-    r = _mm_packus_epi16 (r,r); //we need 64 bits only
-    return64(r);
-}
-
-_NEON2SSESTORAGE uint16x4_t vshr_n_u16(uint16x4_t a,  __constrange(1,16) int b); // VSHR.s16 d0,d0,#16
-_NEON2SSE_INLINE uint16x4_t vshr_n_u16(uint16x4_t a,  __constrange(1,16) int b)
-{
-    uint16x4_t res64;
-    return64(_mm_srli_epi16(_pM128i(a), b));
-}
-
-
-_NEON2SSESTORAGE uint32x2_t vshr_n_u32(uint32x2_t a,  __constrange(1,32) int b); // VSHR.U32 d0,d0,#32
-_NEON2SSE_INLINE uint32x2_t vshr_n_u32(uint32x2_t a,  __constrange(1,32) int b)
-{
-    uint32x2_t res64;
-    return64(_mm_srli_epi32(_pM128i(a), b));
-}
-
-
-_NEON2SSESTORAGE uint64x1_t vshr_n_u64(uint64x1_t a,  __constrange(1,64) int b); // VSHR.U64 d0,d0,#64
-_NEON2SSE_INLINE uint64x1_t vshr_n_u64(uint64x1_t a,  __constrange(1,64) int b)
-{
-    uint64x1_t res64;
-    return64(_mm_srli_epi64(_pM128i(a), b));
-}
-
-
-_NEON2SSESTORAGE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8
-_NEON2SSE_INLINE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VSHR.S8 q0,q0,#8
-{
-    //no 8 bit shift available, go to 16 bit trick
-    __m128i zero, mask0, a_sign, r, a_sign_mask;
-    _NEON2SSE_ALIGN_16 static const int16_t mask0_16[9] = {0x0000, 0x0080, 0x00c0, 0x00e0, 0x00f0,  0x00f8, 0x00fc, 0x00fe, 0x00ff};
-    zero = _mm_setzero_si128();
-    mask0 = _mm_set1_epi16(mask0_16[b]); //to mask the bits to be "spoiled"  by 16 bit shift
-    a_sign =  _mm_cmpgt_epi8 (zero, a); //ff if a<0 or zero if a>0
-    r = _mm_srai_epi16 (a, b);
-    a_sign_mask =  _mm_and_si128 (mask0, a_sign);
-    r =  _mm_andnot_si128 (mask0, r);
-    return _mm_or_si128 (r, a_sign_mask);
-}
-
-_NEON2SSE_GLOBAL int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16
-#define vshrq_n_s16 _mm_srai_epi16
-
-_NEON2SSE_GLOBAL int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32
-#define vshrq_n_s32 _mm_srai_epi32
-
-_NEON2SSESTORAGE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64
-_NEON2SSE_INLINE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b)
-{
-    //SIMD implementation may not be optimal due to the absence of a 64 bit arithmetic shift in x86 SIMD
-    __m128i c1, signmask,a0,  res64;
-    _NEON2SSE_ALIGN_16 static const uint64_t mask[] = {0x8000000000000000, 0x8000000000000000};
-    c1 =  _mm_cmpeq_epi32(a,a); //0xffffffffffffffff
-    signmask  =  _mm_slli_epi64 (c1, (64 - b));
-    a0 = _mm_or_si128(a, *(__m128i*)mask); //get the first bit
-    a0 = _MM_CMPEQ_EPI64 (a, a0);
-    signmask = _mm_and_si128(a0, signmask);
-    res64 = _mm_srli_epi64 (a, b);
-    return _mm_or_si128(res64, signmask);
-}
-
-_NEON2SSESTORAGE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8
-_NEON2SSE_INLINE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VSHR.U8 q0,q0,#8
-{
-    //no 8 bit shift available, need the special trick
-    __m128i mask0, r;
-    _NEON2SSE_ALIGN_16 static const uint16_t mask10_16[9] = {0xffff, 0xff7f, 0xff3f, 0xff1f, 0xff0f,  0xff07, 0xff03, 0xff01, 0xff00};
-    mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled"  by 16 bit shift
-    r = _mm_srli_epi16 ( a, b);
-    return _mm_and_si128 (r,  mask0);
-}
-
-_NEON2SSE_GLOBAL uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.s16 q0,q0,#16
-#define vshrq_n_u16 _mm_srli_epi16
-
-_NEON2SSE_GLOBAL uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32
-#define vshrq_n_u32 _mm_srli_epi32
-
-_NEON2SSE_GLOBAL uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64
-#define vshrq_n_u64 _mm_srli_epi64
-
-//*************************** Vector shift left by constant *************************
-//*********************************************************************************
-_NEON2SSESTORAGE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
-_NEON2SSE_INLINE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VSHL.I8 d0,d0,#0
-{
-    //no 8 bit shift available, go to 16 bit
-    int8x8_t res64;
-    __m128i r;
-    r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
-    r = _mm_slli_epi16 (r, b); //SSE2
-    r = _mm_shuffle_epi8 (r, *(__m128i*) mask8_16_even_odd); //return to 8 bit, we need 64 bits only
-    return64(r);
-}
-
-_NEON2SSESTORAGE int16x4_t vshl_n_s16(int16x4_t a,  __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
-_NEON2SSE_INLINE int16x4_t vshl_n_s16(int16x4_t a,  __constrange(0,15) int b)
-{
-    int16x4_t res64;
-    return64(_mm_slli_epi16(_pM128i(a), b));
-}
-
-
-_NEON2SSESTORAGE int32x2_t vshl_n_s32(int32x2_t a,  __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
-_NEON2SSE_INLINE int32x2_t vshl_n_s32(int32x2_t a,  __constrange(0,31) int b)
-{
-    int32x2_t res64;
-    return64(_mm_slli_epi32(_pM128i(a), b));
-}
-
-
-_NEON2SSESTORAGE int64x1_t vshl_n_s64(int64x1_t a,  __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
-_NEON2SSE_INLINE int64x1_t vshl_n_s64(int64x1_t a,  __constrange(0,63) int b)
-{
-    int64x1_t res64;
-    return64(_mm_slli_epi64(_pM128i(a), b));
-}
-
-
-_NEON2SSESTORAGE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
-_NEON2SSE_INLINE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b)
-{
-    //no 8 bit shift available, go to 16 bit
-    uint8x8_t res64;
-    __m128i mask8;
-    __m128i r;
-    mask8 = _mm_set1_epi16(0xff);
-    r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
-    r = _mm_slli_epi16 (r, b); //SSE2
-    r = _mm_and_si128(r, mask8); //to avoid saturation
-    r = _mm_packus_epi16 (r,r); //we need 64 bits only
-    return64(r);
-}
-
-_NEON2SSE_GLOBAL uint16x4_t vshl_n_u16(uint16x4_t a,  __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
-#define vshl_n_u16 vshl_n_s16
-
-
-_NEON2SSE_GLOBAL uint32x2_t vshl_n_u32(uint32x2_t a,  __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
-#define vshl_n_u32 vshl_n_s32
-
-_NEON2SSE_GLOBAL uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
-#define vshl_n_u64 vshl_n_s64
-
-_NEON2SSE_GLOBAL int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
-#define vshlq_n_s8 vshlq_n_u8
-
-_NEON2SSE_GLOBAL int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
-#define vshlq_n_s16 _mm_slli_epi16
-
-_NEON2SSE_GLOBAL int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
-#define vshlq_n_s32 _mm_slli_epi32
-
-_NEON2SSE_GLOBAL int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
-#define vshlq_n_s64 _mm_slli_epi64
-
-_NEON2SSESTORAGE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
-_NEON2SSE_INLINE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b)
-{
-    //no 8 bit shift available, need the special trick
-    __m128i mask0, r;
-    _NEON2SSE_ALIGN_16 static const uint16_t mask10_16[9] = {0xffff, 0xfeff, 0xfcff, 0xf8ff, 0xf0ff,  0xe0ff, 0xc0ff, 0x80ff, 0xff};
-    mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled"  by 16 bit shift
-    r = _mm_slli_epi16 ( a, b);
-    return _mm_and_si128 (r,  mask0);
-}
-
-_NEON2SSE_GLOBAL uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
-#define vshlq_n_u16 vshlq_n_s16
-
-_NEON2SSE_GLOBAL uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
-#define vshlq_n_u32 vshlq_n_s32
-
-_NEON2SSE_GLOBAL uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
-#define vshlq_n_u64 vshlq_n_s64
-
-//************* Vector rounding shift right by constant ******************
-//*************************************************************************
-//No corresponding  x86 intrinsics exist, need to do some tricks
-_NEON2SSESTORAGE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8
-_NEON2SSE_INLINE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VRSHR.S8 d0,d0,#8
-{
-    //no 8 bit shift available, go to 16 bit
-    int8x8_t res64;
-    __m128i r, maskb;
-    r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
-    maskb =  _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit
-    maskb = _mm_srli_epi16 (maskb, 15); //1 or 0
-    r = _mm_srai_epi16 (r, b);
-    r = _mm_add_epi16 (r, maskb); //actual rounding
-    r = _mm_packs_epi16 (r,r); //we need 64 bits only
-    return64(r);
-}
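The maskb trick above is the scalar identity (x >> b) + ((x >> (b - 1)) & 1): the most significant discarded bit is added back to the truncated quotient, matching ARM's rounding behaviour. A minimal scalar reference, hypothetical helper name:

/* Scalar equivalent of the rounding right shift used by vrshr_n_s8. */
static int16_t rounding_shift_right_ref(int16_t x, int b) {
    return (int16_t)((x >> b) + ((x >> (b - 1)) & 1));
}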
-
-_NEON2SSESTORAGE int16x4_t vrshr_n_s16(int16x4_t a,  __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16
-_NEON2SSE_INLINE int16x4_t vrshr_n_s16(int16x4_t a,  __constrange(1,16) int b)
-{
-    int16x4_t res64;
-    return64(vrshrq_n_s16(_pM128i(a), b));
-}
-
-
-_NEON2SSESTORAGE int32x2_t vrshr_n_s32(int32x2_t a,  __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32
-_NEON2SSE_INLINE int32x2_t vrshr_n_s32(int32x2_t a,  __constrange(1,32) int b)
-{
-    int32x2_t res64;
-    return64(vrshrq_n_s32(_pM128i(a), b));
-}
-
-
-_NEON2SSESTORAGE int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    //serial solution is faster
-    int64x1_t res;
-    int64_t a_i64 = *( int64_t*)&a;
-    if(b==64) {
-        res.m64_i64[0] = 0; //for some compilers rounding happens and we would need to use (a_i64 & _SIGNBIT64) >> 63;
-    } else {
-        int64_t maskb = a_i64 & (( int64_t)1 << (b - 1));
-        res.m64_i64[0] = (a_i64 >> b) + (maskb >> (b - 1));
-    }
-    return res;
-}
-
-_NEON2SSESTORAGE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8
-_NEON2SSE_INLINE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VRSHR.U8 d0,d0,#8
-{
-    //no 8 bit shift available, go to 16 bit; the solution may not be optimal compared with the serial one
-    uint8x8_t res64;
-    __m128i r, maskb;
-    r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
-    maskb =  _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit
-    maskb = _mm_srli_epi16 (maskb, 15); //1 or 0
-    r = _mm_srli_epi16 (r, b);
-    r = _mm_add_epi16 (r, maskb); //actual rounding
-    r =  _mm_packus_epi16 (r,r); //we need 64 bits only
-    return64(r);
-}
-
-_NEON2SSESTORAGE uint16x4_t vrshr_n_u16(uint16x4_t a,  __constrange(1,16) int b); // VRSHR.s16 d0,d0,#16
-_NEON2SSE_INLINE uint16x4_t vrshr_n_u16(uint16x4_t a,  __constrange(1,16) int b)
-{
-    uint16x4_t res64;
-    return64(vrshrq_n_u16(_pM128i(a), b));
-}
-
-
-_NEON2SSESTORAGE uint32x2_t vrshr_n_u32(uint32x2_t a,  __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32
-_NEON2SSE_INLINE uint32x2_t vrshr_n_u32(uint32x2_t a,  __constrange(1,32) int b)
-{
-    uint32x2_t res64;
-    return64(vrshrq_n_u32(_pM128i(a), b));
-}
-
-
-_NEON2SSESTORAGE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64
-_NEON2SSE_INLINE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b)
-{
-    uint64x1_t res64;
-    return64(vrshrq_n_u64(_pM128i(a), b));
-}
-
-_NEON2SSESTORAGE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8
-_NEON2SSE_INLINE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VRSHR.S8 q0,q0,#8
-{
-    //no 8 bit shift available, go to 16 bit trick
-    __m128i r, mask1, maskb;
-    _NEON2SSE_ALIGN_16 static const uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // 2^b-th bit set to 1
-    r = vshrq_n_s8 (a, b);
-    mask1 = _mm_set1_epi16(mask2b[b]); // 2^b-th bit set to 1 for 16bit, need it for rounding
-    maskb = _mm_and_si128(a, mask1); //get b or 0 for rounding
-    maskb =  _mm_srli_epi16 (maskb, b - 1); // to add 1
-    return _mm_add_epi8(r, maskb); //actual rounding
-}
-
-_NEON2SSESTORAGE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16
-_NEON2SSE_INLINE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16
-{
-    __m128i maskb, r;
-    maskb =  _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit
-    maskb = _mm_srli_epi16(maskb, 15); //1 or 0
-    r = _mm_srai_epi16 (a, b);
-    return _mm_add_epi16 (r, maskb); //actual rounding
-}
-
-_NEON2SSESTORAGE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32
-_NEON2SSE_INLINE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32
-{
-    __m128i maskb,  r;
-    maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit
-    maskb = _mm_srli_epi32 (maskb,31); //1 or 0
-    r = _mm_srai_epi32(a, b);
-    return _mm_add_epi32 (r, maskb); //actual rounding
-}
-
-_NEON2SSESTORAGE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64
-_NEON2SSE_INLINE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b)
-{
-    //solution may not be optimal compared with a serial one
-    __m128i maskb;
-    int64x2_t r;
-    maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit
-    maskb = _mm_srli_epi64 (maskb,63); //1 or 0
-    r = vshrq_n_s64(a, b);
-    return _mm_add_epi64 (r, maskb); //actual rounding
-}
-
-_NEON2SSESTORAGE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8
-_NEON2SSE_INLINE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VRSHR.U8 q0,q0,#8
-{
-    //no 8 bit shift available, go to 16 bit trick
-    __m128i r, mask1, maskb;
-    _NEON2SSE_ALIGN_16 static const uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // 2^b-th bit set to 1
-    r = vshrq_n_u8 (a, b);
-    mask1 = _mm_set1_epi16(mask2b[b]); // 2^b-th bit set to 1 for 16bit, need it for rounding
-    maskb = _mm_and_si128(a, mask1); //get b or 0 for rounding
-    maskb =  _mm_srli_epi16 (maskb, b - 1); // to add 1
-    return _mm_add_epi8(r, maskb); //actual rounding
-}
-
-_NEON2SSESTORAGE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.s16 q0,q0,#16
-_NEON2SSE_INLINE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16
-{
-    __m128i maskb, r;
-    maskb =  _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit
-    maskb = _mm_srli_epi16(maskb, 15); //1 or 0
-    r = _mm_srli_epi16 (a, b);
-    return _mm_add_epi16 (r, maskb); //actual rounding
-}
-
-_NEON2SSESTORAGE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32
-_NEON2SSE_INLINE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32
-{
-    __m128i maskb,  r;
-    maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit
-    maskb = _mm_srli_epi32 (maskb,31); //1 or 0
-    r = _mm_srli_epi32(a, b);
-    return _mm_add_epi32 (r, maskb); //actual rounding
-}
-
-_NEON2SSESTORAGE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64
-_NEON2SSE_INLINE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b)
-{
-    //solution may not be optimal compared with a serial one
-    __m128i maskb,  r;
-    maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit
-    maskb = _mm_srli_epi64 (maskb,63); //1 or 0
-    r = _mm_srli_epi64(a, b);
-    return _mm_add_epi64 (r, maskb); //actual rounding
-}
-
-//************* Vector shift right by constant and accumulate *********
-//*********************************************************************
-_NEON2SSESTORAGE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8
-_NEON2SSE_INLINE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VSRA.S8 d0,d0,#8
-{
-    int8x8_t shift;
-    shift = vshr_n_s8(b, c);
-    return vadd_s8( a, shift);
-}
-
-_NEON2SSESTORAGE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16
-_NEON2SSE_INLINE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VSRA.S16 d0,d0,#16
-{
-    int16x4_t shift;
-    shift = vshr_n_s16( b, c);
-    return vadd_s16(a, shift);
-}
-
-_NEON2SSESTORAGE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32
-_NEON2SSE_INLINE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VSRA.S32 d0,d0,#32
-{
-    //may not be optimal compared with the serial execution
-    int32x2_t shift;
-    shift = vshr_n_s32(b, c);
-    return vadd_s32( a, shift);
-}
-
-_NEON2SSESTORAGE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64
-_NEON2SSE_INLINE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c)
-{
-    //may not be optimal compared with a serial solution
-    int64x1_t shift;
-    shift = vshr_n_s64(b, c);
-    return vadd_s64( a, shift);
-}
-
-_NEON2SSESTORAGE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8
-_NEON2SSE_INLINE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VSRA.U8 d0,d0,#8
-{
-    uint8x8_t shift;
-    shift = vshr_n_u8(b, c);
-    return vadd_u8(a, shift);
-}
-
-_NEON2SSESTORAGE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.s16 d0,d0,#16
-_NEON2SSE_INLINE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VSRA.s16 d0,d0,#16
-{
-    uint16x4_t shift;
-    shift = vshr_n_u16(b, c);
-    return vadd_u16(a,shift);
-}
-
-_NEON2SSESTORAGE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRA.U32 d0,d0,#32
-_NEON2SSE_INLINE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VSRA.U32 d0,d0,#32
-{
-    //may not be optimal compared with the serial execution
-    uint32x2_t shift;
-    shift = vshr_n_u32(b, c);
-    return vadd_u32( a, shift);
-}
-
-_NEON2SSESTORAGE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64
-_NEON2SSE_INLINE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c) // VSRA.U64 d0,d0,#64
-{
-    //may not be optimal compared with the serial execution
-    uint64x1_t shift;
-    shift = vshr_n_u64(b, c);
-    return vadd_u64(a, shift);
-}
-
-_NEON2SSESTORAGE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8
-_NEON2SSE_INLINE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRA.S8 q0,q0,#8
-{
-    int8x16_t shift;
-    shift = vshrq_n_s8(b, c);
-    return vaddq_s8(a, shift);
-}
-
-_NEON2SSESTORAGE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16
-_NEON2SSE_INLINE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRA.S16 q0,q0,#16
-{
-    int16x8_t shift;
-    shift = vshrq_n_s16(b, c);
-    return vaddq_s16(a, shift);
-}
-
-_NEON2SSESTORAGE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32
-_NEON2SSE_INLINE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRA.S32 q0,q0,#32
-{
-    int32x4_t shift;
-    shift = vshrq_n_s32(b, c);
-    return vaddq_s32(a, shift);
-}
-
-_NEON2SSESTORAGE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64
-_NEON2SSE_INLINE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c) // VSRA.S64 q0,q0,#64
-{
-    int64x2_t shift;
-    shift = vshrq_n_s64(b, c);
-    return vaddq_s64( a, shift);
-}
-
-_NEON2SSESTORAGE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8
-_NEON2SSE_INLINE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VSRA.U8 q0,q0,#8
-{
-    uint8x16_t shift;
-    shift = vshrq_n_u8(b, c);
-    return vaddq_u8(a, shift);
-}
-
-_NEON2SSESTORAGE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.s16 q0,q0,#16
-_NEON2SSE_INLINE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VSRA.s16 q0,q0,#16
-{
-    uint16x8_t shift;
-    shift = vshrq_n_u16(b, c);
-    return vaddq_u16(a,  shift);
-}
-
-_NEON2SSESTORAGE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32
-_NEON2SSE_INLINE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VSRA.U32 q0,q0,#32
-{
-    uint32x4_t shift;
-    shift = vshrq_n_u32(b, c);
-    return vaddq_u32(a, shift);
-}
-
-_NEON2SSESTORAGE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64
-_NEON2SSE_INLINE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c) // VSRA.U64 q0,q0,#64
-{
-    uint64x2_t shift;
-    shift = vshrq_n_u64(b, c);
-    return vaddq_u64(a, shift);
-}
-
-//************* Vector rounding shift right by constant and accumulate ****************************
-//************************************************************************************************
-_NEON2SSESTORAGE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8
-_NEON2SSE_INLINE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VRSRA.S8 d0,d0,#8
-{
-    int8x8_t shift;
-    shift = vrshr_n_s8(b, c);
-    return vadd_s8( a, shift);
-}
-
-_NEON2SSESTORAGE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16
-_NEON2SSE_INLINE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VRSRA.S16 d0,d0,#16
-{
-    int16x4_t shift;
-    shift = vrshr_n_s16( b, c);
-    return vadd_s16(a, shift);
-}
-
-_NEON2SSESTORAGE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32
-_NEON2SSE_INLINE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VRSRA.S32 d0,d0,#32
-{
-    //may not be optimal compared with the serial execution
-    int32x2_t shift;
-    shift = vrshr_n_s32(b, c);
-    return vadd_s32( a, shift);
-}
-
-_NEON2SSESTORAGE int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution
-{
-    int64x1_t shift;
-    shift = vrshr_n_s64(b, c);
-    return vadd_s64( a, shift);
-}
-
-_NEON2SSESTORAGE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8
-_NEON2SSE_INLINE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VRSRA.U8 d0,d0,#8
-{
-    uint8x8_t shift;
-    shift = vrshr_n_u8(b, c);
-    return vadd_u8(a, shift);
-}
-
-_NEON2SSESTORAGE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.s16 d0,d0,#16
-_NEON2SSE_INLINE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VRSRA.s16 d0,d0,#16
-{
-    uint16x4_t shift;
-    shift = vrshr_n_u16(b, c);
-    return vadd_u16(a,shift);
-}
-
-_NEON2SSESTORAGE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32
-_NEON2SSE_INLINE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VRSRA.U32 d0,d0,#32
-{
-    //may not be optimal compared with the serial execution
-    uint32x2_t shift;
-    shift = vrshr_n_u32(b, c);
-    return vadd_u32( a, shift);
-}
-
-_NEON2SSESTORAGE uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution
-{
-    //may not be optimal compared with the serial execution
-    uint64x1_t shift;
-    shift = vrshr_n_u64(b, c);
-    return vadd_u64( a, shift);
-}
-
-_NEON2SSESTORAGE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8
-_NEON2SSE_INLINE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VRSRA.S8 q0,q0,#8
-{
-    int8x16_t shift;
-    shift = vrshrq_n_s8(b, c);
-    return vaddq_s8(a, shift);
-}
-
-_NEON2SSESTORAGE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16
-_NEON2SSE_INLINE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VRSRA.S16 q0,q0,#16
-{
-    int16x8_t shift;
-    shift = vrshrq_n_s16(b, c);
-    return vaddq_s16(a, shift);
-}
-
-_NEON2SSESTORAGE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32
-_NEON2SSE_INLINE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VRSRA.S32 q0,q0,#32
-{
-    int32x4_t shift;
-    shift = vrshrq_n_s32(b, c);
-    return vaddq_s32(a, shift);
-}
-
-_NEON2SSESTORAGE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64
-_NEON2SSE_INLINE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c)
-{
-    int64x2_t shift;
-    shift = vrshrq_n_s64(b, c);
-    return vaddq_s64(a, shift);
-}
-
-_NEON2SSESTORAGE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8
-_NEON2SSE_INLINE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VRSRA.U8 q0,q0,#8
-{
-    uint8x16_t shift;
-    shift = vrshrq_n_u8(b, c);
-    return vaddq_u8(a, shift);
-}
-
-_NEON2SSESTORAGE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.s16 q0,q0,#16
-_NEON2SSE_INLINE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VRSRA.s16 q0,q0,#16
-{
-    uint16x8_t shift;
-    shift = vrshrq_n_u16(b, c);
-    return vaddq_u16(a,  shift);
-}
-
-_NEON2SSESTORAGE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32
-_NEON2SSE_INLINE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VRSRA.U32 q0,q0,#32
-{
-    uint32x4_t shift;
-    shift = vrshrq_n_u32(b, c);
-    return vaddq_u32(a, shift);
-}
-
-_NEON2SSESTORAGE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64
-_NEON2SSE_INLINE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c)
-{
-    uint64x2_t shift;
-    shift = vrshrq_n_u64(b, c);
-    return vaddq_u64(a, shift);
-}
-
-//**********************Vector saturating shift left by constant *****************************
-//********************************************************************************************
-//we don't check const ranges, assuming they are met
-_NEON2SSESTORAGE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0
-_NEON2SSE_INLINE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHL.S8 d0,d0,#0
-{
-    //no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in packs function)
-    int8x8_t res64;
-    __m128i a128, r128;
-    a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
-    r128 = _mm_slli_epi16 (a128, b);
-    r128 = _mm_packs_epi16 (r128,r128); //saturated s8, use 64 low bits only
-    return64(r128);
-}
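The saturating variants above rely on the same widen/shift/pack pattern: the shift happens in a wider type so nothing is lost, and the pack instruction clamps the result on the way back down. A per-lane scalar sketch under that reading (editor's illustration, not code from the header):

    #include <stdint.h>

    /* Saturating left shift of one signed 8-bit lane, 0 <= b <= 7 (illustrative sketch only). */
    static int8_t qshl_sat_s8(int8_t a, int b) {
        int16_t wide = (int16_t)(a * (1 << b));   /* widen, then shift (multiply avoids UB on negatives) */
        if (wide > INT8_MAX) return INT8_MAX;     /* clamp, as _mm_packs_epi16 does per lane */
        if (wide < INT8_MIN) return INT8_MIN;
        return (int8_t)wide;
    }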
-
-_NEON2SSESTORAGE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0
-_NEON2SSE_INLINE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHL.S16 d0,d0,#0
-{
-    // go to 32 bit to get the auto saturation (in packs function)
-    int16x4_t res64;
-    __m128i a128, r128;
-    a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1
-    r128 = _mm_slli_epi32 (a128, b); //shift_res
-    r128 = _mm_packs_epi32 (r128,r128); //saturated s16, use 64 low bits only
-    return64(r128);
-}
-
-_NEON2SSESTORAGE int32x2_t vqshl_n_s32(int32x2_t a,  __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0
-_NEON2SSE_INLINE int32x2_t vqshl_n_s32(int32x2_t a,  __constrange(0,31) int b)
-{
-    //serial execution may be faster
-    int32x2_t res64;
-    return64(vqshlq_n_s32 (_pM128i(a), b));
-}
-
-
-_NEON2SSESTORAGE int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    // no effective SIMD solution here
-    int64x1_t res;
-    int64_t bmask;
-    int64_t a_i64 = *( int64_t*)&a;
-    bmask = ( int64_t)1 << (63 - b); //positive
-    if (a_i64 >= bmask) {
-        res.m64_i64[0] = ~(_SIGNBIT64);
-    } else {
-        res.m64_i64[0]  = (a_i64 <= -bmask) ? (int64_t)_SIGNBIT64 : a_i64 << b;
-    }
-    return res;
-}
-
-
-_NEON2SSESTORAGE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0
-_NEON2SSE_INLINE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b) // VQSHL.U8 d0,d0,#0
-{
-    //no 8 bit shift available in IA32 SIMD, go to 16 bit
-    uint8x8_t res64;
-    __m128i a128, r128;
-    a128 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
-    r128 = _mm_slli_epi16 (a128, b); //shift_res
-    r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only
-    return64(r128);
-}
-
-_NEON2SSESTORAGE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.s16 d0,d0,#0
-_NEON2SSE_INLINE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b) // VQSHL.s16 d0,d0,#0
-{
-    // go to 32 bit to get the auto saturation (in packus function)
-    uint16x4_t res64;
-    __m128i a128, r128;
-    a128 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE 4.1
-    r128 = _mm_slli_epi32 (a128, b); //shift_res
-    r128 = _MM_PACKUS1_EPI32 (r128); //saturated s16
-    return64(r128);
-}
-
-_NEON2SSESTORAGE uint32x2_t vqshl_n_u32(uint32x2_t a,  __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0
-_NEON2SSE_INLINE uint32x2_t vqshl_n_u32(uint32x2_t a,  __constrange(0,31) int b)
-{
-    uint32x2_t res64;
-    return64(vqshlq_n_u32(_pM128i(a), b));
-}
-
-_NEON2SSESTORAGE uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    // no effective SIMD solution here
-    uint64x1_t res;
-    uint64_t bmask;
-    uint64_t a_i64 = *(uint64_t*)&a;
-    bmask = ( uint64_t)1 << (64 - b);
-    res.m64_u64[0] = (a_i64 >= bmask)&&(b>0) ? 0xffffffffffffffff : a_i64 << b; //if b=0 we are fine with any a
-    return res;
-}
-
-_NEON2SSESTORAGE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0
-_NEON2SSE_INLINE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHL.S8 q0,q0,#0
-{
-    // go to 16 bit to get the auto saturation (in packs function)
-    __m128i a128, r128_1, r128_2;
-    a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1
-    r128_1 = _mm_slli_epi16 (a128, b);
-    //swap hi and low part of a128 to process the remaining data
-    a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
-    a128 = _MM_CVTEPI8_EPI16 (a128);
-    r128_2 = _mm_slli_epi16 (a128, b);
-    return _mm_packs_epi16 (r128_1, r128_2); //saturated s8
-}
-
-_NEON2SSESTORAGE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0
-_NEON2SSE_INLINE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHL.S16 q0,q0,#0
-{
-    // a manual saturation solution looks less efficient than the 32-bit conversion one
-    // go to 32 bit to get the auto saturation (in packs function)
-    __m128i a128, r128_1, r128_2;
-    a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1
-    r128_1 = _mm_slli_epi32 (a128, b); //shift_res
-    //swap hi and low part of a128 to process the remaining data
-    a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
-    a128 = _MM_CVTEPI16_EPI32 (a128);
-    r128_2 = _mm_slli_epi32 (a128, b);
-    return _mm_packs_epi32 (r128_1, r128_2); //saturated s16
-}
-
-_NEON2SSESTORAGE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0
-_NEON2SSE_INLINE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHL.S32 q0,q0,#0
-{
-    // no 64 bit saturation option available, special tricks necessary
-    __m128i c1, maskA, saturation_mask, c7ffffff_mask, shift_res, shift_res_mask;
-    c1 = _mm_cmpeq_epi32(a,a); //0xff..ff
-    maskA = _mm_srli_epi32(c1, b + 1); //mask for positive numbers: (b+1) leading zeros and (31-b) ones
-    saturation_mask = _mm_cmpgt_epi32 (a, maskA); //0xff...ff if we need saturation, 0  otherwise
-    c7ffffff_mask  = _mm_srli_epi32(saturation_mask, 1); //saturated to 0x7f..ff when needed and zeros if not
-    shift_res = _mm_slli_epi32 (a, b);
-    shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res);
-    //result with positive numbers saturated
-    shift_res = _mm_or_si128 (c7ffffff_mask, shift_res_mask);
-    //treat negative numbers
-    maskA = _mm_slli_epi32(c1, 31 - b); //mask for negative numbers: (b+1) leading ones and (31-b) zeros
-    saturation_mask = _mm_cmpgt_epi32 (maskA,a); //0xff...ff if we need saturation, 0  otherwise
-    c7ffffff_mask  = _mm_slli_epi32(saturation_mask, 31); //saturated to 0x80..00 when needed and zeros if not
-    shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res);
-    return _mm_or_si128 (c7ffffff_mask, shift_res_mask);
-}
-
-_NEON2SSESTORAGE int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    // no effective SIMD solution here
-    _NEON2SSE_ALIGN_16 int64_t atmp[2], res[2];
-    int64_t bmask;
-    int i;
-    bmask = ( int64_t)1 << (63 - b); //positive
-    _mm_store_si128((__m128i*)atmp, a);
-    for (i = 0; i<2; i++) {
-        if (atmp[i] >= bmask) {
-            res[i] = ~(_SIGNBIT64);
-        } else {
-            res[i] = (atmp[i] <= -bmask) ? (int64_t)_SIGNBIT64 : atmp[i] << b;
-        }
-    }
-    return _mm_load_si128((__m128i*)res);
-}
-
-_NEON2SSESTORAGE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0
-_NEON2SSE_INLINE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b) // VQSHL.U8 q0,q0,#0
-{
-    // go to 16 bit to get the auto saturation (in packs function)
-    __m128i a128, r128_1, r128_2;
-    a128 = _MM_CVTEPU8_EPI16 (a); //SSE 4.1
-    r128_1 = _mm_slli_epi16 (a128, b);
-    //swap hi and low part of a128 to process the remaining data
-    a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
-    a128 = _MM_CVTEPU8_EPI16 (a128);
-    r128_2 = _mm_slli_epi16 (a128, b);
-    return _mm_packus_epi16 (r128_1, r128_2); //saturated u8
-}
-
-_NEON2SSESTORAGE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.s16 q0,q0,#0
-_NEON2SSE_INLINE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b) // VQSHL.s16 q0,q0,#0
-{
-    // a manual saturation solution looks more efficient than the 32-bit conversion one
-    __m128i cb, c8000, a_signed, saturation_mask,  shift_res;
-    cb = _mm_set1_epi16((1 << (16 - b)) - 1 - 0x8000 );
-    c8000 = _mm_set1_epi16 (-32768); // (int16_t)0x8000
-    //no unsigned shorts comparison in SSE, only signed available, so we need the trick
-    a_signed = _mm_sub_epi16(a, c8000); //go to signed
-    saturation_mask = _mm_cmpgt_epi16 (a_signed, cb);
-    shift_res = _mm_slli_epi16 (a, b);
-    return _mm_or_si128 (shift_res, saturation_mask);
-}
-
-_NEON2SSESTORAGE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0
-_NEON2SSE_INLINE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b) // VQSHL.U32 q0,q0,#0
-{
-    // manual saturation solution, no 64 bit saturation option, the serial version may be faster
-    __m128i cb, c80000000, a_signed, saturation_mask,  shift_res;
-    cb = _mm_set1_epi32((1 << (32 - b)) - 1 - 0x80000000 );
-    c80000000 = _mm_set1_epi32 (0x80000000);
-    //no unsigned ints comparison in SSE, only signed available, so we need the trick
-    a_signed = _mm_sub_epi32(a, c80000000); //go to signed
-    saturation_mask = _mm_cmpgt_epi32 (a_signed, cb);
-    shift_res = _mm_slli_epi32 (a, b);
-    return _mm_or_si128 (shift_res, saturation_mask);
-}
-
-_NEON2SSESTORAGE uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    // no effective SIMD solution here
-    _NEON2SSE_ALIGN_16 uint64_t atmp[2], res[2];
-    uint64_t bmask;
-    int i;
-    bmask = ( uint64_t)1 << (64 - b);
-    _mm_store_si128((__m128i*)atmp, a);
-    for (i = 0; i<2; i++) {
-        res[i] = (atmp[i] >= bmask)&&(b>0) ? 0xffffffffffffffff : atmp[i] << b; //if b=0 we are fine with any a
-    }
-    return _mm_load_si128((__m128i*)res);
-}
-
-//**************Vector signed->unsigned saturating shift left by constant *************
-//*************************************************************************************
-_NEON2SSESTORAGE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0
-_NEON2SSE_INLINE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHLU.S8 d0,d0,#0
-{
-    //no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in packs function)
-    uint8x8_t res64;
-    __m128i a128, r128;
-    a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
-    r128 = _mm_slli_epi16 (a128, b);
-    r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only
-    return64(r128);
-}
-
-_NEON2SSESTORAGE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0
-_NEON2SSE_INLINE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHLU.S16 d0,d0,#0
-{
-    uint16x4_t res64;
-    __m128i a128, r128;
-    a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1
-    r128 = _mm_slli_epi32 (a128, b); //shift_res
-    r128 = _MM_PACKUS1_EPI32 (r128); //saturated s16, use 64 low bits only
-    return64(r128);
-}
-
-_NEON2SSESTORAGE uint32x2_t vqshlu_n_s32(int32x2_t a,  __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0
-_NEON2SSE_INLINE uint32x2_t vqshlu_n_s32(int32x2_t a,  __constrange(0,31) int b)
-{
-    uint32x2_t res64;
-    return64( vqshluq_n_s32(_pM128i(a), b));
-}
-
-_NEON2SSESTORAGE uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) // no effective SIMD solution here, serial execution looks faster
-{
-    uint64x1_t res;
-    uint64_t limit;
-    if (a.m64_i64[0]<=0) {
-        res.m64_u64[0] = 0;
-    } else {
-        limit = (uint64_t) 1 << (64 - b);
-        res.m64_u64[0] = ( ((uint64_t)a.m64_i64[0]) >= limit) ? ~((uint64_t)0) : (uint64_t)a.m64_i64[0] << b;
-    }
-    return res;
-}
-
-_NEON2SSESTORAGE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0
-_NEON2SSE_INLINE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHLU.S8 q0,q0,#0
-{
-    __m128i a128, r128_1, r128_2;
-    a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1
-    r128_1 = _mm_slli_epi16 (a128, b);
-    //swap hi and low part of a128 to process the remaining data
-    a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
-    a128 = _MM_CVTEPI8_EPI16 (a128);
-    r128_2 = _mm_slli_epi16 (a128, b);
-    return _mm_packus_epi16 (r128_1, r128_2); //saturated u8
-}
-
-_NEON2SSESTORAGE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0
-_NEON2SSE_INLINE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHLU.S16 q0,q0,#0
-{
-    // a manual saturation solution looks less efficient than the 32-bit conversion one
-    __m128i a128, r128_1, r128_2;
-    a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1
-    r128_1 = _mm_slli_epi32 (a128, b); //shift_res
-    //swap hi and low part of a128 to process the remaining data
-    a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
-    a128 = _MM_CVTEPI16_EPI32 (a128);
-    r128_2 = _mm_slli_epi32 (a128, b);
-    return _MM_PACKUS_EPI32 (r128_1, r128_2); //saturated s16
-}
-
-_NEON2SSESTORAGE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0
-_NEON2SSE_INLINE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHLU.S32 q0,q0,#0
-{
-    //solution may not be optimal compared with the serial one
-    __m128i zero, maskA, maskGT0, a0,  a_masked, a_shift;
-    zero = _mm_setzero_si128();
-    maskA = _mm_cmpeq_epi32(a, a);
-    maskA = _mm_slli_epi32(maskA,(32 - b)); // b ones and (32-b)zeros
-    //saturate negative numbers to zero
-    maskGT0   = _mm_cmpgt_epi32 (a, zero); //0xffffffff if positive number and zero otherwise (negative numbers)
-    a0 = _mm_and_si128 (a,  maskGT0); //negative are zeros now
-    //saturate positive to 0xffffffff
-    a_masked = _mm_and_si128 (a0, maskA);
-    a_masked = _mm_cmpgt_epi32 (a_masked, zero); //0xffffffff if saturation necessary 0 otherwise
-    a_shift = _mm_slli_epi32 (a0, b);
-    return _mm_or_si128 (a_shift, a_masked); //actual saturation
-}
-
-_NEON2SSESTORAGE uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    // no effective SIMD solution here, serial execution looks faster
-    _NEON2SSE_ALIGN_16 int64_t atmp[2];
-    _NEON2SSE_ALIGN_16 uint64_t res[2];
-    uint64_t limit;
-    int i;
-    _mm_store_si128((__m128i*)atmp, a);
-    for (i = 0; i<2; i++) {
-        if (atmp[i]<=0) {
-            res[i] = 0;
-        } else {
-            limit = (uint64_t) 1 << (64 - b);
-            res[i] = ( ((uint64_t)atmp[i]) >= limit) ? ~((uint64_t)0) : (uint64_t)atmp[i] << b;
-        }
-    }
-    return _mm_load_si128((__m128i*)res);
-}
-
-//************** Vector narrowing  shift right by constant **************
-//**********************************************************************
-_NEON2SSESTORAGE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
-_NEON2SSE_INLINE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8
-{
-    int8x8_t res64;
-    __m128i r16;
-    r16  = vshrq_n_s16(a,b);
-    r16  = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
-    return64(r16);
-}
-
-_NEON2SSESTORAGE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
-_NEON2SSE_INLINE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16
-{
-    int16x4_t res64;
-    __m128i r32;
-    r32  = vshrq_n_s32(a,b);
-    r32  =  _mm_shuffle_epi8 (r32, *(__m128i*) mask8_32_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
-    return64(r32);
-}
-
-_NEON2SSESTORAGE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
-_NEON2SSE_INLINE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b)
-{
-    int32x2_t res64;
-    __m128i r64;
-    r64  = vshrq_n_s64(a,b);
-    r64  = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
-    return64(r64);
-}
-
-_NEON2SSESTORAGE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
-_NEON2SSE_INLINE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8
-{
-    uint8x8_t res64;
-    __m128i mask, r16;
-    mask = _mm_set1_epi16(0xff);
-    r16  = vshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
-    r16 = _mm_and_si128(r16, mask); //to avoid saturation
-    r16 = _mm_packus_epi16 (r16,r16); //narrow, use low 64 bits only
-    return64(r16);
-}
-
-_NEON2SSESTORAGE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
-_NEON2SSE_INLINE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16
-{
-    uint16x4_t res64;
-    __m128i mask, r32;
-    mask = _mm_set1_epi32(0xffff);
-    r32  = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
-    r32 = _mm_and_si128(r32, mask); //to avoid saturation
-    r32 =  _MM_PACKUS1_EPI32 (r32); //saturate and  narrow, use low 64 bits only
-    return64(r32);
-}
-
-_NEON2SSESTORAGE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
-_NEON2SSE_INLINE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
-{
-    uint32x2_t res64;
-    __m128i r64;
-    r64  = vshrq_n_u64(a,b);
-    r64  = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
-    return64(r64);
-}
-
-//************** Vector signed->unsigned narrowing saturating shift right by constant ********
-//*********************************************************************************************
-_NEON2SSESTORAGE uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8
-_NEON2SSE_INLINE uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRUN.S16 d0,q0,#8
-{
-    uint8x8_t res64;
-    __m128i r16;
-    r16  = vshrq_n_s16(a,b);
-    r16 = _mm_packus_epi16 (r16,r16); //saturate and  narrow (signed to unsigned), use low 64 bits only
-    return64(r16);
-}
-
-_NEON2SSESTORAGE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16
-_NEON2SSE_INLINE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRUN.S32 d0,q0,#16
-{
-    uint16x4_t res64;
-    __m128i r32;
-    r32  = vshrq_n_s32(a,b);
-    r32  = _MM_PACKUS1_EPI32 (r32); //saturate and  narrow(signed to unsigned), use low 64 bits only
-    return64(r32);
-}
-
-_NEON2SSESTORAGE uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster
-{
-    _NEON2SSE_ALIGN_16 int64_t atmp[2];
-    uint32x2_t res;
-    int64_t res64;
-    _mm_store_si128((__m128i*)atmp, a);
-    if (atmp[0] < 0) {
-        res.m64_u32[0] = 0;
-    } else {
-        res64 = (atmp[0] >> b);
-        res.m64_u32[0] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t) res64;
-    }
-    if (atmp[1] < 0) {
-        res.m64_u32[1] = 0;
-    } else {
-        res64 = (atmp[1] >> b);
-        res.m64_u32[1] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t)res64;
-    }
-    return res;
-}
-
-//**** Vector signed->unsigned rounding narrowing saturating shift right by constant *****
-_NEON2SSESTORAGE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8
-_NEON2SSE_INLINE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRUN.S16 d0,q0,#8
-{
-    //solution may not be optimal compared with the serial one
-    __m128i r16;
-    uint8x8_t res64;
-    r16 = vrshrq_n_s16(a,b);
-    r16 =  _mm_packus_epi16 (r16,r16); //saturate and  narrow (signed to unsigned), use low 64 bits only
-    return64(r16);
-}
-
-_NEON2SSESTORAGE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16
-_NEON2SSE_INLINE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRUN.S32 d0,q0,#16
-{
-    //solution may not be optimal compared with the serial one
-    __m128i r32;
-    uint16x4_t res64;
-    r32 = vrshrq_n_s32(a,b);
-    r32 =  _MM_PACKUS1_EPI32 (r32); //saturate and  narrow (signed to unsigned), use low 64 bits only
-    return64(r32);
-}
-
-_NEON2SSESTORAGE uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster
-{
-    _NEON2SSE_ALIGN_16 int64_t atmp[2];
-    uint32x2_t res;
-    int64_t res64;
-    _mm_store_si128((__m128i*)atmp, a);
-    if (atmp[0] < 0) {
-        res.m64_u32[0] = 0;
-    } else {
-        res64 = (atmp[0] >> b) + ( (atmp[0] & ((int64_t)1 << (b - 1))) >> (b - 1)  );
-        res.m64_u32[0] = (uint32_t) ((res64 > (int64_t)0xffffffff ) ? 0xffffffff : res64);
-    }
-    if (atmp[1] < 0) {
-        res.m64_u32[1] = 0;
-    } else {
-        res64 = (atmp[1] >> b) + ( (atmp[1] & ((int64_t)1 << (b - 1))) >> (b - 1)  );
-        res.m64_u32[1] = (uint32_t)((res64 > (int64_t)0xffffffff ) ? 0xffffffff : res64);
-    }
-    return res;
-}
-
-//***** Vector narrowing saturating shift right by constant ******
-//*****************************************************************
-_NEON2SSESTORAGE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8
-_NEON2SSE_INLINE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRN.S16 d0,q0,#8
-{
-    int8x8_t res64;
-    __m128i r16;
-    r16  = vshrq_n_s16(a,b);
-    r16  = _mm_packs_epi16 (r16,r16); //saturate and  narrow, use low 64 bits only
-    return64(r16);
-}
-
-_NEON2SSESTORAGE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16
-_NEON2SSE_INLINE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRN.S32 d0,q0,#16
-{
-    int16x4_t res64;
-    __m128i r32;
-    r32  = vshrq_n_s32(a,b);
-    r32  = _mm_packs_epi32 (r32,r32); //saturate and  narrow, use low 64 bits only
-    return64(r32);
-}
-
-_NEON2SSESTORAGE int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
-{
-    //no optimal SIMD solution found
-    _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2];
-    int32x2_t res;
-    _mm_store_si128((__m128i*)atmp, a);
-    res64[0] = (atmp[0] >> b);
-    res64[1] = (atmp[1] >> b);
-    if(res64[0]>SINT_MAX) res64[0] = SINT_MAX;
-    if(res64[0]<SINT_MIN) res64[0] = SINT_MIN;
-    if(res64[1]>SINT_MAX) res64[1] = SINT_MAX;
-    if(res64[1]<SINT_MIN) res64[1] = SINT_MIN;
-    res.m64_i32[0] = (int32_t)res64[0];
-    res.m64_i32[1] = (int32_t)res64[1];
-    return res;
-}
-
-_NEON2SSESTORAGE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.s16 d0,q0,#8
-_NEON2SSE_INLINE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQSHRN.s16 d0,q0,#8
-{
-    uint8x8_t res64;
-    __m128i r16;
-    r16  = vshrq_n_u16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
-    r16  = _mm_packus_epi16 (r16,r16); //saturate and  narrow, use low 64 bits only
-    return64(r16);
-}
-
-_NEON2SSESTORAGE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16
-_NEON2SSE_INLINE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQSHRN.U32 d0,q0,#16
-{
-    uint16x4_t res64;
-    __m128i r32;
-    r32  = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
-    r32  = _MM_PACKUS1_EPI32 (r32); //saturate and  narrow, use low 64 bits only
-    return64(r32);
-}
-
-_NEON2SSESTORAGE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32
-_NEON2SSE_INLINE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
-{
-    //serial solution may be faster
-    uint32x2_t res64;
-    __m128i r64, res_hi, zero;
-    zero = _mm_setzero_si128();
-    r64  = vshrq_n_u64(a,b);
-    res_hi = _mm_srli_epi64(r64,  32);
-    res_hi = _mm_cmpgt_epi32(res_hi, zero);
-    r64 = _mm_or_si128(r64, res_hi);
-    r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
-    return64(r64);
-}
-
-
-//********* Vector rounding narrowing shift right by constant *************************
-//****************************************************************************************
-_NEON2SSESTORAGE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
-_NEON2SSE_INLINE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8
-{
-    int8x8_t res64;
-    __m128i r16;
-     r16  = vrshrq_n_s16(a,b);
-    r16  = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
-    return64(r16);
-}
-
-_NEON2SSESTORAGE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
-_NEON2SSE_INLINE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16
-{
-    int16x4_t res64;
-    __m128i r32;
-    r32  = vrshrq_n_s32(a,b);
-    r32  =  _mm_shuffle_epi8 (r32, *(__m128i*) mask8_32_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
-    return64(r32);
-}
-
-_NEON2SSESTORAGE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
-_NEON2SSE_INLINE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b)
-{
-    int32x2_t res64;
-    __m128i r64;
-    r64  = vrshrq_n_s64(a,b);
-    r64  = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
-    return64(r64);
-}
-
-_NEON2SSESTORAGE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
-_NEON2SSE_INLINE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8
-{
-    uint8x8_t res64;
-    __m128i mask, r16;
-    mask = _mm_set1_epi16(0xff);
-    r16  = vrshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
-    r16 = _mm_and_si128(r16, mask); //to avoid saturation
-    r16 = _mm_packus_epi16 (r16,r16); //saturate and  narrow, use low 64 bits only
-    return64(r16);
-}
-
-_NEON2SSESTORAGE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
-_NEON2SSE_INLINE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16
-{
-    uint16x4_t res64;
-    __m128i mask, r32;
-    mask = _mm_set1_epi32(0xffff);
-    r32  = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
-    r32 = _mm_and_si128(r32, mask); //to avoid saturation
-    r32 = _MM_PACKUS1_EPI32 (r32); //saturate and  narrow, use low 64 bits only
-    return64(r32);
-}
-
-_NEON2SSESTORAGE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
-_NEON2SSE_INLINE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b) //serial solution may be faster
-{
-    uint32x2_t res64;
-    __m128i r64;
-    r64  = vrshrq_n_u64(a,b);
-    r64  =  _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
-    return64(r64);
-}
-
-//************* Vector rounding narrowing saturating shift right by constant ************
-//****************************************************************************************
-_NEON2SSESTORAGE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8
-_NEON2SSE_INLINE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRN.S16 d0,q0,#8
-{
-    int8x8_t res64;
-    __m128i r16;
-    r16  = vrshrq_n_s16(a,b);
-    r16  =  _mm_packs_epi16 (r16,r16); //saturate and  narrow, use low 64 bits only
-    return64(r16);
-}
-
-_NEON2SSESTORAGE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16
-_NEON2SSE_INLINE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRN.S32 d0,q0,#16
-{
-    int16x4_t res64;
-    __m128i r32;
-    r32  = vrshrq_n_s32(a,b);
-    r32  = _mm_packs_epi32 (r32,r32); //saturate and  narrow, use low 64 bits only
-    return64(r32);
-}
-
-_NEON2SSESTORAGE int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
-{
-    //no optimal SIMD solution found
-    _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2], maskb[2];
-    int32x2_t res;
-    _mm_store_si128((__m128i*)atmp, a);
-    maskb[0] = atmp[0] & (( int64_t)1 << (b - 1));
-    res64[0] = (atmp[0] >> b) + (maskb[0] >> (b - 1)); //rounded result
-    maskb[1] = atmp[1] & (( int64_t)1 << (b - 1));
-    res64[1] = (atmp[1] >> b) + (maskb[1] >> (b - 1)); //rounded result
-    if(res64[0]>SINT_MAX) res64[0] = SINT_MAX;
-    if(res64[0]<SINT_MIN) res64[0] = SINT_MIN;
-    if(res64[1]>SINT_MAX) res64[1] = SINT_MAX;
-    if(res64[1]<SINT_MIN) res64[1] = SINT_MIN;
-    res.m64_i32[0] = (int32_t)res64[0];
-    res.m64_i32[1] = (int32_t)res64[1];
-    return res;
-}
-
-_NEON2SSESTORAGE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.s16 d0,q0,#8
-_NEON2SSE_INLINE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQRSHRN.s16 d0,q0,#8
-{
-    uint8x8_t res64;
-    __m128i r16;
-    r16  = vrshrq_n_u16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
-    r16  = _mm_packus_epi16 (r16,r16); //saturate and  narrow, use low 64 bits only
-    return64(r16);
-}
-
-_NEON2SSESTORAGE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16
-_NEON2SSE_INLINE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQRSHRN.U32 d0,q0,#16
-{
-    uint16x4_t res64;
-    __m128i r32;
-    r32  = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
-    r32  = _MM_PACKUS1_EPI32 (r32); //saturate and  narrow, use low 64 bits only
-    return64(r32);
-}
-
-_NEON2SSESTORAGE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32
-_NEON2SSE_INLINE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
-{
-    //serial solution may be faster
-    uint32x2_t res64;
-    __m128i r64, res_hi, zero;
-    zero = _mm_setzero_si128();
-    r64  = vrshrq_n_u64(a,b);
-    res_hi = _mm_srli_epi64(r64,  32);
-    res_hi = _mm_cmpgt_epi32(res_hi, zero);
-    r64 = _mm_or_si128(r64, res_hi);
-    r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
-    return64(r64);
-}
-
-//************** Vector widening shift left by constant ****************
-//************************************************************************
-_NEON2SSESTORAGE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0
-_NEON2SSE_INLINE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b) // VSHLL.S8 q0,d0,#0
-{
-    __m128i r;
-    r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
-    return _mm_slli_epi16 (r, b);
-}
-
-_NEON2SSESTORAGE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0
-_NEON2SSE_INLINE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b) // VSHLL.S16 q0,d0,#0
-{
-    __m128i r;
-    r =  _MM_CVTEPI16_EPI32(_pM128i(a)); //SSE4.1,
-    return _mm_slli_epi32 (r, b);
-}
-
-_NEON2SSESTORAGE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0
-_NEON2SSE_INLINE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b) // VSHLL.S32 q0,d0,#0
-{
-    __m128i r;
-    r =  _MM_CVTEPI32_EPI64(_pM128i(a)); //SSE4.1,
-    return _mm_slli_epi64 (r, b);
-}
-
-_NEON2SSESTORAGE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0
-_NEON2SSE_INLINE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b) // VSHLL.U8 q0,d0,#0
-{
-    //no uint8 to uint16 conversion available, manual conversion used
-    __m128i zero,  r;
-    zero = _mm_setzero_si128 ();
-    r = _mm_unpacklo_epi8(_pM128i(a), zero);
-    return _mm_slli_epi16 (r, b);
-}
-
-_NEON2SSESTORAGE uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.s16 q0,d0,#0
-_NEON2SSE_INLINE uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b) // VSHLL.s16 q0,d0,#0
-{
-    //no uint16 to uint32 conversion available, manual conversion used
-    __m128i zero,  r;
-    zero = _mm_setzero_si128 ();
-    r = _mm_unpacklo_epi16(_pM128i(a), zero);
-    return _mm_slli_epi32 (r, b);
-}
-
-_NEON2SSESTORAGE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0
-_NEON2SSE_INLINE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b) // VSHLL.U32 q0,d0,#0
-{
-    //no uint32 to uint64 conversion available, manual conversion used
-    __m128i zero,  r;
-    zero = _mm_setzero_si128 ();
-    r = _mm_unpacklo_epi32(_pM128i(a), zero);
-    return _mm_slli_epi64 (r, b);
-}
-
-//************************************************************************************
-//**************************** Shifts with insert ************************************
-//************************************************************************************
-//Takes each element in a vector, shifts it by an immediate value,
-//and inserts the result in the destination vector. Bits shifted out of each element are lost.
-
-//**************** Vector shift right and insert ************************************
-//Actually the "c" left bits from "a" are the only bits remained from "a"  after the shift.
-//All other bits are taken from b shifted.
-_NEON2SSESTORAGE int8x8_t vsri_n_s8(int8x8_t a,  int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
-_NEON2SSE_INLINE int8x8_t vsri_n_s8(int8x8_t a,  int8x8_t b, __constrange(1,8) int c)
-{
-    int8x8_t res64;
-    return64(vsriq_n_s8(_pM128i(a),_pM128i(b), c));
-}
-
-
-_NEON2SSESTORAGE int16x4_t vsri_n_s16(int16x4_t a,  int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
-_NEON2SSE_INLINE int16x4_t vsri_n_s16(int16x4_t a,  int16x4_t b, __constrange(1,16) int c)
-{
-    int16x4_t res64;
-    return64(vsriq_n_s16(_pM128i(a),_pM128i(b), c));
-}
-
-
-_NEON2SSESTORAGE int32x2_t vsri_n_s32(int32x2_t a,  int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
-_NEON2SSE_INLINE int32x2_t vsri_n_s32(int32x2_t a,  int32x2_t b, __constrange(1,32) int c)
-{
-    int32x2_t res64;
-    return64(vsriq_n_s32(_pM128i(a),_pM128i(b), c));
-}
-
-
-_NEON2SSESTORAGE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
-_NEON2SSE_INLINE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c)
-{
-    int64x1_t res;
-    if (c ==64)
-        res = a;
-    else{
-        res.m64_i64[0] = (b.m64_u64[0] >> c) | ((a.m64_i64[0] >> (64 - c)) << (64 - c)); //treat b as unsigned for shift to get leading zeros
-    }
-    return res;
-}
-
-_NEON2SSE_GLOBAL uint8x8_t vsri_n_u8(uint8x8_t a,  uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
-#define vsri_n_u8 vsri_n_s8
-
-_NEON2SSE_GLOBAL uint16x4_t vsri_n_u16(uint16x4_t a,  uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
-#define vsri_n_u16 vsri_n_s16
-
-_NEON2SSE_GLOBAL uint32x2_t vsri_n_u32(uint32x2_t a,  uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
-#define vsri_n_u32 vsri_n_s32
-
-
-_NEON2SSE_GLOBAL uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
-#define vsri_n_u64 vsri_n_s64
-
-_NEON2SSE_GLOBAL poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
-#define vsri_n_p8 vsri_n_u8
-
-_NEON2SSE_GLOBAL poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
-#define vsri_n_p16 vsri_n_u16
-
-_NEON2SSESTORAGE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
-_NEON2SSE_INLINE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRI.8 q0,q0,#8
-{
-    __m128i maskA, a_masked;
-    uint8x16_t b_shift;
-    _NEON2SSE_ALIGN_16 static const uint8_t maskLeft[9] = {0x0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; //"a" bits mask, 0 bit not used
-    maskA = _mm_set1_epi8(maskLeft[c]); // c ones and (8-c)zeros
-    a_masked = _mm_and_si128 (a, maskA);
-    b_shift = vshrq_n_u8( b, c); // c zeros on the left in b due to logical shift
-    return _mm_or_si128 (a_masked, b_shift); //combine (insert b into a)
-}
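The shift-right-and-insert emulation above follows the same per-lane recipe in every width: keep the top c bits of a and fill the rest with b shifted right logically by c. A scalar sketch of one lane (editor's illustration only, not part of the header):

    #include <stdint.h>

    /* VSRI-style insert on one 8-bit lane, 1 <= c <= 8 (illustrative sketch only). */
    static uint8_t sri_u8(uint8_t a, uint8_t b, int c) {
        uint8_t keep = (uint8_t)(0xFFu << (8 - c));        /* mask of the c bits preserved from a */
        return (uint8_t)((a & keep) | (uint8_t)(b >> c));  /* insert the logically shifted b below them */
    }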
-
-_NEON2SSESTORAGE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
-_NEON2SSE_INLINE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRI.16 q0,q0,#16
-{
-    //to cut "c" left bits from a we do shift right and then  shift back left providing c right zeros in a
-    uint16x8_t b_shift;
-    uint16x8_t a_c;
-    b_shift = vshrq_n_u16( b, c); // c zeros on the left in b due to logical shift
-    a_c = vshrq_n_u16( a, (16 - c));
-    a_c  = _mm_slli_epi16(a_c, (16 - c)); //logical shifts zero the low (16-c) bits of a
-    return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
-}
-
-_NEON2SSESTORAGE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
-_NEON2SSE_INLINE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRI.32 q0,q0,#32
-{
-    //to cut "c" left bits from a we do shift right and then  shift back left providing c right zeros in a
-    uint32x4_t b_shift;
-    uint32x4_t a_c;
-    b_shift = vshrq_n_u32( b, c); // c zeros on the left in b due to logical shift
-    a_c = vshrq_n_u32( a, (32 - c));
-    a_c  = _mm_slli_epi32(a_c, (32 - c)); //logical shift provides right "c" bits zeros in a
-    return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
-}
-
-_NEON2SSESTORAGE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
-_NEON2SSE_INLINE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c)
-{
-    //serial solution may be faster
-    uint64x2_t b_shift;
-    uint64x2_t a_c;
-    b_shift = _mm_srli_epi64(b, c); // c zeros on the left in b due to logical shift
-    a_c = _mm_srli_epi64(a, (64 - c));
-    a_c  = _mm_slli_epi64(a_c, (64 - c)); //logical shift provides right "c" bits zeros in a
-    return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
-}
-
-_NEON2SSE_GLOBAL uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
-#define vsriq_n_u8 vsriq_n_s8
-
-_NEON2SSE_GLOBAL uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
-#define vsriq_n_u16 vsriq_n_s16
-
-_NEON2SSE_GLOBAL uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
-#define vsriq_n_u32 vsriq_n_s32
-
-_NEON2SSE_GLOBAL uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
-#define vsriq_n_u64 vsriq_n_s64
-
-_NEON2SSE_GLOBAL poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
-#define vsriq_n_p8 vsriq_n_u8
-
-_NEON2SSE_GLOBAL poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
-#define vsriq_n_p16 vsriq_n_u16
-
-//***** Vector shift left and insert *********************************************
-//*********************************************************************************
-//Actually the "c" right bits from "a" are the only bits remained from "a"  after the shift.
-//All other bits are taken from b shifted. Ending zeros are inserted in b in the shift proces. We need to combine "a" and "b shifted".
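// [Editorial sketch, not part of the original diff] Scalar model of one VSLI ("shift left and
// insert") lane as described above: the low c bits of a are kept, the rest is filled with b << c.
// The helper name and the example values are illustrative only.
static inline uint8_t vsli_scalar_u8(uint8_t a, uint8_t b, int c) { // 0 <= c <= 7
    uint8_t low_mask = (uint8_t)((1u << c) - 1u);                   // c ones in the low bits
    return (uint8_t)((uint8_t)(b << c) | (a & low_mask));           // e.g. a=0xAB, b=0xCD, c=4 -> 0xDB
}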
-_NEON2SSESTORAGE int8x8_t vsli_n_s8(int8x8_t a,  int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
-_NEON2SSE_INLINE int8x8_t vsli_n_s8(int8x8_t a,  int8x8_t b, __constrange(0,7) int c)
-{
-    int8x8_t res64;
-    return64(vsliq_n_s8(_pM128i(a),_pM128i(b), c));
-}
-
-
-_NEON2SSESTORAGE int16x4_t vsli_n_s16(int16x4_t a,  int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
-_NEON2SSE_INLINE int16x4_t vsli_n_s16(int16x4_t a,  int16x4_t b, __constrange(0,15) int c)
-{
-    int16x4_t res64;
-    return64(vsliq_n_s16(_pM128i(a),_pM128i(b), c));
-}
-
-
-_NEON2SSESTORAGE int32x2_t vsli_n_s32(int32x2_t a,  int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
-_NEON2SSE_INLINE int32x2_t vsli_n_s32(int32x2_t a,  int32x2_t b, __constrange(0,31) int c)
-{
-    int32x2_t res64;
-    return64(vsliq_n_s32(_pM128i(a),_pM128i(b), c));
-}
-
-_NEON2SSESTORAGE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
-_NEON2SSE_INLINE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c)
-{
-    int64x1_t res;
-    res.m64_i64[0] = (b.m64_i64[0] << c) | ((a.m64_u64[0] << (64 - c)) >> (64 - c)); //need to treat a as unsigned to get leading zeros
-    return res;
-}
-
-
-_NEON2SSE_GLOBAL uint8x8_t vsli_n_u8(uint8x8_t a,  uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
-#define vsli_n_u8 vsli_n_s8
-
-_NEON2SSE_GLOBAL uint16x4_t vsli_n_u16(uint16x4_t a,  uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
-#define vsli_n_u16 vsli_n_s16
-
-_NEON2SSE_GLOBAL uint32x2_t vsli_n_u32(uint32x2_t a,  uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
-#define vsli_n_u32 vsli_n_s32
-
-_NEON2SSE_GLOBAL uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
-#define vsli_n_u64 vsli_n_s64
-
-_NEON2SSE_GLOBAL poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
-#define vsli_n_p8 vsli_n_u8
-
-_NEON2SSE_GLOBAL poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
-#define vsli_n_p16 vsli_n_u16
-
-_NEON2SSESTORAGE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
-_NEON2SSE_INLINE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c) // VSLI.8 q0,q0,#0
-{
-    __m128i maskA, a_masked;
-    int8x16_t b_shift;
-    _NEON2SSE_ALIGN_16 static const uint8_t maskRight[8] = {0x0, 0x1, 0x3, 0x7, 0x0f, 0x1f, 0x3f, 0x7f}; //"a" bits mask
-    maskA = _mm_set1_epi8(maskRight[c]); // (8-c)zeros and c ones
-    b_shift = vshlq_n_s8( b, c);
-    a_masked = _mm_and_si128 (a, maskA);
-    return _mm_or_si128 (b_shift, a_masked); //combine (insert b into a)
-}
-
-_NEON2SSESTORAGE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
-_NEON2SSE_INLINE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c) // VSLI.16 q0,q0,#0
-{
-    //to cut "c" right bits from a we do shift left and then logical shift back right providing (16-c)zeros in a
-    int16x8_t b_shift;
-    int16x8_t a_c;
-    b_shift = vshlq_n_s16( b, c);
-    a_c = vshlq_n_s16( a, (16 - c));
-    a_c  = _mm_srli_epi16(a_c, (16 - c));
-    return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
-}
-
-_NEON2SSESTORAGE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
-_NEON2SSE_INLINE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c) // VSLI.32 q0,q0,#0
-{
-    //solution may be  not optimal compared with the serial one
-    //to cut "c" right bits from a we do shift left and then logical shift back right providing (32-c)zeros in a
-    int32x4_t b_shift;
-    int32x4_t a_c;
-    b_shift = vshlq_n_s32( b, c);
-    a_c = vshlq_n_s32( a, (32 - c));
-    a_c  = _mm_srli_epi32(a_c, (32 - c));
-    return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
-}
-
-_NEON2SSESTORAGE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
-_NEON2SSE_INLINE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c) // VSLI.64 q0,q0,#0
-{
-    //solution may be  not optimal compared with the serial one
-    //to cut "c" right bits from a we do shift left and then logical shift back right providing (64-c)zeros in a
-    int64x2_t b_shift;
-    int64x2_t a_c;
-    b_shift = vshlq_n_s64( b, c);
-    a_c = vshlq_n_s64( a, (64 - c));
-    a_c  = _mm_srli_epi64(a_c, (64 - c));
-    return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
-}
-
-_NEON2SSE_GLOBAL uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
-#define vsliq_n_u8 vsliq_n_s8
-
-_NEON2SSE_GLOBAL uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
-#define vsliq_n_u16 vsliq_n_s16
-
-_NEON2SSE_GLOBAL uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
-#define vsliq_n_u32 vsliq_n_s32
-
-_NEON2SSE_GLOBAL uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
-#define vsliq_n_u64 vsliq_n_s64
-
-_NEON2SSE_GLOBAL poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
-#define vsliq_n_p8 vsliq_n_u8
-
-_NEON2SSE_GLOBAL poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
-#define vsliq_n_p16 vsliq_n_u16
-
-// ***********************************************************************************************
-// ****************** Loads and stores of a single vector ***************************************
-// ***********************************************************************************************
-//Performs loads and stores of a single vector of some type.
-//*******************************  Loads ********************************************************
-// ***********************************************************************************************
-//We assume ptr is NOT aligned in the general case and use __m128i _mm_loadu_si128 ((__m128i*) ptr).
-//Also, for SSE3-supporting systems, using __m128i _mm_lddqu_si128 (__m128i const* p) for unaligned access may be advantageous:
-//it loads a 32-byte block aligned on a 16-byte boundary and extracts the 16 bytes corresponding to the unaligned access.
-//If the ptr is aligned then __m128i _mm_load_si128 ((__m128i*) ptr) could be used instead.
-#define LOAD_SI128(ptr) \
-        ( ((uintptr_t)(ptr) & 15) == 0 ) ? _mm_load_si128((__m128i*)(ptr)) : _mm_loadu_si128((__m128i*)(ptr))
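// [Editorial sketch, not part of the original diff] The LOAD_SI128 dispatch above is resolved at
// run time: an aligned pointer takes the _mm_load_si128 branch, anything else falls back to
// _mm_loadu_si128. The function and buffer names here are illustrative only.
static inline __m128i load_example(const uint8_t *pixels) {
    __m128i maybe_aligned = vld1q_u8(pixels);              // aligned branch only if pixels is 16-byte aligned
    __m128i always_unaligned = vld1q_u8(pixels + 1);       // offset by one byte -> _mm_loadu_si128 branch
    return _mm_or_si128(maybe_aligned, always_unaligned);  // combine so both loads are used
}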
-
-_NEON2SSE_GLOBAL uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
-#define vld1q_u8 LOAD_SI128
-
-_NEON2SSE_GLOBAL uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
-#define vld1q_u16 LOAD_SI128
-
-_NEON2SSE_GLOBAL uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
-#define vld1q_u32 LOAD_SI128
-
-_NEON2SSE_GLOBAL uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
-#define vld1q_u64 LOAD_SI128
-
-_NEON2SSE_GLOBAL int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
-#define vld1q_s8 LOAD_SI128
-
-_NEON2SSE_GLOBAL int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
-#define vld1q_s16 LOAD_SI128
-
-_NEON2SSE_GLOBAL int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
-#define vld1q_s32 LOAD_SI128
-
-_NEON2SSE_GLOBAL int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
-#define vld1q_s64 LOAD_SI128
-
-_NEON2SSE_GLOBAL float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0]
-// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers
-/* _NEON2SSE_INLINE float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr)// VLD1.16 {d0, d1}, [r0]
-{__m128 f1 = _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]);
-__m128 f2;
-f2 = _mm_set_ps (ptr[7], ptr[6], ptr[5], ptr[4]);
-}*/
-
-_NEON2SSESTORAGE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
-_NEON2SSE_INLINE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr)
-{
-    if( (((uintptr_t)(ptr)) & 15 ) == 0 ) //16-byte aligned
-        return _mm_load_ps(ptr);
-    else
-        return _mm_loadu_ps(ptr);
-}
-
-_NEON2SSE_GLOBAL poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
-#define vld1q_p8  LOAD_SI128
-
-_NEON2SSE_GLOBAL poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
-#define vld1q_p16 LOAD_SI128
-
-_NEON2SSE_GLOBAL uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0]
-#define vld1_u8(ptr)  *((__m64_128*)(ptr)) //was _mm_loadl_epi64((__m128i*)(ptr))
-
-_NEON2SSE_GLOBAL uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0]
-#define vld1_u16 vld1_u8
-
-_NEON2SSE_GLOBAL uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0]
-#define vld1_u32 vld1_u8
-
-
-_NEON2SSE_GLOBAL uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
-#define vld1_u64 vld1_u8
-
-_NEON2SSE_GLOBAL int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0]
-#define vld1_s8 vld1_u8
-
-_NEON2SSE_GLOBAL int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0]
-#define vld1_s16 vld1_u16
-
-_NEON2SSE_GLOBAL int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0]
-#define vld1_s32 vld1_u32
-
-_NEON2SSE_GLOBAL int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
-#define vld1_s64 vld1_u64
-
-_NEON2SSE_GLOBAL float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0]
-// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit like _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]);
-
-_NEON2SSESTORAGE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0]
-_NEON2SSE_INLINE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr)
-{
-    float32x2_t res;
-    res.m64_f32[0] = *(ptr);
-    res.m64_f32[1] = *(ptr + 1);
-    return res;
-}
-
-_NEON2SSE_GLOBAL poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0]
-#define vld1_p8 vld1_u8
-
-_NEON2SSE_GLOBAL poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0]
-#define vld1_p16 vld1_u16
-
-
-_NEON2SSESTORAGE float64x2_t vld1q_f64(__transfersize(4) float64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
-_NEON2SSE_INLINE float64x2_t vld1q_f64(__transfersize(4) float64_t const * ptr)
-{
-    if ((((uintptr_t)(ptr)) & 15) == 0) //16-byte aligned
-        return _mm_load_pd(ptr);
-    else
-        return _mm_loadu_pd(ptr);
-}
-
-
-//***********************************************************************************************************
-//******* Lane load functions - insert the data at  vector's given position (lane) *************************
-//***********************************************************************************************************
-_NEON2SSE_GLOBAL uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
-#define vld1q_lane_u8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
-
-_NEON2SSE_GLOBAL uint16x8_t vld1q_lane_u16(__transfersize(1)    uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
-#define vld1q_lane_u16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
-
-_NEON2SSE_GLOBAL uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
-#define vld1q_lane_u32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane)
-
-_NEON2SSE_GLOBAL uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
-#define vld1q_lane_u64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane)
-
-
-_NEON2SSE_GLOBAL int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
-#define vld1q_lane_s8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
-
-_NEON2SSE_GLOBAL int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
-#define vld1q_lane_s16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
-
-_NEON2SSE_GLOBAL int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
-#define vld1q_lane_s32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane)
-
-_NEON2SSE_GLOBAL float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
-//current IA SIMD doesn't support float16
-
-_NEON2SSESTORAGE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
-_NEON2SSE_INLINE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane)
-{
-    //we need to deal with the case where ptr is not 16-byte aligned
-    __m128 p;
-    p = _mm_set1_ps(*(ptr));
-    return _MM_INSERT_PS(vec,  p, _INSERTPS_NDX(0, lane));
-}
-
-_NEON2SSE_GLOBAL int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
-#define vld1q_lane_s64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane)
-
-_NEON2SSE_GLOBAL poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
-#define vld1q_lane_p8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
-
-_NEON2SSE_GLOBAL poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
-#define vld1q_lane_p16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
-
-_NEON2SSESTORAGE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
-_NEON2SSE_INLINE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane)
-{
-    uint8x8_t res;
-    res = vec;
-    res.m64_u8[lane] = *(ptr);
-    return res;
-}
-
-_NEON2SSESTORAGE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
-_NEON2SSE_INLINE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane)
-{
-    uint16x4_t res;
-    res = vec;
-    res.m64_u16[lane] = *(ptr);
-    return res;
-}
-
-_NEON2SSESTORAGE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
-_NEON2SSE_INLINE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane)
-{
-    uint32x2_t res;
-    res = vec;
-    res.m64_u32[lane] = *(ptr);
-    return res;
-}
-
-_NEON2SSESTORAGE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0]
-_NEON2SSE_INLINE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane)
-{
-    uint64x1_t res;
-    UNREFERENCED_PARAMETER(vec);
-    UNREFERENCED_PARAMETER(lane);
-    res.m64_u64[0] = *(ptr);
-    return res;
-}
-
-
-_NEON2SSE_GLOBAL int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
-#define vld1_lane_s8(ptr, vec, lane) vld1_lane_u8((uint8_t*)ptr, vec, lane)
-
-_NEON2SSE_GLOBAL int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
-#define vld1_lane_s16(ptr, vec, lane) vld1_lane_u16((uint16_t*)ptr, vec, lane)
-
-_NEON2SSE_GLOBAL int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
-#define vld1_lane_s32(ptr, vec, lane) vld1_lane_u32((uint32_t*)ptr, vec, lane)
-
-_NEON2SSE_GLOBAL float16x4_t vld1_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
-//current IA SIMD doesn't support float16
-
-_NEON2SSESTORAGE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
-_NEON2SSE_INLINE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane)
-{
-    float32x2_t res;
-    res = vec;
-    res.m64_f32[lane] = *(ptr);
-    return res;
-}
-
-_NEON2SSE_GLOBAL int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0]
-#define vld1_lane_s64(ptr, vec, lane) vld1_lane_u64((uint64_t*)ptr, vec, lane)
-
-_NEON2SSE_GLOBAL poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
-#define vld1_lane_p8 vld1_lane_u8
-
-_NEON2SSE_GLOBAL poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
-#define vld1_lane_p16 vld1_lane_s16
-
-// ****************** Load single value ( set all lanes of vector with same value from memory)**********************
-// ******************************************************************************************************************
-_NEON2SSE_GLOBAL uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
-#define vld1q_dup_u8(ptr) _mm_set1_epi8(*(ptr))
-
-_NEON2SSE_GLOBAL uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
-#define vld1q_dup_u16(ptr) _mm_set1_epi16(*(ptr))
-
-_NEON2SSE_GLOBAL uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
-#define vld1q_dup_u32(ptr) _mm_set1_epi32(*(ptr))
-
-_NEON2SSESTORAGE uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
-_NEON2SSE_INLINE uint64x2_t   vld1q_dup_u64(__transfersize(1) uint64_t const * ptr)
-{
-    _NEON2SSE_ALIGN_16 uint64_t val[2];
-
-    val[0] = *(ptr);
-    val[1] = *(ptr);
-
-    return LOAD_SI128(val);
-}
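// [Editorial sketch, not part of the original diff] The "dup" loads broadcast one scalar to every
// lane, e.g. a per-draw constant such as a tint colour. The name and value are illustrative only.
static inline uint32x4_t broadcast_example(void) {
    uint32_t tint = 0xFF8040FFu;
    return vld1q_dup_u32(&tint);   // -> { tint, tint, tint, tint } via _mm_set1_epi32
}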
-
-_NEON2SSE_GLOBAL int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
-#define vld1q_dup_s8(ptr) _mm_set1_epi8(*(ptr))
-
-_NEON2SSE_GLOBAL int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
-#define vld1q_dup_s16(ptr) _mm_set1_epi16 (*(ptr))
-
-_NEON2SSE_GLOBAL int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
-#define vld1q_dup_s32(ptr) _mm_set1_epi32 (*(ptr))
-
-_NEON2SSE_GLOBAL int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
-#define vld1q_dup_s64(ptr) vld1q_dup_u64((uint64_t*)ptr)
-
-_NEON2SSE_GLOBAL float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
-//current IA SIMD doesn't support float16, need to go to 32 bits
-
-_NEON2SSE_GLOBAL float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
-#define vld1q_dup_f32(ptr) _mm_set1_ps (*(ptr))
-
-_NEON2SSE_GLOBAL poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
-#define vld1q_dup_p8(ptr) _mm_set1_epi8(*(ptr))
-
-_NEON2SSE_GLOBAL poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
-#define vld1q_dup_p16(ptr) _mm_set1_epi16 (*(ptr))
-
-_NEON2SSESTORAGE uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    uint8x8_t res;
-    int i;
-    for(i = 0; i<8; i++) {
-        res.m64_u8[i] =  *(ptr);
-    }
-    return res;
-}
-
-_NEON2SSESTORAGE uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    uint16x4_t res;
-    int i;
-    for(i = 0; i<4; i++) {
-        res.m64_u16[i] =  *(ptr);
-    }
-    return res;
-}
-
-_NEON2SSESTORAGE uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
-{
-    uint32x2_t res;
-    res.m64_u32[0] = *(ptr);
-    res.m64_u32[1] = *(ptr);
-    return res;
-}
-
-_NEON2SSESTORAGE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
-_NEON2SSE_INLINE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr)
-{
-    uint64x1_t res;
-    res.m64_u64[0] = *(ptr);
-    return res;
-}
-
-_NEON2SSE_GLOBAL int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
-#define vld1_dup_s8(ptr) vld1_dup_u8((uint8_t*)ptr)
-
-
-_NEON2SSE_GLOBAL int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
-#define vld1_dup_s16(ptr) vld1_dup_u16((uint16_t*)ptr)
-
-
-_NEON2SSE_GLOBAL int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
-#define vld1_dup_s32(ptr) vld1_dup_u32((uint32_t*)ptr)
-
-
-_NEON2SSE_GLOBAL int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
-#define vld1_dup_s64(ptr) vld1_dup_u64((uint64_t*)ptr)
-
-_NEON2SSE_GLOBAL float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
-//current IA SIMD doesn't support float16
-
-_NEON2SSESTORAGE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
-_NEON2SSE_INLINE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr)
-{
-    float32x2_t res;
-    res.m64_f32[0] = *(ptr);
-    res.m64_f32[1] = res.m64_f32[0];
-    return res; // use last 64bits only
-}
-
-_NEON2SSE_GLOBAL poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
-#define vld1_dup_p8 vld1_dup_u8
-
-
-_NEON2SSE_GLOBAL poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
-#define vld1_dup_p16 vld1_dup_u16
-
-
-//*************************************************************************************
-//********************************* Store **********************************************
-//*************************************************************************************
-// If ptr is 16-byte aligned and you need to store data without cache pollution then use void _mm_stream_si128 ((__m128i*)ptr, val);
-//here we assume that a NOT 16-byte aligned ptr is possible. If it is aligned we can use _mm_store_si128 as shown in the following macro
-#define STORE_SI128(ptr, val) \
-        (((uintptr_t)(ptr) & 15) == 0 ) ? _mm_store_si128 ((__m128i*)(ptr), val) : _mm_storeu_si128 ((__m128i*)(ptr), val);
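// [Editorial sketch, not part of the original diff] Contrast between the cached store above and the
// streaming store mentioned in the comment; the fill helper and its arguments are illustrative only.
static inline void fill_example(uint8_t *dst, __m128i v, size_t blocks, int streaming) {
    for (size_t i = 0; i < blocks; i++, dst += 16) {
        if (streaming && (((uintptr_t)dst & 15) == 0))
            _mm_stream_si128((__m128i*)dst, v);    // non-temporal: bypasses the cache, needs 16-byte alignment
        else
            _mm_storeu_si128((__m128i*)dst, v);    // regular (possibly unaligned) store, as in STORE_SI128
    }
}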
-
-_NEON2SSE_GLOBAL void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0]
-#define vst1q_u8 STORE_SI128
-
-_NEON2SSE_GLOBAL void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0]
-#define vst1q_u16 STORE_SI128
-
-_NEON2SSE_GLOBAL void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0]
-#define vst1q_u32 STORE_SI128
-
-_NEON2SSE_GLOBAL void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0]
-#define vst1q_u64 STORE_SI128
-
-_NEON2SSE_GLOBAL void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0]
-#define vst1q_s8 STORE_SI128
-
-_NEON2SSE_GLOBAL void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0]
-#define vst1q_s16 STORE_SI128
-
-_NEON2SSE_GLOBAL void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0]
-#define vst1q_s32 STORE_SI128
-
-_NEON2SSE_GLOBAL void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0]
-#define vst1q_s64 STORE_SI128
-
-_NEON2SSE_GLOBAL void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0]
-// IA32 SIMD doesn't work with 16bit floats currently
-
-_NEON2SSESTORAGE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0]
-_NEON2SSE_INLINE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val)
-{
-    if( ((uintptr_t)(ptr) & 15)  == 0 ) //16-byte aligned
-        _mm_store_ps (ptr, val);
-    else
-        _mm_storeu_ps (ptr, val);
-}
-
-_NEON2SSE_GLOBAL void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0]
-#define vst1q_p8  vst1q_u8
-
-_NEON2SSE_GLOBAL void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0]
-#define vst1q_p16 vst1q_u16
-
-_NEON2SSESTORAGE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0]
-_NEON2SSE_INLINE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val)
-{
-    int i;
-    for (i = 0; i<8; i++) {
-        *(ptr + i) = ((uint8_t*)&val)[i];
-    }
-    //_mm_storel_epi64((__m128i*)ptr, val);
-    return;
-}
-
-_NEON2SSESTORAGE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0]
-_NEON2SSE_INLINE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val)
-{
-    int i;
-    for (i = 0; i<4; i++) {
-        *(ptr + i) = ((uint16_t*)&val)[i];
-    }
-    //_mm_storel_epi64((__m128i*)ptr, val);
-    return;
-}
-
-_NEON2SSESTORAGE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0]
-_NEON2SSE_INLINE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val)
-{
-    int i;
-    for (i = 0; i<2; i++) {
-        *(ptr + i) = ((uint32_t*)&val)[i];
-    }
-    //_mm_storel_epi64((__m128i*)ptr, val);
-    return;
-}
-
-_NEON2SSESTORAGE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0]
-_NEON2SSE_INLINE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val)
-{
-    *(ptr) = *((uint64_t*)&val);
-    //_mm_storel_epi64((__m128i*)ptr, val);
-    return;
-}
-
-_NEON2SSE_GLOBAL void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0]
-#define vst1_s8(ptr,val) vst1_u8((uint8_t*)ptr,val)
-
-_NEON2SSE_GLOBAL void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0]
-#define vst1_s16(ptr,val) vst1_u16((uint16_t*)ptr,val)
-
-_NEON2SSE_GLOBAL void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0]
-#define vst1_s32(ptr,val) vst1_u32((uint32_t*)ptr,val)
-
-_NEON2SSE_GLOBAL void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0]
-#define vst1_s64(ptr,val) vst1_u64((uint64_t*)ptr,val)
-
-_NEON2SSE_GLOBAL void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0]
-//current IA SIMD doesn't support float16
-
-_NEON2SSESTORAGE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0]
-_NEON2SSE_INLINE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val)
-{
-    *(ptr) =   val.m64_f32[0];
-    *(ptr + 1) = val.m64_f32[1];
-    return;
-}
-
-_NEON2SSE_GLOBAL void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0]
-#define vst1_p8 vst1_u8
-
-_NEON2SSE_GLOBAL void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0]
-#define vst1_p16 vst1_u16
-
-//***********Store a lane of a vector into memory (extract given lane) *********************
-//******************************************************************************************
-_NEON2SSE_GLOBAL void vst1q_lane_u8(__transfersize(1) uint8_t * ptr, uint8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
-#define vst1q_lane_u8(ptr, val, lane) *(ptr) = (uint8_t) _MM_EXTRACT_EPI8 (val, lane)
-
-_NEON2SSE_GLOBAL void vst1q_lane_u16(__transfersize(1) uint16_t * ptr, uint16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
-#define vst1q_lane_u16(ptr, val, lane) *(ptr) = (uint16_t) _MM_EXTRACT_EPI16 (val, lane)
-
-_NEON2SSE_GLOBAL void vst1q_lane_u32(__transfersize(1) uint32_t * ptr, uint32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
-#define vst1q_lane_u32(ptr, val, lane) *(ptr) = (uint32_t) _MM_EXTRACT_EPI32 (val, lane)
-
-_NEON2SSE_GLOBAL void vst1q_lane_u64(__transfersize(1) uint64_t * ptr, uint64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0]
-#define vst1q_lane_u64(ptr, val, lane) *(ptr) = (uint64_t) _MM_EXTRACT_EPI64 (val, lane)
-
-_NEON2SSE_GLOBAL void vst1q_lane_s8(__transfersize(1) int8_t * ptr, int8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
-#define vst1q_lane_s8(ptr, val, lane) *(ptr) = (int8_t) _MM_EXTRACT_EPI8 (val, lane)
-
-_NEON2SSE_GLOBAL void vst1q_lane_s16(__transfersize(1) int16_t * ptr, int16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
-#define vst1q_lane_s16(ptr, val, lane) *(ptr) = (int16_t) _MM_EXTRACT_EPI16 (val, lane)
-
-_NEON2SSE_GLOBAL void vst1q_lane_s32(__transfersize(1) int32_t * ptr, int32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
-#define vst1q_lane_s32(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI32 (val, lane)
-
-_NEON2SSE_GLOBAL void vst1q_lane_s64(__transfersize(1) int64_t * ptr, int64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0]
-#define vst1q_lane_s64(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI64 (val, lane)
-
-_NEON2SSE_GLOBAL void vst1q_lane_f16(__transfersize(1) __fp16 * ptr, float16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
-//current IA SIMD doesn't support float16
-
-_NEON2SSESTORAGE void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
-_NEON2SSE_INLINE void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane)
-{
-    *((int32_t*)ptr) = _MM_EXTRACT_PS(val,lane);
-}
-
-_NEON2SSE_GLOBAL void vst1q_lane_p8(__transfersize(1) poly8_t * ptr, poly8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
-#define vst1q_lane_p8   vst1q_lane_u8
-
-_NEON2SSE_GLOBAL void vst1q_lane_p16(__transfersize(1) poly16_t * ptr, poly16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
-#define vst1q_lane_p16   vst1q_lane_s16
-
-_NEON2SSESTORAGE void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
-_NEON2SSE_INLINE void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane)
-{
-    *(ptr) = val.m64_u8[lane];
-}
-
-_NEON2SSESTORAGE void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
-_NEON2SSE_INLINE void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane)
-{
-    *(ptr) = val.m64_u16[lane];
-}
-
-_NEON2SSESTORAGE void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
-_NEON2SSE_INLINE void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane)
-{
-    *(ptr) = val.m64_u32[lane];
-}
-
-_NEON2SSESTORAGE void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0]
-_NEON2SSE_INLINE void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane)
-{
-    UNREFERENCED_PARAMETER(lane);
-    *(ptr) = val.m64_u64[0];
-}
-
-_NEON2SSE_GLOBAL void vst1_lane_s8(__transfersize(1) int8_t * ptr, int8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
-#define  vst1_lane_s8(ptr, val, lane) vst1_lane_u8((uint8_t*)ptr, val, lane)
-
-_NEON2SSE_GLOBAL void vst1_lane_s16(__transfersize(1) int16_t * ptr, int16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
-#define vst1_lane_s16(ptr, val, lane) vst1_lane_u16((uint16_t*)ptr, val, lane)
-
-_NEON2SSE_GLOBAL void vst1_lane_s32(__transfersize(1) int32_t * ptr, int32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
-#define vst1_lane_s32(ptr, val, lane)  vst1_lane_u32((uint32_t*)ptr, val, lane)
-
-
-_NEON2SSE_GLOBAL void vst1_lane_s64(__transfersize(1) int64_t * ptr, int64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0]
-#define vst1_lane_s64(ptr, val, lane) vst1_lane_u64((uint64_t*)ptr, val, lane)
-
-
-_NEON2SSE_GLOBAL void vst1_lane_f16(__transfersize(1) __fp16 * ptr, float16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
-//current IA SIMD doesn't support float16
-
-_NEON2SSESTORAGE void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
-_NEON2SSE_INLINE void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane)
-{
-    *(ptr) = val.m64_f32[lane];
-}
-
-_NEON2SSE_GLOBAL void vst1_lane_p8(__transfersize(1) poly8_t * ptr, poly8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
-#define vst1_lane_p8 vst1_lane_u8
-
-_NEON2SSE_GLOBAL void vst1_lane_p16(__transfersize(1) poly16_t * ptr, poly16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
-#define vst1_lane_p16 vst1_lane_s16
-
-//***********************************************************************************************
-//**************** Loads and stores of an N-element structure **********************************
-//***********************************************************************************************
-//These intrinsics load or store an n-element structure. The array structures are defined in the beginning
-//We assume ptr is NOT aligned in general case, for more details see  "Loads and stores of a single vector functions"
-//****************** 2 elements load  *********************************************
-_NEON2SSESTORAGE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
-_NEON2SSE_INLINE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr) // VLD2.8 {d0, d2}, [r0]
-{
-    uint8x16x2_t v;
-    v.val[0] = vld1q_u8(ptr);
-    v.val[1] = vld1q_u8((ptr + 16));
-    v = vuzpq_s8(v.val[0], v.val[1]);
-    return v;
-}
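// [Editorial sketch, not part of the original diff] Typical use of the 2-element structure load
// above: de-interleaving a stream of byte pairs. The function name and layout are illustrative only.
static inline uint8x16x2_t deinterleave_example(const uint8_t *xy /* x0 y0 x1 y1 ... x15 y15 */) {
    uint8x16x2_t split = vld2q_u8(xy);   // split.val[0] = x0..x15, split.val[1] = y0..y15
    return split;
}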
-
-_NEON2SSESTORAGE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
-_NEON2SSE_INLINE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr) // VLD2.16 {d0, d2}, [r0]
-{
-    uint16x8x2_t v;
-    v.val[0] = vld1q_u16( ptr);
-    v.val[1] = vld1q_u16( (ptr + 8));
-    v = vuzpq_s16(v.val[0], v.val[1]);
-    return v;
-}
-
-_NEON2SSESTORAGE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
-_NEON2SSE_INLINE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr) // VLD2.32 {d0, d2}, [r0]
-{
-    uint32x4x2_t v;
-    v.val[0] = vld1q_u32 ( ptr);
-    v.val[1] = vld1q_u32 ( (ptr + 4));
-    v = vuzpq_s32(v.val[0], v.val[1]);
-    return v;
-}
-
-_NEON2SSE_GLOBAL int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr);
-#define  vld2q_s8(ptr) vld2q_u8((uint8_t*) ptr)
-
-_NEON2SSE_GLOBAL int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
-#define vld2q_s16(ptr) vld2q_u16((uint16_t*) ptr)
-
-_NEON2SSE_GLOBAL int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
-#define vld2q_s32(ptr) vld2q_u32((uint32_t*) ptr)
-
-
-_NEON2SSE_GLOBAL float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0]
-// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
-
-_NEON2SSESTORAGE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
-_NEON2SSE_INLINE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr) // VLD2.32 {d0, d2}, [r0]
-{
-    float32x4x2_t v;
-    v.val[0] =  vld1q_f32 (ptr);
-    v.val[1] =  vld1q_f32 ((ptr + 4));
-    v = vuzpq_f32(v.val[0], v.val[1]);
-    return v;
-}
-
-_NEON2SSE_GLOBAL poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
-#define  vld2q_p8 vld2q_u8
-
-_NEON2SSE_GLOBAL poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
-#define vld2q_p16 vld2q_u16
-
-_NEON2SSESTORAGE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
-_NEON2SSE_INLINE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr)
-{
-    uint8x8x2_t v;
-    __m128i ld128;
-    ld128 = vld1q_u8(ptr); //merge two 64-bits in 128 bit
-    ld128 =  _mm_shuffle_epi8(ld128, *(__m128i*)mask8_16_even_odd);
-    vst1q_u8((v.val), ld128); //  v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
-    return v;
-}
-
-_NEON2SSESTORAGE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
-_NEON2SSE_INLINE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr)
-{
-    _NEON2SSE_ALIGN_16 uint16x4x2_t v;
-    __m128i ld128;


Commit: acc818a0cb701ee52b86bf010e15aa55505566c2
    https://github.com/scummvm/scummvm/commit/acc818a0cb701ee52b86bf010e15aa55505566c2
Author: wyatt-radkiewicz (wyattwradkiewicz at gmail.com)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: Intel/AMD's SIMD path goes to the normal one.

Changed paths:
    engines/ags/lib/allegro/surface_simd_sse.cpp
    engines/ags/lib/allegro/surface_simd_sse.h


diff --git a/engines/ags/lib/allegro/surface_simd_sse.cpp b/engines/ags/lib/allegro/surface_simd_sse.cpp
index f85b54b937a..7874194e4e5 100644
--- a/engines/ags/lib/allegro/surface_simd_sse.cpp
+++ b/engines/ags/lib/allegro/surface_simd_sse.cpp
@@ -15,14 +15,17 @@ namespace AGS3 {
 // This template handles 2bpp and 4bpp, the other specializations handle 1bpp and format conversion blits
 template<int DestBytesPerPixel, int SrcBytesPerPixel, int ScaleThreshold>
 void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
+    drawInnerGeneric<ScaleThreshold>(yStart, xStart, transColor, alphaMask, palette, useTint, sameFormat, src, destArea, horizFlip, vertFlip, skipTrans, srcAlpha, tintRed, tintGreen, tintBlue, dstRect, srcArea, blenderMode, scaleX, scaleY);
 }
 
 template<int ScaleThreshold>
 void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
+    drawInnerGeneric<ScaleThreshold>(yStart, xStart, transColor, alphaMask, palette, useTint, sameFormat, src, destArea, horizFlip, vertFlip, skipTrans, srcAlpha, tintRed, tintGreen, tintBlue, dstRect, srcArea, blenderMode, scaleX, scaleY);
 }
 
 template<int ScaleThreshold>
 void BITMAP::drawInner1Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
+    drawInnerGeneric<ScaleThreshold>(yStart, xStart, transColor, alphaMask, palette, useTint, sameFormat, src, destArea, horizFlip, vertFlip, skipTrans, srcAlpha, tintRed, tintGreen, tintBlue, dstRect, srcArea, blenderMode, scaleX, scaleY);
 }
 
 
diff --git a/engines/ags/lib/allegro/surface_simd_sse.h b/engines/ags/lib/allegro/surface_simd_sse.h
index 4d7bfd4302d..57296e1572d 100644
--- a/engines/ags/lib/allegro/surface_simd_sse.h
+++ b/engines/ags/lib/allegro/surface_simd_sse.h
@@ -27,7 +27,9 @@
 
 namespace AGS3 {
 
-
+inline __m128i simd2BppTo4Bpp(__m64 pixels) {
+    
+}
 
 } // namespace AGS3
 


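The commit above stubs out the SSE code paths so that, until the vectorized versions are written, the x86 entry points simply forward to the existing generic C++ blitter. A minimal sketch of that fallback pattern, using simplified hypothetical names rather than the engine's actual drawInner* signatures:

  #include <cstdint>

  // Hypothetical stand-ins for the real scanline blitters.
  template<int ScaleThreshold>
  void drawScanlineGeneric(uint32_t *dst, const uint32_t *src, int width) {
      for (int x = 0; x < width; ++x)
          dst[x] = src[x];                          // placeholder for the per-pixel blend
  }

  template<int ScaleThreshold>
  void drawScanlineSimd(uint32_t *dst, const uint32_t *src, int width) {
      // No vector body yet: defer to the scalar path so behaviour stays identical.
      drawScanlineGeneric<ScaleThreshold>(dst, src, width);
  }

Keeping the stubs compiling lets the SSE2 bodies land incrementally in the later commits without changing any callers.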
Commit: 87656d66dfa9bb148da697b90c27a1359af54d2c
    https://github.com/scummvm/scummvm/commit/87656d66dfa9bb148da697b90c27a1359af54d2c
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: Cleaned up blending/blitting pull request.

Changed paths:
  R bench_output.txt
  R bench_output_fast.txt
  R benchgfx32.bmp
  R benchgfx8.bmp
    engines/ags/engine/main/engine.cpp
    engines/ags/tests/test_gfx.cpp


diff --git a/bench_output.txt b/bench_output.txt
deleted file mode 100644
index 8dee5f04919..00000000000
--- a/bench_output.txt
+++ /dev/null
@@ -1,251 +0,0 @@
-121User picked target 'kq2agdi' (engine ID 'ags', game ID 'kq2agdi')...
-   Looking for a plugin supporting this target... Adventure Game Studio
-Running King's Quest II: Romancing the Stones Remake (English)
-kq2vga.exe: 40cfb7563df7dacf6530b19289a4745b, 12574643 bytes.
-Initializing backend libs
-Initializing game data
-Opened game data file: game28.dta
-Game data version: 42
-Compiled with: 3.2.0
-Startup directory: ./
-Data directory: ./
-Setting up game configuration
-Voice pack found: speech.vox
-audio.vox found and initialized.
-Initializing TTF renderer
-Initializing mouse: number of buttons reported is 3
-Install timer
-Initialize legacy path finder library
-Game title: 'King's Quest II'
-Game uid (old format): `1025889151`
-Game guid: '{b85ea0b0-35c5-4e53-bfc7-2281bf481001}'
-Game GUI version: 115
-Lipsync data found and loaded
-Checking for disk space
-Game native resolution: 320 x 200 (32 bit)
-Graphic settings: driver: Software, windowed: no, screen size: 0 x 0, game scale: proportional
-Graphic settings: refresh rate (optional): 0, vsync: 0
-Requested graphics driver 'Software' not found, will try existing drivers instead
-Graphics mode set: 320 x 200 (32-bit) fullscreen desktop
-Graphics mode set: refresh rate (optional): 0, vsync: 0
-Mouse speed control: enabled, unit: 1.000000, user value: 1.000000
-Multitasking mode set: 0
-Setting up window
-Multitasking mode set: 0
-Initialize sprites
-34135008 34135024 34135040 34135056 34135072 34135088
-Dest: 32 bpp, Gfx: 32 bpp, Blender: RGB to RGB, Stretched: false, Iters: 100000
-exec time (mills): 640
-
-Dest: 32 bpp, Gfx: 32 bpp, Blender: RGB to RGB, Stretched: true, Iters: 100000
-exec time (mills): 1552
-
-Dest: 32 bpp, Gfx: 32 bpp, Blender: Source Alpha, Stretched: false, Iters: 100000
-exec time (mills): 643
-
-Dest: 32 bpp, Gfx: 32 bpp, Blender: Source Alpha, Stretched: true, Iters: 100000
-exec time (mills): 1555
-
-Dest: 32 bpp, Gfx: 32 bpp, Blender: ARGB to ARGB, Stretched: false, Iters: 100000
-exec time (mills): 643
-
-Dest: 32 bpp, Gfx: 32 bpp, Blender: ARGB to ARGB, Stretched: true, Iters: 100000
-exec time (mills): 1556
-
-Dest: 32 bpp, Gfx: 32 bpp, Blender: Opaque, Stretched: false, Iters: 100000
-exec time (mills): 642
-
-Dest: 32 bpp, Gfx: 32 bpp, Blender: Opaque, Stretched: true, Iters: 100000
-exec time (mills): 1556
-
-Dest: 32 bpp, Gfx: 32 bpp, Blender: Tint with Light, Stretched: false, Iters: 100000
-exec time (mills): 644
-
-Dest: 32 bpp, Gfx: 32 bpp, Blender: Tint with Light, Stretched: true, Iters: 100000
-exec time (mills): 1554
-
-Dest: 32 bpp, Gfx: 16 bpp, Blender: RGB to RGB, Stretched: false, Iters: 100000
-exec time (mills): 2226
-
-Dest: 32 bpp, Gfx: 16 bpp, Blender: RGB to RGB, Stretched: true, Iters: 100000
-exec time (mills): 5666
-
-Dest: 32 bpp, Gfx: 16 bpp, Blender: Source Alpha, Stretched: false, Iters: 100000
-exec time (mills): 2227
-
-Dest: 32 bpp, Gfx: 16 bpp, Blender: Source Alpha, Stretched: true, Iters: 100000
-exec time (mills): 5675
-
-Dest: 32 bpp, Gfx: 16 bpp, Blender: ARGB to ARGB, Stretched: false, Iters: 100000
-exec time (mills): 2227
-
-Dest: 32 bpp, Gfx: 16 bpp, Blender: ARGB to ARGB, Stretched: true, Iters: 100000
-exec time (mills): 5665
-
-Dest: 32 bpp, Gfx: 16 bpp, Blender: Opaque, Stretched: false, Iters: 100000
-exec time (mills): 2225
-
-Dest: 32 bpp, Gfx: 16 bpp, Blender: Opaque, Stretched: true, Iters: 100000
-exec time (mills): 5666
-
-Dest: 32 bpp, Gfx: 16 bpp, Blender: Tint with Light, Stretched: false, Iters: 100000
-exec time (mills): 2227
-
-Dest: 32 bpp, Gfx: 16 bpp, Blender: Tint with Light, Stretched: true, Iters: 100000
-exec time (mills): 5664
-
-Dest: 32 bpp, Gfx: 8 bpp, Blender: RGB to RGB, Stretched: false, Iters: 100000
-exec time (mills): 1526
-
-Dest: 32 bpp, Gfx: 8 bpp, Blender: RGB to RGB, Stretched: true, Iters: 100000
-exec time (mills): 3850
-
-Dest: 32 bpp, Gfx: 8 bpp, Blender: Source Alpha, Stretched: false, Iters: 100000
-exec time (mills): 1526
-
-Dest: 32 bpp, Gfx: 8 bpp, Blender: Source Alpha, Stretched: true, Iters: 100000
-exec time (mills): 3819
-
-Dest: 32 bpp, Gfx: 8 bpp, Blender: ARGB to ARGB, Stretched: false, Iters: 100000
-exec time (mills): 1528
-
-Dest: 32 bpp, Gfx: 8 bpp, Blender: ARGB to ARGB, Stretched: true, Iters: 100000
-exec time (mills): 3817
-
-Dest: 32 bpp, Gfx: 8 bpp, Blender: Opaque, Stretched: false, Iters: 100000
-exec time (mills): 1528
-
-Dest: 32 bpp, Gfx: 8 bpp, Blender: Opaque, Stretched: true, Iters: 100000
-exec time (mills): 3819
-
-Dest: 32 bpp, Gfx: 8 bpp, Blender: Tint with Light, Stretched: false, Iters: 100000
-exec time (mills): 1528
-
-Dest: 32 bpp, Gfx: 8 bpp, Blender: Tint with Light, Stretched: true, Iters: 100000
-exec time (mills): 3818
-
-Dest: 16 bpp, Gfx: 32 bpp, Blender: RGB to RGB, Stretched: false, Iters: 100000
-exec time (mills): 2586
-
-Dest: 16 bpp, Gfx: 32 bpp, Blender: RGB to RGB, Stretched: true, Iters: 100000
-exec time (mills): 6620
-
-Dest: 16 bpp, Gfx: 32 bpp, Blender: Source Alpha, Stretched: false, Iters: 100000
-exec time (mills): 2585
-
-Dest: 16 bpp, Gfx: 32 bpp, Blender: Source Alpha, Stretched: true, Iters: 100000
-exec time (mills): 6619
-
-Dest: 16 bpp, Gfx: 32 bpp, Blender: ARGB to ARGB, Stretched: false, Iters: 100000
-exec time (mills): 2585
-
-Dest: 16 bpp, Gfx: 32 bpp, Blender: ARGB to ARGB, Stretched: true, Iters: 100000
-exec time (mills): 6622
-
-Dest: 16 bpp, Gfx: 32 bpp, Blender: Opaque, Stretched: false, Iters: 100000
-exec time (mills): 2586
-
-Dest: 16 bpp, Gfx: 32 bpp, Blender: Opaque, Stretched: true, Iters: 100000
-exec time (mills): 6623
-
-Dest: 16 bpp, Gfx: 32 bpp, Blender: Tint with Light, Stretched: false, Iters: 100000
-exec time (mills): 2584
-
-Dest: 16 bpp, Gfx: 32 bpp, Blender: Tint with Light, Stretched: true, Iters: 100000
-exec time (mills): 6622
-
-Dest: 16 bpp, Gfx: 16 bpp, Blender: RGB to RGB, Stretched: false, Iters: 100000
-exec time (mills): 699
-
-Dest: 16 bpp, Gfx: 16 bpp, Blender: RGB to RGB, Stretched: true, Iters: 100000
-exec time (mills): 1710
-
-Dest: 16 bpp, Gfx: 16 bpp, Blender: Source Alpha, Stretched: false, Iters: 100000
-exec time (mills): 698
-
-Dest: 16 bpp, Gfx: 16 bpp, Blender: Source Alpha, Stretched: true, Iters: 100000
-exec time (mills): 1707
-
-Dest: 16 bpp, Gfx: 16 bpp, Blender: ARGB to ARGB, Stretched: false, Iters: 100000
-exec time (mills): 699
-
-Dest: 16 bpp, Gfx: 16 bpp, Blender: ARGB to ARGB, Stretched: true, Iters: 100000
-exec time (mills): 1708
-
-Dest: 16 bpp, Gfx: 16 bpp, Blender: Opaque, Stretched: false, Iters: 100000
-exec time (mills): 698
-
-Dest: 16 bpp, Gfx: 16 bpp, Blender: Opaque, Stretched: true, Iters: 100000
-exec time (mills): 1714
-
-Dest: 16 bpp, Gfx: 16 bpp, Blender: Tint with Light, Stretched: false, Iters: 100000
-exec time (mills): 698
-
-Dest: 16 bpp, Gfx: 16 bpp, Blender: Tint with Light, Stretched: true, Iters: 100000
-exec time (mills): 1706
-
-Dest: 16 bpp, Gfx: 8 bpp, Blender: RGB to RGB, Stretched: false, Iters: 100000
-exec time (mills): 1527
-
-Dest: 16 bpp, Gfx: 8 bpp, Blender: RGB to RGB, Stretched: true, Iters: 100000
-exec time (mills): 3706
-
-Dest: 16 bpp, Gfx: 8 bpp, Blender: Source Alpha, Stretched: false, Iters: 100000
-exec time (mills): 1525
-
-Dest: 16 bpp, Gfx: 8 bpp, Blender: Source Alpha, Stretched: true, Iters: 100000
-exec time (mills): 3709
-
-Dest: 16 bpp, Gfx: 8 bpp, Blender: ARGB to ARGB, Stretched: false, Iters: 100000
-exec time (mills): 1523
-
-Dest: 16 bpp, Gfx: 8 bpp, Blender: ARGB to ARGB, Stretched: true, Iters: 100000
-exec time (mills): 3709
-
-Dest: 16 bpp, Gfx: 8 bpp, Blender: Opaque, Stretched: false, Iters: 100000
-exec time (mills): 1526
-
-Dest: 16 bpp, Gfx: 8 bpp, Blender: Opaque, Stretched: true, Iters: 100000
-exec time (mills): 3705
-
-Dest: 16 bpp, Gfx: 8 bpp, Blender: Tint with Light, Stretched: false, Iters: 100000
-exec time (mills): 1524
-
-Dest: 16 bpp, Gfx: 8 bpp, Blender: Tint with Light, Stretched: true, Iters: 100000
-exec time (mills): 3706
-
-Dest: 8 bpp, Gfx: 8 bpp, Blender: RGB to RGB, Stretched: false, Iters: 100000
-exec time (mills): 629
-
-Dest: 8 bpp, Gfx: 8 bpp, Blender: RGB to RGB, Stretched: true, Iters: 100000
-exec time (mills): 1546
-
-Dest: 8 bpp, Gfx: 8 bpp, Blender: Source Alpha, Stretched: false, Iters: 100000
-exec time (mills): 629
-
-Dest: 8 bpp, Gfx: 8 bpp, Blender: Source Alpha, Stretched: true, Iters: 100000
-exec time (mills): 1546
-
-Dest: 8 bpp, Gfx: 8 bpp, Blender: ARGB to ARGB, Stretched: false, Iters: 100000
-exec time (mills): 628
-
-Dest: 8 bpp, Gfx: 8 bpp, Blender: ARGB to ARGB, Stretched: true, Iters: 100000
-exec time (mills): 1547
-
-Dest: 8 bpp, Gfx: 8 bpp, Blender: Opaque, Stretched: false, Iters: 100000
-exec time (mills): 628
-
-Dest: 8 bpp, Gfx: 8 bpp, Blender: Opaque, Stretched: true, Iters: 100000
-exec time (mills): 1547
-
-Dest: 8 bpp, Gfx: 8 bpp, Blender: Tint with Light, Stretched: false, Iters: 100000
-exec time (mills): 629
-
-Dest: 8 bpp, Gfx: 8 bpp, Blender: Tint with Light, Stretched: true, Iters: 100000
-exec time (mills): 1547
-
-Engine initialization complete
-Starting game
-WARNING: channel 2 - same clip assigned
-Quitting the game...
-***** ENGINE HAS SHUTDOWN
diff --git a/bench_output_fast.txt b/bench_output_fast.txt
deleted file mode 100644
index 6449b1a93c2..00000000000
--- a/bench_output_fast.txt
+++ /dev/null
@@ -1,251 +0,0 @@
-User picked target 'kq2agdi' (engine ID 'ags', game ID 'kq2agdi')...
-   Looking for a plugin supporting this target... Adventure Game Studio
-Running King's Quest II: Romancing the Stones Remake (English)
-kq2vga.exe: 40cfb7563df7dacf6530b19289a4745b, 12574643 bytes.
-Initializing backend libs
-Initializing game data
-Opened game data file: game28.dta
-Game data version: 42
-Compiled with: 3.2.0
-Startup directory: ./
-Data directory: ./
-Setting up game configuration
-Voice pack found: speech.vox
-audio.vox found and initialized.
-Initializing TTF renderer
-Initializing mouse: number of buttons reported is 3
-Install timer
-Initialize legacy path finder library
-Game title: 'King's Quest II'
-Game uid (old format): `1025889151`
-Game guid: '{b85ea0b0-35c5-4e53-bfc7-2281bf481001}'
-Game GUI version: 115
-Lipsync data found and loaded
-Checking for disk space
-Game native resolution: 320 x 200 (32 bit)
-Graphic settings: driver: Software, windowed: no, screen size: 0 x 0, game scale: proportional
-Graphic settings: refresh rate (optional): 0, vsync: 0
-Requested graphics driver 'Software' not found, will try existing drivers instead
-Graphics mode set: 320 x 200 (32-bit) fullscreen desktop
-Graphics mode set: refresh rate (optional): 0, vsync: 0
-Mouse speed control: enabled, unit: 1.000000, user value: 1.000000
-Multitasking mode set: 0
-Setting up window
-Multitasking mode set: 0
-Initialize sprites
-55839744 55839760 55839776 55839792 55839808 55839824
-Dest: 32 bpp, Gfx: 32 bpp, Blender: RGB to RGB, Stretched: false, Iters: 100000
-exec time (mills): 622
-
-Dest: 32 bpp, Gfx: 32 bpp, Blender: RGB to RGB, Stretched: true, Iters: 100000
-exec time (mills): 1546
-
-Dest: 32 bpp, Gfx: 32 bpp, Blender: Source Alpha, Stretched: false, Iters: 100000
-exec time (mills): 640
-
-Dest: 32 bpp, Gfx: 32 bpp, Blender: Source Alpha, Stretched: true, Iters: 100000
-exec time (mills): 1545
-
-Dest: 32 bpp, Gfx: 32 bpp, Blender: ARGB to ARGB, Stretched: false, Iters: 100000
-exec time (mills): 639
-
-Dest: 32 bpp, Gfx: 32 bpp, Blender: ARGB to ARGB, Stretched: true, Iters: 100000
-exec time (mills): 1546
-
-Dest: 32 bpp, Gfx: 32 bpp, Blender: Opaque, Stretched: false, Iters: 100000
-exec time (mills): 640
-
-Dest: 32 bpp, Gfx: 32 bpp, Blender: Opaque, Stretched: true, Iters: 100000
-exec time (mills): 1545
-
-Dest: 32 bpp, Gfx: 32 bpp, Blender: Tint with Light, Stretched: false, Iters: 100000
-exec time (mills): 640
-
-Dest: 32 bpp, Gfx: 32 bpp, Blender: Tint with Light, Stretched: true, Iters: 100000
-exec time (mills): 1545
-
-Dest: 32 bpp, Gfx: 16 bpp, Blender: RGB to RGB, Stretched: false, Iters: 100000
-exec time (mills): 2213
-
-Dest: 32 bpp, Gfx: 16 bpp, Blender: RGB to RGB, Stretched: true, Iters: 100000
-exec time (mills): 5618
-
-Dest: 32 bpp, Gfx: 16 bpp, Blender: Source Alpha, Stretched: false, Iters: 100000
-exec time (mills): 2212
-
-Dest: 32 bpp, Gfx: 16 bpp, Blender: Source Alpha, Stretched: true, Iters: 100000
-exec time (mills): 5621
-
-Dest: 32 bpp, Gfx: 16 bpp, Blender: ARGB to ARGB, Stretched: false, Iters: 100000
-exec time (mills): 2214
-
-Dest: 32 bpp, Gfx: 16 bpp, Blender: ARGB to ARGB, Stretched: true, Iters: 100000
-exec time (mills): 5619
-
-Dest: 32 bpp, Gfx: 16 bpp, Blender: Opaque, Stretched: false, Iters: 100000
-exec time (mills): 2213
-
-Dest: 32 bpp, Gfx: 16 bpp, Blender: Opaque, Stretched: true, Iters: 100000
-exec time (mills): 5619
-
-Dest: 32 bpp, Gfx: 16 bpp, Blender: Tint with Light, Stretched: false, Iters: 100000
-exec time (mills): 2212
-
-Dest: 32 bpp, Gfx: 16 bpp, Blender: Tint with Light, Stretched: true, Iters: 100000
-exec time (mills): 5618
-
-Dest: 32 bpp, Gfx: 8 bpp, Blender: RGB to RGB, Stretched: false, Iters: 100000
-exec time (mills): 1526
-
-Dest: 32 bpp, Gfx: 8 bpp, Blender: RGB to RGB, Stretched: true, Iters: 100000
-exec time (mills): 3688
-
-Dest: 32 bpp, Gfx: 8 bpp, Blender: Source Alpha, Stretched: false, Iters: 100000
-exec time (mills): 1524
-
-Dest: 32 bpp, Gfx: 8 bpp, Blender: Source Alpha, Stretched: true, Iters: 100000
-exec time (mills): 3687
-
-Dest: 32 bpp, Gfx: 8 bpp, Blender: ARGB to ARGB, Stretched: false, Iters: 100000
-exec time (mills): 1528
-
-Dest: 32 bpp, Gfx: 8 bpp, Blender: ARGB to ARGB, Stretched: true, Iters: 100000
-exec time (mills): 3687
-
-Dest: 32 bpp, Gfx: 8 bpp, Blender: Opaque, Stretched: false, Iters: 100000
-exec time (mills): 1526
-
-Dest: 32 bpp, Gfx: 8 bpp, Blender: Opaque, Stretched: true, Iters: 100000
-exec time (mills): 3686
-
-Dest: 32 bpp, Gfx: 8 bpp, Blender: Tint with Light, Stretched: false, Iters: 100000
-exec time (mills): 1525
-
-Dest: 32 bpp, Gfx: 8 bpp, Blender: Tint with Light, Stretched: true, Iters: 100000
-exec time (mills): 3688
-
-Dest: 16 bpp, Gfx: 32 bpp, Blender: RGB to RGB, Stretched: false, Iters: 100000
-exec time (mills): 2571
-
-Dest: 16 bpp, Gfx: 32 bpp, Blender: RGB to RGB, Stretched: true, Iters: 100000
-exec time (mills): 6575
-
-Dest: 16 bpp, Gfx: 32 bpp, Blender: Source Alpha, Stretched: false, Iters: 100000
-exec time (mills): 2571
-
-Dest: 16 bpp, Gfx: 32 bpp, Blender: Source Alpha, Stretched: true, Iters: 100000
-exec time (mills): 6574
-
-Dest: 16 bpp, Gfx: 32 bpp, Blender: ARGB to ARGB, Stretched: false, Iters: 100000
-exec time (mills): 2570
-
-Dest: 16 bpp, Gfx: 32 bpp, Blender: ARGB to ARGB, Stretched: true, Iters: 100000
-exec time (mills): 6574
-
-Dest: 16 bpp, Gfx: 32 bpp, Blender: Opaque, Stretched: false, Iters: 100000
-exec time (mills): 2570
-
-Dest: 16 bpp, Gfx: 32 bpp, Blender: Opaque, Stretched: true, Iters: 100000
-exec time (mills): 6575
-
-Dest: 16 bpp, Gfx: 32 bpp, Blender: Tint with Light, Stretched: false, Iters: 100000
-exec time (mills): 2570
-
-Dest: 16 bpp, Gfx: 32 bpp, Blender: Tint with Light, Stretched: true, Iters: 100000
-exec time (mills): 6574
-
-Dest: 16 bpp, Gfx: 16 bpp, Blender: RGB to RGB, Stretched: false, Iters: 100000
-exec time (mills): 693
-
-Dest: 16 bpp, Gfx: 16 bpp, Blender: RGB to RGB, Stretched: true, Iters: 100000
-exec time (mills): 1695
-
-Dest: 16 bpp, Gfx: 16 bpp, Blender: Source Alpha, Stretched: false, Iters: 100000
-exec time (mills): 692
-
-Dest: 16 bpp, Gfx: 16 bpp, Blender: Source Alpha, Stretched: true, Iters: 100000
-exec time (mills): 1695
-
-Dest: 16 bpp, Gfx: 16 bpp, Blender: ARGB to ARGB, Stretched: false, Iters: 100000
-exec time (mills): 693
-
-Dest: 16 bpp, Gfx: 16 bpp, Blender: ARGB to ARGB, Stretched: true, Iters: 100000
-exec time (mills): 1694
-
-Dest: 16 bpp, Gfx: 16 bpp, Blender: Opaque, Stretched: false, Iters: 100000
-exec time (mills): 691
-
-Dest: 16 bpp, Gfx: 16 bpp, Blender: Opaque, Stretched: true, Iters: 100000
-exec time (mills): 1695
-
-Dest: 16 bpp, Gfx: 16 bpp, Blender: Tint with Light, Stretched: false, Iters: 100000
-exec time (mills): 692
-
-Dest: 16 bpp, Gfx: 16 bpp, Blender: Tint with Light, Stretched: true, Iters: 100000
-exec time (mills): 1695
-
-Dest: 16 bpp, Gfx: 8 bpp, Blender: RGB to RGB, Stretched: false, Iters: 100000
-exec time (mills): 1514
-
-Dest: 16 bpp, Gfx: 8 bpp, Blender: RGB to RGB, Stretched: true, Iters: 100000
-exec time (mills): 3688
-
-Dest: 16 bpp, Gfx: 8 bpp, Blender: Source Alpha, Stretched: false, Iters: 100000
-exec time (mills): 1513
-
-Dest: 16 bpp, Gfx: 8 bpp, Blender: Source Alpha, Stretched: true, Iters: 100000
-exec time (mills): 3688
-
-Dest: 16 bpp, Gfx: 8 bpp, Blender: ARGB to ARGB, Stretched: false, Iters: 100000
-exec time (mills): 1517
-
-Dest: 16 bpp, Gfx: 8 bpp, Blender: ARGB to ARGB, Stretched: true, Iters: 100000
-exec time (mills): 3687
-
-Dest: 16 bpp, Gfx: 8 bpp, Blender: Opaque, Stretched: false, Iters: 100000
-exec time (mills): 1519
-
-Dest: 16 bpp, Gfx: 8 bpp, Blender: Opaque, Stretched: true, Iters: 100000
-exec time (mills): 3688
-
-Dest: 16 bpp, Gfx: 8 bpp, Blender: Tint with Light, Stretched: false, Iters: 100000
-exec time (mills): 1512
-
-Dest: 16 bpp, Gfx: 8 bpp, Blender: Tint with Light, Stretched: true, Iters: 100000
-exec time (mills): 3688
-
-Dest: 8 bpp, Gfx: 8 bpp, Blender: RGB to RGB, Stretched: false, Iters: 100000
-exec time (mills): 625
-
-Dest: 8 bpp, Gfx: 8 bpp, Blender: RGB to RGB, Stretched: true, Iters: 100000
-exec time (mills): 1540
-
-Dest: 8 bpp, Gfx: 8 bpp, Blender: Source Alpha, Stretched: false, Iters: 100000
-exec time (mills): 626
-
-Dest: 8 bpp, Gfx: 8 bpp, Blender: Source Alpha, Stretched: true, Iters: 100000
-exec time (mills): 1540
-
-Dest: 8 bpp, Gfx: 8 bpp, Blender: ARGB to ARGB, Stretched: false, Iters: 100000
-exec time (mills): 625
-
-Dest: 8 bpp, Gfx: 8 bpp, Blender: ARGB to ARGB, Stretched: true, Iters: 100000
-exec time (mills): 1540
-
-Dest: 8 bpp, Gfx: 8 bpp, Blender: Opaque, Stretched: false, Iters: 100000
-exec time (mills): 625
-
-Dest: 8 bpp, Gfx: 8 bpp, Blender: Opaque, Stretched: true, Iters: 100000
-exec time (mills): 1539
-
-Dest: 8 bpp, Gfx: 8 bpp, Blender: Tint with Light, Stretched: false, Iters: 100000
-exec time (mills): 625
-
-Dest: 8 bpp, Gfx: 8 bpp, Blender: Tint with Light, Stretched: true, Iters: 100000
-exec time (mills): 1540
-
-Engine initialization complete
-Starting game
-WARNING: channel 2 - same clip assigned
-Quitting the game...
-***** ENGINE HAS SHUTDOWN
diff --git a/benchgfx32.bmp b/benchgfx32.bmp
deleted file mode 100644
index 488896208a8..00000000000
Binary files a/benchgfx32.bmp and /dev/null differ
diff --git a/benchgfx8.bmp b/benchgfx8.bmp
deleted file mode 100644
index aea80181e41..00000000000
Binary files a/benchgfx8.bmp and /dev/null differ
diff --git a/engines/ags/engine/main/engine.cpp b/engines/ags/engine/main/engine.cpp
index 11c996fdcf5..f9255b0ec1e 100644
--- a/engines/ags/engine/main/engine.cpp
+++ b/engines/ags/engine/main/engine.cpp
@@ -24,7 +24,6 @@
 //
 
 #include "ags/lib/allegro/color.h"
-#include "ags/lib/std/chrono.h"
 #include "ags/shared/core/platform.h"
 #include "ags/lib/allegro.h" // allegro_install and _exit
 #include "ags/engine/ac/asset_helper.h"
@@ -73,7 +72,6 @@
 #include "ags/engine/platform/base/ags_platform_driver.h"
 #include "ags/shared/util/directory.h"
 #include "ags/shared/util/error.h"
-#include "ags/shared/util/file.h"
 #include "ags/shared/util/path.h"
 #include "ags/shared/util/string_utils.h"
 #include "ags/ags.h"
diff --git a/engines/ags/tests/test_gfx.cpp b/engines/ags/tests/test_gfx.cpp
index a78d0d6960a..547ffbab6a0 100644
--- a/engines/ags/tests/test_gfx.cpp
+++ b/engines/ags/tests/test_gfx.cpp
@@ -21,6 +21,7 @@
 
 #include "ags/shared/debugging/out.h"
 #include "common/scummsys.h"
+#include "common/debug.h"
 #include "ags/shared/core/platform.h"
 #include "ags/shared/gfx/gfx_def.h"
 //#include "ags/shared/debugging/assert.h"
@@ -36,7 +37,11 @@
 #include "graphics/pixelformat.h"
 
 #ifdef __aarch64__
+#define OPT_NEON
 #include "ags/lib/allegro/surface_simd_neon.h"
+#elif defined(__x86_64__) || defined(__i686__)
+#define OPT_SSE
+#include "ags/lib/allegro/surface_simd_sse.h"
 #endif
 
 namespace AGS3 {
@@ -44,17 +49,14 @@ namespace AGS3 {
 namespace GfxDef = AGS::Shared::GfxDef;
 using namespace AGS::Shared;
 
-// This is so that it debug info can be printed
-// If there is a better way of doing this, please change it
-#undef printf
 // Comment this out if you don't want the console to be clogged with info durning tests
 #define VERBOSE_TEST_GFX
 
 void Test_GfxSpeed(bool opt, int blenderModeStart, int blenderModeEnd) {
 	_G(_bitmap_simd_optimizations) = opt;
 #ifdef VERBOSE_TEST_GFX
-	if (opt) printf("SIMD optimizations: true\n");
-	else printf("SIMD optmizations: false\n");
+	if (opt) debug("SIMD optimizations: true\n");
+	else debug("SIMD optmizations: false\n");
 #endif
 	Bitmap *benchgfx32 = BitmapHelper::CreateBitmap(100, 100, 32);
 	Bitmap *benchgfx16 = BitmapHelper::CreateBitmapCopy(benchgfx32, 16);
@@ -62,7 +64,7 @@ void Test_GfxSpeed(bool opt, int blenderModeStart, int blenderModeEnd) {
 	Bitmap *dest32 = BitmapHelper::CreateBitmap(100, 100, 32);
 	Bitmap *dest16 = BitmapHelper::CreateBitmap(100, 100, 16);
 	Bitmap *dest8 = BitmapHelper::CreateBitmap(100, 100, 8);
-	Debug::Printf(kDbgMsg_Info, "%d %d %d %d %d %d", benchgfx32, benchgfx16, benchgfx8, dest32, dest16, dest8);
+	debug("%d %d %d %d %d %d", benchgfx32, benchgfx16, benchgfx8, dest32, dest16, dest8);
 	int benchRuns[] = {1000, 10000, 100000};
 	int blenderModes[] = {kRgbToRgbBlender, kSourceAlphaBlender, kArgbToArgbBlender, kOpaqueBlenderMode, kTintLightBlenderMode};
 	const char *modeNames[] = {"RGB to RGB", "Source Alpha", "ARGB to ARGB", "Opaque", "Tint with Light"};
@@ -77,22 +79,22 @@ void Test_GfxSpeed(bool opt, int blenderModeStart, int blenderModeEnd) {
 					uint32 start, end;
 					_G(_blender_mode) = (AGS3::BlenderMode)blenderModes[mode];
 #ifdef VERBOSE_TEST_GFX
-					if (runs == 2) printf("Dest: %d bpp, Gfx: %d bpp, Blender: %s, Stretched: false, Iters: %d\n", bpps[dest], bpps[gfx], modeNames[mode], benchRuns[runs]);
+					if (runs == 2) debug("Dest: %d bpp, Gfx: %d bpp, Blender: %s, Stretched: false, Iters: %d\n", bpps[dest], bpps[gfx], modeNames[mode], benchRuns[runs]);
 #endif
 					start = std::chrono::high_resolution_clock::now();
 					for (int i = 0; i < benchRuns[runs]; i++)
 						destinations[dest]->Blit(graphics[gfx], 0, 0, kBitmap_Transparency);
 					end = std::chrono::high_resolution_clock::now();
 #ifdef VERBOSE_TEST_GFX
-					if (runs == 2) printf("exec time (mills): %u\n\n", end - start);
-					if (runs == 2) printf("Dest: %d bpp, Gfx: %d bpp, Blender: %s, Stretched: true, Iters: %d\n", bpps[dest], bpps[gfx], modeNames[mode], benchRuns[runs]);
+					if (runs == 2) debug("exec time (mills): %u\n\n", end - start);
+					if (runs == 2) debug("Dest: %d bpp, Gfx: %d bpp, Blender: %s, Stretched: true, Iters: %d\n", bpps[dest], bpps[gfx], modeNames[mode], benchRuns[runs]);
 #endif
 					start = std::chrono::high_resolution_clock::now();
 					for (int i = 0; i < benchRuns[runs]; i++)
 						destinations[dest]->StretchBlt(graphics[gfx], Rect(0, 0, 99, 99), kBitmap_Transparency);
 					end = std::chrono::high_resolution_clock::now();
 #ifdef VERBOSE_TEST_GFX
-					if (runs == 2) printf("exec time (mills): %u\n\n", end - start);
+					if (runs == 2) debug("exec time (mills): %u\n\n", end - start);
 #endif
 				}
 			}
@@ -150,37 +152,49 @@ void Test_BlenderModes() {
 											r16 >>= 3; g16 >>= 2; b16 >>= 3;
 											control2bppCol = b16 | (g16 << 5) | (r16 << 11);
 											{
+#ifdef OPT_NEON
 												uint32x4_t src = vdupq_n_u32(srcB | (srcG << 8) | (srcR << 16) | (srcA << 24));
 												uint32x4_t dest = vdupq_n_u32(destB | (destG << 8) | (destR << 16) | (destA << 24));
 												uint32x4_t alphas = vdupq_n_u32(alpha);
 												simdCol = vgetq_lane_u32(blendPixelSIMD(src, dest, alphas), 0);
+#else
+												//__m128i src = _mm_set1_epi32(srcB | (srcG << 8) | (srcR << 16) | (srcA << 24));
+												//__m128i dest = _mm_set1_epi32(destB | (destG << 8) | (destR << 16) | (destA << 24));
+												//__m128i alphas = _mm_set1_epi32(alpha);
+												//simdCol = _mm_extract_epi32();
+												simdCol = controlCol; // Not implemented yet
+#endif
 											}
 											{
+#ifdef OPT_NEON
 												uint16x8_t src = vdupq_n_u16((srcB >> 3) | ((srcG >> 2) << 5) | ((srcR >> 3) << 11));
 												uint16x8_t dest = vdupq_n_u16((destB >> 3) | ((destG >> 2) << 5) | ((destR >> 3) << 11));
 												uint16x8_t alphas = vdupq_n_u16((uint16)alpha);
 												simd2bppCol = vgetq_lane_u16(blendPixelSIMD2Bpp(src, dest, alphas), 0);
+#else
+												simd2bppCol = control2bppCol; // Not implemented yet
+#endif
 											}
 #ifdef VERBOSE_TEST_GFX
-											printf("src argb: %d, %d, %d, %d dest argb: %d, %d, %d, %d a: %d\n", srcA, srcR, srcG, srcB, destA, destR, destG, destB, alpha);
+											debug("src argb: %d, %d, %d, %d dest argb: %d, %d, %d, %d a: %d\n", srcA, srcR, srcG, srcB, destA, destR, destG, destB, alpha);
 #endif
 											switch ((BlenderMode)blenderMode) {
-												case kSourceAlphaBlender: printf("blenderMode: kSourceAlphaBlender\n"); break;
-												case kArgbToArgbBlender: printf("blenderMode: kArgbToArgbBlender\n"); break;
-												case kArgbToRgbBlender: printf("blenderMode: kArgbToRgbBlender\n"); break;
-												case kRgbToArgbBlender: printf("blenderMode: kRgbToArgbBlender\n"); break;
-												case kRgbToRgbBlender: printf("blenderMode: kRgbToRgbBlender\n"); break;
-												case kAlphaPreservedBlenderMode: printf("blenderMode: kAlphaPreservedBlenderMode\n"); break;
-												case kOpaqueBlenderMode: printf("blenderMode: kOpaqueBlenderMode\n"); break;
-												case kAdditiveBlenderMode: printf("blenderMode: kAdditiveBlenderMode\n"); break;
-												case kTintBlenderMode: printf("blenderMode: kTintBlenderMode\n"); break;
-												case kTintLightBlenderMode: printf("blenderMode: kTintLightBlenderMode\n"); break;
+												case kSourceAlphaBlender: debug("blenderMode: kSourceAlphaBlender\n"); break;
+												case kArgbToArgbBlender: debug("blenderMode: kArgbToArgbBlender\n"); break;
+												case kArgbToRgbBlender: debug("blenderMode: kArgbToRgbBlender\n"); break;
+												case kRgbToArgbBlender: debug("blenderMode: kRgbToArgbBlender\n"); break;
+												case kRgbToRgbBlender: debug("blenderMode: kRgbToRgbBlender\n"); break;
+												case kAlphaPreservedBlenderMode: debug("blenderMode: kAlphaPreservedBlenderMode\n"); break;
+												case kOpaqueBlenderMode: debug("blenderMode: kOpaqueBlenderMode\n"); break;
+												case kAdditiveBlenderMode: debug("blenderMode: kAdditiveBlenderMode\n"); break;
+												case kTintBlenderMode: debug("blenderMode: kTintBlenderMode\n"); break;
+												case kTintLightBlenderMode: debug("blenderMode: kTintLightBlenderMode\n"); break;
 											}
 #ifdef VERBOSE_TEST_GFX
-											printf("controlCol %x argb: %d, %d, %d, %d\n", controlCol, a, r, g, b);
-											printf("simdCol %x argb: %d, %d, %d, %d\n", simdCol, (simdCol >> 24), ((simdCol >> 16) & 0xff), ((simdCol >> 8) & 0xff), (simdCol & 0xff));
-											printf("control2bppCol %x rgb: %d, %d, %d\n", control2bppCol, r16, g16, b16);
-											printf("simd2bppCol %x rgb: %d, %d, %d\n\n", simd2bppCol, (simd2bppCol >> 11), ((simd2bppCol >> 5) & 0x3f), (simd2bppCol & 0x1f));
+											debug("controlCol %x argb: %d, %d, %d, %d\n", controlCol, a, r, g, b);
+											debug("simdCol %x argb: %d, %d, %d, %d\n", simdCol, (simdCol >> 24), ((simdCol >> 16) & 0xff), ((simdCol >> 8) & 0xff), (simdCol & 0xff));
+											debug("control2bppCol %x rgb: %d, %d, %d\n", control2bppCol, r16, g16, b16);
+											debug("simd2bppCol %x rgb: %d, %d, %d\n\n", simd2bppCol, (simd2bppCol >> 11), ((simd2bppCol >> 5) & 0x3f), (simd2bppCol & 0x1f));
 #endif
 											int tolerance, tolerance16;
 											switch ((BlenderMode)blenderMode) {
@@ -245,6 +259,7 @@ void Test_GfxTransparency() {
 
 void Test_Gfx() {
 	Test_GfxTransparency();
+#if defined(OPT_NEON) || defined(OPT_SSE)
 	Test_DrawingLoops();
 	Test_BlenderModes();
 	// This could take a LONG time
@@ -254,6 +269,7 @@ void Test_Gfx() {
 		Test_GfxSpeed(true, 0, kTintLightBlenderMode);
 	}
 	_G(_bitmap_simd_optimizations) = has_simd;
+#endif
 }
 
 } // namespace AGS3
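
The #else branch in Test_BlenderModes above leaves the SSE lane extraction as a commented-out placeholder, since SSE2 has no _mm_extract_epi32 (that intrinsic is SSE4.1). A minimal standalone sketch of how the lane-0 comparison could be done with plain SSE2; blendPixelSSE2 below is a hypothetical stand-in, not the engine's blender:

#include <emmintrin.h> // SSE2
#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for the real per-pixel blender; it exists only so
// the lane-extraction pattern below is complete and compilable.
static inline __m128i blendPixelSSE2(__m128i src, __m128i dest, __m128i alphas) {
	(void)dest; (void)alphas;
	return src;
}

int main() {
	uint32_t srcCol = 0x80ff4020, destCol = 0xff102030, alpha = 128;
	__m128i src = _mm_set1_epi32((int)srcCol);
	__m128i dest = _mm_set1_epi32((int)destCol);
	__m128i alphas = _mm_set1_epi32((int)alpha);
	// Lane 0 comes out via _mm_cvtsi128_si32; other lanes would first be
	// rotated into position with _mm_shuffle_epi32.
	uint32_t simdCol = (uint32_t)_mm_cvtsi128_si32(blendPixelSSE2(src, dest, alphas));
	printf("simdCol: %08x\n", simdCol);
	return 0;
}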


Commit: ace1a346cc1973a75417b0bb378de88e2dc89541
    https://github.com/scummvm/scummvm/commit/ace1a346cc1973a75417b0bb378de88e2dc89541
Author: wyatt-radkiewicz (wyattwradkiewicz at gmail.com)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: Finished SSE2 blending optimizations

Finished writing the code in surface_simd_sse.cpp. I also added a
fallback in case no SIMD extensions are found on the processor; in that
case it just defaults to the normal drawInnerGeneric. I also made
drawInnerGeneric a bit faster by moving the bytes-per-pixel checks into
compile-time template parameters. Tests were changed to also include SSE2.
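
A minimal standalone sketch of the two ideas in this commit, the run-time fallback and the compile-time bytes-per-pixel template parameters; the names here (blitGeneric, blitSSE2, g_simdAvailable) are illustrative, not the engine's API:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Stand-in for drawInnerGeneric: making the pixel sizes template
// parameters turns the per-pixel format checks into compile-time
// constants, so the inner loop carries no branch for them at run time.
template<int DestBytesPerPixel, int SrcBytesPerPixel>
static void blitGeneric(uint8_t *dst, const uint8_t *src, int pixels) {
	for (int i = 0; i < pixels; ++i) {
		uint32_t col = 0;
		std::memcpy(&col, src + i * SrcBytesPerPixel, SrcBytesPerPixel);
		if (SrcBytesPerPixel == 2) {
			// A 2 bpp source would be widened to 8-bit channels here; the
			// branch itself is resolved by the compiler, not per pixel.
		}
		std::memcpy(dst + i * DestBytesPerPixel, &col, DestBytesPerPixel);
	}
}

// Stand-in for the vectorized routine; the real one processes four
// pixels per __m128i load/store.
static void blitSSE2(uint8_t *dst, const uint8_t *src, int pixels) {
	blitGeneric<4, 4>(dst, src, pixels);
}

static bool g_simdAvailable = false; // set by the CPUID check at startup

// Dispatcher: prefer the SIMD routine when it was detected, otherwise
// fall back to the scalar template above.
static void blit32to32(uint8_t *dst, const uint8_t *src, int pixels) {
	if (g_simdAvailable)
		blitSSE2(dst, src, pixels);          // drawInner4BppWithConv<4, 4, 0> in the engine
	else
		blitGeneric<4, 4>(dst, src, pixels); // drawInnerGeneric<4, 4, 0> in the engine
}

int main() {
	uint8_t src[16] = {0x20, 0x40, 0xff, 0x80}, dst[16] = {};
	blit32to32(dst, src, 4);
	uint32_t first = 0;
	std::memcpy(&first, dst, 4);
	printf("first pixel: %08x\n", first);
	return 0;
}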

Changed paths:
  A engines/ags/lib/allegro/surface_simd_none.cpp
    engines/ags/ags.h
    engines/ags/globals.cpp
    engines/ags/lib/allegro/surface.cpp
    engines/ags/lib/allegro/surface.h
    engines/ags/lib/allegro/surface_simd_neon.h
    engines/ags/lib/allegro/surface_simd_sse.cpp
    engines/ags/lib/allegro/surface_simd_sse.h
    engines/ags/tests/test_gfx.cpp
    engines/ags/tests/test_memory.cpp
    engines/ags/tests/test_string.cpp


diff --git a/engines/ags/ags.h b/engines/ags/ags.h
index 155eb087a0d..bd786b958a7 100644
--- a/engines/ags/ags.h
+++ b/engines/ags/ags.h
@@ -37,9 +37,6 @@
 #include "ags/shared/gfx/bitmap.h"
 #include "ags/lib/allegro/system.h"
 
-// DEBUG: @eklipsed TAKE OUT!!!
-//#define ENABLE_AGS_TESTS 1
-
 namespace AGS3 {
 class Globals;
 }
diff --git a/engines/ags/globals.cpp b/engines/ags/globals.cpp
index 34dfc60e670..14257f6fb6c 100644
--- a/engines/ags/globals.cpp
+++ b/engines/ags/globals.cpp
@@ -113,7 +113,6 @@ static bool checkForSIMDExtensions() {
 		 : "=rm" (extensions)
 		 :
 		 : "eax", "ebx", "ecx", "edx");
-	debug("extensions_bits: %ux\n", extensions);
 	return extensions & (1 << 26); // SSE2 extensions bit
 #  elif _MSC_VER
 	uint32 extensions;
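
The check above reads CPUID leaf 1 and tests EDX bit 26, the SSE2 feature flag. For reference, a small standalone equivalent using GCC/Clang's <cpuid.h> helper rather than hand-written inline asm (illustrative only, not the engine's code):

#include <cstdio>
#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
#include <cpuid.h>
#endif

// Returns true when CPUID leaf 1 reports SSE2 (EDX bit 26).
static bool hasSSE2() {
#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
	unsigned int eax, ebx, ecx, edx;
	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return false;
	return (edx & (1u << 26)) != 0;
#else
	return false; // non-x86 or non-GCC/Clang: assume no SSE2
#endif
}

int main() {
	printf("SSE2: %s\n", hasSSE2() ? "yes" : "no");
	return 0;
}
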
diff --git a/engines/ags/lib/allegro/surface.cpp b/engines/ags/lib/allegro/surface.cpp
index feaad80ea14..1fcdacf8b05 100644
--- a/engines/ags/lib/allegro/surface.cpp
+++ b/engines/ags/lib/allegro/surface.cpp
@@ -106,7 +106,7 @@ void BITMAP::floodfill(int x, int y, int color) {
 
 const int SCALE_THRESHOLD = 0x100;
 #define VGA_COLOR_TRANS(x) ((x) * 255 / 63)
-template<int ScaleThreshold>
+template<int DestBytesPerPixel, int SrcBytesPerPixel, int ScaleThreshold>
 void BITMAP::drawInnerGeneric(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
 	const int xDir = horizFlip ? -1 : 1;
 	byte rSrc, gSrc, bSrc, aSrc;
@@ -120,7 +120,7 @@ void BITMAP::drawInnerGeneric(int yStart, int xStart, uint32_t transColor, uint3
 	}
 	if (xStart < 0) { // Clip the left
 		xCtrStart = -xStart;
-		xCtrBppStart = xCtrStart * src.format.bytesPerPixel;
+		xCtrBppStart = xCtrStart * SrcBytesPerPixel;
 		xStart = 0;
 	}
 	int destY = yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = dstRect.height();
@@ -151,25 +151,25 @@ void BITMAP::drawInnerGeneric(int yStart, int xStart, uint32_t transColor, uint3
 			}
 		}
 		// Loop through the pixels of the row
-		for (int destX = xStart, xCtr = xCtrStart, xCtrBpp = xCtrBppStart, scaleXCtr = xCtr * scaleX; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += src.format.bytesPerPixel, scaleXCtr += scaleX) {
+		for (int destX = xStart, xCtr = xCtrStart, xCtrBpp = xCtrBppStart, scaleXCtr = xCtr * scaleX; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += SrcBytesPerPixel, scaleXCtr += scaleX) {
 			const byte *srcVal = srcP + xDir * xCtrBpp;
 			if (ScaleThreshold != 0) {
-				srcVal = srcP + (scaleXCtr / ScaleThreshold) * src.format.bytesPerPixel;
+				srcVal = srcP + (scaleXCtr / ScaleThreshold) * SrcBytesPerPixel;
 			}
-			uint32 srcCol = getColor(srcVal, src.format.bytesPerPixel);
+			uint32 srcCol = getColor(srcVal, SrcBytesPerPixel);
 
 			// Check if this is a transparent color we should skip
 			if (skipTrans && ((srcCol & alphaMask) == transColor))
 				continue;
 
-			byte *destVal = (byte *)&destP[destX * format.bytesPerPixel];
+			byte *destVal = (byte *)&destP[destX * DestBytesPerPixel];
 
 			// When blitting to the same format we can just copy the color
-			if (format.bytesPerPixel == 1) {
+			if (DestBytesPerPixel == 1) {
 				*destVal = srcCol;
 				continue;
-			} else if (sameFormat && srcAlpha == -1) {
-				if (format.bytesPerPixel == 4)
+			} else if ((DestBytesPerPixel == SrcBytesPerPixel) && srcAlpha == -1) {
+				if (DestBytesPerPixel)
 					*(uint32 *)destVal = srcCol;
 				else
 					*(uint16 *)destVal = srcCol;
@@ -177,14 +177,28 @@ void BITMAP::drawInnerGeneric(int yStart, int xStart, uint32_t transColor, uint3
 			}
 
 			// We need the rgb values to do blending and/or convert between formats
-			if (src.format.bytesPerPixel == 1) {
+			if (SrcBytesPerPixel == 1) {
 				const RGB &rgb = palette[srcCol];
 				aSrc = 0xff;
 				rSrc = rgb.r;
 				gSrc = rgb.g;
 				bSrc = rgb.b;
 			} else {
-				src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
+				if (SrcBytesPerPixel == 4) {
+					aSrc = srcCol >> 24;
+					rSrc = (srcCol >> 16) & 0xff;
+					gSrc = (srcCol >> 8) & 0xff;
+					bSrc = srcCol & 0xff;
+				} else { // SrcBytesPerPixel == 2
+					aSrc = 0xff;
+					rSrc = (srcCol >> 11) & 0x1f;
+					rSrc = (rSrc << 3) | (rSrc >> 2);
+					gSrc = (srcCol >> 5) & 0x3f;
+					gSrc = (gSrc << 2) | (gSrc >> 4);
+					bSrc = srcCol & 0x1f;
+					bSrc = (bSrc << 3) | (bSrc >> 2);
+				}
+				//src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
 			}
 
 			if (srcAlpha == -1) {
@@ -207,11 +221,15 @@ void BITMAP::drawInnerGeneric(int yStart, int xStart, uint32_t transColor, uint3
 				blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, srcAlpha, useTint, destVal);
 			}
 
-			uint32 pixel = format.ARGBToColor(aDest, rDest, gDest, bDest);
-			if (format.bytesPerPixel == 4)
+			uint32 pixel;// = format.ARGBToColor(aDest, rDest, gDest, bDest);
+			if (DestBytesPerPixel == 4) {
+				pixel = (aDest << 24) | (rDest << 16) | (gDest << 8) | (bDest);
 				*(uint32 *)destVal = pixel;
-			else
+			}
+			else {
+				pixel = ((rDest >> 3) << 11) | ((gDest >> 2) << 5) | (bDest >> 3);
 				*(uint16 *)destVal = pixel;
+			}
 		}
 
 		destP += destArea.pitch;
@@ -277,22 +295,36 @@ void BITMAP::draw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
 #define DRAWINNER(func) func(yStart, xStart, transColor, alphaMask, palette, useTint, sameFormat, src, destArea, horizFlip, vertFlip, skipTrans, srcAlpha, tintRed, tintGreen, tintBlue, dstRect, srcArea, _G(_blender_mode), 0, 0)
 	// Calling drawInnerXXXX with a ScaleThreshold of 0 just does normal un-scaled drawing
 	if (!_G(_bitmap_simd_optimizations)) {
-		DRAWINNER(drawInnerGeneric<0>);
-	} else {
 		if (sameFormat) {
 			switch (format.bytesPerPixel) {
-			case 1: DRAWINNER(drawInner1Bpp<0>); break;
-			case 2: DRAWINNER(drawInner2Bpp<0>); break;
-			case 4: DRAWINNER((drawInner4BppWithConv<4, 4, 0>)); break;
+			case 1: DRAWINNER((drawInnerGeneric<1, 1, 0>)); return;
+			case 2: DRAWINNER((drawInnerGeneric<2, 2, 0>)); return;
+			case 4: DRAWINNER((drawInnerGeneric<4, 4, 0>)); return;
+			}
+		} else if (format.bytesPerPixel == 4 && src.format.bytesPerPixel == 2) { 
+			DRAWINNER((drawInnerGeneric<4, 2, 0>));
+		} else if (format.bytesPerPixel == 2 && src.format.bytesPerPixel == 4) {
+			DRAWINNER((drawInnerGeneric<2, 4, 0>));
 		}
+	} else {
+		if (sameFormat) {
+			switch (format.bytesPerPixel) {
+			case 1: DRAWINNER(drawInner1Bpp<0>); return;
+			case 2: DRAWINNER(drawInner2Bpp<0>); return;
+			case 4: DRAWINNER((drawInner4BppWithConv<4, 4, 0>)); return;
+			}
 		} else if (format.bytesPerPixel == 4 && src.format.bytesPerPixel == 2) { 
 			DRAWINNER((drawInner4BppWithConv<4, 2, 0>));
+			return;
 		} else if (format.bytesPerPixel == 2 && src.format.bytesPerPixel == 4) {
 			DRAWINNER((drawInner4BppWithConv<2, 4, 0>));
-		} else {
-			DRAWINNER(drawInnerGeneric<0>);
+			return;
 		}
 	}
+	if (format.bytesPerPixel == 4) // src.bytesPerPixel must be 1 here
+		DRAWINNER((drawInnerGeneric<4, 1, 0>));
+	else
+		DRAWINNER((drawInnerGeneric<2, 1, 0>));
 #undef DRAWINNER
 }
 
@@ -345,22 +377,36 @@ void BITMAP::stretchDraw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
 
 #define DRAWINNER(func) func(yStart, xStart, transColor, alphaMask, palette, 0, sameFormat, src, destArea, false, false, skipTrans, srcAlpha, 0, 0, 0, dstRect, srcRect, _G(_blender_mode), scaleX, scaleY)
 	if (!_G(_bitmap_simd_optimizations)) {
-		DRAWINNER(drawInnerGeneric<SCALE_THRESHOLD>);
+		if (sameFormat) {
+			switch (format.bytesPerPixel) {
+			case 1: DRAWINNER((drawInnerGeneric<1, 1, SCALE_THRESHOLD>)); return;
+			case 2: DRAWINNER((drawInnerGeneric<2, 2, SCALE_THRESHOLD>)); return;
+			case 4: DRAWINNER((drawInnerGeneric<4, 4, SCALE_THRESHOLD>)); return;
+			}
+		} else if (format.bytesPerPixel == 4 && src.format.bytesPerPixel == 2) { 
+			DRAWINNER((drawInnerGeneric<4, 2, SCALE_THRESHOLD>));
+		} else if (format.bytesPerPixel == 2 && src.format.bytesPerPixel == 4) {
+			DRAWINNER((drawInnerGeneric<2, 4, SCALE_THRESHOLD>));
+		}
 	} else {
 		if (sameFormat) {
 			switch (format.bytesPerPixel) {
-			case 1: DRAWINNER(drawInner1Bpp<SCALE_THRESHOLD>); break;
-			case 2: DRAWINNER(drawInner2Bpp<SCALE_THRESHOLD>); break;
-			case 4: DRAWINNER((drawInner4BppWithConv<4, 4, SCALE_THRESHOLD>)); break;
+			case 1: DRAWINNER(drawInner1Bpp<SCALE_THRESHOLD>); return;
+			case 2: DRAWINNER(drawInner2Bpp<SCALE_THRESHOLD>); return;
+			case 4: DRAWINNER((drawInner4BppWithConv<4, 4, SCALE_THRESHOLD>)); return;
 			}
 		} else if (format.bytesPerPixel == 4 && src.format.bytesPerPixel == 2) { 
 			DRAWINNER((drawInner4BppWithConv<4, 2, SCALE_THRESHOLD>));
+			return;
 		} else if (format.bytesPerPixel == 2 && src.format.bytesPerPixel == 4) {
 			DRAWINNER((drawInner4BppWithConv<2, 4, SCALE_THRESHOLD>));
-		} else {
-			DRAWINNER(drawInnerGeneric<SCALE_THRESHOLD>);
+			return;
 		}
 	}
+	if (format.bytesPerPixel == 4) // src.bytesPerPixel must be 1 here
+		DRAWINNER((drawInnerGeneric<4, 1, SCALE_THRESHOLD>));
+	else
+		DRAWINNER((drawInnerGeneric<2, 1, SCALE_THRESHOLD>));
 #undef DRAWINNER
 }
 void BITMAP::blendPixel(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &aDest, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha, bool useTint, byte *destVal) const {
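
The inlined 16-bit path above expands RGB565 to 8-bit channels by replicating each channel's high bits into its low bits, and packs back down with the usual >>3 / >>2 shifts. A standalone sketch of that round trip (helper names are illustrative):

#include <cstdint>
#include <cstdio>

// Expand a 5- or 6-bit channel to 8 bits by bit replication, matching the
// (v << 3) | (v >> 2) and (v << 2) | (v >> 4) expressions in the blitter.
static inline uint8_t expand5(uint8_t v) { return (uint8_t)((v << 3) | (v >> 2)); }
static inline uint8_t expand6(uint8_t v) { return (uint8_t)((v << 2) | (v >> 4)); }

static void rgb565ToRgb888(uint16_t c, uint8_t &r, uint8_t &g, uint8_t &b) {
	r = expand5((c >> 11) & 0x1f);
	g = expand6((c >> 5) & 0x3f);
	b = expand5(c & 0x1f);
}

static uint16_t rgb888ToRgb565(uint8_t r, uint8_t g, uint8_t b) {
	return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}

int main() {
	uint8_t r, g, b;
	rgb565ToRgb888(0xf81f, r, g, b);           // magenta: 255, 0, 255
	printf("%u %u %u\n", r, g, b);
	printf("%04x\n", rgb888ToRgb565(r, g, b)); // f81f again
	return 0;
}
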
diff --git a/engines/ags/lib/allegro/surface.h b/engines/ags/lib/allegro/surface.h
index a87bc5fa896..7e011908459 100644
--- a/engines/ags/lib/allegro/surface.h
+++ b/engines/ags/lib/allegro/surface.h
@@ -271,7 +271,7 @@ public:
 	void drawInner2Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY);
 	template<int ScaleThreshold>
 	void drawInner1Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY);
-	template<int ScaleThreshold>
+	template<int DestBytesPerPixel, int SrcBytesPerPixel, int ScaleThreshold>
 	void drawInnerGeneric(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY);
 	
 	inline uint32 getColor(const byte *data, byte bpp) const {
diff --git a/engines/ags/lib/allegro/surface_simd_neon.h b/engines/ags/lib/allegro/surface_simd_neon.h
index ce6361235a3..d2e3bb6911c 100644
--- a/engines/ags/lib/allegro/surface_simd_neon.h
+++ b/engines/ags/lib/allegro/surface_simd_neon.h
@@ -22,6 +22,10 @@
 #define AGS_LIB_ALLEGRO_SURFACE_SIMD_NEON_H
 #ifdef __aarch64__
 
+#ifndef AGS_LIB_ALLEGRO_SURFACE_SIMD_IMPL
+#define AGS_LIB_ALLEGRO_SURFACE_SIMD_IMPL
+#endif
+
 #include <arm_neon.h>
 #include "ags/lib/allegro/surface.h"
 
@@ -399,13 +403,15 @@ inline uint16x8_t blendPixelSIMD2Bpp(uint16x8_t srcCols, uint16x8_t destCols, ui
 template<int DestBytesPerPixel, int SrcBytesPerPixel>
 inline void drawPixelSIMD(byte *destPtr, const byte *srcP2, uint32x4_t tint, uint32x4_t alphas, uint32x4_t maskedAlphas, uint32x4_t transColors, int xDir, int xCtrBpp, int srcAlpha, int skipTrans, bool horizFlip, bool useTint, uint32x4_t skipMask) {
 	uint32x4_t srcCols, destCol;
-	if (SrcBytesPerPixel == 4) {
+
+	if (DestBytesPerPixel == 4)
 		destCol = vld1q_u32((uint32 *)destPtr);
-		srcCols = vld1q_u32((const uint32 *)(srcP2 + xDir * xCtrBpp));
-	} else {
+	else
 		destCol = simd2BppTo4Bpp(vld1_u16((uint16 *)destPtr));
+	if (SrcBytesPerPixel == 4)
+		srcCols = vld1q_u32((const uint32 *)(srcP2 + xDir * xCtrBpp));
+	else
 		srcCols = simd2BppTo4Bpp(vld1_u16((const uint16 *)(srcP2 + xDir * xCtrBpp)));
-	}
 	// we do this here because we need to check if we should skip the pixel before we blend it
 	uint32x4_t mask1 = skipTrans ? vceqq_u32(vandq_u32(srcCols, maskedAlphas), transColors) : vmovq_n_u32(0);
 	mask1 = vorrq_u32(mask1, skipMask);
diff --git a/engines/ags/lib/allegro/surface_simd_none.cpp b/engines/ags/lib/allegro/surface_simd_none.cpp
new file mode 100644
index 00000000000..d22f7f84300
--- /dev/null
+++ b/engines/ags/lib/allegro/surface_simd_none.cpp
@@ -0,0 +1,36 @@
+#include "ags/lib/allegro/surface.h"
+#include "ags/lib/allegro/surface_simd_neon.h"
+#include "ags/lib/allegro/surface_simd_sse.h"
+
+// There is no SIMD implementation on this platform
+#ifndef AGS_LIB_ALLEGRO_SURFACE_SIMD_IMPL
+
+namespace AGS3 {
+
+template<int DestBytesPerPixel, int SrcBytesPerPixel, int ScaleThreshold>
+void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
+	drawInnerGeneric<DestBytesPerPixel, SrcBytesPerPixel, ScaleThreshold>(yStart, xStart, transColor, alphaMask, palette, useTint, sameFormat, src, destArea, horizFlip, vertFlip, skipTrans, srcAlpha, tintRed, tintGreen, tintBlue, dstRect, srcArea, blenderMode, scaleX, scaleY);
+}
+template<int ScaleThreshold>
+void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
+	drawInnerGeneric<2, 2, ScaleThreshold>(yStart, xStart, transColor, alphaMask, palette, useTint, sameFormat, src, destArea, horizFlip, vertFlip, skipTrans, srcAlpha, tintRed, tintGreen, tintBlue, dstRect, srcArea, blenderMode, scaleX, scaleY);
+}
+template<int ScaleThreshold>
+void BITMAP::drawInner1Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
+	drawInnerGeneric<1, 1, ScaleThreshold>(yStart, xStart, transColor, alphaMask, palette, useTint, sameFormat, src, destArea, horizFlip, vertFlip, skipTrans, srcAlpha, tintRed, tintGreen, tintBlue, dstRect, srcArea, blenderMode, scaleX, scaleY);
+}
+
+template void BITMAP::drawInner4BppWithConv<4, 4, 0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<4, 4, 0x100>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<4, 2, 0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<4, 2, 0x100>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<2, 4, 0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<2, 4, 0x100>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner2Bpp<0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner2Bpp<0x100>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner1Bpp<0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner1Bpp<0x100>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+
+} // namespace AGS3
+
+#endif
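
Because surface_simd_none.cpp keeps the drawInner* template bodies out of the header, it must explicitly instantiate every variant the dispatcher can request, which is what the block of "template void BITMAP::..." lines above does. A minimal standalone example of that explicit-instantiation pattern, with hypothetical names:

#include <cstdio>

// In the real layout the declaration sits in a header and the body plus
// the explicit instantiations sit in one .cpp, so only the listed
// variants are available to the linker.
template<int N>
void scaleBy(int *data, int count) {
	for (int i = 0; i < count; ++i)
		data[i] *= N;
}

// Explicit instantiations for the variants callers may use.
template void scaleBy<2>(int *, int);
template void scaleBy<4>(int *, int);

int main() {
	int v[3] = {1, 2, 3};
	scaleBy<2>(v, 3);
	printf("%d %d %d\n", v[0], v[1], v[2]); // 2 4 6
	return 0;
}
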
diff --git a/engines/ags/lib/allegro/surface_simd_sse.cpp b/engines/ags/lib/allegro/surface_simd_sse.cpp
index 7874194e4e5..208e8254381 100644
--- a/engines/ags/lib/allegro/surface_simd_sse.cpp
+++ b/engines/ags/lib/allegro/surface_simd_sse.cpp
@@ -8,24 +8,496 @@
 #include "common/textconsole.h"
 #include "graphics/screen.h"
 
-#include "ags/lib/allegro/surface_simd_neon.h"
+#include "ags/lib/allegro/surface_simd_sse.h"
 
 namespace AGS3 {
 
+inline uint32 extract32_idx0(__m128i x) {
+	return _mm_cvtsi128_si32(x);
+}
+inline uint32 extract32_idx1(__m128i x) {
+	return _mm_cvtsi128_si32(_mm_shuffle_epi32(x, _MM_SHUFFLE(1, 1, 1, 1)));
+}
+inline uint32 extract32_idx2(__m128i x) {
+	return _mm_cvtsi128_si32(_mm_shuffle_epi32(x, _MM_SHUFFLE(2, 2, 2, 2)));
+}
+inline uint32 extract32_idx3(__m128i x) {
+	return _mm_cvtsi128_si32(_mm_shuffle_epi32(x, _MM_SHUFFLE(3, 3, 3, 3)));
+}
+
 // This template handles 2bpp and 4bpp, the other specializations handle 1bpp and format conversion blits
 template<int DestBytesPerPixel, int SrcBytesPerPixel, int ScaleThreshold>
 void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
-    drawInnerGeneric<ScaleThreshold>(yStart, xStart, transColor, alphaMask, palette, useTint, sameFormat, src, destArea, horizFlip, vertFlip, skipTrans, srcAlpha, tintRed, tintGreen, tintBlue, dstRect, srcArea, blenderMode, scaleX, scaleY);
+	const int xDir = horizFlip ? -1 : 1;
+	byte rSrc, gSrc, bSrc, aSrc;
+	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
+    __m128i tint = _mm_sll_epi32(_mm_set1_epi32(srcAlpha), _mm_set1_epi32(24));
+	tint = _mm_or_si128(tint, _mm_sll_epi32(_mm_set1_epi32(tintRed), _mm_set1_epi32(16)));
+	tint = _mm_or_si128(tint, _mm_sll_epi32(_mm_set1_epi32(tintGreen), _mm_set1_epi32(8)));
+	tint = _mm_or_si128(tint, _mm_set1_epi32(tintBlue));
+	__m128i maskedAlphas = _mm_set1_epi32(alphaMask);
+	__m128i transColors = _mm_set1_epi32(transColor);
+    __m128i alphas = _mm_set1_epi32(srcAlpha);
+
+	// This is so that we can calculate what pixels to crop off in a vectorized way
+	__m128i addIndexes = _mm_set_epi32(3, 2, 1, 0);
+	if (horizFlip) addIndexes = _mm_set_epi32(0, 1, 2, 3);
+
+	// This is so that we can calculate in parralell the pixel indexes for scaled drawing
+	__m128i scaleAdds = _mm_set_epi32((uint32)scaleX*3, (uint32)scaleX*2, (uint32)scaleX, 0);
+
+	// Clip the bounds ahead of time (so we don't waste time checking if we are in bounds when
+	// we are in the inner loop)
+	int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = dstRect.width();
+	if (xStart + xCtrWidth > destArea.w) {
+		xCtrWidth = destArea.w - xStart;
+	}
+	if (xStart < 0) {
+		xCtrStart = -xStart;
+		xCtrBppStart = xCtrStart * SrcBytesPerPixel;
+		xStart = 0;
+	}
+	int destY = yStart, srcYCtr = 0, yCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 4 == 0) ? dstRect.height() : (dstRect.height() - 1);
+	if (ScaleThreshold != 0) yCtrHeight = dstRect.height();
+	if (yStart < 0) {
+		yCtr = -yStart;
+		destY = 0;
+		if (ScaleThreshold != 0) {
+			scaleYCtr = yCtr * scaleY;
+			srcYCtr = scaleYCtr / ScaleThreshold;
+		}
+	}
+	if (yStart + yCtrHeight > destArea.h) {
+		yCtrHeight = destArea.h - yStart;
+	}
+	
+	byte *destP = (byte *)destArea.getBasePtr(0, destY);
+	const byte *srcP = (const byte *)src.getBasePtr(
+	                       horizFlip ? srcArea.right - 4 : srcArea.left,
+	                       vertFlip ? srcArea.bottom - 1 - yCtr : srcArea.top + yCtr);
+	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += scaleY) {
+		__m128i xCtrWidthSIMD = _mm_set1_epi32(xCtrWidth); // This is the width of the row
+
+		if (ScaleThreshold == 0) {
+			// If we are not scaling the image
+			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
+				byte *destPtr = &destP[destX * DestBytesPerPixel];
+				// Skip pixels that are beyond the row
+				__m128i skipMask = _mm_cmpgt_epi32(_mm_add_epi32(_mm_add_epi32(_mm_set1_epi32(xCtr), addIndexes), _mm_set1_epi32(1)), xCtrWidthSIMD);
+				drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
+			}
+			// Goto next row in source and destination image
+			destP += destArea.pitch;
+			srcP += vertFlip ? -src.pitch : src.pitch;
+		} else {
+			// Here we are scaling the image
+			int newSrcYCtr = scaleYCtr / ScaleThreshold;
+			// Since the source yctr might not update every row of the destination, we have
+			// to see if we are on a new row...
+			if (srcYCtr != newSrcYCtr) {
+				int diffSrcYCtr = newSrcYCtr - srcYCtr; // Have we moved yet
+				srcP += src.pitch * diffSrcYCtr;
+				srcYCtr = newSrcYCtr;
+			}
+
+			// Now also since we might skip a pixel or 2 or duplicate one to reach the desired
+			// scaling size, we create a small dummy buffer that we copy the pixels into and then
+			// call the drawPixelsSIMD function
+			byte srcBuffer[4*4] = {0};
+			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart, scaleXCtr = xCtrStart * scaleX; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
+				if (yCtr + 1 == yCtrHeight && xCtr + 4 > xCtrWidth) break; // Don't go past the last 4 pixels
+				__m128i indexes = _mm_set1_epi32(scaleXCtr);
+#if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
+				// Calculate in parallel the indexes of the pixels
+				if (SrcBytesPerPixel == 4)
+					indexes = _mm_slli_epi32(_mm_srli_epi32(_mm_add_epi32(indexes, scaleAdds), 8), 2);
+				else
+					indexes = _mm_slli_epi32(_mm_srli_epi32(_mm_add_epi32(indexes, scaleAdds), 8), 1);
+#else
+#error Change code to allow different scale threshold!
+#endif
+				// Simply memcpy them in. memcpy has no real performance overhead here
+				memcpy(&srcBuffer[0*(size_t)SrcBytesPerPixel], srcP + extract32_idx0(indexes), SrcBytesPerPixel);
+				memcpy(&srcBuffer[1*(size_t)SrcBytesPerPixel], srcP + extract32_idx1(indexes), SrcBytesPerPixel);
+				memcpy(&srcBuffer[2*(size_t)SrcBytesPerPixel], srcP + extract32_idx2(indexes), SrcBytesPerPixel);
+				memcpy(&srcBuffer[3*(size_t)SrcBytesPerPixel], srcP + extract32_idx3(indexes), SrcBytesPerPixel);
+				scaleXCtr += scaleX*4;
+
+				// Now this is pretty much the same as before with non-scaled code, except that we use
+				// our dummy source buffer instead of the actuall source bitmap
+				byte *destPtr = &destP[destX * (intptr_t)DestBytesPerPixel];
+				__m128i skipMask = _mm_cmpgt_epi32(_mm_add_epi32(_mm_add_epi32(_mm_set1_epi32(xCtr), addIndexes), _mm_set1_epi32(1)), xCtrWidthSIMD);
+				drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, (const byte *)srcBuffer, tint, alphas, maskedAlphas, transColors, 1, 0, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
+			}
+			// We calculate every row here except the last (because then we need to
+			// check for if we fall off the edge of the row)
+			// The only exception here is scaling drawing this is because:
+			// 1) if statements are costly, and the less we do the faster this loop is
+			// 2) with this, the only branch in the normal drawing loop is the width check
+			// 3) the scaling code will actually draw the until the last 4 pixels of the image
+			//    and do the extra if checks because the scaling code is already much slower
+			//    than the normal drawing loop, and the less duplicate code helps here.
+			if (yCtr + 1 != yCtrHeight) destP += destArea.pitch;
+		}
+	}
+
+	// Get the last x values of the last row
+	int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart;
+	// We have a picture that is a multiple of 4, so no extra pixels to draw
+	if (xCtrWidth % 4 == 0) return;
+	// Drawing the last few not scaled pixels here.
+	// Same as the loop above but now we check if we are going to overflow,
+	// and thus we don't need to mask out pixels that go over the row.
+	if (ScaleThreshold == 0) {
+		for (; xCtr + 4 < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
+			byte *destPtr = &destP[(ptrdiff_t)destX * DestBytesPerPixel];
+			drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, _mm_setzero_si128());
+		}
+		// Because we move in 4 pixel units, and horizFlip moves in 1, we have to move
+		// 1 pixel past the last pixel we did not blit, meaning going forward 3 pixels.
+		if (horizFlip) srcP += SrcBytesPerPixel * 3;
+	} else {
+		// So if we are scaling, set up the xCtr to what it was before (AKA the last 4 or so pixels of the image)
+		xCtr = xCtrWidth - xCtrWidth % 4;
+		xCtrBpp = xCtr * SrcBytesPerPixel;
+		destX = xStart+xCtr;
+	}
+
+	// For the last 4 pixels, we just do them in serial, nothing special
+	for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += SrcBytesPerPixel) {
+		const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
+		if (ScaleThreshold != 0) {
+			srcColPtr = (const byte *)(srcP + (xCtr * scaleX) / ScaleThreshold * SrcBytesPerPixel);
+		}
+		byte *destVal = (byte *)&destP[destX * DestBytesPerPixel];
+		uint32 srcCol = getColor(srcColPtr, SrcBytesPerPixel);
+		
+		// Check if this is a transparent color we should skip
+		if (skipTrans && ((srcCol & alphaMask) == transColor))
+			continue;
+
+		src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
+		if (srcAlpha != -1) {
+			if (useTint) {
+				rDest = rSrc;
+				gDest = gSrc;
+				bDest = bSrc;
+				aDest = aSrc;
+				rSrc = tintRed;
+				gSrc = tintGreen;
+				bSrc = tintBlue;
+				aSrc = srcAlpha;
+			}
+			blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, srcAlpha, useTint, destVal);
+			srcCol = format.ARGBToColor(aDest, rDest, gDest, bDest);
+		} else {
+			srcCol = format.ARGBToColor(aSrc, rSrc, gSrc, bSrc);
+		}
+		if (DestBytesPerPixel == 4)
+			*(uint32 *)destVal = srcCol;
+		else
+			*(uint16 *)destVal = srcCol;
+	}
 }
 
 template<int ScaleThreshold>
 void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
-    drawInnerGeneric<ScaleThreshold>(yStart, xStart, transColor, alphaMask, palette, useTint, sameFormat, src, destArea, horizFlip, vertFlip, skipTrans, srcAlpha, tintRed, tintGreen, tintBlue, dstRect, srcArea, blenderMode, scaleX, scaleY);
+	const int xDir = horizFlip ? -1 : 1;
+	byte rSrc, gSrc, bSrc, aSrc;
+	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
+	__m128i tint = _mm_set1_epi16(src.format.ARGBToColor(srcAlpha, tintRed, tintGreen, tintBlue));
+	__m128i transColors = _mm_set1_epi16(transColor);
+	__m128i alphas = _mm_set1_epi16(srcAlpha);
+
+	// This is so that we can calculate what pixels to crop off in a vectorized way
+	__m128i addIndexes = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
+
+	// This is so that we can calculate in parralell the pixel indexes for scaled drawing
+	if (horizFlip) addIndexes = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+	__m128i scaleAdds = _mm_set_epi32((uint32)scaleX*3, (uint32)scaleX*2, (uint32)scaleX, 0);
+	__m128i scaleAdds2 = _mm_set_epi32((uint32)scaleX*7, (uint32)scaleX*6, (uint32)scaleX*5, (uint32)scaleX*4);
+
+	// Clip the bounds ahead of time (so we don't waste time checking if we are in bounds when
+	// we are in the inner loop)
+	int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = dstRect.width();
+	if (xStart + xCtrWidth > destArea.w) {
+		xCtrWidth = destArea.w - xStart;
+	}
+	if (xStart < 0) {
+		xCtrStart = -xStart;
+		xCtrBppStart = xCtrStart * 2;
+		xStart = 0;
+	}
+	int destY = yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 8 == 0) ? dstRect.height() : (dstRect.height() - 1);
+	if (ScaleThreshold != 0) yCtrHeight = dstRect.height();
+	if (yStart < 0) {
+		yCtr = -yStart;
+		destY = 0;
+		if (ScaleThreshold != 0) {
+			scaleYCtr = yCtr * scaleY;
+			srcYCtr = scaleYCtr / ScaleThreshold;
+		}
+	}
+	if (yStart + yCtrHeight > destArea.h) {
+		yCtrHeight = destArea.h - yStart;
+	}
+	
+	byte *destP = (byte *)destArea.getBasePtr(0, destY);
+	const byte *srcP = (const byte *)src.getBasePtr(
+	                       horizFlip ? srcArea.right - 8 : srcArea.left,
+	                       vertFlip ? srcArea.bottom - 1 - yCtr : srcArea.top + yCtr);
+	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += scaleY) {
+		__m128i xCtrWidthSIMD = _mm_set1_epi16(xCtrWidth); // This is the width of the row
+		if (ScaleThreshold == 0) {
+			// If we are not scaling the image
+			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
+				byte *destPtr = &destP[destX * 2];
+				// Skip pixels that are beyond the row
+				__m128i skipMask = _mm_cmpgt_epi16(_mm_add_epi16(_mm_add_epi16(_mm_set1_epi16(xCtr), addIndexes), _mm_set1_epi16(1)), xCtrWidthSIMD);
+				drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
+			}
+			// Goto next row in source and destination image
+			destP += destArea.pitch;
+			srcP += vertFlip ? -src.pitch : src.pitch;
+		} else {
+			// Here we are scaling the image
+			int newSrcYCtr = scaleYCtr / ScaleThreshold;
+			// Since the source yctr might not update every row of the destination, we have
+			// to see if we are on a new row...
+			if (srcYCtr != newSrcYCtr) {
+				int diffSrcYCtr = newSrcYCtr - srcYCtr;
+				srcP += src.pitch * diffSrcYCtr;
+				srcYCtr = newSrcYCtr;
+			}
+
+			// Now also since we might skip a pixel or 2 or duplicate one to reach the desired
+			// scaling size, we create a small dummy buffer that we copy the pixels into and then
+			// call the drawPixelsSIMD function
+			uint16 srcBuffer[8];
+			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart, scaleXCtr = xCtrStart * scaleX; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
+				if (yCtr + 1 == yCtrHeight && xCtr + 8 > xCtrWidth) break;
+				__m128i indexes = _mm_set1_epi32(scaleXCtr), indexes2 = _mm_set1_epi32(scaleXCtr);
+#if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
+				// Calculate in parallel the indexes of the pixels
+				indexes = _mm_slli_epi32(_mm_srli_epi32(_mm_add_epi32(indexes, scaleAdds), 8), 1);
+				indexes2 = _mm_slli_epi32(_mm_srli_epi32(_mm_add_epi32(indexes2, scaleAdds2), 8), 1);
+#else
+#error Change code to allow different scale threshold!
+#endif
+				// Simply memcpy them in. memcpy has no real performance overhead here
+				srcBuffer[0] = *(const uint16 *)(srcP + extract32_idx0(indexes));
+				srcBuffer[1] = *(const uint16 *)(srcP + extract32_idx1(indexes));
+				srcBuffer[2] = *(const uint16 *)(srcP + extract32_idx2(indexes));
+				srcBuffer[3] = *(const uint16 *)(srcP + extract32_idx3(indexes));
+				srcBuffer[4] = *(const uint16 *)(srcP + extract32_idx0(indexes2));
+				srcBuffer[5] = *(const uint16 *)(srcP + extract32_idx1(indexes2));
+				srcBuffer[6] = *(const uint16 *)(srcP + extract32_idx2(indexes2));
+				srcBuffer[7] = *(const uint16 *)(srcP + extract32_idx3(indexes2));
+				scaleXCtr += scaleX*8;
+
+				// Now this is pretty much the same as before with non-scaled code, except that we use
+				// our dummy source buffer instead of the actuall source bitmap
+				byte *destPtr = &destP[destX * 2];
+				__m128i skipMask = _mm_cmpgt_epi16(_mm_add_epi16(_mm_add_epi16(_mm_set1_epi16(xCtr), addIndexes), _mm_set1_epi16(1)), xCtrWidthSIMD);
+				drawPixelSIMD2Bpp(destPtr, (const byte *)srcBuffer, tint, alphas, transColors, 1, 0, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
+			}
+			// We calculate every row here except the last (because then we need to
+			// check for if we fall off the edge of the row)
+			// The only exception here is scaling drawing this is because:
+			// 1) if statements are costly, and the less we do the faster this loop is
+			// 2) with this, the only branch in the normal drawing loop is the width check
+			// 3) the scaling code will actually draw the until the last 4 pixels of the image
+			//    and do the extra if checks because the scaling code is already much slower
+			//    than the normal drawing loop, and the less duplicate code helps here.
+			if (yCtr + 1 != yCtrHeight) destP += destArea.pitch;
+		}
+	}
+
+	// We have a picture that is a multiple of 8, so no extra pixels to draw
+	if (xCtrWidth % 8 == 0) return;
+	// Get the last x values of the last row
+	int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart;
+	// Drawing the last few not scaled pixels here.
+	// Same as the loop above but now we check if we are going to overflow,
+	// and thus we don't need to mask out pixels that go over the row.
+	if (ScaleThreshold == 0) {
+		for (; xCtr + 8 < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
+			byte *destPtr = &destP[destX * 2];
+			drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, _mm_setzero_si128());
+		}
+		// Because we move in 8 pixel units, and horizFlip moves in 1, we have to move
+		// 1 pixel past the last pixel we did not blit, meaning going forward 7 pixels.
+		if (horizFlip) srcP += 2 * 7;
+	} else {
+		// So if we are scaling, set up the xCtr to what it was before (AKA the last 8 or so pixels of the image)
+		xCtr = xCtrWidth - xCtrWidth % 8;
+		xCtrBpp = xCtr * 2;
+		destX = xStart+xCtr;
+	}
+
+	// For the last 4 pixels, we just do them in serial, nothing special
+	for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += 2) {
+		const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
+		if (ScaleThreshold != 0) {
+			srcColPtr = (const byte *)(srcP + (xCtr * scaleX) / ScaleThreshold * 2);
+		}
+		byte *destVal = (byte *)&destP[destX * 2];
+		uint32 srcCol = (uint32)(*(const uint16 *)srcColPtr);
+		
+		// Check if this is a transparent color we should skip
+		if (skipTrans && srcCol == transColor)
+			continue;
+
+		src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
+		if (srcAlpha != -1) {
+			if (useTint) {
+				rDest = rSrc;
+				gDest = gSrc;
+				bDest = bSrc;
+				aDest = aSrc;
+				rSrc = tintRed;
+				gSrc = tintGreen;
+				bSrc = tintBlue;
+				aSrc = srcAlpha;
+			}/* else {
+				format.colorToARGB((uint32)(*(uint16 *)destVal), aDest, rDest, gDest, bDest);
+			}*/
+			blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, srcAlpha, useTint, destVal);
+			srcCol = format.ARGBToColor(aDest, rDest, gDest, bDest);
+		} else {
+			srcCol = format.ARGBToColor(aSrc, rSrc, gSrc, bSrc);
+		}
+		*(uint16 *)destVal = srcCol;
+	}
 }
 
 template<int ScaleThreshold>
 void BITMAP::drawInner1Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
-    drawInnerGeneric<ScaleThreshold>(yStart, xStart, transColor, alphaMask, palette, useTint, sameFormat, src, destArea, horizFlip, vertFlip, skipTrans, srcAlpha, tintRed, tintGreen, tintBlue, dstRect, srcArea, blenderMode, scaleX, scaleY);
+	const int xDir = horizFlip ? -1 : 1;
+	__m128i transColors = _mm_set1_epi16(transColor | (transColor << 8));
+
+	// This is so that we can calculate in parralell the pixel indexes for scaled drawing
+	__m128i scaleAdds1 = _mm_set_epi32((uint32)scaleX*3, (uint32)scaleX*2, (uint32)scaleX, 0);
+	__m128i scaleAdds2 = _mm_set_epi32((uint32)scaleX*7, (uint32)scaleX*6, (uint32)scaleX*5, (uint32)scaleX*4);
+	__m128i scaleAdds3 = _mm_set_epi32((uint32)scaleX*11, (uint32)scaleX*10, (uint32)scaleX*9, (uint32)scaleX*8);
+	__m128i scaleAdds4 = _mm_set_epi32((uint32)scaleX*15, (uint32)scaleX*14, (uint32)scaleX*13, (uint32)scaleX*12);
+	
+	// Clip the bounds ahead of time (so we don't waste time checking if we are in bounds when
+	// we are in the inner loop)
+	int xCtrStart = 0, xCtrWidth = dstRect.width();
+	if (xStart + xCtrWidth > destArea.w) {
+		xCtrWidth = destArea.w - xStart;
+	}
+	if (xStart < 0) {
+		xCtrStart = -xStart;
+		xStart = 0;
+	}
+	int destY = yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = dstRect.height();
+	if (ScaleThreshold != 0) yCtrHeight = dstRect.height();
+	if (yStart < 0) {
+		yCtr = -yStart;
+		destY = 0;
+		if (ScaleThreshold != 0) {
+			scaleYCtr = yCtr * scaleY;
+			srcYCtr = scaleYCtr / ScaleThreshold;
+		}
+	}
+	if (yStart + yCtrHeight > destArea.h) {
+		yCtrHeight = destArea.h - yStart;
+	}
+	
+	byte *destP = (byte *)destArea.getBasePtr(0, destY);
+	const byte *srcP = (const byte *)src.getBasePtr(
+	                       horizFlip ? srcArea.right - 16 : srcArea.left,
+	                       vertFlip ? srcArea.bottom - 1 - yCtr : srcArea.top + yCtr);
+	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += scaleY) {
+		if (ScaleThreshold != 0) {
+			// So here we update the srcYCtr differently due to this being for
+			// scaling
+			int newSrcYCtr = scaleYCtr / ScaleThreshold;
+			if (srcYCtr != newSrcYCtr) {
+				// Since the source yctr might not update every row of the destination, we have
+				// to see if we are on a new row...
+				int diffSrcYCtr = newSrcYCtr - srcYCtr;
+				srcP += src.pitch * diffSrcYCtr;
+				srcYCtr = newSrcYCtr;
+			}
+		}
+		int xCtr = xCtrStart, destX = xStart, scaleXCtr = xCtrStart * scaleX;
+		for (; xCtr + 16 < xCtrWidth; destX += 16, xCtr += 16) {
+			byte *destPtr = &destP[destX];
+
+			// Here we don't use the drawPixelSIMD function because 1bpp bitmaps in Allegro
+			// can't have any blending applied to them
+			__m128i destCols = _mm_loadu_si128((const __m128i *)destPtr);
+			__m128i srcCols = _mm_loadu_si128((const __m128i *)(srcP + xDir * xCtr));
+			if (ScaleThreshold != 0) {
+				// If we are scaling, we have to set each pixel individually
+				__m128i indexes1 = _mm_set1_epi32(scaleXCtr), indexes2 = _mm_set1_epi32(scaleXCtr);
+				__m128i indexes3 = _mm_set1_epi32(scaleXCtr), indexes4 = _mm_set1_epi32(scaleXCtr);
+#if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
+				// Calculate in parallel the indexes of the pixels
+				indexes1 = _mm_srli_epi32(_mm_add_epi32(indexes1, scaleAdds1), 8);
+				indexes2 = _mm_srli_epi32(_mm_add_epi32(indexes2, scaleAdds2), 8);
+				indexes3 = _mm_srli_epi32(_mm_add_epi32(indexes3, scaleAdds3), 8);
+				indexes4 = _mm_srli_epi32(_mm_add_epi32(indexes4, scaleAdds4), 8);
+#else
+#error Change code to allow different scale threshold!
+#endif
+				srcCols = _mm_set_epi8(
+					srcP[extract32_idx3(indexes4)],
+					srcP[extract32_idx2(indexes4)],
+					srcP[extract32_idx1(indexes4)],
+					srcP[extract32_idx0(indexes4)],
+					srcP[extract32_idx3(indexes3)],
+					srcP[extract32_idx2(indexes3)],
+					srcP[extract32_idx1(indexes3)],
+					srcP[extract32_idx0(indexes3)],
+					srcP[extract32_idx3(indexes2)],
+					srcP[extract32_idx2(indexes2)],
+					srcP[extract32_idx1(indexes2)],
+					srcP[extract32_idx0(indexes2)],
+					srcP[extract32_idx3(indexes1)],
+					srcP[extract32_idx2(indexes1)],
+					srcP[extract32_idx1(indexes1)],
+					srcP[extract32_idx0(indexes1)]);
+				scaleXCtr += scaleX*16;
+			}
+
+			// Mask out transparent pixels
+			__m128i mask1 = skipTrans ? _mm_cmpeq_epi8(srcCols, transColors) : _mm_setzero_si128();
+			__m128i final = _mm_or_si128(_mm_andnot_si128(mask1, srcCols), _mm_and_si128(destCols, mask1));
+			if (horizFlip) {
+				__m128i final_swap16 = _mm_srli_epi16(final, 8);
+				final_swap16 = _mm_or_si128(final_swap16, _mm_slli_epi16(_mm_and_si128(final, _mm_set1_epi16(0xff)), 8));
+				final_swap16 = _mm_shufflelo_epi16(final_swap16, _MM_SHUFFLE(0, 1, 2, 3));
+				final_swap16 = _mm_shufflehi_epi16(final_swap16, _MM_SHUFFLE(0, 1, 2, 3));
+				final = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(final_swap16), _mm_castsi128_pd(final_swap16), _MM_SHUFFLE2(0, 1)));
+			}
+			_mm_storeu_si128((__m128i *)destPtr, final);
+		}
+		// Get the last x values
+
+		// Because we move in 16 pixel units, and horizFlip moves in 1, we have to move
+		// 1 pixel past the last pixel we did not blit, meaning going forward 15 pixels.
+		if (horizFlip) srcP += 15;
+		for (; xCtr < xCtrWidth; ++destX, ++xCtr, scaleXCtr += scaleX) {
+			const byte *srcCol = (const byte *)(srcP + xDir * xCtr);
+			if (ScaleThreshold != 0) {
+				srcCol = (const byte *)(srcP + scaleXCtr / ScaleThreshold);
+			}
+			// Check if this is a transparent color we should skip
+			if (skipTrans && *srcCol == transColor)
+				continue;
+
+			byte *destVal = (byte *)&destP[destX];
+			*destVal = *srcCol;
+		}
+		if (horizFlip) srcP -= 15; // Undo what we did up there
+		destP += destArea.pitch; // Goto next row
+		// Only advance the src row by 1 every time like this if we don't scale
+		if (ScaleThreshold == 0) srcP += vertFlip ? -src.pitch : src.pitch;
+	}
 }
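For reference, the SSE loop above is vectorizing what is, per pixel, just a guarded copy, since Allegro never applies blending to 8-bit bitmaps. A scalar sketch of one row (a hypothetical helper, not part of the patch; srcRow is assumed to already point at the first pixel to read, i.e. the rightmost one when horizFlip is set, mirroring how srcP is set up above):

#include <cstdint>

static void copyRow1Bpp(uint8_t *destRow, const uint8_t *srcRow, int width,
                        bool skipTrans, uint8_t transColor, bool horizFlip) {
	const int xDir = horizFlip ? -1 : 1;
	for (int x = 0; x < width; ++x) {
		uint8_t srcPix = srcRow[xDir * x];     // walk backwards when flipped
		if (skipTrans && srcPix == transColor)
			continue;                          // keep the destination pixel
		destRow[x] = srcPix;
	}
}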
 
 
diff --git a/engines/ags/lib/allegro/surface_simd_sse.h b/engines/ags/lib/allegro/surface_simd_sse.h
index 57296e1572d..68fcf71245c 100644
--- a/engines/ags/lib/allegro/surface_simd_sse.h
+++ b/engines/ags/lib/allegro/surface_simd_sse.h
@@ -22,13 +22,441 @@
 #define AGS_LIB_ALLEGRO_SURFACE_SIMD_SSE_H
 #if defined(__x86_64__) || defined(__i686__)
 
+#ifndef AGS_LIB_ALLEGRO_SURFACE_SIMD_IMPL
+#define AGS_LIB_ALLEGRO_SURFACE_SIMD_IMPL
+#endif
+
 #include <immintrin.h>
 #include "ags/lib/allegro/surface.h"
 
 namespace AGS3 {
 
-inline __m128i simd2BppTo4Bpp(__m64 pixels) {
-    
+inline __m128i simd2BppTo4Bpp(__m128i pixels) {
+	__m128i x = _mm_unpacklo_epi16(pixels, _mm_setzero_si128());
+
+	// c is the extracted 5/6 bit color from the image
+	__m128i c = _mm_srli_epi32(x, 11);
+
+	// We convert it back to 8 bits per channel by shifting it left by 3 and then using the 2 most
+	// significant bits of the original color as the least significant bits of the new one
+	__m128i r = _mm_slli_epi32(_mm_or_si128(_mm_slli_epi32(c, 3), _mm_srli_epi32(c, 2)), 16);
+	c = _mm_srli_epi32(_mm_and_si128(x, _mm_set1_epi32(0x07e0)), 5);
+	__m128i g = _mm_slli_epi32(_mm_or_si128(_mm_slli_epi32(c, 2), _mm_srli_epi32(c, 4)), 8);
+	c = _mm_and_si128(x, _mm_set1_epi32(0x001f));
+	__m128i b = _mm_or_si128(_mm_slli_epi32(c, 3), _mm_srli_epi32(c, 2));
+
+	// By default 2bpp to 4bpp makes the alpha channel 255
+	return _mm_or_si128(_mm_or_si128(_mm_or_si128(r, g), b), _mm_set1_epi32(0xff000000));
+}
+
+inline __m128i simd4BppTo2Bpp(__m128i pixels) {
+	// x is the final 16 bit rgb pixel
+	__m128i x = _mm_srli_epi32(_mm_and_si128(pixels, _mm_set1_epi32(0x000000ff)), 3);
+	x = _mm_or_si128(x, _mm_slli_epi32(_mm_srli_epi32(_mm_and_si128(pixels, _mm_set1_epi32(0x0000ff00)), 8+2), 5));
+	x = _mm_or_si128(x, _mm_slli_epi32(_mm_srli_epi32(_mm_and_si128(pixels, _mm_set1_epi32(0x00ff0000)), 16+3), 11));
+	x = _mm_slli_epi32(x, 16);
+	x = _mm_srai_epi32(x, 16);
+	return _mm_packs_epi32(x, _mm_setzero_si128());
+}
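These two conversions carry the whole 16-bit code path. A scalar sketch of the same mapping (hypothetical helpers, shown only to make the bit manipulation readable): widening replicates each channel's top bits into the new low bits and forces alpha to 255, while narrowing simply truncates each channel and drops alpha.

#include <cstdint>

static inline uint32_t rgb565ToArgb(uint16_t p) {
	uint32_t r = (p >> 11) & 0x1f, g = (p >> 5) & 0x3f, b = p & 0x1f;
	r = (r << 3) | (r >> 2);   // 5 -> 8 bits
	g = (g << 2) | (g >> 4);   // 6 -> 8 bits
	b = (b << 3) | (b >> 2);   // 5 -> 8 bits
	return 0xff000000u | (r << 16) | (g << 8) | b;
}

static inline uint16_t argbToRgb565(uint32_t p) {
	return (uint16_t)(((((p >> 16) & 0xff) >> 3) << 11) |
	                  ((((p >>  8) & 0xff) >> 2) <<  5) |
	                   (((p      ) & 0xff) >> 3));
}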
+
+inline __m128i rgbBlendSIMD2Bpp(__m128i srcCols, __m128i destCols, __m128i alphas) {
+	// Here we add 1 to alphas if they are not 0, matching what the original blender function did
+	alphas = _mm_add_epi16(alphas, _mm_and_si128(_mm_cmpgt_epi16(alphas, _mm_setzero_si128()), _mm_set1_epi16(1)));
+
+	// Split the components into rgb
+	__m128i srcComps[] = {
+		_mm_and_si128(srcCols, _mm_set1_epi16(0x1f)),		    		 // B
+		_mm_and_si128(_mm_srli_epi16(srcCols, 5), _mm_set1_epi16(0x3f)), // G
+		_mm_srli_epi16(srcCols, 11),									 // R
+	}, destComps[] = {
+		_mm_and_si128(destCols, _mm_set1_epi16(0x1f)),		    		  // B
+		_mm_and_si128(_mm_srli_epi16(destCols, 5), _mm_set1_epi16(0x3f)), // G
+		_mm_srli_epi16(destCols, 11),									  // R
+	};
+
+	// Calculate the differences between the colors
+	__m128i diffs[] = {
+		_mm_sub_epi16(srcComps[0], destComps[0]), // B
+		_mm_sub_epi16(srcComps[1], destComps[1]), // G
+		_mm_sub_epi16(srcComps[2], destComps[2]), // R
+	};
+
+	// Multiply by alpha and shift depth bits to the right
+	// pretty much the same as (int)(((float)component / 255.0f) * ((float)alpha / 255.0f) * 255.0f)
+	alphas = _mm_srli_epi16(alphas, 2);
+	diffs[1] = _mm_srli_epi16(_mm_mullo_epi16(diffs[1], alphas), 6);
+	alphas = _mm_srli_epi16(alphas, 1);
+	diffs[0] = _mm_srli_epi16(_mm_mullo_epi16(diffs[0], alphas), 5);
+	diffs[2] = _mm_srli_epi16(_mm_mullo_epi16(diffs[2], alphas), 5);
+
+	// Here we add the difference between the 2 colors times alpha onto the destination
+	diffs[0] = _mm_and_si128(_mm_add_epi16(diffs[0], destComps[0]), _mm_set1_epi16(0x1f));
+	diffs[1] = _mm_and_si128(_mm_add_epi16(diffs[1], destComps[1]), _mm_set1_epi16(0x3f));
+	diffs[2] = _mm_and_si128(_mm_add_epi16(diffs[2], destComps[2]), _mm_set1_epi16(0x1f));
+
+	// We compile all the colors into diffs[0] as a 16 bit rgb pixel
+	diffs[0] = _mm_or_si128(diffs[0], _mm_slli_epi16(diffs[1], 5));
+	return _mm_or_si128(diffs[0], _mm_slli_epi16(diffs[2], 11));
+}
+
+inline __m128i mul32_as32(__m128i a, __m128i b)
+{
+	__m128i tmp1 = _mm_mul_epu32(a,b);
+	__m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(a,4), _mm_srli_si128(b,4));
+	return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE (0,0,2,0)), _mm_shuffle_epi32(tmp2, _MM_SHUFFLE (0,0,2,0))); /* shuffle results to [63..0] and pack */
+}
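SSE2 has no packed 32x32->32 multiply, so this helper emulates one with two widening _mm_mul_epu32 calls on the even and odd lanes and a shuffle to re-pack the low halves. On SSE4.1 and later the same operation is a single intrinsic; a sketch of the equivalence (assuming SSE4.1 is enabled, which this file deliberately does not require):

#include <immintrin.h>

#if defined(__SSE4_1__)
inline __m128i mul32_sse41(__m128i a, __m128i b) {
	return _mm_mullo_epi32(a, b); // low 32 bits of each 32x32 lane product
}
#endif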
+
+// preserveAlpha:
+//		false => set destCols's alpha to 0
+// 		true => keep destCols's alpha
+inline __m128i rgbBlendSIMD(__m128i srcCols, __m128i destCols, __m128i alphas, bool preserveAlpha) {
+	// Here we add 1 to alphas if they are not 0, matching what the original blender function did.
+	alphas = _mm_add_epi32(alphas, _mm_and_si128(_mm_cmpgt_epi32(alphas, _mm_setzero_si128()), _mm_set1_epi32(1)));
+
+	// Get the alpha from the destination
+	__m128i alpha = _mm_and_si128(destCols, _mm_set1_epi32(0xff000000));
+
+	// Get red and blue components
+	__m128i srcColsCopy = srcCols;
+	srcColsCopy = _mm_and_si128(srcColsCopy, _mm_set1_epi32(0xff00ff));
+	__m128i destColsCopy = destCols;
+	destColsCopy = _mm_and_si128(destColsCopy, _mm_set1_epi32(0xff00ff));
+
+	// Compute the difference, then multiply by alpha and divide by 256
+	srcColsCopy = _mm_sub_epi32(srcColsCopy, destColsCopy);
+	srcColsCopy = mul32_as32(srcColsCopy, alphas);
+	//srcColsCopy = _mm_mul_epi32(srcColsCopy, alphas);
+	srcColsCopy = _mm_srli_epi32(srcColsCopy, 8);
+	srcColsCopy = _mm_add_epi32(srcColsCopy, destCols); // Add the new red/blue to the old red/blue
+
+	// Do the same for the green component
+	srcCols = _mm_and_si128(srcCols, _mm_set1_epi32(0xff00));
+	destCols = _mm_and_si128(destCols, _mm_set1_epi32(0xff00));
+	srcCols = _mm_sub_epi32(srcCols, destCols);
+	srcCols = mul32_as32(srcCols, alphas);
+	//srcCols = _mm_mul_epi32(srcCols, alphas);
+	srcCols = _mm_srli_epi32(srcCols, 8);
+	srcCols = _mm_add_epi32(srcCols, destCols); // Add the new green to the old green
+
+	// Keep values in 8bit range and glue red/blue and green together
+	srcColsCopy = _mm_and_si128(srcColsCopy, _mm_set1_epi32(0xff00ff));
+	srcCols = _mm_and_si128(srcCols, _mm_set1_epi32(0xff00));
+	srcCols = _mm_or_si128(srcCols, srcColsCopy);
+
+	// Remember that alpha is not alphas, but rather the alpha of destcols
+	if (preserveAlpha) {
+		srcCols = _mm_and_si128(srcCols, _mm_set1_epi32(0x00ffffff));
+		srcCols = _mm_or_si128(srcCols, alpha);
+	}
+	return srcCols;
+}
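For a single pixel, the arithmetic above boils down to the classic dst + (((src - dst) * alpha) >> 8) lerp, with red and blue packed together under a 0x00ff00ff mask so one multiply covers both channels. A hypothetical scalar reference (not the engine's actual blender, but it performs the same steps as the vector code):

#include <cstdint>

static inline uint32_t rgbBlendScalar(uint32_t src, uint32_t dst,
                                      uint32_t alpha, bool preserveAlpha) {
	if (alpha)
		alpha += 1;                              // matches the original blender
	uint32_t dstA  = dst & 0xff000000u;
	uint32_t srcRB = src & 0x00ff00ffu, dstRB = dst & 0x00ff00ffu;
	uint32_t srcG  = src & 0x0000ff00u, dstG  = dst & 0x0000ff00u;
	uint32_t rb = (((srcRB - dstRB) * alpha) >> 8) + dst;   // red and blue at once
	uint32_t g  = (((srcG  - dstG ) * alpha) >> 8) + dstG;
	uint32_t out = (rb & 0x00ff00ffu) | (g & 0x0000ff00u);  // alpha byte is 0 here
	return preserveAlpha ? (out | dstA) : out;
}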
+
+inline __m128i argbBlendSIMD(__m128i srcCols, __m128i destCols) {
+	__m128 srcA = _mm_cvtepi32_ps(_mm_srli_epi32(srcCols, 24));
+	srcA = _mm_mul_ps(srcA, _mm_set1_ps(1.0 / 255.0));
+	__m128 srcR = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(srcCols, 16), _mm_set1_epi32(0xff)));
+	__m128 srcG = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(srcCols, 8), _mm_set1_epi32(0xff)));
+	__m128 srcB = _mm_cvtepi32_ps(_mm_and_si128(srcCols, _mm_set1_epi32(0xff)));
+
+	__m128 destA = _mm_cvtepi32_ps(_mm_srli_epi32(destCols, 24));
+	destA = _mm_mul_ps(destA, _mm_set1_ps(1.0 / 255.0));
+	__m128 destR = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(destCols, 16), _mm_set1_epi32(0xff)));
+	__m128 destG = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(destCols, 8), _mm_set1_epi32(0xff)));
+	__m128 destB = _mm_cvtepi32_ps(_mm_and_si128(destCols, _mm_set1_epi32(0xff)));
+
+	// the destination alpha gets multiplied by 255 - source alpha
+	destA = _mm_mul_ps(destA, _mm_sub_ps(_mm_set1_ps(1.0f), srcA));
+
+	// ((src * sAlpha) + (dest * dAlpha)) / (sAlpha + dAlpha)
+	__m128 combA = _mm_add_ps(srcA, destA);
+	__m128 combArcp = _mm_rcp_ps(combA);
+	destR = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(srcR, srcA), _mm_mul_ps(destR, destA)), combArcp);
+	destG = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(srcG, srcA), _mm_mul_ps(destG, destA)), combArcp);
+	destB = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(srcB, srcA), _mm_mul_ps(destB, destA)), combArcp);
+	combA = _mm_mul_ps(combA, _mm_set1_ps(255.0));
+
+	// Now put it back together
+	return _mm_or_si128(_mm_slli_epi32(_mm_cvtps_epi32(combA), 24),
+		_mm_or_si128(_mm_slli_epi32(_mm_cvtps_epi32(destR), 16),
+		_mm_or_si128(_mm_slli_epi32(_mm_cvtps_epi32(destG), 8),
+			_mm_cvtps_epi32(destB))));
+}
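In scalar terms this is a standard "over" composite with the destination alpha first attenuated by (1 - srcA). A hypothetical single-pixel sketch of the same formula, using an exact division where the vector code takes the faster _mm_rcp_ps approximation, and with a guard against a fully transparent result added for clarity:

#include <cstdint>

static inline uint32_t argbBlendScalar(uint32_t src, uint32_t dst) {
	float sA = (src >> 24) / 255.0f;
	float dA = ((dst >> 24) / 255.0f) * (1.0f - sA);  // attenuated dest alpha
	float outA = sA + dA;
	float inv = (outA > 0.0f) ? 1.0f / outA : 0.0f;
	auto chan = [&](int shift) -> uint32_t {
		float s = (float)((src >> shift) & 0xff), d = (float)((dst >> shift) & 0xff);
		return (uint32_t)((s * sA + d * dA) * inv);
	};
	return ((uint32_t)(outA * 255.0f) << 24) |
	       (chan(16) << 16) | (chan(8) << 8) | chan(0);
}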
+
+inline __m128i blendTintSpriteSIMD(__m128i srcCols, __m128i destCols, __m128i alphas, bool light) {
+	// This function is NOT 1 to 1 with the original... It just approximates it
+	// It gets the value of the HSV of the dest color
+	// Then it gets the HSV of the srcCols
+
+	// how the values are transformed
+	// from 1 __m128i srcCols with each lane being an ARGB uint32
+	// srcCols[0] = A | R | G | B
+	// srcCols[1] = A | R | G | B
+	// srcCols[2] = A | R | G | B
+	// srcCols[3] = A | R | G | B
+	//  ->
+	// to 4 __m128's, each being a separate channel, with each lane
+	// corresponding to its respective srcCols lane
+	// dda = { A[0], A[1], A[2], A[3] }
+	// ddr = { R[0], R[1], R[2], R[3] }
+	// ddg = { G[0], G[1], G[2], G[3] }
+	// ddb = { B[0], B[1], B[2], B[3] }
+
+	// do the transformation (we don't actually need alpha at all)
+	__m128 ddr, ddg, ddb;
+	ddr = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(destCols, 16), _mm_set1_epi32(0xff))), _mm_set1_ps(1.0 / 255.0));
+	ddg = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(destCols, 8), _mm_set1_epi32(0xff))), _mm_set1_ps(1.0 / 255.0));
+	ddb = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(destCols, _mm_set1_epi32(0xff))), _mm_set1_ps(1.0 / 255.0));
+	__m128 ssr, ssg, ssb;
+	ssr = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(srcCols, 16), _mm_set1_epi32(0xff))), _mm_set1_ps(1.0 / 255.0));
+	ssg = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(srcCols, 8), _mm_set1_epi32(0xff))), _mm_set1_ps(1.0 / 255.0));
+	ssb = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(srcCols, _mm_set1_epi32(0xff))), _mm_set1_ps(1.0 / 255.0));
+
+	// Get the maxes and mins (needed for HSV->RGB and vice versa)
+	__m128 dmaxes = _mm_max_ps(ddr, _mm_max_ps(ddg, ddb));
+	__m128 smaxes = _mm_max_ps(ssr, _mm_max_ps(ssg, ssb));
+	__m128 smins = _mm_min_ps(ssr, _mm_min_ps(ssg, ssb));
+
+	// This is here to stop from dividing by 0
+	const __m128 eplison0 = _mm_set1_ps(0.0000001);
+
+	__m128 chroma = _mm_max_ps(_mm_sub_ps(smaxes, smins), eplison0);
+
+	// RGB to HSV is a piecewise function, so we compute each part of the function first...
+	__m128 hr, hg, hb, hue;
+	hr = _mm_div_ps(_mm_sub_ps(ssg, ssb), chroma);
+	hr = _mm_sub_ps(hr, _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_mul_ps(hr, _mm_set1_ps(1.0 / 6.0)))), _mm_set1_ps(6.0)));
+	hr = _mm_add_ps(hr, _mm_and_ps(_mm_cmplt_ps(hr, _mm_setzero_ps()), _mm_set1_ps(6.0)));
+	hg = _mm_add_ps(_mm_div_ps(_mm_sub_ps(ssb, ssr), chroma), _mm_set1_ps(2.0));
+	hg = _mm_max_ps(hg, _mm_setzero_ps());
+	hb = _mm_add_ps(_mm_div_ps(_mm_sub_ps(ssr, ssg), chroma), _mm_set1_ps(4.0));
+	hb = _mm_max_ps(hb, _mm_setzero_ps());
+
+	// And then compute which one will be used based on criteria
+	__m128 hrfactors = _mm_and_ps(_mm_and_ps(_mm_cmpeq_ps(ssr, smaxes), _mm_cmpneq_ps(ssr, ssb)), _mm_set1_ps(1.0));
+	__m128 hgfactors = _mm_and_ps(_mm_and_ps(_mm_cmpeq_ps(ssg, smaxes), _mm_cmpneq_ps(ssg, ssr)), _mm_set1_ps(1.0));
+	__m128 hbfactors = _mm_and_ps(_mm_and_ps(_mm_cmpeq_ps(ssb, smaxes), _mm_cmpneq_ps(ssb, ssg)), _mm_set1_ps(1.0));
+	hue = _mm_mul_ps(hr, hrfactors);
+	hue = _mm_add_ps(hue, _mm_mul_ps(hg, hgfactors));
+	hue = _mm_add_ps(hue, _mm_mul_ps(hb, hbfactors));
+
+	// Mess with the light like the original function
+	__m128 val = dmaxes;
+	if (light) {
+		val = _mm_sub_ps(val, _mm_sub_ps(_mm_set1_ps(1.0), _mm_mul_ps(_mm_cvtepi32_ps(alphas), _mm_set1_ps(1.0 / 250.0))));
+		val = _mm_max_ps(val, _mm_setzero_ps());
+	}
+		
+	// then it stitches the HSV back together
+	// the hue and saturation come from the source (tint) color, and the value comes from
+	// the destination (real source) color
+	chroma = _mm_mul_ps(val, _mm_div_ps(_mm_sub_ps(smaxes, smins), _mm_add_ps(smaxes, eplison0)));
+	__m128 hprime_mod2 = _mm_mul_ps(hue, _mm_set1_ps(1.0 / 2.0));
+	hprime_mod2 = _mm_mul_ps(_mm_sub_ps(hprime_mod2, _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_sub_ps(hprime_mod2, _mm_set1_ps(0.5))))), _mm_set1_ps(2.0));
+	__m128 x = _mm_mul_ps(chroma, _mm_sub_ps(_mm_set1_ps(1), _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)), _mm_sub_ps(hprime_mod2, _mm_set1_ps(1)))));
+	//float32x4_t x = vmulq_f32(chroma, vsubq_f32(vmovq_n_f32(1.0), vabsq_f32(vsubq_f32(hprime_mod2, vmovq_n_f32(1.0)))));
+	__m128i hprime_rounded = _mm_cvtps_epi32(_mm_sub_ps(hue, _mm_set1_ps(0.5)));
+	__m128i x_int = _mm_cvtps_epi32(_mm_mul_ps(x, _mm_set1_ps(255.0)));
+	__m128i c_int = _mm_cvtps_epi32(_mm_mul_ps(chroma, _mm_set1_ps(255.0)));
+
+	// Again HSV->RGB is also a piecewise function
+	__m128i val0 = _mm_or_si128(_mm_slli_epi32(x_int, 8), _mm_slli_epi32(c_int, 16));
+	val0 = _mm_and_si128(val0, _mm_or_si128(_mm_cmpeq_epi32(hprime_rounded, _mm_set1_epi32(0)), _mm_cmpeq_epi32(hprime_rounded, _mm_set1_epi32(6))));
+	__m128i val1 = _mm_or_si128(_mm_slli_epi32(c_int, 8), _mm_slli_epi32(x_int, 16));
+	val1 = _mm_and_si128(val1, _mm_cmpeq_epi32(hprime_rounded, _mm_set1_epi32(1)));
+	__m128i val2 = _mm_or_si128(_mm_slli_epi32(c_int, 8), x_int);
+	val2 = _mm_and_si128(val2, _mm_cmpeq_epi32(hprime_rounded, _mm_set1_epi32(2)));
+	__m128i val3 = _mm_or_si128(_mm_slli_epi32(x_int, 8), c_int);
+	val3 = _mm_and_si128(val3, _mm_cmpeq_epi32(hprime_rounded, _mm_set1_epi32(3)));
+	__m128i val4 = _mm_or_si128(_mm_slli_epi32(x_int, 16), c_int);
+	val4 = _mm_and_si128(val4, _mm_cmpeq_epi32(hprime_rounded, _mm_set1_epi32(4)));
+	__m128i val5 = _mm_or_si128(_mm_slli_epi32(c_int, 16), x_int);
+	val5 = _mm_and_si128(val5, _mm_cmpeq_epi32(hprime_rounded, _mm_set1_epi32(5)));
+
+	// or the values together
+	__m128i final = _mm_or_si128(val0, _mm_or_si128(val1, _mm_or_si128(val2, _mm_or_si128(val3, _mm_or_si128(val4, val5)))));
+
+	// add the minimums back in
+	__m128i val_add = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(val, chroma), _mm_set1_ps(255.0)));
+	val_add = _mm_or_si128(val_add, _mm_or_si128(_mm_slli_epi32(val_add, 8), _mm_or_si128(_mm_slli_epi32(val_add, 16), _mm_and_si128(destCols, _mm_set1_epi32(0xff000000)))));
+	final = _mm_add_epi32(final, val_add);
+	return final;
+}
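As the comment notes, this is only an approximation of the scalar tint blender. For orientation, a hypothetical scalar version of the same idea (rounding differs from the vector code): take hue and saturation from the tint color in srcCols, take the value from the destination pixel, optionally darken it by (1 - alpha / 250) for the "light" variant, and convert back to RGB.

#include <cstdint>
#include <cmath>
#include <algorithm>

static uint32_t tintBlendScalar(uint32_t src, uint32_t dst, uint32_t alpha, bool light) {
	float sr = ((src >> 16) & 0xff) / 255.0f, sg = ((src >> 8) & 0xff) / 255.0f, sb = (src & 0xff) / 255.0f;
	float dr = ((dst >> 16) & 0xff) / 255.0f, dg = ((dst >> 8) & 0xff) / 255.0f, db = (dst & 0xff) / 255.0f;
	float smax = std::max({sr, sg, sb}), smin = std::min({sr, sg, sb});
	float chroma = smax - smin, hue = 0.0f;              // hue in [0, 6)
	if (chroma > 0.0f) {
		if (smax == sr)      hue = std::fmod((sg - sb) / chroma + 6.0f, 6.0f);
		else if (smax == sg) hue = (sb - sr) / chroma + 2.0f;
		else                 hue = (sr - sg) / chroma + 4.0f;
	}
	float sat = (smax > 0.0f) ? chroma / smax : 0.0f;    // saturation from the tint
	float val = std::max({dr, dg, db});                  // value from the destination
	if (light)
		val = std::max(0.0f, val - (1.0f - alpha / 250.0f));
	float c = val * sat;                                 // HSV -> RGB from here on
	float x = c * (1.0f - std::fabs(std::fmod(hue, 2.0f) - 1.0f));
	float m = val - c, r = 0.0f, g = 0.0f, b = 0.0f;
	switch ((int)hue) {
	case 0: r = c; g = x; break;
	case 1: r = x; g = c; break;
	case 2: g = c; b = x; break;
	case 3: g = x; b = c; break;
	case 4: r = x; b = c; break;
	default: r = c; b = x; break;
	}
	return (dst & 0xff000000u) |
	       ((uint32_t)((r + m) * 255.0f) << 16) |
	       ((uint32_t)((g + m) * 255.0f) << 8) |
	        (uint32_t)((b + m) * 255.0f);
}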
+
+inline __m128i mul32_as16(__m128i a, __m128i b) {	
+	__m128i a16 = _mm_packs_epi32(a, _mm_setzero_si128());
+	__m128i b16 = _mm_packs_epi32(b, _mm_setzero_si128());
+	__m128i res = _mm_mullo_epi16(a16, b16);
+	return _mm_unpacklo_epi16(res, _mm_setzero_si128());
+}
+
+inline __m128i findmin32_as16(__m128i a, __m128i b) {
+	__m128i a16 = _mm_packs_epi32(a, _mm_setzero_si128());
+	__m128i b16 = _mm_packs_epi32(b, _mm_setzero_si128());
+	__m128i res = _mm_min_epi16(a16, b16);
+	return _mm_unpacklo_epi16(res, _mm_setzero_si128());
+}
+
+inline __m128i blendPixelSIMD(__m128i srcCols, __m128i destCols, __m128i alphas) {
+	__m128i srcAlphas, difAlphas, mask, ch1, ch2;
+	auto setupArgbAlphas = [&]() {
+		// This acts the same as this in the normal blender functions
+		// if (alpha == 0)
+		//     alpha = aSrc;
+		// else
+		//     alpha = aSrc * ((alpha & 0xff) + 1) / 256;
+		// where alpha is the alpha byte of the srcCols
+		srcAlphas = _mm_srli_epi32(srcCols, 24);
+		difAlphas = _mm_add_epi32(_mm_and_si128(alphas, _mm_set1_epi32(0xff)), _mm_set1_epi32(1));
+		difAlphas = _mm_srli_epi32(mul32_as16(srcAlphas, difAlphas), 8);
+		difAlphas = _mm_slli_epi32(difAlphas, 24);
+		srcAlphas = _mm_slli_epi32(srcAlphas, 24);
+		mask = _mm_cmpeq_epi32(alphas, _mm_setzero_si128());
+		srcAlphas = _mm_and_si128(srcAlphas, mask);
+		difAlphas = _mm_andnot_si128(mask, difAlphas);
+		srcCols = _mm_and_si128(srcCols, _mm_set1_epi32(0x00ffffff));
+		srcCols = _mm_or_si128(srcCols, _mm_or_si128(srcAlphas, difAlphas));
+	};
+	switch (_G(_blender_mode)) {
+	case kSourceAlphaBlender: // see BITMAP member function blendSourceAlpha
+		alphas = _mm_srli_epi32(srcCols, 24);
+		return rgbBlendSIMD(srcCols, destCols, alphas, false);
+	case kArgbToArgbBlender: // see BITMAP member function blendArgbToArgb
+		setupArgbAlphas();
+		// only blend if alpha isn't 0, otherwise use destCols
+		mask = _mm_cmpgt_epi32(_mm_srli_epi32(srcCols, 24), _mm_setzero_si128());
+		ch1 = _mm_and_si128(argbBlendSIMD(srcCols, destCols), mask);
+		ch2 = _mm_andnot_si128(mask, destCols);
+		return _mm_or_si128(ch1, ch2);
+	case kArgbToRgbBlender: // see BITMAP member function blendArgbToRgb
+		setupArgbAlphas();
+		return rgbBlendSIMD(srcCols, destCols, _mm_srli_epi32(srcCols, 24), false);
+	case kRgbToArgbBlender: // see BITMAP member function blendRgbToArgb
+		// if alpha is NOT 0 or 255
+		ch2 = _mm_and_si128(srcCols, _mm_set1_epi32(0x00ffffff));
+		ch2 = _mm_or_si128(ch2, _mm_slli_epi32(alphas, 24));
+		ch2 = argbBlendSIMD(ch2, destCols);
+		// if alpha is 0 or 255
+		ch1 = _mm_or_si128(srcCols, _mm_set1_epi32(0xff000000));
+		// mask and or them together
+		mask = _mm_or_si128(_mm_cmpeq_epi32(alphas, _mm_setzero_si128()), _mm_cmpeq_epi32(alphas, _mm_set1_epi32(0xff)));
+		ch1 = _mm_and_si128(ch1, mask);
+		ch2 = _mm_andnot_si128(mask, ch2);
+		return _mm_or_si128(ch1, ch2);
+	case kRgbToRgbBlender: // see BITMAP member function blendRgbToRgb
+		return rgbBlendSIMD(srcCols, destCols, alphas, false);
+	case kAlphaPreservedBlenderMode: // see BITMAP member function blendPreserveAlpha
+		return rgbBlendSIMD(srcCols, destCols, alphas, true);
+	case kOpaqueBlenderMode: // see BITMAP member function blendOpaque
+		return _mm_or_si128(srcCols, _mm_set1_epi32(0xff000000));
+	case kAdditiveBlenderMode: // see BITMAP member function blendAdditiveAlpha
+		srcAlphas = _mm_add_epi32(_mm_srli_epi32(srcCols, 24), _mm_srli_epi32(destCols, 24));
+		srcAlphas = findmin32_as16(srcAlphas, _mm_set1_epi32(0xff));
+		srcCols = _mm_and_si128(srcCols, _mm_set1_epi32(0x00ffffff));
+		return _mm_or_si128(srcCols, _mm_slli_epi32(srcAlphas, 24));
+	case kTintBlenderMode: // see BITMAP member function blendTintSprite
+		return blendTintSpriteSIMD(srcCols, destCols, alphas, false);
+	case kTintLightBlenderMode: // see BITMAP member function blendTintSprite
+		return blendTintSpriteSIMD(srcCols, destCols, alphas, true);
+	}
+	return _mm_setzero_si128();
+}
+
+#include "common/debug.h"
+inline __m128i blendPixelSIMD2Bpp(__m128i srcCols, __m128i destCols, __m128i alphas) {
+	__m128i mask, ch1, ch2;
+	switch (_G(_blender_mode)) {
+	case kSourceAlphaBlender:
+	case kOpaqueBlenderMode:
+	case kAdditiveBlenderMode:
+		return srcCols;
+	case kArgbToArgbBlender:
+	case kArgbToRgbBlender:
+		ch1 = _mm_and_si128(_mm_set1_epi16(0xff), _mm_cmpeq_epi16(alphas, _mm_setzero_si128()));
+		ch2 = _mm_and_si128(alphas, _mm_cmpgt_epi16(alphas, _mm_setzero_si128()));
+		alphas = _mm_or_si128(ch1, ch2);
+		// fall through
+	case kRgbToRgbBlender:
+	case kAlphaPreservedBlenderMode:
+		return rgbBlendSIMD2Bpp(srcCols, destCols, alphas);
+	case kRgbToArgbBlender:
+		mask = _mm_or_si128(_mm_cmpeq_epi16(alphas, _mm_set1_epi16(0)), _mm_cmpeq_epi16(alphas, _mm_set1_epi16(255)));
+		ch1 = _mm_and_si128(srcCols, mask);
+		ch2 = _mm_andnot_si128(mask, rgbBlendSIMD2Bpp(srcCols, destCols, alphas));
+		return _mm_or_si128(ch1, ch2);
+	case kTintBlenderMode:
+	case kTintLightBlenderMode:
+		__m128i srcColsLo = simd2BppTo4Bpp(_mm_and_si128(srcCols, _mm_set_epi32(0, 0, -1, -1)));
+		__m128i srcColsHi = simd2BppTo4Bpp(_mm_srli_si128(srcCols, 8));
+		__m128i destColsLo = simd2BppTo4Bpp(_mm_and_si128(destCols, _mm_set_epi32(0, 0, -1, -1)));
+		__m128i destColsHi = simd2BppTo4Bpp(_mm_srli_si128(destCols, 8));
+		__m128i alphasLo = _mm_unpacklo_epi16(_mm_and_si128(alphas, _mm_set_epi32(0, 0, -1, -1)), _mm_setzero_si128());
+		__m128i alphasHi = _mm_unpacklo_epi16(_mm_srli_si128(alphas, 8), _mm_setzero_si128());
+		__m128i lo = simd4BppTo2Bpp(blendTintSpriteSIMD(srcColsLo, destColsLo, alphasLo, _G(_blender_mode) == kTintLightBlenderMode));
+		__m128i hi = simd4BppTo2Bpp(blendTintSpriteSIMD(srcColsHi, destColsHi, alphasHi, _G(_blender_mode) == kTintLightBlenderMode));
+		return _mm_or_si128(lo, _mm_slli_si128(hi, 8));
+	}
+	return _mm_setzero_si128();
+}
+
+template<int DestBytesPerPixel, int SrcBytesPerPixel>
+inline void drawPixelSIMD(byte *destPtr, const byte *srcP2, __m128i tint, __m128i alphas, __m128i maskedAlphas, __m128i transColors, int xDir, int xCtrBpp, int srcAlpha, int skipTrans, bool horizFlip, bool useTint, __m128i skipMask) {
+	__m128i srcCols, destCol;
+
+	if (DestBytesPerPixel == 4)
+		destCol = _mm_loadu_si128((const __m128i *)destPtr);
+	else
+		destCol = simd2BppTo4Bpp(_mm_loadl_epi64((const __m128i *)destPtr));
+	if (SrcBytesPerPixel == 4)
+		srcCols = _mm_loadu_si128((const __m128i *)(srcP2 + xDir * xCtrBpp));
+	else
+		srcCols = simd2BppTo4Bpp(_mm_loadl_epi64((const __m128i *)(srcP2 + xDir * xCtrBpp)));
+
+	// we do this here because we need to check if we should skip the pixel before we blend it
+	__m128i mask1 = skipTrans ? _mm_cmpeq_epi32(_mm_and_si128(srcCols, maskedAlphas), transColors) : _mm_setzero_si128();
+	mask1 = _mm_or_si128(mask1, skipMask);
+	if (srcAlpha != -1) {
+		// take useTint into account
+		if (useTint) {
+			srcCols = blendPixelSIMD(tint, srcCols, alphas);
+		} else {
+			srcCols = blendPixelSIMD(srcCols, destCol, alphas);
+		}
+	}
+	__m128i destCols2 = _mm_and_si128(destCol, mask1);
+	__m128i srcCols2 = _mm_andnot_si128(mask1, srcCols);
+	__m128i final = _mm_or_si128(destCols2, srcCols2);
+	if (horizFlip) {
+		final = _mm_shuffle_epi32(final, _MM_SHUFFLE(0, 1, 2, 3));
+	}
+	if (DestBytesPerPixel == 4) {
+		_mm_storeu_si128((__m128i *)destPtr, final);
+	} else {
+		_mm_storel_epi64((__m128i *)destPtr, simd4BppTo2Bpp(final));
+	}
+}
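The and/andnot/or combination of destCols2 and srcCols2 is SSE2's branchless select idiom. With SSE4.1 available the same selection would be a single blend; a sketch of the equivalence (assuming SSE4.1, which this code does not require):

#include <immintrin.h>

#if defined(__SSE4_1__)
inline __m128i selectPixels(__m128i srcCols, __m128i destCol, __m128i mask1) {
	// mask1 lanes are all-ones or all-zeros, so a byte-wise blend picks
	// destCol wherever the pixel should be skipped and srcCols elsewhere
	return _mm_blendv_epi8(srcCols, destCol, mask1);
}
#endif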
+
+inline void drawPixelSIMD2Bpp(byte *destPtr, const byte *srcP2, __m128i tint, __m128i alphas, __m128i transColors, int xDir, int xCtrBpp, int srcAlpha, int skipTrans, bool horizFlip, bool useTint, __m128i skipMask) {
+	__m128i destCol = _mm_loadu_si128((const __m128i *)destPtr);
+	__m128i srcCols = _mm_loadu_si128((const __m128i *)(srcP2 + xDir * xCtrBpp));
+	__m128i mask1 = skipTrans ? _mm_cmpeq_epi16(srcCols, transColors) : _mm_setzero_si128();
+	mask1 = _mm_or_si128(mask1, skipMask);
+	if (srcAlpha != -1) {
+		// take useTint into account
+		if (useTint) {
+			srcCols = blendPixelSIMD2Bpp(tint, srcCols, alphas);
+		} else {
+			srcCols = blendPixelSIMD2Bpp(srcCols, destCol, alphas);
+		}
+	}
+	__m128i destCols2 = _mm_and_si128(destCol, mask1);
+	__m128i srcCols2 = _mm_andnot_si128(mask1, srcCols);
+	__m128i final = _mm_or_si128(destCols2, srcCols2);
+	if (horizFlip) {
+		final = _mm_shufflelo_epi16(final, _MM_SHUFFLE(0, 1, 2, 3));
+		final = _mm_shufflehi_epi16(final, _MM_SHUFFLE(0, 1, 2, 3));
+		final = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(final), _mm_castsi128_pd(final), _MM_SHUFFLE2(0, 1)));
+	}
+	_mm_storeu_si128((__m128i *)destPtr, final);
 }
 
 } // namespace AGS3
diff --git a/engines/ags/tests/test_gfx.cpp b/engines/ags/tests/test_gfx.cpp
index 547ffbab6a0..1f3c2ac25f5 100644
--- a/engines/ags/tests/test_gfx.cpp
+++ b/engines/ags/tests/test_gfx.cpp
@@ -64,18 +64,18 @@ void Test_GfxSpeed(bool opt, int blenderModeStart, int blenderModeEnd) {
 	Bitmap *dest32 = BitmapHelper::CreateBitmap(100, 100, 32);
 	Bitmap *dest16 = BitmapHelper::CreateBitmap(100, 100, 16);
 	Bitmap *dest8 = BitmapHelper::CreateBitmap(100, 100, 8);
-	debug("%d %d %d %d %d %d", benchgfx32, benchgfx16, benchgfx8, dest32, dest16, dest8);
 	int benchRuns[] = {1000, 10000, 100000};
 	int blenderModes[] = {kRgbToRgbBlender, kSourceAlphaBlender, kArgbToArgbBlender, kOpaqueBlenderMode, kTintLightBlenderMode};
 	const char *modeNames[] = {"RGB to RGB", "Source Alpha", "ARGB to ARGB", "Opaque", "Tint with Light"};
 	Bitmap *destinations[] = {dest32, dest16, dest8};
 	Bitmap *graphics[] = {benchgfx32, benchgfx16, benchgfx8};
 	int bpps[] = {32, 16, 8};
+	if (blenderModeEnd >= sizeof(blenderModes) / sizeof(blenderModes[0])) blenderModeEnd = (sizeof(blenderModes) / sizeof(blenderModes[0])) - 1;
 	for (int dest = 0; dest < 3; dest++) {
 		for (int gfx = 0; gfx < 3; gfx++) {
 			if (dest == 2 && gfx != 2) continue;
 			for (int mode = blenderModeStart; mode <= blenderModeEnd; mode++) {
-				for (int runs = 0; runs < sizeof(benchRuns)/sizeof(int); runs++) {
+				for (int runs = 0; (size_t)runs < sizeof(benchRuns)/sizeof(int); runs++) {
 					uint32 start, end;
 					_G(_blender_mode) = (AGS3::BlenderMode)blenderModes[mode];
 #ifdef VERBOSE_TEST_GFX
@@ -139,7 +139,7 @@ void Test_BlenderModes() {
 											dummy.blendPixel(srcA, srcR, srcG, srcB, a, r, g, b, alpha, false, (byte *)&pixelDummy);
 											controlCol = b | (g << 8) | (r << 16) | (a << 24);
 
-											uint8 a16 = destA, r16 = destR >> 3, g16 = destG >> 2, b16 = destB >> 3;
+											uint8 a16 = 0xff, r16 = destR >> 3, g16 = destG >> 2, b16 = destB >> 3;
 											r16 = (r16 << 3) | (r16 >> 2);
 											g16 = (g16 << 2) | (g16 >> 4);
 											b16 = (b16 << 3) | (b16 >> 2);
@@ -158,11 +158,10 @@ void Test_BlenderModes() {
 												uint32x4_t alphas = vdupq_n_u32(alpha);
 												simdCol = vgetq_lane_u32(blendPixelSIMD(src, dest, alphas), 0);
 #else
-												//__m128i src = _mm_set1_epi32(srcB | (srcG << 8) | (srcR << 16) | (srcA << 24));
-												//__m128i dest = _mm_set1_epi32(destB | (destG << 8) | (destR << 16) | (destA << 24));
-												//__m128i alphas = _mm_set1_epi32(alpha);
-												//simdCol = _mm_extract_epi32();
-												simdCol = controlCol; // Not implemented yet
+												__m128i src = _mm_set1_epi32(srcB | (srcG << 8) | (srcR << 16) | (srcA << 24));
+												__m128i dest = _mm_set1_epi32(destB | (destG << 8) | (destR << 16) | (destA << 24));
+												__m128i alphas = _mm_set1_epi32((int)alpha);
+												simdCol = _mm_cvtsi128_si32(blendPixelSIMD(src, dest, alphas));
 #endif
 											}
 											{
@@ -172,29 +171,32 @@ void Test_BlenderModes() {
 												uint16x8_t alphas = vdupq_n_u16((uint16)alpha);
 												simd2bppCol = vgetq_lane_u16(blendPixelSIMD2Bpp(src, dest, alphas), 0);
 #else
-												simd2bppCol = control2bppCol; // Not implemented yet
+												__m128i src = _mm_set1_epi16((srcB >> 3) | ((srcG >> 2) << 5) | ((srcR >> 3) << 11));
+												__m128i dest = _mm_set1_epi16((destB >> 3) | ((destG >> 2) << 5) | ((destR >> 3) << 11));
+												__m128i alphas = _mm_set1_epi16((uint16)alpha);
+												simd2bppCol = (uint16)(_mm_cvtsi128_si32(blendPixelSIMD2Bpp(src, dest, alphas)) & 0xffff);
 #endif
 											}
 #ifdef VERBOSE_TEST_GFX
-											debug("src argb: %d, %d, %d, %d dest argb: %d, %d, %d, %d a: %d\n", srcA, srcR, srcG, srcB, destA, destR, destG, destB, alpha);
+											debug("src argb: %d, %d, %d, %d dest argb: %d, %d, %d, %d a: %d", srcA, srcR, srcG, srcB, destA, destR, destG, destB, alpha);
 #endif
 											switch ((BlenderMode)blenderMode) {
-												case kSourceAlphaBlender: debug("blenderMode: kSourceAlphaBlender\n"); break;
-												case kArgbToArgbBlender: debug("blenderMode: kArgbToArgbBlender\n"); break;
-												case kArgbToRgbBlender: debug("blenderMode: kArgbToRgbBlender\n"); break;
-												case kRgbToArgbBlender: debug("blenderMode: kRgbToArgbBlender\n"); break;
-												case kRgbToRgbBlender: debug("blenderMode: kRgbToRgbBlender\n"); break;
-												case kAlphaPreservedBlenderMode: debug("blenderMode: kAlphaPreservedBlenderMode\n"); break;
-												case kOpaqueBlenderMode: debug("blenderMode: kOpaqueBlenderMode\n"); break;
-												case kAdditiveBlenderMode: debug("blenderMode: kAdditiveBlenderMode\n"); break;
-												case kTintBlenderMode: debug("blenderMode: kTintBlenderMode\n"); break;
-												case kTintLightBlenderMode: debug("blenderMode: kTintLightBlenderMode\n"); break;
+												case kSourceAlphaBlender: debug("blenderMode: kSourceAlphaBlender"); break;
+												case kArgbToArgbBlender: debug("blenderMode: kArgbToArgbBlender"); break;
+												case kArgbToRgbBlender: debug("blenderMode: kArgbToRgbBlender"); break;
+												case kRgbToArgbBlender: debug("blenderMode: kRgbToArgbBlender"); break;
+												case kRgbToRgbBlender: debug("blenderMode: kRgbToRgbBlender"); break;
+												case kAlphaPreservedBlenderMode: debug("blenderMode: kAlphaPreservedBlenderMode"); break;
+												case kOpaqueBlenderMode: debug("blenderMode: kOpaqueBlenderMode"); break;
+												case kAdditiveBlenderMode: debug("blenderMode: kAdditiveBlenderMode"); break;
+												case kTintBlenderMode: debug("blenderMode: kTintBlenderMode"); break;
+												case kTintLightBlenderMode: debug("blenderMode: kTintLightBlenderMode"); break;
 											}
 #ifdef VERBOSE_TEST_GFX
-											debug("controlCol %x argb: %d, %d, %d, %d\n", controlCol, a, r, g, b);
-											debug("simdCol %x argb: %d, %d, %d, %d\n", simdCol, (simdCol >> 24), ((simdCol >> 16) & 0xff), ((simdCol >> 8) & 0xff), (simdCol & 0xff));
-											debug("control2bppCol %x rgb: %d, %d, %d\n", control2bppCol, r16, g16, b16);
-											debug("simd2bppCol %x rgb: %d, %d, %d\n\n", simd2bppCol, (simd2bppCol >> 11), ((simd2bppCol >> 5) & 0x3f), (simd2bppCol & 0x1f));
+											debug("controlCol %x argb: %d, %d, %d, %d", controlCol, a, r, g, b);
+											debug("simdCol %x argb: %d, %d, %d, %d", simdCol, (simdCol >> 24), ((simdCol >> 16) & 0xff), ((simdCol >> 8) & 0xff), (simdCol & 0xff));
+											debug("control2bppCol %x rgb: %d, %d, %d", control2bppCol, r16, g16, b16);
+											debug("simd2bppCol %x rgb: %d, %d, %d", simd2bppCol, (simd2bppCol >> 11), ((simd2bppCol >> 5) & 0x3f), (simd2bppCol & 0x1f));
 #endif
 											int tolerance, tolerance16;
 											switch ((BlenderMode)blenderMode) {
@@ -250,7 +252,7 @@ void Test_GfxTransparency() {
 	int trans255[arr_sz] = { 0 };
 	int trans100_back[arr_sz] = { 0 };
 
-	for (int i = 0; i < arr_sz; ++i) {
+	for (size_t i = 0; i < arr_sz; ++i) {
 		trans255[i] = GfxDef::Trans100ToLegacyTrans255(trans100[i]);
 		trans100_back[i] = GfxDef::LegacyTrans255ToTrans100(trans255[i]);
 		assert(trans100[i] == trans100_back[i]);
@@ -260,14 +262,12 @@ void Test_GfxTransparency() {
 void Test_Gfx() {
 	Test_GfxTransparency();
 #if defined(OPT_NEON) || defined(OPT_SSE)
-	Test_DrawingLoops();
-	Test_BlenderModes();
+	//Test_DrawingLoops();
+	//Test_BlenderModes();
 	// This could take a LONG time
 	bool has_simd = _G(_bitmap_simd_optimizations);
+	if (has_simd) Test_GfxSpeed(true, 0, kTintLightBlenderMode);
 	Test_GfxSpeed(false, 0, kTintLightBlenderMode);
-	if (has_simd) {
-		Test_GfxSpeed(true, 0, kTintLightBlenderMode);
-	}
 	_G(_bitmap_simd_optimizations) = has_simd;
 #endif
 }
diff --git a/engines/ags/tests/test_memory.cpp b/engines/ags/tests/test_memory.cpp
index 652685124e8..59dabd87e95 100644
--- a/engines/ags/tests/test_memory.cpp
+++ b/engines/ags/tests/test_memory.cpp
@@ -19,6 +19,7 @@
  *
  */
 
+#define MIN(x, y) ((x) < (y) ? (x) : (y))
 #include "ags/shared/core/platform.h"
 #include "ags/shared/util/memory.h"
 //#include "ags/shared/debugging/assert.h"
diff --git a/engines/ags/tests/test_string.cpp b/engines/ags/tests/test_string.cpp
index aa371b6972c..cb069e45d79 100644
--- a/engines/ags/tests/test_string.cpp
+++ b/engines/ags/tests/test_string.cpp
@@ -142,16 +142,16 @@ void Test_String() {
 		size_t find8 = s5.FindCharReverse('x');
 		size_t find9 = s1.FindChar('i', 2);
 		size_t find10 = s1.FindCharReverse('i', 12);
-		assert(find1 == 5);
-		assert(find2 == 13);
-		assert(find3 == -1);
-		assert(find4 == -1);
-		assert(find5 == 19);
-		assert(find6 == 0);
-		assert(find7 == -1);
-		assert(find8 == -1);
-		assert(find9 == 10);
-		assert(find10 == 10);
+		assert(find1 == 5LLU);
+		assert(find2 == 13LLU);
+		assert(find3 == -1LLU);
+		assert(find4 == -1LLU);
+		assert(find5 == 19LLU);
+		assert(find6 == 0LLU);
+		assert(find7 == -1LLU);
+		assert(find8 == -1LLU);
+		assert(find9 == 10LLU);
+		assert(find10 == 10LLU);
 	}
 
 	// Test GetAt


Commit: def889099e6045eca226405f4d51b559f3c50d1a
    https://github.com/scummvm/scummvm/commit/def889099e6045eca226405f4d51b559f3c50d1a
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: Added check for MSVC specific macros in simd

I was using just the GCC and Clang macros to detect what platform ScummVM
was being compiled for, but neglected the MSVC ones, which meant the code
would not compile with that compiler. I fixed that by adding them. I also
added the fallback SIMD implementation .cpp file to module.mk for the AGS
engine.
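For context, the architecture macros involved are predefined by the toolchains themselves: GCC and Clang expose __x86_64__, __i686__ and __aarch64__, while MSVC exposes _M_X64, _M_IX86 and _M_ARM64 (MSVC's 32-bit x86 macro is _M_IX86). A sketch of a check covering both families, with hypothetical helper macro names that are not part of this patch:

#if defined(__x86_64__) || defined(_M_X64) || defined(__i686__) || defined(_M_IX86)
#define AGS_SIMD_TARGET_X86 1   // hypothetical, for illustration only
#elif defined(__aarch64__) || defined(_M_ARM64)
#define AGS_SIMD_TARGET_NEON 1  // hypothetical, for illustration only
#endif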

Changed paths:
    engines/ags/lib/allegro/surface_simd_neon.cpp
    engines/ags/lib/allegro/surface_simd_neon.h
    engines/ags/lib/allegro/surface_simd_sse.cpp
    engines/ags/lib/allegro/surface_simd_sse.h
    engines/ags/module.mk


diff --git a/engines/ags/lib/allegro/surface_simd_neon.cpp b/engines/ags/lib/allegro/surface_simd_neon.cpp
index 59033291de6..0b3a6ce8531 100644
--- a/engines/ags/lib/allegro/surface_simd_neon.cpp
+++ b/engines/ags/lib/allegro/surface_simd_neon.cpp
@@ -1,4 +1,4 @@
-#ifdef __aarch64__
+#if defined(__aarch64__) || defined(_M_ARM64)
 
 #include "ags/lib/allegro/gfx.h"
 #include "ags/lib/allegro/color.h"
diff --git a/engines/ags/lib/allegro/surface_simd_neon.h b/engines/ags/lib/allegro/surface_simd_neon.h
index d2e3bb6911c..beeb287e4c8 100644
--- a/engines/ags/lib/allegro/surface_simd_neon.h
+++ b/engines/ags/lib/allegro/surface_simd_neon.h
@@ -20,13 +20,14 @@
  */
 #ifndef AGS_LIB_ALLEGRO_SURFACE_SIMD_NEON_H
 #define AGS_LIB_ALLEGRO_SURFACE_SIMD_NEON_H
-#ifdef __aarch64__
+#if defined(__aarch64__) || defined(_M_ARM64)
 
 #ifndef AGS_LIB_ALLEGRO_SURFACE_SIMD_IMPL
 #define AGS_LIB_ALLEGRO_SURFACE_SIMD_IMPL
 #endif
 
 #include <arm_neon.h>
+#include "ags/globals.h"
 #include "ags/lib/allegro/surface.h"
 
 namespace AGS3 {
diff --git a/engines/ags/lib/allegro/surface_simd_sse.cpp b/engines/ags/lib/allegro/surface_simd_sse.cpp
index 208e8254381..7d03d84efc9 100644
--- a/engines/ags/lib/allegro/surface_simd_sse.cpp
+++ b/engines/ags/lib/allegro/surface_simd_sse.cpp
@@ -1,4 +1,4 @@
-#if defined(__x86_64__) || defined(__i686__)
+#if defined(__x86_64__) || defined(__i686__) || defined(_M_X86) || defined(_M_X64)
 
 #include "ags/lib/allegro/gfx.h"
 #include "ags/lib/allegro/color.h"
diff --git a/engines/ags/lib/allegro/surface_simd_sse.h b/engines/ags/lib/allegro/surface_simd_sse.h
index 68fcf71245c..68b58e8761d 100644
--- a/engines/ags/lib/allegro/surface_simd_sse.h
+++ b/engines/ags/lib/allegro/surface_simd_sse.h
@@ -20,13 +20,14 @@
  */
 #ifndef AGS_LIB_ALLEGRO_SURFACE_SIMD_SSE_H
 #define AGS_LIB_ALLEGRO_SURFACE_SIMD_SSE_H
-#if defined(__x86_64__) || defined(__i686__)
+#if defined(__x86_64__) || defined(__i686__) || defined(_M_X86) || defined(_M_X64)
 
 #ifndef AGS_LIB_ALLEGRO_SURFACE_SIMD_IMPL
 #define AGS_LIB_ALLEGRO_SURFACE_SIMD_IMPL
 #endif
 
 #include <immintrin.h>
+#include "ags/globals.h"
 #include "ags/lib/allegro/surface.h"
 
 namespace AGS3 {
diff --git a/engines/ags/module.mk b/engines/ags/module.mk
index 38af18ef161..35fddd6ef2d 100644
--- a/engines/ags/module.mk
+++ b/engines/ags/module.mk
@@ -26,6 +26,7 @@ MODULE_OBJS = \
 	lib/allegro/surface.o \
 	lib/allegro/surface_simd_neon.o \
 	lib/allegro/surface_simd_sse.o \
+	lib/allegro/surface_simd_none.o \
 	lib/allegro/system.o \
 	lib/allegro/unicode.o \
 	lib/std/std.o \


Commit: b3681c5cb3438184884435118a2876aeda928cd7
    https://github.com/scummvm/scummvm/commit/b3681c5cb3438184884435118a2876aeda928cd7
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: Making MSVC and iOS compilers happy

Made it so that iOS doesn't use Arm NEON, since it only supports a very
limited set of instructions (it apparently doesn't even have intrinsics
for something as simple as bit shifting). I also changed every float
literal in surface_simd_sse from a double literal to a float literal
because Windows x64 was complaining about it.

Changed paths:
    engines/ags/lib/allegro/surface_simd_neon.cpp
    engines/ags/lib/allegro/surface_simd_neon.h
    engines/ags/lib/allegro/surface_simd_sse.cpp
    engines/ags/lib/allegro/surface_simd_sse.h


diff --git a/engines/ags/lib/allegro/surface_simd_neon.cpp b/engines/ags/lib/allegro/surface_simd_neon.cpp
index 0b3a6ce8531..e9d9ac76365 100644
--- a/engines/ags/lib/allegro/surface_simd_neon.cpp
+++ b/engines/ags/lib/allegro/surface_simd_neon.cpp
@@ -1,4 +1,6 @@
-#if defined(__aarch64__) || defined(_M_ARM64)
+#include "ags/lib/allegro/surface_simd_neon.h"
+
+#ifdef AGS_LIB_ALLEGRO_SURFACE_SIMD_NEON_IMPL
 
 #include "ags/lib/allegro/gfx.h"
 #include "ags/lib/allegro/color.h"
@@ -8,8 +10,6 @@
 #include "common/textconsole.h"
 #include "graphics/screen.h"
 
-#include "ags/lib/allegro/surface_simd_neon.h"
-
 namespace AGS3 {
 
 // This template handles 2bpp and 4bpp, the other specializations handle 1bpp and format conversion blits
diff --git a/engines/ags/lib/allegro/surface_simd_neon.h b/engines/ags/lib/allegro/surface_simd_neon.h
index beeb287e4c8..199093212bc 100644
--- a/engines/ags/lib/allegro/surface_simd_neon.h
+++ b/engines/ags/lib/allegro/surface_simd_neon.h
@@ -20,11 +20,18 @@
  */
 #ifndef AGS_LIB_ALLEGRO_SURFACE_SIMD_NEON_H
 #define AGS_LIB_ALLEGRO_SURFACE_SIMD_NEON_H
-#if defined(__aarch64__) || defined(_M_ARM64)
+#ifdef __APPLE__ // Appeasing iOS
+#include <TargetConditionals.h>
+#endif
+
+#if (defined(__aarch64__) || defined(_M_ARM64)) && (!defined(__APPLE__) || defined(TARGET_OS_MAC))
 
 #ifndef AGS_LIB_ALLEGRO_SURFACE_SIMD_IMPL
 #define AGS_LIB_ALLEGRO_SURFACE_SIMD_IMPL
 #endif
+#ifndef AGS_LIB_ALLEGRO_SURFACE_SIMD_NEON_IMPL
+#define AGS_LIB_ALLEGRO_SURFACE_SIMD_NEON_IMPL
+#endif
 
 #include <arm_neon.h>
 #include "ags/globals.h"
diff --git a/engines/ags/lib/allegro/surface_simd_sse.cpp b/engines/ags/lib/allegro/surface_simd_sse.cpp
index 7d03d84efc9..37075bfe31f 100644
--- a/engines/ags/lib/allegro/surface_simd_sse.cpp
+++ b/engines/ags/lib/allegro/surface_simd_sse.cpp
@@ -1,4 +1,5 @@
-#if defined(__x86_64__) || defined(__i686__) || defined(_M_X86) || defined(_M_X64)
+#include "ags/lib/allegro/surface_simd_sse.h"
+#ifdef AGS_LIB_ALLEGRO_SURFACE_SIMD_SSE_IMPL
 
 #include "ags/lib/allegro/gfx.h"
 #include "ags/lib/allegro/color.h"
@@ -8,8 +9,6 @@
 #include "common/textconsole.h"
 #include "graphics/screen.h"
 
-#include "ags/lib/allegro/surface_simd_sse.h"
-
 namespace AGS3 {
 
 inline uint32 extract32_idx0(__m128i x) {
diff --git a/engines/ags/lib/allegro/surface_simd_sse.h b/engines/ags/lib/allegro/surface_simd_sse.h
index 68b58e8761d..02a748b26c9 100644
--- a/engines/ags/lib/allegro/surface_simd_sse.h
+++ b/engines/ags/lib/allegro/surface_simd_sse.h
@@ -25,6 +25,9 @@
 #ifndef AGS_LIB_ALLEGRO_SURFACE_SIMD_IMPL
 #define AGS_LIB_ALLEGRO_SURFACE_SIMD_IMPL
 #endif
+#ifndef AGS_LIB_ALLEGRO_SURFACE_SIMD_SSE_IMPL
+#define AGS_LIB_ALLEGRO_SURFACE_SIMD_SSE_IMPL
+#endif
 
 #include <immintrin.h>
 #include "ags/globals.h"
@@ -154,13 +157,13 @@ inline __m128i rgbBlendSIMD(__m128i srcCols, __m128i destCols, __m128i alphas, b
 
 inline __m128i argbBlendSIMD(__m128i srcCols, __m128i destCols) {
 	__m128 srcA = _mm_cvtepi32_ps(_mm_srli_epi32(srcCols, 24));
-	srcA = _mm_mul_ps(srcA, _mm_set1_ps(1.0 / 255.0));
+	srcA = _mm_mul_ps(srcA, _mm_set1_ps(1.0f / 255.0f));
 	__m128 srcR = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(srcCols, 16), _mm_set1_epi32(0xff)));
 	__m128 srcG = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(srcCols, 8), _mm_set1_epi32(0xff)));
 	__m128 srcB = _mm_cvtepi32_ps(_mm_and_si128(srcCols, _mm_set1_epi32(0xff)));
 
 	__m128 destA = _mm_cvtepi32_ps(_mm_srli_epi32(destCols, 24));
-	destA = _mm_mul_ps(destA, _mm_set1_ps(1.0 / 255.0));
+	destA = _mm_mul_ps(destA, _mm_set1_ps(1.0f / 255.0f));
 	__m128 destR = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(destCols, 16), _mm_set1_epi32(0xff)));
 	__m128 destG = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(destCols, 8), _mm_set1_epi32(0xff)));
 	__m128 destB = _mm_cvtepi32_ps(_mm_and_si128(destCols, _mm_set1_epi32(0xff)));
@@ -204,13 +207,13 @@ inline __m128i blendTintSpriteSIMD(__m128i srcCols, __m128i destCols, __m128i al
 
 	// do the transformation (we don't actually need alpha at all)
 	__m128 ddr, ddg, ddb;
-	ddr = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(destCols, 16), _mm_set1_epi32(0xff))), _mm_set1_ps(1.0 / 255.0));
-	ddg = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(destCols, 8), _mm_set1_epi32(0xff))), _mm_set1_ps(1.0 / 255.0));
-	ddb = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(destCols, _mm_set1_epi32(0xff))), _mm_set1_ps(1.0 / 255.0));
+	ddr = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(destCols, 16), _mm_set1_epi32(0xff))), _mm_set1_ps(1.0f / 255.0f));
+	ddg = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(destCols, 8), _mm_set1_epi32(0xff))), _mm_set1_ps(1.0f / 255.0f));
+	ddb = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(destCols, _mm_set1_epi32(0xff))), _mm_set1_ps(1.0f / 255.0f));
 	__m128 ssr, ssg, ssb;
-	ssr = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(srcCols, 16), _mm_set1_epi32(0xff))), _mm_set1_ps(1.0 / 255.0));
-	ssg = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(srcCols, 8), _mm_set1_epi32(0xff))), _mm_set1_ps(1.0 / 255.0));
-	ssb = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(srcCols, _mm_set1_epi32(0xff))), _mm_set1_ps(1.0 / 255.0));
+	ssr = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(srcCols, 16), _mm_set1_epi32(0xff))), _mm_set1_ps(1.0f / 255.0f));
+	ssg = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(srcCols, 8), _mm_set1_epi32(0xff))), _mm_set1_ps(1.0f / 255.0f));
+	ssb = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(srcCols, _mm_set1_epi32(0xff))), _mm_set1_ps(1.0f / 255.0f));
 
 	// Get the maxes and mins (needed for HSV->RGB and vice versa)
 	__m128 dmaxes = _mm_max_ps(ddr, _mm_max_ps(ddg, ddb));
@@ -218,24 +221,24 @@ inline __m128i blendTintSpriteSIMD(__m128i srcCols, __m128i destCols, __m128i al
 	__m128 smins = _mm_min_ps(ssr, _mm_min_ps(ssg, ssb));
 
 	// This is here to stop from dividing by 0
-	const __m128 eplison0 = _mm_set1_ps(0.0000001);
+	const __m128 eplison0 = _mm_set1_ps(0.0000001f);
 
 	__m128 chroma = _mm_max_ps(_mm_sub_ps(smaxes, smins), eplison0);
 
 	// RGB to HSV is a piecewise function, so we compute each part of the function first...
 	__m128 hr, hg, hb, hue;
 	hr = _mm_div_ps(_mm_sub_ps(ssg, ssb), chroma);
-	hr = _mm_sub_ps(hr, _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_mul_ps(hr, _mm_set1_ps(1.0 / 6.0)))), _mm_set1_ps(6.0)));
-	hr = _mm_add_ps(hr, _mm_and_ps(_mm_cmplt_ps(hr, _mm_setzero_ps()), _mm_set1_ps(6.0)));
-	hg = _mm_add_ps(_mm_div_ps(_mm_sub_ps(ssb, ssr), chroma), _mm_set1_ps(2.0));
+	hr = _mm_sub_ps(hr, _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_mul_ps(hr, _mm_set1_ps(1.0f / 6.0f)))), _mm_set1_ps(6.0f)));
+	hr = _mm_add_ps(hr, _mm_and_ps(_mm_cmplt_ps(hr, _mm_setzero_ps()), _mm_set1_ps(6.0f)));
+	hg = _mm_add_ps(_mm_div_ps(_mm_sub_ps(ssb, ssr), chroma), _mm_set1_ps(2.0f));
 	hg = _mm_max_ps(hg, _mm_setzero_ps());
-	hb = _mm_add_ps(_mm_div_ps(_mm_sub_ps(ssr, ssg), chroma), _mm_set1_ps(4.0));
+	hb = _mm_add_ps(_mm_div_ps(_mm_sub_ps(ssr, ssg), chroma), _mm_set1_ps(4.0f));
 	hb = _mm_max_ps(hb, _mm_setzero_ps());
 
 	// And then compute which one will be used based on criteria
-	__m128 hrfactors = _mm_and_ps(_mm_and_ps(_mm_cmpeq_ps(ssr, smaxes), _mm_cmpneq_ps(ssr, ssb)), _mm_set1_ps(1.0));
-	__m128 hgfactors = _mm_and_ps(_mm_and_ps(_mm_cmpeq_ps(ssg, smaxes), _mm_cmpneq_ps(ssg, ssr)), _mm_set1_ps(1.0));
-	__m128 hbfactors = _mm_and_ps(_mm_and_ps(_mm_cmpeq_ps(ssb, smaxes), _mm_cmpneq_ps(ssb, ssg)), _mm_set1_ps(1.0));
+	__m128 hrfactors = _mm_and_ps(_mm_and_ps(_mm_cmpeq_ps(ssr, smaxes), _mm_cmpneq_ps(ssr, ssb)), _mm_set1_ps(1.0f));
+	__m128 hgfactors = _mm_and_ps(_mm_and_ps(_mm_cmpeq_ps(ssg, smaxes), _mm_cmpneq_ps(ssg, ssr)), _mm_set1_ps(1.0f));
+	__m128 hbfactors = _mm_and_ps(_mm_and_ps(_mm_cmpeq_ps(ssb, smaxes), _mm_cmpneq_ps(ssb, ssg)), _mm_set1_ps(1.0f));
 	hue = _mm_mul_ps(hr, hrfactors);
 	hue = _mm_add_ps(hue, _mm_mul_ps(hg, hgfactors));
 	hue = _mm_add_ps(hue, _mm_mul_ps(hb, hbfactors));
@@ -243,7 +246,7 @@ inline __m128i blendTintSpriteSIMD(__m128i srcCols, __m128i destCols, __m128i al
 	// Mess with the light like the original function
 	__m128 val = dmaxes;
 	if (light) {
-		val = _mm_sub_ps(val, _mm_sub_ps(_mm_set1_ps(1.0), _mm_mul_ps(_mm_cvtepi32_ps(alphas), _mm_set1_ps(1.0 / 250.0))));
+		val = _mm_sub_ps(val, _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(_mm_cvtepi32_ps(alphas), _mm_set1_ps(1.0f / 250.0f))));
 		val = _mm_max_ps(val, _mm_setzero_ps());
 	}
 		
@@ -251,13 +254,13 @@ inline __m128i blendTintSpriteSIMD(__m128i srcCols, __m128i destCols, __m128i al
 	// the hue and saturation come from the source (tint) color, and the value comes from
 	// the destination (real source) color
 	chroma = _mm_mul_ps(val, _mm_div_ps(_mm_sub_ps(smaxes, smins), _mm_add_ps(smaxes, eplison0)));
-	__m128 hprime_mod2 = _mm_mul_ps(hue, _mm_set1_ps(1.0 / 2.0));
-	hprime_mod2 = _mm_mul_ps(_mm_sub_ps(hprime_mod2, _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_sub_ps(hprime_mod2, _mm_set1_ps(0.5))))), _mm_set1_ps(2.0));
+	__m128 hprime_mod2 = _mm_mul_ps(hue, _mm_set1_ps(1.0f / 2.0f));
+	hprime_mod2 = _mm_mul_ps(_mm_sub_ps(hprime_mod2, _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_sub_ps(hprime_mod2, _mm_set1_ps(0.5))))), _mm_set1_ps(2.0f));
 	__m128 x = _mm_mul_ps(chroma, _mm_sub_ps(_mm_set1_ps(1), _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)), _mm_sub_ps(hprime_mod2, _mm_set1_ps(1)))));
-	//float32x4_t x = vmulq_f32(chroma, vsubq_f32(vmovq_n_f32(1.0), vabsq_f32(vsubq_f32(hprime_mod2, vmovq_n_f32(1.0)))));
+	//float32x4_t x = vmulq_f32(chroma, vsubq_f32(vmovq_n_f32(1.0f), vabsq_f32(vsubq_f32(hprime_mod2, vmovq_n_f32(1.0f)))));
 	__m128i hprime_rounded = _mm_cvtps_epi32(_mm_sub_ps(hue, _mm_set1_ps(0.5)));
-	__m128i x_int = _mm_cvtps_epi32(_mm_mul_ps(x, _mm_set1_ps(255.0)));
-	__m128i c_int = _mm_cvtps_epi32(_mm_mul_ps(chroma, _mm_set1_ps(255.0)));
+	__m128i x_int = _mm_cvtps_epi32(_mm_mul_ps(x, _mm_set1_ps(255.0f)));
+	__m128i c_int = _mm_cvtps_epi32(_mm_mul_ps(chroma, _mm_set1_ps(255.0f)));
 
 	// Again HSV->RGB is also a piecewise function
 	__m128i val0 = _mm_or_si128(_mm_slli_epi32(x_int, 8), _mm_slli_epi32(c_int, 16));
@@ -277,7 +280,7 @@ inline __m128i blendTintSpriteSIMD(__m128i srcCols, __m128i destCols, __m128i al
 	__m128i final = _mm_or_si128(val0, _mm_or_si128(val1, _mm_or_si128(val2, _mm_or_si128(val3, _mm_or_si128(val4, val5)))));
 
 	// add the minimums back in
-	__m128i val_add = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(val, chroma), _mm_set1_ps(255.0)));
+	__m128i val_add = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(val, chroma), _mm_set1_ps(255.0f)));
 	val_add = _mm_or_si128(val_add, _mm_or_si128(_mm_slli_epi32(val_add, 8), _mm_or_si128(_mm_slli_epi32(val_add, 16), _mm_and_si128(destCols, _mm_set1_epi32(0xff000000)))));
 	final = _mm_add_epi32(final, val_add);
 	return final;


Commit: 9c11912da9a6b217d31c9f45c1dbfde44d812503
    https://github.com/scummvm/scummvm/commit/9c11912da9a6b217d31c9f45c1dbfde44d812503
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: Not using Arm NEON on iOS sim. or Arm Windows

Not all of the Arm NEON intrinsics are included in the iOS simulator's
arm_neon.h file, so we just don't compile Arm NEON for the simulator
anymore. Also, arm_neon.h on Windows seems to be just an empty header, or
at least a header with only a few of the many intrinsics that should be
there.

Changed paths:
    engines/ags/lib/allegro/surface_simd_neon.h


diff --git a/engines/ags/lib/allegro/surface_simd_neon.h b/engines/ags/lib/allegro/surface_simd_neon.h
index 199093212bc..f81cb2453fc 100644
--- a/engines/ags/lib/allegro/surface_simd_neon.h
+++ b/engines/ags/lib/allegro/surface_simd_neon.h
@@ -24,7 +24,8 @@
 #include <TargetConditionals.h>
 #endif
 
-#if (defined(__aarch64__) || defined(_M_ARM64)) && (!defined(__APPLE__) || defined(TARGET_OS_MAC))
+#if !defined(TARGET_OS_SIMULATOR) || TARGET_OS_SIMULATOR != 1 // Appeasing the iOS/iPhone simulator?
+#if defined(__aarch64__)
 
 #ifndef AGS_LIB_ALLEGRO_SURFACE_SIMD_IMPL
 #define AGS_LIB_ALLEGRO_SURFACE_SIMD_IMPL
@@ -471,4 +472,5 @@ inline void drawPixelSIMD2Bpp(byte *destPtr, const byte *srcP2, uint16x8_t tint,
 } // namespace AGS3
 
 #endif /* __aarch64__ */
+#endif /* Make it so that IOS and IPHONE SIM are not used with NEON */
 #endif /* AGS_LIB_ALLEGRO_SURFACE_SIMD_NEON */


Commit: 33cb39c2e8f68a6f72720aee3ef9e6abcea37fec
    https://github.com/scummvm/scummvm/commit/33cb39c2e8f68a6f72720aee3ef9e6abcea37fec
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: Appeasing iOS compiler again

Changed paths:
    engines/ags/lib/allegro/surface_simd_neon.h


diff --git a/engines/ags/lib/allegro/surface_simd_neon.h b/engines/ags/lib/allegro/surface_simd_neon.h
index f81cb2453fc..0aa98fad831 100644
--- a/engines/ags/lib/allegro/surface_simd_neon.h
+++ b/engines/ags/lib/allegro/surface_simd_neon.h
@@ -25,6 +25,7 @@
 #endif
 
 #if !defined(TARGET_OS_SIMULATOR) || TARGET_OS_SIMULATOR != 1 // Appeasing the iOS/iPhone simulator?
+#if !defined(TARGET_OS_IPHONE) || TARGET_OS_IPHONE != 1 // Appeasing the iOS/iPhone simulator?
 #if defined(__aarch64__)
 
 #ifndef AGS_LIB_ALLEGRO_SURFACE_SIMD_IMPL
@@ -473,4 +474,5 @@ inline void drawPixelSIMD2Bpp(byte *destPtr, const byte *srcP2, uint16x8_t tint,
 
 #endif /* __aarch64__ */
 #endif /* Make it so that IOS and IPHONE SIM are not used with NEON */
+#endif /* Make it so that IOS and IPHONE SIM are not used with NEON */
 #endif /* AGS_LIB_ALLEGRO_SURFACE_SIMD_NEON */


Commit: ff72736c499920b9ab895202d5dfb514d3cc065e
    https://github.com/scummvm/scummvm/commit/ff72736c499920b9ab895202d5dfb514d3cc065e
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: PowerPC Altivec: initial support

Changed paths:
  A engines/ags/lib/allegro/surface_simd_ppc.cpp
  A engines/ags/lib/allegro/surface_simd_ppc.h
    engines/ags/globals.cpp
    engines/ags/module.mk


diff --git a/engines/ags/globals.cpp b/engines/ags/globals.cpp
index 14257f6fb6c..65db6f79d5a 100644
--- a/engines/ags/globals.cpp
+++ b/engines/ags/globals.cpp
@@ -128,6 +128,8 @@ static bool checkForSIMDExtensions() {
 #  endif
 #elif defined(__aarch64__)
 	return true;
+#elif defined(__powerpc__)
+	return __builtin_cpu_supports("altivec");
 #else
 	return false;
 #endif
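__builtin_cpu_supports is a GCC/Clang builtin that queries the CPU the program is actually running on, rather than the compile target, which is why it is used here for AltiVec. The same builtin covers x86 feature levels; a small sketch (assuming GCC or Clang on x86, separate from the engine's own detector above):

static bool cpuHasSse2AndAvx2() {
#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
	__builtin_cpu_init();                 // safe to call more than once
	return __builtin_cpu_supports("sse2") && __builtin_cpu_supports("avx2");
#else
	return false;
#endif
}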
diff --git a/engines/ags/lib/allegro/surface_simd_ppc.cpp b/engines/ags/lib/allegro/surface_simd_ppc.cpp
new file mode 100644
index 00000000000..b07c35429e0
--- /dev/null
+++ b/engines/ags/lib/allegro/surface_simd_ppc.cpp
@@ -0,0 +1,505 @@
+#include "ags/lib/allegro/surface_simd_ppc.h"
+
+#ifdef AGS_LIB_ALLEGRO_SURFACE_SIMD_PPC_IMPL
+
+#include "ags/lib/allegro/gfx.h"
+#include "ags/lib/allegro/color.h"
+#include "ags/lib/allegro/flood.h"
+#include "ags/ags.h"
+#include "ags/globals.h"
+#include "common/textconsole.h"
+#include "graphics/screen.h"
+
+namespace AGS3 {
+
+// This template handles 2bpp and 4bpp, the other specializations handle 1bpp and format conversion blits
+template<int DestBytesPerPixel, int SrcBytesPerPixel, int ScaleThreshold>
+void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
+	const int xDir = horizFlip ? -1 : 1;
+	byte rSrc, gSrc, bSrc, aSrc;
+	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
+	vector unsigned int tint = vec_sl(vec_splat_u32(srcAlpha), 24);
+	tint = vec_or(tint, vec_sl(vec_splat_u32(tintRed), 16));
+	tint = vec_or(tint, vec_sl(vec_splat_u32(tintGreen), 8));
+	tint = vec_or(tint, vec_splat_u32(tintBlue));
+	vector unsigned int maskedAlphas = vec_splat_u32(&alphaMask);
+	vector unsigned int transColors = vec_splat_u32(&transColor);
+	vector unsigned int alphas = vec_splat_u32(&srcAlpha);
+
+	// This is so that we can calculate what pixels to crop off in a vectorized way
+	vector unsigned int addIndexes = {0, 1, 2, 3};
+	if (horizFlip) addIndexes = {3, 2, 1, 0};
+
+	// This is so that we can calculate in parallel the pixel indexes for scaled drawing
+	vector unsigned int scaleAdds = {0, (uint32)scaleX, (uint32)scaleX*2, (uint32)scaleX*3};
+
+	// Clip the bounds ahead of time (so we don't waste time checking if we are in bounds when
+	// we are in the inner loop)
+	int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = dstRect.width();
+	if (xStart + xCtrWidth > destArea.w) {
+		xCtrWidth = destArea.w - xStart;
+	}
+	if (xStart < 0) {
+		xCtrStart = -xStart;
+		xCtrBppStart = xCtrStart * SrcBytesPerPixel;
+		xStart = 0;
+	}
+	int destY = yStart, srcYCtr = 0, yCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 4 == 0) ? dstRect.height() : (dstRect.height() - 1);
+	if (ScaleThreshold != 0) yCtrHeight = dstRect.height();
+	if (yStart < 0) {
+		yCtr = -yStart;
+		destY = 0;
+		if (ScaleThreshold != 0) {
+			scaleYCtr = yCtr * scaleY;
+			srcYCtr = scaleYCtr / ScaleThreshold;
+		}
+	}
+	if (yStart + yCtrHeight > destArea.h) {
+		yCtrHeight = destArea.h - yStart;
+	}
+	
+	byte *destP = (byte *)destArea.getBasePtr(0, destY);
+	const byte *srcP = (const byte *)src.getBasePtr(
+	                       horizFlip ? srcArea.right - 4 : srcArea.left,
+	                       vertFlip ? srcArea.bottom - 1 - yCtr : srcArea.top + yCtr);
+	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += scaleY) {
+		vector unsigned int xCtrWidthSIMD = vec_splat_u32(xCtrWidth); // This is the width of the row
+
+		if (ScaleThreshold == 0) {
+			// If we are not scaling the image
+			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
+				byte *destPtr = &destP[destX * DestBytesPerPixel];
+				// Skip pixels that are beyond the row
+				vector unsigned int skipMask = vec_cmpeq(vec_add(vec_splat_u32(xCtr), addIndexes), xCtrWidthSIMD);
+				//drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
+			}
+			// Goto next row in source and destination image
+			destP += destArea.pitch;
+			srcP += vertFlip ? -src.pitch : src.pitch;
+		} else {
+			// Here we are scaling the image
+			int newSrcYCtr = scaleYCtr / ScaleThreshold;
+			// Since the source yctr might not update every row of the destination, we have
+			// to see if we are on a new row...
+			if (srcYCtr != newSrcYCtr) {
+				int diffSrcYCtr = newSrcYCtr - srcYCtr; // Have we moved yet
+				srcP += src.pitch * diffSrcYCtr;
+				srcYCtr = newSrcYCtr;
+			}
+
+			// Now also since we might skip a pixel or 2 or duplicate one to reach the desired
+			// scaling size, we create a small dummy buffer that we copy the pixels into and then
+			// call the drawPixelsSIMD function
+			byte srcBuffer[4*4];
+			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart, scaleXCtr = xCtrStart * scaleX; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
+				if (yCtr + 1 == yCtrHeight && xCtr + 4 > xCtrWidth) break; // Don't go past the last 4 pixels
+				vector unsigned int indexes = vec_splat_u32(scaleXCtr);
+#if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
+				// Calculate in parallel the indexes of the pixels
+				if (SrcBytesPerPixel == 4)
+					indexes = vec_sl(vec_sr(vec_add(indexes, scaleAdds), 8), 2);
+				else
+					indexes = vec_sl(vec_sr(vec_add(indexes, scaleAdds), 8), 1);
+#else
+#error Change code to allow different scale threshold!
+#endif
+				// Simply memcpy them in. memcpy has no real performance overhead here
+				memcpy(&srcBuffer[0*(uintptr_t)SrcBytesPerPixel], srcP + indexes[0], SrcBytesPerPixel);
+				memcpy(&srcBuffer[1*(uintptr_t)SrcBytesPerPixel], srcP + indexes[1], SrcBytesPerPixel);
+				memcpy(&srcBuffer[2*(uintptr_t)SrcBytesPerPixel], srcP + indexes[2], SrcBytesPerPixel);
+				memcpy(&srcBuffer[3*(uintptr_t)SrcBytesPerPixel], srcP + indexes[3], SrcBytesPerPixel);
+				scaleXCtr += scaleX*4;
+
+				// Now this is pretty much the same as before with non-scaled code, except that we use
+				// our dummy source buffer instead of the actual source bitmap
+				byte *destPtr = &destP[destX * (uintptr_t)DestBytesPerPixel];
+				vector unsigned int skipMask = vec_cmpeq(vec_add(vec_splat_u32(xCtr), addIndexes), xCtrWidthSIMD);
+				//drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, (const byte *)srcBuffer, tint, alphas, maskedAlphas, transColors, 1, 0, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
+			}
+			// We calculate every row here except the last (because then we need to
+			// check whether we fall off the edge of the row)
+			// The only exception here is scaled drawing; this is because:
+			// 1) if statements are costly, and the fewer we do the faster this loop is
+			// 2) with this, the only branch in the normal drawing loop is the width check
+			// 3) the scaling code will actually draw until the last 4 pixels of the image
+			//    and do the extra if checks because the scaling code is already much slower
+			//    than the normal drawing loop, and less duplicate code helps here.
+			if (yCtr + 1 != yCtrHeight) destP += destArea.pitch;
+		}
+	}
+
+	// Get the last x values of the last row
+	int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart;
+	// We have a picture that is a multiple of 4, so no extra pixels to draw
+	if (xCtrWidth % 4 == 0) return;
+	// Drawing the last few not scaled pixels here.
+	// Same as the loop above but now we check if we are going to overflow,
+	// and thus we don't need to mask out pixels that go over the row.
+	if (ScaleThreshold == 0) {
+		for (; xCtr + 4 < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
+			byte *destPtr = &destP[(ptrdiff_t)destX * DestBytesPerPixel];
+			//drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, vmovq_n_u32(0));
+		}
+		// Because we move in 4 pixel units, and horizFlip moves in 1, we have to move
+		// 1 pixel past the last pixel we did not blit, meaning going forward 3 pixels.
+		if (horizFlip) srcP += SrcBytesPerPixel * 3;
+	} else {
+		// So if we are scaling, set up the xCtr to what it was before (AKA the last 4 or so pixels of the image)
+		xCtr = xCtrWidth - xCtrWidth % 4;
+		xCtrBpp = xCtr * SrcBytesPerPixel;
+		destX = xStart+xCtr;
+	}
+
+	// For the last 4 pixels, we just do them in serial, nothing special
+	for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += SrcBytesPerPixel) {
+		const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
+		if (ScaleThreshold != 0) {
+			srcColPtr = (const byte *)(srcP + (xCtr * scaleX) / ScaleThreshold * SrcBytesPerPixel);
+		}
+		byte *destVal = (byte *)&destP[destX * DestBytesPerPixel];
+		uint32 srcCol = getColor(srcColPtr, SrcBytesPerPixel);
+		
+		// Check if this is a transparent color we should skip
+		if (skipTrans && ((srcCol & alphaMask) == transColor))
+			continue;
+
+		src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
+		if (srcAlpha != -1) {
+			if (useTint) {
+				rDest = rSrc;
+				gDest = gSrc;
+				bDest = bSrc;
+				aDest = aSrc;
+				rSrc = tintRed;
+				gSrc = tintGreen;
+				bSrc = tintBlue;
+				aSrc = srcAlpha;
+			}
+			blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, srcAlpha, useTint, destVal);
+			srcCol = format.ARGBToColor(aDest, rDest, gDest, bDest);
+		} else {
+			srcCol = format.ARGBToColor(aSrc, rSrc, gSrc, bSrc);
+		}
+		if (DestBytesPerPixel == 4)
+			*(uint32 *)destVal = srcCol;
+		else
+			*(uint16 *)destVal = srcCol;
+	}
+}
+
+template<int ScaleThreshold>
+void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
+	const int xDir = horizFlip ? -1 : 1;
+	byte rSrc, gSrc, bSrc, aSrc;
+	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
+	vector unsigned short tint = vec_splat_u16(src.format.ARGBToColor(srcAlpha, tintRed, tintGreen, tintBlue));
+	vector unsigned short transColors = vec_splat_u16(transColor);
+	vector unsigned short alphas = vec_splat_u16(srcAlpha);
+
+	// This is so that we can calculate what pixels to crop off in a vectorized way
+	vector unsigned short addIndexes = {0, 1, 2, 3, 4, 5, 6, 7};
+
+	// This is so that we can calculate in parallel the pixel indexes for scaled drawing
+	if (horizFlip) addIndexes = {7, 6, 5, 4, 3, 2, 1, 0};
+	vector unsigned int scaleAdds = {0, (uint32)scaleX, (uint32)scaleX*2, (uint32)scaleX*3};
+	vector unsigned int scaleAdds2 = {(uint32)scaleX*4, (uint32)scaleX*5, (uint32)scaleX*6, (uint32)scaleX*7};
+
+	// Clip the bounds ahead of time (so we don't waste time checking if we are in bounds when
+	// we are in the inner loop)
+	int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = dstRect.width();
+	if (xStart + xCtrWidth > destArea.w) {
+		xCtrWidth = destArea.w - xStart;
+	}
+	if (xStart < 0) {
+		xCtrStart = -xStart;
+		xCtrBppStart = xCtrStart * 2;
+		xStart = 0;
+	}
+	int destY = yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 8 == 0) ? dstRect.height() : (dstRect.height() - 1);
+	if (ScaleThreshold != 0) yCtrHeight = dstRect.height();
+	if (yStart < 0) {
+		yCtr = -yStart;
+		destY = 0;
+		if (ScaleThreshold != 0) {
+			scaleYCtr = yCtr * scaleY;
+			srcYCtr = scaleYCtr / ScaleThreshold;
+		}
+	}
+	if (yStart + yCtrHeight > destArea.h) {
+		yCtrHeight = destArea.h - yStart;
+	}
+	
+	byte *destP = (byte *)destArea.getBasePtr(0, destY);
+	const byte *srcP = (const byte *)src.getBasePtr(
+	                       horizFlip ? srcArea.right - 8 : srcArea.left,
+	                       vertFlip ? srcArea.bottom - 1 - yCtr : srcArea.top + yCtr);
+	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += scaleY) {
+		vector unsigned short xCtrWidthSIMD = vec_splat_u16(xCtrWidth); // This is the width of the row
+		if (ScaleThreshold == 0) {
+			// If we are not scaling the image
+			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
+				byte *destPtr = &destP[destX * 2];
+				// Skip pixels that are beyond the row
+				vector unsigned int skipMask = vec_cmpgt(vec_add(vec_add(vec_splat_u16(xCtr), addIndexes), vec_splat_u16(1)), xCtrWidthSIMD);
+				//drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
+			}
+			// Goto next row in source and destination image
+			destP += destArea.pitch;
+			srcP += vertFlip ? -src.pitch : src.pitch;
+		} else {
+			// Here we are scaling the image
+			int newSrcYCtr = scaleYCtr / ScaleThreshold;
+			// Since the source yctr might not update every row of the destination, we have
+			// to see if we are on a new row...
+			if (srcYCtr != newSrcYCtr) {
+				int diffSrcYCtr = newSrcYCtr - srcYCtr;
+				srcP += src.pitch * diffSrcYCtr;
+				srcYCtr = newSrcYCtr;
+			}
+
+			// Now also since we might skip a pixel or 2 or duplicate one to reach the desired
+			// scaling size, we create a small dummy buffer that we copy the pixels into and then
+			// call the drawPixelsSIMD function
+			uint16 srcBuffer[8];
+			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart, scaleXCtr = xCtrStart * scaleX; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
+				if (yCtr + 1 == yCtrHeight && xCtr + 8 > xCtrWidth) break;
+				vector unsigned int indexes = vec_splat_u32(scaleXCtr), indexes2 = vec_splat_u32(scaleXCtr);
+#if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
+				// Calculate in parallel the indexes of the pixels
+				indexes = vec_sl(vec_sr(vec_add(indexes, scaleAdds), 8), 1);
+				indexes2 = vec_sl(vec_sr(vec_add(indexes2, scaleAdds2), 8), 1);
+#else
+#error Change code to allow different scale threshold!
+#endif
+				// Simply memcpy them in. memcpy has no real performance overhead here
+				srcBuffer[0] = *(const uint16 *)(srcP + indexes[0]);
+				srcBuffer[1] = *(const uint16 *)(srcP + indexes[1]);
+				srcBuffer[2] = *(const uint16 *)(srcP + indexes[2]);
+				srcBuffer[3] = *(const uint16 *)(srcP + indexes[3]);
+				srcBuffer[4] = *(const uint16 *)(srcP + indexes2[0]);
+				srcBuffer[5] = *(const uint16 *)(srcP + indexes2[1]);
+				srcBuffer[6] = *(const uint16 *)(srcP + indexes2[2]);
+				srcBuffer[7] = *(const uint16 *)(srcP + indexes2[3]);
+				scaleXCtr += scaleX*8;
+
+				// Now this is pretty much the same as before with non-scaled code, except that we use
+				// our dummy source buffer instead of the actual source bitmap
+				byte *destPtr = &destP[destX * 2];
+				vector unsigned int skipMask = vec_cmpgt(vec_add(vec_add(vec_splat_u16(xCtr), addIndexes), vec_splat_u16(1)), xCtrWidthSIMD);
+				//drawPixelSIMD2Bpp(destPtr, (const byte *)srcBuffer, tint, alphas, transColors, 1, 0, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
+			}
+			// We calculate every row here except the last (because then we need to
+			// check whether we fall off the edge of the row)
+			// The only exception here is scaled drawing; this is because:
+			// 1) if statements are costly, and the fewer we do the faster this loop is
+			// 2) with this, the only branch in the normal drawing loop is the width check
+			// 3) the scaling code will actually draw until the last 4 pixels of the image
+			//    and do the extra if checks because the scaling code is already much slower
+			//    than the normal drawing loop, and less duplicate code helps here.
+			if (yCtr + 1 != yCtrHeight) destP += destArea.pitch;
+		}
+	}
+
+	// We have a picture that is a multiple of 8, so no extra pixels to draw
+	if (xCtrWidth % 8 == 0) return;
+	// Get the last x values of the last row
+	int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart;
+	// Drawing the last few not scaled pixels here.
+	// Same as the loop above but now we check if we are going to overflow,
+	// and thus we don't need to mask out pixels that go over the row.
+	if (ScaleThreshold == 0) {
+		for (; xCtr + 8 < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
+			byte *destPtr = &destP[destX * 2];
+			//drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, vmovq_n_u16(0));
+		}
+		// Because we move in 8 pixel units, and horizFlip moves in 1, we have to move
+		// 1 pixel past the last pixel we did not blit, meaning going forward 7 pixels.
+		if (horizFlip) srcP += 2 * 7;
+	} else {
+		// So if we are scaling, set up the xCtr to what it was before (AKA the last 8 or so pixels of the image)
+		xCtr = xCtrWidth - xCtrWidth % 8;
+		xCtrBpp = xCtr * 2;
+		destX = xStart+xCtr;
+	}
+
+	// For the last few pixels, we just do them in serial, nothing special
+	for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += 2) {
+		const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
+		if (ScaleThreshold != 0) {
+			srcColPtr = (const byte *)(srcP + (xCtr * scaleX) / ScaleThreshold * 2);
+		}
+		byte *destVal = (byte *)&destP[destX * 2];
+		uint32 srcCol = (uint32)(*(const uint16 *)srcColPtr);
+		
+		// Check if this is a transparent color we should skip
+		if (skipTrans && srcCol == transColor)
+			continue;
+
+		src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
+		if (srcAlpha != -1) {
+			if (useTint) {
+				rDest = rSrc;
+				gDest = gSrc;
+				bDest = bSrc;
+				aDest = aSrc;
+				rSrc = tintRed;
+				gSrc = tintGreen;
+				bSrc = tintBlue;
+				aSrc = srcAlpha;
+			}/* else {
+				format.colorToARGB((uint32)(*(uint16 *)destVal), aDest, rDest, gDest, bDest);
+			}*/
+			blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, srcAlpha, useTint, destVal);
+			srcCol = format.ARGBToColor(aDest, rDest, gDest, bDest);
+		} else {
+			srcCol = format.ARGBToColor(aSrc, rSrc, gSrc, bSrc);
+		}
+		*(uint16 *)destVal = srcCol;
+	}
+}
+
+template<int ScaleThreshold>
+void BITMAP::drawInner1Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
+	const int xDir = horizFlip ? -1 : 1;
+	vector unsigned char transColors = vec_splat_u8(&transColor);
+
+	// This is so that we can calculate in parallel the pixel indexes for scaled drawing
+	vector unsigned int scaleAdds1 = {0, (uint32)scaleX, (uint32)scaleX*2, (uint32)scaleX*3};
+	vector unsigned int scaleAdds2 = {(uint32)scaleX*4, (uint32)scaleX*5, (uint32)scaleX*6, (uint32)scaleX*7};
+	vector unsigned int scaleAdds3 = {(uint32)scaleX*8, (uint32)scaleX*9, (uint32)scaleX*10, (uint32)scaleX*11};
+	vector unsigned int scaleAdds4 = {(uint32)scaleX*12, (uint32)scaleX*13, (uint32)scaleX*14, (uint32)scaleX*15};
+	
+	// Clip the bounds ahead of time (so we don't waste time checking if we are in bounds when
+	// we are in the inner loop)
+	int xCtrStart = 0, xCtrWidth = dstRect.width();
+	if (xStart + xCtrWidth > destArea.w) {
+		xCtrWidth = destArea.w - xStart;
+	}
+	if (xStart < 0) {
+		xCtrStart = -xStart;
+		xStart = 0;
+	}
+	int destY = yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = dstRect.height();
+	if (ScaleThreshold != 0) yCtrHeight = dstRect.height();
+	if (yStart < 0) {
+		yCtr = -yStart;
+		destY = 0;
+		if (ScaleThreshold != 0) {
+			scaleYCtr = yCtr * scaleY;
+			srcYCtr = scaleYCtr / ScaleThreshold;
+		}
+	}
+	if (yStart + yCtrHeight > destArea.h) {
+		yCtrHeight = destArea.h - yStart;
+	}
+	
+	byte *destP = (byte *)destArea.getBasePtr(0, destY);
+	const byte *srcP = (const byte *)src.getBasePtr(
+	                       horizFlip ? srcArea.right - 16 : srcArea.left,
+	                       vertFlip ? srcArea.bottom - 1 - yCtr : srcArea.top + yCtr);
+	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += scaleY) {
+		if (ScaleThreshold != 0) {
+			// So here we update the srcYCtr differently due to this being for
+			// scaling
+			int newSrcYCtr = scaleYCtr / ScaleThreshold;
+			if (srcYCtr != newSrcYCtr) {
+				// Since the source yctr might not update every row of the destination, we have
+				// to see if we are on a new row...
+				int diffSrcYCtr = newSrcYCtr - srcYCtr;
+				srcP += src.pitch * diffSrcYCtr;
+				srcYCtr = newSrcYCtr;
+			}
+		}
+		int xCtr = xCtrStart, destX = xStart, scaleXCtr = xCtrStart * scaleX;
+		for (; xCtr + 16 < xCtrWidth; destX += 16, xCtr += 16) {
+			byte *destPtr = &destP[destX];
+
+			// Here we don't use the drawPixelSIMD function because 1bpp bitmaps in allegro
+			// can't have any blending applied to them
+			vector unsigned char destCols;
+			memcpy(&destCols, destPtr, sizeof(destCols)); // There are no unaligned load instructions in AltiVec
+			vector unsigned char srcCols;
+			memcpy(&srcCols, srcP + xDir * xCtr, sizeof(srcCols));
+			if (ScaleThreshold != 0) {
+				// If we are scaling, we have to set each pixel individually
+				vector unsigned int indexes1 = vec_splat_u32(scaleXCtr), indexes2 = vec_splat_u32(scaleXCtr);
+				vector unsigned int indexes3 = vec_splat_u32(scaleXCtr), indexes4 = vec_splat_u32(scaleXCtr);
+#if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
+				indexes1 = vec_sr(vec_add(indexes1, scaleAdds1), 8);
+				indexes2 = vec_sr(vec_add(indexes2, scaleAdds2), 8);
+				indexes3 = vec_sr(vec_add(indexes3, scaleAdds3), 8);
+				indexes4 = vec_sr(vec_add(indexes4, scaleAdds4), 8);
+#else
+#error Change code to allow different scale threshold!
+#endif
+				srcCols[0] = srcP[indexes1[0]];
+				srcCols[1] = srcP[indexes1[1]];
+				srcCols[2] = srcP[indexes1[2]];
+				srcCols[3] = srcP[indexes1[3]];
+				srcCols[4] = srcP[indexes2[0]];
+				srcCols[5] = srcP[indexes2[1]];
+				srcCols[6] = srcP[indexes2[2]];
+				srcCols[7] = srcP[indexes2[3]];
+				srcCols[8] = srcP[indexes3[0]];
+				srcCols[9] = srcP[indexes3[1]];
+				srcCols[10] = srcP[indexes3[2]];
+				srcCols[11] = srcP[indexes3[3]];
+				srcCols[12] = srcP[indexes4[0]];
+				srcCols[13] = srcP[indexes4[1]];
+				srcCols[14] = srcP[indexes4[2]];
+				srcCols[15] = srcP[indexes4[3]];
+				scaleXCtr += scaleX*16;
+			}
+
+			// Mask out transparent pixels
+			vector unsigned char mask1 = skipTrans ? vec_cmpeq(srcCols, transColors) : vec_splat_u8(0);
+			vector unsigned char final = vec_or(vec_and(srcCols, vec_nor(mask1, vec_splat_u8(0))), vec_and(destCols, mask1));
+			if (horizFlip) {
+				final = (vector unsigned char){
+					final[15], final[14], final[13], final[12],
+					final[11], final[10], final[9], final[8],
+					final[7], final[6], final[5], final[4],
+					final[3], final[2], final[1], final[0],
+				};
+			}
+			memcpy(destPtr, &final, sizeof(final));
+		}
+		// Get the last x values
+
+		// Because we move in 16 pixel units, and horizFlip moves in 1, we have to move
+		// 1 pixel past the last pixel we did not blit, meaning going forward 15 pixels.
+		if (horizFlip) srcP += 15;
+		for (; xCtr < xCtrWidth; ++destX, ++xCtr, scaleXCtr += scaleX) {
+			const byte *srcCol = (const byte *)(srcP + xDir * xCtr);
+			if (ScaleThreshold != 0) {
+				srcCol = (const byte *)(srcP + scaleXCtr / ScaleThreshold);
+			}
+			// Check if this is a transparent color we should skip
+			if (skipTrans && *srcCol == transColor)
+				continue;
+
+			byte *destVal = (byte *)&destP[destX];
+			*destVal = *srcCol;
+		}
+		if (horizFlip) srcP -= 15; // Undo what we did up there
+		destP += destArea.pitch; // Goto next row
+		// Only advance the src row by 1 every time like this if we don't scale
+		if (ScaleThreshold == 0) srcP += vertFlip ? -src.pitch : src.pitch;
+	}
+}
+
+
+template void BITMAP::drawInner4BppWithConv<4, 4, 0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<4, 4, 0x100>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<4, 2, 0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<4, 2, 0x100>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<2, 4, 0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<2, 4, 0x100>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner2Bpp<0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner2Bpp<0x100>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner1Bpp<0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner1Bpp<0x100>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+
+} // namespace AGS3
+
+#endif
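
All three drawInner* routines above share the same shape: a vectorized main loop
that walks each row in blocks of 4, 8 or 16 pixels (masking the block that
straddles the row end), followed by a scalar tail for the leftover pixels. A
stripped-down sketch of that structure in plain C++, with no AltiVec and with
illustrative names only:

    #include <cstdint>

    // Copy one row in blocks of four pixels, then finish the remainder serially.
    // The real code blends instead of copying and masks the final partial block
    // rather than branching on it.
    void processRow(std::uint8_t *dst, const std::uint8_t *src, int width) {
    	int x = 0;
    	for (; x + 4 <= width; x += 4) {        // vector-width main loop
    		for (int i = 0; i < 4; ++i)
    			dst[x + i] = src[x + i];        // stands in for load/blend/store of 4 pixels
    	}
    	for (; x < width; ++x)                  // scalar tail: the last width % 4 pixels
    		dst[x] = src[x];
    }
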
diff --git a/engines/ags/lib/allegro/surface_simd_ppc.h b/engines/ags/lib/allegro/surface_simd_ppc.h
new file mode 100644
index 00000000000..95860b9d5fe
--- /dev/null
+++ b/engines/ags/lib/allegro/surface_simd_ppc.h
@@ -0,0 +1,471 @@
+/* ScummVM - Graphic Adventure Engine
+ *
+ * ScummVM is the legal property of its developers, whose names
+ * are too numerous to list here. Please refer to the COPYRIGHT
+ * file distributed with this source distribution.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#ifndef AGS_LIB_ALLEGRO_SURFACE_SIMD_PPC_H
+#define AGS_LIB_ALLEGRO_SURFACE_SIMD_PPC_H
+
+#if defined(__powerpc__)
+
+#ifndef AGS_LIB_ALLEGRO_SURFACE_SIMD_IMPL
+#define AGS_LIB_ALLEGRO_SURFACE_SIMD_IMPL
+#endif
+#ifndef AGS_LIB_ALLEGRO_SURFACE_SIMD_PPC_IMPL
+#define AGS_LIB_ALLEGRO_SURFACE_SIMD_PPC_IMPL
+#endif
+
+#include <altivec.h>
+#include "ags/globals.h"
+#include "ags/lib/allegro/surface.h"
+
+namespace AGS3 {
+
+/*inline uint32x4_t simd2BppTo4Bpp(uint16x4_t pixels) {
+	uint32x4_t x = vmovl_u16(pixels);
+
+	// c is the extracted 5/6 bit color from the image
+	uint32x4_t c = vshrq_n_u32(x, 11);
+
+	// We convert it back to normal by shifting it thrice over, naturally, and then using the 2 most
+	// significant bits in the original color for the least significant bits in the new one
+	uint32x4_t r = vshlq_n_u32(vorrq_u32(vshlq_n_u32(c, 3), vshrq_n_u32(c, 2)), 16);
+	c = vshrq_n_u32(vandq_u32(x, vmovq_n_u32(0x07e0)), 5);
+	uint32x4_t g = vshlq_n_u32(vorrq_u32(vshlq_n_u32(c, 2), vshrq_n_u32(c, 4)), 8);
+	c = vandq_u32(x, vmovq_n_u32(0x001f));
+	uint32x4_t b = vorrq_u32(vshlq_n_u32(c, 3), vshrq_n_u32(c, 2));
+
+	// By default 2bpp to 4bpp makes the alpha channel 255
+	return vorrq_u32(vorrq_u32(vorrq_u32(r, g), b), vmovq_n_u32(0xff000000));
+}
+
+inline uint16x4_t simd4BppTo2Bpp(uint32x4_t pixels) {
+	// x is the final 16 bit rgb pixel
+	uint32x4_t x = vshrq_n_u32(vandq_u32(pixels, vmovq_n_u32(0x000000ff)), 3);
+	x = vorrq_u32(x, vshlq_n_u32(vshrq_n_u32(vandq_u32(pixels, vmovq_n_u32(0x0000ff00)), 8+2), 5));
+	x = vorrq_u32(x, vshlq_n_u32(vshrq_n_u32(vandq_u32(pixels, vmovq_n_u32(0x00ff0000)), 16+3), 11));
+	return vmovn_u32(x);
+}
+
+inline uint16x8_t rgbBlendSIMD2Bpp(uint16x8_t srcCols, uint16x8_t destCols, uint16x8_t alphas) {
+	// Here we add 1 to alphas if it's 0. This is what the original blender function did
+	alphas = vaddq_u16(alphas, vandq_u16(vceqq_u16(alphas, vmovq_n_u16(0)), vmovq_n_u16(1)));
+
+	// Split the components into rgb
+	uint16x8_t srcComps[] = {
+		vandq_u16(srcCols, vmovq_n_u16(0x1f)),					// B
+		vandq_u16(vshrq_n_u16(srcCols, 5), vmovq_n_u16(0x3f)),	// G
+		vshrq_n_u16(srcCols, 11),								// R
+	}, destComps[] = {
+		vandq_u16(destCols, vmovq_n_u16(0x1f)),					// B
+		vandq_u16(vshrq_n_u16(destCols, 5), vmovq_n_u16(0x3f)), // G
+		vshrq_n_u16(destCols, 11),								// R
+	};
+
+	// At some point I made it so that it would put them into their 8bit depth format
+	// to keep the function as 1-1 with the original, but it didn't seem to help much
+	//srcComps[0] = vorrq_u16(vshlq_n_u16(srcComps[0], 3), vshrq_n_u16(srcComps[0], 2));
+	//srcComps[1] = vorrq_u16(vshlq_n_u16(srcComps[1], 2), vshrq_n_u16(srcComps[1], 4));
+	//srcComps[2] = vorrq_u16(vshlq_n_u16(srcComps[2], 3), vshrq_n_u16(srcComps[2], 2));
+	//destComps[0] = vorrq_u16(vshlq_n_u16(destComps[0], 3), vshrq_n_u16(destComps[0], 2));
+	//destComps[1] = vorrq_u16(vshlq_n_u16(destComps[1], 2), vshrq_n_u16(destComps[1], 4));
+	//destComps[2] = vorrq_u16(vshlq_n_u16(destComps[2], 3), vshrq_n_u16(destComps[2], 2));
+
+	// Calculate the differences between the colors
+	uint16x8_t diffs[] = {
+		vsubq_u16(srcComps[0], destComps[0]), // B
+		vsubq_u16(srcComps[1], destComps[1]), // G
+		vsubq_u16(srcComps[2], destComps[2]), // R
+	};
+
+	// Multiply by alpha and shift depth bits to the right
+	// pretty much the same as (int)(((float)component / 255.0f) * ((float)alpha / 255.0f) * 255.0f)
+	alphas = vshrq_n_u16(alphas, 2);
+	diffs[1] = vshrq_n_u16(vmulq_u16(diffs[1], alphas), 6);
+	alphas = vshrq_n_u16(alphas, 1);
+	diffs[0] = vshrq_n_u16(vmulq_u16(diffs[0], alphas), 5);
+	diffs[2] = vshrq_n_u16(vmulq_u16(diffs[2], alphas), 5);
+
+	// Originally, I converted it back to normal here from the 8bpp form, but don't need to do that anymore
+	//diffs[0] = vandq_u16(vshrq_n_u16(vaddq_u16(diffs[0], destComps[0]), 3), vmovq_n_u16(0x1f));
+	//diffs[1] = vandq_u16(vshrq_n_u16(vaddq_u16(diffs[1], destComps[1]), 2), vmovq_n_u16(0x3f));
+	//diffs[2] = vandq_u16(vshrq_n_u16(vaddq_u16(diffs[2], destComps[2]), 3), vmovq_n_u16(0x1f));
+
+	// Here we add the difference between the 2 colors times alpha onto the destination
+	diffs[0] = vandq_u16(vaddq_u16(diffs[0], destComps[0]), vmovq_n_u16(0x1f));
+	diffs[1] = vandq_u16(vaddq_u16(diffs[1], destComps[1]), vmovq_n_u16(0x3f));
+	diffs[2] = vandq_u16(vaddq_u16(diffs[2], destComps[2]), vmovq_n_u16(0x1f));
+
+	// We compile all the colors into diffs[0] as a 16 bit rgb pixel
+	diffs[0] = vorrq_u16(diffs[0], vshlq_n_u16(diffs[1], 5));
+	return vorrq_u16(diffs[0], vshlq_n_u16(diffs[2], 11));
+}
+
+// preserveAlpha:
+//		false => set destCols's alpha to 0
+// 		true => keep destCols's alpha
+inline uint32x4_t rgbBlendSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas, bool preserveAlpha) {
+	// Here we add 1 to alphas if it's 0. This is what the original blender function did
+	alphas = vaddq_u32(alphas, vandq_u32(vcgtq_u32(alphas, vmovq_n_u32(0)), vmovq_n_u32(1)));
+
+	// Get the alpha from the destination
+	uint32x4_t alpha = vandq_u32(destCols, vmovq_n_u32(0xff000000));
+
+	// Get red and blue components
+	uint32x4_t srcColsCopy = srcCols;
+	srcColsCopy = vandq_u32(srcColsCopy, vmovq_n_u32(0xff00ff));
+	uint32x4_t destColsCopy = destCols;
+	destColsCopy = vandq_u32(destColsCopy, vmovq_n_u32(0xff00ff));
+
+	// compute the difference, then multiply by alpha and divide by 255
+	srcColsCopy = vsubq_u32(srcColsCopy, destColsCopy);
+	srcColsCopy = vmulq_u32(srcColsCopy, alphas);
+	srcColsCopy = vshrq_n_u32(srcColsCopy, 8);
+	srcColsCopy = vaddq_u32(srcColsCopy, destCols); // Add the new red/blue to the old ones
+
+	// do the same for the green component
+	srcCols = vandq_u32(srcCols, vmovq_n_u32(0xff00));
+	destCols = vandq_u32(destCols, vmovq_n_u32(0xff00));
+	srcCols = vsubq_u32(srcCols, destCols);
+	srcCols = vmulq_u32(srcCols, alphas);
+	srcCols = vshrq_n_u32(srcCols, 8);
+	srcCols = vaddq_u32(srcCols, destCols); // Add the new green to the old green
+
+	// keep values in 8bit range and glue red/blue and green together
+	srcColsCopy = vandq_u32(srcColsCopy, vmovq_n_u32(0xff00ff));
+	srcCols = vandq_u32(srcCols, vmovq_n_u32(0xff00));
+	srcCols = vorrq_u32(srcCols, srcColsCopy);
+
+	// Remember that alpha is not alphas, but rather the alpha of destCols
+	if (preserveAlpha) {
+		srcCols = vandq_u32(srcCols, vmovq_n_u32(0x00ffffff));
+		srcCols = vorrq_u32(srcCols, alpha);
+	}
+	return srcCols;
+}
+
+// uses the alpha from srcCols and destCols
+inline uint32x4_t argbBlendSIMD(uint32x4_t srcCols, uint32x4_t destCols) {
+	float16x4_t sAlphas = vcvt_f16_f32(vcvtq_f32_u32(vshrq_n_u32(srcCols, 24)));
+	sAlphas = vmul_n_f16(sAlphas, 1.0 / 255.0);
+
+	// sAlphas1 has the alphas of the first pixel in lanes 0 and 1 and of the second pixel in lanes 2 and 3
+	// same with sAlphas2 but for the 2nd pixel
+	float16x8_t sAlphas1 = vcombine_f16(vmov_n_f16(vduph_lane_f16(sAlphas, 0)), vmov_n_f16(vduph_lane_f16(sAlphas, 1)));
+	float16x8_t sAlphas2 = vcombine_f16(vmov_n_f16(vduph_lane_f16(sAlphas, 2)), vmov_n_f16(vduph_lane_f16(sAlphas, 3)));
+
+	// Same thing going on here with dAlphas, except that it gets multiplied by (1 - sAlpha) first
+	float16x4_t dAlphas = vcvt_f16_f32(vcvtq_f32_u32(vshrq_n_u32(destCols, 24)));
+	dAlphas = vmul_n_f16(dAlphas, 1.0 / 255.0);
+	dAlphas = vmul_f16(dAlphas, vsub_f16(vmov_n_f16(1.0), sAlphas));
+	float16x8_t dAlphas1 = vcombine_f16(vmov_n_f16(vduph_lane_f16(dAlphas, 0)), vmov_n_f16(vduph_lane_f16(dAlphas, 1)));
+	float16x8_t dAlphas2 = vcombine_f16(vmov_n_f16(vduph_lane_f16(dAlphas, 2)), vmov_n_f16(vduph_lane_f16(dAlphas, 3)));
+
+	// first 2 pixels
+	float16x8_t srcRgb1 = vcvtq_f16_u16(vmovl_u8(vreinterpret_u8_u32(vget_low_u32(srcCols))));
+	float16x8_t destRgb1 = vcvtq_f16_u16(vmovl_u8(vreinterpret_u8_u32(vget_low_u32(destCols))));
+	// last 2 pixels
+	float16x8_t srcRgb2 = vcvtq_f16_u16(vmovl_u8(vreinterpret_u8_u32(vget_high_u32(srcCols))));
+	float16x8_t destRgb2 = vcvtq_f16_u16(vmovl_u8(vreinterpret_u8_u32(vget_high_u32(destCols))));
+
+	// ((src * sAlpha) + (dest * dAlpha)) / (sAlpha + dAlpha)
+	srcRgb1 = vmulq_f16(srcRgb1, sAlphas1);
+	destRgb1 = vmulq_f16(destRgb1, dAlphas1);
+	srcRgb1 = vaddq_f16(srcRgb1, destRgb1);
+	float16x8_t alphasRec = vrecpeq_f16(vaddq_f16(sAlphas1, dAlphas1)); // compute reciprocal
+	srcRgb1 = vmulq_f16(srcRgb1, alphasRec);
+	srcRgb2 = vmulq_f16(srcRgb2, sAlphas2);
+	destRgb2 = vmulq_f16(destRgb2, dAlphas2);
+	srcRgb2 = vaddq_f16(srcRgb2, destRgb2);
+	alphasRec = vrecpeq_f16(vaddq_f16(sAlphas2, dAlphas2));
+	srcRgb2 = vmulq_f16(srcRgb2, alphasRec);
+
+	// alpha channel is computed differently
+	uint16x4_t alphas = vcvta_u16_f16(vmul_n_f16(vadd_f16(sAlphas, dAlphas), 255.0));
+
+	// Final argb components as 16bit values
+	uint16x8_t uintSrcRgb1 = vcvtq_u16_f16(srcRgb1), uintSrcRgb2 = vcvtq_u16_f16(srcRgb2);
+
+	// copy alpha channel over
+	uintSrcRgb1 = vcopyq_lane_u16(uintSrcRgb1, 3, alphas, 0);
+	uintSrcRgb1 = vcopyq_lane_u16(uintSrcRgb1, 7, alphas, 1);
+	uintSrcRgb2 = vcopyq_lane_u16(uintSrcRgb2, 3, alphas, 2);
+	uintSrcRgb2 = vcopyq_lane_u16(uintSrcRgb2, 7, alphas, 3);
+
+	// cast 16bit to 8bit and reinterpret as uint32's
+	return vcombine_u32(vreinterpret_u32_u8(vmovn_u16(uintSrcRgb1)), vreinterpret_u32_u8(vmovn_u16(uintSrcRgb2)));
+}
+
+inline uint32x4_t blendTintSpriteSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas, bool light) {
+	// This function is NOT 1 to 1 with the original... It just approximates it
+	// It gets the value of the HSV of the dest color
+	// Then it gets the HSV of the srcCols
+
+	// how the values are transformed
+	// from 1 uint32x4_t srcCols with each lane being ARGB uint32
+	// srcCols[0] = A | R | G | B
+	// srcCols[1] = A | R | G | B
+	// srcCols[2] = A | R | G | B
+	// srcCols[3] = A | R | G | B
+	//  ->
+	// to 4 float32x4_t's each being a separate channel with each lane
+	// corresponding to their respective srcCols lane
+	// dda = { A[0], A[1], A[2], A[3] }
+	// ddr = { R[0], R[1], R[2], R[3] }
+	// ddg = { G[0], G[1], G[2], G[3] }
+	// ddb = { B[0], B[1], B[2], B[3] }
+
+	// do the transformation (we don't actually need alpha at all)
+	float32x4_t ddr, ddg, ddb;
+	ddr = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(vshrq_n_u32(destCols, 16), vmovq_n_u32(0xff))), 1.0 / 255.0);
+	ddg = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(vshrq_n_u32(destCols, 8), vmovq_n_u32(0xff))), 1.0 / 255.0);
+	ddb = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(destCols, vmovq_n_u32(0xff))), 1.0 / 255.0);
+	float32x4_t ssr, ssg, ssb;
+	ssr = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(vshrq_n_u32(srcCols, 16), vmovq_n_u32(0xff))), 1.0 / 255.0);
+	ssg = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(vshrq_n_u32(srcCols, 8), vmovq_n_u32(0xff))), 1.0 / 255.0);
+	ssb = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(srcCols, vmovq_n_u32(0xff))), 1.0 / 255.0);
+
+	// Get the maxes and mins (needed for HSV->RGB and vice versa)
+	float32x4_t dmaxes = vmaxq_f32(ddr, vmaxq_f32(ddg, ddb));
+	float32x4_t smaxes = vmaxq_f32(ssr, vmaxq_f32(ssg, ssb));
+	float32x4_t smins = vminq_f32(ssr, vminq_f32(ssg, ssb));
+
+	// This is here to stop from dividing by 0
+	const float32x4_t eplison0 = vmovq_n_f32(0.0000001);
+
+	float32x4_t chroma = vmaxq_f32(vsubq_f32(smaxes, smins), eplison0);
+
+	// RGB to HSV is a piecewise function, so we compute each part of the function first...
+	float32x4_t hr, hg, hb, hue;
+	hr = vdivq_f32(vsubq_f32(ssg, ssb), chroma);
+	hr = vsubq_f32(hr, vmulq_n_f32(vrndmq_f32(vmulq_n_f32(hr, 1.0 / 6.0)), 6.0));
+	hg = vaddq_f32(vdivq_f32(vsubq_f32(ssb, ssr), chroma), vmovq_n_f32(2.0));
+	hb = vaddq_f32(vdivq_f32(vsubq_f32(ssr, ssg), chroma), vmovq_n_f32(4.0));
+
+	// And then compute which one will be used based on criteria
+	float32x4_t hrfactors = vcvtq_f32_u32(vandq_u32(vandq_u32(vceqq_f32(ssr, smaxes), vmvnq_u32(vceqq_u32(ssr, ssb))), vmovq_n_u32(1)));
+	float32x4_t hgfactors = vcvtq_f32_u32(vandq_u32(vandq_u32(vceqq_f32(ssg, smaxes), vmvnq_u32(vceqq_u32(ssg, ssr))), vmovq_n_u32(1)));
+	float32x4_t hbfactors = vcvtq_f32_u32(vandq_u32(vandq_u32(vceqq_f32(ssb, smaxes), vmvnq_u32(vceqq_u32(ssb, ssg))), vmovq_n_u32(1)));
+	hue = vmulq_f32(hr, hrfactors);
+	hue = vaddq_f32(hue, vmulq_f32(hg, hgfactors));
+	hue = vaddq_f32(hue, vmulq_f32(hb, hbfactors));
+
+	// Mess with the light like the original function
+	float32x4_t val = dmaxes;
+	if (light) {
+		val = vsubq_f32(val, vsubq_f32(vmovq_n_f32(1.0), vmulq_n_f32(vcvtq_f32_u32(alphas), 1.0 / 250.0)));
+		val = vmaxq_f32(val, vmovq_n_f32(0.0));
+	}
+		
+	// then it stitches the HSV back together
+	// the hue and saturation come from the source (tint) color, and the value comes from
+	// the destination (real source) color
+	chroma = vmulq_f32(val, vdivq_f32(vsubq_f32(smaxes, smins), vaddq_f32(smaxes, eplison0)));
+	float32x4_t hprime_mod2 = vmulq_n_f32(hue, 1.0 / 2.0);
+	hprime_mod2 = vmulq_n_f32(vsubq_f32(hprime_mod2, vrndmq_f32(hprime_mod2)), 2.0);
+	float32x4_t x = vmulq_f32(chroma, vsubq_f32(vmovq_n_f32(1.0), vabsq_f32(vsubq_f32(hprime_mod2, vmovq_n_f32(1.0)))));
+	uint32x4_t hprime_rounded = vcvtq_u32_f32(hue);
+	uint32x4_t x_int = vcvtq_u32_f32(vmulq_n_f32(x, 255.0));
+	uint32x4_t c_int = vcvtq_u32_f32(vmulq_n_f32(chroma, 255.0));
+
+	// Again HSV->RGB is also a piecewise function
+	uint32x4_t val0 = vorrq_u32(vshlq_n_u32(x_int, 8), vshlq_n_u32(c_int, 16));
+	val0 = vandq_u32(val0, vorrq_u32(vceqq_u32(hprime_rounded, vmovq_n_u32(0)), vceqq_u32(hprime_rounded, vmovq_n_u32(6))));
+	uint32x4_t val1 = vorrq_u32(vshlq_n_u32(c_int, 8), vshlq_n_u32(x_int, 16));
+	val1 = vandq_u32(val1, vceqq_u32(hprime_rounded, vmovq_n_u32(1)));
+	uint32x4_t val2 = vorrq_u32(vshlq_n_u32(c_int, 8), x_int);
+	val2 = vandq_u32(val2, vceqq_u32(hprime_rounded, vmovq_n_u32(2)));
+	uint32x4_t val3 = vorrq_u32(vshlq_n_u32(x_int, 8), c_int);
+	val3 = vandq_u32(val3, vceqq_u32(hprime_rounded, vmovq_n_u32(3)));
+	uint32x4_t val4 = vorrq_u32(vshlq_n_u32(x_int, 16), c_int);
+	val4 = vandq_u32(val4, vceqq_u32(hprime_rounded, vmovq_n_u32(4)));
+	uint32x4_t val5 = vorrq_u32(vshlq_n_u32(c_int, 16), x_int);
+	val5 = vandq_u32(val5, vceqq_u32(hprime_rounded, vmovq_n_u32(5)));
+
+	// or the values together
+	uint32x4_t final = vorrq_u32(val0, vorrq_u32(val1, vorrq_u32(val2, vorrq_u32(val3, vorrq_u32(val4, val5)))));
+
+	// add the minimums back in
+	uint32x4_t val_add = vcvtq_u32_f32(vmulq_n_f32(vsubq_f32(val, chroma), 255.0));
+	val_add = vorrq_u32(val_add, vorrq_u32(vshlq_n_u32(val_add, 8), vorrq_u32(vshlq_n_u32(val_add, 16), vandq_u32(destCols, vmovq_n_u32(0xff000000)))));
+	final = vaddq_u32(final, val_add);
+	return final;
+}
+
+inline uint32x4_t blendPixelSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas) {
+	uint32x4_t srcAlphas, difAlphas, mask, ch1, ch2;
+	auto setupArgbAlphas = [&]() {
+		// This acts the same as this in the normal blender functions
+		// if (alpha == 0)
+		//     alpha = aSrc;
+		// else
+		//     alpha = aSrc * ((alpha & 0xff) + 1) / 256;
+		// where alpha is the alpha byte of the srcCols
+		srcAlphas = vshrq_n_u32(srcCols, 24);
+		difAlphas = vaddq_u32(vandq_u32(alphas, vmovq_n_u32(0xff)), vmovq_n_u32(1));
+		difAlphas = vshrq_n_u32(vmulq_u32(srcAlphas, difAlphas), 8);
+		difAlphas = vshlq_n_u32(difAlphas, 24);
+		srcAlphas = vshlq_n_u32(srcAlphas, 24);
+		mask = vceqq_u32(alphas, vmovq_n_u32(0));
+		srcAlphas = vandq_u32(srcAlphas, mask);
+		difAlphas = vandq_u32(difAlphas, vmvnq_u32(mask));
+		srcCols = vandq_u32(srcCols, vmovq_n_u32(0x00ffffff));
+		srcCols = vorrq_u32(srcCols, vorrq_u32(srcAlphas, difAlphas));
+	};
+	switch (_G(_blender_mode)) {
+	case kSourceAlphaBlender: // see BITMAP member function blendSourceAlpha
+		alphas = vshrq_n_u32(srcCols, 24);
+		return rgbBlendSIMD(srcCols, destCols, alphas, false);
+	case kArgbToArgbBlender: // see BITMAP member function blendArgbToArgb
+		setupArgbAlphas();
+		// only blend if alpha isn't 0, otherwise use destCols
+		mask = vcgtq_u32(vshrq_n_u32(srcCols, 24), vmovq_n_u32(0));
+		ch1 = vandq_u32(argbBlendSIMD(srcCols, destCols), mask);
+		ch2 = vandq_u32(destCols, vmvnq_u32(mask));
+		return vorrq_u32(ch1, ch2);
+	case kArgbToRgbBlender: // see BITMAP member function blendArgbToRgb
+		setupArgbAlphas();
+		return rgbBlendSIMD(srcCols, destCols, vshrq_n_u32(srcCols, 24), false);
+	case kRgbToArgbBlender: // see BITMAP member function blendRgbToArgb
+		// if alpha is NOT 0 or 255
+		ch2 = vandq_u32(srcCols, vmovq_n_u32(0x00ffffff));
+		ch2 = vorrq_u32(ch2, vshlq_n_u32(alphas, 24));
+		ch2 = argbBlendSIMD(ch2, destCols);
+		// if alpha is 0 or 255
+		ch1 = vorrq_u32(srcCols, vmovq_n_u32(0xff000000));
+		// mask and or them together
+		mask = vorrq_u32(vceqq_u32(alphas, vmovq_n_u32(0)), vceqq_u32(alphas, vmovq_n_u32(0xff)));
+		ch1 = vandq_u32(ch1, mask);
+		ch2 = vandq_u32(ch2, vmvnq_u32(mask));
+		return vorrq_u32(ch1, ch2);
+	case kRgbToRgbBlender: // see BITMAP member function blendRgbToRgb
+		return rgbBlendSIMD(srcCols, destCols, alphas, false);
+	case kAlphaPreservedBlenderMode: // see BITMAP member function blendPreserveAlpha
+		return rgbBlendSIMD(srcCols, destCols, alphas, true);
+	case kOpaqueBlenderMode: // see BITMAP member function blendOpaque
+		return vorrq_u32(srcCols, vmovq_n_u32(0xff000000));
+	case kAdditiveBlenderMode: // see BITMAP member function blendAdditiveAlpha
+		srcAlphas = vaddq_u32(vshrq_n_u32(srcCols, 24), vshrq_n_u32(destCols, 24));
+		srcAlphas = vminq_u32(srcAlphas, vmovq_n_u32(0xff));
+		srcCols = vandq_u32(srcCols, vmovq_n_u32(0x00ffffff));
+		return vorrq_u32(srcCols, vshlq_n_u32(srcAlphas, 24));
+	case kTintBlenderMode: // see BITMAP member function blendTintSprite
+		return blendTintSpriteSIMD(srcCols, destCols, alphas, false);
+	case kTintLightBlenderMode: // see BITMAP member function blendTintSprite
+		return blendTintSpriteSIMD(srcCols, destCols, alphas, true);
+	}
+}
+
+inline uint16x8_t blendPixelSIMD2Bpp(uint16x8_t srcCols, uint16x8_t destCols, uint16x8_t alphas) {
+	uint16x8_t mask, ch1, ch2;
+	switch (_G(_blender_mode)) {
+	case kSourceAlphaBlender:
+	case kOpaqueBlenderMode:
+	case kAdditiveBlenderMode:
+		return srcCols;
+	case kArgbToArgbBlender:
+	case kArgbToRgbBlender:
+		ch1 = vandq_u16(vmovq_n_u16(0xff), vceqq_u16(alphas, vmovq_n_u16(0)));
+		ch2 = vandq_u16(alphas, vcgtq_u16(alphas, vmovq_n_u16(0)));
+		alphas = vorrq_u16(ch1, ch2);
+	case kRgbToRgbBlender:
+	case kAlphaPreservedBlenderMode:
+		return rgbBlendSIMD2Bpp(srcCols, destCols, alphas);
+	case kRgbToArgbBlender:
+		mask = vorrq_u32(vceqq_u32(alphas, vmovq_n_u32(0)), vceqq_u32(alphas, vmovq_n_u32(255)));
+		ch1 = vandq_u32(srcCols, mask);
+		ch2 = vandq_u32(rgbBlendSIMD2Bpp(srcCols, destCols, alphas), vmvnq_u32(mask));
+		return vorrq_u32(ch1, ch2);
+	case kTintBlenderMode:
+	case kTintLightBlenderMode:
+		uint32x4_t srcColsLo = simd2BppTo4Bpp(vget_low_u16(srcCols));
+		uint32x4_t srcColsHi = simd2BppTo4Bpp(vget_high_u16(srcCols));
+		uint32x4_t destColsLo = simd2BppTo4Bpp(vget_low_u16(destCols));
+		uint32x4_t destColsHi = simd2BppTo4Bpp(vget_high_u16(destCols));
+		uint32x4_t alphasLo = vmovl_u16(vget_low_u16(alphas));
+		uint32x4_t alphasHi = vmovl_u16(vget_high_u16(alphas));
+		uint16x4_t lo = simd4BppTo2Bpp(blendTintSpriteSIMD(srcColsLo, destColsLo, alphasLo, _G(_blender_mode) == kTintLightBlenderMode));
+		uint16x4_t hi = simd4BppTo2Bpp(blendTintSpriteSIMD(srcColsHi, destColsHi, alphasHi, _G(_blender_mode) == kTintLightBlenderMode));
+		return vcombine_u16(lo, hi);
+	}
+}
+
+template<int DestBytesPerPixel, int SrcBytesPerPixel>
+inline void drawPixelSIMD(byte *destPtr, const byte *srcP2, uint32x4_t tint, uint32x4_t alphas, uint32x4_t maskedAlphas, uint32x4_t transColors, int xDir, int xCtrBpp, int srcAlpha, int skipTrans, bool horizFlip, bool useTint, uint32x4_t skipMask) {
+	uint32x4_t srcCols, destCol;
+
+	if (DestBytesPerPixel == 4)
+		destCol = vld1q_u32((uint32 *)destPtr);
+	else
+		destCol = simd2BppTo4Bpp(vld1_u16((uint16 *)destPtr));
+	if (SrcBytesPerPixel == 4)
+		srcCols = vld1q_u32((const uint32 *)(srcP2 + xDir * xCtrBpp));
+	else
+		srcCols = simd2BppTo4Bpp(vld1_u16((const uint16 *)(srcP2 + xDir * xCtrBpp)));
+	// we do this here because we need to check if we should skip the pixel before we blend it
+	uint32x4_t mask1 = skipTrans ? vceqq_u32(vandq_u32(srcCols, maskedAlphas), transColors) : vmovq_n_u32(0);
+	mask1 = vorrq_u32(mask1, skipMask);
+	if (srcAlpha != -1) {
+		// take into account for useTint
+		if (useTint) {
+			srcCols = blendPixelSIMD(tint, srcCols, alphas);
+		} else {
+			srcCols = blendPixelSIMD(srcCols, destCol, alphas);
+		}
+	}
+	uint32x4_t destCols2 = vandq_u32(destCol, mask1);
+	uint32x4_t srcCols2 = vandq_u32(srcCols, vmvnq_u32(mask1));
+	uint32x4_t final = vorrq_u32(destCols2, srcCols2);
+	if (horizFlip) {
+		final = vrev64q_u32(final);
+		final = vcombine_u32(vget_high_u32(final), vget_low_u32(final));
+	}
+	if (DestBytesPerPixel == 4) {
+		vst1q_u32((uint32 *)destPtr, final);
+	} else {
+		vst1_u16((uint16 *)destPtr, simd4BppTo2Bpp(final));
+	}
+}
+
+inline void drawPixelSIMD2Bpp(byte *destPtr, const byte *srcP2, uint16x8_t tint, uint16x8_t alphas, uint16x8_t transColors, int xDir, int xCtrBpp, int srcAlpha, int skipTrans, bool horizFlip, bool useTint, uint16x8_t skipMask) {
+	uint16x8_t destCol = vld1q_u16((uint16 *)destPtr);
+	uint16x8_t srcCols = vld1q_u16((const uint16 *)(srcP2 + xDir * xCtrBpp));
+	uint16x8_t mask1 = skipTrans ? vceqq_u16(srcCols, transColors) : vmovq_n_u16(0);
+	mask1 = vorrq_u16(mask1, skipMask);
+	if (srcAlpha != -1) {
+		// take into account for useTint
+		if (useTint) {
+			srcCols = blendPixelSIMD2Bpp(tint, srcCols, alphas);
+		} else {
+			srcCols = blendPixelSIMD2Bpp(srcCols, destCol, alphas);
+		}
+	}
+	uint16x8_t destCols2 = vandq_u16(destCol, mask1);
+	uint16x8_t srcCols2 = vandq_u16(srcCols, vmvnq_u16(mask1));
+	uint16x8_t final = vorrq_u16(destCols2, srcCols2);
+	if (horizFlip) {
+		final = vrev64q_u16(final);
+		final = vcombine_u16(vget_high_u16(final), vget_low_u16(final));
+	}
+	vst1q_u16((uint16 *)destPtr, final);
+}*/
+
+} // namespace AGS3
+
+#endif /* __powerpc__ */
+#endif /* AGS_LIB_ALLEGRO_SURFACE_SIMD_PPC_H */
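
The commented-out rgbBlendSIMD above vectorizes the usual difference-times-alpha
blend. As a scalar reference (a sketch only; the helper name is not part of the
engine):

    // dest + (src - dest) * alpha / 256, applied per colour channel; the >> 8
    // matches the approximation used by the SIMD code instead of dividing by 255.
    static inline unsigned blendChannel(unsigned src, unsigned dest, unsigned alpha) {
    	return dest + ((((int)src - (int)dest) * (int)alpha) >> 8);
    }
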
diff --git a/engines/ags/module.mk b/engines/ags/module.mk
index 35fddd6ef2d..940db8b117e 100644
--- a/engines/ags/module.mk
+++ b/engines/ags/module.mk
@@ -26,6 +26,7 @@ MODULE_OBJS = \
 	lib/allegro/surface.o \
 	lib/allegro/surface_simd_neon.o \
 	lib/allegro/surface_simd_sse.o \
+	lib/allegro/surface_simd_ppc.o \
 	lib/allegro/surface_simd_none.o \
 	lib/allegro/system.o \
 	lib/allegro/unicode.o \
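
The entry is added unconditionally, which works because the implementation is
fenced off by the header's platform check, so on non-PowerPC targets the
translation unit compiles to nothing. This is the guard pattern already used by
the new files:

    #include "ags/lib/allegro/surface_simd_ppc.h"  // defines ..._PPC_IMPL only for __powerpc__
    #ifdef AGS_LIB_ALLEGRO_SURFACE_SIMD_PPC_IMPL
    // ... AltiVec implementation ...
    #endif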


Commit: bdbceeb67425ed28d1ce3d6ecfffbe8daae848cc
    https://github.com/scummvm/scummvm/commit/bdbceeb67425ed28d1ce3d6ecfffbe8daae848cc
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: Fixed PowerPC code not compiler under GCC 6.59.21

Changed paths:
    engines/ags/globals.cpp


diff --git a/engines/ags/globals.cpp b/engines/ags/globals.cpp
index 65db6f79d5a..eb14221bcc3 100644
--- a/engines/ags/globals.cpp
+++ b/engines/ags/globals.cpp
@@ -129,7 +129,15 @@ static bool checkForSIMDExtensions() {
 #elif defined(__aarch64__)
 	return true;
 #elif defined(__powerpc__)
+#  if __GNUC__ > 6 || \
+		(__GNUC__ == 6 && (__GNUC_MINOR__ > 59 ||
+							__GNUC_MINOR__ == 59) &&
+							(__GNUC_PATCHLEVEL__ > 21 ||
+							__GNUC_PATCHLEVEL__ == 21))
 	return __builtin_cpu_supports("altivec");
+#  else
+	return true; // Just assume that we have these extensions 
+#  endif
 #else
 	return false;
 #endif
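
A sketch of an alternative gate that avoids hard-coding a compiler version (an
assumption, not what the commit does; __has_builtin only exists on Clang and
newer GCC, so the fallback branch still matters):

    #if defined(__has_builtin)
    #  if __has_builtin(__builtin_cpu_supports)
    #    define AGS_HAS_CPU_SUPPORTS 1   // hypothetical helper macro
    #  endif
    #endif
    #ifndef AGS_HAS_CPU_SUPPORTS
    #  define AGS_HAS_CPU_SUPPORTS 0     // old compilers: assume AltiVec, as the code above does
    #endif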


Commit: f53e39bac4cd5575890d3ff2d3c7c363ebbf7627
    https://github.com/scummvm/scummvm/commit/f53e39bac4cd5575890d3ff2d3c7c363ebbf7627
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: Fixed last commit

Changed paths:
    engines/ags/globals.cpp


diff --git a/engines/ags/globals.cpp b/engines/ags/globals.cpp
index eb14221bcc3..8a3fd9a06e9 100644
--- a/engines/ags/globals.cpp
+++ b/engines/ags/globals.cpp
@@ -130,9 +130,9 @@ static bool checkForSIMDExtensions() {
 	return true;
 #elif defined(__powerpc__)
 #  if __GNUC__ > 6 || \
-		(__GNUC__ == 6 && (__GNUC_MINOR__ > 59 ||
-							__GNUC_MINOR__ == 59) &&
-							(__GNUC_PATCHLEVEL__ > 21 ||
+		(__GNUC__ == 6 && (__GNUC_MINOR__ > 59 || \
+							__GNUC_MINOR__ == 59) && \
+							(__GNUC_PATCHLEVEL__ > 21 || \
 							__GNUC_PATCHLEVEL__ == 21))
 	return __builtin_cpu_supports("altivec");
 #  else


Commit: 0f6da5b2993c94aa394193e7324975adcf05db6c
    https://github.com/scummvm/scummvm/commit/0f6da5b2993c94aa394193e7324975adcf05db6c
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: Still fixing PowerPC blitting issues

Changed paths:
    engines/ags/lib/allegro/surface_simd_ppc.cpp


diff --git a/engines/ags/lib/allegro/surface_simd_ppc.cpp b/engines/ags/lib/allegro/surface_simd_ppc.cpp
index b07c35429e0..a702c30c728 100644
--- a/engines/ags/lib/allegro/surface_simd_ppc.cpp
+++ b/engines/ags/lib/allegro/surface_simd_ppc.cpp
@@ -22,9 +22,9 @@ void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32 transColor, ui
 	tint = vec_or(tint, vec_sl(vec_splat_u32(tintRed), 16));
 	tint = vec_or(tint, vec_sl(vec_splat_u32(tintGreen), 8));
 	tint = vec_or(tint, vec_splat_u32(tintBlue));
-	vector unsigned int maskedAlphas = vec_splat_u32(&alphaMask);
-	vector unsigned int transColors = vec_splat_u32(&transColor);
-	vector unsigned int alphas = vec_splat_u32(&srcAlpha);
+	vector unsigned int maskedAlphas = vec_splat_u32(alphaMask);
+	vector unsigned int transColors = vec_splat_u32(transColor);
+	vector unsigned int alphas = vec_splat_u32(srcAlpha);
 
 	// This is so that we can calculate what pixels to crop off in a vectorized way
 	vector unsigned int addIndexes = {0, 1, 2, 3};
@@ -361,7 +361,7 @@ void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32 transColor, uint32 alp
 template<int ScaleThreshold>
 void BITMAP::drawInner1Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
 	const int xDir = horizFlip ? -1 : 1;
-	vector unsigned char transColors = vec_splat_u8(&transColor);
+	vector unsigned char transColors = vec_splat_u8(transColor);
 
 	// This is so that we can calculate in parallel the pixel indexes for scaled drawing
 	vector unsigned int scaleAdds1 = {0, (uint32)scaleX, (uint32)scaleX*2, (uint32)scaleX*3};
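
Dropping the address-of matches the other splat call sites, though AltiVec's
vec_splat_u32() is documented to take a small constant literal; broadcasting a
runtime scalar is normally written with vec_splats() instead. A sketch, assuming
an altivec.h that provides vec_splats():

    #include <altivec.h>

    static inline vector unsigned int splatU32(unsigned int value) {
    	return vec_splats(value);   // broadcast the runtime value into all four lanes
    }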


Commit: f5908486b908c4a3c828881c260a4855ebfa8767
    https://github.com/scummvm/scummvm/commit/f5908486b908c4a3c828881c260a4855ebfa8767
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: Still trying to get PPC blitting to compile

Changed paths:
    engines/ags/lib/allegro/surface_simd_ppc.cpp


diff --git a/engines/ags/lib/allegro/surface_simd_ppc.cpp b/engines/ags/lib/allegro/surface_simd_ppc.cpp
index a702c30c728..f78e6303d59 100644
--- a/engines/ags/lib/allegro/surface_simd_ppc.cpp
+++ b/engines/ags/lib/allegro/surface_simd_ppc.cpp
@@ -15,6 +15,7 @@ namespace AGS3 {
 // This template handles 2bpp and 4bpp, the other specializations handle 1bpp and format conversion blits
 template<int DestBytesPerPixel, int SrcBytesPerPixel, int ScaleThreshold>
 void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
+	drawInnerGeneric(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY);
 	const int xDir = horizFlip ? -1 : 1;
 	byte rSrc, gSrc, bSrc, aSrc;
 	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
@@ -27,11 +28,11 @@ void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32 transColor, ui
 	vector unsigned int alphas = vec_splat_u32(srcAlpha);
 
 	// This is so that we can calculate what pixels to crop off in a vectorized way
-	vector unsigned int addIndexes = {0, 1, 2, 3};
-	if (horizFlip) addIndexes = {3, 2, 1, 0};
+	vector unsigned int addIndexes = (vector unsigned int){0, 1, 2, 3};
+	if (horizFlip) addIndexes = (vector unsigned int){3, 2, 1, 0};
 
 	// This is so that we can calculate in parallel the pixel indexes for scaled drawing
-	vector unsigned int scaleAdds = {0, (uint32)scaleX, (uint32)scaleX*2, (uint32)scaleX*3};
+	vector unsigned int scaleAdds = (vector unsigned int){0, (uint32)scaleX, (uint32)scaleX*2, (uint32)scaleX*3};
 
 	// Clip the bounds ahead of time (so we don't waste time checking if we are in bounds when
 	// we are in the inner loop)
@@ -197,12 +198,12 @@ void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32 transColor, uint32 alp
 	vector unsigned short alphas = vec_splat_u16(srcAlpha);
 
 	// This is so that we can calculate what pixels to crop off in a vectorized way
-	vector unsigned short addIndexes = {0, 1, 2, 3, 4, 5, 6, 7};
+	vector unsigned short addIndexes = (vector unsigned short){0, 1, 2, 3, 4, 5, 6, 7};
 
 	// This is so that we can calculate in parallel the pixel indexes for scaled drawing
-	if (horizFlip) addIndexes = {7, 6, 5, 4, 3, 2, 1, 0};
-	vector unsigned int scaleAdds = {0, (uint32)scaleX, (uint32)scaleX*2, (uint32)scaleX*3};
-	vector unsigned int scaleAdds2 = {(uint32)scaleX*4, (uint32)scaleX*5, (uint32)scaleX*6, (uint32)scaleX*7};
+	if (horizFlip) addIndexes = (vector unsigned short){7, 6, 5, 4, 3, 2, 1, 0};
+	vector unsigned int scaleAdds = (vector unsigned int){0, (uint32)scaleX, (uint32)scaleX*2, (uint32)scaleX*3};
+	vector unsigned int scaleAdds2 = (vector unsigned int){(uint32)scaleX*4, (uint32)scaleX*5, (uint32)scaleX*6, (uint32)scaleX*7};
 
 	// Clip the bounds ahead of time (so we don't waste time checking if we are in bounds when
 	// we are in the inner loop)
@@ -364,10 +365,10 @@ void BITMAP::drawInner1Bpp(int yStart, int xStart, uint32 transColor, uint32 alp
 	vector unsigned char transColors = vec_splat_u8(transColor);
 
 	// This is so that we can calculate in parallel the pixel indexes for scaled drawing
-	vector unsigned int scaleAdds1 = {0, (uint32)scaleX, (uint32)scaleX*2, (uint32)scaleX*3};
-	vector unsigned int scaleAdds2 = {(uint32)scaleX*4, (uint32)scaleX*5, (uint32)scaleX*6, (uint32)scaleX*7};
-	vector unsigned int scaleAdds3 = {(uint32)scaleX*8, (uint32)scaleX*9, (uint32)scaleX*10, (uint32)scaleX*11};
-	vector unsigned int scaleAdds4 = {(uint32)scaleX*12, (uint32)scaleX*13, (uint32)scaleX*14, (uint32)scaleX*15};
+	vector unsigned int scaleAdds1 = (vector unsigned int){0, (uint32)scaleX, (uint32)scaleX*2, (uint32)scaleX*3};
+	vector unsigned int scaleAdds2 = (vector unsigned int){(uint32)scaleX*4, (uint32)scaleX*5, (uint32)scaleX*6, (uint32)scaleX*7};
+	vector unsigned int scaleAdds3 = (vector unsigned int){(uint32)scaleX*8, (uint32)scaleX*9, (uint32)scaleX*10, (uint32)scaleX*11};
+	vector unsigned int scaleAdds4 = (vector unsigned int){(uint32)scaleX*12, (uint32)scaleX*13, (uint32)scaleX*14, (uint32)scaleX*15};
 	
 	// Clip the bounds ahead of time (so we don't waste time checking if we are in bounds when
 	// we are in the inner loop)
@@ -488,7 +489,6 @@ void BITMAP::drawInner1Bpp(int yStart, int xStart, uint32 transColor, uint32 alp
 	}
 }
 
-
 template void BITMAP::drawInner4BppWithConv<4, 4, 0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
 template void BITMAP::drawInner4BppWithConv<4, 4, 0x100>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
 template void BITMAP::drawInner4BppWithConv<4, 2, 0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);

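A brief aside on the hunk above (a sketch, not code from the repository): the PowerPC compilers this code targets reject assigning a bare brace list to an AltiVec vector, so each initializer is rewritten as a compound literal with the vector type spelled out. A minimal illustration of the accepted form, assuming an AltiVec-enabled toolchain and a hypothetical helper name:

#ifdef __ALTIVEC__
#include <altivec.h>

// Mirrors the pattern in the diff: both the declaration and the later
// reassignment name the vector type explicitly in front of the braces.
static vector unsigned int cropIndexes(int horizFlip) {
	vector unsigned int addIndexes = (vector unsigned int){0, 1, 2, 3};
	if (horizFlip)
		addIndexes = (vector unsigned int){3, 2, 1, 0};
	return addIndexes;
}
#endif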

Commit: 01fa027296b92258d57801c24de7bc6b0e5c71c4
    https://github.com/scummvm/scummvm/commit/01fa027296b92258d57801c24de7bc6b0e5c71c4
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: Still trying to get PPC blitting to compile

Changed paths:
    engines/ags/lib/allegro/surface_simd_ppc.cpp


diff --git a/engines/ags/lib/allegro/surface_simd_ppc.cpp b/engines/ags/lib/allegro/surface_simd_ppc.cpp
index f78e6303d59..9ccb4fbd3e3 100644
--- a/engines/ags/lib/allegro/surface_simd_ppc.cpp
+++ b/engines/ags/lib/allegro/surface_simd_ppc.cpp
@@ -1,7 +1,3 @@
-#include "ags/lib/allegro/surface_simd_ppc.h"
-
-#ifdef AGS_LIB_ALLEGRO_SURFACE_SIMD_PPC_IMPL
-
 #include "ags/lib/allegro/gfx.h"
 #include "ags/lib/allegro/color.h"
 #include "ags/lib/allegro/flood.h"
@@ -10,6 +6,10 @@
 #include "common/textconsole.h"
 #include "graphics/screen.h"
 
+#include "ags/lib/allegro/surface_simd_ppc.h"
+
+#ifdef AGS_LIB_ALLEGRO_SURFACE_SIMD_PPC_IMPL
+
 namespace AGS3 {
 
 // This template handles 2bpp and 4bpp, the other specializations handle 1bpp and format conversion blits


Commit: 89fef524b8acaa37ecbdd54b709994e5e97e8044
    https://github.com/scummvm/scummvm/commit/89fef524b8acaa37ecbdd54b709994e5e97e8044
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: Fixed more compiling issues for blitting PPC

PowerPC's <altivec.h> header redefines bool to be __vector(4) __bool, which
breaks the existing prototypes, so I changed the functions to take int
instead of bool. Hopefully this fixes the build.
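
For illustration, a minimal sketch of the kind of signature change described above; the function and parameter names are hypothetical. Plain int keeps the signature stable whether or not <altivec.h> has remapped bool:

#ifdef __powerpc__
#include <altivec.h>   // may remap "bool" on this platform
#endif

// Before: void drawSpan(const unsigned char *src, unsigned char *dst, bool horizFlip);
void drawSpan(const unsigned char *src, unsigned char *dst, int horizFlip) {
	// The int is still treated as a boolean flag, as the real prototypes now do.
	if (horizFlip) {
		// ...mirrored copy would go here...
	}
	(void)src; (void)dst;
}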

Changed paths:
    engines/ags/lib/allegro/surface.cpp
    engines/ags/lib/allegro/surface.h
    engines/ags/lib/allegro/surface_simd_neon.cpp
    engines/ags/lib/allegro/surface_simd_ppc.cpp
    engines/ags/lib/allegro/surface_simd_sse.cpp


diff --git a/engines/ags/lib/allegro/surface.cpp b/engines/ags/lib/allegro/surface.cpp
index 1fcdacf8b05..8d3a04d747c 100644
--- a/engines/ags/lib/allegro/surface.cpp
+++ b/engines/ags/lib/allegro/surface.cpp
@@ -107,7 +107,7 @@ void BITMAP::floodfill(int x, int y, int color) {
 const int SCALE_THRESHOLD = 0x100;
 #define VGA_COLOR_TRANS(x) ((x) * 255 / 63)
 template<int DestBytesPerPixel, int SrcBytesPerPixel, int ScaleThreshold>
-void BITMAP::drawInnerGeneric(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
+void BITMAP::drawInnerGeneric(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, int useTint, int sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, int horizFlip, int vertFlip, int skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
 	const int xDir = horizFlip ? -1 : 1;
 	byte rSrc, gSrc, bSrc, aSrc;
 	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
diff --git a/engines/ags/lib/allegro/surface.h b/engines/ags/lib/allegro/surface.h
index 7e011908459..8b7d409b65a 100644
--- a/engines/ags/lib/allegro/surface.h
+++ b/engines/ags/lib/allegro/surface.h
@@ -266,13 +266,13 @@ public:
 	void blendTintSprite(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &aDest, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha, bool light) const;
 
 	template<int DestBytesPerPixel, int SrcBytesPerPixel, int ScaleThreshold>
-	void drawInner4BppWithConv(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY);
+	void drawInner4BppWithConv(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, int useTint, int sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, int horizFlip, int vertFlip, int skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY);
 	template<int ScaleThreshold>
-	void drawInner2Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY);
+	void drawInner2Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, int useTint, int sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, int horizFlip, int vertFlip, int skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY);
 	template<int ScaleThreshold>
-	void drawInner1Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY);
+	void drawInner1Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, int useTint, int sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, int horizFlip, int vertFlip, int skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY);
 	template<int DestBytesPerPixel, int SrcBytesPerPixel, int ScaleThreshold>
-	void drawInnerGeneric(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY);
+	void drawInnerGeneric(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, int useTint, int sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, int horizFlip, int vertFlip, int skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY);
 	
 	inline uint32 getColor(const byte *data, byte bpp) const {
 		switch (bpp) {
diff --git a/engines/ags/lib/allegro/surface_simd_neon.cpp b/engines/ags/lib/allegro/surface_simd_neon.cpp
index e9d9ac76365..79010abf5b3 100644
--- a/engines/ags/lib/allegro/surface_simd_neon.cpp
+++ b/engines/ags/lib/allegro/surface_simd_neon.cpp
@@ -14,7 +14,7 @@ namespace AGS3 {
 
 // This template handles 2bpp and 4bpp, the other specializations handle 1bpp and format conversion blits
 template<int DestBytesPerPixel, int SrcBytesPerPixel, int ScaleThreshold>
-void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
+void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, int useTint, int sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, int horizFlip, int vertFlip, int skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
 	const int xDir = horizFlip ? -1 : 1;
 	byte rSrc, gSrc, bSrc, aSrc;
 	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
@@ -185,7 +185,7 @@ void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32 transColor, ui
 }
 
 template<int ScaleThreshold>
-void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
+void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, int useTint, int sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, int horizFlip, int vertFlip, int skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
 	const int xDir = horizFlip ? -1 : 1;
 	byte rSrc, gSrc, bSrc, aSrc;
 	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
@@ -356,7 +356,7 @@ void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32 transColor, uint32 alp
 }
 
 template<int ScaleThreshold>
-void BITMAP::drawInner1Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
+void BITMAP::drawInner1Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, int useTint, int sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, int horizFlip, int vertFlip, int skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
 	const int xDir = horizFlip ? -1 : 1;
 	uint8x16_t transColors = vld1q_dup_u8(&transColor);
 
@@ -480,16 +480,16 @@ void BITMAP::drawInner1Bpp(int yStart, int xStart, uint32 transColor, uint32 alp
 }
 
 
-template void BITMAP::drawInner4BppWithConv<4, 4, 0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner4BppWithConv<4, 4, 0x100>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner4BppWithConv<4, 2, 0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner4BppWithConv<4, 2, 0x100>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner4BppWithConv<2, 4, 0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner4BppWithConv<2, 4, 0x100>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner2Bpp<0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner2Bpp<0x100>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner1Bpp<0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner1Bpp<0x100>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<4, 4, 0>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<4, 4, 0x100>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<4, 2, 0>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<4, 2, 0x100>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<2, 4, 0>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<2, 4, 0x100>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner2Bpp<0>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner2Bpp<0x100>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner1Bpp<0>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner1Bpp<0x100>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
 
 } // namespace AGS3
 
diff --git a/engines/ags/lib/allegro/surface_simd_ppc.cpp b/engines/ags/lib/allegro/surface_simd_ppc.cpp
index 9ccb4fbd3e3..077450cac9b 100644
--- a/engines/ags/lib/allegro/surface_simd_ppc.cpp
+++ b/engines/ags/lib/allegro/surface_simd_ppc.cpp
@@ -14,8 +14,9 @@ namespace AGS3 {
 
 // This template handles 2bpp and 4bpp, the other specializations handle 1bpp and format conversion blits
 template<int DestBytesPerPixel, int SrcBytesPerPixel, int ScaleThreshold>
-void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
-	drawInnerGeneric(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY);
+void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, int useTint, int sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, int horizFlip, int vertFlip, int skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
+	drawInnerGeneric(yStart, xStart, transColor, alphaMask, palette, useTint, sameFormat, src, destArea, horizFlip, vertFlip, skipTrans, srcAlpha, tintRed, tintGreen, tintBlue, dstRect, srcArea, blenderMode, scaleX, scaleY);
+	return;
 	const int xDir = horizFlip ? -1 : 1;
 	byte rSrc, gSrc, bSrc, aSrc;
 	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
@@ -98,9 +99,9 @@ void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32 transColor, ui
 #if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
 				// Calculate in parallel the indexes of the pixels
 				if (SrcBytesPerPixel == 4)
-					indexes = vec_sl(vec_sl(vec_add(indexes, scaleAdds), 8), 2);
+					indexes = vec_sl(vec_sr(vec_add(indexes, scaleAdds), 8), 2);
 				else
-					indexes = vec_sl(vec_sl(vec_add(indexes, scaleAdds), 8), 1);
+					indexes = vec_sl(vec_sr(vec_add(indexes, scaleAdds), 8), 1);
 #else
 #error Change code to allow different scale threshold!
 #endif
@@ -189,7 +190,9 @@ void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32 transColor, ui
 }
 
 template<int ScaleThreshold>
-void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
+void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, int useTint, int sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, int horizFlip, int vertFlip, int skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
+	drawInnerGeneric(yStart, xStart, transColor, alphaMask, palette, useTint, sameFormat, src, destArea, horizFlip, vertFlip, skipTrans, srcAlpha, tintRed, tintGreen, tintBlue, dstRect, srcArea, blenderMode, scaleX, scaleY);
+	return;
 	const int xDir = horizFlip ? -1 : 1;
 	byte rSrc, gSrc, bSrc, aSrc;
 	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
@@ -360,7 +363,9 @@ void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32 transColor, uint32 alp
 }
 
 template<int ScaleThreshold>
-void BITMAP::drawInner1Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
+void BITMAP::drawInner1Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, int useTint, int sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, int horizFlip, int vertFlip, int skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
+	drawInnerGeneric(yStart, xStart, transColor, alphaMask, palette, useTint, sameFormat, src, destArea, horizFlip, vertFlip, skipTrans, srcAlpha, tintRed, tintGreen, tintBlue, dstRect, srcArea, blenderMode, scaleX, scaleY);
+	return;
 	const int xDir = horizFlip ? -1 : 1;
 	vector unsigned char transColors = vec_splat_u8(transColor);
 
@@ -489,16 +494,16 @@ void BITMAP::drawInner1Bpp(int yStart, int xStart, uint32 transColor, uint32 alp
 	}
 }
 
-template void BITMAP::drawInner4BppWithConv<4, 4, 0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner4BppWithConv<4, 4, 0x100>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner4BppWithConv<4, 2, 0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner4BppWithConv<4, 2, 0x100>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner4BppWithConv<2, 4, 0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner4BppWithConv<2, 4, 0x100>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner2Bpp<0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner2Bpp<0x100>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner1Bpp<0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner1Bpp<0x100>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<4, 4, 0>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<4, 4, 0x100>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<4, 2, 0>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<4, 2, 0x100>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<2, 4, 0>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<2, 4, 0x100>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner2Bpp<0>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner2Bpp<0x100>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner1Bpp<0>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner1Bpp<0x100>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
 
 } // namespace AGS3
 
diff --git a/engines/ags/lib/allegro/surface_simd_sse.cpp b/engines/ags/lib/allegro/surface_simd_sse.cpp
index 37075bfe31f..fe3413949d2 100644
--- a/engines/ags/lib/allegro/surface_simd_sse.cpp
+++ b/engines/ags/lib/allegro/surface_simd_sse.cpp
@@ -26,7 +26,7 @@ inline uint32 extract32_idx3(__m128i x) {
 
 // This template handles 2bpp and 4bpp, the other specializations handle 1bpp and format conversion blits
 template<int DestBytesPerPixel, int SrcBytesPerPixel, int ScaleThreshold>
-void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
+void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, int useTint, int sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, int horizFlip, int vertFlip, int skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
 	const int xDir = horizFlip ? -1 : 1;
 	byte rSrc, gSrc, bSrc, aSrc;
 	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
@@ -200,7 +200,7 @@ void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32 transColor, ui
 }
 
 template<int ScaleThreshold>
-void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
+void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, int useTint, int sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, int horizFlip, int vertFlip, int skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
 	const int xDir = horizFlip ? -1 : 1;
 	byte rSrc, gSrc, bSrc, aSrc;
 	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
@@ -371,7 +371,7 @@ void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32 transColor, uint32 alp
 }
 
 template<int ScaleThreshold>
-void BITMAP::drawInner1Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
+void BITMAP::drawInner1Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, int useTint, int sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, int horizFlip, int vertFlip, int skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
 	const int xDir = horizFlip ? -1 : 1;
 	__m128i transColors = _mm_set1_epi16(transColor | (transColor << 8));
 
@@ -500,16 +500,16 @@ void BITMAP::drawInner1Bpp(int yStart, int xStart, uint32 transColor, uint32 alp
 }
 
 
-template void BITMAP::drawInner4BppWithConv<4, 4, 0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner4BppWithConv<4, 4, 0x100>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner4BppWithConv<4, 2, 0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner4BppWithConv<4, 2, 0x100>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner4BppWithConv<2, 4, 0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner4BppWithConv<2, 4, 0x100>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner2Bpp<0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner2Bpp<0x100>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner1Bpp<0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner1Bpp<0x100>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<4, 4, 0>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<4, 4, 0x100>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<4, 2, 0>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<4, 2, 0x100>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<2, 4, 0>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<2, 4, 0x100>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner2Bpp<0>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner2Bpp<0x100>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner1Bpp<0>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner1Bpp<0x100>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
 
 } // namespace AGS3
 

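For context on the vec_sl to vec_sr correction in this commit, here is the same index calculation written out in scalar form (a sketch with an assumed helper name, not code from the repository). With the scale threshold equal to 0x100, the per-pixel counter is effectively 24.8 fixed point: shifting right by 8 recovers the source pixel index, and shifting left by log2 of the source pixel size turns it into a byte offset, which is what the earlier double vec_sl got wrong.

// Scalar sketch of the SIMD index math above.
static inline unsigned scaledSrcByteOffset(unsigned scaleXCtr, unsigned srcBytesPerPixel) {
	unsigned srcPixel = scaleXCtr >> 8;      // divide by the 0x100 scale threshold
	return srcPixel * srcBytesPerPixel;      // vec_sl by 2 for 4bpp, by 1 for 2bpp
}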

Commit: ef265e68dd57a2b4db7c1696be70b36c449b92ab
    https://github.com/scummvm/scummvm/commit/ef265e68dd57a2b4db7c1696be70b36c449b92ab
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: Still trying to get PPC to compile

Changed paths:
    engines/ags/lib/allegro/surface_simd_ppc.cpp
    engines/ags/lib/allegro/surface_simd_sse.cpp


diff --git a/engines/ags/lib/allegro/surface_simd_ppc.cpp b/engines/ags/lib/allegro/surface_simd_ppc.cpp
index 077450cac9b..7bedcca23a5 100644
--- a/engines/ags/lib/allegro/surface_simd_ppc.cpp
+++ b/engines/ags/lib/allegro/surface_simd_ppc.cpp
@@ -267,7 +267,7 @@ void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32 transColor, uint32 alp
 			uint16 srcBuffer[8];
 			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart, scaleXCtr = xCtrStart * scaleX; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
 				if (yCtr + 1 == yCtrHeight && xCtr + 8 > xCtrWidth) break;
-				vector unsigned int indexes = vector_splat_u32(scaleXCtr), indexes2 = vector_splat_u32(scaleXCtr);
+				vector unsigned int indexes = vec_splat_u32(scaleXCtr), indexes2 = vect_splat_u32(scaleXCtr);
 #if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
 				// Calculate in parallel the indexes of the pixels
 				indexes = vec_sl(vec_sr(vec_add(indexes, scaleAdds), 8), 1);
diff --git a/engines/ags/lib/allegro/surface_simd_sse.cpp b/engines/ags/lib/allegro/surface_simd_sse.cpp
index fe3413949d2..260e48fde7e 100644
--- a/engines/ags/lib/allegro/surface_simd_sse.cpp
+++ b/engines/ags/lib/allegro/surface_simd_sse.cpp
@@ -499,7 +499,6 @@ void BITMAP::drawInner1Bpp(int yStart, int xStart, uint32 transColor, uint32 alp
 	}
 }
 
-
 template void BITMAP::drawInner4BppWithConv<4, 4, 0>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
 template void BITMAP::drawInner4BppWithConv<4, 4, 0x100>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
 template void BITMAP::drawInner4BppWithConv<4, 2, 0>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);


Commit: f88d3633b48b57d9c280fbc327fa79b73055420e
    https://github.com/scummvm/scummvm/commit/f88d3633b48b57d9c280fbc327fa79b73055420e
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: Still trying to get PPC to compile

Changed paths:
    engines/ags/lib/allegro/surface_simd_ppc.cpp


diff --git a/engines/ags/lib/allegro/surface_simd_ppc.cpp b/engines/ags/lib/allegro/surface_simd_ppc.cpp
index 7bedcca23a5..a91449c2b4d 100644
--- a/engines/ags/lib/allegro/surface_simd_ppc.cpp
+++ b/engines/ags/lib/allegro/surface_simd_ppc.cpp
@@ -72,7 +72,7 @@ void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32 transColor, ui
 			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
 				byte *destPtr = &destP[destX * DestBytesPerPixel];
 				// Skip pixels that are beyond the row
-				vector unsigned int skipMask = vec_cmpeq(vec_add(vec_splat_u32(xCtr), addIndexes), xCtrWidthSIMD);
+				vector unsigned int skipMask = (vector unsigned int)vec_cmpeq(vec_add(vec_splat_u32(xCtr), addIndexes), xCtrWidthSIMD);
 				//drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
 			}
 			// Goto next row in source and destination image
@@ -115,7 +115,7 @@ void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32 transColor, ui
 				// Now this is pretty much the same as before with non-scaled code, except that we use
 				// our dummy source buffer instead of the actuall source bitmap
 				byte *destPtr = &destP[destX * (uintptr_t)DestBytesPerPixel];
-				vector unsigned int skipMask = vec_cmpeq(vec_add(vec_splat_u32(xCtr), addIndexes), xCtrWidthSIMD);
+				vector unsigned int skipMask = (vector unsigned int)vec_cmpeq(vec_add(vec_splat_u32(xCtr), addIndexes), xCtrWidthSIMD);
 				//drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, (const byte *)srcBuffer, tint, alphas, maskedAlphas, transColors, 1, 0, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
 			}
 			// We calculate every row here except the last (because then we need to
@@ -244,7 +244,7 @@ void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32 transColor, uint32 alp
 			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
 				byte *destPtr = &destP[destX * 2];
 				// Skip pixels that are beyond the row
-				vector unsigned int skipMask = vec_cmpgt(vec_add(vec_add(vec_splat_u16(xCtr), addIndexes) vec_splat_u16(1)), xCtrWidthSIMD);
+				vector unsigned int skipMask = (vector unsigned int)vec_cmpgt(vec_add(vec_add(vec_splat_u16(xCtr), addIndexes) vec_splat_u16(1)), xCtrWidthSIMD);
 				//drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
 			}
 			// Goto next row in source and destination image
@@ -267,7 +267,7 @@ void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32 transColor, uint32 alp
 			uint16 srcBuffer[8];
 			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart, scaleXCtr = xCtrStart * scaleX; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
 				if (yCtr + 1 == yCtrHeight && xCtr + 8 > xCtrWidth) break;
-				vector unsigned int indexes = vec_splat_u32(scaleXCtr), indexes2 = vect_splat_u32(scaleXCtr);
+				vector unsigned int indexes = vec_splat_u32(scaleXCtr), indexes2 = vec_splat_u32(scaleXCtr);
 #if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
 				// Calculate in parallel the indexes of the pixels
 				indexes = vec_sl(vec_sr(vec_add(indexes, scaleAdds), 8), 1);
@@ -289,7 +289,7 @@ void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32 transColor, uint32 alp
 				// Now this is pretty much the same as before with non-scaled code, except that we use
 				// our dummy source buffer instead of the actuall source bitmap
 				byte *destPtr = &destP[destX * 2];
-				vector unsigned int skipMask = vec_cmpgt(vec_add(vec_add(vec_splat_u16(xCtr), addIndexes) vec_splat_u16(1)), xCtrWidthSIMD);
+				vector unsigned int skipMask = (vector unsigned int)vec_cmpgt(vec_add(vec_add(vec_splat_u16(xCtr), addIndexes) vec_splat_u16(1)), xCtrWidthSIMD);
 				//drawPixelSIMD2Bpp(destPtr, (const byte *)srcBuffer, tint, alphas, transColors, 1, 0, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
 			}
 			// We calculate every row here except the last (because then we need to
@@ -458,7 +458,7 @@ void BITMAP::drawInner1Bpp(int yStart, int xStart, uint32 transColor, uint32 alp
 			}
 
 			// Mask out transparent pixels
-			vector unsigned char mask1 = skipTrans ? vec_cmpeq(srcCols, transColors) : vec_splat_u8(0);
+			vector unsigned char mask1 = skipTrans ? (vector unsigned int)vec_cmpeq(srcCols, transColors) : vec_splat_u8(0);
 			vector unsigned char final = vec_or(vec_and(srcCols, vec_nor(mask1, vec_splat_u8(0))), vec_and(destCols, mask1));
 			if (horizFlip) {
 				final = (vector unsigned char){

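A note on the casts added throughout this commit (a sketch assuming an AltiVec-enabled compiler, with a hypothetical function name): AltiVec comparison intrinsics such as vec_cmpeq and vec_cmpgt return a vector bool type, which these compilers will not convert implicitly, so each mask is cast back to the unsigned vector type before it is combined with vec_and/vec_or.

#ifdef __ALTIVEC__
#include <altivec.h>

// Shows the cast pattern from the diff above: the comparison result is
// converted back to an unsigned vector so it can be used as a bit mask.
static vector unsigned int makeSkipMask(vector unsigned int xCtrs, vector unsigned int rowWidth) {
	return (vector unsigned int)vec_cmpeq(xCtrs, rowWidth);
}
#endif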

Commit: 29a0903e5b000ae1959753f98f3798aefa49e8f9
    https://github.com/scummvm/scummvm/commit/29a0903e5b000ae1959753f98f3798aefa49e8f9
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: Holding off PowerPC blitting optimizations

I'm taking GSoC in a slightly different direction. I will still finish the
PowerPC blending/blitting optimizations, but for now I'm focusing on the
general Graphics::Surface and Graphics::ManagedSurface code.

Changed paths:
    engines/ags/globals.cpp
    engines/ags/lib/allegro/surface_simd_ppc.cpp
    engines/ags/lib/allegro/surface_simd_ppc.h


diff --git a/engines/ags/globals.cpp b/engines/ags/globals.cpp
index 8a3fd9a06e9..dea91412f98 100644
--- a/engines/ags/globals.cpp
+++ b/engines/ags/globals.cpp
@@ -128,16 +128,17 @@ static bool checkForSIMDExtensions() {
 #  endif
 #elif defined(__aarch64__)
 	return true;
-#elif defined(__powerpc__)
-#  if __GNUC__ > 6 || \
-		(__GNUC__ == 6 && (__GNUC_MINOR__ > 59 || \
-							__GNUC_MINOR__ == 59) && \
-							(__GNUC_PATCHLEVEL__ > 21 || \
-							__GNUC_PATCHLEVEL__ == 21))
-	return __builtin_cpu_supports("altivec");
-#  else
-	return true; // Just assume that we have these extensions 
-#  endif
+// TODO: Complete PowerPC code
+//#elif defined(__powerpc__)
+//#  if __GNUC__ > 6 || \
+//		(__GNUC__ == 6 && (__GNUC_MINOR__ > 59 || \
+//							__GNUC_MINOR__ == 59) && \
+//							(__GNUC_PATCHLEVEL__ > 21 || \
+//							__GNUC_PATCHLEVEL__ == 21))
+//	return __builtin_cpu_supports("altivec");
+//#  else
+//	return true; // Just assume that we have these extensions 
+//#  endif
 #else
 	return false;
 #endif
diff --git a/engines/ags/lib/allegro/surface_simd_ppc.cpp b/engines/ags/lib/allegro/surface_simd_ppc.cpp
index a91449c2b4d..8a328104e8e 100644
--- a/engines/ags/lib/allegro/surface_simd_ppc.cpp
+++ b/engines/ags/lib/allegro/surface_simd_ppc.cpp
@@ -1,4 +1,6 @@
-#include "ags/lib/allegro/gfx.h"
+// TODO: Complete PowerPC code
+
+/*#include "ags/lib/allegro/gfx.h"
 #include "ags/lib/allegro/color.h"
 #include "ags/lib/allegro/flood.h"
 #include "ags/ags.h"
@@ -350,9 +352,9 @@ void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32 transColor, uint32 alp
 				gSrc = tintGreen;
 				bSrc = tintBlue;
 				aSrc = srcAlpha;
-			}/* else {
-				format.colorToARGB((uint32)(*(uint16 *)destVal), aDest, rDest, gDest, bDest);
-			}*/
+			}// else {
+			//	format.colorToARGB((uint32)(*(uint16 *)destVal), aDest, rDest, gDest, bDest);
+			//}
 			blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, srcAlpha, useTint, destVal);
 			srcCol = format.ARGBToColor(aDest, rDest, gDest, bDest);
 		} else {
@@ -507,4 +509,4 @@ template void BITMAP::drawInner1Bpp<0x100>(int, int, uint32, uint32, PALETTE, in
 
 } // namespace AGS3
 
-#endif
+#endif*/
diff --git a/engines/ags/lib/allegro/surface_simd_ppc.h b/engines/ags/lib/allegro/surface_simd_ppc.h
index 95860b9d5fe..e6152b5e15b 100644
--- a/engines/ags/lib/allegro/surface_simd_ppc.h
+++ b/engines/ags/lib/allegro/surface_simd_ppc.h
@@ -23,16 +23,18 @@
 
 #if defined(__powerpc__)
 
-#ifndef AGS_LIB_ALLEGRO_SURFACE_SIMD_IMPL
-#define AGS_LIB_ALLEGRO_SURFACE_SIMD_IMPL
-#endif
-#ifndef AGS_LIB_ALLEGRO_SURFACE_SIMD_PPC_IMPL
-#define AGS_LIB_ALLEGRO_SURFACE_SIMD_PPC_IMPL
-#endif
-
-#include <altivec.h>
-#include "ags/globals.h"
-#include "ags/lib/allegro/surface.h"
+// TODO: Complete PowerPC code
+
+//#ifndef AGS_LIB_ALLEGRO_SURFACE_SIMD_IMPL
+//#define AGS_LIB_ALLEGRO_SURFACE_SIMD_IMPL
+//#endif
+//#ifndef AGS_LIB_ALLEGRO_SURFACE_SIMD_PPC_IMPL
+//#define AGS_LIB_ALLEGRO_SURFACE_SIMD_PPC_IMPL
+//#endif
+//
+//#include <altivec.h>
+//#include "ags/globals.h"
+//#include "ags/lib/allegro/surface.h"
 
 namespace AGS3 {
 

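For reference, a sketch of the runtime check the commented-out block would perform once the PowerPC path is revived; the helper name and the exact GCC version cutoff are assumptions, and the guard limits it to compilers where __builtin_cpu_supports understands "altivec".

static bool hasAltivecAtRuntime() {
#if defined(__powerpc__) && defined(__GNUC__) && __GNUC__ >= 6
	return __builtin_cpu_supports("altivec");   // runtime AltiVec detection
#else
	return false;   // assume no AltiVec, or detect it some other way
#endif
}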

Commit: 6c353ba72b71f6df84ccd927eb72d37c506a8361
    https://github.com/scummvm/scummvm/commit/6c353ba72b71f6df84ccd927eb72d37c506a8361
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: Use main SIMD detection features

Changed paths:
    engines/ags/globals.cpp
    engines/ags/globals.h
    engines/ags/lib/allegro/surface.cpp


diff --git a/engines/ags/globals.cpp b/engines/ags/globals.cpp
index dea91412f98..9d89bed9aae 100644
--- a/engines/ags/globals.cpp
+++ b/engines/ags/globals.cpp
@@ -103,52 +103,12 @@ namespace AGS3 {
 
 Globals *g_globals;
 
-static bool checkForSIMDExtensions() {
-#if defined(__x86_64__) || defined(__i686__)
-#  ifdef __GNUC__
-	uint32 extensions;
-	asm ("mov $1, %%eax\n\t"
-		 "cpuid\n\t"
-		 "mov %%edx, %0\n\t"
-		 : "=rm" (extensions)
-		 :
-		 : "eax", "ebx", "ecx", "edx");
-	return extensions & (1 << 26); // SSE2 extensions bit
-#  elif _MSC_VER
-	uint32 extensions;
-	__asm
-	{
-		mov eax,1
-		cpuid
-		mov extensions,edx
-	}
-	return extensions & (1 << 26); // SSE2 extensions bit
-#  else
-	return false;
-#  endif
-#elif defined(__aarch64__)
-	return true;
-// TODO: Complete PowerPC code
-//#elif defined(__powerpc__)
-//#  if __GNUC__ > 6 || \
-//		(__GNUC__ == 6 && (__GNUC_MINOR__ > 59 || \
-//							__GNUC_MINOR__ == 59) && \
-//							(__GNUC_PATCHLEVEL__ > 21 || \
-//							__GNUC_PATCHLEVEL__ == 21))
-//	return __builtin_cpu_supports("altivec");
-//#  else
-//	return true; // Just assume that we have these extensions 
-//#  endif
-#else
-	return false;
-#endif
-}
-
 Globals::Globals() {
 	g_globals = this;
 
 	// Allegro globals
-	__bitmap_simd_optimizations = checkForSIMDExtensions();
+	_simd_flags |= g_system->hasFeature(OSystem::kFeatureCpuNEON) ? SIMD_NEON : SIMD_NONE;
+	_simd_flags |= g_system->hasFeature(OSystem::kFeatureCpuSSE2) ? SIMD_SSE2 : SIMD_NONE;
 	Common::fill((byte *)&_black_palette, (byte *)&_black_palette + PAL_SIZE, 0);
 	Common::fill((byte *)&_current_palette, (byte *)&_current_palette + PAL_SIZE, 0);
 	Common::fill((byte *)&_prev_current_palette, (byte *)&_prev_current_palette + PAL_SIZE, 0);
diff --git a/engines/ags/globals.h b/engines/ags/globals.h
index 8d9c53bb4d8..cacb17fb8ea 100644
--- a/engines/ags/globals.h
+++ b/engines/ags/globals.h
@@ -184,6 +184,12 @@ struct ViewStruct;
 
 class Globals {
 public:
+	enum SimdFlags : uint {
+		SIMD_NONE = 0,
+		SIMD_NEON = (1 << 0),
+		SIMD_SSE2 = (1 << 1),
+	};
+
 	/**
 	 * @defgroup agsglobals AGS Globals
 	 * @ingroup agsengine
@@ -221,7 +227,7 @@ public:
 	int _trans_blend_green = 0;
 	int _trans_blend_blue = 0;
 	BlenderMode __blender_mode = kRgbToRgbBlender;
-	bool __bitmap_simd_optimizations = true;
+	uint _simd_flags = SIMD_NONE;
 	/* current format information and worker routines */
 	int _utype = U_UTF8;
 
diff --git a/engines/ags/lib/allegro/surface.cpp b/engines/ags/lib/allegro/surface.cpp
index 8d3a04d747c..f8094a0b1f3 100644
--- a/engines/ags/lib/allegro/surface.cpp
+++ b/engines/ags/lib/allegro/surface.cpp
@@ -294,7 +294,7 @@ void BITMAP::draw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
 
 #define DRAWINNER(func) func(yStart, xStart, transColor, alphaMask, palette, useTint, sameFormat, src, destArea, horizFlip, vertFlip, skipTrans, srcAlpha, tintRed, tintGreen, tintBlue, dstRect, srcArea, _G(_blender_mode), 0, 0)
 	// Calling drawInnerXXXX with a ScaleThreshold of 0 just does normal un-scaled drawing
-	if (!_G(_bitmap_simd_optimizations)) {
+	if (_G(simd_flags) == AGS3::Globals::SIMD_NONE) {
 		if (sameFormat) {
 			switch (format.bytesPerPixel) {
 			case 1: DRAWINNER((drawInnerGeneric<1, 1, 0>)); return;
@@ -376,7 +376,7 @@ void BITMAP::stretchDraw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
 	int yStart = (dstRect.top < destRect.top) ? dstRect.top - destRect.top : 0;
 
 #define DRAWINNER(func) func(yStart, xStart, transColor, alphaMask, palette, 0, sameFormat, src, destArea, false, false, skipTrans, srcAlpha, 0, 0, 0, dstRect, srcRect, _G(_blender_mode), scaleX, scaleY)
-	if (!_G(_bitmap_simd_optimizations)) {
+	if (_G(simd_flags) == AGS3::Globals::SIMD_NONE) {
 		if (sameFormat) {
 			switch (format.bytesPerPixel) {
 			case 1: DRAWINNER((drawInnerGeneric<1, 1, SCALE_THRESHOLD>)); return;

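Condensed from the diff above into a standalone sketch (the helper name is an assumption; the feature flags and includes come from the diff itself): the engine now derives its SIMD flags from the backend's CPU feature queries instead of hand-rolled cpuid checks, and the blitters fall back to drawInnerGeneric when the resulting mask is SIMD_NONE.

#include "common/system.h"   // OSystem, g_system
#include "ags/globals.h"     // Globals::SimdFlags

static uint detectSimdFlags() {
	uint flags = AGS3::Globals::SIMD_NONE;
	if (g_system->hasFeature(OSystem::kFeatureCpuNEON))
		flags |= AGS3::Globals::SIMD_NEON;
	if (g_system->hasFeature(OSystem::kFeatureCpuSSE2))
		flags |= AGS3::Globals::SIMD_SSE2;
	return flags;
}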

Commit: a1858e31f01f1432a3ae901fb4042dffd5b71744
    https://github.com/scummvm/scummvm/commit/a1858e31f01f1432a3ae901fb4042dffd5b71744
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: JANITORIAL: Cleaned up old bliting files

Changed paths:
  R engines/ags/lib/allegro/surface_simd_ppc.cpp
  R engines/ags/lib/allegro/surface_simd_ppc.h


diff --git a/engines/ags/lib/allegro/surface_simd_ppc.cpp b/engines/ags/lib/allegro/surface_simd_ppc.cpp
deleted file mode 100644
index 8a328104e8e..00000000000
--- a/engines/ags/lib/allegro/surface_simd_ppc.cpp
+++ /dev/null
@@ -1,512 +0,0 @@
-// TODO: Complete PowerPC code
-
-/*#include "ags/lib/allegro/gfx.h"
-#include "ags/lib/allegro/color.h"
-#include "ags/lib/allegro/flood.h"
-#include "ags/ags.h"
-#include "ags/globals.h"
-#include "common/textconsole.h"
-#include "graphics/screen.h"
-
-#include "ags/lib/allegro/surface_simd_ppc.h"
-
-#ifdef AGS_LIB_ALLEGRO_SURFACE_SIMD_PPC_IMPL
-
-namespace AGS3 {
-
-// This template handles 2bpp and 4bpp, the other specializations handle 1bpp and format conversion blits
-template<int DestBytesPerPixel, int SrcBytesPerPixel, int ScaleThreshold>
-void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, int useTint, int sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, int horizFlip, int vertFlip, int skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
-	drawInnerGeneric(yStart, xStart, transColor, alphaMask, palette, useTint, sameFormat, src, destArea, horizFlip, vertFlip, skipTrans, srcAlpha, tintRed, tintGreen, tintBlue, dstRect, srcArea, blenderMode, scaleX, scaleY);
-	return;
-	const int xDir = horizFlip ? -1 : 1;
-	byte rSrc, gSrc, bSrc, aSrc;
-	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
-	vector unsigned int tint = vec_sl(vec_splat_u32(srcAlpha), 24);
-	tint = vec_or(tint, vec_sl(vec_splat_u32(tintRed), 16));
-	tint = vec_or(tint, vec_sl(vec_splat_u32(tintGreen), 8));
-	tint = vec_or(tint, vec_splat_u32(tintBlue));
-	vector unsigned int maskedAlphas = vec_splat_u32(alphaMask);
-	vector unsigned int transColors = vec_splat_u32(transColor);
-	vector unsigned int alphas = vec_splat_u32(srcAlpha);
-
-	// This is so that we can calculate what pixels to crop off in a vectorized way
-	vector unsigned int addIndexes = (vector unsigned int){0, 1, 2, 3};
-	if (horizFlip) addIndexes = (vector unsigned int){3, 2, 1, 0};
-
-	// This is so that we can calculate in parralell the pixel indexes for scaled drawing
-	vector unsigned int scaleAdds = (vector unsigned int){0, (uint32)scaleX, (uint32)scaleX*2, (uint32)scaleX*3};
-
-	// Clip the bounds ahead of time (so we don't waste time checking if we are in bounds when
-	// we are in the inner loop)
-	int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = dstRect.width();
-	if (xStart + xCtrWidth > destArea.w) {
-		xCtrWidth = destArea.w - xStart;
-	}
-	if (xStart < 0) {
-		xCtrStart = -xStart;
-		xCtrBppStart = xCtrStart * SrcBytesPerPixel;
-		xStart = 0;
-	}
-	int destY = yStart, srcYCtr = 0, yCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 4 == 0) ? dstRect.height() : (dstRect.height() - 1);
-	if (ScaleThreshold != 0) yCtrHeight = dstRect.height();
-	if (yStart < 0) {
-		yCtr = -yStart;
-		destY = 0;
-		if (ScaleThreshold != 0) {
-			scaleYCtr = yCtr * scaleY;
-			srcYCtr = scaleYCtr / ScaleThreshold;
-		}
-	}
-	if (yStart + yCtrHeight > destArea.h) {
-		yCtrHeight = destArea.h - yStart;
-	}
-	
-	byte *destP = (byte *)destArea.getBasePtr(0, destY);
-	const byte *srcP = (const byte *)src.getBasePtr(
-	                       horizFlip ? srcArea.right - 4 : srcArea.left,
-	                       vertFlip ? srcArea.bottom - 1 - yCtr : srcArea.top + yCtr);
-	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += scaleY) {
-		vector unsigned int xCtrWidthSIMD = vec_splat_u32(xCtrWidth); // This is the width of the row
-
-		if (ScaleThreshold == 0) {
-			// If we are not scaling the image
-			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
-				byte *destPtr = &destP[destX * DestBytesPerPixel];
-				// Skip pixels that are beyond the row
-				vector unsigned int skipMask = (vector unsigned int)vec_cmpeq(vec_add(vec_splat_u32(xCtr), addIndexes), xCtrWidthSIMD);
-				//drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
-			}
-			// Goto next row in source and destination image
-			destP += destArea.pitch;
-			srcP += vertFlip ? -src.pitch : src.pitch;
-		} else {
-			// Here we are scaling the image
-			int newSrcYCtr = scaleYCtr / ScaleThreshold;
-			// Since the source yctr might not update every row of the destination, we have
-			// to see if we are on a new row...
-			if (srcYCtr != newSrcYCtr) {
-				int diffSrcYCtr = newSrcYCtr - srcYCtr; // Have we moved yet
-				srcP += src.pitch * diffSrcYCtr;
-				srcYCtr = newSrcYCtr;
-			}
-
-			// Now also since we might skip a pixel or 2 or duplicate one to reach the desired
-			// scaling size, we create a small dummy buffer that we copy the pixels into and then
-			// call the drawPixelsSIMD function
-			byte srcBuffer[4*4];
-			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart, scaleXCtr = xCtrStart * scaleX; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
-				if (yCtr + 1 == yCtrHeight && xCtr + 4 > xCtrWidth) break; // Don't go past the last 4 pixels
-				vector unsigned int indexes = vec_splat_u32(scaleXCtr);
-#if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
-				// Calculate in parallel the indexes of the pixels
-				if (SrcBytesPerPixel == 4)
-					indexes = vec_sl(vec_sr(vec_add(indexes, scaleAdds), 8), 2);
-				else
-					indexes = vec_sl(vec_sr(vec_add(indexes, scaleAdds), 8), 1);
-#else
-#error Change code to allow different scale threshold!
-#endif
-				// Simply memcpy them in. memcpy has no real performance overhead here
-				memcpy(&srcBuffer[0*(uintptr_t)SrcBytesPerPixel], srcP + indexes[0], SrcBytesPerPixel);
-				memcpy(&srcBuffer[1*(uintptr_t)SrcBytesPerPixel], srcP + indexes[1], SrcBytesPerPixel);
-				memcpy(&srcBuffer[2*(uintptr_t)SrcBytesPerPixel], srcP + indexes[2], SrcBytesPerPixel);
-				memcpy(&srcBuffer[3*(uintptr_t)SrcBytesPerPixel], srcP + indexes[3], SrcBytesPerPixel);
-				scaleXCtr += scaleX*4;
-
-				// Now this is pretty much the same as before with non-scaled code, except that we use
-				// our dummy source buffer instead of the actual source bitmap
-				byte *destPtr = &destP[destX * (uintptr_t)DestBytesPerPixel];
-				vector unsigned int skipMask = (vector unsigned int)vec_cmpeq(vec_add(vec_splat_u32(xCtr), addIndexes), xCtrWidthSIMD);
-				//drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, (const byte *)srcBuffer, tint, alphas, maskedAlphas, transColors, 1, 0, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
-			}
-			// We calculate every row here except the last (because then we need to
-			// check whether we fall off the edge of the row).
-			// The only exception is scaled drawing, and that is because:
-			// 1) if statements are costly, and the fewer we do the faster this loop is
-			// 2) with this, the only branch in the normal drawing loop is the width check
-			// 3) the scaling code actually draws up to the last few pixels of the image
-			//    and does the extra if checks, because the scaling code is already much
-			//    slower than the normal drawing loop, and less duplicate code helps here.
-			if (yCtr + 1 != yCtrHeight) destP += destArea.pitch;
-		}
-	}
-
-	// Get the last x values of the last row
-	int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart;
-	// We have a picture that is a multiple of 4, so no extra pixels to draw
-	if (xCtrWidth % 4 == 0) return;
-	// Drawing the last few not scaled pixels here.
-	// Same as the loop above but now we check if we are going to overflow,
-	// and thus we don't need to mask out pixels that go over the row.
-	if (ScaleThreshold == 0) {
-		for (; xCtr + 4 < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
-			byte *destPtr = &destP[(ptrdiff_t)destX * DestBytesPerPixel];
-			//drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, vmovq_n_u32(0));
-		}
-		// Because we move in 4 pixel units, and horizFlip moves in 1, we have to move
-		// 1 pixel past the last pixel we did not blit, meaning going forward 3 pixels.
-		if (horizFlip) srcP += SrcBytesPerPixel * 3;
-	} else {
-		// So if we are scaling, set up the xCtr to what it was before (AKA the last 4 or so pixels of the image)
-		xCtr = xCtrWidth - xCtrWidth % 4;
-		xCtrBpp = xCtr * SrcBytesPerPixel;
-		destX = xStart+xCtr;
-	}
-
-	// For the last 4 pixels, we just do them in serial, nothing special
-	for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += SrcBytesPerPixel) {
-		const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
-		if (ScaleThreshold != 0) {
-			srcColPtr = (const byte *)(srcP + (xCtr * scaleX) / ScaleThreshold * SrcBytesPerPixel);
-		}
-		byte *destVal = (byte *)&destP[destX * DestBytesPerPixel];
-		uint32 srcCol = getColor(srcColPtr, SrcBytesPerPixel);
-		
-		// Check if this is a transparent color we should skip
-		if (skipTrans && ((srcCol & alphaMask) == transColor))
-			continue;
-
-		src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
-		if (srcAlpha != -1) {
-			if (useTint) {
-				rDest = rSrc;
-				gDest = gSrc;
-				bDest = bSrc;
-				aDest = aSrc;
-				rSrc = tintRed;
-				gSrc = tintGreen;
-				bSrc = tintBlue;
-				aSrc = srcAlpha;
-			}
-			blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, srcAlpha, useTint, destVal);
-			srcCol = format.ARGBToColor(aDest, rDest, gDest, bDest);
-		} else {
-			srcCol = format.ARGBToColor(aSrc, rSrc, gSrc, bSrc);
-		}
-		if (DestBytesPerPixel == 4)
-			*(uint32 *)destVal = srcCol;
-		else
-			*(uint16 *)destVal = srcCol;
-	}
-}
-
-template<int ScaleThreshold>
-void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, int useTint, int sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, int horizFlip, int vertFlip, int skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
-	drawInnerGeneric(yStart, xStart, transColor, alphaMask, palette, useTint, sameFormat, src, destArea, horizFlip, vertFlip, skipTrans, srcAlpha, tintRed, tintGreen, tintBlue, dstRect, srcArea, blenderMode, scaleX, scaleY);
-	return;
-	const int xDir = horizFlip ? -1 : 1;
-	byte rSrc, gSrc, bSrc, aSrc;
-	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
-	vector unsigned short tint = vec_splat_u16(src.format.ARGBToColor(srcAlpha, tintRed, tintGreen, tintBlue));
-	vector unsigned short transColors = vec_splat_u16(transColor);
-	vector unsigned short alphas = vec_splat_u16(srcAlpha);
-
-	// This is so that we can calculate what pixels to crop off in a vectorized way
-	vector unsigned short addIndexes = (vector unsigned short){0, 1, 2, 3, 4, 5, 6, 7};
-
-	// This is so that we can calculate in parallel the pixel indexes for scaled drawing
-	if (horizFlip) addIndexes = (vector unsigned short){7, 6, 5, 4, 3, 2, 1, 0};
-	vector unsigned int scaleAdds = (vector unsigned int){0, (uint32)scaleX, (uint32)scaleX*2, (uint32)scaleX*3};
-	vector unsigned int scaleAdds2 = (vector unsigned int){(uint32)scaleX*4, (uint32)scaleX*5, (uint32)scaleX*6, (uint32)scaleX*7};
-
-	// Clip the bounds ahead of time (so we don't waste time checking if we are in bounds when
-	// we are in the inner loop)
-	int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = dstRect.width();
-	if (xStart + xCtrWidth > destArea.w) {
-		xCtrWidth = destArea.w - xStart;
-	}
-	if (xStart < 0) {
-		xCtrStart = -xStart;
-		xCtrBppStart = xCtrStart * 2;
-		xStart = 0;
-	}
-	int destY = yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 8 == 0) ? dstRect.height() : (dstRect.height() - 1);
-	if (ScaleThreshold != 0) yCtrHeight = dstRect.height();
-	if (yStart < 0) {
-		yCtr = -yStart;
-		destY = 0;
-		if (ScaleThreshold != 0) {
-			scaleYCtr = yCtr * scaleY;
-			srcYCtr = scaleYCtr / ScaleThreshold;
-		}
-	}
-	if (yStart + yCtrHeight > destArea.h) {
-		yCtrHeight = destArea.h - yStart;
-	}
-	
-	byte *destP = (byte *)destArea.getBasePtr(0, destY);
-	const byte *srcP = (const byte *)src.getBasePtr(
-	                       horizFlip ? srcArea.right - 8 : srcArea.left,
-	                       vertFlip ? srcArea.bottom - 1 - yCtr : srcArea.top + yCtr);
-	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += scaleY) {
-		vector unsigned short xCtrWidthSIMD = vec_splat_u16(xCtrWidth); // This is the width of the row
-		if (ScaleThreshold == 0) {
-			// If we are not scaling the image
-			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
-				byte *destPtr = &destP[destX * 2];
-				// Skip pixels that are beyond the row
-				vector unsigned int skipMask = (vector unsigned int)vec_cmpgt(vec_add(vec_add(vec_splat_u16(xCtr), addIndexes), vec_splat_u16(1)), xCtrWidthSIMD);
-				//drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
-			}
-			// Goto next row in source and destination image
-			destP += destArea.pitch;
-			srcP += vertFlip ? -src.pitch : src.pitch;
-		} else {
-			// Here we are scaling the image
-			int newSrcYCtr = scaleYCtr / ScaleThreshold;
-			// Since the source yctr might not update every row of the destination, we have
-			// to see if we are on a new row...
-			if (srcYCtr != newSrcYCtr) {
-				int diffSrcYCtr = newSrcYCtr - srcYCtr;
-				srcP += src.pitch * diffSrcYCtr;
-				srcYCtr = newSrcYCtr;
-			}
-
-			// Now also since we might skip a pixel or 2 or duplicate one to reach the desired
-			// scaling size, we create a small dummy buffer that we copy the pixels into and then
-			// call the drawPixelsSIMD function
-			uint16 srcBuffer[8];
-			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart, scaleXCtr = xCtrStart * scaleX; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
-				if (yCtr + 1 == yCtrHeight && xCtr + 8 > xCtrWidth) break;
-				vector unsigned int indexes = vec_splat_u32(scaleXCtr), indexes2 = vec_splat_u32(scaleXCtr);
-#if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
-				// Calculate in parallel the indexes of the pixels
-				indexes = vec_sl(vec_sr(vec_add(indexes, scaleAdds), 8), 1);
-				indexes2 = vec_sl(vec_sr(vec_add(indexes2, scaleAdds2), 8), 1);
-#else
-#error Change code to allow different scale threshold!
-#endif
-				// Simply load them in directly; these scalar loads have no real performance overhead here
-				srcBuffer[0] = *(const uint16 *)(srcP + indexes[0]);
-				srcBuffer[1] = *(const uint16 *)(srcP + indexes[1]);
-				srcBuffer[2] = *(const uint16 *)(srcP + indexes[2]);
-				srcBuffer[3] = *(const uint16 *)(srcP + indexes[3]);
-				srcBuffer[4] = *(const uint16 *)(srcP + indexes2[0]);
-				srcBuffer[5] = *(const uint16 *)(srcP + indexes2[1]);
-				srcBuffer[6] = *(const uint16 *)(srcP + indexes2[2]);
-				srcBuffer[7] = *(const uint16 *)(srcP + indexes2[3]);
-				scaleXCtr += scaleX*8;
-
-				// Now this is pretty much the same as before with non-scaled code, except that we use
-				// our dummy source buffer instead of the actual source bitmap
-				byte *destPtr = &destP[destX * 2];
-				vector unsigned int skipMask = (vector unsigned int)vec_cmpgt(vec_add(vec_add(vec_splat_u16(xCtr), addIndexes), vec_splat_u16(1)), xCtrWidthSIMD);
-				//drawPixelSIMD2Bpp(destPtr, (const byte *)srcBuffer, tint, alphas, transColors, 1, 0, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
-			}
-			// We calculate every row here except the last (because then we need to
-			// check whether we fall off the edge of the row).
-			// The only exception is scaled drawing, and that is because:
-			// 1) if statements are costly, and the fewer we do the faster this loop is
-			// 2) with this, the only branch in the normal drawing loop is the width check
-			// 3) the scaling code actually draws up to the last few pixels of the image
-			//    and does the extra if checks, because the scaling code is already much
-			//    slower than the normal drawing loop, and less duplicate code helps here.
-			if (yCtr + 1 != yCtrHeight) destP += destArea.pitch;
-		}
-	}
-
-	// We have a picture that is a multiple of 8, so no extra pixels to draw
-	if (xCtrWidth % 8 == 0) return;
-	// Get the last x values of the last row
-	int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart;
-	// Drawing the last few not scaled pixels here.
-	// Same as the loop above but now we check if we are going to overflow,
-	// and thus we don't need to mask out pixels that go over the row.
-	if (ScaleThreshold == 0) {
-		for (; xCtr + 8 < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
-			byte *destPtr = &destP[destX * 2];
-			//drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, vmovq_n_u16(0));
-		}
-		// Because we move in 8 pixel units, and horizFlip moves in 1, we have to move
-		// 1 pixel past the last pixel we did not blit, meaning going forward 7 pixels.
-		if (horizFlip) srcP += 2 * 7;
-	} else {
-		// So if we are scaling, set up the xCtr to what it was before (AKA the last 8 or so pixels of the image)
-		xCtr = xCtrWidth - xCtrWidth % 8;
-		xCtrBpp = xCtr * 2;
-		destX = xStart+xCtr;
-	}
-
-	// For the last few pixels, we just do them in serial, nothing special
-	for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += 2) {
-		const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
-		if (ScaleThreshold != 0) {
-			srcColPtr = (const byte *)(srcP + (xCtr * scaleX) / ScaleThreshold * 2);
-		}
-		byte *destVal = (byte *)&destP[destX * 2];
-		uint32 srcCol = (uint32)(*(const uint16 *)srcColPtr);
-		
-		// Check if this is a transparent color we should skip
-		if (skipTrans && srcCol == transColor)
-			continue;
-
-		src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
-		if (srcAlpha != -1) {
-			if (useTint) {
-				rDest = rSrc;
-				gDest = gSrc;
-				bDest = bSrc;
-				aDest = aSrc;
-				rSrc = tintRed;
-				gSrc = tintGreen;
-				bSrc = tintBlue;
-				aSrc = srcAlpha;
-			}// else {
-			//	format.colorToARGB((uint32)(*(uint16 *)destVal), aDest, rDest, gDest, bDest);
-			//}
-			blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, srcAlpha, useTint, destVal);
-			srcCol = format.ARGBToColor(aDest, rDest, gDest, bDest);
-		} else {
-			srcCol = format.ARGBToColor(aSrc, rSrc, gSrc, bSrc);
-		}
-		*(uint16 *)destVal = srcCol;
-	}
-}
-
-template<int ScaleThreshold>
-void BITMAP::drawInner1Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, int useTint, int sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, int horizFlip, int vertFlip, int skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
-	drawInnerGeneric(yStart, xStart, transColor, alphaMask, palette, useTint, sameFormat, src, destArea, horizFlip, vertFlip, skipTrans, srcAlpha, tintRed, tintGreen, tintBlue, dstRect, srcArea, blenderMode, scaleX, scaleY);
-	return;
-	const int xDir = horizFlip ? -1 : 1;
-	vector unsigned char transColors = vec_splat_u8(transColor);
-
-	// This is so that we can calculate in parallel the pixel indexes for scaled drawing
-	vector unsigned int scaleAdds1 = (vector unsigned int){0, (uint32)scaleX, (uint32)scaleX*2, (uint32)scaleX*3};
-	vector unsigned int scaleAdds2 = (vector unsigned int){(uint32)scaleX*4, (uint32)scaleX*5, (uint32)scaleX*6, (uint32)scaleX*7};
-	vector unsigned int scaleAdds3 = (vector unsigned int){(uint32)scaleX*8, (uint32)scaleX*9, (uint32)scaleX*10, (uint32)scaleX*11};
-	vector unsigned int scaleAdds4 = (vector unsigned int){(uint32)scaleX*12, (uint32)scaleX*13, (uint32)scaleX*14, (uint32)scaleX*15};
-	
-	// Clip the bounds ahead of time (so we don't waste time checking if we are in bounds when
-	// we are in the inner loop)
-	int xCtrStart = 0, xCtrWidth = dstRect.width();
-	if (xStart + xCtrWidth > destArea.w) {
-		xCtrWidth = destArea.w - xStart;
-	}
-	if (xStart < 0) {
-		xCtrStart = -xStart;
-		xStart = 0;
-	}
-	int destY = yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = dstRect.height();
-	if (ScaleThreshold != 0) yCtrHeight = dstRect.height();
-	if (yStart < 0) {
-		yCtr = -yStart;
-		destY = 0;
-		if (ScaleThreshold != 0) {
-			scaleYCtr = yCtr * scaleY;
-			srcYCtr = scaleYCtr / ScaleThreshold;
-		}
-	}
-	if (yStart + yCtrHeight > destArea.h) {
-		yCtrHeight = destArea.h - yStart;
-	}
-	
-	byte *destP = (byte *)destArea.getBasePtr(0, destY);
-	const byte *srcP = (const byte *)src.getBasePtr(
-	                       horizFlip ? srcArea.right - 16 : srcArea.left,
-	                       vertFlip ? srcArea.bottom - 1 - yCtr : srcArea.top + yCtr);
-	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += scaleY) {
-		if (ScaleThreshold != 0) {
-			// So here we update the srcYCtr differently due to this being for
-			// scaling
-			int newSrcYCtr = scaleYCtr / ScaleThreshold;
-			if (srcYCtr != newSrcYCtr) {
-				// Since the source yctr might not update every row of the destination, we have
-				// to see if we are on a new row...
-				int diffSrcYCtr = newSrcYCtr - srcYCtr;
-				srcP += src.pitch * diffSrcYCtr;
-				srcYCtr = newSrcYCtr;
-			}
-		}
-		int xCtr = xCtrStart, destX = xStart, scaleXCtr = xCtrStart * scaleX;
-		for (; xCtr + 16 < xCtrWidth; destX += 16, xCtr += 16) {
-			byte *destPtr = &destP[destX];
-
-			// Here we don't use the drawPixelSIMD function because 1bpp bitmaps in allegro
-			// can't have any blending applied to them
-			vector unsigned char destCols;
-			memcpy(&destCols, destPtr, sizeof(destCols)); // There are no unaligned load instructions in AltiVec
-			vector unsigned char srcCols;
-			memcpy(&srcCols, srcP + xDir * xCtr, sizeof(srcCols));
-			if (ScaleThreshold != 0) {
-				// If we are scaling, we have to set each pixel individually
-				vector unsigned int indexes1 = vec_splat_u32(scaleXCtr), indexes2 = vec_splat_u32(scaleXCtr);
-				vector unsigned int indexes3 = vec_splat_u32(scaleXCtr), indexes4 = vec_splat_u32(scaleXCtr);
-#if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
-				indexes1 = vec_sr(vec_add(indexes1, scaleAdds1), 8);
-				indexes2 = vec_sr(vec_add(indexes2, scaleAdds2), 8);
-				indexes3 = vec_sr(vec_add(indexes3, scaleAdds3), 8);
-				indexes4 = vec_sr(vec_add(indexes4, scaleAdds4), 8);
-#else
-#error Change code to allow different scale threshold!
-#endif
-				srcCols[0] = srcP[indexes1[0]];
-				srcCols[1] = srcP[indexes1[1]];
-				srcCols[2] = srcP[indexes1[2]];
-				srcCols[3] = srcP[indexes1[3]];
-				srcCols[4] = srcP[indexes2[0]];
-				srcCols[5] = srcP[indexes2[1]];
-				srcCols[6] = srcP[indexes2[2]];
-				srcCols[7] = srcP[indexes2[3]];
-				srcCols[8] = srcP[indexes3[0]];
-				srcCols[9] = srcP[indexes3[1]];
-				srcCols[10] = srcP[indexes3[2]];
-				srcCols[11] = srcP[indexes3[3]];
-				srcCols[12] = srcP[indexes4[0]];
-				srcCols[13] = srcP[indexes4[1]];
-				srcCols[14] = srcP[indexes4[2]];
-				srcCols[15] = srcP[indexes4[3]];
-				scaleXCtr += scaleX*16;
-			}
-
-			// Mask out transparent pixels
-			vector unsigned char mask1 = skipTrans ? (vector unsigned char)vec_cmpeq(srcCols, transColors) : vec_splat_u8(0);
-			vector unsigned char final = vec_or(vec_and(srcCols, vec_nor(mask1, vec_splat_u8(0))), vec_and(destCols, mask1));
-			if (horizFlip) {
-				final = (vector unsigned char){
-					final[15], final[14], final[13], final[12],
-					final[11], final[10], final[9], final[8],
-					final[7], final[6], final[5], final[4],
-					final[3], final[2], final[1], final[0],
-				};
-			}
-			memcpy(destPtr, &final, sizeof(final));
-		}
-		// Get the last x values
-
-		// Because we move in 16 pixel units, and horizFlip moves in 1, we have to move
-		// 1 pixel past the last pixel we did not blit, meaning going forward 15 pixels.
-		if (horizFlip) srcP += 15;
-		for (; xCtr < xCtrWidth; ++destX, ++xCtr, scaleXCtr += scaleX) {
-			const byte *srcCol = (const byte *)(srcP + xDir * xCtr);
-			if (ScaleThreshold != 0) {
-				srcCol = (const byte *)(srcP + scaleXCtr / ScaleThreshold);
-			}
-			// Check if this is a transparent color we should skip
-			if (skipTrans && *srcCol == transColor)
-				continue;
-
-			byte *destVal = (byte *)&destP[destX];
-			*destVal = *srcCol;
-		}
-		if (horizFlip) srcP -= 15; // Undo what we did up there
-		destP += destArea.pitch; // Goto next row
-		// Only advance the src row by 1 every time like this if we don't scale
-		if (ScaleThreshold == 0) srcP += vertFlip ? -src.pitch : src.pitch;
-	}
-}
-
-template void BITMAP::drawInner4BppWithConv<4, 4, 0>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner4BppWithConv<4, 4, 0x100>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner4BppWithConv<4, 2, 0>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner4BppWithConv<4, 2, 0x100>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner4BppWithConv<2, 4, 0>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner4BppWithConv<2, 4, 0x100>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner2Bpp<0>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner2Bpp<0x100>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner1Bpp<0>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner1Bpp<0x100>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-
-} // namespace AGS3
-
-#endif*/
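
For reference, here is a minimal scalar sketch (illustrative names only, not engine code) of the row structure the drawInner* routines above share: the main loop handles whole groups of pixels, and the remainder of the row is finished serially, which is what the SIMD paths do with their skip masks and tail loops.

#include <cstdint>
#include <cstdio>

// copyGroup() stands in for drawPixelSIMD; a real SIMD path would use a
// per-lane skip mask rather than a smaller loop bound for the partial group.
static void copyGroup(uint32_t *dest, const uint32_t *src, int validLanes) {
	for (int i = 0; i < validLanes; ++i)
		dest[i] = src[i];
}

static void blitRow(uint32_t *dest, const uint32_t *src, int width) {
	int x = 0;
	for (; x + 4 <= width; x += 4)      // main loop: whole groups of 4 pixels
		copyGroup(dest + x, src + x, 4);
	if (x < width)                      // tail: the last partial group of the row
		copyGroup(dest + x, src + x, width - x);
}

int main() {
	uint32_t src[10], dest[10] = {};
	for (int i = 0; i < 10; ++i)
		src[i] = 0xff000000u | (uint32_t)i;
	blitRow(dest, src, 10);
	std::printf("dest[9] = 0x%08x\n", dest[9]);
	return 0;
}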
diff --git a/engines/ags/lib/allegro/surface_simd_ppc.h b/engines/ags/lib/allegro/surface_simd_ppc.h
deleted file mode 100644
index e6152b5e15b..00000000000
--- a/engines/ags/lib/allegro/surface_simd_ppc.h
+++ /dev/null
@@ -1,473 +0,0 @@
-/* ScummVM - Graphic Adventure Engine
- *
- * ScummVM is the legal property of its developers, whose names
- * are too numerous to list here. Please refer to the COPYRIGHT
- * file distributed with this source distribution.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- */
-#ifndef AGS_LIB_ALLEGRO_SURFACE_SIMD_PPC_H
-#define AGS_LIB_ALLEGRO_SURFACE_SIMD_PPC_H
-
-#if defined(__powerpc__)
-
-// TODO: Complete PowerPC code
-
-//#ifndef AGS_LIB_ALLEGRO_SURFACE_SIMD_IMPL
-//#define AGS_LIB_ALLEGRO_SURFACE_SIMD_IMPL
-//#endif
-//#ifndef AGS_LIB_ALLEGRO_SURFACE_SIMD_PPC_IMPL
-//#define AGS_LIB_ALLEGRO_SURFACE_SIMD_PPC_IMPL
-//#endif
-//
-//#include <altivec.h>
-//#include "ags/globals.h"
-//#include "ags/lib/allegro/surface.h"
-
-namespace AGS3 {
-
-/*inline uint32x4_t simd2BppTo4Bpp(uint16x4_t pixels) {
-	uint32x4_t x = vmovl_u16(pixels);
-
-	// c is the extracted 5/6 bit color from the image
-	uint32x4_t c = vshrq_n_u32(x, 11);
-
-	// We convert it back to 8 bits per channel by shifting it left three bits, and then using the 2 most
-	// significant bits of the original color for the least significant bits of the new one
-	uint32x4_t r = vshlq_n_u32(vorrq_u32(vshlq_n_u32(c, 3), vshrq_n_u32(c, 2)), 16);
-	c = vshrq_n_u32(vandq_u32(x, vmovq_n_u32(0x07e0)), 5);
-	uint32x4_t g = vshlq_n_u32(vorrq_u32(vshlq_n_u32(c, 2), vshrq_n_u32(c, 4)), 8);
-	c = vandq_u32(x, vmovq_n_u32(0x001f));
-	uint32x4_t b = vorrq_u32(vshlq_n_u32(c, 3), vshrq_n_u32(c, 2));
-
-	// By default 2bpp to 4bpp makes the alpha channel 255
-	return vorrq_u32(vorrq_u32(vorrq_u32(r, g), b), vmovq_n_u32(0xff000000));
-}
-
-inline uint16x4_t simd4BppTo2Bpp(uint32x4_t pixels) {
-	// x is the final 16 bit rgb pixel
-	uint32x4_t x = vshrq_n_u32(vandq_u32(pixels, vmovq_n_u32(0x000000ff)), 3);
-	x = vorrq_u32(x, vshlq_n_u32(vshrq_n_u32(vandq_u32(pixels, vmovq_n_u32(0x0000ff00)), 8+2), 5));
-	x = vorrq_u32(x, vshlq_n_u32(vshrq_n_u32(vandq_u32(pixels, vmovq_n_u32(0x00ff0000)), 16+3), 11));
-	return vmovn_u32(x);
-}
-
-inline uint16x8_t rgbBlendSIMD2Bpp(uint16x8_t srcCols, uint16x8_t destCols, uint16x8_t alphas) {
-	// Here we add 1 to alphas if it's 0. This is what the original blender function did
-	alphas = vaddq_u16(alphas, vandq_u16(vceqq_u16(alphas, vmovq_n_u16(0)), vmovq_n_u16(1)));
-
-	// Split the components into rgb
-	uint16x8_t srcComps[] = {
-		vandq_u16(srcCols, vmovq_n_u16(0x1f)),					// B
-		vandq_u16(vshrq_n_u16(srcCols, 5), vmovq_n_u16(0x3f)),	// G
-		vshrq_n_u16(srcCols, 11),								// R
-	}, destComps[] = {
-		vandq_u16(destCols, vmovq_n_u16(0x1f)),					// B
-		vandq_u16(vshrq_n_u16(destCols, 5), vmovq_n_u16(0x3f)), // G
-		vshrq_n_u16(destCols, 11),								// R
-	};
-
-	// At some point I made it so that it would put them into their 8bit depth format
-	// to keep the function as 1-1 with the original, but it didn't seem to help much
-	//srcComps[0] = vorrq_u16(vshlq_n_u16(srcComps[0], 3), vshrq_n_u16(srcComps[0], 2));
-	//srcComps[1] = vorrq_u16(vshlq_n_u16(srcComps[1], 2), vshrq_n_u16(srcComps[1], 4));
-	//srcComps[2] = vorrq_u16(vshlq_n_u16(srcComps[2], 3), vshrq_n_u16(srcComps[2], 2));
-	//destComps[0] = vorrq_u16(vshlq_n_u16(destComps[0], 3), vshrq_n_u16(destComps[0], 2));
-	//destComps[1] = vorrq_u16(vshlq_n_u16(destComps[1], 2), vshrq_n_u16(destComps[1], 4));
-	//destComps[2] = vorrq_u16(vshlq_n_u16(destComps[2], 3), vshrq_n_u16(destComps[2], 2));
-
-	// Calculate the differences between the colors
-	uint16x8_t diffs[] = {
-		vsubq_u16(srcComps[0], destComps[0]), // B
-		vsubq_u16(srcComps[1], destComps[1]), // G
-		vsubq_u16(srcComps[2], destComps[2]), // R
-	};
-
-	// Multiply by alpha and shift depth bits to the right
-	// pretty much the same as (int)(((float)component / 255.0f) * ((float)alpha / 255.0f) * 255.0f)
-	alphas = vshrq_n_u16(alphas, 2);
-	diffs[1] = vshrq_n_u16(vmulq_u16(diffs[1], alphas), 6);
-	alphas = vshrq_n_u16(alphas, 1);
-	diffs[0] = vshrq_n_u16(vmulq_u16(diffs[0], alphas), 5);
-	diffs[2] = vshrq_n_u16(vmulq_u16(diffs[2], alphas), 5);
-
-	// Originally, I converted it back to normal here from the 8bpp form, but don't need to do that anymore
-	//diffs[0] = vandq_u16(vshrq_n_u16(vaddq_u16(diffs[0], destComps[0]), 3), vmovq_n_u16(0x1f));
-	//diffs[1] = vandq_u16(vshrq_n_u16(vaddq_u16(diffs[1], destComps[1]), 2), vmovq_n_u16(0x3f));
-	//diffs[2] = vandq_u16(vshrq_n_u16(vaddq_u16(diffs[2], destComps[2]), 3), vmovq_n_u16(0x1f));
-
-	// Here we add the difference between the 2 colors times alpha onto the destination
-	diffs[0] = vandq_u16(vaddq_u16(diffs[0], destComps[0]), vmovq_n_u16(0x1f));
-	diffs[1] = vandq_u16(vaddq_u16(diffs[1], destComps[1]), vmovq_n_u16(0x3f));
-	diffs[2] = vandq_u16(vaddq_u16(diffs[2], destComps[2]), vmovq_n_u16(0x1f));
-
-	// We compile all the colors into diffs[0] as a 16 bit rgb pixel
-	diffs[0] = vorrq_u16(diffs[0], vshlq_n_u16(diffs[1], 5));
-	return vorrq_u16(diffs[0], vshlq_n_u16(diffs[2], 11));
-}
-
-// preserveAlpha:
-//		false => set destCols's alpha to 0
-// 		true => keep destCols's alpha
-inline uint32x4_t rgbBlendSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas, bool preserveAlpha) {
-	// Here we add 1 to alphas if it's 0. This is what the original blender function did
-	alphas = vaddq_u32(alphas, vandq_u32(vcgtq_u32(alphas, vmovq_n_u32(0)), vmovq_n_u32(1)));
-
-	// Get the alpha from the destination
-	uint32x4_t alpha = vandq_u32(destCols, vmovq_n_u32(0xff000000));
-
-	// Get red and blue components
-	uint32x4_t srcColsCopy = srcCols;
-	srcColsCopy = vandq_u32(srcColsCopy, vmovq_n_u32(0xff00ff));
-	uint32x4_t destColsCopy = destCols;
-	destColsCopy = vandq_u32(destColsCopy, vmovq_n_u32(0xff00ff));
-
-	// compute the difference, then multiply by alpha and divide by 255
-	srcColsCopy = vsubq_u32(srcColsCopy, destColsCopy);
-	srcColsCopy = vmulq_u32(srcColsCopy, alphas);
-	srcColsCopy = vshrq_n_u32(srcColsCopy, 8);
-	srcColsCopy = vaddq_u32(srcColsCopy, destCols); // Add the new red/blue to the old ones
-
-	// do the same for the green component
-	srcCols = vandq_u32(srcCols, vmovq_n_u32(0xff00));
-	destCols = vandq_u32(destCols, vmovq_n_u32(0xff00));
-	srcCols = vsubq_u32(srcCols, destCols);
-	srcCols = vmulq_u32(srcCols, alphas);
-	srcCols = vshrq_n_u32(srcCols, 8);
-	srcCols = vaddq_u32(srcCols, destCols); // Add the new green to the old green
-
-	// keep values in 8bit range and glue red/blue and green together
-	srcColsCopy = vandq_u32(srcColsCopy, vmovq_n_u32(0xff00ff));
-	srcCols = vandq_u32(srcCols, vmovq_n_u32(0xff00));
-	srcCols = vorrq_u32(srcCols, srcColsCopy);
-
-	// Remember that alpha is not alphas, but rather the alpha of destCols
-	if (preserveAlpha) {
-		srcCols = vandq_u32(srcCols, vmovq_n_u32(0x00ffffff));
-		srcCols = vorrq_u32(srcCols, alpha);
-	}
-	return srcCols;
-}
-
-// uses the alpha from srcCols and destCols
-inline uint32x4_t argbBlendSIMD(uint32x4_t srcCols, uint32x4_t destCols) {
-	float16x4_t sAlphas = vcvt_f16_f32(vcvtq_f32_u32(vshrq_n_u32(srcCols, 24)));
-	sAlphas = vmul_n_f16(sAlphas, 1.0 / 255.0);
-
-	// sAlphas1 has the alpha of the 1st pixel in its lower lanes and the alpha of the 2nd pixel in its upper lanes
-	// same with sAlphas2, but for the 3rd and 4th pixels
-	float16x8_t sAlphas1 = vcombine_f16(vmov_n_f16(vduph_lane_f16(sAlphas, 0)), vmov_n_f16(vduph_lane_f16(sAlphas, 1)));
-	float16x8_t sAlphas2 = vcombine_f16(vmov_n_f16(vduph_lane_f16(sAlphas, 2)), vmov_n_f16(vduph_lane_f16(sAlphas, 3)));
-
-	// Same thing going on here with dAlphas, except that it gets multiplied by (1 - sAlpha) first
-	float16x4_t dAlphas = vcvt_f16_f32(vcvtq_f32_u32(vshrq_n_u32(destCols, 24)));
-	dAlphas = vmul_n_f16(dAlphas, 1.0 / 255.0);
-	dAlphas = vmul_f16(dAlphas, vsub_f16(vmov_n_f16(1.0), sAlphas));
-	float16x8_t dAlphas1 = vcombine_f16(vmov_n_f16(vduph_lane_f16(dAlphas, 0)), vmov_n_f16(vduph_lane_f16(dAlphas, 1)));
-	float16x8_t dAlphas2 = vcombine_f16(vmov_n_f16(vduph_lane_f16(dAlphas, 2)), vmov_n_f16(vduph_lane_f16(dAlphas, 3)));
-
-	// first 2 pixels
-	float16x8_t srcRgb1 = vcvtq_f16_u16(vmovl_u8(vreinterpret_u8_u32(vget_low_u32(srcCols))));
-	float16x8_t destRgb1 = vcvtq_f16_u16(vmovl_u8(vreinterpret_u8_u32(vget_low_u32(destCols))));
-	// last 2 pixels
-	float16x8_t srcRgb2 = vcvtq_f16_u16(vmovl_u8(vreinterpret_u8_u32(vget_high_u32(srcCols))));
-	float16x8_t destRgb2 = vcvtq_f16_u16(vmovl_u8(vreinterpret_u8_u32(vget_high_u32(destCols))));
-
-	// ((src * sAlpha) + (dest * dAlpha)) / (sAlpha + dAlpha)
-	srcRgb1 = vmulq_f16(srcRgb1, sAlphas1);
-	destRgb1 = vmulq_f16(destRgb1, dAlphas1);
-	srcRgb1 = vaddq_f16(srcRgb1, destRgb1);
-	float16x8_t alphasRec = vrecpeq_f16(vaddq_f16(sAlphas1, dAlphas1)); // compute reciprocal
-	srcRgb1 = vmulq_f16(srcRgb1, alphasRec);
-	srcRgb2 = vmulq_f16(srcRgb2, sAlphas2);
-	destRgb2 = vmulq_f16(destRgb2, dAlphas2);
-	srcRgb2 = vaddq_f16(srcRgb2, destRgb2);
-	alphasRec = vrecpeq_f16(vaddq_f16(sAlphas2, dAlphas2));
-	srcRgb2 = vmulq_f16(srcRgb2, alphasRec);
-
-	// alpha channel is computed differently
-	uint16x4_t alphas = vcvta_u16_f16(vmul_n_f16(vadd_f16(sAlphas, dAlphas), 255.0));
-
-	// Final argb components as 16bit values
-	uint16x8_t uintSrcRgb1 = vcvtq_u16_f16(srcRgb1), uintSrcRgb2 = vcvtq_u16_f16(srcRgb2);
-
-	// copy alpha channel over
-	uintSrcRgb1 = vcopyq_lane_u16(uintSrcRgb1, 3, alphas, 0);
-	uintSrcRgb1 = vcopyq_lane_u16(uintSrcRgb1, 7, alphas, 1);
-	uintSrcRgb2 = vcopyq_lane_u16(uintSrcRgb2, 3, alphas, 2);
-	uintSrcRgb2 = vcopyq_lane_u16(uintSrcRgb2, 7, alphas, 3);
-
-	// cast 16bit to 8bit and reinterpret as uint32's
-	return vcombine_u32(vreinterpret_u32_u8(vmovn_u16(uintSrcRgb1)), vreinterpret_u32_u8(vmovn_u16(uintSrcRgb2)));
-}
-
-inline uint32x4_t blendTintSpriteSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas, bool light) {
-	// This function is NOT 1 to 1 with the original... It just approximates it
-	// It gets the value of the HSV of the dest color
-	// Then it gets the HSV of the srcCols
-
-	// how the values are transformed
-	// from 1 uint32x4_t srcCols with each lane being ARGB uint32
-	// srcCols[0] = A | R | G | B
-	// srcCols[1] = A | R | G | B
-	// srcCols[2] = A | R | G | B
-	// srcCols[3] = A | R | G | B
-	//  ->
-	// to 4 float32x4_t's each being a separate channel with each lane
-	// corresponding to their respective srcCols lane
-	// dda = { A[0], A[1], A[2], A[3] }
-	// ddr = { R[0], R[1], R[2], R[3] }
-	// ddg = { G[0], G[1], G[2], G[3] }
-	// ddb = { B[0], B[1], B[2], B[3] }
-
-	// do the transformation (we don't actually need alpha at all)
-	float32x4_t ddr, ddg, ddb;
-	ddr = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(vshrq_n_u32(destCols, 16), vmovq_n_u32(0xff))), 1.0 / 255.0);
-	ddg = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(vshrq_n_u32(destCols, 8), vmovq_n_u32(0xff))), 1.0 / 255.0);
-	ddb = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(destCols, vmovq_n_u32(0xff))), 1.0 / 255.0);
-	float32x4_t ssr, ssg, ssb;
-	ssr = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(vshrq_n_u32(srcCols, 16), vmovq_n_u32(0xff))), 1.0 / 255.0);
-	ssg = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(vshrq_n_u32(srcCols, 8), vmovq_n_u32(0xff))), 1.0 / 255.0);
-	ssb = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(srcCols, vmovq_n_u32(0xff))), 1.0 / 255.0);
-
-	// Get the maxes and mins (needed for HSV->RGB and vice versa)
-	float32x4_t dmaxes = vmaxq_f32(ddr, vmaxq_f32(ddg, ddb));
-	float32x4_t smaxes = vmaxq_f32(ssr, vmaxq_f32(ssg, ssb));
-	float32x4_t smins = vminq_f32(ssr, vminq_f32(ssg, ssb));
-
-	// This is here to stop us from dividing by 0
-	const float32x4_t epsilon0 = vmovq_n_f32(0.0000001);
-
-	float32x4_t chroma = vmaxq_f32(vsubq_f32(smaxes, smins), epsilon0);
-
-	// RGB to HSV is a piecewise function, so we compute each part of the function first...
-	float32x4_t hr, hg, hb, hue;
-	hr = vdivq_f32(vsubq_f32(ssg, ssb), chroma);
-	hr = vsubq_f32(hr, vmulq_n_f32(vrndmq_f32(vmulq_n_f32(hr, 1.0 / 6.0)), 6.0));
-	hg = vaddq_f32(vdivq_f32(vsubq_f32(ssb, ssr), chroma), vmovq_n_f32(2.0));
-	hb = vaddq_f32(vdivq_f32(vsubq_f32(ssr, ssg), chroma), vmovq_n_f32(4.0));
-
-	// And then compute which one will be used based on criteria
-	float32x4_t hrfactors = vcvtq_f32_u32(vandq_u32(vandq_u32(vceqq_f32(ssr, smaxes), vmvnq_u32(vceqq_u32(ssr, ssb))), vmovq_n_u32(1)));
-	float32x4_t hgfactors = vcvtq_f32_u32(vandq_u32(vandq_u32(vceqq_f32(ssg, smaxes), vmvnq_u32(vceqq_u32(ssg, ssr))), vmovq_n_u32(1)));
-	float32x4_t hbfactors = vcvtq_f32_u32(vandq_u32(vandq_u32(vceqq_f32(ssb, smaxes), vmvnq_u32(vceqq_u32(ssb, ssg))), vmovq_n_u32(1)));
-	hue = vmulq_f32(hr, hrfactors);
-	hue = vaddq_f32(hue, vmulq_f32(hg, hgfactors));
-	hue = vaddq_f32(hue, vmulq_f32(hb, hbfactors));
-
-	// Mess with the light like the original function
-	float32x4_t val = dmaxes;
-	if (light) {
-		val = vsubq_f32(val, vsubq_f32(vmovq_n_f32(1.0), vmulq_n_f32(vcvtq_f32_u32(alphas), 1.0 / 250.0)));
-		val = vmaxq_f32(val, vmovq_n_f32(0.0));
-	}
-		
-	// Then it stitches the HSV back together:
-	// the hue and saturation come from the source (tint) color, and the value comes from
-	// the destination (real source) color
-	chroma = vmulq_f32(val, vdivq_f32(vsubq_f32(smaxes, smins), vaddq_f32(smaxes, epsilon0)));
-	float32x4_t hprime_mod2 = vmulq_n_f32(hue, 1.0 / 2.0);
-	hprime_mod2 = vmulq_n_f32(vsubq_f32(hprime_mod2, vrndmq_f32(hprime_mod2)), 2.0);
-	float32x4_t x = vmulq_f32(chroma, vsubq_f32(vmovq_n_f32(1.0), vabsq_f32(vsubq_f32(hprime_mod2, vmovq_n_f32(1.0)))));
-	uint32x4_t hprime_rounded = vcvtq_u32_f32(hue);
-	uint32x4_t x_int = vcvtq_u32_f32(vmulq_n_f32(x, 255.0));
-	uint32x4_t c_int = vcvtq_u32_f32(vmulq_n_f32(chroma, 255.0));
-
-	// Again HSV->RGB is also a piecewise function
-	uint32x4_t val0 = vorrq_u32(vshlq_n_u32(x_int, 8), vshlq_n_u32(c_int, 16));
-	val0 = vandq_u32(val0, vorrq_u32(vceqq_u32(hprime_rounded, vmovq_n_u32(0)), vceqq_u32(hprime_rounded, vmovq_n_u32(6))));
-	uint32x4_t val1 = vorrq_u32(vshlq_n_u32(c_int, 8), vshlq_n_u32(x_int, 16));
-	val1 = vandq_u32(val1, vceqq_u32(hprime_rounded, vmovq_n_u32(1)));
-	uint32x4_t val2 = vorrq_u32(vshlq_n_u32(c_int, 8), x_int);
-	val2 = vandq_u32(val2, vceqq_u32(hprime_rounded, vmovq_n_u32(2)));
-	uint32x4_t val3 = vorrq_u32(vshlq_n_u32(x_int, 8), c_int);
-	val3 = vandq_u32(val3, vceqq_u32(hprime_rounded, vmovq_n_u32(3)));
-	uint32x4_t val4 = vorrq_u32(vshlq_n_u32(x_int, 16), c_int);
-	val4 = vandq_u32(val4, vceqq_u32(hprime_rounded, vmovq_n_u32(4)));
-	uint32x4_t val5 = vorrq_u32(vshlq_n_u32(c_int, 16), x_int);
-	val5 = vandq_u32(val5, vceqq_u32(hprime_rounded, vmovq_n_u32(5)));
-
-	// or the values together
-	uint32x4_t final = vorrq_u32(val0, vorrq_u32(val1, vorrq_u32(val2, vorrq_u32(val3, vorrq_u32(val4, val5)))));
-
-	// add the minimums back in
-	uint32x4_t val_add = vcvtq_u32_f32(vmulq_n_f32(vsubq_f32(val, chroma), 255.0));
-	val_add = vorrq_u32(val_add, vorrq_u32(vshlq_n_u32(val_add, 8), vorrq_u32(vshlq_n_u32(val_add, 16), vandq_u32(destCols, vmovq_n_u32(0xff000000)))));
-	final = vaddq_u32(final, val_add);
-	return final;
-}
-
-inline uint32x4_t blendPixelSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas) {
-	uint32x4_t srcAlphas, difAlphas, mask, ch1, ch2;
-	auto setupArgbAlphas = [&]() {
-		// This acts the same as this in the normal blender functions
-		// if (alpha == 0)
-		//     alpha = aSrc;
-		// else
-		//     alpha = aSrc * ((alpha & 0xff) + 1) / 256;
-		// where alpha is the alpha byte of the srcCols
-		srcAlphas = vshrq_n_u32(srcCols, 24);
-		difAlphas = vaddq_u32(vandq_u32(alphas, vmovq_n_u32(0xff)), vmovq_n_u32(1));
-		difAlphas = vshrq_n_u32(vmulq_u32(srcAlphas, difAlphas), 8);
-		difAlphas = vshlq_n_u32(difAlphas, 24);
-		srcAlphas = vshlq_n_u32(srcAlphas, 24);
-		mask = vceqq_u32(alphas, vmovq_n_u32(0));
-		srcAlphas = vandq_u32(srcAlphas, mask);
-		difAlphas = vandq_u32(difAlphas, vmvnq_u32(mask));
-		srcCols = vandq_u32(srcCols, vmovq_n_u32(0x00ffffff));
-		srcCols = vorrq_u32(srcCols, vorrq_u32(srcAlphas, difAlphas));
-	};
-	switch (_G(_blender_mode)) {
-	case kSourceAlphaBlender: // see BITMAP member function blendSourceAlpha
-		alphas = vshrq_n_u32(srcCols, 24);
-		return rgbBlendSIMD(srcCols, destCols, alphas, false);
-	case kArgbToArgbBlender: // see BITMAP member function blendArgbToArgb
-		setupArgbAlphas();
-		// only blend if alpha isn't 0, otherwise use destCols
-		mask = vcgtq_u32(vshrq_n_u32(srcCols, 24), vmovq_n_u32(0));
-		ch1 = vandq_u32(argbBlendSIMD(srcCols, destCols), mask);
-		ch2 = vandq_u32(destCols, vmvnq_u32(mask));
-		return vorrq_u32(ch1, ch2);
-	case kArgbToRgbBlender: // see BITMAP member function blendArgbToRgb
-		setupArgbAlphas();
-		return rgbBlendSIMD(srcCols, destCols, vshrq_n_u32(srcCols, 24), false);
-	case kRgbToArgbBlender: // see BITMAP member function blendRgbToArgb
-		// if alpha is NOT 0 or 255
-		ch2 = vandq_u32(srcCols, vmovq_n_u32(0x00ffffff));
-		ch2 = vorrq_u32(ch2, vshlq_n_u32(alphas, 24));
-		ch2 = argbBlendSIMD(ch2, destCols);
-		// if alpha is 0 or 255
-		ch1 = vorrq_u32(srcCols, vmovq_n_u32(0xff000000));
-		// mask and or them together
-		mask = vorrq_u32(vceqq_u32(alphas, vmovq_n_u32(0)), vceqq_u32(alphas, vmovq_n_u32(0xff)));
-		ch1 = vandq_u32(ch1, mask);
-		ch2 = vandq_u32(ch2, vmvnq_u32(mask));
-		return vorrq_u32(ch1, ch2);
-	case kRgbToRgbBlender: // see BITMAP member function blendRgbToRgb
-		return rgbBlendSIMD(srcCols, destCols, alphas, false);
-	case kAlphaPreservedBlenderMode: // see BITMAP member function blendPreserveAlpha
-		return rgbBlendSIMD(srcCols, destCols, alphas, true);
-	case kOpaqueBlenderMode: // see BITMAP member function blendOpaque
-		return vorrq_u32(srcCols, vmovq_n_u32(0xff000000));
-	case kAdditiveBlenderMode: // see BITMAP member function blendAdditiveAlpha
-		srcAlphas = vaddq_u32(vshrq_n_u32(srcCols, 24), vshrq_n_u32(destCols, 24));
-		srcAlphas = vminq_u32(srcAlphas, vmovq_n_u32(0xff));
-		srcCols = vandq_u32(srcCols, vmovq_n_u32(0x00ffffff));
-		return vorrq_u32(srcCols, vshlq_n_u32(srcAlphas, 24));
-	case kTintBlenderMode: // see BITMAP member function blendTintSprite
-		return blendTintSpriteSIMD(srcCols, destCols, alphas, false);
-	case kTintLightBlenderMode: // see BITMAP member function blendTintSprite
-		return blendTintSpriteSIMD(srcCols, destCols, alphas, true);
-	}
-}
-
-inline uint16x8_t blendPixelSIMD2Bpp(uint16x8_t srcCols, uint16x8_t destCols, uint16x8_t alphas) {
-	uint16x8_t mask, ch1, ch2;
-	switch (_G(_blender_mode)) {
-	case kSourceAlphaBlender:
-	case kOpaqueBlenderMode:
-	case kAdditiveBlenderMode:
-		return srcCols;
-	case kArgbToArgbBlender:
-	case kArgbToRgbBlender:
-		ch1 = vandq_u16(vmovq_n_u16(0xff), vceqq_u16(alphas, vmovq_n_u16(0)));
-		ch2 = vandq_u16(alphas, vcgtq_u16(alphas, vmovq_n_u16(0)));
-		alphas = vorrq_u16(ch1, ch2);
-	case kRgbToRgbBlender:
-	case kAlphaPreservedBlenderMode:
-		return rgbBlendSIMD2Bpp(srcCols, destCols, alphas);
-	case kRgbToArgbBlender:
-		mask = vorrq_u16(vceqq_u16(alphas, vmovq_n_u16(0)), vceqq_u16(alphas, vmovq_n_u16(255)));
-		ch1 = vandq_u16(srcCols, mask);
-		ch2 = vandq_u16(rgbBlendSIMD2Bpp(srcCols, destCols, alphas), vmvnq_u16(mask));
-		return vorrq_u16(ch1, ch2);
-	case kTintBlenderMode:
-	case kTintLightBlenderMode:
-		uint32x4_t srcColsLo = simd2BppTo4Bpp(vget_low_u16(srcCols));
-		uint32x4_t srcColsHi = simd2BppTo4Bpp(vget_high_u16(srcCols));
-		uint32x4_t destColsLo = simd2BppTo4Bpp(vget_low_u16(destCols));
-		uint32x4_t destColsHi = simd2BppTo4Bpp(vget_high_u16(destCols));
-		uint32x4_t alphasLo = vmovl_u16(vget_low_u16(alphas));
-		uint32x4_t alphasHi = vmovl_u16(vget_high_u16(alphas));
-		uint16x4_t lo = simd4BppTo2Bpp(blendTintSpriteSIMD(srcColsLo, destColsLo, alphasLo, _G(_blender_mode) == kTintLightBlenderMode));
-		uint16x4_t hi = simd4BppTo2Bpp(blendTintSpriteSIMD(srcColsHi, destColsHi, alphasHi, _G(_blender_mode) == kTintLightBlenderMode));
-		return vcombine_u16(lo, hi);
-	}
-}
-
-template<int DestBytesPerPixel, int SrcBytesPerPixel>
-inline void drawPixelSIMD(byte *destPtr, const byte *srcP2, uint32x4_t tint, uint32x4_t alphas, uint32x4_t maskedAlphas, uint32x4_t transColors, int xDir, int xCtrBpp, int srcAlpha, int skipTrans, bool horizFlip, bool useTint, uint32x4_t skipMask) {
-	uint32x4_t srcCols, destCol;
-
-	if (DestBytesPerPixel == 4)
-		destCol = vld1q_u32((uint32 *)destPtr);
-	else
-		destCol = simd2BppTo4Bpp(vld1_u16((uint16 *)destPtr));
-	if (SrcBytesPerPixel == 4)
-		srcCols = vld1q_u32((const uint32 *)(srcP2 + xDir * xCtrBpp));
-	else
-		srcCols = simd2BppTo4Bpp(vld1_u16((const uint16 *)(srcP2 + xDir * xCtrBpp)));
-	// we do this here because we need to check if we should skip the pixel before we blend it
-	uint32x4_t mask1 = skipTrans ? vceqq_u32(vandq_u32(srcCols, maskedAlphas), transColors) : vmovq_n_u32(0);
-	mask1 = vorrq_u32(mask1, skipMask);
-	if (srcAlpha != -1) {
-		// take useTint into account
-		if (useTint) {
-			srcCols = blendPixelSIMD(tint, srcCols, alphas);
-		} else {
-			srcCols = blendPixelSIMD(srcCols, destCol, alphas);
-		}
-	}
-	uint32x4_t destCols2 = vandq_u32(destCol, mask1);
-	uint32x4_t srcCols2 = vandq_u32(srcCols, vmvnq_u32(mask1));
-	uint32x4_t final = vorrq_u32(destCols2, srcCols2);
-	if (horizFlip) {
-		final = vrev64q_u32(final);
-		final = vcombine_u32(vget_high_u32(final), vget_low_u32(final));
-	}
-	if (DestBytesPerPixel == 4) {
-		vst1q_u32((uint32 *)destPtr, final);
-	} else {
-		vst1_u16((uint16 *)destPtr, simd4BppTo2Bpp(final));
-	}
-}
-
-inline void drawPixelSIMD2Bpp(byte *destPtr, const byte *srcP2, uint16x8_t tint, uint16x8_t alphas, uint16x8_t transColors, int xDir, int xCtrBpp, int srcAlpha, int skipTrans, bool horizFlip, bool useTint, uint16x8_t skipMask) {
-	uint16x8_t destCol = vld1q_u16((uint16 *)destPtr);
-	uint16x8_t srcCols = vld1q_u16((const uint16 *)(srcP2 + xDir * xCtrBpp));
-	uint16x8_t mask1 = skipTrans ? vceqq_u16(srcCols, transColors) : vmovq_n_u16(0);
-	mask1 = vorrq_u16(mask1, skipMask);
-	if (srcAlpha != -1) {
-		// take useTint into account
-		if (useTint) {
-			srcCols = blendPixelSIMD2Bpp(tint, srcCols, alphas);
-		} else {
-			srcCols = blendPixelSIMD2Bpp(srcCols, destCol, alphas);
-		}
-	}
-	uint16x8_t destCols2 = vandq_u16(destCol, mask1);
-	uint16x8_t srcCols2 = vandq_u16(srcCols, vmvnq_u16(mask1));
-	uint16x8_t final = vorrq_u16(destCols2, srcCols2);
-	if (horizFlip) {
-		final = vrev64q_u16(final);
-		final = vcombine_u16(vget_high_u16(final), vget_low_u16(final));
-	}
-	vst1q_u16((uint16 *)destPtr, final);
-}*/
-
-} // namespace AGS3
-
-#endif /* __powerpc__ */
-#endif /* AGS_LIB_ALLEGRO_SURFACE_SIMD_PPC_H */
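
As a reading aid, this is a minimal scalar sketch of the per-channel arithmetic that the rgbBlendSIMD/rgbBlendSIMD2Bpp routines above vectorize: each channel moves from the destination towards the source by alpha/256 of the difference, with the destination alpha byte preserved. It assumes 32-bit ARGB pixels; rgbBlendScalar is an illustrative name, not an engine function.

#include <cstdint>
#include <cstdio>

// alpha is expected in [0, 255]
static uint32_t rgbBlendScalar(uint32_t srcCol, uint32_t destCol, uint32_t alpha) {
	if (alpha)
		++alpha;                                   // the "+1" nudge noted in the vector code
	uint32_t out = destCol & 0xff000000u;          // keep the destination alpha byte
	for (int shift = 0; shift <= 16; shift += 8) { // blue, green, red
		int s = (int)((srcCol >> shift) & 0xff);
		int d = (int)((destCol >> shift) & 0xff);
		int c = d + ((s - d) * (int)alpha) / 256;  // move dest towards src by alpha/256
		out |= (uint32_t)c << shift;
	}
	return out;
}

int main() {
	// 50% blend of opaque red over opaque blue
	std::printf("0x%08x\n", rgbBlendScalar(0xffff0000u, 0xff0000ffu, 128));
	return 0;
}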


Commit: cf358fbc4e76a6b6984a85a45f2c0b3ad7eae57d
    https://github.com/scummvm/scummvm/commit/cf358fbc4e76a6b6984a85a45f2c0b3ad7eae57d
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
BUILD: AGS removed PPC blending files

Changed paths:
    engines/ags/module.mk


diff --git a/engines/ags/module.mk b/engines/ags/module.mk
index 940db8b117e..35fddd6ef2d 100644
--- a/engines/ags/module.mk
+++ b/engines/ags/module.mk
@@ -26,7 +26,6 @@ MODULE_OBJS = \
 	lib/allegro/surface.o \
 	lib/allegro/surface_simd_neon.o \
 	lib/allegro/surface_simd_sse.o \
-	lib/allegro/surface_simd_ppc.o \
 	lib/allegro/surface_simd_none.o \
 	lib/allegro/system.o \
 	lib/allegro/unicode.o \


Commit: 153afb1081829dabfb20c8131931f9a17f7f0806
    https://github.com/scummvm/scummvm/commit/153afb1081829dabfb20c8131931f9a17f7f0806
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: Cleaned up blending funcs argument passing

Changed paths:
    engines/ags/lib/allegro/surface.cpp
    engines/ags/lib/allegro/surface.h
    engines/ags/lib/allegro/surface_simd_neon.cpp
    engines/ags/lib/allegro/surface_simd_sse.cpp


diff --git a/engines/ags/lib/allegro/surface.cpp b/engines/ags/lib/allegro/surface.cpp
index f8094a0b1f3..251c33fc218 100644
--- a/engines/ags/lib/allegro/surface.cpp
+++ b/engines/ags/lib/allegro/surface.cpp
@@ -107,51 +107,51 @@ void BITMAP::floodfill(int x, int y, int color) {
 const int SCALE_THRESHOLD = 0x100;
 #define VGA_COLOR_TRANS(x) ((x) * 255 / 63)
 template<int DestBytesPerPixel, int SrcBytesPerPixel, int ScaleThreshold>
-void BITMAP::drawInnerGeneric(int yStart, int xStart, uint32_t transColor, uint32_t alphaMask, PALETTE palette, int useTint, int sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, int horizFlip, int vertFlip, int skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
-	const int xDir = horizFlip ? -1 : 1;
+void BITMAP::drawInnerGeneric(DrawInnerArgs &args) {
+	const int xDir = args.horizFlip ? -1 : 1;
 	byte rSrc, gSrc, bSrc, aSrc;
 	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
 
 	// Instead of skipping pixels outside our boundary here, we just clip
 	// our area instead.
-	int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = dstRect.width();
-	if (xStart + xCtrWidth > destArea.w) { // Clip the right
-		xCtrWidth = destArea.w - xStart;
+	int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = args.dstRect.width();
+	if (args.xStart + xCtrWidth > args.destArea.w) { // Clip the right
+		xCtrWidth = args.destArea.w - args.xStart;
 	}
-	if (xStart < 0) { // Clip the left
-		xCtrStart = -xStart;
+	if (args.xStart < 0) { // Clip the left
+		xCtrStart = -args.xStart;
 		xCtrBppStart = xCtrStart * SrcBytesPerPixel;
-		xStart = 0;
+		args.xStart = 0;
 	}
-	int destY = yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = dstRect.height();
-	if (yStart < 0) { // Clip the top
-		yCtr = -yStart;
+	int destY = args.yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = args.dstRect.height();
+	if (args.yStart < 0) { // Clip the top
+		yCtr = -args.yStart;
 		destY = 0;
 		if (ScaleThreshold != 0) {
-			scaleYCtr = yCtr * scaleY;
+			scaleYCtr = yCtr * args.scaleY;
 			srcYCtr = scaleYCtr / ScaleThreshold;
 		}
 	}
-	if (yStart + yCtrHeight > destArea.h) { // Clip the bottom
-		yCtrHeight = destArea.h - yStart;
+	if (args.yStart + yCtrHeight > args.destArea.h) { // Clip the bottom
+		yCtrHeight = args.destArea.h - args.yStart;
 	}
 
-	byte *destP = (byte *)destArea.getBasePtr(0, destY);
-	const byte *srcP = (const byte *)src.getBasePtr(
-	                       horizFlip ? srcArea.right - 1 : srcArea.left,
-	                       vertFlip ? srcArea.bottom - 1 - yCtr :
-	                       srcArea.top + yCtr);
-	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += scaleY) {
+	byte *destP = (byte *)args.destArea.getBasePtr(0, destY);
+	const byte *srcP = (const byte *)args.src.getBasePtr(
+	                       args.horizFlip ? args.srcArea.right - 1 : args.srcArea.left,
+	                       args.vertFlip ? args.srcArea.bottom - 1 - yCtr :
+	                       args.srcArea.top + yCtr);
+	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
 		if (ScaleThreshold != 0) {
 			int newSrcYCtr = scaleYCtr / ScaleThreshold;
 			if (srcYCtr != newSrcYCtr) {
 				int diffSrcYCtr = newSrcYCtr - srcYCtr;
-				srcP += src.pitch * diffSrcYCtr;
+				srcP += args.src.pitch * diffSrcYCtr;
 				srcYCtr = newSrcYCtr;
 			}
 		}
 		// Loop through the pixels of the row
-		for (int destX = xStart, xCtr = xCtrStart, xCtrBpp = xCtrBppStart, scaleXCtr = xCtr * scaleX; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += SrcBytesPerPixel, scaleXCtr += scaleX) {
+		for (int destX = args.xStart, xCtr = xCtrStart, xCtrBpp = xCtrBppStart, scaleXCtr = xCtr * args.scaleX; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += SrcBytesPerPixel, scaleXCtr += args.scaleX) {
 			const byte *srcVal = srcP + xDir * xCtrBpp;
 			if (ScaleThreshold != 0) {
 				srcVal = srcP + (scaleXCtr / ScaleThreshold) * SrcBytesPerPixel;
@@ -159,7 +159,7 @@ void BITMAP::drawInnerGeneric(int yStart, int xStart, uint32_t transColor, uint3
 			uint32 srcCol = getColor(srcVal, SrcBytesPerPixel);
 
 			// Check if this is a transparent color we should skip
-			if (skipTrans && ((srcCol & alphaMask) == transColor))
+			if (args.skipTrans && ((srcCol & args.alphaMask) == args.transColor))
 				continue;
 
 			byte *destVal = (byte *)&destP[destX * DestBytesPerPixel];
@@ -168,7 +168,7 @@ void BITMAP::drawInnerGeneric(int yStart, int xStart, uint32_t transColor, uint3
 			if (DestBytesPerPixel == 1) {
 				*destVal = srcCol;
 				continue;
-			} else if ((DestBytesPerPixel == SrcBytesPerPixel) && srcAlpha == -1) {
+			} else if ((DestBytesPerPixel == SrcBytesPerPixel) && args.srcAlpha == -1) {
 				if (DestBytesPerPixel)
 					*(uint32 *)destVal = srcCol;
 				else
@@ -178,7 +178,7 @@ void BITMAP::drawInnerGeneric(int yStart, int xStart, uint32_t transColor, uint3
 
 			// We need the rgb values to do blending and/or convert between formats
 			if (SrcBytesPerPixel == 1) {
-				const RGB &rgb = palette[srcCol];
+				const RGB &rgb = args.palette[srcCol];
 				aSrc = 0xff;
 				rSrc = rgb.r;
 				gSrc = rgb.g;
@@ -201,24 +201,24 @@ void BITMAP::drawInnerGeneric(int yStart, int xStart, uint32_t transColor, uint3
 				//src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
 			}
 
-			if (srcAlpha == -1) {
+			if (args.srcAlpha == -1) {
 				// This means we don't use blending.
 				aDest = aSrc;
 				rDest = rSrc;
 				gDest = gSrc;
 				bDest = bSrc;
 			} else {
-				if (useTint) {
+				if (args.useTint) {
 					rDest = rSrc;
 					gDest = gSrc;
 					bDest = bSrc;
 					aDest = aSrc;
-					rSrc = tintRed;
-					gSrc = tintGreen;
-					bSrc = tintBlue;
-					aSrc = srcAlpha;
+					rSrc = args.tintRed;
+					gSrc = args.tintGreen;
+					bSrc = args.tintBlue;
+					aSrc = args.srcAlpha;
 				}
-				blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, srcAlpha, useTint, destVal);
+				blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, args.srcAlpha, args.useTint, destVal);
 			}
 
 			uint32 pixel;// = format.ARGBToColor(aDest, rDest, gDest, bDest);
@@ -232,11 +232,25 @@ void BITMAP::drawInnerGeneric(int yStart, int xStart, uint32_t transColor, uint3
 			}
 		}
 
-		destP += destArea.pitch;
-		if (ScaleThreshold == 0) srcP += vertFlip ? -src.pitch : src.pitch;
+		destP += args.destArea.pitch;
+		if (ScaleThreshold == 0) srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
 	}
 }
 
+BITMAP::DrawInnerArgs::DrawInnerArgs(int yStart, int xStart, uint32 transColor,
+	uint32 alphaMask, PALETTE palette, int useTint, int sameFormat,
+	const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea,
+	int horizFlip, int vertFlip, int skipTrans, int srcAlpha, int tintRed,
+	int tintGreen, int tintBlue, const Common::Rect &dstRect,
+	const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX,
+	int scaleY) : yStart(yStart), xStart(xStart), transColor(transColor),
+	alphaMask(alphaMask), palette(palette), useTint(useTint), sameFormat(sameFormat), src(src),
+	destArea(destArea), horizFlip(horizFlip), vertFlip(vertFlip),
+	skipTrans(skipTrans), srcAlpha(srcAlpha), tintRed(tintRed),
+	tintGreen(tintGreen), tintBlue(tintBlue), dstRect(dstRect),
+	srcArea(srcArea), blenderMode(blenderMode), scaleX(scaleX), scaleY(scaleY) {
+}
+
 void BITMAP::draw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
                   int dstX, int dstY, bool horizFlip, bool vertFlip,
                   bool skipTrans, int srcAlpha, int tintRed, int tintGreen,
@@ -292,7 +306,8 @@ void BITMAP::draw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
 	int xStart = (dstRect.left < destRect.left) ? dstRect.left - destRect.left : 0;
 	int yStart = (dstRect.top < destRect.top) ? dstRect.top - destRect.top : 0;
 
-#define DRAWINNER(func) func(yStart, xStart, transColor, alphaMask, palette, useTint, sameFormat, src, destArea, horizFlip, vertFlip, skipTrans, srcAlpha, tintRed, tintGreen, tintBlue, dstRect, srcArea, _G(_blender_mode), 0, 0)
+	auto args = DrawInnerArgs(yStart, xStart, transColor, alphaMask, palette, useTint, sameFormat, src, destArea, horizFlip, vertFlip, skipTrans, srcAlpha, tintRed, tintGreen, tintBlue, dstRect, srcArea, _G(_blender_mode), 0, 0);
+#define DRAWINNER(func) func(args)
 	// Calling drawInnerXXXX with a ScaleThreshold of 0 just does normal un-scaled drawing
 	if (_G(simd_flags) == AGS3::Globals::SIMD_NONE) {
 		if (sameFormat) {
@@ -375,7 +390,8 @@ void BITMAP::stretchDraw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
 	int xStart = (dstRect.left < destRect.left) ? dstRect.left - destRect.left : 0;
 	int yStart = (dstRect.top < destRect.top) ? dstRect.top - destRect.top : 0;
 
-#define DRAWINNER(func) func(yStart, xStart, transColor, alphaMask, palette, 0, sameFormat, src, destArea, false, false, skipTrans, srcAlpha, 0, 0, 0, dstRect, srcRect, _G(_blender_mode), scaleX, scaleY)
+	auto args = DrawInnerArgs(yStart, xStart, transColor, alphaMask, palette, 0, sameFormat, src, destArea, false, false, skipTrans, srcAlpha, 0, 0, 0, dstRect, srcRect, _G(_blender_mode), scaleX, scaleY);
+#define DRAWINNER(func) func(args)
 	if (_G(simd_flags) == AGS3::Globals::SIMD_NONE) {
 		if (sameFormat) {
 			switch (format.bytesPerPixel) {
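
With both draw() and stretchDraw() now building a DrawInnerArgs up front, every drawInner* variant shares a single one-argument signature and DRAWINNER only has to forward the bundle. A minimal, self-contained sketch of that pattern follows; DrawArgs, drawInner and draw are simplified illustrative names, not the engine's actual types or fields:

#include <cstdio>

// Simplified stand-in for BITMAP::DrawInnerArgs: everything the inner loops
// need is captured once, so all drawInner variants can share one signature.
struct DrawArgs {
	int xStart, yStart;
	int srcAlpha;       // -1 means "no blending", as in the engine code
	bool skipTrans;
	DrawArgs(int x, int y, int alpha, bool skip)
		: xStart(x), yStart(y), srcAlpha(alpha), skipTrans(skip) {}
};

template<int BytesPerPixel>
void drawInner(DrawArgs &args) {
	// A real blitter would loop over rows and pixels here; this only
	// demonstrates the shared DrawArgs-based signature.
	std::printf("%dBpp blit at (%d,%d), alpha=%d, skipTrans=%d\n",
	            BytesPerPixel, args.xStart, args.yStart, args.srcAlpha,
	            (int)args.skipTrans);
}

// Mirrors the DRAWINNER(func) macro: build args once, then pick a template
// instantiation from runtime properties such as bytes-per-pixel.
#define DRAWINNER(func) func(args)

void draw(int bytesPerPixel, int x, int y, int alpha, bool skipTrans) {
	DrawArgs args(x, y, alpha, skipTrans);
	if (bytesPerPixel == 4)
		DRAWINNER(drawInner<4>);
	else
		DRAWINNER(drawInner<2>);
}

int main() {
	draw(4, 10, 20, 255, true);
	draw(2, 0, 0, -1, false);
	return 0;
}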
diff --git a/engines/ags/lib/allegro/surface.h b/engines/ags/lib/allegro/surface.h
index 8b7d409b65a..81fd4e4fa47 100644
--- a/engines/ags/lib/allegro/surface.h
+++ b/engines/ags/lib/allegro/surface.h
@@ -265,14 +265,29 @@ public:
 	// kTintBlenderMode and kTintLightBlenderMode
 	void blendTintSprite(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &aDest, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha, bool light) const;
 
+	struct DrawInnerArgs {
+		bool useTint, sameFormat, horizFlip, vertFlip, skipTrans, doScale;
+		int xStart, yStart, srcAlpha, tintRed, tintGreen, tintBlue, scaleX, scaleY;
+		uint32 transColor, alphaMask;
+		color *palette;
+
+		BlenderMode blenderMode;
+		Common::Rect dstRect, srcArea;
+
+		const ::Graphics::ManagedSurface &src;
+		::Graphics::Surface destArea;
+
+		DrawInnerArgs(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, int useTint, int sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, int horizFlip, int vertFlip, int skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY);
+	};
+
 	template<int DestBytesPerPixel, int SrcBytesPerPixel, int ScaleThreshold>
-	void drawInner4BppWithConv(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, int useTint, int sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, int horizFlip, int vertFlip, int skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY);
+	void drawInner4BppWithConv(DrawInnerArgs &args);
 	template<int ScaleThreshold>
-	void drawInner2Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, int useTint, int sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, int horizFlip, int vertFlip, int skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY);
+	void drawInner2Bpp(DrawInnerArgs &args);
 	template<int ScaleThreshold>
-	void drawInner1Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, int useTint, int sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, int horizFlip, int vertFlip, int skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY);
+	void drawInner1Bpp(DrawInnerArgs &args);
 	template<int DestBytesPerPixel, int SrcBytesPerPixel, int ScaleThreshold>
-	void drawInnerGeneric(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, int useTint, int sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, int horizFlip, int vertFlip, int skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY);
+	void drawInnerGeneric(DrawInnerArgs &args);
 	
 	inline uint32 getColor(const byte *data, byte bpp) const {
 		switch (bpp) {
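
The header now declares the templated drawInner* members with a single DrawInnerArgs parameter, while their bodies stay in the SIMD .cpp files. That split is why each of those files has to end with a block of explicit instantiations. A small sketch of the mechanism under assumed, simplified names (Blitter and Args are illustrative, not engine API):

#include <cstdio>

// --- what would live in the header ---
struct Args { int width; };
struct Blitter {
	template<int Bpp, int ScaleThreshold>
	void drawInner(Args &args);   // declared here, defined out of line below
};

// --- what would live in the .cpp that holds the template bodies ---
template<int Bpp, int ScaleThreshold>
void Blitter::drawInner(Args &args) {
	std::printf("draw %d px, %dBpp, threshold 0x%x\n",
	            args.width, Bpp, (unsigned)ScaleThreshold);
}

// Explicit instantiations: because the body is not visible to other
// translation units, every <Bpp, ScaleThreshold> combination used elsewhere
// must be listed here, otherwise the link fails with unresolved symbols.
// This is the same reason the "template void BITMAP::drawInner..." lines
// appear at the end of the SIMD source files.
template void Blitter::drawInner<4, 0>(Args &);
template void Blitter::drawInner<4, 0x100>(Args &);
template void Blitter::drawInner<2, 0>(Args &);
template void Blitter::drawInner<2, 0x100>(Args &);

int main() {
	Args a{16};
	Blitter b;
	b.drawInner<4, 0x100>(a);
	return 0;
}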
diff --git a/engines/ags/lib/allegro/surface_simd_neon.cpp b/engines/ags/lib/allegro/surface_simd_neon.cpp
index 79010abf5b3..81bf428baf1 100644
--- a/engines/ags/lib/allegro/surface_simd_neon.cpp
+++ b/engines/ags/lib/allegro/surface_simd_neon.cpp
@@ -14,68 +14,68 @@ namespace AGS3 {
 
 // This template handles 2bpp and 4bpp, the other specializations handle 1bpp and format conversion blits
 template<int DestBytesPerPixel, int SrcBytesPerPixel, int ScaleThreshold>
-void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, int useTint, int sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, int horizFlip, int vertFlip, int skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
-	const int xDir = horizFlip ? -1 : 1;
+void BITMAP::drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
+	const int xDir = args.horizFlip ? -1 : 1;
 	byte rSrc, gSrc, bSrc, aSrc;
 	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
-	uint32x4_t tint = vshlq_n_u32(vdupq_n_u32(srcAlpha), 24);
-	tint = vorrq_u32(tint, vshlq_n_u32(vdupq_n_u32(tintRed), 16));
-	tint = vorrq_u32(tint, vshlq_n_u32(vdupq_n_u32(tintGreen), 8));
-	tint = vorrq_u32(tint, vdupq_n_u32(tintBlue));
-	uint32x4_t maskedAlphas = vld1q_dup_u32(&alphaMask);
-	uint32x4_t transColors = vld1q_dup_u32(&transColor);
-	uint32x4_t alphas = vld1q_dup_u32(&srcAlpha);
+	uint32x4_t tint = vshlq_n_u32(vdupq_n_u32(args.srcAlpha), 24);
+	tint = vorrq_u32(tint, vshlq_n_u32(vdupq_n_u32(args.tintRed), 16));
+	tint = vorrq_u32(tint, vshlq_n_u32(vdupq_n_u32(args.tintGreen), 8));
+	tint = vorrq_u32(tint, vdupq_n_u32(args.tintBlue));
+	uint32x4_t maskedAlphas = vld1q_dup_u32(&args.alphaMask);
+	uint32x4_t transColors = vld1q_dup_u32(&args.transColor);
+	uint32x4_t alphas = vld1q_dup_u32(&args.srcAlpha);
 
 	// This is so that we can calculate what pixels to crop off in a vectorized way
 	uint32x4_t addIndexes = {0, 1, 2, 3};
-	if (horizFlip) addIndexes = {3, 2, 1, 0};
+	if (args.horizFlip) addIndexes = {3, 2, 1, 0};
 
 	// This is so that we can calculate in parallel the pixel indexes for scaled drawing
-	uint32x4_t scaleAdds = {0, (uint32)scaleX, (uint32)scaleX*2, (uint32)scaleX*3};
+	uint32x4_t scaleAdds = {0, (uint32)args.scaleX, (uint32)args.scaleX*2, (uint32)args.scaleX*3};
 
 	// Clip the bounds ahead of time (so we don't waste time checking if we are in bounds when
 	// we are in the inner loop)
-	int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = dstRect.width();
-	if (xStart + xCtrWidth > destArea.w) {
-		xCtrWidth = destArea.w - xStart;
+	int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = args.dstRect.width();
+	if (args.xStart + xCtrWidth > args.destArea.w) {
+		xCtrWidth = args.destArea.w - args.xStart;
 	}
-	if (xStart < 0) {
-		xCtrStart = -xStart;
+	if (args.xStart < 0) {
+		xCtrStart = -args.xStart;
 		xCtrBppStart = xCtrStart * SrcBytesPerPixel;
-		xStart = 0;
+		args.xStart = 0;
 	}
-	int destY = yStart, srcYCtr = 0, yCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 4 == 0) ? dstRect.height() : (dstRect.height() - 1);
-	if (ScaleThreshold != 0) yCtrHeight = dstRect.height();
-	if (yStart < 0) {
-		yCtr = -yStart;
+	int destY = args.yStart, srcYCtr = 0, yCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 4 == 0) ? args.dstRect.height() : (args.dstRect.height() - 1);
+	if (ScaleThreshold != 0) yCtrHeight = args.dstRect.height();
+	if (args.yStart < 0) {
+		yCtr = -args.yStart;
 		destY = 0;
 		if (ScaleThreshold != 0) {
-			scaleYCtr = yCtr * scaleY;
+			scaleYCtr = yCtr * args.scaleY;
 			srcYCtr = scaleYCtr / ScaleThreshold;
 		}
 	}
-	if (yStart + yCtrHeight > destArea.h) {
-		yCtrHeight = destArea.h - yStart;
+	if (args.yStart + yCtrHeight > args.destArea.h) {
+		yCtrHeight = args.destArea.h - args.yStart;
 	}
 	
-	byte *destP = (byte *)destArea.getBasePtr(0, destY);
-	const byte *srcP = (const byte *)src.getBasePtr(
-	                       horizFlip ? srcArea.right - 4 : srcArea.left,
-	                       vertFlip ? srcArea.bottom - 1 - yCtr : srcArea.top + yCtr);
-	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += scaleY) {
+	byte *destP = (byte *)args.destArea.getBasePtr(0, destY);
+	const byte *srcP = (const byte *)args.src.getBasePtr(
+	                       args.horizFlip ? args.srcArea.right - 4 : args.srcArea.left,
+	                       args.vertFlip ? args.srcArea.bottom - 1 - yCtr : args.srcArea.top + yCtr);
+	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
 		uint32x4_t xCtrWidthSIMD = vdupq_n_u32(xCtrWidth); // This is the width of the row
 
 		if (ScaleThreshold == 0) {
 			// If we are not scaling the image
-			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
+			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
 				byte *destPtr = &destP[destX * DestBytesPerPixel];
 				// Skip pixels that are beyond the row
 				uint32x4_t skipMask = vcgeq_u32(vaddq_u32(vdupq_n_u32(xCtr), addIndexes), xCtrWidthSIMD);
-				drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
+				drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, skipMask);
 			}
 			// Goto next row in source and destination image
-			destP += destArea.pitch;
-			srcP += vertFlip ? -src.pitch : src.pitch;
+			destP += args.destArea.pitch;
+			srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
 		} else {
 			// Here we are scaling the image
 			int newSrcYCtr = scaleYCtr / ScaleThreshold;
@@ -83,7 +83,7 @@ void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32 transColor, ui
 			// to see if we are on a new row...
 			if (srcYCtr != newSrcYCtr) {
 				int diffSrcYCtr = newSrcYCtr - srcYCtr; // Have we moved yet
-				srcP += src.pitch * diffSrcYCtr;
+				srcP += args.src.pitch * diffSrcYCtr;
 				srcYCtr = newSrcYCtr;
 			}
 
@@ -91,7 +91,7 @@ void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32 transColor, ui
 			// scaling size, we create a small dummy buffer that we copy the pixels into and then
 			// call the drawPixelsSIMD function
 			byte srcBuffer[4*4];
-			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart, scaleXCtr = xCtrStart * scaleX; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
+			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart, scaleXCtr = xCtrStart * args.scaleX; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
 				if (yCtr + 1 == yCtrHeight && xCtr + 4 > xCtrWidth) break; // Don't go past the last 4 pixels
 				uint32x4_t indexes = vdupq_n_u32(scaleXCtr);
 #if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
@@ -105,13 +105,13 @@ void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32 transColor, ui
 				memcpy(&srcBuffer[1*(uintptr_t)SrcBytesPerPixel], srcP + vgetq_lane_u32(indexes, 1), SrcBytesPerPixel);
 				memcpy(&srcBuffer[2*(uintptr_t)SrcBytesPerPixel], srcP + vgetq_lane_u32(indexes, 2), SrcBytesPerPixel);
 				memcpy(&srcBuffer[3*(uintptr_t)SrcBytesPerPixel], srcP + vgetq_lane_u32(indexes, 3), SrcBytesPerPixel);
-				scaleXCtr += scaleX*4;
+				scaleXCtr += args.scaleX*4;
 
 				// Now this is pretty much the same as before with non-scaled code, except that we use
 				// our dummy source buffer instead of the actual source bitmap
 				byte *destPtr = &destP[destX * (uintptr_t)DestBytesPerPixel];
 				uint32x4_t skipMask = vcgeq_u32(vaddq_u32(vdupq_n_u32(xCtr), addIndexes), xCtrWidthSIMD);
-				drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, (const byte *)srcBuffer, tint, alphas, maskedAlphas, transColors, 1, 0, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
+				drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, (const byte *)srcBuffer, tint, alphas, maskedAlphas, transColors, 1, 0, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, skipMask);
 			}
 			// We calculate every row here except the last (because then we need to
 			// check for if we fall off the edge of the row)
@@ -121,12 +121,12 @@ void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32 transColor, ui
 			// 3) the scaling code will actually draw until the last 4 pixels of the image
 			//    and do the extra if checks because the scaling code is already much slower
 			//    than the normal drawing loop, and the less duplicate code helps here.
-			if (yCtr + 1 != yCtrHeight) destP += destArea.pitch;
+			if (yCtr + 1 != yCtrHeight) destP += args.destArea.pitch;
 		}
 	}
 
 	// Get the last x values of the last row
-	int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart;
+	int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart;
 	// We have a picture that is a multiple of 4, so no extra pixels to draw
 	if (xCtrWidth % 4 == 0) return;
 	// Drawing the last few not scaled pixels here.
@@ -135,44 +135,44 @@ void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32 transColor, ui
 	if (ScaleThreshold == 0) {
 		for (; xCtr + 4 < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
 			byte *destPtr = &destP[(ptrdiff_t)destX * DestBytesPerPixel];
-			drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, vmovq_n_u32(0));
+			drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, vmovq_n_u32(0));
 		}
 		// Because we move in 4 pixel units, and horizFlip moves in 1, we have to move
 		// 1 pixel past the last pixel we did not blit, meaning going forward 3 pixels.
-		if (horizFlip) srcP += SrcBytesPerPixel * 3;
+		if (args.horizFlip) srcP += SrcBytesPerPixel * 3;
 	} else {
 		// So if we are scaling, set up the xCtr to what it was before (AKA the last 4 or so pixels of the image)
 		xCtr = xCtrWidth - xCtrWidth % 4;
 		xCtrBpp = xCtr * SrcBytesPerPixel;
-		destX = xStart+xCtr;
+		destX = args.xStart+xCtr;
 	}
 
 	// For the last 4 pixels, we just do them in serial, nothing special
 	for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += SrcBytesPerPixel) {
 		const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
 		if (ScaleThreshold != 0) {
-			srcColPtr = (const byte *)(srcP + (xCtr * scaleX) / ScaleThreshold * SrcBytesPerPixel);
+			srcColPtr = (const byte *)(srcP + (xCtr * args.scaleX) / ScaleThreshold * SrcBytesPerPixel);
 		}
 		byte *destVal = (byte *)&destP[destX * DestBytesPerPixel];
 		uint32 srcCol = getColor(srcColPtr, SrcBytesPerPixel);
 		
 		// Check if this is a transparent color we should skip
-		if (skipTrans && ((srcCol & alphaMask) == transColor))
+		if (args.skipTrans && ((srcCol & args.alphaMask) == args.transColor))
 			continue;
 
-		src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
-		if (srcAlpha != -1) {
-			if (useTint) {
+		args.src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
+		if (args.srcAlpha != -1) {
+			if (args.useTint) {
 				rDest = rSrc;
 				gDest = gSrc;
 				bDest = bSrc;
 				aDest = aSrc;
-				rSrc = tintRed;
-				gSrc = tintGreen;
-				bSrc = tintBlue;
-				aSrc = srcAlpha;
+				rSrc = args.tintRed;
+				gSrc = args.tintGreen;
+				bSrc = args.tintBlue;
+				aSrc = args.srcAlpha;
 			}
-			blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, srcAlpha, useTint, destVal);
+			blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, args.srcAlpha, args.useTint, destVal);
 			srcCol = format.ARGBToColor(aDest, rDest, gDest, bDest);
 		} else {
 			srcCol = format.ARGBToColor(aSrc, rSrc, gSrc, bSrc);
@@ -185,64 +185,64 @@ void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32 transColor, ui
 }
 
 template<int ScaleThreshold>
-void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, int useTint, int sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, int horizFlip, int vertFlip, int skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
-	const int xDir = horizFlip ? -1 : 1;
+void BITMAP::drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
+	const int xDir = args.horizFlip ? -1 : 1;
 	byte rSrc, gSrc, bSrc, aSrc;
 	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
-	uint16x8_t tint = vdupq_n_u16(src.format.ARGBToColor(srcAlpha, tintRed, tintGreen, tintBlue));
-	uint16x8_t transColors = vdupq_n_u16(transColor);
-	uint16x8_t alphas = vdupq_n_u16(srcAlpha);
+	uint16x8_t tint = vdupq_n_u16(args.src.format.ARGBToColor(args.srcAlpha, args.tintRed, args.tintGreen, args.tintBlue));
+	uint16x8_t transColors = vdupq_n_u16(args.transColor);
+	uint16x8_t alphas = vdupq_n_u16(args.srcAlpha);
 
 	// This is so that we can calculate what pixels to crop off in a vectorized way
 	uint16x8_t addIndexes = {0, 1, 2, 3, 4, 5, 6, 7};
 
 	// This is so that we can calculate in parallel the pixel indexes for scaled drawing
-	if (horizFlip) addIndexes = {7, 6, 5, 4, 3, 2, 1, 0};
-	uint32x4_t scaleAdds = {0, (uint32)scaleX, (uint32)scaleX*2, (uint32)scaleX*3};
-	uint32x4_t scaleAdds2 = {(uint32)scaleX*4, (uint32)scaleX*5, (uint32)scaleX*6, (uint32)scaleX*7};
+	if (args.horizFlip) addIndexes = {7, 6, 5, 4, 3, 2, 1, 0};
+	uint32x4_t scaleAdds = {0, (uint32)args.scaleX, (uint32)args.scaleX*2, (uint32)args.scaleX*3};
+	uint32x4_t scaleAdds2 = {(uint32)args.scaleX*4, (uint32)args.scaleX*5, (uint32)args.scaleX*6, (uint32)args.scaleX*7};
 
 	// Clip the bounds ahead of time (so we don't waste time checking if we are in bounds when
 	// we are in the inner loop)
-	int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = dstRect.width();
-	if (xStart + xCtrWidth > destArea.w) {
-		xCtrWidth = destArea.w - xStart;
+	int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = args.dstRect.width();
+	if (args.xStart + xCtrWidth > args.destArea.w) {
+		xCtrWidth = args.destArea.w - args.xStart;
 	}
-	if (xStart < 0) {
-		xCtrStart = -xStart;
+	if (args.xStart < 0) {
+		xCtrStart = -args.xStart;
 		xCtrBppStart = xCtrStart * 2;
-		xStart = 0;
+		args.xStart = 0;
 	}
-	int destY = yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 8 == 0) ? dstRect.height() : (dstRect.height() - 1);
-	if (ScaleThreshold != 0) yCtrHeight = dstRect.height();
-	if (yStart < 0) {
-		yCtr = -yStart;
+	int destY = args.yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 8 == 0) ? args.dstRect.height() : (args.dstRect.height() - 1);
+	if (ScaleThreshold != 0) yCtrHeight = args.dstRect.height();
+	if (args.yStart < 0) {
+		yCtr = -args.yStart;
 		destY = 0;
 		if (ScaleThreshold != 0) {
-			scaleYCtr = yCtr * scaleY;
+			scaleYCtr = yCtr * args.scaleY;
 			srcYCtr = scaleYCtr / ScaleThreshold;
 		}
 	}
-	if (yStart + yCtrHeight > destArea.h) {
-		yCtrHeight = destArea.h - yStart;
+	if (args.yStart + yCtrHeight > args.destArea.h) {
+		yCtrHeight = args.destArea.h - args.yStart;
 	}
 	
-	byte *destP = (byte *)destArea.getBasePtr(0, destY);
-	const byte *srcP = (const byte *)src.getBasePtr(
-	                       horizFlip ? srcArea.right - 8 : srcArea.left,
-	                       vertFlip ? srcArea.bottom - 1 - yCtr : srcArea.top + yCtr);
-	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += scaleY) {
+	byte *destP = (byte *)args.destArea.getBasePtr(0, destY);
+	const byte *srcP = (const byte *)args.src.getBasePtr(
+	                       args.horizFlip ? args.srcArea.right - 8 : args.srcArea.left,
+	                       args.vertFlip ? args.srcArea.bottom - 1 - yCtr : args.srcArea.top + yCtr);
+	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
 		uint16x8_t xCtrWidthSIMD = vmovq_n_u16(xCtrWidth); // This is the width of the row
 		if (ScaleThreshold == 0) {
 			// If we are not scaling the image
-			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
+			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
 				byte *destPtr = &destP[destX * 2];
 				// Skip pixels that are beyond the row
 				uint16x8_t skipMask = vcgeq_u16(vaddq_u16(vdupq_n_u16(xCtr), addIndexes), xCtrWidthSIMD);
-				drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
+				drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, skipMask);
 			}
 			// Goto next row in source and destination image
-			destP += destArea.pitch;
-			srcP += vertFlip ? -src.pitch : src.pitch;
+			destP += args.destArea.pitch;
+			srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
 		} else {
 			// Here we are scaling the image
 			int newSrcYCtr = scaleYCtr / ScaleThreshold;
@@ -250,7 +250,7 @@ void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32 transColor, uint32 alp
 			// to see if we are on a new row...
 			if (srcYCtr != newSrcYCtr) {
 				int diffSrcYCtr = newSrcYCtr - srcYCtr;
-				srcP += src.pitch * diffSrcYCtr;
+				srcP += args.src.pitch * diffSrcYCtr;
 				srcYCtr = newSrcYCtr;
 			}
 
@@ -258,7 +258,7 @@ void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32 transColor, uint32 alp
 			// scaling size, we create a small dummy buffer that we copy the pixels into and then
 			// call the drawPixelsSIMD function
 			uint16 srcBuffer[8];
-			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart, scaleXCtr = xCtrStart * scaleX; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
+			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart, scaleXCtr = xCtrStart * args.scaleX; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
 				if (yCtr + 1 == yCtrHeight && xCtr + 8 > xCtrWidth) break;
 				uint32x4_t indexes = vdupq_n_u32(scaleXCtr), indexes2 = vdupq_n_u32(scaleXCtr);
 #if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
@@ -277,13 +277,13 @@ void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32 transColor, uint32 alp
 				srcBuffer[5] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes2, 1));
 				srcBuffer[6] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes2, 2));
 				srcBuffer[7] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes2, 3));
-				scaleXCtr += scaleX*8;
+				scaleXCtr += args.scaleX*8;
 
 				// Now this is pretty much the same as before with non-scaled code, except that we use
 				// our dummy source buffer instead of the actual source bitmap
 				byte *destPtr = &destP[destX * 2];
 				uint16x8_t skipMask = vcgeq_u16(vaddq_u16(vdupq_n_u16(xCtr), addIndexes), xCtrWidthSIMD);
-				drawPixelSIMD2Bpp(destPtr, (const byte *)srcBuffer, tint, alphas, transColors, 1, 0, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
+				drawPixelSIMD2Bpp(destPtr, (const byte *)srcBuffer, tint, alphas, transColors, 1, 0, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, skipMask);
 			}
 			// We calculate every row here except the last (because then we need to
 			// check for if we fall off the edge of the row)
@@ -293,60 +293,60 @@ void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32 transColor, uint32 alp
 			// 3) the scaling code will actually draw until the last 4 pixels of the image
 			//    and do the extra if checks because the scaling code is already much slower
 			//    than the normal drawing loop, and the less duplicate code helps here.
-			if (yCtr + 1 != yCtrHeight) destP += destArea.pitch;
+			if (yCtr + 1 != yCtrHeight) destP += args.destArea.pitch;
 		}
 	}
 
 	// We have a picture that is a multiple of 8, so no extra pixels to draw
 	if (xCtrWidth % 8 == 0) return;
 	// Get the last x values of the last row
-	int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart;
+	int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart;
 	// Drawing the last few not scaled pixels here.
 	// Same as the loop above but now we check if we are going to overflow,
 	// and thus we don't need to mask out pixels that go over the row.
 	if (ScaleThreshold == 0) {
 		for (; xCtr + 8 < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
 			byte *destPtr = &destP[destX * 2];
-			drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, vmovq_n_u16(0));
+			drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, vmovq_n_u16(0));
 		}
 		// Because we move in 8 pixel units, and horizFlip moves in 1, we have to move
 		// 1 pixel past the last pixel we did not blit, meaning going forward 7 pixels.
-		if (horizFlip) srcP += 2 * 7;
+		if (args.horizFlip) srcP += 2 * 7;
 	} else {
 		// So if we are scaling, set up the xCtr to what it was before (AKA the last 8 or so pixels of the image)
 		xCtr = xCtrWidth - xCtrWidth % 8;
 		xCtrBpp = xCtr * 2;
-		destX = xStart+xCtr;
+		destX = args.xStart+xCtr;
 	}
 
 	// For the last 4 pixels, we just do them in serial, nothing special
 	for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += 2) {
 		const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
 		if (ScaleThreshold != 0) {
-			srcColPtr = (const byte *)(srcP + (xCtr * scaleX) / ScaleThreshold * 2);
+			srcColPtr = (const byte *)(srcP + (xCtr * args.scaleX) / ScaleThreshold * 2);
 		}
 		byte *destVal = (byte *)&destP[destX * 2];
 		uint32 srcCol = (uint32)(*(const uint16 *)srcColPtr);
 		
 		// Check if this is a transparent color we should skip
-		if (skipTrans && srcCol == transColor)
+		if (args.skipTrans && srcCol == args.transColor)
 			continue;
 
-		src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
-		if (srcAlpha != -1) {
-			if (useTint) {
+		args.src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
+		if (args.srcAlpha != -1) {
+			if (args.useTint) {
 				rDest = rSrc;
 				gDest = gSrc;
 				bDest = bSrc;
 				aDest = aSrc;
-				rSrc = tintRed;
-				gSrc = tintGreen;
-				bSrc = tintBlue;
-				aSrc = srcAlpha;
+				rSrc = args.tintRed;
+				gSrc = args.tintGreen;
+				bSrc = args.tintBlue;
+				aSrc = args.srcAlpha;
 			}/* else {
 				format.colorToARGB((uint32)(*(uint16 *)destVal), aDest, rDest, gDest, bDest);
 			}*/
-			blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, srcAlpha, useTint, destVal);
+			blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, args.srcAlpha, args.useTint, destVal);
 			srcCol = format.ARGBToColor(aDest, rDest, gDest, bDest);
 		} else {
 			srcCol = format.ARGBToColor(aSrc, rSrc, gSrc, bSrc);
@@ -356,45 +356,45 @@ void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32 transColor, uint32 alp
 }
 
 template<int ScaleThreshold>
-void BITMAP::drawInner1Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, int useTint, int sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, int horizFlip, int vertFlip, int skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
-	const int xDir = horizFlip ? -1 : 1;
-	uint8x16_t transColors = vld1q_dup_u8(&transColor);
+void BITMAP::drawInner1Bpp(BITMAP::DrawInnerArgs &args) {
+	const int xDir = args.horizFlip ? -1 : 1;
+	uint8x16_t transColors = vld1q_dup_u8(&args.transColor);
 
 	// This is so that we can calculate in parallel the pixel indexes for scaled drawing
-	uint32x4_t scaleAdds1 = {0, (uint32)scaleX, (uint32)scaleX*2, (uint32)scaleX*3};
-	uint32x4_t scaleAdds2 = {(uint32)scaleX*4, (uint32)scaleX*5, (uint32)scaleX*6, (uint32)scaleX*7};
-	uint32x4_t scaleAdds3 = {(uint32)scaleX*8, (uint32)scaleX*9, (uint32)scaleX*10, (uint32)scaleX*11};
-	uint32x4_t scaleAdds4 = {(uint32)scaleX*12, (uint32)scaleX*13, (uint32)scaleX*14, (uint32)scaleX*15};
+	uint32x4_t scaleAdds1 = {0, (uint32)args.scaleX, (uint32)args.scaleX*2, (uint32)args.scaleX*3};
+	uint32x4_t scaleAdds2 = {(uint32)args.scaleX*4, (uint32)args.scaleX*5, (uint32)args.scaleX*6, (uint32)args.scaleX*7};
+	uint32x4_t scaleAdds3 = {(uint32)args.scaleX*8, (uint32)args.scaleX*9, (uint32)args.scaleX*10, (uint32)args.scaleX*11};
+	uint32x4_t scaleAdds4 = {(uint32)args.scaleX*12, (uint32)args.scaleX*13, (uint32)args.scaleX*14, (uint32)args.scaleX*15};
 	
 	// Clip the bounds ahead of time (so we don't waste time checking if we are in bounds when
 	// we are in the inner loop)
-	int xCtrStart = 0, xCtrWidth = dstRect.width();
-	if (xStart + xCtrWidth > destArea.w) {
-		xCtrWidth = destArea.w - xStart;
+	int xCtrStart = 0, xCtrWidth = args.dstRect.width();
+	if (args.xStart + xCtrWidth > args.destArea.w) {
+		xCtrWidth = args.destArea.w - args.xStart;
 	}
-	if (xStart < 0) {
-		xCtrStart = -xStart;
-		xStart = 0;
+	if (args.xStart < 0) {
+		xCtrStart = -args.xStart;
+		args.xStart = 0;
 	}
-	int destY = yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = dstRect.height();
-	if (ScaleThreshold != 0) yCtrHeight = dstRect.height();
-	if (yStart < 0) {
-		yCtr = -yStart;
+	int destY = args.yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = args.dstRect.height();
+	if (ScaleThreshold != 0) yCtrHeight = args.dstRect.height();
+	if (args.yStart < 0) {
+		yCtr = -args.yStart;
 		destY = 0;
 		if (ScaleThreshold != 0) {
-			scaleYCtr = yCtr * scaleY;
+			scaleYCtr = yCtr * args.scaleY;
 			srcYCtr = scaleYCtr / ScaleThreshold;
 		}
 	}
-	if (yStart + yCtrHeight > destArea.h) {
-		yCtrHeight = destArea.h - yStart;
+	if (args.yStart + yCtrHeight > args.destArea.h) {
+		yCtrHeight = args.destArea.h - args.yStart;
 	}
 	
-	byte *destP = (byte *)destArea.getBasePtr(0, destY);
-	const byte *srcP = (const byte *)src.getBasePtr(
-	                       horizFlip ? srcArea.right - 16 : srcArea.left,
-	                       vertFlip ? srcArea.bottom - 1 - yCtr : srcArea.top + yCtr);
-	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += scaleY) {
+	byte *destP = (byte *)args.destArea.getBasePtr(0, destY);
+	const byte *srcP = (const byte *)args.src.getBasePtr(
+	                       args.horizFlip ? args.srcArea.right - 16 : args.srcArea.left,
+	                       args.vertFlip ? args.srcArea.bottom - 1 - yCtr : args.srcArea.top + yCtr);
+	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
 		if (ScaleThreshold != 0) {
 			// So here we update the srcYCtr differently due to this being for
 			// scaling
@@ -403,11 +403,11 @@ void BITMAP::drawInner1Bpp(int yStart, int xStart, uint32 transColor, uint32 alp
 				// Since the source yctr might not update every row of the destination, we have
 				// to see if we are on a new row...
 				int diffSrcYCtr = newSrcYCtr - srcYCtr;
-				srcP += src.pitch * diffSrcYCtr;
+				srcP += args.src.pitch * diffSrcYCtr;
 				srcYCtr = newSrcYCtr;
 			}
 		}
-		int xCtr = xCtrStart, destX = xStart, scaleXCtr = xCtrStart * scaleX;
+		int xCtr = xCtrStart, destX = args.xStart, scaleXCtr = xCtrStart * args.scaleX;
 		for (; xCtr + 16 < xCtrWidth; destX += 16, xCtr += 16) {
 			byte *destPtr = &destP[destX];
 
@@ -443,13 +443,13 @@ void BITMAP::drawInner1Bpp(int yStart, int xStart, uint32 transColor, uint32 alp
 				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes4, 1)], srcCols, 13);
 				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes4, 2)], srcCols, 14);
 				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes4, 3)], srcCols, 15);
-				scaleXCtr += scaleX*16;
+				scaleXCtr += args.scaleX*16;
 			}
 
 			// Mask out transparent pixels
-			uint8x16_t mask1 = skipTrans ? vceqq_u8(srcCols, transColors) : vmovq_n_u8(0);
+			uint8x16_t mask1 = args.skipTrans ? vceqq_u8(srcCols, transColors) : vmovq_n_u8(0);
 			uint8x16_t final = vorrq_u8(vandq_u8(srcCols, vmvnq_u8(mask1)), vandq_u8(destCols, mask1));
-			if (horizFlip) {
+			if (args.horizFlip) {
 				final = vrev64q_u8(final);
 				final = vcombine_u8(vget_high_u8(final), vget_low_u8(final));
 			}
@@ -459,37 +459,37 @@ void BITMAP::drawInner1Bpp(int yStart, int xStart, uint32 transColor, uint32 alp
 
 		// Because we move in 16 pixel units, and horizFlip moves in 1, we have to move
 		// 1 pixel past the last pixel we did not blit, meaning going forward 15 pixels.
-		if (horizFlip) srcP += 15;
-		for (; xCtr < xCtrWidth; ++destX, ++xCtr, scaleXCtr += scaleX) {
+		if (args.horizFlip) srcP += 15;
+		for (; xCtr < xCtrWidth; ++destX, ++xCtr, scaleXCtr += args.scaleX) {
 			const byte *srcCol = (const byte *)(srcP + xDir * xCtr);
 			if (ScaleThreshold != 0) {
 				srcCol = (const byte *)(srcP + scaleXCtr / ScaleThreshold);
 			}
 			// Check if this is a transparent color we should skip
-			if (skipTrans && *srcCol == transColor)
+			if (args.skipTrans && *srcCol == args.transColor)
 				continue;
 
 			byte *destVal = (byte *)&destP[destX];
 			*destVal = *srcCol;
 		}
-		if (horizFlip) srcP -= 15; // Undo what we did up there
-		destP += destArea.pitch; // Goto next row
+		if (args.horizFlip) srcP -= 15; // Undo what we did up there
+		destP += args.destArea.pitch; // Goto next row
 		// Only advance the src row by 1 every time like this if we don't scale
-		if (ScaleThreshold == 0) srcP += vertFlip ? -src.pitch : src.pitch;
+		if (ScaleThreshold == 0) srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
 	}
 }
 
 
-template void BITMAP::drawInner4BppWithConv<4, 4, 0>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner4BppWithConv<4, 4, 0x100>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner4BppWithConv<4, 2, 0>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner4BppWithConv<4, 2, 0x100>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner4BppWithConv<2, 4, 0>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner4BppWithConv<2, 4, 0x100>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner2Bpp<0>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner2Bpp<0x100>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner1Bpp<0>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner1Bpp<0x100>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<4, 4, 0>(DrawInnerArgs &args);
+template void BITMAP::drawInner4BppWithConv<4, 4, 0x100>(DrawInnerArgs &args);
+template void BITMAP::drawInner4BppWithConv<4, 2, 0>(DrawInnerArgs &args);
+template void BITMAP::drawInner4BppWithConv<4, 2, 0x100>(DrawInnerArgs &args);
+template void BITMAP::drawInner4BppWithConv<2, 4, 0>(DrawInnerArgs &args);
+template void BITMAP::drawInner4BppWithConv<2, 4, 0x100>(DrawInnerArgs &args);
+template void BITMAP::drawInner2Bpp<0>(DrawInnerArgs &args);
+template void BITMAP::drawInner2Bpp<0x100>(DrawInnerArgs &args);
+template void BITMAP::drawInner1Bpp<0>(DrawInnerArgs &args);
+template void BITMAP::drawInner1Bpp<0x100>(DrawInnerArgs &args);
 
 } // namespace AGS3
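
The skipMask computed in the NEON loops above (vcgeq_u32 of xCtr plus the lane index against the row width) is what lets the code blit four pixels at a time without writing past the end of a row: lanes beyond the edge get an all-ones mask and keep the destination pixel. A minimal sketch of that idea, assuming an ARM target with arm_neon.h available; storeRowChunk and its parameters are illustrative, not engine code:

#include <arm_neon.h>
#include <cstdint>

// Blend one 4-pixel chunk into dst, but keep the existing destination value in
// any lane whose pixel index (xCtr + lane) falls at or beyond rowWidth.
// Assumes both buffers have at least 4 readable/writable elements here.
void storeRowChunk(uint32_t *dst, const uint32_t *src, int xCtr, int rowWidth) {
	uint32x4_t laneIdx  = {0, 1, 2, 3};
	uint32x4_t widthVec = vdupq_n_u32((uint32_t)rowWidth);
	// all-ones in lanes that would run past the end of the row
	uint32x4_t skipMask = vcgeq_u32(vaddq_u32(vdupq_n_u32((uint32_t)xCtr), laneIdx), widthVec);
	uint32x4_t srcPix   = vld1q_u32(src);
	uint32x4_t dstPix   = vld1q_u32(dst);
	// where skipMask is set keep dstPix, otherwise take the new srcPix
	vst1q_u32(dst, vbslq_u32(skipMask, dstPix, srcPix));
}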
 
diff --git a/engines/ags/lib/allegro/surface_simd_sse.cpp b/engines/ags/lib/allegro/surface_simd_sse.cpp
index 260e48fde7e..2d450acbb68 100644
--- a/engines/ags/lib/allegro/surface_simd_sse.cpp
+++ b/engines/ags/lib/allegro/surface_simd_sse.cpp
@@ -26,68 +26,68 @@ inline uint32 extract32_idx3(__m128i x) {
 
 // This template handles 2bpp and 4bpp, the other specializations handle 1bpp and format conversion blits
 template<int DestBytesPerPixel, int SrcBytesPerPixel, int ScaleThreshold>
-void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, int useTint, int sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, int horizFlip, int vertFlip, int skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
-	const int xDir = horizFlip ? -1 : 1;
+void BITMAP::drawInner4BppWithConv(DrawInnerArgs &args) {
+	const int xDir = args.horizFlip ? -1 : 1;
 	byte rSrc, gSrc, bSrc, aSrc;
 	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
-    __m128i tint = _mm_sll_epi32(_mm_set1_epi32(srcAlpha), _mm_set1_epi32(24));
-	tint = _mm_or_si128(tint, _mm_sll_epi32(_mm_set1_epi32(tintRed), _mm_set1_epi32(16)));
-	tint = _mm_or_si128(tint, _mm_sll_epi32(_mm_set1_epi32(tintGreen), _mm_set1_epi32(8)));
-	tint = _mm_or_si128(tint, _mm_set1_epi32(tintBlue));
-	__m128i maskedAlphas = _mm_set1_epi32(alphaMask);
-	__m128i transColors = _mm_set1_epi32(transColor);
-    __m128i alphas = _mm_set1_epi32(srcAlpha);
+    __m128i tint = _mm_sll_epi32(_mm_set1_epi32(args.srcAlpha), _mm_set1_epi32(24));
+	tint = _mm_or_si128(tint, _mm_sll_epi32(_mm_set1_epi32(args.tintRed), _mm_set1_epi32(16)));
+	tint = _mm_or_si128(tint, _mm_sll_epi32(_mm_set1_epi32(args.tintGreen), _mm_set1_epi32(8)));
+	tint = _mm_or_si128(tint, _mm_set1_epi32(args.tintBlue));
+	__m128i maskedAlphas = _mm_set1_epi32(args.alphaMask);
+	__m128i transColors = _mm_set1_epi32(args.transColor);
+    __m128i alphas = _mm_set1_epi32(args.srcAlpha);
 
 	// This is so that we can calculate what pixels to crop off in a vectorized way
 	__m128i addIndexes = _mm_set_epi32(3, 2, 1, 0);
-	if (horizFlip) addIndexes = _mm_set_epi32(0, 1, 2, 3);
+	if (args.horizFlip) addIndexes = _mm_set_epi32(0, 1, 2, 3);
 
 	// This is so that we can calculate in parallel the pixel indexes for scaled drawing
-	__m128i scaleAdds = _mm_set_epi32((uint32)scaleX*3, (uint32)scaleX*2, (uint32)scaleX, 0);
+	__m128i scaleAdds = _mm_set_epi32((uint32)args.scaleX*3, (uint32)args.scaleX*2, (uint32)args.scaleX, 0);
 
 	// Clip the bounds ahead of time (so we don't waste time checking if we are in bounds when
 	// we are in the inner loop)
-	int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = dstRect.width();
-	if (xStart + xCtrWidth > destArea.w) {
-		xCtrWidth = destArea.w - xStart;
+	int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = args.dstRect.width();
+	if (args.xStart + xCtrWidth > args.destArea.w) {
+		xCtrWidth = args.destArea.w - args.xStart;
 	}
-	if (xStart < 0) {
-		xCtrStart = -xStart;
+	if (args.xStart < 0) {
+		xCtrStart = -args.xStart;
 		xCtrBppStart = xCtrStart * SrcBytesPerPixel;
-		xStart = 0;
+		args.xStart = 0;
 	}
-	int destY = yStart, srcYCtr = 0, yCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 4 == 0) ? dstRect.height() : (dstRect.height() - 1);
-	if (ScaleThreshold != 0) yCtrHeight = dstRect.height();
-	if (yStart < 0) {
-		yCtr = -yStart;
+	int destY = args.yStart, srcYCtr = 0, yCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 4 == 0) ? args.dstRect.height() : (args.dstRect.height() - 1);
+	if (ScaleThreshold != 0) yCtrHeight = args.dstRect.height();
+	if (args.yStart < 0) {
+		yCtr = -args.yStart;
 		destY = 0;
 		if (ScaleThreshold != 0) {
-			scaleYCtr = yCtr * scaleY;
+			scaleYCtr = yCtr * args.scaleY;
 			srcYCtr = scaleYCtr / ScaleThreshold;
 		}
 	}
-	if (yStart + yCtrHeight > destArea.h) {
-		yCtrHeight = destArea.h - yStart;
+	if (args.yStart + yCtrHeight > args.destArea.h) {
+		yCtrHeight = args.destArea.h - args.yStart;
 	}
 	
-	byte *destP = (byte *)destArea.getBasePtr(0, destY);
-	const byte *srcP = (const byte *)src.getBasePtr(
-	                       horizFlip ? srcArea.right - 4 : srcArea.left,
-	                       vertFlip ? srcArea.bottom - 1 - yCtr : srcArea.top + yCtr);
-	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += scaleY) {
+	byte *destP = (byte *)args.destArea.getBasePtr(0, destY);
+	const byte *srcP = (const byte *)args.src.getBasePtr(
+	                       args.horizFlip ? args.srcArea.right - 4 : args.srcArea.left,
+	                       args.vertFlip ? args.srcArea.bottom - 1 - yCtr : args.srcArea.top + yCtr);
+	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
 		__m128i xCtrWidthSIMD = _mm_set1_epi32(xCtrWidth); // This is the width of the row
 
 		if (ScaleThreshold == 0) {
 			// If we are not scaling the image
-			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
+			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
 				byte *destPtr = &destP[destX * DestBytesPerPixel];
 				// Skip pixels that are beyond the row
 				__m128i skipMask = _mm_cmpgt_epi32(_mm_add_epi32(_mm_add_epi32(_mm_set1_epi32(xCtr), addIndexes), _mm_set1_epi32(1)), xCtrWidthSIMD);
-				drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
+				drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, skipMask);
 			}
 			// Goto next row in source and destination image
-			destP += destArea.pitch;
-			srcP += vertFlip ? -src.pitch : src.pitch;
+			destP += args.destArea.pitch;
+			srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
 		} else {
 			// Here we are scaling the image
 			int newSrcYCtr = scaleYCtr / ScaleThreshold;
@@ -95,7 +95,7 @@ void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32 transColor, ui
 			// to see if we are on a new row...
 			if (srcYCtr != newSrcYCtr) {
 				int diffSrcYCtr = newSrcYCtr - srcYCtr; // Have we moved yet
-				srcP += src.pitch * diffSrcYCtr;
+				srcP += args.src.pitch * diffSrcYCtr;
 				srcYCtr = newSrcYCtr;
 			}
 
@@ -103,7 +103,7 @@ void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32 transColor, ui
 			// scaling size, we create a small dummy buffer that we copy the pixels into and then
 			// call the drawPixelsSIMD function
 			byte srcBuffer[4*4] = {0};
-			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart, scaleXCtr = xCtrStart * scaleX; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
+			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart, scaleXCtr = xCtrStart * args.scaleX; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
 				if (yCtr + 1 == yCtrHeight && xCtr + 4 > xCtrWidth) break; // Don't go past the last 4 pixels
 				__m128i indexes = _mm_set1_epi32(scaleXCtr);
 #if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
@@ -120,13 +120,13 @@ void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32 transColor, ui
 				memcpy(&srcBuffer[1*(size_t)SrcBytesPerPixel], srcP + extract32_idx1(indexes), SrcBytesPerPixel);
 				memcpy(&srcBuffer[2*(size_t)SrcBytesPerPixel], srcP + extract32_idx2(indexes), SrcBytesPerPixel);
 				memcpy(&srcBuffer[3*(size_t)SrcBytesPerPixel], srcP + extract32_idx3(indexes), SrcBytesPerPixel);
-				scaleXCtr += scaleX*4;
+				scaleXCtr += args.scaleX*4;
 
 				// Now this is pretty much the same as before with non-scaled code, except that we use
 				// our dummy source buffer instead of the actual source bitmap
 				byte *destPtr = &destP[destX * (intptr_t)DestBytesPerPixel];
 				__m128i skipMask = _mm_cmpgt_epi32(_mm_add_epi32(_mm_add_epi32(_mm_set1_epi32(xCtr), addIndexes), _mm_set1_epi32(1)), xCtrWidthSIMD);
-				drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, (const byte *)srcBuffer, tint, alphas, maskedAlphas, transColors, 1, 0, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
+				drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, (const byte *)srcBuffer, tint, alphas, maskedAlphas, transColors, 1, 0, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, skipMask);
 			}
 			// We calculate every row here except the last (because then we need to
 			// check for if we fall off the edge of the row)
@@ -136,12 +136,12 @@ void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32 transColor, ui
 			// 3) the scaling code will actually draw until the last 4 pixels of the image
 			//    and do the extra if checks because the scaling code is already much slower
 			//    than the normal drawing loop, and the less duplicate code helps here.
-			if (yCtr + 1 != yCtrHeight) destP += destArea.pitch;
+			if (yCtr + 1 != yCtrHeight) destP += args.destArea.pitch;
 		}
 	}
 
 	// Get the last x values of the last row
-	int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart;
+	int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart;
 	// We have a picture that is a multiple of 4, so no extra pixels to draw
 	if (xCtrWidth % 4 == 0) return;
 	// Drawing the last few not scaled pixels here.
@@ -150,44 +150,44 @@ void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32 transColor, ui
 	if (ScaleThreshold == 0) {
 		for (; xCtr + 4 < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
 			byte *destPtr = &destP[(ptrdiff_t)destX * DestBytesPerPixel];
-			drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, _mm_setzero_si128());
+			drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, _mm_setzero_si128());
 		}
 		// Because we move in 4 pixel units, and horizFlip moves in 1, we have to move
 		// 1 pixel past the last pixel we did not blit, meaning going forward 3 pixels.
-		if (horizFlip) srcP += SrcBytesPerPixel * 3;
+		if (args.horizFlip) srcP += SrcBytesPerPixel * 3;
 	} else {
 		// So if we are scaling, set up the xCtr to what it was before (AKA the last 4 or so pixels of the image)
 		xCtr = xCtrWidth - xCtrWidth % 4;
 		xCtrBpp = xCtr * SrcBytesPerPixel;
-		destX = xStart+xCtr;
+		destX = args.xStart+xCtr;
 	}
 
 	// For the last 4 pixels, we just do them in serial, nothing special
 	for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += SrcBytesPerPixel) {
 		const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
 		if (ScaleThreshold != 0) {
-			srcColPtr = (const byte *)(srcP + (xCtr * scaleX) / ScaleThreshold * SrcBytesPerPixel);
+			srcColPtr = (const byte *)(srcP + (xCtr * args.scaleX) / ScaleThreshold * SrcBytesPerPixel);
 		}
 		byte *destVal = (byte *)&destP[destX * DestBytesPerPixel];
 		uint32 srcCol = getColor(srcColPtr, SrcBytesPerPixel);
 		
 		// Check if this is a transparent color we should skip
-		if (skipTrans && ((srcCol & alphaMask) == transColor))
+		if (args.skipTrans && ((srcCol & args.alphaMask) == args.transColor))
 			continue;
 
-		src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
-		if (srcAlpha != -1) {
-			if (useTint) {
+		args.src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
+		if (args.srcAlpha != -1) {
+			if (args.useTint) {
 				rDest = rSrc;
 				gDest = gSrc;
 				bDest = bSrc;
 				aDest = aSrc;
-				rSrc = tintRed;
-				gSrc = tintGreen;
-				bSrc = tintBlue;
-				aSrc = srcAlpha;
+				rSrc = args.tintRed;
+				gSrc = args.tintGreen;
+				bSrc = args.tintBlue;
+				aSrc = args.srcAlpha;
 			}
-			blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, srcAlpha, useTint, destVal);
+			blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, args.srcAlpha, args.useTint, destVal);
 			srcCol = format.ARGBToColor(aDest, rDest, gDest, bDest);
 		} else {
 			srcCol = format.ARGBToColor(aSrc, rSrc, gSrc, bSrc);
@@ -200,64 +200,64 @@ void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32 transColor, ui
 }
 
 template<int ScaleThreshold>
-void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, int useTint, int sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, int horizFlip, int vertFlip, int skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
-	const int xDir = horizFlip ? -1 : 1;
+void BITMAP::drawInner2Bpp(DrawInnerArgs &args) {
+	const int xDir = args.horizFlip ? -1 : 1;
 	byte rSrc, gSrc, bSrc, aSrc;
 	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
-	__m128i tint = _mm_set1_epi16(src.format.ARGBToColor(srcAlpha, tintRed, tintGreen, tintBlue));
-	__m128i transColors = _mm_set1_epi16(transColor);
-	__m128i alphas = _mm_set1_epi16(srcAlpha);
+	__m128i tint = _mm_set1_epi16(args.src.format.ARGBToColor(args.srcAlpha, args.tintRed, args.tintGreen, args.tintBlue));
+	__m128i transColors = _mm_set1_epi16(args.transColor);
+	__m128i alphas = _mm_set1_epi16(args.srcAlpha);
 
 	// This is so that we can calculate what pixels to crop off in a vectorized way
 	__m128i addIndexes = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
 
 	// This is so that we can calculate in parallel the pixel indexes for scaled drawing
-	if (horizFlip) addIndexes = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
-	__m128i scaleAdds = _mm_set_epi32((uint32)scaleX*3, (uint32)scaleX*2, (uint32)scaleX, 0);
-	__m128i scaleAdds2 = _mm_set_epi32((uint32)scaleX*7, (uint32)scaleX*6, (uint32)scaleX*5, (uint32)scaleX*4);
+	if (args.horizFlip) addIndexes = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+	__m128i scaleAdds = _mm_set_epi32((uint32)args.scaleX*3, (uint32)args.scaleX*2, (uint32)args.scaleX, 0);
+	__m128i scaleAdds2 = _mm_set_epi32((uint32)args.scaleX*7, (uint32)args.scaleX*6, (uint32)args.scaleX*5, (uint32)args.scaleX*4);
 
 	// Clip the bounds ahead of time (so we don't waste time checking if we are in bounds when
 	// we are in the inner loop)
-	int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = dstRect.width();
-	if (xStart + xCtrWidth > destArea.w) {
-		xCtrWidth = destArea.w - xStart;
+	int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = args.dstRect.width();
+	if (args.xStart + xCtrWidth > args.destArea.w) {
+		xCtrWidth = args.destArea.w - args.xStart;
 	}
-	if (xStart < 0) {
-		xCtrStart = -xStart;
+	if (args.xStart < 0) {
+		xCtrStart = -args.xStart;
 		xCtrBppStart = xCtrStart * 2;
-		xStart = 0;
+		args.xStart = 0;
 	}
-	int destY = yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 8 == 0) ? dstRect.height() : (dstRect.height() - 1);
-	if (ScaleThreshold != 0) yCtrHeight = dstRect.height();
-	if (yStart < 0) {
-		yCtr = -yStart;
+	int destY = args.yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 8 == 0) ? args.dstRect.height() : (args.dstRect.height() - 1);
+	if (ScaleThreshold != 0) yCtrHeight = args.dstRect.height();
+	if (args.yStart < 0) {
+		yCtr = -args.yStart;
 		destY = 0;
 		if (ScaleThreshold != 0) {
-			scaleYCtr = yCtr * scaleY;
+			scaleYCtr = yCtr * args.scaleY;
 			srcYCtr = scaleYCtr / ScaleThreshold;
 		}
 	}
-	if (yStart + yCtrHeight > destArea.h) {
-		yCtrHeight = destArea.h - yStart;
+	if (args.yStart + yCtrHeight > args.destArea.h) {
+		yCtrHeight = args.destArea.h - args.yStart;
 	}
 	
-	byte *destP = (byte *)destArea.getBasePtr(0, destY);
-	const byte *srcP = (const byte *)src.getBasePtr(
-	                       horizFlip ? srcArea.right - 8 : srcArea.left,
-	                       vertFlip ? srcArea.bottom - 1 - yCtr : srcArea.top + yCtr);
-	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += scaleY) {
+	byte *destP = (byte *)args.destArea.getBasePtr(0, destY);
+	const byte *srcP = (const byte *)args.src.getBasePtr(
+	                       args.horizFlip ? args.srcArea.right - 8 : args.srcArea.left,
+	                       args.vertFlip ? args.srcArea.bottom - 1 - yCtr : args.srcArea.top + yCtr);
+	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
 		__m128i xCtrWidthSIMD = _mm_set1_epi16(xCtrWidth); // This is the width of the row
 		if (ScaleThreshold == 0) {
 			// If we are not scaling the image
-			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
+			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
 				byte *destPtr = &destP[destX * 2];
 				// Skip pixels that are beyond the row
 				__m128i skipMask = _mm_cmpgt_epi16(_mm_add_epi16(_mm_add_epi16(_mm_set1_epi16(xCtr), addIndexes), _mm_set1_epi16(1)), xCtrWidthSIMD);
-				drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
+				drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, skipMask);
 			}
 			// Goto next row in source and destination image
-			destP += destArea.pitch;
-			srcP += vertFlip ? -src.pitch : src.pitch;
+			destP += args.destArea.pitch;
+			srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
 		} else {
 			// Here we are scaling the image
 			int newSrcYCtr = scaleYCtr / ScaleThreshold;
@@ -265,7 +265,7 @@ void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32 transColor, uint32 alp
 			// to see if we are on a new row...
 			if (srcYCtr != newSrcYCtr) {
 				int diffSrcYCtr = newSrcYCtr - srcYCtr;
-				srcP += src.pitch * diffSrcYCtr;
+				srcP += args.src.pitch * diffSrcYCtr;
 				srcYCtr = newSrcYCtr;
 			}
 
@@ -273,7 +273,7 @@ void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32 transColor, uint32 alp
 			// scaling size, we create a small dummy buffer that we copy the pixels into and then
 			// call the drawPixelsSIMD function
 			uint16 srcBuffer[8];
-			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart, scaleXCtr = xCtrStart * scaleX; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
+			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart, scaleXCtr = xCtrStart * args.scaleX; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
 				if (yCtr + 1 == yCtrHeight && xCtr + 8 > xCtrWidth) break;
 				__m128i indexes = _mm_set1_epi32(scaleXCtr), indexes2 = _mm_set1_epi32(scaleXCtr);
 #if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
@@ -292,13 +292,13 @@ void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32 transColor, uint32 alp
 				srcBuffer[5] = *(const uint16 *)(srcP + extract32_idx1(indexes2));
 				srcBuffer[6] = *(const uint16 *)(srcP + extract32_idx2(indexes2));
 				srcBuffer[7] = *(const uint16 *)(srcP + extract32_idx3(indexes2));
-				scaleXCtr += scaleX*8;
+				scaleXCtr += args.scaleX*8;
 
 				// Now this is pretty much the same as before with non-scaled code, except that we use
 				// our dummy source buffer instead of the actual source bitmap
 				byte *destPtr = &destP[destX * 2];
 				__m128i skipMask = _mm_cmpgt_epi16(_mm_add_epi16(_mm_add_epi16(_mm_set1_epi16(xCtr), addIndexes), _mm_set1_epi16(1)), xCtrWidthSIMD);
-				drawPixelSIMD2Bpp(destPtr, (const byte *)srcBuffer, tint, alphas, transColors, 1, 0, srcAlpha, skipTrans, horizFlip, useTint, skipMask);
+				drawPixelSIMD2Bpp(destPtr, (const byte *)srcBuffer, tint, alphas, transColors, 1, 0, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, skipMask);
 			}
 			// We calculate every row here except the last (because then we need to
 			// check for if we fall off the edge of the row)
@@ -308,60 +308,60 @@ void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32 transColor, uint32 alp
 			// 3) the scaling code will actually draw until the last 4 pixels of the image
 			//    and do the extra if checks because the scaling code is already much slower
 			//    than the normal drawing loop, and the less duplicate code helps here.
-			if (yCtr + 1 != yCtrHeight) destP += destArea.pitch;
+			if (yCtr + 1 != yCtrHeight) destP += args.destArea.pitch;
 		}
 	}
 
 	// We have a picture that is a multiple of 8, so no extra pixels to draw
 	if (xCtrWidth % 8 == 0) return;
 	// Get the last x values of the last row
-	int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = xStart;
+	int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart;
 	// Drawing the last few not scaled pixels here.
 	// Same as the loop above but now we check if we are going to overflow,
 	// and thus we don't need to mask out pixels that go over the row.
 	if (ScaleThreshold == 0) {
 		for (; xCtr + 8 < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
 			byte *destPtr = &destP[destX * 2];
-			drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, srcAlpha, skipTrans, horizFlip, useTint, _mm_setzero_si128());
+			drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, _mm_setzero_si128());
 		}
 		// Because we move in 8 pixel units, and horizFlip moves in 1, we have to move
 		// 1 pixel past the last pixel we did not blit, meaning going forward 7 pixels.
-		if (horizFlip) srcP += 2 * 7;
+		if (args.horizFlip) srcP += 2 * 7;
 	} else {
 		// So if we are scaling, set up the xCtr to what it was before (AKA the last 8 or so pixels of the image)
 		xCtr = xCtrWidth - xCtrWidth % 8;
 		xCtrBpp = xCtr * 2;
-		destX = xStart+xCtr;
+		destX = args.xStart+xCtr;
 	}
 
 	// For the last 4 pixels, we just do them in serial, nothing special
 	for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += 2) {
 		const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
 		if (ScaleThreshold != 0) {
-			srcColPtr = (const byte *)(srcP + (xCtr * scaleX) / ScaleThreshold * 2);
+			srcColPtr = (const byte *)(srcP + (xCtr * args.scaleX) / ScaleThreshold * 2);
 		}
 		byte *destVal = (byte *)&destP[destX * 2];
 		uint32 srcCol = (uint32)(*(const uint16 *)srcColPtr);
 		
 		// Check if this is a transparent color we should skip
-		if (skipTrans && srcCol == transColor)
+		if (args.skipTrans && srcCol == args.transColor)
 			continue;
 
-		src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
-		if (srcAlpha != -1) {
-			if (useTint) {
+		args.src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
+		if (args.srcAlpha != -1) {
+			if (args.useTint) {
 				rDest = rSrc;
 				gDest = gSrc;
 				bDest = bSrc;
 				aDest = aSrc;
-				rSrc = tintRed;
-				gSrc = tintGreen;
-				bSrc = tintBlue;
-				aSrc = srcAlpha;
+				rSrc = args.tintRed;
+				gSrc = args.tintGreen;
+				bSrc = args.tintBlue;
+				aSrc = args.srcAlpha;
 			}/* else {
 				format.colorToARGB((uint32)(*(uint16 *)destVal), aDest, rDest, gDest, bDest);
 			}*/
-			blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, srcAlpha, useTint, destVal);
+			blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, args.srcAlpha, args.useTint, destVal);
 			srcCol = format.ARGBToColor(aDest, rDest, gDest, bDest);
 		} else {
 			srcCol = format.ARGBToColor(aSrc, rSrc, gSrc, bSrc);
@@ -371,45 +371,45 @@ void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32 transColor, uint32 alp
 }
 
 template<int ScaleThreshold>
-void BITMAP::drawInner1Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, int useTint, int sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, int horizFlip, int vertFlip, int skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
-	const int xDir = horizFlip ? -1 : 1;
-	__m128i transColors = _mm_set1_epi16(transColor | (transColor << 8));
+void BITMAP::drawInner1Bpp(DrawInnerArgs &args) {
+	const int xDir = args.horizFlip ? -1 : 1;
+	__m128i transColors = _mm_set1_epi16(args.transColor | (args.transColor << 8));
 
 	// This is so that we can calculate in parallel the pixel indexes for scaled drawing
-	__m128i scaleAdds1 = _mm_set_epi32((uint32)scaleX*3, (uint32)scaleX*2, (uint32)scaleX, 0);
-	__m128i scaleAdds2 = _mm_set_epi32((uint32)scaleX*7, (uint32)scaleX*6, (uint32)scaleX*5, (uint32)scaleX*4);
-	__m128i scaleAdds3 = _mm_set_epi32((uint32)scaleX*11, (uint32)scaleX*10, (uint32)scaleX*9, (uint32)scaleX*8);
-	__m128i scaleAdds4 = _mm_set_epi32((uint32)scaleX*15, (uint32)scaleX*14, (uint32)scaleX*13, (uint32)scaleX*12);
+	__m128i scaleAdds1 = _mm_set_epi32((uint32)args.scaleX*3, (uint32)args.scaleX*2, (uint32)args.scaleX, 0);
+	__m128i scaleAdds2 = _mm_set_epi32((uint32)args.scaleX*7, (uint32)args.scaleX*6, (uint32)args.scaleX*5, (uint32)args.scaleX*4);
+	__m128i scaleAdds3 = _mm_set_epi32((uint32)args.scaleX*11, (uint32)args.scaleX*10, (uint32)args.scaleX*9, (uint32)args.scaleX*8);
+	__m128i scaleAdds4 = _mm_set_epi32((uint32)args.scaleX*15, (uint32)args.scaleX*14, (uint32)args.scaleX*13, (uint32)args.scaleX*12);
 	
 	// Clip the bounds ahead of time (so we don't waste time checking if we are in bounds when
 	// we are in the inner loop)
-	int xCtrStart = 0, xCtrWidth = dstRect.width();
-	if (xStart + xCtrWidth > destArea.w) {
-		xCtrWidth = destArea.w - xStart;
+	int xCtrStart = 0, xCtrWidth = args.dstRect.width();
+	if (args.xStart + xCtrWidth > args.destArea.w) {
+		xCtrWidth = args.destArea.w - args.xStart;
 	}
-	if (xStart < 0) {
-		xCtrStart = -xStart;
-		xStart = 0;
+	if (args.xStart < 0) {
+		xCtrStart = -args.xStart;
+		args.xStart = 0;
 	}
-	int destY = yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = dstRect.height();
-	if (ScaleThreshold != 0) yCtrHeight = dstRect.height();
-	if (yStart < 0) {
-		yCtr = -yStart;
+	int destY = args.yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = args.dstRect.height();
+	if (ScaleThreshold != 0) yCtrHeight = args.dstRect.height();
+	if (args.yStart < 0) {
+		yCtr = -args.yStart;
 		destY = 0;
 		if (ScaleThreshold != 0) {
-			scaleYCtr = yCtr * scaleY;
+			scaleYCtr = yCtr * args.scaleY;
 			srcYCtr = scaleYCtr / ScaleThreshold;
 		}
 	}
-	if (yStart + yCtrHeight > destArea.h) {
-		yCtrHeight = destArea.h - yStart;
+	if (args.yStart + yCtrHeight > args.destArea.h) {
+		yCtrHeight = args.destArea.h - args.yStart;
 	}
 	
-	byte *destP = (byte *)destArea.getBasePtr(0, destY);
-	const byte *srcP = (const byte *)src.getBasePtr(
-	                       horizFlip ? srcArea.right - 16 : srcArea.left,
-	                       vertFlip ? srcArea.bottom - 1 - yCtr : srcArea.top + yCtr);
-	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += scaleY) {
+	byte *destP = (byte *)args.destArea.getBasePtr(0, destY);
+	const byte *srcP = (const byte *)args.src.getBasePtr(
+	                       args.horizFlip ? args.srcArea.right - 16 : args.srcArea.left,
+	                       args.vertFlip ? args.srcArea.bottom - 1 - yCtr : args.srcArea.top + yCtr);
+	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
 		if (ScaleThreshold != 0) {
 			// So here we update the srcYCtr differently due to this being for
 			// scaling
@@ -418,11 +418,11 @@ void BITMAP::drawInner1Bpp(int yStart, int xStart, uint32 transColor, uint32 alp
 				// Since the source yctr might not update every row of the destination, we have
 				// to see if we are on a new row...
 				int diffSrcYCtr = newSrcYCtr - srcYCtr;
-				srcP += src.pitch * diffSrcYCtr;
+				srcP += args.src.pitch * diffSrcYCtr;
 				srcYCtr = newSrcYCtr;
 			}
 		}
-		int xCtr = xCtrStart, destX = xStart, scaleXCtr = xCtrStart * scaleX;
+		int xCtr = xCtrStart, destX = args.xStart, scaleXCtr = xCtrStart * args.scaleX;
 		for (; xCtr + 16 < xCtrWidth; destX += 16, xCtr += 16) {
 			byte *destPtr = &destP[destX];
 
@@ -460,13 +460,13 @@ void BITMAP::drawInner1Bpp(int yStart, int xStart, uint32 transColor, uint32 alp
 					srcP[extract32_idx2(indexes1)],
 					srcP[extract32_idx1(indexes1)],
 					srcP[extract32_idx0(indexes1)]);
-				scaleXCtr += scaleX*16;
+				scaleXCtr += args.scaleX*16;
 			}
 
 			// Mask out transparent pixels
-			__m128i mask1 = skipTrans ? _mm_cmpeq_epi8(srcCols, transColors) : _mm_setzero_si128();
+			__m128i mask1 = args.skipTrans ? _mm_cmpeq_epi8(srcCols, transColors) : _mm_setzero_si128();
 			__m128i final = _mm_or_si128(_mm_andnot_si128(mask1, srcCols), _mm_and_si128(destCols, mask1));
-			if (horizFlip) {
+			if (args.horizFlip) {
 				__m128i final_swap16 = _mm_srli_epi16(final, 8);
 				final_swap16 = _mm_or_si128(final_swap16, _mm_slli_epi16(_mm_and_si128(final, _mm_set1_epi16(0xff)), 8));
 				final_swap16 = _mm_shufflelo_epi16(final_swap16, _MM_SHUFFLE(0, 1, 2, 3));
@@ -479,36 +479,36 @@ void BITMAP::drawInner1Bpp(int yStart, int xStart, uint32 transColor, uint32 alp
 
 		// Because we move in 16 pixel units, and horizFlip moves in 1, we have to move
 		// 1 pixel past the last pixel we did not blit, meaning going forward 15 pixels.
-		if (horizFlip) srcP += 15;
-		for (; xCtr < xCtrWidth; ++destX, ++xCtr, scaleXCtr += scaleX) {
+		if (args.horizFlip) srcP += 15;
+		for (; xCtr < xCtrWidth; ++destX, ++xCtr, scaleXCtr += args.scaleX) {
 			const byte *srcCol = (const byte *)(srcP + xDir * xCtr);
 			if (ScaleThreshold != 0) {
 				srcCol = (const byte *)(srcP + scaleXCtr / ScaleThreshold);
 			}
 			// Check if this is a transparent color we should skip
-			if (skipTrans && *srcCol == transColor)
+			if (args.skipTrans && *srcCol == args.transColor)
 				continue;
 
 			byte *destVal = (byte *)&destP[destX];
 			*destVal = *srcCol;
 		}
-		if (horizFlip) srcP -= 15; // Undo what we did up there
-		destP += destArea.pitch; // Goto next row
+		if (args.horizFlip) srcP -= 15; // Undo what we did up there
+		destP += args.destArea.pitch; // Goto next row
 		// Only advance the src row by 1 every time like this if we don't scale
-		if (ScaleThreshold == 0) srcP += vertFlip ? -src.pitch : src.pitch;
+		if (ScaleThreshold == 0) srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
 	}
 }
 
-template void BITMAP::drawInner4BppWithConv<4, 4, 0>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner4BppWithConv<4, 4, 0x100>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner4BppWithConv<4, 2, 0>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner4BppWithConv<4, 2, 0x100>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner4BppWithConv<2, 4, 0>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner4BppWithConv<2, 4, 0x100>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner2Bpp<0>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner2Bpp<0x100>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner1Bpp<0>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner1Bpp<0x100>(int, int, uint32, uint32, PALETTE, int, int, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, int, int, int, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
+template void BITMAP::drawInner4BppWithConv<4, 4, 0>(DrawInnerArgs &args);
+template void BITMAP::drawInner4BppWithConv<4, 4, 0x100>(DrawInnerArgs &args);
+template void BITMAP::drawInner4BppWithConv<4, 2, 0>(DrawInnerArgs &args);
+template void BITMAP::drawInner4BppWithConv<4, 2, 0x100>(DrawInnerArgs &args);
+template void BITMAP::drawInner4BppWithConv<2, 4, 0>(DrawInnerArgs &args);
+template void BITMAP::drawInner4BppWithConv<2, 4, 0x100>(DrawInnerArgs &args);
+template void BITMAP::drawInner2Bpp<0>(DrawInnerArgs &args);
+template void BITMAP::drawInner2Bpp<0x100>(DrawInnerArgs &args);
+template void BITMAP::drawInner1Bpp<0>(DrawInnerArgs &args);
+template void BITMAP::drawInner1Bpp<0x100>(DrawInnerArgs &args);
 
 } // namespace AGS3
 


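The skipMask trick used in the loops above is easier to see in isolation. Below is a standalone sketch (not the engine's drawPixelSIMD code; the lane offsets in addIndexes are assumed to be 0..7, which is how the comparison above uses them) showing how a row that is not a multiple of 8 pixels keeps the destination value in the lanes that run past the end of the row:

    #include <emmintrin.h>
    #include <cstdint>
    #include <cstdio>

    int main() {
        const short xCtr = 0, rowWidth = 5;  // pretend only 5 of the 8 lanes are valid
        __m128i addIndexes = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);  // assumed lane offsets
        // A lane is skipped if (xCtr + lane + 1) > rowWidth, the same test as in the loops above
        __m128i skipMask = _mm_cmpgt_epi16(
            _mm_add_epi16(_mm_add_epi16(_mm_set1_epi16(xCtr), addIndexes), _mm_set1_epi16(1)),
            _mm_set1_epi16(rowWidth));
        __m128i src  = _mm_set1_epi16(0x1111);  // stand-in source pixels
        __m128i dest = _mm_set1_epi16(0x2222);  // stand-in destination pixels
        // Keep src where the mask is clear, keep dest where it is set
        __m128i out = _mm_or_si128(_mm_andnot_si128(skipMask, src), _mm_and_si128(dest, skipMask));
        uint16_t lanes[8];
        _mm_storeu_si128((__m128i *)lanes, out);
        for (int i = 0; i < 8; ++i)
            std::printf("lane %d = %04x\n", i, lanes[i]);  // lanes 0-4 keep src, 5-7 keep dest
        return 0;
    }

The same andnot/and/or select is what the 1bpp path uses for its transparency mask a few hunks up.
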
Commit: bc20c0185d2428f5eef5de9e0ab4a6cdc804408b
    https://github.com/scummvm/scummvm/commit/bc20c0185d2428f5eef5de9e0ab4a6cdc804408b
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: GRAPHICS: Changed blending function templates

Changed paths:
    engines/ags/lib/allegro/surface.cpp
    engines/ags/lib/allegro/surface.h
    engines/ags/lib/allegro/surface_simd_neon.cpp
    engines/ags/lib/allegro/surface_simd_sse.cpp


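The diff below swaps the ScaleThreshold template value for a bool Scale parameter plus the class-level constants SCALE_THRESHOLD_BITS / SCALE_THRESHOLD. A minimal sketch of that pattern (Sketch, drawRow and main are illustrative names, not engine code) shows the 8.8 fixed-point stepping the constant encodes: each destination pixel advances scaleX units, dividing by SCALE_THRESHOLD recovers the source index, and the bool only selects whether the scaling path is compiled in:

    #include <cstdio>

    struct Sketch {
        // 8.8 fixed point: SCALE_THRESHOLD units correspond to one source pixel
        constexpr static int SCALE_THRESHOLD_BITS = 8;
        constexpr static int SCALE_THRESHOLD = 1 << SCALE_THRESHOLD_BITS;

        template<bool Scale>
        static void drawRow(int width, int scaleX) {
            int scaleXCtr = 0;
            for (int x = 0; x < width; ++x, scaleXCtr += scaleX) {
                // Unscaled blits read source pixel x directly; scaled blits
                // derive the source index from the fixed-point counter.
                int srcX = Scale ? (scaleXCtr / SCALE_THRESHOLD) : x;
                std::printf("dest %d <- src %d\n", x, srcX);
            }
        }
    };

    int main() {
        Sketch::drawRow<false>(4, 0);                          // plain draw(): 1:1
        Sketch::drawRow<true>(4, Sketch::SCALE_THRESHOLD / 2); // 2x stretch: scaleX = 128
        return 0;
    }

With the threshold fixed at compile time, the scaled and unscaled paths still share one template body, and the old #if/#error guards around the hard-coded shift of 8 can go away, as the hunks below show.
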
diff --git a/engines/ags/lib/allegro/surface.cpp b/engines/ags/lib/allegro/surface.cpp
index 251c33fc218..cf4a10a6334 100644
--- a/engines/ags/lib/allegro/surface.cpp
+++ b/engines/ags/lib/allegro/surface.cpp
@@ -104,9 +104,8 @@ void BITMAP::floodfill(int x, int y, int color) {
 	AGS3::floodfill(this, x, y, color);
 }
 
-const int SCALE_THRESHOLD = 0x100;
 #define VGA_COLOR_TRANS(x) ((x) * 255 / 63)
-template<int DestBytesPerPixel, int SrcBytesPerPixel, int ScaleThreshold>
+template<int DestBytesPerPixel, int SrcBytesPerPixel, bool Scale>
 void BITMAP::drawInnerGeneric(DrawInnerArgs &args) {
 	const int xDir = args.horizFlip ? -1 : 1;
 	byte rSrc, gSrc, bSrc, aSrc;
@@ -127,9 +126,9 @@ void BITMAP::drawInnerGeneric(DrawInnerArgs &args) {
 	if (args.yStart < 0) { // Clip the top
 		yCtr = -args.yStart;
 		destY = 0;
-		if (ScaleThreshold != 0) {
+		if (Scale) {
 			scaleYCtr = yCtr * args.scaleY;
-			srcYCtr = scaleYCtr / ScaleThreshold;
+			srcYCtr = scaleYCtr / SCALE_THRESHOLD;
 		}
 	}
 	if (args.yStart + yCtrHeight > args.destArea.h) { // Clip the bottom
@@ -142,8 +141,8 @@ void BITMAP::drawInnerGeneric(DrawInnerArgs &args) {
 	                       args.vertFlip ? args.srcArea.bottom - 1 - yCtr :
 	                       args.srcArea.top + yCtr);
 	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
-		if (ScaleThreshold != 0) {
-			int newSrcYCtr = scaleYCtr / ScaleThreshold;
+		if (Scale) {
+			int newSrcYCtr = scaleYCtr / SCALE_THRESHOLD;
 			if (srcYCtr != newSrcYCtr) {
 				int diffSrcYCtr = newSrcYCtr - srcYCtr;
 				srcP += args.src.pitch * diffSrcYCtr;
@@ -153,8 +152,8 @@ void BITMAP::drawInnerGeneric(DrawInnerArgs &args) {
 		// Loop through the pixels of the row
 		for (int destX = args.xStart, xCtr = xCtrStart, xCtrBpp = xCtrBppStart, scaleXCtr = xCtr * args.scaleX; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += SrcBytesPerPixel, scaleXCtr += args.scaleX) {
 			const byte *srcVal = srcP + xDir * xCtrBpp;
-			if (ScaleThreshold != 0) {
-				srcVal = srcP + (scaleXCtr / ScaleThreshold) * SrcBytesPerPixel;
+			if (Scale) {
+				srcVal = srcP + (scaleXCtr / SCALE_THRESHOLD) * SrcBytesPerPixel;
 			}
 			uint32 srcCol = getColor(srcVal, SrcBytesPerPixel);
 
@@ -233,7 +232,7 @@ void BITMAP::drawInnerGeneric(DrawInnerArgs &args) {
 		}
 
 		destP += args.destArea.pitch;
-		if (ScaleThreshold == 0) srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
+		if (!Scale) srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
 	}
 }
 
@@ -312,34 +311,34 @@ void BITMAP::draw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
 	if (_G(simd_flags) == AGS3::Globals::SIMD_NONE) {
 		if (sameFormat) {
 			switch (format.bytesPerPixel) {
-			case 1: DRAWINNER((drawInnerGeneric<1, 1, 0>)); return;
-			case 2: DRAWINNER((drawInnerGeneric<2, 2, 0>)); return;
-			case 4: DRAWINNER((drawInnerGeneric<4, 4, 0>)); return;
+			case 1: DRAWINNER((drawInnerGeneric<1, 1, false>)); return;
+			case 2: DRAWINNER((drawInnerGeneric<2, 2, false>)); return;
+			case 4: DRAWINNER((drawInnerGeneric<4, 4, false>)); return;
 			}
 		} else if (format.bytesPerPixel == 4 && src.format.bytesPerPixel == 2) { 
-			DRAWINNER((drawInnerGeneric<4, 2, 0>));
+			DRAWINNER((drawInnerGeneric<4, 2, false>));
 		} else if (format.bytesPerPixel == 2 && src.format.bytesPerPixel == 4) {
-			DRAWINNER((drawInnerGeneric<2, 4, 0>));
+			DRAWINNER((drawInnerGeneric<2, 4, false>));
 		}
 	} else {
 		if (sameFormat) {
 			switch (format.bytesPerPixel) {
-			case 1: DRAWINNER(drawInner1Bpp<0>); return;
-			case 2: DRAWINNER(drawInner2Bpp<0>); return;
-			case 4: DRAWINNER((drawInner4BppWithConv<4, 4, 0>)); return;
+			case 1: DRAWINNER(drawInner1Bpp<false>); return;
+			case 2: DRAWINNER(drawInner2Bpp<false>); return;
+			case 4: DRAWINNER((drawInner4BppWithConv<4, 4, false>)); return;
 			}
 		} else if (format.bytesPerPixel == 4 && src.format.bytesPerPixel == 2) { 
-			DRAWINNER((drawInner4BppWithConv<4, 2, 0>));
+			DRAWINNER((drawInner4BppWithConv<4, 2, false>));
 			return;
 		} else if (format.bytesPerPixel == 2 && src.format.bytesPerPixel == 4) {
-			DRAWINNER((drawInner4BppWithConv<2, 4, 0>));
+			DRAWINNER((drawInner4BppWithConv<2, 4, false>));
 			return;
 		}
 	}
 	if (format.bytesPerPixel == 4) // src.bytesPerPixel must be 1 here
-		DRAWINNER((drawInnerGeneric<4, 1, 0>));
+		DRAWINNER((drawInnerGeneric<4, 1, false>));
 	else
-		DRAWINNER((drawInnerGeneric<2, 1, 0>));
+		DRAWINNER((drawInnerGeneric<2, 1, false>));
 #undef DRAWINNER
 }
 
@@ -395,34 +394,34 @@ void BITMAP::stretchDraw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
 	if (_G(simd_flags) == AGS3::Globals::SIMD_NONE) {
 		if (sameFormat) {
 			switch (format.bytesPerPixel) {
-			case 1: DRAWINNER((drawInnerGeneric<1, 1, SCALE_THRESHOLD>)); return;
-			case 2: DRAWINNER((drawInnerGeneric<2, 2, SCALE_THRESHOLD>)); return;
-			case 4: DRAWINNER((drawInnerGeneric<4, 4, SCALE_THRESHOLD>)); return;
+			case 1: DRAWINNER((drawInnerGeneric<1, 1, true>)); return;
+			case 2: DRAWINNER((drawInnerGeneric<2, 2, true>)); return;
+			case 4: DRAWINNER((drawInnerGeneric<4, 4, true>)); return;
 			}
 		} else if (format.bytesPerPixel == 4 && src.format.bytesPerPixel == 2) { 
-			DRAWINNER((drawInnerGeneric<4, 2, SCALE_THRESHOLD>));
+			DRAWINNER((drawInnerGeneric<4, 2, true>));
 		} else if (format.bytesPerPixel == 2 && src.format.bytesPerPixel == 4) {
-			DRAWINNER((drawInnerGeneric<2, 4, SCALE_THRESHOLD>));
+			DRAWINNER((drawInnerGeneric<2, 4, true>));
 		}
 	} else {
 		if (sameFormat) {
 			switch (format.bytesPerPixel) {
-			case 1: DRAWINNER(drawInner1Bpp<SCALE_THRESHOLD>); return;
-			case 2: DRAWINNER(drawInner2Bpp<SCALE_THRESHOLD>); return;
-			case 4: DRAWINNER((drawInner4BppWithConv<4, 4, SCALE_THRESHOLD>)); return;
+			case 1: DRAWINNER(drawInner1Bpp<true>); return;
+			case 2: DRAWINNER(drawInner2Bpp<true>); return;
+			case 4: DRAWINNER((drawInner4BppWithConv<4, 4, true>)); return;
 			}
 		} else if (format.bytesPerPixel == 4 && src.format.bytesPerPixel == 2) { 
-			DRAWINNER((drawInner4BppWithConv<4, 2, SCALE_THRESHOLD>));
+			DRAWINNER((drawInner4BppWithConv<4, 2, true>));
 			return;
 		} else if (format.bytesPerPixel == 2 && src.format.bytesPerPixel == 4) {
-			DRAWINNER((drawInner4BppWithConv<2, 4, SCALE_THRESHOLD>));
+			DRAWINNER((drawInner4BppWithConv<2, 4, true>));
 			return;
 		}
 	}
 	if (format.bytesPerPixel == 4) // src.bytesPerPixel must be 1 here
-		DRAWINNER((drawInnerGeneric<4, 1, SCALE_THRESHOLD>));
+		DRAWINNER((drawInnerGeneric<4, 1, true>));
 	else
-		DRAWINNER((drawInnerGeneric<2, 1, SCALE_THRESHOLD>));
+		DRAWINNER((drawInnerGeneric<2, 1, true>));
 #undef DRAWINNER
 }
 void BITMAP::blendPixel(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &aDest, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha, bool useTint, byte *destVal) const {
diff --git a/engines/ags/lib/allegro/surface.h b/engines/ags/lib/allegro/surface.h
index 81fd4e4fa47..4ebe699d8bb 100644
--- a/engines/ags/lib/allegro/surface.h
+++ b/engines/ags/lib/allegro/surface.h
@@ -265,6 +265,8 @@ public:
 	// kTintBlenderMode and kTintLightBlenderMode
 	void blendTintSprite(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &aDest, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha, bool light) const;
 
+	constexpr static int SCALE_THRESHOLD_BITS = 8;
+	constexpr static int SCALE_THRESHOLD = 1 << SCALE_THRESHOLD_BITS;
 	struct DrawInnerArgs {
 		bool useTint, sameFormat, horizFlip, vertFlip, skipTrans, doScale;
 		int xStart, yStart, srcAlpha, tintRed, tintGreen, tintBlue, scaleX, scaleY;
@@ -280,13 +282,13 @@ public:
 		DrawInnerArgs(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, int useTint, int sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, int horizFlip, int vertFlip, int skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY);
 	};
 
-	template<int DestBytesPerPixel, int SrcBytesPerPixel, int ScaleThreshold>
+	template<int DestBytesPerPixel, int SrcBytesPerPixel, bool Scale>
 	void drawInner4BppWithConv(DrawInnerArgs &args);
-	template<int ScaleThreshold>
+	template<bool Scale>
 	void drawInner2Bpp(DrawInnerArgs &args);
-	template<int ScaleThreshold>
+	template<bool Scale>
 	void drawInner1Bpp(DrawInnerArgs &args);
-	template<int DestBytesPerPixel, int SrcBytesPerPixel, int ScaleThreshold>
+	template<int DestBytesPerPixel, int SrcBytesPerPixel, bool Scale>
 	void drawInnerGeneric(DrawInnerArgs &args);
 	
 	inline uint32 getColor(const byte *data, byte bpp) const {
diff --git a/engines/ags/lib/allegro/surface_simd_neon.cpp b/engines/ags/lib/allegro/surface_simd_neon.cpp
index 81bf428baf1..79d2f57bc0c 100644
--- a/engines/ags/lib/allegro/surface_simd_neon.cpp
+++ b/engines/ags/lib/allegro/surface_simd_neon.cpp
@@ -13,7 +13,7 @@
 namespace AGS3 {
 
 // This template handles 2bpp and 4bpp, the other specializations handle 1bpp and format conversion blits
-template<int DestBytesPerPixel, int SrcBytesPerPixel, int ScaleThreshold>
+template<int DestBytesPerPixel, int SrcBytesPerPixel, bool Scale>
 void BITMAP::drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
 	const int xDir = args.horizFlip ? -1 : 1;
 	byte rSrc, gSrc, bSrc, aSrc;
@@ -45,13 +45,13 @@ void BITMAP::drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
 		args.xStart = 0;
 	}
 	int destY = args.yStart, srcYCtr = 0, yCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 4 == 0) ? args.dstRect.height() : (args.dstRect.height() - 1);
-	if (ScaleThreshold != 0) yCtrHeight = args.dstRect.height();
+	if (Scale) yCtrHeight = args.dstRect.height();
 	if (args.yStart < 0) {
 		yCtr = -args.yStart;
 		destY = 0;
-		if (ScaleThreshold != 0) {
+		if (Scale) {
 			scaleYCtr = yCtr * args.scaleY;
-			srcYCtr = scaleYCtr / ScaleThreshold;
+			srcYCtr = scaleYCtr / SCALE_THRESHOLD;
 		}
 	}
 	if (args.yStart + yCtrHeight > args.destArea.h) {
@@ -65,8 +65,7 @@ void BITMAP::drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
 	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
 		uint32x4_t xCtrWidthSIMD = vdupq_n_u32(xCtrWidth); // This is the width of the row
 
-		if (ScaleThreshold == 0) {
-			// If we are not scaling the image
+		if (!Scale) {
 			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
 				byte *destPtr = &destP[destX * DestBytesPerPixel];
 				// Skip pixels that are beyond the row
@@ -78,7 +77,7 @@ void BITMAP::drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
 			srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
 		} else {
 			// Here we are scaling the image
-			int newSrcYCtr = scaleYCtr / ScaleThreshold;
+			int newSrcYCtr = scaleYCtr / SCALE_THRESHOLD;
 			// Since the source yctr might not update every row of the destination, we have
 			// to see if we are on a new row...
 			if (srcYCtr != newSrcYCtr) {
@@ -94,12 +93,8 @@ void BITMAP::drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
 			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart, scaleXCtr = xCtrStart * args.scaleX; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
 				if (yCtr + 1 == yCtrHeight && xCtr + 4 > xCtrWidth) break; // Don't go past the last 4 pixels
 				uint32x4_t indexes = vdupq_n_u32(scaleXCtr);
-#if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
 				// Calculate in parallel the indexes of the pixels
-				indexes = vmulq_n_u32(vshrq_n_u32(vaddq_u32(indexes, scaleAdds), 8), SrcBytesPerPixel);
-#else
-#error Change code to allow different scale threshold!
-#endif
+				indexes = vmulq_n_u32(vshrq_n_u32(vaddq_u32(indexes, scaleAdds), SCALE_THRESHOLD_BITS), SrcBytesPerPixel);
 				// Simply memcpy them in. memcpy has no real performance overhead here
 				memcpy(&srcBuffer[0*(uintptr_t)SrcBytesPerPixel], srcP + vgetq_lane_u32(indexes, 0), SrcBytesPerPixel);
 				memcpy(&srcBuffer[1*(uintptr_t)SrcBytesPerPixel], srcP + vgetq_lane_u32(indexes, 1), SrcBytesPerPixel);
@@ -132,7 +127,7 @@ void BITMAP::drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
 	// Drawing the last few not scaled pixels here.
 	// Same as the loop above but now we check if we are going to overflow,
 	// and thus we don't need to mask out pixels that go over the row.
-	if (ScaleThreshold == 0) {
+	if (!Scale) {
 		for (; xCtr + 4 < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
 			byte *destPtr = &destP[(ptrdiff_t)destX * DestBytesPerPixel];
 			drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, vmovq_n_u32(0));
@@ -150,8 +145,8 @@ void BITMAP::drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
 	// For the last 4 pixels, we just do them in serial, nothing special
 	for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += SrcBytesPerPixel) {
 		const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
-		if (ScaleThreshold != 0) {
-			srcColPtr = (const byte *)(srcP + (xCtr * args.scaleX) / ScaleThreshold * SrcBytesPerPixel);
+		if (Scale) {
+			srcColPtr = (const byte *)(srcP + (xCtr * args.scaleX) / SCALE_THRESHOLD * SrcBytesPerPixel);
 		}
 		byte *destVal = (byte *)&destP[destX * DestBytesPerPixel];
 		uint32 srcCol = getColor(srcColPtr, SrcBytesPerPixel);
@@ -184,7 +179,7 @@ void BITMAP::drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
 	}
 }
 
-template<int ScaleThreshold>
+template<bool Scale>
 void BITMAP::drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
 	const int xDir = args.horizFlip ? -1 : 1;
 	byte rSrc, gSrc, bSrc, aSrc;
@@ -213,13 +208,13 @@ void BITMAP::drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
 		args.xStart = 0;
 	}
 	int destY = args.yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 8 == 0) ? args.dstRect.height() : (args.dstRect.height() - 1);
-	if (ScaleThreshold != 0) yCtrHeight = args.dstRect.height();
+	if (Scale) yCtrHeight = args.dstRect.height();
 	if (args.yStart < 0) {
 		yCtr = -args.yStart;
 		destY = 0;
-		if (ScaleThreshold != 0) {
+		if (Scale) {
 			scaleYCtr = yCtr * args.scaleY;
-			srcYCtr = scaleYCtr / ScaleThreshold;
+			srcYCtr = scaleYCtr / SCALE_THRESHOLD;
 		}
 	}
 	if (args.yStart + yCtrHeight > args.destArea.h) {
@@ -232,7 +227,7 @@ void BITMAP::drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
 	                       args.vertFlip ? args.srcArea.bottom - 1 - yCtr : args.srcArea.top + yCtr);
 	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
 		uint16x8_t xCtrWidthSIMD = vmovq_n_u16(xCtrWidth); // This is the width of the row
-		if (ScaleThreshold == 0) {
+		if (!Scale) {
 			// If we are not scaling the image
 			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
 				byte *destPtr = &destP[destX * 2];
@@ -245,7 +240,7 @@ void BITMAP::drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
 			srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
 		} else {
 			// Here we are scaling the image
-			int newSrcYCtr = scaleYCtr / ScaleThreshold;
+			int newSrcYCtr = scaleYCtr / SCALE_THRESHOLD;
 			// Since the source yctr might not update every row of the destination, we have
 			// to see if we are on a new row...
 			if (srcYCtr != newSrcYCtr) {
@@ -261,13 +256,9 @@ void BITMAP::drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
 			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart, scaleXCtr = xCtrStart * args.scaleX; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
 				if (yCtr + 1 == yCtrHeight && xCtr + 8 > xCtrWidth) break;
 				uint32x4_t indexes = vdupq_n_u32(scaleXCtr), indexes2 = vdupq_n_u32(scaleXCtr);
-#if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
 				// Calculate in parallel the indexes of the pixels
-				indexes = vmulq_n_u32(vshrq_n_u32(vaddq_u32(indexes, scaleAdds), 8), 2);
-				indexes2 = vmulq_n_u32(vshrq_n_u32(vaddq_u32(indexes2, scaleAdds2), 8), 2);
-#else
-#error Change code to allow different scale threshold!
-#endif
+				indexes = vmulq_n_u32(vshrq_n_u32(vaddq_u32(indexes, scaleAdds), SCALE_THRESHOLD_BITS), 2);
+				indexes2 = vmulq_n_u32(vshrq_n_u32(vaddq_u32(indexes2, scaleAdds2), SCALE_THRESHOLD_BITS), 2);
 				// Simply memcpy them in. memcpy has no real performance overhead here
 				srcBuffer[0] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes, 0));
 				srcBuffer[1] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes, 1));
@@ -304,7 +295,7 @@ void BITMAP::drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
 	// Drawing the last few not scaled pixels here.
 	// Same as the loop above but now we check if we are going to overflow,
 	// and thus we don't need to mask out pixels that go over the row.
-	if (ScaleThreshold == 0) {
+	if (!Scale) {
 		for (; xCtr + 8 < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
 			byte *destPtr = &destP[destX * 2];
 			drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, vmovq_n_u16(0));
@@ -322,8 +313,8 @@ void BITMAP::drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
 	// For the last 4 pixels, we just do them in serial, nothing special
 	for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += 2) {
 		const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
-		if (ScaleThreshold != 0) {
-			srcColPtr = (const byte *)(srcP + (xCtr * args.scaleX) / ScaleThreshold * 2);
+		if (Scale) {
+			srcColPtr = (const byte *)(srcP + (xCtr * args.scaleX) / SCALE_THRESHOLD * 2);
 		}
 		byte *destVal = (byte *)&destP[destX * 2];
 		uint32 srcCol = (uint32)(*(const uint16 *)srcColPtr);
@@ -355,7 +346,7 @@ void BITMAP::drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
 	}
 }
 
-template<int ScaleThreshold>
+template<bool Scale>
 void BITMAP::drawInner1Bpp(BITMAP::DrawInnerArgs &args) {
 	const int xDir = args.horizFlip ? -1 : 1;
 	uint8x16_t transColors = vld1q_dup_u8(&args.transColor);
@@ -377,13 +368,13 @@ void BITMAP::drawInner1Bpp(BITMAP::DrawInnerArgs &args) {
 		args.xStart = 0;
 	}
 	int destY = args.yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = args.dstRect.height();
-	if (ScaleThreshold != 0) yCtrHeight = args.dstRect.height();
+	if (Scale) yCtrHeight = args.dstRect.height();
 	if (args.yStart < 0) {
 		yCtr = -args.yStart;
 		destY = 0;
-		if (ScaleThreshold != 0) {
+		if (Scale) {
 			scaleYCtr = yCtr * args.scaleY;
-			srcYCtr = scaleYCtr / ScaleThreshold;
+			srcYCtr = scaleYCtr / SCALE_THRESHOLD;
 		}
 	}
 	if (args.yStart + yCtrHeight > args.destArea.h) {
@@ -395,10 +386,10 @@ void BITMAP::drawInner1Bpp(BITMAP::DrawInnerArgs &args) {
 	                       args.horizFlip ? args.srcArea.right - 16 : args.srcArea.left,
 	                       args.vertFlip ? args.srcArea.bottom - 1 - yCtr : args.srcArea.top + yCtr);
 	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
-		if (ScaleThreshold != 0) {
+		if (Scale) {
 			// So here we update the srcYCtr differently due to this being for
 			// scaling
-			int newSrcYCtr = scaleYCtr / ScaleThreshold;
+			int newSrcYCtr = scaleYCtr / SCALE_THRESHOLD;
 			if (srcYCtr != newSrcYCtr) {
 				// Since the source yctr might not update every row of the destination, we have
 				// to see if we are on a new row...
@@ -415,18 +406,14 @@ void BITMAP::drawInner1Bpp(BITMAP::DrawInnerArgs &args) {
 			// can't have any blending applied to them
 			uint8x16_t destCols = vld1q_u8(destPtr);
 			uint8x16_t srcCols = vld1q_u8(srcP + xDir * xCtr);
-			if (ScaleThreshold != 0) {
+			if (Scale) {
 				// If we are scaling, we have to set each pixel individually
 				uint32x4_t indexes1 = vdupq_n_u32(scaleXCtr), indexes2 = vdupq_n_u32(scaleXCtr);
 				uint32x4_t indexes3 = vdupq_n_u32(scaleXCtr), indexes4 = vdupq_n_u32(scaleXCtr);
-#if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
-				indexes1 = vshrq_n_u32(vaddq_u32(indexes1, scaleAdds1), 8);
-				indexes2 = vshrq_n_u32(vaddq_u32(indexes2, scaleAdds2), 8);
-				indexes3 = vshrq_n_u32(vaddq_u32(indexes3, scaleAdds3), 8);
-				indexes4 = vshrq_n_u32(vaddq_u32(indexes4, scaleAdds4), 8);
-#else
-#error Change code to allow different scale threshold!
-#endif
+				indexes1 = vshrq_n_u32(vaddq_u32(indexes1, scaleAdds1), SCALE_THRESHOLD_BITS);
+				indexes2 = vshrq_n_u32(vaddq_u32(indexes2, scaleAdds2), SCALE_THRESHOLD_BITS);
+				indexes3 = vshrq_n_u32(vaddq_u32(indexes3, scaleAdds3), SCALE_THRESHOLD_BITS);
+				indexes4 = vshrq_n_u32(vaddq_u32(indexes4, scaleAdds4), SCALE_THRESHOLD_BITS);
 				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes1, 0)], srcCols, 0);
 				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes1, 1)], srcCols, 1);
 				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes1, 2)], srcCols, 2);
@@ -462,8 +449,8 @@ void BITMAP::drawInner1Bpp(BITMAP::DrawInnerArgs &args) {
 		if (args.horizFlip) srcP += 15;
 		for (; xCtr < xCtrWidth; ++destX, ++xCtr, scaleXCtr += args.scaleX) {
 			const byte *srcCol = (const byte *)(srcP + xDir * xCtr);
-			if (ScaleThreshold != 0) {
-				srcCol = (const byte *)(srcP + scaleXCtr / ScaleThreshold);
+			if (Scale) {
+				srcCol = (const byte *)(srcP + scaleXCtr / SCALE_THRESHOLD);
 			}
 			// Check if this is a transparent color we should skip
 			if (args.skipTrans && *srcCol == args.transColor)
@@ -475,21 +462,21 @@ void BITMAP::drawInner1Bpp(BITMAP::DrawInnerArgs &args) {
 		if (args.horizFlip) srcP -= 15; // Undo what we did up there
 		destP += args.destArea.pitch; // Goto next row
 		// Only advance the src row by 1 every time like this if we don't scale
-		if (ScaleThreshold == 0) srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
+		if (!Scale) srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
 	}
 }
 
 
-template void BITMAP::drawInner4BppWithConv<4, 4, 0>(DrawInnerArgs &args);
-template void BITMAP::drawInner4BppWithConv<4, 4, 0x100>(DrawInnerArgs &args);
-template void BITMAP::drawInner4BppWithConv<4, 2, 0>(DrawInnerArgs &args);
-template void BITMAP::drawInner4BppWithConv<4, 2, 0x100>(DrawInnerArgs &args);
-template void BITMAP::drawInner4BppWithConv<2, 4, 0>(DrawInnerArgs &args);
-template void BITMAP::drawInner4BppWithConv<2, 4, 0x100>(DrawInnerArgs &args);
-template void BITMAP::drawInner2Bpp<0>(DrawInnerArgs &args);
-template void BITMAP::drawInner2Bpp<0x100>(DrawInnerArgs &args);
-template void BITMAP::drawInner1Bpp<0>(DrawInnerArgs &args);
-template void BITMAP::drawInner1Bpp<0x100>(DrawInnerArgs &args);
+template void BITMAP::drawInner4BppWithConv<4, 4, false>(DrawInnerArgs &args);
+template void BITMAP::drawInner4BppWithConv<4, 4, true>(DrawInnerArgs &args);
+template void BITMAP::drawInner4BppWithConv<4, 2, false>(DrawInnerArgs &args);
+template void BITMAP::drawInner4BppWithConv<4, 2, true>(DrawInnerArgs &args);
+template void BITMAP::drawInner4BppWithConv<2, 4, false>(DrawInnerArgs &args);
+template void BITMAP::drawInner4BppWithConv<2, 4, true>(DrawInnerArgs &args);
+template void BITMAP::drawInner2Bpp<false>(DrawInnerArgs &args);
+template void BITMAP::drawInner2Bpp<true>(DrawInnerArgs &args);
+template void BITMAP::drawInner1Bpp<false>(DrawInnerArgs &args);
+template void BITMAP::drawInner1Bpp<true>(DrawInnerArgs &args);
 
 } // namespace AGS3
 
diff --git a/engines/ags/lib/allegro/surface_simd_sse.cpp b/engines/ags/lib/allegro/surface_simd_sse.cpp
index 2d450acbb68..212a19011cb 100644
--- a/engines/ags/lib/allegro/surface_simd_sse.cpp
+++ b/engines/ags/lib/allegro/surface_simd_sse.cpp
@@ -25,7 +25,7 @@ inline uint32 extract32_idx3(__m128i x) {
 }
 
 // This template handles 2bpp and 4bpp, the other specializations handle 1bpp and format conversion blits
-template<int DestBytesPerPixel, int SrcBytesPerPixel, int ScaleThreshold>
+template<int DestBytesPerPixel, int SrcBytesPerPixel, bool Scale>
 void BITMAP::drawInner4BppWithConv(DrawInnerArgs &args) {
 	const int xDir = args.horizFlip ? -1 : 1;
 	byte rSrc, gSrc, bSrc, aSrc;
@@ -57,13 +57,13 @@ void BITMAP::drawInner4BppWithConv(DrawInnerArgs &args) {
 		args.xStart = 0;
 	}
 	int destY = args.yStart, srcYCtr = 0, yCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 4 == 0) ? args.dstRect.height() : (args.dstRect.height() - 1);
-	if (ScaleThreshold != 0) yCtrHeight = args.dstRect.height();
+	if (Scale) yCtrHeight = args.dstRect.height();
 	if (args.yStart < 0) {
 		yCtr = -args.yStart;
 		destY = 0;
-		if (ScaleThreshold != 0) {
+		if (Scale) {
 			scaleYCtr = yCtr * args.scaleY;
-			srcYCtr = scaleYCtr / ScaleThreshold;
+			srcYCtr = scaleYCtr / SCALE_THRESHOLD;
 		}
 	}
 	if (args.yStart + yCtrHeight > args.destArea.h) {
@@ -77,7 +77,7 @@ void BITMAP::drawInner4BppWithConv(DrawInnerArgs &args) {
 	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
 		__m128i xCtrWidthSIMD = _mm_set1_epi32(xCtrWidth); // This is the width of the row
 
-		if (ScaleThreshold == 0) {
+		if (!Scale) {
 			// If we are not scaling the image
 			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
 				byte *destPtr = &destP[destX * DestBytesPerPixel];
@@ -90,7 +90,7 @@ void BITMAP::drawInner4BppWithConv(DrawInnerArgs &args) {
 			srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
 		} else {
 			// Here we are scaling the image
-			int newSrcYCtr = scaleYCtr / ScaleThreshold;
+			int newSrcYCtr = scaleYCtr / SCALE_THRESHOLD;
 			// Since the source yctr might not update every row of the destination, we have
 			// to see if we are on a new row...
 			if (srcYCtr != newSrcYCtr) {
@@ -106,15 +106,11 @@ void BITMAP::drawInner4BppWithConv(DrawInnerArgs &args) {
 			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart, scaleXCtr = xCtrStart * args.scaleX; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
 				if (yCtr + 1 == yCtrHeight && xCtr + 4 > xCtrWidth) break; // Don't go past the last 4 pixels
 				__m128i indexes = _mm_set1_epi32(scaleXCtr);
-#if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
 				// Calculate in parallel the indexes of the pixels
 				if (SrcBytesPerPixel == 4)
-					indexes = _mm_slli_epi32(_mm_srli_epi32(_mm_add_epi32(indexes, scaleAdds), 8), 2);
+					indexes = _mm_slli_epi32(_mm_srli_epi32(_mm_add_epi32(indexes, scaleAdds), SCALE_THRESHOLD_BITS), 2);
 				else
-					indexes = _mm_slli_epi32(_mm_srli_epi32(_mm_add_epi32(indexes, scaleAdds), 8), 1);
-#else
-#error Change code to allow different scale threshold!
-#endif
+					indexes = _mm_slli_epi32(_mm_srli_epi32(_mm_add_epi32(indexes, scaleAdds), SCALE_THRESHOLD_BITS), 1);
 				// Simply memcpy them in. memcpy has no real performance overhead here
 				memcpy(&srcBuffer[0*(size_t)SrcBytesPerPixel], srcP + extract32_idx0(indexes), SrcBytesPerPixel);
 				memcpy(&srcBuffer[1*(size_t)SrcBytesPerPixel], srcP + extract32_idx1(indexes), SrcBytesPerPixel);
@@ -147,7 +143,7 @@ void BITMAP::drawInner4BppWithConv(DrawInnerArgs &args) {
 	// Drawing the last few not scaled pixels here.
 	// Same as the loop above but now we check if we are going to overflow,
 	// and thus we don't need to mask out pixels that go over the row.
-	if (ScaleThreshold == 0) {
+	if (!Scale) {
 		for (; xCtr + 4 < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
 			byte *destPtr = &destP[(ptrdiff_t)destX * DestBytesPerPixel];
 			drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, _mm_setzero_si128());
@@ -165,8 +161,8 @@ void BITMAP::drawInner4BppWithConv(DrawInnerArgs &args) {
 	// For the last 4 pixels, we just do them in serial, nothing special
 	for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += SrcBytesPerPixel) {
 		const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
-		if (ScaleThreshold != 0) {
-			srcColPtr = (const byte *)(srcP + (xCtr * args.scaleX) / ScaleThreshold * SrcBytesPerPixel);
+		if (Scale) {
+			srcColPtr = (const byte *)(srcP + (xCtr * args.scaleX) / SCALE_THRESHOLD * SrcBytesPerPixel);
 		}
 		byte *destVal = (byte *)&destP[destX * DestBytesPerPixel];
 		uint32 srcCol = getColor(srcColPtr, SrcBytesPerPixel);
@@ -228,13 +224,13 @@ void BITMAP::drawInner2Bpp(DrawInnerArgs &args) {
 		args.xStart = 0;
 	}
 	int destY = args.yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 8 == 0) ? args.dstRect.height() : (args.dstRect.height() - 1);
-	if (ScaleThreshold != 0) yCtrHeight = args.dstRect.height();
+	if (Scale) yCtrHeight = args.dstRect.height();
 	if (args.yStart < 0) {
 		yCtr = -args.yStart;
 		destY = 0;
-		if (ScaleThreshold != 0) {
+		if (Scale) {
 			scaleYCtr = yCtr * args.scaleY;
-			srcYCtr = scaleYCtr / ScaleThreshold;
+			srcYCtr = scaleYCtr / SCALE_THRESHOLD;
 		}
 	}
 	if (args.yStart + yCtrHeight > args.destArea.h) {
@@ -247,7 +243,7 @@ void BITMAP::drawInner2Bpp(DrawInnerArgs &args) {
 	                       args.vertFlip ? args.srcArea.bottom - 1 - yCtr : args.srcArea.top + yCtr);
 	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
 		__m128i xCtrWidthSIMD = _mm_set1_epi16(xCtrWidth); // This is the width of the row
-		if (ScaleThreshold == 0) {
+		if (!Scale) {
 			// If we are not scaling the image
 			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
 				byte *destPtr = &destP[destX * 2];
@@ -260,7 +256,7 @@ void BITMAP::drawInner2Bpp(DrawInnerArgs &args) {
 			srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
 		} else {
 			// Here we are scaling the image
-			int newSrcYCtr = scaleYCtr / ScaleThreshold;
+			int newSrcYCtr = scaleYCtr / SCALE_THRESHOLD;
 			// Since the source yctr might not update every row of the destination, we have
 			// to see if we are on a new row...
 			if (srcYCtr != newSrcYCtr) {
@@ -276,13 +272,9 @@ void BITMAP::drawInner2Bpp(DrawInnerArgs &args) {
 			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart, scaleXCtr = xCtrStart * args.scaleX; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
 				if (yCtr + 1 == yCtrHeight && xCtr + 8 > xCtrWidth) break;
 				__m128i indexes = _mm_set1_epi32(scaleXCtr), indexes2 = _mm_set1_epi32(scaleXCtr);
-#if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
 				// Calculate in parallel the indexes of the pixels
-				indexes = _mm_slli_epi32(_mm_srli_epi32(_mm_add_epi32(indexes, scaleAdds), 8), 1);
-				indexes2 = _mm_slli_epi32(_mm_srli_epi32(_mm_add_epi32(indexes2, scaleAdds2), 8), 1);
-#else
-#error Change code to allow different scale threshold!
-#endif
+				indexes = _mm_slli_epi32(_mm_srli_epi32(_mm_add_epi32(indexes, scaleAdds), SCALE_THRESHOLD_BITS), 1);
+				indexes2 = _mm_slli_epi32(_mm_srli_epi32(_mm_add_epi32(indexes2, scaleAdds2), SCALE_THRESHOLD_BITS), 1);
 				// Simply memcpy them in. memcpy has no real performance overhead here
 				srcBuffer[0] = *(const uint16 *)(srcP + extract32_idx0(indexes));
 				srcBuffer[1] = *(const uint16 *)(srcP + extract32_idx1(indexes));
@@ -319,7 +311,7 @@ void BITMAP::drawInner2Bpp(DrawInnerArgs &args) {
 	// Drawing the last few not scaled pixels here.
 	// Same as the loop above but now we check if we are going to overflow,
 	// and thus we don't need to mask out pixels that go over the row.
-	if (ScaleThreshold == 0) {
+	if (!Scale) {
 		for (; xCtr + 8 < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
 			byte *destPtr = &destP[destX * 2];
 			drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, _mm_setzero_si128());
@@ -337,8 +329,8 @@ void BITMAP::drawInner2Bpp(DrawInnerArgs &args) {
 	// For the last 4 pixels, we just do them in serial, nothing special
 	for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += 2) {
 		const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
-		if (ScaleThreshold != 0) {
-			srcColPtr = (const byte *)(srcP + (xCtr * args.scaleX) / ScaleThreshold * 2);
+		if (Scale) {
+			srcColPtr = (const byte *)(srcP + (xCtr * args.scaleX) / SCALE_THRESHOLD * 2);
 		}
 		byte *destVal = (byte *)&destP[destX * 2];
 		uint32 srcCol = (uint32)(*(const uint16 *)srcColPtr);
@@ -370,7 +362,7 @@ void BITMAP::drawInner2Bpp(DrawInnerArgs &args) {
 	}
 }
 
-template<int ScaleThreshold>
+template<bool Scale>
 void BITMAP::drawInner1Bpp(DrawInnerArgs &args) {
 	const int xDir = args.horizFlip ? -1 : 1;
 	__m128i transColors = _mm_set1_epi16(args.transColor | (args.transColor << 8));
@@ -392,13 +384,13 @@ void BITMAP::drawInner1Bpp(DrawInnerArgs &args) {
 		args.xStart = 0;
 	}
 	int destY = args.yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = args.dstRect.height();
-	if (ScaleThreshold != 0) yCtrHeight = args.dstRect.height();
+	if (Scale) yCtrHeight = args.dstRect.height();
 	if (args.yStart < 0) {
 		yCtr = -args.yStart;
 		destY = 0;
-		if (ScaleThreshold != 0) {
+		if (Scale) {
 			scaleYCtr = yCtr * args.scaleY;
-			srcYCtr = scaleYCtr / ScaleThreshold;
+			srcYCtr = scaleYCtr / SCALE_THRESHOLD;
 		}
 	}
 	if (args.yStart + yCtrHeight > args.destArea.h) {
@@ -410,10 +402,10 @@ void BITMAP::drawInner1Bpp(DrawInnerArgs &args) {
 	                       args.horizFlip ? args.srcArea.right - 16 : args.srcArea.left,
 	                       args.vertFlip ? args.srcArea.bottom - 1 - yCtr : args.srcArea.top + yCtr);
 	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
-		if (ScaleThreshold != 0) {
+		if (Scale) {
 			// So here we update the srcYCtr differently due to this being for
 			// scaling
-			int newSrcYCtr = scaleYCtr / ScaleThreshold;
+			int newSrcYCtr = scaleYCtr / SCALE_THRESHOLD;
 			if (srcYCtr != newSrcYCtr) {
 				// Since the source yctr might not update every row of the destination, we have
 				// to see if we are on a new row...
@@ -430,19 +422,15 @@ void BITMAP::drawInner1Bpp(DrawInnerArgs &args) {
 			// can't have any blending applied to them
 			__m128i destCols = _mm_loadu_si128((const __m128i *)destPtr);
 			__m128i srcCols = _mm_loadu_si128((const __m128i *)(srcP + xDir * xCtr));
-			if (ScaleThreshold != 0) {
+			if (Scale) {
 				// If we are scaling, we have to set each pixel individually
 				__m128i indexes1 = _mm_set1_epi32(scaleXCtr), indexes2 = _mm_set1_epi32(scaleXCtr);
 				__m128i indexes3 = _mm_set1_epi32(scaleXCtr), indexes4 = _mm_set1_epi32(scaleXCtr);
-#if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
 				// Calculate in parallel the indexes of the pixels
-				indexes1 = _mm_srli_epi32(_mm_add_epi32(indexes1, scaleAdds1), 8);
-				indexes2 = _mm_srli_epi32(_mm_add_epi32(indexes2, scaleAdds2), 8);
-				indexes3 = _mm_srli_epi32(_mm_add_epi32(indexes3, scaleAdds3), 8);
-				indexes4 = _mm_srli_epi32(_mm_add_epi32(indexes4, scaleAdds4), 8);
-#else
-#error Change code to allow different scale threshold!
-#endif
+				indexes1 = _mm_srli_epi32(_mm_add_epi32(indexes1, scaleAdds1), SCALE_THRESHOLD_BITS);
+				indexes2 = _mm_srli_epi32(_mm_add_epi32(indexes2, scaleAdds2), SCALE_THRESHOLD_BITS);
+				indexes3 = _mm_srli_epi32(_mm_add_epi32(indexes3, scaleAdds3), SCALE_THRESHOLD_BITS);
+				indexes4 = _mm_srli_epi32(_mm_add_epi32(indexes4, scaleAdds4), SCALE_THRESHOLD_BITS);
 				srcCols = _mm_set_epi8(
 					srcP[extract32_idx3(indexes4)],
 					srcP[extract32_idx2(indexes4)],
@@ -482,8 +470,8 @@ void BITMAP::drawInner1Bpp(DrawInnerArgs &args) {
 		if (args.horizFlip) srcP += 15;
 		for (; xCtr < xCtrWidth; ++destX, ++xCtr, scaleXCtr += args.scaleX) {
 			const byte *srcCol = (const byte *)(srcP + xDir * xCtr);
-			if (ScaleThreshold != 0) {
-				srcCol = (const byte *)(srcP + scaleXCtr / ScaleThreshold);
+			if (Scale) {
+				srcCol = (const byte *)(srcP + scaleXCtr / SCALE_THRESHOLD);
 			}
 			// Check if this is a transparent color we should skip
 			if (args.skipTrans && *srcCol == args.transColor)
@@ -495,7 +483,7 @@ void BITMAP::drawInner1Bpp(DrawInnerArgs &args) {
 		if (args.horizFlip) srcP -= 15; // Undo what we did up there
 		destP += args.destArea.pitch; // Goto next row
 		// Only advance the src row by 1 every time like this if we don't scale
-		if (ScaleThreshold == 0) srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
+		if (!Scale) srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
 	}
 }
 


Commit: 1dfbaa35c5debcf24f41bcedb865da0ddf597671
    https://github.com/scummvm/scummvm/commit/1dfbaa35c5debcf24f41bcedb865da0ddf597671
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: GRAPHICS: Moved duplicate code to DrawInnerArgs

Changed paths:
    engines/ags/lib/allegro/surface.cpp
    engines/ags/lib/allegro/surface.h


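This commit moves the clipping and setup that draw() and stretchDraw() used to duplicate into the DrawInnerArgs constructor, which reports through a shouldDraw flag whether anything is left to render. A reduced sketch of the pattern (Rect, InnerArgs and draw are illustrative stand-ins, not the engine types):

    #include <algorithm>

    struct Rect {
        int left, top, right, bottom;
        int width() const { return right - left; }
        int height() const { return bottom - top; }
    };

    struct InnerArgs {
        bool shouldDraw = false;
        Rect clipped{};

        // Shared setup both callers used to repeat: clip once, then say
        // whether there is anything left to draw.
        InnerArgs(const Rect &dst, const Rect &clip) {
            clipped = Rect{std::max(dst.left, clip.left), std::max(dst.top, clip.top),
                           std::min(dst.right, clip.right), std::min(dst.bottom, clip.bottom)};
            if (clipped.width() <= 0 || clipped.height() <= 0)
                return;  // fully clipped out
            shouldDraw = true;
        }
    };

    static void draw(const Rect &dst, const Rect &clip) {
        InnerArgs args(dst, clip);
        if (!args.shouldDraw)
            return;  // the early-out both draw() and stretchDraw() now share
        // ... dispatch to a drawInner* routine with args ...
    }

    int main() {
        Rect screen{0, 0, 320, 200};
        draw(Rect{300, 150, 360, 230}, screen); // partially clipped, still drawn
        draw(Rect{400, 0, 440, 40}, screen);    // fully outside, constructor rejects it
        return 0;
    }

In the engine constructor the same spot also derives scaleX/scaleY from SCALE_THRESHOLD and the source/destination rectangles, so stretchDraw() differs from draw() only in the rectangle it passes and the doScale flag.
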
diff --git a/engines/ags/lib/allegro/surface.cpp b/engines/ags/lib/allegro/surface.cpp
index cf4a10a6334..473e7b0dee5 100644
--- a/engines/ags/lib/allegro/surface.cpp
+++ b/engines/ags/lib/allegro/surface.cpp
@@ -236,58 +236,48 @@ void BITMAP::drawInnerGeneric(DrawInnerArgs &args) {
 	}
 }
 
-BITMAP::DrawInnerArgs::DrawInnerArgs(int yStart, int xStart, uint32 transColor,
-	uint32 alphaMask, PALETTE palette, int useTint, int sameFormat,
-	const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea,
-	int horizFlip, int vertFlip, int skipTrans, int srcAlpha, int tintRed,
-	int tintGreen, int tintBlue, const Common::Rect &dstRect,
-	const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX,
-	int scaleY) : yStart(yStart), xStart(xStart), transColor(transColor),
-	alphaMask(alphaMask), palette(palette), useTint(useTint), sameFormat(sameFormat), src(src),
-	destArea(destArea), horizFlip(horizFlip), vertFlip(vertFlip),
-	skipTrans(skipTrans), srcAlpha(srcAlpha), tintRed(tintRed),
-	tintGreen(tintGreen), tintBlue(tintBlue), dstRect(dstRect),
-	srcArea(srcArea), blenderMode(blenderMode), scaleX(scaleX), scaleY(scaleY) {
-}
-
-void BITMAP::draw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
-                  int dstX, int dstY, bool horizFlip, bool vertFlip,
-                  bool skipTrans, int srcAlpha, int tintRed, int tintGreen,
-                  int tintBlue) {
-	assert(format.bytesPerPixel == 2 || format.bytesPerPixel == 4 ||
-	       (format.bytesPerPixel == 1 && srcBitmap->format.bytesPerPixel == 1));
-
+BITMAP::DrawInnerArgs::DrawInnerArgs(BITMAP *dstBitmap, const BITMAP *srcBitmap,
+	const Common::Rect &srcRect, const Common::Rect &_dstRect, bool _skipTrans,
+	int _srcAlpha, bool _horizFlip, bool _vertFlip, int _tintRed,
+	int _tintGreen, int _tintBlue, bool _doScale) : skipTrans(_skipTrans),
+		srcAlpha(_srcAlpha), horizFlip(_horizFlip), vertFlip(_vertFlip),
+		tintRed(_tintRed), tintGreen(_tintGreen), tintBlue(_tintBlue),
+		doScale(_doScale), src(**srcBitmap), shouldDraw(false),
+		useTint(_tintRed >= 0 && _tintGreen >= 0 && _tintBlue >= 0),
+		blenderMode(_G(_blender_mode)), dstRect(_dstRect) {
 	// Allegro disables draw when the clipping rect has negative width/height.
 	// Common::Rect instead asserts, which we don't want.
-	if (cr <= cl || cb <= ct)
+	if (dstBitmap->cr <= dstBitmap->cl || dstBitmap->cb <= dstBitmap->ct)
 		return;
 
-	// Ensure the src rect is constrained to the source bitmap
-	Common::Rect srcArea = srcRect;
+	// Figure out the dest area that will be updated
+	srcArea = srcRect;
 	srcArea.clip(Common::Rect(0, 0, srcBitmap->w, srcBitmap->h));
 	if (srcArea.isEmpty())
 		return;
-
-	// Figure out the dest area that will be updated
-	Common::Rect dstRect(dstX, dstY, dstX + srcArea.width(), dstY + srcArea.height());
+	
+	if (!doScale) {
+		// Ensure the src rect is constrained to the source bitmap
+		dstRect.setWidth(srcArea.width());
+		dstRect.setHeight(srcArea.height());
+	}
 	Common::Rect destRect = dstRect.findIntersectingRect(
-	                            Common::Rect(cl, ct, cr, cb));
+	                            Common::Rect(dstBitmap->cl, dstBitmap->ct, dstBitmap->cr, dstBitmap->cb));
 	if (destRect.isEmpty())
 		// Area is entirely outside the clipping area, so nothing to draw
 		return;
 
 	// Get source and dest surface. Note that for the destination we create
 	// a temporary sub-surface based on the allowed clipping area
-	const Graphics::ManagedSurface &src = **srcBitmap;
-	Graphics::ManagedSurface &dest = *_owner;
-	Graphics::Surface destArea = dest.getSubArea(destRect);
+	Graphics::ManagedSurface &dest = *dstBitmap->_owner;
+	destArea = dest.getSubArea(destRect);
 
 	// Define scaling and other stuff used by the drawing loops
-	bool useTint = (tintRed >= 0 && tintGreen >= 0 && tintBlue >= 0);
-	bool sameFormat = (src.format == format);
+	scaleX = SCALE_THRESHOLD * srcRect.width() / dstRect.width();
+	scaleY = SCALE_THRESHOLD * srcRect.height() / dstRect.height();
+	sameFormat = (src.format == dstBitmap->format);
 
-	PALETTE palette;
-	if (src.format.bytesPerPixel == 1 && format.bytesPerPixel != 1) {
+	if (src.format.bytesPerPixel == 1 && dstBitmap->format.bytesPerPixel != 1) {
 		for (int i = 0; i < PAL_SIZE; ++i) {
 			palette[i].r = VGA_COLOR_TRANS(_G(current_palette)[i].r);
 			palette[i].g = VGA_COLOR_TRANS(_G(current_palette)[i].g);
@@ -295,42 +285,52 @@ void BITMAP::draw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
 		}
 	}
 
-	uint32 transColor = 0, alphaMask = 0xff;
+	transColor = 0, alphaMask = 0xff;
 	if (skipTrans && src.format.bytesPerPixel != 1) {
 		transColor = src.format.ARGBToColor(0, 255, 0, 255);
 		alphaMask = src.format.ARGBToColor(255, 0, 0, 0);
 		alphaMask = ~alphaMask;
 	}
 
-	int xStart = (dstRect.left < destRect.left) ? dstRect.left - destRect.left : 0;
-	int yStart = (dstRect.top < destRect.top) ? dstRect.top - destRect.top : 0;
+	xStart = (dstRect.left < destRect.left) ? dstRect.left - destRect.left : 0;
+	yStart = (dstRect.top < destRect.top) ? dstRect.top - destRect.top : 0;
+	shouldDraw = true;
+}
 
-	auto args = DrawInnerArgs(yStart, xStart, transColor, alphaMask, palette, useTint, sameFormat, src, destArea, horizFlip, vertFlip, skipTrans, srcAlpha, tintRed, tintGreen, tintBlue, dstRect, srcArea, _G(_blender_mode), 0, 0);
+void BITMAP::draw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
+                  int dstX, int dstY, bool horizFlip, bool vertFlip,
+                  bool skipTrans, int srcAlpha, int tintRed, int tintGreen,
+                  int tintBlue) {
+	assert(format.bytesPerPixel == 2 || format.bytesPerPixel == 4 ||
+	       (format.bytesPerPixel == 1 && srcBitmap->format.bytesPerPixel == 1));
+
+	auto args = DrawInnerArgs(this, srcBitmap, srcRect, Common::Rect(dstX, dstY, dstX+1, dstY+1), skipTrans, srcAlpha, horizFlip, vertFlip, tintRed, tintGreen, tintBlue, false);
+	if (!args.shouldDraw) return;
 #define DRAWINNER(func) func(args)
 	// Calling drawInnerXXXX with a ScaleThreshold of 0 just does normal un-scaled drawing
 	if (_G(simd_flags) == AGS3::Globals::SIMD_NONE) {
-		if (sameFormat) {
+		if (args.sameFormat) {
 			switch (format.bytesPerPixel) {
 			case 1: DRAWINNER((drawInnerGeneric<1, 1, false>)); return;
 			case 2: DRAWINNER((drawInnerGeneric<2, 2, false>)); return;
 			case 4: DRAWINNER((drawInnerGeneric<4, 4, false>)); return;
 			}
-		} else if (format.bytesPerPixel == 4 && src.format.bytesPerPixel == 2) { 
+		} else if (format.bytesPerPixel == 4 && args.src.format.bytesPerPixel == 2) { 
 			DRAWINNER((drawInnerGeneric<4, 2, false>));
-		} else if (format.bytesPerPixel == 2 && src.format.bytesPerPixel == 4) {
+		} else if (format.bytesPerPixel == 2 && args.src.format.bytesPerPixel == 4) {
 			DRAWINNER((drawInnerGeneric<2, 4, false>));
 		}
 	} else {
-		if (sameFormat) {
+		if (args.sameFormat) {
 			switch (format.bytesPerPixel) {
 			case 1: DRAWINNER(drawInner1Bpp<false>); return;
 			case 2: DRAWINNER(drawInner2Bpp<false>); return;
 			case 4: DRAWINNER((drawInner4BppWithConv<4, 4, false>)); return;
 			}
-		} else if (format.bytesPerPixel == 4 && src.format.bytesPerPixel == 2) { 
+		} else if (format.bytesPerPixel == 4 && args.src.format.bytesPerPixel == 2) { 
 			DRAWINNER((drawInner4BppWithConv<4, 2, false>));
 			return;
-		} else if (format.bytesPerPixel == 2 && src.format.bytesPerPixel == 4) {
+		} else if (format.bytesPerPixel == 2 && args.src.format.bytesPerPixel == 4) {
 			DRAWINNER((drawInner4BppWithConv<2, 4, false>));
 			return;
 		}
@@ -346,74 +346,32 @@ void BITMAP::stretchDraw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
                          const Common::Rect &dstRect, bool skipTrans, int srcAlpha) {
 	assert(format.bytesPerPixel == 2 || format.bytesPerPixel == 4 ||
 	       (format.bytesPerPixel == 1 && srcBitmap->format.bytesPerPixel == 1));
-
-	// Allegro disables draw when the clipping rect has negative width/height.
-	// Common::Rect instead asserts, which we don't want.
-	if (cr <= cl || cb <= ct)
-		return;
-
-	// Figure out the dest area that will be updated
-	Common::Rect destRect = dstRect.findIntersectingRect(
-	                            Common::Rect(cl, ct, cr, cb));
-	if (destRect.isEmpty())
-		// Area is entirely outside the clipping area, so nothing to draw
-		return;
-
-	// Get source and dest surface. Note that for the destination we create
-	// a temporary sub-surface based on the allowed clipping area
-	const Graphics::ManagedSurface &src = **srcBitmap;
-	Graphics::ManagedSurface &dest = *_owner;
-	Graphics::Surface destArea = dest.getSubArea(destRect);
-
-	// Define scaling and other stuff used by the drawing loops
-	const int scaleX = SCALE_THRESHOLD * srcRect.width() / dstRect.width();
-	const int scaleY = SCALE_THRESHOLD * srcRect.height() / dstRect.height();
-	bool sameFormat = (src.format == format);
-
-	PALETTE palette;
-	if (src.format.bytesPerPixel == 1 && format.bytesPerPixel != 1) {
-		for (int i = 0; i < PAL_SIZE; ++i) {
-			palette[i].r = VGA_COLOR_TRANS(_G(current_palette)[i].r);
-			palette[i].g = VGA_COLOR_TRANS(_G(current_palette)[i].g);
-			palette[i].b = VGA_COLOR_TRANS(_G(current_palette)[i].b);
-		}
-	}
-
-	uint32 transColor = 0, alphaMask = 0xff;
-	if (skipTrans && src.format.bytesPerPixel != 1) {
-		transColor = src.format.ARGBToColor(0, 255, 0, 255);
-		alphaMask = src.format.ARGBToColor(255, 0, 0, 0);
-		alphaMask = ~alphaMask;
-	}
-
-	int xStart = (dstRect.left < destRect.left) ? dstRect.left - destRect.left : 0;
-	int yStart = (dstRect.top < destRect.top) ? dstRect.top - destRect.top : 0;
-
-	auto args = DrawInnerArgs(yStart, xStart, transColor, alphaMask, palette, 0, sameFormat, src, destArea, false, false, skipTrans, srcAlpha, 0, 0, 0, dstRect, srcRect, _G(_blender_mode), scaleX, scaleY);
+	auto args = DrawInnerArgs(this, srcBitmap, srcRect, dstRect, skipTrans, srcAlpha, false, false, 0, 0, 0, true);
+	if (!args.shouldDraw) return;
 #define DRAWINNER(func) func(args)
 	if (_G(simd_flags) == AGS3::Globals::SIMD_NONE) {
-		if (sameFormat) {
+		if (args.sameFormat) {
 			switch (format.bytesPerPixel) {
 			case 1: DRAWINNER((drawInnerGeneric<1, 1, true>)); return;
 			case 2: DRAWINNER((drawInnerGeneric<2, 2, true>)); return;
 			case 4: DRAWINNER((drawInnerGeneric<4, 4, true>)); return;
 			}
-		} else if (format.bytesPerPixel == 4 && src.format.bytesPerPixel == 2) { 
+		} else if (format.bytesPerPixel == 4 && args.src.format.bytesPerPixel == 2) { 
 			DRAWINNER((drawInnerGeneric<4, 2, true>));
-		} else if (format.bytesPerPixel == 2 && src.format.bytesPerPixel == 4) {
+		} else if (format.bytesPerPixel == 2 && args.src.format.bytesPerPixel == 4) {
 			DRAWINNER((drawInnerGeneric<2, 4, true>));
 		}
 	} else {
-		if (sameFormat) {
+		if (args.sameFormat) {
 			switch (format.bytesPerPixel) {
 			case 1: DRAWINNER(drawInner1Bpp<true>); return;
 			case 2: DRAWINNER(drawInner2Bpp<true>); return;
 			case 4: DRAWINNER((drawInner4BppWithConv<4, 4, true>)); return;
 			}
-		} else if (format.bytesPerPixel == 4 && src.format.bytesPerPixel == 2) { 
+		} else if (format.bytesPerPixel == 4 && args.src.format.bytesPerPixel == 2) { 
 			DRAWINNER((drawInner4BppWithConv<4, 2, true>));
 			return;
-		} else if (format.bytesPerPixel == 2 && src.format.bytesPerPixel == 4) {
+		} else if (format.bytesPerPixel == 2 && args.src.format.bytesPerPixel == 4) {
 			DRAWINNER((drawInner4BppWithConv<2, 4, true>));
 			return;
 		}
diff --git a/engines/ags/lib/allegro/surface.h b/engines/ags/lib/allegro/surface.h
index 4ebe699d8bb..28dbeed425c 100644
--- a/engines/ags/lib/allegro/surface.h
+++ b/engines/ags/lib/allegro/surface.h
@@ -268,10 +268,11 @@ public:
 	constexpr static int SCALE_THRESHOLD_BITS = 8;
 	constexpr static int SCALE_THRESHOLD = 1 << SCALE_THRESHOLD_BITS;
 	struct DrawInnerArgs {
-		bool useTint, sameFormat, horizFlip, vertFlip, skipTrans, doScale;
+		const bool useTint, horizFlip, vertFlip, skipTrans, doScale;
+		bool sameFormat, shouldDraw;
 		int xStart, yStart, srcAlpha, tintRed, tintGreen, tintBlue, scaleX, scaleY;
 		uint32 transColor, alphaMask;
-		color *palette;
+		PALETTE palette;
 
 		BlenderMode blenderMode;
 		Common::Rect dstRect, srcArea;
@@ -279,7 +280,11 @@ public:
 		const ::Graphics::ManagedSurface &src;
 		::Graphics::Surface destArea;
 
-		DrawInnerArgs(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, int useTint, int sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, int horizFlip, int vertFlip, int skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY);
+		DrawInnerArgs(BITMAP *dstBitmap, const BITMAP *srcBitmap,
+					  const Common::Rect &srcRect, const Common::Rect &dstRect,
+					  bool skipTrans, int srcAlpha, bool horizFlip,
+					  bool vertFlip, int tintRed, int tintGreen, int tintBlue,
+					  bool doScale);
 	};
 
 	template<int DestBytesPerPixel, int SrcBytesPerPixel, bool Scale>


Commit: 1cf3c7832a236e93b682f5c04fe4daf0332ad2aa
    https://github.com/scummvm/scummvm/commit/1cf3c7832a236e93b682f5c04fe4daf0332ad2aa
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: GRAPHICS: SIMD blending refactoring

Here I moved the SIMD paths to their own translation units and removed
their now-unnecessary header files. I also made it so that fewer of the
translation units have template forward declarations.

I made it so that surface.cpp now chooses at runtime which SIMD path it
should take.
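
The resulting dispatch boils down to the pattern in the following standalone
sketch. It is only illustrative: the SimdFlags enum, detectSimdFlags() and the
stub draw functions are placeholders rather than the engine's actual types,
while SCUMMVM_NEON/SCUMMVM_SSE2 stand for the compile-time guards used in
surface.cpp.

#include <cstdint>
#include <cstdio>

enum SimdFlags : uint32_t { SIMD_NONE = 0, SIMD_NEON = 1 << 0, SIMD_SSE2 = 1 << 1 };

// Placeholder for the engine's one-time CPU feature detection.
static uint32_t detectSimdFlags() { return SIMD_SSE2; }

// Stubs standing in for drawNEON<Scale>, drawSSE2<Scale> and drawGeneric<Scale>.
template<bool Scale> static void drawNEON()    { std::puts("NEON path"); }
template<bool Scale> static void drawSSE2()    { std::puts("SSE2 path"); }
template<bool Scale> static void drawGeneric() { std::puts("generic path"); }

template<bool Scale>
static void draw(uint32_t simdFlags) {
	// Each vector path is only compiled in when its macro is defined; at runtime
	// the first supported flag wins, otherwise we fall back to the generic path.
#ifdef SCUMMVM_NEON
	if (simdFlags & SIMD_NEON) { drawNEON<Scale>(); return; }
#endif
#ifdef SCUMMVM_SSE2
	if (simdFlags & SIMD_SSE2) { drawSSE2<Scale>(); return; }
#endif
	drawGeneric<Scale>();
}

int main() {
	draw<false>(detectSimdFlags()); // prints "SSE2 path" when built with -DSCUMMVM_SSE2
	return 0;
}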

Changed paths:
  A engines/ags/lib/allegro/surface_generic.cpp
  A engines/ags/lib/allegro/surface_neon.cpp
  A engines/ags/lib/allegro/surface_sse2.cpp
  R engines/ags/lib/allegro/surface_simd_neon.cpp
  R engines/ags/lib/allegro/surface_simd_neon.h
  R engines/ags/lib/allegro/surface_simd_none.cpp
  R engines/ags/lib/allegro/surface_simd_sse.cpp
  R engines/ags/lib/allegro/surface_simd_sse.h
    engines/ags/lib/allegro/surface.cpp
    engines/ags/lib/allegro/surface.h
    engines/ags/module.mk


diff --git a/engines/ags/lib/allegro/surface.cpp b/engines/ags/lib/allegro/surface.cpp
index 473e7b0dee5..a9f31272652 100644
--- a/engines/ags/lib/allegro/surface.cpp
+++ b/engines/ags/lib/allegro/surface.cpp
@@ -105,149 +105,19 @@ void BITMAP::floodfill(int x, int y, int color) {
 }
 
 #define VGA_COLOR_TRANS(x) ((x) * 255 / 63)
-template<int DestBytesPerPixel, int SrcBytesPerPixel, bool Scale>
-void BITMAP::drawInnerGeneric(DrawInnerArgs &args) {
-	const int xDir = args.horizFlip ? -1 : 1;
-	byte rSrc, gSrc, bSrc, aSrc;
-	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
-
-	// Instead of skipping pixels outside our boundary here, we just clip
-	// our area instead.
-	int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = args.dstRect.width();
-	if (args.xStart + xCtrWidth > args.destArea.w) { // Clip the right
-		xCtrWidth = args.destArea.w - args.xStart;
-	}
-	if (args.xStart < 0) { // Clip the left
-		xCtrStart = -args.xStart;
-		xCtrBppStart = xCtrStart * SrcBytesPerPixel;
-		args.xStart = 0;
-	}
-	int destY = args.yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = args.dstRect.height();
-	if (args.yStart < 0) { // Clip the top
-		yCtr = -args.yStart;
-		destY = 0;
-		if (Scale) {
-			scaleYCtr = yCtr * args.scaleY;
-			srcYCtr = scaleYCtr / SCALE_THRESHOLD;
-		}
-	}
-	if (args.yStart + yCtrHeight > args.destArea.h) { // Clip the bottom
-		yCtrHeight = args.destArea.h - args.yStart;
-	}
-
-	byte *destP = (byte *)args.destArea.getBasePtr(0, destY);
-	const byte *srcP = (const byte *)args.src.getBasePtr(
-	                       args.horizFlip ? args.srcArea.right - 1 : args.srcArea.left,
-	                       args.vertFlip ? args.srcArea.bottom - 1 - yCtr :
-	                       args.srcArea.top + yCtr);
-	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
-		if (Scale) {
-			int newSrcYCtr = scaleYCtr / SCALE_THRESHOLD;
-			if (srcYCtr != newSrcYCtr) {
-				int diffSrcYCtr = newSrcYCtr - srcYCtr;
-				srcP += args.src.pitch * diffSrcYCtr;
-				srcYCtr = newSrcYCtr;
-			}
-		}
-		// Loop through the pixels of the row
-		for (int destX = args.xStart, xCtr = xCtrStart, xCtrBpp = xCtrBppStart, scaleXCtr = xCtr * args.scaleX; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += SrcBytesPerPixel, scaleXCtr += args.scaleX) {
-			const byte *srcVal = srcP + xDir * xCtrBpp;
-			if (Scale) {
-				srcVal = srcP + (scaleXCtr / SCALE_THRESHOLD) * SrcBytesPerPixel;
-			}
-			uint32 srcCol = getColor(srcVal, SrcBytesPerPixel);
-
-			// Check if this is a transparent color we should skip
-			if (args.skipTrans && ((srcCol & args.alphaMask) == args.transColor))
-				continue;
-
-			byte *destVal = (byte *)&destP[destX * DestBytesPerPixel];
-
-			// When blitting to the same format we can just copy the color
-			if (DestBytesPerPixel == 1) {
-				*destVal = srcCol;
-				continue;
-			} else if ((DestBytesPerPixel == SrcBytesPerPixel) && args.srcAlpha == -1) {
-				if (DestBytesPerPixel)
-					*(uint32 *)destVal = srcCol;
-				else
-					*(uint16 *)destVal = srcCol;
-				continue;
-			}
-
-			// We need the rgb values to do blending and/or convert between formats
-			if (SrcBytesPerPixel == 1) {
-				const RGB &rgb = args.palette[srcCol];
-				aSrc = 0xff;
-				rSrc = rgb.r;
-				gSrc = rgb.g;
-				bSrc = rgb.b;
-			} else {
-				if (SrcBytesPerPixel == 4) {
-					aSrc = srcCol >> 24;
-					rSrc = (srcCol >> 16) & 0xff;
-					gSrc = (srcCol >> 8) & 0xff;
-					bSrc = srcCol & 0xff;
-				} else { // SrcBytesPerPixel == 2
-					aSrc = 0xff;
-					rSrc = (srcCol >> 11) & 0x1f;
-					rSrc = (rSrc << 3) | (rSrc >> 2);
-					gSrc = (srcCol >> 5) & 0x3f;
-					gSrc = (gSrc << 2) | (gSrc >> 4);
-					bSrc = srcCol & 0x1f;
-					bSrc = (bSrc << 3) | (bSrc >> 2);
-				}
-				//src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
-			}
-
-			if (args.srcAlpha == -1) {
-				// This means we don't use blending.
-				aDest = aSrc;
-				rDest = rSrc;
-				gDest = gSrc;
-				bDest = bSrc;
-			} else {
-				if (args.useTint) {
-					rDest = rSrc;
-					gDest = gSrc;
-					bDest = bSrc;
-					aDest = aSrc;
-					rSrc = args.tintRed;
-					gSrc = args.tintGreen;
-					bSrc = args.tintBlue;
-					aSrc = args.srcAlpha;
-				}
-				blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, args.srcAlpha, args.useTint, destVal);
-			}
-
-			uint32 pixel;// = format.ARGBToColor(aDest, rDest, gDest, bDest);
-			if (DestBytesPerPixel == 4) {
-				pixel = (aDest << 24) | (rDest << 16) | (gDest << 8) | (bDest);
-				*(uint32 *)destVal = pixel;
-			}
-			else {
-				pixel = ((rDest >> 3) << 11) | ((gDest >> 2) << 5) | (bDest >> 3);
-				*(uint16 *)destVal = pixel;
-			}
-		}
-
-		destP += args.destArea.pitch;
-		if (!Scale) srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
-	}
-}
 
-BITMAP::DrawInnerArgs::DrawInnerArgs(BITMAP *dstBitmap, const BITMAP *srcBitmap,
+BITMAP::DrawInnerArgs::DrawInnerArgs(BITMAP *_dstBitmap, const BITMAP *srcBitmap,
 	const Common::Rect &srcRect, const Common::Rect &_dstRect, bool _skipTrans,
 	int _srcAlpha, bool _horizFlip, bool _vertFlip, int _tintRed,
-	int _tintGreen, int _tintBlue, bool _doScale) : skipTrans(_skipTrans),
+	int _tintGreen, int _tintBlue, bool doScale) : skipTrans(_skipTrans),
 		srcAlpha(_srcAlpha), horizFlip(_horizFlip), vertFlip(_vertFlip),
 		tintRed(_tintRed), tintGreen(_tintGreen), tintBlue(_tintBlue),
-		doScale(_doScale), src(**srcBitmap), shouldDraw(false),
+		src(**srcBitmap), shouldDraw(false), dstBitmap(*_dstBitmap),
 		useTint(_tintRed >= 0 && _tintGreen >= 0 && _tintBlue >= 0),
 		blenderMode(_G(_blender_mode)), dstRect(_dstRect) {
 	// Allegro disables draw when the clipping rect has negative width/height.
 	// Common::Rect instead asserts, which we don't want.
-	if (dstBitmap->cr <= dstBitmap->cl || dstBitmap->cb <= dstBitmap->ct)
+	if (dstBitmap.cr <= dstBitmap.cl || dstBitmap.cb <= dstBitmap.ct)
 		return;
 
 	// Figure out the dest area that will be updated
@@ -262,22 +132,22 @@ BITMAP::DrawInnerArgs::DrawInnerArgs(BITMAP *dstBitmap, const BITMAP *srcBitmap,
 		dstRect.setHeight(srcArea.height());
 	}
 	Common::Rect destRect = dstRect.findIntersectingRect(
-	                            Common::Rect(dstBitmap->cl, dstBitmap->ct, dstBitmap->cr, dstBitmap->cb));
+	                            Common::Rect(dstBitmap.cl, dstBitmap.ct, dstBitmap.cr, dstBitmap.cb));
 	if (destRect.isEmpty())
 		// Area is entirely outside the clipping area, so nothing to draw
 		return;
 
 	// Get source and dest surface. Note that for the destination we create
 	// a temporary sub-surface based on the allowed clipping area
-	Graphics::ManagedSurface &dest = *dstBitmap->_owner;
+	Graphics::ManagedSurface &dest = *dstBitmap._owner;
 	destArea = dest.getSubArea(destRect);
 
 	// Define scaling and other stuff used by the drawing loops
 	scaleX = SCALE_THRESHOLD * srcRect.width() / dstRect.width();
 	scaleY = SCALE_THRESHOLD * srcRect.height() / dstRect.height();
-	sameFormat = (src.format == dstBitmap->format);
+	sameFormat = (src.format == dstBitmap.format);
 
-	if (src.format.bytesPerPixel == 1 && dstBitmap->format.bytesPerPixel != 1) {
+	if (src.format.bytesPerPixel == 1 && dstBitmap.format.bytesPerPixel != 1) {
 		for (int i = 0; i < PAL_SIZE; ++i) {
 			palette[i].r = VGA_COLOR_TRANS(_G(current_palette)[i].r);
 			palette[i].g = VGA_COLOR_TRANS(_G(current_palette)[i].g);
@@ -306,40 +176,26 @@ void BITMAP::draw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
 
 	auto args = DrawInnerArgs(this, srcBitmap, srcRect, Common::Rect(dstX, dstY, dstX+1, dstY+1), skipTrans, srcAlpha, horizFlip, vertFlip, tintRed, tintGreen, tintBlue, false);
 	if (!args.shouldDraw) return;
-#define DRAWINNER(func) func(args)
-	// Calling drawInnerXXXX with a ScaleThreshold of 0 just does normal un-scaled drawing
-	if (_G(simd_flags) == AGS3::Globals::SIMD_NONE) {
-		if (args.sameFormat) {
-			switch (format.bytesPerPixel) {
-			case 1: DRAWINNER((drawInnerGeneric<1, 1, false>)); return;
-			case 2: DRAWINNER((drawInnerGeneric<2, 2, false>)); return;
-			case 4: DRAWINNER((drawInnerGeneric<4, 4, false>)); return;
-			}
-		} else if (format.bytesPerPixel == 4 && args.src.format.bytesPerPixel == 2) { 
-			DRAWINNER((drawInnerGeneric<4, 2, false>));
-		} else if (format.bytesPerPixel == 2 && args.src.format.bytesPerPixel == 4) {
-			DRAWINNER((drawInnerGeneric<2, 4, false>));
-		}
-	} else {
-		if (args.sameFormat) {
-			switch (format.bytesPerPixel) {
-			case 1: DRAWINNER(drawInner1Bpp<false>); return;
-			case 2: DRAWINNER(drawInner2Bpp<false>); return;
-			case 4: DRAWINNER((drawInner4BppWithConv<4, 4, false>)); return;
-			}
-		} else if (format.bytesPerPixel == 4 && args.src.format.bytesPerPixel == 2) { 
-			DRAWINNER((drawInner4BppWithConv<4, 2, false>));
-			return;
-		} else if (format.bytesPerPixel == 2 && args.src.format.bytesPerPixel == 4) {
-			DRAWINNER((drawInner4BppWithConv<2, 4, false>));
-			return;
-		}
+	if (!args.sameFormat && args.src.format.bytesPerPixel == 1) {
+		if (format.bytesPerPixel == 4)
+			drawInnerGeneric<4, 1, false>(args);
+		else
+			drawInnerGeneric<2, 1, false>(args);
+		return;
 	}
-	if (format.bytesPerPixel == 4) // src.bytesPerPixel must be 1 here
-		DRAWINNER((drawInnerGeneric<4, 1, false>));
-	else
-		DRAWINNER((drawInnerGeneric<2, 1, false>));
-#undef DRAWINNER
+#ifdef SCUMMVM_NEON
+	if (_G(simd_flags) & AGS3::Globals::SIMD_NEON) {
+		drawNEON<false>(args);
+		return;
+	}
+#endif
+#ifdef SCUMMVM_SSE2
+	if (_G(simd_flags) & AGS3::Globals::SIMD_SSE2) {
+		drawSSE2<false>(args);
+		return;
+	}
+#endif
+	drawGeneric<false>(args);
 }
 
 void BITMAP::stretchDraw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
@@ -348,39 +204,26 @@ void BITMAP::stretchDraw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
 	       (format.bytesPerPixel == 1 && srcBitmap->format.bytesPerPixel == 1));
 	auto args = DrawInnerArgs(this, srcBitmap, srcRect, dstRect, skipTrans, srcAlpha, false, false, 0, 0, 0, true);
 	if (!args.shouldDraw) return;
-#define DRAWINNER(func) func(args)
-	if (_G(simd_flags) == AGS3::Globals::SIMD_NONE) {
-		if (args.sameFormat) {
-			switch (format.bytesPerPixel) {
-			case 1: DRAWINNER((drawInnerGeneric<1, 1, true>)); return;
-			case 2: DRAWINNER((drawInnerGeneric<2, 2, true>)); return;
-			case 4: DRAWINNER((drawInnerGeneric<4, 4, true>)); return;
-			}
-		} else if (format.bytesPerPixel == 4 && args.src.format.bytesPerPixel == 2) { 
-			DRAWINNER((drawInnerGeneric<4, 2, true>));
-		} else if (format.bytesPerPixel == 2 && args.src.format.bytesPerPixel == 4) {
-			DRAWINNER((drawInnerGeneric<2, 4, true>));
-		}
-	} else {
-		if (args.sameFormat) {
-			switch (format.bytesPerPixel) {
-			case 1: DRAWINNER(drawInner1Bpp<true>); return;
-			case 2: DRAWINNER(drawInner2Bpp<true>); return;
-			case 4: DRAWINNER((drawInner4BppWithConv<4, 4, true>)); return;
-			}
-		} else if (format.bytesPerPixel == 4 && args.src.format.bytesPerPixel == 2) { 
-			DRAWINNER((drawInner4BppWithConv<4, 2, true>));
-			return;
-		} else if (format.bytesPerPixel == 2 && args.src.format.bytesPerPixel == 4) {
-			DRAWINNER((drawInner4BppWithConv<2, 4, true>));
-			return;
-		}
+	if (!args.sameFormat && args.src.format.bytesPerPixel == 1) {
+		if (format.bytesPerPixel == 4)
+			drawInnerGeneric<4, 1, true>(args);
+		else
+			drawInnerGeneric<2, 1, true>(args);
+		return;
 	}
-	if (format.bytesPerPixel == 4) // src.bytesPerPixel must be 1 here
-		DRAWINNER((drawInnerGeneric<4, 1, true>));
-	else
-		DRAWINNER((drawInnerGeneric<2, 1, true>));
-#undef DRAWINNER
+#ifdef SCUMMVM_NEON
+	if (_G(simd_flags) & AGS3::Globals::SIMD_NEON) {
+		drawNEON<true>(args);
+		return;
+	}
+#endif
+#ifdef SCUMMVM_SSE2
+	if (_G(simd_flags) & AGS3::Globals::SIMD_SSE2) {
+		drawSSE2<true>(args);
+		return;
+	}
+#endif
+	drawGeneric<true>(args);
 }
 void BITMAP::blendPixel(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &aDest, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha, bool useTint, byte *destVal) const {
 	switch (_G(_blender_mode)) {
diff --git a/engines/ags/lib/allegro/surface.h b/engines/ags/lib/allegro/surface.h
index 28dbeed425c..3cd6738c532 100644
--- a/engines/ags/lib/allegro/surface.h
+++ b/engines/ags/lib/allegro/surface.h
@@ -268,7 +268,7 @@ public:
 	constexpr static int SCALE_THRESHOLD_BITS = 8;
 	constexpr static int SCALE_THRESHOLD = 1 << SCALE_THRESHOLD_BITS;
 	struct DrawInnerArgs {
-		const bool useTint, horizFlip, vertFlip, skipTrans, doScale;
+		const bool useTint, horizFlip, vertFlip, skipTrans;
 		bool sameFormat, shouldDraw;
 		int xStart, yStart, srcAlpha, tintRed, tintGreen, tintBlue, scaleX, scaleY;
 		uint32 transColor, alphaMask;
@@ -277,6 +277,7 @@ public:
 		BlenderMode blenderMode;
 		Common::Rect dstRect, srcArea;
 
+		BITMAP &dstBitmap;
 		const ::Graphics::ManagedSurface &src;
 		::Graphics::Surface destArea;
 
@@ -286,13 +287,17 @@ public:
 					  bool vertFlip, int tintRed, int tintGreen, int tintBlue,
 					  bool doScale);
 	};
-
-	template<int DestBytesPerPixel, int SrcBytesPerPixel, bool Scale>
-	void drawInner4BppWithConv(DrawInnerArgs &args);
+	friend class DrawInnerImpl;
 	template<bool Scale>
-	void drawInner2Bpp(DrawInnerArgs &args);
+	void drawGeneric(DrawInnerArgs &args);
+#ifdef SCUMMVM_NEON
 	template<bool Scale>
-	void drawInner1Bpp(DrawInnerArgs &args);
+	void drawNEON(DrawInnerArgs &args);
+#endif
+#ifdef SCUMMVM_SSE2
+	template<bool Scale>
+	void drawSSE2(DrawInnerArgs &args);
+#endif
 	template<int DestBytesPerPixel, int SrcBytesPerPixel, bool Scale>
 	void drawInnerGeneric(DrawInnerArgs &args);
 	
diff --git a/engines/ags/lib/allegro/surface_generic.cpp b/engines/ags/lib/allegro/surface_generic.cpp
new file mode 100644
index 00000000000..82626eb9859
--- /dev/null
+++ b/engines/ags/lib/allegro/surface_generic.cpp
@@ -0,0 +1,187 @@
+/* ScummVM - Graphic Adventure Engine
+ *
+ * ScummVM is the legal property of its developers, whose names
+ * are too numerous to list here. Please refer to the COPYRIGHT
+ * file distributed with this source distribution.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "ags/lib/allegro/color.h"
+#include "ags/lib/allegro/surface.h"
+#include "ags/globals.h"
+
+namespace AGS3 {
+
+template<int DestBytesPerPixel, int SrcBytesPerPixel, bool Scale>
+void BITMAP::drawInnerGeneric(DrawInnerArgs &args) {
+	const int xDir = args.horizFlip ? -1 : 1;
+	byte rSrc, gSrc, bSrc, aSrc;
+	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
+
+	// Instead of skipping pixels outside our boundary here, we just clip
+	// our area instead.
+	int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = args.dstRect.width();
+	if (args.xStart + xCtrWidth > args.destArea.w) { // Clip the right
+		xCtrWidth = args.destArea.w - args.xStart;
+	}
+	if (args.xStart < 0) { // Clip the left
+		xCtrStart = -args.xStart;
+		xCtrBppStart = xCtrStart * SrcBytesPerPixel;
+		args.xStart = 0;
+	}
+	int destY = args.yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = args.dstRect.height();
+	if (args.yStart < 0) { // Clip the top
+		yCtr = -args.yStart;
+		destY = 0;
+		if (Scale) {
+			scaleYCtr = yCtr * args.scaleY;
+			srcYCtr = scaleYCtr / BITMAP::SCALE_THRESHOLD;
+		}
+	}
+	if (args.yStart + yCtrHeight > args.destArea.h) { // Clip the bottom
+		yCtrHeight = args.destArea.h - args.yStart;
+	}
+
+	byte *destP = (byte *)args.destArea.getBasePtr(0, destY);
+	const byte *srcP = (const byte *)args.src.getBasePtr(
+	                       args.horizFlip ? args.srcArea.right - 1 : args.srcArea.left,
+	                       args.vertFlip ? args.srcArea.bottom - 1 - yCtr :
+	                       args.srcArea.top + yCtr);
+	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
+		if (Scale) {
+			int newSrcYCtr = scaleYCtr / BITMAP::SCALE_THRESHOLD;
+			if (srcYCtr != newSrcYCtr) {
+				int diffSrcYCtr = newSrcYCtr - srcYCtr;
+				srcP += args.src.pitch * diffSrcYCtr;
+				srcYCtr = newSrcYCtr;
+			}
+		}
+		// Loop through the pixels of the row
+		for (int destX = args.xStart, xCtr = xCtrStart, xCtrBpp = xCtrBppStart, scaleXCtr = xCtr * args.scaleX; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += SrcBytesPerPixel, scaleXCtr += args.scaleX) {
+			const byte *srcVal = srcP + xDir * xCtrBpp;
+			if (Scale) {
+				srcVal = srcP + (scaleXCtr / BITMAP::SCALE_THRESHOLD) * SrcBytesPerPixel;
+			}
+			uint32 srcCol = getColor(srcVal, SrcBytesPerPixel);
+
+			// Check if this is a transparent color we should skip
+			if (args.skipTrans && ((srcCol & args.alphaMask) == args.transColor))
+				continue;
+
+			byte *destVal = (byte *)&destP[destX * DestBytesPerPixel];
+
+			// When blitting to the same format we can just copy the color
+			if (DestBytesPerPixel == 1) {
+				*destVal = srcCol;
+				continue;
+			} else if ((DestBytesPerPixel == SrcBytesPerPixel) && args.srcAlpha == -1) {
+				if (DestBytesPerPixel)
+					*(uint32 *)destVal = srcCol;
+				else
+					*(uint16 *)destVal = srcCol;
+				continue;
+			}
+
+			// We need the rgb values to do blending and/or convert between formats
+			if (SrcBytesPerPixel == 1) {
+				const RGB &rgb = args.palette[srcCol];
+				aSrc = 0xff;
+				rSrc = rgb.r;
+				gSrc = rgb.g;
+				bSrc = rgb.b;
+			} else {
+				if (SrcBytesPerPixel == 4) {
+					aSrc = srcCol >> 24;
+					rSrc = (srcCol >> 16) & 0xff;
+					gSrc = (srcCol >> 8) & 0xff;
+					bSrc = srcCol & 0xff;
+				} else { // SrcBytesPerPixel == 2
+					aSrc = 0xff;
+					rSrc = (srcCol >> 11) & 0x1f;
+					rSrc = (rSrc << 3) | (rSrc >> 2);
+					gSrc = (srcCol >> 5) & 0x3f;
+					gSrc = (gSrc << 2) | (gSrc >> 4);
+					bSrc = srcCol & 0x1f;
+					bSrc = (bSrc << 3) | (bSrc >> 2);
+				}
+				//src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
+			}
+
+			if (args.srcAlpha == -1) {
+				// This means we don't use blending.
+				aDest = aSrc;
+				rDest = rSrc;
+				gDest = gSrc;
+				bDest = bSrc;
+			} else {
+				if (args.useTint) {
+					rDest = rSrc;
+					gDest = gSrc;
+					bDest = bSrc;
+					aDest = aSrc;
+					rSrc = args.tintRed;
+					gSrc = args.tintGreen;
+					bSrc = args.tintBlue;
+					aSrc = args.srcAlpha;
+				}
+				blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, args.srcAlpha, args.useTint, destVal);
+			}
+
+			uint32 pixel;// = format.ARGBToColor(aDest, rDest, gDest, bDest);
+			if (DestBytesPerPixel == 4) {
+				pixel = (aDest << 24) | (rDest << 16) | (gDest << 8) | (bDest);
+				*(uint32 *)destVal = pixel;
+			}
+			else {
+				pixel = ((rDest >> 3) << 11) | ((gDest >> 2) << 5) | (bDest >> 3);
+				*(uint16 *)destVal = pixel;
+			}
+		}
+
+		destP += args.destArea.pitch;
+		if (!Scale) srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
+	}
+}
+
+template<bool Scale>
+void BITMAP::drawGeneric(DrawInnerArgs &args) {
+	if (args.sameFormat) {
+		switch (format.bytesPerPixel) {
+		case 1: drawInnerGeneric<1, 1, Scale>(args); break;
+		case 2: drawInnerGeneric<2, 2, Scale>(args); break;
+		case 4: drawInnerGeneric<4, 4, Scale>(args); break;
+		}
+	} else if (format.bytesPerPixel == 4 && args.src.format.bytesPerPixel == 2) { 
+		drawInnerGeneric<4, 2, Scale>(args);
+	} else if (format.bytesPerPixel == 2 && args.src.format.bytesPerPixel == 4) {
+		drawInnerGeneric<2, 4, Scale>(args);
+	}
+}
+
+template void BITMAP::drawGeneric<false>(DrawInnerArgs &);
+template void BITMAP::drawGeneric<true>(DrawInnerArgs &);
+template void BITMAP::drawInnerGeneric<4, 4, false>(DrawInnerArgs &);
+template void BITMAP::drawInnerGeneric<4, 4, true>(DrawInnerArgs &);
+template void BITMAP::drawInnerGeneric<4, 2, false>(DrawInnerArgs &);
+template void BITMAP::drawInnerGeneric<4, 2, true>(DrawInnerArgs &);
+template void BITMAP::drawInnerGeneric<2, 4, false>(DrawInnerArgs &);
+template void BITMAP::drawInnerGeneric<2, 4, true>(DrawInnerArgs &);
+template void BITMAP::drawInnerGeneric<4, 1, false>(DrawInnerArgs &);
+template void BITMAP::drawInnerGeneric<4, 1, true>(DrawInnerArgs &);
+template void BITMAP::drawInnerGeneric<2, 1, false>(DrawInnerArgs &);
+template void BITMAP::drawInnerGeneric<2, 1, true>(DrawInnerArgs &);
+
+} // end of namespace AGS3
diff --git a/engines/ags/lib/allegro/surface_neon.cpp b/engines/ags/lib/allegro/surface_neon.cpp
new file mode 100644
index 00000000000..9eb5b4b54fd
--- /dev/null
+++ b/engines/ags/lib/allegro/surface_neon.cpp
@@ -0,0 +1,918 @@
+#include <arm_neon.h>
+#include "ags/lib/allegro/gfx.h"
+#include "ags/lib/allegro/color.h"
+#include "ags/lib/allegro/flood.h"
+#include "ags/ags.h"
+#include "ags/globals.h"
+#include "common/textconsole.h"
+#include "graphics/screen.h"
+
+namespace AGS3 {
+
+inline uint32x4_t simd2BppTo4Bpp(uint16x4_t pixels) {
+	uint32x4_t x = vmovl_u16(pixels);
+
+	// c is the extracted 5/6 bit color from the image
+	uint32x4_t c = vshrq_n_u32(x, 11);
+
+	// We convert it back to an 8-bit channel by shifting it left 3 bits and then reusing the 2 most
+	// significant bits of the original color as the least significant bits of the new one
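+	// (e.g. the 5-bit value 0b10110 becomes the 8-bit value 0b10110101)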
+	uint32x4_t r = vshlq_n_u32(vorrq_u32(vshlq_n_u32(c, 3), vshrq_n_u32(c, 2)), 16);
+	c = vshrq_n_u32(vandq_u32(x, vmovq_n_u32(0x07e0)), 5);
+	uint32x4_t g = vshlq_n_u32(vorrq_u32(vshlq_n_u32(c, 2), vshrq_n_u32(c, 4)), 8);
+	c = vandq_u32(x, vmovq_n_u32(0x001f));
+	uint32x4_t b = vorrq_u32(vshlq_n_u32(c, 3), vshrq_n_u32(c, 2));
+
+	// By default 2bpp to 4bpp makes the alpha channel 255
+	return vorrq_u32(vorrq_u32(vorrq_u32(r, g), b), vmovq_n_u32(0xff000000));
+}
+
+inline uint16x4_t simd4BppTo2Bpp(uint32x4_t pixels) {
+	// x is the final 16 bit rgb pixel
+	uint32x4_t x = vshrq_n_u32(vandq_u32(pixels, vmovq_n_u32(0x000000ff)), 3);
+	x = vorrq_u32(x, vshlq_n_u32(vshrq_n_u32(vandq_u32(pixels, vmovq_n_u32(0x0000ff00)), 8+2), 5));
+	x = vorrq_u32(x, vshlq_n_u32(vshrq_n_u32(vandq_u32(pixels, vmovq_n_u32(0x00ff0000)), 16+3), 11));
+	return vmovn_u32(x);
+}
+
+inline uint16x8_t rgbBlendSIMD2Bpp(uint16x8_t srcCols, uint16x8_t destCols, uint16x8_t alphas) {
+	// Here we add 1 to alphas if it's 0. This is what the original blender function did
+	alphas = vaddq_u16(alphas, vandq_u16(vceqq_u16(alphas, vmovq_n_u16(0)), vmovq_n_u16(1)));
+
+	// Split the components into rgb
+	uint16x8_t srcComps[] = {
+		vandq_u16(srcCols, vmovq_n_u16(0x1f)),					// B
+		vandq_u16(vshrq_n_u16(srcCols, 5), vmovq_n_u16(0x3f)),	// G
+		vshrq_n_u16(srcCols, 11),								// R
+	}, destComps[] = {
+		vandq_u16(destCols, vmovq_n_u16(0x1f)),					// B
+		vandq_u16(vshrq_n_u16(destCols, 5), vmovq_n_u16(0x3f)), // G
+		vshrq_n_u16(destCols, 11),								// R
+	};
+
+	// At some point I made it so that it would put them into their 8bit depth format
+	// to keep the function as 1-1 with the original, but it didn't seem to help much
+	//srcComps[0] = vorrq_u16(vshlq_n_u16(srcComps[0], 3), vshrq_n_u16(srcComps[0], 2));
+	//srcComps[1] = vorrq_u16(vshlq_n_u16(srcComps[1], 2), vshrq_n_u16(srcComps[1], 4));
+	//srcComps[2] = vorrq_u16(vshlq_n_u16(srcComps[2], 3), vshrq_n_u16(srcComps[2], 2));
+	//destComps[0] = vorrq_u16(vshlq_n_u16(destComps[0], 3), vshrq_n_u16(destComps[0], 2));
+	//destComps[1] = vorrq_u16(vshlq_n_u16(destComps[1], 2), vshrq_n_u16(destComps[1], 4));
+	//destComps[2] = vorrq_u16(vshlq_n_u16(destComps[2], 3), vshrq_n_u16(destComps[2], 2));
+
+	// Calculate the differences between the colors
+	uint16x8_t diffs[] = {
+		vsubq_u16(srcComps[0], destComps[0]), // B
+		vsubq_u16(srcComps[1], destComps[1]), // G
+		vsubq_u16(srcComps[2], destComps[2]), // R
+	};
+
+	// Multiply by alpha and shift depth bits to the right
+	// pretty much the same as (int)(((float)component / 255.0f) * ((float)alpha / 255.0f) * 255.0f)
+	alphas = vshrq_n_u16(alphas, 2);
+	diffs[1] = vshrq_n_u16(vmulq_u16(diffs[1], alphas), 6);
+	alphas = vshrq_n_u16(alphas, 1);
+	diffs[0] = vshrq_n_u16(vmulq_u16(diffs[0], alphas), 5);
+	diffs[2] = vshrq_n_u16(vmulq_u16(diffs[2], alphas), 5);
+
+	// Originally, I converted it back to normal here from the 8bpp form, but don't need to do that anymore
+	//diffs[0] = vandq_u16(vshrq_n_u16(vaddq_u16(diffs[0], destComps[0]), 3), vmovq_n_u16(0x1f));
+	//diffs[1] = vandq_u16(vshrq_n_u16(vaddq_u16(diffs[1], destComps[1]), 2), vmovq_n_u16(0x3f));
+	//diffs[2] = vandq_u16(vshrq_n_u16(vaddq_u16(diffs[2], destComps[2]), 3), vmovq_n_u16(0x1f));
+
+	// Here we add the difference between the 2 colors times alpha onto the destination
+	diffs[0] = vandq_u16(vaddq_u16(diffs[0], destComps[0]), vmovq_n_u16(0x1f));
+	diffs[1] = vandq_u16(vaddq_u16(diffs[1], destComps[1]), vmovq_n_u16(0x3f));
+	diffs[2] = vandq_u16(vaddq_u16(diffs[2], destComps[2]), vmovq_n_u16(0x1f));
+
+	// We combine all the components into diffs[0] as a 16 bit rgb pixel
+	diffs[0] = vorrq_u16(diffs[0], vshlq_n_u16(diffs[1], 5));
+	return vorrq_u16(diffs[0], vshlq_n_u16(diffs[2], 11));
+}
+
+// preserveAlpha:
+//		false => set destCols's alpha to 0
+// 		true => keep destCols's alpha
+inline uint32x4_t rgbBlendSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas, bool preserveAlpha) {
+	// Here we add 1 to alphas if it's greater than 0, which is what the original blender function did
+	alphas = vaddq_u32(alphas, vandq_u32(vcgtq_u32(alphas, vmovq_n_u32(0)), vmovq_n_u32(1)));
+
+	// Get the alpha from the destination
+	uint32x4_t alpha = vandq_u32(destCols, vmovq_n_u32(0xff000000));
+
+	// Get red and blue components
+	uint32x4_t srcColsCopy = srcCols;
+	srcColsCopy = vandq_u32(srcColsCopy, vmovq_n_u32(0xff00ff));
+	uint32x4_t destColsCopy = destCols;
+	destColsCopy = vandq_u32(destColsCopy, vmovq_n_u32(0xff00ff));
+
+	// compute the difference, then multiply by alpha and divide by 255
+	srcColsCopy = vsubq_u32(srcColsCopy, destColsCopy);
+	srcColsCopy = vmulq_u32(srcColsCopy, alphas);
+	srcColsCopy = vshrq_n_u32(srcColsCopy, 8);
+	srcColsCopy = vaddq_u32(srcColsCopy, destCols); // Add the new red/blue to the old ones
+
+	// do the same for the green component
+	srcCols = vandq_u32(srcCols, vmovq_n_u32(0xff00));
+	destCols = vandq_u32(destCols, vmovq_n_u32(0xff00));
+	srcCols = vsubq_u32(srcCols, destCols);
+	srcCols = vmulq_u32(srcCols, alphas);
+	srcCols = vshrq_n_u32(srcCols, 8);
+	srcCols = vaddq_u32(srcCols, destCols); // Add the new green to the old green
+
+	// keep values in 8bit range and glue red/blue and green together
+	srcColsCopy = vandq_u32(srcColsCopy, vmovq_n_u32(0xff00ff));
+	srcCols = vandq_u32(srcCols, vmovq_n_u32(0xff00));
+	srcCols = vorrq_u32(srcCols, srcColsCopy);
+
+	// Remember that alpha is not alphas, but rather the alpha of destCols
+	if (preserveAlpha) {
+		srcCols = vandq_u32(srcCols, vmovq_n_u32(0x00ffffff));
+		srcCols = vorrq_u32(srcCols, alpha);
+	}
+	return srcCols;
+}
+
+// uses the alpha from srcCols and destCols
+inline uint32x4_t argbBlendSIMD(uint32x4_t srcCols, uint32x4_t destCols) {
+	float16x4_t sAlphas = vcvt_f16_f32(vcvtq_f32_u32(vshrq_n_u32(srcCols, 24)));
+	sAlphas = vmul_n_f16(sAlphas, 1.0 / 255.0);
+
+	// sAlphas1 has the alpha of the 1st pixel duplicated across its low lanes and the alpha of the
+	// 2nd pixel across its high lanes; sAlphas2 is the same but for the 3rd and 4th pixels
+	float16x8_t sAlphas1 = vcombine_f16(vmov_n_f16(vduph_lane_f16(sAlphas, 0)), vmov_n_f16(vduph_lane_f16(sAlphas, 1)));
+	float16x8_t sAlphas2 = vcombine_f16(vmov_n_f16(vduph_lane_f16(sAlphas, 2)), vmov_n_f16(vduph_lane_f16(sAlphas, 3)));
+
+	// Same thing going on here with dAlphas, except that it gets multiplied by (1 - sAlpha) first
+	float16x4_t dAlphas = vcvt_f16_f32(vcvtq_f32_u32(vshrq_n_u32(destCols, 24)));
+	dAlphas = vmul_n_f16(dAlphas, 1.0 / 255.0);
+	dAlphas = vmul_f16(dAlphas, vsub_f16(vmov_n_f16(1.0), sAlphas));
+	float16x8_t dAlphas1 = vcombine_f16(vmov_n_f16(vduph_lane_f16(dAlphas, 0)), vmov_n_f16(vduph_lane_f16(dAlphas, 1)));
+	float16x8_t dAlphas2 = vcombine_f16(vmov_n_f16(vduph_lane_f16(dAlphas, 2)), vmov_n_f16(vduph_lane_f16(dAlphas, 3)));
+
+	// first 2 pixels
+	float16x8_t srcRgb1 = vcvtq_f16_u16(vmovl_u8(vreinterpret_u8_u32(vget_low_u32(srcCols))));
+	float16x8_t destRgb1 = vcvtq_f16_u16(vmovl_u8(vreinterpret_u8_u32(vget_low_u32(destCols))));
+	// last 2 pixels
+	float16x8_t srcRgb2 = vcvtq_f16_u16(vmovl_u8(vreinterpret_u8_u32(vget_high_u32(srcCols))));
+	float16x8_t destRgb2 = vcvtq_f16_u16(vmovl_u8(vreinterpret_u8_u32(vget_high_u32(destCols))));
+
+	// ((src * sAlpha) + (dest * dAlpha)) / (sAlpha + dAlpha)
+	srcRgb1 = vmulq_f16(srcRgb1, sAlphas1);
+	destRgb1 = vmulq_f16(destRgb1, dAlphas1);
+	srcRgb1 = vaddq_f16(srcRgb1, destRgb1);
+	float16x8_t alphasRec = vrecpeq_f16(vaddq_f16(sAlphas1, dAlphas1)); // compute reciprocal
+	srcRgb1 = vmulq_f16(srcRgb1, alphasRec);
+	srcRgb2 = vmulq_f16(srcRgb2, sAlphas2);
+	destRgb2 = vmulq_f16(destRgb2, dAlphas2);
+	srcRgb2 = vaddq_f16(srcRgb2, destRgb2);
+	alphasRec = vrecpeq_f16(vaddq_f16(sAlphas2, dAlphas2));
+	srcRgb2 = vmulq_f16(srcRgb2, alphasRec);
+
+	// alpha channel is computed differently
+	uint16x4_t alphas = vcvta_u16_f16(vmul_n_f16(vadd_f16(sAlphas, dAlphas), 255.0));
+
+	// Final argb components as 16bit values
+	uint16x8_t uintSrcRgb1 = vcvtq_u16_f16(srcRgb1), uintSrcRgb2 = vcvtq_u16_f16(srcRgb2);
+
+	// copy alpha channel over
+	uintSrcRgb1 = vcopyq_lane_u16(uintSrcRgb1, 3, alphas, 0);
+	uintSrcRgb1 = vcopyq_lane_u16(uintSrcRgb1, 7, alphas, 1);
+	uintSrcRgb2 = vcopyq_lane_u16(uintSrcRgb2, 3, alphas, 2);
+	uintSrcRgb2 = vcopyq_lane_u16(uintSrcRgb2, 7, alphas, 3);
+
+	// cast 16bit to 8bit and reinterpret as uint32's
+	return vcombine_u32(vreinterpret_u32_u8(vmovn_u16(uintSrcRgb1)), vreinterpret_u32_u8(vmovn_u16(uintSrcRgb2)));
+}
+
+inline uint32x4_t blendTintSpriteSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas, bool light) {
+	// This function is NOT 1 to 1 with the original... It just approximates it
+	// It gets the value of the HSV of the dest color
+	// Then it gets the HSV of the srcCols
+
+	// how the values are transformed
+	// from 1 uint32x4_t srcCols with each lane being ARGB uint32
+	// srcCols[0] = A | R | G | B
+	// srcCols[1] = A | R | G | B
+	// srcCols[2] = A | R | G | B
+	// srcCols[3] = A | R | G | B
+	//  ->
+	// to 4 float32x4_t's each being a separate channel with each lane
+	// corresponding to their respective srcCols lane
+	// dda = { A[0], A[1], A[2], A[3] }
+	// ddr = { R[0], R[1], R[2], R[3] }
+	// ddg = { G[0], G[1], G[2], G[3] }
+	// ddb = { B[0], B[1], B[2], B[3] }
+
+	// do the transformation (we don't actually need alpha at all)
+	float32x4_t ddr, ddg, ddb;
+	ddr = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(vshrq_n_u32(destCols, 16), vmovq_n_u32(0xff))), 1.0 / 255.0);
+	ddg = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(vshrq_n_u32(destCols, 8), vmovq_n_u32(0xff))), 1.0 / 255.0);
+	ddb = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(destCols, vmovq_n_u32(0xff))), 1.0 / 255.0);
+	float32x4_t ssr, ssg, ssb;
+	ssr = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(vshrq_n_u32(srcCols, 16), vmovq_n_u32(0xff))), 1.0 / 255.0);
+	ssg = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(vshrq_n_u32(srcCols, 8), vmovq_n_u32(0xff))), 1.0 / 255.0);
+	ssb = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(srcCols, vmovq_n_u32(0xff))), 1.0 / 255.0);
+
+	// Get the maxes and mins (needed for HSV->RGB and vice versa)
+	float32x4_t dmaxes = vmaxq_f32(ddr, vmaxq_f32(ddg, ddb));
+	float32x4_t smaxes = vmaxq_f32(ssr, vmaxq_f32(ssg, ssb));
+	float32x4_t smins = vminq_f32(ssr, vminq_f32(ssg, ssb));
+
+	// This is here to stop us from dividing by 0
+	const float32x4_t eplison0 = vmovq_n_f32(0.0000001);
+
+	float32x4_t chroma = vmaxq_f32(vsubq_f32(smaxes, smins), eplison0);
+
+	// RGB to HSV is a piecewise function, so we compute each part of the function first...
+	float32x4_t hr, hg, hb, hue;
+	hr = vdivq_f32(vsubq_f32(ssg, ssb), chroma);
+	hr = vsubq_f32(hr, vmulq_n_f32(vrndmq_f32(vmulq_n_f32(hr, 1.0 / 6.0)), 6.0));
+	hg = vaddq_f32(vdivq_f32(vsubq_f32(ssb, ssr), chroma), vmovq_n_f32(2.0));
+	hb = vaddq_f32(vdivq_f32(vsubq_f32(ssr, ssg), chroma), vmovq_n_f32(4.0));
+
+	// And then compute which one will be used based on criteria
+	float32x4_t hrfactors = vcvtq_f32_u32(vandq_u32(vandq_u32(vceqq_f32(ssr, smaxes), vmvnq_u32(vceqq_u32(ssr, ssb))), vmovq_n_u32(1)));
+	float32x4_t hgfactors = vcvtq_f32_u32(vandq_u32(vandq_u32(vceqq_f32(ssg, smaxes), vmvnq_u32(vceqq_u32(ssg, ssr))), vmovq_n_u32(1)));
+	float32x4_t hbfactors = vcvtq_f32_u32(vandq_u32(vandq_u32(vceqq_f32(ssb, smaxes), vmvnq_u32(vceqq_u32(ssb, ssg))), vmovq_n_u32(1)));
+	hue = vmulq_f32(hr, hrfactors);
+	hue = vaddq_f32(hue, vmulq_f32(hg, hgfactors));
+	hue = vaddq_f32(hue, vmulq_f32(hb, hbfactors));
+
+	// Mess with the light like the original function
+	float32x4_t val = dmaxes;
+	if (light) {
+		val = vsubq_f32(val, vsubq_f32(vmovq_n_f32(1.0), vmulq_n_f32(vcvtq_f32_u32(alphas), 1.0 / 250.0)));
+		val = vmaxq_f32(val, vmovq_n_f32(0.0));
+	}
+		
+	// then it stitches the HSV back together
+	// the hue and saturation come from the source (tint) color, and the value comes from
+	// the destination (real source) color
+	chroma = vmulq_f32(val, vdivq_f32(vsubq_f32(smaxes, smins), vaddq_f32(smaxes, eplison0)));
+	float32x4_t hprime_mod2 = vmulq_n_f32(hue, 1.0 / 2.0);
+	hprime_mod2 = vmulq_n_f32(vsubq_f32(hprime_mod2, vrndmq_f32(hprime_mod2)), 2.0);
+	float32x4_t x = vmulq_f32(chroma, vsubq_f32(vmovq_n_f32(1.0), vabsq_f32(vsubq_f32(hprime_mod2, vmovq_n_f32(1.0)))));
+	uint32x4_t hprime_rounded = vcvtq_u32_f32(hue);
+	uint32x4_t x_int = vcvtq_u32_f32(vmulq_n_f32(x, 255.0));
+	uint32x4_t c_int = vcvtq_u32_f32(vmulq_n_f32(chroma, 255.0));
+
+	// Again HSV->RGB is also a piecewise function
+	uint32x4_t val0 = vorrq_u32(vshlq_n_u32(x_int, 8), vshlq_n_u32(c_int, 16));
+	val0 = vandq_u32(val0, vorrq_u32(vceqq_u32(hprime_rounded, vmovq_n_u32(0)), vceqq_u32(hprime_rounded, vmovq_n_u32(6))));
+	uint32x4_t val1 = vorrq_u32(vshlq_n_u32(c_int, 8), vshlq_n_u32(x_int, 16));
+	val1 = vandq_u32(val1, vceqq_u32(hprime_rounded, vmovq_n_u32(1)));
+	uint32x4_t val2 = vorrq_u32(vshlq_n_u32(c_int, 8), x_int);
+	val2 = vandq_u32(val2, vceqq_u32(hprime_rounded, vmovq_n_u32(2)));
+	uint32x4_t val3 = vorrq_u32(vshlq_n_u32(x_int, 8), c_int);
+	val3 = vandq_u32(val3, vceqq_u32(hprime_rounded, vmovq_n_u32(3)));
+	uint32x4_t val4 = vorrq_u32(vshlq_n_u32(x_int, 16), c_int);
+	val4 = vandq_u32(val4, vceqq_u32(hprime_rounded, vmovq_n_u32(4)));
+	uint32x4_t val5 = vorrq_u32(vshlq_n_u32(c_int, 16), x_int);
+	val5 = vandq_u32(val5, vceqq_u32(hprime_rounded, vmovq_n_u32(5)));
+
+	// or the values together
+	uint32x4_t final = vorrq_u32(val0, vorrq_u32(val1, vorrq_u32(val2, vorrq_u32(val3, vorrq_u32(val4, val5)))));
+
+	// add the minimums back in
+	uint32x4_t val_add = vcvtq_u32_f32(vmulq_n_f32(vsubq_f32(val, chroma), 255.0));
+	val_add = vorrq_u32(val_add, vorrq_u32(vshlq_n_u32(val_add, 8), vorrq_u32(vshlq_n_u32(val_add, 16), vandq_u32(destCols, vmovq_n_u32(0xff000000)))));
+	final = vaddq_u32(final, val_add);
+	return final;
+}
+
+inline uint32x4_t blendPixelSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas) {
+	uint32x4_t srcAlphas, difAlphas, mask, ch1, ch2;
+	auto setupArgbAlphas = [&]() {
+		// This acts the same as this in the normal blender functions
+		// if (alpha == 0)
+		//     alpha = aSrc;
+		// else
+		//     alpha = aSrc * ((alpha & 0xff) + 1) / 256;
+		// where alpha is the alpha byte of the srcCols
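+		// e.g. aSrc = 200, alpha = 128  =>  200 * 129 / 256 = 100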
+		srcAlphas = vshrq_n_u32(srcCols, 24);
+		difAlphas = vaddq_u32(vandq_u32(alphas, vmovq_n_u32(0xff)), vmovq_n_u32(1));
+		difAlphas = vshrq_n_u32(vmulq_u32(srcAlphas, difAlphas), 8);
+		difAlphas = vshlq_n_u32(difAlphas, 24);
+		srcAlphas = vshlq_n_u32(srcAlphas, 24);
+		mask = vceqq_u32(alphas, vmovq_n_u32(0));
+		srcAlphas = vandq_u32(srcAlphas, mask);
+		difAlphas = vandq_u32(difAlphas, vmvnq_u32(mask));
+		srcCols = vandq_u32(srcCols, vmovq_n_u32(0x00ffffff));
+		srcCols = vorrq_u32(srcCols, vorrq_u32(srcAlphas, difAlphas));
+	};
+	switch (_G(_blender_mode)) {
+	case kSourceAlphaBlender: // see BITMAP member function blendSourceAlpha
+		alphas = vshrq_n_u32(srcCols, 24);
+		return rgbBlendSIMD(srcCols, destCols, alphas, false);
+	case kArgbToArgbBlender: // see BITMAP member function blendArgbToArgb
+		setupArgbAlphas();
+		// only blend if alpha isn't 0, otherwise use destCols
+		mask = vcgtq_u32(vshrq_n_u32(srcCols, 24), vmovq_n_u32(0));
+		ch1 = vandq_u32(argbBlendSIMD(srcCols, destCols), mask);
+		ch2 = vandq_u32(destCols, vmvnq_u32(mask));
+		return vorrq_u32(ch1, ch2);
+	case kArgbToRgbBlender: // see BITMAP member function blendArgbToRgb
+		setupArgbAlphas();
+		return rgbBlendSIMD(srcCols, destCols, vshrq_n_u32(srcCols, 24), false);
+	case kRgbToArgbBlender: // see BITMAP member function blendRgbToArgb
+		// if alpha is NOT 0 or 255
+		ch2 = vandq_u32(srcCols, vmovq_n_u32(0x00ffffff));
+		ch2 = vorrq_u32(ch2, vshlq_n_u32(alphas, 24));
+		ch2 = argbBlendSIMD(ch2, destCols);
+		// if alpha is 0 or 255
+		ch1 = vorrq_u32(srcCols, vmovq_n_u32(0xff000000));
+		// mask and or them together
+		mask = vorrq_u32(vceqq_u32(alphas, vmovq_n_u32(0)), vceqq_u32(alphas, vmovq_n_u32(0xff)));
+		ch1 = vandq_u32(ch1, mask);
+		ch2 = vandq_u32(ch2, vmvnq_u32(mask));
+		return vorrq_u32(ch1, ch2);
+	case kRgbToRgbBlender: // see BITMAP member function blendRgbToRgb
+		return rgbBlendSIMD(srcCols, destCols, alphas, false);
+	case kAlphaPreservedBlenderMode: // see BITMAP member function blendPreserveAlpha
+		return rgbBlendSIMD(srcCols, destCols, alphas, true);
+	case kOpaqueBlenderMode: // see BITMAP member function blendOpaque
+		return vorrq_u32(srcCols, vmovq_n_u32(0xff000000));
+	case kAdditiveBlenderMode: // see BITMAP member function blendAdditiveAlpha
+		srcAlphas = vaddq_u32(vshrq_n_u32(srcCols, 24), vshrq_n_u32(destCols, 24));
+		srcAlphas = vminq_u32(srcAlphas, vmovq_n_u32(0xff));
+		srcCols = vandq_u32(srcCols, vmovq_n_u32(0x00ffffff));
+		return vorrq_u32(srcCols, vshlq_n_u32(srcAlphas, 24));
+	case kTintBlenderMode: // see BITMAP member function blendTintSprite
+		return blendTintSpriteSIMD(srcCols, destCols, alphas, false);
+	case kTintLightBlenderMode: // see BITMAP member function blendTintSprite
+		return blendTintSpriteSIMD(srcCols, destCols, alphas, true);
+	}
+}
+
+inline uint16x8_t blendPixelSIMD2Bpp(uint16x8_t srcCols, uint16x8_t destCols, uint16x8_t alphas) {
+	uint16x8_t mask, ch1, ch2;
+	switch (_G(_blender_mode)) {
+	case kSourceAlphaBlender:
+	case kOpaqueBlenderMode:
+	case kAdditiveBlenderMode:
+		return srcCols;
+	case kArgbToArgbBlender:
+	case kArgbToRgbBlender:
+		ch1 = vandq_u16(vmovq_n_u16(0xff), vceqq_u16(alphas, vmovq_n_u16(0)));
+		ch2 = vandq_u16(alphas, vcgtq_u16(alphas, vmovq_n_u16(0)));
+		alphas = vorrq_u16(ch1, ch2);
+	case kRgbToRgbBlender:
+	case kAlphaPreservedBlenderMode:
+		return rgbBlendSIMD2Bpp(srcCols, destCols, alphas);
+	case kRgbToArgbBlender:
+		mask = vorrq_u32(vceqq_u32(alphas, vmovq_n_u32(0)), vceqq_u32(alphas, vmovq_n_u32(255)));
+		ch1 = vandq_u32(srcCols, mask);
+		ch2 = vandq_u32(rgbBlendSIMD2Bpp(srcCols, destCols, alphas), vmvnq_u32(mask));
+		return vorrq_u32(ch1, ch2);
+	case kTintBlenderMode:
+	case kTintLightBlenderMode:
+		uint32x4_t srcColsLo = simd2BppTo4Bpp(vget_low_u16(srcCols));
+		uint32x4_t srcColsHi = simd2BppTo4Bpp(vget_high_u16(srcCols));
+		uint32x4_t destColsLo = simd2BppTo4Bpp(vget_low_u16(destCols));
+		uint32x4_t destColsHi = simd2BppTo4Bpp(vget_high_u16(destCols));
+		uint32x4_t alphasLo = vmovl_u16(vget_low_u16(alphas));
+		uint32x4_t alphasHi = vmovl_u16(vget_high_u16(alphas));
+		uint16x4_t lo = simd4BppTo2Bpp(blendTintSpriteSIMD(srcColsLo, destColsLo, alphasLo, _G(_blender_mode) == kTintLightBlenderMode));
+		uint16x4_t hi = simd4BppTo2Bpp(blendTintSpriteSIMD(srcColsHi, destColsHi, alphasHi, _G(_blender_mode) == kTintLightBlenderMode));
+		return vcombine_u16(lo, hi);
+	}
+}
+
+template<int DestBytesPerPixel, int SrcBytesPerPixel>
+inline void drawPixelSIMD(byte *destPtr, const byte *srcP2, uint32x4_t tint, uint32x4_t alphas, uint32x4_t maskedAlphas, uint32x4_t transColors, int xDir, int xCtrBpp, int srcAlpha, int skipTrans, bool horizFlip, bool useTint, uint32x4_t skipMask) {
+	uint32x4_t srcCols, destCol;
+
+	if (DestBytesPerPixel == 4)
+		destCol = vld1q_u32((uint32 *)destPtr);
+	else
+		destCol = simd2BppTo4Bpp(vld1_u16((uint16 *)destPtr));
+	if (SrcBytesPerPixel == 4)
+		srcCols = vld1q_u32((const uint32 *)(srcP2 + xDir * xCtrBpp));
+	else
+		srcCols = simd2BppTo4Bpp(vld1_u16((const uint16 *)(srcP2 + xDir * xCtrBpp)));
+	// we do this here because we need to check if we should skip the pixel before we blend it
+	uint32x4_t mask1 = skipTrans ? vceqq_u32(vandq_u32(srcCols, maskedAlphas), transColors) : vmovq_n_u32(0);
+	mask1 = vorrq_u32(mask1, skipMask);
+	if (srcAlpha != -1) {
+		// take useTint into account
+		if (useTint) {
+			srcCols = blendPixelSIMD(tint, srcCols, alphas);
+		} else {
+			srcCols = blendPixelSIMD(srcCols, destCol, alphas);
+		}
+	}
+	uint32x4_t destCols2 = vandq_u32(destCol, mask1);
+	uint32x4_t srcCols2 = vandq_u32(srcCols, vmvnq_u32(mask1));
+	uint32x4_t final = vorrq_u32(destCols2, srcCols2);
+	if (horizFlip) {
+		final = vrev64q_u32(final);
+		final = vcombine_u32(vget_high_u32(final), vget_low_u32(final));
+	}
+	if (DestBytesPerPixel == 4) {
+		vst1q_u32((uint32 *)destPtr, final);
+	} else {
+		vst1_u16((uint16 *)destPtr, simd4BppTo2Bpp(final));
+	}
+}
+
+inline void drawPixelSIMD2Bpp(byte *destPtr, const byte *srcP2, uint16x8_t tint, uint16x8_t alphas, uint16x8_t transColors, int xDir, int xCtrBpp, int srcAlpha, int skipTrans, bool horizFlip, bool useTint, uint16x8_t skipMask) {
+	uint16x8_t destCol = vld1q_u16((uint16 *)destPtr);
+	uint16x8_t srcCols = vld1q_u16((const uint16 *)(srcP2 + xDir * xCtrBpp));
+	uint16x8_t mask1 = skipTrans ? vceqq_u16(srcCols, transColors) : vmovq_n_u16(0);
+	mask1 = vorrq_u16(mask1, skipMask);
+	if (srcAlpha != -1) {
+		// take useTint into account
+		if (useTint) {
+			srcCols = blendPixelSIMD2Bpp(tint, srcCols, alphas);
+		} else {
+			srcCols = blendPixelSIMD2Bpp(srcCols, destCol, alphas);
+		}
+	}
+	uint16x8_t destCols2 = vandq_u16(destCol, mask1);
+	uint16x8_t srcCols2 = vandq_u16(srcCols, vmvnq_u16(mask1));
+	uint16x8_t final = vorrq_u16(destCols2, srcCols2);
+	if (horizFlip) {
+		final = vrev64q_u16(final);
+		final = vcombine_u16(vget_high_u16(final), vget_low_u16(final));
+	}
+	vst1q_u16((uint16 *)destPtr, final);
+}
+
+class DrawInnerImpl {
+public:
+
+// This template handles 2bpp and 4bpp, the other specializations handle 1bpp and format conversion blits
+template<int DestBytesPerPixel, int SrcBytesPerPixel, bool Scale>
+static void drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
+	const int xDir = args.horizFlip ? -1 : 1;
+	byte rSrc, gSrc, bSrc, aSrc;
+	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
+	uint32x4_t tint = vshlq_n_u32(vdupq_n_u32(args.srcAlpha), 24);
+	tint = vorrq_u32(tint, vshlq_n_u32(vdupq_n_u32(args.tintRed), 16));
+	tint = vorrq_u32(tint, vshlq_n_u32(vdupq_n_u32(args.tintGreen), 8));
+	tint = vorrq_u32(tint, vdupq_n_u32(args.tintBlue));
+	uint32x4_t maskedAlphas = vld1q_dup_u32(&args.alphaMask);
+	uint32x4_t transColors = vld1q_dup_u32(&args.transColor);
+	uint32x4_t alphas = vld1q_dup_u32(&args.srcAlpha);
+
+	// This is so that we can calculate what pixels to crop off in a vectorized way
+	uint32x4_t addIndexes = {0, 1, 2, 3};
+	if (args.horizFlip) addIndexes = {3, 2, 1, 0};
+
+	// This is so that we can calculate the pixel indexes for scaled drawing in parallel
+	uint32x4_t scaleAdds = {0, (uint32)args.scaleX, (uint32)args.scaleX*2, (uint32)args.scaleX*3};
+
+	// Clip the bounds ahead of time (so we don't waste time checking if we are in bounds when
+	// we are in the inner loop)
+	int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = args.dstRect.width();
+	if (args.xStart + xCtrWidth > args.destArea.w) {
+		xCtrWidth = args.destArea.w - args.xStart;
+	}
+	if (args.xStart < 0) {
+		xCtrStart = -args.xStart;
+		xCtrBppStart = xCtrStart * SrcBytesPerPixel;
+		args.xStart = 0;
+	}
+	int destY = args.yStart, srcYCtr = 0, yCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 4 == 0) ? args.dstRect.height() : (args.dstRect.height() - 1);
+	if (Scale) yCtrHeight = args.dstRect.height();
+	if (args.yStart < 0) {
+		yCtr = -args.yStart;
+		destY = 0;
+		if (Scale) {
+			scaleYCtr = yCtr * args.scaleY;
+			srcYCtr = scaleYCtr / BITMAP::SCALE_THRESHOLD;
+		}
+	}
+	if (args.yStart + yCtrHeight > args.destArea.h) {
+		yCtrHeight = args.destArea.h - args.yStart;
+	}
+	
+	byte *destP = (byte *)args.destArea.getBasePtr(0, destY);
+	const byte *srcP = (const byte *)args.src.getBasePtr(
+	                       args.horizFlip ? args.srcArea.right - 4 : args.srcArea.left,
+	                       args.vertFlip ? args.srcArea.bottom - 1 - yCtr : args.srcArea.top + yCtr);
+	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
+		uint32x4_t xCtrWidthSIMD = vdupq_n_u32(xCtrWidth); // This is the width of the row
+
+		if (!Scale) {
+			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
+				byte *destPtr = &destP[destX * DestBytesPerPixel];
+				// Skip pixels that are beyond the row
+				uint32x4_t skipMask = vcgeq_u32(vaddq_u32(vdupq_n_u32(xCtr), addIndexes), xCtrWidthSIMD);
+				drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, skipMask);
+			}
+			// Goto next row in source and destination image
+			destP += args.destArea.pitch;
+			srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
+		} else {
+			// Here we are scaling the image
+			int newSrcYCtr = scaleYCtr / BITMAP::SCALE_THRESHOLD;
+			// Since the source yctr might not update every row of the destination, we have
+			// to see if we are on a new row...
+			if (srcYCtr != newSrcYCtr) {
+				int diffSrcYCtr = newSrcYCtr - srcYCtr; // Have we moved yet
+				srcP += args.src.pitch * diffSrcYCtr;
+				srcYCtr = newSrcYCtr;
+			}
+
+			// Now also, since we might skip a pixel or 2 or duplicate one to reach the desired
+			// scaling size, we create a small dummy buffer that we copy the pixels into and then
+			// call the drawPixelSIMD function on it
+			byte srcBuffer[4*4];
+			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart, scaleXCtr = xCtrStart * args.scaleX; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
+				if (yCtr + 1 == yCtrHeight && xCtr + 4 > xCtrWidth) break; // Don't go past the last 4 pixels
+				uint32x4_t indexes = vdupq_n_u32(scaleXCtr);
+				// Calculate in parallel the indexes of the pixels
+				indexes = vmulq_n_u32(vshrq_n_u32(vaddq_u32(indexes, scaleAdds), BITMAP::SCALE_THRESHOLD_BITS), SrcBytesPerPixel);
+				// Simply memcpy them in. memcpy has no real performance overhead here
+				memcpy(&srcBuffer[0*(uintptr_t)SrcBytesPerPixel], srcP + vgetq_lane_u32(indexes, 0), SrcBytesPerPixel);
+				memcpy(&srcBuffer[1*(uintptr_t)SrcBytesPerPixel], srcP + vgetq_lane_u32(indexes, 1), SrcBytesPerPixel);
+				memcpy(&srcBuffer[2*(uintptr_t)SrcBytesPerPixel], srcP + vgetq_lane_u32(indexes, 2), SrcBytesPerPixel);
+				memcpy(&srcBuffer[3*(uintptr_t)SrcBytesPerPixel], srcP + vgetq_lane_u32(indexes, 3), SrcBytesPerPixel);
+				scaleXCtr += args.scaleX*4;
+
+				// Now this is pretty much the same as before with non-scaled code, except that we use
+				// our dummy source buffer instead of the actual source bitmap
+				byte *destPtr = &destP[destX * (uintptr_t)DestBytesPerPixel];
+				uint32x4_t skipMask = vcgeq_u32(vaddq_u32(vdupq_n_u32(xCtr), addIndexes), xCtrWidthSIMD);
+				drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, (const byte *)srcBuffer, tint, alphas, maskedAlphas, transColors, 1, 0, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, skipMask);
+			}
+			// Every row except the last one is drawn in this loop (on the last row we
+			// would have to check whether we fall off the edge of the row).
+			// The scaling path is the exception and draws up to the last group of 4
+			// pixels here, taking the extra checks, because:
+			// 1) if statements are costly, and the fewer we do the faster this loop is
+			// 2) this way the only branch in the normal drawing loop is the width check
+			// 3) the scaling code is already much slower than the normal drawing loop,
+			//    so the extra if checks cost relatively little and avoiding duplicate
+			//    code helps more here.
+			if (yCtr + 1 != yCtrHeight) destP += args.destArea.pitch;
+		}
+	}
+
+	// Get the last x values of the last row
+	int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart;
+	// The width is a multiple of 4, so there are no leftover pixels to draw
+	if (xCtrWidth % 4 == 0) return;
+	// Draw the remaining unscaled pixels of the last row here.
+	// Same as the loop above, but the loop condition stops before we would run
+	// past the end of the row, so no skip mask is needed.
+	if (!Scale) {
+		for (; xCtr + 4 < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
+			byte *destPtr = &destP[(ptrdiff_t)destX * DestBytesPerPixel];
+			drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, vmovq_n_u32(0));
+		}
+		// Because this loop moves in 4-pixel units while the horizFlip tail below moves
+		// 1 pixel at a time, move 1 pixel past the last pixel we did not blit, i.e. go
+		// forward 3 pixels.
+		if (args.horizFlip) srcP += SrcBytesPerPixel * 3;
+	} else {
+		// If we are scaling, reset xCtr to where the scaled loop left off (i.e. the last few pixels of the row)
+		xCtr = xCtrWidth - xCtrWidth % 4;
+		xCtrBpp = xCtr * SrcBytesPerPixel;
+		destX = args.xStart+xCtr;
+	}
+
+	// The remaining pixels are drawn serially, nothing special
+	for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += SrcBytesPerPixel) {
+		const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
+		if (Scale) {
+			srcColPtr = (const byte *)(srcP + (xCtr * args.scaleX) / BITMAP::SCALE_THRESHOLD * SrcBytesPerPixel);
+		}
+		byte *destVal = (byte *)&destP[destX * DestBytesPerPixel];
+		uint32 srcCol = args.dstBitmap.getColor(srcColPtr, SrcBytesPerPixel);
+		
+		// Check if this is a transparent color we should skip
+		if (args.skipTrans && ((srcCol & args.alphaMask) == args.transColor))
+			continue;
+
+		args.src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
+		if (args.srcAlpha != -1) {
+			if (args.useTint) {
+				rDest = rSrc;
+				gDest = gSrc;
+				bDest = bSrc;
+				aDest = aSrc;
+				rSrc = args.tintRed;
+				gSrc = args.tintGreen;
+				bSrc = args.tintBlue;
+				aSrc = args.srcAlpha;
+			}
+			args.dstBitmap.blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, args.srcAlpha, args.useTint, destVal);
+			srcCol = args.dstBitmap.format.ARGBToColor(aDest, rDest, gDest, bDest);
+		} else {
+			srcCol = args.dstBitmap.format.ARGBToColor(aSrc, rSrc, gSrc, bSrc);
+		}
+		if (DestBytesPerPixel == 4)
+			*(uint32 *)destVal = srcCol;
+		else
+			*(uint16 *)destVal = srcCol;
+	}
+}
+
+template<bool Scale>
+static void drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
+	const int xDir = args.horizFlip ? -1 : 1;
+	byte rSrc, gSrc, bSrc, aSrc;
+	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
+	uint16x8_t tint = vdupq_n_u16(args.src.format.ARGBToColor(args.srcAlpha, args.tintRed, args.tintGreen, args.tintBlue));
+	uint16x8_t transColors = vdupq_n_u16(args.transColor);
+	uint16x8_t alphas = vdupq_n_u16(args.srcAlpha);
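+	// The tint color, transparent color and source alpha are packed into 16-bit
+	// lanes once here, outside the loops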
+
+	// This is so that we can calculate what pixels to crop off in a vectorized way
+	uint16x8_t addIndexes = {0, 1, 2, 3, 4, 5, 6, 7};
+
+	// This is so that we can calculate in parallel the pixel indexes for scaled drawing
+	if (args.horizFlip) addIndexes = {7, 6, 5, 4, 3, 2, 1, 0};
+	uint32x4_t scaleAdds = {0, (uint32)args.scaleX, (uint32)args.scaleX*2, (uint32)args.scaleX*3};
+	uint32x4_t scaleAdds2 = {(uint32)args.scaleX*4, (uint32)args.scaleX*5, (uint32)args.scaleX*6, (uint32)args.scaleX*7};
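+	// The 8 per-pixel offsets are split across two 4-lane vectors because a 128-bit
+	// register only holds four 32-bit counters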
+
+	// Clip the bounds ahead of time (so we don't waste time checking if we are in bounds when
+	// we are in the inner loop)
+	int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = args.dstRect.width();
+	if (args.xStart + xCtrWidth > args.destArea.w) {
+		xCtrWidth = args.destArea.w - args.xStart;
+	}
+	if (args.xStart < 0) {
+		xCtrStart = -args.xStart;
+		xCtrBppStart = xCtrStart * 2;
+		args.xStart = 0;
+	}
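+	// When the width is not a multiple of 8, the last row is drawn separately after
+	// the main loop so the 8-pixel loads/stores never touch memory past the end of
+	// the surface; the scaling path instead breaks out before the final group and
+	// keeps the full height.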
+	int destY = args.yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 8 == 0) ? args.dstRect.height() : (args.dstRect.height() - 1);
+	if (Scale) yCtrHeight = args.dstRect.height();
+	if (args.yStart < 0) {
+		yCtr = -args.yStart;
+		destY = 0;
+		if (Scale) {
+			scaleYCtr = yCtr * args.scaleY;
+			srcYCtr = scaleYCtr / BITMAP::SCALE_THRESHOLD;
+		}
+	}
+	if (args.yStart + yCtrHeight > args.destArea.h) {
+		yCtrHeight = args.destArea.h - args.yStart;
+	}
+	
+	byte *destP = (byte *)args.destArea.getBasePtr(0, destY);
+	const byte *srcP = (const byte *)args.src.getBasePtr(
+	                       args.horizFlip ? args.srcArea.right - 8 : args.srcArea.left,
+	                       args.vertFlip ? args.srcArea.bottom - 1 - yCtr : args.srcArea.top + yCtr);
+	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
+		uint16x8_t xCtrWidthSIMD = vmovq_n_u16(xCtrWidth); // This is the width of the row
+		if (!Scale) {
+			// If we are not scaling the image
+			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
+				byte *destPtr = &destP[destX * 2];
+				// Skip pixels that are beyond the row
+				uint16x8_t skipMask = vcgeq_u16(vaddq_u16(vdupq_n_u16(xCtr), addIndexes), xCtrWidthSIMD);
+				drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, skipMask);
+			}
+			// Goto next row in source and destination image
+			destP += args.destArea.pitch;
+			srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
+		} else {
+			// Here we are scaling the image
+			int newSrcYCtr = scaleYCtr / BITMAP::SCALE_THRESHOLD;
+			// Since the source yctr might not update every row of the destination, we have
+			// to see if we are on a new row...
+			if (srcYCtr != newSrcYCtr) {
+				int diffSrcYCtr = newSrcYCtr - srcYCtr;
+				srcP += args.src.pitch * diffSrcYCtr;
+				srcYCtr = newSrcYCtr;
+			}
+
+			// Since we might skip a pixel or two, or duplicate one, to reach the desired
+			// scaled size, we gather the source pixels into a small dummy buffer and then
+			// call drawPixelSIMD2Bpp on that buffer
+			uint16 srcBuffer[8];
+			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart, scaleXCtr = xCtrStart * args.scaleX; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
+				if (yCtr + 1 == yCtrHeight && xCtr + 8 > xCtrWidth) break;
+				uint32x4_t indexes = vdupq_n_u32(scaleXCtr), indexes2 = vdupq_n_u32(scaleXCtr);
+				// Calculate in parallel the indexes of the pixels
+				indexes = vmulq_n_u32(vshrq_n_u32(vaddq_u32(indexes, scaleAdds), BITMAP::SCALE_THRESHOLD_BITS), 2);
+				indexes2 = vmulq_n_u32(vshrq_n_u32(vaddq_u32(indexes2, scaleAdds2), BITMAP::SCALE_THRESHOLD_BITS), 2);
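+				// The shifted counters are source pixel indexes; multiplying by 2 turns them
+				// into byte offsets for the 16-bit pixels.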
+				// Load the gathered pixels directly as uint16s
+				srcBuffer[0] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes, 0));
+				srcBuffer[1] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes, 1));
+				srcBuffer[2] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes, 2));
+				srcBuffer[3] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes, 3));
+				srcBuffer[4] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes2, 0));
+				srcBuffer[5] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes2, 1));
+				srcBuffer[6] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes2, 2));
+				srcBuffer[7] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes2, 3));
+				scaleXCtr += args.scaleX*8;
+
+				// This is pretty much the same as the non-scaled code above, except that we use
+				// our dummy source buffer instead of the actual source bitmap
+				byte *destPtr = &destP[destX * 2];
+				uint16x8_t skipMask = vcgeq_u16(vaddq_u16(vdupq_n_u16(xCtr), addIndexes), xCtrWidthSIMD);
+				drawPixelSIMD2Bpp(destPtr, (const byte *)srcBuffer, tint, alphas, transColors, 1, 0, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, skipMask);
+			}
+			// Every row except the last one is drawn in this loop (on the last row we
+			// would have to check whether we fall off the edge of the row).
+			// The scaling path is the exception and draws up to the last group of 8
+			// pixels here, taking the extra checks, because:
+			// 1) if statements are costly, and the fewer we do the faster this loop is
+			// 2) this way the only branch in the normal drawing loop is the width check
+			// 3) the scaling code is already much slower than the normal drawing loop,
+			//    so the extra if checks cost relatively little and avoiding duplicate
+			//    code helps more here.
+			if (yCtr + 1 != yCtrHeight) destP += args.destArea.pitch;
+		}
+	}
+
+	// The width is a multiple of 8, so there are no leftover pixels to draw
+	if (xCtrWidth % 8 == 0) return;
+	// Get the last x values of the last row
+	int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart;
+	// Draw the remaining unscaled pixels of the last row here.
+	// Same as the loop above, but the loop condition stops before we would run
+	// past the end of the row, so no skip mask is needed.
+	if (!Scale) {
+		for (; xCtr + 8 < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
+			byte *destPtr = &destP[destX * 2];
+			drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, vmovq_n_u16(0));
+		}
+		// Because this loop moves in 8-pixel units while the horizFlip tail below moves
+		// 1 pixel at a time, move 1 pixel past the last pixel we did not blit, i.e. go
+		// forward 7 pixels.
+		if (args.horizFlip) srcP += 2 * 7;
+	} else {
+		// If we are scaling, reset xCtr to where the scaled loop left off (i.e. the last few pixels of the row)
+		xCtr = xCtrWidth - xCtrWidth % 8;
+		xCtrBpp = xCtr * 2;
+		destX = args.xStart+xCtr;
+	}
+
+	// The remaining pixels are drawn serially, nothing special
+	for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += 2) {
+		const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
+		if (Scale) {
+			srcColPtr = (const byte *)(srcP + (xCtr * args.scaleX) / BITMAP::SCALE_THRESHOLD * 2);
+		}
+		byte *destVal = (byte *)&destP[destX * 2];
+		uint32 srcCol = (uint32)(*(const uint16 *)srcColPtr);
+		
+		// Check if this is a transparent color we should skip
+		if (args.skipTrans && srcCol == args.transColor)
+			continue;
+
+		args.src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
+		if (args.srcAlpha != -1) {
+			if (args.useTint) {
+				rDest = rSrc;
+				gDest = gSrc;
+				bDest = bSrc;
+				aDest = aSrc;
+				rSrc = args.tintRed;
+				gSrc = args.tintGreen;
+				bSrc = args.tintBlue;
+				aSrc = args.srcAlpha;
+			}/* else {
+				format.colorToARGB((uint32)(*(uint16 *)destVal), aDest, rDest, gDest, bDest);
+			}*/
+			args.dstBitmap.blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, args.srcAlpha, args.useTint, destVal);
+			srcCol = args.dstBitmap.format.ARGBToColor(aDest, rDest, gDest, bDest);
+		} else {
+			srcCol = args.dstBitmap.format.ARGBToColor(aSrc, rSrc, gSrc, bSrc);
+		}
+		*(uint16 *)destVal = srcCol;
+	}
+}
+
+template<bool Scale>
+static void drawInner1Bpp(BITMAP::DrawInnerArgs &args) {
+	const int xDir = args.horizFlip ? -1 : 1;
+	uint8x16_t transColors = vld1q_dup_u8(&args.transColor);
+
+	// This is so that we can calculate in parallel the pixel indexes for scaled drawing
+	uint32x4_t scaleAdds1 = {0, (uint32)args.scaleX, (uint32)args.scaleX*2, (uint32)args.scaleX*3};
+	uint32x4_t scaleAdds2 = {(uint32)args.scaleX*4, (uint32)args.scaleX*5, (uint32)args.scaleX*6, (uint32)args.scaleX*7};
+	uint32x4_t scaleAdds3 = {(uint32)args.scaleX*8, (uint32)args.scaleX*9, (uint32)args.scaleX*10, (uint32)args.scaleX*11};
+	uint32x4_t scaleAdds4 = {(uint32)args.scaleX*12, (uint32)args.scaleX*13, (uint32)args.scaleX*14, (uint32)args.scaleX*15};
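+	// Four 4-lane vectors of offsets together cover the 16 pixels handled per
+	// iteration of the inner loop below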
+	
+	// Clip the bounds ahead of time (so we don't waste time checking if we are in bounds when
+	// we are in the inner loop)
+	int xCtrStart = 0, xCtrWidth = args.dstRect.width();
+	if (args.xStart + xCtrWidth > args.destArea.w) {
+		xCtrWidth = args.destArea.w - args.xStart;
+	}
+	if (args.xStart < 0) {
+		xCtrStart = -args.xStart;
+		args.xStart = 0;
+	}
+	int destY = args.yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = args.dstRect.height();
+	if (Scale) yCtrHeight = args.dstRect.height();
+	if (args.yStart < 0) {
+		yCtr = -args.yStart;
+		destY = 0;
+		if (Scale) {
+			scaleYCtr = yCtr * args.scaleY;
+			srcYCtr = scaleYCtr / BITMAP::SCALE_THRESHOLD;
+		}
+	}
+	if (args.yStart + yCtrHeight > args.destArea.h) {
+		yCtrHeight = args.destArea.h - args.yStart;
+	}
+	
+	byte *destP = (byte *)args.destArea.getBasePtr(0, destY);
+	const byte *srcP = (const byte *)args.src.getBasePtr(
+	                       args.horizFlip ? args.srcArea.right - 16 : args.srcArea.left,
+	                       args.vertFlip ? args.srcArea.bottom - 1 - yCtr : args.srcArea.top + yCtr);
+	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
+		if (Scale) {
+			// Here we update srcYCtr differently because we are scaling
+			int newSrcYCtr = scaleYCtr / BITMAP::SCALE_THRESHOLD;
+			if (srcYCtr != newSrcYCtr) {
+				// Since the source yctr might not update every row of the destination, we have
+				// to see if we are on a new row...
+				int diffSrcYCtr = newSrcYCtr - srcYCtr;
+				srcP += args.src.pitch * diffSrcYCtr;
+				srcYCtr = newSrcYCtr;
+			}
+		}
+		int xCtr = xCtrStart, destX = args.xStart, scaleXCtr = xCtrStart * args.scaleX;
+		for (; xCtr + 16 < xCtrWidth; destX += 16, xCtr += 16) {
+			byte *destPtr = &destP[destX];
+
+			// Here we don't use the drawPixelSIMD function because 1bpp bitmaps in Allegro
+			// can't have any blending applied to them
+			uint8x16_t destCols = vld1q_u8(destPtr);
+			uint8x16_t srcCols = vld1q_u8(srcP + xDir * xCtr);
+			if (Scale) {
+				// If we are scaling, we have to set each pixel individually
+				uint32x4_t indexes1 = vdupq_n_u32(scaleXCtr), indexes2 = vdupq_n_u32(scaleXCtr);
+				uint32x4_t indexes3 = vdupq_n_u32(scaleXCtr), indexes4 = vdupq_n_u32(scaleXCtr);
+				indexes1 = vshrq_n_u32(vaddq_u32(indexes1, scaleAdds1), BITMAP::SCALE_THRESHOLD_BITS);
+				indexes2 = vshrq_n_u32(vaddq_u32(indexes2, scaleAdds2), BITMAP::SCALE_THRESHOLD_BITS);
+				indexes3 = vshrq_n_u32(vaddq_u32(indexes3, scaleAdds3), BITMAP::SCALE_THRESHOLD_BITS);
+				indexes4 = vshrq_n_u32(vaddq_u32(indexes4, scaleAdds4), BITMAP::SCALE_THRESHOLD_BITS);
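+				// Each lane now holds the source x index for one of the 16 destination pixels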
+				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes1, 0)], srcCols, 0);
+				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes1, 1)], srcCols, 1);
+				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes1, 2)], srcCols, 2);
+				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes1, 3)], srcCols, 3);
+				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes2, 0)], srcCols, 4);
+				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes2, 1)], srcCols, 5);
+				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes2, 2)], srcCols, 6);
+				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes2, 3)], srcCols, 7);
+				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes3, 0)], srcCols, 8);
+				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes3, 1)], srcCols, 9);
+				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes3, 2)], srcCols, 10);
+				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes3, 3)], srcCols, 11);
+				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes4, 0)], srcCols, 12);
+				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes4, 1)], srcCols, 13);
+				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes4, 2)], srcCols, 14);
+				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes4, 3)], srcCols, 15);
+				scaleXCtr += args.scaleX*16;
+			}
+
+			// Mask out transparent pixels
+			uint8x16_t mask1 = args.skipTrans ? vceqq_u8(srcCols, transColors) : vmovq_n_u8(0);
+			uint8x16_t final = vorrq_u8(vandq_u8(srcCols, vmvnq_u8(mask1)), vandq_u8(destCols, mask1));
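+			// i.e. final = (src & ~mask) | (dest & mask): wherever a source byte equals the
+			// transparent color, the destination byte is kept instead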
+			if (args.horizFlip) {
+				final = vrev64q_u8(final);
+				final = vcombine_u8(vget_high_u8(final), vget_low_u8(final));
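+				// vrev64q_u8 reverses the bytes within each 64-bit half, and swapping the
+				// halves with vcombine_u8 completes the full 16-byte reversal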
+			}
+			vst1q_u8(destPtr, final);
+		}
+		// Get the last x values
+
+		// Because we move in 16 pixel units, and horizFlip moves in 1, we have to move
+		// 1 pixel past the last pixel we did not blit, meaning going forward 15 pixels.
+		if (args.horizFlip) srcP += 15;
+		for (; xCtr < xCtrWidth; ++destX, ++xCtr, scaleXCtr += args.scaleX) {
+			const byte *srcCol = (const byte *)(srcP + xDir * xCtr);
+			if (Scale) {
+				srcCol = (const byte *)(srcP + scaleXCtr / BITMAP::SCALE_THRESHOLD);
+			}
+			// Check if this is a transparent color we should skip
+			if (args.skipTrans && *srcCol == args.transColor)
+				continue;
+
+			byte *destVal = (byte *)&destP[destX];
+			*destVal = *srcCol;
+		}
+		if (args.horizFlip) srcP -= 15; // Undo what we did up there
+		destP += args.destArea.pitch; // Goto next row
+		// Only advance the src row by 1 every time like this if we don't scale
+		if (!Scale) srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
+	}
+}
+
+}; // end of class DrawInnerImpl
+
+template<bool Scale>
+void BITMAP::drawNEON(DrawInnerArgs &args) {
+	if (args.sameFormat) {
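+		// Source and destination share a pixel format, so pick the path purely by
+		// bytes per pixel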
+		switch (format.bytesPerPixel) {
+		case 1: DrawInnerImpl::drawInner1Bpp<Scale>(args); break;
+		case 2: DrawInnerImpl::drawInner2Bpp<Scale>(args); break;
+		case 4: DrawInnerImpl::drawInner4BppWithConv<4, 4, Scale>(args); break;
+		}
+	} else if (format.bytesPerPixel == 4 && args.src.format.bytesPerPixel == 2) { 
+		DrawInnerImpl::drawInner4BppWithConv<4, 2, Scale>(args);
+	} else if (format.bytesPerPixel == 2 && args.src.format.bytesPerPixel == 4) {
+		DrawInnerImpl::drawInner4BppWithConv<2, 4, Scale>(args);
+	}
+}
+
+template void BITMAP::drawNEON<false>(DrawInnerArgs &);
+template void BITMAP::drawNEON<true>(DrawInnerArgs &);
+
+} // namespace AGS3
diff --git a/engines/ags/lib/allegro/surface_simd_neon.cpp b/engines/ags/lib/allegro/surface_simd_neon.cpp
deleted file mode 100644
index 79d2f57bc0c..00000000000
--- a/engines/ags/lib/allegro/surface_simd_neon.cpp
+++ /dev/null
@@ -1,483 +0,0 @@
-#include "ags/lib/allegro/surface_simd_neon.h"
-
-#ifdef AGS_LIB_ALLEGRO_SURFACE_SIMD_NEON_IMPL
-
-#include "ags/lib/allegro/gfx.h"
-#include "ags/lib/allegro/color.h"
-#include "ags/lib/allegro/flood.h"
-#include "ags/ags.h"
-#include "ags/globals.h"
-#include "common/textconsole.h"
-#include "graphics/screen.h"
-
-namespace AGS3 {
-
-// This template handles 2bpp and 4bpp, the other specializations handle 1bpp and format conversion blits
-template<int DestBytesPerPixel, int SrcBytesPerPixel, bool Scale>
-void BITMAP::drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
-	const int xDir = args.horizFlip ? -1 : 1;
-	byte rSrc, gSrc, bSrc, aSrc;
-	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
-	uint32x4_t tint = vshlq_n_u32(vdupq_n_u32(args.srcAlpha), 24);
-	tint = vorrq_u32(tint, vshlq_n_u32(vdupq_n_u32(args.tintRed), 16));
-	tint = vorrq_u32(tint, vshlq_n_u32(vdupq_n_u32(args.tintGreen), 8));
-	tint = vorrq_u32(tint, vdupq_n_u32(args.tintBlue));
-	uint32x4_t maskedAlphas = vld1q_dup_u32(&args.alphaMask);
-	uint32x4_t transColors = vld1q_dup_u32(&args.transColor);
-	uint32x4_t alphas = vld1q_dup_u32(&args.srcAlpha);
-
-	// This is so that we can calculate what pixels to crop off in a vectorized way
-	uint32x4_t addIndexes = {0, 1, 2, 3};
-	if (args.horizFlip) addIndexes = {3, 2, 1, 0};
-
-	// This is so that we can calculate in parralell the pixel indexes for scaled drawing
-	uint32x4_t scaleAdds = {0, (uint32)args.scaleX, (uint32)args.scaleX*2, (uint32)args.scaleX*3};
-
-	// Clip the bounds ahead of time (so we don't waste time checking if we are in bounds when
-	// we are in the inner loop)
-	int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = args.dstRect.width();
-	if (args.xStart + xCtrWidth > args.destArea.w) {
-		xCtrWidth = args.destArea.w - args.xStart;
-	}
-	if (args.xStart < 0) {
-		xCtrStart = -args.xStart;
-		xCtrBppStart = xCtrStart * SrcBytesPerPixel;
-		args.xStart = 0;
-	}
-	int destY = args.yStart, srcYCtr = 0, yCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 4 == 0) ? args.dstRect.height() : (args.dstRect.height() - 1);
-	if (Scale) yCtrHeight = args.dstRect.height();
-	if (args.yStart < 0) {
-		yCtr = -args.yStart;
-		destY = 0;
-		if (Scale) {
-			scaleYCtr = yCtr * args.scaleY;
-			srcYCtr = scaleYCtr / SCALE_THRESHOLD;
-		}
-	}
-	if (args.yStart + yCtrHeight > args.destArea.h) {
-		yCtrHeight = args.destArea.h - args.yStart;
-	}
-	
-	byte *destP = (byte *)args.destArea.getBasePtr(0, destY);
-	const byte *srcP = (const byte *)args.src.getBasePtr(
-	                       args.horizFlip ? args.srcArea.right - 4 : args.srcArea.left,
-	                       args.vertFlip ? args.srcArea.bottom - 1 - yCtr : args.srcArea.top + yCtr);
-	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
-		uint32x4_t xCtrWidthSIMD = vdupq_n_u32(xCtrWidth); // This is the width of the row
-
-		if (!Scale) {
-			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
-				byte *destPtr = &destP[destX * DestBytesPerPixel];
-				// Skip pixels that are beyond the row
-				uint32x4_t skipMask = vcgeq_u32(vaddq_u32(vdupq_n_u32(xCtr), addIndexes), xCtrWidthSIMD);
-				drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, skipMask);
-			}
-			// Goto next row in source and destination image
-			destP += args.destArea.pitch;
-			srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
-		} else {
-			// Here we are scaling the image
-			int newSrcYCtr = scaleYCtr / SCALE_THRESHOLD;
-			// Since the source yctr might not update every row of the destination, we have
-			// to see if we are on a new row...
-			if (srcYCtr != newSrcYCtr) {
-				int diffSrcYCtr = newSrcYCtr - srcYCtr; // Have we moved yet
-				srcP += args.src.pitch * diffSrcYCtr;
-				srcYCtr = newSrcYCtr;
-			}
-
-			// Now also since we might skip a pixel or 2 or duplicate one to reach the desired
-			// scaling size, we create a small dummy buffer that we copy the pixels into and then
-			// call the drawPixelsSIMD function
-			byte srcBuffer[4*4];
-			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart, scaleXCtr = xCtrStart * args.scaleX; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
-				if (yCtr + 1 == yCtrHeight && xCtr + 4 > xCtrWidth) break; // Don't go past the last 4 pixels
-				uint32x4_t indexes = vdupq_n_u32(scaleXCtr);
-				// Calculate in parallel the indexes of the pixels
-				indexes = vmulq_n_u32(vshrq_n_u32(vaddq_u32(indexes, scaleAdds), SCALE_THRESHOLD_BITS), SrcBytesPerPixel);
-				// Simply memcpy them in. memcpy has no real performance overhead here
-				memcpy(&srcBuffer[0*(uintptr_t)SrcBytesPerPixel], srcP + vgetq_lane_u32(indexes, 0), SrcBytesPerPixel);
-				memcpy(&srcBuffer[1*(uintptr_t)SrcBytesPerPixel], srcP + vgetq_lane_u32(indexes, 1), SrcBytesPerPixel);
-				memcpy(&srcBuffer[2*(uintptr_t)SrcBytesPerPixel], srcP + vgetq_lane_u32(indexes, 2), SrcBytesPerPixel);
-				memcpy(&srcBuffer[3*(uintptr_t)SrcBytesPerPixel], srcP + vgetq_lane_u32(indexes, 3), SrcBytesPerPixel);
-				scaleXCtr += args.scaleX*4;
-
-				// Now this is pretty much the same as before with non-scaled code, except that we use
-				// our dummy source buffer instead of the actuall source bitmap
-				byte *destPtr = &destP[destX * (uintptr_t)DestBytesPerPixel];
-				uint32x4_t skipMask = vcgeq_u32(vaddq_u32(vdupq_n_u32(xCtr), addIndexes), xCtrWidthSIMD);
-				drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, (const byte *)srcBuffer, tint, alphas, maskedAlphas, transColors, 1, 0, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, skipMask);
-			}
-			// We calculate every row here except the last (because then we need to
-			// check for if we fall off the edge of the row)
-			// The only exception here is scaling drawing this is because:
-			// 1) if statements are costly, and the less we do the faster this loop is
-			// 2) with this, the only branch in the normal drawing loop is the width check
-			// 3) the scaling code will actually draw the until the last 4 pixels of the image
-			//    and do the extra if checks because the scaling code is already much slower
-			//    than the normal drawing loop, and the less duplicate code helps here.
-			if (yCtr + 1 != yCtrHeight) destP += args.destArea.pitch;
-		}
-	}
-
-	// Get the last x values of the last row
-	int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart;
-	// We have a picture that is a multiple of 4, so no extra pixels to draw
-	if (xCtrWidth % 4 == 0) return;
-	// Drawing the last few not scaled pixels here.
-	// Same as the loop above but now we check if we are going to overflow,
-	// and thus we don't need to mask out pixels that go over the row.
-	if (!Scale) {
-		for (; xCtr + 4 < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
-			byte *destPtr = &destP[(ptrdiff_t)destX * DestBytesPerPixel];
-			drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, vmovq_n_u32(0));
-		}
-		// Because we move in 4 pixel units, and horizFlip moves in 1, we have to move
-		// 1 pixel past the last pixel we did not blit, meaning going forward 3 pixels.
-		if (args.horizFlip) srcP += SrcBytesPerPixel * 3;
-	} else {
-		// So if we are scaling, set up the xCtr to what it was before (AKA the last 4 or so pixels of the image)
-		xCtr = xCtrWidth - xCtrWidth % 4;
-		xCtrBpp = xCtr * SrcBytesPerPixel;
-		destX = args.xStart+xCtr;
-	}
-
-	// For the last 4 pixels, we just do them in serial, nothing special
-	for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += SrcBytesPerPixel) {
-		const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
-		if (Scale) {
-			srcColPtr = (const byte *)(srcP + (xCtr * args.scaleX) / SCALE_THRESHOLD * SrcBytesPerPixel);
-		}
-		byte *destVal = (byte *)&destP[destX * DestBytesPerPixel];
-		uint32 srcCol = getColor(srcColPtr, SrcBytesPerPixel);
-		
-		// Check if this is a transparent color we should skip
-		if (args.skipTrans && ((srcCol & args.alphaMask) == args.transColor))
-			continue;
-
-		args.src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
-		if (args.srcAlpha != -1) {
-			if (args.useTint) {
-				rDest = rSrc;
-				gDest = gSrc;
-				bDest = bSrc;
-				aDest = aSrc;
-				rSrc = args.tintRed;
-				gSrc = args.tintGreen;
-				bSrc = args.tintBlue;
-				aSrc = args.srcAlpha;
-			}
-			blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, args.srcAlpha, args.useTint, destVal);
-			srcCol = format.ARGBToColor(aDest, rDest, gDest, bDest);
-		} else {
-			srcCol = format.ARGBToColor(aSrc, rSrc, gSrc, bSrc);
-		}
-		if (DestBytesPerPixel == 4)
-			*(uint32 *)destVal = srcCol;
-		else
-			*(uint16 *)destVal = srcCol;
-	}
-}
-
-template<bool Scale>
-void BITMAP::drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
-	const int xDir = args.horizFlip ? -1 : 1;
-	byte rSrc, gSrc, bSrc, aSrc;
-	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
-	uint16x8_t tint = vdupq_n_u16(args.src.format.ARGBToColor(args.srcAlpha, args.tintRed, args.tintGreen, args.tintBlue));
-	uint16x8_t transColors = vdupq_n_u16(args.transColor);
-	uint16x8_t alphas = vdupq_n_u16(args.srcAlpha);
-
-	// This is so that we can calculate what pixels to crop off in a vectorized way
-	uint16x8_t addIndexes = {0, 1, 2, 3, 4, 5, 6, 7};
-
-	// This is so that we can calculate in parralell the pixel indexes for scaled drawing
-	if (args.horizFlip) addIndexes = {7, 6, 5, 4, 3, 2, 1, 0};
-	uint32x4_t scaleAdds = {0, (uint32)args.scaleX, (uint32)args.scaleX*2, (uint32)args.scaleX*3};
-	uint32x4_t scaleAdds2 = {(uint32)args.scaleX*4, (uint32)args.scaleX*5, (uint32)args.scaleX*6, (uint32)args.scaleX*7};
-
-	// Clip the bounds ahead of time (so we don't waste time checking if we are in bounds when
-	// we are in the inner loop)
-	int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = args.dstRect.width();
-	if (args.xStart + xCtrWidth > args.destArea.w) {
-		xCtrWidth = args.destArea.w - args.xStart;
-	}
-	if (args.xStart < 0) {
-		xCtrStart = -args.xStart;
-		xCtrBppStart = xCtrStart * 2;
-		args.xStart = 0;
-	}
-	int destY = args.yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 8 == 0) ? args.dstRect.height() : (args.dstRect.height() - 1);
-	if (Scale) yCtrHeight = args.dstRect.height();
-	if (args.yStart < 0) {
-		yCtr = -args.yStart;
-		destY = 0;
-		if (Scale) {
-			scaleYCtr = yCtr * args.scaleY;
-			srcYCtr = scaleYCtr / SCALE_THRESHOLD;
-		}
-	}
-	if (args.yStart + yCtrHeight > args.destArea.h) {
-		yCtrHeight = args.destArea.h - args.yStart;
-	}
-	
-	byte *destP = (byte *)args.destArea.getBasePtr(0, destY);
-	const byte *srcP = (const byte *)args.src.getBasePtr(
-	                       args.horizFlip ? args.srcArea.right - 8 : args.srcArea.left,
-	                       args.vertFlip ? args.srcArea.bottom - 1 - yCtr : args.srcArea.top + yCtr);
-	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
-		uint16x8_t xCtrWidthSIMD = vmovq_n_u16(xCtrWidth); // This is the width of the row
-		if (!Scale) {
-			// If we are not scaling the image
-			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
-				byte *destPtr = &destP[destX * 2];
-				// Skip pixels that are beyond the row
-				uint16x8_t skipMask = vcgeq_u16(vaddq_u16(vdupq_n_u16(xCtr), addIndexes), xCtrWidthSIMD);
-				drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, skipMask);
-			}
-			// Goto next row in source and destination image
-			destP += args.destArea.pitch;
-			srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
-		} else {
-			// Here we are scaling the image
-			int newSrcYCtr = scaleYCtr / SCALE_THRESHOLD;
-			// Since the source yctr might not update every row of the destination, we have
-			// to see if we are on a new row...
-			if (srcYCtr != newSrcYCtr) {
-				int diffSrcYCtr = newSrcYCtr - srcYCtr;
-				srcP += args.src.pitch * diffSrcYCtr;
-				srcYCtr = newSrcYCtr;
-			}
-
-			// Now also since we might skip a pixel or 2 or duplicate one to reach the desired
-			// scaling size, we create a small dummy buffer that we copy the pixels into and then
-			// call the drawPixelsSIMD function
-			uint16 srcBuffer[8];
-			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart, scaleXCtr = xCtrStart * args.scaleX; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
-				if (yCtr + 1 == yCtrHeight && xCtr + 8 > xCtrWidth) break;
-				uint32x4_t indexes = vdupq_n_u32(scaleXCtr), indexes2 = vdupq_n_u32(scaleXCtr);
-				// Calculate in parallel the indexes of the pixels
-				indexes = vmulq_n_u32(vshrq_n_u32(vaddq_u32(indexes, scaleAdds), SCALE_THRESHOLD_BITS), 2);
-				indexes2 = vmulq_n_u32(vshrq_n_u32(vaddq_u32(indexes2, scaleAdds2), SCALE_THRESHOLD_BITS), 2);
-				// Simply memcpy them in. memcpy has no real performance overhead here
-				srcBuffer[0] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes, 0));
-				srcBuffer[1] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes, 1));
-				srcBuffer[2] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes, 2));
-				srcBuffer[3] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes, 3));
-				srcBuffer[4] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes2, 0));
-				srcBuffer[5] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes2, 1));
-				srcBuffer[6] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes2, 2));
-				srcBuffer[7] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes2, 3));
-				scaleXCtr += args.scaleX*8;
-
-				// Now this is pretty much the same as before with non-scaled code, except that we use
-				// our dummy source buffer instead of the actuall source bitmap
-				byte *destPtr = &destP[destX * 2];
-				uint16x8_t skipMask = vcgeq_u16(vaddq_u16(vdupq_n_u16(xCtr), addIndexes), xCtrWidthSIMD);
-				drawPixelSIMD2Bpp(destPtr, (const byte *)srcBuffer, tint, alphas, transColors, 1, 0, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, skipMask);
-			}
-			// We calculate every row here except the last (because then we need to
-			// check for if we fall off the edge of the row)
-			// The only exception here is scaling drawing this is because:
-			// 1) if statements are costly, and the less we do the faster this loop is
-			// 2) with this, the only branch in the normal drawing loop is the width check
-			// 3) the scaling code will actually draw the until the last 4 pixels of the image
-			//    and do the extra if checks because the scaling code is already much slower
-			//    than the normal drawing loop, and the less duplicate code helps here.
-			if (yCtr + 1 != yCtrHeight) destP += args.destArea.pitch;
-		}
-	}
-
-	// We have a picture that is a multiple of 8, so no extra pixels to draw
-	if (xCtrWidth % 8 == 0) return;
-	// Get the last x values of the last row
-	int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart;
-	// Drawing the last few not scaled pixels here.
-	// Same as the loop above but now we check if we are going to overflow,
-	// and thus we don't need to mask out pixels that go over the row.
-	if (!Scale) {
-		for (; xCtr + 8 < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
-			byte *destPtr = &destP[destX * 2];
-			drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, vmovq_n_u16(0));
-		}
-		// Because we move in 8 pixel units, and horizFlip moves in 1, we have to move
-		// 1 pixel past the last pixel we did not blit, meaning going forward 7 pixels.
-		if (args.horizFlip) srcP += 2 * 7;
-	} else {
-		// So if we are scaling, set up the xCtr to what it was before (AKA the last 8 or so pixels of the image)
-		xCtr = xCtrWidth - xCtrWidth % 8;
-		xCtrBpp = xCtr * 2;
-		destX = args.xStart+xCtr;
-	}
-
-	// For the last 4 pixels, we just do them in serial, nothing special
-	for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += 2) {
-		const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
-		if (Scale) {
-			srcColPtr = (const byte *)(srcP + (xCtr * args.scaleX) / SCALE_THRESHOLD * 2);
-		}
-		byte *destVal = (byte *)&destP[destX * 2];
-		uint32 srcCol = (uint32)(*(const uint16 *)srcColPtr);
-		
-		// Check if this is a transparent color we should skip
-		if (args.skipTrans && srcCol == args.transColor)
-			continue;
-
-		args.src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
-		if (args.srcAlpha != -1) {
-			if (args.useTint) {
-				rDest = rSrc;
-				gDest = gSrc;
-				bDest = bSrc;
-				aDest = aSrc;
-				rSrc = args.tintRed;
-				gSrc = args.tintGreen;
-				bSrc = args.tintBlue;
-				aSrc = args.srcAlpha;
-			}/* else {
-				format.colorToARGB((uint32)(*(uint16 *)destVal), aDest, rDest, gDest, bDest);
-			}*/
-			blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, args.srcAlpha, args.useTint, destVal);
-			srcCol = format.ARGBToColor(aDest, rDest, gDest, bDest);
-		} else {
-			srcCol = format.ARGBToColor(aSrc, rSrc, gSrc, bSrc);
-		}
-		*(uint16 *)destVal = srcCol;
-	}
-}
-
-template<bool Scale>
-void BITMAP::drawInner1Bpp(BITMAP::DrawInnerArgs &args) {
-	const int xDir = args.horizFlip ? -1 : 1;
-	uint8x16_t transColors = vld1q_dup_u8(&args.transColor);
-
-	// This is so that we can calculate in parralell the pixel indexes for scaled drawing
-	uint32x4_t scaleAdds1 = {0, (uint32)args.scaleX, (uint32)args.scaleX*2, (uint32)args.scaleX*3};
-	uint32x4_t scaleAdds2 = {(uint32)args.scaleX*4, (uint32)args.scaleX*5, (uint32)args.scaleX*6, (uint32)args.scaleX*7};
-	uint32x4_t scaleAdds3 = {(uint32)args.scaleX*8, (uint32)args.scaleX*9, (uint32)args.scaleX*10, (uint32)args.scaleX*11};
-	uint32x4_t scaleAdds4 = {(uint32)args.scaleX*12, (uint32)args.scaleX*13, (uint32)args.scaleX*14, (uint32)args.scaleX*15};
-	
-	// Clip the bounds ahead of time (so we don't waste time checking if we are in bounds when
-	// we are in the inner loop)
-	int xCtrStart = 0, xCtrWidth = args.dstRect.width();
-	if (args.xStart + xCtrWidth > args.destArea.w) {
-		xCtrWidth = args.destArea.w - args.xStart;
-	}
-	if (args.xStart < 0) {
-		xCtrStart = -args.xStart;
-		args.xStart = 0;
-	}
-	int destY = args.yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = args.dstRect.height();
-	if (Scale) yCtrHeight = args.dstRect.height();
-	if (args.yStart < 0) {
-		yCtr = -args.yStart;
-		destY = 0;
-		if (Scale) {
-			scaleYCtr = yCtr * args.scaleY;
-			srcYCtr = scaleYCtr / SCALE_THRESHOLD;
-		}
-	}
-	if (args.yStart + yCtrHeight > args.destArea.h) {
-		yCtrHeight = args.destArea.h - args.yStart;
-	}
-	
-	byte *destP = (byte *)args.destArea.getBasePtr(0, destY);
-	const byte *srcP = (const byte *)args.src.getBasePtr(
-	                       args.horizFlip ? args.srcArea.right - 16 : args.srcArea.left,
-	                       args.vertFlip ? args.srcArea.bottom - 1 - yCtr : args.srcArea.top + yCtr);
-	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
-		if (Scale) {
-			// So here we update the srcYCtr differently due to this being for
-			// scaling
-			int newSrcYCtr = scaleYCtr / SCALE_THRESHOLD;
-			if (srcYCtr != newSrcYCtr) {
-				// Since the source yctr might not update every row of the destination, we have
-				// to see if we are on a new row...
-				int diffSrcYCtr = newSrcYCtr - srcYCtr;
-				srcP += args.src.pitch * diffSrcYCtr;
-				srcYCtr = newSrcYCtr;
-			}
-		}
-		int xCtr = xCtrStart, destX = args.xStart, scaleXCtr = xCtrStart * args.scaleX;
-		for (; xCtr + 16 < xCtrWidth; destX += 16, xCtr += 16) {
-			byte *destPtr = &destP[destX];
-
-			// Here we dont use the drawPixelSIMD function because 1bpp bitmaps in allegro
-			// can't have any blending applied to them
-			uint8x16_t destCols = vld1q_u8(destPtr);
-			uint8x16_t srcCols = vld1q_u8(srcP + xDir * xCtr);
-			if (Scale) {
-				// If we are scaling, we have to set each pixel individually
-				uint32x4_t indexes1 = vdupq_n_u32(scaleXCtr), indexes2 = vdupq_n_u32(scaleXCtr);
-				uint32x4_t indexes3 = vdupq_n_u32(scaleXCtr), indexes4 = vdupq_n_u32(scaleXCtr);
-				indexes1 = vshrq_n_u32(vaddq_u32(indexes1, scaleAdds1), SCALE_THRESHOLD_BITS);
-				indexes2 = vshrq_n_u32(vaddq_u32(indexes2, scaleAdds2), SCALE_THRESHOLD_BITS);
-				indexes3 = vshrq_n_u32(vaddq_u32(indexes3, scaleAdds3), SCALE_THRESHOLD_BITS);
-				indexes4 = vshrq_n_u32(vaddq_u32(indexes4, scaleAdds4), SCALE_THRESHOLD_BITS);
-				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes1, 0)], srcCols, 0);
-				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes1, 1)], srcCols, 1);
-				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes1, 2)], srcCols, 2);
-				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes1, 3)], srcCols, 3);
-				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes2, 0)], srcCols, 4);
-				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes2, 1)], srcCols, 5);
-				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes2, 2)], srcCols, 6);
-				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes2, 3)], srcCols, 7);
-				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes3, 0)], srcCols, 8);
-				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes3, 1)], srcCols, 9);
-				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes3, 2)], srcCols, 10);
-				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes3, 3)], srcCols, 11);
-				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes4, 0)], srcCols, 12);
-				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes4, 1)], srcCols, 13);
-				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes4, 2)], srcCols, 14);
-				srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes4, 3)], srcCols, 15);
-				scaleXCtr += args.scaleX*16;
-			}
-
-			// Mask out transparent pixels
-			uint8x16_t mask1 = args.skipTrans ? vceqq_u8(srcCols, transColors) : vmovq_n_u8(0);
-			uint8x16_t final = vorrq_u8(vandq_u8(srcCols, vmvnq_u8(mask1)), vandq_u8(destCols, mask1));
-			if (args.horizFlip) {
-				final = vrev64q_u8(final);
-				final = vcombine_u8(vget_high_u8(final), vget_low_u8(final));
-			}
-			vst1q_u8(destPtr, final);
-		}
-		// Get the last x values
-
-		// Because we move in 16 pixel units, and horizFlip moves in 1, we have to move
-		// 1 pixel past the last pixel we did not blit, meaning going forward 15 pixels.
-		if (args.horizFlip) srcP += 15;
-		for (; xCtr < xCtrWidth; ++destX, ++xCtr, scaleXCtr += args.scaleX) {
-			const byte *srcCol = (const byte *)(srcP + xDir * xCtr);
-			if (Scale) {
-				srcCol = (const byte *)(srcP + scaleXCtr / SCALE_THRESHOLD);
-			}
-			// Check if this is a transparent color we should skip
-			if (args.skipTrans && *srcCol == args.transColor)
-				continue;
-
-			byte *destVal = (byte *)&destP[destX];
-			*destVal = *srcCol;
-		}
-		if (args.horizFlip) srcP -= 15; // Undo what we did up there
-		destP += args.destArea.pitch; // Goto next row
-		// Only advance the src row by 1 every time like this if we don't scale
-		if (!Scale) srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
-	}
-}
-
-
-template void BITMAP::drawInner4BppWithConv<4, 4, false>(DrawInnerArgs &args);
-template void BITMAP::drawInner4BppWithConv<4, 4, true>(DrawInnerArgs &args);
-template void BITMAP::drawInner4BppWithConv<4, 2, false>(DrawInnerArgs &args);
-template void BITMAP::drawInner4BppWithConv<4, 2, true>(DrawInnerArgs &args);
-template void BITMAP::drawInner4BppWithConv<2, 4, false>(DrawInnerArgs &args);
-template void BITMAP::drawInner4BppWithConv<2, 4, true>(DrawInnerArgs &args);
-template void BITMAP::drawInner2Bpp<false>(DrawInnerArgs &args);
-template void BITMAP::drawInner2Bpp<true>(DrawInnerArgs &args);
-template void BITMAP::drawInner1Bpp<false>(DrawInnerArgs &args);
-template void BITMAP::drawInner1Bpp<true>(DrawInnerArgs &args);
-
-} // namespace AGS3
-
-#endif
diff --git a/engines/ags/lib/allegro/surface_simd_neon.h b/engines/ags/lib/allegro/surface_simd_neon.h
deleted file mode 100644
index 0aa98fad831..00000000000
--- a/engines/ags/lib/allegro/surface_simd_neon.h
+++ /dev/null
@@ -1,478 +0,0 @@
-/* ScummVM - Graphic Adventure Engine
- *
- * ScummVM is the legal property of its developers, whose names
- * are too numerous to list here. Please refer to the COPYRIGHT
- * file distributed with this source distribution.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- */
-#ifndef AGS_LIB_ALLEGRO_SURFACE_SIMD_NEON_H
-#define AGS_LIB_ALLEGRO_SURFACE_SIMD_NEON_H
-#ifdef __APPLE__ // Appeasing iOS
-#include <TargetConditionals.h>
-#endif
-
-#if !defined(TARGET_OS_SIMULATOR) || TARGET_OS_SIMULATOR != 1 // Appeasing iOS/Iphone simultator?
-#if !defined(TARGET_OS_IPHONE) || TARGET_OS_IPHONE != 1 // Appeasing iOS/Iphone simultator?
-#if defined(__aarch64__)
-
-#ifndef AGS_LIB_ALLEGRO_SURFACE_SIMD_IMPL
-#define AGS_LIB_ALLEGRO_SURFACE_SIMD_IMPL
-#endif
-#ifndef AGS_LIB_ALLEGRO_SURFACE_SIMD_NEON_IMPL
-#define AGS_LIB_ALLEGRO_SURFACE_SIMD_NEON_IMPL
-#endif
-
-#include <arm_neon.h>
-#include "ags/globals.h"
-#include "ags/lib/allegro/surface.h"
-
-namespace AGS3 {
-
-inline uint32x4_t simd2BppTo4Bpp(uint16x4_t pixels) {
-	uint32x4_t x = vmovl_u16(pixels);
-
-	// c is the extracted 5/6 bit color from the image
-	uint32x4_t c = vshrq_n_u32(x, 11);
-
-	// We convert it back to normal by shifting it thrice over, naturally, and then using the 2 most
-	// sinificant bits in the original color for the least significant bits in the new one
-	uint32x4_t r = vshlq_n_u32(vorrq_u32(vshlq_n_u32(c, 3), vshrq_n_u32(c, 2)), 16);
-	c = vshrq_n_u32(vandq_u32(x, vmovq_n_u32(0x07e0)), 5);
-	uint32x4_t g = vshlq_n_u32(vorrq_u32(vshlq_n_u32(c, 2), vshrq_n_u32(c, 4)), 8);
-	c = vandq_u32(x, vmovq_n_u32(0x001f));
-	uint32x4_t b = vorrq_u32(vshlq_n_u32(c, 3), vshrq_n_u32(c, 2));
-
-	// By default 2bpp to 4bpp makes the alpha channel 255
-	return vorrq_u32(vorrq_u32(vorrq_u32(r, g), b), vmovq_n_u32(0xff000000));
-}
-
-inline uint16x4_t simd4BppTo2Bpp(uint32x4_t pixels) {
-	// x is the final 16 bit rgb pixel
-	uint32x4_t x = vshrq_n_u32(vandq_u32(pixels, vmovq_n_u32(0x000000ff)), 3);
-	x = vorrq_u32(x, vshlq_n_u32(vshrq_n_u32(vandq_u32(pixels, vmovq_n_u32(0x0000ff00)), 8+2), 5));
-	x = vorrq_u32(x, vshlq_n_u32(vshrq_n_u32(vandq_u32(pixels, vmovq_n_u32(0x00ff0000)), 16+3), 11));
-	return vmovn_u32(x);
-}
-
-inline uint16x8_t rgbBlendSIMD2Bpp(uint16x8_t srcCols, uint16x8_t destCols, uint16x8_t alphas) {
-	// Here we add 1 to alphas if its 0. This is what the original blender function did
-	alphas = vaddq_u16(alphas, vandq_u16(vceqq_u16(alphas, vmovq_n_u16(0)), vmovq_n_u16(1)));
-
-	// Split the components into rgb
-	uint16x8_t srcComps[] = {
-		vandq_u16(srcCols, vmovq_n_u16(0x1f)),					// B
-		vandq_u16(vshrq_n_u16(srcCols, 5), vmovq_n_u16(0x3f)),	// G
-		vshrq_n_u16(srcCols, 11),								// R
-	}, destComps[] = {
-		vandq_u16(destCols, vmovq_n_u16(0x1f)),					// B
-		vandq_u16(vshrq_n_u16(destCols, 5), vmovq_n_u16(0x3f)), // G
-		vshrq_n_u16(destCols, 11),								// R
-	};
-
-	// At some point I made it so that it would put them into their 8bit depth format
-	// to keep the function as 1-1 with the original, but it didn't seem to help much
-	//srcComps[0] = vorrq_u16(vshlq_n_u16(srcComps[0], 3), vshrq_n_u16(srcComps[0], 2));
-	//srcComps[1] = vorrq_u16(vshlq_n_u16(srcComps[1], 2), vshrq_n_u16(srcComps[1], 4));
-	//srcComps[2] = vorrq_u16(vshlq_n_u16(srcComps[2], 3), vshrq_n_u16(srcComps[2], 2));
-	//destComps[0] = vorrq_u16(vshlq_n_u16(destComps[0], 3), vshrq_n_u16(destComps[0], 2));
-	//destComps[1] = vorrq_u16(vshlq_n_u16(destComps[1], 2), vshrq_n_u16(destComps[1], 4));
-	//destComps[2] = vorrq_u16(vshlq_n_u16(destComps[2], 3), vshrq_n_u16(destComps[2], 2));
-
-	// Calculate the differences between the colors
-	uint16x8_t diffs[] = {
-		vsubq_u16(srcComps[0], destComps[0]), // B
-		vsubq_u16(srcComps[1], destComps[1]), // G
-		vsubq_u16(srcComps[2], destComps[2]), // R
-	};
-
-	// Multiply by alpha and shift depth bits to the right
-	// pretty much the same as (int)(((float)component / 255.0f) * ((float)alpha / 255.0f) * 255.0f)
-	alphas = vshrq_n_u16(alphas, 2);
-	diffs[1] = vshrq_n_u16(vmulq_u16(diffs[1], alphas), 6);
-	alphas = vshrq_n_u16(alphas, 1);
-	diffs[0] = vshrq_n_u16(vmulq_u16(diffs[0], alphas), 5);
-	diffs[2] = vshrq_n_u16(vmulq_u16(diffs[2], alphas), 5);
-
-	// Originally, I converted it back to normal here from the 8bpp form, but don't need to do that anymore
-	//diffs[0] = vandq_u16(vshrq_n_u16(vaddq_u16(diffs[0], destComps[0]), 3), vmovq_n_u16(0x1f));
-	//diffs[1] = vandq_u16(vshrq_n_u16(vaddq_u16(diffs[1], destComps[1]), 2), vmovq_n_u16(0x3f));
-	//diffs[2] = vandq_u16(vshrq_n_u16(vaddq_u16(diffs[2], destComps[2]), 3), vmovq_n_u16(0x1f));
-
-	// Here we add the difference between the 2 colors times alpha onto the destination
-	diffs[0] = vandq_u16(vaddq_u16(diffs[0], destComps[0]), vmovq_n_u16(0x1f));
-	diffs[1] = vandq_u16(vaddq_u16(diffs[1], destComps[1]), vmovq_n_u16(0x3f));
-	diffs[2] = vandq_u16(vaddq_u16(diffs[2], destComps[2]), vmovq_n_u16(0x1f));
-
-	// We compile all the colors into diffs[0] as a 16 bit rgb pixel
-	diffs[0] = vorrq_u16(diffs[0], vshlq_n_u16(diffs[1], 5));
-	return vorrq_u16(diffs[0], vshlq_n_u16(diffs[2], 11));
-}
-
-// preserveAlpha:
-//		false => set destCols's alpha to 0
-// 		true => keep destCols's alpha
-inline uint32x4_t rgbBlendSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas, bool preserveAlpha) {
-	// Here we add 1 to alphas if its 0. This is what the original blender function did
-	alphas = vaddq_u32(alphas, vandq_u32(vcgtq_u32(alphas, vmovq_n_u32(0)), vmovq_n_u32(1)));
-
-	// Get the alpha from the destination
-	uint32x4_t alpha = vandq_u32(destCols, vmovq_n_u32(0xff000000));
-
-	// Get red and blue components
-	uint32x4_t srcColsCopy = srcCols;
-	srcColsCopy = vandq_u32(srcColsCopy, vmovq_n_u32(0xff00ff));
-	uint32x4_t destColsCopy = destCols;
-	destColsCopy = vandq_u32(destColsCopy, vmovq_n_u32(0xff00ff));
-
-	// compute the difference, then multiply by alpha and divide by 255
-	srcColsCopy = vsubq_u32(srcColsCopy, destColsCopy);
-	srcColsCopy = vmulq_u32(srcColsCopy, alphas);
-	srcColsCopy = vshrq_n_u32(srcColsCopy, 8);
-	srcColsCopy = vaddq_u32(srcColsCopy, destCols); // Add the new red/blue to the old ones
-
-	// do the same for the green component
-	srcCols = vandq_u32(srcCols, vmovq_n_u32(0xff00));
-	destCols = vandq_u32(destCols, vmovq_n_u32(0xff00));
-	srcCols = vsubq_u32(srcCols, destCols);
-	srcCols = vmulq_u32(srcCols, alphas);
-	srcCols = vshrq_n_u32(srcCols, 8);
-	srcCols = vaddq_u32(srcCols, destCols); // Add the new green to the old green
-
-	// keep values in 8bit range and glue red/blue and green together
-	srcColsCopy = vandq_u32(srcColsCopy, vmovq_n_u32(0xff00ff));
-	srcCols = vandq_u32(srcCols, vmovq_n_u32(0xff00));
-	srcCols = vorrq_u32(srcCols, srcColsCopy);
-
-	// Remeber that alpha is not alphas, but rather the alpha of destCols
-	if (preserveAlpha) {
-		srcCols = vandq_u32(srcCols, vmovq_n_u32(0x00ffffff));
-		srcCols = vorrq_u32(srcCols, alpha);
-	}
-	return srcCols;
-}
-
-// uses the alpha from srcCols and destCols
-inline uint32x4_t argbBlendSIMD(uint32x4_t srcCols, uint32x4_t destCols) {
-	float16x4_t sAlphas = vcvt_f16_f32(vcvtq_f32_u32(vshrq_n_u32(srcCols, 24)));
-	sAlphas = vmul_n_f16(sAlphas, 1.0 / 255.0);
-
-	// sAlphas1 has the alphas of the first pixel in lanes 0 and 1 and of the second pixel in lanes 2 and 3
-	// same with sAlphas2 but for the 2nd pixel
-	float16x8_t sAlphas1 = vcombine_f16(vmov_n_f16(vduph_lane_f16(sAlphas, 0)), vmov_n_f16(vduph_lane_f16(sAlphas, 1)));
-	float16x8_t sAlphas2 = vcombine_f16(vmov_n_f16(vduph_lane_f16(sAlphas, 2)), vmov_n_f16(vduph_lane_f16(sAlphas, 3)));
-
-	// Same thing going on here with dAlphas, except that it gets mutliplied by (1 - sAlpha) first
-	float16x4_t dAlphas = vcvt_f16_f32(vcvtq_f32_u32(vshrq_n_u32(destCols, 24)));
-	dAlphas = vmul_n_f16(dAlphas, 1.0 / 255.0);
-	dAlphas = vmul_f16(dAlphas, vsub_f16(vmov_n_f16(1.0), sAlphas));
-	float16x8_t dAlphas1 = vcombine_f16(vmov_n_f16(vduph_lane_f16(dAlphas, 0)), vmov_n_f16(vduph_lane_f16(dAlphas, 1)));
-	float16x8_t dAlphas2 = vcombine_f16(vmov_n_f16(vduph_lane_f16(dAlphas, 2)), vmov_n_f16(vduph_lane_f16(dAlphas, 3)));
-
-	// first 2 pixels
-	float16x8_t srcRgb1 = vcvtq_f16_u16(vmovl_u8(vreinterpret_u8_u32(vget_low_u32(srcCols))));
-	float16x8_t destRgb1 = vcvtq_f16_u16(vmovl_u8(vreinterpret_u8_u32(vget_low_u32(destCols))));
-	// last 2 pixels
-	float16x8_t srcRgb2 = vcvtq_f16_u16(vmovl_u8(vreinterpret_u8_u32(vget_high_u32(srcCols))));
-	float16x8_t destRgb2 = vcvtq_f16_u16(vmovl_u8(vreinterpret_u8_u32(vget_high_u32(destCols))));
-
-	// ((src * sAlpha) + (dest * dAlpha)) / (sAlpha + dAlpha)
-	srcRgb1 = vmulq_f16(srcRgb1, sAlphas1);
-	destRgb1 = vmulq_f16(destRgb1, dAlphas1);
-	srcRgb1 = vaddq_f16(srcRgb1, destRgb1);
-	float16x8_t alphasRec = vrecpeq_f16(vaddq_f16(sAlphas1, dAlphas1)); // compute reciprocal
-	srcRgb1 = vmulq_f16(srcRgb1, alphasRec);
-	srcRgb2 = vmulq_f16(srcRgb2, sAlphas2);
-	destRgb2 = vmulq_f16(destRgb2, dAlphas2);
-	srcRgb2 = vaddq_f16(srcRgb2, destRgb2);
-	alphasRec = vrecpeq_f16(vaddq_f16(sAlphas2, dAlphas2));
-	srcRgb2 = vmulq_f16(srcRgb2, alphasRec);
-
-	// alpha channel is computed differently
-	uint16x4_t alphas = vcvta_u16_f16(vmul_n_f16(vadd_f16(sAlphas, dAlphas), 255.0));
-
-	// Final argb components as 16bit values
-	uint16x8_t uintSrcRgb1 = vcvtq_u16_f16(srcRgb1), uintSrcRgb2 = vcvtq_u16_f16(srcRgb2);
-
-	// copy alpha channel over
-	uintSrcRgb1 = vcopyq_lane_u16(uintSrcRgb1, 3, alphas, 0);
-	uintSrcRgb1 = vcopyq_lane_u16(uintSrcRgb1, 7, alphas, 1);
-	uintSrcRgb2 = vcopyq_lane_u16(uintSrcRgb2, 3, alphas, 2);
-	uintSrcRgb2 = vcopyq_lane_u16(uintSrcRgb2, 7, alphas, 3);
-
-	// cast 16bit to 8bit and reinterpret as uint32's
-	return vcombine_u32(vreinterpret_u32_u8(vmovn_u16(uintSrcRgb1)), vreinterpret_u32_u8(vmovn_u16(uintSrcRgb2)));
-}
-
-inline uint32x4_t blendTintSpriteSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas, bool light) {
-	// This function is NOT 1 to 1 with the original... It just approximates it
-	// It gets the value of the HSV of the dest color
-	// Then it gets the HSV of the srcCols
-
-	// how the values are transformed
-	// from 1 uint32x4_t srcCols with each lane being ARGB uint32
-	// srcCols[0] = A | R | G | B
-	// srcCols[1] = A | R | G | B
-	// srcCols[2] = A | R | G | B
-	// srcCols[3] = A | R | G | B
-	//  ->
-	// to 4 float32x4_t's each being a seperate channel with each lane
-	// corresponding to their respective srcCols lane
-	// dda = { A[0], A[1], A[2], A[3] }
-	// ddr = { R[0], R[1], R[2], R[3] }
-	// ddg = { G[0], G[1], G[2], G[3] }
-	// ddb = { B[0], B[1], B[2], B[3] }
-
-	// do the transformation (we don't actually need alpha at all)
-	float32x4_t ddr, ddg, ddb;
-	ddr = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(vshrq_n_u32(destCols, 16), vmovq_n_u32(0xff))), 1.0 / 255.0);
-	ddg = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(vshrq_n_u32(destCols, 8), vmovq_n_u32(0xff))), 1.0 / 255.0);
-	ddb = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(destCols, vmovq_n_u32(0xff))), 1.0 / 255.0);
-	float32x4_t ssr, ssg, ssb;
-	ssr = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(vshrq_n_u32(srcCols, 16), vmovq_n_u32(0xff))), 1.0 / 255.0);
-	ssg = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(vshrq_n_u32(srcCols, 8), vmovq_n_u32(0xff))), 1.0 / 255.0);
-	ssb = vmulq_n_f32(vcvtq_f32_u32(vandq_u32(srcCols, vmovq_n_u32(0xff))), 1.0 / 255.0);
-
-	// Get the maxes and mins (needed for HSV->RGB and visa-versa)
-	float32x4_t dmaxes = vmaxq_f32(ddr, vmaxq_f32(ddg, ddb));
-	float32x4_t smaxes = vmaxq_f32(ssr, vmaxq_f32(ssg, ssb));
-	float32x4_t smins = vminq_f32(ssr, vminq_f32(ssg, ssb));
-
-	// This is here to stop from dividing by 0
-	const float32x4_t eplison0 = vmovq_n_f32(0.0000001);
-
-	float32x4_t chroma = vmaxq_f32(vsubq_f32(smaxes, smins), eplison0);
-
-	// RGB to HSV is a piecewise function, so we compute each part of the function first...
-	float32x4_t hr, hg, hb, hue;
-	hr = vdivq_f32(vsubq_f32(ssg, ssb), chroma);
-	hr = vsubq_f32(hr, vmulq_n_f32(vrndmq_f32(vmulq_n_f32(hr, 1.0 / 6.0)), 6.0));
-	hg = vaddq_f32(vdivq_f32(vsubq_f32(ssb, ssr), chroma), vmovq_n_f32(2.0));
-	hb = vaddq_f32(vdivq_f32(vsubq_f32(ssr, ssg), chroma), vmovq_n_f32(4.0));
-
-	// And then compute which one will be used based on criteria
-	float32x4_t hrfactors = vcvtq_f32_u32(vandq_u32(vandq_u32(vceqq_f32(ssr, smaxes), vmvnq_u32(vceqq_u32(ssr, ssb))), vmovq_n_u32(1)));
-	float32x4_t hgfactors = vcvtq_f32_u32(vandq_u32(vandq_u32(vceqq_f32(ssg, smaxes), vmvnq_u32(vceqq_u32(ssg, ssr))), vmovq_n_u32(1)));
-	float32x4_t hbfactors = vcvtq_f32_u32(vandq_u32(vandq_u32(vceqq_f32(ssb, smaxes), vmvnq_u32(vceqq_u32(ssb, ssg))), vmovq_n_u32(1)));
-	hue = vmulq_f32(hr, hrfactors);
-	hue = vaddq_f32(hue, vmulq_f32(hg, hgfactors));
-	hue = vaddq_f32(hue, vmulq_f32(hb, hbfactors));
-
-	// Mess with the light like the original function
-	float32x4_t val = dmaxes;
-	if (light) {
-		val = vsubq_f32(val, vsubq_f32(vmovq_n_f32(1.0), vmulq_n_f32(vcvtq_f32_u32(alphas), 1.0 / 250.0)));
-		val = vmaxq_f32(val, vmovq_n_f32(0.0));
-	}
-		
-	// then it stiches the HSV back together
-	// the hue and saturation come from the source (tint) color, and the value comes from
-	// the destinaion (real source) color
-	chroma = vmulq_f32(val, vdivq_f32(vsubq_f32(smaxes, smins), vaddq_f32(smaxes, eplison0)));
-	float32x4_t hprime_mod2 = vmulq_n_f32(hue, 1.0 / 2.0);
-	hprime_mod2 = vmulq_n_f32(vsubq_f32(hprime_mod2, vrndmq_f32(hprime_mod2)), 2.0);
-	float32x4_t x = vmulq_f32(chroma, vsubq_f32(vmovq_n_f32(1.0), vabsq_f32(vsubq_f32(hprime_mod2, vmovq_n_f32(1.0)))));
-	uint32x4_t hprime_rounded = vcvtq_u32_f32(hue);
-	uint32x4_t x_int = vcvtq_u32_f32(vmulq_n_f32(x, 255.0));
-	uint32x4_t c_int = vcvtq_u32_f32(vmulq_n_f32(chroma, 255.0));
-
-	// Again HSV->RGB is also a piecewise function
-	uint32x4_t val0 = vorrq_u32(vshlq_n_u32(x_int, 8), vshlq_n_u32(c_int, 16));
-	val0 = vandq_u32(val0, vorrq_u32(vceqq_u32(hprime_rounded, vmovq_n_u32(0)), vceqq_u32(hprime_rounded, vmovq_n_u32(6))));
-	uint32x4_t val1 = vorrq_u32(vshlq_n_u32(c_int, 8), vshlq_n_u32(x_int, 16));
-	val1 = vandq_u32(val1, vceqq_u32(hprime_rounded, vmovq_n_u32(1)));
-	uint32x4_t val2 = vorrq_u32(vshlq_n_u32(c_int, 8), x_int);
-	val2 = vandq_u32(val2, vceqq_u32(hprime_rounded, vmovq_n_u32(2)));
-	uint32x4_t val3 = vorrq_u32(vshlq_n_u32(x_int, 8), c_int);
-	val3 = vandq_u32(val3, vceqq_u32(hprime_rounded, vmovq_n_u32(3)));
-	uint32x4_t val4 = vorrq_u32(vshlq_n_u32(x_int, 16), c_int);
-	val4 = vandq_u32(val4, vceqq_u32(hprime_rounded, vmovq_n_u32(4)));
-	uint32x4_t val5 = vorrq_u32(vshlq_n_u32(c_int, 16), x_int);
-	val5 = vandq_u32(val5, vceqq_u32(hprime_rounded, vmovq_n_u32(5)));
-
-	// or the values together
-	uint32x4_t final = vorrq_u32(val0, vorrq_u32(val1, vorrq_u32(val2, vorrq_u32(val3, vorrq_u32(val4, val5)))));
-
-	// add the minimums back in
-	uint32x4_t val_add = vcvtq_u32_f32(vmulq_n_f32(vsubq_f32(val, chroma), 255.0));
-	val_add = vorrq_u32(val_add, vorrq_u32(vshlq_n_u32(val_add, 8), vorrq_u32(vshlq_n_u32(val_add, 16), vandq_u32(destCols, vmovq_n_u32(0xff000000)))));
-	final = vaddq_u32(final, val_add);
-	return final;
-}
-
-inline uint32x4_t blendPixelSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas) {
-	uint32x4_t srcAlphas, difAlphas, mask, ch1, ch2;
-	auto setupArgbAlphas = [&]() {
-		// This acts the same as this in the normal blender functions
-		// if (alpha == 0)
-		//     alpha = aSrc;
-		// else
-		//     alpha = aSrc * ((alpha & 0xff) + 1) / 256;
-		// where alpha is the alpha byte of the srcCols
-		srcAlphas = vshrq_n_u32(srcCols, 24);
-		difAlphas = vaddq_u32(vandq_u32(alphas, vmovq_n_u32(0xff)), vmovq_n_u32(1));
-		difAlphas = vshrq_n_u32(vmulq_u32(srcAlphas, difAlphas), 8);
-		difAlphas = vshlq_n_u32(difAlphas, 24);
-		srcAlphas = vshlq_n_u32(srcAlphas, 24);
-		mask = vceqq_u32(alphas, vmovq_n_u32(0));
-		srcAlphas = vandq_u32(srcAlphas, mask);
-		difAlphas = vandq_u32(difAlphas, vmvnq_u32(mask));
-		srcCols = vandq_u32(srcCols, vmovq_n_u32(0x00ffffff));
-		srcCols = vorrq_u32(srcCols, vorrq_u32(srcAlphas, difAlphas));
-	};
-	switch (_G(_blender_mode)) {
-	case kSourceAlphaBlender: // see BITMAP member function blendSourceAlpha
-		alphas = vshrq_n_u32(srcCols, 24);
-		return rgbBlendSIMD(srcCols, destCols, alphas, false);
-	case kArgbToArgbBlender: // see BITMAP member function blendArgbToArgb
-		setupArgbAlphas();
-		// only blend if alpha isn't 0, otherwise use destCols
-		mask = vcgtq_u32(vshrq_n_u32(srcCols, 24), vmovq_n_u32(0));
-		ch1 = vandq_u32(argbBlendSIMD(srcCols, destCols), mask);
-		ch2 = vandq_u32(destCols, vmvnq_u32(mask));
-		return vorrq_u32(ch1, ch2);
-	case kArgbToRgbBlender: // see BITMAP member function blendArgbToRgb
-		setupArgbAlphas();
-		return rgbBlendSIMD(srcCols, destCols, vshrq_n_u32(srcCols, 24), false);
-	case kRgbToArgbBlender: // see BITMAP member function blendRgbToArgb
-		// if alpha is NOT 0 or 255
-		ch2 = vandq_u32(srcCols, vmovq_n_u32(0x00ffffff));
-		ch2 = vorrq_u32(ch2, vshlq_n_u32(alphas, 24));
-		ch2 = argbBlendSIMD(ch2, destCols);
-		// if alpha is 0 or 255
-		ch1 = vorrq_u32(srcCols, vmovq_n_u32(0xff000000));
-		// mask and or them together
-		mask = vorrq_u32(vceqq_u32(alphas, vmovq_n_u32(0)), vceqq_u32(alphas, vmovq_n_u32(0xff)));
-		ch1 = vandq_u32(ch1, mask);
-		ch2 = vandq_u32(ch2, vmvnq_u32(mask));
-		return vorrq_u32(ch1, ch2);
-	case kRgbToRgbBlender: // see BITMAP member function blendRgbToRgb
-		return rgbBlendSIMD(srcCols, destCols, alphas, false);
-	case kAlphaPreservedBlenderMode: // see BITMAP member function blendPreserveAlpha
-		return rgbBlendSIMD(srcCols, destCols, alphas, true);
-	case kOpaqueBlenderMode: // see BITMAP member function blendOpaque
-		return vorrq_u32(srcCols, vmovq_n_u32(0xff000000));
-	case kAdditiveBlenderMode: // see BITMAP member function blendAdditiveAlpha
-		srcAlphas = vaddq_u32(vshrq_n_u32(srcCols, 24), vshrq_n_u32(destCols, 24));
-		srcAlphas = vminq_u32(srcAlphas, vmovq_n_u32(0xff));
-		srcCols = vandq_u32(srcCols, vmovq_n_u32(0x00ffffff));
-		return vorrq_u32(srcCols, vshlq_n_u32(srcAlphas, 24));
-	case kTintBlenderMode: // see BITMAP member function blendTintSprite
-		return blendTintSpriteSIMD(srcCols, destCols, alphas, false);
-	case kTintLightBlenderMode: // see BITMAP member function blendTintSprite
-		return blendTintSpriteSIMD(srcCols, destCols, alphas, true);
-	}
-}
-
-inline uint16x8_t blendPixelSIMD2Bpp(uint16x8_t srcCols, uint16x8_t destCols, uint16x8_t alphas) {
-	uint16x8_t mask, ch1, ch2;
-	switch (_G(_blender_mode)) {
-	case kSourceAlphaBlender:
-	case kOpaqueBlenderMode:
-	case kAdditiveBlenderMode:
-		return srcCols;
-	case kArgbToArgbBlender:
-	case kArgbToRgbBlender:
-		ch1 = vandq_u16(vmovq_n_u16(0xff), vceqq_u16(alphas, vmovq_n_u16(0)));
-		ch2 = vandq_u16(alphas, vcgtq_u16(alphas, vmovq_n_u16(0)));
-		alphas = vorrq_u16(ch1, ch2);
-	case kRgbToRgbBlender:
-	case kAlphaPreservedBlenderMode:
-		return rgbBlendSIMD2Bpp(srcCols, destCols, alphas);
-	case kRgbToArgbBlender:
-		mask = vorrq_u32(vceqq_u32(alphas, vmovq_n_u32(0)), vceqq_u32(alphas, vmovq_n_u32(255)));
-		ch1 = vandq_u32(srcCols, mask);
-		ch2 = vandq_u32(rgbBlendSIMD2Bpp(srcCols, destCols, alphas), vmvnq_u32(mask));
-		return vorrq_u32(ch1, ch2);
-	case kTintBlenderMode:
-	case kTintLightBlenderMode:
-		uint32x4_t srcColsLo = simd2BppTo4Bpp(vget_low_u16(srcCols));
-		uint32x4_t srcColsHi = simd2BppTo4Bpp(vget_high_u16(srcCols));
-		uint32x4_t destColsLo = simd2BppTo4Bpp(vget_low_u16(destCols));
-		uint32x4_t destColsHi = simd2BppTo4Bpp(vget_high_u16(destCols));
-		uint32x4_t alphasLo = vmovl_u16(vget_low_u16(alphas));
-		uint32x4_t alphasHi = vmovl_u16(vget_high_u16(alphas));
-		uint16x4_t lo = simd4BppTo2Bpp(blendTintSpriteSIMD(srcColsLo, destColsLo, alphasLo, _G(_blender_mode) == kTintLightBlenderMode));
-		uint16x4_t hi = simd4BppTo2Bpp(blendTintSpriteSIMD(srcColsHi, destColsHi, alphasHi, _G(_blender_mode) == kTintLightBlenderMode));
-		return vcombine_u16(lo, hi);
-	}
-}
-
-template<int DestBytesPerPixel, int SrcBytesPerPixel>
-inline void drawPixelSIMD(byte *destPtr, const byte *srcP2, uint32x4_t tint, uint32x4_t alphas, uint32x4_t maskedAlphas, uint32x4_t transColors, int xDir, int xCtrBpp, int srcAlpha, int skipTrans, bool horizFlip, bool useTint, uint32x4_t skipMask) {
-	uint32x4_t srcCols, destCol;
-
-	if (DestBytesPerPixel == 4)
-		destCol = vld1q_u32((uint32 *)destPtr);
-	else
-		destCol = simd2BppTo4Bpp(vld1_u16((uint16 *)destPtr));
-	if (SrcBytesPerPixel == 4)
-		srcCols = vld1q_u32((const uint32 *)(srcP2 + xDir * xCtrBpp));
-	else
-		srcCols = simd2BppTo4Bpp(vld1_u16((const uint16 *)(srcP2 + xDir * xCtrBpp)));
-	// we do this here because we need to check if we should skip the pixel before we blend it
-	uint32x4_t mask1 = skipTrans ? vceqq_u32(vandq_u32(srcCols, maskedAlphas), transColors) : vmovq_n_u32(0);
-	mask1 = vorrq_u32(mask1, skipMask);
-	if (srcAlpha != -1) {
-		// take useTint into account
-		if (useTint) {
-			srcCols = blendPixelSIMD(tint, srcCols, alphas);
-		} else {
-			srcCols = blendPixelSIMD(srcCols, destCol, alphas);
-		}
-	}
-	uint32x4_t destCols2 = vandq_u32(destCol, mask1);
-	uint32x4_t srcCols2 = vandq_u32(srcCols, vmvnq_u32(mask1));
-	uint32x4_t final = vorrq_u32(destCols2, srcCols2);
-	if (horizFlip) {
-		final = vrev64q_u32(final);
-		final = vcombine_u32(vget_high_u32(final), vget_low_u32(final));
-	}
-	if (DestBytesPerPixel == 4) {
-		vst1q_u32((uint32 *)destPtr, final);
-	} else {
-		vst1_u16((uint16 *)destPtr, simd4BppTo2Bpp(final));
-	}
-}
-
-inline void drawPixelSIMD2Bpp(byte *destPtr, const byte *srcP2, uint16x8_t tint, uint16x8_t alphas, uint16x8_t transColors, int xDir, int xCtrBpp, int srcAlpha, int skipTrans, bool horizFlip, bool useTint, uint16x8_t skipMask) {
-	uint16x8_t destCol = vld1q_u16((uint16 *)destPtr);
-	uint16x8_t srcCols = vld1q_u16((const uint16 *)(srcP2 + xDir * xCtrBpp));
-	uint16x8_t mask1 = skipTrans ? vceqq_u16(srcCols, transColors) : vmovq_n_u16(0);
-	mask1 = vorrq_u16(mask1, skipMask);
-	if (srcAlpha != -1) {
-		// take useTint into account
-		if (useTint) {
-			srcCols = blendPixelSIMD2Bpp(tint, srcCols, alphas);
-		} else {
-			srcCols = blendPixelSIMD2Bpp(srcCols, destCol, alphas);
-		}
-	}
-	uint16x8_t destCols2 = vandq_u16(destCol, mask1);
-	uint16x8_t srcCols2 = vandq_u16(srcCols, vmvnq_u16(mask1));
-	uint16x8_t final = vorrq_u16(destCols2, srcCols2);
-	if (horizFlip) {
-		final = vrev64q_u16(final);
-		final = vcombine_u16(vget_high_u16(final), vget_low_u16(final));
-	}
-	vst1q_u16((uint16 *)destPtr, final);
-}
-
-} // namespace AGS3
-
-#endif /* __aarch64__ */
-#endif /* Make it so that IOS and IPHONE SIM are not used with NEON */
-#endif /* Make it so that IOS and IPHONE SIM are not used with NEON */
-#endif /* AGS_LIB_ALLEGRO_SURFACE_SIMD_NEON */
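
For reference, this is roughly the per-channel arithmetic that the rgbBlendSIMD helpers called by blendPixelSIMD above compute four pixels at a time, written out for a single scalar pixel. A sketch only: the function name is illustrative, not an engine symbol, and the right shift assumes the usual arithmetic-shift behaviour for negative intermediates.

    #include <cstdint>

    // Scalar sketch of the blend the SIMD lanes perform in parallel:
    // out = dest + (src - dest) * alpha / 256, per 8-bit channel.
    static inline uint32_t rgbBlendScalar(uint32_t src, uint32_t dst,
                                          uint32_t alpha, bool preserveAlpha) {
        if (alpha != 0)
            alpha += 1;                        // same +1 bias the vector code applies
        // preserveAlpha: keep the destination alpha byte, otherwise clear it
        uint32_t out = preserveAlpha ? (dst & 0xff000000u) : 0;
        for (int shift = 0; shift <= 16; shift += 8) {
            int32_t s = (src >> shift) & 0xff;
            int32_t d = (dst >> shift) & 0xff;
            out |= (uint32_t)((d + (((s - d) * (int32_t)alpha) >> 8)) & 0xff) << shift;
        }
        return out;
    }

With alpha at 255 the +1 bias makes the multiplier exactly 256, so the result collapses to the source channel, matching the behaviour the SIMD paths aim for.
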
diff --git a/engines/ags/lib/allegro/surface_simd_none.cpp b/engines/ags/lib/allegro/surface_simd_none.cpp
deleted file mode 100644
index d22f7f84300..00000000000
--- a/engines/ags/lib/allegro/surface_simd_none.cpp
+++ /dev/null
@@ -1,36 +0,0 @@
-#include "ags/lib/allegro/surface.h"
-#include "ags/lib/allegro/surface_simd_neon.h"
-#include "ags/lib/allegro/surface_simd_sse.h"
-
-// There is no SIMD implementation on this platform
-#ifndef AGS_LIB_ALLEGRO_SURFACE_SIMD_IMPL
-
-namespace AGS3 {
-
-template<int DestBytesPerPixel, int SrcBytesPerPixel, int ScaleThreshold>
-void BITMAP::drawInner4BppWithConv(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
-	drawInnerGeneric<DestBytesPerPixel, SrcBytesPerPixel, ScaleThreshold>(yStart, xStart, transColor, alphaMask, palette, useTint, sameFormat, src, destArea, horizFlip, vertFlip, skipTrans, srcAlpha, tintRed, tintGreen, tintBlue, dstRect, srcArea, blenderMode, scaleX, scaleY);
-}
-template<int ScaleThreshold>
-void BITMAP::drawInner2Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
-	drawInnerGeneric<2, 2, ScaleThreshold>(yStart, xStart, transColor, alphaMask, palette, useTint, sameFormat, src, destArea, horizFlip, vertFlip, skipTrans, srcAlpha, tintRed, tintGreen, tintBlue, dstRect, srcArea, blenderMode, scaleX, scaleY);
-}
-template<int ScaleThreshold>
-void BITMAP::drawInner1Bpp(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, bool useTint, bool sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, bool horizFlip, bool vertFlip, bool skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY) {
-	drawInnerGeneric<1, 1, ScaleThreshold>(yStart, xStart, transColor, alphaMask, palette, useTint, sameFormat, src, destArea, horizFlip, vertFlip, skipTrans, srcAlpha, tintRed, tintGreen, tintBlue, dstRect, srcArea, blenderMode, scaleX, scaleY);
-}
-
-template void BITMAP::drawInner4BppWithConv<4, 4, 0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner4BppWithConv<4, 4, 0x100>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner4BppWithConv<4, 2, 0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner4BppWithConv<4, 2, 0x100>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner4BppWithConv<2, 4, 0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner4BppWithConv<2, 4, 0x100>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner2Bpp<0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner2Bpp<0x100>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner1Bpp<0>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-template void BITMAP::drawInner1Bpp<0x100>(int, int, uint32, uint32, PALETTE, bool, bool, const ::Graphics::ManagedSurface &, ::Graphics::Surface &, bool, bool, bool, int, int, int, int, const Common::Rect &, const Common::Rect &, const BlenderMode, int, int);
-
-} // namespace AGS3
-
-#endif
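
The fallback above is the usual forward-and-explicitly-instantiate pattern: each wrapper calls drawInnerGeneric with the same template arguments, and the explicit instantiations give the linker concrete symbols for every combination the engine uses. A minimal sketch of that pattern under simplified names (drawFast/drawGeneric are hypothetical stand-ins, not engine functions):

    #include <cstdio>

    // Generic scalar path that always exists.
    template<int BytesPerPixel>
    void drawGeneric(int width) {
        std::printf("generic draw: %d bytes/pixel, width %d\n", BytesPerPixel, width);
    }

    // With no SIMD implementation available, the "fast" entry point simply
    // forwards to the generic one.
    template<int BytesPerPixel>
    void drawFast(int width) {
        drawGeneric<BytesPerPixel>(width);
    }

    // Explicit instantiations so translation units that only see a
    // declaration of drawFast still link.
    template void drawFast<2>(int);
    template void drawFast<4>(int);
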
diff --git a/engines/ags/lib/allegro/surface_simd_sse.cpp b/engines/ags/lib/allegro/surface_simd_sse.cpp
deleted file mode 100644
index 212a19011cb..00000000000
--- a/engines/ags/lib/allegro/surface_simd_sse.cpp
+++ /dev/null
@@ -1,503 +0,0 @@
-#include "ags/lib/allegro/surface_simd_sse.h"
-#ifdef AGS_LIB_ALLEGRO_SURFACE_SIMD_SSE_IMPL
-
-#include "ags/lib/allegro/gfx.h"
-#include "ags/lib/allegro/color.h"
-#include "ags/lib/allegro/flood.h"
-#include "ags/ags.h"
-#include "ags/globals.h"
-#include "common/textconsole.h"
-#include "graphics/screen.h"
-
-namespace AGS3 {
-
-inline uint32 extract32_idx0(__m128i x) {
-	return _mm_cvtsi128_si32(x);
-}
-inline uint32 extract32_idx1(__m128i x) {
-	return _mm_cvtsi128_si32(_mm_shuffle_epi32(x, _MM_SHUFFLE(1, 1, 1, 1)));
-}
-inline uint32 extract32_idx2(__m128i x) {
-	return _mm_cvtsi128_si32(_mm_shuffle_epi32(x, _MM_SHUFFLE(2, 2, 2, 2)));
-}
-inline uint32 extract32_idx3(__m128i x) {
-	return _mm_cvtsi128_si32(_mm_shuffle_epi32(x, _MM_SHUFFLE(3, 3, 3, 3)));
-}
-
-// This template handles 2bpp and 4bpp, the other specializations handle 1bpp and format conversion blits
-template<int DestBytesPerPixel, int SrcBytesPerPixel, bool Scale>
-void BITMAP::drawInner4BppWithConv(DrawInnerArgs &args) {
-	const int xDir = args.horizFlip ? -1 : 1;
-	byte rSrc, gSrc, bSrc, aSrc;
-	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
-    __m128i tint = _mm_sll_epi32(_mm_set1_epi32(args.srcAlpha), _mm_set1_epi32(24));
-	tint = _mm_or_si128(tint, _mm_sll_epi32(_mm_set1_epi32(args.tintRed), _mm_set1_epi32(16)));
-	tint = _mm_or_si128(tint, _mm_sll_epi32(_mm_set1_epi32(args.tintGreen), _mm_set1_epi32(8)));
-	tint = _mm_or_si128(tint, _mm_set1_epi32(args.tintBlue));
-	__m128i maskedAlphas = _mm_set1_epi32(args.alphaMask);
-	__m128i transColors = _mm_set1_epi32(args.transColor);
-    __m128i alphas = _mm_set1_epi32(args.srcAlpha);
-
-	// This is so that we can calculate what pixels to crop off in a vectorized way
-	__m128i addIndexes = _mm_set_epi32(3, 2, 1, 0);
-	if (args.horizFlip) addIndexes = _mm_set_epi32(0, 1, 2, 3);
-
-	// This is so that we can calculate in parallel the pixel indexes for scaled drawing
-	__m128i scaleAdds = _mm_set_epi32((uint32)args.scaleX*3, (uint32)args.scaleX*2, (uint32)args.scaleX, 0);
-
-	// Clip the bounds ahead of time (so we don't waste time checking if we are in bounds when
-	// we are in the inner loop)
-	int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = args.dstRect.width();
-	if (args.xStart + xCtrWidth > args.destArea.w) {
-		xCtrWidth = args.destArea.w - args.xStart;
-	}
-	if (args.xStart < 0) {
-		xCtrStart = -args.xStart;
-		xCtrBppStart = xCtrStart * SrcBytesPerPixel;
-		args.xStart = 0;
-	}
-	int destY = args.yStart, srcYCtr = 0, yCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 4 == 0) ? args.dstRect.height() : (args.dstRect.height() - 1);
-	if (Scale) yCtrHeight = args.dstRect.height();
-	if (args.yStart < 0) {
-		yCtr = -args.yStart;
-		destY = 0;
-		if (Scale) {
-			scaleYCtr = yCtr * args.scaleY;
-			srcYCtr = scaleYCtr / SCALE_THRESHOLD;
-		}
-	}
-	if (args.yStart + yCtrHeight > args.destArea.h) {
-		yCtrHeight = args.destArea.h - args.yStart;
-	}
-	
-	byte *destP = (byte *)args.destArea.getBasePtr(0, destY);
-	const byte *srcP = (const byte *)args.src.getBasePtr(
-	                       args.horizFlip ? args.srcArea.right - 4 : args.srcArea.left,
-	                       args.vertFlip ? args.srcArea.bottom - 1 - yCtr : args.srcArea.top + yCtr);
-	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
-		__m128i xCtrWidthSIMD = _mm_set1_epi32(xCtrWidth); // This is the width of the row
-
-		if (!Scale) {
-			// If we are not scaling the image
-			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
-				byte *destPtr = &destP[destX * DestBytesPerPixel];
-				// Skip pixels that are beyond the row
-				__m128i skipMask = _mm_cmpgt_epi32(_mm_add_epi32(_mm_add_epi32(_mm_set1_epi32(xCtr), addIndexes), _mm_set1_epi32(1)), xCtrWidthSIMD);
-				drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, skipMask);
-			}
-			// Goto next row in source and destination image
-			destP += args.destArea.pitch;
-			srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
-		} else {
-			// Here we are scaling the image
-			int newSrcYCtr = scaleYCtr / SCALE_THRESHOLD;
-			// Since the source yctr might not update every row of the destination, we have
-			// to see if we are on a new row...
-			if (srcYCtr != newSrcYCtr) {
-				int diffSrcYCtr = newSrcYCtr - srcYCtr; // Have we moved yet
-				srcP += args.src.pitch * diffSrcYCtr;
-				srcYCtr = newSrcYCtr;
-			}
-
-			// Now also since we might skip a pixel or 2 or duplicate one to reach the desired
-			// scaling size, we create a small dummy buffer that we copy the pixels into and then
-			// call the drawPixelsSIMD function
-			byte srcBuffer[4*4] = {0};
-			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart, scaleXCtr = xCtrStart * args.scaleX; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
-				if (yCtr + 1 == yCtrHeight && xCtr + 4 > xCtrWidth) break; // Don't go past the last 4 pixels
-				__m128i indexes = _mm_set1_epi32(scaleXCtr);
-				// Calculate in parallel the indexes of the pixels
-				if (SrcBytesPerPixel == 4)
-					indexes = _mm_slli_epi32(_mm_srli_epi32(_mm_add_epi32(indexes, scaleAdds), SCALE_THRESHOLD_BITS), 2);
-				else
-					indexes = _mm_slli_epi32(_mm_srli_epi32(_mm_add_epi32(indexes, scaleAdds), SCALE_THRESHOLD_BITS), 1);
-				// Simply memcpy them in. memcpy has no real performance overhead here
-				memcpy(&srcBuffer[0*(size_t)SrcBytesPerPixel], srcP + extract32_idx0(indexes), SrcBytesPerPixel);
-				memcpy(&srcBuffer[1*(size_t)SrcBytesPerPixel], srcP + extract32_idx1(indexes), SrcBytesPerPixel);
-				memcpy(&srcBuffer[2*(size_t)SrcBytesPerPixel], srcP + extract32_idx2(indexes), SrcBytesPerPixel);
-				memcpy(&srcBuffer[3*(size_t)SrcBytesPerPixel], srcP + extract32_idx3(indexes), SrcBytesPerPixel);
-				scaleXCtr += args.scaleX*4;
-
-				// Now this is pretty much the same as before with non-scaled code, except that we use
-				// our dummy source buffer instead of the actual source bitmap
-				byte *destPtr = &destP[destX * (intptr_t)DestBytesPerPixel];
-				__m128i skipMask = _mm_cmpgt_epi32(_mm_add_epi32(_mm_add_epi32(_mm_set1_epi32(xCtr), addIndexes), _mm_set1_epi32(1)), xCtrWidthSIMD);
-				drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, (const byte *)srcBuffer, tint, alphas, maskedAlphas, transColors, 1, 0, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, skipMask);
-			}
-			// We calculate every row here except the last (because then we need to
-			// check if we fall off the edge of the row)
-			// The only exception here is scaled drawing; this is because:
-			// 1) if statements are costly, and the less we do the faster this loop is
-			// 2) with this, the only branch in the normal drawing loop is the width check
-			// 3) the scaling code will actually draw until the last 4 pixels of the image
-			//    and do the extra if checks because the scaling code is already much slower
-			//    than the normal drawing loop, and having less duplicate code helps here.
-			if (yCtr + 1 != yCtrHeight) destP += args.destArea.pitch;
-		}
-	}
-
-	// Get the last x values of the last row
-	int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart;
-	// We have a picture that is a multiple of 4, so no extra pixels to draw
-	if (xCtrWidth % 4 == 0) return;
-	// Drawing the last few not scaled pixels here.
-	// Same as the loop above but now we check if we are going to overflow,
-	// and thus we don't need to mask out pixels that go over the row.
-	if (!Scale) {
-		for (; xCtr + 4 < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
-			byte *destPtr = &destP[(ptrdiff_t)destX * DestBytesPerPixel];
-			drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, _mm_setzero_si128());
-		}
-		// Because we move in 4 pixel units, and horizFlip moves in 1, we have to move
-		// 1 pixel past the last pixel we did not blit, meaning going forward 3 pixels.
-		if (args.horizFlip) srcP += SrcBytesPerPixel * 3;
-	} else {
-		// So if we are scaling, set up the xCtr to what it was before (AKA the last 4 or so pixels of the image)
-		xCtr = xCtrWidth - xCtrWidth % 4;
-		xCtrBpp = xCtr * SrcBytesPerPixel;
-		destX = args.xStart+xCtr;
-	}
-
-	// For the last 4 pixels, we just do them in serial, nothing special
-	for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += SrcBytesPerPixel) {
-		const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
-		if (Scale) {
-			srcColPtr = (const byte *)(srcP + (xCtr * args.scaleX) / SCALE_THRESHOLD * SrcBytesPerPixel);
-		}
-		byte *destVal = (byte *)&destP[destX * DestBytesPerPixel];
-		uint32 srcCol = getColor(srcColPtr, SrcBytesPerPixel);
-		
-		// Check if this is a transparent color we should skip
-		if (args.skipTrans && ((srcCol & args.alphaMask) == args.transColor))
-			continue;
-
-		args.src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
-		if (args.srcAlpha != -1) {
-			if (args.useTint) {
-				rDest = rSrc;
-				gDest = gSrc;
-				bDest = bSrc;
-				aDest = aSrc;
-				rSrc = args.tintRed;
-				gSrc = args.tintGreen;
-				bSrc = args.tintBlue;
-				aSrc = args.srcAlpha;
-			}
-			blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, args.srcAlpha, args.useTint, destVal);
-			srcCol = format.ARGBToColor(aDest, rDest, gDest, bDest);
-		} else {
-			srcCol = format.ARGBToColor(aSrc, rSrc, gSrc, bSrc);
-		}
-		if (DestBytesPerPixel == 4)
-			*(uint32 *)destVal = srcCol;
-		else
-			*(uint16 *)destVal = srcCol;
-	}
-}
-
-template<int ScaleThreshold>
-void BITMAP::drawInner2Bpp(DrawInnerArgs &args) {
-	const int xDir = args.horizFlip ? -1 : 1;
-	byte rSrc, gSrc, bSrc, aSrc;
-	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
-	__m128i tint = _mm_set1_epi16(args.src.format.ARGBToColor(args.srcAlpha, args.tintRed, args.tintGreen, args.tintBlue));
-	__m128i transColors = _mm_set1_epi16(args.transColor);
-	__m128i alphas = _mm_set1_epi16(args.srcAlpha);
-
-	// This is so that we can calculate what pixels to crop off in a vectorized way
-	__m128i addIndexes = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
-
-	// This is so that we can calculate in parallel the pixel indexes for scaled drawing
-	if (args.horizFlip) addIndexes = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
-	__m128i scaleAdds = _mm_set_epi32((uint32)args.scaleX*3, (uint32)args.scaleX*2, (uint32)args.scaleX, 0);
-	__m128i scaleAdds2 = _mm_set_epi32((uint32)args.scaleX*7, (uint32)args.scaleX*6, (uint32)args.scaleX*5, (uint32)args.scaleX*4);
-
-	// Clip the bounds ahead of time (so we don't waste time checking if we are in bounds when
-	// we are in the inner loop)
-	int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = args.dstRect.width();
-	if (args.xStart + xCtrWidth > args.destArea.w) {
-		xCtrWidth = args.destArea.w - args.xStart;
-	}
-	if (args.xStart < 0) {
-		xCtrStart = -args.xStart;
-		xCtrBppStart = xCtrStart * 2;
-		args.xStart = 0;
-	}
-	int destY = args.yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 8 == 0) ? args.dstRect.height() : (args.dstRect.height() - 1);
-	if (Scale) yCtrHeight = args.dstRect.height();
-	if (args.yStart < 0) {
-		yCtr = -args.yStart;
-		destY = 0;
-		if (Scale) {
-			scaleYCtr = yCtr * args.scaleY;
-			srcYCtr = scaleYCtr / SCALE_THRESHOLD;
-		}
-	}
-	if (args.yStart + yCtrHeight > args.destArea.h) {
-		yCtrHeight = args.destArea.h - args.yStart;
-	}
-	
-	byte *destP = (byte *)args.destArea.getBasePtr(0, destY);
-	const byte *srcP = (const byte *)args.src.getBasePtr(
-	                       args.horizFlip ? args.srcArea.right - 8 : args.srcArea.left,
-	                       args.vertFlip ? args.srcArea.bottom - 1 - yCtr : args.srcArea.top + yCtr);
-	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
-		__m128i xCtrWidthSIMD = _mm_set1_epi16(xCtrWidth); // This is the width of the row
-		if (!Scale) {
-			// If we are not scaling the image
-			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
-				byte *destPtr = &destP[destX * 2];
-				// Skip pixels that are beyond the row
-				__m128i skipMask = _mm_cmpgt_epi16(_mm_add_epi16(_mm_add_epi16(_mm_set1_epi16(xCtr), addIndexes), _mm_set1_epi16(1)), xCtrWidthSIMD);
-				drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, skipMask);
-			}
-			// Goto next row in source and destination image
-			destP += args.destArea.pitch;
-			srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
-		} else {
-			// Here we are scaling the image
-			int newSrcYCtr = scaleYCtr / SCALE_THRESHOLD;
-			// Since the source yctr might not update every row of the destination, we have
-			// to see if we are on a new row...
-			if (srcYCtr != newSrcYCtr) {
-				int diffSrcYCtr = newSrcYCtr - srcYCtr;
-				srcP += args.src.pitch * diffSrcYCtr;
-				srcYCtr = newSrcYCtr;
-			}
-
-			// Now also since we might skip a pixel or 2 or duplicate one to reach the desired
-			// scaling size, we create a small dummy buffer that we copy the pixels into and then
-			// call the drawPixelsSIMD function
-			uint16 srcBuffer[8];
-			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart, scaleXCtr = xCtrStart * args.scaleX; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
-				if (yCtr + 1 == yCtrHeight && xCtr + 8 > xCtrWidth) break;
-				__m128i indexes = _mm_set1_epi32(scaleXCtr), indexes2 = _mm_set1_epi32(scaleXCtr);
-				// Calculate in parallel the indexes of the pixels
-				indexes = _mm_slli_epi32(_mm_srli_epi32(_mm_add_epi32(indexes, scaleAdds), SCALE_THRESHOLD_BITS), 1);
-				indexes2 = _mm_slli_epi32(_mm_srli_epi32(_mm_add_epi32(indexes2, scaleAdds2), SCALE_THRESHOLD_BITS), 1);
-				// Simply memcpy them in. memcpy has no real performance overhead here
-				srcBuffer[0] = *(const uint16 *)(srcP + extract32_idx0(indexes));
-				srcBuffer[1] = *(const uint16 *)(srcP + extract32_idx1(indexes));
-				srcBuffer[2] = *(const uint16 *)(srcP + extract32_idx2(indexes));
-				srcBuffer[3] = *(const uint16 *)(srcP + extract32_idx3(indexes));
-				srcBuffer[4] = *(const uint16 *)(srcP + extract32_idx0(indexes2));
-				srcBuffer[5] = *(const uint16 *)(srcP + extract32_idx1(indexes2));
-				srcBuffer[6] = *(const uint16 *)(srcP + extract32_idx2(indexes2));
-				srcBuffer[7] = *(const uint16 *)(srcP + extract32_idx3(indexes2));
-				scaleXCtr += args.scaleX*8;
-
-				// Now this is pretty much the same as before with non-scaled code, except that we use
-				// our dummy source buffer instead of the actual source bitmap
-				byte *destPtr = &destP[destX * 2];
-				__m128i skipMask = _mm_cmpgt_epi16(_mm_add_epi16(_mm_add_epi16(_mm_set1_epi16(xCtr), addIndexes), _mm_set1_epi16(1)), xCtrWidthSIMD);
-				drawPixelSIMD2Bpp(destPtr, (const byte *)srcBuffer, tint, alphas, transColors, 1, 0, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, skipMask);
-			}
-			// We calculate every row here except the last (because then we need to
-			// check if we fall off the edge of the row)
-			// The only exception here is scaled drawing; this is because:
-			// 1) if statements are costly, and the less we do the faster this loop is
-			// 2) with this, the only branch in the normal drawing loop is the width check
-			// 3) the scaling code will actually draw until the last few pixels of the image
-			//    and do the extra if checks because the scaling code is already much slower
-			//    than the normal drawing loop, and having less duplicate code helps here.
-			if (yCtr + 1 != yCtrHeight) destP += args.destArea.pitch;
-		}
-	}
-
-	// We have a picture that is a multiple of 8, so no extra pixels to draw
-	if (xCtrWidth % 8 == 0) return;
-	// Get the last x values of the last row
-	int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart;
-	// Drawing the last few not scaled pixels here.
-	// Same as the loop above but now we check if we are going to overflow,
-	// and thus we don't need to mask out pixels that go over the row.
-	if (!Scale) {
-		for (; xCtr + 8 < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
-			byte *destPtr = &destP[destX * 2];
-			drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, _mm_setzero_si128());
-		}
-		// Because we move in 8 pixel units, and horizFlip moves in 1, we have to move
-		// 1 pixel past the last pixel we did not blit, meaning going forward 7 pixels.
-		if (args.horizFlip) srcP += 2 * 7;
-	} else {
-		// So if we are scaling, set up the xCtr to what it was before (AKA the last 8 or so pixels of the image)
-		xCtr = xCtrWidth - xCtrWidth % 8;
-		xCtrBpp = xCtr * 2;
-		destX = args.xStart+xCtr;
-	}
-
-	// For the last few pixels, we just do them in serial, nothing special
-	for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += 2) {
-		const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
-		if (Scale) {
-			srcColPtr = (const byte *)(srcP + (xCtr * args.scaleX) / SCALE_THRESHOLD * 2);
-		}
-		byte *destVal = (byte *)&destP[destX * 2];
-		uint32 srcCol = (uint32)(*(const uint16 *)srcColPtr);
-		
-		// Check if this is a transparent color we should skip
-		if (args.skipTrans && srcCol == args.transColor)
-			continue;
-
-		args.src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
-		if (args.srcAlpha != -1) {
-			if (args.useTint) {
-				rDest = rSrc;
-				gDest = gSrc;
-				bDest = bSrc;
-				aDest = aSrc;
-				rSrc = args.tintRed;
-				gSrc = args.tintGreen;
-				bSrc = args.tintBlue;
-				aSrc = args.srcAlpha;
-			}/* else {
-				format.colorToARGB((uint32)(*(uint16 *)destVal), aDest, rDest, gDest, bDest);
-			}*/
-			blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, args.srcAlpha, args.useTint, destVal);
-			srcCol = format.ARGBToColor(aDest, rDest, gDest, bDest);
-		} else {
-			srcCol = format.ARGBToColor(aSrc, rSrc, gSrc, bSrc);
-		}
-		*(uint16 *)destVal = srcCol;
-	}
-}
-
-template<bool Scale>
-void BITMAP::drawInner1Bpp(DrawInnerArgs &args) {
-	const int xDir = args.horizFlip ? -1 : 1;
-	__m128i transColors = _mm_set1_epi16(args.transColor | (args.transColor << 8));
-
-	// This is so that we can calculate in parallel the pixel indexes for scaled drawing
-	__m128i scaleAdds1 = _mm_set_epi32((uint32)args.scaleX*3, (uint32)args.scaleX*2, (uint32)args.scaleX, 0);
-	__m128i scaleAdds2 = _mm_set_epi32((uint32)args.scaleX*7, (uint32)args.scaleX*6, (uint32)args.scaleX*5, (uint32)args.scaleX*4);
-	__m128i scaleAdds3 = _mm_set_epi32((uint32)args.scaleX*11, (uint32)args.scaleX*10, (uint32)args.scaleX*9, (uint32)args.scaleX*8);
-	__m128i scaleAdds4 = _mm_set_epi32((uint32)args.scaleX*15, (uint32)args.scaleX*14, (uint32)args.scaleX*13, (uint32)args.scaleX*12);
-	
-	// Clip the bounds ahead of time (so we don't waste time checking if we are in bounds when
-	// we are in the inner loop)
-	int xCtrStart = 0, xCtrWidth = args.dstRect.width();
-	if (args.xStart + xCtrWidth > args.destArea.w) {
-		xCtrWidth = args.destArea.w - args.xStart;
-	}
-	if (args.xStart < 0) {
-		xCtrStart = -args.xStart;
-		args.xStart = 0;
-	}
-	int destY = args.yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = args.dstRect.height();
-	if (Scale) yCtrHeight = args.dstRect.height();
-	if (args.yStart < 0) {
-		yCtr = -args.yStart;
-		destY = 0;
-		if (Scale) {
-			scaleYCtr = yCtr * args.scaleY;
-			srcYCtr = scaleYCtr / SCALE_THRESHOLD;
-		}
-	}
-	if (args.yStart + yCtrHeight > args.destArea.h) {
-		yCtrHeight = args.destArea.h - args.yStart;
-	}
-	
-	byte *destP = (byte *)args.destArea.getBasePtr(0, destY);
-	const byte *srcP = (const byte *)args.src.getBasePtr(
-	                       args.horizFlip ? args.srcArea.right - 16 : args.srcArea.left,
-	                       args.vertFlip ? args.srcArea.bottom - 1 - yCtr : args.srcArea.top + yCtr);
-	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
-		if (Scale) {
-			// So here we update the srcYCtr differently due to this being for
-			// scaling
-			int newSrcYCtr = scaleYCtr / SCALE_THRESHOLD;
-			if (srcYCtr != newSrcYCtr) {
-				// Since the source yctr might not update every row of the destination, we have
-				// to see if we are on a new row...
-				int diffSrcYCtr = newSrcYCtr - srcYCtr;
-				srcP += args.src.pitch * diffSrcYCtr;
-				srcYCtr = newSrcYCtr;
-			}
-		}
-		int xCtr = xCtrStart, destX = args.xStart, scaleXCtr = xCtrStart * args.scaleX;
-		for (; xCtr + 16 < xCtrWidth; destX += 16, xCtr += 16) {
-			byte *destPtr = &destP[destX];
-
-			// Here we don't use the drawPixelSIMD function because 1bpp bitmaps in allegro
-			// can't have any blending applied to them
-			__m128i destCols = _mm_loadu_si128((const __m128i *)destPtr);
-			__m128i srcCols = _mm_loadu_si128((const __m128i *)(srcP + xDir * xCtr));
-			if (Scale) {
-				// If we are scaling, we have to set each pixel individually
-				__m128i indexes1 = _mm_set1_epi32(scaleXCtr), indexes2 = _mm_set1_epi32(scaleXCtr);
-				__m128i indexes3 = _mm_set1_epi32(scaleXCtr), indexes4 = _mm_set1_epi32(scaleXCtr);
-				// Calculate in parallel the indexes of the pixels
-				indexes1 = _mm_srli_epi32(_mm_add_epi32(indexes1, scaleAdds1), SCALE_THRESHOLD_BITS);
-				indexes2 = _mm_srli_epi32(_mm_add_epi32(indexes2, scaleAdds2), SCALE_THRESHOLD_BITS);
-				indexes3 = _mm_srli_epi32(_mm_add_epi32(indexes3, scaleAdds3), SCALE_THRESHOLD_BITS);
-				indexes4 = _mm_srli_epi32(_mm_add_epi32(indexes4, scaleAdds4), SCALE_THRESHOLD_BITS);
-				srcCols = _mm_set_epi8(
-					srcP[extract32_idx3(indexes4)],
-					srcP[extract32_idx2(indexes4)],
-					srcP[extract32_idx1(indexes4)],
-					srcP[extract32_idx0(indexes4)],
-					srcP[extract32_idx3(indexes3)],
-					srcP[extract32_idx2(indexes3)],
-					srcP[extract32_idx1(indexes3)],
-					srcP[extract32_idx0(indexes3)],
-					srcP[extract32_idx3(indexes2)],
-					srcP[extract32_idx2(indexes2)],
-					srcP[extract32_idx1(indexes2)],
-					srcP[extract32_idx0(indexes2)],
-					srcP[extract32_idx3(indexes1)],
-					srcP[extract32_idx2(indexes1)],
-					srcP[extract32_idx1(indexes1)],
-					srcP[extract32_idx0(indexes1)]);
-				scaleXCtr += args.scaleX*16;
-			}
-
-			// Mask out transparent pixels
-			__m128i mask1 = args.skipTrans ? _mm_cmpeq_epi8(srcCols, transColors) : _mm_setzero_si128();
-			__m128i final = _mm_or_si128(_mm_andnot_si128(mask1, srcCols), _mm_and_si128(destCols, mask1));
-			if (args.horizFlip) {
-				__m128i final_swap16 = _mm_srli_epi16(final, 8);
-				final_swap16 = _mm_or_si128(final_swap16, _mm_slli_epi16(_mm_and_si128(final, _mm_set1_epi16(0xff)), 8));
-				final_swap16 = _mm_shufflelo_epi16(final_swap16, _MM_SHUFFLE(0, 1, 2, 3));
-				final_swap16 = _mm_shufflehi_epi16(final_swap16, _MM_SHUFFLE(0, 1, 2, 3));
-				final = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(final_swap16), _mm_castsi128_pd(final_swap16), _MM_SHUFFLE2(0, 1)));
-			}
-			_mm_storeu_si128((__m128i *)destPtr, final);
-		}
-		// Get the last x values
-
-		// Because we move in 16 pixel units, and horizFlip moves in 1, we have to move
-		// 1 pixel past the last pixel we did not blit, meaning going forward 15 pixels.
-		if (args.horizFlip) srcP += 15;
-		for (; xCtr < xCtrWidth; ++destX, ++xCtr, scaleXCtr += args.scaleX) {
-			const byte *srcCol = (const byte *)(srcP + xDir * xCtr);
-			if (Scale) {
-				srcCol = (const byte *)(srcP + scaleXCtr / SCALE_THRESHOLD);
-			}
-			// Check if this is a transparent color we should skip
-			if (args.skipTrans && *srcCol == args.transColor)
-				continue;
-
-			byte *destVal = (byte *)&destP[destX];
-			*destVal = *srcCol;
-		}
-		if (args.horizFlip) srcP -= 15; // Undo what we did up there
-		destP += args.destArea.pitch; // Goto next row
-		// Only advance the src row by 1 every time like this if we don't scale
-		if (!Scale) srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
-	}
-}
-
-template void BITMAP::drawInner4BppWithConv<4, 4, 0>(DrawInnerArgs &args);
-template void BITMAP::drawInner4BppWithConv<4, 4, 0x100>(DrawInnerArgs &args);
-template void BITMAP::drawInner4BppWithConv<4, 2, 0>(DrawInnerArgs &args);
-template void BITMAP::drawInner4BppWithConv<4, 2, 0x100>(DrawInnerArgs &args);
-template void BITMAP::drawInner4BppWithConv<2, 4, 0>(DrawInnerArgs &args);
-template void BITMAP::drawInner4BppWithConv<2, 4, 0x100>(DrawInnerArgs &args);
-template void BITMAP::drawInner2Bpp<0>(DrawInnerArgs &args);
-template void BITMAP::drawInner2Bpp<0x100>(DrawInnerArgs &args);
-template void BITMAP::drawInner1Bpp<0>(DrawInnerArgs &args);
-template void BITMAP::drawInner1Bpp<0x100>(DrawInnerArgs &args);
-
-} // namespace AGS3
-
-#endif
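
The scaled-draw loops above step through the source bitmap in fixed point: a counter advances by scaleX for every destination pixel, and the source index is that counter shifted down by SCALE_THRESHOLD_BITS (the SSE code just computes four or eight such indexes at once via scaleAdds). A scalar sketch of the same idea, assuming SCALE_THRESHOLD is the 0x100 that appears as the scale template argument (so 8.8 fixed point); the helper name is illustrative:

    #include <cstdint>

    // Fixed-point fraction bits, assuming SCALE_THRESHOLD == 0x100.
    static const int kScaleBits = 8;

    // Source x index for a given destination x when stretching; equivalent to
    // accumulating scaleX once per destination pixel and dropping the fraction.
    static inline int srcIndexFor(int destX, int scaleX /* 8.8 fixed point */) {
        uint32_t scaleXCtr = (uint32_t)destX * (uint32_t)scaleX;
        return (int)(scaleXCtr >> kScaleBits);
    }

So a scaleX of 0x080 repeats each source pixel twice (a 2x stretch), while 0x200 skips every other source pixel (a 2x shrink), which is the behaviour the vectorized index computation reproduces per lane.
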
diff --git a/engines/ags/lib/allegro/surface_simd_sse.h b/engines/ags/lib/allegro/surface_simd_sse.h
deleted file mode 100644
index 02a748b26c9..00000000000
--- a/engines/ags/lib/allegro/surface_simd_sse.h
+++ /dev/null
@@ -1,469 +0,0 @@
-/* ScummVM - Graphic Adventure Engine
- *
- * ScummVM is the legal property of its developers, whose names
- * are too numerous to list here. Please refer to the COPYRIGHT
- * file distributed with this source distribution.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- */
-#ifndef AGS_LIB_ALLEGRO_SURFACE_SIMD_SSE_H
-#define AGS_LIB_ALLEGRO_SURFACE_SIMD_SSE_H
-#if defined(__x86_64__) || defined(__i686__) || defined(_M_X86) || defined(_M_X64)
-
-#ifndef AGS_LIB_ALLEGRO_SURFACE_SIMD_IMPL
-#define AGS_LIB_ALLEGRO_SURFACE_SIMD_IMPL
-#endif
-#ifndef AGS_LIB_ALLEGRO_SURFACE_SIMD_SSE_IMPL
-#define AGS_LIB_ALLEGRO_SURFACE_SIMD_SSE_IMPL
-#endif
-
-#include <immintrin.h>
-#include "ags/globals.h"
-#include "ags/lib/allegro/surface.h"
-
-namespace AGS3 {
-
-inline __m128i simd2BppTo4Bpp(__m128i pixels) {
-	__m128i x = _mm_unpacklo_epi16(pixels, _mm_setzero_si128());
-
-	// c is the extracted 5/6 bit color from the image
-	__m128i c = _mm_srli_epi32(x, 11);
-
-	// We convert it back to normal by shifting it thrice over, naturally, and then using the 2 most
-	// significant bits in the original color for the least significant bits in the new one
-	__m128i r = _mm_slli_epi32(_mm_or_si128(_mm_slli_epi32(c, 3), _mm_srli_epi32(c, 2)), 16);
-	c = _mm_srli_epi32(_mm_and_si128(x, _mm_set1_epi32(0x07e0)), 5);
-	__m128i g = _mm_slli_epi32(_mm_or_si128(_mm_slli_epi32(c, 2), _mm_srli_epi32(c, 4)), 8);
-	c = _mm_and_si128(x, _mm_set1_epi32(0x001f));
-	__m128i b = _mm_or_si128(_mm_slli_epi32(c, 3), _mm_srli_epi32(c, 2));
-
-	// By default 2bpp to 4bpp makes the alpha channel 255
-	return _mm_or_si128(_mm_or_si128(_mm_or_si128(r, g), b), _mm_set1_epi32(0xff000000));
-}
-
-inline __m128i simd4BppTo2Bpp(__m128i pixels) {
-	// x is the final 16 bit rgb pixel
-	__m128i x = _mm_srli_epi32(_mm_and_si128(pixels, _mm_set1_epi32(0x000000ff)), 3);
-	x = _mm_or_si128(x, _mm_slli_epi32(_mm_srli_epi32(_mm_and_si128(pixels, _mm_set1_epi32(0x0000ff00)), 8+2), 5));
-	x = _mm_or_si128(x, _mm_slli_epi32(_mm_srli_epi32(_mm_and_si128(pixels, _mm_set1_epi32(0x00ff0000)), 16+3), 11));
-	x = _mm_slli_epi32(x, 16);
-	x = _mm_srai_epi32(x, 16);
-	return _mm_packs_epi32(x, _mm_setzero_si128());
-}
-
-inline __m128i rgbBlendSIMD2Bpp(__m128i srcCols, __m128i destCols, __m128i alphas) {
-	// Here we add 1 to alphas if it's not 0. This is what the original blender function did
-	alphas = _mm_add_epi16(alphas, _mm_and_si128(_mm_cmpgt_epi16(alphas, _mm_setzero_si128()), _mm_set1_epi16(1)));
-
-	// Split the components into rgb
-	__m128i srcComps[] = {
-		_mm_and_si128(srcCols, _mm_set1_epi16(0x1f)),		    		 // B
-		_mm_and_si128(_mm_srli_epi16(srcCols, 5), _mm_set1_epi16(0x3f)), // G
-		_mm_srli_epi16(srcCols, 11),									 // R
-	}, destComps[] = {
-		_mm_and_si128(destCols, _mm_set1_epi16(0x1f)),		    		  // B
-		_mm_and_si128(_mm_srli_epi16(destCols, 5), _mm_set1_epi16(0x3f)), // G
-		_mm_srli_epi16(destCols, 11),									  // R
-	};
-
-	// Calculate the differences between the colors
-	__m128i diffs[] = {
-		_mm_sub_epi16(srcComps[0], destComps[0]), // B
-		_mm_sub_epi16(srcComps[1], destComps[1]), // G
-		_mm_sub_epi16(srcComps[2], destComps[2]), // R
-	};
-
-	// Multiply by alpha and shift depth bits to the right
-	// pretty much the same as (int)(((float)component / 255.0f) * ((float)alpha / 255.0f) * 255.0f)
-	alphas = _mm_srli_epi16(alphas, 2);
-	diffs[1] = _mm_srli_epi16(_mm_mullo_epi16(diffs[1], alphas), 6);
-	alphas = _mm_srli_epi16(alphas, 1);
-	diffs[0] = _mm_srli_epi16(_mm_mullo_epi16(diffs[0], alphas), 5);
-	diffs[2] = _mm_srli_epi16(_mm_mullo_epi16(diffs[2], alphas), 5);
-
-	// Here we add the difference between the 2 colors times alpha onto the destination
-	diffs[0] = _mm_and_si128(_mm_add_epi16(diffs[0], destComps[0]), _mm_set1_epi16(0x1f));
-	diffs[1] = _mm_and_si128(_mm_add_epi16(diffs[1], destComps[1]), _mm_set1_epi16(0x3f));
-	diffs[2] = _mm_and_si128(_mm_add_epi16(diffs[2], destComps[2]), _mm_set1_epi16(0x1f));
-
-	// We compile all the colors into diffs[0] as a 16 bit rgb pixel
-	diffs[0] = _mm_or_si128(diffs[0], _mm_slli_epi16(diffs[1], 5));
-	return _mm_or_si128(diffs[0], _mm_slli_epi16(diffs[2], 11));
-}
-
-inline __m128i mul32_as32(__m128i a, __m128i b)
-{
-	__m128i tmp1 = _mm_mul_epu32(a,b);
-	__m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(a,4), _mm_srli_si128(b,4));
-	return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE (0,0,2,0)), _mm_shuffle_epi32(tmp2, _MM_SHUFFLE (0,0,2,0))); /* shuffle results to [63..0] and pack */
-}
-
-// preserveAlpha:
-//		false => set destCols's alpha to 0
-// 		true => keep destCols's alpha
-inline __m128i rgbBlendSIMD(__m128i srcCols, __m128i destCols, __m128i alphas, bool preserveAlpha) {
-	// Here we add 1 to alphas if it's not 0. This is what the original blender function did.
-	alphas = _mm_add_epi32(alphas, _mm_and_si128(_mm_cmpgt_epi32(alphas, _mm_setzero_si128()), _mm_set1_epi32(1)));
-
-	// Get the alpha from the destination
-	__m128i alpha = _mm_and_si128(destCols, _mm_set1_epi32(0xff000000));
-
-	// Get red and blue components
-	__m128i srcColsCopy = srcCols;
-	srcColsCopy = _mm_and_si128(srcColsCopy, _mm_set1_epi32(0xff00ff));
-	__m128i destColsCopy = destCols;
-	destColsCopy = _mm_and_si128(destColsCopy, _mm_set1_epi32(0xff00ff));
-
-	// Compute the difference, then multiply by alpha and divide by 256
-	srcColsCopy = _mm_sub_epi32(srcColsCopy, destColsCopy);
-	srcColsCopy = mul32_as32(srcColsCopy, alphas);
-	//srcColsCopy = _mm_mul_epi32(srcColsCopy, alphas);
-	srcColsCopy = _mm_srli_epi32(srcColsCopy, 8);
-	srcColsCopy = _mm_add_epi32(srcColsCopy, destCols); // Add the new red/blue to the old red/blue
-
-	// Do the same for the green component
-	srcCols = _mm_and_si128(srcCols, _mm_set1_epi32(0xff00));
-	destCols = _mm_and_si128(destCols, _mm_set1_epi32(0xff00));
-	srcCols = _mm_sub_epi32(srcCols, destCols);
-	srcCols = mul32_as32(srcCols, alphas);
-	//srcCols = _mm_mul_epi32(srcCols, alphas);
-	srcCols = _mm_srli_epi32(srcCols, 8);
-	srcCols = _mm_add_epi32(srcCols, destCols); // Add the new green to the old green
-
-	// Keep values in 8bit range and glue red/blue and green together
-	srcColsCopy = _mm_and_si128(srcColsCopy, _mm_set1_epi32(0xff00ff));
-	srcCols = _mm_and_si128(srcCols, _mm_set1_epi32(0xff00));
-	srcCols = _mm_or_si128(srcCols, srcColsCopy);
-
-	// Remember that alpha is not alphas, but rather the alpha of destcols
-	if (preserveAlpha) {
-		srcCols = _mm_and_si128(srcCols, _mm_set1_epi32(0x00ffffff));
-		srcCols = _mm_or_si128(srcCols, alpha);
-	}
-	return srcCols;
-}
-
-inline __m128i argbBlendSIMD(__m128i srcCols, __m128i destCols) {
-	__m128 srcA = _mm_cvtepi32_ps(_mm_srli_epi32(srcCols, 24));
-	srcA = _mm_mul_ps(srcA, _mm_set1_ps(1.0f / 255.0f));
-	__m128 srcR = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(srcCols, 16), _mm_set1_epi32(0xff)));
-	__m128 srcG = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(srcCols, 8), _mm_set1_epi32(0xff)));
-	__m128 srcB = _mm_cvtepi32_ps(_mm_and_si128(srcCols, _mm_set1_epi32(0xff)));
-
-	__m128 destA = _mm_cvtepi32_ps(_mm_srli_epi32(destCols, 24));
-	destA = _mm_mul_ps(destA, _mm_set1_ps(1.0f / 255.0f));
-	__m128 destR = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(destCols, 16), _mm_set1_epi32(0xff)));
-	__m128 destG = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(destCols, 8), _mm_set1_epi32(0xff)));
-	__m128 destB = _mm_cvtepi32_ps(_mm_and_si128(destCols, _mm_set1_epi32(0xff)));
-
-	// the destination alpha gets multiplied by 255 - source alpha
-	destA = _mm_mul_ps(destA, _mm_sub_ps(_mm_set1_ps(1.0f), srcA));
-
-	// ((src * sAlpha) + (dest * dAlpha)) / (sAlpha + dAlpha)
-	__m128 combA = _mm_add_ps(srcA, destA);
-	__m128 combArcp = _mm_rcp_ps(combA);
-	destR = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(srcR, srcA), _mm_mul_ps(destR, destA)), combArcp);
-	destG = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(srcG, srcA), _mm_mul_ps(destG, destA)), combArcp);
-	destB = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(srcB, srcA), _mm_mul_ps(destB, destA)), combArcp);
-	combA = _mm_mul_ps(combA, _mm_set1_ps(255.0));
-
-	// Now put it back together
-	return _mm_or_si128(_mm_slli_epi32(_mm_cvtps_epi32(combA), 24),
-		_mm_or_si128(_mm_slli_epi32(_mm_cvtps_epi32(destR), 16),
-		_mm_or_si128(_mm_slli_epi32(_mm_cvtps_epi32(destG), 8),
-			_mm_cvtps_epi32(destB))));
-}
-
-inline __m128i blendTintSpriteSIMD(__m128i srcCols, __m128i destCols, __m128i alphas, bool light) {
-	// This function is NOT 1 to 1 with the original... It just approximates it
-	// It gets the value of the HSV of the dest color
-	// Then it gets the HSV of the srcCols
-
-	// how the values are transformed
-	// from 1 uint32x4_t srcCols with each lane being ARGB uint32
-	// srcCols[0] = A | R | G | B
-	// srcCols[1] = A | R | G | B
-	// srcCols[2] = A | R | G | B
-	// srcCols[3] = A | R | G | B
-	//  ->
-	// to 4 float32x4_t's each being a separate channel with each lane
-	// corresponding to their respective srcCols lane
-	// dda = { A[0], A[1], A[2], A[3] }
-	// ddr = { R[0], R[1], R[2], R[3] }
-	// ddg = { G[0], G[1], G[2], G[3] }
-	// ddb = { B[0], B[1], B[2], B[3] }
-
-	// do the transformation (we don't actually need alpha at all)
-	__m128 ddr, ddg, ddb;
-	ddr = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(destCols, 16), _mm_set1_epi32(0xff))), _mm_set1_ps(1.0f / 255.0f));
-	ddg = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(destCols, 8), _mm_set1_epi32(0xff))), _mm_set1_ps(1.0f / 255.0f));
-	ddb = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(destCols, _mm_set1_epi32(0xff))), _mm_set1_ps(1.0f / 255.0f));
-	__m128 ssr, ssg, ssb;
-	ssr = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(srcCols, 16), _mm_set1_epi32(0xff))), _mm_set1_ps(1.0f / 255.0f));
-	ssg = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(srcCols, 8), _mm_set1_epi32(0xff))), _mm_set1_ps(1.0f / 255.0f));
-	ssb = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(srcCols, _mm_set1_epi32(0xff))), _mm_set1_ps(1.0f / 255.0f));
-
-	// Get the maxes and mins (needed for HSV->RGB and vice versa)
-	__m128 dmaxes = _mm_max_ps(ddr, _mm_max_ps(ddg, ddb));
-	__m128 smaxes = _mm_max_ps(ssr, _mm_max_ps(ssg, ssb));
-	__m128 smins = _mm_min_ps(ssr, _mm_min_ps(ssg, ssb));
-
-	// This is here to stop from dividing by 0
-	const __m128 eplison0 = _mm_set1_ps(0.0000001f);
-
-	__m128 chroma = _mm_max_ps(_mm_sub_ps(smaxes, smins), eplison0);
-
-	// RGB to HSV is a piecewise function, so we compute each part of the function first...
-	__m128 hr, hg, hb, hue;
-	hr = _mm_div_ps(_mm_sub_ps(ssg, ssb), chroma);
-	hr = _mm_sub_ps(hr, _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_mul_ps(hr, _mm_set1_ps(1.0f / 6.0f)))), _mm_set1_ps(6.0f)));
-	hr = _mm_add_ps(hr, _mm_and_ps(_mm_cmplt_ps(hr, _mm_setzero_ps()), _mm_set1_ps(6.0f)));
-	hg = _mm_add_ps(_mm_div_ps(_mm_sub_ps(ssb, ssr), chroma), _mm_set1_ps(2.0f));
-	hg = _mm_max_ps(hg, _mm_setzero_ps());
-	hb = _mm_add_ps(_mm_div_ps(_mm_sub_ps(ssr, ssg), chroma), _mm_set1_ps(4.0f));
-	hb = _mm_max_ps(hb, _mm_setzero_ps());
-
-	// And then compute which one will be used based on criteria
-	__m128 hrfactors = _mm_and_ps(_mm_and_ps(_mm_cmpeq_ps(ssr, smaxes), _mm_cmpneq_ps(ssr, ssb)), _mm_set1_ps(1.0f));
-	__m128 hgfactors = _mm_and_ps(_mm_and_ps(_mm_cmpeq_ps(ssg, smaxes), _mm_cmpneq_ps(ssg, ssr)), _mm_set1_ps(1.0f));
-	__m128 hbfactors = _mm_and_ps(_mm_and_ps(_mm_cmpeq_ps(ssb, smaxes), _mm_cmpneq_ps(ssb, ssg)), _mm_set1_ps(1.0f));
-	hue = _mm_mul_ps(hr, hrfactors);
-	hue = _mm_add_ps(hue, _mm_mul_ps(hg, hgfactors));
-	hue = _mm_add_ps(hue, _mm_mul_ps(hb, hbfactors));
-
-	// Mess with the light like the original function
-	__m128 val = dmaxes;
-	if (light) {
-		val = _mm_sub_ps(val, _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(_mm_cvtepi32_ps(alphas), _mm_set1_ps(1.0f / 250.0f))));
-		val = _mm_max_ps(val, _mm_setzero_ps());
-	}
-		
-	// then it stitches the HSV back together
-	// the hue and saturation come from the source (tint) color, and the value comes from
-	// the destination (real source) color
-	chroma = _mm_mul_ps(val, _mm_div_ps(_mm_sub_ps(smaxes, smins), _mm_add_ps(smaxes, eplison0)));
-	__m128 hprime_mod2 = _mm_mul_ps(hue, _mm_set1_ps(1.0f / 2.0f));
-	hprime_mod2 = _mm_mul_ps(_mm_sub_ps(hprime_mod2, _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_sub_ps(hprime_mod2, _mm_set1_ps(0.5))))), _mm_set1_ps(2.0f));
-	__m128 x = _mm_mul_ps(chroma, _mm_sub_ps(_mm_set1_ps(1), _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)), _mm_sub_ps(hprime_mod2, _mm_set1_ps(1)))));
-	//float32x4_t x = vmulq_f32(chroma, vsubq_f32(vmovq_n_f32(1.0f), vabsq_f32(vsubq_f32(hprime_mod2, vmovq_n_f32(1.0f)))));
-	__m128i hprime_rounded = _mm_cvtps_epi32(_mm_sub_ps(hue, _mm_set1_ps(0.5)));
-	__m128i x_int = _mm_cvtps_epi32(_mm_mul_ps(x, _mm_set1_ps(255.0f)));
-	__m128i c_int = _mm_cvtps_epi32(_mm_mul_ps(chroma, _mm_set1_ps(255.0f)));
-
-	// Again HSV->RGB is also a piecewise function
-	__m128i val0 = _mm_or_si128(_mm_slli_epi32(x_int, 8), _mm_slli_epi32(c_int, 16));
-	val0 = _mm_and_si128(val0, _mm_or_si128(_mm_cmpeq_epi32(hprime_rounded, _mm_set1_epi32(0)), _mm_cmpeq_epi32(hprime_rounded, _mm_set1_epi32(6))));
-	__m128i val1 = _mm_or_si128(_mm_slli_epi32(c_int, 8), _mm_slli_epi32(x_int, 16));
-	val1 = _mm_and_si128(val1, _mm_cmpeq_epi32(hprime_rounded, _mm_set1_epi32(1)));
-	__m128i val2 = _mm_or_si128(_mm_slli_epi32(c_int, 8), x_int);
-	val2 = _mm_and_si128(val2, _mm_cmpeq_epi32(hprime_rounded, _mm_set1_epi32(2)));
-	__m128i val3 = _mm_or_si128(_mm_slli_epi32(x_int, 8), c_int);
-	val3 = _mm_and_si128(val3, _mm_cmpeq_epi32(hprime_rounded, _mm_set1_epi32(3)));
-	__m128i val4 = _mm_or_si128(_mm_slli_epi32(x_int, 16), c_int);
-	val4 = _mm_and_si128(val4, _mm_cmpeq_epi32(hprime_rounded, _mm_set1_epi32(4)));
-	__m128i val5 = _mm_or_si128(_mm_slli_epi32(c_int, 16), x_int);
-	val5 = _mm_and_si128(val5, _mm_cmpeq_epi32(hprime_rounded, _mm_set1_epi32(5)));
-
-	// or the values together
-	__m128i final = _mm_or_si128(val0, _mm_or_si128(val1, _mm_or_si128(val2, _mm_or_si128(val3, _mm_or_si128(val4, val5)))));
-
-	// add the minimums back in
-	__m128i val_add = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(val, chroma), _mm_set1_ps(255.0f)));
-	val_add = _mm_or_si128(val_add, _mm_or_si128(_mm_slli_epi32(val_add, 8), _mm_or_si128(_mm_slli_epi32(val_add, 16), _mm_and_si128(destCols, _mm_set1_epi32(0xff000000)))));
-	final = _mm_add_epi32(final, val_add);
-	return final;
-}
-
-inline __m128i mul32_as16(__m128i a, __m128i b) {	
-	__m128i a16 = _mm_packs_epi32(a, _mm_setzero_si128());
-	__m128i b16 = _mm_packs_epi32(b, _mm_setzero_si128());
-	__m128i res = _mm_mullo_epi16(a16, b16);
-	return _mm_unpacklo_epi16(res, _mm_setzero_si128());
-}
-
-inline __m128i findmin32_as16(__m128i a, __m128i b) {
-	__m128i a16 = _mm_packs_epi32(a, _mm_setzero_si128());
-	__m128i b16 = _mm_packs_epi32(b, _mm_setzero_si128());
-	__m128i res = _mm_min_epi16(a16, b16);
-	return _mm_unpacklo_epi16(res, _mm_setzero_si128());
-}
-
-inline __m128i blendPixelSIMD(__m128i srcCols, __m128i destCols, __m128i alphas) {
-	__m128i srcAlphas, difAlphas, mask, ch1, ch2;
-	auto setupArgbAlphas = [&]() {
-		// This acts the same as this in the normal blender functions
-		// if (alpha == 0)
-		//     alpha = aSrc;
-		// else
-		//     alpha = aSrc * ((alpha & 0xff) + 1) / 256;
-		// where alpha is the alpha byte of the srcCols
-		srcAlphas = _mm_srli_epi32(srcCols, 24);
-		difAlphas = _mm_add_epi32(_mm_and_si128(alphas, _mm_set1_epi32(0xff)), _mm_set1_epi32(1));
-		difAlphas = _mm_srli_epi32(mul32_as16(srcAlphas, difAlphas), 8);
-		difAlphas = _mm_slli_epi32(difAlphas, 24);
-		srcAlphas = _mm_slli_epi32(srcAlphas, 24);
-		mask = _mm_cmpeq_epi32(alphas, _mm_setzero_si128());
-		srcAlphas = _mm_and_si128(srcAlphas, mask);
-		difAlphas = _mm_andnot_si128(mask, difAlphas);
-		srcCols = _mm_and_si128(srcCols, _mm_set1_epi32(0x00ffffff));
-		srcCols = _mm_or_si128(srcCols, _mm_or_si128(srcAlphas, difAlphas));
-	};
-	switch (_G(_blender_mode)) {
-	case kSourceAlphaBlender: // see BITMAP member function blendSourceAlpha
-		alphas = _mm_srli_epi32(srcCols, 24);
-		return rgbBlendSIMD(srcCols, destCols, alphas, false);
-	case kArgbToArgbBlender: // see BITMAP member function blendArgbToArgb
-		setupArgbAlphas();
-		// only blend if alpha isn't 0, otherwise use destCols
-		mask = _mm_cmpgt_epi32(_mm_srli_epi32(srcCols, 24), _mm_setzero_si128());
-		ch1 = _mm_and_si128(argbBlendSIMD(srcCols, destCols), mask);
-		ch2 = _mm_andnot_si128(mask, destCols);
-		return _mm_or_si128(ch1, ch2);
-	case kArgbToRgbBlender: // see BITMAP member function blendArgbToRgb
-		setupArgbAlphas();
-		return rgbBlendSIMD(srcCols, destCols, _mm_srli_epi32(srcCols, 24), false);
-	case kRgbToArgbBlender: // see BITMAP member function blendRgbToArgb
-		// if alpha is NOT 0 or 255
-		ch2 = _mm_and_si128(srcCols, _mm_set1_epi32(0x00ffffff));
-		ch2 = _mm_or_si128(ch2, _mm_slli_epi32(alphas, 24));
-		ch2 = argbBlendSIMD(ch2, destCols);
-		// if alpha is 0 or 255
-		ch1 = _mm_or_si128(srcCols, _mm_set1_epi32(0xff000000));
-		// mask and or them together
-		mask = _mm_or_si128(_mm_cmpeq_epi32(alphas, _mm_setzero_si128()), _mm_cmpeq_epi32(alphas, _mm_set1_epi32(0xff)));
-		ch1 = _mm_and_si128(ch1, mask);
-		ch2 = _mm_andnot_si128(mask, ch2);
-		return _mm_or_si128(ch1, ch2);
-	case kRgbToRgbBlender: // see BITMAP member function blendRgbToRgb
-		return rgbBlendSIMD(srcCols, destCols, alphas, false);
-	case kAlphaPreservedBlenderMode: // see BITMAP member function blendPreserveAlpha
-		return rgbBlendSIMD(srcCols, destCols, alphas, true);
-	case kOpaqueBlenderMode: // see BITMAP member function blendOpaque
-		return _mm_or_si128(srcCols, _mm_set1_epi32(0xff000000));
-	case kAdditiveBlenderMode: // see BITMAP member function blendAdditiveAlpha
-		srcAlphas = _mm_add_epi32(_mm_srli_epi32(srcCols, 24), _mm_srli_epi32(destCols, 24));
-		srcAlphas = findmin32_as16(srcAlphas, _mm_set1_epi32(0xff));
-		srcCols = _mm_and_si128(srcCols, _mm_set1_epi32(0x00ffffff));
-		return _mm_or_si128(srcCols, _mm_slli_epi32(srcAlphas, 24));
-	case kTintBlenderMode: // see BITMAP member function blendTintSprite
-		return blendTintSpriteSIMD(srcCols, destCols, alphas, false);
-	case kTintLightBlenderMode: // see BITMAP member function blendTintSprite
-		return blendTintSpriteSIMD(srcCols, destCols, alphas, true);
-	}
-	return _mm_setzero_si128();
-}
-
-#include "common/debug.h"
-inline __m128i blendPixelSIMD2Bpp(__m128i srcCols, __m128i destCols, __m128i alphas) {
-	__m128i mask, ch1, ch2;
-	switch (_G(_blender_mode)) {
-	case kSourceAlphaBlender:
-	case kOpaqueBlenderMode:
-	case kAdditiveBlenderMode:
-		return srcCols;
-	case kArgbToArgbBlender:
-	case kArgbToRgbBlender:
-		ch1 = _mm_and_si128(_mm_set1_epi16(0xff), _mm_cmpeq_epi16(alphas, _mm_setzero_si128()));
-		ch2 = _mm_and_si128(alphas, _mm_cmpgt_epi16(alphas, _mm_setzero_si128()));
-		alphas = _mm_or_si128(ch1, ch2);
-		// fall through
-	case kRgbToRgbBlender:
-	case kAlphaPreservedBlenderMode:
-		return rgbBlendSIMD2Bpp(srcCols, destCols, alphas);
-	case kRgbToArgbBlender:
-		mask = _mm_or_si128(_mm_cmpeq_epi16(alphas, _mm_set1_epi16(0)), _mm_cmpeq_epi16(alphas, _mm_set1_epi16(255)));
-		ch1 = _mm_and_si128(srcCols, mask);
-		ch2 = _mm_andnot_si128(mask, rgbBlendSIMD2Bpp(srcCols, destCols, alphas));
-		return _mm_or_si128(ch1, ch2);
-	case kTintBlenderMode:
-	case kTintLightBlenderMode:
-		__m128i srcColsLo = simd2BppTo4Bpp(_mm_and_si128(srcCols, _mm_set_epi32(0, 0, -1, -1)));
-		__m128i srcColsHi = simd2BppTo4Bpp(_mm_srli_si128(srcCols, 8));
-		__m128i destColsLo = simd2BppTo4Bpp(_mm_and_si128(destCols, _mm_set_epi32(0, 0, -1, -1)));
-		__m128i destColsHi = simd2BppTo4Bpp(_mm_srli_si128(destCols, 8));
-		__m128i alphasLo = _mm_unpacklo_epi16(_mm_and_si128(alphas, _mm_set_epi32(0, 0, -1, -1)), _mm_setzero_si128());
-		__m128i alphasHi = _mm_unpacklo_epi16(_mm_srli_si128(alphas, 8), _mm_setzero_si128());
-		__m128i lo = simd4BppTo2Bpp(blendTintSpriteSIMD(srcColsLo, destColsLo, alphasLo, _G(_blender_mode) == kTintLightBlenderMode));
-		__m128i hi = simd4BppTo2Bpp(blendTintSpriteSIMD(srcColsHi, destColsHi, alphasHi, _G(_blender_mode) == kTintLightBlenderMode));
-		return _mm_or_si128(lo, _mm_slli_si128(hi, 8));
-	}
-	return _mm_setzero_si128();
-}
-
-template<int DestBytesPerPixel, int SrcBytesPerPixel>
-inline void drawPixelSIMD(byte *destPtr, const byte *srcP2, __m128i tint, __m128i alphas, __m128i maskedAlphas, __m128i transColors, int xDir, int xCtrBpp, int srcAlpha, int skipTrans, bool horizFlip, bool useTint, __m128i skipMask) {
-	__m128i srcCols, destCol;
-
-	if (DestBytesPerPixel == 4)
-		destCol = _mm_loadu_si128((const __m128i *)destPtr);
-	else
-		destCol = simd2BppTo4Bpp(_mm_loadl_epi64((const __m128i *)destPtr));
-	if (SrcBytesPerPixel == 4)
-		srcCols = _mm_loadu_si128((const __m128i *)(srcP2 + xDir * xCtrBpp));
-	else
-		srcCols = simd2BppTo4Bpp(_mm_loadl_epi64((const __m128i *)(srcP2 + xDir * xCtrBpp)));
-
-	// we do this here because we need to check if we should skip the pixel before we blend it
-	__m128i mask1 = skipTrans ? _mm_cmpeq_epi32(_mm_and_si128(srcCols, maskedAlphas), transColors) : _mm_setzero_si128();
-	mask1 = _mm_or_si128(mask1, skipMask);
-	if (srcAlpha != -1) {
-		// take into account for useTint
-		if (useTint) {
-			srcCols = blendPixelSIMD(tint, srcCols, alphas);
-		} else {
-			srcCols = blendPixelSIMD(srcCols, destCol, alphas);
-		}
-	}
-	__m128i destCols2 = _mm_and_si128(destCol, mask1);
-	__m128i srcCols2 = _mm_andnot_si128(mask1, srcCols);
-	__m128i final = _mm_or_si128(destCols2, srcCols2);
-	if (horizFlip) {
-		final = _mm_shuffle_epi32(final, _MM_SHUFFLE(0, 1, 2, 3));
-	}
-	if (DestBytesPerPixel == 4) {
-		_mm_storeu_si128((__m128i *)destPtr, final);
-	} else {
-		_mm_storel_epi64((__m128i *)destPtr, simd4BppTo2Bpp(final));
-	}
-}
-
-inline void drawPixelSIMD2Bpp(byte *destPtr, const byte *srcP2, __m128i tint, __m128i alphas, __m128i transColors, int xDir, int xCtrBpp, int srcAlpha, int skipTrans, bool horizFlip, bool useTint, __m128i skipMask) {
-	__m128i destCol = _mm_loadu_si128((const __m128i *)destPtr);
-	__m128i srcCols = _mm_loadu_si128((const __m128i *)(srcP2 + xDir * xCtrBpp));
-	__m128i mask1 = skipTrans ? _mm_cmpeq_epi16(srcCols, transColors) : _mm_setzero_si128();
-	mask1 = _mm_or_si128(mask1, skipMask);
-	if (srcAlpha != -1) {
-		// take into account for useTint
-		if (useTint) {
-			srcCols = blendPixelSIMD2Bpp(tint, srcCols, alphas);
-		} else {
-			srcCols = blendPixelSIMD2Bpp(srcCols, destCol, alphas);
-		}
-	}
-	__m128i destCols2 = _mm_and_si128(destCol, mask1);
-	__m128i srcCols2 = _mm_andnot_si128(mask1, srcCols);
-	__m128i final = _mm_or_si128(destCols2, srcCols2);
-	if (horizFlip) {
-		final = _mm_shufflelo_epi16(final, _MM_SHUFFLE(0, 1, 2, 3));
-		final = _mm_shufflehi_epi16(final, _MM_SHUFFLE(0, 1, 2, 3));
-		final = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(final), _mm_castsi128_pd(final), _MM_SHUFFLE2(0, 1)));
-	}
-	_mm_storeu_si128((__m128i *)destPtr, final);
-}
-
-} // namespace AGS3
-
-#endif /* __x86_64__ __i686__ */
-#endif /* AGS_LIB_ALLEGRO_SURFACE_SIMD_SSE */
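
For readers following the tint blender being removed here (and re-added in the new SSE2 file below): the masked val0..val5 selection is the standard piecewise HSV'->RGB mapping evaluated branch-free across four lanes; each candidate is ANDed with the mask for its hue sector, the candidates are ORed together, and the per-lane minimum (val - chroma) is added back in. A minimal scalar sketch of the same step (illustrative helper, not engine code):

    #include <math.h>

    // h is the hue sector in [0, 6), c is chroma, v is value, all channels in [0, 1].
    static void hsvSectorToRgb(float h, float c, float v, float *r, float *g, float *b) {
        float x = c * (1.0f - fabsf(fmodf(h, 2.0f) - 1.0f));
        float m = v - c; // the "minimum" that gets added back at the end
        switch ((int)h) { // sector index, the role hprime_rounded plays above
        case 0:  *r = c; *g = x; *b = 0; break;
        case 1:  *r = x; *g = c; *b = 0; break;
        case 2:  *r = 0; *g = c; *b = x; break;
        case 3:  *r = 0; *g = x; *b = c; break;
        case 4:  *r = x; *g = 0; *b = c; break;
        default: *r = c; *g = 0; *b = x; break; // sector 5
        }
        *r += m; *g += m; *b += m;
    }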
diff --git a/engines/ags/lib/allegro/surface_sse2.cpp b/engines/ags/lib/allegro/surface_sse2.cpp
new file mode 100644
index 00000000000..51f93df8eb8
--- /dev/null
+++ b/engines/ags/lib/allegro/surface_sse2.cpp
@@ -0,0 +1,940 @@
+#include <immintrin.h>
+#include "ags/lib/allegro/gfx.h"
+#include "ags/lib/allegro/color.h"
+#include "ags/lib/allegro/flood.h"
+#include "ags/ags.h"
+#include "ags/globals.h"
+#include "common/textconsole.h"
+#include "graphics/screen.h"
+
+namespace AGS3 {
+
+inline __m128i simd2BppTo4Bpp(__m128i pixels) {
+	__m128i x = _mm_unpacklo_epi16(pixels, _mm_setzero_si128());
+
+	// c is the extracted 5/6 bit color from the image
+	__m128i c = _mm_srli_epi32(x, 11);
+
+	// We convert it back to normal by shifting it left three times, and then using the 2 most
+	// significant bits of the original color as the least significant bits of the new one
+	__m128i r = _mm_slli_epi32(_mm_or_si128(_mm_slli_epi32(c, 3), _mm_srli_epi32(c, 2)), 16);
+	c = _mm_srli_epi32(_mm_and_si128(x, _mm_set1_epi32(0x07e0)), 5);
+	__m128i g = _mm_slli_epi32(_mm_or_si128(_mm_slli_epi32(c, 2), _mm_srli_epi32(c, 4)), 8);
+	c = _mm_and_si128(x, _mm_set1_epi32(0x001f));
+	__m128i b = _mm_or_si128(_mm_slli_epi32(c, 3), _mm_srli_epi32(c, 2));
+
+	// By default 2bpp to 4bpp makes the alpha channel 255
+	return _mm_or_si128(_mm_or_si128(_mm_or_si128(r, g), b), _mm_set1_epi32(0xff000000));
+}
+
+inline __m128i simd4BppTo2Bpp(__m128i pixels) {
+	// x is the final 16 bit rgb pixel
+	__m128i x = _mm_srli_epi32(_mm_and_si128(pixels, _mm_set1_epi32(0x000000ff)), 3);
+	x = _mm_or_si128(x, _mm_slli_epi32(_mm_srli_epi32(_mm_and_si128(pixels, _mm_set1_epi32(0x0000ff00)), 8+2), 5));
+	x = _mm_or_si128(x, _mm_slli_epi32(_mm_srli_epi32(_mm_and_si128(pixels, _mm_set1_epi32(0x00ff0000)), 16+3), 11));
+	x = _mm_slli_epi32(x, 16);
+	x = _mm_srai_epi32(x, 16);
+	return _mm_packs_epi32(x, _mm_setzero_si128());
+}
+
+inline __m128i rgbBlendSIMD2Bpp(__m128i srcCols, __m128i destCols, __m128i alphas) {
+	// Here we add 1 to alphas if it's not 0. This is what the original blender function did
+	alphas = _mm_add_epi16(alphas, _mm_and_si128(_mm_cmpgt_epi16(alphas, _mm_setzero_si128()), _mm_set1_epi16(1)));
+
+	// Split the components into rgb
+	__m128i srcComps[] = {
+		_mm_and_si128(srcCols, _mm_set1_epi16(0x1f)),		    		 // B
+		_mm_and_si128(_mm_srli_epi16(srcCols, 5), _mm_set1_epi16(0x3f)), // G
+		_mm_srli_epi16(srcCols, 11),									 // R
+	}, destComps[] = {
+		_mm_and_si128(destCols, _mm_set1_epi16(0x1f)),		    		  // B
+		_mm_and_si128(_mm_srli_epi16(destCols, 5), _mm_set1_epi16(0x3f)), // G
+		_mm_srli_epi16(destCols, 11),									  // R
+	};
+
+	// Calculate the differences between the colors
+	__m128i diffs[] = {
+		_mm_sub_epi16(srcComps[0], destComps[0]), // B
+		_mm_sub_epi16(srcComps[1], destComps[1]), // G
+		_mm_sub_epi16(srcComps[2], destComps[2]), // R
+	};
+
+	// Multiply by alpha and shift depth bits to the right
+	// pretty much the same as (int)(((float)component / 255.0f) * ((float)alpha / 255.0f) * 255.0f)
+	alphas = _mm_srli_epi16(alphas, 2);
+	diffs[1] = _mm_srli_epi16(_mm_mullo_epi16(diffs[1], alphas), 6);
+	alphas = _mm_srli_epi16(alphas, 1);
+	diffs[0] = _mm_srli_epi16(_mm_mullo_epi16(diffs[0], alphas), 5);
+	diffs[2] = _mm_srli_epi16(_mm_mullo_epi16(diffs[2], alphas), 5);
+
+	// Here we add the difference between the 2 colors times alpha onto the destination
+	diffs[0] = _mm_and_si128(_mm_add_epi16(diffs[0], destComps[0]), _mm_set1_epi16(0x1f));
+	diffs[1] = _mm_and_si128(_mm_add_epi16(diffs[1], destComps[1]), _mm_set1_epi16(0x3f));
+	diffs[2] = _mm_and_si128(_mm_add_epi16(diffs[2], destComps[2]), _mm_set1_epi16(0x1f));
+
+	// We combine all the colors into diffs[0] as a 16 bit rgb pixel
+	diffs[0] = _mm_or_si128(diffs[0], _mm_slli_epi16(diffs[1], 5));
+	return _mm_or_si128(diffs[0], _mm_slli_epi16(diffs[2], 11));
+}
+
+inline __m128i mul32_as32(__m128i a, __m128i b)
+{
+	__m128i tmp1 = _mm_mul_epu32(a,b);
+	__m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(a,4), _mm_srli_si128(b,4));
+	return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE (0,0,2,0)), _mm_shuffle_epi32(tmp2, _MM_SHUFFLE (0,0,2,0))); /* shuffle results to [63..0] and pack */
+}
+
+// preserveAlpha:
+//		false => set destCols's alpha to 0
+// 		true => keep destCols's alpha
+inline __m128i rgbBlendSIMD(__m128i srcCols, __m128i destCols, __m128i alphas, bool preserveAlpha) {
+	// Here we add 1 to alphas if it's not 0. This is what the original blender function did.
+	alphas = _mm_add_epi32(alphas, _mm_and_si128(_mm_cmpgt_epi32(alphas, _mm_setzero_si128()), _mm_set1_epi32(1)));
+
+	// Get the alpha from the destination
+	__m128i alpha = _mm_and_si128(destCols, _mm_set1_epi32(0xff000000));
+
+	// Get red and blue components
+	__m128i srcColsCopy = srcCols;
+	srcColsCopy = _mm_and_si128(srcColsCopy, _mm_set1_epi32(0xff00ff));
+	__m128i destColsCopy = destCols;
+	destColsCopy = _mm_and_si128(destColsCopy, _mm_set1_epi32(0xff00ff));
+
+	// Compute the difference, then multiply by alpha and divide by 256
+	srcColsCopy = _mm_sub_epi32(srcColsCopy, destColsCopy);
+	srcColsCopy = mul32_as32(srcColsCopy, alphas);
+	//srcColsCopy = _mm_mul_epi32(srcColsCopy, alphas);
+	srcColsCopy = _mm_srli_epi32(srcColsCopy, 8);
+	srcColsCopy = _mm_add_epi32(srcColsCopy, destCols); // Add the new red/blue to the old red/blue
+
+	// Do the same for the green component
+	srcCols = _mm_and_si128(srcCols, _mm_set1_epi32(0xff00));
+	destCols = _mm_and_si128(destCols, _mm_set1_epi32(0xff00));
+	srcCols = _mm_sub_epi32(srcCols, destCols);
+	srcCols = mul32_as32(srcCols, alphas);
+	//srcCols = _mm_mul_epi32(srcCols, alphas);
+	srcCols = _mm_srli_epi32(srcCols, 8);
+	srcCols = _mm_add_epi32(srcCols, destCols); // Add the new green to the old green
+
+	// Keep values in 8bit range and glue red/blue and green together
+	srcColsCopy = _mm_and_si128(srcColsCopy, _mm_set1_epi32(0xff00ff));
+	srcCols = _mm_and_si128(srcCols, _mm_set1_epi32(0xff00));
+	srcCols = _mm_or_si128(srcCols, srcColsCopy);
+
+	// Remember that alpha is not alphas, but rather the alpha of destCols
+	if (preserveAlpha) {
+		srcCols = _mm_and_si128(srcCols, _mm_set1_epi32(0x00ffffff));
+		srcCols = _mm_or_si128(srcCols, alpha);
+	}
+	return srcCols;
+}
+
+inline __m128i argbBlendSIMD(__m128i srcCols, __m128i destCols) {
+	__m128 srcA = _mm_cvtepi32_ps(_mm_srli_epi32(srcCols, 24));
+	srcA = _mm_mul_ps(srcA, _mm_set1_ps(1.0f / 255.0f));
+	__m128 srcR = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(srcCols, 16), _mm_set1_epi32(0xff)));
+	__m128 srcG = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(srcCols, 8), _mm_set1_epi32(0xff)));
+	__m128 srcB = _mm_cvtepi32_ps(_mm_and_si128(srcCols, _mm_set1_epi32(0xff)));
+
+	__m128 destA = _mm_cvtepi32_ps(_mm_srli_epi32(destCols, 24));
+	destA = _mm_mul_ps(destA, _mm_set1_ps(1.0f / 255.0f));
+	__m128 destR = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(destCols, 16), _mm_set1_epi32(0xff)));
+	__m128 destG = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(destCols, 8), _mm_set1_epi32(0xff)));
+	__m128 destB = _mm_cvtepi32_ps(_mm_and_si128(destCols, _mm_set1_epi32(0xff)));
+
+	// the destination alpha gets multiplied by 255 - source alpha
+	destA = _mm_mul_ps(destA, _mm_sub_ps(_mm_set1_ps(1.0f), srcA));
+
+	// ((src * sAlpha) + (dest * dAlpha)) / (sAlpha + dAlpha)
+	__m128 combA = _mm_add_ps(srcA, destA);
+	__m128 combArcp = _mm_rcp_ps(combA);
+	destR = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(srcR, srcA), _mm_mul_ps(destR, destA)), combArcp);
+	destG = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(srcG, srcA), _mm_mul_ps(destG, destA)), combArcp);
+	destB = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(srcB, srcA), _mm_mul_ps(destB, destA)), combArcp);
+	combA = _mm_mul_ps(combA, _mm_set1_ps(255.0));
+
+	// Now put it back together
+	return _mm_or_si128(_mm_slli_epi32(_mm_cvtps_epi32(combA), 24),
+		_mm_or_si128(_mm_slli_epi32(_mm_cvtps_epi32(destR), 16),
+		_mm_or_si128(_mm_slli_epi32(_mm_cvtps_epi32(destG), 8),
+			_mm_cvtps_epi32(destB))));
+}
+
+inline __m128i blendTintSpriteSIMD(__m128i srcCols, __m128i destCols, __m128i alphas, bool light) {
+	// This function is NOT 1 to 1 with the original... It just approximates it
+	// It gets the value of the HSV of the dest color
+	// Then it gets the HSV of the srcCols
+
+	// how the values are transformed
+	// from 1 __m128i srcCols with each lane being an ARGB uint32
+	// srcCols[0] = A | R | G | B
+	// srcCols[1] = A | R | G | B
+	// srcCols[2] = A | R | G | B
+	// srcCols[3] = A | R | G | B
+	//  ->
+	// to 4 __m128's, each being a separate channel, with each lane
+	// corresponding to its respective srcCols lane
+	// dda = { A[0], A[1], A[2], A[3] }
+	// ddr = { R[0], R[1], R[2], R[3] }
+	// ddg = { G[0], G[1], G[2], G[3] }
+	// ddb = { B[0], B[1], B[2], B[3] }
+
+	// do the transformation (we don't actually need alpha at all)
+	__m128 ddr, ddg, ddb;
+	ddr = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(destCols, 16), _mm_set1_epi32(0xff))), _mm_set1_ps(1.0f / 255.0f));
+	ddg = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(destCols, 8), _mm_set1_epi32(0xff))), _mm_set1_ps(1.0f / 255.0f));
+	ddb = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(destCols, _mm_set1_epi32(0xff))), _mm_set1_ps(1.0f / 255.0f));
+	__m128 ssr, ssg, ssb;
+	ssr = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(srcCols, 16), _mm_set1_epi32(0xff))), _mm_set1_ps(1.0f / 255.0f));
+	ssg = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(srcCols, 8), _mm_set1_epi32(0xff))), _mm_set1_ps(1.0f / 255.0f));
+	ssb = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(srcCols, _mm_set1_epi32(0xff))), _mm_set1_ps(1.0f / 255.0f));
+
+	// Get the maxes and mins (needed for HSV->RGB and vice versa)
+	__m128 dmaxes = _mm_max_ps(ddr, _mm_max_ps(ddg, ddb));
+	__m128 smaxes = _mm_max_ps(ssr, _mm_max_ps(ssg, ssb));
+	__m128 smins = _mm_min_ps(ssr, _mm_min_ps(ssg, ssb));
+
+	// This is here to stop us from dividing by 0
+	const __m128 eplison0 = _mm_set1_ps(0.0000001f);
+
+	__m128 chroma = _mm_max_ps(_mm_sub_ps(smaxes, smins), eplison0);
+
+	// RGB to HSV is a piecewise function, so we compute each part of the function first...
+	__m128 hr, hg, hb, hue;
+	hr = _mm_div_ps(_mm_sub_ps(ssg, ssb), chroma);
+	hr = _mm_sub_ps(hr, _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_mul_ps(hr, _mm_set1_ps(1.0f / 6.0f)))), _mm_set1_ps(6.0f)));
+	hr = _mm_add_ps(hr, _mm_and_ps(_mm_cmplt_ps(hr, _mm_setzero_ps()), _mm_set1_ps(6.0f)));
+	hg = _mm_add_ps(_mm_div_ps(_mm_sub_ps(ssb, ssr), chroma), _mm_set1_ps(2.0f));
+	hg = _mm_max_ps(hg, _mm_setzero_ps());
+	hb = _mm_add_ps(_mm_div_ps(_mm_sub_ps(ssr, ssg), chroma), _mm_set1_ps(4.0f));
+	hb = _mm_max_ps(hb, _mm_setzero_ps());
+
+	// And then compute which one will be used based on criteria
+	__m128 hrfactors = _mm_and_ps(_mm_and_ps(_mm_cmpeq_ps(ssr, smaxes), _mm_cmpneq_ps(ssr, ssb)), _mm_set1_ps(1.0f));
+	__m128 hgfactors = _mm_and_ps(_mm_and_ps(_mm_cmpeq_ps(ssg, smaxes), _mm_cmpneq_ps(ssg, ssr)), _mm_set1_ps(1.0f));
+	__m128 hbfactors = _mm_and_ps(_mm_and_ps(_mm_cmpeq_ps(ssb, smaxes), _mm_cmpneq_ps(ssb, ssg)), _mm_set1_ps(1.0f));
+	hue = _mm_mul_ps(hr, hrfactors);
+	hue = _mm_add_ps(hue, _mm_mul_ps(hg, hgfactors));
+	hue = _mm_add_ps(hue, _mm_mul_ps(hb, hbfactors));
+
+	// Mess with the light like the original function
+	__m128 val = dmaxes;
+	if (light) {
+		val = _mm_sub_ps(val, _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(_mm_cvtepi32_ps(alphas), _mm_set1_ps(1.0f / 250.0f))));
+		val = _mm_max_ps(val, _mm_setzero_ps());
+	}
+		
+	// then it stitches the HSV back together
+	// the hue and saturation come from the source (tint) color, and the value comes from
+	// the destination (real source) color
+	chroma = _mm_mul_ps(val, _mm_div_ps(_mm_sub_ps(smaxes, smins), _mm_add_ps(smaxes, eplison0)));
+	__m128 hprime_mod2 = _mm_mul_ps(hue, _mm_set1_ps(1.0f / 2.0f));
+	hprime_mod2 = _mm_mul_ps(_mm_sub_ps(hprime_mod2, _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_sub_ps(hprime_mod2, _mm_set1_ps(0.5))))), _mm_set1_ps(2.0f));
+	__m128 x = _mm_mul_ps(chroma, _mm_sub_ps(_mm_set1_ps(1), _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)), _mm_sub_ps(hprime_mod2, _mm_set1_ps(1)))));
+	//float32x4_t x = vmulq_f32(chroma, vsubq_f32(vmovq_n_f32(1.0f), vabsq_f32(vsubq_f32(hprime_mod2, vmovq_n_f32(1.0f)))));
+	__m128i hprime_rounded = _mm_cvtps_epi32(_mm_sub_ps(hue, _mm_set1_ps(0.5)));
+	__m128i x_int = _mm_cvtps_epi32(_mm_mul_ps(x, _mm_set1_ps(255.0f)));
+	__m128i c_int = _mm_cvtps_epi32(_mm_mul_ps(chroma, _mm_set1_ps(255.0f)));
+
+	// Again HSV->RGB is also a piecewise function
+	__m128i val0 = _mm_or_si128(_mm_slli_epi32(x_int, 8), _mm_slli_epi32(c_int, 16));
+	val0 = _mm_and_si128(val0, _mm_or_si128(_mm_cmpeq_epi32(hprime_rounded, _mm_set1_epi32(0)), _mm_cmpeq_epi32(hprime_rounded, _mm_set1_epi32(6))));
+	__m128i val1 = _mm_or_si128(_mm_slli_epi32(c_int, 8), _mm_slli_epi32(x_int, 16));
+	val1 = _mm_and_si128(val1, _mm_cmpeq_epi32(hprime_rounded, _mm_set1_epi32(1)));
+	__m128i val2 = _mm_or_si128(_mm_slli_epi32(c_int, 8), x_int);
+	val2 = _mm_and_si128(val2, _mm_cmpeq_epi32(hprime_rounded, _mm_set1_epi32(2)));
+	__m128i val3 = _mm_or_si128(_mm_slli_epi32(x_int, 8), c_int);
+	val3 = _mm_and_si128(val3, _mm_cmpeq_epi32(hprime_rounded, _mm_set1_epi32(3)));
+	__m128i val4 = _mm_or_si128(_mm_slli_epi32(x_int, 16), c_int);
+	val4 = _mm_and_si128(val4, _mm_cmpeq_epi32(hprime_rounded, _mm_set1_epi32(4)));
+	__m128i val5 = _mm_or_si128(_mm_slli_epi32(c_int, 16), x_int);
+	val5 = _mm_and_si128(val5, _mm_cmpeq_epi32(hprime_rounded, _mm_set1_epi32(5)));
+
+	// or the values together
+	__m128i final = _mm_or_si128(val0, _mm_or_si128(val1, _mm_or_si128(val2, _mm_or_si128(val3, _mm_or_si128(val4, val5)))));
+
+	// add the minimums back in
+	__m128i val_add = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(val, chroma), _mm_set1_ps(255.0f)));
+	val_add = _mm_or_si128(val_add, _mm_or_si128(_mm_slli_epi32(val_add, 8), _mm_or_si128(_mm_slli_epi32(val_add, 16), _mm_and_si128(destCols, _mm_set1_epi32(0xff000000)))));
+	final = _mm_add_epi32(final, val_add);
+	return final;
+}
+
+inline __m128i mul32_as16(__m128i a, __m128i b) {	
+	__m128i a16 = _mm_packs_epi32(a, _mm_setzero_si128());
+	__m128i b16 = _mm_packs_epi32(b, _mm_setzero_si128());
+	__m128i res = _mm_mullo_epi16(a16, b16);
+	return _mm_unpacklo_epi16(res, _mm_setzero_si128());
+}
+
+inline __m128i findmin32_as16(__m128i a, __m128i b) {
+	__m128i a16 = _mm_packs_epi32(a, _mm_setzero_si128());
+	__m128i b16 = _mm_packs_epi32(b, _mm_setzero_si128());
+	__m128i res = _mm_min_epi16(a16, b16);
+	return _mm_unpacklo_epi16(res, _mm_setzero_si128());
+}
+
+inline __m128i blendPixelSIMD(__m128i srcCols, __m128i destCols, __m128i alphas) {
+	__m128i srcAlphas, difAlphas, mask, ch1, ch2;
+	auto setupArgbAlphas = [&]() {
+		// This acts the same as the following code in the normal blender functions
+		// if (alpha == 0)
+		//     alpha = aSrc;
+		// else
+		//     alpha = aSrc * ((alpha & 0xff) + 1) / 256;
+		// where alpha is the alpha byte of the srcCols
+		srcAlphas = _mm_srli_epi32(srcCols, 24);
+		difAlphas = _mm_add_epi32(_mm_and_si128(alphas, _mm_set1_epi32(0xff)), _mm_set1_epi32(1));
+		difAlphas = _mm_srli_epi32(mul32_as16(srcAlphas, difAlphas), 8);
+		difAlphas = _mm_slli_epi32(difAlphas, 24);
+		srcAlphas = _mm_slli_epi32(srcAlphas, 24);
+		mask = _mm_cmpeq_epi32(alphas, _mm_setzero_si128());
+		srcAlphas = _mm_and_si128(srcAlphas, mask);
+		difAlphas = _mm_andnot_si128(mask, difAlphas);
+		srcCols = _mm_and_si128(srcCols, _mm_set1_epi32(0x00ffffff));
+		srcCols = _mm_or_si128(srcCols, _mm_or_si128(srcAlphas, difAlphas));
+	};
+	switch (_G(_blender_mode)) {
+	case kSourceAlphaBlender: // see BITMAP member function blendSourceAlpha
+		alphas = _mm_srli_epi32(srcCols, 24);
+		return rgbBlendSIMD(srcCols, destCols, alphas, false);
+	case kArgbToArgbBlender: // see BITMAP member function blendArgbToArgb
+		setupArgbAlphas();
+		// only blend if alpha isn't 0, otherwise use destCols
+		mask = _mm_cmpgt_epi32(_mm_srli_epi32(srcCols, 24), _mm_setzero_si128());
+		ch1 = _mm_and_si128(argbBlendSIMD(srcCols, destCols), mask);
+		ch2 = _mm_andnot_si128(mask, destCols);
+		return _mm_or_si128(ch1, ch2);
+	case kArgbToRgbBlender: // see BITMAP member function blendArgbToRgb
+		setupArgbAlphas();
+		return rgbBlendSIMD(srcCols, destCols, _mm_srli_epi32(srcCols, 24), false);
+	case kRgbToArgbBlender: // see BITMAP member function blendRgbToArgb
+		// if alpha is NOT 0 or 255
+		ch2 = _mm_and_si128(srcCols, _mm_set1_epi32(0x00ffffff));
+		ch2 = _mm_or_si128(ch2, _mm_slli_epi32(alphas, 24));
+		ch2 = argbBlendSIMD(ch2, destCols);
+		// if alpha is 0 or 255
+		ch1 = _mm_or_si128(srcCols, _mm_set1_epi32(0xff000000));
+		// mask and or them together
+		mask = _mm_or_si128(_mm_cmpeq_epi32(alphas, _mm_setzero_si128()), _mm_cmpeq_epi32(alphas, _mm_set1_epi32(0xff)));
+		ch1 = _mm_and_si128(ch1, mask);
+		ch2 = _mm_andnot_si128(mask, ch2);
+		return _mm_or_si128(ch1, ch2);
+	case kRgbToRgbBlender: // see BITMAP member function blendRgbToRgb
+		return rgbBlendSIMD(srcCols, destCols, alphas, false);
+	case kAlphaPreservedBlenderMode: // see BITMAP member function blendPreserveAlpha
+		return rgbBlendSIMD(srcCols, destCols, alphas, true);
+	case kOpaqueBlenderMode: // see BITMAP member function blendOpaque
+		return _mm_or_si128(srcCols, _mm_set1_epi32(0xff000000));
+	case kAdditiveBlenderMode: // see BITMAP member function blendAdditiveAlpha
+		srcAlphas = _mm_add_epi32(_mm_srli_epi32(srcCols, 24), _mm_srli_epi32(destCols, 24));
+		srcAlphas = findmin32_as16(srcAlphas, _mm_set1_epi32(0xff));
+		srcCols = _mm_and_si128(srcCols, _mm_set1_epi32(0x00ffffff));
+		return _mm_or_si128(srcCols, _mm_slli_epi32(srcAlphas, 24));
+	case kTintBlenderMode: // see BITMAP member function blendTintSprite
+		return blendTintSpriteSIMD(srcCols, destCols, alphas, false);
+	case kTintLightBlenderMode: // see BITMAP member function blendTintSprite
+		return blendTintSpriteSIMD(srcCols, destCols, alphas, true);
+	}
+	return _mm_setzero_si128();
+}
+
+inline __m128i blendPixelSIMD2Bpp(__m128i srcCols, __m128i destCols, __m128i alphas) {
+	__m128i mask, ch1, ch2;
+	switch (_G(_blender_mode)) {
+	case kSourceAlphaBlender:
+	case kOpaqueBlenderMode:
+	case kAdditiveBlenderMode:
+		return srcCols;
+	case kArgbToArgbBlender:
+	case kArgbToRgbBlender:
+		ch1 = _mm_and_si128(_mm_set1_epi16(0xff), _mm_cmpeq_epi16(alphas, _mm_setzero_si128()));
+		ch2 = _mm_and_si128(alphas, _mm_cmpgt_epi16(alphas, _mm_setzero_si128()));
+		alphas = _mm_or_si128(ch1, ch2);
+		// fall through
+	case kRgbToRgbBlender:
+	case kAlphaPreservedBlenderMode:
+		return rgbBlendSIMD2Bpp(srcCols, destCols, alphas);
+	case kRgbToArgbBlender:
+		mask = _mm_or_si128(_mm_cmpeq_epi16(alphas, _mm_set1_epi16(0)), _mm_cmpeq_epi16(alphas, _mm_set1_epi16(255)));
+		ch1 = _mm_and_si128(srcCols, mask);
+		ch2 = _mm_andnot_si128(mask, rgbBlendSIMD2Bpp(srcCols, destCols, alphas));
+		return _mm_or_si128(ch1, ch2);
+	case kTintBlenderMode:
+	case kTintLightBlenderMode:
+		__m128i srcColsLo = simd2BppTo4Bpp(_mm_and_si128(srcCols, _mm_set_epi32(0, 0, -1, -1)));
+		__m128i srcColsHi = simd2BppTo4Bpp(_mm_srli_si128(srcCols, 8));
+		__m128i destColsLo = simd2BppTo4Bpp(_mm_and_si128(destCols, _mm_set_epi32(0, 0, -1, -1)));
+		__m128i destColsHi = simd2BppTo4Bpp(_mm_srli_si128(destCols, 8));
+		__m128i alphasLo = _mm_unpacklo_epi16(_mm_and_si128(alphas, _mm_set_epi32(0, 0, -1, -1)), _mm_setzero_si128());
+		__m128i alphasHi = _mm_unpacklo_epi16(_mm_srli_si128(alphas, 8), _mm_setzero_si128());
+		__m128i lo = simd4BppTo2Bpp(blendTintSpriteSIMD(srcColsLo, destColsLo, alphasLo, _G(_blender_mode) == kTintLightBlenderMode));
+		__m128i hi = simd4BppTo2Bpp(blendTintSpriteSIMD(srcColsHi, destColsHi, alphasHi, _G(_blender_mode) == kTintLightBlenderMode));
+		return _mm_or_si128(lo, _mm_slli_si128(hi, 8));
+	}
+	return _mm_setzero_si128();
+}
+
+template<int DestBytesPerPixel, int SrcBytesPerPixel>
+inline void drawPixelSIMD(byte *destPtr, const byte *srcP2, __m128i tint, __m128i alphas, __m128i maskedAlphas, __m128i transColors, int xDir, int xCtrBpp, int srcAlpha, int skipTrans, bool horizFlip, bool useTint, __m128i skipMask) {
+	__m128i srcCols, destCol;
+
+	if (DestBytesPerPixel == 4)
+		destCol = _mm_loadu_si128((const __m128i *)destPtr);
+	else
+		destCol = simd2BppTo4Bpp(_mm_loadl_epi64((const __m128i *)destPtr));
+	if (SrcBytesPerPixel == 4)
+		srcCols = _mm_loadu_si128((const __m128i *)(srcP2 + xDir * xCtrBpp));
+	else
+		srcCols = simd2BppTo4Bpp(_mm_loadl_epi64((const __m128i *)(srcP2 + xDir * xCtrBpp)));
+
+	// we do this here because we need to check if we should skip the pixel before we blend it
+	__m128i mask1 = skipTrans ? _mm_cmpeq_epi32(_mm_and_si128(srcCols, maskedAlphas), transColors) : _mm_setzero_si128();
+	mask1 = _mm_or_si128(mask1, skipMask);
+	if (srcAlpha != -1) {
+		// take useTint into account
+		if (useTint) {
+			srcCols = blendPixelSIMD(tint, srcCols, alphas);
+		} else {
+			srcCols = blendPixelSIMD(srcCols, destCol, alphas);
+		}
+	}
+	__m128i destCols2 = _mm_and_si128(destCol, mask1);
+	__m128i srcCols2 = _mm_andnot_si128(mask1, srcCols);
+	__m128i final = _mm_or_si128(destCols2, srcCols2);
+	if (horizFlip) {
+		final = _mm_shuffle_epi32(final, _MM_SHUFFLE(0, 1, 2, 3));
+	}
+	if (DestBytesPerPixel == 4) {
+		_mm_storeu_si128((__m128i *)destPtr, final);
+	} else {
+		_mm_storel_epi64((__m128i *)destPtr, simd4BppTo2Bpp(final));
+	}
+}
+
+inline void drawPixelSIMD2Bpp(byte *destPtr, const byte *srcP2, __m128i tint, __m128i alphas, __m128i transColors, int xDir, int xCtrBpp, int srcAlpha, int skipTrans, bool horizFlip, bool useTint, __m128i skipMask) {
+	__m128i destCol = _mm_loadu_si128((const __m128i *)destPtr);
+	__m128i srcCols = _mm_loadu_si128((const __m128i *)(srcP2 + xDir * xCtrBpp));
+	__m128i mask1 = skipTrans ? _mm_cmpeq_epi16(srcCols, transColors) : _mm_setzero_si128();
+	mask1 = _mm_or_si128(mask1, skipMask);
+	if (srcAlpha != -1) {
+		// take useTint into account
+		if (useTint) {
+			srcCols = blendPixelSIMD2Bpp(tint, srcCols, alphas);
+		} else {
+			srcCols = blendPixelSIMD2Bpp(srcCols, destCol, alphas);
+		}
+	}
+	__m128i destCols2 = _mm_and_si128(destCol, mask1);
+	__m128i srcCols2 = _mm_andnot_si128(mask1, srcCols);
+	__m128i final = _mm_or_si128(destCols2, srcCols2);
+	if (horizFlip) {
+		final = _mm_shufflelo_epi16(final, _MM_SHUFFLE(0, 1, 2, 3));
+		final = _mm_shufflehi_epi16(final, _MM_SHUFFLE(0, 1, 2, 3));
+		final = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(final), _mm_castsi128_pd(final), _MM_SHUFFLE2(0, 1)));
+	}
+	_mm_storeu_si128((__m128i *)destPtr, final);
+}
+
+inline uint32 extract32_idx0(__m128i x) {
+	return _mm_cvtsi128_si32(x);
+}
+inline uint32 extract32_idx1(__m128i x) {
+	return _mm_cvtsi128_si32(_mm_shuffle_epi32(x, _MM_SHUFFLE(1, 1, 1, 1)));
+}
+inline uint32 extract32_idx2(__m128i x) {
+	return _mm_cvtsi128_si32(_mm_shuffle_epi32(x, _MM_SHUFFLE(2, 2, 2, 2)));
+}
+inline uint32 extract32_idx3(__m128i x) {
+	return _mm_cvtsi128_si32(_mm_shuffle_epi32(x, _MM_SHUFFLE(3, 3, 3, 3)));
+}
+
+class DrawInnerImpl {
+public:
+
+// This template handles 4bpp and 2bpp<->4bpp conversion blits; the other functions handle same-format 1bpp and 2bpp
+template<int DestBytesPerPixel, int SrcBytesPerPixel, bool Scale>
+static void drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
+	const int xDir = args.horizFlip ? -1 : 1;
+	byte rSrc, gSrc, bSrc, aSrc;
+	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
+	__m128i tint = _mm_slli_epi32(_mm_set1_epi32(args.srcAlpha), 24);
+	tint = _mm_or_si128(tint, _mm_slli_epi32(_mm_set1_epi32(args.tintRed), 16));
+	tint = _mm_or_si128(tint, _mm_slli_epi32(_mm_set1_epi32(args.tintGreen), 8));
+	tint = _mm_or_si128(tint, _mm_set1_epi32(args.tintBlue));
+	__m128i maskedAlphas = _mm_set1_epi32(args.alphaMask);
+	__m128i transColors = _mm_set1_epi32(args.transColor);
+	__m128i alphas = _mm_set1_epi32(args.srcAlpha);
+
+	// This is so that we can calculate what pixels to crop off in a vectorized way
+	__m128i addIndexes = _mm_set_epi32(3, 2, 1, 0);
+	if (args.horizFlip) addIndexes = _mm_set_epi32(0, 1, 2, 3);
+
+	// This is so that we can calculate in parallel the pixel indexes for scaled drawing
+	__m128i scaleAdds = _mm_set_epi32((uint32)args.scaleX*3, (uint32)args.scaleX*2, (uint32)args.scaleX, 0);
+
+	// Clip the bounds ahead of time (so we don't waste time checking if we are in bounds when
+	// we are in the inner loop)
+	int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = args.dstRect.width();
+	if (args.xStart + xCtrWidth > args.destArea.w) {
+		xCtrWidth = args.destArea.w - args.xStart;
+	}
+	if (args.xStart < 0) {
+		xCtrStart = -args.xStart;
+		xCtrBppStart = xCtrStart * SrcBytesPerPixel;
+		args.xStart = 0;
+	}
+	int destY = args.yStart, srcYCtr = 0, yCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 4 == 0) ? args.dstRect.height() : (args.dstRect.height() - 1);
+	if (Scale) yCtrHeight = args.dstRect.height();
+	if (args.yStart < 0) {
+		yCtr = -args.yStart;
+		destY = 0;
+		if (Scale) {
+			scaleYCtr = yCtr * args.scaleY;
+			srcYCtr = scaleYCtr / BITMAP::SCALE_THRESHOLD;
+		}
+	}
+	if (args.yStart + yCtrHeight > args.destArea.h) {
+		yCtrHeight = args.destArea.h - args.yStart;
+	}
+	
+	byte *destP = (byte *)args.destArea.getBasePtr(0, destY);
+	const byte *srcP = (const byte *)args.src.getBasePtr(
+	                       args.horizFlip ? args.srcArea.right - 4 : args.srcArea.left,
+	                       args.vertFlip ? args.srcArea.bottom - 1 - yCtr : args.srcArea.top + yCtr);
+	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
+		__m128i xCtrWidthSIMD = _mm_set1_epi32(xCtrWidth); // This is the width of the row
+
+		if (!Scale) {
+			// If we are not scaling the image
+			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
+				byte *destPtr = &destP[destX * DestBytesPerPixel];
+				// Skip pixels that are beyond the row
+				__m128i skipMask = _mm_cmpgt_epi32(_mm_add_epi32(_mm_add_epi32(_mm_set1_epi32(xCtr), addIndexes), _mm_set1_epi32(1)), xCtrWidthSIMD);
+				drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, skipMask);
+			}
+			// Goto next row in source and destination image
+			destP += args.destArea.pitch;
+			srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
+		} else {
+			// Here we are scaling the image
+			int newSrcYCtr = scaleYCtr / BITMAP::SCALE_THRESHOLD;
+			// Since the source yctr might not update every row of the destination, we have
+			// to see if we are on a new row...
+			if (srcYCtr != newSrcYCtr) {
+				int diffSrcYCtr = newSrcYCtr - srcYCtr; // Have we moved yet
+				srcP += args.src.pitch * diffSrcYCtr;
+				srcYCtr = newSrcYCtr;
+			}
+
+			// Also, since we might skip a pixel or two or duplicate one to reach the desired
+			// scaling size, we create a small dummy buffer that we copy the pixels into and then
+			// call the drawPixelSIMD function on it
+			byte srcBuffer[4*4] = {0};
+			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart, scaleXCtr = xCtrStart * args.scaleX; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
+				if (yCtr + 1 == yCtrHeight && xCtr + 4 > xCtrWidth) break; // Don't go past the last 4 pixels
+				__m128i indexes = _mm_set1_epi32(scaleXCtr);
+				// Calculate in parallel the indexes of the pixels
+				if (SrcBytesPerPixel == 4)
+					indexes = _mm_slli_epi32(_mm_srli_epi32(_mm_add_epi32(indexes, scaleAdds), BITMAP::SCALE_THRESHOLD_BITS), 2);
+				else
+					indexes = _mm_slli_epi32(_mm_srli_epi32(_mm_add_epi32(indexes, scaleAdds), BITMAP::SCALE_THRESHOLD_BITS), 1);
+				// Simply memcpy them in. memcpy has no real performance overhead here
+				memcpy(&srcBuffer[0*(size_t)SrcBytesPerPixel], srcP + extract32_idx0(indexes), SrcBytesPerPixel);
+				memcpy(&srcBuffer[1*(size_t)SrcBytesPerPixel], srcP + extract32_idx1(indexes), SrcBytesPerPixel);
+				memcpy(&srcBuffer[2*(size_t)SrcBytesPerPixel], srcP + extract32_idx2(indexes), SrcBytesPerPixel);
+				memcpy(&srcBuffer[3*(size_t)SrcBytesPerPixel], srcP + extract32_idx3(indexes), SrcBytesPerPixel);
+				scaleXCtr += args.scaleX*4;
+
+				// Now this is pretty much the same as the non-scaled code before, except that we use
+				// our dummy source buffer instead of the actual source bitmap
+				byte *destPtr = &destP[destX * (intptr_t)DestBytesPerPixel];
+				__m128i skipMask = _mm_cmpgt_epi32(_mm_add_epi32(_mm_add_epi32(_mm_set1_epi32(xCtr), addIndexes), _mm_set1_epi32(1)), xCtrWidthSIMD);
+				drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, (const byte *)srcBuffer, tint, alphas, maskedAlphas, transColors, 1, 0, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, skipMask);
+			}
+			// We handle every row here except the last (because on the last row we need to
+			// check whether we fall off the edge of the row).
+			// The only exception here is scaled drawing; this is because:
+			// 1) if statements are costly, and the fewer we do the faster this loop is
+			// 2) with this, the only branch in the normal drawing loop is the width check
+			// 3) the scaling code will actually draw until the last 4 pixels of the image
+			//    and do the extra if checks because the scaling code is already much slower
+			//    than the normal drawing loop, and having less duplicated code helps here.
+			if (yCtr + 1 != yCtrHeight) destP += args.destArea.pitch;
+		}
+	}
+
+	// Get the last x values of the last row
+	int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart;
+	// We have a picture that is a multiple of 4, so no extra pixels to draw
+	if (xCtrWidth % 4 == 0) return;
+	// Draw the last few unscaled pixels here.
+	// Same as the loop above, but now we check that we are not going to overflow,
+	// and thus we don't need to mask out pixels that go past the end of the row.
+	if (!Scale) {
+		for (; xCtr + 4 < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
+			byte *destPtr = &destP[(ptrdiff_t)destX * DestBytesPerPixel];
+			drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, _mm_setzero_si128());
+		}
+		// Because we move in 4 pixel units, and horizFlip moves in 1, we have to move
+		// 1 pixel past the last pixel we did not blit, meaning going forward 3 pixels.
+		if (args.horizFlip) srcP += SrcBytesPerPixel * 3;
+	} else {
+		// So if we are scaling, set up the xCtr to what it was before (AKA the last 4 or so pixels of the image)
+		xCtr = xCtrWidth - xCtrWidth % 4;
+		xCtrBpp = xCtr * SrcBytesPerPixel;
+		destX = args.xStart+xCtr;
+	}
+
+	// For the last 4 pixels, we just do them in serial, nothing special
+	for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += SrcBytesPerPixel) {
+		const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
+		if (Scale) {
+			srcColPtr = (const byte *)(srcP + (xCtr * args.scaleX) / BITMAP::SCALE_THRESHOLD * SrcBytesPerPixel);
+		}
+		byte *destVal = (byte *)&destP[destX * DestBytesPerPixel];
+		uint32 srcCol = args.dstBitmap.getColor(srcColPtr, SrcBytesPerPixel);
+		
+		// Check if this is a transparent color we should skip
+		if (args.skipTrans && ((srcCol & args.alphaMask) == args.transColor))
+			continue;
+
+		args.src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
+		if (args.srcAlpha != -1) {
+			if (args.useTint) {
+				rDest = rSrc;
+				gDest = gSrc;
+				bDest = bSrc;
+				aDest = aSrc;
+				rSrc = args.tintRed;
+				gSrc = args.tintGreen;
+				bSrc = args.tintBlue;
+				aSrc = args.srcAlpha;
+			}
+			args.dstBitmap.blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, args.srcAlpha, args.useTint, destVal);
+			srcCol = args.dstBitmap.format.ARGBToColor(aDest, rDest, gDest, bDest);
+		} else {
+			srcCol = args.dstBitmap.format.ARGBToColor(aSrc, rSrc, gSrc, bSrc);
+		}
+		if (DestBytesPerPixel == 4)
+			*(uint32 *)destVal = srcCol;
+		else
+			*(uint16 *)destVal = srcCol;
+	}
+}
+
+template<bool Scale>
+void drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
+	const int xDir = args.horizFlip ? -1 : 1;
+	byte rSrc, gSrc, bSrc, aSrc;
+	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
+	__m128i tint = _mm_set1_epi16(args.src.format.ARGBToColor(args.srcAlpha, args.tintRed, args.tintGreen, args.tintBlue));
+	__m128i transColors = _mm_set1_epi16(args.transColor);
+	__m128i alphas = _mm_set1_epi16(args.srcAlpha);
+
+	// This is so that we can calculate what pixels to crop off in a vectorized way
+	__m128i addIndexes = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
+
+	// This is so that we can calculate in parallel the pixel indexes for scaled drawing
+	if (args.horizFlip) addIndexes = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+	__m128i scaleAdds = _mm_set_epi32((uint32)args.scaleX*3, (uint32)args.scaleX*2, (uint32)args.scaleX, 0);
+	__m128i scaleAdds2 = _mm_set_epi32((uint32)args.scaleX*7, (uint32)args.scaleX*6, (uint32)args.scaleX*5, (uint32)args.scaleX*4);
+
+	// Clip the bounds ahead of time (so we don't waste time checking if we are in bounds when
+	// we are in the inner loop)
+	int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = args.dstRect.width();
+	if (args.xStart + xCtrWidth > args.destArea.w) {
+		xCtrWidth = args.destArea.w - args.xStart;
+	}
+	if (args.xStart < 0) {
+		xCtrStart = -args.xStart;
+		xCtrBppStart = xCtrStart * 2;
+		args.xStart = 0;
+	}
+	int destY = args.yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 8 == 0) ? args.dstRect.height() : (args.dstRect.height() - 1);
+	if (Scale) yCtrHeight = args.dstRect.height();
+	if (args.yStart < 0) {
+		yCtr = -args.yStart;
+		destY = 0;
+		if (Scale) {
+			scaleYCtr = yCtr * args.scaleY;
+			srcYCtr = scaleYCtr / BITMAP::SCALE_THRESHOLD;
+		}
+	}
+	if (args.yStart + yCtrHeight > args.destArea.h) {
+		yCtrHeight = args.destArea.h - args.yStart;
+	}
+	
+	byte *destP = (byte *)args.destArea.getBasePtr(0, destY);
+	const byte *srcP = (const byte *)args.src.getBasePtr(
+	                       args.horizFlip ? args.srcArea.right - 8 : args.srcArea.left,
+	                       args.vertFlip ? args.srcArea.bottom - 1 - yCtr : args.srcArea.top + yCtr);
+	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
+		__m128i xCtrWidthSIMD = _mm_set1_epi16(xCtrWidth); // This is the width of the row
+		if (!Scale) {
+			// If we are not scaling the image
+			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
+				byte *destPtr = &destP[destX * 2];
+				// Skip pixels that are beyond the row
+				__m128i skipMask = _mm_cmpgt_epi16(_mm_add_epi16(_mm_add_epi16(_mm_set1_epi16(xCtr), addIndexes), _mm_set1_epi16(1)), xCtrWidthSIMD);
+				drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, skipMask);
+			}
+			// Goto next row in source and destination image
+			destP += args.destArea.pitch;
+			srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
+		} else {
+			// Here we are scaling the image
+			int newSrcYCtr = scaleYCtr / BITMAP::SCALE_THRESHOLD;
+			// Since the source yctr might not update every row of the destination, we have
+			// to see if we are on a new row...
+			if (srcYCtr != newSrcYCtr) {
+				int diffSrcYCtr = newSrcYCtr - srcYCtr;
+				srcP += args.src.pitch * diffSrcYCtr;
+				srcYCtr = newSrcYCtr;
+			}
+
+			// Also, since we might skip a pixel or two or duplicate one to reach the desired
+			// scaling size, we create a small dummy buffer that we copy the pixels into and then
+			// call the drawPixelSIMD2Bpp function on it
+			uint16 srcBuffer[8];
+			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart, scaleXCtr = xCtrStart * args.scaleX; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
+				if (yCtr + 1 == yCtrHeight && xCtr + 8 > xCtrWidth) break;
+				__m128i indexes = _mm_set1_epi32(scaleXCtr), indexes2 = _mm_set1_epi32(scaleXCtr);
+				// Calculate in parallel the indexes of the pixels
+				indexes = _mm_slli_epi32(_mm_srli_epi32(_mm_add_epi32(indexes, scaleAdds), BITMAP::SCALE_THRESHOLD_BITS), 1);
+				indexes2 = _mm_slli_epi32(_mm_srli_epi32(_mm_add_epi32(indexes2, scaleAdds2), BITMAP::SCALE_THRESHOLD_BITS), 1);
+				// Simply load them in; these scalar loads have no real performance overhead here
+				srcBuffer[0] = *(const uint16 *)(srcP + extract32_idx0(indexes));
+				srcBuffer[1] = *(const uint16 *)(srcP + extract32_idx1(indexes));
+				srcBuffer[2] = *(const uint16 *)(srcP + extract32_idx2(indexes));
+				srcBuffer[3] = *(const uint16 *)(srcP + extract32_idx3(indexes));
+				srcBuffer[4] = *(const uint16 *)(srcP + extract32_idx0(indexes2));
+				srcBuffer[5] = *(const uint16 *)(srcP + extract32_idx1(indexes2));
+				srcBuffer[6] = *(const uint16 *)(srcP + extract32_idx2(indexes2));
+				srcBuffer[7] = *(const uint16 *)(srcP + extract32_idx3(indexes2));
+				scaleXCtr += args.scaleX*8;
+
+				// Now this is pretty much the same as the non-scaled code before, except that we use
+				// our dummy source buffer instead of the actual source bitmap
+				byte *destPtr = &destP[destX * 2];
+				__m128i skipMask = _mm_cmpgt_epi16(_mm_add_epi16(_mm_add_epi16(_mm_set1_epi16(xCtr), addIndexes), _mm_set1_epi16(1)), xCtrWidthSIMD);
+				drawPixelSIMD2Bpp(destPtr, (const byte *)srcBuffer, tint, alphas, transColors, 1, 0, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, skipMask);
+			}
+			// We handle every row here except the last (because on the last row we need to
+			// check whether we fall off the edge of the row).
+			// The only exception here is scaled drawing; this is because:
+			// 1) if statements are costly, and the fewer we do the faster this loop is
+			// 2) with this, the only branch in the normal drawing loop is the width check
+			// 3) the scaling code will actually draw until the last 8 pixels of the image
+			//    and do the extra if checks because the scaling code is already much slower
+			//    than the normal drawing loop, and having less duplicated code helps here.
+			if (yCtr + 1 != yCtrHeight) destP += args.destArea.pitch;
+		}
+	}
+
+	// We have a picture that is a multiple of 8, so no extra pixels to draw
+	if (xCtrWidth % 8 == 0) return;
+	// Get the last x values of the last row
+	int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart;
+	// Draw the last few unscaled pixels here.
+	// Same as the loop above, but now we check that we are not going to overflow,
+	// and thus we don't need to mask out pixels that go past the end of the row.
+	if (!Scale) {
+		for (; xCtr + 8 < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
+			byte *destPtr = &destP[destX * 2];
+			drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, _mm_setzero_si128());
+		}
+		// Because we move in 8 pixel units, and horizFlip moves in 1, we have to move
+		// 1 pixel past the last pixel we did not blit, meaning going forward 7 pixels.
+		if (args.horizFlip) srcP += 2 * 7;
+	} else {
+		// So if we are scaling, set up the xCtr to what it was before (AKA the last 8 or so pixels of the image)
+		xCtr = xCtrWidth - xCtrWidth % 8;
+		xCtrBpp = xCtr * 2;
+		destX = args.xStart+xCtr;
+	}
+
+	// For the last few pixels, we just do them in serial, nothing special
+	for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += 2) {
+		const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
+		if (Scale) {
+			srcColPtr = (const byte *)(srcP + (xCtr * args.scaleX) / BITMAP::SCALE_THRESHOLD * 2);
+		}
+		byte *destVal = (byte *)&destP[destX * 2];
+		uint32 srcCol = (uint32)(*(const uint16 *)srcColPtr);
+		
+		// Check if this is a transparent color we should skip
+		if (args.skipTrans && srcCol == args.transColor)
+			continue;
+
+		args.src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
+		if (args.srcAlpha != -1) {
+			if (args.useTint) {
+				rDest = rSrc;
+				gDest = gSrc;
+				bDest = bSrc;
+				aDest = aSrc;
+				rSrc = args.tintRed;
+				gSrc = args.tintGreen;
+				bSrc = args.tintBlue;
+				aSrc = args.srcAlpha;
+			}/* else {
+				format.colorToARGB((uint32)(*(uint16 *)destVal), aDest, rDest, gDest, bDest);
+			}*/
+			args.dstBitmap.blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, args.srcAlpha, args.useTint, destVal);
+			srcCol = args.dstBitmap.format.ARGBToColor(aDest, rDest, gDest, bDest);
+		} else {
+			srcCol = args.dstBitmap.format.ARGBToColor(aSrc, rSrc, gSrc, bSrc);
+		}
+		*(uint16 *)destVal = srcCol;
+	}
+}
+
+template<bool Scale>
+void drawInner1Bpp(BITMAP::DrawInnerArgs &args) {
+	const int xDir = args.horizFlip ? -1 : 1;
+	__m128i transColors = _mm_set1_epi16(args.transColor | (args.transColor << 8));
+
+	// This is so that we can calculate in parallel the pixel indexes for scaled drawing
+	__m128i scaleAdds1 = _mm_set_epi32((uint32)args.scaleX*3, (uint32)args.scaleX*2, (uint32)args.scaleX, 0);
+	__m128i scaleAdds2 = _mm_set_epi32((uint32)args.scaleX*7, (uint32)args.scaleX*6, (uint32)args.scaleX*5, (uint32)args.scaleX*4);
+	__m128i scaleAdds3 = _mm_set_epi32((uint32)args.scaleX*11, (uint32)args.scaleX*10, (uint32)args.scaleX*9, (uint32)args.scaleX*8);
+	__m128i scaleAdds4 = _mm_set_epi32((uint32)args.scaleX*15, (uint32)args.scaleX*14, (uint32)args.scaleX*13, (uint32)args.scaleX*12);
+	
+	// Clip the bounds ahead of time (so we don't waste time checking if we are in bounds when
+	// we are in the inner loop)
+	int xCtrStart = 0, xCtrWidth = args.dstRect.width();
+	if (args.xStart + xCtrWidth > args.destArea.w) {
+		xCtrWidth = args.destArea.w - args.xStart;
+	}
+	if (args.xStart < 0) {
+		xCtrStart = -args.xStart;
+		args.xStart = 0;
+	}
+	int destY = args.yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = args.dstRect.height();
+	if (Scale) yCtrHeight = args.dstRect.height();
+	if (args.yStart < 0) {
+		yCtr = -args.yStart;
+		destY = 0;
+		if (Scale) {
+			scaleYCtr = yCtr * args.scaleY;
+			srcYCtr = scaleYCtr / BITMAP::SCALE_THRESHOLD;
+		}
+	}
+	if (args.yStart + yCtrHeight > args.destArea.h) {
+		yCtrHeight = args.destArea.h - args.yStart;
+	}
+	
+	byte *destP = (byte *)args.destArea.getBasePtr(0, destY);
+	const byte *srcP = (const byte *)args.src.getBasePtr(
+	                       args.horizFlip ? args.srcArea.right - 16 : args.srcArea.left,
+	                       args.vertFlip ? args.srcArea.bottom - 1 - yCtr : args.srcArea.top + yCtr);
+	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
+		if (Scale) {
+			// So here we update the srcYCtr differently due to this being for
+			// scaling
+			int newSrcYCtr = scaleYCtr / BITMAP::SCALE_THRESHOLD;
+			if (srcYCtr != newSrcYCtr) {
+				// Since the source yctr might not update every row of the destination, we have
+				// to see if we are on a new row...
+				int diffSrcYCtr = newSrcYCtr - srcYCtr;
+				srcP += args.src.pitch * diffSrcYCtr;
+				srcYCtr = newSrcYCtr;
+			}
+		}
+		int xCtr = xCtrStart, destX = args.xStart, scaleXCtr = xCtrStart * args.scaleX;
+		for (; xCtr + 16 < xCtrWidth; destX += 16, xCtr += 16) {
+			byte *destPtr = &destP[destX];
+
+			// Here we don't use the drawPixelSIMD function because 1bpp bitmaps in allegro
+			// can't have any blending applied to them
+			__m128i destCols = _mm_loadu_si128((const __m128i *)destPtr);
+			__m128i srcCols = _mm_loadu_si128((const __m128i *)(srcP + xDir * xCtr));
+			if (Scale) {
+				// If we are scaling, we have to set each pixel individually
+				__m128i indexes1 = _mm_set1_epi32(scaleXCtr), indexes2 = _mm_set1_epi32(scaleXCtr);
+				__m128i indexes3 = _mm_set1_epi32(scaleXCtr), indexes4 = _mm_set1_epi32(scaleXCtr);
+				// Calculate in parallel the indexes of the pixels
+				indexes1 = _mm_srli_epi32(_mm_add_epi32(indexes1, scaleAdds1), BITMAP::SCALE_THRESHOLD_BITS);
+				indexes2 = _mm_srli_epi32(_mm_add_epi32(indexes2, scaleAdds2), BITMAP::SCALE_THRESHOLD_BITS);
+				indexes3 = _mm_srli_epi32(_mm_add_epi32(indexes3, scaleAdds3), BITMAP::SCALE_THRESHOLD_BITS);
+				indexes4 = _mm_srli_epi32(_mm_add_epi32(indexes4, scaleAdds4), BITMAP::SCALE_THRESHOLD_BITS);
+				srcCols = _mm_set_epi8(
+					srcP[extract32_idx3(indexes4)],
+					srcP[extract32_idx2(indexes4)],
+					srcP[extract32_idx1(indexes4)],
+					srcP[extract32_idx0(indexes4)],
+					srcP[extract32_idx3(indexes3)],
+					srcP[extract32_idx2(indexes3)],
+					srcP[extract32_idx1(indexes3)],
+					srcP[extract32_idx0(indexes3)],
+					srcP[extract32_idx3(indexes2)],
+					srcP[extract32_idx2(indexes2)],
+					srcP[extract32_idx1(indexes2)],
+					srcP[extract32_idx0(indexes2)],
+					srcP[extract32_idx3(indexes1)],
+					srcP[extract32_idx2(indexes1)],
+					srcP[extract32_idx1(indexes1)],
+					srcP[extract32_idx0(indexes1)]);
+				scaleXCtr += args.scaleX*16;
+			}
+
+			// Mask out transparent pixels
+			__m128i mask1 = args.skipTrans ? _mm_cmpeq_epi8(srcCols, transColors) : _mm_setzero_si128();
+			__m128i final = _mm_or_si128(_mm_andnot_si128(mask1, srcCols), _mm_and_si128(destCols, mask1));
+			if (args.horizFlip) {
+				__m128i final_swap16 = _mm_srli_epi16(final, 8);
+				final_swap16 = _mm_or_si128(final_swap16, _mm_slli_epi16(_mm_and_si128(final, _mm_set1_epi16(0xff)), 8));
+				final_swap16 = _mm_shufflelo_epi16(final_swap16, _MM_SHUFFLE(0, 1, 2, 3));
+				final_swap16 = _mm_shufflehi_epi16(final_swap16, _MM_SHUFFLE(0, 1, 2, 3));
+				final = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(final_swap16), _mm_castsi128_pd(final_swap16), _MM_SHUFFLE2(0, 1)));
+			}
+			_mm_storeu_si128((__m128i *)destPtr, final);
+		}
+		// Get the last x values
+
+		// Because we move in 16 pixel units, and horizFlip moves in 1, we have to move
+		// 1 pixel past the last pixel we did not blit, meaning going forward 15 pixels.
+		if (args.horizFlip) srcP += 15;
+		for (; xCtr < xCtrWidth; ++destX, ++xCtr, scaleXCtr += args.scaleX) {
+			const byte *srcCol = (const byte *)(srcP + xDir * xCtr);
+			if (Scale) {
+				srcCol = (const byte *)(srcP + scaleXCtr / BITMAP::SCALE_THRESHOLD);
+			}
+			// Check if this is a transparent color we should skip
+			if (args.skipTrans && *srcCol == args.transColor)
+				continue;
+
+			byte *destVal = (byte *)&destP[destX];
+			*destVal = *srcCol;
+		}
+		if (args.horizFlip) srcP -= 15; // Undo what we did up there
+		destP += args.destArea.pitch; // Goto next row
+		// Only advance the src row by 1 every time like this if we don't scale
+		if (!Scale) srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
+	}
+}
+
+}; // end of class DrawInnerImpl
+
+template<bool Scale>
+void BITMAP::drawSSE2(DrawInnerArgs &args) {
+	if (args.sameFormat) {
+		switch (format.bytesPerPixel) {
+		case 1: DrawInnerImpl::drawInner1Bpp<Scale>(args); break;
+		case 2: DrawInnerImpl::drawInner2Bpp<Scale>(args); break;
+		case 4: DrawInnerImpl::drawInner4BppWithConv<4, 4, Scale>(args); break;
+		}
+	} else if (format.bytesPerPixel == 4 && args.src.format.bytesPerPixel == 2) { 
+		DrawInnerImpl::drawInner4BppWithConv<4, 2, Scale>(args);
+	} else if (format.bytesPerPixel == 2 && args.src.format.bytesPerPixel == 4) {
+		DrawInnerImpl::drawInner4BppWithConv<2, 4, Scale>(args);
+	}
+}
+
+template void BITMAP::drawSSE2<false>(DrawInnerArgs &);
+template void BITMAP::drawSSE2<true>(DrawInnerArgs &);
+
+} // namespace AGS3
+
+#endif
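
One SSE2-specific detail in the file above: SSE2 has no packed 32-bit low multiply (_mm_mullo_epi32 only arrives with SSE4.1), so mul32_as32 assembles the low 32 bits of each product from two widening _mm_mul_epu32 multiplies over the even and odd lanes. A stand-alone sketch of the same trick (the test harness is illustrative, not engine code):

    #include <emmintrin.h>
    #include <stdio.h>

    static __m128i mullo32_sse2(__m128i a, __m128i b) {
        __m128i even = _mm_mul_epu32(a, b);                                       // products of lanes 0 and 2
        __m128i odd  = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)); // products of lanes 1 and 3
        // keep only the low 32 bits of each 64-bit product and interleave them back into lane order
        return _mm_unpacklo_epi32(_mm_shuffle_epi32(even, _MM_SHUFFLE(0, 0, 2, 0)),
                                  _mm_shuffle_epi32(odd,  _MM_SHUFFLE(0, 0, 2, 0)));
    }

    int main() {
        __m128i a = _mm_set_epi32(400, 300, 200, 100);
        __m128i b = _mm_set_epi32(7, 6, 5, 4);
        int out[4];
        _mm_storeu_si128((__m128i *)out, mullo32_sse2(a, b));
        printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); // prints: 400 1000 1800 2800
        return 0;
    }

mul32_as16 in the file takes the cheaper route of packing to 16 bits and using _mm_mullo_epi16, which is sufficient because the blender only multiplies 8-bit channel values by alphas of at most 256.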
diff --git a/engines/ags/module.mk b/engines/ags/module.mk
index 35fddd6ef2d..33a7cd2d27a 100644
--- a/engines/ags/module.mk
+++ b/engines/ags/module.mk
@@ -24,9 +24,7 @@ MODULE_OBJS = \
 	lib/allegro/math.o \
 	lib/allegro/rotate.o \
 	lib/allegro/surface.o \
-	lib/allegro/surface_simd_neon.o \
-	lib/allegro/surface_simd_sse.o \
-	lib/allegro/surface_simd_none.o \
+	lib/allegro/surface_generic.o \
 	lib/allegro/system.o \
 	lib/allegro/unicode.o \
 	lib/std/std.o \
@@ -379,6 +377,17 @@ MODULE_OBJS += \
 	tests/test_version.o
 endif
 
+ifeq ($(SCUMMVM_NEON),1)
+MODULE_OBJS += \
+	lib/allegro/surface_neon.o
+$(MODULE)/lib/allegro/surface_neon.o: CXXFLAGS += $(NEON_CXXFLAGS)
+endif
+ifeq ($(SCUMMVM_SSE2),1)
+MODULE_OBJS += \
+	lib/allegro/surface_sse2.o
+$(MODULE)/lib/allegro/surface_sse2.o: CXXFLAGS += -msse2
+endif
+
 # This module can be built as a plugin
 ifeq ($(ENABLE_AGS), DYNAMIC_PLUGIN)
 PLUGIN := 1
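
Back in the new surface_sse2.cpp above, the 2bpp paths go through simd2BppTo4Bpp/simd4BppTo2Bpp, which widen RGB565 pixels to 32-bit ARGB and back so a single blending path can serve both depths. A scalar sketch of the per-pixel conversion they vectorize (helper names are illustrative, not engine code):

    static inline unsigned rgb565To8888(unsigned short p) {
        unsigned r = (p >> 11) & 0x1f, g = (p >> 5) & 0x3f, b = p & 0x1f;
        r = (r << 3) | (r >> 2);
        g = (g << 2) | (g >> 4);
        b = (b << 3) | (b >> 2);
        return 0xff000000u | (r << 16) | (g << 8) | b; // alpha is forced to 255
    }

    static inline unsigned short rgb8888To565(unsigned p) {
        unsigned r = (p >> 16) & 0xff, g = (p >> 8) & 0xff, b = p & 0xff;
        return (unsigned short)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
    }

Replicating the top bits of each 5/6-bit field into its low bits makes 0x1f and 0x3f expand to exactly 0xff, so a 565 -> 8888 -> 565 round trip is lossless.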


Commit: e416492a069d4cf61fa7e059ac1ec6a628ddb4d4
    https://github.com/scummvm/scummvm/commit/e416492a069d4cf61fa7e059ac1ec6a628ddb4d4
Author: wyatt-radkiewicz (wyattwradkiewicz at gmail.com)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: Now engine can detect AVX2

Changed paths:
    engines/ags/globals.cpp
    engines/ags/globals.h


diff --git a/engines/ags/globals.cpp b/engines/ags/globals.cpp
index 9d89bed9aae..5e79f10426a 100644
--- a/engines/ags/globals.cpp
+++ b/engines/ags/globals.cpp
@@ -109,6 +109,7 @@ Globals::Globals() {
 	// Allegro globals
 	_simd_flags |= g_system->hasFeature(OSystem::kFeatureCpuNEON) ? SIMD_NEON : SIMD_NONE;
 	_simd_flags |= g_system->hasFeature(OSystem::kFeatureCpuSSE2) ? SIMD_SSE2 : SIMD_NONE;
+	_simd_flags |= g_system->hasFeature(OSystem::kFeatureCpuAVX2) ? SIMD_AVX2 : SIMD_NONE;
 	Common::fill((byte *)&_black_palette, (byte *)&_black_palette + PAL_SIZE, 0);
 	Common::fill((byte *)&_current_palette, (byte *)&_current_palette + PAL_SIZE, 0);
 	Common::fill((byte *)&_prev_current_palette, (byte *)&_prev_current_palette + PAL_SIZE, 0);
diff --git a/engines/ags/globals.h b/engines/ags/globals.h
index cacb17fb8ea..f66020476eb 100644
--- a/engines/ags/globals.h
+++ b/engines/ags/globals.h
@@ -188,6 +188,7 @@ public:
 		SIMD_NONE = 0,
 		SIMD_NEON = (1 << 0),
 		SIMD_SSE2 = (1 << 1),
+		SIMD_AVX2 = (1 << 2),
 	};
 
 	/**
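
Since SIMD_NONE is 0, each detection line above simply ORs one capability bit into _simd_flags. Downstream draw code can then pick the widest available path by testing the bits in priority order; a sketch of that intended priority, not a copy of engine code:

    uint32 flags = _G(simd_flags);
    if (flags & Globals::SIMD_AVX2) {
        // 256-bit AVX2 blitters
    } else if (flags & Globals::SIMD_SSE2) {
        // 128-bit SSE2 blitters
    } else if (flags & Globals::SIMD_NEON) {
        // NEON blitters
    } else {
        // generic C++ fallback
    }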


Commit: b22e073e3534430d902f71c5f4c476817acd8dad
    https://github.com/scummvm/scummvm/commit/b22e073e3534430d902f71c5f4c476817acd8dad
Author: wyatt-radkiewicz (wyattwradkiewicz at gmail.com)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: Added AVX2 support for blending functions

Changed paths:
  A engines/ags/lib/allegro/surface_avx2.cpp
    engines/ags/lib/allegro/surface.cpp
    engines/ags/lib/allegro/surface.h
    engines/ags/lib/allegro/surface_generic.cpp
    engines/ags/lib/allegro/surface_sse2.cpp
    engines/ags/module.mk
    engines/ags/tests/test_gfx.cpp


diff --git a/engines/ags/lib/allegro/surface.cpp b/engines/ags/lib/allegro/surface.cpp
index a9f31272652..3e8fb3aedae 100644
--- a/engines/ags/lib/allegro/surface.cpp
+++ b/engines/ags/lib/allegro/surface.cpp
@@ -189,6 +189,12 @@ void BITMAP::draw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
 		return;
 	}
 #endif
+#ifdef SCUMMVM_AVX2
+	if (_G(simd_flags) & AGS3::Globals::SIMD_AVX2) {
+		drawAVX2<false>(args);
+		return;
+	}
+#endif
 #ifdef SCUMMVM_SSE2
 	if (_G(simd_flags) & AGS3::Globals::SIMD_SSE2) {
 		drawSSE2<false>(args);
@@ -202,7 +208,7 @@ void BITMAP::stretchDraw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
                          const Common::Rect &dstRect, bool skipTrans, int srcAlpha) {
 	assert(format.bytesPerPixel == 2 || format.bytesPerPixel == 4 ||
 	       (format.bytesPerPixel == 1 && srcBitmap->format.bytesPerPixel == 1));
-	auto args = DrawInnerArgs(this, srcBitmap, srcRect, dstRect, skipTrans, srcAlpha, false, false, 0, 0, 0, true);
+	auto args = DrawInnerArgs(this, srcBitmap, srcRect, dstRect, skipTrans, srcAlpha, false, false, -1, -1, -1, true);
 	if (!args.shouldDraw) return;
 	if (!args.sameFormat && args.src.format.bytesPerPixel == 1) {
 		if (format.bytesPerPixel == 4)
@@ -217,6 +223,12 @@ void BITMAP::stretchDraw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
 		return;
 	}
 #endif
+#ifdef SCUMMVM_AVX2
+	if (_G(simd_flags) & AGS3::Globals::SIMD_AVX2) {
+		drawAVX2<true>(args);
+		return;
+	}
+#endif
 #ifdef SCUMMVM_SSE2
 	if (_G(simd_flags) & AGS3::Globals::SIMD_SSE2) {
 		drawSSE2<true>(args);
diff --git a/engines/ags/lib/allegro/surface.h b/engines/ags/lib/allegro/surface.h
index 3cd6738c532..8bc9ab1cc6a 100644
--- a/engines/ags/lib/allegro/surface.h
+++ b/engines/ags/lib/allegro/surface.h
@@ -297,6 +297,10 @@ public:
 #ifdef SCUMMVM_SSE2
 	template<bool Scale>
 	void drawSSE2(DrawInnerArgs &args);
+#endif
+#ifdef SCUMMVM_AVX2
+	template<bool Scale>
+	void drawAVX2(DrawInnerArgs &args);
 #endif
 	template<int DestBytesPerPixel, int SrcBytesPerPixel, bool Scale>
 	void drawInnerGeneric(DrawInnerArgs &args);
@@ -313,9 +317,6 @@ public:
 			error("Unsupported format in BITMAP::getColor");
 		}
 	}
-
-	// This is for testing the blending modes int Test_Gfx
-	friend void Test_BlenderModes();
 };
 
 /**
diff --git a/engines/ags/lib/allegro/surface_avx2.cpp b/engines/ags/lib/allegro/surface_avx2.cpp
new file mode 100644
index 00000000000..1a19517a47a
--- /dev/null
+++ b/engines/ags/lib/allegro/surface_avx2.cpp
@@ -0,0 +1,948 @@
+#include <immintrin.h>
+#include "ags/lib/allegro/gfx.h"
+#include "ags/lib/allegro/color.h"
+#include "ags/lib/allegro/flood.h"
+#include "ags/ags.h"
+#include "ags/globals.h"
+#include "common/textconsole.h"
+#include "graphics/screen.h"
+
+namespace AGS3 {
+
+inline __m256i simd2BppTo4Bpp(__m256i pixels) {
+	__m256i x = _mm256_unpacklo_epi16(pixels, _mm256_setzero_si256());
+
+	// c is the extracted 5/6 bit color from the image
+	__m256i c = _mm256_srli_epi32(x, 11);
+
+	// We convert it back to 8 bits per channel by shifting it over, and then using the most
+	// significant bits of the original color as the least significant bits of the new one
+	__m256i r = _mm256_slli_epi32(_mm256_or_si256(_mm256_slli_epi32(c, 3), _mm256_srli_epi32(c, 2)), 16);
+	c = _mm256_srli_epi32(_mm256_and_si256(x, _mm256_set1_epi32(0x07e0)), 5);
+	__m256i g = _mm256_slli_epi32(_mm256_or_si256(_mm256_slli_epi32(c, 2), _mm256_srli_epi32(c, 4)), 8);
+	c = _mm256_and_si256(x, _mm256_set1_epi32(0x001f));
+	__m256i b = _mm256_or_si256(_mm256_slli_epi32(c, 3), _mm256_srli_epi32(c, 2));
+
+	// By default 2bpp to 4bpp makes the alpha channel 255
+	return _mm256_or_si256(_mm256_or_si256(_mm256_or_si256(r, g), b), _mm256_set1_epi32(0xff000000));
+}
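+// For reference, the per-lane math above corresponds roughly to this scalar helper
+// (a hypothetical sketch, not part of the build):
+//   static inline uint32 expand565(uint16 p) {
+//       uint32 r = (p >> 11) & 0x1f, g = (p >> 5) & 0x3f, b = p & 0x1f;
+//       r = (r << 3) | (r >> 2);   // replicate high bits so 0x1f maps to 0xff
+//       g = (g << 2) | (g >> 4);   // 6 -> 8 bits
+//       b = (b << 3) | (b >> 2);
+//       return 0xff000000 | (r << 16) | (g << 8) | b;
+//   }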
+
+inline __m256i simd4BppTo2Bpp(__m256i pixels) {
+	// x is the final 16 bit rgb pixel
+	__m256i x = _mm256_srli_epi32(_mm256_and_si256(pixels, _mm256_set1_epi32(0x000000ff)), 3);
+	x = _mm256_or_si256(x, _mm256_slli_epi32(_mm256_srli_epi32(_mm256_and_si256(pixels, _mm256_set1_epi32(0x0000ff00)), 8+2), 5));
+	x = _mm256_or_si256(x, _mm256_slli_epi32(_mm256_srli_epi32(_mm256_and_si256(pixels, _mm256_set1_epi32(0x00ff0000)), 16+3), 11));
+	x = _mm256_slli_epi32(x, 16);
+	x = _mm256_srai_epi32(x, 16);
+	return _mm256_packs_epi32(x, _mm256_setzero_si256());
+}
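+// (Inverse direction in scalar form, roughly:
+//    out16 = ((argb >> 3) & 0x1f) | (((argb >> 10) & 0x3f) << 5) | (((argb >> 19) & 0x1f) << 11); )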
+
+inline __m256i rgbBlendSIMD2Bpp(__m256i srcCols, __m256i destCols, __m256i alphas) {
+	// Here we add 1 to alphas if it's not 0. This is what the original blender function did
+	alphas = _mm256_add_epi16(alphas, _mm256_and_si256(_mm256_cmpgt_epi16(alphas, _mm256_setzero_si256()), _mm256_set1_epi16(1)));
+
+	// Split the components into rgb
+	__m256i srcComps[] = {
+		_mm256_and_si256(srcCols, _mm256_set1_epi16(0x1f)),		    		 // B
+		_mm256_and_si256(_mm256_srli_epi16(srcCols, 5), _mm256_set1_epi16(0x3f)), // G
+		_mm256_srli_epi16(srcCols, 11),									 // R
+	}, destComps[] = {
+		_mm256_and_si256(destCols, _mm256_set1_epi16(0x1f)),		    		  // B
+		_mm256_and_si256(_mm256_srli_epi16(destCols, 5), _mm256_set1_epi16(0x3f)), // G
+		_mm256_srli_epi16(destCols, 11),									  // R
+	};
+
+	// Calculate the differences between the colors
+	__m256i diffs[] = {
+		_mm256_sub_epi16(srcComps[0], destComps[0]), // B
+		_mm256_sub_epi16(srcComps[1], destComps[1]), // G
+		_mm256_sub_epi16(srcComps[2], destComps[2]), // R
+	};
+
+	// Multiply by alpha and shift depth bits to the right
+	// pretty much the same as (int)(((float)component / 255.0f) * ((float)alpha / 255.0f) * 255.0f)
+	alphas = _mm256_srli_epi16(alphas, 2);
+	diffs[1] = _mm256_srli_epi16(_mm256_mullo_epi16(diffs[1], alphas), 6);
+	alphas = _mm256_srli_epi16(alphas, 1);
+	diffs[0] = _mm256_srli_epi16(_mm256_mullo_epi16(diffs[0], alphas), 5);
+	diffs[2] = _mm256_srli_epi16(_mm256_mullo_epi16(diffs[2], alphas), 5);
+
+	// Here we add the difference between the 2 colors times alpha onto the destination
+	diffs[0] = _mm256_and_si256(_mm256_add_epi16(diffs[0], destComps[0]), _mm256_set1_epi16(0x1f));
+	diffs[1] = _mm256_and_si256(_mm256_add_epi16(diffs[1], destComps[1]), _mm256_set1_epi16(0x3f));
+	diffs[2] = _mm256_and_si256(_mm256_add_epi16(diffs[2], destComps[2]), _mm256_set1_epi16(0x1f));
+
+	// We compile all the colors into diffs[0] as a 16 bit rgb pixel
+	diffs[0] = _mm256_or_si256(diffs[0], _mm256_slli_epi16(diffs[1], 5));
+	return _mm256_or_si256(diffs[0], _mm256_slli_epi16(diffs[2], 11));
+}
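+// Scalar equivalent of the 16 bit blend above, per pixel (reference only, with hypothetical names):
+//   if (a != 0) a += 1;
+//   g = (dg + (((sg - dg) * (a >> 2)) >> 6)) & 0x3f;
+//   b = (db + (((sb - db) * (a >> 3)) >> 5)) & 0x1f;
+//   r = (dr + (((sr - dr) * (a >> 3)) >> 5)) & 0x1f;
+//   out = b | (g << 5) | (r << 11);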
+
+// preserveAlpha:
+//		false => set destCols's alpha to 0
+// 		true => keep destCols's alpha
+inline __m256i rgbBlendSIMD(__m256i srcCols, __m256i destCols, __m256i alphas, bool preserveAlpha) {
+	// Here we add 1 to alphas if it's not 0. This is what the original blender function did.
+	alphas = _mm256_add_epi32(alphas, _mm256_and_si256(_mm256_cmpgt_epi32(alphas, _mm256_setzero_si256()), _mm256_set1_epi32(1)));
+
+	// Get the alpha from the destination
+	__m256i alpha = _mm256_and_si256(destCols, _mm256_set1_epi32(0xff000000));
+
+	// Get red and blue components
+	__m256i srcColsCopy = srcCols;
+	srcColsCopy = _mm256_and_si256(srcColsCopy, _mm256_set1_epi32(0xff00ff));
+	__m256i destColsCopy = destCols;
+	destColsCopy = _mm256_and_si256(destColsCopy, _mm256_set1_epi32(0xff00ff));
+
+	// Compute the difference, then multiply by alpha and divide by 256
+	srcColsCopy = _mm256_sub_epi32(srcColsCopy, destColsCopy);
+	srcColsCopy = _mm256_mullo_epi32(srcColsCopy, alphas);
+	//srcColsCopy = _mm256_mul_epi32(srcColsCopy, alphas);
+	srcColsCopy = _mm256_srli_epi32(srcColsCopy, 8);
+	srcColsCopy = _mm256_add_epi32(srcColsCopy, destCols); // Add the new red/blue to the old red/blue
+
+	// Do the same for the green component
+	srcCols = _mm256_and_si256(srcCols, _mm256_set1_epi32(0xff00));
+	destCols = _mm256_and_si256(destCols, _mm256_set1_epi32(0xff00));
+	srcCols = _mm256_sub_epi32(srcCols, destCols);
+	srcCols = _mm256_mullo_epi32(srcCols, alphas);
+	//srcCols = _mm256_mul_epi32(srcCols, alphas);
+	srcCols = _mm256_srli_epi32(srcCols, 8);
+	srcCols = _mm256_add_epi32(srcCols, destCols); // Add the new green to the old green
+
+	// Keep values in 8bit range and glue red/blue and green together
+	srcColsCopy = _mm256_and_si256(srcColsCopy, _mm256_set1_epi32(0xff00ff));
+	srcCols = _mm256_and_si256(srcCols, _mm256_set1_epi32(0xff00));
+	srcCols = _mm256_or_si256(srcCols, srcColsCopy);
+
+	// Remember that alpha is not alphas, but rather the alpha of destcols
+	if (preserveAlpha) {
+		srcCols = _mm256_and_si256(srcCols, _mm256_set1_epi32(0x00ffffff));
+		srcCols = _mm256_or_si256(srcCols, alpha);
+	}
+	return srcCols;
+}
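+// Per-channel intent of the blend above, in scalar form (reference only): after bumping a
+// non-zero alpha by 1,
+//   out_c = dest_c + (((src_c - dest_c) * a) >> 8)
+// computed for red/blue in one 0x00ff00ff-masked pass and for green in a 0x0000ff00-masked pass,
+// optionally restoring the destination's alpha byte afterwards.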
+
+inline __m256i argbBlendSIMD(__m256i srcCols, __m256i destCols) {
+	__m256 srcA = _mm256_cvtepi32_ps(_mm256_srli_epi32(srcCols, 24));
+	srcA = _mm256_mul_ps(srcA, _mm256_set1_ps(1.0f / 255.0f));
+	__m256 srcR = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(srcCols, 16), _mm256_set1_epi32(0xff)));
+	__m256 srcG = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(srcCols, 8), _mm256_set1_epi32(0xff)));
+	__m256 srcB = _mm256_cvtepi32_ps(_mm256_and_si256(srcCols, _mm256_set1_epi32(0xff)));
+
+	__m256 destA = _mm256_cvtepi32_ps(_mm256_srli_epi32(destCols, 24));
+	destA = _mm256_mul_ps(destA, _mm256_set1_ps(1.0f / 255.0f));
+	__m256 destR = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(destCols, 16), _mm256_set1_epi32(0xff)));
+	__m256 destG = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(destCols, 8), _mm256_set1_epi32(0xff)));
+	__m256 destB = _mm256_cvtepi32_ps(_mm256_and_si256(destCols, _mm256_set1_epi32(0xff)));
+
+	// the destination alpha gets multiplied by 255 - source alpha
+	destA = _mm256_mul_ps(destA, _mm256_sub_ps(_mm256_set1_ps(1.0f), srcA));
+
+	// ((src * sAlpha) + (dest * dAlpha)) / (sAlpha + dAlpha)
+	__m256 combA = _mm256_add_ps(srcA, destA);
+	__m256 combArcp = _mm256_rcp_ps(combA);
+	destR = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(srcR, srcA), _mm256_mul_ps(destR, destA)), combArcp);
+	destG = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(srcG, srcA), _mm256_mul_ps(destG, destA)), combArcp);
+	destB = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(srcB, srcA), _mm256_mul_ps(destB, destA)), combArcp);
+	combA = _mm256_mul_ps(combA, _mm256_set1_ps(255.0));
+
+	// Now put it back together
+	return _mm256_or_si256(_mm256_slli_epi32(_mm256_cvtps_epi32(combA), 24),
+		_mm256_or_si256(_mm256_slli_epi32(_mm256_cvtps_epi32(destR), 16),
+		_mm256_or_si256(_mm256_slli_epi32(_mm256_cvtps_epi32(destG), 8),
+			_mm256_cvtps_epi32(destB))));
+}
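+// Scalar form of the blend above (reference only), with sA and dA normalized to [0, 1]:
+//   dA' = dA * (1 - sA);  outA = sA + dA';
+//   outC = (srcC * sA + destC * dA') / outA   for each of R, G, B
+//   result = (outA * 255) << 24 | outR << 16 | outG << 8 | outB
+// (the division uses _mm256_rcp_ps, so it is only an approximation of the exact quotient)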
+
+inline __m256i blendTintSpriteSIMD(__m256i srcCols, __m256i destCols, __m256i alphas, bool light) {
+	// This function is NOT 1 to 1 with the original... It just approximates it
+	// It gets the value of the HSV of the dest color
+	// Then it gets the HSV of the srcCols
+
+	// how the values are transformed
+	// from 1 __m256i srcCols with each lane being an ARGB uint32
+	// srcCols[0] = A | R | G | B
+	// srcCols[1] = A | R | G | B
+	// ...
+	// srcCols[7] = A | R | G | B
+	//  ->
+	// to separate __m256 float vectors, one per channel, with each lane
+	// corresponding to its respective srcCols lane
+	// dda = { A[0], A[1], ..., A[7] }
+	// ddr = { R[0], R[1], ..., R[7] }
+	// ddg = { G[0], G[1], ..., G[7] }
+	// ddb = { B[0], B[1], ..., B[7] }
+
+	// do the transformation (we don't actually need alpha at all)
+	__m256 ddr, ddg, ddb;
+	ddr = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(destCols, 16), _mm256_set1_epi32(0xff))), _mm256_set1_ps(1.0f / 255.0f));
+	ddg = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(destCols, 8), _mm256_set1_epi32(0xff))), _mm256_set1_ps(1.0f / 255.0f));
+	ddb = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(destCols, _mm256_set1_epi32(0xff))), _mm256_set1_ps(1.0f / 255.0f));
+	__m256 ssr, ssg, ssb;
+	ssr = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(srcCols, 16), _mm256_set1_epi32(0xff))), _mm256_set1_ps(1.0f / 255.0f));
+	ssg = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(srcCols, 8), _mm256_set1_epi32(0xff))), _mm256_set1_ps(1.0f / 255.0f));
+	ssb = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(srcCols, _mm256_set1_epi32(0xff))), _mm256_set1_ps(1.0f / 255.0f));
+
+	// Get the maxes and mins (needed for HSV->RGB and vice versa)
+	__m256 dmaxes = _mm256_max_ps(ddr, _mm256_max_ps(ddg, ddb));
+	__m256 smaxes = _mm256_max_ps(ssr, _mm256_max_ps(ssg, ssb));
+	__m256 smins = _mm256_min_ps(ssr, _mm256_min_ps(ssg, ssb));
+
+	// This is here to stop us from dividing by 0
+	const __m256 eplison0 = _mm256_set1_ps(0.0000001f);
+
+	__m256 chroma = _mm256_max_ps(_mm256_sub_ps(smaxes, smins), eplison0);
+
+	// RGB to HSV is a piecewise function, so we compute each part of the function first...
+	__m256 hr, hg, hb, hue;
+	hr = _mm256_div_ps(_mm256_sub_ps(ssg, ssb), chroma);
+	hr = _mm256_sub_ps(hr, _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_cvtps_epi32(_mm256_mul_ps(hr, _mm256_set1_ps(1.0f / 6.0f)))), _mm256_set1_ps(6.0f)));
+	hr = _mm256_add_ps(hr, _mm256_and_ps(_mm256_cmp_ps(hr, _mm256_setzero_ps(), _CMP_LT_OS), _mm256_set1_ps(6.0f)));
+	hg = _mm256_add_ps(_mm256_div_ps(_mm256_sub_ps(ssb, ssr), chroma), _mm256_set1_ps(2.0f));
+	hg = _mm256_max_ps(hg, _mm256_setzero_ps());
+	hb = _mm256_add_ps(_mm256_div_ps(_mm256_sub_ps(ssr, ssg), chroma), _mm256_set1_ps(4.0f));
+	hb = _mm256_max_ps(hb, _mm256_setzero_ps());
+
+	// And then compute which one will be used based on criteria
+	__m256 hrfactors = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(ssr, smaxes, _CMP_EQ_OS), _mm256_cmp_ps(ssr, ssb, _CMP_NEQ_OS)), _mm256_set1_ps(1.0f));
+	__m256 hgfactors = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(ssg, smaxes, _CMP_EQ_OS), _mm256_cmp_ps(ssg, ssr, _CMP_NEQ_OS)), _mm256_set1_ps(1.0f));
+	__m256 hbfactors = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(ssb, smaxes, _CMP_EQ_OS), _mm256_cmp_ps(ssb, ssg, _CMP_NEQ_OS)), _mm256_set1_ps(1.0f));
+	hue = _mm256_mul_ps(hr, hrfactors);
+	hue = _mm256_add_ps(hue, _mm256_mul_ps(hg, hgfactors));
+	hue = _mm256_add_ps(hue, _mm256_mul_ps(hb, hbfactors));
+
+	// Mess with the light like the original function
+	__m256 val = dmaxes;
+	if (light) {
+		val = _mm256_sub_ps(val, _mm256_sub_ps(_mm256_set1_ps(1.0f), _mm256_mul_ps(_mm256_cvtepi32_ps(alphas), _mm256_set1_ps(1.0f / 250.0f))));
+		val = _mm256_max_ps(val, _mm256_setzero_ps());
+	}
+		
+	// then it stitches the HSV back together
+	// the hue and saturation come from the source (tint) color, and the value comes from
+	// the destination (real source) color
+	chroma = _mm256_mul_ps(val, _mm256_div_ps(_mm256_sub_ps(smaxes, smins), _mm256_add_ps(smaxes, eplison0)));
+	__m256 hprime_mod2 = _mm256_mul_ps(hue, _mm256_set1_ps(1.0f / 2.0f));
+	hprime_mod2 = _mm256_mul_ps(_mm256_sub_ps(hprime_mod2, _mm256_cvtepi32_ps(_mm256_cvtps_epi32(_mm256_sub_ps(hprime_mod2, _mm256_set1_ps(0.5))))), _mm256_set1_ps(2.0f));
+	__m256 x = _mm256_mul_ps(chroma, _mm256_sub_ps(_mm256_set1_ps(1), _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)), _mm256_sub_ps(hprime_mod2, _mm256_set1_ps(1)))));
+	//float32x4_t x = vmulq_f32(chroma, vsubq_f32(vmovq_n_f32(1.0f), vabsq_f32(vsubq_f32(hprime_mod2, vmovq_n_f32(1.0f)))));
+	__m256i hprime_rounded = _mm256_cvtps_epi32(_mm256_sub_ps(hue, _mm256_set1_ps(0.5)));
+	__m256i x_int = _mm256_cvtps_epi32(_mm256_mul_ps(x, _mm256_set1_ps(255.0f)));
+	__m256i c_int = _mm256_cvtps_epi32(_mm256_mul_ps(chroma, _mm256_set1_ps(255.0f)));
+
+	// Again HSV->RGB is also a piecewise function
+	__m256i val0 = _mm256_or_si256(_mm256_slli_epi32(x_int, 8), _mm256_slli_epi32(c_int, 16));
+	val0 = _mm256_and_si256(val0, _mm256_or_si256(_mm256_cmpeq_epi32(hprime_rounded, _mm256_set1_epi32(0)), _mm256_cmpeq_epi32(hprime_rounded, _mm256_set1_epi32(6))));
+	__m256i val1 = _mm256_or_si256(_mm256_slli_epi32(c_int, 8), _mm256_slli_epi32(x_int, 16));
+	val1 = _mm256_and_si256(val1, _mm256_cmpeq_epi32(hprime_rounded, _mm256_set1_epi32(1)));
+	__m256i val2 = _mm256_or_si256(_mm256_slli_epi32(c_int, 8), x_int);
+	val2 = _mm256_and_si256(val2, _mm256_cmpeq_epi32(hprime_rounded, _mm256_set1_epi32(2)));
+	__m256i val3 = _mm256_or_si256(_mm256_slli_epi32(x_int, 8), c_int);
+	val3 = _mm256_and_si256(val3, _mm256_cmpeq_epi32(hprime_rounded, _mm256_set1_epi32(3)));
+	__m256i val4 = _mm256_or_si256(_mm256_slli_epi32(x_int, 16), c_int);
+	val4 = _mm256_and_si256(val4, _mm256_cmpeq_epi32(hprime_rounded, _mm256_set1_epi32(4)));
+	__m256i val5 = _mm256_or_si256(_mm256_slli_epi32(c_int, 16), x_int);
+	val5 = _mm256_and_si256(val5, _mm256_cmpeq_epi32(hprime_rounded, _mm256_set1_epi32(5)));
+
+	// or the values together
+	__m256i final = _mm256_or_si256(val0, _mm256_or_si256(val1, _mm256_or_si256(val2, _mm256_or_si256(val3, _mm256_or_si256(val4, val5)))));
+
+	// add the minimums back in
+	__m256i val_add = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_sub_ps(val, chroma), _mm256_set1_ps(255.0f)));
+	val_add = _mm256_or_si256(val_add, _mm256_or_si256(_mm256_slli_epi32(val_add, 8), _mm256_or_si256(_mm256_slli_epi32(val_add, 16), _mm256_and_si256(destCols, _mm256_set1_epi32(0xff000000)))));
+	final = _mm256_add_epi32(final, val_add);
+	return final;
+}
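+// High-level outline of the approximation above, per pixel:
+//   1. normalize the src (tint) and dest colors to [0, 1];
+//   2. take hue and saturation from src via the usual piecewise RGB->HSV formulas;
+//   3. take value from the max of the dest channels, darkened by (1 - alpha / 250) in "light" mode;
+//   4. convert back through the six-sector HSV->RGB table, add the (value - chroma) offset to every
+//      channel, and keep dest's alpha byte.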
+
+inline __m256i mul32_as16(__m256i a, __m256i b) {	
+	__m256i a16 = _mm256_packs_epi32(a, _mm256_setzero_si256());
+	__m256i b16 = _mm256_packs_epi32(b, _mm256_setzero_si256());
+	__m256i res = _mm256_mullo_epi16(a16, b16);
+	return _mm256_unpacklo_epi16(res, _mm256_setzero_si256());
+}
+
+inline __m256i findmin32_as16(__m256i a, __m256i b) {
+	__m256i a16 = _mm256_packs_epi32(a, _mm256_setzero_si256());
+	__m256i b16 = _mm256_packs_epi32(b, _mm256_setzero_si256());
+	__m256i res = _mm256_min_epi16(a16, b16);
+	return _mm256_unpacklo_epi16(res, _mm256_setzero_si256());
+}
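+// The two helpers above emulate a 32 bit multiply/min using 16 bit ops; this assumes the inputs fit
+// in 16 bits (alphas and small per-channel sums), which holds at the call sites below.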
+
+inline __m256i blendPixelSIMD(__m256i srcCols, __m256i destCols, __m256i alphas) {
+	__m256i srcAlphas, difAlphas, mask, ch1, ch2;
+	auto setupArgbAlphas = [&]() {
+		// This acts the same as this in the normal blender functions
+		// if (alpha == 0)
+		//     alpha = aSrc;
+		// else
+		//     alpha = aSrc * ((alpha & 0xff) + 1) / 256;
+		// where alpha is the alpha byte of the srcCols
+		srcAlphas = _mm256_srli_epi32(srcCols, 24);
+		difAlphas = _mm256_add_epi32(_mm256_and_si256(alphas, _mm256_set1_epi32(0xff)), _mm256_set1_epi32(1));
+		difAlphas = _mm256_srli_epi32(mul32_as16(srcAlphas, difAlphas), 8);
+		difAlphas = _mm256_slli_epi32(difAlphas, 24);
+		srcAlphas = _mm256_slli_epi32(srcAlphas, 24);
+		mask = _mm256_cmpeq_epi32(alphas, _mm256_setzero_si256());
+		srcAlphas = _mm256_and_si256(srcAlphas, mask);
+		difAlphas = _mm256_andnot_si256(mask, difAlphas);
+		srcCols = _mm256_and_si256(srcCols, _mm256_set1_epi32(0x00ffffff));
+		srcCols = _mm256_or_si256(srcCols, _mm256_or_si256(srcAlphas, difAlphas));
+	};
+	switch (_G(_blender_mode)) {
+	case kSourceAlphaBlender: // see BITMAP member function blendSourceAlpha
+		alphas = _mm256_srli_epi32(srcCols, 24);
+		return rgbBlendSIMD(srcCols, destCols, alphas, false);
+	case kArgbToArgbBlender: // see BITMAP member function blendArgbToArgb
+		setupArgbAlphas();
+		// only blend if alpha isn't 0, otherwise use destCols
+		mask = _mm256_cmpgt_epi32(_mm256_srli_epi32(srcCols, 24), _mm256_setzero_si256());
+		ch1 = _mm256_and_si256(argbBlendSIMD(srcCols, destCols), mask);
+		ch2 = _mm256_andnot_si256(mask, destCols);
+		return _mm256_or_si256(ch1, ch2);
+	case kArgbToRgbBlender: // see BITMAP member function blendArgbToRgb
+		setupArgbAlphas();
+		return rgbBlendSIMD(srcCols, destCols, _mm256_srli_epi32(srcCols, 24), false);
+	case kRgbToArgbBlender: // see BITMAP member function blendRgbToArgb
+		// if alpha is NOT 0 or 255
+		ch2 = _mm256_and_si256(srcCols, _mm256_set1_epi32(0x00ffffff));
+		ch2 = _mm256_or_si256(ch2, _mm256_slli_epi32(alphas, 24));
+		ch2 = argbBlendSIMD(ch2, destCols);
+		// if alpha is 0 or 255
+		ch1 = _mm256_or_si256(srcCols, _mm256_set1_epi32(0xff000000));
+		// mask and or them together
+		mask = _mm256_or_si256(_mm256_cmpeq_epi32(alphas, _mm256_setzero_si256()), _mm256_cmpeq_epi32(alphas, _mm256_set1_epi32(0xff)));
+		ch1 = _mm256_and_si256(ch1, mask);
+		ch2 = _mm256_andnot_si256(mask, ch2);
+		return _mm256_or_si256(ch1, ch2);
+	case kRgbToRgbBlender: // see BITMAP member function blendRgbToRgb
+		return rgbBlendSIMD(srcCols, destCols, alphas, false);
+	case kAlphaPreservedBlenderMode: // see BITMAP member function blendPreserveAlpha
+		return rgbBlendSIMD(srcCols, destCols, alphas, true);
+	case kOpaqueBlenderMode: // see BITMAP member function blendOpaque
+		return _mm256_or_si256(srcCols, _mm256_set1_epi32(0xff000000));
+	case kAdditiveBlenderMode: // see BITMAP member function blendAdditiveAlpha
+		srcAlphas = _mm256_add_epi32(_mm256_srli_epi32(srcCols, 24), _mm256_srli_epi32(destCols, 24));
+		srcAlphas = findmin32_as16(srcAlphas, _mm256_set1_epi32(0xff));
+		srcCols = _mm256_and_si256(srcCols, _mm256_set1_epi32(0x00ffffff));
+		return _mm256_or_si256(srcCols, _mm256_slli_epi32(srcAlphas, 24));
+	case kTintBlenderMode: // see BITMAP member function blendTintSprite
+		return blendTintSpriteSIMD(srcCols, destCols, alphas, false);
+	case kTintLightBlenderMode: // see BITMAP member function blendTintSprite
+		return blendTintSpriteSIMD(srcCols, destCols, alphas, true);
+	}
+	return _mm256_setzero_si256();
+}
+
+inline __m256i blendPixelSIMD2Bpp(__m256i srcCols, __m256i destCols, __m256i alphas) {
+	__m256i mask, ch1, ch2;
+	switch (_G(_blender_mode)) {
+	case kSourceAlphaBlender:
+	case kOpaqueBlenderMode:
+	case kAdditiveBlenderMode:
+		return srcCols;
+	case kArgbToArgbBlender:
+	case kArgbToRgbBlender:
+		ch1 = _mm256_and_si256(_mm256_set1_epi16(0xff), _mm256_cmpeq_epi16(alphas, _mm256_setzero_si256()));
+		ch2 = _mm256_and_si256(alphas, _mm256_cmpgt_epi16(alphas, _mm256_setzero_si256()));
+		alphas = _mm256_or_si256(ch1, ch2);
+		// fall through
+	case kRgbToRgbBlender:
+	case kAlphaPreservedBlenderMode:
+		return rgbBlendSIMD2Bpp(srcCols, destCols, alphas);
+	case kRgbToArgbBlender:
+		mask = _mm256_or_si256(_mm256_cmpeq_epi16(alphas, _mm256_set1_epi16(0)), _mm256_cmpeq_epi16(alphas, _mm256_set1_epi16(255)));
+		ch1 = _mm256_and_si256(srcCols, mask);
+		ch2 = _mm256_andnot_si256(mask, rgbBlendSIMD2Bpp(srcCols, destCols, alphas));
+		return _mm256_or_si256(ch1, ch2);
+	case kTintBlenderMode:
+	case kTintLightBlenderMode:
+		__m256i srcColsLo = simd2BppTo4Bpp(_mm256_and_si256(srcCols, _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1)));
+		__m256i srcColsHi = simd2BppTo4Bpp(_mm256_srli_si256(srcCols, 16));
+		__m256i destColsLo = simd2BppTo4Bpp(_mm256_and_si256(destCols, _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1)));
+		__m256i destColsHi = simd2BppTo4Bpp(_mm256_srli_si256(destCols, 16));
+		__m256i alphasLo = _mm256_unpacklo_epi16(_mm256_and_si256(alphas, _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1)), _mm256_setzero_si256());
+		__m256i alphasHi = _mm256_unpacklo_epi16(_mm256_srli_si256(alphas, 16), _mm256_setzero_si256());
+		__m256i lo = simd4BppTo2Bpp(blendTintSpriteSIMD(srcColsLo, destColsLo, alphasLo, _G(_blender_mode) == kTintLightBlenderMode));
+		__m256i hi = simd4BppTo2Bpp(blendTintSpriteSIMD(srcColsHi, destColsHi, alphasHi, _G(_blender_mode) == kTintLightBlenderMode));
+		return _mm256_or_si256(lo, _mm256_slli_si256(hi, 16));
+	}
+	return _mm256_setzero_si256();
+}
+
+template<int DestBytesPerPixel, int SrcBytesPerPixel>
+inline void drawPixelSIMD(byte *destPtr, const byte *srcP2, __m256i tint, __m256i alphas, __m256i maskedAlphas, __m256i transColors, int xDir, int xCtrBpp, int srcAlpha, int skipTrans, bool horizFlip, bool useTint, __m256i skipMask) {
+	__m256i srcCols, destCol;
+
+	if (DestBytesPerPixel == 4)
+		destCol = _mm256_loadu_si256((const __m256i *)destPtr);
+	else
+		destCol = simd2BppTo4Bpp(_mm256_and_si256(_mm256_loadu_si256((const __m256i *)destPtr), _mm256_set_epi64x(0, 0, -1, -1)));
+	if (SrcBytesPerPixel == 4)
+		srcCols = _mm256_loadu_si256((const __m256i *)(srcP2 + xDir * xCtrBpp));
+	else
+		srcCols = simd2BppTo4Bpp(_mm256_and_si256(_mm256_loadu_si256((const __m256i *)(srcP2 + xDir * xCtrBpp)), _mm256_set_epi64x(0, 0, -1, -1)));
+
+	// we do this here because we need to check if we should skip the pixel before we blend it
+	__m256i mask1 = skipTrans ? _mm256_cmpeq_epi32(_mm256_and_si256(srcCols, maskedAlphas), transColors) : _mm256_setzero_si256();
+	mask1 = _mm256_or_si256(mask1, skipMask);
+	if (srcAlpha != -1) {
+		// take useTint into account
+		if (useTint) {
+			srcCols = blendPixelSIMD(tint, srcCols, alphas);
+		} else {
+			srcCols = blendPixelSIMD(srcCols, destCol, alphas);
+		}
+	}
+	__m256i destCols2 = _mm256_and_si256(destCol, mask1);
+	__m256i srcCols2 = _mm256_andnot_si256(mask1, srcCols);
+	__m256i final = _mm256_or_si256(destCols2, srcCols2);
+	if (horizFlip) {
+        final = _mm256_shuffle_epi32(final, _MM_SHUFFLE(0, 1, 2, 3));
+        final = _mm256_permute2x128_si256(final, final, 0x01);
+	}
+	if (DestBytesPerPixel == 4) {
+		_mm256_storeu_si256((__m256i *)destPtr, final);
+	} else {
+        _mm_storeu_si128((__m128i *)destPtr, _mm256_extracti128_si256(simd4BppTo2Bpp(final), 0));
+	}
+}
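+// Summary of the 8-pixel step above: load dest and src (widening 2Bpp to 4Bpp where needed),
+// build a mask of pixels to keep from the destination (transparent source pixels plus the
+// caller-supplied skipMask for row overruns), blend unless srcAlpha == -1, pick blended or
+// original per lane, reverse the lane order for horizFlip, and store back (narrowing again for
+// 2Bpp destinations). drawPixelSIMD2Bpp below does the same for 16 pixels of 16 bit color.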
+
+inline void drawPixelSIMD2Bpp(byte *destPtr, const byte *srcP2, __m256i tint, __m256i alphas, __m256i transColors, int xDir, int xCtrBpp, int srcAlpha, int skipTrans, bool horizFlip, bool useTint, __m256i skipMask) {
+	__m256i destCol = _mm256_loadu_si256((const __m256i *)destPtr);
+	__m256i srcCols = _mm256_loadu_si256((const __m256i *)(srcP2 + xDir * xCtrBpp));
+	__m256i mask1 = skipTrans ? _mm256_cmpeq_epi16(srcCols, transColors) : _mm256_setzero_si256();
+	mask1 = _mm256_or_si256(mask1, skipMask);
+	if (srcAlpha != -1) {
+		// take useTint into account
+		if (useTint) {
+			srcCols = blendPixelSIMD2Bpp(tint, srcCols, alphas);
+		} else {
+			srcCols = blendPixelSIMD2Bpp(srcCols, destCol, alphas);
+		}
+	}
+	__m256i destCols2 = _mm256_and_si256(destCol, mask1);
+	__m256i srcCols2 = _mm256_andnot_si256(mask1, srcCols);
+	__m256i final = _mm256_or_si256(destCols2, srcCols2);
+	if (horizFlip) {
+		final = _mm256_shufflelo_epi16(final, _MM_SHUFFLE(0, 1, 2, 3));
+		final = _mm256_shufflehi_epi16(final, _MM_SHUFFLE(0, 1, 2, 3));
+		final = _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(final), _mm256_castsi256_pd(final), _MM_SHUFFLE2(0, 1)));
+	}
+	_mm256_storeu_si256((__m256i *)destPtr, final);
+}
+
+class DrawInnerImpl {
+public:
+
+// This template handles 2bpp and 4bpp, the other specializations handle 1bpp and format conversion blits
+template<int DestBytesPerPixel, int SrcBytesPerPixel, bool Scale>
+static void drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
+	const int xDir = args.horizFlip ? -1 : 1;
+	byte rSrc, gSrc, bSrc, aSrc;
+	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
+    __m256i tint = _mm256_slli_epi32(_mm256_set1_epi32(args.srcAlpha), 24);
+	tint = _mm256_or_si256(tint, _mm256_slli_epi32(_mm256_set1_epi32(args.tintRed), 16));
+	tint = _mm256_or_si256(tint, _mm256_slli_epi32(_mm256_set1_epi32(args.tintGreen), 8));
+	tint = _mm256_or_si256(tint, _mm256_set1_epi32(args.tintBlue));
+	__m256i maskedAlphas = _mm256_set1_epi32(args.alphaMask);
+	__m256i transColors = _mm256_set1_epi32(args.transColor);
+    __m256i alphas = _mm256_set1_epi32(args.srcAlpha);
+
+	// This is so that we can calculate what pixels to crop off in a vectorized way
+	__m256i addIndexes = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+	if (args.horizFlip) addIndexes = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+
+	// This is so that we can calculate in parallel the pixel indexes for scaled drawing
+	__m256i scaleAdds = _mm256_set_epi32((uint32)args.scaleX*7, (uint32)args.scaleX*6, (uint32)args.scaleX*5, (uint32)args.scaleX*4,
+        (uint32)args.scaleX*3, (uint32)args.scaleX*2, (uint32)args.scaleX, 0);
+
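+	// (Scalar equivalent of what each lane of scaleAdds is used for in the scaled loop below:
+	//    srcX   = (scaleXCtr + i * scaleX) >> SCALE_THRESHOLD_BITS;  // fixed point -> source pixel index
+	//    offset = srcX * SrcBytesPerPixel;                           // byte offset into the source row
+	//  which is what the srli/slli pair on the indexes does.)
+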
+	// Clip the bounds ahead of time (so we don't waste time checking if we are in bounds when
+	// we are in the inner loop)
+	int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = args.dstRect.width();
+	if (args.xStart + xCtrWidth > args.destArea.w) {
+		xCtrWidth = args.destArea.w - args.xStart;
+	}
+	if (args.xStart < 0) {
+		xCtrStart = -args.xStart;
+		xCtrBppStart = xCtrStart * SrcBytesPerPixel;
+		args.xStart = 0;
+	}
+	int destY = args.yStart, srcYCtr = 0, yCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 4 == 0) ? args.dstRect.height() : (args.dstRect.height() - 1);
+	if (Scale) yCtrHeight = args.dstRect.height();
+	if (args.yStart < 0) {
+		yCtr = -args.yStart;
+		destY = 0;
+		if (Scale) {
+			scaleYCtr = yCtr * args.scaleY;
+			srcYCtr = scaleYCtr / BITMAP::SCALE_THRESHOLD;
+		}
+	}
+	if (args.yStart + yCtrHeight > args.destArea.h) {
+		yCtrHeight = args.destArea.h - args.yStart;
+	}
+	
+	byte *destP = (byte *)args.destArea.getBasePtr(0, destY);
+	const byte *srcP = (const byte *)args.src.getBasePtr(
+	                       args.horizFlip ? args.srcArea.right - 8 : args.srcArea.left,
+	                       args.vertFlip ? args.srcArea.bottom - 1 - yCtr : args.srcArea.top + yCtr);
+	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
+		__m256i xCtrWidthSIMD = _mm256_set1_epi32(xCtrWidth); // This is the width of the row
+
+		if (!Scale) {
+			// If we are not scaling the image
+			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += SrcBytesPerPixel*8) {
+				byte *destPtr = &destP[destX * DestBytesPerPixel];
+				// Skip pixels that are beyond the row
+				__m256i skipMask = _mm256_cmpgt_epi32(_mm256_add_epi32(_mm256_add_epi32(_mm256_set1_epi32(xCtr), addIndexes), _mm256_set1_epi32(1)), xCtrWidthSIMD);
+				drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, skipMask);
+			}
+			// Goto next row in source and destination image
+			destP += args.destArea.pitch;
+			srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
+		} else {
+			// Here we are scaling the image
+			int newSrcYCtr = scaleYCtr / BITMAP::SCALE_THRESHOLD;
+			// Since the source yctr might not update every row of the destination, we have
+			// to see if we are on a new row...
+			if (srcYCtr != newSrcYCtr) {
+				int diffSrcYCtr = newSrcYCtr - srcYCtr; // Have we moved yet
+				srcP += args.src.pitch * diffSrcYCtr;
+				srcYCtr = newSrcYCtr;
+			}
+
+			// Now, since we might skip a pixel or two, or duplicate one, to reach the desired
+			// scaling size, we gather the pixels into a small temporary buffer and then
+			// call the drawPixelSIMD function on it
+			byte srcBuffer[4*8] = {0};
+			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart, scaleXCtr = xCtrStart * args.scaleX; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
+				if (yCtr + 1 == yCtrHeight && xCtr + 4 > xCtrWidth) break; // Don't go past the last 4 pixels
+				__m256i indexes = _mm256_set1_epi32(scaleXCtr);
+				// Calculate in parallel the indexes of the pixels
+				if (SrcBytesPerPixel == 4)
+					indexes = _mm256_slli_epi32(_mm256_srli_epi32(_mm256_add_epi32(indexes, scaleAdds), BITMAP::SCALE_THRESHOLD_BITS), 2);
+				else
+					indexes = _mm256_slli_epi32(_mm256_srli_epi32(_mm256_add_epi32(indexes, scaleAdds), BITMAP::SCALE_THRESHOLD_BITS), 1);
+				// Simply memcpy them in. memcpy has no real performance overhead here
+				memcpy(&srcBuffer[0*(size_t)SrcBytesPerPixel], srcP + _mm256_extract_epi32(indexes, 0), SrcBytesPerPixel);
+				memcpy(&srcBuffer[1*(size_t)SrcBytesPerPixel], srcP + _mm256_extract_epi32(indexes, 1), SrcBytesPerPixel);
+				memcpy(&srcBuffer[2*(size_t)SrcBytesPerPixel], srcP + _mm256_extract_epi32(indexes, 2), SrcBytesPerPixel);
+				memcpy(&srcBuffer[3*(size_t)SrcBytesPerPixel], srcP + _mm256_extract_epi32(indexes, 3), SrcBytesPerPixel);
+				memcpy(&srcBuffer[4*(size_t)SrcBytesPerPixel], srcP + _mm256_extract_epi32(indexes, 4), SrcBytesPerPixel);
+				memcpy(&srcBuffer[5*(size_t)SrcBytesPerPixel], srcP + _mm256_extract_epi32(indexes, 5), SrcBytesPerPixel);
+				memcpy(&srcBuffer[6*(size_t)SrcBytesPerPixel], srcP + _mm256_extract_epi32(indexes, 6), SrcBytesPerPixel);
+				memcpy(&srcBuffer[7*(size_t)SrcBytesPerPixel], srcP + _mm256_extract_epi32(indexes, 7), SrcBytesPerPixel);
+				scaleXCtr += args.scaleX*8;
+
+				// Now this is pretty much the same as the non-scaled code before, except that we use
+				// our temporary source buffer instead of the actual source bitmap
+				byte *destPtr = &destP[destX * (intptr_t)DestBytesPerPixel];
+				__m256i skipMask = _mm256_cmpgt_epi32(_mm256_add_epi32(_mm256_add_epi32(_mm256_set1_epi32(xCtr), addIndexes), _mm256_set1_epi32(1)), xCtrWidthSIMD);
+				drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, (const byte *)srcBuffer, tint, alphas, maskedAlphas, transColors, 1, 0, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, skipMask);
+			}
+			// We handle every row here except the last (because there we need to
+			// check whether we fall off the edge of the row).
+			// The only exception is scaled drawing; this is because:
+			// 1) if statements are costly, and the fewer we do the faster this loop is
+			// 2) with this, the only branch in the normal drawing loop is the width check
+			// 3) the scaling code actually draws up to the last few pixels of the image
+			//    and does the extra if checks, because the scaling code is already much slower
+			//    than the normal drawing loop, and less duplicate code helps here.
+			if (yCtr + 1 != yCtrHeight) destP += args.destArea.pitch;
+		}
+	}
+
+	// Get the last x values of the last row
+	int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart;
+	// If the width is a multiple of 8, there are no extra pixels to draw
+	if (xCtrWidth % 8 == 0) return;
+	// Draw the last few non-scaled pixels here.
+	// Same as the loop above, but now the loop condition itself keeps us from overflowing,
+	// so we don't need to mask out pixels that go over the row.
+	if (!Scale) {
+		for (; xCtr + 8 < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += SrcBytesPerPixel*8) {
+			byte *destPtr = &destP[(ptrdiff_t)destX * DestBytesPerPixel];
+			drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, _mm256_setzero_si256());
+		}
+		// Because we move in 8 pixel units, and horizFlip moves in 1, we have to move
+		// 1 pixel past the last pixel we did not blit, meaning going forward 7 pixels.
+		if (args.horizFlip) srcP += SrcBytesPerPixel * 7;
+	} else {
+		// So if we are scaling, set up the xCtr to what it was before (AKA the last 8 or so pixels of the image)
+		xCtr = xCtrWidth - xCtrWidth % 8;
+		xCtrBpp = xCtr * SrcBytesPerPixel;
+		destX = args.xStart+xCtr;
+	}
+
+	// For the last 8 pixels, we just do them in serial, nothing special
+	for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += SrcBytesPerPixel) {
+		const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
+		if (Scale) {
+			srcColPtr = (const byte *)(srcP + (xCtr * args.scaleX) / BITMAP::SCALE_THRESHOLD * SrcBytesPerPixel);
+		}
+		byte *destVal = (byte *)&destP[destX * DestBytesPerPixel];
+		uint32 srcCol = args.dstBitmap.getColor(srcColPtr, SrcBytesPerPixel);
+		
+		// Check if this is a transparent color we should skip
+		if (args.skipTrans && ((srcCol & args.alphaMask) == args.transColor))
+			continue;
+
+		args.src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
+		if (args.srcAlpha != -1) {
+			if (args.useTint) {
+				rDest = rSrc;
+				gDest = gSrc;
+				bDest = bSrc;
+				aDest = aSrc;
+				rSrc = args.tintRed;
+				gSrc = args.tintGreen;
+				bSrc = args.tintBlue;
+				aSrc = args.srcAlpha;
+			}
+			args.dstBitmap.blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, args.srcAlpha, args.useTint, destVal);
+			srcCol = args.dstBitmap.format.ARGBToColor(aDest, rDest, gDest, bDest);
+		} else {
+			srcCol = args.dstBitmap.format.ARGBToColor(aSrc, rSrc, gSrc, bSrc);
+		}
+		if (DestBytesPerPixel == 4)
+			*(uint32 *)destVal = srcCol;
+		else
+			*(uint16 *)destVal = srcCol;
+	}
+}
+
+template<bool Scale>
+static void drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
+	const int xDir = args.horizFlip ? -1 : 1;
+	byte rSrc, gSrc, bSrc, aSrc;
+	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
+	__m256i tint = _mm256_set1_epi16(args.src.format.ARGBToColor(args.srcAlpha, args.tintRed, args.tintGreen, args.tintBlue));
+	__m256i transColors = _mm256_set1_epi16(args.transColor);
+	__m256i alphas = _mm256_set1_epi16(args.srcAlpha);
+
+	// This is so that we can calculate what pixels to crop off in a vectorized way
+	__m256i addIndexes = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+
+	// This is so that we can calculate in parallel the pixel indexes for scaled drawing
+	if (args.horizFlip) addIndexes = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+	__m256i scaleAdds = _mm256_set_epi32((uint32)args.scaleX*7, (uint32)args.scaleX*6, (uint32)args.scaleX*5, (uint32)args.scaleX*4, (uint32)args.scaleX*3, (uint32)args.scaleX*2, (uint32)args.scaleX, 0);
+	__m256i scaleAdds2 = _mm256_set_epi32((uint32)args.scaleX*15, (uint32)args.scaleX*14, (uint32)args.scaleX*13, (uint32)args.scaleX*12, (uint32)args.scaleX*11, (uint32)args.scaleX*10, (uint32)args.scaleX*9, (uint32)args.scaleX*8);
+
+	// Clip the bounds ahead of time (so we don't waste time checking if we are in bounds when
+	// we are in the inner loop)
+	int xCtrStart = 0, xCtrBppStart = 0, xCtrWidth = args.dstRect.width();
+	if (args.xStart + xCtrWidth > args.destArea.w) {
+		xCtrWidth = args.destArea.w - args.xStart;
+	}
+	if (args.xStart < 0) {
+		xCtrStart = -args.xStart;
+		xCtrBppStart = xCtrStart * 2;
+		args.xStart = 0;
+	}
+	int destY = args.yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 16 == 0) ? args.dstRect.height() : (args.dstRect.height() - 1);
+	if (Scale) yCtrHeight = args.dstRect.height();
+	if (args.yStart < 0) {
+		yCtr = -args.yStart;
+		destY = 0;
+		if (Scale) {
+			scaleYCtr = yCtr * args.scaleY;
+			srcYCtr = scaleYCtr / BITMAP::SCALE_THRESHOLD;
+		}
+	}
+	if (args.yStart + yCtrHeight > args.destArea.h) {
+		yCtrHeight = args.destArea.h - args.yStart;
+	}
+	
+	byte *destP = (byte *)args.destArea.getBasePtr(0, destY);
+	const byte *srcP = (const byte *)args.src.getBasePtr(
+	                       args.horizFlip ? args.srcArea.right - 16 : args.srcArea.left,
+	                       args.vertFlip ? args.srcArea.bottom - 1 - yCtr : args.srcArea.top + yCtr);
+	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
+		__m256i xCtrWidthSIMD = _mm256_set1_epi16(xCtrWidth); // This is the width of the row
+		if (!Scale) {
+			// If we are not scaling the image
+			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart; xCtr < xCtrWidth; destX += 16, xCtr += 16, xCtrBpp += 32) {
+				byte *destPtr = &destP[destX * 2];
+				// Skip pixels that are beyond the row
+				__m256i skipMask = _mm256_cmpgt_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_set1_epi16(xCtr), addIndexes), _mm256_set1_epi16(1)), xCtrWidthSIMD);
+				drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, skipMask);
+			}
+			// Goto next row in source and destination image
+			destP += args.destArea.pitch;
+			srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
+		} else {
+			// Here we are scaling the image
+			int newSrcYCtr = scaleYCtr / BITMAP::SCALE_THRESHOLD;
+			// Since the source yctr might not update every row of the destination, we have
+			// to see if we are on a new row...
+			if (srcYCtr != newSrcYCtr) {
+				int diffSrcYCtr = newSrcYCtr - srcYCtr;
+				srcP += args.src.pitch * diffSrcYCtr;
+				srcYCtr = newSrcYCtr;
+			}
+
+			// Now, since we might skip a pixel or two, or duplicate one, to reach the desired
+			// scaling size, we gather the pixels into a small temporary buffer and then
+			// call the drawPixelSIMD2Bpp function on it
+			uint16 srcBuffer[16];
+			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart, scaleXCtr = xCtrStart * args.scaleX; xCtr < xCtrWidth; destX += 16, xCtr += 16, xCtrBpp += 32) {
+				if (yCtr + 1 == yCtrHeight && xCtr + 8 > xCtrWidth) break;
+				__m256i indexes = _mm256_set1_epi32(scaleXCtr), indexes2 = _mm256_set1_epi32(scaleXCtr);
+				// Calculate in parallel the indexes of the pixels
+				indexes = _mm256_slli_epi32(_mm256_srli_epi32(_mm256_add_epi32(indexes, scaleAdds), BITMAP::SCALE_THRESHOLD_BITS), 1);
+				indexes2 = _mm256_slli_epi32(_mm256_srli_epi32(_mm256_add_epi32(indexes2, scaleAdds2), BITMAP::SCALE_THRESHOLD_BITS), 1);
+				// Load the pixels in directly as 16-bit values
+				srcBuffer[0] = *(const uint16 *)(srcP + _mm256_extract_epi32(indexes, 0));
+				srcBuffer[1] = *(const uint16 *)(srcP + _mm256_extract_epi32(indexes, 1));
+				srcBuffer[2] = *(const uint16 *)(srcP + _mm256_extract_epi32(indexes, 2));
+				srcBuffer[3] = *(const uint16 *)(srcP + _mm256_extract_epi32(indexes, 3));
+				srcBuffer[4] = *(const uint16 *)(srcP + _mm256_extract_epi32(indexes, 4));
+				srcBuffer[5] = *(const uint16 *)(srcP + _mm256_extract_epi32(indexes, 5));
+				srcBuffer[6] = *(const uint16 *)(srcP + _mm256_extract_epi32(indexes, 6));
+				srcBuffer[7] = *(const uint16 *)(srcP + _mm256_extract_epi32(indexes, 7));
+				srcBuffer[8] = *(const uint16 *)(srcP + _mm256_extract_epi32(indexes2, 0));
+				srcBuffer[9] = *(const uint16 *)(srcP + _mm256_extract_epi32(indexes2, 1));
+				srcBuffer[10] = *(const uint16 *)(srcP + _mm256_extract_epi32(indexes2, 2));
+				srcBuffer[11] = *(const uint16 *)(srcP + _mm256_extract_epi32(indexes2, 3));
+				srcBuffer[12] = *(const uint16 *)(srcP + _mm256_extract_epi32(indexes2, 4));
+				srcBuffer[13] = *(const uint16 *)(srcP + _mm256_extract_epi32(indexes2, 5));
+				srcBuffer[14] = *(const uint16 *)(srcP + _mm256_extract_epi32(indexes2, 6));
+				srcBuffer[15] = *(const uint16 *)(srcP + _mm256_extract_epi32(indexes2, 7));
+				scaleXCtr += args.scaleX*16;
+
+				// Now this is pretty much the same as the non-scaled code before, except that we use
+				// our temporary source buffer instead of the actual source bitmap
+				byte *destPtr = &destP[destX * 2];
+				__m256i skipMask = _mm256_cmpgt_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_set1_epi16(xCtr), addIndexes), _mm256_set1_epi16(1)), xCtrWidthSIMD);
+				drawPixelSIMD2Bpp(destPtr, (const byte *)srcBuffer, tint, alphas, transColors, 1, 0, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, skipMask);
+			}
+			// We handle every row here except the last (because there we need to
+			// check whether we fall off the edge of the row).
+			// The only exception is scaled drawing; this is because:
+			// 1) if statements are costly, and the fewer we do the faster this loop is
+			// 2) with this, the only branch in the normal drawing loop is the width check
+			// 3) the scaling code actually draws up to the last few pixels of the image
+			//    and does the extra if checks, because the scaling code is already much slower
+			//    than the normal drawing loop, and less duplicate code helps here.
+			if (yCtr + 1 != yCtrHeight) destP += args.destArea.pitch;
+		}
+	}
+
+	// We have a picture that is a multiple of 16, so no extra pixels to draw
+	if (xCtrWidth % 16 == 0) return;
+	// Get the last x values of the last row
+	int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart;
+	// Draw the last few non-scaled pixels here.
+	// Same as the loop above, but now the loop condition itself keeps us from overflowing,
+	// so we don't need to mask out pixels that go over the row.
+	if (!Scale) {
+		for (; xCtr + 16 < xCtrWidth; destX += 16, xCtr += 16, xCtrBpp += 32) {
+			byte *destPtr = &destP[destX * 2];
+			drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, _mm256_setzero_si256());
+		}
+		// Because we move in 16 pixel units, and horizFlip moves in 1, we have to move
+		// 1 pixel past the last pixel we did not blit, meaning going forward 15 pixels.
+		if (args.horizFlip) srcP += 2 * 15;
+	} else {
+		// So if we are scaling, set up the xCtr to what it was before (AKA the last 16 or so pixels of the image)
+		xCtr = xCtrWidth - xCtrWidth % 16;
+		xCtrBpp = xCtr * 2;
+		destX = args.xStart+xCtr;
+	}
+
+	// For the last 16 pixels, we just do them in serial, nothing special
+	for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += 2) {
+		const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
+		if (Scale) {
+			srcColPtr = (const byte *)(srcP + (xCtr * args.scaleX) / BITMAP::SCALE_THRESHOLD * 2);
+		}
+		byte *destVal = (byte *)&destP[destX * 2];
+		uint32 srcCol = (uint32)(*(const uint16 *)srcColPtr);
+		
+		// Check if this is a transparent color we should skip
+		if (args.skipTrans && srcCol == args.transColor)
+			continue;
+
+		args.src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
+		if (args.srcAlpha != -1) {
+			if (args.useTint) {
+				rDest = rSrc;
+				gDest = gSrc;
+				bDest = bSrc;
+				aDest = aSrc;
+				rSrc = args.tintRed;
+				gSrc = args.tintGreen;
+				bSrc = args.tintBlue;
+				aSrc = args.srcAlpha;
+			}/* else {
+				format.colorToARGB((uint32)(*(uint16 *)destVal), aDest, rDest, gDest, bDest);
+			}*/
+			args.dstBitmap.blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, args.srcAlpha, args.useTint, destVal);
+			srcCol = args.dstBitmap.format.ARGBToColor(aDest, rDest, gDest, bDest);
+		} else {
+			srcCol = args.dstBitmap.format.ARGBToColor(aSrc, rSrc, gSrc, bSrc);
+		}
+		*(uint16 *)destVal = srcCol;
+	}
+}
+
+template<bool Scale>
+static void drawInner1Bpp(BITMAP::DrawInnerArgs &args) {
+	const int xDir = args.horizFlip ? -1 : 1;
+	__m256i transColors = _mm256_set1_epi16(args.transColor | (args.transColor << 8));
+
+	// This is so that we can calculate in parallel the pixel indexes for scaled drawing
+	__m256i scaleAdds1 = _mm256_set_epi32((uint32)args.scaleX*7, (uint32)args.scaleX*6, (uint32)args.scaleX*5, (uint32)args.scaleX*4, (uint32)args.scaleX*3, (uint32)args.scaleX*2, (uint32)args.scaleX, 0);
+	__m256i scaleAdds2 = _mm256_set_epi32((uint32)args.scaleX*15, (uint32)args.scaleX*14, (uint32)args.scaleX*13, (uint32)args.scaleX*12, (uint32)args.scaleX*11, (uint32)args.scaleX*10, (uint32)args.scaleX*9, (uint32)args.scaleX*8);
+	__m256i scaleAdds3 = _mm256_set_epi32((uint32)args.scaleX*23, (uint32)args.scaleX*22, (uint32)args.scaleX*21, (uint32)args.scaleX*20, (uint32)args.scaleX*19, (uint32)args.scaleX*18, (uint32)args.scaleX*17, (uint32)args.scaleX*16);
+	__m256i scaleAdds4 = _mm256_set_epi32((uint32)args.scaleX*31, (uint32)args.scaleX*30, (uint32)args.scaleX*29, (uint32)args.scaleX*28, (uint32)args.scaleX*27, (uint32)args.scaleX*26, (uint32)args.scaleX*25, (uint32)args.scaleX*24);
+	
+	// Clip the bounds ahead of time (so we don't waste time checking if we are in bounds when
+	// we are in the inner loop)
+	int xCtrStart = 0, xCtrWidth = args.dstRect.width();
+	if (args.xStart + xCtrWidth > args.destArea.w) {
+		xCtrWidth = args.destArea.w - args.xStart;
+	}
+	if (args.xStart < 0) {
+		xCtrStart = -args.xStart;
+		args.xStart = 0;
+	}
+	int destY = args.yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = args.dstRect.height();
+	if (Scale) yCtrHeight = args.dstRect.height();
+	if (args.yStart < 0) {
+		yCtr = -args.yStart;
+		destY = 0;
+		if (Scale) {
+			scaleYCtr = yCtr * args.scaleY;
+			srcYCtr = scaleYCtr / BITMAP::SCALE_THRESHOLD;
+		}
+	}
+	if (args.yStart + yCtrHeight > args.destArea.h) {
+		yCtrHeight = args.destArea.h - args.yStart;
+	}
+	
+	byte *destP = (byte *)args.destArea.getBasePtr(0, destY);
+	const byte *srcP = (const byte *)args.src.getBasePtr(
+	                       args.horizFlip ? args.srcArea.right - 32 : args.srcArea.left,
+	                       args.vertFlip ? args.srcArea.bottom - 1 - yCtr : args.srcArea.top + yCtr);
+	for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
+		if (Scale) {
+			// So here we update the srcYCtr differently due to this being for
+			// scaling
+			int newSrcYCtr = scaleYCtr / BITMAP::SCALE_THRESHOLD;
+			if (srcYCtr != newSrcYCtr) {
+				// Since the source yctr might not update every row of the destination, we have
+				// to see if we are on a new row...
+				int diffSrcYCtr = newSrcYCtr - srcYCtr;
+				srcP += args.src.pitch * diffSrcYCtr;
+				srcYCtr = newSrcYCtr;
+			}
+		}
+		int xCtr = xCtrStart, destX = args.xStart, scaleXCtr = xCtrStart * args.scaleX;
+		for (; xCtr + 32 < xCtrWidth; destX += 32, xCtr += 32) {
+			byte *destPtr = &destP[destX];
+
+			// Here we don't use the drawPixelSIMD function because 1bpp bitmaps in Allegro
+			// can't have any blending applied to them
+			__m256i destCols = _mm256_loadu_si256((const __m256i *)destPtr);
+			__m256i srcCols = _mm256_loadu_si256((const __m256i *)(srcP + xDir * xCtr));
+			if (Scale) {
+				// If we are scaling, we have to set each pixel individually
+				__m256i indexes1 = _mm256_set1_epi32(scaleXCtr), indexes2 = _mm256_set1_epi32(scaleXCtr);
+				__m256i indexes3 = _mm256_set1_epi32(scaleXCtr), indexes4 = _mm256_set1_epi32(scaleXCtr);
+				// Calculate in parallel the indexes of the pixels
+				indexes1 = _mm256_srli_epi32(_mm256_add_epi32(indexes1, scaleAdds1), BITMAP::SCALE_THRESHOLD_BITS);
+				indexes2 = _mm256_srli_epi32(_mm256_add_epi32(indexes2, scaleAdds2), BITMAP::SCALE_THRESHOLD_BITS);
+				indexes3 = _mm256_srli_epi32(_mm256_add_epi32(indexes3, scaleAdds3), BITMAP::SCALE_THRESHOLD_BITS);
+				indexes4 = _mm256_srli_epi32(_mm256_add_epi32(indexes4, scaleAdds4), BITMAP::SCALE_THRESHOLD_BITS);
+				srcCols = _mm256_set_epi8(
+					srcP[_mm256_extract_epi32(indexes4, 7)],
+					srcP[_mm256_extract_epi32(indexes4, 6)],
+					srcP[_mm256_extract_epi32(indexes4, 5)],
+					srcP[_mm256_extract_epi32(indexes4, 4)],
+					srcP[_mm256_extract_epi32(indexes4, 3)],
+					srcP[_mm256_extract_epi32(indexes4, 2)],
+					srcP[_mm256_extract_epi32(indexes4, 1)],
+					srcP[_mm256_extract_epi32(indexes4, 0)],
+					srcP[_mm256_extract_epi32(indexes3, 7)],
+					srcP[_mm256_extract_epi32(indexes3, 6)],
+					srcP[_mm256_extract_epi32(indexes3, 5)],
+					srcP[_mm256_extract_epi32(indexes3, 4)],
+					srcP[_mm256_extract_epi32(indexes3, 3)],
+					srcP[_mm256_extract_epi32(indexes3, 2)],
+					srcP[_mm256_extract_epi32(indexes3, 1)],
+					srcP[_mm256_extract_epi32(indexes3, 0)],
+					srcP[_mm256_extract_epi32(indexes2, 7)],
+					srcP[_mm256_extract_epi32(indexes2, 6)],
+					srcP[_mm256_extract_epi32(indexes2, 5)],
+					srcP[_mm256_extract_epi32(indexes2, 4)],
+					srcP[_mm256_extract_epi32(indexes2, 3)],
+					srcP[_mm256_extract_epi32(indexes2, 2)],
+					srcP[_mm256_extract_epi32(indexes2, 1)],
+					srcP[_mm256_extract_epi32(indexes2, 0)],
+					srcP[_mm256_extract_epi32(indexes1, 7)],
+					srcP[_mm256_extract_epi32(indexes1, 6)],
+					srcP[_mm256_extract_epi32(indexes1, 5)],
+					srcP[_mm256_extract_epi32(indexes1, 4)],
+					srcP[_mm256_extract_epi32(indexes1, 3)],
+					srcP[_mm256_extract_epi32(indexes1, 2)],
+					srcP[_mm256_extract_epi32(indexes1, 1)],
+					srcP[_mm256_extract_epi32(indexes1, 0)]);
+				scaleXCtr += args.scaleX*16;
+			}
+
+			// Mask out transparent pixels
+			__m256i mask1 = args.skipTrans ? _mm256_cmpeq_epi8(srcCols, transColors) : _mm256_setzero_si256();
+			__m256i final = _mm256_or_si256(_mm256_andnot_si256(mask1, srcCols), _mm256_and_si256(destCols, mask1));
+			if (args.horizFlip) {
+				__m256i final_swap16 = _mm256_srli_epi16(final, 8);
+				final_swap16 = _mm256_or_si256(final_swap16, _mm256_slli_epi16(_mm256_and_si256(final, _mm256_set1_epi16(0xff)), 8));
+				final_swap16 = _mm256_shufflelo_epi16(final_swap16, _MM_SHUFFLE(0, 1, 2, 3));
+				final_swap16 = _mm256_shufflehi_epi16(final_swap16, _MM_SHUFFLE(0, 1, 2, 3));
+				final = _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(final_swap16), _mm256_castsi256_pd(final_swap16), _MM_SHUFFLE2(0, 1)));
+			}
+			_mm256_storeu_si256((__m256i *)destPtr, final);
+		}
+		// Get the last x values
+
+		// Because we move in 16 pixel units, and horizFlip moves in 1, we have to move
+		// 1 pixel past the last pixel we did not blit, meaning going forward 15 pixels.
+		if (args.horizFlip) srcP += 15;
+		for (; xCtr < xCtrWidth; ++destX, ++xCtr, scaleXCtr += args.scaleX) {
+			const byte *srcCol = (const byte *)(srcP + xDir * xCtr);
+			if (Scale) {
+				srcCol = (const byte *)(srcP + scaleXCtr / BITMAP::SCALE_THRESHOLD);
+			}
+			// Check if this is a transparent color we should skip
+			if (args.skipTrans && *srcCol == args.transColor)
+				continue;
+
+			byte *destVal = (byte *)&destP[destX];
+			*destVal = *srcCol;
+		}
+		if (args.horizFlip) srcP -= 15; // Undo what we did up there
+		destP += args.destArea.pitch; // Goto next row
+		// Only advance the src row by 1 every time like this if we don't scale
+		if (!Scale) srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
+	}
+}
+
+}; // end of class DrawInnerImpl
+
+template<bool Scale>
+void BITMAP::drawAVX2(DrawInnerArgs &args) {
+	if (args.sameFormat) {
+		switch (format.bytesPerPixel) {
+		case 1: DrawInnerImpl::drawInner1Bpp<Scale>(args); break;
+		case 2: DrawInnerImpl::drawInner2Bpp<Scale>(args); break;
+		case 4: DrawInnerImpl::drawInner4BppWithConv<4, 4, Scale>(args); break;
+		}
+	} else if (format.bytesPerPixel == 4 && args.src.format.bytesPerPixel == 2) { 
+		DrawInnerImpl::drawInner4BppWithConv<4, 2, Scale>(args);
+	} else if (format.bytesPerPixel == 2 && args.src.format.bytesPerPixel == 4) {
+		DrawInnerImpl::drawInner4BppWithConv<2, 4, Scale>(args);
+	}
+}
+
+template void BITMAP::drawAVX2<false>(DrawInnerArgs &);
+template void BITMAP::drawAVX2<true>(DrawInnerArgs &);
+
+} // namespace AGS3
diff --git a/engines/ags/lib/allegro/surface_generic.cpp b/engines/ags/lib/allegro/surface_generic.cpp
index 82626eb9859..e032dcd3109 100644
--- a/engines/ags/lib/allegro/surface_generic.cpp
+++ b/engines/ags/lib/allegro/surface_generic.cpp
@@ -136,6 +136,31 @@ void BITMAP::drawInnerGeneric(DrawInnerArgs &args) {
 					gSrc = args.tintGreen;
 					bSrc = args.tintBlue;
 					aSrc = args.srcAlpha;
+				} else {
+					uint32 destCol = getColor(destVal, DestBytesPerPixel);
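+					// Widen the destination color we just read to 8 bit ARGB so blendPixel sees
+					// full-range components: 8 bit palette entries come from args.palette, and
+					// 5/6 bit channels are expanded by replicating their high bits.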
+					if (DestBytesPerPixel == 1) {
+						const RGB &rgb = args.palette[destCol];
+						aDest = 0xff;
+						rDest = rgb.r;
+						gDest = rgb.g;
+						bDest = rgb.b;
+					} else {
+						if (DestBytesPerPixel == 4) {
+							aDest = destCol >> 24;
+							rDest = (destCol >> 16) & 0xff;
+							gDest = (destCol >> 8) & 0xff;
+							bDest = destCol & 0xff;
+						} else { // DestBytesPerPixel == 2
+							aDest = 0xff;
+							rDest = (destCol >> 11) & 0x1f;
+							rDest = (rDest << 3) | (rDest >> 2);
+							gDest = (destCol >> 5) & 0x3f;
+							gDest = (gDest << 2) | (gDest >> 4);
+							bDest = destCol & 0x1f;
+							bDest = (bDest << 3) | (bDest >> 2);
+						}
+						//src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
+					}
 				}
 				blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, args.srcAlpha, args.useTint, destVal);
 			}
diff --git a/engines/ags/lib/allegro/surface_sse2.cpp b/engines/ags/lib/allegro/surface_sse2.cpp
index 51f93df8eb8..0fd9bee60f4 100644
--- a/engines/ags/lib/allegro/surface_sse2.cpp
+++ b/engines/ags/lib/allegro/surface_sse2.cpp
@@ -624,7 +624,7 @@ static void drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
 }
 
 template<bool Scale>
-void drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
+static void drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
 	const int xDir = args.horizFlip ? -1 : 1;
 	byte rSrc, gSrc, bSrc, aSrc;
 	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
@@ -791,7 +791,7 @@ void drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
 }
 
 template<bool Scale>
-void drawInner1Bpp(BITMAP::DrawInnerArgs &args) {
+static void drawInner1Bpp(BITMAP::DrawInnerArgs &args) {
 	const int xDir = args.horizFlip ? -1 : 1;
 	__m128i transColors = _mm_set1_epi16(args.transColor | (args.transColor << 8));
 
@@ -936,5 +936,3 @@ template void BITMAP::drawSSE2<false>(DrawInnerArgs &);
 template void BITMAP::drawSSE2<true>(DrawInnerArgs &);
 
 } // namespace AGS3
-
-#endif
diff --git a/engines/ags/module.mk b/engines/ags/module.mk
index 33a7cd2d27a..ebdb2ef4a5c 100644
--- a/engines/ags/module.mk
+++ b/engines/ags/module.mk
@@ -387,6 +387,11 @@ MODULE_OBJS += \
 	lib/allegro/surface_sse2.o
 $(MODULE)/lib/allegro/surface_sse2.o: CXXFLAGS += -msse2
 endif
+ifeq ($(SCUMMVM_AVX2),1)
+MODULE_OBJS += \
+	lib/allegro/surface_avx2.o
+$(MODULE)/lib/allegro/surface_avx2.o: CXXFLAGS += -mavx2 -mavx -msse2
+endif
 
 # This module can be built as a plugin
 ifeq ($(ENABLE_AGS), DYNAMIC_PLUGIN)
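
Compiling only surface_avx2.o with -mavx2 keeps AVX2 instructions out of every other object, so the binary still starts on CPUs without AVX2; which path actually runs is then decided at run time through the engine's simd_flags global, as the test changes below show. A hypothetical dispatcher illustrating that split with the GCC/Clang x86 builtin rather than the engine's own detection (drawWithAVX2 is stubbed here only so the sketch is self-contained; in the real layout it would live in the -mavx2 object):

#include <cstdio>

static void drawGeneric()  { std::printf("generic path\n"); }
static void drawWithAVX2() { std::printf("AVX2 path\n"); }

static void draw() {
#if (defined(__GNUC__) || defined(__clang__)) && (defined(__x86_64__) || defined(__i386__))
	if (__builtin_cpu_supports("avx2")) { // runtime CPUID check
		drawWithAVX2(); // only reached on CPUs that report AVX2
		return;
	}
#endif
	drawGeneric();
}

int main() {
	draw();
	return 0;
}
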
diff --git a/engines/ags/tests/test_gfx.cpp b/engines/ags/tests/test_gfx.cpp
index 1f3c2ac25f5..5c5bb4e9fe9 100644
--- a/engines/ags/tests/test_gfx.cpp
+++ b/engines/ags/tests/test_gfx.cpp
@@ -24,8 +24,6 @@
 #include "common/debug.h"
 #include "ags/shared/core/platform.h"
 #include "ags/shared/gfx/gfx_def.h"
-//#include "ags/shared/debugging/assert.h"
-// File not present??
 #include "common/scummsys.h"
 #include "ags/lib/allegro/color.h"
 #include "ags/shared/gfx/bitmap.h"
@@ -36,28 +34,16 @@
 #include "graphics/managed_surface.h"
 #include "graphics/pixelformat.h"
 
-#ifdef __aarch64__
-#define OPT_NEON
-#include "ags/lib/allegro/surface_simd_neon.h"
-#elif defined(__x86_64__) || defined(__i686__)
-#define OPT_SSE
-#include "ags/lib/allegro/surface_simd_sse.h"
-#endif
-
 namespace AGS3 {
 
 namespace GfxDef = AGS::Shared::GfxDef;
 using namespace AGS::Shared;
 
-// Comment this out if you don't want the console to be clogged with info durning tests
-#define VERBOSE_TEST_GFX
-
-void Test_GfxSpeed(bool opt, int blenderModeStart, int blenderModeEnd) {
-	_G(_bitmap_simd_optimizations) = opt;
-#ifdef VERBOSE_TEST_GFX
-	if (opt) debug("SIMD optimizations: true\n");
+void Test_GfxSpeed(bool enableSimd, size_t blenderModeStart, size_t blenderModeEnd) {
+	uint oldSimdFlags = _G(simd_flags);
+	if (!enableSimd) _G(simd_flags) = AGS3::Globals::SIMD_NONE;
+	if (enableSimd) debug("SIMD optimizations: true\n");
 	else debug("SIMD optimizations: false\n");
-#endif
 	Bitmap *benchgfx32 = BitmapHelper::CreateBitmap(100, 100, 32);
 	Bitmap *benchgfx16 = BitmapHelper::CreateBitmapCopy(benchgfx32, 16);
 	Bitmap *benchgfx8 = BitmapHelper::CreateBitmap(100, 100, 8);
@@ -66,40 +52,48 @@ void Test_GfxSpeed(bool opt, int blenderModeStart, int blenderModeEnd) {
 	Bitmap *dest8 = BitmapHelper::CreateBitmap(100, 100, 8);
 	int benchRuns[] = {1000, 10000, 100000};
 	int blenderModes[] = {kRgbToRgbBlender, kSourceAlphaBlender, kArgbToArgbBlender, kOpaqueBlenderMode, kTintLightBlenderMode};
-	const char *modeNames[] = {"RGB to RGB", "Source Alpha", "ARGB to ARGB", "Opaque", "Tint with Light"};
+	//const char *modeNames[] = {"RGB to RGB", "Source Alpha", "ARGB to ARGB", "Opaque", "Tint with Light"};
 	Bitmap *destinations[] = {dest32, dest16, dest8};
 	Bitmap *graphics[] = {benchgfx32, benchgfx16, benchgfx8};
-	int bpps[] = {32, 16, 8};
+	uint64 time = 0, numIters = 0, timeNotStretched = 0, numItersNotStretched = 0, timeCommon = 0, numItersCommon = 0;
+	//int bpps[] = {32, 16, 8};
 	if (blenderModeEnd >= sizeof(blenderModes) / sizeof(blenderModes[0])) blenderModeEnd = (sizeof(blenderModes) / sizeof(blenderModes[0])) - 1;
 	for (int dest = 0; dest < 3; dest++) {
 		for (int gfx = 0; gfx < 3; gfx++) {
 			if (dest == 2 && gfx != 2) continue;
-			for (int mode = blenderModeStart; mode <= blenderModeEnd; mode++) {
+			for (size_t mode = blenderModeStart; mode <= blenderModeEnd; mode++) {
 				for (int runs = 0; (size_t)runs < sizeof(benchRuns)/sizeof(int); runs++) {
 					uint32 start, end;
 					_G(_blender_mode) = (AGS3::BlenderMode)blenderModes[mode];
-#ifdef VERBOSE_TEST_GFX
-					if (runs == 2) debug("Dest: %d bpp, Gfx: %d bpp, Blender: %s, Stretched: false, Iters: %d\n", bpps[dest], bpps[gfx], modeNames[mode], benchRuns[runs]);
-#endif
+					//if (runs == 2) debug("Dest: %d bpp, Gfx: %d bpp, Blender: %s, Stretched: false, Iters: %d\n", bpps[dest], bpps[gfx], modeNames[mode], benchRuns[runs]);
 					start = std::chrono::high_resolution_clock::now();
 					for (int i = 0; i < benchRuns[runs]; i++)
 						destinations[dest]->Blit(graphics[gfx], 0, 0, kBitmap_Transparency);
 					end = std::chrono::high_resolution_clock::now();
-#ifdef VERBOSE_TEST_GFX
-					if (runs == 2) debug("exec time (mills): %u\n\n", end - start);
-					if (runs == 2) debug("Dest: %d bpp, Gfx: %d bpp, Blender: %s, Stretched: true, Iters: %d\n", bpps[dest], bpps[gfx], modeNames[mode], benchRuns[runs]);
-#endif
+					timeNotStretched += end - start;
+					numItersNotStretched += benchRuns[runs];
+					if (mode == kArgbToArgbBlender || mode == kRgbToRgbBlender || mode == kRgbToArgbBlender || mode == kArgbToRgbBlender) {
+						timeCommon += end - start;
+						numItersCommon += benchRuns[runs];
+					}
+					time += end - start;
+					//if (runs == 2) debug("exec time (mills): %u\n\n", end - start);
+					//if (runs == 2) debug("Dest: %d bpp, Gfx: %d bpp, Blender: %s, Stretched: true, Iters: %d\n", bpps[dest], bpps[gfx], modeNames[mode], benchRuns[runs]);
 					start = std::chrono::high_resolution_clock::now();
 					for (int i = 0; i < benchRuns[runs]; i++)
 						destinations[dest]->StretchBlt(graphics[gfx], Rect(0, 0, 99, 99), kBitmap_Transparency);
 					end = std::chrono::high_resolution_clock::now();
-#ifdef VERBOSE_TEST_GFX
-					if (runs == 2) debug("exec time (mills): %u\n\n", end - start);
-#endif
+					time += end - start;
+					numIters += benchRuns[runs] * 2;
+					//if (runs == 2) debug("exec time (mills): %u\n\n", end - start);
 				}
 			}
 		}
 	}
+
+	debug("Over all blender modes, pixel formats, and stretching sizes (%f) avg millis per call.", (double)time / (double)numIters);
+	debug("Over all blender modes, pixel formats, but only unstretched (%f) avg millis per call.", (double)timeNotStretched / (double)numItersNotStretched);
+	debug("Over most common blender modes, all pixel formats, but only unstretched (%f) avg millis per call.", (double)timeCommon / (double)numItersCommon);
 	
 	delete benchgfx32;
 	delete benchgfx16;
@@ -107,12 +101,12 @@ void Test_GfxSpeed(bool opt, int blenderModeStart, int blenderModeEnd) {
 	delete dest32;
 	delete dest16;
 	delete dest8;
-}
 
+	if (!enableSimd) _G(simd_flags) = oldSimdFlags;
+}
 
 
-void Test_DrawingLoops() {
-		
+void printInfo(uint8 srcA, uint8 srcR, uint8 srcG, uint8 srcB, uint8 destA, uint8 destR, uint8 destG, uint8 destB, uint32 alpha, uint32 controlCol, uint32 simdCol) {
 }
 void Test_BlenderModes() {
 	constexpr int depth = 2;
@@ -120,6 +114,10 @@ void Test_BlenderModes() {
 	BITMAP dummy(&owner);
 	Graphics::ManagedSurface owner16(16, 16, Graphics::PixelFormat(2, 5, 6, 5, 0, 11, 5, 0, 0));
 	BITMAP dummy16(&owner16);
+	Graphics::ManagedSurface ownerDest(16, 16, Graphics::PixelFormat(4, 8, 8, 8, 8, 16, 8, 0, 24));
+	BITMAP dummyDest(&ownerDest);
+	Graphics::ManagedSurface ownerDest16(16, 16, Graphics::PixelFormat(2, 5, 6, 5, 0, 11, 5, 0, 0));
+	BITMAP dummyDest16(&ownerDest16);
 	for (int blenderMode = (int)kSourceAlphaBlender; blenderMode <= (int)kTintLightBlenderMode; blenderMode++) {
 		_G(_blender_mode) = (BlenderMode)blenderMode;
 		for (int srcR = 0; srcR < 255; srcR += (1 << (8 - depth))) {
@@ -132,72 +130,57 @@ void Test_BlenderModes() {
 									for (int destA = 0; destA < 255; destA += (1 << (8 - depth))) {
 										for (uint32 alpha = 0; alpha < 255; alpha += (1 << (8 - depth))) {
 											// First run the non-vectorized version of the code
-											uint32 controlCol, simdCol, pixelDummy;
-											uint16 control2bppCol, simd2bppCol, pixelDummy2bpp;
-											uint8 a = destA, r = destR, g = destG, b = destB;
-											pixelDummy = (a << 24) | (r << 16) | (g << 8) | b;
-											dummy.blendPixel(srcA, srcR, srcG, srcB, a, r, g, b, alpha, false, (byte *)&pixelDummy);
-											controlCol = b | (g << 8) | (r << 16) | (a << 24);
+											uint32 controlCol = 0, simdCol = 0;
+											uint16 control2bppCol = 0, simd2bppCol = 0;
+											uint8 a, r, g, b, a16, r16, g16, b16;
+											a = r = g = b = a16 = r16 = g16 = b16 = 0;
+											
+											auto printInfo = [&]() {
+												debug("src argb: %d, %d, %d, %d dest argb: %d, %d, %d, %d a: %d", srcA, srcR, srcG, srcB, destA, destR, destG, destB, alpha);
+												switch ((BlenderMode)blenderMode) {
+													case kSourceAlphaBlender: debug("blenderMode: kSourceAlphaBlender"); break;
+													case kArgbToArgbBlender: debug("blenderMode: kArgbToArgbBlender"); break;
+													case kArgbToRgbBlender: debug("blenderMode: kArgbToRgbBlender"); break;
+													case kRgbToArgbBlender: debug("blenderMode: kRgbToArgbBlender"); break;
+													case kRgbToRgbBlender: debug("blenderMode: kRgbToRgbBlender"); break;
+													case kAlphaPreservedBlenderMode: debug("blenderMode: kAlphaPreservedBlenderMode"); break;
+													case kOpaqueBlenderMode: debug("blenderMode: kOpaqueBlenderMode"); break;
+													case kAdditiveBlenderMode: debug("blenderMode: kAdditiveBlenderMode"); break;
+													case kTintBlenderMode: debug("blenderMode: kTintBlenderMode"); break;
+													case kTintLightBlenderMode: debug("blenderMode: kTintLightBlenderMode"); break;
+												}
+												debug("controlCol %x argb: %d, %d, %d, %d", controlCol, a, r, g, b);
+												debug("simdCol %x argb: %d, %d, %d, %d", simdCol, (simdCol >> 24), ((simdCol >> 16) & 0xff), ((simdCol >> 8) & 0xff), (simdCol & 0xff));
+												debug("control2bppCol %x rgb: %d, %d, %d", control2bppCol, r16, g16, b16);
+												debug("simd2bppCol %x rgb: %d, %d, %d", simd2bppCol, (simd2bppCol >> 11), ((simd2bppCol >> 5) & 0x3f), (simd2bppCol & 0x1f));
+											};
+
+											uint oldSimdFlags = _G(simd_flags);
+											_G(simd_flags) = AGS3::Globals::SIMD_NONE;
+											*(uint32 *)dummy.getBasePtr(0, 0) = dummy.format.ARGBToColor(srcA, srcR, srcG, srcB);
+											*(uint32 *)dummyDest.getBasePtr(0, 0) = dummyDest.format.ARGBToColor(destA, destR, destG, destB);
+											dummyDest.draw(&dummy, Common::Rect(16, 16), 0, 0, false, false, false, alpha);
+											controlCol = dummyDest.getpixel(0, 0);
+											dummyDest.format.colorToARGB(dummyDest.getpixel(0, 0), a, r, g, b);
+
+											*(uint16 *)dummy16.getBasePtr(0, 0) = dummy16.format.ARGBToColor(srcA, srcR, srcG, srcB);
+											*(uint16 *)dummyDest16.getBasePtr(0, 0) = dummyDest16.format.ARGBToColor(destA, destR, destG, destB);
+											dummyDest16.draw(&dummy16, Common::Rect(16, 16), 0, 0, false, false, false, alpha);
+											control2bppCol = dummyDest16.getpixel(0, 0);
+											dummyDest16.format.colorToARGB(dummyDest16.getpixel(0, 0), a16, r16, g16, b16);
+											a16 >>= 3; r16 >>= 3; g16 >>= 2; b16 >>= 3;
+											_G(simd_flags) = oldSimdFlags;
+
+											*(uint32 *)dummy.getBasePtr(0, 0) = dummy.format.ARGBToColor(srcA, srcR, srcG, srcB);
+											*(uint32 *)dummyDest.getBasePtr(0, 0) = dummyDest.format.ARGBToColor(destA, destR, destG, destB);
+											dummyDest.draw(&dummy, Common::Rect(16, 16), 0, 0, false, false, false, alpha);
+											simdCol = dummyDest.getpixel(0, 0);
+
+											*(uint16 *)dummy16.getBasePtr(0, 0) = dummy16.format.ARGBToColor(srcA, srcR, srcG, srcB);
+											*(uint16 *)dummyDest16.getBasePtr(0, 0) = dummyDest16.format.ARGBToColor(destA, destR, destG, destB);
+											dummyDest16.draw(&dummy16, Common::Rect(16, 16), 0, 0, false, false, false, alpha);
+											simd2bppCol = dummyDest16.getpixel(0, 0);
 
-											uint8 a16 = 0xff, r16 = destR >> 3, g16 = destG >> 2, b16 = destB >> 3;
-											r16 = (r16 << 3) | (r16 >> 2);
-											g16 = (g16 << 2) | (g16 >> 4);
-											b16 = (b16 << 3) | (b16 >> 2);
-											uint8 srcR16 = srcR >> 3, srcG16 = srcG >> 2, srcB16 = srcB >> 3;
-											srcR16 = (srcR16 << 3) | (srcR16 >> 2);
-											srcG16 = (srcG16 << 2) | (srcG16 >> 4);
-											srcB16 = (srcB16 << 3) | (srcB16 >> 2);
-											pixelDummy2bpp = (destB >> 3) | ((destG >> 2) << 5) | ((destR >> 3) << 11);
-											dummy16.blendPixel(0xff, srcR16, srcG16, srcB16, a16, r16, g16, b16, alpha, false, (byte *)&pixelDummy2bpp);
-											r16 >>= 3; g16 >>= 2; b16 >>= 3;
-											control2bppCol = b16 | (g16 << 5) | (r16 << 11);
-											{
-#ifdef OPT_NEON
-												uint32x4_t src = vdupq_n_u32(srcB | (srcG << 8) | (srcR << 16) | (srcA << 24));
-												uint32x4_t dest = vdupq_n_u32(destB | (destG << 8) | (destR << 16) | (destA << 24));
-												uint32x4_t alphas = vdupq_n_u32(alpha);
-												simdCol = vgetq_lane_u32(blendPixelSIMD(src, dest, alphas), 0);
-#else
-												__m128i src = _mm_set1_epi32(srcB | (srcG << 8) | (srcR << 16) | (srcA << 24));
-												__m128i dest = _mm_set1_epi32(destB | (destG << 8) | (destR << 16) | (destA << 24));
-												__m128i alphas = _mm_set1_epi32((int)alpha);
-												simdCol = _mm_cvtsi128_si32(blendPixelSIMD(src, dest, alphas));
-#endif
-											}
-											{
-#ifdef OPT_NEON
-												uint16x8_t src = vdupq_n_u16((srcB >> 3) | ((srcG >> 2) << 5) | ((srcR >> 3) << 11));
-												uint16x8_t dest = vdupq_n_u16((destB >> 3) | ((destG >> 2) << 5) | ((destR >> 3) << 11));
-												uint16x8_t alphas = vdupq_n_u16((uint16)alpha);
-												simd2bppCol = vgetq_lane_u16(blendPixelSIMD2Bpp(src, dest, alphas), 0);
-#else
-												__m128i src = _mm_set1_epi16((srcB >> 3) | ((srcG >> 2) << 5) | ((srcR >> 3) << 11));
-												__m128i dest = _mm_set1_epi16((destB >> 3) | ((destG >> 2) << 5) | ((destR >> 3) << 11));
-												__m128i alphas = _mm_set1_epi16((uint16)alpha);
-												simd2bppCol = (uint16)(_mm_cvtsi128_si32(blendPixelSIMD2Bpp(src, dest, alphas)) & 0xffff);
-#endif
-											}
-#ifdef VERBOSE_TEST_GFX
-											debug("src argb: %d, %d, %d, %d dest argb: %d, %d, %d, %d a: %d", srcA, srcR, srcG, srcB, destA, destR, destG, destB, alpha);
-#endif
-											switch ((BlenderMode)blenderMode) {
-												case kSourceAlphaBlender: debug("blenderMode: kSourceAlphaBlender"); break;
-												case kArgbToArgbBlender: debug("blenderMode: kArgbToArgbBlender"); break;
-												case kArgbToRgbBlender: debug("blenderMode: kArgbToRgbBlender"); break;
-												case kRgbToArgbBlender: debug("blenderMode: kRgbToArgbBlender"); break;
-												case kRgbToRgbBlender: debug("blenderMode: kRgbToRgbBlender"); break;
-												case kAlphaPreservedBlenderMode: debug("blenderMode: kAlphaPreservedBlenderMode"); break;
-												case kOpaqueBlenderMode: debug("blenderMode: kOpaqueBlenderMode"); break;
-												case kAdditiveBlenderMode: debug("blenderMode: kAdditiveBlenderMode"); break;
-												case kTintBlenderMode: debug("blenderMode: kTintBlenderMode"); break;
-												case kTintLightBlenderMode: debug("blenderMode: kTintLightBlenderMode"); break;
-											}
-#ifdef VERBOSE_TEST_GFX
-											debug("controlCol %x argb: %d, %d, %d, %d", controlCol, a, r, g, b);
-											debug("simdCol %x argb: %d, %d, %d, %d", simdCol, (simdCol >> 24), ((simdCol >> 16) & 0xff), ((simdCol >> 8) & 0xff), (simdCol & 0xff));
-											debug("control2bppCol %x rgb: %d, %d, %d", control2bppCol, r16, g16, b16);
-											debug("simd2bppCol %x rgb: %d, %d, %d", simd2bppCol, (simd2bppCol >> 11), ((simd2bppCol >> 5) & 0x3f), (simd2bppCol & 0x1f));
-#endif
 											int tolerance, tolerance16;
 											switch ((BlenderMode)blenderMode) {
 												// These need to be IDENTICAL for lamplight city to work
@@ -224,14 +207,38 @@ void Test_BlenderModes() {
 												tolerance = 2;
 												tolerance16 = 1;
 												break;
+
+												default:
+												tolerance = 0;
+											}
+											if (std::abs((int)a - (int)(simdCol >> 24)) > tolerance) {
+												printInfo();
+												assert(false && "a is over the tolerance");
+											}
+											if (std::abs((int)r - (int)((simdCol >> 16) & 0xff)) > tolerance) {
+												printInfo();
+												assert(false && "r is over the tolerance");
+											}
+											if (std::abs((int)g - (int)((simdCol >> 8) & 0xff)) > tolerance) {
+												printInfo();
+												assert(false && "g is over the tolerance");
+											}
+											if (std::abs((int)b - (int)(simdCol & 0xff)) > tolerance) {
+												printInfo();
+												assert(false && "b is over the tolerance");
+											}
+											if (std::abs((int)b16 - (int)(simd2bppCol & 0x1f)) > tolerance16) {
+												printInfo();
+												assert(false && "b16 is over the tolerance");
+											}
+											if (std::abs((int)g16 - (int)((simd2bppCol >> 5) & 0x3f)) > tolerance16) {
+												printInfo();
+												assert(false && "g16 is over the tolerance");
+											}
+											if (std::abs((int)r16 - (int)(simd2bppCol >> 11)) > tolerance16) {
+												printInfo();
+												assert(false && "r16 is over the tolerance");
 											}
-											assert(std::abs((int)a - (int)(simdCol >> 24)) <= tolerance);
-											assert(std::abs((int)r - (int)((simdCol >> 16) & 0xff)) <= tolerance);
-											assert(std::abs((int)g - (int)((simdCol >> 8) & 0xff)) <= tolerance);
-											assert(std::abs((int)b - (int)(simdCol & 0xff)) <= tolerance);
-											assert(std::abs((int)b16 - (int)(simd2bppCol & 0x1f)) <= tolerance16);
-											assert(std::abs((int)g16 - (int)((simd2bppCol >> 5) & 0x3f)) <= tolerance16);
-											assert(std::abs((int)r16 - (int)(simd2bppCol >> 11)) <= tolerance16);
 										}
 									}
 								}
@@ -259,16 +266,14 @@ void Test_GfxTransparency() {
 	}
 }
 
+#define SLOW_TESTS
 void Test_Gfx() {
 	Test_GfxTransparency();
-#if defined(OPT_NEON) || defined(OPT_SSE)
-	//Test_DrawingLoops();
-	//Test_BlenderModes();
+#if (defined(SCUMMVM_AVX2) || defined(SCUMMVM_SSE2) || defined(SCUMMVM_NEON)) && defined(SLOW_TESTS)
+	Test_BlenderModes();
 	// This could take a LONG time
-	bool has_simd = _G(_bitmap_simd_optimizations);
-	if (has_simd) Test_GfxSpeed(true, 0, kTintLightBlenderMode);
-	Test_GfxSpeed(false, 0, kTintLightBlenderMode);
-	_G(_bitmap_simd_optimizations) = has_simd;
+	Test_GfxSpeed(true, kSourceAlphaBlender, kTintLightBlenderMode);
+	Test_GfxSpeed(false, kSourceAlphaBlender, kTintLightBlenderMode);
 #endif
 }
 


Commit: 742f1f296dd052db30f2eabf2c3eb5e50e3ef308
    https://github.com/scummvm/scummvm/commit/742f1f296dd052db30f2eabf2c3eb5e50e3ef308
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-28T21:16:42+02:00

Commit Message:
AGS: Fixed code formatting

Changed paths:
    engines/ags/engine/main/engine.cpp
    engines/ags/lib/allegro/surface_neon.cpp
    engines/ags/lib/allegro/surface_sse2.cpp
    engines/ags/lib/std/functional.h
    engines/ags/tests/test_gfx.cpp


diff --git a/engines/ags/engine/main/engine.cpp b/engines/ags/engine/main/engine.cpp
index f9255b0ec1e..b1ce90e7122 100644
--- a/engines/ags/engine/main/engine.cpp
+++ b/engines/ags/engine/main/engine.cpp
@@ -23,7 +23,6 @@
 // Engine initialization
 //
 
-#include "ags/lib/allegro/color.h"
 #include "ags/shared/core/platform.h"
 #include "ags/lib/allegro.h" // allegro_install and _exit
 #include "ags/engine/ac/asset_helper.h"
diff --git a/engines/ags/lib/allegro/surface_neon.cpp b/engines/ags/lib/allegro/surface_neon.cpp
index 9eb5b4b54fd..15856ec0c5a 100644
--- a/engines/ags/lib/allegro/surface_neon.cpp
+++ b/engines/ags/lib/allegro/surface_neon.cpp
@@ -1,9 +1,9 @@
 #include <arm_neon.h>
-#include "ags/lib/allegro/gfx.h"
-#include "ags/lib/allegro/color.h"
-#include "ags/lib/allegro/flood.h"
 #include "ags/ags.h"
 #include "ags/globals.h"
+#include "ags/lib/allegro/color.h"
+#include "ags/lib/allegro/flood.h"
+#include "ags/lib/allegro/gfx.h"
 #include "common/textconsole.h"
 #include "graphics/screen.h"
 
diff --git a/engines/ags/lib/allegro/surface_sse2.cpp b/engines/ags/lib/allegro/surface_sse2.cpp
index 0fd9bee60f4..185206bb401 100644
--- a/engines/ags/lib/allegro/surface_sse2.cpp
+++ b/engines/ags/lib/allegro/surface_sse2.cpp
@@ -1,9 +1,9 @@
 #include <immintrin.h>
-#include "ags/lib/allegro/gfx.h"
-#include "ags/lib/allegro/color.h"
-#include "ags/lib/allegro/flood.h"
 #include "ags/ags.h"
 #include "ags/globals.h"
+#include "ags/lib/allegro/color.h"
+#include "ags/lib/allegro/flood.h"
+#include "ags/lib/allegro/gfx.h"
 #include "common/textconsole.h"
 #include "graphics/screen.h"
 
@@ -77,8 +77,7 @@ inline __m128i rgbBlendSIMD2Bpp(__m128i srcCols, __m128i destCols, __m128i alpha
 	return _mm_or_si128(diffs[0], _mm_slli_epi16(diffs[2], 11));
 }
 
-inline __m128i mul32_as32(__m128i a, __m128i b)
-{
+inline __m128i mul32_as32(__m128i a, __m128i b) {
 	__m128i tmp1 = _mm_mul_epu32(a,b);
 	__m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(a,4), _mm_srli_si128(b,4));
 	return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE (0,0,2,0)), _mm_shuffle_epi32(tmp2, _MM_SHUFFLE (0,0,2,0))); /* shuffle results to [63..0] and pack */
@@ -458,13 +457,13 @@ static void drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
 	const int xDir = args.horizFlip ? -1 : 1;
 	byte rSrc, gSrc, bSrc, aSrc;
 	byte rDest = 0, gDest = 0, bDest = 0, aDest = 0;
-    __m128i tint = _mm_sll_epi32(_mm_set1_epi32(args.srcAlpha), _mm_set1_epi32(24));
+	__m128i tint = _mm_sll_epi32(_mm_set1_epi32(args.srcAlpha), _mm_set1_epi32(24));
 	tint = _mm_or_si128(tint, _mm_sll_epi32(_mm_set1_epi32(args.tintRed), _mm_set1_epi32(16)));
 	tint = _mm_or_si128(tint, _mm_sll_epi32(_mm_set1_epi32(args.tintGreen), _mm_set1_epi32(8)));
 	tint = _mm_or_si128(tint, _mm_set1_epi32(args.tintBlue));
 	__m128i maskedAlphas = _mm_set1_epi32(args.alphaMask);
 	__m128i transColors = _mm_set1_epi32(args.transColor);
-    __m128i alphas = _mm_set1_epi32(args.srcAlpha);
+	__m128i alphas = _mm_set1_epi32(args.srcAlpha);
 
 	// This is so that we can calculate what pixels to crop off in a vectorized way
 	__m128i addIndexes = _mm_set_epi32(3, 2, 1, 0);
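
The mul32_as32 helper touched up above exists because SSE2 has no packed 32-bit low multiply (_mm_mullo_epi32 only arrived with SSE4.1); it builds one from two widening _mm_mul_epu32 multiplies on the even and odd lanes plus a shuffle. A self-contained sketch of the same trick, checked against plain scalar multiplication:

#include <cstdint>
#include <cstdio>
#include <emmintrin.h> // SSE2 only

// Emulate SSE4.1's _mm_mullo_epi32 with SSE2 even/odd widening multiplies.
static __m128i mullo32_sse2(__m128i a, __m128i b) {
	__m128i even = _mm_mul_epu32(a, b);                                       // lanes 0 and 2
	__m128i odd  = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)); // lanes 1 and 3
	return _mm_unpacklo_epi32(_mm_shuffle_epi32(even, _MM_SHUFFLE(0, 0, 2, 0)),
	                          _mm_shuffle_epi32(odd,  _MM_SHUFFLE(0, 0, 2, 0)));
}

int main() {
	uint32_t a[4] = {3, 70000, 0xffffffffu, 123456789u};
	uint32_t b[4] = {5, 70000, 2u, 1000u};
	uint32_t out[4];
	_mm_storeu_si128((__m128i *)out,
	                 mullo32_sse2(_mm_loadu_si128((const __m128i *)a),
	                              _mm_loadu_si128((const __m128i *)b)));
	for (int i = 0; i < 4; i++)
		std::printf("%u == %u\n", out[i], a[i] * b[i]); // low 32 bits match
	return 0;
}
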
diff --git a/engines/ags/lib/std/functional.h b/engines/ags/lib/std/functional.h
index 1d1e3b16e61..ece633814bc 100644
--- a/engines/ags/lib/std/functional.h
+++ b/engines/ags/lib/std/functional.h
@@ -49,7 +49,7 @@ struct function {
 		return *_fn;
 	}
 
-	operator bool() {
+	operator bool() const {
 		return _fn != nullptr;
 	}
 };
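
The functional.h change only adds const to the bool conversion, but without it a const ags-style function object could not be tested for emptiness, since a non-const conversion operator cannot be called on a const object. A minimal illustration of the general C++ rule, using generic code rather than the engine's header:

#include <cassert>

struct Callable {
	void (*fn)() = nullptr;
	// const-qualified so the emptiness check also works on const objects
	operator bool() const { return fn != nullptr; }
};

static void hello() {}

int main() {
	const Callable bound{hello};
	const Callable empty{};
	assert(bound);  // compiles only because operator bool() is const
	assert(!empty);
	return 0;
}
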
diff --git a/engines/ags/tests/test_gfx.cpp b/engines/ags/tests/test_gfx.cpp
index 5c5bb4e9fe9..58fbf009edb 100644
--- a/engines/ags/tests/test_gfx.cpp
+++ b/engines/ags/tests/test_gfx.cpp
@@ -183,32 +183,32 @@ void Test_BlenderModes() {
 
 											int tolerance, tolerance16;
 											switch ((BlenderMode)blenderMode) {
-												// These need to be IDENTICAL for lamplight city to work
-												// It would be nice to get tolerance16 down to 0 though...
-												case kRgbToRgbBlender:
-												case kArgbToRgbBlender:
-												case kSourceAlphaBlender:
-												case kAlphaPreservedBlenderMode:
+											// These need to be IDENTICAL for lamplight city to work
+											// It would be nice to get tolerance16 down to 0 though...
+											case kRgbToRgbBlender:
+											case kArgbToRgbBlender:
+											case kSourceAlphaBlender:
+											case kAlphaPreservedBlenderMode:
 												tolerance = 0;
 												tolerance16 = 1;
 												break;
 
-												// These can be 1 or 2 off, as long as they look the same
-												case kArgbToArgbBlender:
-												case kRgbToArgbBlender:
-												case kOpaqueBlenderMode:
-												case kAdditiveBlenderMode:
+											// These can be 1 or 2 off, as long as they look the same
+											case kArgbToArgbBlender:
+											case kRgbToArgbBlender:
+											case kOpaqueBlenderMode:
+											case kAdditiveBlenderMode:
 												tolerance = 1;
 												tolerance16 = 1;
 												break;
 
-												case kTintBlenderMode:
-												case kTintLightBlenderMode:
+											case kTintBlenderMode:
+											case kTintLightBlenderMode:
 												tolerance = 2;
 												tolerance16 = 1;
 												break;
 
-												default:
+											default:
 												tolerance = 0;
 											}
 											if (std::abs((int)a - (int)(simdCol >> 24)) > tolerance) {




More information about the Scummvm-git-logs mailing list