[Scummvm-git-logs] scummvm master -> 41815c98168008552eeaa85deb8b8af2870285a3

Sat Aug 12 22:22:37 UTC 2023

This automated email contains information about 61 new commits which have been
pushed to the 'scummvm' repo located at https://github.com/scummvm/scummvm .

Summary:
710d9cb954 JANITORIAL: Moved blitting to blit-alpha.cpp
d03727885a GRAPHICS: Created blendBlitFrom in ManagedSurface
f15462079d GRAPHICS: ManagedSurface::blendBlitFrom fix
b34f6f48fd SWORD25: Moved to ManagedSurface
056a0029c6 SWORD25: Tweaked RenderedImage to use pPartRect
d36e603864 GRAPHICS: ManagedSurface::blendBlitFrom started on
d039a8570e TEST: Add test for ManagedSurface::blendBlitFrom
50a28d1554 OSYSTEM: Added SIMD feature flags
f062addfe2 BACKENDS: BaseBackend detect x86 SIMD extensions
bdc6d72e0b ANDROID: Added ARM NEON detection in backend init
c3554dd7f0 BACKENDS: BaseBackend detects NEON on aarch64
ffb845f241 WII: Backend detects Altivec extensions
8cc1ab7efa PS3: Backend now detects altivec extensions
139eb6ad61 MAC: Basic Altivec detection implemented
e7cd583e9f GRAPHICS: Refactor blendBlitUnfiltered
402c67064d GRAPHICS: BlendBlit detects cpu extensions
eebadf4495 GRAPHICS: Optimize alpha blend NEON and Generic
9ad04f8000 GRAPHICS: Optimize BLEND_NORMAL on NEON
c2c7ca0275 GRAPHICS: BlendBlit NEON blending modes coded
6bdeeb506f OSYSTEM: Added SSE4.1 feature flag
2947e87e59 BACKENDS: BaseBackend now detects SSE4.1
c2e4fc9b6a GRAPHICS: SSE2 for BlendBlit and fix NEON bug
4f9739685a TEST: blendBlit test will now also test SSE2
843d835641 TEST: Change blendFrom test
e4c984cd3a BUILD: Configure flags for SIMD extensions
64a6548041 BACKENDS: Fixed BaseBackend AVX2 detection
01e2183161 BUILD: Make ./configure POSIX compliant
90cb8cbe5f GRAPHICS: Fix SSE2 and NEON bug in BlendBlit
39f7202473 GRAPHICS: Add AVX2 support for BlendBlit
e5bc2d696d TEST: Test for AVX2 BlendBlit
a18332ab36 JANITORIAL: Touch up BlendBlit comments
ad0c823f2f GRAPHICS: TransparentSurface scales in place
480a77f310 BUILD: Configure automatically detects SIMD
59fa0a9208 BACKENDS: Cpu feature flags now use bit shifts
f5dfa6b8d1 WII: Removed extra ")" in backend hasFeature
787837ca41 ALL: Add Cpu prefix to SIMD extension features
9b312eb16a BUILD: Removed unnecessary SIMD engine features
975808bac4 GRAPHICS: Moved blit files into graphics/blit
e61fce02c8 GRAPHICS: Fix blend blit indentation
df2367c3c9 GRAPHICS: Change blendBlitFrom's format function
41a942c5ce ALL: Renamed TS_ARGB to MS_ARGB
a64c7ea3b7 GRAPHICS: New blendBlitFrom overload
e95cfb4877 GRAPHICS: Fixed indentation in blit files
e374c4d9cf GRAPHICS: Fixed ManagedSurface::blendBlitFrom
df073eeed7 TEST: Added copyright header to blendBlitFrom test
55550f85ac BACKENDS: Move SIMD detection to more stable SDL2
bf7b6c1cf6 BUILD: Fixed typo in Arm NEON feature
91b9c112b5 TEST: Put bitmap saving under ifdef in blending.h
72c01fcdbe TEST: Remove unnecessary functions in blending.h
66c75ee760 GRAPHICS: Moved SIMD code to new translation unit
0961399727 ALL: blendBlitFrom prototype now matches TS::blit
c45918ea91 BUILD: SIMD flags only enabled on blit-blend.o
f802ad16d0 BUILD: Simplify SIMD options
0597770654 GRAPHICS: Refactor BlendBlit
41f82fbab8 BUILD: Change wording for AVX2 option
4fbde03866 ALL: blendBlitFrom exactly matched TS::blit
b16ae30ebd GRIFFON: Move to using ManagedSurface
5112d79690 GRAPHICS: Fixed ManagedSurface bug
9d354fc636 GRAPHICS: Fixing blendBlitTo bugs
1b752d6a62 BUILD: BlendBlit SIMD only compiles when needed
41815c9816 GRAPHICS: Fix BlendBlit additive blending mode


Commit: 710d9cb95465588751e0106bf50e82e1b4198a26
    https://github.com/scummvm/scummvm/commit/710d9cb95465588751e0106bf50e82e1b4198a26
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
JANITORIAL: Moved blitting to blit-alpha.cpp

Moved lower level blitting functions used by Graphics::TransparentSurface
into blit-alpha.cpp and moved their corresponding declarations into blit.h.

Changed paths:
    graphics/blit-alpha.cpp
    graphics/blit.h
    graphics/transparent_surface.cpp

diff --git a/graphics/blit-alpha.cpp b/graphics/blit-alpha.cpp
index 01cff6d65a0..dcd44dfc528 100644
--- a/graphics/blit-alpha.cpp
+++ b/graphics/blit-alpha.cpp
@@ -24,6 +24,29 @@
 
 namespace Graphics {
 
+static const int kBModShift = 8;
+static const int kGModShift = 16;
+static const int kRModShift = 24;
+static const int kAModShift = 0;
+
+static const uint32 kBModMask = 0x0000ff00;
+static const uint32 kGModMask = 0x00ff0000;
+static const uint32 kRModMask = 0xff000000;
+static const uint32 kAModMask = 0x000000ff;
+static const uint32 kRGBModMask = (kRModMask | kGModMask | kBModMask);
+
+#ifdef SCUMM_LITTLE_ENDIAN
+static const int kAIndex = 0;
+static const int kBIndex = 1;
+static const int kGIndex = 2;
+static const int kRIndex = 3;
+#else
+static const int kAIndex = 3;
+static const int kBIndex = 2;
+static const int kGIndex = 1;
+static const int kRIndex = 0;
+#endif
+
 namespace {
 
 template<typename Size, bool overwriteAlpha>
@@ -167,4 +190,441 @@ bool setAlpha(byte *dst, const byte *src,
 	return true;
 }
 
+/**
+ * Optimized version of doBlit to be used with multiply blended blitting
+ */
+template<bool rgbmod, bool alphamod>
+static void doBlitMultiplyBlendLogic(byte *ino, byte *outo,
+									 uint32 width, uint32 height,
+									 uint32 outPitch, int32 inStep,
+									 int32 inoStep, uint32 color) {
+
+	byte *in;
+	byte *out;
+
+	byte ca = alphamod ? ((color >> kAModShift) & 0xFF) : 255;
+	byte cr = rgbmod   ? ((color >> kRModShift) & 0xFF) : 255;
+	byte cg = rgbmod   ? ((color >> kGModShift) & 0xFF) : 255;
+	byte cb = rgbmod   ? ((color >> kBModShift) & 0xFF) : 255;
+
+	for (uint32 i = 0; i < height; i++) {
+		out = outo;
+		in = ino;
+		for (uint32 j = 0; j < width; j++) {
+
+			uint32 ina = in[kAIndex] * ca >> 8;
+
+			if (ina != 0) {
+				if (cb != 255) {
+					out[kBIndex] = MIN<uint>(out[kBIndex] * ((in[kBIndex] * cb * ina) >> 16) >> 8, 255u);
+				} else {
+					out[kBIndex] = MIN<uint>(out[kBIndex] * (in[kBIndex] * ina >> 8) >> 8, 255u);
+				}
+
+				if (cg != 255) {
+					out[kGIndex] = MIN<uint>(out[kGIndex] * ((in[kGIndex] * cg * ina) >> 16) >> 8, 255u);
+				} else {
+					out[kGIndex] = MIN<uint>(out[kGIndex] * (in[kGIndex] * ina >> 8) >> 8, 255u);
+				}
+
+				if (cr != 255) {
+					out[kRIndex] = MIN<uint>(out[kRIndex] * ((in[kRIndex] * cr * ina) >> 16) >> 8, 255u);
+				} else {
+					out[kRIndex] = MIN<uint>(out[kRIndex] * (in[kRIndex] * ina >> 8) >> 8, 255u);
+				}
+			}
+
+			in += inStep;
+			out += 4;
+		}
+		outo += outPitch;
+		ino += inoStep;
+	}
+
+}
+
+// Only blits to and from 32bpp images
+void multiplyBlendBlit(byte *dst, byte *src,
+					   const uint dstPitch, const uint srcPitch,
+					   const int posX, const int posY,
+					   const uint width, const uint height,
+					   const uint32 colorMod, const uint flipping) {
+	bool rgbmod   = ((colorMod & kRGBModMask) != kRGBModMask);
+	bool alphamod = ((colorMod & kAModMask)   != kAModMask);
+
+	int xp = 0, yp = 0;
+
+	int inStep = 4;
+	int inoStep = srcPitch;
+	if (flipping & FLIP_H) {
+		inStep = -inStep;
+		xp = width - 1;
+	}
+
+	if (flipping & FLIP_V) {
+		inoStep = -inoStep;
+		yp = height - 1;
+	}
+
+	byte *ino = src + yp * srcPitch + xp * 4;
+	byte *outo = dst + posY * dstPitch + posX * 4;
+
+	if (rgbmod) {
+		if (alphamod) {
+			doBlitMultiplyBlendLogic<true, true>(ino, outo, width, height, dstPitch, inStep, inoStep, colorMod);
+		} else {
+			doBlitMultiplyBlendLogic<true, false>(ino, outo, width, height, dstPitch, inStep, inoStep, colorMod);
+		}
+	} else {
+		if (alphamod) {
+			doBlitMultiplyBlendLogic<false, true>(ino, outo, width, height, dstPitch, inStep, inoStep, colorMod);
+		} else {
+			doBlitMultiplyBlendLogic<false, false>(ino, outo, width, height, dstPitch, inStep, inoStep, colorMod);
+		}
+	}
+}
+
+/**
+ * Optimized version of doBlit to be used with alpha blended blitting
+ * @param ino a pointer to the input surface
+ * @param outo a pointer to the output surface
+ * @param width width of the input surface
+ * @param height height of the input surface
+ * @param pitch pitch of the output surface - that is, width in bytes of every row, usually bpp * width of the TARGET surface (the area we are blitting to might be smaller, do the math)
+ * @param inStep size in bytes to skip to address each pixel, usually bpp of the source surface
+ * @param inoStep width in bytes of every row on the *input* surface / kind of like pitch
+ * @param color colormod in 0xAARRGGBB format - 0xFFFFFFFF for no colormod
+ */
+template<bool rgbmod, bool alphamod>
+static void doBlitAlphaBlendLogic(byte *ino, byte *outo, uint32 width, uint32 height, uint32 pitch, int32 inStep, int32 inoStep, uint32 color) {
+
+	byte *in;
+	byte *out;
+
+	byte ca = alphamod ? ((color >> kAModShift) & 0xFF) : 255;
+	byte cr = rgbmod   ? ((color >> kRModShift) & 0xFF) : 255;
+	byte cg = rgbmod   ? ((color >> kGModShift) & 0xFF) : 255;
+	byte cb = rgbmod   ? ((color >> kBModShift) & 0xFF) : 255;
+
+	for (uint32 i = 0; i < height; i++) {
+		out = outo;
+		in = ino;
+		for (uint32 j = 0; j < width; j++) {
+
+			uint32 ina = in[kAIndex] * ca >> 8;
+
+			if (ina != 0) {
+				uint outb = (out[kBIndex] * (255 - ina) >> 8);
+				uint outg = (out[kGIndex] * (255 - ina) >> 8);
+				uint outr = (out[kRIndex] * (255 - ina) >> 8);
+
+				out[kAIndex] = 255;
+				out[kBIndex] = outb + (in[kBIndex] * ina * cb >> 16);
+				out[kGIndex] = outg + (in[kGIndex] * ina * cg >> 16);
+				out[kRIndex] = outr + (in[kRIndex] * ina * cr >> 16);
+			}
+
+			in += inStep;
+			out += 4;
+		}
+		outo += pitch;
+		ino += inoStep;
+	}
+}
+
+// Only blits to and from 32bpp images
+void alphaBlendBlit(byte *dst, byte *src,
+					const uint dstPitch, const uint srcPitch,
+					const int posX, const int posY,
+					const uint width, const uint height,
+					const uint32 colorMod, const uint flipping) {
+	bool rgbmod   = ((colorMod & kRGBModMask) != kRGBModMask);
+	bool alphamod = ((colorMod & kAModMask)   != kAModMask);
+
+	int xp = 0, yp = 0;
+
+	int inStep = 4;
+	int inoStep = srcPitch;
+	if (flipping & FLIP_H) {
+		inStep = -inStep;
+		xp = width - 1;
+	}
+
+	if (flipping & FLIP_V) {
+		inoStep = -inoStep;
+		yp = height - 1;
+	}
+
+	byte *ino = src + yp * srcPitch + xp * 4;
+	byte *outo = dst + posY * dstPitch + posX * 4;
+
+	if (rgbmod) {
+		if (alphamod) {
+			doBlitAlphaBlendLogic<true, true>(ino, outo, width, height, dstPitch, inStep, inoStep, colorMod);
+		} else {
+			doBlitAlphaBlendLogic<true, false>(ino, outo, width, height, dstPitch, inStep, inoStep, colorMod);
+		}
+	} else {
+		if (alphamod) {
+			doBlitAlphaBlendLogic<false, true>(ino, outo, width, height, dstPitch, inStep, inoStep, colorMod);
+		} else {
+			doBlitAlphaBlendLogic<false, false>(ino, outo, width, height, dstPitch, inStep, inoStep, colorMod);
+		}
+	}
+}
+
+/**
+ * Optimized version of doBlit to be used with subtractive blended blitting
+ */
+template<bool rgbmod>
+static void doBlitSubtractiveBlendLogic(byte *ino, byte *outo,
+										uint32 width, uint32 height,
+										uint32 pitch, int32 inStep,
+										int32 inoStep, uint32 color) {
+	byte *in;
+	byte *out;
+
+	byte cr = rgbmod   ? ((color >> kRModShift) & 0xFF) : 255;
+	byte cg = rgbmod   ? ((color >> kGModShift) & 0xFF) : 255;
+	byte cb = rgbmod   ? ((color >> kBModShift) & 0xFF) : 255;
+
+	for (uint32 i = 0; i < height; i++) {
+		out = outo;
+		in = ino;
+		for (uint32 j = 0; j < width; j++) {
+
+			out[kAIndex] = 255;
+			if (cb != 255) {
+				out[kBIndex] = MAX(out[kBIndex] - ((in[kBIndex] * cb  * (out[kBIndex]) * in[kAIndex]) >> 24), 0);
+			} else {
+				out[kBIndex] = MAX(out[kBIndex] - (in[kBIndex] * (out[kBIndex]) * in[kAIndex] >> 16), 0);
+			}
+
+			if (cg != 255) {
+				out[kGIndex] = MAX(out[kGIndex] - ((in[kGIndex] * cg  * (out[kGIndex]) * in[kAIndex]) >> 24), 0);
+			} else {
+				out[kGIndex] = MAX(out[kGIndex] - (in[kGIndex] * (out[kGIndex]) * in[kAIndex] >> 16), 0);
+			}
+
+			if (cr != 255) {
+				out[kRIndex] = MAX(out[kRIndex] - ((in[kRIndex] * cr * (out[kRIndex]) * in[kAIndex]) >> 24), 0);
+			} else {
+				out[kRIndex] = MAX(out[kRIndex] - (in[kRIndex] * (out[kRIndex]) * in[kAIndex] >> 16), 0);
+			}
+
+			in += inStep;
+			out += 4;
+		}
+		outo += pitch;
+		ino += inoStep;
+	}
+}
+
+// Only blits to and from 32bpp images
+void subtractiveBlendBlit(byte *dst, byte *src,
+						  const uint dstPitch, const uint srcPitch,
+						  const int posX, const int posY,
+						  const uint width, const uint height,
+						  const uint32 colorMod, const uint flipping) {
+	bool rgbmod   = ((colorMod & kRGBModMask) != kRGBModMask);
+
+	int xp = 0, yp = 0;
+
+	int inStep = 4;
+	int inoStep = srcPitch;
+	if (flipping & FLIP_H) {
+		inStep = -inStep;
+		xp = width - 1;
+	}
+
+	if (flipping & FLIP_V) {
+		inoStep = -inoStep;
+		yp = height - 1;
+	}
+
+	byte *ino = src + yp * srcPitch + xp * 4;
+	byte *outo = dst + posY * dstPitch + posX * 4;
+
+	if (rgbmod) {
+		doBlitSubtractiveBlendLogic<true>(ino, outo, width, height, dstPitch, inStep, inoStep, colorMod);
+	} else {
+		doBlitSubtractiveBlendLogic<false>(ino, outo, width, height, dstPitch, inStep, inoStep, colorMod);
+	}
+}
+
+/**
+ * Optimized version of doBlit to be used with additive blended blitting
+ */
+template<bool rgbmod, bool alphamod>
+static void doBlitAdditiveBlendLogic(byte *ino, byte *outo,
+									 uint32 width, uint32 height, uint32 pitch,
+									 int32 inStep, int32 inoStep, uint32 color) {
+
+	byte *in;
+	byte *out;
+
+	byte ca = alphamod ? ((color >> kAModShift) & 0xFF) : 255;
+	byte cr = rgbmod   ? ((color >> kRModShift) & 0xFF) : 255;
+	byte cg = rgbmod   ? ((color >> kGModShift) & 0xFF) : 255;
+	byte cb = rgbmod   ? ((color >> kBModShift) & 0xFF) : 255;
+
+	for (uint32 i = 0; i < height; i++) {
+		out = outo;
+		in = ino;
+		for (uint32 j = 0; j < width; j++) {
+
+			uint32 ina = in[kAIndex] * ca >> 8;
+
+			if (ina != 0) {
+				if (cb != 255) {
+					out[kBIndex] = MIN<uint>(out[kBIndex] + ((in[kBIndex] * cb * ina) >> 16), 255u);
+				} else {
+					out[kBIndex] = MIN<uint>(out[kBIndex] + (in[kBIndex] * ina >> 8), 255u);
+				}
+
+				if (cg != 255) {
+					out[kGIndex] = MIN<uint>(out[kGIndex] + ((in[kGIndex] * cg * ina) >> 16), 255u);
+				} else {
+					out[kGIndex] = MIN<uint>(out[kGIndex] + (in[kGIndex] * ina >> 8), 255u);
+				}
+
+				if (cr != 255) {
+					out[kRIndex] = MIN<uint>(out[kRIndex] + ((in[kRIndex] * cr * ina) >> 16), 255u);
+				} else {
+					out[kRIndex] = MIN<uint>(out[kRIndex] + (in[kRIndex] * ina >> 8), 255u);
+				}
+			}
+
+			in += inStep;
+			out += 4;
+		}
+
+		outo += pitch;
+		ino += inoStep;
+	}
+}
+
+// Only blits to and from 32bpp images
+void additiveBlendBlit(byte *dst, byte *src,
+					   const uint dstPitch, const uint srcPitch,
+					   const int posX, const int posY,
+					   const uint width, const uint height,
+					   const uint32 colorMod, const uint flipping) {
+	bool rgbmod   = ((colorMod & kRGBModMask) != kRGBModMask);
+	bool alphamod = ((colorMod & kAModMask)   != kAModMask);
+
+	int xp = 0, yp = 0;
+
+	int inStep = 4;
+	int inoStep = srcPitch;
+	if (flipping & FLIP_H) {
+		inStep = -inStep;
+		xp = width - 1;
+	}
+
+	if (flipping & FLIP_V) {
+		inoStep = -inoStep;
+		yp = height - 1;
+	}
+
+	byte *ino = src + yp * srcPitch + xp * 4;
+	byte *outo = dst + posY * dstPitch + posX * 4;
+
+	if (rgbmod) {
+		if (alphamod) {
+			doBlitAdditiveBlendLogic<true, true>(ino, outo, width, height, dstPitch, inStep, inoStep, colorMod);
+		} else {
+			doBlitAdditiveBlendLogic<true, false>(ino, outo, width, height, dstPitch, inStep, inoStep, colorMod);
+		}
+	} else {
+		if (alphamod) {
+			doBlitAdditiveBlendLogic<false, true>(ino, outo, width, height, dstPitch, inStep, inoStep, colorMod);
+		} else {
+			doBlitAdditiveBlendLogic<false, false>(ino, outo, width, height, dstPitch, inStep, inoStep, colorMod);
+		}
+	}
+}
+
+void opaqueBlendBlit(byte *dst, byte *src,
+					 const uint dstPitch, const uint srcPitch,
+					 const int posX, const int posY,
+					 const uint width, const uint height,
+					 const uint32 colorMod, const uint flipping) {
+	int xp = 0, yp = 0;
+
+	int inStep = 4;
+	int inoStep = srcPitch;
+	if (flipping & FLIP_H) {
+		inStep = -inStep;
+		xp = width - 1;
+	}
+
+	if (flipping & FLIP_V) {
+		inoStep = -inoStep;
+		yp = height - 1;
+	}
+
+	byte *ino = src + yp * srcPitch + xp * 4;
+	byte *outo = dst + posY * dstPitch + posX * 4;
+	
+	byte *in;
+	byte *out;
+
+	for (uint32 i = 0; i < height; i++) {
+		out = outo;
+		in = ino;
+		memcpy(out, in, width * 4);
+		for (uint32 j = 0; j < width; j++) {
+			out[kAIndex] = 0xFF;
+			out += 4;
+		}
+		outo += dstPitch;
+		ino += inoStep;
+	}
+}
+
+void binaryBlendBlit(byte *dst, byte *src,
+					 const uint dstPitch, const uint srcPitch,
+					 const int posX, const int posY,
+					 const uint width, const uint height,
+					 const uint32 colorMod, const uint flipping) {
+	int xp = 0, yp = 0;
+
+	int inStep = 4;
+	int inoStep = srcPitch;
+	if (flipping & FLIP_H) {
+		inStep = -inStep;
+		xp = width - 1;
+	}
+
+	if (flipping & FLIP_V) {
+		inoStep = -inoStep;
+		yp = height - 1;
+	}
+
+	byte *ino = src + yp * srcPitch + xp * 4;
+	byte *outo = dst + posY * dstPitch + posX * 4;
+
+	byte *in;
+	byte *out;
+
+	for (uint32 i = 0; i < height; i++) {
+		out = outo;
+		in = ino;
+		for (uint32 j = 0; j < width; j++) {
+			uint32 pix = *(uint32 *)in;
+			int a = in[kAIndex];
+
+			if (a != 0) {   // Full opacity (Any value not exactly 0 is Opaque here)
+				*(uint32 *)out = pix;
+				out[kAIndex] = 0xFF;
+			}
+			out += 4;
+			in += inStep;
+		}
+		outo += dstPitch;
+		ino += inoStep;
+	}
+}
+
 } // End of namespace Graphics
diff --git a/graphics/blit.h b/graphics/blit.h
index 395cefbeac7..79c3d4de39d 100644
--- a/graphics/blit.h
+++ b/graphics/blit.h
@@ -23,6 +23,7 @@
 #define GRAPHICS_BLIT_H
 
 #include "graphics/pixelformat.h"
+#include "graphics/transform_struct.h"
 
 namespace Common {
 struct Point;
@@ -190,6 +191,58 @@ bool setAlpha(byte *dst, const byte *src,
               const Graphics::PixelFormat &format,
               const bool skipTransparent, const uint8 alpha);
 
+/**
+ * Returns the pixel format all operations of TransparentSurface support.
+ *
+ * Unlike Surface TransparentSurface only works with a fixed pixel format.
+ * This format can be queried using this static function.
+ *
+ * @return Supported pixel format.
+ */
+inline PixelFormat getSupportedBlenderPixelFormat() {
+	return PixelFormat(4, 8, 8, 8, 8, 24, 16, 8, 0);
+}
+
+void opaqueBlendBlit(byte *dst, byte *src,
+					 const uint dstPitch, const uint srcPitch,
+					 const int posX, const int posY,
+					 const uint width, const uint height,
+					 const uint32 colorMod = 0, const uint flipping = FLIP_NONE);
+
+void binaryBlendBlit(byte *dst, byte *src,
+					 const uint dstPitch, const uint srcPitch,
+					 const int posX, const int posY,
+					 const uint width, const uint height,
+					 const uint32 colorMod = 0, const uint flipping = FLIP_NONE);
+
+// Only blits to and from 32bpp images
+void multiplyBlendBlit(byte *dst, byte *src,
+					   const uint dstPitch, const uint srcPitch,
+					   const int posX, const int posY,
+					   const uint width, const uint height,
+					   const uint32 colorMod = 0, const uint flipping = FLIP_NONE);
+
+// Only blits to and from 32bpp images
+void subtractiveBlendBlit(byte *dst, byte *src,
+						  const uint dstPitch, const uint srcPitch,
+						  const int posX, const int posY,
+						  const uint width, const uint height,
+						  const uint32 colorMod = 0, const uint flipping = FLIP_NONE);
+
+// Only blits to and from 32bpp images
+void additiveBlendBlit(byte *dst, byte *src,
+					   const uint dstPitch, const uint srcPitch,
+					   const int posX, const int posY,
+					   const uint width, const uint height,
+					   const uint32 colorMod = 0, const uint flipping = FLIP_NONE);
+
+// Only blits to and from 32bpp images
+void alphaBlendBlit(byte *dst, byte *src,
+					const uint dstPitch, const uint srcPitch,
+					const int posX, const int posY,
+					const uint width, const uint height,
+					const uint32 colorMod = 0, const uint flipping = FLIP_NONE);
+
 /** @} */
 } // End of namespace Graphics
 
diff --git a/graphics/transparent_surface.cpp b/graphics/transparent_surface.cpp
index 66a1ed61652..6aea7ec54d9 100644
--- a/graphics/transparent_surface.cpp
+++ b/graphics/transparent_surface.cpp
@@ -34,29 +34,7 @@
 
 namespace Graphics {
 
-static const int kBModShift = 8;//img->format.bShift;
-static const int kGModShift = 16;//img->format.gShift;
-static const int kRModShift = 24;//img->format.rShift;
-static const int kAModShift = 0;//img->format.aShift;
-
-static const uint32 kBModMask = 0x0000ff00;
-static const uint32 kGModMask = 0x00ff0000;
-static const uint32 kRModMask = 0xff000000;
-static const uint32 kAModMask = 0x000000ff;
-static const uint32 kRGBModMask = (kRModMask | kGModMask | kBModMask);
-
-#ifdef SCUMM_LITTLE_ENDIAN
-static const int kAIndex = 0;
-static const int kBIndex = 1;
-static const int kGIndex = 2;
-static const int kRIndex = 3;
-
-#else
-static const int kAIndex = 3;
-static const int kBIndex = 2;
-static const int kGIndex = 1;
-static const int kRIndex = 0;
-#endif
+static const int kAModShift = 0;
 
 TransparentSurface::TransparentSurface() : Surface(), _alphaMode(ALPHA_FULL) {}
 
@@ -75,318 +53,6 @@ TransparentSurface::TransparentSurface(const Surface &surf, bool copyData) : Sur
 	}
 }
 
-/**
- * Optimized version of doBlit to be used w/opaque blitting (no alpha).
- */
-static void doBlitOpaqueFast(byte *ino, byte *outo, uint32 width, uint32 height, uint32 pitch, int32 inStep, int32 inoStep) {
-
-	byte *in;
-	byte *out;
-
-	for (uint32 i = 0; i < height; i++) {
-		out = outo;
-		in = ino;
-		memcpy(out, in, width * 4);
-		for (uint32 j = 0; j < width; j++) {
-			out[kAIndex] = 0xFF;
-			out += 4;
-		}
-		outo += pitch;
-		ino += inoStep;
-	}
-}
-
-/**
- * Optimized version of doBlit to be used w/binary blitting (blit or no-blit, no blending).
- */
-static void doBlitBinaryFast(byte *ino, byte *outo, uint32 width, uint32 height, uint32 pitch, int32 inStep, int32 inoStep) {
-
-	byte *in;
-	byte *out;
-
-	for (uint32 i = 0; i < height; i++) {
-		out = outo;
-		in = ino;
-		for (uint32 j = 0; j < width; j++) {
-			uint32 pix = *(uint32 *)in;
-			int a = in[kAIndex];
-
-			if (a != 0) {   // Full opacity (Any value not exactly 0 is Opaque here)
-				*(uint32 *)out = pix;
-				out[kAIndex] = 0xFF;
-			}
-			out += 4;
-			in += inStep;
-		}
-		outo += pitch;
-		ino += inoStep;
-	}
-}
-
-/**
- * Optimized version of doBlit to be used with alpha blended blitting
- * @param ino a pointer to the input surface
- * @param outo a pointer to the output surface
- * @param width width of the input surface
- * @param height height of the input surface
- * @param pitch pitch of the output surface - that is, width in bytes of every row, usually bpp * width of the TARGET surface (the area we are blitting to might be smaller, do the math)
- * @inStep size in bytes to skip to address each pixel, usually bpp of the source surface
- * @inoStep width in bytes of every row on the *input* surface / kind of like pitch
- * @color colormod in 0xAARRGGBB format - 0xFFFFFFFF for no colormod
- */
-template<bool rgbmod, bool alphamod>
-static void doBlitAlphaBlendImpl(byte *ino, byte *outo, uint32 width, uint32 height, uint32 pitch, int32 inStep, int32 inoStep, uint32 color) {
-
-	byte *in;
-	byte *out;
-
-	byte ca = alphamod ? ((color >> kAModShift) & 0xFF) : 255;
-	byte cr = rgbmod   ? ((color >> kRModShift) & 0xFF) : 255;
-	byte cg = rgbmod   ? ((color >> kGModShift) & 0xFF) : 255;
-	byte cb = rgbmod   ? ((color >> kBModShift) & 0xFF) : 255;
-
-	for (uint32 i = 0; i < height; i++) {
-		out = outo;
-		in = ino;
-		for (uint32 j = 0; j < width; j++) {
-
-			uint32 ina = in[kAIndex] * ca >> 8;
-
-			if (ina != 0) {
-				uint outb = (out[kBIndex] * (255 - ina) >> 8);
-				uint outg = (out[kGIndex] * (255 - ina) >> 8);
-				uint outr = (out[kRIndex] * (255 - ina) >> 8);
-
-				out[kAIndex] = 255;
-				out[kBIndex] = outb + (in[kBIndex] * ina * cb >> 16);
-				out[kGIndex] = outg + (in[kGIndex] * ina * cg >> 16);
-				out[kRIndex] = outr + (in[kRIndex] * ina * cr >> 16);
-			}
-
-			in += inStep;
-			out += 4;
-		}
-		outo += pitch;
-		ino += inoStep;
-	}
-}
-
-static void doBlitAlphaBlend(byte *ino, byte *outo, uint32 width, uint32 height, uint32 pitch, int32 inStep, int32 inoStep, uint32 color) {
-
-	bool rgbmod   = ((color & kRGBModMask) != kRGBModMask);
-	bool alphamod = ((color & kAModMask)   != kAModMask);
-
-	if (rgbmod) {
-		if (alphamod) {
-			doBlitAlphaBlendImpl<true, true>(ino, outo, width, height, pitch, inStep, inoStep, color);
-		} else {
-			doBlitAlphaBlendImpl<true, false>(ino, outo, width, height, pitch, inStep, inoStep, color);
-		}
-	} else {
-		if (alphamod) {
-			doBlitAlphaBlendImpl<false, true>(ino, outo, width, height, pitch, inStep, inoStep, color);
-		} else {
-			doBlitAlphaBlendImpl<false, false>(ino, outo, width, height, pitch, inStep, inoStep, color);
-		}
-	}
-}
-
-/**
- * Optimized version of doBlit to be used with additive blended blitting
- */
-template<bool rgbmod, bool alphamod>
-static void doBlitAdditiveBlendImpl(byte *ino, byte *outo, uint32 width, uint32 height, uint32 pitch, int32 inStep, int32 inoStep, uint32 color) {
-
-	byte *in;
-	byte *out;
-
-	byte ca = alphamod ? ((color >> kAModShift) & 0xFF) : 255;
-	byte cr = rgbmod   ? ((color >> kRModShift) & 0xFF) : 255;
-	byte cg = rgbmod   ? ((color >> kGModShift) & 0xFF) : 255;
-	byte cb = rgbmod   ? ((color >> kBModShift) & 0xFF) : 255;
-
-	for (uint32 i = 0; i < height; i++) {
-		out = outo;
-		in = ino;
-		for (uint32 j = 0; j < width; j++) {
-
-			uint32 ina = in[kAIndex] * ca >> 8;
-
-			if (ina != 0) {
-				if (cb != 255) {
-					out[kBIndex] = MIN<uint>(out[kBIndex] + ((in[kBIndex] * cb * ina) >> 16), 255u);
-				} else {
-					out[kBIndex] = MIN<uint>(out[kBIndex] + (in[kBIndex] * ina >> 8), 255u);
-				}
-
-				if (cg != 255) {
-					out[kGIndex] = MIN<uint>(out[kGIndex] + ((in[kGIndex] * cg * ina) >> 16), 255u);
-				} else {
-					out[kGIndex] = MIN<uint>(out[kGIndex] + (in[kGIndex] * ina >> 8), 255u);
-				}
-
-				if (cr != 255) {
-					out[kRIndex] = MIN<uint>(out[kRIndex] + ((in[kRIndex] * cr * ina) >> 16), 255u);
-				} else {
-					out[kRIndex] = MIN<uint>(out[kRIndex] + (in[kRIndex] * ina >> 8), 255u);
-				}
-			}
-
-			in += inStep;
-			out += 4;
-		}
-
-		outo += pitch;
-		ino += inoStep;
-	}
-}
-
-static void doBlitAdditiveBlend(byte *ino, byte *outo, uint32 width, uint32 height, uint32 pitch, int32 inStep, int32 inoStep, uint32 color) {
-
-	bool rgbmod   = ((color & kRGBModMask) != kRGBModMask);
-	bool alphamod = ((color & kAModMask)   != kAModMask);
-
-	if (rgbmod) {
-		if (alphamod) {
-			doBlitAdditiveBlendImpl<true, true>(ino, outo, width, height, pitch, inStep, inoStep, color);
-		} else {
-			doBlitAdditiveBlendImpl<true, false>(ino, outo, width, height, pitch, inStep, inoStep, color);
-		}
-	} else {
-		if (alphamod) {
-			doBlitAdditiveBlendImpl<false, true>(ino, outo, width, height, pitch, inStep, inoStep, color);
-		} else {
-			doBlitAdditiveBlendImpl<false, false>(ino, outo, width, height, pitch, inStep, inoStep, color);
-		}
-	}
-}
-
-/**
- * Optimized version of doBlit to be used with subtractive blended blitting
- */
-template<bool rgbmod>
-static void doBlitSubtractiveBlendImpl(byte *ino, byte *outo, uint32 width, uint32 height, uint32 pitch, int32 inStep, int32 inoStep, uint32 color) {
-
-	byte *in;
-	byte *out;
-
-	byte cr = rgbmod   ? ((color >> kRModShift) & 0xFF) : 255;
-	byte cg = rgbmod   ? ((color >> kGModShift) & 0xFF) : 255;
-	byte cb = rgbmod   ? ((color >> kBModShift) & 0xFF) : 255;
-
-	for (uint32 i = 0; i < height; i++) {
-		out = outo;
-		in = ino;
-		for (uint32 j = 0; j < width; j++) {
-
-			out[kAIndex] = 255;
-			if (cb != 255) {
-				out[kBIndex] = MAX(out[kBIndex] - ((in[kBIndex] * cb  * (out[kBIndex]) * in[kAIndex]) >> 24), 0);
-			} else {
-				out[kBIndex] = MAX(out[kBIndex] - (in[kBIndex] * (out[kBIndex]) * in[kAIndex] >> 16), 0);
-			}
-
-			if (cg != 255) {
-				out[kGIndex] = MAX(out[kGIndex] - ((in[kGIndex] * cg  * (out[kGIndex]) * in[kAIndex]) >> 24), 0);
-			} else {
-				out[kGIndex] = MAX(out[kGIndex] - (in[kGIndex] * (out[kGIndex]) * in[kAIndex] >> 16), 0);
-			}
-
-			if (cr != 255) {
-				out[kRIndex] = MAX(out[kRIndex] - ((in[kRIndex] * cr * (out[kRIndex]) * in[kAIndex]) >> 24), 0);
-			} else {
-				out[kRIndex] = MAX(out[kRIndex] - (in[kRIndex] * (out[kRIndex]) * in[kAIndex] >> 16), 0);
-			}
-
-			in += inStep;
-			out += 4;
-		}
-		outo += pitch;
-		ino += inoStep;
-	}
-}
-
-static void doBlitSubtractiveBlend(byte *ino, byte *outo, uint32 width, uint32 height, uint32 pitch, int32 inStep, int32 inoStep, uint32 color) {
-
-	bool rgbmod   = ((color & kRGBModMask) != kRGBModMask);
-
-	if (rgbmod) {
-		doBlitSubtractiveBlendImpl<true>(ino, outo, width, height, pitch, inStep, inoStep, color);
-	} else {
-		doBlitSubtractiveBlendImpl<false>(ino, outo, width, height, pitch, inStep, inoStep, color);
-	}
-}
-
-/**
- * Optimized version of doBlit to be used with multiply blended blitting
- */
-template<bool rgbmod, bool alphamod>
-static void doBlitMultiplyBlendImpl(byte *ino, byte *outo, uint32 width, uint32 height, uint32 pitch, int32 inStep, int32 inoStep, uint32 color) {
-
-	byte *in;
-	byte *out;
-
-	byte ca = alphamod ? ((color >> kAModShift) & 0xFF) : 255;
-	byte cr = rgbmod   ? ((color >> kRModShift) & 0xFF) : 255;
-	byte cg = rgbmod   ? ((color >> kGModShift) & 0xFF) : 255;
-	byte cb = rgbmod   ? ((color >> kBModShift) & 0xFF) : 255;
-
-	for (uint32 i = 0; i < height; i++) {
-		out = outo;
-		in = ino;
-		for (uint32 j = 0; j < width; j++) {
-
-			uint32 ina = in[kAIndex] * ca >> 8;
-
-			if (ina != 0) {
-				if (cb != 255) {
-					out[kBIndex] = MIN<uint>(out[kBIndex] * ((in[kBIndex] * cb * ina) >> 16) >> 8, 255u);
-				} else {
-					out[kBIndex] = MIN<uint>(out[kBIndex] * (in[kBIndex] * ina >> 8) >> 8, 255u);
-				}
-
-				if (cg != 255) {
-					out[kGIndex] = MIN<uint>(out[kGIndex] * ((in[kGIndex] * cg * ina) >> 16) >> 8, 255u);
-				} else {
-					out[kGIndex] = MIN<uint>(out[kGIndex] * (in[kGIndex] * ina >> 8) >> 8, 255u);
-				}
-
-				if (cr != 255) {
-					out[kRIndex] = MIN<uint>(out[kRIndex] * ((in[kRIndex] * cr * ina) >> 16) >> 8, 255u);
-				} else {
-					out[kRIndex] = MIN<uint>(out[kRIndex] * (in[kRIndex] * ina >> 8) >> 8, 255u);
-				}
-			}
-
-			in += inStep;
-			out += 4;
-		}
-		outo += pitch;
-		ino += inoStep;
-	}
-
-}
-
-static void doBlitMultiplyBlend(byte *ino, byte *outo, uint32 width, uint32 height, uint32 pitch, int32 inStep, int32 inoStep, uint32 color) {
-
-	bool rgbmod   = ((color & kRGBModMask) != kRGBModMask);
-	bool alphamod = ((color & kAModMask)   != kAModMask);
-
-	if (rgbmod) {
-		if (alphamod) {
-			doBlitMultiplyBlendImpl<true, true>(ino, outo, width, height, pitch, inStep, inoStep, color);
-		} else {
-			doBlitMultiplyBlendImpl<true, false>(ino, outo, width, height, pitch, inStep, inoStep, color);
-		}
-	} else {
-		if (alphamod) {
-			doBlitMultiplyBlendImpl<false, true>(ino, outo, width, height, pitch, inStep, inoStep, color);
-		} else {
-			doBlitMultiplyBlendImpl<false, false>(ino, outo, width, height, pitch, inStep, inoStep, color);
-		}
-	}
-}
-
 Common::Rect TransparentSurface::blit(Graphics::Surface &target, int posX, int posY, int flipping, Common::Rect *pPartRect, uint color, int width, int height, TSpriteBlendMode blendMode) {
 
 	Common::Rect retSize;
@@ -487,37 +153,44 @@ Common::Rect TransparentSurface::blit(Graphics::Surface &target, int posX, int p
 
 	// Flip surface
 	if ((img->w > 0) && (img->h > 0)) {
-		int xp = 0, yp = 0;
-
-		int inStep = 4;
-		int inoStep = img->pitch;
-		if (flipping & FLIP_H) {
-			inStep = -inStep;
-			xp = img->w - 1;
-		}
-
-		if (flipping & FLIP_V) {
-			inoStep = -inoStep;
-			yp = img->h - 1;
-		}
-
-		byte *ino = (byte *)img->getBasePtr(xp, yp);
-		byte *outo = (byte *)target.getBasePtr(posX, posY);
-
 		if (color == 0xFFFFFFFF && blendMode == BLEND_NORMAL && _alphaMode == ALPHA_OPAQUE) {
-			doBlitOpaqueFast(ino, outo, img->w, img->h, target.pitch, inStep, inoStep);
+			Graphics::opaqueBlendBlit(
+				(byte *)target.getBasePtr(0, 0),
+				(byte *)img->getBasePtr(0, 0),
+				target.pitch, img->pitch,
+				posX, posY, img->w, img->h, color, flipping);
 		} else if (color == 0xFFFFFFFF && blendMode == BLEND_NORMAL && _alphaMode == ALPHA_BINARY) {
-			doBlitBinaryFast(ino, outo, img->w, img->h, target.pitch, inStep, inoStep);
+			Graphics::binaryBlendBlit(
+				(byte *)target.getBasePtr(0, 0),
+				(byte *)img->getBasePtr(0, 0),
+				target.pitch, img->pitch,
+				posX, posY, img->w, img->h, color, flipping);
 		} else {
 			if (blendMode == BLEND_ADDITIVE) {
-				doBlitAdditiveBlend(ino, outo, img->w, img->h, target.pitch, inStep, inoStep, color);
+				Graphics::additiveBlendBlit(
+					(byte *)target.getBasePtr(0, 0),
+					(byte *)img->getBasePtr(0, 0),
+					target.pitch, img->pitch,
+					posX, posY, img->w, img->h, color, flipping);
 			} else if (blendMode == BLEND_SUBTRACTIVE) {
-				doBlitSubtractiveBlend(ino, outo, img->w, img->h, target.pitch, inStep, inoStep, color);
+				Graphics::subtractiveBlendBlit(
+					(byte *)target.getBasePtr(0, 0),
+					(byte *)img->getBasePtr(0, 0),
+					target.pitch, img->pitch,
+					posX, posY, img->w, img->h, color, flipping);
 			} else if (blendMode == BLEND_MULTIPLY) {
-				doBlitMultiplyBlend(ino, outo, img->w, img->h, target.pitch, inStep, inoStep, color);
+				Graphics::multiplyBlendBlit(
+					(byte *)target.getBasePtr(0, 0),
+					(byte *)img->getBasePtr(0, 0),
+					target.pitch, img->pitch,
+					posX, posY, img->w, img->h, color, flipping);
 			} else {
 				assert(blendMode == BLEND_NORMAL);
-				doBlitAlphaBlend(ino, outo, img->w, img->h, target.pitch, inStep, inoStep, color);
+				Graphics::alphaBlendBlit(
+					(byte *)target.getBasePtr(0, 0),
+					(byte *)img->getBasePtr(0, 0),
+					target.pitch, img->pitch,
+					posX, posY, img->w, img->h, color, flipping);
 			}
 		}
 
@@ -634,37 +307,44 @@ Common::Rect TransparentSurface::blitClip(Graphics::Surface &target, Common::Rec
 
 	// Flip surface
 	if ((img->w > 0) && (img->h > 0)) {
-		int xp = 0, yp = 0;
-
-		int inStep = 4;
-		int inoStep = img->pitch;
-		if (flipping & FLIP_H) {
-			inStep = -inStep;
-			xp = img->w - 1;
-		}
-
-		if (flipping & FLIP_V) {
-			inoStep = -inoStep;
-			yp = img->h - 1;
-		}
-
-		byte *ino = (byte *)img->getBasePtr(xp, yp);
-		byte *outo = (byte *)target.getBasePtr(posX, posY);
-
 		if (color == 0xFFFFFFFF && blendMode == BLEND_NORMAL && _alphaMode == ALPHA_OPAQUE) {
-			doBlitOpaqueFast(ino, outo, img->w, img->h, target.pitch, inStep, inoStep);
+			Graphics::opaqueBlendBlit(
+				(byte *)target.getBasePtr(0, 0),
+				(byte *)img->getBasePtr(0, 0),
+				target.pitch, img->pitch,
+				posX, posY, img->w, img->h, color, flipping);
 		} else if (color == 0xFFFFFFFF && blendMode == BLEND_NORMAL && _alphaMode == ALPHA_BINARY) {
-			doBlitBinaryFast(ino, outo, img->w, img->h, target.pitch, inStep, inoStep);
+			Graphics::binaryBlendBlit(
+				(byte *)target.getBasePtr(0, 0),
+				(byte *)img->getBasePtr(0, 0),
+				target.pitch, img->pitch,
+				posX, posY, img->w, img->h, color, flipping);
 		} else {
 			if (blendMode == BLEND_ADDITIVE) {
-				doBlitAdditiveBlend(ino, outo, img->w, img->h, target.pitch, inStep, inoStep, color);
+				Graphics::additiveBlendBlit(
+					(byte *)target.getBasePtr(0, 0),
+					(byte *)img->getBasePtr(0, 0),
+					target.pitch, img->pitch,
+					posX, posY, img->w, img->h, color, flipping);
 			} else if (blendMode == BLEND_SUBTRACTIVE) {
-				doBlitSubtractiveBlend(ino, outo, img->w, img->h, target.pitch, inStep, inoStep, color);
+				Graphics::subtractiveBlendBlit(
+					(byte *)target.getBasePtr(0, 0),
+					(byte *)img->getBasePtr(0, 0),
+					target.pitch, img->pitch,
+					posX, posY, img->w, img->h, color, flipping);
 			} else if (blendMode == BLEND_MULTIPLY) {
-				doBlitMultiplyBlend(ino, outo, img->w, img->h, target.pitch, inStep, inoStep, color);
+				Graphics::multiplyBlendBlit(
+					(byte *)target.getBasePtr(0, 0),
+					(byte *)img->getBasePtr(0, 0),
+					target.pitch, img->pitch,
+					posX, posY, img->w, img->h, color, flipping);
 			} else {
 				assert(blendMode == BLEND_NORMAL);
-				doBlitAlphaBlend(ino, outo, img->w, img->h, target.pitch, inStep, inoStep, color);
+				Graphics::alphaBlendBlit(
+					(byte *)target.getBasePtr(0, 0),
+					(byte *)img->getBasePtr(0, 0),
+					target.pitch, img->pitch,
+					posX, posY, img->w, img->h, color, flipping);
 			}
 		}
 


Commit: d03727885a6ef03eb2e52702f0c391a2f8a5b5ba
    https://github.com/scummvm/scummvm/commit/d03727885a6ef03eb2e52702f0c391a2f8a5b5ba
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
GRAPHICS: Created blendBlitFrom in ManagedSurface

Changed paths:
    graphics/blit-alpha.cpp
    graphics/blit.h
    graphics/managed_surface.cpp
    graphics/managed_surface.h
    graphics/transparent_surface.cpp


diff --git a/graphics/blit-alpha.cpp b/graphics/blit-alpha.cpp
index dcd44dfc528..140eb052beb 100644
--- a/graphics/blit-alpha.cpp
+++ b/graphics/blit-alpha.cpp
@@ -194,12 +194,12 @@ bool setAlpha(byte *dst, const byte *src,
  * Optimized version of doBlit to be used with multiply blended blitting
  */
 template<bool rgbmod, bool alphamod>
-static void doBlitMultiplyBlendLogic(byte *ino, byte *outo,
+static void doBlitMultiplyBlendLogic(const byte *ino, byte *outo,
 									 uint32 width, uint32 height,
 									 uint32 outPitch, int32 inStep,
 									 int32 inoStep, uint32 color) {
 
-	byte *in;
+	const byte *in;
 	byte *out;
 
 	byte ca = alphamod ? ((color >> kAModShift) & 0xFF) : 255;
@@ -244,7 +244,7 @@ static void doBlitMultiplyBlendLogic(byte *ino, byte *outo,
 }
 
 // Only blits to and from 32bpp images
-void multiplyBlendBlit(byte *dst, byte *src,
+void multiplyBlendBlit(byte *dst, const byte *src,
 					   const uint dstPitch, const uint srcPitch,
 					   const int posX, const int posY,
 					   const uint width, const uint height,
@@ -266,7 +266,7 @@ void multiplyBlendBlit(byte *dst, byte *src,
 		yp = height - 1;
 	}
 
-	byte *ino = src + yp * srcPitch + xp * 4;
+	const byte *ino = src + yp * srcPitch + xp * 4;
 	byte *outo = dst + posY * dstPitch + posX * 4;
 
 	if (rgbmod) {
@@ -296,9 +296,9 @@ void multiplyBlendBlit(byte *dst, byte *src,
  * @color colormod in 0xAARRGGBB format - 0xFFFFFFFF for no colormod
  */
 template<bool rgbmod, bool alphamod>
-static void doBlitAlphaBlendLogic(byte *ino, byte *outo, uint32 width, uint32 height, uint32 pitch, int32 inStep, int32 inoStep, uint32 color) {
+static void doBlitAlphaBlendLogic(const byte *ino, byte *outo, uint32 width, uint32 height, uint32 pitch, int32 inStep, int32 inoStep, uint32 color) {
 
-	byte *in;
+	const byte *in;
 	byte *out;
 
 	byte ca = alphamod ? ((color >> kAModShift) & 0xFF) : 255;
@@ -333,7 +333,7 @@ static void doBlitAlphaBlendLogic(byte *ino, byte *outo, uint32 width, uint32 he
 }
 
 // Only blits to and from 32bpp images
-void alphaBlendBlit(byte *dst, byte *src,
+void alphaBlendBlit(byte *dst, const byte *src,
 					const uint dstPitch, const uint srcPitch,
 					const int posX, const int posY,
 					const uint width, const uint height,
@@ -355,7 +355,7 @@ void alphaBlendBlit(byte *dst, byte *src,
 		yp = height - 1;
 	}
 
-	byte *ino = src + yp * srcPitch + xp * 4;
+	const byte *ino = src + yp * srcPitch + xp * 4;
 	byte *outo = dst + posY * dstPitch + posX * 4;
 
 	if (rgbmod) {
@@ -377,11 +377,11 @@ void alphaBlendBlit(byte *dst, byte *src,
  * Optimized version of doBlit to be used with subtractive blended blitting
  */
 template<bool rgbmod>
-static void doBlitSubtractiveBlendLogic(byte *ino, byte *outo,
+static void doBlitSubtractiveBlendLogic(const byte *ino, byte *outo,
 										uint32 width, uint32 height,
 										uint32 pitch, int32 inStep,
 										int32 inoStep, uint32 color) {
-	byte *in;
+	const byte *in;
 	byte *out;
 
 	byte cr = rgbmod   ? ((color >> kRModShift) & 0xFF) : 255;
@@ -421,7 +421,7 @@ static void doBlitSubtractiveBlendLogic(byte *ino, byte *outo,
 }
 
 // Only blits to and from 32bpp images
-void subtractiveBlendBlit(byte *dst, byte *src,
+void subtractiveBlendBlit(byte *dst, const byte *src,
 						  const uint dstPitch, const uint srcPitch,
 						  const int posX, const int posY,
 						  const uint width, const uint height,
@@ -442,7 +442,7 @@ void subtractiveBlendBlit(byte *dst, byte *src,
 		yp = height - 1;
 	}
 
-	byte *ino = src + yp * srcPitch + xp * 4;
+	const byte *ino = src + yp * srcPitch + xp * 4;
 	byte *outo = dst + posY * dstPitch + posX * 4;
 
 	if (rgbmod) {
@@ -456,11 +456,11 @@ void subtractiveBlendBlit(byte *dst, byte *src,
  * Optimized version of doBlit to be used with additive blended blitting
  */
 template<bool rgbmod, bool alphamod>
-static void doBlitAdditiveBlendLogic(byte *ino, byte *outo,
+static void doBlitAdditiveBlendLogic(const byte *ino, byte *outo,
 									 uint32 width, uint32 height, uint32 pitch,
 									 int32 inStep, int32 inoStep, uint32 color) {
 
-	byte *in;
+	const byte *in;
 	byte *out;
 
 	byte ca = alphamod ? ((color >> kAModShift) & 0xFF) : 255;
@@ -505,7 +505,7 @@ static void doBlitAdditiveBlendLogic(byte *ino, byte *outo,
 }
 
 // Only blits to and from 32bpp images
-void additiveBlendBlit(byte *dst, byte *src,
+void additiveBlendBlit(byte *dst, const byte *src,
 					   const uint dstPitch, const uint srcPitch,
 					   const int posX, const int posY,
 					   const uint width, const uint height,
@@ -527,7 +527,7 @@ void additiveBlendBlit(byte *dst, byte *src,
 		yp = height - 1;
 	}
 
-	byte *ino = src + yp * srcPitch + xp * 4;
+	const byte *ino = src + yp * srcPitch + xp * 4;
 	byte *outo = dst + posY * dstPitch + posX * 4;
 
 	if (rgbmod) {
@@ -545,7 +545,7 @@ void additiveBlendBlit(byte *dst, byte *src,
 	}
 }
 
-void opaqueBlendBlit(byte *dst, byte *src,
+void opaqueBlendBlit(byte *dst, const byte *src,
 					 const uint dstPitch, const uint srcPitch,
 					 const int posX, const int posY,
 					 const uint width, const uint height,
@@ -564,10 +564,10 @@ void opaqueBlendBlit(byte *dst, byte *src,
 		yp = height - 1;
 	}
 
-	byte *ino = src + yp * srcPitch + xp * 4;
+	const byte *ino = src + yp * srcPitch + xp * 4;
 	byte *outo = dst + posY * dstPitch + posX * 4;
 	
-	byte *in;
+	const byte *in;
 	byte *out;
 
 	for (uint32 i = 0; i < height; i++) {
@@ -583,7 +583,7 @@ void opaqueBlendBlit(byte *dst, byte *src,
 	}
 }
 
-void binaryBlendBlit(byte *dst, byte *src,
+void binaryBlendBlit(byte *dst, const byte *src,
 					 const uint dstPitch, const uint srcPitch,
 					 const int posX, const int posY,
 					 const uint width, const uint height,
@@ -602,17 +602,17 @@ void binaryBlendBlit(byte *dst, byte *src,
 		yp = height - 1;
 	}
 
-	byte *ino = src + yp * srcPitch + xp * 4;
+	const byte *ino = src + yp * srcPitch + xp * 4;
 	byte *outo = dst + posY * dstPitch + posX * 4;
 
-	byte *in;
+	const byte *in;
 	byte *out;
 
 	for (uint32 i = 0; i < height; i++) {
 		out = outo;
 		in = ino;
 		for (uint32 j = 0; j < width; j++) {
-			uint32 pix = *(uint32 *)in;
+			uint32 pix = *(const uint32 *)in;
 			int a = in[kAIndex];
 
 			if (a != 0) {   // Full opacity (Any value not exactly 0 is Opaque here)
diff --git a/graphics/blit.h b/graphics/blit.h
index 79c3d4de39d..945156cd144 100644
--- a/graphics/blit.h
+++ b/graphics/blit.h
@@ -191,53 +191,41 @@ bool setAlpha(byte *dst, const byte *src,
               const Graphics::PixelFormat &format,
               const bool skipTransparent, const uint8 alpha);
 
-/**
- * Returns the pixel format all operations of TransparentSurface support.
- *
- * Unlike Surface TransparentSurface only works with a fixed pixel format.
- * This format can be queried using this static function.
- *
- * @return Supported pixel format.
- */
-inline PixelFormat getSupportedBlenderPixelFormat() {
-	return PixelFormat(4, 8, 8, 8, 8, 24, 16, 8, 0);
-}
-
-void opaqueBlendBlit(byte *dst, byte *src,
+void opaqueBlendBlit(byte *dst, const byte *src,
 					 const uint dstPitch, const uint srcPitch,
 					 const int posX, const int posY,
 					 const uint width, const uint height,
 					 const uint32 colorMod = 0, const uint flipping = FLIP_NONE);
 
-void binaryBlendBlit(byte *dst, byte *src,
+void binaryBlendBlit(byte *dst, const byte *src,
 					 const uint dstPitch, const uint srcPitch,
 					 const int posX, const int posY,
 					 const uint width, const uint height,
 					 const uint32 colorMod = 0, const uint flipping = FLIP_NONE);
 
 // Only blits to and from 32bpp images
-void multiplyBlendBlit(byte *dst, byte *src,
+void multiplyBlendBlit(byte *dst, const byte *src,
 					   const uint dstPitch, const uint srcPitch,
 					   const int posX, const int posY,
 					   const uint width, const uint height,
 					   const uint32 colorMod = 0, const uint flipping = FLIP_NONE);
 
 // Only blits to and from 32bpp images
-void subtractiveBlendBlit(byte *dst, byte *src,
+void subtractiveBlendBlit(byte *dst, const byte *src,
 						  const uint dstPitch, const uint srcPitch,
 						  const int posX, const int posY,
 						  const uint width, const uint height,
 						  const uint32 colorMod = 0, const uint flipping = FLIP_NONE);
 
 // Only blits to and from 32bpp images
-void additiveBlendBlit(byte *dst, byte *src,
+void additiveBlendBlit(byte *dst, const byte *src,
 					   const uint dstPitch, const uint srcPitch,
 					   const int posX, const int posY,
 					   const uint width, const uint height,
 					   const uint32 colorMod = 0, const uint flipping = FLIP_NONE);
 
 // Only blits to and from 32bpp images
-void alphaBlendBlit(byte *dst, byte *src,
+void alphaBlendBlit(byte *dst, const byte *src,
 					const uint dstPitch, const uint srcPitch,
 					const int posX, const int posY,
 					const uint width, const uint height,
diff --git a/graphics/managed_surface.cpp b/graphics/managed_surface.cpp
index cc3758436f2..e25de395077 100644
--- a/graphics/managed_surface.cpp
+++ b/graphics/managed_surface.cpp
@@ -20,6 +20,7 @@
  */
 
 #include "graphics/managed_surface.h"
+#include "graphics/blit.h"
 #include "common/algorithm.h"
 #include "common/textconsole.h"
 #include "common/endian.h"
@@ -728,6 +729,104 @@ void ManagedSurface::transBlitFromInner(const Surface &src, const Common::Rect &
 
 #undef HANDLE_BLIT
 
+Common::Rect ManagedSurface::blendBlitFrom(const ManagedSurface &src, const Common::Rect &srcRect,
+										   const Common::Rect &destRect, int flipping,
+										   uint32 colorMod,
+										   TSpriteBlendMode blend, int alphaType) {
+	Common::Rect srcArea = srcRect, dstArea = destRect;
+	if (src.format != getSupportedBlendBlitPixelFormat() ||
+		format != getSupportedBlendBlitPixelFormat() ||
+		(colorMod & BLENDBLIT_RGB(0, 0, 0)) == 0) {
+		return Common::Rect(0, 0, 0, 0);
+	}
+
+	if (flipping & FLIP_H) {
+		srcArea.left = src.w - srcArea.right;
+	}
+
+	if (flipping & FLIP_V) {
+		srcArea.top = src.h - srcArea.bottom;
+	}
+
+	if (dstArea.left < 0) {
+		srcArea.left += -dstArea.left;
+		dstArea.left = 0;
+	}
+
+	if (dstArea.top < 0) {
+		srcArea.top += -dstArea.top;
+		dstArea.top = 0;
+	}
+
+	if (dstArea.right > w) {
+		srcArea.right -= dstArea.right - w;
+		dstArea.right = w;
+	}
+
+	if (dstArea.bottom > h) {
+		srcArea.bottom -= dstArea.bottom - h;
+		dstArea.bottom = h;
+	}
+
+	if (!dstArea.isEmpty() && !srcArea.isEmpty()) {
+		if (colorMod == 0xffffffff && blend == BLEND_NORMAL && alphaType == ALPHA_OPAQUE) {
+			Graphics::opaqueBlendBlit(
+				(byte *)getBasePtr(0, 0),
+				(const byte *)src.getBasePtr(srcArea.left, srcArea.top),
+				pitch, src.pitch,
+				dstArea.left, dstArea.top,
+				dstArea.width(), dstArea.height(),
+				colorMod, flipping);
+		} else if (colorMod == 0xffffffff && blend == BLEND_NORMAL && alphaType == ALPHA_BINARY) {
+			Graphics::binaryBlendBlit(
+				(byte *)getBasePtr(0, 0),
+				(const byte *)src.getBasePtr(srcArea.left, srcArea.top),
+				pitch, src.pitch,
+				dstArea.left, dstArea.top,
+				dstArea.width(), dstArea.height(),
+				colorMod, flipping);
+		} else {
+			if (blend == BLEND_ADDITIVE) {
+				Graphics::additiveBlendBlit(
+					(byte *)getBasePtr(0, 0),
+				(const byte *)src.getBasePtr(srcArea.left, srcArea.top),
+					pitch, src.pitch,
+					dstArea.left, dstArea.top,
+					dstArea.width(), dstArea.height(),
+					colorMod, flipping);
+			} else if (blend == BLEND_SUBTRACTIVE) {
+				Graphics::subtractiveBlendBlit(
+					(byte *)getBasePtr(0, 0),
+				(const byte *)src.getBasePtr(srcArea.left, srcArea.top),
+					pitch, src.pitch,
+					dstArea.left, dstArea.top,
+					dstArea.width(), dstArea.height(),
+					colorMod, flipping);
+			} else if (blend == BLEND_MULTIPLY) {
+				Graphics::multiplyBlendBlit(
+					(byte *)getBasePtr(0, 0),
+				(const byte *)src.getBasePtr(srcArea.left, srcArea.top),
+					pitch, src.pitch,
+					dstArea.left, dstArea.top,
+					dstArea.width(), dstArea.height(),
+					colorMod, flipping);
+			} else {
+				assert(blend == BLEND_NORMAL);
+				Graphics::alphaBlendBlit(
+					(byte *)getBasePtr(0, 0),
+				(const byte *)src.getBasePtr(srcArea.left, srcArea.top),
+					pitch, src.pitch,
+					dstArea.left, dstArea.top,
+					dstArea.width(), dstArea.height(),
+					colorMod, flipping);
+			}
+		}
+		return Common::Rect(0, 0, dstArea.width(), dstArea.height());
+	} else {
+		return Common::Rect(0, 0, 0, 0);
+	}
+}
+
 void ManagedSurface::markAllDirty() {
 	addDirtyRect(Common::Rect(0, 0, this->w, this->h));
 }
diff --git a/graphics/managed_surface.h b/graphics/managed_surface.h
index 18ebd7876bb..0e2f2de1238 100644
--- a/graphics/managed_surface.h
+++ b/graphics/managed_surface.h
@@ -24,11 +24,15 @@
 
 #include "graphics/pixelformat.h"
 #include "graphics/surface.h"
-#include "common/rect.h"
+#include "graphics/transform_struct.h"
 #include "common/types.h"
+#include "graphics/transparent_surface.h"
 
 namespace Graphics {
 
+#define BLENDBLIT_RGB(R,G,B)       (uint32)(((R) << 24) | ((G) << 16) | ((B) << 8) | 0xff)
+#define BLENDBLIT_ARGB(A,R,G,B)    (uint32)(((R) << 24) | ((G) << 16) | ((B) << 8) | (A))
+
 /**
  * @defgroup graphics_managed_surface Managed surface
  * @ingroup graphics
@@ -521,6 +525,55 @@ public:
 		blitFromInner(src._innerSurface, srcRect, Common::Rect(destPos.x, destPos.y, destPos.x + srcRect.width(),
 			destPos.y + srcRect.height()), src._paletteSet ? src._palette : nullptr);
 	}
+	
+	/**
+	 * Returns the pixel format all operations of blendBlitFrom support.
+	 *
+	 * Unlike normal blit functions, blendBlitFrom only works with a fixed pixel
+	 * format. This format can be queried using this static function.
+	 *
+	 * @return Supported pixel format.
+	 */
+	static inline PixelFormat getSupportedBlendBlitPixelFormat() {
+		return PixelFormat(4, 8, 8, 8, 8, 24, 16, 8, 0);
+	}
+
+	/**
+	 * @brief renders the surface to another surface
+	 * @note Most of this is wrong at the time being... Not sure whether or not to keep the old
+	 * arguments or just make the function like the rest here.
+	 * @param target a pointer to the target surface. In most cases this is the framebuffer.
+	 * @param posX the position on the X-axis in the target image in pixels where the image is supposed to be rendered.<br>
+	 * The default value is 0.
+	 * @param posY the position on the Y-axis in the target image in pixels where the image is supposed to be rendered.<br>
+	 * The default value is 0.
+	 * @param flipping how the image should be flipped.<br>
+	 * The default value is Graphics::FLIP_NONE (no flipping)
+	 * @param pPartRect Pointer on Common::Rect which specifies the section to be rendered. If the whole image has to be rendered the Pointer is NULL.<br>
+	 * This refers to the unflipped and unscaled image.<br>
+	 * The default value is NULL.
+	 * @param color an ARGB color value, which determines the parameters for the color modulation and alpha blending.<br>
+	 * The alpha component of the color determines the alpha blending parameter (0 = no covering, 255 = full covering).<br>
+	 * The color components determines the color for color modulation.<br>
+	 * The default value is TS_ARGB(255, 255, 255, 255) (full covering, no color modulation).
+	 * The macros TS_RGB and TS_ARGB can be used for the creation of the color value.
+	 * **Temporarily, these macros can also be replaced with blendBlitMakeARGB/RGB static members of
+	 *  Graphics::ManagedSurface
+	 * @param width the output width of the screen section.
+	 * The images will be scaled if the output width of the screen section differs from the image section.<br>
+	 * The value -1 determines that the image should not be scaled.<br>
+	 * The default value is -1.
+	 * @param height the output height of the screen section.
+	 * The images will be scaled if the output width of the screen section differs from the image section.<br>
+	 * The value -1 determines that the image should not be scaled.<br>
+	 * The default value is -1.
+	 * @return returns the size (not position) of what was drawn to this managed surface.
+	 */
+	Common::Rect blendBlitFrom(const ManagedSurface &src, const Common::Rect &srcRect,
+							   const Common::Rect &destRect, int flipping = FLIP_NONE,
+							   uint32 colorMod = BLENDBLIT_ARGB(255, 255, 255, 255),
+							   TSpriteBlendMode blend = BLEND_NORMAL,
+							   int alphaType = ALPHA_FULL);
 
 	/**
 	 * Clear the entire surface.
diff --git a/graphics/transparent_surface.cpp b/graphics/transparent_surface.cpp
index 6aea7ec54d9..514085db19f 100644
--- a/graphics/transparent_surface.cpp
+++ b/graphics/transparent_surface.cpp
@@ -32,6 +32,8 @@
 #include "graphics/transparent_surface.h"
 #include "graphics/transform_tools.h"
 
+#include "graphics/managed_surface.h"
+
 namespace Graphics {
 
 static const int kAModShift = 0;
@@ -54,6 +56,29 @@ TransparentSurface::TransparentSurface(const Surface &surf, bool copyData) : Sur
 }
 
 Common::Rect TransparentSurface::blit(Graphics::Surface &target, int posX, int posY, int flipping, Common::Rect *pPartRect, uint color, int width, int height, TSpriteBlendMode blendMode) {
+	// TESTING PURPOSES
+	// ManagedSurface s(&target, DisposeAfterUse::NO);
+	// ManagedSurface me(this, DisposeAfterUse::NO);
+	// Common::Rect srcRect(0, 0, me.w, me.h);
+	// if (pPartRect) {
+	// 	srcRect = *pPartRect;
+	// }
+	// if (width == -1) {
+	// 	width = srcRect.width();
+	// }
+	// if (height == -1) {
+	// 	height = srcRect.height();
+	// }
+	// auto rect =  s.blendBlitFrom(me,
+	// 	srcRect,
+	// 	Common::Rect(posX, posY, posX + width, posY + height),
+	// 	flipping,
+	// 	color,
+	// 	blendMode,
+	// 	_alphaMode
+	// );
+	// target.copyFrom(*s.surfacePtr());
+	// return rect;
 
 	Common::Rect retSize;
 	retSize.top = 0;
@@ -92,11 +117,11 @@ Common::Rect TransparentSurface::blit(Graphics::Surface &target, int posX, int p
 		srcImage.w = pPartRect->width();
 		srcImage.h = pPartRect->height();
 
-		debug(6, "Blit(%d, %d, %d, [%d, %d, %d, %d], %08x, %d, %d)", posX, posY, flipping,
+		debug("Blit(%d, %d, %d, [%d, %d, %d, %d], %08x, %d, %d)", posX, posY, flipping,
 			  pPartRect->left,  pPartRect->top, pPartRect->width(), pPartRect->height(), color, width, height);
 	} else {
 
-		debug(6, "Blit(%d, %d, %d, [%d, %d, %d, %d], %08x, %d, %d)", posX, posY, flipping, 0, 0,
+		debug("Blit(%d, %d, %d, [%d, %d, %d, %d], %08x, %d, %d)", posX, posY, flipping, 0, 0,
 			  srcImage.w, srcImage.h, color, width, height);
 	}
 
@@ -158,39 +183,45 @@ Common::Rect TransparentSurface::blit(Graphics::Surface &target, int posX, int p
 				(byte *)target.getBasePtr(0, 0),
 				(byte *)img->getBasePtr(0, 0),
 				target.pitch, img->pitch,
-				posX, posY, img->w, img->h, color, flipping);
+				posX, posY, img->w, img->h,
+				color, flipping);
 		} else if (color == 0xFFFFFFFF && blendMode == BLEND_NORMAL && _alphaMode == ALPHA_BINARY) {
 			Graphics::binaryBlendBlit(
 				(byte *)target.getBasePtr(0, 0),
 				(byte *)img->getBasePtr(0, 0),
 				target.pitch, img->pitch,
-				posX, posY, img->w, img->h, color, flipping);
+				posX, posY, img->w, img->h,
+				color, flipping);
 		} else {
 			if (blendMode == BLEND_ADDITIVE) {
 				Graphics::additiveBlendBlit(
 					(byte *)target.getBasePtr(0, 0),
 					(byte *)img->getBasePtr(0, 0),
 					target.pitch, img->pitch,
-					posX, posY, img->w, img->h, color, flipping);
+					posX, posY, img->w, img->h,
+				color, flipping);
 			} else if (blendMode == BLEND_SUBTRACTIVE) {
 				Graphics::subtractiveBlendBlit(
 					(byte *)target.getBasePtr(0, 0),
 					(byte *)img->getBasePtr(0, 0),
 					target.pitch, img->pitch,
-					posX, posY, img->w, img->h, color, flipping);
+					posX, posY, img->w, img->h,
+				color, flipping);
 			} else if (blendMode == BLEND_MULTIPLY) {
 				Graphics::multiplyBlendBlit(
 					(byte *)target.getBasePtr(0, 0),
 					(byte *)img->getBasePtr(0, 0),
 					target.pitch, img->pitch,
-					posX, posY, img->w, img->h, color, flipping);
+					posX, posY, img->w, img->h,
+				color, flipping);
 			} else {
 				assert(blendMode == BLEND_NORMAL);
 				Graphics::alphaBlendBlit(
 					(byte *)target.getBasePtr(0, 0),
 					(byte *)img->getBasePtr(0, 0),
 					target.pitch, img->pitch,
-					posX, posY, img->w, img->h, color, flipping);
+					posX, posY, img->w, img->h,
+				color, flipping);
 			}
 		}
 
@@ -312,39 +343,45 @@ Common::Rect TransparentSurface::blitClip(Graphics::Surface &target, Common::Rec
 				(byte *)target.getBasePtr(0, 0),
 				(byte *)img->getBasePtr(0, 0),
 				target.pitch, img->pitch,
-				posX, posY, img->w, img->h, color, flipping);
+				posX, posY, img->w, img->h,
+				color, flipping);
 		} else if (color == 0xFFFFFFFF && blendMode == BLEND_NORMAL && _alphaMode == ALPHA_BINARY) {
 			Graphics::binaryBlendBlit(
 				(byte *)target.getBasePtr(0, 0),
 				(byte *)img->getBasePtr(0, 0),
 				target.pitch, img->pitch,
-				posX, posY, img->w, img->h, color, flipping);
+				posX, posY, img->w, img->h,
+				color, flipping);
 		} else {
 			if (blendMode == BLEND_ADDITIVE) {
 				Graphics::additiveBlendBlit(
 					(byte *)target.getBasePtr(0, 0),
 					(byte *)img->getBasePtr(0, 0),
 					target.pitch, img->pitch,
-					posX, posY, img->w, img->h, color, flipping);
+					posX, posY, img->w, img->h,
+				color, flipping);
 			} else if (blendMode == BLEND_SUBTRACTIVE) {
 				Graphics::subtractiveBlendBlit(
 					(byte *)target.getBasePtr(0, 0),
 					(byte *)img->getBasePtr(0, 0),
 					target.pitch, img->pitch,
-					posX, posY, img->w, img->h, color, flipping);
+					posX, posY, img->w, img->h,
+				color, flipping);
 			} else if (blendMode == BLEND_MULTIPLY) {
 				Graphics::multiplyBlendBlit(
 					(byte *)target.getBasePtr(0, 0),
 					(byte *)img->getBasePtr(0, 0),
 					target.pitch, img->pitch,
-					posX, posY, img->w, img->h, color, flipping);
+					posX, posY, img->w, img->h,
+				color, flipping);
 			} else {
 				assert(blendMode == BLEND_NORMAL);
 				Graphics::alphaBlendBlit(
 					(byte *)target.getBasePtr(0, 0),
 					(byte *)img->getBasePtr(0, 0),
 					target.pitch, img->pitch,
-					posX, posY, img->w, img->h, color, flipping);
+					posX, posY, img->w, img->h,
+				color, flipping);
 			}
 		}
 


Commit: f15462079d66e13ba2489a1e79fa0e5a327cc5f9
    https://github.com/scummvm/scummvm/commit/f15462079d66e13ba2489a1e79fa0e5a327cc5f9
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
GRAPHICS: ManagedSurface::blendBlitFrom fix

Changed paths:
    graphics/managed_surface.cpp


diff --git a/graphics/managed_surface.cpp b/graphics/managed_surface.cpp
index e25de395077..0e34b413692 100644
--- a/graphics/managed_surface.cpp
+++ b/graphics/managed_surface.cpp
@@ -769,6 +769,9 @@ Common::Rect ManagedSurface::blendBlitFrom(const ManagedSurface &src, const Comm
 	}
 
 	if (!dstArea.isEmpty() && !srcArea.isEmpty()) {
+		if (dstArea.width() != srcArea.width() || dstArea.height() != srcArea.height()) {
+			return Common::Rect(0, 0, dstArea.width(), dstArea.height());
+		}
 		if (colorMod == 0xffffffff && blend == BLEND_NORMAL && alphaType == ALPHA_OPAQUE) {
 			Graphics::opaqueBlendBlit(
 				(byte *)getBasePtr(0, 0),


Commit: b34f6f48fde927859b06b754863fd72003956503
    https://github.com/scummvm/scummvm/commit/b34f6f48fde927859b06b754863fd72003956503
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
SWORD25: Moved to ManagedSurface

Changed paths:
    engines/sword25/gfx/graphicengine.cpp
    engines/sword25/gfx/graphicengine.h
    engines/sword25/gfx/image/renderedimage.cpp
    engines/sword25/gfx/image/renderedimage.h
    engines/sword25/gfx/renderobjectmanager.cpp


diff --git a/engines/sword25/gfx/graphicengine.cpp b/engines/sword25/gfx/graphicengine.cpp
index 1ecfe3c24d5..fe5d0d489cb 100644
--- a/engines/sword25/gfx/graphicengine.cpp
+++ b/engines/sword25/gfx/graphicengine.cpp
@@ -368,7 +368,7 @@ bool GraphicEngine::saveThumbnailScreenshot(const Common::String &filename) {
 	// until needed when creating savegame files
 	delete _thumbnail;
 
-	_thumbnail = Screenshot::createThumbnail(&_backSurface);
+	_thumbnail = Screenshot::createThumbnail(_backSurface.surfacePtr());
 
 	return true;
 }
diff --git a/engines/sword25/gfx/graphicengine.h b/engines/sword25/gfx/graphicengine.h
index 9e237e326af..76bf0fbddeb 100644
--- a/engines/sword25/gfx/graphicengine.h
+++ b/engines/sword25/gfx/graphicengine.h
@@ -44,7 +44,7 @@
 #include "common/rect.h"
 #include "common/ptr.h"
 #include "common/str.h"
-#include "graphics/surface.h"
+#include "graphics/managed_surface.h"
 #include "sword25/kernel/common.h"
 #include "sword25/kernel/resservice.h"
 #include "sword25/kernel/persistable.h"
@@ -214,8 +214,8 @@ public:
 	 */
 	bool fill(const Common::Rect *fillRectPtr = 0, uint color = BS_RGB(0, 0, 0));
 
-	Graphics::Surface _backSurface;
-	Graphics::Surface *getSurface() { return &_backSurface; }
+	Graphics::ManagedSurface _backSurface;
+	Graphics::ManagedSurface *getSurface() { return &_backSurface; }
 
 	Common::SeekableReadStream *_thumbnail;
 	Common::SeekableReadStream *getThumbnail() { return _thumbnail; }
diff --git a/engines/sword25/gfx/image/renderedimage.cpp b/engines/sword25/gfx/image/renderedimage.cpp
index e41ec699630..6a0a9da2a38 100644
--- a/engines/sword25/gfx/image/renderedimage.cpp
+++ b/engines/sword25/gfx/image/renderedimage.cpp
@@ -126,9 +126,9 @@ RenderedImage::RenderedImage(const Common::String &filename, bool &result) :
 
 	// Uncompress the image
 	if (isPNG)
-		result = ImgLoader::decodePNGImage(pFileData, fileSize, &_surface);
+		result = ImgLoader::decodePNGImage(pFileData, fileSize, _surface.surfacePtr());
 	else
-		result = ImgLoader::decodeThumbnailImage(pFileData, fileSize, &_surface);
+		result = ImgLoader::decodeThumbnailImage(pFileData, fileSize, _surface.surfacePtr());
 
 	if (!result) {
 		error("Could not decode image.");
@@ -233,7 +233,16 @@ bool RenderedImage::blit(int posX, int posY, int flipping, Common::Rect *pPartRe
 	int cg = (color >> BS_GSHIFT) & 0xff;
 	int cb = (color >> BS_BSHIFT) & 0xff;
 
-	_surface.blit(*_backSurface, posX, posY, newFlipping, pPartRect, _surface.format.ARGBToColor(ca, cr, cg, cb), width, height);
+	if (width == -1) width = _surface.w;
+	if (height == -1) height = _surface.h;
+	//_surface.blit(*_backSurface, posX, posY, newFlipping, pPartRect, _surface.format.ARGBToColor(ca, cr, cg, cb), width, height);
+	_backSurface->blendBlitFrom(
+		_surface,
+		pPartRect ? *pPartRect : Common::Rect(0, 0, _surface.w, _surface.h),
+		Common::Rect(posX, posY, posX + width, posY + height),
+		newFlipping,
+		_surface.format.ARGBToColor(ca, cr, cg, cb)
+	);
 
 	return true;
 }
diff --git a/engines/sword25/gfx/image/renderedimage.h b/engines/sword25/gfx/image/renderedimage.h
index ad7cecb9b0e..480b55c2e26 100644
--- a/engines/sword25/gfx/image/renderedimage.h
+++ b/engines/sword25/gfx/image/renderedimage.h
@@ -38,7 +38,7 @@
 #include "sword25/kernel/common.h"
 #include "sword25/gfx/image/image.h"
 #include "sword25/gfx/graphicengine.h"
-#include "graphics/transparent_surface.h"
+#include "graphics/managed_surface.h"
 
 namespace Sword25 {
 
@@ -108,11 +108,11 @@ public:
 	bool isSolid() const override { return !_isTransparent; }
 
 private:
-	Graphics::TransparentSurface _surface;
+	Graphics::ManagedSurface _surface;
 	bool _doCleanup;
 	bool _isTransparent;
 
-	Graphics::Surface *_backSurface;
+	Graphics::ManagedSurface *_backSurface;
 
 	void checkForTransparency();
 };
diff --git a/engines/sword25/gfx/renderobjectmanager.cpp b/engines/sword25/gfx/renderobjectmanager.cpp
index e867f9e7c45..658c0a3aa05 100644
--- a/engines/sword25/gfx/renderobjectmanager.cpp
+++ b/engines/sword25/gfx/renderobjectmanager.cpp
@@ -136,7 +136,7 @@ bool RenderObjectManager::render() {
 
 	if (_rootPtr->render(updateRects, updateRectsMinZ)) {
 		// Copy updated rectangles to the video screen
-		Graphics::Surface *backSurface = Kernel::getInstance()->getGfx()->getSurface();
+		Graphics::ManagedSurface *backSurface = Kernel::getInstance()->getGfx()->getSurface();
 		for (RectangleList::iterator rectIt = updateRects->begin(); rectIt != updateRects->end(); ++rectIt) {
 			const int x = (*rectIt).left;
 			const int y = (*rectIt).top;


Commit: 056a0029c6de3d8022b3de004c4a2943e22062db
    https://github.com/scummvm/scummvm/commit/056a0029c6de3d8022b3de004c4a2943e22062db
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
SWORD25: Tweaked RenderedImage to use pPartRect

Changed paths:
    engines/sword25/gfx/image/renderedimage.cpp


diff --git a/engines/sword25/gfx/image/renderedimage.cpp b/engines/sword25/gfx/image/renderedimage.cpp
index 6a0a9da2a38..43b13a1063b 100644
--- a/engines/sword25/gfx/image/renderedimage.cpp
+++ b/engines/sword25/gfx/image/renderedimage.cpp
@@ -233,8 +233,8 @@ bool RenderedImage::blit(int posX, int posY, int flipping, Common::Rect *pPartRe
 	int cg = (color >> BS_GSHIFT) & 0xff;
 	int cb = (color >> BS_BSHIFT) & 0xff;
 
-	if (width == -1) width = _surface.w;
-	if (height == -1) height = _surface.h;
+	if (width == -1) width = pPartRect ? pPartRect->width() : _surface.w;
+	if (height == -1) height = pPartRect ? pPartRect->height() : _surface.h;
 	//_surface.blit(*_backSurface, posX, posY, newFlipping, pPartRect, _surface.format.ARGBToColor(ca, cr, cg, cb), width, height);
 	_backSurface->blendBlitFrom(
 		_surface,


Commit: d36e603864db788b6f7c7dea5becb909b9275fcc
    https://github.com/scummvm/scummvm/commit/d36e603864db788b6f7c7dea5becb909b9275fcc
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
GRAPHICS: ManagedSurface::blendBlitFrom started on

Changed paths:
    graphics/blit-alpha.cpp
    graphics/blit.h
    graphics/managed_surface.cpp
    graphics/managed_surface.h
    graphics/transform_struct.h
    graphics/transparent_surface.cpp
    graphics/transparent_surface.h


diff --git a/graphics/blit-alpha.cpp b/graphics/blit-alpha.cpp
index 140eb052beb..017e1149ce3 100644
--- a/graphics/blit-alpha.cpp
+++ b/graphics/blit-alpha.cpp
@@ -190,27 +190,80 @@ bool setAlpha(byte *dst, const byte *src,
 	return true;
 }
 
+
+struct BlendingSetupArgs {
+	bool rgbmod, alphamod;
+	int xp, yp;
+	int inStep, inoStep;
+	const byte *ino;
+	byte *outo;
+
+	int scaleX, scaleY;
+	uint dstPitch;
+	uint width, height;
+	uint32 color;
+	int flipping;
+
+	BlendingSetupArgs(byte *dst, const byte *src,
+					  const uint dstPitch, const uint srcPitch,
+					  const int posX, const int posY,
+					  const uint width, const uint height,
+					  const int scaleX, const int scaleY,
+					  const uint32 colorMod, const uint flipping) :
+			xp(0), yp(0), dstPitch(dstPitch),
+			width(width), height(height), color(colorMod),
+			scaleX(scaleX), scaleY(scaleY), flipping(flipping) {
+		bool doScale = scaleX != BLEND_BLIT_SCALE_THRESHOLD || scaleY != BLEND_BLIT_SCALE_THRESHOLD;
+		
+		rgbmod   = ((colorMod & kRGBModMask) != kRGBModMask);
+		alphamod = ((colorMod & kAModMask)   != kAModMask);
+		inStep = 4;
+		inoStep = srcPitch;
+		if (flipping & FLIP_H) {
+			inStep = -inStep;
+			xp = width - 1;
+			if (doScale) xp = xp * scaleX / BLEND_BLIT_SCALE_THRESHOLD;
+		}
+
+		if (flipping & FLIP_V) {
+			inoStep = -inoStep;
+			yp = height - 1;
+			if (doScale) yp = yp * scaleY / BLEND_BLIT_SCALE_THRESHOLD;
+		}
+
+		ino = src + yp * srcPitch + xp * 4;
+		outo = dst + posY * dstPitch + posX * 4;
+	}
+};
+
 /**
  * Optimized version of doBlit to be used with multiply blended blitting
  */
-template<bool rgbmod, bool alphamod>
-static void doBlitMultiplyBlendLogic(const byte *ino, byte *outo,
-									 uint32 width, uint32 height,
-									 uint32 outPitch, int32 inStep,
-									 int32 inoStep, uint32 color) {
-
+template<bool doscale>
+static void doBlitMultiplyBlendLogic(BlendingSetupArgs &args) {
 	const byte *in;
 	byte *out;
 
-	byte ca = alphamod ? ((color >> kAModShift) & 0xFF) : 255;
-	byte cr = rgbmod   ? ((color >> kRModShift) & 0xFF) : 255;
-	byte cg = rgbmod   ? ((color >> kGModShift) & 0xFF) : 255;
-	byte cb = rgbmod   ? ((color >> kBModShift) & 0xFF) : 255;
+	int scaleXCtr, scaleYCtr = 0;
+	const byte *inBase;
 
-	for (uint32 i = 0; i < height; i++) {
-		out = outo;
-		in = ino;
-		for (uint32 j = 0; j < width; j++) {
+	byte ca = args.alphamod ? ((args.color >> kAModShift) & 0xFF) : 255;
+	byte cr = args.rgbmod   ? ((args.color >> kRModShift) & 0xFF) : 255;
+	byte cg = args.rgbmod   ? ((args.color >> kGModShift) & 0xFF) : 255;
+	byte cb = args.rgbmod   ? ((args.color >> kBModShift) & 0xFF) : 255;
+
+	for (uint32 i = 0; i < args.height; i++) {
+		if (doscale) {
+			inBase = args.ino + scaleYCtr / BLEND_BLIT_SCALE_THRESHOLD * args.inoStep;
+			scaleXCtr = 0;
+		} else {
+			in = args.ino;
+		}
+		out = args.outo;
+		for (uint32 j = 0; j < args.width; j++) {
+			if (doscale) {
+				in = inBase + scaleXCtr / BLEND_BLIT_SCALE_THRESHOLD * args.inStep;
+			}
 
 			uint32 ina = in[kAIndex] * ca >> 8;
 
@@ -234,82 +287,46 @@ static void doBlitMultiplyBlendLogic(const byte *ino, byte *outo,
 				}
 			}
 
-			in += inStep;
+			if (doscale)
+				scaleXCtr += args.scaleX;
+			else
+				in += args.inStep;
 			out += 4;
 		}
-		outo += outPitch;
-		ino += inoStep;
+		if (doscale)
+			scaleYCtr += args.scaleY;
+		else
+			args.ino += args.inoStep;
+		args.outo += args.dstPitch;
 	}
 
 }
 
-// Only blits to and from 32bpp images
-void multiplyBlendBlit(byte *dst, const byte *src,
-					   const uint dstPitch, const uint srcPitch,
-					   const int posX, const int posY,
-					   const uint width, const uint height,
-					   const uint32 colorMod, const uint flipping) {
-	bool rgbmod   = ((colorMod & kRGBModMask) != kRGBModMask);
-	bool alphamod = ((colorMod & kAModMask)   != kAModMask);
-
-	int xp = 0, yp = 0;
-
-	int inStep = 4;
-	int inoStep = srcPitch;
-	if (flipping & FLIP_H) {
-		inStep = -inStep;
-		xp = width - 1;
-	}
+template<bool doscale>
+static void doBlitAlphaBlendLogic(BlendingSetupArgs &args) {
+	const byte *in;
+	byte *out;
 
-	if (flipping & FLIP_V) {
-		inoStep = -inoStep;
-		yp = height - 1;
-	}
+	int scaleXCtr, scaleYCtr = 0;
+	const byte *inBase;
 
-	const byte *ino = src + yp * srcPitch + xp * 4;
-	byte *outo = dst + posY * dstPitch + posX * 4;
+	byte ca = args.alphamod ? ((args.color >> kAModShift) & 0xFF) : 255;
+	byte cr = args.rgbmod   ? ((args.color >> kRModShift) & 0xFF) : 255;
+	byte cg = args.rgbmod   ? ((args.color >> kGModShift) & 0xFF) : 255;
+	byte cb = args.rgbmod   ? ((args.color >> kBModShift) & 0xFF) : 255;
 
-	if (rgbmod) {
-		if (alphamod) {
-			doBlitMultiplyBlendLogic<true, true>(ino, outo, width, height, dstPitch, inStep, inoStep, colorMod);
-		} else {
-			doBlitMultiplyBlendLogic<true, false>(ino, outo, width, height, dstPitch, inStep, inoStep, colorMod);
-		}
-	} else {
-		if (alphamod) {
-			doBlitMultiplyBlendLogic<false, true>(ino, outo, width, height, dstPitch, inStep, inoStep, colorMod);
+	for (uint32 i = 0; i < args.height; i++) {
+		if (doscale) {
+			inBase = args.ino + scaleYCtr / BLEND_BLIT_SCALE_THRESHOLD * args.inoStep;
+			scaleXCtr = 0;
 		} else {
-			doBlitMultiplyBlendLogic<false, false>(ino, outo, width, height, dstPitch, inStep, inoStep, colorMod);
+			in = args.ino;
 		}
-	}
-}
-
-/**
- * Optimized version of doBlit to be used with alpha blended blitting
- * @param ino a pointer to the input surface
- * @param outo a pointer to the output surface
- * @param width width of the input surface
- * @param height height of the input surface
- * @param pitch pitch of the output surface - that is, width in bytes of every row, usually bpp * width of the TARGET surface (the area we are blitting to might be smaller, do the math)
- * @inStep size in bytes to skip to address each pixel, usually bpp of the source surface
- * @inoStep width in bytes of every row on the *input* surface / kind of like pitch
- * @color colormod in 0xAARRGGBB format - 0xFFFFFFFF for no colormod
- */
-template<bool rgbmod, bool alphamod>
-static void doBlitAlphaBlendLogic(const byte *ino, byte *outo, uint32 width, uint32 height, uint32 pitch, int32 inStep, int32 inoStep, uint32 color) {
-
-	const byte *in;
-	byte *out;
-
-	byte ca = alphamod ? ((color >> kAModShift) & 0xFF) : 255;
-	byte cr = rgbmod   ? ((color >> kRModShift) & 0xFF) : 255;
-	byte cg = rgbmod   ? ((color >> kGModShift) & 0xFF) : 255;
-	byte cb = rgbmod   ? ((color >> kBModShift) & 0xFF) : 255;
-
-	for (uint32 i = 0; i < height; i++) {
-		out = outo;
-		in = ino;
-		for (uint32 j = 0; j < width; j++) {
+		out = args.outo;
+		for (uint32 j = 0; j < args.width; j++) {
+			if (doscale) {
+				in = inBase + scaleXCtr / BLEND_BLIT_SCALE_THRESHOLD * args.inStep;
+			}
 
 			uint32 ina = in[kAIndex] * ca >> 8;
 
@@ -324,74 +341,48 @@ static void doBlitAlphaBlendLogic(const byte *ino, byte *outo, uint32 width, uin
 				out[kRIndex] = outr + (in[kRIndex] * ina * cr >> 16);
 			}
 
-			in += inStep;
+			if (doscale)
+				scaleXCtr += args.scaleX;
+			else
+				in += args.inStep;
 			out += 4;
 		}
-		outo += pitch;
-		ino += inoStep;
-	}
-}
-
-// Only blits to and from 32bpp images
-void alphaBlendBlit(byte *dst, const byte *src,
-					const uint dstPitch, const uint srcPitch,
-					const int posX, const int posY,
-					const uint width, const uint height,
-					const uint32 colorMod, const uint flipping) {
-	bool rgbmod   = ((colorMod & kRGBModMask) != kRGBModMask);
-	bool alphamod = ((colorMod & kAModMask)   != kAModMask);
-
-	int xp = 0, yp = 0;
-
-	int inStep = 4;
-	int inoStep = srcPitch;
-	if (flipping & FLIP_H) {
-		inStep = -inStep;
-		xp = width - 1;
-	}
-
-	if (flipping & FLIP_V) {
-		inoStep = -inoStep;
-		yp = height - 1;
-	}
-
-	const byte *ino = src + yp * srcPitch + xp * 4;
-	byte *outo = dst + posY * dstPitch + posX * 4;
 
-	if (rgbmod) {
-		if (alphamod) {
-			doBlitAlphaBlendLogic<true, true>(ino, outo, width, height, dstPitch, inStep, inoStep, colorMod);
-		} else {
-			doBlitAlphaBlendLogic<true, false>(ino, outo, width, height, dstPitch, inStep, inoStep, colorMod);
-		}
-	} else {
-		if (alphamod) {
-			doBlitAlphaBlendLogic<false, true>(ino, outo, width, height, dstPitch, inStep, inoStep, colorMod);
-		} else {
-			doBlitAlphaBlendLogic<false, false>(ino, outo, width, height, dstPitch, inStep, inoStep, colorMod);
-		}
+		if (doscale)
+			scaleYCtr += args.scaleY;
+		else
+			args.ino += args.inoStep;
+		args.outo += args.dstPitch;
 	}
 }
 
 /**
  * Optimized version of doBlit to be used with subtractive blended blitting
  */
-template<bool rgbmod>
-static void doBlitSubtractiveBlendLogic(const byte *ino, byte *outo,
-										uint32 width, uint32 height,
-										uint32 pitch, int32 inStep,
-										int32 inoStep, uint32 color) {
+template<bool doscale>
+static void doBlitSubtractiveBlendLogic(BlendingSetupArgs &args) {
 	const byte *in;
 	byte *out;
 
-	byte cr = rgbmod   ? ((color >> kRModShift) & 0xFF) : 255;
-	byte cg = rgbmod   ? ((color >> kGModShift) & 0xFF) : 255;
-	byte cb = rgbmod   ? ((color >> kBModShift) & 0xFF) : 255;
+	int scaleXCtr, scaleYCtr = 0;
+	const byte *inBase;
 
-	for (uint32 i = 0; i < height; i++) {
-		out = outo;
-		in = ino;
-		for (uint32 j = 0; j < width; j++) {
+	byte cr = args.rgbmod   ? ((args.color >> kRModShift) & 0xFF) : 255;
+	byte cg = args.rgbmod   ? ((args.color >> kGModShift) & 0xFF) : 255;
+	byte cb = args.rgbmod   ? ((args.color >> kBModShift) & 0xFF) : 255;
+
+	for (uint32 i = 0; i < args.height; i++) {
+		if (doscale) {
+			inBase = args.ino + scaleYCtr / BLEND_BLIT_SCALE_THRESHOLD * args.inoStep;
+			scaleXCtr = 0;
+		} else {
+			in = args.ino;
+		}
+		out = args.outo;
+		for (uint32 j = 0; j < args.width; j++) {
+			if (doscale) {
+				in = inBase + scaleXCtr / BLEND_BLIT_SCALE_THRESHOLD * args.inStep;
+			}
 
 			out[kAIndex] = 255;
 			if (cb != 255) {
@@ -412,66 +403,48 @@ static void doBlitSubtractiveBlendLogic(const byte *ino, byte *outo,
 				out[kRIndex] = MAX(out[kRIndex] - (in[kRIndex] * (out[kRIndex]) * in[kAIndex] >> 16), 0);
 			}
 
-			in += inStep;
+			if (doscale)
+				scaleXCtr += args.scaleX;
+			else
+				in += args.inStep;
 			out += 4;
 		}
-		outo += pitch;
-		ino += inoStep;
-	}
-}
-
-// Only blits to and from 32bpp images
-void subtractiveBlendBlit(byte *dst, const byte *src,
-						  const uint dstPitch, const uint srcPitch,
-						  const int posX, const int posY,
-						  const uint width, const uint height,
-						  const uint32 colorMod, const uint flipping) {
-	bool rgbmod   = ((colorMod & kRGBModMask) != kRGBModMask);
-
-	int xp = 0, yp = 0;
-
-	int inStep = 4;
-	int inoStep = srcPitch;
-	if (flipping & FLIP_H) {
-		inStep = -inStep;
-		xp = width - 1;
-	}
-
-	if (flipping & FLIP_V) {
-		inoStep = -inoStep;
-		yp = height - 1;
-	}
-
-	const byte *ino = src + yp * srcPitch + xp * 4;
-	byte *outo = dst + posY * dstPitch + posX * 4;
-
-	if (rgbmod) {
-		doBlitSubtractiveBlendLogic<true>(ino, outo, width, height, dstPitch, inStep, inoStep, colorMod);
-	} else {
-		doBlitSubtractiveBlendLogic<false>(ino, outo, width, height, dstPitch, inStep, inoStep, colorMod);
+		if (doscale)
+			scaleYCtr += args.scaleY;
+		else
+			args.ino += args.inoStep;
+		args.outo += args.dstPitch;
 	}
 }
 
 /**
  * Optimized version of doBlit to be used with additive blended blitting
  */
-template<bool rgbmod, bool alphamod>
-static void doBlitAdditiveBlendLogic(const byte *ino, byte *outo,
-									 uint32 width, uint32 height, uint32 pitch,
-									 int32 inStep, int32 inoStep, uint32 color) {
-
+template<bool doscale>
+static void doBlitAdditiveBlendLogic(BlendingSetupArgs &args) {
 	const byte *in;
 	byte *out;
 
-	byte ca = alphamod ? ((color >> kAModShift) & 0xFF) : 255;
-	byte cr = rgbmod   ? ((color >> kRModShift) & 0xFF) : 255;
-	byte cg = rgbmod   ? ((color >> kGModShift) & 0xFF) : 255;
-	byte cb = rgbmod   ? ((color >> kBModShift) & 0xFF) : 255;
+	int scaleXCtr, scaleYCtr = 0;
+	const byte *inBase;
 
-	for (uint32 i = 0; i < height; i++) {
-		out = outo;
-		in = ino;
-		for (uint32 j = 0; j < width; j++) {
+	byte ca = args.alphamod ? ((args.color >> kAModShift) & 0xFF) : 255;
+	byte cr = args.rgbmod   ? ((args.color >> kRModShift) & 0xFF) : 255;
+	byte cg = args.rgbmod   ? ((args.color >> kGModShift) & 0xFF) : 255;
+	byte cb = args.rgbmod   ? ((args.color >> kBModShift) & 0xFF) : 255;
+
+	for (uint32 i = 0; i < args.height; i++) {
+		if (doscale) {
+			inBase = args.ino + scaleYCtr / BLEND_BLIT_SCALE_THRESHOLD * args.inoStep;
+			scaleXCtr = 0;
+		} else {
+			in = args.ino;
+		}
+		out = args.outo;
+		for (uint32 j = 0; j < args.width; j++) {
+			if (doscale) {
+				in = inBase + scaleXCtr / BLEND_BLIT_SCALE_THRESHOLD * args.inStep;
+			}
 
 			uint32 ina = in[kAIndex] * ca >> 8;
 
@@ -495,123 +468,91 @@ static void doBlitAdditiveBlendLogic(const byte *ino, byte *outo,
 				}
 			}
 
-			in += inStep;
+			if (doscale)
+				scaleXCtr += args.scaleX;
+			else
+				in += args.inStep;
 			out += 4;
 		}
 
-		outo += pitch;
-		ino += inoStep;
+		if (doscale)
+			scaleYCtr += args.scaleY;
+		else
+			args.ino += args.inoStep;
+		args.outo += args.dstPitch;
 	}
 }
 
-// Only blits to and from 32bpp images
-void additiveBlendBlit(byte *dst, const byte *src,
-					   const uint dstPitch, const uint srcPitch,
-					   const int posX, const int posY,
-					   const uint width, const uint height,
-					   const uint32 colorMod, const uint flipping) {
-	bool rgbmod   = ((colorMod & kRGBModMask) != kRGBModMask);
-	bool alphamod = ((colorMod & kAModMask)   != kAModMask);
-
-	int xp = 0, yp = 0;
-
-	int inStep = 4;
-	int inoStep = srcPitch;
-	if (flipping & FLIP_H) {
-		inStep = -inStep;
-		xp = width - 1;
-	}
-
-	if (flipping & FLIP_V) {
-		inoStep = -inoStep;
-		yp = height - 1;
-	}
+template<bool doscale>
+void doBlitOpaqueBlendLogic(BlendingSetupArgs &args) {
+	const byte *in;
+	byte *out;
 
-	const byte *ino = src + yp * srcPitch + xp * 4;
-	byte *outo = dst + posY * dstPitch + posX * 4;
+	int scaleXCtr, scaleYCtr = 0;
+	const byte *inBase;
 
-	if (rgbmod) {
-		if (alphamod) {
-			doBlitAdditiveBlendLogic<true, true>(ino, outo, width, height, dstPitch, inStep, inoStep, colorMod);
-		} else {
-			doBlitAdditiveBlendLogic<true, false>(ino, outo, width, height, dstPitch, inStep, inoStep, colorMod);
-		}
-	} else {
-		if (alphamod) {
-			doBlitAdditiveBlendLogic<false, true>(ino, outo, width, height, dstPitch, inStep, inoStep, colorMod);
+	for (uint32 i = 0; i < args.height; i++) {
+		if (doscale) {
+			inBase = args.ino + (scaleYCtr + 1) / BLEND_BLIT_SCALE_THRESHOLD * args.inoStep;
+			scaleXCtr = 0;
 		} else {
-			doBlitAdditiveBlendLogic<false, false>(ino, outo, width, height, dstPitch, inStep, inoStep, colorMod);
+			in = args.ino;
 		}
-	}
-}
+		out = args.outo;
 
-void opaqueBlendBlit(byte *dst, const byte *src,
-					 const uint dstPitch, const uint srcPitch,
-					 const int posX, const int posY,
-					 const uint width, const uint height,
-					 const uint32 colorMod, const uint flipping) {
-	int xp = 0, yp = 0;
-
-	int inStep = 4;
-	int inoStep = srcPitch;
-	if (flipping & FLIP_H) {
-		inStep = -inStep;
-		xp = width - 1;
-	}
+		if (doscale) {
+			for (uint32 j = 0; j < args.width; j++) {
+				in = inBase + scaleXCtr / BLEND_BLIT_SCALE_THRESHOLD * args.inStep;
 
-	if (flipping & FLIP_V) {
-		inoStep = -inoStep;
-		yp = height - 1;
-	}
-
-	const byte *ino = src + yp * srcPitch + xp * 4;
-	byte *outo = dst + posY * dstPitch + posX * 4;
-	
-	const byte *in;
-	byte *out;
+				memcpy(out, in, 4);
+				out[kAIndex] = 0xFF;
 
-	for (uint32 i = 0; i < height; i++) {
-		out = outo;
-		in = ino;
-		memcpy(out, in, width * 4);
-		for (uint32 j = 0; j < width; j++) {
-			out[kAIndex] = 0xFF;
-			out += 4;
+				scaleXCtr += args.scaleX;
+				out += 4;
+			}
+		} else if (args.flipping & FLIP_H) {
+			for (uint32 j = 0; j < args.width; j++) {
+				memcpy(out, in, 4);
+				out[kAIndex] = 0xFF;
+				out += 4;
+				in += args.inStep;
+			}
+		} else {
+			memcpy(out, in, args.width * 4);
+			for (uint32 j = 0; j < args.width; j++) {
+				out[kAIndex] = 0xFF;
+				out += 4;
+			}
 		}
-		outo += dstPitch;
-		ino += inoStep;
-	}
-}
-
-void binaryBlendBlit(byte *dst, const byte *src,
-					 const uint dstPitch, const uint srcPitch,
-					 const int posX, const int posY,
-					 const uint width, const uint height,
-					 const uint32 colorMod, const uint flipping) {
-	int xp = 0, yp = 0;
-
-	int inStep = 4;
-	int inoStep = srcPitch;
-	if (flipping & FLIP_H) {
-		inStep = -inStep;
-		xp = width - 1;
-	}
 
-	if (flipping & FLIP_V) {
-		inoStep = -inoStep;
-		yp = height - 1;
+		if (doscale)
+			scaleYCtr += args.scaleY;
+		else
+			args.ino += args.inoStep;
+		args.outo += args.dstPitch;
 	}
+}
 
-	const byte *ino = src + yp * srcPitch + xp * 4;
-	byte *outo = dst + posY * dstPitch + posX * 4;
-
+template<bool doscale>
+void doBlitBinaryBlendLogic(BlendingSetupArgs &args) {
 	const byte *in;
 	byte *out;
 
-	for (uint32 i = 0; i < height; i++) {
-		out = outo;
-		in = ino;
-		for (uint32 j = 0; j < width; j++) {
+	int scaleXCtr, scaleYCtr = 0;
+	const byte *inBase;
+
+	for (uint32 i = 0; i < args.height; i++) {
+		if (doscale) {
+			inBase = args.ino + scaleYCtr / BLEND_BLIT_SCALE_THRESHOLD * args.inoStep;
+			scaleXCtr = 0;
+		} else {
+			in = args.ino;
+		}
+		out = args.outo;
+		for (uint32 j = 0; j < args.width; j++) {
+			if (doscale) {
+				in = inBase + scaleXCtr / BLEND_BLIT_SCALE_THRESHOLD * args.inStep;
+			}
 			uint32 pix = *(const uint32 *)in;
 			int a = in[kAIndex];
 
@@ -619,11 +560,65 @@ void binaryBlendBlit(byte *dst, const byte *src,
 				*(uint32 *)out = pix;
 				out[kAIndex] = 0xFF;
 			}
+			if (doscale)
+				scaleXCtr += args.scaleX;
+			else
+				in += args.inStep;
 			out += 4;
-			in += inStep;
 		}
-		outo += dstPitch;
-		ino += inoStep;
+		if (doscale)
+			scaleYCtr += args.scaleY;
+		else
+			args.ino += args.inoStep;
+		args.outo += args.dstPitch;
+	}
+}
+
+// Only blits to and from 32bpp images
+void blendBlitUnfiltered(byte *dst, const byte *src,
+					 const uint dstPitch, const uint srcPitch,
+					 const int posX, const int posY,
+					 const uint width, const uint height,
+					 const int scaleX, const int scaleY,
+					 const uint32 colorMod, const uint flipping,
+					 const TSpriteBlendMode blendMode,
+					 const AlphaType alphaType) {
+	if (width == 0 || height == 0) return;
+	BlendingSetupArgs args(dst, src, dstPitch, srcPitch, posX, posY, width, height, scaleX, scaleY, colorMod, flipping);
+	if (scaleX == BLEND_BLIT_SCALE_THRESHOLD && scaleY == BLEND_BLIT_SCALE_THRESHOLD) {
+		if (colorMod == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_OPAQUE) {
+			doBlitOpaqueBlendLogic<false>(args);
+		} else if (colorMod == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_BINARY) {
+			doBlitBinaryBlendLogic<false>(args);
+		} else {
+			if (blendMode == BLEND_ADDITIVE) {
+				doBlitAdditiveBlendLogic<false>(args);
+			} else if (blendMode == BLEND_SUBTRACTIVE) {
+				doBlitSubtractiveBlendLogic<false>(args);
+			} else if (blendMode == BLEND_MULTIPLY) {
+				doBlitMultiplyBlendLogic<false>(args);
+			} else {
+				assert(blendMode == BLEND_NORMAL);
+				doBlitAlphaBlendLogic<false>(args);
+			}
+		}
+	} else {
+		if (colorMod == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_OPAQUE) {
+			doBlitOpaqueBlendLogic<true>(args);
+		} else if (colorMod == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_BINARY) {
+			doBlitBinaryBlendLogic<true>(args);
+		} else {
+			if (blendMode == BLEND_ADDITIVE) {
+				doBlitAdditiveBlendLogic<true>(args);
+			} else if (blendMode == BLEND_SUBTRACTIVE) {
+				doBlitSubtractiveBlendLogic<true>(args);
+			} else if (blendMode == BLEND_MULTIPLY) {
+				doBlitMultiplyBlendLogic<true>(args);
+			} else {
+				assert(blendMode == BLEND_NORMAL);
+				doBlitAlphaBlendLogic<true>(args);
+			}
+		}
 	}
 }
 
diff --git a/graphics/blit.h b/graphics/blit.h
index 945156cd144..0309f717244 100644
--- a/graphics/blit.h
+++ b/graphics/blit.h
@@ -191,45 +191,27 @@ bool setAlpha(byte *dst, const byte *src,
               const Graphics::PixelFormat &format,
               const bool skipTransparent, const uint8 alpha);
 
-void opaqueBlendBlit(byte *dst, const byte *src,
-					 const uint dstPitch, const uint srcPitch,
-					 const int posX, const int posY,
-					 const uint width, const uint height,
-					 const uint32 colorMod = 0, const uint flipping = FLIP_NONE);
+static const int BLEND_BLIT_SCALE_THRESHOLD = 0x100;
 
-void binaryBlendBlit(byte *dst, const byte *src,
+/**
+ * Unfiltered blend-blit between 32bpp images; the source is scaled when scaleX or scaleY differs from BLEND_BLIT_SCALE_THRESHOLD
+ * @param dst a pointer to the output surface; writing starts at pixel (posX, posY) inside it
+ * @param src a pointer to the input pixels (top-left of the region to read)
+ * @param dstPitch pitch of the output surface - that is, width in bytes of every row, usually bpp * width of the TARGET surface (the area we are blitting to might be smaller, do the math)
+ * @param srcPitch width in bytes of every row on the *input* surface
+ * @param width,height size in pixels of the area written on the output surface
+ * @param scaleX,scaleY input step per output pixel in units of 1/BLEND_BLIT_SCALE_THRESHOLD (BLEND_BLIT_SCALE_THRESHOLD itself means no scaling)
+ * @param colorMod colormod packed as by TS_ARGB() - 0xFFFFFFFF for no colormod
+ * @param flipping FLIP_H / FLIP_V flags; blendMode and alphaType choose which blending routine is run
+ */
+void blendBlitUnfiltered(byte *dst, const byte *src,
 					 const uint dstPitch, const uint srcPitch,
 					 const int posX, const int posY,
 					 const uint width, const uint height,
-					 const uint32 colorMod = 0, const uint flipping = FLIP_NONE);
-
-// Only blits to and from 32bpp images
-void multiplyBlendBlit(byte *dst, const byte *src,
-					   const uint dstPitch, const uint srcPitch,
-					   const int posX, const int posY,
-					   const uint width, const uint height,
-					   const uint32 colorMod = 0, const uint flipping = FLIP_NONE);
-
-// Only blits to and from 32bpp images
-void subtractiveBlendBlit(byte *dst, const byte *src,
-						  const uint dstPitch, const uint srcPitch,
-						  const int posX, const int posY,
-						  const uint width, const uint height,
-						  const uint32 colorMod = 0, const uint flipping = FLIP_NONE);
-
-// Only blits to and from 32bpp images
-void additiveBlendBlit(byte *dst, const byte *src,
-					   const uint dstPitch, const uint srcPitch,
-					   const int posX, const int posY,
-					   const uint width, const uint height,
-					   const uint32 colorMod = 0, const uint flipping = FLIP_NONE);
-
-// Only blits to and from 32bpp images
-void alphaBlendBlit(byte *dst, const byte *src,
-					const uint dstPitch, const uint srcPitch,
-					const int posX, const int posY,
-					const uint width, const uint height,
-					const uint32 colorMod = 0, const uint flipping = FLIP_NONE);
+					 const int scaleX, const int scaleY,
+					 const uint32 colorMod = 0, const uint flipping = FLIP_NONE,
+					 const TSpriteBlendMode blendMode = BLEND_NORMAL,
+					 const AlphaType alphaType = ALPHA_FULL);
 
 /** @} */
 } // End of namespace Graphics
diff --git a/graphics/managed_surface.cpp b/graphics/managed_surface.cpp
index 0e34b413692..07d5afeeb6e 100644
--- a/graphics/managed_surface.cpp
+++ b/graphics/managed_surface.cpp
@@ -731,103 +731,65 @@ void ManagedSurface::transBlitFromInner(const Surface &src, const Common::Rect &
 
 Common::Rect ManagedSurface::blendBlitFrom(const ManagedSurface &src, const Common::Rect &srcRect,
 										   const Common::Rect &destRect, int flipping,
-										   uint32 colorMod,
-										   TSpriteBlendMode blend, int alphaType) {
+										   const uint32 colorMod,
+										   const TSpriteBlendMode blend,
+										   const AlphaType alphaType) {
 	Common::Rect srcArea = srcRect, dstArea = destRect;
-	if (src.format != getSupportedBlendBlitPixelFormat() ||
-		format != getSupportedBlendBlitPixelFormat() ||
-		(colorMod & BLENDBLIT_RGB(0, 0, 0)) == 0) {
+	if (format != getSupportedBlendBlitPixelFormat() || src.format != getSupportedBlendBlitPixelFormat()) {
+		warning("ManagedSurface::blendBlitFrom only accepts RGBA32!");
 		return Common::Rect(0, 0, 0, 0);
 	}
 
-	if (flipping & FLIP_H) {
-		srcArea.left = src.w - srcArea.right;
-	}
+	// Alpha is zero
+	if ((colorMod & TS_ARGB(255, 0, 0, 0)) == 0) return Common::Rect(0, 0, 0, 0);
 
-	if (flipping & FLIP_V) {
-		srcArea.top = src.h - srcArea.bottom;
-	}
+	const int scaleX = BLEND_BLIT_SCALE_THRESHOLD * srcArea.width() / dstArea.width();
+	const int scaleY = BLEND_BLIT_SCALE_THRESHOLD * srcArea.height() / dstArea.height();
 
 	if (dstArea.left < 0) {
-		srcArea.left += -dstArea.left;
+		srcArea.left += -dstArea.left * scaleX / BLEND_BLIT_SCALE_THRESHOLD;
 		dstArea.left = 0;
 	}
 
 	if (dstArea.top < 0) {
-		srcArea.top += -dstArea.top;
+		srcArea.top += -dstArea.top * scaleY / BLEND_BLIT_SCALE_THRESHOLD;
 		dstArea.top = 0;
 	}
 
 	if (dstArea.right > w) {
-		srcArea.right -= dstArea.right - w;
+		srcArea.right -= (dstArea.right - w) * scaleX / BLEND_BLIT_SCALE_THRESHOLD;
 		dstArea.right = w;
 	}
 
 	if (dstArea.bottom > h) {
-		srcArea.bottom -= dstArea.bottom - h;
+		srcArea.bottom -= (dstArea.bottom - h) * scaleY / BLEND_BLIT_SCALE_THRESHOLD;
 		dstArea.bottom = h;
 	}
 
+	if (flipping & FLIP_H) {
+		int tmp_w = srcArea.width();
+		srcArea.left = src.w - srcArea.right;
+		srcArea.right = srcArea.left + tmp_w;
+	}
+
+	if (flipping & FLIP_V) {
+		int tmp_h = srcArea.height();
+		srcArea.top = src.h - srcArea.bottom;
+		srcArea.bottom = srcArea.top + tmp_h;
+	}
+
 	if (!dstArea.isEmpty() && !srcArea.isEmpty()) {
-		if (dstArea.width() != srcArea.width() || dstArea.height() != srcArea.height()) {
-			return Common::Rect(0, 0, dstArea.width(), dstArea.height());
-		}
-		if (colorMod == 0xffffffff && blend == BLEND_NORMAL && alphaType == ALPHA_OPAQUE) {
-			Graphics::opaqueBlendBlit(
-				(byte *)getBasePtr(0, 0),
-				(const byte *)src.getBasePtr(srcArea.left, srcArea.top),
-				pitch, src.pitch,
-				dstArea.left, dstArea.top,
-				dstArea.width(), dstArea.height(),
-				colorMod, flipping);
-		} else if (colorMod == 0xffffffff && blend == BLEND_NORMAL && alphaType == ALPHA_BINARY) {
-			Graphics::binaryBlendBlit(
-				(byte *)getBasePtr(0, 0),
-				(const byte *)src.getBasePtr(srcArea.left, srcArea.top),
-				pitch, src.pitch,
-				dstArea.left, dstArea.top,
-				dstArea.width(), dstArea.height(),
-				colorMod, flipping);
-		} else {
-			if (blend == BLEND_ADDITIVE) {
-				Graphics::additiveBlendBlit(
-					(byte *)getBasePtr(0, 0),
-				(const byte *)src.getBasePtr(srcArea.left, srcArea.top),
-					pitch, src.pitch,
-					dstArea.left, dstArea.top,
-					dstArea.width(), dstArea.height(),
-					colorMod, flipping);
-			} else if (blend == BLEND_SUBTRACTIVE) {
-				Graphics::subtractiveBlendBlit(
-					(byte *)getBasePtr(0, 0),
-				(const byte *)src.getBasePtr(srcArea.left, srcArea.top),
-					pitch, src.pitch,
-					dstArea.left, dstArea.top,
-					dstArea.width(), dstArea.height(),
-					colorMod, flipping);
-			} else if (blend == BLEND_MULTIPLY) {
-				Graphics::multiplyBlendBlit(
-					(byte *)getBasePtr(0, 0),
-				(const byte *)src.getBasePtr(srcArea.left, srcArea.top),
-					pitch, src.pitch,
-					dstArea.left, dstArea.top,
-					dstArea.width(), dstArea.height(),
-					colorMod, flipping);
-			} else {
-				assert(blend == BLEND_NORMAL);
-				Graphics::alphaBlendBlit(
-					(byte *)getBasePtr(0, 0),
-				(const byte *)src.getBasePtr(srcArea.left, srcArea.top),
-					pitch, src.pitch,
-					dstArea.left, dstArea.top,
-					dstArea.width(), dstArea.height(),
-					colorMod, flipping);
-			}
-		}
-		return Common::Rect(0, 0, dstArea.width(), dstArea.height());
-	} else {
-		return Common::Rect(0, 0, 0, 0);
+		blendBlitUnfiltered(
+			(byte *)getBasePtr(0, 0),
+			(const byte *)src.getBasePtr(srcArea.left, srcArea.top),
+			pitch, src.pitch,
+			dstArea.left, dstArea.top,
+			dstArea.width(), dstArea.height(),
+			scaleX, scaleY,
+			colorMod, flipping,
+			blend, alphaType);
 	}
+	return Common::Rect(0, 0, dstArea.width(), dstArea.height());
 }
 
 void ManagedSurface::markAllDirty() {
diff --git a/graphics/managed_surface.h b/graphics/managed_surface.h
index 0e2f2de1238..3f2f81beac7 100644
--- a/graphics/managed_surface.h
+++ b/graphics/managed_surface.h
@@ -30,9 +30,6 @@
 
 namespace Graphics {
 
-#define BLENDBLIT_RGB(R,G,B)       (uint32)(((R) << 24) | ((G) << 16) | ((B) << 8) | 0xff)
-#define BLENDBLIT_ARGB(A,R,G,B)    (uint32)(((R) << 24) | ((G) << 16) | ((B) << 8) | (A))
-
 /**
  * @defgroup graphics_managed_surface Managed surface
  * @ingroup graphics
@@ -571,9 +568,9 @@ public:
 	 */
 	Common::Rect blendBlitFrom(const ManagedSurface &src, const Common::Rect &srcRect,
 							   const Common::Rect &destRect, int flipping = FLIP_NONE,
-							   uint32 colorMod = BLENDBLIT_ARGB(255, 255, 255, 255),
-							   TSpriteBlendMode blend = BLEND_NORMAL,
-							   int alphaType = ALPHA_FULL);
+							   const uint32 colorMod = TS_ARGB(255, 255, 255, 255),
+							   const TSpriteBlendMode blend = BLEND_NORMAL,
+							   const AlphaType alphaType = ALPHA_FULL);
 
 	/**
 	 * Clear the entire surface.
diff --git a/graphics/transform_struct.h b/graphics/transform_struct.h
index 35e5d9cde10..6fd310eba34 100644
--- a/graphics/transform_struct.h
+++ b/graphics/transform_struct.h
@@ -24,6 +24,9 @@
 
 #include "common/rect.h"
 
+#define TS_RGB(R,G,B)       (uint32)(((R) << 24) | ((G) << 16) | ((B) << 8) | 0xff)
+#define TS_ARGB(A,R,G,B)    (uint32)(((R) << 24) | ((G) << 16) | ((B) << 8) | (A))
+
 namespace Graphics {
 
 enum TSpriteBlendMode {
@@ -35,6 +38,12 @@ enum TSpriteBlendMode {
 	NUM_BLEND_MODES
 };
 
+enum AlphaType {
+	ALPHA_OPAQUE = 0,
+	ALPHA_BINARY = 1,
+	ALPHA_FULL = 2
+};
+
 /**
  @brief The possible flipping parameters for the blit method.
  */
diff --git a/graphics/transparent_surface.cpp b/graphics/transparent_surface.cpp
index 514085db19f..589a3a0c797 100644
--- a/graphics/transparent_surface.cpp
+++ b/graphics/transparent_surface.cpp
@@ -19,8 +19,6 @@
  *
  */
 
-
-
 #include "common/algorithm.h"
 #include "common/endian.h"
 #include "common/util.h"
@@ -56,30 +54,6 @@ TransparentSurface::TransparentSurface(const Surface &surf, bool copyData) : Sur
 }
 
 Common::Rect TransparentSurface::blit(Graphics::Surface &target, int posX, int posY, int flipping, Common::Rect *pPartRect, uint color, int width, int height, TSpriteBlendMode blendMode) {
-	// TESTING PURPOSES
-	// ManagedSurface s(&target, DisposeAfterUse::NO);
-	// ManagedSurface me(this, DisposeAfterUse::NO);
-	// Common::Rect srcRect(0, 0, me.w, me.h);
-	// if (pPartRect) {
-	// 	srcRect = *pPartRect;
-	// }
-	// if (width == -1) {
-	// 	width = srcRect.width();
-	// }
-	// if (height == -1) {
-	// 	height = srcRect.height();
-	// }
-	// auto rect =  s.blendBlitFrom(me,
-	// 	srcRect,
-	// 	Common::Rect(posX, posY, posX + width, posY + height),
-	// 	flipping,
-	// 	color,
-	// 	blendMode,
-	// 	_alphaMode
-	// );
-	// target.copyFrom(*s.surfacePtr());
-	// return rect;
-
 	Common::Rect retSize;
 	retSize.top = 0;
 	retSize.left = 0;
@@ -116,13 +90,6 @@ Common::Rect TransparentSurface::blit(Graphics::Surface &target, int posX, int p
 		srcImage.pixels = getBasePtr(xOffset, yOffset);
 		srcImage.w = pPartRect->width();
 		srcImage.h = pPartRect->height();
-
-		debug("Blit(%d, %d, %d, [%d, %d, %d, %d], %08x, %d, %d)", posX, posY, flipping,
-			  pPartRect->left,  pPartRect->top, pPartRect->width(), pPartRect->height(), color, width, height);
-	} else {
-
-		debug("Blit(%d, %d, %d, [%d, %d, %d, %d], %08x, %d, %d)", posX, posY, flipping, 0, 0,
-			  srcImage.w, srcImage.h, color, width, height);
 	}
 
 	if (width == -1) {
@@ -178,53 +145,13 @@ Common::Rect TransparentSurface::blit(Graphics::Surface &target, int posX, int p
 
 	// Flip surface
 	if ((img->w > 0) && (img->h > 0)) {
-		if (color == 0xFFFFFFFF && blendMode == BLEND_NORMAL && _alphaMode == ALPHA_OPAQUE) {
-			Graphics::opaqueBlendBlit(
-				(byte *)target.getBasePtr(0, 0),
-				(byte *)img->getBasePtr(0, 0),
-				target.pitch, img->pitch,
-				posX, posY, img->w, img->h,
-				color, flipping);
-		} else if (color == 0xFFFFFFFF && blendMode == BLEND_NORMAL && _alphaMode == ALPHA_BINARY) {
-			Graphics::binaryBlendBlit(
-				(byte *)target.getBasePtr(0, 0),
-				(byte *)img->getBasePtr(0, 0),
-				target.pitch, img->pitch,
-				posX, posY, img->w, img->h,
-				color, flipping);
-		} else {
-			if (blendMode == BLEND_ADDITIVE) {
-				Graphics::additiveBlendBlit(
-					(byte *)target.getBasePtr(0, 0),
-					(byte *)img->getBasePtr(0, 0),
-					target.pitch, img->pitch,
-					posX, posY, img->w, img->h,
-				color, flipping);
-			} else if (blendMode == BLEND_SUBTRACTIVE) {
-				Graphics::subtractiveBlendBlit(
-					(byte *)target.getBasePtr(0, 0),
-					(byte *)img->getBasePtr(0, 0),
-					target.pitch, img->pitch,
-					posX, posY, img->w, img->h,
-				color, flipping);
-			} else if (blendMode == BLEND_MULTIPLY) {
-				Graphics::multiplyBlendBlit(
-					(byte *)target.getBasePtr(0, 0),
-					(byte *)img->getBasePtr(0, 0),
-					target.pitch, img->pitch,
-					posX, posY, img->w, img->h,
-				color, flipping);
-			} else {
-				assert(blendMode == BLEND_NORMAL);
-				Graphics::alphaBlendBlit(
-					(byte *)target.getBasePtr(0, 0),
-					(byte *)img->getBasePtr(0, 0),
-					target.pitch, img->pitch,
-					posX, posY, img->w, img->h,
-				color, flipping);
-			}
-		}
-
+		Graphics::blendBlitUnfiltered(
+			(byte *)target.getBasePtr(0, 0),
+			(byte *)img->getBasePtr(0, 0),
+			target.pitch, img->pitch,
+			posX, posY, img->w, img->h, BLEND_BLIT_SCALE_THRESHOLD, BLEND_BLIT_SCALE_THRESHOLD,
+			color, flipping,
+			blendMode, _alphaMode);
 	}
 
 	retSize.setWidth(img->w);
@@ -276,13 +203,6 @@ Common::Rect TransparentSurface::blitClip(Graphics::Surface &target, Common::Rec
 		srcImage.pixels = getBasePtr(xOffset, yOffset);
 		srcImage.w = pPartRect->width();
 		srcImage.h = pPartRect->height();
-
-		debug(6, "Blit(%d, %d, %d, [%d, %d, %d, %d], %08x, %d, %d)", posX, posY, flipping,
-			pPartRect->left, pPartRect->top, pPartRect->width(), pPartRect->height(), color, width, height);
-	} else {
-
-		debug(6, "Blit(%d, %d, %d, [%d, %d, %d, %d], %08x, %d, %d)", posX, posY, flipping, 0, 0,
-			srcImage.w, srcImage.h, color, width, height);
 	}
 
 	if (width == -1) {
@@ -338,53 +258,13 @@ Common::Rect TransparentSurface::blitClip(Graphics::Surface &target, Common::Rec
 
 	// Flip surface
 	if ((img->w > 0) && (img->h > 0)) {
-		if (color == 0xFFFFFFFF && blendMode == BLEND_NORMAL && _alphaMode == ALPHA_OPAQUE) {
-			Graphics::opaqueBlendBlit(
-				(byte *)target.getBasePtr(0, 0),
-				(byte *)img->getBasePtr(0, 0),
-				target.pitch, img->pitch,
-				posX, posY, img->w, img->h,
-				color, flipping);
-		} else if (color == 0xFFFFFFFF && blendMode == BLEND_NORMAL && _alphaMode == ALPHA_BINARY) {
-			Graphics::binaryBlendBlit(
-				(byte *)target.getBasePtr(0, 0),
-				(byte *)img->getBasePtr(0, 0),
-				target.pitch, img->pitch,
-				posX, posY, img->w, img->h,
-				color, flipping);
-		} else {
-			if (blendMode == BLEND_ADDITIVE) {
-				Graphics::additiveBlendBlit(
-					(byte *)target.getBasePtr(0, 0),
-					(byte *)img->getBasePtr(0, 0),
-					target.pitch, img->pitch,
-					posX, posY, img->w, img->h,
-				color, flipping);
-			} else if (blendMode == BLEND_SUBTRACTIVE) {
-				Graphics::subtractiveBlendBlit(
-					(byte *)target.getBasePtr(0, 0),
-					(byte *)img->getBasePtr(0, 0),
-					target.pitch, img->pitch,
-					posX, posY, img->w, img->h,
-				color, flipping);
-			} else if (blendMode == BLEND_MULTIPLY) {
-				Graphics::multiplyBlendBlit(
-					(byte *)target.getBasePtr(0, 0),
-					(byte *)img->getBasePtr(0, 0),
-					target.pitch, img->pitch,
-					posX, posY, img->w, img->h,
-				color, flipping);
-			} else {
-				assert(blendMode == BLEND_NORMAL);
-				Graphics::alphaBlendBlit(
-					(byte *)target.getBasePtr(0, 0),
-					(byte *)img->getBasePtr(0, 0),
-					target.pitch, img->pitch,
-					posX, posY, img->w, img->h,
-				color, flipping);
-			}
-		}
-
+		Graphics::blendBlitUnfiltered(
+			(byte *)target.getBasePtr(0, 0),
+			(byte *)img->getBasePtr(0, 0),
+			target.pitch, img->pitch,
+			posX, posY, img->w, img->h, BLEND_BLIT_SCALE_THRESHOLD, BLEND_BLIT_SCALE_THRESHOLD,
+			color, flipping,
+			blendMode, _alphaMode);
 	}
 
 	retSize.setWidth(img->w);
diff --git a/graphics/transparent_surface.h b/graphics/transparent_surface.h
index 9a4f7d644b6..f23920f8b38 100644
--- a/graphics/transparent_surface.h
+++ b/graphics/transparent_surface.h
@@ -34,9 +34,6 @@
  *
  */
 
-#define TS_RGB(R,G,B)       (uint32)(((R) << 24) | ((G) << 16) | ((B) << 8) | 0xff)
-#define TS_ARGB(A,R,G,B)    (uint32)(((R) << 24) | ((G) << 16) | ((B) << 8) | (A))
-
 namespace Graphics {
 
 /**
@@ -48,13 +45,6 @@ namespace Graphics {
  * @{
  */
 
-// Enums
-enum AlphaType {
-	ALPHA_OPAQUE = 0,
-	ALPHA_BINARY = 1,
-	ALPHA_FULL = 2
-};
-
 /**
  * A transparent graphics surface, which implements alpha blitting.
  */


Commit: d039a8570ef6f990dc479416bb5b89fe26221c2c
    https://github.com/scummvm/scummvm/commit/d039a8570ef6f990dc479416bb5b89fe26221c2c
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
TEST: Add test for ManagedSurface::blendBlitFrom

Changed paths:
  A test/image/blending.h


diff --git a/test/image/blending.h b/test/image/blending.h
new file mode 100644
index 00000000000..bffaa2b25f1
--- /dev/null
+++ b/test/image/blending.h
@@ -0,0 +1,1023 @@
+#include <cxxtest/TestSuite.h>
+
+#if defined(HAVE_CONFIG_H)
+#include "config.h"
+#endif
+
+#include "common/fs.h"
+#include "common/stream.h"
+
+#include "graphics/surface.h"
+#include "graphics/managed_surface.h"
+#include "graphics/transparent_surface.h"
+
+#include "common/algorithm.h"
+#include "common/endian.h"
+#include "common/util.h"
+#include "common/rect.h"
+#include "common/math.h"
+#include "common/textconsole.h"
+#include "graphics/blit.h"
+#include "graphics/primitives.h"
+#include "graphics/transparent_surface.h"
+#include "graphics/transform_tools.h"
+
+namespace OldTransparentSurface {
+
+using namespace Graphics;
+
+struct OldTransparentSurface : public Graphics::Surface {
+	OldTransparentSurface();
+	OldTransparentSurface(const Graphics::Surface &surf, bool copyData = false);
+
+	static PixelFormat getSupportedPixelFormat() {
+		return PixelFormat(4, 8, 8, 8, 8, 24, 16, 8, 0);
+	}
+
+	Common::Rect blit(Graphics::Surface &target, int posX = 0, int posY = 0,
+	                  int flipping = FLIP_NONE,
+	                  Common::Rect *pPartRect = nullptr,
+	                  uint color = TS_ARGB(255, 255, 255, 255),
+	                  int width = -1, int height = -1,
+	                  TSpriteBlendMode blend = BLEND_NORMAL);
+	Common::Rect blitClip(Graphics::Surface &target, Common::Rect clippingArea,
+						int posX = 0, int posY = 0,
+						int flipping = FLIP_NONE,
+						Common::Rect *pPartRect = nullptr,
+						uint color = TS_ARGB(255, 255, 255, 255),
+						int width = -1, int height = -1,
+						TSpriteBlendMode blend = BLEND_NORMAL);
+	OldTransparentSurface *scale(int16 newWidth, int16 newHeight, bool filtering = false) const;
+
+	OldTransparentSurface *rotoscale(const TransformStruct &transform, bool filtering = false) const;
+
+	OldTransparentSurface *convertTo(const PixelFormat &dstFormat, const byte *palette = 0) const;
+
+	float getRatio() {
+		if (!w)
+			return 0;
+
+		return h / (float)w;
+	}
+
+	AlphaType getAlphaMode() const;
+	void setAlphaMode(AlphaType);
+private:
+	AlphaType _alphaMode;
+};
+
+static const int kBModShift = 8;//img->format.bShift;
+static const int kGModShift = 16;//img->format.gShift;
+static const int kRModShift = 24;//img->format.rShift;
+static const int kAModShift = 0;//img->format.aShift;
+
+static const uint32 kBModMask = 0x0000ff00;
+static const uint32 kGModMask = 0x00ff0000;
+static const uint32 kRModMask = 0xff000000;
+static const uint32 kAModMask = 0x000000ff;
+static const uint32 kRGBModMask = (kRModMask | kGModMask | kBModMask);
+
+#ifdef SCUMM_LITTLE_ENDIAN
+static const int kAIndex = 0;
+static const int kBIndex = 1;
+static const int kGIndex = 2;
+static const int kRIndex = 3;
+
+#else
+static const int kAIndex = 3;
+static const int kBIndex = 2;
+static const int kGIndex = 1;
+static const int kRIndex = 0;
+#endif
+
+OldTransparentSurface::OldTransparentSurface() : Surface(), _alphaMode(ALPHA_FULL) {}
+
+OldTransparentSurface::OldTransparentSurface(const Surface &surf, bool copyData) : Surface(), _alphaMode(ALPHA_FULL) {
+	if (copyData) {
+		copyFrom(surf);
+	} else {
+		w = surf.w;
+		h = surf.h;
+		pitch = surf.pitch;
+		format = surf.format;
+		// We need to cast the const qualifier away here because 'pixels'
+		// always needs to be writable. 'surf' however is a constant Surface,
+		// thus getPixels will always return const pixel data.
+		pixels = const_cast<void *>(surf.getPixels());
+	}
+}
+
+/**
+ * Optimized version of doBlit for opaque blitting: every source pixel is copied and the destination alpha is forced to 0xFF (no blending).
+ */
+static void doBlitOpaqueFast(byte *ino, byte *outo, uint32 width, uint32 height, uint32 pitch, int32 inStep, int32 inoStep) {
+
+	byte *in;
+	byte *out;
+
+	for (uint32 i = 0; i < height; i++) {
+		out = outo;
+		in = ino;
+		for (uint32 j = 0; j < width; j++) {
+			memcpy(out, in, 4);
+			out[kAIndex] = 0xFF;
+			out += 4;
+			in += inStep;
+		}
+		outo += pitch;
+		ino += inoStep;
+	}
+}
+
+/**
+ * Optimized version of doBlit for binary-alpha blitting: a source pixel is copied (with alpha forced to 0xFF) when its alpha is non-zero and skipped otherwise; no blending.
+ */
+static void doBlitBinaryFast(byte *ino, byte *outo, uint32 width, uint32 height, uint32 pitch, int32 inStep, int32 inoStep) {
+
+	byte *in;
+	byte *out;
+
+	for (uint32 i = 0; i < height; i++) {
+		out = outo;
+		in = ino;
+		for (uint32 j = 0; j < width; j++) {
+			uint32 pix = *(uint32 *)in;
+			int a = in[kAIndex];
+
+			if (a != 0) {   // Full opacity (Any value not exactly 0 is Opaque here)
+				*(uint32 *)out = pix;
+				out[kAIndex] = 0xFF;
+			}
+			out += 4;
+			in += inStep;
+		}
+		outo += pitch;
+		ino += inoStep;
+	}
+}
+
+/**
+ * Optimized version of doBlit to be used with alpha blended blitting
+ * @param ino a pointer to the first input pixel to read
+ * @param outo a pointer to the first output pixel to write
+ * @param width width in pixels of the area to blit
+ * @param height height in pixels of the area to blit
+ * @param pitch pitch of the output surface - that is, the width in bytes of every row, usually bpp * width of the TARGET surface (the area we are blitting to might be narrower than a full row)
+ * @param inStep size in bytes to step to reach the next input pixel, usually bpp of the source surface (negative when flipping horizontally)
+ * @param inoStep width in bytes of every row of the *input* surface, i.e. its pitch (negative when flipping vertically)
+ * @param color colormod in 0xRRGGBBAA format (see TS_ARGB) - 0xFFFFFFFF for no colormod
+ */
+template<bool rgbmod, bool alphamod>
+static void doBlitAlphaBlendImpl(byte *ino, byte *outo, uint32 width, uint32 height, uint32 pitch, int32 inStep, int32 inoStep, uint32 color) {
+
+	byte *in;
+	byte *out;
+
+	byte ca = alphamod ? ((color >> kAModShift) & 0xFF) : 255;
+	byte cr = rgbmod   ? ((color >> kRModShift) & 0xFF) : 255;
+	byte cg = rgbmod   ? ((color >> kGModShift) & 0xFF) : 255;
+	byte cb = rgbmod   ? ((color >> kBModShift) & 0xFF) : 255;
+
+	for (uint32 i = 0; i < height; i++) {
+		out = outo;
+		in = ino;
+		for (uint32 j = 0; j < width; j++) {
+
+			uint32 ina = in[kAIndex] * ca >> 8;
+
+			if (ina != 0) {
+				uint outb = (out[kBIndex] * (255 - ina) >> 8);
+				uint outg = (out[kGIndex] * (255 - ina) >> 8);
+				uint outr = (out[kRIndex] * (255 - ina) >> 8);
+
+				out[kAIndex] = 255;
+				out[kBIndex] = outb + (in[kBIndex] * ina * cb >> 16);
+				out[kGIndex] = outg + (in[kGIndex] * ina * cg >> 16);
+				out[kRIndex] = outr + (in[kRIndex] * ina * cr >> 16);
+			}
+
+			in += inStep;
+			out += 4;
+		}
+		outo += pitch;
+		ino += inoStep;
+	}
+}
+
+static void doBlitAlphaBlend(byte *ino, byte *outo, uint32 width, uint32 height, uint32 pitch, int32 inStep, int32 inoStep, uint32 color) {
+
+	bool rgbmod   = ((color & kRGBModMask) != kRGBModMask);
+	bool alphamod = ((color & kAModMask)   != kAModMask);
+
+	if (rgbmod) {
+		if (alphamod) {
+			doBlitAlphaBlendImpl<true, true>(ino, outo, width, height, pitch, inStep, inoStep, color);
+		} else {
+			doBlitAlphaBlendImpl<true, false>(ino, outo, width, height, pitch, inStep, inoStep, color);
+		}
+	} else {
+		if (alphamod) {
+			doBlitAlphaBlendImpl<false, true>(ino, outo, width, height, pitch, inStep, inoStep, color);
+		} else {
+			doBlitAlphaBlendImpl<false, false>(ino, outo, width, height, pitch, inStep, inoStep, color);
+		}
+	}
+}
+
+/**
+ * Optimized version of doBlit to be used with additive blended blitting
+ */
+template<bool rgbmod, bool alphamod>
+static void doBlitAdditiveBlendImpl(byte *ino, byte *outo, uint32 width, uint32 height, uint32 pitch, int32 inStep, int32 inoStep, uint32 color) {
+
+	byte *in;
+	byte *out;
+
+	byte ca = alphamod ? ((color >> kAModShift) & 0xFF) : 255;
+	byte cr = rgbmod   ? ((color >> kRModShift) & 0xFF) : 255;
+	byte cg = rgbmod   ? ((color >> kGModShift) & 0xFF) : 255;
+	byte cb = rgbmod   ? ((color >> kBModShift) & 0xFF) : 255;
+
+	for (uint32 i = 0; i < height; i++) {
+		out = outo;
+		in = ino;
+		for (uint32 j = 0; j < width; j++) {
+
+			uint32 ina = in[kAIndex] * ca >> 8;
+
+			if (ina != 0) {
+				if (cb != 255) {
+					out[kBIndex] = MIN<uint>(out[kBIndex] + ((in[kBIndex] * cb * ina) >> 16), 255u);
+				} else {
+					out[kBIndex] = MIN<uint>(out[kBIndex] + (in[kBIndex] * ina >> 8), 255u);
+				}
+
+				if (cg != 255) {
+					out[kGIndex] = MIN<uint>(out[kGIndex] + ((in[kGIndex] * cg * ina) >> 16), 255u);
+				} else {
+					out[kGIndex] = MIN<uint>(out[kGIndex] + (in[kGIndex] * ina >> 8), 255u);
+				}
+
+				if (cr != 255) {
+					out[kRIndex] = MIN<uint>(out[kRIndex] + ((in[kRIndex] * cr * ina) >> 16), 255u);
+				} else {
+					out[kRIndex] = MIN<uint>(out[kRIndex] + (in[kRIndex] * ina >> 8), 255u);
+				}
+			}
+
+			in += inStep;
+			out += 4;
+		}
+
+		outo += pitch;
+		ino += inoStep;
+	}
+}
+
+static void doBlitAdditiveBlend(byte *ino, byte *outo, uint32 width, uint32 height, uint32 pitch, int32 inStep, int32 inoStep, uint32 color) {
+
+	bool rgbmod   = ((color & kRGBModMask) != kRGBModMask);
+	bool alphamod = ((color & kAModMask)   != kAModMask);
+
+	if (rgbmod) {
+		if (alphamod) {
+			doBlitAdditiveBlendImpl<true, true>(ino, outo, width, height, pitch, inStep, inoStep, color);
+		} else {
+			doBlitAdditiveBlendImpl<true, false>(ino, outo, width, height, pitch, inStep, inoStep, color);
+		}
+	} else {
+		if (alphamod) {
+			doBlitAdditiveBlendImpl<false, true>(ino, outo, width, height, pitch, inStep, inoStep, color);
+		} else {
+			doBlitAdditiveBlendImpl<false, false>(ino, outo, width, height, pitch, inStep, inoStep, color);
+		}
+	}
+}
+
+/**
+ * Optimized version of doBlit to be used with subtractive blended blitting
+ */
+template<bool rgbmod>
+static void doBlitSubtractiveBlendImpl(byte *ino, byte *outo, uint32 width, uint32 height, uint32 pitch, int32 inStep, int32 inoStep, uint32 color) {
+
+	byte *in;
+	byte *out;
+
+	byte cr = rgbmod   ? ((color >> kRModShift) & 0xFF) : 255;
+	byte cg = rgbmod   ? ((color >> kGModShift) & 0xFF) : 255;
+	byte cb = rgbmod   ? ((color >> kBModShift) & 0xFF) : 255;
+
+	for (uint32 i = 0; i < height; i++) {
+		out = outo;
+		in = ino;
+		for (uint32 j = 0; j < width; j++) {
+
+			out[kAIndex] = 255;
+			if (cb != 255) {
+				out[kBIndex] = MAX(out[kBIndex] - ((in[kBIndex] * cb  * (out[kBIndex]) * in[kAIndex]) >> 24), 0);
+			} else {
+				out[kBIndex] = MAX(out[kBIndex] - (in[kBIndex] * (out[kBIndex]) * in[kAIndex] >> 16), 0);
+			}
+
+			if (cg != 255) {
+				out[kGIndex] = MAX(out[kGIndex] - ((in[kGIndex] * cg  * (out[kGIndex]) * in[kAIndex]) >> 24), 0);
+			} else {
+				out[kGIndex] = MAX(out[kGIndex] - (in[kGIndex] * (out[kGIndex]) * in[kAIndex] >> 16), 0);
+			}
+
+			if (cr != 255) {
+				out[kRIndex] = MAX(out[kRIndex] - ((in[kRIndex] * cr * (out[kRIndex]) * in[kAIndex]) >> 24), 0);
+			} else {
+				out[kRIndex] = MAX(out[kRIndex] - (in[kRIndex] * (out[kRIndex]) * in[kAIndex] >> 16), 0);
+			}
+
+			in += inStep;
+			out += 4;
+		}
+		outo += pitch;
+		ino += inoStep;
+	}
+}
+
+static void doBlitSubtractiveBlend(byte *ino, byte *outo, uint32 width, uint32 height, uint32 pitch, int32 inStep, int32 inoStep, uint32 color) {
+
+	bool rgbmod   = ((color & kRGBModMask) != kRGBModMask);
+
+	if (rgbmod) {
+		doBlitSubtractiveBlendImpl<true>(ino, outo, width, height, pitch, inStep, inoStep, color);
+	} else {
+		doBlitSubtractiveBlendImpl<false>(ino, outo, width, height, pitch, inStep, inoStep, color);
+	}
+}
+
+/**
+ * Optimized version of doBlit to be used with multiply blended blitting
+ */
+template<bool rgbmod, bool alphamod>
+static void doBlitMultiplyBlendImpl(byte *ino, byte *outo, uint32 width, uint32 height, uint32 pitch, int32 inStep, int32 inoStep, uint32 color) {
+
+	byte *in;
+	byte *out;
+
+	byte ca = alphamod ? ((color >> kAModShift) & 0xFF) : 255;
+	byte cr = rgbmod   ? ((color >> kRModShift) & 0xFF) : 255;
+	byte cg = rgbmod   ? ((color >> kGModShift) & 0xFF) : 255;
+	byte cb = rgbmod   ? ((color >> kBModShift) & 0xFF) : 255;
+
+	for (uint32 i = 0; i < height; i++) {
+		out = outo;
+		in = ino;
+		for (uint32 j = 0; j < width; j++) {
+
+			uint32 ina = in[kAIndex] * ca >> 8;
+
+			if (ina != 0) {
+				if (cb != 255) {
+					out[kBIndex] = MIN<uint>(out[kBIndex] * ((in[kBIndex] * cb * ina) >> 16) >> 8, 255u);
+				} else {
+					out[kBIndex] = MIN<uint>(out[kBIndex] * (in[kBIndex] * ina >> 8) >> 8, 255u);
+				}
+
+				if (cg != 255) {
+					out[kGIndex] = MIN<uint>(out[kGIndex] * ((in[kGIndex] * cg * ina) >> 16) >> 8, 255u);
+				} else {
+					out[kGIndex] = MIN<uint>(out[kGIndex] * (in[kGIndex] * ina >> 8) >> 8, 255u);
+				}
+
+				if (cr != 255) {
+					out[kRIndex] = MIN<uint>(out[kRIndex] * ((in[kRIndex] * cr * ina) >> 16) >> 8, 255u);
+				} else {
+					out[kRIndex] = MIN<uint>(out[kRIndex] * (in[kRIndex] * ina >> 8) >> 8, 255u);
+				}
+			}
+
+			in += inStep;
+			out += 4;
+		}
+		outo += pitch;
+		ino += inoStep;
+	}
+
+}
+
+static void doBlitMultiplyBlend(byte *ino, byte *outo, uint32 width, uint32 height, uint32 pitch, int32 inStep, int32 inoStep, uint32 color) {
+
+	bool rgbmod   = ((color & kRGBModMask) != kRGBModMask);
+	bool alphamod = ((color & kAModMask)   != kAModMask);
+
+	if (rgbmod) {
+		if (alphamod) {
+			doBlitMultiplyBlendImpl<true, true>(ino, outo, width, height, pitch, inStep, inoStep, color);
+		} else {
+			doBlitMultiplyBlendImpl<true, false>(ino, outo, width, height, pitch, inStep, inoStep, color);
+		}
+	} else {
+		if (alphamod) {
+			doBlitMultiplyBlendImpl<false, true>(ino, outo, width, height, pitch, inStep, inoStep, color);
+		} else {
+			doBlitMultiplyBlendImpl<false, false>(ino, outo, width, height, pitch, inStep, inoStep, color);
+		}
+	}
+}
+
+Common::Rect OldTransparentSurface::blit(Graphics::Surface &target, int posX, int posY, int flipping, Common::Rect *pPartRect, uint color, int width, int height, TSpriteBlendMode blendMode) {
+
+	Common::Rect retSize;
+	retSize.top = 0;
+	retSize.left = 0;
+	retSize.setWidth(0);
+	retSize.setHeight(0);
+	// Check if we need to draw anything at all
+	int ca = (color >> kAModShift) & 0xff;
+
+	if (ca == 0) {
+		return retSize;
+	}
+
+	// Create an encapsulating surface for the data
+	OldTransparentSurface srcImage(*this, false);
+	// TODO: Is the data really in the screen format?
+	if (format.bytesPerPixel != 4) {
+		warning("OldTransparentSurface can only blit 32bpp images, but got %d", format.bytesPerPixel * 8);
+		return retSize;
+	}
+
+	if (pPartRect) {
+
+		int xOffset = pPartRect->left;
+		int yOffset = pPartRect->top;
+
+		if (flipping & FLIP_V) {
+			yOffset = srcImage.h - pPartRect->bottom;
+		}
+
+		if (flipping & FLIP_H) {
+			xOffset = srcImage.w - pPartRect->right;
+		}
+
+		srcImage.pixels = getBasePtr(xOffset, yOffset);
+		srcImage.w = pPartRect->width();
+		srcImage.h = pPartRect->height();
+
+		debug(6, "Blit(%d, %d, %d, [%d, %d, %d, %d], %08x, %d, %d)", posX, posY, flipping,
+			  pPartRect->left,  pPartRect->top, pPartRect->width(), pPartRect->height(), color, width, height);
+	} else {
+
+		debug(6, "Blit(%d, %d, %d, [%d, %d, %d, %d], %08x, %d, %d)", posX, posY, flipping, 0, 0,
+			  srcImage.w, srcImage.h, color, width, height);
+	}
+
+	if (width == -1) {
+		width = srcImage.w;
+	}
+	if (height == -1) {
+		height = srcImage.h;
+	}
+
+#ifdef SCALING_TESTING
+	// Hardcode scaling to 66% to test scaling
+	width = width * 2 / 3;
+	height = height * 2 / 3;
+#endif
+
+	Graphics::Surface *img = nullptr;
+	Graphics::Surface *imgScaled = nullptr;
+	byte *savedPixels = nullptr;
+	if ((width != srcImage.w) || (height != srcImage.h)) {
+		// Scale the image
+		img = imgScaled = srcImage.scale(width, height);
+		savedPixels = (byte *)img->getPixels();
+	} else {
+		img = &srcImage;
+	}
+
+	// Handle off-screen clipping
+	if (posY < 0) {
+		img->h = MAX(0, (int)img->h - -posY);
+		if (!(flipping & FLIP_V))
+			img->setPixels((byte *)img->getBasePtr(0, -posY));
+		posY = 0;
+	}
+
+	if (posX < 0) {
+		img->w = MAX(0, (int)img->w - -posX);
+		if (!(flipping & FLIP_H))
+			img->setPixels((byte *)img->getBasePtr(-posX, 0));
+		posX = 0;
+	}
+
+	if (img->w > target.w - posX) {
+		if (flipping & FLIP_H)
+			img->setPixels((byte *)img->getBasePtr(img->w - target.w + posX, 0));
+		img->w = CLIP((int)img->w, 0, (int)MAX((int)target.w - posX, 0));
+	}
+
+	if (img->h > target.h - posY) {
+		if (flipping & FLIP_V)
+			img->setPixels((byte *)img->getBasePtr(0, img->h - target.h + posY));
+		img->h = CLIP((int)img->h, 0, (int)MAX((int)target.h - posY, 0));
+	}
+
+	// Flip surface
+	if ((img->w > 0) && (img->h > 0)) {
+		int xp = 0, yp = 0;
+
+		int inStep = 4;
+		int inoStep = img->pitch;
+		if (flipping & FLIP_H) {
+			inStep = -inStep;
+			xp = img->w - 1;
+		}
+
+		if (flipping & FLIP_V) {
+			inoStep = -inoStep;
+			yp = img->h - 1;
+		}
+
+		byte *ino = (byte *)img->getBasePtr(xp, yp);
+		byte *outo = (byte *)target.getBasePtr(posX, posY);
+
+		if (color == 0xFFFFFFFF && blendMode == BLEND_NORMAL && _alphaMode == ALPHA_OPAQUE) {
+			doBlitOpaqueFast(ino, outo, img->w, img->h, target.pitch, inStep, inoStep);
+		} else if (color == 0xFFFFFFFF && blendMode == BLEND_NORMAL && _alphaMode == ALPHA_BINARY) {
+			doBlitBinaryFast(ino, outo, img->w, img->h, target.pitch, inStep, inoStep);
+		} else {
+			if (blendMode == BLEND_ADDITIVE) {
+				doBlitAdditiveBlend(ino, outo, img->w, img->h, target.pitch, inStep, inoStep, color);
+			} else if (blendMode == BLEND_SUBTRACTIVE) {
+				doBlitSubtractiveBlend(ino, outo, img->w, img->h, target.pitch, inStep, inoStep, color);
+			} else if (blendMode == BLEND_MULTIPLY) {
+				doBlitMultiplyBlend(ino, outo, img->w, img->h, target.pitch, inStep, inoStep, color);
+			} else {
+				assert(blendMode == BLEND_NORMAL);
+				doBlitAlphaBlend(ino, outo, img->w, img->h, target.pitch, inStep, inoStep, color);
+			}
+		}
+
+	}
+
+	retSize.setWidth(img->w);
+	retSize.setHeight(img->h);
+
+	if (imgScaled) {
+		imgScaled->setPixels(savedPixels);
+		imgScaled->free();
+		delete imgScaled;
+	}
+
+	return retSize;
+}
+
+Common::Rect OldTransparentSurface::blitClip(Graphics::Surface &target, Common::Rect clippingArea, int posX, int posY, int flipping, Common::Rect *pPartRect, uint color, int width, int height, TSpriteBlendMode blendMode) {
+	Common::Rect retSize;
+	retSize.top = 0;
+	retSize.left = 0;
+	retSize.setWidth(0);
+	retSize.setHeight(0);
+	// Check if we need to draw anything at all
+	int ca = (color >> kAModShift) & 0xff;
+
+	if (ca == 0) {
+		return retSize;
+	}
+
+	// Create an encapsulating surface for the data
+	OldTransparentSurface srcImage(*this, false);
+	// TODO: Is the data really in the screen format?
+	if (format.bytesPerPixel != 4) {
+		warning("OldTransparentSurface can only blit 32bpp images, but got %d", format.bytesPerPixel * 8);
+		return retSize;
+	}
+
+	if (pPartRect) {
+
+		int xOffset = pPartRect->left;
+		int yOffset = pPartRect->top;
+
+		if (flipping & FLIP_V) {
+			yOffset = srcImage.h - pPartRect->bottom;
+		}
+
+		if (flipping & FLIP_H) {
+			xOffset = srcImage.w - pPartRect->right;
+		}
+
+		srcImage.pixels = getBasePtr(xOffset, yOffset);
+		srcImage.w = pPartRect->width();
+		srcImage.h = pPartRect->height();
+
+		debug(6, "Blit(%d, %d, %d, [%d, %d, %d, %d], %08x, %d, %d)", posX, posY, flipping,
+			pPartRect->left, pPartRect->top, pPartRect->width(), pPartRect->height(), color, width, height);
+	} else {
+
+		debug(6, "Blit(%d, %d, %d, [%d, %d, %d, %d], %08x, %d, %d)", posX, posY, flipping, 0, 0,
+			srcImage.w, srcImage.h, color, width, height);
+	}
+
+	if (width == -1) {
+		width = srcImage.w;
+	}
+	if (height == -1) {
+		height = srcImage.h;
+	}
+
+#ifdef SCALING_TESTING
+	// Hardcode scaling to 66% to test scaling
+	width = width * 2 / 3;
+	height = height * 2 / 3;
+#endif
+
+	Graphics::Surface *img = nullptr;
+	Graphics::Surface *imgScaled = nullptr;
+	byte *savedPixels = nullptr;
+	if ((width != srcImage.w) || (height != srcImage.h)) {
+		// Scale the image
+		img = imgScaled = srcImage.scale(width, height);
+		savedPixels = (byte *)img->getPixels();
+	} else {
+		img = &srcImage;
+	}
+
+	// Handle off-screen clipping
+	if (posY < clippingArea.top) {
+		img->h = MAX(0, (int)img->h - (clippingArea.top - posY));
+		if (!(flipping & FLIP_V))
+			img->setPixels((byte *)img->getBasePtr(0, clippingArea.top - posY));
+		posY = clippingArea.top;
+	}
+
+	if (posX < clippingArea.left) {
+		img->w = MAX(0, (int)img->w - (clippingArea.left - posX));
+		if (!(flipping & FLIP_H))
+			img->setPixels((byte *)img->getBasePtr(clippingArea.left - posX, 0));
+		posX = clippingArea.left;
+	}
+
+	if (img->w > clippingArea.right - posX) {
+		if (flipping & FLIP_H)
+			img->setPixels((byte *)img->getBasePtr(img->w - clippingArea.right + posX, 0));
+		img->w = CLIP((int)img->w, 0, (int)MAX((int)clippingArea.right - posX, 0));
+	}
+
+	if (img->h > clippingArea.bottom - posY) {
+		if (flipping & FLIP_V)
+			img->setPixels((byte *)img->getBasePtr(0, img->h - clippingArea.bottom + posY));
+		img->h = CLIP((int)img->h, 0, (int)MAX((int)clippingArea.bottom - posY, 0));
+	}
+
+	// Flip surface
+	if ((img->w > 0) && (img->h > 0)) {
+		int xp = 0, yp = 0;
+
+		int inStep = 4;
+		int inoStep = img->pitch;
+		if (flipping & FLIP_H) {
+			inStep = -inStep;
+			xp = img->w - 1;
+		}
+
+		if (flipping & FLIP_V) {
+			inoStep = -inoStep;
+			yp = img->h - 1;
+		}
+
+		byte *ino = (byte *)img->getBasePtr(xp, yp);
+		byte *outo = (byte *)target.getBasePtr(posX, posY);
+
+		if (color == 0xFFFFFFFF && blendMode == BLEND_NORMAL && _alphaMode == ALPHA_OPAQUE) {
+			doBlitOpaqueFast(ino, outo, img->w, img->h, target.pitch, inStep, inoStep);
+		} else if (color == 0xFFFFFFFF && blendMode == BLEND_NORMAL && _alphaMode == ALPHA_BINARY) {
+			doBlitBinaryFast(ino, outo, img->w, img->h, target.pitch, inStep, inoStep);
+		} else {
+			if (blendMode == BLEND_ADDITIVE) {
+				doBlitAdditiveBlend(ino, outo, img->w, img->h, target.pitch, inStep, inoStep, color);
+			} else if (blendMode == BLEND_SUBTRACTIVE) {
+				doBlitSubtractiveBlend(ino, outo, img->w, img->h, target.pitch, inStep, inoStep, color);
+			} else if (blendMode == BLEND_MULTIPLY) {
+				doBlitMultiplyBlend(ino, outo, img->w, img->h, target.pitch, inStep, inoStep, color);
+			} else {
+				assert(blendMode == BLEND_NORMAL);
+				doBlitAlphaBlend(ino, outo, img->w, img->h, target.pitch, inStep, inoStep, color);
+			}
+		}
+
+	}
+
+	retSize.setWidth(img->w);
+	retSize.setHeight(img->h);
+
+	if (imgScaled) {
+		imgScaled->setPixels(savedPixels);
+		imgScaled->free();
+		delete imgScaled;
+	}
+
+	return retSize;
+}
+
+AlphaType OldTransparentSurface::getAlphaMode() const {
+	return _alphaMode;
+}
+
+void OldTransparentSurface::setAlphaMode(AlphaType mode) {
+	_alphaMode = mode;
+}
+
+OldTransparentSurface *OldTransparentSurface::scale(int16 newWidth, int16 newHeight, bool filtering) const {
+
+	OldTransparentSurface *target = new OldTransparentSurface();
+
+	target->create(newWidth, newHeight, format);
+
+	if (filtering) {
+		scaleBlitBilinear((byte *)target->getPixels(), (const byte *)getPixels(), target->pitch, pitch, target->w, target->h, w, h, format);
+	} else {
+		scaleBlit((byte *)target->getPixels(), (const byte *)getPixels(), target->pitch, pitch, target->w, target->h, w, h, format);
+	}
+
+	return target;
+}
+
+OldTransparentSurface *OldTransparentSurface::rotoscale(const TransformStruct &transform, bool filtering) const {
+
+	Common::Point newHotspot;
+	Common::Rect rect = TransformTools::newRect(Common::Rect((int16)w, (int16)h), transform, &newHotspot);
+
+	OldTransparentSurface *target = new OldTransparentSurface();
+
+	target->create((uint16)rect.right - rect.left, (uint16)rect.bottom - rect.top, this->format);
+
+	if (filtering) {
+		rotoscaleBlitBilinear((byte *)target->getPixels(), (const byte *)getPixels(), target->pitch, pitch, target->w, target->h, w, h, format, transform, newHotspot);
+	} else {
+		rotoscaleBlit((byte *)target->getPixels(), (const byte *)getPixels(), target->pitch, pitch, target->w, target->h, w, h, format, transform, newHotspot);
+	}
+
+	return target;
+}
+
+OldTransparentSurface *OldTransparentSurface::convertTo(const PixelFormat &dstFormat, const byte *palette) const {
+	assert(pixels);
+
+	OldTransparentSurface *surface = new OldTransparentSurface();
+
+	// If the target format is the same, just copy
+	if (format == dstFormat) {
+		surface->copyFrom(*this);
+		return surface;
+	}
+
+	if (format.bytesPerPixel == 0 || format.bytesPerPixel > 4)
+		error("Surface::convertTo(): Can only convert from 1Bpp, 2Bpp, 3Bpp, and 4Bpp");
+
+	if (dstFormat.bytesPerPixel != 2 && dstFormat.bytesPerPixel != 4)
+		error("Surface::convertTo(): Can only convert to 2Bpp and 4Bpp");
+
+	surface->create(w, h, dstFormat);
+
+	if (format.bytesPerPixel == 1) {
+		// Converting from paletted to high color
+		assert(palette);
+
+		for (int y = 0; y < h; y++) {
+			const byte *srcRow = (const byte *)getBasePtr(0, y);
+			byte *dstRow = (byte *)surface->getBasePtr(0, y);
+
+			for (int x = 0; x < w; x++) {
+				byte index = *srcRow++;
+				byte r = palette[index * 3];
+				byte g = palette[index * 3 + 1];
+				byte b = palette[index * 3 + 2];
+
+				uint32 color = dstFormat.RGBToColor(r, g, b);
+
+				if (dstFormat.bytesPerPixel == 2)
+					*((uint16 *)dstRow) = color;
+				else
+					*((uint32 *)dstRow) = color;
+
+				dstRow += dstFormat.bytesPerPixel;
+			}
+		}
+	} else {
+		// Converting from high color to high color
+		for (int y = 0; y < h; y++) {
+			const byte *srcRow = (const byte *)getBasePtr(0, y);
+			byte *dstRow = (byte *)surface->getBasePtr(0, y);
+
+			for (int x = 0; x < w; x++) {
+				uint32 srcColor;
+				if (format.bytesPerPixel == 2)
+					srcColor = READ_UINT16(srcRow);
+				else if (format.bytesPerPixel == 3)
+					srcColor = READ_UINT24(srcRow);
+				else
+					srcColor = READ_UINT32(srcRow);
+
+				srcRow += format.bytesPerPixel;
+
+				// Convert that color to the new format
+				byte r, g, b, a;
+				format.colorToARGB(srcColor, a, r, g, b);
+				uint32 color = dstFormat.ARGBToColor(a, r, g, b);
+
+				if (dstFormat.bytesPerPixel == 2)
+					*((uint16 *)dstRow) = color;
+				else
+					*((uint32 *)dstRow) = color;
+
+				dstRow += dstFormat.bytesPerPixel;
+			}
+		}
+	}
+
+	return surface;
+}
+
+} // namespace OldTransparentSurface
+
+// Debug helper: dumps surf to path as an uncompressed 24-bit BMP; returns 0 if no write stream could be created.
+static int save_bitmap(const char *path, const Graphics::Surface *surf) {
+    Common::FSNode fileNode(path);
+    Common::SeekableWriteStream *out = fileNode.createWriteStream();
+	if (!out) return false; // nothing to write to
+#ifdef SCUMM_LITTLE_ENDIAN
+	const Graphics::PixelFormat requiredFormat_3byte(3, 8, 8, 8, 0, 16, 8, 0, 0);
+#else
+	const Graphics::PixelFormat requiredFormat_3byte(3, 8, 8, 8, 0, 0, 8, 16, 0);
+#endif
+	Graphics::ManagedSurface surface(surf->w, surf->h, requiredFormat_3byte);
+
+	// Copy from the source surface without alpha transparency
+	Graphics::ManagedSurface temp = surf;
+	temp.format.aLoss = 8;
+	surface.rawBlitFrom(temp, Common::Rect(0, 0, surf->w, surf->h), Common::Point(0, 0));
+
+	// Write out the bitmap; each row is padded up to a multiple of 4 bytes
+	int dstPitch = surface.w * 3;
+	int extraDataLength = (dstPitch % 4) ? 4 - (dstPitch % 4) : 0;
+	int padding = 0;
+	out->writeByte('B');
+	out->writeByte('M');
+	out->writeUint32LE(surface.h * (dstPitch + extraDataLength) + 54); // file size: headers + padded rows
+	out->writeUint32LE(0);
+	out->writeUint32LE(54); // offset of the pixel data
+	out->writeUint32LE(40); // size of the info header
+	out->writeUint32LE(surface.w);
+	out->writeUint32LE(surface.h);
+	out->writeUint16LE(1);
+	out->writeUint16LE(24); // bits per pixel
+	out->writeUint32LE(0);
+	out->writeUint32LE(0);
+	out->writeUint32LE(0);
+	out->writeUint32LE(0);
+	out->writeUint32LE(0);
+	out->writeUint32LE(0);
+	for (uint y = surface.h; y-- > 0;) { // rows are emitted bottom-up
+		out->write((const void *)surface.getBasePtr(0, y), dstPitch);
+		out->write(&padding, extraDataLength);
+	}
+	out->finalize(); // flush before releasing the stream
+	delete out;
+	return true;
+}
+
+static bool areSurfacesEqual(const Graphics::Surface *a, const Graphics::Surface *b) {
+    if (a->w != b->w || a->h != b->h) return false;
+
+    for (int y = 0; y < a->h; y++) {
+        for (int x = 0; x < a->w; x++) {
+            if (a->getPixel(x, y) != b->getPixel(x, y)) return false;
+        }
+    }
+
+    return true;
+}
+
+// Blits one 16x16 source through the old, new and ManagedSurface code paths and requires identical output.
+class BlendBlitUnfilteredTestSuite : public CxxTest::TestSuite {
+public:
+    void test_blend_blit_unfiltered() {
+        Common::Rect dsts[] = {
+            Common::Rect(4, 4, 4+16, 4+16), // Case 0 (source clipping)
+            Common::Rect(24, 20, 24+16, 20+16), // Case 1 (outside of destination)
+            Common::Rect(0, 0, 32, 32), // Case 2 (stretching bigger)
+            Common::Rect(3, 3, 8, 8), // Case 3 (stretching smaller)
+            Common::Rect(8, 4, 8+32, 4+32), // Case 4 (stretching outside of destination)
+            Common::Rect(-4, -4, -4+16, -4+16), // Case 5 (outside of destination 2)
+        }, srcs[] = {
+            Common::Rect(0, 0, 16, 16), // Case 0 (source clipping)
+            Common::Rect(0, 0, 16, 16), // Case 1 (outside of destination)
+            Common::Rect(0, 0, 16, 16), // Case 2 (stretching)
+            Common::Rect(0, 0, 16, 16), // Case 3 (stretching smaller)
+            Common::Rect(0, 0, 16, 16), // Case 4 (stretching outside of destination)
+            Common::Rect(0, 0, 16, 16), // Case 5 (outside of destination 2)
+        };
+
+	    Graphics::Surface baseSurface, destSurface;
+	    baseSurface.create(16, 16, OldTransparentSurface::OldTransparentSurface::getSupportedPixelFormat());
+	    destSurface.create(32, 32, OldTransparentSurface::OldTransparentSurface::getSupportedPixelFormat());
+	    for (int y = 0; y < baseSurface.h; y++) {
+	    	for (int x = 0; x < baseSurface.w; x++) {
+                int i = x / 4 + y / 4; // i is 0..6, so (i & 16) below is always 0 (alpha 0 everywhere) -- NOTE(review): confirm intended
+	    		baseSurface.setPixel(x, y, baseSurface.format.ARGBToColor((i & 16) * 255, (i & 1) * 255, (i & 2) * 255, (i & 4) * 255));
+	    	}
+	    }
+
+	    OldTransparentSurface::OldTransparentSurface oldSurf(baseSurface, true);
+	    OldTransparentSurface::OldTransparentSurface oldSurfDest(destSurface, true);
+	    Graphics::TransparentSurface newSurf(baseSurface, true);
+	    Graphics::TransparentSurface newSurfDest(destSurface, true);
+	    Graphics::ManagedSurface managedSurf(&baseSurface, DisposeAfterUse::NO);
+	    Graphics::ManagedSurface managedSurfDest(&destSurface, DisposeAfterUse::NO);
+        const char *blendModes[] = {
+            "BLEND_NORMAL",
+            "BLEND_ADDITIVE",
+            "BLEND_SUBTRACTIVE",
+            "BLEND_MULTIPLY",
+        }, *alphaTypes[] = {
+            "ALPHA_OPAQUE",
+            "ALPHA_BINARY",
+            "ALPHA_FULL",
+        }, *flipNames[] = {
+            "FLIP_NONE",
+            "FLIP_H",
+            "FLIP_V",
+            "FLIP_HV",
+        }, *rectNames[] = {
+            "0 -> (source clipping)",
+            "1 -> (outside of destination)",
+            "2 -> (stretching bigger)",
+            "3 -> (stretching smaller)",
+            "4 -> (stretching outside of destination)",
+            "5 -> (outside of destination)",
+        };
+
+        for (int blendMode = 0; blendMode < Graphics::NUM_BLEND_MODES; blendMode++) {
+        for (int alphaType = 0; alphaType <= Graphics::ALPHA_FULL; alphaType++) {
+        for (int ba = 255; ba >= 0; ba = (ba == 255 ? 128 : (ba == 128 ? 0 : -1))) {
+        for (int br = 255; br >= 0; br = (br == 255 ? 128 : (br == 128 ? 0 : -1))) {
+        for (int bg = 255; bg >= 0; bg = (bg == 255 ? 128 : (bg == 128 ? 0 : -1))) {
+        for (int bb = 255; bb >= 0; bb = (bb == 255 ? 128 : (bb == 128 ? 0 : -1))) {
+        for (int a = 255; a >= 0; a = (a == 255 ? 128 : (a == 128 ? 0 : -1))) {
+        for (int r = 255; r >= 0; r = (r == 255 ? 128 : (r == 128 ? 0 : -1))) {
+        for (int g = 255; g >= 0; g = (g == 255 ? 128 : (g == 128 ? 0 : -1))) {
+        for (int b = 255; b >= 0; b = (b == 255 ? 128 : (b == 128 ? 0 : -1))) {
+        for (int flipping = 0; flipping <= 3; flipping++) {
+        for (int rect = 0; rect < (int)(sizeof(srcs)/sizeof(srcs[0])); rect++) {
+            oldSurfDest.fillRect(Common::Rect(0, 0, oldSurfDest.w, oldSurfDest.h), oldSurfDest.format.ARGBToColor(ba, br, bg, bb));
+            oldSurf.setAlphaMode((Graphics::AlphaType)alphaType);
+            oldSurf.blit(oldSurfDest, dsts[rect].left, dsts[rect].top, flipping, &srcs[rect], TS_ARGB(a, r, g, b), dsts[rect].width(), dsts[rect].height(), (Graphics::TSpriteBlendMode)blendMode);
+            newSurfDest.fillRect(Common::Rect(0, 0, newSurfDest.w, newSurfDest.h), newSurfDest.format.ARGBToColor(ba, br, bg, bb));
+            newSurf.setAlphaMode((Graphics::AlphaType)alphaType);
+            newSurf.blit(newSurfDest, dsts[rect].left, dsts[rect].top, flipping, &srcs[rect], TS_ARGB(a, r, g, b), dsts[rect].width(), dsts[rect].height(), (Graphics::TSpriteBlendMode)blendMode);
+            managedSurfDest.fillRect(Common::Rect(0, 0, managedSurfDest.w, managedSurfDest.h), managedSurfDest.format.ARGBToColor(ba, br, bg, bb));
+            managedSurfDest.blendBlitFrom(managedSurf, srcs[rect], dsts[rect], flipping, BLENDBLIT_ARGB(a, r, g, b), (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
+
+            if (!areSurfacesEqual(&oldSurfDest, &newSurfDest)) {
+                warning("blendMode: %s, alphaType: %s, a: %d, r: %d, g: %d, b: %d, flipping: %s, test rect id: %s",
+                    blendModes[blendMode], alphaTypes[alphaType], a, r, g, b, flipNames[flipping], rectNames[rect]);
+                save_bitmap("sourceSurf.bmp", &newSurf);
+                save_bitmap("oldSurfDest.bmp", &oldSurfDest);
+                save_bitmap("newSurfDest.bmp", &newSurfDest);
+                save_bitmap("managedSurfDest.bmp", managedSurfDest.surfacePtr());
+                TS_FAIL("oldSurfDest and newSurfDest are not equal!");
+                return;
+            }
+            if (!areSurfacesEqual(&oldSurfDest, managedSurfDest.surfacePtr())) {
+                warning("blendMode: %s, alphaType: %s, a: %d, r: %d, g: %d, b: %d, flipping: %s, test rect id: %s",
+                    blendModes[blendMode], alphaTypes[alphaType], a, r, g, b, flipNames[flipping], rectNames[rect]);
+                save_bitmap("sourceSurf.bmp", &newSurf);
+                save_bitmap("oldSurfDest.bmp", &oldSurfDest);
+                save_bitmap("newSurfDest.bmp", &newSurfDest);
+                save_bitmap("managedSurfDest.bmp", managedSurfDest.surfacePtr());
+                TS_FAIL("oldSurfDest and managedSurfDest are not equal!");
+                return;
+            }
+            if (!areSurfacesEqual(&newSurfDest, managedSurfDest.surfacePtr())) {
+                warning("blendMode: %s, alphaType: %s, a: %d, r: %d, g: %d, b: %d, flipping: %s, test rect id: %s",
+                    blendModes[blendMode], alphaTypes[alphaType], a, r, g, b, flipNames[flipping], rectNames[rect]);
+                save_bitmap("sourceSurf.bmp", &newSurf);
+                save_bitmap("oldSurfDest.bmp", &oldSurfDest);
+                save_bitmap("newSurfDest.bmp", &newSurfDest);
+                save_bitmap("managedSurfDest.bmp", managedSurfDest.surfacePtr());
+                TS_FAIL("newSurfDest and managedSurfDest are not equal!");
+                return;
+            }
+        } // rect
+        } // flipping
+        } // b
+        } // g
+        } // r
+        } // a
+        } // bb
+        } // bg
+        } // br
+        } // ba
+        } // alpha
+        } // blend
+
+	    baseSurface.free();
+	    destSurface.free();
+    }
+};


Commit: 50a28d1554f7288589634806972154f947a5b1b1
    https://github.com/scummvm/scummvm/commit/50a28d1554f7288589634806972154f947a5b1b1
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
OSYSTEM: Added SIMD feature flags

Changed paths:
    common/system.h


diff --git a/common/system.h b/common/system.h
index 958b04715e1..17e0bae227a 100644
--- a/common/system.h
+++ b/common/system.h
@@ -578,7 +578,29 @@ public:
 		/**
 		* For platforms that should not have a Quit button.
 		*/
-		kFeatureNoQuit
+		kFeatureNoQuit,
+
+		/**
+		* Arm-v8 requires NEON extensions, but before that, NEON was just
+		* optional, so this signifies that the processor can use NEON.
+		*/
+		kFeatureNEON,
+
+		/**
+		* For x86/x86_64 platforms that have SSE2 support
+		*/
+		kFeatureSSE2,
+
+		/**
+		* For x86_64 platforms that have AVX2 support
+		*/
+		kFeatureAVX2,
+
+		/**
+		* For PowerPC platforms that have the altivec standard as of 1999.
+		* Covers a wide range of platforms, Apple Macs, XBox 360, PS3, and more
+		*/
+		kFeatureAltivec,
 	};
 
 	/**


Commit: f062addfe29668af7e8fba7dc6a706a4cb209ff7
    https://github.com/scummvm/scummvm/commit/f062addfe29668af7e8fba7dc6a706a4cb209ff7
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
BACKENDS: BaseBackend detect x86 SIMD extensions

Rebase conflict
Conflicts:
  backends/base-backend.h

Changed paths:
    backends/base-backend.cpp
    backends/base-backend.h
    backends/modular-backend.cpp


diff --git a/backends/base-backend.cpp b/backends/base-backend.cpp
index dcd7337bca8..45ea3f2eb1f 100644
--- a/backends/base-backend.cpp
+++ b/backends/base-backend.cpp
@@ -64,10 +64,43 @@ void BaseBackend::initBackend() {
 	if (!_audiocdManager)
 		_audiocdManager = new DefaultAudioCDManager();
 #endif
-
+#if defined(__x86_64__) || defined(__i686__)
+	uint32 ext_edx1 = 0, ext_ebx7 = 0;
+#  ifdef __GNUC__
+	asm ("mov $1, %%eax\n\t"
+		 "cpuid\n\t"
+		 "mov %%edx, %0\n\t"
+		 "mov $7, %%eax\n\t"
+		 "xor %%ecx, %%ecx\n\t" // leaf 7 is indexed by ECX: select sub-leaf 0
+		 "cpuid\n\t"
+		 "mov %%ebx, %1\n\t"
+		 : "=rm" (ext_edx1), "=rm" (ext_ebx7)
+		 : : "eax", "ebx", "ecx", "edx");
+#  elif _MSC_VER
+	__asm {
+		mov eax,1
+		cpuid
+		mov ext_edx1,edx
+		mov eax,7 // the leaf number is passed in EAX
+		xor ecx,ecx // sub-leaf 0
+		cpuid
+		mov ext_ebx7,ebx
+	}
+#  endif // __GNUC__ and _MSC_VER
+	_x86features = (ext_edx1 & (1 << 26)) ? kX86FeatureSSE2 : kX86NoFeatures;
+	_x86features |= (ext_ebx7 & (1 << 5)) ? kX86FeatureAVX2 : kX86NoFeatures;
+#else
+	_x86features = kX86NotX86;
+#endif // __x86_64__ and __i686__
 	OSystem::initBackend();
 }
 
+bool BaseBackend::hasFeature(Feature f) {
+	if (f == kFeatureSSE2) return (_x86features & kX86FeatureSSE2) == kX86FeatureSSE2;
+	if (f == kFeatureAVX2) return (_x86features & kX86FeatureAVX2) == kX86FeatureAVX2;
+	return false;
+}
+
 void BaseBackend::fillScreen(uint32 col) {
 	Graphics::Surface *screen = lockScreen();
 	if (screen)
diff --git a/backends/base-backend.h b/backends/base-backend.h
index 36cab351686..826f3116949 100644
--- a/backends/base-backend.h
+++ b/backends/base-backend.h
@@ -31,7 +31,15 @@
  */
 class BaseBackend : public OSystem {
 public:
+	enum x86FeatureFlags {
+		kX86NoFeatures    = 0x00,
+		kX86NotX86        = 0x00,
+		kX86FeatureSSE2   = 0x01,
+		kX86FeatureAVX2   = 0x02,
+	};
+
 	void initBackend() override;
+	bool hasFeature(Feature f) override;
 
 	using OSystem::setScaler;
 	bool setScaler(const char *name, int factor) override final;
@@ -39,6 +47,9 @@ public:
 	void displayActivityIconOnOSD(const Graphics::Surface *icon) override {}
 	void fillScreen(uint32 col) override;
 	void fillScreen(const Common::Rect &r, uint32 col) override;
+
+private:
+	uint32 _x86features;
 };
 
 class EventsBaseBackend : virtual public BaseBackend, Common::EventSource {
diff --git a/backends/modular-backend.cpp b/backends/modular-backend.cpp
index e5b85c1485b..8c814301d1d 100644
--- a/backends/modular-backend.cpp
+++ b/backends/modular-backend.cpp
@@ -41,6 +41,7 @@ ModularGraphicsBackend::~ModularGraphicsBackend() {
 }
 
 bool ModularGraphicsBackend::hasFeature(Feature f) {
+	if (BaseBackend::hasFeature(f)) return true;
 	return _graphicsManager->hasFeature(f);
 }
 


Commit: bdc6d72e0b9635cfda93db501abf38f4423ac6dd
    https://github.com/scummvm/scummvm/commit/bdc6d72e0b9635cfda93db501abf38f4423ac6dd
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
ANDROID: Added ARM NEON detection in backend init

Changed paths:
    backends/platform/android/android.cpp
    backends/platform/android/android.h


diff --git a/backends/platform/android/android.cpp b/backends/platform/android/android.cpp
index 4fd7c1a031e..cbb0e30aaf1 100644
--- a/backends/platform/android/android.cpp
+++ b/backends/platform/android/android.cpp
@@ -49,6 +49,7 @@
 #include <sys/time.h>
 #include <sys/resource.h>
 #include <sys/system_properties.h>
+#include <cpu-features.h>
 #include <time.h>
 #include <unistd.h>
 #include <dlfcn.h>
@@ -444,6 +445,13 @@ void OSystem_Android::initBackend() {
 		}
 	}
 
+	// Quickly figure out if arm NEON is supported
+	if (android_getCpuFamily() == ANDROID_CPU_FAMILY_ARM) {
+		_neonSupport = android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON;
+	} else {
+		_neonSupport = android_getCpuFamily() == ANDROID_CPU_FAMILY_ARM64;
+	}
+
 	// Warning: ConfMan.registerDefault() can be used for a Session of ScummVM
 	//          but:
 	//              1. The values will NOT persist to storage
@@ -637,6 +645,7 @@ bool OSystem_Android::hasFeature(Feature f) {
 	if (f == kFeatureOpenGLForGame) return true;
 	/* GLES2 always supports shaders */
 	if (f == kFeatureShadersForGame) return true;
+	if (f == kFeatureNEON) return _neonSupport;
 	return ModularGraphicsBackend::hasFeature(f);
 }
 
diff --git a/backends/platform/android/android.h b/backends/platform/android/android.h
index 34a976078e4..6fe584d7211 100644
--- a/backends/platform/android/android.h
+++ b/backends/platform/android/android.h
@@ -177,6 +177,8 @@ private:
 	mutable void *_gles2DL;
 #endif
 
+	bool _neonSupport; // bool for whether or not arm NEON is supported
+
 	static void *timerThreadFunc(void *arg);
 	static void *audioThreadFunc(void *arg);
 	Common::String getSystemProperty(const char *name) const;


Commit: c3554dd7f0faaad914a0cde49323eb9cf2b8e847
    https://github.com/scummvm/scummvm/commit/c3554dd7f0faaad914a0cde49323eb9cf2b8e847
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
BACKENDS: BaseBackend detects NEON on aarch64

Changed paths:
    backends/base-backend.cpp
    backends/base-backend.h


diff --git a/backends/base-backend.cpp b/backends/base-backend.cpp
index 45ea3f2eb1f..684243cfb78 100644
--- a/backends/base-backend.cpp
+++ b/backends/base-backend.cpp
@@ -64,6 +64,7 @@ void BaseBackend::initBackend() {
 	if (!_audiocdManager)
 		_audiocdManager = new DefaultAudioCDManager();
 #endif
+	_cpuFeatures = kCpuNoFeatures;
 #if defined(__x86_64__) || defined(__i686__)
 	uint32 ext_edx1 = 0, ext_ebx7 = 0;
 #  ifdef __GNUC__
@@ -87,17 +88,19 @@ void BaseBackend::initBackend() {
 		mov ext_ebx7,ebx
 	}
 #  endif // __GNUC__ and _MSC_VER
-	_x86features = (ext_edx1 & (1 << 26)) ? kX86FeatureSSE2 : kX86NoFeatures;
-	_x86features |= (ext_ebx7 & (1 << 5)) ? kX86FeatureAVX2 : kX86NoFeatures;
-#else
-	_x86features = kX86NotX86;
+	_cpuFeatures |= (ext_edx1 & (1 << 26)) ? kCpuFeatureSSE2 : kCpuNoFeatures;
+	_cpuFeatures |= (ext_ebx7 & (1 << 5)) ? kCpuFeatureAVX2 : kCpuNoFeatures;
 #endif // __x86_64__ and __i686__
+#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+	_cpuFeatures |= kCpuFeatureNEON;
+#endif
 	OSystem::initBackend();
 }
 
 bool BaseBackend::hasFeature(Feature f) {
-	if (f == kFeatureSSE2) return (_x86features & kX86FeatureSSE2) == kX86FeatureSSE2;
-	if (f == kFeatureAVX2) return (_x86features & kX86FeatureAVX2) == kX86FeatureAVX2;
+	if (f == kFeatureSSE2) return _cpuFeatures & kCpuFeatureSSE2;
+	if (f == kFeatureAVX2) return _cpuFeatures & kCpuFeatureAVX2;
+	if (f == kFeatureNEON) return _cpuFeatures & kCpuFeatureNEON;
 	return false;
 }
 
diff --git a/backends/base-backend.h b/backends/base-backend.h
index 826f3116949..70d03ba6fea 100644
--- a/backends/base-backend.h
+++ b/backends/base-backend.h
@@ -31,11 +31,15 @@
  */
 class BaseBackend : public OSystem {
 public:
-	enum x86FeatureFlags {
-		kX86NoFeatures    = 0x00,
-		kX86NotX86        = 0x00,
-		kX86FeatureSSE2   = 0x01,
-		kX86FeatureAVX2   = 0x02,
+	enum CpuFeatureFlags {
+		kCpuNoFeatures     = 0x00, // Empty mask: no SIMD extension flagged
+		kCpuFeatureSSE2    = 0x01, // Completely detected by BaseBackend
+		kCpuFeatureAVX2    = 0x02, // Completely detected by BaseBackend
+		// Set by BaseBackend when the build itself targets NEON (__ARM_NEON defined),
+		// or by platform specific Backends if NEON is optional or not on all versions
+		// of the platform.
+		kCpuFeatureNEON    = 0x04,
+		kCpuFeatureAlitvec = 0x08, // Platform specific (NOTE(review): name misspells "Altivec")
 	};
 
 	void initBackend() override;
@@ -49,7 +53,7 @@ public:
 	void fillScreen(const Common::Rect &r, uint32 col) override;
 
 private:
-	uint32 _x86features;
+	uint32 _cpuFeatures;
 };
 
 class EventsBaseBackend : virtual public BaseBackend, Common::EventSource {


Commit: ffb845f241e9694889ee699fb8ac9bfedbd0da93
    https://github.com/scummvm/scummvm/commit/ffb845f241e9694889ee699fb8ac9bfedbd0da93
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
WII: Backend detects Altivec extensions

Changed paths:
    backends/platform/wii/osystem.cpp


diff --git a/backends/platform/wii/osystem.cpp b/backends/platform/wii/osystem.cpp
index 270c7d97841..07905b6b4ae 100644
--- a/backends/platform/wii/osystem.cpp
+++ b/backends/platform/wii/osystem.cpp
@@ -176,7 +176,8 @@ bool OSystem_Wii::hasFeature(Feature f) {
 	return (f == kFeatureFullscreenMode) ||
 			(f == kFeatureAspectRatioCorrection) ||
 			(f == kFeatureCursorPalette) ||
-			(f == kFeatureOverlaySupportsAlpha);
+			(f == kFeatureOverlaySupportsAlpha) ||
+			(f == kFeatureAltivec);
 }
 
 void OSystem_Wii::setFeatureState(Feature f, bool enable) {


Commit: 8cc1ab7efa9d1b1d1e9981ade0c8b8372d63422a
    https://github.com/scummvm/scummvm/commit/8cc1ab7efa9d1b1d1e9981ade0c8b8372d63422a
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
PS3: Backend now detects altivec extenions

Changed paths:
    backends/platform/sdl/ps3/ps3.cpp


diff --git a/backends/platform/sdl/ps3/ps3.cpp b/backends/platform/sdl/ps3/ps3.cpp
index 0f7653e3511..e1fdd8b4ca5 100644
--- a/backends/platform/sdl/ps3/ps3.cpp
+++ b/backends/platform/sdl/ps3/ps3.cpp
@@ -133,5 +133,7 @@ bool OSystem_PS3::hasFeature(Feature f) {
 		return false;
 	}
 
+	if (f == kFeatureAltivec) return true;
+
 	return OSystem_SDL::hasFeature(f);
 }


Commit: 139eb6ad61bb69e7e8055230e05a28c09440296c
    https://github.com/scummvm/scummvm/commit/139eb6ad61bb69e7e8055230e05a28c09440296c
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
MAC: Basic Altivec detection implemented

Changed paths:
    backends/platform/sdl/macosx/macosx.cpp


diff --git a/backends/platform/sdl/macosx/macosx.cpp b/backends/platform/sdl/macosx/macosx.cpp
index 165efc2db20..c21e8bcdba6 100644
--- a/backends/platform/sdl/macosx/macosx.cpp
+++ b/backends/platform/sdl/macosx/macosx.cpp
@@ -148,6 +148,11 @@ bool OSystem_MacOSX::hasFeature(Feature f) {
 		return true;
 #endif
 
+#if defined(__VEC__) || defined(__VEC) // PowerPC Altivec extensions
+	if (f == kFeatureAltivec)
+		return true;
+#endif
+
 	return OSystem_POSIX::hasFeature(f);
 }
 


Commit: e7cd583e9fc59f5ee2745d67b8799bb123f82be7
    https://github.com/scummvm/scummvm/commit/e7cd583e9fc59f5ee2745d67b8799bb123f82be7
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
GRAPHICS: Refactor blendBlitUnfiltered

ALL: Fixed last commit not compiling :(

GRAPHICS: Refactoring BlendBlit for SIMD

Changed paths:
    graphics/blit-alpha.cpp
    graphics/blit.h
    graphics/managed_surface.cpp
    graphics/managed_surface.h
    graphics/transparent_surface.cpp
    graphics/transparent_surface.h


diff --git a/graphics/blit-alpha.cpp b/graphics/blit-alpha.cpp
index 017e1149ce3..9d96c65a5b6 100644
--- a/graphics/blit-alpha.cpp
+++ b/graphics/blit-alpha.cpp
@@ -24,29 +24,6 @@
 
 namespace Graphics {
 
-static const int kBModShift = 8;
-static const int kGModShift = 16;
-static const int kRModShift = 24;
-static const int kAModShift = 0;
-
-static const uint32 kBModMask = 0x0000ff00;
-static const uint32 kGModMask = 0x00ff0000;
-static const uint32 kRModMask = 0xff000000;
-static const uint32 kAModMask = 0x000000ff;
-static const uint32 kRGBModMask = (kRModMask | kGModMask | kBModMask);
-
-#ifdef SCUMM_LITTLE_ENDIAN
-static const int kAIndex = 0;
-static const int kBIndex = 1;
-static const int kGIndex = 2;
-static const int kRIndex = 3;
-#else
-static const int kAIndex = 3;
-static const int kBIndex = 2;
-static const int kGIndex = 1;
-static const int kRIndex = 0;
-#endif
-
 namespace {
 
 template<typename Size, bool overwriteAlpha>
@@ -191,56 +168,42 @@ bool setAlpha(byte *dst, const byte *src,
 }
 
 
-struct BlendingSetupArgs {
-	bool rgbmod, alphamod;
-	int xp, yp;
-	int inStep, inoStep;
-	const byte *ino;
-	byte *outo;
-
-	int scaleX, scaleY;
-	uint dstPitch;
-	uint width, height;
-	uint32 color;
-	int flipping;
-
-	BlendingSetupArgs(byte *dst, const byte *src,
-					  const uint dstPitch, const uint srcPitch,
-					  const int posX, const int posY,
-					  const uint width, const uint height,
-					  const int scaleX, const int scaleY,
-					  const uint32 colorMod, const uint flipping) :
-			xp(0), yp(0), dstPitch(dstPitch),
-			width(width), height(height), color(colorMod),
-			scaleX(scaleX), scaleY(scaleY), flipping(flipping) {
-		bool doScale = scaleX != BLEND_BLIT_SCALE_THRESHOLD || scaleY != BLEND_BLIT_SCALE_THRESHOLD;
-		
-		rgbmod   = ((colorMod & kRGBModMask) != kRGBModMask);
-		alphamod = ((colorMod & kAModMask)   != kAModMask);
-		inStep = 4;
-		inoStep = srcPitch;
-		if (flipping & FLIP_H) {
-			inStep = -inStep;
-			xp = width - 1;
-			if (doScale) xp = xp * scaleX / BLEND_BLIT_SCALE_THRESHOLD;
-		}
-
-		if (flipping & FLIP_V) {
-			inoStep = -inoStep;
-			yp = height - 1;
-			if (doScale) yp = yp * scaleY / BLEND_BLIT_SCALE_THRESHOLD;
-		}
+BlendBlit::Args::Args(byte *dst, const byte *src,
+	const uint dstPitch, const uint srcPitch,
+	const int posX, const int posY,
+	const uint width, const uint height,
+	const int scaleX, const int scaleY,
+	const uint32 colorMod, const uint flipping) :
+		xp(0), yp(0), dstPitch(dstPitch),
+		width(width), height(height), color(colorMod),
+		scaleX(scaleX), scaleY(scaleY), flipping(flipping) {
+	bool doScale = scaleX != SCALE_THRESHOLD || scaleY != SCALE_THRESHOLD;
+	
+	rgbmod   = ((colorMod & kRGBModMask) != kRGBModMask);
+	alphamod = ((colorMod & kAModMask)   != kAModMask);
+	inStep = 4;
+	inoStep = srcPitch;
+	if (flipping & FLIP_H) {
+		inStep = -inStep;
+		xp = width - 1;
+		if (doScale) xp = xp * scaleX / SCALE_THRESHOLD;
+	}
 
-		ino = src + yp * srcPitch + xp * 4;
-		outo = dst + posY * dstPitch + posX * 4;
+	if (flipping & FLIP_V) {
+		inoStep = -inoStep;
+		yp = height - 1;
+		if (doScale) yp = yp * scaleY / SCALE_THRESHOLD;
 	}
-};
+
+	ino = src + yp * srcPitch + xp * 4;
+	outo = dst + posY * dstPitch + posX * 4;
+}
 
 /**
  * Optimized version of doBlit to be used with multiply blended blitting
  */
 template<bool doscale>
-static void doBlitMultiplyBlendLogic(BlendingSetupArgs &args) {
+void BlendBlit::doBlitMultiplyBlendLogicGeneric(Args &args) {
 	const byte *in;
 	byte *out;
 
@@ -254,7 +217,7 @@ static void doBlitMultiplyBlendLogic(BlendingSetupArgs &args) {
 
 	for (uint32 i = 0; i < args.height; i++) {
 		if (doscale) {
-			inBase = args.ino + scaleYCtr / BLEND_BLIT_SCALE_THRESHOLD * args.inoStep;
+			inBase = args.ino + scaleYCtr / SCALE_THRESHOLD * args.inoStep;
 			scaleXCtr = 0;
 		} else {
 			in = args.ino;
@@ -262,7 +225,7 @@ static void doBlitMultiplyBlendLogic(BlendingSetupArgs &args) {
 		out = args.outo;
 		for (uint32 j = 0; j < args.width; j++) {
 			if (doscale) {
-				in = inBase + scaleXCtr / BLEND_BLIT_SCALE_THRESHOLD * args.inStep;
+				in = inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep;
 			}
 
 			uint32 ina = in[kAIndex] * ca >> 8;
@@ -303,7 +266,7 @@ static void doBlitMultiplyBlendLogic(BlendingSetupArgs &args) {
 }
 
 template<bool doscale>
-static void doBlitAlphaBlendLogic(BlendingSetupArgs &args) {
+void BlendBlit::doBlitAlphaBlendLogicGeneric(Args &args) {
 	const byte *in;
 	byte *out;
 
@@ -317,7 +280,7 @@ static void doBlitAlphaBlendLogic(BlendingSetupArgs &args) {
 
 	for (uint32 i = 0; i < args.height; i++) {
 		if (doscale) {
-			inBase = args.ino + scaleYCtr / BLEND_BLIT_SCALE_THRESHOLD * args.inoStep;
+			inBase = args.ino + scaleYCtr / SCALE_THRESHOLD * args.inoStep;
 			scaleXCtr = 0;
 		} else {
 			in = args.ino;
@@ -325,7 +288,7 @@ static void doBlitAlphaBlendLogic(BlendingSetupArgs &args) {
 		out = args.outo;
 		for (uint32 j = 0; j < args.width; j++) {
 			if (doscale) {
-				in = inBase + scaleXCtr / BLEND_BLIT_SCALE_THRESHOLD * args.inStep;
+				in = inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep;
 			}
 
 			uint32 ina = in[kAIndex] * ca >> 8;
@@ -360,7 +323,7 @@ static void doBlitAlphaBlendLogic(BlendingSetupArgs &args) {
  * Optimized version of doBlit to be used with subtractive blended blitting
  */
 template<bool doscale>
-static void doBlitSubtractiveBlendLogic(BlendingSetupArgs &args) {
+void BlendBlit::doBlitSubtractiveBlendLogicGeneric(Args &args) {
 	const byte *in;
 	byte *out;
 
@@ -373,7 +336,7 @@ static void doBlitSubtractiveBlendLogic(BlendingSetupArgs &args) {
 
 	for (uint32 i = 0; i < args.height; i++) {
 		if (doscale) {
-			inBase = args.ino + scaleYCtr / BLEND_BLIT_SCALE_THRESHOLD * args.inoStep;
+			inBase = args.ino + scaleYCtr / SCALE_THRESHOLD * args.inoStep;
 			scaleXCtr = 0;
 		} else {
 			in = args.ino;
@@ -381,7 +344,7 @@ static void doBlitSubtractiveBlendLogic(BlendingSetupArgs &args) {
 		out = args.outo;
 		for (uint32 j = 0; j < args.width; j++) {
 			if (doscale) {
-				in = inBase + scaleXCtr / BLEND_BLIT_SCALE_THRESHOLD * args.inStep;
+				in = inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep;
 			}
 
 			out[kAIndex] = 255;
@@ -421,7 +384,7 @@ static void doBlitSubtractiveBlendLogic(BlendingSetupArgs &args) {
  * Optimized version of doBlit to be used with additive blended blitting
  */
 template<bool doscale>
-static void doBlitAdditiveBlendLogic(BlendingSetupArgs &args) {
+void BlendBlit::doBlitAdditiveBlendLogicGeneric(Args &args) {
 	const byte *in;
 	byte *out;
 
@@ -435,7 +398,7 @@ static void doBlitAdditiveBlendLogic(BlendingSetupArgs &args) {
 
 	for (uint32 i = 0; i < args.height; i++) {
 		if (doscale) {
-			inBase = args.ino + scaleYCtr / BLEND_BLIT_SCALE_THRESHOLD * args.inoStep;
+			inBase = args.ino + scaleYCtr / SCALE_THRESHOLD * args.inoStep;
 			scaleXCtr = 0;
 		} else {
 			in = args.ino;
@@ -443,7 +406,7 @@ static void doBlitAdditiveBlendLogic(BlendingSetupArgs &args) {
 		out = args.outo;
 		for (uint32 j = 0; j < args.width; j++) {
 			if (doscale) {
-				in = inBase + scaleXCtr / BLEND_BLIT_SCALE_THRESHOLD * args.inStep;
+				in = inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep;
 			}
 
 			uint32 ina = in[kAIndex] * ca >> 8;
@@ -484,7 +447,7 @@ static void doBlitAdditiveBlendLogic(BlendingSetupArgs &args) {
 }
 
 template<bool doscale>
-void doBlitOpaqueBlendLogic(BlendingSetupArgs &args) {
+void BlendBlit::doBlitOpaqueBlendLogicGeneric(Args &args) {
 	const byte *in;
 	byte *out;
 
@@ -493,7 +456,7 @@ void doBlitOpaqueBlendLogic(BlendingSetupArgs &args) {
 
 	for (uint32 i = 0; i < args.height; i++) {
 		if (doscale) {
-			inBase = args.ino + (scaleYCtr + 1) / BLEND_BLIT_SCALE_THRESHOLD * args.inoStep;
+			inBase = args.ino + (scaleYCtr + 1) / SCALE_THRESHOLD * args.inoStep;
 			scaleXCtr = 0;
 		} else {
 			in = args.ino;
@@ -502,7 +465,7 @@ void doBlitOpaqueBlendLogic(BlendingSetupArgs &args) {
 
 		if (doscale) {
 			for (uint32 j = 0; j < args.width; j++) {
-				in = inBase + scaleXCtr / BLEND_BLIT_SCALE_THRESHOLD * args.inStep;
+				in = inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep;
 
 				memcpy(out, in, 4);
 				out[kAIndex] = 0xFF;
@@ -534,7 +497,7 @@ void doBlitOpaqueBlendLogic(BlendingSetupArgs &args) {
 }
 
 template<bool doscale>
-void doBlitBinaryBlendLogic(BlendingSetupArgs &args) {
+void BlendBlit::doBlitBinaryBlendLogicGeneric(Args &args) {
 	const byte *in;
 	byte *out;
 
@@ -543,7 +506,7 @@ void doBlitBinaryBlendLogic(BlendingSetupArgs &args) {
 
 	for (uint32 i = 0; i < args.height; i++) {
 		if (doscale) {
-			inBase = args.ino + scaleYCtr / BLEND_BLIT_SCALE_THRESHOLD * args.inoStep;
+			inBase = args.ino + scaleYCtr / SCALE_THRESHOLD * args.inoStep;
 			scaleXCtr = 0;
 		} else {
 			in = args.ino;
@@ -551,7 +514,7 @@ void doBlitBinaryBlendLogic(BlendingSetupArgs &args) {
 		out = args.outo;
 		for (uint32 j = 0; j < args.width; j++) {
 			if (doscale) {
-				in = inBase + scaleXCtr / BLEND_BLIT_SCALE_THRESHOLD * args.inStep;
+				in = inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep;
 			}
 			uint32 pix = *(const uint32 *)in;
 			int a = in[kAIndex];
@@ -574,8 +537,33 @@ void doBlitBinaryBlendLogic(BlendingSetupArgs &args) {
 	}
 }
 
+template<bool doscale>
+void BlendBlit::doBlitBinaryBlendLogic(Args &args) {
+	doBlitBinaryBlendLogicGeneric<doscale>(args);
+}
+template<bool doscale>
+void BlendBlit::doBlitOpaqueBlendLogic(Args &args) {
+	doBlitOpaqueBlendLogicGeneric<doscale>(args);
+}
+template<bool doscale>
+void BlendBlit::doBlitMultiplyBlendLogic(Args &args) {
+	doBlitMultiplyBlendLogicGeneric<doscale>(args);
+}
+template<bool doscale>
+void BlendBlit::doBlitSubtractiveBlendLogic(Args &args) {
+	doBlitSubtractiveBlendLogicGeneric<doscale>(args);
+}
+template<bool doscale>
+void BlendBlit::doBlitAdditiveBlendLogic(Args &args) {
+	doBlitAdditiveBlendLogicGeneric<doscale>(args);
+}
+template<bool doscale>
+void BlendBlit::doBlitAlphaBlendLogic(Args &args) {
+	doBlitAlphaBlendLogicGeneric<doscale>(args);
+}
+
 // Only blits to and from 32bpp images
-void blendBlitUnfiltered(byte *dst, const byte *src,
+void BlendBlit::blit(byte *dst, const byte *src,
 					 const uint dstPitch, const uint srcPitch,
 					 const int posX, const int posY,
 					 const uint width, const uint height,
@@ -584,8 +572,8 @@ void blendBlitUnfiltered(byte *dst, const byte *src,
 					 const TSpriteBlendMode blendMode,
 					 const AlphaType alphaType) {
 	if (width == 0 || height == 0) return;
-	BlendingSetupArgs args(dst, src, dstPitch, srcPitch, posX, posY, width, height, scaleX, scaleY, colorMod, flipping);
-	if (scaleX == BLEND_BLIT_SCALE_THRESHOLD && scaleY == BLEND_BLIT_SCALE_THRESHOLD) {
+	Args args(dst, src, dstPitch, srcPitch, posX, posY, width, height, scaleX, scaleY, colorMod, flipping);
+	if (scaleX == SCALE_THRESHOLD && scaleY == SCALE_THRESHOLD) {
 		if (colorMod == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_OPAQUE) {
 			doBlitOpaqueBlendLogic<false>(args);
 		} else if (colorMod == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_BINARY) {
diff --git a/graphics/blit.h b/graphics/blit.h
index 0309f717244..91377e955bd 100644
--- a/graphics/blit.h
+++ b/graphics/blit.h
@@ -191,20 +191,101 @@ bool setAlpha(byte *dst, const byte *src,
               const Graphics::PixelFormat &format,
               const bool skipTransparent, const uint8 alpha);
 
-static const int BLEND_BLIT_SCALE_THRESHOLD = 0x100;
+// This is a class so that we can declare certain things as private
+class BlendBlit {
+private:
+	static const int kBModShift = 8;
+	static const int kGModShift = 16;
+	static const int kRModShift = 24;
+	static const int kAModShift = 0;
+	
+	static const uint32 kBModMask = 0x0000ff00;
+	static const uint32 kGModMask = 0x00ff0000;
+	static const uint32 kRModMask = 0xff000000;
+	static const uint32 kAModMask = 0x000000ff;
+	static const uint32 kRGBModMask = (kRModMask | kGModMask | kBModMask);
+	
+#ifdef SCUMM_LITTLE_ENDIAN
+	static const int kAIndex = 0;
+	static const int kBIndex = 1;
+	static const int kGIndex = 2;
+	static const int kRIndex = 3;
+#else
+	static const int kAIndex = 3;
+	static const int kBIndex = 2;
+	static const int kGIndex = 1;
+	static const int kRIndex = 0;
+#endif
 
-/**
- * Optimized version of doBlit to be used with alpha blended blitting
- * @param ino a pointer to the input surface
- * @param outo a pointer to the output surface
- * @param width width of the input surface
- * @param height height of the input surface
- * @param pitch pitch of the output surface - that is, width in bytes of every row, usually bpp * width of the TARGET surface (the area we are blitting to might be smaller, do the math)
- * @inStep size in bytes to skip to address each pixel, usually bpp of the source surface
- * @inoStep width in bytes of every row on the *input* surface / kind of like pitch
- * @color colormod in 0xAARRGGBB format - 0xFFFFFFFF for no colormod
- */
-void blendBlitUnfiltered(byte *dst, const byte *src,
+	struct Args {
+		bool rgbmod, alphamod;
+		int xp, yp;
+		int inStep, inoStep;
+		const byte *ino;
+		byte *outo;
+	
+		int scaleX, scaleY;
+		uint dstPitch;
+		uint width, height;
+		uint32 color;
+		int flipping;
+	
+		Args(byte *dst, const byte *src,
+			 const uint dstPitch, const uint srcPitch,
+			 const int posX, const int posY,
+			 const uint width, const uint height,
+			 const int scaleX, const int scaleY,
+			 const uint32 colorMod, const uint flipping);
+	};
+
+#define LOGIC_FUNCS_EXT(ext) \
+	template<bool doscale> \
+	static void doBlitBinaryBlendLogic##ext(Args &args); \
+	template<bool doscale> \
+	static void doBlitOpaqueBlendLogic##ext(Args &args); \
+	template<bool doscale> \
+	static void doBlitMultiplyBlendLogic##ext(Args &args); \
+	template<bool doscale> \
+	static void doBlitSubtractiveBlendLogic##ext(Args &args); \
+	template<bool doscale> \
+	static void doBlitAdditiveBlendLogic##ext(Args &args); \
+	template<bool doscale> \
+	static void doBlitAlphaBlendLogic##ext(Args &args);
+LOGIC_FUNCS_EXT()
+LOGIC_FUNCS_EXT(Generic)
+#undef LOGIC_FUNCS_EXT
+
+public:
+	static const int SCALE_THRESHOLD = 0x100;
+
+	static inline int getScaleFactor(int srcSize, int dstSize) {
+		return SCALE_THRESHOLD * srcSize / dstSize;
+	}
+
+	/**
+	 * Returns the pixel format all operations of TransparentSurface support.
+	 *
+	 * Use TS_ARGB and TS_RGB to quickly make a color in this format.
+	 * TS_ARGB/RGB are found in graphics/transform_struct.h
+	 *
+	 * @return Supported pixel format.
+	 */
+	static inline PixelFormat getSupportedPixelFormat() {
+		return PixelFormat(4, 8, 8, 8, 8, 24, 16, 8, 0);
+	}
+
+	/**
+	 * Optimized version of doBlit to be used with alpha blended blitting
+	 * @param ino a pointer to the input surface
+	 * @param outo a pointer to the output surface
+	 * @param width width of the input surface
+	 * @param height height of the input surface
+	 * @param pitch pitch of the output surface - that is, width in bytes of every row, usually bpp * width of the TARGET surface (the area we are blitting to might be smaller, do the math)
+	 * @inStep size in bytes to skip to address each pixel, usually bpp of the source surface
+	 * @inoStep width in bytes of every row on the *input* surface / kind of like pitch
+	 * @color colormod in 0xAARRGGBB format - 0xFFFFFFFF for no colormod
+	 */
+	static void blit(byte *dst, const byte *src,
 					 const uint dstPitch, const uint srcPitch,
 					 const int posX, const int posY,
 					 const uint width, const uint height,
@@ -213,6 +294,9 @@ void blendBlitUnfiltered(byte *dst, const byte *src,
 					 const TSpriteBlendMode blendMode = BLEND_NORMAL,
 					 const AlphaType alphaType = ALPHA_FULL);
 
+	friend struct TransparentSurface;
+}; // End of class BlendBlit
+
 /** @} */
 } // End of namespace Graphics
 
diff --git a/graphics/managed_surface.cpp b/graphics/managed_surface.cpp
index 07d5afeeb6e..f8ff0db7ec3 100644
--- a/graphics/managed_surface.cpp
+++ b/graphics/managed_surface.cpp
@@ -743,26 +743,26 @@ Common::Rect ManagedSurface::blendBlitFrom(const ManagedSurface &src, const Comm
 	// Alpha is zero
 	if ((colorMod & TS_ARGB(255, 0, 0, 0)) == 0) return Common::Rect(0, 0, 0, 0);
 
-	const int scaleX = BLEND_BLIT_SCALE_THRESHOLD * srcArea.width() / dstArea.width();
-	const int scaleY = BLEND_BLIT_SCALE_THRESHOLD * srcArea.height() / dstArea.height();
+	const int scaleX = BlendBlit::getScaleFactor(srcArea.width(), dstArea.width());
+	const int scaleY = BlendBlit::getScaleFactor(srcArea.height(), dstArea.height());
 
 	if (dstArea.left < 0) {
-		srcArea.left += -dstArea.left * scaleX / BLEND_BLIT_SCALE_THRESHOLD;
+		srcArea.left += -dstArea.left * scaleX / BlendBlit::SCALE_THRESHOLD;
 		dstArea.left = 0;
 	}
 
 	if (dstArea.top < 0) {
-		srcArea.top += -dstArea.top * scaleY / BLEND_BLIT_SCALE_THRESHOLD;
+		srcArea.top += -dstArea.top * scaleY / BlendBlit::SCALE_THRESHOLD;
 		dstArea.top = 0;
 	}
 
 	if (dstArea.right > w) {
-		srcArea.right -= (dstArea.right - w) * scaleX / BLEND_BLIT_SCALE_THRESHOLD;
+		srcArea.right -= (dstArea.right - w) * scaleX / BlendBlit::SCALE_THRESHOLD;
 		dstArea.right = w;
 	}
 
 	if (dstArea.bottom > h) {
-		srcArea.bottom -= (dstArea.bottom - h) * scaleY / BLEND_BLIT_SCALE_THRESHOLD;
+		srcArea.bottom -= (dstArea.bottom - h) * scaleY / BlendBlit::SCALE_THRESHOLD;
 		dstArea.bottom = h;
 	}
 
@@ -779,7 +779,7 @@ Common::Rect ManagedSurface::blendBlitFrom(const ManagedSurface &src, const Comm
 	}
 
 	if (!dstArea.isEmpty() && !srcArea.isEmpty()) {
-		blendBlitUnfiltered(
+		BlendBlit::blit(
 			(byte *)getBasePtr(0, 0),
 			(const byte *)src.getBasePtr(srcArea.left, srcArea.top),
 			pitch, src.pitch,
diff --git a/graphics/managed_surface.h b/graphics/managed_surface.h
index 3f2f81beac7..8706019ad7f 100644
--- a/graphics/managed_surface.h
+++ b/graphics/managed_surface.h
@@ -27,6 +27,7 @@
 #include "graphics/transform_struct.h"
 #include "common/types.h"
 #include "graphics/transparent_surface.h"
+#include "graphics/blit.h"
 
 namespace Graphics {
 
@@ -532,7 +533,7 @@ public:
 	 * @return Supported pixel format.
 	 */
 	static inline PixelFormat getSupportedBlendBlitPixelFormat() {
-		return PixelFormat(4, 8, 8, 8, 8, 24, 16, 8, 0);
+		return BlendBlit::getSupportedPixelFormat();
 	}
 
 	/**
diff --git a/graphics/transparent_surface.cpp b/graphics/transparent_surface.cpp
index 589a3a0c797..a935afe68e5 100644
--- a/graphics/transparent_surface.cpp
+++ b/graphics/transparent_surface.cpp
@@ -34,8 +34,6 @@
 
 namespace Graphics {
 
-static const int kAModShift = 0;
-
 TransparentSurface::TransparentSurface() : Surface(), _alphaMode(ALPHA_FULL) {}
 
 TransparentSurface::TransparentSurface(const Surface &surf, bool copyData) : Surface(), _alphaMode(ALPHA_FULL) {
@@ -60,7 +58,7 @@ Common::Rect TransparentSurface::blit(Graphics::Surface &target, int posX, int p
 	retSize.setWidth(0);
 	retSize.setHeight(0);
 	// Check if we need to draw anything at all
-	int ca = (color >> kAModShift) & 0xff;
+	int ca = (color >> BlendBlit::kAModShift) & 0xff;
 
 	if (ca == 0) {
 		return retSize;
@@ -145,11 +143,11 @@ Common::Rect TransparentSurface::blit(Graphics::Surface &target, int posX, int p
 
 	// Flip surface
 	if ((img->w > 0) && (img->h > 0)) {
-		Graphics::blendBlitUnfiltered(
+		BlendBlit::blit(
 			(byte *)target.getBasePtr(0, 0),
 			(byte *)img->getBasePtr(0, 0),
 			target.pitch, img->pitch,
-			posX, posY, img->w, img->h, BLEND_BLIT_SCALE_THRESHOLD, BLEND_BLIT_SCALE_THRESHOLD,
+			posX, posY, img->w, img->h, BlendBlit::SCALE_THRESHOLD, BlendBlit::SCALE_THRESHOLD,
 			color, flipping,
 			blendMode, _alphaMode);
 	}
@@ -173,7 +171,7 @@ Common::Rect TransparentSurface::blitClip(Graphics::Surface &target, Common::Rec
 	retSize.setWidth(0);
 	retSize.setHeight(0);
 	// Check if we need to draw anything at all
-	int ca = (color >> kAModShift) & 0xff;
+	int ca = (color >> BlendBlit::kAModShift) & 0xff;
 
 	if (ca == 0) {
 		return retSize;
@@ -258,11 +256,11 @@ Common::Rect TransparentSurface::blitClip(Graphics::Surface &target, Common::Rec
 
 	// Flip surface
 	if ((img->w > 0) && (img->h > 0)) {
-		Graphics::blendBlitUnfiltered(
+		BlendBlit::blit(
 			(byte *)target.getBasePtr(0, 0),
 			(byte *)img->getBasePtr(0, 0),
 			target.pitch, img->pitch,
-			posX, posY, img->w, img->h, BLEND_BLIT_SCALE_THRESHOLD, BLEND_BLIT_SCALE_THRESHOLD,
+			posX, posY, img->w, img->h, BlendBlit::SCALE_THRESHOLD, BlendBlit::SCALE_THRESHOLD,
 			color, flipping,
 			blendMode, _alphaMode);
 	}
diff --git a/graphics/transparent_surface.h b/graphics/transparent_surface.h
index f23920f8b38..31d5c5ef91d 100644
--- a/graphics/transparent_surface.h
+++ b/graphics/transparent_surface.h
@@ -24,6 +24,7 @@
 
 #include "graphics/surface.h"
 #include "graphics/transform_struct.h"
+#include "graphics/blit.h"
 
 /*
  * This code is based on Broken Sword 2.5 engine
@@ -61,7 +62,7 @@ struct TransparentSurface : public Graphics::Surface {
 	 * @return Supported pixel format.
 	 */
 	static PixelFormat getSupportedPixelFormat() {
-		return PixelFormat(4, 8, 8, 8, 8, 24, 16, 8, 0);
+		return BlendBlit::getSupportedPixelFormat();
 	}
 
 	/**


Commit: 402c67064d910bb4fbafa320f30196af44f0fddc
    https://github.com/scummvm/scummvm/commit/402c67064d910bb4fbafa320f30196af44f0fddc
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
GRAPHICS: BlendBlit detects cpu extensions

Changed paths:
  A graphics/blit-neon.cpp
    graphics/blit-alpha.cpp
    graphics/blit.h
    graphics/module.mk


diff --git a/graphics/blit-alpha.cpp b/graphics/blit-alpha.cpp
index 9d96c65a5b6..afb363e7cfb 100644
--- a/graphics/blit-alpha.cpp
+++ b/graphics/blit-alpha.cpp
@@ -21,6 +21,7 @@
 
 #include "graphics/blit.h"
 #include "graphics/pixelformat.h"
+#include "common/system.h"
 
 namespace Graphics {
 
@@ -537,30 +538,8 @@ void BlendBlit::doBlitBinaryBlendLogicGeneric(Args &args) {
 	}
 }
 
-template<bool doscale>
-void BlendBlit::doBlitBinaryBlendLogic(Args &args) {
-	doBlitBinaryBlendLogicGeneric<doscale>(args);
-}
-template<bool doscale>
-void BlendBlit::doBlitOpaqueBlendLogic(Args &args) {
-	doBlitOpaqueBlendLogicGeneric<doscale>(args);
-}
-template<bool doscale>
-void BlendBlit::doBlitMultiplyBlendLogic(Args &args) {
-	doBlitMultiplyBlendLogicGeneric<doscale>(args);
-}
-template<bool doscale>
-void BlendBlit::doBlitSubtractiveBlendLogic(Args &args) {
-	doBlitSubtractiveBlendLogicGeneric<doscale>(args);
-}
-template<bool doscale>
-void BlendBlit::doBlitAdditiveBlendLogic(Args &args) {
-	doBlitAdditiveBlendLogicGeneric<doscale>(args);
-}
-template<bool doscale>
-void BlendBlit::doBlitAlphaBlendLogic(Args &args) {
-	doBlitAlphaBlendLogicGeneric<doscale>(args);
-}
+// Initialize this to nullptr at the start
+BlendBlit::BlitFunc BlendBlit::blitFunc = nullptr;
 
 // Only blits to and from 32bpp images
 void BlendBlit::blit(byte *dst, const byte *src,
@@ -572,42 +551,61 @@ void BlendBlit::blit(byte *dst, const byte *src,
 					 const TSpriteBlendMode blendMode,
 					 const AlphaType alphaType) {
 	if (width == 0 || height == 0) return;
-	Args args(dst, src, dstPitch, srcPitch, posX, posY, width, height, scaleX, scaleY, colorMod, flipping);
-	if (scaleX == SCALE_THRESHOLD && scaleY == SCALE_THRESHOLD) {
-		if (colorMod == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_OPAQUE) {
-			doBlitOpaqueBlendLogic<false>(args);
-		} else if (colorMod == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_BINARY) {
-			doBlitBinaryBlendLogic<false>(args);
-		} else {
-			if (blendMode == BLEND_ADDITIVE) {
-				doBlitAdditiveBlendLogic<false>(args);
-			} else if (blendMode == BLEND_SUBTRACTIVE) {
-				doBlitSubtractiveBlendLogic<false>(args);
-			} else if (blendMode == BLEND_MULTIPLY) {
-				doBlitMultiplyBlendLogic<false>(args);
-			} else {
-				assert(blendMode == BLEND_NORMAL);
-				doBlitAlphaBlendLogic<false>(args);
-			}
-		}
-	} else {
-		if (colorMod == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_OPAQUE) {
-			doBlitOpaqueBlendLogic<true>(args);
-		} else if (colorMod == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_BINARY) {
-			doBlitBinaryBlendLogic<true>(args);
-		} else {
-			if (blendMode == BLEND_ADDITIVE) {
-				doBlitAdditiveBlendLogic<true>(args);
-			} else if (blendMode == BLEND_SUBTRACTIVE) {
-				doBlitSubtractiveBlendLogic<true>(args);
-			} else if (blendMode == BLEND_MULTIPLY) {
-				doBlitMultiplyBlendLogic<true>(args);
-			} else {
-				assert(blendMode == BLEND_NORMAL);
-				doBlitAlphaBlendLogic<true>(args);
-			}
-		}
+	if (!blitFunc) {
+		// Get the correct blit function
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+		if (g_system->hasFeature(OSystem::kFeatureNEON)) blitFunc = blitNEON;
+		else blitFunc = blitGeneric;
+#else
+		blitFunc = blitGeneric;
+#endif
 	}
+	
+	Args args(dst, src, dstPitch, srcPitch, posX, posY, width, height, scaleX, scaleY, colorMod, flipping);
+	blitFunc(args, blendMode, alphaType);
 }
 
+#define BLIT_FUNC(ext) \
+	void BlendBlit::blit##ext(Args &args, const TSpriteBlendMode &blendMode, const AlphaType &alphaType) { \
+		if (args.scaleX == SCALE_THRESHOLD && args.scaleY == SCALE_THRESHOLD) { \
+			if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_OPAQUE) { \
+				doBlitOpaqueBlendLogic##ext<false>(args); \
+			} else if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_BINARY) { \
+				doBlitBinaryBlendLogic##ext<false>(args); \
+			} else { \
+				if (blendMode == BLEND_ADDITIVE) { \
+					doBlitAdditiveBlendLogic##ext<false>(args); \
+				} else if (blendMode == BLEND_SUBTRACTIVE) { \
+					doBlitSubtractiveBlendLogic##ext<false>(args); \
+				} else if (blendMode == BLEND_MULTIPLY) { \
+					doBlitMultiplyBlendLogic##ext<false>(args); \
+				} else { \
+					assert(blendMode == BLEND_NORMAL); \
+					doBlitAlphaBlendLogic##ext<false>(args); \
+				} \
+			} \
+		} else { \
+			if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_OPAQUE) { \
+				doBlitOpaqueBlendLogic##ext<true>(args); \
+			} else if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_BINARY) { \
+				doBlitBinaryBlendLogic##ext<true>(args); \
+			} else { \
+				if (blendMode == BLEND_ADDITIVE) { \
+					doBlitAdditiveBlendLogic##ext<true>(args); \
+				} else if (blendMode == BLEND_SUBTRACTIVE) { \
+					doBlitSubtractiveBlendLogic##ext<true>(args); \
+				} else if (blendMode == BLEND_MULTIPLY) { \
+					doBlitMultiplyBlendLogic##ext<true>(args); \
+				} else { \
+					assert(blendMode == BLEND_NORMAL); \
+					doBlitAlphaBlendLogic##ext<true>(args); \
+				} \
+			} \
+		} \
+	}
+BLIT_FUNC(Generic)
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+BLIT_FUNC(NEON)
+#endif
+
 } // End of namespace Graphics
diff --git a/graphics/blit-neon.cpp b/graphics/blit-neon.cpp
new file mode 100644
index 00000000000..09b7e27d9e0
--- /dev/null
+++ b/graphics/blit-neon.cpp
@@ -0,0 +1,65 @@
+/* ScummVM - Graphic Adventure Engine
+ *
+ * ScummVM is the legal property of its developers, whose names
+ * are too numerous to list here. Please refer to the COPYRIGHT
+ * file distributed with this source distribution.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "graphics/blit.h"
+#include "graphics/pixelformat.h"
+
+namespace Graphics {
+
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+template<bool doscale>
+void BlendBlit::doBlitBinaryBlendLogicNEON(Args &args) {
+    (void)args;
+}
+template<bool doscale>
+void BlendBlit::doBlitOpaqueBlendLogicNEON(Args &args) {
+    (void)args;
+}
+template<bool doscale>
+void BlendBlit::doBlitMultiplyBlendLogicNEON(Args &args) {
+    (void)args;
+}
+template<bool doscale>
+void BlendBlit::doBlitSubtractiveBlendLogicNEON(Args &args) {
+    (void)args;
+}
+template<bool doscale>
+void BlendBlit::doBlitAdditiveBlendLogicNEON(Args &args) {
+    (void)args;
+}
+template<bool doscale>
+void BlendBlit::doBlitAlphaBlendLogicNEON(Args &args) {
+    (void)args;
+}
+
+#define INSTANTIATE_BLIT_TEMPLATES(ext, b) \
+    template void BlendBlit::doBlitBinaryBlendLogic##ext<b>(Args &); \
+    template void BlendBlit::doBlitOpaqueBlendLogic##ext<b>(Args &); \
+    template void BlendBlit::doBlitMultiplyBlendLogic##ext<b>(Args &); \
+    template void BlendBlit::doBlitSubtractiveBlendLogic##ext<b>(Args &); \
+    template void BlendBlit::doBlitAdditiveBlendLogic##ext<b>(Args &); \
+    template void BlendBlit::doBlitAlphaBlendLogic##ext<b>(Args &);
+INSTANTIATE_BLIT_TEMPLATES(NEON, true)
+INSTANTIATE_BLIT_TEMPLATES(NEON, false)
+#undef INSTANTIATE_BLIT_TEMPLATES
+#endif // __ARM_NEON__
+
+}
diff --git a/graphics/blit.h b/graphics/blit.h
index 91377e955bd..e25a23225cc 100644
--- a/graphics/blit.h
+++ b/graphics/blit.h
@@ -229,7 +229,7 @@ private:
 		uint width, height;
 		uint32 color;
 		int flipping;
-	
+		
 		Args(byte *dst, const byte *src,
 			 const uint dstPitch, const uint srcPitch,
 			 const int posX, const int posY,
@@ -238,6 +238,9 @@ private:
 			 const uint32 colorMod, const uint flipping);
 	};
 
+// Define logic functions for different architecture extensions.
+// These extensions would just be a template parameter if it weren't for the
+// fact that partial template specialization doesn't exist.
 #define LOGIC_FUNCS_EXT(ext) \
 	template<bool doscale> \
 	static void doBlitBinaryBlendLogic##ext(Args &args); \
@@ -250,11 +253,17 @@ private:
 	template<bool doscale> \
 	static void doBlitAdditiveBlendLogic##ext(Args &args); \
 	template<bool doscale> \
-	static void doBlitAlphaBlendLogic##ext(Args &args);
-LOGIC_FUNCS_EXT()
+	static void doBlitAlphaBlendLogic##ext(Args &args); \
+	static void blit##ext(Args &args, const TSpriteBlendMode &blendMode, const AlphaType &alphaType);
 LOGIC_FUNCS_EXT(Generic)
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+LOGIC_FUNCS_EXT(NEON)
+#endif
 #undef LOGIC_FUNCS_EXT
 
+	typedef void(*BlitFunc)(Args &, const TSpriteBlendMode &, const AlphaType &);
+	static BlitFunc blitFunc;
+
 public:
 	static const int SCALE_THRESHOLD = 0x100;
 
@@ -286,13 +295,13 @@ public:
 	 * @color colormod in 0xAARRGGBB format - 0xFFFFFFFF for no colormod
 	 */
 	static void blit(byte *dst, const byte *src,
-					 const uint dstPitch, const uint srcPitch,
-					 const int posX, const int posY,
-					 const uint width, const uint height,
-					 const int scaleX, const int scaleY,
-					 const uint32 colorMod = 0, const uint flipping = FLIP_NONE,
-					 const TSpriteBlendMode blendMode = BLEND_NORMAL,
-					 const AlphaType alphaType = ALPHA_FULL);
+			  const uint dstPitch, const uint srcPitch,
+			  const int posX, const int posY,
+			  const uint width, const uint height,
+			  const int scaleX, const int scaleY,
+			  const uint32 colorMod, const uint flipping,
+			  const TSpriteBlendMode blendMode,
+			  const AlphaType alphaType);
 
 	friend struct TransparentSurface;
 }; // End of class BlendBlit
diff --git a/graphics/module.mk b/graphics/module.mk
index 0d12dc92c88..427b99a1667 100644
--- a/graphics/module.mk
+++ b/graphics/module.mk
@@ -5,6 +5,7 @@ MODULE_OBJS := \
 	blit.o \
 	blit-alpha.o \
 	blit-scale.o \
+	blit-neon.o \
 	cursorman.o \
 	font.o \
 	fontman.o \


Commit: eebadf44952150810d08b4e554d92efa39904658
    https://github.com/scummvm/scummvm/commit/eebadf44952150810d08b4e554d92efa39904658
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
GRAPHICS: Optimize alpha blend NEON and Generic

I optimized the NEON and Generic paths for ManagedSurface::blendBlitFrom
and the new TransparentSurface::blit. Now (on arm), the new blit function
matches the speed of the old blit function even with the added
indirections that the runtime extension detection code adds in.

Other than that, I made a benchmark for this code and you can make it
using this command:
CFLAGS="-DTEST_BLEND_SPEED" make test

I reverted the Wii backend so it no longer reports Altivec support, since the Wii doesn't use it.

I also removed graphics/blit-neon.cpp from graphics/module.mk because
simply including the .cpp file in graphics/blit-alpha.cpp was a better
option because then I didn't need to instantiate every version of the
templates that I needed.

Changed paths:
    backends/platform/wii/osystem.cpp
    graphics/blit-alpha.cpp
    graphics/blit-neon.cpp
    graphics/blit.h
    graphics/module.mk
    test/image/blending.h
    test/null_osystem.cpp


diff --git a/backends/platform/wii/osystem.cpp b/backends/platform/wii/osystem.cpp
index 07905b6b4ae..42a9b6b06d4 100644
--- a/backends/platform/wii/osystem.cpp
+++ b/backends/platform/wii/osystem.cpp
@@ -176,8 +176,7 @@ bool OSystem_Wii::hasFeature(Feature f) {
 	return (f == kFeatureFullscreenMode) ||
 			(f == kFeatureAspectRatioCorrection) ||
 			(f == kFeatureCursorPalette) ||
-			(f == kFeatureOverlaySupportsAlpha) ||
-			(f == kFeatureAltivec);
+			(f == kFeatureOverlaySupportsAlpha));
 }
 
 void OSystem_Wii::setFeatureState(Feature f, bool enable) {
diff --git a/graphics/blit-alpha.cpp b/graphics/blit-alpha.cpp
index afb363e7cfb..67f5639ae57 100644
--- a/graphics/blit-alpha.cpp
+++ b/graphics/blit-alpha.cpp
@@ -23,6 +23,8 @@
 #include "graphics/pixelformat.h"
 #include "common/system.h"
 
+#include "graphics/blit-neon.cpp"
+
 namespace Graphics {
 
 namespace {
@@ -203,7 +205,7 @@ BlendBlit::Args::Args(byte *dst, const byte *src,
 /**
  * Optimized version of doBlit to be used with multiply blended blitting
  */
-template<bool doscale>
+template<bool doscale, bool rgbmod, bool alphamod>
 void BlendBlit::doBlitMultiplyBlendLogicGeneric(Args &args) {
 	const byte *in;
 	byte *out;
@@ -211,10 +213,10 @@ void BlendBlit::doBlitMultiplyBlendLogicGeneric(Args &args) {
 	int scaleXCtr, scaleYCtr = 0;
 	const byte *inBase;
 
-	byte ca = args.alphamod ? ((args.color >> kAModShift) & 0xFF) : 255;
-	byte cr = args.rgbmod   ? ((args.color >> kRModShift) & 0xFF) : 255;
-	byte cg = args.rgbmod   ? ((args.color >> kGModShift) & 0xFF) : 255;
-	byte cb = args.rgbmod   ? ((args.color >> kBModShift) & 0xFF) : 255;
+	const byte ca = alphamod ? ((args.color >> kAModShift) & 0xFF) : 255;
+	const byte cr = rgbmod   ? ((args.color >> kRModShift) & 0xFF) : 255;
+	const byte cg = rgbmod   ? ((args.color >> kGModShift) & 0xFF) : 255;
+	const byte cb = rgbmod   ? ((args.color >> kBModShift) & 0xFF) : 255;
 
 	for (uint32 i = 0; i < args.height; i++) {
 		if (doscale) {
@@ -266,7 +268,7 @@ void BlendBlit::doBlitMultiplyBlendLogicGeneric(Args &args) {
 
 }
 
-template<bool doscale>
+template<bool doscale, bool rgbmod, bool alphamod>
 void BlendBlit::doBlitAlphaBlendLogicGeneric(Args &args) {
 	const byte *in;
 	byte *out;
@@ -274,10 +276,10 @@ void BlendBlit::doBlitAlphaBlendLogicGeneric(Args &args) {
 	int scaleXCtr, scaleYCtr = 0;
 	const byte *inBase;
 
-	byte ca = args.alphamod ? ((args.color >> kAModShift) & 0xFF) : 255;
-	byte cr = args.rgbmod   ? ((args.color >> kRModShift) & 0xFF) : 255;
-	byte cg = args.rgbmod   ? ((args.color >> kGModShift) & 0xFF) : 255;
-	byte cb = args.rgbmod   ? ((args.color >> kBModShift) & 0xFF) : 255;
+	const byte ca = alphamod ? ((args.color >> kAModShift) & 0xFF) : 255;
+	const byte cr = rgbmod   ? ((args.color >> kRModShift) & 0xFF) : 255;
+	const byte cg = rgbmod   ? ((args.color >> kGModShift) & 0xFF) : 255;
+	const byte cb = rgbmod   ? ((args.color >> kBModShift) & 0xFF) : 255;
 
 	for (uint32 i = 0; i < args.height; i++) {
 		if (doscale) {
@@ -294,15 +296,37 @@ void BlendBlit::doBlitAlphaBlendLogicGeneric(Args &args) {
 
 			uint32 ina = in[kAIndex] * ca >> 8;
 
-			if (ina != 0) {
-				uint outb = (out[kBIndex] * (255 - ina) >> 8);
-				uint outg = (out[kGIndex] * (255 - ina) >> 8);
-				uint outr = (out[kRIndex] * (255 - ina) >> 8);
-
-				out[kAIndex] = 255;
-				out[kBIndex] = outb + (in[kBIndex] * ina * cb >> 16);
-				out[kGIndex] = outg + (in[kGIndex] * ina * cg >> 16);
-				out[kRIndex] = outr + (in[kRIndex] * ina * cr >> 16);
+			if (rgbmod) {
+				if (ina != 0) {
+					const uint outb = (out[kBIndex] * (255 - ina) >> 8);
+					const uint outg = (out[kGIndex] * (255 - ina) >> 8);
+					const uint outr = (out[kRIndex] * (255 - ina) >> 8);
+
+					out[kAIndex] = 255;
+					out[kBIndex] = outb + (in[kBIndex] * ina * cb >> 16);
+					out[kGIndex] = outg + (in[kGIndex] * ina * cg >> 16);
+					out[kRIndex] = outr + (in[kRIndex] * ina * cr >> 16);
+				}
+			} else {
+				if (ina != 0) {
+					// Runs faster on newer hardware (doesn't do single byte manip)
+					const uint32 in32 = *(const uint32 *)in;
+					const uint32 out32 = *(const uint32 *)out;
+					const uint32 rb = (in32 & (kRModMask | kBModMask)) >> 8;
+					const uint32 g = in32 & kGModMask;
+					const uint32 dstrb = (out32 & (kRModMask | kBModMask)) >> 8;
+					const uint32 dstg = out32 & kGModMask;
+					*(uint32 *)out = kAModMask |
+						((dstrb * (255 - ina) + rb * ina) & (kRModMask | kBModMask)) |
+						((dstg * (255 - ina) + g * ina) >> 8);
+
+					// I think this code will run faster on older hardware
+					// TODO maybe?: Put #ifdef to use on older hardware
+					//out[kAIndex] = 255;
+					//out[kBIndex] = (out[kBIndex] * (255 - ina) + in[kBIndex] * ina) >> 8;
+					//out[kGIndex] = (out[kGIndex] * (255 - ina) + in[kGIndex] * ina) >> 8;
+					//out[kRIndex] = (out[kRIndex] * (255 - ina) + in[kRIndex] * ina) >> 8;
+				}
 			}
 
 			if (doscale)
@@ -323,7 +347,7 @@ void BlendBlit::doBlitAlphaBlendLogicGeneric(Args &args) {
 /**
  * Optimized version of doBlit to be used with subtractive blended blitting
  */
-template<bool doscale>
+template<bool doscale, bool rgbmod>
 void BlendBlit::doBlitSubtractiveBlendLogicGeneric(Args &args) {
 	const byte *in;
 	byte *out;
@@ -331,9 +355,9 @@ void BlendBlit::doBlitSubtractiveBlendLogicGeneric(Args &args) {
 	int scaleXCtr, scaleYCtr = 0;
 	const byte *inBase;
 
-	byte cr = args.rgbmod   ? ((args.color >> kRModShift) & 0xFF) : 255;
-	byte cg = args.rgbmod   ? ((args.color >> kGModShift) & 0xFF) : 255;
-	byte cb = args.rgbmod   ? ((args.color >> kBModShift) & 0xFF) : 255;
+	const byte cr = rgbmod   ? ((args.color >> kRModShift) & 0xFF) : 255;
+	const byte cg = rgbmod   ? ((args.color >> kGModShift) & 0xFF) : 255;
+	const byte cb = rgbmod   ? ((args.color >> kBModShift) & 0xFF) : 255;
 
 	for (uint32 i = 0; i < args.height; i++) {
 		if (doscale) {
@@ -384,7 +408,7 @@ void BlendBlit::doBlitSubtractiveBlendLogicGeneric(Args &args) {
 /**
  * Optimized version of doBlit to be used with additive blended blitting
  */
-template<bool doscale>
+template<bool doscale, bool rgbmod, bool alphamod>
 void BlendBlit::doBlitAdditiveBlendLogicGeneric(Args &args) {
 	const byte *in;
 	byte *out;
@@ -392,10 +416,10 @@ void BlendBlit::doBlitAdditiveBlendLogicGeneric(Args &args) {
 	int scaleXCtr, scaleYCtr = 0;
 	const byte *inBase;
 
-	byte ca = args.alphamod ? ((args.color >> kAModShift) & 0xFF) : 255;
-	byte cr = args.rgbmod   ? ((args.color >> kRModShift) & 0xFF) : 255;
-	byte cg = args.rgbmod   ? ((args.color >> kGModShift) & 0xFF) : 255;
-	byte cb = args.rgbmod   ? ((args.color >> kBModShift) & 0xFF) : 255;
+	const byte ca = alphamod ? ((args.color >> kAModShift) & 0xFF) : 255;
+	const byte cr = rgbmod   ? ((args.color >> kRModShift) & 0xFF) : 255;
+	const byte cg = rgbmod   ? ((args.color >> kGModShift) & 0xFF) : 255;
+	const byte cb = rgbmod   ? ((args.color >> kBModShift) & 0xFF) : 255;
 
 	for (uint32 i = 0; i < args.height; i++) {
 		if (doscale) {
@@ -552,12 +576,12 @@ void BlendBlit::blit(byte *dst, const byte *src,
 					 const AlphaType alphaType) {
 	if (width == 0 || height == 0) return;
 	if (!blitFunc) {
-		// Get the correct blit function
+	// Get the correct blit function
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
-		if (g_system->hasFeature(OSystem::kFeatureNEON)) blitFunc = blitNEON;
-		else blitFunc = blitGeneric;
+	if (g_system->hasFeature(OSystem::kFeatureNEON)) blitFunc = blitNEON;
+	else blitFunc = blitGeneric;
 #else
-		blitFunc = blitGeneric;
+	blitFunc = blitGeneric;
 #endif
 	}
 	
@@ -565,8 +589,11 @@ void BlendBlit::blit(byte *dst, const byte *src,
 	blitFunc(args, blendMode, alphaType);
 }
 
+// Let me know if there is a way to do function pointer to templated functions
 #define BLIT_FUNC(ext) \
 	void BlendBlit::blit##ext(Args &args, const TSpriteBlendMode &blendMode, const AlphaType &alphaType) { \
+		bool rgbmod   = ((args.color & kRGBModMask) != kRGBModMask); \
+		bool alphamod = ((args.color & kAModMask)   != kAModMask); \
 		if (args.scaleX == SCALE_THRESHOLD && args.scaleY == SCALE_THRESHOLD) { \
 			if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_OPAQUE) { \
 				doBlitOpaqueBlendLogic##ext<false>(args); \
@@ -574,14 +601,54 @@ void BlendBlit::blit(byte *dst, const byte *src,
 				doBlitBinaryBlendLogic##ext<false>(args); \
 			} else { \
 				if (blendMode == BLEND_ADDITIVE) { \
-					doBlitAdditiveBlendLogic##ext<false>(args); \
+					if (rgbmod) { \
+						if (alphamod) { \
+							doBlitAdditiveBlendLogic##ext<false, true, true>(args); \
+						} else { \
+							doBlitAdditiveBlendLogic##ext<false, true, false>(args); \
+						} \
+					} else { \
+						if (alphamod) { \
+							doBlitAdditiveBlendLogic##ext<false, false, true>(args); \
+						} else { \
+							doBlitAdditiveBlendLogic##ext<false, false, false>(args); \
+						} \
+					} \
 				} else if (blendMode == BLEND_SUBTRACTIVE) { \
-					doBlitSubtractiveBlendLogic##ext<false>(args); \
+					if (rgbmod) { \
+						doBlitSubtractiveBlendLogic##ext<false, true>(args); \
+					} else { \
+						doBlitSubtractiveBlendLogic##ext<false, false>(args); \
+					} \
 				} else if (blendMode == BLEND_MULTIPLY) { \
-					doBlitMultiplyBlendLogic##ext<false>(args); \
+					if (rgbmod) { \
+						if (alphamod) { \
+							doBlitMultiplyBlendLogic##ext<false, true, true>(args); \
+						} else { \
+							doBlitMultiplyBlendLogic##ext<false, true, false>(args); \
+						} \
+					} else { \
+						if (alphamod) { \
+							doBlitMultiplyBlendLogic##ext<false, false, true>(args); \
+						} else { \
+							doBlitMultiplyBlendLogic##ext<false, false, false>(args); \
+						} \
+					} \
 				} else { \
 					assert(blendMode == BLEND_NORMAL); \
-					doBlitAlphaBlendLogic##ext<false>(args); \
+					if (rgbmod) { \
+						if (alphamod) { \
+							doBlitAlphaBlendLogic##ext<false, true, true>(args); \
+						} else { \
+							doBlitAlphaBlendLogic##ext<false, true, false>(args); \
+						} \
+					} else { \
+						if (alphamod) { \
+							doBlitAlphaBlendLogic##ext<false, false, true>(args); \
+						} else { \
+							doBlitAlphaBlendLogic##ext<false, false, false>(args); \
+						} \
+					} \
 				} \
 			} \
 		} else { \
@@ -591,14 +658,54 @@ void BlendBlit::blit(byte *dst, const byte *src,
 				doBlitBinaryBlendLogic##ext<true>(args); \
 			} else { \
 				if (blendMode == BLEND_ADDITIVE) { \
-					doBlitAdditiveBlendLogic##ext<true>(args); \
+					if (rgbmod) { \
+						if (alphamod) { \
+							doBlitAdditiveBlendLogic##ext<true, true, true>(args); \
+						} else { \
+							doBlitAdditiveBlendLogic##ext<true, true, false>(args); \
+						} \
+					} else { \
+						if (alphamod) { \
+							doBlitAdditiveBlendLogic##ext<true, false, true>(args); \
+						} else { \
+							doBlitAdditiveBlendLogic##ext<true, false, false>(args); \
+						} \
+					} \
 				} else if (blendMode == BLEND_SUBTRACTIVE) { \
-					doBlitSubtractiveBlendLogic##ext<true>(args); \
+					if (rgbmod) { \
+						doBlitSubtractiveBlendLogic##ext<true, true>(args); \
+					} else { \
+						doBlitSubtractiveBlendLogic##ext<true, false>(args); \
+					} \
 				} else if (blendMode == BLEND_MULTIPLY) { \
-					doBlitMultiplyBlendLogic##ext<true>(args); \
+					if (rgbmod) { \
+						if (alphamod) { \
+							doBlitMultiplyBlendLogic##ext<true, true, true>(args); \
+						} else { \
+							doBlitMultiplyBlendLogic##ext<true, true, false>(args); \
+						} \
+					} else { \
+						if (alphamod) { \
+							doBlitMultiplyBlendLogic##ext<true, false, true>(args); \
+						} else { \
+							doBlitMultiplyBlendLogic##ext<true, false, false>(args); \
+						} \
+					} \
 				} else { \
 					assert(blendMode == BLEND_NORMAL); \
-					doBlitAlphaBlendLogic##ext<true>(args); \
+					if (rgbmod) { \
+						if (alphamod) { \
+							doBlitAlphaBlendLogic##ext<true, true, true>(args); \
+						} else { \
+							doBlitAlphaBlendLogic##ext<true, true, false>(args); \
+						} \
+					} else { \
+						if (alphamod) { \
+							doBlitAlphaBlendLogic##ext<true, false, true>(args); \
+						} else { \
+							doBlitAlphaBlendLogic##ext<true, false, false>(args); \
+						} \
+					} \
 				} \
 			} \
 		} \
diff --git a/graphics/blit-neon.cpp b/graphics/blit-neon.cpp
index 09b7e27d9e0..d60821489dc 100644
--- a/graphics/blit-neon.cpp
+++ b/graphics/blit-neon.cpp
@@ -19,12 +19,14 @@
  *
  */
 
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#include <arm_neon.h>
+
 #include "graphics/blit.h"
 #include "graphics/pixelformat.h"
 
 namespace Graphics {
 
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
 template<bool doscale>
 void BlendBlit::doBlitBinaryBlendLogicNEON(Args &args) {
     (void)args;
@@ -33,33 +35,143 @@ template<bool doscale>
 void BlendBlit::doBlitOpaqueBlendLogicNEON(Args &args) {
     (void)args;
 }
-template<bool doscale>
+template<bool doscale, bool rgbmod, bool alphamod>
 void BlendBlit::doBlitMultiplyBlendLogicNEON(Args &args) {
     (void)args;
 }
-template<bool doscale>
+template<bool doscale, bool rgbmod>
 void BlendBlit::doBlitSubtractiveBlendLogicNEON(Args &args) {
     (void)args;
 }
-template<bool doscale>
+template<bool doscale, bool rgbmod, bool alphamod>
 void BlendBlit::doBlitAdditiveBlendLogicNEON(Args &args) {
     (void)args;
 }
-template<bool doscale>
-void BlendBlit::doBlitAlphaBlendLogicNEON(Args &args) {
-    (void)args;
+
+template<bool rgbmod, bool alphamod>
+static inline uint32x4_t drawPixelAlphaBlend(uint32x4_t src, uint32x4_t dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+    if (flip) {
+        src = vrev64q_u32(src);
+	    src = vcombine_u32(vget_high_u32(src), vget_low_u32(src));
+    }
+    uint32x4_t ina;
+    if (alphamod)
+        ina = vshrq_n_u32(vmulq_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask)), vdupq_n_u32(ca)), 8);
+    else
+        ina = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
+    uint32x4_t alphaMask = vceqq_u32(ina, vmovq_n_u32(0));
+
+    if (rgbmod) {
+        uint32x4_t dstR = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask)), 16);
+        uint32x4_t srcR = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask)), 16);
+        uint32x4_t dstG = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask)), 8);
+        uint32x4_t srcG = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask)), 8);
+        uint32x4_t dstB = vandq_u32(dst, vmovq_n_u32(BlendBlit::kBModMask));
+        uint32x4_t srcB = vandq_u32(src, vmovq_n_u32(BlendBlit::kBModMask));
+
+        dstR = vshrq_n_u32(vmulq_u32(dstR, vsubq_u32(vmovq_n_u32(255), ina)), 8);
+        dstG = vshrq_n_u32(vmulq_u32(dstG, vsubq_u32(vmovq_n_u32(255), ina)), 8);
+        dstB = vshrq_n_u32(vmulq_u32(dstB, vsubq_u32(vmovq_n_u32(255), ina)), 8);
+        srcR = vaddq_u32(dstR, vshrq_n_u32(vmulq_u32(vmulq_u32(srcR, ina), vmovq_n_u32(cr)), 16));
+        srcG = vaddq_u32(dstG, vshrq_n_u32(vmulq_u32(vmulq_u32(srcG, ina), vmovq_n_u32(cg)), 16));
+        srcB = vaddq_u32(dstB, vshrq_n_u32(vmulq_u32(vmulq_u32(srcB, ina), vmovq_n_u32(cb)), 16));
+        src = vorrq_u32(vandq_u32(srcB, vmovq_n_u32(BlendBlit::kBModMask)), vmovq_n_u32(BlendBlit::kAModMask));
+        src = vorrq_u32(vandq_u32(vshlq_n_u32(srcG, 8), vmovq_n_u32(BlendBlit::kGModMask)), src);
+        src = vorrq_u32(vandq_u32(vshlq_n_u32(srcR, 16), vmovq_n_u32(BlendBlit::kRModMask)), src);
+    } else {
+        uint32x4_t dstRB = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), 8);
+        uint32x4_t srcRB = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), 8);
+        uint32x4_t dstG = vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask));
+        uint32x4_t srcG = vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask));
+
+        dstRB = vshrq_n_u32(vmulq_u32(dstRB, vsubq_u32(vmovq_n_u32(255), ina)), 8);
+        dstG = vshrq_n_u32(vmulq_u32(dstG, vsubq_u32(vmovq_n_u32(255), ina)), 8);
+        srcRB = vaddq_u32(dstRB, vshrq_n_u32(vmulq_u32(srcRB, ina), 8));
+        srcG = vaddq_u32(dstG, vshrq_n_u32(vmulq_u32(srcG, ina), 8));
+        src = vorrq_u32(vandq_u32(srcG, vmovq_n_u32(BlendBlit::kGModMask)), vmovq_n_u32(BlendBlit::kAModMask));
+        src = vorrq_u32(vandq_u32(vshlq_n_u32(srcRB, 8), vmovq_n_u32(BlendBlit::kBModMask | BlendBlit::kRModMask)), src);
+    }
+
+    dst = vandq_u32(alphaMask, dst);
+    src = vandq_u32(vmvnq_u32(alphaMask), src);
+    return vorrq_u32(dst, src);
 }
+template<bool doscale, bool rgbmod, bool alphamod>
+void BlendBlit::doBlitAlphaBlendLogicNEON(Args &args) {
+	const byte *in;
+	byte *out;
 
-#define INSTANTIATE_BLIT_TEMPLATES(ext, b) \
-    template void BlendBlit::doBlitBinaryBlendLogic##ext<b>(Args &); \
-    template void BlendBlit::doBlitOpaqueBlendLogic##ext<b>(Args &); \
-    template void BlendBlit::doBlitMultiplyBlendLogic##ext<b>(Args &); \
-    template void BlendBlit::doBlitSubtractiveBlendLogic##ext<b>(Args &); \
-    template void BlendBlit::doBlitAdditiveBlendLogic##ext<b>(Args &); \
-    template void BlendBlit::doBlitAlphaBlendLogic##ext<b>(Args &);
-INSTANTIATE_BLIT_TEMPLATES(NEON, true)
-INSTANTIATE_BLIT_TEMPLATES(NEON, false)
-#undef INSTANTIATE_BLIT_TEMPLATES
-#endif // __ARM_NEON__
+	int scaleXCtr, scaleYCtr = 0;
+	const byte *inBase;
+
+	const byte ca = alphamod ? ((args.color >> kAModShift) & 0xFF) : 255;
+	const byte cr = rgbmod   ? ((args.color >> kRModShift) & 0xFF) : 255;
+	const byte cg = rgbmod   ? ((args.color >> kGModShift) & 0xFF) : 255;
+	const byte cb = rgbmod   ? ((args.color >> kBModShift) & 0xFF) : 255;
+
+	for (uint32 i = 0; i < args.height; i++) {
+		if (doscale) {
+			inBase = args.ino + scaleYCtr / SCALE_THRESHOLD * args.inoStep;
+			scaleXCtr = 0;
+		} else {
+			in = args.ino;
+		}
+		out = args.outo;
 
+        uint32 j;
+		for (j = 0; j + 4 < args.width; j += 4) {
+            uint32x4_t dstPixels = vld1q_u32((const uint32 *)out);
+            uint32x4_t srcPixels;
+            if (!doscale) {
+                srcPixels = vld1q_u32((const uint32 *)in);
+            } else {
+				srcPixels = vsetq_lane_u32(*(const uint32 *)(inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep), vmovq_n_u32(0), 0);
+                scaleXCtr += args.scaleX;
+				srcPixels = vsetq_lane_u32(*(const uint32 *)(inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep), srcPixels, 0);
+                scaleXCtr += args.scaleX;
+				srcPixels = vsetq_lane_u32(*(const uint32 *)(inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep), srcPixels, 0);
+                scaleXCtr += args.scaleX;
+				srcPixels = vsetq_lane_u32(*(const uint32 *)(inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep), srcPixels, 0);
+                scaleXCtr += args.scaleX;
+            }
+            uint32x4_t res = drawPixelAlphaBlend<rgbmod, alphamod>(srcPixels, dstPixels, args.flipping & FLIP_H, ca, cr, cg, cb);
+            vst1q_u32((uint32 *)out, res);
+			if (!doscale)
+				in += args.inStep * 4;
+			out += 4 * 4;
+		}
+		for (; j < args.width; j++) {
+			if (doscale) {
+				in = inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep;
+			}
+			uint32 ina = in[kAIndex] * ca >> 8;
+
+			if (ina != 0) {
+				uint outb = (out[kBIndex] * (255 - ina) >> 8);
+				uint outg = (out[kGIndex] * (255 - ina) >> 8);
+				uint outr = (out[kRIndex] * (255 - ina) >> 8);
+
+				out[kAIndex] = 255;
+				out[kBIndex] = outb + (in[kBIndex] * ina * cb >> 16);
+				out[kGIndex] = outg + (in[kGIndex] * ina * cg >> 16);
+				out[kRIndex] = outr + (in[kRIndex] * ina * cr >> 16);
+			}
+
+			if (doscale)
+				scaleXCtr += args.scaleX;
+			else
+				in += args.inStep;
+			out += 4;
+		}
+        
+		if (doscale)
+			scaleYCtr += args.scaleY;
+		else
+			args.ino += args.inoStep;
+		args.outo += args.dstPitch;
+	}
 }
+
+} // end of namespace Graphics
+
+#endif // __ARM_NEON__
diff --git a/graphics/blit.h b/graphics/blit.h
index e25a23225cc..e2c5356083b 100644
--- a/graphics/blit.h
+++ b/graphics/blit.h
@@ -29,6 +29,8 @@ namespace Common {
 struct Point;
 }
 
+class BlendBlitUnfilteredTestSuite;
+
 namespace Graphics {
 
 /**
@@ -194,29 +196,6 @@ bool setAlpha(byte *dst, const byte *src,
 // This is a class so that we can declare certain things as private
 class BlendBlit {
 private:
-	static const int kBModShift = 8;
-	static const int kGModShift = 16;
-	static const int kRModShift = 24;
-	static const int kAModShift = 0;
-	
-	static const uint32 kBModMask = 0x0000ff00;
-	static const uint32 kGModMask = 0x00ff0000;
-	static const uint32 kRModMask = 0xff000000;
-	static const uint32 kAModMask = 0x000000ff;
-	static const uint32 kRGBModMask = (kRModMask | kGModMask | kBModMask);
-	
-#ifdef SCUMM_LITTLE_ENDIAN
-	static const int kAIndex = 0;
-	static const int kBIndex = 1;
-	static const int kGIndex = 2;
-	static const int kRIndex = 3;
-#else
-	static const int kAIndex = 3;
-	static const int kBIndex = 2;
-	static const int kGIndex = 1;
-	static const int kRIndex = 0;
-#endif
-
 	struct Args {
 		bool rgbmod, alphamod;
 		int xp, yp;
@@ -246,33 +225,56 @@ private:
 	static void doBlitBinaryBlendLogic##ext(Args &args); \
 	template<bool doscale> \
 	static void doBlitOpaqueBlendLogic##ext(Args &args); \
-	template<bool doscale> \
+	template<bool doscale, bool rgbmod, bool alphamod> \
 	static void doBlitMultiplyBlendLogic##ext(Args &args); \
-	template<bool doscale> \
+	template<bool doscale, bool rgbmod> \
 	static void doBlitSubtractiveBlendLogic##ext(Args &args); \
-	template<bool doscale> \
+	template<bool doscale, bool rgbmod, bool alphamod> \
 	static void doBlitAdditiveBlendLogic##ext(Args &args); \
-	template<bool doscale> \
+	template<bool doscale, bool rgbmod, bool alphamod> \
 	static void doBlitAlphaBlendLogic##ext(Args &args); \
 	static void blit##ext(Args &args, const TSpriteBlendMode &blendMode, const AlphaType &alphaType);
-LOGIC_FUNCS_EXT(Generic)
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
 LOGIC_FUNCS_EXT(NEON)
 #endif
+LOGIC_FUNCS_EXT(Generic)
 #undef LOGIC_FUNCS_EXT
 
 	typedef void(*BlitFunc)(Args &, const TSpriteBlendMode &, const AlphaType &);
 	static BlitFunc blitFunc;
+	friend class ::BlendBlitUnfilteredTestSuite;
 
 public:
 	static const int SCALE_THRESHOLD = 0x100;
+	static const int kBModShift = 8;
+	static const int kGModShift = 16;
+	static const int kRModShift = 24;
+	static const int kAModShift = 0;
+	
+	static const uint32 kBModMask = 0x0000ff00;
+	static const uint32 kGModMask = 0x00ff0000;
+	static const uint32 kRModMask = 0xff000000;
+	static const uint32 kAModMask = 0x000000ff;
+	static const uint32 kRGBModMask = (kRModMask | kGModMask | kBModMask);
+	
+#ifdef SCUMM_LITTLE_ENDIAN
+	static const int kAIndex = 0;
+	static const int kBIndex = 1;
+	static const int kGIndex = 2;
+	static const int kRIndex = 3;
+#else
+	static const int kAIndex = 3;
+	static const int kBIndex = 2;
+	static const int kGIndex = 1;
+	static const int kRIndex = 0;
+#endif
 
 	static inline int getScaleFactor(int srcSize, int dstSize) {
 		return SCALE_THRESHOLD * srcSize / dstSize;
 	}
 
 	/**
-	 * Returns the pixel format all operations of TransparentSurface support.
+	 * Returns the pixel format all operations of BlendBlit::blit support.
 	 *
 	 * Use TS_ARGB and TS_RGB to quickly make a color in this format.
 	 * TS_ARGB/RGB are found in graphics/transform_struct.h
@@ -303,7 +305,6 @@ public:
 			  const TSpriteBlendMode blendMode,
 			  const AlphaType alphaType);
 
-	friend struct TransparentSurface;
 }; // End of class BlendBlit
 
 /** @} */
diff --git a/graphics/module.mk b/graphics/module.mk
index 427b99a1667..0d12dc92c88 100644
--- a/graphics/module.mk
+++ b/graphics/module.mk
@@ -5,7 +5,6 @@ MODULE_OBJS := \
 	blit.o \
 	blit-alpha.o \
 	blit-scale.o \
-	blit-neon.o \
 	cursorman.o \
 	font.o \
 	fontman.o \
diff --git a/test/image/blending.h b/test/image/blending.h
index bffaa2b25f1..8191c6bf8c5 100644
--- a/test/image/blending.h
+++ b/test/image/blending.h
@@ -6,6 +6,7 @@
 
 #include "common/fs.h"
 #include "common/stream.h"
+#include "common/system.h"
 
 #include "graphics/surface.h"
 #include "graphics/managed_surface.h"
@@ -896,7 +897,103 @@ static bool areSurfacesEqual(const Graphics::Surface *a, const Graphics::Surface
 
 class BlendBlitUnfilteredTestSuite : public CxxTest::TestSuite {
 public:
+	void test_blend_speed() {
+#ifdef TEST_BLEND_SPEED
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+		Graphics::BlendBlit::blitFunc = Graphics::BlendBlit::blitNEON;
+#else
+		Graphics::BlendBlit::blitFunc = Graphics::BlendBlit::blitGeneric;
+#endif
+	
+	    Graphics::Surface baseSurface, destSurface;
+	    baseSurface.create(128, 128, OldTransparentSurface::OldTransparentSurface::getSupportedPixelFormat());
+	    destSurface.create(256, 256, OldTransparentSurface::OldTransparentSurface::getSupportedPixelFormat());
+	    for (int y = 0; y < baseSurface.h; y++) {
+	    	for (int x = 0; x < baseSurface.w; x++) {
+                int i = x / 4 + y / 4;
+	    		baseSurface.setPixel(x, y, baseSurface.format.ARGBToColor((i & 16) * 255, (i & 1) * 255, (i & 2) * 255, (i & 4) * 255));
+	    	}
+	    }
+
+	    OldTransparentSurface::OldTransparentSurface oldSurf(baseSurface, true);
+	    OldTransparentSurface::OldTransparentSurface oldSurfDest(destSurface, true);
+	    Graphics::ManagedSurface managedSurf(&baseSurface, DisposeAfterUse::NO);
+	    Graphics::ManagedSurface managedSurfDest(&destSurface, DisposeAfterUse::NO);
+
+		int numIters = 0, numItersScaled = 0;
+		double oldTime = 0.0, newTime = 0.0, genericTime = 0.0;
+		double oldTimeScaled = 0.0, newTimeScaled = 0.0, genericTimeScaled = 0.0;
+		const int iters = 2500;
+		
+        for (int blendMode = Graphics::BLEND_NORMAL; blendMode < Graphics::BLEND_NORMAL + 1; blendMode++) {
+        for (int alphaType = 0; alphaType <= Graphics::ALPHA_FULL; alphaType++) {
+        for (int flipping = 0; flipping <= 3; flipping++) {
+            oldSurfDest.fillRect(Common::Rect(0, 0, oldSurfDest.w, oldSurfDest.h), oldSurfDest.format.ARGBToColor(255, 255, 255, 255));
+            managedSurfDest.fillRect(Common::Rect(0, 0, managedSurfDest.w, managedSurfDest.h), managedSurfDest.format.ARGBToColor(255, 255, 255, 255));
+            oldSurf.setAlphaMode((Graphics::AlphaType)alphaType);
+			uint32 oldStart = g_system->getMillis();
+			for (int i = 0; i < iters; i++) {
+            	oldSurf.blit(oldSurfDest, 0, 0, flipping, nullptr, TS_ARGB(255, 255, 255, 255), -1, -1, (Graphics::TSpriteBlendMode)blendMode);
+			}
+			oldTime += g_system->getMillis() - oldStart;
+			uint32 newStart = g_system->getMillis();
+			for (int i = 0; i < iters; i++) {
+            	managedSurfDest.blendBlitFrom(managedSurf, Common::Rect(0, 0, managedSurf.w, managedSurf.h), Common::Rect(0, 0, managedSurf.w, managedSurf.h), flipping, TS_ARGB(255, 255, 255, 255), (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
+			}
+			newTime += g_system->getMillis() - newStart;
+            managedSurfDest.fillRect(Common::Rect(0, 0, managedSurfDest.w, managedSurfDest.h), managedSurfDest.format.ARGBToColor(255, 255, 255, 255));
+			Graphics::BlendBlit::BlitFunc oldFunc = Graphics::BlendBlit::blitFunc;
+			Graphics::BlendBlit::blitFunc = Graphics::BlendBlit::blitGeneric;
+			uint32 genericStart = g_system->getMillis();
+			for (int i = 0; i < iters; i++) {
+            	managedSurfDest.blendBlitFrom(managedSurf, Common::Rect(0, 0, managedSurf.w, managedSurf.h), Common::Rect(0, 0, managedSurf.w, managedSurf.h), flipping, TS_ARGB(255, 255, 255, 255), (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
+			}
+			Graphics::BlendBlit::blitFunc = oldFunc;
+			genericTime += g_system->getMillis() - genericStart;
+			numIters ++;
+
+			// scaled
+            oldSurfDest.fillRect(Common::Rect(0, 0, oldSurfDest.w, oldSurfDest.h), oldSurfDest.format.ARGBToColor(255, 255, 255, 255));
+            managedSurfDest.fillRect(Common::Rect(0, 0, managedSurfDest.w, managedSurfDest.h), managedSurfDest.format.ARGBToColor(255, 255, 255, 255));
+            oldSurf.setAlphaMode((Graphics::AlphaType)alphaType);
+			oldStart = g_system->getMillis();
+			for (int i = 0; i < iters; i++) {
+            	oldSurf.blit(oldSurfDest, 0, 0, flipping, nullptr, TS_ARGB(255, 255, 255, 255), oldSurfDest.w, oldSurfDest.h, (Graphics::TSpriteBlendMode)blendMode);
+			}
+			oldTimeScaled += g_system->getMillis() - oldStart;
+			newStart = g_system->getMillis();
+			for (int i = 0; i < iters; i++) {
+            	managedSurfDest.blendBlitFrom(managedSurf, Common::Rect(0, 0, managedSurfDest.w, managedSurfDest.h), Common::Rect(0, 0, managedSurf.w, managedSurf.h), flipping, TS_ARGB(255, 255, 255, 255), (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
+			}
+			newTimeScaled += g_system->getMillis() - newStart;
+            managedSurfDest.fillRect(Common::Rect(0, 0, managedSurfDest.w, managedSurfDest.h), managedSurfDest.format.ARGBToColor(255, 255, 255, 255));
+			oldFunc = Graphics::BlendBlit::blitFunc;
+			Graphics::BlendBlit::blitFunc = Graphics::BlendBlit::blitGeneric;
+			genericStart = g_system->getMillis();
+			for (int i = 0; i < iters; i++) {
+            	managedSurfDest.blendBlitFrom(managedSurf, Common::Rect(0, 0, managedSurfDest.w, managedSurfDest.h), Common::Rect(0, 0, managedSurf.w, managedSurf.h), flipping, TS_ARGB(255, 255, 255, 255), (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
+			}
+			Graphics::BlendBlit::blitFunc = oldFunc;
+			genericTimeScaled += g_system->getMillis() - genericStart;
+			numItersScaled++;
+        } // flipping
+        } // alpha
+        } // blend
+
+		debug("Old TransparentSurface::blit avg time per %d iters (in milliseconds): %f\n", iters, oldTime / numIters);
+		debug("New ManagedSurface::blendBlitFrom (non SIMD) avg time per %d iters (in milliseconds): %f\n", iters, genericTime / numIters);
+		debug("New ManagedSurface::blendBlitFrom avg time per %d iters (in milliseconds): %f\n", iters, newTime / numIters);
+		debug("Old SCALING TransparentSurface::blit avg time per %d iters (in milliseconds): %f\n", iters, oldTimeScaled / numItersScaled);
+		debug("New SCALING ManagedSurface::blendBlitFrom (non SIMD) avg time per %d iters (in milliseconds): %f\n", iters, genericTimeScaled / numItersScaled);
+		debug("New SCALING ManagedSurface::blendBlitFrom avg time per %d iters (in milliseconds): %f\n", iters, newTimeScaled / numItersScaled);
+		debug("Note this speed test puts the old code in the best senario against the new code.");
+
+	    baseSurface.free();
+#endif
+	}
+
     void test_blend_blit_unfiltered() {
+#ifdef TEST_BLEND_SPEED
         Common::Rect dsts[] = {
             Common::Rect(4, 4, 4+16, 4+16), // Case 0 (source clipping)
             Common::Rect(24, 20, 24+16, 20+16), // Case 1 (outside of destination)
@@ -971,7 +1068,7 @@ public:
             newSurf.setAlphaMode((Graphics::AlphaType)alphaType);
             newSurf.blit(newSurfDest, dsts[rect].left, dsts[rect].top, flipping, &srcs[rect], TS_ARGB(a, r, g, b), dsts[rect].width(), dsts[rect].height(), (Graphics::TSpriteBlendMode)blendMode);
             managedSurfDest.fillRect(Common::Rect(0, 0, managedSurfDest.w, managedSurfDest.h), managedSurfDest.format.ARGBToColor(ba, br, bg, bb));
-            managedSurfDest.blendBlitFrom(managedSurf, srcs[rect], dsts[rect], flipping, BLENDBLIT_ARGB(a, r, g, b), (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
+            managedSurfDest.blendBlitFrom(managedSurf, srcs[rect], dsts[rect], flipping, TS_ARGB(a, r, g, b), (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
 
 
 
@@ -1019,5 +1116,6 @@ public:
         } // blend
 
 	    baseSurface.free();
+#endif
     }
 };
diff --git a/test/null_osystem.cpp b/test/null_osystem.cpp
index 6f1d0540109..54078d1446e 100644
--- a/test/null_osystem.cpp
+++ b/test/null_osystem.cpp
@@ -18,6 +18,10 @@ void BaseBackend::initBackend() {
 	OSystem::initBackend();
 }
 
+bool BaseBackend::hasFeature(OSystem::Feature f) {
+	return false;
+}
+
 void BaseBackend::fillScreen(uint32 col) {
 }
 


Commit: 9ad04f800017503f58b2978fe5382343dbcce447
    https://github.com/scummvm/scummvm/commit/9ad04f800017503f58b2978fe5382343dbcce447
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
GRAPHICS: Optimize BLEND_NORMAL on NEON

Changed paths:
    graphics/blit-neon.cpp
    test/image/blending.h


diff --git a/graphics/blit-neon.cpp b/graphics/blit-neon.cpp
index d60821489dc..1d2bb497e44 100644
--- a/graphics/blit-neon.cpp
+++ b/graphics/blit-neon.cpp
@@ -84,12 +84,12 @@ static inline uint32x4_t drawPixelAlphaBlend(uint32x4_t src, uint32x4_t dst, con
         uint32x4_t dstG = vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask));
         uint32x4_t srcG = vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask));
 
-        dstRB = vshrq_n_u32(vmulq_u32(dstRB, vsubq_u32(vmovq_n_u32(255), ina)), 8);
+        dstRB = vmulq_u32(dstRB, vsubq_u32(vmovq_n_u32(255), ina));
         dstG = vshrq_n_u32(vmulq_u32(dstG, vsubq_u32(vmovq_n_u32(255), ina)), 8);
-        srcRB = vaddq_u32(dstRB, vshrq_n_u32(vmulq_u32(srcRB, ina), 8));
+        srcRB = vaddq_u32(dstRB, vmulq_u32(srcRB, ina));
         srcG = vaddq_u32(dstG, vshrq_n_u32(vmulq_u32(srcG, ina), 8));
         src = vorrq_u32(vandq_u32(srcG, vmovq_n_u32(BlendBlit::kGModMask)), vmovq_n_u32(BlendBlit::kAModMask));
-        src = vorrq_u32(vandq_u32(vshlq_n_u32(srcRB, 8), vmovq_n_u32(BlendBlit::kBModMask | BlendBlit::kRModMask)), src);
+        src = vorrq_u32(vandq_u32(srcRB, vmovq_n_u32(BlendBlit::kBModMask | BlendBlit::kRModMask)), src);
     }
 
     dst = vandq_u32(alphaMask, dst);
diff --git a/test/image/blending.h b/test/image/blending.h
index 8191c6bf8c5..8b76c8b77f3 100644
--- a/test/image/blending.h
+++ b/test/image/blending.h
@@ -926,19 +926,20 @@ public:
 		const int iters = 2500;
 		
         for (int blendMode = Graphics::BLEND_NORMAL; blendMode < Graphics::BLEND_NORMAL + 1; blendMode++) {
-        for (int alphaType = 0; alphaType <= Graphics::ALPHA_FULL; alphaType++) {
+        for (int alphaType = Graphics::ALPHA_FULL; alphaType <= Graphics::ALPHA_FULL; alphaType++) {
         for (int flipping = 0; flipping <= 3; flipping++) {
+		for (uint32 color = 0xffffffff; color != 0; color = (color == 0xffffffff ? 0x7f7f7f7f : 0)) {
             oldSurfDest.fillRect(Common::Rect(0, 0, oldSurfDest.w, oldSurfDest.h), oldSurfDest.format.ARGBToColor(255, 255, 255, 255));
             managedSurfDest.fillRect(Common::Rect(0, 0, managedSurfDest.w, managedSurfDest.h), managedSurfDest.format.ARGBToColor(255, 255, 255, 255));
             oldSurf.setAlphaMode((Graphics::AlphaType)alphaType);
 			uint32 oldStart = g_system->getMillis();
 			for (int i = 0; i < iters; i++) {
-            	oldSurf.blit(oldSurfDest, 0, 0, flipping, nullptr, TS_ARGB(255, 255, 255, 255), -1, -1, (Graphics::TSpriteBlendMode)blendMode);
+            	oldSurf.blit(oldSurfDest, 0, 0, flipping, nullptr, color, -1, -1, (Graphics::TSpriteBlendMode)blendMode);
 			}
 			oldTime += g_system->getMillis() - oldStart;
 			uint32 newStart = g_system->getMillis();
 			for (int i = 0; i < iters; i++) {
-            	managedSurfDest.blendBlitFrom(managedSurf, Common::Rect(0, 0, managedSurf.w, managedSurf.h), Common::Rect(0, 0, managedSurf.w, managedSurf.h), flipping, TS_ARGB(255, 255, 255, 255), (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
+            	managedSurfDest.blendBlitFrom(managedSurf, Common::Rect(0, 0, managedSurf.w, managedSurf.h), Common::Rect(0, 0, managedSurf.w, managedSurf.h), flipping, color, (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
 			}
 			newTime += g_system->getMillis() - newStart;
             managedSurfDest.fillRect(Common::Rect(0, 0, managedSurfDest.w, managedSurfDest.h), managedSurfDest.format.ARGBToColor(255, 255, 255, 255));
@@ -946,7 +947,7 @@ public:
 			Graphics::BlendBlit::blitFunc = Graphics::BlendBlit::blitGeneric;
 			uint32 genericStart = g_system->getMillis();
 			for (int i = 0; i < iters; i++) {
-            	managedSurfDest.blendBlitFrom(managedSurf, Common::Rect(0, 0, managedSurf.w, managedSurf.h), Common::Rect(0, 0, managedSurf.w, managedSurf.h), flipping, TS_ARGB(255, 255, 255, 255), (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
+            	managedSurfDest.blendBlitFrom(managedSurf, Common::Rect(0, 0, managedSurf.w, managedSurf.h), Common::Rect(0, 0, managedSurf.w, managedSurf.h), flipping, color, (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
 			}
 			Graphics::BlendBlit::blitFunc = oldFunc;
 			genericTime += g_system->getMillis() - genericStart;
@@ -958,12 +959,12 @@ public:
             oldSurf.setAlphaMode((Graphics::AlphaType)alphaType);
 			oldStart = g_system->getMillis();
 			for (int i = 0; i < iters; i++) {
-            	oldSurf.blit(oldSurfDest, 0, 0, flipping, nullptr, TS_ARGB(255, 255, 255, 255), oldSurfDest.w, oldSurfDest.h, (Graphics::TSpriteBlendMode)blendMode);
+            	oldSurf.blit(oldSurfDest, 0, 0, flipping, nullptr, color, oldSurfDest.w, oldSurfDest.h, (Graphics::TSpriteBlendMode)blendMode);
 			}
 			oldTimeScaled += g_system->getMillis() - oldStart;
 			newStart = g_system->getMillis();
 			for (int i = 0; i < iters; i++) {
-            	managedSurfDest.blendBlitFrom(managedSurf, Common::Rect(0, 0, managedSurfDest.w, managedSurfDest.h), Common::Rect(0, 0, managedSurf.w, managedSurf.h), flipping, TS_ARGB(255, 255, 255, 255), (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
+            	managedSurfDest.blendBlitFrom(managedSurf, Common::Rect(0, 0, managedSurfDest.w, managedSurfDest.h), Common::Rect(0, 0, managedSurf.w, managedSurf.h), flipping, color, (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
 			}
 			newTimeScaled += g_system->getMillis() - newStart;
             managedSurfDest.fillRect(Common::Rect(0, 0, managedSurfDest.w, managedSurfDest.h), managedSurfDest.format.ARGBToColor(255, 255, 255, 255));
@@ -971,11 +972,12 @@ public:
 			Graphics::BlendBlit::blitFunc = Graphics::BlendBlit::blitGeneric;
 			genericStart = g_system->getMillis();
 			for (int i = 0; i < iters; i++) {
-            	managedSurfDest.blendBlitFrom(managedSurf, Common::Rect(0, 0, managedSurfDest.w, managedSurfDest.h), Common::Rect(0, 0, managedSurf.w, managedSurf.h), flipping, TS_ARGB(255, 255, 255, 255), (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
+            	managedSurfDest.blendBlitFrom(managedSurf, Common::Rect(0, 0, managedSurfDest.w, managedSurfDest.h), Common::Rect(0, 0, managedSurf.w, managedSurf.h), flipping, color, (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
 			}
 			Graphics::BlendBlit::blitFunc = oldFunc;
 			genericTimeScaled += g_system->getMillis() - genericStart;
 			numItersScaled++;
+		} // color
         } // flipping
         } // alpha
         } // blend
@@ -986,7 +988,6 @@ public:
 		debug("Old SCALING TransparentSurface::blit avg time per %d iters (in milliseconds): %f\n", iters, oldTimeScaled / numItersScaled);
 		debug("New SCALING ManagedSurface::blendBlitFrom (non SIMD) avg time per %d iters (in milliseconds): %f\n", iters, genericTimeScaled / numItersScaled);
 		debug("New SCALING ManagedSurface::blendBlitFrom avg time per %d iters (in milliseconds): %f\n", iters, newTimeScaled / numItersScaled);
-		debug("Note this speed test puts the old code in the best senario against the new code.");
 
 	    baseSurface.free();
 #endif


Commit: c2c7ca027501c58930b3bd053b96e1ccaec9ffdb
    https://github.com/scummvm/scummvm/commit/c2c7ca027501c58930b3bd053b96e1ccaec9ffdb
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
GRAPHICS: BlendBlit NEON blending modes coded

Changed paths:
    graphics/blit-alpha.cpp
    graphics/blit-neon.cpp
    graphics/blit.h
    graphics/managed_surface.cpp
    test/image/blending.h


diff --git a/graphics/blit-alpha.cpp b/graphics/blit-alpha.cpp
index 67f5639ae57..680f409baeb 100644
--- a/graphics/blit-alpha.cpp
+++ b/graphics/blit-alpha.cpp
@@ -174,12 +174,12 @@ bool setAlpha(byte *dst, const byte *src,
 BlendBlit::Args::Args(byte *dst, const byte *src,
 	const uint dstPitch, const uint srcPitch,
 	const int posX, const int posY,
-	const uint width, const uint height,
-	const int scaleX, const int scaleY,
-	const uint32 colorMod, const uint flipping) :
+	const uint _width, const uint _height,
+	const int _scaleX, const int _scaleY,
+	const uint32 colorMod, const uint _flipping) :
 		xp(0), yp(0), dstPitch(dstPitch),
-		width(width), height(height), color(colorMod),
-		scaleX(scaleX), scaleY(scaleY), flipping(flipping) {
+		width(_width), height(_height), color(colorMod),
+		scaleX(_scaleX), scaleY(_scaleY), flipping(_flipping) {
 	bool doScale = scaleX != SCALE_THRESHOLD || scaleY != SCALE_THRESHOLD;
 	
 	rgbmod   = ((colorMod & kRGBModMask) != kRGBModMask);
@@ -213,10 +213,13 @@ void BlendBlit::doBlitMultiplyBlendLogicGeneric(Args &args) {
 	int scaleXCtr, scaleYCtr = 0;
 	const byte *inBase;
 
+	const byte rawcr = (args.color >> kRModShift) & 0xFF;
+	const byte rawcg = (args.color >> kGModShift) & 0xFF;
+	const byte rawcb = (args.color >> kBModShift) & 0xFF;
 	const byte ca = alphamod ? ((args.color >> kAModShift) & 0xFF) : 255;
-	const byte cr = rgbmod   ? ((args.color >> kRModShift) & 0xFF) : 255;
-	const byte cg = rgbmod   ? ((args.color >> kGModShift) & 0xFF) : 255;
-	const byte cb = rgbmod   ? ((args.color >> kBModShift) & 0xFF) : 255;
+	const uint32 cr = rgbmod   ? (rawcr == 255 ? 256 : rawcr) : 256;
+	const uint32 cg = rgbmod   ? (rawcg == 255 ? 256 : rawcg) : 256;
+	const uint32 cb = rgbmod   ? (rawcb == 255 ? 256 : rawcb) : 256;
 
 	for (uint32 i = 0; i < args.height; i++) {
 		if (doscale) {
@@ -234,23 +237,9 @@ void BlendBlit::doBlitMultiplyBlendLogicGeneric(Args &args) {
 			uint32 ina = in[kAIndex] * ca >> 8;
 
 			if (ina != 0) {
-				if (cb != 255) {
-					out[kBIndex] = MIN<uint>(out[kBIndex] * ((in[kBIndex] * cb * ina) >> 16) >> 8, 255u);
-				} else {
-					out[kBIndex] = MIN<uint>(out[kBIndex] * (in[kBIndex] * ina >> 8) >> 8, 255u);
-				}
-
-				if (cg != 255) {
-					out[kGIndex] = MIN<uint>(out[kGIndex] * ((in[kGIndex] * cg * ina) >> 16) >> 8, 255u);
-				} else {
-					out[kGIndex] = MIN<uint>(out[kGIndex] * (in[kGIndex] * ina >> 8) >> 8, 255u);
-				}
-
-				if (cr != 255) {
-					out[kRIndex] = MIN<uint>(out[kRIndex] * ((in[kRIndex] * cr * ina) >> 16) >> 8, 255u);
-				} else {
-					out[kRIndex] = MIN<uint>(out[kRIndex] * (in[kRIndex] * ina >> 8) >> 8, 255u);
-				}
+				out[kBIndex] = out[kBIndex] * ((in[kBIndex] * cb * ina) >> 16) >> 8;
+				out[kGIndex] = out[kGIndex] * ((in[kGIndex] * cg * ina) >> 16) >> 8;
+				out[kRIndex] = out[kRIndex] * ((in[kRIndex] * cr * ina) >> 16) >> 8;
 			}
 
 			if (doscale)
@@ -318,7 +307,7 @@ void BlendBlit::doBlitAlphaBlendLogicGeneric(Args &args) {
 					const uint32 dstg = out32 & kGModMask;
 					*(uint32 *)out = kAModMask |
 						((dstrb * (255 - ina) + rb * ina) & (kRModMask | kBModMask)) |
-						((dstg * (255 - ina) + g * ina) >> 8);
+						(((dstg * (255 - ina) + g * ina) >> 8) & kGModMask);
 
 					// I think this code will run faster on older hardware
 					// TODO maybe?: Put #ifdef to use on older hardware
@@ -355,9 +344,12 @@ void BlendBlit::doBlitSubtractiveBlendLogicGeneric(Args &args) {
 	int scaleXCtr, scaleYCtr = 0;
 	const byte *inBase;
 
-	const byte cr = rgbmod   ? ((args.color >> kRModShift) & 0xFF) : 255;
-	const byte cg = rgbmod   ? ((args.color >> kGModShift) & 0xFF) : 255;
-	const byte cb = rgbmod   ? ((args.color >> kBModShift) & 0xFF) : 255;
+	const byte rawcr = (args.color >> kRModShift) & 0xFF;
+	const byte rawcg = (args.color >> kGModShift) & 0xFF;
+	const byte rawcb = (args.color >> kBModShift) & 0xFF;
+	const uint32 cr = rgbmod   ? (rawcr == 255 ? 256 : rawcr) : 256;
+	const uint32 cg = rgbmod   ? (rawcg == 255 ? 256 : rawcg) : 256;
+	const uint32 cb = rgbmod   ? (rawcb == 255 ? 256 : rawcb) : 256;
 
 	for (uint32 i = 0; i < args.height; i++) {
 		if (doscale) {
@@ -373,23 +365,9 @@ void BlendBlit::doBlitSubtractiveBlendLogicGeneric(Args &args) {
 			}
 
 			out[kAIndex] = 255;
-			if (cb != 255) {
-				out[kBIndex] = MAX(out[kBIndex] - ((in[kBIndex] * cb  * (out[kBIndex]) * in[kAIndex]) >> 24), 0);
-			} else {
-				out[kBIndex] = MAX(out[kBIndex] - (in[kBIndex] * (out[kBIndex]) * in[kAIndex] >> 16), 0);
-			}
-
-			if (cg != 255) {
-				out[kGIndex] = MAX(out[kGIndex] - ((in[kGIndex] * cg  * (out[kGIndex]) * in[kAIndex]) >> 24), 0);
-			} else {
-				out[kGIndex] = MAX(out[kGIndex] - (in[kGIndex] * (out[kGIndex]) * in[kAIndex] >> 16), 0);
-			}
-
-			if (cr != 255) {
-				out[kRIndex] = MAX(out[kRIndex] - ((in[kRIndex] * cr * (out[kRIndex]) * in[kAIndex]) >> 24), 0);
-			} else {
-				out[kRIndex] = MAX(out[kRIndex] - (in[kRIndex] * (out[kRIndex]) * in[kAIndex] >> 16), 0);
-			}
+			out[kBIndex] = MAX<int32>(out[kBIndex] - ((in[kBIndex] * cb  * (out[kBIndex]) * in[kAIndex]) >> 24), 0);
+			out[kGIndex] = MAX<int32>(out[kGIndex] - ((in[kGIndex] * cg  * (out[kGIndex]) * in[kAIndex]) >> 24), 0);
+			out[kRIndex] = MAX<int32>(out[kRIndex] - ((in[kRIndex] * cr * (out[kRIndex]) * in[kAIndex]) >> 24), 0);
 
 			if (doscale)
 				scaleXCtr += args.scaleX;
@@ -416,10 +394,13 @@ void BlendBlit::doBlitAdditiveBlendLogicGeneric(Args &args) {
 	int scaleXCtr, scaleYCtr = 0;
 	const byte *inBase;
 
+	const byte rawcr = (args.color >> kRModShift) & 0xFF;
+	const byte rawcg = (args.color >> kGModShift) & 0xFF;
+	const byte rawcb = (args.color >> kBModShift) & 0xFF;
 	const byte ca = alphamod ? ((args.color >> kAModShift) & 0xFF) : 255;
-	const byte cr = rgbmod   ? ((args.color >> kRModShift) & 0xFF) : 255;
-	const byte cg = rgbmod   ? ((args.color >> kGModShift) & 0xFF) : 255;
-	const byte cb = rgbmod   ? ((args.color >> kBModShift) & 0xFF) : 255;
+	const uint32 cr = rgbmod   ? (rawcr == 255 ? 256 : rawcr) : 256;
+	const uint32 cg = rgbmod   ? (rawcg == 255 ? 256 : rawcg) : 256;
+	const uint32 cb = rgbmod   ? (rawcb == 255 ? 256 : rawcb) : 256;
 
 	for (uint32 i = 0; i < args.height; i++) {
 		if (doscale) {
@@ -437,23 +418,9 @@ void BlendBlit::doBlitAdditiveBlendLogicGeneric(Args &args) {
 			uint32 ina = in[kAIndex] * ca >> 8;
 
 			if (ina != 0) {
-				if (cb != 255) {
-					out[kBIndex] = MIN<uint>(out[kBIndex] + ((in[kBIndex] * cb * ina) >> 16), 255u);
-				} else {
-					out[kBIndex] = MIN<uint>(out[kBIndex] + (in[kBIndex] * ina >> 8), 255u);
-				}
-
-				if (cg != 255) {
-					out[kGIndex] = MIN<uint>(out[kGIndex] + ((in[kGIndex] * cg * ina) >> 16), 255u);
-				} else {
-					out[kGIndex] = MIN<uint>(out[kGIndex] + (in[kGIndex] * ina >> 8), 255u);
-				}
-
-				if (cr != 255) {
-					out[kRIndex] = MIN<uint>(out[kRIndex] + ((in[kRIndex] * cr * ina) >> 16), 255u);
-				} else {
-					out[kRIndex] = MIN<uint>(out[kRIndex] + (in[kRIndex] * ina >> 8), 255u);
-				}
+				out[kBIndex] = out[kBIndex] + ((in[kBIndex] * cb * ina) >> 16);
+				out[kGIndex] = out[kGIndex] + ((in[kGIndex] * cg * ina) >> 16);
+				out[kRIndex] = out[kRIndex] + ((in[kRIndex] * cr * ina) >> 16);
 			}
 
 			if (doscale)
@@ -498,17 +465,10 @@ void BlendBlit::doBlitOpaqueBlendLogicGeneric(Args &args) {
 				scaleXCtr += args.scaleX;
 				out += 4;
 			}
-		} else if (args.flipping & FLIP_H) {
-			for (uint32 j = 0; j < args.width; j++) {
-				memcpy(out, in, 4);
-				out[kAIndex] = 0xFF;
-				out += 4;
-				in += args.inStep;
-			}
 		} else {
-			memcpy(out, in, args.width * 4);
 			for (uint32 j = 0; j < args.width; j++) {
-				out[kAIndex] = 0xFF;
+				*(uint32 *)out = *(const uint32 *)in | kAModMask;
+				in += args.inStep;
 				out += 4;
 			}
 		}
diff --git a/graphics/blit-neon.cpp b/graphics/blit-neon.cpp
index 1d2bb497e44..6f87f88fb3f 100644
--- a/graphics/blit-neon.cpp
+++ b/graphics/blit-neon.cpp
@@ -27,143 +27,311 @@
 
 namespace Graphics {
 
-template<bool doscale>
-void BlendBlit::doBlitBinaryBlendLogicNEON(Args &args) {
-    (void)args;
-}
-template<bool doscale>
-void BlendBlit::doBlitOpaqueBlendLogicNEON(Args &args) {
-    (void)args;
-}
 template<bool doscale, bool rgbmod, bool alphamod>
-void BlendBlit::doBlitMultiplyBlendLogicNEON(Args &args) {
-    (void)args;
-}
-template<bool doscale, bool rgbmod>
-void BlendBlit::doBlitSubtractiveBlendLogicNEON(Args &args) {
-    (void)args;
-}
+struct AlphaBlend {
+	static inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+	    uint32x4_t ina;
+	    if (alphamod)
+	        ina = vshrq_n_u32(vmulq_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask)), vdupq_n_u32(ca)), 8);
+	    else
+	        ina = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
+	    uint32x4_t alphaMask = vceqq_u32(ina, vmovq_n_u32(0));
+	
+	    if (rgbmod) {
+	        uint32x4_t dstR = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask)), 16);
+	        uint32x4_t srcR = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask)), 16);
+	        uint32x4_t dstG = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask)), 8);
+	        uint32x4_t srcG = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask)), 8);
+	        uint32x4_t dstB = vandq_u32(dst, vmovq_n_u32(BlendBlit::kBModMask));
+	        uint32x4_t srcB = vandq_u32(src, vmovq_n_u32(BlendBlit::kBModMask));
+	
+	        dstR = vshrq_n_u32(vmulq_u32(dstR, vsubq_u32(vmovq_n_u32(255), ina)), 8);
+	        dstG = vshrq_n_u32(vmulq_u32(dstG, vsubq_u32(vmovq_n_u32(255), ina)), 8);
+	        dstB = vshrq_n_u32(vmulq_u32(dstB, vsubq_u32(vmovq_n_u32(255), ina)), 8);
+	        srcR = vaddq_u32(dstR, vshrq_n_u32(vmulq_u32(vmulq_u32(srcR, ina), vmovq_n_u32(cr)), 16));
+	        srcG = vaddq_u32(dstG, vshrq_n_u32(vmulq_u32(vmulq_u32(srcG, ina), vmovq_n_u32(cg)), 16));
+	        srcB = vaddq_u32(dstB, vshrq_n_u32(vmulq_u32(vmulq_u32(srcB, ina), vmovq_n_u32(cb)), 16));
+	        src = vorrq_u32(vandq_u32(srcB, vmovq_n_u32(BlendBlit::kBModMask)), vmovq_n_u32(BlendBlit::kAModMask));
+	        src = vorrq_u32(vandq_u32(vshlq_n_u32(srcG, 8), vmovq_n_u32(BlendBlit::kGModMask)), src);
+	        src = vorrq_u32(vandq_u32(vshlq_n_u32(srcR, 16), vmovq_n_u32(BlendBlit::kRModMask)), src);
+	    } else {
+	        uint32x4_t dstRB = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), 8);
+	        uint32x4_t srcRB = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), 8);
+	        uint32x4_t dstG = vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask));
+	        uint32x4_t srcG = vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask));
+	
+	        dstRB = vmulq_u32(dstRB, vsubq_u32(vmovq_n_u32(255), ina));
+	        dstG = vshrq_n_u32(vmulq_u32(dstG, vsubq_u32(vmovq_n_u32(255), ina)), 8);
+	        srcRB = vaddq_u32(dstRB, vmulq_u32(srcRB, ina));
+	        srcG = vaddq_u32(dstG, vshrq_n_u32(vmulq_u32(srcG, ina), 8));
+	        src = vorrq_u32(vandq_u32(srcG, vmovq_n_u32(BlendBlit::kGModMask)), vmovq_n_u32(BlendBlit::kAModMask));
+	        src = vorrq_u32(vandq_u32(srcRB, vmovq_n_u32(BlendBlit::kBModMask | BlendBlit::kRModMask)), src);
+	    }
+	
+	    dst = vandq_u32(alphaMask, dst);
+	    src = vandq_u32(vmvnq_u32(alphaMask), src);
+	    return vorrq_u32(dst, src);
+	}
+
+	static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+		uint32 ina = in[BlendBlit::kAIndex] * ca >> 8;
+
+		if (ina != 0) {
+			uint outb = (out[BlendBlit::kBIndex] * (255 - ina) >> 8);
+			uint outg = (out[BlendBlit::kGIndex] * (255 - ina) >> 8);
+			uint outr = (out[BlendBlit::kRIndex] * (255 - ina) >> 8);
+
+			out[BlendBlit::kAIndex] = 255;
+			out[BlendBlit::kBIndex] = outb + (in[BlendBlit::kBIndex] * ina * cb >> 16);
+			out[BlendBlit::kGIndex] = outg + (in[BlendBlit::kGIndex] * ina * cg >> 16);
+			out[BlendBlit::kRIndex] = outr + (in[BlendBlit::kRIndex] * ina * cr >> 16);
+		}
+	}
+};
+
 template<bool doscale, bool rgbmod, bool alphamod>
-void BlendBlit::doBlitAdditiveBlendLogicNEON(Args &args) {
-    (void)args;
-}
+struct MultiplyBlend {
+	static inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+    	uint32x4_t ina;
+    	if (alphamod)
+    	    ina = vshrq_n_u32(vmulq_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask)), vdupq_n_u32(ca)), 8);
+    	else
+    	    ina = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
+    	uint32x4_t alphaMask = vceqq_u32(ina, vmovq_n_u32(0));
+
+    	if (rgbmod) {
+    	    uint32x4_t srcb = vandq_u32(src, vmovq_n_u32(BlendBlit::kBModMask));
+    	    uint32x4_t srcg = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+    	    uint32x4_t srcr = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+    	    uint32x4_t dstb = vandq_u32(dst, vmovq_n_u32(BlendBlit::kBModMask));
+    	    uint32x4_t dstg = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+    	    uint32x4_t dstr = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+
+    	    srcb = vandq_u32(vshrq_n_u32(vmulq_u32(dstb, vshrq_n_u32(vmulq_u32(vmulq_u32(srcb, vmovq_n_u32(cb)), ina), 16)), 8), vmovq_n_u32(BlendBlit::kBModMask));
+    	    srcg = vandq_u32(vshlq_n_u32(vmulq_u32(dstg, vshrq_n_u32(vmulq_u32(vmulq_u32(srcg, vmovq_n_u32(cg)), ina), 16)), BlendBlit::kGModShift - 8), vmovq_n_u32(BlendBlit::kGModMask));
+    	    srcr = vandq_u32(vshlq_n_u32(vmulq_u32(dstr, vshrq_n_u32(vmulq_u32(vmulq_u32(srcr, vmovq_n_u32(cr)), ina), 16)), BlendBlit::kRModShift - 8), vmovq_n_u32(BlendBlit::kRModMask));
+
+    	    src = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
+    	    src = vorrq_u32(src, vorrq_u32(srcb, vorrq_u32(srcg, srcb)));
+    	} else {
+    	    uint32x4_t srcg = vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask));
+    	    uint32x4_t srcrb = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+    	    uint32x4_t dstg = vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask));
+    	    uint32x4_t dstrb = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+    	    srcg = vandq_u32(vshrq_n_u32(vmulq_u32(dstg, vshrq_n_u32(vmulq_u32(srcg, ina), 8)), 8), vmovq_n_u32(BlendBlit::kGModMask));
+    	    srcrb = vandq_u32(vmulq_u32(dstrb, vshrq_n_u32(vmulq_u32(srcrb, ina), 8)), vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask));
+    	    src = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
+    	    src = vorrq_u32(src, vorrq_u32(srcrb, srcg));
+    	}
+
+    	dst = vandq_u32(alphaMask, dst);
+    	src = vandq_u32(vmvnq_u32(alphaMask), src);
+    	return vorrq_u32(dst, src);
+	}
+
+	static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+		uint32 ina = in[BlendBlit::kAIndex] * ca >> 8;
+
+		if (ina != 0) {
+			out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] * ((in[BlendBlit::kBIndex] * cb * ina) >> 16) >> 8;
+			out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] * ((in[BlendBlit::kGIndex] * cg * ina) >> 16) >> 8;
+			out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] * ((in[BlendBlit::kRIndex] * cr * ina) >> 16) >> 8;
+		}
+	}
+};
 
-template<bool rgbmod, bool alphamod>
-static inline uint32x4_t drawPixelAlphaBlend(uint32x4_t src, uint32x4_t dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
-    if (flip) {
-        src = vrev64q_u32(src);
-	    src = vcombine_u32(vget_high_u32(src), vget_low_u32(src));
-    }
-    uint32x4_t ina;
-    if (alphamod)
-        ina = vshrq_n_u32(vmulq_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask)), vdupq_n_u32(ca)), 8);
-    else
-        ina = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
-    uint32x4_t alphaMask = vceqq_u32(ina, vmovq_n_u32(0));
-
-    if (rgbmod) {
-        uint32x4_t dstR = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask)), 16);
-        uint32x4_t srcR = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask)), 16);
-        uint32x4_t dstG = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask)), 8);
-        uint32x4_t srcG = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask)), 8);
-        uint32x4_t dstB = vandq_u32(dst, vmovq_n_u32(BlendBlit::kBModMask));
-        uint32x4_t srcB = vandq_u32(src, vmovq_n_u32(BlendBlit::kBModMask));
-
-        dstR = vshrq_n_u32(vmulq_u32(dstR, vsubq_u32(vmovq_n_u32(255), ina)), 8);
-        dstG = vshrq_n_u32(vmulq_u32(dstG, vsubq_u32(vmovq_n_u32(255), ina)), 8);
-        dstB = vshrq_n_u32(vmulq_u32(dstB, vsubq_u32(vmovq_n_u32(255), ina)), 8);
-        srcR = vaddq_u32(dstR, vshrq_n_u32(vmulq_u32(vmulq_u32(srcR, ina), vmovq_n_u32(cr)), 16));
-        srcG = vaddq_u32(dstG, vshrq_n_u32(vmulq_u32(vmulq_u32(srcG, ina), vmovq_n_u32(cg)), 16));
-        srcB = vaddq_u32(dstB, vshrq_n_u32(vmulq_u32(vmulq_u32(srcB, ina), vmovq_n_u32(cb)), 16));
-        src = vorrq_u32(vandq_u32(srcB, vmovq_n_u32(BlendBlit::kBModMask)), vmovq_n_u32(BlendBlit::kAModMask));
-        src = vorrq_u32(vandq_u32(vshlq_n_u32(srcG, 8), vmovq_n_u32(BlendBlit::kGModMask)), src);
-        src = vorrq_u32(vandq_u32(vshlq_n_u32(srcR, 16), vmovq_n_u32(BlendBlit::kRModMask)), src);
-    } else {
-        uint32x4_t dstRB = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), 8);
-        uint32x4_t srcRB = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), 8);
-        uint32x4_t dstG = vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask));
-        uint32x4_t srcG = vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask));
-
-        dstRB = vmulq_u32(dstRB, vsubq_u32(vmovq_n_u32(255), ina));
-        dstG = vshrq_n_u32(vmulq_u32(dstG, vsubq_u32(vmovq_n_u32(255), ina)), 8);
-        srcRB = vaddq_u32(dstRB, vmulq_u32(srcRB, ina));
-        srcG = vaddq_u32(dstG, vshrq_n_u32(vmulq_u32(srcG, ina), 8));
-        src = vorrq_u32(vandq_u32(srcG, vmovq_n_u32(BlendBlit::kGModMask)), vmovq_n_u32(BlendBlit::kAModMask));
-        src = vorrq_u32(vandq_u32(srcRB, vmovq_n_u32(BlendBlit::kBModMask | BlendBlit::kRModMask)), src);
-    }
-
-    dst = vandq_u32(alphaMask, dst);
-    src = vandq_u32(vmvnq_u32(alphaMask), src);
-    return vorrq_u32(dst, src);
-}
 template<bool doscale, bool rgbmod, bool alphamod>
-void BlendBlit::doBlitAlphaBlendLogicNEON(Args &args) {
+struct OpaqueBlend {
+	static inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+		return vorrq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
+	}
+
+	static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+		*(uint32 *)out = *(const uint32 *)in | BlendBlit::kAModMask;
+	}
+};
+
+template<bool doscale, bool rgbmod, bool alphamod>
+struct BinaryBlend {
+	static inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+            uint32x4_t alphaMask = vceqq_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask)), vmovq_n_u32(0));
+            dst = vandq_u32(dst, alphaMask);
+            src = vandq_u32(vorrq_u32(src, vmovq_n_u32(BlendBlit::kAModMask)), vmvnq_u32(alphaMask));
+            return vorrq_u32(dst, src);
+	}
+
+	static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+		uint32 pix = *(const uint32 *)in;
+		int a = in[BlendBlit::kAIndex];
+
+		if (a != 0) {   // Full opacity (Any value not exactly 0 is Opaque here)
+			*(uint32 *)out = pix;
+			out[BlendBlit::kAIndex] = 0xFF;
+		}
+	}
+};
+
+template<bool doscale, bool rgbmod, bool alphamod>
+struct AdditiveBlend {
+	static inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+    	uint32x4_t ina;
+    	if (alphamod)
+    	    ina = vshrq_n_u32(vmulq_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask)), vdupq_n_u32(ca)), 8);
+    	else
+    	    ina = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
+    	uint32x4_t alphaMask = vceqq_u32(ina, vmovq_n_u32(0));
+
+    	if (rgbmod) {
+    	    uint32x4_t srcb = vandq_u32(src, vmovq_n_u32(BlendBlit::kBModMask));
+    	    uint32x4_t srcg = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+    	    uint32x4_t srcr = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+    	    uint32x4_t dstb = vandq_u32(dst, vmovq_n_u32(BlendBlit::kBModMask));
+    	    uint32x4_t dstg = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+    	    uint32x4_t dstr = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+
+			srcb = vandq_u32(vaddq_u32(dstb, vshrq_n_u32(vmulq_u32(srcb, vmulq_u32(vmovq_n_u32(cb), ina)), 16)), vmovq_n_u32(BlendBlit::kBModMask));
+			srcg = vandq_u32(vaddq_u32(dstg, vmulq_u32(srcg, vmulq_u32(vmovq_n_u32(cg), ina))), vmovq_n_u32(BlendBlit::kGModMask));
+			srcr = vandq_u32(vaddq_u32(dstr, vshrq_n_u32(vmulq_u32(srcr, vmulq_u32(vmovq_n_u32(cr), ina)), BlendBlit::kRModShift - 16)), vmovq_n_u32(BlendBlit::kRModMask));
+
+    	    src = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
+    	    src = vorrq_u32(src, vorrq_u32(srcb, vorrq_u32(srcg, srcb)));
+    	} else if (alphamod) {
+    	    uint32x4_t srcg = vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask));
+    	    uint32x4_t srcrb = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+    	    uint32x4_t dstg = vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask));
+    	    uint32x4_t dstrb = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+
+			srcg = vandq_u32(vaddq_u32(dstg, vshrq_n_u32(vmulq_u32(srcg, ina), 8)), vmovq_n_u32(BlendBlit::kGModMask));
+			srcrb = vandq_u32(vaddq_u32(dstrb, vmulq_u32(srcrb, ina)), vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask));
+
+    	    src = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
+    	    src = vorrq_u32(src, vorrq_u32(srcrb, srcg));
+    	} else {
+    	    uint32x4_t srcg = vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask));
+    	    uint32x4_t srcrb = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+    	    uint32x4_t dstg = vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask));
+    	    uint32x4_t dstrb = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+
+			srcg = vandq_u32(vaddq_u32(dstg, srcg), vmovq_n_u32(BlendBlit::kGModMask));
+			srcrb = vandq_u32(vshlq_n_u32(vaddq_u32(dstrb, srcrb), 8), vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask));
+
+    	    src = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
+    	    src = vorrq_u32(src, vorrq_u32(srcrb, srcg));
+		}
+
+    	dst = vandq_u32(alphaMask, dst);
+    	src = vandq_u32(vmvnq_u32(alphaMask), src);
+    	return vorrq_u32(dst, src);
+	}
+
+	static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+		uint32 ina = in[BlendBlit::kAIndex] * ca >> 8;
+
+		if (ina != 0) {
+			out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] + ((in[BlendBlit::kBIndex] * cb * ina) >> 16);
+			out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] + ((in[BlendBlit::kGIndex] * cg * ina) >> 16);
+			out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] + ((in[BlendBlit::kRIndex] * cr * ina) >> 16);
+		}
+	}
+};
+
+template<bool doscale, bool rgbmod, bool alphamod>
+struct SubtractiveBlend {
+	static inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+    	uint32x4_t ina = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
+    	uint32x4_t srcb = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+    	uint32x4_t srcg = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+    	uint32x4_t srcr = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+    	uint32x4_t dstb = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+    	uint32x4_t dstg = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+    	uint32x4_t dstr = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+
+		srcb = vandq_u32(vshlq_n_u32(vreinterpretq_u32_s32(vmaxq_s32(vsubq_s32(vreinterpretq_s32_u32(dstb), vreinterpretq_s32_u32(vshrq_n_u32(vmulq_u32(vmulq_u32(srcb, vmovq_n_u32(cb)), vmulq_u32(dstb, ina)), 24))), vmovq_n_s32(0))), BlendBlit::kBModShift), vmovq_n_u32(BlendBlit::kBModMask));
+		srcg = vandq_u32(vshlq_n_u32(vreinterpretq_u32_s32(vmaxq_s32(vsubq_s32(vreinterpretq_s32_u32(dstg), vreinterpretq_s32_u32(vshrq_n_u32(vmulq_u32(vmulq_u32(srcg, vmovq_n_u32(cg)), vmulq_u32(dstg, ina)), 24))), vmovq_n_s32(0))), BlendBlit::kGModShift), vmovq_n_u32(BlendBlit::kGModMask));
+		srcr = vandq_u32(vshlq_n_u32(vreinterpretq_u32_s32(vmaxq_s32(vsubq_s32(vreinterpretq_s32_u32(dstr), vreinterpretq_s32_u32(vshrq_n_u32(vmulq_u32(vmulq_u32(srcr, vmovq_n_u32(cr)), vmulq_u32(dstr, ina)), 24))), vmovq_n_s32(0))), BlendBlit::kRModShift), vmovq_n_u32(BlendBlit::kRModMask));
+
+    	return vorrq_u32(vmovq_n_u32(BlendBlit::kAModMask), vorrq_u32(srcb, vorrq_u32(srcg, srcr)));
+	}
+
+	static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+			out[BlendBlit::kAIndex] = 255;
+			out[BlendBlit::kBIndex] = MAX<int32>(out[BlendBlit::kBIndex] - ((in[BlendBlit::kBIndex] * cb  * (out[BlendBlit::kBIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
+			out[BlendBlit::kGIndex] = MAX<int32>(out[BlendBlit::kGIndex] - ((in[BlendBlit::kGIndex] * cg  * (out[BlendBlit::kGIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
+			out[BlendBlit::kRIndex] = MAX<int32>(out[BlendBlit::kRIndex] - ((in[BlendBlit::kRIndex] * cr *  (out[BlendBlit::kRIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
+	}
+};
+
+class BlendBlitImpl {
+
+public:
+template<template <bool DOSCALE, bool RGBMOD, bool ALPHAMOD> class PixelFunc, bool doscale, bool rgbmod, bool alphamod, bool coloradd1, bool loaddst>
+static inline void blitInnerLoop(BlendBlit::Args &args) {
 	const byte *in;
 	byte *out;
 
+	const byte rawcr = (args.color >> BlendBlit::kRModShift) & 0xFF;
+	const byte rawcg = (args.color >> BlendBlit::kGModShift) & 0xFF;
+	const byte rawcb = (args.color >> BlendBlit::kBModShift) & 0xFF;
+	const byte ca = alphamod ? ((args.color >> BlendBlit::kAModShift) & 0xFF) : 255;
+	const uint32 cr = coloradd1 ? (rgbmod   ? (rawcr == 255 ? 256 : rawcr) : 256) : (rgbmod   ? rawcr : 255);
+	const uint32 cg = coloradd1 ? (rgbmod   ? (rawcg == 255 ? 256 : rawcg) : 256) : (rgbmod   ? rawcg : 255);
+	const uint32 cb = coloradd1 ? (rgbmod   ? (rawcb == 255 ? 256 : rawcb) : 256) : (rgbmod   ? rawcb : 255);
+
 	int scaleXCtr, scaleYCtr = 0;
 	const byte *inBase;
 
-	const byte ca = alphamod ? ((args.color >> kAModShift) & 0xFF) : 255;
-	const byte cr = rgbmod   ? ((args.color >> kRModShift) & 0xFF) : 255;
-	const byte cg = rgbmod   ? ((args.color >> kGModShift) & 0xFF) : 255;
-	const byte cb = rgbmod   ? ((args.color >> kBModShift) & 0xFF) : 255;
+    if (!doscale && (args.flipping & FLIP_H)) args.ino -= 4 * 3;
 
 	for (uint32 i = 0; i < args.height; i++) {
 		if (doscale) {
-			inBase = args.ino + scaleYCtr / SCALE_THRESHOLD * args.inoStep;
+			inBase = args.ino + scaleYCtr / BlendBlit::SCALE_THRESHOLD * args.inoStep;
 			scaleXCtr = 0;
 		} else {
 			in = args.ino;
 		}
 		out = args.outo;
-
-        uint32 j;
-		for (j = 0; j + 4 < args.width; j += 4) {
-            uint32x4_t dstPixels = vld1q_u32((const uint32 *)out);
+        uint32 j = 0;
+		for (; j + 4 <= args.width; j += 4) {
+            uint32x4_t dstPixels;
+            if (loaddst) dstPixels = vld1q_u32((const uint32 *)out);
             uint32x4_t srcPixels;
             if (!doscale) {
                 srcPixels = vld1q_u32((const uint32 *)in);
             } else {
-				srcPixels = vsetq_lane_u32(*(const uint32 *)(inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep), vmovq_n_u32(0), 0);
+				srcPixels = vsetq_lane_u32(*(const uint32 *)(inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep), vmovq_n_u32(0), 0);
                 scaleXCtr += args.scaleX;
-				srcPixels = vsetq_lane_u32(*(const uint32 *)(inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep), srcPixels, 0);
+				srcPixels = vsetq_lane_u32(*(const uint32 *)(inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep), srcPixels, 1);
                 scaleXCtr += args.scaleX;
-				srcPixels = vsetq_lane_u32(*(const uint32 *)(inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep), srcPixels, 0);
+				srcPixels = vsetq_lane_u32(*(const uint32 *)(inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep), srcPixels, 2);
                 scaleXCtr += args.scaleX;
-				srcPixels = vsetq_lane_u32(*(const uint32 *)(inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep), srcPixels, 0);
+				srcPixels = vsetq_lane_u32(*(const uint32 *)(inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep), srcPixels, 3);
                 scaleXCtr += args.scaleX;
             }
-            uint32x4_t res = drawPixelAlphaBlend<rgbmod, alphamod>(srcPixels, dstPixels, args.flipping & FLIP_H, ca, cr, cg, cb);
-            vst1q_u32((uint32 *)out, res);
-			if (!doscale)
-				in += args.inStep * 4;
+            if (!doscale && (args.flipping & FLIP_H)) {
+                srcPixels = vrev64q_u32(srcPixels);
+	            srcPixels = vcombine_u32(vget_high_u32(srcPixels), vget_low_u32(srcPixels));
+            }
+			{
+				const uint32x4_t res = PixelFunc<doscale, rgbmod, alphamod>::simd(srcPixels, dstPixels, args.flipping & FLIP_H, ca, cr, cg, cb);
+            	vst1q_u32((uint32 *)out, res);
+			}
+			if (!doscale) in += args.inStep * 4;
 			out += 4 * 4;
 		}
+		if (!doscale && (args.flipping & FLIP_H)) in += 4 * 3;
 		for (; j < args.width; j++) {
 			if (doscale) {
-				in = inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep;
-			}
-			uint32 ina = in[kAIndex] * ca >> 8;
-
-			if (ina != 0) {
-				uint outb = (out[kBIndex] * (255 - ina) >> 8);
-				uint outg = (out[kGIndex] * (255 - ina) >> 8);
-				uint outr = (out[kRIndex] * (255 - ina) >> 8);
-
-				out[kAIndex] = 255;
-				out[kBIndex] = outb + (in[kBIndex] * ina * cb >> 16);
-				out[kGIndex] = outg + (in[kGIndex] * ina * cg >> 16);
-				out[kRIndex] = outr + (in[kRIndex] * ina * cr >> 16);
+				in = inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep;
 			}
 
+			PixelFunc<doscale, rgbmod, alphamod>::normal(in, out, args.flipping & FLIP_H, ca, cr, cg, cb);
+            
 			if (doscale)
 				scaleXCtr += args.scaleX;
 			else
 				in += args.inStep;
 			out += 4;
 		}
-        
 		if (doscale)
 			scaleYCtr += args.scaleY;
 		else
@@ -172,6 +340,33 @@ void BlendBlit::doBlitAlphaBlendLogicNEON(Args &args) {
 	}
 }
 
+};
+
+template<bool doscale, bool rgbmod, bool alphamod>
+void BlendBlit::doBlitAlphaBlendLogicNEON(Args &args) { // NEON entry point for alpha blending: runs the shared inner loop with AlphaBlend as the per-pixel policy
+	BlendBlitImpl::blitInnerLoop<AlphaBlend, doscale, rgbmod, alphamod, false, true>(args); // trailing <false, true>: presumably coloradd1 / loaddst as in the SSE2 loop -- confirm against the NEON blitInnerLoop declaration
+}
+template<bool doscale, bool rgbmod>
+void BlendBlit::doBlitSubtractiveBlendLogicNEON(Args &args) { // NEON entry point for subtractive blending: runs the shared inner loop with SubtractiveBlend as the per-pixel policy
+	BlendBlitImpl::blitInnerLoop<SubtractiveBlend, doscale, rgbmod, false, false, true>(args); // alphamod is fixed to false for this mode
+}
+template<bool doscale, bool rgbmod, bool alphamod>
+void BlendBlit::doBlitAdditiveBlendLogicNEON(Args &args) { // NEON entry point for additive blending: runs the shared inner loop with AdditiveBlend as the per-pixel policy
+	BlendBlitImpl::blitInnerLoop<AdditiveBlend, doscale, rgbmod, alphamod, false, true>(args); // all three caller-selected switches are forwarded unchanged
+}
+template<bool doscale>
+void BlendBlit::doBlitOpaqueBlendLogicNEON(Args &args) { // NEON entry point for opaque copies: runs the shared inner loop with OpaqueBlend as the per-pixel policy
+	BlendBlitImpl::blitInnerLoop<OpaqueBlend, doscale, false, false, false, true>(args); // rgbmod and alphamod are fixed to false for this mode
+}
+template<bool doscale>
+void BlendBlit::doBlitBinaryBlendLogicNEON(Args &args) { // NEON entry point for binary (on/off alpha) blits: runs the shared inner loop with BinaryBlend as the per-pixel policy
+	BlendBlitImpl::blitInnerLoop<BinaryBlend, doscale, false, false, false, true>(args); // rgbmod and alphamod are fixed to false for this mode
+}
+template<bool doscale, bool rgbmod, bool alphamod>
+void BlendBlit::doBlitMultiplyBlendLogicNEON(Args &args) { // NEON entry point for multiply blending: runs the shared inner loop with MultiplyBlend as the per-pixel policy
+	BlendBlitImpl::blitInnerLoop<MultiplyBlend, doscale, rgbmod, alphamod, false, true>(args); // all three caller-selected switches are forwarded unchanged
+}
+
 } // end of namespace Graphics
 
 #endif // __ARM_NEON__
diff --git a/graphics/blit.h b/graphics/blit.h
index e2c5356083b..2ac68c8fca6 100644
--- a/graphics/blit.h
+++ b/graphics/blit.h
@@ -243,6 +243,7 @@ LOGIC_FUNCS_EXT(Generic)
 	typedef void(*BlitFunc)(Args &, const TSpriteBlendMode &, const AlphaType &);
 	static BlitFunc blitFunc;
 	friend class ::BlendBlitUnfilteredTestSuite;
+	friend class BlendBlitImpl;
 
 public:
 	static const int SCALE_THRESHOLD = 0x100;
diff --git a/graphics/managed_surface.cpp b/graphics/managed_surface.cpp
index f8ff0db7ec3..d534935c3e4 100644
--- a/graphics/managed_surface.cpp
+++ b/graphics/managed_surface.cpp
@@ -789,7 +789,9 @@ Common::Rect ManagedSurface::blendBlitFrom(const ManagedSurface &src, const Comm
 			colorMod, flipping,
 			blend, alphaType);
 	}
-	return Common::Rect(0, 0, dstArea.width(), dstArea.height());
+
+	if (dstArea.isEmpty()) return Common::Rect(0, 0, 0, 0);
+	else return Common::Rect(0, 0, dstArea.width(), dstArea.height());
 }
 
 void ManagedSurface::markAllDirty() {
diff --git a/test/image/blending.h b/test/image/blending.h
index 8b76c8b77f3..999f55c1606 100644
--- a/test/image/blending.h
+++ b/test/image/blending.h
@@ -885,26 +885,17 @@ static int save_bitmap(const char *path, const Graphics::Surface *surf) {
 
 static bool areSurfacesEqual(const Graphics::Surface *a, const Graphics::Surface *b) {
     if (a->w != b->w || a->h != b->h) return false;
-
-    for (int y = 0; y < a->h; y++) {
-        for (int x = 0; x < a->w; x++) {
-            if (a->getPixel(x, y) != b->getPixel(x, y)) return false;
-        }
-    }
-
-    return true;
+	return memcmp(a->getPixels(), b->getPixels(), a->h * a->pitch) == 0;
 }
 
 class BlendBlitUnfilteredTestSuite : public CxxTest::TestSuite {
 public:
 	void test_blend_speed() {
-#ifdef TEST_BLEND_SPEED
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
 		Graphics::BlendBlit::blitFunc = Graphics::BlendBlit::blitNEON;
 #else
 		Graphics::BlendBlit::blitFunc = Graphics::BlendBlit::blitGeneric;
 #endif
-	
 	    Graphics::Surface baseSurface, destSurface;
 	    baseSurface.create(128, 128, OldTransparentSurface::OldTransparentSurface::getSupportedPixelFormat());
 	    destSurface.create(256, 256, OldTransparentSurface::OldTransparentSurface::getSupportedPixelFormat());
@@ -990,11 +981,9 @@ public:
 		debug("New SCALING ManagedSurface::blendBlitFrom avg time per %d iters (in milliseconds): %f\n", iters, newTimeScaled / numItersScaled);
 
 	    baseSurface.free();
-#endif
 	}
 
     void test_blend_blit_unfiltered() {
-#ifdef TEST_BLEND_SPEED
         Common::Rect dsts[] = {
             Common::Rect(4, 4, 4+16, 4+16), // Case 0 (source clipping)
             Common::Rect(24, 20, 24+16, 20+16), // Case 1 (outside of destination)
@@ -1071,8 +1060,6 @@ public:
             managedSurfDest.fillRect(Common::Rect(0, 0, managedSurfDest.w, managedSurfDest.h), managedSurfDest.format.ARGBToColor(ba, br, bg, bb));
             managedSurfDest.blendBlitFrom(managedSurf, srcs[rect], dsts[rect], flipping, TS_ARGB(a, r, g, b), (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
 
-
-
             if (!areSurfacesEqual(&oldSurfDest, &newSurfDest)) {
                 warning("blendMode: %s, alphaType: %s, a: %d, r: %d, g: %d, b: %d, flipping: %s, test rect id: %s",
                     blendModes[blendMode], alphaTypes[alphaType], a, r, g, b, flipNames[flipping], rectNames[rect]);
@@ -1117,6 +1104,5 @@ public:
         } // blend
 
 	    baseSurface.free();
-#endif
     }
 };


Commit: 6bdeeb506f3f3f05b21eb4a498c565735593417f
    https://github.com/scummvm/scummvm/commit/6bdeeb506f3f3f05b21eb4a498c565735593417f
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
OSYSTEM: Added SSE4.1 feature flag

Changed paths:
    common/system.h


diff --git a/common/system.h b/common/system.h
index 17e0bae227a..8dc35176971 100644
--- a/common/system.h
+++ b/common/system.h
@@ -591,6 +591,11 @@ public:
 		*/
 		kFeatureSSE2,
 
+		/**
+		* For x86/x86_64 platforms that have SSE4.1 support
+		*/
+		kFeatureSSE41,
+
 		/**
 		* For x86_64 platforms that have AVX2 support
 		*/


Commit: 2947e87e59d84775de3d8c278ded8185f24cb32c
    https://github.com/scummvm/scummvm/commit/2947e87e59d84775de3d8c278ded8185f24cb32c
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
BACKENDS: BaseBackend now detects SSE4.1

Changed paths:
    backends/base-backend.cpp
    backends/base-backend.h


diff --git a/backends/base-backend.cpp b/backends/base-backend.cpp
index 684243cfb78..5a3837d5510 100644
--- a/backends/base-backend.cpp
+++ b/backends/base-backend.cpp
@@ -65,16 +65,17 @@ void BaseBackend::initBackend() {
 		_audiocdManager = new DefaultAudioCDManager();
 #endif
 	_cpuFeatures = kCpuNoFeatures;
-#if defined(__x86_64__) || defined(__i686__)
-	uint32 ext_edx1 = 0, ext_ebx7 = 0;
+#if defined(__x86_64__) || defined(__i686__) || defined(_M_X86) || defined(_M_X64)
+	uint32 ext_edx1 = 0, ext_ebx7 = 0, ext_ecx1 = 0;
 #  ifdef __GNUC__
 	asm ("mov $1, %%eax\n\t"
 		 "cpuid\n\t"
 		 "mov %%edx, %0\n\t"
+		 "mov %%ecx, %2\n\t"
 		 "mov $7, %%eax\n\t"
 		 "cpuid\n\t"
 		 "mov %%ebx, %1\n\t"
-		 : "=rm" (ext_edx1), "=rm" (ext_ebx7)
+		 : "=rm" (ext_edx1), "=rm" (ext_ebx7), "=rm" (ext_ecx1)
 		 :
 		 : "eax", "ebx", "ecx", "edx");
 #  elif _MSC_VER
@@ -83,6 +84,7 @@ void BaseBackend::initBackend() {
 		mov eax,1
 		cpuid
 		mov ext_edx1,edx
+		mov ext_ecx1,ecx
 		mov ebx,7
 		cpuid
 		mov ext_ebx7,ebx
@@ -90,6 +92,7 @@ void BaseBackend::initBackend() {
 #  endif // __GNUC__ and _MSC_VER
 	_cpuFeatures |= (ext_edx1 & (1 << 26)) ? kCpuFeatureSSE2 : kCpuNoFeatures;
 	_cpuFeatures |= (ext_ebx7 & (1 << 5)) ? kCpuFeatureAVX2 : kCpuNoFeatures;
+	_cpuFeatures |= (ext_ecx1 & (1 << 19)) ? kCpuFeatureSSE41 : kCpuNoFeatures;
 #endif // __x86_64__ and __i686__
 #if defined(__ARM_NEON) || defined(__ARM_NEON__)
 	_cpuFeatures |= kCpuFeatureNEON;
diff --git a/backends/base-backend.h b/backends/base-backend.h
index 70d03ba6fea..0c25dbafc6e 100644
--- a/backends/base-backend.h
+++ b/backends/base-backend.h
@@ -40,6 +40,7 @@ public:
 		// of the platform.
 		kCpuFeatureNEON    = 0x04,
 		kCpuFeatureAlitvec = 0x08, // Platform specific
+		kCpuFeatureSSE41   = 0x10, // Completely detected by BaseBackend
 	};
 
 	void initBackend() override;


Commit: c2e4fc9b6ac31f658b09f234b0301a2d908d64a7
    https://github.com/scummvm/scummvm/commit/c2e4fc9b6ac31f658b09f234b0301a2d908d64a7
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
GRAPHICS: SSE2 for BlendBlit and fix NEON bug

Changed paths:
  A graphics/blit-sse2.cpp
    graphics/blit-alpha.cpp
    graphics/blit-neon.cpp
    graphics/blit.h


diff --git a/graphics/blit-alpha.cpp b/graphics/blit-alpha.cpp
index 680f409baeb..62490ee5ac9 100644
--- a/graphics/blit-alpha.cpp
+++ b/graphics/blit-alpha.cpp
@@ -24,6 +24,7 @@
 #include "common/system.h"
 
 #include "graphics/blit-neon.cpp"
+#include "graphics/blit-sse2.cpp"
 
 namespace Graphics {
 
@@ -172,12 +173,12 @@ bool setAlpha(byte *dst, const byte *src,
 
 
 BlendBlit::Args::Args(byte *dst, const byte *src,
-	const uint dstPitch, const uint srcPitch,
+	const uint _dstPitch, const uint _srcPitch,
 	const int posX, const int posY,
 	const uint _width, const uint _height,
 	const int _scaleX, const int _scaleY,
 	const uint32 colorMod, const uint _flipping) :
-		xp(0), yp(0), dstPitch(dstPitch),
+		xp(0), yp(0), dstPitch(_dstPitch),
 		width(_width), height(_height), color(colorMod),
 		scaleX(_scaleX), scaleY(_scaleY), flipping(_flipping) {
 	bool doScale = scaleX != SCALE_THRESHOLD || scaleY != SCALE_THRESHOLD;
@@ -185,7 +186,7 @@ BlendBlit::Args::Args(byte *dst, const byte *src,
 	rgbmod   = ((colorMod & kRGBModMask) != kRGBModMask);
 	alphamod = ((colorMod & kAModMask)   != kAModMask);
 	inStep = 4;
-	inoStep = srcPitch;
+	inoStep = _srcPitch;
 	if (flipping & FLIP_H) {
 		inStep = -inStep;
 		xp = width - 1;
@@ -198,8 +199,8 @@ BlendBlit::Args::Args(byte *dst, const byte *src,
 		if (doScale) yp = yp * scaleY / SCALE_THRESHOLD;
 	}
 
-	ino = src + yp * srcPitch + xp * 4;
-	outo = dst + posY * dstPitch + posX * 4;
+	ino = src + yp * _srcPitch + xp * 4;
+	outo = dst + posY * _dstPitch + posX * 4;
 }
 
 /**
@@ -540,6 +541,9 @@ void BlendBlit::blit(byte *dst, const byte *src,
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
 	if (g_system->hasFeature(OSystem::kFeatureNEON)) blitFunc = blitNEON;
 	else blitFunc = blitGeneric;
+#elif defined(__x86_64__) || defined(__i686__) || defined(_M_X86) || defined(_M_X64)
+	if (g_system->hasFeature(OSystem::kFeatureSSE2)) blitFunc = blitSSE2;
+	else blitFunc = blitGeneric;
 #else
 	blitFunc = blitGeneric;
 #endif
@@ -674,5 +678,8 @@ BLIT_FUNC(Generic)
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
 BLIT_FUNC(NEON)
 #endif
+#if defined(__x86_64__) || defined(__i686__) || defined(_M_X86) || defined(_M_X64)
+BLIT_FUNC(SSE2)
+#endif
 
 } // End of namespace Graphics
diff --git a/graphics/blit-neon.cpp b/graphics/blit-neon.cpp
index 6f87f88fb3f..018b15fe5d9 100644
--- a/graphics/blit-neon.cpp
+++ b/graphics/blit-neon.cpp
@@ -112,7 +112,7 @@ struct MultiplyBlend {
     	    srcr = vandq_u32(vshlq_n_u32(vmulq_u32(dstr, vshrq_n_u32(vmulq_u32(vmulq_u32(srcr, vmovq_n_u32(cr)), ina), 16)), BlendBlit::kRModShift - 8), vmovq_n_u32(BlendBlit::kRModMask));
 
     	    src = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
-    	    src = vorrq_u32(src, vorrq_u32(srcb, vorrq_u32(srcg, srcb)));
+    	    src = vorrq_u32(src, vorrq_u32(srcb, vorrq_u32(srcg, srcr)));
     	} else {
     	    uint32x4_t srcg = vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask));
     	    uint32x4_t srcrb = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
@@ -154,10 +154,10 @@ struct OpaqueBlend {
 template<bool doscale, bool rgbmod, bool alphamod>
 struct BinaryBlend {
 	static inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
-            uint32x4_t alphaMask = vceqq_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask)), vmovq_n_u32(0));
-            dst = vandq_u32(dst, alphaMask);
-            src = vandq_u32(vorrq_u32(src, vmovq_n_u32(BlendBlit::kAModMask)), vmvnq_u32(alphaMask));
-            return vorrq_u32(dst, src);
+        uint32x4_t alphaMask = vceqq_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask)), vmovq_n_u32(0));
+        dst = vandq_u32(dst, alphaMask);
+        src = vandq_u32(vorrq_u32(src, vmovq_n_u32(BlendBlit::kAModMask)), vmvnq_u32(alphaMask));
+        return vorrq_u32(dst, src);
 	}
 
 	static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
@@ -254,10 +254,10 @@ struct SubtractiveBlend {
 	}
 
 	static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
-			out[BlendBlit::kAIndex] = 255;
-			out[BlendBlit::kBIndex] = MAX<int32>(out[BlendBlit::kBIndex] - ((in[BlendBlit::kBIndex] * cb  * (out[BlendBlit::kBIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
-			out[BlendBlit::kGIndex] = MAX<int32>(out[BlendBlit::kGIndex] - ((in[BlendBlit::kGIndex] * cg  * (out[BlendBlit::kGIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
-			out[BlendBlit::kRIndex] = MAX<int32>(out[BlendBlit::kRIndex] - ((in[BlendBlit::kRIndex] * cr *  (out[BlendBlit::kRIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
+		out[BlendBlit::kAIndex] = 255;
+		out[BlendBlit::kBIndex] = MAX<int32>(out[BlendBlit::kBIndex] - ((in[BlendBlit::kBIndex] * cb  * (out[BlendBlit::kBIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
+		out[BlendBlit::kGIndex] = MAX<int32>(out[BlendBlit::kGIndex] - ((in[BlendBlit::kGIndex] * cg  * (out[BlendBlit::kGIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
+		out[BlendBlit::kRIndex] = MAX<int32>(out[BlendBlit::kRIndex] - ((in[BlendBlit::kRIndex] * cr *  (out[BlendBlit::kRIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
 	}
 };
 
diff --git a/graphics/blit-sse2.cpp b/graphics/blit-sse2.cpp
new file mode 100644
index 00000000000..799583644fb
--- /dev/null
+++ b/graphics/blit-sse2.cpp
@@ -0,0 +1,376 @@
+/* ScummVM - Graphic Adventure Engine
+ *
+ * ScummVM is the legal property of its developers, whose names
+ * are too numerous to list here. Please refer to the COPYRIGHT
+ * file distributed with this source distribution.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#if defined(__x86_64__) || defined(__i686__) || defined(_M_X86) || defined(_M_X64)
+#include <immintrin.h>
+
+#include "graphics/blit.h"
+#include "graphics/pixelformat.h"
+
+namespace Graphics {
+
+static FORCEINLINE __m128i sse2_mul32(__m128i a, __m128i b) { // Lane-wise 32-bit multiply: yields the low 32 bits of a[i] * b[i] for each of the four lanes
+	__m128i even = _mm_shuffle_epi32(_mm_mul_epu32(a, b), _MM_SHUFFLE(0, 0, 2, 0)); // _mm_mul_epu32 multiplies only lanes 0 and 2 (64-bit results); gather their low halves into lanes 0 and 1
+	__m128i odd = _mm_shuffle_epi32(_mm_mul_epu32(_mm_bsrli_si128(a, 4), _mm_bsrli_si128(b, 4)), _MM_SHUFFLE(0, 0, 2, 0)); // a 4-byte shift moves lanes 1 and 3 into the even slots, then the same gather is applied
+	return _mm_unpacklo_epi32(even, odd); // interleave the two halves back into lane order 0, 1, 2, 3
+}
+
+template<bool doscale, bool rgbmod, bool alphamod>
+struct AlphaBlend { // Per-pixel policy for normal alpha blending: simd() blends four packed 32-bit pixels, normal() blends a single pixel
+	static inline __m128i simd(__m128i src, __m128i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) { // returns the blended four pixels; flip is accepted but not referenced in this body
+	    __m128i ina;
+	    if (alphamod) // effective alpha = (source alpha * ca) >> 8
+			ina = _mm_srli_epi32(_mm_mullo_epi16(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask)), _mm_set1_epi32(ca)), 8);
+	    else
+			ina = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask)); // NOTE(review): ina is used unshifted as a multiplier, so this presumably relies on the alpha channel occupying the low byte -- confirm against BlendBlit::kAModShift
+	    __m128i alphaMask = _mm_cmpeq_epi32(ina, _mm_setzero_si128()); // all-ones in lanes whose effective alpha is 0; those lanes keep dst unchanged below
+	
+	    if (rgbmod) { // colour modulation on: R, G and B are processed separately so each can be scaled by cr / cg / cb
+	    	__m128i dstR = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+	    	__m128i dstG = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+	    	__m128i dstB = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+	    	__m128i srcR = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+	    	__m128i srcG = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+	    	__m128i srcB = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+
+			dstR = _mm_slli_epi32(_mm_mullo_epi16(dstR, _mm_sub_epi32(_mm_set1_epi32(255), ina)), BlendBlit::kRModShift - 8); // dst * (255 - alpha); the 16-bit product is shifted so its high byte lands in the channel position
+			dstG = _mm_slli_epi32(_mm_mullo_epi16(dstG, _mm_sub_epi32(_mm_set1_epi32(255), ina)), BlendBlit::kGModShift - 8);
+			dstB = _mm_mullo_epi16(dstB, _mm_sub_epi32(_mm_set1_epi32(255), ina)); // no shift applied for B -- NOTE(review): this presumably relies on kBModShift == 8, confirm
+			srcR = _mm_add_epi32(dstR, _mm_slli_epi32(_mm_mullo_epi16(_mm_srli_epi32(_mm_mullo_epi16(srcR, ina), 8), _mm_set1_epi32(cr)), BlendBlit::kRModShift - 8)); // add ((src * alpha) >> 8) * cr, moved into the same position
+			srcG = _mm_add_epi32(dstG, _mm_slli_epi32(_mm_mullo_epi16(_mm_srli_epi32(_mm_mullo_epi16(srcG, ina), 8), _mm_set1_epi32(cg)), BlendBlit::kGModShift - 8));
+			srcB = _mm_add_epi32(dstB, _mm_mullo_epi16(_mm_srli_epi32(_mm_mullo_epi16(srcB, ina), 8), _mm_set1_epi32(cb)));
+			src = _mm_or_si128(_mm_and_si128(srcB, _mm_set1_epi32(BlendBlit::kBModMask)), _mm_set1_epi32(BlendBlit::kAModMask)); // recombine the channels; every alpha bit of the result is set
+			src = _mm_or_si128(_mm_and_si128(srcG, _mm_set1_epi32(BlendBlit::kGModMask)), src);
+			src = _mm_or_si128(_mm_and_si128(srcR, _mm_set1_epi32(BlendBlit::kRModMask)), src);
+	    } else { // no colour modulation: R and B share one register (shifted down by kBModShift), G is handled on its own
+			__m128i dstRB = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+			__m128i srcRB = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+			__m128i dstG = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+			__m128i srcG = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+
+			dstRB = _mm_srli_epi32(sse2_mul32(dstRB, _mm_sub_epi32(_mm_set1_epi32(255), ina)), 8); // full 32-bit multiply is needed here because two channels are packed in each lane
+			dstG = _mm_srli_epi32(_mm_mullo_epi16(dstG, _mm_sub_epi32(_mm_set1_epi32(255), ina)), 8);
+			srcRB = _mm_slli_epi32(_mm_add_epi32(dstRB, _mm_srli_epi32(sse2_mul32(srcRB, ina), 8)), BlendBlit::kBModShift);
+			srcG = _mm_slli_epi32(_mm_add_epi32(dstG, _mm_srli_epi32(_mm_mullo_epi16(srcG, ina), 8)), BlendBlit::kGModShift);
+			src = _mm_or_si128(_mm_and_si128(srcG, _mm_set1_epi32(BlendBlit::kGModMask)), _mm_set1_epi32(BlendBlit::kAModMask)); // recombine the channels; every alpha bit of the result is set
+			src = _mm_or_si128(_mm_and_si128(srcRB, _mm_set1_epi32(BlendBlit::kBModMask | BlendBlit::kRModMask)), src);
+	    }
+
+		dst = _mm_and_si128(alphaMask, dst); // per-lane select: untouched dst where effective alpha was 0 ...
+		src = _mm_andnot_si128(alphaMask, src); // ... and the blended value everywhere else
+	    return _mm_or_si128(dst, src);
+	}
+
+	static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) { // scalar version for one pixel, updating out in place; flip is not referenced in this body
+		uint32 ina = in[BlendBlit::kAIndex] * ca >> 8; // effective alpha = (source alpha * ca) >> 8
+
+		if (ina != 0) { // effective alpha of 0 leaves the destination pixel untouched
+			uint outb = (out[BlendBlit::kBIndex] * (255 - ina) >> 8);
+			uint outg = (out[BlendBlit::kGIndex] * (255 - ina) >> 8);
+			uint outr = (out[BlendBlit::kRIndex] * (255 - ina) >> 8);
+
+			out[BlendBlit::kAIndex] = 255; // the result alpha is always written as 255
+			out[BlendBlit::kBIndex] = outb + (in[BlendBlit::kBIndex] * ina * cb >> 16);
+			out[BlendBlit::kGIndex] = outg + (in[BlendBlit::kGIndex] * ina * cg >> 16);
+			out[BlendBlit::kRIndex] = outr + (in[BlendBlit::kRIndex] * ina * cr >> 16);
+		}
+	}
+};
+
+template<bool doscale, bool rgbmod, bool alphamod>
+struct MultiplyBlend { // Per-pixel policy for multiply blending: simd() blends four packed 32-bit pixels, normal() blends a single pixel
+	static inline __m128i simd(__m128i src, __m128i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) { // returns the blended four pixels; flip is accepted but not referenced in this body
+    	__m128i ina;
+	    if (alphamod) // effective alpha = (source alpha * ca) >> 8
+			ina = _mm_srli_epi32(_mm_mullo_epi16(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask)), _mm_set1_epi32(ca)), 8);
+	    else
+			ina = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
+	    __m128i alphaMask = _mm_cmpeq_epi32(ina, _mm_setzero_si128()); // all-ones in lanes whose effective alpha is 0; those lanes keep dst unchanged below
+
+    	if (rgbmod) { // colour modulation on: every channel is brought down to bits 0..7 and processed separately
+			__m128i srcb = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+			__m128i srcg = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+			__m128i srcr = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+			__m128i dstb = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+			__m128i dstg = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+			__m128i dstr = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+
+			srcb = _mm_and_si128(_mm_slli_epi32(sse2_mul32(dstb, _mm_srli_epi32(sse2_mul32(_mm_mullo_epi16(srcb, _mm_set1_epi32(cb)), ina), 16)), BlendBlit::kBModShift - 8), _mm_set1_epi32(BlendBlit::kBModMask)); // dst * ((src * c * alpha) >> 16) is a 16-bit value whose result byte sits in bits 8..15, so it must move LEFT by (shift - 8) to reach the channel position (a right shift here left G and R masked to zero)
+			srcg = _mm_and_si128(_mm_slli_epi32(sse2_mul32(dstg, _mm_srli_epi32(sse2_mul32(_mm_mullo_epi16(srcg, _mm_set1_epi32(cg)), ina), 16)), BlendBlit::kGModShift - 8), _mm_set1_epi32(BlendBlit::kGModMask));
+			srcr = _mm_and_si128(_mm_slli_epi32(sse2_mul32(dstr, _mm_srli_epi32(sse2_mul32(_mm_mullo_epi16(srcr, _mm_set1_epi32(cr)), ina), 16)), BlendBlit::kRModShift - 8), _mm_set1_epi32(BlendBlit::kRModMask));
+
+			src = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask)); // NOTE(review): result alpha is taken from src here, while normal() leaves the destination alpha untouched -- confirm which is intended
+			src = _mm_or_si128(src, _mm_or_si128(srcb, _mm_or_si128(srcg, srcr)));
+    	} else { // NOTE(review): this path multiplies channel values that are still packed/unshifted; verify its results against normal()
+			__m128i srcg = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask));
+			__m128i srcrb = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+			__m128i dstg = _mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask));
+			__m128i dstrb = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+    	    srcg = _mm_and_si128(_mm_srli_epi32(sse2_mul32(dstg, _mm_srli_epi32(sse2_mul32(srcg, ina), 8)), 8), _mm_set1_epi32(BlendBlit::kGModMask));
+    	    srcrb = _mm_and_si128(sse2_mul32(dstrb, _mm_srli_epi32(sse2_mul32(srcrb, ina), 8)), _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask));
+    	    src = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
+    	    src = _mm_or_si128(src, _mm_or_si128(srcrb, srcg));
+    	}
+
+    	dst = _mm_and_si128(alphaMask, dst); // per-lane select: untouched dst where effective alpha was 0 ...
+    	src = _mm_andnot_si128(alphaMask, src); // ... and the blended value everywhere else
+    	return _mm_or_si128(dst, src);
+	}
+
+	static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) { // scalar version for one pixel, updating out in place; flip is not referenced in this body
+		uint32 ina = in[BlendBlit::kAIndex] * ca >> 8; // effective alpha = (source alpha * ca) >> 8
+
+		if (ina != 0) { // effective alpha of 0 leaves the destination pixel untouched; the alpha byte of out is never written
+			out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] * ((in[BlendBlit::kBIndex] * cb * ina) >> 16) >> 8;
+			out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] * ((in[BlendBlit::kGIndex] * cg * ina) >> 16) >> 8;
+			out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] * ((in[BlendBlit::kRIndex] * cr * ina) >> 16) >> 8;
+		}
+	}
+};
+
+template<bool doscale, bool rgbmod, bool alphamod>
+struct OpaqueBlend { // Per-pixel policy for opaque copies: the source replaces the destination with every alpha bit set
+	static inline __m128i simd(__m128i src, __m128i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) { // four pixels at once; dst, flip and the colour arguments are not referenced
+		return _mm_or_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
+	}
+
+	static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) { // single pixel; flip and the colour arguments are not referenced
+		*(uint32 *)out = *(const uint32 *)in | BlendBlit::kAModMask; // whole-pixel 32-bit load/store through the byte pointers
+	}
+};
+
+template<bool doscale, bool rgbmod, bool alphamod>
+struct BinaryBlend { // Per-pixel policy for on/off transparency: source alpha 0 keeps the destination, anything else copies the source with alpha forced fully on
+	static inline __m128i simd(__m128i src, __m128i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) { // four pixels at once; flip and the colour arguments are not referenced
+		__m128i alphaMask = _mm_cmpeq_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask)), _mm_setzero_si128()); // all-ones in lanes whose source alpha is exactly 0
+		dst = _mm_and_si128(dst, alphaMask); // keep dst only in the transparent lanes
+		src = _mm_andnot_si128(alphaMask, _mm_or_si128(src, _mm_set1_epi32(BlendBlit::kAModMask))); // keep src (with all alpha bits set) in the remaining lanes
+		return _mm_or_si128(src, dst);
+	}
+
+	static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) { // single pixel; flip and the colour arguments are not referenced
+		uint32 pix = *(const uint32 *)in;
+		int a = in[BlendBlit::kAIndex];
+
+		if (a != 0) {   // Full opacity (Any value not exactly 0 is Opaque here)
+			*(uint32 *)out = pix;
+			out[BlendBlit::kAIndex] = 0xFF;
+		}
+	}
+};
+
+template<bool doscale, bool rgbmod, bool alphamod>
+struct AdditiveBlend { // Per-pixel policy for additive blending: simd() blends four packed 32-bit pixels, normal() blends a single pixel; channel sums wrap (no saturation)
+	static inline __m128i simd(__m128i src, __m128i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) { // returns the blended four pixels; flip is accepted but not referenced in this body
+    	__m128i ina;
+    	if (alphamod) // effective alpha = (source alpha * ca) >> 8
+    	    ina = _mm_srli_epi32(sse2_mul32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask)), _mm_set1_epi32(ca)), 8);
+    	else
+    	    ina = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
+    	__m128i alphaMask = _mm_cmpeq_epi32(ina, _mm_set1_epi32(0)); // all-ones in lanes whose effective alpha is 0; those lanes keep dst unchanged below
+
+    	if (rgbmod) { // colour modulation on: each channel is scaled by its colour component and by alpha
+    	    __m128i srcb = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kBModMask));
+    	    __m128i srcg = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+    	    __m128i srcr = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+    	    __m128i dstb = _mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kBModMask)); // destination channels stay in their native bit positions so they line up with the scaled source terms below
+    	    __m128i dstg = _mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask));
+    	    __m128i dstr = _mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask));
+
+			srcb = _mm_and_si128(_mm_add_epi32(dstb, _mm_srli_epi32(sse2_mul32(srcb, sse2_mul32(_mm_set1_epi32(cb), ina)), 16)), _mm_set1_epi32(BlendBlit::kBModMask)); // unshifted src * c * alpha, >> 16 puts the result byte back in the B position
+			srcg = _mm_and_si128(_mm_add_epi32(dstg, sse2_mul32(srcg, sse2_mul32(_mm_set1_epi32(cg), ina))), _mm_set1_epi32(BlendBlit::kGModMask)); // 8-bit src * c * alpha has its result byte in bits 16..23 (this formula relies on G living there)
+			srcr = _mm_and_si128(_mm_add_epi32(dstr, _mm_slli_epi32(sse2_mul32(srcr, sse2_mul32(_mm_set1_epi32(cr), ina)), BlendBlit::kRModShift - 16)), _mm_set1_epi32(BlendBlit::kRModMask)); // same product, moved LEFT from bits 16..23 up to the R position
+
+    	    src = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask)); // NOTE(review): result alpha is taken from src here, while normal() leaves the destination alpha untouched -- confirm which is intended
+    	    src = _mm_or_si128(src, _mm_or_si128(srcb, _mm_or_si128(srcg, srcr))); // all three channels are merged (srcr was previously dropped in favour of a second srcb)
+    	} else if (alphamod) { // alpha modulation only: R and B share one register, G is handled on its own
+    	    __m128i srcg = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask));
+    	    __m128i srcrb = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+    	    __m128i dstg = _mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask));
+    	    __m128i dstrb = _mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)); // kept unshifted: the scaled source bytes below land in the native R/B positions
+
+			srcg = _mm_and_si128(_mm_add_epi32(dstg, _mm_srli_epi32(sse2_mul32(srcg, ina), 8)), _mm_set1_epi32(BlendBlit::kGModMask));
+			srcrb = _mm_and_si128(_mm_add_epi32(dstrb, _mm_and_si128(sse2_mul32(srcrb, ina), _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask))), _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)); // drop the fractional bytes of the product before adding so a carry out of B cannot ripple into R
+
+    	    src = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
+    	    src = _mm_or_si128(src, _mm_or_si128(srcrb, srcg));
+    	} else { // no modulation at all: plain per-channel addition
+    	    __m128i srcg = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask));
+    	    __m128i srcrb = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+    	    __m128i dstg = _mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask));
+    	    __m128i dstrb = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+
+			srcg = _mm_and_si128(_mm_add_epi32(dstg, srcg), _mm_set1_epi32(BlendBlit::kGModMask));
+			srcrb = _mm_and_si128(_mm_slli_epi32(_mm_add_epi32(dstrb, srcrb), 8), _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)); // added in the shifted-down domain, then moved back up; the mask discards the carry out of each channel
+
+    	    src = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
+    	    src = _mm_or_si128(src, _mm_or_si128(srcrb, srcg));
+		}
+
+    	dst = _mm_and_si128(alphaMask, dst); // per-lane select: untouched dst where effective alpha was 0 ...
+    	src = _mm_andnot_si128(alphaMask, src); // ... and the blended value everywhere else
+    	return _mm_or_si128(dst, src);
+	}
+
+	static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) { // scalar version for one pixel, updating out in place; flip is not referenced in this body
+		uint32 ina = in[BlendBlit::kAIndex] * ca >> 8; // effective alpha = (source alpha * ca) >> 8
+
+		if (ina != 0) { // effective alpha of 0 leaves the destination pixel untouched; the alpha byte of out is never written
+			out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] + ((in[BlendBlit::kBIndex] * cb * ina) >> 16);
+			out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] + ((in[BlendBlit::kGIndex] * cg * ina) >> 16);
+			out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] + ((in[BlendBlit::kRIndex] * cr * ina) >> 16);
+		}
+	}
+};
+
+template<bool doscale, bool rgbmod, bool alphamod>
+struct SubtractiveBlend { // Per-pixel policy for subtractive blending: each destination channel is reduced by a source-dependent amount and clamped at 0; result alpha is forced fully on
+	static inline __m128i simd(__m128i src, __m128i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) { // four pixels at once; flip and ca are not referenced in this body
+    	__m128i ina = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask)); // raw source alpha, no alpha modulation in this mode
+    	__m128i srcb = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+    	__m128i srcg = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+    	__m128i srcr = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+    	__m128i dstb = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+    	__m128i dstg = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+    	__m128i dstr = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+
+		srcb = _mm_and_si128(_mm_slli_epi32(_mm_max_epi16(_mm_sub_epi32(dstb, _mm_srli_epi32(sse2_mul32(sse2_mul32(srcb, _mm_set1_epi32(cb)), sse2_mul32(dstb, ina)), 24)), _mm_set1_epi32(0)), BlendBlit::kBModShift), _mm_set1_epi32(BlendBlit::kBModMask)); // max(dst - ((src * c * dst * alpha) >> 24), 0), then moved back into the channel position; the 16-bit signed max turns both halves of a negative 32-bit difference into 0
+		srcg = _mm_and_si128(_mm_slli_epi32(_mm_max_epi16(_mm_sub_epi32(dstg, _mm_srli_epi32(sse2_mul32(sse2_mul32(srcg, _mm_set1_epi32(cg)), sse2_mul32(dstg, ina)), 24)), _mm_set1_epi32(0)), BlendBlit::kGModShift), _mm_set1_epi32(BlendBlit::kGModMask));
+		srcr = _mm_and_si128(_mm_slli_epi32(_mm_max_epi16(_mm_sub_epi32(dstr, _mm_srli_epi32(sse2_mul32(sse2_mul32(srcr, _mm_set1_epi32(cr)), sse2_mul32(dstr, ina)), 24)), _mm_set1_epi32(0)), BlendBlit::kRModShift), _mm_set1_epi32(BlendBlit::kRModMask));
+
+    	return _mm_or_si128(_mm_set1_epi32(BlendBlit::kAModMask), _mm_or_si128(srcb, _mm_or_si128(srcg, srcr))); // unlike the other modes there is no zero-alpha lane select here
+	}
+
+	static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) { // scalar version for one pixel, updating out in place; flip and ca are not referenced in this body
+		out[BlendBlit::kAIndex] = 255;
+		out[BlendBlit::kBIndex] = MAX<int32>(out[BlendBlit::kBIndex] - ((in[BlendBlit::kBIndex] * cb  * (out[BlendBlit::kBIndex]) * in[BlendBlit::kAIndex]) >> 24), 0); // NOTE(review): if byte is an 8-bit unsigned type this four-factor product is evaluated as int and can exceed INT_MAX -- confirm
+		out[BlendBlit::kGIndex] = MAX<int32>(out[BlendBlit::kGIndex] - ((in[BlendBlit::kGIndex] * cg  * (out[BlendBlit::kGIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
+		out[BlendBlit::kRIndex] = MAX<int32>(out[BlendBlit::kRIndex] - ((in[BlendBlit::kRIndex] * cr *  (out[BlendBlit::kRIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
+	}
+};
+
+class BlendBlitImpl { // Holder for the row/column loop shared by every SSE2 blend mode
+
+public:
+template<template <bool DOSCALE, bool RGBMOD, bool ALPHAMOD> class PixelFunc, bool doscale, bool rgbmod, bool alphamod, bool coloradd1, bool loaddst> // PixelFunc: blend policy providing simd() and normal(); loaddst: read destination pixels before blending
+static inline void blitInnerLoop(BlendBlit::Args &args) { // blits args.height rows of args.width pixels; advances args.ino / args.outo in place while doing so
+	const byte *in;
+	byte *out;
+
+	const byte rawcr = (args.color >> BlendBlit::kRModShift) & 0xFF;
+	const byte rawcg = (args.color >> BlendBlit::kGModShift) & 0xFF;
+	const byte rawcb = (args.color >> BlendBlit::kBModShift) & 0xFF;
+	const byte ca = alphamod ? ((args.color >> BlendBlit::kAModShift) & 0xFF) : 255; // modulation off means a neutral factor of 255
+	const uint32 cr = coloradd1 ? (rgbmod   ? (rawcr == 255 ? 256 : rawcr) : 256) : (rgbmod   ? rawcr : 255); // with coloradd1 a factor of 255 (or modulation off) becomes 256; otherwise the raw value, or 255 when modulation is off
+	const uint32 cg = coloradd1 ? (rgbmod   ? (rawcg == 255 ? 256 : rawcg) : 256) : (rgbmod   ? rawcg : 255);
+	const uint32 cb = coloradd1 ? (rgbmod   ? (rawcb == 255 ? 256 : rawcb) : 256) : (rgbmod   ? rawcb : 255);
+
+	int scaleXCtr, scaleYCtr = 0; // fixed-point source positions, only used when doscale is set
+	const byte *inBase;
+
+    if (!doscale && (args.flipping & FLIP_H)) args.ino -= 4 * 3; // unscaled horizontal flip: start 3 pixels earlier so each 4-pixel load covers the cursor pixel and the three before it
+
+	for (uint32 i = 0; i < args.height; i++) {
+		if (doscale) {
+			inBase = args.ino + scaleYCtr / BlendBlit::SCALE_THRESHOLD * args.inoStep; // source row selected by the fixed-point y counter
+			scaleXCtr = 0;
+		} else {
+			in = args.ino;
+		}
+		out = args.outo;
+
+		uint32 j = 0;
+		for (; j + 4 <= args.width; j += 4) { // vector part: four pixels per iteration
+    		__m128i dstPixels, srcPixels;
+			if (loaddst) dstPixels = _mm_loadu_si128((const __m128i *)out); // dstPixels is left unset when loaddst is false
+    		if (!doscale) {
+    		    srcPixels = _mm_loadu_si128((const __m128i *)in);
+    		} else {
+				srcPixels = _mm_setr_epi32( // scaled: fetch the four source pixels individually at their fixed-point x positions
+					*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 0) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
+					*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 1) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
+					*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 2) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
+					*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 3) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep)
+				);
+				scaleXCtr += args.scaleX * 4;
+    		}
+    		if (!doscale && (args.flipping & FLIP_H)) {
+				srcPixels = _mm_shuffle_epi32(srcPixels, _MM_SHUFFLE(0, 1, 2, 3)); // reverse the lane order so the loaded group is mirrored
+    		}
+			{
+				const __m128i res = PixelFunc<doscale, rgbmod, alphamod>::simd(srcPixels, dstPixels, args.flipping & FLIP_H, ca, cr, cg, cb);
+				_mm_storeu_si128((__m128i *)out, res);
+			}
+			if (!doscale) in += (ptrdiff_t)args.inStep * 4;
+			out += 4ULL * 4;
+		}
+		if (!doscale && (args.flipping & FLIP_H)) in += 4 * 3; // undo the 3-pixel offset so the scalar tail addresses single pixels
+		for (; j < args.width; j++) { // scalar tail: the pixels left over after the groups of four
+			if (doscale) {
+				in = inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep;
+			}
+
+			PixelFunc<doscale, rgbmod, alphamod>::normal(in, out, args.flipping & FLIP_H, ca, cr, cg, cb);
+            
+			if (doscale)
+				scaleXCtr += args.scaleX;
+			else
+				in += args.inStep;
+			out += 4;
+		}
+		if (doscale)
+			scaleYCtr += args.scaleY; // scaled: the source row is recomputed from the counter at the top of the loop
+		else
+			args.ino += args.inoStep; // unscaled: step the source row pointer directly
+		args.outo += args.dstPitch;
+	}
+}
+
+};
+
+template<bool doscale, bool rgbmod, bool alphamod>
+void BlendBlit::doBlitAlphaBlendLogicSSE2(Args &args) { // SSE2 entry point for alpha blending: runs the shared inner loop with AlphaBlend as the per-pixel policy
+	BlendBlitImpl::blitInnerLoop<AlphaBlend, doscale, rgbmod, alphamod, false, true>(args); // coloradd1 = false, loaddst = true
+}
+template<bool doscale, bool rgbmod>
+void BlendBlit::doBlitSubtractiveBlendLogicSSE2(Args &args) { // SSE2 entry point for subtractive blending: runs the shared inner loop with SubtractiveBlend as the per-pixel policy
+	BlendBlitImpl::blitInnerLoop<SubtractiveBlend, doscale, rgbmod, false, false, true>(args); // alphamod fixed to false; coloradd1 = false, loaddst = true
+}
+template<bool doscale, bool rgbmod, bool alphamod>
+void BlendBlit::doBlitAdditiveBlendLogicSSE2(Args &args) { // SSE2 entry point for additive blending: runs the shared inner loop with AdditiveBlend as the per-pixel policy
+	BlendBlitImpl::blitInnerLoop<AdditiveBlend, doscale, rgbmod, alphamod, false, true>(args); // coloradd1 = false, loaddst = true
+}
+template<bool doscale>
+void BlendBlit::doBlitOpaqueBlendLogicSSE2(Args &args) { // SSE2 entry point for opaque copies: runs the shared inner loop with OpaqueBlend as the per-pixel policy
+	BlendBlitImpl::blitInnerLoop<OpaqueBlend, doscale, false, false, false, true>(args); // rgbmod and alphamod fixed to false; coloradd1 = false, loaddst = true
+}
+template<bool doscale>
+void BlendBlit::doBlitBinaryBlendLogicSSE2(Args &args) { // SSE2 entry point for binary (on/off alpha) blits: runs the shared inner loop with BinaryBlend as the per-pixel policy
+	BlendBlitImpl::blitInnerLoop<BinaryBlend, doscale, false, false, false, true>(args); // rgbmod and alphamod fixed to false; coloradd1 = false, loaddst = true
+}
+template<bool doscale, bool rgbmod, bool alphamod>
+void BlendBlit::doBlitMultiplyBlendLogicSSE2(Args &args) { // SSE2 entry point for multiply blending: runs the shared inner loop with MultiplyBlend as the per-pixel policy
+	BlendBlitImpl::blitInnerLoop<MultiplyBlend, doscale, rgbmod, alphamod, false, true>(args); // coloradd1 = false, loaddst = true
+}
+
+} // End of namespace Graphics
+
+#endif // __x86_64__
diff --git a/graphics/blit.h b/graphics/blit.h
index 2ac68c8fca6..46d0879155a 100644
--- a/graphics/blit.h
+++ b/graphics/blit.h
@@ -237,6 +237,9 @@ private:
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
 LOGIC_FUNCS_EXT(NEON)
 #endif
+#if defined(__x86_64__) || defined(__i686__) || defined(_M_X86) || defined(_M_X64)
+LOGIC_FUNCS_EXT(SSE2)
+#endif
 LOGIC_FUNCS_EXT(Generic)
 #undef LOGIC_FUNCS_EXT
 


Commit: 4f9739685a3b8e728a0530efc331d5e1de8e4a17
    https://github.com/scummvm/scummvm/commit/4f9739685a3b8e728a0530efc331d5e1de8e4a17
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
TEST: blendBlit test will now also test SSE2

Changed paths:
    test/image/blending.h


diff --git a/test/image/blending.h b/test/image/blending.h
index 999f55c1606..5edff8d87e3 100644
--- a/test/image/blending.h
+++ b/test/image/blending.h
@@ -893,11 +893,13 @@ public:
 	void test_blend_speed() {
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
 		Graphics::BlendBlit::blitFunc = Graphics::BlendBlit::blitNEON;
+#elif defined(__x86_64__) || defined(__i686__) || defined(_M_X86) || defined(_M_X64)
+		Graphics::BlendBlit::blitFunc = Graphics::BlendBlit::blitSSE2;
 #else
 		Graphics::BlendBlit::blitFunc = Graphics::BlendBlit::blitGeneric;
 #endif
 	    Graphics::Surface baseSurface, destSurface;
-	    baseSurface.create(128, 128, OldTransparentSurface::OldTransparentSurface::getSupportedPixelFormat());
+	    baseSurface.create(103, 103, OldTransparentSurface::OldTransparentSurface::getSupportedPixelFormat());
 	    destSurface.create(256, 256, OldTransparentSurface::OldTransparentSurface::getSupportedPixelFormat());
 	    for (int y = 0; y < baseSurface.h; y++) {
 	    	for (int x = 0; x < baseSurface.w; x++) {
@@ -915,9 +917,9 @@ public:
 		double oldTime = 0.0, newTime = 0.0, genericTime = 0.0;
 		double oldTimeScaled = 0.0, newTimeScaled = 0.0, genericTimeScaled = 0.0;
 		const int iters = 2500;
-		
-        for (int blendMode = Graphics::BLEND_NORMAL; blendMode < Graphics::BLEND_NORMAL + 1; blendMode++) {
-        for (int alphaType = Graphics::ALPHA_FULL; alphaType <= Graphics::ALPHA_FULL; alphaType++) {
+
+        for (int blendMode = Graphics::BLEND_NORMAL; blendMode < Graphics::NUM_BLEND_MODES; blendMode++) {
+        for (int alphaType = Graphics::ALPHA_OPAQUE; alphaType <= Graphics::ALPHA_FULL; alphaType++) {
         for (int flipping = 0; flipping <= 3; flipping++) {
 		for (uint32 color = 0xffffffff; color != 0; color = (color == 0xffffffff ? 0x7f7f7f7f : 0)) {
             oldSurfDest.fillRect(Common::Rect(0, 0, oldSurfDest.w, oldSurfDest.h), oldSurfDest.format.ARGBToColor(255, 255, 255, 255));
@@ -1050,7 +1052,7 @@ public:
         for (int g = 255; g >= 0; g = (g == 255 ? 128 : (g == 128 ? 0 : -1))) {
         for (int b = 255; b >= 0; b = (b == 255 ? 128 : (b == 128 ? 0 : -1))) {
         for (int flipping = 0; flipping <= 3; flipping++) {
-        for (int rect = 0; rect < sizeof(srcs)/sizeof(srcs[0]); rect++) {
+        for (int rect = 0; rect < (int)(sizeof(srcs)/sizeof(srcs[0])); rect++) {
             oldSurfDest.fillRect(Common::Rect(0, 0, oldSurfDest.w, oldSurfDest.h), oldSurfDest.format.ARGBToColor(ba, br, bg, bb));
             oldSurf.setAlphaMode((Graphics::AlphaType)alphaType);
             oldSurf.blit(oldSurfDest, dsts[rect].left, dsts[rect].top, flipping, &srcs[rect], TS_ARGB(a, r, g, b), dsts[rect].width(), dsts[rect].height(), (Graphics::TSpriteBlendMode)blendMode);


Commit: 843d835641deae375950d054b0cf7991cd9ae963
    https://github.com/scummvm/scummvm/commit/843d835641deae375950d054b0cf7991cd9ae963
Author: Wyatt Radkiewicz (wyattwradkiewicz at gmail.com)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
TEST: Change blendFrom test

Changed paths:
    test/image/blending.h


diff --git a/test/image/blending.h b/test/image/blending.h
index 5edff8d87e3..e8dc9a5fbbd 100644
--- a/test/image/blending.h
+++ b/test/image/blending.h
@@ -918,8 +918,8 @@ public:
 		double oldTimeScaled = 0.0, newTimeScaled = 0.0, genericTimeScaled = 0.0;
 		const int iters = 2500;
 
-        for (int blendMode = Graphics::BLEND_NORMAL; blendMode < Graphics::NUM_BLEND_MODES; blendMode++) {
-        for (int alphaType = Graphics::ALPHA_OPAQUE; alphaType <= Graphics::ALPHA_FULL; alphaType++) {
+        for (int blendMode = 0; blendMode < Graphics::NUM_BLEND_MODES; blendMode++) {
+        for (int alphaType = 0; alphaType <= Graphics::ALPHA_FULL; alphaType++) {
         for (int flipping = 0; flipping <= 3; flipping++) {
 		for (uint32 color = 0xffffffff; color != 0; color = (color == 0xffffffff ? 0x7f7f7f7f : 0)) {
             oldSurfDest.fillRect(Common::Rect(0, 0, oldSurfDest.w, oldSurfDest.h), oldSurfDest.format.ARGBToColor(255, 255, 255, 255));


Commit: e4c984cd3aadb71ef519d58def2a07d738add8c3
    https://github.com/scummvm/scummvm/commit/e4c984cd3aadb71ef519d58def2a07d738add8c3
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
BUILD: Configure flags for SIMD extensions

--enable-ext-neon
--enable-ext-sse2
--enable-ext-avx2

Changed paths:
    configure
    graphics/blit-alpha.cpp
    graphics/blit-neon.cpp
    graphics/blit-sse2.cpp
    graphics/blit.h
    test/image/blending.h


diff --git a/configure b/configure
index d5559372305..133a9881420 100755
--- a/configure
+++ b/configure
@@ -293,6 +293,9 @@ add_feature zlib "zlib" "_zlib"
 add_feature lua "lua" "_lua"
 add_feature fribidi "FriBidi" "_fribidi"
 add_feature test_cxx11 "Test C++11" "_test_cxx11"
+add_feature ext_sse2 "Add x86/64 SSE2 support" "_ext_sse2"
+add_feature ext_avx2 "Add x86/64 AVX2 support" "_ext_avx2"
+add_feature ext_neon "Add Arm NEON support" "_ext_neon"
 
 # Directories for installing ScummVM.
 # This list is closely based on what GNU autoconf does,
@@ -979,6 +982,9 @@ Optional Features:
   --disable-windows-console do not show console output on Windows
   --enable-windows-unicode  use Windows Unicode APIs (default)
   --disable-windows-unicode use Windows ANSI APIs
+	--enable-ext-sse2				  allow code to use sse2 extensions on x86/64
+	--enable-ext-avx2				  allow code to use avx2 extensions on x86/64
+	--enable-ext-neon				  allow code to use neon extensions on Arm
 
 Optional Documentation Options:
   --with-manual-version=VERSION version to use when generating the manual (optional)
@@ -1292,6 +1298,12 @@ for ac_option in $@; do
 	--disable-eventrecorder)     _eventrec=no            ;;
 	--enable-text-console)       _text_console=yes       ;;
 	--disable-text-console)      _text_console=no        ;;
+	--enable-ext-sse2)           _ext_sse2=yes       		 ;;
+	--disable-ext-sse2)          _ext_sse2=no        		 ;;
+	--enable-ext-avx2)           _ext_avx2=yes       		 ;;
+	--disable-ext-avx2)          _ext_avx2=no        		 ;;
+	--enable-ext-neon)           _ext_neon=yes       		 ;;
+	--disable-ext-neon)          _ext_neon=no        		 ;;
 	--with-fluidsynth-prefix=*)
 		arg=`echo $ac_option | cut -d '=' -f 2`
 		FLUIDSYNTH_CFLAGS="-I$arg/include"
@@ -6857,6 +6869,30 @@ if test "$_enable_ubsan" = yes ; then
 fi
 echo "$_enable_ubsan"
 
+#
+# Whether to add compiler options and preprocessor defines for SIMD extensions
+#
+if [[ "$_host_cpu" =~ ^86$ ]] ; then
+	define_in_config_if_yes "$_ext_sse2" 'SCUMMVM_SSE2'
+	if test "$_ext_sse2" = yes ; then
+		append_var CXXFLAGS "-msse2"
+	fi
+	echo_n "Enabling x86/64 SSE2... "
+	echo "$_ext_sse2"
+	define_in_config_if_yes "$_ext_avx2" 'SCUMMVM_AVX2'
+	if test "$_ext_avx2" = yes ; then
+		append_var CXXFLAGS "-mavx2 -mavx"
+	fi
+	echo_n "Enabling x86/64 AVX2... "
+	echo "$_ext_avx2"
+fi
+
+if [[ "$_host_cpu" =~ ^arm$ ]] || [[ "$_host_cpu" =~ ^aarch64$ ]] ; then
+	define_in_config_if_yes "$_ext_neon" 'SCUMMVM_NEON'
+	echo_n "Enabling arm NEON... "
+	echo "$_ext_neon"
+fi
+
 echo_n "Backend... "
 echo_n "$_backend"
 
diff --git a/graphics/blit-alpha.cpp b/graphics/blit-alpha.cpp
index 62490ee5ac9..1b4f70ab765 100644
--- a/graphics/blit-alpha.cpp
+++ b/graphics/blit-alpha.cpp
@@ -538,14 +538,12 @@ void BlendBlit::blit(byte *dst, const byte *src,
 	if (width == 0 || height == 0) return;
 	if (!blitFunc) {
 	// Get the correct blit function
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+	blitFunc = blitGeneric;
+#ifdef SCUMMVM_NEON
 	if (g_system->hasFeature(OSystem::kFeatureNEON)) blitFunc = blitNEON;
-	else blitFunc = blitGeneric;
-#elif defined(__x86_64__) || defined(__i686__) || defined(_M_X86) || defined(_M_X64)
+#endif
+#ifdef SCUMMVM_SSE2
 	if (g_system->hasFeature(OSystem::kFeatureSSE2)) blitFunc = blitSSE2;
-	else blitFunc = blitGeneric;
-#else
-	blitFunc = blitGeneric;
 #endif
 	}
 	
@@ -675,10 +673,10 @@ void BlendBlit::blit(byte *dst, const byte *src,
 		} \
 	}
 BLIT_FUNC(Generic)
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#ifdef SCUMMVM_NEON
 BLIT_FUNC(NEON)
 #endif
-#if defined(__x86_64__) || defined(__i686__) || defined(_M_X86) || defined(_M_X64)
+#ifdef SCUMMVM_SSE2
 BLIT_FUNC(SSE2)
 #endif
 
diff --git a/graphics/blit-neon.cpp b/graphics/blit-neon.cpp
index 018b15fe5d9..16ee17fd2b4 100644
--- a/graphics/blit-neon.cpp
+++ b/graphics/blit-neon.cpp
@@ -19,7 +19,7 @@
  *
  */
 
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#ifdef SCUMMVM_NEON
 #include <arm_neon.h>
 
 #include "graphics/blit.h"
@@ -369,4 +369,4 @@ void BlendBlit::doBlitMultiplyBlendLogicNEON(Args &args) {
 
 } // end of namespace Graphics
 
-#endif // __ARM_NEON__
+#endif // SCUMMVM_NEON
diff --git a/graphics/blit-sse2.cpp b/graphics/blit-sse2.cpp
index 799583644fb..315d479f154 100644
--- a/graphics/blit-sse2.cpp
+++ b/graphics/blit-sse2.cpp
@@ -19,7 +19,7 @@
  *
  */
 
-#if defined(__x86_64__) || defined(__i686__) || defined(_M_X86) || defined(_M_X64)
+#ifdef SCUMMVM_SSE2
 #include <immintrin.h>
 
 #include "graphics/blit.h"
@@ -373,4 +373,4 @@ void BlendBlit::doBlitMultiplyBlendLogicSSE2(Args &args) {
 
 } // End of namespace Graphics
 
-#endif // __x86_64__
+#endif // SSE2
diff --git a/graphics/blit.h b/graphics/blit.h
index 46d0879155a..1887bbaeb4c 100644
--- a/graphics/blit.h
+++ b/graphics/blit.h
@@ -234,10 +234,10 @@ private:
 	template<bool doscale, bool rgbmod, bool alphamod> \
 	static void doBlitAlphaBlendLogic##ext(Args &args); \
 	static void blit##ext(Args &args, const TSpriteBlendMode &blendMode, const AlphaType &alphaType);
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#ifdef SCUMMVM_NEON
 LOGIC_FUNCS_EXT(NEON)
 #endif
-#if defined(__x86_64__) || defined(__i686__) || defined(_M_X86) || defined(_M_X64)
+#ifdef SCUMMVM_SSE2
 LOGIC_FUNCS_EXT(SSE2)
 #endif
 LOGIC_FUNCS_EXT(Generic)
diff --git a/test/image/blending.h b/test/image/blending.h
index e8dc9a5fbbd..f40cd5282e9 100644
--- a/test/image/blending.h
+++ b/test/image/blending.h
@@ -891,9 +891,9 @@ static bool areSurfacesEqual(const Graphics::Surface *a, const Graphics::Surface
 class BlendBlitUnfilteredTestSuite : public CxxTest::TestSuite {
 public:
 	void test_blend_speed() {
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#ifdef SCUMMVM_NEON
 		Graphics::BlendBlit::blitFunc = Graphics::BlendBlit::blitNEON;
-#elif defined(__x86_64__) || defined(__i686__) || defined(_M_X86) || defined(_M_X64)
+#elif SCUMMVM_SSE2
 		Graphics::BlendBlit::blitFunc = Graphics::BlendBlit::blitSSE2;
 #else
 		Graphics::BlendBlit::blitFunc = Graphics::BlendBlit::blitGeneric;


Commit: 64a6548041c6c510525d18a49baa88204edc1faf
    https://github.com/scummvm/scummvm/commit/64a6548041c6c510525d18a49baa88204edc1faf
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
BACKENDS: Fixed BaseBackend AVX2 detection

Changed paths:
    backends/base-backend.cpp


diff --git a/backends/base-backend.cpp b/backends/base-backend.cpp
index 5a3837d5510..7533cb9b13e 100644
--- a/backends/base-backend.cpp
+++ b/backends/base-backend.cpp
@@ -73,6 +73,7 @@ void BaseBackend::initBackend() {
 		 "mov %%edx, %0\n\t"
 		 "mov %%ecx, %2\n\t"
 		 "mov $7, %%eax\n\t"
+		 "mov $0, %%ecx\n\t"
 		 "cpuid\n\t"
 		 "mov %%ebx, %1\n\t"
 		 : "=rm" (ext_edx1), "=rm" (ext_ebx7), "=rm" (ext_ecx1)
@@ -85,7 +86,8 @@ void BaseBackend::initBackend() {
 		cpuid
 		mov ext_edx1,edx
 		mov ext_ecx1,ecx
-		mov ebx,7
+		mov eax,7
+		mov ecx,0
 		cpuid
 		mov ext_ebx7,ebx
 	}


Commit: 01e218316135fafaf09c605c780e6bc2f5e32326
    https://github.com/scummvm/scummvm/commit/01e218316135fafaf09c605c780e6bc2f5e32326
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
BUILD: Make ./configure POSIX compliant

Changed paths:
    configure


diff --git a/configure b/configure
index 133a9881420..ed5a1554868 100755
--- a/configure
+++ b/configure
@@ -225,6 +225,9 @@ _builtin_resources=yes
 _windows_console=yes
 _windows_unicode=yes
 _cygwin_build=no
+_ext_sse2=no
+_ext_avx2=no
+_ext_neon=no
 # Default commands
 _ranlib=ranlib
 _strip=strip
@@ -6872,22 +6875,23 @@ echo "$_enable_ubsan"
 #
 # Whether to add compiler options and preprocessor defines for SIMD extensions
 #
-if [[ "$_host_cpu" =~ ^86$ ]] ; then
+if ( echo "$_host_cpu" | grep '86' >> /dev/null ) ; then
 	define_in_config_if_yes "$_ext_sse2" 'SCUMMVM_SSE2'
 	if test "$_ext_sse2" = yes ; then
-		append_var CXXFLAGS "-msse2"
+		append_var CXXFLAGS "-msse2 -msse"
 	fi
 	echo_n "Enabling x86/64 SSE2... "
 	echo "$_ext_sse2"
 	define_in_config_if_yes "$_ext_avx2" 'SCUMMVM_AVX2'
 	if test "$_ext_avx2" = yes ; then
-		append_var CXXFLAGS "-mavx2 -mavx"
+		append_var CXXFLAGS "-mavx2 -mavx -msse2 -msse"
+		define_in_config_if_yes "$_ext_avx2" 'SCUMMVM_SSE2'
 	fi
-	echo_n "Enabling x86/64 AVX2... "
+	echo_n "Enabling x86/64 AVX2 and SSE2... "
 	echo "$_ext_avx2"
 fi
 
-if [[ "$_host_cpu" =~ ^arm$ ]] || [[ "$_host_cpu" =~ ^aarch64$ ]] ; then
+if ( echo "$_host_cpu" | grep 'arm' >> /dev/null ) || ( echo "$_host_cpu" | grep 'aarch64' >> /dev/null ) ; then
 	define_in_config_if_yes "$_ext_neon" 'SCUMMVM_NEON'
 	echo_n "Enabling arm NEON... "
 	echo "$_ext_neon"


Commit: 90cb8cbe5f679197f5b471e0756922e4c4a73150
    https://github.com/scummvm/scummvm/commit/90cb8cbe5f679197f5b471e0756922e4c4a73150
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
GRAPHICS: Fix SSE2 and NEON bug in BlendBlit

Changed paths:
    graphics/blit-neon.cpp
    graphics/blit-sse2.cpp


diff --git a/graphics/blit-neon.cpp b/graphics/blit-neon.cpp
index 16ee17fd2b4..534133afd6d 100644
--- a/graphics/blit-neon.cpp
+++ b/graphics/blit-neon.cpp
@@ -194,7 +194,7 @@ struct AdditiveBlend {
 			srcr = vandq_u32(vaddq_u32(dstr, vshrq_n_u32(vmulq_u32(srcr, vmulq_u32(vmovq_n_u32(cr), ina)), BlendBlit::kRModShift - 16)), vmovq_n_u32(BlendBlit::kRModMask));
 
     	    src = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
-    	    src = vorrq_u32(src, vorrq_u32(srcb, vorrq_u32(srcg, srcb)));
+    	    src = vorrq_u32(src, vorrq_u32(srcb, vorrq_u32(srcg, srcr)));
     	} else if (alphamod) {
     	    uint32x4_t srcg = vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask));
     	    uint32x4_t srcrb = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
diff --git a/graphics/blit-sse2.cpp b/graphics/blit-sse2.cpp
index 315d479f154..8c0576febfc 100644
--- a/graphics/blit-sse2.cpp
+++ b/graphics/blit-sse2.cpp
@@ -200,7 +200,7 @@ struct AdditiveBlend {
 			srcr = _mm_and_si128(_mm_add_epi32(dstr, _mm_srli_epi32(sse2_mul32(srcr, sse2_mul32(_mm_set1_epi32(cr), ina)), BlendBlit::kRModShift - 16)), _mm_set1_epi32(BlendBlit::kRModMask));
 
     	    src = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
-    	    src = _mm_or_si128(src, _mm_or_si128(srcb, _mm_or_si128(srcg, srcb)));
+    	    src = _mm_or_si128(src, _mm_or_si128(srcb, _mm_or_si128(srcg, srcr)));
     	} else if (alphamod) {
     	    __m128i srcg = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask));
     	    __m128i srcrb = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
@@ -270,6 +270,11 @@ struct SubtractiveBlend {
 class BlendBlitImpl {
 
 public:
+#ifdef SCUMMVM_AVX2
+template<template <bool DOSCALE, bool RGBMOD, bool ALPHAMOD> class PixelFunc, bool doscale, bool rgbmod, bool alphamod, bool coloradd1, bool loaddst>
+static inline void blitInnerLoopAVX2(BlendBlit::Args &args);
+#endif
+
 template<template <bool DOSCALE, bool RGBMOD, bool ALPHAMOD> class PixelFunc, bool doscale, bool rgbmod, bool alphamod, bool coloradd1, bool loaddst>
 static inline void blitInnerLoop(BlendBlit::Args &args) {
 	const byte *in;


Commit: 39f7202473e4c3a3621f4457f0d14015a4645d1b
    https://github.com/scummvm/scummvm/commit/39f7202473e4c3a3621f4457f0d14015a4645d1b
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
GRAPHICS: Add AVX2 support for BlendBlit

Changed paths:
  A graphics/blit-avx2.cpp
    graphics/blit-alpha.cpp
    graphics/blit.h


diff --git a/graphics/blit-alpha.cpp b/graphics/blit-alpha.cpp
index 1b4f70ab765..424dc4c3026 100644
--- a/graphics/blit-alpha.cpp
+++ b/graphics/blit-alpha.cpp
@@ -25,6 +25,7 @@
 
 #include "graphics/blit-neon.cpp"
 #include "graphics/blit-sse2.cpp"
+#include "graphics/blit-avx2.cpp"
 
 namespace Graphics {
 
@@ -544,6 +545,9 @@ void BlendBlit::blit(byte *dst, const byte *src,
 #endif
 #ifdef SCUMMVM_SSE2
 	if (g_system->hasFeature(OSystem::kFeatureSSE2)) blitFunc = blitSSE2;
+#endif
+#ifdef SCUMMVM_AVX2
+	if (g_system->hasFeature(OSystem::kFeatureAVX2)) blitFunc = blitAVX2;
 #endif
 	}
 	
@@ -679,5 +683,8 @@ BLIT_FUNC(NEON)
 #ifdef SCUMMVM_SSE2
 BLIT_FUNC(SSE2)
 #endif
+#ifdef SCUMMVM_AVX2
+BLIT_FUNC(AVX2)
+#endif
 
 } // End of namespace Graphics
diff --git a/graphics/blit-avx2.cpp b/graphics/blit-avx2.cpp
new file mode 100644
index 00000000000..13bf6561a5b
--- /dev/null
+++ b/graphics/blit-avx2.cpp
@@ -0,0 +1,370 @@
+/* ScummVM - Graphic Adventure Engine
+ *
+ * ScummVM is the legal property of its developers, whose names
+ * are too numerous to list here. Please refer to the COPYRIGHT
+ * file distributed with this source distribution.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifdef SCUMMVM_AVX2
+#include <immintrin.h>
+
+#include "graphics/blit.h"
+#include "graphics/pixelformat.h"
+
+namespace Graphics {
+
+// Pixel functor for alpha blending.
+// simd() works on a __m256i treated as packed 32-bit pixels; normal() handles
+// a single pixel through byte pointers.
+template<bool doscale, bool rgbmod, bool alphamod>
+struct AlphaBlendAVX2 {
+	// Blends src over dst lane by lane and returns the result.
+	// Lanes whose effective alpha is zero return dst unchanged.
+	// ca is only used when alphamod is set; cr/cg/cb only when rgbmod is set.
+	// The flip parameter is not read in this body.
+	static inline __m256i simd(__m256i src, __m256i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+	    __m256i ina;
+	    if (alphamod)
+			ina = _mm256_srli_epi32(_mm256_mullo_epi16(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask)), _mm256_set1_epi32(ca)), 8);
+	    else
+			ina = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
+	    // All-ones in every lane whose alpha is zero; used at the end to pick dst for those lanes.
+	    __m256i alphaMask = _mm256_cmpeq_epi32(ina, _mm256_setzero_si256());
+	
+	    if (rgbmod) {
+	    	// Each channel is isolated so it can be scaled by its own colour modifier.
+	    	__m256i dstR = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+	    	__m256i dstG = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+	    	__m256i dstB = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+	    	__m256i srcR = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+	    	__m256i srcG = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+	    	__m256i srcB = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+
+			// dst channels are weighted by (255 - alpha) and shifted back into position.
+			dstR = _mm256_slli_epi32(_mm256_mullo_epi16(dstR, _mm256_sub_epi32(_mm256_set1_epi32(255), ina)), BlendBlit::kRModShift - 8);
+			dstG = _mm256_slli_epi32(_mm256_mullo_epi16(dstG, _mm256_sub_epi32(_mm256_set1_epi32(255), ina)), BlendBlit::kGModShift - 8);
+			dstB = _mm256_mullo_epi16(dstB, _mm256_sub_epi32(_mm256_set1_epi32(255), ina));
+			// src channels are weighted by alpha, then by the colour modifier, and added to dst.
+			srcR = _mm256_add_epi32(dstR, _mm256_slli_epi32(_mm256_mullo_epi16(_mm256_srli_epi32(_mm256_mullo_epi16(srcR, ina), 8), _mm256_set1_epi32(cr)), BlendBlit::kRModShift - 8));
+			srcG = _mm256_add_epi32(dstG, _mm256_slli_epi32(_mm256_mullo_epi16(_mm256_srli_epi32(_mm256_mullo_epi16(srcG, ina), 8), _mm256_set1_epi32(cg)), BlendBlit::kGModShift - 8));
+			srcB = _mm256_add_epi32(dstB, _mm256_mullo_epi16(_mm256_srli_epi32(_mm256_mullo_epi16(srcB, ina), 8), _mm256_set1_epi32(cb)));
+			// Recombine the channels; the alpha bits of the result are all set.
+			src = _mm256_or_si256(_mm256_and_si256(srcB, _mm256_set1_epi32(BlendBlit::kBModMask)), _mm256_set1_epi32(BlendBlit::kAModMask));
+			src = _mm256_or_si256(_mm256_and_si256(srcG, _mm256_set1_epi32(BlendBlit::kGModMask)), src);
+			src = _mm256_or_si256(_mm256_and_si256(srcR, _mm256_set1_epi32(BlendBlit::kRModMask)), src);
+	    } else {
+			// Without colour modifiers, red and blue are processed together in one register.
+			__m256i dstRB = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+			__m256i srcRB = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+			__m256i dstG = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+			__m256i srcG = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+
+			dstRB = _mm256_srli_epi32(_mm256_mullo_epi32(dstRB, _mm256_sub_epi32(_mm256_set1_epi32(255), ina)), 8);
+			dstG = _mm256_srli_epi32(_mm256_mullo_epi16(dstG, _mm256_sub_epi32(_mm256_set1_epi32(255), ina)), 8);
+			srcRB = _mm256_slli_epi32(_mm256_add_epi32(dstRB, _mm256_srli_epi32(_mm256_mullo_epi32(srcRB, ina), 8)), BlendBlit::kBModShift);
+			srcG = _mm256_slli_epi32(_mm256_add_epi32(dstG, _mm256_srli_epi32(_mm256_mullo_epi16(srcG, ina), 8)), BlendBlit::kGModShift);
+			// Recombine the channels; the alpha bits of the result are all set.
+			src = _mm256_or_si256(_mm256_and_si256(srcG, _mm256_set1_epi32(BlendBlit::kGModMask)), _mm256_set1_epi32(BlendBlit::kAModMask));
+			src = _mm256_or_si256(_mm256_and_si256(srcRB, _mm256_set1_epi32(BlendBlit::kBModMask | BlendBlit::kRModMask)), src);
+	    }
+
+		// Select dst where alpha was zero, the blended value everywhere else.
+		dst = _mm256_and_si256(alphaMask, dst);
+		src = _mm256_andnot_si256(alphaMask, src);
+	    return _mm256_or_si256(dst, src);
+	}
+
+	// Single-pixel version. It always applies ca, cr, cg and cb, regardless of
+	// the rgbmod/alphamod template flags, and leaves out untouched when the
+	// scaled alpha is zero. The flip parameter is not read in this body.
+	static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+		uint32 ina = in[BlendBlit::kAIndex] * ca >> 8;
+
+		if (ina != 0) {
+			uint outb = (out[BlendBlit::kBIndex] * (255 - ina) >> 8);
+			uint outg = (out[BlendBlit::kGIndex] * (255 - ina) >> 8);
+			uint outr = (out[BlendBlit::kRIndex] * (255 - ina) >> 8);
+
+			out[BlendBlit::kAIndex] = 255;
+			out[BlendBlit::kBIndex] = outb + (in[BlendBlit::kBIndex] * ina * cb >> 16);
+			out[BlendBlit::kGIndex] = outg + (in[BlendBlit::kGIndex] * ina * cg >> 16);
+			out[BlendBlit::kRIndex] = outr + (in[BlendBlit::kRIndex] * ina * cr >> 16);
+		}
+	}
+};
+
+// Pixel functor for multiplicative blending.
+// simd() works on a __m256i treated as packed 32-bit pixels; normal() handles
+// a single pixel through byte pointers.
+template<bool doscale, bool rgbmod, bool alphamod>
+struct MultiplyBlendAVX2 {
+	// Multiplies dst colour channels by the alpha-weighted src channels.
+	// Lanes whose effective alpha is zero return dst unchanged.
+	// ca is only used when alphamod is set; cr/cg/cb only when rgbmod is set.
+	// The flip parameter is not read in this body.
+	static inline __m256i simd(__m256i src, __m256i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+    	__m256i ina;
+	    if (alphamod)
+			ina = _mm256_srli_epi32(_mm256_mullo_epi16(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask)), _mm256_set1_epi32(ca)), 8);
+	    else
+			ina = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
+	    // All-ones in every lane whose alpha is zero; used at the end to pick dst for those lanes.
+	    __m256i alphaMask = _mm256_cmpeq_epi32(ina, _mm256_setzero_si256());
+
+    	if (rgbmod) {
+			// Each channel is isolated so it can be scaled by its own colour modifier.
+			__m256i srcb = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+			__m256i srcg = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+			__m256i srcr = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+			__m256i dstb = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+			__m256i dstg = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+			__m256i dstr = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+
+			// Per channel: dst * ((src * colour modifier * alpha) >> 16), moved back into position and masked.
+			srcb = _mm256_and_si256(_mm256_srli_epi32(_mm256_mullo_epi32(dstb, _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi16(srcb, _mm256_set1_epi32(cb)), ina), 16)), BlendBlit::kBModShift - 8), _mm256_set1_epi32(BlendBlit::kBModMask));
+			srcg = _mm256_and_si256(_mm256_srli_epi32(_mm256_mullo_epi32(dstg, _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi16(srcg, _mm256_set1_epi32(cg)), ina), 16)), BlendBlit::kGModShift - 8), _mm256_set1_epi32(BlendBlit::kGModMask));
+			srcr = _mm256_and_si256(_mm256_srli_epi32(_mm256_mullo_epi32(dstr, _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi16(srcr, _mm256_set1_epi32(cr)), ina), 16)), BlendBlit::kRModShift - 8), _mm256_set1_epi32(BlendBlit::kRModMask));
+
+			// Keep the source alpha bits and merge the three channels back in.
+			src = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
+			src = _mm256_or_si256(src, _mm256_or_si256(srcb, _mm256_or_si256(srcg, srcr)));
+    	} else {
+			// Without colour modifiers, red and blue are processed together in one register.
+			__m256i srcg = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask));
+			__m256i srcrb = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+			__m256i dstg = _mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask));
+			__m256i dstrb = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+    	    srcg = _mm256_and_si256(_mm256_srli_epi32(_mm256_mullo_epi32(dstg, _mm256_srli_epi32(_mm256_mullo_epi32(srcg, ina), 8)), 8), _mm256_set1_epi32(BlendBlit::kGModMask));
+    	    srcrb = _mm256_and_si256(_mm256_mullo_epi32(dstrb, _mm256_srli_epi32(_mm256_mullo_epi32(srcrb, ina), 8)), _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask));
+    	    // Keep the source alpha bits and merge the channels back in.
+    	    src = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
+    	    src = _mm256_or_si256(src, _mm256_or_si256(srcrb, srcg));
+    	}
+
+    	// Select dst where alpha was zero, the blended value everywhere else.
+    	dst = _mm256_and_si256(alphaMask, dst);
+    	src = _mm256_andnot_si256(alphaMask, src);
+    	return _mm256_or_si256(dst, src);
+	}
+
+	// Single-pixel version. It always applies ca, cr, cg and cb, regardless of
+	// the rgbmod/alphamod template flags. The alpha byte of out is not written,
+	// and out is left untouched when the scaled alpha is zero.
+	// The flip parameter is not read in this body.
+	static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+		uint32 ina = in[BlendBlit::kAIndex] * ca >> 8;
+
+		if (ina != 0) {
+			out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] * ((in[BlendBlit::kBIndex] * cb * ina) >> 16) >> 8;
+			out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] * ((in[BlendBlit::kGIndex] * cg * ina) >> 16) >> 8;
+			out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] * ((in[BlendBlit::kRIndex] * cr * ina) >> 16) >> 8;
+		}
+	}
+};
+
+// Pixel functor for opaque copies: the source pixel replaces the destination
+// and its alpha bits are forced on. Only src/in is read; dst, flip and the
+// colour modifiers are ignored.
+template<bool doscale, bool rgbmod, bool alphamod>
+struct OpaqueBlendAVX2 {
+	// Packed version: returns src with every lane's alpha bits set.
+	static inline __m256i simd(__m256i src, __m256i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+		const __m256i fullAlpha = _mm256_set1_epi32(BlendBlit::kAModMask);
+		return _mm256_or_si256(src, fullAlpha);
+	}
+
+	// Single-pixel version: copies one 32-bit pixel from in to out with the alpha bits set.
+	static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+		const uint32 srcPixel = *(const uint32 *)in;
+		*(uint32 *)out = srcPixel | BlendBlit::kAModMask;
+	}
+};
+
+// Pixel functor for binary (on/off) transparency: a source pixel with any
+// non-zero alpha is copied with its alpha bits forced on, while a source pixel
+// with zero alpha leaves the destination as it was. flip and the colour
+// modifiers are ignored.
+template<bool doscale, bool rgbmod, bool alphamod>
+struct BinaryBlendAVX2 {
+	// Packed version: picks, per 32-bit lane, either dst or the opaque source pixel.
+	static inline __m256i simd(__m256i src, __m256i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+		const __m256i alphaBits = _mm256_set1_epi32(BlendBlit::kAModMask);
+		// All-ones in lanes where the source alpha is exactly zero.
+		const __m256i transparent = _mm256_cmpeq_epi32(_mm256_and_si256(src, alphaBits), _mm256_setzero_si256());
+		const __m256i keptDst = _mm256_and_si256(dst, transparent);
+		const __m256i opaqueSrc = _mm256_andnot_si256(transparent, _mm256_or_si256(src, alphaBits));
+		return _mm256_or_si256(opaqueSrc, keptDst);
+	}
+
+	// Single-pixel version of the same selection.
+	static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+		// Any alpha value other than exactly 0 counts as opaque here.
+		if (in[BlendBlit::kAIndex] == 0)
+			return;
+
+		*(uint32 *)out = *(const uint32 *)in;
+		out[BlendBlit::kAIndex] = 0xFF;
+	}
+};
+
+template<bool doscale, bool rgbmod, bool alphamod>
+struct AdditiveBlendAVX2 {
+	static inline __m256i simd(__m256i src, __m256i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+    	__m256i ina;
+    	if (alphamod)
+    	    ina = _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask)), _mm256_set1_epi32(ca)), 8);
+    	else
+    	    ina = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
+    	__m256i alphaMask = _mm256_cmpeq_epi32(ina, _mm256_set1_epi32(0));
+
+    	if (rgbmod) {
+    	    __m256i srcb = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kBModMask));
+    	    __m256i srcg = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+    	    __m256i srcr = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+    	    __m256i dstb = _mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kBModMask));
+    	    __m256i dstg = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+    	    __m256i dstr = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+
+			srcb = _mm256_and_si256(_mm256_add_epi32(dstb, _mm256_srli_epi32(_mm256_mullo_epi32(srcb, _mm256_mullo_epi32(_mm256_set1_epi32(cb), ina)), 16)), _mm256_set1_epi32(BlendBlit::kBModMask));
+			srcg = _mm256_and_si256(_mm256_add_epi32(dstg, _mm256_mullo_epi32(srcg, _mm256_mullo_epi32(_mm256_set1_epi32(cg), ina))), _mm256_set1_epi32(BlendBlit::kGModMask));
+			srcr = _mm256_and_si256(_mm256_add_epi32(dstr, _mm256_srli_epi32(_mm256_mullo_epi32(srcr, _mm256_mullo_epi32(_mm256_set1_epi32(cr), ina)), BlendBlit::kRModShift - 16)), _mm256_set1_epi32(BlendBlit::kRModMask));
+
+    	    src = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
+    	    src = _mm256_or_si256(src, _mm256_or_si256(srcb, _mm256_or_si256(srcg, srcb)));
+    	} else if (alphamod) {
+    	    __m256i srcg = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask));
+    	    __m256i srcrb = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+    	    __m256i dstg = _mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask));
+    	    __m256i dstrb = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+
+			srcg = _mm256_and_si256(_mm256_add_epi32(dstg, _mm256_srli_epi32(_mm256_mullo_epi32(srcg, ina), 8)), _mm256_set1_epi32(BlendBlit::kGModMask));
+			srcrb = _mm256_and_si256(_mm256_add_epi32(dstrb, _mm256_mullo_epi32(srcrb, ina)), _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask));
+
+    	    src = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
+    	    src = _mm256_or_si256(src, _mm256_or_si256(srcrb, srcg));
+    	} else {
+    	    __m256i srcg = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask));
+    	    __m256i srcrb = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+    	    __m256i dstg = _mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask));
+    	    __m256i dstrb = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+
+			srcg = _mm256_and_si256(_mm256_add_epi32(dstg, srcg), _mm256_set1_epi32(BlendBlit::kGModMask));
+			srcrb = _mm256_and_si256(_mm256_slli_epi32(_mm256_add_epi32(dstrb, srcrb), 8), _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask));
+
+    	    src = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
+    	    src = _mm256_or_si256(src, _mm256_or_si256(srcrb, srcg));
+		}
+
+    	dst = _mm256_and_si256(alphaMask, dst);
+    	src = _mm256_andnot_si256(alphaMask, src);
+    	return _mm256_or_si256(dst, src);
+	}
+
+	/**
+	 * Scalar (one pixel at a time) additive blend, used for the pixels that
+	 * the 8-wide SIMD loop does not cover.
+	 *
+	 * @param in   pointer to the source pixel
+	 * @param out  pointer to the destination pixel, updated in place
+	 * @param flip unused here
+	 * @param ca   alpha modulation factor
+	 * @param cr   red modulation factor
+	 * @param cg   green modulation factor
+	 * @param cb   blue modulation factor
+	 */
+	static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+		// Source alpha scaled by the alpha modulation factor.
+		const uint32 srcAlpha = in[BlendBlit::kAIndex] * ca >> 8;
+
+		// Nothing is added for a source pixel whose modulated alpha is zero.
+		if (srcAlpha == 0)
+			return;
+
+		// No explicit clamp is applied; the sum is simply stored back into the channel.
+		out[BlendBlit::kBIndex] += (in[BlendBlit::kBIndex] * cb * srcAlpha) >> 16;
+		out[BlendBlit::kGIndex] += (in[BlendBlit::kGIndex] * cg * srcAlpha) >> 16;
+		out[BlendBlit::kRIndex] += (in[BlendBlit::kRIndex] * cr * srcAlpha) >> 16;
+	}
+};
+
+/**
+ * Pixel functor implementing subtractive blending for the AVX2 blitter.
+ *
+ * Per colour channel the result is
+ *     dst - ((src * c * dst * srcAlpha) >> 24)
+ * clamped at 0, and the alpha channel of the result is always fully set.
+ * The template parameters and the flip/ca arguments are not used by this
+ * functor; they are present so that it matches the PixelFunc shape that
+ * BlendBlitImpl::blitInnerLoopAVX2 calls.
+ */
+template<bool doscale, bool rgbmod, bool alphamod>
+struct SubtractiveBlendAVX2 {
+	/**
+	 * Blends eight packed 32-bit pixels at once.
+	 *
+	 * @param src eight source pixels
+	 * @param dst eight destination pixels
+	 * @param cr  red modulation factor
+	 * @param cg  green modulation factor
+	 * @param cb  blue modulation factor
+	 * @return the eight blended pixels
+	 */
+	static inline __m256i simd(__m256i src, __m256i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+		const __m256i zero = _mm256_setzero_si256();
+
+		// Isolate every channel and shift it down by its k?ModShift.
+		// NOTE(review): ina is masked but not shifted, which presumably relies
+		// on kAModShift being 0 - confirm against blit.h.
+		const __m256i ina = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
+		__m256i srcb = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+		__m256i srcg = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+		__m256i srcr = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+		const __m256i dstb = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+		const __m256i dstg = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+		const __m256i dstr = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+
+		// Amount to subtract per channel: (src * c * dst * ina) >> 24.
+		// The shift is logical, so the product is treated as unsigned.
+		const __m256i subb = _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(srcb, _mm256_set1_epi32(cb)), _mm256_mullo_epi32(dstb, ina)), 24);
+		const __m256i subg = _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(srcg, _mm256_set1_epi32(cg)), _mm256_mullo_epi32(dstg, ina)), 24);
+		const __m256i subr = _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(srcr, _mm256_set1_epi32(cr)), _mm256_mullo_epi32(dstr, ina)), 24);
+
+		// Clamp dst - sub at zero. The lanes are 32 bits wide, so the 32-bit
+		// signed max is the matching operation (the 16-bit one only gave the
+		// right answer because the difference always fits in 9 bits).
+		srcb = _mm256_and_si256(_mm256_slli_epi32(_mm256_max_epi32(_mm256_sub_epi32(dstb, subb), zero), BlendBlit::kBModShift), _mm256_set1_epi32(BlendBlit::kBModMask));
+		srcg = _mm256_and_si256(_mm256_slli_epi32(_mm256_max_epi32(_mm256_sub_epi32(dstg, subg), zero), BlendBlit::kGModShift), _mm256_set1_epi32(BlendBlit::kGModMask));
+		srcr = _mm256_and_si256(_mm256_slli_epi32(_mm256_max_epi32(_mm256_sub_epi32(dstr, subr), zero), BlendBlit::kRModShift), _mm256_set1_epi32(BlendBlit::kRModMask));
+
+		// Recombine the channels and force the alpha bits on.
+		return _mm256_or_si256(_mm256_set1_epi32(BlendBlit::kAModMask), _mm256_or_si256(srcb, _mm256_or_si256(srcg, srcr)));
+	}
+
+	/**
+	 * Scalar (one pixel at a time) version of simd(), used for the pixels
+	 * that the 8-wide loop does not cover.
+	 *
+	 * @param in  pointer to the source pixel
+	 * @param out pointer to the destination pixel, updated in place
+	 * @param cr  red modulation factor
+	 * @param cg  green modulation factor
+	 * @param cb  blue modulation factor
+	 */
+	static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+		out[BlendBlit::kAIndex] = 255;
+		const uint32 ina = in[BlendBlit::kAIndex];
+
+		// The four-factor product can exceed INT_MAX (e.g. 255^4), so it is
+		// evaluated in uint32 to avoid signed overflow and to match the
+		// logical shift used by simd().
+		out[BlendBlit::kBIndex] = MAX<int32>(out[BlendBlit::kBIndex] - (int32)(((uint32)in[BlendBlit::kBIndex] * cb * out[BlendBlit::kBIndex] * ina) >> 24), 0);
+		out[BlendBlit::kGIndex] = MAX<int32>(out[BlendBlit::kGIndex] - (int32)(((uint32)in[BlendBlit::kGIndex] * cg * out[BlendBlit::kGIndex] * ina) >> 24), 0);
+		out[BlendBlit::kRIndex] = MAX<int32>(out[BlendBlit::kRIndex] - (int32)(((uint32)in[BlendBlit::kRIndex] * cr * out[BlendBlit::kRIndex] * ina) >> 24), 0);
+	}
+};
+
+/**
+ * Shared row/column loop for all AVX2 blend modes.
+ *
+ * Walks args.height rows of args.width pixels. Each row is processed eight
+ * pixels at a time through PixelFunc::simd(), and whatever is left over is
+ * processed one pixel at a time through PixelFunc::normal().
+ * args.ino and args.outo are advanced as the rows are consumed.
+ *
+ * @tparam PixelFunc functor providing the simd() and normal() blend steps
+ * @tparam doscale   read source pixels through the scaleX/scaleY counters
+ *                   instead of walking the source linearly
+ * @tparam rgbmod    take the colour factors from args.color
+ * @tparam alphamod  take the alpha factor from args.color
+ * @tparam coloradd1 use 256 instead of 255 for a "full" colour factor
+ * @tparam loaddst   load the destination pixels before calling simd()
+ * @param args       blit arguments
+ */
+template<template <bool DOSCALE, bool RGBMOD, bool ALPHAMOD> class PixelFunc, bool doscale, bool rgbmod, bool alphamod, bool coloradd1, bool loaddst>
+void BlendBlitImpl::blitInnerLoopAVX2(BlendBlit::Args &args) {
+	const byte *in;
+	byte *out;
+
+	// Split args.color into its modulation factors.
+	const byte rawcr = (args.color >> BlendBlit::kRModShift) & 0xFF;
+	const byte rawcg = (args.color >> BlendBlit::kGModShift) & 0xFF;
+	const byte rawcb = (args.color >> BlendBlit::kBModShift) & 0xFF;
+	const byte ca = alphamod ? ((args.color >> BlendBlit::kAModShift) & 0xFF) : 255;
+	// NOTE(review): with coloradd1 these can be 256, but the PixelFunc
+	// functions visible in this file take them as byte - confirm that no
+	// coloradd1 user relies on the value 256 surviving the call.
+	const uint32 cr = coloradd1 ? (rgbmod   ? (rawcr == 255 ? 256 : rawcr) : 256) : (rgbmod   ? rawcr : 255);
+	const uint32 cg = coloradd1 ? (rgbmod   ? (rawcg == 255 ? 256 : rawcg) : 256) : (rgbmod   ? rawcg : 255);
+	const uint32 cb = coloradd1 ? (rgbmod   ? (rawcb == 255 ? 256 : rawcb) : 256) : (rgbmod   ? rawcb : 255);
+
+	int scaleXCtr, scaleYCtr = 0;
+	const byte *inBase;
+
+	// For an unscaled horizontally flipped blit, bias the source pointer back
+	// by 7 pixels so that each 8-pixel load starts 7 pixels before the
+	// current one (presumably args.inStep is negative in this case - confirm).
+    if (!doscale && (args.flipping & FLIP_H)) args.ino -= 4 * 7;
+
+	for (uint32 i = 0; i < args.height; i++) {
+		if (doscale) {
+			// Pick the source row for this destination row.
+			inBase = args.ino + scaleYCtr / BlendBlit::SCALE_THRESHOLD * args.inoStep;
+			scaleXCtr = 0;
+		} else {
+			in = args.ino;
+		}
+		out = args.outo;
+
+		// SIMD part: eight pixels per iteration.
+		uint32 j = 0;
+		for (; j + 8 <= args.width; j += 8) {
+    		__m256i dstPixels, srcPixels;
+			// NOTE(review): dstPixels is passed to simd() uninitialized when
+			// loaddst is false.
+			if (loaddst) dstPixels = _mm256_loadu_si256((const __m256i *)out);
+    		if (!doscale) {
+    		    srcPixels = _mm256_loadu_si256((const __m256i *)in);
+    		} else {
+				// Gather eight individually addressed source pixels.
+				srcPixels = _mm256_setr_epi32(
+					*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 0) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
+					*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 1) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
+					*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 2) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
+					*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 3) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
+					*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 4) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
+					*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 5) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
+					*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 6) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
+					*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 7) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep)
+				);
+				scaleXCtr += args.scaleX * 8;
+    		}
+    		if (!doscale && (args.flipping & FLIP_H)) {
+				// Reverse the eight pixels: reverse within each 128-bit half,
+				// then swap the two halves.
+				srcPixels = _mm256_shuffle_epi32(srcPixels, _MM_SHUFFLE(0, 1, 2, 3));
+				srcPixels = _mm256_permute2x128_si256(srcPixels, srcPixels, 0x01);
+    		}
+			{
+				const __m256i res = PixelFunc<doscale, rgbmod, alphamod>::simd(srcPixels, dstPixels, args.flipping & FLIP_H, ca, cr, cg, cb);
+				_mm256_storeu_si256((__m256i *)out, res);
+			}
+			if (!doscale) in += (ptrdiff_t)args.inStep * 8;
+			out += 4ULL * 8;
+		}
+		// Undo the 7-pixel bias before the scalar tail reads single pixels.
+		if (!doscale && (args.flipping & FLIP_H)) in += 4 * 7;
+		// Scalar part: the remaining (width % 8) pixels.
+		for (; j < args.width; j++) {
+			if (doscale) {
+				in = inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep;
+			}
+
+			PixelFunc<doscale, rgbmod, alphamod>::normal(in, out, args.flipping & FLIP_H, ca, cr, cg, cb);
+            
+			if (doscale)
+				scaleXCtr += args.scaleX;
+			else
+				in += args.inStep;
+			out += 4;
+		}
+		// Advance to the next row.
+		if (doscale)
+			scaleYCtr += args.scaleY;
+		else
+			args.ino += args.inoStep;
+		args.outo += args.dstPitch;
+	}
+}
+
+// AVX2 entry point for alpha blending: runs the shared inner loop with the
+// AlphaBlendAVX2 pixel functor (coloradd1 = false, loaddst = true).
+template<bool doscale, bool rgbmod, bool alphamod>
+void BlendBlit::doBlitAlphaBlendLogicAVX2(Args &args) {
+	BlendBlitImpl::blitInnerLoopAVX2<AlphaBlendAVX2, doscale, rgbmod, alphamod, false, true>(args);
+}
+// AVX2 entry point for subtractive blending: runs the shared inner loop with
+// the SubtractiveBlendAVX2 pixel functor. alphamod is fixed to false here
+// (coloradd1 = false, loaddst = true).
+template<bool doscale, bool rgbmod>
+void BlendBlit::doBlitSubtractiveBlendLogicAVX2(Args &args) {
+	BlendBlitImpl::blitInnerLoopAVX2<SubtractiveBlendAVX2, doscale, rgbmod, false, false, true>(args);
+}
+// AVX2 entry point for additive blending: runs the shared inner loop with the
+// AdditiveBlendAVX2 pixel functor (coloradd1 = false, loaddst = true).
+template<bool doscale, bool rgbmod, bool alphamod>
+void BlendBlit::doBlitAdditiveBlendLogicAVX2(Args &args) {
+	BlendBlitImpl::blitInnerLoopAVX2<AdditiveBlendAVX2, doscale, rgbmod, alphamod, false, true>(args);
+}
+// AVX2 entry point for opaque blitting: runs the shared inner loop with the
+// OpaqueBlendAVX2 pixel functor. rgbmod and alphamod are fixed to false here
+// (coloradd1 = false, loaddst = true).
+template<bool doscale>
+void BlendBlit::doBlitOpaqueBlendLogicAVX2(Args &args) {
+	BlendBlitImpl::blitInnerLoopAVX2<OpaqueBlendAVX2, doscale, false, false, false, true>(args);
+}
+// AVX2 entry point for binary-alpha blitting: runs the shared inner loop with
+// the BinaryBlendAVX2 pixel functor. rgbmod and alphamod are fixed to false
+// here (coloradd1 = false, loaddst = true).
+template<bool doscale>
+void BlendBlit::doBlitBinaryBlendLogicAVX2(Args &args) {
+	BlendBlitImpl::blitInnerLoopAVX2<BinaryBlendAVX2, doscale, false, false, false, true>(args);
+}
+// AVX2 entry point for multiply blending: runs the shared inner loop with the
+// MultiplyBlendAVX2 pixel functor (coloradd1 = false, loaddst = true).
+template<bool doscale, bool rgbmod, bool alphamod>
+void BlendBlit::doBlitMultiplyBlendLogicAVX2(Args &args) {
+	BlendBlitImpl::blitInnerLoopAVX2<MultiplyBlendAVX2, doscale, rgbmod, alphamod, false, true>(args);
+}
+
+} // End of namespace Graphics
+
+#endif // SCUMMVM_AVX2
diff --git a/graphics/blit.h b/graphics/blit.h
index 1887bbaeb4c..f27d3473125 100644
--- a/graphics/blit.h
+++ b/graphics/blit.h
@@ -240,6 +240,9 @@ LOGIC_FUNCS_EXT(NEON)
 #ifdef SCUMMVM_SSE2
 LOGIC_FUNCS_EXT(SSE2)
 #endif
+#ifdef SCUMMVM_AVX2
+LOGIC_FUNCS_EXT(AVX2)
+#endif
 LOGIC_FUNCS_EXT(Generic)
 #undef LOGIC_FUNCS_EXT
 


Commit: e5bc2d696d3f6055d87e438fa39a1640e2b775a5
    https://github.com/scummvm/scummvm/commit/e5bc2d696d3f6055d87e438fa39a1640e2b775a5
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
TEST: Test for AVX2 BlendBlit

Changed paths:
    test/image/blending.h


diff --git a/test/image/blending.h b/test/image/blending.h
index f40cd5282e9..4b85ae9fe58 100644
--- a/test/image/blending.h
+++ b/test/image/blending.h
@@ -891,12 +891,15 @@ static bool areSurfacesEqual(const Graphics::Surface *a, const Graphics::Surface
 class BlendBlitUnfilteredTestSuite : public CxxTest::TestSuite {
 public:
 	void test_blend_speed() {
+		Graphics::BlendBlit::blitFunc = Graphics::BlendBlit::blitGeneric;
 #ifdef SCUMMVM_NEON
 		Graphics::BlendBlit::blitFunc = Graphics::BlendBlit::blitNEON;
-#elif SCUMMVM_SSE2
+#endif
+#ifdef SCUMMVM_SSE2
 		Graphics::BlendBlit::blitFunc = Graphics::BlendBlit::blitSSE2;
-#else
-		Graphics::BlendBlit::blitFunc = Graphics::BlendBlit::blitGeneric;
+#endif
+#ifdef SCUMMVM_AVX2
+		Graphics::BlendBlit::blitFunc = Graphics::BlendBlit::blitAVX2;
 #endif
 	    Graphics::Surface baseSurface, destSurface;
 	    baseSurface.create(103, 103, OldTransparentSurface::OldTransparentSurface::getSupportedPixelFormat());
@@ -918,8 +921,8 @@ public:
 		double oldTimeScaled = 0.0, newTimeScaled = 0.0, genericTimeScaled = 0.0;
 		const int iters = 2500;
 
-        for (int blendMode = 0; blendMode < Graphics::NUM_BLEND_MODES; blendMode++) {
-        for (int alphaType = 0; alphaType <= Graphics::ALPHA_FULL; alphaType++) {
+        for (int blendMode = 0; blendMode < 1; blendMode++) {
+        for (int alphaType = 0; alphaType <= 1; alphaType++) {
         for (int flipping = 0; flipping <= 3; flipping++) {
 		for (uint32 color = 0xffffffff; color != 0; color = (color == 0xffffffff ? 0x7f7f7f7f : 0)) {
             oldSurfDest.fillRect(Common::Rect(0, 0, oldSurfDest.w, oldSurfDest.h), oldSurfDest.format.ARGBToColor(255, 255, 255, 255));


Commit: a18332ab36ab83b2595f20621747605f73beea84
    https://github.com/scummvm/scummvm/commit/a18332ab36ab83b2595f20621747605f73beea84
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
JANITORIAL: Touch up BlendBlit comments

Changed paths:
    graphics/blit-alpha.cpp
    graphics/blit.h
    graphics/managed_surface.h
    test/image/blending.h


diff --git a/graphics/blit-alpha.cpp b/graphics/blit-alpha.cpp
index 424dc4c3026..e20a3a651da 100644
--- a/graphics/blit-alpha.cpp
+++ b/graphics/blit-alpha.cpp
@@ -287,8 +287,8 @@ void BlendBlit::doBlitAlphaBlendLogicGeneric(Args &args) {
 
 			uint32 ina = in[kAIndex] * ca >> 8;
 
-			if (rgbmod) {
-				if (ina != 0) {
+			if (ina != 0) {
+				if (rgbmod) {
 					const uint outb = (out[kBIndex] * (255 - ina) >> 8);
 					const uint outg = (out[kGIndex] * (255 - ina) >> 8);
 					const uint outr = (out[kRIndex] * (255 - ina) >> 8);
@@ -297,26 +297,12 @@ void BlendBlit::doBlitAlphaBlendLogicGeneric(Args &args) {
 					out[kBIndex] = outb + (in[kBIndex] * ina * cb >> 16);
 					out[kGIndex] = outg + (in[kGIndex] * ina * cg >> 16);
 					out[kRIndex] = outr + (in[kRIndex] * ina * cr >> 16);
-				}
-			} else {
-				if (ina != 0) {
-					// Runs faster on newer hardware (doesn't do single byte manip)
-					const uint32 in32 = *(const uint32 *)in;
-					const uint32 out32 = *(const uint32 *)out;
-					const uint32 rb = (in32 & (kRModMask | kBModMask)) >> 8;
-					const uint32 g = in32 & kGModMask;
-					const uint32 dstrb = (out32 & (kRModMask | kBModMask)) >> 8;
-					const uint32 dstg = out32 & kGModMask;
-					*(uint32 *)out = kAModMask |
-						((dstrb * (255 - ina) + rb * ina) & (kRModMask | kBModMask)) |
-						(((dstg * (255 - ina) + g * ina) >> 8) & kGModMask);
-
-					// I think this code will run faster on older hardware
-					// TODO maybe?: Put #ifdef to use on older hardware
-					//out[kAIndex] = 255;
-					//out[kBIndex] = (out[kBIndex] * (255 - ina) + in[kBIndex] * ina) >> 8;
-					//out[kGIndex] = (out[kGIndex] * (255 - ina) + in[kGIndex] * ina) >> 8;
-					//out[kRIndex] = (out[kRIndex] * (255 - ina) + in[kRIndex] * ina) >> 8;
+				} else {
+					out[kAIndex] = 255;
+					out[kBIndex] = (out[kBIndex] * (255 - ina) + in[kBIndex] * ina) >> 8;
+					out[kGIndex] = (out[kGIndex] * (255 - ina) + in[kGIndex] * ina) >> 8;
+					out[kRIndex] = (out[kRIndex] * (255 - ina) + in[kRIndex] * ina) >> 8;
+					
 				}
 			}
 
@@ -460,10 +446,7 @@ void BlendBlit::doBlitOpaqueBlendLogicGeneric(Args &args) {
 		if (doscale) {
 			for (uint32 j = 0; j < args.width; j++) {
 				in = inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep;
-
-				memcpy(out, in, 4);
-				out[kAIndex] = 0xFF;
-
+				*(uint32 *)out = *(const uint32 *)in | kAModMask;
 				scaleXCtr += args.scaleX;
 				out += 4;
 			}
@@ -503,13 +486,13 @@ void BlendBlit::doBlitBinaryBlendLogicGeneric(Args &args) {
 			if (doscale) {
 				in = inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep;
 			}
-			uint32 pix = *(const uint32 *)in;
-			int a = in[kAIndex];
 
-			if (a != 0) {   // Full opacity (Any value not exactly 0 is Opaque here)
-				*(uint32 *)out = pix;
-				out[kAIndex] = 0xFF;
-			}
+			uint32 pix = *(const uint32 *)in, pixout = *(const uint32 *)out;
+			uint32 mask = (pix & kAModMask) ? 0xffffffff : 0;
+    		pixout &= ~mask;
+    		pix = (pix | kAModMask) & mask;
+    		*(uint32 *)out = pixout | pix;
+			
 			if (doscale)
 				scaleXCtr += args.scaleX;
 			else
@@ -528,6 +511,9 @@ void BlendBlit::doBlitBinaryBlendLogicGeneric(Args &args) {
 BlendBlit::BlitFunc BlendBlit::blitFunc = nullptr;
 
 // Only blits to and from 32bpp images
+// So this function is just here to jump to whatever function is in
+// BlendBlit::blitFunc. This way, we can detect at runtime whether or not
+// the cpu has certain SIMD feature enabled or not.
 void BlendBlit::blit(byte *dst, const byte *src,
 					 const uint dstPitch, const uint srcPitch,
 					 const int posX, const int posY,
@@ -537,17 +523,19 @@ void BlendBlit::blit(byte *dst, const byte *src,
 					 const TSpriteBlendMode blendMode,
 					 const AlphaType alphaType) {
 	if (width == 0 || height == 0) return;
+
+	// If no function has been selected yet, detect and select
 	if (!blitFunc) {
-	// Get the correct blit function
-	blitFunc = blitGeneric;
+		// Get the correct blit function
+		blitFunc = blitGeneric;
 #ifdef SCUMMVM_NEON
-	if (g_system->hasFeature(OSystem::kFeatureNEON)) blitFunc = blitNEON;
+		if (g_system->hasFeature(OSystem::kFeatureNEON)) blitFunc = blitNEON;
 #endif
 #ifdef SCUMMVM_SSE2
-	if (g_system->hasFeature(OSystem::kFeatureSSE2)) blitFunc = blitSSE2;
+		if (g_system->hasFeature(OSystem::kFeatureSSE2)) blitFunc = blitSSE2;
 #endif
 #ifdef SCUMMVM_AVX2
-	if (g_system->hasFeature(OSystem::kFeatureAVX2)) blitFunc = blitAVX2;
+		if (g_system->hasFeature(OSystem::kFeatureAVX2)) blitFunc = blitAVX2;
 #endif
 	}
 	
@@ -555,7 +543,8 @@ void BlendBlit::blit(byte *dst, const byte *src,
 	blitFunc(args, blendMode, alphaType);
 }
 
-// Let me know if there is a way to do function pointer to templated functions
+// This is just a macro to expand it because it's a pretty simple function where
+// readability doesn't matter too much and macros tend to work faster and better than functors
 #define BLIT_FUNC(ext) \
 	void BlendBlit::blit##ext(Args &args, const TSpriteBlendMode &blendMode, const AlphaType &alphaType) { \
 		bool rgbmod   = ((args.color & kRGBModMask) != kRGBModMask); \
diff --git a/graphics/blit.h b/graphics/blit.h
index f27d3473125..811eda3dd55 100644
--- a/graphics/blit.h
+++ b/graphics/blit.h
@@ -294,14 +294,21 @@ public:
 
 	/**
 	 * Optimized version of doBlit to be used with alpha blended blitting
-	 * @param ino a pointer to the input surface
-	 * @param outo a pointer to the output surface
+	 * NOTE: Can only be used with BlendBlit::getSupportedPixelFormat format
+	 * @param dst a pointer to the destination buffer (can be offset by pixels)
+	 * @param src a pointer to the source buffer (can be offset by pixels)
+	 * @param dstPitch destination pitch
+	 * @param srcPitch source pitch
+	 * @param posX where src will be blitted to (onto dest)
+	 * @param posY where src will be blitted to (onto dest)
 	 * @param width width of the input surface
 	 * @param height height of the input surface
-	 * @param pitch pitch of the output surface - that is, width in bytes of every row, usually bpp * width of the TARGET surface (the area we are blitting to might be smaller, do the math)
-	 * @inStep size in bytes to skip to address each pixel, usually bpp of the source surface
-	 * @inoStep width in bytes of every row on the *input* surface / kind of like pitch
-	 * @color colormod in 0xAARRGGBB format - 0xFFFFFFFF for no colormod
+	 * @param scaleX scale factor to use when blitting (src / dst) (0.5 for 2x scale) use BlendBlit::SCALE_THRESHOLD
+	 * @param scaleY scale factor to use when blitting (src / dst) (0.5 for 2x scale) use BlendBlit::SCALE_THRESHOLD
+	 * @param colorMod the color to multiply by. (0xffffffff does no multiplication and has 0 overhead usually)
+	 * @param flipping flipping flags used with Graphics::FLIP_FLAGS
+	 * @param blendMode the blending mode to be used
+	 * @param alphaType the alpha mixing mode to be used
 	 */
 	static void blit(byte *dst, const byte *src,
 			  const uint dstPitch, const uint srcPitch,
diff --git a/graphics/managed_surface.h b/graphics/managed_surface.h
index 8706019ad7f..bca2a7b262a 100644
--- a/graphics/managed_surface.h
+++ b/graphics/managed_surface.h
@@ -537,35 +537,15 @@ public:
 	}
 
 	/**
-	 * @brief renders the surface to another surface
-	 * @note Most of this is wrong at the time being... Not sure whether or not to keep the old
-	 * arguments or just make the function like the rest here.
-	 * @param target a pointer to the target surface. In most cases this is the framebuffer.
-	 * @param posX the position on the X-axis in the target image in pixels where the image is supposed to be rendered.<br>
-	 * The default value is 0.
-	 * @param posY the position on the Y-axis in the target image in pixels where the image is supposed to be rendered.<br>
-	 * The default value is 0.
-	 * @param flipping how the image should be flipped.<br>
-	 * The default value is Graphics::FLIP_NONE (no flipping)
-	 * @param pPartRect Pointer on Common::Rect which specifies the section to be rendered. If the whole image has to be rendered the Pointer is NULL.<br>
-	 * This referes to the unflipped and unscaled image.<br>
-	 * The default value is NULL.
-	 * @param color an ARGB color value, which determines the parameters for the color modulation und alpha blending.<br>
-	 * The alpha component of the color determines the alpha blending parameter (0 = no covering, 255 = full covering).<br>
-	 * The color components determines the color for color modulation.<br>
-	 * The default value is TS_ARGB(255, 255, 255, 255) (full covering, no color modulation).
-	 * The macros TS_RGB and TS_ARGB can be used for the creation of the color value.
-	 * **Temporarily, these macros can also be replaced with blendBlitMakeARGB/RGB static members of
-	 *  Graphics::ManagedSurface
-	 * @param width the output width of the screen section.
-	 * The images will be scaled if the output width of the screen section differs from the image section.<br>
-	 * The value -1 determines that the image should not be scaled.<br>
-	 * The default value is -1.
-	 * @param height the output height of the screen section.
-	 * The images will be scaled if the output width of the screen section differs from the image section.<br>
-	 * The value -1 determines that the image should not be scaled.<br>
-	 * The default value is -1.
-	 * @return returns the size (not position) of what was drawn to this managed surface.
+	 * @brief renders src onto this managed surface
+	 * @param src source surface
+	 * @param srcRect source clipping rectangle (used for sprite sheets for example)
+	 * @param destRect the destination of source onto this managed surface
+	 * @param flipping flipping flags (use Graphics::FLIP_FLAGS)
+	 * @param colorMod what color to multiply by (0xffffffff does nothing)
+	 * @param blend the blending mode to use.
+	 * @param alphaType what alpha mode to use. FULL is default
+	 * @return returns the size of the rendered rectangle
 	 */
 	Common::Rect blendBlitFrom(const ManagedSurface &src, const Common::Rect &srcRect,
 							   const Common::Rect &destRect, int flipping = FLIP_NONE,
diff --git a/test/image/blending.h b/test/image/blending.h
index 4b85ae9fe58..56584505e98 100644
--- a/test/image/blending.h
+++ b/test/image/blending.h
@@ -921,8 +921,8 @@ public:
 		double oldTimeScaled = 0.0, newTimeScaled = 0.0, genericTimeScaled = 0.0;
 		const int iters = 2500;
 
-        for (int blendMode = 0; blendMode < 1; blendMode++) {
-        for (int alphaType = 0; alphaType <= 1; alphaType++) {
+        for (int blendMode = 0; blendMode < Graphics::NUM_BLEND_MODES; blendMode++) {
+        for (int alphaType = 0; alphaType <= Graphics::ALPHA_FULL; alphaType++) {
         for (int flipping = 0; flipping <= 3; flipping++) {
 		for (uint32 color = 0xffffffff; color != 0; color = (color == 0xffffffff ? 0x7f7f7f7f : 0)) {
             oldSurfDest.fillRect(Common::Rect(0, 0, oldSurfDest.w, oldSurfDest.h), oldSurfDest.format.ARGBToColor(255, 255, 255, 255));
@@ -964,7 +964,6 @@ public:
 			}
 			newTimeScaled += g_system->getMillis() - newStart;
             managedSurfDest.fillRect(Common::Rect(0, 0, managedSurfDest.w, managedSurfDest.h), managedSurfDest.format.ARGBToColor(255, 255, 255, 255));
-			oldFunc = Graphics::BlendBlit::blitFunc;
 			Graphics::BlendBlit::blitFunc = Graphics::BlendBlit::blitGeneric;
 			genericStart = g_system->getMillis();
 			for (int i = 0; i < iters; i++) {


Commit: ad0c823f2f0fddb29776bc33cfa4fefaef321842
    https://github.com/scummvm/scummvm/commit/ad0c823f2f0fddb29776bc33cfa4fefaef321842
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
GRAPHICS: TransparentSurface scales in place

TransparentSurface now scales in place instead of making a copy. This
is much faster than before.
Also BlendBlit::blit now takes a scale offset parameter to help with
very large images being cropped; otherwise callers can leave it at 0.

Changed paths:
    graphics/blit-alpha.cpp
    graphics/blit-avx2.cpp
    graphics/blit-neon.cpp
    graphics/blit-sse2.cpp
    graphics/blit.h
    graphics/managed_surface.cpp
    graphics/transparent_surface.cpp
    test/image/blending.h


diff --git a/graphics/blit-alpha.cpp b/graphics/blit-alpha.cpp
index e20a3a651da..a7c8cc7bd38 100644
--- a/graphics/blit-alpha.cpp
+++ b/graphics/blit-alpha.cpp
@@ -178,10 +178,12 @@ BlendBlit::Args::Args(byte *dst, const byte *src,
 	const int posX, const int posY,
 	const uint _width, const uint _height,
 	const int _scaleX, const int _scaleY,
+	const int scaleXsrcOff, const int scaleYsrcOff,
 	const uint32 colorMod, const uint _flipping) :
 		xp(0), yp(0), dstPitch(_dstPitch),
 		width(_width), height(_height), color(colorMod),
-		scaleX(_scaleX), scaleY(_scaleY), flipping(_flipping) {
+		scaleX(_scaleX), scaleY(_scaleY), flipping(_flipping),
+		scaleXoff(scaleXsrcOff), scaleYoff(scaleYsrcOff) {
 	bool doScale = scaleX != SCALE_THRESHOLD || scaleY != SCALE_THRESHOLD;
 	
 	rgbmod   = ((colorMod & kRGBModMask) != kRGBModMask);
@@ -212,7 +214,7 @@ void BlendBlit::doBlitMultiplyBlendLogicGeneric(Args &args) {
 	const byte *in;
 	byte *out;
 
-	int scaleXCtr, scaleYCtr = 0;
+	int scaleXCtr, scaleYCtr = args.scaleYoff;
 	const byte *inBase;
 
 	const byte rawcr = (args.color >> kRModShift) & 0xFF;
@@ -226,7 +228,7 @@ void BlendBlit::doBlitMultiplyBlendLogicGeneric(Args &args) {
 	for (uint32 i = 0; i < args.height; i++) {
 		if (doscale) {
 			inBase = args.ino + scaleYCtr / SCALE_THRESHOLD * args.inoStep;
-			scaleXCtr = 0;
+			scaleXCtr = args.scaleXoff;
 		} else {
 			in = args.ino;
 		}
@@ -264,7 +266,7 @@ void BlendBlit::doBlitAlphaBlendLogicGeneric(Args &args) {
 	const byte *in;
 	byte *out;
 
-	int scaleXCtr, scaleYCtr = 0;
+	int scaleXCtr, scaleYCtr = args.scaleYoff;
 	const byte *inBase;
 
 	const byte ca = alphamod ? ((args.color >> kAModShift) & 0xFF) : 255;
@@ -275,7 +277,7 @@ void BlendBlit::doBlitAlphaBlendLogicGeneric(Args &args) {
 	for (uint32 i = 0; i < args.height; i++) {
 		if (doscale) {
 			inBase = args.ino + scaleYCtr / SCALE_THRESHOLD * args.inoStep;
-			scaleXCtr = 0;
+			scaleXCtr = args.scaleXoff;
 		} else {
 			in = args.ino;
 		}
@@ -329,7 +331,7 @@ void BlendBlit::doBlitSubtractiveBlendLogicGeneric(Args &args) {
 	const byte *in;
 	byte *out;
 
-	int scaleXCtr, scaleYCtr = 0;
+	int scaleXCtr, scaleYCtr = args.scaleYoff;
 	const byte *inBase;
 
 	const byte rawcr = (args.color >> kRModShift) & 0xFF;
@@ -342,7 +344,7 @@ void BlendBlit::doBlitSubtractiveBlendLogicGeneric(Args &args) {
 	for (uint32 i = 0; i < args.height; i++) {
 		if (doscale) {
 			inBase = args.ino + scaleYCtr / SCALE_THRESHOLD * args.inoStep;
-			scaleXCtr = 0;
+			scaleXCtr = args.scaleXoff;
 		} else {
 			in = args.ino;
 		}
@@ -379,7 +381,7 @@ void BlendBlit::doBlitAdditiveBlendLogicGeneric(Args &args) {
 	const byte *in;
 	byte *out;
 
-	int scaleXCtr, scaleYCtr = 0;
+	int scaleXCtr, scaleYCtr = args.scaleYoff;
 	const byte *inBase;
 
 	const byte rawcr = (args.color >> kRModShift) & 0xFF;
@@ -393,7 +395,7 @@ void BlendBlit::doBlitAdditiveBlendLogicGeneric(Args &args) {
 	for (uint32 i = 0; i < args.height; i++) {
 		if (doscale) {
 			inBase = args.ino + scaleYCtr / SCALE_THRESHOLD * args.inoStep;
-			scaleXCtr = 0;
+			scaleXCtr = args.scaleXoff;
 		} else {
 			in = args.ino;
 		}
@@ -431,13 +433,13 @@ void BlendBlit::doBlitOpaqueBlendLogicGeneric(Args &args) {
 	const byte *in;
 	byte *out;
 
-	int scaleXCtr, scaleYCtr = 0;
+	int scaleXCtr, scaleYCtr = args.scaleYoff;
 	const byte *inBase;
 
 	for (uint32 i = 0; i < args.height; i++) {
 		if (doscale) {
 			inBase = args.ino + (scaleYCtr + 1) / SCALE_THRESHOLD * args.inoStep;
-			scaleXCtr = 0;
+			scaleXCtr = args.scaleXoff;
 		} else {
 			in = args.ino;
 		}
@@ -471,13 +473,13 @@ void BlendBlit::doBlitBinaryBlendLogicGeneric(Args &args) {
 	const byte *in;
 	byte *out;
 
-	int scaleXCtr, scaleYCtr = 0;
+	int scaleXCtr, scaleYCtr = args.scaleYoff;
 	const byte *inBase;
 
 	for (uint32 i = 0; i < args.height; i++) {
 		if (doscale) {
 			inBase = args.ino + scaleYCtr / SCALE_THRESHOLD * args.inoStep;
-			scaleXCtr = 0;
+			scaleXCtr = args.scaleXoff;
 		} else {
 			in = args.ino;
 		}
@@ -519,6 +521,7 @@ void BlendBlit::blit(byte *dst, const byte *src,
 					 const int posX, const int posY,
 					 const uint width, const uint height,
 					 const int scaleX, const int scaleY,
+					 const int scaleXsrcOff, const int scaleYsrcOff,
 					 const uint32 colorMod, const uint flipping,
 					 const TSpriteBlendMode blendMode,
 					 const AlphaType alphaType) {
@@ -539,7 +542,7 @@ void BlendBlit::blit(byte *dst, const byte *src,
 #endif
 	}
 	
-	Args args(dst, src, dstPitch, srcPitch, posX, posY, width, height, scaleX, scaleY, colorMod, flipping);
+	Args args(dst, src, dstPitch, srcPitch, posX, posY, width, height, scaleX, scaleY, scaleXsrcOff, scaleYsrcOff, colorMod, flipping);
 	blitFunc(args, blendMode, alphaType);
 }
 
diff --git a/graphics/blit-avx2.cpp b/graphics/blit-avx2.cpp
index 13bf6561a5b..d06a0ab310e 100644
--- a/graphics/blit-avx2.cpp
+++ b/graphics/blit-avx2.cpp
@@ -274,7 +274,7 @@ void BlendBlitImpl::blitInnerLoopAVX2(BlendBlit::Args &args) {
 	const uint32 cg = coloradd1 ? (rgbmod   ? (rawcg == 255 ? 256 : rawcg) : 256) : (rgbmod   ? rawcg : 255);
 	const uint32 cb = coloradd1 ? (rgbmod   ? (rawcb == 255 ? 256 : rawcb) : 256) : (rgbmod   ? rawcb : 255);
 
-	int scaleXCtr, scaleYCtr = 0;
+	int scaleXCtr, scaleYCtr = args.scaleYoff;
 	const byte *inBase;
 
     if (!doscale && (args.flipping & FLIP_H)) args.ino -= 4 * 7;
@@ -282,7 +282,7 @@ void BlendBlitImpl::blitInnerLoopAVX2(BlendBlit::Args &args) {
 	for (uint32 i = 0; i < args.height; i++) {
 		if (doscale) {
 			inBase = args.ino + scaleYCtr / BlendBlit::SCALE_THRESHOLD * args.inoStep;
-			scaleXCtr = 0;
+			scaleXCtr = args.scaleXoff;
 		} else {
 			in = args.ino;
 		}
diff --git a/graphics/blit-neon.cpp b/graphics/blit-neon.cpp
index 534133afd6d..17712dc979f 100644
--- a/graphics/blit-neon.cpp
+++ b/graphics/blit-neon.cpp
@@ -277,7 +277,7 @@ static inline void blitInnerLoop(BlendBlit::Args &args) {
 	const uint32 cg = coloradd1 ? (rgbmod   ? (rawcg == 255 ? 256 : rawcg) : 256) : (rgbmod   ? rawcg : 255);
 	const uint32 cb = coloradd1 ? (rgbmod   ? (rawcb == 255 ? 256 : rawcb) : 256) : (rgbmod   ? rawcb : 255);
 
-	int scaleXCtr, scaleYCtr = 0;
+	int scaleXCtr, scaleYCtr = args.scaleYoff;
 	const byte *inBase;
 
     if (!doscale && (args.flipping & FLIP_H)) args.ino -= 4 * 3;
@@ -285,7 +285,7 @@ static inline void blitInnerLoop(BlendBlit::Args &args) {
 	for (uint32 i = 0; i < args.height; i++) {
 		if (doscale) {
 			inBase = args.ino + scaleYCtr / BlendBlit::SCALE_THRESHOLD * args.inoStep;
-			scaleXCtr = 0;
+			scaleXCtr = args.scaleXoff;
 		} else {
 			in = args.ino;
 		}
diff --git a/graphics/blit-sse2.cpp b/graphics/blit-sse2.cpp
index 8c0576febfc..1925490bdaa 100644
--- a/graphics/blit-sse2.cpp
+++ b/graphics/blit-sse2.cpp
@@ -288,7 +288,7 @@ static inline void blitInnerLoop(BlendBlit::Args &args) {
 	const uint32 cg = coloradd1 ? (rgbmod   ? (rawcg == 255 ? 256 : rawcg) : 256) : (rgbmod   ? rawcg : 255);
 	const uint32 cb = coloradd1 ? (rgbmod   ? (rawcb == 255 ? 256 : rawcb) : 256) : (rgbmod   ? rawcb : 255);
 
-	int scaleXCtr, scaleYCtr = 0;
+	int scaleXCtr, scaleYCtr = args.scaleYoff;
 	const byte *inBase;
 
     if (!doscale && (args.flipping & FLIP_H)) args.ino -= 4 * 3;
@@ -296,7 +296,7 @@ static inline void blitInnerLoop(BlendBlit::Args &args) {
 	for (uint32 i = 0; i < args.height; i++) {
 		if (doscale) {
 			inBase = args.ino + scaleYCtr / BlendBlit::SCALE_THRESHOLD * args.inoStep;
-			scaleXCtr = 0;
+			scaleXCtr = args.scaleXoff;
 		} else {
 			in = args.ino;
 		}
diff --git a/graphics/blit.h b/graphics/blit.h
index 811eda3dd55..8336c30ba34 100644
--- a/graphics/blit.h
+++ b/graphics/blit.h
@@ -203,7 +203,7 @@ private:
 		const byte *ino;
 		byte *outo;
 	
-		int scaleX, scaleY;
+		int scaleX, scaleY, scaleXoff, scaleYoff;
 		uint dstPitch;
 		uint width, height;
 		uint32 color;
@@ -214,6 +214,7 @@ private:
 			 const int posX, const int posY,
 			 const uint width, const uint height,
 			 const int scaleX, const int scaleY,
+			 const int scaleXsrcOff, const int scaleYsrcOff,
 			 const uint32 colorMod, const uint flipping);
 	};
 
@@ -305,6 +306,10 @@ public:
 	 * @param height height of the input surface
 	 * @param scaleX scale factor to use when blitting (src / dst) (0.5 for 2x scale) use BlendBlit::SCALE_THRESHOLD
 	 * @param scaleY scale factor to use when blitting (src / dst) (0.5 for 2x scale) use BlendBlit::SCALE_THRESHOLD
+	 * @param scaleXsrcOff sub-pixel X offset into the source. Offsetting the
+	 *     *src pointer can only shift the rendered part of the source image in
+	 *     whole-pixel steps, so this adds the remaining fractional offset
+	 * @param scaleYsrcOff same as scaleXsrcOff, but for the Y axis
 	 * @param colorMod the color to multiply by. (0xffffffff does no multiplication and has 0 overhead usually)
 	 * @param flipping flipping flags used with Graphics::FLIP_FLAGS
 	 * @param blendMode the blending mode to be used
@@ -315,6 +320,7 @@ public:
 			  const int posX, const int posY,
 			  const uint width, const uint height,
 			  const int scaleX, const int scaleY,
+			  const int scaleXsrcOff, const int scaleYsrcOff,
 			  const uint32 colorMod, const uint flipping,
 			  const TSpriteBlendMode blendMode,
 			  const AlphaType alphaType);
diff --git a/graphics/managed_surface.cpp b/graphics/managed_surface.cpp
index d534935c3e4..ae6e7ebff84 100644
--- a/graphics/managed_surface.cpp
+++ b/graphics/managed_surface.cpp
@@ -745,13 +745,16 @@ Common::Rect ManagedSurface::blendBlitFrom(const ManagedSurface &src, const Comm
 
 	const int scaleX = BlendBlit::getScaleFactor(srcArea.width(), dstArea.width());
 	const int scaleY = BlendBlit::getScaleFactor(srcArea.height(), dstArea.height());
+	int scaleXoff = 0, scaleYoff = 0;
 
 	if (dstArea.left < 0) {
+		scaleXoff = (-dstArea.left * scaleX) % BlendBlit::SCALE_THRESHOLD;
 		srcArea.left += -dstArea.left * scaleX / BlendBlit::SCALE_THRESHOLD;
 		dstArea.left = 0;
 	}
 
 	if (dstArea.top < 0) {
+		scaleYoff = (-dstArea.top * scaleY) % BlendBlit::SCALE_THRESHOLD;
 		srcArea.top += -dstArea.top * scaleY / BlendBlit::SCALE_THRESHOLD;
 		dstArea.top = 0;
 	}
@@ -770,12 +773,14 @@ Common::Rect ManagedSurface::blendBlitFrom(const ManagedSurface &src, const Comm
 		int tmp_w = srcArea.width();
 		srcArea.left = src.w - srcArea.right;
 		srcArea.right = srcArea.left + tmp_w;
+		scaleXoff = (BlendBlit::SCALE_THRESHOLD - (scaleXoff + dstArea.width() * scaleX)) % BlendBlit::SCALE_THRESHOLD;
 	}
 
 	if (flipping & FLIP_V) {
 		int tmp_h = srcArea.height();
 		srcArea.top = src.h - srcArea.bottom;
 		srcArea.bottom = srcArea.top + tmp_h;
+		scaleYoff = (BlendBlit::SCALE_THRESHOLD - (scaleYoff + dstArea.height() * scaleY)) % BlendBlit::SCALE_THRESHOLD;
 	}
 
 	if (!dstArea.isEmpty() && !srcArea.isEmpty()) {
@@ -786,6 +791,7 @@ Common::Rect ManagedSurface::blendBlitFrom(const ManagedSurface &src, const Comm
 			dstArea.left, dstArea.top,
 			dstArea.width(), dstArea.height(),
 			scaleX, scaleY,
+			scaleXoff, scaleYoff,
 			colorMod, flipping,
 			blend, alphaType);
 	}
diff --git a/graphics/transparent_surface.cpp b/graphics/transparent_surface.cpp
index a935afe68e5..45f8442a6e4 100644
--- a/graphics/transparent_surface.cpp
+++ b/graphics/transparent_surface.cpp
@@ -72,22 +72,13 @@ Common::Rect TransparentSurface::blit(Graphics::Surface &target, int posX, int p
 		return retSize;
 	}
 
-	if (pPartRect) {
-
-		int xOffset = pPartRect->left;
-		int yOffset = pPartRect->top;
-
-		if (flipping & FLIP_V) {
-			yOffset = srcImage.h - pPartRect->bottom;
-		}
+	int xOffset = 0, yOffset = 0, srcW = srcImage.w, srcH = srcImage.h;
 
-		if (flipping & FLIP_H) {
-			xOffset = srcImage.w - pPartRect->right;
-		}
-
-		srcImage.pixels = getBasePtr(xOffset, yOffset);
-		srcImage.w = pPartRect->width();
-		srcImage.h = pPartRect->height();
+	if (pPartRect) {
+		xOffset = pPartRect->left;
+		yOffset = pPartRect->top;
+		srcW = pPartRect->width();
+		srcH = pPartRect->height();
 	}
 
 	if (width == -1) {
@@ -97,69 +88,65 @@ Common::Rect TransparentSurface::blit(Graphics::Surface &target, int posX, int p
 		height = srcImage.h;
 	}
 
+	int scaleX = BlendBlit::getScaleFactor(srcW, width), scaleXoff = 0;
+	int scaleY = BlendBlit::getScaleFactor(srcH, height), scaleYoff = 0;
+
 #ifdef SCALING_TESTING
 	// Hardcode scaling to 66% to test scaling
 	width = width * 2 / 3;
 	height = height * 2 / 3;
 #endif
 
-	Graphics::Surface *img = nullptr;
-	Graphics::Surface *imgScaled = nullptr;
-	byte *savedPixels = nullptr;
-	if ((width != srcImage.w) || (height != srcImage.h)) {
-		// Scale the image
-		img = imgScaled = srcImage.scale(width, height);
-		savedPixels = (byte *)img->getPixels();
-	} else {
-		img = &srcImage;
-	}
-
 	// Handle off-screen clipping
 	if (posY < 0) {
-		img->h = MAX(0, (int)img->h - -posY);
-		if (!(flipping & FLIP_V))
-			img->setPixels((byte *)img->getBasePtr(0, -posY));
+		height = MAX(0, (int)height + posY);
+		scaleYoff += (-posY * scaleY) % BlendBlit::SCALE_THRESHOLD;
+		yOffset += -posY * scaleY / BlendBlit::SCALE_THRESHOLD;
+		srcH = MAX(0, srcH + posY * scaleY / BlendBlit::SCALE_THRESHOLD);
 		posY = 0;
 	}
 
 	if (posX < 0) {
-		img->w = MAX(0, (int)img->w - -posX);
-		if (!(flipping & FLIP_H))
-			img->setPixels((byte *)img->getBasePtr(-posX, 0));
+		width = MAX(0, (int)width + posX);
+		scaleXoff += (-posX * scaleX) % BlendBlit::SCALE_THRESHOLD;
+		xOffset += -posX * scaleX / BlendBlit::SCALE_THRESHOLD;
+		srcW = MAX(0, srcW + posX * scaleX / BlendBlit::SCALE_THRESHOLD);
 		posX = 0;
 	}
 
-	if (img->w > target.w - posX) {
-		if (flipping & FLIP_H)
-			img->setPixels((byte *)img->getBasePtr(img->w - target.w + posX, 0));
-		img->w = CLIP((int)img->w, 0, (int)MAX((int)target.w - posX, 0));
+	if (width + posX > target.w) {
+		srcW -= ((width + posX) - target.w) * scaleX / BlendBlit::SCALE_THRESHOLD;
+		width = target.w - posX;
+	}
+
+	if (height + posY > target.h) {
+		srcH -= ((height + posY) - target.h) * scaleY / BlendBlit::SCALE_THRESHOLD;
+		height = target.h - posY;
+	}
+	if (flipping & FLIP_H) {
+		scaleXoff = (BlendBlit::SCALE_THRESHOLD - (scaleXoff + width * scaleX)) % BlendBlit::SCALE_THRESHOLD;
+		xOffset = this->w - (xOffset + srcW);
 	}
 
-	if (img->h > target.h - posY) {
-		if (flipping & FLIP_V)
-			img->setPixels((byte *)img->getBasePtr(0, img->h - target.h + posY));
-		img->h = CLIP((int)img->h, 0, (int)MAX((int)target.h - posY, 0));
+	if (flipping & FLIP_V) {
+		scaleYoff = (BlendBlit::SCALE_THRESHOLD - (scaleYoff + height * scaleY)) % BlendBlit::SCALE_THRESHOLD;
+		yOffset = this->h - (yOffset + srcH);
 	}
 
 	// Flip surface
-	if ((img->w > 0) && (img->h > 0)) {
+	if ((width > 0) && (height > 0)) {
 		BlendBlit::blit(
 			(byte *)target.getBasePtr(0, 0),
-			(byte *)img->getBasePtr(0, 0),
-			target.pitch, img->pitch,
-			posX, posY, img->w, img->h, BlendBlit::SCALE_THRESHOLD, BlendBlit::SCALE_THRESHOLD,
+			(byte *)srcImage.getBasePtr(xOffset, yOffset),
+			target.pitch, srcImage.pitch,
+			posX, posY, width, height, scaleX, scaleY,
+			scaleXoff, scaleYoff,
 			color, flipping,
 			blendMode, _alphaMode);
 	}
 
-	retSize.setWidth(img->w);
-	retSize.setHeight(img->h);
-
-	if (imgScaled) {
-		imgScaled->setPixels(savedPixels);
-		imgScaled->free();
-		delete imgScaled;
-	}
+	retSize.setWidth((int16)width);
+	retSize.setHeight((int16)height);
 
 	return retSize;
 }
@@ -185,22 +172,13 @@ Common::Rect TransparentSurface::blitClip(Graphics::Surface &target, Common::Rec
 		return retSize;
 	}
 
-	if (pPartRect) {
-
-		int xOffset = pPartRect->left;
-		int yOffset = pPartRect->top;
-
-		if (flipping & FLIP_V) {
-			yOffset = srcImage.h - pPartRect->bottom;
-		}
+	int xOffset = 0, yOffset = 0, srcW = srcImage.w, srcH = srcImage.h;
 
-		if (flipping & FLIP_H) {
-			xOffset = srcImage.w - pPartRect->right;
-		}
-
-		srcImage.pixels = getBasePtr(xOffset, yOffset);
-		srcImage.w = pPartRect->width();
-		srcImage.h = pPartRect->height();
+	if (pPartRect) {
+		xOffset = pPartRect->left;
+		yOffset = pPartRect->top;
+		srcW = pPartRect->width();
+		srcH = pPartRect->height();
 	}
 
 	if (width == -1) {
@@ -210,69 +188,67 @@ Common::Rect TransparentSurface::blitClip(Graphics::Surface &target, Common::Rec
 		height = srcImage.h;
 	}
 
+	int scaleX = BlendBlit::getScaleFactor(srcW, width), scaleXoff = 0;;
+	int scaleY = BlendBlit::getScaleFactor(srcH, height), scaleYoff = 0;;
+
 #ifdef SCALING_TESTING
 	// Hardcode scaling to 66% to test scaling
 	width = width * 2 / 3;
 	height = height * 2 / 3;
 #endif
 
-	Graphics::Surface *img = nullptr;
-	Graphics::Surface *imgScaled = nullptr;
-	byte *savedPixels = nullptr;
-	if ((width != srcImage.w) || (height != srcImage.h)) {
-		// Scale the image
-		img = imgScaled = srcImage.scale(width, height);
-		savedPixels = (byte *)img->getPixels();
-	} else {
-		img = &srcImage;
-	}
-
 	// Handle off-screen clipping
 	if (posY < clippingArea.top) {
-		img->h = MAX(0, (int)img->h - (clippingArea.top - posY));
-		if (!(flipping & FLIP_V))
-			img->setPixels((byte *)img->getBasePtr(0, clippingArea.top - posY));
+		posY -= clippingArea.top;
+		scaleYoff += (-posY * scaleY) % BlendBlit::SCALE_THRESHOLD;
+		yOffset += -posY * scaleY / BlendBlit::SCALE_THRESHOLD;
+		height = MAX(0, (int)height + posY);
+		srcH = MAX(0, srcH + posY * scaleY / BlendBlit::SCALE_THRESHOLD);
 		posY = clippingArea.top;
 	}
 
 	if (posX < clippingArea.left) {
-		img->w = MAX(0, (int)img->w - (clippingArea.left - posX));
-		if (!(flipping & FLIP_H))
-			img->setPixels((byte *)img->getBasePtr(clippingArea.left - posX, 0));
+		posX -= clippingArea.left;
+		scaleXoff += (-posX * scaleX) % BlendBlit::SCALE_THRESHOLD;
+		xOffset += -posX * scaleX / BlendBlit::SCALE_THRESHOLD;
+		width = MAX(0, (int)width + posX);
+		srcW = MAX(0, srcW + posX * scaleX / BlendBlit::SCALE_THRESHOLD);
 		posX = clippingArea.left;
 	}
 
-	if (img->w > clippingArea.right - posX) {
-		if (flipping & FLIP_H)
-			img->setPixels((byte *)img->getBasePtr(img->w - clippingArea.right + posX, 0));
-		img->w = CLIP((int)img->w, 0, (int)MAX((int)clippingArea.right - posX, 0));
+	if (width + posX > clippingArea.right) {
+		srcW -= ((width + posX) - clippingArea.right) * scaleX / BlendBlit::SCALE_THRESHOLD;
+		width = clippingArea.right - posX;
+	}
+
+	if (height + posY > clippingArea.bottom) {
+		srcH -= ((height + posY) - clippingArea.bottom) * scaleY / BlendBlit::SCALE_THRESHOLD;
+		height = clippingArea.bottom - posY;
+	}
+	if (flipping & FLIP_H) {
+		scaleXoff = (BlendBlit::SCALE_THRESHOLD - (scaleXoff + width * scaleX)) % BlendBlit::SCALE_THRESHOLD;
+		xOffset = this->w - (xOffset + srcW);
 	}
 
-	if (img->h > clippingArea.bottom - posY) {
-		if (flipping & FLIP_V)
-			img->setPixels((byte *)img->getBasePtr(0, img->h - clippingArea.bottom + posY));
-		img->h = CLIP((int)img->h, 0, (int)MAX((int)clippingArea.bottom - posY, 0));
+	if (flipping & FLIP_V) {
+		scaleYoff = (BlendBlit::SCALE_THRESHOLD - (scaleYoff + height * scaleY)) % BlendBlit::SCALE_THRESHOLD;
+		yOffset = this->h - (yOffset + srcH);
 	}
 
 	// Flip surface
-	if ((img->w > 0) && (img->h > 0)) {
+	if ((width > 0) && (height > 0)) {
 		BlendBlit::blit(
 			(byte *)target.getBasePtr(0, 0),
-			(byte *)img->getBasePtr(0, 0),
-			target.pitch, img->pitch,
-			posX, posY, img->w, img->h, BlendBlit::SCALE_THRESHOLD, BlendBlit::SCALE_THRESHOLD,
+			(byte *)srcImage.getBasePtr(xOffset, yOffset),
+			target.pitch, srcImage.pitch,
+			posX, posY, width, height, scaleX, scaleY,
+			scaleXoff, scaleYoff,
 			color, flipping,
 			blendMode, _alphaMode);
 	}
 
-	retSize.setWidth(img->w);
-	retSize.setHeight(img->h);
-
-	if (imgScaled) {
-		imgScaled->setPixels(savedPixels);
-		imgScaled->free();
-		delete imgScaled;
-	}
+	retSize.setWidth(width);
+	retSize.setHeight(height);
 
 	return retSize;
 }
diff --git a/test/image/blending.h b/test/image/blending.h
index 56584505e98..8ef2ca75904 100644
--- a/test/image/blending.h
+++ b/test/image/blending.h
@@ -992,9 +992,10 @@ public:
             Common::Rect(4, 4, 4+16, 4+16), // Case 0 (source clipping)
             Common::Rect(24, 20, 24+16, 20+16), // Case 1 (outside of destination)
             Common::Rect(0, 0, 32, 32), // Case 2 (stretching bigger)
-            Common::Rect(3, 3, 8, 8), // Case 3 (stretching smaller)
+            Common::Rect(3, 3, 3+8, 3+8), // Case 3 (stretching smaller)
             Common::Rect(8, 4, 8+32, 4+32), // Case 4 (stretching outside of destination)
             Common::Rect(-4, -4, -4+16, -4+16), // Case 5 (outside of destination 2)
+            Common::Rect(-16, -16, 32+16, 32+16), // Case 6 (completely bigger)
         }, srcs[] = {
             Common::Rect(0, 0, 16, 16), // Case 0 (source clipping)
             Common::Rect(0, 0, 16, 16), // Case 1 (outside of destination)
@@ -1002,6 +1003,7 @@ public:
             Common::Rect(0, 0, 16, 16), // Case 3 (stretching smaller)
             Common::Rect(0, 0, 16, 16), // Case 4 (stretching outside of destination)
             Common::Rect(0, 0, 16, 16), // Case 5 (outside of destination 2)
+            Common::Rect(0, 0, 16, 16), // Case 6 (completely bigger)
         };
 
 	    Graphics::Surface baseSurface, destSurface;
@@ -1041,6 +1043,7 @@ public:
             "3 -> (stretching smaller)",
             "4 -> (stretching outside of destination)",
             "5 -> (outside of destination)",
+			"6 -> (completely bigger)"
         };
 
         for (int blendMode = 0; blendMode < Graphics::NUM_BLEND_MODES; blendMode++) {
@@ -1057,12 +1060,21 @@ public:
         for (int rect = 0; rect < (int)(sizeof(srcs)/sizeof(srcs[0])); rect++) {
             oldSurfDest.fillRect(Common::Rect(0, 0, oldSurfDest.w, oldSurfDest.h), oldSurfDest.format.ARGBToColor(ba, br, bg, bb));
             oldSurf.setAlphaMode((Graphics::AlphaType)alphaType);
-            oldSurf.blit(oldSurfDest, dsts[rect].left, dsts[rect].top, flipping, &srcs[rect], TS_ARGB(a, r, g, b), dsts[rect].width(), dsts[rect].height(), (Graphics::TSpriteBlendMode)blendMode);
+            Common::Rect ret1 = oldSurf.blit(oldSurfDest, dsts[rect].left, dsts[rect].top, flipping, &srcs[rect], TS_ARGB(a, r, g, b), dsts[rect].width(), dsts[rect].height(), (Graphics::TSpriteBlendMode)blendMode);
             newSurfDest.fillRect(Common::Rect(0, 0, newSurfDest.w, newSurfDest.h), newSurfDest.format.ARGBToColor(ba, br, bg, bb));
             newSurf.setAlphaMode((Graphics::AlphaType)alphaType);
-            newSurf.blit(newSurfDest, dsts[rect].left, dsts[rect].top, flipping, &srcs[rect], TS_ARGB(a, r, g, b), dsts[rect].width(), dsts[rect].height(), (Graphics::TSpriteBlendMode)blendMode);
+            Common::Rect ret2 = newSurf.blit(newSurfDest, dsts[rect].left, dsts[rect].top, flipping, &srcs[rect], TS_ARGB(a, r, g, b), dsts[rect].width(), dsts[rect].height(), (Graphics::TSpriteBlendMode)blendMode);
             managedSurfDest.fillRect(Common::Rect(0, 0, managedSurfDest.w, managedSurfDest.h), managedSurfDest.format.ARGBToColor(ba, br, bg, bb));
-            managedSurfDest.blendBlitFrom(managedSurf, srcs[rect], dsts[rect], flipping, TS_ARGB(a, r, g, b), (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
+            Common::Rect ret3 = managedSurfDest.blendBlitFrom(managedSurf, srcs[rect], dsts[rect], flipping, TS_ARGB(a, r, g, b), (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
+
+			if (ret1 != ret2 || ret2 != ret3 || ret1 != ret3) {
+                warning("blendMode: %s, alphaType: %s, a: %d, r: %d, g: %d, b: %d, flipping: %s, test rect id: %s",
+                    blendModes[blendMode], alphaTypes[alphaType], a, r, g, b, flipNames[flipping], rectNames[rect]);
+				warning("old: Rect(%d, %d, %d, %d)", ret1.left, ret1.top, ret1.width(), ret1.height());
+				warning("new: Rect(%d, %d, %d, %d)", ret2.left, ret2.top, ret2.width(), ret2.height());
+				warning("managed: Rect(%d, %d, %d, %d)", ret3.left, ret3.top, ret3.width(), ret3.height());
+                TS_FAIL("Return sizes are not equal!");
+			}
 
             if (!areSurfacesEqual(&oldSurfDest, &newSurfDest)) {
                 warning("blendMode: %s, alphaType: %s, a: %d, r: %d, g: %d, b: %d, flipping: %s, test rect id: %s",
@@ -1094,6 +1106,32 @@ public:
                 TS_FAIL("newSurfDest and managedSurfDest are not equal!");
                 return;
             }
+
+			
+            oldSurfDest.fillRect(Common::Rect(0, 0, oldSurfDest.w, oldSurfDest.h), oldSurfDest.format.ARGBToColor(ba, br, bg, bb));
+            oldSurf.setAlphaMode((Graphics::AlphaType)alphaType);
+            ret1 = oldSurf.blitClip(oldSurfDest, Common::Rect(2, 2, oldSurfDest.w - 2, oldSurfDest.h - 2), dsts[rect].left, dsts[rect].top, flipping, &srcs[rect], TS_ARGB(a, r, g, b), dsts[rect].width(), dsts[rect].height(), (Graphics::TSpriteBlendMode)blendMode);
+            newSurfDest.fillRect(Common::Rect(0, 0, newSurfDest.w, newSurfDest.h), newSurfDest.format.ARGBToColor(ba, br, bg, bb));
+            newSurf.setAlphaMode((Graphics::AlphaType)alphaType);
+            ret2 = newSurf.blitClip(newSurfDest, Common::Rect(2, 2, oldSurfDest.w - 2, oldSurfDest.h - 2), dsts[rect].left, dsts[rect].top, flipping, &srcs[rect], TS_ARGB(a, r, g, b), dsts[rect].width(), dsts[rect].height(), (Graphics::TSpriteBlendMode)blendMode);
+            if (!areSurfacesEqual(&oldSurfDest, &newSurfDest)) {
+                warning("BLIT_CLIP blendMode: %s, alphaType: %s, a: %d, r: %d, g: %d, b: %d, flipping: %s, test rect id: %s",
+                    blendModes[blendMode], alphaTypes[alphaType], a, r, g, b, flipNames[flipping], rectNames[rect]);
+                save_bitmap("sourceSurfBlipClip.bmp", &newSurf);
+                save_bitmap("oldSurfDestBlitClip.bmp", &oldSurfDest);
+                save_bitmap("newSurfDestBlitClip.bmp", &newSurfDest);
+                save_bitmap("managedSurfDest.bmp", managedSurfDest.surfacePtr());
+                TS_FAIL("oldSurfDest and newSurfDest are not equal with blipClip!");
+                return;
+            }
+			if (ret1 != ret2) {
+                warning("blendMode: %s, alphaType: %s, a: %d, r: %d, g: %d, b: %d, flipping: %s, test rect id: %s",
+                    blendModes[blendMode], alphaTypes[alphaType], a, r, g, b, flipNames[flipping], rectNames[rect]);
+				warning("old: Rect(%d, %d, %d, %d)", ret1.left, ret1.top, ret1.width(), ret1.height());
+				warning("new: Rect(%d, %d, %d, %d)", ret2.left, ret2.top, ret2.width(), ret2.height());
+				warning("managed: Rect(%d, %d, %d, %d)", ret3.left, ret3.top, ret3.width(), ret3.height());
+                TS_FAIL("Return sizes are not equal for blitClip!");
+			}
         } // rect
         } // flipping
         } // b


Commit: 480a77f3103c1cbc3d65045d9b993b6bd8483f65
    https://github.com/scummvm/scummvm/commit/480a77f3103c1cbc3d65045d9b993b6bd8483f65
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
BUILD: Configure automatically detects SIMD

Changed paths:
    configure


diff --git a/configure b/configure
index ed5a1554868..04b87e7cc52 100755
--- a/configure
+++ b/configure
@@ -225,9 +225,9 @@ _builtin_resources=yes
 _windows_console=yes
 _windows_unicode=yes
 _cygwin_build=no
-_ext_sse2=no
-_ext_avx2=no
-_ext_neon=no
+_ext_sse2=auto
+_ext_avx2=auto
+_ext_neon=auto
 # Default commands
 _ranlib=ranlib
 _strip=strip
@@ -6875,27 +6875,72 @@ echo "$_enable_ubsan"
 #
 # Whether to add compiler options and preprocessor defines for SIMD extensions
 #
-if ( echo "$_host_cpu" | grep '86' >> /dev/null ) ; then
-	define_in_config_if_yes "$_ext_sse2" 'SCUMMVM_SSE2'
-	if test "$_ext_sse2" = yes ; then
-		append_var CXXFLAGS "-msse2 -msse"
-	fi
-	echo_n "Enabling x86/64 SSE2... "
-	echo "$_ext_sse2"
-	define_in_config_if_yes "$_ext_avx2" 'SCUMMVM_AVX2'
-	if test "$_ext_avx2" = yes ; then
-		append_var CXXFLAGS "-mavx2 -mavx -msse2 -msse"
-		define_in_config_if_yes "$_ext_avx2" 'SCUMMVM_SSE2'
-	fi
-	echo_n "Enabling x86/64 AVX2 and SSE2... "
-	echo "$_ext_avx2"
-fi
 
-if ( echo "$_host_cpu" | grep 'arm' >> /dev/null ) || ( echo "$_host_cpu" | grep 'aarch64' >> /dev/null ) ; then
-	define_in_config_if_yes "$_ext_neon" 'SCUMMVM_NEON'
-	echo_n "Enabling arm NEON... "
-	echo "$_ext_neon"
-fi
+# Automatically detect if SSE2, NEON, AVX2 can be compiled (not if they can
+# be run at runtime)
+case $_host_cpu in
+	x86_64 | amd64)
+		if test "$_ext_sse2" = auto ; then
+			_ext_sse2=yes
+		fi
+		if test "$_ext_avx2" = auto ; then
+			_ext_avx2=yes
+		fi
+		_ext_neon=no
+		;;
+	i[3-6]86)
+		if test "$_ext_sse2" = auto ; then
+			_ext_sse2=yes
+		fi
+		if test "$_ext_avx2" = auto ; then
+			_ext_avx2=no
+		fi
+		_ext_neon=no
+		;;
+	aarch64)
+		if test "$_ext_neon" = auto ; then
+			_ext_neon=yes
+		fi
+		_ext_sse2=no
+		_ext_avx2=no
+		;;
+	arm*)
+		if test "$_ext_neon" = auto ; then
+			_ext_neon=no
+		fi
+		_ext_sse2=no
+		_ext_avx2=no
+		;;
+	*)
+		_ext_sse2=no
+		_ext_avx2=no
+		_ext_neon=no
+		;;
+esac
+
+define_in_config_if_yes "$_ext_sse2" 'SCUMMVM_SSE2'
+if test "$_ext_sse2" = yes ; then
+	append_var CXXFLAGS "-msse2 -msse"
+fi
+echo_n "Enabling x86/64 SSE2... "
+echo "$_ext_sse2"
+define_in_config_if_yes "$_ext_avx2" 'SCUMMVM_AVX2'
+if test "$_ext_avx2" = yes ; then
+	append_var CXXFLAGS "-mavx2 -mavx -msse2 -msse"
+	define_in_config_if_yes "$_ext_avx2" 'SCUMMVM_SSE2'
+fi
+echo_n "Enabling x86/64 AVX2 and SSE2... "
+echo "$_ext_avx2"
+define_in_config_if_yes "$_ext_neon" 'SCUMMVM_NEON'
+# AArch64 may already default to more capable FPU extensions, so we don't want
+# to downgrade it. On armv7, NEON is almost always an upgrade over the default
+# FPU, and the flag would have to be passed anyway for NEON code.
+# NOTE(review): the test below checks _ext_avx2; presumably _ext_neon was meant.
+if ( test "$_ext_avx2" = yes ) && ( test "$_host_cpu" != aarch64 ) ; then
+	append_var CXXFLAGS "-mfpu=neon"
+fi
+echo_n "Enabling arm NEON... "
+echo "$_ext_neon"
 
 echo_n "Backend... "
 echo_n "$_backend"


Commit: 59fa0a920881f4ac21065ab22145f55cebcfb22a
    https://github.com/scummvm/scummvm/commit/59fa0a920881f4ac21065ab22145f55cebcfb22a
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
BACKENDS: Cpu feature flags now use bit shifts

Changed paths:
    backends/base-backend.h


diff --git a/backends/base-backend.h b/backends/base-backend.h
index 0c25dbafc6e..7febdb95e9c 100644
--- a/backends/base-backend.h
+++ b/backends/base-backend.h
@@ -32,15 +32,15 @@
 class BaseBackend : public OSystem {
 public:
 	enum CpuFeatureFlags {
-		kCpuNoFeatures     = 0x00, // Completely detected by BaseBackend
-		kCpuFeatureSSE2    = 0x01, // Completely detected by BaseBackend
-		kCpuFeatureAVX2    = 0x02, // Completely detected by BaseBackend
+		kCpuNoFeatures     = 0, // Completely detected by BaseBackend
+		kCpuFeatureSSE2    = (1 << 0), // Completely detected by BaseBackend
+		kCpuFeatureAVX2    = (1 << 1), // Completely detected by BaseBackend
 		// Detected either by BaseBackend (if platform ONLY supports ARMv8+) or
 		// platform specific Backends if ARM is optional or not on all versions
 		// of the platform.
-		kCpuFeatureNEON    = 0x04,
-		kCpuFeatureAlitvec = 0x08, // Platform specific
-		kCpuFeatureSSE41   = 0x10, // Completely detected by BaseBackend
+		kCpuFeatureNEON    = (1 << 2),
+		kCpuFeatureAlitvec = (1 << 3), // Platform specific
+		kCpuFeatureSSE41   = (1 << 4), // Completely detected by BaseBackend
 	};
 
 	void initBackend() override;


Commit: f5dfa6b8d119cff1589b9a3da49f276d2915d5d8
    https://github.com/scummvm/scummvm/commit/f5dfa6b8d119cff1589b9a3da49f276d2915d5d8
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
WII: Removed extra ")" in backend hasFeature

Changed paths:
    backends/platform/wii/osystem.cpp


diff --git a/backends/platform/wii/osystem.cpp b/backends/platform/wii/osystem.cpp
index 42a9b6b06d4..270c7d97841 100644
--- a/backends/platform/wii/osystem.cpp
+++ b/backends/platform/wii/osystem.cpp
@@ -176,7 +176,7 @@ bool OSystem_Wii::hasFeature(Feature f) {
 	return (f == kFeatureFullscreenMode) ||
 			(f == kFeatureAspectRatioCorrection) ||
 			(f == kFeatureCursorPalette) ||
-			(f == kFeatureOverlaySupportsAlpha));
+			(f == kFeatureOverlaySupportsAlpha);
 }
 
 void OSystem_Wii::setFeatureState(Feature f, bool enable) {


Commit: 787837ca4139447bb7cb3af2c60fdb2078d52a5e
    https://github.com/scummvm/scummvm/commit/787837ca4139447bb7cb3af2c60fdb2078d52a5e
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
ALL: Add Cpu prefix to SIMD extension features

Changed paths:
    backends/base-backend.cpp
    backends/platform/android/android.cpp
    common/system.h
    graphics/blit-alpha.cpp


diff --git a/backends/base-backend.cpp b/backends/base-backend.cpp
index 7533cb9b13e..81515ae3372 100644
--- a/backends/base-backend.cpp
+++ b/backends/base-backend.cpp
@@ -103,9 +103,9 @@ void BaseBackend::initBackend() {
 }
 
 bool BaseBackend::hasFeature(Feature f) {
-	if (f == kFeatureSSE2) return _cpuFeatures & kCpuFeatureSSE2;
-	if (f == kFeatureAVX2) return _cpuFeatures & kCpuFeatureAVX2;
-	if (f == kFeatureNEON) return _cpuFeatures & kCpuFeatureNEON;
+	if (f == kFeatureCpuSSE2) return _cpuFeatures & kCpuFeatureSSE2;
+	if (f == kFeatureCpuAVX2) return _cpuFeatures & kCpuFeatureAVX2;
+	if (f == kFeatureCpuNEON) return _cpuFeatures & kCpuFeatureNEON;
 	return false;
 }
 
diff --git a/backends/platform/android/android.cpp b/backends/platform/android/android.cpp
index cbb0e30aaf1..4bd53188c34 100644
--- a/backends/platform/android/android.cpp
+++ b/backends/platform/android/android.cpp
@@ -645,7 +645,7 @@ bool OSystem_Android::hasFeature(Feature f) {
 	if (f == kFeatureOpenGLForGame) return true;
 	/* GLES2 always supports shaders */
 	if (f == kFeatureShadersForGame) return true;
-	if (f == kFeatureNEON) return _neonSupport;
+	if (f == kFeatureCpuNEON) return _neonSupport;
 	return ModularGraphicsBackend::hasFeature(f);
 }
 
diff --git a/common/system.h b/common/system.h
index 8dc35176971..3a79cc64c1a 100644
--- a/common/system.h
+++ b/common/system.h
@@ -584,28 +584,28 @@ public:
 		* Arm-v8 requires NEON extensions, but before that, NEON was just
 		* optional, so this signifies that the processor can use NEON.
 		*/
-		kFeatureNEON,
+		kFeatureCpuNEON,
 
 		/**
 		* For x86/x86_64 platforms that have SSE2 support
 		*/
-		kFeatureSSE2,
+		kFeatureCpuSSE2,
 
 		/**
 		* For x86/x86_64 platforms that have SSE4.1 support
 		*/
-		kFeatureSSE41,
+		kFeatureCpuSSE41,
 
 		/**
 		* For x86_64 platforms that have AVX2 support
 		*/
-		kFeatureAVX2,
+		kFeatureCpuAVX2,
 
 		/**
 		* For PowerPC platforms that have the altivec standard as of 1999.
 		* Covers a wide range of platforms, Apple Macs, XBox 360, PS3, and more
 		*/
-		kFeatureAltivec,
+		kFeatureCpuAltivec,
 	};
 
 	/**
diff --git a/graphics/blit-alpha.cpp b/graphics/blit-alpha.cpp
index a7c8cc7bd38..ccae2cde100 100644
--- a/graphics/blit-alpha.cpp
+++ b/graphics/blit-alpha.cpp
@@ -532,13 +532,13 @@ void BlendBlit::blit(byte *dst, const byte *src,
 		// Get the correct blit function
 		blitFunc = blitGeneric;
 #ifdef SCUMMVM_NEON
-		if (g_system->hasFeature(OSystem::kFeatureNEON)) blitFunc = blitNEON;
+		if (g_system->hasFeature(OSystem::kFeatureCpuNEON)) blitFunc = blitNEON;
 #endif
 #ifdef SCUMMVM_SSE2
-		if (g_system->hasFeature(OSystem::kFeatureSSE2)) blitFunc = blitSSE2;
+		if (g_system->hasFeature(OSystem::kFeatureCpuSSE2)) blitFunc = blitSSE2;
 #endif
 #ifdef SCUMMVM_AVX2
-		if (g_system->hasFeature(OSystem::kFeatureAVX2)) blitFunc = blitAVX2;
+		if (g_system->hasFeature(OSystem::kFeatureCpuAVX2)) blitFunc = blitAVX2;
 #endif
 	}
 	


Commit: 9b312eb16af515f4aa8ca94843a12249a4a04b2a
    https://github.com/scummvm/scummvm/commit/9b312eb16af515f4aa8ca94843a12249a4a04b2a
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
BUILD: Removed unnecessary SIMD engine features

Changed paths:
    configure


diff --git a/configure b/configure
index 04b87e7cc52..390f0de91b3 100755
--- a/configure
+++ b/configure
@@ -296,9 +296,6 @@ add_feature zlib "zlib" "_zlib"
 add_feature lua "lua" "_lua"
 add_feature fribidi "FriBidi" "_fribidi"
 add_feature test_cxx11 "Test C++11" "_test_cxx11"
-add_feature ext_sse2 "Add x86/64 SSE2 support" "_ext_sse2"
-add_feature ext_avx2 "Add x86/64 AVX2 support" "_ext_avx2"
-add_feature ext_neon "Add Arm NEON support" "_ext_neon"
 
 # Directories for installing ScummVM.
 # This list is closely based on what GNU autoconf does,
@@ -985,9 +982,9 @@ Optional Features:
   --disable-windows-console do not show console output on Windows
   --enable-windows-unicode  use Windows Unicode APIs (default)
   --disable-windows-unicode use Windows ANSI APIs
-	--enable-ext-sse2				  allow code to use sse2 extensions on x86/64
-	--enable-ext-avx2				  allow code to use avx2 extensions on x86/64
-	--enable-ext-neon				  allow code to use neon extensions on Arm
+  --enable-ext-sse2         allow code to use sse2 extensions on x86/64
+  --enable-ext-avx2         allow code to use avx2 extensions on x86/64
+  --enable-ext-neon         allow code to use neon extensions on Arm
 
 Optional Documentation Options:
   --with-manual-version=VERSION version to use when generating the manual (optional)
@@ -1301,12 +1298,12 @@ for ac_option in $@; do
 	--disable-eventrecorder)     _eventrec=no            ;;
 	--enable-text-console)       _text_console=yes       ;;
 	--disable-text-console)      _text_console=no        ;;
-	--enable-ext-sse2)           _ext_sse2=yes       		 ;;
-	--disable-ext-sse2)          _ext_sse2=no        		 ;;
-	--enable-ext-avx2)           _ext_avx2=yes       		 ;;
-	--disable-ext-avx2)          _ext_avx2=no        		 ;;
-	--enable-ext-neon)           _ext_neon=yes       		 ;;
-	--disable-ext-neon)          _ext_neon=no        		 ;;
+	--enable-ext-sse2)           _ext_sse2=yes           ;;
+	--disable-ext-sse2)          _ext_sse2=no            ;;
+	--enable-ext-avx2)           _ext_avx2=yes           ;;
+	--disable-ext-avx2)          _ext_avx2=no            ;;
+	--enable-ext-neon)           _ext_neon=yes           ;;
+	--disable-ext-neon)          _ext_neon=no            ;;
 	--with-fluidsynth-prefix=*)
 		arg=`echo $ac_option | cut -d '=' -f 2`
 		FLUIDSYNTH_CFLAGS="-I$arg/include"


Commit: 975808bac4841240f2d501d60237cbe7b459745d
    https://github.com/scummvm/scummvm/commit/975808bac4841240f2d501d60237cbe7b459745d
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
GRAPHICS: Moved blit files into graphics/blit

Changed paths:
  A graphics/blit/blit-alpha.cpp
  A graphics/blit/blit-atari.cpp
  A graphics/blit/blit-avx2.cpp
  A graphics/blit/blit-neon.cpp
  A graphics/blit/blit-scale.cpp
  A graphics/blit/blit-sse2.cpp
  A graphics/blit/blit.cpp
  R graphics/blit-alpha.cpp
  R graphics/blit-atari.cpp
  R graphics/blit-avx2.cpp
  R graphics/blit-neon.cpp
  R graphics/blit-scale.cpp
  R graphics/blit-sse2.cpp
  R graphics/blit.cpp
    graphics/module.mk


diff --git a/graphics/blit-alpha.cpp b/graphics/blit/blit-alpha.cpp
similarity index 99%
rename from graphics/blit-alpha.cpp
rename to graphics/blit/blit-alpha.cpp
index ccae2cde100..b7e2b5f723b 100644
--- a/graphics/blit-alpha.cpp
+++ b/graphics/blit/blit-alpha.cpp
@@ -23,9 +23,9 @@
 #include "graphics/pixelformat.h"
 #include "common/system.h"
 
-#include "graphics/blit-neon.cpp"
-#include "graphics/blit-sse2.cpp"
-#include "graphics/blit-avx2.cpp"
+#include "graphics/blit/blit-neon.cpp"
+#include "graphics/blit/blit-sse2.cpp"
+#include "graphics/blit/blit-avx2.cpp"
 
 namespace Graphics {
 
diff --git a/graphics/blit-atari.cpp b/graphics/blit/blit-atari.cpp
similarity index 100%
rename from graphics/blit-atari.cpp
rename to graphics/blit/blit-atari.cpp
diff --git a/graphics/blit-avx2.cpp b/graphics/blit/blit-avx2.cpp
similarity index 100%
rename from graphics/blit-avx2.cpp
rename to graphics/blit/blit-avx2.cpp
diff --git a/graphics/blit-neon.cpp b/graphics/blit/blit-neon.cpp
similarity index 100%
rename from graphics/blit-neon.cpp
rename to graphics/blit/blit-neon.cpp
diff --git a/graphics/blit-scale.cpp b/graphics/blit/blit-scale.cpp
similarity index 100%
rename from graphics/blit-scale.cpp
rename to graphics/blit/blit-scale.cpp
diff --git a/graphics/blit-sse2.cpp b/graphics/blit/blit-sse2.cpp
similarity index 100%
rename from graphics/blit-sse2.cpp
rename to graphics/blit/blit-sse2.cpp
diff --git a/graphics/blit.cpp b/graphics/blit/blit.cpp
similarity index 100%
rename from graphics/blit.cpp
rename to graphics/blit/blit.cpp
diff --git a/graphics/module.mk b/graphics/module.mk
index 0d12dc92c88..e27c44785ed 100644
--- a/graphics/module.mk
+++ b/graphics/module.mk
@@ -2,9 +2,9 @@ MODULE := graphics
 
 MODULE_OBJS := \
 	big5.o \
-	blit.o \
-	blit-alpha.o \
-	blit-scale.o \
+	blit/blit.o \
+	blit/blit-alpha.o \
+	blit/blit-scale.o \
 	cursorman.o \
 	font.o \
 	fontman.o \


Commit: e61fce02c8f7f8b839b47d6a9af8caf5fb4b44de
    https://github.com/scummvm/scummvm/commit/e61fce02c8f7f8b839b47d6a9af8caf5fb4b44de
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
GRAPHICS: Fix blend blit indentation

Changed paths:
    graphics/blit/blit-avx2.cpp
    graphics/blit/blit-sse2.cpp


diff --git a/graphics/blit/blit-avx2.cpp b/graphics/blit/blit-avx2.cpp
index d06a0ab310e..b7768457a97 100644
--- a/graphics/blit/blit-avx2.cpp
+++ b/graphics/blit/blit-avx2.cpp
@@ -277,7 +277,7 @@ void BlendBlitImpl::blitInnerLoopAVX2(BlendBlit::Args &args) {
 	int scaleXCtr, scaleYCtr = args.scaleYoff;
 	const byte *inBase;
 
-    if (!doscale && (args.flipping & FLIP_H)) args.ino -= 4 * 7;
+	if (!doscale && (args.flipping & FLIP_H)) args.ino -= 4 * 7;
 
 	for (uint32 i = 0; i < args.height; i++) {
 		if (doscale) {
diff --git a/graphics/blit/blit-sse2.cpp b/graphics/blit/blit-sse2.cpp
index 1925490bdaa..c1b15c14354 100644
--- a/graphics/blit/blit-sse2.cpp
+++ b/graphics/blit/blit-sse2.cpp
@@ -98,14 +98,14 @@ struct AlphaBlend {
 template<bool doscale, bool rgbmod, bool alphamod>
 struct MultiplyBlend {
 	static inline __m128i simd(__m128i src, __m128i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
-    	__m128i ina;
-	    if (alphamod)
+		__m128i ina;
+		if (alphamod)
 			ina = _mm_srli_epi32(_mm_mullo_epi16(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask)), _mm_set1_epi32(ca)), 8);
-	    else
+		else
 			ina = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
-	    __m128i alphaMask = _mm_cmpeq_epi32(ina, _mm_setzero_si128());
+		__m128i alphaMask = _mm_cmpeq_epi32(ina, _mm_setzero_si128());
 
-    	if (rgbmod) {
+		if (rgbmod) {
 			__m128i srcb = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
 			__m128i srcg = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
 			__m128i srcr = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);


Commit: df2367c3c922aaf9d33a8ddc6c9850492864525b
    https://github.com/scummvm/scummvm/commit/df2367c3c922aaf9d33a8ddc6c9850492864525b
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
GRAPHICS: Change blendBlitFrom's format function

Changed paths:
    graphics/managed_surface.cpp
    graphics/managed_surface.h


diff --git a/graphics/managed_surface.cpp b/graphics/managed_surface.cpp
index ae6e7ebff84..7ca4e55623f 100644
--- a/graphics/managed_surface.cpp
+++ b/graphics/managed_surface.cpp
@@ -735,7 +735,7 @@ Common::Rect ManagedSurface::blendBlitFrom(const ManagedSurface &src, const Comm
 										   const TSpriteBlendMode blend,
 										   const AlphaType alphaType) {
 	Common::Rect srcArea = srcRect, dstArea = destRect;
-	if (format != getSupportedBlendBlitPixelFormat() || src.format != getSupportedBlendBlitPixelFormat()) {
+	if (!isBlendBlitPixelFormatSupported(src.format, format)) {
 		warning("ManagedSurface::blendBlitFrom only accepts RGBA32!");
 		return Common::Rect(0, 0, 0, 0);
 	}
diff --git a/graphics/managed_surface.h b/graphics/managed_surface.h
index bca2a7b262a..b45d218348b 100644
--- a/graphics/managed_surface.h
+++ b/graphics/managed_surface.h
@@ -525,15 +525,12 @@ public:
 	}
 	
 	/**
-	 * Returns the pixel format all operations of blendBlitFrom support.
-	 *
-	 * Unlike normal blit functions, blendBlitFrom only works with a fixed pixel
-	 * format. This format can be queried using this static function.
-	 *
-	 * @return Supported pixel format.
+	 * ManagedSurface::blendBlitFrom is meant to be a highly optimized
+	 * blending/blitting function, so it can only accept certain format combinations.
+	 * @return true if the formats can be used by blendBlitFrom.
 	 */
-	static inline PixelFormat getSupportedBlendBlitPixelFormat() {
-		return BlendBlit::getSupportedPixelFormat();
+	static inline bool isBlendBlitPixelFormatSupported(const PixelFormat &src, const PixelFormat &dst) {
+		return BlendBlit::getSupportedPixelFormat() == src && BlendBlit::getSupportedPixelFormat() == dst;
 	}
 
 	/**


Commit: 41a942c5ce91810552692025700170754e21e6f2
    https://github.com/scummvm/scummvm/commit/41a942c5ce91810552692025700170754e21e6f2
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
ALL: Renamed TS_ARGB to MS_ARGB

Changed paths:
    engines/griffon/cutscenes.cpp
    engines/griffon/dialogs.cpp
    engines/griffon/draw.cpp
    engines/griffon/logic.cpp
    engines/griffon/resources.cpp
    engines/griffon/sound.cpp
    engines/ngi/gfx.cpp
    engines/sludge/backdrop.cpp
    engines/sludge/sprites.cpp
    engines/sludge/thumbnail.cpp
    engines/sludge/transition.cpp
    engines/sword25/gfx/image/vectorimagerenderer.cpp
    engines/wage/guiborders.cpp
    engines/wintermute/base/gfx/osystem/base_surface_osystem.cpp
    graphics/blit.h
    graphics/macgui/macwindowmanager.cpp
    graphics/managed_surface.cpp
    graphics/managed_surface.h
    graphics/nine_patch.cpp
    graphics/transform_struct.h
    graphics/transparent_surface.h
    test/image/blending.h


diff --git a/engines/griffon/cutscenes.cpp b/engines/griffon/cutscenes.cpp
index fe1754d4c44..0ca9c48634b 100644
--- a/engines/griffon/cutscenes.cpp
+++ b/engines/griffon/cutscenes.cpp
@@ -169,7 +169,7 @@ void GriffonEngine::showLogos() {
 		}
 
 		_videoBuffer->fillRect(Common::Rect(0, 0, 320, 240), 0);
-		_logosImg->blit(*_videoBuffer, 0, 0, Graphics::FLIP_NONE, nullptr, TS_ARGB((int)y, (int)y, (int)y, (int)y));
+		_logosImg->blit(*_videoBuffer, 0, 0, Graphics::FLIP_NONE, nullptr, MS_ARGB((int)y, (int)y, (int)y, (int)y));
 
 		g_system->copyRectToScreen(_videoBuffer->getPixels(), _videoBuffer->pitch, 0, 0, _videoBuffer->w, _videoBuffer->h);
 		g_system->updateScreen();
diff --git a/engines/griffon/dialogs.cpp b/engines/griffon/dialogs.cpp
index c4dc1465b86..45e62744a94 100644
--- a/engines/griffon/dialogs.cpp
+++ b/engines/griffon/dialogs.cpp
@@ -316,14 +316,14 @@ void GriffonEngine::configMenu() {
 		rcDest.setWidth(320);
 		rcDest.setHeight(240);
 
-		_cloudImg->blit(*_videoBuffer, 0, 0, Graphics::FLIP_NONE, &rcDest, TS_ARGB(128, 255, 255, 255));
+		_cloudImg->blit(*_videoBuffer, 0, 0, Graphics::FLIP_NONE, &rcDest, MS_ARGB(128, 255, 255, 255));
 
 		rcDest.left = 256;
 		rcDest.top = 192;
 		rcDest.setWidth(320);
 		rcDest.setHeight(240);
 
-		_cloudImg->blit(*_videoBuffer, 0, 0, Graphics::FLIP_NONE, &rcDest, TS_ARGB(128, 255, 255, 255));
+		_cloudImg->blit(*_videoBuffer, 0, 0, Graphics::FLIP_NONE, &rcDest, MS_ARGB(128, 255, 255, 255));
 
 		configwindow->blit(*_videoBuffer);
 
@@ -661,14 +661,14 @@ void GriffonEngine::saveLoadNew() {
 		rcDest.setWidth(320);
 		rcDest.setHeight(240);
 
-		_cloudImg->blit(*_videoBuffer, 0, 0, Graphics::FLIP_NONE, &rcDest, TS_ARGB(128, 255, 255, 255));
+		_cloudImg->blit(*_videoBuffer, 0, 0, Graphics::FLIP_NONE, &rcDest, MS_ARGB(128, 255, 255, 255));
 
 		rcDest.left = 256;
 		rcDest.top = 192;
 		rcDest.setWidth(320);
 		rcDest.setHeight(240);
 
-		_cloudImg->blit(*_videoBuffer, 0, 0, Graphics::FLIP_NONE, &rcDest, TS_ARGB(128, 255, 255, 255));
+		_cloudImg->blit(*_videoBuffer, 0, 0, Graphics::FLIP_NONE, &rcDest, MS_ARGB(128, 255, 255, 255));
 
 		_saveLoadImg->blit(*_videoBuffer);
 
diff --git a/engines/griffon/draw.cpp b/engines/griffon/draw.cpp
index f3da046f677..d35f2573c5c 100644
--- a/engines/griffon/draw.cpp
+++ b/engines/griffon/draw.cpp
@@ -229,7 +229,7 @@ void GriffonEngine::drawHud() {
 				rcDest.left = ix;
 				rcDest.top = iy;
 
-				_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, TS_ARGB(alpha, 255, 255, 255));
+				_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(alpha, 255, 255, 255));
 			}
 		}
 	}
@@ -587,7 +587,7 @@ void GriffonEngine::drawNPCs(int mode) {
 							int x = 192 + ((int)(_itemyloc + ff * 5) % 3) * 64;
 							if (x > 255)
 								x = 255;
-							_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, TS_ARGB(x, 255, 255, 255));
+							_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(x, 255, 255, 255));
 
 							for (int f = 1; f <= 8; f++) {
 								rcSrc.left = 16 * (int)(RND() * 2);
@@ -601,7 +601,7 @@ void GriffonEngine::drawNPCs(int mode) {
 								x = 192 + f % 3 * 64;
 								if (x > 255)
 									x = 255;
-								_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, TS_ARGB(x, 255, 255, 255));
+								_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(x, 255, 255, 255));
 							}
 
 							rcSrc.left = 0;
@@ -777,7 +777,7 @@ void GriffonEngine::drawNPCs(int mode) {
 						rcDest.left = sx + 32 + (int)(RND() * 3) - 1;
 						rcDest.top = sy - (int)(RND() * 6);
 
-						_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, TS_ARGB(alpha, 255, 255, 255));
+						_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(alpha, 255, 255, 255));
 					}
 
 					for (int ii = 0; ii <= 8; ii++) {
@@ -796,7 +796,7 @@ void GriffonEngine::drawNPCs(int mode) {
 
 							int alpha = i2 / 3 * 224;
 
-							_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, TS_ARGB(alpha, 255, 255, 255));
+							_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(alpha, 255, 255, 255));
 
 							int xloc = rcDest.left;
 							int yloc = rcDest.top;
diff --git a/engines/griffon/logic.cpp b/engines/griffon/logic.cpp
index 265ecf945b8..e6e8833d2cb 100644
--- a/engines/griffon/logic.cpp
+++ b/engines/griffon/logic.cpp
@@ -1307,7 +1307,7 @@ void GriffonEngine::updateSpells() {
 						}
 
 						if (xloc > -16 && xloc < 304 && yloc > -16 && yloc < 224) {
-							_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, TS_ARGB((int)alf, 255, 255, 255));
+							_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB((int)alf, 255, 255, 255));
 
 							if (_spellInfo[i].damagewho == 0) {
 								for (int e = 1; e <= _lastNpc; e++) {
@@ -1382,7 +1382,7 @@ void GriffonEngine::updateSpells() {
 				rcDest.left = xloc;
 				rcDest.top = yloc;
 
-				_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, TS_ARGB(255, 255, 255, 255));
+				_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(255, 255, 255, 255));
 
 				_spellInfo[i].frame = _spellInfo[i].frame - 0.2 * _fpsr;
 				if (_spellInfo[i].frame < 0)
@@ -1504,7 +1504,7 @@ void GriffonEngine::updateSpells() {
 						rcDest.top = yloc;
 
 						if (xloc > -16 && xloc < 304 && yloc > -16 && yloc < 224) {
-							_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, TS_ARGB(255, 255, 255, 255));
+							_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(255, 255, 255, 255));
 
 							if (scatter) {
 								if (_spellInfo[i].damagewho == 0) {
@@ -1583,7 +1583,7 @@ void GriffonEngine::updateSpells() {
 				if (fra > 24)
 					f = 192 * (1 - (fra - 24) / 8);
 
-				_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, TS_ARGB(f, 255, 255, 255));
+				_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(f, 255, 255, 255));
 
 				_spellInfo[i].frame = _spellInfo[i].frame - 0.3 * _fpsr;
 				if (_spellInfo[i].frame < 0) {
@@ -1680,7 +1680,7 @@ void GriffonEngine::updateSpells() {
 						rcDest.left = xloc;
 						rcDest.top = yloc;
 
-						_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, TS_ARGB(alpha, 255, 255, 255));
+						_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(alpha, 255, 255, 255));
 					}
 				} else {
 
@@ -1712,7 +1712,7 @@ void GriffonEngine::updateSpells() {
 							rcDest.left = xloc;
 							rcDest.top = yloc;
 
-							_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, TS_ARGB(192, 255, 255, 255));
+							_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(192, 255, 255, 255));
 						}
 
 						if (xloc < -1 || yloc < -1 || xloc > 304 || yloc > 224)
@@ -2089,7 +2089,7 @@ void GriffonEngine::updateSpellsUnder() {
 				if (fra > 24)
 					f = 160 * (1 - (fra - 24) / 8);
 
-				_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, TS_ARGB(f, 255, 255, 255));
+				_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(f, 255, 255, 255));
 
 				_spellInfo[i].frame = _spellInfo[i].frame - 0.2 * _fpsr;
 				if (_spellInfo[i].frame < 0)
@@ -2191,7 +2191,7 @@ void GriffonEngine::updateSpellsUnder() {
 							rcDest.top = (int)yloc;
 
 							if (xloc > -1 && xloc < 304 && yloc > -1 && yloc < 224) {
-								_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, TS_ARGB(alpha, 255, 255, 255));
+								_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(alpha, 255, 255, 255));
 
 								int sx = (xloc / 2 + 4);
 								int sy = (yloc / 2 + 8);
@@ -2313,7 +2313,7 @@ void GriffonEngine::updateSpellsUnder() {
 					rcDest.top = yloc;
 
 					if (xloc > -16 && xloc < 320 && yloc > -16 && yloc < 240) {
-						_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, TS_ARGB(alpha, 255, 255, 255));
+						_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(alpha, 255, 255, 255));
 
 						if (_spellInfo[i].damagewho == 1) {
 							float xdif = (xloc + 8) - (_player.px + 12);
diff --git a/engines/griffon/resources.cpp b/engines/griffon/resources.cpp
index 5f13bc6284a..1e23c7504be 100644
--- a/engines/griffon/resources.cpp
+++ b/engines/griffon/resources.cpp
@@ -321,7 +321,7 @@ void GriffonEngine::loadMap(int mapnum) {
 						}
 					}
 
-					_tiles[curtilel]->blit(*_mapBg, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, TS_ARGB(alpha, 255, 255, 255));
+					_tiles[curtilel]->blit(*_mapBg, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(alpha, 255, 255, 255));
 
 					rcDest.left = x * 8;
 					rcDest.top = y * 8;
diff --git a/engines/griffon/sound.cpp b/engines/griffon/sound.cpp
index da20f1d988b..4c5f4d7b6d2 100644
--- a/engines/griffon/sound.cpp
+++ b/engines/griffon/sound.cpp
@@ -132,7 +132,7 @@ void GriffonEngine::setupAudio() {
 	rcDest.left = 160 - 44;
 	rcDest.top = 116 + 12;
 
-	loadimg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, TS_ARGB(160, 255, 255, 255));
+	loadimg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(160, 255, 255, 255));
 
 	g_system->copyRectToScreen(_videoBuffer->getPixels(), _videoBuffer->pitch, 0, 0, _videoBuffer->w, _videoBuffer->h);
 	g_system->updateScreen();
diff --git a/engines/ngi/gfx.cpp b/engines/ngi/gfx.cpp
index f817f211fab..9d1106078a4 100644
--- a/engines/ngi/gfx.cpp
+++ b/engines/ngi/gfx.cpp
@@ -781,7 +781,7 @@ void Bitmap::putDib(int x, int y, const Palette &palette, byte alpha) {
 	if (y1 < 0)
 		y1 = 0;
 
-	int alphac = TS_ARGB(alpha, 0xff, 0xff, 0xff);
+	int alphac = MS_ARGB(alpha, 0xff, 0xff, 0xff);
 
 	_surface->blit(g_nmi->_backgroundSurface, x1, y1, _flipping, &sub, alphac);
 	g_nmi->_system->copyRectToScreen(g_nmi->_backgroundSurface.getBasePtr(x1, y1), g_nmi->_backgroundSurface.pitch, x1, y1, sub.width(), sub.height());
@@ -946,7 +946,7 @@ void Bitmap::colorFill(uint32 *dest, int len, int32 color) {
 
 	g_nmi->_origFormat.colorToRGB(color, r, g, b);
 
-	uint32 c = TS_ARGB(0xff, r, g, b);
+	uint32 c = MS_ARGB(0xff, r, g, b);
 
 	for (int i = 0; i < len; i++)
 		*dest++ = c;
@@ -969,7 +969,7 @@ void Bitmap::paletteFill(uint32 *dest, byte *src, int len, const Palette &palett
 	for (int i = 0; i < len; i++) {
 		g_nmi->_origFormat.colorToRGB(palette.pal[*src++] & 0xffff, r, g, b);
 
-		*dest++ = TS_ARGB(0xff, r, g, b);
+		*dest++ = MS_ARGB(0xff, r, g, b);
 	}
 }
 
@@ -997,7 +997,7 @@ void Bitmap::copierKeyColor(uint32 *dest, byte *src, int len, int keyColor, cons
 		for (int i = 0; i < len; i++) {
 			if (*src != keyColor) {
 				g_nmi->_origFormat.colorToRGB(palette.pal[*src] & 0xffff, r, g, b);
-				*dest = TS_ARGB(0xff, r, g, b);
+				*dest = MS_ARGB(0xff, r, g, b);
 			}
 
 			dest++;
@@ -1009,7 +1009,7 @@ void Bitmap::copierKeyColor(uint32 *dest, byte *src, int len, int keyColor, cons
 		for (int i = 0; i < len; i++) {
 			if (*src16 != 0) {
 				g_nmi->_origFormat.colorToRGB(READ_LE_UINT16(src16), r, g, b);
-				*dest = TS_ARGB(0xff, r, g, b);
+				*dest = MS_ARGB(0xff, r, g, b);
 			}
 
 			dest++;
@@ -1042,14 +1042,14 @@ void Bitmap::copier(uint32 *dest, byte *src, int len, const Palette &palette, bo
 		for (int i = 0; i < len; i++) {
 			g_nmi->_origFormat.colorToRGB(palette.pal[*src++] & 0xffff, r, g, b);
 
-			*dest++ = TS_ARGB(0xff, r, g, b);
+			*dest++ = MS_ARGB(0xff, r, g, b);
 		}
 	} else {
 		int16 *src16 = (int16 *)src;
 
 		for (int i = 0; i < len; i++) {
 			g_nmi->_origFormat.colorToRGB(READ_LE_UINT16(src16++), r, g, b);
-			*dest++ = TS_ARGB(0xff, r, g, b);
+			*dest++ = MS_ARGB(0xff, r, g, b);
 		}
 	}
 }
diff --git a/engines/sludge/backdrop.cpp b/engines/sludge/backdrop.cpp
index a6710d7496f..eb01e8bff03 100644
--- a/engines/sludge/backdrop.cpp
+++ b/engines/sludge/backdrop.cpp
@@ -323,7 +323,7 @@ void GraphicsManager::drawHorizontalLine(uint x1, uint y, uint x2) {
 
 void GraphicsManager::darkScreen() {
 	Graphics::TransparentSurface tmp(_backdropSurface, false);
-	tmp.blit(_backdropSurface, 0, 0, Graphics::FLIP_NONE, nullptr, TS_ARGB(255 >> 1, 0, 0, 0));
+	tmp.blit(_backdropSurface, 0, 0, Graphics::FLIP_NONE, nullptr, MS_ARGB(255 >> 1, 0, 0, 0));
 
 	// reset zBuffer
 	if (_zBuffer->originalNum >= 0) {
@@ -360,7 +360,7 @@ bool GraphicsManager::loadLightMap(int v) {
 		if (_lightMapMode == LIGHTMAPMODE_HOTSPOT) {
 			return fatal("Light map width and height don't match scene width and height. That is required for lightmaps in HOTSPOT mode.");
 		} else if (_lightMapMode == LIGHTMAPMODE_PIXEL) {
-			tmp.blit(_lightMap, 0, 0, Graphics::FLIP_NONE, nullptr, TS_ARGB((uint)255, (uint)255, (uint)255, (uint)255), (int)_sceneWidth, (int)_sceneHeight);
+			tmp.blit(_lightMap, 0, 0, Graphics::FLIP_NONE, nullptr, MS_ARGB((uint)255, (uint)255, (uint)255, (uint)255), (int)_sceneWidth, (int)_sceneHeight);
 		} else {
 			_lightMap.copyFrom(tmp);
 		}
@@ -461,7 +461,7 @@ bool GraphicsManager::mixHSI(int num, Common::SeekableReadStream *stream, int x,
 		return false;
 
 	Graphics::TransparentSurface tmp(mixSurface, false);
-	tmp.blit(_backdropSurface, x, y, Graphics::FLIP_NONE, nullptr, TS_ARGB(255 >> 1, 255, 255, 255));
+	tmp.blit(_backdropSurface, x, y, Graphics::FLIP_NONE, nullptr, MS_ARGB(255 >> 1, 255, 255, 255));
 	mixSurface.free();
 
 	return true;
diff --git a/engines/sludge/sprites.cpp b/engines/sludge/sprites.cpp
index a345e776d07..4c1a2d6c030 100644
--- a/engines/sludge/sprites.cpp
+++ b/engines/sludge/sprites.cpp
@@ -276,7 +276,7 @@ void GraphicsManager::pasteSpriteToBackDrop(int x1, int y1, Sprite &single, cons
 	y1 -= single.yhot;
 	Graphics::TransparentSurface tmp(single.surface, false);
 	tmp.blit(_backdropSurface, x1, y1, Graphics::FLIP_NONE, nullptr,
-			TS_RGB(fontPal.originalRed, fontPal.originalGreen, fontPal.originalBlue));
+			MS_RGB(fontPal.originalRed, fontPal.originalGreen, fontPal.originalBlue));
 }
 
 // burnSpriteToBackDrop adds text in the colour specified by setBurnColour
@@ -293,7 +293,7 @@ void GraphicsManager::burnSpriteToBackDrop(int x1, int y1, Sprite &single, const
 	y1 -= single.yhot - 1;
 	Graphics::TransparentSurface tmp(single.burnSurface, false);
 	tmp.blit(_backdropSurface, x1, y1, Graphics::FLIP_NONE, nullptr,
-			TS_RGB(_currentBurnR, _currentBurnG, _currentBurnB));
+			MS_RGB(_currentBurnR, _currentBurnG, _currentBurnB));
 }
 
 void GraphicsManager::fontSprite(bool flip, int x, int y, Sprite &single, const SpritePalette &fontPal) {
@@ -302,11 +302,11 @@ void GraphicsManager::fontSprite(bool flip, int x, int y, Sprite &single, const
 
 	// Use Transparent surface to scale and blit
 	Graphics::TransparentSurface tmp(single.surface, false);
-	tmp.blit(_renderSurface, x1, y1, (flip ? Graphics::FLIP_H : Graphics::FLIP_NONE), 0, TS_RGB(fontPal.originalRed, fontPal.originalGreen, fontPal.originalBlue));
+	tmp.blit(_renderSurface, x1, y1, (flip ? Graphics::FLIP_H : Graphics::FLIP_NONE), 0, MS_RGB(fontPal.originalRed, fontPal.originalGreen, fontPal.originalBlue));
 
 	if (single.burnSurface.getPixels() != nullptr) {
 		Graphics::TransparentSurface tmp2(single.burnSurface, false);
-		tmp2.blit(_renderSurface, x1, y1, (flip ? Graphics::FLIP_H : Graphics::FLIP_NONE), 0, TS_RGB(fontPal.originalRed, fontPal.originalGreen, fontPal.originalBlue));
+		tmp2.blit(_renderSurface, x1, y1, (flip ? Graphics::FLIP_H : Graphics::FLIP_NONE), 0, MS_RGB(fontPal.originalRed, fontPal.originalGreen, fontPal.originalBlue));
 
 	}
 }
@@ -329,7 +329,7 @@ void GraphicsManager::blendColor(Graphics::Surface *blitted, uint32 color, Graph
 	Graphics::TransparentSurface tmp;
 	tmp.create(blitted->w, blitted->h, blitted->format);
 	tmp.fillRect(Common::Rect(0, 0, tmp.w, tmp.h), color);
-	tmp.blit(*blitted, 0, 0, Graphics::FLIP_NONE, nullptr, TS_ARGB((uint)255, (uint)255, (uint)255, (uint)255), (int)blitted->w, (int)blitted->h, mode);
+	tmp.blit(*blitted, 0, 0, Graphics::FLIP_NONE, nullptr, MS_ARGB((uint)255, (uint)255, (uint)255, (uint)255), (int)blitted->w, (int)blitted->h, mode);
 	tmp.free();
 }
 
@@ -365,7 +365,7 @@ Graphics::Surface *GraphicsManager::applyLightmapToSprite(Graphics::Surface *&bl
 			tmp.blit(*blitted, 0, 0,
 					(mirror ? Graphics::FLIP_H : Graphics::FLIP_NONE),
 					(mirror ? &rect_h : &rect_none),
-					TS_ARGB((uint)255, (uint)255, (uint)255, (uint)255),
+					MS_ARGB((uint)255, (uint)255, (uint)255, (uint)255),
 					(int)blitted->w, (int)blitted->h, Graphics::BLEND_MULTIPLY);
 		} else {
 			curLight[0] = curLight[1] = curLight[2] = 255;
@@ -383,15 +383,15 @@ Graphics::Surface *GraphicsManager::applyLightmapToSprite(Graphics::Surface *&bl
 		fb = curLight[2]*thisPerson->b * thisPerson->colourmix / 65025 / 255.0F;
 	}
 
-	uint32 primaryColor = TS_ARGB(255,
+	uint32 primaryColor = MS_ARGB(255,
 			(uint8)(fr + curLight[0] * (255 - thisPerson->colourmix) / 255.f),
 			(uint8)(fg + curLight[1] * (255 - thisPerson->colourmix) / 255.f),
 			(uint8)(fb + curLight[2] * (255 - thisPerson->colourmix) / 255.f));
 
-	uint32 secondaryColor = TS_ARGB(0xff, (uint8)(fr * 255), (uint8)(fg * 255), (uint8)(fb * 255));
+	uint32 secondaryColor = MS_ARGB(0xff, (uint8)(fr * 255), (uint8)(fg * 255), (uint8)(fb * 255));
 
 	// apply primary color
-	if (primaryColor != (uint32)TS_ARGB(255, 255, 255, 255)) {
+	if (primaryColor != (uint32)MS_ARGB(255, 255, 255, 255)) {
 		if (!toDetele) {
 			toDetele = blitted = duplicateSurface(blitted);
 			blendColor(blitted, primaryColor, Graphics::BLEND_MULTIPLY);
@@ -451,7 +451,7 @@ bool GraphicsManager::scaleSprite(Sprite &single, const SpritePalette &fontPal,
 	// Use Transparent surface to scale and blit
 	if (!_zBuffer->numPanels) {
 		Graphics::TransparentSurface tmp(*blitted, false);
-		tmp.blit(_renderSurface, x1, y1, (mirror ? Graphics::FLIP_H : Graphics::FLIP_NONE), nullptr, TS_ARGB(255 - thisPerson->transparency, 255, 255, 255), diffX, diffY);
+		tmp.blit(_renderSurface, x1, y1, (mirror ? Graphics::FLIP_H : Graphics::FLIP_NONE), nullptr, MS_ARGB(255 - thisPerson->transparency, 255, 255, 255), diffX, diffY);
 		if (ptr) {
 			ptr->free();
 			delete ptr;
@@ -512,7 +512,7 @@ void GraphicsManager::displaySpriteLayers() {
 		SpriteLayer::iterator it;
 		for (it = _spriteLayers->layer[i].begin(); it != _spriteLayers->layer[i].end(); ++it) {
 			Graphics::TransparentSurface tmp(*(*it)->surface, false);
-			tmp.blit(_renderSurface, (*it)->x, (*it)->y, (*it)->flip, nullptr, TS_ARGB((*it)->transparency, 255, 255, 255), (*it)->width, (*it)->height);
+			tmp.blit(_renderSurface, (*it)->x, (*it)->y, (*it)->flip, nullptr, MS_ARGB((*it)->transparency, 255, 255, 255), (*it)->width, (*it)->height);
 		}
 	}
 	killSpriteLayers();
@@ -567,7 +567,7 @@ void GraphicsManager::fixScaleSprite(int x, int y, Sprite &single, const SpriteP
 	// draw sprite
 	if (!_zBuffer->numPanels) {
 		Graphics::TransparentSurface tmp(single.surface, false);
-		tmp.blit(_renderSurface, x1, y1, (mirror ? Graphics::FLIP_H : Graphics::FLIP_NONE), nullptr, TS_ARGB((uint)255, (uint)255, (uint)255, (uint)255), diffX, diffY);
+		tmp.blit(_renderSurface, x1, y1, (mirror ? Graphics::FLIP_H : Graphics::FLIP_NONE), nullptr, MS_ARGB((uint)255, (uint)255, (uint)255, (uint)255), diffX, diffY);
 		if (ptr) {
 			ptr->free();
 			delete ptr;
diff --git a/engines/sludge/thumbnail.cpp b/engines/sludge/thumbnail.cpp
index b1232abb8d5..56234bc2868 100644
--- a/engines/sludge/thumbnail.cpp
+++ b/engines/sludge/thumbnail.cpp
@@ -115,7 +115,7 @@ void GraphicsManager::showThumbnail(const Common::String &filename, int atX, int
 		if (fileHeight + atY > (int)_sceneHeight)
 			fileHeight = _sceneHeight - atY;
 
-		thumbnail.blit(_backdropSurface, atX, atY, Graphics::FLIP_NONE, nullptr, TS_ARGB((uint)255, (uint)255, (uint)255, (uint)255), fileWidth, fileHeight);
+		thumbnail.blit(_backdropSurface, atX, atY, Graphics::FLIP_NONE, nullptr, MS_ARGB((uint)255, (uint)255, (uint)255, (uint)255), fileWidth, fileHeight);
 		thumbnail.free();
 	}
 }
diff --git a/engines/sludge/transition.cpp b/engines/sludge/transition.cpp
index 8b7fbbbe845..00b6f074b1f 100644
--- a/engines/sludge/transition.cpp
+++ b/engines/sludge/transition.cpp
@@ -37,7 +37,7 @@ void GraphicsManager::setBrightnessLevel(int brightnessLevel) {
 unsigned lastFrom, lastTo;
 
 void GraphicsManager::transitionFader() {
-	blendColor(&_renderSurface, TS_ARGB(255 - _brightnessLevel, 0, 0, 0), Graphics::BLEND_NORMAL);
+	blendColor(&_renderSurface, MS_ARGB(255 - _brightnessLevel, 0, 0, 0), Graphics::BLEND_NORMAL);
 }
 
 void GraphicsManager::transitionCrossFader() {
@@ -48,7 +48,7 @@ void GraphicsManager::transitionCrossFader() {
 		return;
 
 	Graphics::TransparentSurface tmp(_snapshotSurface, false);
-	tmp.blit(_renderSurface, 0, 0, Graphics::FLIP_NONE, nullptr, TS_ARGB(255 - _brightnessLevel, 0xff, 0xff, 0xff));
+	tmp.blit(_renderSurface, 0, 0, Graphics::FLIP_NONE, nullptr, MS_ARGB(255 - _brightnessLevel, 0xff, 0xff, 0xff));
 }
 
 void GraphicsManager::transitionSnapshotBox() {
diff --git a/engines/sword25/gfx/image/vectorimagerenderer.cpp b/engines/sword25/gfx/image/vectorimagerenderer.cpp
index e6265f4bc2d..01d0842dcf2 100644
--- a/engines/sword25/gfx/image/vectorimagerenderer.cpp
+++ b/engines/sword25/gfx/image/vectorimagerenderer.cpp
@@ -49,7 +49,7 @@ void art_rgb_fill_run1(byte *buf, byte r, byte g, byte b, int n) {
 		memset(buf, g, n + n + n + n);
 	} else {
 		uint32 *alt = (uint32 *)buf;
-		uint32 color = TS_RGB(r, g, b);
+		uint32 color = MS_RGB(r, g, b);
 
 		for (i = 0; i < n; i++)
 			*alt++ = color;
diff --git a/engines/wage/guiborders.cpp b/engines/wage/guiborders.cpp
index 0cf2ecbf512..d82c20459c3 100644
--- a/engines/wage/guiborders.cpp
+++ b/engines/wage/guiborders.cpp
@@ -251,15 +251,15 @@ void Gui::loadBorder(Graphics::MacWindow *target, const char *border[], uint hei
 		for (uint x = 0; x < width; x++) {
 			switch(border[y][x * 2]) {
 			case ' ':
-				*dst = TS_RGB(0, 0, 0);
+				*dst = MS_RGB(0, 0, 0);
 				break;
 
 			case '#':
-				*dst = TS_RGB(0xff, 0xff, 0xff);
+				*dst = MS_RGB(0xff, 0xff, 0xff);
 				break;
 
 			case '.':
-				*dst = TS_RGB(0xff, 0, 0xff);
+				*dst = MS_RGB(0xff, 0, 0xff);
 				break;
 
 			default:
diff --git a/engines/wintermute/base/gfx/osystem/base_surface_osystem.cpp b/engines/wintermute/base/gfx/osystem/base_surface_osystem.cpp
index f5c7d975a7f..b310683064d 100644
--- a/engines/wintermute/base/gfx/osystem/base_surface_osystem.cpp
+++ b/engines/wintermute/base/gfx/osystem/base_surface_osystem.cpp
@@ -42,7 +42,7 @@
 #include "common/system.h"
 
 #define TS_COLOR(wmeColor) \
-	TS_ARGB(RGBCOLGetA(wmeColor), RGBCOLGetR(wmeColor), RGBCOLGetG(wmeColor), RGBCOLGetB(wmeColor))
+	MS_ARGB(RGBCOLGetA(wmeColor), RGBCOLGetR(wmeColor), RGBCOLGetG(wmeColor), RGBCOLGetB(wmeColor))
 
 namespace Wintermute {
 
diff --git a/graphics/blit.h b/graphics/blit.h
index 8336c30ba34..85bae19d5bb 100644
--- a/graphics/blit.h
+++ b/graphics/blit.h
@@ -284,8 +284,8 @@ public:
 	/**
 	 * Returns the pixel format all operations of BlendBlit::blit support.
 	 *
-	 * Use TS_ARGB and TS_RGB to quickly make a color in this format.
-	 * TS_ARGB/RGB are found in graphics/transform_struct.h
+	 * Use MS_ARGB and MS_RGB to quickly make a color in this format.
+	 * MS_ARGB/RGB are found in graphics/transform_struct.h
 	 *
 	 * @return Supported pixel format.
 	 */
diff --git a/graphics/macgui/macwindowmanager.cpp b/graphics/macgui/macwindowmanager.cpp
index 66d016cf0f9..5bb41a4e1a5 100644
--- a/graphics/macgui/macwindowmanager.cpp
+++ b/graphics/macgui/macwindowmanager.cpp
@@ -812,7 +812,7 @@ void MacWindowManager::loadDesktop() {
 void MacWindowManager::setDesktopColor(byte r, byte g, byte b) {
 	cleanupDesktopBmp();
 	_desktopBmp = new Graphics::TransparentSurface();
-	uint32 color = TS_RGB(r, g, b);
+	uint32 color = MS_RGB(r, g, b);
 
 	const Graphics::PixelFormat requiredFormat_4byte(4, 8, 8, 8, 8, 0, 8, 16, 24);
 	Graphics::ManagedSurface *source = new Graphics::ManagedSurface();
diff --git a/graphics/managed_surface.cpp b/graphics/managed_surface.cpp
index 7ca4e55623f..2e7db5641ad 100644
--- a/graphics/managed_surface.cpp
+++ b/graphics/managed_surface.cpp
@@ -741,7 +741,7 @@ Common::Rect ManagedSurface::blendBlitFrom(const ManagedSurface &src, const Comm
 	}
 
 	// Alpha is zero
-	if ((colorMod & TS_ARGB(255, 0, 0, 0)) == 0) return Common::Rect(0, 0, 0, 0);
+	if ((colorMod & MS_ARGB(255, 0, 0, 0)) == 0) return Common::Rect(0, 0, 0, 0);
 
 	const int scaleX = BlendBlit::getScaleFactor(srcArea.width(), dstArea.width());
 	const int scaleY = BlendBlit::getScaleFactor(srcArea.height(), dstArea.height());
diff --git a/graphics/managed_surface.h b/graphics/managed_surface.h
index b45d218348b..184782dbd15 100644
--- a/graphics/managed_surface.h
+++ b/graphics/managed_surface.h
@@ -26,9 +26,11 @@
 #include "graphics/surface.h"
 #include "graphics/transform_struct.h"
 #include "common/types.h"
-#include "graphics/transparent_surface.h"
 #include "graphics/blit.h"
 
+#define MS_RGB(R,G,B)       (uint32)(((R) << 24) | ((G) << 16) | ((B) << 8) | 0xff)
+#define MS_ARGB(A,R,G,B)    (uint32)(((R) << 24) | ((G) << 16) | ((B) << 8) | (A))
+
 namespace Graphics {
 
 /**
@@ -546,7 +548,7 @@ public:
 	 */
 	Common::Rect blendBlitFrom(const ManagedSurface &src, const Common::Rect &srcRect,
 							   const Common::Rect &destRect, int flipping = FLIP_NONE,
-							   const uint32 colorMod = TS_ARGB(255, 255, 255, 255),
+							   const uint32 colorMod = MS_ARGB(255, 255, 255, 255),
 							   const TSpriteBlendMode blend = BLEND_NORMAL,
 							   const AlphaType alphaType = ALPHA_FULL);
 
diff --git a/graphics/nine_patch.cpp b/graphics/nine_patch.cpp
index e7f7fd58c8f..7f8a0f2d338 100644
--- a/graphics/nine_patch.cpp
+++ b/graphics/nine_patch.cpp
@@ -335,7 +335,7 @@ void NinePatchBitmap::drawRegions(Graphics::Surface &target, int dx, int dy, int
 						_h._m[j]->offset + _h._m[j]->length, _v._m[i]->offset + _v._m[i]->length);
 
 			_bmp->blit(target, dx + _h._m[j]->dest_offset, dy + _v._m[i]->dest_offset,
-					Graphics::FLIP_NONE, &r, TS_ARGB((uint)255, (uint)255, (uint)255, (uint)255),
+					Graphics::FLIP_NONE, &r, MS_ARGB((uint)255, (uint)255, (uint)255, (uint)255),
 					_h._m[j]->dest_length, _v._m[i]->dest_length);
 		}
 	}
@@ -370,7 +370,7 @@ void NinePatchBitmap::blitClip(Graphics::Surface &target, Common::Rect clip, int
 				_h._m[j]->offset + _h._m[j]->length, _v._m[i]->offset + _v._m[i]->length);
 
 			_bmp->blitClip(target, clip, dx + _h._m[j]->dest_offset, dy + _v._m[i]->dest_offset,
-				Graphics::FLIP_NONE, &r, TS_ARGB((uint)255, (uint)255, (uint)255, (uint)255),
+				Graphics::FLIP_NONE, &r, MS_ARGB((uint)255, (uint)255, (uint)255, (uint)255),
 				_h._m[j]->dest_length, _v._m[i]->dest_length);
 		}
 	}
@@ -379,10 +379,10 @@ void NinePatchBitmap::blitClip(Graphics::Surface &target, Common::Rect clip, int
 byte NinePatchBitmap::getColorIndex(uint32 target, byte* palette) {
 	byte *pal = palette;
 	uint i = 0;
-	uint32 color = TS_RGB(pal[0], pal[1], pal[2]);
+	uint32 color = MS_RGB(pal[0], pal[1], pal[2]);
 	while (color != target) {
 		i += 3;
-		color = TS_RGB(pal[i], pal[i + 1], pal[i + 2]);
+		color = MS_RGB(pal[i], pal[i + 1], pal[i + 2]);
 	}
 	return (i / 3);
 }
diff --git a/graphics/transform_struct.h b/graphics/transform_struct.h
index 6fd310eba34..80123be4b50 100644
--- a/graphics/transform_struct.h
+++ b/graphics/transform_struct.h
@@ -24,9 +24,6 @@
 
 #include "common/rect.h"
 
-#define TS_RGB(R,G,B)       (uint32)(((R) << 24) | ((G) << 16) | ((B) << 8) | 0xff)
-#define TS_ARGB(A,R,G,B)    (uint32)(((R) << 24) | ((G) << 16) | ((B) << 8) | (A))
-
 namespace Graphics {
 
 enum TSpriteBlendMode {
diff --git a/graphics/transparent_surface.h b/graphics/transparent_surface.h
index 31d5c5ef91d..073dd535939 100644
--- a/graphics/transparent_surface.h
+++ b/graphics/transparent_surface.h
@@ -23,6 +23,7 @@
 #define GRAPHICS_TRANSPARENTSURFACE_H
 
 #include "graphics/surface.h"
+#include "graphics/managed_surface.h"
 #include "graphics/transform_struct.h"
 #include "graphics/blit.h"
 
@@ -80,8 +81,8 @@ struct TransparentSurface : public Graphics::Surface {
 	 @param color an ARGB color value, which determines the parameters for the color modulation und alpha blending.<br>
 	 The alpha component of the color determines the alpha blending parameter (0 = no covering, 255 = full covering).<br>
 	 The color components determines the color for color modulation.<br>
-	 The default value is TS_ARGB(255, 255, 255, 255) (full covering, no color modulation).
-	 The macros TS_RGB and TS_ARGB can be used for the creation of the color value.
+	 The default value is MS_ARGB(255, 255, 255, 255) (full covering, no color modulation).
+	 The macros MS_RGB and MS_ARGB can be used for the creation of the color value.
 	 @param width the output width of the screen section.
 	 The images will be scaled if the output width of the screen section differs from the image section.<br>
 	 The value -1 determines that the image should not be scaled.<br>
@@ -95,14 +96,14 @@ struct TransparentSurface : public Graphics::Surface {
 	Common::Rect blit(Graphics::Surface &target, int posX = 0, int posY = 0,
 	                  int flipping = FLIP_NONE,
 	                  Common::Rect *pPartRect = nullptr,
-	                  uint color = TS_ARGB(255, 255, 255, 255),
+	                  uint color = MS_ARGB(255, 255, 255, 255),
 	                  int width = -1, int height = -1,
 	                  TSpriteBlendMode blend = BLEND_NORMAL);
 	Common::Rect blitClip(Graphics::Surface &target, Common::Rect clippingArea,
 						int posX = 0, int posY = 0,
 						int flipping = FLIP_NONE,
 						Common::Rect *pPartRect = nullptr,
-						uint color = TS_ARGB(255, 255, 255, 255),
+						uint color = MS_ARGB(255, 255, 255, 255),
 						int width = -1, int height = -1,
 						TSpriteBlendMode blend = BLEND_NORMAL);
 
diff --git a/test/image/blending.h b/test/image/blending.h
index 8ef2ca75904..23ed4648bd9 100644
--- a/test/image/blending.h
+++ b/test/image/blending.h
@@ -38,14 +38,14 @@ struct OldTransparentSurface : public Graphics::Surface {
 	Common::Rect blit(Graphics::Surface &target, int posX = 0, int posY = 0,
 	                  int flipping = FLIP_NONE,
 	                  Common::Rect *pPartRect = nullptr,
-	                  uint color = TS_ARGB(255, 255, 255, 255),
+	                  uint color = MS_ARGB(255, 255, 255, 255),
 	                  int width = -1, int height = -1,
 	                  TSpriteBlendMode blend = BLEND_NORMAL);
 	Common::Rect blitClip(Graphics::Surface &target, Common::Rect clippingArea,
 						int posX = 0, int posY = 0,
 						int flipping = FLIP_NONE,
 						Common::Rect *pPartRect = nullptr,
-						uint color = TS_ARGB(255, 255, 255, 255),
+						uint color = MS_ARGB(255, 255, 255, 255),
 						int width = -1, int height = -1,
 						TSpriteBlendMode blend = BLEND_NORMAL);
 	OldTransparentSurface *scale(int16 newWidth, int16 newHeight, bool filtering = false) const;
@@ -1060,12 +1060,12 @@ public:
         for (int rect = 0; rect < (int)(sizeof(srcs)/sizeof(srcs[0])); rect++) {
             oldSurfDest.fillRect(Common::Rect(0, 0, oldSurfDest.w, oldSurfDest.h), oldSurfDest.format.ARGBToColor(ba, br, bg, bb));
             oldSurf.setAlphaMode((Graphics::AlphaType)alphaType);
-            Common::Rect ret1 = oldSurf.blit(oldSurfDest, dsts[rect].left, dsts[rect].top, flipping, &srcs[rect], TS_ARGB(a, r, g, b), dsts[rect].width(), dsts[rect].height(), (Graphics::TSpriteBlendMode)blendMode);
+            Common::Rect ret1 = oldSurf.blit(oldSurfDest, dsts[rect].left, dsts[rect].top, flipping, &srcs[rect], MS_ARGB(a, r, g, b), dsts[rect].width(), dsts[rect].height(), (Graphics::TSpriteBlendMode)blendMode);
             newSurfDest.fillRect(Common::Rect(0, 0, newSurfDest.w, newSurfDest.h), newSurfDest.format.ARGBToColor(ba, br, bg, bb));
             newSurf.setAlphaMode((Graphics::AlphaType)alphaType);
-            Common::Rect ret2 = newSurf.blit(newSurfDest, dsts[rect].left, dsts[rect].top, flipping, &srcs[rect], TS_ARGB(a, r, g, b), dsts[rect].width(), dsts[rect].height(), (Graphics::TSpriteBlendMode)blendMode);
+            Common::Rect ret2 = newSurf.blit(newSurfDest, dsts[rect].left, dsts[rect].top, flipping, &srcs[rect], MS_ARGB(a, r, g, b), dsts[rect].width(), dsts[rect].height(), (Graphics::TSpriteBlendMode)blendMode);
             managedSurfDest.fillRect(Common::Rect(0, 0, managedSurfDest.w, managedSurfDest.h), managedSurfDest.format.ARGBToColor(ba, br, bg, bb));
-            Common::Rect ret3 = managedSurfDest.blendBlitFrom(managedSurf, srcs[rect], dsts[rect], flipping, TS_ARGB(a, r, g, b), (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
+            Common::Rect ret3 = managedSurfDest.blendBlitFrom(managedSurf, srcs[rect], dsts[rect], flipping, MS_ARGB(a, r, g, b), (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
 
 			if (ret1 != ret2 || ret2 != ret3 || ret1 != ret3) {
                 warning("blendMode: %s, alphaType: %s, a: %d, r: %d, g: %d, b: %d, flipping: %s, test rect id: %s",
@@ -1110,10 +1110,10 @@ public:
 			
             oldSurfDest.fillRect(Common::Rect(0, 0, oldSurfDest.w, oldSurfDest.h), oldSurfDest.format.ARGBToColor(ba, br, bg, bb));
             oldSurf.setAlphaMode((Graphics::AlphaType)alphaType);
-            ret1 = oldSurf.blitClip(oldSurfDest, Common::Rect(2, 2, oldSurfDest.w - 2, oldSurfDest.h - 2), dsts[rect].left, dsts[rect].top, flipping, &srcs[rect], TS_ARGB(a, r, g, b), dsts[rect].width(), dsts[rect].height(), (Graphics::TSpriteBlendMode)blendMode);
+            ret1 = oldSurf.blitClip(oldSurfDest, Common::Rect(2, 2, oldSurfDest.w - 2, oldSurfDest.h - 2), dsts[rect].left, dsts[rect].top, flipping, &srcs[rect], MS_ARGB(a, r, g, b), dsts[rect].width(), dsts[rect].height(), (Graphics::TSpriteBlendMode)blendMode);
             newSurfDest.fillRect(Common::Rect(0, 0, newSurfDest.w, newSurfDest.h), newSurfDest.format.ARGBToColor(ba, br, bg, bb));
             newSurf.setAlphaMode((Graphics::AlphaType)alphaType);
-            ret2 = newSurf.blitClip(newSurfDest, Common::Rect(2, 2, oldSurfDest.w - 2, oldSurfDest.h - 2), dsts[rect].left, dsts[rect].top, flipping, &srcs[rect], TS_ARGB(a, r, g, b), dsts[rect].width(), dsts[rect].height(), (Graphics::TSpriteBlendMode)blendMode);
+            ret2 = newSurf.blitClip(newSurfDest, Common::Rect(2, 2, oldSurfDest.w - 2, oldSurfDest.h - 2), dsts[rect].left, dsts[rect].top, flipping, &srcs[rect], MS_ARGB(a, r, g, b), dsts[rect].width(), dsts[rect].height(), (Graphics::TSpriteBlendMode)blendMode);
             if (!areSurfacesEqual(&oldSurfDest, &newSurfDest)) {
                 warning("BLIT_CLIP blendMode: %s, alphaType: %s, a: %d, r: %d, g: %d, b: %d, flipping: %s, test rect id: %s",
                     blendModes[blendMode], alphaTypes[alphaType], a, r, g, b, flipNames[flipping], rectNames[rect]);


Commit: a64c7ea3b7f6497f83cd1207970181c15e4edd84
    https://github.com/scummvm/scummvm/commit/a64c7ea3b7f6497f83cd1207970181c15e4edd84
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
GRAPHICS: New blendBlitFrom overload

Changed paths:
    graphics/managed_surface.cpp
    graphics/managed_surface.h


diff --git a/graphics/managed_surface.cpp b/graphics/managed_surface.cpp
index 2e7db5641ad..050e6125be0 100644
--- a/graphics/managed_surface.cpp
+++ b/graphics/managed_surface.cpp
@@ -734,6 +734,13 @@ Common::Rect ManagedSurface::blendBlitFrom(const ManagedSurface &src, const Comm
 										   const uint32 colorMod,
 										   const TSpriteBlendMode blend,
 										   const AlphaType alphaType) {
+	return blendBlitFrom(src, srcRect, destRect, flipping, colorMod, blend, alphaType);
+}
+Common::Rect ManagedSurface::blendBlitFrom(const Surface &src, const Common::Rect &srcRect,
+										   const Common::Rect &destRect, int flipping,
+										   const uint32 colorMod,
+										   const TSpriteBlendMode blend,
+										   const AlphaType alphaType) {
 	Common::Rect srcArea = srcRect, dstArea = destRect;
 	if (!isBlendBlitPixelFormatSupported(src.format, format)) {
 		warning("ManagedSurface::blendBlitFrom only accepts RGBA32!");
diff --git a/graphics/managed_surface.h b/graphics/managed_surface.h
index 184782dbd15..1d1677f8cc6 100644
--- a/graphics/managed_surface.h
+++ b/graphics/managed_surface.h
@@ -551,6 +551,11 @@ public:
 							   const uint32 colorMod = MS_ARGB(255, 255, 255, 255),
 							   const TSpriteBlendMode blend = BLEND_NORMAL,
 							   const AlphaType alphaType = ALPHA_FULL);
+	Common::Rect blendBlitFrom(const Surface &src, const Common::Rect &srcRect,
+							   const Common::Rect &destRect, int flipping = FLIP_NONE,
+							   const uint32 colorMod = MS_ARGB(255, 255, 255, 255),
+							   const TSpriteBlendMode blend = BLEND_NORMAL,
+							   const AlphaType alphaType = ALPHA_FULL);
 
 	/**
 	 * Clear the entire surface.


Commit: e95cfb4877e3d74236a3dabcfd18cf1eae1ae2f6
    https://github.com/scummvm/scummvm/commit/e95cfb4877e3d74236a3dabcfd18cf1eae1ae2f6
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
GRAPHICS: Fixed indentation in blit files

Changed paths:
    graphics/blit/blit-alpha.cpp
    graphics/blit/blit-avx2.cpp
    graphics/blit/blit-neon.cpp
    graphics/blit/blit-sse2.cpp


diff --git a/graphics/blit/blit-alpha.cpp b/graphics/blit/blit-alpha.cpp
index b7e2b5f723b..4b5793c5056 100644
--- a/graphics/blit/blit-alpha.cpp
+++ b/graphics/blit/blit-alpha.cpp
@@ -38,28 +38,28 @@ inline void applyColorKeyLogic(byte *dst, const byte *src, const uint w, const u
                                const uint8 rKey, const uint8 gKey, const uint8 bKey,
                                const uint8 rNew, const uint8 gNew, const uint8 bNew) {
 
-	const uint32 keyPix    = format.ARGBToColor(0,   rKey, gKey, bKey);
-	const uint32 newPix    = format.ARGBToColor(0,   rNew, gNew, bNew);
-	const uint32 rgbMask   = format.ARGBToColor(0,   255,  255,  255);
-	const uint32 alphaMask = format.ARGBToColor(255, 0,    0,    0);
-
-	for (uint y = 0; y < h; ++y) {
-		for (uint x = 0; x < w; ++x) {
-			uint32 pix = *(const Size *)src;
-
-			if ((pix & rgbMask) == keyPix) {
-				*(Size *)dst = newPix;
-			} else if (overwriteAlpha) {
-				*(Size *)dst = pix | alphaMask;
-			}
-
-			src += sizeof(Size);
-			dst += sizeof(Size);
-		}
-
-		src += srcDelta;
-		dst += dstDelta;
-	}
+    const uint32 keyPix    = format.ARGBToColor(0,   rKey, gKey, bKey);
+    const uint32 newPix    = format.ARGBToColor(0,   rNew, gNew, bNew);
+    const uint32 rgbMask   = format.ARGBToColor(0,   255,  255,  255);
+    const uint32 alphaMask = format.ARGBToColor(255, 0,    0,    0);
+
+    for (uint y = 0; y < h; ++y) {
+        for (uint x = 0; x < w; ++x) {
+            uint32 pix = *(const Size *)src;
+
+            if ((pix & rgbMask) == keyPix) {
+                *(Size *)dst = newPix;
+            } else if (overwriteAlpha) {
+                *(Size *)dst = pix | alphaMask;
+            }
+
+            src += sizeof(Size);
+            dst += sizeof(Size);
+        }
+
+        src += srcDelta;
+        dst += dstDelta;
+    }
 }
 
 template<typename Size, bool skipTransparent>
@@ -67,26 +67,26 @@ inline void setAlphaLogic(byte *dst, const byte *src, const uint w, const uint h
                           const uint srcDelta, const uint dstDelta,
                           const Graphics::PixelFormat &format, const uint8 alpha) {
 
-	const uint32 newAlpha  = format.ARGBToColor(alpha, 0,   0,   0);
-	const uint32 rgbMask   = format.ARGBToColor(0,     255, 255, 255);
-	const uint32 alphaMask = format.ARGBToColor(255,   0,   0,   0);
+    const uint32 newAlpha  = format.ARGBToColor(alpha, 0,   0,   0);
+    const uint32 rgbMask   = format.ARGBToColor(0,     255, 255, 255);
+    const uint32 alphaMask = format.ARGBToColor(255,   0,   0,   0);
 
-	for (uint y = 0; y < h; ++y) {
-		for (uint x = 0; x < w; ++x) {
-			uint32 pix = *(const Size *)src;
+    for (uint y = 0; y < h; ++y) {
+        for (uint x = 0; x < w; ++x) {
+            uint32 pix = *(const Size *)src;
 
-			if (!skipTransparent || (pix & alphaMask))
-				*(Size *)dst = (pix & rgbMask) | newAlpha;
-			else
-				*(Size *)dst = pix;
+            if (!skipTransparent || (pix & alphaMask))
+                *(Size *)dst = (pix & rgbMask) | newAlpha;
+            else
+                *(Size *)dst = pix;
 
-			src += sizeof(Size);
-			dst += sizeof(Size);
-		}
+            src += sizeof(Size);
+            dst += sizeof(Size);
+        }
 
-		src += srcDelta;
-		dst += dstDelta;
-	}
+        src += srcDelta;
+        dst += dstDelta;
+    }
 }
 
 } // End of anonymous namespace
@@ -99,37 +99,37 @@ bool applyColorKey(byte *dst, const byte *src,
                    const uint8 rKey, const uint8 gKey, const uint8 bKey,
                    const uint8 rNew, const uint8 gNew, const uint8 bNew) {
 
-	// Faster, but larger, to provide optimized handling for each case.
-	const uint srcDelta = (srcPitch - w * format.bytesPerPixel);
-	const uint dstDelta = (dstPitch - w * format.bytesPerPixel);
-
-	if (format.aBits() == 0) {
-		return false;
-	}
-
-	if (overwriteAlpha) {
-		if (format.bytesPerPixel == 1) {
-			applyColorKeyLogic<uint8, true>(dst, src, w, h, srcDelta, dstDelta, format, rKey, gKey, bKey, rNew, gNew, bNew);
-		} else if (format.bytesPerPixel == 2) {
-			applyColorKeyLogic<uint16, true>(dst, src, w, h, srcDelta, dstDelta, format, rKey, gKey, bKey, rNew, gNew, bNew);
-		} else if (format.bytesPerPixel == 4) {
-			applyColorKeyLogic<uint32, true>(dst, src, w, h, srcDelta, dstDelta, format, rKey, gKey, bKey, rNew, gNew, bNew);
-		} else {
-			return false;
-		}
-	} else {
-		if (format.bytesPerPixel == 1) {
-			applyColorKeyLogic<uint8, false>(dst, src, w, h, srcDelta, dstDelta, format, rKey, gKey, bKey, rNew, gNew, bNew);
-		} else if (format.bytesPerPixel == 2) {
-			applyColorKeyLogic<uint16, false>(dst, src, w, h, srcDelta, dstDelta, format, rKey, gKey, bKey, rNew, gNew, bNew);
-		} else if (format.bytesPerPixel == 4) {
-			applyColorKeyLogic<uint32, false>(dst, src, w, h, srcDelta, dstDelta, format, rKey, gKey, bKey, rNew, gNew, bNew);
-		} else {
-			return false;
-		}
-	}
-
-	return true;
+    // Faster, but larger, to provide optimized handling for each case.
+    const uint srcDelta = (srcPitch - w * format.bytesPerPixel);
+    const uint dstDelta = (dstPitch - w * format.bytesPerPixel);
+
+    if (format.aBits() == 0) {
+        return false;
+    }
+
+    if (overwriteAlpha) {
+        if (format.bytesPerPixel == 1) {
+            applyColorKeyLogic<uint8, true>(dst, src, w, h, srcDelta, dstDelta, format, rKey, gKey, bKey, rNew, gNew, bNew);
+        } else if (format.bytesPerPixel == 2) {
+            applyColorKeyLogic<uint16, true>(dst, src, w, h, srcDelta, dstDelta, format, rKey, gKey, bKey, rNew, gNew, bNew);
+        } else if (format.bytesPerPixel == 4) {
+            applyColorKeyLogic<uint32, true>(dst, src, w, h, srcDelta, dstDelta, format, rKey, gKey, bKey, rNew, gNew, bNew);
+        } else {
+            return false;
+        }
+    } else {
+        if (format.bytesPerPixel == 1) {
+            applyColorKeyLogic<uint8, false>(dst, src, w, h, srcDelta, dstDelta, format, rKey, gKey, bKey, rNew, gNew, bNew);
+        } else if (format.bytesPerPixel == 2) {
+            applyColorKeyLogic<uint16, false>(dst, src, w, h, srcDelta, dstDelta, format, rKey, gKey, bKey, rNew, gNew, bNew);
+        } else if (format.bytesPerPixel == 4) {
+            applyColorKeyLogic<uint32, false>(dst, src, w, h, srcDelta, dstDelta, format, rKey, gKey, bKey, rNew, gNew, bNew);
+        } else {
+            return false;
+        }
+    }
+
+    return true;
 }
 
 // Function to set the alpha channel for all pixels to the specified value
@@ -139,71 +139,71 @@ bool setAlpha(byte *dst, const byte *src,
               const Graphics::PixelFormat &format,
               const bool skipTransparent, const uint8 alpha) {
 
-	// Faster, but larger, to provide optimized handling for each case.
-	const uint srcDelta = (srcPitch - w * format.bytesPerPixel);
-	const uint dstDelta = (dstPitch - w * format.bytesPerPixel);
-
-	if (format.aBits() == 0) {
-		return false;
-	}
-
-	if (skipTransparent) {
-		if (format.bytesPerPixel == 1) {
-			setAlphaLogic<uint8, true>(dst, src, w, h, srcDelta, dstDelta, format, alpha);
-		} else if (format.bytesPerPixel == 2) {
-			setAlphaLogic<uint16, true>(dst, src, w, h, srcDelta, dstDelta, format, alpha);
-		} else if (format.bytesPerPixel == 4) {
-			setAlphaLogic<uint32, true>(dst, src, w, h, srcDelta, dstDelta, format, alpha);
-		} else {
-			return false;
-		}
-	} else {
-		if (format.bytesPerPixel == 1) {
-			setAlphaLogic<uint8, false>(dst, src, w, h, srcDelta, dstDelta, format, alpha);
-		} else if (format.bytesPerPixel == 2) {
-			setAlphaLogic<uint16, false>(dst, src, w, h, srcDelta, dstDelta, format, alpha);
-		} else if (format.bytesPerPixel == 4) {
-			setAlphaLogic<uint32, false>(dst, src, w, h, srcDelta, dstDelta, format, alpha);
-		} else {
-			return false;
-		}
-	}
-
-	return true;
+    // Faster, but larger, to provide optimized handling for each case.
+    const uint srcDelta = (srcPitch - w * format.bytesPerPixel);
+    const uint dstDelta = (dstPitch - w * format.bytesPerPixel);
+
+    if (format.aBits() == 0) {
+        return false;
+    }
+
+    if (skipTransparent) {
+        if (format.bytesPerPixel == 1) {
+            setAlphaLogic<uint8, true>(dst, src, w, h, srcDelta, dstDelta, format, alpha);
+        } else if (format.bytesPerPixel == 2) {
+            setAlphaLogic<uint16, true>(dst, src, w, h, srcDelta, dstDelta, format, alpha);
+        } else if (format.bytesPerPixel == 4) {
+            setAlphaLogic<uint32, true>(dst, src, w, h, srcDelta, dstDelta, format, alpha);
+        } else {
+            return false;
+        }
+    } else {
+        if (format.bytesPerPixel == 1) {
+            setAlphaLogic<uint8, false>(dst, src, w, h, srcDelta, dstDelta, format, alpha);
+        } else if (format.bytesPerPixel == 2) {
+            setAlphaLogic<uint16, false>(dst, src, w, h, srcDelta, dstDelta, format, alpha);
+        } else if (format.bytesPerPixel == 4) {
+            setAlphaLogic<uint32, false>(dst, src, w, h, srcDelta, dstDelta, format, alpha);
+        } else {
+            return false;
+        }
+    }
+
+    return true;
 }
 
 
 BlendBlit::Args::Args(byte *dst, const byte *src,
-	const uint _dstPitch, const uint _srcPitch,
-	const int posX, const int posY,
-	const uint _width, const uint _height,
-	const int _scaleX, const int _scaleY,
-	const int scaleXsrcOff, const int scaleYsrcOff,
-	const uint32 colorMod, const uint _flipping) :
-		xp(0), yp(0), dstPitch(_dstPitch),
-		width(_width), height(_height), color(colorMod),
-		scaleX(_scaleX), scaleY(_scaleY), flipping(_flipping),
-		scaleXoff(scaleXsrcOff), scaleYoff(scaleYsrcOff) {
-	bool doScale = scaleX != SCALE_THRESHOLD || scaleY != SCALE_THRESHOLD;
-	
-	rgbmod   = ((colorMod & kRGBModMask) != kRGBModMask);
-	alphamod = ((colorMod & kAModMask)   != kAModMask);
-	inStep = 4;
-	inoStep = _srcPitch;
-	if (flipping & FLIP_H) {
-		inStep = -inStep;
-		xp = width - 1;
-		if (doScale) xp = xp * scaleX / SCALE_THRESHOLD;
-	}
-
-	if (flipping & FLIP_V) {
-		inoStep = -inoStep;
-		yp = height - 1;
-		if (doScale) yp = yp * scaleY / SCALE_THRESHOLD;
-	}
-
-	ino = src + yp * _srcPitch + xp * 4;
-	outo = dst + posY * _dstPitch + posX * 4;
+    const uint _dstPitch, const uint _srcPitch,
+    const int posX, const int posY,
+    const uint _width, const uint _height,
+    const int _scaleX, const int _scaleY,
+    const int scaleXsrcOff, const int scaleYsrcOff,
+    const uint32 colorMod, const uint _flipping) :
+        xp(0), yp(0), dstPitch(_dstPitch),
+        width(_width), height(_height), color(colorMod),
+        scaleX(_scaleX), scaleY(_scaleY), flipping(_flipping),
+        scaleXoff(scaleXsrcOff), scaleYoff(scaleYsrcOff) {
+    bool doScale = scaleX != SCALE_THRESHOLD || scaleY != SCALE_THRESHOLD;
+    
+    rgbmod   = ((colorMod & kRGBModMask) != kRGBModMask);
+    alphamod = ((colorMod & kAModMask)   != kAModMask);
+    inStep = 4;
+    inoStep = _srcPitch;
+    if (flipping & FLIP_H) {
+        inStep = -inStep;
+        xp = width - 1;
+        if (doScale) xp = xp * scaleX / SCALE_THRESHOLD;
+    }
+
+    if (flipping & FLIP_V) {
+        inoStep = -inoStep;
+        yp = height - 1;
+        if (doScale) yp = yp * scaleY / SCALE_THRESHOLD;
+    }
+
+    ino = src + yp * _srcPitch + xp * 4;
+    outo = dst + posY * _dstPitch + posX * 4;
 }
 
 /**
@@ -211,116 +211,116 @@ BlendBlit::Args::Args(byte *dst, const byte *src,
  */
 template<bool doscale, bool rgbmod, bool alphamod>
 void BlendBlit::doBlitMultiplyBlendLogicGeneric(Args &args) {
-	const byte *in;
-	byte *out;
-
-	int scaleXCtr, scaleYCtr = args.scaleYoff;
-	const byte *inBase;
-
-	const byte rawcr = (args.color >> kRModShift) & 0xFF;
-	const byte rawcg = (args.color >> kGModShift) & 0xFF;
-	const byte rawcb = (args.color >> kBModShift) & 0xFF;
-	const byte ca = alphamod ? ((args.color >> kAModShift) & 0xFF) : 255;
-	const uint32 cr = rgbmod   ? (rawcr == 255 ? 256 : rawcr) : 256;
-	const uint32 cg = rgbmod   ? (rawcg == 255 ? 256 : rawcg) : 256;
-	const uint32 cb = rgbmod   ? (rawcb == 255 ? 256 : rawcb) : 256;
-
-	for (uint32 i = 0; i < args.height; i++) {
-		if (doscale) {
-			inBase = args.ino + scaleYCtr / SCALE_THRESHOLD * args.inoStep;
-			scaleXCtr = args.scaleXoff;
-		} else {
-			in = args.ino;
-		}
-		out = args.outo;
-		for (uint32 j = 0; j < args.width; j++) {
-			if (doscale) {
-				in = inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep;
-			}
-
-			uint32 ina = in[kAIndex] * ca >> 8;
-
-			if (ina != 0) {
-				out[kBIndex] = out[kBIndex] * ((in[kBIndex] * cb * ina) >> 16) >> 8;
-				out[kGIndex] = out[kGIndex] * ((in[kGIndex] * cg * ina) >> 16) >> 8;
-				out[kRIndex] = out[kRIndex] * ((in[kRIndex] * cr * ina) >> 16) >> 8;
-			}
-
-			if (doscale)
-				scaleXCtr += args.scaleX;
-			else
-				in += args.inStep;
-			out += 4;
-		}
-		if (doscale)
-			scaleYCtr += args.scaleY;
-		else
-			args.ino += args.inoStep;
-		args.outo += args.dstPitch;
-	}
+    const byte *in;
+    byte *out;
+
+    int scaleXCtr, scaleYCtr = args.scaleYoff;
+    const byte *inBase;
+
+    const byte rawcr = (args.color >> kRModShift) & 0xFF;
+    const byte rawcg = (args.color >> kGModShift) & 0xFF;
+    const byte rawcb = (args.color >> kBModShift) & 0xFF;
+    const byte ca = alphamod ? ((args.color >> kAModShift) & 0xFF) : 255;
+    const uint32 cr = rgbmod   ? (rawcr == 255 ? 256 : rawcr) : 256;
+    const uint32 cg = rgbmod   ? (rawcg == 255 ? 256 : rawcg) : 256;
+    const uint32 cb = rgbmod   ? (rawcb == 255 ? 256 : rawcb) : 256;
+
+    for (uint32 i = 0; i < args.height; i++) {
+        if (doscale) {
+            inBase = args.ino + scaleYCtr / SCALE_THRESHOLD * args.inoStep;
+            scaleXCtr = args.scaleXoff;
+        } else {
+            in = args.ino;
+        }
+        out = args.outo;
+        for (uint32 j = 0; j < args.width; j++) {
+            if (doscale) {
+                in = inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep;
+            }
+
+            uint32 ina = in[kAIndex] * ca >> 8;
+
+            if (ina != 0) {
+                out[kBIndex] = out[kBIndex] * ((in[kBIndex] * cb * ina) >> 16) >> 8;
+                out[kGIndex] = out[kGIndex] * ((in[kGIndex] * cg * ina) >> 16) >> 8;
+                out[kRIndex] = out[kRIndex] * ((in[kRIndex] * cr * ina) >> 16) >> 8;
+            }
+
+            if (doscale)
+                scaleXCtr += args.scaleX;
+            else
+                in += args.inStep;
+            out += 4;
+        }
+        if (doscale)
+            scaleYCtr += args.scaleY;
+        else
+            args.ino += args.inoStep;
+        args.outo += args.dstPitch;
+    }
 
 }
 
 template<bool doscale, bool rgbmod, bool alphamod>
 void BlendBlit::doBlitAlphaBlendLogicGeneric(Args &args) {
-	const byte *in;
-	byte *out;
-
-	int scaleXCtr, scaleYCtr = args.scaleYoff;
-	const byte *inBase;
-
-	const byte ca = alphamod ? ((args.color >> kAModShift) & 0xFF) : 255;
-	const byte cr = rgbmod   ? ((args.color >> kRModShift) & 0xFF) : 255;
-	const byte cg = rgbmod   ? ((args.color >> kGModShift) & 0xFF) : 255;
-	const byte cb = rgbmod   ? ((args.color >> kBModShift) & 0xFF) : 255;
-
-	for (uint32 i = 0; i < args.height; i++) {
-		if (doscale) {
-			inBase = args.ino + scaleYCtr / SCALE_THRESHOLD * args.inoStep;
-			scaleXCtr = args.scaleXoff;
-		} else {
-			in = args.ino;
-		}
-		out = args.outo;
-		for (uint32 j = 0; j < args.width; j++) {
-			if (doscale) {
-				in = inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep;
-			}
-
-			uint32 ina = in[kAIndex] * ca >> 8;
-
-			if (ina != 0) {
-				if (rgbmod) {
-					const uint outb = (out[kBIndex] * (255 - ina) >> 8);
-					const uint outg = (out[kGIndex] * (255 - ina) >> 8);
-					const uint outr = (out[kRIndex] * (255 - ina) >> 8);
-
-					out[kAIndex] = 255;
-					out[kBIndex] = outb + (in[kBIndex] * ina * cb >> 16);
-					out[kGIndex] = outg + (in[kGIndex] * ina * cg >> 16);
-					out[kRIndex] = outr + (in[kRIndex] * ina * cr >> 16);
-				} else {
-					out[kAIndex] = 255;
-					out[kBIndex] = (out[kBIndex] * (255 - ina) + in[kBIndex] * ina) >> 8;
-					out[kGIndex] = (out[kGIndex] * (255 - ina) + in[kGIndex] * ina) >> 8;
-					out[kRIndex] = (out[kRIndex] * (255 - ina) + in[kRIndex] * ina) >> 8;
-					
-				}
-			}
-
-			if (doscale)
-				scaleXCtr += args.scaleX;
-			else
-				in += args.inStep;
-			out += 4;
-		}
-
-		if (doscale)
-			scaleYCtr += args.scaleY;
-		else
-			args.ino += args.inoStep;
-		args.outo += args.dstPitch;
-	}
+    const byte *in;
+    byte *out;
+
+    int scaleXCtr, scaleYCtr = args.scaleYoff;
+    const byte *inBase;
+
+    const byte ca = alphamod ? ((args.color >> kAModShift) & 0xFF) : 255;
+    const byte cr = rgbmod   ? ((args.color >> kRModShift) & 0xFF) : 255;
+    const byte cg = rgbmod   ? ((args.color >> kGModShift) & 0xFF) : 255;
+    const byte cb = rgbmod   ? ((args.color >> kBModShift) & 0xFF) : 255;
+
+    for (uint32 i = 0; i < args.height; i++) {
+        if (doscale) {
+            inBase = args.ino + scaleYCtr / SCALE_THRESHOLD * args.inoStep;
+            scaleXCtr = args.scaleXoff;
+        } else {
+            in = args.ino;
+        }
+        out = args.outo;
+        for (uint32 j = 0; j < args.width; j++) {
+            if (doscale) {
+                in = inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep;
+            }
+
+            uint32 ina = in[kAIndex] * ca >> 8;
+
+            if (ina != 0) {
+                if (rgbmod) {
+                    const uint outb = (out[kBIndex] * (255 - ina) >> 8);
+                    const uint outg = (out[kGIndex] * (255 - ina) >> 8);
+                    const uint outr = (out[kRIndex] * (255 - ina) >> 8);
+
+                    out[kAIndex] = 255;
+                    out[kBIndex] = outb + (in[kBIndex] * ina * cb >> 16);
+                    out[kGIndex] = outg + (in[kGIndex] * ina * cg >> 16);
+                    out[kRIndex] = outr + (in[kRIndex] * ina * cr >> 16);
+                } else {
+                    out[kAIndex] = 255;
+                    out[kBIndex] = (out[kBIndex] * (255 - ina) + in[kBIndex] * ina) >> 8;
+                    out[kGIndex] = (out[kGIndex] * (255 - ina) + in[kGIndex] * ina) >> 8;
+                    out[kRIndex] = (out[kRIndex] * (255 - ina) + in[kRIndex] * ina) >> 8;
+                    
+                }
+            }
+
+            if (doscale)
+                scaleXCtr += args.scaleX;
+            else
+                in += args.inStep;
+            out += 4;
+        }
+
+        if (doscale)
+            scaleYCtr += args.scaleY;
+        else
+            args.ino += args.inoStep;
+        args.outo += args.dstPitch;
+    }
 }
 
 /**
@@ -328,49 +328,49 @@ void BlendBlit::doBlitAlphaBlendLogicGeneric(Args &args) {
  */
 template<bool doscale, bool rgbmod>
 void BlendBlit::doBlitSubtractiveBlendLogicGeneric(Args &args) {
-	const byte *in;
-	byte *out;
-
-	int scaleXCtr, scaleYCtr = args.scaleYoff;
-	const byte *inBase;
-
-	const byte rawcr = (args.color >> kRModShift) & 0xFF;
-	const byte rawcg = (args.color >> kGModShift) & 0xFF;
-	const byte rawcb = (args.color >> kBModShift) & 0xFF;
-	const uint32 cr = rgbmod   ? (rawcr == 255 ? 256 : rawcr) : 256;
-	const uint32 cg = rgbmod   ? (rawcg == 255 ? 256 : rawcg) : 256;
-	const uint32 cb = rgbmod   ? (rawcb == 255 ? 256 : rawcb) : 256;
-
-	for (uint32 i = 0; i < args.height; i++) {
-		if (doscale) {
-			inBase = args.ino + scaleYCtr / SCALE_THRESHOLD * args.inoStep;
-			scaleXCtr = args.scaleXoff;
-		} else {
-			in = args.ino;
-		}
-		out = args.outo;
-		for (uint32 j = 0; j < args.width; j++) {
-			if (doscale) {
-				in = inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep;
-			}
-
-			out[kAIndex] = 255;
-			out[kBIndex] = MAX<int32>(out[kBIndex] - ((in[kBIndex] * cb  * (out[kBIndex]) * in[kAIndex]) >> 24), 0);
-			out[kGIndex] = MAX<int32>(out[kGIndex] - ((in[kGIndex] * cg  * (out[kGIndex]) * in[kAIndex]) >> 24), 0);
-			out[kRIndex] = MAX<int32>(out[kRIndex] - ((in[kRIndex] * cr * (out[kRIndex]) * in[kAIndex]) >> 24), 0);
-
-			if (doscale)
-				scaleXCtr += args.scaleX;
-			else
-				in += args.inStep;
-			out += 4;
-		}
-		if (doscale)
-			scaleYCtr += args.scaleY;
-		else
-			args.ino += args.inoStep;
-		args.outo += args.dstPitch;
-	}
+    const byte *in;
+    byte *out;
+
+    int scaleXCtr, scaleYCtr = args.scaleYoff;
+    const byte *inBase;
+
+    const byte rawcr = (args.color >> kRModShift) & 0xFF;
+    const byte rawcg = (args.color >> kGModShift) & 0xFF;
+    const byte rawcb = (args.color >> kBModShift) & 0xFF;
+    const uint32 cr = rgbmod   ? (rawcr == 255 ? 256 : rawcr) : 256;
+    const uint32 cg = rgbmod   ? (rawcg == 255 ? 256 : rawcg) : 256;
+    const uint32 cb = rgbmod   ? (rawcb == 255 ? 256 : rawcb) : 256;
+
+    for (uint32 i = 0; i < args.height; i++) {
+        if (doscale) {
+            inBase = args.ino + scaleYCtr / SCALE_THRESHOLD * args.inoStep;
+            scaleXCtr = args.scaleXoff;
+        } else {
+            in = args.ino;
+        }
+        out = args.outo;
+        for (uint32 j = 0; j < args.width; j++) {
+            if (doscale) {
+                in = inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep;
+            }
+
+            out[kAIndex] = 255;
+            out[kBIndex] = MAX<int32>(out[kBIndex] - ((in[kBIndex] * cb  * (out[kBIndex]) * in[kAIndex]) >> 24), 0);
+            out[kGIndex] = MAX<int32>(out[kGIndex] - ((in[kGIndex] * cg  * (out[kGIndex]) * in[kAIndex]) >> 24), 0);
+            out[kRIndex] = MAX<int32>(out[kRIndex] - ((in[kRIndex] * cr * (out[kRIndex]) * in[kAIndex]) >> 24), 0);
+
+            if (doscale)
+                scaleXCtr += args.scaleX;
+            else
+                in += args.inStep;
+            out += 4;
+        }
+        if (doscale)
+            scaleYCtr += args.scaleY;
+        else
+            args.ino += args.inoStep;
+        args.outo += args.dstPitch;
+    }
 }
 
 /**
@@ -378,135 +378,135 @@ void BlendBlit::doBlitSubtractiveBlendLogicGeneric(Args &args) {
  */
 template<bool doscale, bool rgbmod, bool alphamod>
 void BlendBlit::doBlitAdditiveBlendLogicGeneric(Args &args) {
-	const byte *in;
-	byte *out;
-
-	int scaleXCtr, scaleYCtr = args.scaleYoff;
-	const byte *inBase;
-
-	const byte rawcr = (args.color >> kRModShift) & 0xFF;
-	const byte rawcg = (args.color >> kGModShift) & 0xFF;
-	const byte rawcb = (args.color >> kBModShift) & 0xFF;
-	const byte ca = alphamod ? ((args.color >> kAModShift) & 0xFF) : 255;
-	const uint32 cr = rgbmod   ? (rawcr == 255 ? 256 : rawcr) : 256;
-	const uint32 cg = rgbmod   ? (rawcg == 255 ? 256 : rawcg) : 256;
-	const uint32 cb = rgbmod   ? (rawcb == 255 ? 256 : rawcb) : 256;
-
-	for (uint32 i = 0; i < args.height; i++) {
-		if (doscale) {
-			inBase = args.ino + scaleYCtr / SCALE_THRESHOLD * args.inoStep;
-			scaleXCtr = args.scaleXoff;
-		} else {
-			in = args.ino;
-		}
-		out = args.outo;
-		for (uint32 j = 0; j < args.width; j++) {
-			if (doscale) {
-				in = inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep;
-			}
-
-			uint32 ina = in[kAIndex] * ca >> 8;
-
-			if (ina != 0) {
-				out[kBIndex] = out[kBIndex] + ((in[kBIndex] * cb * ina) >> 16);
-				out[kGIndex] = out[kGIndex] + ((in[kGIndex] * cg * ina) >> 16);
-				out[kRIndex] = out[kRIndex] + ((in[kRIndex] * cr * ina) >> 16);
-			}
-
-			if (doscale)
-				scaleXCtr += args.scaleX;
-			else
-				in += args.inStep;
-			out += 4;
-		}
-
-		if (doscale)
-			scaleYCtr += args.scaleY;
-		else
-			args.ino += args.inoStep;
-		args.outo += args.dstPitch;
-	}
+    const byte *in;
+    byte *out;
+
+    int scaleXCtr, scaleYCtr = args.scaleYoff;
+    const byte *inBase;
+
+    const byte rawcr = (args.color >> kRModShift) & 0xFF;
+    const byte rawcg = (args.color >> kGModShift) & 0xFF;
+    const byte rawcb = (args.color >> kBModShift) & 0xFF;
+    const byte ca = alphamod ? ((args.color >> kAModShift) & 0xFF) : 255;
+    const uint32 cr = rgbmod   ? (rawcr == 255 ? 256 : rawcr) : 256;
+    const uint32 cg = rgbmod   ? (rawcg == 255 ? 256 : rawcg) : 256;
+    const uint32 cb = rgbmod   ? (rawcb == 255 ? 256 : rawcb) : 256;
+
+    for (uint32 i = 0; i < args.height; i++) {
+        if (doscale) {
+            inBase = args.ino + scaleYCtr / SCALE_THRESHOLD * args.inoStep;
+            scaleXCtr = args.scaleXoff;
+        } else {
+            in = args.ino;
+        }
+        out = args.outo;
+        for (uint32 j = 0; j < args.width; j++) {
+            if (doscale) {
+                in = inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep;
+            }
+
+            uint32 ina = in[kAIndex] * ca >> 8;
+
+            if (ina != 0) {
+                out[kBIndex] = out[kBIndex] + ((in[kBIndex] * cb * ina) >> 16);
+                out[kGIndex] = out[kGIndex] + ((in[kGIndex] * cg * ina) >> 16);
+                out[kRIndex] = out[kRIndex] + ((in[kRIndex] * cr * ina) >> 16);
+            }
+
+            if (doscale)
+                scaleXCtr += args.scaleX;
+            else
+                in += args.inStep;
+            out += 4;
+        }
+
+        if (doscale)
+            scaleYCtr += args.scaleY;
+        else
+            args.ino += args.inoStep;
+        args.outo += args.dstPitch;
+    }
 }
 
 template<bool doscale>
 void BlendBlit::doBlitOpaqueBlendLogicGeneric(Args &args) {
-	const byte *in;
-	byte *out;
-
-	int scaleXCtr, scaleYCtr = args.scaleYoff;
-	const byte *inBase;
-
-	for (uint32 i = 0; i < args.height; i++) {
-		if (doscale) {
-			inBase = args.ino + (scaleYCtr + 1) / SCALE_THRESHOLD * args.inoStep;
-			scaleXCtr = args.scaleXoff;
-		} else {
-			in = args.ino;
-		}
-		out = args.outo;
-
-		if (doscale) {
-			for (uint32 j = 0; j < args.width; j++) {
-				in = inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep;
-				*(uint32 *)out = *(const uint32 *)in | kAModMask;
-				scaleXCtr += args.scaleX;
-				out += 4;
-			}
-		} else {
-			for (uint32 j = 0; j < args.width; j++) {
-				*(uint32 *)out = *(const uint32 *)in | kAModMask;
-				in += args.inStep;
-				out += 4;
-			}
-		}
-
-		if (doscale)
-			scaleYCtr += args.scaleY;
-		else
-			args.ino += args.inoStep;
-		args.outo += args.dstPitch;
-	}
+    const byte *in;
+    byte *out;
+
+    int scaleXCtr, scaleYCtr = args.scaleYoff;
+    const byte *inBase;
+
+    for (uint32 i = 0; i < args.height; i++) {
+        if (doscale) {
+            inBase = args.ino + (scaleYCtr + 1) / SCALE_THRESHOLD * args.inoStep;
+            scaleXCtr = args.scaleXoff;
+        } else {
+            in = args.ino;
+        }
+        out = args.outo;
+
+        if (doscale) {
+            for (uint32 j = 0; j < args.width; j++) {
+                in = inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep;
+                *(uint32 *)out = *(const uint32 *)in | kAModMask;
+                scaleXCtr += args.scaleX;
+                out += 4;
+            }
+        } else {
+            for (uint32 j = 0; j < args.width; j++) {
+                *(uint32 *)out = *(const uint32 *)in | kAModMask;
+                in += args.inStep;
+                out += 4;
+            }
+        }
+
+        if (doscale)
+            scaleYCtr += args.scaleY;
+        else
+            args.ino += args.inoStep;
+        args.outo += args.dstPitch;
+    }
 }
 
 template<bool doscale>
 void BlendBlit::doBlitBinaryBlendLogicGeneric(Args &args) {
-	const byte *in;
-	byte *out;
-
-	int scaleXCtr, scaleYCtr = args.scaleYoff;
-	const byte *inBase;
-
-	for (uint32 i = 0; i < args.height; i++) {
-		if (doscale) {
-			inBase = args.ino + scaleYCtr / SCALE_THRESHOLD * args.inoStep;
-			scaleXCtr = args.scaleXoff;
-		} else {
-			in = args.ino;
-		}
-		out = args.outo;
-		for (uint32 j = 0; j < args.width; j++) {
-			if (doscale) {
-				in = inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep;
-			}
-
-			uint32 pix = *(const uint32 *)in, pixout = *(const uint32 *)out;
-			uint32 mask = (pix & kAModMask) ? 0xffffffff : 0;
-    		pixout &= ~mask;
-    		pix = (pix | kAModMask) & mask;
-    		*(uint32 *)out = pixout | pix;
-			
-			if (doscale)
-				scaleXCtr += args.scaleX;
-			else
-				in += args.inStep;
-			out += 4;
-		}
-		if (doscale)
-			scaleYCtr += args.scaleY;
-		else
-			args.ino += args.inoStep;
-		args.outo += args.dstPitch;
-	}
+    const byte *in;
+    byte *out;
+
+    int scaleXCtr, scaleYCtr = args.scaleYoff;
+    const byte *inBase;
+
+    for (uint32 i = 0; i < args.height; i++) {
+        if (doscale) {
+            inBase = args.ino + scaleYCtr / SCALE_THRESHOLD * args.inoStep;
+            scaleXCtr = args.scaleXoff;
+        } else {
+            in = args.ino;
+        }
+        out = args.outo;
+        for (uint32 j = 0; j < args.width; j++) {
+            if (doscale) {
+                in = inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep;
+            }
+
+            uint32 pix = *(const uint32 *)in, pixout = *(const uint32 *)out;
+            uint32 mask = (pix & kAModMask) ? 0xffffffff : 0;
+            pixout &= ~mask;
+            pix = (pix | kAModMask) & mask;
+            *(uint32 *)out = pixout | pix;
+            
+            if (doscale)
+                scaleXCtr += args.scaleX;
+            else
+                in += args.inStep;
+            out += 4;
+        }
+        if (doscale)
+            scaleYCtr += args.scaleY;
+        else
+            args.ino += args.inoStep;
+        args.outo += args.dstPitch;
+    }
 }
 
 // Initialize this to nullptr at the start
@@ -517,157 +517,157 @@ BlendBlit::BlitFunc BlendBlit::blitFunc = nullptr;
 // BlendBlit::blitFunc. This way, we can detect at runtime whether or not
 // the cpu has certain SIMD feature enabled or not.
 void BlendBlit::blit(byte *dst, const byte *src,
-					 const uint dstPitch, const uint srcPitch,
-					 const int posX, const int posY,
-					 const uint width, const uint height,
-					 const int scaleX, const int scaleY,
-					 const int scaleXsrcOff, const int scaleYsrcOff,
-					 const uint32 colorMod, const uint flipping,
-					 const TSpriteBlendMode blendMode,
-					 const AlphaType alphaType) {
-	if (width == 0 || height == 0) return;
-
-	// If no function has been selected yet, detect and select
-	if (!blitFunc) {
-		// Get the correct blit function
-		blitFunc = blitGeneric;
+                     const uint dstPitch, const uint srcPitch,
+                     const int posX, const int posY,
+                     const uint width, const uint height,
+                     const int scaleX, const int scaleY,
+                     const int scaleXsrcOff, const int scaleYsrcOff,
+                     const uint32 colorMod, const uint flipping,
+                     const TSpriteBlendMode blendMode,
+                     const AlphaType alphaType) {
+    if (width == 0 || height == 0) return;
+
+    // If no function has been selected yet, detect and select
+    if (!blitFunc) {
+        // Get the correct blit function
+        blitFunc = blitGeneric;
 #ifdef SCUMMVM_NEON
-		if (g_system->hasFeature(OSystem::kFeatureCpuNEON)) blitFunc = blitNEON;
+        if (g_system->hasFeature(OSystem::kFeatureCpuNEON)) blitFunc = blitNEON;
 #endif
 #ifdef SCUMMVM_SSE2
-		if (g_system->hasFeature(OSystem::kFeatureCpuSSE2)) blitFunc = blitSSE2;
+        if (g_system->hasFeature(OSystem::kFeatureCpuSSE2)) blitFunc = blitSSE2;
 #endif
 #ifdef SCUMMVM_AVX2
-		if (g_system->hasFeature(OSystem::kFeatureCpuAVX2)) blitFunc = blitAVX2;
+        if (g_system->hasFeature(OSystem::kFeatureCpuAVX2)) blitFunc = blitAVX2;
 #endif
-	}
-	
-	Args args(dst, src, dstPitch, srcPitch, posX, posY, width, height, scaleX, scaleY, scaleXsrcOff, scaleYsrcOff, colorMod, flipping);
-	blitFunc(args, blendMode, alphaType);
+    }
+    
+    Args args(dst, src, dstPitch, srcPitch, posX, posY, width, height, scaleX, scaleY, scaleXsrcOff, scaleYsrcOff, colorMod, flipping);
+    blitFunc(args, blendMode, alphaType);
 }
 
 // This is just a macro to expand it because its a pretty simple function where
 // readabiliy doesn't matter too much and macros tend to work faster better than functors
 #define BLIT_FUNC(ext) \
-	void BlendBlit::blit##ext(Args &args, const TSpriteBlendMode &blendMode, const AlphaType &alphaType) { \
-		bool rgbmod   = ((args.color & kRGBModMask) != kRGBModMask); \
-		bool alphamod = ((args.color & kAModMask)   != kAModMask); \
-		if (args.scaleX == SCALE_THRESHOLD && args.scaleY == SCALE_THRESHOLD) { \
-			if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_OPAQUE) { \
-				doBlitOpaqueBlendLogic##ext<false>(args); \
-			} else if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_BINARY) { \
-				doBlitBinaryBlendLogic##ext<false>(args); \
-			} else { \
-				if (blendMode == BLEND_ADDITIVE) { \
-					if (rgbmod) { \
-						if (alphamod) { \
-							doBlitAdditiveBlendLogic##ext<false, true, true>(args); \
-						} else { \
-							doBlitAdditiveBlendLogic##ext<false, true, false>(args); \
-						} \
-					} else { \
-						if (alphamod) { \
-							doBlitAdditiveBlendLogic##ext<false, false, true>(args); \
-						} else { \
-							doBlitAdditiveBlendLogic##ext<false, false, false>(args); \
-						} \
-					} \
-				} else if (blendMode == BLEND_SUBTRACTIVE) { \
-					if (rgbmod) { \
-						doBlitSubtractiveBlendLogic##ext<false, true>(args); \
-					} else { \
-						doBlitSubtractiveBlendLogic##ext<false, false>(args); \
-					} \
-				} else if (blendMode == BLEND_MULTIPLY) { \
-					if (rgbmod) { \
-						if (alphamod) { \
-							doBlitMultiplyBlendLogic##ext<false, true, true>(args); \
-						} else { \
-							doBlitMultiplyBlendLogic##ext<false, true, false>(args); \
-						} \
-					} else { \
-						if (alphamod) { \
-							doBlitMultiplyBlendLogic##ext<false, false, true>(args); \
-						} else { \
-							doBlitMultiplyBlendLogic##ext<false, false, false>(args); \
-						} \
-					} \
-				} else { \
-					assert(blendMode == BLEND_NORMAL); \
-					if (rgbmod) { \
-						if (alphamod) { \
-							doBlitAlphaBlendLogic##ext<false, true, true>(args); \
-						} else { \
-							doBlitAlphaBlendLogic##ext<false, true, false>(args); \
-						} \
-					} else { \
-						if (alphamod) { \
-							doBlitAlphaBlendLogic##ext<false, false, true>(args); \
-						} else { \
-							doBlitAlphaBlendLogic##ext<false, false, false>(args); \
-						} \
-					} \
-				} \
-			} \
-		} else { \
-			if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_OPAQUE) { \
-				doBlitOpaqueBlendLogic##ext<true>(args); \
-			} else if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_BINARY) { \
-				doBlitBinaryBlendLogic##ext<true>(args); \
-			} else { \
-				if (blendMode == BLEND_ADDITIVE) { \
-					if (rgbmod) { \
-						if (alphamod) { \
-							doBlitAdditiveBlendLogic##ext<true, true, true>(args); \
-						} else { \
-							doBlitAdditiveBlendLogic##ext<true, true, false>(args); \
-						} \
-					} else { \
-						if (alphamod) { \
-							doBlitAdditiveBlendLogic##ext<true, false, true>(args); \
-						} else { \
-							doBlitAdditiveBlendLogic##ext<true, false, false>(args); \
-						} \
-					} \
-				} else if (blendMode == BLEND_SUBTRACTIVE) { \
-					if (rgbmod) { \
-						doBlitSubtractiveBlendLogic##ext<true, true>(args); \
-					} else { \
-						doBlitSubtractiveBlendLogic##ext<true, false>(args); \
-					} \
-				} else if (blendMode == BLEND_MULTIPLY) { \
-					if (rgbmod) { \
-						if (alphamod) { \
-							doBlitMultiplyBlendLogic##ext<true, true, true>(args); \
-						} else { \
-							doBlitMultiplyBlendLogic##ext<true, true, false>(args); \
-						} \
-					} else { \
-						if (alphamod) { \
-							doBlitMultiplyBlendLogic##ext<true, false, true>(args); \
-						} else { \
-							doBlitMultiplyBlendLogic##ext<true, false, false>(args); \
-						} \
-					} \
-				} else { \
-					assert(blendMode == BLEND_NORMAL); \
-					if (rgbmod) { \
-						if (alphamod) { \
-							doBlitAlphaBlendLogic##ext<true, true, true>(args); \
-						} else { \
-							doBlitAlphaBlendLogic##ext<true, true, false>(args); \
-						} \
-					} else { \
-						if (alphamod) { \
-							doBlitAlphaBlendLogic##ext<true, false, true>(args); \
-						} else { \
-							doBlitAlphaBlendLogic##ext<true, false, false>(args); \
-						} \
-					} \
-				} \
-			} \
-		} \
-	}
+    void BlendBlit::blit##ext(Args &args, const TSpriteBlendMode &blendMode, const AlphaType &alphaType) { \
+        bool rgbmod   = ((args.color & kRGBModMask) != kRGBModMask); \
+        bool alphamod = ((args.color & kAModMask)   != kAModMask); \
+        if (args.scaleX == SCALE_THRESHOLD && args.scaleY == SCALE_THRESHOLD) { \
+            if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_OPAQUE) { \
+                doBlitOpaqueBlendLogic##ext<false>(args); \
+            } else if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_BINARY) { \
+                doBlitBinaryBlendLogic##ext<false>(args); \
+            } else { \
+                if (blendMode == BLEND_ADDITIVE) { \
+                    if (rgbmod) { \
+                        if (alphamod) { \
+                            doBlitAdditiveBlendLogic##ext<false, true, true>(args); \
+                        } else { \
+                            doBlitAdditiveBlendLogic##ext<false, true, false>(args); \
+                        } \
+                    } else { \
+                        if (alphamod) { \
+                            doBlitAdditiveBlendLogic##ext<false, false, true>(args); \
+                        } else { \
+                            doBlitAdditiveBlendLogic##ext<false, false, false>(args); \
+                        } \
+                    } \
+                } else if (blendMode == BLEND_SUBTRACTIVE) { \
+                    if (rgbmod) { \
+                        doBlitSubtractiveBlendLogic##ext<false, true>(args); \
+                    } else { \
+                        doBlitSubtractiveBlendLogic##ext<false, false>(args); \
+                    } \
+                } else if (blendMode == BLEND_MULTIPLY) { \
+                    if (rgbmod) { \
+                        if (alphamod) { \
+                            doBlitMultiplyBlendLogic##ext<false, true, true>(args); \
+                        } else { \
+                            doBlitMultiplyBlendLogic##ext<false, true, false>(args); \
+                        } \
+                    } else { \
+                        if (alphamod) { \
+                            doBlitMultiplyBlendLogic##ext<false, false, true>(args); \
+                        } else { \
+                            doBlitMultiplyBlendLogic##ext<false, false, false>(args); \
+                        } \
+                    } \
+                } else { \
+                    assert(blendMode == BLEND_NORMAL); \
+                    if (rgbmod) { \
+                        if (alphamod) { \
+                            doBlitAlphaBlendLogic##ext<false, true, true>(args); \
+                        } else { \
+                            doBlitAlphaBlendLogic##ext<false, true, false>(args); \
+                        } \
+                    } else { \
+                        if (alphamod) { \
+                            doBlitAlphaBlendLogic##ext<false, false, true>(args); \
+                        } else { \
+                            doBlitAlphaBlendLogic##ext<false, false, false>(args); \
+                        } \
+                    } \
+                } \
+            } \
+        } else { \
+            if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_OPAQUE) { \
+                doBlitOpaqueBlendLogic##ext<true>(args); \
+            } else if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_BINARY) { \
+                doBlitBinaryBlendLogic##ext<true>(args); \
+            } else { \
+                if (blendMode == BLEND_ADDITIVE) { \
+                    if (rgbmod) { \
+                        if (alphamod) { \
+                            doBlitAdditiveBlendLogic##ext<true, true, true>(args); \
+                        } else { \
+                            doBlitAdditiveBlendLogic##ext<true, true, false>(args); \
+                        } \
+                    } else { \
+                        if (alphamod) { \
+                            doBlitAdditiveBlendLogic##ext<true, false, true>(args); \
+                        } else { \
+                            doBlitAdditiveBlendLogic##ext<true, false, false>(args); \
+                        } \
+                    } \
+                } else if (blendMode == BLEND_SUBTRACTIVE) { \
+                    if (rgbmod) { \
+                        doBlitSubtractiveBlendLogic##ext<true, true>(args); \
+                    } else { \
+                        doBlitSubtractiveBlendLogic##ext<true, false>(args); \
+                    } \
+                } else if (blendMode == BLEND_MULTIPLY) { \
+                    if (rgbmod) { \
+                        if (alphamod) { \
+                            doBlitMultiplyBlendLogic##ext<true, true, true>(args); \
+                        } else { \
+                            doBlitMultiplyBlendLogic##ext<true, true, false>(args); \
+                        } \
+                    } else { \
+                        if (alphamod) { \
+                            doBlitMultiplyBlendLogic##ext<true, false, true>(args); \
+                        } else { \
+                            doBlitMultiplyBlendLogic##ext<true, false, false>(args); \
+                        } \
+                    } \
+                } else { \
+                    assert(blendMode == BLEND_NORMAL); \
+                    if (rgbmod) { \
+                        if (alphamod) { \
+                            doBlitAlphaBlendLogic##ext<true, true, true>(args); \
+                        } else { \
+                            doBlitAlphaBlendLogic##ext<true, true, false>(args); \
+                        } \
+                    } else { \
+                        if (alphamod) { \
+                            doBlitAlphaBlendLogic##ext<true, false, true>(args); \
+                        } else { \
+                            doBlitAlphaBlendLogic##ext<true, false, false>(args); \
+                        } \
+                    } \
+                } \
+            } \
+        } \
+    }
 BLIT_FUNC(Generic)
 #ifdef SCUMMVM_NEON
 BLIT_FUNC(NEON)
diff --git a/graphics/blit/blit-avx2.cpp b/graphics/blit/blit-avx2.cpp
index b7768457a97..893ac21a3f0 100644
--- a/graphics/blit/blit-avx2.cpp
+++ b/graphics/blit/blit-avx2.cpp
@@ -29,340 +29,340 @@ namespace Graphics {
 
 template<bool doscale, bool rgbmod, bool alphamod>
 struct AlphaBlendAVX2 {
-	static inline __m256i simd(__m256i src, __m256i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
-	    __m256i ina;
-	    if (alphamod)
-			ina = _mm256_srli_epi32(_mm256_mullo_epi16(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask)), _mm256_set1_epi32(ca)), 8);
-	    else
-			ina = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
-	    __m256i alphaMask = _mm256_cmpeq_epi32(ina, _mm256_setzero_si256());
-	
-	    if (rgbmod) {
-	    	__m256i dstR = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
-	    	__m256i dstG = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
-	    	__m256i dstB = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
-	    	__m256i srcR = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
-	    	__m256i srcG = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
-	    	__m256i srcB = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
-
-			dstR = _mm256_slli_epi32(_mm256_mullo_epi16(dstR, _mm256_sub_epi32(_mm256_set1_epi32(255), ina)), BlendBlit::kRModShift - 8);
-			dstG = _mm256_slli_epi32(_mm256_mullo_epi16(dstG, _mm256_sub_epi32(_mm256_set1_epi32(255), ina)), BlendBlit::kGModShift - 8);
-			dstB = _mm256_mullo_epi16(dstB, _mm256_sub_epi32(_mm256_set1_epi32(255), ina));
-			srcR = _mm256_add_epi32(dstR, _mm256_slli_epi32(_mm256_mullo_epi16(_mm256_srli_epi32(_mm256_mullo_epi16(srcR, ina), 8), _mm256_set1_epi32(cr)), BlendBlit::kRModShift - 8));
-			srcG = _mm256_add_epi32(dstG, _mm256_slli_epi32(_mm256_mullo_epi16(_mm256_srli_epi32(_mm256_mullo_epi16(srcG, ina), 8), _mm256_set1_epi32(cg)), BlendBlit::kGModShift - 8));
-			srcB = _mm256_add_epi32(dstB, _mm256_mullo_epi16(_mm256_srli_epi32(_mm256_mullo_epi16(srcB, ina), 8), _mm256_set1_epi32(cb)));
-			src = _mm256_or_si256(_mm256_and_si256(srcB, _mm256_set1_epi32(BlendBlit::kBModMask)), _mm256_set1_epi32(BlendBlit::kAModMask));
-			src = _mm256_or_si256(_mm256_and_si256(srcG, _mm256_set1_epi32(BlendBlit::kGModMask)), src);
-			src = _mm256_or_si256(_mm256_and_si256(srcR, _mm256_set1_epi32(BlendBlit::kRModMask)), src);
-	    } else {
-			__m256i dstRB = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
-			__m256i srcRB = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
-			__m256i dstG = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
-			__m256i srcG = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
-
-			dstRB = _mm256_srli_epi32(_mm256_mullo_epi32(dstRB, _mm256_sub_epi32(_mm256_set1_epi32(255), ina)), 8);
-			dstG = _mm256_srli_epi32(_mm256_mullo_epi16(dstG, _mm256_sub_epi32(_mm256_set1_epi32(255), ina)), 8);
-			srcRB = _mm256_slli_epi32(_mm256_add_epi32(dstRB, _mm256_srli_epi32(_mm256_mullo_epi32(srcRB, ina), 8)), BlendBlit::kBModShift);
-			srcG = _mm256_slli_epi32(_mm256_add_epi32(dstG, _mm256_srli_epi32(_mm256_mullo_epi16(srcG, ina), 8)), BlendBlit::kGModShift);
-			src = _mm256_or_si256(_mm256_and_si256(srcG, _mm256_set1_epi32(BlendBlit::kGModMask)), _mm256_set1_epi32(BlendBlit::kAModMask));
-			src = _mm256_or_si256(_mm256_and_si256(srcRB, _mm256_set1_epi32(BlendBlit::kBModMask | BlendBlit::kRModMask)), src);
-	    }
-
-		dst = _mm256_and_si256(alphaMask, dst);
-		src = _mm256_andnot_si256(alphaMask, src);
-	    return _mm256_or_si256(dst, src);
-	}
-
-	static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
-		uint32 ina = in[BlendBlit::kAIndex] * ca >> 8;
-
-		if (ina != 0) {
-			uint outb = (out[BlendBlit::kBIndex] * (255 - ina) >> 8);
-			uint outg = (out[BlendBlit::kGIndex] * (255 - ina) >> 8);
-			uint outr = (out[BlendBlit::kRIndex] * (255 - ina) >> 8);
-
-			out[BlendBlit::kAIndex] = 255;
-			out[BlendBlit::kBIndex] = outb + (in[BlendBlit::kBIndex] * ina * cb >> 16);
-			out[BlendBlit::kGIndex] = outg + (in[BlendBlit::kGIndex] * ina * cg >> 16);
-			out[BlendBlit::kRIndex] = outr + (in[BlendBlit::kRIndex] * ina * cr >> 16);
-		}
-	}
+    static inline __m256i simd(__m256i src, __m256i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+        __m256i ina;
+        if (alphamod)
+            ina = _mm256_srli_epi32(_mm256_mullo_epi16(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask)), _mm256_set1_epi32(ca)), 8);
+        else
+            ina = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
+        __m256i alphaMask = _mm256_cmpeq_epi32(ina, _mm256_setzero_si256());
+    
+        if (rgbmod) {
+            __m256i dstR = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+            __m256i dstG = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+            __m256i dstB = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+            __m256i srcR = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+            __m256i srcG = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+            __m256i srcB = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+
+            dstR = _mm256_slli_epi32(_mm256_mullo_epi16(dstR, _mm256_sub_epi32(_mm256_set1_epi32(255), ina)), BlendBlit::kRModShift - 8);
+            dstG = _mm256_slli_epi32(_mm256_mullo_epi16(dstG, _mm256_sub_epi32(_mm256_set1_epi32(255), ina)), BlendBlit::kGModShift - 8);
+            dstB = _mm256_mullo_epi16(dstB, _mm256_sub_epi32(_mm256_set1_epi32(255), ina));
+            srcR = _mm256_add_epi32(dstR, _mm256_slli_epi32(_mm256_mullo_epi16(_mm256_srli_epi32(_mm256_mullo_epi16(srcR, ina), 8), _mm256_set1_epi32(cr)), BlendBlit::kRModShift - 8));
+            srcG = _mm256_add_epi32(dstG, _mm256_slli_epi32(_mm256_mullo_epi16(_mm256_srli_epi32(_mm256_mullo_epi16(srcG, ina), 8), _mm256_set1_epi32(cg)), BlendBlit::kGModShift - 8));
+            srcB = _mm256_add_epi32(dstB, _mm256_mullo_epi16(_mm256_srli_epi32(_mm256_mullo_epi16(srcB, ina), 8), _mm256_set1_epi32(cb)));
+            src = _mm256_or_si256(_mm256_and_si256(srcB, _mm256_set1_epi32(BlendBlit::kBModMask)), _mm256_set1_epi32(BlendBlit::kAModMask));
+            src = _mm256_or_si256(_mm256_and_si256(srcG, _mm256_set1_epi32(BlendBlit::kGModMask)), src);
+            src = _mm256_or_si256(_mm256_and_si256(srcR, _mm256_set1_epi32(BlendBlit::kRModMask)), src);
+        } else {
+            __m256i dstRB = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+            __m256i srcRB = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+            __m256i dstG = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+            __m256i srcG = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+
+            dstRB = _mm256_srli_epi32(_mm256_mullo_epi32(dstRB, _mm256_sub_epi32(_mm256_set1_epi32(255), ina)), 8);
+            dstG = _mm256_srli_epi32(_mm256_mullo_epi16(dstG, _mm256_sub_epi32(_mm256_set1_epi32(255), ina)), 8);
+            srcRB = _mm256_slli_epi32(_mm256_add_epi32(dstRB, _mm256_srli_epi32(_mm256_mullo_epi32(srcRB, ina), 8)), BlendBlit::kBModShift);
+            srcG = _mm256_slli_epi32(_mm256_add_epi32(dstG, _mm256_srli_epi32(_mm256_mullo_epi16(srcG, ina), 8)), BlendBlit::kGModShift);
+            src = _mm256_or_si256(_mm256_and_si256(srcG, _mm256_set1_epi32(BlendBlit::kGModMask)), _mm256_set1_epi32(BlendBlit::kAModMask));
+            src = _mm256_or_si256(_mm256_and_si256(srcRB, _mm256_set1_epi32(BlendBlit::kBModMask | BlendBlit::kRModMask)), src);
+        }
+
+        dst = _mm256_and_si256(alphaMask, dst);
+        src = _mm256_andnot_si256(alphaMask, src);
+        return _mm256_or_si256(dst, src);
+    }
+
+    static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+        uint32 ina = in[BlendBlit::kAIndex] * ca >> 8;
+
+        if (ina != 0) {
+            uint outb = (out[BlendBlit::kBIndex] * (255 - ina) >> 8);
+            uint outg = (out[BlendBlit::kGIndex] * (255 - ina) >> 8);
+            uint outr = (out[BlendBlit::kRIndex] * (255 - ina) >> 8);
+
+            out[BlendBlit::kAIndex] = 255;
+            out[BlendBlit::kBIndex] = outb + (in[BlendBlit::kBIndex] * ina * cb >> 16);
+            out[BlendBlit::kGIndex] = outg + (in[BlendBlit::kGIndex] * ina * cg >> 16);
+            out[BlendBlit::kRIndex] = outr + (in[BlendBlit::kRIndex] * ina * cr >> 16);
+        }
+    }
 };
 
 template<bool doscale, bool rgbmod, bool alphamod>
 struct MultiplyBlendAVX2 {
-	static inline __m256i simd(__m256i src, __m256i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
-    	__m256i ina;
-	    if (alphamod)
-			ina = _mm256_srli_epi32(_mm256_mullo_epi16(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask)), _mm256_set1_epi32(ca)), 8);
-	    else
-			ina = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
-	    __m256i alphaMask = _mm256_cmpeq_epi32(ina, _mm256_setzero_si256());
-
-    	if (rgbmod) {
-			__m256i srcb = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
-			__m256i srcg = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
-			__m256i srcr = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
-			__m256i dstb = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
-			__m256i dstg = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
-			__m256i dstr = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
-
-			srcb = _mm256_and_si256(_mm256_srli_epi32(_mm256_mullo_epi32(dstb, _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi16(srcb, _mm256_set1_epi32(cb)), ina), 16)), BlendBlit::kBModShift - 8), _mm256_set1_epi32(BlendBlit::kBModMask));
-			srcg = _mm256_and_si256(_mm256_srli_epi32(_mm256_mullo_epi32(dstg, _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi16(srcg, _mm256_set1_epi32(cg)), ina), 16)), BlendBlit::kGModShift - 8), _mm256_set1_epi32(BlendBlit::kGModMask));
-			srcr = _mm256_and_si256(_mm256_srli_epi32(_mm256_mullo_epi32(dstr, _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi16(srcr, _mm256_set1_epi32(cr)), ina), 16)), BlendBlit::kRModShift - 8), _mm256_set1_epi32(BlendBlit::kRModMask));
-
-			src = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
-			src = _mm256_or_si256(src, _mm256_or_si256(srcb, _mm256_or_si256(srcg, srcr)));
-    	} else {
-			__m256i srcg = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask));
-			__m256i srcrb = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
-			__m256i dstg = _mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask));
-			__m256i dstrb = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
-    	    srcg = _mm256_and_si256(_mm256_srli_epi32(_mm256_mullo_epi32(dstg, _mm256_srli_epi32(_mm256_mullo_epi32(srcg, ina), 8)), 8), _mm256_set1_epi32(BlendBlit::kGModMask));
-    	    srcrb = _mm256_and_si256(_mm256_mullo_epi32(dstrb, _mm256_srli_epi32(_mm256_mullo_epi32(srcrb, ina), 8)), _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask));
-    	    src = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
-    	    src = _mm256_or_si256(src, _mm256_or_si256(srcrb, srcg));
-    	}
-
-    	dst = _mm256_and_si256(alphaMask, dst);
-    	src = _mm256_andnot_si256(alphaMask, src);
-    	return _mm256_or_si256(dst, src);
-	}
-
-	static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
-		uint32 ina = in[BlendBlit::kAIndex] * ca >> 8;
-
-		if (ina != 0) {
-			out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] * ((in[BlendBlit::kBIndex] * cb * ina) >> 16) >> 8;
-			out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] * ((in[BlendBlit::kGIndex] * cg * ina) >> 16) >> 8;
-			out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] * ((in[BlendBlit::kRIndex] * cr * ina) >> 16) >> 8;
-		}
-	}
+    static inline __m256i simd(__m256i src, __m256i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+        __m256i ina;
+        if (alphamod)
+            ina = _mm256_srli_epi32(_mm256_mullo_epi16(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask)), _mm256_set1_epi32(ca)), 8);
+        else
+            ina = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
+        __m256i alphaMask = _mm256_cmpeq_epi32(ina, _mm256_setzero_si256());
+
+        if (rgbmod) {
+            __m256i srcb = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+            __m256i srcg = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+            __m256i srcr = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+            __m256i dstb = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+            __m256i dstg = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+            __m256i dstr = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+
+            srcb = _mm256_and_si256(_mm256_srli_epi32(_mm256_mullo_epi32(dstb, _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi16(srcb, _mm256_set1_epi32(cb)), ina), 16)), BlendBlit::kBModShift - 8), _mm256_set1_epi32(BlendBlit::kBModMask));
+            srcg = _mm256_and_si256(_mm256_srli_epi32(_mm256_mullo_epi32(dstg, _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi16(srcg, _mm256_set1_epi32(cg)), ina), 16)), BlendBlit::kGModShift - 8), _mm256_set1_epi32(BlendBlit::kGModMask));
+            srcr = _mm256_and_si256(_mm256_srli_epi32(_mm256_mullo_epi32(dstr, _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi16(srcr, _mm256_set1_epi32(cr)), ina), 16)), BlendBlit::kRModShift - 8), _mm256_set1_epi32(BlendBlit::kRModMask));
+
+            src = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
+            src = _mm256_or_si256(src, _mm256_or_si256(srcb, _mm256_or_si256(srcg, srcr)));
+        } else {
+            __m256i srcg = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask));
+            __m256i srcrb = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+            __m256i dstg = _mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask));
+            __m256i dstrb = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+            srcg = _mm256_and_si256(_mm256_srli_epi32(_mm256_mullo_epi32(dstg, _mm256_srli_epi32(_mm256_mullo_epi32(srcg, ina), 8)), 8), _mm256_set1_epi32(BlendBlit::kGModMask));
+            srcrb = _mm256_and_si256(_mm256_mullo_epi32(dstrb, _mm256_srli_epi32(_mm256_mullo_epi32(srcrb, ina), 8)), _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask));
+            src = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
+            src = _mm256_or_si256(src, _mm256_or_si256(srcrb, srcg));
+        }
+
+        dst = _mm256_and_si256(alphaMask, dst);
+        src = _mm256_andnot_si256(alphaMask, src);
+        return _mm256_or_si256(dst, src);
+    }
+
+    static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+        uint32 ina = in[BlendBlit::kAIndex] * ca >> 8;
+
+        if (ina != 0) {
+            out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] * ((in[BlendBlit::kBIndex] * cb * ina) >> 16) >> 8;
+            out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] * ((in[BlendBlit::kGIndex] * cg * ina) >> 16) >> 8;
+            out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] * ((in[BlendBlit::kRIndex] * cr * ina) >> 16) >> 8;
+        }
+    }
 };
 
 template<bool doscale, bool rgbmod, bool alphamod>
 struct OpaqueBlendAVX2 {
-	static inline __m256i simd(__m256i src, __m256i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
-		return _mm256_or_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
-	}
+    static inline __m256i simd(__m256i src, __m256i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+        return _mm256_or_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
+    }
 
-	static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
-		*(uint32 *)out = *(const uint32 *)in | BlendBlit::kAModMask;
-	}
+    static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+        *(uint32 *)out = *(const uint32 *)in | BlendBlit::kAModMask;
+    }
 };
 
 template<bool doscale, bool rgbmod, bool alphamod>
 struct BinaryBlendAVX2 {
-	static inline __m256i simd(__m256i src, __m256i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
-		__m256i alphaMask = _mm256_cmpeq_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask)), _mm256_setzero_si256());
-		dst = _mm256_and_si256(dst, alphaMask);
-		src = _mm256_andnot_si256(alphaMask, _mm256_or_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask)));
-		return _mm256_or_si256(src, dst);
-	}
-
-	static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
-		uint32 pix = *(const uint32 *)in;
-		int a = in[BlendBlit::kAIndex];
-
-		if (a != 0) {   // Full opacity (Any value not exactly 0 is Opaque here)
-			*(uint32 *)out = pix;
-			out[BlendBlit::kAIndex] = 0xFF;
-		}
-	}
+    static inline __m256i simd(__m256i src, __m256i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+        __m256i alphaMask = _mm256_cmpeq_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask)), _mm256_setzero_si256());
+        dst = _mm256_and_si256(dst, alphaMask);
+        src = _mm256_andnot_si256(alphaMask, _mm256_or_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask)));
+        return _mm256_or_si256(src, dst);
+    }
+
+    static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+        uint32 pix = *(const uint32 *)in;
+        int a = in[BlendBlit::kAIndex];
+
+        if (a != 0) {   // Full opacity (Any value not exactly 0 is Opaque here)
+            *(uint32 *)out = pix;
+            out[BlendBlit::kAIndex] = 0xFF;
+        }
+    }
 };
 
 template<bool doscale, bool rgbmod, bool alphamod>
 struct AdditiveBlendAVX2 {
-	static inline __m256i simd(__m256i src, __m256i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
-    	__m256i ina;
-    	if (alphamod)
-    	    ina = _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask)), _mm256_set1_epi32(ca)), 8);
-    	else
-    	    ina = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
-    	__m256i alphaMask = _mm256_cmpeq_epi32(ina, _mm256_set1_epi32(0));
-
-    	if (rgbmod) {
-    	    __m256i srcb = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kBModMask));
-    	    __m256i srcg = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
-    	    __m256i srcr = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
-    	    __m256i dstb = _mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kBModMask));
-    	    __m256i dstg = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
-    	    __m256i dstr = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
-
-			srcb = _mm256_and_si256(_mm256_add_epi32(dstb, _mm256_srli_epi32(_mm256_mullo_epi32(srcb, _mm256_mullo_epi32(_mm256_set1_epi32(cb), ina)), 16)), _mm256_set1_epi32(BlendBlit::kBModMask));
-			srcg = _mm256_and_si256(_mm256_add_epi32(dstg, _mm256_mullo_epi32(srcg, _mm256_mullo_epi32(_mm256_set1_epi32(cg), ina))), _mm256_set1_epi32(BlendBlit::kGModMask));
-			srcr = _mm256_and_si256(_mm256_add_epi32(dstr, _mm256_srli_epi32(_mm256_mullo_epi32(srcr, _mm256_mullo_epi32(_mm256_set1_epi32(cr), ina)), BlendBlit::kRModShift - 16)), _mm256_set1_epi32(BlendBlit::kRModMask));
-
-    	    src = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
-    	    src = _mm256_or_si256(src, _mm256_or_si256(srcb, _mm256_or_si256(srcg, srcb)));
-    	} else if (alphamod) {
-    	    __m256i srcg = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask));
-    	    __m256i srcrb = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
-    	    __m256i dstg = _mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask));
-    	    __m256i dstrb = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
-
-			srcg = _mm256_and_si256(_mm256_add_epi32(dstg, _mm256_srli_epi32(_mm256_mullo_epi32(srcg, ina), 8)), _mm256_set1_epi32(BlendBlit::kGModMask));
-			srcrb = _mm256_and_si256(_mm256_add_epi32(dstrb, _mm256_mullo_epi32(srcrb, ina)), _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask));
-
-    	    src = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
-    	    src = _mm256_or_si256(src, _mm256_or_si256(srcrb, srcg));
-    	} else {
-    	    __m256i srcg = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask));
-    	    __m256i srcrb = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
-    	    __m256i dstg = _mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask));
-    	    __m256i dstrb = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
-
-			srcg = _mm256_and_si256(_mm256_add_epi32(dstg, srcg), _mm256_set1_epi32(BlendBlit::kGModMask));
-			srcrb = _mm256_and_si256(_mm256_slli_epi32(_mm256_add_epi32(dstrb, srcrb), 8), _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask));
-
-    	    src = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
-    	    src = _mm256_or_si256(src, _mm256_or_si256(srcrb, srcg));
-		}
-
-    	dst = _mm256_and_si256(alphaMask, dst);
-    	src = _mm256_andnot_si256(alphaMask, src);
-    	return _mm256_or_si256(dst, src);
-	}
-
-	static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
-		uint32 ina = in[BlendBlit::kAIndex] * ca >> 8;
-
-		if (ina != 0) {
-			out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] + ((in[BlendBlit::kBIndex] * cb * ina) >> 16);
-			out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] + ((in[BlendBlit::kGIndex] * cg * ina) >> 16);
-			out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] + ((in[BlendBlit::kRIndex] * cr * ina) >> 16);
-		}
-	}
+    static inline __m256i simd(__m256i src, __m256i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+        __m256i ina;
+        if (alphamod)
+            ina = _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask)), _mm256_set1_epi32(ca)), 8);
+        else
+            ina = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
+        __m256i alphaMask = _mm256_cmpeq_epi32(ina, _mm256_set1_epi32(0));
+
+        if (rgbmod) {
+            __m256i srcb = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kBModMask));
+            __m256i srcg = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+            __m256i srcr = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+            __m256i dstb = _mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kBModMask));
+            __m256i dstg = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+            __m256i dstr = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+
+            srcb = _mm256_and_si256(_mm256_add_epi32(dstb, _mm256_srli_epi32(_mm256_mullo_epi32(srcb, _mm256_mullo_epi32(_mm256_set1_epi32(cb), ina)), 16)), _mm256_set1_epi32(BlendBlit::kBModMask));
+            srcg = _mm256_and_si256(_mm256_add_epi32(dstg, _mm256_mullo_epi32(srcg, _mm256_mullo_epi32(_mm256_set1_epi32(cg), ina))), _mm256_set1_epi32(BlendBlit::kGModMask));
+            srcr = _mm256_and_si256(_mm256_add_epi32(dstr, _mm256_srli_epi32(_mm256_mullo_epi32(srcr, _mm256_mullo_epi32(_mm256_set1_epi32(cr), ina)), BlendBlit::kRModShift - 16)), _mm256_set1_epi32(BlendBlit::kRModMask));
+
+            src = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
+            src = _mm256_or_si256(src, _mm256_or_si256(srcb, _mm256_or_si256(srcg, srcb)));
+        } else if (alphamod) {
+            __m256i srcg = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask));
+            __m256i srcrb = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+            __m256i dstg = _mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask));
+            __m256i dstrb = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+
+            srcg = _mm256_and_si256(_mm256_add_epi32(dstg, _mm256_srli_epi32(_mm256_mullo_epi32(srcg, ina), 8)), _mm256_set1_epi32(BlendBlit::kGModMask));
+            srcrb = _mm256_and_si256(_mm256_add_epi32(dstrb, _mm256_mullo_epi32(srcrb, ina)), _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask));
+
+            src = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
+            src = _mm256_or_si256(src, _mm256_or_si256(srcrb, srcg));
+        } else {
+            __m256i srcg = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask));
+            __m256i srcrb = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+            __m256i dstg = _mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask));
+            __m256i dstrb = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+
+            srcg = _mm256_and_si256(_mm256_add_epi32(dstg, srcg), _mm256_set1_epi32(BlendBlit::kGModMask));
+            srcrb = _mm256_and_si256(_mm256_slli_epi32(_mm256_add_epi32(dstrb, srcrb), 8), _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask));
+
+            src = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
+            src = _mm256_or_si256(src, _mm256_or_si256(srcrb, srcg));
+        }
+
+        dst = _mm256_and_si256(alphaMask, dst);
+        src = _mm256_andnot_si256(alphaMask, src);
+        return _mm256_or_si256(dst, src);
+    }
+
+    static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+        uint32 ina = in[BlendBlit::kAIndex] * ca >> 8;
+
+        if (ina != 0) {
+            out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] + ((in[BlendBlit::kBIndex] * cb * ina) >> 16);
+            out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] + ((in[BlendBlit::kGIndex] * cg * ina) >> 16);
+            out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] + ((in[BlendBlit::kRIndex] * cr * ina) >> 16);
+        }
+    }
 };
 
 template<bool doscale, bool rgbmod, bool alphamod>
 struct SubtractiveBlendAVX2 {
-	static inline __m256i simd(__m256i src, __m256i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
-    	__m256i ina = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
-    	__m256i srcb = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
-    	__m256i srcg = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
-    	__m256i srcr = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
-    	__m256i dstb = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
-    	__m256i dstg = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
-    	__m256i dstr = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
-
-		srcb = _mm256_and_si256(_mm256_slli_epi32(_mm256_max_epi16(_mm256_sub_epi32(dstb, _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(srcb, _mm256_set1_epi32(cb)), _mm256_mullo_epi32(dstb, ina)), 24)), _mm256_set1_epi32(0)), BlendBlit::kBModShift), _mm256_set1_epi32(BlendBlit::kBModMask));
-		srcg = _mm256_and_si256(_mm256_slli_epi32(_mm256_max_epi16(_mm256_sub_epi32(dstg, _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(srcg, _mm256_set1_epi32(cg)), _mm256_mullo_epi32(dstg, ina)), 24)), _mm256_set1_epi32(0)), BlendBlit::kGModShift), _mm256_set1_epi32(BlendBlit::kGModMask));
-		srcr = _mm256_and_si256(_mm256_slli_epi32(_mm256_max_epi16(_mm256_sub_epi32(dstr, _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(srcr, _mm256_set1_epi32(cr)), _mm256_mullo_epi32(dstr, ina)), 24)), _mm256_set1_epi32(0)), BlendBlit::kRModShift), _mm256_set1_epi32(BlendBlit::kRModMask));
-
-    	return _mm256_or_si256(_mm256_set1_epi32(BlendBlit::kAModMask), _mm256_or_si256(srcb, _mm256_or_si256(srcg, srcr)));
-	}
-
-	static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
-		out[BlendBlit::kAIndex] = 255;
-		out[BlendBlit::kBIndex] = MAX<int32>(out[BlendBlit::kBIndex] - ((in[BlendBlit::kBIndex] * cb  * (out[BlendBlit::kBIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
-		out[BlendBlit::kGIndex] = MAX<int32>(out[BlendBlit::kGIndex] - ((in[BlendBlit::kGIndex] * cg  * (out[BlendBlit::kGIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
-		out[BlendBlit::kRIndex] = MAX<int32>(out[BlendBlit::kRIndex] - ((in[BlendBlit::kRIndex] * cr *  (out[BlendBlit::kRIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
-	}
+    static inline __m256i simd(__m256i src, __m256i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+        __m256i ina = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
+        __m256i srcb = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+        __m256i srcg = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+        __m256i srcr = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+        __m256i dstb = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+        __m256i dstg = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+        __m256i dstr = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+
+        srcb = _mm256_and_si256(_mm256_slli_epi32(_mm256_max_epi16(_mm256_sub_epi32(dstb, _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(srcb, _mm256_set1_epi32(cb)), _mm256_mullo_epi32(dstb, ina)), 24)), _mm256_set1_epi32(0)), BlendBlit::kBModShift), _mm256_set1_epi32(BlendBlit::kBModMask));
+        srcg = _mm256_and_si256(_mm256_slli_epi32(_mm256_max_epi16(_mm256_sub_epi32(dstg, _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(srcg, _mm256_set1_epi32(cg)), _mm256_mullo_epi32(dstg, ina)), 24)), _mm256_set1_epi32(0)), BlendBlit::kGModShift), _mm256_set1_epi32(BlendBlit::kGModMask));
+        srcr = _mm256_and_si256(_mm256_slli_epi32(_mm256_max_epi16(_mm256_sub_epi32(dstr, _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(srcr, _mm256_set1_epi32(cr)), _mm256_mullo_epi32(dstr, ina)), 24)), _mm256_set1_epi32(0)), BlendBlit::kRModShift), _mm256_set1_epi32(BlendBlit::kRModMask));
+
+        return _mm256_or_si256(_mm256_set1_epi32(BlendBlit::kAModMask), _mm256_or_si256(srcb, _mm256_or_si256(srcg, srcr)));
+    }
+
+    static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+        out[BlendBlit::kAIndex] = 255;
+        out[BlendBlit::kBIndex] = MAX<int32>(out[BlendBlit::kBIndex] - ((in[BlendBlit::kBIndex] * cb  * (out[BlendBlit::kBIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
+        out[BlendBlit::kGIndex] = MAX<int32>(out[BlendBlit::kGIndex] - ((in[BlendBlit::kGIndex] * cg  * (out[BlendBlit::kGIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
+        out[BlendBlit::kRIndex] = MAX<int32>(out[BlendBlit::kRIndex] - ((in[BlendBlit::kRIndex] * cr *  (out[BlendBlit::kRIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
+    }
 };
 
 template<template <bool DOSCALE, bool RGBMOD, bool ALPHAMOD> class PixelFunc, bool doscale, bool rgbmod, bool alphamod, bool coloradd1, bool loaddst>
 void BlendBlitImpl::blitInnerLoopAVX2(BlendBlit::Args &args) {
-	const byte *in;
-	byte *out;
-
-	const byte rawcr = (args.color >> BlendBlit::kRModShift) & 0xFF;
-	const byte rawcg = (args.color >> BlendBlit::kGModShift) & 0xFF;
-	const byte rawcb = (args.color >> BlendBlit::kBModShift) & 0xFF;
-	const byte ca = alphamod ? ((args.color >> BlendBlit::kAModShift) & 0xFF) : 255;
-	const uint32 cr = coloradd1 ? (rgbmod   ? (rawcr == 255 ? 256 : rawcr) : 256) : (rgbmod   ? rawcr : 255);
-	const uint32 cg = coloradd1 ? (rgbmod   ? (rawcg == 255 ? 256 : rawcg) : 256) : (rgbmod   ? rawcg : 255);
-	const uint32 cb = coloradd1 ? (rgbmod   ? (rawcb == 255 ? 256 : rawcb) : 256) : (rgbmod   ? rawcb : 255);
-
-	int scaleXCtr, scaleYCtr = args.scaleYoff;
-	const byte *inBase;
-
-	if (!doscale && (args.flipping & FLIP_H)) args.ino -= 4 * 7;
-
-	for (uint32 i = 0; i < args.height; i++) {
-		if (doscale) {
-			inBase = args.ino + scaleYCtr / BlendBlit::SCALE_THRESHOLD * args.inoStep;
-			scaleXCtr = args.scaleXoff;
-		} else {
-			in = args.ino;
-		}
-		out = args.outo;
-
-		uint32 j = 0;
-		for (; j + 8 <= args.width; j += 8) {
-    		__m256i dstPixels, srcPixels;
-			if (loaddst) dstPixels = _mm256_loadu_si256((const __m256i *)out);
-    		if (!doscale) {
-    		    srcPixels = _mm256_loadu_si256((const __m256i *)in);
-    		} else {
-				srcPixels = _mm256_setr_epi32(
-					*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 0) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
-					*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 1) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
-					*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 2) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
-					*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 3) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
-					*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 4) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
-					*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 5) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
-					*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 6) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
-					*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 7) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep)
-				);
-				scaleXCtr += args.scaleX * 8;
-    		}
-    		if (!doscale && (args.flipping & FLIP_H)) {
-				srcPixels = _mm256_shuffle_epi32(srcPixels, _MM_SHUFFLE(0, 1, 2, 3));
-				srcPixels = _mm256_permute2x128_si256(srcPixels, srcPixels, 0x01);
-    		}
-			{
-				const __m256i res = PixelFunc<doscale, rgbmod, alphamod>::simd(srcPixels, dstPixels, args.flipping & FLIP_H, ca, cr, cg, cb);
-				_mm256_storeu_si256((__m256i *)out, res);
-			}
-			if (!doscale) in += (ptrdiff_t)args.inStep * 8;
-			out += 4ULL * 8;
-		}
-		if (!doscale && (args.flipping & FLIP_H)) in += 4 * 7;
-		for (; j < args.width; j++) {
-			if (doscale) {
-				in = inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep;
-			}
-
-			PixelFunc<doscale, rgbmod, alphamod>::normal(in, out, args.flipping & FLIP_H, ca, cr, cg, cb);
+    const byte *in;
+    byte *out;
+
+    const byte rawcr = (args.color >> BlendBlit::kRModShift) & 0xFF;
+    const byte rawcg = (args.color >> BlendBlit::kGModShift) & 0xFF;
+    const byte rawcb = (args.color >> BlendBlit::kBModShift) & 0xFF;
+    const byte ca = alphamod ? ((args.color >> BlendBlit::kAModShift) & 0xFF) : 255;
+    const uint32 cr = coloradd1 ? (rgbmod   ? (rawcr == 255 ? 256 : rawcr) : 256) : (rgbmod   ? rawcr : 255);
+    const uint32 cg = coloradd1 ? (rgbmod   ? (rawcg == 255 ? 256 : rawcg) : 256) : (rgbmod   ? rawcg : 255);
+    const uint32 cb = coloradd1 ? (rgbmod   ? (rawcb == 255 ? 256 : rawcb) : 256) : (rgbmod   ? rawcb : 255);
+
+    int scaleXCtr, scaleYCtr = args.scaleYoff;
+    const byte *inBase;
+
+    if (!doscale && (args.flipping & FLIP_H)) args.ino -= 4 * 7;
+
+    for (uint32 i = 0; i < args.height; i++) {
+        if (doscale) {
+            inBase = args.ino + scaleYCtr / BlendBlit::SCALE_THRESHOLD * args.inoStep;
+            scaleXCtr = args.scaleXoff;
+        } else {
+            in = args.ino;
+        }
+        out = args.outo;
+
+        uint32 j = 0;
+        for (; j + 8 <= args.width; j += 8) {
+            __m256i dstPixels, srcPixels;
+            if (loaddst) dstPixels = _mm256_loadu_si256((const __m256i *)out);
+            if (!doscale) {
+                srcPixels = _mm256_loadu_si256((const __m256i *)in);
+            } else {
+                srcPixels = _mm256_setr_epi32(
+                    *(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 0) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
+                    *(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 1) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
+                    *(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 2) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
+                    *(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 3) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
+                    *(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 4) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
+                    *(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 5) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
+                    *(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 6) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
+                    *(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 7) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep)
+                );
+                scaleXCtr += args.scaleX * 8;
+            }
+            if (!doscale && (args.flipping & FLIP_H)) {
+                srcPixels = _mm256_shuffle_epi32(srcPixels, _MM_SHUFFLE(0, 1, 2, 3));
+                srcPixels = _mm256_permute2x128_si256(srcPixels, srcPixels, 0x01);
+            }
+            {
+                const __m256i res = PixelFunc<doscale, rgbmod, alphamod>::simd(srcPixels, dstPixels, args.flipping & FLIP_H, ca, cr, cg, cb);
+                _mm256_storeu_si256((__m256i *)out, res);
+            }
+            if (!doscale) in += (ptrdiff_t)args.inStep * 8;
+            out += 4ULL * 8;
+        }
+        if (!doscale && (args.flipping & FLIP_H)) in += 4 * 7;
+        for (; j < args.width; j++) {
+            if (doscale) {
+                in = inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep;
+            }
+
+            PixelFunc<doscale, rgbmod, alphamod>::normal(in, out, args.flipping & FLIP_H, ca, cr, cg, cb);
             
-			if (doscale)
-				scaleXCtr += args.scaleX;
-			else
-				in += args.inStep;
-			out += 4;
-		}
-		if (doscale)
-			scaleYCtr += args.scaleY;
-		else
-			args.ino += args.inoStep;
-		args.outo += args.dstPitch;
-	}
+            if (doscale)
+                scaleXCtr += args.scaleX;
+            else
+                in += args.inStep;
+            out += 4;
+        }
+        if (doscale)
+            scaleYCtr += args.scaleY;
+        else
+            args.ino += args.inoStep;
+        args.outo += args.dstPitch;
+    }
 }
 
 template<bool doscale, bool rgbmod, bool alphamod>
 void BlendBlit::doBlitAlphaBlendLogicAVX2(Args &args) {
-	BlendBlitImpl::blitInnerLoopAVX2<AlphaBlendAVX2, doscale, rgbmod, alphamod, false, true>(args);
+    BlendBlitImpl::blitInnerLoopAVX2<AlphaBlendAVX2, doscale, rgbmod, alphamod, false, true>(args);
 }
 template<bool doscale, bool rgbmod>
 void BlendBlit::doBlitSubtractiveBlendLogicAVX2(Args &args) {
-	BlendBlitImpl::blitInnerLoopAVX2<SubtractiveBlendAVX2, doscale, rgbmod, false, false, true>(args);
+    BlendBlitImpl::blitInnerLoopAVX2<SubtractiveBlendAVX2, doscale, rgbmod, false, false, true>(args);
 }
 template<bool doscale, bool rgbmod, bool alphamod>
 void BlendBlit::doBlitAdditiveBlendLogicAVX2(Args &args) {
-	BlendBlitImpl::blitInnerLoopAVX2<AdditiveBlendAVX2, doscale, rgbmod, alphamod, false, true>(args);
+    BlendBlitImpl::blitInnerLoopAVX2<AdditiveBlendAVX2, doscale, rgbmod, alphamod, false, true>(args);
 }
 template<bool doscale>
 void BlendBlit::doBlitOpaqueBlendLogicAVX2(Args &args) {
-	BlendBlitImpl::blitInnerLoopAVX2<OpaqueBlendAVX2, doscale, false, false, false, true>(args);
+    BlendBlitImpl::blitInnerLoopAVX2<OpaqueBlendAVX2, doscale, false, false, false, true>(args);
 }
 template<bool doscale>
 void BlendBlit::doBlitBinaryBlendLogicAVX2(Args &args) {
-	BlendBlitImpl::blitInnerLoopAVX2<BinaryBlendAVX2, doscale, false, false, false, true>(args);
+    BlendBlitImpl::blitInnerLoopAVX2<BinaryBlendAVX2, doscale, false, false, false, true>(args);
 }
 template<bool doscale, bool rgbmod, bool alphamod>
 void BlendBlit::doBlitMultiplyBlendLogicAVX2(Args &args) {
-	BlendBlitImpl::blitInnerLoopAVX2<MultiplyBlendAVX2, doscale, rgbmod, alphamod, false, true>(args);
+    BlendBlitImpl::blitInnerLoopAVX2<MultiplyBlendAVX2, doscale, rgbmod, alphamod, false, true>(args);
 }
 
 } // End of namespace Graphics
diff --git a/graphics/blit/blit-neon.cpp b/graphics/blit/blit-neon.cpp
index 17712dc979f..6d57e16f857 100644
--- a/graphics/blit/blit-neon.cpp
+++ b/graphics/blit/blit-neon.cpp
@@ -29,236 +29,236 @@ namespace Graphics {
 
 template<bool doscale, bool rgbmod, bool alphamod>
 struct AlphaBlend {
-	static inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
-	    uint32x4_t ina;
-	    if (alphamod)
-	        ina = vshrq_n_u32(vmulq_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask)), vdupq_n_u32(ca)), 8);
-	    else
-	        ina = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
-	    uint32x4_t alphaMask = vceqq_u32(ina, vmovq_n_u32(0));
-	
-	    if (rgbmod) {
-	        uint32x4_t dstR = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask)), 16);
-	        uint32x4_t srcR = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask)), 16);
-	        uint32x4_t dstG = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask)), 8);
-	        uint32x4_t srcG = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask)), 8);
-	        uint32x4_t dstB = vandq_u32(dst, vmovq_n_u32(BlendBlit::kBModMask));
-	        uint32x4_t srcB = vandq_u32(src, vmovq_n_u32(BlendBlit::kBModMask));
-	
-	        dstR = vshrq_n_u32(vmulq_u32(dstR, vsubq_u32(vmovq_n_u32(255), ina)), 8);
-	        dstG = vshrq_n_u32(vmulq_u32(dstG, vsubq_u32(vmovq_n_u32(255), ina)), 8);
-	        dstB = vshrq_n_u32(vmulq_u32(dstB, vsubq_u32(vmovq_n_u32(255), ina)), 8);
-	        srcR = vaddq_u32(dstR, vshrq_n_u32(vmulq_u32(vmulq_u32(srcR, ina), vmovq_n_u32(cr)), 16));
-	        srcG = vaddq_u32(dstG, vshrq_n_u32(vmulq_u32(vmulq_u32(srcG, ina), vmovq_n_u32(cg)), 16));
-	        srcB = vaddq_u32(dstB, vshrq_n_u32(vmulq_u32(vmulq_u32(srcB, ina), vmovq_n_u32(cb)), 16));
-	        src = vorrq_u32(vandq_u32(srcB, vmovq_n_u32(BlendBlit::kBModMask)), vmovq_n_u32(BlendBlit::kAModMask));
-	        src = vorrq_u32(vandq_u32(vshlq_n_u32(srcG, 8), vmovq_n_u32(BlendBlit::kGModMask)), src);
-	        src = vorrq_u32(vandq_u32(vshlq_n_u32(srcR, 16), vmovq_n_u32(BlendBlit::kRModMask)), src);
-	    } else {
-	        uint32x4_t dstRB = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), 8);
-	        uint32x4_t srcRB = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), 8);
-	        uint32x4_t dstG = vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask));
-	        uint32x4_t srcG = vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask));
-	
-	        dstRB = vmulq_u32(dstRB, vsubq_u32(vmovq_n_u32(255), ina));
-	        dstG = vshrq_n_u32(vmulq_u32(dstG, vsubq_u32(vmovq_n_u32(255), ina)), 8);
-	        srcRB = vaddq_u32(dstRB, vmulq_u32(srcRB, ina));
-	        srcG = vaddq_u32(dstG, vshrq_n_u32(vmulq_u32(srcG, ina), 8));
-	        src = vorrq_u32(vandq_u32(srcG, vmovq_n_u32(BlendBlit::kGModMask)), vmovq_n_u32(BlendBlit::kAModMask));
-	        src = vorrq_u32(vandq_u32(srcRB, vmovq_n_u32(BlendBlit::kBModMask | BlendBlit::kRModMask)), src);
-	    }
-	
-	    dst = vandq_u32(alphaMask, dst);
-	    src = vandq_u32(vmvnq_u32(alphaMask), src);
-	    return vorrq_u32(dst, src);
-	}
-
-	static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
-		uint32 ina = in[BlendBlit::kAIndex] * ca >> 8;
-
-		if (ina != 0) {
-			uint outb = (out[BlendBlit::kBIndex] * (255 - ina) >> 8);
-			uint outg = (out[BlendBlit::kGIndex] * (255 - ina) >> 8);
-			uint outr = (out[BlendBlit::kRIndex] * (255 - ina) >> 8);
-
-			out[BlendBlit::kAIndex] = 255;
-			out[BlendBlit::kBIndex] = outb + (in[BlendBlit::kBIndex] * ina * cb >> 16);
-			out[BlendBlit::kGIndex] = outg + (in[BlendBlit::kGIndex] * ina * cg >> 16);
-			out[BlendBlit::kRIndex] = outr + (in[BlendBlit::kRIndex] * ina * cr >> 16);
-		}
-	}
+    static inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+        uint32x4_t ina;
+        if (alphamod)
+            ina = vshrq_n_u32(vmulq_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask)), vdupq_n_u32(ca)), 8);
+        else
+            ina = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
+        uint32x4_t alphaMask = vceqq_u32(ina, vmovq_n_u32(0));
+    
+        if (rgbmod) {
+            uint32x4_t dstR = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask)), 16);
+            uint32x4_t srcR = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask)), 16);
+            uint32x4_t dstG = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask)), 8);
+            uint32x4_t srcG = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask)), 8);
+            uint32x4_t dstB = vandq_u32(dst, vmovq_n_u32(BlendBlit::kBModMask));
+            uint32x4_t srcB = vandq_u32(src, vmovq_n_u32(BlendBlit::kBModMask));
+    
+            dstR = vshrq_n_u32(vmulq_u32(dstR, vsubq_u32(vmovq_n_u32(255), ina)), 8);
+            dstG = vshrq_n_u32(vmulq_u32(dstG, vsubq_u32(vmovq_n_u32(255), ina)), 8);
+            dstB = vshrq_n_u32(vmulq_u32(dstB, vsubq_u32(vmovq_n_u32(255), ina)), 8);
+            srcR = vaddq_u32(dstR, vshrq_n_u32(vmulq_u32(vmulq_u32(srcR, ina), vmovq_n_u32(cr)), 16));
+            srcG = vaddq_u32(dstG, vshrq_n_u32(vmulq_u32(vmulq_u32(srcG, ina), vmovq_n_u32(cg)), 16));
+            srcB = vaddq_u32(dstB, vshrq_n_u32(vmulq_u32(vmulq_u32(srcB, ina), vmovq_n_u32(cb)), 16));
+            src = vorrq_u32(vandq_u32(srcB, vmovq_n_u32(BlendBlit::kBModMask)), vmovq_n_u32(BlendBlit::kAModMask));
+            src = vorrq_u32(vandq_u32(vshlq_n_u32(srcG, 8), vmovq_n_u32(BlendBlit::kGModMask)), src);
+            src = vorrq_u32(vandq_u32(vshlq_n_u32(srcR, 16), vmovq_n_u32(BlendBlit::kRModMask)), src);
+        } else {
+            uint32x4_t dstRB = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), 8);
+            uint32x4_t srcRB = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), 8);
+            uint32x4_t dstG = vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask));
+            uint32x4_t srcG = vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask));
+    
+            dstRB = vmulq_u32(dstRB, vsubq_u32(vmovq_n_u32(255), ina));
+            dstG = vshrq_n_u32(vmulq_u32(dstG, vsubq_u32(vmovq_n_u32(255), ina)), 8);
+            srcRB = vaddq_u32(dstRB, vmulq_u32(srcRB, ina));
+            srcG = vaddq_u32(dstG, vshrq_n_u32(vmulq_u32(srcG, ina), 8));
+            src = vorrq_u32(vandq_u32(srcG, vmovq_n_u32(BlendBlit::kGModMask)), vmovq_n_u32(BlendBlit::kAModMask));
+            src = vorrq_u32(vandq_u32(srcRB, vmovq_n_u32(BlendBlit::kBModMask | BlendBlit::kRModMask)), src);
+        }
+    
+        dst = vandq_u32(alphaMask, dst);
+        src = vandq_u32(vmvnq_u32(alphaMask), src);
+        return vorrq_u32(dst, src);
+    }
+
+    static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+        uint32 ina = in[BlendBlit::kAIndex] * ca >> 8;
+
+        if (ina != 0) {
+            uint outb = (out[BlendBlit::kBIndex] * (255 - ina) >> 8);
+            uint outg = (out[BlendBlit::kGIndex] * (255 - ina) >> 8);
+            uint outr = (out[BlendBlit::kRIndex] * (255 - ina) >> 8);
+
+            out[BlendBlit::kAIndex] = 255;
+            out[BlendBlit::kBIndex] = outb + (in[BlendBlit::kBIndex] * ina * cb >> 16);
+            out[BlendBlit::kGIndex] = outg + (in[BlendBlit::kGIndex] * ina * cg >> 16);
+            out[BlendBlit::kRIndex] = outr + (in[BlendBlit::kRIndex] * ina * cr >> 16);
+        }
+    }
 };
 
 template<bool doscale, bool rgbmod, bool alphamod>
 struct MultiplyBlend {
-	static inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
-    	uint32x4_t ina;
-    	if (alphamod)
-    	    ina = vshrq_n_u32(vmulq_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask)), vdupq_n_u32(ca)), 8);
-    	else
-    	    ina = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
-    	uint32x4_t alphaMask = vceqq_u32(ina, vmovq_n_u32(0));
-
-    	if (rgbmod) {
-    	    uint32x4_t srcb = vandq_u32(src, vmovq_n_u32(BlendBlit::kBModMask));
-    	    uint32x4_t srcg = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
-    	    uint32x4_t srcr = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
-    	    uint32x4_t dstb = vandq_u32(dst, vmovq_n_u32(BlendBlit::kBModMask));
-    	    uint32x4_t dstg = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
-    	    uint32x4_t dstr = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
-
-    	    srcb = vandq_u32(vshrq_n_u32(vmulq_u32(dstb, vshrq_n_u32(vmulq_u32(vmulq_u32(srcb, vmovq_n_u32(cb)), ina), 16)), 8), vmovq_n_u32(BlendBlit::kBModMask));
-    	    srcg = vandq_u32(vshlq_n_u32(vmulq_u32(dstg, vshrq_n_u32(vmulq_u32(vmulq_u32(srcg, vmovq_n_u32(cg)), ina), 16)), BlendBlit::kGModShift - 8), vmovq_n_u32(BlendBlit::kGModMask));
-    	    srcr = vandq_u32(vshlq_n_u32(vmulq_u32(dstr, vshrq_n_u32(vmulq_u32(vmulq_u32(srcr, vmovq_n_u32(cr)), ina), 16)), BlendBlit::kRModShift - 8), vmovq_n_u32(BlendBlit::kRModMask));
-
-    	    src = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
-    	    src = vorrq_u32(src, vorrq_u32(srcb, vorrq_u32(srcg, srcr)));
-    	} else {
-    	    uint32x4_t srcg = vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask));
-    	    uint32x4_t srcrb = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
-    	    uint32x4_t dstg = vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask));
-    	    uint32x4_t dstrb = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
-    	    srcg = vandq_u32(vshrq_n_u32(vmulq_u32(dstg, vshrq_n_u32(vmulq_u32(srcg, ina), 8)), 8), vmovq_n_u32(BlendBlit::kGModMask));
-    	    srcrb = vandq_u32(vmulq_u32(dstrb, vshrq_n_u32(vmulq_u32(srcrb, ina), 8)), vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask));
-    	    src = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
-    	    src = vorrq_u32(src, vorrq_u32(srcrb, srcg));
-    	}
-
-    	dst = vandq_u32(alphaMask, dst);
-    	src = vandq_u32(vmvnq_u32(alphaMask), src);
-    	return vorrq_u32(dst, src);
-	}
-
-	static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
-		uint32 ina = in[BlendBlit::kAIndex] * ca >> 8;
-
-		if (ina != 0) {
-			out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] * ((in[BlendBlit::kBIndex] * cb * ina) >> 16) >> 8;
-			out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] * ((in[BlendBlit::kGIndex] * cg * ina) >> 16) >> 8;
-			out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] * ((in[BlendBlit::kRIndex] * cr * ina) >> 16) >> 8;
-		}
-	}
+    static inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+        uint32x4_t ina;
+        if (alphamod)
+            ina = vshrq_n_u32(vmulq_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask)), vdupq_n_u32(ca)), 8);
+        else
+            ina = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
+        uint32x4_t alphaMask = vceqq_u32(ina, vmovq_n_u32(0));
+
+        if (rgbmod) {
+            uint32x4_t srcb = vandq_u32(src, vmovq_n_u32(BlendBlit::kBModMask));
+            uint32x4_t srcg = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+            uint32x4_t srcr = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+            uint32x4_t dstb = vandq_u32(dst, vmovq_n_u32(BlendBlit::kBModMask));
+            uint32x4_t dstg = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+            uint32x4_t dstr = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+
+            srcb = vandq_u32(vshrq_n_u32(vmulq_u32(dstb, vshrq_n_u32(vmulq_u32(vmulq_u32(srcb, vmovq_n_u32(cb)), ina), 16)), 8), vmovq_n_u32(BlendBlit::kBModMask));
+            srcg = vandq_u32(vshlq_n_u32(vmulq_u32(dstg, vshrq_n_u32(vmulq_u32(vmulq_u32(srcg, vmovq_n_u32(cg)), ina), 16)), BlendBlit::kGModShift - 8), vmovq_n_u32(BlendBlit::kGModMask));
+            srcr = vandq_u32(vshlq_n_u32(vmulq_u32(dstr, vshrq_n_u32(vmulq_u32(vmulq_u32(srcr, vmovq_n_u32(cr)), ina), 16)), BlendBlit::kRModShift - 8), vmovq_n_u32(BlendBlit::kRModMask));
+
+            src = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
+            src = vorrq_u32(src, vorrq_u32(srcb, vorrq_u32(srcg, srcr)));
+        } else {
+            uint32x4_t srcg = vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask));
+            uint32x4_t srcrb = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+            uint32x4_t dstg = vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask));
+            uint32x4_t dstrb = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+            srcg = vandq_u32(vshrq_n_u32(vmulq_u32(dstg, vshrq_n_u32(vmulq_u32(srcg, ina), 8)), 8), vmovq_n_u32(BlendBlit::kGModMask));
+            srcrb = vandq_u32(vmulq_u32(dstrb, vshrq_n_u32(vmulq_u32(srcrb, ina), 8)), vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask));
+            src = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
+            src = vorrq_u32(src, vorrq_u32(srcrb, srcg));
+        }
+
+        dst = vandq_u32(alphaMask, dst);
+        src = vandq_u32(vmvnq_u32(alphaMask), src);
+        return vorrq_u32(dst, src);
+    }
+
+    static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+        uint32 ina = in[BlendBlit::kAIndex] * ca >> 8;
+
+        if (ina != 0) {
+            out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] * ((in[BlendBlit::kBIndex] * cb * ina) >> 16) >> 8;
+            out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] * ((in[BlendBlit::kGIndex] * cg * ina) >> 16) >> 8;
+            out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] * ((in[BlendBlit::kRIndex] * cr * ina) >> 16) >> 8;
+        }
+    }
 };
 
 template<bool doscale, bool rgbmod, bool alphamod>
 struct OpaqueBlend {
-	static inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
-		return vorrq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
-	}
+    static inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+        return vorrq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
+    }
 
-	static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
-		*(uint32 *)out = *(const uint32 *)in | BlendBlit::kAModMask;
-	}
+    static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+        *(uint32 *)out = *(const uint32 *)in | BlendBlit::kAModMask;
+    }
 };
 
 template<bool doscale, bool rgbmod, bool alphamod>
 struct BinaryBlend {
-	static inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+    static inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
         uint32x4_t alphaMask = vceqq_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask)), vmovq_n_u32(0));
         dst = vandq_u32(dst, alphaMask);
         src = vandq_u32(vorrq_u32(src, vmovq_n_u32(BlendBlit::kAModMask)), vmvnq_u32(alphaMask));
         return vorrq_u32(dst, src);
-	}
+    }
 
-	static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
-		uint32 pix = *(const uint32 *)in;
-		int a = in[BlendBlit::kAIndex];
+    static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+        uint32 pix = *(const uint32 *)in;
+        int a = in[BlendBlit::kAIndex];
 
-		if (a != 0) {   // Full opacity (Any value not exactly 0 is Opaque here)
-			*(uint32 *)out = pix;
-			out[BlendBlit::kAIndex] = 0xFF;
-		}
-	}
+        if (a != 0) {   // Full opacity (Any value not exactly 0 is Opaque here)
+            *(uint32 *)out = pix;
+            out[BlendBlit::kAIndex] = 0xFF;
+        }
+    }
 };
 
 template<bool doscale, bool rgbmod, bool alphamod>
 struct AdditiveBlend {
-	static inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
-    	uint32x4_t ina;
-    	if (alphamod)
-    	    ina = vshrq_n_u32(vmulq_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask)), vdupq_n_u32(ca)), 8);
-    	else
-    	    ina = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
-    	uint32x4_t alphaMask = vceqq_u32(ina, vmovq_n_u32(0));
-
-    	if (rgbmod) {
-    	    uint32x4_t srcb = vandq_u32(src, vmovq_n_u32(BlendBlit::kBModMask));
-    	    uint32x4_t srcg = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
-    	    uint32x4_t srcr = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
-    	    uint32x4_t dstb = vandq_u32(dst, vmovq_n_u32(BlendBlit::kBModMask));
-    	    uint32x4_t dstg = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
-    	    uint32x4_t dstr = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
-
-			srcb = vandq_u32(vaddq_u32(dstb, vshrq_n_u32(vmulq_u32(srcb, vmulq_u32(vmovq_n_u32(cb), ina)), 16)), vmovq_n_u32(BlendBlit::kBModMask));
-			srcg = vandq_u32(vaddq_u32(dstg, vmulq_u32(srcg, vmulq_u32(vmovq_n_u32(cg), ina))), vmovq_n_u32(BlendBlit::kGModMask));
-			srcr = vandq_u32(vaddq_u32(dstr, vshrq_n_u32(vmulq_u32(srcr, vmulq_u32(vmovq_n_u32(cr), ina)), BlendBlit::kRModShift - 16)), vmovq_n_u32(BlendBlit::kRModMask));
-
-    	    src = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
-    	    src = vorrq_u32(src, vorrq_u32(srcb, vorrq_u32(srcg, srcr)));
-    	} else if (alphamod) {
-    	    uint32x4_t srcg = vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask));
-    	    uint32x4_t srcrb = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
-    	    uint32x4_t dstg = vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask));
-    	    uint32x4_t dstrb = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
-
-			srcg = vandq_u32(vaddq_u32(dstg, vshrq_n_u32(vmulq_u32(srcg, ina), 8)), vmovq_n_u32(BlendBlit::kGModMask));
-			srcrb = vandq_u32(vaddq_u32(dstrb, vmulq_u32(srcrb, ina)), vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask));
-
-    	    src = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
-    	    src = vorrq_u32(src, vorrq_u32(srcrb, srcg));
-    	} else {
-    	    uint32x4_t srcg = vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask));
-    	    uint32x4_t srcrb = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
-    	    uint32x4_t dstg = vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask));
-    	    uint32x4_t dstrb = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
-
-			srcg = vandq_u32(vaddq_u32(dstg, srcg), vmovq_n_u32(BlendBlit::kGModMask));
-			srcrb = vandq_u32(vshlq_n_u32(vaddq_u32(dstrb, srcrb), 8), vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask));
-
-    	    src = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
-    	    src = vorrq_u32(src, vorrq_u32(srcrb, srcg));
-		}
-
-    	dst = vandq_u32(alphaMask, dst);
-    	src = vandq_u32(vmvnq_u32(alphaMask), src);
-    	return vorrq_u32(dst, src);
-	}
-
-	static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
-		uint32 ina = in[BlendBlit::kAIndex] * ca >> 8;
-
-		if (ina != 0) {
-			out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] + ((in[BlendBlit::kBIndex] * cb * ina) >> 16);
-			out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] + ((in[BlendBlit::kGIndex] * cg * ina) >> 16);
-			out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] + ((in[BlendBlit::kRIndex] * cr * ina) >> 16);
-		}
-	}
+    static inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+        uint32x4_t ina;
+        if (alphamod)
+            ina = vshrq_n_u32(vmulq_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask)), vdupq_n_u32(ca)), 8);
+        else
+            ina = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
+        uint32x4_t alphaMask = vceqq_u32(ina, vmovq_n_u32(0));
+
+        if (rgbmod) {
+            uint32x4_t srcb = vandq_u32(src, vmovq_n_u32(BlendBlit::kBModMask));
+            uint32x4_t srcg = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+            uint32x4_t srcr = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+            uint32x4_t dstb = vandq_u32(dst, vmovq_n_u32(BlendBlit::kBModMask));
+            uint32x4_t dstg = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+            uint32x4_t dstr = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+
+            srcb = vandq_u32(vaddq_u32(dstb, vshrq_n_u32(vmulq_u32(srcb, vmulq_u32(vmovq_n_u32(cb), ina)), 16)), vmovq_n_u32(BlendBlit::kBModMask));
+            srcg = vandq_u32(vaddq_u32(dstg, vmulq_u32(srcg, vmulq_u32(vmovq_n_u32(cg), ina))), vmovq_n_u32(BlendBlit::kGModMask));
+            srcr = vandq_u32(vaddq_u32(dstr, vshrq_n_u32(vmulq_u32(srcr, vmulq_u32(vmovq_n_u32(cr), ina)), BlendBlit::kRModShift - 16)), vmovq_n_u32(BlendBlit::kRModMask));
+
+            src = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
+            src = vorrq_u32(src, vorrq_u32(srcb, vorrq_u32(srcg, srcr)));
+        } else if (alphamod) {
+            uint32x4_t srcg = vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask));
+            uint32x4_t srcrb = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+            uint32x4_t dstg = vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask));
+            uint32x4_t dstrb = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+
+            srcg = vandq_u32(vaddq_u32(dstg, vshrq_n_u32(vmulq_u32(srcg, ina), 8)), vmovq_n_u32(BlendBlit::kGModMask));
+            srcrb = vandq_u32(vaddq_u32(dstrb, vmulq_u32(srcrb, ina)), vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask));
+
+            src = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
+            src = vorrq_u32(src, vorrq_u32(srcrb, srcg));
+        } else {
+            uint32x4_t srcg = vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask));
+            uint32x4_t srcrb = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+            uint32x4_t dstg = vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask));
+            uint32x4_t dstrb = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+
+            srcg = vandq_u32(vaddq_u32(dstg, srcg), vmovq_n_u32(BlendBlit::kGModMask));
+            srcrb = vandq_u32(vshlq_n_u32(vaddq_u32(dstrb, srcrb), 8), vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask));
+
+            src = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
+            src = vorrq_u32(src, vorrq_u32(srcrb, srcg));
+        }
+
+        dst = vandq_u32(alphaMask, dst);
+        src = vandq_u32(vmvnq_u32(alphaMask), src);
+        return vorrq_u32(dst, src);
+    }
+
+    static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+        uint32 ina = in[BlendBlit::kAIndex] * ca >> 8;
+
+        if (ina != 0) {
+            out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] + ((in[BlendBlit::kBIndex] * cb * ina) >> 16);
+            out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] + ((in[BlendBlit::kGIndex] * cg * ina) >> 16);
+            out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] + ((in[BlendBlit::kRIndex] * cr * ina) >> 16);
+        }
+    }
 };
 
 template<bool doscale, bool rgbmod, bool alphamod>
 struct SubtractiveBlend {
-	static inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
-    	uint32x4_t ina = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
-    	uint32x4_t srcb = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
-    	uint32x4_t srcg = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
-    	uint32x4_t srcr = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
-    	uint32x4_t dstb = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
-    	uint32x4_t dstg = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
-    	uint32x4_t dstr = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
-
-		srcb = vandq_u32(vshlq_n_u32(vreinterpretq_u32_s32(vmaxq_s32(vsubq_s32(vreinterpretq_s32_u32(dstb), vreinterpretq_s32_u32(vshrq_n_u32(vmulq_u32(vmulq_u32(srcb, vmovq_n_u32(cb)), vmulq_u32(dstb, ina)), 24))), vmovq_n_s32(0))), BlendBlit::kBModShift), vmovq_n_u32(BlendBlit::kBModMask));
-		srcg = vandq_u32(vshlq_n_u32(vreinterpretq_u32_s32(vmaxq_s32(vsubq_s32(vreinterpretq_s32_u32(dstg), vreinterpretq_s32_u32(vshrq_n_u32(vmulq_u32(vmulq_u32(srcg, vmovq_n_u32(cg)), vmulq_u32(dstg, ina)), 24))), vmovq_n_s32(0))), BlendBlit::kGModShift), vmovq_n_u32(BlendBlit::kGModMask));
-		srcr = vandq_u32(vshlq_n_u32(vreinterpretq_u32_s32(vmaxq_s32(vsubq_s32(vreinterpretq_s32_u32(dstr), vreinterpretq_s32_u32(vshrq_n_u32(vmulq_u32(vmulq_u32(srcr, vmovq_n_u32(cr)), vmulq_u32(dstr, ina)), 24))), vmovq_n_s32(0))), BlendBlit::kRModShift), vmovq_n_u32(BlendBlit::kRModMask));
-
-    	return vorrq_u32(vmovq_n_u32(BlendBlit::kAModMask), vorrq_u32(srcb, vorrq_u32(srcg, srcr)));
-	}
-
-	static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
-		out[BlendBlit::kAIndex] = 255;
-		out[BlendBlit::kBIndex] = MAX<int32>(out[BlendBlit::kBIndex] - ((in[BlendBlit::kBIndex] * cb  * (out[BlendBlit::kBIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
-		out[BlendBlit::kGIndex] = MAX<int32>(out[BlendBlit::kGIndex] - ((in[BlendBlit::kGIndex] * cg  * (out[BlendBlit::kGIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
-		out[BlendBlit::kRIndex] = MAX<int32>(out[BlendBlit::kRIndex] - ((in[BlendBlit::kRIndex] * cr *  (out[BlendBlit::kRIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
-	}
+    static inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+        uint32x4_t ina = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
+        uint32x4_t srcb = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+        uint32x4_t srcg = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+        uint32x4_t srcr = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+        uint32x4_t dstb = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+        uint32x4_t dstg = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+        uint32x4_t dstr = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+
+        srcb = vandq_u32(vshlq_n_u32(vreinterpretq_u32_s32(vmaxq_s32(vsubq_s32(vreinterpretq_s32_u32(dstb), vreinterpretq_s32_u32(vshrq_n_u32(vmulq_u32(vmulq_u32(srcb, vmovq_n_u32(cb)), vmulq_u32(dstb, ina)), 24))), vmovq_n_s32(0))), BlendBlit::kBModShift), vmovq_n_u32(BlendBlit::kBModMask));
+        srcg = vandq_u32(vshlq_n_u32(vreinterpretq_u32_s32(vmaxq_s32(vsubq_s32(vreinterpretq_s32_u32(dstg), vreinterpretq_s32_u32(vshrq_n_u32(vmulq_u32(vmulq_u32(srcg, vmovq_n_u32(cg)), vmulq_u32(dstg, ina)), 24))), vmovq_n_s32(0))), BlendBlit::kGModShift), vmovq_n_u32(BlendBlit::kGModMask));
+        srcr = vandq_u32(vshlq_n_u32(vreinterpretq_u32_s32(vmaxq_s32(vsubq_s32(vreinterpretq_s32_u32(dstr), vreinterpretq_s32_u32(vshrq_n_u32(vmulq_u32(vmulq_u32(srcr, vmovq_n_u32(cr)), vmulq_u32(dstr, ina)), 24))), vmovq_n_s32(0))), BlendBlit::kRModShift), vmovq_n_u32(BlendBlit::kRModMask));
+
+        return vorrq_u32(vmovq_n_u32(BlendBlit::kAModMask), vorrq_u32(srcb, vorrq_u32(srcg, srcr)));
+    }
+
+    static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+        out[BlendBlit::kAIndex] = 255;
+        out[BlendBlit::kBIndex] = MAX<int32>(out[BlendBlit::kBIndex] - ((in[BlendBlit::kBIndex] * cb  * (out[BlendBlit::kBIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
+        out[BlendBlit::kGIndex] = MAX<int32>(out[BlendBlit::kGIndex] - ((in[BlendBlit::kGIndex] * cg  * (out[BlendBlit::kGIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
+        out[BlendBlit::kRIndex] = MAX<int32>(out[BlendBlit::kRIndex] - ((in[BlendBlit::kRIndex] * cr *  (out[BlendBlit::kRIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
+    }
 };
 
 class BlendBlitImpl {
@@ -266,105 +266,105 @@ class BlendBlitImpl {
 public:
 template<template <bool DOSCALE, bool RGBMOD, bool ALPHAMOD> class PixelFunc, bool doscale, bool rgbmod, bool alphamod, bool coloradd1, bool loaddst>
 static inline void blitInnerLoop(BlendBlit::Args &args) {
-	const byte *in;
-	byte *out;
+    const byte *in;
+    byte *out;
 
-	const byte rawcr = (args.color >> BlendBlit::kRModShift) & 0xFF;
-	const byte rawcg = (args.color >> BlendBlit::kGModShift) & 0xFF;
-	const byte rawcb = (args.color >> BlendBlit::kBModShift) & 0xFF;
-	const byte ca = alphamod ? ((args.color >> BlendBlit::kAModShift) & 0xFF) : 255;
-	const uint32 cr = coloradd1 ? (rgbmod   ? (rawcr == 255 ? 256 : rawcr) : 256) : (rgbmod   ? rawcr : 255);
-	const uint32 cg = coloradd1 ? (rgbmod   ? (rawcg == 255 ? 256 : rawcg) : 256) : (rgbmod   ? rawcg : 255);
-	const uint32 cb = coloradd1 ? (rgbmod   ? (rawcb == 255 ? 256 : rawcb) : 256) : (rgbmod   ? rawcb : 255);
+    const byte rawcr = (args.color >> BlendBlit::kRModShift) & 0xFF;
+    const byte rawcg = (args.color >> BlendBlit::kGModShift) & 0xFF;
+    const byte rawcb = (args.color >> BlendBlit::kBModShift) & 0xFF;
+    const byte ca = alphamod ? ((args.color >> BlendBlit::kAModShift) & 0xFF) : 255;
+    const uint32 cr = coloradd1 ? (rgbmod   ? (rawcr == 255 ? 256 : rawcr) : 256) : (rgbmod   ? rawcr : 255);
+    const uint32 cg = coloradd1 ? (rgbmod   ? (rawcg == 255 ? 256 : rawcg) : 256) : (rgbmod   ? rawcg : 255);
+    const uint32 cb = coloradd1 ? (rgbmod   ? (rawcb == 255 ? 256 : rawcb) : 256) : (rgbmod   ? rawcb : 255);
 
-	int scaleXCtr, scaleYCtr = args.scaleYoff;
-	const byte *inBase;
+    int scaleXCtr, scaleYCtr = args.scaleYoff;
+    const byte *inBase;
 
     if (!doscale && (args.flipping & FLIP_H)) args.ino -= 4 * 3;
 
-	for (uint32 i = 0; i < args.height; i++) {
-		if (doscale) {
-			inBase = args.ino + scaleYCtr / BlendBlit::SCALE_THRESHOLD * args.inoStep;
-			scaleXCtr = args.scaleXoff;
-		} else {
-			in = args.ino;
-		}
-		out = args.outo;
+    for (uint32 i = 0; i < args.height; i++) {
+        if (doscale) {
+            inBase = args.ino + scaleYCtr / BlendBlit::SCALE_THRESHOLD * args.inoStep;
+            scaleXCtr = args.scaleXoff;
+        } else {
+            in = args.ino;
+        }
+        out = args.outo;
         uint32 j = 0;
-		for (; j + 4 <= args.width; j += 4) {
+        for (; j + 4 <= args.width; j += 4) {
             uint32x4_t dstPixels;
             if (loaddst) dstPixels = vld1q_u32((const uint32 *)out);
             uint32x4_t srcPixels;
             if (!doscale) {
                 srcPixels = vld1q_u32((const uint32 *)in);
             } else {
-				srcPixels = vsetq_lane_u32(*(const uint32 *)(inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep), vmovq_n_u32(0), 0);
+                srcPixels = vsetq_lane_u32(*(const uint32 *)(inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep), vmovq_n_u32(0), 0);
                 scaleXCtr += args.scaleX;
-				srcPixels = vsetq_lane_u32(*(const uint32 *)(inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep), srcPixels, 1);
+                srcPixels = vsetq_lane_u32(*(const uint32 *)(inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep), srcPixels, 1);
                 scaleXCtr += args.scaleX;
-				srcPixels = vsetq_lane_u32(*(const uint32 *)(inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep), srcPixels, 2);
+                srcPixels = vsetq_lane_u32(*(const uint32 *)(inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep), srcPixels, 2);
                 scaleXCtr += args.scaleX;
-				srcPixels = vsetq_lane_u32(*(const uint32 *)(inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep), srcPixels, 3);
+                srcPixels = vsetq_lane_u32(*(const uint32 *)(inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep), srcPixels, 3);
                 scaleXCtr += args.scaleX;
             }
             if (!doscale && (args.flipping & FLIP_H)) {
                 srcPixels = vrev64q_u32(srcPixels);
-	            srcPixels = vcombine_u32(vget_high_u32(srcPixels), vget_low_u32(srcPixels));
+                srcPixels = vcombine_u32(vget_high_u32(srcPixels), vget_low_u32(srcPixels));
+            }
+            {
+                const uint32x4_t res = PixelFunc<doscale, rgbmod, alphamod>::simd(srcPixels, dstPixels, args.flipping & FLIP_H, ca, cr, cg, cb);
+                vst1q_u32((uint32 *)out, res);
+            }
+            if (!doscale) in += args.inStep * 4;
+            out += 4 * 4;
+        }
+        if (!doscale && (args.flipping & FLIP_H)) in += 4 * 3;
+        for (; j < args.width; j++) {
+            if (doscale) {
+                in = inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep;
             }
-			{
-				const uint32x4_t res = PixelFunc<doscale, rgbmod, alphamod>::simd(srcPixels, dstPixels, args.flipping & FLIP_H, ca, cr, cg, cb);
-            	vst1q_u32((uint32 *)out, res);
-			}
-			if (!doscale) in += args.inStep * 4;
-			out += 4 * 4;
-		}
-		if (!doscale && (args.flipping & FLIP_H)) in += 4 * 3;
-		for (; j < args.width; j++) {
-			if (doscale) {
-				in = inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep;
-			}
-
-			PixelFunc<doscale, rgbmod, alphamod>::normal(in, out, args.flipping & FLIP_H, ca, cr, cg, cb);
+
+            PixelFunc<doscale, rgbmod, alphamod>::normal(in, out, args.flipping & FLIP_H, ca, cr, cg, cb);
             
-			if (doscale)
-				scaleXCtr += args.scaleX;
-			else
-				in += args.inStep;
-			out += 4;
-		}
-		if (doscale)
-			scaleYCtr += args.scaleY;
-		else
-			args.ino += args.inoStep;
-		args.outo += args.dstPitch;
-	}
+            if (doscale)
+                scaleXCtr += args.scaleX;
+            else
+                in += args.inStep;
+            out += 4;
+        }
+        if (doscale)
+            scaleYCtr += args.scaleY;
+        else
+            args.ino += args.inoStep;
+        args.outo += args.dstPitch;
+    }
 }
 
 };
 
 template<bool doscale, bool rgbmod, bool alphamod>
 void BlendBlit::doBlitAlphaBlendLogicNEON(Args &args) {
-	BlendBlitImpl::blitInnerLoop<AlphaBlend, doscale, rgbmod, alphamod, false, true>(args);
+    BlendBlitImpl::blitInnerLoop<AlphaBlend, doscale, rgbmod, alphamod, false, true>(args);
 }
 template<bool doscale, bool rgbmod>
 void BlendBlit::doBlitSubtractiveBlendLogicNEON(Args &args) {
-	BlendBlitImpl::blitInnerLoop<SubtractiveBlend, doscale, rgbmod, false, false, true>(args);
+    BlendBlitImpl::blitInnerLoop<SubtractiveBlend, doscale, rgbmod, false, false, true>(args);
 }
 template<bool doscale, bool rgbmod, bool alphamod>
 void BlendBlit::doBlitAdditiveBlendLogicNEON(Args &args) {
-	BlendBlitImpl::blitInnerLoop<AdditiveBlend, doscale, rgbmod, alphamod, false, true>(args);
+    BlendBlitImpl::blitInnerLoop<AdditiveBlend, doscale, rgbmod, alphamod, false, true>(args);
 }
 template<bool doscale>
 void BlendBlit::doBlitOpaqueBlendLogicNEON(Args &args) {
-	BlendBlitImpl::blitInnerLoop<OpaqueBlend, doscale, false, false, false, true>(args);
+    BlendBlitImpl::blitInnerLoop<OpaqueBlend, doscale, false, false, false, true>(args);
 }
 template<bool doscale>
 void BlendBlit::doBlitBinaryBlendLogicNEON(Args &args) {
-	BlendBlitImpl::blitInnerLoop<BinaryBlend, doscale, false, false, false, true>(args);
+    BlendBlitImpl::blitInnerLoop<BinaryBlend, doscale, false, false, false, true>(args);
 }
 template<bool doscale, bool rgbmod, bool alphamod>
 void BlendBlit::doBlitMultiplyBlendLogicNEON(Args &args) {
-	BlendBlitImpl::blitInnerLoop<MultiplyBlend, doscale, rgbmod, alphamod, false, true>(args);
+    BlendBlitImpl::blitInnerLoop<MultiplyBlend, doscale, rgbmod, alphamod, false, true>(args);
 }
 
 } // end of namespace Graphics
diff --git a/graphics/blit/blit-sse2.cpp b/graphics/blit/blit-sse2.cpp
index c1b15c14354..7aa572ce7a1 100644
--- a/graphics/blit/blit-sse2.cpp
+++ b/graphics/blit/blit-sse2.cpp
@@ -28,243 +28,243 @@
 namespace Graphics {
 
 static FORCEINLINE __m128i sse2_mul32(__m128i a, __m128i b) {
-	__m128i even = _mm_shuffle_epi32(_mm_mul_epu32(a, b), _MM_SHUFFLE(0, 0, 2, 0));
-	__m128i odd = _mm_shuffle_epi32(_mm_mul_epu32(_mm_bsrli_si128(a, 4), _mm_bsrli_si128(b, 4)), _MM_SHUFFLE(0, 0, 2, 0));
-	return _mm_unpacklo_epi32(even, odd);
+    __m128i even = _mm_shuffle_epi32(_mm_mul_epu32(a, b), _MM_SHUFFLE(0, 0, 2, 0));
+    __m128i odd = _mm_shuffle_epi32(_mm_mul_epu32(_mm_bsrli_si128(a, 4), _mm_bsrli_si128(b, 4)), _MM_SHUFFLE(0, 0, 2, 0));
+    return _mm_unpacklo_epi32(even, odd);
 }
 
 template<bool doscale, bool rgbmod, bool alphamod>
 struct AlphaBlend {
-	static inline __m128i simd(__m128i src, __m128i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
-	    __m128i ina;
-	    if (alphamod)
-			ina = _mm_srli_epi32(_mm_mullo_epi16(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask)), _mm_set1_epi32(ca)), 8);
-	    else
-			ina = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
-	    __m128i alphaMask = _mm_cmpeq_epi32(ina, _mm_setzero_si128());
-	
-	    if (rgbmod) {
-	    	__m128i dstR = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
-	    	__m128i dstG = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
-	    	__m128i dstB = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
-	    	__m128i srcR = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
-	    	__m128i srcG = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
-	    	__m128i srcB = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
-
-			dstR = _mm_slli_epi32(_mm_mullo_epi16(dstR, _mm_sub_epi32(_mm_set1_epi32(255), ina)), BlendBlit::kRModShift - 8);
-			dstG = _mm_slli_epi32(_mm_mullo_epi16(dstG, _mm_sub_epi32(_mm_set1_epi32(255), ina)), BlendBlit::kGModShift - 8);
-			dstB = _mm_mullo_epi16(dstB, _mm_sub_epi32(_mm_set1_epi32(255), ina));
-			srcR = _mm_add_epi32(dstR, _mm_slli_epi32(_mm_mullo_epi16(_mm_srli_epi32(_mm_mullo_epi16(srcR, ina), 8), _mm_set1_epi32(cr)), BlendBlit::kRModShift - 8));
-			srcG = _mm_add_epi32(dstG, _mm_slli_epi32(_mm_mullo_epi16(_mm_srli_epi32(_mm_mullo_epi16(srcG, ina), 8), _mm_set1_epi32(cg)), BlendBlit::kGModShift - 8));
-			srcB = _mm_add_epi32(dstB, _mm_mullo_epi16(_mm_srli_epi32(_mm_mullo_epi16(srcB, ina), 8), _mm_set1_epi32(cb)));
-			src = _mm_or_si128(_mm_and_si128(srcB, _mm_set1_epi32(BlendBlit::kBModMask)), _mm_set1_epi32(BlendBlit::kAModMask));
-			src = _mm_or_si128(_mm_and_si128(srcG, _mm_set1_epi32(BlendBlit::kGModMask)), src);
-			src = _mm_or_si128(_mm_and_si128(srcR, _mm_set1_epi32(BlendBlit::kRModMask)), src);
-	    } else {
-			__m128i dstRB = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
-			__m128i srcRB = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
-			__m128i dstG = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
-			__m128i srcG = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
-
-			dstRB = _mm_srli_epi32(sse2_mul32(dstRB, _mm_sub_epi32(_mm_set1_epi32(255), ina)), 8);
-			dstG = _mm_srli_epi32(_mm_mullo_epi16(dstG, _mm_sub_epi32(_mm_set1_epi32(255), ina)), 8);
-			srcRB = _mm_slli_epi32(_mm_add_epi32(dstRB, _mm_srli_epi32(sse2_mul32(srcRB, ina), 8)), BlendBlit::kBModShift);
-			srcG = _mm_slli_epi32(_mm_add_epi32(dstG, _mm_srli_epi32(_mm_mullo_epi16(srcG, ina), 8)), BlendBlit::kGModShift);
-			src = _mm_or_si128(_mm_and_si128(srcG, _mm_set1_epi32(BlendBlit::kGModMask)), _mm_set1_epi32(BlendBlit::kAModMask));
-			src = _mm_or_si128(_mm_and_si128(srcRB, _mm_set1_epi32(BlendBlit::kBModMask | BlendBlit::kRModMask)), src);
-	    }
-
-		dst = _mm_and_si128(alphaMask, dst);
-		src = _mm_andnot_si128(alphaMask, src);
-	    return _mm_or_si128(dst, src);
-	}
-
-	static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
-		uint32 ina = in[BlendBlit::kAIndex] * ca >> 8;
-
-		if (ina != 0) {
-			uint outb = (out[BlendBlit::kBIndex] * (255 - ina) >> 8);
-			uint outg = (out[BlendBlit::kGIndex] * (255 - ina) >> 8);
-			uint outr = (out[BlendBlit::kRIndex] * (255 - ina) >> 8);
-
-			out[BlendBlit::kAIndex] = 255;
-			out[BlendBlit::kBIndex] = outb + (in[BlendBlit::kBIndex] * ina * cb >> 16);
-			out[BlendBlit::kGIndex] = outg + (in[BlendBlit::kGIndex] * ina * cg >> 16);
-			out[BlendBlit::kRIndex] = outr + (in[BlendBlit::kRIndex] * ina * cr >> 16);
-		}
-	}
+    static inline __m128i simd(__m128i src, __m128i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+        __m128i ina;
+        if (alphamod)
+            ina = _mm_srli_epi32(_mm_mullo_epi16(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask)), _mm_set1_epi32(ca)), 8);
+        else
+            ina = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
+        __m128i alphaMask = _mm_cmpeq_epi32(ina, _mm_setzero_si128());
+    
+        if (rgbmod) {
+            __m128i dstR = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+            __m128i dstG = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+            __m128i dstB = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+            __m128i srcR = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+            __m128i srcG = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+            __m128i srcB = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+
+            dstR = _mm_slli_epi32(_mm_mullo_epi16(dstR, _mm_sub_epi32(_mm_set1_epi32(255), ina)), BlendBlit::kRModShift - 8);
+            dstG = _mm_slli_epi32(_mm_mullo_epi16(dstG, _mm_sub_epi32(_mm_set1_epi32(255), ina)), BlendBlit::kGModShift - 8);
+            dstB = _mm_mullo_epi16(dstB, _mm_sub_epi32(_mm_set1_epi32(255), ina));
+            srcR = _mm_add_epi32(dstR, _mm_slli_epi32(_mm_mullo_epi16(_mm_srli_epi32(_mm_mullo_epi16(srcR, ina), 8), _mm_set1_epi32(cr)), BlendBlit::kRModShift - 8));
+            srcG = _mm_add_epi32(dstG, _mm_slli_epi32(_mm_mullo_epi16(_mm_srli_epi32(_mm_mullo_epi16(srcG, ina), 8), _mm_set1_epi32(cg)), BlendBlit::kGModShift - 8));
+            srcB = _mm_add_epi32(dstB, _mm_mullo_epi16(_mm_srli_epi32(_mm_mullo_epi16(srcB, ina), 8), _mm_set1_epi32(cb)));
+            src = _mm_or_si128(_mm_and_si128(srcB, _mm_set1_epi32(BlendBlit::kBModMask)), _mm_set1_epi32(BlendBlit::kAModMask));
+            src = _mm_or_si128(_mm_and_si128(srcG, _mm_set1_epi32(BlendBlit::kGModMask)), src);
+            src = _mm_or_si128(_mm_and_si128(srcR, _mm_set1_epi32(BlendBlit::kRModMask)), src);
+        } else {
+            __m128i dstRB = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+            __m128i srcRB = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+            __m128i dstG = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+            __m128i srcG = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+
+            dstRB = _mm_srli_epi32(sse2_mul32(dstRB, _mm_sub_epi32(_mm_set1_epi32(255), ina)), 8);
+            dstG = _mm_srli_epi32(_mm_mullo_epi16(dstG, _mm_sub_epi32(_mm_set1_epi32(255), ina)), 8);
+            srcRB = _mm_slli_epi32(_mm_add_epi32(dstRB, _mm_srli_epi32(sse2_mul32(srcRB, ina), 8)), BlendBlit::kBModShift);
+            srcG = _mm_slli_epi32(_mm_add_epi32(dstG, _mm_srli_epi32(_mm_mullo_epi16(srcG, ina), 8)), BlendBlit::kGModShift);
+            src = _mm_or_si128(_mm_and_si128(srcG, _mm_set1_epi32(BlendBlit::kGModMask)), _mm_set1_epi32(BlendBlit::kAModMask));
+            src = _mm_or_si128(_mm_and_si128(srcRB, _mm_set1_epi32(BlendBlit::kBModMask | BlendBlit::kRModMask)), src);
+        }
+
+        dst = _mm_and_si128(alphaMask, dst);
+        src = _mm_andnot_si128(alphaMask, src);
+        return _mm_or_si128(dst, src);
+    }
+
+    static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+        uint32 ina = in[BlendBlit::kAIndex] * ca >> 8;
+
+        if (ina != 0) {
+            uint outb = (out[BlendBlit::kBIndex] * (255 - ina) >> 8);
+            uint outg = (out[BlendBlit::kGIndex] * (255 - ina) >> 8);
+            uint outr = (out[BlendBlit::kRIndex] * (255 - ina) >> 8);
+
+            out[BlendBlit::kAIndex] = 255;
+            out[BlendBlit::kBIndex] = outb + (in[BlendBlit::kBIndex] * ina * cb >> 16);
+            out[BlendBlit::kGIndex] = outg + (in[BlendBlit::kGIndex] * ina * cg >> 16);
+            out[BlendBlit::kRIndex] = outr + (in[BlendBlit::kRIndex] * ina * cr >> 16);
+        }
+    }
 };
 
 template<bool doscale, bool rgbmod, bool alphamod>
 struct MultiplyBlend {
-	static inline __m128i simd(__m128i src, __m128i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
-		__m128i ina;
-		if (alphamod)
-			ina = _mm_srli_epi32(_mm_mullo_epi16(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask)), _mm_set1_epi32(ca)), 8);
-		else
-			ina = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
-		__m128i alphaMask = _mm_cmpeq_epi32(ina, _mm_setzero_si128());
-
-		if (rgbmod) {
-			__m128i srcb = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
-			__m128i srcg = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
-			__m128i srcr = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
-			__m128i dstb = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
-			__m128i dstg = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
-			__m128i dstr = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
-
-			srcb = _mm_and_si128(_mm_srli_epi32(sse2_mul32(dstb, _mm_srli_epi32(sse2_mul32(_mm_mullo_epi16(srcb, _mm_set1_epi32(cb)), ina), 16)), BlendBlit::kBModShift - 8), _mm_set1_epi32(BlendBlit::kBModMask));
-			srcg = _mm_and_si128(_mm_srli_epi32(sse2_mul32(dstg, _mm_srli_epi32(sse2_mul32(_mm_mullo_epi16(srcg, _mm_set1_epi32(cg)), ina), 16)), BlendBlit::kGModShift - 8), _mm_set1_epi32(BlendBlit::kGModMask));
-			srcr = _mm_and_si128(_mm_srli_epi32(sse2_mul32(dstr, _mm_srli_epi32(sse2_mul32(_mm_mullo_epi16(srcr, _mm_set1_epi32(cr)), ina), 16)), BlendBlit::kRModShift - 8), _mm_set1_epi32(BlendBlit::kRModMask));
-
-			src = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
-			src = _mm_or_si128(src, _mm_or_si128(srcb, _mm_or_si128(srcg, srcr)));
-    	} else {
-			__m128i srcg = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask));
-			__m128i srcrb = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
-			__m128i dstg = _mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask));
-			__m128i dstrb = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
-    	    srcg = _mm_and_si128(_mm_srli_epi32(sse2_mul32(dstg, _mm_srli_epi32(sse2_mul32(srcg, ina), 8)), 8), _mm_set1_epi32(BlendBlit::kGModMask));
-    	    srcrb = _mm_and_si128(sse2_mul32(dstrb, _mm_srli_epi32(sse2_mul32(srcrb, ina), 8)), _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask));
-    	    src = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
-    	    src = _mm_or_si128(src, _mm_or_si128(srcrb, srcg));
-    	}
-
-    	dst = _mm_and_si128(alphaMask, dst);
-    	src = _mm_andnot_si128(alphaMask, src);
-    	return _mm_or_si128(dst, src);
-	}
-
-	static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
-		uint32 ina = in[BlendBlit::kAIndex] * ca >> 8;
-
-		if (ina != 0) {
-			out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] * ((in[BlendBlit::kBIndex] * cb * ina) >> 16) >> 8;
-			out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] * ((in[BlendBlit::kGIndex] * cg * ina) >> 16) >> 8;
-			out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] * ((in[BlendBlit::kRIndex] * cr * ina) >> 16) >> 8;
-		}
-	}
+    static inline __m128i simd(__m128i src, __m128i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+        __m128i ina;
+        if (alphamod)
+            ina = _mm_srli_epi32(_mm_mullo_epi16(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask)), _mm_set1_epi32(ca)), 8);
+        else
+            ina = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
+        __m128i alphaMask = _mm_cmpeq_epi32(ina, _mm_setzero_si128());
+
+        if (rgbmod) {
+            __m128i srcb = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+            __m128i srcg = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+            __m128i srcr = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+            __m128i dstb = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+            __m128i dstg = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+            __m128i dstr = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+
+            srcb = _mm_and_si128(_mm_srli_epi32(sse2_mul32(dstb, _mm_srli_epi32(sse2_mul32(_mm_mullo_epi16(srcb, _mm_set1_epi32(cb)), ina), 16)), BlendBlit::kBModShift - 8), _mm_set1_epi32(BlendBlit::kBModMask));
+            srcg = _mm_and_si128(_mm_srli_epi32(sse2_mul32(dstg, _mm_srli_epi32(sse2_mul32(_mm_mullo_epi16(srcg, _mm_set1_epi32(cg)), ina), 16)), BlendBlit::kGModShift - 8), _mm_set1_epi32(BlendBlit::kGModMask));
+            srcr = _mm_and_si128(_mm_srli_epi32(sse2_mul32(dstr, _mm_srli_epi32(sse2_mul32(_mm_mullo_epi16(srcr, _mm_set1_epi32(cr)), ina), 16)), BlendBlit::kRModShift - 8), _mm_set1_epi32(BlendBlit::kRModMask));
+
+            src = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
+            src = _mm_or_si128(src, _mm_or_si128(srcb, _mm_or_si128(srcg, srcr)));
+        } else {
+            __m128i srcg = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask));
+            __m128i srcrb = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+            __m128i dstg = _mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask));
+            __m128i dstrb = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+            srcg = _mm_and_si128(_mm_srli_epi32(sse2_mul32(dstg, _mm_srli_epi32(sse2_mul32(srcg, ina), 8)), 8), _mm_set1_epi32(BlendBlit::kGModMask));
+            srcrb = _mm_and_si128(sse2_mul32(dstrb, _mm_srli_epi32(sse2_mul32(srcrb, ina), 8)), _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask));
+            src = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
+            src = _mm_or_si128(src, _mm_or_si128(srcrb, srcg));
+        }
+
+        dst = _mm_and_si128(alphaMask, dst);
+        src = _mm_andnot_si128(alphaMask, src);
+        return _mm_or_si128(dst, src);
+    }
+
+    static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+        uint32 ina = in[BlendBlit::kAIndex] * ca >> 8;
+
+        if (ina != 0) {
+            out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] * ((in[BlendBlit::kBIndex] * cb * ina) >> 16) >> 8;
+            out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] * ((in[BlendBlit::kGIndex] * cg * ina) >> 16) >> 8;
+            out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] * ((in[BlendBlit::kRIndex] * cr * ina) >> 16) >> 8;
+        }
+    }
 };
 
 template<bool doscale, bool rgbmod, bool alphamod>
 struct OpaqueBlend {
-	static inline __m128i simd(__m128i src, __m128i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
-		return _mm_or_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
-	}
+    static inline __m128i simd(__m128i src, __m128i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+        return _mm_or_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
+    }
 
-	static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
-		*(uint32 *)out = *(const uint32 *)in | BlendBlit::kAModMask;
-	}
+    static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+        *(uint32 *)out = *(const uint32 *)in | BlendBlit::kAModMask;
+    }
 };
 
 template<bool doscale, bool rgbmod, bool alphamod>
 struct BinaryBlend {
-	static inline __m128i simd(__m128i src, __m128i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
-		__m128i alphaMask = _mm_cmpeq_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask)), _mm_setzero_si128());
-		dst = _mm_and_si128(dst, alphaMask);
-		src = _mm_andnot_si128(alphaMask, _mm_or_si128(src, _mm_set1_epi32(BlendBlit::kAModMask)));
-		return _mm_or_si128(src, dst);
-	}
-
-	static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
-		uint32 pix = *(const uint32 *)in;
-		int a = in[BlendBlit::kAIndex];
-
-		if (a != 0) {   // Full opacity (Any value not exactly 0 is Opaque here)
-			*(uint32 *)out = pix;
-			out[BlendBlit::kAIndex] = 0xFF;
-		}
-	}
+    static inline __m128i simd(__m128i src, __m128i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+        __m128i alphaMask = _mm_cmpeq_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask)), _mm_setzero_si128());
+        dst = _mm_and_si128(dst, alphaMask);
+        src = _mm_andnot_si128(alphaMask, _mm_or_si128(src, _mm_set1_epi32(BlendBlit::kAModMask)));
+        return _mm_or_si128(src, dst);
+    }
+
+    static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+        uint32 pix = *(const uint32 *)in;
+        int a = in[BlendBlit::kAIndex];
+
+        if (a != 0) {   // Full opacity (Any value not exactly 0 is Opaque here)
+            *(uint32 *)out = pix;
+            out[BlendBlit::kAIndex] = 0xFF;
+        }
+    }
 };
 
 template<bool doscale, bool rgbmod, bool alphamod>
 struct AdditiveBlend {
-	static inline __m128i simd(__m128i src, __m128i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
-    	__m128i ina;
-    	if (alphamod)
-    	    ina = _mm_srli_epi32(sse2_mul32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask)), _mm_set1_epi32(ca)), 8);
-    	else
-    	    ina = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
-    	__m128i alphaMask = _mm_cmpeq_epi32(ina, _mm_set1_epi32(0));
-
-    	if (rgbmod) {
-    	    __m128i srcb = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kBModMask));
-    	    __m128i srcg = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
-    	    __m128i srcr = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
-    	    __m128i dstb = _mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kBModMask));
-    	    __m128i dstg = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
-    	    __m128i dstr = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
-
-			srcb = _mm_and_si128(_mm_add_epi32(dstb, _mm_srli_epi32(sse2_mul32(srcb, sse2_mul32(_mm_set1_epi32(cb), ina)), 16)), _mm_set1_epi32(BlendBlit::kBModMask));
-			srcg = _mm_and_si128(_mm_add_epi32(dstg, sse2_mul32(srcg, sse2_mul32(_mm_set1_epi32(cg), ina))), _mm_set1_epi32(BlendBlit::kGModMask));
-			srcr = _mm_and_si128(_mm_add_epi32(dstr, _mm_srli_epi32(sse2_mul32(srcr, sse2_mul32(_mm_set1_epi32(cr), ina)), BlendBlit::kRModShift - 16)), _mm_set1_epi32(BlendBlit::kRModMask));
-
-    	    src = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
-    	    src = _mm_or_si128(src, _mm_or_si128(srcb, _mm_or_si128(srcg, srcr)));
-    	} else if (alphamod) {
-    	    __m128i srcg = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask));
-    	    __m128i srcrb = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
-    	    __m128i dstg = _mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask));
-    	    __m128i dstrb = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
-
-			srcg = _mm_and_si128(_mm_add_epi32(dstg, _mm_srli_epi32(sse2_mul32(srcg, ina), 8)), _mm_set1_epi32(BlendBlit::kGModMask));
-			srcrb = _mm_and_si128(_mm_add_epi32(dstrb, sse2_mul32(srcrb, ina)), _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask));
-
-    	    src = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
-    	    src = _mm_or_si128(src, _mm_or_si128(srcrb, srcg));
-    	} else {
-    	    __m128i srcg = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask));
-    	    __m128i srcrb = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
-    	    __m128i dstg = _mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask));
-    	    __m128i dstrb = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
-
-			srcg = _mm_and_si128(_mm_add_epi32(dstg, srcg), _mm_set1_epi32(BlendBlit::kGModMask));
-			srcrb = _mm_and_si128(_mm_slli_epi32(_mm_add_epi32(dstrb, srcrb), 8), _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask));
-
-    	    src = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
-    	    src = _mm_or_si128(src, _mm_or_si128(srcrb, srcg));
-		}
-
-    	dst = _mm_and_si128(alphaMask, dst);
-    	src = _mm_andnot_si128(alphaMask, src);
-    	return _mm_or_si128(dst, src);
-	}
-
-	static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
-		uint32 ina = in[BlendBlit::kAIndex] * ca >> 8;
-
-		if (ina != 0) {
-			out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] + ((in[BlendBlit::kBIndex] * cb * ina) >> 16);
-			out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] + ((in[BlendBlit::kGIndex] * cg * ina) >> 16);
-			out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] + ((in[BlendBlit::kRIndex] * cr * ina) >> 16);
-		}
-	}
+    static inline __m128i simd(__m128i src, __m128i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+        __m128i ina;
+        if (alphamod)
+            ina = _mm_srli_epi32(sse2_mul32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask)), _mm_set1_epi32(ca)), 8);
+        else
+            ina = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
+        __m128i alphaMask = _mm_cmpeq_epi32(ina, _mm_set1_epi32(0));
+
+        if (rgbmod) {
+            __m128i srcb = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kBModMask));
+            __m128i srcg = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+            __m128i srcr = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+            __m128i dstb = _mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kBModMask));
+            __m128i dstg = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+            __m128i dstr = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+
+            srcb = _mm_and_si128(_mm_add_epi32(dstb, _mm_srli_epi32(sse2_mul32(srcb, sse2_mul32(_mm_set1_epi32(cb), ina)), 16)), _mm_set1_epi32(BlendBlit::kBModMask));
+            srcg = _mm_and_si128(_mm_add_epi32(dstg, sse2_mul32(srcg, sse2_mul32(_mm_set1_epi32(cg), ina))), _mm_set1_epi32(BlendBlit::kGModMask));
+            srcr = _mm_and_si128(_mm_add_epi32(dstr, _mm_srli_epi32(sse2_mul32(srcr, sse2_mul32(_mm_set1_epi32(cr), ina)), BlendBlit::kRModShift - 16)), _mm_set1_epi32(BlendBlit::kRModMask));
+
+            src = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
+            src = _mm_or_si128(src, _mm_or_si128(srcb, _mm_or_si128(srcg, srcr)));
+        } else if (alphamod) {
+            __m128i srcg = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask));
+            __m128i srcrb = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+            __m128i dstg = _mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask));
+            __m128i dstrb = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+
+            srcg = _mm_and_si128(_mm_add_epi32(dstg, _mm_srli_epi32(sse2_mul32(srcg, ina), 8)), _mm_set1_epi32(BlendBlit::kGModMask));
+            srcrb = _mm_and_si128(_mm_add_epi32(dstrb, sse2_mul32(srcrb, ina)), _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask));
+
+            src = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
+            src = _mm_or_si128(src, _mm_or_si128(srcrb, srcg));
+        } else {
+            __m128i srcg = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask));
+            __m128i srcrb = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+            __m128i dstg = _mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask));
+            __m128i dstrb = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+
+            srcg = _mm_and_si128(_mm_add_epi32(dstg, srcg), _mm_set1_epi32(BlendBlit::kGModMask));
+            srcrb = _mm_and_si128(_mm_slli_epi32(_mm_add_epi32(dstrb, srcrb), 8), _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask));
+
+            src = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
+            src = _mm_or_si128(src, _mm_or_si128(srcrb, srcg));
+        }
+
+        dst = _mm_and_si128(alphaMask, dst);
+        src = _mm_andnot_si128(alphaMask, src);
+        return _mm_or_si128(dst, src);
+    }
+
+    static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+        uint32 ina = in[BlendBlit::kAIndex] * ca >> 8;
+
+        if (ina != 0) {
+            out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] + ((in[BlendBlit::kBIndex] * cb * ina) >> 16);
+            out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] + ((in[BlendBlit::kGIndex] * cg * ina) >> 16);
+            out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] + ((in[BlendBlit::kRIndex] * cr * ina) >> 16);
+        }
+    }
 };
 
 template<bool doscale, bool rgbmod, bool alphamod>
 struct SubtractiveBlend {
-	static inline __m128i simd(__m128i src, __m128i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
-    	__m128i ina = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
-    	__m128i srcb = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
-    	__m128i srcg = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
-    	__m128i srcr = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
-    	__m128i dstb = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
-    	__m128i dstg = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
-    	__m128i dstr = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
-
-		srcb = _mm_and_si128(_mm_slli_epi32(_mm_max_epi16(_mm_sub_epi32(dstb, _mm_srli_epi32(sse2_mul32(sse2_mul32(srcb, _mm_set1_epi32(cb)), sse2_mul32(dstb, ina)), 24)), _mm_set1_epi32(0)), BlendBlit::kBModShift), _mm_set1_epi32(BlendBlit::kBModMask));
-		srcg = _mm_and_si128(_mm_slli_epi32(_mm_max_epi16(_mm_sub_epi32(dstg, _mm_srli_epi32(sse2_mul32(sse2_mul32(srcg, _mm_set1_epi32(cg)), sse2_mul32(dstg, ina)), 24)), _mm_set1_epi32(0)), BlendBlit::kGModShift), _mm_set1_epi32(BlendBlit::kGModMask));
-		srcr = _mm_and_si128(_mm_slli_epi32(_mm_max_epi16(_mm_sub_epi32(dstr, _mm_srli_epi32(sse2_mul32(sse2_mul32(srcr, _mm_set1_epi32(cr)), sse2_mul32(dstr, ina)), 24)), _mm_set1_epi32(0)), BlendBlit::kRModShift), _mm_set1_epi32(BlendBlit::kRModMask));
-
-    	return _mm_or_si128(_mm_set1_epi32(BlendBlit::kAModMask), _mm_or_si128(srcb, _mm_or_si128(srcg, srcr)));
-	}
-
-	static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
-		out[BlendBlit::kAIndex] = 255;
-		out[BlendBlit::kBIndex] = MAX<int32>(out[BlendBlit::kBIndex] - ((in[BlendBlit::kBIndex] * cb  * (out[BlendBlit::kBIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
-		out[BlendBlit::kGIndex] = MAX<int32>(out[BlendBlit::kGIndex] - ((in[BlendBlit::kGIndex] * cg  * (out[BlendBlit::kGIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
-		out[BlendBlit::kRIndex] = MAX<int32>(out[BlendBlit::kRIndex] - ((in[BlendBlit::kRIndex] * cr *  (out[BlendBlit::kRIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
-	}
+    static inline __m128i simd(__m128i src, __m128i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+        __m128i ina = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
+        __m128i srcb = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+        __m128i srcg = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+        __m128i srcr = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+        __m128i dstb = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+        __m128i dstg = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+        __m128i dstr = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+
+        srcb = _mm_and_si128(_mm_slli_epi32(_mm_max_epi16(_mm_sub_epi32(dstb, _mm_srli_epi32(sse2_mul32(sse2_mul32(srcb, _mm_set1_epi32(cb)), sse2_mul32(dstb, ina)), 24)), _mm_set1_epi32(0)), BlendBlit::kBModShift), _mm_set1_epi32(BlendBlit::kBModMask));
+        srcg = _mm_and_si128(_mm_slli_epi32(_mm_max_epi16(_mm_sub_epi32(dstg, _mm_srli_epi32(sse2_mul32(sse2_mul32(srcg, _mm_set1_epi32(cg)), sse2_mul32(dstg, ina)), 24)), _mm_set1_epi32(0)), BlendBlit::kGModShift), _mm_set1_epi32(BlendBlit::kGModMask));
+        srcr = _mm_and_si128(_mm_slli_epi32(_mm_max_epi16(_mm_sub_epi32(dstr, _mm_srli_epi32(sse2_mul32(sse2_mul32(srcr, _mm_set1_epi32(cr)), sse2_mul32(dstr, ina)), 24)), _mm_set1_epi32(0)), BlendBlit::kRModShift), _mm_set1_epi32(BlendBlit::kRModMask));
+
+        return _mm_or_si128(_mm_set1_epi32(BlendBlit::kAModMask), _mm_or_si128(srcb, _mm_or_si128(srcg, srcr)));
+    }
+
+    static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+        out[BlendBlit::kAIndex] = 255;
+        out[BlendBlit::kBIndex] = MAX<int32>(out[BlendBlit::kBIndex] - ((in[BlendBlit::kBIndex] * cb  * (out[BlendBlit::kBIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
+        out[BlendBlit::kGIndex] = MAX<int32>(out[BlendBlit::kGIndex] - ((in[BlendBlit::kGIndex] * cg  * (out[BlendBlit::kGIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
+        out[BlendBlit::kRIndex] = MAX<int32>(out[BlendBlit::kRIndex] - ((in[BlendBlit::kRIndex] * cr *  (out[BlendBlit::kRIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
+    }
 };
 
 class BlendBlitImpl {
@@ -277,103 +277,103 @@ static inline void blitInnerLoopAVX2(BlendBlit::Args &args);
 
 template<template <bool DOSCALE, bool RGBMOD, bool ALPHAMOD> class PixelFunc, bool doscale, bool rgbmod, bool alphamod, bool coloradd1, bool loaddst>
 static inline void blitInnerLoop(BlendBlit::Args &args) {
-	const byte *in;
-	byte *out;
+    const byte *in;
+    byte *out;
 
-	const byte rawcr = (args.color >> BlendBlit::kRModShift) & 0xFF;
-	const byte rawcg = (args.color >> BlendBlit::kGModShift) & 0xFF;
-	const byte rawcb = (args.color >> BlendBlit::kBModShift) & 0xFF;
-	const byte ca = alphamod ? ((args.color >> BlendBlit::kAModShift) & 0xFF) : 255;
-	const uint32 cr = coloradd1 ? (rgbmod   ? (rawcr == 255 ? 256 : rawcr) : 256) : (rgbmod   ? rawcr : 255);
-	const uint32 cg = coloradd1 ? (rgbmod   ? (rawcg == 255 ? 256 : rawcg) : 256) : (rgbmod   ? rawcg : 255);
-	const uint32 cb = coloradd1 ? (rgbmod   ? (rawcb == 255 ? 256 : rawcb) : 256) : (rgbmod   ? rawcb : 255);
+    const byte rawcr = (args.color >> BlendBlit::kRModShift) & 0xFF;
+    const byte rawcg = (args.color >> BlendBlit::kGModShift) & 0xFF;
+    const byte rawcb = (args.color >> BlendBlit::kBModShift) & 0xFF;
+    const byte ca = alphamod ? ((args.color >> BlendBlit::kAModShift) & 0xFF) : 255;
+    const uint32 cr = coloradd1 ? (rgbmod   ? (rawcr == 255 ? 256 : rawcr) : 256) : (rgbmod   ? rawcr : 255);
+    const uint32 cg = coloradd1 ? (rgbmod   ? (rawcg == 255 ? 256 : rawcg) : 256) : (rgbmod   ? rawcg : 255);
+    const uint32 cb = coloradd1 ? (rgbmod   ? (rawcb == 255 ? 256 : rawcb) : 256) : (rgbmod   ? rawcb : 255);
 
-	int scaleXCtr, scaleYCtr = args.scaleYoff;
-	const byte *inBase;
+    int scaleXCtr, scaleYCtr = args.scaleYoff;
+    const byte *inBase;
 
     if (!doscale && (args.flipping & FLIP_H)) args.ino -= 4 * 3;
 
-	for (uint32 i = 0; i < args.height; i++) {
-		if (doscale) {
-			inBase = args.ino + scaleYCtr / BlendBlit::SCALE_THRESHOLD * args.inoStep;
-			scaleXCtr = args.scaleXoff;
-		} else {
-			in = args.ino;
-		}
-		out = args.outo;
-
-		uint32 j = 0;
-		for (; j + 4 <= args.width; j += 4) {
-    		__m128i dstPixels, srcPixels;
-			if (loaddst) dstPixels = _mm_loadu_si128((const __m128i *)out);
-    		if (!doscale) {
-    		    srcPixels = _mm_loadu_si128((const __m128i *)in);
-    		} else {
-				srcPixels = _mm_setr_epi32(
-					*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 0) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
-					*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 1) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
-					*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 2) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
-					*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 3) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep)
-				);
-				scaleXCtr += args.scaleX * 4;
-    		}
-    		if (!doscale && (args.flipping & FLIP_H)) {
-				srcPixels = _mm_shuffle_epi32(srcPixels, _MM_SHUFFLE(0, 1, 2, 3));
-    		}
-			{
-				const __m128i res = PixelFunc<doscale, rgbmod, alphamod>::simd(srcPixels, dstPixels, args.flipping & FLIP_H, ca, cr, cg, cb);
-				_mm_storeu_si128((__m128i *)out, res);
-			}
-			if (!doscale) in += (ptrdiff_t)args.inStep * 4;
-			out += 4ULL * 4;
-		}
-		if (!doscale && (args.flipping & FLIP_H)) in += 4 * 3;
-		for (; j < args.width; j++) {
-			if (doscale) {
-				in = inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep;
-			}
-
-			PixelFunc<doscale, rgbmod, alphamod>::normal(in, out, args.flipping & FLIP_H, ca, cr, cg, cb);
+    for (uint32 i = 0; i < args.height; i++) {
+        if (doscale) {
+            inBase = args.ino + scaleYCtr / BlendBlit::SCALE_THRESHOLD * args.inoStep;
+            scaleXCtr = args.scaleXoff;
+        } else {
+            in = args.ino;
+        }
+        out = args.outo;
+
+        uint32 j = 0;
+        for (; j + 4 <= args.width; j += 4) {
+            __m128i dstPixels, srcPixels;
+            if (loaddst) dstPixels = _mm_loadu_si128((const __m128i *)out);
+            if (!doscale) {
+                srcPixels = _mm_loadu_si128((const __m128i *)in);
+            } else {
+                srcPixels = _mm_setr_epi32(
+                    *(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 0) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
+                    *(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 1) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
+                    *(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 2) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
+                    *(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 3) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep)
+                );
+                scaleXCtr += args.scaleX * 4;
+            }
+            if (!doscale && (args.flipping & FLIP_H)) {
+                srcPixels = _mm_shuffle_epi32(srcPixels, _MM_SHUFFLE(0, 1, 2, 3));
+            }
+            {
+                const __m128i res = PixelFunc<doscale, rgbmod, alphamod>::simd(srcPixels, dstPixels, args.flipping & FLIP_H, ca, cr, cg, cb);
+                _mm_storeu_si128((__m128i *)out, res);
+            }
+            if (!doscale) in += (ptrdiff_t)args.inStep * 4;
+            out += 4ULL * 4;
+        }
+        if (!doscale && (args.flipping & FLIP_H)) in += 4 * 3;
+        for (; j < args.width; j++) {
+            if (doscale) {
+                in = inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep;
+            }
+
+            PixelFunc<doscale, rgbmod, alphamod>::normal(in, out, args.flipping & FLIP_H, ca, cr, cg, cb);
             
-			if (doscale)
-				scaleXCtr += args.scaleX;
-			else
-				in += args.inStep;
-			out += 4;
-		}
-		if (doscale)
-			scaleYCtr += args.scaleY;
-		else
-			args.ino += args.inoStep;
-		args.outo += args.dstPitch;
-	}
+            if (doscale)
+                scaleXCtr += args.scaleX;
+            else
+                in += args.inStep;
+            out += 4;
+        }
+        if (doscale)
+            scaleYCtr += args.scaleY;
+        else
+            args.ino += args.inoStep;
+        args.outo += args.dstPitch;
+    }
 }
 
 };
 
 template<bool doscale, bool rgbmod, bool alphamod>
 void BlendBlit::doBlitAlphaBlendLogicSSE2(Args &args) {
-	BlendBlitImpl::blitInnerLoop<AlphaBlend, doscale, rgbmod, alphamod, false, true>(args);
+    BlendBlitImpl::blitInnerLoop<AlphaBlend, doscale, rgbmod, alphamod, false, true>(args);
 }
 template<bool doscale, bool rgbmod>
 void BlendBlit::doBlitSubtractiveBlendLogicSSE2(Args &args) {
-	BlendBlitImpl::blitInnerLoop<SubtractiveBlend, doscale, rgbmod, false, false, true>(args);
+    BlendBlitImpl::blitInnerLoop<SubtractiveBlend, doscale, rgbmod, false, false, true>(args);
 }
 template<bool doscale, bool rgbmod, bool alphamod>
 void BlendBlit::doBlitAdditiveBlendLogicSSE2(Args &args) {
-	BlendBlitImpl::blitInnerLoop<AdditiveBlend, doscale, rgbmod, alphamod, false, true>(args);
+    BlendBlitImpl::blitInnerLoop<AdditiveBlend, doscale, rgbmod, alphamod, false, true>(args);
 }
 template<bool doscale>
 void BlendBlit::doBlitOpaqueBlendLogicSSE2(Args &args) {
-	BlendBlitImpl::blitInnerLoop<OpaqueBlend, doscale, false, false, false, true>(args);
+    BlendBlitImpl::blitInnerLoop<OpaqueBlend, doscale, false, false, false, true>(args);
 }
 template<bool doscale>
 void BlendBlit::doBlitBinaryBlendLogicSSE2(Args &args) {
-	BlendBlitImpl::blitInnerLoop<BinaryBlend, doscale, false, false, false, true>(args);
+    BlendBlitImpl::blitInnerLoop<BinaryBlend, doscale, false, false, false, true>(args);
 }
 template<bool doscale, bool rgbmod, bool alphamod>
 void BlendBlit::doBlitMultiplyBlendLogicSSE2(Args &args) {
-	BlendBlitImpl::blitInnerLoop<MultiplyBlend, doscale, rgbmod, alphamod, false, true>(args);
+    BlendBlitImpl::blitInnerLoop<MultiplyBlend, doscale, rgbmod, alphamod, false, true>(args);
 }
 
 } // End of namespace Graphics


Commit: e374c4d9cfd67f2e9a104e55c6feb8d249d6ff47
    https://github.com/scummvm/scummvm/commit/e374c4d9cfd67f2e9a104e55c6feb8d249d6ff47
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
GRAPHICS: Fixed ManagedSurface::blendBlitFrom

Changed paths:
    graphics/managed_surface.cpp


diff --git a/graphics/managed_surface.cpp b/graphics/managed_surface.cpp
index 050e6125be0..1de03d75a8a 100644
--- a/graphics/managed_surface.cpp
+++ b/graphics/managed_surface.cpp
@@ -734,7 +734,7 @@ Common::Rect ManagedSurface::blendBlitFrom(const ManagedSurface &src, const Comm
 										   const uint32 colorMod,
 										   const TSpriteBlendMode blend,
 										   const AlphaType alphaType) {
-	return blendBlitFrom(src, srcRect, destRect, flipping, colorMod, blend, alphaType);
+	return blendBlitFrom(src.rawSurface(), srcRect, destRect, flipping, colorMod, blend, alphaType);
 }
 Common::Rect ManagedSurface::blendBlitFrom(const Surface &src, const Common::Rect &srcRect,
 										   const Common::Rect &destRect, int flipping,


Commit: df073eeed7e4f7ff613624494ce8e6db87532224
    https://github.com/scummvm/scummvm/commit/df073eeed7e4f7ff613624494ce8e6db87532224
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
TEST: Added copyright header to blendBlitFrom test

Changed paths:
    test/image/blending.h


diff --git a/test/image/blending.h b/test/image/blending.h
index 23ed4648bd9..123dcadc0a8 100644
--- a/test/image/blending.h
+++ b/test/image/blending.h
@@ -1,3 +1,24 @@
+/* ScummVM - Graphic Adventure Engine
+ *
+ * ScummVM is the legal property of its developers, whose names
+ * are too numerous to list here. Please refer to the COPYRIGHT
+ * file distributed with this source distribution.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
 #include <cxxtest/TestSuite.h>
 
 #if defined(HAVE_CONFIG_H)


Commit: 55550f85ac2e9d25b54cd73435766f3760ee2145
    https://github.com/scummvm/scummvm/commit/55550f85ac2e9d25b54cd73435766f3760ee2145
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
BACKENDS: Move SIMD detection to more stable SDL2

Changed paths:
    backends/base-backend.cpp
    backends/base-backend.h
    backends/platform/sdl/sdl.cpp


diff --git a/backends/base-backend.cpp b/backends/base-backend.cpp
index 81515ae3372..2aef2842489 100644
--- a/backends/base-backend.cpp
+++ b/backends/base-backend.cpp
@@ -63,49 +63,11 @@ void BaseBackend::initBackend() {
 #ifndef DISABLE_DEFAULT_AUDIOCD_MANAGER
 	if (!_audiocdManager)
 		_audiocdManager = new DefaultAudioCDManager();
-#endif
-	_cpuFeatures = kCpuNoFeatures;
-#if defined(__x86_64__) || defined(__i686__) || defined(_M_X86) || defined(_M_X64)
-	uint32 ext_edx1 = 0, ext_ebx7 = 0, ext_ecx1 = 0;
-#  ifdef __GNUC__
-	asm ("mov $1, %%eax\n\t"
-		 "cpuid\n\t"
-		 "mov %%edx, %0\n\t"
-		 "mov %%ecx, %2\n\t"
-		 "mov $7, %%eax\n\t"
-		 "mov $0, %%ecx\n\t"
-		 "cpuid\n\t"
-		 "mov %%ebx, %1\n\t"
-		 : "=rm" (ext_edx1), "=rm" (ext_ebx7), "=rm" (ext_ecx1)
-		 :
-		 : "eax", "ebx", "ecx", "edx");
-#  elif _MSC_VER
-	__asm
-	{
-		mov eax,1
-		cpuid
-		mov ext_edx1,edx
-		mov ext_ecx1,ecx
-		mov eax,7
-		mov ecx,0
-		cpuid
-		mov ext_ebx7,ebx
-	}
-#  endif // __GNUC__ and _MSC_VER
-	_cpuFeatures |= (ext_edx1 & (1 << 26)) ? kCpuFeatureSSE2 : kCpuNoFeatures;
-	_cpuFeatures |= (ext_ebx7 & (1 << 5)) ? kCpuFeatureAVX2 : kCpuNoFeatures;
-	_cpuFeatures |= (ext_ecx1 & (1 << 19)) ? kCpuFeatureSSE41 : kCpuNoFeatures;
-#endif // __x86_64__ and __i686__
-#if defined(__ARM_NEON) || defined(__ARM_NEON__)
-	_cpuFeatures |= kCpuFeatureNEON;
 #endif
 	OSystem::initBackend();
 }
 
 bool BaseBackend::hasFeature(Feature f) {
-	if (f == kFeatureCpuSSE2) return _cpuFeatures & kCpuFeatureSSE2;
-	if (f == kFeatureCpuAVX2) return _cpuFeatures & kCpuFeatureAVX2;
-	if (f == kFeatureCpuNEON) return _cpuFeatures & kCpuFeatureNEON;
 	return false;
 }
 
diff --git a/backends/base-backend.h b/backends/base-backend.h
index 7febdb95e9c..45bf2af43cd 100644
--- a/backends/base-backend.h
+++ b/backends/base-backend.h
@@ -31,18 +31,6 @@
  */
 class BaseBackend : public OSystem {
 public:
-	enum CpuFeatureFlags {
-		kCpuNoFeatures     = 0, // Completely detected by BaseBackend
-		kCpuFeatureSSE2    = (1 << 0), // Completely detected by BaseBackend
-		kCpuFeatureAVX2    = (1 << 1), // Completely detected by BaseBackend
-		// Detected either by BaseBackend (if platform ONLY supports ARMv8+) or
-		// platform specific Backends if ARM is optional or not on all versions
-		// of the platform.
-		kCpuFeatureNEON    = (1 << 2),
-		kCpuFeatureAlitvec = (1 << 3), // Platform specific
-		kCpuFeatureSSE41   = (1 << 4), // Completely detected by BaseBackend
-	};
-
 	void initBackend() override;
 	bool hasFeature(Feature f) override;
 
@@ -52,9 +40,6 @@ public:
 	void displayActivityIconOnOSD(const Graphics::Surface *icon) override {}
 	void fillScreen(uint32 col) override;
 	void fillScreen(const Common::Rect &r, uint32 col) override;
-
-private:
-	uint32 _cpuFeatures;
 };
 
 class EventsBaseBackend : virtual public BaseBackend, Common::EventSource {
diff --git a/backends/platform/sdl/sdl.cpp b/backends/platform/sdl/sdl.cpp
index 776a98ce7a0..4b06e6d26ba 100644
--- a/backends/platform/sdl/sdl.cpp
+++ b/backends/platform/sdl/sdl.cpp
@@ -175,6 +175,11 @@ void OSystem_SDL::init() {
 bool OSystem_SDL::hasFeature(Feature f) {
 #if SDL_VERSION_ATLEAST(2, 0, 0)
 	if (f == kFeatureClipboardSupport) return true;
+	if (f == kFeatureCpuSSE2) return SDL_HasSSE2();
+	if (f == kFeatureCpuNEON) return SDL_HasNEON();
+	if (f == kFeatureCpuSSE41) return SDL_HasSSE41();
+	if (f == kFeatureCpuAVX2) return SDL_HasAVX2();
+	if (f == kFeatureCpuAltivec) return SDL_HasAltiVec();
 #endif
 #if SDL_VERSION_ATLEAST(2, 0, 14)
 	if (f == kFeatureOpenUrl) return true;


Commit: bf7b6c1cf69165108eec5a322a165e4f926af236
    https://github.com/scummvm/scummvm/commit/bf7b6c1cf69165108eec5a322a165e4f926af236
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
BUILD: Fixed typo in Arm NEON feature

Changed paths:
    configure


diff --git a/configure b/configure
index 390f0de91b3..b680598116b 100755
--- a/configure
+++ b/configure
@@ -6933,7 +6933,7 @@ define_in_config_if_yes "$_ext_neon" 'SCUMMVM_NEON'
 # to downgrade. Almost all armv7 cpus have neon or less in terms of fpu
 # extensions so setting fpu to neon is almost always an upgrade over defaults.
 # Not to mention it would have to be included anyways
-if ( test "$_ext_avx2" = yes ) && ( test "$_host_cpu" != aarch64 ) ; then
+if ( test "$_ext_neon" = yes ) && ( test "$_host_cpu" != aarch64 ) ; then
 	append_var CXXFLAGS "-mfpu=neon"
 fi
 echo_n "Enabling arm NEON... "


Commit: 91b9c112b580257d74aa8f3298419e79843ae60d
    https://github.com/scummvm/scummvm/commit/91b9c112b580257d74aa8f3298419e79843ae60d
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
TEST: Put bitmap saving under ifdef in blending.h

Changed paths:
    test/image/blending.h


diff --git a/test/image/blending.h b/test/image/blending.h
index 123dcadc0a8..c2d84138064 100644
--- a/test/image/blending.h
+++ b/test/image/blending.h
@@ -858,6 +858,7 @@ OldTransparentSurface *OldTransparentSurface::convertTo(const PixelFormat &dstFo
 
 } // namespace OldTransparentSurface
 
+#ifdef TEST_IMAGE_BLENDING_SAVE
 static int save_bitmap(const char *path, const Graphics::Surface *surf) {
     Common::FSNode fileNode(path);
     Common::SeekableWriteStream *out = fileNode.createWriteStream();
@@ -903,6 +904,7 @@ static int save_bitmap(const char *path, const Graphics::Surface *surf) {
 
 	return true;
 }
+#endif
 
 static bool areSurfacesEqual(const Graphics::Surface *a, const Graphics::Surface *b) {
     if (a->w != b->w || a->h != b->h) return false;
@@ -1100,30 +1102,36 @@ public:
             if (!areSurfacesEqual(&oldSurfDest, &newSurfDest)) {
                 warning("blendMode: %s, alphaType: %s, a: %d, r: %d, g: %d, b: %d, flipping: %s, test rect id: %s",
                     blendModes[blendMode], alphaTypes[alphaType], a, r, g, b, flipNames[flipping], rectNames[rect]);
+#ifdef TEST_IMAGE_BLENDING_SAVE
                 save_bitmap("sourceSurf.bmp", &newSurf);
                 save_bitmap("oldSurfDest.bmp", &oldSurfDest);
                 save_bitmap("newSurfDest.bmp", &newSurfDest);
                 save_bitmap("managedSurfDest.bmp", managedSurfDest.surfacePtr());
+#endif
                 TS_FAIL("oldSurfDest and newSurfDest are not equal!");
                 return;
             }
             if (!areSurfacesEqual(&oldSurfDest, managedSurfDest.surfacePtr())) {
                 warning("blendMode: %s, alphaType: %s, a: %d, r: %d, g: %d, b: %d, flipping: %s, test rect id: %s",
                     blendModes[blendMode], alphaTypes[alphaType], a, r, g, b, flipNames[flipping], rectNames[rect]);
+#ifdef TEST_IMAGE_BLENDING_SAVE
                 save_bitmap("sourceSurf.bmp", &newSurf);
                 save_bitmap("oldSurfDest.bmp", &oldSurfDest);
                 save_bitmap("newSurfDest.bmp", &newSurfDest);
                 save_bitmap("managedSurfDest.bmp", managedSurfDest.surfacePtr());
+#endif
                 TS_FAIL("oldSurfDest and managedSurfDest are not equal!");
                 return;
             }
             if (!areSurfacesEqual(&newSurfDest, managedSurfDest.surfacePtr())) {
                 warning("blendMode: %s, alphaType: %s, a: %d, r: %d, g: %d, b: %d, flipping: %s, test rect id: %s",
                     blendModes[blendMode], alphaTypes[alphaType], a, r, g, b, flipNames[flipping], rectNames[rect]);
+#ifdef TEST_IMAGE_BLENDING_SAVE
                 save_bitmap("sourceSurf.bmp", &newSurf);
                 save_bitmap("oldSurfDest.bmp", &oldSurfDest);
                 save_bitmap("newSurfDest.bmp", &newSurfDest);
                 save_bitmap("managedSurfDest.bmp", managedSurfDest.surfacePtr());
+#endif
                 TS_FAIL("newSurfDest and managedSurfDest are not equal!");
                 return;
             }
@@ -1138,10 +1146,12 @@ public:
             if (!areSurfacesEqual(&oldSurfDest, &newSurfDest)) {
                 warning("BLIT_CLIP blendMode: %s, alphaType: %s, a: %d, r: %d, g: %d, b: %d, flipping: %s, test rect id: %s",
                     blendModes[blendMode], alphaTypes[alphaType], a, r, g, b, flipNames[flipping], rectNames[rect]);
+#ifdef TEST_IMAGE_BLENDING_SAVE
                 save_bitmap("sourceSurfBlipClip.bmp", &newSurf);
                 save_bitmap("oldSurfDestBlitClip.bmp", &oldSurfDest);
                 save_bitmap("newSurfDestBlitClip.bmp", &newSurfDest);
                 save_bitmap("managedSurfDest.bmp", managedSurfDest.surfacePtr());
+#endif
                 TS_FAIL("oldSurfDest and newSurfDest are not equal with blipClip!");
                 return;
             }


Commit: 72c01fcdbe8f51a44e21d3b7b50c4708b7336dbe
    https://github.com/scummvm/scummvm/commit/72c01fcdbe8f51a44e21d3b7b50c4708b7336dbe
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
TEST: Remove unnecessary functions in blending.h

Changed paths:
    test/image/blending.h


diff --git a/test/image/blending.h b/test/image/blending.h
index c2d84138064..93606accbe7 100644
--- a/test/image/blending.h
+++ b/test/image/blending.h
@@ -70,28 +70,13 @@ struct OldTransparentSurface : public Graphics::Surface {
 						int width = -1, int height = -1,
 						TSpriteBlendMode blend = BLEND_NORMAL);
 	OldTransparentSurface *scale(int16 newWidth, int16 newHeight, bool filtering = false) const;
-
-	OldTransparentSurface *rotoscale(const TransformStruct &transform, bool filtering = false) const;
-
-	OldTransparentSurface *convertTo(const PixelFormat &dstFormat, const byte *palette = 0) const;
-
-	float getRatio() {
-		if (!w)
-			return 0;
-
-		return h / (float)w;
-	}
-
-	AlphaType getAlphaMode() const;
-	void setAlphaMode(AlphaType);
-private:
 	AlphaType _alphaMode;
 };
 
-static const int kBModShift = 8;//img->format.bShift;
-static const int kGModShift = 16;//img->format.gShift;
-static const int kRModShift = 24;//img->format.rShift;
-static const int kAModShift = 0;//img->format.aShift;
+static const int kBModShift = 8;
+static const int kGModShift = 16;
+static const int kRModShift = 24;
+static const int kAModShift = 0;
 
 static const uint32 kBModMask = 0x0000ff00;
 static const uint32 kGModMask = 0x00ff0000;
@@ -737,14 +722,6 @@ Common::Rect OldTransparentSurface::blitClip(Graphics::Surface &target, Common::
 	return retSize;
 }
 
-AlphaType OldTransparentSurface::getAlphaMode() const {
-	return _alphaMode;
-}
-
-void OldTransparentSurface::setAlphaMode(AlphaType mode) {
-	_alphaMode = mode;
-}
-
 OldTransparentSurface *OldTransparentSurface::scale(int16 newWidth, int16 newHeight, bool filtering) const {
 
 	OldTransparentSurface *target = new OldTransparentSurface();
@@ -760,102 +737,6 @@ OldTransparentSurface *OldTransparentSurface::scale(int16 newWidth, int16 newHei
 	return target;
 }
 
-OldTransparentSurface *OldTransparentSurface::rotoscale(const TransformStruct &transform, bool filtering) const {
-
-	Common::Point newHotspot;
-	Common::Rect rect = TransformTools::newRect(Common::Rect((int16)w, (int16)h), transform, &newHotspot);
-
-	OldTransparentSurface *target = new OldTransparentSurface();
-
-	target->create((uint16)rect.right - rect.left, (uint16)rect.bottom - rect.top, this->format);
-
-	if (filtering) {
-		rotoscaleBlitBilinear((byte *)target->getPixels(), (const byte *)getPixels(), target->pitch, pitch, target->w, target->h, w, h, format, transform, newHotspot);
-	} else {
-		rotoscaleBlit((byte *)target->getPixels(), (const byte *)getPixels(), target->pitch, pitch, target->w, target->h, w, h, format, transform, newHotspot);
-	}
-
-	return target;
-}
-
-OldTransparentSurface *OldTransparentSurface::convertTo(const PixelFormat &dstFormat, const byte *palette) const {
-	assert(pixels);
-
-	OldTransparentSurface *surface = new OldTransparentSurface();
-
-	// If the target format is the same, just copy
-	if (format == dstFormat) {
-		surface->copyFrom(*this);
-		return surface;
-	}
-
-	if (format.bytesPerPixel == 0 || format.bytesPerPixel > 4)
-		error("Surface::convertTo(): Can only convert from 1Bpp, 2Bpp, 3Bpp, and 4Bpp");
-
-	if (dstFormat.bytesPerPixel != 2 && dstFormat.bytesPerPixel != 4)
-		error("Surface::convertTo(): Can only convert to 2Bpp and 4Bpp");
-
-	surface->create(w, h, dstFormat);
-
-	if (format.bytesPerPixel == 1) {
-		// Converting from paletted to high color
-		assert(palette);
-
-		for (int y = 0; y < h; y++) {
-			const byte *srcRow = (const byte *)getBasePtr(0, y);
-			byte *dstRow = (byte *)surface->getBasePtr(0, y);
-
-			for (int x = 0; x < w; x++) {
-				byte index = *srcRow++;
-				byte r = palette[index * 3];
-				byte g = palette[index * 3 + 1];
-				byte b = palette[index * 3 + 2];
-
-				uint32 color = dstFormat.RGBToColor(r, g, b);
-
-				if (dstFormat.bytesPerPixel == 2)
-					*((uint16 *)dstRow) = color;
-				else
-					*((uint32 *)dstRow) = color;
-
-				dstRow += dstFormat.bytesPerPixel;
-			}
-		}
-	} else {
-		// Converting from high color to high color
-		for (int y = 0; y < h; y++) {
-			const byte *srcRow = (const byte *)getBasePtr(0, y);
-			byte *dstRow = (byte *)surface->getBasePtr(0, y);
-
-			for (int x = 0; x < w; x++) {
-				uint32 srcColor;
-				if (format.bytesPerPixel == 2)
-					srcColor = READ_UINT16(srcRow);
-				else if (format.bytesPerPixel == 3)
-					srcColor = READ_UINT24(srcRow);
-				else
-					srcColor = READ_UINT32(srcRow);
-
-				srcRow += format.bytesPerPixel;
-
-				// Convert that color to the new format
-				byte r, g, b, a;
-				format.colorToARGB(srcColor, a, r, g, b);
-				uint32 color = dstFormat.ARGBToColor(a, r, g, b);
-
-				if (dstFormat.bytesPerPixel == 2)
-					*((uint16 *)dstRow) = color;
-				else
-					*((uint32 *)dstRow) = color;
-
-				dstRow += dstFormat.bytesPerPixel;
-			}
-		}
-	}
-
-	return surface;
-}
-
 } // namespace OldTransparentSurface
 
 #ifdef TEST_IMAGE_BLENDING_SAVE
@@ -950,7 +831,7 @@ public:
 		for (uint32 color = 0xffffffff; color != 0; color = (color == 0xffffffff ? 0x7f7f7f7f : 0)) {
             oldSurfDest.fillRect(Common::Rect(0, 0, oldSurfDest.w, oldSurfDest.h), oldSurfDest.format.ARGBToColor(255, 255, 255, 255));
             managedSurfDest.fillRect(Common::Rect(0, 0, managedSurfDest.w, managedSurfDest.h), managedSurfDest.format.ARGBToColor(255, 255, 255, 255));
-            oldSurf.setAlphaMode((Graphics::AlphaType)alphaType);
+            oldSurf._alphaMode = (Graphics::AlphaType)alphaType;
 			uint32 oldStart = g_system->getMillis();
 			for (int i = 0; i < iters; i++) {
             	oldSurf.blit(oldSurfDest, 0, 0, flipping, nullptr, color, -1, -1, (Graphics::TSpriteBlendMode)blendMode);
@@ -975,7 +856,7 @@ public:
 			// scaled
             oldSurfDest.fillRect(Common::Rect(0, 0, oldSurfDest.w, oldSurfDest.h), oldSurfDest.format.ARGBToColor(255, 255, 255, 255));
             managedSurfDest.fillRect(Common::Rect(0, 0, managedSurfDest.w, managedSurfDest.h), managedSurfDest.format.ARGBToColor(255, 255, 255, 255));
-            oldSurf.setAlphaMode((Graphics::AlphaType)alphaType);
+            oldSurf._alphaMode = (Graphics::AlphaType)alphaType;
 			oldStart = g_system->getMillis();
 			for (int i = 0; i < iters; i++) {
             	oldSurf.blit(oldSurfDest, 0, 0, flipping, nullptr, color, oldSurfDest.w, oldSurfDest.h, (Graphics::TSpriteBlendMode)blendMode);
@@ -1082,7 +963,7 @@ public:
         for (int flipping = 0; flipping <= 3; flipping++) {
         for (int rect = 0; rect < (int)(sizeof(srcs)/sizeof(srcs[0])); rect++) {
             oldSurfDest.fillRect(Common::Rect(0, 0, oldSurfDest.w, oldSurfDest.h), oldSurfDest.format.ARGBToColor(ba, br, bg, bb));
-            oldSurf.setAlphaMode((Graphics::AlphaType)alphaType);
+            oldSurf._alphaMode = (Graphics::AlphaType)alphaType;
             Common::Rect ret1 = oldSurf.blit(oldSurfDest, dsts[rect].left, dsts[rect].top, flipping, &srcs[rect], MS_ARGB(a, r, g, b), dsts[rect].width(), dsts[rect].height(), (Graphics::TSpriteBlendMode)blendMode);
             newSurfDest.fillRect(Common::Rect(0, 0, newSurfDest.w, newSurfDest.h), newSurfDest.format.ARGBToColor(ba, br, bg, bb));
             newSurf.setAlphaMode((Graphics::AlphaType)alphaType);
@@ -1138,7 +1019,7 @@ public:
 
 			
             oldSurfDest.fillRect(Common::Rect(0, 0, oldSurfDest.w, oldSurfDest.h), oldSurfDest.format.ARGBToColor(ba, br, bg, bb));
-            oldSurf.setAlphaMode((Graphics::AlphaType)alphaType);
+            oldSurf._alphaMode = (Graphics::AlphaType)alphaType;
             ret1 = oldSurf.blitClip(oldSurfDest, Common::Rect(2, 2, oldSurfDest.w - 2, oldSurfDest.h - 2), dsts[rect].left, dsts[rect].top, flipping, &srcs[rect], MS_ARGB(a, r, g, b), dsts[rect].width(), dsts[rect].height(), (Graphics::TSpriteBlendMode)blendMode);
             newSurfDest.fillRect(Common::Rect(0, 0, newSurfDest.w, newSurfDest.h), newSurfDest.format.ARGBToColor(ba, br, bg, bb));
             newSurf.setAlphaMode((Graphics::AlphaType)alphaType);


Commit: 66c75ee760a55bb5e30ef8dcee001abc0d7b4b34
    https://github.com/scummvm/scummvm/commit/66c75ee760a55bb5e30ef8dcee001abc0d7b4b34
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
GRAPHICS: Moved SIMD code to new translation unit

Changed paths:
  A graphics/blit/blit-blend-avx2.h
  A graphics/blit/blit-blend-neon.h
  A graphics/blit/blit-blend-normal.h
  A graphics/blit/blit-blend-sse2.h
  A graphics/blit/blit-blend.cpp
  R graphics/blit/blit-avx2.cpp
  R graphics/blit/blit-neon.cpp
  R graphics/blit/blit-sse2.cpp
    graphics/blit/blit-alpha.cpp
    graphics/module.mk


diff --git a/graphics/blit/blit-alpha.cpp b/graphics/blit/blit-alpha.cpp
index 4b5793c5056..bb82b13d68c 100644
--- a/graphics/blit/blit-alpha.cpp
+++ b/graphics/blit/blit-alpha.cpp
@@ -21,11 +21,6 @@
 
 #include "graphics/blit.h"
 #include "graphics/pixelformat.h"
-#include "common/system.h"
-
-#include "graphics/blit/blit-neon.cpp"
-#include "graphics/blit/blit-sse2.cpp"
-#include "graphics/blit/blit-avx2.cpp"
 
 namespace Graphics {
 
@@ -172,511 +167,4 @@ bool setAlpha(byte *dst, const byte *src,
     return true;
 }
 
-
-BlendBlit::Args::Args(byte *dst, const byte *src,
-    const uint _dstPitch, const uint _srcPitch,
-    const int posX, const int posY,
-    const uint _width, const uint _height,
-    const int _scaleX, const int _scaleY,
-    const int scaleXsrcOff, const int scaleYsrcOff,
-    const uint32 colorMod, const uint _flipping) :
-        xp(0), yp(0), dstPitch(_dstPitch),
-        width(_width), height(_height), color(colorMod),
-        scaleX(_scaleX), scaleY(_scaleY), flipping(_flipping),
-        scaleXoff(scaleXsrcOff), scaleYoff(scaleYsrcOff) {
-    bool doScale = scaleX != SCALE_THRESHOLD || scaleY != SCALE_THRESHOLD;
-    
-    rgbmod   = ((colorMod & kRGBModMask) != kRGBModMask);
-    alphamod = ((colorMod & kAModMask)   != kAModMask);
-    inStep = 4;
-    inoStep = _srcPitch;
-    if (flipping & FLIP_H) {
-        inStep = -inStep;
-        xp = width - 1;
-        if (doScale) xp = xp * scaleX / SCALE_THRESHOLD;
-    }
-
-    if (flipping & FLIP_V) {
-        inoStep = -inoStep;
-        yp = height - 1;
-        if (doScale) yp = yp * scaleY / SCALE_THRESHOLD;
-    }
-
-    ino = src + yp * _srcPitch + xp * 4;
-    outo = dst + posY * _dstPitch + posX * 4;
-}
-
-/**
- * Optimized version of doBlit to be used with multiply blended blitting
- */
-template<bool doscale, bool rgbmod, bool alphamod>
-void BlendBlit::doBlitMultiplyBlendLogicGeneric(Args &args) {
-    const byte *in;
-    byte *out;
-
-    int scaleXCtr, scaleYCtr = args.scaleYoff;
-    const byte *inBase;
-
-    const byte rawcr = (args.color >> kRModShift) & 0xFF;
-    const byte rawcg = (args.color >> kGModShift) & 0xFF;
-    const byte rawcb = (args.color >> kBModShift) & 0xFF;
-    const byte ca = alphamod ? ((args.color >> kAModShift) & 0xFF) : 255;
-    const uint32 cr = rgbmod   ? (rawcr == 255 ? 256 : rawcr) : 256;
-    const uint32 cg = rgbmod   ? (rawcg == 255 ? 256 : rawcg) : 256;
-    const uint32 cb = rgbmod   ? (rawcb == 255 ? 256 : rawcb) : 256;
-
-    for (uint32 i = 0; i < args.height; i++) {
-        if (doscale) {
-            inBase = args.ino + scaleYCtr / SCALE_THRESHOLD * args.inoStep;
-            scaleXCtr = args.scaleXoff;
-        } else {
-            in = args.ino;
-        }
-        out = args.outo;
-        for (uint32 j = 0; j < args.width; j++) {
-            if (doscale) {
-                in = inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep;
-            }
-
-            uint32 ina = in[kAIndex] * ca >> 8;
-
-            if (ina != 0) {
-                out[kBIndex] = out[kBIndex] * ((in[kBIndex] * cb * ina) >> 16) >> 8;
-                out[kGIndex] = out[kGIndex] * ((in[kGIndex] * cg * ina) >> 16) >> 8;
-                out[kRIndex] = out[kRIndex] * ((in[kRIndex] * cr * ina) >> 16) >> 8;
-            }
-
-            if (doscale)
-                scaleXCtr += args.scaleX;
-            else
-                in += args.inStep;
-            out += 4;
-        }
-        if (doscale)
-            scaleYCtr += args.scaleY;
-        else
-            args.ino += args.inoStep;
-        args.outo += args.dstPitch;
-    }
-
-}
-
-template<bool doscale, bool rgbmod, bool alphamod>
-void BlendBlit::doBlitAlphaBlendLogicGeneric(Args &args) {
-    const byte *in;
-    byte *out;
-
-    int scaleXCtr, scaleYCtr = args.scaleYoff;
-    const byte *inBase;
-
-    const byte ca = alphamod ? ((args.color >> kAModShift) & 0xFF) : 255;
-    const byte cr = rgbmod   ? ((args.color >> kRModShift) & 0xFF) : 255;
-    const byte cg = rgbmod   ? ((args.color >> kGModShift) & 0xFF) : 255;
-    const byte cb = rgbmod   ? ((args.color >> kBModShift) & 0xFF) : 255;
-
-    for (uint32 i = 0; i < args.height; i++) {
-        if (doscale) {
-            inBase = args.ino + scaleYCtr / SCALE_THRESHOLD * args.inoStep;
-            scaleXCtr = args.scaleXoff;
-        } else {
-            in = args.ino;
-        }
-        out = args.outo;
-        for (uint32 j = 0; j < args.width; j++) {
-            if (doscale) {
-                in = inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep;
-            }
-
-            uint32 ina = in[kAIndex] * ca >> 8;
-
-            if (ina != 0) {
-                if (rgbmod) {
-                    const uint outb = (out[kBIndex] * (255 - ina) >> 8);
-                    const uint outg = (out[kGIndex] * (255 - ina) >> 8);
-                    const uint outr = (out[kRIndex] * (255 - ina) >> 8);
-
-                    out[kAIndex] = 255;
-                    out[kBIndex] = outb + (in[kBIndex] * ina * cb >> 16);
-                    out[kGIndex] = outg + (in[kGIndex] * ina * cg >> 16);
-                    out[kRIndex] = outr + (in[kRIndex] * ina * cr >> 16);
-                } else {
-                    out[kAIndex] = 255;
-                    out[kBIndex] = (out[kBIndex] * (255 - ina) + in[kBIndex] * ina) >> 8;
-                    out[kGIndex] = (out[kGIndex] * (255 - ina) + in[kGIndex] * ina) >> 8;
-                    out[kRIndex] = (out[kRIndex] * (255 - ina) + in[kRIndex] * ina) >> 8;
-                    
-                }
-            }
-
-            if (doscale)
-                scaleXCtr += args.scaleX;
-            else
-                in += args.inStep;
-            out += 4;
-        }
-
-        if (doscale)
-            scaleYCtr += args.scaleY;
-        else
-            args.ino += args.inoStep;
-        args.outo += args.dstPitch;
-    }
-}
-
-/**
- * Optimized version of doBlit to be used with subtractive blended blitting
- */
-template<bool doscale, bool rgbmod>
-void BlendBlit::doBlitSubtractiveBlendLogicGeneric(Args &args) {
-    const byte *in;
-    byte *out;
-
-    int scaleXCtr, scaleYCtr = args.scaleYoff;
-    const byte *inBase;
-
-    const byte rawcr = (args.color >> kRModShift) & 0xFF;
-    const byte rawcg = (args.color >> kGModShift) & 0xFF;
-    const byte rawcb = (args.color >> kBModShift) & 0xFF;
-    const uint32 cr = rgbmod   ? (rawcr == 255 ? 256 : rawcr) : 256;
-    const uint32 cg = rgbmod   ? (rawcg == 255 ? 256 : rawcg) : 256;
-    const uint32 cb = rgbmod   ? (rawcb == 255 ? 256 : rawcb) : 256;
-
-    for (uint32 i = 0; i < args.height; i++) {
-        if (doscale) {
-            inBase = args.ino + scaleYCtr / SCALE_THRESHOLD * args.inoStep;
-            scaleXCtr = args.scaleXoff;
-        } else {
-            in = args.ino;
-        }
-        out = args.outo;
-        for (uint32 j = 0; j < args.width; j++) {
-            if (doscale) {
-                in = inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep;
-            }
-
-            out[kAIndex] = 255;
-            out[kBIndex] = MAX<int32>(out[kBIndex] - ((in[kBIndex] * cb  * (out[kBIndex]) * in[kAIndex]) >> 24), 0);
-            out[kGIndex] = MAX<int32>(out[kGIndex] - ((in[kGIndex] * cg  * (out[kGIndex]) * in[kAIndex]) >> 24), 0);
-            out[kRIndex] = MAX<int32>(out[kRIndex] - ((in[kRIndex] * cr * (out[kRIndex]) * in[kAIndex]) >> 24), 0);
-
-            if (doscale)
-                scaleXCtr += args.scaleX;
-            else
-                in += args.inStep;
-            out += 4;
-        }
-        if (doscale)
-            scaleYCtr += args.scaleY;
-        else
-            args.ino += args.inoStep;
-        args.outo += args.dstPitch;
-    }
-}
-
-/**
- * Optimized version of doBlit to be used with additive blended blitting
- */
-template<bool doscale, bool rgbmod, bool alphamod>
-void BlendBlit::doBlitAdditiveBlendLogicGeneric(Args &args) {
-    const byte *in;
-    byte *out;
-
-    int scaleXCtr, scaleYCtr = args.scaleYoff;
-    const byte *inBase;
-
-    const byte rawcr = (args.color >> kRModShift) & 0xFF;
-    const byte rawcg = (args.color >> kGModShift) & 0xFF;
-    const byte rawcb = (args.color >> kBModShift) & 0xFF;
-    const byte ca = alphamod ? ((args.color >> kAModShift) & 0xFF) : 255;
-    const uint32 cr = rgbmod   ? (rawcr == 255 ? 256 : rawcr) : 256;
-    const uint32 cg = rgbmod   ? (rawcg == 255 ? 256 : rawcg) : 256;
-    const uint32 cb = rgbmod   ? (rawcb == 255 ? 256 : rawcb) : 256;
-
-    for (uint32 i = 0; i < args.height; i++) {
-        if (doscale) {
-            inBase = args.ino + scaleYCtr / SCALE_THRESHOLD * args.inoStep;
-            scaleXCtr = args.scaleXoff;
-        } else {
-            in = args.ino;
-        }
-        out = args.outo;
-        for (uint32 j = 0; j < args.width; j++) {
-            if (doscale) {
-                in = inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep;
-            }
-
-            uint32 ina = in[kAIndex] * ca >> 8;
-
-            if (ina != 0) {
-                out[kBIndex] = out[kBIndex] + ((in[kBIndex] * cb * ina) >> 16);
-                out[kGIndex] = out[kGIndex] + ((in[kGIndex] * cg * ina) >> 16);
-                out[kRIndex] = out[kRIndex] + ((in[kRIndex] * cr * ina) >> 16);
-            }
-
-            if (doscale)
-                scaleXCtr += args.scaleX;
-            else
-                in += args.inStep;
-            out += 4;
-        }
-
-        if (doscale)
-            scaleYCtr += args.scaleY;
-        else
-            args.ino += args.inoStep;
-        args.outo += args.dstPitch;
-    }
-}
-
-template<bool doscale>
-void BlendBlit::doBlitOpaqueBlendLogicGeneric(Args &args) {
-    const byte *in;
-    byte *out;
-
-    int scaleXCtr, scaleYCtr = args.scaleYoff;
-    const byte *inBase;
-
-    for (uint32 i = 0; i < args.height; i++) {
-        if (doscale) {
-            inBase = args.ino + (scaleYCtr + 1) / SCALE_THRESHOLD * args.inoStep;
-            scaleXCtr = args.scaleXoff;
-        } else {
-            in = args.ino;
-        }
-        out = args.outo;
-
-        if (doscale) {
-            for (uint32 j = 0; j < args.width; j++) {
-                in = inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep;
-                *(uint32 *)out = *(const uint32 *)in | kAModMask;
-                scaleXCtr += args.scaleX;
-                out += 4;
-            }
-        } else {
-            for (uint32 j = 0; j < args.width; j++) {
-                *(uint32 *)out = *(const uint32 *)in | kAModMask;
-                in += args.inStep;
-                out += 4;
-            }
-        }
-
-        if (doscale)
-            scaleYCtr += args.scaleY;
-        else
-            args.ino += args.inoStep;
-        args.outo += args.dstPitch;
-    }
-}
-
-template<bool doscale>
-void BlendBlit::doBlitBinaryBlendLogicGeneric(Args &args) {
-    const byte *in;
-    byte *out;
-
-    int scaleXCtr, scaleYCtr = args.scaleYoff;
-    const byte *inBase;
-
-    for (uint32 i = 0; i < args.height; i++) {
-        if (doscale) {
-            inBase = args.ino + scaleYCtr / SCALE_THRESHOLD * args.inoStep;
-            scaleXCtr = args.scaleXoff;
-        } else {
-            in = args.ino;
-        }
-        out = args.outo;
-        for (uint32 j = 0; j < args.width; j++) {
-            if (doscale) {
-                in = inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep;
-            }
-
-            uint32 pix = *(const uint32 *)in, pixout = *(const uint32 *)out;
-            uint32 mask = (pix & kAModMask) ? 0xffffffff : 0;
-            pixout &= ~mask;
-            pix = (pix | kAModMask) & mask;
-            *(uint32 *)out = pixout | pix;
-            
-            if (doscale)
-                scaleXCtr += args.scaleX;
-            else
-                in += args.inStep;
-            out += 4;
-        }
-        if (doscale)
-            scaleYCtr += args.scaleY;
-        else
-            args.ino += args.inoStep;
-        args.outo += args.dstPitch;
-    }
-}
-
-// Initialize this to nullptr at the start
-BlendBlit::BlitFunc BlendBlit::blitFunc = nullptr;
-
-// Only blits to and from 32bpp images
-// So this function is just here to jump to whatever function is in
-// BlendBlit::blitFunc. This way, we can detect at runtime whether or not
-// the cpu has certain SIMD feature enabled or not.
-void BlendBlit::blit(byte *dst, const byte *src,
-                     const uint dstPitch, const uint srcPitch,
-                     const int posX, const int posY,
-                     const uint width, const uint height,
-                     const int scaleX, const int scaleY,
-                     const int scaleXsrcOff, const int scaleYsrcOff,
-                     const uint32 colorMod, const uint flipping,
-                     const TSpriteBlendMode blendMode,
-                     const AlphaType alphaType) {
-    if (width == 0 || height == 0) return;
-
-    // If no function has been selected yet, detect and select
-    if (!blitFunc) {
-        // Get the correct blit function
-        blitFunc = blitGeneric;
-#ifdef SCUMMVM_NEON
-        if (g_system->hasFeature(OSystem::kFeatureCpuNEON)) blitFunc = blitNEON;
-#endif
-#ifdef SCUMMVM_SSE2
-        if (g_system->hasFeature(OSystem::kFeatureCpuSSE2)) blitFunc = blitSSE2;
-#endif
-#ifdef SCUMMVM_AVX2
-        if (g_system->hasFeature(OSystem::kFeatureCpuAVX2)) blitFunc = blitAVX2;
-#endif
-    }
-    
-    Args args(dst, src, dstPitch, srcPitch, posX, posY, width, height, scaleX, scaleY, scaleXsrcOff, scaleYsrcOff, colorMod, flipping);
-    blitFunc(args, blendMode, alphaType);
-}
-
-// This is just a macro to expand it because its a pretty simple function where
-// readabiliy doesn't matter too much and macros tend to work faster better than functors
-#define BLIT_FUNC(ext) \
-    void BlendBlit::blit##ext(Args &args, const TSpriteBlendMode &blendMode, const AlphaType &alphaType) { \
-        bool rgbmod   = ((args.color & kRGBModMask) != kRGBModMask); \
-        bool alphamod = ((args.color & kAModMask)   != kAModMask); \
-        if (args.scaleX == SCALE_THRESHOLD && args.scaleY == SCALE_THRESHOLD) { \
-            if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_OPAQUE) { \
-                doBlitOpaqueBlendLogic##ext<false>(args); \
-            } else if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_BINARY) { \
-                doBlitBinaryBlendLogic##ext<false>(args); \
-            } else { \
-                if (blendMode == BLEND_ADDITIVE) { \
-                    if (rgbmod) { \
-                        if (alphamod) { \
-                            doBlitAdditiveBlendLogic##ext<false, true, true>(args); \
-                        } else { \
-                            doBlitAdditiveBlendLogic##ext<false, true, false>(args); \
-                        } \
-                    } else { \
-                        if (alphamod) { \
-                            doBlitAdditiveBlendLogic##ext<false, false, true>(args); \
-                        } else { \
-                            doBlitAdditiveBlendLogic##ext<false, false, false>(args); \
-                        } \
-                    } \
-                } else if (blendMode == BLEND_SUBTRACTIVE) { \
-                    if (rgbmod) { \
-                        doBlitSubtractiveBlendLogic##ext<false, true>(args); \
-                    } else { \
-                        doBlitSubtractiveBlendLogic##ext<false, false>(args); \
-                    } \
-                } else if (blendMode == BLEND_MULTIPLY) { \
-                    if (rgbmod) { \
-                        if (alphamod) { \
-                            doBlitMultiplyBlendLogic##ext<false, true, true>(args); \
-                        } else { \
-                            doBlitMultiplyBlendLogic##ext<false, true, false>(args); \
-                        } \
-                    } else { \
-                        if (alphamod) { \
-                            doBlitMultiplyBlendLogic##ext<false, false, true>(args); \
-                        } else { \
-                            doBlitMultiplyBlendLogic##ext<false, false, false>(args); \
-                        } \
-                    } \
-                } else { \
-                    assert(blendMode == BLEND_NORMAL); \
-                    if (rgbmod) { \
-                        if (alphamod) { \
-                            doBlitAlphaBlendLogic##ext<false, true, true>(args); \
-                        } else { \
-                            doBlitAlphaBlendLogic##ext<false, true, false>(args); \
-                        } \
-                    } else { \
-                        if (alphamod) { \
-                            doBlitAlphaBlendLogic##ext<false, false, true>(args); \
-                        } else { \
-                            doBlitAlphaBlendLogic##ext<false, false, false>(args); \
-                        } \
-                    } \
-                } \
-            } \
-        } else { \
-            if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_OPAQUE) { \
-                doBlitOpaqueBlendLogic##ext<true>(args); \
-            } else if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_BINARY) { \
-                doBlitBinaryBlendLogic##ext<true>(args); \
-            } else { \
-                if (blendMode == BLEND_ADDITIVE) { \
-                    if (rgbmod) { \
-                        if (alphamod) { \
-                            doBlitAdditiveBlendLogic##ext<true, true, true>(args); \
-                        } else { \
-                            doBlitAdditiveBlendLogic##ext<true, true, false>(args); \
-                        } \
-                    } else { \
-                        if (alphamod) { \
-                            doBlitAdditiveBlendLogic##ext<true, false, true>(args); \
-                        } else { \
-                            doBlitAdditiveBlendLogic##ext<true, false, false>(args); \
-                        } \
-                    } \
-                } else if (blendMode == BLEND_SUBTRACTIVE) { \
-                    if (rgbmod) { \
-                        doBlitSubtractiveBlendLogic##ext<true, true>(args); \
-                    } else { \
-                        doBlitSubtractiveBlendLogic##ext<true, false>(args); \
-                    } \
-                } else if (blendMode == BLEND_MULTIPLY) { \
-                    if (rgbmod) { \
-                        if (alphamod) { \
-                            doBlitMultiplyBlendLogic##ext<true, true, true>(args); \
-                        } else { \
-                            doBlitMultiplyBlendLogic##ext<true, true, false>(args); \
-                        } \
-                    } else { \
-                        if (alphamod) { \
-                            doBlitMultiplyBlendLogic##ext<true, false, true>(args); \
-                        } else { \
-                            doBlitMultiplyBlendLogic##ext<true, false, false>(args); \
-                        } \
-                    } \
-                } else { \
-                    assert(blendMode == BLEND_NORMAL); \
-                    if (rgbmod) { \
-                        if (alphamod) { \
-                            doBlitAlphaBlendLogic##ext<true, true, true>(args); \
-                        } else { \
-                            doBlitAlphaBlendLogic##ext<true, true, false>(args); \
-                        } \
-                    } else { \
-                        if (alphamod) { \
-                            doBlitAlphaBlendLogic##ext<true, false, true>(args); \
-                        } else { \
-                            doBlitAlphaBlendLogic##ext<true, false, false>(args); \
-                        } \
-                    } \
-                } \
-            } \
-        } \
-    }
-BLIT_FUNC(Generic)
-#ifdef SCUMMVM_NEON
-BLIT_FUNC(NEON)
-#endif
-#ifdef SCUMMVM_SSE2
-BLIT_FUNC(SSE2)
-#endif
-#ifdef SCUMMVM_AVX2
-BLIT_FUNC(AVX2)
-#endif
-
 } // End of namespace Graphics
diff --git a/graphics/blit/blit-avx2.cpp b/graphics/blit/blit-blend-avx2.h
similarity index 99%
rename from graphics/blit/blit-avx2.cpp
rename to graphics/blit/blit-blend-avx2.h
index 893ac21a3f0..d22d7ef4763 100644
--- a/graphics/blit/blit-avx2.cpp
+++ b/graphics/blit/blit-blend-avx2.h
@@ -19,6 +19,9 @@
  *
  */
 
+#ifndef GRAPHICS_BLIT_BLIT_BLEND_AVX2_H
+#define GRAPHICS_BLIT_BLIT_BLEND_AVX2_H
+
 #ifdef SCUMMVM_AVX2
 #include <immintrin.h>
 
@@ -368,3 +371,4 @@ void BlendBlit::doBlitMultiplyBlendLogicAVX2(Args &args) {
 } // End of namespace Graphics
 
 #endif // SCUMMVM_AVX2
+#endif // GRAPHICS_BLIT_BLIT_BLEND_AVX2_H
diff --git a/graphics/blit/blit-neon.cpp b/graphics/blit/blit-blend-neon.h
similarity index 99%
rename from graphics/blit/blit-neon.cpp
rename to graphics/blit/blit-blend-neon.h
index 6d57e16f857..66fa2d0484b 100644
--- a/graphics/blit/blit-neon.cpp
+++ b/graphics/blit/blit-blend-neon.h
@@ -19,6 +19,9 @@
  *
  */
 
+#ifndef GRAPHICS_BLIT_BLIT_BLEND_NEON_H
+#define GRAPHICS_BLIT_BLIT_BLEND_NEON_H
+
 #ifdef SCUMMVM_NEON
 #include <arm_neon.h>
 
@@ -370,3 +373,4 @@ void BlendBlit::doBlitMultiplyBlendLogicNEON(Args &args) {
 } // end of namespace Graphics
 
 #endif // SCUMMVM_NEON
+#endif // GRAPHICS_BLIT_BLIT_BLEND_NEON_H
diff --git a/graphics/blit/blit-blend-normal.h b/graphics/blit/blit-blend-normal.h
new file mode 100644
index 00000000000..a7903c4097f
--- /dev/null
+++ b/graphics/blit/blit-blend-normal.h
@@ -0,0 +1,333 @@
+/* ScummVM - Graphic Adventure Engine
+ *
+ * ScummVM is the legal property of its developers, whose names
+ * are too numerous to list here. Please refer to the COPYRIGHT
+ * file distributed with this source distribution.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef GRAPHICS_BLIT_BLIT_BLEND_NORMAL_H
+#define GRAPHICS_BLIT_BLIT_BLEND_NORMAL_H
+#include "graphics/blit.h"
+
+namespace Graphics {
+
+/**
+ * Optimized version of doBlit to be used with multiply blended blitting
+ */
+template<bool doscale, bool rgbmod, bool alphamod>
+void BlendBlit::doBlitMultiplyBlendLogicGeneric(Args &args) {
+    const byte *in;
+    byte *out;
+
+    int scaleXCtr, scaleYCtr = args.scaleYoff;
+    const byte *inBase;
+
+    const byte rawcr = (args.color >> kRModShift) & 0xFF;
+    const byte rawcg = (args.color >> kGModShift) & 0xFF;
+    const byte rawcb = (args.color >> kBModShift) & 0xFF;
+    const byte ca = alphamod ? ((args.color >> kAModShift) & 0xFF) : 255;
+    const uint32 cr = rgbmod   ? (rawcr == 255 ? 256 : rawcr) : 256;
+    const uint32 cg = rgbmod   ? (rawcg == 255 ? 256 : rawcg) : 256;
+    const uint32 cb = rgbmod   ? (rawcb == 255 ? 256 : rawcb) : 256;
+
+    for (uint32 i = 0; i < args.height; i++) {
+        if (doscale) {
+            inBase = args.ino + scaleYCtr / SCALE_THRESHOLD * args.inoStep;
+            scaleXCtr = args.scaleXoff;
+        } else {
+            in = args.ino;
+        }
+        out = args.outo;
+        for (uint32 j = 0; j < args.width; j++) {
+            if (doscale) {
+                in = inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep;
+            }
+
+            uint32 ina = in[kAIndex] * ca >> 8;
+
+            if (ina != 0) {
+                out[kBIndex] = out[kBIndex] * ((in[kBIndex] * cb * ina) >> 16) >> 8;
+                out[kGIndex] = out[kGIndex] * ((in[kGIndex] * cg * ina) >> 16) >> 8;
+                out[kRIndex] = out[kRIndex] * ((in[kRIndex] * cr * ina) >> 16) >> 8;
+            }
+
+            if (doscale)
+                scaleXCtr += args.scaleX;
+            else
+                in += args.inStep;
+            out += 4;
+        }
+        if (doscale)
+            scaleYCtr += args.scaleY;
+        else
+            args.ino += args.inoStep;
+        args.outo += args.dstPitch;
+    }
+
+}
+
+template<bool doscale, bool rgbmod, bool alphamod>
+void BlendBlit::doBlitAlphaBlendLogicGeneric(Args &args) {
+    const byte *in;
+    byte *out;
+
+    int scaleXCtr, scaleYCtr = args.scaleYoff;
+    const byte *inBase;
+
+    const byte ca = alphamod ? ((args.color >> kAModShift) & 0xFF) : 255;
+    const byte cr = rgbmod   ? ((args.color >> kRModShift) & 0xFF) : 255;
+    const byte cg = rgbmod   ? ((args.color >> kGModShift) & 0xFF) : 255;
+    const byte cb = rgbmod   ? ((args.color >> kBModShift) & 0xFF) : 255;
+
+    for (uint32 i = 0; i < args.height; i++) {
+        if (doscale) {
+            inBase = args.ino + scaleYCtr / SCALE_THRESHOLD * args.inoStep;
+            scaleXCtr = args.scaleXoff;
+        } else {
+            in = args.ino;
+        }
+        out = args.outo;
+        for (uint32 j = 0; j < args.width; j++) {
+            if (doscale) {
+                in = inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep;
+            }
+
+            uint32 ina = in[kAIndex] * ca >> 8;
+
+            if (ina != 0) {
+                if (rgbmod) {
+                    const uint outb = (out[kBIndex] * (255 - ina) >> 8);
+                    const uint outg = (out[kGIndex] * (255 - ina) >> 8);
+                    const uint outr = (out[kRIndex] * (255 - ina) >> 8);
+
+                    out[kAIndex] = 255;
+                    out[kBIndex] = outb + (in[kBIndex] * ina * cb >> 16);
+                    out[kGIndex] = outg + (in[kGIndex] * ina * cg >> 16);
+                    out[kRIndex] = outr + (in[kRIndex] * ina * cr >> 16);
+                } else {
+                    out[kAIndex] = 255;
+                    out[kBIndex] = (out[kBIndex] * (255 - ina) + in[kBIndex] * ina) >> 8;
+                    out[kGIndex] = (out[kGIndex] * (255 - ina) + in[kGIndex] * ina) >> 8;
+                    out[kRIndex] = (out[kRIndex] * (255 - ina) + in[kRIndex] * ina) >> 8;
+                    
+                }
+            }
+
+            if (doscale)
+                scaleXCtr += args.scaleX;
+            else
+                in += args.inStep;
+            out += 4;
+        }
+
+        if (doscale)
+            scaleYCtr += args.scaleY;
+        else
+            args.ino += args.inoStep;
+        args.outo += args.dstPitch;
+    }
+}
+
+/**
+ * Optimized version of doBlit to be used with subtractive blended blitting
+ */
+template<bool doscale, bool rgbmod>
+void BlendBlit::doBlitSubtractiveBlendLogicGeneric(Args &args) {
+    const byte *in;
+    byte *out;
+
+    int scaleXCtr, scaleYCtr = args.scaleYoff;
+    const byte *inBase;
+
+    const byte rawcr = (args.color >> kRModShift) & 0xFF;
+    const byte rawcg = (args.color >> kGModShift) & 0xFF;
+    const byte rawcb = (args.color >> kBModShift) & 0xFF;
+    const uint32 cr = rgbmod   ? (rawcr == 255 ? 256 : rawcr) : 256;
+    const uint32 cg = rgbmod   ? (rawcg == 255 ? 256 : rawcg) : 256;
+    const uint32 cb = rgbmod   ? (rawcb == 255 ? 256 : rawcb) : 256;
+
+    for (uint32 i = 0; i < args.height; i++) {
+        if (doscale) {
+            inBase = args.ino + scaleYCtr / SCALE_THRESHOLD * args.inoStep;
+            scaleXCtr = args.scaleXoff;
+        } else {
+            in = args.ino;
+        }
+        out = args.outo;
+        for (uint32 j = 0; j < args.width; j++) {
+            if (doscale) {
+                in = inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep;
+            }
+
+            out[kAIndex] = 255;
+            out[kBIndex] = MAX<int32>(out[kBIndex] - ((in[kBIndex] * cb  * (out[kBIndex]) * in[kAIndex]) >> 24), 0);
+            out[kGIndex] = MAX<int32>(out[kGIndex] - ((in[kGIndex] * cg  * (out[kGIndex]) * in[kAIndex]) >> 24), 0);
+            out[kRIndex] = MAX<int32>(out[kRIndex] - ((in[kRIndex] * cr * (out[kRIndex]) * in[kAIndex]) >> 24), 0);
+
+            if (doscale)
+                scaleXCtr += args.scaleX;
+            else
+                in += args.inStep;
+            out += 4;
+        }
+        if (doscale)
+            scaleYCtr += args.scaleY;
+        else
+            args.ino += args.inoStep;
+        args.outo += args.dstPitch;
+    }
+}
+
+/**
+ * Optimized version of doBlit to be used with additive blended blitting
+ */
+template<bool doscale, bool rgbmod, bool alphamod>
+void BlendBlit::doBlitAdditiveBlendLogicGeneric(Args &args) {
+    const byte *in;
+    byte *out;
+
+    int scaleXCtr, scaleYCtr = args.scaleYoff;
+    const byte *inBase;
+
+    const byte rawcr = (args.color >> kRModShift) & 0xFF;
+    const byte rawcg = (args.color >> kGModShift) & 0xFF;
+    const byte rawcb = (args.color >> kBModShift) & 0xFF;
+    const byte ca = alphamod ? ((args.color >> kAModShift) & 0xFF) : 255;
+    const uint32 cr = rgbmod   ? (rawcr == 255 ? 256 : rawcr) : 256;
+    const uint32 cg = rgbmod   ? (rawcg == 255 ? 256 : rawcg) : 256;
+    const uint32 cb = rgbmod   ? (rawcb == 255 ? 256 : rawcb) : 256;
+
+    for (uint32 i = 0; i < args.height; i++) {
+        if (doscale) {
+            inBase = args.ino + scaleYCtr / SCALE_THRESHOLD * args.inoStep;
+            scaleXCtr = args.scaleXoff;
+        } else {
+            in = args.ino;
+        }
+        out = args.outo;
+        for (uint32 j = 0; j < args.width; j++) {
+            if (doscale) {
+                in = inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep;
+            }
+
+            uint32 ina = in[kAIndex] * ca >> 8;
+
+            if (ina != 0) {
+                out[kBIndex] = out[kBIndex] + ((in[kBIndex] * cb * ina) >> 16);
+                out[kGIndex] = out[kGIndex] + ((in[kGIndex] * cg * ina) >> 16);
+                out[kRIndex] = out[kRIndex] + ((in[kRIndex] * cr * ina) >> 16);
+            }
+
+            if (doscale)
+                scaleXCtr += args.scaleX;
+            else
+                in += args.inStep;
+            out += 4;
+        }
+
+        if (doscale)
+            scaleYCtr += args.scaleY;
+        else
+            args.ino += args.inoStep;
+        args.outo += args.dstPitch;
+    }
+}
+
+template<bool doscale>
+void BlendBlit::doBlitOpaqueBlendLogicGeneric(Args &args) {
+    const byte *in;
+    byte *out;
+
+    int scaleXCtr, scaleYCtr = args.scaleYoff;
+    const byte *inBase;
+
+    for (uint32 i = 0; i < args.height; i++) {
+        if (doscale) {
+            inBase = args.ino + (scaleYCtr + 1) / SCALE_THRESHOLD * args.inoStep;
+            scaleXCtr = args.scaleXoff;
+        } else {
+            in = args.ino;
+        }
+        out = args.outo;
+
+        if (doscale) {
+            for (uint32 j = 0; j < args.width; j++) {
+                in = inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep;
+                *(uint32 *)out = *(const uint32 *)in | kAModMask;
+                scaleXCtr += args.scaleX;
+                out += 4;
+            }
+        } else {
+            for (uint32 j = 0; j < args.width; j++) {
+                *(uint32 *)out = *(const uint32 *)in | kAModMask;
+                in += args.inStep;
+                out += 4;
+            }
+        }
+
+        if (doscale)
+            scaleYCtr += args.scaleY;
+        else
+            args.ino += args.inoStep;
+        args.outo += args.dstPitch;
+    }
+}
+
+template<bool doscale>
+void BlendBlit::doBlitBinaryBlendLogicGeneric(Args &args) {
+    const byte *in;
+    byte *out;
+
+    int scaleXCtr, scaleYCtr = args.scaleYoff;
+    const byte *inBase;
+
+    for (uint32 i = 0; i < args.height; i++) {
+        if (doscale) {
+            inBase = args.ino + scaleYCtr / SCALE_THRESHOLD * args.inoStep;
+            scaleXCtr = args.scaleXoff;
+        } else {
+            in = args.ino;
+        }
+        out = args.outo;
+        for (uint32 j = 0; j < args.width; j++) {
+            if (doscale) {
+                in = inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep;
+            }
+
+            uint32 pix = *(const uint32 *)in, pixout = *(const uint32 *)out;
+            uint32 mask = (pix & kAModMask) ? 0xffffffff : 0;
+            pixout &= ~mask;
+            pix = (pix | kAModMask) & mask;
+            *(uint32 *)out = pixout | pix;
+            
+            if (doscale)
+                scaleXCtr += args.scaleX;
+            else
+                in += args.inStep;
+            out += 4;
+        }
+        if (doscale)
+            scaleYCtr += args.scaleY;
+        else
+            args.ino += args.inoStep;
+        args.outo += args.dstPitch;
+    }
+}
+
+} // end of namespace Graphics
+
+#endif // GRAPHICS_BLIT_BLIT_BLEND_NORMAL_H
diff --git a/graphics/blit/blit-sse2.cpp b/graphics/blit/blit-blend-sse2.h
similarity index 99%
rename from graphics/blit/blit-sse2.cpp
rename to graphics/blit/blit-blend-sse2.h
index 7aa572ce7a1..29a27832eba 100644
--- a/graphics/blit/blit-sse2.cpp
+++ b/graphics/blit/blit-blend-sse2.h
@@ -19,6 +19,9 @@
  *
  */
 
+#ifndef GRAPHICS_BLIT_BLIT_BLEND_SSE2_H
+#define GRAPHICS_BLIT_BLIT_BLEND_SSE2_H
+
 #ifdef SCUMMVM_SSE2
 #include <immintrin.h>
 
@@ -379,3 +382,4 @@ void BlendBlit::doBlitMultiplyBlendLogicSSE2(Args &args) {
 } // End of namespace Graphics
 
 #endif // SSE2
+#endif // GRAPHICS_BLIT_BLIT_BLEND_SSE2_H
diff --git a/graphics/blit/blit-blend.cpp b/graphics/blit/blit-blend.cpp
new file mode 100644
index 00000000000..a664f32eb06
--- /dev/null
+++ b/graphics/blit/blit-blend.cpp
@@ -0,0 +1,233 @@
+/* ScummVM - Graphic Adventure Engine
+ *
+ * ScummVM is the legal property of its developers, whose names
+ * are too numerous to list here. Please refer to the COPYRIGHT
+ * file distributed with this source distribution.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "common/system.h"
+#include "graphics/blit/blit-blend-normal.h"
+#include "graphics/blit/blit-blend-neon.h"
+#include "graphics/blit/blit-blend-sse2.h"
+#include "graphics/blit/blit-blend-avx2.h"
+
+namespace Graphics {
+
+BlendBlit::Args::Args(byte *dst, const byte *src,
+    const uint _dstPitch, const uint _srcPitch,
+    const int posX, const int posY,
+    const uint _width, const uint _height,
+    const int _scaleX, const int _scaleY,
+    const int scaleXsrcOff, const int scaleYsrcOff,
+    const uint32 colorMod, const uint _flipping) :
+        xp(0), yp(0), dstPitch(_dstPitch),
+        width(_width), height(_height), color(colorMod),
+        scaleX(_scaleX), scaleY(_scaleY), flipping(_flipping),
+        scaleXoff(scaleXsrcOff), scaleYoff(scaleYsrcOff) {
+    bool doScale = scaleX != SCALE_THRESHOLD || scaleY != SCALE_THRESHOLD;
+    
+    rgbmod   = ((colorMod & kRGBModMask) != kRGBModMask);
+    alphamod = ((colorMod & kAModMask)   != kAModMask);
+    inStep = 4;
+    inoStep = _srcPitch;
+    if (flipping & FLIP_H) {
+        inStep = -inStep;
+        xp = width - 1;
+        if (doScale) xp = xp * scaleX / SCALE_THRESHOLD;
+    }
+
+    if (flipping & FLIP_V) {
+        inoStep = -inoStep;
+        yp = height - 1;
+        if (doScale) yp = yp * scaleY / SCALE_THRESHOLD;
+    }
+
+    ino = src + yp * _srcPitch + xp * 4;
+    outo = dst + posY * _dstPitch + posX * 4;
+}
+
+// Initialize this to nullptr at the start
+BlendBlit::BlitFunc BlendBlit::blitFunc = nullptr;
+
+// Only blits to and from 32bpp images.
+// This function is just here to jump to whatever function is stored in
+// BlendBlit::blitFunc. This way, we can detect at runtime whether or not
+// the CPU has a certain SIMD feature enabled.
+void BlendBlit::blit(byte *dst, const byte *src,
+                     const uint dstPitch, const uint srcPitch,
+                     const int posX, const int posY,
+                     const uint width, const uint height,
+                     const int scaleX, const int scaleY,
+                     const int scaleXsrcOff, const int scaleYsrcOff,
+                     const uint32 colorMod, const uint flipping,
+                     const TSpriteBlendMode blendMode,
+                     const AlphaType alphaType) {
+    if (width == 0 || height == 0) return;
+
+    // If no function has been selected yet, detect and select
+    if (!blitFunc) {
+        // Get the correct blit function
+        blitFunc = blitGeneric;
+#ifdef SCUMMVM_NEON
+        if (g_system->hasFeature(OSystem::kFeatureCpuNEON)) blitFunc = blitNEON;
+#endif
+#ifdef SCUMMVM_SSE2
+        if (g_system->hasFeature(OSystem::kFeatureCpuSSE2)) blitFunc = blitSSE2;
+#endif
+#ifdef SCUMMVM_AVX2
+        if (g_system->hasFeature(OSystem::kFeatureCpuAVX2)) blitFunc = blitAVX2;
+#endif
+    }
+    
+    Args args(dst, src, dstPitch, srcPitch, posX, posY, width, height, scaleX, scaleY, scaleXsrcOff, scaleYsrcOff, colorMod, flipping);
+    blitFunc(args, blendMode, alphaType);
+}
+
+// This is just a macro to expand it because it's a pretty simple function where
+// readability doesn't matter too much and macros tend to work faster than functors
+#define BLIT_FUNC(ext) \
+    void BlendBlit::blit##ext(Args &args, const TSpriteBlendMode &blendMode, const AlphaType &alphaType) { \
+        bool rgbmod   = ((args.color & kRGBModMask) != kRGBModMask); \
+        bool alphamod = ((args.color & kAModMask)   != kAModMask); \
+        if (args.scaleX == SCALE_THRESHOLD && args.scaleY == SCALE_THRESHOLD) { \
+            if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_OPAQUE) { \
+                doBlitOpaqueBlendLogic##ext<false>(args); \
+            } else if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_BINARY) { \
+                doBlitBinaryBlendLogic##ext<false>(args); \
+            } else { \
+                if (blendMode == BLEND_ADDITIVE) { \
+                    if (rgbmod) { \
+                        if (alphamod) { \
+                            doBlitAdditiveBlendLogic##ext<false, true, true>(args); \
+                        } else { \
+                            doBlitAdditiveBlendLogic##ext<false, true, false>(args); \
+                        } \
+                    } else { \
+                        if (alphamod) { \
+                            doBlitAdditiveBlendLogic##ext<false, false, true>(args); \
+                        } else { \
+                            doBlitAdditiveBlendLogic##ext<false, false, false>(args); \
+                        } \
+                    } \
+                } else if (blendMode == BLEND_SUBTRACTIVE) { \
+                    if (rgbmod) { \
+                        doBlitSubtractiveBlendLogic##ext<false, true>(args); \
+                    } else { \
+                        doBlitSubtractiveBlendLogic##ext<false, false>(args); \
+                    } \
+                } else if (blendMode == BLEND_MULTIPLY) { \
+                    if (rgbmod) { \
+                        if (alphamod) { \
+                            doBlitMultiplyBlendLogic##ext<false, true, true>(args); \
+                        } else { \
+                            doBlitMultiplyBlendLogic##ext<false, true, false>(args); \
+                        } \
+                    } else { \
+                        if (alphamod) { \
+                            doBlitMultiplyBlendLogic##ext<false, false, true>(args); \
+                        } else { \
+                            doBlitMultiplyBlendLogic##ext<false, false, false>(args); \
+                        } \
+                    } \
+                } else { \
+                    assert(blendMode == BLEND_NORMAL); \
+                    if (rgbmod) { \
+                        if (alphamod) { \
+                            doBlitAlphaBlendLogic##ext<false, true, true>(args); \
+                        } else { \
+                            doBlitAlphaBlendLogic##ext<false, true, false>(args); \
+                        } \
+                    } else { \
+                        if (alphamod) { \
+                            doBlitAlphaBlendLogic##ext<false, false, true>(args); \
+                        } else { \
+                            doBlitAlphaBlendLogic##ext<false, false, false>(args); \
+                        } \
+                    } \
+                } \
+            } \
+        } else { \
+            if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_OPAQUE) { \
+                doBlitOpaqueBlendLogic##ext<true>(args); \
+            } else if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_BINARY) { \
+                doBlitBinaryBlendLogic##ext<true>(args); \
+            } else { \
+                if (blendMode == BLEND_ADDITIVE) { \
+                    if (rgbmod) { \
+                        if (alphamod) { \
+                            doBlitAdditiveBlendLogic##ext<true, true, true>(args); \
+                        } else { \
+                            doBlitAdditiveBlendLogic##ext<true, true, false>(args); \
+                        } \
+                    } else { \
+                        if (alphamod) { \
+                            doBlitAdditiveBlendLogic##ext<true, false, true>(args); \
+                        } else { \
+                            doBlitAdditiveBlendLogic##ext<true, false, false>(args); \
+                        } \
+                    } \
+                } else if (blendMode == BLEND_SUBTRACTIVE) { \
+                    if (rgbmod) { \
+                        doBlitSubtractiveBlendLogic##ext<true, true>(args); \
+                    } else { \
+                        doBlitSubtractiveBlendLogic##ext<true, false>(args); \
+                    } \
+                } else if (blendMode == BLEND_MULTIPLY) { \
+                    if (rgbmod) { \
+                        if (alphamod) { \
+                            doBlitMultiplyBlendLogic##ext<true, true, true>(args); \
+                        } else { \
+                            doBlitMultiplyBlendLogic##ext<true, true, false>(args); \
+                        } \
+                    } else { \
+                        if (alphamod) { \
+                            doBlitMultiplyBlendLogic##ext<true, false, true>(args); \
+                        } else { \
+                            doBlitMultiplyBlendLogic##ext<true, false, false>(args); \
+                        } \
+                    } \
+                } else { \
+                    assert(blendMode == BLEND_NORMAL); \
+                    if (rgbmod) { \
+                        if (alphamod) { \
+                            doBlitAlphaBlendLogic##ext<true, true, true>(args); \
+                        } else { \
+                            doBlitAlphaBlendLogic##ext<true, true, false>(args); \
+                        } \
+                    } else { \
+                        if (alphamod) { \
+                            doBlitAlphaBlendLogic##ext<true, false, true>(args); \
+                        } else { \
+                            doBlitAlphaBlendLogic##ext<true, false, false>(args); \
+                        } \
+                    } \
+                } \
+            } \
+        } \
+    }
+BLIT_FUNC(Generic)
+#ifdef SCUMMVM_NEON
+BLIT_FUNC(NEON)
+#endif
+#ifdef SCUMMVM_SSE2
+BLIT_FUNC(SSE2)
+#endif
+#ifdef SCUMMVM_AVX2
+BLIT_FUNC(AVX2)
+#endif
+
+} // end of namespace Graphics
diff --git a/graphics/module.mk b/graphics/module.mk
index e27c44785ed..bca2d3c6092 100644
--- a/graphics/module.mk
+++ b/graphics/module.mk
@@ -3,6 +3,7 @@ MODULE := graphics
 MODULE_OBJS := \
 	big5.o \
 	blit/blit.o \
+	blit/blit-blend.o \
 	blit/blit-alpha.o \
 	blit/blit-scale.o \
 	cursorman.o \


Commit: 09613997270bfe15510afe25e7a129d35259f170
    https://github.com/scummvm/scummvm/commit/09613997270bfe15510afe25e7a129d35259f170
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
ALL: blendBlitFrom prototype now matches TS::blit

Changed paths:
    engines/sword25/gfx/image/renderedimage.cpp
    graphics/managed_surface.cpp
    graphics/managed_surface.h
    test/image/blending.h


diff --git a/engines/sword25/gfx/image/renderedimage.cpp b/engines/sword25/gfx/image/renderedimage.cpp
index 43b13a1063b..7ea25f33fd1 100644
--- a/engines/sword25/gfx/image/renderedimage.cpp
+++ b/engines/sword25/gfx/image/renderedimage.cpp
@@ -235,14 +235,7 @@ bool RenderedImage::blit(int posX, int posY, int flipping, Common::Rect *pPartRe
 
 	if (width == -1) width = pPartRect ? pPartRect->width() : _surface.w;
 	if (height == -1) height = pPartRect ? pPartRect->height() : _surface.h;
-	//_surface.blit(*_backSurface, posX, posY, newFlipping, pPartRect, _surface.format.ARGBToColor(ca, cr, cg, cb), width, height);
-	_backSurface->blendBlitFrom(
-		_surface,
-		pPartRect ? *pPartRect : Common::Rect(0, 0, _surface.w, _surface.h),
-		Common::Rect(posX, posY, posX + width, posY + height),
-		newFlipping,
-		_surface.format.ARGBToColor(ca, cr, cg, cb)
-	);
+	_backSurface->blendBlitFrom(_surface, posX, posY, newFlipping, pPartRect, _surface.format.ARGBToColor(ca, cr, cg, cb), width, height);
 
 	return true;
 }
diff --git a/graphics/managed_surface.cpp b/graphics/managed_surface.cpp
index 1de03d75a8a..ceb6df94874 100644
--- a/graphics/managed_surface.cpp
+++ b/graphics/managed_surface.cpp
@@ -729,19 +729,28 @@ void ManagedSurface::transBlitFromInner(const Surface &src, const Common::Rect &
 
 #undef HANDLE_BLIT
 
-Common::Rect ManagedSurface::blendBlitFrom(const ManagedSurface &src, const Common::Rect &srcRect,
-										   const Common::Rect &destRect, int flipping,
-										   const uint32 colorMod,
+Common::Rect ManagedSurface::blendBlitFrom(const ManagedSurface &src,
+										   const int posX, const int posY,
+										   const int flipping,
+										   const Common::Rect *srcRect,
+										   const uint colorMod,
+										   const int width, const int height,
 										   const TSpriteBlendMode blend,
 										   const AlphaType alphaType) {
-	return blendBlitFrom(src.rawSurface(), srcRect, destRect, flipping, colorMod, blend, alphaType);
-}
-Common::Rect ManagedSurface::blendBlitFrom(const Surface &src, const Common::Rect &srcRect,
-										   const Common::Rect &destRect, int flipping,
-										   const uint32 colorMod,
+	return blendBlitFrom(src.rawSurface(), posX, posY, flipping, srcRect, colorMod, width, height, blend, alphaType);
+}
+Common::Rect ManagedSurface::blendBlitFrom(const Surface &src,
+										   const int posX, const int posY,
+										   const int flipping,
+										   const Common::Rect *srcRect,
+										   const uint colorMod,
+										   const int width, const int height,
 										   const TSpriteBlendMode blend,
 										   const AlphaType alphaType) {
-	Common::Rect srcArea = srcRect, dstArea = destRect;
+
+	Common::Rect dstArea(posX, posY, posX + (width == -1 ? src.w : width), posY + (height == -1 ? src.h : height));
+	Common::Rect srcArea = srcRect ? *srcRect : Common::Rect(0, 0, src.w, src.h);
+	
 	if (!isBlendBlitPixelFormatSupported(src.format, format)) {
 		warning("ManagedSurface::blendBlitFrom only accepts RGBA32!");
 		return Common::Rect(0, 0, 0, 0);
diff --git a/graphics/managed_surface.h b/graphics/managed_surface.h
index 1d1677f8cc6..4332dbf3d6f 100644
--- a/graphics/managed_surface.h
+++ b/graphics/managed_surface.h
@@ -538,22 +538,30 @@ public:
 	/**
 	 * @brief renders src onto this managed surface
 	 * @param src source surface
-	 * @param srcRect source clipping rectangle (used for sprite sheets for example)
-	 * @param destRect the destination of source onto this managed surface
+	 * @param posX, posY position on this surface at which src is drawn (top-left corner of the destination area)
 	 * @param flipping flipping flags (use Graphics::FLIP_FLAGS)
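+	 * @param srcRect source clipping rectangle, or nullptr to use the whole of src
+	 * @param width width of the destination area, or -1 to use the width of src
+	 * @param height height of the destination area, or -1 to use the height of src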
 	 * @param colorMod what color to multiply by (0xffffffff does nothing)
 	 * @param blend the blending mode to use.
 	 * @param alphaType what alpha mode to use. FULL is default
 	 * @return returns the size of the rendered rectangle
 	 */
-	Common::Rect blendBlitFrom(const ManagedSurface &src, const Common::Rect &srcRect,
-							   const Common::Rect &destRect, int flipping = FLIP_NONE,
-							   const uint32 colorMod = MS_ARGB(255, 255, 255, 255),
+	Common::Rect blendBlitFrom(const ManagedSurface &src,
+							   const int posX = 0, const int posY = 0,
+							   const int flipping = FLIP_NONE,
+							   const Common::Rect *srcRect = nullptr,
+							   const uint colorMod = MS_ARGB(255, 255, 255, 255),
+							   const int width = -1, const int height = -1,
 							   const TSpriteBlendMode blend = BLEND_NORMAL,
 							   const AlphaType alphaType = ALPHA_FULL);
-	Common::Rect blendBlitFrom(const Surface &src, const Common::Rect &srcRect,
-							   const Common::Rect &destRect, int flipping = FLIP_NONE,
-							   const uint32 colorMod = MS_ARGB(255, 255, 255, 255),
+	Common::Rect blendBlitFrom(const Surface &src,
+							   const int posX = 0, const int posY = 0,
+							   const int flipping = FLIP_NONE,
+							   const Common::Rect *srcRect = nullptr,
+							   const uint colorMod = MS_ARGB(255, 255, 255, 255),
+							   const int width = -1, const int height = -1,
 							   const TSpriteBlendMode blend = BLEND_NORMAL,
 							   const AlphaType alphaType = ALPHA_FULL);
 
diff --git a/test/image/blending.h b/test/image/blending.h
index 93606accbe7..25f3e59865b 100644
--- a/test/image/blending.h
+++ b/test/image/blending.h
@@ -839,7 +839,7 @@ public:
 			oldTime += g_system->getMillis() - oldStart;
 			uint32 newStart = g_system->getMillis();
 			for (int i = 0; i < iters; i++) {
-            	managedSurfDest.blendBlitFrom(managedSurf, Common::Rect(0, 0, managedSurf.w, managedSurf.h), Common::Rect(0, 0, managedSurf.w, managedSurf.h), flipping, color, (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
+				managedSurfDest.blendBlitFrom(managedSurf, 0, 0, flipping, nullptr, color, -1, -1, (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
 			}
 			newTime += g_system->getMillis() - newStart;
             managedSurfDest.fillRect(Common::Rect(0, 0, managedSurfDest.w, managedSurfDest.h), managedSurfDest.format.ARGBToColor(255, 255, 255, 255));
@@ -847,7 +847,7 @@ public:
 			Graphics::BlendBlit::blitFunc = Graphics::BlendBlit::blitGeneric;
 			uint32 genericStart = g_system->getMillis();
 			for (int i = 0; i < iters; i++) {
-            	managedSurfDest.blendBlitFrom(managedSurf, Common::Rect(0, 0, managedSurf.w, managedSurf.h), Common::Rect(0, 0, managedSurf.w, managedSurf.h), flipping, color, (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
+				managedSurfDest.blendBlitFrom(managedSurf, 0, 0, flipping, nullptr, color, -1, -1, (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
 			}
 			Graphics::BlendBlit::blitFunc = oldFunc;
 			genericTime += g_system->getMillis() - genericStart;
@@ -864,14 +864,14 @@ public:
 			oldTimeScaled += g_system->getMillis() - oldStart;
 			newStart = g_system->getMillis();
 			for (int i = 0; i < iters; i++) {
-            	managedSurfDest.blendBlitFrom(managedSurf, Common::Rect(0, 0, managedSurfDest.w, managedSurfDest.h), Common::Rect(0, 0, managedSurf.w, managedSurf.h), flipping, color, (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
+				managedSurfDest.blendBlitFrom(managedSurf, 0, 0, flipping, nullptr, color, managedSurfDest.w, managedSurfDest.h, (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
 			}
 			newTimeScaled += g_system->getMillis() - newStart;
             managedSurfDest.fillRect(Common::Rect(0, 0, managedSurfDest.w, managedSurfDest.h), managedSurfDest.format.ARGBToColor(255, 255, 255, 255));
 			Graphics::BlendBlit::blitFunc = Graphics::BlendBlit::blitGeneric;
 			genericStart = g_system->getMillis();
 			for (int i = 0; i < iters; i++) {
-            	managedSurfDest.blendBlitFrom(managedSurf, Common::Rect(0, 0, managedSurfDest.w, managedSurfDest.h), Common::Rect(0, 0, managedSurf.w, managedSurf.h), flipping, color, (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
+				managedSurfDest.blendBlitFrom(managedSurf, 0, 0, flipping, nullptr, color, managedSurfDest.w, managedSurfDest.h, (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
 			}
 			Graphics::BlendBlit::blitFunc = oldFunc;
 			genericTimeScaled += g_system->getMillis() - genericStart;
@@ -969,7 +969,7 @@ public:
             newSurf.setAlphaMode((Graphics::AlphaType)alphaType);
             Common::Rect ret2 = newSurf.blit(newSurfDest, dsts[rect].left, dsts[rect].top, flipping, &srcs[rect], MS_ARGB(a, r, g, b), dsts[rect].width(), dsts[rect].height(), (Graphics::TSpriteBlendMode)blendMode);
             managedSurfDest.fillRect(Common::Rect(0, 0, managedSurfDest.w, managedSurfDest.h), managedSurfDest.format.ARGBToColor(ba, br, bg, bb));
-            Common::Rect ret3 = managedSurfDest.blendBlitFrom(managedSurf, srcs[rect], dsts[rect], flipping, MS_ARGB(a, r, g, b), (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
+            Common::Rect ret3 = managedSurfDest.blendBlitFrom(managedSurf, dsts[rect].left, dsts[rect].top, flipping, &srcs[rect], MS_ARGB(a, r, g, b), dsts[rect].width(), dsts[rect].height(), (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
 
 			if (ret1 != ret2 || ret2 != ret3 || ret1 != ret3) {
                 warning("blendMode: %s, alphaType: %s, a: %d, r: %d, g: %d, b: %d, flipping: %s, test rect id: %s",


Commit: c45918ea91f5b58ce9abc85b8ac61c863ff6de44
    https://github.com/scummvm/scummvm/commit/c45918ea91f5b58ce9abc85b8ac61c863ff6de44
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
BUILD: SIMD flags only enabled on blit-blend.o

Changed paths:
    configure
    graphics/module.mk


diff --git a/configure b/configure
index b680598116b..c1a0478da4a 100755
--- a/configure
+++ b/configure
@@ -6916,25 +6916,22 @@ case $_host_cpu in
 esac
 
 define_in_config_if_yes "$_ext_sse2" 'SCUMMVM_SSE2'
-if test "$_ext_sse2" = yes ; then
-	append_var CXXFLAGS "-msse2 -msse"
-fi
 echo_n "Enabling x86/64 SSE2... "
 echo "$_ext_sse2"
 define_in_config_if_yes "$_ext_avx2" 'SCUMMVM_AVX2'
 if test "$_ext_avx2" = yes ; then
-	append_var CXXFLAGS "-mavx2 -mavx -msse2 -msse"
-	define_in_config_if_yes "$_ext_avx2" 'SCUMMVM_SSE2'
+	define_in_config_h_if_yes "$_ext_avx2" 'SCUMMVM_SSE2'
 fi
 echo_n "Enabling x86/64 AVX2 and SSE2... "
 echo "$_ext_avx2"
-define_in_config_if_yes "$_ext_neon" 'SCUMMVM_NEON'
 # AArch64 might by default come with more fpu extensions, so we wouldn't want
 # to downgrade. Almost all armv7 cpus have neon or less in terms of fpu
 # extensions so setting fpu to neon is almost always an upgrade over defaults.
 # Not to mention it would have to be included anyways
-if ( test "$_ext_neon" = yes ) && ( test "$_host_cpu" != aarch64 ) ; then
-	append_var CXXFLAGS "-mfpu=neon"
+if ( test "$_host_cpu" != aarch64 ) ; then
+	define_in_config_h_if_yes "$_ext_neon" 'SCUMMVM_NEON'
+else
+	define_in_config_if_yes "$_ext_neon" 'SCUMMVM_NEON'
 fi
 echo_n "Enabling arm NEON... "
 echo "$_ext_neon"
diff --git a/graphics/module.mk b/graphics/module.mk
index bca2d3c6092..e97744871de 100644
--- a/graphics/module.mk
+++ b/graphics/module.mk
@@ -140,5 +140,15 @@ endif
 
 endif
 
+ifeq ($(SCUMMVM_NEON),1)
+$(MODULE)/blit/blit-blend.o: CXXFLAGS += -mfpu=neon
+endif
+ifeq ($(SCUMMVM_SSE2),1)
+$(MODULE)/blit/blit-blend.o: CXXFLAGS += -msse2 -msse
+endif
+ifeq ($(SCUMMVM_AVX2),1)
+$(MODULE)/blit/blit-blend.o: CXXFLAGS += -mavx2 -mavx -msse2 -msse
+endif
+
 # Include common rules
 include $(srcdir)/rules.mk


Commit: f802ad16d05356386ed4f2a6a1ba40e44d79c081
    https://github.com/scummvm/scummvm/commit/f802ad16d05356386ed4f2a6a1ba40e44d79c081
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
BUILD: Simplify SIMD options

Changed paths:
    configure
    graphics/module.mk


diff --git a/configure b/configure
index c1a0478da4a..e947db2ad45 100755
--- a/configure
+++ b/configure
@@ -6919,20 +6919,9 @@ define_in_config_if_yes "$_ext_sse2" 'SCUMMVM_SSE2'
 echo_n "Enabling x86/64 SSE2... "
 echo "$_ext_sse2"
 define_in_config_if_yes "$_ext_avx2" 'SCUMMVM_AVX2'
-if test "$_ext_avx2" = yes ; then
-	define_in_config_h_if_yes "$_ext_avx2" 'SCUMMVM_SSE2'
-fi
 echo_n "Enabling x86/64 AVX2 and SSE2... "
 echo "$_ext_avx2"
-# AArch64 might by default come with more fpu extensions, so we wouldn't want
-# to downgrade. Almost all armv7 cpus have neon or less in terms of fpu
-# extensions so setting fpu to neon is almost always an upgrade over defaults.
-# Not to mention it would have to be included anyways
-if ( test "$_host_cpu" != aarch64 ) ; then
-	define_in_config_h_if_yes "$_ext_neon" 'SCUMMVM_NEON'
-else
-	define_in_config_if_yes "$_ext_neon" 'SCUMMVM_NEON'
-fi
+define_in_config_if_yes "$_ext_neon" 'SCUMMVM_NEON'
 echo_n "Enabling arm NEON... "
 echo "$_ext_neon"
 
diff --git a/graphics/module.mk b/graphics/module.mk
index e97744871de..c31c22e9875 100644
--- a/graphics/module.mk
+++ b/graphics/module.mk
@@ -144,10 +144,10 @@ ifeq ($(SCUMMVM_NEON),1)
 $(MODULE)/blit/blit-blend.o: CXXFLAGS += -mfpu=neon
 endif
 ifeq ($(SCUMMVM_SSE2),1)
-$(MODULE)/blit/blit-blend.o: CXXFLAGS += -msse2 -msse
+$(MODULE)/blit/blit-blend.o: CXXFLAGS += -msse2
 endif
 ifeq ($(SCUMMVM_AVX2),1)
-$(MODULE)/blit/blit-blend.o: CXXFLAGS += -mavx2 -mavx -msse2 -msse
+$(MODULE)/blit/blit-blend.o: CXXFLAGS += -mavx2
 endif
 
 # Include common rules


Commit: 0597770654faa995b2bf8304cda3f7f851dfe494
    https://github.com/scummvm/scummvm/commit/0597770654faa995b2bf8304cda3f7f851dfe494
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
GRAPHICS: Refactor BlendBlit

Changed paths:
  A graphics/blit/blit-avx2.cpp
  A graphics/blit/blit-neon.cpp
  A graphics/blit/blit-sse2.cpp
  R graphics/blit/blit-blend-avx2.h
  R graphics/blit/blit-blend-neon.h
  R graphics/blit/blit-blend-normal.h
  R graphics/blit/blit-blend-sse2.h
  R graphics/blit/blit-blend.cpp
    graphics/blit.h
    graphics/blit/blit-alpha.cpp
    graphics/module.mk


diff --git a/graphics/blit.h b/graphics/blit.h
index 85bae19d5bb..80f0a98b44d 100644
--- a/graphics/blit.h
+++ b/graphics/blit.h
@@ -218,33 +218,16 @@ private:
 			 const uint32 colorMod, const uint flipping);
 	};
 
-// Define logic functions for different architecture extensions.
-// These extensions would just be a template parameter if it weren't for the
-// fact that partial template specialization doesn't exist.
-#define LOGIC_FUNCS_EXT(ext) \
-	template<bool doscale> \
-	static void doBlitBinaryBlendLogic##ext(Args &args); \
-	template<bool doscale> \
-	static void doBlitOpaqueBlendLogic##ext(Args &args); \
-	template<bool doscale, bool rgbmod, bool alphamod> \
-	static void doBlitMultiplyBlendLogic##ext(Args &args); \
-	template<bool doscale, bool rgbmod> \
-	static void doBlitSubtractiveBlendLogic##ext(Args &args); \
-	template<bool doscale, bool rgbmod, bool alphamod> \
-	static void doBlitAdditiveBlendLogic##ext(Args &args); \
-	template<bool doscale, bool rgbmod, bool alphamod> \
-	static void doBlitAlphaBlendLogic##ext(Args &args); \
-	static void blit##ext(Args &args, const TSpriteBlendMode &blendMode, const AlphaType &alphaType);
 #ifdef SCUMMVM_NEON
-LOGIC_FUNCS_EXT(NEON)
+	static void blitNEON(Args &args, const TSpriteBlendMode &blendMode, const AlphaType &alphaType);
 #endif
 #ifdef SCUMMVM_SSE2
-LOGIC_FUNCS_EXT(SSE2)
+	static void blitSSE2(Args &args, const TSpriteBlendMode &blendMode, const AlphaType &alphaType);
 #endif
 #ifdef SCUMMVM_AVX2
-LOGIC_FUNCS_EXT(AVX2)
+	static void blitAVX2(Args &args, const TSpriteBlendMode &blendMode, const AlphaType &alphaType);
 #endif
-LOGIC_FUNCS_EXT(Generic)
+	static void blitGeneric(Args &args, const TSpriteBlendMode &blendMode, const AlphaType &alphaType);
 #undef LOGIC_FUNCS_EXT
 
 	typedef void(*BlitFunc)(Args &, const TSpriteBlendMode &, const AlphaType &);
diff --git a/graphics/blit/blit-alpha.cpp b/graphics/blit/blit-alpha.cpp
index bb82b13d68c..75ecde97159 100644
--- a/graphics/blit/blit-alpha.cpp
+++ b/graphics/blit/blit-alpha.cpp
@@ -19,6 +19,7 @@
  *
  */
 
+#include "common/system.h"
 #include "graphics/blit.h"
 #include "graphics/pixelformat.h"
 
@@ -167,4 +168,501 @@ bool setAlpha(byte *dst, const byte *src,
     return true;
 }
 
+BlendBlit::Args::Args(byte *dst, const byte *src,
+    const uint _dstPitch, const uint _srcPitch,
+    const int posX, const int posY,
+    const uint _width, const uint _height,
+    const int _scaleX, const int _scaleY,
+    const int scaleXsrcOff, const int scaleYsrcOff,
+    const uint32 colorMod, const uint _flipping) :
+        xp(0), yp(0), dstPitch(_dstPitch),
+        width(_width), height(_height), color(colorMod),
+        scaleX(_scaleX), scaleY(_scaleY), flipping(_flipping),
+        scaleXoff(scaleXsrcOff), scaleYoff(scaleYsrcOff) {
+    bool doScale = scaleX != SCALE_THRESHOLD || scaleY != SCALE_THRESHOLD;
+    
+    rgbmod   = ((colorMod & kRGBModMask) != kRGBModMask);
+    alphamod = ((colorMod & kAModMask)   != kAModMask);
+    inStep = 4;
+    inoStep = _srcPitch;
+    if (flipping & FLIP_H) {
+        inStep = -inStep;
+        xp = width - 1;
+        if (doScale) xp = xp * scaleX / SCALE_THRESHOLD;
+    }
+
+    if (flipping & FLIP_V) {
+        inoStep = -inoStep;
+        yp = height - 1;
+        if (doScale) yp = yp * scaleY / SCALE_THRESHOLD;
+    }
+
+    ino = src + yp * _srcPitch + xp * 4;
+    outo = dst + posY * _dstPitch + posX * 4;
+}
+
+// Starts out as nullptr; BlendBlit::blit() selects the implementation on its first call
+BlendBlit::BlitFunc BlendBlit::blitFunc = nullptr;
+
+// Only blits to and from 32bpp images.
+// This function is just here to jump to whatever function is stored in
+// BlendBlit::blitFunc. This way, we can detect at runtime whether or not
+// the CPU has a certain SIMD feature enabled.
+void BlendBlit::blit(byte *dst, const byte *src,
+                     const uint dstPitch, const uint srcPitch,
+                     const int posX, const int posY,
+                     const uint width, const uint height,
+                     const int scaleX, const int scaleY,
+                     const int scaleXsrcOff, const int scaleYsrcOff,
+                     const uint32 colorMod, const uint flipping,
+                     const TSpriteBlendMode blendMode,
+                     const AlphaType alphaType) {
+    if (width == 0 || height == 0) return;
+
+    // If no function has been selected yet, detect and select
+    if (!blitFunc) {
+        // Get the correct blit function
+        blitFunc = blitGeneric;
+#ifdef SCUMMVM_NEON
+        if (g_system->hasFeature(OSystem::kFeatureCpuNEON)) blitFunc = blitNEON;
+#endif
+#ifdef SCUMMVM_SSE2
+        if (g_system->hasFeature(OSystem::kFeatureCpuSSE2)) blitFunc = blitSSE2;
+#endif
+#ifdef SCUMMVM_AVX2
+        if (g_system->hasFeature(OSystem::kFeatureCpuAVX2)) blitFunc = blitAVX2;
+#endif
+    }
+    
+    Args args(dst, src, dstPitch, srcPitch, posX, posY, width, height, scaleX, scaleY, scaleXsrcOff, scaleYsrcOff, colorMod, flipping);
+    blitFunc(args, blendMode, alphaType);
+}
+
+class BlendBlitImpl {
+public:
+/**
+ * Optimized version of doBlit to be used with multiply blended blitting
+ */
+template<bool doscale, bool rgbmod, bool alphamod>
+static void doBlitMultiplyBlendLogicGeneric(BlendBlit::Args &args) {
+    const byte *in;
+    byte *out;
+
+    int scaleXCtr, scaleYCtr = args.scaleYoff;
+    const byte *inBase;
+
+    const byte rawcr = (args.color >> BlendBlit::kRModShift) & 0xFF;
+    const byte rawcg = (args.color >> BlendBlit::kGModShift) & 0xFF;
+    const byte rawcb = (args.color >> BlendBlit::kBModShift) & 0xFF;
+    const byte ca = alphamod ? ((args.color >> BlendBlit::kAModShift) & 0xFF) : 255;
+    const uint32 cr = rgbmod   ? (rawcr == 255 ? 256 : rawcr) : 256;
+    const uint32 cg = rgbmod   ? (rawcg == 255 ? 256 : rawcg) : 256;
+    const uint32 cb = rgbmod   ? (rawcb == 255 ? 256 : rawcb) : 256;
+
+    for (uint32 i = 0; i < args.height; i++) {
+        if (doscale) {
+            inBase = args.ino + scaleYCtr / BlendBlit::SCALE_THRESHOLD * args.inoStep;
+            scaleXCtr = args.scaleXoff;
+        } else {
+            in = args.ino;
+        }
+        out = args.outo;
+        for (uint32 j = 0; j < args.width; j++) {
+            if (doscale) {
+                in = inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep;
+            }
+
+            uint32 ina = in[BlendBlit::kAIndex] * ca >> 8;
+
+            if (ina != 0) {
+                out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] * ((in[BlendBlit::kBIndex] * cb * ina) >> 16) >> 8;
+                out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] * ((in[BlendBlit::kGIndex] * cg * ina) >> 16) >> 8;
+                out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] * ((in[BlendBlit::kRIndex] * cr * ina) >> 16) >> 8;
+            }
+
+            if (doscale)
+                scaleXCtr += args.scaleX;
+            else
+                in += args.inStep;
+            out += 4;
+        }
+        if (doscale)
+            scaleYCtr += args.scaleY;
+        else
+            args.ino += args.inoStep;
+        args.outo += args.dstPitch;
+    }
+
+}
+
+template<bool doscale, bool rgbmod, bool alphamod>
+static void doBlitAlphaBlendLogicGeneric(BlendBlit::Args &args) {
+    const byte *in;
+    byte *out;
+
+    int scaleXCtr, scaleYCtr = args.scaleYoff;
+    const byte *inBase;
+
+    const byte ca = alphamod ? ((args.color >> BlendBlit::kAModShift) & 0xFF) : 255;
+    const byte cr = rgbmod   ? ((args.color >> BlendBlit::kRModShift) & 0xFF) : 255;
+    const byte cg = rgbmod   ? ((args.color >> BlendBlit::kGModShift) & 0xFF) : 255;
+    const byte cb = rgbmod   ? ((args.color >> BlendBlit::kBModShift) & 0xFF) : 255;
+
+    for (uint32 i = 0; i < args.height; i++) {
+        if (doscale) {
+            inBase = args.ino + scaleYCtr / BlendBlit::SCALE_THRESHOLD * args.inoStep;
+            scaleXCtr = args.scaleXoff;
+        } else {
+            in = args.ino;
+        }
+        out = args.outo;
+        for (uint32 j = 0; j < args.width; j++) {
+            if (doscale) {
+                in = inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep;
+            }
+
+            uint32 ina = in[BlendBlit::kAIndex] * ca >> 8;
+
+            if (ina != 0) {
+                if (rgbmod) {
+                    const uint outb = (out[BlendBlit::kBIndex] * (255 - ina) >> 8);
+                    const uint outg = (out[BlendBlit::kGIndex] * (255 - ina) >> 8);
+                    const uint outr = (out[BlendBlit::kRIndex] * (255 - ina) >> 8);
+
+                    out[BlendBlit::kAIndex] = 255;
+                    out[BlendBlit::kBIndex] = outb + (in[BlendBlit::kBIndex] * ina * cb >> 16);
+                    out[BlendBlit::kGIndex] = outg + (in[BlendBlit::kGIndex] * ina * cg >> 16);
+                    out[BlendBlit::kRIndex] = outr + (in[BlendBlit::kRIndex] * ina * cr >> 16);
+                } else {
+                    out[BlendBlit::kAIndex] = 255;
+                    out[BlendBlit::kBIndex] = (out[BlendBlit::kBIndex] * (255 - ina) + in[BlendBlit::kBIndex] * ina) >> 8;
+                    out[BlendBlit::kGIndex] = (out[BlendBlit::kGIndex] * (255 - ina) + in[BlendBlit::kGIndex] * ina) >> 8;
+                    out[BlendBlit::kRIndex] = (out[BlendBlit::kRIndex] * (255 - ina) + in[BlendBlit::kRIndex] * ina) >> 8;
+                    
+                }
+            }
+
+            if (doscale)
+                scaleXCtr += args.scaleX;
+            else
+                in += args.inStep;
+            out += 4;
+        }
+
+        if (doscale)
+            scaleYCtr += args.scaleY;
+        else
+            args.ino += args.inoStep;
+        args.outo += args.dstPitch;
+    }
+}
+
+/**
+ * Optimized version of doBlit to be used with subtractive blended blitting
+ */
+template<bool doscale, bool rgbmod>
+static void doBlitSubtractiveBlendLogicGeneric(BlendBlit::Args &args) {
+    const byte *in;
+    byte *out;
+
+    int scaleXCtr, scaleYCtr = args.scaleYoff;
+    const byte *inBase;
+
+    const byte rawcr = (args.color >> BlendBlit::kRModShift) & 0xFF;
+    const byte rawcg = (args.color >> BlendBlit::kGModShift) & 0xFF;
+    const byte rawcb = (args.color >> BlendBlit::kBModShift) & 0xFF;
+    const uint32 cr = rgbmod   ? (rawcr == 255 ? 256 : rawcr) : 256;
+    const uint32 cg = rgbmod   ? (rawcg == 255 ? 256 : rawcg) : 256;
+    const uint32 cb = rgbmod   ? (rawcb == 255 ? 256 : rawcb) : 256;
+
+    for (uint32 i = 0; i < args.height; i++) {
+        if (doscale) {
+            inBase = args.ino + scaleYCtr / BlendBlit::SCALE_THRESHOLD * args.inoStep;
+            scaleXCtr = args.scaleXoff;
+        } else {
+            in = args.ino;
+        }
+        out = args.outo;
+        for (uint32 j = 0; j < args.width; j++) {
+            if (doscale) {
+                in = inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep;
+            }
+
+            out[BlendBlit::kAIndex] = 255;
+            out[BlendBlit::kBIndex] = MAX<int32>(out[BlendBlit::kBIndex] - ((in[BlendBlit::kBIndex] * cb  * (out[BlendBlit::kBIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
+            out[BlendBlit::kGIndex] = MAX<int32>(out[BlendBlit::kGIndex] - ((in[BlendBlit::kGIndex] * cg  * (out[BlendBlit::kGIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
+            out[BlendBlit::kRIndex] = MAX<int32>(out[BlendBlit::kRIndex] - ((in[BlendBlit::kRIndex] * cr * (out[BlendBlit::kRIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
+
+            if (doscale)
+                scaleXCtr += args.scaleX;
+            else
+                in += args.inStep;
+            out += 4;
+        }
+        if (doscale)
+            scaleYCtr += args.scaleY;
+        else
+            args.ino += args.inoStep;
+        args.outo += args.dstPitch;
+    }
+}
+
+/**
+ * Optimized version of doBlit to be used with additive blended blitting
+ */
+template<bool doscale, bool rgbmod, bool alphamod>
+static void doBlitAdditiveBlendLogicGeneric(BlendBlit::Args &args) {
+    const byte *in;
+    byte *out;
+
+    int scaleXCtr, scaleYCtr = args.scaleYoff;
+    const byte *inBase;
+
+    const byte rawcr = (args.color >> BlendBlit::kRModShift) & 0xFF;
+    const byte rawcg = (args.color >> BlendBlit::kGModShift) & 0xFF;
+    const byte rawcb = (args.color >> BlendBlit::kBModShift) & 0xFF;
+    const byte ca = alphamod ? ((args.color >> BlendBlit::kAModShift) & 0xFF) : 255;
+    const uint32 cr = rgbmod   ? (rawcr == 255 ? 256 : rawcr) : 256;
+    const uint32 cg = rgbmod   ? (rawcg == 255 ? 256 : rawcg) : 256;
+    const uint32 cb = rgbmod   ? (rawcb == 255 ? 256 : rawcb) : 256;
+
+    for (uint32 i = 0; i < args.height; i++) {
+        if (doscale) {
+            inBase = args.ino + scaleYCtr / BlendBlit::SCALE_THRESHOLD * args.inoStep;
+            scaleXCtr = args.scaleXoff;
+        } else {
+            in = args.ino;
+        }
+        out = args.outo;
+        for (uint32 j = 0; j < args.width; j++) {
+            if (doscale) {
+                in = inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep;
+            }
+
+            uint32 ina = in[BlendBlit::kAIndex] * ca >> 8;
+
+            if (ina != 0) {
+                out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] + ((in[BlendBlit::kBIndex] * cb * ina) >> 16);
+                out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] + ((in[BlendBlit::kGIndex] * cg * ina) >> 16);
+                out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] + ((in[BlendBlit::kRIndex] * cr * ina) >> 16);
+            }
+
+            if (doscale)
+                scaleXCtr += args.scaleX;
+            else
+                in += args.inStep;
+            out += 4;
+        }
+
+        if (doscale)
+            scaleYCtr += args.scaleY;
+        else
+            args.ino += args.inoStep;
+        args.outo += args.dstPitch;
+    }
+}
+
+template<bool doscale>
+static void doBlitOpaqueBlendLogicGeneric(BlendBlit::Args &args) {
+    const byte *in;
+    byte *out;
+
+    int scaleXCtr, scaleYCtr = args.scaleYoff;
+    const byte *inBase;
+
+    for (uint32 i = 0; i < args.height; i++) {
+        if (doscale) {
+            inBase = args.ino + (scaleYCtr + 1) / BlendBlit::SCALE_THRESHOLD * args.inoStep;
+            scaleXCtr = args.scaleXoff;
+        } else {
+            in = args.ino;
+        }
+        out = args.outo;
+
+        if (doscale) {
+            for (uint32 j = 0; j < args.width; j++) {
+                in = inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep;
+                *(uint32 *)out = *(const uint32 *)in | BlendBlit::kAModMask;
+                scaleXCtr += args.scaleX;
+                out += 4;
+            }
+        } else {
+            for (uint32 j = 0; j < args.width; j++) {
+                *(uint32 *)out = *(const uint32 *)in | BlendBlit::kAModMask;
+                in += args.inStep;
+                out += 4;
+            }
+        }
+
+        if (doscale)
+            scaleYCtr += args.scaleY;
+        else
+            args.ino += args.inoStep;
+        args.outo += args.dstPitch;
+    }
+}
+
+template<bool doscale>
+static void doBlitBinaryBlendLogicGeneric(BlendBlit::Args &args) {
+    const byte *in;
+    byte *out;
+
+    int scaleXCtr, scaleYCtr = args.scaleYoff;
+    const byte *inBase;
+
+    for (uint32 i = 0; i < args.height; i++) {
+        if (doscale) {
+            inBase = args.ino + scaleYCtr / BlendBlit::SCALE_THRESHOLD * args.inoStep;
+            scaleXCtr = args.scaleXoff;
+        } else {
+            in = args.ino;
+        }
+        out = args.outo;
+        for (uint32 j = 0; j < args.width; j++) {
+            if (doscale) {
+                in = inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep;
+            }
+
+            uint32 pix = *(const uint32 *)in, pixout = *(const uint32 *)out;
+            uint32 mask = (pix & BlendBlit::kAModMask) ? 0xffffffff : 0;
+            pixout &= ~mask;
+            pix = (pix | BlendBlit::kAModMask) & mask;
+            *(uint32 *)out = pixout | pix;
+            
+            if (doscale)
+                scaleXCtr += args.scaleX;
+            else
+                in += args.inStep;
+            out += 4;
+        }
+        if (doscale)
+            scaleYCtr += args.scaleY;
+        else
+            args.ino += args.inoStep;
+        args.outo += args.dstPitch;
+    }
+}
+
+}; // end of class BlendBlitImpl
+
+void BlendBlit::blitGeneric(Args &args, const TSpriteBlendMode &blendMode, const AlphaType &alphaType) {
+    bool rgbmod   = ((args.color & kRGBModMask) != kRGBModMask);
+    bool alphamod = ((args.color & kAModMask)   != kAModMask);
+    if (args.scaleX == BlendBlit::SCALE_THRESHOLD && args.scaleY == BlendBlit::SCALE_THRESHOLD) {
+        if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_OPAQUE) {
+            BlendBlitImpl::doBlitOpaqueBlendLogicGeneric<false>(args);
+        } else if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_BINARY) {
+            BlendBlitImpl::doBlitBinaryBlendLogicGeneric<false>(args);
+        } else {
+            if (blendMode == BLEND_ADDITIVE) {
+                if (rgbmod) {
+                    if (alphamod) {
+                        BlendBlitImpl::doBlitAdditiveBlendLogicGeneric<false, true, true>(args);
+                    } else {
+                        BlendBlitImpl::doBlitAdditiveBlendLogicGeneric<false, true, false>(args);
+                    }
+                } else {
+                    if (alphamod) {
+                        BlendBlitImpl::doBlitAdditiveBlendLogicGeneric<false, false, true>(args);
+                    } else {
+                        BlendBlitImpl::doBlitAdditiveBlendLogicGeneric<false, false, false>(args);
+                    }
+                }
+            } else if (blendMode == BLEND_SUBTRACTIVE) {
+                if (rgbmod) {
+                    BlendBlitImpl::doBlitSubtractiveBlendLogicGeneric<false, true>(args);
+                } else {
+                    BlendBlitImpl::doBlitSubtractiveBlendLogicGeneric<false, false>(args);
+                }
+            } else if (blendMode == BLEND_MULTIPLY) {
+                if (rgbmod) {
+                    if (alphamod) {
+                        BlendBlitImpl::doBlitMultiplyBlendLogicGeneric<false, true, true>(args);
+                    } else {
+                        BlendBlitImpl::doBlitMultiplyBlendLogicGeneric<false, true, false>(args);
+                    }
+                } else {
+                    if (alphamod) {
+                        BlendBlitImpl::doBlitMultiplyBlendLogicGeneric<false, false, true>(args);
+                    } else {
+                        BlendBlitImpl::doBlitMultiplyBlendLogicGeneric<false, false, false>(args);
+                    }
+                }
+            } else {
+                assert(blendMode == BLEND_NORMAL);
+                if (rgbmod) {
+                    if (alphamod) {
+                        BlendBlitImpl::doBlitAlphaBlendLogicGeneric<false, true, true>(args);
+                    } else {
+                        BlendBlitImpl::doBlitAlphaBlendLogicGeneric<false, true, false>(args);
+                    }
+                } else {
+                    if (alphamod) {
+                        BlendBlitImpl::doBlitAlphaBlendLogicGeneric<false, false, true>(args);
+                    } else {
+                        BlendBlitImpl::doBlitAlphaBlendLogicGeneric<false, false, false>(args);
+                    }
+                }
+            }
+        }
+    } else {
+        if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_OPAQUE) {
+            BlendBlitImpl::doBlitOpaqueBlendLogicGeneric<true>(args);
+        } else if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_BINARY) {
+            BlendBlitImpl::doBlitBinaryBlendLogicGeneric<true>(args);
+        } else {
+            if (blendMode == BLEND_ADDITIVE) {
+                if (rgbmod) {
+                    if (alphamod) {
+                        BlendBlitImpl::doBlitAdditiveBlendLogicGeneric<true, true, true>(args);
+                    } else {
+                        BlendBlitImpl::doBlitAdditiveBlendLogicGeneric<true, true, false>(args);
+                    }
+                } else {
+                    if (alphamod) {
+                        BlendBlitImpl::doBlitAdditiveBlendLogicGeneric<true, false, true>(args);
+                    } else {
+                        BlendBlitImpl::doBlitAdditiveBlendLogicGeneric<true, false, false>(args);
+                    }
+                }
+            } else if (blendMode == BLEND_SUBTRACTIVE) {
+                if (rgbmod) {
+                    BlendBlitImpl::doBlitSubtractiveBlendLogicGeneric<true, true>(args);
+                } else {
+                    BlendBlitImpl::doBlitSubtractiveBlendLogicGeneric<true, false>(args);
+                }
+            } else if (blendMode == BLEND_MULTIPLY) {
+                if (rgbmod) {
+                    if (alphamod) {
+                        BlendBlitImpl::doBlitMultiplyBlendLogicGeneric<true, true, true>(args);
+                    } else {
+                        BlendBlitImpl::doBlitMultiplyBlendLogicGeneric<true, true, false>(args);
+                    }
+                } else {
+                    if (alphamod) {
+                        BlendBlitImpl::doBlitMultiplyBlendLogicGeneric<true, false, true>(args);
+                    } else {
+                        BlendBlitImpl::doBlitMultiplyBlendLogicGeneric<true, false, false>(args);
+                    }
+                }
+            } else {
+                assert(blendMode == BLEND_NORMAL);
+                if (rgbmod) {
+                    if (alphamod) {
+                        BlendBlitImpl::doBlitAlphaBlendLogicGeneric<true, true, true>(args);
+                    } else {
+                        BlendBlitImpl::doBlitAlphaBlendLogicGeneric<true, true, false>(args);
+                    }
+                } else {
+                    if (alphamod) {
+                        BlendBlitImpl::doBlitAlphaBlendLogicGeneric<true, false, true>(args);
+                    } else {
+                        BlendBlitImpl::doBlitAlphaBlendLogicGeneric<true, false, false>(args);
+                    }
+                }
+            }
+        }
+    }
+}
+
 } // End of namespace Graphics
diff --git a/graphics/blit/blit-blend-avx2.h b/graphics/blit/blit-avx2.cpp
similarity index 77%
rename from graphics/blit/blit-blend-avx2.h
rename to graphics/blit/blit-avx2.cpp
index d22d7ef4763..452c2f32808 100644
--- a/graphics/blit/blit-blend-avx2.h
+++ b/graphics/blit/blit-avx2.cpp
@@ -19,9 +19,7 @@
  *
  */
 
-#ifndef GRAPHICS_BLIT_BLIT_BLEND_AVX2_H
-#define GRAPHICS_BLIT_BLIT_BLEND_AVX2_H
-
+#include "common/scummsys.h"
 #ifdef SCUMMVM_AVX2
 #include <immintrin.h>
 
@@ -31,7 +29,7 @@
 namespace Graphics {
 
 template<bool doscale, bool rgbmod, bool alphamod>
-struct AlphaBlendAVX2 {
+struct AlphaBlend {
     static inline __m256i simd(__m256i src, __m256i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
         __m256i ina;
         if (alphamod)
@@ -93,7 +91,7 @@ struct AlphaBlendAVX2 {
 };
 
 template<bool doscale, bool rgbmod, bool alphamod>
-struct MultiplyBlendAVX2 {
+struct MultiplyBlend {
     static inline __m256i simd(__m256i src, __m256i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
         __m256i ina;
         if (alphamod)
@@ -144,7 +142,7 @@ struct MultiplyBlendAVX2 {
 };
 
 template<bool doscale, bool rgbmod, bool alphamod>
-struct OpaqueBlendAVX2 {
+struct OpaqueBlend {
     static inline __m256i simd(__m256i src, __m256i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
         return _mm256_or_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
     }
@@ -155,7 +153,7 @@ struct OpaqueBlendAVX2 {
 };
 
 template<bool doscale, bool rgbmod, bool alphamod>
-struct BinaryBlendAVX2 {
+struct BinaryBlend {
     static inline __m256i simd(__m256i src, __m256i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
         __m256i alphaMask = _mm256_cmpeq_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask)), _mm256_setzero_si256());
         dst = _mm256_and_si256(dst, alphaMask);
@@ -175,7 +173,7 @@ struct BinaryBlendAVX2 {
 };
 
 template<bool doscale, bool rgbmod, bool alphamod>
-struct AdditiveBlendAVX2 {
+struct AdditiveBlend {
     static inline __m256i simd(__m256i src, __m256i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
         __m256i ina;
         if (alphamod)
@@ -239,7 +237,7 @@ struct AdditiveBlendAVX2 {
 };
 
 template<bool doscale, bool rgbmod, bool alphamod>
-struct SubtractiveBlendAVX2 {
+struct SubtractiveBlend {
     static inline __m256i simd(__m256i src, __m256i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
         __m256i ina = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
         __m256i srcb = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
@@ -264,8 +262,10 @@ struct SubtractiveBlendAVX2 {
     }
 };
 
+class BlendBlitImpl {
+public:
 template<template <bool DOSCALE, bool RGBMOD, bool ALPHAMOD> class PixelFunc, bool doscale, bool rgbmod, bool alphamod, bool coloradd1, bool loaddst>
-void BlendBlitImpl::blitInnerLoopAVX2(BlendBlit::Args &args) {
+static void blitInnerLoop(BlendBlit::Args &args) {
     const byte *in;
     byte *out;
 
@@ -343,32 +343,128 @@ void BlendBlitImpl::blitInnerLoopAVX2(BlendBlit::Args &args) {
     }
 }
 
-template<bool doscale, bool rgbmod, bool alphamod>
-void BlendBlit::doBlitAlphaBlendLogicAVX2(Args &args) {
-    BlendBlitImpl::blitInnerLoopAVX2<AlphaBlendAVX2, doscale, rgbmod, alphamod, false, true>(args);
-}
-template<bool doscale, bool rgbmod>
-void BlendBlit::doBlitSubtractiveBlendLogicAVX2(Args &args) {
-    BlendBlitImpl::blitInnerLoopAVX2<SubtractiveBlendAVX2, doscale, rgbmod, false, false, true>(args);
-}
-template<bool doscale, bool rgbmod, bool alphamod>
-void BlendBlit::doBlitAdditiveBlendLogicAVX2(Args &args) {
-    BlendBlitImpl::blitInnerLoopAVX2<AdditiveBlendAVX2, doscale, rgbmod, alphamod, false, true>(args);
-}
-template<bool doscale>
-void BlendBlit::doBlitOpaqueBlendLogicAVX2(Args &args) {
-    BlendBlitImpl::blitInnerLoopAVX2<OpaqueBlendAVX2, doscale, false, false, false, true>(args);
-}
-template<bool doscale>
-void BlendBlit::doBlitBinaryBlendLogicAVX2(Args &args) {
-    BlendBlitImpl::blitInnerLoopAVX2<BinaryBlendAVX2, doscale, false, false, false, true>(args);
-}
-template<bool doscale, bool rgbmod, bool alphamod>
-void BlendBlit::doBlitMultiplyBlendLogicAVX2(Args &args) {
-    BlendBlitImpl::blitInnerLoopAVX2<MultiplyBlendAVX2, doscale, rgbmod, alphamod, false, true>(args);
+}; // end of class BlendBlitImpl
+
+void BlendBlit::blitAVX2(Args &args, const TSpriteBlendMode &blendMode, const AlphaType &alphaType) {
+    bool rgbmod   = ((args.color & kRGBModMask) != kRGBModMask);
+    bool alphamod = ((args.color & kAModMask)   != kAModMask);
+    if (args.scaleX == SCALE_THRESHOLD && args.scaleY == SCALE_THRESHOLD) {
+        if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_OPAQUE) {
+            BlendBlitImpl::blitInnerLoop<OpaqueBlend, false, false, false, false, true>(args);
+        } else if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_BINARY) {
+            BlendBlitImpl::blitInnerLoop<BinaryBlend, false, false, false, false, true>(args);
+        } else {
+            if (blendMode == BLEND_ADDITIVE) {
+                if (rgbmod) {
+                    if (alphamod) {
+                        BlendBlitImpl::blitInnerLoop<AdditiveBlend, false, true, true, false, true>(args);
+                    } else {
+                        BlendBlitImpl::blitInnerLoop<AdditiveBlend, false, true, false, false, true>(args);
+                    }
+                } else {
+                    if (alphamod) {
+                        BlendBlitImpl::blitInnerLoop<AdditiveBlend, false, false, true, false, true>(args);
+                    } else {
+                        BlendBlitImpl::blitInnerLoop<AdditiveBlend, false, false, false, false, true>(args);
+                    }
+                }
+            } else if (blendMode == BLEND_SUBTRACTIVE) {
+                if (rgbmod) {
+                    BlendBlitImpl::blitInnerLoop<SubtractiveBlend, false, true, false, false, true>(args);
+                } else {
+                    BlendBlitImpl::blitInnerLoop<SubtractiveBlend, false, false, false, false, true>(args);
+                }
+            } else if (blendMode == BLEND_MULTIPLY) {
+                if (rgbmod) {
+                    if (alphamod) {
+                        BlendBlitImpl::blitInnerLoop<MultiplyBlend, false, true, true, false, true>(args);
+                    } else {
+                        BlendBlitImpl::blitInnerLoop<MultiplyBlend, false, true, false, false, true>(args);
+                    }
+                } else {
+                    if (alphamod) {
+                        BlendBlitImpl::blitInnerLoop<MultiplyBlend, false, false, true, false, true>(args);
+                    } else {
+                        BlendBlitImpl::blitInnerLoop<MultiplyBlend, false, false, false, false, true>(args);
+                    }
+                }
+            } else {
+                assert(blendMode == BLEND_NORMAL);
+                if (rgbmod) {
+                    if (alphamod) {
+                        BlendBlitImpl::blitInnerLoop<AlphaBlend, false, true, true, false, true>(args);
+                    } else {
+                        BlendBlitImpl::blitInnerLoop<AlphaBlend, false, true, false, false, true>(args);
+                    }
+                } else {
+                    if (alphamod) {
+                        BlendBlitImpl::blitInnerLoop<AlphaBlend, false, false, true, false, true>(args);
+                    } else {
+                        BlendBlitImpl::blitInnerLoop<AlphaBlend, false, false, false, false, true>(args);
+                    }
+                }
+            }
+        }
+    } else {
+        if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_OPAQUE) {
+            BlendBlitImpl::blitInnerLoop<OpaqueBlend, true, false, false, false, true>(args);
+        } else if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_BINARY) {
+            BlendBlitImpl::blitInnerLoop<BinaryBlend, true, false, false, false, true>(args);
+        } else {
+            if (blendMode == BLEND_ADDITIVE) {
+                if (rgbmod) {
+                    if (alphamod) {
+                        BlendBlitImpl::blitInnerLoop<AdditiveBlend, true, true, true, false, true>(args);
+                    } else {
+                        BlendBlitImpl::blitInnerLoop<AdditiveBlend, true, true, false, false, true>(args);
+                    }
+                } else {
+                    if (alphamod) {
+                        BlendBlitImpl::blitInnerLoop<AdditiveBlend, true, false, true, false, true>(args);
+                    } else {
+                        BlendBlitImpl::blitInnerLoop<AdditiveBlend, true, false, false, false, true>(args);
+                    }
+                }
+            } else if (blendMode == BLEND_SUBTRACTIVE) {
+                if (rgbmod) {
+                    BlendBlitImpl::blitInnerLoop<SubtractiveBlend, true, true, false, false, true>(args);
+                } else {
+                    BlendBlitImpl::blitInnerLoop<SubtractiveBlend, true, false, false, false, true>(args);
+                }
+            } else if (blendMode == BLEND_MULTIPLY) {
+                if (rgbmod) {
+                    if (alphamod) {
+                        BlendBlitImpl::blitInnerLoop<MultiplyBlend, true, true, true, false, true>(args);
+                    } else {
+                        BlendBlitImpl::blitInnerLoop<MultiplyBlend, true, true, false, false, true>(args);
+                    }
+                } else {
+                    if (alphamod) {
+                        BlendBlitImpl::blitInnerLoop<MultiplyBlend, true, false, true, false, true>(args);
+                    } else {
+                        BlendBlitImpl::blitInnerLoop<MultiplyBlend, true, false, false, false, true>(args);
+                    }
+                }
+            } else {
+                assert(blendMode == BLEND_NORMAL);
+                if (rgbmod) {
+                    if (alphamod) {
+                        BlendBlitImpl::blitInnerLoop<AlphaBlend, true, true, true, false, true>(args);
+                    } else {
+                        BlendBlitImpl::blitInnerLoop<AlphaBlend, true, true, false, false, true>(args);
+                    }
+                } else {
+                    if (alphamod) {
+                        BlendBlitImpl::blitInnerLoop<AlphaBlend, true, false, true, false, true>(args);
+                    } else {
+                        BlendBlitImpl::blitInnerLoop<AlphaBlend, true, false, false, false, true>(args);
+                    }
+                }
+            }
+        }
+    }
 }
 
 } // End of namespace Graphics
 
 #endif // SCUMMVM_AVX2
-#endif // GRAPHICS_BLIT_BLIT_BLEND_AVX2_H
diff --git a/graphics/blit/blit-blend-normal.h b/graphics/blit/blit-blend-normal.h
deleted file mode 100644
index a7903c4097f..00000000000
--- a/graphics/blit/blit-blend-normal.h
+++ /dev/null
@@ -1,333 +0,0 @@
-/* ScummVM - Graphic Adventure Engine
- *
- * ScummVM is the legal property of its developers, whose names
- * are too numerous to list here. Please refer to the COPYRIGHT
- * file distributed with this source distribution.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- */
-
-#ifndef GRAPHICS_BLIT_BLIT_BLEND_NORMAL_H
-#define GRAPHICS_BLIT_BLIT_BLEND_NORMAL_H
-#include "graphics/blit.h"
-
-namespace Graphics {
-
-/**
- * Optimized version of doBlit to be used with multiply blended blitting
- */
-template<bool doscale, bool rgbmod, bool alphamod>
-void BlendBlit::doBlitMultiplyBlendLogicGeneric(Args &args) {
-    const byte *in;
-    byte *out;
-
-    int scaleXCtr, scaleYCtr = args.scaleYoff;
-    const byte *inBase;
-
-    const byte rawcr = (args.color >> kRModShift) & 0xFF;
-    const byte rawcg = (args.color >> kGModShift) & 0xFF;
-    const byte rawcb = (args.color >> kBModShift) & 0xFF;
-    const byte ca = alphamod ? ((args.color >> kAModShift) & 0xFF) : 255;
-    const uint32 cr = rgbmod   ? (rawcr == 255 ? 256 : rawcr) : 256;
-    const uint32 cg = rgbmod   ? (rawcg == 255 ? 256 : rawcg) : 256;
-    const uint32 cb = rgbmod   ? (rawcb == 255 ? 256 : rawcb) : 256;
-
-    for (uint32 i = 0; i < args.height; i++) {
-        if (doscale) {
-            inBase = args.ino + scaleYCtr / SCALE_THRESHOLD * args.inoStep;
-            scaleXCtr = args.scaleXoff;
-        } else {
-            in = args.ino;
-        }
-        out = args.outo;
-        for (uint32 j = 0; j < args.width; j++) {
-            if (doscale) {
-                in = inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep;
-            }
-
-            uint32 ina = in[kAIndex] * ca >> 8;
-
-            if (ina != 0) {
-                out[kBIndex] = out[kBIndex] * ((in[kBIndex] * cb * ina) >> 16) >> 8;
-                out[kGIndex] = out[kGIndex] * ((in[kGIndex] * cg * ina) >> 16) >> 8;
-                out[kRIndex] = out[kRIndex] * ((in[kRIndex] * cr * ina) >> 16) >> 8;
-            }
-
-            if (doscale)
-                scaleXCtr += args.scaleX;
-            else
-                in += args.inStep;
-            out += 4;
-        }
-        if (doscale)
-            scaleYCtr += args.scaleY;
-        else
-            args.ino += args.inoStep;
-        args.outo += args.dstPitch;
-    }
-
-}
-
-template<bool doscale, bool rgbmod, bool alphamod>
-void BlendBlit::doBlitAlphaBlendLogicGeneric(Args &args) {
-    const byte *in;
-    byte *out;
-
-    int scaleXCtr, scaleYCtr = args.scaleYoff;
-    const byte *inBase;
-
-    const byte ca = alphamod ? ((args.color >> kAModShift) & 0xFF) : 255;
-    const byte cr = rgbmod   ? ((args.color >> kRModShift) & 0xFF) : 255;
-    const byte cg = rgbmod   ? ((args.color >> kGModShift) & 0xFF) : 255;
-    const byte cb = rgbmod   ? ((args.color >> kBModShift) & 0xFF) : 255;
-
-    for (uint32 i = 0; i < args.height; i++) {
-        if (doscale) {
-            inBase = args.ino + scaleYCtr / SCALE_THRESHOLD * args.inoStep;
-            scaleXCtr = args.scaleXoff;
-        } else {
-            in = args.ino;
-        }
-        out = args.outo;
-        for (uint32 j = 0; j < args.width; j++) {
-            if (doscale) {
-                in = inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep;
-            }
-
-            uint32 ina = in[kAIndex] * ca >> 8;
-
-            if (ina != 0) {
-                if (rgbmod) {
-                    const uint outb = (out[kBIndex] * (255 - ina) >> 8);
-                    const uint outg = (out[kGIndex] * (255 - ina) >> 8);
-                    const uint outr = (out[kRIndex] * (255 - ina) >> 8);
-
-                    out[kAIndex] = 255;
-                    out[kBIndex] = outb + (in[kBIndex] * ina * cb >> 16);
-                    out[kGIndex] = outg + (in[kGIndex] * ina * cg >> 16);
-                    out[kRIndex] = outr + (in[kRIndex] * ina * cr >> 16);
-                } else {
-                    out[kAIndex] = 255;
-                    out[kBIndex] = (out[kBIndex] * (255 - ina) + in[kBIndex] * ina) >> 8;
-                    out[kGIndex] = (out[kGIndex] * (255 - ina) + in[kGIndex] * ina) >> 8;
-                    out[kRIndex] = (out[kRIndex] * (255 - ina) + in[kRIndex] * ina) >> 8;
-                    
-                }
-            }
-
-            if (doscale)
-                scaleXCtr += args.scaleX;
-            else
-                in += args.inStep;
-            out += 4;
-        }
-
-        if (doscale)
-            scaleYCtr += args.scaleY;
-        else
-            args.ino += args.inoStep;
-        args.outo += args.dstPitch;
-    }
-}
-
-/**
- * Optimized version of doBlit to be used with subtractive blended blitting
- */
-template<bool doscale, bool rgbmod>
-void BlendBlit::doBlitSubtractiveBlendLogicGeneric(Args &args) {
-    const byte *in;
-    byte *out;
-
-    int scaleXCtr, scaleYCtr = args.scaleYoff;
-    const byte *inBase;
-
-    const byte rawcr = (args.color >> kRModShift) & 0xFF;
-    const byte rawcg = (args.color >> kGModShift) & 0xFF;
-    const byte rawcb = (args.color >> kBModShift) & 0xFF;
-    const uint32 cr = rgbmod   ? (rawcr == 255 ? 256 : rawcr) : 256;
-    const uint32 cg = rgbmod   ? (rawcg == 255 ? 256 : rawcg) : 256;
-    const uint32 cb = rgbmod   ? (rawcb == 255 ? 256 : rawcb) : 256;
-
-    for (uint32 i = 0; i < args.height; i++) {
-        if (doscale) {
-            inBase = args.ino + scaleYCtr / SCALE_THRESHOLD * args.inoStep;
-            scaleXCtr = args.scaleXoff;
-        } else {
-            in = args.ino;
-        }
-        out = args.outo;
-        for (uint32 j = 0; j < args.width; j++) {
-            if (doscale) {
-                in = inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep;
-            }
-
-            out[kAIndex] = 255;
-            out[kBIndex] = MAX<int32>(out[kBIndex] - ((in[kBIndex] * cb  * (out[kBIndex]) * in[kAIndex]) >> 24), 0);
-            out[kGIndex] = MAX<int32>(out[kGIndex] - ((in[kGIndex] * cg  * (out[kGIndex]) * in[kAIndex]) >> 24), 0);
-            out[kRIndex] = MAX<int32>(out[kRIndex] - ((in[kRIndex] * cr * (out[kRIndex]) * in[kAIndex]) >> 24), 0);
-
-            if (doscale)
-                scaleXCtr += args.scaleX;
-            else
-                in += args.inStep;
-            out += 4;
-        }
-        if (doscale)
-            scaleYCtr += args.scaleY;
-        else
-            args.ino += args.inoStep;
-        args.outo += args.dstPitch;
-    }
-}
-
-/**
- * Optimized version of doBlit to be used with additive blended blitting
- */
-template<bool doscale, bool rgbmod, bool alphamod>
-void BlendBlit::doBlitAdditiveBlendLogicGeneric(Args &args) {
-    const byte *in;
-    byte *out;
-
-    int scaleXCtr, scaleYCtr = args.scaleYoff;
-    const byte *inBase;
-
-    const byte rawcr = (args.color >> kRModShift) & 0xFF;
-    const byte rawcg = (args.color >> kGModShift) & 0xFF;
-    const byte rawcb = (args.color >> kBModShift) & 0xFF;
-    const byte ca = alphamod ? ((args.color >> kAModShift) & 0xFF) : 255;
-    const uint32 cr = rgbmod   ? (rawcr == 255 ? 256 : rawcr) : 256;
-    const uint32 cg = rgbmod   ? (rawcg == 255 ? 256 : rawcg) : 256;
-    const uint32 cb = rgbmod   ? (rawcb == 255 ? 256 : rawcb) : 256;
-
-    for (uint32 i = 0; i < args.height; i++) {
-        if (doscale) {
-            inBase = args.ino + scaleYCtr / SCALE_THRESHOLD * args.inoStep;
-            scaleXCtr = args.scaleXoff;
-        } else {
-            in = args.ino;
-        }
-        out = args.outo;
-        for (uint32 j = 0; j < args.width; j++) {
-            if (doscale) {
-                in = inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep;
-            }
-
-            uint32 ina = in[kAIndex] * ca >> 8;
-
-            if (ina != 0) {
-                out[kBIndex] = out[kBIndex] + ((in[kBIndex] * cb * ina) >> 16);
-                out[kGIndex] = out[kGIndex] + ((in[kGIndex] * cg * ina) >> 16);
-                out[kRIndex] = out[kRIndex] + ((in[kRIndex] * cr * ina) >> 16);
-            }
-
-            if (doscale)
-                scaleXCtr += args.scaleX;
-            else
-                in += args.inStep;
-            out += 4;
-        }
-
-        if (doscale)
-            scaleYCtr += args.scaleY;
-        else
-            args.ino += args.inoStep;
-        args.outo += args.dstPitch;
-    }
-}
-
-template<bool doscale>
-void BlendBlit::doBlitOpaqueBlendLogicGeneric(Args &args) {
-    const byte *in;
-    byte *out;
-
-    int scaleXCtr, scaleYCtr = args.scaleYoff;
-    const byte *inBase;
-
-    for (uint32 i = 0; i < args.height; i++) {
-        if (doscale) {
-            inBase = args.ino + (scaleYCtr + 1) / SCALE_THRESHOLD * args.inoStep;
-            scaleXCtr = args.scaleXoff;
-        } else {
-            in = args.ino;
-        }
-        out = args.outo;
-
-        if (doscale) {
-            for (uint32 j = 0; j < args.width; j++) {
-                in = inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep;
-                *(uint32 *)out = *(const uint32 *)in | kAModMask;
-                scaleXCtr += args.scaleX;
-                out += 4;
-            }
-        } else {
-            for (uint32 j = 0; j < args.width; j++) {
-                *(uint32 *)out = *(const uint32 *)in | kAModMask;
-                in += args.inStep;
-                out += 4;
-            }
-        }
-
-        if (doscale)
-            scaleYCtr += args.scaleY;
-        else
-            args.ino += args.inoStep;
-        args.outo += args.dstPitch;
-    }
-}
-
-template<bool doscale>
-void BlendBlit::doBlitBinaryBlendLogicGeneric(Args &args) {
-    const byte *in;
-    byte *out;
-
-    int scaleXCtr, scaleYCtr = args.scaleYoff;
-    const byte *inBase;
-
-    for (uint32 i = 0; i < args.height; i++) {
-        if (doscale) {
-            inBase = args.ino + scaleYCtr / SCALE_THRESHOLD * args.inoStep;
-            scaleXCtr = args.scaleXoff;
-        } else {
-            in = args.ino;
-        }
-        out = args.outo;
-        for (uint32 j = 0; j < args.width; j++) {
-            if (doscale) {
-                in = inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep;
-            }
-
-            uint32 pix = *(const uint32 *)in, pixout = *(const uint32 *)out;
-            uint32 mask = (pix & kAModMask) ? 0xffffffff : 0;
-            pixout &= ~mask;
-            pix = (pix | kAModMask) & mask;
-            *(uint32 *)out = pixout | pix;
-            
-            if (doscale)
-                scaleXCtr += args.scaleX;
-            else
-                in += args.inStep;
-            out += 4;
-        }
-        if (doscale)
-            scaleYCtr += args.scaleY;
-        else
-            args.ino += args.inoStep;
-        args.outo += args.dstPitch;
-    }
-}
-
-} // end of namespace Graphics
-
-#endif // GRAPHICS_BLIT_BLIT_BLEND_NORMAL_H
diff --git a/graphics/blit/blit-blend.cpp b/graphics/blit/blit-blend.cpp
deleted file mode 100644
index a664f32eb06..00000000000
--- a/graphics/blit/blit-blend.cpp
+++ /dev/null
@@ -1,233 +0,0 @@
-/* ScummVM - Graphic Adventure Engine
- *
- * ScummVM is the legal property of its developers, whose names
- * are too numerous to list here. Please refer to the COPYRIGHT
- * file distributed with this source distribution.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- */
-
-#include "common/system.h"
-#include "graphics/blit/blit-blend-normal.h"
-#include "graphics/blit/blit-blend-neon.h"
-#include "graphics/blit/blit-blend-sse2.h"
-#include "graphics/blit/blit-blend-avx2.h"
-
-namespace Graphics {
-
-BlendBlit::Args::Args(byte *dst, const byte *src,
-    const uint _dstPitch, const uint _srcPitch,
-    const int posX, const int posY,
-    const uint _width, const uint _height,
-    const int _scaleX, const int _scaleY,
-    const int scaleXsrcOff, const int scaleYsrcOff,
-    const uint32 colorMod, const uint _flipping) :
-        xp(0), yp(0), dstPitch(_dstPitch),
-        width(_width), height(_height), color(colorMod),
-        scaleX(_scaleX), scaleY(_scaleY), flipping(_flipping),
-        scaleXoff(scaleXsrcOff), scaleYoff(scaleYsrcOff) {
-    bool doScale = scaleX != SCALE_THRESHOLD || scaleY != SCALE_THRESHOLD;
-    
-    rgbmod   = ((colorMod & kRGBModMask) != kRGBModMask);
-    alphamod = ((colorMod & kAModMask)   != kAModMask);
-    inStep = 4;
-    inoStep = _srcPitch;
-    if (flipping & FLIP_H) {
-        inStep = -inStep;
-        xp = width - 1;
-        if (doScale) xp = xp * scaleX / SCALE_THRESHOLD;
-    }
-
-    if (flipping & FLIP_V) {
-        inoStep = -inoStep;
-        yp = height - 1;
-        if (doScale) yp = yp * scaleY / SCALE_THRESHOLD;
-    }
-
-    ino = src + yp * _srcPitch + xp * 4;
-    outo = dst + posY * _dstPitch + posX * 4;
-}
-
-// Initialize this to nullptr at the start
-BlendBlit::BlitFunc BlendBlit::blitFunc = nullptr;
-
-// Only blits to and from 32bpp images
-// So this function is just here to jump to whatever function is in
-// BlendBlit::blitFunc. This way, we can detect at runtime whether or not
-// the cpu has certain SIMD feature enabled or not.
-void BlendBlit::blit(byte *dst, const byte *src,
-                     const uint dstPitch, const uint srcPitch,
-                     const int posX, const int posY,
-                     const uint width, const uint height,
-                     const int scaleX, const int scaleY,
-                     const int scaleXsrcOff, const int scaleYsrcOff,
-                     const uint32 colorMod, const uint flipping,
-                     const TSpriteBlendMode blendMode,
-                     const AlphaType alphaType) {
-    if (width == 0 || height == 0) return;
-
-    // If no function has been selected yet, detect and select
-    if (!blitFunc) {
-        // Get the correct blit function
-        blitFunc = blitGeneric;
-#ifdef SCUMMVM_NEON
-        if (g_system->hasFeature(OSystem::kFeatureCpuNEON)) blitFunc = blitNEON;
-#endif
-#ifdef SCUMMVM_SSE2
-        if (g_system->hasFeature(OSystem::kFeatureCpuSSE2)) blitFunc = blitSSE2;
-#endif
-#ifdef SCUMMVM_AVX2
-        if (g_system->hasFeature(OSystem::kFeatureCpuAVX2)) blitFunc = blitAVX2;
-#endif
-    }
-    
-    Args args(dst, src, dstPitch, srcPitch, posX, posY, width, height, scaleX, scaleY, scaleXsrcOff, scaleYsrcOff, colorMod, flipping);
-    blitFunc(args, blendMode, alphaType);
-}
-
-// This is just a macro to expand it because its a pretty simple function where
-// readabiliy doesn't matter too much and macros tend to work faster better than functors
-#define BLIT_FUNC(ext) \
-    void BlendBlit::blit##ext(Args &args, const TSpriteBlendMode &blendMode, const AlphaType &alphaType) { \
-        bool rgbmod   = ((args.color & kRGBModMask) != kRGBModMask); \
-        bool alphamod = ((args.color & kAModMask)   != kAModMask); \
-        if (args.scaleX == SCALE_THRESHOLD && args.scaleY == SCALE_THRESHOLD) { \
-            if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_OPAQUE) { \
-                doBlitOpaqueBlendLogic##ext<false>(args); \
-            } else if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_BINARY) { \
-                doBlitBinaryBlendLogic##ext<false>(args); \
-            } else { \
-                if (blendMode == BLEND_ADDITIVE) { \
-                    if (rgbmod) { \
-                        if (alphamod) { \
-                            doBlitAdditiveBlendLogic##ext<false, true, true>(args); \
-                        } else { \
-                            doBlitAdditiveBlendLogic##ext<false, true, false>(args); \
-                        } \
-                    } else { \
-                        if (alphamod) { \
-                            doBlitAdditiveBlendLogic##ext<false, false, true>(args); \
-                        } else { \
-                            doBlitAdditiveBlendLogic##ext<false, false, false>(args); \
-                        } \
-                    } \
-                } else if (blendMode == BLEND_SUBTRACTIVE) { \
-                    if (rgbmod) { \
-                        doBlitSubtractiveBlendLogic##ext<false, true>(args); \
-                    } else { \
-                        doBlitSubtractiveBlendLogic##ext<false, false>(args); \
-                    } \
-                } else if (blendMode == BLEND_MULTIPLY) { \
-                    if (rgbmod) { \
-                        if (alphamod) { \
-                            doBlitMultiplyBlendLogic##ext<false, true, true>(args); \
-                        } else { \
-                            doBlitMultiplyBlendLogic##ext<false, true, false>(args); \
-                        } \
-                    } else { \
-                        if (alphamod) { \
-                            doBlitMultiplyBlendLogic##ext<false, false, true>(args); \
-                        } else { \
-                            doBlitMultiplyBlendLogic##ext<false, false, false>(args); \
-                        } \
-                    } \
-                } else { \
-                    assert(blendMode == BLEND_NORMAL); \
-                    if (rgbmod) { \
-                        if (alphamod) { \
-                            doBlitAlphaBlendLogic##ext<false, true, true>(args); \
-                        } else { \
-                            doBlitAlphaBlendLogic##ext<false, true, false>(args); \
-                        } \
-                    } else { \
-                        if (alphamod) { \
-                            doBlitAlphaBlendLogic##ext<false, false, true>(args); \
-                        } else { \
-                            doBlitAlphaBlendLogic##ext<false, false, false>(args); \
-                        } \
-                    } \
-                } \
-            } \
-        } else { \
-            if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_OPAQUE) { \
-                doBlitOpaqueBlendLogic##ext<true>(args); \
-            } else if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_BINARY) { \
-                doBlitBinaryBlendLogic##ext<true>(args); \
-            } else { \
-                if (blendMode == BLEND_ADDITIVE) { \
-                    if (rgbmod) { \
-                        if (alphamod) { \
-                            doBlitAdditiveBlendLogic##ext<true, true, true>(args); \
-                        } else { \
-                            doBlitAdditiveBlendLogic##ext<true, true, false>(args); \
-                        } \
-                    } else { \
-                        if (alphamod) { \
-                            doBlitAdditiveBlendLogic##ext<true, false, true>(args); \
-                        } else { \
-                            doBlitAdditiveBlendLogic##ext<true, false, false>(args); \
-                        } \
-                    } \
-                } else if (blendMode == BLEND_SUBTRACTIVE) { \
-                    if (rgbmod) { \
-                        doBlitSubtractiveBlendLogic##ext<true, true>(args); \
-                    } else { \
-                        doBlitSubtractiveBlendLogic##ext<true, false>(args); \
-                    } \
-                } else if (blendMode == BLEND_MULTIPLY) { \
-                    if (rgbmod) { \
-                        if (alphamod) { \
-                            doBlitMultiplyBlendLogic##ext<true, true, true>(args); \
-                        } else { \
-                            doBlitMultiplyBlendLogic##ext<true, true, false>(args); \
-                        } \
-                    } else { \
-                        if (alphamod) { \
-                            doBlitMultiplyBlendLogic##ext<true, false, true>(args); \
-                        } else { \
-                            doBlitMultiplyBlendLogic##ext<true, false, false>(args); \
-                        } \
-                    } \
-                } else { \
-                    assert(blendMode == BLEND_NORMAL); \
-                    if (rgbmod) { \
-                        if (alphamod) { \
-                            doBlitAlphaBlendLogic##ext<true, true, true>(args); \
-                        } else { \
-                            doBlitAlphaBlendLogic##ext<true, true, false>(args); \
-                        } \
-                    } else { \
-                        if (alphamod) { \
-                            doBlitAlphaBlendLogic##ext<true, false, true>(args); \
-                        } else { \
-                            doBlitAlphaBlendLogic##ext<true, false, false>(args); \
-                        } \
-                    } \
-                } \
-            } \
-        } \
-    }
-BLIT_FUNC(Generic)
-#ifdef SCUMMVM_NEON
-BLIT_FUNC(NEON)
-#endif
-#ifdef SCUMMVM_SSE2
-BLIT_FUNC(SSE2)
-#endif
-#ifdef SCUMMVM_AVX2
-BLIT_FUNC(AVX2)
-#endif
-
-} // end of namespace Graphics
diff --git a/graphics/blit/blit-blend-neon.h b/graphics/blit/blit-neon.cpp
similarity index 76%
rename from graphics/blit/blit-blend-neon.h
rename to graphics/blit/blit-neon.cpp
index 66fa2d0484b..f95577c9a46 100644
--- a/graphics/blit/blit-blend-neon.h
+++ b/graphics/blit/blit-neon.cpp
@@ -19,9 +19,7 @@
  *
  */
 
-#ifndef GRAPHICS_BLIT_BLIT_BLEND_NEON_H
-#define GRAPHICS_BLIT_BLIT_BLEND_NEON_H
-
+#include "common/scummsys.h"
 #ifdef SCUMMVM_NEON
 #include <arm_neon.h>
 
@@ -343,34 +341,128 @@ static inline void blitInnerLoop(BlendBlit::Args &args) {
     }
 }
 
-};
+}; // end of class BlendBlitImpl
 
-template<bool doscale, bool rgbmod, bool alphamod>
-void BlendBlit::doBlitAlphaBlendLogicNEON(Args &args) {
-    BlendBlitImpl::blitInnerLoop<AlphaBlend, doscale, rgbmod, alphamod, false, true>(args);
-}
-template<bool doscale, bool rgbmod>
-void BlendBlit::doBlitSubtractiveBlendLogicNEON(Args &args) {
-    BlendBlitImpl::blitInnerLoop<SubtractiveBlend, doscale, rgbmod, false, false, true>(args);
-}
-template<bool doscale, bool rgbmod, bool alphamod>
-void BlendBlit::doBlitAdditiveBlendLogicNEON(Args &args) {
-    BlendBlitImpl::blitInnerLoop<AdditiveBlend, doscale, rgbmod, alphamod, false, true>(args);
-}
-template<bool doscale>
-void BlendBlit::doBlitOpaqueBlendLogicNEON(Args &args) {
-    BlendBlitImpl::blitInnerLoop<OpaqueBlend, doscale, false, false, false, true>(args);
-}
-template<bool doscale>
-void BlendBlit::doBlitBinaryBlendLogicNEON(Args &args) {
-    BlendBlitImpl::blitInnerLoop<BinaryBlend, doscale, false, false, false, true>(args);
-}
-template<bool doscale, bool rgbmod, bool alphamod>
-void BlendBlit::doBlitMultiplyBlendLogicNEON(Args &args) {
-    BlendBlitImpl::blitInnerLoop<MultiplyBlend, doscale, rgbmod, alphamod, false, true>(args);
+void BlendBlit::blitNEON(Args &args, const TSpriteBlendMode &blendMode, const AlphaType &alphaType) {
+    bool rgbmod   = ((args.color & kRGBModMask) != kRGBModMask);
+    bool alphamod = ((args.color & kAModMask)   != kAModMask);
+    if (args.scaleX == SCALE_THRESHOLD && args.scaleY == SCALE_THRESHOLD) {
+        if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_OPAQUE) {
+            BlendBlitImpl::blitInnerLoop<OpaqueBlend, false, false, false, false, true>(args);
+        } else if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_BINARY) {
+            BlendBlitImpl::blitInnerLoop<BinaryBlend, false, false, false, false, true>(args);
+        } else {
+            if (blendMode == BLEND_ADDITIVE) {
+                if (rgbmod) {
+                    if (alphamod) {
+                        BlendBlitImpl::blitInnerLoop<AdditiveBlend, false, true, true, false, true>(args);
+                    } else {
+                        BlendBlitImpl::blitInnerLoop<AdditiveBlend, false, true, false, false, true>(args);
+                    }
+                } else {
+                    if (alphamod) {
+                        BlendBlitImpl::blitInnerLoop<AdditiveBlend, false, false, true, false, true>(args);
+                    } else {
+                        BlendBlitImpl::blitInnerLoop<AdditiveBlend, false, false, false, false, true>(args);
+                    }
+                }
+            } else if (blendMode == BLEND_SUBTRACTIVE) {
+                if (rgbmod) {
+                    BlendBlitImpl::blitInnerLoop<SubtractiveBlend, false, true, false, false, true>(args);
+                } else {
+                    BlendBlitImpl::blitInnerLoop<SubtractiveBlend, false, false, false, false, true>(args);
+                }
+            } else if (blendMode == BLEND_MULTIPLY) {
+                if (rgbmod) {
+                    if (alphamod) {
+                        BlendBlitImpl::blitInnerLoop<MultiplyBlend, false, true, true, false, true>(args);
+                    } else {
+                        BlendBlitImpl::blitInnerLoop<MultiplyBlend, false, true, false, false, true>(args);
+                    }
+                } else {
+                    if (alphamod) {
+                        BlendBlitImpl::blitInnerLoop<MultiplyBlend, false, false, true, false, true>(args);
+                    } else {
+                        BlendBlitImpl::blitInnerLoop<MultiplyBlend, false, false, false, false, true>(args);
+                    }
+                }
+            } else {
+                assert(blendMode == BLEND_NORMAL);
+                if (rgbmod) {
+                    if (alphamod) {
+                        BlendBlitImpl::blitInnerLoop<AlphaBlend, false, true, true, false, true>(args);
+                    } else {
+                        BlendBlitImpl::blitInnerLoop<AlphaBlend, false, true, false, false, true>(args);
+                    }
+                } else {
+                    if (alphamod) {
+                        BlendBlitImpl::blitInnerLoop<AlphaBlend, false, false, true, false, true>(args);
+                    } else {
+                        BlendBlitImpl::blitInnerLoop<AlphaBlend, false, false, false, false, true>(args);
+                    }
+                }
+            }
+        }
+    } else {
+        if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_OPAQUE) {
+            BlendBlitImpl::blitInnerLoop<OpaqueBlend, true, false, false, false, true>(args);
+        } else if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_BINARY) {
+            BlendBlitImpl::blitInnerLoop<BinaryBlend, true, false, false, false, true>(args);
+        } else {
+            if (blendMode == BLEND_ADDITIVE) {
+                if (rgbmod) {
+                    if (alphamod) {
+                        BlendBlitImpl::blitInnerLoop<AdditiveBlend, true, true, true, false, true>(args);
+                    } else {
+                        BlendBlitImpl::blitInnerLoop<AdditiveBlend, true, true, false, false, true>(args);
+                    }
+                } else {
+                    if (alphamod) {
+                        BlendBlitImpl::blitInnerLoop<AdditiveBlend, true, false, true, false, true>(args);
+                    } else {
+                        BlendBlitImpl::blitInnerLoop<AdditiveBlend, true, false, false, false, true>(args);
+                    }
+                }
+            } else if (blendMode == BLEND_SUBTRACTIVE) {
+                if (rgbmod) {
+                    BlendBlitImpl::blitInnerLoop<SubtractiveBlend, true, true, false, false, true>(args);
+                } else {
+                    BlendBlitImpl::blitInnerLoop<SubtractiveBlend, true, false, false, false, true>(args);
+                }
+            } else if (blendMode == BLEND_MULTIPLY) {
+                if (rgbmod) {
+                    if (alphamod) {
+                        BlendBlitImpl::blitInnerLoop<MultiplyBlend, true, true, true, false, true>(args);
+                    } else {
+                        BlendBlitImpl::blitInnerLoop<MultiplyBlend, true, true, false, false, true>(args);
+                    }
+                } else {
+                    if (alphamod) {
+                        BlendBlitImpl::blitInnerLoop<MultiplyBlend, true, false, true, false, true>(args);
+                    } else {
+                        BlendBlitImpl::blitInnerLoop<MultiplyBlend, true, false, false, false, true>(args);
+                    }
+                }
+            } else {
+                assert(blendMode == BLEND_NORMAL);
+                if (rgbmod) {
+                    if (alphamod) {
+                        BlendBlitImpl::blitInnerLoop<AlphaBlend, true, true, true, false, true>(args);
+                    } else {
+                        BlendBlitImpl::blitInnerLoop<AlphaBlend, true, true, false, false, true>(args);
+                    }
+                } else {
+                    if (alphamod) {
+                        BlendBlitImpl::blitInnerLoop<AlphaBlend, true, false, true, false, true>(args);
+                    } else {
+                        BlendBlitImpl::blitInnerLoop<AlphaBlend, true, false, false, false, true>(args);
+                    }
+                }
+            }
+        }
+    }
 }
 
 } // end of namespace Graphics
 
 #endif // SCUMMVM_NEON
-#endif // GRAPHICS_BLIT_BLIT_BLEND_NEON_H
diff --git a/graphics/blit/blit-blend-sse2.h b/graphics/blit/blit-sse2.cpp
similarity index 77%
rename from graphics/blit/blit-blend-sse2.h
rename to graphics/blit/blit-sse2.cpp
index 29a27832eba..fa30773b5ac 100644
--- a/graphics/blit/blit-blend-sse2.h
+++ b/graphics/blit/blit-sse2.cpp
@@ -19,9 +19,7 @@
  *
  */
 
-#ifndef GRAPHICS_BLIT_BLIT_BLEND_SSE2_H
-#define GRAPHICS_BLIT_BLIT_BLEND_SSE2_H
-
+#include "common/scummsys.h"
 #ifdef SCUMMVM_SSE2
 #include <immintrin.h>
 
@@ -271,13 +269,7 @@ struct SubtractiveBlend {
 };
 
 class BlendBlitImpl {
-
 public:
-#ifdef SCUMMVM_AVX2
-template<template <bool DOSCALE, bool RGBMOD, bool ALPHAMOD> class PixelFunc, bool doscale, bool rgbmod, bool alphamod, bool coloradd1, bool loaddst>
-static inline void blitInnerLoopAVX2(BlendBlit::Args &args);
-#endif
-
 template<template <bool DOSCALE, bool RGBMOD, bool ALPHAMOD> class PixelFunc, bool doscale, bool rgbmod, bool alphamod, bool coloradd1, bool loaddst>
 static inline void blitInnerLoop(BlendBlit::Args &args) {
     const byte *in;
@@ -352,34 +344,128 @@ static inline void blitInnerLoop(BlendBlit::Args &args) {
     }
 }
 
-};
+}; // End of class BlendBlitImpl
 
-template<bool doscale, bool rgbmod, bool alphamod>
-void BlendBlit::doBlitAlphaBlendLogicSSE2(Args &args) {
-    BlendBlitImpl::blitInnerLoop<AlphaBlend, doscale, rgbmod, alphamod, false, true>(args);
-}
-template<bool doscale, bool rgbmod>
-void BlendBlit::doBlitSubtractiveBlendLogicSSE2(Args &args) {
-    BlendBlitImpl::blitInnerLoop<SubtractiveBlend, doscale, rgbmod, false, false, true>(args);
-}
-template<bool doscale, bool rgbmod, bool alphamod>
-void BlendBlit::doBlitAdditiveBlendLogicSSE2(Args &args) {
-    BlendBlitImpl::blitInnerLoop<AdditiveBlend, doscale, rgbmod, alphamod, false, true>(args);
-}
-template<bool doscale>
-void BlendBlit::doBlitOpaqueBlendLogicSSE2(Args &args) {
-    BlendBlitImpl::blitInnerLoop<OpaqueBlend, doscale, false, false, false, true>(args);
-}
-template<bool doscale>
-void BlendBlit::doBlitBinaryBlendLogicSSE2(Args &args) {
-    BlendBlitImpl::blitInnerLoop<BinaryBlend, doscale, false, false, false, true>(args);
-}
-template<bool doscale, bool rgbmod, bool alphamod>
-void BlendBlit::doBlitMultiplyBlendLogicSSE2(Args &args) {
-    BlendBlitImpl::blitInnerLoop<MultiplyBlend, doscale, rgbmod, alphamod, false, true>(args);
+void BlendBlit::blitSSE2(Args &args, const TSpriteBlendMode &blendMode, const AlphaType &alphaType) {
+    bool rgbmod   = ((args.color & kRGBModMask) != kRGBModMask);
+    bool alphamod = ((args.color & kAModMask)   != kAModMask);
+    if (args.scaleX == SCALE_THRESHOLD && args.scaleY == SCALE_THRESHOLD) {
+        if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_OPAQUE) {
+            BlendBlitImpl::blitInnerLoop<OpaqueBlend, false, false, false, false, true>(args);
+        } else if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_BINARY) {
+            BlendBlitImpl::blitInnerLoop<BinaryBlend, false, false, false, false, true>(args);
+        } else {
+            if (blendMode == BLEND_ADDITIVE) {
+                if (rgbmod) {
+                    if (alphamod) {
+                        BlendBlitImpl::blitInnerLoop<AdditiveBlend, false, true, true, false, true>(args);
+                    } else {
+                        BlendBlitImpl::blitInnerLoop<AdditiveBlend, false, true, false, false, true>(args);
+                    }
+                } else {
+                    if (alphamod) {
+                        BlendBlitImpl::blitInnerLoop<AdditiveBlend, false, false, true, false, true>(args);
+                    } else {
+                        BlendBlitImpl::blitInnerLoop<AdditiveBlend, false, false, false, false, true>(args);
+                    }
+                }
+            } else if (blendMode == BLEND_SUBTRACTIVE) {
+                if (rgbmod) {
+                    BlendBlitImpl::blitInnerLoop<SubtractiveBlend, false, true, false, false, true>(args);
+                } else {
+                    BlendBlitImpl::blitInnerLoop<SubtractiveBlend, false, false, false, false, true>(args);
+                }
+            } else if (blendMode == BLEND_MULTIPLY) {
+                if (rgbmod) {
+                    if (alphamod) {
+                        BlendBlitImpl::blitInnerLoop<MultiplyBlend, false, true, true, false, true>(args);
+                    } else {
+                        BlendBlitImpl::blitInnerLoop<MultiplyBlend, false, true, false, false, true>(args);
+                    }
+                } else {
+                    if (alphamod) {
+                        BlendBlitImpl::blitInnerLoop<MultiplyBlend, false, false, true, false, true>(args);
+                    } else {
+                        BlendBlitImpl::blitInnerLoop<MultiplyBlend, false, false, false, false, true>(args);
+                    }
+                }
+            } else {
+                assert(blendMode == BLEND_NORMAL);
+                if (rgbmod) {
+                    if (alphamod) {
+                        BlendBlitImpl::blitInnerLoop<AlphaBlend, false, true, true, false, true>(args);
+                    } else {
+                        BlendBlitImpl::blitInnerLoop<AlphaBlend, false, true, false, false, true>(args);
+                    }
+                } else {
+                    if (alphamod) {
+                        BlendBlitImpl::blitInnerLoop<AlphaBlend, false, false, true, false, true>(args);
+                    } else {
+                        BlendBlitImpl::blitInnerLoop<AlphaBlend, false, false, false, false, true>(args);
+                    }
+                }
+            }
+        }
+    } else {
+        if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_OPAQUE) {
+            BlendBlitImpl::blitInnerLoop<OpaqueBlend, true, false, false, false, true>(args);
+        } else if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_BINARY) {
+            BlendBlitImpl::blitInnerLoop<BinaryBlend, true, false, false, false, true>(args);
+        } else {
+            if (blendMode == BLEND_ADDITIVE) {
+                if (rgbmod) {
+                    if (alphamod) {
+                        BlendBlitImpl::blitInnerLoop<AdditiveBlend, true, true, true, false, true>(args);
+                    } else {
+                        BlendBlitImpl::blitInnerLoop<AdditiveBlend, true, true, false, false, true>(args);
+                    }
+                } else {
+                    if (alphamod) {
+                        BlendBlitImpl::blitInnerLoop<AdditiveBlend, true, false, true, false, true>(args);
+                    } else {
+                        BlendBlitImpl::blitInnerLoop<AdditiveBlend, true, false, false, false, true>(args);
+                    }
+                }
+            } else if (blendMode == BLEND_SUBTRACTIVE) {
+                if (rgbmod) {
+                    BlendBlitImpl::blitInnerLoop<SubtractiveBlend, true, true, false, false, true>(args);
+                } else {
+                    BlendBlitImpl::blitInnerLoop<SubtractiveBlend, true, false, false, false, true>(args);
+                }
+            } else if (blendMode == BLEND_MULTIPLY) {
+                if (rgbmod) {
+                    if (alphamod) {
+                        BlendBlitImpl::blitInnerLoop<MultiplyBlend, true, true, true, false, true>(args);
+                    } else {
+                        BlendBlitImpl::blitInnerLoop<MultiplyBlend, true, true, false, false, true>(args);
+                    }
+                } else {
+                    if (alphamod) {
+                        BlendBlitImpl::blitInnerLoop<MultiplyBlend, true, false, true, false, true>(args);
+                    } else {
+                        BlendBlitImpl::blitInnerLoop<MultiplyBlend, true, false, false, false, true>(args);
+                    }
+                }
+            } else {
+                assert(blendMode == BLEND_NORMAL);
+                if (rgbmod) {
+                    if (alphamod) {
+                        BlendBlitImpl::blitInnerLoop<AlphaBlend, true, true, true, false, true>(args);
+                    } else {
+                        BlendBlitImpl::blitInnerLoop<AlphaBlend, true, true, false, false, true>(args);
+                    }
+                } else {
+                    if (alphamod) {
+                        BlendBlitImpl::blitInnerLoop<AlphaBlend, true, false, true, false, true>(args);
+                    } else {
+                        BlendBlitImpl::blitInnerLoop<AlphaBlend, true, false, false, false, true>(args);
+                    }
+                }
+            }
+        }
+    }
 }
 
 } // End of namespace Graphics
 
 #endif // SSE2
-#endif // GRAPHICS_BLIT_BLIT_BLEND_SSE2_H
diff --git a/graphics/module.mk b/graphics/module.mk
index c31c22e9875..4cb382564e8 100644
--- a/graphics/module.mk
+++ b/graphics/module.mk
@@ -3,7 +3,9 @@ MODULE := graphics
 MODULE_OBJS := \
 	big5.o \
 	blit/blit.o \
-	blit/blit-blend.o \
+	blit/blit-sse2.o \
+	blit/blit-avx2.o \
+	blit/blit-neon.o \
 	blit/blit-alpha.o \
 	blit/blit-scale.o \
 	cursorman.o \
@@ -141,13 +143,13 @@ endif
 endif
 
 ifeq ($(SCUMMVM_NEON),1)
-$(MODULE)/blit/blit-blend.o: CXXFLAGS += -mfpu=neon
+$(MODULE)/blit/blit-neon.cpp: CXXFLAGS += -mfpu=neon
 endif
 ifeq ($(SCUMMVM_SSE2),1)
-$(MODULE)/blit/blit-blend.o: CXXFLAGS += -msse2
+$(MODULE)/blit/blit-sse2.cpp: CXXFLAGS += -msse2
 endif
 ifeq ($(SCUMMVM_AVX2),1)
-$(MODULE)/blit/blit-blend.o: CXXFLAGS += -mavx2
+$(MODULE)/blit/blit-avx2.cpp: CXXFLAGS += -mavx2
 endif
 
 # Include common rules


Commit: 41f82fbab8e944b3c663ba1abfa0ccc270b0c171
    https://github.com/scummvm/scummvm/commit/41f82fbab8e944b3c663ba1abfa0ccc270b0c171
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
BUILD: Change wording for AVX2 option

Changed paths:
    configure
    graphics/module.mk


diff --git a/configure b/configure
index e947db2ad45..7904df0ca90 100755
--- a/configure
+++ b/configure
@@ -6919,10 +6919,10 @@ define_in_config_if_yes "$_ext_sse2" 'SCUMMVM_SSE2'
 echo_n "Enabling x86/64 SSE2... "
 echo "$_ext_sse2"
 define_in_config_if_yes "$_ext_avx2" 'SCUMMVM_AVX2'
-echo_n "Enabling x86/64 AVX2 and SSE2... "
+echo_n "Enabling x86/64 AVX2... "
 echo "$_ext_avx2"
 define_in_config_if_yes "$_ext_neon" 'SCUMMVM_NEON'
-echo_n "Enabling arm NEON... "
+echo_n "Enabling Arm NEON... "
 echo "$_ext_neon"
 
 echo_n "Backend... "
diff --git a/graphics/module.mk b/graphics/module.mk
index 4cb382564e8..cca8ebdf67f 100644
--- a/graphics/module.mk
+++ b/graphics/module.mk
@@ -143,13 +143,13 @@ endif
 endif
 
 ifeq ($(SCUMMVM_NEON),1)
-$(MODULE)/blit/blit-neon.cpp: CXXFLAGS += -mfpu=neon
+$(MODULE)/blit/blit-neon.o: CXXFLAGS += -mfpu=neon
 endif
 ifeq ($(SCUMMVM_SSE2),1)
-$(MODULE)/blit/blit-sse2.cpp: CXXFLAGS += -msse2
+$(MODULE)/blit/blit-sse2.o: CXXFLAGS += -msse2
 endif
 ifeq ($(SCUMMVM_AVX2),1)
-$(MODULE)/blit/blit-avx2.cpp: CXXFLAGS += -mavx2
+$(MODULE)/blit/blit-avx2.o: CXXFLAGS += -mavx2
 endif
 
 # Include common rules


Commit: 4fbde038662c20ffc5c21e5ac285bf0cf1cea6e9
    https://github.com/scummvm/scummvm/commit/4fbde038662c20ffc5c21e5ac285bf0cf1cea6e9
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
ALL: blendBlitFrom exactly matched TS::blit

Changed paths:
    engines/sword25/gfx/image/renderedimage.cpp
    graphics/managed_surface.cpp
    graphics/managed_surface.h
    test/image/blending.h


diff --git a/engines/sword25/gfx/image/renderedimage.cpp b/engines/sword25/gfx/image/renderedimage.cpp
index 7ea25f33fd1..c8fe203fd88 100644
--- a/engines/sword25/gfx/image/renderedimage.cpp
+++ b/engines/sword25/gfx/image/renderedimage.cpp
@@ -235,7 +235,7 @@ bool RenderedImage::blit(int posX, int posY, int flipping, Common::Rect *pPartRe
 
 	if (width == -1) width = pPartRect ? pPartRect->width() : _surface.w;
 	if (height == -1) height = pPartRect ? pPartRect->height() : _surface.h;
-	_backSurface->blendBlitFrom(_surface, posX, posY, newFlipping, pPartRect, _surface.format.ARGBToColor(ca, cr, cg, cb), width, height);
+	_surface.blendBlitTo(*_backSurface, posX, posY, newFlipping, pPartRect, _surface.format.ARGBToColor(ca, cr, cg, cb), width, height);
 
 	return true;
 }
diff --git a/graphics/managed_surface.cpp b/graphics/managed_surface.cpp
index ceb6df94874..f8c3ad379df 100644
--- a/graphics/managed_surface.cpp
+++ b/graphics/managed_surface.cpp
@@ -729,30 +729,29 @@ void ManagedSurface::transBlitFromInner(const Surface &src, const Common::Rect &
 
 #undef HANDLE_BLIT
 
-Common::Rect ManagedSurface::blendBlitFrom(const ManagedSurface &src,
-										   const int posX, const int posY,
-										   const int flipping,
-										   const Common::Rect *srcRect,
-										   const uint colorMod,
-										   const int width, const int height,
-										   const TSpriteBlendMode blend,
-										   const AlphaType alphaType) {
-	return blendBlitFrom(src.rawSurface(), posX, posY, flipping, srcRect, colorMod, width, height, blend, alphaType);
-}
-Common::Rect ManagedSurface::blendBlitFrom(const Surface &src,
-										   const int posX, const int posY,
-										   const int flipping,
-										   const Common::Rect *srcRect,
-										   const uint colorMod,
-										   const int width, const int height,
-										   const TSpriteBlendMode blend,
-										   const AlphaType alphaType) {
-
-	Common::Rect dstArea(posX, posY, posX + (width == -1 ? src.w : width), posY + (height == -1 ? src.h : height));
-	Common::Rect srcArea = srcRect ? *srcRect : Common::Rect(0, 0, src.w, src.h);
+Common::Rect ManagedSurface::blendBlitTo(ManagedSurface &target,
+										 const int posX, const int posY,
+										 const int flipping,
+										 const Common::Rect *srcRect,
+										 const uint colorMod,
+										 const int width, const int height,
+										 const TSpriteBlendMode blend,
+										 const AlphaType alphaType) {
+	return blendBlitTo(*target.surfacePtr(), posX, posY, flipping, srcRect, colorMod, width, height, blend, alphaType);
+}
+Common::Rect ManagedSurface::blendBlitTo(Surface &target,
+										 const int posX, const int posY,
+										 const int flipping,
+										 const Common::Rect *srcRect,
+										 const uint colorMod,
+										 const int width, const int height,
+										 const TSpriteBlendMode blend,
+										 const AlphaType alphaType) {
+	Common::Rect dstArea(posX, posY, posX + (width == -1 ? w : width), posY + (height == -1 ? h : height));
+	Common::Rect srcArea = srcRect ? *srcRect : Common::Rect(0, 0, w, h);
 	
-	if (!isBlendBlitPixelFormatSupported(src.format, format)) {
-		warning("ManagedSurface::blendBlitFrom only accepts RGBA32!");
+	if (!isBlendBlitPixelFormatSupported(format, target.format)) {
+		warning("ManagedSurface::blendBlitTo only accepts RGBA32!");
 		return Common::Rect(0, 0, 0, 0);
 	}
 
@@ -775,35 +774,35 @@ Common::Rect ManagedSurface::blendBlitFrom(const Surface &src,
 		dstArea.top = 0;
 	}
 
-	if (dstArea.right > w) {
-		srcArea.right -= (dstArea.right - w) * scaleX / BlendBlit::SCALE_THRESHOLD;
-		dstArea.right = w;
+	if (dstArea.right > target.w) {
+		srcArea.right -= (dstArea.right - target.w) * scaleX / BlendBlit::SCALE_THRESHOLD;
+		dstArea.right = target.w;
 	}
 
-	if (dstArea.bottom > h) {
-		srcArea.bottom -= (dstArea.bottom - h) * scaleY / BlendBlit::SCALE_THRESHOLD;
-		dstArea.bottom = h;
+	if (dstArea.bottom > target.h) {
+		srcArea.bottom -= (dstArea.bottom - target.h) * scaleY / BlendBlit::SCALE_THRESHOLD;
+		dstArea.bottom = target.h;
 	}
 
 	if (flipping & FLIP_H) {
 		int tmp_w = srcArea.width();
-		srcArea.left = src.w - srcArea.right;
+		srcArea.left = w - srcArea.right;
 		srcArea.right = srcArea.left + tmp_w;
 		scaleXoff = (BlendBlit::SCALE_THRESHOLD - (scaleXoff + dstArea.width() * scaleX)) % BlendBlit::SCALE_THRESHOLD;
 	}
 
 	if (flipping & FLIP_V) {
 		int tmp_h = srcArea.height();
-		srcArea.top = src.h - srcArea.bottom;
+		srcArea.top = h - srcArea.bottom;
 		srcArea.bottom = srcArea.top + tmp_h;
 		scaleYoff = (BlendBlit::SCALE_THRESHOLD - (scaleYoff + dstArea.height() * scaleY)) % BlendBlit::SCALE_THRESHOLD;
 	}
 
 	if (!dstArea.isEmpty() && !srcArea.isEmpty()) {
 		BlendBlit::blit(
-			(byte *)getBasePtr(0, 0),
-			(const byte *)src.getBasePtr(srcArea.left, srcArea.top),
-			pitch, src.pitch,
+			(byte *)target.getBasePtr(0, 0),
+			(const byte *)getBasePtr(srcArea.left, srcArea.top),
+			target.pitch, pitch,
 			dstArea.left, dstArea.top,
 			dstArea.width(), dstArea.height(),
 			scaleX, scaleY,
diff --git a/graphics/managed_surface.h b/graphics/managed_surface.h
index 4332dbf3d6f..5d8443fa62b 100644
--- a/graphics/managed_surface.h
+++ b/graphics/managed_surface.h
@@ -527,16 +527,17 @@ public:
 	}
 	
 	/**
-	 * ManagedSurface::blendBlitFrom is meant to be a highly optimized
+	 * ManagedSurface::blendBlitTo is meant to be a highly optimized
 	 * blending/blitting function, so it can only accept certain format combinations.
-	 * @return true if the formats can be used by blendBlitFrom.
+	 * @return true if the formats can be used by blendBlitTo.
 	 */
 	static inline bool isBlendBlitPixelFormatSupported(const PixelFormat &src, const PixelFormat &dst) {
 		return BlendBlit::getSupportedPixelFormat() == src && BlendBlit::getSupportedPixelFormat() == dst;
 	}
 
 	/**
-	 * @brief renders src onto this managed surface
+	 * @brief Renders this surface onto target
+	 * @param target renders this surface onto this one
 	 * @param src source surface
 	 * @param posX, posY are the position of the src onto this surface
 	 * @param flipping flipping flags (use Graphics::FLIP_FLAGS)
@@ -548,22 +549,22 @@ public:
 	 * @param alphaType what alpha mode to use. FULL is default
 	 * @return returns the size of the rendered rectangle
 	 */
-	Common::Rect blendBlitFrom(const ManagedSurface &src,
-							   const int posX = 0, const int posY = 0,
-							   const int flipping = FLIP_NONE,
-							   const Common::Rect *srcRect = nullptr,
-							   const uint colorMod = MS_ARGB(255, 255, 255, 255),
-							   const int width = -1, const int height = -1,
-							   const TSpriteBlendMode blend = BLEND_NORMAL,
-							   const AlphaType alphaType = ALPHA_FULL);
-	Common::Rect blendBlitFrom(const Surface &src,
-							   const int posX = 0, const int posY = 0,
-							   const int flipping = FLIP_NONE,
-							   const Common::Rect *srcRect = nullptr,
-							   const uint colorMod = MS_ARGB(255, 255, 255, 255),
-							   const int width = -1, const int height = -1,
-							   const TSpriteBlendMode blend = BLEND_NORMAL,
-							   const AlphaType alphaType = ALPHA_FULL);
+	Common::Rect blendBlitTo(ManagedSurface &target,
+							 const int posX = 0, const int posY = 0,
+							 const int flipping = FLIP_NONE,
+							 const Common::Rect *srcRect = nullptr,
+							 const uint colorMod = MS_ARGB(255, 255, 255, 255),
+							 const int width = -1, const int height = -1,
+							 const TSpriteBlendMode blend = BLEND_NORMAL,
+							 const AlphaType alphaType = ALPHA_FULL);
+	Common::Rect blendBlitTo(Surface &target,
+							 const int posX = 0, const int posY = 0,
+							 const int flipping = FLIP_NONE,
+							 const Common::Rect *srcRect = nullptr,
+							 const uint colorMod = MS_ARGB(255, 255, 255, 255),
+							 const int width = -1, const int height = -1,
+							 const TSpriteBlendMode blend = BLEND_NORMAL,
+							 const AlphaType alphaType = ALPHA_FULL);
 
 	/**
 	 * Clear the entire surface.
diff --git a/test/image/blending.h b/test/image/blending.h
index 25f3e59865b..0bbc2972421 100644
--- a/test/image/blending.h
+++ b/test/image/blending.h
@@ -839,7 +839,7 @@ public:
 			oldTime += g_system->getMillis() - oldStart;
 			uint32 newStart = g_system->getMillis();
 			for (int i = 0; i < iters; i++) {
-				managedSurfDest.blendBlitFrom(managedSurf, 0, 0, flipping, nullptr, color, -1, -1, (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
+				managedSurf.blendBlitTo(managedSurfDest, 0, 0, flipping, nullptr, color, -1, -1, (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
 			}
 			newTime += g_system->getMillis() - newStart;
             managedSurfDest.fillRect(Common::Rect(0, 0, managedSurfDest.w, managedSurfDest.h), managedSurfDest.format.ARGBToColor(255, 255, 255, 255));
@@ -847,7 +847,7 @@ public:
 			Graphics::BlendBlit::blitFunc = Graphics::BlendBlit::blitGeneric;
 			uint32 genericStart = g_system->getMillis();
 			for (int i = 0; i < iters; i++) {
-				managedSurfDest.blendBlitFrom(managedSurf, 0, 0, flipping, nullptr, color, -1, -1, (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
+				managedSurf.blendBlitTo(managedSurfDest, 0, 0, flipping, nullptr, color, -1, -1, (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
 			}
 			Graphics::BlendBlit::blitFunc = oldFunc;
 			genericTime += g_system->getMillis() - genericStart;
@@ -864,14 +864,14 @@ public:
 			oldTimeScaled += g_system->getMillis() - oldStart;
 			newStart = g_system->getMillis();
 			for (int i = 0; i < iters; i++) {
-				managedSurfDest.blendBlitFrom(managedSurf, 0, 0, flipping, nullptr, color, managedSurfDest.w, managedSurfDest.h, (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
+				managedSurf.blendBlitTo(managedSurfDest, 0, 0, flipping, nullptr, color, managedSurfDest.w, managedSurfDest.h, (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
 			}
 			newTimeScaled += g_system->getMillis() - newStart;
             managedSurfDest.fillRect(Common::Rect(0, 0, managedSurfDest.w, managedSurfDest.h), managedSurfDest.format.ARGBToColor(255, 255, 255, 255));
 			Graphics::BlendBlit::blitFunc = Graphics::BlendBlit::blitGeneric;
 			genericStart = g_system->getMillis();
 			for (int i = 0; i < iters; i++) {
-				managedSurfDest.blendBlitFrom(managedSurf, 0, 0, flipping, nullptr, color, managedSurfDest.w, managedSurfDest.h, (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
+				managedSurf.blendBlitTo(managedSurfDest, 0, 0, flipping, nullptr, color, managedSurfDest.w, managedSurfDest.h, (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
 			}
 			Graphics::BlendBlit::blitFunc = oldFunc;
 			genericTimeScaled += g_system->getMillis() - genericStart;
@@ -882,11 +882,11 @@ public:
         } // blend
 
 		debug("Old TransparentSurface::blit avg time per %d iters (in milliseconds): %f\n", iters, oldTime / numIters);
-		debug("New ManagedSurface::blendBlitFrom (non SIMD) avg time per %d iters (in milliseconds): %f\n", iters, genericTime / numIters);
-		debug("New ManagedSurface::blendBlitFrom avg time per %d iters (in milliseconds): %f\n", iters, newTime / numIters);
+		debug("New ManagedSurface::blendBlitTo (non SIMD) avg time per %d iters (in milliseconds): %f\n", iters, genericTime / numIters);
+		debug("New ManagedSurface::blendBlitTo avg time per %d iters (in milliseconds): %f\n", iters, newTime / numIters);
 		debug("Old SCALING TransparentSurface::blit avg time per %d iters (in milliseconds): %f\n", iters, oldTimeScaled / numItersScaled);
-		debug("New SCALING ManagedSurface::blendBlitFrom (non SIMD) avg time per %d iters (in milliseconds): %f\n", iters, genericTimeScaled / numItersScaled);
-		debug("New SCALING ManagedSurface::blendBlitFrom avg time per %d iters (in milliseconds): %f\n", iters, newTimeScaled / numItersScaled);
+		debug("New SCALING ManagedSurface::blendBlitTo (non SIMD) avg time per %d iters (in milliseconds): %f\n", iters, genericTimeScaled / numItersScaled);
+		debug("New SCALING ManagedSurface::blendBlitTo avg time per %d iters (in milliseconds): %f\n", iters, newTimeScaled / numItersScaled);
 
 	    baseSurface.free();
 	}
@@ -969,7 +969,7 @@ public:
             newSurf.setAlphaMode((Graphics::AlphaType)alphaType);
             Common::Rect ret2 = newSurf.blit(newSurfDest, dsts[rect].left, dsts[rect].top, flipping, &srcs[rect], MS_ARGB(a, r, g, b), dsts[rect].width(), dsts[rect].height(), (Graphics::TSpriteBlendMode)blendMode);
             managedSurfDest.fillRect(Common::Rect(0, 0, managedSurfDest.w, managedSurfDest.h), managedSurfDest.format.ARGBToColor(ba, br, bg, bb));
-            Common::Rect ret3 = managedSurfDest.blendBlitFrom(managedSurf, dsts[rect].left, dsts[rect].top, flipping, &srcs[rect], MS_ARGB(a, r, g, b), dsts[rect].width(), dsts[rect].height(), (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
+            Common::Rect ret3 = managedSurf.blendBlitTo(managedSurfDest, dsts[rect].left, dsts[rect].top, flipping, &srcs[rect], MS_ARGB(a, r, g, b), dsts[rect].width(), dsts[rect].height(), (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
 
 			if (ret1 != ret2 || ret2 != ret3 || ret1 != ret3) {
                 warning("blendMode: %s, alphaType: %s, a: %d, r: %d, g: %d, b: %d, flipping: %s, test rect id: %s",


Commit: b16ae30ebd43b3ccbeb72ce856239bcab66781f9
    https://github.com/scummvm/scummvm/commit/b16ae30ebd43b3ccbeb72ce856239bcab66781f9
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
GRIFFON: Move to using ManagedSurface

Changed paths:
    engines/griffon/combat.cpp
    engines/griffon/cutscenes.cpp
    engines/griffon/dialogs.cpp
    engines/griffon/draw.cpp
    engines/griffon/gfx.cpp
    engines/griffon/griffon.h
    engines/griffon/logic.cpp
    engines/griffon/resources.cpp
    engines/griffon/sound.cpp


diff --git a/engines/griffon/combat.cpp b/engines/griffon/combat.cpp
index 7f07d2ffd59..7e1e30a483d 100644
--- a/engines/griffon/combat.cpp
+++ b/engines/griffon/combat.cpp
@@ -730,7 +730,7 @@ void GriffonEngine::damageNPC(int npcnum, int damage, int spell) {
 			rcDest.setWidth(16);
 			rcDest.setHeight(16);
 
-			_tiles[curTileL]->blit(*_mapBg, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
+			_tiles[curTileL]->blendBlitTo(*_mapBg, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
 		}
 
 		// firehydra sword chest
diff --git a/engines/griffon/cutscenes.cpp b/engines/griffon/cutscenes.cpp
index 0ca9c48634b..b44aa3e626d 100644
--- a/engines/griffon/cutscenes.cpp
+++ b/engines/griffon/cutscenes.cpp
@@ -169,7 +169,7 @@ void GriffonEngine::showLogos() {
 		}
 
 		_videoBuffer->fillRect(Common::Rect(0, 0, 320, 240), 0);
-		_logosImg->blit(*_videoBuffer, 0, 0, Graphics::FLIP_NONE, nullptr, MS_ARGB((int)y, (int)y, (int)y, (int)y));
+		_logosImg->blendBlitTo(*_videoBuffer, 0, 0, Graphics::FLIP_NONE, nullptr, MS_ARGB((int)y, (int)y, (int)y, (int)y));
 
 		g_system->copyRectToScreen(_videoBuffer->getPixels(), _videoBuffer->pitch, 0, 0, _videoBuffer->w, _videoBuffer->h);
 		g_system->updateScreen();
@@ -206,8 +206,8 @@ void GriffonEngine::intro() {
 
 	_ticks = g_system->getMillis();
 
-	_videoBuffer->blit(*_videoBuffer3);
-	_videoBuffer->blit(*_videoBuffer2);
+	_videoBuffer->blendBlitTo(*_videoBuffer3);
+	_videoBuffer->blendBlitTo(*_videoBuffer2);
 
 	_fpsr = 0.0;
 	int y = 140;
@@ -245,12 +245,12 @@ void GriffonEngine::intro() {
 		rc.left = -xofs;
 		rc.top = 0;
 
-		_titleImg->blit(*_videoBuffer, rc.left, rc.top);
+		_titleImg->blendBlitTo(*_videoBuffer, rc.left, rc.top);
 
 		rc.left = -xofs + 320;
 		rc.top = 0;
 
-		_titleImg->blit(*_videoBuffer, rc.left, rc.top);
+		_titleImg->blendBlitTo(*_videoBuffer, rc.left, rc.top);
 
 		if (++cnt >= 6) {
 			cnt = 0;
@@ -361,8 +361,8 @@ void GriffonEngine::endOfGame() {
 			break;
 
 		_videoBuffer2->fillRect(Common::Rect(0, 0, _videoBuffer2->w, _videoBuffer2->h), 0);
-		_videoBuffer->setAlpha(255 - ya);
-		_videoBuffer->blit(*_videoBuffer2);
+		_videoBuffer->surfacePtr()->setAlpha(255 - ya);
+		_videoBuffer->blendBlitTo(*_videoBuffer2);
 
 		g_system->copyRectToScreen(_videoBuffer2->getPixels(), _videoBuffer2->pitch, 0, 0, _videoBuffer2->w, _videoBuffer2->h);
 
@@ -395,12 +395,12 @@ void GriffonEngine::endOfGame() {
 		rc.left = -xofs;
 		rc.top = 0;
 
-		_titleImg->blit(*_videoBuffer, rc.left, rc.top);
+		_titleImg->blendBlitTo(*_videoBuffer, rc.left, rc.top);
 
 		rc.left = -xofs + 320;
 		rc.top = 0;
 
-		_titleImg->blit(*_videoBuffer, rc.left, rc.top);
+		_titleImg->blendBlitTo(*_videoBuffer, rc.left, rc.top);
 
 		y = y - spd * _fpsr;
 
@@ -427,7 +427,7 @@ void GriffonEngine::endOfGame() {
 			ya = CLIP(ya, 0, 255);
 		}
 
-		_videoBuffer->setAlpha(ya);
+		_videoBuffer->surfacePtr()->setAlpha(ya);
 		g_system->copyRectToScreen(_videoBuffer->getPixels(), _videoBuffer->pitch, 0, 0, _videoBuffer->w, _videoBuffer->h);
 
 		_ticksPassed = _ticks;
@@ -486,9 +486,9 @@ void GriffonEngine::endOfGame() {
 
 		_videoBuffer->fillRect(Common::Rect(0, 0, _videoBuffer->w, _videoBuffer->h), 0);
 
-		_videoBuffer->setAlpha(y1);
-		_videoBuffer2->blit(*_videoBuffer3);
-		_videoBuffer->blit(*_videoBuffer3);
+		_videoBuffer->surfacePtr()->setAlpha(y1);
+		_videoBuffer2->blendBlitTo(*_videoBuffer3);
+		_videoBuffer->blendBlitTo(*_videoBuffer3);
 
 		g_system->copyRectToScreen(_videoBuffer3->getPixels(), _videoBuffer3->pitch, 0, 0, _videoBuffer3->w, _videoBuffer3->h);
 		g_system->updateScreen();
@@ -526,10 +526,10 @@ void GriffonEngine::endOfGame() {
 			y1 = CLIP(y1, 0, 255);
 		}
 
-		_videoBuffer->setAlpha(y1);
+		_videoBuffer->surfacePtr()->setAlpha(y1);
 
 		_videoBuffer2->fillRect(Common::Rect(0, 0, _videoBuffer2->w, _videoBuffer2->h), 0);
-		_videoBuffer->blit(*_videoBuffer2);
+		_videoBuffer->blendBlitTo(*_videoBuffer2);
 
 		g_system->copyRectToScreen(_videoBuffer2->getPixels(), _videoBuffer2->pitch, 0, 0, _videoBuffer2->w, _videoBuffer2->h);
 
@@ -572,8 +572,8 @@ void GriffonEngine::theEnd() {
 
 	for (float y = 0; y < 100; y += _fpsr) {
 		_videoBuffer2->fillRect(Common::Rect(0, 0, _videoBuffer2->w, _videoBuffer2->h), 0);
-		_videoBuffer->setAlpha((int)((100.0 - y) / 100 * 255));
-		_videoBuffer->blit(*_videoBuffer2);
+		_videoBuffer->surfacePtr()->setAlpha((int)((100.0 - y) / 100 * 255));
+		_videoBuffer->blendBlitTo(*_videoBuffer2);
 
 		g_system->copyRectToScreen(_videoBuffer2->getPixels(), _videoBuffer2->pitch, 0, 0, _videoBuffer2->w, _videoBuffer2->h);
 		g_system->updateScreen();
diff --git a/engines/griffon/dialogs.cpp b/engines/griffon/dialogs.cpp
index 45e62744a94..f18d06c6147 100644
--- a/engines/griffon/dialogs.cpp
+++ b/engines/griffon/dialogs.cpp
@@ -71,8 +71,8 @@ void GriffonEngine::title(int mode) {
 
 	_ticks = g_system->getMillis();
 
-	_videoBuffer->blit(*_videoBuffer3);
-	_videoBuffer->blit(*_videoBuffer2);
+	_videoBuffer->blendBlitTo(*_videoBuffer3);
+	_videoBuffer->blendBlitTo(*_videoBuffer2);
 
 	int cursel = 0;
 	int ticks1 = _ticks;
@@ -104,17 +104,17 @@ void GriffonEngine::title(int mode) {
 		rc.left = -xofs;
 		rc.top = 0;
 
-		_titleImg->blit(*_videoBuffer, rc.left, rc.top);
+		_titleImg->blendBlitTo(*_videoBuffer, rc.left, rc.top);
 
 		rc.left = -xofs + 320.0;
 		rc.top = 0;
 
-		_titleImg->blit(*_videoBuffer, rc.left, rc.top);
+		_titleImg->blendBlitTo(*_videoBuffer, rc.left, rc.top);
 
 		rc.left = 0;
 		rc.top = 0;
 
-		_titleImg2->blit(*_videoBuffer, rc.left, rc.top);
+		_titleImg2->blendBlitTo(*_videoBuffer, rc.left, rc.top);
 
 		int y = 172;
 		int x = 160 - 14 * 4;
@@ -131,7 +131,7 @@ void GriffonEngine::title(int mode) {
 		rc.left = (int16)(x - 16 - 4 * cos(2 * PI * _itemyloc / 16));
 		rc.top = (int16)(y - 4 + 16 * cursel);
 
-		_itemImg[15]->blit(*_videoBuffer, rc.left, rc.top);
+		_itemImg[15]->blendBlitTo(*_videoBuffer, rc.left, rc.top);
 
 		float yf = 255.0;
 		if (_ticks < ticks1 + 1000) {
@@ -139,7 +139,7 @@ void GriffonEngine::title(int mode) {
 			yf = CLIP<float>(yf, 0.0, 255.0);
 		}
 
-		_videoBuffer->setAlpha((int)yf);
+		_videoBuffer->surfacePtr()->setAlpha((int)yf);
 		g_system->copyRectToScreen(_videoBuffer->getPixels(), _videoBuffer->pitch, 0, 0, _videoBuffer->w, _videoBuffer->h);
 		g_system->updateScreen();
 
@@ -300,8 +300,8 @@ void GriffonEngine::configMenu() {
 
 	_ticks = g_system->getMillis();
 
-	Graphics::TransparentSurface *configwindow = loadImage("art/configwindow.bmp", true);
-	configwindow->setAlpha(160, true);
+	Graphics::ManagedSurface *configwindow = loadImage("art/configwindow.bmp", true);
+	configwindow->surfacePtr()->setAlpha(160, true);
 
 	int ticks1 = _ticks;
 
@@ -316,16 +316,16 @@ void GriffonEngine::configMenu() {
 		rcDest.setWidth(320);
 		rcDest.setHeight(240);
 
-		_cloudImg->blit(*_videoBuffer, 0, 0, Graphics::FLIP_NONE, &rcDest, MS_ARGB(128, 255, 255, 255));
+		_cloudImg->blendBlitTo(*_videoBuffer, 0, 0, Graphics::FLIP_NONE, &rcDest, MS_ARGB(128, 255, 255, 255));
 
 		rcDest.left = 256;
 		rcDest.top = 192;
 		rcDest.setWidth(320);
 		rcDest.setHeight(240);
 
-		_cloudImg->blit(*_videoBuffer, 0, 0, Graphics::FLIP_NONE, &rcDest, MS_ARGB(128, 255, 255, 255));
+		_cloudImg->blendBlitTo(*_videoBuffer, 0, 0, Graphics::FLIP_NONE, &rcDest, MS_ARGB(128, 255, 255, 255));
 
-		configwindow->blit(*_videoBuffer);
+		configwindow->blendBlitTo(*_videoBuffer);
 
 		int sy = SY;
 
@@ -377,16 +377,16 @@ void GriffonEngine::configMenu() {
 		rc.left = 148 + 3 * cos(2 * PI * _itemyloc / 16.0);
 		rc.top = sy + 8 * curselt - 4;
 
-		_itemImg[15]->blit(*_videoBuffer, rc.left, rc.top);
+		_itemImg[15]->blendBlitTo(*_videoBuffer, rc.left, rc.top);
 
 		if (_ticks < ticks1 + 1000) {
 			float yy = 255.0 * ((float)(_ticks - ticks1) / 1000.0);
 			yy = CLIP<float>(yy, 0.0, 255.0);
 
-			_videoBuffer->setAlpha((uint8)yy);
+			_videoBuffer->surfacePtr()->setAlpha((uint8)yy);
 		}
 
-		_videoBuffer->blit(*_videoBuffer2);
+		_videoBuffer->blendBlitTo(*_videoBuffer2);
 		g_system->copyRectToScreen(_videoBuffer2->getPixels(), _videoBuffer2->pitch, 0, 0, _videoBuffer2->w, _videoBuffer2->h);
 
 		_ticksPassed = _ticks;
@@ -591,19 +591,19 @@ void GriffonEngine::renderSaveStates() {
 			int ss = (_playera.sword - 1) * 3;
 			if (_playera.sword == 3)
 				ss = 18;
-			_itemImg[ss]->blit(*_videoBuffer2, rcSrc.left, rcSrc.top);
+			_itemImg[ss]->blendBlitTo(*_videoBuffer2, rcSrc.left, rcSrc.top);
 
 			rcSrc.left += 16;
 			ss = (_playera.shield - 1) * 3 + 1;
 			if (_playera.shield == 3)
 				ss = 19;
-			_itemImg[ss]->blit(*_videoBuffer2, rcSrc.left, rcSrc.top);
+			_itemImg[ss]->blendBlitTo(*_videoBuffer2, rcSrc.left, rcSrc.top);
 
 			rcSrc.left += 16;
 			ss = (_playera.armour - 1) * 3 + 2;
 			if (_playera.armour == 3)
 				ss = 20;
-			_itemImg[ss]->blit(*_videoBuffer2, rcSrc.left, rcSrc.top);
+			_itemImg[ss]->blendBlitTo(*_videoBuffer2, rcSrc.left, rcSrc.top);
 
 			int nx = rcSrc.left + 13 + 3 * 8;
 			rcSrc.left = nx - 17;
@@ -612,7 +612,7 @@ void GriffonEngine::renderSaveStates() {
 				for (int i = 0; i < 5; i++) {
 					rcSrc.left += 17;
 					if (_playera.foundSpell[i])
-						_itemImg[7 + i]->blit(*_videoBuffer2, rcSrc.left, rcSrc.top);
+						_itemImg[7 + i]->blendBlitTo(*_videoBuffer2, rcSrc.left, rcSrc.top);
 				}
 			}
 		} else {
@@ -648,7 +648,7 @@ void GriffonEngine::saveLoadNew() {
 		uint32 color = *(uint32 *)_saveLoadImg->getBasePtr(120, 10);
 		_saveLoadImg->fillRect(Common::Rect(125, 15, 160, 33), color);
 	}
-	_saveLoadImg->setAlpha(192, true);
+	_saveLoadImg->surfacePtr()->setAlpha(192, true);
 
 	Common::TextToSpeechManager *ttsMan = g_system->getTextToSpeechManager();
 
@@ -661,16 +661,16 @@ void GriffonEngine::saveLoadNew() {
 		rcDest.setWidth(320);
 		rcDest.setHeight(240);
 
-		_cloudImg->blit(*_videoBuffer, 0, 0, Graphics::FLIP_NONE, &rcDest, MS_ARGB(128, 255, 255, 255));
+		_cloudImg->blendBlitTo(*_videoBuffer, 0, 0, Graphics::FLIP_NONE, &rcDest, MS_ARGB(128, 255, 255, 255));
 
 		rcDest.left = 256;
 		rcDest.top = 192;
 		rcDest.setWidth(320);
 		rcDest.setHeight(240);
 
-		_cloudImg->blit(*_videoBuffer, 0, 0, Graphics::FLIP_NONE, &rcDest, MS_ARGB(128, 255, 255, 255));
+		_cloudImg->blendBlitTo(*_videoBuffer, 0, 0, Graphics::FLIP_NONE, &rcDest, MS_ARGB(128, 255, 255, 255));
 
-		_saveLoadImg->blit(*_videoBuffer);
+		_saveLoadImg->blendBlitTo(*_videoBuffer);
 
 		if (g_system->getEventManager()->pollEvent(_event)) {
 			if (_event.type == Common::EVENT_QUIT || _event.type == Common::EVENT_RETURN_TO_LAUNCHER) {
@@ -790,7 +790,7 @@ void GriffonEngine::saveLoadNew() {
 		}
 
 		// Render savestates
-		_videoBuffer2->blit(*_videoBuffer);
+		_videoBuffer2->blendBlitTo(*_videoBuffer);
 
 		// ------------------------------------------
 
@@ -820,7 +820,7 @@ void GriffonEngine::saveLoadNew() {
 			rcDest.top = (int16)(53 + (curRow - 1) * 48);
 		}
 
-		_itemImg[15]->blit(*_videoBuffer, rcDest.left, rcDest.top);
+		_itemImg[15]->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top);
 
 		if (curRow != 0) {
 			rcDest.top = 18;
@@ -829,18 +829,18 @@ void GriffonEngine::saveLoadNew() {
 			else if (curCol == 2)
 				rcDest.left = 170;
 
-			_itemImg[15]->blit(*_videoBuffer, rcDest.left, rcDest.top);
+			_itemImg[15]->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top);
 		}
 
 		if (_ticks < ticks1 + 1000) {
 			int yy = 255 * (_ticks - ticks1) / 1000;
 			yy = CLIP(yy, 0, 255);
 
-			_videoBuffer->setAlpha((uint8)yy);
+			_videoBuffer->surfacePtr()->setAlpha((uint8)yy);
 		}
 
 		_videoBuffer3->fillRect(Common::Rect(0, 0, _videoBuffer3->w, _videoBuffer3->h), 0);
-		_videoBuffer->blit(*_videoBuffer3);
+		_videoBuffer->blendBlitTo(*_videoBuffer3);
 
 		g_system->copyRectToScreen(_videoBuffer3->getPixels(), _videoBuffer3->pitch, 0, 0, _videoBuffer3->w, _videoBuffer3->h);
 		g_system->updateScreen();
diff --git a/engines/griffon/draw.cpp b/engines/griffon/draw.cpp
index d35f2573c5c..863227d33cf 100644
--- a/engines/griffon/draw.cpp
+++ b/engines/griffon/draw.cpp
@@ -85,7 +85,7 @@ const int invmap[4][7][13] = {
 };
 
 
-void game_fillrect(Graphics::TransparentSurface *surface, int x, int y, int w, int h, int color) {
+void game_fillrect(Graphics::ManagedSurface *surface, int x, int y, int w, int h, int color) {
 	surface->fillRect(Common::Rect(x, y, x + w, y + h), color);
 }
 
@@ -124,7 +124,7 @@ void GriffonEngine::drawAnims(int Layer) {
 							rcDest.setWidth(16);
 							rcDest.setHeight(16);
 
-							_tiles[curtilel]->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
+							_tiles[curtilel]->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
 						}
 
 						if (Layer == 1) {
@@ -160,7 +160,7 @@ void GriffonEngine::drawAnims(int Layer) {
 									}
 
 									if (pass)
-										_tiles[curtilel]->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
+										_tiles[curtilel]->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
 								}
 							}
 						}
@@ -217,7 +217,7 @@ void GriffonEngine::drawHud() {
 			rcDest.top = iy;
 
 			if (ico != 99)
-				_itemImg[ico]->blit(*_videoBuffer, rcDest.left, rcDest.top);
+				_itemImg[ico]->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top);
 			if (ico == 99) {
 				int alpha = (int)(RND() * 96) + 96;
 
@@ -229,7 +229,7 @@ void GriffonEngine::drawHud() {
 				rcDest.left = ix;
 				rcDest.top = iy;
 
-				_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(alpha, 255, 255, 255));
+				_spellImg->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(alpha, 255, 255, 255));
 			}
 		}
 	}
@@ -247,7 +247,7 @@ void GriffonEngine::drawHud() {
 				rcSrc.left = rcSrc.left + 17;
 
 				if (_player.foundSpell[i]) {
-					_itemImg[7 + i]->blit(*_videoBuffer, rcSrc.left, rcSrc.top);
+					_itemImg[7 + i]->blendBlitTo(*_videoBuffer, rcSrc.left, rcSrc.top);
 
 					game_fillrect(_videoBuffer, rcSrc.left, sy + 16, 16, 4, RGB(0, 32, 32));
 					game_fillrect(_videoBuffer, rcSrc.left + 1, sy + 17,
@@ -262,16 +262,16 @@ void GriffonEngine::drawHud() {
 	if (_selEnemyOn == false) {
 		rcDest = Common::Rect(320, 240);
 		_videoBuffer2->fillRect(rcDest, 0);
-		_videoBuffer2->setAlpha((int)(_player.itemselshade * 4));
-		_videoBuffer2->blit(*_videoBuffer);
+		_videoBuffer2->surfacePtr()->setAlpha((int)(_player.itemselshade * 4));
+		_videoBuffer2->blendBlitTo(*_videoBuffer);
 
 		int sy = 202;
 		rcSrc.left = 46;
 		rcSrc.top = 46;
 
-		_inventoryImg->setAlpha(160, true); // 128
-		_inventoryImg->blit(*_videoBuffer, rcSrc.left, rcSrc.top);
-		_inventoryImg->setAlpha(255, true);
+		_inventoryImg->surfacePtr()->setAlpha(160, true); // 128
+		_inventoryImg->blendBlitTo(*_videoBuffer, rcSrc.left, rcSrc.top);
+		_inventoryImg->surfacePtr()->setAlpha(255, true);
 
 		int sx = 54;
 		sy = 55;
@@ -287,7 +287,7 @@ void GriffonEngine::drawHud() {
 			amap = 3;
 		if (_curMap > 5 && _curMap < 42)
 			amap = 1;
-		mapImg[amap]->blit(*_videoBuffer, rcDest.left, rcDest.top);
+		mapImg[amap]->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top);
 
 		long ccc = _videoBuffer->format.RGBToColor(128 + 127 * sin(3.141592 * 2 * _itemyloc / 16), 0, 0);
 
@@ -348,19 +348,19 @@ void GriffonEngine::drawHud() {
 		int ss = (_player.sword - 1) * 3;
 		if (_player.sword == 3)
 			ss = 18;
-		_itemImg[ss]->blit(*_videoBuffer, rcSrc.left, rcSrc.top);
+		_itemImg[ss]->blendBlitTo(*_videoBuffer, rcSrc.left, rcSrc.top);
 
 		rcSrc.left = rcSrc.left + 16;
 		ss = (_player.shield - 1) * 3 + 1;
 		if (_player.shield == 3)
 			ss = 19;
-		_itemImg[ss]->blit(*_videoBuffer, rcSrc.left, rcSrc.top);
+		_itemImg[ss]->blendBlitTo(*_videoBuffer, rcSrc.left, rcSrc.top);
 
 		rcSrc.left = rcSrc.left + 16;
 		ss = (_player.armour - 1) * 3 + 2;
 		if (_player.armour == 3)
 			ss = 20;
-		_itemImg[ss]->blit(*_videoBuffer, rcSrc.left, rcSrc.top);
+		_itemImg[ss]->blendBlitTo(*_videoBuffer, rcSrc.left, rcSrc.top);
 
 		for (int i = 0; i <= 4; i++) {
 			sx = 188;
@@ -368,15 +368,15 @@ void GriffonEngine::drawHud() {
 			rcSrc.left = sx;
 			rcSrc.top = sy;
 			if (i == 0)
-				_itemImg[6]->blit(*_videoBuffer, rcSrc.left, rcSrc.top);
+				_itemImg[6]->blendBlitTo(*_videoBuffer, rcSrc.left, rcSrc.top);
 			else if (i == 1)
-				_itemImg[12]->blit(*_videoBuffer, rcSrc.left, rcSrc.top);
+				_itemImg[12]->blendBlitTo(*_videoBuffer, rcSrc.left, rcSrc.top);
 			else if (i == 2)
-				_itemImg[17]->blit(*_videoBuffer, rcSrc.left, rcSrc.top);
+				_itemImg[17]->blendBlitTo(*_videoBuffer, rcSrc.left, rcSrc.top);
 			else if (i == 3)
-				_itemImg[16]->blit(*_videoBuffer, rcSrc.left, rcSrc.top);
+				_itemImg[16]->blendBlitTo(*_videoBuffer, rcSrc.left, rcSrc.top);
 			else if (i == 4)
-				_itemImg[14]->blit(*_videoBuffer, rcSrc.left, rcSrc.top);
+				_itemImg[14]->blendBlitTo(*_videoBuffer, rcSrc.left, rcSrc.top);
 
 			Common::sprintf_s(line, "x%i", _player.inventory[i]);
 			drawString(_videoBuffer, line, sx + 17, sy + 7, 0);
@@ -390,7 +390,7 @@ void GriffonEngine::drawHud() {
 				sy = rcSrc.top;
 
 				if (_player.foundSpell[i]) {
-					_itemImg[7 + i]->blit(*_videoBuffer, rcSrc.left, rcSrc.top);
+					_itemImg[7 + i]->blendBlitTo(*_videoBuffer, rcSrc.left, rcSrc.top);
 
 					game_fillrect(_videoBuffer, rcSrc.left, sy + 16, 16, 4, RGB(0, 32, 32));
 					game_fillrect(_videoBuffer, rcSrc.left + 1, sy + 17,
@@ -405,11 +405,11 @@ void GriffonEngine::drawHud() {
 				if (_curItem == 5 + i) {
 					rcDest.left = (float)(243 - 12 + 3 * sin(3.141592 * 2 * _itemyloc / 16));
 					rcDest.top = 67 + 24 * i;
-					_itemImg[15]->blit(*_videoBuffer, rcDest.left, rcDest.top);
+					_itemImg[15]->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top);
 				} else if (_curItem == i) {
 					rcDest.left = (float)(189 - 12 + 3 * sin(3.141592 * 2 * _itemyloc / 16));
 					rcDest.top = 70 + 24 * i;
-					_itemImg[15]->blit(*_videoBuffer, rcDest.left, rcDest.top);
+					_itemImg[15]->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top);
 				}
 			}
 		}
@@ -425,7 +425,7 @@ void GriffonEngine::drawHud() {
 			rcDest.top = (float)(_npcInfo[_curEnemy].y + 4 - 16 - sin(3.141592 / 8 * _itemyloc));
 		}
 
-		_itemImg[13]->blit(*_videoBuffer, rcDest.left, rcDest.top);
+		_itemImg[13]->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top);
 	}
 }
 
@@ -475,7 +475,7 @@ void GriffonEngine::drawNPCs(int mode) {
 							rcDest.top += (int)(RND() * 3) - 1;
 						}
 
-						_anims[sprite]->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
+						_anims[sprite]->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
 					} else {
 						int cframe = _npcInfo[i].cattackframe;
 
@@ -489,7 +489,7 @@ void GriffonEngine::drawNPCs(int mode) {
 						rcDest.setWidth(24);
 						rcDest.setHeight(24);
 
-						_animsAttack[sprite]->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
+						_animsAttack[sprite]->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
 					}
 
 				}
@@ -506,7 +506,7 @@ void GriffonEngine::drawNPCs(int mode) {
 						rcDest.left = _npcInfo[i].bodysection[f].x - _animSet2[s].xofs;
 						rcDest.top = _npcInfo[i].bodysection[f].y - _animSet2[s].yofs + 2;
 
-						_anims[2]->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
+						_anims[2]->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
 					}
 
 				}
@@ -527,7 +527,7 @@ void GriffonEngine::drawNPCs(int mode) {
 						rcDest.left = _npcInfo[i].bodysection[f].x - _animSet9[s].xofs;
 						rcDest.top = _npcInfo[i].bodysection[f].y - _animSet9[s].yofs + 2;
 
-						_anims[9]->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
+						_anims[9]->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
 					}
 
 				}
@@ -549,7 +549,7 @@ void GriffonEngine::drawNPCs(int mode) {
 					rcDest.left = npx - 2;
 					rcDest.top = npy - 24;
 
-					_anims[3]->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
+					_anims[3]->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
 
 				}
 
@@ -568,7 +568,7 @@ void GriffonEngine::drawNPCs(int mode) {
 					rcDest.left = npx - 2;
 					rcDest.top = npy - 24;
 
-					_anims[4]->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
+					_anims[4]->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
 				}
 
 
@@ -587,7 +587,7 @@ void GriffonEngine::drawNPCs(int mode) {
 							int x = 192 + ((int)(_itemyloc + ff * 5) % 3) * 64;
 							if (x > 255)
 								x = 255;
-							_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(x, 255, 255, 255));
+							_spellImg->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(x, 255, 255, 255));
 
 							for (int f = 1; f <= 8; f++) {
 								rcSrc.left = 16 * (int)(RND() * 2);
@@ -601,7 +601,7 @@ void GriffonEngine::drawNPCs(int mode) {
 								x = 192 + f % 3 * 64;
 								if (x > 255)
 									x = 255;
-								_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(x, 255, 255, 255));
+								_spellImg->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(x, 255, 255, 255));
 							}
 
 							rcSrc.left = 0;
@@ -612,8 +612,8 @@ void GriffonEngine::drawNPCs(int mode) {
 							rcDest.left = _npcInfo[i].bodysection[10 * ff + 9].x - 21;
 							rcDest.top = _npcInfo[i].bodysection[10 * ff + 9].y - 21;
 
-							_spellImg->setAlpha(192, true);
-							_anims[5]->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
+							_spellImg->surfacePtr()->setAlpha(192, true);
+							_anims[5]->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
 						}
 
 					}
@@ -640,7 +640,7 @@ void GriffonEngine::drawNPCs(int mode) {
 						rcDest.top = rcDest.top + (int)(RND() * 3) - 1;
 					}
 
-					_anims[sprite]->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
+					_anims[sprite]->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
 				}
 
 				// wizard
@@ -663,7 +663,7 @@ void GriffonEngine::drawNPCs(int mode) {
 						rcDest.left = rcDest.left + (int)(RND() * 3) - 1;
 						rcDest.top = rcDest.top + (int)(RND() * 3) - 1;
 					}
-					_anims[sprite]->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
+					_anims[sprite]->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
 				}
 
 				// yellow dragon
@@ -685,7 +685,7 @@ void GriffonEngine::drawNPCs(int mode) {
 						rcDest.left = rcDest.left + (int)(RND() * 3) - 1;
 						rcDest.top = rcDest.top + (int)(RND() * 3) - 1;
 					}
-					_anims[sprite]->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
+					_anims[sprite]->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
 				}
 
 
@@ -725,7 +725,7 @@ void GriffonEngine::drawNPCs(int mode) {
 							rcDest.top = rcDest.top + (int)(RND() * 3) - 1;
 						}
 
-						_anims[sprite]->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
+						_anims[sprite]->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
 					} else {
 						_npcInfo[i].floating = _npcInfo[i].floating + 0.25 * _fpsr;
 						while (_npcInfo[i].floating >= 16)
@@ -743,7 +743,7 @@ void GriffonEngine::drawNPCs(int mode) {
 						rcDest.setWidth(24);
 						rcDest.setHeight(24);
 
-						_animsAttack[sprite]->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
+						_animsAttack[sprite]->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
 					}
 				}
 
@@ -777,7 +777,7 @@ void GriffonEngine::drawNPCs(int mode) {
 						rcDest.left = sx + 32 + (int)(RND() * 3) - 1;
 						rcDest.top = sy - (int)(RND() * 6);
 
-						_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(alpha, 255, 255, 255));
+						_spellImg->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(alpha, 255, 255, 255));
 					}
 
 					for (int ii = 0; ii <= 8; ii++) {
@@ -796,7 +796,7 @@ void GriffonEngine::drawNPCs(int mode) {
 
 							int alpha = i2 / 3 * 224;
 
-							_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(alpha, 255, 255, 255));
+							_spellImg->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(alpha, 255, 255, 255));
 
 							int xloc = rcDest.left;
 							int yloc = rcDest.top;
@@ -820,7 +820,7 @@ void GriffonEngine::drawNPCs(int mode) {
 							rcDest.left = (float)(sx + 36 - ii * 8 + ii * cos(3.14159 * 2 * (fr3 - ii) / 16) * 2);
 							rcDest.top = (float)(sy + 16 + ii * sin(3.14159 * 2 * (fr3 - ii) / 16) * 3 - ii); //  * 4
 
-							_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
+							_spellImg->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
 
 							xloc = rcDest.left;
 							yloc = rcDest.top;
@@ -857,7 +857,7 @@ void GriffonEngine::drawNPCs(int mode) {
 							rcDest.left = rcDest.top + (int)(RND() * 3) - 1;
 						}
 
-						_anims[sprite]->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
+						_anims[sprite]->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
 					} else {
 						int cframe = (int)(_npcInfo[i].cattackframe);
 
@@ -869,7 +869,7 @@ void GriffonEngine::drawNPCs(int mode) {
 						rcDest.left = sx;
 						rcDest.top = sy;
 
-						_animsAttack[sprite]->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
+						_animsAttack[sprite]->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
 					}
 				}
 
@@ -908,7 +908,7 @@ void GriffonEngine::drawNPCs(int mode) {
 						rcDest.top = rcDest.top + (int)(RND() * 3) - 1;
 					}
 
-					_anims[sprite]->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
+					_anims[sprite]->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
 				}
 
 				rcDest.left = npx + 4;
@@ -989,7 +989,7 @@ void GriffonEngine::drawOver(int modx, int mody) {
 					}
 
 					if (pass)
-						_tiles[curtilel]->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
+						_tiles[curtilel]->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
 				}
 			}
 		}
@@ -1012,7 +1012,7 @@ void GriffonEngine::drawPlayer() {
 		rcDest.setWidth(24);
 		rcDest.setHeight(24);
 
-		_anims[f]->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
+		_anims[f]->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
 	} else {
 		rcSrc.left = (int)(_player.attackFrame / 4) * 24;
 		rcSrc.top = _player.walkDir * 24;
@@ -1024,7 +1024,7 @@ void GriffonEngine::drawPlayer() {
 		rcDest.setWidth(24);
 		rcDest.setHeight(24);
 
-		_animsAttack[f]->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
+		_animsAttack[f]->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
 	}
 
 	long ccc = _videoBuffer->format.RGBToColor(224, 224, 64);
@@ -1120,7 +1120,7 @@ void GriffonEngine::drawView() {
 		rc.setWidth(320);
 		rc.setHeight(240);
 
-		_cloudImg->blit(*_videoBuffer, 0, 0, Graphics::FLIP_NONE, &rc);
+		_cloudImg->blendBlitTo(*_videoBuffer, 0, 0, Graphics::FLIP_NONE, &rc);
 	}
 
 	drawHud();
@@ -1133,7 +1133,7 @@ void GriffonEngine::swash() {
 	do {
 		y += 1 * _fpsr;
 
-		_videoBuffer->setAlpha((int)y);
+		_videoBuffer->surfacePtr()->setAlpha((int)y);
 		_videoBuffer->fillRect(Common::Rect(0, 0, _videoBuffer->w, _videoBuffer->h), 0);
 
 		g_system->copyRectToScreen(_videoBuffer->getPixels(), _videoBuffer->pitch, 0, 0, _videoBuffer->w, _videoBuffer->h);
@@ -1167,8 +1167,8 @@ void GriffonEngine::swash() {
 	do {
 		y += _fpsr;
 
-		_videoBuffer->setAlpha((int)(y * 25));
-		_mapBg->blit(*_videoBuffer);
+		_videoBuffer->surfacePtr()->setAlpha((int)(y * 25));
+		_mapBg->blendBlitTo(*_videoBuffer);
 
 		if (_cloudsOn) {
 			rcDest.left = (float)(256 + 256 * cos(3.141592 / 180 * _cloudAngle));
@@ -1176,7 +1176,7 @@ void GriffonEngine::swash() {
 			rcDest.setWidth(320);
 			rcDest.setHeight(240);
 
-			_cloudImg->blit(*_videoBuffer, 0, 0, Graphics::FLIP_NONE, &rcDest);
+			_cloudImg->blendBlitTo(*_videoBuffer, 0, 0, Graphics::FLIP_NONE, &rcDest);
 		}
 
 		g_system->copyRectToScreen(_videoBuffer->getPixels(), _videoBuffer->pitch, 0, 0, _videoBuffer->w, _videoBuffer->h);
@@ -1205,7 +1205,7 @@ void GriffonEngine::swash() {
 	} while (y <= 10);
 
 
-	_videoBuffer->setAlpha(255);
+	_videoBuffer->surfacePtr()->setAlpha(255);
 }
 
 
diff --git a/engines/griffon/gfx.cpp b/engines/griffon/gfx.cpp
index 7fc5567754f..7211d6a9123 100644
--- a/engines/griffon/gfx.cpp
+++ b/engines/griffon/gfx.cpp
@@ -74,15 +74,15 @@ void GriffonEngine::eventText(const char *stri) {
 	int pause_ticks = _ticks + 500;
 	int b_ticks = _ticks;
 
-	_videoBuffer->blit(*_videoBuffer3);
-	_videoBuffer->blit(*_videoBuffer2);
+	_videoBuffer->blendBlitTo(*_videoBuffer3);
+	_videoBuffer->blendBlitTo(*_videoBuffer2);
 
 	do {
 		g_system->getEventManager()->pollEvent(_event);
 
 		if ((_event.type == Common::EVENT_KEYDOWN || _event.type == Common::EVENT_CUSTOM_ENGINE_ACTION_START) && pause_ticks < _ticks)
 			break;
-		_videoBuffer2->blit(*_videoBuffer);
+		_videoBuffer2->blendBlitTo(*_videoBuffer);
 
 		int fr = 192;
 
@@ -91,9 +91,9 @@ void GriffonEngine::eventText(const char *stri) {
 		if (fr > 192)
 			fr = 192;
 
-		_windowImg->setAlpha(fr, true);
+		_windowImg->surfacePtr()->setAlpha(fr, true);
 
-		_windowImg->blit(*_videoBuffer);
+		_windowImg->blendBlitTo(*_videoBuffer);
 		if (pause_ticks < _ticks)
 			drawString(_videoBuffer, stri, x, 15, 0);
 
@@ -119,12 +119,12 @@ void GriffonEngine::eventText(const char *stri) {
 		g_system->delayMillis(10);
 	} while (1);
 
-	_videoBuffer3->blit(*_videoBuffer);
+	_videoBuffer3->blendBlitTo(*_videoBuffer);
 
 	_itemTicks = _ticks + 210;
 }
 
-void GriffonEngine::drawLine(Graphics::TransparentSurface *buffer, int x1, int y1, int x2, int y2, int col) {
+void GriffonEngine::drawLine(Graphics::ManagedSurface *buffer, int x1, int y1, int x2, int y2, int col) {
 	int xdif = x2 - x1;
 	int ydif = y2 - y1;
 
@@ -143,14 +143,14 @@ void GriffonEngine::drawLine(Graphics::TransparentSurface *buffer, int x1, int y
 	}
 }
 
-void GriffonEngine::drawString(Graphics::TransparentSurface *buffer, const char *stri, int xloc, int yloc, int col) {
+void GriffonEngine::drawString(Graphics::ManagedSurface *buffer, const char *stri, int xloc, int yloc, int col) {
 	int l = strlen(stri);
 
 	for (int i = 0; i < l; i++) {
 		rcDest.left = xloc + i * 8;
 		rcDest.top = yloc;
 
-		_fontChr[stri[i] - 32][col]->blit(*buffer, rcDest.left, rcDest.top);
+		_fontChr[stri[i] - 32][col]->blendBlitTo(*buffer, rcDest.left, rcDest.top);
 	}
 }
 
diff --git a/engines/griffon/griffon.h b/engines/griffon/griffon.h
index ab2fc87e104..195791c54f2 100644
--- a/engines/griffon/griffon.h
+++ b/engines/griffon/griffon.h
@@ -43,7 +43,7 @@
 
 #include "audio/mixer.h"
 
-#include "graphics/transparent_surface.h"
+#include "graphics/managed_surface.h"
 
 namespace Griffon {
 
@@ -395,8 +395,8 @@ private:
 	void addFloatIcon(int ico, float xloc, float yloc);
 	void addFloatText(const char *stri, float xloc, float yloc, int col);
 	void eventText(const char *stri);
-	void drawLine(Graphics::TransparentSurface *buffer, int x1, int y1, int x2, int y2, int col);
-	void drawString(Graphics::TransparentSurface *buffer, const char *stri, int xloc, int yloc, int col);
+	void drawLine(Graphics::ManagedSurface *buffer, int x1, int y1, int x2, int y2, int col);
+	void drawString(Graphics::ManagedSurface *buffer, const char *stri, int xloc, int yloc, int col);
 	void drawProgress(int w, int wm);
 
 	// input.cpp
@@ -414,7 +414,7 @@ private:
 
 	// resources.cpp
 	void initialize();
-	Graphics::TransparentSurface *loadImage(const char *name, bool colorkey = false);
+	Graphics::ManagedSurface *loadImage(const char *name, bool colorkey = false);
 	void loadMap(int mapnum);
 	void loadAnims();
 	void loadFont();
@@ -450,23 +450,23 @@ private:
 	bool hasFeature(EngineFeature f) const override;
 
 private:
-	Graphics::TransparentSurface *_video, *_videoBuffer, *_videoBuffer2, *_videoBuffer3;
+	Graphics::ManagedSurface *_video, *_videoBuffer, *_videoBuffer2, *_videoBuffer3;
 
 	// system
-	Graphics::TransparentSurface *_titleImg, *_titleImg2, *_inventoryImg;
-	Graphics::TransparentSurface *_logosImg, *_theEndImg;
+	Graphics::ManagedSurface *_titleImg, *_titleImg2, *_inventoryImg;
+	Graphics::ManagedSurface *_logosImg, *_theEndImg;
 	Common::Event _event;
 
-	Graphics::TransparentSurface *_mapBg, *_clipBg, *_clipBg2;
+	Graphics::ManagedSurface *_mapBg, *_clipBg, *_clipBg2;
 	unsigned int _clipSurround[4][4];
 
 	float _animSpeed; // CHECKME: it seems to always be 0.5
 	int _rampData[40][24];
 
 	int _curMap;
-	Graphics::TransparentSurface *_fontChr[224][5]; // 256 - 32
-	Graphics::TransparentSurface *_itemImg[21], *_windowImg;
-	Graphics::TransparentSurface *_spellImg;
+	Graphics::ManagedSurface *_fontChr[224][5]; // 256 - 32
+	Graphics::ManagedSurface *_itemImg[21], *_windowImg;
+	Graphics::ManagedSurface *_spellImg;
 
 	bool _itemSelOn;
 	int _curItem, _itemTicks;
@@ -482,7 +482,7 @@ private:
 	float _fp, _fps, _fpsr; // CHECKME: _fp and _fps seems to be integers
 	int _secsInGame, _secStart;
 
-	Graphics::TransparentSurface *mapImg[4];
+	Graphics::ManagedSurface *mapImg[4];
 
 	Common::Rect rcSrc, rcDest;
 
@@ -490,14 +490,14 @@ private:
 	bool _dontDrawOver;   // used in map24 so that the candles don't draw over the boss, default set to 0
 
 	// saveload info
-	Graphics::TransparentSurface *_saveLoadImg;
+	Graphics::ManagedSurface *_saveLoadImg;
 
 	// post info
 	float _postInfo[21][3];
 	int _postInfoNbr;
 
 	// cloud info
-	Graphics::TransparentSurface *_cloudImg;
+	Graphics::ManagedSurface *_cloudImg;
 	float _cloudAngle;
 	int _cloudsOn;
 
@@ -512,13 +512,13 @@ private:
 	int _asecstart;
 
 	// tile info
-	Graphics::TransparentSurface *_tiles[4];
+	Graphics::ManagedSurface *_tiles[4];
 	int _tileinfo[3][40][24][3]; // maplayer, x, y, tiledata (tile, tilelayer)
 
 	// animation info
-	Graphics::TransparentSurface *_anims[100];
+	Graphics::ManagedSurface *_anims[100];
 	// id number 0&1 = players
-	Graphics::TransparentSurface *_animsAttack[100];
+	Graphics::ManagedSurface *_animsAttack[100];
 	// attack anims
 	AttackOffsetStruct _playerAttackOfs[4][16];
 
diff --git a/engines/griffon/logic.cpp b/engines/griffon/logic.cpp
index e6e8833d2cb..27cbb19adf7 100644
--- a/engines/griffon/logic.cpp
+++ b/engines/griffon/logic.cpp
@@ -1307,7 +1307,7 @@ void GriffonEngine::updateSpells() {
 						}
 
 						if (xloc > -16 && xloc < 304 && yloc > -16 && yloc < 224) {
-							_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB((int)alf, 255, 255, 255));
+							_spellImg->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB((int)alf, 255, 255, 255));
 
 							if (_spellInfo[i].damagewho == 0) {
 								for (int e = 1; e <= _lastNpc; e++) {
@@ -1382,7 +1382,7 @@ void GriffonEngine::updateSpells() {
 				rcDest.left = xloc;
 				rcDest.top = yloc;
 
-				_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(255, 255, 255, 255));
+				_spellImg->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(255, 255, 255, 255));
 
 				_spellInfo[i].frame = _spellInfo[i].frame - 0.2 * _fpsr;
 				if (_spellInfo[i].frame < 0)
@@ -1504,7 +1504,7 @@ void GriffonEngine::updateSpells() {
 						rcDest.top = yloc;
 
 						if (xloc > -16 && xloc < 304 && yloc > -16 && yloc < 224) {
-							_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(255, 255, 255, 255));
+							_spellImg->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(255, 255, 255, 255));
 
 							if (scatter) {
 								if (_spellInfo[i].damagewho == 0) {
@@ -1583,7 +1583,7 @@ void GriffonEngine::updateSpells() {
 				if (fra > 24)
 					f = 192 * (1 - (fra - 24) / 8);
 
-				_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(f, 255, 255, 255));
+				_spellImg->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(f, 255, 255, 255));
 
 				_spellInfo[i].frame = _spellInfo[i].frame - 0.3 * _fpsr;
 				if (_spellInfo[i].frame < 0) {
@@ -1680,7 +1680,7 @@ void GriffonEngine::updateSpells() {
 						rcDest.left = xloc;
 						rcDest.top = yloc;
 
-						_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(alpha, 255, 255, 255));
+						_spellImg->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(alpha, 255, 255, 255));
 					}
 				} else {
 
@@ -1712,7 +1712,7 @@ void GriffonEngine::updateSpells() {
 							rcDest.left = xloc;
 							rcDest.top = yloc;
 
-							_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(192, 255, 255, 255));
+							_spellImg->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(192, 255, 255, 255));
 						}
 
 						if (xloc < -1 || yloc < -1 || xloc > 304 || yloc > 224)
@@ -2089,7 +2089,7 @@ void GriffonEngine::updateSpellsUnder() {
 				if (fra > 24)
 					f = 160 * (1 - (fra - 24) / 8);
 
-				_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(f, 255, 255, 255));
+				_spellImg->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(f, 255, 255, 255));
 
 				_spellInfo[i].frame = _spellInfo[i].frame - 0.2 * _fpsr;
 				if (_spellInfo[i].frame < 0)
@@ -2191,7 +2191,7 @@ void GriffonEngine::updateSpellsUnder() {
 							rcDest.top = (int)yloc;
 
 							if (xloc > -1 && xloc < 304 && yloc > -1 && yloc < 224) {
-								_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(alpha, 255, 255, 255));
+								_spellImg->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(alpha, 255, 255, 255));
 
 								int sx = (xloc / 2 + 4);
 								int sy = (yloc / 2 + 8);
@@ -2313,7 +2313,7 @@ void GriffonEngine::updateSpellsUnder() {
 					rcDest.top = yloc;
 
 					if (xloc > -16 && xloc < 320 && yloc > -16 && yloc < 240) {
-						_spellImg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(alpha, 255, 255, 255));
+						_spellImg->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(alpha, 255, 255, 255));
 
 						if (_spellInfo[i].damagewho == 1) {
 							float xdif = (xloc + 8) - (_player.px + 12);
diff --git a/engines/griffon/resources.cpp b/engines/griffon/resources.cpp
index 1e23c7504be..fb41ba76be6 100644
--- a/engines/griffon/resources.cpp
+++ b/engines/griffon/resources.cpp
@@ -66,19 +66,19 @@ void GriffonEngine::initialize() {
 		_floatIcon[i].ico = 0;
 	}
 
-	_video = new Graphics::TransparentSurface;
+	_video = new Graphics::ManagedSurface;
 	_video->create(320, 240, g_system->getScreenFormat());
-	_videoBuffer = new Graphics::TransparentSurface;
+	_videoBuffer = new Graphics::ManagedSurface;
 	_videoBuffer->create(320, 240, g_system->getScreenFormat());
-	_videoBuffer2 = new Graphics::TransparentSurface;
+	_videoBuffer2 = new Graphics::ManagedSurface;
 	_videoBuffer2->create(320, 240, g_system->getScreenFormat());
-	_videoBuffer3 = new Graphics::TransparentSurface;
+	_videoBuffer3 = new Graphics::ManagedSurface;
 	_videoBuffer3->create(320, 240, g_system->getScreenFormat());
-	_mapBg = new Graphics::TransparentSurface;
+	_mapBg = new Graphics::ManagedSurface;
 	_mapBg->create(320, 240, g_system->getScreenFormat());
-	_clipBg = new Graphics::TransparentSurface;
+	_clipBg = new Graphics::ManagedSurface;
 	_clipBg->create(320, 240, g_system->getScreenFormat());
-	_clipBg2 = new Graphics::TransparentSurface;
+	_clipBg2 = new Graphics::ManagedSurface;
 	_clipBg2->create(320, 240, g_system->getScreenFormat());
 
 	for (int i = 0; i <= 3; i++) {
@@ -89,13 +89,13 @@ void GriffonEngine::initialize() {
 	}
 
 	_cloudImg = loadImage("art/clouds.bmp", true);
-	_cloudImg->setAlpha(64, true);
+	_cloudImg->surfacePtr()->setAlpha(64, true);
 
 	_saveLoadImg = nullptr;
 
 	_titleImg = loadImage("art/titleb.bmp");
 	_titleImg2 = loadImage("art/titlea.bmp", true);
-	//_titleimg2->setAlpha(204, true);
+	//_titleimg2->surfacePtr()->setAlpha(204, true);
 
 	_inventoryImg = loadImage("art/inventory.bmp", true);
 
@@ -130,7 +130,7 @@ void GriffonEngine::initialize() {
 	setupAudio();
 }
 
-Graphics::TransparentSurface *GriffonEngine::loadImage(const char *name, bool colorkey) {
+Graphics::ManagedSurface *GriffonEngine::loadImage(const char *name, bool colorkey) {
 	Common::File file;
 
 	file.open(name);
@@ -144,10 +144,10 @@ Graphics::TransparentSurface *GriffonEngine::loadImage(const char *name, bool co
 	bitmapDecoder.loadStream(file);
 	file.close();
 
-	Graphics::TransparentSurface *surface = new Graphics::TransparentSurface(*bitmapDecoder.getSurface()->convertTo(g_system->getScreenFormat()));
+	Graphics::ManagedSurface *surface = new Graphics::ManagedSurface(bitmapDecoder.getSurface()->convertTo(g_system->getScreenFormat()));
 
 	if (colorkey)
-		surface->applyColorKey(255, 0, 255);
+		surface->surfacePtr()->applyColorKey(255, 0, 255);
 
 	return surface;
 }
@@ -321,7 +321,7 @@ void GriffonEngine::loadMap(int mapnum) {
 						}
 					}
 
-					_tiles[curtilel]->blit(*_mapBg, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(alpha, 255, 255, 255));
+					_tiles[curtilel]->blendBlitTo(*_mapBg, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(alpha, 255, 255, 255));
 
 					rcDest.left = x * 8;
 					rcDest.top = y * 8;
@@ -1087,10 +1087,10 @@ void GriffonEngine::loadAnims() {
 }
 
 void GriffonEngine::loadItemImgs() {
-	Graphics::TransparentSurface *temp = loadImage("art/icons.bmp", true);
+	Graphics::ManagedSurface *temp = loadImage("art/icons.bmp", true);
 
 	for (int i = 0; i <= 20; i++) {
-		_itemImg[i] = new Graphics::TransparentSurface;
+		_itemImg[i] = new Graphics::ManagedSurface;
 		_itemImg[i]->create(16, 16, g_system->getScreenFormat());
 
 		rcSrc.left = i * 16;
@@ -1098,20 +1098,20 @@ void GriffonEngine::loadItemImgs() {
 		rcSrc.setWidth(16);
 		rcSrc.setHeight(16);
 
-		temp->blit(*_itemImg[i], 0, 0, Graphics::FLIP_NONE, &rcSrc);
+		temp->blendBlitTo(*_itemImg[i], 0, 0, Graphics::FLIP_NONE, &rcSrc);
 	}
 
 	temp->free();
 }
 
 void GriffonEngine::loadFont() {
-	Graphics::TransparentSurface *font = loadImage("art/font.bmp", true);
+	Graphics::ManagedSurface *font = loadImage("art/font.bmp", true);
 
 	for (int i = 32; i <= 255; i++)
 		for (int f = 0; f <= 4; f++) {
 			int i2 = i - 32;
 
-			_fontChr[i2][f] = new Graphics::TransparentSurface;
+			_fontChr[i2][f] = new Graphics::ManagedSurface;
 			_fontChr[i2][f]->create(8, 8, g_system->getScreenFormat());
 
 			int col = i2 % 40;
@@ -1125,7 +1125,7 @@ void GriffonEngine::loadFont() {
 
 			rcDest.left = 0;
 			rcDest.top = 0;
-			font->blit(*_fontChr[i2][f], rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
+			font->blendBlitTo(*_fontChr[i2][f], rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc);
 		}
 
 	font->free();
diff --git a/engines/griffon/sound.cpp b/engines/griffon/sound.cpp
index 4c5f4d7b6d2..a1dea930d23 100644
--- a/engines/griffon/sound.cpp
+++ b/engines/griffon/sound.cpp
@@ -122,7 +122,7 @@ void GriffonEngine::setupAudio() {
 	const char *stri = "Loading...";
 	drawString(_videoBuffer, stri, 160 - 4 * strlen(stri), 116, 0);
 
-	Graphics::TransparentSurface *loadimg = loadImage("art/load.bmp", true);
+	Graphics::ManagedSurface *loadimg = loadImage("art/load.bmp", true);
 
 	rcSrc.left = 0;
 	rcSrc.top = 0;
@@ -132,7 +132,7 @@ void GriffonEngine::setupAudio() {
 	rcDest.left = 160 - 44;
 	rcDest.top = 116 + 12;
 
-	loadimg->blit(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(160, 255, 255, 255));
+	loadimg->blendBlitTo(*_videoBuffer, rcDest.left, rcDest.top, Graphics::FLIP_NONE, &rcSrc, MS_ARGB(160, 255, 255, 255));
 
 	g_system->copyRectToScreen(_videoBuffer->getPixels(), _videoBuffer->pitch, 0, 0, _videoBuffer->w, _videoBuffer->h);
 	g_system->updateScreen();


Commit: 5112d7969008cc7c5fa839d070e55d92a6867a96
    https://github.com/scummvm/scummvm/commit/5112d7969008cc7c5fa839d070e55d92a6867a96
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
GRAPHICS: Fixed ManagedSurface bug

Changed paths:
    graphics/managed_surface.cpp


diff --git a/graphics/managed_surface.cpp b/graphics/managed_surface.cpp
index f8c3ad379df..b51dafff0c1 100644
--- a/graphics/managed_surface.cpp
+++ b/graphics/managed_surface.cpp
@@ -747,8 +747,8 @@ Common::Rect ManagedSurface::blendBlitTo(Surface &target,
 										 const int width, const int height,
 										 const TSpriteBlendMode blend,
 										 const AlphaType alphaType) {
-	Common::Rect dstArea(posX, posY, posX + (width == -1 ? w : width), posY + (height == -1 ? h : height));
 	Common::Rect srcArea = srcRect ? *srcRect : Common::Rect(0, 0, w, h);
+	Common::Rect dstArea(posX, posY, posX + (width == -1 ? srcArea.width() : width), posY + (height == -1 ? srcArea.height() : height));
 	
 	if (!isBlendBlitPixelFormatSupported(format, target.format)) {
 		warning("ManagedSurface::blendBlitTo only accepts RGBA32!");


Commit: 9d354fc636e14e00d49fbcb59e790dc5237301bb
    https://github.com/scummvm/scummvm/commit/9d354fc636e14e00d49fbcb59e790dc5237301bb
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
GRAPHICS: Fixing blendBlitTo bugs

Changed paths:
    graphics/blit/blit-avx2.cpp
    graphics/blit/blit-neon.cpp
    graphics/blit/blit-sse2.cpp
    graphics/transparent_surface.cpp


diff --git a/graphics/blit/blit-avx2.cpp b/graphics/blit/blit-avx2.cpp
index 452c2f32808..d3b8e7cecd1 100644
--- a/graphics/blit/blit-avx2.cpp
+++ b/graphics/blit/blit-avx2.cpp
@@ -74,7 +74,7 @@ struct AlphaBlend {
         return _mm256_or_si256(dst, src);
     }
 
-    static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+    static inline void normal(const byte *in, byte *out, const byte ca, const byte cr, const byte cg, const byte cb) {
         uint32 ina = in[BlendBlit::kAIndex] * ca >> 8;
 
         if (ina != 0) {
@@ -130,7 +130,7 @@ struct MultiplyBlend {
         return _mm256_or_si256(dst, src);
     }
 
-    static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+    static inline void normal(const byte *in, byte *out, const byte ca, const byte cr, const byte cg, const byte cb) {
         uint32 ina = in[BlendBlit::kAIndex] * ca >> 8;
 
         if (ina != 0) {
@@ -147,7 +147,7 @@ struct OpaqueBlend {
         return _mm256_or_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
     }
 
-    static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+    static inline void normal(const byte *in, byte *out, const byte ca, const byte cr, const byte cg, const byte cb) {
         *(uint32 *)out = *(const uint32 *)in | BlendBlit::kAModMask;
     }
 };
@@ -161,7 +161,7 @@ struct BinaryBlend {
         return _mm256_or_si256(src, dst);
     }
 
-    static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+    static inline void normal(const byte *in, byte *out, const byte ca, const byte cr, const byte cg, const byte cb) {
         uint32 pix = *(const uint32 *)in;
         int a = in[BlendBlit::kAIndex];
 
@@ -225,7 +225,7 @@ struct AdditiveBlend {
         return _mm256_or_si256(dst, src);
     }
 
-    static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+    static inline void normal(const byte *in, byte *out, const byte ca, const byte cr, const byte cg, const byte cb) {
         uint32 ina = in[BlendBlit::kAIndex] * ca >> 8;
 
         if (ina != 0) {
@@ -254,7 +254,7 @@ struct SubtractiveBlend {
         return _mm256_or_si256(_mm256_set1_epi32(BlendBlit::kAModMask), _mm256_or_si256(srcb, _mm256_or_si256(srcg, srcr)));
     }
 
-    static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+    static inline void normal(const byte *in, byte *out, const byte ca, const byte cr, const byte cg, const byte cb) {
         out[BlendBlit::kAIndex] = 255;
         out[BlendBlit::kBIndex] = MAX<int32>(out[BlendBlit::kBIndex] - ((in[BlendBlit::kBIndex] * cb  * (out[BlendBlit::kBIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
         out[BlendBlit::kGIndex] = MAX<int32>(out[BlendBlit::kGIndex] - ((in[BlendBlit::kGIndex] * cg  * (out[BlendBlit::kGIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
@@ -327,7 +327,7 @@ static void blitInnerLoop(BlendBlit::Args &args) {
                 in = inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep;
             }
 
-            PixelFunc<doscale, rgbmod, alphamod>::normal(in, out, args.flipping & FLIP_H, ca, cr, cg, cb);
+            PixelFunc<doscale, rgbmod, alphamod>::normal(in, out, ca, cr, cg, cb);
             
             if (doscale)
                 scaleXCtr += args.scaleX;
diff --git a/graphics/blit/blit-neon.cpp b/graphics/blit/blit-neon.cpp
index f95577c9a46..e63e7f22774 100644
--- a/graphics/blit/blit-neon.cpp
+++ b/graphics/blit/blit-neon.cpp
@@ -74,7 +74,7 @@ struct AlphaBlend {
         return vorrq_u32(dst, src);
     }
 
-    static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+    static inline void normal(const byte *in, byte *out, const byte ca, const byte cr, const byte cg, const byte cb) {
         uint32 ina = in[BlendBlit::kAIndex] * ca >> 8;
 
         if (ina != 0) {
@@ -130,7 +130,7 @@ struct MultiplyBlend {
         return vorrq_u32(dst, src);
     }
 
-    static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+    static inline void normal(const byte *in, byte *out, const byte ca, const byte cr, const byte cg, const byte cb) {
         uint32 ina = in[BlendBlit::kAIndex] * ca >> 8;
 
         if (ina != 0) {
@@ -147,7 +147,7 @@ struct OpaqueBlend {
         return vorrq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
     }
 
-    static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+    static inline void normal(const byte *in, byte *out, const byte ca, const byte cr, const byte cg, const byte cb) {
         *(uint32 *)out = *(const uint32 *)in | BlendBlit::kAModMask;
     }
 };
@@ -161,7 +161,7 @@ struct BinaryBlend {
         return vorrq_u32(dst, src);
     }
 
-    static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+    static inline void normal(const byte *in, byte *out, const byte ca, const byte cr, const byte cg, const byte cb) {
         uint32 pix = *(const uint32 *)in;
         int a = in[BlendBlit::kAIndex];
 
@@ -225,7 +225,7 @@ struct AdditiveBlend {
         return vorrq_u32(dst, src);
     }
 
-    static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+    static inline void normal(const byte *in, byte *out, const byte ca, const byte cr, const byte cg, const byte cb) {
         uint32 ina = in[BlendBlit::kAIndex] * ca >> 8;
 
         if (ina != 0) {
@@ -254,7 +254,7 @@ struct SubtractiveBlend {
         return vorrq_u32(vmovq_n_u32(BlendBlit::kAModMask), vorrq_u32(srcb, vorrq_u32(srcg, srcr)));
     }
 
-    static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+    static inline void normal(const byte *in, byte *out, const byte ca, const byte cr, const byte cg, const byte cb) {
         out[BlendBlit::kAIndex] = 255;
         out[BlendBlit::kBIndex] = MAX<int32>(out[BlendBlit::kBIndex] - ((in[BlendBlit::kBIndex] * cb  * (out[BlendBlit::kBIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
         out[BlendBlit::kGIndex] = MAX<int32>(out[BlendBlit::kGIndex] - ((in[BlendBlit::kGIndex] * cg  * (out[BlendBlit::kGIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
@@ -325,7 +325,7 @@ static inline void blitInnerLoop(BlendBlit::Args &args) {
                 in = inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep;
             }
 
-            PixelFunc<doscale, rgbmod, alphamod>::normal(in, out, args.flipping & FLIP_H, ca, cr, cg, cb);
+            PixelFunc<doscale, rgbmod, alphamod>::normal(in, out, ca, cr, cg, cb);
             
             if (doscale)
                 scaleXCtr += args.scaleX;
diff --git a/graphics/blit/blit-sse2.cpp b/graphics/blit/blit-sse2.cpp
index fa30773b5ac..149848454e2 100644
--- a/graphics/blit/blit-sse2.cpp
+++ b/graphics/blit/blit-sse2.cpp
@@ -80,7 +80,7 @@ struct AlphaBlend {
         return _mm_or_si128(dst, src);
     }
 
-    static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+    static inline void normal(const byte *in, byte *out, const byte ca, const byte cr, const byte cg, const byte cb) {
         uint32 ina = in[BlendBlit::kAIndex] * ca >> 8;
 
         if (ina != 0) {
@@ -136,7 +136,7 @@ struct MultiplyBlend {
         return _mm_or_si128(dst, src);
     }
 
-    static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+    static inline void normal(const byte *in, byte *out, const byte ca, const byte cr, const byte cg, const byte cb) {
         uint32 ina = in[BlendBlit::kAIndex] * ca >> 8;
 
         if (ina != 0) {
@@ -153,7 +153,7 @@ struct OpaqueBlend {
         return _mm_or_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
     }
 
-    static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+    static inline void normal(const byte *in, byte *out, const byte ca, const byte cr, const byte cg, const byte cb) {
         *(uint32 *)out = *(const uint32 *)in | BlendBlit::kAModMask;
     }
 };
@@ -167,7 +167,7 @@ struct BinaryBlend {
         return _mm_or_si128(src, dst);
     }
 
-    static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+    static inline void normal(const byte *in, byte *out, const byte ca, const byte cr, const byte cg, const byte cb) {
         uint32 pix = *(const uint32 *)in;
         int a = in[BlendBlit::kAIndex];
 
@@ -231,7 +231,7 @@ struct AdditiveBlend {
         return _mm_or_si128(dst, src);
     }
 
-    static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+    static inline void normal(const byte *in, byte *out, const byte ca, const byte cr, const byte cg, const byte cb) {
         uint32 ina = in[BlendBlit::kAIndex] * ca >> 8;
 
         if (ina != 0) {
@@ -260,7 +260,7 @@ struct SubtractiveBlend {
         return _mm_or_si128(_mm_set1_epi32(BlendBlit::kAModMask), _mm_or_si128(srcb, _mm_or_si128(srcg, srcr)));
     }
 
-    static inline void normal(const byte *in, byte *out, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+    static inline void normal(const byte *in, byte *out, const byte ca, const byte cr, const byte cg, const byte cb) {
         out[BlendBlit::kAIndex] = 255;
         out[BlendBlit::kBIndex] = MAX<int32>(out[BlendBlit::kBIndex] - ((in[BlendBlit::kBIndex] * cb  * (out[BlendBlit::kBIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
         out[BlendBlit::kGIndex] = MAX<int32>(out[BlendBlit::kGIndex] - ((in[BlendBlit::kGIndex] * cg  * (out[BlendBlit::kGIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
@@ -328,7 +328,7 @@ static inline void blitInnerLoop(BlendBlit::Args &args) {
                 in = inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep;
             }
 
-            PixelFunc<doscale, rgbmod, alphamod>::normal(in, out, args.flipping & FLIP_H, ca, cr, cg, cb);
+            PixelFunc<doscale, rgbmod, alphamod>::normal(in, out, ca, cr, cg, cb);
             
             if (doscale)
                 scaleXCtr += args.scaleX;
diff --git a/graphics/transparent_surface.cpp b/graphics/transparent_surface.cpp
index 45f8442a6e4..5af183de056 100644
--- a/graphics/transparent_surface.cpp
+++ b/graphics/transparent_surface.cpp
@@ -82,10 +82,10 @@ Common::Rect TransparentSurface::blit(Graphics::Surface &target, int posX, int p
 	}
 
 	if (width == -1) {
-		width = srcImage.w;
+		width = srcW;
 	}
 	if (height == -1) {
-		height = srcImage.h;
+		height = srcH;
 	}
 
 	int scaleX = BlendBlit::getScaleFactor(srcW, width), scaleXoff = 0;
@@ -182,10 +182,10 @@ Common::Rect TransparentSurface::blitClip(Graphics::Surface &target, Common::Rec
 	}
 
 	if (width == -1) {
-		width = srcImage.w;
+		width = srcW;
 	}
 	if (height == -1) {
-		height = srcImage.h;
+		height = srcH;
 	}
 
 	int scaleX = BlendBlit::getScaleFactor(srcW, width), scaleXoff = 0;;


Commit: 1b752d6a6205bf8a2fe04ac4dc3979c159d54876
    https://github.com/scummvm/scummvm/commit/1b752d6a6205bf8a2fe04ac4dc3979c159d54876
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
BUILD: BlendBlit SIMD only compiles when needed

Changed paths:
    graphics/module.mk


diff --git a/graphics/module.mk b/graphics/module.mk
index cca8ebdf67f..4a041054ac0 100644
--- a/graphics/module.mk
+++ b/graphics/module.mk
@@ -3,9 +3,6 @@ MODULE := graphics
 MODULE_OBJS := \
 	big5.o \
 	blit/blit.o \
-	blit/blit-sse2.o \
-	blit/blit-avx2.o \
-	blit/blit-neon.o \
 	blit/blit-alpha.o \
 	blit/blit-scale.o \
 	cursorman.o \
@@ -143,12 +140,18 @@ endif
 endif
 
 ifeq ($(SCUMMVM_NEON),1)
+MODULE_OBJS += \
+	blit/blit-neon.o
 $(MODULE)/blit/blit-neon.o: CXXFLAGS += -mfpu=neon
 endif
 ifeq ($(SCUMMVM_SSE2),1)
+MODULE_OBJS += \
+	blit/blit-sse2.o
 $(MODULE)/blit/blit-sse2.o: CXXFLAGS += -msse2
 endif
 ifeq ($(SCUMMVM_AVX2),1)
+MODULE_OBJS += \
+	blit/blit-avx2.o
 $(MODULE)/blit/blit-avx2.o: CXXFLAGS += -mavx2
 endif
 


Commit: 41815c98168008552eeaa85deb8b8af2870285a3
    https://github.com/scummvm/scummvm/commit/41815c98168008552eeaa85deb8b8af2870285a3
Author: Wyatt Radkiewicz (wyattradkiewicz at Wyatts-MacBook-Air.local)
Date: 2023-08-13T00:22:10+02:00

Commit Message:
GRAPHICS: Fix BlendBlit additive blending mode

Changed paths:
    graphics/blit/blit-avx2.cpp
    graphics/blit/blit-neon.cpp
    graphics/blit/blit-sse2.cpp


diff --git a/graphics/blit/blit-avx2.cpp b/graphics/blit/blit-avx2.cpp
index d3b8e7cecd1..5e0bf936953 100644
--- a/graphics/blit/blit-avx2.cpp
+++ b/graphics/blit/blit-avx2.cpp
@@ -20,7 +20,6 @@
  */
 
 #include "common/scummsys.h"
-#ifdef SCUMMVM_AVX2
 #include <immintrin.h>
 
 #include "graphics/blit.h"
@@ -194,7 +193,7 @@ struct AdditiveBlend {
             srcg = _mm256_and_si256(_mm256_add_epi32(dstg, _mm256_mullo_epi32(srcg, _mm256_mullo_epi32(_mm256_set1_epi32(cg), ina))), _mm256_set1_epi32(BlendBlit::kGModMask));
             srcr = _mm256_and_si256(_mm256_add_epi32(dstr, _mm256_srli_epi32(_mm256_mullo_epi32(srcr, _mm256_mullo_epi32(_mm256_set1_epi32(cr), ina)), BlendBlit::kRModShift - 16)), _mm256_set1_epi32(BlendBlit::kRModMask));
 
-            src = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
+            src = _mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kAModMask));
             src = _mm256_or_si256(src, _mm256_or_si256(srcb, _mm256_or_si256(srcg, srcb)));
         } else if (alphamod) {
             __m256i srcg = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask));
@@ -205,7 +204,7 @@ struct AdditiveBlend {
             srcg = _mm256_and_si256(_mm256_add_epi32(dstg, _mm256_srli_epi32(_mm256_mullo_epi32(srcg, ina), 8)), _mm256_set1_epi32(BlendBlit::kGModMask));
             srcrb = _mm256_and_si256(_mm256_add_epi32(dstrb, _mm256_mullo_epi32(srcrb, ina)), _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask));
 
-            src = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
+            src = _mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kAModMask));
             src = _mm256_or_si256(src, _mm256_or_si256(srcrb, srcg));
         } else {
             __m256i srcg = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask));
@@ -216,7 +215,7 @@ struct AdditiveBlend {
             srcg = _mm256_and_si256(_mm256_add_epi32(dstg, srcg), _mm256_set1_epi32(BlendBlit::kGModMask));
             srcrb = _mm256_and_si256(_mm256_slli_epi32(_mm256_add_epi32(dstrb, srcrb), 8), _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask));
 
-            src = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
+            src = _mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kAModMask));
             src = _mm256_or_si256(src, _mm256_or_si256(srcrb, srcg));
         }
 
@@ -466,5 +465,3 @@ void BlendBlit::blitAVX2(Args &args, const TSpriteBlendMode &blendMode, const Al
 }
 
 } // End of namespace Graphics
-
-#endif // SCUMMVM_AVX2
diff --git a/graphics/blit/blit-neon.cpp b/graphics/blit/blit-neon.cpp
index e63e7f22774..4a9afb8c134 100644
--- a/graphics/blit/blit-neon.cpp
+++ b/graphics/blit/blit-neon.cpp
@@ -20,7 +20,6 @@
  */
 
 #include "common/scummsys.h"
-#ifdef SCUMMVM_NEON
 #include <arm_neon.h>
 
 #include "graphics/blit.h"
@@ -194,7 +193,7 @@ struct AdditiveBlend {
             srcg = vandq_u32(vaddq_u32(dstg, vmulq_u32(srcg, vmulq_u32(vmovq_n_u32(cg), ina))), vmovq_n_u32(BlendBlit::kGModMask));
             srcr = vandq_u32(vaddq_u32(dstr, vshrq_n_u32(vmulq_u32(srcr, vmulq_u32(vmovq_n_u32(cr), ina)), BlendBlit::kRModShift - 16)), vmovq_n_u32(BlendBlit::kRModMask));
 
-            src = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
+            src = vandq_u32(dst, vmovq_n_u32(BlendBlit::kAModMask));
             src = vorrq_u32(src, vorrq_u32(srcb, vorrq_u32(srcg, srcr)));
         } else if (alphamod) {
             uint32x4_t srcg = vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask));
@@ -205,7 +204,7 @@ struct AdditiveBlend {
             srcg = vandq_u32(vaddq_u32(dstg, vshrq_n_u32(vmulq_u32(srcg, ina), 8)), vmovq_n_u32(BlendBlit::kGModMask));
             srcrb = vandq_u32(vaddq_u32(dstrb, vmulq_u32(srcrb, ina)), vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask));
 
-            src = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
+            src = vandq_u32(dst, vmovq_n_u32(BlendBlit::kAModMask));
             src = vorrq_u32(src, vorrq_u32(srcrb, srcg));
         } else {
             uint32x4_t srcg = vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask));
@@ -216,7 +215,7 @@ struct AdditiveBlend {
             srcg = vandq_u32(vaddq_u32(dstg, srcg), vmovq_n_u32(BlendBlit::kGModMask));
             srcrb = vandq_u32(vshlq_n_u32(vaddq_u32(dstrb, srcrb), 8), vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask));
 
-            src = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
+            src = vandq_u32(dst, vmovq_n_u32(BlendBlit::kAModMask));
             src = vorrq_u32(src, vorrq_u32(srcrb, srcg));
         }
 
@@ -464,5 +463,3 @@ void BlendBlit::blitNEON(Args &args, const TSpriteBlendMode &blendMode, const Al
 }
 
 } // end of namespace Graphics
-
-#endif // SCUMMVM_NEON
diff --git a/graphics/blit/blit-sse2.cpp b/graphics/blit/blit-sse2.cpp
index 149848454e2..62fac9c11d6 100644
--- a/graphics/blit/blit-sse2.cpp
+++ b/graphics/blit/blit-sse2.cpp
@@ -20,7 +20,6 @@
  */
 
 #include "common/scummsys.h"
-#ifdef SCUMMVM_SSE2
 #include <immintrin.h>
 
 #include "graphics/blit.h"
@@ -200,7 +199,7 @@ struct AdditiveBlend {
             srcg = _mm_and_si128(_mm_add_epi32(dstg, sse2_mul32(srcg, sse2_mul32(_mm_set1_epi32(cg), ina))), _mm_set1_epi32(BlendBlit::kGModMask));
             srcr = _mm_and_si128(_mm_add_epi32(dstr, _mm_srli_epi32(sse2_mul32(srcr, sse2_mul32(_mm_set1_epi32(cr), ina)), BlendBlit::kRModShift - 16)), _mm_set1_epi32(BlendBlit::kRModMask));
 
-            src = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
+            src = _mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kAModMask));
             src = _mm_or_si128(src, _mm_or_si128(srcb, _mm_or_si128(srcg, srcr)));
         } else if (alphamod) {
             __m128i srcg = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask));
@@ -211,7 +210,7 @@ struct AdditiveBlend {
             srcg = _mm_and_si128(_mm_add_epi32(dstg, _mm_srli_epi32(sse2_mul32(srcg, ina), 8)), _mm_set1_epi32(BlendBlit::kGModMask));
             srcrb = _mm_and_si128(_mm_add_epi32(dstrb, sse2_mul32(srcrb, ina)), _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask));
 
-            src = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
+            src = _mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kAModMask));
             src = _mm_or_si128(src, _mm_or_si128(srcrb, srcg));
         } else {
             __m128i srcg = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask));
@@ -222,7 +221,7 @@ struct AdditiveBlend {
             srcg = _mm_and_si128(_mm_add_epi32(dstg, srcg), _mm_set1_epi32(BlendBlit::kGModMask));
             srcrb = _mm_and_si128(_mm_slli_epi32(_mm_add_epi32(dstrb, srcrb), 8), _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask));
 
-            src = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
+            src = _mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kAModMask));
             src = _mm_or_si128(src, _mm_or_si128(srcrb, srcg));
         }
 
@@ -467,5 +466,3 @@ void BlendBlit::blitSSE2(Args &args, const TSpriteBlendMode &blendMode, const Al
 }
 
 } // End of namespace Graphics
-
-#endif // SSE2