[Scummvm-git-logs] scummvm master -> ec53c5ea87c7350eded25e6333c77bbbd3519a8d

Sat Sep 28 23:19:46 UTC 2024

This automated email contains information about 3 new commits which have been
pushed to the 'scummvm' repo located at https://github.com/scummvm/scummvm .

Summary:
433d6804f7 GRAPHICS: Make the blend structs into objects on the stack
c41edfd2e5 GRAPHICS: Some optimisations for the alpha blending routines
ec53c5ea87 GRAPHICS: Simplify the generic blending routines


Commit: 433d6804f714b427f51d3900ff375f72c890339a
    https://github.com/scummvm/scummvm/commit/433d6804f714b427f51d3900ff375f72c890339a
Author: Cameron Cawley (ccawley2011 at gmail.com)
Date: 2024-09-29T02:19:42+03:00

Commit Message:
GRAPHICS: Make the blend structs into objects on the stack

Changed paths:
    graphics/blit/blit-alpha.h
    graphics/blit/blit-avx2.cpp
    graphics/blit/blit-generic.cpp
    graphics/blit/blit-neon.cpp
    graphics/blit/blit-sse2.cpp

diff --git a/graphics/blit/blit-alpha.h b/graphics/blit/blit-alpha.h
index 0abc3094895..0fbe93bd1da 100644
--- a/graphics/blit/blit-alpha.h
+++ b/graphics/blit/blit-alpha.h
@@ -27,10 +27,26 @@ class BlendBlitImpl_Base {
 	friend class BlendBlit;
 protected:
 
-template<bool doscale, bool rgbmod, bool alphamod>
-struct AlphaBlend {
-	static inline void normal(const byte *in, byte *out, const byte ca, const byte cr, const byte cg, const byte cb) {
-		uint32 ina = in[BlendBlit::kAIndex] * ca >> 8;
+template<bool rgbmod, bool alphamod>
+struct BaseBlend {
+public:
+	constexpr BaseBlend(const uint32 color) :
+		ca(alphamod ? ((color >> BlendBlit::kAModShift) & 0xFF) : 255),
+		cr(rgbmod   ? ((color >> BlendBlit::kRModShift) & 0xFF) : 255),
+		cg(rgbmod   ? ((color >> BlendBlit::kGModShift) & 0xFF) : 255),
+		cb(rgbmod   ? ((color >> BlendBlit::kBModShift) & 0xFF) : 255) {}
+
+protected:
+	const byte ca, cr, cg, cb;
+};
+
+template<bool rgbmod, bool alphamod>
+struct AlphaBlend : public BaseBlend<rgbmod, alphamod> {
+public:
+	constexpr AlphaBlend(const uint32 color) : BaseBlend<rgbmod, alphamod>(color) {}
+
+	inline void normal(const byte *in, byte *out) const {
+		uint32 ina = in[BlendBlit::kAIndex] * this->ca >> 8;
 
 		if (ina != 0) {
 			if (rgbmod) {
@@ -39,9 +55,9 @@ struct AlphaBlend {
 				const uint outr = (out[BlendBlit::kRIndex] * (255 - ina) >> 8);
 
 				out[BlendBlit::kAIndex] = 255;
-				out[BlendBlit::kBIndex] = outb + (in[BlendBlit::kBIndex] * ina * cb >> 16);
-				out[BlendBlit::kGIndex] = outg + (in[BlendBlit::kGIndex] * ina * cg >> 16);
-				out[BlendBlit::kRIndex] = outr + (in[BlendBlit::kRIndex] * ina * cr >> 16);
+				out[BlendBlit::kBIndex] = outb + (in[BlendBlit::kBIndex] * ina * this->cb >> 16);
+				out[BlendBlit::kGIndex] = outg + (in[BlendBlit::kGIndex] * ina * this->cg >> 16);
+				out[BlendBlit::kRIndex] = outr + (in[BlendBlit::kRIndex] * ina * this->cr >> 16);
 			} else {
 				out[BlendBlit::kAIndex] = 255;
 				out[BlendBlit::kBIndex] = (out[BlendBlit::kBIndex] * (255 - ina) + in[BlendBlit::kBIndex] * ina) >> 8;
@@ -53,29 +69,38 @@ struct AlphaBlend {
 	}
 };
 
-template<bool doscale, bool rgbmod, bool alphamod>
-struct MultiplyBlend {
-	static inline void normal(const byte *in, byte *out, const byte ca, const byte cr, const byte cg, const byte cb) {
-		uint32 ina = in[BlendBlit::kAIndex] * ca >> 8;
+template<bool rgbmod, bool alphamod>
+struct MultiplyBlend : public BaseBlend<rgbmod, alphamod> {
+public:
+	constexpr MultiplyBlend(const uint32 color) : BaseBlend<rgbmod, alphamod>(color) {}
+
+	inline void normal(const byte *in, byte *out) const {
+		uint32 ina = in[BlendBlit::kAIndex] * this->ca >> 8;
 
 		if (ina != 0) {
-			out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] * ((in[BlendBlit::kBIndex] * cb * ina) >> 16) >> 8;
-			out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] * ((in[BlendBlit::kGIndex] * cg * ina) >> 16) >> 8;
-			out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] * ((in[BlendBlit::kRIndex] * cr * ina) >> 16) >> 8;
+			out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] * ((in[BlendBlit::kBIndex] * this->cb * ina) >> 16) >> 8;
+			out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] * ((in[BlendBlit::kGIndex] * this->cg * ina) >> 16) >> 8;
+			out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] * ((in[BlendBlit::kRIndex] * this->cr * ina) >> 16) >> 8;
 		}
 	}
 };
 
-template<bool doscale, bool rgbmod, bool alphamod>
-struct OpaqueBlend {
-	static inline void normal(const byte *in, byte *out, const byte ca, const byte cr, const byte cg, const byte cb) {
+template<bool rgbmod, bool alphamod>
+struct OpaqueBlend : public BaseBlend<rgbmod, alphamod> {
+public:
+	constexpr OpaqueBlend(const uint32 color) : BaseBlend<rgbmod, alphamod>(color) {}
+
+	inline void normal(const byte *in, byte *out) const {
 		*(uint32 *)out = *(const uint32 *)in | BlendBlit::kAModMask;
 	}
 };
 
-template<bool doscale, bool rgbmod, bool alphamod>
-struct BinaryBlend {
-	static inline void normal(const byte *in, byte *out, const byte ca, const byte cr, const byte cg, const byte cb) {
+template<bool rgbmod, bool alphamod>
+struct BinaryBlend : public BaseBlend<rgbmod, alphamod> {
+public:
+	constexpr BinaryBlend(const uint32 color) : BaseBlend<rgbmod, alphamod>(color) {}
+
+	inline void normal(const byte *in, byte *out) const {
 		uint32 pix = *(const uint32 *)in;
 		int a = in[BlendBlit::kAIndex];
 
@@ -86,26 +111,32 @@ struct BinaryBlend {
 	}
 };
 
-template<bool doscale, bool rgbmod, bool alphamod>
-struct AdditiveBlend {
-	static inline void normal(const byte *in, byte *out, const byte ca, const byte cr, const byte cg, const byte cb) {
-		uint32 ina = in[BlendBlit::kAIndex] * ca >> 8;
+template<bool rgbmod, bool alphamod>
+struct AdditiveBlend : public BaseBlend<rgbmod, alphamod> {
+public:
+	constexpr AdditiveBlend(const uint32 color) : BaseBlend<rgbmod, alphamod>(color) {}
+
+	inline void normal(const byte *in, byte *out) const {
+		uint32 ina = in[BlendBlit::kAIndex] * this->ca >> 8;
 
 		if (ina != 0) {
-			out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] + ((in[BlendBlit::kBIndex] * cb * ina) >> 16);
-			out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] + ((in[BlendBlit::kGIndex] * cg * ina) >> 16);
-			out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] + ((in[BlendBlit::kRIndex] * cr * ina) >> 16);
+			out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] + ((in[BlendBlit::kBIndex] * this->cb * ina) >> 16);
+			out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] + ((in[BlendBlit::kGIndex] * this->cg * ina) >> 16);
+			out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] + ((in[BlendBlit::kRIndex] * this->cr * ina) >> 16);
 		}
 	}
 };
 
-template<bool doscale, bool rgbmod, bool alphamod>
-struct SubtractiveBlend {
-	static inline void normal(const byte *in, byte *out, const byte ca, const byte cr, const byte cg, const byte cb) {
+template<bool rgbmod, bool alphamod>
+struct SubtractiveBlend : public BaseBlend<rgbmod, alphamod> {
+public:
+	constexpr SubtractiveBlend(const uint32 color) : BaseBlend<rgbmod, alphamod>(color) {}
+
+	inline void normal(const byte *in, byte *out) const {
 		out[BlendBlit::kAIndex] = 255;
-		out[BlendBlit::kBIndex] = MAX<int32>(out[BlendBlit::kBIndex] - ((in[BlendBlit::kBIndex] * cb  * (out[BlendBlit::kBIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
-		out[BlendBlit::kGIndex] = MAX<int32>(out[BlendBlit::kGIndex] - ((in[BlendBlit::kGIndex] * cg  * (out[BlendBlit::kGIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
-		out[BlendBlit::kRIndex] = MAX<int32>(out[BlendBlit::kRIndex] - ((in[BlendBlit::kRIndex] * cr *  (out[BlendBlit::kRIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
+		out[BlendBlit::kBIndex] = MAX<int32>(out[BlendBlit::kBIndex] - ((in[BlendBlit::kBIndex] * this->cb * (out[BlendBlit::kBIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
+		out[BlendBlit::kGIndex] = MAX<int32>(out[BlendBlit::kGIndex] - ((in[BlendBlit::kGIndex] * this->cg * (out[BlendBlit::kGIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
+		out[BlendBlit::kRIndex] = MAX<int32>(out[BlendBlit::kRIndex] - ((in[BlendBlit::kRIndex] * this->cr * (out[BlendBlit::kRIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
 	}
 };
 
@@ -117,114 +148,114 @@ void BlendBlit::blitT(Args &args, const TSpriteBlendMode &blendMode, const Alpha
 	bool alphamod = ((args.color & kAModMask)   != kAModMask);
 	if (args.scaleX == SCALE_THRESHOLD && args.scaleY == SCALE_THRESHOLD) {
 		if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_OPAQUE) {
-			T::template blitInnerLoop<T::template OpaqueBlend, false, false, false, false, true>(args);
+			T::template blitInnerLoop<T::template OpaqueBlend, false, false, false>(args);
 		} else if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_BINARY) {
-			T::template blitInnerLoop<T::template BinaryBlend, false, false, false, false, true>(args);
+			T::template blitInnerLoop<T::template BinaryBlend, false, false, false>(args);
 		} else {
 			if (blendMode == BLEND_ADDITIVE) {
 				if (rgbmod) {
 					if (alphamod) {
-						T::template blitInnerLoop<T::template AdditiveBlend, false, true, true, false, true>(args);
+						T::template blitInnerLoop<T::template AdditiveBlend, false, true, true>(args);
 					} else {
-						T::template blitInnerLoop<T::template AdditiveBlend, false, true, false, false, true>(args);
+						T::template blitInnerLoop<T::template AdditiveBlend, false, true, false>(args);
 					}
 				} else {
 					if (alphamod) {
-						T::template blitInnerLoop<T::template AdditiveBlend, false, false, true, false, true>(args);
+						T::template blitInnerLoop<T::template AdditiveBlend, false, false, true>(args);
 					} else {
-						T::template blitInnerLoop<T::template AdditiveBlend, false, false, false, false, true>(args);
+						T::template blitInnerLoop<T::template AdditiveBlend, false, false, false>(args);
 					}
 				}
 			} else if (blendMode == BLEND_SUBTRACTIVE) {
 				if (rgbmod) {
-					T::template blitInnerLoop<T::template SubtractiveBlend, false, true, false, false, true>(args);
+					T::template blitInnerLoop<T::template SubtractiveBlend, false, true, false>(args);
 				} else {
-					T::template blitInnerLoop<T::template SubtractiveBlend, false, false, false, false, true>(args);
+					T::template blitInnerLoop<T::template SubtractiveBlend, false, false, false>(args);
 				}
 			} else if (blendMode == BLEND_MULTIPLY) {
 				if (rgbmod) {
 					if (alphamod) {
-						T::template blitInnerLoop<T::template MultiplyBlend, false, true, true, false, true>(args);
+						T::template blitInnerLoop<T::template MultiplyBlend, false, true, true>(args);
 					} else {
-						T::template blitInnerLoop<T::template MultiplyBlend, false, true, false, false, true>(args);
+						T::template blitInnerLoop<T::template MultiplyBlend, false, true, false>(args);
 					}
 				} else {
 					if (alphamod) {
-						T::template blitInnerLoop<T::template MultiplyBlend, false, false, true, false, true>(args);
+						T::template blitInnerLoop<T::template MultiplyBlend, false, false, true>(args);
 					} else {
-						T::template blitInnerLoop<T::template MultiplyBlend, false, false, false, false, true>(args);
+						T::template blitInnerLoop<T::template MultiplyBlend, false, false, false>(args);
 					}
 				}
 			} else {
 				assert(blendMode == BLEND_NORMAL);
 				if (rgbmod) {
 					if (alphamod) {
-						T::template blitInnerLoop<T::template AlphaBlend, false, true, true, false, true>(args);
+						T::template blitInnerLoop<T::template AlphaBlend, false, true, true>(args);
 					} else {
-						T::template blitInnerLoop<T::template AlphaBlend, false, true, false, false, true>(args);
+						T::template blitInnerLoop<T::template AlphaBlend, false, true, false>(args);
 					}
 				} else {
 					if (alphamod) {
-						T::template blitInnerLoop<T::template AlphaBlend, false, false, true, false, true>(args);
+						T::template blitInnerLoop<T::template AlphaBlend, false, false, true>(args);
 					} else {
-						T::template blitInnerLoop<T::template AlphaBlend, false, false, false, false, true>(args);
+						T::template blitInnerLoop<T::template AlphaBlend, false, false, false>(args);
 					}
 				}
 			}
 		}
 	} else {
 		if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_OPAQUE) {
-			T::template blitInnerLoop<T::template OpaqueBlend, true, false, false, false, true>(args);
+			T::template blitInnerLoop<T::template OpaqueBlend, true, false, false>(args);
 		} else if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_BINARY) {
-			T::template blitInnerLoop<T::template BinaryBlend, true, false, false, false, true>(args);
+			T::template blitInnerLoop<T::template BinaryBlend, true, false, false>(args);
 		} else {
 			if (blendMode == BLEND_ADDITIVE) {
 				if (rgbmod) {
 					if (alphamod) {
-						T::template blitInnerLoop<T::template AdditiveBlend, true, true, true, false, true>(args);
+						T::template blitInnerLoop<T::template AdditiveBlend, true, true, true>(args);
 					} else {
-						T::template blitInnerLoop<T::template AdditiveBlend, true, true, false, false, true>(args);
+						T::template blitInnerLoop<T::template AdditiveBlend, true, true, false>(args);
 					}
 				} else {
 					if (alphamod) {
-						T::template blitInnerLoop<T::template AdditiveBlend, true, false, true, false, true>(args);
+						T::template blitInnerLoop<T::template AdditiveBlend, true, false, true>(args);
 					} else {
-						T::template blitInnerLoop<T::template AdditiveBlend, true, false, false, false, true>(args);
+						T::template blitInnerLoop<T::template AdditiveBlend, true, false, false>(args);
 					}
 				}
 			} else if (blendMode == BLEND_SUBTRACTIVE) {
 				if (rgbmod) {
-					T::template blitInnerLoop<T::template SubtractiveBlend, true, true, false, false, true>(args);
+					T::template blitInnerLoop<T::template SubtractiveBlend, true, true, false>(args);
 				} else {
-					T::template blitInnerLoop<T::template SubtractiveBlend, true, false, false, false, true>(args);
+					T::template blitInnerLoop<T::template SubtractiveBlend, true, false, false>(args);
 				}
 			} else if (blendMode == BLEND_MULTIPLY) {
 				if (rgbmod) {
 					if (alphamod) {
-						T::template blitInnerLoop<T::template MultiplyBlend, true, true, true, false, true>(args);
+						T::template blitInnerLoop<T::template MultiplyBlend, true, true, true>(args);
 					} else {
-						T::template blitInnerLoop<T::template MultiplyBlend, true, true, false, false, true>(args);
+						T::template blitInnerLoop<T::template MultiplyBlend, true, true, false>(args);
 					}
 				} else {
 					if (alphamod) {
-						T::template blitInnerLoop<T::template MultiplyBlend, true, false, true, false, true>(args);
+						T::template blitInnerLoop<T::template MultiplyBlend, true, false, true>(args);
 					} else {
-						T::template blitInnerLoop<T::template MultiplyBlend, true, false, false, false, true>(args);
+						T::template blitInnerLoop<T::template MultiplyBlend, true, false, false>(args);
 					}
 				}
 			} else {
 				assert(blendMode == BLEND_NORMAL);
 				if (rgbmod) {
 					if (alphamod) {
-						T::template blitInnerLoop<T::template AlphaBlend, true, true, true, false, true>(args);
+						T::template blitInnerLoop<T::template AlphaBlend, true, true, true>(args);
 					} else {
-						T::template blitInnerLoop<T::template AlphaBlend, true, true, false, false, true>(args);
+						T::template blitInnerLoop<T::template AlphaBlend, true, true, false>(args);
 					}
 				} else {
 					if (alphamod) {
-						T::template blitInnerLoop<T::template AlphaBlend, true, false, true, false, true>(args);
+						T::template blitInnerLoop<T::template AlphaBlend, true, false, true>(args);
 					} else {
-						T::template blitInnerLoop<T::template AlphaBlend, true, false, false, false, true>(args);
+						T::template blitInnerLoop<T::template AlphaBlend, true, false, false>(args);
 					}
 				}
 			}
diff --git a/graphics/blit/blit-avx2.cpp b/graphics/blit/blit-avx2.cpp
index 0f3134e1cdd..45f36121e90 100644
--- a/graphics/blit/blit-avx2.cpp
+++ b/graphics/blit/blit-avx2.cpp
@@ -38,12 +38,15 @@ namespace Graphics {
 class BlendBlitImpl_AVX2 : public BlendBlitImpl_Base {
 	friend class BlendBlit;
 
-template<bool doscale, bool rgbmod, bool alphamod>
-struct AlphaBlend : public BlendBlitImpl_Base::AlphaBlend<doscale, rgbmod, alphamod> {
-	static inline __m256i simd(__m256i src, __m256i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+template<bool rgbmod, bool alphamod>
+struct AlphaBlend : public BlendBlitImpl_Base::AlphaBlend<rgbmod, alphamod> {
+public:
+	constexpr AlphaBlend(const uint32 color) : BlendBlitImpl_Base::AlphaBlend<rgbmod, alphamod>(color) {}
+
+	inline __m256i simd(__m256i src, __m256i dst) const {
 		__m256i ina;
 		if (alphamod)
-			ina = _mm256_srli_epi32(_mm256_mullo_epi16(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask)), _mm256_set1_epi32(ca)), 8);
+			ina = _mm256_srli_epi32(_mm256_mullo_epi16(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask)), _mm256_set1_epi32(this->ca)), 8);
 		else
 			ina = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
 		__m256i alphaMask = _mm256_cmpeq_epi32(ina, _mm256_setzero_si256());
@@ -59,9 +62,9 @@ struct AlphaBlend : public BlendBlitImpl_Base::AlphaBlend<doscale, rgbmod, alpha
 			dstR = _mm256_slli_epi32(_mm256_mullo_epi16(dstR, _mm256_sub_epi32(_mm256_set1_epi32(255), ina)), BlendBlit::kRModShift - 8);
 			dstG = _mm256_slli_epi32(_mm256_mullo_epi16(dstG, _mm256_sub_epi32(_mm256_set1_epi32(255), ina)), BlendBlit::kGModShift - 8);
 			dstB = _mm256_mullo_epi16(dstB, _mm256_sub_epi32(_mm256_set1_epi32(255), ina));
-			srcR = _mm256_add_epi32(dstR, _mm256_slli_epi32(_mm256_mullo_epi16(_mm256_srli_epi32(_mm256_mullo_epi16(srcR, ina), 8), _mm256_set1_epi32(cr)), BlendBlit::kRModShift - 8));
-			srcG = _mm256_add_epi32(dstG, _mm256_slli_epi32(_mm256_mullo_epi16(_mm256_srli_epi32(_mm256_mullo_epi16(srcG, ina), 8), _mm256_set1_epi32(cg)), BlendBlit::kGModShift - 8));
-			srcB = _mm256_add_epi32(dstB, _mm256_mullo_epi16(_mm256_srli_epi32(_mm256_mullo_epi16(srcB, ina), 8), _mm256_set1_epi32(cb)));
+			srcR = _mm256_add_epi32(dstR, _mm256_slli_epi32(_mm256_mullo_epi16(_mm256_srli_epi32(_mm256_mullo_epi16(srcR, ina), 8), _mm256_set1_epi32(this->cr)), BlendBlit::kRModShift - 8));
+			srcG = _mm256_add_epi32(dstG, _mm256_slli_epi32(_mm256_mullo_epi16(_mm256_srli_epi32(_mm256_mullo_epi16(srcG, ina), 8), _mm256_set1_epi32(this->cg)), BlendBlit::kGModShift - 8));
+			srcB = _mm256_add_epi32(dstB, _mm256_mullo_epi16(_mm256_srli_epi32(_mm256_mullo_epi16(srcB, ina), 8), _mm256_set1_epi32(this->cb)));
 			src = _mm256_or_si256(_mm256_and_si256(srcB, _mm256_set1_epi32(BlendBlit::kBModMask)), _mm256_set1_epi32(BlendBlit::kAModMask));
 			src = _mm256_or_si256(_mm256_and_si256(srcG, _mm256_set1_epi32(BlendBlit::kGModMask)), src);
 			src = _mm256_or_si256(_mm256_and_si256(srcR, _mm256_set1_epi32(BlendBlit::kRModMask)), src);
@@ -85,12 +88,15 @@ struct AlphaBlend : public BlendBlitImpl_Base::AlphaBlend<doscale, rgbmod, alpha
 	}
 };
 
-template<bool doscale, bool rgbmod, bool alphamod>
-struct MultiplyBlend : public BlendBlitImpl_Base::MultiplyBlend<doscale, rgbmod, alphamod> {
-	static inline __m256i simd(__m256i src, __m256i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+template<bool rgbmod, bool alphamod>
+struct MultiplyBlend : public BlendBlitImpl_Base::MultiplyBlend<rgbmod, alphamod> {
+public:
+	constexpr MultiplyBlend(const uint32 color) : BlendBlitImpl_Base::MultiplyBlend<rgbmod, alphamod>(color) {}
+
+	inline __m256i simd(__m256i src, __m256i dst) const {
 		__m256i ina, alphaMask;
 		if (alphamod) {
-			ina = _mm256_srli_epi32(_mm256_mullo_epi16(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask)), _mm256_set1_epi32(ca)), 8);
+			ina = _mm256_srli_epi32(_mm256_mullo_epi16(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask)), _mm256_set1_epi32(this->ca)), 8);
 			alphaMask = _mm256_cmpeq_epi32(ina, _mm256_setzero_si256());
 		} else {
 			ina = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
@@ -105,9 +111,9 @@ struct MultiplyBlend : public BlendBlitImpl_Base::MultiplyBlend<doscale, rgbmod,
 			__m256i dstG = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
 			__m256i dstR = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
 
-			srcB = _mm256_and_si256(_mm256_slli_epi32(_mm256_mullo_epi32(dstB, _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi16(srcB, _mm256_set1_epi32(cb)), ina), 16)), BlendBlit::kBModShift - 8), _mm256_set1_epi32(BlendBlit::kBModMask));
-			srcG = _mm256_and_si256(_mm256_slli_epi32(_mm256_mullo_epi32(dstG, _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi16(srcG, _mm256_set1_epi32(cg)), ina), 16)), BlendBlit::kGModShift - 8), _mm256_set1_epi32(BlendBlit::kGModMask));
-			srcR = _mm256_and_si256(_mm256_slli_epi32(_mm256_mullo_epi32(dstR, _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi16(srcR, _mm256_set1_epi32(cr)), ina), 16)), BlendBlit::kRModShift - 8), _mm256_set1_epi32(BlendBlit::kRModMask));
+			srcB = _mm256_and_si256(_mm256_slli_epi32(_mm256_mullo_epi32(dstB, _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi16(srcB, _mm256_set1_epi32(this->cb)), ina), 16)), BlendBlit::kBModShift - 8), _mm256_set1_epi32(BlendBlit::kBModMask));
+			srcG = _mm256_and_si256(_mm256_slli_epi32(_mm256_mullo_epi32(dstG, _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi16(srcG, _mm256_set1_epi32(this->cg)), ina), 16)), BlendBlit::kGModShift - 8), _mm256_set1_epi32(BlendBlit::kGModMask));
+			srcR = _mm256_and_si256(_mm256_slli_epi32(_mm256_mullo_epi32(dstR, _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi16(srcR, _mm256_set1_epi32(this->cr)), ina), 16)), BlendBlit::kRModShift - 8), _mm256_set1_epi32(BlendBlit::kRModMask));
 
 			src = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
 			src = _mm256_or_si256(src, _mm256_or_si256(srcB, _mm256_or_si256(srcG, srcR)));
@@ -131,16 +137,22 @@ struct MultiplyBlend : public BlendBlitImpl_Base::MultiplyBlend<doscale, rgbmod,
 	}
 };
 
-template<bool doscale, bool rgbmod, bool alphamod>
-struct OpaqueBlend : public BlendBlitImpl_Base::OpaqueBlend<doscale, rgbmod, alphamod> {
-	static inline __m256i simd(__m256i src, __m256i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+template<bool rgbmod, bool alphamod>
+struct OpaqueBlend : public BlendBlitImpl_Base::OpaqueBlend<rgbmod, alphamod> {
+public:
+	constexpr OpaqueBlend(const uint32 color) : BlendBlitImpl_Base::OpaqueBlend<rgbmod, alphamod>(color) {}
+
+	inline __m256i simd(__m256i src, __m256i dst) const {
 		return _mm256_or_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
 	}
 };
 
-template<bool doscale, bool rgbmod, bool alphamod>
-struct BinaryBlend : public BlendBlitImpl_Base::BinaryBlend<doscale, rgbmod, alphamod> {
-	static inline __m256i simd(__m256i src, __m256i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+template<bool rgbmod, bool alphamod>
+struct BinaryBlend : public BlendBlitImpl_Base::BinaryBlend<rgbmod, alphamod> {
+public:
+	constexpr BinaryBlend(const uint32 color) : BlendBlitImpl_Base::BinaryBlend<rgbmod, alphamod>(color) {}
+
+	inline __m256i simd(__m256i src, __m256i dst) const {
 		__m256i alphaMask = _mm256_cmpeq_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask)), _mm256_setzero_si256());
 		dst = _mm256_and_si256(dst, alphaMask);
 		src = _mm256_andnot_si256(alphaMask, _mm256_or_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask)));
@@ -148,12 +160,15 @@ struct BinaryBlend : public BlendBlitImpl_Base::BinaryBlend<doscale, rgbmod, alp
 	}
 };
 
-template<bool doscale, bool rgbmod, bool alphamod>
-struct AdditiveBlend : public BlendBlitImpl_Base::AdditiveBlend<doscale, rgbmod, alphamod> {
-	static inline __m256i simd(__m256i src, __m256i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+template<bool rgbmod, bool alphamod>
+struct AdditiveBlend : public BlendBlitImpl_Base::AdditiveBlend<rgbmod, alphamod> {
+public:
+	constexpr AdditiveBlend(const uint32 color) : BlendBlitImpl_Base::AdditiveBlend<rgbmod, alphamod>(color) {}
+
+	inline __m256i simd(__m256i src, __m256i dst) const {
 		__m256i ina;
 		if (alphamod)
-			ina = _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask)), _mm256_set1_epi32(ca)), 8);
+			ina = _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask)), _mm256_set1_epi32(this->ca)), 8);
 		else
 			ina = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
 		__m256i alphaMask = _mm256_cmpeq_epi32(ina, _mm256_set1_epi32(0));
@@ -166,9 +181,9 @@ struct AdditiveBlend : public BlendBlitImpl_Base::AdditiveBlend<doscale, rgbmod,
 			__m256i dstg = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
 			__m256i dstr = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
 
-			srcb = _mm256_and_si256(_mm256_add_epi32(dstb, _mm256_srli_epi32(_mm256_mullo_epi32(srcb, _mm256_mullo_epi32(_mm256_set1_epi32(cb), ina)), 16)), _mm256_set1_epi32(BlendBlit::kBModMask));
-			srcg = _mm256_and_si256(_mm256_add_epi32(dstg, _mm256_mullo_epi32(srcg, _mm256_mullo_epi32(_mm256_set1_epi32(cg), ina))), _mm256_set1_epi32(BlendBlit::kGModMask));
-			srcr = _mm256_and_si256(_mm256_add_epi32(dstr, _mm256_srli_epi32(_mm256_mullo_epi32(srcr, _mm256_mullo_epi32(_mm256_set1_epi32(cr), ina)), BlendBlit::kRModShift - 16)), _mm256_set1_epi32(BlendBlit::kRModMask));
+			srcb = _mm256_and_si256(_mm256_add_epi32(dstb, _mm256_srli_epi32(_mm256_mullo_epi32(srcb, _mm256_mullo_epi32(_mm256_set1_epi32(this->cb), ina)), 16)), _mm256_set1_epi32(BlendBlit::kBModMask));
+			srcg = _mm256_and_si256(_mm256_add_epi32(dstg, _mm256_mullo_epi32(srcg, _mm256_mullo_epi32(_mm256_set1_epi32(this->cg), ina))), _mm256_set1_epi32(BlendBlit::kGModMask));
+			srcr = _mm256_and_si256(_mm256_add_epi32(dstr, _mm256_srli_epi32(_mm256_mullo_epi32(srcr, _mm256_mullo_epi32(_mm256_set1_epi32(this->cr), ina)), BlendBlit::kRModShift - 16)), _mm256_set1_epi32(BlendBlit::kRModMask));
 
 			src = _mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kAModMask));
 			src = _mm256_or_si256(src, _mm256_or_si256(srcb, _mm256_or_si256(srcg, srcb)));
@@ -202,9 +217,12 @@ struct AdditiveBlend : public BlendBlitImpl_Base::AdditiveBlend<doscale, rgbmod,
 	}
 };
 
-template<bool doscale, bool rgbmod, bool alphamod>
-struct SubtractiveBlend : public BlendBlitImpl_Base::SubtractiveBlend<doscale, rgbmod, alphamod> {
-	static inline __m256i simd(__m256i src, __m256i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+template<bool rgbmod, bool alphamod>
+struct SubtractiveBlend : public BlendBlitImpl_Base::SubtractiveBlend<rgbmod, alphamod> {
+public:
+	constexpr SubtractiveBlend(const uint32 color) : BlendBlitImpl_Base::SubtractiveBlend<rgbmod, alphamod>(color) {}
+
+	inline __m256i simd(__m256i src, __m256i dst) const {
 		__m256i ina = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
 		__m256i srcb = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
 		__m256i srcg = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
@@ -213,27 +231,23 @@ struct SubtractiveBlend : public BlendBlitImpl_Base::SubtractiveBlend<doscale, r
 		__m256i dstg = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
 		__m256i dstr = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
 
-		srcb = _mm256_and_si256(_mm256_slli_epi32(_mm256_max_epi16(_mm256_sub_epi32(dstb, _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(srcb, _mm256_set1_epi32(cb)), _mm256_mullo_epi32(dstb, ina)), 24)), _mm256_set1_epi32(0)), BlendBlit::kBModShift), _mm256_set1_epi32(BlendBlit::kBModMask));
-		srcg = _mm256_and_si256(_mm256_slli_epi32(_mm256_max_epi16(_mm256_sub_epi32(dstg, _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(srcg, _mm256_set1_epi32(cg)), _mm256_mullo_epi32(dstg, ina)), 24)), _mm256_set1_epi32(0)), BlendBlit::kGModShift), _mm256_set1_epi32(BlendBlit::kGModMask));
-		srcr = _mm256_and_si256(_mm256_slli_epi32(_mm256_max_epi16(_mm256_sub_epi32(dstr, _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(srcr, _mm256_set1_epi32(cr)), _mm256_mullo_epi32(dstr, ina)), 24)), _mm256_set1_epi32(0)), BlendBlit::kRModShift), _mm256_set1_epi32(BlendBlit::kRModMask));
+		srcb = _mm256_and_si256(_mm256_slli_epi32(_mm256_max_epi16(_mm256_sub_epi32(dstb, _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(srcb, _mm256_set1_epi32(this->cb)), _mm256_mullo_epi32(dstb, ina)), 24)), _mm256_set1_epi32(0)), BlendBlit::kBModShift), _mm256_set1_epi32(BlendBlit::kBModMask));
+		srcg = _mm256_and_si256(_mm256_slli_epi32(_mm256_max_epi16(_mm256_sub_epi32(dstg, _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(srcg, _mm256_set1_epi32(this->cg)), _mm256_mullo_epi32(dstg, ina)), 24)), _mm256_set1_epi32(0)), BlendBlit::kGModShift), _mm256_set1_epi32(BlendBlit::kGModMask));
+		srcr = _mm256_and_si256(_mm256_slli_epi32(_mm256_max_epi16(_mm256_sub_epi32(dstr, _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(srcr, _mm256_set1_epi32(this->cr)), _mm256_mullo_epi32(dstr, ina)), 24)), _mm256_set1_epi32(0)), BlendBlit::kRModShift), _mm256_set1_epi32(BlendBlit::kRModMask));
 
 		return _mm256_or_si256(_mm256_set1_epi32(BlendBlit::kAModMask), _mm256_or_si256(srcb, _mm256_or_si256(srcg, srcr)));
 	}
 };
 
 public:
-template<template <bool DOSCALE, bool RGBMOD, bool ALPHAMOD> class PixelFunc, bool doscale, bool rgbmod, bool alphamod, bool coloradd1, bool loaddst>
+template<template <bool RGBMOD, bool ALPHAMOD> class PixelFunc, bool doscale, bool rgbmod, bool alphamod>
 static void blitInnerLoop(BlendBlit::Args &args) {
+	const bool loaddst = true; // TODO: Only set this when necessary
+
 	const byte *in;
 	byte *out;
 
-	const byte rawcr = (args.color >> BlendBlit::kRModShift) & 0xFF;
-	const byte rawcg = (args.color >> BlendBlit::kGModShift) & 0xFF;
-	const byte rawcb = (args.color >> BlendBlit::kBModShift) & 0xFF;
-	const byte ca = alphamod ? ((args.color >> BlendBlit::kAModShift) & 0xFF) : 255;
-	const uint32 cr = coloradd1 ? (rgbmod   ? (rawcr == 255 ? 256 : rawcr) : 256) : (rgbmod   ? rawcr : 255);
-	const uint32 cg = coloradd1 ? (rgbmod   ? (rawcg == 255 ? 256 : rawcg) : 256) : (rgbmod   ? rawcg : 255);
-	const uint32 cb = coloradd1 ? (rgbmod   ? (rawcb == 255 ? 256 : rawcb) : 256) : (rgbmod   ? rawcb : 255);
+	const PixelFunc<rgbmod, alphamod> pixelFunc(args.color);
 
 	int scaleXCtr, scaleYCtr = args.scaleYoff;
 	const byte *inBase;
@@ -273,7 +287,7 @@ static void blitInnerLoop(BlendBlit::Args &args) {
 				srcPixels = _mm256_permute2x128_si256(srcPixels, srcPixels, 0x01);
 			}
 			{
-				const __m256i res = PixelFunc<doscale, rgbmod, alphamod>::simd(srcPixels, dstPixels, args.flipping & FLIP_H, ca, cr, cg, cb);
+				const __m256i res = pixelFunc.simd(srcPixels, dstPixels);
 				_mm256_storeu_si256((__m256i *)out, res);
 			}
 			if (!doscale) in += (ptrdiff_t)args.inStep * 8;
@@ -285,8 +299,8 @@ static void blitInnerLoop(BlendBlit::Args &args) {
 				in = inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep;
 			}
 
-			PixelFunc<doscale, rgbmod, alphamod>::normal(in, out, ca, cr, cg, cb);
-			
+			pixelFunc.normal(in, out);
+
 			if (doscale)
 				scaleXCtr += args.scaleX;
 			else
diff --git a/graphics/blit/blit-generic.cpp b/graphics/blit/blit-generic.cpp
index 25033ffcea1..b153851b4fb 100644
--- a/graphics/blit/blit-generic.cpp
+++ b/graphics/blit/blit-generic.cpp
@@ -28,18 +28,12 @@ class BlendBlitImpl_Default : public BlendBlitImpl_Base {
 	friend class BlendBlit;
 public:
 
-template<template <bool DOSCALE, bool RGBMOD, bool ALPHAMOD> class PixelFunc, bool doscale, bool rgbmod, bool alphamod, bool coloradd1, bool loaddst>
+template<template <bool RGBMOD, bool ALPHAMOD> class PixelFunc, bool doscale, bool rgbmod, bool alphamod>
 static inline void blitInnerLoop(BlendBlit::Args &args) {
 	const byte *in;
 	byte *out;
 
-	const byte rawcr = (args.color >> BlendBlit::kRModShift) & 0xFF;
-	const byte rawcg = (args.color >> BlendBlit::kGModShift) & 0xFF;
-	const byte rawcb = (args.color >> BlendBlit::kBModShift) & 0xFF;
-	const byte ca = alphamod ? ((args.color >> BlendBlit::kAModShift) & 0xFF) : 255;
-	const uint32 cr = coloradd1 ? (rgbmod   ? (rawcr == 255 ? 256 : rawcr) : 256) : (rgbmod   ? rawcr : 255);
-	const uint32 cg = coloradd1 ? (rgbmod   ? (rawcg == 255 ? 256 : rawcg) : 256) : (rgbmod   ? rawcg : 255);
-	const uint32 cb = coloradd1 ? (rgbmod   ? (rawcb == 255 ? 256 : rawcb) : 256) : (rgbmod   ? rawcb : 255);
+	const PixelFunc<rgbmod, alphamod> pixelFunc(args.color);
 
 	int scaleXCtr, scaleYCtr = args.scaleYoff;
 	const byte *inBase;
@@ -58,7 +52,7 @@ static inline void blitInnerLoop(BlendBlit::Args &args) {
 				in = inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep;
 			}
 
-			PixelFunc<doscale, rgbmod, alphamod>::normal(in, out, ca, cr, cg, cb);
+			pixelFunc.normal(in, out);
 
 			if (doscale)
 				scaleXCtr += args.scaleX;
@@ -158,22 +152,22 @@ static void doBlitBinaryBlendLogicGeneric(BlendBlit::Args &args) {
 }; // end of class BlendBlitImpl_Default
 
 template<>
-inline void BlendBlitImpl_Default::blitInnerLoop<BlendBlitImpl_Default::OpaqueBlend, true, false, false, false, true>(BlendBlit::Args &args) {
+inline void BlendBlitImpl_Default::blitInnerLoop<BlendBlitImpl_Default::OpaqueBlend, true, false, false>(BlendBlit::Args &args) {
 	doBlitOpaqueBlendLogicGeneric<true>(args);
 }
 
 template<>
-inline void BlendBlitImpl_Default::blitInnerLoop<BlendBlitImpl_Default::OpaqueBlend, false, false, false, false, true>(BlendBlit::Args &args) {
+inline void BlendBlitImpl_Default::blitInnerLoop<BlendBlitImpl_Default::OpaqueBlend, false, false, false>(BlendBlit::Args &args) {
 	doBlitOpaqueBlendLogicGeneric<false>(args);
 }
 
 template<>
-inline void BlendBlitImpl_Default::blitInnerLoop<BlendBlitImpl_Default::BinaryBlend, true, false, false, false, true>(BlendBlit::Args &args) {
+inline void BlendBlitImpl_Default::blitInnerLoop<BlendBlitImpl_Default::BinaryBlend, true, false, false>(BlendBlit::Args &args) {
 	doBlitBinaryBlendLogicGeneric<true>(args);
 }
 
 template<>
-inline void BlendBlitImpl_Default::blitInnerLoop<BlendBlitImpl_Default::BinaryBlend, false, false, false, false, true>(BlendBlit::Args &args) {
+inline void BlendBlitImpl_Default::blitInnerLoop<BlendBlitImpl_Default::BinaryBlend, false, false, false>(BlendBlit::Args &args) {
 	doBlitBinaryBlendLogicGeneric<false>(args);
 }
 
diff --git a/graphics/blit/blit-neon.cpp b/graphics/blit/blit-neon.cpp
index d35668e0aa0..b35a22827c3 100644
--- a/graphics/blit/blit-neon.cpp
+++ b/graphics/blit/blit-neon.cpp
@@ -44,12 +44,15 @@ namespace Graphics {
 class BlendBlitImpl_NEON : public BlendBlitImpl_Base {
 	friend class BlendBlit;
 
-template<bool doscale, bool rgbmod, bool alphamod>
-struct AlphaBlend : public BlendBlitImpl_Base::AlphaBlend<doscale, rgbmod, alphamod> {
-	static inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+template<bool rgbmod, bool alphamod>
+struct AlphaBlend : public BlendBlitImpl_Base::AlphaBlend<rgbmod, alphamod> {
+public:
+	constexpr AlphaBlend(const uint32 color) : BlendBlitImpl_Base::AlphaBlend<rgbmod, alphamod>(color) {}
+
+	inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst) const {
 		uint32x4_t ina;
 		if (alphamod)
-			ina = vshrq_n_u32(vmulq_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask)), vdupq_n_u32(ca)), 8);
+			ina = vshrq_n_u32(vmulq_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask)), vdupq_n_u32(this->ca)), 8);
 		else
 			ina = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
 		uint32x4_t alphaMask = vceqq_u32(ina, vmovq_n_u32(0));
@@ -65,9 +68,9 @@ struct AlphaBlend : public BlendBlitImpl_Base::AlphaBlend<doscale, rgbmod, alpha
 			dstR = vshrq_n_u32(vmulq_u32(dstR, vsubq_u32(vmovq_n_u32(255), ina)), 8);
 			dstG = vshrq_n_u32(vmulq_u32(dstG, vsubq_u32(vmovq_n_u32(255), ina)), 8);
 			dstB = vshrq_n_u32(vmulq_u32(dstB, vsubq_u32(vmovq_n_u32(255), ina)), 8);
-			srcR = vaddq_u32(dstR, vshrq_n_u32(vmulq_u32(vmulq_u32(srcR, ina), vmovq_n_u32(cr)), 16));
-			srcG = vaddq_u32(dstG, vshrq_n_u32(vmulq_u32(vmulq_u32(srcG, ina), vmovq_n_u32(cg)), 16));
-			srcB = vaddq_u32(dstB, vshrq_n_u32(vmulq_u32(vmulq_u32(srcB, ina), vmovq_n_u32(cb)), 16));
+			srcR = vaddq_u32(dstR, vshrq_n_u32(vmulq_u32(vmulq_u32(srcR, ina), vmovq_n_u32(this->cr)), 16));
+			srcG = vaddq_u32(dstG, vshrq_n_u32(vmulq_u32(vmulq_u32(srcG, ina), vmovq_n_u32(this->cg)), 16));
+			srcB = vaddq_u32(dstB, vshrq_n_u32(vmulq_u32(vmulq_u32(srcB, ina), vmovq_n_u32(this->cb)), 16));
 			src = vorrq_u32(vandq_u32(srcB, vmovq_n_u32(BlendBlit::kBModMask)), vmovq_n_u32(BlendBlit::kAModMask));
 			src = vorrq_u32(vandq_u32(vshlq_n_u32(srcG, 8), vmovq_n_u32(BlendBlit::kGModMask)), src);
 			src = vorrq_u32(vandq_u32(vshlq_n_u32(srcR, 16), vmovq_n_u32(BlendBlit::kRModMask)), src);
@@ -91,12 +94,15 @@ struct AlphaBlend : public BlendBlitImpl_Base::AlphaBlend<doscale, rgbmod, alpha
 	}
 };
 
-template<bool doscale, bool rgbmod, bool alphamod>
-struct MultiplyBlend : public BlendBlitImpl_Base::MultiplyBlend<doscale, rgbmod, alphamod> {
-	static inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+template<bool rgbmod, bool alphamod>
+struct MultiplyBlend : public BlendBlitImpl_Base::MultiplyBlend<rgbmod, alphamod> {
+public:
+	constexpr MultiplyBlend(const uint32 color) : BlendBlitImpl_Base::MultiplyBlend<rgbmod, alphamod>(color) {}
+
+	inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst) const {
 		uint32x4_t ina, alphaMask;
 		if (alphamod) {
-			ina = vshrq_n_u32(vmulq_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask)), vdupq_n_u32(ca)), 8);
+			ina = vshrq_n_u32(vmulq_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask)), vdupq_n_u32(this->ca)), 8);
 			alphaMask = vceqq_u32(ina, vmovq_n_u32(0));
 		} else {
 			ina = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
@@ -111,9 +117,9 @@ struct MultiplyBlend : public BlendBlitImpl_Base::MultiplyBlend<doscale, rgbmod,
 			uint32x4_t dstG = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
 			uint32x4_t dstR = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
 
-			srcB = vandq_u32(vshlq_n_u32(vmulq_u32(dstB, vshrq_n_u32(vmulq_u32(vmulq_u32(srcB, vmovq_n_u32(cb)), ina), 16)), BlendBlit::kBModShift - 8), vmovq_n_u32(BlendBlit::kBModMask));
-			srcG = vandq_u32(vshlq_n_u32(vmulq_u32(dstG, vshrq_n_u32(vmulq_u32(vmulq_u32(srcG, vmovq_n_u32(cg)), ina), 16)), BlendBlit::kGModShift - 8), vmovq_n_u32(BlendBlit::kGModMask));
-			srcR = vandq_u32(vshlq_n_u32(vmulq_u32(dstR, vshrq_n_u32(vmulq_u32(vmulq_u32(srcR, vmovq_n_u32(cr)), ina), 16)), BlendBlit::kRModShift - 8), vmovq_n_u32(BlendBlit::kRModMask));
+			srcB = vandq_u32(vshlq_n_u32(vmulq_u32(dstB, vshrq_n_u32(vmulq_u32(vmulq_u32(srcB, vmovq_n_u32(this->cb)), ina), 16)), BlendBlit::kBModShift - 8), vmovq_n_u32(BlendBlit::kBModMask));
+			srcG = vandq_u32(vshlq_n_u32(vmulq_u32(dstG, vshrq_n_u32(vmulq_u32(vmulq_u32(srcG, vmovq_n_u32(this->cg)), ina), 16)), BlendBlit::kGModShift - 8), vmovq_n_u32(BlendBlit::kGModMask));
+			srcR = vandq_u32(vshlq_n_u32(vmulq_u32(dstR, vshrq_n_u32(vmulq_u32(vmulq_u32(srcR, vmovq_n_u32(this->cr)), ina), 16)), BlendBlit::kRModShift - 8), vmovq_n_u32(BlendBlit::kRModMask));
 
 			src = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
 			src = vorrq_u32(src, vorrq_u32(srcB, vorrq_u32(srcG, srcR)));
@@ -137,16 +143,22 @@ struct MultiplyBlend : public BlendBlitImpl_Base::MultiplyBlend<doscale, rgbmod,
 	}
 };
 
-template<bool doscale, bool rgbmod, bool alphamod>
-struct OpaqueBlend : public BlendBlitImpl_Base::OpaqueBlend<doscale, rgbmod, alphamod> {
-	static inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+template<bool rgbmod, bool alphamod>
+struct OpaqueBlend : public BlendBlitImpl_Base::OpaqueBlend<rgbmod, alphamod> {
+public:
+	constexpr OpaqueBlend(const uint32 color) : BlendBlitImpl_Base::OpaqueBlend<rgbmod, alphamod>(color) {}
+
+	inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst) const {
 		return vorrq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
 	}
 };
 
-template<bool doscale, bool rgbmod, bool alphamod>
-struct BinaryBlend : public BlendBlitImpl_Base::BinaryBlend<doscale, rgbmod, alphamod> {
-	static inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+template<bool rgbmod, bool alphamod>
+struct BinaryBlend : public BlendBlitImpl_Base::BinaryBlend<rgbmod, alphamod> {
+public:
+	constexpr BinaryBlend(const uint32 color) : BlendBlitImpl_Base::BinaryBlend<rgbmod, alphamod>(color) {}
+
+	inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst) const {
 		uint32x4_t alphaMask = vceqq_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask)), vmovq_n_u32(0));
 		dst = vandq_u32(dst, alphaMask);
 		src = vandq_u32(vorrq_u32(src, vmovq_n_u32(BlendBlit::kAModMask)), vmvnq_u32(alphaMask));
@@ -154,12 +166,15 @@ struct BinaryBlend : public BlendBlitImpl_Base::BinaryBlend<doscale, rgbmod, alp
 	}
 };
 
-template<bool doscale, bool rgbmod, bool alphamod>
-struct AdditiveBlend : public BlendBlitImpl_Base::AdditiveBlend<doscale, rgbmod, alphamod> {
-	static inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+template<bool rgbmod, bool alphamod>
+struct AdditiveBlend : public BlendBlitImpl_Base::AdditiveBlend<rgbmod, alphamod> {
+public:
+	constexpr AdditiveBlend(const uint32 color) : BlendBlitImpl_Base::AdditiveBlend<rgbmod, alphamod>(color) {}
+
+	inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst) const {
 		uint32x4_t ina;
 		if (alphamod)
-			ina = vshrq_n_u32(vmulq_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask)), vdupq_n_u32(ca)), 8);
+			ina = vshrq_n_u32(vmulq_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask)), vdupq_n_u32(this->ca)), 8);
 		else
 			ina = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
 		uint32x4_t alphaMask = vceqq_u32(ina, vmovq_n_u32(0));
@@ -172,9 +187,9 @@ struct AdditiveBlend : public BlendBlitImpl_Base::AdditiveBlend<doscale, rgbmod,
 			uint32x4_t dstg = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
 			uint32x4_t dstr = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
 
-			srcb = vandq_u32(vaddq_u32(dstb, vshrq_n_u32(vmulq_u32(srcb, vmulq_u32(vmovq_n_u32(cb), ina)), 16)), vmovq_n_u32(BlendBlit::kBModMask));
-			srcg = vandq_u32(vaddq_u32(dstg, vmulq_u32(srcg, vmulq_u32(vmovq_n_u32(cg), ina))), vmovq_n_u32(BlendBlit::kGModMask));
-			srcr = vandq_u32(vaddq_u32(dstr, vshrq_n_u32(vmulq_u32(srcr, vmulq_u32(vmovq_n_u32(cr), ina)), BlendBlit::kRModShift - 16)), vmovq_n_u32(BlendBlit::kRModMask));
+			srcb = vandq_u32(vaddq_u32(dstb, vshrq_n_u32(vmulq_u32(srcb, vmulq_u32(vmovq_n_u32(this->cb), ina)), 16)), vmovq_n_u32(BlendBlit::kBModMask));
+			srcg = vandq_u32(vaddq_u32(dstg, vmulq_u32(srcg, vmulq_u32(vmovq_n_u32(this->cg), ina))), vmovq_n_u32(BlendBlit::kGModMask));
+			srcr = vandq_u32(vaddq_u32(dstr, vshrq_n_u32(vmulq_u32(srcr, vmulq_u32(vmovq_n_u32(this->cr), ina)), BlendBlit::kRModShift - 16)), vmovq_n_u32(BlendBlit::kRModMask));
 
 			src = vandq_u32(dst, vmovq_n_u32(BlendBlit::kAModMask));
 			src = vorrq_u32(src, vorrq_u32(srcb, vorrq_u32(srcg, srcr)));
@@ -208,9 +223,12 @@ struct AdditiveBlend : public BlendBlitImpl_Base::AdditiveBlend<doscale, rgbmod,
 	}
 };
 
-template<bool doscale, bool rgbmod, bool alphamod>
-struct SubtractiveBlend : public BlendBlitImpl_Base::SubtractiveBlend<doscale, rgbmod, alphamod> {
-	static inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+template<bool rgbmod, bool alphamod>
+struct SubtractiveBlend : public BlendBlitImpl_Base::SubtractiveBlend<rgbmod, alphamod> {
+public:
+	constexpr SubtractiveBlend(const uint32 color) : BlendBlitImpl_Base::SubtractiveBlend<rgbmod, alphamod>(color) {}
+
+	inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst) const {
 		uint32x4_t ina = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
 		uint32x4_t srcb = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
 		uint32x4_t srcg = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
@@ -219,27 +237,23 @@ struct SubtractiveBlend : public BlendBlitImpl_Base::SubtractiveBlend<doscale, r
 		uint32x4_t dstg = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
 		uint32x4_t dstr = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
 
-		srcb = vandq_u32(vshlq_n_u32(vreinterpretq_u32_s32(vmaxq_s32(vsubq_s32(vreinterpretq_s32_u32(dstb), vreinterpretq_s32_u32(vshrq_n_u32(vmulq_u32(vmulq_u32(srcb, vmovq_n_u32(cb)), vmulq_u32(dstb, ina)), 24))), vmovq_n_s32(0))), BlendBlit::kBModShift), vmovq_n_u32(BlendBlit::kBModMask));
-		srcg = vandq_u32(vshlq_n_u32(vreinterpretq_u32_s32(vmaxq_s32(vsubq_s32(vreinterpretq_s32_u32(dstg), vreinterpretq_s32_u32(vshrq_n_u32(vmulq_u32(vmulq_u32(srcg, vmovq_n_u32(cg)), vmulq_u32(dstg, ina)), 24))), vmovq_n_s32(0))), BlendBlit::kGModShift), vmovq_n_u32(BlendBlit::kGModMask));
-		srcr = vandq_u32(vshlq_n_u32(vreinterpretq_u32_s32(vmaxq_s32(vsubq_s32(vreinterpretq_s32_u32(dstr), vreinterpretq_s32_u32(vshrq_n_u32(vmulq_u32(vmulq_u32(srcr, vmovq_n_u32(cr)), vmulq_u32(dstr, ina)), 24))), vmovq_n_s32(0))), BlendBlit::kRModShift), vmovq_n_u32(BlendBlit::kRModMask));
+		srcb = vandq_u32(vshlq_n_u32(vreinterpretq_u32_s32(vmaxq_s32(vsubq_s32(vreinterpretq_s32_u32(dstb), vreinterpretq_s32_u32(vshrq_n_u32(vmulq_u32(vmulq_u32(srcb, vmovq_n_u32(this->cb)), vmulq_u32(dstb, ina)), 24))), vmovq_n_s32(0))), BlendBlit::kBModShift), vmovq_n_u32(BlendBlit::kBModMask));
+		srcg = vandq_u32(vshlq_n_u32(vreinterpretq_u32_s32(vmaxq_s32(vsubq_s32(vreinterpretq_s32_u32(dstg), vreinterpretq_s32_u32(vshrq_n_u32(vmulq_u32(vmulq_u32(srcg, vmovq_n_u32(this->cg)), vmulq_u32(dstg, ina)), 24))), vmovq_n_s32(0))), BlendBlit::kGModShift), vmovq_n_u32(BlendBlit::kGModMask));
+		srcr = vandq_u32(vshlq_n_u32(vreinterpretq_u32_s32(vmaxq_s32(vsubq_s32(vreinterpretq_s32_u32(dstr), vreinterpretq_s32_u32(vshrq_n_u32(vmulq_u32(vmulq_u32(srcr, vmovq_n_u32(this->cr)), vmulq_u32(dstr, ina)), 24))), vmovq_n_s32(0))), BlendBlit::kRModShift), vmovq_n_u32(BlendBlit::kRModMask));
 
 		return vorrq_u32(vmovq_n_u32(BlendBlit::kAModMask), vorrq_u32(srcb, vorrq_u32(srcg, srcr)));
 	}
 };
 
 public:
-template<template <bool DOSCALE, bool RGBMOD, bool ALPHAMOD> class PixelFunc, bool doscale, bool rgbmod, bool alphamod, bool coloradd1, bool loaddst>
+template<template <bool RGBMOD, bool ALPHAMOD> class PixelFunc, bool doscale, bool rgbmod, bool alphamod>
 static inline void blitInnerLoop(BlendBlit::Args &args) {
+	const bool loaddst = true; // TODO: Only set this when necessary
+
 	const byte *in;
 	byte *out;
 
-	const byte rawcr = (args.color >> BlendBlit::kRModShift) & 0xFF;
-	const byte rawcg = (args.color >> BlendBlit::kGModShift) & 0xFF;
-	const byte rawcb = (args.color >> BlendBlit::kBModShift) & 0xFF;
-	const byte ca = alphamod ? ((args.color >> BlendBlit::kAModShift) & 0xFF) : 255;
-	const uint32 cr = coloradd1 ? (rgbmod   ? (rawcr == 255 ? 256 : rawcr) : 256) : (rgbmod   ? rawcr : 255);
-	const uint32 cg = coloradd1 ? (rgbmod   ? (rawcg == 255 ? 256 : rawcg) : 256) : (rgbmod   ? rawcg : 255);
-	const uint32 cb = coloradd1 ? (rgbmod   ? (rawcb == 255 ? 256 : rawcb) : 256) : (rgbmod   ? rawcb : 255);
+	PixelFunc<rgbmod, alphamod> pixelFunc(args.color);
 
 	int scaleXCtr, scaleYCtr = args.scaleYoff;
 	const byte *inBase;
@@ -276,7 +290,7 @@ static inline void blitInnerLoop(BlendBlit::Args &args) {
 				srcPixels = vcombine_u32(vget_high_u32(srcPixels), vget_low_u32(srcPixels));
 			}
 			{
-				const uint32x4_t res = PixelFunc<doscale, rgbmod, alphamod>::simd(srcPixels, dstPixels, args.flipping & FLIP_H, ca, cr, cg, cb);
+				const uint32x4_t res = pixelFunc.simd(srcPixels, dstPixels);
 				vst1q_u32((uint32 *)out, res);
 			}
 			if (!doscale) in += args.inStep * 4;
@@ -288,7 +302,7 @@ static inline void blitInnerLoop(BlendBlit::Args &args) {
 				in = inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep;
 			}
 
-			PixelFunc<doscale, rgbmod, alphamod>::normal(in, out, ca, cr, cg, cb);
+			pixelFunc.normal(in, out);
 
 			if (doscale)
 				scaleXCtr += args.scaleX;
diff --git a/graphics/blit/blit-sse2.cpp b/graphics/blit/blit-sse2.cpp
index c454d88f3b8..e6634ade320 100644
--- a/graphics/blit/blit-sse2.cpp
+++ b/graphics/blit/blit-sse2.cpp
@@ -48,12 +48,15 @@ static FORCEINLINE __m128i sse2_mul32(__m128i a, __m128i b) {
 class BlendBlitImpl_SSE2 : public BlendBlitImpl_Base {
 	friend class BlendBlit;
 
-template<bool doscale, bool rgbmod, bool alphamod>
-struct AlphaBlend : public BlendBlitImpl_Base::AlphaBlend<doscale, rgbmod, alphamod> {
-	static inline __m128i simd(__m128i src, __m128i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+template<bool rgbmod, bool alphamod>
+struct AlphaBlend : public BlendBlitImpl_Base::AlphaBlend<rgbmod, alphamod> {
+public:
+	constexpr AlphaBlend(const uint32 color) : BlendBlitImpl_Base::AlphaBlend<rgbmod, alphamod>(color) {}
+
+	inline __m128i simd(__m128i src, __m128i dst) const {
 		__m128i ina;
 		if (alphamod)
-			ina = _mm_srli_epi32(_mm_mullo_epi16(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask)), _mm_set1_epi32(ca)), 8);
+			ina = _mm_srli_epi32(_mm_mullo_epi16(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask)), _mm_set1_epi32(this->ca)), 8);
 		else
 			ina = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
 		__m128i alphaMask = _mm_cmpeq_epi32(ina, _mm_setzero_si128());
@@ -69,9 +72,9 @@ struct AlphaBlend : public BlendBlitImpl_Base::AlphaBlend<doscale, rgbmod, alpha
 			dstR = _mm_slli_epi32(_mm_mullo_epi16(dstR, _mm_sub_epi32(_mm_set1_epi32(255), ina)), BlendBlit::kRModShift - 8);
 			dstG = _mm_slli_epi32(_mm_mullo_epi16(dstG, _mm_sub_epi32(_mm_set1_epi32(255), ina)), BlendBlit::kGModShift - 8);
 			dstB = _mm_mullo_epi16(dstB, _mm_sub_epi32(_mm_set1_epi32(255), ina));
-			srcR = _mm_add_epi32(dstR, _mm_slli_epi32(_mm_mullo_epi16(_mm_srli_epi32(_mm_mullo_epi16(srcR, ina), 8), _mm_set1_epi32(cr)), BlendBlit::kRModShift - 8));
-			srcG = _mm_add_epi32(dstG, _mm_slli_epi32(_mm_mullo_epi16(_mm_srli_epi32(_mm_mullo_epi16(srcG, ina), 8), _mm_set1_epi32(cg)), BlendBlit::kGModShift - 8));
-			srcB = _mm_add_epi32(dstB, _mm_mullo_epi16(_mm_srli_epi32(_mm_mullo_epi16(srcB, ina), 8), _mm_set1_epi32(cb)));
+			srcR = _mm_add_epi32(dstR, _mm_slli_epi32(_mm_mullo_epi16(_mm_srli_epi32(_mm_mullo_epi16(srcR, ina), 8), _mm_set1_epi32(this->cr)), BlendBlit::kRModShift - 8));
+			srcG = _mm_add_epi32(dstG, _mm_slli_epi32(_mm_mullo_epi16(_mm_srli_epi32(_mm_mullo_epi16(srcG, ina), 8), _mm_set1_epi32(this->cg)), BlendBlit::kGModShift - 8));
+			srcB = _mm_add_epi32(dstB, _mm_mullo_epi16(_mm_srli_epi32(_mm_mullo_epi16(srcB, ina), 8), _mm_set1_epi32(this->cb)));
 			src = _mm_or_si128(_mm_and_si128(srcB, _mm_set1_epi32(BlendBlit::kBModMask)), _mm_set1_epi32(BlendBlit::kAModMask));
 			src = _mm_or_si128(_mm_and_si128(srcG, _mm_set1_epi32(BlendBlit::kGModMask)), src);
 			src = _mm_or_si128(_mm_and_si128(srcR, _mm_set1_epi32(BlendBlit::kRModMask)), src);
@@ -95,12 +98,15 @@ struct AlphaBlend : public BlendBlitImpl_Base::AlphaBlend<doscale, rgbmod, alpha
 	}
 };
 
-template<bool doscale, bool rgbmod, bool alphamod>
-struct MultiplyBlend : public BlendBlitImpl_Base::MultiplyBlend<doscale, rgbmod, alphamod> {
-	static inline __m128i simd(__m128i src, __m128i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+template<bool rgbmod, bool alphamod>
+struct MultiplyBlend : public BlendBlitImpl_Base::MultiplyBlend<rgbmod, alphamod> {
+public:
+	constexpr MultiplyBlend(const uint32 color) : BlendBlitImpl_Base::MultiplyBlend<rgbmod, alphamod>(color) {}
+
+	inline __m128i simd(__m128i src, __m128i dst) const {
 		__m128i ina, alphaMask;
 		if (alphamod) {
-			ina = _mm_srli_epi32(_mm_mullo_epi16(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask)), _mm_set1_epi32(ca)), 8);
+			ina = _mm_srli_epi32(_mm_mullo_epi16(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask)), _mm_set1_epi32(this->ca)), 8);
 			alphaMask = _mm_cmpeq_epi32(ina, _mm_setzero_si128());
 		} else {
 			ina = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
@@ -115,9 +121,9 @@ struct MultiplyBlend : public BlendBlitImpl_Base::MultiplyBlend<doscale, rgbmod,
 			__m128i dstG = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
 			__m128i dstR = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
 
-			srcB = _mm_and_si128(_mm_slli_epi32(_mm_mullo_epi16(dstB, _mm_srli_epi32(sse2_mul32(_mm_mullo_epi16(srcB, _mm_set1_epi32(cb)), ina), 16)), BlendBlit::kBModShift - 8), _mm_set1_epi32(BlendBlit::kBModMask));
-			srcG = _mm_and_si128(_mm_slli_epi32(_mm_mullo_epi16(dstG, _mm_srli_epi32(sse2_mul32(_mm_mullo_epi16(srcG, _mm_set1_epi32(cg)), ina), 16)), BlendBlit::kGModShift - 8), _mm_set1_epi32(BlendBlit::kGModMask));
-			srcR = _mm_and_si128(_mm_slli_epi32(_mm_mullo_epi16(dstR, _mm_srli_epi32(sse2_mul32(_mm_mullo_epi16(srcR, _mm_set1_epi32(cr)), ina), 16)), BlendBlit::kRModShift - 8), _mm_set1_epi32(BlendBlit::kRModMask));
+			srcB = _mm_and_si128(_mm_slli_epi32(_mm_mullo_epi16(dstB, _mm_srli_epi32(sse2_mul32(_mm_mullo_epi16(srcB, _mm_set1_epi32(this->cb)), ina), 16)), BlendBlit::kBModShift - 8), _mm_set1_epi32(BlendBlit::kBModMask));
+			srcG = _mm_and_si128(_mm_slli_epi32(_mm_mullo_epi16(dstG, _mm_srli_epi32(sse2_mul32(_mm_mullo_epi16(srcG, _mm_set1_epi32(this->cg)), ina), 16)), BlendBlit::kGModShift - 8), _mm_set1_epi32(BlendBlit::kGModMask));
+			srcR = _mm_and_si128(_mm_slli_epi32(_mm_mullo_epi16(dstR, _mm_srli_epi32(sse2_mul32(_mm_mullo_epi16(srcR, _mm_set1_epi32(this->cr)), ina), 16)), BlendBlit::kRModShift - 8), _mm_set1_epi32(BlendBlit::kRModMask));
 
 			src = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
 			src = _mm_or_si128(src, _mm_or_si128(srcB, _mm_or_si128(srcG, srcR)));
@@ -141,16 +147,22 @@ struct MultiplyBlend : public BlendBlitImpl_Base::MultiplyBlend<doscale, rgbmod,
 	}
 };
 
-template<bool doscale, bool rgbmod, bool alphamod>
-struct OpaqueBlend : public BlendBlitImpl_Base::OpaqueBlend<doscale, rgbmod, alphamod> {
-	static inline __m128i simd(__m128i src, __m128i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+template<bool rgbmod, bool alphamod>
+struct OpaqueBlend : public BlendBlitImpl_Base::OpaqueBlend<rgbmod, alphamod> {
+public:
+	constexpr OpaqueBlend(const uint32 color) : BlendBlitImpl_Base::OpaqueBlend<rgbmod, alphamod>(color) {}
+
+	inline __m128i simd(__m128i src, __m128i dst) const {
 		return _mm_or_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
 	}
 };
 
-template<bool doscale, bool rgbmod, bool alphamod>
-struct BinaryBlend : public BlendBlitImpl_Base::BinaryBlend<doscale, rgbmod, alphamod> {
-	static inline __m128i simd(__m128i src, __m128i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+template<bool rgbmod, bool alphamod>
+struct BinaryBlend : public BlendBlitImpl_Base::BinaryBlend<rgbmod, alphamod> {
+public:
+	constexpr BinaryBlend(const uint32 color) : BlendBlitImpl_Base::BinaryBlend<rgbmod, alphamod>(color) {}
+
+	inline __m128i simd(__m128i src, __m128i dst) const {
 		__m128i alphaMask = _mm_cmpeq_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask)), _mm_setzero_si128());
 		dst = _mm_and_si128(dst, alphaMask);
 		src = _mm_andnot_si128(alphaMask, _mm_or_si128(src, _mm_set1_epi32(BlendBlit::kAModMask)));
@@ -158,12 +170,15 @@ struct BinaryBlend : public BlendBlitImpl_Base::BinaryBlend<doscale, rgbmod, alp
 	}
 };
 
-template<bool doscale, bool rgbmod, bool alphamod>
-struct AdditiveBlend : public BlendBlitImpl_Base::AdditiveBlend<doscale, rgbmod, alphamod> {
-	static inline __m128i simd(__m128i src, __m128i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+template<bool rgbmod, bool alphamod>
+struct AdditiveBlend : public BlendBlitImpl_Base::AdditiveBlend<rgbmod, alphamod> {
+public:
+	constexpr AdditiveBlend(const uint32 color) : BlendBlitImpl_Base::AdditiveBlend<rgbmod, alphamod>(color) {}
+
+	inline __m128i simd(__m128i src, __m128i dst) const {
 		__m128i ina;
 		if (alphamod)
-			ina = _mm_srli_epi32(sse2_mul32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask)), _mm_set1_epi32(ca)), 8);
+			ina = _mm_srli_epi32(sse2_mul32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask)), _mm_set1_epi32(this->ca)), 8);
 		else
 			ina = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
 		__m128i alphaMask = _mm_cmpeq_epi32(ina, _mm_set1_epi32(0));
@@ -176,9 +191,9 @@ struct AdditiveBlend : public BlendBlitImpl_Base::AdditiveBlend<doscale, rgbmod,
 			__m128i dstg = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
 			__m128i dstr = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
 
-			srcb = _mm_and_si128(_mm_add_epi32(dstb, _mm_srli_epi32(sse2_mul32(srcb, sse2_mul32(_mm_set1_epi32(cb), ina)), 16)), _mm_set1_epi32(BlendBlit::kBModMask));
-			srcg = _mm_and_si128(_mm_add_epi32(dstg, sse2_mul32(srcg, sse2_mul32(_mm_set1_epi32(cg), ina))), _mm_set1_epi32(BlendBlit::kGModMask));
-			srcr = _mm_and_si128(_mm_add_epi32(dstr, _mm_srli_epi32(sse2_mul32(srcr, sse2_mul32(_mm_set1_epi32(cr), ina)), BlendBlit::kRModShift - 16)), _mm_set1_epi32(BlendBlit::kRModMask));
+			srcb = _mm_and_si128(_mm_add_epi32(dstb, _mm_srli_epi32(sse2_mul32(srcb, sse2_mul32(_mm_set1_epi32(this->cb), ina)), 16)), _mm_set1_epi32(BlendBlit::kBModMask));
+			srcg = _mm_and_si128(_mm_add_epi32(dstg, sse2_mul32(srcg, sse2_mul32(_mm_set1_epi32(this->cg), ina))), _mm_set1_epi32(BlendBlit::kGModMask));
+			srcr = _mm_and_si128(_mm_add_epi32(dstr, _mm_srli_epi32(sse2_mul32(srcr, sse2_mul32(_mm_set1_epi32(this->cr), ina)), BlendBlit::kRModShift - 16)), _mm_set1_epi32(BlendBlit::kRModMask));
 
 			src = _mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kAModMask));
 			src = _mm_or_si128(src, _mm_or_si128(srcb, _mm_or_si128(srcg, srcr)));
@@ -212,9 +227,12 @@ struct AdditiveBlend : public BlendBlitImpl_Base::AdditiveBlend<doscale, rgbmod,
 	}
 };
 
-template<bool doscale, bool rgbmod, bool alphamod>
-struct SubtractiveBlend : public BlendBlitImpl_Base::SubtractiveBlend<doscale, rgbmod, alphamod> {
-	static inline __m128i simd(__m128i src, __m128i dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
+template<bool rgbmod, bool alphamod>
+struct SubtractiveBlend : public BlendBlitImpl_Base::SubtractiveBlend<rgbmod, alphamod> {
+public:
+	constexpr SubtractiveBlend(const uint32 color) : BlendBlitImpl_Base::SubtractiveBlend<rgbmod, alphamod>(color) {}
+
+	inline __m128i simd(__m128i src, __m128i dst) const {
 		__m128i ina = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
 		__m128i srcb = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
 		__m128i srcg = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
@@ -223,27 +241,23 @@ struct SubtractiveBlend : public BlendBlitImpl_Base::SubtractiveBlend<doscale, r
 		__m128i dstg = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
 		__m128i dstr = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
 
-		srcb = _mm_and_si128(_mm_slli_epi32(_mm_max_epi16(_mm_sub_epi32(dstb, _mm_srli_epi32(sse2_mul32(sse2_mul32(srcb, _mm_set1_epi32(cb)), sse2_mul32(dstb, ina)), 24)), _mm_set1_epi32(0)), BlendBlit::kBModShift), _mm_set1_epi32(BlendBlit::kBModMask));
-		srcg = _mm_and_si128(_mm_slli_epi32(_mm_max_epi16(_mm_sub_epi32(dstg, _mm_srli_epi32(sse2_mul32(sse2_mul32(srcg, _mm_set1_epi32(cg)), sse2_mul32(dstg, ina)), 24)), _mm_set1_epi32(0)), BlendBlit::kGModShift), _mm_set1_epi32(BlendBlit::kGModMask));
-		srcr = _mm_and_si128(_mm_slli_epi32(_mm_max_epi16(_mm_sub_epi32(dstr, _mm_srli_epi32(sse2_mul32(sse2_mul32(srcr, _mm_set1_epi32(cr)), sse2_mul32(dstr, ina)), 24)), _mm_set1_epi32(0)), BlendBlit::kRModShift), _mm_set1_epi32(BlendBlit::kRModMask));
+		srcb = _mm_and_si128(_mm_slli_epi32(_mm_max_epi16(_mm_sub_epi32(dstb, _mm_srli_epi32(sse2_mul32(sse2_mul32(srcb, _mm_set1_epi32(this->cb)), sse2_mul32(dstb, ina)), 24)), _mm_set1_epi32(0)), BlendBlit::kBModShift), _mm_set1_epi32(BlendBlit::kBModMask));
+		srcg = _mm_and_si128(_mm_slli_epi32(_mm_max_epi16(_mm_sub_epi32(dstg, _mm_srli_epi32(sse2_mul32(sse2_mul32(srcg, _mm_set1_epi32(this->cg)), sse2_mul32(dstg, ina)), 24)), _mm_set1_epi32(0)), BlendBlit::kGModShift), _mm_set1_epi32(BlendBlit::kGModMask));
+		srcr = _mm_and_si128(_mm_slli_epi32(_mm_max_epi16(_mm_sub_epi32(dstr, _mm_srli_epi32(sse2_mul32(sse2_mul32(srcr, _mm_set1_epi32(this->cr)), sse2_mul32(dstr, ina)), 24)), _mm_set1_epi32(0)), BlendBlit::kRModShift), _mm_set1_epi32(BlendBlit::kRModMask));
 
 		return _mm_or_si128(_mm_set1_epi32(BlendBlit::kAModMask), _mm_or_si128(srcb, _mm_or_si128(srcg, srcr)));
 	}
 };
 
 public:
-template<template <bool DOSCALE, bool RGBMOD, bool ALPHAMOD> class PixelFunc, bool doscale, bool rgbmod, bool alphamod, bool coloradd1, bool loaddst>
+template<template <bool RGBMOD, bool ALPHAMOD> class PixelFunc, bool doscale, bool rgbmod, bool alphamod>
 static inline void blitInnerLoop(BlendBlit::Args &args) {
+	const bool loaddst = true; // TODO: Only set this when necessary
+
 	const byte *in;
 	byte *out;
 
-	const byte rawcr = (args.color >> BlendBlit::kRModShift) & 0xFF;
-	const byte rawcg = (args.color >> BlendBlit::kGModShift) & 0xFF;
-	const byte rawcb = (args.color >> BlendBlit::kBModShift) & 0xFF;
-	const byte ca = alphamod ? ((args.color >> BlendBlit::kAModShift) & 0xFF) : 255;
-	const uint32 cr = coloradd1 ? (rgbmod   ? (rawcr == 255 ? 256 : rawcr) : 256) : (rgbmod   ? rawcr : 255);
-	const uint32 cg = coloradd1 ? (rgbmod   ? (rawcg == 255 ? 256 : rawcg) : 256) : (rgbmod   ? rawcg : 255);
-	const uint32 cb = coloradd1 ? (rgbmod   ? (rawcb == 255 ? 256 : rawcb) : 256) : (rgbmod   ? rawcb : 255);
+	PixelFunc<rgbmod, alphamod> pixelFunc(args.color);
 
 	int scaleXCtr, scaleYCtr = args.scaleYoff;
 	const byte *inBase;
@@ -278,7 +292,7 @@ static inline void blitInnerLoop(BlendBlit::Args &args) {
 				srcPixels = _mm_shuffle_epi32(srcPixels, _MM_SHUFFLE(0, 1, 2, 3));
 			}
 			{
-				const __m128i res = PixelFunc<doscale, rgbmod, alphamod>::simd(srcPixels, dstPixels, args.flipping & FLIP_H, ca, cr, cg, cb);
+				const __m128i res = pixelFunc.simd(srcPixels, dstPixels);
 				_mm_storeu_si128((__m128i *)out, res);
 			}
 			if (!doscale) in += (ptrdiff_t)args.inStep * 4;
@@ -290,8 +304,8 @@ static inline void blitInnerLoop(BlendBlit::Args &args) {
 				in = inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep;
 			}
 
-			PixelFunc<doscale, rgbmod, alphamod>::normal(in, out, ca, cr, cg, cb);
-			
+			pixelFunc.normal(in, out);
+
 			if (doscale)
 				scaleXCtr += args.scaleX;
 			else


Commit: c41edfd2e5f771c4315cd1c3cb9bf761e75d1dab
    https://github.com/scummvm/scummvm/commit/c41edfd2e5f771c4315cd1c3cb9bf761e75d1dab
Author: Cameron Cawley (ccawley2011 at gmail.com)
Date: 2024-09-29T02:19:42+03:00

Commit Message:
GRAPHICS: Some optimisations for the alpha blending routines

Changed paths:
    graphics/blit/blit-alpha.h


diff --git a/graphics/blit/blit-alpha.h b/graphics/blit/blit-alpha.h
index 0fbe93bd1da..2ebe959c82d 100644
--- a/graphics/blit/blit-alpha.h
+++ b/graphics/blit/blit-alpha.h
@@ -46,9 +46,27 @@ public:
 	constexpr AlphaBlend(const uint32 color) : BaseBlend<rgbmod, alphamod>(color) {}
 
 	inline void normal(const byte *in, byte *out) const {
-		uint32 ina = in[BlendBlit::kAIndex] * this->ca >> 8;
+		uint32 ina;
 
-		if (ina != 0) {
+		if (alphamod) {
+			ina = in[BlendBlit::kAIndex] * this->ca >> 8;
+		} else {
+			ina = in[BlendBlit::kAIndex];
+		}
+
+		if (ina == 255) {
+			if (rgbmod) {
+				out[BlendBlit::kAIndex] = 255;
+				out[BlendBlit::kBIndex] = (in[BlendBlit::kBIndex] * this->cb >> 8);
+				out[BlendBlit::kGIndex] = (in[BlendBlit::kGIndex] * this->cg >> 8);
+				out[BlendBlit::kRIndex] = (in[BlendBlit::kRIndex] * this->cr >> 8);
+			} else {
+				out[BlendBlit::kAIndex] = 255;
+				out[BlendBlit::kBIndex] = in[BlendBlit::kBIndex];
+				out[BlendBlit::kGIndex] = in[BlendBlit::kGIndex];
+				out[BlendBlit::kRIndex] = in[BlendBlit::kRIndex];
+			}
+		} else if (ina != 0) {
 			if (rgbmod) {
 				const uint outb = (out[BlendBlit::kBIndex] * (255 - ina) >> 8);
 				const uint outg = (out[BlendBlit::kGIndex] * (255 - ina) >> 8);
@@ -75,12 +93,34 @@ public:
 	constexpr MultiplyBlend(const uint32 color) : BaseBlend<rgbmod, alphamod>(color) {}
 
 	inline void normal(const byte *in, byte *out) const {
-		uint32 ina = in[BlendBlit::kAIndex] * this->ca >> 8;
+		uint32 ina;
 
-		if (ina != 0) {
-			out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] * ((in[BlendBlit::kBIndex] * this->cb * ina) >> 16) >> 8;
-			out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] * ((in[BlendBlit::kGIndex] * this->cg * ina) >> 16) >> 8;
-			out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] * ((in[BlendBlit::kRIndex] * this->cr * ina) >> 16) >> 8;
+		if (alphamod) {
+			ina = in[BlendBlit::kAIndex] * this->ca >> 8;
+		} else {
+			ina = in[BlendBlit::kAIndex];
+		}
+
+		if (ina == 255) {
+			if (rgbmod) {
+				out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] * ((in[BlendBlit::kBIndex] * this->cb) >> 8) >> 8;
+				out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] * ((in[BlendBlit::kGIndex] * this->cg) >> 8) >> 8;
+				out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] * ((in[BlendBlit::kRIndex] * this->cr) >> 8) >> 8;
+			} else {
+				out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] * in[BlendBlit::kBIndex] >> 8;
+				out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] * in[BlendBlit::kGIndex] >> 8;
+				out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] * in[BlendBlit::kRIndex] >> 8;
+			}
+		} else if (ina != 0) {
+			if (rgbmod) {
+				out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] * ((in[BlendBlit::kBIndex] * this->cb * ina) >> 16) >> 8;
+				out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] * ((in[BlendBlit::kGIndex] * this->cg * ina) >> 16) >> 8;
+				out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] * ((in[BlendBlit::kRIndex] * this->cr * ina) >> 16) >> 8;
+			} else {
+				out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] * ((in[BlendBlit::kBIndex] * ina) >> 8) >> 8;
+				out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] * ((in[BlendBlit::kGIndex] * ina) >> 8) >> 8;
+				out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] * ((in[BlendBlit::kRIndex] * ina) >> 8) >> 8;
+			}
 		}
 	}
 };
@@ -102,11 +142,10 @@ public:
 
 	inline void normal(const byte *in, byte *out) const {
 		uint32 pix = *(const uint32 *)in;
-		int a = in[BlendBlit::kAIndex];
+		uint32 a = pix & BlendBlit::kAModMask;
 
 		if (a != 0) {   // Full opacity (Any value not exactly 0 is Opaque here)
-			*(uint32 *)out = pix;
-			out[BlendBlit::kAIndex] = 0xFF;
+			*(uint32 *)out = pix | BlendBlit::kAModMask;
 		}
 	}
 };
@@ -117,12 +156,34 @@ public:
 	constexpr AdditiveBlend(const uint32 color) : BaseBlend<rgbmod, alphamod>(color) {}
 
 	inline void normal(const byte *in, byte *out) const {
-		uint32 ina = in[BlendBlit::kAIndex] * this->ca >> 8;
+		uint32 ina;
 
-		if (ina != 0) {
-			out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] + ((in[BlendBlit::kBIndex] * this->cb * ina) >> 16);
-			out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] + ((in[BlendBlit::kGIndex] * this->cg * ina) >> 16);
-			out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] + ((in[BlendBlit::kRIndex] * this->cr * ina) >> 16);
+		if (alphamod) {
+			ina = in[BlendBlit::kAIndex] * this->ca >> 8;
+		} else {
+			ina = in[BlendBlit::kAIndex];
+		}
+
+		if (ina == 255) {
+			if (rgbmod) {
+				out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] + ((in[BlendBlit::kBIndex] * this->cb) >> 8);
+				out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] + ((in[BlendBlit::kGIndex] * this->cg) >> 8);
+				out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] + ((in[BlendBlit::kRIndex] * this->cr) >> 8);
+			} else {
+				out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] + in[BlendBlit::kBIndex];
+				out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] + in[BlendBlit::kGIndex];
+				out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] + in[BlendBlit::kRIndex];
+			}
+		} else if (ina != 0) {
+			if (rgbmod) {
+				out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] + ((in[BlendBlit::kBIndex] * this->cb * ina) >> 16);
+				out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] + ((in[BlendBlit::kGIndex] * this->cg * ina) >> 16);
+				out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] + ((in[BlendBlit::kRIndex] * this->cr * ina) >> 16);
+			} else {
+				out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] + ((in[BlendBlit::kBIndex] * ina) >> 8);
+				out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] + ((in[BlendBlit::kGIndex] * ina) >> 8);
+				out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] + ((in[BlendBlit::kRIndex] * ina) >> 8);
+			}
 		}
 	}
 };
@@ -133,10 +194,30 @@ public:
 	constexpr SubtractiveBlend(const uint32 color) : BaseBlend<rgbmod, alphamod>(color) {}
 
 	inline void normal(const byte *in, byte *out) const {
+		uint32 ina = in[BlendBlit::kAIndex];
 		out[BlendBlit::kAIndex] = 255;
-		out[BlendBlit::kBIndex] = MAX<int32>(out[BlendBlit::kBIndex] - ((in[BlendBlit::kBIndex] * this->cb * (out[BlendBlit::kBIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
-		out[BlendBlit::kGIndex] = MAX<int32>(out[BlendBlit::kGIndex] - ((in[BlendBlit::kGIndex] * this->cg * (out[BlendBlit::kGIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
-		out[BlendBlit::kRIndex] = MAX<int32>(out[BlendBlit::kRIndex] - ((in[BlendBlit::kRIndex] * this->cr * (out[BlendBlit::kRIndex]) * in[BlendBlit::kAIndex]) >> 24), 0);
+
+		if (ina == 255) {
+			if (rgbmod) {
+				out[BlendBlit::kBIndex] = MAX<int32>(out[BlendBlit::kBIndex] - ((in[BlendBlit::kBIndex] * this->cb * (out[BlendBlit::kBIndex])) >> 16), 0);
+				out[BlendBlit::kGIndex] = MAX<int32>(out[BlendBlit::kGIndex] - ((in[BlendBlit::kGIndex] * this->cg * (out[BlendBlit::kGIndex])) >> 16), 0);
+				out[BlendBlit::kRIndex] = MAX<int32>(out[BlendBlit::kRIndex] - ((in[BlendBlit::kRIndex] * this->cr * (out[BlendBlit::kRIndex])) >> 16), 0);
+			} else {
+				out[BlendBlit::kBIndex] = MAX<int32>(out[BlendBlit::kBIndex] - ((in[BlendBlit::kBIndex] * (out[BlendBlit::kBIndex])) >> 8), 0);
+				out[BlendBlit::kGIndex] = MAX<int32>(out[BlendBlit::kGIndex] - ((in[BlendBlit::kGIndex] * (out[BlendBlit::kGIndex])) >> 8), 0);
+				out[BlendBlit::kRIndex] = MAX<int32>(out[BlendBlit::kRIndex] - ((in[BlendBlit::kRIndex] * (out[BlendBlit::kRIndex])) >> 8), 0);
+			}
+		} else if (ina != 0) {
+			if (rgbmod) {
+				out[BlendBlit::kBIndex] = MAX<int32>(out[BlendBlit::kBIndex] - ((in[BlendBlit::kBIndex] * this->cb * (out[BlendBlit::kBIndex]) * ina) >> 24), 0);
+				out[BlendBlit::kGIndex] = MAX<int32>(out[BlendBlit::kGIndex] - ((in[BlendBlit::kGIndex] * this->cg * (out[BlendBlit::kGIndex]) * ina) >> 24), 0);
+				out[BlendBlit::kRIndex] = MAX<int32>(out[BlendBlit::kRIndex] - ((in[BlendBlit::kRIndex] * this->cr * (out[BlendBlit::kRIndex]) * ina) >> 24), 0);
+			} else {
+				out[BlendBlit::kBIndex] = MAX<int32>(out[BlendBlit::kBIndex] - ((in[BlendBlit::kBIndex] * (out[BlendBlit::kBIndex]) * ina) >> 16), 0);
+				out[BlendBlit::kGIndex] = MAX<int32>(out[BlendBlit::kGIndex] - ((in[BlendBlit::kGIndex] * (out[BlendBlit::kGIndex]) * ina) >> 16), 0);
+				out[BlendBlit::kRIndex] = MAX<int32>(out[BlendBlit::kRIndex] - ((in[BlendBlit::kRIndex] * (out[BlendBlit::kRIndex]) * ina) >> 16), 0);
+			}
+		}
 	}
 };
 


Commit: ec53c5ea87c7350eded25e6333c77bbbd3519a8d
    https://github.com/scummvm/scummvm/commit/ec53c5ea87c7350eded25e6333c77bbbd3519a8d
Author: Cameron Cawley (ccawley2011 at gmail.com)
Date: 2024-09-29T02:19:42+03:00

Commit Message:
GRAPHICS: Simplify the generic blending routines

Changed paths:
    graphics/blit/blit-generic.cpp


diff --git a/graphics/blit/blit-generic.cpp b/graphics/blit/blit-generic.cpp
index b153851b4fb..bbdbc062f28 100644
--- a/graphics/blit/blit-generic.cpp
+++ b/graphics/blit/blit-generic.cpp
@@ -68,109 +68,8 @@ static inline void blitInnerLoop(BlendBlit::Args &args) {
 	}
 }
 
-template<bool doscale>
-static void doBlitOpaqueBlendLogicGeneric(BlendBlit::Args &args) {
-	const byte *in;
-	byte *out;
-
-	int scaleXCtr, scaleYCtr = args.scaleYoff;
-	const byte *inBase;
-
-	for (uint32 i = 0; i < args.height; i++) {
-		if (doscale) {
-			inBase = args.ino + (scaleYCtr + 1) / BlendBlit::SCALE_THRESHOLD * args.inoStep;
-			scaleXCtr = args.scaleXoff;
-		} else {
-			in = args.ino;
-		}
-		out = args.outo;
-
-		if (doscale) {
-			for (uint32 j = 0; j < args.width; j++) {
-				in = inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep;
-				*(uint32 *)out = *(const uint32 *)in | BlendBlit::kAModMask;
-				scaleXCtr += args.scaleX;
-				out += 4;
-			}
-		} else {
-			for (uint32 j = 0; j < args.width; j++) {
-				*(uint32 *)out = *(const uint32 *)in | BlendBlit::kAModMask;
-				in += args.inStep;
-				out += 4;
-			}
-		}
-
-		if (doscale)
-			scaleYCtr += args.scaleY;
-		else
-			args.ino += args.inoStep;
-		args.outo += args.dstPitch;
-	}
-}
-
-template<bool doscale>
-static void doBlitBinaryBlendLogicGeneric(BlendBlit::Args &args) {
-	const byte *in;
-	byte *out;
-
-	int scaleXCtr, scaleYCtr = args.scaleYoff;
-	const byte *inBase;
-
-	for (uint32 i = 0; i < args.height; i++) {
-		if (doscale) {
-			inBase = args.ino + scaleYCtr / BlendBlit::SCALE_THRESHOLD * args.inoStep;
-			scaleXCtr = args.scaleXoff;
-		} else {
-			in = args.ino;
-		}
-		out = args.outo;
-		for (uint32 j = 0; j < args.width; j++) {
-			if (doscale) {
-				in = inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep;
-			}
-
-			uint32 pix = *(const uint32 *)in, pixout = *(const uint32 *)out;
-			uint32 mask = (pix & BlendBlit::kAModMask) ? 0xffffffff : 0;
-			pixout &= ~mask;
-			pix = (pix | BlendBlit::kAModMask) & mask;
-			*(uint32 *)out = pixout | pix;
-			
-			if (doscale)
-				scaleXCtr += args.scaleX;
-			else
-				in += args.inStep;
-			out += 4;
-		}
-		if (doscale)
-			scaleYCtr += args.scaleY;
-		else
-			args.ino += args.inoStep;
-		args.outo += args.dstPitch;
-	}
-}
-
 }; // end of class BlendBlitImpl_Default
 
-template<>
-inline void BlendBlitImpl_Default::blitInnerLoop<BlendBlitImpl_Default::OpaqueBlend, true, false, false>(BlendBlit::Args &args) {
-	doBlitOpaqueBlendLogicGeneric<true>(args);
-}
-
-template<>
-inline void BlendBlitImpl_Default::blitInnerLoop<BlendBlitImpl_Default::OpaqueBlend, false, false, false>(BlendBlit::Args &args) {
-	doBlitOpaqueBlendLogicGeneric<false>(args);
-}
-
-template<>
-inline void BlendBlitImpl_Default::blitInnerLoop<BlendBlitImpl_Default::BinaryBlend, true, false, false>(BlendBlit::Args &args) {
-	doBlitBinaryBlendLogicGeneric<true>(args);
-}
-
-template<>
-inline void BlendBlitImpl_Default::blitInnerLoop<BlendBlitImpl_Default::BinaryBlend, false, false, false>(BlendBlit::Args &args) {
-	doBlitBinaryBlendLogicGeneric<false>(args);
-}
-
 void BlendBlit::blitGeneric(Args &args, const TSpriteBlendMode &blendMode, const AlphaType &alphaType) {
 	blitT<BlendBlitImpl_Default>(args, blendMode, alphaType);
 }