[Scummvm-git-logs] scummvm master -> 5b29081355863c56784fd542908dfaf9e3e1c471

Sat Oct 14 11:33:15 UTC 2023

This automated email contains information about 4 new commits which have been
pushed to the 'scummvm' repo located at https://github.com/scummvm/scummvm .

Summary:
5214d921c4 AGS: Fix edge case in Allegro optimized blit clipping
d6d65273c0 AGS: Allow usage of AVX2 optimized blit path
18a34cb7af AGS: Fix AVX2 scaled blitting
5b29081355 AGS: Apply clipping fix to 2bpp optimized blitting


Commit: 5214d921c4101807f89f7cebe61b6a3a619ad068
    https://github.com/scummvm/scummvm/commit/5214d921c4101807f89f7cebe61b6a3a619ad068
Author: Kaloyan Chehlarski (strahy at outlook.com)
Date: 2023-10-14T14:27:54+03:00

Commit Message:
AGS: Fix edge case in Allegro optimized blit clipping

This fixes a (literal) edge case that was introduced by the last
attempt at fixing the clipping issues that crashed
ScummVM when a sprite was drawn at the bottom right of
the screen. In particular, the specific Y value that
results in only one row of the sprite being off screen no
longer produces a crash.

Changed paths:
    engines/ags/lib/allegro/surface_avx2.cpp
    engines/ags/lib/allegro/surface_neon.cpp
    engines/ags/lib/allegro/surface_sse2.cpp

diff --git a/engines/ags/lib/allegro/surface_avx2.cpp b/engines/ags/lib/allegro/surface_avx2.cpp
index 70b0ea9d9ce..4a0b0c2d2bc 100644
--- a/engines/ags/lib/allegro/surface_avx2.cpp
+++ b/engines/ags/lib/allegro/surface_avx2.cpp
@@ -488,7 +488,7 @@ static void drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
 		xCtrBppStart = xCtrStart * SrcBytesPerPixel;
 		args.xStart = 0;
 	}
-	int destY = args.yStart, srcYCtr = 0, yCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 8 == 0) ? args.dstRect.height() : (args.dstRect.height() - 1);
+	int destY = args.yStart, srcYCtr = 0, yCtr = 0, scaleYCtr = 0, yCtrHeight = args.dstRect.height();
 	if (Scale) yCtrHeight = args.dstRect.height();
 	if (args.yStart < 0) {
 		yCtr = -args.yStart;
@@ -499,7 +499,10 @@ static void drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
 		}
 	}
 	if (args.yStart + yCtrHeight > args.destArea.h) {
-		yCtrHeight = (xCtrWidth % 8 == 0) ? args.destArea.h - args.yStart : args.destArea.h - args.yStart - 1;
+		yCtrHeight = args.destArea.h - args.yStart;
+	}
+	if (xCtrWidth % 8 != 0) {
+		--yCtrHeight;
 	}
 
 	byte *destP = (byte *)args.destArea.getBasePtr(0, destY);
diff --git a/engines/ags/lib/allegro/surface_neon.cpp b/engines/ags/lib/allegro/surface_neon.cpp
index 6a370522ee2..9aa14b76ffe 100644
--- a/engines/ags/lib/allegro/surface_neon.cpp
+++ b/engines/ags/lib/allegro/surface_neon.cpp
@@ -484,7 +484,7 @@ static void drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
 		xCtrBppStart = xCtrStart * SrcBytesPerPixel;
 		args.xStart = 0;
 	}
-	int destY = args.yStart, srcYCtr = 0, yCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 4 == 0) ? args.dstRect.height() : (args.dstRect.height() - 1);
+	int destY = args.yStart, srcYCtr = 0, yCtr = 0, scaleYCtr = 0, yCtrHeight = args.dstRect.height();
 	if (Scale) yCtrHeight = args.dstRect.height();
 	if (args.yStart < 0) {
 		yCtr = -args.yStart;
@@ -495,7 +495,10 @@ static void drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
 		}
 	}
 	if (args.yStart + yCtrHeight > args.destArea.h) {
-		yCtrHeight = (xCtrWidth % 4 == 0) ? args.destArea.h - args.yStart : args.destArea.h - args.yStart - 1;
+		yCtrHeight = args.destArea.h - args.yStart;
+	}
+	if (xCtrWidth % 4 != 0) {
+		--yCtrHeight;
 	}
 
 	byte *destP = (byte *)args.destArea.getBasePtr(0, destY);
diff --git a/engines/ags/lib/allegro/surface_sse2.cpp b/engines/ags/lib/allegro/surface_sse2.cpp
index 72b2f5bb5ea..8ff8fd5b66f 100644
--- a/engines/ags/lib/allegro/surface_sse2.cpp
+++ b/engines/ags/lib/allegro/surface_sse2.cpp
@@ -504,7 +504,7 @@ static void drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
 		xCtrBppStart = xCtrStart * SrcBytesPerPixel;
 		args.xStart = 0;
 	}
-	int destY = args.yStart, srcYCtr = 0, yCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 4 == 0) ? args.dstRect.height() : (args.dstRect.height() - 1);
+	int destY = args.yStart, srcYCtr = 0, yCtr = 0, scaleYCtr = 0, yCtrHeight = args.dstRect.height();
 	if (Scale) yCtrHeight = args.dstRect.height();
 	if (args.yStart < 0) {
 		yCtr = -args.yStart;
@@ -515,7 +515,10 @@ static void drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
 		}
 	}
 	if (args.yStart + yCtrHeight > args.destArea.h) {
-		yCtrHeight = (xCtrWidth % 4 == 0) ? args.destArea.h - args.yStart : args.destArea.h - args.yStart - 1;
+		yCtrHeight = args.destArea.h - args.yStart;
+	}
+	if (xCtrWidth % 4 != 0) {
+		--yCtrHeight;
 	}
 
 	byte *destP = (byte *)args.destArea.getBasePtr(0, destY);


Commit: d6d65273c07650ff41d818a858abc789a73dd66a
    https://github.com/scummvm/scummvm/commit/d6d65273c07650ff41d818a858abc789a73dd66a
Author: Kaloyan Chehlarski (strahy at outlook.com)
Date: 2023-10-14T14:27:54+03:00

Commit Message:
AGS: Allow usage of AVX2 optimized blit path

The previous design of the optimized paths gave the same
name to the dummy class that each path uses to access
BITMAP's private members. On machines that support AVX2
this meant that the SSE2 path would shadow the AVX2 one,
making it inaccessible unless the configure script
specifically disabled SSE2 extensions.
The dummy class has now been split into three, one
for each of the optimization paths. Those dummy classes
now contain all of the internal functions, to avoid potential
name clashes, and all of the internal functions have
been marked as static.

Changed paths:
    engines/ags/lib/allegro/surface.h
    engines/ags/lib/allegro/surface_avx2.cpp
    engines/ags/lib/allegro/surface_neon.cpp
    engines/ags/lib/allegro/surface_sse2.cpp


diff --git a/engines/ags/lib/allegro/surface.h b/engines/ags/lib/allegro/surface.h
index 8bc9ab1cc6a..1f6536d5b89 100644
--- a/engines/ags/lib/allegro/surface.h
+++ b/engines/ags/lib/allegro/surface.h
@@ -265,6 +265,10 @@ public:
 	// kTintBlenderMode and kTintLightBlenderMode
 	void blendTintSprite(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &aDest, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha, bool light) const;
 
+	friend class DrawInnerImpl_AVX2;
+	friend class DrawInnerImpl_SSE2;
+	friend class DrawInnerImpl_NEON;
+
 	constexpr static int SCALE_THRESHOLD_BITS = 8;
 	constexpr static int SCALE_THRESHOLD = 1 << SCALE_THRESHOLD_BITS;
 	struct DrawInnerArgs {
@@ -287,7 +291,7 @@ public:
 					  bool vertFlip, int tintRed, int tintGreen, int tintBlue,
 					  bool doScale);
 	};
-	friend class DrawInnerImpl;
+
 	template<bool Scale>
 	void drawGeneric(DrawInnerArgs &args);
 #ifdef SCUMMVM_NEON
diff --git a/engines/ags/lib/allegro/surface_avx2.cpp b/engines/ags/lib/allegro/surface_avx2.cpp
index 4a0b0c2d2bc..cfd2a36e234 100644
--- a/engines/ags/lib/allegro/surface_avx2.cpp
+++ b/engines/ags/lib/allegro/surface_avx2.cpp
@@ -30,7 +30,9 @@
 
 namespace AGS3 {
 
-inline __m256i simd2BppTo4Bpp(__m256i pixels) {
+class DrawInnerImpl_AVX2 {
+
+static inline __m256i simd2BppTo4Bpp(__m256i pixels) {
 	__m128i x128 = _mm256_castsi256_si128(pixels);
 	__m256i x = _mm256_cvtepu16_epi32(x128);
 
@@ -49,7 +51,7 @@ inline __m256i simd2BppTo4Bpp(__m256i pixels) {
 	return _mm256_or_si256(_mm256_or_si256(_mm256_or_si256(r, g), b), _mm256_set1_epi32(0xff000000));
 }
 
-inline __m256i simd4BppTo2Bpp(__m256i pixels) {
+static inline __m256i simd4BppTo2Bpp(__m256i pixels) {
 	// x is the final 16 bit rgb pixel
 	__m256i x = _mm256_srli_epi32(_mm256_and_si256(pixels, _mm256_set1_epi32(0x000000ff)), 3);
 	x = _mm256_or_si256(x, _mm256_slli_epi32(_mm256_srli_epi32(_mm256_and_si256(pixels, _mm256_set1_epi32(0x0000ff00)), 8+2), 5));
@@ -59,7 +61,7 @@ inline __m256i simd4BppTo2Bpp(__m256i pixels) {
 	return _mm256_packs_epi32(x, _mm256_setzero_si256());
 }
 
-inline __m256i rgbBlendSIMD2Bpp(__m256i srcCols, __m256i destCols, __m256i alphas) {
+static inline __m256i rgbBlendSIMD2Bpp(__m256i srcCols, __m256i destCols, __m256i alphas) {
 	// Here we add 1 to alphas if its 0. This is what the original blender function did
 	alphas = _mm256_add_epi16(alphas, _mm256_and_si256(_mm256_cmpgt_epi16(alphas, _mm256_setzero_si256()), _mm256_set1_epi16(1)));
 
@@ -102,7 +104,7 @@ inline __m256i rgbBlendSIMD2Bpp(__m256i srcCols, __m256i destCols, __m256i alpha
 // preserveAlpha:
 //		false => set destCols's alpha to 0
 // 		true => keep destCols's alpha
-inline __m256i rgbBlendSIMD(__m256i srcCols, __m256i destCols, __m256i alphas, bool preserveAlpha) {
+static inline __m256i rgbBlendSIMD(__m256i srcCols, __m256i destCols, __m256i alphas, bool preserveAlpha) {
 	// Here we add 1 to alphas if its 0. This is what the original blender function did.
 	alphas = _mm256_add_epi32(alphas, _mm256_and_si256(_mm256_cmpgt_epi32(alphas, _mm256_setzero_si256()), _mm256_set1_epi32(1)));
 
@@ -144,7 +146,7 @@ inline __m256i rgbBlendSIMD(__m256i srcCols, __m256i destCols, __m256i alphas, b
 	return srcCols;
 }
 
-inline __m256i argbBlendSIMD(__m256i srcCols, __m256i destCols) {
+static inline __m256i argbBlendSIMD(__m256i srcCols, __m256i destCols) {
 	__m256 srcA = _mm256_cvtepi32_ps(_mm256_srli_epi32(srcCols, 24));
 	srcA = _mm256_mul_ps(srcA, _mm256_set1_ps(1.0f / 255.0f));
 	__m256 srcR = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(srcCols, 16), _mm256_set1_epi32(0xff)));
@@ -175,7 +177,7 @@ inline __m256i argbBlendSIMD(__m256i srcCols, __m256i destCols) {
 			_mm256_cvtps_epi32(destB))));
 }
 
-inline __m256i blendTintSpriteSIMD(__m256i srcCols, __m256i destCols, __m256i alphas, bool light) {
+static inline __m256i blendTintSpriteSIMD(__m256i srcCols, __m256i destCols, __m256i alphas, bool light) {
 	// This function is NOT 1 to 1 with the original... It just approximates it
 	// It gets the value of the HSV of the dest color
 	// Then it gets the HSV of the srcCols
@@ -275,21 +277,21 @@ inline __m256i blendTintSpriteSIMD(__m256i srcCols, __m256i destCols, __m256i al
 	return final;
 }
 
-inline __m256i mul32_as16(__m256i a, __m256i b) {
+static inline __m256i mul32_as16(__m256i a, __m256i b) {
 	__m256i a16 = _mm256_packs_epi32(a, _mm256_setzero_si256());
 	__m256i b16 = _mm256_packs_epi32(b, _mm256_setzero_si256());
 	__m256i res = _mm256_mullo_epi16(a16, b16);
 	return _mm256_unpacklo_epi16(res, _mm256_setzero_si256());
 }
 
-inline __m256i findmin32_as16(__m256i a, __m256i b) {
+static inline __m256i findmin32_as16(__m256i a, __m256i b) {
 	__m256i a16 = _mm256_packs_epi32(a, _mm256_setzero_si256());
 	__m256i b16 = _mm256_packs_epi32(b, _mm256_setzero_si256());
 	__m256i res = _mm256_min_epi16(a16, b16);
 	return _mm256_unpacklo_epi16(res, _mm256_setzero_si256());
 }
 
-inline __m256i blendPixelSIMD(__m256i srcCols, __m256i destCols, __m256i alphas) {
+static inline __m256i blendPixelSIMD(__m256i srcCols, __m256i destCols, __m256i alphas) {
 	__m256i srcAlphas, difAlphas, mask, ch1, ch2;
 	auto setupArgbAlphas = [&]() {
 		// This acts the same as this in the normal blender functions
@@ -354,7 +356,7 @@ inline __m256i blendPixelSIMD(__m256i srcCols, __m256i destCols, __m256i alphas)
 	return _mm256_setzero_si256();
 }
 
-inline __m256i blendPixelSIMD2Bpp(__m256i srcCols, __m256i destCols, __m256i alphas) {
+static inline __m256i blendPixelSIMD2Bpp(__m256i srcCols, __m256i destCols, __m256i alphas) {
 	__m256i mask, ch1, ch2;
 	switch (_G(_blender_mode)) {
 	case kSourceAlphaBlender:
@@ -391,7 +393,7 @@ inline __m256i blendPixelSIMD2Bpp(__m256i srcCols, __m256i destCols, __m256i alp
 }
 
 template<int DestBytesPerPixel, int SrcBytesPerPixel>
-inline void drawPixelSIMD(byte *destPtr, const byte *srcP2, __m256i tint, __m256i alphas, __m256i maskedAlphas, __m256i transColors, int xDir, int xCtrBpp, int srcAlpha, int skipTrans, bool horizFlip, bool useTint, __m256i skipMask) {
+static inline void drawPixelSIMD(byte *destPtr, const byte *srcP2, __m256i tint, __m256i alphas, __m256i maskedAlphas, __m256i transColors, int xDir, int xCtrBpp, int srcAlpha, int skipTrans, bool horizFlip, bool useTint, __m256i skipMask) {
 	__m256i srcCols, destCol;
 
 	if (DestBytesPerPixel == 4)
@@ -428,7 +430,7 @@ inline void drawPixelSIMD(byte *destPtr, const byte *srcP2, __m256i tint, __m256
 	}
 }
 
-inline void drawPixelSIMD2Bpp(byte *destPtr, const byte *srcP2, __m256i tint, __m256i alphas, __m256i transColors, int xDir, int xCtrBpp, int srcAlpha, int skipTrans, bool horizFlip, bool useTint, __m256i skipMask) {
+static inline void drawPixelSIMD2Bpp(byte *destPtr, const byte *srcP2, __m256i tint, __m256i alphas, __m256i transColors, int xDir, int xCtrBpp, int srcAlpha, int skipTrans, bool horizFlip, bool useTint, __m256i skipMask) {
 	__m256i destCol = _mm256_loadu_si256((const __m256i *)destPtr);
 	__m256i srcCols = _mm256_loadu_si256((const __m256i *)(srcP2 + xDir * xCtrBpp));
 	__m256i mask1 = skipTrans ? _mm256_cmpeq_epi16(srcCols, transColors) : _mm256_setzero_si256();
@@ -452,7 +454,6 @@ inline void drawPixelSIMD2Bpp(byte *destPtr, const byte *srcP2, __m256i tint, __
 	_mm256_storeu_si256((__m256i *)destPtr, final);
 }
 
-class DrawInnerImpl {
 public:
 
 // This template handles 2bpp and 4bpp, the other specializations handle 1bpp and format conversion blits
@@ -950,20 +951,20 @@ static void drawInner1Bpp(BITMAP::DrawInnerArgs &args) {
 	}
 }
 
-}; // end of class DrawInnerImpl
+}; // end of class DrawInnerImpl_AVX2
 
 template<bool Scale>
 void BITMAP::drawAVX2(DrawInnerArgs &args) {
 	if (args.sameFormat) {
 		switch (format.bytesPerPixel) {
-		case 1: DrawInnerImpl::drawInner1Bpp<Scale>(args); break;
-		case 2: DrawInnerImpl::drawInner2Bpp<Scale>(args); break;
-		case 4: DrawInnerImpl::drawInner4BppWithConv<4, 4, Scale>(args); break;
+		case 1: DrawInnerImpl_AVX2::drawInner1Bpp<Scale>(args); break;
+		case 2: DrawInnerImpl_AVX2::drawInner2Bpp<Scale>(args); break;
+		case 4: DrawInnerImpl_AVX2::drawInner4BppWithConv<4, 4, Scale>(args); break;
 		}
 	} else if (format.bytesPerPixel == 4 && args.src.format.bytesPerPixel == 2) {
-		DrawInnerImpl::drawInner4BppWithConv<4, 2, Scale>(args);
+		DrawInnerImpl_AVX2::drawInner4BppWithConv<4, 2, Scale>(args);
 	} else if (format.bytesPerPixel == 2 && args.src.format.bytesPerPixel == 4) {
-		DrawInnerImpl::drawInner4BppWithConv<2, 4, Scale>(args);
+		DrawInnerImpl_AVX2::drawInner4BppWithConv<2, 4, Scale>(args);
 	}
 }
 
diff --git a/engines/ags/lib/allegro/surface_neon.cpp b/engines/ags/lib/allegro/surface_neon.cpp
index 9aa14b76ffe..0aa9f0ca839 100644
--- a/engines/ags/lib/allegro/surface_neon.cpp
+++ b/engines/ags/lib/allegro/surface_neon.cpp
@@ -20,9 +20,6 @@
  */
 
 #include "ags/ags.h"
-
-#ifdef SCUMMVM_NEON
-
 #include <arm_neon.h>
 #include "ags/globals.h"
 #include "ags/lib/allegro/color.h"
@@ -33,7 +30,9 @@
 
 namespace AGS3 {
 
-inline uint32x4_t simd2BppTo4Bpp(uint16x4_t pixels) {
+class DrawInnerImpl_NEON {
+
+static inline uint32x4_t simd2BppTo4Bpp(uint16x4_t pixels) {
 	uint32x4_t x = vmovl_u16(pixels);
 
 	// c is the extracted 5/6 bit color from the image
@@ -51,7 +50,7 @@ inline uint32x4_t simd2BppTo4Bpp(uint16x4_t pixels) {
 	return vorrq_u32(vorrq_u32(vorrq_u32(r, g), b), vmovq_n_u32(0xff000000));
 }
 
-inline uint16x4_t simd4BppTo2Bpp(uint32x4_t pixels) {
+static inline uint16x4_t simd4BppTo2Bpp(uint32x4_t pixels) {
 	// x is the final 16 bit rgb pixel
 	uint32x4_t x = vshrq_n_u32(vandq_u32(pixels, vmovq_n_u32(0x000000ff)), 3);
 	x = vorrq_u32(x, vshlq_n_u32(vshrq_n_u32(vandq_u32(pixels, vmovq_n_u32(0x0000ff00)), 8+2), 5));
@@ -59,7 +58,7 @@ inline uint16x4_t simd4BppTo2Bpp(uint32x4_t pixels) {
 	return vmovn_u32(x);
 }
 
-inline uint16x8_t rgbBlendSIMD2Bpp(uint16x8_t srcCols, uint16x8_t destCols, uint16x8_t alphas) {
+static inline uint16x8_t rgbBlendSIMD2Bpp(uint16x8_t srcCols, uint16x8_t destCols, uint16x8_t alphas) {
 	// Here we add 1 to alphas if its 0. This is what the original blender function did
 	alphas = vaddq_u16(alphas, vandq_u16(vceqq_u16(alphas, vmovq_n_u16(0)), vmovq_n_u16(1)));
 
@@ -116,7 +115,7 @@ inline uint16x8_t rgbBlendSIMD2Bpp(uint16x8_t srcCols, uint16x8_t destCols, uint
 // preserveAlpha:
 //		false => set destCols's alpha to 0
 // 		true => keep destCols's alpha
-inline uint32x4_t rgbBlendSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas, bool preserveAlpha) {
+static inline uint32x4_t rgbBlendSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas, bool preserveAlpha) {
 	// Here we add 1 to alphas if its 0. This is what the original blender function did
 	alphas = vaddq_u32(alphas, vandq_u32(vcgtq_u32(alphas, vmovq_n_u32(0)), vmovq_n_u32(1)));
 
@@ -157,7 +156,7 @@ inline uint32x4_t rgbBlendSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4
 }
 
 // uses the alpha from srcCols and destCols
-inline uint32x4_t argbBlendSIMD(uint32x4_t srcCols, uint32x4_t destCols) {
+static inline uint32x4_t argbBlendSIMD(uint32x4_t srcCols, uint32x4_t destCols) {
 	float32x4_t srcA = vcvtq_f32_u32(vshrq_n_u32(srcCols, 24));
 	srcA = vmulq_n_f32(srcA, 1.0f / 255.0f);
 	float32x4_t srcR = vcvtq_f32_u32(vandq_u32(vshrq_n_u32(srcCols, 16), vmovq_n_u32(0xff)));
@@ -188,7 +187,7 @@ inline uint32x4_t argbBlendSIMD(uint32x4_t srcCols, uint32x4_t destCols) {
 			vcvtq_u32_f32(destB))));
 }
 
-inline uint32x4_t blendTintSpriteSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas, bool light) {
+static inline uint32x4_t blendTintSpriteSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas, bool light) {
 	// This function is NOT 1 to 1 with the original... It just approximates it
 	// It gets the value of the HSV of the dest color
 	// Then it gets the HSV of the srcCols
@@ -287,7 +286,7 @@ inline uint32x4_t blendTintSpriteSIMD(uint32x4_t srcCols, uint32x4_t destCols, u
 	return final;
 }
 
-inline uint32x4_t blendPixelSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas) {
+static inline uint32x4_t blendPixelSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32x4_t alphas) {
 	uint32x4_t srcAlphas, difAlphas, mask, ch1, ch2;
 	auto setupArgbAlphas = [&]() {
 		// This acts the same as this in the normal blender functions
@@ -352,7 +351,7 @@ inline uint32x4_t blendPixelSIMD(uint32x4_t srcCols, uint32x4_t destCols, uint32
 	return srcCols;
 }
 
-inline uint16x8_t blendPixelSIMD2Bpp(uint16x8_t srcCols, uint16x8_t destCols, uint16x8_t alphas) {
+static inline uint16x8_t blendPixelSIMD2Bpp(uint16x8_t srcCols, uint16x8_t destCols, uint16x8_t alphas) {
 	uint16x8_t mask, ch1, ch2;
 	switch (_G(_blender_mode)) {
 	case kSourceAlphaBlender:
@@ -389,7 +388,7 @@ inline uint16x8_t blendPixelSIMD2Bpp(uint16x8_t srcCols, uint16x8_t destCols, ui
 }
 
 template<int DestBytesPerPixel, int SrcBytesPerPixel>
-inline void drawPixelSIMD(byte *destPtr, const byte *srcP2, uint32x4_t tint, uint32x4_t alphas, uint32x4_t maskedAlphas, uint32x4_t transColors, int xDir, int xCtrBpp, int srcAlpha, int skipTrans, bool horizFlip, bool useTint, uint32x4_t skipMask) {
+static inline void drawPixelSIMD(byte *destPtr, const byte *srcP2, uint32x4_t tint, uint32x4_t alphas, uint32x4_t maskedAlphas, uint32x4_t transColors, int xDir, int xCtrBpp, int srcAlpha, int skipTrans, bool horizFlip, bool useTint, uint32x4_t skipMask) {
 	uint32x4_t srcCols, destCol;
 
 	if (DestBytesPerPixel == 4)
@@ -425,7 +424,7 @@ inline void drawPixelSIMD(byte *destPtr, const byte *srcP2, uint32x4_t tint, uin
 	}
 }
 
-inline void drawPixelSIMD2Bpp(byte *destPtr, const byte *srcP2, uint16x8_t tint, uint16x8_t alphas, uint16x8_t transColors, int xDir, int xCtrBpp, int srcAlpha, int skipTrans, bool horizFlip, bool useTint, uint16x8_t skipMask) {
+static inline void drawPixelSIMD2Bpp(byte *destPtr, const byte *srcP2, uint16x8_t tint, uint16x8_t alphas, uint16x8_t transColors, int xDir, int xCtrBpp, int srcAlpha, int skipTrans, bool horizFlip, bool useTint, uint16x8_t skipMask) {
 	uint16x8_t destCol = vld1q_u16((uint16 *)destPtr);
 	uint16x8_t srcCols = vld1q_u16((const uint16 *)(srcP2 + xDir * xCtrBpp));
 	uint16x8_t mask1 = skipTrans ? vceqq_u16(srcCols, transColors) : vmovq_n_u16(0);
@@ -448,7 +447,6 @@ inline void drawPixelSIMD2Bpp(byte *destPtr, const byte *srcP2, uint16x8_t tint,
 	vst1q_u16((uint16 *)destPtr, final);
 }
 
-class DrawInnerImpl {
 public:
 
 // This template handles 2bpp and 4bpp, the other specializations handle 1bpp and format conversion blits
@@ -910,20 +908,20 @@ static void drawInner1Bpp(BITMAP::DrawInnerArgs &args) {
 	}
 }
 
-}; // end of class DrawInnerImpl
+}; // end of class DrawInnerImpl_NEON
 
 template<bool Scale>
 void BITMAP::drawNEON(DrawInnerArgs &args) {
 	if (args.sameFormat) {
 		switch (format.bytesPerPixel) {
-		case 1: DrawInnerImpl::drawInner1Bpp<Scale>(args); break;
-		case 2: DrawInnerImpl::drawInner2Bpp<Scale>(args); break;
-		case 4: DrawInnerImpl::drawInner4BppWithConv<4, 4, Scale>(args); break;
+		case 1: DrawInnerImpl_NEON::drawInner1Bpp<Scale>(args); break;
+		case 2: DrawInnerImpl_NEON::drawInner2Bpp<Scale>(args); break;
+		case 4: DrawInnerImpl_NEON::drawInner4BppWithConv<4, 4, Scale>(args); break;
 		}
 	} else if (format.bytesPerPixel == 4 && args.src.format.bytesPerPixel == 2) {
-		DrawInnerImpl::drawInner4BppWithConv<4, 2, Scale>(args);
+		DrawInnerImpl_NEON::drawInner4BppWithConv<4, 2, Scale>(args);
 	} else if (format.bytesPerPixel == 2 && args.src.format.bytesPerPixel == 4) {
-		DrawInnerImpl::drawInner4BppWithConv<2, 4, Scale>(args);
+		DrawInnerImpl_NEON::drawInner4BppWithConv<2, 4, Scale>(args);
 	}
 }
 
@@ -931,4 +929,3 @@ template void BITMAP::drawNEON<false>(DrawInnerArgs &);
 template void BITMAP::drawNEON<true>(DrawInnerArgs &);
 
 } // namespace AGS3
-#endif // SCUMMVM_NEON
diff --git a/engines/ags/lib/allegro/surface_sse2.cpp b/engines/ags/lib/allegro/surface_sse2.cpp
index 8ff8fd5b66f..c620c5fbdb3 100644
--- a/engines/ags/lib/allegro/surface_sse2.cpp
+++ b/engines/ags/lib/allegro/surface_sse2.cpp
@@ -30,7 +30,9 @@
 
 namespace AGS3 {
 
-inline __m128i simd2BppTo4Bpp(__m128i pixels) {
+class DrawInnerImpl_SSE2 {
+
+static inline __m128i simd2BppTo4Bpp(__m128i pixels) {
 	__m128i x = _mm_unpacklo_epi16(pixels, _mm_setzero_si128());
 
 	// c is the extracted 5/6 bit color from the image
@@ -48,7 +50,7 @@ inline __m128i simd2BppTo4Bpp(__m128i pixels) {
 	return _mm_or_si128(_mm_or_si128(_mm_or_si128(r, g), b), _mm_set1_epi32(0xff000000));
 }
 
-inline __m128i simd4BppTo2Bpp(__m128i pixels) {
+static inline __m128i simd4BppTo2Bpp(__m128i pixels) {
 	// x is the final 16 bit rgb pixel
 	__m128i x = _mm_srli_epi32(_mm_and_si128(pixels, _mm_set1_epi32(0x000000ff)), 3);
 	x = _mm_or_si128(x, _mm_slli_epi32(_mm_srli_epi32(_mm_and_si128(pixels, _mm_set1_epi32(0x0000ff00)), 8+2), 5));
@@ -58,7 +60,7 @@ inline __m128i simd4BppTo2Bpp(__m128i pixels) {
 	return _mm_packs_epi32(x, _mm_setzero_si128());
 }
 
-inline __m128i rgbBlendSIMD2Bpp(__m128i srcCols, __m128i destCols, __m128i alphas) {
+static inline __m128i rgbBlendSIMD2Bpp(__m128i srcCols, __m128i destCols, __m128i alphas) {
 	// Here we add 1 to alphas if its 0. This is what the original blender function did
 	alphas = _mm_add_epi16(alphas, _mm_and_si128(_mm_cmpgt_epi16(alphas, _mm_setzero_si128()), _mm_set1_epi16(1)));
 
@@ -98,7 +100,7 @@ inline __m128i rgbBlendSIMD2Bpp(__m128i srcCols, __m128i destCols, __m128i alpha
 	return _mm_or_si128(diffs[0], _mm_slli_epi16(diffs[2], 11));
 }
 
-inline __m128i mul32_as32(__m128i a, __m128i b) {
+static inline __m128i mul32_as32(__m128i a, __m128i b) {
 	__m128i tmp1 = _mm_mul_epu32(a,b);
 	__m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(a,4), _mm_srli_si128(b,4));
 	return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE (0,0,2,0)), _mm_shuffle_epi32(tmp2, _MM_SHUFFLE (0,0,2,0))); /* shuffle results to [63..0] and pack */
@@ -107,7 +109,7 @@ inline __m128i mul32_as32(__m128i a, __m128i b) {
 // preserveAlpha:
 //		false => set destCols's alpha to 0
 // 		true => keep destCols's alpha
-inline __m128i rgbBlendSIMD(__m128i srcCols, __m128i destCols, __m128i alphas, bool preserveAlpha) {
+static inline __m128i rgbBlendSIMD(__m128i srcCols, __m128i destCols, __m128i alphas, bool preserveAlpha) {
 	// Here we add 1 to alphas if its 0. This is what the original blender function did.
 	alphas = _mm_add_epi32(alphas, _mm_and_si128(_mm_cmpgt_epi32(alphas, _mm_setzero_si128()), _mm_set1_epi32(1)));
 
@@ -149,7 +151,7 @@ inline __m128i rgbBlendSIMD(__m128i srcCols, __m128i destCols, __m128i alphas, b
 	return srcCols;
 }
 
-inline __m128i argbBlendSIMD(__m128i srcCols, __m128i destCols) {
+static inline __m128i argbBlendSIMD(__m128i srcCols, __m128i destCols) {
 	__m128 srcA = _mm_cvtepi32_ps(_mm_srli_epi32(srcCols, 24));
 	srcA = _mm_mul_ps(srcA, _mm_set1_ps(1.0f / 255.0f));
 	__m128 srcR = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(srcCols, 16), _mm_set1_epi32(0xff)));
@@ -180,7 +182,7 @@ inline __m128i argbBlendSIMD(__m128i srcCols, __m128i destCols) {
 			_mm_cvtps_epi32(destB))));
 }
 
-inline __m128i blendTintSpriteSIMD(__m128i srcCols, __m128i destCols, __m128i alphas, bool light) {
+static inline __m128i blendTintSpriteSIMD(__m128i srcCols, __m128i destCols, __m128i alphas, bool light) {
 	// This function is NOT 1 to 1 with the original... It just approximates it
 	// It gets the value of the HSV of the dest color
 	// Then it gets the HSV of the srcCols
@@ -280,21 +282,21 @@ inline __m128i blendTintSpriteSIMD(__m128i srcCols, __m128i destCols, __m128i al
 	return final;
 }
 
-inline __m128i mul32_as16(__m128i a, __m128i b) {
+static inline __m128i mul32_as16(__m128i a, __m128i b) {
 	__m128i a16 = _mm_packs_epi32(a, _mm_setzero_si128());
 	__m128i b16 = _mm_packs_epi32(b, _mm_setzero_si128());
 	__m128i res = _mm_mullo_epi16(a16, b16);
 	return _mm_unpacklo_epi16(res, _mm_setzero_si128());
 }
 
-inline __m128i findmin32_as16(__m128i a, __m128i b) {
+static inline __m128i findmin32_as16(__m128i a, __m128i b) {
 	__m128i a16 = _mm_packs_epi32(a, _mm_setzero_si128());
 	__m128i b16 = _mm_packs_epi32(b, _mm_setzero_si128());
 	__m128i res = _mm_min_epi16(a16, b16);
 	return _mm_unpacklo_epi16(res, _mm_setzero_si128());
 }
 
-inline __m128i blendPixelSIMD(__m128i srcCols, __m128i destCols, __m128i alphas) {
+static inline __m128i blendPixelSIMD(__m128i srcCols, __m128i destCols, __m128i alphas) {
 	__m128i srcAlphas, difAlphas, mask, ch1, ch2;
 	auto setupArgbAlphas = [&]() {
 		// This acts the same as this in the normal blender functions
@@ -359,7 +361,7 @@ inline __m128i blendPixelSIMD(__m128i srcCols, __m128i destCols, __m128i alphas)
 	return _mm_setzero_si128();
 }
 
-inline __m128i blendPixelSIMD2Bpp(__m128i srcCols, __m128i destCols, __m128i alphas) {
+static inline __m128i blendPixelSIMD2Bpp(__m128i srcCols, __m128i destCols, __m128i alphas) {
 	__m128i mask, ch1, ch2;
 	switch (_G(_blender_mode)) {
 	case kSourceAlphaBlender:
@@ -396,7 +398,7 @@ inline __m128i blendPixelSIMD2Bpp(__m128i srcCols, __m128i destCols, __m128i alp
 }
 
 template<int DestBytesPerPixel, int SrcBytesPerPixel>
-inline void drawPixelSIMD(byte *destPtr, const byte *srcP2, __m128i tint, __m128i alphas, __m128i maskedAlphas, __m128i transColors, int xDir, int xCtrBpp, int srcAlpha, int skipTrans, bool horizFlip, bool useTint, __m128i skipMask) {
+static inline void drawPixelSIMD(byte *destPtr, const byte *srcP2, __m128i tint, __m128i alphas, __m128i maskedAlphas, __m128i transColors, int xDir, int xCtrBpp, int srcAlpha, int skipTrans, bool horizFlip, bool useTint, __m128i skipMask) {
 	__m128i srcCols, destCol;
 
 	if (DestBytesPerPixel == 4)
@@ -432,7 +434,7 @@ inline void drawPixelSIMD(byte *destPtr, const byte *srcP2, __m128i tint, __m128
 	}
 }
 
-inline void drawPixelSIMD2Bpp(byte *destPtr, const byte *srcP2, __m128i tint, __m128i alphas, __m128i transColors, int xDir, int xCtrBpp, int srcAlpha, int skipTrans, bool horizFlip, bool useTint, __m128i skipMask) {
+static inline void drawPixelSIMD2Bpp(byte *destPtr, const byte *srcP2, __m128i tint, __m128i alphas, __m128i transColors, int xDir, int xCtrBpp, int srcAlpha, int skipTrans, bool horizFlip, bool useTint, __m128i skipMask) {
 	__m128i destCol = _mm_loadu_si128((const __m128i *)destPtr);
 	__m128i srcCols = _mm_loadu_si128((const __m128i *)(srcP2 + xDir * xCtrBpp));
 	__m128i mask1 = skipTrans ? _mm_cmpeq_epi16(srcCols, transColors) : _mm_setzero_si128();
@@ -456,20 +458,19 @@ inline void drawPixelSIMD2Bpp(byte *destPtr, const byte *srcP2, __m128i tint, __
 	_mm_storeu_si128((__m128i *)destPtr, final);
 }
 
-inline uint32 extract32_idx0(__m128i x) {
+static inline uint32 extract32_idx0(__m128i x) {
 	return _mm_cvtsi128_si32(x);
 }
-inline uint32 extract32_idx1(__m128i x) {
+static inline uint32 extract32_idx1(__m128i x) {
 	return _mm_cvtsi128_si32(_mm_shuffle_epi32(x, _MM_SHUFFLE(1, 1, 1, 1)));
 }
-inline uint32 extract32_idx2(__m128i x) {
+static inline uint32 extract32_idx2(__m128i x) {
 	return _mm_cvtsi128_si32(_mm_shuffle_epi32(x, _MM_SHUFFLE(2, 2, 2, 2)));
 }
-inline uint32 extract32_idx3(__m128i x) {
+static inline uint32 extract32_idx3(__m128i x) {
 	return _mm_cvtsi128_si32(_mm_shuffle_epi32(x, _MM_SHUFFLE(3, 3, 3, 3)));
 }
 
-class DrawInnerImpl {
 public:
 
 // This template handles 2bpp and 4bpp, the other specializations handle 1bpp and format conversion blits
@@ -938,20 +939,20 @@ static void drawInner1Bpp(BITMAP::DrawInnerArgs &args) {
 	}
 }
 
-}; // end of class DrawInnerImpl
+}; // end of class DrawInnerImpl_SSE2
 
 template<bool Scale>
 void BITMAP::drawSSE2(DrawInnerArgs &args) {
 	if (args.sameFormat) {
 		switch (format.bytesPerPixel) {
-		case 1: DrawInnerImpl::drawInner1Bpp<Scale>(args); break;
-		case 2: DrawInnerImpl::drawInner2Bpp<Scale>(args); break;
-		case 4: DrawInnerImpl::drawInner4BppWithConv<4, 4, Scale>(args); break;
+		case 1: DrawInnerImpl_SSE2::drawInner1Bpp<Scale>(args); break;
+		case 2: DrawInnerImpl_SSE2::drawInner2Bpp<Scale>(args); break;
+		case 4: DrawInnerImpl_SSE2::drawInner4BppWithConv<4, 4, Scale>(args); break;
 		}
 	} else if (format.bytesPerPixel == 4 && args.src.format.bytesPerPixel == 2) {
-		DrawInnerImpl::drawInner4BppWithConv<4, 2, Scale>(args);
+		DrawInnerImpl_SSE2::drawInner4BppWithConv<4, 2, Scale>(args);
 	} else if (format.bytesPerPixel == 2 && args.src.format.bytesPerPixel == 4) {
-		DrawInnerImpl::drawInner4BppWithConv<2, 4, Scale>(args);
+		DrawInnerImpl_SSE2::drawInner4BppWithConv<2, 4, Scale>(args);
 	}
 }
 


Commit: 18a34cb7af8dce39f5e86a20b07d7490cb25eccb
    https://github.com/scummvm/scummvm/commit/18a34cb7af8dce39f5e86a20b07d7490cb25eccb
Author: Kaloyan Chehlarski (strahy at outlook.com)
Date: 2023-10-14T14:27:54+03:00

Commit Message:
AGS: Fix AVX2 scaled blitting

Scaled blitting in the AVX2 path would immediately crash
ScummVM because the inner for-loop advanced destX and xCtr
by 4 instead of 8 per iteration; this has now been fixed.

Changed paths:
    engines/ags/lib/allegro/surface_avx2.cpp


diff --git a/engines/ags/lib/allegro/surface_avx2.cpp b/engines/ags/lib/allegro/surface_avx2.cpp
index cfd2a36e234..dcb4a2e4ea5 100644
--- a/engines/ags/lib/allegro/surface_avx2.cpp
+++ b/engines/ags/lib/allegro/surface_avx2.cpp
@@ -539,7 +539,7 @@ static void drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
 			// scaling size, we create a small dummy buffer that we copy the pixels into and then
 			// call the drawPixelsSIMD function
 			byte srcBuffer[4*8] = {0};
-			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart, scaleXCtr = xCtrStart * args.scaleX; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
+			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart, scaleXCtr = xCtrStart * args.scaleX; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += SrcBytesPerPixel*4) {
 				if (yCtr + 1 == yCtrHeight && xCtr + 4 > xCtrWidth) break; // Don't go past the last 4 pixels
 				__m256i indexes = _mm256_set1_epi32(scaleXCtr);
 				// Calculate in parallel the indexes of the pixels


Commit: 5b29081355863c56784fd542908dfaf9e3e1c471
    https://github.com/scummvm/scummvm/commit/5b29081355863c56784fd542908dfaf9e3e1c471
Author: Kaloyan Chehlarski (strahy at outlook.com)
Date: 2023-10-14T14:27:55+03:00

Commit Message:
AGS: Apply clipping fix to 2bpp optimized blitting

The issue that produced crashes when blitting beyond the
bottom-right edge of the screen was previously fixed,
but only for cases involving a 4bpp surface. The fix has
now been extended to the 2bpp functions as well.

Changed paths:
    engines/ags/lib/allegro/surface_avx2.cpp
    engines/ags/lib/allegro/surface_neon.cpp
    engines/ags/lib/allegro/surface_sse2.cpp


diff --git a/engines/ags/lib/allegro/surface_avx2.cpp b/engines/ags/lib/allegro/surface_avx2.cpp
index dcb4a2e4ea5..d8ac206a341 100644
--- a/engines/ags/lib/allegro/surface_avx2.cpp
+++ b/engines/ags/lib/allegro/surface_avx2.cpp
@@ -663,7 +663,7 @@ static void drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
 		xCtrBppStart = xCtrStart * 2;
 		args.xStart = 0;
 	}
-	int destY = args.yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 16 == 0) ? args.dstRect.height() : (args.dstRect.height() - 1);
+	int destY = args.yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = args.dstRect.height();
 	if (Scale) yCtrHeight = args.dstRect.height();
 	if (args.yStart < 0) {
 		yCtr = -args.yStart;
@@ -676,6 +676,9 @@ static void drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
 	if (args.yStart + yCtrHeight > args.destArea.h) {
 		yCtrHeight = args.destArea.h - args.yStart;
 	}
+	if (xCtrWidth % 16 != 0) {
+		--yCtrHeight;
+	}
 
 	byte *destP = (byte *)args.destArea.getBasePtr(0, destY);
 	const byte *srcP = (const byte *)args.src.getBasePtr(
diff --git a/engines/ags/lib/allegro/surface_neon.cpp b/engines/ags/lib/allegro/surface_neon.cpp
index 0aa9f0ca839..ecb1a479973 100644
--- a/engines/ags/lib/allegro/surface_neon.cpp
+++ b/engines/ags/lib/allegro/surface_neon.cpp
@@ -649,7 +649,7 @@ static void drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
 		xCtrBppStart = xCtrStart * 2;
 		args.xStart = 0;
 	}
-	int destY = args.yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 8 == 0) ? args.dstRect.height() : (args.dstRect.height() - 1);
+	int destY = args.yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = args.dstRect.height();
 	if (Scale) yCtrHeight = args.dstRect.height();
 	if (args.yStart < 0) {
 		yCtr = -args.yStart;
@@ -662,6 +662,9 @@ static void drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
 	if (args.yStart + yCtrHeight > args.destArea.h) {
 		yCtrHeight = args.destArea.h - args.yStart;
 	}
+	if (xCtrWidth % 8 != 0) {
+		--yCtrHeight;
+	}
 
 	byte *destP = (byte *)args.destArea.getBasePtr(0, destY);
 	const byte *srcP = (const byte *)args.src.getBasePtr(
diff --git a/engines/ags/lib/allegro/surface_sse2.cpp b/engines/ags/lib/allegro/surface_sse2.cpp
index c620c5fbdb3..876d30ad8cf 100644
--- a/engines/ags/lib/allegro/surface_sse2.cpp
+++ b/engines/ags/lib/allegro/surface_sse2.cpp
@@ -675,7 +675,7 @@ static void drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
 		xCtrBppStart = xCtrStart * 2;
 		args.xStart = 0;
 	}
-	int destY = args.yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 8 == 0) ? args.dstRect.height() : (args.dstRect.height() - 1);
+	int destY = args.yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = args.dstRect.height();
 	if (Scale) yCtrHeight = args.dstRect.height();
 	if (args.yStart < 0) {
 		yCtr = -args.yStart;
@@ -688,6 +688,9 @@ static void drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
 	if (args.yStart + yCtrHeight > args.destArea.h) {
 		yCtrHeight = args.destArea.h - args.yStart;
 	}
+	if (xCtrWidth % 8 != 0) {
+		--yCtrHeight;
+	}
 
 	byte *destP = (byte *)args.destArea.getBasePtr(0, destY);
 	const byte *srcP = (const byte *)args.src.getBasePtr(