[Scummvm-git-logs] scummvm master -> 283bfd228d852730fbd2ad5c0394bc3855cbf333

Sat Dec 2 15:43:12 UTC 2023

This automated email contains information about 1 new commit which has been
pushed to the 'scummvm' repo located at https://github.com/scummvm/scummvm.

Summary:
283bfd228d AGS: Fix ASAN crashes in SIMD blit paths


Commit: 283bfd228d852730fbd2ad5c0394bc3855cbf333
    https://github.com/scummvm/scummvm/commit/283bfd228d852730fbd2ad5c0394bc3855cbf333
Author: Kaloyan Chehlarski (strahy at outlook.com)
Date: 2023-12-02T16:43:09+01:00

Commit Message:
AGS: Fix ASAN crashes in SIMD blit paths

All optimized blitting paths exhibit similar overreading
behavior, due to the flawed design of (almost) always
loading a full 128/256-bit line, and discarding overread
pixels afterwards. This commit patches up the code so the
overreads never happen, albeit at the cost of a few
memcpys at the end of every line of pixels.

Changed paths:
    engines/ags/lib/allegro/surface_avx2.cpp
    engines/ags/lib/allegro/surface_neon.cpp
    engines/ags/lib/allegro/surface_sse2.cpp

diff --git a/engines/ags/lib/allegro/surface_avx2.cpp b/engines/ags/lib/allegro/surface_avx2.cpp
index d5cbc8716b4..0fdd7992809 100644
--- a/engines/ags/lib/allegro/surface_avx2.cpp
+++ b/engines/ags/lib/allegro/surface_avx2.cpp
@@ -400,11 +400,11 @@ static inline void drawPixelSIMD(byte *destPtr, const byte *srcP2, __m256i tint,
 	if (DestBytesPerPixel == 4)
 		destCol = _mm256_loadu_si256((const __m256i *)destPtr);
 	else
-		destCol = simd2BppTo4Bpp(_mm256_loadu_si256((const __m256i *)destPtr));
+		destCol = simd2BppTo4Bpp(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)destPtr)));
 	if (SrcBytesPerPixel == 4)
 		srcCols = _mm256_loadu_si256((const __m256i *)(srcP2 + xDir * xCtrBpp));
 	else
-		srcCols = simd2BppTo4Bpp(_mm256_loadu_si256((const __m256i *)(srcP2 + xDir * xCtrBpp)));
+		srcCols = simd2BppTo4Bpp(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(srcP2 + xDir * xCtrBpp))));
 
 	// we do this here because we need to check if we should skip the pixel before we blend it
 	__m256i mask1 = skipTrans ? _mm256_cmpeq_epi32(_mm256_and_si256(srcCols, maskedAlphas), transColors) : _mm256_setzero_si256();
@@ -503,9 +503,11 @@ static void drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
 	if (args.yStart + yCtrHeight > args.destArea.h) {
 		yCtrHeight = args.destArea.h - args.yStart;
 	}
-	if (xCtrWidth % 8 != 0) {
+	/*if (xCtrWidth % 8 != 0) {
 		--yCtrHeight;
-	}
+	}*/
+
+	const int secondToLast = xCtrWidth - 8;
 
 	byte *destP = (byte *)args.destArea.getBasePtr(0, destY);
 	const byte *srcP = (const byte *)args.src.getBasePtr(
@@ -516,12 +518,23 @@ static void drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
 
 		if (!Scale) {
 			// If we are not scaling the image
-			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += SrcBytesPerPixel*8) {
+			int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart;
+			for (; xCtr < secondToLast; destX += 8, xCtr += 8, xCtrBpp += SrcBytesPerPixel*8) {
 				byte *destPtr = &destP[destX * DestBytesPerPixel];
-				// Skip pixels that are beyond the row
-				__m256i skipMask = _mm256_cmpgt_epi32(_mm256_add_epi32(_mm256_add_epi32(_mm256_set1_epi32(xCtr), addIndexes), _mm256_set1_epi32(1)), xCtrWidthSIMD);
-				drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, skipMask);
+				drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, _mm256_set1_epi32(0));
 			}
+
+			byte *destPtr = &destP[destX * DestBytesPerPixel];
+			__m256i srcCols = _mm256_setzero_si256();
+			__m256i destCols = _mm256_setzero_si256();
+			memcpy(&srcCols, srcP + xDir * xCtrBpp, (xCtrWidth - xCtr) * SrcBytesPerPixel);
+			memcpy(&destCols, destPtr, (xCtrWidth - xCtr) * DestBytesPerPixel);
+
+			// Skip pixels that are beyond the row
+			// __m256i skipMask = _mm256_cmpgt_epi32(_mm256_add_epi32(_mm256_add_epi32(_mm256_set1_epi32(xCtr), addIndexes), _mm256_set1_epi32(1)), xCtrWidthSIMD);
+			drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>((byte *)&destCols, (byte *)&srcCols, tint, alphas, maskedAlphas, transColors, xDir, 0, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, _mm256_set1_epi32(0));
+			memcpy(destPtr, &destCols, (xCtrWidth - xCtr) * DestBytesPerPixel);
+
 			// Goto next row in source and destination image
 			destP += args.destArea.pitch;
 			srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
@@ -580,7 +593,7 @@ static void drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
 	// Get the last x values of the last row
 	int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart;
 	// We have a picture that is a multiple of 8, so no extra pixels to draw
-	if (xCtrWidth % 8 == 0) return;
+	/*if (xCtrWidth % 8 == 0)*/ return;
 	// Drawing the last few not scaled pixels here.
 	// Same as the loop above but now we check if we are going to overflow,
 	// and thus we don't need to mask out pixels that go over the row.
@@ -677,9 +690,11 @@ static void drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
 	if (args.yStart + yCtrHeight > args.destArea.h) {
 		yCtrHeight = args.destArea.h - args.yStart;
 	}
-	if (!Scale && xCtrWidth % 16 != 0) {
+	/*if (!Scale && xCtrWidth % 16 != 0) {
 		--yCtrHeight;
-	}
+	}*/
+
+	const int secondToLast = xCtrWidth - 16;
 
 	byte *destP = (byte *)args.destArea.getBasePtr(0, destY);
 	const byte *srcP = (const byte *)args.src.getBasePtr(
@@ -689,12 +704,24 @@ static void drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
 		__m256i xCtrWidthSIMD = _mm256_set1_epi16(xCtrWidth); // This is the width of the row
 		if (!Scale) {
 			// If we are not scaling the image
-			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart; xCtr < xCtrWidth; destX += 16, xCtr += 16, xCtrBpp += 32) {
+			int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart;
+			for (; xCtr < secondToLast; destX += 16, xCtr += 16, xCtrBpp += 32) {
 				byte *destPtr = &destP[destX * 2];
-				// Skip pixels that are beyond the row
-				__m256i skipMask = _mm256_cmpgt_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_set1_epi16(xCtr), addIndexes), _mm256_set1_epi16(1)), xCtrWidthSIMD);
-				drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, skipMask);
+				drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, _mm256_set1_epi32(0));
 			}
+
+			byte *destPtr = &destP[destX * 2];
+			__m256i srcCols = _mm256_setzero_si256();
+			__m256i destCols = _mm256_setzero_si256();
+			const int copySize = (xCtrWidth - xCtr) * 2;
+			memcpy(&srcCols, srcP + xDir * xCtrBpp, copySize);
+			memcpy(&destCols, destPtr, copySize);
+
+			// Skip pixels that are beyond the row
+			// __m256i skipMask = _mm256_cmpgt_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_set1_epi16(xCtr), addIndexes), _mm256_set1_epi16(1)), xCtrWidthSIMD);
+			drawPixelSIMD2Bpp((byte *)&destCols, (byte *)&srcCols, tint, alphas, transColors, xDir, 0, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, _mm256_set1_epi32(0));
+			memcpy(destPtr, &destCols, copySize);
+
 			// Goto next row in source and destination image
 			destP += args.destArea.pitch;
 			srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
@@ -757,7 +784,7 @@ static void drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
 	}
 
 	// We have a picture that is a multiple of 16, so no extra pixels to draw
-	if (xCtrWidth % 16 == 0) return;
+	/*if (xCtrWidth % 16 == 0)*/ return;
 	// Get the last x values of the last row
 	int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart;
 	// Drawing the last few not scaled pixels here.
diff --git a/engines/ags/lib/allegro/surface_neon.cpp b/engines/ags/lib/allegro/surface_neon.cpp
index 187374968b1..781ea450e5b 100644
--- a/engines/ags/lib/allegro/surface_neon.cpp
+++ b/engines/ags/lib/allegro/surface_neon.cpp
@@ -499,9 +499,11 @@ static void drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
 	if (args.yStart + yCtrHeight > args.destArea.h) {
 		yCtrHeight = args.destArea.h - args.yStart;
 	}
-	if (!Scale && xCtrWidth % 4 != 0) {
+	/*if (!Scale && xCtrWidth % 4 != 0) {
 		--yCtrHeight;
-	}
+	}*/
+
+	const int secondToLast = xCtrWidth - 4;
 
 	byte *destP = (byte *)args.destArea.getBasePtr(0, destY);
 	const byte *srcP = (const byte *)args.src.getBasePtr(
@@ -511,12 +513,23 @@ static void drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
 		uint32x4_t xCtrWidthSIMD = vdupq_n_u32(xCtrWidth); // This is the width of the row
 
 		if (!Scale) {
-			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
+			int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart;
+			for (; xCtr < secondToLast; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
 				byte *destPtr = &destP[destX * DestBytesPerPixel];
-				// Skip pixels that are beyond the row
-				uint32x4_t skipMask = vcgeq_u32(vaddq_u32(vdupq_n_u32(xCtr), addIndexes), xCtrWidthSIMD);
-				drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, skipMask);
+				drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, vmovq_n_u32(0));
 			}
+
+			byte *destPtr = &destP[destX * DestBytesPerPixel];
+			uint32x4_t srcCols = vmovq_n_u32(0);
+			uint32x4_t destCols = vmovq_n_u32(0);
+			memcpy(&srcCols, srcP + xDir * xCtrBpp, (xCtrWidth - xCtr) * SrcBytesPerPixel);
+			memcpy(&destCols, destPtr, (xCtrWidth - xCtr) * DestBytesPerPixel);
+
+			// Skip pixels that are beyond the row
+			// uint32x4_t skipMask = vcgeq_u32(vaddq_u32(vdupq_n_u32(xCtr), addIndexes), xCtrWidthSIMD);
+			drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>((byte *)&destCols, (byte *)&srcCols, tint, alphas, maskedAlphas, transColors, xDir, 0, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, vmovq_n_u32(0));
+			memcpy(destPtr, &destCols, (xCtrWidth - xCtr) * DestBytesPerPixel);
+
 			// Goto next row in source and destination image
 			destP += args.destArea.pitch;
 			srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
@@ -568,7 +581,7 @@ static void drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
 	// Get the last x values of the last row
 	int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart;
 	// We have a picture that is a multiple of 4, so no extra pixels to draw
-	if (xCtrWidth % 4 == 0) return;
+	/*if (xCtrWidth % 4 == 0)*/ return;
 	// Drawing the last few not scaled pixels here.
 	// Same as the loop above but now we check if we are going to overflow,
 	// and thus we don't need to mask out pixels that go over the row.
@@ -666,9 +679,11 @@ static void drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
 	if (args.yStart + yCtrHeight > args.destArea.h) {
 		yCtrHeight = args.destArea.h - args.yStart;
 	}
-	if (!Scale && xCtrWidth % 8 != 0) {
+	/*if (!Scale && xCtrWidth % 8 != 0) {
 		--yCtrHeight;
-	}
+	}*/
+
+	const int secondToLast = xCtrWidth - 8;
 
 	byte *destP = (byte *)args.destArea.getBasePtr(0, destY);
 	const byte *srcP = (const byte *)args.src.getBasePtr(
@@ -678,12 +693,24 @@ static void drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
 		uint16x8_t xCtrWidthSIMD = vmovq_n_u16(xCtrWidth); // This is the width of the row
 		if (!Scale) {
 			// If we are not scaling the image
-			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
+			int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart;
+			for (; xCtr < secondToLast; destX += 8, xCtr += 8, xCtrBpp += 16) {
 				byte *destPtr = &destP[destX * 2];
-				// Skip pixels that are beyond the row
-				uint16x8_t skipMask = vcgeq_u16(vaddq_u16(vdupq_n_u16(xCtr), addIndexes), xCtrWidthSIMD);
-				drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, skipMask);
+				drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, vmovq_n_u16(0));
 			}
+
+			byte *destPtr = &destP[destX * 2];
+			uint16x8_t srcCols = vmovq_n_u16(0);
+			uint16x8_t destCols = vmovq_n_u16(0);
+			const int copySize = (xCtrWidth - xCtr) * 2;
+			memcpy(&srcCols, srcP + xDir * xCtrBpp, copySize);
+			memcpy(&destCols, destPtr, copySize);
+
+			// Skip pixels that are beyond the row
+			// uint16x8_t skipMask = vcgeq_u16(vaddq_u16(vdupq_n_u16(xCtr), addIndexes), xCtrWidthSIMD);
+			drawPixelSIMD2Bpp((byte *)&destCols, (byte *)&srcCols, tint, alphas, transColors, xDir, 0, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, vmovq_n_u16(0));
+			memcpy(destPtr, &destCols, copySize);
+
 			// Goto next row in source and destination image
 			destP += args.destArea.pitch;
 			srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
@@ -738,7 +765,7 @@ static void drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
 	}
 
 	// We have a picture that is a multiple of 8, so no extra pixels to draw
-	if (xCtrWidth % 8 == 0) return;
+	/*if (xCtrWidth % 8 == 0)*/ return;
 	// Get the last x values of the last row
 	int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart;
 	// Drawing the last few not scaled pixels here.
diff --git a/engines/ags/lib/allegro/surface_sse2.cpp b/engines/ags/lib/allegro/surface_sse2.cpp
index 7f15705880e..6e6c9263284 100644
--- a/engines/ags/lib/allegro/surface_sse2.cpp
+++ b/engines/ags/lib/allegro/surface_sse2.cpp
@@ -518,9 +518,11 @@ static void drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
 	if (args.yStart + yCtrHeight > args.destArea.h) {
 		yCtrHeight = args.destArea.h - args.yStart;
 	}
-	if (!Scale && xCtrWidth % 4 != 0) {
+	/*if (!Scale && xCtrWidth % 4 != 0) {
 		--yCtrHeight;
-	}
+	}*/
+
+	const int secondToLast = xCtrWidth - 4;
 
 	byte *destP = (byte *)args.destArea.getBasePtr(0, destY);
 	const byte *srcP = (const byte *)args.src.getBasePtr(
@@ -531,12 +533,23 @@ static void drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
 
 		if (!Scale) {
 			// If we are not scaling the image
-			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
+			int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart;
+			for (; xCtr < secondToLast; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
 				byte *destPtr = &destP[destX * DestBytesPerPixel];
-				// Skip pixels that are beyond the row
-				__m128i skipMask = _mm_cmpgt_epi32(_mm_add_epi32(_mm_add_epi32(_mm_set1_epi32(xCtr), addIndexes), _mm_set1_epi32(1)), xCtrWidthSIMD);
-				drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, skipMask);
+				drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, _mm_set1_epi32(0));
 			}
+
+			byte *destPtr = &destP[destX * DestBytesPerPixel];
+			__m128i srcCols = _mm_setzero_si128();
+			__m128i destCols = _mm_setzero_si128();
+			memcpy(&srcCols, srcP + xDir * xCtrBpp, (xCtrWidth - xCtr) * SrcBytesPerPixel);
+			memcpy(&destCols, destPtr, (xCtrWidth - xCtr) * DestBytesPerPixel);
+
+			// Skip pixels that are beyond the row
+			// __m128i skipMask = _mm_cmpgt_epi32(_mm_add_epi32(_mm_add_epi32(_mm_set1_epi32(xCtr), addIndexes), _mm_set1_epi32(1)), xCtrWidthSIMD);
+			drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>((byte *)&destCols, (byte *)&srcCols, tint, alphas, maskedAlphas, transColors, xDir, 0, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, _mm_set1_epi32(0));
+			memcpy(destPtr, &destCols, (xCtrWidth - xCtr) * DestBytesPerPixel);
+
 			// Goto next row in source and destination image
 			destP += args.destArea.pitch;
 			srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
@@ -591,7 +604,7 @@ static void drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
 	// Get the last x values of the last row
 	int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart;
 	// We have a picture that is a multiple of 4, so no extra pixels to draw
-	if (xCtrWidth % 4 == 0) return;
+	/*if (xCtrWidth % 4 == 0)*/ return;
 	// Drawing the last few not scaled pixels here.
 	// Same as the loop above but now we check if we are going to overflow,
 	// and thus we don't need to mask out pixels that go over the row.
@@ -688,9 +701,11 @@ static void drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
 	if (args.yStart + yCtrHeight > args.destArea.h) {
 		yCtrHeight = args.destArea.h - args.yStart;
 	}
-	if (!Scale && xCtrWidth % 8 != 0) {
+	/*if (!Scale && xCtrWidth % 8 != 0) {
 		--yCtrHeight;
-	}
+	}*/
+
+	const int secondToLast = xCtrWidth - 8;
 
 	byte *destP = (byte *)args.destArea.getBasePtr(0, destY);
 	const byte *srcP = (const byte *)args.src.getBasePtr(
@@ -700,12 +715,24 @@ static void drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
 		__m128i xCtrWidthSIMD = _mm_set1_epi16(xCtrWidth); // This is the width of the row
 		if (!Scale) {
 			// If we are not scaling the image
-			for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
+			int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart;
+			for (; xCtr < secondToLast; destX += 8, xCtr += 8, xCtrBpp += 16) {
 				byte *destPtr = &destP[destX * 2];
-				// Skip pixels that are beyond the row
-				__m128i skipMask = _mm_cmpgt_epi16(_mm_add_epi16(_mm_add_epi16(_mm_set1_epi16(xCtr), addIndexes), _mm_set1_epi16(1)), xCtrWidthSIMD);
-				drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, skipMask);
+				drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, _mm_set1_epi16(0));
 			}
+
+			byte *destPtr = &destP[destX * 2];
+			__m128i srcCols = _mm_setzero_si128();
+			__m128i destCols = _mm_setzero_si128();
+			const int copySize = (xCtrWidth - xCtr) * 2;
+			memcpy(&srcCols, srcP + xDir * xCtrBpp, copySize);
+			memcpy(&destCols, destPtr, copySize);
+
+			// Skip pixels that are beyond the row
+			// __m128i skipMask = _mm_cmpgt_epi16(_mm_add_epi16(_mm_add_epi16(_mm_set1_epi16(xCtr), addIndexes), _mm_set1_epi16(1)), xCtrWidthSIMD);
+			drawPixelSIMD2Bpp((byte *)&destCols, (byte *)&srcCols, tint, alphas, transColors, xDir, 0, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, _mm_set1_epi16(0));
+			memcpy(destPtr, &destCols, copySize);
+
 			// Goto next row in source and destination image
 			destP += args.destArea.pitch;
 			srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
@@ -760,7 +787,7 @@ static void drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
 	}
 
 	// We have a picture that is a multiple of 8, so no extra pixels to draw
-	if (xCtrWidth % 8 == 0) return;
+	/*if (xCtrWidth % 8 == 0)*/ return;
 	// Get the last x values of the last row
 	int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart;
 	// Drawing the last few not scaled pixels here.