[Scummvm-git-logs] scummvm master -> 283bfd228d852730fbd2ad5c0394bc3855cbf333
sev-
noreply at scummvm.org
Sat Dec 2 15:43:12 UTC 2023
This automated email contains information about 1 new commit which has been
pushed to the 'scummvm' repo located at https://github.com/scummvm/scummvm .
Summary:
283bfd228d AGS: Fix ASAN crashes in SIMD blit paths
Commit: 283bfd228d852730fbd2ad5c0394bc3855cbf333
https://github.com/scummvm/scummvm/commit/283bfd228d852730fbd2ad5c0394bc3855cbf333
Author: Kaloyan Chehlarski (strahy at outlook.com)
Date: 2023-12-02T16:43:09+01:00
Commit Message:
AGS: Fix ASAN crashes in SIMD blit paths
All optimized blitting paths exhibit similar overreading
behavior, due to the flawed design of (almost) always
loading a full 128/256-bit line, and discarding overread
pixels afterwards. This commit patches up the code so the
overreads never happen, albeit at the cost of a few
memcpys at the end of every line of pixels.
Changed paths:
engines/ags/lib/allegro/surface_avx2.cpp
engines/ags/lib/allegro/surface_neon.cpp
engines/ags/lib/allegro/surface_sse2.cpp
diff --git a/engines/ags/lib/allegro/surface_avx2.cpp b/engines/ags/lib/allegro/surface_avx2.cpp
index d5cbc8716b4..0fdd7992809 100644
--- a/engines/ags/lib/allegro/surface_avx2.cpp
+++ b/engines/ags/lib/allegro/surface_avx2.cpp
@@ -400,11 +400,11 @@ static inline void drawPixelSIMD(byte *destPtr, const byte *srcP2, __m256i tint,
if (DestBytesPerPixel == 4)
destCol = _mm256_loadu_si256((const __m256i *)destPtr);
else
- destCol = simd2BppTo4Bpp(_mm256_loadu_si256((const __m256i *)destPtr));
+ destCol = simd2BppTo4Bpp(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)destPtr)));
if (SrcBytesPerPixel == 4)
srcCols = _mm256_loadu_si256((const __m256i *)(srcP2 + xDir * xCtrBpp));
else
- srcCols = simd2BppTo4Bpp(_mm256_loadu_si256((const __m256i *)(srcP2 + xDir * xCtrBpp)));
+ srcCols = simd2BppTo4Bpp(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(srcP2 + xDir * xCtrBpp))));
// we do this here because we need to check if we should skip the pixel before we blend it
__m256i mask1 = skipTrans ? _mm256_cmpeq_epi32(_mm256_and_si256(srcCols, maskedAlphas), transColors) : _mm256_setzero_si256();
@@ -503,9 +503,11 @@ static void drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
if (args.yStart + yCtrHeight > args.destArea.h) {
yCtrHeight = args.destArea.h - args.yStart;
}
- if (xCtrWidth % 8 != 0) {
+ /*if (xCtrWidth % 8 != 0) {
--yCtrHeight;
- }
+ }*/
+
+ const int secondToLast = xCtrWidth - 8;
byte *destP = (byte *)args.destArea.getBasePtr(0, destY);
const byte *srcP = (const byte *)args.src.getBasePtr(
@@ -516,12 +518,23 @@ static void drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
if (!Scale) {
// If we are not scaling the image
- for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += SrcBytesPerPixel*8) {
+ int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart;
+ for (; xCtr < secondToLast; destX += 8, xCtr += 8, xCtrBpp += SrcBytesPerPixel*8) {
byte *destPtr = &destP[destX * DestBytesPerPixel];
- // Skip pixels that are beyond the row
- __m256i skipMask = _mm256_cmpgt_epi32(_mm256_add_epi32(_mm256_add_epi32(_mm256_set1_epi32(xCtr), addIndexes), _mm256_set1_epi32(1)), xCtrWidthSIMD);
- drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, skipMask);
+ drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, _mm256_set1_epi32(0));
}
+
+ byte *destPtr = &destP[destX * DestBytesPerPixel];
+ __m256i srcCols = _mm256_setzero_si256();
+ __m256i destCols = _mm256_setzero_si256();
+ memcpy(&srcCols, srcP + xDir * xCtrBpp, (xCtrWidth - xCtr) * SrcBytesPerPixel);
+ memcpy(&destCols, destPtr, (xCtrWidth - xCtr) * DestBytesPerPixel);
+
+ // Skip pixels that are beyond the row
+ // __m256i skipMask = _mm256_cmpgt_epi32(_mm256_add_epi32(_mm256_add_epi32(_mm256_set1_epi32(xCtr), addIndexes), _mm256_set1_epi32(1)), xCtrWidthSIMD);
+ drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>((byte *)&destCols, (byte *)&srcCols, tint, alphas, maskedAlphas, transColors, xDir, 0, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, _mm256_set1_epi32(0));
+ memcpy(destPtr, &destCols, (xCtrWidth - xCtr) * DestBytesPerPixel);
+
// Goto next row in source and destination image
destP += args.destArea.pitch;
srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
@@ -580,7 +593,7 @@ static void drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
// Get the last x values of the last row
int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart;
// We have a picture that is a multiple of 8, so no extra pixels to draw
- if (xCtrWidth % 8 == 0) return;
+ /*if (xCtrWidth % 8 == 0)*/ return;
// Drawing the last few not scaled pixels here.
// Same as the loop above but now we check if we are going to overflow,
// and thus we don't need to mask out pixels that go over the row.
@@ -677,9 +690,11 @@ static void drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
if (args.yStart + yCtrHeight > args.destArea.h) {
yCtrHeight = args.destArea.h - args.yStart;
}
- if (!Scale && xCtrWidth % 16 != 0) {
+ /*if (!Scale && xCtrWidth % 16 != 0) {
--yCtrHeight;
- }
+ }*/
+
+ const int secondToLast = xCtrWidth - 16;
byte *destP = (byte *)args.destArea.getBasePtr(0, destY);
const byte *srcP = (const byte *)args.src.getBasePtr(
@@ -689,12 +704,24 @@ static void drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
__m256i xCtrWidthSIMD = _mm256_set1_epi16(xCtrWidth); // This is the width of the row
if (!Scale) {
// If we are not scaling the image
- for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart; xCtr < xCtrWidth; destX += 16, xCtr += 16, xCtrBpp += 32) {
+ int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart;
+ for (; xCtr < secondToLast; destX += 16, xCtr += 16, xCtrBpp += 32) {
byte *destPtr = &destP[destX * 2];
- // Skip pixels that are beyond the row
- __m256i skipMask = _mm256_cmpgt_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_set1_epi16(xCtr), addIndexes), _mm256_set1_epi16(1)), xCtrWidthSIMD);
- drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, skipMask);
+ drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, _mm256_set1_epi32(0));
}
+
+ byte *destPtr = &destP[destX * 2];
+ __m256i srcCols = _mm256_setzero_si256();
+ __m256i destCols = _mm256_setzero_si256();
+ const int copySize = (xCtrWidth - xCtr) * 2;
+ memcpy(&srcCols, srcP + xDir * xCtrBpp, copySize);
+ memcpy(&destCols, destPtr, copySize);
+
+ // Skip pixels that are beyond the row
+ // __m256i skipMask = _mm256_cmpgt_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_set1_epi16(xCtr), addIndexes), _mm256_set1_epi16(1)), xCtrWidthSIMD);
+ drawPixelSIMD2Bpp((byte *)&destCols, (byte *)&srcCols, tint, alphas, transColors, xDir, 0, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, _mm256_set1_epi32(0));
+ memcpy(destPtr, &destCols, copySize);
+
// Goto next row in source and destination image
destP += args.destArea.pitch;
srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
@@ -757,7 +784,7 @@ static void drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
}
// We have a picture that is a multiple of 16, so no extra pixels to draw
- if (xCtrWidth % 16 == 0) return;
+ /*if (xCtrWidth % 16 == 0)*/ return;
// Get the last x values of the last row
int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart;
// Drawing the last few not scaled pixels here.
diff --git a/engines/ags/lib/allegro/surface_neon.cpp b/engines/ags/lib/allegro/surface_neon.cpp
index 187374968b1..781ea450e5b 100644
--- a/engines/ags/lib/allegro/surface_neon.cpp
+++ b/engines/ags/lib/allegro/surface_neon.cpp
@@ -499,9 +499,11 @@ static void drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
if (args.yStart + yCtrHeight > args.destArea.h) {
yCtrHeight = args.destArea.h - args.yStart;
}
- if (!Scale && xCtrWidth % 4 != 0) {
+ /*if (!Scale && xCtrWidth % 4 != 0) {
--yCtrHeight;
- }
+ }*/
+
+ const int secondToLast = xCtrWidth - 4;
byte *destP = (byte *)args.destArea.getBasePtr(0, destY);
const byte *srcP = (const byte *)args.src.getBasePtr(
@@ -511,12 +513,23 @@ static void drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
uint32x4_t xCtrWidthSIMD = vdupq_n_u32(xCtrWidth); // This is the width of the row
if (!Scale) {
- for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
+ int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart;
+ for (; xCtr < secondToLast; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
byte *destPtr = &destP[destX * DestBytesPerPixel];
- // Skip pixels that are beyond the row
- uint32x4_t skipMask = vcgeq_u32(vaddq_u32(vdupq_n_u32(xCtr), addIndexes), xCtrWidthSIMD);
- drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, skipMask);
+ drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, vmovq_n_u32(0));
}
+
+ byte *destPtr = &destP[destX * DestBytesPerPixel];
+ uint32x4_t srcCols = vmovq_n_u32(0);
+ uint32x4_t destCols = vmovq_n_u32(0);
+ memcpy(&srcCols, srcP + xDir * xCtrBpp, (xCtrWidth - xCtr) * SrcBytesPerPixel);
+ memcpy(&destCols, destPtr, (xCtrWidth - xCtr) * DestBytesPerPixel);
+
+ // Skip pixels that are beyond the row
+ // uint32x4_t skipMask = vcgeq_u32(vaddq_u32(vdupq_n_u32(xCtr), addIndexes), xCtrWidthSIMD);
+ drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>((byte *)&destCols, (byte *)&srcCols, tint, alphas, maskedAlphas, transColors, xDir, 0, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, vmovq_n_u32(0));
+ memcpy(destPtr, &destCols, (xCtrWidth - xCtr) * DestBytesPerPixel);
+
// Goto next row in source and destination image
destP += args.destArea.pitch;
srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
@@ -568,7 +581,7 @@ static void drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
// Get the last x values of the last row
int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart;
// We have a picture that is a multiple of 4, so no extra pixels to draw
- if (xCtrWidth % 4 == 0) return;
+ /*if (xCtrWidth % 4 == 0)*/ return;
// Drawing the last few not scaled pixels here.
// Same as the loop above but now we check if we are going to overflow,
// and thus we don't need to mask out pixels that go over the row.
@@ -666,9 +679,11 @@ static void drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
if (args.yStart + yCtrHeight > args.destArea.h) {
yCtrHeight = args.destArea.h - args.yStart;
}
- if (!Scale && xCtrWidth % 8 != 0) {
+ /*if (!Scale && xCtrWidth % 8 != 0) {
--yCtrHeight;
- }
+ }*/
+
+ const int secondToLast = xCtrWidth - 8;
byte *destP = (byte *)args.destArea.getBasePtr(0, destY);
const byte *srcP = (const byte *)args.src.getBasePtr(
@@ -678,12 +693,24 @@ static void drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
uint16x8_t xCtrWidthSIMD = vmovq_n_u16(xCtrWidth); // This is the width of the row
if (!Scale) {
// If we are not scaling the image
- for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
+ int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart;
+ for (; xCtr < secondToLast; destX += 8, xCtr += 8, xCtrBpp += 16) {
byte *destPtr = &destP[destX * 2];
- // Skip pixels that are beyond the row
- uint16x8_t skipMask = vcgeq_u16(vaddq_u16(vdupq_n_u16(xCtr), addIndexes), xCtrWidthSIMD);
- drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, skipMask);
+ drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, vmovq_n_u16(0));
}
+
+ byte *destPtr = &destP[destX * 2];
+ uint16x8_t srcCols = vmovq_n_u16(0);
+ uint16x8_t destCols = vmovq_n_u16(0);
+ const int copySize = (xCtrWidth - xCtr) * 2;
+ memcpy(&srcCols, srcP + xDir * xCtrBpp, copySize);
+ memcpy(&destCols, destPtr, copySize);
+
+ // Skip pixels that are beyond the row
+ // uint16x8_t skipMask = vcgeq_u16(vaddq_u16(vdupq_n_u16(xCtr), addIndexes), xCtrWidthSIMD);
+ drawPixelSIMD2Bpp((byte *)&destCols, (byte *)&srcCols, tint, alphas, transColors, xDir, 0, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, vmovq_n_u16(0));
+ memcpy(destPtr, &destCols, copySize);
+
// Goto next row in source and destination image
destP += args.destArea.pitch;
srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
@@ -738,7 +765,7 @@ static void drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
}
// We have a picture that is a multiple of 8, so no extra pixels to draw
- if (xCtrWidth % 8 == 0) return;
+ /*if (xCtrWidth % 8 == 0)*/ return;
// Get the last x values of the last row
int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart;
// Drawing the last few not scaled pixels here.
diff --git a/engines/ags/lib/allegro/surface_sse2.cpp b/engines/ags/lib/allegro/surface_sse2.cpp
index 7f15705880e..6e6c9263284 100644
--- a/engines/ags/lib/allegro/surface_sse2.cpp
+++ b/engines/ags/lib/allegro/surface_sse2.cpp
@@ -518,9 +518,11 @@ static void drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
if (args.yStart + yCtrHeight > args.destArea.h) {
yCtrHeight = args.destArea.h - args.yStart;
}
- if (!Scale && xCtrWidth % 4 != 0) {
+ /*if (!Scale && xCtrWidth % 4 != 0) {
--yCtrHeight;
- }
+ }*/
+
+ const int secondToLast = xCtrWidth - 4;
byte *destP = (byte *)args.destArea.getBasePtr(0, destY);
const byte *srcP = (const byte *)args.src.getBasePtr(
@@ -531,12 +533,23 @@ static void drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
if (!Scale) {
// If we are not scaling the image
- for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
+ int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart;
+ for (; xCtr < secondToLast; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
byte *destPtr = &destP[destX * DestBytesPerPixel];
- // Skip pixels that are beyond the row
- __m128i skipMask = _mm_cmpgt_epi32(_mm_add_epi32(_mm_add_epi32(_mm_set1_epi32(xCtr), addIndexes), _mm_set1_epi32(1)), xCtrWidthSIMD);
- drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, skipMask);
+ drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, _mm_set1_epi32(0));
}
+
+ byte *destPtr = &destP[destX * DestBytesPerPixel];
+ __m128i srcCols = _mm_setzero_si128();
+ __m128i destCols = _mm_setzero_si128();
+ memcpy(&srcCols, srcP + xDir * xCtrBpp, (xCtrWidth - xCtr) * SrcBytesPerPixel);
+ memcpy(&destCols, destPtr, (xCtrWidth - xCtr) * DestBytesPerPixel);
+
+ // Skip pixels that are beyond the row
+ // __m128i skipMask = _mm_cmpgt_epi32(_mm_add_epi32(_mm_add_epi32(_mm_set1_epi32(xCtr), addIndexes), _mm_set1_epi32(1)), xCtrWidthSIMD);
+ drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>((byte *)&destCols, (byte *)&srcCols, tint, alphas, maskedAlphas, transColors, xDir, 0, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, _mm_set1_epi32(0));
+ memcpy(destPtr, &destCols, (xCtrWidth - xCtr) * DestBytesPerPixel);
+
// Goto next row in source and destination image
destP += args.destArea.pitch;
srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
@@ -591,7 +604,7 @@ static void drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
// Get the last x values of the last row
int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart;
// We have a picture that is a multiple of 4, so no extra pixels to draw
- if (xCtrWidth % 4 == 0) return;
+ /*if (xCtrWidth % 4 == 0)*/ return;
// Drawing the last few not scaled pixels here.
// Same as the loop above but now we check if we are going to overflow,
// and thus we don't need to mask out pixels that go over the row.
@@ -688,9 +701,11 @@ static void drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
if (args.yStart + yCtrHeight > args.destArea.h) {
yCtrHeight = args.destArea.h - args.yStart;
}
- if (!Scale && xCtrWidth % 8 != 0) {
+ /*if (!Scale && xCtrWidth % 8 != 0) {
--yCtrHeight;
- }
+ }*/
+
+ const int secondToLast = xCtrWidth - 8;
byte *destP = (byte *)args.destArea.getBasePtr(0, destY);
const byte *srcP = (const byte *)args.src.getBasePtr(
@@ -700,12 +715,24 @@ static void drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
__m128i xCtrWidthSIMD = _mm_set1_epi16(xCtrWidth); // This is the width of the row
if (!Scale) {
// If we are not scaling the image
- for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
+ int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart;
+ for (; xCtr < secondToLast; destX += 8, xCtr += 8, xCtrBpp += 16) {
byte *destPtr = &destP[destX * 2];
- // Skip pixels that are beyond the row
- __m128i skipMask = _mm_cmpgt_epi16(_mm_add_epi16(_mm_add_epi16(_mm_set1_epi16(xCtr), addIndexes), _mm_set1_epi16(1)), xCtrWidthSIMD);
- drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, skipMask);
+ drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, _mm_set1_epi16(0));
}
+
+ byte *destPtr = &destP[destX * 2];
+ __m128i srcCols = _mm_setzero_si128();
+ __m128i destCols = _mm_setzero_si128();
+ const int copySize = (xCtrWidth - xCtr) * 2;
+ memcpy(&srcCols, srcP + xDir * xCtrBpp, copySize);
+ memcpy(&destCols, destPtr, copySize);
+
+ // Skip pixels that are beyond the row
+ // __m128i skipMask = _mm_cmpgt_epi16(_mm_add_epi16(_mm_add_epi16(_mm_set1_epi16(xCtr), addIndexes), _mm_set1_epi16(1)), xCtrWidthSIMD);
+ drawPixelSIMD2Bpp((byte *)&destCols, (byte *)&srcCols, tint, alphas, transColors, xDir, 0, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, _mm_set1_epi16(0));
+ memcpy(destPtr, &destCols, copySize);
+
// Goto next row in source and destination image
destP += args.destArea.pitch;
srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
@@ -760,7 +787,7 @@ static void drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
}
// We have a picture that is a multiple of 8, so no extra pixels to draw
- if (xCtrWidth % 8 == 0) return;
+ /*if (xCtrWidth % 8 == 0)*/ return;
// Get the last x values of the last row
int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart;
// Drawing the last few not scaled pixels here.
More information about the Scummvm-git-logs
mailing list